"""Train a TF-IDF + multinomial naive Bayes text classifier and write predictions.

The script uses paths relative to the current working directory
(presumably the root of the ``sport-text-classification-ball-ISI-public``
challenge that an earlier, commented-out experiment in this file pointed
at -- confirm before running elsewhere):

* ``train/train.tsv.gz`` -- gzip-compressed TSV; the first column is the
  label, everything after the first tab is the document text.
* ``dev-0/in.tsv`` / ``dev-0/expected.tsv`` -- one document / one gold label
  per line, used to print accuracy, a classification report and a
  confusion matrix.
* ``test-A/in.tsv`` -- one document per line, no gold labels.

Predictions are written one per line to ``dev-0/out.tsv`` and
``test-A/out.tsv``.
"""

import gzip

from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB


def _read_training_data(path):
    """Return ``(texts, labels)`` read from a gzip-compressed TSV file.

    Each line is split at the FIRST tab only: the part before it is the
    label and the rest is the text.  Tabs inside the text are kept, so the
    training documents are tokenised the same way as the dev/test documents
    (which are read as whole lines).  Joining the remaining columns with an
    empty string, as was done before, glued together the words on either
    side of an embedded tab.
    """
    texts = []
    labels = []
    # newline="\n" means only "\n" terminates a record, exactly as when the
    # file was iterated in binary mode; a stray "\r" stays part of the text.
    with gzip.open(path, "rt", encoding="utf-8", newline="\n") as fin:
        for line in fin:
            label, _, text = line.rstrip("\n").partition("\t")
            labels.append(label)
            texts.append(text)
    return texts, labels


def _read_lines(path):
    """Return the lines of a UTF-8 text file with the trailing newline removed."""
    # Explicit encoding: the training data is decoded as UTF-8, so the other
    # splits must not depend on the platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as fin:
        return [line.rstrip("\n") for line in fin]


def _write_predictions(path, predictions):
    """Write one prediction per line to ``path``, replacing any existing file."""
    with open(path, "w", encoding="utf-8") as fout:
        fout.writelines("{}\n".format(label) for label in predictions)


# --- Load the data ---------------------------------------------------------
train_X, train_y = _read_training_data('train/train.tsv.gz')
test_X = _read_lines('dev-0/in.tsv')
test_y = _read_lines('dev-0/expected.tsv')

# --- Vectorise: the vocabulary and IDF weights are fitted on train only ----
vectorizer = TfidfVectorizer(lowercase=True)
X_train_tf = vectorizer.fit_transform(train_X)
print("n_samples: %d, n_features: %d" % X_train_tf.shape)
X_test_tf = vectorizer.transform(test_X)
print("n_samples: %d, n_features: %d" % X_test_tf.shape)

# --- Train and evaluate on dev-0 -------------------------------------------
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, train_y)
y_pred = naive_bayes_classifier.predict(X_test_tf)

score1 = metrics.accuracy_score(test_y, y_pred)
print("accuracy: %0.3f" % score1)
# No target_names here: scikit-learn pairs them positionally with the sorted
# labels, so the previous ['1', '0'] would print each class's row under the
# other class's name if the labels are '0' and '1' (NOTE(review): label
# values are not visible in this file -- confirm).  Without target_names the
# report shows the actual label values.
print(metrics.classification_report(test_y, y_pred))
print("confusion matrix:")
print(metrics.confusion_matrix(test_y, y_pred))
print('------------------------------')

_write_predictions('dev-0/out.tsv', y_pred)

# --- Predict the unlabelled test-A split -----------------------------------
val_X = _read_lines('test-A/in.tsv')
X_val_tf = vectorizer.transform(val_X)
print("n_samples: %d, n_features: %d" % X_val_tf.shape)
val_y_pred = naive_bayes_classifier.predict(X_val_tf)

_write_predictions('test-A/out.tsv', val_y_pred)