"""Train a TF-IDF + Multinomial Naive Bayes text classifier and write predictions.

Reads the gzipped training set (``label<TAB>text`` per line), fits the model,
predicts labels for the ``dev-0`` and ``test-A`` inputs, prints the accuracy on
``dev-0`` and writes one predicted label per line to the ``out.tsv`` files.
"""
import gzip

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline


def _read_training_data(path):
    """Return ``(texts, labels)`` parsed from a gzipped ``label<TAB>text`` file.

    Everything after the first tab is kept verbatim as the document text, so
    the classifier is trained on the raw text rather than on the ``repr`` of a
    Python list (which would add brackets, quotes and escape sequences).
    """
    texts = []
    labels = []
    # newline="\n" keeps line splitting identical to iterating the binary
    # stream: only "\n" terminates a record.
    with gzip.open(path, 'rt', encoding='utf-8', newline='\n') as f:
        for row in f:
            label, _, text = row.rstrip("\n").partition("\t")
            labels.append(int(label))
            texts.append(text)
    return texts, labels


# Read data
# `on_bad_lines='skip'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0.
dev = pd.read_table('dev-0/in.tsv', on_bad_lines='skip', header=None)
test = pd.read_table('test-A/in.tsv', on_bad_lines='skip', header=None)
# NOTE: despite its name this holds the expected labels for the dev-0 split.
test_expected = pd.read_table(
    'dev-0/expected.tsv', on_bad_lines='skip', header=None)

X_train, y_train = _read_training_data('train/train.tsv.gz')

# Convert to unified types
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)
X_dev = dev[0].values
X_test = test[0].values

# Create model
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(X_train, y_train)

# Predict
dev_p = model.predict(X_dev)
test_p = model.predict(X_test)

# Accuracy (dev-0 predictions against the dev-0 expected labels)
score = metrics.accuracy_score(test_expected[0].values, dev_p)
print("Accuracy: %0.3f" % score)

# Save to files (one predicted label per line)
dev_p.tofile('./dev-0/out.tsv', sep='\n')
test_p.tofile('./test-A/out.tsv', sep='\n')