"""Train a bag-of-words Naive Bayes text classifier and label two input files.

The training file is a headerless TSV; column 0 is used as the target label
and column 1 as the text.  Each prediction input file is read as one document
per line, and one predicted label per line is written to the matching
output file.
"""
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

TRAIN_PATH = 'train/train.tsv'

# (input path, output path) pairs to label once the model is trained.
PREDICTION_JOBS = (
    ('dev-0/in.tsv', 'dev-0/out.tsv'),
    ('test-A/in.tsv', 'test-A/out.tsv'),
)


def _load_training_data(path):
    """Return ``(texts, labels)`` read from the headerless TSV at *path*.

    Malformed rows are skipped with a warning instead of aborting the load.
    """
    try:
        frame = pd.read_csv(path, sep='\t', header=None, on_bad_lines='warn')
    except TypeError:
        # Older pandas releases do not know ``on_bad_lines``; the legacy
        # keyword below has the same skip-and-warn behaviour there.
        frame = pd.read_csv(path, sep='\t', header=None, error_bad_lines=False)
    labels = frame[0].astype(str).tolist()
    texts = frame[1].astype(str).tolist()
    return texts, labels


def _read_documents(path):
    """Return every line of *path* as one document, without the newline.

    A plain line reader is used instead of ``pd.read_csv(sep='\\n')`` so that
    blank lines and quote characters cannot drop or merge rows: the result
    always has exactly one entry per input line.
    """
    with open(path, encoding='utf-8') as handle:
        return [line.rstrip('\n') for line in handle]


def _predict_file(classifier, vectorizer, in_path, out_path):
    """Label each line of *in_path* and write one label per line to *out_path*."""
    documents = _read_documents(in_path)
    # An empty input yields an empty output file rather than a failure on a
    # zero-row feature matrix.
    if documents:
        predictions = classifier.predict(vectorizer.transform(documents))
    else:
        predictions = []
    # The context manager guarantees the predictions are flushed and the
    # handle is closed even if writing fails part-way.
    with open(out_path, 'w', encoding='utf-8') as out:
        out.writelines(str(label) + '\n' for label in predictions)


def main():
    """Fit the classifier on the training set and label every prediction job."""
    texts, labels = _load_training_data(TRAIN_PATH)

    vectorizer = CountVectorizer()
    classifier = MultinomialNB()
    # Features are the token counts of the texts; targets are the labels.
    classifier.fit(vectorizer.fit_transform(texts), labels)

    for in_path, out_path in PREDICTION_JOBS:
        _predict_file(classifier, vectorizer, in_path, out_path)


if __name__ == '__main__':
    main()