"""Train a TF-IDF + Multinomial Naive Bayes text classifier and write predictions.

Reads labelled training rows from ``train/train.tsv`` (column 0 is used as the
label, column 1 as the text), fits the pipeline, and writes one integer
prediction per line to ``dev-0/out.tsv`` and ``test-A/out.tsv``.
"""
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline


def _read_training_table(path):
    """Load the header-less, tab-separated training file, skipping malformed rows."""
    try:
        # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.
        # 'warn' matches the old behaviour of error_bad_lines=False combined
        # with the default warn_bad_lines=True: report the bad row and skip it.
        return pd.read_csv(path, sep='\t', header=None, on_bad_lines='warn')
    except TypeError:
        # pandas < 1.3 does not know `on_bad_lines`; use the legacy keyword.
        return pd.read_csv(path, sep='\t', header=None, error_bad_lines=False)


def _read_documents(path):
    """Return the lines of *path* as a Series, one document per line.

    Lines are read raw (not parsed as TSV), so the number of documents always
    equals the number of lines in the file. Trailing newlines are kept.
    """
    with open(path, 'r', encoding='utf8') as f:
        return pd.Series(f.readlines())


def _predict_and_write(fitted_model, documents, out_path):
    """Predict a label per document, write one integer per line, return them."""
    # Labels are trained as strings; cast back to int before writing.
    predictions = pd.Series(fitted_model.predict(documents)).astype('int')
    with open(out_path, 'wt') as f:
        f.writelines(str(pred) + '\n' for pred in predictions)
    return predictions


data = _read_training_table('train/train.tsv')
X = data[1]
y = data[0].astype('str')

Xdev = _read_documents('dev-0/in.tsv')
Xtest = _read_documents('test-A/in.tsv')

# Gold labels for dev-0; loaded here but not used further in this script.
ydev = pd.read_csv('dev-0/expected.tsv', sep='\t', header=None)
ydev = ydev.squeeze()

model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(X, y)

predictions_dev0 = _predict_and_write(model, Xdev, 'dev-0/out.tsv')
predictions_testA = _predict_and_write(model, Xtest, 'test-A/out.tsv')