"""Train a TF-IDF + multinomial naive Bayes text classifier and write predictions.

Reads ``train/train.tsv`` (label in column 0, text in column 1), fits the
pipeline on the first ``TRAIN_ROWS`` rows, and writes one prediction per line
to ``dev-0/out.tsv`` and ``test-A/out.tsv``.
"""
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

# Number of leading training rows used to fit the model.
TRAIN_ROWS = 1000


def read_first_field(path):
    """Return the first tab-separated field of every line in *path*.

    The file is read line by line rather than through ``pd.read_csv`` so that
    no line is ever dropped or merged (malformed rows, blank lines, stray
    quote characters); the result has exactly one entry per input line.
    """
    with open(path, "r", encoding="utf8") as file:
        return pd.Series(
            [line.rstrip("\n").split("\t")[0] for line in file],
            dtype="object",
        )


def write_predictions(path, predictions):
    """Write each item of *predictions* to *path*, one per line."""
    with open(path, "wt", encoding="utf8") as file:
        file.writelines(str(pred) + "\n" for pred in pd.Series(predictions))


# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines="skip"` is its
# replacement (available since pandas 1.3). Dropping malformed rows is
# acceptable for training data.
df = pd.read_csv("train/train.tsv", sep="\t", header=None, on_bad_lines="skip")
df = df.head(TRAIN_ROWS)

# One entry per input line, so dev-0/out.tsv stays aligned with dev-0/in.tsv.
dev_x = read_first_field("dev-0/in.tsv")

# The test input is used as whole raw lines (trailing newline included).
with open("test-A/in.tsv", "r", encoding="utf8") as file:
    test = pd.Series(file.readlines())

x = df[1]
y = df[0]

model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(x, y)

pred_dev = pd.Series(model.predict(dev_x))
write_predictions("dev-0/out.tsv", pred_dev)

# NOTE(review): only the test predictions are cast to int; dev predictions are
# written exactly as the model returns them — confirm this is intended.
pred_test = pd.Series(model.predict(test)).astype("int")
write_predictions("test-A/out.tsv", pred_test)