"""Train a TF-IDF + multinomial naive Bayes text classifier and score it.

The script reads the training set (label in column 0, text in column 1),
fits the pipeline, writes predictions for the ``dev-0`` and ``test-A``
inputs, and stores the ``dev-0`` accuracy in ``dev0_accuracy.txt``.
"""
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline


def _read_tsv(path):
    """Read a header-less, tab-separated file into a DataFrame.

    Malformed rows (wrong number of fields) are skipped with a warning.
    ``on_bad_lines`` replaces the ``error_bad_lines=False`` argument, which
    was deprecated in pandas 1.3 and removed in pandas 2.0.
    """
    return pd.read_csv(path, header=None, sep="\t", on_bad_lines="warn")


df = _read_tsv("train/train.tsv.gz")
dev0 = _read_tsv("dev-0/in.tsv")
testA = _read_tsv("test-A/in.tsv")
expected = _read_tsv("dev-0/expected.tsv")

# Evaluation inputs: the text to classify is the first column.
dev0_X = dev0.iloc[:, 0].tolist()
testA_X = testA.iloc[:, 0].tolist()

# Training data: column 0 holds the label, column 1 the text.
Y = df.iloc[:, 0].tolist()
X = df.iloc[:, 1].tolist()

model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(X, Y)

predicted_dev0 = model.predict(dev0_X)
predicted_testA = model.predict(testA_X)

pd.Series(predicted_dev0).to_csv(
    "dev-0/out.tsv", header=False, sep="\t", index=False
)
pd.Series(predicted_testA).to_csv(
    "test-A/out.tsv", header=False, sep="\t", index=False
)

# Score against the first column of the expected labels, mirroring how the
# other single-column inputs are read.
dev0_accuracy = accuracy_score(expected.iloc[:, 0], predicted_dev0)

# A context manager closes the file even if the write raises.
with open("dev0_accuracy.txt", "w") as f:
    f.write(str(dev0_accuracy))