"""Train a TF-IDF + multinomial naive Bayes classifier and write predictions.

The script reads ``train/train.tsv`` (column 0 is used as the target label,
column 1 as the text), fits a ``TfidfVectorizer`` and a ``MultinomialNB`` on
it, and then writes one predicted label per input line for ``dev-0/in.tsv``
and ``test-A/in.tsv`` into the matching ``out.tsv`` files.
"""
import csv
import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

MNB = MultinomialNB()
vectorizer = TfidfVectorizer()


def _load_training_data(path: str) -> tuple:
    """Read a tab-separated training file and return ``(labels, texts)``.

    Column 0 of each row is collected as the label and column 1 as the text.
    Rows with fewer than two columns (e.g. a trailing blank line) are skipped
    instead of raising ``IndexError``.
    """
    labels = []
    texts = []
    # newline="" is what the csv module recommends for files handed to a reader.
    with open(path, 'r', encoding="utf-8", newline="") as train:
        # NOTE(review): default csv quoting is kept; if the texts contain
        # stray double quotes, csv.QUOTE_NONE may be the intended setting.
        for row in csv.reader(train, delimiter="\t"):
            if len(row) < 2:
                continue
            labels.append(row[0])
            texts.append(row[1])
    return labels, texts


def _write_predictions(in_path: str, out_path: str) -> None:
    """Predict a label for every line of ``in_path`` and write them to ``out_path``.

    Each whole input line (including its line ending) is treated as one
    document. One prediction is written per line, in input order.
    """
    with open(in_path, 'r', encoding="utf-8") as source:
        documents = source.readlines()

    # Predict before opening the output so a failure here does not leave a
    # freshly truncated, empty output file behind.
    predictions = MNB.predict(vectorizer.transform(documents))

    with open(out_path, 'w', encoding="utf-8") as sink:
        sink.writelines(str(label) + '\n' for label in predictions)


# Train: texts become TF-IDF features, labels are the classification targets.
labels, texts = _load_training_data("train/train.tsv")
features = vectorizer.fit_transform(texts)
MNB.fit(features, labels)

# Write dev predictions
_write_predictions('dev-0/in.tsv', 'dev-0/out.tsv')

# Write test predictions
_write_predictions('test-A/in.tsv', 'test-A/out.tsv')