"""Train a Bernoulli naive Bayes text classifier and write predictions.

The script fits a ``TfidfVectorizer`` and a ``BernoulliNB`` model on
``train/in.tsv.xz`` / ``train/expected.tsv`` and then, for each of the
``dev-0``, ``dev-1`` and ``test-A`` directories, reads ``in.tsv`` and writes
one predicted label per line to ``out.tsv``.
"""
import csv
import lzma

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB

TRAIN_TEXTS_PATH = 'train/in.tsv.xz'
TRAIN_LABELS_PATH = 'train/expected.tsv'
# Processed in this order; each directory gets its own out.tsv.
EVAL_DIRECTORIES = ('dev-0', 'dev-1', 'test-A')


def _read_train_texts(path):
    """Return the lines of the xz-compressed file at *path*, decoded as UTF-8.

    The file is read in binary mode so lines are split on ``\\n`` only; each
    returned string keeps its trailing newline.
    """
    with lzma.open(path) as f:
        return [raw_line.decode('utf-8') for raw_line in f]


def _read_labels(path):
    """Return one label per line of *path*, with newline characters removed."""
    with open(path, encoding='utf-8') as f:
        return [line.replace('\n', '') for line in f]


def _read_texts(path):
    """Return the lines of the UTF-8 text file at *path* as a list."""
    with open(path, encoding='utf8') as f:
        return list(f)


def _write_predictions(path, labels):
    """Write *labels* to *path*, one label per row.

    Each label is wrapped in a one-element row: ``csv`` treats every row as a
    sequence of fields, so passing a bare string would split it into one
    field per character (``"10"`` would be written as ``1,0``).
    """
    with open(path, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows([label] for label in labels)


def _predict_directory(directory, vectorizer, classifier):
    """Predict labels for ``<directory>/in.tsv`` and save ``<directory>/out.tsv``."""
    texts = _read_texts(f'{directory}/in.tsv')
    # transform(), not fit_transform(): reuse the vocabulary learned in training.
    features = vectorizer.transform(texts)
    _write_predictions(f'{directory}/out.tsv', classifier.predict(features))


def main():
    """Fit the model on the training data and predict every evaluation set."""
    train_texts = _read_train_texts(TRAIN_TEXTS_PATH)
    train_labels = np.array(_read_labels(TRAIN_LABELS_PATH))

    vectorizer = TfidfVectorizer()
    train_features = vectorizer.fit_transform(train_texts)

    classifier = BernoulliNB()
    classifier.fit(train_features, train_labels)

    for directory in EVAL_DIRECTORIES:
        _predict_directory(directory, vectorizer, classifier)


if __name__ == '__main__':
    main()