"""Train a Naive Bayes text classifier and write predictions for dev-0 and test-A.

Reads the training texts from ``train/in.tsv.xz`` and their integer labels
from ``train/expected.tsv``, trains on the first 30000 examples, then writes
one predicted class per line to ``dev-0/out.tsv`` and ``test-A/out.tsv``.
"""
import lzma

from naivebayes import NaiveBayesTextClassifier
from spacy.lang.en.stop_words import STOP_WORDS as en_stop


def _read_xz_lines(path):
    """Return the lines of the xz-compressed file *path* as stripped UTF-8 strings."""
    # Binary mode on purpose: lines are split on b"\n" only and stripped of
    # ASCII whitespace before decoding, exactly as the data was read before.
    with lzma.open(path, 'r') as file:
        return [raw.strip().decode("utf-8") for raw in file]


def _write_predictions(path, predictions):
    """Write each item of *predictions* to *path*, one per line.

    The file is truncated first ("w", not "a") so that rerunning the script
    replaces earlier predictions instead of appending a second copy, which
    would leave the output with more lines than the corresponding input.
    """
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(str(p) + '\n' for p in predictions)


categories_list = [0, 1]

classifier = NaiveBayesTextClassifier(
    categories=categories_list,
    stop_words=en_stop
)

X = _read_xz_lines('train/in.tsv.xz')

with open('train/expected.tsv', 'r', encoding="utf-8") as file:
    Y = [int(line.strip()) for line in file]

# Quick sanity check: the two counts should match (one label per text).
print(len(X), len(Y))

# Train in two batches of 15000 examples.
# NOTE(review): presumably train() accumulates across calls -- confirm
# against the naivebayes package.
classifier.train(X[:15000], Y[:15000])
classifier.train(X[15000:30000], Y[15000:30000])
# Further batches were left disabled in the original script:
# classifier.train(X[30000:60000], Y[30000:60000])
# classifier.train(X[60000:90000], Y[60000:90000])

test_x = _read_xz_lines('dev-0/in.tsv.xz')
predicted_classes = classifier.classify(test_x)
_write_predictions("dev-0/out.tsv", predicted_classes)

test_x = _read_xz_lines('test-A/in.tsv.xz')
predicted_classes = classifier.classify(test_x)
_write_predictions("test-A/out.tsv", predicted_classes)