"""Train a naive Bayes text classifier and write dev/test predictions.

Reads xz-compressed inputs and plain-text labels from fixed relative paths,
trains the classifier in batches, and writes the predicted labels to
``dev-0/out.tsv`` and ``test-A/out.tsv``.
"""

import lzma

from naivebayes import NaiveBayesTextClassifier
from spacy.lang.en.stop_words import STOP_WORDS
import numpy as np
import pandas as pd

# NOTE(review): this only attaches an attribute named ``max_length`` to the
# numpy module object; nothing in this file reads it back. Presumably it was
# meant for a different library's length limit -- confirm before relying on it.
np.max_length = 1200000


def get_data(fname):
    """Return every line of the UTF-8 text file *fname* as a list of str.

    Line terminators are kept, exactly as ``readlines()`` yields them.
    """
    with open(fname, encoding='utf8') as handle:
        return handle.readlines()


def get_data_zipped(fname):
    """Return every line of the xz/lzma-compressed file *fname*.

    The archive is opened in binary mode, so the returned list holds
    ``bytes`` objects (line terminators included), not ``str``.
    """
    with lzma.open(fname, 'rb') as handle:
        return handle.readlines()


def train_bayes(model, x, y, step=10000):
    """Feed *x* and *y* to ``model.train`` in consecutive batches.

    Args:
        model: object exposing ``train(batch_x, batch_y)``.
        x: sliceable, sized collection of training documents.
        y: sliceable collection of labels, aligned with *x* by position.
        step: number of items per batch.
    """
    total = len(x)
    for lo in range(0, total, step):
        # The first batch uses ``step`` itself as the upper bound; every
        # later batch is capped at ``len(x)``. Slicing clamps out-of-range
        # bounds, so for ``x`` both forms select the same items.
        hi = step if lo == 0 else min(lo + step, total)
        model.train(x[lo:hi], y[lo:hi])


# --- Load data -------------------------------------------------------------
train_x = get_data_zipped('train/in.tsv.xz')
# Each label line is parsed as an integer.
train_y = [int(label) for label in get_data('train/expected.tsv')]
test_x = get_data_zipped('test-A/in.tsv.xz')
dev_x = get_data_zipped('dev-0/in.tsv.xz')

# --- Train -----------------------------------------------------------------
model = NaiveBayesTextClassifier(
    categories=[0, 1],
    stop_words=STOP_WORDS,
)
train_bayes(model, train_x, train_y)

# --- Predict and write results ---------------------------------------------
predicted = model.classify(dev_x)
predicted2 = model.classify(test_x)

# NOTE(review): ``to_csv`` writes a header row unless ``header=False`` is
# passed, so each output file starts with the column label -- confirm that
# the consumer of these files expects that first line.
for out_path, labels in (
    ('dev-0/out.tsv', predicted),
    ('test-A/out.tsv', predicted2),
):
    pd.DataFrame(labels).to_csv(
        out_path, sep='\t', encoding="utf-8", index=False
    )