This commit is contained in:
Mateusz 2022-05-11 23:35:54 +02:00
parent 756ef4277a
commit 5f7fe04d90
3 changed files with 10473 additions and 0 deletions

5273
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

47
run.py Normal file
View File

@ -0,0 +1,47 @@
import lzma
from naivebayes import NaiveBayesTextClassifier
from spacy.lang.en.stop_words import STOP_WORDS
import numpy as np
import pandas as pd
# NOTE(review): this assigns a new `max_length` attribute on the imported numpy
# module object. Nothing else in this script reads it, and numpy is not known to
# consult such an attribute, so it presumably has no effect. It looks like a
# leftover from a spaCy pipeline setting (`nlp.max_length`) - confirm and remove.
np.max_length = 1200000
def get_data(fname: str) -> list[str]:
    """Read a UTF-8 text file and return its lines.

    Args:
        fname: Path of the plain-text file to read.

    Returns:
        Every line of the file as a ``str``, each still carrying its
        trailing newline (as produced by ``readlines``).
    """
    with open(fname, 'r', encoding='utf8') as file:
        return file.readlines()
def get_data_zipped(fname: str) -> list[str]:
    """Read an xz/LZMA-compressed UTF-8 text file and return its lines.

    The archive is opened in text mode so the result is a list of ``str``,
    matching what ``get_data`` returns for uncompressed files (binary mode
    would hand ``bytes`` objects to the text classifier instead).

    Args:
        fname: Path of the ``.xz`` file to read.

    Returns:
        Every line of the decompressed file as a ``str``, each still carrying
        its trailing ``"\\n"``.

    Raises:
        UnicodeDecodeError: If the decompressed content is not valid UTF-8.
    """
    # newline='\n' keeps line splitting identical to binary mode: only "\n"
    # ends a line, so a stray "\r" inside a document cannot create extra rows
    # and shift the inputs out of step with their labels.
    with lzma.open(fname, 'rt', encoding='utf8', newline='\n') as file:
        return file.readlines()
def train_bayes(model, x, y, step=10000):
    """Train ``model`` incrementally on consecutive batches of the data.

    The data is fed in slices of at most ``step`` items, in order, by calling
    ``model.train(x_batch, y_batch)`` once per slice. The final slice may be
    shorter than ``step``. Nothing is called when ``x`` is empty.

    Args:
        model: Object exposing ``train(inputs, labels)``.
        x: Sequence of training inputs (must support ``len`` and slicing).
        y: Sequence of labels, sliced with the same bounds as ``x``.
        step: Maximum number of items per ``train`` call.

    Raises:
        ValueError: If ``step`` is 0 (raised by ``range``).
    """
    for start in range(0, len(x), step):
        # Slicing clamps at the end of the sequence, so the last batch is
        # simply whatever remains.
        end = start + step
        model.train(x[start:end], y[start:end])
# Load the training split: inputs come from an xz archive, labels from a plain
# text file with one integer per line.
train_x = get_data_zipped('train/in.tsv.xz')
train_y = get_data('train/expected.tsv')
train_y = [int(y) for y in train_y]

# Inputs of the two splits that need predictions.
test_x = get_data_zipped('test-A/in.tsv.xz')
dev_x = get_data_zipped('dev-0/in.tsv.xz')

# Binary classifier over labels 0/1, ignoring spaCy's English stop words.
model = NaiveBayesTextClassifier(
    categories=[0, 1],
    stop_words=STOP_WORDS
)
train_bayes(model, train_x, train_y)

predicted = model.classify(dev_x)
predicted2 = model.classify(test_x)

# header=False: to_csv would otherwise emit the DataFrame's column name ("0")
# as a first line, leaving the output one row longer than the predictions
# written (and, presumably, than the input file it is meant to line up with).
pd.DataFrame(predicted).to_csv(
    'dev-0/out.tsv', sep='\t', encoding="utf-8", index=False, header=False
)
pd.DataFrame(predicted2).to_csv(
    'test-A/out.tsv', sep='\t', encoding="utf-8", index=False, header=False
)

5153
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff