solve

2022-05-11 23:35:54 +02:00 · 2022-05-11 23:35:54 +02:00 · 5f7fe04d90
commit 5f7fe04d90
parent 756ef4277a
3 changed files with 10473 additions and 0 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/run.py
+++ b/run.py
@ -0,0 +1,47 @@
+import lzma
+from naivebayes import NaiveBayesTextClassifier
+from spacy.lang.en.stop_words import STOP_WORDS
+import numpy as np
+import pandas as pd
+
+# NOTE(review): this only attaches an attribute called `max_length` to the
+# numpy module object; nothing in the visible script reads it afterwards.
+# Presumably it was meant to raise spaCy's `nlp.max_length` text-size limit --
+# confirm the intent, then either drop this line or set the limit on the
+# object that actually uses it.
+np.max_length = 1200000
+
+def get_data(fname):
+    """Read a UTF-8 text file and return all of its lines.
+
+    Args:
+        fname: Path of the text file to read.
+
+    Returns:
+        A list of ``str``, one entry per line, each still carrying its
+        line terminator.
+    """
+    with open(fname, 'r', encoding='utf8') as handle:
+        # Iterating a text file yields exactly what readlines() would return.
+        lines = list(handle)
+    return lines
+
+def get_data_zipped(fname):
+    """Read an LZMA/XZ-compressed file and return its decompressed lines.
+
+    The archive is opened with mode 'r', which for ``lzma.open`` is binary
+    mode, so the lines are returned as ``bytes`` rather than ``str``.
+
+    Args:
+        fname: Path of the compressed file to read.
+
+    Returns:
+        A list of ``bytes``, one entry per line, each still carrying its
+        line terminator.
+    """
+    with lzma.open(fname, 'r') as archive:
+        # Iterating the decompressed stream yields the same lines as
+        # readlines().
+        return list(archive)
+
+def train_bayes(model, x, y, step=10000):
+    """Train ``model`` on ``x``/``y`` in consecutive batches of ``step`` items.
+
+    ``model.train`` is called once per batch, in order, with matching slices
+    of ``x`` and ``y``. The last batch may be shorter than ``step``; nothing
+    is trained when ``x`` is empty.
+
+    Args:
+        model: Object exposing ``train(samples, labels)``.
+        x: Sliceable sequence of training samples.
+        y: Sliceable sequence of labels aligned index-by-index with ``x``.
+        step: Maximum number of samples passed to a single ``train`` call.
+
+    Raises:
+        ValueError: If ``step`` is 0 (raised by ``range``).
+    """
+    total = len(x)
+    for begin in range(0, total, step):
+        # Cap the upper bound at len(x) so both slices always share the same
+        # bounds, including on the final (possibly partial) batch.
+        stop = min(begin + step, total)
+        model.train(x[begin:stop], y[begin:stop])
+
+# Load the xz-compressed training texts and their labels; every line of
+# train/expected.tsv is parsed as an integer class label.
+train_x = get_data_zipped('train/in.tsv.xz')
+train_y = get_data('train/expected.tsv')
+train_y = [int(y) for y in train_y]
+
+# Inputs to be labelled for the test-A and dev-0 splits.
+test_x = get_data_zipped('test-A/in.tsv.xz')
+dev_x = get_data_zipped('dev-0/in.tsv.xz')
+
+# Two-class classifier (labels 0 and 1) using spaCy's English stop-word list.
+model = NaiveBayesTextClassifier(
+    categories=[0, 1],
+    stop_words=STOP_WORDS
+)
+
+# Feed the training data to the model in batches (train_bayes' default step).
+train_bayes(model, train_x, train_y)
+
+predicted = model.classify(dev_x)
+predicted2 = model.classify(test_x)
+
+# header=False: write only the predictions, one per line. With pandas'
+# default header=True each file would begin with a spurious "0" column-name
+# line, unlike expected.tsv, which is read above as header-less integers.
+pd.DataFrame(predicted).to_csv('dev-0/out.tsv', sep='\t', encoding="utf-8", index=False, header=False)
+pd.DataFrame(predicted2).to_csv('test-A/out.tsv', sep='\t', encoding="utf-8", index=False, header=False)
--- a/test-A/out.tsv
+++ b/test-A/out.tsv