first solution by bayes

This commit is contained in:
Mikołaj Pokrywka 2022-05-07 18:45:00 +02:00
parent 756ef4277a
commit 3a91a4f3f7
3 changed files with 10481 additions and 0 deletions

5272
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

57
run.py Normal file
View File

@ -0,0 +1,57 @@
from naivebayes import NaiveBayesTextClassifier
import lzma
from spacy.lang.en.stop_words import STOP_WORDS as en_stop
# Train a naive Bayes text classifier on the leading part of the training
# set, then write one predicted label per line for the dev-0 and test-A splits.

CATEGORIES = [0, 1]

# Only the first TRAIN_LIMIT examples are used, fed to the classifier in
# BATCH_SIZE chunks. NOTE(review): the original script had the later batches
# (30000-60000, 60000-90000) commented out, presumably for time/memory
# reasons -- confirm before raising the limit.
TRAIN_LIMIT = 30000
BATCH_SIZE = 15000


def read_lines(path):
    """Return the lines of the xz-compressed file at *path*.

    The file is read in binary mode so that lines are split on ``\\n`` only;
    each line is stripped of surrounding ASCII whitespace and then decoded
    as UTF-8.
    """
    with lzma.open(path, 'r') as file:
        return [line.strip().decode("utf-8") for line in file]


def read_labels(path):
    """Return the integer labels stored one per line in the file at *path*.

    Raises ValueError if a line is not a valid integer.
    """
    with open(path, 'r') as file:
        return [int(line.strip()) for line in file]


def write_predictions(path, predictions):
    """Write each item of *predictions* on its own line to *path*.

    The file is truncated first ("w" mode): appending would leave the
    output of earlier runs in place and the file would no longer contain
    exactly one prediction per input line.
    """
    with open(path, "w") as file:
        file.writelines(str(p) + '\n' for p in predictions)


def main():
    """Train the classifier and write predictions for dev-0 and test-A."""
    classifier = NaiveBayesTextClassifier(
        categories=CATEGORIES,
        stop_words=en_stop
    )

    X = read_lines('train/in.tsv.xz')
    Y = read_labels('train/expected.tsv')
    print(len(X), len(Y))

    # Train incrementally, one call per batch, as the original script did.
    for start in range(0, TRAIN_LIMIT, BATCH_SIZE):
        stop = start + BATCH_SIZE
        classifier.train(X[start:stop], Y[start:stop])

    for split in ('dev-0', 'test-A'):
        test_x = read_lines(split + '/in.tsv.xz')
        predicted_classes = classifier.classify(test_x)
        write_predictions(split + '/out.tsv', predicted_classes)


if __name__ == "__main__":
    main()

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff