sport-text-classification/skrypt.py

40 lines
1.0 KiB
Python
Raw Normal View History

2021-05-04 13:07:13 +02:00
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import string
import csv
import re
# Train a TF-IDF + multinomial Naive Bayes classifier on train/train.tsv.
MNB = MultinomialNB()
vectorizer = TfidfVectorizer()

# NOTE: the names are inverted relative to the usual scikit-learn convention:
# X collects the first TSV column and is passed to fit() as the target,
# Y collects the second column (the raw texts) and becomes the feature matrix.
X = []
Y = []
# newline="" is what the csv module asks for when it is handed a file object.
with open("train/train.tsv", 'r', encoding="utf-8", newline="") as train:
    # QUOTE_NONE: the data is plain tab-separated text, not quoted CSV. With
    # the default quoting, a field that starts with '"' is parsed as a quoted
    # field and can swallow tabs and the following lines, silently merging
    # several training rows into one.
    for line in csv.reader(train, delimiter="\t", quoting=csv.QUOTE_NONE):
        X.append(line[0])
        Y.append(line[1])

# Fit the vocabulary/IDF weights on the training texts and replace the raw
# texts with their TF-IDF matrix.
Y = vectorizer.fit_transform(Y)
# fit(features, targets): Y is the TF-IDF matrix, X the first-column values.
MNB.fit(Y, X)
## Write dev
# Read dev-0/in.tsv (one document per line), vectorize it with the already
# fitted vectorizer and write one predicted value per line to dev-0/out.tsv.
# The input is read inside a context manager so the handle is closed even if
# vectorization fails.
with open('dev-0/in.tsv', 'r', encoding="utf-8") as dev_in:
    dev = vectorizer.transform(dev_in.readlines())
dev_predict = MNB.predict(dev)
# The output file is opened only after prediction succeeded, so a failure
# above does not leave a truncated, empty out.tsv behind.
with open('dev-0/out.tsv', 'w', encoding="utf-8") as dev_out:
    dev_out.writelines(str(s) + '\n' for s in dev_predict)
## Write test
# Read test-A/in.tsv (one document per line), vectorize it with the already
# fitted vectorizer and write one predicted value per line to test-A/out.tsv.
# The input is read inside a context manager so the handle is closed even if
# vectorization fails.
with open('test-A/in.tsv', 'r', encoding="utf-8") as test_in:
    test = vectorizer.transform(test_in.readlines())
test_predict = MNB.predict(test)
# The output file is opened only after prediction succeeded, so a failure
# above does not leave a truncated, empty out.tsv behind.
with open('test-A/out.tsv', 'w', encoding="utf-8") as test_out:
    test_out.writelines(str(s) + '\n' for s in test_predict)