80 lines
1.6 KiB
Python
80 lines
1.6 KiB
Python
import lzma
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.naive_bayes import BernoulliNB
|
|
import numpy as np
|
|
import csv
|
|
|
|
|
|
# Training documents: one text per line of the xz-compressed input file.
# lzma.open defaults to binary mode, so every line is decoded explicitly.
with lzma.open('train/in.tsv.xz') as f:
    X = [raw.decode('utf-8') for raw in f]

# Training labels: one per line, with the newline character removed.
with open('train/expected.tsv') as f:
    Y = [row.replace('\n', '') for row in f]

# Learn the vocabulary and IDF weights on the training texts and turn them
# into a TF-IDF document-term matrix.
vectorizer = TfidfVectorizer()
textVectors = vectorizer.fit_transform(X)

trainY = np.array(Y)

# NOTE: BernoulliNB binarizes its input (default threshold 0.0), so in effect
# only the presence/absence of a term is used, not the TF-IDF weight itself.
bernoulli = BernoulliNB()
bernoulli.fit(textVectors, trainY)
|
|
|
|
# dev-0
# Classify the dev-0 split with the already fitted vectorizer and classifier
# and write one predicted label per line to dev-0/out.tsv.
with open('dev-0/in.tsv', encoding='utf8') as f:
    testX = list(f)

testX = vectorizer.transform(testX)
predictedY = bernoulli.predict(testX)

# Reference labels for this split; loaded here but not used by any code
# visible in this script.
with open('dev-0/expected.tsv') as f:
    expectedY = [line.replace('\n', '') for line in f]

with open('dev-0/out.tsv', 'w', newline='') as f:
    writer = csv.writer(f)
    # Every row handed to csv must be a sequence of fields. Passing the bare
    # label string makes csv treat each character as its own field
    # (e.g. "10" would be written as "1,0"), so wrap each label in a list.
    writer.writerows([label] for label in predictedY)
|
|
|
|
|
|
# dev-1
# Classify the dev-1 split with the already fitted vectorizer and classifier
# and write one predicted label per line to dev-1/out.tsv.
with open('dev-1/in.tsv', encoding='utf8') as f:
    testX = list(f)

testX = vectorizer.transform(testX)
predictedY = bernoulli.predict(testX)

# Reference labels for this split; loaded here but not used by any code
# visible in this script.
with open('dev-1/expected.tsv') as f:
    expectedY = [line.replace('\n', '') for line in f]

with open('dev-1/out.tsv', 'w', newline='') as f:
    writer = csv.writer(f)
    # Every row handed to csv must be a sequence of fields. Passing the bare
    # label string makes csv treat each character as its own field
    # (e.g. "10" would be written as "1,0"), so wrap each label in a list.
    writer.writerows([label] for label in predictedY)
|
|
|
|
|
|
# test-A
# Classify the test-A split with the already fitted vectorizer and classifier
# and write one predicted label per line to test-A/out.tsv.
with open('test-A/in.tsv', encoding='utf8') as f:
    testX = list(f)

testX = vectorizer.transform(testX)

predictedY = bernoulli.predict(testX)

with open('test-A/out.tsv', 'w', newline='') as f:
    writer = csv.writer(f)
    # Every row handed to csv must be a sequence of fields. Passing the bare
    # label string makes csv treat each character as its own field
    # (e.g. "10" would be written as "1,0"), so wrap each label in a list.
    writer.writerows([label] for label in predictedY)