5.5 KiB
5.5 KiB
import lzma
# Read the xz-compressed training inputs. The archive is opened in its default
# binary mode, so every raw line is decoded from UTF-8 (the trailing newline
# is kept as part of each document).
with lzma.open('train/in.tsv.xz') as f:
    X = [raw_line.decode('utf-8') for raw_line in f]
# Load the training labels, one per line, with the line terminator removed.
# The encoding is given explicitly (matching how the in.tsv files are read)
# so that decoding does not depend on the platform's locale default.
with open('train/expected.tsv', encoding='utf-8') as f:
    Y = [line.rstrip('\n') for line in f]
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training texts and, in the same call,
# transform them into the feature matrix used for training.
vectorizer = TfidfVectorizer()
textVectors = vectorizer.fit_transform(X)

# Notebook-style sanity check: densify the first document's row. The value is
# only shown by an interactive session; run as a script it is discarded.
first_document = textVectors[0]
first_document.todense()
# Output: matrix([[0., 0., 0., ..., 0., 0., 0.]])
import numpy as np
from sklearn.naive_bayes import BernoulliNB

# Train a Bernoulli naive Bayes classifier on the TF-IDF features, with the
# training labels converted to a NumPy array as the targets.
bernoulli = BernoulliNB()
trainY = np.asarray(Y)
bernoulli.fit(textVectors, trainY)
# Output: BernoulliNB()
import csv

# Read the dev-0 inputs, one document per line (the trailing newline is kept,
# exactly as it is for the training texts), then vectorize them with the
# already-fitted vectorizer and predict a label for each document.
with open('dev-0/in.tsv', encoding='utf8') as f:
    testX = list(f)
testX = vectorizer.transform(testX)
predictedY = bernoulli.predict(testX)

# csv.writer treats every row as a sequence of fields, and a bare string is a
# sequence of characters, so passing the labels directly would split a
# multi-character label into one field per character. Wrapping each label in a
# one-element list writes it as a single field per line.
with open('dev-0/out.tsv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([label] for label in predictedY)
# Load the dev-0 gold labels (one per line, newline stripped). The encoding is
# explicit so that decoding does not depend on the platform's locale default.
with open('dev-0/expected.tsv', encoding='utf-8') as f:
    expectedY = [line.rstrip('\n') for line in f]

# Print the score, as the dev-1 section does: a bare expression is only
# displayed by an interactive session and is silently discarded when this
# file is run as a script.
print('Score for dev-0:', bernoulli.score(testX, expectedY))
# Output (dev-0 score): 0.6577260876531162
# dev-1
# Read the dev-1 inputs (one document per line, newline kept), vectorize them
# with the already-fitted vectorizer and predict a label for each document.
with open('dev-1/in.tsv', encoding='utf8') as f:
    testX = list(f)
testX = vectorizer.transform(testX)
predictedY = bernoulli.predict(testX)

# Load the gold labels with an explicit encoding (so decoding does not depend
# on the platform's locale default) and report the classifier's score.
with open('dev-1/expected.tsv', encoding='utf-8') as f:
    expectedY = [line.rstrip('\n') for line in f]
print('Score for dev-1:', bernoulli.score(testX, expectedY))

# csv.writer treats every row as a sequence of fields, and a bare string is a
# sequence of characters, so each label is wrapped in a one-element list to
# be written as a single field per line rather than one field per character.
with open('dev-1/out.tsv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([label] for label in predictedY)
# Output: Score for dev-1: 0.6406778795193032
# test-A
# Read the test-A inputs (one document per line, newline kept), vectorize them
# with the already-fitted vectorizer and predict a label for each document.
# No expected.tsv is read for this split, so no score is computed.
with open('test-A/in.tsv', encoding='utf8') as f:
    testX = list(f)
testX = vectorizer.transform(testX)
predictedY = bernoulli.predict(testX)

# csv.writer treats every row as a sequence of fields, and a bare string is a
# sequence of characters, so each label is wrapped in a one-element list to
# be written as a single field per line rather than one field per character.
with open('test-A/out.tsv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([label] for label in predictedY)