# petite-difference-challenge2/main.py
#
# 80 lines
# 1.6 KiB
# Python

import lzma
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
import numpy as np
import csv
# Load the training texts: one example per line of the xz-compressed file.
# The archive is opened in binary mode, so every line is decoded explicitly
# and keeps its trailing newline.
with lzma.open('train/in.tsv.xz') as f:
    X = [raw_line.decode('utf-8') for raw_line in f]

# Load the training labels, one per line, with the newline characters removed.
with open('train/expected.tsv') as f:
    Y = [label_line.replace('\n', '') for label_line in f]

# Fit the TF-IDF vocabulary on the training texts; the fitted vectorizer is
# reused to transform the evaluation inputs later in the script.
vectorizer = TfidfVectorizer()
textVectors = vectorizer.fit_transform(X)
trainY = np.array(Y)

# Train the Bernoulli naive Bayes classifier on the vectorized texts.
bernoulli = BernoulliNB()
bernoulli.fit(textVectors, trainY)
# dev-0
# Read the dev-0 inputs, one example per line (trailing newline kept), and
# vectorize them with the vectorizer fitted on the training data.
with open('dev-0/in.tsv', encoding='utf8') as f:
    testX = list(f)
testX = vectorizer.transform(testX)
predictedY = bernoulli.predict(testX)

# Expected labels for dev-0, newline characters removed. They are loaded
# here but not used by the code in this section.
with open('dev-0/expected.tsv') as f:
    expectedY = [line.replace('\n', '') for line in f]

# Write one prediction per row. Each label is wrapped in its own list
# because csv.writer treats every row as a sequence of fields: passing the
# bare label strings would split a multi-character label into one field per
# character.
with open('dev-0/out.tsv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([label] for label in predictedY)
# dev-1
# Read the dev-1 inputs, one example per line (trailing newline kept), and
# vectorize them with the vectorizer fitted on the training data.
with open('dev-1/in.tsv', encoding='utf8') as f:
    testX = list(f)
testX = vectorizer.transform(testX)
predictedY = bernoulli.predict(testX)

# Expected labels for dev-1, newline characters removed. They are loaded
# here but not used by the code in this section.
with open('dev-1/expected.tsv') as f:
    expectedY = [line.replace('\n', '') for line in f]

# Write one prediction per row. Each label is wrapped in its own list
# because csv.writer treats every row as a sequence of fields: passing the
# bare label strings would split a multi-character label into one field per
# character.
with open('dev-1/out.tsv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([label] for label in predictedY)
# test-A
# Read the test-A inputs, one example per line (trailing newline kept), and
# vectorize them with the vectorizer fitted on the training data.
with open('test-A/in.tsv', encoding='utf8') as f:
    testX = list(f)
testX = vectorizer.transform(testX)
predictedY = bernoulli.predict(testX)

# Write one prediction per row. Each label is wrapped in its own list
# because csv.writer treats every row as a sequence of fields: passing the
# bare label strings would split a multi-character label into one field per
# character.
with open('test-A/out.tsv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([label] for label in predictedY)