petite-difference-challenge2/sheSaid.ipynb

5.5 KiB

import lzma
# Training documents: one decoded text line per example (trailing newline kept).
X = []

with lzma.open('train/in.tsv.xz') as train_in:
    # The archive is opened without a mode argument and iterated as bytes,
    # so every line is decoded explicitly.
    X.extend(raw_line.decode('utf-8') for raw_line in train_in)
# Training labels: one per line of expected.tsv, with the newline removed.
Y = []

with open('train/expected.tsv') as labels_file:
    Y.extend(row.replace('\n', '') for row in labels_file)
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training texts and keep the resulting
# document-term matrix; the same fitted `vectorizer` is reused further down
# to transform the dev-0, dev-1 and test-A inputs.
vectorizer = TfidfVectorizer()
textVectors = vectorizer.fit_transform(X)
# Notebook inspection only: show the first training row as a dense matrix.
textVectors[0].todense()
matrix([[0., 0., 0., ..., 0., 0., 0.]])
from sklearn.naive_bayes import BernoulliNB
import numpy as np

# Convert the label list to a NumPy array and train the classifier on the
# TF-IDF matrix built from the training texts.
trainY = np.array(Y)
bernoulli = BernoulliNB()
# NOTE(review): per the scikit-learn docs BernoulliNB binarizes its input
# (default threshold 0.0), so the TF-IDF weights would only act as
# term-presence flags here - confirm this is intended.
bernoulli.fit(textVectors, trainY)
BernoulliNB()
import csv

# dev-0: read the raw input lines, vectorize them with the already fitted
# TF-IDF vectorizer and predict a label for every line.
with open('dev-0/in.tsv', encoding='utf8') as f:
    testX = list(f)
testX = vectorizer.transform(testX)
predictedY = bernoulli.predict(testX)

# Write one predicted label per row.
with open('dev-0/out.tsv', 'w', newline='') as f:
    writer = csv.writer(f)
    # Each label is wrapped in a one-element row: csv treats a bare string as
    # a sequence of fields, so a multi-character label would otherwise be
    # written as comma-separated characters.
    # NOTE(review): csv.writer ends rows with '\r\n' by default; pass
    # lineterminator='\n' if the consumer of out.tsv expects LF-only lines.
    writer.writerows([label] for label in predictedY)
# Gold labels for dev-0: one per line, newline removed.
expectedY = []
with open('dev-0/expected.tsv') as gold_file:
    expectedY.extend(row.replace('\n', '') for row in gold_file)
# Evaluate the dev-0 predictions against the gold labels (the notebook
# displays the returned score).
bernoulli.score(testX, expectedY)
0.6577260876531162
# dev-1
# Read the raw input lines, vectorize them with the already fitted TF-IDF
# vectorizer and predict a label for every line.
with open('dev-1/in.tsv', encoding='utf8') as f:
    testX = list(f)

testX = vectorizer.transform(testX)

predictedY = bernoulli.predict(testX)

# Gold labels: one per line, newline removed.
with open('dev-1/expected.tsv') as f:
    expectedY = [line.replace('\n', '') for line in f]

print('Score for dev-1:', bernoulli.score(testX, expectedY))

# Write one predicted label per row.
with open('dev-1/out.tsv', 'w', newline='') as f:
    writer = csv.writer(f)
    # Each label is wrapped in a one-element row: csv treats a bare string as
    # a sequence of fields, so a multi-character label would otherwise be
    # written as comma-separated characters.
    # NOTE(review): csv.writer ends rows with '\r\n' by default; pass
    # lineterminator='\n' if the consumer of out.tsv expects LF-only lines.
    writer.writerows([label] for label in predictedY)
Score for dev-1: 0.6406778795193032
# test-A
# Only inputs are read for this split (no expected labels are loaded), so
# the block predicts and writes the output file without scoring.
with open('test-A/in.tsv', encoding='utf8') as f:
    testX = list(f)

testX = vectorizer.transform(testX)

predictedY = bernoulli.predict(testX)

# Write one predicted label per row.
with open('test-A/out.tsv', 'w', newline='') as f:
    writer = csv.writer(f)
    # Each label is wrapped in a one-element row: csv treats a bare string as
    # a sequence of fields, so a multi-character label would otherwise be
    # written as comma-separated characters.
    # NOTE(review): csv.writer ends rows with '\r\n' by default; pass
    # lineterminator='\n' if the consumer of out.tsv expects LF-only lines.
    writer.writerows([label] for label in predictedY)