# petite-difference-challenge2/run.py

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

def _read_head(path, last_index=20000):
    """Return the leading lines of *path* with their newlines removed.

    The file is streamed line by line, so nothing past the cut-off is read
    into memory.

    Args:
        path: Path of a UTF-8 text file with one record per line.
        last_index: Zero-based index of the last line to keep. The line at
            this index is stored before reading stops, so at most
            ``last_index + 1`` lines are returned.

    Returns:
        A list of the kept lines, in file order.
    """
    lines = []
    with open(path, encoding='utf-8') as data:
        for idx, line in enumerate(data):
            lines.append(line.replace('\n', ''))
            if idx == last_index:
                break
    return lines


# Training texts and their labels; both are cut at the same line index so the
# two lists stay aligned record by record.
dane = _read_head('train/in.tsv')
wyniki = _read_head('train/expected.tsv')

print("Załadowano dane")

# Split the loaded records with a fixed seed; only the training portion is
# used by the rest of this script.
X_train, X_test, y_train, y_test = train_test_split(
    dane,
    wyniki,
    random_state=0,
)

# Bag-of-words counts, with the vocabulary learned from the training texts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# Re-weight the raw counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Fit a multinomial naive Bayes classifier on the TF-IDF features.
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)
print("Utworzono model")

# Predict a label for every line of the dev-0 input.
predicted = []
with open('dev-0/in.tsv', encoding='utf-8') as data:
    lines = [line.replace('\n', '') for line in data]

# An empty input file leaves `predicted` empty instead of asking the
# estimator to predict on zero samples.
if lines:
    # The classifier was fitted on TF-IDF features, so the inputs must pass
    # through the same counts -> TF-IDF steps before prediction. All lines are
    # classified in one batched call rather than one call per line.
    features = tfidf_transformer.transform(count_vect.transform(lines))
    predicted = list(clf.predict(features))


def evaluate(output, expectedFile):
    """Return the fraction of expected lines that *output* matches.

    Line ``i`` of the file, with its newline removed, is compared against
    ``str(output[i])``.

    Args:
        output: Indexable sequence of predictions, one per expected line.
        expectedFile: Path of a UTF-8 text file with one expected label per
            line.

    Returns:
        The accuracy as a float between 0 and 1, or ``0.0`` when the file
        contains no lines.

    Raises:
        IndexError: If the file has more lines than ``output`` has items.
    """
    ok = 0
    total = 0
    with open(expectedFile, encoding='utf-8') as data:
        for idx, line in enumerate(data):
            total += 1
            if line.replace('\n', '') == str(output[idx]):
                ok += 1
    # An empty expected file would otherwise divide by zero.
    if total == 0:
        return 0.0
    return ok / total

# Report the accuracy of the dev-0 predictions against the expected labels.
accuracy = evaluate(predicted, "dev-0/expected.tsv")
print(accuracy)


# Write the dev-0 predictions, one label per line.
with open('dev-0/out.tsv', 'w') as file:
    file.writelines(p + "\n" for p in predicted)

print("Przetworzono dev-0")


# Predict a label for every line of the test-A input.
predicted = []
with open('test-A/in.tsv', encoding='utf-8') as data:
    lines = [line.replace('\n', '') for line in data]

# An empty input file leaves `predicted` empty instead of asking the
# estimator to predict on zero samples.
if lines:
    # The classifier was fitted on TF-IDF features, so the inputs must pass
    # through the same counts -> TF-IDF steps before prediction. All lines are
    # classified in one batched call rather than one call per line.
    features = tfidf_transformer.transform(count_vect.transform(lines))
    predicted = list(clf.predict(features))


# Write the test-A predictions, one label per line.
with open('test-A/out.tsv', 'w', encoding='utf-8') as file:
    file.writelines(p + "\n" for p in predicted)


print("Przetworzono test-A")


# Predict a label for every line of the dev-1 input.
predicted = []
with open('dev-1/in.tsv', encoding='utf-8') as data:
    lines = [line.replace('\n', '') for line in data]

# An empty input file leaves `predicted` empty instead of asking the
# estimator to predict on zero samples.
if lines:
    # The classifier was fitted on TF-IDF features, so the inputs must pass
    # through the same counts -> TF-IDF steps before prediction. All lines are
    # classified in one batched call rather than one call per line.
    features = tfidf_transformer.transform(count_vect.transform(lines))
    predicted = list(clf.predict(features))


# Write the dev-1 predictions, one label per line.
with open('dev-1/out.tsv', 'w', encoding='utf-8') as file:
    file.writelines(p + "\n" for p in predicted)