petite-difference-challenge2/run.py

80 lines
2.1 KiB
Python

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
# Number of leading lines kept from each training file. The value is 20001
# (not 20000) on purpose: the earlier version of this loop stopped only after
# storing the line with index 20000, and the cap is kept so the trained model
# sees the same examples.
TRAIN_LINE_LIMIT = 20001

# NOTE(review): all data files are opened as UTF-8 — confirm this matches the
# dataset; previously the platform default encoding was used.

# Training texts, one document per line.
dane = []
with open('train/in.tsv', encoding='utf-8') as data:
    # Iterate the file lazily so reading stops at the cap instead of loading
    # the whole file into memory first.
    for idx, line in enumerate(data):
        if idx >= TRAIN_LINE_LIMIT:
            break
        dane.append(line.rstrip('\n'))

# Training labels, aligned line-by-line with ``dane``.
wyniki = []
with open('train/expected.tsv', encoding='utf-8') as data:
    for idx, line in enumerate(data):
        if idx >= TRAIN_LINE_LIMIT:
            break
        wyniki.append(line.rstrip('\n'))
print("Załadowano dane")

# Only the training part of the split is used to fit the model; X_test and
# y_test are held out but not used in the visible part of this script.
X_train, X_test, y_train, y_test = train_test_split(dane, wyniki, random_state=0)

# Feature pipeline: token counts -> TF-IDF weights -> multinomial naive Bayes.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)
print("Utworzono model")

# Predict labels for dev-0. The classifier was fitted on TF-IDF features, so
# the same TF-IDF transform must be applied here; feeding raw counts would
# give the model inputs on a different scale than it was trained on.
with open('dev-0/in.tsv', encoding='utf-8') as data:
    dev0_texts = [line.rstrip('\n') for line in data]
if dev0_texts:
    # One batched call instead of one predict() call per line.
    dev0_features = tfidf_transformer.transform(count_vect.transform(dev0_texts))
    predicted = list(clf.predict(dev0_features))
else:
    # An empty input file yields no predictions (predict() rejects 0 samples).
    predicted = []
def evaluate(output, expectedFile):
    """Return the fraction of predictions that match the expected labels.

    Each line of ``expectedFile`` (without its trailing newline) is compared
    with ``str(output[i])`` for the prediction at the same position.

    Args:
        output: Indexable sequence of predictions supporting ``len()``.
        expectedFile: Path to a text file with one expected label per line.

    Returns:
        Accuracy as a float in ``[0, 1]``. Returns ``0.0`` when the expected
        file is empty. Expected lines that have no corresponding prediction
        (``output`` shorter than the file) count as incorrect; surplus
        predictions beyond the end of the file are ignored.
    """
    ok = 0
    total = 0
    available = len(output)
    with open(expectedFile, encoding='utf-8') as data:
        # Iterate lazily; no need to hold the whole file in memory.
        for idx, line in enumerate(data):
            total += 1
            # A missing prediction is scored as a miss instead of raising
            # IndexError part-way through the file.
            if idx < available and line.rstrip('\n') == str(output[idx]):
                ok += 1
    # Guard against division by zero for an empty expected file.
    return ok / total if total else 0.0
# Report accuracy on dev-0 and store its predictions, one label per line.
print(evaluate(predicted, "dev-0/expected.tsv"))
with open('dev-0/out.tsv', 'w', encoding='utf-8') as file:
    file.writelines(f"{p}\n" for p in predicted)
print("Przetworzono dev-0")


def _predict_file(in_path, out_path):
    """Classify every line of ``in_path`` and write the labels to ``out_path``.

    The input is read as one document per line. Documents are vectorized with
    the fitted ``count_vect`` and then re-weighted with the fitted
    ``tfidf_transformer`` before being passed to ``clf``, because the
    classifier was trained on TF-IDF features rather than raw counts.

    Args:
        in_path: Path of the text file to classify.
        out_path: Path of the file that receives one predicted label per line.

    Returns:
        List of predicted labels, in input order (empty for an empty input).
    """
    # NOTE(review): files are opened as UTF-8 — confirm against the dataset.
    with open(in_path, encoding='utf-8') as data:
        texts = [line.rstrip('\n') for line in data]
    if texts:
        # One batched call instead of one predict() call per line.
        features = tfidf_transformer.transform(count_vect.transform(texts))
        labels = list(clf.predict(features))
    else:
        # predict() rejects an input with zero samples; nothing to classify.
        labels = []
    with open(out_path, 'w', encoding='utf-8') as file:
        file.writelines(f"{label}\n" for label in labels)
    return labels


predicted = _predict_file('test-A/in.tsv', 'test-A/out.tsv')
print("Przetworzono test-A")
predicted = _predict_file('dev-1/in.tsv', 'dev-1/out.tsv')