80 lines
2.1 KiB
Python
80 lines
2.1 KiB
Python
from sklearn.model_selection import train_test_split
|
|
from sklearn.feature_extraction.text import CountVectorizer
|
|
from sklearn.feature_extraction.text import TfidfTransformer
|
|
from sklearn.naive_bayes import MultinomialNB
|
|
|
|
# Load the training documents (dane) and their labels (wyniki), one per line.
# Only lines with index 0..20000 (i.e. the first 20001 lines) of each file are
# kept; the same cut-off is applied to both files.
# The file object is iterated directly so reading stops at the cut-off instead
# of first pulling the whole file into memory with readlines().
dane = []
with open('train/in.tsv') as data:
    for idx, line in enumerate(data):
        dane.append(line.replace('\n', ''))
        if idx == 20000:
            break

wyniki = []
with open('train/expected.tsv') as data:
    for idx, line in enumerate(data):
        wyniki.append(line.replace('\n', ''))
        if idx == 20000:
            break

print("Załadowano dane")
|
|
|
|
# Split the loaded data with train_test_split's default proportions and a
# fixed seed. X_test / y_test are not used anywhere in the visible script.
X_train, X_test, y_train, y_test = train_test_split(
    dane,
    wyniki,
    random_state=0,
)

# Feature pipeline: token counts -> tf-idf weights.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Multinomial naive Bayes fitted on the tf-idf features.
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

print("Utworzono model")
|
|
|
|
# Predict a label for every line of the dev-0 input.
# clf was fitted on tf-idf features, so the counts produced by count_vect must
# also pass through the fitted tfidf_transformer; predicting on raw counts
# feeds the model features it was not trained on.
with open('dev-0/in.tsv') as data:
    documents = [line.replace('\n', '') for line in data]

predicted = []
if documents:  # predict() is skipped entirely for an empty input file
    # One vectorised call over all documents instead of one predict() per line.
    features = tfidf_transformer.transform(count_vect.transform(documents))
    predicted = list(clf.predict(features))
|
|
|
|
|
|
def evaluate(output, expectedFile):
    """Return the fraction of expected labels matched by *output*.

    Each line of *expectedFile* (newline removed) is compared with
    ``str(output[idx])`` for the same zero-based line index.

    Args:
        output: Indexable sequence of predictions, one per expected line.
        expectedFile: Path of a text file holding one expected label per line.

    Returns:
        Number of matching lines divided by the number of lines read, or
        ``0.0`` when the file contains no lines (avoids a ZeroDivisionError).

    Raises:
        IndexError: If *output* has fewer items than the file has lines.
    """
    ok = 0
    total = 0
    with open(expectedFile) as data:
        # Iterate the file lazily; no need to hold every line in memory.
        for idx, line in enumerate(data):
            total += 1
            if line.replace('\n', '') == str(output[idx]):
                ok += 1
    if total == 0:
        return 0.0
    return ok / total
|
|
|
|
# Report the share of dev-0 predictions that match the expected labels,
# then save the predictions one per line.
accuracy = evaluate(predicted, "dev-0/expected.tsv")
print(accuracy)

with open('dev-0/out.tsv', 'w') as out:
    out.writelines(label + "\n" for label in predicted)

print("Przetworzono dev-0")
|
|
|
|
|
|
|
|
# Predict a label for every line of the test-A input and save the results.
# clf was fitted on tf-idf features, so the counts produced by count_vect must
# also pass through the fitted tfidf_transformer; predicting on raw counts
# feeds the model features it was not trained on.
with open('test-A/in.tsv') as data:
    documents = [line.replace('\n', '') for line in data]

predicted = []
if documents:  # predict() is skipped entirely for an empty input file
    # One vectorised call over all documents instead of one predict() per line.
    features = tfidf_transformer.transform(count_vect.transform(documents))
    predicted = list(clf.predict(features))

with open('test-A/out.tsv', 'w') as file:
    for p in predicted:
        file.write(p + "\n")

print("Przetworzono test-A")
|
|
|
|
|
|
# Predict a label for every line of the dev-1 input and save the results.
# clf was fitted on tf-idf features, so the counts produced by count_vect must
# also pass through the fitted tfidf_transformer; predicting on raw counts
# feeds the model features it was not trained on.
with open('dev-1/in.tsv') as data:
    documents = [line.replace('\n', '') for line in data]

predicted = []
if documents:  # predict() is skipped entirely for an empty input file
    # One vectorised call over all documents instead of one predict() per line.
    features = tfidf_transformer.transform(count_vect.transform(documents))
    predicted = list(clf.predict(features))

with open('dev-1/out.tsv', 'w') as file:
    for p in predicted:
        file.write(p + "\n")