"""Train a TF-IDF + multinomial naive Bayes classifier and label three data sets.

The script reads training texts and labels from ``train/``, fits the model on
the training part of a train/test split, prints the accuracy on ``dev-0`` and
writes predictions to ``dev-0/out.tsv``, ``test-A/out.tsv`` and
``dev-1/out.tsv``.
"""
from itertools import islice

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# The original loading loop broke *after* appending the line with index 20000,
# i.e. it kept 20001 lines. The same count is used here so the training data
# (and therefore the fitted model) is unchanged.
MAX_TRAIN_LINES = 20001


def read_lines(path, limit=None):
    """Return the lines of *path* with the trailing newline removed.

    When *limit* is given, only the first *limit* lines are read; the rest of
    the file is never loaded into memory.
    """
    with open(path) as data:
        return [line.rstrip("\n") for line in islice(data, limit)]


def write_lines(path, values):
    """Write every item of *values* to *path*, one per line."""
    with open(path, 'w') as file:
        file.writelines(value + "\n" for value in values)


def predict_lines(lines, vectorizer, tfidf, model):
    """Return a list with the label predicted by *model* for each text in *lines*.

    The texts are turned into counts by *vectorizer* and re-weighted by
    *tfidf* before prediction, i.e. they go through the same feature pipeline
    the model was fitted on.
    """
    if not lines:
        # Nothing to classify; skip the model call for an empty input file.
        return []
    counts = vectorizer.transform(lines)
    # The model was fitted on TF-IDF weights, so raw counts must be
    # transformed the same way before calling predict().
    features = tfidf.transform(counts)
    # One batched call instead of one predict() per line.
    return list(model.predict(features))


def evaluate(output, expectedFile):
    """Return the fraction of lines in *expectedFile* matched by *output*.

    Line ``i`` of the file is compared with ``str(output[i])``. A line that
    has no corresponding prediction (``output`` is shorter than the file)
    counts as incorrect. Returns ``0.0`` when the file contains no lines.
    """
    ok = 0
    total = 0
    with open(expectedFile) as data:
        for idx, line in enumerate(data):
            total += 1
            if idx < len(output) and line.rstrip("\n") == str(output[idx]):
                ok += 1
    if total == 0:
        return 0.0
    return ok / total


dane = read_lines('train/in.tsv', MAX_TRAIN_LINES)
wyniki = read_lines('train/expected.tsv', MAX_TRAIN_LINES)
print("ZaƂadowano dane")

# Only the training part of the split is used below; X_test / y_test are
# produced by the split but not consumed anywhere in this script.
X_train, X_test, y_train, y_test = train_test_split(dane, wyniki, random_state=0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)
print("Utworzono model")

predicted = predict_lines(read_lines('dev-0/in.tsv'), count_vect, tfidf_transformer, clf)
print(evaluate(predicted, "dev-0/expected.tsv"))
write_lines('dev-0/out.tsv', predicted)
print("Przetworzono dev-0")

predicted = predict_lines(read_lines('test-A/in.tsv'), count_vect, tfidf_transformer, clf)
write_lines('test-A/out.tsv', predicted)
print("Przetworzono test-A")

predicted = predict_lines(read_lines('dev-1/in.tsv'), count_vect, tfidf_transformer, clf)
write_lines('dev-1/out.tsv', predicted)