"""Train a Gaussian naive Bayes text classifier and write dev/test predictions.

The script fits a bag-of-words ``CountVectorizer`` and a ``GaussianNB`` model
on the first rows of the training file, predicts a label for every row of the
dev and test inputs, and writes one predicted label per line to the
corresponding output file.
"""

import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score

# The files were generated outside the repository, which is why the paths and
# file names differ.
TRAIN_PATH = "train.tsv"
DEV_IN_PATH = "in.tsv"
TEST_IN_PATH = "test_in.tsv"
DEV_OUT_PATH = "out.tsv"
TEST_OUT_PATH = "test_out.tsv"

# Only this many leading training rows are used. The features are converted to
# a dense array before fitting, so this is presumably a memory/time cap --
# TODO confirm the intended value.
TRAIN_ROW_LIMIT = 1500


def _read_tsv(path: str) -> pd.DataFrame:
    """Load a header-less, tab-separated file into a DataFrame.

    NOTE(review): pandas' default quote handling and blank-line skipping are
    left untouched here; confirm that the number of parsed rows matches the
    number of lines in the input files.
    """
    return pd.read_csv(path, sep="\t", header=None)


def _predict_lines(model: GaussianNB, vectorizer: CountVectorizer, texts) -> list:
    """Return the model's predictions for *texts* as newline-terminated strings.

    *texts* is vectorized with the already fitted *vectorizer* and converted to
    a dense array before being handed to *model*, mirroring how the model is
    trained.
    """
    features = vectorizer.transform(texts).toarray()
    # ``tolist()`` turns NumPy scalars into plain Python values before they
    # are formatted.
    labels = model.predict(features).tolist()
    return [f"{label}\n" for label in labels]


def _write_lines(path: str, lines: list) -> None:
    """Write the already newline-terminated *lines* to *path* as UTF-8 text."""
    with open(path, "w", encoding="UTF-8") as output_file:
        output_file.writelines(lines)


def main() -> None:
    """Run the whole pipeline: load data, train, predict, and write outputs."""
    # All three inputs are read up front so that a missing file fails before
    # any training work is done.
    train_frame = _read_tsv(TRAIN_PATH)
    dev_frame = _read_tsv(DEV_IN_PATH)
    test_frame = _read_tsv(TEST_IN_PATH)

    train_frame = train_frame.head(TRAIN_ROW_LIMIT)

    # In the training file column 0 holds the label and column 1 the text; in
    # the dev/test inputs the text is in column 0.
    vectorizer = CountVectorizer()
    train_features = vectorizer.fit_transform(train_frame[1])
    train_labels = train_frame[0]

    model = GaussianNB()
    model.fit(train_features.toarray(), train_labels)

    # Both prediction sets are computed before anything is written, so a
    # failure during prediction leaves no partial output behind.
    dev_lines = _predict_lines(model, vectorizer, dev_frame[0])
    test_lines = _predict_lines(model, vectorizer, test_frame[0])

    _write_lines(DEV_OUT_PATH, dev_lines)
    _write_lines(TEST_OUT_PATH, test_lines)


if __name__ == "__main__":
    main()