"""Train a TF-IDF + multinomial naive Bayes classifier and write predictions.

Reads the labelled training set from ``train/train.tsv.gz`` (column 0 is the
label, column 1 is the text), fits the model, then writes one predicted label
per line for the ``dev-0`` and ``test-A`` input files.
"""
import csv
import gzip

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline


def _read_inputs(path):
    """Return the first tab-separated column of ``path`` as a Series."""
    frame = pd.read_csv(
        path,
        header=None,
        # A real tab character; the previous value '/t' was the literal
        # two-character string "/t", which split documents wherever that
        # text happened to occur and silently truncated them.
        sep='\t',
        engine='python',
        # Treat quote characters as ordinary text so a stray '"' cannot
        # merge several input lines into one row.
        quoting=csv.QUOTE_NONE,
    )
    return frame[0]


def predictions(zb, path_out, model):
    """Predict a label for every item of ``zb`` and write one per line.

    Args:
        zb: Collection of inputs passed unchanged to ``model.predict``.
        path_out: Path of the text file to create or overwrite.
        model: Fitted estimator exposing a ``predict`` method.
    """
    res = model.predict(zb)
    with open(path_out, 'wt') as out:
        out.writelines(str(label) + '\n' for label in res)


with gzip.open('train/train.tsv.gz', 'rb') as file:
    try:
        # ``on_bad_lines='warn'`` skips malformed rows and reports them,
        # matching the old ``error_bad_lines=False`` behaviour.
        data = pd.read_csv(file, sep='\t', header=None, on_bad_lines='warn')
    except TypeError:
        # pandas < 1.3 does not know ``on_bad_lines``; fall back to the
        # keyword that was removed in pandas 2.0.
        file.seek(0)
        data = pd.read_csv(file, sep='\t', header=None, error_bad_lines=False)

# training data
y_train = data[0]
x_train = data[1]

# dev
x_dev = _read_inputs('dev-0/in.tsv')

# test
x_test_A = _read_inputs('test-A/in.tsv')

# model
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(x_train, y_train)

predictions(x_dev, 'dev-0/out.tsv', model)
predictions(x_test_A, 'test-A/out.tsv', model)