#!/usr/bin/env python
# coding: utf-8
"""Train a TF-IDF + multinomial naive Bayes text classifier and write predictions.

The script reads at most the first 3000 rows of ``train/in.tsv.xz`` and
``train/expected.tsv``, fits the pipeline on them, and then writes one
predicted label per line to ``dev-0/out.tsv`` and ``test-A/out.tsv``.
"""

import csv

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder


def _read_tsv(path, **read_csv_kwargs):
    """Read a headerless, unquoted, tab-separated file into a DataFrame.

    Extra keyword arguments are forwarded unchanged to ``pandas.read_csv``.
    """
    return pd.read_csv(
        path,
        header=None,
        quoting=csv.QUOTE_NONE,
        sep='\t',
        **read_csv_kwargs,
    )


def _rows_to_text(frame):
    """Return one string per row of *frame*: its cells joined with tabs.

    Every cell is converted with ``str`` first, so missing values appear as
    the text ``nan``.
    """
    return ['\t'.join(map(str, row)) for row in frame.values]


def _write_predictions(path, labels):
    """Write *labels* to *path*, one per line, without a trailing newline.

    A plain text write is used instead of ``ndarray.tofile`` because decoded
    labels may be an object-dtype array, which ``tofile`` refuses to write.
    """
    # newline='\n' keeps the separator a bare LF on every platform.
    with open(path, 'w', encoding='utf-8', newline='\n') as out:
        out.write('\n'.join(map(str, labels)))


steps = make_pipeline(TfidfVectorizer(), MultinomialNB())

# training
# on_bad_lines='warn' skips malformed rows and reports them; it replaces
# error_bad_lines=False, which was removed in pandas 2.0.
# NOTE(review): a row skipped in only one of the two training files would
# shift inputs relative to labels -- confirm the training data has no bad rows.
all_train_data_in = _read_tsv(
    'train/in.tsv.xz',
    compression='xz',
    on_bad_lines='warn',
    nrows=3000,
)
train_data_ex = _read_tsv(
    'train/expected.tsv',
    on_bad_lines='warn',
    nrows=3000,
)
train_data_in = _rows_to_text(all_train_data_in)

# Keep the fitted encoder so predictions can be mapped back to the original
# labels; ravel() turns the (n, 1) column into the 1-D array the encoder expects.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_data_ex.values.ravel())
nb = steps.fit(train_data_in, train_labels)

# dev0
all_dev0_data = _read_tsv('dev-0/in.tsv.xz', compression='xz')
dev0_data = _rows_to_text(all_dev0_data)
dev0_y = label_encoder.inverse_transform(nb.predict(dev0_data))
# write the results
_write_predictions('dev-0/out.tsv', dev0_y)

# test-A
all_testA_data = _read_tsv('test-A/in.tsv.xz', compression='xz')
testA_data = _rows_to_text(all_testA_data)
testA_y = label_encoder.inverse_transform(nb.predict(testA_data))
# write the results
_write_predictions('test-A/out.tsv', testA_y)