import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
import csv

# Train a TF-IDF + Multinomial Naive Bayes text classifier on train/train.tsv
# and write one predicted label id per line of test-A/in.tsv to test-A/out.tsv.

# Maps the raw training labels to consecutive integer ids for the classifier.
prep = preprocessing.LabelEncoder()

# Training rows are tab-separated: field 0 is used as the label, field 1 as
# the document text.
X = []
Y = []
with open("train/train.tsv", encoding="utf-8") as file_train:
    for row in csv.reader(file_train, delimiter='\t'):
        if not row:
            # csv.reader yields [] for a blank line; indexing it would raise
            # IndexError, and there is nothing to learn from it anyway.
            continue
        Y.append(row[0])
        X.append(row[1])

Y = prep.fit_transform(Y)

# Each line of the test file is classified as one document. The training file
# is already closed here; it is not needed for the rest of the script.
with open("test-A/in.tsv", encoding="utf-8") as file_in:
    work_file_lines = file_in.readlines()

MNB = make_pipeline(TfidfVectorizer(use_idf=True), MultinomialNB())
model = MNB.fit(X, Y)

y_predict = np.array(model.predict(work_file_lines))

# NOTE(review): the values written below are the LabelEncoder ids, not the
# original label strings. If the consumer of out.tsv expects the original
# labels, write prep.inverse_transform(y_predict) instead -- confirm.
#
# Write the predictions directly, one per line with no trailing newline.
# The file is written once inside a `with` block so it is flushed and closed
# before the script ends; no formatting through np.array2string (which pads
# and wraps with spaces) and no second clean-up pass over the file is needed.
with open("test-A/out.tsv", 'w', encoding="utf-8") as file_out:
    file_out.write("\n".join(str(label) for label in y_predict.flatten()))