"""Train a TF-IDF + multinomial naive Bayes text classifier and write predictions.

Training data is read from ``train/in.tsv.xz`` (one document per line) and
``train/expected.tsv`` (one label per line).  For each catalog handled by
``main`` the documents in ``<catalog>/in.tsv.xz`` are classified and the
predictions are written to ``<catalog>/out.tsv``.
"""
import functools
import lzma
from typing import List

import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Shared label encoder; it is (re)fitted by every call to train_model().
le = preprocessing.LabelEncoder()


def open_file_xz(file: str) -> List[str]:
    """Return all lines of the xz-compressed text file *file*.

    Lines keep their trailing newline, exactly as ``readlines`` yields them.
    The file is decoded as UTF-8 rather than with the platform default, so
    the result does not depend on the machine's locale
    (assumes the data is UTF-8 encoded -- TODO confirm).
    """
    with lzma.open(file, mode="rt", encoding="utf-8") as f:
        return f.readlines()


def open_file(file: str) -> List[str]:
    """Return all lines of the plain text file *file*.

    Lines keep their trailing newline.  Decoded as UTF-8 for the same reason
    as in ``open_file_xz``.
    """
    with open(file, encoding="utf-8") as f:
        return f.readlines()


def train_model(train_in, train_exp):
    """Fit a TF-IDF + MultinomialNB pipeline and return it.

    Args:
        train_in: Training documents, one string per sample.
        train_exp: Labels aligned with *train_in*.

    Returns:
        The fitted scikit-learn pipeline.  It predicts the integer codes
        assigned by the module-level ``le`` encoder, not the raw labels.

    Side effects:
        Refits the module-level ``le`` on *train_exp*.
    """
    encoded_labels = le.fit_transform(train_exp)
    pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())
    return pipeline.fit(train_in, encoded_labels)


@functools.lru_cache(maxsize=None)
def _trained_model():
    """Load the training set and fit the model once per process.

    Cached because every catalog is scored with the same model; refitting it
    for each catalog only repeats identical work.  Changes made to the
    training files after the first call are therefore not picked up.
    """
    train_in = open_file_xz("train/in.tsv.xz")
    train_exp = open_file("train/expected.tsv")
    return train_model(train_in, train_exp)


def make_prediction(catalog: str) -> None:
    """Classify ``<catalog>/in.tsv.xz`` and write ``<catalog>/out.tsv``.

    Args:
        catalog: Directory containing ``in.tsv.xz``; ``out.tsv`` is written
            into the same directory, one integer per line.

    NOTE(review): the written values are the ``LabelEncoder`` codes, not the
    original label strings.  They coincide with the labels only if the
    encoder happens to map each label to its own numeric value -- confirm
    this holds for the labels in ``train/expected.tsv``.
    """
    to_be_predict = open_file_xz(catalog + "/in.tsv.xz")
    prediction = _trained_model().predict(to_be_predict)
    # A 1-D array is written one value per line; no delimiter is involved.
    np.savetxt(catalog + "/out.tsv", prediction, fmt="%d")


def main() -> None:
    """Produce predictions for the development and test catalogs."""
    for catalog in ("dev-0", "test-A"):
        make_prediction(catalog)


if __name__ == "__main__":
    main()