"""Train a TF-IDF + multinomial naive Bayes text classifier and write predictions.

The training texts are read from ``./train/in.tsv`` and their labels from
``./train/expected.tsv`` (one example per line).  The fitted model is then
used to write ``out.tsv`` files for the ``./dev-0/`` and ``./test-A/``
directories.
"""

from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


def getData(path):
    """Return the lines of the text file at *path* as a list of strings.

    Each returned line keeps its trailing newline, exactly as produced by
    ``readlines()``.  The file is decoded as UTF-8 so the result does not
    depend on the platform's default encoding.
    """
    with open(path, encoding="utf-8") as source:
        return source.readlines()


trainInData = getData('./train/in.tsv')
trainExpData = getData('./train/expected.tsv')

# Keep the fitted encoder so predictions can be mapped back to the original
# labels.  Line terminators are stripped first; otherwise a final line without
# a trailing newline ("1") would be a different class from "1\n".
labelEncoder = LabelEncoder()
afterTransform = labelEncoder.fit_transform(
    [label.rstrip('\r\n') for label in trainExpData]
)

pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('naive-bayes', MultinomialNB()),
])
model = pipeline.fit(trainInData, afterTransform)


def getResult(path):
    """Predict labels for ``path + 'in.tsv'`` and write them to ``path + 'out.tsv'``.

    *path* is used as a plain string prefix, so a directory must be passed
    with its trailing separator (e.g. ``'./dev-0/'``).  One predicted label
    is written per line, in the same order as the input lines.
    """
    dataToPredict = getData(path + 'in.tsv')
    encodedPredictions = model.predict(dataToPredict)
    # The model was trained on integer codes; translate them back so the
    # output contains the labels as they appear in the training data.
    predictedLabels = labelEncoder.inverse_transform(encodedPredictions)
    with open(path + "out.tsv", "w", encoding="utf-8") as result:
        result.writelines(str(label) + '\n' for label in predictedLabels)


getResult('./dev-0/')
getResult('./test-A/')