"""Train a TF-IDF + multinomial naive Bayes text classifier and write predictions.

Running the module as a script fits the model on ``train/in.tsv`` and
``train/expected.tsv``, then writes predictions for ``dev-0/in.tsv`` and
``test-A/in.tsv`` next to those inputs as ``out.tsv``.
"""

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# Unused below; retained so the module's import surface is unchanged.
from sklearn.preprocessing import LabelEncoder  # noqa: F401


def _read_lines(path):
    """Return the lines of the UTF-8 text file *path* without trailing newlines."""
    with open(path, 'r', encoding='utf-8') as f:
        # Strip only the line terminator: a final line lacking a newline must
        # compare equal to the same value on a terminated line.
        return [line.rstrip('\n') for line in f]


def train_model(train_in, train_expected):
    """Fit a TF-IDF + multinomial naive Bayes pipeline.

    Args:
        train_in: Path to a UTF-8 file with one training document per line.
        train_expected: Path to a UTF-8 file with one label per line, aligned
            line-by-line with ``train_in``.

    Returns:
        The fitted ``Pipeline``. It is trained on the label strings as read
        from ``train_expected`` (newline stripped), so its ``predict`` output
        is the original labels rather than encoded class indices.

    Raises:
        ValueError: If the two files do not contain the same number of lines.
    """
    labels = _read_lines(train_expected)
    documents = _read_lines(train_in)

    # Fail before the (comparatively expensive) vectorisation step rather than
    # letting scikit-learn report the mismatch afterwards.
    if len(documents) != len(labels):
        raise ValueError(
            "%s has %d lines but %s has %d lines"
            % (train_in, len(documents), train_expected, len(labels))
        )

    pipeline = Pipeline(steps=[
        ('tfidf', TfidfVectorizer()),
        ('naive-bayes', MultinomialNB()),
    ])
    # The classifier is given the raw label strings; no separate encoder is
    # fitted, so there is no index-to-label mapping to lose.
    return pipeline.fit(documents, labels)


def predict(model, in_file, out_file):
    """Write ``model``'s prediction for every line of ``in_file`` to ``out_file``.

    Args:
        model: Object exposing ``predict(list_of_str)``, e.g. the pipeline
            returned by ``train_model``.
        in_file: Path to a UTF-8 file with one document per line.
        out_file: Destination path; one prediction is written per line.
    """
    documents = _read_lines(in_file)
    predictions = model.predict(documents)
    # '%s' renders integer and string predictions alike; UTF-8 matches the
    # encoding used for reading and avoids savetxt's latin1 default.
    np.savetxt(out_file, predictions, fmt='%s', encoding='utf-8')


def main():
    """Train on the ``train`` split and write predictions for ``dev-0`` and ``test-A``."""
    model = train_model("train/in.tsv", "train/expected.tsv")
    predict(model, "dev-0/in.tsv", "dev-0/out.tsv")
    predict(model, "test-A/in.tsv", "test-A/out.tsv")


if __name__ == '__main__':
    main()