"""Train a TF-IDF + multinomial naive Bayes classifier and write predictions.

Reads one document per line from ``<split>/in.tsv`` and one label per line
from ``train/expected.tsv``, fits the pipeline on the training split, and
writes one predicted label per line to ``<split>/out.tsv`` for the ``train``,
``dev-0`` and ``test-A`` splits.
"""

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder


def _read_lines(path):
    """Return the lines of the text file at *path* without trailing newlines.

    Dropping the newline matters for labels: otherwise a final line that
    lacks a newline would be a different string from every other occurrence
    of the same label and would be encoded as a separate class.
    """
    with open(path, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]


def _write_predictions(path, labels):
    """Write each item of *labels* on its own line to the file at *path*."""
    with open(path, 'w', encoding='utf-8') as writer:
        writer.writelines(str(label) + '\n' for label in labels)


# Load every input up front so a missing file fails before any training work.
data_train_X = _read_lines('train/in.tsv')
data_train_Y = _read_lines('train/expected.tsv')
data_dev_X = _read_lines('dev-0/in.tsv')
data_test_X = _read_lines('test-A/in.tsv')

# Keep the fitted encoder: it is needed later to map the predicted class
# indices back to the label strings found in train/expected.tsv.
label_encoder = LabelEncoder()
data_train_Y = label_encoder.fit_transform(data_train_Y)

model = Pipeline(steps=[('tfidf', TfidfVectorizer()), ('bayes', MultinomialNB())])
clf = model.fit(data_train_X, data_train_Y)

for split, documents in (
    ('train', data_train_X),
    ('dev-0', data_dev_X),
    ('test-A', data_test_X),
):
    # Decode indices to the original labels so the output uses the same
    # vocabulary as the expected file rather than encoder-internal integers.
    predicted = label_encoder.inverse_transform(clf.predict(documents))
    _write_predictions(split + '/out.tsv', predicted)