"""Train a TF-IDF + multinomial naive Bayes classifier and write predictions."""

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder


def _read_lines(path):
    """Return the lines of the text file at *path*, line endings included.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.
    """
    with open(path, encoding="utf-8") as f:
        return f.readlines()


def Create_model(X_tsv, Y_tsv):
    """Fit a TF-IDF + MultinomialNB pipeline on line-aligned text files.

    Args:
        X_tsv: Path to the input file; each line is one document.
        Y_tsv: Path to the label file; each line is the label of the
            document on the same line of ``X_tsv``.

    Returns:
        The fitted scikit-learn pipeline.

    NOTE: labels are integer-encoded with a ``LabelEncoder`` that is not
    returned, so the model predicts integer codes, not the original label
    strings.
    """
    documents = _read_lines(X_tsv)
    # Strip the line terminator so that a final line lacking a trailing
    # newline is not treated as a separate class (e.g. "1" vs "1\n").
    labels = [line.rstrip("\n") for line in _read_lines(Y_tsv)]
    encoded_labels = LabelEncoder().fit_transform(labels)

    pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())
    return pipeline.fit(documents, encoded_labels)


def predict(model, X_tsv, file_name):
    """Predict a class for every line of ``X_tsv`` and save the results.

    Args:
        model: Fitted estimator exposing ``predict`` (as returned by
            ``Create_model``).
        X_tsv: Path to the input file; each line is one document.
        file_name: Destination passed to ``np.savetxt``; one integer
            prediction is written per line.
    """
    documents = _read_lines(X_tsv)
    prediction = model.predict(documents)
    np.savetxt(file_name, prediction, fmt='%d')


def main():
    """Train on the ``train`` split and write predictions for dev and test."""
    model = Create_model("train/in.tsv", "train/expected.tsv")
    predict(model, "dev-0/in.tsv", "dev-0/out.tsv")
    predict(model, "test-A/in.tsv", "test-A/out.tsv")


if __name__ == '__main__':
    main()