"""Train a TF-IDF + multinomial naive Bayes text classifier and write predictions.

Reads the training data from ``./train``, fits one model, and writes an
``out.tsv`` file into each of the ``dev-0`` and ``test-A`` directories.
"""

import lzma

import numpy
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

TEST_A = "test-A"
DEV_0 = "dev-0"
TRAIN_IN = "./train/in.tsv.xz"
TRAIN_EXPECTED = "./train/expected.tsv"


def open_file(path):
    """Return all lines of the plain-text file at *path*.

    Line terminators are kept, exactly as ``readlines`` returns them.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(path, encoding="utf-8") as file:
        return file.readlines()


def open_xz(path):
    """Return all lines of the xz-compressed text file at *path*.

    Line terminators are kept, exactly as ``readlines`` returns them.
    """
    with lzma.open(path, "rt", encoding="utf-8") as file:
        return file.readlines()


def get_model(train_in, train_expected):
    """Fit and return a TF-IDF + MultinomialNB pipeline.

    Args:
        train_in: Sequence of training documents (one string per sample).
        train_expected: Sequence of labels, parallel to *train_in*.

    Returns:
        The fitted scikit-learn pipeline.
    """
    # NOTE(review): the labels are replaced by LabelEncoder class indices and
    # the encoder is discarded, so the model predicts those indices rather
    # than the original label values. The two coincide only when the original
    # labels already sort to 0..k-1 (e.g. "0"/"1") -- confirm for this data.
    label_encoder = preprocessing.LabelEncoder()
    encoded_labels = label_encoder.fit_transform(train_expected)
    pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())
    return pipeline.fit(train_in, encoded_labels)


def _predict_with_model(model, documents):
    """Return the predictions of an already fitted *model* for *documents*."""
    return model.predict(documents)


def predict(train_test_in_path, train_in_path, train_expected_path):
    """Train a model and predict labels for one data set directory.

    Args:
        train_test_in_path: Directory containing the ``in.tsv.xz`` to classify.
        train_in_path: Path of the xz-compressed training documents.
        train_expected_path: Path of the plain-text training labels.

    Returns:
        Array of predicted (encoded) labels, one per input line.
    """
    train_in = open_xz(train_in_path)
    train_expected = open_file(train_expected_path)
    train_test_in = open_xz(train_test_in_path + '/in.tsv.xz')
    model = get_model(train_in, train_expected)
    return _predict_with_model(model, train_test_in)


def save_result(path, prediction):
    """Write *prediction* as integers, one per line, to ``<path>/out.tsv``."""
    numpy.savetxt(path + "/out.tsv", prediction, '%d')


def main():
    """Fit the model once and write predictions for both evaluation sets."""
    # Load every input up front so a missing file fails before the fit.
    train_in = open_xz(TRAIN_IN)
    train_expected = open_file(TRAIN_EXPECTED)
    dev_0_in = open_xz(DEV_0 + '/in.tsv.xz')
    test_a_in = open_xz(TEST_A + '/in.tsv.xz')

    # The training data is the same for both sets, so one fit is enough;
    # calling predict() per set would re-read and re-train each time.
    model = get_model(train_in, train_expected)
    prediction_dev_0 = _predict_with_model(model, dev_0_in)
    prediction_test_a = _predict_with_model(model, test_a_in)

    save_result(DEV_0, prediction_dev_0)
    save_result(TEST_A, prediction_test_a)


if __name__ == '__main__':
    main()