#!/usr/bin/env python
# coding: utf-8
"""Train a bag-of-words Multinomial Naive Bayes classifier and write predictions.

The script fits ``CountVectorizer`` + ``MultinomialNB`` on ``train/`` (one
document per line in ``in.tsv.xz``, one label per line in ``expected.tsv``)
and writes the predicted labels for ``dev-0/`` and ``test-A/`` to the
``out.tsv`` file in each of those directories.  All paths are relative to the
current working directory.
"""
import lzma
import sys
from typing import Iterable, List

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

TRAIN_IN = "train/in.tsv.xz"
TRAIN_EXPECTED = "train/expected.tsv"
DEV_IN = "dev-0/in.tsv.xz"
DEV_EXPECTED = "dev-0/expected.tsv"
DEV_OUT = "dev-0/out.tsv"
TEST_IN = "test-A/in.tsv.xz"
TEST_OUT = "test-A/out.tsv"


def _read_lines(path: str) -> List[str]:
    """Return all lines of *path* (line endings kept), decoded as UTF-8.

    Paths ending in ``.xz`` are decompressed transparently with ``lzma``.
    The file is always closed before returning.
    """
    opener = lzma.open if path.endswith(".xz") else open
    with opener(path, mode="rt", encoding="utf-8") as handle:
        return handle.readlines()


def _write_lines(path: str, lines: Iterable[str]) -> None:
    """Write *lines* to *path* verbatim as UTF-8 text and close the file.

    No separators are added: each item is expected to carry its own line
    ending (the predicted labels do, because the training labels are read
    with their trailing newline intact).
    """
    with open(path, mode="w", encoding="utf-8") as handle:
        handle.writelines(lines)


def main() -> None:
    """Fit the model on the training split and write dev-0 / test-A predictions."""
    train_texts = _read_lines(TRAIN_IN)
    train_labels = _read_lines(TRAIN_EXPECTED)
    dev_texts = _read_lines(DEV_IN)
    dev_labels = _read_lines(DEV_EXPECTED)
    test_texts = _read_lines(TEST_IN)

    # The vocabulary is learned from the training split only; dev and test
    # documents are projected onto that same vocabulary.
    vectorizer = CountVectorizer()
    train_counts = vectorizer.fit_transform(train_texts)
    dev_counts = vectorizer.transform(dev_texts)
    test_counts = vectorizer.transform(test_texts)

    classifier = MultinomialNB().fit(train_counts, train_labels)
    dev_predictions = classifier.predict(dev_counts)
    test_predictions = classifier.predict(test_counts)

    _write_lines(DEV_OUT, dev_predictions)
    _write_lines(TEST_OUT, test_predictions)

    # Report dev-0 accuracy on stderr so stdout and the output files are
    # unaffected.  Skipped when the gold file does not line up with the
    # inputs, so a malformed expected.tsv cannot make the run fail.
    if len(dev_labels) == len(dev_predictions):
        accuracy = accuracy_score(dev_labels, dev_predictions)
        print("dev-0 accuracy: {:.4f}".format(accuracy), file=sys.stderr)


if __name__ == "__main__":
    main()