"""Train a bag-of-words Naive Bayes classifier and label two data sets.

At import/run time the script fits a ``CountVectorizer`` and a
``MultinomialNB`` on ``train/in.tsv`` / ``train/expected.tsv`` and then
writes predictions to ``dev-0/out.tsv`` and ``test-A/out.tsv``.
"""
import csv

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB


def get_test_posts(path):
    """Return the first column of every line in a two-column TSV file.

    Despite the name, this is used for both training and evaluation input.

    Args:
        path: Path to a file whose lines are ``<text>\\t<second field>``.

    Returns:
        A list with one text string per input line, in file order.

    Raises:
        ValueError: If a line does not contain exactly two tab-separated
            fields.
    """
    posts = []
    with open(path) as f:
        for line in f:
            # Strict two-way unpacking is kept on purpose: a malformed line
            # fails loudly instead of silently misaligning posts and labels.
            text, _timestamp = line.rstrip('\n').split('\t')
            posts.append(text)
    return posts


def get_expected(path):
    """Return one class label per line of ``path``.

    The trailing newline is stripped and every space character is removed
    from each line.

    Args:
        path: Path to a file with one label per line.

    Returns:
        A list of label strings, in file order.
    """
    with open(path) as f:
        return [line.rstrip('\n').replace(" ", "") for line in f]


# The vectorizer is fitted on the training posts only; predict_posts reuses
# it so that evaluation data is mapped onto the same vocabulary.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(get_test_posts("train/in.tsv"))
y = get_expected("train/expected.tsv")

clf = MultinomialNB()
clf.fit(X_train_counts, y)


def predict_posts(path, clf, vectorizer=None):
    """Classify the posts in ``path/in.tsv`` and write ``path/out.tsv``.

    Args:
        path: Directory containing ``in.tsv``; ``out.tsv`` is written there.
        clf: A fitted classifier exposing ``predict``.
        vectorizer: A fitted vectorizer exposing ``transform``. Defaults to
            the module-level ``count_vect``.

    The output file holds one predicted label per line.
    """
    if vectorizer is None:
        vectorizer = count_vect

    X = vectorizer.transform(get_test_posts(path + '/in.tsv'))
    classes = clf.predict(X)

    # newline='' lets the csv module control line endings itself, as its
    # documentation requires; otherwise some platforms emit an extra '\r'.
    with open(path + "/out.tsv", 'wt', newline='') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        # Each label must be wrapped in a list: writerow() iterates its
        # argument, so a bare string would be split into one column per
        # character.
        tsv_writer.writerows([label] for label in classes)


predict_posts("dev-0", clf)
predict_posts("test-A", clf)