paranormal-or-skeptic/readymadesolution.py
2020-03-30 22:59:26 +02:00

43 lines
1.1 KiB
Python

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import csv
def get_test_posts(path):
    """Read the post texts from a tab-separated input file.

    Each line holds the post text in its first column; the files this
    script reads carry a timestamp in a second column, which is ignored.

    Args:
        path: Path of the TSV file to read.

    Returns:
        A list with one text string per line of the file, in file order.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding="utf-8") as f:
        # partition() keeps everything before the first tab, so a line with
        # a missing timestamp or with extra columns no longer raises
        # ValueError the way a two-name unpacking of split('\t') does.
        return [line.rstrip('\n').partition('\t')[0] for line in f]
def get_expected(path):
    """Load the expected class labels from *path*, one label per line.

    The trailing newline is dropped and every space character in the line
    is removed (not only leading or trailing ones).

    Args:
        path: Path of the file holding one label per line.

    Returns:
        A list of label strings in file order.
    """
    with open(path) as handle:
        return [row.rstrip('\n').replace(" ", "") for row in handle]
# Build bag-of-words count features from the training posts. The fitted
# vectorizer is kept at module level because predict_posts() reuses it to
# transform the evaluation sets with the same vocabulary.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(get_test_posts("train/in.tsv"))
# Training labels, read line by line; presumably aligned with the lines of
# train/in.tsv -- TODO confirm against the data set.
y = get_expected("train/expected.tsv")
# Fit a multinomial Naive Bayes classifier directly on the term counts.
clf = MultinomialNB()
clf.fit(X_train_counts, y)
def predict_posts(path, clf):
    """Classify the posts in ``<path>/in.tsv`` and write ``<path>/out.tsv``.

    The posts are vectorized with the module-level ``count_vect`` and the
    predicted label of each post is written on its own line of the output
    file, in input order.

    Args:
        path: Directory containing ``in.tsv``; ``out.tsv`` is created (or
            overwritten) in the same directory.
        clf: Classifier whose ``predict`` method is applied to the
            vectorized posts.
    """
    X = count_vect.transform(get_test_posts(path+'/in.tsv'))
    classes = clf.predict(X)
    # newline='' is what the csv module requires of files it writes to;
    # without it the writer's "\r\n" row terminator is translated again on
    # platforms whose line separator is not "\n".
    with open(path+"/out.tsv", 'wt', newline='') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        # Each row must be a sequence of fields. Passing the bare label made
        # the writer iterate over it, splitting a multi-character label into
        # one column per character and failing on non-iterable labels.
        tsv_writer.writerows([label] for label in classes)
# Write predictions for the "dev-0" and "test-A" directories with the
# classifier fitted on the training data.
predict_posts("dev-0", clf)
predict_posts("test-A", clf)