43 lines
1.1 KiB
Python
43 lines
1.1 KiB
Python
from sklearn.naive_bayes import MultinomialNB
|
|
from sklearn.feature_extraction.text import CountVectorizer
|
|
from sklearn.feature_extraction.text import TfidfTransformer
|
|
import csv
|
|
|
|
|
|
def get_test_posts(path):
    """Read the post texts from a tab-separated file.

    Each line of the file is one record. Only the first tab-separated field
    (the post text) is kept; any further fields, such as the timestamp
    column, are ignored.

    Args:
        path: Path of the TSV file to read.

    Returns:
        A list holding the first field of every line, in file order. A blank
        line yields an empty string so the result stays aligned line-for-line
        with the file.
    """
    with open(path) as f:
        # Take the first column only, rather than unpacking exactly two
        # fields: rows with a missing or an extra column would otherwise
        # raise ValueError and abort the whole run.
        return [line.rstrip('\n').split('\t', 1)[0] for line in f]
|
|
|
|
|
|
def get_expected(path):
    """Load the labels stored one per line in the file at ``path``.

    For every line the trailing newline is dropped and all space characters
    are removed; the cleaned values are returned as a list in file order.
    """
    with open(path) as source:
        return [row.rstrip('\n').replace(" ", "") for row in source]
|
|
|
|
# Build the vocabulary from the training posts and turn them into a
# token-count matrix; count_vect is reused later to vectorize other files.
count_vect = CountVectorizer()
train_posts = get_test_posts("train/in.tsv")
X_train_counts = count_vect.fit_transform(train_posts)

# One label per training post, read from the parallel file.
y = get_expected("train/expected.tsv")

# fit() returns the estimator itself, so clf is the trained model.
clf = MultinomialNB().fit(X_train_counts, y)
|
|
|
|
def predict_posts(path, clf):
    """Classify the posts of one data directory and write the labels to disk.

    Reads ``<path>/in.tsv``, vectorizes the posts with the module-level
    ``count_vect`` and writes one predicted class per line to
    ``<path>/out.tsv``.

    Args:
        path: Directory containing ``in.tsv``; ``out.tsv`` is written there.
        clf: Classifier whose ``predict`` method is called on the vectorized
            posts.
    """
    X = count_vect.transform(get_test_posts(path + '/in.tsv'))
    classes = clf.predict(X)
    # newline='' is what the csv module requires for files it writes to: the
    # writer emits its own line terminator, and without this the text layer
    # would translate it again on platforms such as Windows.
    with open(path + "/out.tsv", 'wt', newline='') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        # Wrap each label in a list: a row is an iterable of fields, so a bare
        # string would be split into one tab-separated column per character.
        tsv_writer.writerows([label] for label in classes)
|
|
|
|
# Write out.tsv for each of the two data directories, in this order.
for split_dir in ("dev-0", "test-A"):
    predict_posts(split_dir, clf)
|
|
|