paranormal-or-skeptic-ISI-p.../runNB.ipynb
2022-05-25 22:54:49 +02:00

3.0 KiB

#!/usr/bin/env python
# coding: utf-8

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import  accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import lzma

# Train a bag-of-words Multinomial Naive Bayes classifier on the train split,
# report its accuracy on dev-0, and write predictions for dev-0 and test-A.

# Load every split. readlines() keeps each line's trailing newline, so the
# label strings (and therefore the predicted labels) carry their own line
# endings. Context managers close each handle as soon as it has been read.
with lzma.open("train/in.tsv.xz", mode='rt', encoding='utf-8') as src:
    X_train = src.readlines()
with open('train/expected.tsv', encoding='utf-8') as src:
    y_train = src.readlines()
with lzma.open("dev-0/in.tsv.xz", mode='rt', encoding='utf-8') as src:
    X_dev0 = src.readlines()
with open("dev-0/expected.tsv", "r", encoding='utf-8') as src:
    y_expected_dev0 = src.readlines()
with lzma.open("test-A/in.tsv.xz", mode='rt', encoding='utf-8') as src:
    X_test = src.readlines()

# Fit the vocabulary on the training text only; dev-0 and test-A are mapped
# onto that same vocabulary with transform().
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_dev0_counts = count_vect.transform(X_dev0)
X_test_counts = count_vect.transform(X_test)
clf = MultinomialNB().fit(X_train_counts, y_train)

y_predicted_dev0_MNB = clf.predict(X_dev0_counts)
y_predicted_test_MNB = clf.predict(X_test_counts)
accuracy_dev0_MNB = accuracy_score(y_expected_dev0, y_predicted_dev0_MNB)
print(f"Accuracy dev0: {accuracy_dev0_MNB}")
# Output recorded in the original notebook run:
# Accuracy dev0: 0.8025417298937785

# writelines() adds no separators; the predicted labels are the training label
# strings, which already end with the newline kept by readlines() above.
# The with-blocks guarantee the output is flushed and the files are closed.
with open("dev-0/out.tsv", mode='w', encoding='utf-8') as dst:
    dst.writelines(y_predicted_dev0_MNB)
with open("test-A/out.tsv", mode='w', encoding='utf-8') as dst:
    dst.writelines(y_predicted_test_MNB)