# paranormal-or-skeptic/solution.py
# 2020-03-29 00:27:17 +01:00
#
# 43 lines
# 1.2 KiB
# Python

import pandas as pd
import numpy as np
import csv
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
def read_texts(path):
    """Return the "text" column of the tab-separated file at *path*.

    The file is read without a header row; its two columns are named
    "text" and "date", and quote characters are treated as ordinary text.
    """
    frame = pd.read_csv(
        path,
        delimiter="\t",
        header=None,
        names=["text", "date"],
        quoting=csv.QUOTE_NONE,
    )
    # An empty text field is parsed as NaN, which CountVectorizer rejects;
    # treat it as an empty document instead.
    return frame["text"].fillna("")


def write_predictions(path, labels):
    """Write each predicted label on its own line to the file at *path*."""
    with open(path, "w", encoding="utf-8") as out:
        # str() keeps this working when the labels were parsed as numbers.
        out.writelines(str(label) + "\n" for label in labels)


count_vect = CountVectorizer()

# Load the training data.
texts = read_texts("train/in.tsv")
# Flatten the single-column frame: the classifier expects a 1-D target and
# warns when handed an (n, 1) column vector.
y = pd.read_csv("train/expected.tsv", header=None).values.ravel()

# Train a bag-of-words multinomial Naive Bayes classifier.
X_train_counts = count_vect.fit_transform(texts)
clf = MultinomialNB().fit(X_train_counts, y)
print(texts[0])
print(len(texts))
print(len(y))

# Predict labels for both evaluation sets using the fitted vocabulary.
dev0 = read_texts("dev-0/in.tsv")
testA = read_texts("test-A/in.tsv")
dev0_new_counts = count_vect.transform(dev0)
testA_new_counts = count_vect.transform(testA)
predicted_dev0 = clf.predict(dev0_new_counts)
predicted_testA = clf.predict(testA_new_counts)
print(len(dev0))
print(len(predicted_dev0))

write_predictions("dev-0/out.tsv", predicted_dev0)
write_predictions("test-A/out.tsv", predicted_testA)