paranormal-or-skeptic-ISI-p.../script.py
2021-05-31 00:54:42 +02:00

39 lines
1.2 KiB
Python

import numpy
import lzma
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
# Shared text-classification pipeline: TF-IDF features fed into a
# multinomial naive Bayes classifier.
PIPELINE = make_pipeline(
    TfidfVectorizer(),
    MultinomialNB(),
)
# Shared encoder that turns the raw training labels into integer class codes.
LABEL_ENC = preprocessing.LabelEncoder()
def get_file(path):
    """Return every line of the UTF-8 text file at *path* as a list.

    Line endings are kept on each element, exactly as the file iterator
    yields them.
    """
    with open(path, encoding='utf-8') as handle:
        # Iterating a text file yields the same items as readlines().
        return list(handle)
def get_xz(path):
    """Return every line of the LZMA-compressed UTF-8 text file at *path*.

    The file is opened in text mode through ``lzma.open``; each returned
    element keeps its line ending.
    """
    with lzma.open(path, 'rt', encoding='utf-8') as handle:
        # Iterating the text wrapper yields the same items as readlines().
        return list(handle)
def get_model(train_in, train_expected):
    """Fit the shared pipeline on the training data and return it.

    Args:
        train_in: Training documents passed to ``PIPELINE.fit``.
        train_expected: Raw labels, one per training document.

    Returns:
        Whatever ``PIPELINE.fit`` returns (the fitted pipeline).

    Note:
        Both module-level objects are refitted on every call:
        ``LABEL_ENC`` on the labels and ``PIPELINE`` on the encoded data.
    """
    encoded_labels = LABEL_ENC.fit_transform(train_expected)
    return PIPELINE.fit(train_in, encoded_labels)
def do_prediction(path_a, path_b, train_expected_path):
    """Train on the training set and write predictions for one data split.

    Args:
        path_a: Directory of the split to score. ``in.tsv.xz`` is read from
            it and ``out.tsv`` is written into it.
        path_b: Path to the xz-compressed training inputs.
        train_expected_path: Path to the plain-text training labels.

    Note:
        The values written are the integer codes produced by the label
        encoder inside ``get_model``, formatted with ``%d``; they are not
        mapped back to the original label strings.
    """
    # Load the training data first, in the same order as before.
    training_docs = get_xz(path_b)
    training_labels = get_file(train_expected_path)

    # Documents of the split that is being scored.
    split_docs = get_xz(path_a + '/in.tsv.xz')

    classifier = get_model(training_docs, training_labels)
    predicted_codes = classifier.predict(split_docs)

    output_path = path_a + "/out.tsv"
    numpy.savetxt(output_path, predicted_codes, fmt='%d')
if __name__ == '__main__':
    # Score both evaluation splits against the same training files; each
    # call reloads the training data and refits the model.
    for split_dir in ("dev-0", "test-A"):
        do_prediction(split_dir, "./train/in.tsv.xz", "./train/expected.tsv")