52 lines
1.4 KiB
Python
52 lines
1.4 KiB
Python
import numpy
|
|
import lzma
|
|
from sklearn.naive_bayes import MultinomialNB
|
|
from sklearn import preprocessing
|
|
from sklearn.pipeline import make_pipeline
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
|
|
# Dataset directories: predict() reads "<dir>/in.tsv.xz" from them and
# save_result() writes "<dir>/out.tsv" into them.
TEST_A = "test-A"
DEV_0 = "dev-0"

# Training data: xz-compressed inputs (read with open_xz) and the
# plain-text expected labels (read with open_file).
TRAIN_IN = "./train/in.tsv.xz"
TRAIN_EXPECTED = "./train/expected.tsv"
|
|
|
|
|
|
def open_file(path):
    """Return every line of the text file at ``path`` as a list of strings.

    Line endings are kept, exactly as ``readlines()`` yields them.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's default encoding, so the same data file is read the same
    way on every machine.
    """
    with open(path, encoding="utf-8") as file:
        return file.readlines()
|
|
|
|
|
|
def open_xz(path):
    """Return every line of the xz-compressed text file at ``path``.

    The file is opened in text mode through ``lzma.open`` and the lines
    are returned as a list of strings with their line endings kept.

    The decompressed bytes are decoded as UTF-8 explicitly instead of
    relying on the platform's default encoding, so results do not depend
    on the locale of the machine running the script.
    """
    with lzma.open(path, 'rt', encoding='utf-8') as file:
        return file.readlines()
|
|
|
|
|
|
def get_model(train_in, train_expected):
    """Fit a TF-IDF + multinomial naive Bayes pipeline on the training data.

    ``train_expected`` is first passed through a fresh ``LabelEncoder``;
    the encoded labels are what the pipeline is trained against.

    Returns the result of ``pipeline.fit(...)``.
    """
    encoded_labels = preprocessing.LabelEncoder().fit_transform(train_expected)
    classifier = make_pipeline(TfidfVectorizer(), MultinomialNB())
    return classifier.fit(train_in, encoded_labels)
|
|
|
|
|
|
def predict(train_test_in_path, train_in_path, train_expected_path):
    """Train a model and return its predictions for one dataset directory.

    Reads the training inputs (xz) and expected labels (plain text), then
    the inputs to classify from ``train_test_in_path + '/in.tsv.xz'``.
    A model is built with ``get_model`` and its ``predict`` output for
    those inputs is returned.
    """
    # All three files are read before any fitting starts.
    documents = open_xz(train_in_path)
    labels = open_file(train_expected_path)
    unseen = open_xz(train_test_in_path + '/in.tsv.xz')

    return get_model(documents, labels).predict(unseen)
|
|
|
|
|
|
def save_result(path, prediction):
    """Write ``prediction`` to ``<path>/out.tsv`` using the ``%d`` format."""
    destination = path + "/out.tsv"
    numpy.savetxt(destination, prediction, fmt='%d')
|
|
|
|
|
|
if __name__ == '__main__':
    # Fit the classifier once and reuse it for every dataset. Calling
    # predict() per dataset would re-read the training files and re-fit
    # an identical model each time.
    model = get_model(open_xz(TRAIN_IN), open_file(TRAIN_EXPECTED))

    # Compute all predictions before writing anything, so a failure while
    # predicting leaves no partially written set of output files.
    predictions = [
        (dataset, model.predict(open_xz(dataset + '/in.tsv.xz')))
        for dataset in (DEV_0, TEST_A)
    ]

    # Each prediction goes to "<dataset>/out.tsv".
    for dataset, prediction in predictions:
        save_result(dataset, prediction)
|