2022-05-17 22:20:36 +02:00
|
|
|
import pandas as pd
|
|
|
|
import numpy as np
|
|
|
|
|
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
from sklearn.naive_bayes import MultinomialNB
|
|
|
|
from sklearn.pipeline import make_pipeline
|
|
|
|
|
|
|
|
|
|
|
|
# Load the tab-separated training set. The file has no header row, so the
# columns are addressed by position.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` is the documented replacement and keeps the previous
# behaviour: malformed rows are dropped and reported with a warning.
data = pd.read_csv(
    'train/train.tsv',
    sep='\t',
    header=None,
    on_bad_lines='warn',
)

# Column 1 is the model input that is later passed to `model.fit`.
X = data[1]
|
|
|
|
|
|
|
|
# Read the dev-0 inputs, one document per line (readlines() keeps the line
# endings), and wrap them in a Series for prediction.
with open('dev-0/in.tsv', 'r', encoding='utf8') as dev_file:
    dev_lines = dev_file.readlines()

Xdev = pd.Series(dev_lines)
|
|
|
|
|
|
|
|
# Read the test-A inputs, one document per line (readlines() keeps the line
# endings), and wrap them in a Series for prediction.
with open('test-A/in.tsv', 'r', encoding='utf8') as test_file:
    test_lines = test_file.readlines()

Xtest = pd.Series(test_lines)
|
|
|
|
|
|
|
|
|
|
|
|
|
2022-06-15 10:19:33 +02:00
|
|
|
# Training targets: column 0 of the training frame, cast to strings.
y = data[0].astype('str')

# Expected dev-0 labels. squeeze() drops length-1 axes, so a single-column
# frame becomes a Series.
# NOTE(review): ydev is not used in the visible part of the script --
# presumably kept for evaluation; confirm.
ydev = pd.read_csv('dev-0/expected.tsv', sep='\t', header=None).squeeze()
|
|
|
|
|
|
|
|
|
|
|
|
# TF-IDF features feeding a multinomial Naive Bayes classifier. fit() returns
# the fitted pipeline itself, so construction and training are chained.
model = make_pipeline(TfidfVectorizer(), MultinomialNB()).fit(X, y)
|
|
|
|
|
|
|
|
|
|
|
|
# Predict the dev-0 labels. The model was trained on string targets, so the
# predictions come back as strings and are converted to integers here.
predictions_dev0 = pd.Series(model.predict(Xdev)).astype('int')
|
2022-05-17 22:20:36 +02:00
|
|
|
|
|
|
|
|
|
|
|
# Write the dev-0 predictions, one per line.
with open('dev-0/out.tsv', 'wt') as out_file:
    out_file.writelines(str(label) + '\n' for label in predictions_dev0)
|
|
|
|
|
|
|
|
|
|
|
|
# Predict the test-A labels. The model was trained on string targets, so the
# predictions come back as strings and are converted to integers here.
predictions_testA = pd.Series(model.predict(Xtest)).astype('int')
|
2022-05-17 22:20:36 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Write the test-A predictions, one per line.
with open('test-A/out.tsv', 'wt') as out_file:
    out_file.writelines(str(label) + '\n' for label in predictions_testA)
|