40 lines
1.0 KiB
Python
40 lines
1.0 KiB
Python
|
import pandas as pd
|
||
|
from sklearn.naive_bayes import GaussianNB
|
||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||
|
|
||
|
|
||
|
def read_train_data(path="train/train.tsv", limit=10000):
    """Load the training set from a headerless, tab-separated file.

    Args:
        path: Location of the TSV file. Defaults to ``"train/train.tsv"``.
        limit: Maximum number of leading rows to load. Defaults to 10000;
            pass ``None`` to load every row.

    Returns:
        A ``(x, y)`` tuple of pandas Series: column 1 of the file first,
        then column 0. (Presumably column 1 holds the text and column 0
        the label -- confirm against the data set.)
    """
    print('Load train data')
    # nrows makes the parser stop after `limit` rows instead of reading the
    # whole file into memory and slicing it afterwards.
    train_data = pd.read_csv(path, sep='\t', header=None, nrows=limit)

    return train_data[1], train_data[0]
|
||
|
|
||
|
def read_pred_data(path="dev-0/in.tsv"):
    """Read the documents to classify, one per line.

    Args:
        path: Location of the UTF-8 encoded input file. Defaults to
            ``"dev-0/in.tsv"``.

    Returns:
        A list with one string per line of the file. Each string keeps its
        trailing newline, exactly as read.
    """
    print('Load pred data')
    with open(path, encoding='utf-8') as f:
        # readlines() yields the same items as iterating the file and
        # appending each line by hand.
        return f.readlines()
|
||
|
|
||
|
def vectorize(x, x_p):
    """Turn raw documents into TF-IDF feature matrices.

    A single ``TfidfVectorizer`` is fitted on ``x`` only and then reused to
    transform ``x_p``, so both results come from the same fitted vectorizer.

    Args:
        x: Documents used to fit the vectorizer.
        x_p: Documents transformed with the already-fitted vectorizer.

    Returns:
        A ``(train_matrix, pred_matrix)`` tuple holding the vectorized
        ``x`` and ``x_p`` respectively.
    """
    print('Vectorize')
    tfidf = TfidfVectorizer()
    train_matrix = tfidf.fit_transform(x)
    pred_matrix = tfidf.transform(x_p)
    return train_matrix, pred_matrix
|
||
|
|
||
|
def calc_score(x, y, x_p):
    """Fit a Gaussian naive Bayes model and predict labels for ``x_p``.

    Despite the name, this returns the model's predictions, not a score.

    Args:
        x: Training feature matrix; must provide ``toarray()``.
        y: Training labels aligned with the rows of ``x``.
        x_p: Feature matrix to classify; must provide ``toarray()``.

    Returns:
        The result of ``GaussianNB.predict`` on the densified ``x_p``.
    """
    print('Calculate score')
    # Both matrices are converted with toarray() before being handed to the
    # model, as in the original pipeline.
    classifier = GaussianNB().fit(x.toarray(), y)
    dense_pred = x_p.toarray()
    return classifier.predict(dense_pred)
|
||
|
|
||
|
def get_result():
    """Run the full pipeline: load, vectorize, train and predict.

    Returns:
        Whatever ``calc_score`` returns for the prediction documents.
    """
    train_texts, train_labels = read_train_data()
    pred_texts = read_pred_data()
    train_matrix, pred_matrix = vectorize(train_texts, pred_texts)
    return calc_score(train_matrix, train_labels, pred_matrix)
|
||
|
|
||
|
# Script entry: run the pipeline and write one prediction per row, without a
# header row or index column.
pd.DataFrame(get_result()).to_csv(
    "dev-0/out.tsv",
    header=False,
    index=None,
)
|