sport-text-classification/run.py

40 lines
1.0 KiB
Python
Raw Normal View History

2022-05-18 16:37:17 +02:00
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
def read_train_data(path="train/train.tsv", limit=10000):
    """Load the training set from a header-less, tab-separated file.

    Args:
        path: Location of the TSV file. Defaults to the path the script
            has always used.
        limit: Maximum number of leading rows to load. ``None`` loads the
            whole file. Defaults to the original cap of 10000 rows.

    Returns:
        A ``(column 1, column 0)`` tuple of pandas Series. The caller in
        this file vectorizes the first element and uses the second as the
        fit target.
    """
    print('Load train data')
    # Let the parser stop after `limit` rows instead of reading the whole
    # file and slicing afterwards; the resulting rows are the same.
    train_data = pd.read_csv(path, sep='\t', header=None, nrows=limit)
    return train_data[1], train_data[0]
def read_pred_data(path="test-A/in.tsv"):
    """Load the inputs to classify, one entry per line of a text file.

    Args:
        path: UTF-8 encoded file to read. Defaults to the path the script
            has always used.

    Returns:
        A list with the file's lines in order. Each line is returned as
        read, i.e. still carrying its trailing newline.
    """
    print('Load pred data')
    with open(path, encoding='utf-8') as f:
        # Iterating a text file yields its lines, so list() replaces the
        # manual append loop.
        return list(f)
def vectorize(x, x_p):
    """Turn raw texts into TF-IDF feature matrices.

    The vectorizer is fitted on ``x`` only and then applied to ``x_p``, so
    both results share the vocabulary learned from ``x``.

    Args:
        x: Iterable of training texts.
        x_p: Iterable of texts to predict on.

    Returns:
        A ``(features for x, features for x_p)`` tuple.
    """
    print('Vectorize')
    tfidf = TfidfVectorizer()
    train_features = tfidf.fit_transform(x)
    pred_features = tfidf.transform(x_p)
    return train_features, pred_features
def calc_score(x, y, x_p):
    """Fit a Gaussian naive Bayes model on (x, y) and predict for x_p.

    Despite the name, this returns the model's predictions for ``x_p``,
    not an evaluation score.

    Args:
        x: Training feature matrix; must provide ``.toarray()``.
        y: Training targets, aligned with the rows of ``x``.
        x_p: Feature matrix to predict on; must provide ``.toarray()``.

    Returns:
        The result of ``GaussianNB.predict`` on the dense form of ``x_p``.
    """
    print('Calculate score')
    classifier = GaussianNB()
    # Both matrices are converted to dense arrays before being handed to
    # the model.
    dense_train = x.toarray()
    classifier.fit(dense_train, y)
    dense_pred = x_p.toarray()
    return classifier.predict(dense_pred)
def get_result():
    """Run the whole pipeline: load, vectorize, fit and predict.

    Returns:
        Whatever ``calc_score`` returns for the prediction inputs, i.e. the
        model's predictions.
    """
    train_texts, train_labels = read_train_data()
    pred_texts = read_pred_data()
    train_features, pred_features = vectorize(train_texts, pred_texts)
    return calc_score(train_features, train_labels, pred_features)
2022-05-19 17:00:13 +02:00
# Run the pipeline and write its predictions to the output file, without a
# header row and without the DataFrame index.
predictions = pd.DataFrame(get_result())
predictions.to_csv("test-A/out.tsv", header=False, index=None)