This commit is contained in:
mkozlowskiAzimuthe 2022-05-18 16:37:17 +02:00
parent d043e30286
commit d3d2656597
4 changed files with 103629 additions and 0 deletions

3
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,3 @@
{
"python.formatting.provider": "black"
}

5452
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

42
run.py Normal file
View File

@ -0,0 +1,42 @@
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
def read_train_data(path="train/train.tsv", limit=10000):
    """Load the training set from a header-less, tab-separated file.

    Args:
        path: Location of the TSV file. Defaults to the path the script
            has always used, so existing callers are unaffected.
        limit: Maximum number of leading rows to load. ``None`` loads the
            whole file. Defaults to the original cap of 10000 rows.

    Returns:
        A ``(texts, labels)`` pair of pandas Series: column 1 of the file
        (fed to the vectorizer by the caller) followed by column 0 (used
        as the fit target by the caller).
    """
    print('Load train data')
    # nrows stops parsing after `limit` rows instead of reading the whole
    # file and slicing it afterwards.
    train_data = pd.read_csv(path, sep='\t', header=None, nrows=limit)
    return train_data[1], train_data[0]
def read_pred_data(path="dev-0/in.tsv"):
    """Read the documents to classify, one per line.

    Args:
        path: Location of the UTF-8 input file. Defaults to the path the
            script has always used.

    Returns:
        A list with one string per line of the file. Each string keeps its
        trailing newline, exactly as the file iterator yields it.
    """
    print('Load pred data')
    with open(path, encoding='utf-8') as f:
        # readlines() yields the same lines as iterating the file, without
        # a manual append loop.
        return f.readlines()
def vectorize(x, x_p):
    """Turn raw documents into TF-IDF feature matrices.

    A single ``TfidfVectorizer`` is fitted on ``x`` and then reused to
    transform ``x_p``, so both results come from the same fitted vectorizer.

    Args:
        x: Training documents.
        x_p: Documents to predict on.

    Returns:
        A ``(train_features, pred_features)`` pair.
    """
    print('Vectorize')
    tfidf = TfidfVectorizer()
    train_features = tfidf.fit_transform(x)
    pred_features = tfidf.transform(x_p)
    return train_features, pred_features
def calc_score(x, y, x_p):
    """Fit a Gaussian naive Bayes model and predict labels for ``x_p``.

    Both feature matrices are converted with ``.toarray()`` before being
    handed to the model.

    Args:
        x: Training feature matrix exposing ``.toarray()``.
        y: Training labels.
        x_p: Feature matrix to predict on, exposing ``.toarray()``.

    Returns:
        The model's predictions for ``x_p``.
    """
    print('Calculate score')
    dense_train = x.toarray()
    classifier = GaussianNB()
    classifier.fit(dense_train, y)
    dense_pred = x_p.toarray()
    return classifier.predict(dense_pred)
def get_result():
    """Run the pipeline end to end: load, vectorize, fit and predict.

    Returns:
        Whatever ``calc_score`` returns for the prediction documents.
    """
    train_texts, train_labels = read_train_data()
    pred_texts = read_pred_data()
    train_features, pred_features = vectorize(train_texts, pred_texts)
    return calc_score(train_features, train_labels, pred_features)
# Run the pipeline and write the predictions to the output file,
# without a header row or an index column.
predictions = get_result()
output_frame = pd.DataFrame(predictions)
output_frame.to_csv("dev-0/out.tsv", header=False, index=None)

98132
train/train.tsv Normal file

File diff suppressed because it is too large Load Diff