sport-text-classification-b.../main.py
2021-05-31 15:05:35 +02:00

35 lines
944 B
Python

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
# Load the training set.  Column 0 is used as the class label and column 1 as
# the document text.  Rows that pandas cannot parse are skipped instead of
# aborting the run; `on_bad_lines='skip'` is the replacement for
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in 2.0.
train = pd.read_csv('train/train.tsv', sep='\t', header=None, on_bad_lines='skip')
train_labels = train[0].astype(str).tolist()
train_texts = train[1].astype(str).tolist()

# Bag-of-words counts feed a multinomial Naive Bayes classifier.  The fitted
# vectorizer is kept so later inputs are mapped onto the same vocabulary.
count_vec = CountVectorizer()
naive_b = MultinomialNB()
naive_b.fit(count_vec.fit_transform(train_texts), train_labels)
def _write_predictions(in_path, out_path, vectorizer, classifier):
    """Classify every line of ``in_path`` and write one label per line to ``out_path``.

    The input is read line by line rather than through ``pd.read_csv`` with a
    newline separator: recent pandas releases reject that separator, and the
    CSV parser skips blank lines and interprets double quotes, so the number
    of predictions could differ from the number of input lines.

    Args:
        in_path: Path of a text file holding one document per line.
        out_path: Path of the file to (over)write with the predicted labels.
        vectorizer: Fitted object exposing ``transform(list_of_str)``.
        classifier: Fitted object exposing ``predict(features)``.
    """
    with open(in_path, encoding='utf-8') as in_file:
        # Strip only the line terminator so every input line, including an
        # empty one, yields exactly one document.
        texts = [line.rstrip('\n') for line in in_file]

    # scikit-learn estimators raise on zero samples, so an empty input simply
    # produces an empty output file.
    labels = classifier.predict(vectorizer.transform(texts)) if texts else []

    # The context manager guarantees the output is flushed and closed.
    with open(out_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(f'{label}\n' for label in labels)


# Produce predictions for the development split first, then the test split.
for split_dir in ('dev-0', 'test-A'):
    _write_predictions(
        split_dir + '/in.tsv',
        split_dir + '/out.tsv',
        count_vec,
        naive_b,
    )