41 lines
1.2 KiB
Python
41 lines
1.2 KiB
Python
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.naive_bayes import MultinomialNB
|
|
import pandas as pd
|
|
|
|
# Read the training data: column 0 is used as the labels, column 1 as the texts.
try:
    # pandas >= 1.3 replaced error_bad_lines/warn_bad_lines with on_bad_lines;
    # the legacy flags were removed in pandas 2.0. 'warn' drops malformed rows
    # while still reporting them, like the legacy error_bad_lines=False default.
    train_data = pd.read_csv('train/train.tsv', sep='\t', header=None, on_bad_lines='warn')
except TypeError:
    # Older pandas does not know on_bad_lines; fall back to the legacy flag.
    train_data = pd.read_csv('train/train.tsv', sep='\t', header=None, error_bad_lines=False)
y_train = train_data.iloc[:, 0]
x_train = train_data.iloc[:, 1]
|
|
|
|
# TF-IDF vectorization: fit the vectorizer on the training texts and
# transform them in one step. The fitted vectorizer is reused later.
tfidf = TfidfVectorizer()
tfidf_train = tfidf.fit_transform(x_train)

# Naive Bayes classifier - training (fit() returns the fitted estimator).
bayes = MultinomialNB().fit(tfidf_train, y_train)
|
|
|
|
# Naive Bayes classifier - predictions for the dev set.
# The input is read as raw lines instead of pd.read_csv(sep='\n'): recent
# pandas versions reject '\n' as a separator, and CSV parsing may skip blank
# lines, interpret quote characters or turn values such as "NA" into NaN, so
# the predictions would no longer line up one-to-one with the input lines.
with open('dev-0/in.tsv', encoding='utf-8') as dev_in_file:
    dev_data = pd.DataFrame({0: [line.rstrip('\n') for line in dev_in_file]})
x_dev = dev_data.iloc[:, 0]
tfidf_dev = tfidf.transform(x_dev)
dev_predictions = bayes.predict(tfidf_dev)

# Write one prediction per line; the context manager guarantees the file is
# flushed and closed even if writing fails part-way.
with open('dev-0/out.tsv', 'w') as dev_out_file:
    dev_out_file.writelines(str(prediction) + '\n' for prediction in dev_predictions)
|
|
|
|
# Naive Bayes classifier - predictions for the test set.
# The input is read as raw lines instead of pd.read_csv(sep='\n'): recent
# pandas versions reject '\n' as a separator, and CSV parsing may skip blank
# lines, interpret quote characters or turn values such as "NA" into NaN, so
# the predictions would no longer line up one-to-one with the input lines.
with open('test-A/in.tsv', encoding='utf-8') as test_in_file:
    test_data = pd.DataFrame({0: [line.rstrip('\n') for line in test_in_file]})
# Use the test-set frame here (the previous code read the dev-set frame, so
# the test output contained predictions for the dev inputs).
x_test = test_data.iloc[:, 0]
tfidf_test = tfidf.transform(x_test)
test_predictions = bayes.predict(tfidf_test)

# Write one prediction per line; the context manager guarantees the file is
# flushed and closed even if writing fails part-way.
with open('test-A/out.tsv', 'w') as test_out_file:
    test_out_file.writelines(str(prediction) + '\n' for prediction in test_predictions)
|
|
|
|
|
|
|
|
|
|
|
|
|