first commit

This commit is contained in:
JPogodzinski 2021-06-08 13:05:13 +02:00
commit 883032dac6
5 changed files with 114549 additions and 0 deletions

5452
dev-0/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

5452
dev-0/in.tsv Normal file

File diff suppressed because it is too large Load Diff

5452
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

61
main.py Normal file
View File

@ -0,0 +1,61 @@
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
from stop_words import get_stop_words
from numpy import random
# Script: train a Multinomial Naive Bayes and a Logistic Regression classifier
# on TF-IDF features of train/train.tsv, predict on dev-0/in.tsv and print
# F1 / accuracy / precision / recall against dev-0/expected.tsv, alongside an
# "always 1" baseline and a random baseline.


def _read_tsv(path):
    """Read a header-less, tab-separated file, skipping malformed lines.

    ``error_bad_lines`` was deprecated in pandas 1.3 and removed in pandas
    2.0; ``on_bad_lines='warn'`` is its replacement (skip the line and emit a
    warning, which is what ``error_bad_lines=False`` did with the default
    ``warn_bad_lines=True``). The old keyword is kept as a fallback for
    pandas versions that predate ``on_bad_lines``.
    """
    try:
        return pd.read_csv(path, sep='\t', header=None, on_bad_lines='warn')
    except TypeError:
        # pandas < 1.3: read_csv() does not know 'on_bad_lines'.
        return pd.read_csv(path, sep='\t', header=None, error_bad_lines=False)


# NOTE(review): the Polish stop-word list is loaded but never passed to the
# vectorizer below (stop_words=None) -- confirm whether that is intentional.
stop_words = get_stop_words('polish')
v = TfidfVectorizer(stop_words=None)
naive_bayes = MultinomialNB()

ball_train = _read_tsv('train/train.tsv')
ball_dev_expected = _read_tsv('dev-0/expected.tsv')

# Column 0 is used as the label, column 1 as the text.
# y is kept 1-D: passing a one-column DataFrame to fit() makes scikit-learn
# emit a DataConversionWarning ("column-vector y was passed").
y_train = ball_train[0]
x_train = pd.DataFrame(ball_train[1])

# Each element iterated here is a one-element numpy row, so str() yields the
# printed form of that row (e.g. "['some text']"), not the bare text. This is
# kept as-is so the extracted features stay identical to the original script.
x_np = x_train.to_numpy()
x_np = [str(item) for item in x_np]
x_train = v.fit_transform(x_np)
naive_bayes.fit(x_train, y_train)

# The dev input is stringified row by row over ALL columns of in.tsv,
# mirroring the treatment of the training text above.
ball_dev_in = _read_tsv('dev-0/in.tsv')
X_dev_np = ball_dev_in.to_numpy()
X_dev_np = [str(item) for item in X_dev_np]
X_dev = v.transform(X_dev_np)

model = LogisticRegression()  # model definition
model.fit(x_train, y_train)  # model fitting

Y_dev_predictedNB = naive_bayes.predict(X_dev)
# Baseline: predict 1 for every dev example.
Y_dev_predicted_baseline = np.ones_like(Y_dev_predictedNB)
# Random baseline; unseeded, so its scores differ from run to run.
Y_dev_predicted_random = random.choice([0, 1], size=len(Y_dev_predictedNB))
Y_dev_predictedLogReg = model.predict(X_dev)

# Report every metric for every set of predictions, in the order
# baseline, logistic regression, random, Naive Bayes, with a blank line
# between metric groups (and none after the last one).
_predictions = (
    Y_dev_predicted_baseline,
    Y_dev_predictedLogReg,
    Y_dev_predicted_random,
    Y_dev_predictedNB,
)
_metrics = (f1_score, accuracy_score, precision_score, recall_score)
for _index, _metric in enumerate(_metrics):
    if _index:
        print()
    for _predicted in _predictions:
        print(_metric(ball_dev_expected, _predicted))

98132
train/train.tsv Normal file

File diff suppressed because it is too large Load Diff