187 lines
3.9 KiB
Python
187 lines
3.9 KiB
Python
#!/usr/bin/env python
|
|
# coding: utf-8
|
|
|
|
# # Homework: naive Bayes 2 (using a ready-made library)
|
|
|
|
# - choose one of the repositories below and fork it:
|
|
# - https://git.wmi.amu.edu.pl/kubapok/paranormal-or-skeptic-ISI-public
|
|
# - https://git.wmi.amu.edu.pl/kubapok/sport-text-classification-ball-ISI-public
|
|
# - build a classifier based on naive Bayes (a ready-made library may be used); ready-made TF-IDF implementations may be used as well
|
|
# - write the predictions to the files dev-0/out.tsv and test-A/out.tsv
|
|
# - the accuracy measured with the geval tool (see the previous assignment) should be at least 0.67
|
|
# - please put the predictions and the generating scripts (as plain text, not Jupyter notebooks) in the repo, and post a link to your repo on MS Teams
|
|
# deadline: 12.05, 40 points
|
|
#
|
|
|
|
# In[1]:
|
|
|
|
|
|
import pathlib
|
|
import gzip
|
|
import numpy as np
|
|
import gensim
|
|
from stop_words import get_stop_words
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
|
|
# In[2]:
|
|
|
|
|
|
# Root directory of the local sport-text-classification checkout.
# NOTE(review): this is an absolute, machine-specific path; it must be
# adjusted before the script is run on another machine.
SPORT_TEXT_PATH = pathlib.Path('C:/Users/Fijka/Documents/sport-text-classification-ball-ISI-public')
|
|
# Despite its name, this is used as a directory component: the training
# archive is read from SPORT_TEXT_PATH / file_name / 'train.tsv.gz'.
file_name = 'train'
|
|
|
|
|
|
# In[3]:
|
|
|
|
|
|
def read_data(filename):
    """Load a gzip-compressed, tab-separated, UTF-8 encoded data file.

    Every non-empty line is expected to contain at least two tab-separated
    fields: the class label first and the document text second.  Any
    further fields on a line are ignored.

    Args:
        filename: Path of the ``.tsv.gz`` file (anything ``gzip.open``
            accepts, e.g. ``str`` or ``pathlib.Path``).

    Returns:
        A ``(data, expected_class)`` tuple of two equally long lists: the
        document texts and the matching class labels, in file order.

    Raises:
        IndexError: If a non-empty line contains no tab separator.
    """
    # The context manager closes the archive even if reading or decoding
    # fails; previously the handle was never closed explicitly.
    with gzip.open(filename, 'rb') as archive:
        content = archive.read().decode('UTF-8')

    data, expected_class = [], []
    for line in content.split('\n'):
        # Skip blank lines (normally just the remainder after the final
        # newline).  Unconditionally dropping the last element lost a real
        # record whenever the file did not end with a newline.
        if not line:
            continue
        fields = line.split('\t')
        data.append(fields[1])
        expected_class.append(fields[0])
    return data, expected_class
|
|
|
|
# Load the training archive and keep only the first 20000 examples of both
# the texts and the labels.
train_data, train_clesses = (
    column[:20000]
    for column in read_data(SPORT_TEXT_PATH / file_name / 'train.tsv.gz')
)
|
|
|
|
|
|
# In[4]:
|
|
|
|
|
|
# Polish stop words from the stop_words package, extended with a few
# single-letter words, then echoed for inspection.
stop_words = [*get_stop_words('pl'), 'a', 'u', 'i', 'z', 'w', 'o']
print(stop_words)
|
|
|
|
|
|
# In[5]:
|
|
|
|
|
|
# Sanity check: label of the first training example, then its text, each on
# its own line.
print(train_clesses[0], train_data[0], sep='\n')
|
|
|
|
|
|
# In[6]:
|
|
|
|
|
|
# Tokenize every training document (lower-cased) and keep each distinct
# token only once per document.
train_data_tokenized = [
    list({*gensim.utils.tokenize(document, lowercase=True)})
    for document in train_data
]
|
|
|
|
|
|
# In[7]:
|
|
|
|
|
|
# Notebook leftover: a bare expression that displayed the tokens of the first
# document in Jupyter; when run as a script it produces no output.
train_data_tokenized[0]
|
|
|
|
|
|
# In[8]:
|
|
|
|
|
|
# Build the stop-word set once; it was previously rebuilt for every document.
stop_word_set = set(stop_words)

# "Lemmatization" here is a crude stemmer: drop stop words, truncate every
# remaining token to its first six characters, de-duplicate the prefixes and
# sort them.  sorted() replaces the former comprehension that called .sort()
# purely for its side effect and collected a useless list of None values.
train_data_lemmatized = [
    sorted({token[:6] for token in tokens if token not in stop_word_set})
    for tokens in train_data_tokenized
]
|
|
|
|
|
|
# In[9]:
|
|
|
|
|
|
# Sanity check: show the processed tokens of the first training document.
print(train_data_lemmatized[0])
|
|
|
|
|
|
# In[10]:
|
|
|
|
|
|
# Sanity check: the processed tokens of the first document, followed by the
# first two documents re-joined into space-separated strings.
print(train_data_lemmatized[0])
print(list(map(' '.join, train_data_lemmatized[:2])))
|
|
|
|
|
|
# In[11]:
|
|
|
|
|
|
import itertools
|
|
|
|
# Fit a TF-IDF vectorizer on the training documents (token lists re-joined
# into space-separated strings) and keep the resulting feature matrix in X.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(list(map(' '.join, train_data_lemmatized)))
|
|
|
|
|
|
# In[12]:
|
|
|
|
|
|
# Feature names learned by the vectorizer, as a plain list.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its
# replacement and fall back to the old method only on old installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = vectorizer.get_feature_names_out().tolist()
else:
    vocabulary = vectorizer.get_feature_names()
|
|
|
|
|
|
# In[13]:
|
|
|
|
|
|
from sklearn.naive_bayes import GaussianNB
|
|
# Densify the TF-IDF matrix once and reuse it for fitting and scoring; the
# dense copy was previously built twice.
X_dense = X.toarray()

model = GaussianNB()
model.fit(X_dense, train_clesses)

# Score on the very data the model was fitted on.
score_train = model.score(X_dense, train_clesses)

# Release the dense copy so it does not stay alive for the rest of the script.
del X_dense
|
|
|
|
|
|
# In[14]:
|
|
|
|
|
|
# Read the dev-0 input documents, one per line, with trailing whitespace
# (including the newline) removed.
with open('dev-0/in.tsv', "r", encoding="utf-8") as f:
    dev_0_data = list(map(str.rstrip, f))
|
|
|
|
|
|
# In[15]:
|
|
|
|
|
|
# Tokenize every dev-0 document (lower-cased) and keep each distinct token
# only once per document.
dev_0_data_tokenized = [
    list({*gensim.utils.tokenize(document, lowercase=True)})
    for document in dev_0_data
]
|
|
|
|
|
|
# In[16]:
|
|
|
|
|
|
# Build the stop-word set once; it was previously rebuilt for every document.
stop_word_set = set(stop_words)

# Same crude stemming as for the training data: drop stop words, truncate
# the remaining tokens to six characters and de-duplicate.  The result is
# sorted so the preprocessing is deterministic and matches the training side.
dev_0_data_lemmatized = [
    sorted({token[:6] for token in tokens if token not in stop_word_set})
    for tokens in dev_0_data_tokenized
]
|
|
|
|
|
|
# In[17]:
|
|
|
|
|
|
# Write one predicted label per line to dev-0/out.tsv.
# The file is opened in "w" mode: the former "a" (append) mode made every
# re-run add a second copy of the predictions, which breaks the line-by-line
# comparison against dev-0/expected.tsv.  The context manager closes the file
# even if a prediction fails, and the encoding matches the one used when the
# file is read back.
with open("dev-0/out.tsv", "w", encoding="utf-8") as f:
    for tokens in dev_0_data_lemmatized:
        # Vectorize and densify a single document at a time so that only one
        # dense row is held in memory.
        features = vectorizer.transform([' '.join(tokens)]).toarray()
        f.write(model.predict(features)[0] + '\n')
|
|
|
|
|
|
# In[18]:
|
|
|
|
|
|
# Read the dev-0 predictions back, one label per line, with trailing
# whitespace removed.
with open('dev-0/out.tsv', "r", encoding="utf-8") as f:
    o = list(map(str.rstrip, f))
|
|
|
|
|
|
# In[19]:
|
|
|
|
|
|
# Read the expected dev-0 labels, one per line, with trailing whitespace
# removed.
with open('dev-0/expected.tsv', "r", encoding="utf-8") as f:
    e = list(map(str.rstrip, f))
|
|
|
|
|
|
# In[20]:
|
|
|
|
|
|
# Compare the predictions (o) with the expected labels (e) line by line and
# report the number of correct / incorrect predictions and the accuracy.

# A length mismatch makes the accuracy meaningless, so fail loudly instead of
# raising an opaque IndexError or silently ignoring the surplus lines.
if len(o) != len(e):
    raise ValueError(
        'dev-0/out.tsv has {} lines but dev-0/expected.tsv has {}'.format(
            len(o), len(e)
        )
    )

# The counters no longer reuse the name "f", which is the file handle used
# by the surrounding cells.
correct = sum(predicted == expected for predicted, expected in zip(o, e))
incorrect = len(o) - correct

print(correct, incorrect)
# Guard against an empty prediction file, which used to raise
# ZeroDivisionError.
print(correct / len(o) if o else 0.0)
|
|
|
|
|
|
# In[21]:
|
|
|
|
|
|
# Full prediction pipeline for the test-A split: read, tokenize, stem,
# classify and write one predicted label per line.

# Read the input documents, one per line, without trailing whitespace.
with open('test-A/in.tsv', "r", encoding="utf-8") as f:
    test_A_data = [line.rstrip() for line in f]

# Lower-cased tokens, each distinct token kept once per document.
test_A_data_tokenized = [
    list(set(gensim.utils.tokenize(document, lowercase=True)))
    for document in test_A_data
]

# Build the stop-word set once; it was previously rebuilt for every document.
stop_word_set = set(stop_words)

# Same crude stemming as for the training data: drop stop words, truncate
# the remaining tokens to six characters, de-duplicate and sort.
test_A_data_lemmatized = [
    sorted({token[:6] for token in tokens if token not in stop_word_set})
    for tokens in test_A_data_tokenized
]

# "w" instead of the former "a": appending made every re-run add a second
# copy of the predictions to the file.  The context manager closes the file
# even if a prediction fails.
with open("test-A/out.tsv", "w", encoding="utf-8") as f:
    for tokens in test_A_data_lemmatized:
        # Vectorize and densify a single document at a time so that only one
        # dense row is held in memory.
        features = vectorizer.transform([' '.join(tokens)]).toarray()
        f.write(model.predict(features)[0] + '\n')
|
|
|