Compare commits

..

1 Commits

Author SHA1 Message Date
Zosia
fa2d34d49b add script and outputs 2021-05-12 22:22:14 +02:00
4 changed files with 11454 additions and 0 deletions

5452
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,369 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# zadania domowe naiwny bayes2 gotowa biblioteka"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- wybrać jedno z poniższych repozytoriów i je sforkować:\n",
" - https://git.wmi.amu.edu.pl/kubapok/paranormal-or-skeptic-ISI-public\n",
" - https://git.wmi.amu.edu.pl/kubapok/sport-text-classification-ball-ISI-public\n",
"- stworzyć klasyfikator bazujący na naiwnym Bayesie (może być gotowa biblioteka), może też korzystać z gotowych implementacji tfidf\n",
"- stworzyć predykcje w plikach dev-0/out.tsv oraz test-A/out.tsv\n",
"- wynik accuracy sprawdzony za pomocą narzędzia geval (patrz poprzednie zadanie) powinien wynosić co najmniej 0.67\n",
"- proszę umieścić predykcję oraz skrypty generujące (w postaci tekstowej a nie jupyter) w repo, a w MS TEAMS umieścić link do swojego repo\n",
"termin 12.05, 40 punktów\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pathlib\n",
"import gzip\n",
"import numpy as np\n",
"import gensim\n",
"from stop_words import get_stop_words\n",
"from sklearn.feature_extraction.text import TfidfVectorizer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"SPORT_TEXT_PATH = pathlib.Path('C:/Users/Fijka/Documents/sport-text-classification-ball-ISI-public')\n",
"file_name = 'train'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def read_data(filename):\n",
" all_data = gzip.open(filename).read().decode('UTF-8').split('\\n')\n",
" data, expected_class = [], []\n",
" for i in [line.split('\\t') for line in all_data][:-1]:\n",
" data.append(i[1])\n",
" expected_class.append(i[0])\n",
" return data, expected_class\n",
"\n",
"train_data, train_clesses = read_data(SPORT_TEXT_PATH/file_name/'train.tsv.gz')\n",
"train_data, train_clesses = train_data[:20000], train_clesses[:20000]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['ach', 'aj', 'albo', 'bardzo', 'bez', 'bo', 'być', 'ci', 'cię', 'ciebie', 'co', 'czy', 'daleko', 'dla', 'dlaczego', 'dlatego', 'do', 'dobrze', 'dokąd', 'dość', 'dużo', 'dwa', 'dwaj', 'dwie', 'dwoje', 'dziś', 'dzisiaj', 'gdyby', 'gdzie', 'go', 'ich', 'ile', 'im', 'inny', 'ja', 'ją', 'jak', 'jakby', 'jaki', 'je', 'jeden', 'jedna', 'jedno', 'jego', 'jej', 'jemu', 'jeśli', 'jest', 'jestem', 'jeżeli', 'już', 'każdy', 'kiedy', 'kierunku', 'kto', 'ku', 'lub', 'ma', 'mają', 'mam', 'mi', 'mną', 'mnie', 'moi', 'mój', 'moja', 'moje', 'może', 'mu', 'my', 'na', 'nam', 'nami', 'nas', 'nasi', 'nasz', 'nasza', 'nasze', 'natychmiast', 'nią', 'nic', 'nich', 'nie', 'niego', 'niej', 'niemu', 'nigdy', 'nim', 'nimi', 'niż', 'obok', 'od', 'około', 'on', 'ona', 'one', 'oni', 'ono', 'owszem', 'po', 'pod', 'ponieważ', 'przed', 'przedtem', 'są', 'sam', 'sama', 'się', 'skąd', 'tak', 'taki', 'tam', 'ten', 'to', 'tobą', 'tobie', 'tu', 'tutaj', 'twoi', 'twój', 'twoja', 'twoje', 'ty', 'wam', 'wami', 'was', 'wasi', 'wasz', 'wasza', 'wasze', 'we', 'więc', 'wszystko', 'wtedy', 'wy', 'żaden', 'zawsze', 'że', 'a', 'u', 'i', 'z', 'w', 'o']\n"
]
}
],
"source": [
"stop_words = get_stop_words('pl') + ['a', 'u', 'i', 'z', 'w', 'o']\n",
"print(stop_words)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1\n",
"Mindaugas Budzinauskas wierzy w odbudowę formy Kevina Johnsona. Czy ktoś opuści Polpharmę? Mindaugas Budzinauskas w rozmowie z WP SportoweFakty opowiada o transferze Kevina Johnsona, ewentualnych odejściach z Polpharmy i kolejnym meczu PLK z Anwilem. - Potrzebowaliśmy takiego gracza, jak Johnson - podkreśla szkoleniowiec starogardzian.\n"
]
}
],
"source": [
"print(train_clesses[0])\n",
"print(train_data[0])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"train_data_tokenized = [list(set(gensim.utils.tokenize(x, lowercase = True))) for x in train_data]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['i',\n",
" 'kolejnym',\n",
" 'polpharmę',\n",
" 'ktoś',\n",
" 'takiego',\n",
" 'gracza',\n",
" 'formy',\n",
" 'johnsona',\n",
" 'anwilem',\n",
" 'szkoleniowiec',\n",
" 'z',\n",
" 'mindaugas',\n",
" 'starogardzian',\n",
" 'czy',\n",
" 'podkreśla',\n",
" 'transferze',\n",
" 'budzinauskas',\n",
" 'plk',\n",
" 'kevina',\n",
" 'polpharmy',\n",
" 'opuści',\n",
" 'sportowefakty',\n",
" 'o',\n",
" 'wp',\n",
" 'rozmowie',\n",
" 'w',\n",
" 'opowiada',\n",
" 'wierzy',\n",
" 'meczu',\n",
" 'potrzebowaliśmy',\n",
" 'ewentualnych',\n",
" 'jak',\n",
" 'odejściach',\n",
" 'johnson',\n",
" 'odbudowę']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_data_tokenized[0]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"train_data_lemmatized = [list(set([w[:6] for w in set(i) - set(stop_words)])) for i in train_data_tokenized]\n",
"tmp = [i.sort() for i in train_data_lemmatized]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['anwile', 'budzin', 'ewentu', 'formy', 'gracza', 'johnso', 'kevina', 'kolejn', 'ktoś', 'meczu', 'mindau', 'odbudo', 'odejśc', 'opowia', 'opuści', 'plk', 'podkre', 'polpha', 'potrze', 'rozmow', 'sporto', 'starog', 'szkole', 'takieg', 'transf', 'wierzy', 'wp']\n"
]
}
],
"source": [
"print(train_data_lemmatized[0])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['anwile', 'budzin', 'ewentu', 'formy', 'gracza', 'johnso', 'kevina', 'kolejn', 'ktoś', 'meczu', 'mindau', 'odbudo', 'odejśc', 'opowia', 'opuści', 'plk', 'podkre', 'polpha', 'potrze', 'rozmow', 'sporto', 'starog', 'szkole', 'takieg', 'transf', 'wierzy', 'wp']\n",
"['anwile budzin ewentu formy gracza johnso kevina kolejn ktoś meczu mindau odbudo odejśc opowia opuści plk podkre polpha potrze rozmow sporto starog szkole takieg transf wierzy wp', 'artura barwac bełcha będzie kolejn kontra lata pge polski przyjm reprez rok rozbra sezoni skry skrą szalpu trwał tylko wrócił występ zawart znów został']\n"
]
}
],
"source": [
"print(train_data_lemmatized[0])\n",
"print([' '.join(i) for i in train_data_lemmatized[:2]])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"import itertools\n",
"\n",
"vectorizer = TfidfVectorizer()\n",
"X = vectorizer.fit_transform([' '.join(i) for i in train_data_lemmatized])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"vocabulary = vectorizer.get_feature_names()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.naive_bayes import GaussianNB\n",
"model = GaussianNB()\n",
"model.fit(X.toarray(), train_clesses)\n",
"score_train = model.score(X.toarray(), train_clesses)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"with open('dev-0/in.tsv', \"r\", encoding=\"utf-8\") as f:\n",
" dev_0_data = [line.rstrip() for line in f]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"dev_0_data_tokenized = [list(set(gensim.utils.tokenize(x, lowercase = True))) for x in dev_0_data]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"dev_0_data_lemmatized = [list(set([w[:6] for w in set(i) - set(stop_words)])) for i in dev_0_data_tokenized]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"f = open(\"dev-0/out.tsv\", \"a\")\n",
"for i in [' '.join(i) for i in dev_0_data_lemmatized]:\n",
" f.write(model.predict([vectorizer.transform([i]).toarray()[0]])[0] + '\\n')\n",
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"with open('dev-0/out.tsv', \"r\", encoding=\"utf-8\") as f:\n",
" o = [line.rstrip() for line in f]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"with open('dev-0/expected.tsv', \"r\", encoding=\"utf-8\") as f:\n",
" e = [line.rstrip() for line in f]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5023 429\n",
"0.9213132795304475\n"
]
}
],
"source": [
"t, f = 0, 0\n",
"\n",
"for i in range(len(o)):\n",
" if o[i] == e[i]:\n",
" t += 1\n",
" else:\n",
" f += 1\n",
"print(t, f)\n",
"print(t/(t + f))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"with open('test-A/in.tsv', \"r\", encoding=\"utf-8\") as f:\n",
" test_A_data = [line.rstrip() for line in f]\n",
"test_A_data_tokenized = [list(set(gensim.utils.tokenize(x, lowercase = True))) for x in test_A_data]\n",
"test_A_data_lemmatized = [list(set([w[:6] for w in set(i) - set(stop_words)])) for i in test_A_data_tokenized]\n",
"f = open(\"test-A/out.tsv\", \"a\")\n",
"for i in [' '.join(i) for i in test_A_data_lemmatized]:\n",
" f.write(model.predict([vectorizer.transform([i]).toarray()[0]])[0] + '\\n')\n",
"f.close()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -0,0 +1,186 @@
#!/usr/bin/env python
# coding: utf-8
# # zadania domowe naiwny bayes2 gotowa biblioteka
# - wybrać jedno z poniższych repozytoriów i je sforkować:
# - https://git.wmi.amu.edu.pl/kubapok/paranormal-or-skeptic-ISI-public
# - https://git.wmi.amu.edu.pl/kubapok/sport-text-classification-ball-ISI-public
# - stworzyć klasyfikator bazujący na naiwnym Bayesie (może być gotowa biblioteka), może też korzystać z gotowych implementacji tfidf
# - stworzyć predykcje w plikach dev-0/out.tsv oraz test-A/out.tsv
# - wynik accuracy sprawdzony za pomocą narzędzia geval (patrz poprzednie zadanie) powinien wynosić co najmniej 0.67
# - proszę umieścić predykcję oraz skrypty generujące (w postaci tekstowej a nie jupyter) w repo, a w MS TEAMS umieścić link do swojego repo
# termin 12.05, 40 punktów
#
# In[1]:
import gzip
import os
import pathlib

import gensim
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from stop_words import get_stop_words
# In[2]:
# Root of the local clone of the challenge repository. The SPORT_TEXT_PATH
# environment variable overrides the machine-specific default below, so the
# script can run on another machine without editing the source.
SPORT_TEXT_PATH = pathlib.Path(
    os.environ.get(
        'SPORT_TEXT_PATH',
        'C:/Users/Fijka/Documents/sport-text-classification-ball-ISI-public',
    )
)
# Sub-directory of SPORT_TEXT_PATH that holds the training archive.
file_name = 'train'
# In[3]:
def read_data(filename):
    """Read a gzip-compressed, tab-separated training file.

    Every non-empty line is expected to hold a class label, a tab, and the
    text. Any further tab-separated fields on the line are ignored.

    Args:
        filename: Path (``str`` or ``pathlib.Path``) of the ``.tsv.gz`` file.

    Returns:
        A ``(data, expected_class)`` tuple of two equally long lists: the
        texts (second column) and their labels (first column), in file order.

    Raises:
        IndexError: If a non-empty line contains no tab separator.
    """
    # The context manager closes the archive even when reading/decoding fails.
    with gzip.open(filename) as archive:
        raw_text = archive.read().decode('UTF-8')

    data, expected_class = [], []
    for line in raw_text.split('\n'):
        # Skip blank lines (notably the empty string produced by a trailing
        # newline) rather than blindly dropping the last element, which would
        # lose the final record of a file that lacks a trailing newline.
        if not line:
            continue
        fields = line.split('\t')
        data.append(fields[1])
        expected_class.append(fields[0])
    return data, expected_class
# Load the whole training archive, then keep only the first 20000 examples.
# NOTE(review): the cap presumably bounds the size of the dense feature matrix
# built later in the script - confirm before changing it.
train_path = SPORT_TEXT_PATH / file_name / 'train.tsv.gz'
train_data, train_clesses = read_data(train_path)
train_data = train_data[:20000]
train_clesses = train_clesses[:20000]
# In[4]:
# Stop words returned by the `stop_words` package for the 'pl' language code,
# followed by six extra one-letter words.
stop_words = [*get_stop_words('pl'), 'a', 'u', 'i', 'z', 'w', 'o']
print(stop_words)
# In[5]:
# Sanity check: label and raw text of the first training example, one per line.
print(train_clesses[0], train_data[0], sep='\n')
# In[6]:
# Tokenize every training text with lowercasing enabled and keep each distinct
# token once per document (the order of a set-derived list is arbitrary).
train_data_tokenized = []
for raw_text in train_data:
    distinct_tokens = set(gensim.utils.tokenize(raw_text, lowercase=True))
    train_data_tokenized.append(list(distinct_tokens))
# In[7]:
# Leftover notebook cell: a bare expression displays nothing when run as a script.
train_data_tokenized[0]
# In[8]:
# Despite the variable name, no real lemmatization happens here: stop words
# are removed and every remaining token is truncated to its first 6 characters.
# Stop words are filtered before truncation, so a truncated prefix that
# happens to equal a stop word is kept.
stop_word_set = set(stop_words)  # built once, not once per document
train_data_lemmatized = [
    # sorted() yields each document's distinct prefixes in a deterministic
    # order directly, with no separate in-place sorting pass.
    sorted({token[:6] for token in set(tokens) - stop_word_set})
    for tokens in train_data_tokenized
]
# In[9]:
# Inspect the preprocessed form of the first training document.
first_document_stems = train_data_lemmatized[0]
print(first_document_stems)
# In[10]:
# The same document again, followed by the first two documents joined into
# the space-separated strings that are handed to the vectorizer below.
print(first_document_stems)
joined_preview = []
for stems in train_data_lemmatized[:2]:
    joined_preview.append(' '.join(stems))
print(joined_preview)
# In[11]:
import itertools  # NOTE(review): not referenced anywhere in the visible script.

# Fit TF-IDF on the preprocessed training documents; each document is passed
# as one space-separated string of its truncated tokens.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform([' '.join(stems) for stems in train_data_lemmatized])
# In[12]:
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`. Prefer the new method and fall back to the old
# one on releases that predate it; `.tolist()` keeps `vocabulary` a plain list
# of str, matching what the old method returned.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = vectorizer.get_feature_names_out().tolist()
else:
    vocabulary = vectorizer.get_feature_names()
# In[13]:
from sklearn.naive_bayes import GaussianNB

# GaussianNB requires dense input. Densify the TF-IDF matrix once and reuse it
# for both fitting and scoring instead of converting the sparse matrix twice.
X_dense = X.toarray()
model = GaussianNB()
model.fit(X_dense, train_clesses)
# Mean accuracy on the training data itself (not a held-out estimate).
score_train = model.score(X_dense, train_clesses)
# Release the large dense copy; the rest of the script only needs `model`
# and `vectorizer`.
del X_dense
# In[14]:
# Read the dev-0 inputs: one document per line, trailing whitespace stripped.
with open('dev-0/in.tsv', "r", encoding="utf-8") as f:
    dev_0_data = list(map(str.rstrip, f))
# In[15]:
# Tokenize each dev-0 document with lowercasing enabled, keeping each distinct
# token once per document.
dev_0_data_tokenized = [
    list(set(gensim.utils.tokenize(text, lowercase=True)))
    for text in dev_0_data
]
# In[16]:
# Mirrors the training preprocessing: drop stop words, then truncate every
# remaining token to its first 6 characters (not a real lemmatization).
stop_word_set = set(stop_words)  # built once, not once per document
dev_0_data_lemmatized = [
    list({token[:6] for token in set(tokens) - stop_word_set})
    for tokens in dev_0_data_tokenized
]
# In[17]:
# Write one predicted label per line for dev-0.
# The file is opened in "w" mode so that a re-run replaces the previous
# predictions; appending would stack stale rows and misalign the file with
# dev-0/expected.tsv in the evaluation below. The context manager closes the
# file even if a prediction fails part-way through.
with open("dev-0/out.tsv", "w", encoding="utf-8") as f:
    for document in (' '.join(stems) for stems in dev_0_data_lemmatized):
        # Densify a single row at a time to keep memory usage low.
        features = vectorizer.transform([document]).toarray()
        f.write(model.predict(features)[0] + '\n')
# In[18]:
# Predicted labels for dev-0, one per line, trailing whitespace stripped.
with open('dev-0/out.tsv', "r", encoding="utf-8") as f:
    o = list(map(str.rstrip, f))
# In[19]:
# Expected (gold) labels for dev-0, read the same way.
with open('dev-0/expected.tsv', "r", encoding="utf-8") as f:
    e = list(map(str.rstrip, f))
# In[20]:
# Accuracy of the dev-0 predictions (`o`) against the expected labels (`e`).
# A length mismatch means the two files are not aligned line by line, so the
# comparison would be meaningless; fail loudly with both sizes.
if len(o) != len(e):
    raise ValueError(
        f"dev-0/out.tsv has {len(o)} lines but dev-0/expected.tsv has {len(e)}"
    )
# `t` counts matching lines and `f` counts mismatches (note that `f` is reused
# here as a counter, not a file handle).
t = sum(1 for predicted, expected in zip(o, e) if predicted == expected)
f = len(o) - t
print(t, f)
# Accuracy is undefined for empty input; print NaN instead of dividing by zero.
print(t / (t + f) if (t + f) else float('nan'))
# In[21]:
# Read the test-A inputs: one document per line, trailing whitespace stripped.
with open('test-A/in.tsv', "r", encoding="utf-8") as f:
    test_A_data = [line.rstrip() for line in f]
# Same preprocessing as for the training data: distinct lowercase-enabled
# tokens, stop words removed, remaining tokens truncated to 6 characters.
test_A_data_tokenized = [
    list(set(gensim.utils.tokenize(text, lowercase=True)))
    for text in test_A_data
]
stop_word_set = set(stop_words)  # built once, not once per document
test_A_data_lemmatized = [
    list({token[:6] for token in set(tokens) - stop_word_set})
    for tokens in test_A_data_tokenized
]
# "w" mode replaces predictions from an earlier run instead of appending
# to them, and the context manager closes the file even on error.
with open("test-A/out.tsv", "w", encoding="utf-8") as f:
    for document in (' '.join(stems) for stems in test_A_data_lemmatized):
        # Densify a single row at a time to keep memory usage low.
        features = vectorizer.transform([document]).toarray()
        f.write(model.predict(features)[0] + '\n')

5447
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff