Compare commits
1 commit: fa2d34d49b

dev-0/out.tsv (new file, 5452 lines)
File diff suppressed because it is too large

naiwny_bayes2_gotowa_biblioteka_fras.ipynb (new file, 369 lines)
@@ -0,0 +1,369 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Homework: naive Bayes 2, using a ready-made library"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- fork one of the repositories below:\n",
    "  - https://git.wmi.amu.edu.pl/kubapok/paranormal-or-skeptic-ISI-public\n",
    "  - https://git.wmi.amu.edu.pl/kubapok/sport-text-classification-ball-ISI-public\n",
    "- build a classifier based on naive Bayes (a ready-made library is allowed); it may also use ready-made tf-idf implementations\n",
    "- write the predictions to the files dev-0/out.tsv and test-A/out.tsv\n",
    "- the accuracy, checked with the geval tool (see the previous assignment), should be at least 0.67\n",
    "- put the predictions and the generating scripts (in plain-text form, not as Jupyter notebooks) in the repo, and post a link to your repo in MS TEAMS\n",
    "deadline 12.05, 40 points\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pathlib\n",
    "import gzip\n",
    "import numpy as np\n",
    "import gensim\n",
    "from stop_words import get_stop_words\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "SPORT_TEXT_PATH = pathlib.Path('C:/Users/Fijka/Documents/sport-text-classification-ball-ISI-public')\n",
    "file_name = 'train'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_data(filename):\n",
    "    # each line holds: expected class <TAB> document text\n",
    "    all_data = gzip.open(filename).read().decode('UTF-8').split('\\n')\n",
    "    data, expected_class = [], []\n",
    "    for i in [line.split('\\t') for line in all_data][:-1]:\n",
    "        data.append(i[1])\n",
    "        expected_class.append(i[0])\n",
    "    return data, expected_class\n",
    "\n",
    "train_data, train_classes = read_data(SPORT_TEXT_PATH/file_name/'train.tsv.gz')\n",
    "train_data, train_classes = train_data[:20000], train_classes[:20000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['ach', 'aj', 'albo', 'bardzo', 'bez', 'bo', 'być', 'ci', 'cię', 'ciebie', 'co', 'czy', 'daleko', 'dla', 'dlaczego', 'dlatego', 'do', 'dobrze', 'dokąd', 'dość', 'dużo', 'dwa', 'dwaj', 'dwie', 'dwoje', 'dziś', 'dzisiaj', 'gdyby', 'gdzie', 'go', 'ich', 'ile', 'im', 'inny', 'ja', 'ją', 'jak', 'jakby', 'jaki', 'je', 'jeden', 'jedna', 'jedno', 'jego', 'jej', 'jemu', 'jeśli', 'jest', 'jestem', 'jeżeli', 'już', 'każdy', 'kiedy', 'kierunku', 'kto', 'ku', 'lub', 'ma', 'mają', 'mam', 'mi', 'mną', 'mnie', 'moi', 'mój', 'moja', 'moje', 'może', 'mu', 'my', 'na', 'nam', 'nami', 'nas', 'nasi', 'nasz', 'nasza', 'nasze', 'natychmiast', 'nią', 'nic', 'nich', 'nie', 'niego', 'niej', 'niemu', 'nigdy', 'nim', 'nimi', 'niż', 'obok', 'od', 'około', 'on', 'ona', 'one', 'oni', 'ono', 'owszem', 'po', 'pod', 'ponieważ', 'przed', 'przedtem', 'są', 'sam', 'sama', 'się', 'skąd', 'tak', 'taki', 'tam', 'ten', 'to', 'tobą', 'tobie', 'tu', 'tutaj', 'twoi', 'twój', 'twoja', 'twoje', 'ty', 'wam', 'wami', 'was', 'wasi', 'wasz', 'wasza', 'wasze', 'we', 'więc', 'wszystko', 'wtedy', 'wy', 'żaden', 'zawsze', 'że', 'a', 'u', 'i', 'z', 'w', 'o']\n"
     ]
    }
   ],
   "source": [
    "stop_words = get_stop_words('pl') + ['a', 'u', 'i', 'z', 'w', 'o']\n",
    "print(stop_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\n",
      "Mindaugas Budzinauskas wierzy w odbudowę formy Kevina Johnsona. Czy ktoś opuści Polpharmę? Mindaugas Budzinauskas w rozmowie z WP SportoweFakty opowiada o transferze Kevina Johnsona, ewentualnych odejściach z Polpharmy i kolejnym meczu PLK z Anwilem. - Potrzebowaliśmy takiego gracza, jak Johnson - podkreśla szkoleniowiec starogardzian.\n"
     ]
    }
   ],
   "source": [
    "print(train_classes[0])\n",
    "print(train_data[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data_tokenized = [list(set(gensim.utils.tokenize(x, lowercase=True))) for x in train_data]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['i',\n",
       " 'kolejnym',\n",
       " 'polpharmę',\n",
       " 'ktoś',\n",
       " 'takiego',\n",
       " 'gracza',\n",
       " 'formy',\n",
       " 'johnsona',\n",
       " 'anwilem',\n",
       " 'szkoleniowiec',\n",
       " 'z',\n",
       " 'mindaugas',\n",
       " 'starogardzian',\n",
       " 'czy',\n",
       " 'podkreśla',\n",
       " 'transferze',\n",
       " 'budzinauskas',\n",
       " 'plk',\n",
       " 'kevina',\n",
       " 'polpharmy',\n",
       " 'opuści',\n",
       " 'sportowefakty',\n",
       " 'o',\n",
       " 'wp',\n",
       " 'rozmowie',\n",
       " 'w',\n",
       " 'opowiada',\n",
       " 'wierzy',\n",
       " 'meczu',\n",
       " 'potrzebowaliśmy',\n",
       " 'ewentualnych',\n",
       " 'jak',\n",
       " 'odejściach',\n",
       " 'johnson',\n",
       " 'odbudowę']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data_tokenized[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# crude normalisation: drop stop words, then keep 6-character prefixes as stem-like tokens\n",
    "train_data_lemmatized = [list(set([w[:6] for w in set(i) - set(stop_words)])) for i in train_data_tokenized]\n",
    "tmp = [i.sort() for i in train_data_lemmatized]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['anwile', 'budzin', 'ewentu', 'formy', 'gracza', 'johnso', 'kevina', 'kolejn', 'ktoś', 'meczu', 'mindau', 'odbudo', 'odejśc', 'opowia', 'opuści', 'plk', 'podkre', 'polpha', 'potrze', 'rozmow', 'sporto', 'starog', 'szkole', 'takieg', 'transf', 'wierzy', 'wp']\n"
     ]
    }
   ],
   "source": [
    "print(train_data_lemmatized[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['anwile', 'budzin', 'ewentu', 'formy', 'gracza', 'johnso', 'kevina', 'kolejn', 'ktoś', 'meczu', 'mindau', 'odbudo', 'odejśc', 'opowia', 'opuści', 'plk', 'podkre', 'polpha', 'potrze', 'rozmow', 'sporto', 'starog', 'szkole', 'takieg', 'transf', 'wierzy', 'wp']\n",
      "['anwile budzin ewentu formy gracza johnso kevina kolejn ktoś meczu mindau odbudo odejśc opowia opuści plk podkre polpha potrze rozmow sporto starog szkole takieg transf wierzy wp', 'artura barwac bełcha będzie kolejn kontra lata pge polski przyjm reprez rok rozbra sezoni skry skrą szalpu trwał tylko wrócił występ zawart znów został']\n"
     ]
    }
   ],
   "source": [
    "print(train_data_lemmatized[0])\n",
    "print([' '.join(i) for i in train_data_lemmatized[:2]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorizer = TfidfVectorizer()\n",
    "X = vectorizer.fit_transform([' '.join(i) for i in train_data_lemmatized])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# vocabulary of the fitted vectorizer (newer scikit-learn: get_feature_names_out())\n",
    "vocabulary = vectorizer.get_feature_names()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.naive_bayes import GaussianNB\n",
    "model = GaussianNB()\n",
    "model.fit(X.toarray(), train_classes)\n",
    "score_train = model.score(X.toarray(), train_classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('dev-0/in.tsv', \"r\", encoding=\"utf-8\") as f:\n",
    "    dev_0_data = [line.rstrip() for line in f]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "dev_0_data_tokenized = [list(set(gensim.utils.tokenize(x, lowercase=True))) for x in dev_0_data]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "dev_0_data_lemmatized = [list(set([w[:6] for w in set(i) - set(stop_words)])) for i in dev_0_data_tokenized]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# \"w\" (not \"a\") so that a re-run does not append duplicate predictions\n",
    "f = open(\"dev-0/out.tsv\", \"w\")\n",
    "for i in [' '.join(i) for i in dev_0_data_lemmatized]:\n",
    "    f.write(model.predict([vectorizer.transform([i]).toarray()[0]])[0] + '\\n')\n",
    "f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('dev-0/out.tsv', \"r\", encoding=\"utf-8\") as f:\n",
    "    o = [line.rstrip() for line in f]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('dev-0/expected.tsv', \"r\", encoding=\"utf-8\") as f:\n",
    "    e = [line.rstrip() for line in f]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5023 429\n",
      "0.9213132795304475\n"
     ]
    }
   ],
   "source": [
    "# manual accuracy check on dev-0 (t = correct, f = wrong)\n",
    "t, f = 0, 0\n",
    "\n",
    "for i in range(len(o)):\n",
    "    if o[i] == e[i]:\n",
    "        t += 1\n",
    "    else:\n",
    "        f += 1\n",
    "print(t, f)\n",
    "print(t/(t + f))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('test-A/in.tsv', \"r\", encoding=\"utf-8\") as f:\n",
    "    test_A_data = [line.rstrip() for line in f]\n",
    "test_A_data_tokenized = [list(set(gensim.utils.tokenize(x, lowercase=True))) for x in test_A_data]\n",
    "test_A_data_lemmatized = [list(set([w[:6] for w in set(i) - set(stop_words)])) for i in test_A_data_tokenized]\n",
    "f = open(\"test-A/out.tsv\", \"w\")\n",
    "for i in [' '.join(i) for i in test_A_data_lemmatized]:\n",
    "    f.write(model.predict([vectorizer.transform([i]).toarray()[0]])[0] + '\\n')\n",
    "f.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

naiwny_bayes2_gotowa_biblioteka_fras.py (new file, 186 lines)
@@ -0,0 +1,186 @@
#!/usr/bin/env python
# coding: utf-8

# # Homework: naive Bayes 2, using a ready-made library

# - fork one of the repositories below:
#   - https://git.wmi.amu.edu.pl/kubapok/paranormal-or-skeptic-ISI-public
#   - https://git.wmi.amu.edu.pl/kubapok/sport-text-classification-ball-ISI-public
# - build a classifier based on naive Bayes (a ready-made library is allowed); it may also use ready-made tf-idf implementations
# - write the predictions to the files dev-0/out.tsv and test-A/out.tsv
# - the accuracy, checked with the geval tool (see the previous assignment), should be at least 0.67
# - put the predictions and the generating scripts (in plain-text form, not as Jupyter notebooks) in the repo, and post a link to your repo in MS TEAMS
# deadline 12.05, 40 points
#
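
# Note: the accuracy above is what GEval reports; a typical invocation, assuming
# geval is installed and run from the challenge root where it compares
# dev-0/out.tsv against dev-0/expected.tsv (flag name as documented in the
# GEval README), would be:
#
#     geval --test-name dev-0
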
# In[1]:


import pathlib
import gzip
import numpy as np
import gensim
from stop_words import get_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer


# In[2]:


SPORT_TEXT_PATH = pathlib.Path('C:/Users/Fijka/Documents/sport-text-classification-ball-ISI-public')
file_name = 'train'


# In[3]:


def read_data(filename):
    # each line holds: expected class <TAB> document text
    all_data = gzip.open(filename).read().decode('UTF-8').split('\n')
    data, expected_class = [], []
    for i in [line.split('\t') for line in all_data][:-1]:
        data.append(i[1])
        expected_class.append(i[0])
    return data, expected_class

train_data, train_classes = read_data(SPORT_TEXT_PATH/file_name/'train.tsv.gz')
train_data, train_classes = train_data[:20000], train_classes[:20000]


# In[4]:


stop_words = get_stop_words('pl') + ['a', 'u', 'i', 'z', 'w', 'o']
print(stop_words)


# In[5]:


print(train_classes[0])
print(train_data[0])


# In[6]:


train_data_tokenized = [list(set(gensim.utils.tokenize(x, lowercase=True))) for x in train_data]


# In[7]:


train_data_tokenized[0]


# In[8]:


# crude normalisation: drop stop words, then keep 6-character prefixes as stem-like tokens
train_data_lemmatized = [list(set([w[:6] for w in set(i) - set(stop_words)])) for i in train_data_tokenized]
tmp = [i.sort() for i in train_data_lemmatized]


# In[9]:


print(train_data_lemmatized[0])


# In[10]:


print(train_data_lemmatized[0])
print([' '.join(i) for i in train_data_lemmatized[:2]])


# In[11]:


vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform([' '.join(i) for i in train_data_lemmatized])


# In[12]:


# vocabulary of the fitted vectorizer (newer scikit-learn: get_feature_names_out())
vocabulary = vectorizer.get_feature_names()


# In[13]:


from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X.toarray(), train_classes)
score_train = model.score(X.toarray(), train_classes)
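
# GaussianNB needs dense input, which is why X.toarray() materialises the whole
# tf-idf matrix in memory. For sparse, non-negative tf-idf features, MultinomialNB
# is the usual naive Bayes choice and accepts the sparse matrix directly; a
# minimal sketch, assuming the X and train_classes defined above (sparse_model
# is an illustrative name, not part of the original script):
from sklearn.naive_bayes import MultinomialNB
sparse_model = MultinomialNB()
sparse_model.fit(X, train_classes)           # no .toarray() needed
print(sparse_model.score(X, train_classes))  # training accuracy, for comparison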


# In[14]:


with open('dev-0/in.tsv', "r", encoding="utf-8") as f:
    dev_0_data = [line.rstrip() for line in f]


# In[15]:


dev_0_data_tokenized = [list(set(gensim.utils.tokenize(x, lowercase=True))) for x in dev_0_data]


# In[16]:


dev_0_data_lemmatized = [list(set([w[:6] for w in set(i) - set(stop_words)])) for i in dev_0_data_tokenized]


# In[17]:


# "w" (not "a") so that a re-run does not append duplicate predictions
f = open("dev-0/out.tsv", "w")
for i in [' '.join(i) for i in dev_0_data_lemmatized]:
    f.write(model.predict([vectorizer.transform([i]).toarray()[0]])[0] + '\n')
f.close()
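
# Predicting one document at a time densifies a 1-row matrix per call, which is
# slow. The same file can be produced with one batched call; a sketch assuming
# the vectorizer and model defined above (batch_docs and batch_preds are
# illustrative names, not part of the original script):
batch_docs = [' '.join(i) for i in dev_0_data_lemmatized]
batch_preds = model.predict(vectorizer.transform(batch_docs).toarray())
with open("dev-0/out.tsv", "w", encoding="utf-8") as out:
    out.write('\n'.join(batch_preds) + '\n')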


# In[18]:


with open('dev-0/out.tsv', "r", encoding="utf-8") as f:
    o = [line.rstrip() for line in f]


# In[19]:


with open('dev-0/expected.tsv', "r", encoding="utf-8") as f:
    e = [line.rstrip() for line in f]


# In[20]:


# manual accuracy check on dev-0 (t = correct, f = wrong)
t, f = 0, 0

for i in range(len(o)):
    if o[i] == e[i]:
        t += 1
    else:
        f += 1
print(t, f)
print(t/(t + f))


# In[21]:


with open('test-A/in.tsv', "r", encoding="utf-8") as f:
    test_A_data = [line.rstrip() for line in f]
test_A_data_tokenized = [list(set(gensim.utils.tokenize(x, lowercase=True))) for x in test_A_data]
test_A_data_lemmatized = [list(set([w[:6] for w in set(i) - set(stop_words)])) for i in test_A_data_tokenized]
f = open("test-A/out.tsv", "w")
for i in [' '.join(i) for i in test_A_data_lemmatized]:
    f.write(model.predict([vectorizer.transform([i]).toarray()[0]])[0] + '\n')
f.close()

test-A/out.tsv (new file, 5447 lines)
File diff suppressed because it is too large