09
This commit is contained in:
parent
31a53bb658
commit
d4b869c0ce
733
cw/09_word2vec_oraz_biblioteki_NLP.ipynb
Normal file
733
cw/09_word2vec_oraz_biblioteki_NLP.ipynb
Normal file
@ -0,0 +1,733 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||||
|
"<div class=\"alert alert-block alert-info\">\n",
|
||||||
|
"<h1> Ekstrakcja informacji </h1>\n",
|
||||||
|
"<h2> 8. <i>word2vec i gotowe biblioteki</i> [ćwiczenia]</h2> \n",
|
||||||
|
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||||
|
"</div>\n",
|
||||||
|
"\n",
|
||||||
|
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Requirement already satisfied: gensim in /home/kuba/anaconda3/envs/zajeciaei/lib/python3.10/site-packages (4.2.0)\n",
|
||||||
|
"Requirement already satisfied: numpy>=1.17.0 in /home/kuba/anaconda3/envs/zajeciaei/lib/python3.10/site-packages (from gensim) (1.22.3)\n",
|
||||||
|
"Requirement already satisfied: scipy>=0.18.1 in /home/kuba/anaconda3/envs/zajeciaei/lib/python3.10/site-packages (from gensim) (1.8.0)\n",
|
||||||
|
"Requirement already satisfied: smart-open>=1.8.1 in /home/kuba/anaconda3/envs/zajeciaei/lib/python3.10/site-packages (from gensim) (6.0.0)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# %pip (unlike !pip) installs into the environment of the running kernel.\n",
"%pip install gensim"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import gensim.downloader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/tmp/ipykernel_62420/1831104553.py:2: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n",
|
||||||
|
" from IPython.core.display import display, HTML\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from IPython.display import Image\n",
|
||||||
|
"from IPython.display import display, HTML"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Mikolov et al., Efficient Estimation of Word Representations in Vector Space (2013)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"![title](obrazki/w2v.png)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"word_vectors = gensim.downloader.load(\"glove-wiki-gigaword-100\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"array([ 0.30817 , 0.30938 , 0.52803 , -0.92543 , -0.73671 ,\n",
|
||||||
|
" 0.63475 , 0.44197 , 0.10262 , -0.09142 , -0.56607 ,\n",
|
||||||
|
" -0.5327 , 0.2013 , 0.7704 , -0.13983 , 0.13727 ,\n",
|
||||||
|
" 1.1128 , 0.89301 , -0.17869 , -0.0019722, 0.57289 ,\n",
|
||||||
|
" 0.59479 , 0.50428 , -0.28991 , -1.3491 , 0.42756 ,\n",
|
||||||
|
" 1.2748 , -1.1613 , -0.41084 , 0.042804 , 0.54866 ,\n",
|
||||||
|
" 0.18897 , 0.3759 , 0.58035 , 0.66975 , 0.81156 ,\n",
|
||||||
|
" 0.93864 , -0.51005 , -0.070079 , 0.82819 , -0.35346 ,\n",
|
||||||
|
" 0.21086 , -0.24412 , -0.16554 , -0.78358 , -0.48482 ,\n",
|
||||||
|
" 0.38968 , -0.86356 , -0.016391 , 0.31984 , -0.49246 ,\n",
|
||||||
|
" -0.069363 , 0.018869 , -0.098286 , 1.3126 , -0.12116 ,\n",
|
||||||
|
" -1.2399 , -0.091429 , 0.35294 , 0.64645 , 0.089642 ,\n",
|
||||||
|
" 0.70294 , 1.1244 , 0.38639 , 0.52084 , 0.98787 ,\n",
|
||||||
|
" 0.79952 , -0.34625 , 0.14095 , 0.80167 , 0.20987 ,\n",
|
||||||
|
" -0.86007 , -0.15308 , 0.074523 , 0.40816 , 0.019208 ,\n",
|
||||||
|
" 0.51587 , -0.34428 , -0.24525 , -0.77984 , 0.27425 ,\n",
|
||||||
|
" 0.22418 , 0.20164 , 0.017431 , -0.014697 , -1.0235 ,\n",
|
||||||
|
" -0.39695 , -0.0056188, 0.30569 , 0.31748 , 0.021404 ,\n",
|
||||||
|
" 0.11837 , -0.11319 , 0.42456 , 0.53405 , -0.16717 ,\n",
|
||||||
|
" -0.27185 , -0.6255 , 0.12883 , 0.62529 , -0.52086 ],\n",
|
||||||
|
" dtype=float32)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"word_vectors['dog']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"100"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len(word_vectors['dog'])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"$ A = (a_1, a_2, \\ldots, a_n)$\n",
|
||||||
|
"\n",
|
||||||
|
"$ B = (b_1, b_2, \\ldots, b_n)$"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"$A \\cdot B = a_1*b_1 + a_2*b_2 + \\ldots + a_n*b_n$"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"$A \\cdot B = |A| |B| \\cos(\\theta)$"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"cosine_similarity = $\\frac{A \\cdot B}{|A||B|}$"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"![image.png](obrazki/cos.png)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"array([ 0.24340999, 0.23372999, 0.34519994, -1.19175 , -1.4724072 ,\n",
|
||||||
|
" 0.34235 , 0.60779 , 0.261443 , 0.06009999, -1.37846 ,\n",
|
||||||
|
" -0.88091004, 0.08861998, 1.05097 , -0.37221998, -0.05504 ,\n",
|
||||||
|
" 2.07504 , 1.2128501 , -0.17209001, 0.5188256 , 0.68386996,\n",
|
||||||
|
" 0.26919997, 0.977559 , -0.41735998, -2.29253 , 0.06891 ,\n",
|
||||||
|
" 1.9723799 , -1.7875899 , -0.1394 , -0.08426201, 0.73421997,\n",
|
||||||
|
" 0.449713 , 0.27947 , 1.1328939 , 1.48901 , 1.44769 ,\n",
|
||||||
|
" 2.25301 , -0.23492998, -0.721868 , 0.78779006, -0.73836505,\n",
|
||||||
|
" 0.88069 , -0.447323 , -1.29005 , -1.39741 , -1.10009 ,\n",
|
||||||
|
" 0.50502 , -1.6576351 , -0.055184 , 0.38991004, -0.76956004,\n",
|
||||||
|
" 0.185334 , 0.43640798, -0.882702 , 0.83290005, 0.13615999,\n",
|
||||||
|
" -0.23210001, 0.58739203, 0.24005997, 0.05180001, -0.398276 ,\n",
|
||||||
|
" 0.99437 , 1.40552 , 1.3153701 , 1.20883 , 1.23647 ,\n",
|
||||||
|
" 1.692517 , -1.5952799 , -0.22698998, 2.10365 , 0.15522999,\n",
|
||||||
|
" -1.87457 , -0.01184002, 0.03998601, 1.0829899 , -0.315964 ,\n",
|
||||||
|
" 0.98266095, -0.86874 , 0.09540001, -1.0042601 , 0.83836997,\n",
|
||||||
|
" -0.29442003, 0.05798 , 0.063619 , 0.197066 , -0.7356999 ,\n",
|
||||||
|
" -0.222 , 0.5118224 , 0.73807997, 0.733638 , 0.577438 ,\n",
|
||||||
|
" -0.04933 , 0.14863001, 0.39170003, 1.022125 , -0.08759001,\n",
|
||||||
|
" -0.589356 , -0.86798 , 1.19477 , 1.211442 , -0.50261 ],\n",
|
||||||
|
" dtype=float32)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"word_vectors['dog'] + word_vectors['dog'] - word_vectors['man']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[('yellow', 0.7358633279800415),\n",
|
||||||
|
" ('red', 0.7140780091285706),\n",
|
||||||
|
" ('blue', 0.7118036150932312),\n",
|
||||||
|
" ('green', 0.7111418843269348),\n",
|
||||||
|
" ('pink', 0.677507221698761),\n",
|
||||||
|
" ('purple', 0.6774231791496277),\n",
|
||||||
|
" ('black', 0.6709616780281067),\n",
|
||||||
|
" ('colored', 0.665260910987854),\n",
|
||||||
|
" ('lemon', 0.6251963973045349),\n",
|
||||||
|
" ('peach', 0.6168624758720398)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"word_vectors.most_similar(positive=['orange'])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[('queen', 0.7698540687561035),\n",
|
||||||
|
" ('monarch', 0.6843381524085999),\n",
|
||||||
|
" ('throne', 0.6755736470222473),\n",
|
||||||
|
" ('daughter', 0.6594556570053101),\n",
|
||||||
|
" ('princess', 0.6520534157752991),\n",
|
||||||
|
" ('prince', 0.6517034769058228),\n",
|
||||||
|
" ('elizabeth', 0.6464517712593079),\n",
|
||||||
|
" ('mother', 0.631171703338623),\n",
|
||||||
|
" ('emperor', 0.6106470823287964),\n",
|
||||||
|
" ('wife', 0.6098655462265015)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[('berlin', 0.8846380710601807),\n",
|
||||||
|
" ('frankfurt', 0.7985543608665466),\n",
|
||||||
|
" ('vienna', 0.7675994038581848),\n",
|
||||||
|
" ('munich', 0.7542588114738464),\n",
|
||||||
|
" ('hamburg', 0.7182371616363525),\n",
|
||||||
|
" ('bonn', 0.6890878081321716),\n",
|
||||||
|
" ('prague', 0.6842440962791443),\n",
|
||||||
|
" ('cologne', 0.6762093305587769),\n",
|
||||||
|
" ('zurich', 0.6653268933296204),\n",
|
||||||
|
" ('leipzig', 0.6619253754615784)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"word_vectors.most_similar(positive=['paris', 'germany'], negative=['france'])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[('walked', 0.6780266761779785),\n",
|
||||||
|
" ('crawled', 0.6523419618606567),\n",
|
||||||
|
" ('wandered', 0.6384280323982239),\n",
|
||||||
|
" ('hopped', 0.6131909489631653),\n",
|
||||||
|
" ('walks', 0.6122221946716309),\n",
|
||||||
|
" ('walk', 0.6120144724845886),\n",
|
||||||
|
" ('strolled', 0.6010454893112183),\n",
|
||||||
|
" ('slept', 0.5912748575210571),\n",
|
||||||
|
" ('wandering', 0.5861443877220154),\n",
|
||||||
|
" ('waited', 0.5791574716567993)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"word_vectors.most_similar(positive=['walking', 'swam'], negative=['swimming'])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[('puppies', 0.6867596507072449),\n",
|
||||||
|
" ('kitten', 0.6866797208786011),\n",
|
||||||
|
" ('kittens', 0.6383703947067261),\n",
|
||||||
|
" ('monkey', 0.6171091198921204),\n",
|
||||||
|
" ('rabbit', 0.6136822700500488),\n",
|
||||||
|
" ('pup', 0.6054644584655762),\n",
|
||||||
|
" ('tabby', 0.5937005281448364),\n",
|
||||||
|
" ('retriever', 0.5934329628944397),\n",
|
||||||
|
" ('bitch', 0.5817775130271912),\n",
|
||||||
|
" ('hound', 0.57785564661026)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"word_vectors.most_similar(positive=['puppy', 'cat'], negative=['dog'])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[('dog', 0.8798074722290039),\n",
|
||||||
|
" ('rabbit', 0.7424427270889282),\n",
|
||||||
|
" ('cats', 0.732300341129303),\n",
|
||||||
|
" ('monkey', 0.7288709878921509),\n",
|
||||||
|
" ('pet', 0.719014048576355),\n",
|
||||||
|
" ('dogs', 0.7163872718811035),\n",
|
||||||
|
" ('mouse', 0.6915250420570374),\n",
|
||||||
|
" ('puppy', 0.6800068020820618),\n",
|
||||||
|
" ('rat', 0.6641027331352234),\n",
|
||||||
|
" ('spider', 0.6501135230064392)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"word_vectors.most_similar(positive=['cat'])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"![image.png](obrazki/linear-relationships.png)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## vowpal wabbit "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.datasets import fetch_20newsgroups"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"newsgroups_train = fetch_20newsgroups(subset = 'train')\n",
|
||||||
|
"newsgroups_test = fetch_20newsgroups(subset = 'test')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"with open('vw_20_newsgroup_train', 'w') as f:\n",
|
||||||
|
" for target, text in zip(newsgroups_train['target'],newsgroups_train['data']):\n",
|
||||||
|
" f.write(str(target + 1) + ' |text ' + text.replace('\\n',' ').replace(':','') + '\\n')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"with open('vw_20_newsgroup_test', 'w') as f, open('20_newsgroup_test_targets', 'w') as f_targets:\n",
|
||||||
|
" for target, text in zip(newsgroups_test['target'],newsgroups_test['data']):\n",
|
||||||
|
" f.write('1 |text ' + text.replace('\\n',' ').replace(':','') + '\\n')\n",
|
||||||
|
" f_targets.write(str(target + 1) + '\\n')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"final_regressor = vw_newsgroup_model.vw\n",
|
||||||
|
"Num weight bits = 18\n",
|
||||||
|
"learning rate = 0.5\n",
|
||||||
|
"initial_t = 0\n",
|
||||||
|
"power_t = 0.5\n",
|
||||||
|
"using no cache\n",
|
||||||
|
"Reading datafile = vw_20_newsgroup_train\n",
|
||||||
|
"num sources = 1\n",
|
||||||
|
"average since example example current current current\n",
|
||||||
|
"loss last counter weight label predict features\n",
|
||||||
|
"1.000000 1.000000 1 1.0 8 1 124\n",
|
||||||
|
"1.000000 1.000000 2 2.0 5 8 124\n",
|
||||||
|
"0.750000 0.500000 4 4.0 2 5 114\n",
|
||||||
|
"0.875000 1.000000 8 8.0 4 15 417\n",
|
||||||
|
"0.937500 1.000000 16 16.0 1 15 203\n",
|
||||||
|
"0.968750 1.000000 32 32.0 14 7 236\n",
|
||||||
|
"0.953125 0.937500 64 64.0 7 5 50\n",
|
||||||
|
"0.875000 0.796875 128 128.0 17 15 416\n",
|
||||||
|
"0.828125 0.781250 256 256.0 3 1 251\n",
|
||||||
|
"0.757812 0.687500 512 512.0 4 5 163\n",
|
||||||
|
"0.680664 0.603516 1024 1024.0 14 1 183\n",
|
||||||
|
"0.559570 0.438477 2048 2048.0 7 13 65\n",
|
||||||
|
"0.440918 0.322266 4096 4096.0 15 15 94\n",
|
||||||
|
"0.337402 0.233887 8192 8192.0 16 16 384\n",
|
||||||
|
"\n",
|
||||||
|
"finished run\n",
|
||||||
|
"number of examples = 11314\n",
|
||||||
|
"weighted example sum = 11314.000000\n",
|
||||||
|
"weighted label sum = 0.000000\n",
|
||||||
|
"average loss = 0.300601\n",
|
||||||
|
"total feature number = 3239430\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"!vw --oaa 20 -d 'vw_20_newsgroup_train' -f vw_newsgroup_model.vw"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"only testing\n",
|
||||||
|
"predictions = vw_20_newsgroup_train_pred\n",
|
||||||
|
"Num weight bits = 18\n",
|
||||||
|
"learning rate = 0.5\n",
|
||||||
|
"initial_t = 0\n",
|
||||||
|
"power_t = 0.5\n",
|
||||||
|
"using no cache\n",
|
||||||
|
"Reading datafile = vw_20_newsgroup_train\n",
|
||||||
|
"num sources = 1\n",
|
||||||
|
"average since example example current current current\n",
|
||||||
|
"loss last counter weight label predict features\n",
|
||||||
|
"0.000000 0.000000 1 1.0 8 8 124\n",
|
||||||
|
"0.000000 0.000000 2 2.0 5 5 124\n",
|
||||||
|
"0.000000 0.000000 4 4.0 2 2 114\n",
|
||||||
|
"0.000000 0.000000 8 8.0 4 4 417\n",
|
||||||
|
"0.000000 0.000000 16 16.0 1 1 203\n",
|
||||||
|
"0.000000 0.000000 32 32.0 14 14 236\n",
|
||||||
|
"0.000000 0.000000 64 64.0 7 7 50\n",
|
||||||
|
"0.015625 0.031250 128 128.0 17 17 416\n",
|
||||||
|
"0.015625 0.015625 256 256.0 3 3 251\n",
|
||||||
|
"0.011719 0.007812 512 512.0 4 4 163\n",
|
||||||
|
"0.018555 0.025391 1024 1024.0 14 14 183\n",
|
||||||
|
"0.017578 0.016602 2048 2048.0 7 7 65\n",
|
||||||
|
"0.018555 0.019531 4096 4096.0 15 15 94\n",
|
||||||
|
"0.020264 0.021973 8192 8192.0 16 16 384\n",
|
||||||
|
"\n",
|
||||||
|
"finished run\n",
|
||||||
|
"number of examples = 11314\n",
|
||||||
|
"weighted example sum = 11314.000000\n",
|
||||||
|
"weighted label sum = 0.000000\n",
|
||||||
|
"average loss = 0.020771\n",
|
||||||
|
"total feature number = 3239430\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"!vw -i vw_newsgroup_model.vw -t -d vw_20_newsgroup_train -p vw_20_newsgroup_train_pred"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 20,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"only testing\n",
|
||||||
|
"predictions = 20_newsgroup_test_pred\n",
|
||||||
|
"Num weight bits = 18\n",
|
||||||
|
"learning rate = 0.5\n",
|
||||||
|
"initial_t = 0\n",
|
||||||
|
"power_t = 0.5\n",
|
||||||
|
"using no cache\n",
|
||||||
|
"Reading datafile = vw_20_newsgroup_test\n",
|
||||||
|
"num sources = 1\n",
|
||||||
|
"average since example example current current current\n",
|
||||||
|
"loss last counter weight label predict features\n",
|
||||||
|
"1.000000 1.000000 1 1.0 1 10 118\n",
|
||||||
|
"0.500000 0.000000 2 2.0 1 1 145\n",
|
||||||
|
"0.250000 0.000000 4 4.0 1 1 885\n",
|
||||||
|
"0.625000 1.000000 8 8.0 1 14 112\n",
|
||||||
|
"0.750000 0.875000 16 16.0 1 4 427\n",
|
||||||
|
"0.843750 0.937500 32 32.0 1 6 111\n",
|
||||||
|
"0.906250 0.968750 64 64.0 1 20 65\n",
|
||||||
|
"0.921875 0.937500 128 128.0 1 1 322\n",
|
||||||
|
"0.933594 0.945312 256 256.0 1 18 183\n",
|
||||||
|
"0.933594 0.933594 512 512.0 1 10 507\n",
|
||||||
|
"0.935547 0.937500 1024 1024.0 1 5 139\n",
|
||||||
|
"0.937500 0.939453 2048 2048.0 1 6 154\n",
|
||||||
|
"0.933350 0.929199 4096 4096.0 1 10 180\n",
|
||||||
|
"\n",
|
||||||
|
"finished run\n",
|
||||||
|
"number of examples = 7532\n",
|
||||||
|
"weighted example sum = 7532.000000\n",
|
||||||
|
"weighted label sum = 0.000000\n",
|
||||||
|
"average loss = 0.932953\n",
|
||||||
|
"total feature number = 2086305\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"!vw -i vw_newsgroup_model.vw -t -d vw_20_newsgroup_test -p 20_newsgroup_test_pred"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"0.68441317047265\r\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"!geval --metric Accuracy -o 20_newsgroup_test_pred -e 20_newsgroup_test_targets"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## fastText"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 22,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Training file for `fasttext supervised`: one example per line, the text followed by its __label__<class> token.\n",
"with open('ss_20_newsgroup_train', 'w') as f:\n",
"    for target, text in zip(newsgroups_train['target'], newsgroups_train['data']):\n",
"        # The space before __label__ keeps the label a separate token even when the text has no trailing newline.\n",
"        f.write(text.replace('\\n', ' ').replace(':', '') + ' __label__' + str(target + 1) + '\\n')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 23,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"with open('ss_20_newsgroup_test', 'w') as f:\n",
|
||||||
|
" for target, text in zip(newsgroups_test['target'],newsgroups_test['data']):\n",
|
||||||
|
" f.write(text.replace('\\n',' ').replace(':','') +'\\n')\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 24,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Read 3M words\n",
|
||||||
|
"Number of words: 275356\n",
|
||||||
|
"Number of labels: 20\n",
|
||||||
|
"Progress: 100.0% words/sec/thread: 1103389 lr: 0.000000 avg.loss: 0.817102 ETA: 0h 0m 0s\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"!/home/kuba/fastText/fasttext supervised -input ss_20_newsgroup_train -output ss_model -epoch 50"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 25,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"!/home/kuba/fastText/fasttext predict ss_model.bin ss_20_newsgroup_test > ss_20_newsgroup_test_pred"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 26,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"! cat ss_20_newsgroup_test_pred | sed 's|__label__||' > ss_20_newsgroup_test_pred_label_only"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 27,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"0.7199946893255443\r\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"!geval --metric Accuracy -o ss_20_newsgroup_test_pred_label_only -e 20_newsgroup_test_targets"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"author": "Jakub Pokrywka",
|
||||||
|
"email": "kubapok@wmi.amu.edu.pl",
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"lang": "pl",
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.4"
|
||||||
|
},
|
||||||
|
"subtitle": "8.word2vec i gotowe biblioteki[ćwiczenia]",
|
||||||
|
"title": "Ekstrakcja informacji",
|
||||||
|
"year": "2021"
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user