{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
"<div class=\"alert alert-block alert-info\">\n",
"<h1> Ekstrakcja informacji </h1>\n",
"<h2> 8. <i>word2vec i gotowe biblioteki</i> [ćwiczenia]</h2> \n",
"<h3> Jakub Pokrywka (2021)</h3>\n",
"</div>\n",
"\n",
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: gensim in /home/kuba/anaconda3/envs/zajeciaei/lib/python3.10/site-packages (4.2.0)\n",
"Requirement already satisfied: numpy>=1.17.0 in /home/kuba/anaconda3/envs/zajeciaei/lib/python3.10/site-packages (from gensim) (1.22.3)\n",
"Requirement already satisfied: scipy>=0.18.1 in /home/kuba/anaconda3/envs/zajeciaei/lib/python3.10/site-packages (from gensim) (1.8.0)\n",
"Requirement already satisfied: smart-open>=1.8.1 in /home/kuba/anaconda3/envs/zajeciaei/lib/python3.10/site-packages (from gensim) (6.0.0)\n"
]
}
],
"source": [
"!pip install gensim "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import gensim.downloader"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_62420/1831104553.py:2: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n",
" from IPython.core.display import display, HTML\n"
]
}
],
"source": [
"from IPython.display import Image\n",
"from IPython.core.display import display, HTML"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Mikolov et al., Efficient Estimation of Word Representations in Vector Space (2013)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![title](obrazki/w2v.png)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"word_vectors = gensim.downloader.load(\"glove-wiki-gigaword-100\")"
]
},
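{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a side note, `gensim.downloader.info()` lists everything the downloader can fetch; a minimal sketch to print the available pretrained models:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# list the pretrained embeddings available through gensim.downloader\n",
"available_models = gensim.downloader.info()['models']\n",
"print(list(available_models.keys()))"
]
},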
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.30817 , 0.30938 , 0.52803 , -0.92543 , -0.73671 ,\n",
" 0.63475 , 0.44197 , 0.10262 , -0.09142 , -0.56607 ,\n",
" -0.5327 , 0.2013 , 0.7704 , -0.13983 , 0.13727 ,\n",
" 1.1128 , 0.89301 , -0.17869 , -0.0019722, 0.57289 ,\n",
" 0.59479 , 0.50428 , -0.28991 , -1.3491 , 0.42756 ,\n",
" 1.2748 , -1.1613 , -0.41084 , 0.042804 , 0.54866 ,\n",
" 0.18897 , 0.3759 , 0.58035 , 0.66975 , 0.81156 ,\n",
" 0.93864 , -0.51005 , -0.070079 , 0.82819 , -0.35346 ,\n",
" 0.21086 , -0.24412 , -0.16554 , -0.78358 , -0.48482 ,\n",
" 0.38968 , -0.86356 , -0.016391 , 0.31984 , -0.49246 ,\n",
" -0.069363 , 0.018869 , -0.098286 , 1.3126 , -0.12116 ,\n",
" -1.2399 , -0.091429 , 0.35294 , 0.64645 , 0.089642 ,\n",
" 0.70294 , 1.1244 , 0.38639 , 0.52084 , 0.98787 ,\n",
" 0.79952 , -0.34625 , 0.14095 , 0.80167 , 0.20987 ,\n",
" -0.86007 , -0.15308 , 0.074523 , 0.40816 , 0.019208 ,\n",
" 0.51587 , -0.34428 , -0.24525 , -0.77984 , 0.27425 ,\n",
" 0.22418 , 0.20164 , 0.017431 , -0.014697 , -1.0235 ,\n",
" -0.39695 , -0.0056188, 0.30569 , 0.31748 , 0.021404 ,\n",
" 0.11837 , -0.11319 , 0.42456 , 0.53405 , -0.16717 ,\n",
" -0.27185 , -0.6255 , 0.12883 , 0.62529 , -0.52086 ],\n",
" dtype=float32)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_vectors['dog']"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"100"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(word_vectors['dog'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$ A = (a_1, a_2, \\ldots, a_n)$\n",
"\n",
"$ B = (b_1, b_2, \\ldots, b_n)$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$A \\cdot B = a_1* b_1 + a_2*b_2 + \\ldots a_n*b_n$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$A \\cdot B = |A| |B| cos(\\theta)$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"cosine_similarity = $\\frac{A \\cdot B}{|A||B|}$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![image.png](obrazki/cos.png)"
]
},
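{
"cell_type": "markdown",
"metadata": {},
"source": [
"To connect the formula above with the library: a minimal sketch that computes the cosine similarity of two word vectors directly with numpy and compares it with gensim's built-in `similarity` helper."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"a = word_vectors['dog']\n",
"b = word_vectors['cat']\n",
"\n",
"# cosine similarity straight from the definition above\n",
"print(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))\n",
"\n",
"# gensim's built-in helper should return (almost) the same number\n",
"print(word_vectors.similarity('dog', 'cat'))"
]
},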
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.24340999, 0.23372999, 0.34519994, -1.19175 , -1.4724072 ,\n",
" 0.34235 , 0.60779 , 0.261443 , 0.06009999, -1.37846 ,\n",
" -0.88091004, 0.08861998, 1.05097 , -0.37221998, -0.05504 ,\n",
" 2.07504 , 1.2128501 , -0.17209001, 0.5188256 , 0.68386996,\n",
" 0.26919997, 0.977559 , -0.41735998, -2.29253 , 0.06891 ,\n",
" 1.9723799 , -1.7875899 , -0.1394 , -0.08426201, 0.73421997,\n",
" 0.449713 , 0.27947 , 1.1328939 , 1.48901 , 1.44769 ,\n",
" 2.25301 , -0.23492998, -0.721868 , 0.78779006, -0.73836505,\n",
" 0.88069 , -0.447323 , -1.29005 , -1.39741 , -1.10009 ,\n",
" 0.50502 , -1.6576351 , -0.055184 , 0.38991004, -0.76956004,\n",
" 0.185334 , 0.43640798, -0.882702 , 0.83290005, 0.13615999,\n",
" -0.23210001, 0.58739203, 0.24005997, 0.05180001, -0.398276 ,\n",
" 0.99437 , 1.40552 , 1.3153701 , 1.20883 , 1.23647 ,\n",
" 1.692517 , -1.5952799 , -0.22698998, 2.10365 , 0.15522999,\n",
" -1.87457 , -0.01184002, 0.03998601, 1.0829899 , -0.315964 ,\n",
" 0.98266095, -0.86874 , 0.09540001, -1.0042601 , 0.83836997,\n",
" -0.29442003, 0.05798 , 0.063619 , 0.197066 , -0.7356999 ,\n",
" -0.222 , 0.5118224 , 0.73807997, 0.733638 , 0.577438 ,\n",
" -0.04933 , 0.14863001, 0.39170003, 1.022125 , -0.08759001,\n",
" -0.589356 , -0.86798 , 1.19477 , 1.211442 , -0.50261 ],\n",
" dtype=float32)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_vectors['dog'] + word_vectors['dog'] - word_vectors['man']"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('yellow', 0.7358633279800415),\n",
" ('red', 0.7140780091285706),\n",
" ('blue', 0.7118036150932312),\n",
" ('green', 0.7111418843269348),\n",
" ('pink', 0.677507221698761),\n",
" ('purple', 0.6774231791496277),\n",
" ('black', 0.6709616780281067),\n",
" ('colored', 0.665260910987854),\n",
" ('lemon', 0.6251963973045349),\n",
" ('peach', 0.6168624758720398)]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_vectors.most_similar(positive=['orange'])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('queen', 0.7698540687561035),\n",
" ('monarch', 0.6843381524085999),\n",
" ('throne', 0.6755736470222473),\n",
" ('daughter', 0.6594556570053101),\n",
" ('princess', 0.6520534157752991),\n",
" ('prince', 0.6517034769058228),\n",
" ('elizabeth', 0.6464517712593079),\n",
" ('mother', 0.631171703338623),\n",
" ('emperor', 0.6106470823287964),\n",
" ('wife', 0.6098655462265015)]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('berlin', 0.8846380710601807),\n",
" ('frankfurt', 0.7985543608665466),\n",
" ('vienna', 0.7675994038581848),\n",
" ('munich', 0.7542588114738464),\n",
" ('hamburg', 0.7182371616363525),\n",
" ('bonn', 0.6890878081321716),\n",
" ('prague', 0.6842440962791443),\n",
" ('cologne', 0.6762093305587769),\n",
" ('zurich', 0.6653268933296204),\n",
" ('leipzig', 0.6619253754615784)]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_vectors.most_similar(positive=['paris', 'germany'], negative=['france'])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('walked', 0.6780266761779785),\n",
" ('crawled', 0.6523419618606567),\n",
" ('wandered', 0.6384280323982239),\n",
" ('hopped', 0.6131909489631653),\n",
" ('walks', 0.6122221946716309),\n",
" ('walk', 0.6120144724845886),\n",
" ('strolled', 0.6010454893112183),\n",
" ('slept', 0.5912748575210571),\n",
" ('wandering', 0.5861443877220154),\n",
" ('waited', 0.5791574716567993)]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_vectors.most_similar(positive=['walking', 'swam'], negative=['swimming'])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('puppies', 0.6867596507072449),\n",
" ('kitten', 0.6866797208786011),\n",
" ('kittens', 0.6383703947067261),\n",
" ('monkey', 0.6171091198921204),\n",
" ('rabbit', 0.6136822700500488),\n",
" ('pup', 0.6054644584655762),\n",
" ('tabby', 0.5937005281448364),\n",
" ('retriever', 0.5934329628944397),\n",
" ('bitch', 0.5817775130271912),\n",
" ('hound', 0.57785564661026)]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_vectors.most_similar(positive=['puppy', 'cat'], negative=['dog'])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('dog', 0.8798074722290039),\n",
" ('rabbit', 0.7424427270889282),\n",
" ('cats', 0.732300341129303),\n",
" ('monkey', 0.7288709878921509),\n",
" ('pet', 0.719014048576355),\n",
" ('dogs', 0.7163872718811035),\n",
" ('mouse', 0.6915250420570374),\n",
" ('puppy', 0.6800068020820618),\n",
" ('rat', 0.6641027331352234),\n",
" ('spider', 0.6501135230064392)]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_vectors.most_similar(positive=['cat'])"
]
},
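{
"cell_type": "markdown",
"metadata": {},
"source": [
"Two more `KeyedVectors` helpers worth knowing (a small sketch on the same GloVe vectors): pairwise `similarity` and `doesnt_match`, which returns the word least similar to the others."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# pairwise cosine similarities\n",
"print(word_vectors.similarity('cat', 'dog'))\n",
"print(word_vectors.similarity('cat', 'car'))\n",
"\n",
"# which word does not fit with the rest?\n",
"print(word_vectors.doesnt_match(['breakfast', 'cereal', 'dinner', 'cat']))"
]
},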
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![image.png](obrazki/linear-relationships.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## vowpal wabbit "
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import fetch_20newsgroups"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"newsgroups_train = fetch_20newsgroups(subset = 'train')\n",
"newsgroups_test = fetch_20newsgroups(subset = 'test')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"with open('vw_20_newsgroup_train', 'w') as f:\n",
" for target, text in zip(newsgroups_train['target'],newsgroups_train['data']):\n",
" f.write(str(target + 1) + ' |text ' + text.replace('\\n',' ').replace(':','') + '\\n')"
]
},
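{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick peek at the generated file: Vowpal Wabbit's multiclass format is `<label> |<namespace> <text>`, with labels numbered from 1 (hence `target + 1` above)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# print the beginning of the first training example\n",
"with open('vw_20_newsgroup_train') as f:\n",
"    print(f.readline()[:200])"
]
},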
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"with open('vw_20_newsgroup_test', 'w') as f, open('20_newsgroup_test_targets', 'w') as f_targets:\n",
" for target, text in zip(newsgroups_test['target'],newsgroups_test['data']):\n",
" f.write('1 |text ' + text.replace('\\n',' ').replace(':','') + '\\n')\n",
" f_targets.write(str(target + 1) + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"final_regressor = vw_newsgroup_model.vw\n",
"Num weight bits = 18\n",
"learning rate = 0.5\n",
"initial_t = 0\n",
"power_t = 0.5\n",
"using no cache\n",
"Reading datafile = vw_20_newsgroup_train\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"1.000000 1.000000 1 1.0 8 1 124\n",
"1.000000 1.000000 2 2.0 5 8 124\n",
"0.750000 0.500000 4 4.0 2 5 114\n",
"0.875000 1.000000 8 8.0 4 15 417\n",
"0.937500 1.000000 16 16.0 1 15 203\n",
"0.968750 1.000000 32 32.0 14 7 236\n",
"0.953125 0.937500 64 64.0 7 5 50\n",
"0.875000 0.796875 128 128.0 17 15 416\n",
"0.828125 0.781250 256 256.0 3 1 251\n",
"0.757812 0.687500 512 512.0 4 5 163\n",
"0.680664 0.603516 1024 1024.0 14 1 183\n",
"0.559570 0.438477 2048 2048.0 7 13 65\n",
"0.440918 0.322266 4096 4096.0 15 15 94\n",
"0.337402 0.233887 8192 8192.0 16 16 384\n",
"\n",
"finished run\n",
"number of examples = 11314\n",
"weighted example sum = 11314.000000\n",
"weighted label sum = 0.000000\n",
"average loss = 0.300601\n",
"total feature number = 3239430\n"
]
}
],
"source": [
"!vw --oaa 20 -d 'vw_20_newsgroup_train' -f vw_newsgroup_model.vw"
]
},
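{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the flags used here: `--oaa 20` trains a one-against-all multiclass model with 20 classes, `-d` points to the training data and `-f` saves the trained model; below, `-i` loads a saved model, `-t` runs in test-only mode and `-p` writes predictions to a file."
]
},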
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"only testing\n",
"predictions = vw_20_newsgroup_train_pred\n",
"Num weight bits = 18\n",
"learning rate = 0.5\n",
"initial_t = 0\n",
"power_t = 0.5\n",
"using no cache\n",
"Reading datafile = vw_20_newsgroup_train\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"0.000000 0.000000 1 1.0 8 8 124\n",
"0.000000 0.000000 2 2.0 5 5 124\n",
"0.000000 0.000000 4 4.0 2 2 114\n",
"0.000000 0.000000 8 8.0 4 4 417\n",
"0.000000 0.000000 16 16.0 1 1 203\n",
"0.000000 0.000000 32 32.0 14 14 236\n",
"0.000000 0.000000 64 64.0 7 7 50\n",
"0.015625 0.031250 128 128.0 17 17 416\n",
"0.015625 0.015625 256 256.0 3 3 251\n",
"0.011719 0.007812 512 512.0 4 4 163\n",
"0.018555 0.025391 1024 1024.0 14 14 183\n",
"0.017578 0.016602 2048 2048.0 7 7 65\n",
"0.018555 0.019531 4096 4096.0 15 15 94\n",
"0.020264 0.021973 8192 8192.0 16 16 384\n",
"\n",
"finished run\n",
"number of examples = 11314\n",
"weighted example sum = 11314.000000\n",
"weighted label sum = 0.000000\n",
"average loss = 0.020771\n",
"total feature number = 3239430\n"
]
}
],
"source": [
"!vw -i vw_newsgroup_model.vw -t -d vw_20_newsgroup_train -p vw_20_newsgroup_train_pred"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"only testing\n",
"predictions = 20_newsgroup_test_pred\n",
"Num weight bits = 18\n",
"learning rate = 0.5\n",
"initial_t = 0\n",
"power_t = 0.5\n",
"using no cache\n",
"Reading datafile = vw_20_newsgroup_test\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"1.000000 1.000000 1 1.0 1 10 118\n",
"0.500000 0.000000 2 2.0 1 1 145\n",
"0.250000 0.000000 4 4.0 1 1 885\n",
"0.625000 1.000000 8 8.0 1 14 112\n",
"0.750000 0.875000 16 16.0 1 4 427\n",
"0.843750 0.937500 32 32.0 1 6 111\n",
"0.906250 0.968750 64 64.0 1 20 65\n",
"0.921875 0.937500 128 128.0 1 1 322\n",
"0.933594 0.945312 256 256.0 1 18 183\n",
"0.933594 0.933594 512 512.0 1 10 507\n",
"0.935547 0.937500 1024 1024.0 1 5 139\n",
"0.937500 0.939453 2048 2048.0 1 6 154\n",
"0.933350 0.929199 4096 4096.0 1 10 180\n",
"\n",
"finished run\n",
"number of examples = 7532\n",
"weighted example sum = 7532.000000\n",
"weighted label sum = 0.000000\n",
"average loss = 0.932953\n",
"total feature number = 2086305\n"
]
}
],
"source": [
"!vw -i vw_newsgroup_model.vw -t -d vw_20_newsgroup_test -p 20_newsgroup_test_pred"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.68441317047265\r\n"
]
}
],
"source": [
"!geval --metric Accuracy -o 20_newsgroup_test_pred -e 20_newsgroup_test_targets"
]
},
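{
"cell_type": "markdown",
"metadata": {},
"source": [
"The same accuracy can be checked without `geval` by comparing the prediction file with the targets file line by line (a sketch; it assumes one label per line in both files)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# accuracy computed directly from the two files written above\n",
"with open('20_newsgroup_test_pred') as f_pred, open('20_newsgroup_test_targets') as f_true:\n",
"    preds = [int(float(line.split()[0])) for line in f_pred]\n",
"    targets = [int(line) for line in f_true]\n",
"\n",
"print(sum(p == t for p, t in zip(preds, targets)) / len(targets))"
]
},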
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# starspace"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"with open('ss_20_newsgroup_train', 'w') as f:\n",
" for target, text in zip(newsgroups_train['target'],newsgroups_train['data']):\n",
" f.write(text.replace('\\n',' ').replace(':','') + '__label__'+ str(target + 1) + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"with open('ss_20_newsgroup_test', 'w') as f:\n",
" for target, text in zip(newsgroups_test['target'],newsgroups_test['data']):\n",
" f.write(text.replace('\\n',' ').replace(':','') +'\\n')\n"
]
},
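{
"cell_type": "markdown",
"metadata": {},
"source": [
"A peek at the generated training file: fastText treats every token starting with `__label__` as a class label, regardless of where it appears in the line (here it is appended at the end)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# show the end of the first training example, where the __label__ tag was appended\n",
"with open('ss_20_newsgroup_train') as f:\n",
"    print(f.readline().strip()[-200:])"
]
},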
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Read 3M words\n",
"Number of words: 275356\n",
"Number of labels: 20\n",
"Progress: 100.0% words/sec/thread: 1103389 lr: 0.000000 avg.loss: 0.817102 ETA: 0h 0m 0s\n"
]
}
],
"source": [
"!/home/kuba/fastText/fasttext supervised -input ss_20_newsgroup_train -output ss_model -epoch 50"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"!/home/kuba/fastText/fasttext predict ss_model.bin ss_20_newsgroup_test > ss_20_newsgroup_test_pred"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"! cat ss_20_newsgroup_test_pred | sed 's|__label__||' > ss_20_newsgroup_test_pred_label_only"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.7199946893255443\r\n"
]
}
],
"source": [
"!geval --metric Accuracy -o ss_20_newsgroup_test_pred_label_only -e 20_newsgroup_test_targets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"author": "Jakub Pokrywka",
"email": "kubapok@wmi.amu.edu.pl",
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"lang": "pl",
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"subtitle": "8.Regresja logistyczna[ćwiczenia]",
"title": "Ekstrakcja informacji",
"year": "2021"
},
"nbformat": 4,
"nbformat_minor": 4
}