Merge branch 'master' of git.wmi.amu.edu.pl:filipg/aitech-eks

2021-04-27 19:00:56 +02:00 · 2021-04-27 19:00:56 +02:00 · b5d6d177af
commit b5d6d177af
parent c5864ab9ad d5a60b064e
16 changed files with 4788 additions and 7 deletions
--- a/cw/05_NDA_IE.ipynb
+++ b/cw/05_NDA_IE.ipynb
@ -210,13 +210,6 @@
    "\n",
    "Termin 5 maj 2021 (proszę w MS TEAMS podać link do repozytorium albo publicznego albo z dostępem dla kubapok i filipg na git.wmi)"
   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
  }
 ],
 "metadata": {
--- a/cw/06_klasyfikacja.ipynb
+++ b/cw/06_klasyfikacja.ipynb
@ -0,0 +1,965 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Zajęcia klasyfikacja"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Zbiór kleister"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pathlib\n",
+    "from collections import Counter\n",
+    "from sklearn.metrics import *"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "KLEISTER_PATH = pathlib.Path('/home/kuba/Syncthing/przedmioty/2020-02/IE/applica/kleister-nda')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Pytanie\n",
+    "\n",
+    "Czy jurysdykcja musi być zapisana explicite w umowie?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_expected_jurisdiction(filepath):\n",
+    "    \"\"\"Read the gold `jurisdiction` label for every line of an expected.tsv file.\n",
+    "\n",
+    "    Each line is parsed as space-separated `key=value` tokens.\n",
+    "\n",
+    "    Args:\n",
+    "        filepath: path (str or pathlib.Path) of the file to read.\n",
+    "\n",
+    "    Returns:\n",
+    "        A list with one label per line, in file order; 'NONE' when the line\n",
+    "        has no `jurisdiction=...` token (blank lines included).\n",
+    "    \"\"\"\n",
+    "    dataset_expected_jurisdiction = []\n",
+    "    with open(filepath, 'r', encoding='utf-8') as expected_file:\n",
+    "        for line in expected_file:\n",
+    "            jurisdiction = 'NONE'\n",
+    "            for key_value in line.rstrip('\\n').split(' '):\n",
+    "                # partition() never raises: a token without '=' (e.g. a blank line)\n",
+    "                # or a value containing '=' used to break the tuple unpacking.\n",
+    "                key, separator, value = key_value.partition('=')\n",
+    "                if separator and key == 'jurisdiction':\n",
+    "                    jurisdiction = value  # the last occurrence on a line wins\n",
+    "            dataset_expected_jurisdiction.append(jurisdiction)\n",
+    "    return dataset_expected_jurisdiction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_expected_jurisdiction = get_expected_jurisdiction(KLEISTER_PATH/'train'/'expected.tsv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dev_expected_jurisdiction = get_expected_jurisdiction(KLEISTER_PATH/'dev-0'/'expected.tsv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "254"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(train_expected_jurisdiction)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "'NONE' in train_expected_jurisdiction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "31"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(set(train_expected_jurisdiction))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Czy wszystkie stany muszą występować w zbiorze trenującym zbioru kleister?\n",
+    "\n",
+    "https://en.wikipedia.org/wiki/U.S._state\n",
+    "\n",
+    "### Jaki jest baseline?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_counter = Counter(train_expected_jurisdiction)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('New_York', 43),\n",
+       " ('Delaware', 39),\n",
+       " ('California', 32),\n",
+       " ('Massachusetts', 15),\n",
+       " ('Texas', 13),\n",
+       " ('Illinois', 10),\n",
+       " ('Oregon', 9),\n",
+       " ('Florida', 9),\n",
+       " ('Pennsylvania', 9),\n",
+       " ('Missouri', 9),\n",
+       " ('Ohio', 8),\n",
+       " ('New_Jersey', 7),\n",
+       " ('Georgia', 6),\n",
+       " ('Indiana', 5),\n",
+       " ('Nevada', 5),\n",
+       " ('Colorado', 4),\n",
+       " ('Virginia', 4),\n",
+       " ('Washington', 4),\n",
+       " ('Michigan', 3),\n",
+       " ('Minnesota', 3),\n",
+       " ('Connecticut', 2),\n",
+       " ('Wisconsin', 2),\n",
+       " ('Maine', 2),\n",
+       " ('North_Carolina', 2),\n",
+       " ('Kansas', 2),\n",
+       " ('Utah', 2),\n",
+       " ('Iowa', 1),\n",
+       " ('Idaho', 1),\n",
+       " ('South_Dakota', 1),\n",
+       " ('South_Carolina', 1),\n",
+       " ('Rhode_Island', 1)]"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "train_counter.most_common(100)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "most_common_answer = train_counter.most_common(100)[0][0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'New_York'"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "most_common_answer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dev_predictions_jurisdiction = [most_common_answer] * len(dev_expected_jurisdiction)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['New_York',\n",
+       " 'New_York',\n",
+       " 'Delaware',\n",
+       " 'Massachusetts',\n",
+       " 'Delaware',\n",
+       " 'Washington',\n",
+       " 'Delaware',\n",
+       " 'New_Jersey',\n",
+       " 'New_York',\n",
+       " 'NONE',\n",
+       " 'NONE',\n",
+       " 'Delaware',\n",
+       " 'Delaware',\n",
+       " 'Delaware',\n",
+       " 'New_York',\n",
+       " 'Massachusetts',\n",
+       " 'Minnesota',\n",
+       " 'California',\n",
+       " 'New_York',\n",
+       " 'California',\n",
+       " 'Iowa',\n",
+       " 'California',\n",
+       " 'Virginia',\n",
+       " 'North_Carolina',\n",
+       " 'Arizona',\n",
+       " 'Indiana',\n",
+       " 'New_Jersey',\n",
+       " 'California',\n",
+       " 'Delaware',\n",
+       " 'Georgia',\n",
+       " 'New_York',\n",
+       " 'New_York',\n",
+       " 'California',\n",
+       " 'Minnesota',\n",
+       " 'California',\n",
+       " 'Kentucky',\n",
+       " 'Minnesota',\n",
+       " 'Ohio',\n",
+       " 'Michigan',\n",
+       " 'California',\n",
+       " 'Minnesota',\n",
+       " 'California',\n",
+       " 'Delaware',\n",
+       " 'Illinois',\n",
+       " 'Minnesota',\n",
+       " 'Texas',\n",
+       " 'New_Jersey',\n",
+       " 'Delaware',\n",
+       " 'Washington',\n",
+       " 'NONE',\n",
+       " 'Delaware',\n",
+       " 'Oregon',\n",
+       " 'Delaware',\n",
+       " 'Delaware',\n",
+       " 'Delaware',\n",
+       " 'Massachusetts',\n",
+       " 'California',\n",
+       " 'NONE',\n",
+       " 'Delaware',\n",
+       " 'Illinois',\n",
+       " 'Idaho',\n",
+       " 'Washington',\n",
+       " 'New_York',\n",
+       " 'New_York',\n",
+       " 'California',\n",
+       " 'Utah',\n",
+       " 'Delaware',\n",
+       " 'Washington',\n",
+       " 'Virginia',\n",
+       " 'New_York',\n",
+       " 'New_York',\n",
+       " 'Illinois',\n",
+       " 'California',\n",
+       " 'Delaware',\n",
+       " 'NONE',\n",
+       " 'Texas',\n",
+       " 'California',\n",
+       " 'Washington',\n",
+       " 'Delaware',\n",
+       " 'Washington',\n",
+       " 'New_York',\n",
+       " 'Washington',\n",
+       " 'Illinois']"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dev_expected_jurisdiction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "accuracy:  0.14457831325301204\n"
+     ]
+    }
+   ],
+   "source": [
+    "# count the positions where the baseline prediction equals the gold label\n",
+    "counter = sum(\n",
+    "    1\n",
+    "    for pred, exp in zip(dev_predictions_jurisdiction, dev_expected_jurisdiction)\n",
+    "    if pred == exp\n",
+    ")\n",
+    "print('accuracy: ', counter/len(dev_predictions_jurisdiction))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.14457831325301204"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# scikit-learn metrics take (y_true, y_pred) in that order\n",
+    "accuracy_score(dev_expected_jurisdiction, dev_predictions_jurisdiction)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Co jeżeli nazwy klas nie występują explicite w zbiorach?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "https://git.wmi.amu.edu.pl/kubapok/paranormal-or-skeptic-ISI-public\n",
+    "    \n",
+    "https://git.wmi.amu.edu.pl/kubapok/sport-text-classification-ball-ISI-public"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```bash\n",
+    "SPORT_PATH='/home/kuba/Syncthing/przedmioty/2020-02/ISI/zajecia6_klasyfikacja/repos/sport-text-classification-ball'\n",
+    "\n",
+    "SPORT_TRAIN=$SPORT_PATH/train/train.tsv.gz\n",
+    "\n",
+    "SPORT_DEV_EXP=$SPORT_PATH/dev-0/expected.tsv\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### jaki jest baseline dla sport classification ball?\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```bash\n",
+    "zcat  $SPORT_TRAIN | awk '{print $1}'  | wc -l\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```bash\n",
+    "zcat  $SPORT_TRAIN | awk '{print $1}'  | grep 1 | wc -l\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```bash\n",
+    "cat  $SPORT_DEV_EXP | wc -l\n",
+    "\n",
+    "grep 1  $SPORT_DEV_EXP | wc -l\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Sprytne podejście do klasyfikacji tekstu? Naiwny Bayes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/kuba/anaconda3/lib/python3.8/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n",
+      "  warnings.warn(msg)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.datasets import fetch_20newsgroups\n",
+    "# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html\n",
+    "\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "import numpy as np\n",
+    "import sklearn.metrics\n",
+    "import gensim"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "newsgroups = fetch_20newsgroups()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "newsgroups_text = newsgroups['data']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "newsgroups_text_tokenized = [list(set(gensim.utils.tokenize(x, lowercase = True))) for x in newsgroups_text]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "From: lerxst@wam.umd.edu (where's my thing)\n",
+      "Subject: WHAT car is this!?\n",
+      "Nntp-Posting-Host: rac3.wam.umd.edu\n",
+      "Organization: University of Maryland, College Park\n",
+      "Lines: 15\n",
+      "\n",
+      " I was wondering if anyone out there could enlighten me on this car I saw\n",
+      "the other day. It was a 2-door sports car, looked to be from the late 60s/\n",
+      "early 70s. It was called a Bricklin. The doors were really small. In addition,\n",
+      "the front bumper was separate from the rest of the body. This is \n",
+      "all I know. If anyone can tellme a model name, engine specs, years\n",
+      "of production, where this car is made, history, or whatever info you\n",
+      "have on this funky looking car, please e-mail.\n",
+      "\n",
+      "Thanks,\n",
+      "- IL\n",
+      "   ---- brought to you by your neighborhood Lerxst ----\n",
+      "\n",
+      "\n",
+      "\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(newsgroups_text[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['where', 'name', 'looked', 'to', 'have', 'out', 'on', 'by', 'park', 'what', 'from', 'host', 'doors', 'day', 'be', 'organization', 'e', 'front', 'in', 'it', 'history', 'brought', 'know', 'addition', 'il', 'of', 'lines', 'i', 'your', 'bumper', 'there', 'please', 'me', 'separate', 'is', 'tellme', 'can', 'could', 'called', 'specs', 'college', 'this', 'thanks', 'looking', 'if', 'production', 'sports', 'lerxst', 'whatever', 'anyone', 'enlighten', 'saw', 'all', 'small', 'you', 'wam', 'mail', 'rest', 's', 'late', 'rac', 'funky', 'edu', 'info', 'the', 'wondering', 'years', 'door', 'posting', 'car', 'made', 'or', 'maryland', 'subject', 'bricklin', 'was', 'model', 'thing', 'university', 'engine', 'nntp', 'other', 'really', 'neighborhood', 'early', 'a', 'umd', 'my', 'body', 'were']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(newsgroups_text_tokenized[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Y = newsgroups['target']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([7, 4, 4, ..., 3, 1, 8])"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "Y"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Y_names = newsgroups['target_names']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['alt.atheism',\n",
+       " 'comp.graphics',\n",
+       " 'comp.os.ms-windows.misc',\n",
+       " 'comp.sys.ibm.pc.hardware',\n",
+       " 'comp.sys.mac.hardware',\n",
+       " 'comp.windows.x',\n",
+       " 'misc.forsale',\n",
+       " 'rec.autos',\n",
+       " 'rec.motorcycles',\n",
+       " 'rec.sport.baseball',\n",
+       " 'rec.sport.hockey',\n",
+       " 'sci.crypt',\n",
+       " 'sci.electronics',\n",
+       " 'sci.med',\n",
+       " 'sci.space',\n",
+       " 'soc.religion.christian',\n",
+       " 'talk.politics.guns',\n",
+       " 'talk.politics.mideast',\n",
+       " 'talk.politics.misc',\n",
+       " 'talk.religion.misc']"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "Y_names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'talk.politics.guns'"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "Y_names[16]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "$P('talk.politics.guns' | 'gun')=  ?$ \n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "$P(A|B) * P(B) = P(B|A) * P(A)$\n",
+    "\n",
+    "$P(A|B) = \\frac{P(B|A) * P(A)}{P(B)}$"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "$P('talk.politics.guns' | 'gun') * P('gun') = P('gun'|'talk.politics.guns') * P('talk.politics.guns')$\n",
+    "\n",
+    "\n",
+    "$P('talk.politics.guns' | 'gun')  = \\frac{P('gun'|'talk.politics.guns') * P('talk.politics.guns')}{P('gun')}$\n",
+    "\n",
+    "\n",
+    "$p1 = P('gun'|'talk.politics.guns')$\n",
+    "\n",
+    "\n",
+    "$p2 = P('talk.politics.guns')$\n",
+    "\n",
+    "\n",
+    "$p3 = P('gun')$"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## obliczanie $p1 = P('gun'|'talk.politics.guns')$"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# samodzielne wykonanie"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## obliczanie $p2 = P('talk.politics.guns')$\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# samodzielne wykonanie"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## obliczanie $p3 = P('gun')$"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# samodzielne wykonanie"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ostatecznie"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'p1' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-31-447f586cc09f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;34m(\u001b[0m\u001b[0mp1\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mp2\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mp3\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m: name 'p1' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "(p1 * p2) / p3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_prob(index):\n",
+    "    \"\"\"Estimate P(class | 'gun') for the newsgroup class `index` with Bayes' rule.\n",
+    "\n",
+    "    Counts come from the notebook-level `newsgroups_text_tokenized` (one token\n",
+    "    collection per document) and `Y` (one class id per document).\n",
+    "\n",
+    "    Args:\n",
+    "        index: class id compared against the entries of `Y`.\n",
+    "\n",
+    "    Returns:\n",
+    "        (p1 * p2) / p3 as a float, or 0.0 when no document belongs to the class\n",
+    "        or no document contains 'gun'.\n",
+    "    \"\"\"\n",
+    "    talks_topic = [x for x, y in zip(newsgroups_text_tokenized, Y) if y == index]\n",
+    "    if len(talks_topic) == 0:\n",
+    "        return 0.0\n",
+    "\n",
+    "    # p1 = P('gun' | class): share of the class documents that contain the word\n",
+    "    p1 = len([x for x in talks_topic if 'gun' in x]) / len(talks_topic)\n",
+    "    # p2 = P(class): share of all documents that belong to the class\n",
+    "    p2 = len(talks_topic) / len(Y)\n",
+    "    # p3 = P('gun'): share of all documents that contain the word\n",
+    "    p3 = len([x for x in newsgroups_text_tokenized if 'gun' in x]) / len(Y)\n",
+    "\n",
+    "    if p3 == 0:\n",
+    "        return 0.0\n",
+    "    return (p1 * p2) / p3\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.01622 \t\t alt.atheism\n",
+      "0.00000 \t\t comp.graphics\n",
+      "0.00541 \t\t comp.os.ms-windows.misc\n",
+      "0.01892 \t\t comp.sys.ibm.pc.hardware\n",
+      "0.00270 \t\t comp.sys.mac.hardware\n",
+      "0.00000 \t\t comp.windows.x\n",
+      "0.01351 \t\t misc.forsale\n",
+      "0.04054 \t\t rec.autos\n",
+      "0.01892 \t\t rec.motorcycles\n",
+      "0.00270 \t\t rec.sport.baseball\n",
+      "0.00541 \t\t rec.sport.hockey\n",
+      "0.03784 \t\t sci.crypt\n",
+      "0.02973 \t\t sci.electronics\n",
+      "0.00541 \t\t sci.med\n",
+      "0.01622 \t\t sci.space\n",
+      "0.00270 \t\t soc.religion.christian\n",
+      "0.68378 \t\t talk.politics.guns\n",
+      "0.04595 \t\t talk.politics.mideast\n",
+      "0.03784 \t\t talk.politics.misc\n",
+      "0.01622 \t\t talk.religion.misc\n",
+      "1.00000 \t\tsuma\n"
+     ]
+    }
+   ],
+   "source": [
+    "probs = []\n",
+    "for i in range(len(Y_names)):\n",
+    "    # get_prob() rescans the whole corpus, so evaluate it once per class\n",
+    "    prob = get_prob(i)\n",
+    "    probs.append(prob)\n",
+    "    print(\"%.5f\" % prob, '\\t\\t', Y_names[i])\n",
+    "\n",
+    "print(\"%.5f\" % sum(probs), '\\t\\tsuma')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### zadanie samodzielne"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_prob2(index, word ):\n",
+    "    pass"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# listing dla get_prob2, słowo 'god'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## założenie naiwnego bayesa"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "$P(class | word1, word2, word3)  = \\frac{P(word1, word2, word3|class) * P(class)}{P(word1, word2, word3)}$\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**przy założeniu o warunkowej niezależności zmiennych losowych $word1$, $word2$, $word3$ (pod warunkiem klasy $class$)**:\n",
+    "\n",
+    "\n",
+    "$P(word1, word2, word3|class) = P(word1|class)* P(word2|class) *  P(word3|class)$"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**ostatecznie:**\n",
+    "\n",
+    "\n",
+    "$P(class | word1, word2, word3)  = \\frac{P(word1|class)* P(word2|class) *  P(word3|class)  * P(class)}{\\sum_k{P(word1|class_k)* P(word2|class_k) *  P(word3|class_k)  * P(class_k)}}$\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## zadania domowe naiwny bayes1 ręcznie"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- analogicznie zaimplementować funkcję get_prob3(index, document_tokenized), argument document_tokenized ma być zbiorem słów dokumentu. funkcja ma być naiwnym klasyfikatorem bayesowskim (w przypadku wielu słów)\n",
+    "- odpalić powyższy listing prawdopodobieństw z funkcją get_prob3 dla dokumentów: {'i','love','guns'} oraz {'is','there','life','after','death'}\n",
+    "- zadanie proszę zrobić w jupyterze, wygenerować pdf (kod + wyniki odpalenia) i umieścić go jako zadanie w teams\n",
+    "- termin 12.05, punktów: 40\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "## zadania domowe naiwny bayes2 gotowa biblioteka"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- wybrać jedno z poniższych repozytoriów i je sforkować:\n",
+    "  - https://git.wmi.amu.edu.pl/kubapok/paranormal-or-skeptic-ISI-public\n",
+    "  - https://git.wmi.amu.edu.pl/kubapok/sport-text-classification-ball-ISI-public\n",
+    "- stworzyć klasyfikator bazujący na naiwnym Bayesie (może być gotowa biblioteka), może też korzystać z gotowych implementacji tfidf\n",
+    "- stworzyć predykcje w plikach dev-0/out.tsv oraz test-A/out.tsv\n",
+    "- wynik accuracy sprawdzony za pomocą narzędzia geval (patrz poprzednie zadanie) powinien wynosić co najmniej 0.67\n",
+    "- proszę umieścić predykcję oraz skrypty generujące (w postaci tekstowej a nie jupyter) w repo, a w MS TEAMS umieścić link do swojego repo\n",
+    "- termin 12.05, 40 punktów\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/cw/06_klasyfikacja_ODPOWIEDZI.ipynb
+++ b/cw/06_klasyfikacja_ODPOWIEDZI.ipynb
--- a/cw/07_regresja_liniowa.ipynb
+++ b/cw/07_regresja_liniowa.ipynb
--- a/cw/07_regresja_liniowa_ODPOWIEDZI.ipynb
+++ b/cw/07_regresja_liniowa_ODPOWIEDZI.ipynb
--- a/cw/obrazki/1.png
+++ b/cw/obrazki/1.png
--- a/cw/obrazki/1.svg
+++ b/cw/obrazki/1.svg
@ -0,0 +1,266 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="800mm"
+   height="800mm"
+   viewBox="0 0 800 800"
+   version="1.1"
+   id="svg16"
+   sodipodi:docname="1.svg"
+   inkscape:export-filename="/home/kuba/Syncthing/przedmioty/2020-02/ISI/zajecia7_regresja_liniowa/obrazki/6.png"
+   inkscape:export-xdpi="96"
+   inkscape:export-ydpi="96"
+   inkscape:version="0.92.5 (2060ec1f9f, 2020-04-08)">
+  <defs
+     id="defs10" />
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageopacity="0.0"
+     inkscape:pageshadow="2"
+     inkscape:zoom="0.35"
+     inkscape:cx="1485.1537"
+     inkscape:cy="1417.9979"
+     inkscape:document-units="mm"
+     inkscape:current-layer="layer1"
+     showgrid="false"
+     width="800mm"
+     inkscape:window-width="2560"
+     inkscape:window-height="1389"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="1">
+    <inkscape:grid
+       type="xygrid"
+       id="grid253" />
+  </sodipodi:namedview>
+  <metadata
+     id="metadata13">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title></dc:title>
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1"
+     transform="translate(0,503)">
+    <rect
+       id="rect18"
+       width="700.24615"
+       height="11.759859"
+       x="62.006527"
+       y="148.39815"
+       style="stroke-width:0.26458332" />
+    <rect
+       id="rect18-3"
+       width="700.24615"
+       height="11.759859"
+       x="-475.47943"
+       y="-99.864838"
+       style="stroke-width:0.26458332"
+       transform="rotate(90.042959)" />
+    <circle
+       id="path37"
+       cx="138.44562"
+       cy="-13.583364"
+       r="11.22532"
+       style="stroke-width:0.26458332" />
+    <circle
+       id="path37-9"
+       cx="298.2728"
+       cy="-3.4271142"
+       r="11.22532"
+       style="stroke-width:0.26458332" />
+    <circle
+       id="path37-7"
+       cx="293.99649"
+       cy="-161.65015"
+       r="11.22532"
+       style="stroke-width:0.26458332" />
+    <circle
+       id="path37-92"
+       cx="349.58853"
+       cy="-91.091507"
+       r="11.22532"
+       style="stroke-width:0.26458332" />
+    <circle
+       id="path37-0"
+       cx="551.64429"
+       cy="-123.16381"
+       r="11.22532"
+       style="stroke-width:0.26458332" />
+    <circle
+       id="path37-2"
+       cx="505.67395"
+       cy="-385.08951"
+       r="11.22532"
+       style="stroke-width:0.26458332" />
+    <circle
+       id="path37-3"
+       cx="709.86786"
+       cy="-417.16187"
+       r="11.22532"
+       style="stroke-width:0.26458332" />
+    <circle
+       id="path37-75"
+       cx="450.08188"
+       cy="-214.03429"
+       r="11.22532"
+       style="stroke-width:0.26458332" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:42.33333333px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332;"
+       x="655.34485"
+       y="192.23036"
+       id="text215"><tspan
+         sodipodi:role="line"
+         id="tspan213"
+         x="655.34485"
+         y="192.23036"
+         style="stroke-width:0.26458332;font-size:42.33333333px;">x</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:42.33333206px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+       x="36.73391"
+       y="-383.11801"
+       id="text215-8"><tspan
+         sodipodi:role="line"
+         id="tspan213-9"
+         x="36.73391"
+         y="-345.66293"
+         style="font-size:42.33333206px;stroke-width:0.26458332" /></text>
+    <rect
+       style="fill:#000000;stroke-width:0.26458332"
+       id="rect263"
+       width="6.8035712"
+       height="38.55357"
+       x="-218.69528"
+       y="-431.2952"
+       transform="rotate(37.42867)" />
+    <rect
+       style="fill:#000000;stroke-width:0.26458332"
+       id="rect263-7"
+       width="6.8035712"
+       height="38.55357"
+       x="-386.60941"
+       y="255.82913"
+       transform="rotate(139.04298)"
+       inkscape:transform-center-x="-20.410714"
+       inkscape:transform-center-y="6.8035653" />
+    <rect
+       style="fill:#000000;stroke-width:0.26458332"
+       id="rect263-3"
+       width="6.8035712"
+       height="38.55357"
+       x="-371.74628"
+       y="-681.80341"
+       transform="rotate(129.61772)" />
+    <rect
+       style="fill:#000000;stroke-width:0.26458332"
+       id="rect263-7-6"
+       width="6.8035712"
+       height="38.55357"
+       x="-601.17584"
+       y="456.17935"
+       transform="rotate(-128.76797)"
+       inkscape:transform-center-x="7.5782166"
+       inkscape:transform-center-y="20.135944" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:42.33333206px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+       x="48.032505"
+       y="-377.82925"
+       id="text215-1"><tspan
+         sodipodi:role="line"
+         id="tspan213-2"
+         x="48.032505"
+         y="-377.82925"
+         style="font-size:42.33333206px;stroke-width:0.26458332">y</tspan><tspan
+         sodipodi:role="line"
+         x="48.032505"
+         y="-324.9126"
+         style="font-size:42.33333206px;stroke-width:0.26458332"
+         id="tspan334" /></text>
+    <rect
+       id="rect18-9"
+       width="670.43402"
+       height="13.544262"
+       x="114.69541"
+       y="-151.7952"
+       style="fill:#ff0000;stroke-width:0.27783805"
+       transform="matrix(0.99999973,7.380958e-4,0.11550968,0.99330635,0,0)" />
+    <rect
+       style="fill:#00ff00;stroke-width:0.26458332"
+       id="rect390"
+       width="5.2916665"
+       height="134.55952"
+       x="136.07143"
+       y="-146.74403" />
+    <rect
+       style="fill:#00ff00;stroke-width:0.26458332"
+       id="rect392"
+       width="5.2916665"
+       height="20.410715"
+       x="290.28571"
+       y="-164.13097" />
+    <rect
+       style="fill:#00ff00;stroke-width:0.26458332"
+       id="rect396"
+       width="6.0476379"
+       height="143.63097"
+       x="295.57736"
+       y="-143.72026" />
+    <rect
+       style="fill:#00ff00;stroke-width:0.26458332"
+       id="rect398"
+       width="4.5357141"
+       height="55.184521"
+       x="346.98215"
+       y="-143.72023" />
+    <rect
+       style="fill:#00ff00;stroke-width:0.26458332"
+       id="rect400"
+       width="5.2916665"
+       height="73.327377"
+       x="448.27979"
+       y="-215.53571" />
+    <rect
+       style="fill:#00ff00;stroke-width:0.26458332"
+       id="rect402"
+       width="3.7797618"
+       height="243.41666"
+       x="503.46429"
+       y="-386.38095" />
+    <rect
+       style="fill:#00ff00;stroke-width:0.22913587"
+       id="rect404"
+       width="4.5357146"
+       height="27.970238"
+       x="547.30951"
+       y="-145.9881" />
+    <rect
+       style="fill:#00ff00;stroke-width:0.26458332"
+       id="rect406"
+       width="4.5357141"
+       height="276.67856"
+       x="707.57141"
+       y="-419.64285" />
+  </g>
+</svg>
--- a/cw/obrazki/10.png
+++ b/cw/obrazki/10.png
--- a/cw/obrazki/2.png
+++ b/cw/obrazki/2.png
--- a/cw/obrazki/3.png
+++ b/cw/obrazki/3.png
--- a/cw/obrazki/4.png
+++ b/cw/obrazki/4.png
--- a/cw/obrazki/5.png
+++ b/cw/obrazki/5.png
--- a/cw/obrazki/6.png
+++ b/cw/obrazki/6.png
--- a/cw/obrazki/7.png
+++ b/cw/obrazki/7.png
--- a/cw/obrazki/8.png
+++ b/cw/obrazki/8.png
--- a/cw/obrazki/9.png
+++ b/cw/obrazki/9.png