Merge branch 'master' of git.wmi.amu.edu.pl:filipg/aitech-eks

This commit is contained in:
Filip Gralinski 2021-04-27 19:00:56 +02:00
commit b5d6d177af
16 changed files with 4788 additions and 7 deletions

View File

@ -210,13 +210,6 @@
"\n", "\n",
"Termin 5 maj 2021 (proszę w MS TEAMS podać link do repozytorium albo publicznego albo z dostępem dla kubapok i filipg na git.wmi)" "Termin 5 maj 2021 (proszę w MS TEAMS podać link do repozytorium albo publicznego albo z dostępem dla kubapok i filipg na git.wmi)"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {

965
cw/06_klasyfikacja.ipynb Normal file
View File

@ -0,0 +1,965 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Zajęcia klasyfikacja"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Zbiór kleister"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pathlib\n",
"from collections import Counter\n",
"from sklearn.metrics import *"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"KLEISTER_PATH = pathlib.Path('/home/kuba/Syncthing/przedmioty/2020-02/IE/applica/kleister-nda')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pytanie\n",
"\n",
"Czy jurysdykcja musi być zapisana explicite w umowie?"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def get_expected_jurisdiction(filepath):\n",
"    \"\"\"Read the gold `jurisdiction` value from every line of an expected file.\n",
"\n",
"    Each line is a space-separated list of `key=value` pairs. Lines that carry\n",
"    no `jurisdiction` key yield the placeholder label 'NONE'. If the key occurs\n",
"    more than once on a line, the last occurrence wins.\n",
"\n",
"    Args:\n",
"        filepath: path to the expected file (anything accepted by `open`).\n",
"\n",
"    Returns:\n",
"        A list with one jurisdiction label per input line, in file order.\n",
"    \"\"\"\n",
"    dataset_expected_jurisdiction = []\n",
"    with open(filepath, 'r', encoding='utf-8') as expected_file:\n",
"        for line in expected_file:\n",
"            jurisdiction = 'NONE'\n",
"            for key_value in line.rstrip('\\n').split(' '):\n",
"                # partition() tolerates empty tokens and '=' inside a value,\n",
"                # where unpacking the result of split('=') raises ValueError.\n",
"                key, separator, value = key_value.partition('=')\n",
"                if separator and key == 'jurisdiction':\n",
"                    jurisdiction = value\n",
"            dataset_expected_jurisdiction.append(jurisdiction)\n",
"    return dataset_expected_jurisdiction"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"train_expected_jurisdiction = get_expected_jurisdiction(KLEISTER_PATH/'train'/'expected.tsv')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"dev_expected_jurisdiction = get_expected_jurisdiction(KLEISTER_PATH/'dev-0'/'expected.tsv')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"254"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(train_expected_jurisdiction)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'NONE' in train_expected_jurisdiction"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"31"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(set(train_expected_jurisdiction))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Czy wszystkie stany muszą występować w części treningowej zbioru kleister?\n",
"\n",
"https://en.wikipedia.org/wiki/U.S._state\n",
"\n",
"### Jaki jest baseline?"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"train_counter = Counter(train_expected_jurisdiction)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('New_York', 43),\n",
" ('Delaware', 39),\n",
" ('California', 32),\n",
" ('Massachusetts', 15),\n",
" ('Texas', 13),\n",
" ('Illinois', 10),\n",
" ('Oregon', 9),\n",
" ('Florida', 9),\n",
" ('Pennsylvania', 9),\n",
" ('Missouri', 9),\n",
" ('Ohio', 8),\n",
" ('New_Jersey', 7),\n",
" ('Georgia', 6),\n",
" ('Indiana', 5),\n",
" ('Nevada', 5),\n",
" ('Colorado', 4),\n",
" ('Virginia', 4),\n",
" ('Washington', 4),\n",
" ('Michigan', 3),\n",
" ('Minnesota', 3),\n",
" ('Connecticut', 2),\n",
" ('Wisconsin', 2),\n",
" ('Maine', 2),\n",
" ('North_Carolina', 2),\n",
" ('Kansas', 2),\n",
" ('Utah', 2),\n",
" ('Iowa', 1),\n",
" ('Idaho', 1),\n",
" ('South_Dakota', 1),\n",
" ('South_Carolina', 1),\n",
" ('Rhode_Island', 1)]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_counter.most_common(100)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"most_common_answer = train_counter.most_common(100)[0][0]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'New_York'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"most_common_answer"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"dev_predictions_jurisdiction = [most_common_answer] * len(dev_expected_jurisdiction)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"['New_York',\n",
" 'New_York',\n",
" 'Delaware',\n",
" 'Massachusetts',\n",
" 'Delaware',\n",
" 'Washington',\n",
" 'Delaware',\n",
" 'New_Jersey',\n",
" 'New_York',\n",
" 'NONE',\n",
" 'NONE',\n",
" 'Delaware',\n",
" 'Delaware',\n",
" 'Delaware',\n",
" 'New_York',\n",
" 'Massachusetts',\n",
" 'Minnesota',\n",
" 'California',\n",
" 'New_York',\n",
" 'California',\n",
" 'Iowa',\n",
" 'California',\n",
" 'Virginia',\n",
" 'North_Carolina',\n",
" 'Arizona',\n",
" 'Indiana',\n",
" 'New_Jersey',\n",
" 'California',\n",
" 'Delaware',\n",
" 'Georgia',\n",
" 'New_York',\n",
" 'New_York',\n",
" 'California',\n",
" 'Minnesota',\n",
" 'California',\n",
" 'Kentucky',\n",
" 'Minnesota',\n",
" 'Ohio',\n",
" 'Michigan',\n",
" 'California',\n",
" 'Minnesota',\n",
" 'California',\n",
" 'Delaware',\n",
" 'Illinois',\n",
" 'Minnesota',\n",
" 'Texas',\n",
" 'New_Jersey',\n",
" 'Delaware',\n",
" 'Washington',\n",
" 'NONE',\n",
" 'Delaware',\n",
" 'Oregon',\n",
" 'Delaware',\n",
" 'Delaware',\n",
" 'Delaware',\n",
" 'Massachusetts',\n",
" 'California',\n",
" 'NONE',\n",
" 'Delaware',\n",
" 'Illinois',\n",
" 'Idaho',\n",
" 'Washington',\n",
" 'New_York',\n",
" 'New_York',\n",
" 'California',\n",
" 'Utah',\n",
" 'Delaware',\n",
" 'Washington',\n",
" 'Virginia',\n",
" 'New_York',\n",
" 'New_York',\n",
" 'Illinois',\n",
" 'California',\n",
" 'Delaware',\n",
" 'NONE',\n",
" 'Texas',\n",
" 'California',\n",
" 'Washington',\n",
" 'Delaware',\n",
" 'Washington',\n",
" 'New_York',\n",
" 'Washington',\n",
" 'Illinois']"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dev_expected_jurisdiction"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.14457831325301204\n"
]
}
],
"source": [
"# Accuracy by hand: the share of positions where the prediction equals the expected label.\n",
"counter = sum(pred == exp for pred, exp in zip(dev_predictions_jurisdiction, dev_expected_jurisdiction))\n",
"print('accuracy: ', counter/len(dev_predictions_jurisdiction))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.14457831325301204"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# sklearn metrics take (y_true, y_pred); the order matters for asymmetric metrics.\n",
"accuracy_score(dev_expected_jurisdiction, dev_predictions_jurisdiction)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Co jeżeli nazwy klas nie występują explicite w zbiorach?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://git.wmi.amu.edu.pl/kubapok/paranormal-or-skeptic-ISI-public\n",
" \n",
"https://git.wmi.amu.edu.pl/kubapok/sport-text-classification-ball-ISI-public"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```bash\n",
"SPORT_PATH='/home/kuba/Syncthing/przedmioty/2020-02/ISI/zajecia6_klasyfikacja/repos/sport-text-classification-ball'\n",
"\n",
"SPORT_TRAIN=$SPORT_PATH/train/train.tsv.gz\n",
"\n",
"SPORT_DEV_EXP=$SPORT_PATH/dev-0/expected.tsv\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### jaki jest baseline dla sport classification ball?\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```bash\n",
"zcat $SPORT_TRAIN | awk '{print $1}' | wc -l\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```bash\n",
"zcat $SPORT_TRAIN | awk '{print $1}' | grep 1 | wc -l\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```bash\n",
"cat $SPORT_DEV_EXP | wc -l\n",
"\n",
"grep 1 $SPORT_DEV_EXP | wc -l\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sprytne podejście do klasyfikacji tekstu? Naiwny Bayes"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/kuba/anaconda3/lib/python3.8/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n",
" warnings.warn(msg)\n"
]
}
],
"source": [
"from sklearn.datasets import fetch_20newsgroups\n",
"# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html\n",
"\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"import numpy as np\n",
"import sklearn.metrics\n",
"import gensim"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"newsgroups = fetch_20newsgroups()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"newsgroups_text = newsgroups['data']"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"newsgroups_text_tokenized = [list(set(gensim.utils.tokenize(x, lowercase = True))) for x in newsgroups_text]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"From: lerxst@wam.umd.edu (where's my thing)\n",
"Subject: WHAT car is this!?\n",
"Nntp-Posting-Host: rac3.wam.umd.edu\n",
"Organization: University of Maryland, College Park\n",
"Lines: 15\n",
"\n",
" I was wondering if anyone out there could enlighten me on this car I saw\n",
"the other day. It was a 2-door sports car, looked to be from the late 60s/\n",
"early 70s. It was called a Bricklin. The doors were really small. In addition,\n",
"the front bumper was separate from the rest of the body. This is \n",
"all I know. If anyone can tellme a model name, engine specs, years\n",
"of production, where this car is made, history, or whatever info you\n",
"have on this funky looking car, please e-mail.\n",
"\n",
"Thanks,\n",
"- IL\n",
" ---- brought to you by your neighborhood Lerxst ----\n",
"\n",
"\n",
"\n",
"\n",
"\n"
]
}
],
"source": [
"print(newsgroups_text[0])"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['where', 'name', 'looked', 'to', 'have', 'out', 'on', 'by', 'park', 'what', 'from', 'host', 'doors', 'day', 'be', 'organization', 'e', 'front', 'in', 'it', 'history', 'brought', 'know', 'addition', 'il', 'of', 'lines', 'i', 'your', 'bumper', 'there', 'please', 'me', 'separate', 'is', 'tellme', 'can', 'could', 'called', 'specs', 'college', 'this', 'thanks', 'looking', 'if', 'production', 'sports', 'lerxst', 'whatever', 'anyone', 'enlighten', 'saw', 'all', 'small', 'you', 'wam', 'mail', 'rest', 's', 'late', 'rac', 'funky', 'edu', 'info', 'the', 'wondering', 'years', 'door', 'posting', 'car', 'made', 'or', 'maryland', 'subject', 'bricklin', 'was', 'model', 'thing', 'university', 'engine', 'nntp', 'other', 'really', 'neighborhood', 'early', 'a', 'umd', 'my', 'body', 'were']\n"
]
}
],
"source": [
"print(newsgroups_text_tokenized[0])"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"Y = newsgroups['target']"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([7, 4, 4, ..., 3, 1, 8])"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Y"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"Y_names = newsgroups['target_names']"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['alt.atheism',\n",
" 'comp.graphics',\n",
" 'comp.os.ms-windows.misc',\n",
" 'comp.sys.ibm.pc.hardware',\n",
" 'comp.sys.mac.hardware',\n",
" 'comp.windows.x',\n",
" 'misc.forsale',\n",
" 'rec.autos',\n",
" 'rec.motorcycles',\n",
" 'rec.sport.baseball',\n",
" 'rec.sport.hockey',\n",
" 'sci.crypt',\n",
" 'sci.electronics',\n",
" 'sci.med',\n",
" 'sci.space',\n",
" 'soc.religion.christian',\n",
" 'talk.politics.guns',\n",
" 'talk.politics.mideast',\n",
" 'talk.politics.misc',\n",
" 'talk.religion.misc']"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Y_names"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'talk.politics.guns'"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Y_names[16]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$P('talk.politics.guns' | 'gun')= ?$ \n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"$P(A|B) * P(B) = P(B|A) * P(A)$\n",
"\n",
"$P(A|B) = \\frac{P(B|A) * P(A)}{P(B)}$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$P('talk.politics.guns' | 'gun') * P('gun') = P('gun'|'talk.politics.guns') * P('talk.politics.guns')$\n",
"\n",
"\n",
"$P('talk.politics.guns' | 'gun') = \\frac{P('gun'|'talk.politics.guns') * P('talk.politics.guns')}{P('gun')}$\n",
"\n",
"\n",
"$p1 = P('gun'|'talk.politics.guns')$\n",
"\n",
"\n",
"$p2 = P('talk.politics.guns')$\n",
"\n",
"\n",
"$p3 = P('gun')$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## obliczanie $p1 = P('gun'|'talk.politics.guns')$"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# samodzielne wykonanie"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## obliczanie $p2 = P('talk.politics.guns')$\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# samodzielne wykonanie"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## obliczanie $p3 = P('gun')$"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# samodzielne wykonanie"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ostatecznie"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'p1' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-31-447f586cc09f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;34m(\u001b[0m\u001b[0mp1\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mp2\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mp3\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'p1' is not defined"
]
}
],
"source": [
"(p1 * p2) / p3"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"def get_prob(index):\n",
"    \"\"\"Estimate P(class == index | document contains 'gun') with Bayes' rule.\n",
"\n",
"    Reads the notebook-level `newsgroups_text_tokenized` (one token collection\n",
"    per document) and `Y` (one class label per document).\n",
"\n",
"    Args:\n",
"        index: class label to score, compared against the entries of `Y`.\n",
"\n",
"    Returns:\n",
"        (p1 * p2) / p3 as a float, or 0.0 when the class has no documents or\n",
"        when no document at all contains 'gun'.\n",
"    \"\"\"\n",
"    talks_topic = [x for x, y in zip(newsgroups_text_tokenized, Y) if y == index]\n",
"    if len(talks_topic) == 0:\n",
"        return 0.0\n",
"\n",
"    # p1 = P('gun' | class): share of the class's documents containing the word\n",
"    p1 = len([x for x in talks_topic if 'gun' in x]) / len(talks_topic)\n",
"    # p2 = P(class): share of all documents that belong to the class\n",
"    p2 = len(talks_topic) / len(Y)\n",
"    # p3 = P('gun'): share of all documents containing the word\n",
"    p3 = len([x for x in newsgroups_text_tokenized if 'gun' in x]) / len(Y)\n",
"\n",
"    if p3 == 0:\n",
"        return 0.0\n",
"    return (p1 * p2) / p3\n"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.01622 \t\t alt.atheism\n",
"0.00000 \t\t comp.graphics\n",
"0.00541 \t\t comp.os.ms-windows.misc\n",
"0.01892 \t\t comp.sys.ibm.pc.hardware\n",
"0.00270 \t\t comp.sys.mac.hardware\n",
"0.00000 \t\t comp.windows.x\n",
"0.01351 \t\t misc.forsale\n",
"0.04054 \t\t rec.autos\n",
"0.01892 \t\t rec.motorcycles\n",
"0.00270 \t\t rec.sport.baseball\n",
"0.00541 \t\t rec.sport.hockey\n",
"0.03784 \t\t sci.crypt\n",
"0.02973 \t\t sci.electronics\n",
"0.00541 \t\t sci.med\n",
"0.01622 \t\t sci.space\n",
"0.00270 \t\t soc.religion.christian\n",
"0.68378 \t\t talk.politics.guns\n",
"0.04595 \t\t talk.politics.mideast\n",
"0.03784 \t\t talk.politics.misc\n",
"0.01622 \t\t talk.religion.misc\n",
"1.00000 \t\tsuma\n"
]
}
],
"source": [
"probs = []\n",
"for i in range(len(Y_names)):\n",
"    # get_prob scans the whole corpus, so evaluate it once per class and reuse the value\n",
"    prob = get_prob(i)\n",
"    probs.append(prob)\n",
"    print(\"%.5f\" % prob,'\\t\\t', Y_names[i])\n",
"\n",
"print(\"%.5f\" % sum(probs), '\\t\\tsuma',)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### zadanie samodzielne"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"def get_prob2(index, word ):\n",
" pass"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"# listing dla get_prob2, słowo 'god'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## założenie naiwnego bayesa"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$P(class | word1, word2, word3) = \\frac{P(word1, word2, word3|class) * P(class)}{P(word1, word2, word3)}$\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**przy założeniu o warunkowej niezależności zmiennych losowych $word1$, $word2$, $word3$ (pod warunkiem klasy $class$)**:\n",
"\n",
"\n",
"$P(word1, word2, word3|class) = P(word1|class)* P(word2|class) * P(word3|class)$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**ostatecznie:**\n",
"\n",
"\n",
"$P(class | word1, word2, word3) = \\frac{P(word1|class)* P(word2|class) * P(word3|class) * P(class)}{\\sum_k{P(word1|class_k)* P(word2|class_k) * P(word3|class_k) * P(class_k)}}$\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## zadania domowe naiwny bayes1 ręcznie"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- analogicznie zaimplementować funkcję get_prob3(index, document_tokenized), argument document_tokenized ma być zbiorem słów dokumentu. funkcja ma być naiwnym klasyfikatorem bayesowskim (w przypadku wielu słów)\n",
"- odpalić powyższy listing prawdopodobieństw z funkcją get_prob3 dla dokumentów: {'i','love','guns'} oraz {'is','there','life','after','death'}\n",
"- zadanie proszę zrobić w jupyterze, wygenerować pdf (kod + wyniki odpalenia) i umieścić go jako zadanie w teams\n",
"- termin 12.05, punktów: 40\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"## zadania domowe naiwny bayes2 gotowa biblioteka"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- wybrać jedno z poniższych repozytoriów i je sforkować:\n",
" - https://git.wmi.amu.edu.pl/kubapok/paranormal-or-skeptic-ISI-public\n",
" - https://git.wmi.amu.edu.pl/kubapok/sport-text-classification-ball-ISI-public\n",
"- stworzyć klasyfikator bazujący na naiwnym Bayesie (może być gotowa biblioteka), może też korzystać z gotowych implementacji tfidf\n",
"- stworzyć predykcje w plikach dev-0/out.tsv oraz test-A/out.tsv\n",
"- wynik accuracy sprawdzony za pomocą narzędzia geval (patrz poprzednie zadanie) powinien wynosić co najmniej 0.67\n",
"- proszę umieścić predykcję oraz skrypty generujące (w postaci tekstowej a nie jupyter) w repo, a w MS TEAMS umieścić link do swojego repo\n",
"- termin 12.05, punktów: 40\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

File diff suppressed because it is too large Load Diff

1069
cw/07_regresja_liniowa.ipynb Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

BIN
cw/obrazki/1.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 86 KiB

266
cw/obrazki/1.svg Normal file
View File

@ -0,0 +1,266 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
width="800mm"
height="800mm"
viewBox="0 0 800 800"
version="1.1"
id="svg16"
sodipodi:docname="1.svg"
inkscape:export-filename="/home/kuba/Syncthing/przedmioty/2020-02/ISI/zajecia7_regresja_liniowa/obrazki/6.png"
inkscape:export-xdpi="96"
inkscape:export-ydpi="96"
inkscape:version="0.92.5 (2060ec1f9f, 2020-04-08)">
<defs
id="defs10" />
<sodipodi:namedview
id="base"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:pageopacity="0.0"
inkscape:pageshadow="2"
inkscape:zoom="0.35"
inkscape:cx="1485.1537"
inkscape:cy="1417.9979"
inkscape:document-units="mm"
inkscape:current-layer="layer1"
showgrid="false"
width="800mm"
inkscape:window-width="2560"
inkscape:window-height="1389"
inkscape:window-x="0"
inkscape:window-y="0"
inkscape:window-maximized="1">
<inkscape:grid
type="xygrid"
id="grid253" />
</sodipodi:namedview>
<metadata
id="metadata13">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title></dc:title>
</cc:Work>
</rdf:RDF>
</metadata>
<g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1"
transform="translate(0,503)">
<rect
id="rect18"
width="700.24615"
height="11.759859"
x="62.006527"
y="148.39815"
style="stroke-width:0.26458332" />
<rect
id="rect18-3"
width="700.24615"
height="11.759859"
x="-475.47943"
y="-99.864838"
style="stroke-width:0.26458332"
transform="rotate(90.042959)" />
<circle
id="path37"
cx="138.44562"
cy="-13.583364"
r="11.22532"
style="stroke-width:0.26458332" />
<circle
id="path37-9"
cx="298.2728"
cy="-3.4271142"
r="11.22532"
style="stroke-width:0.26458332" />
<circle
id="path37-7"
cx="293.99649"
cy="-161.65015"
r="11.22532"
style="stroke-width:0.26458332" />
<circle
id="path37-92"
cx="349.58853"
cy="-91.091507"
r="11.22532"
style="stroke-width:0.26458332" />
<circle
id="path37-0"
cx="551.64429"
cy="-123.16381"
r="11.22532"
style="stroke-width:0.26458332" />
<circle
id="path37-2"
cx="505.67395"
cy="-385.08951"
r="11.22532"
style="stroke-width:0.26458332" />
<circle
id="path37-3"
cx="709.86786"
cy="-417.16187"
r="11.22532"
style="stroke-width:0.26458332" />
<circle
id="path37-75"
cx="450.08188"
cy="-214.03429"
r="11.22532"
style="stroke-width:0.26458332" />
<text
xml:space="preserve"
style="font-style:normal;font-weight:normal;font-size:42.33333333px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332;"
x="655.34485"
y="192.23036"
id="text215"><tspan
sodipodi:role="line"
id="tspan213"
x="655.34485"
y="192.23036"
style="stroke-width:0.26458332;font-size:42.33333333px;">x</tspan></text>
<text
xml:space="preserve"
style="font-style:normal;font-weight:normal;font-size:42.33333206px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
x="36.73391"
y="-383.11801"
id="text215-8"><tspan
sodipodi:role="line"
id="tspan213-9"
x="36.73391"
y="-345.66293"
style="font-size:42.33333206px;stroke-width:0.26458332" /></text>
<rect
style="fill:#000000;stroke-width:0.26458332"
id="rect263"
width="6.8035712"
height="38.55357"
x="-218.69528"
y="-431.2952"
transform="rotate(37.42867)" />
<rect
style="fill:#000000;stroke-width:0.26458332"
id="rect263-7"
width="6.8035712"
height="38.55357"
x="-386.60941"
y="255.82913"
transform="rotate(139.04298)"
inkscape:transform-center-x="-20.410714"
inkscape:transform-center-y="6.8035653" />
<rect
style="fill:#000000;stroke-width:0.26458332"
id="rect263-3"
width="6.8035712"
height="38.55357"
x="-371.74628"
y="-681.80341"
transform="rotate(129.61772)" />
<rect
style="fill:#000000;stroke-width:0.26458332"
id="rect263-7-6"
width="6.8035712"
height="38.55357"
x="-601.17584"
y="456.17935"
transform="rotate(-128.76797)"
inkscape:transform-center-x="7.5782166"
inkscape:transform-center-y="20.135944" />
<text
xml:space="preserve"
style="font-style:normal;font-weight:normal;font-size:42.33333206px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
x="48.032505"
y="-377.82925"
id="text215-1"><tspan
sodipodi:role="line"
id="tspan213-2"
x="48.032505"
y="-377.82925"
style="font-size:42.33333206px;stroke-width:0.26458332">y</tspan><tspan
sodipodi:role="line"
x="48.032505"
y="-324.9126"
style="font-size:42.33333206px;stroke-width:0.26458332"
id="tspan334" /></text>
<rect
id="rect18-9"
width="670.43402"
height="13.544262"
x="114.69541"
y="-151.7952"
style="fill:#ff0000;stroke-width:0.27783805"
transform="matrix(0.99999973,7.380958e-4,0.11550968,0.99330635,0,0)" />
<rect
style="fill:#00ff00;stroke-width:0.26458332"
id="rect390"
width="5.2916665"
height="134.55952"
x="136.07143"
y="-146.74403" />
<rect
style="fill:#00ff00;stroke-width:0.26458332"
id="rect392"
width="5.2916665"
height="20.410715"
x="290.28571"
y="-164.13097" />
<rect
style="fill:#00ff00;stroke-width:0.26458332"
id="rect396"
width="6.0476379"
height="143.63097"
x="295.57736"
y="-143.72026" />
<rect
style="fill:#00ff00;stroke-width:0.26458332"
id="rect398"
width="4.5357141"
height="55.184521"
x="346.98215"
y="-143.72023" />
<rect
style="fill:#00ff00;stroke-width:0.26458332"
id="rect400"
width="5.2916665"
height="73.327377"
x="448.27979"
y="-215.53571" />
<rect
style="fill:#00ff00;stroke-width:0.26458332"
id="rect402"
width="3.7797618"
height="243.41666"
x="503.46429"
y="-386.38095" />
<rect
style="fill:#00ff00;stroke-width:0.22913587"
id="rect404"
width="4.5357146"
height="27.970238"
x="547.30951"
y="-145.9881" />
<rect
style="fill:#00ff00;stroke-width:0.26458332"
id="rect406"
width="4.5357141"
height="276.67856"
x="707.57141"
y="-419.64285" />
</g>
</svg>

After

Width:  |  Height:  |  Size: 7.6 KiB

BIN
cw/obrazki/10.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 129 KiB

BIN
cw/obrazki/2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 92 KiB

BIN
cw/obrazki/3.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 68 KiB

BIN
cw/obrazki/4.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 102 KiB

BIN
cw/obrazki/5.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 96 KiB

BIN
cw/obrazki/6.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 69 KiB

BIN
cw/obrazki/7.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 132 KiB

BIN
cw/obrazki/8.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 137 KiB

BIN
cw/obrazki/9.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 86 KiB