final commit

This commit is contained in:
Piotr Biskup 2021-09-20 22:24:34 +02:00
commit 3f5fac6712
2 changed files with 13467 additions and 0 deletions

13290
bayes_1 .html Normal file

File diff suppressed because it is too large Load Diff

177
bayes_1.ipynb Normal file
View File

@ -0,0 +1,177 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import fetch_20newsgroups\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"import numpy as np\n",
"import sklearn.metrics\n",
"import gensim"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the corpus (fetch_20newsgroups called with its default arguments) and\n",
"# reduce every post to its distinct lowercase tokens; downstream code only\n",
"# asks whether a word is present in a post, never how often.\n",
"newsgroups = fetch_20newsgroups()\n",
"newsgroups_text = newsgroups['data']\n",
"# Kept as sets rather than lists so each `word in doc` test is a hash lookup\n",
"# instead of a linear scan over the post's tokens.\n",
"newsgroups_text_tokenized = [\n",
"    set(gensim.utils.tokenize(text, lowercase=True))\n",
"    for text in newsgroups_text\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Per-document class labels (zipped with the tokenized posts later on) and\n",
"# the list of newsgroup names that those labels index into.\n",
"Y, Y_names = newsgroups['target'], newsgroups['target_names']"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"def get_prob3(index, document_tokenized):\n",
"    \"\"\"Unnormalised naive Bayes score of a tokenized document for one class.\n",
"\n",
"    Computes P(class) * prod_w P(w | class), where P(w | class) is the\n",
"    fraction of training documents labelled `index` that contain word `w`.\n",
"\n",
"    Args:\n",
"        index: class label, compared against the entries of the global `Y`.\n",
"        document_tokenized: iterable of words to score.\n",
"\n",
"    Returns:\n",
"        0.0 when no training document carries the label `index`; otherwise\n",
"        the class prior multiplied by the product of the per-word document\n",
"        frequencies. No smoothing is applied, so a word that appears in no\n",
"        document of the class makes the whole score 0.\n",
"    \"\"\"\n",
"    class_docs = [doc for doc, label in zip(newsgroups_text_tokenized, Y) if label == index]\n",
"    n_class_docs = len(class_docs)\n",
"    if not class_docs:\n",
"        return 0.0\n",
"\n",
"    # Likelihood factors: share of the class's documents containing each word.\n",
"    word_freqs = [\n",
"        sum(1 for doc in class_docs if word in doc) / n_class_docs\n",
"        for word in document_tokenized\n",
"    ]\n",
"    likelihood = np.prod(word_freqs)\n",
"    prior = n_class_docs / len(Y)\n",
"\n",
"    # The denominator of Bayes' rule is the same for every class, so it is\n",
"    # left out without affecting how the classes rank against each other.\n",
"    return likelihood * prior"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def print_results(list_of_words):\n",
"    \"\"\"Print the get_prob3 score of `list_of_words` for every newsgroup.\n",
"\n",
"    One line is written per entry of the global `Y_names`: the score\n",
"    formatted to five decimal places, then the newsgroup name.\n",
"\n",
"    Args:\n",
"        list_of_words: tokenized document, forwarded unchanged to get_prob3.\n",
"    \"\"\"\n",
"    # The position of a name in Y_names is the class label get_prob3 expects.\n",
"    for class_index, class_name in enumerate(Y_names):\n",
"        score = get_prob3(class_index, list_of_words)\n",
"        print(\"%.5f\" % score,'\\t\\t', class_name)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.00001 \t\t alt.atheism\n",
"0.00000 \t\t comp.graphics\n",
"0.00000 \t\t comp.os.ms-windows.misc\n",
"0.00000 \t\t comp.sys.ibm.pc.hardware\n",
"0.00000 \t\t comp.sys.mac.hardware\n",
"0.00000 \t\t comp.windows.x\n",
"0.00000 \t\t misc.forsale\n",
"0.00000 \t\t rec.autos\n",
"0.00002 \t\t rec.motorcycles\n",
"0.00000 \t\t rec.sport.baseball\n",
"0.00001 \t\t rec.sport.hockey\n",
"0.00001 \t\t sci.crypt\n",
"0.00000 \t\t sci.electronics\n",
"0.00000 \t\t sci.med\n",
"0.00000 \t\t sci.space\n",
"0.00000 \t\t soc.religion.christian\n",
"0.00087 \t\t talk.politics.guns\n",
"0.00003 \t\t talk.politics.mideast\n",
"0.00005 \t\t talk.politics.misc\n",
"0.00006 \t\t talk.religion.misc\n"
]
}
],
"source": [
"# Example query: score a short gun-related phrase against every newsgroup.\n",
"print_results(['i', 'love', 'guns'])"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.00004 \t\t alt.atheism\n",
"0.00000 \t\t comp.graphics\n",
"0.00000 \t\t comp.os.ms-windows.misc\n",
"0.00000 \t\t comp.sys.ibm.pc.hardware\n",
"0.00000 \t\t comp.sys.mac.hardware\n",
"0.00000 \t\t comp.windows.x\n",
"0.00000 \t\t misc.forsale\n",
"0.00000 \t\t rec.autos\n",
"0.00000 \t\t rec.motorcycles\n",
"0.00000 \t\t rec.sport.baseball\n",
"0.00000 \t\t rec.sport.hockey\n",
"0.00000 \t\t sci.crypt\n",
"0.00000 \t\t sci.electronics\n",
"0.00000 \t\t sci.med\n",
"0.00000 \t\t sci.space\n",
"0.00012 \t\t soc.religion.christian\n",
"0.00004 \t\t talk.politics.guns\n",
"0.00007 \t\t talk.politics.mideast\n",
"0.00003 \t\t talk.politics.misc\n",
"0.00008 \t\t talk.religion.misc\n"
]
}
],
"source": [
"# Example query: score a short question about life after death against every newsgroup.\n",
"print_results(['is', 'there', 'life', 'after', 'death'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}