final commit
This commit is contained in:
commit
3f5fac6712
13290
bayes_1 .html
Normal file
13290
bayes_1 .html
Normal file
File diff suppressed because it is too large
Load Diff
177
bayes_1.ipynb
Normal file
177
bayes_1.ipynb
Normal file
@ -0,0 +1,177 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.datasets import fetch_20newsgroups\n",
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"import numpy as np\n",
|
||||
"import sklearn.metrics\n",
|
||||
"import gensim"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"newsgroups = fetch_20newsgroups()\n",
|
||||
"newsgroups_text = newsgroups['data']\n",
|
||||
"newsgroups_text_tokenized = [list(set(gensim.utils.tokenize(x, lowercase = True))) for x in newsgroups_text]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Y = newsgroups['target']\n",
|
||||
"Y_names = newsgroups['target_names']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_prob3(index, document_tokenized):\n",
|
||||
" talks_topic = [x for x,y in zip(newsgroups_text_tokenized,Y) if y == index]\n",
|
||||
"\n",
|
||||
" if len(talks_topic) == 0:\n",
|
||||
" return 0.0\n",
|
||||
" \n",
|
||||
" p1_list = []\n",
|
||||
" for word in document_tokenized:\n",
|
||||
" to_p1 = len([x for x in talks_topic if word in x]) / len(talks_topic)\n",
|
||||
" p1_list.append(to_p1)\n",
|
||||
" \n",
|
||||
" p1 = np.prod(p1_list)\n",
|
||||
" \n",
|
||||
" p2 = len(talks_topic) / len(Y)\n",
|
||||
" \n",
|
||||
" return (p1 * p2) # / p3 --- mianownik dla wszystkich klas będzie taki sam dlatego można go pominąć, bez \n",
|
||||
" # bez wpływu na działanie klasyfikatora"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def print_results(list_of_words):\n",
|
||||
" probs = []\n",
|
||||
" for i in range(len(Y_names)):\n",
|
||||
" p = get_prob3(i, list_of_words)\n",
|
||||
" probs.append(p)\n",
|
||||
" print(\"%.5f\" % p,'\\t\\t', Y_names[i])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.00001 \t\t alt.atheism\n",
|
||||
"0.00000 \t\t comp.graphics\n",
|
||||
"0.00000 \t\t comp.os.ms-windows.misc\n",
|
||||
"0.00000 \t\t comp.sys.ibm.pc.hardware\n",
|
||||
"0.00000 \t\t comp.sys.mac.hardware\n",
|
||||
"0.00000 \t\t comp.windows.x\n",
|
||||
"0.00000 \t\t misc.forsale\n",
|
||||
"0.00000 \t\t rec.autos\n",
|
||||
"0.00002 \t\t rec.motorcycles\n",
|
||||
"0.00000 \t\t rec.sport.baseball\n",
|
||||
"0.00001 \t\t rec.sport.hockey\n",
|
||||
"0.00001 \t\t sci.crypt\n",
|
||||
"0.00000 \t\t sci.electronics\n",
|
||||
"0.00000 \t\t sci.med\n",
|
||||
"0.00000 \t\t sci.space\n",
|
||||
"0.00000 \t\t soc.religion.christian\n",
|
||||
"0.00087 \t\t talk.politics.guns\n",
|
||||
"0.00003 \t\t talk.politics.mideast\n",
|
||||
"0.00005 \t\t talk.politics.misc\n",
|
||||
"0.00006 \t\t talk.religion.misc\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print_results(['i','love','guns'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.00004 \t\t alt.atheism\n",
|
||||
"0.00000 \t\t comp.graphics\n",
|
||||
"0.00000 \t\t comp.os.ms-windows.misc\n",
|
||||
"0.00000 \t\t comp.sys.ibm.pc.hardware\n",
|
||||
"0.00000 \t\t comp.sys.mac.hardware\n",
|
||||
"0.00000 \t\t comp.windows.x\n",
|
||||
"0.00000 \t\t misc.forsale\n",
|
||||
"0.00000 \t\t rec.autos\n",
|
||||
"0.00000 \t\t rec.motorcycles\n",
|
||||
"0.00000 \t\t rec.sport.baseball\n",
|
||||
"0.00000 \t\t rec.sport.hockey\n",
|
||||
"0.00000 \t\t sci.crypt\n",
|
||||
"0.00000 \t\t sci.electronics\n",
|
||||
"0.00000 \t\t sci.med\n",
|
||||
"0.00000 \t\t sci.space\n",
|
||||
"0.00012 \t\t soc.religion.christian\n",
|
||||
"0.00004 \t\t talk.politics.guns\n",
|
||||
"0.00007 \t\t talk.politics.mideast\n",
|
||||
"0.00003 \t\t talk.politics.misc\n",
|
||||
"0.00008 \t\t talk.religion.misc\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print_results(['is','there','life','after','death'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
Loading…
Reference in New Issue
Block a user