commit 3f5fac6712ac9a2b464aa4e7679f5609bf810141 Author: Piotr Biskup Date: Mon Sep 20 22:24:34 2021 +0200 final commit diff --git a/bayes_1 .html b/bayes_1 .html new file mode 100644 index 0000000..7673c34 --- /dev/null +++ b/bayes_1 .html @@ -0,0 +1,13290 @@ + + + + +bayes_1 + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+
In [1]:
+
+
+
from sklearn.datasets import fetch_20newsgroups
+from sklearn.feature_extraction.text import TfidfVectorizer
+import numpy as np
+import sklearn.metrics
+import gensim
+
+ +
+
+
+ +
+
+
+
In [2]:
+
+
+
# Load the 20 Newsgroups corpus through scikit-learn's dataset helper
# (called with its default arguments).
newsgroups = fetch_20newsgroups()
newsgroups_text = newsgroups['data']

# Keep one *set* of distinct lower-cased tokens per post.  The classifier only
# ever asks "does this post contain word w?", so a set answers that in O(1),
# whereas converting back to a list would make every lookup a linear scan.
newsgroups_text_tokenized = [
    set(gensim.utils.tokenize(text, lowercase=True))
    for text in newsgroups_text
]
+
+ +
+
+
+ +
+
+
+
In [3]:
+
+
+
# Y: class index of every post; Y_names: newsgroup names, looked up by that
# same class index elsewhere in this notebook.
Y, Y_names = newsgroups['target'], newsgroups['target_names']
+
+ +
+
+
+ +
+
+
+
In [11]:
+
+
+
def get_prob3(index, document_tokenized, alpha=0.0):
    """Return the unnormalised naive-Bayes score of one class for a document.

    The score is ``P(class) * prod_w P(w | class)``, where ``P(w | class)`` is
    the fraction of training posts of that class that contain the word ``w``.
    The denominator of Bayes' rule, P(document), is identical for every class,
    so it is omitted; this does not change which class scores highest.

    Reads the module-level ``newsgroups_text_tokenized`` (one token collection
    per training post) and ``Y`` (class index of each post).

    Args:
        index: Class index to score; compared against the entries of ``Y``.
        document_tokenized: Iterable of tokens making up the query document.
        alpha: Additive smoothing constant applied to each word estimate as
            ``(count + alpha) / (n_docs + 2 * alpha)``.  The default of 0.0
            reproduces the unsmoothed estimate, in which a single word that
            never occurs in the class drives the whole score to zero.

    Returns:
        The score as a float; 0.0 when no training post has class ``index``.
    """
    class_docs = [
        doc for doc, label in zip(newsgroups_text_tokenized, Y)
        if label == index
    ]
    if not class_docs:
        return 0.0

    n_docs = len(class_docs)

    # Per-word likelihood: share of this class's posts that contain the word.
    # Summing the boolean membership tests counts the matching posts without
    # building a temporary list for every word.
    likelihoods = [
        (sum(word in doc for doc in class_docs) + alpha) / (n_docs + 2 * alpha)
        for word in document_tokenized
    ]

    # Class prior: share of all training posts that belong to this class.
    prior = n_docs / len(Y)

    # np.prod of an empty list is 1.0, so an empty document scores the prior.
    return np.prod(likelihoods) * prior
+
+ +
+
+
+ +
+
+
+
In [15]:
+
+
+
def print_results(list_of_words):
    """Print the naive-Bayes score of every newsgroup for the given tokens.

    Writes one line per entry of the module-level ``Y_names``, in order: the
    score from ``get_prob3`` formatted to five decimal places, then the
    newsgroup name.

    Args:
        list_of_words: Tokens of the query document, passed unchanged to
            ``get_prob3`` for each class index.

    Returns:
        None.  The scores are only printed.
    """
    for class_index, class_name in enumerate(Y_names):
        score = get_prob3(class_index, list_of_words)
        print(f"{score:.5f}", '\t\t', class_name)
+
+ +
+
+
+ +
+
+
+
In [17]:
+
+
+
# Demo query: print the score of each newsgroup for these three tokens.
print_results(['i','love','guns'])
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
0.00001 		 alt.atheism
+0.00000 		 comp.graphics
+0.00000 		 comp.os.ms-windows.misc
+0.00000 		 comp.sys.ibm.pc.hardware
+0.00000 		 comp.sys.mac.hardware
+0.00000 		 comp.windows.x
+0.00000 		 misc.forsale
+0.00000 		 rec.autos
+0.00002 		 rec.motorcycles
+0.00000 		 rec.sport.baseball
+0.00001 		 rec.sport.hockey
+0.00001 		 sci.crypt
+0.00000 		 sci.electronics
+0.00000 		 sci.med
+0.00000 		 sci.space
+0.00000 		 soc.religion.christian
+0.00087 		 talk.politics.guns
+0.00003 		 talk.politics.mideast
+0.00005 		 talk.politics.misc
+0.00006 		 talk.religion.misc
+
+
+
+ +
+
+ +
+
+
+
In [19]:
+
+
+
# Demo query: print the score of each newsgroup for these five tokens.
print_results(['is','there','life','after','death'])
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
0.00004 		 alt.atheism
+0.00000 		 comp.graphics
+0.00000 		 comp.os.ms-windows.misc
+0.00000 		 comp.sys.ibm.pc.hardware
+0.00000 		 comp.sys.mac.hardware
+0.00000 		 comp.windows.x
+0.00000 		 misc.forsale
+0.00000 		 rec.autos
+0.00000 		 rec.motorcycles
+0.00000 		 rec.sport.baseball
+0.00000 		 rec.sport.hockey
+0.00000 		 sci.crypt
+0.00000 		 sci.electronics
+0.00000 		 sci.med
+0.00000 		 sci.space
+0.00012 		 soc.religion.christian
+0.00004 		 talk.politics.guns
+0.00007 		 talk.politics.mideast
+0.00003 		 talk.politics.misc
+0.00008 		 talk.religion.misc
+
+
+
+ +
+
+ +
+
+
+
In [ ]:
+
+
+
 
+
+ +
+
+
+ +
+
+
+ + + + + + diff --git a/bayes_1.ipynb b/bayes_1.ipynb new file mode 100644 index 0000000..61a6799 --- /dev/null +++ b/bayes_1.ipynb @@ -0,0 +1,177 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import fetch_20newsgroups\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "import numpy as np\n", + "import sklearn.metrics\n", + "import gensim" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "newsgroups = fetch_20newsgroups()\n", + "newsgroups_text = newsgroups['data']\n", + "newsgroups_text_tokenized = [list(set(gensim.utils.tokenize(x, lowercase = True))) for x in newsgroups_text]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "Y = newsgroups['target']\n", + "Y_names = newsgroups['target_names']" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def get_prob3(index, document_tokenized):\n", + " talks_topic = [x for x,y in zip(newsgroups_text_tokenized,Y) if y == index]\n", + "\n", + " if len(talks_topic) == 0:\n", + " return 0.0\n", + " \n", + " p1_list = []\n", + " for word in document_tokenized:\n", + " to_p1 = len([x for x in talks_topic if word in x]) / len(talks_topic)\n", + " p1_list.append(to_p1)\n", + " \n", + " p1 = np.prod(p1_list)\n", + " \n", + " p2 = len(talks_topic) / len(Y)\n", + " \n", + " return (p1 * p2) # / p3 --- mianownik dla wszystkich klas będzie taki sam dlatego można go pominąć, bez \n", + " # bez wpływu na działanie klasyfikatora" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def print_results(list_of_words):\n", + " probs = []\n", + " for i in range(len(Y_names)):\n", + " p = get_prob3(i, list_of_words)\n", + " probs.append(p)\n", + " print(\"%.5f\" % p,'\\t\\t', 
Y_names[i])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.00001 \t\t alt.atheism\n", + "0.00000 \t\t comp.graphics\n", + "0.00000 \t\t comp.os.ms-windows.misc\n", + "0.00000 \t\t comp.sys.ibm.pc.hardware\n", + "0.00000 \t\t comp.sys.mac.hardware\n", + "0.00000 \t\t comp.windows.x\n", + "0.00000 \t\t misc.forsale\n", + "0.00000 \t\t rec.autos\n", + "0.00002 \t\t rec.motorcycles\n", + "0.00000 \t\t rec.sport.baseball\n", + "0.00001 \t\t rec.sport.hockey\n", + "0.00001 \t\t sci.crypt\n", + "0.00000 \t\t sci.electronics\n", + "0.00000 \t\t sci.med\n", + "0.00000 \t\t sci.space\n", + "0.00000 \t\t soc.religion.christian\n", + "0.00087 \t\t talk.politics.guns\n", + "0.00003 \t\t talk.politics.mideast\n", + "0.00005 \t\t talk.politics.misc\n", + "0.00006 \t\t talk.religion.misc\n" + ] + } + ], + "source": [ + "print_results(['i','love','guns'])" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.00004 \t\t alt.atheism\n", + "0.00000 \t\t comp.graphics\n", + "0.00000 \t\t comp.os.ms-windows.misc\n", + "0.00000 \t\t comp.sys.ibm.pc.hardware\n", + "0.00000 \t\t comp.sys.mac.hardware\n", + "0.00000 \t\t comp.windows.x\n", + "0.00000 \t\t misc.forsale\n", + "0.00000 \t\t rec.autos\n", + "0.00000 \t\t rec.motorcycles\n", + "0.00000 \t\t rec.sport.baseball\n", + "0.00000 \t\t rec.sport.hockey\n", + "0.00000 \t\t sci.crypt\n", + "0.00000 \t\t sci.electronics\n", + "0.00000 \t\t sci.med\n", + "0.00000 \t\t sci.space\n", + "0.00012 \t\t soc.religion.christian\n", + "0.00004 \t\t talk.politics.guns\n", + "0.00007 \t\t talk.politics.mideast\n", + "0.00003 \t\t talk.politics.misc\n", + "0.00008 \t\t talk.religion.misc\n" + ] + } + ], + "source": [ + "print_results(['is','there','life','after','death'])" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}