In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import sklearn.metrics
import gensim
In [2]:
# Load the 20 Newsgroups corpus through fetch_20newsgroups() with its default arguments.
newsgroups = fetch_20newsgroups()
newsgroups_text = newsgroups.data

# One entry per post: its distinct lowercase tokens. Passing through set()
# drops duplicates, so only presence/absence of a token is kept per post.
newsgroups_text_tokenized = [
    list(set(gensim.utils.tokenize(post, lowercase=True)))
    for post in newsgroups_text
]
In [3]:
# Class index of every post, aligned position-by-position with newsgroups_text.
Y = newsgroups.target
# Class names; position i holds the name for class index i.
Y_names = newsgroups.target_names
In [11]:
def get_prob3(index, document_tokenized, alpha=0.0):
    """Return the unnormalised naive-Bayes score of class ``index`` for a document.

    The score is ``P(class) * prod_w P(w occurs in a post | class)``. Both
    factors are estimated from the module-level ``newsgroups_text_tokenized``
    (tokens of each post) and ``Y`` (class index of each post). The denominator
    of Bayes' rule is the same for every class, so it is left out: this does not
    change which class scores highest, but the scores do not sum to 1.

    Parameters
    ----------
    index : int
        Class index, compared for equality against the entries of ``Y``.
    document_tokenized : iterable of str
        Tokens of the document to score. A repeated token contributes its
        factor once per occurrence.
    alpha : float, optional
        Additive smoothing for each per-word estimate, computed as
        ``(hits + alpha) / (n_posts + 2 * alpha)``. The default ``0.0`` is the
        plain relative frequency, in which case one token that never occurs in
        the class makes the whole score 0; pass e.g. ``1.0`` to avoid that.

    Returns
    -------
    float
        ``0.0`` when no post is labelled ``index``; otherwise the score above
        (a NumPy float, because the product is taken with ``np.prod``).
    """
    # The stored posts are lists; converting the class's posts to sets once
    # makes every `word in post` check below a hash lookup instead of a scan.
    class_posts = [
        set(tokens)
        for tokens, label in zip(newsgroups_text_tokenized, Y)
        if label == index
    ]
    n_posts = len(class_posts)
    if n_posts == 0:
        return 0.0

    word_likelihoods = []
    for word in document_tokenized:
        hits = sum(1 for post in class_posts if word in post)
        word_likelihoods.append((hits + alpha) / (n_posts + 2 * alpha))

    likelihood = np.prod(word_likelihoods)  # evaluates to 1.0 for an empty document
    prior = n_posts / len(Y)

    return likelihood * prior
In [15]:
def print_results(list_of_words):
    """Print the ``get_prob3`` score of ``list_of_words`` for every class.

    One line is written per entry of the module-level ``Y_names``, in order:
    the score formatted with five decimals, a tab separator, then the class
    name. Nothing is returned.

    Parameters
    ----------
    list_of_words : list of str
        Tokens of the query document, passed unchanged to ``get_prob3``.
    """
    for class_index, class_name in enumerate(Y_names):
        score = get_prob3(class_index, list_of_words)
        print("%.5f" % score, '\t\t', class_name)
In [17]:
# Example query: prints one score line per newsgroup for these three tokens.
print_results(['i','love','guns'])
0.00001 		 alt.atheism
0.00000 		 comp.graphics
0.00000 		 comp.os.ms-windows.misc
0.00000 		 comp.sys.ibm.pc.hardware
0.00000 		 comp.sys.mac.hardware
0.00000 		 comp.windows.x
0.00000 		 misc.forsale
0.00000 		 rec.autos
0.00002 		 rec.motorcycles
0.00000 		 rec.sport.baseball
0.00001 		 rec.sport.hockey
0.00001 		 sci.crypt
0.00000 		 sci.electronics
0.00000 		 sci.med
0.00000 		 sci.space
0.00000 		 soc.religion.christian
0.00087 		 talk.politics.guns
0.00003 		 talk.politics.mideast
0.00005 		 talk.politics.misc
0.00006 		 talk.religion.misc
In [19]:
# Second example query: prints one score line per newsgroup for these five tokens.
print_results(['is','there','life','after','death'])
0.00004 		 alt.atheism
0.00000 		 comp.graphics
0.00000 		 comp.os.ms-windows.misc
0.00000 		 comp.sys.ibm.pc.hardware
0.00000 		 comp.sys.mac.hardware
0.00000 		 comp.windows.x
0.00000 		 misc.forsale
0.00000 		 rec.autos
0.00000 		 rec.motorcycles
0.00000 		 rec.sport.baseball
0.00000 		 rec.sport.hockey
0.00000 		 sci.crypt
0.00000 		 sci.electronics
0.00000 		 sci.med
0.00000 		 sci.space
0.00012 		 soc.religion.christian
0.00004 		 talk.politics.guns
0.00007 		 talk.politics.mideast
0.00003 		 talk.politics.misc
0.00008 		 talk.religion.misc
In [ ]: