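# 4.6 KiB -- stray file-size label (it appears twice in the original text); kept as a comment so the file parses as Python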
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import sklearn.metrics
import gensim
# Load the 20 Newsgroups corpus with scikit-learn's default arguments.
newsgroups = fetch_20newsgroups()

# Class index of every document, and the class names those indices refer to.
Y = newsgroups['target']
Y_names = newsgroups['target_names']

# Raw text of every document.
newsgroups_text = newsgroups['data']

# One list of unique, lower-cased tokens per document. Passing the tokens
# through set() drops repeats, so each word appears at most once per document
# and the order of the tokens inside a document is arbitrary.
newsgroups_text_tokenized = [
    list(set(gensim.utils.tokenize(document, lowercase=True)))
    for document in newsgroups_text
]
def get_prob3(index, document_tokenized, alpha=0.0):
    """Return the unnormalised naive-Bayes score of a document for one class.

    The score is ``P(class) * prod_w P(w | class)``, where ``P(w | class)`` is
    the fraction of training documents of that class that contain the word
    ``w``. The denominator ``P(document)`` is the same for every class, so it
    is left out: omitting it does not change how the classes rank.

    Reads the module-level ``newsgroups_text_tokenized`` (one token collection
    per training document) and ``Y`` (one class index per training document).

    Args:
        index: Class index to score against; compared with the entries of ``Y``.
        document_tokenized: Iterable of word tokens. A word that is repeated
            contributes one factor per occurrence.
        alpha: Additive (Laplace/Lidstone) smoothing strength. The default
            ``0.0`` uses raw document frequencies, so a single word that never
            occurs in the class makes the whole score zero. With a positive
            value each factor becomes ``(df + alpha) / (n + 2 * alpha)``.

    Returns:
        The score as a float; ``0.0`` when no training document belongs to
        class ``index``.

    Raises:
        ValueError: If ``alpha`` is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    class_docs = [
        doc for doc, label in zip(newsgroups_text_tokenized, Y) if label == index
    ]
    if not class_docs:
        # Unknown/empty class: its prior is zero (this also avoids dividing by 0).
        return 0.0

    n_class_docs = len(class_docs)
    # P(word | class) for every word; summing booleans counts the documents
    # containing the word without building a temporary list.
    likelihoods = [
        (sum(word in doc for doc in class_docs) + alpha) / (n_class_docs + 2 * alpha)
        for word in document_tokenized
    ]

    prior = n_class_docs / len(Y)
    # np.prod of an empty list is 1.0, so an empty document scores the prior.
    return np.prod(likelihoods) * prior
def print_results(list_of_words):
    """Print the score of a tokenised document for every class.

    Calls ``get_prob3`` once per entry of the module-level ``Y_names`` and
    prints one line per class: the score with five decimals, a tab gap, and
    the class name.

    Args:
        list_of_words: Word tokens of the document to score.
    """
    for class_index, class_name in enumerate(Y_names):
        score = get_prob3(class_index, list_of_words)
        print("%.5f" % score, '\t\t', class_name)
print_results(['i','love','guns'])
# Recorded output of the call above (score, class name):
#   0.00001  alt.atheism
#   0.00000  comp.graphics
#   0.00000  comp.os.ms-windows.misc
#   0.00000  comp.sys.ibm.pc.hardware
#   0.00000  comp.sys.mac.hardware
#   0.00000  comp.windows.x
#   0.00000  misc.forsale
#   0.00000  rec.autos
#   0.00002  rec.motorcycles
#   0.00000  rec.sport.baseball
#   0.00001  rec.sport.hockey
#   0.00001  sci.crypt
#   0.00000  sci.electronics
#   0.00000  sci.med
#   0.00000  sci.space
#   0.00000  soc.religion.christian
#   0.00087  talk.politics.guns
#   0.00003  talk.politics.mideast
#   0.00005  talk.politics.misc
#   0.00006  talk.religion.misc

print_results(['is','there','life','after','death'])
# Recorded output of the call above (score, class name):
#   0.00004  alt.atheism
#   0.00000  comp.graphics
#   0.00000  comp.os.ms-windows.misc
#   0.00000  comp.sys.ibm.pc.hardware
#   0.00000  comp.sys.mac.hardware
#   0.00000  comp.windows.x
#   0.00000  misc.forsale
#   0.00000  rec.autos
#   0.00000  rec.motorcycles
#   0.00000  rec.sport.baseball
#   0.00000  rec.sport.hockey
#   0.00000  sci.crypt
#   0.00000  sci.electronics
#   0.00000  sci.med
#   0.00000  sci.space
#   0.00012  soc.religion.christian
#   0.00004  talk.politics.guns
#   0.00007  talk.politics.mideast
#   0.00003  talk.politics.misc
#   0.00008  talk.religion.misc