from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import sklearn.metrics
import gensim
# Load the 20 Newsgroups training split and tokenize each document into its
# set of unique, lowercased word tokens (deduplicated via set()).
newsgroups = fetch_20newsgroups()
newsgroups_text = newsgroups['data']
newsgroups_text_tokenized = [
    list(set(gensim.utils.tokenize(doc, lowercase=True)))
    for doc in newsgroups_text
]
# Integer class labels and the matching human-readable class names.
Y = newsgroups['target']
Y_names = newsgroups['target_names']
def get_prob3(index, document_tokenized):
    """Unnormalized naive-Bayes score P(words | class) * P(class) for one class.

    Parameters
    ----------
    index : int
        Class index into the module-level ``Y`` / ``Y_names``.
    document_tokenized : iterable of str
        Tokens of the query document (assumed already deduplicated,
        like the entries of ``newsgroups_text_tokenized``).

    Returns
    -------
    float
        Likelihood times prior. The evidence P(words) is identical for
        every class, so it is omitted without affecting the classifier's
        argmax (this is what the original Polish comment explained).
    """
    # Training documents (as token lists) belonging to the requested class.
    talks_topic = [doc for doc, label in zip(newsgroups_text_tokenized, Y)
                   if label == index]
    if not talks_topic:
        # Class absent from the training data -> zero posterior.
        return 0.0
    n_docs = len(talks_topic)  # hoisted: invariant across the word loop
    # Per-word document frequency within the class. NOTE(review): no Laplace
    # smoothing — a word never seen in this class drives the whole product
    # to zero; kept as-is to preserve the original behavior.
    p1_list = [sum(1 for doc in talks_topic if word in doc) / n_docs
               for word in document_tokenized]
    # Independence assumption; may underflow toward 0.0 for long queries
    # (log-space would fix that but would change the returned values).
    p1 = np.prod(p1_list)
    p2 = n_docs / len(Y)  # class prior P(class)
    return p1 * p2
def print_results(list_of_words):
    """Print the unnormalized naive-Bayes score of every class for a query.

    Parameters
    ----------
    list_of_words : list of str
        Tokenized query document, passed through to ``get_prob3``.

    One line per class: score, two tabs, class name (same layout as before).
    """
    # enumerate() replaces the range(len(...)) anti-idiom; the original also
    # accumulated the scores into an unused list, which has been removed.
    for i, name in enumerate(Y_names):
        p = get_prob3(i, list_of_words)
        print("%.5f" % p, '\t\t', name)
# Demo: score two pre-tokenized example queries against all 20 classes.
print_results(['i','love','guns'])
print_results(['is','there','life','after','death'])