7.7 KiB
7.7 KiB
from sklearn.datasets import fetch_20newsgroups
# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import sklearn.metrics
import gensim
# Load the 20 newsgroups corpus with the loader's default arguments
# (per the scikit-learn docs this is the training subset).
newsgroups = fetch_20newsgroups()
newsgroups_text = newsgroups['data']
Y = newsgroups['target']
Y_names = newsgroups['target_names']

# Represent every post by its distinct lower-cased tokens: only the
# presence or absence of a word in a post matters below, not its count.
newsgroups_text_tokenized = [
    list(set(gensim.utils.tokenize(post, lowercase=True)))
    for post in newsgroups_text
]

# Sanity checks: show the first raw post and confirm that the number of
# tokenized posts matches the number of labels.
print(newsgroups_text[0])
print(len(newsgroups_text_tokenized))
print(len(Y))
From: lerxst@wam.umd.edu (where's my thing) Subject: WHAT car is this!? Nntp-Posting-Host: rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: 15 I was wondering if anyone out there could enlighten me on this car I saw the other day. It was a 2-door sports car, looked to be from the late 60s/ early 70s. It was called a Bricklin. The doors were really small. In addition, the front bumper was separate from the rest of the body. This is all I know. If anyone can tellme a model name, engine specs, years of production, where this car is made, history, or whatever info you have on this funky looking car, please e-mail. Thanks, - IL ---- brought to you by your neighborhood Lerxst ---- 11314 11314
# Notebook display: echo the list of class (newsgroup) names.
Y_names
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
def _class_joint_probability(class_docs, total_docs, words):
    """Return P(class) * prod_w P(w | class), estimated from document counts.

    ``class_docs`` are the tokenized documents of one class, ``total_docs``
    is the size of the whole corpus, and ``words`` is the query document.
    A class with no documents has zero prior and therefore contributes 0.0.
    """
    class_size = len(class_docs)
    if class_size == 0 or total_docs == 0:
        return 0.0
    joint = class_size / total_docs
    for word in words:
        # P(word | class): share of the class's documents containing the word.
        joint *= sum(1 for doc in class_docs if word in doc) / class_size
    return joint


def get_prob3(index=16, document_tokenized=('i', 'love', 'guns')):
    """Return the naive-Bayes posterior P(class ``index`` | document).

    The prior of a class is its share of the corpus, and the likelihood of
    each word is the fraction of that class's documents containing it (no
    smoothing). The result is the joint probability for class ``index``
    divided by the sum of the joint probabilities over all classes listed
    in ``Y_names``.

    Reads the module-level ``newsgroups_text_tokenized``, ``Y`` and
    ``Y_names``.

    Args:
        index: Label of the class whose posterior is wanted.
        document_tokenized: Iterable of tokens making up the query document.

    Returns:
        The posterior probability as a float. Returns 0.0 when no class can
        explain the document (every joint probability is zero), instead of
        raising ``ZeroDivisionError``.
    """
    # Materialize once so one-shot iterables survive the per-class passes.
    words = list(document_tokenized)

    # Group the corpus by label in a single pass rather than rescanning it
    # for every class.
    docs_by_class = {}
    for doc, label in zip(newsgroups_text_tokenized, Y):
        docs_by_class.setdefault(label, []).append(doc)
    total_docs = len(Y)

    numerator = _class_joint_probability(
        docs_by_class.get(index, []), total_docs, words
    )

    # Plain left-to-right accumulation keeps the floating-point result
    # identical to summing the per-class terms one by one.
    denominator = 0
    for idx in range(len(Y_names)):
        denominator += _class_joint_probability(
            docs_by_class.get(idx, []), total_docs, words
        )

    if denominator == 0:
        # No class contains all the words: the posterior is undefined
        # without smoothing, so report zero support rather than crash.
        return 0.0
    return numerator / denominator
# Notebook display: posterior for the default arguments
# (index=16, document ['i', 'love', 'guns']).
get_prob3()
0.8071918251862595
# Print the posterior of the default document for every class, then the
# running total of those posteriors.
sum_ = 0
for idx, name in enumerate(Y_names):
    temp = get_prob3(idx)
    sum_ = sum_ + temp
    print(temp, name)
print(sum_)
0.011441319584519272 alt.atheism 0.0 comp.graphics 0.0 comp.os.ms-windows.misc 0.003002399875191552 comp.sys.ibm.pc.hardware 0.0 comp.sys.mac.hardware 0.0 comp.windows.x 0.00309826447536255 misc.forsale 0.004196307855354198 rec.autos 0.020726417246496816 rec.motorcycles 0.0 rec.sport.baseball 0.005430275030820152 rec.sport.hockey 0.00639817080713953 sci.crypt 0.002400149041276129 sci.electronics 0.0 sci.med 0.003973929193182238 sci.space 0.0 soc.religion.christian 0.8071918251862595 talk.politics.guns 0.029527819874460234 talk.politics.mideast 0.04872929309529775 talk.politics.misc 0.053883828734640093 talk.religion.misc 1.0
# Same per-class report for a custom query document, followed by the
# running total of the posteriors.
sum_ = 0
for idx, name in enumerate(Y_names):
    temp = get_prob3(idx, ['is', 'there', 'life', 'after', 'death'])
    sum_ = sum_ + temp
    print(temp, name)
print(sum_)
0.09992417561379101 alt.atheism 0.00013625470859758159 comp.graphics 0.0005000231638560848 comp.os.ms-windows.misc 0.000511103648847933 comp.sys.ibm.pc.hardware 0.0015231860361372294 comp.sys.mac.hardware 0.0005531668782177577 comp.windows.x 3.6311784651612556e-05 misc.forsale 0.0057831942216877335 rec.autos 0.0037764847299935015 rec.motorcycles 0.0006549716594887765 rec.sport.baseball 0.0007349736544003172 rec.sport.hockey 0.002114333224731742 sci.crypt 0.00016344509681853365 sci.electronics 0.0119987496304634 sci.med 0.012351707895276336 sci.space 0.30485241626343873 soc.religion.christian 0.10270535698356416 talk.politics.guns 0.17315690370552841 talk.politics.mideast 0.08166799428082018 talk.politics.misc 0.19685524681968897 talk.religion.misc 1.0