295 lines
7.7 KiB
Plaintext
295 lines
7.7 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 6,
|
||
|
"id": "51cf2311",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"from sklearn.datasets import fetch_20newsgroups\n",
|
||
|
"# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html\n",
|
||
|
"\n",
|
||
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||
|
"import numpy as np\n",
|
||
|
"import sklearn.metrics\n",
|
||
|
"import gensim"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 7,
|
||
|
"id": "fcd66c5d",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"newsgroups = fetch_20newsgroups()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 8,
|
||
|
"id": "d88d795e",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"newsgroups_text = newsgroups['data']"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 9,
|
||
|
"id": "56872498",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"newsgroups_text_tokenized = [list(set(gensim.utils.tokenize(x, lowercase = True))) for x in newsgroups_text]"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 14,
|
||
|
"id": "0e520f15",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"Y = newsgroups['target']\n",
|
||
|
"Y_names = newsgroups['target_names']"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 19,
|
||
|
"id": "2538de8c",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"From: lerxst@wam.umd.edu (where's my thing)\n",
|
||
|
"Subject: WHAT car is this!?\n",
|
||
|
"Nntp-Posting-Host: rac3.wam.umd.edu\n",
|
||
|
"Organization: University of Maryland, College Park\n",
|
||
|
"Lines: 15\n",
|
||
|
"\n",
|
||
|
" I was wondering if anyone out there could enlighten me on this car I saw\n",
|
||
|
"the other day. It was a 2-door sports car, looked to be from the late 60s/\n",
|
||
|
"early 70s. It was called a Bricklin. The doors were really small. In addition,\n",
|
||
|
"the front bumper was separate from the rest of the body. This is \n",
|
||
|
"all I know. If anyone can tellme a model name, engine specs, years\n",
|
||
|
"of production, where this car is made, history, or whatever info you\n",
|
||
|
"have on this funky looking car, please e-mail.\n",
|
||
|
"\n",
|
||
|
"Thanks,\n",
|
||
|
"- IL\n",
|
||
|
" ---- brought to you by your neighborhood Lerxst ----\n",
|
||
|
"\n",
|
||
|
"\n",
|
||
|
"\n",
|
||
|
"\n",
|
||
|
"\n",
|
||
|
"11314\n",
|
||
|
"11314\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"print(newsgroups_text[0])\n",
|
||
|
"print(len(newsgroups_text_tokenized))\n",
|
||
|
"print(len(Y))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 15,
|
||
|
"id": "47f1919b",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"['alt.atheism',\n",
|
||
|
" 'comp.graphics',\n",
|
||
|
" 'comp.os.ms-windows.misc',\n",
|
||
|
" 'comp.sys.ibm.pc.hardware',\n",
|
||
|
" 'comp.sys.mac.hardware',\n",
|
||
|
" 'comp.windows.x',\n",
|
||
|
" 'misc.forsale',\n",
|
||
|
" 'rec.autos',\n",
|
||
|
" 'rec.motorcycles',\n",
|
||
|
" 'rec.sport.baseball',\n",
|
||
|
" 'rec.sport.hockey',\n",
|
||
|
" 'sci.crypt',\n",
|
||
|
" 'sci.electronics',\n",
|
||
|
" 'sci.med',\n",
|
||
|
" 'sci.space',\n",
|
||
|
" 'soc.religion.christian',\n",
|
||
|
" 'talk.politics.guns',\n",
|
||
|
" 'talk.politics.mideast',\n",
|
||
|
" 'talk.politics.misc',\n",
|
||
|
" 'talk.religion.misc']"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 15,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"Y_names"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 16,
|
||
|
"id": "d9bcab94",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"0.8071918251862595"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 16,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"def get_prob3(index=16, document_tokenized = ['i','love','guns']):\n",
|
||
|
" talks_topic = [x for x,y in zip(newsgroups_text_tokenized,Y) if y == index]\n",
|
||
|
" numerator = len(talks_topic) / len(Y)\n",
|
||
|
" for word in document_tokenized:\n",
|
||
|
" numerator *= len([x for x in talks_topic if word in x]) / len(talks_topic)\n",
|
||
|
"\n",
|
||
|
" denominator = 0\n",
|
||
|
" for idx, _ in enumerate(Y_names):\n",
|
||
|
" tt = [x for x,y in zip(newsgroups_text_tokenized,Y) if y == idx]\n",
|
||
|
" p = len(tt) / len(Y)\n",
|
||
|
" for word in document_tokenized:\n",
|
||
|
" p *= len([x for x in tt if word in x]) / len(tt)\n",
|
||
|
" denominator += p\n",
|
||
|
" return numerator/denominator\n",
|
||
|
"get_prob3()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 17,
|
||
|
"id": "b38fd7b8",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"0.011441319584519272 alt.atheism\n",
|
||
|
"0.0 comp.graphics\n",
|
||
|
"0.0 comp.os.ms-windows.misc\n",
|
||
|
"0.003002399875191552 comp.sys.ibm.pc.hardware\n",
|
||
|
"0.0 comp.sys.mac.hardware\n",
|
||
|
"0.0 comp.windows.x\n",
|
||
|
"0.00309826447536255 misc.forsale\n",
|
||
|
"0.004196307855354198 rec.autos\n",
|
||
|
"0.020726417246496816 rec.motorcycles\n",
|
||
|
"0.0 rec.sport.baseball\n",
|
||
|
"0.005430275030820152 rec.sport.hockey\n",
|
||
|
"0.00639817080713953 sci.crypt\n",
|
||
|
"0.002400149041276129 sci.electronics\n",
|
||
|
"0.0 sci.med\n",
|
||
|
"0.003973929193182238 sci.space\n",
|
||
|
"0.0 soc.religion.christian\n",
|
||
|
"0.8071918251862595 talk.politics.guns\n",
|
||
|
"0.029527819874460234 talk.politics.mideast\n",
|
||
|
"0.04872929309529775 talk.politics.misc\n",
|
||
|
"0.053883828734640093 talk.religion.misc\n",
|
||
|
"1.0\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"sum_ = 0\n",
|
||
|
"for idx, name in enumerate(Y_names):\n",
|
||
|
" temp = get_prob3(idx)\n",
|
||
|
" print(temp, name)\n",
|
||
|
" sum_ += temp\n",
|
||
|
"print(sum_)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 18,
|
||
|
"id": "73e5c38d",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"0.09992417561379101 alt.atheism\n",
|
||
|
"0.00013625470859758159 comp.graphics\n",
|
||
|
"0.0005000231638560848 comp.os.ms-windows.misc\n",
|
||
|
"0.000511103648847933 comp.sys.ibm.pc.hardware\n",
|
||
|
"0.0015231860361372294 comp.sys.mac.hardware\n",
|
||
|
"0.0005531668782177577 comp.windows.x\n",
|
||
|
"3.6311784651612556e-05 misc.forsale\n",
|
||
|
"0.0057831942216877335 rec.autos\n",
|
||
|
"0.0037764847299935015 rec.motorcycles\n",
|
||
|
"0.0006549716594887765 rec.sport.baseball\n",
|
||
|
"0.0007349736544003172 rec.sport.hockey\n",
|
||
|
"0.002114333224731742 sci.crypt\n",
|
||
|
"0.00016344509681853365 sci.electronics\n",
|
||
|
"0.0119987496304634 sci.med\n",
|
||
|
"0.012351707895276336 sci.space\n",
|
||
|
"0.30485241626343873 soc.religion.christian\n",
|
||
|
"0.10270535698356416 talk.politics.guns\n",
|
||
|
"0.17315690370552841 talk.politics.mideast\n",
|
||
|
"0.08166799428082018 talk.politics.misc\n",
|
||
|
"0.19685524681968897 talk.religion.misc\n",
|
||
|
"1.0\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"sum_ = 0\n",
|
||
|
"for idx, name in enumerate(Y_names):\n",
|
||
|
" temp = get_prob3(idx, ['is','there','life','after' ,'death'])\n",
|
||
|
" print(temp, name)\n",
|
||
|
" sum_ += temp\n",
|
||
|
"print(sum_)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "9ce4ec99",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": []
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3 (ipykernel)",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.10.2"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 5
|
||
|
}
|