sport-text-classification-b.../.ipynb_checkpoints/naiwny bayes1 ręcznie-checkpoint.ipynb
Karol Idaszak 5b47b256bb final
2022-05-18 12:04:38 +02:00

295 lines
7.7 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "51cf2311",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import fetch_20newsgroups\n",
"# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html\n",
"\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"import numpy as np\n",
"import sklearn.metrics\n",
"import gensim"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "fcd66c5d",
"metadata": {},
"outputs": [],
"source": [
"newsgroups = fetch_20newsgroups()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d88d795e",
"metadata": {},
"outputs": [],
"source": [
"newsgroups_text = newsgroups['data']"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "56872498",
"metadata": {},
"outputs": [],
"source": [
"newsgroups_text_tokenized = [list(set(gensim.utils.tokenize(x, lowercase = True))) for x in newsgroups_text]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "0e520f15",
"metadata": {},
"outputs": [],
"source": [
"Y = newsgroups['target']\n",
"Y_names = newsgroups['target_names']"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "2538de8c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"From: lerxst@wam.umd.edu (where's my thing)\n",
"Subject: WHAT car is this!?\n",
"Nntp-Posting-Host: rac3.wam.umd.edu\n",
"Organization: University of Maryland, College Park\n",
"Lines: 15\n",
"\n",
" I was wondering if anyone out there could enlighten me on this car I saw\n",
"the other day. It was a 2-door sports car, looked to be from the late 60s/\n",
"early 70s. It was called a Bricklin. The doors were really small. In addition,\n",
"the front bumper was separate from the rest of the body. This is \n",
"all I know. If anyone can tellme a model name, engine specs, years\n",
"of production, where this car is made, history, or whatever info you\n",
"have on this funky looking car, please e-mail.\n",
"\n",
"Thanks,\n",
"- IL\n",
" ---- brought to you by your neighborhood Lerxst ----\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"11314\n",
"11314\n"
]
}
],
"source": [
"print(newsgroups_text[0])\n",
"print(len(newsgroups_text_tokenized))\n",
"print(len(Y))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "47f1919b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['alt.atheism',\n",
" 'comp.graphics',\n",
" 'comp.os.ms-windows.misc',\n",
" 'comp.sys.ibm.pc.hardware',\n",
" 'comp.sys.mac.hardware',\n",
" 'comp.windows.x',\n",
" 'misc.forsale',\n",
" 'rec.autos',\n",
" 'rec.motorcycles',\n",
" 'rec.sport.baseball',\n",
" 'rec.sport.hockey',\n",
" 'sci.crypt',\n",
" 'sci.electronics',\n",
" 'sci.med',\n",
" 'sci.space',\n",
" 'soc.religion.christian',\n",
" 'talk.politics.guns',\n",
" 'talk.politics.mideast',\n",
" 'talk.politics.misc',\n",
" 'talk.religion.misc']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Y_names"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "d9bcab94",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.8071918251862595"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def get_prob3(index=16, document_tokenized = ['i','love','guns']):\n",
" talks_topic = [x for x,y in zip(newsgroups_text_tokenized,Y) if y == index]\n",
" numerator = len(talks_topic) / len(Y)\n",
" for word in document_tokenized:\n",
" numerator *= len([x for x in talks_topic if word in x]) / len(talks_topic)\n",
"\n",
" denominator = 0\n",
" for idx, _ in enumerate(Y_names):\n",
" tt = [x for x,y in zip(newsgroups_text_tokenized,Y) if y == idx]\n",
" p = len(tt) / len(Y)\n",
" for word in document_tokenized:\n",
" p *= len([x for x in tt if word in x]) / len(tt)\n",
" denominator += p\n",
" return numerator/denominator\n",
"get_prob3()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "b38fd7b8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.011441319584519272 alt.atheism\n",
"0.0 comp.graphics\n",
"0.0 comp.os.ms-windows.misc\n",
"0.003002399875191552 comp.sys.ibm.pc.hardware\n",
"0.0 comp.sys.mac.hardware\n",
"0.0 comp.windows.x\n",
"0.00309826447536255 misc.forsale\n",
"0.004196307855354198 rec.autos\n",
"0.020726417246496816 rec.motorcycles\n",
"0.0 rec.sport.baseball\n",
"0.005430275030820152 rec.sport.hockey\n",
"0.00639817080713953 sci.crypt\n",
"0.002400149041276129 sci.electronics\n",
"0.0 sci.med\n",
"0.003973929193182238 sci.space\n",
"0.0 soc.religion.christian\n",
"0.8071918251862595 talk.politics.guns\n",
"0.029527819874460234 talk.politics.mideast\n",
"0.04872929309529775 talk.politics.misc\n",
"0.053883828734640093 talk.religion.misc\n",
"1.0\n"
]
}
],
"source": [
"sum_ = 0\n",
"for idx, name in enumerate(Y_names):\n",
" temp = get_prob3(idx)\n",
" print(temp, name)\n",
" sum_ += temp\n",
"print(sum_)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "73e5c38d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.09992417561379101 alt.atheism\n",
"0.00013625470859758159 comp.graphics\n",
"0.0005000231638560848 comp.os.ms-windows.misc\n",
"0.000511103648847933 comp.sys.ibm.pc.hardware\n",
"0.0015231860361372294 comp.sys.mac.hardware\n",
"0.0005531668782177577 comp.windows.x\n",
"3.6311784651612556e-05 misc.forsale\n",
"0.0057831942216877335 rec.autos\n",
"0.0037764847299935015 rec.motorcycles\n",
"0.0006549716594887765 rec.sport.baseball\n",
"0.0007349736544003172 rec.sport.hockey\n",
"0.002114333224731742 sci.crypt\n",
"0.00016344509681853365 sci.electronics\n",
"0.0119987496304634 sci.med\n",
"0.012351707895276336 sci.space\n",
"0.30485241626343873 soc.religion.christian\n",
"0.10270535698356416 talk.politics.guns\n",
"0.17315690370552841 talk.politics.mideast\n",
"0.08166799428082018 talk.politics.misc\n",
"0.19685524681968897 talk.religion.misc\n",
"1.0\n"
]
}
],
"source": [
"sum_ = 0\n",
"for idx, name in enumerate(Y_names):\n",
" temp = get_prob3(idx, ['is','there','life','after' ,'death'])\n",
" print(temp, name)\n",
" sum_ += temp\n",
"print(sum_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9ce4ec99",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}