This commit is contained in:
Karol Idaszak 2022-05-18 12:04:38 +02:00
parent 7c17af952e
commit 5b47b256bb
4 changed files with 11451 additions and 0 deletions

View File

@ -0,0 +1,294 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "51cf2311",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import fetch_20newsgroups\n",
"# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html\n",
"\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"import numpy as np\n",
"import sklearn.metrics\n",
"import gensim"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "fcd66c5d",
"metadata": {},
"outputs": [],
"source": [
"newsgroups = fetch_20newsgroups()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d88d795e",
"metadata": {},
"outputs": [],
"source": [
"newsgroups_text = newsgroups['data']"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "56872498",
"metadata": {},
"outputs": [],
"source": [
"newsgroups_text_tokenized = [list(set(gensim.utils.tokenize(x, lowercase = True))) for x in newsgroups_text]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "0e520f15",
"metadata": {},
"outputs": [],
"source": [
"Y = newsgroups['target']\n",
"Y_names = newsgroups['target_names']"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "2538de8c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"From: lerxst@wam.umd.edu (where's my thing)\n",
"Subject: WHAT car is this!?\n",
"Nntp-Posting-Host: rac3.wam.umd.edu\n",
"Organization: University of Maryland, College Park\n",
"Lines: 15\n",
"\n",
" I was wondering if anyone out there could enlighten me on this car I saw\n",
"the other day. It was a 2-door sports car, looked to be from the late 60s/\n",
"early 70s. It was called a Bricklin. The doors were really small. In addition,\n",
"the front bumper was separate from the rest of the body. This is \n",
"all I know. If anyone can tellme a model name, engine specs, years\n",
"of production, where this car is made, history, or whatever info you\n",
"have on this funky looking car, please e-mail.\n",
"\n",
"Thanks,\n",
"- IL\n",
" ---- brought to you by your neighborhood Lerxst ----\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"11314\n",
"11314\n"
]
}
],
"source": [
"print(newsgroups_text[0])\n",
"print(len(newsgroups_text_tokenized))\n",
"print(len(Y))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "47f1919b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['alt.atheism',\n",
" 'comp.graphics',\n",
" 'comp.os.ms-windows.misc',\n",
" 'comp.sys.ibm.pc.hardware',\n",
" 'comp.sys.mac.hardware',\n",
" 'comp.windows.x',\n",
" 'misc.forsale',\n",
" 'rec.autos',\n",
" 'rec.motorcycles',\n",
" 'rec.sport.baseball',\n",
" 'rec.sport.hockey',\n",
" 'sci.crypt',\n",
" 'sci.electronics',\n",
" 'sci.med',\n",
" 'sci.space',\n",
" 'soc.religion.christian',\n",
" 'talk.politics.guns',\n",
" 'talk.politics.mideast',\n",
" 'talk.politics.misc',\n",
" 'talk.religion.misc']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Y_names"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "d9bcab94",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.8071918251862595"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def get_prob3(index=16, document_tokenized = ['i','love','guns']):\n",
" talks_topic = [x for x,y in zip(newsgroups_text_tokenized,Y) if y == index]\n",
" numerator = len(talks_topic) / len(Y)\n",
" for word in document_tokenized:\n",
" numerator *= len([x for x in talks_topic if word in x]) / len(talks_topic)\n",
"\n",
" denominator = 0\n",
" for idx, _ in enumerate(Y_names):\n",
" tt = [x for x,y in zip(newsgroups_text_tokenized,Y) if y == idx]\n",
" p = len(tt) / len(Y)\n",
" for word in document_tokenized:\n",
" p *= len([x for x in tt if word in x]) / len(tt)\n",
" denominator += p\n",
" return numerator/denominator\n",
"get_prob3()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "b38fd7b8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.011441319584519272 alt.atheism\n",
"0.0 comp.graphics\n",
"0.0 comp.os.ms-windows.misc\n",
"0.003002399875191552 comp.sys.ibm.pc.hardware\n",
"0.0 comp.sys.mac.hardware\n",
"0.0 comp.windows.x\n",
"0.00309826447536255 misc.forsale\n",
"0.004196307855354198 rec.autos\n",
"0.020726417246496816 rec.motorcycles\n",
"0.0 rec.sport.baseball\n",
"0.005430275030820152 rec.sport.hockey\n",
"0.00639817080713953 sci.crypt\n",
"0.002400149041276129 sci.electronics\n",
"0.0 sci.med\n",
"0.003973929193182238 sci.space\n",
"0.0 soc.religion.christian\n",
"0.8071918251862595 talk.politics.guns\n",
"0.029527819874460234 talk.politics.mideast\n",
"0.04872929309529775 talk.politics.misc\n",
"0.053883828734640093 talk.religion.misc\n",
"1.0\n"
]
}
],
"source": [
"sum_ = 0\n",
"for idx, name in enumerate(Y_names):\n",
" temp = get_prob3(idx)\n",
" print(temp, name)\n",
" sum_ += temp\n",
"print(sum_)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "73e5c38d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.09992417561379101 alt.atheism\n",
"0.00013625470859758159 comp.graphics\n",
"0.0005000231638560848 comp.os.ms-windows.misc\n",
"0.000511103648847933 comp.sys.ibm.pc.hardware\n",
"0.0015231860361372294 comp.sys.mac.hardware\n",
"0.0005531668782177577 comp.windows.x\n",
"3.6311784651612556e-05 misc.forsale\n",
"0.0057831942216877335 rec.autos\n",
"0.0037764847299935015 rec.motorcycles\n",
"0.0006549716594887765 rec.sport.baseball\n",
"0.0007349736544003172 rec.sport.hockey\n",
"0.002114333224731742 sci.crypt\n",
"0.00016344509681853365 sci.electronics\n",
"0.0119987496304634 sci.med\n",
"0.012351707895276336 sci.space\n",
"0.30485241626343873 soc.religion.christian\n",
"0.10270535698356416 talk.politics.guns\n",
"0.17315690370552841 talk.politics.mideast\n",
"0.08166799428082018 talk.politics.misc\n",
"0.19685524681968897 talk.religion.misc\n",
"1.0\n"
]
}
],
"source": [
"sum_ = 0\n",
"for idx, name in enumerate(Y_names):\n",
" temp = get_prob3(idx, ['is','there','life','after' ,'death'])\n",
" print(temp, name)\n",
" sum_ += temp\n",
"print(sum_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9ce4ec99",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

5452
out_2.tsv Normal file

File diff suppressed because it is too large Load Diff

5445
out_t.tsv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,260 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "6dd16441",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from collections import Counter\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "28750094",
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: 'train\\\\train.tsv'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"Input \u001b[1;32mIn [2]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtrain\u001b[39;49m\u001b[38;5;130;43;01m\\\\\u001b[39;49;00m\u001b[38;5;124;43mtrain.tsv\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msep\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;130;43;01m\\t\u001b[39;49;00m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mon_bad_lines\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mskip\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnames\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43my\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mx\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m df \u001b[38;5;241m=\u001b[39m df\u001b[38;5;241m.\u001b[39mloc[:,[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mx\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124my\u001b[39m\u001b[38;5;124m'\u001b[39m]]\n\u001b[0;32m 3\u001b[0m df\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python310\\site-packages\\pandas\\util\\_decorators.py:311\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 305\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) \u001b[38;5;241m>\u001b[39m num_allow_args:\n\u001b[0;32m 306\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m 307\u001b[0m msg\u001b[38;5;241m.\u001b[39mformat(arguments\u001b[38;5;241m=\u001b[39marguments),\n\u001b[0;32m 308\u001b[0m \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[0;32m 309\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39mstacklevel,\n\u001b[0;32m 310\u001b[0m )\n\u001b[1;32m--> 311\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python310\\site-packages\\pandas\\io\\parsers\\readers.py:680\u001b[0m, in \u001b[0;36mread_csv\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[0;32m 665\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[0;32m 666\u001b[0m dialect,\n\u001b[0;32m 667\u001b[0m delimiter,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 676\u001b[0m defaults\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdelimiter\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m,\u001b[39m\u001b[38;5;124m\"\u001b[39m},\n\u001b[0;32m 677\u001b[0m )\n\u001b[0;32m 678\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[1;32m--> 680\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python310\\site-packages\\pandas\\io\\parsers\\readers.py:575\u001b[0m, in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 572\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[0;32m 574\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[1;32m--> 575\u001b[0m parser \u001b[38;5;241m=\u001b[39m TextFileReader(filepath_or_buffer, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds)\n\u001b[0;32m 577\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[0;32m 578\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python310\\site-packages\\pandas\\io\\parsers\\readers.py:933\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m 930\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 932\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m--> 933\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python310\\site-packages\\pandas\\io\\parsers\\readers.py:1217\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[1;34m(self, f, engine)\u001b[0m\n\u001b[0;32m 1213\u001b[0m mode \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1214\u001b[0m \u001b[38;5;66;03m# error: No overload variant of \"get_handle\" matches argument types\u001b[39;00m\n\u001b[0;32m 1215\u001b[0m \u001b[38;5;66;03m# \"Union[str, PathLike[str], ReadCsvBuffer[bytes], ReadCsvBuffer[str]]\"\u001b[39;00m\n\u001b[0;32m 1216\u001b[0m \u001b[38;5;66;03m# , \"str\", \"bool\", \"Any\", \"Any\", \"Any\", \"Any\", \"Any\"\u001b[39;00m\n\u001b[1;32m-> 1217\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore[call-overload]\u001b[39;49;00m\n\u001b[0;32m 1218\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1219\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1220\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1221\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1222\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1223\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1224\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1225\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1226\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1227\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 1228\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python310\\site-packages\\pandas\\io\\common.py:789\u001b[0m, in \u001b[0;36mget_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[0;32m 784\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m 785\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[0;32m 786\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[0;32m 787\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[0;32m 788\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[1;32m--> 789\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[0;32m 790\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 791\u001b[0m \u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 792\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 793\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 794\u001b[0m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 795\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 796\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 797\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[0;32m 798\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n",
"\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'train\\\\train.tsv'"
]
}
],
"source": [
"df = pd.read_csv('train\\\\train.tsv', sep = '\\t', on_bad_lines='skip', names=['y', 'x'])\n",
"df = df.loc[:,['x','y']]\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c75b4e41",
"metadata": {},
"outputs": [],
"source": [
"# df = df.head(20000)\n",
"Y = df.y\n",
"# data = df.T.to_dict().values()\n",
"data = df.x\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "71f0990e",
"metadata": {},
"outputs": [],
"source": [
"# from sklearn.feature_extraction import DictVectorizer\n",
"\n",
"# dv = DictVectorizer(sparse=False)\n",
"# X = dv.fit_transform(data)\n",
"# Y, X\n",
"\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"dv = TfidfVectorizer()\n",
"X = dv.fit_transform(data)\n",
"Y, X"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "090afa16",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.naive_bayes import BernoulliNB\n",
"algorithm = BernoulliNB()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e2420db6",
"metadata": {},
"outputs": [],
"source": [
"t = algorithm.fit(X, Y)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f38ec860",
"metadata": {},
"outputs": [],
"source": [
"test = pd.read_csv('dev-0\\\\in.tsv', sep = '\\t', on_bad_lines='warn', names=['x'])\n",
"Y_t = pd.read_csv('dev-0\\\\expected.tsv', sep = '\\t', on_bad_lines='warn', names=['y'])\n",
"# Y_t = Y_t.drop([1983, 5199])\n",
"# test = test.drop([1983, 5199])\n",
"\n",
"# Y_t = Y_t.head(5400)\n",
"# test = test.head(5400)\n",
"# X_t = dv.transform(test.T.to_dict().values())\n",
"X_t = dv.transform(test.x)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3ba103e1",
"metadata": {},
"outputs": [],
"source": [
"# Y_t"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b1f9034",
"metadata": {},
"outputs": [],
"source": [
"prediction = [item[1] for item in algorithm.predict_proba(X_t)]\n",
"prediction\n",
"prediction_bin = [item[1] for item in algorithm.predict_proba(X_t).round()]\n",
"prediction_bin"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ef5ea24f",
"metadata": {},
"outputs": [],
"source": [
"out = pd.DataFrame(prediction_bin)\n",
"out.to_csv('out_2.tsv', sep='\\t', index=False, header=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "541b3a4c",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from sklearn.metrics import log_loss, accuracy_score\n",
"print(log_loss(Y_t, prediction))\n",
"print(accuracy_score(Y_t, prediction_bin))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7933215b",
"metadata": {},
"outputs": [],
"source": [
"test_a = pd.read_csv('test-A\\\\in.tsv', sep = '\\t', on_bad_lines='warn', names=['x'])\n",
"# test_a[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f5812918",
"metadata": {},
"outputs": [],
"source": [
"# test_a = 0\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "463e2bf8",
"metadata": {},
"outputs": [],
"source": [
"test_a = dv.transform(test_a.x)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1abb074",
"metadata": {},
"outputs": [],
"source": [
"test_a"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2d1ec12f",
"metadata": {},
"outputs": [],
"source": [
"prediction = [item[1] for item in algorithm.predict_proba(test_a)]\n",
"prediction_bin = [item[1] for item in algorithm.predict_proba(test_a).round()]\n",
"prediction_bin"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2d548b2a",
"metadata": {},
"outputs": [],
"source": [
"out = pd.DataFrame(prediction_bin)\n",
"out.to_csv('out_t.tsv', sep='\\t', index=False, header=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "84403848",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}