2022-04-04 10:26:15 +02:00
{
"cells": [
{
"cell_type": "code",
2022-04-04 14:39:18 +02:00
"execution_count": 10,
2022-04-04 10:26:15 +02:00
"id": "21c9b695",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import csv\n",
"import regex as re\n",
"import nltk\n",
"from collections import Counter, defaultdict\n",
"import string\n",
"import unicodedata\n",
"\n",
"def clean_text(text):\n",
"    \"\"\"Normalise a raw text field before tokenisation.\n",
"\n",
"    The value is coerced to ``str`` and lower-cased. A hyphen followed by a\n",
"    literal backslash-n sequence is removed, every remaining literal\n",
"    backslash-n sequence becomes a space, and all Unicode punctuation\n",
"    characters (category P) are stripped.\n",
"    \"\"\"\n",
"    lowered = str(text).lower()\n",
"    dehyphenated = lowered.replace(\"-\\\\n\", \"\")\n",
"    single_line = dehyphenated.replace(\"\\\\n\", \" \")\n",
"    return re.sub(r\"\\p{P}\", \"\", single_line)\n",
"\n",
"def train_model(data, model):\n",
"    \"\"\"Fill ``model`` with relative frequencies of preceding words.\n",
"\n",
"    For every row of ``data`` the ``\"final\"`` text is cleaned and tokenised, and\n",
"    each adjacent pair ``(previous, following)`` increments\n",
"    ``model[following][previous]``. Afterwards the values stored under each key\n",
"    are divided by their sum. ``model`` is modified in place; nothing is\n",
"    returned.\n",
"    \"\"\"\n",
"    for _, record in data.iterrows():\n",
"        tokens = nltk.word_tokenize(clean_text(record[\"final\"]))\n",
"        padded_pairs = nltk.bigrams(tokens, pad_left=True, pad_right=True)\n",
"        for previous, following in padded_pairs:\n",
"            # Padding puts None at both edges; those pairs carry no information.\n",
"            if not (previous and following):\n",
"                continue\n",
"            model[following][previous] += 1\n",
"\n",
"    # Turn the raw counts under each key into fractions of that key's total.\n",
"    for preceding_counts in model.values():\n",
"        total_count = float(sum(preceding_counts.values()))\n",
"        for previous in preceding_counts:\n",
"            preceding_counts[previous] /= total_count\n",
"\n",
"\n",
"def predict(word, model):\n",
"    \"\"\"Format the most probable candidates stored under ``word`` in ``model``.\n",
"\n",
"    Args:\n",
"        word: Key to look up in ``model``.\n",
"        model: Mapping of ``key -> {candidate: probability}``.\n",
"\n",
"    Returns:\n",
"        Up to five space-separated ``candidate:probability`` pairs in descending\n",
"        probability, followed by a bare ``:p`` entry holding the remaining\n",
"        probability mass (never less than ``0.01``). A fixed fallback\n",
"        distribution is returned when ``word`` has no candidates.\n",
"    \"\"\"\n",
"    # Look the key up with .get(): indexing a defaultdict would insert an empty\n",
"    # entry for every unseen word, so predicting would silently grow the model.\n",
"    candidates = model.get(word) or {}\n",
"    most_common = Counter(dict(candidates)).most_common(5)\n",
"\n",
"    total_prob = 0.0\n",
"    str_prediction = \"\"\n",
"    for candidate, prob in most_common:\n",
"        total_prob += prob\n",
"        str_prediction += f\"{candidate}:{prob} \"\n",
"\n",
"    if not total_prob:\n",
"        return \"the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1\"\n",
"\n",
"    # Always leave at least 0.01 for words outside the listed candidates.\n",
"    remaining = 1 - total_prob\n",
"    if remaining >= 0.01:\n",
"        str_prediction += f\":{remaining}\"\n",
"    else:\n",
"        str_prediction += \":0.01\"\n",
"\n",
"    return str_prediction\n",
"\n",
"\n",
"def predict_data(read_path, save_path, model):\n",
"    \"\"\"Write one prediction line per row of a tab-separated input file.\n",
"\n",
"    Args:\n",
"        read_path: Tab-separated input file without a header row.\n",
"        save_path: Output file; it receives one prediction string per input row.\n",
"        model: Mapping passed through to ``predict``.\n",
"\n",
"    Rows whose column 7 yields fewer than three tokens get a fixed fallback\n",
"    distribution; otherwise the last token of column 7 is looked up in ``model``.\n",
"    \"\"\"\n",
"    fallback = \"the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1\"\n",
"    data = pd.read_csv(\n",
"        read_path,\n",
"        sep=\"\\t\",\n",
"        # Replacement for the deprecated error_bad_lines=False.\n",
"        on_bad_lines=\"skip\",\n",
"        header=None,\n",
"        quoting=csv.QUOTE_NONE,\n",
"        encoding=\"utf8\",\n",
"    )\n",
"    # Open the output explicitly as UTF-8: the platform default encoding (for\n",
"    # example cp1250 on Windows) cannot represent every token and makes\n",
"    # file.write raise UnicodeEncodeError.\n",
"    with open(save_path, \"w\", encoding=\"utf-8\") as file:\n",
"        for text in data[7]:\n",
"            words = nltk.word_tokenize(clean_text(text))\n",
"            if len(words) < 3:\n",
"                prediction = fallback\n",
"            else:\n",
"                # NOTE(review): train_model keys the model by the word that\n",
"                # follows the predicted one; confirm that the last token of\n",
"                # column 7 is the intended lookup key.\n",
"                prediction = predict(words[-1], model)\n",
"            file.write(prediction + \"\\n\")\n"
]
},
{
"cell_type": "code",
2022-04-04 14:39:18 +02:00
"execution_count": 11,
2022-04-04 10:26:15 +02:00
"id": "e39473e2",
"metadata": {},
"outputs": [],
"source": [
"# Column names of the input and expected-output files, one tab-separated line each.\n",
"with open(\"in-header.tsv\") as in_header:\n",
"    in_header_line = in_header.read().strip()\n",
"in_cols = in_header_line.split(\"\\t\")\n",
"\n",
"with open(\"out-header.tsv\") as out_header:\n",
"    out_header_line = out_header.read().strip()\n",
"out_cols = out_header_line.split(\"\\t\")"
]
},
{
"cell_type": "code",
2022-04-04 14:39:18 +02:00
"execution_count": 12,
2022-04-04 10:26:15 +02:00
"id": "bde510c9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['FileId', 'Year', 'LeftContext', 'RightContext']"
]
},
2022-04-04 14:39:18 +02:00
"execution_count": 12,
2022-04-04 10:26:15 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"in_cols"
]
},
{
"cell_type": "code",
2022-04-04 14:39:18 +02:00
"execution_count": 13,
2022-04-04 10:26:15 +02:00
"id": "0e8b31dd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Word']"
]
},
2022-04-04 14:39:18 +02:00
"execution_count": 13,
2022-04-04 10:26:15 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"out_cols"
]
},
{
"cell_type": "code",
2022-04-04 14:39:18 +02:00
"execution_count": 7,
2022-04-04 10:26:15 +02:00
"id": "7662d802",
"metadata": {},
"outputs": [],
"source": [
"# Training inputs: tab-separated, no header row, quote characters kept literal.\n",
"data = pd.read_csv(\n",
"    \"train/in.tsv.xz\",\n",
"    sep=\"\\t\",\n",
"    on_bad_lines=\"skip\",\n",
"    header=None,\n",
"    # names=in_cols,\n",
"    quoting=csv.QUOTE_NONE,\n",
")\n",
"\n",
"# Expected gap word for each training row.\n",
"train_words = pd.read_csv(\n",
"    \"train/expected.tsv\",\n",
"    sep=\"\\t\",\n",
"    on_bad_lines=\"skip\",\n",
"    header=None,\n",
"    # names=out_cols,\n",
"    quoting=csv.QUOTE_NONE,\n",
")\n",
"\n",
"# Columns 7 and 6 of the inputs side by side with column 0 of the expected words.\n",
"train_data = pd.concat([data[[7, 6]], train_words], axis=1)\n",
"\n",
"\n",
"def _as_text(column):\n",
"    \"\"\"Return ``column`` as strings, with missing values as empty strings.\"\"\"\n",
"    return column.fillna(\"\").astype(str)\n",
"\n",
"\n",
"# Join the pieces with spaces so the gap word is not glued onto the adjacent\n",
"# context tokens. Missing fields become empty strings, because concatenating a\n",
"# NaN would turn the whole value into NaN (later tokenised as \"nan\").\n",
"# NOTE(review): the pieces are joined as column 7 + word + column 6; confirm\n",
"# that this is the intended reading order of the two contexts.\n",
"train_data[\"final\"] = (\n",
"    _as_text(train_data[7]) + \" \" + _as_text(train_data[0]) + \" \" + _as_text(train_data[6])\n",
")\n"
]
},
{
"cell_type": "code",
2022-04-04 14:39:18 +02:00
"execution_count": 8,
2022-04-04 10:26:15 +02:00
"id": "c3d2cfec",
"metadata": {},
2022-04-04 14:39:18 +02:00
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>7</th>\n",
" <th>6</th>\n",
" <th>0</th>\n",
" <th>final</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>said\\nit's all squash. The best I could get\\ni...</td>\n",
" <td>came fiom the last place to this\\nplace, and t...</td>\n",
" <td>lie</td>\n",
" <td>said\\nit's all squash. The best I could get\\ni...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>\\ninto a proper perspective with those\\nminor ...</td>\n",
" <td>MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...</td>\n",
" <td>himself</td>\n",
" <td>\\ninto a proper perspective with those\\nminor ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>all notU\\nashore and afloat arc subjects for I...</td>\n",
" <td>\"Thera were in 1771 only aeventy-nine\\n*ub*erl...</td>\n",
" <td>of</td>\n",
" <td>all notU\\nashore and afloat arc subjects for I...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ceucju l< d no; <o waste it nud so\\nsunk it in...</td>\n",
" <td>A gixnl man y nitereRtiiiv dii-clos-\\nur«s reg...</td>\n",
" <td>ably</td>\n",
" <td>ceucju l< d no; <o waste it nud so\\nsunk it in...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ascertained w? OCt the COOltS of ibis\\nletale ...</td>\n",
" <td>Tin: 188UB TV THF BBABBT QABJE\\nMr. Schiffs *t...</td>\n",
" <td>j</td>\n",
" <td>ascertained w? OCt the COOltS of ibis\\nletale ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>432017</th>\n",
" <td>\\nSam was arrested.\\nThe case excited a great ...</td>\n",
" <td>Sam Clendenin bad a fancy for Ui«\\nscience of ...</td>\n",
" <td>and</td>\n",
" <td>\\nSam was arrested.\\nThe case excited a great ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>432018</th>\n",
" <td>through the alnp the »Uitors laapeeeed tia.»\\n...</td>\n",
" <td>Wita.htt halting the party ware dilven to the ...</td>\n",
" <td>paasliic</td>\n",
" <td>through the alnp the »Uitors laapeeeed tia.»\\n...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>432019</th>\n",
" <td>Agua Negra across the line.\\nIt was a grim pla...</td>\n",
" <td>It was the last thing that either of\\nthem exp...</td>\n",
" <td>for</td>\n",
" <td>Agua Negra across the line.\\nIt was a grim pla...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>432020</th>\n",
" <td>\\na note of Wood, Dialogue fc Co., for\\nc27,im...</td>\n",
" <td>settlement with the department.\\nIt is also sh...</td>\n",
" <td>for</td>\n",
" <td>\\na note of Wood, Dialogue fc Co., for\\nc27,im...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>432021</th>\n",
" <td>3214c;do White at 3614c: Mixed Western at\\n331...</td>\n",
" <td>Flour quotations—low extras at 1 R0®2 50;\\ncit...</td>\n",
" <td>at</td>\n",
" <td>3214c;do White at 3614c: Mixed Western at\\n331...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>432022 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" 7 \\\n",
"0 said\\nit's all squash. The best I could get\\ni... \n",
"1 \\ninto a proper perspective with those\\nminor ... \n",
"2 all notU\\nashore and afloat arc subjects for I... \n",
"3 ceucju l< d no; <o waste it nud so\\nsunk it in... \n",
"4 ascertained w? OCt the COOltS of ibis\\nletale ... \n",
"... ... \n",
"432017 \\nSam was arrested.\\nThe case excited a great ... \n",
"432018 through the alnp the »Uitors laapeeeed tia.»\\n... \n",
"432019 Agua Negra across the line.\\nIt was a grim pla... \n",
"432020 \\na note of Wood, Dialogue fc Co., for\\nc27,im... \n",
"432021 3214c;do White at 3614c: Mixed Western at\\n331... \n",
"\n",
" 6 0 \\\n",
"0 came fiom the last place to this\\nplace, and t... lie \n",
"1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... himself \n",
"2 \"Thera were in 1771 only aeventy-nine\\n*ub*erl... of \n",
"3 A gixnl man y nitereRtiiiv dii-clos-\\nur«s reg... ably \n",
"4 Tin: 188UB TV THF BBABBT QABJE\\nMr. Schiffs *t... j \n",
"... ... ... \n",
"432017 Sam Clendenin bad a fancy for Ui«\\nscience of ... and \n",
"432018 Wita.htt halting the party ware dilven to the ... paasliic \n",
"432019 It was the last thing that either of\\nthem exp... for \n",
"432020 settlement with the department.\\nIt is also sh... for \n",
"432021 Flour quotations—low extras at 1 R0®2 50;\\ncit... at \n",
"\n",
" final \n",
"0 said\\nit's all squash. The best I could get\\ni... \n",
"1 \\ninto a proper perspective with those\\nminor ... \n",
"2 all notU\\nashore and afloat arc subjects for I... \n",
"3 ceucju l< d no; <o waste it nud so\\nsunk it in... \n",
"4 ascertained w? OCt the COOltS of ibis\\nletale ... \n",
"... ... \n",
"432017 \\nSam was arrested.\\nThe case excited a great ... \n",
"432018 through the alnp the »Uitors laapeeeed tia.»\\n... \n",
"432019 Agua Negra across the line.\\nIt was a grim pla... \n",
"432020 \\na note of Wood, Dialogue fc Co., for\\nc27,im... \n",
"432021 3214c;do White at 3614c: Mixed Western at\\n331... \n",
"\n",
"[432022 rows x 4 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
2022-04-04 10:26:15 +02:00
"source": [
"train_data"
]
},
{
"cell_type": "code",
2022-04-04 14:39:18 +02:00
"execution_count": 14,
2022-04-04 10:26:15 +02:00
"id": "bd92ba07",
"metadata": {},
2022-04-04 14:39:18 +02:00
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\842062938.py:47: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" data = pd.read_csv(\n",
"C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\842062938.py:47: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" data = pd.read_csv(\n"
]
},
{
"ename": "UnicodeEncodeError",
"evalue": "'charmap' codec can't encode character '\\u03b2' in position 21: character maps to <undefined>",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mUnicodeEncodeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32mc:\\Users\\Norbert\\code\\challenging-america-word-gap-prediction\\testing.ipynb Cell 7'\u001b[0m in \u001b[0;36m<cell line: 5>\u001b[1;34m()\u001b[0m\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/Norbert/code/challenging-america-word-gap-prediction/testing.ipynb#ch0000006?line=2'>3</a>\u001b[0m train_model(train_data, model)\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/Norbert/code/challenging-america-word-gap-prediction/testing.ipynb#ch0000006?line=3'>4</a>\u001b[0m predict_data(\u001b[39m\"\u001b[39m\u001b[39mdev-0/in.tsv.xz\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mdev-0/out.tsv\u001b[39m\u001b[39m\"\u001b[39m, model)\n\u001b[1;32m----> <a href='vscode-notebook-cell:/c%3A/Users/Norbert/code/challenging-america-word-gap-prediction/testing.ipynb#ch0000006?line=4'>5</a>\u001b[0m predict_data(\u001b[39m\"\u001b[39;49m\u001b[39mtest-A/in.tsv.xz\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mtest-A/out.tsv\u001b[39;49m\u001b[39m\"\u001b[39;49m, model)\n",
"\u001b[1;32mc:\\Users\\Norbert\\code\\challenging-america-word-gap-prediction\\testing.ipynb Cell 1'\u001b[0m in \u001b[0;36mpredict_data\u001b[1;34m(read_path, save_path, model)\u001b[0m\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/Norbert/code/challenging-america-word-gap-prediction/testing.ipynb#ch0000000?line=54'>55</a>\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/Norbert/code/challenging-america-word-gap-prediction/testing.ipynb#ch0000000?line=55'>56</a>\u001b[0m prediction \u001b[39m=\u001b[39m predict(words[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m], model)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/Norbert/code/challenging-america-word-gap-prediction/testing.ipynb#ch0000000?line=56'>57</a>\u001b[0m file\u001b[39m.\u001b[39;49mwrite(prediction \u001b[39m+\u001b[39;49m \u001b[39m\"\u001b[39;49m\u001b[39m\\n\u001b[39;49;00m\u001b[39m\"\u001b[39;49m)\n",
"File \u001b[1;32mC:\\Python310\\lib\\encodings\\cp1250.py:19\u001b[0m, in \u001b[0;36mIncrementalEncoder.encode\u001b[1;34m(self, input, final)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Python310/lib/encodings/cp1250.py?line=17'>18</a>\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mencode\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39minput\u001b[39m, final\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m):\n\u001b[1;32m---> <a href='file:///c%3A/Python310/lib/encodings/cp1250.py?line=18'>19</a>\u001b[0m \u001b[39mreturn\u001b[39;00m codecs\u001b[39m.\u001b[39;49mcharmap_encode(\u001b[39minput\u001b[39;49m,\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49merrors,encoding_table)[\u001b[39m0\u001b[39m]\n",
"\u001b[1;31mUnicodeEncodeError\u001b[0m: 'charmap' codec can't encode character '\\u03b2' in position 21: character maps to <undefined>"
]
}
],
2022-04-04 10:26:15 +02:00
"source": [
"\n",
"# Nested mapping: following word -> preceding word -> count (0 by default);\n",
"# train_model fills it in place and normalises the counts.\n",
"model = defaultdict(lambda: defaultdict(int))\n",
"\n",
"train_model(train_data, model)"
2022-04-04 10:26:15 +02:00
]
},
{
"cell_type": "code",
2022-04-04 14:39:18 +02:00
"execution_count": 15,
2022-04-04 10:26:15 +02:00
"id": "ad23240e",
"metadata": {},
2022-04-04 14:39:18 +02:00
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\842062938.py:47: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" data = pd.read_csv(\n"
]
}
],
"source": [
"predict_data(\"dev-0/in.tsv.xz\", \"dev-0/out.tsv\", model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "195cb6cf",
"metadata": {},
2022-04-04 10:26:15 +02:00
"outputs": [],
2022-04-04 14:39:18 +02:00
"source": [
"predict_data(\"test-A/in.tsv.xz\", \"test-A/out.tsv\", model)"
]
2022-04-04 10:26:15 +02:00
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
2022-04-04 14:39:18 +02:00
"version": "3.10.2"
2022-04-04 10:26:15 +02:00
}
},
"nbformat": 4,
"nbformat_minor": 5
}