work in progress

This commit is contained in:
s440054 2022-04-04 16:03:59 +02:00
parent 5af6e29a07
commit a0a24dd6b5

View File

@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 18, "execution_count": 36,
"id": "21c9b695", "id": "21c9b695",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -20,7 +20,7 @@
"\n", "\n",
"def train_model(data, model):\n", "def train_model(data, model):\n",
" for _, row in data.iterrows():\n", " for _, row in data.iterrows():\n",
" words = nltk.word_tokenize(clean_text(row[\"final\"]))\n", " words = nltk.word_tokenize(clean_text(row[760]))\n",
" for w1, w2 in nltk.bigrams(words, pad_left=True, pad_right=True):\n", " for w1, w2 in nltk.bigrams(words, pad_left=True, pad_right=True):\n",
" if w1 and w2:\n", " if w1 and w2:\n",
" model[w2][w1] += 1\n", " model[w2][w1] += 1\n",
@ -36,7 +36,6 @@
"\n", "\n",
" total_prob = 0.0\n", " total_prob = 0.0\n",
" str_prediction = \"\"\n", " str_prediction = \"\"\n",
"\n",
" for word, prob in most_common.items():\n", " for word, prob in most_common.items():\n",
" total_prob += prob\n", " total_prob += prob\n",
" str_prediction += f\"{word}:{prob} \"\n", " str_prediction += f\"{word}:{prob} \"\n",
@ -129,7 +128,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 22,
"id": "7662d802", "id": "7662d802",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -150,19 +149,19 @@
" on_bad_lines='skip',\n", " on_bad_lines='skip',\n",
" header=None,\n", " header=None,\n",
" # names=out_cols,\n", " # names=out_cols,\n",
" quoting=csv.QUOTE_NONE,,\n", " quoting=csv.QUOTE_NONE,\n",
" encoding=\"utf-8\"\n", " encoding=\"utf-8\"\n",
")\n", ")\n",
"\n", "\n",
"train_data = data[[7, 6]]\n", "train_data = data[[7, 6]]\n",
"train_data = pd.concat([train_data, train_words], axis=1)\n", "train_data = pd.concat([train_data, train_words], axis=1)\n",
"\n", "\n",
"train_data[\"final\"] = train_data[7] + train_data[0] + train_data[6]\n" "train_data[760] = train_data[7] + train_data[0] + train_data[6]\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 23,
"id": "c3d2cfec", "id": "c3d2cfec",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -190,7 +189,7 @@
" <th>7</th>\n", " <th>7</th>\n",
" <th>6</th>\n", " <th>6</th>\n",
" <th>0</th>\n", " <th>0</th>\n",
" <th>final</th>\n", " <th>760</th>\n",
" </tr>\n", " </tr>\n",
" </thead>\n", " </thead>\n",
" <tbody>\n", " <tbody>\n",
@ -277,7 +276,7 @@
"</div>" "</div>"
], ],
"text/plain": [ "text/plain": [
" 7 \\\n", " 7 \\\n",
"0 said\\nit's all squash. The best I could get\\ni... \n", "0 said\\nit's all squash. The best I could get\\ni... \n",
"1 \\ninto a proper perspective with those\\nminor ... \n", "1 \\ninto a proper perspective with those\\nminor ... \n",
"2 all notU\\nashore and afloat arc subjects for I... \n", "2 all notU\\nashore and afloat arc subjects for I... \n",
@ -290,7 +289,7 @@
"432020 \\na note of Wood, Dialogue fc Co., for\\nc27,im... \n", "432020 \\na note of Wood, Dialogue fc Co., for\\nc27,im... \n",
"432021 3214c;do White at 3614c: Mixed Western at\\n331... \n", "432021 3214c;do White at 3614c: Mixed Western at\\n331... \n",
"\n", "\n",
" 6 0 \\\n", " 6 0 \\\n",
"0 came fiom the last place to this\\nplace, and t... lie \n", "0 came fiom the last place to this\\nplace, and t... lie \n",
"1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... himself \n", "1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... himself \n",
"2 \"Thera were in 1771 only aeventy-nine\\n*ub*erl... of \n", "2 \"Thera were in 1771 only aeventy-nine\\n*ub*erl... of \n",
@ -303,7 +302,7 @@
"432020 settlement with the department.\\nIt is also sh... for \n", "432020 settlement with the department.\\nIt is also sh... for \n",
"432021 Flour quotations—low extras at 1 R0®2 50;\\ncit... at \n", "432021 Flour quotations—low extras at 1 R0®2 50;\\ncit... at \n",
"\n", "\n",
" final \n", " 760 \n",
"0 said\\nit's all squash. The best I could get\\ni... \n", "0 said\\nit's all squash. The best I could get\\ni... \n",
"1 \\ninto a proper perspective with those\\nminor ... \n", "1 \\ninto a proper perspective with those\\nminor ... \n",
"2 all notU\\nashore and afloat arc subjects for I... \n", "2 all notU\\nashore and afloat arc subjects for I... \n",
@ -319,7 +318,7 @@
"[432022 rows x 4 columns]" "[432022 rows x 4 columns]"
] ]
}, },
"execution_count": 8, "execution_count": 23,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -330,38 +329,10 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 26,
"id": "bd92ba07", "id": "bd92ba07",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\842062938.py:47: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" data = pd.read_csv(\n",
"C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\842062938.py:47: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" data = pd.read_csv(\n"
]
},
{
"ename": "UnicodeEncodeError",
"evalue": "'charmap' codec can't encode character '\\u03b2' in position 21: character maps to <undefined>",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mUnicodeEncodeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32mc:\\Users\\Norbert\\code\\challenging-america-word-gap-prediction\\testing.ipynb Cell 7'\u001b[0m in \u001b[0;36m<cell line: 5>\u001b[1;34m()\u001b[0m\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/Norbert/code/challenging-america-word-gap-prediction/testing.ipynb#ch0000006?line=2'>3</a>\u001b[0m train_model(train_data, model)\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/Norbert/code/challenging-america-word-gap-prediction/testing.ipynb#ch0000006?line=3'>4</a>\u001b[0m predict_data(\u001b[39m\"\u001b[39m\u001b[39mdev-0/in.tsv.xz\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mdev-0/out.tsv\u001b[39m\u001b[39m\"\u001b[39m, model)\n\u001b[1;32m----> <a href='vscode-notebook-cell:/c%3A/Users/Norbert/code/challenging-america-word-gap-prediction/testing.ipynb#ch0000006?line=4'>5</a>\u001b[0m predict_data(\u001b[39m\"\u001b[39;49m\u001b[39mtest-A/in.tsv.xz\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mtest-A/out.tsv\u001b[39;49m\u001b[39m\"\u001b[39;49m, model)\n",
"\u001b[1;32mc:\\Users\\Norbert\\code\\challenging-america-word-gap-prediction\\testing.ipynb Cell 1'\u001b[0m in \u001b[0;36mpredict_data\u001b[1;34m(read_path, save_path, model)\u001b[0m\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/Norbert/code/challenging-america-word-gap-prediction/testing.ipynb#ch0000000?line=54'>55</a>\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/Norbert/code/challenging-america-word-gap-prediction/testing.ipynb#ch0000000?line=55'>56</a>\u001b[0m prediction \u001b[39m=\u001b[39m predict(words[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m], model)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/Norbert/code/challenging-america-word-gap-prediction/testing.ipynb#ch0000000?line=56'>57</a>\u001b[0m file\u001b[39m.\u001b[39;49mwrite(prediction \u001b[39m+\u001b[39;49m \u001b[39m\"\u001b[39;49m\u001b[39m\\n\u001b[39;49;00m\u001b[39m\"\u001b[39;49m)\n",
"File \u001b[1;32mC:\\Python310\\lib\\encodings\\cp1250.py:19\u001b[0m, in \u001b[0;36mIncrementalEncoder.encode\u001b[1;34m(self, input, final)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Python310/lib/encodings/cp1250.py?line=17'>18</a>\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mencode\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39minput\u001b[39m, final\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m):\n\u001b[1;32m---> <a href='file:///c%3A/Python310/lib/encodings/cp1250.py?line=18'>19</a>\u001b[0m \u001b[39mreturn\u001b[39;00m codecs\u001b[39m.\u001b[39;49mcharmap_encode(\u001b[39minput\u001b[39;49m,\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49merrors,encoding_table)[\u001b[39m0\u001b[39m]\n",
"\u001b[1;31mUnicodeEncodeError\u001b[0m: 'charmap' codec can't encode character '\\u03b2' in position 21: character maps to <undefined>"
]
}
],
"source": [ "source": [
"\n", "\n",
"model = defaultdict(lambda: defaultdict(lambda: 0))\n", "model = defaultdict(lambda: defaultdict(lambda: 0))\n",
@ -371,7 +342,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 15, "execution_count": 37,
"id": "ad23240e", "id": "ad23240e",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -379,7 +350,7 @@
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\842062938.py:47: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n", "C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\749044266.py:46: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n", "\n",
"\n", "\n",
" data = pd.read_csv(\n" " data = pd.read_csv(\n"
@ -392,7 +363,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 19, "execution_count": 38,
"id": "195cb6cf", "id": "195cb6cf",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -400,7 +371,7 @@
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\751703071.py:47: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n", "C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\749044266.py:46: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n", "\n",
"\n", "\n",
" data = pd.read_csv(\n" " data = pd.read_csv(\n"