work in progress

2022-04-04 16:03:59 +02:00 · 2022-04-04 16:03:59 +02:00 · a0a24dd6b5
commit a0a24dd6b5
parent 5af6e29a07
1 changed files with 17 additions and 46 deletions
--- a/testing.ipynb
+++ b/testing.ipynb
@@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 36,
   "id": "21c9b695",
   "metadata": {},
   "outputs": [],
@@ -20,7 +20,7 @@
    "\n",
    "def train_model(data, model):\n",
    "    for _, row in data.iterrows():\n",
-    "        words = nltk.word_tokenize(clean_text(row[\"final\"]))\n",
+    "        words = nltk.word_tokenize(clean_text(row[760]))\n",
    "        for w1, w2 in nltk.bigrams(words, pad_left=True, pad_right=True):\n",
    "            if w1 and w2:\n",
    "                model[w2][w1] += 1\n",
@@ -36,7 +36,6 @@
    "\n",
    "    total_prob = 0.0\n",
    "    str_prediction = \"\"\n",
    "\n",
    "    for word, prob in most_common.items():\n",
    "        total_prob += prob\n",
    "        str_prediction += f\"{word}:{prob} \"\n",
@@ -129,7 +128,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 22,
   "id": "7662d802",
   "metadata": {},
   "outputs": [],
@@ -150,19 +149,19 @@
    "    on_bad_lines='skip',\n",
    "    header=None,\n",
    "    # names=out_cols,\n",
-    "    quoting=csv.QUOTE_NONE,,\n",
+    "    quoting=csv.QUOTE_NONE,\n",
    "    encoding=\"utf-8\"\n",
    ")\n",
    "\n",
    "train_data = data[[7, 6]]\n",
    "train_data = pd.concat([train_data, train_words], axis=1)\n",
    "\n",
-    "train_data[\"final\"] = train_data[7] + train_data[0] + train_data[6]\n"
+    "train_data[760] = train_data[7] + train_data[0] + train_data[6]\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 23,
   "id": "c3d2cfec",
   "metadata": {},
   "outputs": [
@@ -190,7 +189,7 @@
       "      <th>7</th>\n",
       "      <th>6</th>\n",
       "      <th>0</th>\n",
-       "      <th>final</th>\n",
+       "      <th>760</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
@@ -277,7 +276,7 @@
       "</div>"
      ],
      "text/plain": [
-       "                                                        7  \\\n",
+       "                                                      7    \\\n",
       "0       said\\nit's all squash. The best I could get\\ni...   \n",
       "1       \\ninto a proper perspective with those\\nminor ...   \n",
       "2       all notU\\nashore and afloat arc subjects for I...   \n",
@@ -290,7 +289,7 @@
       "432020  \\na note of Wood, Dialogue fc Co., for\\nc27,im...   \n",
       "432021  3214c;do White at 3614c: Mixed Western at\\n331...   \n",
       "\n",
-       "                                                        6         0  \\\n",
+       "                                                      6         0    \\\n",
       "0       came fiom the last place to this\\nplace, and t...       lie   \n",
       "1       MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...   himself   \n",
       "2       \"Thera were in 1771 only aeventy-nine\\n*ub*erl...        of   \n",
@@ -303,7 +302,7 @@
       "432020  settlement with the department.\\nIt is also sh...       for   \n",
       "432021  Flour quotations—low extras at 1 R0®2 50;\\ncit...        at   \n",
       "\n",
-       "                                                    final  \n",
+       "                                                      760  \n",
       "0       said\\nit's all squash. The best I could get\\ni...  \n",
       "1       \\ninto a proper perspective with those\\nminor ...  \n",
       "2       all notU\\nashore and afloat arc subjects for I...  \n",
@@ -319,7 +318,7 @@
       "[432022 rows x 4 columns]"
      ]
     },
-     "execution_count": 8,
+     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -330,38 +329,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 26,
   "id": "bd92ba07",
   "metadata": {},
-   "outputs": [
+   "outputs": [],
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\842062938.py:47: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
-      "\n",
-      "\n",
-      "  data = pd.read_csv(\n",
-      "C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\842062938.py:47: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
-      "\n",
-      "\n",
-      "  data = pd.read_csv(\n"
-     ]
-    },
-    {
-     "ename": "UnicodeEncodeError",
-     "evalue": "'charmap' codec can't encode character '\\u03b2' in position 21: character maps to <undefined>",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mUnicodeEncodeError\u001b[0m                        Traceback (most recent call last)",
-      "\u001b[1;32mc:\\Users\\Norbert\\code\\challenging-america-word-gap-prediction\\testing.ipynb Cell 7'\u001b[0m in \u001b[0;36m<cell line: 5>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      <a href='vscode-notebook-cell:/c%3A/Users/Norbert/code/challenging-america-word-gap-prediction/testing.ipynb#ch0000006?line=2'>3</a>\u001b[0m train_model(train_data, model)\n\u001b[0;32m      <a href='vscode-notebook-cell:/c%3A/Users/Norbert/code/challenging-america-word-gap-prediction/testing.ipynb#ch0000006?line=3'>4</a>\u001b[0m predict_data(\u001b[39m\"\u001b[39m\u001b[39mdev-0/in.tsv.xz\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mdev-0/out.tsv\u001b[39m\u001b[39m\"\u001b[39m, model)\n\u001b[1;32m----> <a href='vscode-notebook-cell:/c%3A/Users/Norbert/code/challenging-america-word-gap-prediction/testing.ipynb#ch0000006?line=4'>5</a>\u001b[0m predict_data(\u001b[39m\"\u001b[39;49m\u001b[39mtest-A/in.tsv.xz\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mtest-A/out.tsv\u001b[39;49m\u001b[39m\"\u001b[39;49m, model)\n",
-      "\u001b[1;32mc:\\Users\\Norbert\\code\\challenging-america-word-gap-prediction\\testing.ipynb Cell 1'\u001b[0m in \u001b[0;36mpredict_data\u001b[1;34m(read_path, save_path, model)\u001b[0m\n\u001b[0;32m     <a href='vscode-notebook-cell:/c%3A/Users/Norbert/code/challenging-america-word-gap-prediction/testing.ipynb#ch0000000?line=54'>55</a>\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m     <a href='vscode-notebook-cell:/c%3A/Users/Norbert/code/challenging-america-word-gap-prediction/testing.ipynb#ch0000000?line=55'>56</a>\u001b[0m     prediction \u001b[39m=\u001b[39m predict(words[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m], model)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/Norbert/code/challenging-america-word-gap-prediction/testing.ipynb#ch0000000?line=56'>57</a>\u001b[0m file\u001b[39m.\u001b[39;49mwrite(prediction \u001b[39m+\u001b[39;49m \u001b[39m\"\u001b[39;49m\u001b[39m\\n\u001b[39;49;00m\u001b[39m\"\u001b[39;49m)\n",
-      "File \u001b[1;32mC:\\Python310\\lib\\encodings\\cp1250.py:19\u001b[0m, in \u001b[0;36mIncrementalEncoder.encode\u001b[1;34m(self, input, final)\u001b[0m\n\u001b[0;32m     <a href='file:///c%3A/Python310/lib/encodings/cp1250.py?line=17'>18</a>\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mencode\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39minput\u001b[39m, final\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m):\n\u001b[1;32m---> <a href='file:///c%3A/Python310/lib/encodings/cp1250.py?line=18'>19</a>\u001b[0m     \u001b[39mreturn\u001b[39;00m codecs\u001b[39m.\u001b[39;49mcharmap_encode(\u001b[39minput\u001b[39;49m,\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49merrors,encoding_table)[\u001b[39m0\u001b[39m]\n",
-      "\u001b[1;31mUnicodeEncodeError\u001b[0m: 'charmap' codec can't encode character '\\u03b2' in position 21: character maps to <undefined>"
-     ]
-    }
-   ],
   "source": [
    "\n",
    "model = defaultdict(lambda: defaultdict(lambda: 0))\n",
@@ -371,7 +342,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 37,
   "id": "ad23240e",
   "metadata": {},
   "outputs": [
@@ -379,7 +350,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\842062938.py:47: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
+      "C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\749044266.py:46: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
      "\n",
      "\n",
      "  data = pd.read_csv(\n"
@@ -392,7 +363,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 38,
   "id": "195cb6cf",
   "metadata": {},
   "outputs": [
@@ -400,7 +371,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\751703071.py:47: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
+      "C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\749044266.py:46: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
      "\n",
      "\n",
      "  data = pd.read_csv(\n"