Norbert Litkowski 2022-04-25 01:17:13 +02:00
parent 3e73ddf02d
commit d09c77e228
3 changed files with 18002 additions and 41 deletions

dev-0/out.tsv  Normal file  (10519 lines changed)

File diff suppressed because it is too large

run.ipynb  (110 lines changed)

@@ -14,7 +14,7 @@
{
"cell_type": "code",
"execution_count": 2,
"id": "032ba328",
"id": "41336a5e",
"metadata": {},
"outputs": [],
"source": [
@@ -24,7 +24,7 @@
{
"cell_type": "code",
"execution_count": 3,
"id": "e0d94073",
"id": "fe2e1dd3",
"metadata": {},
"outputs": [],
"source": [
@@ -34,7 +34,7 @@
{
"cell_type": "code",
"execution_count": 4,
"id": "7c055510",
"id": "4aad410d",
"metadata": {},
"outputs": [],
"source": [
@@ -44,7 +44,7 @@
{
"cell_type": "code",
"execution_count": 5,
"id": "bd81e581",
"id": "d6f0f760",
"metadata": {},
"outputs": [],
"source": [
@@ -54,7 +54,7 @@
{
"cell_type": "code",
"execution_count": 6,
"id": "0c4a5486",
"id": "02bda814",
"metadata": {},
"outputs": [],
"source": [
@@ -64,7 +64,7 @@
{
"cell_type": "code",
"execution_count": 7,
"id": "aec319cd",
"id": "e3128e14",
"metadata": {},
"outputs": [],
"source": [
@@ -74,7 +74,7 @@
{
"cell_type": "code",
"execution_count": 8,
"id": "9b794391",
"id": "fe2c05e4",
"metadata": {},
"outputs": [
{
@@ -106,7 +106,7 @@
{
"cell_type": "code",
"execution_count": 15,
"id": "f21d9139",
"id": "5b9b593c",
"metadata": {},
"outputs": [],
"source": [
@@ -118,7 +118,7 @@
{
"cell_type": "code",
"execution_count": 10,
"id": "362a6b83",
"id": "dd9e3fd6",
"metadata": {},
"outputs": [
{
@@ -142,7 +142,7 @@
{
"cell_type": "code",
"execution_count": 11,
"id": "456fa286",
"id": "f8cba81c",
"metadata": {},
"outputs": [],
"source": [
@@ -151,9 +151,11 @@
},
{
"cell_type": "code",
"execution_count": 14,
"id": "3eaaf27b",
"metadata": {},
"execution_count": 16,
"id": "2c3db836",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
@@ -161,23 +163,8 @@
"text": [
"Loading the LM will be faster if you build a binary file.\n",
"Reading /home/me/challenging-america-word-gap-prediction-kenlm/model.arpa\n",
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n"
]
},
{
"ename": "OSError",
"evalue": "Cannot read model './model.arpa' (End of file Byte: 0)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"File \u001b[0;32mkenlm.pyx:139\u001b[0m, in \u001b[0;36mkenlm.Model.__init__\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mRuntimeError\u001b[0m: End of file Byte: 0",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
"Input \u001b[0;32mIn [14]\u001b[0m, in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mkenlm\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mkenlm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mModel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m./model.arpa\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32mkenlm.pyx:142\u001b[0m, in \u001b[0;36mkenlm.Model.__init__\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mOSError\u001b[0m: Cannot read model './model.arpa' (End of file Byte: 0)"
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
"****************************************************************************************************\n"
]
}
],
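The cell above now reads ./model.arpa to completion; in the previous revision the same call died with "OSError: Cannot read model './model.arpa' (End of file Byte: 0)" because the ARPA file was empty or truncated. For reference, a minimal sketch of that loading step, assuming the kenlm Python bindings and a complete model.arpa in the working directory:

import kenlm

# Load the ARPA file directly, as the cell above appears to do; this is
# slow for large models, hence the "build a binary file" hint in the log.
model = kenlm.Model("./model.arpa")

# Optional speed-up (run once in a shell):
#   build_binary model.arpa model.binary
# and then load the binary instead:
#   model = kenlm.Model("./model.binary")

# Quick sanity check: KenLM returns a log10 probability for the string.
print(model.score("the quick brown fox", bos=True, eos=True))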
@@ -188,11 +175,42 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "b3a22dcd",
"execution_count": 23,
"id": "35fb75ee",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Defaulting to user installation because normal site-packages is not writeable\n",
"Collecting english_words\n",
" Downloading english-words-1.1.0.tar.gz (1.1 MB)\n",
" |████████████████████████████████| 1.1 MB 985 kB/s \n",
"\u001b[?25hBuilding wheels for collected packages: english-words\n",
" Building wheel for english-words (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for english-words: filename=english_words-1.1.0-py3-none-any.whl size=1106680 sha256=818b2393457321fc616c24465b2c7ce020853e36d9d8e1b03142a5e18076713d\n",
" Stored in directory: /home/me/.cache/pip/wheels/2c/48/9a/f697d8d989ca4e4c1060f9da73caea372d7e1b78402abff8bb\n",
"Successfully built english-words\n",
"Installing collected packages: english-words\n",
"Successfully installed english-words-1.1.0\n"
]
}
],
"source": [
"!pip install english_words"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "6adb5045",
"metadata": {},
"outputs": [],
"source": [
"from english_words import english_words_alpha_set\n",
"from math import log10\n",
"\n",
"def predict(before, after):\n",
" result = ''\n",
" prob = 0.0\n",
@@ -224,16 +242,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "913dcf54",
"execution_count": 27,
"id": "1ce44bdc",
"metadata": {},
"outputs": [],
"source": [
"from nltk import trigrams, word_tokenize\n",
"\n",
"def make_prediction(path, result_path):\n",
" data = pd.read_csv(path, sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
" pdata = get_csv(path)\n",
" with open(result_path, 'w', encoding='utf-8') as file_out:\n",
" for _, row in data.iterrows():\n",
" before, after = word_tokenize(data_preprocessing(str(row[6]))), word_tokenize(data_preprocessing(str(row[7])))\n",
" for _, row in pdata.iterrows():\n",
" before, after = word_tokenize(clean_text(str(row[6]))), word_tokenize(clean_text(str(row[7])))\n",
" if len(before) < 2 or len(after) < 2:\n",
" pred = prediction\n",
" else:\n",
@@ -243,8 +263,8 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "01c1b58d",
"execution_count": 28,
"id": "154c9ff2",
"metadata": {},
"outputs": [],
"source": [
@@ -253,13 +273,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "6d37cd24",
"execution_count": 29,
"id": "82d0cc3f",
"metadata": {},
"outputs": [],
"source": [
"make_prediction(\"test-A/in.tsv.xz\", \"test-A/out.tsv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cf45ce49",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

test-A/out.tsv  Normal file  (7414 lines changed)

File diff suppressed because it is too large