challenging-america-word-ga.../run.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f834096a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from utils import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "41336a5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = get_csv(\"train/in.tsv.xz\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "fe2e1dd3",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_labels = get_csv(\"train/expected.tsv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4aad410d",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = data[[6,7]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d6f0f760",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = pd.concat([train_data, train_labels], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "02bda814",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build one text per row from column 6, the label column 0 and column 7.\n",
    "# The parts are joined with spaces so the label stays a separate\n",
    "# whitespace-delimited token instead of being glued onto its neighbours.\n",
    "train_data[607] = train_data[6] + ' ' + train_data[0] + ' ' + train_data[7]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "e3128e14",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data[607] = train_data[607].apply(clean_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "fe2c05e4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         came fiom the last place to thisnplace and thi...\n",
       "1         mb boot political obeednattempt to imagine a p...\n",
       "2         thera were in   only aeventyninenuberlbers lo ...\n",
       "3         a gixnl man y niterertiiiv diiclosurs regard  ...\n",
       "4         tin  ub tv thf bbabbt qabjenmr schiffs tutemen...\n",
       "                                ...                        \n",
       "432017    sam clendenin bad a fancy for uinscience of me...\n",
       "432018    witahtt halting the party ware dilven to the s...\n",
       "432019    it was the last thing that either ofnthem expe...\n",
       "432020    settlement with the departmentnit is also show...\n",
       "432021    flour quotationslow extras at   r ®   ncity mi...\n",
       "Name: 607, Length: 432022, dtype: object"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data[607]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "5b9b593c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dump the cleaned training texts, one per line, into the file that lmplz reads.\n",
    "# UTF-8 is set explicitly so the result does not depend on the platform default.\n",
    "with open(\"tmp\", \"w\", encoding=\"utf-8\") as f:\n",
    "    for t in train_data[607]:\n",
    "        f.write(t + \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "dd9e3fd6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=== 1/5 Counting and sorting n-grams ===\n",
      "Reading /home/me/challenging-america-word-gap-prediction-kenlm/tmp\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "************************/home/me/kenlm/lm/builder/corpus_count.cc:179 in void lm::builder::{anonymous}::ComplainDisallowed(StringPiece, lm::WarningAction&) threw FormatLoadException.\n",
      "Special word <s> is not allowed in the corpus.  I plan to support models containing <unk> in the future.  Pass --skip_symbols to convert these symbols to whitespace.\n",
      "/bin/bash: linia 1:  5055 Przerwane               (zrzut pamięci) ../kenlm/build//bin/lmplz -o 4 < tmp > model.arpa\n"
     ]
    }
   ],
   "source": [
    "KENLM_BUILD_PATH = \"../kenlm/build/\"\n",
    "# --skip_symbols: lmplz aborts when the corpus contains special words such as <s>;\n",
    "# with this flag they are converted to whitespace instead.\n",
    "!$KENLM_BUILD_PATH/bin/lmplz -o 4 --skip_symbols < tmp > model.arpa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "f8cba81c",
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm tmp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "2c3db836",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading the LM will be faster if you build a binary file.\n",
      "Reading /home/me/challenging-america-word-gap-prediction-kenlm/model.arpa\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "****************************************************************************************************\n"
     ]
    }
   ],
   "source": [
    "import kenlm\n",
    "model = kenlm.Model(\"./model.arpa\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "35fb75ee",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Defaulting to user installation because normal site-packages is not writeable\n",
      "Collecting english_words\n",
      "  Downloading english-words-1.1.0.tar.gz (1.1 MB)\n",
      "     |████████████████████████████████| 1.1 MB 985 kB/s            \n",
      "\u001b[?25hBuilding wheels for collected packages: english-words\n",
      "  Building wheel for english-words (setup.py) ... \u001b[?25ldone\n",
      "\u001b[?25h  Created wheel for english-words: filename=english_words-1.1.0-py3-none-any.whl size=1106680 sha256=818b2393457321fc616c24465b2c7ce020853e36d9d8e1b03142a5e18076713d\n",
      "  Stored in directory: /home/me/.cache/pip/wheels/2c/48/9a/f697d8d989ca4e4c1060f9da73caea372d7e1b78402abff8bb\n",
      "Successfully built english-words\n",
      "Installing collected packages: english-words\n",
      "Successfully installed english-words-1.1.0\n"
     ]
    }
   ],
   "source": [
    "# Pinned to the release this notebook was run with; %pip targets the running kernel's environment.\n",
    "%pip install english_words==1.1.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "6adb5045",
   "metadata": {},
   "outputs": [],
   "source": [
    "from english_words import english_words_alpha_set\n",
    "from math import log10\n",
    "\n",
    "def predict(before, after, top_n=12, lm=None, vocabulary=None):\n",
    "    \"\"\"Rank candidate gap words with the language model and format the result.\n",
    "\n",
    "    Every candidate is scored as part of the text 'before word after' and the\n",
    "    top_n highest-scoring candidates are reported.\n",
    "\n",
    "    Args:\n",
    "        before: Text placed in front of the candidate word.\n",
    "        after: Text placed behind the candidate word.\n",
    "        top_n: Number of best candidates to report (default 12).\n",
    "        lm: Object with a kenlm-style score(text, bos=..., eos=...) method;\n",
    "            defaults to the notebook-level model.\n",
    "        vocabulary: Iterable of candidate words; defaults to\n",
    "            english_words_alpha_set.\n",
    "\n",
    "    Returns:\n",
    "        A string of 'word:score' pairs separated by spaces, best first,\n",
    "        followed by a final ':<log10(0.99)>' entry with an empty word.\n",
    "    \"\"\"\n",
    "    from heapq import nlargest\n",
    "\n",
    "    # Fall back to the notebook globals only when nothing was injected.\n",
    "    if lm is None:\n",
    "        lm = model\n",
    "    if vocabulary is None:\n",
    "        vocabulary = english_words_alpha_set\n",
    "\n",
    "    scored = (\n",
    "        (word, lm.score(' '.join([before, word, after]), bos=False, eos=False))\n",
    "        for word in vocabulary\n",
    "    )\n",
    "    # nlargest streams the candidates and returns the top_n best-first.\n",
    "    top = nlargest(top_n, scored, key=lambda pair: pair[1])\n",
    "\n",
    "    # NOTE(review): the scores are written exactly as returned by lm.score,\n",
    "    # without normalisation - confirm this is the format the evaluator expects.\n",
    "    parts = [f'{word}:{score}' for word, score in top]\n",
    "    parts.append(f':{log10(0.99)}')\n",
    "    return ' '.join(parts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "1ce44bdc",
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk import trigrams, word_tokenize\n",
    "\n",
    "def make_prediction(path, result_path):\n",
    "    \"\"\"Write one prediction line to result_path for every row loaded from path.\n",
    "\n",
    "    Columns 6 and 7 of each row are cleaned and tokenized. When both yield at\n",
    "    least two tokens, the last token of column 6 and the first token of\n",
    "    column 7 are passed to predict; otherwise the name prediction is written.\n",
    "    \"\"\"\n",
    "    rows = get_csv(path)\n",
    "    with open(result_path, 'w', encoding='utf-8') as sink:\n",
    "        for _, record in rows.iterrows():\n",
    "            left_tokens = word_tokenize(clean_text(str(record[6])))\n",
    "            right_tokens = word_tokenize(clean_text(str(record[7])))\n",
    "            if len(left_tokens) >= 2 and len(right_tokens) >= 2:\n",
    "                line = predict(left_tokens[-1], right_tokens[0])\n",
    "            else:\n",
    "                # NOTE(review): prediction is not defined in this notebook;\n",
    "                # presumably it comes from the utils star import - confirm.\n",
    "                line = prediction\n",
    "            sink.write(line + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "154c9ff2",
   "metadata": {},
   "outputs": [],
   "source": [
    "make_prediction(\"dev-0/in.tsv.xz\", \"dev-0/out.tsv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "82d0cc3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "make_prediction(\"test-A/in.tsv.xz\", \"test-A/out.tsv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf45ce49",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
arpa 2022-04-25 00:28:09 +02:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "code",`
			`"execution_count": 1,`
			`"id": "f834096a",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"import pandas as pd\n",`
			`"from utils import *"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 2,`
440054 2022-04-25 01:17:13 +02:00			`"id": "41336a5e",`
arpa 2022-04-25 00:28:09 +02:00			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"data = get_csv(\"train/in.tsv.xz\")"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 3,`
440054 2022-04-25 01:17:13 +02:00			`"id": "fe2e1dd3",`
arpa 2022-04-25 00:28:09 +02:00			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"train_labels = get_csv(\"train/expected.tsv\")"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 4,`
440054 2022-04-25 01:17:13 +02:00			`"id": "4aad410d",`
arpa 2022-04-25 00:28:09 +02:00			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"train_data = data[[6,7]]"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 5,`
440054 2022-04-25 01:17:13 +02:00			`"id": "d6f0f760",`
arpa 2022-04-25 00:28:09 +02:00			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"train_data = pd.concat([train_data, train_labels], axis=1)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 6,`
440054 2022-04-25 01:17:13 +02:00			`"id": "02bda814",`
arpa 2022-04-25 00:28:09 +02:00			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"train_data[607] = train_data[6] + train_data[0] + train_data[7]"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
wip 2022-04-25 00:52:20 +02:00			`"execution_count": 7,`
440054 2022-04-25 01:17:13 +02:00			`"id": "e3128e14",`
arpa 2022-04-25 00:28:09 +02:00			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"train_data[607] = train_data[607].apply(clean_text)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
wip 2022-04-25 00:52:20 +02:00			`"execution_count": 8,`
440054 2022-04-25 01:17:13 +02:00			`"id": "fe2c05e4",`
arpa 2022-04-25 00:28:09 +02:00			`"metadata": {},`
			`"outputs": [`
			`{`
			`"data": {`
			`"text/plain": [`
			`"0 came fiom the last place to thisnplace and thi...\n",`
			`"1 mb boot political obeednattempt to imagine a p...\n",`
			`"2 thera were in only aeventyninenuberlbers lo ...\n",`
			`"3 a gixnl man y niterertiiiv diiclosurs regard ...\n",`
			`"4 tin ub tv thf bbabbt qabjenmr schiffs tutemen...\n",`
			`" ... \n",`
			`"432017 sam clendenin bad a fancy for uinscience of me...\n",`
			`"432018 witahtt halting the party ware dilven to the s...\n",`
			`"432019 it was the last thing that either ofnthem expe...\n",`
			`"432020 settlement with the departmentnit is also show...\n",`
			`"432021 flour quotationslow extras at r ® ncity mi...\n",`
			`"Name: 607, Length: 432022, dtype: object"`
			`]`
			`},`
wip 2022-04-25 00:52:20 +02:00			`"execution_count": 8,`
arpa 2022-04-25 00:28:09 +02:00			`"metadata": {},`
			`"output_type": "execute_result"`
			`}`
			`],`
			`"source": [`
			`"train_data[607]"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
wip 2022-04-25 00:52:20 +02:00			`"execution_count": 15,`
440054 2022-04-25 01:17:13 +02:00			`"id": "5b9b593c",`
arpa 2022-04-25 00:28:09 +02:00			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"with open(\"tmp\", \"w+\") as f:\n",`
			`" for t in train_data[607]:\n",`
			`" f.write(t + \"\\n\")"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
wip 2022-04-25 00:52:20 +02:00			`"execution_count": 10,`
440054 2022-04-25 01:17:13 +02:00			`"id": "dd9e3fd6",`
arpa 2022-04-25 00:28:09 +02:00			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"=== 1/5 Counting and sorting n-grams ===\n",`
			`"Reading /home/me/challenging-america-word-gap-prediction-kenlm/tmp\n",`
			`"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",`
			`"************************/home/me/kenlm/lm/builder/corpus_count.cc:179 in void lm::builder::{anonymous}::ComplainDisallowed(StringPiece, lm::WarningAction&) threw FormatLoadException.\n",`
			`"Special word <s> is not allowed in the corpus. I plan to support models containing <unk> in the future. Pass --skip_symbols to convert these symbols to whitespace.\n",`
wip 2022-04-25 00:52:20 +02:00			`"/bin/bash: linia 1: 5055 Przerwane (zrzut pamięci) ../kenlm/build//bin/lmplz -o 4 < tmp > model.arpa\n"`
arpa 2022-04-25 00:28:09 +02:00			`]`
			`}`
			`],`
			`"source": [`
			`"KENLM_BUILD_PATH = \"../kenlm/build/\"\n",`
			`"!$KENLM_BUILD_PATH/bin/lmplz -o 4 < tmp > model.arpa"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
wip 2022-04-25 00:52:20 +02:00			`"execution_count": 11,`
440054 2022-04-25 01:17:13 +02:00			`"id": "f8cba81c",`
arpa 2022-04-25 00:28:09 +02:00			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"!rm tmp"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
440054 2022-04-25 01:17:13 +02:00			`"execution_count": 16,`
			`"id": "2c3db836",`
			`"metadata": {`
			`"scrolled": true`
			`},`
arpa 2022-04-25 00:28:09 +02:00			`"outputs": [`
			`{`
wip 2022-04-25 00:52:20 +02:00			`"name": "stderr",`
arpa 2022-04-25 00:28:09 +02:00			`"output_type": "stream",`
			`"text": [`
wip 2022-04-25 00:52:20 +02:00			`"Loading the LM will be faster if you build a binary file.\n",`
			`"Reading /home/me/challenging-america-word-gap-prediction-kenlm/model.arpa\n",`
440054 2022-04-25 01:17:13 +02:00			`"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",`
			`"****************************************************************************************************\n"`
arpa 2022-04-25 00:28:09 +02:00			`]`
			`}`
			`],`
			`"source": [`
wip 2022-04-25 00:52:20 +02:00			`"import kenlm\n",`
			`"model = kenlm.Model(\"./model.arpa\")"`
arpa 2022-04-25 00:28:09 +02:00			`]`
			`},`
			`{`
			`"cell_type": "code",`
440054 2022-04-25 01:17:13 +02:00			`"execution_count": 23,`
			`"id": "35fb75ee",`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"Defaulting to user installation because normal site-packages is not writeable\n",`
			`"Collecting english_words\n",`
			`" Downloading english-words-1.1.0.tar.gz (1.1 MB)\n",`
			`" \|████████████████████████████████\| 1.1 MB 985 kB/s \n",`
			`"\u001b[?25hBuilding wheels for collected packages: english-words\n",`
			`" Building wheel for english-words (setup.py) ... \u001b[?25ldone\n",`
			`"\u001b[?25h Created wheel for english-words: filename=english_words-1.1.0-py3-none-any.whl size=1106680 sha256=818b2393457321fc616c24465b2c7ce020853e36d9d8e1b03142a5e18076713d\n",`
			`" Stored in directory: /home/me/.cache/pip/wheels/2c/48/9a/f697d8d989ca4e4c1060f9da73caea372d7e1b78402abff8bb\n",`
			`"Successfully built english-words\n",`
			`"Installing collected packages: english-words\n",`
			`"Successfully installed english-words-1.1.0\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"!pip install english_words"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 24,`
			`"id": "6adb5045",`
arpa 2022-04-25 00:28:09 +02:00			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
440054 2022-04-25 01:17:13 +02:00			`"from english_words import english_words_alpha_set\n",`
			`"from math import log10\n",`
			`"\n",`
wip 2022-04-25 00:52:20 +02:00			`"def predict(before, after):\n",`
			`" result = ''\n",`
			`" prob = 0.0\n",`
			`" best = []\n",`
			`" for word in english_words_alpha_set:\n",`
			`" text = ' '.join([before, word, after])\n",`
			`" text_score = model.score(text, bos=False, eos=False)\n",`
			`" if len(best) < 12:\n",`
			`" best.append((word, text_score))\n",`
			`" else:\n",`
			`" is_better = False\n",`
			`" worst_score = None\n",`
			`" for score in best:\n",`
			`" if not worst_score:\n",`
			`" worst_score = score\n",`
			`" else:\n",`
			`" if worst_score[1] > score[1]:\n",`
			`" worst_score = score\n",`
			`" if worst_score[1] < text_score:\n",`
			`" best.remove(worst_score)\n",`
			`" best.append((word, text_score))\n",`
			`" probs = sorted(best, key=lambda tup: tup[1], reverse=True)\n",`
			`" pred_str = ''\n",`
			`" for word, prob in probs:\n",`
			`" pred_str += f'{word}:{prob} '\n",`
			`" pred_str += f':{log10(0.99)}'\n",`
			`" return pred_str"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
440054 2022-04-25 01:17:13 +02:00			`"execution_count": 27,`
			`"id": "1ce44bdc",`
wip 2022-04-25 00:52:20 +02:00			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
440054 2022-04-25 01:17:13 +02:00			`"from nltk import trigrams, word_tokenize\n",`
			`"\n",`
wip 2022-04-25 00:52:20 +02:00			`"def make_prediction(path, result_path):\n",`
440054 2022-04-25 01:17:13 +02:00			`" pdata = get_csv(path)\n",`
wip 2022-04-25 00:52:20 +02:00			`" with open(result_path, 'w', encoding='utf-8') as file_out:\n",`
440054 2022-04-25 01:17:13 +02:00			`" for _, row in pdata.iterrows():\n",`
			`" before, after = word_tokenize(clean_text(str(row[6]))), word_tokenize(clean_text(str(row[7])))\n",`
wip 2022-04-25 00:52:20 +02:00			`" if len(before) < 2 or len(after) < 2:\n",`
			`" pred = prediction\n",`
			`" else:\n",`
			`" pred = predict(before[-1], after[0])\n",`
			`" file_out.write(pred + '\\n')"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
440054 2022-04-25 01:17:13 +02:00			`"execution_count": 28,`
			`"id": "154c9ff2",`
wip 2022-04-25 00:52:20 +02:00			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"make_prediction(\"dev-0/in.tsv.xz\", \"dev-0/out.tsv\")"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
440054 2022-04-25 01:17:13 +02:00			`"execution_count": 29,`
			`"id": "82d0cc3f",`
wip 2022-04-25 00:52:20 +02:00			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"make_prediction(\"test-A/in.tsv.xz\", \"test-A/out.tsv\")"`
arpa 2022-04-25 00:28:09 +02:00			`]`
440054 2022-04-25 01:17:13 +02:00			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"id": "cf45ce49",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": []`
arpa 2022-04-25 00:28:09 +02:00			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "Python 3 (ipykernel)",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.10.4"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 5`
			`}`