{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f834096a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "from math import log10\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# get_csv, clean_text, data_preprocessing, word_tokenize and english_words_alpha_set\n",
    "# are assumed to be provided by the local utils module\n",
    "from utils import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "032ba328",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = get_csv(\"train/in.tsv.xz\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e0d94073",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_labels = get_csv(\"train/expected.tsv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "7c055510",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = data[[6, 7]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "bd81e581",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = pd.concat([train_data, train_labels], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "0c4a5486",
   "metadata": {},
   "outputs": [],
   "source": [
    "# column 607: left context (6) + gap word (0) + right context (7), space-separated\n",
    "train_data[607] = train_data[6] + \" \" + train_data[0] + \" \" + train_data[7]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "aec319cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data[607] = train_data[607].apply(clean_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "9b794391",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0 came fiom the last place to thisnplace and thi...\n",
       "1 mb boot political obeednattempt to imagine a p...\n",
       "2 thera were in only aeventyninenuberlbers lo ...\n",
       "3 a gixnl man y niterertiiiv diiclosurs regard ...\n",
       "4 tin ub tv thf bbabbt qabjenmr schiffs tutemen...\n",
       " ... \n",
       "432017 sam clendenin bad a fancy for uinscience of me...\n",
       "432018 witahtt halting the party ware dilven to the s...\n",
       "432019 it was the last thing that either ofnthem expe...\n",
       "432020 settlement with the departmentnit is also show...\n",
       "432021 flour quotationslow extras at r ® ncity mi...\n",
       "Name: 607, Length: 432022, dtype: object"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data[607]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "f21d9139",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"tmp\", \"w+\") as f:\n",
    "    for t in train_data[607]:\n",
    "        f.write(t + \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "362a6b83",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=== 1/5 Counting and sorting n-grams ===\n",
      "Reading /home/me/challenging-america-word-gap-prediction-kenlm/tmp\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "************************/home/me/kenlm/lm/builder/corpus_count.cc:179 in void lm::builder::{anonymous}::ComplainDisallowed(StringPiece, lm::WarningAction&) threw FormatLoadException.\n",
      "Special word <s> is not allowed in the corpus. I plan to support models containing <unk> in the future. Pass --skip_symbols to convert these symbols to whitespace.\n",
      "/bin/bash: line 1: 5055 Aborted (core dumped) ../kenlm/build//bin/lmplz -o 4 < tmp > model.arpa\n"
     ]
    }
   ],
   "source": [
    "KENLM_BUILD_PATH = \"../kenlm/build\"\n",
    "# --skip_symbols converts stray <s>/</s>/<unk> tokens in the corpus to whitespace,\n",
    "# which is the fix suggested by the FormatLoadException above\n",
    "!$KENLM_BUILD_PATH/bin/lmplz -o 4 --skip_symbols < tmp > model.arpa"
   ]
  },
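  {
   "cell_type": "markdown",
   "id": "a1b2c3d4",
   "metadata": {},
   "source": [
    "Optional follow-up (a minimal sketch): once `lmplz` succeeds, the ARPA file can be compiled into KenLM's binary format with the `build_binary` tool, which loads much faster; the output name `model.binary` is just an assumption used here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b2c3d4e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# compile the ARPA model into KenLM's binary format for faster loading\n",
    "!$KENLM_BUILD_PATH/bin/build_binary model.arpa model.binary"
   ]
  },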
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "456fa286",
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm tmp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "3eaaf27b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading the LM will be faster if you build a binary file.\n",
      "Reading /home/me/challenging-america-word-gap-prediction-kenlm/model.arpa\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n"
     ]
    },
    {
     "ename": "OSError",
     "evalue": "Cannot read model './model.arpa' (End of file Byte: 0)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
      "File \u001b[0;32mkenlm.pyx:139\u001b[0m, in \u001b[0;36mkenlm.Model.__init__\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;31mRuntimeError\u001b[0m: End of file Byte: 0",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
      "Input \u001b[0;32mIn [14]\u001b[0m, in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mkenlm\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mkenlm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mModel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m./model.arpa\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32mkenlm.pyx:142\u001b[0m, in \u001b[0;36mkenlm.Model.__init__\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;31mOSError\u001b[0m: Cannot read model './model.arpa' (End of file Byte: 0)"
     ]
    }
   ],
   "source": [
    "import kenlm\n",
    "\n",
    "model = kenlm.Model(\"./model.arpa\")"
   ]
  },
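  {
   "cell_type": "markdown",
   "id": "c3d4e5f6",
   "metadata": {},
   "source": [
    "A quick sanity check, assuming the model loaded successfully: `kenlm.Model.score` returns the log10 probability of a sentence and `model.order` reports the n-gram order. The test sentence below is arbitrary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4e5f6a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# n-gram order of the loaded model and log10 probability of an arbitrary sentence\n",
    "print(model.order)\n",
    "print(model.score('this is a test', bos=True, eos=True))"
   ]
  },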
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b3a22dcd",
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict(before, after):\n",
    "    # keep the 12 candidate gap words with the highest KenLM scores\n",
    "    best = []\n",
    "    for word in english_words_alpha_set:\n",
    "        text = ' '.join([before, word, after])\n",
    "        text_score = model.score(text, bos=False, eos=False)\n",
    "        if len(best) < 12:\n",
    "            best.append((word, text_score))\n",
    "        else:\n",
    "            worst = min(best, key=lambda tup: tup[1])\n",
    "            if worst[1] < text_score:\n",
    "                best.remove(worst)\n",
    "                best.append((word, text_score))\n",
    "    probs = sorted(best, key=lambda tup: tup[1], reverse=True)\n",
    "    pred_str = ''\n",
    "    for word, prob in probs:\n",
    "        pred_str += f'{word}:{prob} '\n",
    "    pred_str += f':{log10(0.99)}'\n",
    "    return pred_str"
   ]
  },
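  {
   "cell_type": "markdown",
   "id": "e5f6a7b8",
   "metadata": {},
   "source": [
    "A small usage sketch for `predict`: it takes the word directly before the gap and the word directly after it, and returns the 12 best candidates as `word:logprob` pairs followed by a catch-all `:logprob` entry. The context words below are arbitrary examples."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6a7b8c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# example: rank candidate words for the gap between 'the' and 'states'\n",
    "print(predict('the', 'states'))"
   ]
  },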
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "913dcf54",
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_prediction(path, result_path):\n",
    "    data = pd.read_csv(path, sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
    "    with open(result_path, 'w', encoding='utf-8') as file_out:\n",
    "        for _, row in data.iterrows():\n",
    "            before, after = word_tokenize(data_preprocessing(str(row[6]))), word_tokenize(data_preprocessing(str(row[7])))\n",
    "            if len(before) < 2 or len(after) < 2:\n",
    "                # fallback when the context is too short to score:\n",
    "                # emit only the catch-all entry that predict() appends\n",
    "                pred = f':{log10(0.99)}'\n",
    "            else:\n",
    "                pred = predict(before[-1], after[0])\n",
    "            file_out.write(pred + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "01c1b58d",
   "metadata": {},
   "outputs": [],
   "source": [
    "make_prediction(\"dev-0/in.tsv.xz\", \"dev-0/out.tsv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d37cd24",
   "metadata": {},
   "outputs": [],
   "source": [
    "make_prediction(\"test-A/in.tsv.xz\", \"test-A/out.tsv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}