{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 184,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from itertools import islice\n",
    "from collections import Counter\n",
    "import pandas as pd\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 209,
   "metadata": {},
   "outputs": [],
   "source": [
    "import lzma\n",
    "from collections import Counter, OrderedDict\n",
    "import matplotlib.pyplot as plt\n",
    "from math import log\n",
    "import re\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 200,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"train/in.tsv\", encoding='utf8', mode=\"rt\") as file:\n",
    "    a = file.readlines()\n",
    "\n",
    "a = [line.split(\"\\t\") for line in a]\n",
    "text = \" \".join([line[-2] + \" \" + line[-1] for line in a])\n",
    "text = re.sub(r\"\\\\+n\", \" \", text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "del a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 199,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "19560075"
      ]
     },
     "execution_count": 199,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "words = re.findall(\"\\w+\", text)\n",
    "bigram_counter = Counter(zip(words, islice(words, 1, None)))\n",
    "bigram_counter = dict(sorted(bigram_counter.items(), key=lambda item: item[1], reverse=True))\n",
    "\n",
    "del words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "bigram_counter_short = {}\n",
    "for key, value in bigram_counter.items():\n",
    "    if value > 5:\n",
    "        bigram_counter_short[key] = value\n",
    "\n",
    "bigram_counter = bigram_counter_short\n",
    "del bigram_counter_short"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 201,
   "metadata": {},
   "outputs": [],
   "source": [
    "unigram_counter = Counter(text.split(' '))\n",
    "unigram_counter = unigram_counter.most_common(10_000)\n",
    "# unigram_counter = dict(sorted(unigram_counter.items(), key=lambda item: item[1]), reverse=True)\n",
    "unigram_counter_list = unigram_counter\n",
    "unigram_counter = dict(unigram_counter) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "# with open(\"dev-0/in.tsv\", encoding='utf8', mode=\"rt\") as file:\n",
    "#     a = file.readlines()\n",
    "\n",
    "# a = [line.split(\"\\t\") for line in a]\n",
    "# text = \" \".join([line[-2] + \" \" + line[-1] for line in a])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\micha\\AppData\\Local\\Temp\\ipykernel_14716\\2692353843.py:1: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
      "\n",
      "\n",
      "  test_data = pd.read_csv('dev-0/in.tsv', sep='\\t', error_bad_lines=False, header=None)\n",
      "Skipping line 654: expected 8 fields, saw 9\n",
      "Skipping line 2220: expected 8 fields, saw 9\n",
      "\n"
     ]
    }
   ],
   "source": [
    "test_data = pd.read_csv('dev-0/in.tsv', sep='\\t', error_bad_lines=False, header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>662ed514d56f7bc8743aa6f23794c731</td>\n",
       "      <td>LINCOLN TELEGRAPH</td>\n",
       "      <td>ChronAm</td>\n",
       "      <td>1838.834247</td>\n",
       "      <td>43.910755</td>\n",
       "      <td>-69.820862</td>\n",
       "      <td>rin 11K ui i rsognfd inlriliinnts i&gt;r the town...</td>\n",
       "      <td>Northeasterly hv the head of said .^corn’s\\nan...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0c3ac40edfe6a167ab692fdb9219a93c</td>\n",
       "      <td>THE WYANDOT PIONEER</td>\n",
       "      <td>ChronAm</td>\n",
       "      <td>1857.691781</td>\n",
       "      <td>40.827279</td>\n",
       "      <td>-83.281309</td>\n",
       "      <td>ton County feel an interest in. tn great is-\\n...</td>\n",
       "      <td>and design,\\nand hence, every election, be it ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>b298097f3afd2f8c06b61fa2308ec725</td>\n",
       "      <td>RICHMOND ENQUIRER</td>\n",
       "      <td>ChronAm</td>\n",
       "      <td>1847.012329</td>\n",
       "      <td>37.538509</td>\n",
       "      <td>-77.434280</td>\n",
       "      <td>But at our own doors we have evidence ten\\ning...</td>\n",
       "      <td>Democrat\\nenlisting lor the Mexican wvir. They...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1d50cf957a6a9cbbe0ee7773a72a76d4</td>\n",
       "      <td>RAFTSMAN'S JOURNAL</td>\n",
       "      <td>ChronAm</td>\n",
       "      <td>1867.541096</td>\n",
       "      <td>41.027280</td>\n",
       "      <td>-78.439188</td>\n",
       "      <td>The wonderful Flexibility and great comfort\\na...</td>\n",
       "      <td>will preserve their perfect aud grace\\nful sha...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5a7297b76de00c7d9e1fb159384238c0</td>\n",
       "      <td>RICHMOND ENQUIRER</td>\n",
       "      <td>ChronAm</td>\n",
       "      <td>1826.083562</td>\n",
       "      <td>37.538509</td>\n",
       "      <td>-77.434280</td>\n",
       "      <td>Illinois.—The Legislature met at Ya:.ualia\\non...</td>\n",
       "      <td>to run the line between Arkansas and\\nthe’Vhnc...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10397</th>\n",
       "      <td>02e9e019df1992daeafe82b041d94aac</td>\n",
       "      <td>WATERBURY EVENING DEMOCRAT</td>\n",
       "      <td>ChronAm</td>\n",
       "      <td>1888.949454</td>\n",
       "      <td>41.558153</td>\n",
       "      <td>-73.051497</td>\n",
       "      <td>the Fitzgeralds should perish like a common\\nt...</td>\n",
       "      <td>Brian, but there was also a touch\\nof self int...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10398</th>\n",
       "      <td>74fa28868cbc998d15c242baea4e1faa</td>\n",
       "      <td>RICHMOND ENQUIRER</td>\n",
       "      <td>ChronAm</td>\n",
       "      <td>1836.012295</td>\n",
       "      <td>37.538509</td>\n",
       "      <td>-77.434280</td>\n",
       "      <td>herd, so soon as he conveniently can, after th...</td>\n",
       "      <td>Court dotli lurlher adjudge, order, and decree...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10399</th>\n",
       "      <td>147be715e90bac01c55969d90254f29e</td>\n",
       "      <td>EVENING CAPITAL</td>\n",
       "      <td>ChronAm</td>\n",
       "      <td>1907.004110</td>\n",
       "      <td>38.978640</td>\n",
       "      <td>-76.492786</td>\n",
       "      <td>Drs. James J. Murphy, of Annapo-\\nlis, and Tho...</td>\n",
       "      <td>in the matter\\nor show any inclination to help...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10400</th>\n",
       "      <td>1357f703947d912523ac23540cb99a0f</td>\n",
       "      <td>RAFTSMAN'S JOURNAL</td>\n",
       "      <td>ChronAm</td>\n",
       "      <td>1868.077869</td>\n",
       "      <td>41.027280</td>\n",
       "      <td>-78.439188</td>\n",
       "      <td>the soles of the feet spikes or corks are fixe...</td>\n",
       "      <td>\\nIn order to prevent \"the giant\" from\\nfright...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10401</th>\n",
       "      <td>23346293dbc949ee2edc3380db29f33b</td>\n",
       "      <td>THE DEMOCRATIC WHIG</td>\n",
       "      <td>ChronAm</td>\n",
       "      <td>1843.760274</td>\n",
       "      <td>33.495674</td>\n",
       "      <td>-88.427263</td>\n",
       "      <td>tion which his opponent had taken, and whilst\\...</td>\n",
       "      <td>come criterion, by which to judge\\nof a nation...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10402 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                      0                           1        2  \\\n",
       "0      662ed514d56f7bc8743aa6f23794c731           LINCOLN TELEGRAPH  ChronAm   \n",
       "1      0c3ac40edfe6a167ab692fdb9219a93c         THE WYANDOT PIONEER  ChronAm   \n",
       "2      b298097f3afd2f8c06b61fa2308ec725           RICHMOND ENQUIRER  ChronAm   \n",
       "3      1d50cf957a6a9cbbe0ee7773a72a76d4          RAFTSMAN'S JOURNAL  ChronAm   \n",
       "4      5a7297b76de00c7d9e1fb159384238c0           RICHMOND ENQUIRER  ChronAm   \n",
       "...                                 ...                         ...      ...   \n",
       "10397  02e9e019df1992daeafe82b041d94aac  WATERBURY EVENING DEMOCRAT  ChronAm   \n",
       "10398  74fa28868cbc998d15c242baea4e1faa           RICHMOND ENQUIRER  ChronAm   \n",
       "10399  147be715e90bac01c55969d90254f29e             EVENING CAPITAL  ChronAm   \n",
       "10400  1357f703947d912523ac23540cb99a0f          RAFTSMAN'S JOURNAL  ChronAm   \n",
       "10401  23346293dbc949ee2edc3380db29f33b         THE DEMOCRATIC WHIG  ChronAm   \n",
       "\n",
       "                 3          4          5  \\\n",
       "0      1838.834247  43.910755 -69.820862   \n",
       "1      1857.691781  40.827279 -83.281309   \n",
       "2      1847.012329  37.538509 -77.434280   \n",
       "3      1867.541096  41.027280 -78.439188   \n",
       "4      1826.083562  37.538509 -77.434280   \n",
       "...            ...        ...        ...   \n",
       "10397  1888.949454  41.558153 -73.051497   \n",
       "10398  1836.012295  37.538509 -77.434280   \n",
       "10399  1907.004110  38.978640 -76.492786   \n",
       "10400  1868.077869  41.027280 -78.439188   \n",
       "10401  1843.760274  33.495674 -88.427263   \n",
       "\n",
       "                                                       6  \\\n",
       "0      rin 11K ui i rsognfd inlriliinnts i>r the town...   \n",
       "1      ton County feel an interest in. tn great is-\\n...   \n",
       "2      But at our own doors we have evidence ten\\ning...   \n",
       "3      The wonderful Flexibility and great comfort\\na...   \n",
       "4      Illinois.—The Legislature met at Ya:.ualia\\non...   \n",
       "...                                                  ...   \n",
       "10397  the Fitzgeralds should perish like a common\\nt...   \n",
       "10398  herd, so soon as he conveniently can, after th...   \n",
       "10399  Drs. James J. Murphy, of Annapo-\\nlis, and Tho...   \n",
       "10400  the soles of the feet spikes or corks are fixe...   \n",
       "10401  tion which his opponent had taken, and whilst\\...   \n",
       "\n",
       "                                                       7  \n",
       "0      Northeasterly hv the head of said .^corn’s\\nan...  \n",
       "1      and design,\\nand hence, every election, be it ...  \n",
       "2      Democrat\\nenlisting lor the Mexican wvir. They...  \n",
       "3      will preserve their perfect aud grace\\nful sha...  \n",
       "4      to run the line between Arkansas and\\nthe’Vhnc...  \n",
       "...                                                  ...  \n",
       "10397  Brian, but there was also a touch\\nof self int...  \n",
       "10398  Court dotli lurlher adjudge, order, and decree...  \n",
       "10399  in the matter\\nor show any inclination to help...  \n",
       "10400  \\nIn order to prevent \"the giant\" from\\nfright...  \n",
       "10401  come criterion, by which to judge\\nof a nation...  \n",
       "\n",
       "[10402 rows x 8 columns]"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 213,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "10519it [02:47, 62.67it/s]\n"
     ]
    }
   ],
   "source": [
    "results_string = []\n",
    "\n",
    "with lzma.open(\"dev-0/in.tsv.xz\", encoding='utf8', mode=\"rt\") as file:\n",
    "    for line in tqdm(file):\n",
    "        line = line.split(\"\\t\")\n",
    "        text_before = str(line[-2]).replace('\\\\n', ' ').replace('\\n', ' ')\n",
    "        text_after = str(line[-1]).replace('\\\\n', ' ').replace('\\n', ' ')\n",
    "\n",
    "        if text_before[-1] == ' ':\n",
    "            text_before = text_before[:-1]\n",
    "        if text_before[0] == ' ':\n",
    "            text_before = text_before[1:]\n",
    "\n",
    "        if text_after[-1] == ' ':\n",
    "            text_after = text_after[:-1]\n",
    "        if text_after[0] == ' ':\n",
    "            text_after = text_after[1:]\n",
    "\n",
    "        word_before = text_before.split(' ')[-1]\n",
    "        word_after = text_after.split(' ')[0]\n",
    "\n",
    "        best_words = {}\n",
    "\n",
    "        for word_middle, _ in unigram_counter_list:\n",
    "            current_score = 0\n",
    "            if (word_before, word_middle) in bigram_counter.keys() and (word_middle, word_after) in bigram_counter.keys() and word_before in unigram_counter.keys() and word_after in unigram_counter.keys():\n",
    "                current_score = (bigram_counter[(word_before, word_middle)] / unigram_counter[word_before]) * (bigram_counter[(word_middle, word_after)] / unigram_counter[word_middle])\n",
    "            best_words[word_middle] = current_score\n",
    "\n",
    "        best_words = sorted(best_words.items(), key=lambda item: item[1], reverse=True)\n",
    "        leftover_probability = 0\n",
    "        for _, value in best_words[:5]:\n",
    "            if value == 0:\n",
    "                break\n",
    "            leftover_probability += value\n",
    "        leftover_probability = max(1 - leftover_probability, 0.01)\n",
    "\n",
    "        result = f'{best_words[0][0]}:{round(best_words[0][1], 7):.8f} {best_words[1][0]}:{round(best_words[1][1], 7):.8f} {best_words[2][0]}:{round(best_words[2][1], 7):.8f} {best_words[3][0]}:{round(best_words[3][1], 7):.8f} {best_words[4][0]}:{round(best_words[4][1], 7):.8f} :{round(leftover_probability, 3):.8f}'\n",
    "        results_string.append(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 214,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
       " 'own:0.00076790 way:0.00069960 head:0.00058630 work:0.00051990 place:0.00045550 :0.99700000',\n",
       " 'the:0.00001150 a:0.00001040 Madison:0.00000230 every:0.00000210 Missouri:0.00000120 :1.00000000',\n",
       " 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
       " 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
       " 'a:0.00008660 him:0.00000250 no:0.00000170 all:0.00000150 them:0.00000120 :1.00000000',\n",
       " 'trees:0.00092990 and:0.00057150 is:0.00030980 growers:0.00029090 growing:0.00014300 :0.99800000',\n",
       " 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
       " 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
       " 'that:0.00131940 as:0.00047920 sure:0.00009500 and:0.00009330 better:0.00007450 :0.99800000',\n",
       " 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
       " 'be:0.00032910 the:0.00014090 show:0.00012860 this:0.00001380 a:0.00000790 :0.99900000',\n",
       " 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
       " 'country:0.00007340 world:0.00007170 people:0.00006310 city:0.00005530 time:0.00004280 :1.00000000',\n",
       " 'to:0.00030170 I:0.00005940 and:0.00002870 a:0.00001570 t:0.00001340 :1.00000000',\n",
       " 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
       " 'that:0.00014960 for:0.00010580 God:0.00005010 to:0.00002530 ing:0.00001430 :1.00000000',\n",
       " 'founded:0.00097130 known:0.00064890 posted:0.00052370 as:0.00032530 fed:0.00027720 :0.99700000',\n",
       " 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
       " 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000']"
      ]
     },
     "execution_count": 214,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results_string[:20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(r'test-A/out.tsv', 'w') as fp:\n",
    "    for item in results_string:\n",
    "        fp.write(\"%s\\n\" % item)\n",
    "    print('Done')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "scweet",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.15"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}