{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "813cb6fc",
"metadata": {},
"outputs": [],
"source": [
"#!/usr/bin/env python3\n",
"\n",
"import sys\n",
"from collections import Counter \n",
"# print(sys.executable)\n",
"\n",
"def ngrams(iter, size):\n",
" ngram = []\n",
" for item in iter:\n",
" ngram.append(item)\n",
" if len(ngram) == size:\n",
" yield tuple(ngram)\n",
" ngram = ngram[1:]"
]
},
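{
"cell_type": "code",
"execution_count": null,
"id": "3f9a1c2e",
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check of the ngrams generator on a made-up sentence (illustrative only):\n",
"# it should yield consecutive word pairs as tuples.\n",
"sample = \"the cat sat on the mat\"\n",
"print(list(ngrams(sample.split(\" \"), 2)))\n",
"# expected: [('the', 'cat'), ('cat', 'sat'), ('sat', 'on'), ('on', 'the'), ('the', 'mat')]"
]
},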
{
"cell_type": "code",
"execution_count": 3,
"id": "4ea32711",
"metadata": {},
"outputs": [],
"source": [
"def update_counts(dict_dest, dict_temp):\n",
" for key, value in dict_temp.items():\n",
" dict_dest[key]= dict_dest.get(key, 0) + 1\n",
" return dict_dest"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "02f1dd40",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'a': 2, 'b': 3}\n"
]
}
],
"source": [
"a = {\"a\":1, \"b\":2}\n",
"b = {\"a\":1, \"b\":2}\n",
"print(update_counts(a,b))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "01ba3ec0",
"metadata": {},
"outputs": [],
"source": [
"\n",
"def update_V_stats(text, V, V_bigrams):\n",
" V_b = list(ngrams(text.split(\" \"), 2))\n",
" count_V = Counter(text.split(\" \"))\n",
" count_V_bigrams = Counter(V_b)\n",
"# V = {key: count_V.get(key, 0) + V.get(key, 0) for key in set(V) | set(count_V)}\n",
"# V_bigrams = {key: count_V_bigrams.get(key, 0) + V_bigrams.get(key, 0) for key in set(V_bigrams) | set(count_V_bigrams)}\n",
" update_counts(V, count_V)\n",
" update_counts(V_bigrams, count_V_bigrams)\n",
" return V, V_bigrams\n"
]
},
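{
"cell_type": "code",
"execution_count": null,
"id": "5b7d0e4a",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check of update_V_stats on two hypothetical lines of text (not corpus data):\n",
"# unigram and bigram counts should accumulate across the two calls.\n",
"V_demo, V2_demo = {}, {}\n",
"V_demo, V2_demo = update_V_stats(\"a b a\", V_demo, V2_demo)\n",
"V_demo, V2_demo = update_V_stats(\"a b\", V_demo, V2_demo)\n",
"print(V_demo)   # expected: {'a': 3, 'b': 2}\n",
"print(V2_demo)  # expected: {('a', 'b'): 2, ('b', 'a'): 1}"
]
},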
{
"cell_type": "code",
"execution_count": 6,
"id": "2f1f10c4",
"metadata": {},
"outputs": [],
"source": [
"def Prob_bigram(presc_word, foll_word, Udict, Bdict): \n",
" return Bdict.get((presc_word, foll_word))/Udict.get(presc_word)\n",
"\n",
"def Prob_of_word(word, word_before, word_after, Udict, Bdict):\n",
" return Prob_bigram(word_before, word, Udict, Bdict) * Prob_bigram(word, word_after, Udict, Bdict)"
]
},
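{
"cell_type": "code",
"execution_count": null,
"id": "6c8e2f1b",
"metadata": {},
"outputs": [],
"source": [
"# Worked example of the bigram estimate P(foll | presc) = count(presc foll) / count(presc),\n",
"# using tiny hand-made count dictionaries rather than real corpus statistics.\n",
"uni_demo = {\"from\": 4, \"the\": 10}\n",
"bi_demo = {(\"from\", \"the\"): 2}\n",
"print(Prob_bigram(\"from\", \"the\", uni_demo, bi_demo))  # 2 / 4 = 0.5"
]
},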
{
"cell_type": "code",
"execution_count": 7,
"id": "0351a8b4",
"metadata": {},
"outputs": [],
"source": [
"def get_last_word(text):\n",
" \"\"\"Return the last word of a string.\"\"\"\n",
" last_word = \"\"\n",
" for i in range(len(text)-1, -1, -1):\n",
" if text[i] == ' ':\n",
" return last_word[::-1]\n",
" else:\n",
" last_word += text[i]\n",
" return last_word[::-1]\n",
"\n",
"def get_first_word(text):\n",
" \"\"\"Return the last word of a string.\"\"\"\n",
" word = \"\"\n",
" for i in range(len(text)-1):\n",
" if text[i] == ' ':\n",
" return word\n",
" else:\n",
" word += text[i]\n",
" return word"
]
},
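{
"cell_type": "code",
"execution_count": null,
"id": "7d9f3a2c",
"metadata": {},
"outputs": [],
"source": [
"# Small demonstration of the string helpers on a sample phrase:\n",
"# get_last_word scans backwards to the first space, get_first_word scans forward.\n",
"demo = \"left context of the gap\"\n",
"print(get_last_word(demo))   # 'gap'\n",
"print(get_first_word(demo))  # 'left'"
]
},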
{
"cell_type": "code",
"execution_count": 10,
"id": "7032a595",
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5\n"
]
}
],
"source": [
"print(Prob_bigram(\"from\",\"the\",V,V2))"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "cd9443a2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"305\n"
]
}
],
"source": [
"p_list = []\n",
"for word in V:\n",
" p_list.append(Prob_two())"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "82f052a0",
"metadata": {},
"outputs": [
{
"ename": "UnicodeDecodeError",
"evalue": "'charmap' codec can't decode byte 0x98 in position 5004: character maps to <undefined>",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-9-d0544724dc0c>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0ma\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"./test-A/in.tsv\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'r+'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mfile\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mfile\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 6\u001b[0m \u001b[1;31m# print(list(ngrams(line.split(\" \"), 2)))\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0mV\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mV2\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mupdate_V_stats\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mline\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mV\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mV2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\program files\\python39\\lib\\encodings\\cp1250.py\u001b[0m in \u001b[0;36mdecode\u001b[1;34m(self, input, final)\u001b[0m\n\u001b[0;32m 21\u001b[0m \u001b[1;32mclass\u001b[0m \u001b[0mIncrementalDecoder\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcodecs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mIncrementalDecoder\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 22\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mdecode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minput\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfinal\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 23\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mcodecs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcharmap_decode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0minput\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merrors\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mdecoding_table\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 24\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[1;32mclass\u001b[0m \u001b[0mStreamWriter\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mCodec\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mcodecs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mStreamWriter\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mUnicodeDecodeError\u001b[0m: 'charmap' codec can't decode byte 0x98 in position 5004: character maps to <undefined>"
]
}
],
"source": [
"V, V2 = {}, {}\n",
"k = 100\n",
"a=1\n",
"with open(\"./test-A/in.tsv\", 'r+') as file:\n",
" for line in file:\n",
"# print(list(ngrams(line.split(\" \"), 2)))\n",
" V, V2 = update_V_stats(line, V, V2)\n",
"# a+=1\n",
"# if a>100:\n",
"# break\n",
" \n",
"V=dict(sorted(V.items(), key=lambda x: x[1], reverse=True)[:k])\n",
"V2=dict(sorted(V2.items(), key=lambda x: x[1], reverse=True)[:k])\n",
"\n",
"print(V, V2)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "e6111e8b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'and': 7316, 'of': 7298, 'to': 7265, 'the': 7241, 'a': 7023, 'in': 6632, 'that': 6061, 'for': 5813, 'by': 5175, 'as': 5148, 'on': 4965, 'with': 4962, 'at': 4917, 'be': 4867, 'is': 4856, 'it': 4349, 'not': 4218, 'was': 4170, 'from': 4122, 'or': 3992, 'which': 3956, 'this': 3838, 'The': 3751, 'have': 3715, 'all': 3553, 'are': 3500, 'but': 3357, 'an': 3356, 'In': 3322, 'will': 3218, 'his': 3206, 'he': 3165, 'been': 3074, 'one': 2960, 'has': 2951, 'who': 2860, 'their': 2859, 'It': 2850, 'they': 2794, 'had': 2624, 'were': 2456, 'no': 2409, 'so': 2348, 'tho': 2334, 'would': 2300, 'any': 2267, 'when': 2076, 'there': 2030, 'than': 1949, 'I': 1939, 'out': 1917, 'more': 1901, 'other': 1884, 'upon': 1801, 'up': 1788, 'made': 1756, 'DAILY': 1740, 'if': 1731, 'its': 1706, 'two': 1701, 'them': 1652, 'only': 1640, 'time': 1626, 'some': 1613, 'such': 1612, 'about': 1556, 'may': 1553, 'we': 1543, 'can': 1528, 'him': 1505, 'of\\\\nthe': 1505, 'said': 1470, 'Is': 1463, '.': 1443, 'being': 1431, 'our': 1417, 'now': 1410, 'very': 1405, 'do': 1404, 'into': 1402, 'over': 1354, 'WHEELING': 1347, 'should': 1308, 'most': 1297, 'after': 1273, 'under': 1263, 'A': 1250, 'before': 1230, 'could': 1186, 'first': 1186, 'great': 1183, 'every': 1181, 'He': 1170, 'these': 1150, 'those': 1147, 'day': 1141, 'what': 1136, 'man': 1127, 'same': 1122, 'Mr.': 1119} {('of', 'the'): 5890, ('to', 'the'): 3881, ('in', 'the'): 3730, ('and', 'the'): 2467, ('on', 'the'): 2362, ('for', 'the'): 2213, ('by', 'the'): 2027, ('to', 'be'): 2016, ('that', 'the'): 1859, ('of', 'a'): 1762, ('at', 'the'): 1711, ('with', 'the'): 1492, ('from', 'the'): 1444, ('WHEELING', 'DAILY'): 1347, ('it', 'is'): 1197, ('In', 'the'): 1152, ('will', 'be'): 1132, ('in', 'a'): 1082, ('has', 'been'): 939, ('of', 'tho'): 939, ('have', 'been'): 935, ('is', 'a'): 895, ('of', 'this'): 884, ('to', 'a'): 871, ('one', 'of'): 845, ('as', 'a'): 832, ('for', 'a'): 829, ('of', 'his'): 826, ('the', 'same'): 806, ('and', 'a'): 801, ('as', 'the'): 800, ('with', 'a'): 790, ('It', 'is'): 786, ('it', 'was'): 719, ('all', 'the'): 705, ('and', 'that'): 700, ('is', 'the'): 676, ('that', 'he'): 670, ('was', 'a'): 663, ('in', 'this'): 638, ('by', 'a'): 632, ('out', 'of'): 616, ('may', 'be'): 596, ('as', 'to'): 595, ('of', 'their'): 584, ('is', 'not'): 574, ('upon', 'the'): 572, ('and', 'in'): 562, ('the', 'first'): 560, ('that', 'it'): 553, ('had', 'been'): 543, ('would', 'be'): 536, ('the', 'most'): 529, ('and', 'to'): 524, ('part', 'of'): 522, ('a', 'few'): 520, ('to', 'have'): 514, ('of', 'our'): 500, ('of', 'all'): 492, ('to', 'make'): 489, ('It', 'was'): 485, ('he', 'was'): 469, ('to', 'tho'): 465, ('into', 'the'): 462, ('there', 'is'): 462, ('that', 'they'): 453, ('and', 'it'): 450, ('under', 'the'): 441, ('in', 'his'): 440, ('did', 'not'): 422, ('which', 'the'): 418, ('to', 'do'): 418, ('they', 'are'): 417, ('was', 'the'): 413, ('should', 'be'): 405, ('on', 'a'): 404, ('is', 'to'): 403, ('at', 'a'): 398, ('be', 'a'): 396, ('the', 'United'): 393, ('was', 'not'): 391, ('more', 'than'): 389, ('he', 'had'): 388, ('but', 'the'): 382, ('day', 'of'): 381, ('of', 'which'): 373, ('when', 'the'): 371, ('not', 'be'): 370, ('that', 'a'): 370, ('to', 'his'): 367, ('before', 'the'): 364, ('can', 'be'): 356, ('of', 'said'): 355, ('in', 'tho'): 354, ('such', 'a'): 354, ('shall', 'be'): 352, ('the', 'other'): 352, ('of', 'that'): 348, ('over', 'the'): 347, ('number', 'of'): 347}\n",
"[('first\\\\n', 'on'), ('that\\\\n', 'in'), ('been', 'same,\\\\nand'), ('a', 'by'), ('Gossett', 'other\\\\ngentlemen'), ('but\\\\nclinging', 'and'), ('mo', 'people'), ('own', '\\\\ntion'), ('and\\\\nwere', '011'), ('heart.\\\\n', 'was'), ('what\\xad\\\\never', 'the'), ('John', '\\\\nBaptist,'), ('few', 'a'), ('be\\xad\\\\n', 'their'), ('first,', 'are'), ('elevator;', 'the'), ('on', 'they\\\\nwere'), ('at', 'stage\\\\nor'), ('the', 'village'), ('disaffection\\\\n', 'the'), ('con-\\\\n', 'may'), ('hour\\\\nexiinusiing', 'lie'), ('6', 'to'), ('we\\\\n', 'aoeaaas'), ('Western', '\\\\n8i'), ('yesterday\\\\n', 'been'), ('were\\\\n', 'to'), ('to', 'supplications'), ('perhaps,', 'is'), ('do', 'to,'), ('unthoughtedly', 'that\\\\nthe'), ('was', 'on\\\\nschedule'), ('room\\\\n', 'and'), ('instruments', '\\\\nlast.'), ('pending', '\\\\nfore'), ('least\\\\nThe', 'Settlement'), ('be', 'that'), ('thoro', 'In'), ('boasted\\\\nforeign', 'movements,'), ('to', 'nt'), ('individual\\\\neases', 'bo'), ('railroad', 'approaches'), ('and\\\\nState', 'Michigan,'), ('toy,\\\\n', 'playing'), ('will\\\\nsoon', 'if'), ('members', 'the'), ('HIanchc', '\\\\nceeded'), ('Cleveland\\\\n', 'return'), ('the', '\\\\nIs'), ('about', 'a\\\\nyear,'), ('and\\\\nIlttiuiiwli', 'anti'), ('true', 'Ned'), ('they\\\\nfound', 'smoking'), ('attend', 'it'), ('m', '\\\\n•2.29'), ('statement', 'that'), ('Henderson,', 'Washing\\\\nton,Pa.,owuer'), ('There', '\\\\nare'), ('keen', 'lancets,'), ('pane', 'and'), ('won', 'Mrt.'), ('was', '\\\\nabove'), ('Legislature\\\\n', 'its'), ('tbe\\\\n', 'scientific'), ('it', 'unnecessary'), ('composed', 'a\\\\nrectangular'), ('it', 'in\\\\nthe'), ('will', 'relatively'), ('building.\\\\n', 'Dorchester'), ('series', 'wsndcriDgs\\\\nIrom'), ('order,\\\\ngenerally', 'the'), ('05;504,5', '0«,'), ('supervis-\\\\n', 'of'), ('conveyed', 'mortgaged,'), ('and', 'she\\\\nplneed'), ('Center.', 'Garnett'), ('oi', '\\\\nThe'), ('that', 'felt'), ('corresponded\\\\n', 'people'), ('suffering,', 'even'), ('down', '\\\\narm'), ('they', '\\\\nthis'), ('to', '\\\\norated.'), ('a', 'deceiver'), ('can\\\\n', 'examined'), ('equipment', 'have'), ('rello\\\\n', 'to'), ('be\\\\n', 'unduly'), ('elevation', 'literature,'), ('the\\\\n', 'in'), ('There', '\\\\ntransit'), ('the\\\\nneeds', 'modern'), ('life.\\\\n', 'approaching'), ('rising', '156%'), ('the\\\\nleaders', 'them,'), ('ilaro', 'opposition'), ('the', '\\\\nsance,'), ('the', 'nnd'), ('work\\xad\\\\ning', 'the'), ('lapped', 'with'), ('I\\\\n', 'will'), ('somowhat\\\\n', 'Never'), ('deliberately', '\\\\nwith'), ('fried.', 'tell'), ('al\\\\n', 'fundB'), ('I', 'so\\\\nrestless'), ('of', 'to'), ('degrees', 'four'), ('up', 'many\\\\nfortifications'), ('de.\\\\ntached', 'nor'), ('commission', 'season.\\\\nFlashing'), ('Porter', 'his\\\\nlifetime:'), ('Circle,\"', 'the'), ('making', 'imprint'), ('entrenched\\\\nminers,', 'the'), ('let', 'who'), ('a', 'against'), ('grand', 'to'), ('that', 'hired'), ('the', 'what\\xad\\\\never'), ('lucre,', 'palms'), ('in', \"turn!'\\\\n'Read\"), ('of\"', 'delivered,'), ('have', 'the'), ('OKDKK', 'enclose'), ('pots.\\\\n', 'blowers'), ('cites\\\\n', 'Paris'), ('not', 'than'), ('dream', 'paradise,'), ('was', 'up.\\\\nDave'), ('the', 'who'), ('having', 'twice\\\\nshot'), ('the', 'd\\\\nIt'), ('the', 'cemetery'), ('said', 'should'), ('most', 'and'), ('in\\\\n', 'his'), ('actual', 'of'), ('attend', 'it'), ('less\\\\n', 'has'), ('why\\\\nAmerica', 'forced'), ('peri\\\\n', 'from'), ('aforosald,thence', '\\\\nallai'), ('instant\\\\n', 'particularly'), ('side', 
'also'), ('i\\\\n', 'existence,'), ('all', 'singular'), ('the\\\\n', 'has'), ('mo:', 'on'), ('every', 'is'), ('these\\\\n', 'and'), ('coming', 'again,'), ('McLoughlin,', 'Mulberry'), ('\"', 'a'), ('ho\\\\nwould', 'him.'), ('finally', 'up'), ('city', 'again'), ('which\\\\nho', 'with'), ('Miss', 'was'), ('brick', 'and\\\\nlot.'), ('had\\\\n', 's'), ('represented.\\\\nWe', 'allow'), ('In', 'to'), ('tor', '\\\\ntransaction'), ('a', 'He'), ('any', '\\\\nsels'), ('element\\\\nmost', 'In'), ('tli\\\\n', 'debato'), ('of', \"Mitchell'
]
},
{
"ename": "TypeError",
"evalue": "unsupported operand type(s) for /: 'NoneType' and 'NoneType'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[18], line 19\u001b[0m\n\u001b[1;32m 17\u001b[0m probabilities \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m V\u001b[38;5;241m.\u001b[39mitems():\n\u001b[0;32m---> 19\u001b[0m probabilities\u001b[38;5;241m.\u001b[39mappend((key, \u001b[43mProb_of_word\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mitem\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mitem\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mV\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mV2\u001b[49m\u001b[43m)\u001b[49m))\n\u001b[1;32m 20\u001b[0m prob_else \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;28msum\u001b[39m([x[\u001b[38;5;241m1\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m probabilities])\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28mprint\u001b[39m(probabilities)\n",
"Cell \u001b[0;32mIn[6], line 5\u001b[0m, in \u001b[0;36mProb_of_word\u001b[0;34m(word, word_before, word_after, Udict, Bdict)\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mProb_of_word\u001b[39m(word, word_before, word_after, Udict, Bdict):\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mProb_bigram\u001b[49m\u001b[43m(\u001b[49m\u001b[43mword_before\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mword\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mUdict\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mBdict\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;241m*\u001b[39m Prob_bigram(word, word_after, Udict, Bdict)\n",
"Cell \u001b[0;32mIn[6], line 2\u001b[0m, in \u001b[0;36mProb_bigram\u001b[0;34m(presc_word, foll_word, Udict, Bdict)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mProb_bigram\u001b[39m(presc_word, foll_word, Udict, Bdict): \n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mBdict\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpresc_word\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfoll_word\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m/\u001b[39;49m\u001b[43mUdict\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpresc_word\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for /: 'NoneType' and 'NoneType'"
]
}
],
"source": [
"\n",
"\n",
"if __name__=='__main__':\n",
" V, V2= {}, {} #unigram stats\n",
"# k= int(sys.argv[1])\n",
" k=100\n",
" predict_words = []\n",
" with open(\"./test-A/in.tsv\", 'r+') as file:\n",
" for line in file:\n",
" V, V2 = update_V_stats(line, V, V2)\n",
" split = line.split('\\t')[6:]\n",
" predict_words.append((get_last_word(split[0]), get_first_word(split[1])))\n",
" \n",
" V=dict(sorted(V.items(), key=lambda x: x[1], reverse=True)[:k])\n",
" V2=dict(sorted(V2.items(), key=lambda x: x[1], reverse=True)[:k])\n",
" print(V, V2)\n",
" print(predict_words)\n",
" for item in predict_words:\n",
" probabilities = []\n",
" for key, value in V.items():\n",
" probabilities.append((key, Prob_of_word(key, item[0], item[1], V, V2)))\n",
" prob_else = 1-sum([x[1] for x in probabilities])\n",
"\n",
" print(probabilities)\n",
"##lewy i prawy kontektst: P(w|wi-2wi-1)*P(wi+1|wi-1w)\n",
"#czyli trzy trigramy, w których w jest w z lewej/w środku/z prawej\n",
"#P(wi|wi-1wi-2) = #wi wi-1 wi-2/(wi-1 wi-2)\n",
"\n",
"#<UNK> dla słów spoza n pierwszych słów (co do częstości)\n",
"\n"
]
},
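{
"cell_type": "code",
"execution_count": null,
"id": "8eaf4b3d",
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch of the <UNK> idea noted above: map every token that is not among the\n",
"# k most frequent words to \"<UNK>\" before counting, so rare or unseen words still get some\n",
"# probability mass. The helper below is an assumption for illustration, not part of the model yet.\n",
"def replace_rare_words(tokens, vocab):\n",
"    \"\"\"Replace tokens outside the trimmed vocabulary with the <UNK> marker.\"\"\"\n",
"    return [tok if tok in vocab else \"<UNK>\" for tok in tokens]\n",
"\n",
"print(replace_rare_words(\"the zyxwv of\".split(\" \"), {\"the\", \"of\"}))  # ['the', '<UNK>', 'of']"
]
},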
{
"cell_type": "code",
"execution_count": null,
"id": "60ed4e59",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}