{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "813cb6fc",
"metadata": {},
"outputs": [],
"source": [
"#!/usr/bin/env python3\n",
"\n",
"import sys\n",
"from collections import Counter \n",
"# print(sys.executable)\n",
"\n",
"def ngrams(iter, size):\n",
" ngram = []\n",
" for item in iter:\n",
" ngram.append(item)\n",
" if len(ngram) == size:\n",
" yield tuple(ngram)\n",
" ngram = ngram[1:]"
]
},
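{
"cell_type": "code",
"execution_count": null,
"id": "3f9a1c2e",
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check of the ngrams generator on a made-up sentence (illustrative only):\n",
"# it should yield consecutive word pairs as tuples.\n",
"sample = \"the cat sat on the mat\"\n",
"print(list(ngrams(sample.split(\" \"), 2)))\n",
"# expected: [('the', 'cat'), ('cat', 'sat'), ('sat', 'on'), ('on', 'the'), ('the', 'mat')]"
]
},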
{
"cell_type": "code",
"execution_count": 3,
"id": "4ea32711",
"metadata": {},
"outputs": [],
"source": [
"def update_counts(dict_dest, dict_temp):\n",
" for key, value in dict_temp.items():\n",
" dict_dest[key]= dict_dest.get(key, 0) + 1\n",
" return dict_dest"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "02f1dd40",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'a': 2, 'b': 3}\n"
]
}
],
"source": [
"a = {\"a\":1, \"b\":2}\n",
"b = {\"a\":1, \"b\":2}\n",
"print(update_counts(a,b))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "01ba3ec0",
"metadata": {},
"outputs": [],
"source": [
"\n",
"def update_V_stats(text, V, V_bigrams):\n",
" V_b = list(ngrams(text.split(\" \"), 2))\n",
" count_V = Counter(text.split(\" \"))\n",
" count_V_bigrams = Counter(V_b)\n",
"# V = {key: count_V.get(key, 0) + V.get(key, 0) for key in set(V) | set(count_V)}\n",
"# V_bigrams = {key: count_V_bigrams.get(key, 0) + V_bigrams.get(key, 0) for key in set(V_bigrams) | set(count_V_bigrams)}\n",
" update_counts(V, count_V)\n",
" update_counts(V_bigrams, count_V_bigrams)\n",
" return V, V_bigrams\n"
]
},
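{
"cell_type": "code",
"execution_count": null,
"id": "5b7d0e4a",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check of update_V_stats on two hypothetical lines of text (not corpus data):\n",
"# unigram and bigram counts should accumulate across the two calls.\n",
"V_demo, V2_demo = {}, {}\n",
"V_demo, V2_demo = update_V_stats(\"a b a\", V_demo, V2_demo)\n",
"V_demo, V2_demo = update_V_stats(\"a b\", V_demo, V2_demo)\n",
"print(V_demo)   # expected: {'a': 3, 'b': 2}\n",
"print(V2_demo)  # expected: {('a', 'b'): 2, ('b', 'a'): 1}"
]
},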
{
"cell_type": "code",
"execution_count": 6,
"id": "2f1f10c4",
"metadata": {},
"outputs": [],
"source": [
"def Prob_bigram(presc_word, foll_word, Udict, Bdict): \n",
" return Bdict.get((presc_word, foll_word))/Udict.get(presc_word)\n",
"\n",
"def Prob_of_word(word, word_before, word_after, Udict, Bdict):\n",
" return Prob_bigram(word_before, word, Udict, Bdict) * Prob_bigram(word, word_after, Udict, Bdict)"
]
},
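{
"cell_type": "code",
"execution_count": null,
"id": "6c8e2f1b",
"metadata": {},
"outputs": [],
"source": [
"# Worked example of the bigram estimate P(foll | presc) = count(presc foll) / count(presc),\n",
"# using tiny hand-made count dictionaries rather than real corpus statistics.\n",
"uni_demo = {\"from\": 4, \"the\": 10}\n",
"bi_demo = {(\"from\", \"the\"): 2}\n",
"print(Prob_bigram(\"from\", \"the\", uni_demo, bi_demo))  # 2 / 4 = 0.5"
]
},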
{
"cell_type": "code",
"execution_count": 7,
"id": "0351a8b4",
"metadata": {},
"outputs": [],
"source": [
"def get_last_word(text):\n",
" \"\"\"Return the last word of a string.\"\"\"\n",
" last_word = \"\"\n",
" for i in range(len(text)-1, -1, -1):\n",
" if text[i] == ' ':\n",
" return last_word[::-1]\n",
" else:\n",
" last_word += text[i]\n",
" return last_word[::-1]\n",
"\n",
"def get_first_word(text):\n",
" \"\"\"Return the last word of a string.\"\"\"\n",
" word = \"\"\n",
" for i in range(len(text)-1):\n",
" if text[i] == ' ':\n",
" return word\n",
" else:\n",
" word += text[i]\n",
" return word"
]
},
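{
"cell_type": "code",
"execution_count": null,
"id": "7d9f3a2c",
"metadata": {},
"outputs": [],
"source": [
"# Small demonstration of the string helpers on a sample phrase:\n",
"# get_last_word scans backwards to the first space, get_first_word scans forward.\n",
"demo = \"left context of the gap\"\n",
"print(get_last_word(demo))   # 'gap'\n",
"print(get_first_word(demo))  # 'left'"
]
},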
{
"cell_type": "code",
"execution_count": 10,
"id": "7032a595",
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5\n"
]
}
],
"source": [
"print(Prob_bigram(\"from\",\"the\",V,V2))"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "cd9443a2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"305\n"
]
}
],
"source": [
"p_list = []\n",
"for word in V:\n",
" p_list.append(Prob_two())"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "82f052a0",
"metadata": {},
"outputs": [
{
"ename": "UnicodeDecodeError",
"evalue": "'charmap' codec can't decode byte 0x98 in position 5004: character maps to <undefined>",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-9-d0544724dc0c>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0ma\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"./test-A/in.tsv\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'r+'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mfile\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mfile\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 6\u001b[0m \u001b[1;31m# print(list(ngrams(line.split(\" \"), 2)))\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0mV\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mV2\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mupdate_V_stats\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mline\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mV\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mV2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\program files\\python39\\lib\\encodings\\cp1250.py\u001b[0m in \u001b[0;36mdecode\u001b[1;34m(self, input, final)\u001b[0m\n\u001b[0;32m 21\u001b[0m \u001b[1;32mclass\u001b[0m \u001b[0mIncrementalDecoder\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcodecs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mIncrementalDecoder\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 22\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mdecode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minput\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfinal\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 23\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mcodecs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcharmap_decode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0minput\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merrors\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mdecoding_table\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 24\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[1;32mclass\u001b[0m \u001b[0mStreamWriter\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mCodec\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mcodecs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mStreamWriter\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mUnicodeDecodeError\u001b[0m: 'charmap' codec can't decode byte 0x98 in position 5004: character maps to <undefined>"
]
}
],
"source": [
"V, V2 = {}, {}\n",
"k = 100\n",
"a=1\n",
"with open(\"./test-A/in.tsv\", 'r+') as file:\n",
" for line in file:\n",
"# print(list(ngrams(line.split(\" \"), 2)))\n",
" V, V2 = update_V_stats(line, V, V2)\n",
"# a+=1\n",
"# if a>100:\n",
"# break\n",
" \n",
"V=dict(sorted(V.items(), key=lambda x: x[1], reverse=True)[:k])\n",
"V2=dict(sorted(V2.items(), key=lambda x: x[1], reverse=True)[:k])\n",
"\n",
"print(V, V2)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "e6111e8b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'and': 7316, 'of': 7298, 'to': 7265, 'the': 7241, 'a': 7023, 'in': 6632, 'that': 6061, 'for': 5813, 'by': 5175, 'as': 5148, 'on': 4965, 'with': 4962, 'at': 4917, 'be': 4867, 'is': 4856, 'it': 4349, 'not': 4218, 'was': 4170, 'from': 4122, 'or': 3992, 'which': 3956, 'this': 3838, 'The': 3751, 'have': 3715, 'all': 3553, 'are': 3500, 'but': 3357, 'an': 3356, 'In': 3322, 'will': 3218, 'his': 3206, 'he': 3165, 'been': 3074, 'one': 2960, 'has': 2951, 'who': 2860, 'their': 2859, 'It': 2850, 'they': 2794, 'had': 2624, 'were': 2456, 'no': 2409, 'so': 2348, 'tho': 2334, 'would': 2300, 'any': 2267, 'when': 2076, 'there': 2030, 'than': 1949, 'I': 1939, 'out': 1917, 'more': 1901, 'other': 1884, 'upon': 1801, 'up': 1788, 'made': 1756, 'DAILY': 1740, 'if': 1731, 'its': 1706, 'two': 1701, 'them': 1652, 'only': 1640, 'time': 1626, 'some': 1613, 'such': 1612, 'about': 1556, 'may': 1553, 'we': 1543, 'can': 1528, 'him': 1505, 'of\\\\nthe': 1505, 'said': 1470, 'Is': 1463, '.': 1443, 'being': 1431, 'our': 1417, 'now': 1410, 'very': 1405, 'do': 1404, 'into': 1402, 'over': 1354, 'WHEELING': 1347, 'should': 1308, 'most': 1297, 'after': 1273, 'under': 1263, 'A': 1250, 'before': 1230, 'could': 1186, 'first': 1186, 'great': 1183, 'every': 1181, 'He': 1170, 'these': 1150, 'those': 1147, 'day': 1141, 'what': 1136, 'man': 1127, 'same': 1122, 'Mr.': 1119} {('of', 'the'): 5890, ('to', 'the'): 3881, ('in', 'the'): 3730, ('and', 'the'): 2467, ('on', 'the'): 2362, ('for', 'the'): 2213, ('by', 'the'): 2027, ('to', 'be'): 2016, ('that', 'the'): 1859, ('of', 'a'): 1762, ('at', 'the'): 1711, ('with', 'the'): 1492, ('from', 'the'): 1444, ('WHEELING', 'DAILY'): 1347, ('it', 'is'): 1197, ('In', 'the'): 1152, ('will', 'be'): 1132, ('in', 'a'): 1082, ('has', 'been'): 939, ('of', 'tho'): 939, ('have', 'been'): 935, ('is', 'a'): 895, ('of', 'this'): 884, ('to', 'a'): 871, ('one', 'of'): 845, ('as', 'a'): 832, ('for', 'a'): 829, ('of', 'his'): 826, ('the', 'same'): 806, ('and', 'a'): 801, ('as', 'the'): 800, ('with', 'a'): 790, ('It', 'is'): 786, ('it', 'was'): 719, ('all', 'the'): 705, ('and', 'that'): 700, ('is', 'the'): 676, ('that', 'he'): 670, ('was', 'a'): 663, ('in', 'this'): 638, ('by', 'a'): 632, ('out', 'of'): 616, ('may', 'be'): 596, ('as', 'to'): 595, ('of', 'their'): 584, ('is', 'not'): 574, ('upon', 'the'): 572, ('and', 'in'): 562, ('the', 'first'): 560, ('that', 'it'): 553, ('had', 'been'): 543, ('would', 'be'): 536, ('the', 'most'): 529, ('and', 'to'): 524, ('part', 'of'): 522, ('a', 'few'): 520, ('to', 'have'): 514, ('of', 'our'): 500, ('of', 'all'): 492, ('to', 'make'): 489, ('It', 'was'): 485, ('he', 'was'): 469, ('to', 'tho'): 465, ('into', 'the'): 462, ('there', 'is'): 462, ('that', 'they'): 453, ('and', 'it'): 450, ('under', 'the'): 441, ('in', 'his'): 440, ('did', 'not'): 422, ('which', 'the'): 418, ('to', 'do'): 418, ('they', 'are'): 417, ('was', 'the'): 413, ('should', 'be'): 405, ('on', 'a'): 404, ('is', 'to'): 403, ('at', 'a'): 398, ('be', 'a'): 396, ('the', 'United'): 393, ('was', 'not'): 391, ('more', 'than'): 389, ('he', 'had'): 388, ('but', 'the'): 382, ('day', 'of'): 381, ('of', 'which'): 373, ('when', 'the'): 371, ('not', 'be'): 370, ('that', 'a'): 370, ('to', 'his'): 367, ('before', 'the'): 364, ('can', 'be'): 356, ('of', 'said'): 355, ('in', 'tho'): 354, ('such', 'a'): 354, ('shall', 'be'): 352, ('the', 'other'): 352, ('of', 'that'): 348, ('over', 'the'): 347, ('number', 'of'): 347}\n",
"[('first\\\\n', 'on'), ('that\\\\n', 'in'), ('been', 'same,\\\\nand'), ('a', 'by'), ('Gossett', 'other\\\\ngentlemen'), ('but\\\\nclinging', 'and'), ('mo', 'people'), ('own', '\\\\ntion'), ('and\\\\nwere', '011'), ('heart.\\\\n', 'was'), ('what\\xad\\\\never', 'the'), ('John', '\\\\nBaptist,'), ('few', 'a'), ('be\\xad\\\\n', 'their'), ('first,', 'are'), ('elevator;', 'the'), ('on', 'they\\\\nwere'), ('at', 'stage\\\\nor'), ('the', 'village'), ('disaffection\\\\n', 'the'), ('con-\\\\n', 'may'), ('hour\\\\nexiinusiing', 'lie'), ('6', 'to'), ('we\\\\n', 'aoeaaas'), ('Western', '\\\\n8i'), ('yesterday\\\\n', 'been'), ('were\\\\n', 'to'), ('to', 'supplications'), ('perhaps,', 'is'), ('do', 'to,'), ('unthoughtedly', 'that\\\\nthe'), ('was', 'on\\\\nschedule'), ('room\\\\n', 'and'), ('instruments', '\\\\nlast.'), ('pending', '\\\\nfore'), ('least\\\\nThe', 'Settlement'), ('be', 'that'), ('thoro', 'In'), ('boasted\\\\nforeign', 'movements,'), ('to', 'nt'), ('individual\\\\neases', 'bo'), ('railroad', 'approaches'), ('and\\\\nState', 'Michigan,'), ('toy,\\\\n', 'playing'), ('will\\\\nsoon', 'if'), ('members', 'the'), ('HIanchc', '\\\\nceeded'), ('Cleveland\\\\n', 'return'), ('the', '\\\\nIs'), ('about', 'a\\\\nyear,'), ('and\\\\nIlttiuiiwli', 'anti'), ('true', 'Ned'), ('they\\\\nfound', 'smoking'), ('attend', 'it'), ('m', '\\\\n•2.29'), ('statement', 'that'), ('Henderson,', 'Washing\\\\nton,Pa.,owuer'), ('There', '\\\\nare'), ('keen', 'lancets,'), ('pane', 'and'), ('won', 'Mrt.'), ('was', '\\\\nabove'), ('Legislature\\\\n', 'its'), ('tbe\\\\n', 'scientific'), ('it', 'unnecessary'), ('composed', 'a\\\\nrectangular'), ('it', 'in\\\\nthe'), ('will', 'relatively'), ('building.\\\\n', 'Dorchester'), ('series', 'wsndcriDgs\\\\nIrom'), ('order,\\\\ngenerally', 'the'), ('05;504,5', '0«,'), ('supervis-\\\\n', 'of'), ('conveyed', 'mortgaged,'), ('and', 'she\\\\nplneed'), ('Center.', 'Garnett'), ('oi', '\\\\nThe'), ('that', 'felt'), ('corresponded\\\\n', 'people'), ('suffering,', 'even'), ('down', '\\\\narm'), ('they', '\\\\nthis'), ('to', '\\\\norated.'), ('a', 'deceiver'), ('can\\\\n', 'examined'), ('equipment', 'have'), ('rello\\\\n', 'to'), ('be\\\\n', 'unduly'), ('elevation', 'literature,'), ('the\\\\n', 'in'), ('There', '\\\\ntransit'), ('the\\\\nneeds', 'modern'), ('life.\\\\n', 'approaching'), ('rising', '156%'), ('the\\\\nleaders', 'them,'), ('ilaro', 'opposition'), ('the', '\\\\nsance,'), ('the', 'nnd'), ('work\\xad\\\\ning', 'the'), ('lapped', 'with'), ('I\\\\n', 'will'), ('somowhat\\\\n', 'Never'), ('deliberately', '\\\\nwith'), ('fried.', 'tell'), ('al\\\\n', 'fundB'), ('I', 'so\\\\nrestless'), ('of', 'to'), ('degrees', 'four'), ('up', 'many\\\\nfortifications'), ('de.\\\\ntached', 'nor'), ('commission', 'season.\\\\nFlashing'), ('Porter', 'his\\\\nlifetime:'), ('Circle,\"', 'the'), ('making', 'imprint'), ('entrenched\\\\nminers,', 'the'), ('let', 'who'), ('a', 'against'), ('grand', 'to'), ('that', 'hired'), ('the', 'what\\xad\\\\never'), ('lucre,', 'palms'), ('in', \"turn!'\\\\n'Read\"), ('of\"', 'delivered,'), ('have', 'the'), ('OKDKK', 'enclose'), ('pots.\\\\n', 'blowers'), ('cites\\\\n', 'Paris'), ('not', 'than'), ('dream', 'paradise,'), ('was', 'up.\\\\nDave'), ('the', 'who'), ('having', 'twice\\\\nshot'), ('the', 'd\\\\nIt'), ('the', 'cemetery'), ('said', 'should'), ('most', 'and'), ('in\\\\n', 'his'), ('actual', 'of'), ('attend', 'it'), ('less\\\\n', 'has'), ('why\\\\nAmerica', 'forced'), ('peri\\\\n', 'from'), ('aforosald,thence', '\\\\nallai'), ('instant\\\\n', 'particularly'), ('side', 
'also'), ('i\\\\n', 'existence,'), ('all', 'singular'), ('the\\\\n', 'has'), ('mo:', 'on'), ('every', 'is'), ('these\\\\n', 'and'), ('coming', 'again,'), ('McLoughlin,', 'Mulberry'), ('\"', 'a'), ('ho\\\\nwould', 'him.'), ('finally', 'up'), ('city', 'again'), ('which\\\\nho', 'with'), ('Miss', 'was'), ('brick', 'and\\\\nlot.'), ('had\\\\n', 's'), ('represented.\\\\nWe', 'allow'), ('In', 'to'), ('tor', '\\\\ntransaction'), ('a', 'He'), ('any', '\\\\nsels'), ('element\\\\nmost', 'In'), ('tli\\\\n', 'debato'), ('of', \"Mitchell'
]
},
{
"ename": "TypeError",
"evalue": "unsupported operand type(s) for /: 'NoneType' and 'NoneType'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[18], line 19\u001b[0m\n\u001b[1;32m 17\u001b[0m probabilities \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m V\u001b[38;5;241m.\u001b[39mitems():\n\u001b[0;32m---> 19\u001b[0m probabilities\u001b[38;5;241m.\u001b[39mappend((key, \u001b[43mProb_of_word\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mitem\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mitem\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mV\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mV2\u001b[49m\u001b[43m)\u001b[49m))\n\u001b[1;32m 20\u001b[0m prob_else \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;28msum\u001b[39m([x[\u001b[38;5;241m1\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m probabilities])\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28mprint\u001b[39m(probabilities)\n",
"Cell \u001b[0;32mIn[6], line 5\u001b[0m, in \u001b[0;36mProb_of_word\u001b[0;34m(word, word_before, word_after, Udict, Bdict)\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mProb_of_word\u001b[39m(word, word_before, word_after, Udict, Bdict):\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mProb_bigram\u001b[49m\u001b[43m(\u001b[49m\u001b[43mword_before\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mword\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mUdict\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mBdict\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;241m*\u001b[39m Prob_bigram(word, word_after, Udict, Bdict)\n",
"Cell \u001b[0;32mIn[6], line 2\u001b[0m, in \u001b[0;36mProb_bigram\u001b[0;34m(presc_word, foll_word, Udict, Bdict)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mProb_bigram\u001b[39m(presc_word, foll_word, Udict, Bdict): \n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mBdict\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpresc_word\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfoll_word\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m/\u001b[39;49m\u001b[43mUdict\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpresc_word\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for /: 'NoneType' and 'NoneType'"
]
}
],
"source": [
"\n",
"\n",
"if __name__=='__main__':\n",
" V, V2= {}, {} #unigram stats\n",
"# k= int(sys.argv[1])\n",
" k=100\n",
" predict_words = []\n",
" with open(\"./test-A/in.tsv\", 'r+') as file:\n",
" for line in file:\n",
" V, V2 = update_V_stats(line, V, V2)\n",
" split = line.split('\\t')[6:]\n",
" predict_words.append((get_last_word(split[0]), get_first_word(split[1])))\n",
" \n",
" V=dict(sorted(V.items(), key=lambda x: x[1], reverse=True)[:k])\n",
" V2=dict(sorted(V2.items(), key=lambda x: x[1], reverse=True)[:k])\n",
" print(V, V2)\n",
" print(predict_words)\n",
" for item in predict_words:\n",
" probabilities = []\n",
" for key, value in V.items():\n",
" probabilities.append((key, Prob_of_word(key, item[0], item[1], V, V2)))\n",
" prob_else = 1-sum([x[1] for x in probabilities])\n",
"\n",
" print(probabilities)\n",
"##lewy i prawy kontektst: P(w|wi-2wi-1)*P(wi+1|wi-1w)\n",
"#czyli trzy trigramy, w których w jest w z lewej/w środku/z prawej\n",
"#P(wi|wi-1wi-2) = #wi wi-1 wi-2/(wi-1 wi-2)\n",
"\n",
"#<UNK> dla słów spoza n pierwszych słów (co do częstości)\n",
"\n"
]
},
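{
"cell_type": "code",
"execution_count": null,
"id": "8eaf4b3d",
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch of the <UNK> idea noted above: map every token that is not among the\n",
"# k most frequent words to \"<UNK>\" before counting, so rare or unseen words still get some\n",
"# probability mass. The helper below is an assumption for illustration, not part of the model yet.\n",
"def replace_rare_words(tokens, vocab):\n",
"    \"\"\"Replace tokens outside the trimmed vocabulary with the <UNK> marker.\"\"\"\n",
"    return [tok if tok in vocab else \"<UNK>\" for tok in tokens]\n",
"\n",
"print(replace_rare_words(\"the zyxwv of\".split(\" \"), {\"the\", \"of\"}))  # ['the', '<UNK>', 'of']"
]
},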
{
"cell_type": "code",
"execution_count": null,
"id": "60ed4e59",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}