This commit is contained in:
Ramon Dyzman 2022-04-11 10:54:53 +02:00
parent 20249e59db
commit a23511bd83

View File

@ -24,7 +24,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 88,
"metadata": { "metadata": {
"tags": [] "tags": []
}, },
@ -36,23 +36,28 @@
"# dev_data = pd.read_csv('dev-0/in.tsv', sep='\\t', names=columns, engine='python', quotechar='\"', error_bad_lines=False)\n", "# dev_data = pd.read_csv('dev-0/in.tsv', sep='\\t', names=columns, engine='python', quotechar='\"', error_bad_lines=False)\n",
"# dev_expected = pd.read_csv('dev-0/expected.tsv', sep='\\t', engine='python', quotechar='\"', error_bad_lines=False)\n", "# dev_expected = pd.read_csv('dev-0/expected.tsv', sep='\\t', engine='python', quotechar='\"', error_bad_lines=False)\n",
"dev_data = list()\n", "dev_data = list()\n",
"with open('dev-0/in.tsv', \"r\") as f:\n", "data_path = 'dev-0/in.tsv'\n",
"expected_path = 'dev-0/expected.tsv'\n",
"out_path = 'dev-0/out.tsv'\n",
"\n",
"with open(data_path, \"r\") as f:\n",
" for line in f.readlines():\n", " for line in f.readlines():\n",
" dev_data.append(line.split('\\t')[-2])\n", " dev_data.append(line.split('\\t')[-2])\n",
"\n", "\n",
"dev_expected = list()\n", "dev_expected = list()\n",
"with open('dev-0/expected.tsv', \"r\") as f:\n", "with open(expected_path, \"r\") as f:\n",
" for line in f.readlines():\n", " for line in f.readlines():\n",
" dev_expected.append(line.replace('\\n',''))" " dev_expected.append(line.replace('\\n',''))"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 89,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from nltk.tokenize import word_tokenize \n", "from nltk.tokenize import word_tokenize \n",
"import re\n",
"\n", "\n",
"def createBigram(data, expected):\n", "def createBigram(data, expected):\n",
" listOfBigrams = []\n", " listOfBigrams = []\n",
@ -62,12 +67,15 @@
" for i in range(len(data)):\n", " for i in range(len(data)):\n",
" tokenized = word_tokenize(data[i])\n", " tokenized = word_tokenize(data[i])\n",
" word = tokenized[-1]\n", " word = tokenized[-1]\n",
" listOfBigrams.append((word, expected[i]))\n", " word = word.lower()\n",
" if (word, expected[i]) in bigramCounts:\n", " word = re.sub('\\W+','', word)\n",
" bigramCounts[(word, expected[i])] += 1\n", " exp = expected[i].lower()\n",
" listOfBigrams.append((word, exp))\n",
" if (word, exp) in bigramCounts:\n",
" bigramCounts[(word, exp)] += 1\n",
" else:\n", " else:\n",
" bigramCounts[(word, expected[i])] = 1\n", " bigramCounts[(word, exp)] = 1\n",
" if data[i] in unigramCounts:\n", " if word in unigramCounts:\n",
" unigramCounts[word] += 1\n", " unigramCounts[word] += 1\n",
" else:\n", " else:\n",
" unigramCounts[word] = 1\n", " unigramCounts[word] = 1\n",
@ -79,13 +87,13 @@
" for bigram in listOfBigrams:\n", " for bigram in listOfBigrams:\n",
" word1 = bigram[0]\n", " word1 = bigram[0]\n",
" word2 = bigram[1]\n", " word2 = bigram[1]\n",
" listOfProb[bigram] = (bigramCounts.get(bigram))/(unigramCounts.get(word1))\n", " listOfProb[bigram] = (bigramCounts.get(bigram))/(sum(unigramCounts.values()))\n",
" return listOfProb" " return listOfProb"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 90,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -94,791 +102,43 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 17, "execution_count": 91,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{ "source": [
"output_type": "execute_result", "probs = calcBigramProb(bigrams, uniCounts, biCounts)"
"data": {
"text/plain": [
"'day'): 1.0,\n",
" ('ol', 'powers'): 1.0,\n",
" ('real', 'Asiatic'): 1.0,\n",
" ('perfect', 'and'): 1.0,\n",
" ('stcu-\\\\n', 'i«d'): 1.0,\n",
" ('The', 'commis-'): 1.0,\n",
" ('otr', 'just'): 1.0,\n",
" ('for', 'men.'): 1.0,\n",
" ('She', 'his'): 1.0,\n",
" ('in\\\\n', 'thccityol'): 1.0,\n",
" ('j\\\\n', 'ons'): 1.0,\n",
" ('tlio', 'Convention'): 1.0,\n",
" ('9rdick', 'is'): 1.0,\n",
" (\"'s\", 'weight'): 1.0,\n",
" ('Charleston', 'to'): 1.0,\n",
" ('foaming', 'beasts'): 1.0,\n",
" ('of\\\\n', 'these'): 1.0,\n",
" ('pay\\\\nin', 'that'): 1.0,\n",
" ('from', '$1'): 1.0,\n",
" ('quack', 'medicines.'): 1.0,\n",
" ('found', 'neces-'): 1.0,\n",
" ('west', 'lino'): 1.0,\n",
" ('to\\\\nsay', 'whether'): 1.0,\n",
" ('away\\\\nuntil', 'the'): 1.0,\n",
" ('givo', 'it'): 1.0,\n",
" ('from', 'the'): 12.0,\n",
" (',', 'too,'): 1.0,\n",
" ('paws\\\\nhad', 'been'): 1.0,\n",
" ('organs', ';'): 1.0,\n",
" ('capitalists', 'great'): 1.0,\n",
" (',', 'whllt'): 1.0,\n",
" ('guarantees', 'the'): 1.0,\n",
" ('than\\\\ndo', 'so,'): 1.0,\n",
" ('\\\\n', 'And'): 1.0,\n",
" ('support\\\\n', '1'): 1.0,\n",
" ('legislative', 'body'): 1.0,\n",
" ('\\\\n', 'as'): 4.0,\n",
" ('fishes.\\\\n', 'We'): 1.0,\n",
" ('Tho', 'night'): 1.0,\n",
" ('wages', 'of'): 1.0,\n",
" ('of', 'returning'): 1.0,\n",
" ('are', 'for'): 1.0,\n",
" ('w.s', 'recently'): 1.0,\n",
" ('te', 'the'): 1.0,\n",
" ('the', 'finances,'): 1.0,\n",
" ('that', 'Imrns'): 1.0,\n",
" ('a', 'trust'): 1.0,\n",
" ('next', 'year,'): 1.0,\n",
" ('ready\\\\n', 'to'): 1.0,\n",
" ('nt', 'a'): 1.0,\n",
" ('Noth-\\\\n', 'ng,'): 1.0,\n",
" ('agree', 'on'): 1.0,\n",
" ('were', 'present.'): 1.0,\n",
" ('Ills', 'country,'): 1.0,\n",
" ('crossing\\\\nBoar', 'River;'): 1.0,\n",
" ('the', 'place'): 3.0,\n",
" ('This', 'reception'): 1.0,\n",
" ('de-\\\\n', 'pendent'): 1.0,\n",
" ('tribunals', 'will'): 1.0,\n",
" ('a', 'steamer'): 1.0,\n",
" ('I', 'am'): 3.0,\n",
" (',', 'at'): 3.0,\n",
" ('on', 'or'): 1.0,\n",
" ('preached', 'on'): 1.0,\n",
" ('of\\\\n', 'things'): 1.0,\n",
" ('for', 'the'): 13.0,\n",
" (']', 'e'): 1.0,\n",
" ('the', 'transit'): 1.0,\n",
" ('hy\\\\n', 'which'): 1.0,\n",
" ('opinion', 'that'): 1.0,\n",
" ('.', 'At'): 3.0,\n",
" ('', 'I'): 1.0,\n",
" ('and\\\\n', 'and'): 1.0,\n",
" ('atrango', 'positions'): 1.0,\n",
" ('pn', 'Tues\\xad'): 1.0,\n",
" ('and', 'all'): 4.0,\n",
" ('is', 'honestly'): 1.0,\n",
" ('honorable', 'citizens.'): 1.0,\n",
" ('per', 'cent'): 1.0,\n",
" ('riveting', 'it,'): 1.0,\n",
" ('the\\\\nCommissioner', 'of'): 1.0,\n",
" ('separating', 'from'): 1.0,\n",
" ('are', 'startled'): 1.0,\n",
" ('voice', 'for'): 1.0,\n",
" ('when', 'thete'): 1.0,\n",
" ('\\\\nformed', 'the'): 1.0,\n",
" ('remembered', 'that'): 1.0,\n",
" ('#', 'of'): 1.0,\n",
" ('very', 'process'): 1.0,\n",
" ('would', 'gel'): 1.0,\n",
" ('me', 'with'): 1.0,\n",
" ('aflllcted', 'with'): 1.0,\n",
" ('ulti\\\\n', 'mate'): 1.0,\n",
" ('utmost\\\\nstretch', 'of'): 1.0,\n",
" ('-rstand', 'much'): 1.0,\n",
" ('was', 'after'): 1.0,\n",
" ('Its\\\\npresent', 'strength,'): 1.0,\n",
" ('the\\\\n', 'peiiod'): 1.0,\n",
" ('son\\\\nhas', 'been'): 1.0,\n",
" ('is', 'covered'): 1.0,\n",
" ('Female\\\\n', 'Seminary;'): 1.0,\n",
" ('they', 'used'): 1.0,\n",
" ('out-\\\\n', 'numbered'): 1.0,\n",
" ('came\\\\nfrom', 'heavy'): 1.0,\n",
" ('a\\\\n', 'pall'): 1.0,\n",
" ('adranoe', 'guard.'): 1.0,\n",
" ('some\\xad\\\\n', 'what'): 1.0,\n",
" ('tho', 'sulo'): 1.0,\n",
" ('upper', 'navigation'): 1.0,\n",
" ('whs\\\\n', 'therefore'): 1.0,\n",
" ('against', 'the'): 2.0,\n",
" ('expendi-\\\\n', 'tures'): 1.0,\n",
" ('treason\\\\nwith', 'indignant'): 1.0,\n",
" ('very', 'strong'): 1.0,\n",
" ('mil-\\\\n', 'lions'): 1.0,\n",
" ('rights', 'supported'): 1.0,\n",
" (',', 'shall'): 2.0,\n",
" ('Moat\\\\nbeing', 'piloted'): 1.0,\n",
" ('be-\\\\n', 'ing'): 2.0,\n",
" ('little', 'barefooted'): 1.0,\n",
" ('of\\\\n', 'developenient'): 1.0,\n",
" ('be-\\\\n', 'Cause'): 1.0,\n",
" (\"'Abraham\\\\n\", 'llu.li'): 1.0,\n",
" ('the', 'estate'): 2.0,\n",
" ('his', 'friends'): 1.0,\n",
" ('P', \"('nderwood.a\"): 1.0,\n",
" ('the\\\\n', 'roof'): 1.0,\n",
" ('Cwk', 'r-;'): 1.0,\n",
" ('.', \"Johnson's\"): 1.0,\n",
" (';', 'while'): 1.0,\n",
" ('*', 'of'): 1.0,\n",
" ('have', 'power'): 1.0,\n",
" ('in', 'search'): 1.0,\n",
" ('and\\\\nthe', 'principal'): 1.0,\n",
" ('HAY.the\\\\nlast', 'named'): 1.0,\n",
" ('have', 'occasioned;'): 1.0,\n",
" ('the\\\\nWestern', 'boundary'): 1.0,\n",
" (',', 'whose'): 1.0,\n",
" ('be', 'visited'): 1.0,\n",
" ('\\\\n', 'and'): 13.0,\n",
" ('party', 'zeal;—and'): 1.0,\n",
" ('exercise', 'over'): 1.0,\n",
" ('foremost', 'in'): 1.0,\n",
" ('three', 'pounds'): 1.0,\n",
" ('exceed-\\\\n', 'ed'): 1.0,\n",
" ('could', 'find,'): 1.0,\n",
" ('-', '.'): 1.0,\n",
" ('could\\\\n', 'f.'): 1.0,\n",
" ('our', 'power.'): 1.0,\n",
" ('intimated', 'that'): 1.0,\n",
" ('Thomas', 'J.'): 2.0,\n",
" ('laid', 'me'): 1.0,\n",
" ('that', 'there'): 3.0,\n",
" ('that', 'on'): 1.0,\n",
" ('it', 'to'): 2.0,\n",
" ('Mc-\\\\n', 'Xamara.'): 1.0,\n",
" ('and\\\\nthrough', 'the'): 1.0,\n",
" ('trembled', 'with'): 1.0,\n",
" ('chin-imps.\\\\nling', 'him,as'): 1.0,\n",
" ('the', 'agricultural'): 1.0,\n",
" ('have', 'proceeded'): 1.0,\n",
" (',', 'P.'): 1.0,\n",
" ('rlin', 'sale'): 1.0,\n",
" ('sight', 'of'): 2.0,\n",
" ('\\\\n', 'rails'): 1.0,\n",
" ('know\\\\nof', 'no'): 1.0,\n",
" ('.', 'MCI).Bids.'): 1.0,\n",
" ('member', 'of'): 1.0,\n",
" ('\\\\nand', 'they'): 1.0,\n",
" (\"'\", 'of'): 1.0,\n",
" ('the', 'ground;'): 1.0,\n",
" (',', 'and'): 88.0,\n",
" ('traverses', 'the'): 1.0,\n",
" ('could', 'lie'): 1.0,\n",
" ('sounded', 'by'): 1.0,\n",
" ('kianr.h', 'of'): 1.0,\n",
" ('be\\\\n', 'promptly'): 1.0,\n",
" ('what', 'history'): 1.0,\n",
" ('high', 'taxation.'): 1.0,\n",
" ('they', 'will'): 4.0,\n",
" ('with', 'the'): 13.0,\n",
" ('ho', '\"would'): 1.0,\n",
" (\"'s\", 'hole'): 1.0,\n",
" ('these', 'circumstances.'): 1.0,\n",
" ('n', 'belief'): 1.0,\n",
" ('because', 'they'): 1.0,\n",
" ('.', 'When'): 2.0,\n",
" (',', 'a'): 12.0,\n",
" ('as', 'far'): 2.0,\n",
" ('the', 'means'): 3.0,\n",
" ('receive', 'as'): 1.0,\n",
" ('business', 'of'): 1.0,\n",
" ('them.\\\\nWhy', 'should'): 1.0,\n",
" ('tangled', 'thickets'): 1.0,\n",
" (',', \"Blannerhasset's\"): 1.0,\n",
" ('happiness', 'of'): 1.0,\n",
" ('district', 'of'): 1.0,\n",
" ('\\\\n', 'chosen'): 1.0,\n",
" ('lias', 'sustained'): 1.0,\n",
" ('mid', 'large'): 1.0,\n",
" ('work', 'now.\"'): 1.0,\n",
" ('certified', 'by'): 1.0,\n",
" ('were', 'entitled,'): 1.0,\n",
" ('submission', 'fb'): 1.0,\n",
" ('bind¬\\\\n', 'ing'): 1.0,\n",
" ('or\\\\n', 'in'): 1.0,\n",
" ('vir\\xad\\\\n', 'tue'): 1.0,\n",
" ('asking', 'that'): 1.0,\n",
" ('as', 'there'): 2.0,\n",
" ('immediate', 'approval'): 1.0,\n",
" ('thcro\\\\nwas', 'not'): 1.0,\n",
" ('kind', 'of'): 2.0,\n",
" ('us\\\\n', 'ow'): 1.0,\n",
" ('thence', 'North'): 1.0,\n",
" ('bring-\\\\n', 'ing'): 1.0,\n",
" ('.', '-Cut'): 1.0,\n",
" ('a', 'supper'): 1.0,\n",
" ('surroundings.\\\\n', '“Youre'): 1.0,\n",
" ('the', 'men'): 3.0,\n",
" ('Is\\\\n', 'formed'): 1.0,\n",
" ('intrinsic', 'value'): 1.0,\n",
" ('ordinance.\\\\n', 'Section'): 1.0,\n",
" ('tojustily', 'Protection,'): 1.0,\n",
" ('scramble\\\\n', 'for'): 1.0,\n",
" ('s', 'part,'): 1.0,\n",
" ('both', 'to'): 1.0,\n",
" ('with\\\\nthe', 'idea'): 1.0,\n",
" ('tell', 'of'): 1.0,\n",
" ('gone', 'by'): 1.0,\n",
" ('the', 'liberty'): 1.0,\n",
" ('they', 'lived'): 1.0,\n",
" ('skating\\\\n', 'rink,'): 1.0,\n",
" (',', 'se'): 1.0,\n",
" ('work', 'and'): 2.0,\n",
" ('last\\\\n', 'sale,'): 1.0,\n",
" ('modern', 'times'): 1.0,\n",
" ('Icould', 'see'): 1.0,\n",
" ('they', 'led'): 1.0,\n",
" ('all\\\\nalong', 'the'): 1.0,\n",
" ('of\\\\n', 'all'): 2.0,\n",
" ('1919', 'certificate'): 1.0,\n",
" ('from', 'my'): 1.0,\n",
" ('that', 'the'): 9.0,\n",
" ('aeenmed', 'that'): 1.0,\n",
" ('on', 'Wednesday,'): 2.0,\n",
" ('kept\\\\n', 'up'): 2.0,\n",
" ('after', 'tea'): 1.0,\n",
" ('equipping', 'each'): 1.0,\n",
" ('mode\\\\n', 'of'): 1.0,\n",
" ('the\\\\n', 'ocean,'): 1.0,\n",
" ('.', 'Messersmith'): 1.0,\n",
" ('not\\\\n', 'there,'): 1.0,\n",
" ('n', 'fam-'): 1.0,\n",
" ('beg', 'this'): 1.0,\n",
" ('to', 'attempt'): 1.0,\n",
" ('and\\\\n', 'the'): 5.0,\n",
" ('they\\\\n', 'were'): 1.0,\n",
" ('be', 'more'): 1.0,\n",
" ('ui.cieisioi.ti\\\\n', 'b\\\\'): 1.0,\n",
" ('our\\\\n', 'constitutional'): 1.0,\n",
" ('to', 'die'): 1.0,\n",
" ('and', 'honest'): 1.0,\n",
" ('bo', 'too'): 1.0,\n",
" ('to', 'take'): 2.0,\n",
" (',', 'Thomas'): 1.0,\n",
" ('many', 'months'): 1.0,\n",
" ('150', 'of'): 1.0,\n",
" ('Billy\\\\n', 'one'): 1.0,\n",
" ('.', 'His'): 2.0,\n",
" ('ancP\\\\n', '67-100'): 1.0,\n",
" ('conduct', 'the'): 1.0,\n",
" ('the', 'small'): 1.0,\n",
" ('by', 'ample'): 1.0,\n",
" ('be', 'well'): 3.0,\n",
" ('speak', 'of'): 1.0,\n",
" ('de-\\\\n', 'termined'): 1.0,\n",
" ('window.\\\\n', 'I'): 1.0,\n",
" ('not', 'bo'): 1.0,\n",
" ('train', 'hearing'): 1.0,\n",
" ('minds\\\\nof', 'men'): 1.0,\n",
" ('supply', 'this'): 1.0,\n",
" ('.', 'Also'): 1.0,\n",
" ('we', 'sought'): 1.0,\n",
" ('chief', 'houor'): 1.0,\n",
" ('road', 'was'): 1.0,\n",
" ('strikes', 'in'): 1.0,\n",
" ('at', \"Hawkins'\"): 1.0,\n",
" ('sure', 'that'): 1.0,\n",
" ('lelease\\\\n', 'he'): 1.0,\n",
" ('in\\\\neach', 'year,'): 1.0,\n",
" ('proclaimed\\\\n', 'by'): 1.0,\n",
" ('polirira\\\\n', 'Course'): 1.0,\n",
" ('the', 'advance'): 1.0,\n",
" ('elaborate', 'frescoing'): 1.0,\n",
" ('work', 'nnd'): 1.0,\n",
" ('pension', 'frauds'): 1.0,\n",
" ('was', 'in'): 1.0,\n",
" ('.', 'Gen.'): 1.0,\n",
" ('west\\\\n', 'coast,'): 1.0,\n",
" ('where', 'Charles'): 1.0,\n",
" ('the', 'stare'): 1.0,\n",
" ('for', 'all'): 1.0,\n",
" ('.', 'But'): 4.0,\n",
" ('the', 'Committee'): 2.0,\n",
" ('so', 'as'): 4.0,\n",
" ('upon\\\\n', 'their'): 1.0,\n",
" ('from', 'just'): 1.0,\n",
" ('the', 'valley'): 3.0,\n",
" ('\\\\n', 'lie'): 1.0,\n",
" ('You\\\\n', 'must'): 1.0,\n",
" ('harvest.\\\\n', 'The'): 1.0,\n",
" (',', 'will,'): 1.0,\n",
" ('—', 'Burleigh'): 1.0,\n",
" (',', 'of'): 5.0,\n",
" ('of', 'fun-'): 1.0,\n",
" ('are', 'gen-'): 1.0,\n",
" ('he', 'was'): 4.0,\n",
" (',', 'another'): 1.0,\n",
" ('\\\\nand', 'possession'): 1.0,\n",
" ('to\\\\n', 'gude'): 1.0,\n",
" ('come.\\\\n', 'The'): 1.0,\n",
" ('\\\\nthat', 'it'): 1.0,\n",
" ('think\\\\nIt', 'a'): 1.0,\n",
" ('the', 'receipt'): 1.0,\n",
" ('those', 'districts,'): 1.0,\n",
" ('firing\\\\n', 'pan'): 1.0,\n",
" ('contempt', 'of'): 1.0,\n",
" ('George\\\\n', 'J.'): 1.0,\n",
" ('of', '*00'): 1.0,\n",
" ('the\\\\nhomestead', 'might'): 1.0,\n",
" ('se\\\\nwho', 'have'): 1.0,\n",
" ('place', 'the'): 1.0,\n",
" ('for\\\\n', '1916'): 1.0,\n",
" (',', 'which'): 9.0,\n",
" ('that', 'their'): 1.0,\n",
" ('the', 'gen-'): 2.0,\n",
" ('would\\\\n', 'do'): 1.0,\n",
" ('be', 're-'): 1.0,\n",
" ('unharmed.\\\\n', 'I'): 1.0,\n",
" ('a\\\\n', 'substitute'): 1.0,\n",
" ('n\\\\n', 'prodigy'): 1.0,\n",
" ('switchmen', 'to-'): 1.0,\n",
" ('rest', 'in'): 1.0,\n",
" ('when\\\\n', 'heavily'): 1.0,\n",
" ('that', 'question,'): 1.0,\n",
" ('capable', 'of'): 1.0,\n",
" ('to', 'lupn'): 1.0,\n",
" ('very', 'in\"-*\\''): 1.0,\n",
" ('.', 'In'): 4.0,\n",
" ('phenomenal', 'growth.'): 1.0,\n",
" ('hut', 'we'): 1.0,\n",
" ('that', 'he'): 3.0,\n",
" ('the', 'earliest'): 1.0,\n",
" ('the', 'gun,'): 1.0,\n",
" ('I', 'know'): 2.0,\n",
" ('about', 'the'): 2.0,\n",
" ('to', 'President'): 1.0,\n",
" ('The', 'British'): 1.0,\n",
" ('the', 'colored'): 1.0,\n",
" ('recent', 'au-'): 1.0,\n",
" ('new', 'proprietary'): 1.0,\n",
" ('a', 'village'): 1.0,\n",
" ('will\\\\n', 'equaliy'): 1.0,\n",
" ('but', 'also'): 1.0,\n",
" ('\\\\n', 'No'): 1.0,\n",
" ('be\\\\nenforced', 'with'): 1.0,\n",
" ('statement', 'is'): 1.0,\n",
" ('questioning', 'the'): 1.0,\n",
" ('ihe\\\\n', 'I'): 1.0,\n",
" ('he', 'appreciate'): 1.0,\n",
" ('MortunY', 'Traveller'): 1.0,\n",
" ('was', 'to'): 1.0,\n",
" ('once', 'formed'): 1.0,\n",
" ('traduced', 'her'): 1.0,\n",
" (',', 'according'): 2.0,\n",
" (',', 'alwavs'): 1.0,\n",
" ('6|d', ':'): 1.0,\n",
" ('\\\\n', 'which'): 4.0,\n",
" ('were', 'fit'): 1.0,\n",
" ('references\\\\n', 'to'): 1.0,\n",
" ('of\\\\n', 'feet'): 1.0,\n",
" ('brick', 'fioor'): 1.0,\n",
" ('was', 'not'): 4.0,\n",
" ('is', 'more'): 1.0,\n",
" ('consider', 'it'): 2.0,\n",
" ('The', 'latter'): 1.0,\n",
" ('in', 'a'): 8.0,\n",
" ('good\\\\nname', 'and'): 1.0,\n",
" (',', 'trom'): 1.0,\n",
" ('several\\\\njoints', 'were'): 1.0,\n",
" ('the', 'party'): 1.0,\n",
" ('not', 'know'): 2.0,\n",
" ('\\\\n', '1913,'): 2.0,\n",
" ('is', 'possible'): 1.0,\n",
" ('is', 'seven'): 1.0,\n",
" ('powerful', 'foree'): 1.0,\n",
" ('a\\\\ngood', 'jilace,'): 1.0,\n",
" ('\\\\nit', 'the'): 1.0,\n",
" ('for\\\\nalUr', 'taking'): 1.0,\n",
" ('anil', 'did'): 1.0,\n",
" ('defeat', 'the'): 1.0,\n",
" ('attempts', 'lo'): 1.0,\n",
" ('State', 'of'): 1.0,\n",
" ('a\\\\n', 'quaint'): 1.0,\n",
" (',', 'doing'): 1.0,\n",
" ('Frias\\\\nhis', 'Minister'): 1.0,\n",
" ('moon\\\\nlight', 'nights'): 1.0,\n",
" ('pav', 't«xthr'): 1.0,\n",
" ('proposition', 'was'): 1.0,\n",
" ('Into\\\\n', 'the'): 1.0,\n",
" ('ask', 'him'): 1.0,\n",
" ('Houses', 'of'): 1.0,\n",
" ('be', 'brought'): 1.0,\n",
" ('engaged', 'in'): 1.0,\n",
" ('Dos-\\\\n', 'well'): 1.0,\n",
" ('of', 'depriving'): 1.0,\n",
" ('from', 'a'): 2.0,\n",
" ('return', 'to'): 1.0,\n",
" ('pay-\\\\n', 'ment'): 1.0,\n",
" ('fact', 'that'): 1.0,\n",
" ('night\\\\nThe', 'strictest'): 1.0,\n",
" ('\\\\n', '<'): 1.0,\n",
" ('*', 'before'): 1.0,\n",
" ('Australia', 'live'): 1.0,\n",
" ('a', 'majority,'): 1.0,\n",
" ('the\\\\n', 'arc'): 1.0,\n",
" ('the\\\\n', 'result'): 1.0,\n",
" ('heartily\\\\n', 'accede'): 1.0,\n",
" ('years', 'ago'): 1.0,\n",
" ('pa\\\\n', 'yers,'): 1.0,\n",
" ('\\\\n', 'gave'): 1.0,\n",
" ('that', 'lien'): 1.0,\n",
" ('deny', 'that'): 1.0,\n",
" (',', '1884.'): 1.0,\n",
" ('in', 'this'): 2.0,\n",
" ('inform\\\\n', 'Congress'): 1.0,\n",
" ('stoop', 'to'): 1.0,\n",
" ('transferred', 'the'): 1.0,\n",
" ('\\\\n', 'has'): 1.0,\n",
" ('by', 'the'): 13.0,\n",
" ('event\\\\n', 'an'): 1.0,\n",
" ('rn\\\\n', 'than'): 1.0,\n",
" ('advan-\\\\n', 'tages.'): 1.0,\n",
" ('at', 'tho'): 1.0,\n",
" ('yard\\\\n', 'when'): 1.0,\n",
" ('only\\\\nsuffeicr', 'nolmdy'): 1.0,\n",
" ('the', 'Indians'): 1.0,\n",
" ('And', 'unless'): 1.0,\n",
" ('ordinary', 'American'): 1.0,\n",
" ('death.\\\\n', '“Jesus'): 1.0,\n",
" ('to', 'ascend'): 1.0,\n",
" ('but', 'those'): 1.0,\n",
" ('death.\\\\n', 'We'): 1.0,\n",
" ('it', 'he'): 1.0,\n",
" ('are\\\\n', 'not'): 1.0,\n",
" ('the', 'coura'): 1.0,\n",
" ('wastes\\\\nheat', 'most'): 1.0,\n",
" ('as', 'Vice'): 1.0,\n",
" ('claimants', 'of'): 1.0,\n",
" ('returned', 'tt)'): 1.0,\n",
" ('San', 'Francisco'): 1.0,\n",
" ('to', 'civil'): 1.0,\n",
" ('Hanks\\\\n', 'of'): 1.0,\n",
" ('wife—as\\\\n', 'she'): 1.0,\n",
" (',', '7'): 1.0,\n",
" ('tri-\\\\nfle', 'taller'): 1.0,\n",
" ('him', 'a'): 1.0,\n",
" (',', 'Miss'): 1.0,\n",
" ('it', 'has'): 3.0,\n",
" ('will', 'erect'): 1.0,\n",
" ('testified', 'that'): 1.0,\n",
" ('seed', 'potatoes,'): 1.0,\n",
" ('control', 'over'): 1.0,\n",
" ('ap\\\\n', 'proved,'): 1.0,\n",
" ('have', 'added'): 1.0,\n",
" ('cents', 'for'): 1.0,\n",
" ('committee\\\\n', 'and'): 1.0,\n",
" ('of', 'such'): 2.0,\n",
" ('and', 'that'): 2.0,\n",
" ('District\\\\nNumber', '7,'): 1.0,\n",
" ('every\\\\nand', 'he'): 1.0,\n",
" ('union', 'of'): 1.0,\n",
" ('the', 'oilier'): 1.0,\n",
" ('and', 'hopelessly'): 1.0,\n",
" ('the', 'notes'): 1.0,\n",
" ('the', 'floor'): 1.0,\n",
" ('with', 'tons'): 1.0,\n",
" ('disregard\\\\n', '«»f'): 1.0,\n",
" ('at\\\\n', 'Per'): 1.0,\n",
" ('\\\\ncountry', 'will'): 1.0,\n",
" ('yet\\\\nthe', 'ball'): 1.0,\n",
" ('.', 'Glick'): 1.0,\n",
" ('any', 'district'): 1.0,\n",
" ('is', 'qualified'): 1.0,\n",
" ('to', 'wait'): 1.0,\n",
" ('and\\\\n', 'that'): 2.0,\n",
" ('west', 'by'): 1.0,\n",
" (',', ';ynl'): 1.0,\n",
" ('different\\\\n', 'kinds'): 1.0,\n",
" ('benefit', 'he'): 1.0,\n",
" ('initiaiiec', 'wiil'): 1.0,\n",
" ('a', 'noise'): 1.0,\n",
" (',', 'l/ic~M'): 1.0,\n",
" ('Bv\\\\n', 'this'): 1.0,\n",
" ('the', 'respec\\xad'): 1.0,\n",
" ('cash', 'in'): 1.0,\n",
" ('scarce', 'bear'): 1.0,\n",
" ('coin\\\\nfinely', 'into'): 1.0,\n",
" ('now', 'to'): 1.0,\n",
" ('relief', 'that'): 1.0,\n",
" ('at\\\\n', 'Aldershot'): 1.0,\n",
" ('benefit', 'of'): 2.0,\n",
" ('the', 'nations'): 1.0,\n",
" ('of\\\\n', 'her'): 1.0,\n",
" ('after', 'the'): 3.0,\n",
" ('will', 'he'): 1.0,\n",
" ('That', 'all'): 1.0,\n",
" ('reach\\\\nLloyd', 'George'): 1.0,\n",
" ('Judg\\\\n', 'ing'): 1.0,\n",
" ('of', '\"Vic'): 1.0,\n",
" ('of', 'Pensaco-'): 1.0,\n",
" (',', 'although'): 1.0,\n",
" ('by', 'their'): 1.0,\n",
" ('xan\\\\n', 'raise'): 1.0,\n",
" (',', 'Jno'): 1.0,\n",
" ('of\\\\n', 'Congrtss.'): 1.0,\n",
" ('mind', 'of'): 1.0,\n",
" (',', 'light'): 1.0,\n",
" ('strange', 'that'): 1.0,\n",
" ('sullied', 'its'): 1.0,\n",
" ('easy', 'and'): 1.0,\n",
" ('The', 'case,'): 1.0,\n",
" (',', 'without'): 2.0,\n",
" ('it', 'would'): 3.0,\n",
" ('feet\\\\nless', 'altitude'): 1.0,\n",
" ('fall', 'and'): 1.0,\n",
" ('castles\\\\n', 'which'): 1.0,\n",
" ('there', 'is'): 4.0,\n",
" ('In', 'the'): 6.0,\n",
" ('his', 'duly'): 2.0,\n",
" ('I\\\\nhad', 'seen'): 1.0,\n",
" ('all', 'his'): 1.0,\n",
" ('who\\\\n', 'is'): 1.0,\n",
" ('Grand', 'Prize'): 1.0,\n",
" ('he', 'entcitaiued,'): 1.0,\n",
" ('sentiments', 'to-'): 1.0,\n",
" ('court\\\\n', 'and'): 1.0,\n",
" ('turh', 'uias:niliccul'): 1.0,\n",
" ('capacities', 'for'): 1.0,\n",
" ('the', 'hospital.'): 1.0,\n",
" ('armies\\\\n', 'and'): 1.0,\n",
" ('the', 'bond,'): 1.0,\n",
" (')', 'from'): 1.0,\n",
" ('and', 'competent,'): 1.0,\n",
" (',', 'or'): 9.0,\n",
" ('to\\\\n', 'go'): 1.0,\n",
" ('that\\\\n', 'ordinarily'): 1.0,\n",
" ('duty.\\\\n', 'Resolved,'): 1.0,\n",
" ('an', 'infetlor'): 1.0,\n",
" ('enough', 'of'): 1.0,\n",
" ('his', 'native'): 1.0,\n",
" ('four', 'artillery'): 1.0,\n",
" ('Paul\\\\n', 'half'): 1.0,\n",
" ('the', 'appearance'): 2.0,\n",
" ('inha-\\\\nbitants', 'on'): 1.0,\n",
" ('and', 'resolutions'): 1.0,\n",
" ('anil', 'exhibits,'): 1.0,\n",
" ('keeping', 'myself'): 1.0,\n",
" ('hand', 'of'): 1.0,\n",
" ('as', 'I'): 1.0,\n",
" (',', 'never'): 1.0,\n",
" ('my\\\\n', 'soda'): 1.0,\n",
" ('pro.\\\\nclaiming', 'that'): 1.0,\n",
" ('sta-\\\\ntioned', 'in'): 1.0,\n",
" (',', 'the'): 16.0,\n",
" ('F.iurth\\\\n', 'avenue'): 1.0,\n",
" ('remote', 'objects,'): 1.0,\n",
" ('certain', 'Masons'): 1.0,\n",
" ('Jefferson', ':'): 1.0,\n",
" ('to', 'participate'): 2.0,\n",
" ('the', 'farmer'): 1.0,\n",
" ('the', 'city'): 5.0,\n",
" ('Supreme', 'Judge,'): 1.0,\n",
" ('City', 'authorities'): 1.0,\n",
" ('.', 'The'): 22.0,\n",
" ('mar\\\\n', 'vellous'): 1.0,\n",
" ('distant', 'as'): 1.0,\n",
" ('with-\\\\nin', 'the'): 1.0,\n",
" ('Blundle', 'Maple'): 1.0,\n",
" ('e', ',o'): 1.0,\n",
" ('much', 'surprised'): 1.0,\n",
" ('moro\\\\n', 'wanton'): 1.0,\n",
" ('divided\\\\n', 'among'): 1.0,\n",
" ('the', 'tame'): 1.0,\n",
" ('manager', 'qf'): 1.0,\n",
" ('by\\\\n', 'Mrs.'): 1.0,\n",
" ('young\\\\n', 'theologians'): 1.0,\n",
" ('the', 'lib*'): 1.0,\n",
" ('recent', 'homo'): 1.0,\n",
" ('said\\\\n', 'administrator'): 1.0,\n",
" ('It', 'has'): 1.0,\n",
" (',', 'Concentrated'): 1.0,\n",
" ('the', 'sugar'): 1.0,\n",
" ('little', 'bov'): 1.0,\n",
" ('min', 'W'): 1.0,\n",
" ('dupes', 'of'): 1.0,\n",
" ('One', 'of'): 2.0,\n",
" ('Mr.\\\\n', 'Bo:tj'): 1.0,\n",
" ('Bunker', 'Hill'): 1.0,\n",
" ('judgment', 'Bhall'): 1.0,\n",
" ('In', 'part:'): 1.0,\n",
" ('to', 'file'): 1.0,\n",
" ('New', 'Jersey'): 1.0,\n",
" (',', 'she'): 2.0,\n",
" ('tasacl', 'out,'): 1.0,\n",
" ('of', 'alarm'): 1.0,\n",
" ('it', 'in'): 2.0,\n",
" ('.', 'W.'): 1.0,\n",
" ('and', 'computed'): 1.0,\n",
" ('complaint', 'came'): 1.0,\n",
" ('and', 'not'): 1.0,\n",
" ('distrust\\\\n', 'each'): 1.0,\n",
" ('knew', 'how'): 1.0,\n",
" ('In', '1898'): 1.0,\n",
" ('republicanism', 'must'): 1.0,\n",
" ('posi\\xad\\\\n', 'tion'): 1.0,\n",
" ('break\\\\n', 'troth'): 1.0,\n",
" ('.\\\\n', 'Diploma'): 1.0,\n",
" ('did', 'Ia6t.'): 1.0,\n",
" ('is', 'real,'): 1.0,\n",
" ('that\\\\n', 'Cato,'): 1.0,\n",
" ('.', 'in'): 1.0,\n",
" ('throwing', 'off'): 1.0,\n",
" ('have', 'been'): 9.0,\n",
" ('Folding', 'Cae'): 1.0,\n",
" ('will\\\\nhave', 'a'): 1.0,\n",
" ('re\\\\n', 'garding'): 1.0,\n",
" ('but\\\\n', 'trom'): 1.0,\n",
" ('walked', 'be-'): 1.0,\n",
" ('swept', 'away'): 1.0,\n",
" ('until\\\\n', 'fairly'): 1.0,\n",
" ('awakened', 'by'): 1.0,\n",
" ('tha\\\\n', 'smallest'): 1.0,\n",
" ('prop', 'r'): 1.0,\n",
" ('is', 'kept'): 2.0,\n",
" ('circumstance.\\\\n', 'Had'): 1.0,\n",
" ('are\\\\nvariant—some', 'saying'): 1.0,\n",
" ('when', 'president.'): 1.0,\n",
" ('j\\\\nmeat', 'oj'): 1.0,\n",
" ('help', 'ye'): 1.0,\n",
" ('on\\\\neach', 'one'): 1.0,\n",
" ('by\\\\n', 'the'): 3.0,\n",
" ('four', 'thousand'): 2.0,\n",
" ('quarrH', '-'): 1.0,\n",
" ('\\\\n', 'muscular,'): 1.0,\n",
" ('his', 'own'): 3.0,\n",
" ('Maryland\\\\nRailroad', 'Company,'): 1.0,\n",
" ('!', 'Innocent'): 1.0,\n",
" ('eradicating', 'all'): 1.0,\n",
" (',', 'that'): 14.0,\n",
" ('damage', 'done,'): 1.0,\n",
" ('lie', 'knew'): 1.0,\n",
" ('get', 'seed,'): 1.0,\n",
" ('the', 'bidder,'): 2.0,\n",
" ('and', 'the'): 18.0,\n",
" ('ef-\\\\n', 'fective.'): 1.0,\n",
" ('the\\\\n', 'extent'): 1.0,\n",
" ('quite', 'firm.'): 1.0,\n",
" ('Away\\\\nfrom', 'home'): 1.0,\n",
" (',', 'have'): 2.0,\n",
" ('the\\\\n', 'means'): 1.0,\n",
" ('indifference.\\\\n', 'Tuberculosis'): 1.0,\n",
" ('home\\\\n', 'with'): 1.0,\n",
" ('never', 'have'): 1.0,\n",
" ('to', 'show'): 1.0,\n",
" ('thence\\\\n', 'south'): 1.0,\n",
" ('laid', 'in'): 1.0,\n",
" ('and\\\\nTruekee', 'Kailroad'): 1.0,\n",
" ('.', 'Brown'): 1.0,\n",
" ('the', 'death'): 2.0,\n",
" ('horses', 'Mrs.'): 1.0,\n",
" ('statistics', 'in'): 1.0,\n",
" ('on', 'timothy'): 1.0,\n",
" ('shall', 'be'): 8.0,\n",
" ('the', 'prejudices'): 1.0,\n",
" ('stallion\\\\n', '|'): 1.0,\n",
" ('half', 'of'): 1.0,\n",
" ('it', 'is'): 9.0,\n",
" ('do', 'it;'): 1.0,\n",
" ('declares', 'that'): 1.0,\n",
" ('themselves', 'f-atriots'): 1.0,\n",
" ('which', 'you'): 2.0,\n",
" ('this\\\\n', 'tide'): 1.0,\n",
" ('en\\xad\\\\ntitled', 'to'): 1.0,\n",
" ('all', 'round.'): 1.0,\n",
" ('the', 'free.'): 1.0,\n",
" (',', 'be'): 1.0,\n",
" ('oi', 'till'): 1.0,\n",
" ('utterest\\\\n', 'scorn'): 1.0,\n",
" ('was', 'over'): 1.0,\n",
" ('the', 'besl'): 1.0,\n",
" ('third\\\\nof', 'the*'): 1.0,\n",
" ('admission', 'that'): 1.0,\n",
" ('been', 'put'): 1.0,\n",
" ('164', 'perchei'): 1.0,\n",
" ('no', 'an-'): 1.0,\n",
" ('twenty', 'lings'): 1.0,\n",
" ('It', 'is'): 6.0,\n",
" ('hard', 'to'): 1.0,\n",
" (',', 'is'): 4.0,\n",
" ('sides', 'and'): 1.0,\n",
" ('deprived', 'the'): 1.0,\n",
" ('elo\\xad\\\\n', 'quently'): 1.0,\n",
" ('s', 'who'): 1.0,\n",
" ('advantages\\\\nwould', 'not'): 1.0,\n",
" ('Foushec', 'ft.'): 1.0,\n",
" ('.', 'Clairs'): 1.0,\n",
" ('the\\\\n', 'Young'): 1.0,\n",
" ('was', 'clear,'): 1.0,\n",
" ('food\\\\nfrom', 'a'): 1.0,\n",
" ('not', 'only'): 3.0,\n",
" ('blood-stained', 'weap'): 1.0,\n",
" ('property\\\\n', 'has'): 1.0,\n",
" ('aro', 'perfectly'): 1.0,\n",
" ('the', 'iumily'): 1.0,\n",
" (',', 'reed*,'): 1.0,\n",
" ('just', 'for'): 1.0,\n",
" ('the', 'D'): 1.0,\n",
" ('had', 'evidently'): 1.0,\n",
" ('the', 'measures'): 1.0,\n",
" ('I\\\\n', 'finally'): 1.0,\n",
" ('people', 'of'): 2.0,\n",
" ('the', 'expense'): 1.0,\n",
" ('felt', 'if'): 1.0,\n",
" ('jour-\\\\nnalism', 'which'): 1.0,\n",
" ('as', 'to'): 7.0,\n",
" ('47', 'deg.'): 1.0,\n",
" (\"'\", 'after'): 1.0,\n",
" ('.', 'He'): 11.0,\n",
" ('About\\\\n', 'midnight'): 1.0,\n",
" ('my', 'wife'): 2.0,\n",
" ('effect', 'npon'): 1.0,\n",
" ('throughout', 'the\"'): 1.0,\n",
" ('the', 'last'): 4.0,\n",
" ('of', 'Kennehee'): 1.0,\n",
" ('.V', 'York.'): 1.0,\n",
" ('of\\\\n', 'Inue.'): 1.0,\n",
" ('are', 'awarded'): 1.0,\n",
" ('bound', 'to'): 1.0,\n",
" ('the', 'upper'): 1.0,\n",
" ('species', 'of'): 1.0,\n",
" ('day', 'ol'): 1.0,\n",
" ('a', 'fair'): 3.0,\n",
" ('public', 'trustee'): 1.0,\n",
" ('has', 'become'): 1.0,\n",
" ('lading', 'and,'): 1.0,\n",
" ('Another', 'potent'): 1.0,\n",
" ('to', 'ihe'): 1.0,\n",
" (';', 'the'): 3.0,\n",
" ('a', 'large'): 4.0,\n",
" ('II.\\\\n', 'one'): 1.0,\n",
" ('ben-\\\\n', 'efit'): 1.0,\n",
" ('which\\\\nwaa', 'fastened'): 1.0,\n",
" ('is', 'laid'): 1.0,\n",
" ('in', 'Smith'): 1.0,\n",
" ('the\\\\nbest', 'Mexico'): 1.0,\n",
" ('each', 'three'): 1.0,\n",
" ('a', 'moun-'): 1.0,\n",
" ...}"
] ]
}, },
{
"cell_type": "code",
"execution_count": 112,
"metadata": {}, "metadata": {},
"execution_count": 17 "outputs": [],
}
],
"source": [ "source": [
"probs = calcBigramProb(bigrams, uniCounts, biCounts)\n", "def save_results(probs, in_data):\n",
"probs" " with open(out_path, 'w') as f:\n",
" for i in range(len(in_data)):\n",
" tokenized = word_tokenize(in_data[i])\n",
" word = tokenized[-1]\n",
" word = word.lower()\n",
" word = re.sub('\\W+','', word)\n",
" word_probs = dict(filter(lambda elem: elem[0][0] == word, probs.items()))\n",
" rest = 1.0 - sum(word_probs.values())\n",
" word_probs = list(map(lambda elem: elem[0][0] + \":\" + str(elem[1]), list(word_probs.items())))\n",
" word_probs.append(':'+str(rest))\n",
" word_probs = ' '.join(word_probs)\n",
" f.write(word_probs)"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"save_results(probs, dev_data)"
] ]
} }
] ]