diff --git a/main.ipynb b/main.ipynb index 9c62c6d..69945b3 100644 --- a/main.ipynb +++ b/main.ipynb @@ -24,7 +24,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 88, "metadata": { "tags": [] }, @@ -36,23 +36,28 @@ "# dev_data = pd.read_csv('dev-0/in.tsv', sep='\\t', names=columns, engine='python', quotechar='\"', error_bad_lines=False)\n", "# dev_expected = pd.read_csv('dev-0/expected.tsv', sep='\\t', engine='python', quotechar='\"', error_bad_lines=False)\n", "dev_data = list()\n", - "with open('dev-0/in.tsv', \"r\") as f:\n", + "data_path = 'dev-0/in.tsv'\n", + "expected_path = 'dev-0/expected.tsv'\n", + "out_path = 'dev-0/out.tsv'\n", + "\n", + "with open(data_path, \"r\") as f:\n", " for line in f.readlines():\n", " dev_data.append(line.split('\\t')[-2])\n", "\n", "dev_expected = list()\n", - "with open('dev-0/expected.tsv', \"r\") as f:\n", + "with open(expected_path, \"r\") as f:\n", " for line in f.readlines():\n", " dev_expected.append(line.replace('\\n',''))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 89, "metadata": {}, "outputs": [], "source": [ "from nltk.tokenize import word_tokenize \n", + "import re\n", "\n", "def createBigram(data, expected):\n", " listOfBigrams = []\n", @@ -62,12 +67,15 @@ " for i in range(len(data)):\n", " tokenized = word_tokenize(data[i])\n", " word = tokenized[-1]\n", - " listOfBigrams.append((word, expected[i]))\n", - " if (word, expected[i]) in bigramCounts:\n", - " bigramCounts[(word, expected[i])] += 1\n", + " word = word.lower()\n", + " word = re.sub('\\W+','', word)\n", + " exp = expected[i].lower()\n", + " listOfBigrams.append((word, exp))\n", + " if (word, exp) in bigramCounts:\n", + " bigramCounts[(word, exp)] += 1\n", " else:\n", - " bigramCounts[(word, expected[i])] = 1\n", - " if data[i] in unigramCounts:\n", + " bigramCounts[(word, exp)] = 1\n", + " if word in unigramCounts:\n", " unigramCounts[word] += 1\n", " else:\n", " unigramCounts[word] = 
1\n", @@ -79,13 +87,13 @@ " for bigram in listOfBigrams:\n", " word1 = bigram[0]\n", " word2 = bigram[1]\n", - " listOfProb[bigram] = (bigramCounts.get(bigram))/(unigramCounts.get(word1))\n", + " listOfProb[bigram] = (bigramCounts.get(bigram))/(sum(unigramCounts.values()))\n", " return listOfProb" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 90, "metadata": {}, "outputs": [], "source": [ @@ -94,791 +102,43 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 91, "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "'day'): 1.0,\n", - " ('ol', 'powers'): 1.0,\n", - " ('real', 'Asiatic'): 1.0,\n", - " ('perfect', 'and'): 1.0,\n", - " ('stcu-\\\\n', 'i«d'): 1.0,\n", - " ('The', 'commis-'): 1.0,\n", - " ('otr', 'just'): 1.0,\n", - " ('for', 'men.'): 1.0,\n", - " ('She', 'his'): 1.0,\n", - " ('in\\\\n', 'thccityol'): 1.0,\n", - " ('j\\\\n', 'ons'): 1.0,\n", - " ('tlio', 'Convention'): 1.0,\n", - " ('9rdick', 'is'): 1.0,\n", - " (\"'s\", 'weight'): 1.0,\n", - " ('Charleston', 'to'): 1.0,\n", - " ('foaming', 'beasts'): 1.0,\n", - " ('of\\\\n', 'these'): 1.0,\n", - " ('pay\\\\nin', 'that'): 1.0,\n", - " ('from', '$1'): 1.0,\n", - " ('quack', 'medicines.'): 1.0,\n", - " ('found', 'neces-'): 1.0,\n", - " ('west', 'lino'): 1.0,\n", - " ('to\\\\nsay', 'whether'): 1.0,\n", - " ('away\\\\nuntil', 'the'): 1.0,\n", - " ('givo', 'it'): 1.0,\n", - " ('from', 'the'): 12.0,\n", - " (',', 'too,'): 1.0,\n", - " ('paws\\\\nhad', 'been'): 1.0,\n", - " ('organs', ';'): 1.0,\n", - " ('capitalists', 'great'): 1.0,\n", - " (',', 'whllt'): 1.0,\n", - " ('guarantees', 'the'): 1.0,\n", - " ('than\\\\ndo', 'so,'): 1.0,\n", - " ('\\\\n', 'And'): 1.0,\n", - " ('support\\\\n', '1'): 1.0,\n", - " ('legislative', 'body'): 1.0,\n", - " ('\\\\n', 'as'): 4.0,\n", - " ('fishes.\\\\n', 'We'): 1.0,\n", - " ('Tho', 'night'): 1.0,\n", - " ('wages', 'of'): 1.0,\n", - " ('of', 'returning'): 1.0,\n", - " ('are', 
'for'): 1.0,\n", - " ('w.s', 'recently'): 1.0,\n", - " ('te', 'the'): 1.0,\n", - " ('the', 'finances,'): 1.0,\n", - " ('that', 'Imrns'): 1.0,\n", - " ('a', 'trust'): 1.0,\n", - " ('next', 'year,'): 1.0,\n", - " ('ready\\\\n', 'to'): 1.0,\n", - " ('nt', 'a'): 1.0,\n", - " ('Noth-\\\\n', 'ng,'): 1.0,\n", - " ('agree', 'on'): 1.0,\n", - " ('were', 'present.'): 1.0,\n", - " ('Ills', 'country,'): 1.0,\n", - " ('crossing\\\\nBoar', 'River;'): 1.0,\n", - " ('the', 'place'): 3.0,\n", - " ('This', 'reception'): 1.0,\n", - " ('de-\\\\n', 'pendent'): 1.0,\n", - " ('tribunals', 'will'): 1.0,\n", - " ('a', 'steamer'): 1.0,\n", - " ('I', 'am'): 3.0,\n", - " (',', 'at'): 3.0,\n", - " ('on', 'or'): 1.0,\n", - " ('preached', 'on'): 1.0,\n", - " ('of\\\\n', 'things'): 1.0,\n", - " ('for', 'the'): 13.0,\n", - " (']', 'e'): 1.0,\n", - " ('the', 'transit'): 1.0,\n", - " ('hy\\\\n', 'which'): 1.0,\n", - " ('opinion', 'that'): 1.0,\n", - " ('.', 'At'): 3.0,\n", - " ('’', 'I'): 1.0,\n", - " ('and\\\\n', 'and'): 1.0,\n", - " ('atrango', 'positions'): 1.0,\n", - " ('pn', 'Tues\\xad'): 1.0,\n", - " ('and', 'all'): 4.0,\n", - " ('is', 'honestly'): 1.0,\n", - " ('honorable', 'citizens.'): 1.0,\n", - " ('per', 'cent'): 1.0,\n", - " ('riveting', 'it,'): 1.0,\n", - " ('the\\\\nCommissioner', 'of'): 1.0,\n", - " ('separating', 'from'): 1.0,\n", - " ('are', 'startled'): 1.0,\n", - " ('voice', 'for'): 1.0,\n", - " ('when', 'thete'): 1.0,\n", - " ('\\\\nformed', 'the'): 1.0,\n", - " ('remembered', 'that'): 1.0,\n", - " ('#', 'of'): 1.0,\n", - " ('very', 'process'): 1.0,\n", - " ('would', 'gel'): 1.0,\n", - " ('me', 'with'): 1.0,\n", - " ('aflllcted', 'with'): 1.0,\n", - " ('ulti\\\\n', 'mate'): 1.0,\n", - " ('utmost\\\\nstretch', 'of'): 1.0,\n", - " ('-rstand', 'much'): 1.0,\n", - " ('was', 'after'): 1.0,\n", - " ('Its\\\\npresent', 'strength,'): 1.0,\n", - " ('the\\\\n', 'peiiod'): 1.0,\n", - " ('son\\\\nhas', 'been'): 1.0,\n", - " ('is', 'covered'): 1.0,\n", - " ('Female\\\\n', 'Seminary;'): 
1.0,\n", - " ('they', 'used'): 1.0,\n", - " ('out-\\\\n', 'numbered'): 1.0,\n", - " ('came\\\\nfrom', 'heavy'): 1.0,\n", - " ('a\\\\n', 'pall'): 1.0,\n", - " ('adranoe', 'guard.'): 1.0,\n", - " ('some\\xad\\\\n', 'what'): 1.0,\n", - " ('tho', 'sulo'): 1.0,\n", - " ('upper', 'navigation'): 1.0,\n", - " ('whs\\\\n', 'therefore'): 1.0,\n", - " ('against', 'the'): 2.0,\n", - " ('expendi-\\\\n', 'tures'): 1.0,\n", - " ('treason\\\\nwith', 'indignant'): 1.0,\n", - " ('very', 'strong'): 1.0,\n", - " ('mil-\\\\n', 'lions'): 1.0,\n", - " ('rights', 'supported'): 1.0,\n", - " (',', 'shall'): 2.0,\n", - " ('Moat\\\\nbeing', 'piloted'): 1.0,\n", - " ('be-\\\\n', 'ing'): 2.0,\n", - " ('little', 'barefooted'): 1.0,\n", - " ('of\\\\n', 'developenient'): 1.0,\n", - " ('be-\\\\n', 'Cause'): 1.0,\n", - " (\"'Abraham\\\\n\", 'llu.li'): 1.0,\n", - " ('the', 'estate'): 2.0,\n", - " ('his', 'friends'): 1.0,\n", - " ('P', \"('nderwood.a\"): 1.0,\n", - " ('the\\\\n', 'roof'): 1.0,\n", - " ('Cwk', 'r-;'): 1.0,\n", - " ('.', \"Johnson's\"): 1.0,\n", - " (';', 'while'): 1.0,\n", - " ('*', 'of'): 1.0,\n", - " ('have', 'power'): 1.0,\n", - " ('in', 'search'): 1.0,\n", - " ('and\\\\nthe', 'principal'): 1.0,\n", - " ('HAY.the\\\\nlast', 'named'): 1.0,\n", - " ('have', 'occasioned;'): 1.0,\n", - " ('the\\\\nWestern', 'boundary'): 1.0,\n", - " (',', 'whose'): 1.0,\n", - " ('be', 'visited'): 1.0,\n", - " ('\\\\n', 'and'): 13.0,\n", - " ('party', 'zeal;—and'): 1.0,\n", - " ('exercise', 'over'): 1.0,\n", - " ('foremost', 'in'): 1.0,\n", - " ('three', 'pounds'): 1.0,\n", - " ('exceed-\\\\n', 'ed'): 1.0,\n", - " ('could', 'find,'): 1.0,\n", - " ('-', '.'): 1.0,\n", - " ('could\\\\n', 'f.'): 1.0,\n", - " ('our', 'power.'): 1.0,\n", - " ('intimated', 'that'): 1.0,\n", - " ('Thomas', 'J.'): 2.0,\n", - " ('laid', 'me'): 1.0,\n", - " ('that', 'there'): 3.0,\n", - " ('that', 'on'): 1.0,\n", - " ('it', 'to'): 2.0,\n", - " ('Mc-\\\\n', 'Xamara.'): 1.0,\n", - " ('and\\\\nthrough', 'the'): 1.0,\n", - " 
('trembled', 'with'): 1.0,\n", - " ('chin-imps.\\\\nling', 'him,as'): 1.0,\n", - " ('the', 'agricultural'): 1.0,\n", - " ('have', 'proceeded'): 1.0,\n", - " (',', 'P.'): 1.0,\n", - " ('rlin', 'sale'): 1.0,\n", - " ('sight', 'of'): 2.0,\n", - " ('\\\\n', 'rails'): 1.0,\n", - " ('know\\\\nof', 'no'): 1.0,\n", - " ('.', 'MCI).Bids.'): 1.0,\n", - " ('member', 'of'): 1.0,\n", - " ('\\\\nand', 'they'): 1.0,\n", - " (\"'\", 'of'): 1.0,\n", - " ('the', 'ground;'): 1.0,\n", - " (',', 'and'): 88.0,\n", - " ('traverses', 'the'): 1.0,\n", - " ('could', 'lie'): 1.0,\n", - " ('sounded', 'by'): 1.0,\n", - " ('kianr.h', 'of'): 1.0,\n", - " ('be\\\\n', 'promptly'): 1.0,\n", - " ('what', 'history'): 1.0,\n", - " ('high', 'taxation.'): 1.0,\n", - " ('they', 'will'): 4.0,\n", - " ('with', 'the'): 13.0,\n", - " ('ho', '\"would'): 1.0,\n", - " (\"'s\", 'hole'): 1.0,\n", - " ('these', 'circumstances.'): 1.0,\n", - " ('n', 'belief'): 1.0,\n", - " ('because', 'they'): 1.0,\n", - " ('.', 'When'): 2.0,\n", - " (',', 'a'): 12.0,\n", - " ('as', 'far'): 2.0,\n", - " ('the', 'means'): 3.0,\n", - " ('receive', 'as'): 1.0,\n", - " ('business', 'of'): 1.0,\n", - " ('them.\\\\nWhy', 'should'): 1.0,\n", - " ('tangled', 'thickets'): 1.0,\n", - " (',', \"Blannerhasset's\"): 1.0,\n", - " ('happiness', 'of'): 1.0,\n", - " ('district', 'of'): 1.0,\n", - " ('\\\\n', 'chosen'): 1.0,\n", - " ('lias', 'sustained'): 1.0,\n", - " ('mid', 'large'): 1.0,\n", - " ('work', 'now.\"'): 1.0,\n", - " ('certified', 'by'): 1.0,\n", - " ('were', 'entitled,'): 1.0,\n", - " ('submission', 'fb'): 1.0,\n", - " ('bind¬\\\\n', 'ing'): 1.0,\n", - " ('or\\\\n', 'in'): 1.0,\n", - " ('vir\\xad\\\\n', 'tue'): 1.0,\n", - " ('asking', 'that'): 1.0,\n", - " ('as', 'there'): 2.0,\n", - " ('immediate', 'approval'): 1.0,\n", - " ('thcro\\\\nwas', 'not'): 1.0,\n", - " ('kind', 'of'): 2.0,\n", - " ('us\\\\n', 'ow'): 1.0,\n", - " ('thence', 'North'): 1.0,\n", - " ('bring-\\\\n', 'ing'): 1.0,\n", - " ('.', '-Cut'): 1.0,\n", - " ('a', 
'supper'): 1.0,\n", - " ('surroundings.\\\\n', '“You’re'): 1.0,\n", - " ('the', 'men'): 3.0,\n", - " ('Is\\\\n', 'formed'): 1.0,\n", - " ('intrinsic', 'value'): 1.0,\n", - " ('ordinance.\\\\n', 'Section'): 1.0,\n", - " ('tojustily', 'Protection,'): 1.0,\n", - " ('scramble\\\\n', 'for'): 1.0,\n", - " ('s', 'part,'): 1.0,\n", - " ('both', 'to'): 1.0,\n", - " ('with\\\\nthe', 'idea'): 1.0,\n", - " ('tell', 'of'): 1.0,\n", - " ('gone', 'by'): 1.0,\n", - " ('the', 'liberty'): 1.0,\n", - " ('they', 'lived'): 1.0,\n", - " ('skating\\\\n', 'rink,'): 1.0,\n", - " (',', 'se'): 1.0,\n", - " ('work', 'and'): 2.0,\n", - " ('last\\\\n', 'sale,'): 1.0,\n", - " ('modern', 'times'): 1.0,\n", - " ('Icould', 'see'): 1.0,\n", - " ('they', 'led'): 1.0,\n", - " ('all\\\\nalong', 'the'): 1.0,\n", - " ('of\\\\n', 'all'): 2.0,\n", - " ('1919', 'certificate'): 1.0,\n", - " ('from', 'my'): 1.0,\n", - " ('that', 'the'): 9.0,\n", - " ('aeenmed', 'that'): 1.0,\n", - " ('on', 'Wednesday,'): 2.0,\n", - " ('kept\\\\n', 'up'): 2.0,\n", - " ('after', 'tea'): 1.0,\n", - " ('equipping', 'each'): 1.0,\n", - " ('mode\\\\n', 'of'): 1.0,\n", - " ('the\\\\n', 'ocean,'): 1.0,\n", - " ('.', 'Messersmith'): 1.0,\n", - " ('not\\\\n', 'there,'): 1.0,\n", - " ('n', 'fam-'): 1.0,\n", - " ('beg', 'this'): 1.0,\n", - " ('to', 'attempt'): 1.0,\n", - " ('and\\\\n', 'the'): 5.0,\n", - " ('they\\\\n', 'were'): 1.0,\n", - " ('be', 'more'): 1.0,\n", - " ('ui.cieisioi.ti\\\\n', 'b\\\\'): 1.0,\n", - " ('our\\\\n', 'constitutional'): 1.0,\n", - " ('to', 'die'): 1.0,\n", - " ('and', 'honest'): 1.0,\n", - " ('bo', 'too'): 1.0,\n", - " ('to', 'take'): 2.0,\n", - " (',', 'Thomas'): 1.0,\n", - " ('many', 'months'): 1.0,\n", - " ('150', 'of'): 1.0,\n", - " ('Billy\\\\n', 'one'): 1.0,\n", - " ('.', 'His'): 2.0,\n", - " ('ancP\\\\n', '67-100'): 1.0,\n", - " ('conduct', 'the'): 1.0,\n", - " ('the', 'small'): 1.0,\n", - " ('by', 'ample'): 1.0,\n", - " ('be', 'well'): 3.0,\n", - " ('speak', 'of'): 1.0,\n", - " ('de-\\\\n', 
'termined'): 1.0,\n", - " ('window.\\\\n', 'I'): 1.0,\n", - " ('not', 'bo'): 1.0,\n", - " ('train', 'hearing'): 1.0,\n", - " ('minds\\\\nof', 'men'): 1.0,\n", - " ('supply', 'this'): 1.0,\n", - " ('.', 'Also'): 1.0,\n", - " ('we', 'sought'): 1.0,\n", - " ('chief', 'houor'): 1.0,\n", - " ('road', 'was'): 1.0,\n", - " ('strikes', 'in'): 1.0,\n", - " ('at', \"Hawkins'\"): 1.0,\n", - " ('sure', 'that'): 1.0,\n", - " ('lelease\\\\n', 'he'): 1.0,\n", - " ('in\\\\neach', 'year,'): 1.0,\n", - " ('proclaimed\\\\n', 'by'): 1.0,\n", - " ('polirira\\\\n', 'Course'): 1.0,\n", - " ('the', 'advance'): 1.0,\n", - " ('elaborate', 'frescoing'): 1.0,\n", - " ('work', 'nnd'): 1.0,\n", - " ('pension', 'frauds'): 1.0,\n", - " ('was', 'in'): 1.0,\n", - " ('.', 'Gen.'): 1.0,\n", - " ('west\\\\n', 'coast,'): 1.0,\n", - " ('where', 'Charles'): 1.0,\n", - " ('the', 'stare'): 1.0,\n", - " ('for', 'all'): 1.0,\n", - " ('.', 'But'): 4.0,\n", - " ('the', 'Committee'): 2.0,\n", - " ('so', 'as'): 4.0,\n", - " ('upon\\\\n', 'their'): 1.0,\n", - " ('from', 'just'): 1.0,\n", - " ('the', 'valley'): 3.0,\n", - " ('\\\\n', 'lie'): 1.0,\n", - " ('You\\\\n', 'must'): 1.0,\n", - " ('harvest.\\\\n', 'The'): 1.0,\n", - " (',', 'will,'): 1.0,\n", - " ('—', 'Burleigh'): 1.0,\n", - " (',', 'of'): 5.0,\n", - " ('of', 'fun-'): 1.0,\n", - " ('are', 'gen-'): 1.0,\n", - " ('he', 'was'): 4.0,\n", - " (',', 'another'): 1.0,\n", - " ('\\\\nand', 'possession'): 1.0,\n", - " ('to\\\\n', 'gu’de'): 1.0,\n", - " ('come.\\\\n', 'The'): 1.0,\n", - " ('\\\\nthat', 'it'): 1.0,\n", - " ('think\\\\nIt', 'a'): 1.0,\n", - " ('the', 'receipt'): 1.0,\n", - " ('those', 'districts,'): 1.0,\n", - " ('firing\\\\n', 'pan'): 1.0,\n", - " ('contempt', 'of'): 1.0,\n", - " ('George\\\\n', 'J.'): 1.0,\n", - " ('of', '*00'): 1.0,\n", - " ('the\\\\nhomestead', 'might'): 1.0,\n", - " ('se\\\\nwho', 'have'): 1.0,\n", - " ('place', 'the'): 1.0,\n", - " ('for\\\\n', '1916'): 1.0,\n", - " (',', 'which'): 9.0,\n", - " ('that', 'their'): 1.0,\n", - " 
('the', 'gen-'): 2.0,\n", - " ('would\\\\n', 'do'): 1.0,\n", - " ('be', 're-'): 1.0,\n", - " ('unharmed.\\\\n', 'I'): 1.0,\n", - " ('a\\\\n', 'substitute'): 1.0,\n", - " ('n\\\\n', 'prodigy'): 1.0,\n", - " ('switchmen', 'to-'): 1.0,\n", - " ('rest', 'in'): 1.0,\n", - " ('when\\\\n', 'heavily'): 1.0,\n", - " ('that', 'question,'): 1.0,\n", - " ('capable', 'of'): 1.0,\n", - " ('to', 'lupn'): 1.0,\n", - " ('very', 'in\"-*\\''): 1.0,\n", - " ('.', 'In'): 4.0,\n", - " ('phenomenal', 'growth.'): 1.0,\n", - " ('hut', 'we'): 1.0,\n", - " ('that', 'he'): 3.0,\n", - " ('the', 'earliest'): 1.0,\n", - " ('the', 'gun,'): 1.0,\n", - " ('I', 'know'): 2.0,\n", - " ('about', 'the'): 2.0,\n", - " ('to', 'President'): 1.0,\n", - " ('The', 'British'): 1.0,\n", - " ('the', 'colored'): 1.0,\n", - " ('recent', 'au-'): 1.0,\n", - " ('new', 'proprietary'): 1.0,\n", - " ('a', 'village'): 1.0,\n", - " ('will\\\\n', 'equaliy'): 1.0,\n", - " ('but', 'also'): 1.0,\n", - " ('\\\\n', 'No'): 1.0,\n", - " ('be\\\\nenforced', 'with'): 1.0,\n", - " ('statement', 'is'): 1.0,\n", - " ('questioning', 'the'): 1.0,\n", - " ('ihe\\\\n', 'I'): 1.0,\n", - " ('he', 'appreciate'): 1.0,\n", - " ('MortunY', 'Traveller'): 1.0,\n", - " ('was', 'to'): 1.0,\n", - " ('once', 'formed'): 1.0,\n", - " ('traduced', 'her'): 1.0,\n", - " (',', 'according'): 2.0,\n", - " (',', 'alwavs'): 1.0,\n", - " ('6|d', ':'): 1.0,\n", - " ('\\\\n', 'which'): 4.0,\n", - " ('were', 'fit'): 1.0,\n", - " ('references\\\\n', 'to'): 1.0,\n", - " ('of\\\\n', 'feet'): 1.0,\n", - " ('brick', 'fioor'): 1.0,\n", - " ('was', 'not'): 4.0,\n", - " ('is', 'more'): 1.0,\n", - " ('consider', 'it'): 2.0,\n", - " ('The', 'latter'): 1.0,\n", - " ('in', 'a'): 8.0,\n", - " ('good\\\\nname', 'and'): 1.0,\n", - " (',', 'trom'): 1.0,\n", - " ('several\\\\njoints', 'were'): 1.0,\n", - " ('the', 'party'): 1.0,\n", - " ('not', 'know'): 2.0,\n", - " ('\\\\n', '1913,'): 2.0,\n", - " ('is', 'possible'): 1.0,\n", - " ('is', 'seven'): 1.0,\n", - " ('powerful', 
'foree'): 1.0,\n", - " ('a\\\\ngood', 'jilace,'): 1.0,\n", - " ('\\\\nit', 'the'): 1.0,\n", - " ('for\\\\nalUr', 'taking'): 1.0,\n", - " ('anil', 'did'): 1.0,\n", - " ('defeat', 'the'): 1.0,\n", - " ('attempts', 'lo'): 1.0,\n", - " ('State', 'of'): 1.0,\n", - " ('a\\\\n', 'quaint'): 1.0,\n", - " (',', 'doing'): 1.0,\n", - " ('Frias\\\\nhis', 'Minister'): 1.0,\n", - " ('moon\\\\nlight', 'nights'): 1.0,\n", - " ('pav', 't«xthr'): 1.0,\n", - " ('proposition', 'was'): 1.0,\n", - " ('Into\\\\n', 'the'): 1.0,\n", - " ('ask', 'him'): 1.0,\n", - " ('Houses', 'of'): 1.0,\n", - " ('be', 'brought'): 1.0,\n", - " ('engaged', 'in'): 1.0,\n", - " ('Dos-\\\\n', 'well'): 1.0,\n", - " ('of', 'depriving'): 1.0,\n", - " ('from', 'a'): 2.0,\n", - " ('return', 'to'): 1.0,\n", - " ('pay-\\\\n', 'ment'): 1.0,\n", - " ('fact', 'that'): 1.0,\n", - " ('night\\\\nThe', 'strictest'): 1.0,\n", - " ('\\\\n', '<'): 1.0,\n", - " ('*', 'before'): 1.0,\n", - " ('Australia', 'live'): 1.0,\n", - " ('a', 'majority,'): 1.0,\n", - " ('the\\\\n', 'arc'): 1.0,\n", - " ('the\\\\n', 'result'): 1.0,\n", - " ('heartily\\\\n', 'accede'): 1.0,\n", - " ('years', 'ago'): 1.0,\n", - " ('pa\\\\n', 'yers,'): 1.0,\n", - " ('\\\\n', 'gave'): 1.0,\n", - " ('that', 'lien'): 1.0,\n", - " ('deny', 'that'): 1.0,\n", - " (',', '1884.'): 1.0,\n", - " ('in', 'this'): 2.0,\n", - " ('inform\\\\n', 'Congress'): 1.0,\n", - " ('stoop', 'to'): 1.0,\n", - " ('transferred', 'the'): 1.0,\n", - " ('\\\\n', 'has'): 1.0,\n", - " ('by', 'the'): 13.0,\n", - " ('event\\\\n', 'an'): 1.0,\n", - " ('rn\\\\n', 'than'): 1.0,\n", - " ('advan-\\\\n', 'tages.'): 1.0,\n", - " ('at', 'tho'): 1.0,\n", - " ('yard\\\\n', 'when'): 1.0,\n", - " ('only\\\\nsuffeicr', 'nolmdy'): 1.0,\n", - " ('the', 'Indians'): 1.0,\n", - " ('And', 'unless'): 1.0,\n", - " ('ordinary', 'American'): 1.0,\n", - " ('death.\\\\n', '“Jesus'): 1.0,\n", - " ('to', 'ascend'): 1.0,\n", - " ('but', 'those'): 1.0,\n", - " ('death.\\\\n', 'We'): 1.0,\n", - " ('it', 'he'): 1.0,\n", - " 
('are\\\\n', 'not'): 1.0,\n", - " ('the', 'coura'): 1.0,\n", - " ('wastes\\\\nheat', 'most'): 1.0,\n", - " ('as', 'Vice'): 1.0,\n", - " ('claimants', 'of'): 1.0,\n", - " ('returned', 'tt)'): 1.0,\n", - " ('San', 'Francisco'): 1.0,\n", - " ('to', 'civil'): 1.0,\n", - " ('Hanks\\\\n', 'of'): 1.0,\n", - " ('wife—as\\\\n', 'she'): 1.0,\n", - " (',', '7'): 1.0,\n", - " ('tri-\\\\nfle', 'taller'): 1.0,\n", - " ('him', 'a'): 1.0,\n", - " (',', 'Miss'): 1.0,\n", - " ('it', 'has'): 3.0,\n", - " ('will', 'erect'): 1.0,\n", - " ('testified', 'that'): 1.0,\n", - " ('seed', 'potatoes,'): 1.0,\n", - " ('control', 'over'): 1.0,\n", - " ('ap\\\\n', 'proved,'): 1.0,\n", - " ('have', 'added'): 1.0,\n", - " ('cents', 'for'): 1.0,\n", - " ('committee\\\\n', 'and'): 1.0,\n", - " ('of', 'such'): 2.0,\n", - " ('and', 'that'): 2.0,\n", - " ('District\\\\nNumber', '7,'): 1.0,\n", - " ('every\\\\nand', 'he'): 1.0,\n", - " ('union', 'of'): 1.0,\n", - " ('the', 'oilier'): 1.0,\n", - " ('and', 'hopelessly'): 1.0,\n", - " ('the', 'notes'): 1.0,\n", - " ('the', 'floor'): 1.0,\n", - " ('with', 'tons'): 1.0,\n", - " ('disregard\\\\n', '«»f'): 1.0,\n", - " ('at\\\\n', 'Per'): 1.0,\n", - " ('\\\\ncountry', 'will'): 1.0,\n", - " ('yet\\\\nthe', 'ball'): 1.0,\n", - " ('.', 'Glick'): 1.0,\n", - " ('any', 'district'): 1.0,\n", - " ('is', 'qualified'): 1.0,\n", - " ('to', 'wait'): 1.0,\n", - " ('and\\\\n', 'that'): 2.0,\n", - " ('west', 'by'): 1.0,\n", - " (',', ';ynl'): 1.0,\n", - " ('different\\\\n', 'kinds'): 1.0,\n", - " ('benefit', 'he'): 1.0,\n", - " ('initiaiiec', 'wiil'): 1.0,\n", - " ('a', 'noise'): 1.0,\n", - " (',', 'l/ic~M'): 1.0,\n", - " ('Bv\\\\n', 'this'): 1.0,\n", - " ('the', 'respec\\xad'): 1.0,\n", - " ('cash', 'in'): 1.0,\n", - " ('scarce', 'bear'): 1.0,\n", - " ('coin\\\\nfinely', 'into'): 1.0,\n", - " ('now', 'to'): 1.0,\n", - " ('relief', 'that'): 1.0,\n", - " ('at\\\\n', 'Aldershot'): 1.0,\n", - " ('benefit', 'of'): 2.0,\n", - " ('the', 'nation’s'): 1.0,\n", - " ('of\\\\n', 'her'): 
1.0,\n", - " ('after', 'the'): 3.0,\n", - " ('will', 'he'): 1.0,\n", - " ('That', 'all'): 1.0,\n", - " ('reach\\\\nLloyd', 'George'): 1.0,\n", - " ('Judg\\\\n', 'ing'): 1.0,\n", - " ('of', '\"Vic'): 1.0,\n", - " ('of', 'Pensaco-'): 1.0,\n", - " (',', 'although'): 1.0,\n", - " ('by', 'their'): 1.0,\n", - " ('xan\\\\n', 'raise'): 1.0,\n", - " (',', 'Jno'): 1.0,\n", - " ('of\\\\n', 'Congrtss.'): 1.0,\n", - " ('mind', 'of'): 1.0,\n", - " (',', 'light'): 1.0,\n", - " ('strange', 'that'): 1.0,\n", - " ('sullied', 'its'): 1.0,\n", - " ('easy', 'and'): 1.0,\n", - " ('The', 'case,'): 1.0,\n", - " (',', 'without'): 2.0,\n", - " ('it', 'would'): 3.0,\n", - " ('feet\\\\nless', 'altitude'): 1.0,\n", - " ('fall', 'and'): 1.0,\n", - " ('castles\\\\n', 'which'): 1.0,\n", - " ('there', 'is'): 4.0,\n", - " ('In', 'the'): 6.0,\n", - " ('his', 'duly'): 2.0,\n", - " ('I\\\\nhad', 'seen'): 1.0,\n", - " ('all', 'his'): 1.0,\n", - " ('who\\\\n', 'is'): 1.0,\n", - " ('Grand', 'Prize'): 1.0,\n", - " ('he', 'entcitaiued,'): 1.0,\n", - " ('sentiments', 'to-'): 1.0,\n", - " ('court\\\\n', 'and'): 1.0,\n", - " ('turh', 'uias:niliccul'): 1.0,\n", - " ('capacities', 'for'): 1.0,\n", - " ('the', 'hospital.'): 1.0,\n", - " ('armies\\\\n', 'and'): 1.0,\n", - " ('the', 'bond,'): 1.0,\n", - " (')', 'from'): 1.0,\n", - " ('and', 'competent,'): 1.0,\n", - " (',', 'or'): 9.0,\n", - " ('to\\\\n', 'go'): 1.0,\n", - " ('that\\\\n', 'ordinarily'): 1.0,\n", - " ('duty.\\\\n', 'Resolved,'): 1.0,\n", - " ('an', 'infetlor'): 1.0,\n", - " ('enough', 'of'): 1.0,\n", - " ('his', 'native'): 1.0,\n", - " ('four', 'artillery'): 1.0,\n", - " ('Paul\\\\n', 'half'): 1.0,\n", - " ('the', 'appearance'): 2.0,\n", - " ('inha-\\\\nbitants', 'on'): 1.0,\n", - " ('and', 'resolutions'): 1.0,\n", - " ('anil', 'exhibits,'): 1.0,\n", - " ('keeping', 'myself'): 1.0,\n", - " ('hand', 'of'): 1.0,\n", - " ('as', 'I'): 1.0,\n", - " (',', 'never'): 1.0,\n", - " ('my\\\\n', 'soda'): 1.0,\n", - " ('pro.\\\\nclaiming', 'that'): 1.0,\n", - " 
('sta-\\\\ntioned', 'in'): 1.0,\n", - " (',', 'the'): 16.0,\n", - " ('F.iurth\\\\n', 'avenue'): 1.0,\n", - " ('remote', 'objects,'): 1.0,\n", - " ('certain', 'Masons'): 1.0,\n", - " ('Jefferson', ':'): 1.0,\n", - " ('to', 'participate'): 2.0,\n", - " ('the', 'farmer'): 1.0,\n", - " ('the', 'city'): 5.0,\n", - " ('Supreme', 'Judge,'): 1.0,\n", - " ('City', 'authorities'): 1.0,\n", - " ('.', 'The'): 22.0,\n", - " ('mar\\\\n', 'vellous'): 1.0,\n", - " ('distant', 'as'): 1.0,\n", - " ('with-\\\\nin', 'the'): 1.0,\n", - " ('Blundle', 'Maple'): 1.0,\n", - " ('e', ',o'): 1.0,\n", - " ('much', 'surprised'): 1.0,\n", - " ('moro\\\\n', 'wanton'): 1.0,\n", - " ('divided\\\\n', 'among'): 1.0,\n", - " ('the', 'tame'): 1.0,\n", - " ('manager', 'qf'): 1.0,\n", - " ('by\\\\n', 'Mrs.'): 1.0,\n", - " ('young\\\\n', 'theologians'): 1.0,\n", - " ('the', 'lib*'): 1.0,\n", - " ('recent', 'homo'): 1.0,\n", - " ('said\\\\n', 'administrator'): 1.0,\n", - " ('It', 'has'): 1.0,\n", - " (',', 'Concentrated'): 1.0,\n", - " ('the', 'sugar'): 1.0,\n", - " ('little', 'bov'): 1.0,\n", - " ('min', 'W'): 1.0,\n", - " ('dupes', 'of'): 1.0,\n", - " ('One', 'of'): 2.0,\n", - " ('Mr.\\\\n', 'Bo:tj'): 1.0,\n", - " ('Bunker', 'Hill'): 1.0,\n", - " ('judgment', 'Bhall'): 1.0,\n", - " ('In', 'part:'): 1.0,\n", - " ('to', 'file'): 1.0,\n", - " ('New', 'Jersey'): 1.0,\n", - " (',', 'she'): 2.0,\n", - " ('tasacl', 'out,'): 1.0,\n", - " ('of', 'alarm'): 1.0,\n", - " ('it', 'in'): 2.0,\n", - " ('.', 'W.'): 1.0,\n", - " ('and', 'computed'): 1.0,\n", - " ('complaint', 'came'): 1.0,\n", - " ('and', 'not'): 1.0,\n", - " ('distrust\\\\n', 'each'): 1.0,\n", - " ('knew', 'how'): 1.0,\n", - " ('In', '1898'): 1.0,\n", - " ('republicanism', 'must'): 1.0,\n", - " ('posi\\xad\\\\n', 'tion'): 1.0,\n", - " ('break\\\\n', 'troth'): 1.0,\n", - " ('.\\\\n', 'Diploma'): 1.0,\n", - " ('did', 'Ia6t.'): 1.0,\n", - " ('is', 'real,'): 1.0,\n", - " ('that\\\\n', 'Cato,'): 1.0,\n", - " ('.', 'in'): 1.0,\n", - " ('throwing', 'off'): 
1.0,\n", - " ('have', 'been'): 9.0,\n", - " ('Folding', 'Cae'): 1.0,\n", - " ('will\\\\nhave', 'a'): 1.0,\n", - " ('re\\\\n', 'garding'): 1.0,\n", - " ('but\\\\n', 'trom'): 1.0,\n", - " ('walked', 'be-'): 1.0,\n", - " ('swept', 'away'): 1.0,\n", - " ('until\\\\n', 'fairly'): 1.0,\n", - " ('awakened', 'by'): 1.0,\n", - " ('tha\\\\n', 'smallest'): 1.0,\n", - " ('prop', 'r'): 1.0,\n", - " ('is', 'kept'): 2.0,\n", - " ('circumstance.\\\\n', 'Had'): 1.0,\n", - " ('are\\\\nvariant—some', 'saying'): 1.0,\n", - " ('when', 'president.'): 1.0,\n", - " ('j\\\\nmeat', 'oj'): 1.0,\n", - " ('help', 'ye'): 1.0,\n", - " ('on\\\\neach', 'one'): 1.0,\n", - " ('by\\\\n', 'the'): 3.0,\n", - " ('four', 'thousand'): 2.0,\n", - " ('quarrH', '-'): 1.0,\n", - " ('\\\\n', 'muscular,'): 1.0,\n", - " ('his', 'own'): 3.0,\n", - " ('Maryland\\\\nRailroad', 'Company,'): 1.0,\n", - " ('!', 'Innocent'): 1.0,\n", - " ('eradicating', 'all'): 1.0,\n", - " (',', 'that'): 14.0,\n", - " ('damage', 'done,'): 1.0,\n", - " ('lie', 'knew'): 1.0,\n", - " ('get', 'seed,'): 1.0,\n", - " ('the', 'bidder,'): 2.0,\n", - " ('and', 'the'): 18.0,\n", - " ('ef-\\\\n', 'fective.'): 1.0,\n", - " ('the\\\\n', 'extent'): 1.0,\n", - " ('quite', 'firm.'): 1.0,\n", - " ('Away\\\\nfrom', 'home'): 1.0,\n", - " (',', 'have'): 2.0,\n", - " ('the\\\\n', 'means'): 1.0,\n", - " ('indifference.\\\\n', 'Tuberculosis'): 1.0,\n", - " ('home\\\\n', 'with'): 1.0,\n", - " ('never', 'have'): 1.0,\n", - " ('to', 'show'): 1.0,\n", - " ('thence\\\\n', 'south'): 1.0,\n", - " ('laid', 'in'): 1.0,\n", - " ('and\\\\nTruekee', 'Kailroad'): 1.0,\n", - " ('.', 'Brown'): 1.0,\n", - " ('the', 'death'): 2.0,\n", - " ('horses', 'Mrs.'): 1.0,\n", - " ('statistics', 'in'): 1.0,\n", - " ('on', 'timothy'): 1.0,\n", - " ('shall', 'be'): 8.0,\n", - " ('the', 'prejudices'): 1.0,\n", - " ('stallion\\\\n', '|'): 1.0,\n", - " ('half', 'of'): 1.0,\n", - " ('it', 'is'): 9.0,\n", - " ('do', 'it;'): 1.0,\n", - " ('declares', 'that'): 1.0,\n", - " ('themselves', 
'f-atriots'): 1.0,\n", - " ('which', 'you'): 2.0,\n", - " ('this\\\\n', 'tide'): 1.0,\n", - " ('en\\xad\\\\ntitled', 'to'): 1.0,\n", - " ('all', 'round.'): 1.0,\n", - " ('the', 'free.'): 1.0,\n", - " (',', 'be'): 1.0,\n", - " ('oi', 'till'): 1.0,\n", - " ('utterest\\\\n', 'scorn'): 1.0,\n", - " ('was', 'over'): 1.0,\n", - " ('the', 'besl'): 1.0,\n", - " ('third\\\\nof', 'the*'): 1.0,\n", - " ('admission', 'that'): 1.0,\n", - " ('been', 'put'): 1.0,\n", - " ('164', 'perchei'): 1.0,\n", - " ('no', 'an-'): 1.0,\n", - " ('twenty', 'lings'): 1.0,\n", - " ('It', 'is'): 6.0,\n", - " ('hard', 'to'): 1.0,\n", - " (',', 'is'): 4.0,\n", - " ('sides', 'and'): 1.0,\n", - " ('deprived', 'the'): 1.0,\n", - " ('elo\\xad\\\\n', 'quently'): 1.0,\n", - " ('s', 'who'): 1.0,\n", - " ('advantages\\\\nwould', 'not'): 1.0,\n", - " ('Foushec', 'ft.'): 1.0,\n", - " ('.', 'Clair’s'): 1.0,\n", - " ('the\\\\n', 'Young'): 1.0,\n", - " ('was', 'clear,'): 1.0,\n", - " ('food\\\\nfrom', 'a'): 1.0,\n", - " ('not', 'only'): 3.0,\n", - " ('blood-stained', 'weap'): 1.0,\n", - " ('property\\\\n', 'has'): 1.0,\n", - " ('aro', 'perfectly'): 1.0,\n", - " ('the', 'iumily'): 1.0,\n", - " (',', 'reed*,'): 1.0,\n", - " ('just', 'for'): 1.0,\n", - " ('the', 'D'): 1.0,\n", - " ('had', 'evidently'): 1.0,\n", - " ('the', 'measures'): 1.0,\n", - " ('I\\\\n', 'finally'): 1.0,\n", - " ('people', 'of'): 2.0,\n", - " ('the', 'expense'): 1.0,\n", - " ('felt', 'if'): 1.0,\n", - " ('jour-\\\\nnalism', 'which'): 1.0,\n", - " ('as', 'to'): 7.0,\n", - " ('47', 'deg.'): 1.0,\n", - " (\"'\", 'after'): 1.0,\n", - " ('.', 'He'): 11.0,\n", - " ('About\\\\n', 'midnight'): 1.0,\n", - " ('my', 'wife'): 2.0,\n", - " ('effect', 'npon'): 1.0,\n", - " ('throughout', 'the\"'): 1.0,\n", - " ('the', 'last'): 4.0,\n", - " ('of', 'Kennehee'): 1.0,\n", - " ('.V', 'York.'): 1.0,\n", - " ('of\\\\n', 'Inue.'): 1.0,\n", - " ('are', 'awarded'): 1.0,\n", - " ('bound', 'to'): 1.0,\n", - " ('the', 'upper'): 1.0,\n", - " ('species', 'of'): 1.0,\n", - 
def save_results(probs, in_data, out_file=None, tokenize=None):
    """Write one line of next-word predictions for every input context.

    For each entry of ``in_data`` the last token is taken, lower-cased and
    stripped of non-word characters (the same normalisation applied when the
    bigrams were built).  Every bigram in ``probs`` whose first element equals
    that normalised word contributes one ``next_word:probability`` field; the
    line is terminated by a ``:rest`` field holding the probability mass not
    assigned to any listed candidate.

    Args:
        probs: Mapping ``(context_word, next_word) -> probability``.
        in_data: Sequence of context strings, one per output line.
        out_file: Destination path.  Defaults to the notebook-level
            ``out_path`` so existing two-argument calls keep working.
        tokenize: Callable turning a string into a list of tokens.  Defaults
            to the ``word_tokenize`` already imported by the notebook.

    Returns:
        None.  The predictions are written to ``out_file``.
    """
    if out_file is None:
        out_file = out_path
    if tokenize is None:
        tokenize = word_tokenize

    # Index the candidates by context word once, instead of filtering the
    # whole probability table again for every input row.
    candidates_by_context = {}
    for (context_word, next_word), prob in probs.items():
        candidates_by_context.setdefault(context_word, []).append((next_word, prob))

    with open(out_file, 'w', encoding='utf-8') as f:
        for text in in_data:
            tokens = tokenize(text)
            # An empty context yields no tokens; treat it as an unknown word
            # rather than failing on tokens[-1].
            word = re.sub(r'\W+', '', tokens[-1].lower()) if tokens else ''
            candidates = candidates_by_context.get(word, [])

            # Clamp at zero: float rounding may push the sum marginally above 1.
            rest = max(0.0, 1.0 - sum(prob for _, prob in candidates))

            # Emit the predicted word (second bigram element), not the context
            # word, which is identical for every candidate on this line.
            fields = [next_word + ':' + str(prob) for next_word, prob in candidates]
            fields.append(':' + str(rest))

            # One prediction per line, matching the line-per-row input.
            f.write(' '.join(fields) + '\n')