"""Bigram statistics for the dev-0 split.

For every example, the last token of the left context is paired with the
expected gap word, and P(expected | last token) is estimated by maximum
likelihood: count(last token, expected) / count(last token).

Code recovered from the cells of ``main.ipynb``.
"""
from collections import Counter

import pandas as pd

try:
    from nltk.tokenize import word_tokenize
except ImportError:  # NLTK is only needed when no custom tokenizer is passed
    word_tokenize = None

# Column names for dev-0/in.tsv; only referenced by the commented-out pandas
# loader below.  With this layout, field [-2] of a row is 'LeftContext'.
columns = ['FileId', 'Paper', 'Idk1', 'Year', 'Idk2', 'Idk3', 'LeftContext', 'RightContext']

# dev_data = pd.read_csv('dev-0/in.tsv', sep='\t', names=columns, engine='python', quotechar='"', error_bad_lines=False)
# dev_expected = pd.read_csv('dev-0/expected.tsv', sep='\t', engine='python', quotechar='"', error_bad_lines=False)


def _read_left_contexts(path):
    """Return the second-to-last tab-separated field of every line in *path*."""
    # NOTE(review): UTF-8 is assumed (the recorded output contains non-ASCII
    # characters); the original relied on the platform default encoding.
    with open(path, "r", encoding="utf-8") as f:
        return [line.split('\t')[-2] for line in f]


def _read_expected(path):
    """Return every line of *path* with its trailing newline removed."""
    with open(path, "r", encoding="utf-8") as f:
        return [line.rstrip('\n') for line in f]


def createBigram(data, expected, tokenize=None):
    """Collect (last context token, expected word) bigrams and their counts.

    Args:
        data: sequence of context strings; only the last token of each is used.
        expected: sequence of target words, aligned with ``data`` by index.
        tokenize: optional callable mapping a string to a list of tokens.
            Defaults to ``nltk.tokenize.word_tokenize``.

    Returns:
        A tuple ``(listOfBigrams, unigramCounts, bigramCounts)``:
        the bigram of every usable example in input order (duplicates kept),
        a dict counting each last-token, and a dict counting each bigram.

    Raises:
        ImportError: if no ``tokenize`` is given and NLTK is not installed.
        IndexError: if ``expected`` has fewer entries than ``data``.
    """
    if tokenize is None:
        if word_tokenize is None:
            raise ImportError(
                "nltk is required for the default tokenizer; "
                "install it or pass tokenize="
            )
        tokenize = word_tokenize
    if len(expected) < len(data):
        raise IndexError("expected has fewer entries than data")

    listOfBigrams = []
    bigramCounts = Counter()
    unigramCounts = Counter()

    for context, target in zip(data, expected):
        tokens = tokenize(context)
        if not tokens:
            # An empty context has no preceding word, hence no bigram.
            continue
        word = tokens[-1]
        bigram = (word, target)
        listOfBigrams.append(bigram)
        bigramCounts[bigram] += 1
        # Count the token itself.  The original tested ``data[i]`` (the whole
        # context string) for membership, which reset the count to 1 on almost
        # every occurrence and produced "probabilities" far above 1.
        unigramCounts[word] += 1

    # Plain dicts, as before: a missing key must not silently read as 0.
    return listOfBigrams, dict(unigramCounts), dict(bigramCounts)


def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):
    """Estimate P(word2 | word1) for every distinct bigram in ``listOfBigrams``.

    Args:
        listOfBigrams: iterable of ``(word1, word2)`` tuples; duplicates allowed.
        unigramCounts: mapping ``word1 -> count``.
        bigramCounts: mapping ``(word1, word2) -> count``.

    Returns:
        Dict mapping each bigram (in first-occurrence order) to
        ``bigramCounts[bigram] / unigramCounts[word1]``.

    Raises:
        KeyError: if a bigram or its first word is missing from the counts.
    """
    listOfProb = {}
    # dict.fromkeys de-duplicates while keeping first-occurrence order.
    for bigram in dict.fromkeys(listOfBigrams):
        listOfProb[bigram] = bigramCounts[bigram] / unigramCounts[bigram[0]]
    return listOfProb


# True both in a notebook kernel and when run as a script, so the names below
# are still defined at module level there; importing the module stays free of
# file I/O.
if __name__ == "__main__":
    dev_data = _read_left_contexts('dev-0/in.tsv')
    dev_expected = _read_expected('dev-0/expected.tsv')

    bigrams, uniCounts, biCounts = createBigram(dev_data, dev_expected)

    probs = calcBigramProb(bigrams, uniCounts, biCounts)
    print(probs)