From 975dd50258f317b0ee89ce94494ab1f3b94e6252 Mon Sep 17 00:00:00 2001 From: Jakub Eichner Date: Wed, 26 Apr 2023 08:23:58 +0200 Subject: [PATCH] kenlm solution --- kenlm.ipynb | 1988 ------------------------------- kenlm_2words.py | 79 -- kenlm_4words.py => kenlm_run.py | 12 +- 3 files changed, 4 insertions(+), 2075 deletions(-) delete mode 100644 kenlm.ipynb delete mode 100644 kenlm_2words.py rename kenlm_4words.py => kenlm_run.py (84%) diff --git a/kenlm.ipynb b/kenlm.ipynb deleted file mode 100644 index fee8f38..0000000 --- a/kenlm.ipynb +++ /dev/null @@ -1,1988 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# xzcat -f1 train/in.tsv.xz | cut -f7,8 | sed 's/-\\\\n/ /g' | sed 's/\\\\n//g' | sed 's/\\\\//g' | ../kenlm/build/bin/lmplz -o 5 > kenlm_model.arpa\n", - "# ../kenlm/build/bin/build_binary kenlm_model.arpa kenlm_model.binary " - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Reading train data...\n" - ] - } - ], - "source": [ - "import regex as re\n", - "\n", - "# save train text to file\n", - "\n", - "def clean_string(text):\n", - " text = text.lower()\n", - " text = re.sub(r\" -\\\\*\\\\n\", \"\", text)\n", - " text = re.sub(r\"\\\\n\", \" \", text)\n", - " text = text.strip()\n", - " return text\n", - "\n", - "train_text = \"\"\n", - "print(\"Reading train data...\")\n", - "with open(\"train/in.tsv\", encoding=\"utf8\", mode=\"rt\") as file, open(\"train/expected.tsv\", encoding=\"utf8\", mode=\"rt\") as expected:\n", - " for t_line, e_line in zip(file, expected):\n", - " t_line = t_line.split(\"\\t\")\n", - " train_text += clean_string(t_line[-2]) + f\" {clean_string(e_line)} \" + clean_string(t_line[-1])\n", - "\n", - "# save train_text to file\n", - "print(\"saving to file...\")\n", - "with open(\"train_text.txt\", encoding=\"utf8\", mode=\"w\") as file:\n", - " file.write(train_text)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-7.822547912597656\n" - ] - } - ], - "source": [ - "import kenlm\n", - "\n", - "path = 'test_model.binary'\n", - "model = kenlm.Model(path)\n", - "\n", - "sentence = \"of the way\"\n", - "print(model.score(sentence))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Run predictions on dev-0 data...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 0%| | 8/10519 [08:16<40:44:33, 13.95s/it] " - ] - } - ], - "source": [ - "from tqdm import tqdm\n", - "import regex as re\n", - "from nltk.tokenize import word_tokenize\n", - "from english_words import get_english_words_set\n", - "\n", - "\n", - "\n", - "def clean_string(text):\n", - " text = text.lower()\n", - " text = re.sub(r\" -\\\\*\\\\n\", \"\", text)\n", - " text = re.sub(r\"\\\\n\", \" \", text)\n", - " text = text.strip()\n", - " return text\n", - "\n", - "\n", - "def get_word_predictions(w1, w2,):\n", - " for word in get_english_words_set(['web2'], lower=True):\n", - " sentence = w1 + ' ' + word + ' ' + w2\n", - " text_score = model.score(sentence, bos=False, eos=False)\n", - " yield((word, text_score))\n", - "\n", - "def argmax(w1,w2):\n", - " # get top 10 predictions from predict_line\n", - " top_10 = sorted(list(get_word_predictions(w1,w2)), key=lambda x: -x[1])[:10]\n", - " output_line = \" \".join([\"{}:{:.8f}\".format(w, p) for w, p in top_10])\n", - " return output_line\n", - "\n", - " # print(f\"{sentence}: {text_score}\")\n", - "\n", - " # probs = list(argmax(w1, w2, w4, w5, v, v2, v3))\n", - " # sum_prob = sum(p for (w, p) in probs)\n", - "\n", - " # try:\n", - " # probs = [(w, p / sum_prob) for w, p in probs]\n", - " # except ZeroDivisionError:\n", - " # return \"the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1\"\n", - "\n", - " # top_probs = sorted(probs, key=lambda x: -x[1])[:4]\n", - " # top_probs = [(w,p) for (w,p) in top_probs if p > 0]\n", - " \n", - " # del probs\n", - " # del sum_prob\n", - "\n", - " # if len(top_probs) == 0:\n", - " # return \"the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1\"\n", - " \n", - " # left_prob = 1 - sum(p for (w, p) in top_probs)\n", - " # if left_prob < 0.1:\n", - " # left_prob = 0.1\n", - "\n", - " # output_line = \" \".join([\"{}:{:.8f}\".format(w, p) for w, p in top_probs])\n", - " # output_line += \" :{:.8f}\".format(left_prob)\n", - "\n", - " # # print(f\"{w1} {w2} {w}\" for w in out_line.split(\" \"))\n", - "\n", - " # return output_line\n", - "\n", - "\n", - "def run_predictions(source_folder):\n", - " print(f\"Run predictions on {source_folder} data...\")\n", - " \n", - " with open(f\"{source_folder}/in.tsv\", encoding=\"utf8\", mode=\"rt\") as file:\n", - " train_data = file.readlines()\n", - "\n", - " with open(f\"{source_folder}/out_kenlm.tsv\", \"w\", encoding=\"utf-8\") as output_file:\n", - " for line in tqdm(train_data):\n", - " line = line.split(\"\\t\")\n", - " \n", - " l1 = clean_string(line[-2])\n", - " l2 = clean_string(line[-1])\n", - "\n", - " if not l1 or not l2:\n", - " out_line = \"the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1\"\n", - " else:\n", - " w1 = word_tokenize(l1)[-1:][0]\n", - " w2 = word_tokenize(l2)[0][0] \n", - " out_line = argmax(w1, w2)\n", - " \n", - " output_file.write(out_line + \"\\n\")\n", - " \n", - "\n", - "run_predictions(\"dev-0\")\n", - "# run_predictions(\"test-A\", V_counter, V2, V3, V4)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "432022\n" - ] - } - ], - "source": [ - "# with open(\"train/in.tsv\", encoding=\"utf8\", mode=\"rt\") as file:\n", - "# train_data = file.readlines()\n", - "# print(len(train_data))" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['rin',\n", - " '11K',\n", - " 'ui',\n", - " 'i',\n", - " 'rsognfd',\n", - " 'inlriliinnts',\n", - " 'i',\n", - " '>',\n", - " 'r',\n", - " 'the',\n", - " 'town',\n", - " 'ofy',\n", - " '.-Jinn',\n", - " ',',\n", - " 'in',\n", - " 'the',\n", - " 'county',\n", - " 'of',\n", - " 'Lincoln',\n", - " 'Rrspcrtfully',\n", - " 'rop',\n", - " 'HHont',\n", - " ',',\n", - " 'that',\n", - " 'the',\n", - " 'part',\n", - " 'ol',\n", - " 'said',\n", - " 'town',\n", - " 'whi',\n", - " '<',\n", - " 'h',\n", - " 'they',\n", - " 'inhabits',\n", - " 'remote',\n", - " 'from',\n", - " 'tiie',\n", - " 'viII',\n", - " 'no',\n", - " ',',\n", - " 'and',\n", - " 'tliat',\n", - " 'they',\n", - " 'are',\n", - " 'so',\n", - " 'sit',\n", - " 'jutfd',\n", - " '(',\n", - " 'h',\n", - " 'it',\n", - " 'they',\n", - " 'would',\n", - " 'he',\n", - " 'much',\n", - " 'hotter',\n", - " 'accomodated',\n", - " ',',\n", - " 'f',\n", - " 'their',\n", - " 'lands',\n", - " 'were',\n", - " 'to',\n", - " '1',\n", - " 'c',\n", - " 'm',\n", - " 'oil',\n", - " '*',\n", - " 'from',\n", - " 'raid',\n", - " 'town',\n", - " 'ofMna',\n", - " 'and',\n", - " 'allix',\n", - " '*',\n", - " 'd',\n", - " 'and',\n", - " 'attached',\n", - " 'to',\n", - " 'flic',\n", - " 'town',\n", - " 'of',\n", - " 'Wis',\n", - " 'tassel',\n", - " 'the',\n", - " 'si',\n", - " 'ire',\n", - " 'town',\n", - " 'of',\n", - " 'tlio',\n", - " 'County',\n", - " ',',\n", - " 'and',\n", - " 'wherenost',\n", - " 'of',\n", - " 'their',\n", - " 'hmdmss',\n", - " 'is',\n", - " 'transacted',\n", - " '.',\n", - " 'They',\n", - " 'wouldIn',\n", - " 'r',\n", - " 'lore',\n", - " 'petition',\n", - " 'y',\n", - " '<',\n", - " 'tir',\n", - " 'Hole',\n", - " 'r.ible',\n", - " 'body',\n", - " ',',\n", - " 'that',\n", - " 'thelividing',\n", - " 'line',\n", - " 'of',\n", - " 's.i',\n", - " '.J',\n", - " 'towns',\n", - " '*',\n", - " '»',\n", - " 'f',\n", - " 'Wiscns^ct',\n", - " '«',\n", - " 'mf',\n", - " \"-'Jim\",\n", - " '*',\n", - " ',',\n", - " 'nav',\n", - " 'his',\n", - " 'so',\n", - " 'far',\n", - " 'alt',\n", - " 'rod',\n", - " 'ns',\n", - " 'to',\n", - " 'include',\n", - " 'their',\n", - " 'farms',\n", - " 'inmid',\n", - " 'town',\n", - " 'of',\n", - " 'VViscasset',\n", - " ',',\n", - " 'and',\n", - " 'the',\n", - " '!',\n", - " 'the',\n", - " 'now',\n", - " 'line',\n", - " 'ofLi',\n", - " 'vision',\n", - " 'between',\n", - " 'acid',\n", - " 'towns',\n", - " 'ninv',\n", - " 'ho',\n", - " 'as',\n", - " 'fdlows',\n", - " '*',\n", - " 'vizlh',\n", - " 'ginning',\n", - " 'on',\n", - " 'the',\n", - " 'pi',\n", - " 'scut',\n", - " 'line',\n", - " 'dividing',\n", - " 'the',\n", - " 'towns',\n", - " 'oliVi.a',\n", - " 'assct',\n", - " 'and',\n", - " \"A'in\",\n", - " ',',\n", - " 'at',\n", - " 't',\n", - " \"'\",\n", - " '»',\n", - " '«',\n", - " 'southeast',\n", - " 'corner',\n", - " 'idSeorgc',\n", - " 'Acorns',\n", - " 'laud',\n", - " 'in',\n", - " 'said',\n", - " 'Aina',\n", - " 'and',\n", - " 'riinninu',\n", - " 'from',\n", - " 'Northeasterly',\n", - " 'hv',\n", - " 'the',\n", - " 'head',\n", - " 'of',\n", - " 'said',\n", - " '.^corn',\n", - " '’',\n", - " 'sand',\n", - " 'and',\n", - " 'the',\n", - " 'bonds',\n", - " 'of',\n", - " 'all',\n", - " \"the'loisjadjoiiiiiig\",\n", - " 'to',\n", - " 'theVort',\n", - " 'beast',\n", - " 'Corner',\n", - " 'of',\n", - " 'the',\n", - " 'l',\n", - " '«',\n", - " '»',\n", - " 't',\n", - " 'now',\n", - " 'owned',\n", - " 'by',\n", - " 'Ja',\n", - " 'nes',\n", - " '*',\n", - " '*',\n", - " 'oyc',\n", - " 'and',\n", - " 'formerly',\n", - " 'o',\n", - " 'm',\n", - " 'd',\n", - " 'hv',\n", - " 'tin',\n", - " '*',\n", - " 'late',\n", - " 'Hon',\n", - " '.',\n", - " 'Abie',\n", - " ')',\n", - " 'Wood',\n", - " ',',\n", - " 'andbeingp-rt',\n", - " 'oflotNo.12M',\n", - " 'M.',\n", - " 'on',\n", - " 'Me',\n", - " 'vccnics',\n", - " 'piling',\n", - " 'and',\n", - " 'theme',\n", - " '/list',\n", - " 'Northwesterly',\n", - " 'hvlie',\n", - " 'North',\n", - " 'line',\n", - " 'id',\n", - " 'said',\n", - " 'lot',\n", - " 'No',\n", - " '.',\n", - " '12',\n", - " 'to',\n", - " 'the',\n", - " 'southeaster',\n", - " 'y',\n", - " 'he',\n", - " 'id',\n", - " 'of',\n", - " 'land',\n", - " 'owned',\n", - " 'by',\n", - " 'Whitcomb',\n", - " '&',\n", - " 'Groves',\n", - " ',',\n", - " 'hence',\n", - " 'northeasterly',\n", - " 'by',\n", - " 'tiie',\n", - " 'Inal',\n", - " 'of',\n", - " 'said',\n", - " 'lot',\n", - " 'to',\n", - " 'tliolorlhonst',\n", - " 'corner',\n", - " 'thereof',\n", - " ',',\n", - " 'thence',\n", - " 'northwesterly',\n", - " 'to',\n", - " 'Ihe',\n", - " 'line',\n", - " 'of',\n", - " 'the',\n", - " 'town',\n", - " 'of',\n", - " 'Dresden',\n", - " ',',\n", - " 'thence',\n", - " '8',\n", - " '<',\n", - " '>',\n", - " 'uthwrst',\n", - " 'rly',\n", - " 'by',\n", - " 'said',\n", - " 'Dresden',\n", - " 'Inn',\n", - " '*',\n", - " ',',\n", - " 'to',\n", - " 'tbu',\n", - " 'Sunth',\n", - " 'westerlyorner',\n", - " ',',\n", - " 'of',\n", - " 'the',\n", - " 'present',\n", - " 'dividing',\n", - " 'line',\n", - " ',',\n", - " 'I',\n", - " 'etwee',\n", - " 'n',\n", - " 'theown',\n", - " '>',\n", - " '‘',\n", - " 'of',\n", - " '’',\n", - " '.J',\n", - " 'Im',\n", - " 'and',\n", - " \"Wiscii'^et\",\n", - " ',',\n", - " 'and',\n", - " 'thence',\n", - " 'East-',\n", - " 'joutb',\n", - " 'easterly',\n", - " ',',\n", - " 'ly',\n", - " 'said',\n", - " 'town',\n", - " 'lino',\n", - " 'to',\n", - " 'tiie',\n", - " 'bounds',\n", - " 'first',\n", - " 'jMentioned',\n", - " ',',\n", - " 'v',\n", - " 'jili',\n", - " 'all',\n", - " 'the',\n", - " 'lands',\n", - " 'lying',\n", - " 'vvitbiu',\n", - " 'tin',\n", - " '*',\n", - " 'loresaid',\n", - " 'limits',\n", - " 'and',\n", - " 'that',\n", - " 'ib',\n", - " 'inhabitants',\n", - " 'thereonvilli',\n", - " 'their',\n", - " 'goods',\n", - " 'and',\n", - " 'Estate',\n", - " ',',\n", - " 'may',\n", - " 'be',\n", - " 'set',\n", - " 'oil',\n", - " \"'\",\n", - " 'fromaid',\n", - " 'town',\n", - " 'of',\n", - " 'Aina',\n", - " 'to',\n", - " '»',\n", - " '»',\n", - " '»',\n", - " 'id',\n", - " 'town',\n", - " 'of',\n", - " 'Wiscassot.ton',\n", - " 'County',\n", - " 'feel',\n", - " 'an',\n", - " 'interest',\n", - " 'in',\n", - " '.',\n", - " 'tn',\n", - " 'great',\n", - " 'is',\n", - " 'sues',\n", - " 'that',\n", - " 'are',\n", - " 'now',\n", - " 'before',\n", - " 'them',\n", - " ',',\n", - " 'and',\n", - " 'whichare',\n", - " 'the',\n", - " 'bonds',\n", - " 'of',\n", - " 'cohesion',\n", - " 'by',\n", - " 'which',\n", - " 'thegreat',\n", - " 'Republican',\n", - " 'parly',\n", - " 'is',\n", - " 'united',\n", - " '.',\n", - " 'I',\n", - " 'per',\n", - " '--',\n", - " ':',\n", - " 'ceive',\n", - " 'that',\n", - " 'the',\n", - " 'principles',\n", - " 'of',\n", - " 'liberty',\n", - " 'stillanimates',\n", - " 'you',\n", - " 'as',\n", - " 'when',\n", - " 'I',\n", - " 'last',\n", - " 'addressedyou',\n", - " ',',\n", - " 'and',\n", - " 'I',\n", - " 'rejoice',\n", - " '.',\n", - " 'It',\n", - " 'is',\n", - " 'not',\n", - " 'in',\n", - " 'the',\n", - " 'na',\n", - " 'ture',\n", - " 'of',\n", - " 'the',\n", - " 'cause',\n", - " 'of',\n", - " 'human',\n", - " 'freedom',\n", - " 'to',\n", - " 'diedie',\n", - " 'out',\n", - " 'of',\n", - " 'the',\n", - " 'human',\n", - " 'heart',\n", - " '.',\n", - " 'We',\n", - " 'repre',\n", - " 'sent',\n", - " 'the',\n", - " 'righis',\n", - " 'of',\n", - " 'human',\n", - " 'liberty',\n", - " ',',\n", - " 'the',\n", - " 'sameprinciples',\n", - " 'that',\n", - " 'inspired',\n", - " 'Jefferson',\n", - " 'andJackson',\n", - " ',',\n", - " 'and',\n", - " 'we',\n", - " 'now',\n", - " 'stand',\n", - " 'where',\n", - " 'we',\n", - " 'al',\n", - " 'ways',\n", - " 'have',\n", - " 'stood',\n", - " ',',\n", - " 'and',\n", - " 'always',\n", - " 'will',\n", - " 'stand',\n", - " ',',\n", - " 'until',\n", - " 'we',\n", - " 'have',\n", - " 'attained',\n", - " 'our',\n", - " 'ends',\n", - " '.',\n", - " 'Theelation',\n", - " 'before',\n", - " 'us',\n", - " ',',\n", - " 'it',\n", - " 'is',\n", - " 'true',\n", - " \"'\",\n", - " ',',\n", - " 'is',\n", - " 'not',\n", - " 'a',\n", - " ',',\n", - " \"'\",\n", - " 'na',\n", - " 'tional',\n", - " 'election',\n", - " ',',\n", - " 'and',\n", - " 'it',\n", - " 'is',\n", - " 'true',\n", - " 'that',\n", - " 'we',\n", - " 'neednot',\n", - " 'necessarily',\n", - " 'discuss',\n", - " 'National',\n", - " 'issues',\n", - " ',',\n", - " 'but',\n", - " 'it',\n", - " 'is',\n", - " 'also',\n", - " 'true',\n", - " 'that',\n", - " 'the',\n", - " 'Republican',\n", - " 'par',\n", - " 'ty',\n", - " 'is',\n", - " 'National',\n", - " 'in',\n", - " 'its',\n", - " 'and',\n", - " 'design',\n", - " ',',\n", - " 'and',\n", - " 'hence',\n", - " ',',\n", - " 'every',\n", - " 'election',\n", - " ',',\n", - " 'be',\n", - " 'it',\n", - " 'of',\n", - " 'State.or',\n", - " ';',\n", - " 'County',\n", - " ',',\n", - " 'or',\n", - " 'of',\n", - " 'town',\n", - " ',',\n", - " 'or',\n", - " 'of',\n", - " 'city',\n", - " ',',\n", - " 'partakesalike',\n", - " 'of',\n", - " 'a',\n", - " 'National',\n", - " 'nature',\n", - " ',',\n", - " 'and',\n", - " 'their',\n", - " 're',\n", - " 'sults',\n", - " 'enter',\n", - " 'into',\n", - " 'all',\n", - " 'our',\n", - " 'general',\n", - " 'concerns.But',\n", - " 'I',\n", - " 'now',\n", - " 'propose',\n", - " 'to',\n", - " 'speak',\n", - " 'to',\n", - " 'you',\n", - " 'offacts',\n", - " 'which',\n", - " 'more',\n", - " 'immediately',\n", - " 'interestyou',\n", - " '.',\n", - " 'I',\n", - " 'am',\n", - " 'before',\n", - " 'you',\n", - " 'as',\n", - " 'your',\n", - " 'candidatefor',\n", - " 'Governor',\n", - " 'not',\n", - " 'of',\n", - " 'my',\n", - " 'own',\n", - " 'choice',\n", - " ',',\n", - " \"'\",\n", - " 'Imay',\n", - " 'justly',\n", - " 'say',\n", - " '.',\n", - " 'Ody',\n", - " 'ambition',\n", - " 'was',\n", - " 'satis',\n", - " 'fied',\n", - " 'with',\n", - " 'one',\n", - " 'term',\n", - " ',',\n", - " 'and',\n", - " 'I',\n", - " 'had',\n", - " 'hoped',\n", - " 'to',\n", - " 're',\n", - " 'tire',\n", - " 'from',\n", - " 'the',\n", - " 'cares',\n", - " 'of',\n", - " 'office',\n", - " 'to',\n", - " 'devote',\n", - " 'mytime',\n", - " 'to',\n", - " 'interests',\n", - " 'of',\n", - " 'a',\n", - " 'private',\n", - " 'nature',\n", - " '.',\n", - " 'Yetsummoned',\n", - " 'as',\n", - " 'I',\n", - " 'was',\n", - " ',',\n", - " 'by',\n", - " 'the',\n", - " 'unanimouschoice',\n", - " 'of',\n", - " 'your',\n", - " 'representatives',\n", - " 'in',\n", - " 'Conven',\n", - " 'tion',\n", - " ',',\n", - " 'I',\n", - " 'felt',\n", - " 'constrained',\n", - " 'to',\n", - " 'accept',\n", - " 'the',\n", - " 'callof',\n", - " '.',\n", - " 'the',\n", - " 'Republican',\n", - " 'party',\n", - " ',',\n", - " 'and',\n", - " 'I',\n", - " 'am',\n", - " 'hereto',\n", - " 'open',\n", - " 'to',\n", - " 'you',\n", - " 'my',\n", - " 'heart',\n", - " 'and',\n", - " 'my',\n", - " 'mind',\n", - " 'up',\n", - " 'on',\n", - " 'public',\n", - " 'questions',\n", - " 'in',\n", - " 'which',\n", - " 'you',\n", - " 'justlymanifest',\n", - " 'a',\n", - " 'deep',\n", - " 'interest',\n", - " '.']" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from nltk.tokenize import word_tokenize\n", - "word_tokenize(text)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'the': 9065021,\n", - " 'of': 5472207,\n", - " 'and': 4299259,\n", - " 'to': 3575612,\n", - " 'a': 2710622,\n", - " 'in': 2686894,\n", - " 'that': 1467928,\n", - " 'is': 1279167,\n", - " 'it': 1167772,\n", - " 'for': 1144284,\n", - " 'be': 992701,\n", - " 'was': 986130,\n", - " 'as': 879790,\n", - " 'at': 863453,\n", - " 'by': 858066,\n", - " 'on': 819505,\n", - " 'i': 816076,\n", - " 'with': 794078,\n", - " 'he': 776888,\n", - " 'or': 674438,\n", - " 'this': 627203,\n", - " 'his': 618101,\n", - " 'not': 604947,\n", - " 'from': 576711,\n", - " 'which': 572596,\n", - " 'are': 528619,\n", - " 'will': 519112,\n", - " 'have': 513257,\n", - " 's': 489456,\n", - " 'tho': 465585,\n", - " 'all': 463084,\n", - " 'but': 460675,\n", - " 'they': 450993,\n", - " 'an': 420170,\n", - " 'one': 413809,\n", - " 'had': 396904,\n", - " 'has': 386379,\n", - " 'their': 377294,\n", - " 'been': 374978,\n", - " 'no': 366339,\n", - " 'said': 353115,\n", - " 'were': 348313,\n", - " 'who': 342015,\n", - " 'we': 319853,\n", - " 'there': 311264,\n", - " 'would': 290263,\n", - " '1': 286386,\n", - " 't': 275743,\n", - " 'so': 272336,\n", - " 'if': 271926,\n", - " 'any': 269024,\n", - " 'when': 268129,\n", - " 'her': 258976,\n", - " 'them': 240990,\n", - " 'him': 237535,\n", - " 'mr': 229137,\n", - " 'its': 224384,\n", - " 'you': 223369,\n", - " 'out': 222458,\n", - " 'our': 213779,\n", - " 'other': 213610,\n", - " 'time': 211490,\n", - " 'more': 207219,\n", - " 'upon': 200290,\n", - " 'than': 199152,\n", - " 'made': 198649,\n", - " 'up': 197991,\n", - " 'day': 194396,\n", - " 'such': 193026,\n", - " 'two': 192820,\n", - " 'may': 192332,\n", - " 'tbe': 190738,\n", - " 'some': 183696,\n", - " 'state': 179728,\n", - " 'j': 178635,\n", - " 'do': 176230,\n", - " 'man': 175854,\n", - " 'now': 174816,\n", - " 'can': 174633,\n", - " 'she': 172474,\n", - " 'm': 166226,\n", - " 'into': 166143,\n", - " 'e': 166003,\n", - " 'w': 164759,\n", - " 'about': 164037,\n", - " 'n': 163632,\n", - " 'new': 162739,\n", - " 'l': 158739,\n", - " 'my': 158632,\n", - " 'only': 155874,\n", - " 'men': 155281,\n", - " 'city': 149928,\n", - " 'ing': 149573,\n", - " 'then': 149545,\n", - " 'shall': 148173,\n", - " 'these': 145383,\n", - " 'after': 144729,\n", - " 'should': 142414,\n", - " 'o': 140683,\n", - " 'over': 140671,\n", - " 'great': 139053,\n", - " 'county': 135720,\n", - " 'good': 135681,\n", - " 'very': 135509,\n", - " 'what': 135139,\n", - " 'every': 134754,\n", - " 'r': 134054,\n", - " 'years': 133524,\n", - " 'd': 133321,\n", - " 'c': 132482,\n", - " 'being': 130985,\n", - " 'people': 130583,\n", - " 'first': 127281,\n", - " '000': 127084,\n", - " 're': 125442,\n", - " 'many': 124439,\n", - " 'most': 123285,\n", - " 'could': 123230,\n", - " 'under': 122289,\n", - " 'h': 121514,\n", - " 'before': 118539,\n", - " 'well': 118108,\n", - " 'per': 114940,\n", - " 'last': 114552,\n", - " 'work': 113010,\n", - " 'same': 112079,\n", - " 'where': 111579,\n", - " 'me': 111346,\n", - " 'f': 110556,\n", - " 'mrs': 108039,\n", - " 'those': 107671,\n", - " 'ot': 107631,\n", - " 'feet': 106860,\n", - " 'much': 106570,\n", - " 'year': 104062,\n", - " 'make': 103103,\n", - " 'states': 101683,\n", - " 'three': 99943,\n", - " 'while': 97401,\n", - " 'house': 97187,\n", - " 'also': 95849,\n", - " 'old': 95558,\n", - " 'through': 94245,\n", - " 'each': 93521,\n", - " 'way': 93193,\n", - " 'country': 92494,\n", - " 'tion': 92215,\n", - " 'us': 92158,\n", - " 'little': 92011,\n", - " 'court': 90894,\n", - " 'place': 90642,\n", - " 'down': 90465,\n", - " '2': 90005,\n", - " 'b': 89797,\n", - " 'must': 89316,\n", - " 'did': 88750,\n", - " 'land': 88682,\n", - " 'north': 87040,\n", - " 'con': 85792,\n", - " 'part': 85665,\n", - " 'south': 85226,\n", - " 'your': 85192,\n", - " 'street': 84360,\n", - " 'aud': 83993,\n", - " 'public': 81839,\n", - " 'law': 81740,\n", - " 'long': 81409,\n", - " 'without': 81332,\n", - " 'here': 80105,\n", - " 'against': 79394,\n", - " 'de': 78915,\n", - " 'th': 77471,\n", - " 'u': 76398,\n", - " 'ed': 76228,\n", - " 'until': 75857,\n", - " 'p': 75604,\n", - " 'take': 75389,\n", - " 'large': 75219,\n", - " 'united': 75181,\n", - " 'line': 74996,\n", - " 'right': 74664,\n", - " 'few': 74474,\n", - " 'general': 74442,\n", - " 'ol': 74202,\n", - " 'life': 73885,\n", - " 'west': 73557,\n", - " 'like': 73209,\n", - " 'own': 72963,\n", - " 'bo': 72946,\n", - " 'found': 72887,\n", - " 'never': 72376,\n", - " '4': 72237,\n", - " 'company': 71150,\n", - " 'present': 70655,\n", - " '3': 70322,\n", - " 'go': 70233,\n", - " 'water': 70171,\n", - " 'money': 69656,\n", - " 'just': 69335,\n", - " 'party': 68859,\n", - " 'government': 68460,\n", - " 'home': 68371,\n", - " 'ho': 67622,\n", - " 'even': 66865,\n", - " 'days': 66663,\n", - " 'lie': 65871,\n", - " 'business': 64810,\n", - " 'ever': 64807,\n", - " 'get': 64435,\n", - " 'interest': 64157,\n", - " '10': 63963,\n", - " 'how': 63854,\n", - " 'war': 63838,\n", - " 'taken': 63488,\n", - " 'during': 62969,\n", - " 'given': 62934,\n", - " 'see': 62869,\n", - " 'four': 62746,\n", - " 'come': 62435,\n", - " 'case': 61818,\n", - " 'having': 61386,\n", - " 'came': 60657,\n", - " 'know': 60620,\n", - " 'side': 60173,\n", - " 'com': 60088,\n", - " 'between': 60033,\n", - " 'order': 60029,\n", - " 'back': 59161,\n", - " 'give': 58993,\n", - " 'st': 58879,\n", - " 'iu': 58846,\n", - " 'john': 58509,\n", - " 'say': 58438,\n", - " 'best': 58191,\n", - " 'put': 58187,\n", - " 'too': 58037,\n", - " 'half': 57773,\n", - " 'office': 57699,\n", - " 'thence': 57646,\n", - " 'lot': 57528,\n", - " 'fact': 57223,\n", - " 'known': 57118,\n", - " 'both': 56984,\n", - " 'power': 56978,\n", - " 'number': 56772,\n", - " 'night': 56261,\n", - " 'la': 56044,\n", - " 'world': 55992,\n", - " 'president': 55991,\n", - " 'another': 55779,\n", - " 'district': 55515,\n", - " 'v': 55512,\n", - " 'next': 55126,\n", - " 'less': 55053,\n", - " 'ii': 54831,\n", - " 'went': 54645,\n", - " 'york': 54529,\n", - " 'far': 54511,\n", - " 'within': 53995,\n", - " 'ex': 53978,\n", - " 'left': 53894,\n", - " 'young': 53382,\n", - " 'town': 53122,\n", - " 'off': 53096,\n", - " '5': 52989,\n", - " 'hundred': 52853,\n", - " '8': 52792,\n", - " 'east': 52776,\n", - " 'five': 52647,\n", - " 'point': 52614,\n", - " 'use': 52450,\n", - " '*': 51877,\n", - " 'pay': 51822,\n", - " 'among': 51741,\n", - " 'yet': 51263,\n", - " 'several': 51056,\n", - " 'done': 50859,\n", - " 'bill': 50841,\n", - " 'white': 50826,\n", - " 'nnd': 50740,\n", - " 'held': 50550,\n", - " 'property': 50547,\n", - " 'road': 50330,\n", - " 'might': 50244,\n", - " 'board': 49911,\n", - " 'again': 49873,\n", - " 'high': 49557,\n", - " 'whole': 49391,\n", - " 'miss': 48883,\n", - " 'g': 48808,\n", - " 'act': 48591,\n", - " 'still': 48504,\n", - " 'hand': 48430,\n", - " 'end': 48330,\n", - " 'matter': 48328,\n", - " 'away': 48199,\n", - " 'sale': 48080,\n", - " 'ment': 47671,\n", - " 'ten': 47613,\n", - " 'because': 47468,\n", - " 'school': 47413,\n", - " 'twenty': 47404,\n", - " 'above': 47384,\n", - " 'called': 46828,\n", - " 'american': 46822,\n", - " 'y': 46356,\n", - " 'cent': 46222,\n", - " 'amount': 46115,\n", - " 'course': 45302,\n", - " 'ago': 45238,\n", - " 'small': 45187,\n", - " 'week': 45112,\n", - " 'six': 45092,\n", - " 'used': 44799,\n", - " 'section': 44395,\n", - " 'since': 44346,\n", - " 'dr': 44303,\n", - " 'once': 44211,\n", - " 'took': 44000,\n", - " '11': 43914,\n", - " 'ami': 43913,\n", - " '7': 43733,\n", - " 'himself': 43626,\n", - " 'nothing': 43490,\n", - " 'paid': 43343,\n", - " 'better': 43336,\n", - " 'am': 43321,\n", - " 'let': 43230,\n", - " 'bad': 43152,\n", - " 'soon': 43000,\n", - " 'clock': 42944,\n", - " 'however': 42464,\n", - " 'head': 42236,\n", - " 'k': 42178,\n", - " 'en': 42174,\n", - " 'does': 42024,\n", - " 'certain': 41908,\n", - " 'along': 41676,\n", - " 'pro': 41173,\n", - " 'body': 40913,\n", - " 'near': 40745,\n", - " 'committee': 40642,\n", - " 'thing': 40575,\n", - " 'question': 40132,\n", - " 'cause': 40071,\n", - " 'full': 40009,\n", - " 'others': 39921,\n", - " 'set': 39912,\n", - " 'brought': 39789,\n", - " 'al': 39459,\n", - " 'think': 39390,\n", - " 'making': 39357,\n", - " 'miles': 39337,\n", - " 'thought': 39327,\n", - " 'second': 39271,\n", - " 'morning': 39184,\n", - " 'though': 39178,\n", - " 'times': 39105,\n", - " 'girl': 38804,\n", - " 'boy': 38784,\n", - " '6': 38763,\n", - " 'co': 38623,\n", - " 'room': 38449,\n", - " 'following': 38325,\n", - " 'name': 38301,\n", - " 'wife': 38295,\n", - " 'church': 38274,\n", - " 'dollars': 38002,\n", - " 'always': 37648,\n", - " 'enough': 37486,\n", - " 'thus': 37477,\n", - " 'un': 37410,\n", - " 'almost': 37402,\n", - " 'cannot': 37223,\n", - " 'able': 37192,\n", - " 'river': 36841,\n", - " 'find': 36795,\n", - " '00': 36793,\n", - " 'ground': 36537,\n", - " 'due': 36444,\n", - " 'children': 36286,\n", - " 'got': 36227,\n", - " 'free': 36206,\n", - " 'light': 36137,\n", - " 'action': 36062,\n", - " 'ia': 36049,\n", - " 'washington': 35891,\n", - " 'friends': 35600,\n", - " 'says': 35599,\n", - " 'stock': 35587,\n", - " 'lo': 35573,\n", - " 'whom': 35563,\n", - " 'whose': 35346,\n", - " 'service': 35273,\n", - " 'received': 35272,\n", - " 'means': 34777,\n", - " 'person': 34759,\n", - " 'necessary': 34700,\n", - " 'nor': 34676,\n", - " 'told': 34675,\n", - " 'death': 34557,\n", - " 'sent': 34369,\n", - " 'further': 34226,\n", - " 'purpose': 34128,\n", - " 'er': 34115,\n", - " 'things': 34079,\n", - " 'tha': 33661,\n", - " 'congress': 33650,\n", - " 'bis': 33499,\n", - " 'passed': 33493,\n", - " 'seen': 33484,\n", - " 'national': 33330,\n", - " 'building': 33234,\n", - " 'keep': 33214,\n", - " 'front': 33196,\n", - " 'block': 33088,\n", - " 'real': 33028,\n", - " 'aa': 32803,\n", - " 'going': 32767,\n", - " 'past': 32699,\n", - " 'whether': 32622,\n", - " 'months': 32443,\n", - " 'dis': 32419,\n", - " 'ly': 32398,\n", - " 'true': 32297,\n", - " 'sum': 32271,\n", - " 'woman': 32180,\n", - " 'subject': 32114,\n", - " '50': 32109,\n", - " 'either': 32013,\n", - " 'railroad': 31994,\n", - " 'son': 31985,\n", - " 'members': 31976,\n", - " 'union': 31922,\n", - " 'system': 31839,\n", - " '0': 31799,\n", - " 'gold': 31698,\n", - " 'around': 31668,\n", - " 'persons': 31587,\n", - " '20': 31585,\n", - " 'sold': 31542,\n", - " 'duty': 31529,\n", - " 'market': 31376,\n", - " 'least': 31270,\n", - " 'show': 31147,\n", - " 'form': 30989,\n", - " 'hands': 30983,\n", - " '12': 30964,\n", - " 'saw': 30856,\n", - " 'tlie': 30853,\n", - " 'family': 30818,\n", - " 'cost': 30746,\n", - " 'report': 30665,\n", - " 'why': 30549,\n", - " 'nearly': 30520,\n", - " 'election': 30453,\n", - " 'short': 30337,\n", - " 'price': 30306,\n", - " 'become': 30266,\n", - " 'notice': 30132,\n", - " 'look': 30122,\n", - " 'condition': 30013,\n", - " '30': 29989,\n", - " 'open': 29981,\n", - " 'meeting': 29913,\n", - " 'kind': 29855,\n", - " 'lots': 29836,\n", - " 'corner': 29771,\n", - " 'women': 29510,\n", - " 'together': 29506,\n", - " 'possible': 29491,\n", - " 'ihe': 29406,\n", - " 'gave': 29384,\n", - " '100': 29274,\n", - " 'themselves': 29250,\n", - " 'reason': 29105,\n", - " 'labor': 29043,\n", - " 'ter': 29006,\n", - " 'judge': 28965,\n", - " 'vote': 28927,\n", - " 'result': 28914,\n", - " 'third': 28722,\n", - " 'run': 28717,\n", - " 'fair': 28653,\n", - " 'tin': 28635,\n", - " 'value': 28498,\n", - " 'mortgage': 28465,\n", - " 'eight': 28464,\n", - " 'ad': 28331,\n", - " 'position': 28286,\n", - " 'evening': 28178,\n", - " 'wo': 28163,\n", - " 'thereof': 28056,\n", - " '9': 28023,\n", - " 'tor': 27921,\n", - " 'lor': 27839,\n", - " 'provided': 27801,\n", - " 'bank': 27781,\n", - " 'cut': 27746,\n", - " 'im': 27696,\n", - " 'described': 27667,\n", - " 'believe': 27648,\n", - " 'hour': 27634,\n", - " 'paper': 27584,\n", - " 'hold': 27567,\n", - " 'live': 27543,\n", - " '15': 27446,\n", - " 'acres': 27399,\n", - " 'god': 27370,\n", - " 'early': 27298,\n", - " '25': 27277,\n", - " 'quarter': 27190,\n", - " 'thirty': 27132,\n", - " 'want': 27115,\n", - " 'therefore': 27093,\n", - " 'late': 27091,\n", - " 'call': 26989,\n", - " 'charge': 26959,\n", - " 'heard': 26900,\n", - " 'army': 26885,\n", - " 'effect': 26707,\n", - " 'waa': 26689,\n", - " 'laws': 26659,\n", - " 'face': 26638,\n", - " 'oi': 26628,\n", - " 'cents': 26425,\n", - " 'stand': 26384,\n", - " 'age': 26226,\n", - " 'kept': 26167,\n", - " 'fire': 26109,\n", - " 'tne': 26103,\n", - " 'date': 25797,\n", - " 'placed': 25621,\n", - " 'common': 25606,\n", - " 'mind': 25554,\n", - " 'william': 25538,\n", - " 'march': 25514,\n", - " 'door': 25482,\n", - " 'heart': 25459,\n", - " 'republican': 25359,\n", - " 'aid': 25218,\n", - " 'special': 25161,\n", - " 'force': 25130,\n", - " 'ap': 25055,\n", - " 'beginning': 25026,\n", - " 'thousand': 25006,\n", - " 'secretary': 25005,\n", - " 'strong': 24999,\n", - " 'ac': 24935,\n", - " 'claim': 24931,\n", - " 'farm': 24896,\n", - " 'officers': 24822,\n", - " 'father': 24813,\n", - " 'estate': 24803,\n", - " 'political': 24619,\n", - " 'tax': 24583,\n", - " 'except': 24565,\n", - " 'manner': 24525,\n", - " 'cases': 24524,\n", - " 'lands': 24481,\n", - " 'department': 24456,\n", - " 'ar': 24372,\n", - " 'hard': 24357,\n", - " 'already': 24286,\n", - " 'proper': 24281,\n", - " 'hi': 24258,\n", - " 'required': 24237,\n", - " 'low': 24225,\n", - " 'air': 24213,\n", - " 'trust': 24206,\n", - " 'asked': 24203,\n", - " 'james': 24201,\n", - " 'blood': 24189,\n", - " 'book': 24186,\n", - " 'meet': 24156,\n", - " 'poor': 24116,\n", - " 'fall': 24072,\n", - " 'george': 24063,\n", - " 'trade': 24019,\n", - " 'big': 23973,\n", - " 'quite': 23900,\n", - " 'car': 23566,\n", - " 'ready': 23514,\n", - " 'often': 23510,\n", - " 'close': 23374,\n", - " 'field': 23359,\n", - " 'bonds': 23337,\n", - " 'read': 23320,\n", - " 'attention': 23309,\n", - " 'view': 23229,\n", - " 'class': 23192,\n", - " 'red': 23181,\n", - " 'hut': 23150,\n", - " 'care': 23107,\n", - " 'mother': 23095,\n", - " 'black': 23081,\n", - " 'tell': 23073,\n", - " 'deed': 23072,\n", - " 'return': 23011,\n", - " 'gen': 23006,\n", - " 'tions': 22984,\n", - " 'lost': 22978,\n", - " 'something': 22884,\n", - " 'favor': 22766,\n", - " 'nt': 22733,\n", - " 'rate': 22629,\n", - " 'health': 22618,\n", - " 'weeks': 22573,\n", - " 'fine': 22567,\n", - " 'oil': 22549,\n", - " 'taking': 22481,\n", - " 'hereby': 22425,\n", - " 'follows': 22375,\n", - " 'hours': 22368,\n", - " 'hope': 22366,\n", - " 'july': 22337,\n", - " 'letter': 22320,\n", - " 'seven': 22309,\n", - " 'turned': 22295,\n", - " 'pre': 22155,\n", - " 'change': 22088,\n", - " 'yesterday': 22085,\n", - " 'demand': 22065,\n", - " 'don': 22026,\n", - " 'corn': 22006,\n", - " 'governor': 21960,\n", - " 'democratic': 21956,\n", - " 'senate': 21946,\n", - " 'need': 21937,\n", - " 'coming': 21932,\n", - " 'prices': 21903,\n", - " 'try': 21890,\n", - " 'knew': 21885,\n", - " 'eyes': 21879,\n", - " 'virginia': 21859,\n", - " 'carried': 21841,\n", - " 'minutes': 21785,\n", - " 'train': 21780,\n", - " 'opinion': 21763,\n", - " 'itself': 21699,\n", - " 'doubt': 21693,\n", - " 'leave': 21663,\n", - " 'grand': 21626,\n", - " 'account': 21583,\n", - " 'month': 21563,\n", - " 'nature': 21520,\n", - " 'citizens': 21483,\n", - " 'sell': 21470,\n", - " 'food': 21406,\n", - " 'rather': 21324,\n", - " 'western': 21296,\n", - " 'nation': 21288,\n", - " 'character': 21283,\n", - " 'bring': 21268,\n", - " 'although': 21250,\n", - " 'ns': 21219,\n", - " 'seems': 21196,\n", - " 'probably': 21095,\n", - " 'southern': 21015,\n", - " 'dead': 20966,\n", - " 'worth': 20918,\n", - " 'anything': 20847,\n", - " 'began': 20846,\n", - " 'li': 20795,\n", - " 'child': 20729,\n", - " 'silver': 20723,\n", - " 'according': 20646,\n", - " 'fifty': 20565,\n", - " 'hall': 20553,\n", - " 'important': 20539,\n", - " 'charles': 20488,\n", - " 'smith': 20474,\n", - " 'chief': 20472,\n", - " 'doing': 20464,\n", - " 'love': 20462,\n", - " 'turn': 20440,\n", - " 'june': 20423,\n", - " 'ti': 20420,\n", - " 'senator': 20412,\n", - " 'feel': 20405,\n", - " 'wheat': 20394,\n", - " 'latter': 20375,\n", - " 'entire': 20375,\n", - " 'iron': 20371,\n", - " 'heavy': 20328,\n", - " 'story': 20296,\n", - " 'different': 20231,\n", - " 'record': 20197,\n", - " 'il': 20195,\n", - " 'met': 20185,\n", - " 'ou': 20136,\n", - " 'terms': 20037,\n", - " 'ton': 19905,\n", - " 'spring': 19903,\n", - " 'became': 19897,\n", - " '13': 19826,\n", - " 'peace': 19819,\n", - " 'seemed': 19796,\n", - " 'ship': 19773,\n", - " 'fully': 19727,\n", - " 'ill': 19709,\n", - " 'various': 19682,\n", - " 'post': 19655,\n", - " 'horse': 19616,\n", - " 'named': 19559,\n", - " 'running': 19554,\n", - " 'gone': 19525,\n", - " 'avenue': 19521,\n", - " 'range': 19503,\n", - " 'mo': 19496,\n", - " 'reached': 19480,\n", - " 'ha': 19462,\n", - " 'plan': 19424,\n", - " 'season': 19391,\n", - " 'clerk': 19340,\n", - " 'appear': 19315,\n", - " 'inches': 19302,\n", - " 'convention': 19295,\n", - " 'living': 19292,\n", - " 'portion': 19268,\n", - " 'help': 19267,\n", - " 'member': 19224,\n", - " 'perhaps': 19219,\n", - " 'chicago': 19191,\n", - " 'aad': 19174,\n", - " 'later': 19171,\n", - " 'places': 19117,\n", - " 'rest': 19087,\n", - " 'main': 19080,\n", - " 'rights': 19076,\n", - " '40': 19053,\n", - " 'conditions': 19053,\n", - " 'april': 19024,\n", - " 'future': 19023,\n", - " 'greater': 19019,\n", - " 'constitution': 18998,\n", - " 'foot': 18993,\n", - " 'words': 18974,\n", - " 'success': 18973,\n", - " 'justice': 18935,\n", - " 'hill': 18855,\n", - " 'und': 18825,\n", - " 'streets': 18812,\n", - " 'sec': 18737,\n", - " 'crop': 18729,\n", - " 'forty': 18729,\n", - " 'today': 18701,\n", - " 'loss': 18680,\n", - " '14': 18646,\n", - " 'friend': 18607,\n", - " 'word': 18596,\n", - " 'alone': 18554,\n", - " 'local': 18551,\n", - " 'sea': 18522,\n", - " 'lu': 18511,\n", - " 'payment': 18495,\n", - " 'laid': 18465,\n", - " 'generally': 18461,\n", - " 'winter': 18458,\n", - " 'col': 18410,\n", - " 'majority': 18392,\n", - " 'support': 18372,\n", - " 'history': 18307,\n", - " 'till': 18239,\n", - " 'regard': 18214,\n", - " 'earth': 18210,\n", - " 'england': 18198,\n", - " 'nine': 18193,\n", - " 'aro': 18154,\n", - " 'cash': 18104,\n", - " 'cotton': 18065,\n", - " 'ohio': 18020,\n", - " 'foreign': 17966,\n", - " 'interests': 17953,\n", - " 'king': 17938,\n", - " 'judgment': 17937,\n", - " 'makes': 17916,\n", - " 'stated': 17901,\n", - " 'toward': 17897,\n", - " 'lower': 17875,\n", - " 'wit': 17873,\n", - " 'equal': 17867,\n", - " 'mary': 17853,\n", - " 'wood': 17838,\n", - " 'capital': 17817,\n", - " 'parties': 17781,\n", - " 'felt': 17778,\n", - " 'looked': 17754,\n", - " 'died': 17744,\n", - " 'pass': 17737,\n", - " '18': 17726,\n", - " 'arc': 17712,\n", - " 'moment': 17704,\n", - " 'afternoon': 17700,\n", - " 'ty': 17691,\n", - " 'period': 17680,\n", - " 'lines': 17648,\n", - " 'returned': 17621,\n", - " 'unless': 17614,\n", - " 'increase': 17599,\n", - " 'idea': 17586,\n", - " 'private': 17576,\n", - " '16': 17569,\n", - " 'lake': 17550,\n", - " 'ber': 17529,\n", - " 'giving': 17519,\n", - " 'cold': 17504,\n", - " 'personal': 17444,\n", - " 'lay': 17421,\n", - " 'farmers': 17403,\n", - " 'degrees': 17385,\n", - " 'policy': 17359,\n", - " 'ma': 17356,\n", - " 'ft': 17331,\n", - " 'henry': 17327,\n", - " 'cor': 17305,\n", - " 'territory': 17287,\n", - " 'disease': 17279,\n", - " 'comes': 17200,\n", - " 'supply': 17199,\n", - " 'es': 17189,\n", - " 'spirit': 17174,\n", - " 'boys': 17143,\n", - " 'brown': 17138,\n", - " 'followed': 17117,\n", - " 'ought': 17101,\n", - " 'secured': 17039,\n", - " 'township': 17004,\n", - " 'secure': 16912,\n", - " 'carry': 16885,\n", - " 'society': 16870,\n", - " 'shown': 16865,\n", - " 'fore': 16850,\n", - " 'au': 16850,\n", - " 'sure': 16847,\n", - " 'human': 16805,\n", - " 'monday': 16785,\n", - " 'especially': 16784,\n", - " 'entirely': 16724,\n", - " 'tbo': 16675,\n", - " 'rich': 16674,\n", - " 'clear': 16614,\n", - " 'farmer': 16596,\n", - " 'soil': 16560,\n", - " 'trouble': 16534,\n", - " 'elected': 16524,\n", - " 'coal': 16521,\n", - " 'ward': 16506,\n", - " 'stone': 16477,\n", - " 'self': 16457,\n", - " 'america': 16439,\n", - " 'taxes': 16396,\n", - " 'll': 16384,\n", - " 'tried': 16360,\n", - " 'ana': 16325,\n", - " 'former': 16324,\n", - " 'term': 16310,\n", - " 'honor': 16306,\n", - " 'ordered': 16303,\n", - " 'sunday': 16291,\n", - " 'premises': 16249,\n", - " 'started': 16245,\n", - " 'bed': 16221,\n", - " 'goods': 16187,\n", - " 'instead': 16184,\n", - " 'thomas': 16159,\n", - " 'trial': 16141,\n", - " 'across': 16122,\n", - " 'beautiful': 16119,\n", - " 'pa': 16110,\n", - " 'strength': 16083,\n", - " 'allowed': 16073,\n", - " 'deal': 16044,\n", - " 'port': 15990,\n", - " 'lady': 15937,\n", - " 'highest': 15934,\n", - " 'parts': 15933,\n", - " 'pounds': 15929,\n", - " 'island': 15921,\n", - " 'top': 15883,\n", - " 'deep': 15883,\n", - " 'session': 15874,\n", - " 'recorded': 15839,\n", - " 'control': 15819,\n", - " 'served': 15812,\n", - " 'entered': 15787,\n", - " 'military': 15785,\n", - " 'tl': 15751,\n", - " 'none': 15751,\n", - " 'stood': 15751,\n", - " 'french': 15748,\n", - " 'answer': 15742,\n", - " 'seem': 15725,\n", - " 'saturday': 15654,\n", - " 'legislature': 15644,\n", - " 'sun': 15644,\n", - " 'sufficient': 15627,\n", - " '17': 15585,\n", - " 'houses': 15573,\n", - " 'rev': 15552,\n", - " 'article': 15541,\n", - " 'evidence': 15538,\n", - " 'expected': 15532,\n", - " 'statement': 15512,\n", - " '500': 15496,\n", - " 'object': 15493,\n", - " 'thc': 15493,\n", - " 'built': 15483,\n", - " 'win': 15459,\n", - " 'suit': 15456,\n", - " 'reported': 15446,\n", - " 'attorney': 15443,\n", - " 'club': 15436,\n", - " 'fur': 15432,\n", - " 'note': 15422,\n", - " 'officer': 15418,\n", - " 'total': 15411,\n", - " 'distance': 15389,\n", - " 'ono': 15385,\n", - " 'january': 15384,\n", - " 'cure': 15376,\n", - " 'council': 15371,\n", - " 'issue': 15364,\n", - " 'se': 15350,\n", - " 'immediately': 15310,\n", - " 'race': 15306,\n", - " 'san': 15278,\n", - " 'green': 15273,\n", - " 'wa': 15230,\n", - " 'looking': 15218,\n", - " 'debt': 15201,\n", - " 'firm': 15194,\n", - " 'ers': 15175,\n", - " 'louis': 15158,\n", - " 'roads': 15145,\n", - " 'ne': 15143,\n", - " 'hat': 15138,\n", - " 'twelve': 15108,\n", - " 'forth': 15093,\n", - " 'claims': 15090,\n", - " 'higher': 15077,\n", - " 'offered': 15065,\n", - " 'id': 15058,\n", - " 'august': 15049,\n", - " 'finally': 15046,\n", - " 'receive': 15035,\n", - " 'captain': 15012,\n", - " 'fell': 15011,\n", - " 'commission': 14989,\n", - " 'havo': 14976,\n", - " 'bear': 14965,\n", - " 'bv': 14962,\n", - " 'dakota': 14960,\n", - " 'ness': 14948,\n", - " 'issued': 14938,\n", - " 'husband': 14926,\n", - " 'proposed': 14925,\n", - " 'points': 14912,\n", - " 'principal': 14901,\n", - " 'killed': 14901,\n", - " 'won': 14890,\n", - " 'wide': 14874,\n", - " 'le': 14849,\n", - " 'tie': 14828,\n", - " 'getting': 14805,\n", - " 'store': 14797,\n", - " 'etc': 14782,\n", - " 'single': 14779,\n", - " 'schools': 14751,\n", - " 'news': 14736,\n", - " 'natural': 14726,\n", - " 'direction': 14706,\n", - " 'opened': 14684,\n", - " 'police': 14681,\n", - " 'dry': 14666,\n", - " 'whatever': 14661,\n", - " 'game': 14652,\n", - " 'below': 14648,\n", - " 'trees': 14631,\n", - " 'quiet': 14630,\n", - " 'follow': 14622,\n", - " 'hear': 14621,\n", - " 'desire': 14621,\n", - " 'mining': 14592,\n", - " 'summer': 14561,\n", - " 'ai': 14560,\n", - " 'ir': 14555,\n", - " 'addition': 14547,\n", - " 'page': 14484,\n", - " 'fourth': 14476,\n", - " 'beyond': 14424,\n", - " 'press': 14377,\n", - " 'average': 14376,\n", - " 'dated': 14368,\n", - " 'led': 14362,\n", - " 'regular': 14336,\n", - " 'tba': 14332,\n", - " 'length': 14328,\n", - " 'continued': 14283,\n", - " 'northern': 14280,\n", - " ...}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pickle\n", - "with open('V.pickle', 'rb') as handle:\n", - " V_counter = pickle.load(handle)\n", - "V_counter" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "10000" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(V_counter)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "python11", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.3" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/kenlm_2words.py b/kenlm_2words.py deleted file mode 100644 index 00c34a5..0000000 --- a/kenlm_2words.py +++ /dev/null @@ -1,79 +0,0 @@ -from tqdm import tqdm -import regex as re -from nltk.tokenize import word_tokenize -from english_words import get_english_words_set -import kenlm -from math import log10 -import pickle - -path = 'kenlm_model.binary' -model = kenlm.Model(path) - -with open('V.pickle', 'rb') as handle: - V_counter = pickle.load(handle) - -def clean_string(text): - text = text.lower() - text = re.sub(r" -\\*\\n", "", text) - text = re.sub(r"\\n", " ", text) - text = text.strip() - return text - - -def predict_probs(w1, w3): - best_scores = [] - pred_str = "" - # for word in get_english_words_set(['web2'], lower=True): - for word in V_counter: - text = ' '.join([w1, word, w3]) - text_score = model.score(text, bos=False, eos=False) - if len(best_scores) < 5: - best_scores.append((word, text_score)) - else: - worst_score = best_scores[-1] - if worst_score[1] < text_score: - best_scores[-1] = (word, text_score) - best_scores = sorted(best_scores, key=lambda tup: tup[1], reverse=True) - - for word, prob in best_scores: - pred_str += f'{word}:{prob} ' - pred_str += f':{log10(0.99)}' - return pred_str - -def get_word_predictions(w1, w2,): - for word in get_english_words_set(['web2'], lower=True): - sentence = w1 + ' ' + word + ' ' + w2 - text_score = model.score(sentence, bos=False, eos=False) - yield((word, text_score)) - -def argmax(w1,w2): - # get top 10 predictions from predict_line - top_10 = sorted(list(get_word_predictions(w1,w2)), key=lambda x: -x[1])[:4] - output_line = " ".join(["{}:{:.8f}".format(w, p) for w, p in top_10]) - return output_line - -def run_predictions(source_folder): - print(f"Run predictions on {source_folder} data...") - - with open(f"{source_folder}/in.tsv", encoding="utf8", mode="rt") as file: - train_data = file.readlines() - - with open(f"{source_folder}/out.tsv", "w", encoding="utf-8") as output_file: - for line in tqdm(train_data): - line = line.split("\t") - - l1 = clean_string(line[-2]) - l2 = clean_string(line[-1]) - - if not l1 or not l2: - out_line = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1" - else: - w1 = word_tokenize(l1)[-1] - w2 = word_tokenize(l2)[0] - out_line = predict_probs(w1, w2) - - output_file.write(out_line + "\n") - - -run_predictions("dev-0") -run_predictions("test-A") diff --git a/kenlm_4words.py b/kenlm_run.py similarity index 84% rename from kenlm_4words.py rename to kenlm_run.py index f68afd5..ccd1a2d 100644 --- a/kenlm_4words.py +++ b/kenlm_run.py @@ -23,7 +23,6 @@ def clean_string(text): def predict_probs(w1, w2, w4, w5): best_scores = [] pred_str = "" - # for word in get_english_words_set(['web2'], lower=True): for word in V_counter: text = ' '.join([w1, w2, word, w4, w5]) text_score = model.score(text, bos=False, eos=False) @@ -42,19 +41,16 @@ def predict_probs(w1, w2, w4, w5): def get_word_predictions(w1, w2,): for word in get_english_words_set(['web2'], lower=True): - sentence = w1 + ' ' + word + ' ' + w2 - text_score = model.score(sentence, bos=False, eos=False) + sentence = f'{w1} {word} {w2}' + text_score = model.score(sentence, False, False) yield((word, text_score)) def argmax(w1,w2): - # get top 10 predictions from predict_line top_10 = sorted(list(get_word_predictions(w1,w2)), key=lambda x: -x[1])[:4] output_line = " ".join(["{}:{:.8f}".format(w, p) for w, p in top_10]) return output_line -def run_predictions(source_folder): - print(f"Run predictions on {source_folder} data...") - +def run_predictions(source_folder): with open(f"{source_folder}/in.tsv", encoding="utf8", mode="rt") as file: train_data = file.readlines() @@ -66,7 +62,7 @@ def run_predictions(source_folder): l2 = clean_string(line[-1]) if not l1 or not l2: - out_line = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1" + out_line = "the:0.5 a:0.3 :0.2" else: w1, w2 = word_tokenize(l1)[-2:] w3, w4 = word_tokenize(l2)[:2]