challenging-america-word-ga.../kenlm.ipynb

1989 lines
50 KiB
Plaintext
Raw Normal View History

2023-04-26 08:07:17 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# xzcat -f1 train/in.tsv.xz | cut -f7,8 | sed 's/-\\\\n/ /g' | sed 's/\\\\n//g' | sed 's/\\\\//g' | ../kenlm/build/bin/lmplz -o 5 > kenlm_model.arpa\n",
"# ../kenlm/build/bin/build_binary kenlm_model.arpa kenlm_model.binary "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading train data...\n"
]
}
],
"source": [
"import regex as re\n",
"\n",
"# save train text to file\n",
"\n",
"def clean_string(text):\n",
" text = text.lower()\n",
" text = re.sub(r\" -\\\\*\\\\n\", \"\", text)\n",
" text = re.sub(r\"\\\\n\", \" \", text)\n",
" text = text.strip()\n",
" return text\n",
"\n",
"train_text = \"\"\n",
"print(\"Reading train data...\")\n",
"with open(\"train/in.tsv\", encoding=\"utf8\", mode=\"rt\") as file, open(\"train/expected.tsv\", encoding=\"utf8\", mode=\"rt\") as expected:\n",
" for t_line, e_line in zip(file, expected):\n",
" t_line = t_line.split(\"\\t\")\n",
" train_text += clean_string(t_line[-2]) + f\" {clean_string(e_line)} \" + clean_string(t_line[-1])\n",
"\n",
"# save train_text to file\n",
"print(\"saving to file...\")\n",
"with open(\"train_text.txt\", encoding=\"utf8\", mode=\"w\") as file:\n",
" file.write(train_text)\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-7.822547912597656\n"
]
}
],
"source": [
"import kenlm\n",
"\n",
"path = 'test_model.binary'\n",
"model = kenlm.Model(path)\n",
"\n",
"sentence = \"of the way\"\n",
"print(model.score(sentence))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Run predictions on dev-0 data...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 8/10519 [08:16<40:44:33, 13.95s/it] "
]
}
],
"source": [
"from tqdm import tqdm\n",
"import regex as re\n",
"from nltk.tokenize import word_tokenize\n",
"from english_words import get_english_words_set\n",
"\n",
"\n",
"\n",
"def clean_string(text):\n",
" text = text.lower()\n",
" text = re.sub(r\" -\\\\*\\\\n\", \"\", text)\n",
" text = re.sub(r\"\\\\n\", \" \", text)\n",
" text = text.strip()\n",
" return text\n",
"\n",
"\n",
"def get_word_predictions(w1, w2,):\n",
" for word in get_english_words_set(['web2'], lower=True):\n",
" sentence = w1 + ' ' + word + ' ' + w2\n",
" text_score = model.score(sentence, bos=False, eos=False)\n",
" yield((word, text_score))\n",
"\n",
"def argmax(w1,w2):\n",
" # get top 10 predictions from predict_line\n",
" top_10 = sorted(list(get_word_predictions(w1,w2)), key=lambda x: -x[1])[:10]\n",
" output_line = \" \".join([\"{}:{:.8f}\".format(w, p) for w, p in top_10])\n",
" return output_line\n",
"\n",
" # print(f\"{sentence}: {text_score}\")\n",
"\n",
" # probs = list(argmax(w1, w2, w4, w5, v, v2, v3))\n",
" # sum_prob = sum(p for (w, p) in probs)\n",
"\n",
" # try:\n",
" # probs = [(w, p / sum_prob) for w, p in probs]\n",
" # except ZeroDivisionError:\n",
" # return \"the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1\"\n",
"\n",
" # top_probs = sorted(probs, key=lambda x: -x[1])[:4]\n",
" # top_probs = [(w,p) for (w,p) in top_probs if p > 0]\n",
" \n",
" # del probs\n",
" # del sum_prob\n",
"\n",
" # if len(top_probs) == 0:\n",
" # return \"the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1\"\n",
" \n",
" # left_prob = 1 - sum(p for (w, p) in top_probs)\n",
" # if left_prob < 0.1:\n",
" # left_prob = 0.1\n",
"\n",
" # output_line = \" \".join([\"{}:{:.8f}\".format(w, p) for w, p in top_probs])\n",
" # output_line += \" :{:.8f}\".format(left_prob)\n",
"\n",
" # # print(f\"{w1} {w2} {w}\" for w in out_line.split(\" \"))\n",
"\n",
" # return output_line\n",
"\n",
"\n",
"def run_predictions(source_folder):\n",
" print(f\"Run predictions on {source_folder} data...\")\n",
" \n",
" with open(f\"{source_folder}/in.tsv\", encoding=\"utf8\", mode=\"rt\") as file:\n",
" train_data = file.readlines()\n",
"\n",
" with open(f\"{source_folder}/out_kenlm.tsv\", \"w\", encoding=\"utf-8\") as output_file:\n",
" for line in tqdm(train_data):\n",
" line = line.split(\"\\t\")\n",
" \n",
" l1 = clean_string(line[-2])\n",
" l2 = clean_string(line[-1])\n",
"\n",
" if not l1 or not l2:\n",
" out_line = \"the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1\"\n",
" else:\n",
" w1 = word_tokenize(l1)[-1:][0]\n",
" w2 = word_tokenize(l2)[0][0] \n",
" out_line = argmax(w1, w2)\n",
" \n",
" output_file.write(out_line + \"\\n\")\n",
" \n",
"\n",
"run_predictions(\"dev-0\")\n",
"# run_predictions(\"test-A\", V_counter, V2, V3, V4)\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"432022\n"
]
}
],
"source": [
"# with open(\"train/in.tsv\", encoding=\"utf8\", mode=\"rt\") as file:\n",
"# train_data = file.readlines()\n",
"# print(len(train_data))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['rin',\n",
" '11K',\n",
" 'ui',\n",
" 'i',\n",
" 'rsognfd',\n",
" 'inlriliinnts',\n",
" 'i',\n",
" '>',\n",
" 'r',\n",
" 'the',\n",
" 'town',\n",
" 'ofy',\n",
" '.-Jinn',\n",
" ',',\n",
" 'in',\n",
" 'the',\n",
" 'county',\n",
" 'of',\n",
" 'Lincoln',\n",
" 'Rrspcrtfully',\n",
" 'rop',\n",
" 'HHont',\n",
" ',',\n",
" 'that',\n",
" 'the',\n",
" 'part',\n",
" 'ol',\n",
" 'said',\n",
" 'town',\n",
" 'whi',\n",
" '<',\n",
" 'h',\n",
" 'they',\n",
" 'inhabits',\n",
" 'remote',\n",
" 'from',\n",
" 'tiie',\n",
" 'viII',\n",
" 'no',\n",
" ',',\n",
" 'and',\n",
" 'tliat',\n",
" 'they',\n",
" 'are',\n",
" 'so',\n",
" 'sit',\n",
" 'jutfd',\n",
" '(',\n",
" 'h',\n",
" 'it',\n",
" 'they',\n",
" 'would',\n",
" 'he',\n",
" 'much',\n",
" 'hotter',\n",
" 'accomodated',\n",
" ',',\n",
" 'f',\n",
" 'their',\n",
" 'lands',\n",
" 'were',\n",
" 'to',\n",
" '1',\n",
" 'c',\n",
" 'm',\n",
" 'oil',\n",
" '*',\n",
" 'from',\n",
" 'raid',\n",
" 'town',\n",
" 'ofMna',\n",
" 'and',\n",
" 'allix',\n",
" '*',\n",
" 'd',\n",
" 'and',\n",
" 'attached',\n",
" 'to',\n",
" 'flic',\n",
" 'town',\n",
" 'of',\n",
" 'Wis',\n",
" 'tassel',\n",
" 'the',\n",
" 'si',\n",
" 'ire',\n",
" 'town',\n",
" 'of',\n",
" 'tlio',\n",
" 'County',\n",
" ',',\n",
" 'and',\n",
" 'wherenost',\n",
" 'of',\n",
" 'their',\n",
" 'hmdmss',\n",
" 'is',\n",
" 'transacted',\n",
" '.',\n",
" 'They',\n",
" 'wouldIn',\n",
" 'r',\n",
" 'lore',\n",
" 'petition',\n",
" 'y',\n",
" '<',\n",
" 'tir',\n",
" 'Hole',\n",
" 'r.ible',\n",
" 'body',\n",
" ',',\n",
" 'that',\n",
" 'thelividing',\n",
" 'line',\n",
" 'of',\n",
" 's.i',\n",
" '.J',\n",
" 'towns',\n",
" '*',\n",
" '»',\n",
" 'f',\n",
" 'Wiscns^ct',\n",
" '«',\n",
" 'mf',\n",
" \"-'Jim\",\n",
" '*',\n",
" ',',\n",
" 'nav',\n",
" 'his',\n",
" 'so',\n",
" 'far',\n",
" 'alt',\n",
" 'rod',\n",
" 'ns',\n",
" 'to',\n",
" 'include',\n",
" 'their',\n",
" 'farms',\n",
" 'inmid',\n",
" 'town',\n",
" 'of',\n",
" 'VViscasset',\n",
" ',',\n",
" 'and',\n",
" 'the',\n",
" '!',\n",
" 'the',\n",
" 'now',\n",
" 'line',\n",
" 'ofLi',\n",
" 'vision',\n",
" 'between',\n",
" 'acid',\n",
" 'towns',\n",
" 'ninv',\n",
" 'ho',\n",
" 'as',\n",
" 'fdlows',\n",
" '*',\n",
" 'vizlh',\n",
" 'ginning',\n",
" 'on',\n",
" 'the',\n",
" 'pi',\n",
" 'scut',\n",
" 'line',\n",
" 'dividing',\n",
" 'the',\n",
" 'towns',\n",
" 'oliVi.a',\n",
" 'assct',\n",
" 'and',\n",
" \"A'in\",\n",
" ',',\n",
" 'at',\n",
" 't',\n",
" \"'\",\n",
" '»',\n",
" '«',\n",
" 'southeast',\n",
" 'corner',\n",
" 'idSeorgc',\n",
" 'Acorns',\n",
" 'laud',\n",
" 'in',\n",
" 'said',\n",
" 'Aina',\n",
" 'and',\n",
" 'riinninu',\n",
" 'from',\n",
" 'Northeasterly',\n",
" 'hv',\n",
" 'the',\n",
" 'head',\n",
" 'of',\n",
" 'said',\n",
" '.^corn',\n",
" '',\n",
" 'sand',\n",
" 'and',\n",
" 'the',\n",
" 'bonds',\n",
" 'of',\n",
" 'all',\n",
" \"the'loisjadjoiiiiiig\",\n",
" 'to',\n",
" 'theVort',\n",
" 'beast',\n",
" 'Corner',\n",
" 'of',\n",
" 'the',\n",
" 'l',\n",
" '«',\n",
" '»',\n",
" 't',\n",
" 'now',\n",
" 'owned',\n",
" 'by',\n",
" 'Ja',\n",
" 'nes',\n",
" '*',\n",
" '*',\n",
" 'oyc',\n",
" 'and',\n",
" 'formerly',\n",
" 'o',\n",
" 'm',\n",
" 'd',\n",
" 'hv',\n",
" 'tin',\n",
" '*',\n",
" 'late',\n",
" 'Hon',\n",
" '.',\n",
" 'Abie',\n",
" ')',\n",
" 'Wood',\n",
" ',',\n",
" 'andbeingp-rt',\n",
" 'oflotNo.12M',\n",
" 'M.',\n",
" 'on',\n",
" 'Me',\n",
" 'vccnics',\n",
" 'piling',\n",
" 'and',\n",
" 'theme',\n",
" '/list',\n",
" 'Northwesterly',\n",
" 'hvlie',\n",
" 'North',\n",
" 'line',\n",
" 'id',\n",
" 'said',\n",
" 'lot',\n",
" 'No',\n",
" '.',\n",
" '12',\n",
" 'to',\n",
" 'the',\n",
" 'southeaster',\n",
" 'y',\n",
" 'he',\n",
" 'id',\n",
" 'of',\n",
" 'land',\n",
" 'owned',\n",
" 'by',\n",
" 'Whitcomb',\n",
" '&',\n",
" 'Groves',\n",
" ',',\n",
" 'hence',\n",
" 'northeasterly',\n",
" 'by',\n",
" 'tiie',\n",
" 'Inal',\n",
" 'of',\n",
" 'said',\n",
" 'lot',\n",
" 'to',\n",
" 'tliolorlhonst',\n",
" 'corner',\n",
" 'thereof',\n",
" ',',\n",
" 'thence',\n",
" 'northwesterly',\n",
" 'to',\n",
" 'Ihe',\n",
" 'line',\n",
" 'of',\n",
" 'the',\n",
" 'town',\n",
" 'of',\n",
" 'Dresden',\n",
" ',',\n",
" 'thence',\n",
" '8',\n",
" '<',\n",
" '>',\n",
" 'uthwrst',\n",
" 'rly',\n",
" 'by',\n",
" 'said',\n",
" 'Dresden',\n",
" 'Inn',\n",
" '*',\n",
" ',',\n",
" 'to',\n",
" 'tbu',\n",
" 'Sunth',\n",
" 'westerlyorner',\n",
" ',',\n",
" 'of',\n",
" 'the',\n",
" 'present',\n",
" 'dividing',\n",
" 'line',\n",
" ',',\n",
" 'I',\n",
" 'etwee',\n",
" 'n',\n",
" 'theown',\n",
" '>',\n",
" '',\n",
" 'of',\n",
" '',\n",
" '.J',\n",
" 'Im',\n",
" 'and',\n",
" \"Wiscii'^et\",\n",
" ',',\n",
" 'and',\n",
" 'thence',\n",
" 'East-',\n",
" 'joutb',\n",
" 'easterly',\n",
" ',',\n",
" 'ly',\n",
" 'said',\n",
" 'town',\n",
" 'lino',\n",
" 'to',\n",
" 'tiie',\n",
" 'bounds',\n",
" 'first',\n",
" 'jMentioned',\n",
" ',',\n",
" 'v',\n",
" 'jili',\n",
" 'all',\n",
" 'the',\n",
" 'lands',\n",
" 'lying',\n",
" 'vvitbiu',\n",
" 'tin',\n",
" '*',\n",
" 'loresaid',\n",
" 'limits',\n",
" 'and',\n",
" 'that',\n",
" 'ib',\n",
" 'inhabitants',\n",
" 'thereonvilli',\n",
" 'their',\n",
" 'goods',\n",
" 'and',\n",
" 'Estate',\n",
" ',',\n",
" 'may',\n",
" 'be',\n",
" 'set',\n",
" 'oil',\n",
" \"'\",\n",
" 'fromaid',\n",
" 'town',\n",
" 'of',\n",
" 'Aina',\n",
" 'to',\n",
" '»',\n",
" '»',\n",
" '»',\n",
" 'id',\n",
" 'town',\n",
" 'of',\n",
" 'Wiscassot.ton',\n",
" 'County',\n",
" 'feel',\n",
" 'an',\n",
" 'interest',\n",
" 'in',\n",
" '.',\n",
" 'tn',\n",
" 'great',\n",
" 'is',\n",
" 'sues',\n",
" 'that',\n",
" 'are',\n",
" 'now',\n",
" 'before',\n",
" 'them',\n",
" ',',\n",
" 'and',\n",
" 'whichare',\n",
" 'the',\n",
" 'bonds',\n",
" 'of',\n",
" 'cohesion',\n",
" 'by',\n",
" 'which',\n",
" 'thegreat',\n",
" 'Republican',\n",
" 'parly',\n",
" 'is',\n",
" 'united',\n",
" '.',\n",
" 'I',\n",
" 'per',\n",
" '--',\n",
" ':',\n",
" 'ceive',\n",
" 'that',\n",
" 'the',\n",
" 'principles',\n",
" 'of',\n",
" 'liberty',\n",
" 'stillanimates',\n",
" 'you',\n",
" 'as',\n",
" 'when',\n",
" 'I',\n",
" 'last',\n",
" 'addressedyou',\n",
" ',',\n",
" 'and',\n",
" 'I',\n",
" 'rejoice',\n",
" '.',\n",
" 'It',\n",
" 'is',\n",
" 'not',\n",
" 'in',\n",
" 'the',\n",
" 'na',\n",
" 'ture',\n",
" 'of',\n",
" 'the',\n",
" 'cause',\n",
" 'of',\n",
" 'human',\n",
" 'freedom',\n",
" 'to',\n",
" 'diedie',\n",
" 'out',\n",
" 'of',\n",
" 'the',\n",
" 'human',\n",
" 'heart',\n",
" '.',\n",
" 'We',\n",
" 'repre',\n",
" 'sent',\n",
" 'the',\n",
" 'righis',\n",
" 'of',\n",
" 'human',\n",
" 'liberty',\n",
" ',',\n",
" 'the',\n",
" 'sameprinciples',\n",
" 'that',\n",
" 'inspired',\n",
" 'Jefferson',\n",
" 'andJackson',\n",
" ',',\n",
" 'and',\n",
" 'we',\n",
" 'now',\n",
" 'stand',\n",
" 'where',\n",
" 'we',\n",
" 'al',\n",
" 'ways',\n",
" 'have',\n",
" 'stood',\n",
" ',',\n",
" 'and',\n",
" 'always',\n",
" 'will',\n",
" 'stand',\n",
" ',',\n",
" 'until',\n",
" 'we',\n",
" 'have',\n",
" 'attained',\n",
" 'our',\n",
" 'ends',\n",
" '.',\n",
" 'Theelation',\n",
" 'before',\n",
" 'us',\n",
" ',',\n",
" 'it',\n",
" 'is',\n",
" 'true',\n",
" \"'\",\n",
" ',',\n",
" 'is',\n",
" 'not',\n",
" 'a',\n",
" ',',\n",
" \"'\",\n",
" 'na',\n",
" 'tional',\n",
" 'election',\n",
" ',',\n",
" 'and',\n",
" 'it',\n",
" 'is',\n",
" 'true',\n",
" 'that',\n",
" 'we',\n",
" 'neednot',\n",
" 'necessarily',\n",
" 'discuss',\n",
" 'National',\n",
" 'issues',\n",
" ',',\n",
" 'but',\n",
" 'it',\n",
" 'is',\n",
" 'also',\n",
" 'true',\n",
" 'that',\n",
" 'the',\n",
" 'Republican',\n",
" 'par',\n",
" 'ty',\n",
" 'is',\n",
" 'National',\n",
" 'in',\n",
" 'its',\n",
" 'and',\n",
" 'design',\n",
" ',',\n",
" 'and',\n",
" 'hence',\n",
" ',',\n",
" 'every',\n",
" 'election',\n",
" ',',\n",
" 'be',\n",
" 'it',\n",
" 'of',\n",
" 'State.or',\n",
" ';',\n",
" 'County',\n",
" ',',\n",
" 'or',\n",
" 'of',\n",
" 'town',\n",
" ',',\n",
" 'or',\n",
" 'of',\n",
" 'city',\n",
" ',',\n",
" 'partakesalike',\n",
" 'of',\n",
" 'a',\n",
" 'National',\n",
" 'nature',\n",
" ',',\n",
" 'and',\n",
" 'their',\n",
" 're',\n",
" 'sults',\n",
" 'enter',\n",
" 'into',\n",
" 'all',\n",
" 'our',\n",
" 'general',\n",
" 'concerns.But',\n",
" 'I',\n",
" 'now',\n",
" 'propose',\n",
" 'to',\n",
" 'speak',\n",
" 'to',\n",
" 'you',\n",
" 'offacts',\n",
" 'which',\n",
" 'more',\n",
" 'immediately',\n",
" 'interestyou',\n",
" '.',\n",
" 'I',\n",
" 'am',\n",
" 'before',\n",
" 'you',\n",
" 'as',\n",
" 'your',\n",
" 'candidatefor',\n",
" 'Governor',\n",
" 'not',\n",
" 'of',\n",
" 'my',\n",
" 'own',\n",
" 'choice',\n",
" ',',\n",
" \"'\",\n",
" 'Imay',\n",
" 'justly',\n",
" 'say',\n",
" '.',\n",
" 'Ody',\n",
" 'ambition',\n",
" 'was',\n",
" 'satis',\n",
" 'fied',\n",
" 'with',\n",
" 'one',\n",
" 'term',\n",
" ',',\n",
" 'and',\n",
" 'I',\n",
" 'had',\n",
" 'hoped',\n",
" 'to',\n",
" 're',\n",
" 'tire',\n",
" 'from',\n",
" 'the',\n",
" 'cares',\n",
" 'of',\n",
" 'office',\n",
" 'to',\n",
" 'devote',\n",
" 'mytime',\n",
" 'to',\n",
" 'interests',\n",
" 'of',\n",
" 'a',\n",
" 'private',\n",
" 'nature',\n",
" '.',\n",
" 'Yetsummoned',\n",
" 'as',\n",
" 'I',\n",
" 'was',\n",
" ',',\n",
" 'by',\n",
" 'the',\n",
" 'unanimouschoice',\n",
" 'of',\n",
" 'your',\n",
" 'representatives',\n",
" 'in',\n",
" 'Conven',\n",
" 'tion',\n",
" ',',\n",
" 'I',\n",
" 'felt',\n",
" 'constrained',\n",
" 'to',\n",
" 'accept',\n",
" 'the',\n",
" 'callof',\n",
" '.',\n",
" 'the',\n",
" 'Republican',\n",
" 'party',\n",
" ',',\n",
" 'and',\n",
" 'I',\n",
" 'am',\n",
" 'hereto',\n",
" 'open',\n",
" 'to',\n",
" 'you',\n",
" 'my',\n",
" 'heart',\n",
" 'and',\n",
" 'my',\n",
" 'mind',\n",
" 'up',\n",
" 'on',\n",
" 'public',\n",
" 'questions',\n",
" 'in',\n",
" 'which',\n",
" 'you',\n",
" 'justlymanifest',\n",
" 'a',\n",
" 'deep',\n",
" 'interest',\n",
" '.']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from nltk.tokenize import word_tokenize\n",
"word_tokenize(text)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'the': 9065021,\n",
" 'of': 5472207,\n",
" 'and': 4299259,\n",
" 'to': 3575612,\n",
" 'a': 2710622,\n",
" 'in': 2686894,\n",
" 'that': 1467928,\n",
" 'is': 1279167,\n",
" 'it': 1167772,\n",
" 'for': 1144284,\n",
" 'be': 992701,\n",
" 'was': 986130,\n",
" 'as': 879790,\n",
" 'at': 863453,\n",
" 'by': 858066,\n",
" 'on': 819505,\n",
" 'i': 816076,\n",
" 'with': 794078,\n",
" 'he': 776888,\n",
" 'or': 674438,\n",
" 'this': 627203,\n",
" 'his': 618101,\n",
" 'not': 604947,\n",
" 'from': 576711,\n",
" 'which': 572596,\n",
" 'are': 528619,\n",
" 'will': 519112,\n",
" 'have': 513257,\n",
" 's': 489456,\n",
" 'tho': 465585,\n",
" 'all': 463084,\n",
" 'but': 460675,\n",
" 'they': 450993,\n",
" 'an': 420170,\n",
" 'one': 413809,\n",
" 'had': 396904,\n",
" 'has': 386379,\n",
" 'their': 377294,\n",
" 'been': 374978,\n",
" 'no': 366339,\n",
" 'said': 353115,\n",
" 'were': 348313,\n",
" 'who': 342015,\n",
" 'we': 319853,\n",
" 'there': 311264,\n",
" 'would': 290263,\n",
" '1': 286386,\n",
" 't': 275743,\n",
" 'so': 272336,\n",
" 'if': 271926,\n",
" 'any': 269024,\n",
" 'when': 268129,\n",
" 'her': 258976,\n",
" 'them': 240990,\n",
" 'him': 237535,\n",
" 'mr': 229137,\n",
" 'its': 224384,\n",
" 'you': 223369,\n",
" 'out': 222458,\n",
" 'our': 213779,\n",
" 'other': 213610,\n",
" 'time': 211490,\n",
" 'more': 207219,\n",
" 'upon': 200290,\n",
" 'than': 199152,\n",
" 'made': 198649,\n",
" 'up': 197991,\n",
" 'day': 194396,\n",
" 'such': 193026,\n",
" 'two': 192820,\n",
" 'may': 192332,\n",
" 'tbe': 190738,\n",
" 'some': 183696,\n",
" 'state': 179728,\n",
" 'j': 178635,\n",
" 'do': 176230,\n",
" 'man': 175854,\n",
" 'now': 174816,\n",
" 'can': 174633,\n",
" 'she': 172474,\n",
" 'm': 166226,\n",
" 'into': 166143,\n",
" 'e': 166003,\n",
" 'w': 164759,\n",
" 'about': 164037,\n",
" 'n': 163632,\n",
" 'new': 162739,\n",
" 'l': 158739,\n",
" 'my': 158632,\n",
" 'only': 155874,\n",
" 'men': 155281,\n",
" 'city': 149928,\n",
" 'ing': 149573,\n",
" 'then': 149545,\n",
" 'shall': 148173,\n",
" 'these': 145383,\n",
" 'after': 144729,\n",
" 'should': 142414,\n",
" 'o': 140683,\n",
" 'over': 140671,\n",
" 'great': 139053,\n",
" 'county': 135720,\n",
" 'good': 135681,\n",
" 'very': 135509,\n",
" 'what': 135139,\n",
" 'every': 134754,\n",
" 'r': 134054,\n",
" 'years': 133524,\n",
" 'd': 133321,\n",
" 'c': 132482,\n",
" 'being': 130985,\n",
" 'people': 130583,\n",
" 'first': 127281,\n",
" '000': 127084,\n",
" 're': 125442,\n",
" 'many': 124439,\n",
" 'most': 123285,\n",
" 'could': 123230,\n",
" 'under': 122289,\n",
" 'h': 121514,\n",
" 'before': 118539,\n",
" 'well': 118108,\n",
" 'per': 114940,\n",
" 'last': 114552,\n",
" 'work': 113010,\n",
" 'same': 112079,\n",
" 'where': 111579,\n",
" 'me': 111346,\n",
" 'f': 110556,\n",
" 'mrs': 108039,\n",
" 'those': 107671,\n",
" 'ot': 107631,\n",
" 'feet': 106860,\n",
" 'much': 106570,\n",
" 'year': 104062,\n",
" 'make': 103103,\n",
" 'states': 101683,\n",
" 'three': 99943,\n",
" 'while': 97401,\n",
" 'house': 97187,\n",
" 'also': 95849,\n",
" 'old': 95558,\n",
" 'through': 94245,\n",
" 'each': 93521,\n",
" 'way': 93193,\n",
" 'country': 92494,\n",
" 'tion': 92215,\n",
" 'us': 92158,\n",
" 'little': 92011,\n",
" 'court': 90894,\n",
" 'place': 90642,\n",
" 'down': 90465,\n",
" '2': 90005,\n",
" 'b': 89797,\n",
" 'must': 89316,\n",
" 'did': 88750,\n",
" 'land': 88682,\n",
" 'north': 87040,\n",
" 'con': 85792,\n",
" 'part': 85665,\n",
" 'south': 85226,\n",
" 'your': 85192,\n",
" 'street': 84360,\n",
" 'aud': 83993,\n",
" 'public': 81839,\n",
" 'law': 81740,\n",
" 'long': 81409,\n",
" 'without': 81332,\n",
" 'here': 80105,\n",
" 'against': 79394,\n",
" 'de': 78915,\n",
" 'th': 77471,\n",
" 'u': 76398,\n",
" 'ed': 76228,\n",
" 'until': 75857,\n",
" 'p': 75604,\n",
" 'take': 75389,\n",
" 'large': 75219,\n",
" 'united': 75181,\n",
" 'line': 74996,\n",
" 'right': 74664,\n",
" 'few': 74474,\n",
" 'general': 74442,\n",
" 'ol': 74202,\n",
" 'life': 73885,\n",
" 'west': 73557,\n",
" 'like': 73209,\n",
" 'own': 72963,\n",
" 'bo': 72946,\n",
" 'found': 72887,\n",
" 'never': 72376,\n",
" '4': 72237,\n",
" 'company': 71150,\n",
" 'present': 70655,\n",
" '3': 70322,\n",
" 'go': 70233,\n",
" 'water': 70171,\n",
" 'money': 69656,\n",
" 'just': 69335,\n",
" 'party': 68859,\n",
" 'government': 68460,\n",
" 'home': 68371,\n",
" 'ho': 67622,\n",
" 'even': 66865,\n",
" 'days': 66663,\n",
" 'lie': 65871,\n",
" 'business': 64810,\n",
" 'ever': 64807,\n",
" 'get': 64435,\n",
" 'interest': 64157,\n",
" '10': 63963,\n",
" 'how': 63854,\n",
" 'war': 63838,\n",
" 'taken': 63488,\n",
" 'during': 62969,\n",
" 'given': 62934,\n",
" 'see': 62869,\n",
" 'four': 62746,\n",
" 'come': 62435,\n",
" 'case': 61818,\n",
" 'having': 61386,\n",
" 'came': 60657,\n",
" 'know': 60620,\n",
" 'side': 60173,\n",
" 'com': 60088,\n",
" 'between': 60033,\n",
" 'order': 60029,\n",
" 'back': 59161,\n",
" 'give': 58993,\n",
" 'st': 58879,\n",
" 'iu': 58846,\n",
" 'john': 58509,\n",
" 'say': 58438,\n",
" 'best': 58191,\n",
" 'put': 58187,\n",
" 'too': 58037,\n",
" 'half': 57773,\n",
" 'office': 57699,\n",
" 'thence': 57646,\n",
" 'lot': 57528,\n",
" 'fact': 57223,\n",
" 'known': 57118,\n",
" 'both': 56984,\n",
" 'power': 56978,\n",
" 'number': 56772,\n",
" 'night': 56261,\n",
" 'la': 56044,\n",
" 'world': 55992,\n",
" 'president': 55991,\n",
" 'another': 55779,\n",
" 'district': 55515,\n",
" 'v': 55512,\n",
" 'next': 55126,\n",
" 'less': 55053,\n",
" 'ii': 54831,\n",
" 'went': 54645,\n",
" 'york': 54529,\n",
" 'far': 54511,\n",
" 'within': 53995,\n",
" 'ex': 53978,\n",
" 'left': 53894,\n",
" 'young': 53382,\n",
" 'town': 53122,\n",
" 'off': 53096,\n",
" '5': 52989,\n",
" 'hundred': 52853,\n",
" '8': 52792,\n",
" 'east': 52776,\n",
" 'five': 52647,\n",
" 'point': 52614,\n",
" 'use': 52450,\n",
" '*': 51877,\n",
" 'pay': 51822,\n",
" 'among': 51741,\n",
" 'yet': 51263,\n",
" 'several': 51056,\n",
" 'done': 50859,\n",
" 'bill': 50841,\n",
" 'white': 50826,\n",
" 'nnd': 50740,\n",
" 'held': 50550,\n",
" 'property': 50547,\n",
" 'road': 50330,\n",
" 'might': 50244,\n",
" 'board': 49911,\n",
" 'again': 49873,\n",
" 'high': 49557,\n",
" 'whole': 49391,\n",
" 'miss': 48883,\n",
" 'g': 48808,\n",
" 'act': 48591,\n",
" 'still': 48504,\n",
" 'hand': 48430,\n",
" 'end': 48330,\n",
" 'matter': 48328,\n",
" 'away': 48199,\n",
" 'sale': 48080,\n",
" 'ment': 47671,\n",
" 'ten': 47613,\n",
" 'because': 47468,\n",
" 'school': 47413,\n",
" 'twenty': 47404,\n",
" 'above': 47384,\n",
" 'called': 46828,\n",
" 'american': 46822,\n",
" 'y': 46356,\n",
" 'cent': 46222,\n",
" 'amount': 46115,\n",
" 'course': 45302,\n",
" 'ago': 45238,\n",
" 'small': 45187,\n",
" 'week': 45112,\n",
" 'six': 45092,\n",
" 'used': 44799,\n",
" 'section': 44395,\n",
" 'since': 44346,\n",
" 'dr': 44303,\n",
" 'once': 44211,\n",
" 'took': 44000,\n",
" '11': 43914,\n",
" 'ami': 43913,\n",
" '7': 43733,\n",
" 'himself': 43626,\n",
" 'nothing': 43490,\n",
" 'paid': 43343,\n",
" 'better': 43336,\n",
" 'am': 43321,\n",
" 'let': 43230,\n",
" 'bad': 43152,\n",
" 'soon': 43000,\n",
" 'clock': 42944,\n",
" 'however': 42464,\n",
" 'head': 42236,\n",
" 'k': 42178,\n",
" 'en': 42174,\n",
" 'does': 42024,\n",
" 'certain': 41908,\n",
" 'along': 41676,\n",
" 'pro': 41173,\n",
" 'body': 40913,\n",
" 'near': 40745,\n",
" 'committee': 40642,\n",
" 'thing': 40575,\n",
" 'question': 40132,\n",
" 'cause': 40071,\n",
" 'full': 40009,\n",
" 'others': 39921,\n",
" 'set': 39912,\n",
" 'brought': 39789,\n",
" 'al': 39459,\n",
" 'think': 39390,\n",
" 'making': 39357,\n",
" 'miles': 39337,\n",
" 'thought': 39327,\n",
" 'second': 39271,\n",
" 'morning': 39184,\n",
" 'though': 39178,\n",
" 'times': 39105,\n",
" 'girl': 38804,\n",
" 'boy': 38784,\n",
" '6': 38763,\n",
" 'co': 38623,\n",
" 'room': 38449,\n",
" 'following': 38325,\n",
" 'name': 38301,\n",
" 'wife': 38295,\n",
" 'church': 38274,\n",
" 'dollars': 38002,\n",
" 'always': 37648,\n",
" 'enough': 37486,\n",
" 'thus': 37477,\n",
" 'un': 37410,\n",
" 'almost': 37402,\n",
" 'cannot': 37223,\n",
" 'able': 37192,\n",
" 'river': 36841,\n",
" 'find': 36795,\n",
" '00': 36793,\n",
" 'ground': 36537,\n",
" 'due': 36444,\n",
" 'children': 36286,\n",
" 'got': 36227,\n",
" 'free': 36206,\n",
" 'light': 36137,\n",
" 'action': 36062,\n",
" 'ia': 36049,\n",
" 'washington': 35891,\n",
" 'friends': 35600,\n",
" 'says': 35599,\n",
" 'stock': 35587,\n",
" 'lo': 35573,\n",
" 'whom': 35563,\n",
" 'whose': 35346,\n",
" 'service': 35273,\n",
" 'received': 35272,\n",
" 'means': 34777,\n",
" 'person': 34759,\n",
" 'necessary': 34700,\n",
" 'nor': 34676,\n",
" 'told': 34675,\n",
" 'death': 34557,\n",
" 'sent': 34369,\n",
" 'further': 34226,\n",
" 'purpose': 34128,\n",
" 'er': 34115,\n",
" 'things': 34079,\n",
" 'tha': 33661,\n",
" 'congress': 33650,\n",
" 'bis': 33499,\n",
" 'passed': 33493,\n",
" 'seen': 33484,\n",
" 'national': 33330,\n",
" 'building': 33234,\n",
" 'keep': 33214,\n",
" 'front': 33196,\n",
" 'block': 33088,\n",
" 'real': 33028,\n",
" 'aa': 32803,\n",
" 'going': 32767,\n",
" 'past': 32699,\n",
" 'whether': 32622,\n",
" 'months': 32443,\n",
" 'dis': 32419,\n",
" 'ly': 32398,\n",
" 'true': 32297,\n",
" 'sum': 32271,\n",
" 'woman': 32180,\n",
" 'subject': 32114,\n",
" '50': 32109,\n",
" 'either': 32013,\n",
" 'railroad': 31994,\n",
" 'son': 31985,\n",
" 'members': 31976,\n",
" 'union': 31922,\n",
" 'system': 31839,\n",
" '0': 31799,\n",
" 'gold': 31698,\n",
" 'around': 31668,\n",
" 'persons': 31587,\n",
" '20': 31585,\n",
" 'sold': 31542,\n",
" 'duty': 31529,\n",
" 'market': 31376,\n",
" 'least': 31270,\n",
" 'show': 31147,\n",
" 'form': 30989,\n",
" 'hands': 30983,\n",
" '12': 30964,\n",
" 'saw': 30856,\n",
" 'tlie': 30853,\n",
" 'family': 30818,\n",
" 'cost': 30746,\n",
" 'report': 30665,\n",
" 'why': 30549,\n",
" 'nearly': 30520,\n",
" 'election': 30453,\n",
" 'short': 30337,\n",
" 'price': 30306,\n",
" 'become': 30266,\n",
" 'notice': 30132,\n",
" 'look': 30122,\n",
" 'condition': 30013,\n",
" '30': 29989,\n",
" 'open': 29981,\n",
" 'meeting': 29913,\n",
" 'kind': 29855,\n",
" 'lots': 29836,\n",
" 'corner': 29771,\n",
" 'women': 29510,\n",
" 'together': 29506,\n",
" 'possible': 29491,\n",
" 'ihe': 29406,\n",
" 'gave': 29384,\n",
" '100': 29274,\n",
" 'themselves': 29250,\n",
" 'reason': 29105,\n",
" 'labor': 29043,\n",
" 'ter': 29006,\n",
" 'judge': 28965,\n",
" 'vote': 28927,\n",
" 'result': 28914,\n",
" 'third': 28722,\n",
" 'run': 28717,\n",
" 'fair': 28653,\n",
" 'tin': 28635,\n",
" 'value': 28498,\n",
" 'mortgage': 28465,\n",
" 'eight': 28464,\n",
" 'ad': 28331,\n",
" 'position': 28286,\n",
" 'evening': 28178,\n",
" 'wo': 28163,\n",
" 'thereof': 28056,\n",
" '9': 28023,\n",
" 'tor': 27921,\n",
" 'lor': 27839,\n",
" 'provided': 27801,\n",
" 'bank': 27781,\n",
" 'cut': 27746,\n",
" 'im': 27696,\n",
" 'described': 27667,\n",
" 'believe': 27648,\n",
" 'hour': 27634,\n",
" 'paper': 27584,\n",
" 'hold': 27567,\n",
" 'live': 27543,\n",
" '15': 27446,\n",
" 'acres': 27399,\n",
" 'god': 27370,\n",
" 'early': 27298,\n",
" '25': 27277,\n",
" 'quarter': 27190,\n",
" 'thirty': 27132,\n",
" 'want': 27115,\n",
" 'therefore': 27093,\n",
" 'late': 27091,\n",
" 'call': 26989,\n",
" 'charge': 26959,\n",
" 'heard': 26900,\n",
" 'army': 26885,\n",
" 'effect': 26707,\n",
" 'waa': 26689,\n",
" 'laws': 26659,\n",
" 'face': 26638,\n",
" 'oi': 26628,\n",
" 'cents': 26425,\n",
" 'stand': 26384,\n",
" 'age': 26226,\n",
" 'kept': 26167,\n",
" 'fire': 26109,\n",
" 'tne': 26103,\n",
" 'date': 25797,\n",
" 'placed': 25621,\n",
" 'common': 25606,\n",
" 'mind': 25554,\n",
" 'william': 25538,\n",
" 'march': 25514,\n",
" 'door': 25482,\n",
" 'heart': 25459,\n",
" 'republican': 25359,\n",
" 'aid': 25218,\n",
" 'special': 25161,\n",
" 'force': 25130,\n",
" 'ap': 25055,\n",
" 'beginning': 25026,\n",
" 'thousand': 25006,\n",
" 'secretary': 25005,\n",
" 'strong': 24999,\n",
" 'ac': 24935,\n",
" 'claim': 24931,\n",
" 'farm': 24896,\n",
" 'officers': 24822,\n",
" 'father': 24813,\n",
" 'estate': 24803,\n",
" 'political': 24619,\n",
" 'tax': 24583,\n",
" 'except': 24565,\n",
" 'manner': 24525,\n",
" 'cases': 24524,\n",
" 'lands': 24481,\n",
" 'department': 24456,\n",
" 'ar': 24372,\n",
" 'hard': 24357,\n",
" 'already': 24286,\n",
" 'proper': 24281,\n",
" 'hi': 24258,\n",
" 'required': 24237,\n",
" 'low': 24225,\n",
" 'air': 24213,\n",
" 'trust': 24206,\n",
" 'asked': 24203,\n",
" 'james': 24201,\n",
" 'blood': 24189,\n",
" 'book': 24186,\n",
" 'meet': 24156,\n",
" 'poor': 24116,\n",
" 'fall': 24072,\n",
" 'george': 24063,\n",
" 'trade': 24019,\n",
" 'big': 23973,\n",
" 'quite': 23900,\n",
" 'car': 23566,\n",
" 'ready': 23514,\n",
" 'often': 23510,\n",
" 'close': 23374,\n",
" 'field': 23359,\n",
" 'bonds': 23337,\n",
" 'read': 23320,\n",
" 'attention': 23309,\n",
" 'view': 23229,\n",
" 'class': 23192,\n",
" 'red': 23181,\n",
" 'hut': 23150,\n",
" 'care': 23107,\n",
" 'mother': 23095,\n",
" 'black': 23081,\n",
" 'tell': 23073,\n",
" 'deed': 23072,\n",
" 'return': 23011,\n",
" 'gen': 23006,\n",
" 'tions': 22984,\n",
" 'lost': 22978,\n",
" 'something': 22884,\n",
" 'favor': 22766,\n",
" 'nt': 22733,\n",
" 'rate': 22629,\n",
" 'health': 22618,\n",
" 'weeks': 22573,\n",
" 'fine': 22567,\n",
" 'oil': 22549,\n",
" 'taking': 22481,\n",
" 'hereby': 22425,\n",
" 'follows': 22375,\n",
" 'hours': 22368,\n",
" 'hope': 22366,\n",
" 'july': 22337,\n",
" 'letter': 22320,\n",
" 'seven': 22309,\n",
" 'turned': 22295,\n",
" 'pre': 22155,\n",
" 'change': 22088,\n",
" 'yesterday': 22085,\n",
" 'demand': 22065,\n",
" 'don': 22026,\n",
" 'corn': 22006,\n",
" 'governor': 21960,\n",
" 'democratic': 21956,\n",
" 'senate': 21946,\n",
" 'need': 21937,\n",
" 'coming': 21932,\n",
" 'prices': 21903,\n",
" 'try': 21890,\n",
" 'knew': 21885,\n",
" 'eyes': 21879,\n",
" 'virginia': 21859,\n",
" 'carried': 21841,\n",
" 'minutes': 21785,\n",
" 'train': 21780,\n",
" 'opinion': 21763,\n",
" 'itself': 21699,\n",
" 'doubt': 21693,\n",
" 'leave': 21663,\n",
" 'grand': 21626,\n",
" 'account': 21583,\n",
" 'month': 21563,\n",
" 'nature': 21520,\n",
" 'citizens': 21483,\n",
" 'sell': 21470,\n",
" 'food': 21406,\n",
" 'rather': 21324,\n",
" 'western': 21296,\n",
" 'nation': 21288,\n",
" 'character': 21283,\n",
" 'bring': 21268,\n",
" 'although': 21250,\n",
" 'ns': 21219,\n",
" 'seems': 21196,\n",
" 'probably': 21095,\n",
" 'southern': 21015,\n",
" 'dead': 20966,\n",
" 'worth': 20918,\n",
" 'anything': 20847,\n",
" 'began': 20846,\n",
" 'li': 20795,\n",
" 'child': 20729,\n",
" 'silver': 20723,\n",
" 'according': 20646,\n",
" 'fifty': 20565,\n",
" 'hall': 20553,\n",
" 'important': 20539,\n",
" 'charles': 20488,\n",
" 'smith': 20474,\n",
" 'chief': 20472,\n",
" 'doing': 20464,\n",
" 'love': 20462,\n",
" 'turn': 20440,\n",
" 'june': 20423,\n",
" 'ti': 20420,\n",
" 'senator': 20412,\n",
" 'feel': 20405,\n",
" 'wheat': 20394,\n",
" 'latter': 20375,\n",
" 'entire': 20375,\n",
" 'iron': 20371,\n",
" 'heavy': 20328,\n",
" 'story': 20296,\n",
" 'different': 20231,\n",
" 'record': 20197,\n",
" 'il': 20195,\n",
" 'met': 20185,\n",
" 'ou': 20136,\n",
" 'terms': 20037,\n",
" 'ton': 19905,\n",
" 'spring': 19903,\n",
" 'became': 19897,\n",
" '13': 19826,\n",
" 'peace': 19819,\n",
" 'seemed': 19796,\n",
" 'ship': 19773,\n",
" 'fully': 19727,\n",
" 'ill': 19709,\n",
" 'various': 19682,\n",
" 'post': 19655,\n",
" 'horse': 19616,\n",
" 'named': 19559,\n",
" 'running': 19554,\n",
" 'gone': 19525,\n",
" 'avenue': 19521,\n",
" 'range': 19503,\n",
" 'mo': 19496,\n",
" 'reached': 19480,\n",
" 'ha': 19462,\n",
" 'plan': 19424,\n",
" 'season': 19391,\n",
" 'clerk': 19340,\n",
" 'appear': 19315,\n",
" 'inches': 19302,\n",
" 'convention': 19295,\n",
" 'living': 19292,\n",
" 'portion': 19268,\n",
" 'help': 19267,\n",
" 'member': 19224,\n",
" 'perhaps': 19219,\n",
" 'chicago': 19191,\n",
" 'aad': 19174,\n",
" 'later': 19171,\n",
" 'places': 19117,\n",
" 'rest': 19087,\n",
" 'main': 19080,\n",
" 'rights': 19076,\n",
" '40': 19053,\n",
" 'conditions': 19053,\n",
" 'april': 19024,\n",
" 'future': 19023,\n",
" 'greater': 19019,\n",
" 'constitution': 18998,\n",
" 'foot': 18993,\n",
" 'words': 18974,\n",
" 'success': 18973,\n",
" 'justice': 18935,\n",
" 'hill': 18855,\n",
" 'und': 18825,\n",
" 'streets': 18812,\n",
" 'sec': 18737,\n",
" 'crop': 18729,\n",
" 'forty': 18729,\n",
" 'today': 18701,\n",
" 'loss': 18680,\n",
" '14': 18646,\n",
" 'friend': 18607,\n",
" 'word': 18596,\n",
" 'alone': 18554,\n",
" 'local': 18551,\n",
" 'sea': 18522,\n",
" 'lu': 18511,\n",
" 'payment': 18495,\n",
" 'laid': 18465,\n",
" 'generally': 18461,\n",
" 'winter': 18458,\n",
" 'col': 18410,\n",
" 'majority': 18392,\n",
" 'support': 18372,\n",
" 'history': 18307,\n",
" 'till': 18239,\n",
" 'regard': 18214,\n",
" 'earth': 18210,\n",
" 'england': 18198,\n",
" 'nine': 18193,\n",
" 'aro': 18154,\n",
" 'cash': 18104,\n",
" 'cotton': 18065,\n",
" 'ohio': 18020,\n",
" 'foreign': 17966,\n",
" 'interests': 17953,\n",
" 'king': 17938,\n",
" 'judgment': 17937,\n",
" 'makes': 17916,\n",
" 'stated': 17901,\n",
" 'toward': 17897,\n",
" 'lower': 17875,\n",
" 'wit': 17873,\n",
" 'equal': 17867,\n",
" 'mary': 17853,\n",
" 'wood': 17838,\n",
" 'capital': 17817,\n",
" 'parties': 17781,\n",
" 'felt': 17778,\n",
" 'looked': 17754,\n",
" 'died': 17744,\n",
" 'pass': 17737,\n",
" '18': 17726,\n",
" 'arc': 17712,\n",
" 'moment': 17704,\n",
" 'afternoon': 17700,\n",
" 'ty': 17691,\n",
" 'period': 17680,\n",
" 'lines': 17648,\n",
" 'returned': 17621,\n",
" 'unless': 17614,\n",
" 'increase': 17599,\n",
" 'idea': 17586,\n",
" 'private': 17576,\n",
" '16': 17569,\n",
" 'lake': 17550,\n",
" 'ber': 17529,\n",
" 'giving': 17519,\n",
" 'cold': 17504,\n",
" 'personal': 17444,\n",
" 'lay': 17421,\n",
" 'farmers': 17403,\n",
" 'degrees': 17385,\n",
" 'policy': 17359,\n",
" 'ma': 17356,\n",
" 'ft': 17331,\n",
" 'henry': 17327,\n",
" 'cor': 17305,\n",
" 'territory': 17287,\n",
" 'disease': 17279,\n",
" 'comes': 17200,\n",
" 'supply': 17199,\n",
" 'es': 17189,\n",
" 'spirit': 17174,\n",
" 'boys': 17143,\n",
" 'brown': 17138,\n",
" 'followed': 17117,\n",
" 'ought': 17101,\n",
" 'secured': 17039,\n",
" 'township': 17004,\n",
" 'secure': 16912,\n",
" 'carry': 16885,\n",
" 'society': 16870,\n",
" 'shown': 16865,\n",
" 'fore': 16850,\n",
" 'au': 16850,\n",
" 'sure': 16847,\n",
" 'human': 16805,\n",
" 'monday': 16785,\n",
" 'especially': 16784,\n",
" 'entirely': 16724,\n",
" 'tbo': 16675,\n",
" 'rich': 16674,\n",
" 'clear': 16614,\n",
" 'farmer': 16596,\n",
" 'soil': 16560,\n",
" 'trouble': 16534,\n",
" 'elected': 16524,\n",
" 'coal': 16521,\n",
" 'ward': 16506,\n",
" 'stone': 16477,\n",
" 'self': 16457,\n",
" 'america': 16439,\n",
" 'taxes': 16396,\n",
" 'll': 16384,\n",
" 'tried': 16360,\n",
" 'ana': 16325,\n",
" 'former': 16324,\n",
" 'term': 16310,\n",
" 'honor': 16306,\n",
" 'ordered': 16303,\n",
" 'sunday': 16291,\n",
" 'premises': 16249,\n",
" 'started': 16245,\n",
" 'bed': 16221,\n",
" 'goods': 16187,\n",
" 'instead': 16184,\n",
" 'thomas': 16159,\n",
" 'trial': 16141,\n",
" 'across': 16122,\n",
" 'beautiful': 16119,\n",
" 'pa': 16110,\n",
" 'strength': 16083,\n",
" 'allowed': 16073,\n",
" 'deal': 16044,\n",
" 'port': 15990,\n",
" 'lady': 15937,\n",
" 'highest': 15934,\n",
" 'parts': 15933,\n",
" 'pounds': 15929,\n",
" 'island': 15921,\n",
" 'top': 15883,\n",
" 'deep': 15883,\n",
" 'session': 15874,\n",
" 'recorded': 15839,\n",
" 'control': 15819,\n",
" 'served': 15812,\n",
" 'entered': 15787,\n",
" 'military': 15785,\n",
" 'tl': 15751,\n",
" 'none': 15751,\n",
" 'stood': 15751,\n",
" 'french': 15748,\n",
" 'answer': 15742,\n",
" 'seem': 15725,\n",
" 'saturday': 15654,\n",
" 'legislature': 15644,\n",
" 'sun': 15644,\n",
" 'sufficient': 15627,\n",
" '17': 15585,\n",
" 'houses': 15573,\n",
" 'rev': 15552,\n",
" 'article': 15541,\n",
" 'evidence': 15538,\n",
" 'expected': 15532,\n",
" 'statement': 15512,\n",
" '500': 15496,\n",
" 'object': 15493,\n",
" 'thc': 15493,\n",
" 'built': 15483,\n",
" 'win': 15459,\n",
" 'suit': 15456,\n",
" 'reported': 15446,\n",
" 'attorney': 15443,\n",
" 'club': 15436,\n",
" 'fur': 15432,\n",
" 'note': 15422,\n",
" 'officer': 15418,\n",
" 'total': 15411,\n",
" 'distance': 15389,\n",
" 'ono': 15385,\n",
" 'january': 15384,\n",
" 'cure': 15376,\n",
" 'council': 15371,\n",
" 'issue': 15364,\n",
" 'se': 15350,\n",
" 'immediately': 15310,\n",
" 'race': 15306,\n",
" 'san': 15278,\n",
" 'green': 15273,\n",
" 'wa': 15230,\n",
" 'looking': 15218,\n",
" 'debt': 15201,\n",
" 'firm': 15194,\n",
" 'ers': 15175,\n",
" 'louis': 15158,\n",
" 'roads': 15145,\n",
" 'ne': 15143,\n",
" 'hat': 15138,\n",
" 'twelve': 15108,\n",
" 'forth': 15093,\n",
" 'claims': 15090,\n",
" 'higher': 15077,\n",
" 'offered': 15065,\n",
" 'id': 15058,\n",
" 'august': 15049,\n",
" 'finally': 15046,\n",
" 'receive': 15035,\n",
" 'captain': 15012,\n",
" 'fell': 15011,\n",
" 'commission': 14989,\n",
" 'havo': 14976,\n",
" 'bear': 14965,\n",
" 'bv': 14962,\n",
" 'dakota': 14960,\n",
" 'ness': 14948,\n",
" 'issued': 14938,\n",
" 'husband': 14926,\n",
" 'proposed': 14925,\n",
" 'points': 14912,\n",
" 'principal': 14901,\n",
" 'killed': 14901,\n",
" 'won': 14890,\n",
" 'wide': 14874,\n",
" 'le': 14849,\n",
" 'tie': 14828,\n",
" 'getting': 14805,\n",
" 'store': 14797,\n",
" 'etc': 14782,\n",
" 'single': 14779,\n",
" 'schools': 14751,\n",
" 'news': 14736,\n",
" 'natural': 14726,\n",
" 'direction': 14706,\n",
" 'opened': 14684,\n",
" 'police': 14681,\n",
" 'dry': 14666,\n",
" 'whatever': 14661,\n",
" 'game': 14652,\n",
" 'below': 14648,\n",
" 'trees': 14631,\n",
" 'quiet': 14630,\n",
" 'follow': 14622,\n",
" 'hear': 14621,\n",
" 'desire': 14621,\n",
" 'mining': 14592,\n",
" 'summer': 14561,\n",
" 'ai': 14560,\n",
" 'ir': 14555,\n",
" 'addition': 14547,\n",
" 'page': 14484,\n",
" 'fourth': 14476,\n",
" 'beyond': 14424,\n",
" 'press': 14377,\n",
" 'average': 14376,\n",
" 'dated': 14368,\n",
" 'led': 14362,\n",
" 'regular': 14336,\n",
" 'tba': 14332,\n",
" 'length': 14328,\n",
" 'continued': 14283,\n",
" 'northern': 14280,\n",
" ...}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pickle\n",
"with open('V.pickle', 'rb') as handle:\n",
" V_counter = pickle.load(handle)\n",
"V_counter"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"10000"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(V_counter)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "python11",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}