challenging-america-word-ga.../kenlm.ipynb
2023-04-26 08:07:17 +02:00

50 KiB
Raw Blame History

# xzcat -f1 train/in.tsv.xz | cut -f7,8 | sed 's/-\\\\n/ /g' | sed 's/\\\\n//g' | sed 's/\\\\//g' | ../kenlm/build/bin/lmplz -o 5 > kenlm_model.arpa
# ../kenlm/build/bin/build_binary kenlm_model.arpa kenlm_model.binary    
import regex as re

# save train text to file

def clean_string(text):
    """Lower-case *text*, strip encoded line breaks and trim the ends.

    Two substitutions run on the lower-cased text, in this order:

    1. every match of a space, a hyphen, a run of backslashes and the
       letter ``n`` is deleted (this joins a word that was hyphenated
       across a line break);
    2. every remaining backslash-backslash-``n`` marker becomes one space.

    NOTE(review): confirm that the number of backslashes in both patterns
    matches how line breaks are actually encoded in the input data.

    Returns the resulting string with leading/trailing whitespace removed.
    """
    lowered = text.lower()
    dehyphenated = re.sub(r" -\\\\*\\\\n", "", lowered)
    return re.sub(r"\\\\n", " ", dehyphenated).strip()

# Build the language-model training corpus: for every training row the
# expected gap word is spliced between the left and right context columns
# (the last two tab-separated fields of in.tsv).
print("Reading train data...")
train_records = []
with open("train/in.tsv", encoding="utf8", mode="rt") as file, open("train/expected.tsv", encoding="utf8", mode="rt") as expected:
    for t_line, e_line in zip(file, expected):
        t_line = t_line.split("\t")
        train_records.append(
            f"{clean_string(t_line[-2])} {clean_string(e_line)} {clean_string(t_line[-1])}"
        )

# Join once instead of growing a string with `+=` inside the loop, which is
# quadratic in the worst case.  Rows are separated by a newline so that the
# last word of one row is not fused with the first word of the next one; this
# mirrors the one-row-per-line shell pipeline shown at the top of the file.
train_text = "\n".join(train_records)

# save train_text to file
print("saving to file...")
with open("train_text.txt", encoding="utf8", mode="w") as file:
    file.write(train_text)
Reading train data...
import kenlm

# Smoke test: load a binary KenLM model from disk and print the score it
# assigns to a short phrase.
# NOTE(review): the build command in the header comment produces
# kenlm_model.binary, while this cell loads test_model.binary — confirm which
# file is intended.
path = 'test_model.binary'
model = kenlm.Model(path)

sentence = "of the way"
# NOTE(review): presumably a log10 probability (the printed value is
# negative) — confirm against the KenLM Python documentation.
print(model.score(sentence))
-7.822547912597656
from tqdm import tqdm
import regex as re
from nltk.tokenize import word_tokenize
from english_words import get_english_words_set



def clean_string(text):
    """Normalise a raw text field for tokenisation.

    The text is lower-cased, then two regex substitutions are applied in
    order: the hyphen-plus-encoded-line-break pattern is removed outright,
    and any remaining encoded line break is replaced by a single space.
    Finally surrounding whitespace is stripped.

    NOTE(review): confirm that the number of backslashes in both patterns
    matches how line breaks are actually encoded in the input data.
    """
    result = text.lower()
    for pattern, replacement in (
        (r" -\\\\*\\\\n", ""),
        (r"\\\\n", " "),
    ):
        result = re.sub(pattern, replacement, result)
    return result.strip()


# Candidate vocabulary, loaded lazily on first use (see _candidate_words).
_CANDIDATE_WORDS = None


def _candidate_words():
    """Return the lower-cased 'web2' English word set, loading it only once."""
    global _CANDIDATE_WORDS
    if _CANDIDATE_WORDS is None:
        _CANDIDATE_WORDS = get_english_words_set(['web2'], lower=True)
    return _CANDIDATE_WORDS


def get_word_predictions(w1, w2,):
    """Yield ``(word, score)`` for every candidate word placed between w1 and w2.

    Each candidate from the 'web2' English word list is inserted between the
    left-context word ``w1`` and the right-context word ``w2``; the resulting
    three-word string is scored with the module-level KenLM ``model`` using
    ``bos=False, eos=False``.

    The word list is fetched once and reused: this function is called once
    per input row, and rebuilding the word set on every call is pure
    overhead.
    """
    for word in _candidate_words():
        sentence = w1 + ' ' + word + ' ' + w2
        text_score = model.score(sentence, bos=False, eos=False)
        yield((word, text_score))

def argmax(w1,w2):
    """Return the ten best-scoring gap candidates as one output line.

    Candidates come from ``get_word_predictions(w1, w2)``.  The result is a
    space-separated string of ``word:score`` items, highest score first,
    with each score formatted to eight decimal places.

    NOTE(review): the values written here are the raw model scores, whereas
    the fallback line used in run_predictions lists probabilities plus a
    remainder entry — confirm whether the scores should be converted and
    normalised before being written.
    """
    # Local import keeps this block self-contained.
    import heapq

    # nlargest keeps only ten items while streaming over the generator,
    # instead of materialising and sorting every candidate; ordering
    # (including ties) matches sorted(..., reverse=True)[:10].
    top_10 = heapq.nlargest(10, get_word_predictions(w1, w2), key=lambda x: x[1])
    output_line = " ".join(["{}:{:.8f}".format(w, p) for w, p in top_10])
    return output_line

        # print(f"{sentence}: {text_score}")

    # probs = list(argmax(w1, w2, w4, w5, v, v2, v3))
    # sum_prob = sum(p for (w, p) in probs)

    # try:
    #     probs = [(w, p / sum_prob) for w, p in probs]
    # except ZeroDivisionError:
    #     return "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"

    # top_probs = sorted(probs, key=lambda x: -x[1])[:4]
    # top_probs = [(w,p) for (w,p) in top_probs if p > 0]
    
    # del probs
    # del sum_prob

    # if len(top_probs) == 0:
    #     return "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
    
    # left_prob = 1 - sum(p for (w, p) in top_probs)
    # if left_prob < 0.1:
    #     left_prob = 0.1

    # output_line = " ".join(["{}:{:.8f}".format(w, p) for w, p in top_probs])
    # output_line += " :{:.8f}".format(left_prob)

    # # print(f"{w1} {w2} {w}" for w in out_line.split(" "))

    # return output_line


def run_predictions(source_folder):
    """Write gap-word predictions for every row of ``<source_folder>/in.tsv``.

    For each input row the last two tab-separated fields are taken as the
    left and right context.  The last token of the left context and the
    first token of the right context are passed to ``argmax`` and the
    returned line is written to ``<source_folder>/out_kenlm.tsv``.  Exactly
    one output line is written per input line; rows without a usable
    context receive a fixed fallback line.

    Args:
        source_folder: directory containing ``in.tsv``; the output file is
            created in the same directory.
    """
    # Fixed distribution written when no context word can be extracted.
    fallback_line = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"

    print(f"Run predictions on {source_folder} data...")

    with open(f"{source_folder}/in.tsv", encoding="utf8", mode="rt") as file:
        train_data = file.readlines()

    with open(f"{source_folder}/out_kenlm.tsv", "w", encoding="utf-8") as output_file:
        for line in tqdm(train_data):
            fields = line.split("\t")

            # A row with fewer than two columns has no left/right context;
            # still emit a line so the output stays aligned with the input.
            if len(fields) < 2:
                output_file.write(fallback_line + "\n")
                continue

            l1 = clean_string(fields[-2])
            l2 = clean_string(fields[-1])

            left_tokens = word_tokenize(l1) if l1 else []
            right_tokens = word_tokenize(l2) if l2 else []

            if not left_tokens or not right_tokens:
                out_line = fallback_line
            else:
                w1 = left_tokens[-1]
                # Use the whole first token of the right context; indexing
                # it again with [0] would keep only its first character.
                w2 = right_tokens[0]
                out_line = argmax(w1, w2)

            output_file.write(out_line + "\n")
    

# Generate predictions for the dev-0 split (written to dev-0/out_kenlm.tsv).
run_predictions("dev-0")
# NOTE(review): the disabled call below passes extra arguments that
# run_predictions, as defined in this file, does not accept.
# run_predictions("test-A", V_counter, V2, V3, V4)
Run predictions on dev-0 data...
  0%|          | 8/10519 [08:16<40:44:33, 13.95s/it]   
# with open("train/in.tsv", encoding="utf8", mode="rt") as file:
#   train_data = file.readlines()
#   print(len(train_data))
432022
from nltk.tokenize import word_tokenize
# Tokenize `text` with NLTK to inspect the tokens it produces.
# NOTE(review): `text` is not assigned in any visible cell — presumably it
# was defined in a cell that is not shown; confirm before re-running.
word_tokenize(text)
['rin',
 '11K',
 'ui',
 'i',
 'rsognfd',
 'inlriliinnts',
 'i',
 '>',
 'r',
 'the',
 'town',
 'ofy',
 '.-Jinn',
 ',',
 'in',
 'the',
 'county',
 'of',
 'Lincoln',
 'Rrspcrtfully',
 'rop',
 'HHont',
 ',',
 'that',
 'the',
 'part',
 'ol',
 'said',
 'town',
 'whi',
 '<',
 'h',
 'they',
 'inhabits',
 'remote',
 'from',
 'tiie',
 'viII',
 'no',
 ',',
 'and',
 'tliat',
 'they',
 'are',
 'so',
 'sit',
 'jutfd',
 '(',
 'h',
 'it',
 'they',
 'would',
 'he',
 'much',
 'hotter',
 'accomodated',
 ',',
 'f',
 'their',
 'lands',
 'were',
 'to',
 '1',
 'c',
 'm',
 'oil',
 '*',
 'from',
 'raid',
 'town',
 'ofMna',
 'and',
 'allix',
 '*',
 'd',
 'and',
 'attached',
 'to',
 'flic',
 'town',
 'of',
 'Wis',
 'tassel',
 'the',
 'si',
 'ire',
 'town',
 'of',
 'tlio',
 'County',
 ',',
 'and',
 'wherenost',
 'of',
 'their',
 'hmdmss',
 'is',
 'transacted',
 '.',
 'They',
 'wouldIn',
 'r',
 'lore',
 'petition',
 'y',
 '<',
 'tir',
 'Hole',
 'r.ible',
 'body',
 ',',
 'that',
 'thelividing',
 'line',
 'of',
 's.i',
 '.J',
 'towns',
 '*',
 '»',
 'f',
 'Wiscns^ct',
 '«',
 'mf',
 "-'Jim",
 '*',
 ',',
 'nav',
 'his',
 'so',
 'far',
 'alt',
 'rod',
 'ns',
 'to',
 'include',
 'their',
 'farms',
 'inmid',
 'town',
 'of',
 'VViscasset',
 ',',
 'and',
 'the',
 '!',
 'the',
 'now',
 'line',
 'ofLi',
 'vision',
 'between',
 'acid',
 'towns',
 'ninv',
 'ho',
 'as',
 'fdlows',
 '*',
 'vizlh',
 'ginning',
 'on',
 'the',
 'pi',
 'scut',
 'line',
 'dividing',
 'the',
 'towns',
 'oliVi.a',
 'assct',
 'and',
 "A'in",
 ',',
 'at',
 't',
 "'",
 '»',
 '«',
 'southeast',
 'corner',
 'idSeorgc',
 'Acorns',
 'laud',
 'in',
 'said',
 'Aina',
 'and',
 'riinninu',
 'from',
 'Northeasterly',
 'hv',
 'the',
 'head',
 'of',
 'said',
 '.^corn',
 '',
 'sand',
 'and',
 'the',
 'bonds',
 'of',
 'all',
 "the'loisjadjoiiiiiig",
 'to',
 'theVort',
 'beast',
 'Corner',
 'of',
 'the',
 'l',
 '«',
 '»',
 't',
 'now',
 'owned',
 'by',
 'Ja',
 'nes',
 '*',
 '*',
 'oyc',
 'and',
 'formerly',
 'o',
 'm',
 'd',
 'hv',
 'tin',
 '*',
 'late',
 'Hon',
 '.',
 'Abie',
 ')',
 'Wood',
 ',',
 'andbeingp-rt',
 'oflotNo.12M',
 'M.',
 'on',
 'Me',
 'vccnics',
 'piling',
 'and',
 'theme',
 '/list',
 'Northwesterly',
 'hvlie',
 'North',
 'line',
 'id',
 'said',
 'lot',
 'No',
 '.',
 '12',
 'to',
 'the',
 'southeaster',
 'y',
 'he',
 'id',
 'of',
 'land',
 'owned',
 'by',
 'Whitcomb',
 '&',
 'Groves',
 ',',
 'hence',
 'northeasterly',
 'by',
 'tiie',
 'Inal',
 'of',
 'said',
 'lot',
 'to',
 'tliolorlhonst',
 'corner',
 'thereof',
 ',',
 'thence',
 'northwesterly',
 'to',
 'Ihe',
 'line',
 'of',
 'the',
 'town',
 'of',
 'Dresden',
 ',',
 'thence',
 '8',
 '<',
 '>',
 'uthwrst',
 'rly',
 'by',
 'said',
 'Dresden',
 'Inn',
 '*',
 ',',
 'to',
 'tbu',
 'Sunth',
 'westerlyorner',
 ',',
 'of',
 'the',
 'present',
 'dividing',
 'line',
 ',',
 'I',
 'etwee',
 'n',
 'theown',
 '>',
 '',
 'of',
 '',
 '.J',
 'Im',
 'and',
 "Wiscii'^et",
 ',',
 'and',
 'thence',
 'East-',
 'joutb',
 'easterly',
 ',',
 'ly',
 'said',
 'town',
 'lino',
 'to',
 'tiie',
 'bounds',
 'first',
 'jMentioned',
 ',',
 'v',
 'jili',
 'all',
 'the',
 'lands',
 'lying',
 'vvitbiu',
 'tin',
 '*',
 'loresaid',
 'limits',
 'and',
 'that',
 'ib',
 'inhabitants',
 'thereonvilli',
 'their',
 'goods',
 'and',
 'Estate',
 ',',
 'may',
 'be',
 'set',
 'oil',
 "'",
 'fromaid',
 'town',
 'of',
 'Aina',
 'to',
 '»',
 '»',
 '»',
 'id',
 'town',
 'of',
 'Wiscassot.ton',
 'County',
 'feel',
 'an',
 'interest',
 'in',
 '.',
 'tn',
 'great',
 'is',
 'sues',
 'that',
 'are',
 'now',
 'before',
 'them',
 ',',
 'and',
 'whichare',
 'the',
 'bonds',
 'of',
 'cohesion',
 'by',
 'which',
 'thegreat',
 'Republican',
 'parly',
 'is',
 'united',
 '.',
 'I',
 'per',
 '--',
 ':',
 'ceive',
 'that',
 'the',
 'principles',
 'of',
 'liberty',
 'stillanimates',
 'you',
 'as',
 'when',
 'I',
 'last',
 'addressedyou',
 ',',
 'and',
 'I',
 'rejoice',
 '.',
 'It',
 'is',
 'not',
 'in',
 'the',
 'na',
 'ture',
 'of',
 'the',
 'cause',
 'of',
 'human',
 'freedom',
 'to',
 'diedie',
 'out',
 'of',
 'the',
 'human',
 'heart',
 '.',
 'We',
 'repre',
 'sent',
 'the',
 'righis',
 'of',
 'human',
 'liberty',
 ',',
 'the',
 'sameprinciples',
 'that',
 'inspired',
 'Jefferson',
 'andJackson',
 ',',
 'and',
 'we',
 'now',
 'stand',
 'where',
 'we',
 'al',
 'ways',
 'have',
 'stood',
 ',',
 'and',
 'always',
 'will',
 'stand',
 ',',
 'until',
 'we',
 'have',
 'attained',
 'our',
 'ends',
 '.',
 'Theelation',
 'before',
 'us',
 ',',
 'it',
 'is',
 'true',
 "'",
 ',',
 'is',
 'not',
 'a',
 ',',
 "'",
 'na',
 'tional',
 'election',
 ',',
 'and',
 'it',
 'is',
 'true',
 'that',
 'we',
 'neednot',
 'necessarily',
 'discuss',
 'National',
 'issues',
 ',',
 'but',
 'it',
 'is',
 'also',
 'true',
 'that',
 'the',
 'Republican',
 'par',
 'ty',
 'is',
 'National',
 'in',
 'its',
 'and',
 'design',
 ',',
 'and',
 'hence',
 ',',
 'every',
 'election',
 ',',
 'be',
 'it',
 'of',
 'State.or',
 ';',
 'County',
 ',',
 'or',
 'of',
 'town',
 ',',
 'or',
 'of',
 'city',
 ',',
 'partakesalike',
 'of',
 'a',
 'National',
 'nature',
 ',',
 'and',
 'their',
 're',
 'sults',
 'enter',
 'into',
 'all',
 'our',
 'general',
 'concerns.But',
 'I',
 'now',
 'propose',
 'to',
 'speak',
 'to',
 'you',
 'offacts',
 'which',
 'more',
 'immediately',
 'interestyou',
 '.',
 'I',
 'am',
 'before',
 'you',
 'as',
 'your',
 'candidatefor',
 'Governor',
 'not',
 'of',
 'my',
 'own',
 'choice',
 ',',
 "'",
 'Imay',
 'justly',
 'say',
 '.',
 'Ody',
 'ambition',
 'was',
 'satis',
 'fied',
 'with',
 'one',
 'term',
 ',',
 'and',
 'I',
 'had',
 'hoped',
 'to',
 're',
 'tire',
 'from',
 'the',
 'cares',
 'of',
 'office',
 'to',
 'devote',
 'mytime',
 'to',
 'interests',
 'of',
 'a',
 'private',
 'nature',
 '.',
 'Yetsummoned',
 'as',
 'I',
 'was',
 ',',
 'by',
 'the',
 'unanimouschoice',
 'of',
 'your',
 'representatives',
 'in',
 'Conven',
 'tion',
 ',',
 'I',
 'felt',
 'constrained',
 'to',
 'accept',
 'the',
 'callof',
 '.',
 'the',
 'Republican',
 'party',
 ',',
 'and',
 'I',
 'am',
 'hereto',
 'open',
 'to',
 'you',
 'my',
 'heart',
 'and',
 'my',
 'mind',
 'up',
 'on',
 'public',
 'questions',
 'in',
 'which',
 'you',
 'justlymanifest',
 'a',
 'deep',
 'interest',
 '.']
import pickle
# Load the object stored in V.pickle into V_counter and display it.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load V.pickle if it was produced locally or comes from a trusted source.
with open('V.pickle', 'rb') as handle:
        V_counter = pickle.load(handle)
V_counter
{'the': 9065021,
 'of': 5472207,
 'and': 4299259,
 'to': 3575612,
 'a': 2710622,
 'in': 2686894,
 'that': 1467928,
 'is': 1279167,
 'it': 1167772,
 'for': 1144284,
 'be': 992701,
 'was': 986130,
 'as': 879790,
 'at': 863453,
 'by': 858066,
 'on': 819505,
 'i': 816076,
 'with': 794078,
 'he': 776888,
 'or': 674438,
 'this': 627203,
 'his': 618101,
 'not': 604947,
 'from': 576711,
 'which': 572596,
 'are': 528619,
 'will': 519112,
 'have': 513257,
 's': 489456,
 'tho': 465585,
 'all': 463084,
 'but': 460675,
 'they': 450993,
 'an': 420170,
 'one': 413809,
 'had': 396904,
 'has': 386379,
 'their': 377294,
 'been': 374978,
 'no': 366339,
 'said': 353115,
 'were': 348313,
 'who': 342015,
 'we': 319853,
 'there': 311264,
 'would': 290263,
 '1': 286386,
 't': 275743,
 'so': 272336,
 'if': 271926,
 'any': 269024,
 'when': 268129,
 'her': 258976,
 'them': 240990,
 'him': 237535,
 'mr': 229137,
 'its': 224384,
 'you': 223369,
 'out': 222458,
 'our': 213779,
 'other': 213610,
 'time': 211490,
 'more': 207219,
 'upon': 200290,
 'than': 199152,
 'made': 198649,
 'up': 197991,
 'day': 194396,
 'such': 193026,
 'two': 192820,
 'may': 192332,
 'tbe': 190738,
 'some': 183696,
 'state': 179728,
 'j': 178635,
 'do': 176230,
 'man': 175854,
 'now': 174816,
 'can': 174633,
 'she': 172474,
 'm': 166226,
 'into': 166143,
 'e': 166003,
 'w': 164759,
 'about': 164037,
 'n': 163632,
 'new': 162739,
 'l': 158739,
 'my': 158632,
 'only': 155874,
 'men': 155281,
 'city': 149928,
 'ing': 149573,
 'then': 149545,
 'shall': 148173,
 'these': 145383,
 'after': 144729,
 'should': 142414,
 'o': 140683,
 'over': 140671,
 'great': 139053,
 'county': 135720,
 'good': 135681,
 'very': 135509,
 'what': 135139,
 'every': 134754,
 'r': 134054,
 'years': 133524,
 'd': 133321,
 'c': 132482,
 'being': 130985,
 'people': 130583,
 'first': 127281,
 '000': 127084,
 're': 125442,
 'many': 124439,
 'most': 123285,
 'could': 123230,
 'under': 122289,
 'h': 121514,
 'before': 118539,
 'well': 118108,
 'per': 114940,
 'last': 114552,
 'work': 113010,
 'same': 112079,
 'where': 111579,
 'me': 111346,
 'f': 110556,
 'mrs': 108039,
 'those': 107671,
 'ot': 107631,
 'feet': 106860,
 'much': 106570,
 'year': 104062,
 'make': 103103,
 'states': 101683,
 'three': 99943,
 'while': 97401,
 'house': 97187,
 'also': 95849,
 'old': 95558,
 'through': 94245,
 'each': 93521,
 'way': 93193,
 'country': 92494,
 'tion': 92215,
 'us': 92158,
 'little': 92011,
 'court': 90894,
 'place': 90642,
 'down': 90465,
 '2': 90005,
 'b': 89797,
 'must': 89316,
 'did': 88750,
 'land': 88682,
 'north': 87040,
 'con': 85792,
 'part': 85665,
 'south': 85226,
 'your': 85192,
 'street': 84360,
 'aud': 83993,
 'public': 81839,
 'law': 81740,
 'long': 81409,
 'without': 81332,
 'here': 80105,
 'against': 79394,
 'de': 78915,
 'th': 77471,
 'u': 76398,
 'ed': 76228,
 'until': 75857,
 'p': 75604,
 'take': 75389,
 'large': 75219,
 'united': 75181,
 'line': 74996,
 'right': 74664,
 'few': 74474,
 'general': 74442,
 'ol': 74202,
 'life': 73885,
 'west': 73557,
 'like': 73209,
 'own': 72963,
 'bo': 72946,
 'found': 72887,
 'never': 72376,
 '4': 72237,
 'company': 71150,
 'present': 70655,
 '3': 70322,
 'go': 70233,
 'water': 70171,
 'money': 69656,
 'just': 69335,
 'party': 68859,
 'government': 68460,
 'home': 68371,
 'ho': 67622,
 'even': 66865,
 'days': 66663,
 'lie': 65871,
 'business': 64810,
 'ever': 64807,
 'get': 64435,
 'interest': 64157,
 '10': 63963,
 'how': 63854,
 'war': 63838,
 'taken': 63488,
 'during': 62969,
 'given': 62934,
 'see': 62869,
 'four': 62746,
 'come': 62435,
 'case': 61818,
 'having': 61386,
 'came': 60657,
 'know': 60620,
 'side': 60173,
 'com': 60088,
 'between': 60033,
 'order': 60029,
 'back': 59161,
 'give': 58993,
 'st': 58879,
 'iu': 58846,
 'john': 58509,
 'say': 58438,
 'best': 58191,
 'put': 58187,
 'too': 58037,
 'half': 57773,
 'office': 57699,
 'thence': 57646,
 'lot': 57528,
 'fact': 57223,
 'known': 57118,
 'both': 56984,
 'power': 56978,
 'number': 56772,
 'night': 56261,
 'la': 56044,
 'world': 55992,
 'president': 55991,
 'another': 55779,
 'district': 55515,
 'v': 55512,
 'next': 55126,
 'less': 55053,
 'ii': 54831,
 'went': 54645,
 'york': 54529,
 'far': 54511,
 'within': 53995,
 'ex': 53978,
 'left': 53894,
 'young': 53382,
 'town': 53122,
 'off': 53096,
 '5': 52989,
 'hundred': 52853,
 '8': 52792,
 'east': 52776,
 'five': 52647,
 'point': 52614,
 'use': 52450,
 '*': 51877,
 'pay': 51822,
 'among': 51741,
 'yet': 51263,
 'several': 51056,
 'done': 50859,
 'bill': 50841,
 'white': 50826,
 'nnd': 50740,
 'held': 50550,
 'property': 50547,
 'road': 50330,
 'might': 50244,
 'board': 49911,
 'again': 49873,
 'high': 49557,
 'whole': 49391,
 'miss': 48883,
 'g': 48808,
 'act': 48591,
 'still': 48504,
 'hand': 48430,
 'end': 48330,
 'matter': 48328,
 'away': 48199,
 'sale': 48080,
 'ment': 47671,
 'ten': 47613,
 'because': 47468,
 'school': 47413,
 'twenty': 47404,
 'above': 47384,
 'called': 46828,
 'american': 46822,
 'y': 46356,
 'cent': 46222,
 'amount': 46115,
 'course': 45302,
 'ago': 45238,
 'small': 45187,
 'week': 45112,
 'six': 45092,
 'used': 44799,
 'section': 44395,
 'since': 44346,
 'dr': 44303,
 'once': 44211,
 'took': 44000,
 '11': 43914,
 'ami': 43913,
 '7': 43733,
 'himself': 43626,
 'nothing': 43490,
 'paid': 43343,
 'better': 43336,
 'am': 43321,
 'let': 43230,
 'bad': 43152,
 'soon': 43000,
 'clock': 42944,
 'however': 42464,
 'head': 42236,
 'k': 42178,
 'en': 42174,
 'does': 42024,
 'certain': 41908,
 'along': 41676,
 'pro': 41173,
 'body': 40913,
 'near': 40745,
 'committee': 40642,
 'thing': 40575,
 'question': 40132,
 'cause': 40071,
 'full': 40009,
 'others': 39921,
 'set': 39912,
 'brought': 39789,
 'al': 39459,
 'think': 39390,
 'making': 39357,
 'miles': 39337,
 'thought': 39327,
 'second': 39271,
 'morning': 39184,
 'though': 39178,
 'times': 39105,
 'girl': 38804,
 'boy': 38784,
 '6': 38763,
 'co': 38623,
 'room': 38449,
 'following': 38325,
 'name': 38301,
 'wife': 38295,
 'church': 38274,
 'dollars': 38002,
 'always': 37648,
 'enough': 37486,
 'thus': 37477,
 'un': 37410,
 'almost': 37402,
 'cannot': 37223,
 'able': 37192,
 'river': 36841,
 'find': 36795,
 '00': 36793,
 'ground': 36537,
 'due': 36444,
 'children': 36286,
 'got': 36227,
 'free': 36206,
 'light': 36137,
 'action': 36062,
 'ia': 36049,
 'washington': 35891,
 'friends': 35600,
 'says': 35599,
 'stock': 35587,
 'lo': 35573,
 'whom': 35563,
 'whose': 35346,
 'service': 35273,
 'received': 35272,
 'means': 34777,
 'person': 34759,
 'necessary': 34700,
 'nor': 34676,
 'told': 34675,
 'death': 34557,
 'sent': 34369,
 'further': 34226,
 'purpose': 34128,
 'er': 34115,
 'things': 34079,
 'tha': 33661,
 'congress': 33650,
 'bis': 33499,
 'passed': 33493,
 'seen': 33484,
 'national': 33330,
 'building': 33234,
 'keep': 33214,
 'front': 33196,
 'block': 33088,
 'real': 33028,
 'aa': 32803,
 'going': 32767,
 'past': 32699,
 'whether': 32622,
 'months': 32443,
 'dis': 32419,
 'ly': 32398,
 'true': 32297,
 'sum': 32271,
 'woman': 32180,
 'subject': 32114,
 '50': 32109,
 'either': 32013,
 'railroad': 31994,
 'son': 31985,
 'members': 31976,
 'union': 31922,
 'system': 31839,
 '0': 31799,
 'gold': 31698,
 'around': 31668,
 'persons': 31587,
 '20': 31585,
 'sold': 31542,
 'duty': 31529,
 'market': 31376,
 'least': 31270,
 'show': 31147,
 'form': 30989,
 'hands': 30983,
 '12': 30964,
 'saw': 30856,
 'tlie': 30853,
 'family': 30818,
 'cost': 30746,
 'report': 30665,
 'why': 30549,
 'nearly': 30520,
 'election': 30453,
 'short': 30337,
 'price': 30306,
 'become': 30266,
 'notice': 30132,
 'look': 30122,
 'condition': 30013,
 '30': 29989,
 'open': 29981,
 'meeting': 29913,
 'kind': 29855,
 'lots': 29836,
 'corner': 29771,
 'women': 29510,
 'together': 29506,
 'possible': 29491,
 'ihe': 29406,
 'gave': 29384,
 '100': 29274,
 'themselves': 29250,
 'reason': 29105,
 'labor': 29043,
 'ter': 29006,
 'judge': 28965,
 'vote': 28927,
 'result': 28914,
 'third': 28722,
 'run': 28717,
 'fair': 28653,
 'tin': 28635,
 'value': 28498,
 'mortgage': 28465,
 'eight': 28464,
 'ad': 28331,
 'position': 28286,
 'evening': 28178,
 'wo': 28163,
 'thereof': 28056,
 '9': 28023,
 'tor': 27921,
 'lor': 27839,
 'provided': 27801,
 'bank': 27781,
 'cut': 27746,
 'im': 27696,
 'described': 27667,
 'believe': 27648,
 'hour': 27634,
 'paper': 27584,
 'hold': 27567,
 'live': 27543,
 '15': 27446,
 'acres': 27399,
 'god': 27370,
 'early': 27298,
 '25': 27277,
 'quarter': 27190,
 'thirty': 27132,
 'want': 27115,
 'therefore': 27093,
 'late': 27091,
 'call': 26989,
 'charge': 26959,
 'heard': 26900,
 'army': 26885,
 'effect': 26707,
 'waa': 26689,
 'laws': 26659,
 'face': 26638,
 'oi': 26628,
 'cents': 26425,
 'stand': 26384,
 'age': 26226,
 'kept': 26167,
 'fire': 26109,
 'tne': 26103,
 'date': 25797,
 'placed': 25621,
 'common': 25606,
 'mind': 25554,
 'william': 25538,
 'march': 25514,
 'door': 25482,
 'heart': 25459,
 'republican': 25359,
 'aid': 25218,
 'special': 25161,
 'force': 25130,
 'ap': 25055,
 'beginning': 25026,
 'thousand': 25006,
 'secretary': 25005,
 'strong': 24999,
 'ac': 24935,
 'claim': 24931,
 'farm': 24896,
 'officers': 24822,
 'father': 24813,
 'estate': 24803,
 'political': 24619,
 'tax': 24583,
 'except': 24565,
 'manner': 24525,
 'cases': 24524,
 'lands': 24481,
 'department': 24456,
 'ar': 24372,
 'hard': 24357,
 'already': 24286,
 'proper': 24281,
 'hi': 24258,
 'required': 24237,
 'low': 24225,
 'air': 24213,
 'trust': 24206,
 'asked': 24203,
 'james': 24201,
 'blood': 24189,
 'book': 24186,
 'meet': 24156,
 'poor': 24116,
 'fall': 24072,
 'george': 24063,
 'trade': 24019,
 'big': 23973,
 'quite': 23900,
 'car': 23566,
 'ready': 23514,
 'often': 23510,
 'close': 23374,
 'field': 23359,
 'bonds': 23337,
 'read': 23320,
 'attention': 23309,
 'view': 23229,
 'class': 23192,
 'red': 23181,
 'hut': 23150,
 'care': 23107,
 'mother': 23095,
 'black': 23081,
 'tell': 23073,
 'deed': 23072,
 'return': 23011,
 'gen': 23006,
 'tions': 22984,
 'lost': 22978,
 'something': 22884,
 'favor': 22766,
 'nt': 22733,
 'rate': 22629,
 'health': 22618,
 'weeks': 22573,
 'fine': 22567,
 'oil': 22549,
 'taking': 22481,
 'hereby': 22425,
 'follows': 22375,
 'hours': 22368,
 'hope': 22366,
 'july': 22337,
 'letter': 22320,
 'seven': 22309,
 'turned': 22295,
 'pre': 22155,
 'change': 22088,
 'yesterday': 22085,
 'demand': 22065,
 'don': 22026,
 'corn': 22006,
 'governor': 21960,
 'democratic': 21956,
 'senate': 21946,
 'need': 21937,
 'coming': 21932,
 'prices': 21903,
 'try': 21890,
 'knew': 21885,
 'eyes': 21879,
 'virginia': 21859,
 'carried': 21841,
 'minutes': 21785,
 'train': 21780,
 'opinion': 21763,
 'itself': 21699,
 'doubt': 21693,
 'leave': 21663,
 'grand': 21626,
 'account': 21583,
 'month': 21563,
 'nature': 21520,
 'citizens': 21483,
 'sell': 21470,
 'food': 21406,
 'rather': 21324,
 'western': 21296,
 'nation': 21288,
 'character': 21283,
 'bring': 21268,
 'although': 21250,
 'ns': 21219,
 'seems': 21196,
 'probably': 21095,
 'southern': 21015,
 'dead': 20966,
 'worth': 20918,
 'anything': 20847,
 'began': 20846,
 'li': 20795,
 'child': 20729,
 'silver': 20723,
 'according': 20646,
 'fifty': 20565,
 'hall': 20553,
 'important': 20539,
 'charles': 20488,
 'smith': 20474,
 'chief': 20472,
 'doing': 20464,
 'love': 20462,
 'turn': 20440,
 'june': 20423,
 'ti': 20420,
 'senator': 20412,
 'feel': 20405,
 'wheat': 20394,
 'latter': 20375,
 'entire': 20375,
 'iron': 20371,
 'heavy': 20328,
 'story': 20296,
 'different': 20231,
 'record': 20197,
 'il': 20195,
 'met': 20185,
 'ou': 20136,
 'terms': 20037,
 'ton': 19905,
 'spring': 19903,
 'became': 19897,
 '13': 19826,
 'peace': 19819,
 'seemed': 19796,
 'ship': 19773,
 'fully': 19727,
 'ill': 19709,
 'various': 19682,
 'post': 19655,
 'horse': 19616,
 'named': 19559,
 'running': 19554,
 'gone': 19525,
 'avenue': 19521,
 'range': 19503,
 'mo': 19496,
 'reached': 19480,
 'ha': 19462,
 'plan': 19424,
 'season': 19391,
 'clerk': 19340,
 'appear': 19315,
 'inches': 19302,
 'convention': 19295,
 'living': 19292,
 'portion': 19268,
 'help': 19267,
 'member': 19224,
 'perhaps': 19219,
 'chicago': 19191,
 'aad': 19174,
 'later': 19171,
 'places': 19117,
 'rest': 19087,
 'main': 19080,
 'rights': 19076,
 '40': 19053,
 'conditions': 19053,
 'april': 19024,
 'future': 19023,
 'greater': 19019,
 'constitution': 18998,
 'foot': 18993,
 'words': 18974,
 'success': 18973,
 'justice': 18935,
 'hill': 18855,
 'und': 18825,
 'streets': 18812,
 'sec': 18737,
 'crop': 18729,
 'forty': 18729,
 'today': 18701,
 'loss': 18680,
 '14': 18646,
 'friend': 18607,
 'word': 18596,
 'alone': 18554,
 'local': 18551,
 'sea': 18522,
 'lu': 18511,
 'payment': 18495,
 'laid': 18465,
 'generally': 18461,
 'winter': 18458,
 'col': 18410,
 'majority': 18392,
 'support': 18372,
 'history': 18307,
 'till': 18239,
 'regard': 18214,
 'earth': 18210,
 'england': 18198,
 'nine': 18193,
 'aro': 18154,
 'cash': 18104,
 'cotton': 18065,
 'ohio': 18020,
 'foreign': 17966,
 'interests': 17953,
 'king': 17938,
 'judgment': 17937,
 'makes': 17916,
 'stated': 17901,
 'toward': 17897,
 'lower': 17875,
 'wit': 17873,
 'equal': 17867,
 'mary': 17853,
 'wood': 17838,
 'capital': 17817,
 'parties': 17781,
 'felt': 17778,
 'looked': 17754,
 'died': 17744,
 'pass': 17737,
 '18': 17726,
 'arc': 17712,
 'moment': 17704,
 'afternoon': 17700,
 'ty': 17691,
 'period': 17680,
 'lines': 17648,
 'returned': 17621,
 'unless': 17614,
 'increase': 17599,
 'idea': 17586,
 'private': 17576,
 '16': 17569,
 'lake': 17550,
 'ber': 17529,
 'giving': 17519,
 'cold': 17504,
 'personal': 17444,
 'lay': 17421,
 'farmers': 17403,
 'degrees': 17385,
 'policy': 17359,
 'ma': 17356,
 'ft': 17331,
 'henry': 17327,
 'cor': 17305,
 'territory': 17287,
 'disease': 17279,
 'comes': 17200,
 'supply': 17199,
 'es': 17189,
 'spirit': 17174,
 'boys': 17143,
 'brown': 17138,
 'followed': 17117,
 'ought': 17101,
 'secured': 17039,
 'township': 17004,
 'secure': 16912,
 'carry': 16885,
 'society': 16870,
 'shown': 16865,
 'fore': 16850,
 'au': 16850,
 'sure': 16847,
 'human': 16805,
 'monday': 16785,
 'especially': 16784,
 'entirely': 16724,
 'tbo': 16675,
 'rich': 16674,
 'clear': 16614,
 'farmer': 16596,
 'soil': 16560,
 'trouble': 16534,
 'elected': 16524,
 'coal': 16521,
 'ward': 16506,
 'stone': 16477,
 'self': 16457,
 'america': 16439,
 'taxes': 16396,
 'll': 16384,
 'tried': 16360,
 'ana': 16325,
 'former': 16324,
 'term': 16310,
 'honor': 16306,
 'ordered': 16303,
 'sunday': 16291,
 'premises': 16249,
 'started': 16245,
 'bed': 16221,
 'goods': 16187,
 'instead': 16184,
 'thomas': 16159,
 'trial': 16141,
 'across': 16122,
 'beautiful': 16119,
 'pa': 16110,
 'strength': 16083,
 'allowed': 16073,
 'deal': 16044,
 'port': 15990,
 'lady': 15937,
 'highest': 15934,
 'parts': 15933,
 'pounds': 15929,
 'island': 15921,
 'top': 15883,
 'deep': 15883,
 'session': 15874,
 'recorded': 15839,
 'control': 15819,
 'served': 15812,
 'entered': 15787,
 'military': 15785,
 'tl': 15751,
 'none': 15751,
 'stood': 15751,
 'french': 15748,
 'answer': 15742,
 'seem': 15725,
 'saturday': 15654,
 'legislature': 15644,
 'sun': 15644,
 'sufficient': 15627,
 '17': 15585,
 'houses': 15573,
 'rev': 15552,
 'article': 15541,
 'evidence': 15538,
 'expected': 15532,
 'statement': 15512,
 '500': 15496,
 'object': 15493,
 'thc': 15493,
 'built': 15483,
 'win': 15459,
 'suit': 15456,
 'reported': 15446,
 'attorney': 15443,
 'club': 15436,
 'fur': 15432,
 'note': 15422,
 'officer': 15418,
 'total': 15411,
 'distance': 15389,
 'ono': 15385,
 'january': 15384,
 'cure': 15376,
 'council': 15371,
 'issue': 15364,
 'se': 15350,
 'immediately': 15310,
 'race': 15306,
 'san': 15278,
 'green': 15273,
 'wa': 15230,
 'looking': 15218,
 'debt': 15201,
 'firm': 15194,
 'ers': 15175,
 'louis': 15158,
 'roads': 15145,
 'ne': 15143,
 'hat': 15138,
 'twelve': 15108,
 'forth': 15093,
 'claims': 15090,
 'higher': 15077,
 'offered': 15065,
 'id': 15058,
 'august': 15049,
 'finally': 15046,
 'receive': 15035,
 'captain': 15012,
 'fell': 15011,
 'commission': 14989,
 'havo': 14976,
 'bear': 14965,
 'bv': 14962,
 'dakota': 14960,
 'ness': 14948,
 'issued': 14938,
 'husband': 14926,
 'proposed': 14925,
 'points': 14912,
 'principal': 14901,
 'killed': 14901,
 'won': 14890,
 'wide': 14874,
 'le': 14849,
 'tie': 14828,
 'getting': 14805,
 'store': 14797,
 'etc': 14782,
 'single': 14779,
 'schools': 14751,
 'news': 14736,
 'natural': 14726,
 'direction': 14706,
 'opened': 14684,
 'police': 14681,
 'dry': 14666,
 'whatever': 14661,
 'game': 14652,
 'below': 14648,
 'trees': 14631,
 'quiet': 14630,
 'follow': 14622,
 'hear': 14621,
 'desire': 14621,
 'mining': 14592,
 'summer': 14561,
 'ai': 14560,
 'ir': 14555,
 'addition': 14547,
 'page': 14484,
 'fourth': 14476,
 'beyond': 14424,
 'press': 14377,
 'average': 14376,
 'dated': 14368,
 'led': 14362,
 'regular': 14336,
 'tba': 14332,
 'length': 14328,
 'continued': 14283,
 'northern': 14280,
 ...}
# Show how many entries V_counter holds.
len(V_counter)
10000