50 KiB
50 KiB
# xzcat -f1 train/in.tsv.xz | cut -f7,8 | sed 's/-\\\\n/ /g' | sed 's/\\\\n//g' | sed 's/\\\\//g' | ../kenlm/build/bin/lmplz -o 5 > kenlm_model.arpa
# ../kenlm/build/bin/build_binary kenlm_model.arpa kenlm_model.binary
import regex as re
# save train text to file
def clean_string(text):
    """Normalise one raw text field.

    The field is lower-cased, two backslash-escape patterns are rewritten
    and leading/trailing whitespace is trimmed.

    Args:
        text: Raw string taken from one column of the TSV data.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    # Delete " -" followed by three or more backslashes and an "n".
    # NOTE(review): presumably meant to re-join words hyphenated across an
    # escaped line break -- confirm the escaping level against the data.
    joined = re.sub(r" -\\\\*\\\\n", "", lowered)
    # Replace every remaining double-backslash + "n" sequence with a space.
    spaced = re.sub(r"\\\\n", " ", joined)
    return spaced.strip()
# Build the plain-text training corpus: for every training row the left
# context, the expected gap word and the right context are stitched back
# together (columns -2 and -1 of in.tsv, plus the matching expected.tsv line).
print("Reading train data...")
# Collect the per-row pieces in a list and join once at the end; repeated
# `str +=` on a corpus of this size copies the growing string on every row.
corpus_parts = []
with open("train/in.tsv", encoding="utf8", mode="rt") as file, \
        open("train/expected.tsv", encoding="utf8", mode="rt") as expected:
    for t_line, e_line in zip(file, expected):
        t_line = t_line.split("\t")
        corpus_parts.append(
            clean_string(t_line[-2]) + f" {clean_string(e_line)} " + clean_string(t_line[-1])
        )
# NOTE(review): rows are concatenated without any separator, so the last word
# of one row is glued to the first word of the next -- confirm this is wanted
# before feeding the file to the language-model trainer.
train_text = "".join(corpus_parts)
# save train_text to file
print("saving to file...")
with open("train_text.txt", encoding="utf8", mode="w") as file:
    file.write(train_text)
Reading train data...
import kenlm
# Load the binary KenLM model and print its score for a short sample phrase
# as a quick sanity check. `model` is reused by the prediction code below.
sentence = "of the way"
path = "test_model.binary"
model = kenlm.Model(path)
print(model.score(sentence))
-7.822547912597656
from tqdm import tqdm
import regex as re
from nltk.tokenize import word_tokenize
from english_words import get_english_words_set
def clean_string(text):
    """Return ``text`` lower-cased, with escape sequences rewritten, stripped.

    Args:
        text: Raw string taken from one column of the TSV data.

    Returns:
        The cleaned string.
    """
    result = text.lower()
    substitutions = (
        # " -" followed by three or more backslashes and "n" is removed.
        (r" -\\\\*\\\\n", ""),
        # Any remaining double-backslash + "n" sequence becomes one space.
        (r"\\\\n", " "),
    )
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()
_ENGLISH_WORDS = None  # lazily filled cache of the lower-cased 'web2' words


def _candidate_words():
    """Return the lower-cased 'web2' word set, building it only once."""
    global _ENGLISH_WORDS
    if _ENGLISH_WORDS is None:
        _ENGLISH_WORDS = get_english_words_set(['web2'], lower=True)
    return _ENGLISH_WORDS


def get_word_predictions(w1, w2):
    """Yield ``(word, score)`` for every candidate word placed between w1 and w2.

    Each candidate from the cached English word set is scored by the
    module-level KenLM ``model`` on the three-word string ``"w1 word w2"``
    without begin/end-of-sentence markers.

    Args:
        w1: Word immediately to the left of the gap.
        w2: Word immediately to the right of the gap.

    Yields:
        Tuples ``(word, text_score)`` where ``text_score`` is the value
        returned by ``model.score``.
    """
    # The surrounding context is the same for every candidate, so build the
    # two halves once instead of re-concatenating them on each iteration.
    prefix = w1 + ' '
    suffix = ' ' + w2
    # The word set used to be rebuilt on every call (once per input row);
    # it never changes, so it is fetched from the one-time cache instead.
    for word in _candidate_words():
        text_score = model.score(prefix + word + suffix, bos=False, eos=False)
        yield (word, text_score)
def argmax(w1, w2):
    """Return the output line with the ten best gap words between w1 and w2.

    The scores produced by ``get_word_predictions`` are KenLM log10
    probabilities (negative numbers), so they cannot be written out directly
    next to the ``word:probability`` fallback lines used elsewhere in this
    file. They are converted to probabilities normalised over all candidates;
    the ten most likely words are kept and the remaining mass is assigned to
    the trailing wildcard entry ``:p``.

    Args:
        w1: Word immediately to the left of the gap.
        w2: Word immediately to the right of the gap.

    Returns:
        A string ``"word:prob word:prob ... :rest"`` whose probabilities sum
        to one. The wildcard always receives at least 0.1; when that floor
        applies, the word probabilities are scaled down to make room for it.
    """
    fallback_line = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
    scored = list(get_word_predictions(w1, w2))
    if not scored:
        return fallback_line

    # Shift by the best score before exponentiating: the best candidate gets
    # weight 1.0, so the normaliser can never underflow to zero.
    best_score = max(score for _, score in scored)
    weighted = [(word, 10.0 ** (score - best_score)) for word, score in scored]
    total_weight = sum(weight for _, weight in weighted)

    top_10 = sorted(weighted, key=lambda x: -x[1])[:10]
    top_weight = sum(weight for _, weight in top_10)

    # Mass left for every word outside the top ten, floored at 0.1.
    rest_prob = max(1.0 - top_weight / total_weight, 0.1)
    scale = (1.0 - rest_prob) / top_weight

    output_line = " ".join("{}:{:.8f}".format(w, weight * scale) for w, weight in top_10)
    output_line += " :{:.8f}".format(rest_prob)
    return output_line
# print(f"{sentence}: {text_score}")
# probs = list(argmax(w1, w2, w4, w5, v, v2, v3))
# sum_prob = sum(p for (w, p) in probs)
# try:
# probs = [(w, p / sum_prob) for w, p in probs]
# except ZeroDivisionError:
# return "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
# top_probs = sorted(probs, key=lambda x: -x[1])[:4]
# top_probs = [(w,p) for (w,p) in top_probs if p > 0]
# del probs
# del sum_prob
# if len(top_probs) == 0:
# return "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
# left_prob = 1 - sum(p for (w, p) in top_probs)
# if left_prob < 0.1:
# left_prob = 0.1
# output_line = " ".join(["{}:{:.8f}".format(w, p) for w, p in top_probs])
# output_line += " :{:.8f}".format(left_prob)
# # print(f"{w1} {w2} {w}" for w in out_line.split(" "))
# return output_line
def run_predictions(source_folder):
    """Write gap-word predictions for every row of ``<source_folder>/in.tsv``.

    For each input row the last token of the left context (column -2) and the
    first token of the right context (column -1) are passed to ``argmax``;
    the resulting line is written to ``<source_folder>/out_kenlm.tsv``. Rows
    for which no context word can be extracted receive a fixed fallback
    distribution, so the output always has one line per input row.

    Args:
        source_folder: Directory containing ``in.tsv``; the output file is
            created in the same directory.
    """
    fallback_line = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
    print(f"Run predictions on {source_folder} data...")
    with open(f"{source_folder}/in.tsv", encoding="utf8", mode="rt") as file:
        train_data = file.readlines()
    with open(f"{source_folder}/out_kenlm.tsv", "w", encoding="utf-8") as output_file:
        for line in tqdm(train_data):
            line = line.split("\t")
            if len(line) < 2:
                # Malformed row without both context columns: keep the output
                # aligned with the input instead of raising IndexError.
                output_file.write(fallback_line + "\n")
                continue
            l1 = clean_string(line[-2])
            l2 = clean_string(line[-1])
            left_tokens = word_tokenize(l1) if l1 else []
            right_tokens = word_tokenize(l2) if l2 else []
            if left_tokens and right_tokens:
                w1 = left_tokens[-1]
                # Whole first token of the right context; indexing it again
                # with [0] would pass only its first character to the model.
                w2 = right_tokens[0]
                out_line = argmax(w1, w2)
            else:
                out_line = fallback_line
            output_file.write(out_line + "\n")
# Generate predictions for the dev-0 split (reads dev-0/in.tsv and writes
# dev-0/out_kenlm.tsv).
run_predictions("dev-0")
# run_predictions("test-A", V_counter, V2, V3, V4)
Run predictions on dev-0 data...
0%| | 8/10519 [08:16<40:44:33, 13.95s/it]
# with open("train/in.tsv", encoding="utf8", mode="rt") as file:
# train_data = file.readlines()
# print(len(train_data))
432022
from nltk.tokenize import word_tokenize
# Display the NLTK tokenisation of `text`.
# NOTE(review): `text` is not defined in the visible part of the file --
# presumably set in an earlier notebook cell; confirm before re-running.
word_tokenize(text)
['rin', '11K', 'ui', 'i', 'rsognfd', 'inlriliinnts', 'i', '>', 'r', 'the', 'town', 'ofy', '.-Jinn', ',', 'in', 'the', 'county', 'of', 'Lincoln', 'Rrspcrtfully', 'rop', 'HHont', ',', 'that', 'the', 'part', 'ol', 'said', 'town', 'whi', '<', 'h', 'they', 'inhabits', 'remote', 'from', 'tiie', 'viII', 'no', ',', 'and', 'tliat', 'they', 'are', 'so', 'sit', 'jutfd', '(', 'h', 'it', 'they', 'would', 'he', 'much', 'hotter', 'accomodated', ',', 'f', 'their', 'lands', 'were', 'to', '1', 'c', 'm', 'oil', '*', 'from', 'raid', 'town', 'ofMna', 'and', 'allix', '*', 'd', 'and', 'attached', 'to', 'flic', 'town', 'of', 'Wis', 'tassel', 'the', 'si', 'ire', 'town', 'of', 'tlio', 'County', ',', 'and', 'wherenost', 'of', 'their', 'hmdmss', 'is', 'transacted', '.', 'They', 'wouldIn', 'r', 'lore', 'petition', 'y', '<', 'tir', 'Hole', 'r.ible', 'body', ',', 'that', 'thelividing', 'line', 'of', 's.i', '.J', 'towns', '*', '»', 'f', 'Wiscns^ct', '«', 'mf', "-'Jim", '*', ',', 'nav', 'his', 'so', 'far', 'alt', 'rod', 'ns', 'to', 'include', 'their', 'farms', 'inmid', 'town', 'of', 'VViscasset', ',', 'and', 'the', '!', 'the', 'now', 'line', 'ofLi', 'vision', 'between', 'acid', 'towns', 'ninv', 'ho', 'as', 'fdlows', '*', 'vizlh', 'ginning', 'on', 'the', 'pi', 'scut', 'line', 'dividing', 'the', 'towns', 'oliVi.a', 'assct', 'and', "A'in", ',', 'at', 't', "'", '»', '«', 'southeast', 'corner', 'idSeorgc', 'Acorns', 'laud', 'in', 'said', 'Aina', 'and', 'riinninu', 'from', 'Northeasterly', 'hv', 'the', 'head', 'of', 'said', '.^corn', '’', 'sand', 'and', 'the', 'bonds', 'of', 'all', "the'loisjadjoiiiiiig", 'to', 'theVort', 'beast', 'Corner', 'of', 'the', 'l', '«', '»', 't', 'now', 'owned', 'by', 'Ja', 'nes', '*', '*', 'oyc', 'and', 'formerly', 'o', 'm', 'd', 'hv', 'tin', '*', 'late', 'Hon', '.', 'Abie', ')', 'Wood', ',', 'andbeingp-rt', 'oflotNo.12M', 'M.', 'on', 'Me', 'vccnics', 'piling', 'and', 'theme', '/list', 'Northwesterly', 'hvlie', 'North', 'line', 'id', 'said', 'lot', 'No', '.', '12', 'to', 
'the', 'southeaster', 'y', 'he', 'id', 'of', 'land', 'owned', 'by', 'Whitcomb', '&', 'Groves', ',', 'hence', 'northeasterly', 'by', 'tiie', 'Inal', 'of', 'said', 'lot', 'to', 'tliolorlhonst', 'corner', 'thereof', ',', 'thence', 'northwesterly', 'to', 'Ihe', 'line', 'of', 'the', 'town', 'of', 'Dresden', ',', 'thence', '8', '<', '>', 'uthwrst', 'rly', 'by', 'said', 'Dresden', 'Inn', '*', ',', 'to', 'tbu', 'Sunth', 'westerlyorner', ',', 'of', 'the', 'present', 'dividing', 'line', ',', 'I', 'etwee', 'n', 'theown', '>', '‘', 'of', '’', '.J', 'Im', 'and', "Wiscii'^et", ',', 'and', 'thence', 'East-', 'joutb', 'easterly', ',', 'ly', 'said', 'town', 'lino', 'to', 'tiie', 'bounds', 'first', 'jMentioned', ',', 'v', 'jili', 'all', 'the', 'lands', 'lying', 'vvitbiu', 'tin', '*', 'loresaid', 'limits', 'and', 'that', 'ib', 'inhabitants', 'thereonvilli', 'their', 'goods', 'and', 'Estate', ',', 'may', 'be', 'set', 'oil', "'", 'fromaid', 'town', 'of', 'Aina', 'to', '»', '»', '»', 'id', 'town', 'of', 'Wiscassot.ton', 'County', 'feel', 'an', 'interest', 'in', '.', 'tn', 'great', 'is', 'sues', 'that', 'are', 'now', 'before', 'them', ',', 'and', 'whichare', 'the', 'bonds', 'of', 'cohesion', 'by', 'which', 'thegreat', 'Republican', 'parly', 'is', 'united', '.', 'I', 'per', '--', ':', 'ceive', 'that', 'the', 'principles', 'of', 'liberty', 'stillanimates', 'you', 'as', 'when', 'I', 'last', 'addressedyou', ',', 'and', 'I', 'rejoice', '.', 'It', 'is', 'not', 'in', 'the', 'na', 'ture', 'of', 'the', 'cause', 'of', 'human', 'freedom', 'to', 'diedie', 'out', 'of', 'the', 'human', 'heart', '.', 'We', 'repre', 'sent', 'the', 'righis', 'of', 'human', 'liberty', ',', 'the', 'sameprinciples', 'that', 'inspired', 'Jefferson', 'andJackson', ',', 'and', 'we', 'now', 'stand', 'where', 'we', 'al', 'ways', 'have', 'stood', ',', 'and', 'always', 'will', 'stand', ',', 'until', 'we', 'have', 'attained', 'our', 'ends', '.', 'Theelation', 'before', 'us', ',', 'it', 'is', 'true', "'", ',', 'is', 'not', 'a', ',', 
"'", 'na', 'tional', 'election', ',', 'and', 'it', 'is', 'true', 'that', 'we', 'neednot', 'necessarily', 'discuss', 'National', 'issues', ',', 'but', 'it', 'is', 'also', 'true', 'that', 'the', 'Republican', 'par', 'ty', 'is', 'National', 'in', 'its', 'and', 'design', ',', 'and', 'hence', ',', 'every', 'election', ',', 'be', 'it', 'of', 'State.or', ';', 'County', ',', 'or', 'of', 'town', ',', 'or', 'of', 'city', ',', 'partakesalike', 'of', 'a', 'National', 'nature', ',', 'and', 'their', 're', 'sults', 'enter', 'into', 'all', 'our', 'general', 'concerns.But', 'I', 'now', 'propose', 'to', 'speak', 'to', 'you', 'offacts', 'which', 'more', 'immediately', 'interestyou', '.', 'I', 'am', 'before', 'you', 'as', 'your', 'candidatefor', 'Governor', 'not', 'of', 'my', 'own', 'choice', ',', "'", 'Imay', 'justly', 'say', '.', 'Ody', 'ambition', 'was', 'satis', 'fied', 'with', 'one', 'term', ',', 'and', 'I', 'had', 'hoped', 'to', 're', 'tire', 'from', 'the', 'cares', 'of', 'office', 'to', 'devote', 'mytime', 'to', 'interests', 'of', 'a', 'private', 'nature', '.', 'Yetsummoned', 'as', 'I', 'was', ',', 'by', 'the', 'unanimouschoice', 'of', 'your', 'representatives', 'in', 'Conven', 'tion', ',', 'I', 'felt', 'constrained', 'to', 'accept', 'the', 'callof', '.', 'the', 'Republican', 'party', ',', 'and', 'I', 'am', 'hereto', 'open', 'to', 'you', 'my', 'heart', 'and', 'my', 'mind', 'up', 'on', 'public', 'questions', 'in', 'which', 'you', 'justlymanifest', 'a', 'deep', 'interest', '.']
import pickle
# Load the pickled vocabulary counter from disk and display it.
# NOTE(review): unpickling can execute arbitrary code; only load V.pickle
# if it was produced by a trusted run of this project.
with open("V.pickle", mode="rb") as pickle_file:
    V_counter = pickle.load(pickle_file)
V_counter
{'the': 9065021, 'of': 5472207, 'and': 4299259, 'to': 3575612, 'a': 2710622, 'in': 2686894, 'that': 1467928, 'is': 1279167, 'it': 1167772, 'for': 1144284, 'be': 992701, 'was': 986130, 'as': 879790, 'at': 863453, 'by': 858066, 'on': 819505, 'i': 816076, 'with': 794078, 'he': 776888, 'or': 674438, 'this': 627203, 'his': 618101, 'not': 604947, 'from': 576711, 'which': 572596, 'are': 528619, 'will': 519112, 'have': 513257, 's': 489456, 'tho': 465585, 'all': 463084, 'but': 460675, 'they': 450993, 'an': 420170, 'one': 413809, 'had': 396904, 'has': 386379, 'their': 377294, 'been': 374978, 'no': 366339, 'said': 353115, 'were': 348313, 'who': 342015, 'we': 319853, 'there': 311264, 'would': 290263, '1': 286386, 't': 275743, 'so': 272336, 'if': 271926, 'any': 269024, 'when': 268129, 'her': 258976, 'them': 240990, 'him': 237535, 'mr': 229137, 'its': 224384, 'you': 223369, 'out': 222458, 'our': 213779, 'other': 213610, 'time': 211490, 'more': 207219, 'upon': 200290, 'than': 199152, 'made': 198649, 'up': 197991, 'day': 194396, 'such': 193026, 'two': 192820, 'may': 192332, 'tbe': 190738, 'some': 183696, 'state': 179728, 'j': 178635, 'do': 176230, 'man': 175854, 'now': 174816, 'can': 174633, 'she': 172474, 'm': 166226, 'into': 166143, 'e': 166003, 'w': 164759, 'about': 164037, 'n': 163632, 'new': 162739, 'l': 158739, 'my': 158632, 'only': 155874, 'men': 155281, 'city': 149928, 'ing': 149573, 'then': 149545, 'shall': 148173, 'these': 145383, 'after': 144729, 'should': 142414, 'o': 140683, 'over': 140671, 'great': 139053, 'county': 135720, 'good': 135681, 'very': 135509, 'what': 135139, 'every': 134754, 'r': 134054, 'years': 133524, 'd': 133321, 'c': 132482, 'being': 130985, 'people': 130583, 'first': 127281, '000': 127084, 're': 125442, 'many': 124439, 'most': 123285, 'could': 123230, 'under': 122289, 'h': 121514, 'before': 118539, 'well': 118108, 'per': 114940, 'last': 114552, 'work': 113010, 'same': 112079, 'where': 111579, 'me': 111346, 'f': 110556, 'mrs': 108039, 'those': 
107671, 'ot': 107631, 'feet': 106860, 'much': 106570, 'year': 104062, 'make': 103103, 'states': 101683, 'three': 99943, 'while': 97401, 'house': 97187, 'also': 95849, 'old': 95558, 'through': 94245, 'each': 93521, 'way': 93193, 'country': 92494, 'tion': 92215, 'us': 92158, 'little': 92011, 'court': 90894, 'place': 90642, 'down': 90465, '2': 90005, 'b': 89797, 'must': 89316, 'did': 88750, 'land': 88682, 'north': 87040, 'con': 85792, 'part': 85665, 'south': 85226, 'your': 85192, 'street': 84360, 'aud': 83993, 'public': 81839, 'law': 81740, 'long': 81409, 'without': 81332, 'here': 80105, 'against': 79394, 'de': 78915, 'th': 77471, 'u': 76398, 'ed': 76228, 'until': 75857, 'p': 75604, 'take': 75389, 'large': 75219, 'united': 75181, 'line': 74996, 'right': 74664, 'few': 74474, 'general': 74442, 'ol': 74202, 'life': 73885, 'west': 73557, 'like': 73209, 'own': 72963, 'bo': 72946, 'found': 72887, 'never': 72376, '4': 72237, 'company': 71150, 'present': 70655, '3': 70322, 'go': 70233, 'water': 70171, 'money': 69656, 'just': 69335, 'party': 68859, 'government': 68460, 'home': 68371, 'ho': 67622, 'even': 66865, 'days': 66663, 'lie': 65871, 'business': 64810, 'ever': 64807, 'get': 64435, 'interest': 64157, '10': 63963, 'how': 63854, 'war': 63838, 'taken': 63488, 'during': 62969, 'given': 62934, 'see': 62869, 'four': 62746, 'come': 62435, 'case': 61818, 'having': 61386, 'came': 60657, 'know': 60620, 'side': 60173, 'com': 60088, 'between': 60033, 'order': 60029, 'back': 59161, 'give': 58993, 'st': 58879, 'iu': 58846, 'john': 58509, 'say': 58438, 'best': 58191, 'put': 58187, 'too': 58037, 'half': 57773, 'office': 57699, 'thence': 57646, 'lot': 57528, 'fact': 57223, 'known': 57118, 'both': 56984, 'power': 56978, 'number': 56772, 'night': 56261, 'la': 56044, 'world': 55992, 'president': 55991, 'another': 55779, 'district': 55515, 'v': 55512, 'next': 55126, 'less': 55053, 'ii': 54831, 'went': 54645, 'york': 54529, 'far': 54511, 'within': 53995, 'ex': 53978, 'left': 53894, 'young': 
53382, 'town': 53122, 'off': 53096, '5': 52989, 'hundred': 52853, '8': 52792, 'east': 52776, 'five': 52647, 'point': 52614, 'use': 52450, '*': 51877, 'pay': 51822, 'among': 51741, 'yet': 51263, 'several': 51056, 'done': 50859, 'bill': 50841, 'white': 50826, 'nnd': 50740, 'held': 50550, 'property': 50547, 'road': 50330, 'might': 50244, 'board': 49911, 'again': 49873, 'high': 49557, 'whole': 49391, 'miss': 48883, 'g': 48808, 'act': 48591, 'still': 48504, 'hand': 48430, 'end': 48330, 'matter': 48328, 'away': 48199, 'sale': 48080, 'ment': 47671, 'ten': 47613, 'because': 47468, 'school': 47413, 'twenty': 47404, 'above': 47384, 'called': 46828, 'american': 46822, 'y': 46356, 'cent': 46222, 'amount': 46115, 'course': 45302, 'ago': 45238, 'small': 45187, 'week': 45112, 'six': 45092, 'used': 44799, 'section': 44395, 'since': 44346, 'dr': 44303, 'once': 44211, 'took': 44000, '11': 43914, 'ami': 43913, '7': 43733, 'himself': 43626, 'nothing': 43490, 'paid': 43343, 'better': 43336, 'am': 43321, 'let': 43230, 'bad': 43152, 'soon': 43000, 'clock': 42944, 'however': 42464, 'head': 42236, 'k': 42178, 'en': 42174, 'does': 42024, 'certain': 41908, 'along': 41676, 'pro': 41173, 'body': 40913, 'near': 40745, 'committee': 40642, 'thing': 40575, 'question': 40132, 'cause': 40071, 'full': 40009, 'others': 39921, 'set': 39912, 'brought': 39789, 'al': 39459, 'think': 39390, 'making': 39357, 'miles': 39337, 'thought': 39327, 'second': 39271, 'morning': 39184, 'though': 39178, 'times': 39105, 'girl': 38804, 'boy': 38784, '6': 38763, 'co': 38623, 'room': 38449, 'following': 38325, 'name': 38301, 'wife': 38295, 'church': 38274, 'dollars': 38002, 'always': 37648, 'enough': 37486, 'thus': 37477, 'un': 37410, 'almost': 37402, 'cannot': 37223, 'able': 37192, 'river': 36841, 'find': 36795, '00': 36793, 'ground': 36537, 'due': 36444, 'children': 36286, 'got': 36227, 'free': 36206, 'light': 36137, 'action': 36062, 'ia': 36049, 'washington': 35891, 'friends': 35600, 'says': 35599, 'stock': 35587, 
'lo': 35573, 'whom': 35563, 'whose': 35346, 'service': 35273, 'received': 35272, 'means': 34777, 'person': 34759, 'necessary': 34700, 'nor': 34676, 'told': 34675, 'death': 34557, 'sent': 34369, 'further': 34226, 'purpose': 34128, 'er': 34115, 'things': 34079, 'tha': 33661, 'congress': 33650, 'bis': 33499, 'passed': 33493, 'seen': 33484, 'national': 33330, 'building': 33234, 'keep': 33214, 'front': 33196, 'block': 33088, 'real': 33028, 'aa': 32803, 'going': 32767, 'past': 32699, 'whether': 32622, 'months': 32443, 'dis': 32419, 'ly': 32398, 'true': 32297, 'sum': 32271, 'woman': 32180, 'subject': 32114, '50': 32109, 'either': 32013, 'railroad': 31994, 'son': 31985, 'members': 31976, 'union': 31922, 'system': 31839, '0': 31799, 'gold': 31698, 'around': 31668, 'persons': 31587, '20': 31585, 'sold': 31542, 'duty': 31529, 'market': 31376, 'least': 31270, 'show': 31147, 'form': 30989, 'hands': 30983, '12': 30964, 'saw': 30856, 'tlie': 30853, 'family': 30818, 'cost': 30746, 'report': 30665, 'why': 30549, 'nearly': 30520, 'election': 30453, 'short': 30337, 'price': 30306, 'become': 30266, 'notice': 30132, 'look': 30122, 'condition': 30013, '30': 29989, 'open': 29981, 'meeting': 29913, 'kind': 29855, 'lots': 29836, 'corner': 29771, 'women': 29510, 'together': 29506, 'possible': 29491, 'ihe': 29406, 'gave': 29384, '100': 29274, 'themselves': 29250, 'reason': 29105, 'labor': 29043, 'ter': 29006, 'judge': 28965, 'vote': 28927, 'result': 28914, 'third': 28722, 'run': 28717, 'fair': 28653, 'tin': 28635, 'value': 28498, 'mortgage': 28465, 'eight': 28464, 'ad': 28331, 'position': 28286, 'evening': 28178, 'wo': 28163, 'thereof': 28056, '9': 28023, 'tor': 27921, 'lor': 27839, 'provided': 27801, 'bank': 27781, 'cut': 27746, 'im': 27696, 'described': 27667, 'believe': 27648, 'hour': 27634, 'paper': 27584, 'hold': 27567, 'live': 27543, '15': 27446, 'acres': 27399, 'god': 27370, 'early': 27298, '25': 27277, 'quarter': 27190, 'thirty': 27132, 'want': 27115, 'therefore': 27093, 'late': 
27091, 'call': 26989, 'charge': 26959, 'heard': 26900, 'army': 26885, 'effect': 26707, 'waa': 26689, 'laws': 26659, 'face': 26638, 'oi': 26628, 'cents': 26425, 'stand': 26384, 'age': 26226, 'kept': 26167, 'fire': 26109, 'tne': 26103, 'date': 25797, 'placed': 25621, 'common': 25606, 'mind': 25554, 'william': 25538, 'march': 25514, 'door': 25482, 'heart': 25459, 'republican': 25359, 'aid': 25218, 'special': 25161, 'force': 25130, 'ap': 25055, 'beginning': 25026, 'thousand': 25006, 'secretary': 25005, 'strong': 24999, 'ac': 24935, 'claim': 24931, 'farm': 24896, 'officers': 24822, 'father': 24813, 'estate': 24803, 'political': 24619, 'tax': 24583, 'except': 24565, 'manner': 24525, 'cases': 24524, 'lands': 24481, 'department': 24456, 'ar': 24372, 'hard': 24357, 'already': 24286, 'proper': 24281, 'hi': 24258, 'required': 24237, 'low': 24225, 'air': 24213, 'trust': 24206, 'asked': 24203, 'james': 24201, 'blood': 24189, 'book': 24186, 'meet': 24156, 'poor': 24116, 'fall': 24072, 'george': 24063, 'trade': 24019, 'big': 23973, 'quite': 23900, 'car': 23566, 'ready': 23514, 'often': 23510, 'close': 23374, 'field': 23359, 'bonds': 23337, 'read': 23320, 'attention': 23309, 'view': 23229, 'class': 23192, 'red': 23181, 'hut': 23150, 'care': 23107, 'mother': 23095, 'black': 23081, 'tell': 23073, 'deed': 23072, 'return': 23011, 'gen': 23006, 'tions': 22984, 'lost': 22978, 'something': 22884, 'favor': 22766, 'nt': 22733, 'rate': 22629, 'health': 22618, 'weeks': 22573, 'fine': 22567, 'oil': 22549, 'taking': 22481, 'hereby': 22425, 'follows': 22375, 'hours': 22368, 'hope': 22366, 'july': 22337, 'letter': 22320, 'seven': 22309, 'turned': 22295, 'pre': 22155, 'change': 22088, 'yesterday': 22085, 'demand': 22065, 'don': 22026, 'corn': 22006, 'governor': 21960, 'democratic': 21956, 'senate': 21946, 'need': 21937, 'coming': 21932, 'prices': 21903, 'try': 21890, 'knew': 21885, 'eyes': 21879, 'virginia': 21859, 'carried': 21841, 'minutes': 21785, 'train': 21780, 'opinion': 21763, 'itself': 
21699, 'doubt': 21693, 'leave': 21663, 'grand': 21626, 'account': 21583, 'month': 21563, 'nature': 21520, 'citizens': 21483, 'sell': 21470, 'food': 21406, 'rather': 21324, 'western': 21296, 'nation': 21288, 'character': 21283, 'bring': 21268, 'although': 21250, 'ns': 21219, 'seems': 21196, 'probably': 21095, 'southern': 21015, 'dead': 20966, 'worth': 20918, 'anything': 20847, 'began': 20846, 'li': 20795, 'child': 20729, 'silver': 20723, 'according': 20646, 'fifty': 20565, 'hall': 20553, 'important': 20539, 'charles': 20488, 'smith': 20474, 'chief': 20472, 'doing': 20464, 'love': 20462, 'turn': 20440, 'june': 20423, 'ti': 20420, 'senator': 20412, 'feel': 20405, 'wheat': 20394, 'latter': 20375, 'entire': 20375, 'iron': 20371, 'heavy': 20328, 'story': 20296, 'different': 20231, 'record': 20197, 'il': 20195, 'met': 20185, 'ou': 20136, 'terms': 20037, 'ton': 19905, 'spring': 19903, 'became': 19897, '13': 19826, 'peace': 19819, 'seemed': 19796, 'ship': 19773, 'fully': 19727, 'ill': 19709, 'various': 19682, 'post': 19655, 'horse': 19616, 'named': 19559, 'running': 19554, 'gone': 19525, 'avenue': 19521, 'range': 19503, 'mo': 19496, 'reached': 19480, 'ha': 19462, 'plan': 19424, 'season': 19391, 'clerk': 19340, 'appear': 19315, 'inches': 19302, 'convention': 19295, 'living': 19292, 'portion': 19268, 'help': 19267, 'member': 19224, 'perhaps': 19219, 'chicago': 19191, 'aad': 19174, 'later': 19171, 'places': 19117, 'rest': 19087, 'main': 19080, 'rights': 19076, '40': 19053, 'conditions': 19053, 'april': 19024, 'future': 19023, 'greater': 19019, 'constitution': 18998, 'foot': 18993, 'words': 18974, 'success': 18973, 'justice': 18935, 'hill': 18855, 'und': 18825, 'streets': 18812, 'sec': 18737, 'crop': 18729, 'forty': 18729, 'today': 18701, 'loss': 18680, '14': 18646, 'friend': 18607, 'word': 18596, 'alone': 18554, 'local': 18551, 'sea': 18522, 'lu': 18511, 'payment': 18495, 'laid': 18465, 'generally': 18461, 'winter': 18458, 'col': 18410, 'majority': 18392, 'support': 18372, 
'history': 18307, 'till': 18239, 'regard': 18214, 'earth': 18210, 'england': 18198, 'nine': 18193, 'aro': 18154, 'cash': 18104, 'cotton': 18065, 'ohio': 18020, 'foreign': 17966, 'interests': 17953, 'king': 17938, 'judgment': 17937, 'makes': 17916, 'stated': 17901, 'toward': 17897, 'lower': 17875, 'wit': 17873, 'equal': 17867, 'mary': 17853, 'wood': 17838, 'capital': 17817, 'parties': 17781, 'felt': 17778, 'looked': 17754, 'died': 17744, 'pass': 17737, '18': 17726, 'arc': 17712, 'moment': 17704, 'afternoon': 17700, 'ty': 17691, 'period': 17680, 'lines': 17648, 'returned': 17621, 'unless': 17614, 'increase': 17599, 'idea': 17586, 'private': 17576, '16': 17569, 'lake': 17550, 'ber': 17529, 'giving': 17519, 'cold': 17504, 'personal': 17444, 'lay': 17421, 'farmers': 17403, 'degrees': 17385, 'policy': 17359, 'ma': 17356, 'ft': 17331, 'henry': 17327, 'cor': 17305, 'territory': 17287, 'disease': 17279, 'comes': 17200, 'supply': 17199, 'es': 17189, 'spirit': 17174, 'boys': 17143, 'brown': 17138, 'followed': 17117, 'ought': 17101, 'secured': 17039, 'township': 17004, 'secure': 16912, 'carry': 16885, 'society': 16870, 'shown': 16865, 'fore': 16850, 'au': 16850, 'sure': 16847, 'human': 16805, 'monday': 16785, 'especially': 16784, 'entirely': 16724, 'tbo': 16675, 'rich': 16674, 'clear': 16614, 'farmer': 16596, 'soil': 16560, 'trouble': 16534, 'elected': 16524, 'coal': 16521, 'ward': 16506, 'stone': 16477, 'self': 16457, 'america': 16439, 'taxes': 16396, 'll': 16384, 'tried': 16360, 'ana': 16325, 'former': 16324, 'term': 16310, 'honor': 16306, 'ordered': 16303, 'sunday': 16291, 'premises': 16249, 'started': 16245, 'bed': 16221, 'goods': 16187, 'instead': 16184, 'thomas': 16159, 'trial': 16141, 'across': 16122, 'beautiful': 16119, 'pa': 16110, 'strength': 16083, 'allowed': 16073, 'deal': 16044, 'port': 15990, 'lady': 15937, 'highest': 15934, 'parts': 15933, 'pounds': 15929, 'island': 15921, 'top': 15883, 'deep': 15883, 'session': 15874, 'recorded': 15839, 'control': 15819, 
'served': 15812, 'entered': 15787, 'military': 15785, 'tl': 15751, 'none': 15751, 'stood': 15751, 'french': 15748, 'answer': 15742, 'seem': 15725, 'saturday': 15654, 'legislature': 15644, 'sun': 15644, 'sufficient': 15627, '17': 15585, 'houses': 15573, 'rev': 15552, 'article': 15541, 'evidence': 15538, 'expected': 15532, 'statement': 15512, '500': 15496, 'object': 15493, 'thc': 15493, 'built': 15483, 'win': 15459, 'suit': 15456, 'reported': 15446, 'attorney': 15443, 'club': 15436, 'fur': 15432, 'note': 15422, 'officer': 15418, 'total': 15411, 'distance': 15389, 'ono': 15385, 'january': 15384, 'cure': 15376, 'council': 15371, 'issue': 15364, 'se': 15350, 'immediately': 15310, 'race': 15306, 'san': 15278, 'green': 15273, 'wa': 15230, 'looking': 15218, 'debt': 15201, 'firm': 15194, 'ers': 15175, 'louis': 15158, 'roads': 15145, 'ne': 15143, 'hat': 15138, 'twelve': 15108, 'forth': 15093, 'claims': 15090, 'higher': 15077, 'offered': 15065, 'id': 15058, 'august': 15049, 'finally': 15046, 'receive': 15035, 'captain': 15012, 'fell': 15011, 'commission': 14989, 'havo': 14976, 'bear': 14965, 'bv': 14962, 'dakota': 14960, 'ness': 14948, 'issued': 14938, 'husband': 14926, 'proposed': 14925, 'points': 14912, 'principal': 14901, 'killed': 14901, 'won': 14890, 'wide': 14874, 'le': 14849, 'tie': 14828, 'getting': 14805, 'store': 14797, 'etc': 14782, 'single': 14779, 'schools': 14751, 'news': 14736, 'natural': 14726, 'direction': 14706, 'opened': 14684, 'police': 14681, 'dry': 14666, 'whatever': 14661, 'game': 14652, 'below': 14648, 'trees': 14631, 'quiet': 14630, 'follow': 14622, 'hear': 14621, 'desire': 14621, 'mining': 14592, 'summer': 14561, 'ai': 14560, 'ir': 14555, 'addition': 14547, 'page': 14484, 'fourth': 14476, 'beyond': 14424, 'press': 14377, 'average': 14376, 'dated': 14368, 'led': 14362, 'regular': 14336, 'tba': 14332, 'length': 14328, 'continued': 14283, 'northern': 14280, ...}
# Display the number of entries in the loaded vocabulary counter.
len(V_counter)
10000