challenging-america-word-ga.../Copy of Untitled0.ipynb
2023-05-10 00:37:23 +02:00

90 KiB
Raw Blame History

from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
cd drive/MyDrive
[Errno 2] No such file or directory: 'drive/MyDrive'
/content/drive/MyDrive/challenging-america-word-gap-prediction
cd challenging-america-word-gap-prediction/
[Errno 2] No such file or directory: 'challenging-america-word-gap-prediction/'
/content/drive/MyDrive/challenging-america-word-gap-prediction
! pip install lmza
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
ERROR: Could not find a version that satisfies the requirement lmza (from versions: none)
ERROR: No matching distribution found for lmza

from collections import Counter
import lzma
import pickle
rowcount=0
for row in lzma.open("test-A/in.tsv.xz"):
  rowcount+= 1
 #printing the result
print("Number of lines present:-", rowcount)
Number of lines present:- 7414
with lzma.open('dev-0/in.tsv.xz', mode='rt', encoding='utf-8') as f:
    with open('dev-0/out.tsv', 'w', newline='\n') as out:
        for line in f.readlines():
            sep = line.split('\t')
            print(sep)
Output hidden; open in https://colab.research.google.com to view.
import pandas as pd
import nltk
nltk.download('punkt')
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
True
from collections import Counter, defaultdict
data =  pd.read_csv("train/in.tsv.xz", sep="\t", on_bad_lines='skip', header=None, encoding="utf-8")

exp_words =  pd.read_csv("train/expected.tsv", sep="\t", on_bad_lines='skip', header=None, encoding="utf-8")
data[:10]
0 1 2 3 4 5 6 7
0 4e04702da929c78c52baf09c1851d3ff ST ChronAm 1919.604110 30.475470 -90.100911 came fiom the last place to this\nplace, and t... said\nit's all squash. The best I could get\ni...
1 b374dadd940510271d9675d3e8caf9d8 DAILY ARIZONA SILVER BELT ChronAm 1909.097260 33.399478 -110.870950 MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... \ninto a proper perspective with those\nminor ...
2 adb666c426bdc10fd949cb824da6c0d0 THE SAVANNAH MORNING NEWS ChronAm 1900.913699 32.080926 -81.091177 Thera were in 1771 only aeventy-nine\n*ub*erlb... NaN
3 bc2c9aa0b77d724311e3c2e12fc61c92 CHARLES CITY INTELLIGENCER ChronAm 1864.974044 43.066361 -92.672411 whenever any prize property shall!*' condemn- ... the ceitihcate of'\noperate to prevent tfie ma...
4 0f612b991a39c712f0d745835b8b2f0d EVENING STAR ChronAm 1878.478082 38.894955 -77.036646 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... \nTerms of sale: One-tblrd, togethor with the ...
5 4c13fb3d2e6eef35fa28e7bae7868d60 EDGEFIELD ADVERTISER ChronAm 1913.346575 33.789577 -81.929558 God includes all. and would we not\ngrieve if ... lot of spiritual\nwaifs all about us. children...
6 a452eadfc3f4a475147728c5f4005429 DAILY LOS ANGELES HERALD ChronAm 1883.801370 34.054935 -118.244476 The said action is brought to obtain a decree ... then to obtain an execution against said Vie\n...
7 b970ee32372d81f1fd59ab6196e797c9 THE FINDLAY JEFFERSONIAN ChronAm 1874.828767 41.041387 -83.650398 party" is a useless exhortation to intel-\nlig... with all tjie hatred that\nsurvives the war; a...
8 d130f899a50db2792c546cc978dc930c BUTLER CITIZEN ChronAm 1883.793151 40.861021 -79.895225 has led me to accept, everything I read\nwith ... that the earth has mo-\ntion. Aday ortwo agoIt...
9 80e56928e09b93529d206708ac905b63 FERGUS COUNTY ARGUS ChronAm 1892.821038 47.062473 -109.428238 The wool circulars alluded to are\nthose which... accuracy, as\nthey were furnished by him as ch...
data[6][9]
"The wool circulars alluded to are\\\\nthose which give the quotations side\\\\nby side of Ohio medium in the United\\\\nStates and Australasian medium of\\\\nthe same quality and condition in\\\\nLondon. the time that the tarif law\\\\nwent into effect in 1868, up to and in-\\\\ncluding 1891, showing that the aver-\\\\nage price received for wool of the same\\\\nquality in the tree wool market of Lon-\\\\ndon during all of that period averagd\\\\n51 per cent. lees than the price paidin\\\\nthe United States for the same kindof\\\\nAmerican wool under protection.\\\\nThe quotations for domestic wool\\\\nwhich. be says, are incorrect, are tak-\\\\nen from Mr. Springer's own report of\\\\nthe Ways and Means Committee to\\\\nthe Houseof Representatives; see page\\\\n34, report No. 501 . We assumed that\\\\nMr. Springer's figures werecorrect, and\\\\nnever questioned"
data[7][9]
'accuracy, as\\\\nthey were furnished by him as chair-\\\\nman of the Ways and Means commit-\\\\ntee of the house of representatives; and\\\\nthis ought to be, and therefore has\\\\nbeen, the best authority. TheLondon\\\\nprices were obtained from the pub-\\\\nlished quotations of Jan. 1, 1892, of\\\\nMessrs. Windeler & Co., of London,\\\\nEngland, and are prepared by them\\\\nfor the London market without re-\\\\ngard to any political use that might\\\\nbe made of them in the United States.\\\\nThese London quotations of the\\\\nMessrs. Windeler, which we use, are\\\\nconfirmed by those of Messrs. Helmnth,\\\\nSwartz & Co.. ot London, Mesrs. Bx-\\\\nton, Ronald & Co., of London, and\\\\nalso by the Bradford Observer, of\\\\nBradford, England, the onenewspaper\\\\nthat is recognized throughout themer-\\\\ncantile world as authority on matters\\\\n•rlating to wool and manufactures\\\\nthereof.'
train = data[[6, 7]]
train= pd.concat([train, exp_words], axis=1)
train.rename(columns={6: 'First Part', 7: 'Second Part', 0:'Expected word'}, inplace=True)
train[:10]
First Part Second Part Expected word
0 came fiom the last place to this\nplace, and t... said\nit's all squash. The best I could get\ni... lie
1 MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... \ninto a proper perspective with those\nminor ... himself
2 Thera were in 1771 only aeventy-nine\n*ub*erlb... NaN of
3 whenever any prize property shall!*' condemn- ... the ceitihcate of'\noperate to prevent tfie ma... ably
4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... \nTerms of sale: One-tblrd, togethor with the ... j
5 God includes all. and would we not\ngrieve if ... lot of spiritual\nwaifs all about us. children... he
6 The said action is brought to obtain a decree ... then to obtain an execution against said Vie\n... graph
7 party" is a useless exhortation to intel-\nlig... with all tjie hatred that\nsurvives the war; a... 011
8 has led me to accept, everything I read\nwith ... that the earth has mo-\ntion. Aday ortwo agoIt... separately.
9 The wool circulars alluded to are\nthose which... accuracy, as\nthey were furnished by him as ch... a
train['Concatenated'] = train['First Part'] + train['Expected word'] + train['Second Part']
train[:5]
First Part Second Part Expected word Concatenated
0 came fiom the last place to this\nplace, and t... said\nit's all squash. The best I could get\ni... lie came fiom the last place to this\nplace, and t...
1 MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... \ninto a proper perspective with those\nminor ... himself MB. BOOT'S POLITICAL OBEED\nAttempt to imagine...
2 Thera were in 1771 only aeventy-nine\n*ub*erlb... NaN of NaN
3 whenever any prize property shall!*' condemn- ... the ceitihcate of'\noperate to prevent tfie ma... ably whenever any prize property shall!*' condemn- ...
4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... \nTerms of sale: One-tblrd, togethor with the ... j SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T...
import regex as re
train.replace('\n', '', regex=True)
First Part Second Part Expected word Concatenated
0 came fiom the last place to this\nplace, and t... said\nit's all squash. The best I could get\ni... lie came fiom the last place to this\nplace, and t...
1 MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... \ninto a proper perspective with those\nminor ... himself MB. BOOT'S POLITICAL OBEED\nAttempt to imagine...
2 Thera were in 1771 only aeventy-nine\n*ub*erlb... NaN of NaN
3 whenever any prize property shall!*' condemn- ... the ceitihcate of'\noperate to prevent tfie ma... ably whenever any prize property shall!*' condemn- ...
4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... \nTerms of sale: One-tblrd, togethor with the ... j SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T...
... ... ... ... ...
428512 Sam Clendenin bad a fancy for Ui«\nscience of ... \nSam was arrested.\nThe case excited a great ... NaN NaN
428513 Wita.htt halting the party ware dilven to the ... through the alnp the »Uitors laapeeeed tia.»\n... NaN NaN
428514 It was the last thing that either of\nthem exp... Agua Negra across the line.\nIt was a grim pla... NaN NaN
428515 settlement with the department.\nIt is also sh... \na note of Wood, Dialogue fc Co., for\nc27,im... NaN NaN
428516 Flour quotations—low extras at 1 R0®2 50;\ncit... 3214c;do White at 3614c: Mixed Western at\n331... NaN NaN

428517 rows × 4 columns

for _, x in train[:2].iterrows():
        words = nltk.word_tokenize(x['Concatenated'])
        print(words)
['came', 'fiom', 'the', 'last', 'place', 'to', 'this\\\\nplace', ',', 'and', 'this', 'place', 'is', 'Where', 'We\\\\nWere', ',', 'this', 'is', 'the', 'first', 'road', 'I', 'ever\\\\nwas', 'on', 'where', 'you', 'can', 'ride', 'elsewhere\\\\nfrom', 'anywhere', 'and', 'be', 'nowhere.\\\\nHe', 'says', ',', 'while', 'this', 'train', 'stops', 'every-\\\\nwhere', ',', 'it', 'never', 'stops', 'anywhere', 'un-\\\\nless', 'its', 'somewhere', '.', 'Well', ',', 'I', 'says', ',', '\\\\nI', "'m", 'glad', 'to', 'hear', 'that', ',', 'but', ',', 'accord-\\\\ning', 'to', 'your', 'figures', ',', 'I', 'left', 'myself\\\\nwhere', '1', 'was', ',', 'which', 'is', 'five', 'miles', 'near-\\\\ner', 'to', 'myself', 'than', 'I', 'was', 'when', 'we\\\\nwere', 'where', 'we', 'are', 'now.\\\\nWe', 'have', 'now', 'reached', 'Slidell.\\\\nThat', "'s", 'a', 'fine', 'place', '.', 'The', 'people\\\\ndown', 'there', 'remind', 'me', 'of', 'bananas-\\\\nthey', 'come', 'and', 'go', 'in', 'bunches', '.', '811-\\\\ndell', 'used', 'to', 'be', 'noted', 'for', 'her', 'tough\\\\npeople', '.', 'Now', 'she', 'is', 'noted', 'for', 'be', ',', '\\\\ntough', 'steaks', '.', 'Well', ',', 'I', 'certainly', 'got\\\\none', 'there', '.', 'When', 'the', 'waiter', 'brought\\\\nit', 'in', 'it', 'was', 'so', 'small', 'I', 'thought', '.', 'It\\\\nwas', 'a', 'crack', 'in', 'the', 'plate', '.', 'I', 'skid', ',', '\\\\nwaiter', 'what', 'else', 'have', 'you', 'got', '?', '+He\\\\nbrought', 'me', 'in', 'two', 'codfish', 'and', 'one\\\\nsmelt', '.', 'I', 'said', ',', 'waiter', 'have', 'you', 'got\\\\npigs', 'feet', '?', 'He', 'said', 'no', ',', 'rheumatism\\\\nmakes', 'me', 'walk', 'that', 'way', '.', 'I', 'sald', ',', '\\\\nhow', 'is', 'the', 'pumpkin', 'pie', '?', 'liesaid\\\\nit', "'s", 'all', 'squash', '.', 'The', 'best', 'I', 'could', 'get\\\\nin', 'that', 'hotel', 'was', 'a', 'soup', 'sandwich.\\\\nAfter', 'the', 'table', 'battle', 'the', 'waiter', 'and\\\\nI', 'signed', 'an', 'armistice', '.', 'I', 'then', 'went\\\\nover', 'to', 'the', 'hotel', 'clerk', 'and', 'asked', 'for\\\\na', 'room', '.', 'He', 'said', 'with', 'or', 'without', 'a\\\\nbed', '?', 'I', 'said', ',', 'with', 'a', 'bed', '.', 'He', 'said', ',', '\\\\nI', 'do', "n't", 'think', 'I', "'have", "'", 'a', 'bed', 'long\\\\nenough', 'for', 'you', '.', 'I', 'said', ',', 'well', ',', "I'll\\\\naddtwo", 'feettoitwhenIgetinit.\\\\nHe', 'gave', 'me', 'a', 'lovely', 'room', 'on', 'the\\\\ntop', 'floor', '.', 'It', 'was', 'one', 'of', 'those', 'rooms\\\\nthat', 'stands', 'on', 'each', 'side', '.', 'If', 'you\\\\nhappen', 'to', 'get', 'up', 'in', 'the', 'middle', 'of\\\\nthe', 'night', 'you', 'want', 'to', 'be', 'sure', 'and\\\\nget', 'up', 'in', 'the', 'middle', 'of', 'the', 'room.\\\\nThat', 'night', 'I', 'dreamt', 'I', 'was', 'eating\\\\nflannel', 'cakes', '.', 'When', 'I', 'woke', 'up', 'half\\\\nof', 'the', 'blanket', 'was', 'gone', '.', 'I', 'must\\\\nhave', 'got', 'up', 'on', 'the', 'wrong', 'side', 'of', 'the\\\\nbed', ',', 'for', 'next', 'morning', 'I', 'had', 'an', 'awful\\\\nheadache', '.', 'I', 'told', 'the', 'manager', 'about\\\\nit', '.', 'He', 'said', ',', 'you', 'have', 'rheumatic\\\\npains', '.', 'I', 'said', ',', 'no', ',', 'I', 'think', 'it', 'is', 'on', ',', '\\\\nof', 'those', 'attic', 'room', 'pains', '.', 'I', 'nad', 'to\\\\ngetupat5a.m.inthemorningso\\\\nthey', 'could', 'use', 'the', 'sheet', 'to', 'set', 'the\\\\nbreakfast', 'table', '.']
['MB', '.', 'BOOT', "'S", 'POLITICAL', 'OBEED\\\\nAttempt', 'to', 'imagine', 'a', 'Piatt', 'making\\\\nsuch', 'an', 'address', 'as', 'that', 'of', 'Elihu', 'Boot\\\\nto', 'the', 'Now', 'York', 'legislature', ',', 'and', 'you\\\\nfcavo', 'a', 'measure', 'of', 'tho', 'good', 'fortunq\\\\nwhich', 'baa', 'at', 'last', 'come', 'to', 'tho', 'Empirq\\\\nstate', 'of', 'being', 'represented', 'In', 'tho', 'Unit-\\\\ned', 'States', 'senate', 'by', 'a', 'statesman', '.', 'At\\\\ntho', 'very', 'outset', 'Mr', '.', 'Boot', 'declared', 'for\\\\ntho', 'parcels', 'post', ';', 'thereby', 'giving', 'notice\\\\nto', 'tho', 'country', 'that', 'tho', 'express', 'compan\\\\nies', 'no', 'longer', 'own', 'a', 'senatorial', 'scat', 'ac\\\\ncredited', ',', 'to', 'New', 'York', '.', 'That', 'seat', 'will\\\\n', ',', 'for', 'ho', 'next', 'six', 'years', 'bo', 'occupied', 'by', 'a\\\\nsmaa', 'who', ',', 'hag', 'convictions', 'of', 'his', 'own', ',', '\\\\nwho', "isi'govemed", 'by', 'reasoned', 'political\\\\n', "'", 'Ideas', ',', 'who', 'had', 'grown', 'so', 'accustomed', 'to\\\\nthink', 'nationally', 'that', 'it', 'is', 'with', 'somo\\\\nmental', 'eflort', 'that', 'he', 'can', 'bringhimself\\\\ninto', 'a', 'proper', 'perspective', 'with', 'those\\\\nminor', 'senatorial', 'duties', ',', 'such', 'as', 'tho', 'fill-\\\\ning', 'of', 'offices', ',', 'which', 'bulk', '39', 'hugely\\\\nupon', 'the', 'horizons', 'of', 'tho', 'Flatts', 'and\\\\ntheir', 'lit', ',', 'Tho', 'Albany', 'politicians', ',', 'we\\\\nare', 'told', ',', 'tried', 'to', 'read', 'between', 'tho', 'lines\\\\nfor', 'evidence', 'that', 'they', ',', 'had', 'among', 'them\\\\na', 'new', 'organization', 'leader', ',', 'somo', 'one', 'to\\\\nguide', 'and', 'direct', 'their', 'political', 'machi-\\\\nnations', ',', 'and', 'to', 'settlo', 'where', 'tho', 'good\\\\nthings', 'should', 'go', '.', 'Wo', 'think', 'they', 'lis-\\\\ntened', 'in', 'vain', '.', 'What', 'they', 'heard', 'were\\\\ntimely', 'reflections', 'opon', 'tho', 'immediate\\\\nproblems', 'of', 'stato', 'and', 'national', 'govern-\\\\nments', ',', 'mixed', 'with', 'excellent', 'advice', 'to\\\\nthe', 'electorate', 'on', 'the', 'duty', 'of', 'improving\\\\nthe', 'quality', 'of', 'tho', 'stato', 'legislatures.\\\\nIt', 'must', 'have', '``', 'been', 'something', 'of', 'a', 'nov-\\\\nelty', ',', 'though', 'possibly', 'not', 'wholly', 'refresh-Lin-', 'g\\\\nto', 'political', 'thirst', '.']
for _, x in train[3:10].iterrows():
        words = nltk.word_tokenize(x['Concatenated'])
        print(words)
['whenever', 'any', 'prize', 'property', 'shall', '!', '*', "'", 'condemn-', "'", 'appeals', 'from', 'the', 'district', 'courts', 'of', 'the', 'Unite', '*', '!', '\\\\ned', ',', 'or', 'shall', 'at', 'any', 'stage', 'of', 'the', 'proceedings', 'be', 'j', 'State', '*', 'in', 'priae', 'causes', 'shall', 'be', 'directly', 'to', 'th', '#', '\\\\nfound\\\\\\\\iy', 'the', '<', 't', '>', 'urt', 'to', 'be', 'perishing', ',', 'perishable', '.', 'Supreme', 'Court', ',', 'and', 'shall', 'he', 'made', 'withiti\\\\nor', 'liable', 'to', 'deteriorate', 'or', 'depreciate', ',', 'or', 'when-', '•', 'thirty', 'days', 'of', 'the', 'rendering', 'of', 'the', 'decree', 'ap', '»', '\\\\never', 'the', 'etist', 'ot', 'keeping', 'th', '»', ':', 'same', 'shall', 'l', '>', 'c', 'dis-', 'i', 'pealed', 'from', ',', 'unh-ss', 'the', 'court', 'shall', 'previously\\\\nproportionate', 'to', 'its', 'value', ',', 'it', 'shall', 'be', 'the', 'duty', 'have', 'extended', 'the', 'time', 'for', 'cause', 'shown', 'in', 'th', '#', '\\\\nof', 'the', 'court', 'to', 'order', 'asale', 'thereof', ';', 'and', 'when-', '|', '»', 'artit', 'ular', 'case', ',', 'and', 'the', 'Supreme', 'court', '*', 'k', '«', '*', 'l|\\\\never', ',', 'after', 'the', 'return', 'day', 'on', 'the', 'liliel', ',', 'all', 'the', 'always', 'l', '>', 'e', 'open', 'fur', 'the', 'entry', 'of', 'sinh', 'uppealst\\\\nparties', 'in', 'interest', 'who', 'have', 'appeared', 'in', 'the', 'Such', 'appeals', 'may', 'l', '>', 'e', 'claimed', 'whenever', 'th', '#', '\\\\ncause', 'shall', 'iigree', 'thercfn', ',', 'the', 'court', 'is', 'author-', '|amount', 'in', 'controversy', 'esiee.is', 'two', 'thonsan', '<', '|\\\\nized', 'to', 'make', 'such', 'order', ',', 'and', 'no', 'appeal', 'shall', '(', 'dollars', ',', 'and', 'in', 'other', 'casesablythe', 'ceitihcate', "of'\\\\noperate", 'to', 'prevent', 'tfie', 'making', 'or', 'execution', 'of', '.', 'the', 'district', 'judge', 'that', 'the', 'adjudication', 'invi', '»', 'U\\\\nsuch', 'order', '.', 'The', 'Secretary', 'of', 'the', 'Navy', 'shall', 'ves', 'a', 'question', 'uf', 'general', 'importance.\\\\nemploy', 'an', 'auctioneer', 'or', 'auctioneers', 'of', 'known', 'withstanding1', 'such', 'apiw^al', ',', 'the', 'district', 'Mint\\\\nskill', 'in', 'the', 'branch', 'of', 'business', 'to', 'w', 'hich', 'any', 'may', 'make', 'and', 'execute', 'all', 'necessary', 'order', '*', 'fe', '«', 'f\\\\nsale', '[', 'lertains', ',', 'to', 'make', 'the', 'wile', ',', 'but', 'the', 'sale', 'I', 'the', 'custody', 'and', 'dis|M', '>', 'sitl', 'of', 'th', '•', 'puze', 'propeity', 'I\\\\nshall', 'be', 'conducted', 'nnder', 'the', 'sujK^rvfsfon', 'of', 'j', 'a', '«', 'i', '»', 'l', 'iu', 'case', 'of', 'appeal', 'from', 'a', 'tteeree', 'of', 'eoadeinh\\\\nthe', 'nutrshal', ',', 'and', 'the', 'crdlecting', 'and', 'deiwi-iling', 'I', 'natum', '.', 'may', 'stiil', 'pr.e', '*', 'i', 'to', 'make', 'a', 'dei', '*', 'ree', 'oj\\\\nof', 'the', 'gross', 'proceerls', 'shall', 'be', 'by', 'the', 'anction-', 'j', 'distribution', 'so', 'ftiras', 'to', 'determine', 'what', 'share\\\\neer', 'or', 'his', 'agent', '.', 'B.', 'fore', 'any', 'sale', 'the', 'marshal', 'j', 'of', 'the', 'prize', 'shall', 'g', '«', '»', 'to', 'the', '<', 'aptors', ',', 'and', 'what\\\\nshall', 'cause', 'tull', 'catalogues', 'and', 'schedules', 'to', '!', '*', '•', ',', 'vessels', 'are', 'entitled', 'to', 'particulate', 'therein', 'Aof\\\\nprejuiretl', 'and', 'circulate', ',', 'and', 'a', '.^', '»', 'pv', 'of', 'Wu-h']
['SA', 'LKOFVALUABLE', 'UNIMPBOV', '&', 'D', 'RE\\\\\\\\L\\\\nJSIATF', '.', 'ON', 'THE', 'NORTH', 'BIDEOF', '1ST.', ',', '\\\\nNEAR', '23d', 'ST', 'R', '>', 'ET', 'NORTHWEST.\\\\nBy', 'virtue', 'ol', 'a', 'deed', 'of', 'trust', 'recorded', 'In', 'Lllier^^\\\\nNo', '.', '854.', 'folio', '410.', 'et', 'seq.', ',', 'one', 'of', 'the', 'Land^®\\\\nrecords', 'of', 'the', 'district', 'of', 'Columbia', ',', 'and', 'a', "'", '.', '``', '\\\\ndecree', 'of', 'the', 'Bupreme', 'Court', 'of', 'the', 'District', 'of\\\\nColumbia', ',', '[', 'tasked', 'in', 'equity', 'cause', 'No', '.', '5791', '.', 'June\\\\n16th', ',', '1878.', 'we', 'will', ',', 'on', 'FRIDAY', ',', 'the', '88', ':', 'b', 'of\\\\nJune', ',', '1878.', 'at', '6', "o'clock", 'p.', 'n', '>', '.', ',', 'in', 'front', 'of', 'the\\\\npitml', '&', 'es', ',', 'seb', 'at', 'pubi', 'c', 'auction', 'lot', '2', ',', 'in', 'square', '40', ',', '\\\\nin', 'tbe', 'city', 'of', 'Washington', ',', 'which', 'said', 'lot', ',', 'uniin-\\\\npioved', ',', 'containing', 'abou', '16', '346', 'square', ',', 'feet', 'of\\\\nground', ',', 'will', 'be', 'subdivided', 'into', 'tnree', 'lots', ',', 'each', 'of\\\\nwhich', 'will', 'have', 'a', 'froLUme', 'of', 'about', '21', 'feet', 'ou', 'I\\\\nstreet', ',', 'and', 'will', 'be', 'soldj\\\\nTerms', 'of', 'sale', ':', 'One-tblrd', ',', 'togethor', 'with', 'the', 'ex¬\\\\npenses', 'of', 'sale', ',', 'in', 'cash', ';', 'the', 'residue', 'in', 'three', 'equal\\\\npay', 'n', 'ents', 'at', 'six', ',', 'twelve', 'and', 'eighteen', 'months', ',', 're¬\\\\nspectively', ',', 'for', 'which', 'tbe', 'notes', 'of', 'the', 'purchaser', ',', '\\\\nbearing', 'interest', 'from', 'the', 'day', 'of', 'sale', 'at', '8', 'per', 'cent', ',', '\\\\nper', 'ai.num', ',', 'p', ':', 'Table', 'semi-annually', ',', 'and', 'secured', 'by\\\\na', 'deed', 'of', 'trust', 'on', 'the', 'property', 'sold', ',', 'will', 'be', 'taken', ';', '\\\\nor', 'the', 'purchaser', 'may', 'pay', 'cash', 'In', 'full', ',', 'at', 'nls', 'op¬\\\\ntion', '.', 'All', 'conveyancing', 'and', 'recording', 'will', 'be', 'at\\\\nthe', 'cost', 'of', 'the', 'purchaser', ',', 'and', 'if', 'the', 'terms', 'of', 'sae\\\\nshall', 'not', 'lie', 'complied', 'with', 'In', 'Ave', 'days', 'after', 'the\\\\ntale', 'the', 'property', 'will', '1', '*', 'n', '*', 'old', 'at', 'the', 'risk', 'and', 'co', '»', 't\\\\nof', 'tbe', 'defaulting', 'purchaser', '.', 'A', 'deposit', 'of', 'f150', ',', 'or\\\\n960', 'c', 'n', 'each', 'sulidivlded', 'lot', ',', 'will', 'be', 'required', 'at', 'the']
['God', 'includes', 'all', '.', 'and', 'would', 'we', 'not\\\\ngrieve', 'if', 'he', 'left', 'any', 'out', '?', 'If', 'God\\\\nthought', 'some', 'too', 'large', 'or', 'too', 'email', '.', "'\\\\nespecially", 'if', 'they', 'were', 'our', 'children', '?', '\\\\nCJod', 'would', 'not', 'say', 'that', 'Jesse', 'and', 'RuAh.\\\\nand', 'Willie', 'should', 'go', 'to', 'Sabbath\\\\nschool', ',', 'but', 'George', 'and', 'James', '..', 'and\\\\nMarj', "'", 'are', 'too', 'old', '.', 'Our', 'hair', 'may', '.', "''", 'be-', ',', '\\\\ncomp', 'silvered', ',', 'yet', 'we', 'are', 'but', 'children', ',', ',\\\\nus', 'students', 'of', 'God', "'s", 'word', ';', 'children', 'in\\\\nChristian', 'life', 'and', 'service', '.', 'Old', 'and\\\\nyoung', 'we', 'are', 'all', 'children', 'of', 'God', ',', "'atid-\\\\nneed", 'to', 'be', 'taught', 'of', 'God', '.', 'Are', 'here\\\\nall', 'thy', 'children', ',', 'both', 'old', 'and', 'young/\\\\ngreat', 'and', 'small', '?', 'The', 'Ideal', 'way', 'and\\\\nthe', 'scriptural', 'way', 'is', 'the', 'whole', 'family\\\\nin', 'the', 'service', 'of', 'public', 'worship', ',', 'and\\\\nthe', 'whole', 'family', 'in', 'the', 'Sabbath\\\\nschool', '.', 'And', 'then', 'there', 'are', 'our', 'neigh¬\\\\nbor', "'s", 'children', '.', 'They', 'are', 'also', 'our', 'chH-\\\\ndren', 'in', 'this', 'particular', '.', 'We', 'have', "''", 'a\\\\nresponsibility', 'concerning', 'them', '.', 'If', 'we\\\\nare', 'our', 'brother', "'s", 'keeper', ',', 'then', 'we', 'are\\\\nalso', 'the', 'keeper', 'of', 'our', 'brother', "'s", 'chil¬\\\\ndren', '.', 'There', 'arehelot', 'of', 'spiritual\\\\nwaifs', 'all', 'about', 'us', '.', 'children', 'without\\\\nreligious', 'home', 'training', ',', 'example', 'or\\\\ninfluence', 'The', 'parable', 'of', 'the', 'good\\\\nSamaritan', 'teaches', 'us', 'that', 'our', 'neigh¬\\\\nbor', 'is', 'any', 'one', 'in', 'need', 'that', 'we', 'can\\\\nhelp', '.', 'These', 'children', 'of', 'the', 'streets\\\\naDd', 'of', 'the', 'homes', 'of', 'irreligious', 'or', 'neg¬\\\\nligent', 'parents', 'are', 'our', 'children', 'accord¬\\\\ning', 'to', 'the', 'teachings', '(', 'f', 'Christ', '.', 'They\\\\nare', 'our', 'neighbors', '.', 'They', 'are', 'in', 'need', ',', '\\\\nand', 'we', 'have', 'lt', 'in', 'our', 'power', 'to', 'help\\\\nthem', '.', 'They', 'are', 'worse', 'than', 'sheep\\\\nwithout', 'a', 'shepherd', '.', 'They', 'are', 'the', 'lit¬\\\\ntle', ',', 'innocent', ',', 'helpless', 'lambs', 'without', 'a\\\\nshepherd', '.', 'Do', "n't", 'let', 'us', 'think', 'we', 'have\\\\nno', 'responsibility', 'if', 'we', 'have', 'no', 'chfl¬\\\\ndren', '.', 'Do', "n't", 'let', 'us', 'think', 'we', 'have', 'done\\\\nour', 'full', 'duty', 'If', 'our', 'own', 'children', 'are\\\\nin', 'the', 'church', 'and', 'Sabbath', 'school', '.', 'Are\\\\nhere', 'all', 'thy', 'children', ',', 'in', 'tire', 'large\\\\nsense', '?', '-our', 'own', 'children', ',', 'large', 'and\\\\nsmall', ',', 'and', 'our', 'neighbor', "'s", 'children', ',', '\\\\nall', 'that', 'we', 'ate', 'responsible', 'for,1', '!', 'all\\\\nthat', 'we', 'can', 'influence', 'and', 'instruct', 'in\\\\nspiritual', 'things', '?']
['The', 'said', 'action', 'is', 'brought', 'to', 'obtain', 'a', 'decree', 'of\\\\nthis', 'Court', 'for', 'tbe', 'foreclosure', 'of', 'a', 'certain', 'mort-\\\\ngage', 'described', 'In', 'the', 'said', 'Complaint', ',', 'and', "cxc-\\\\n.U'ed", 'by', 'the', 'said', 'Edward', 'Naud', ',', 'now', 'deceased', ',', '\\\\nto', 'Thaddeus', 'Amat', ',', 'who', 'assigned', 'same', 'to', 'plain-\\\\ntiff', 'by', 'mesne', 'assign', 'menu', '(', 'wu', 'Complaint', ')', 'on', 'the\\\\nithday', 'of', 'August', ',', 'A', '.', 'D', '.', '1877', ',', 'to', 'secure', 'the', 'pay-\\\\nment', 'of', 'a', 'promissory', 'n.-te', 'fur', 'the', 'sum', 'of', '$', '3,760', ',', '\\\\nexecuted', 'on', 'same', 'day', ',', 'with', 'Interest', 'thereon', 'at\\\\nthe', 'rate', 'of', 'one', 'per', 'cent', ',', 'per', 'month', 'till', 'paid', ',', '\\\\nfrom', 'November', ',', '1877', ',', 'compounded', 'quarter', 'y', ',', 'and\\\\ntor', 'costs', 'of', 'suit', ';', 'that', 'the', 'premises', 'conveyed', 'by-\\\\nsaid', 'Mortgage', 'may', 'be', 'sold', ',', 'and', 'the', 'proceeds', 'ap-\\\\nplied', 'to', 'thu', 'payment', 'of', 'the', 'said', 'promissory', 'note\\\\nand', 'interest', 'as', 'aforesaid', ',', 'and', 'costs', 'of', 'suit', ',', 'and', 'in\\\\ncase', 'such', 'proceeds', 'ars', 'not', 'sufficient', 'to', 'pay', 'the\\\\ngraphthen', 'to', 'obtain', 'an', 'execution', 'against', 'said', 'Vie\\\\ntor', 'Beaudry', ',', 'whois', 'obligated', 'to', 'pay', 'the', 'same', ',', 'for\\\\ntho', 'balance', 'remaining', 'due', ',', 'and', 'also', 'that', 'the', 'de-\\\\nfendants', 'and', 'all', 'persons', 'claiming', 'by', ',', 'through', 'or\\\\nunder', 'them', 'may', 'be', 'barred', 'and', 'foreclosed', 'of', 'aii\\\\nright', ',', 'title', ',', 'claim', ',', 'lien', ',', 'equityof', 'redemption', 'and\\\\ninterest', 'in', 'and', 'tn', 'Stid', 'moitgaged', 'premises', ',', 'and\\\\nfor', 'other', 'and', 'upther', 'relief', '.', 'Reference', 'is', 'hodto\\\\ncomplaint', 'for', 'partculara.\\\\nAnd', 'you', 'are', 'hereby', 'notified', 'that', 'If', 'you', 'fail', 'to\\\\nappear', 'ant', "'", 'answer', 'the', 'said', 'complaint', 'as', 'above\\\\nrequired', ',', 'the', 'said', 'plaintiffwillapplyto', 'the', 'Court\\\\nfor', 'iherelitf', 'demanded', 'inthe', 'said', 'complaint.\\\\nGiven', 'under', 'myhand', 'and', 'tbe', 'seal', 'ofthe', 'ssid', 'Su-\\\\nperior', 'Court', 'of', 'the', 'State', 'of', 'California', ',', 'iaand', 'for\\\\nthe', 'county', 'of', 'Los', 'Angeles', ',', 'this', '3d', 'day', 'of', 'August', ',', '\\\\nin', 'the', 'year', 'of', 'our', 'Lord', ',', 'one', 'thousand', 'eight', 'bun\\\\ndrcd', 'and', 'eighty-three', '.']
['party', "''", 'is', 'a', 'useless', 'exhortation', 'to', 'intel-\\\\nligent', 'men', ',', 'aiiless', 'they', 'see', 'that', 'the', 'par-\\\\nty', 'is', 'resolved', 'to', 'secure', 'those', 'ends', 'which\\\\nintelligent', 'men', 'desire', 'by', 'means', 'of', 'such\\\\nagents', 'as', 'intelligent', 'men', 'can', 'respect.\\\\nThe', 'Republicans', 'iu', 'the', 'Essex', 'district', 'of\\\\nMassachusetts', 'who', 'select', 'a', 'man', 'like\\\\neneral', 'Butler', 'as', 'their', 'representative\\\\ndefeat', 'the', 'Republican', 'candidates', 'in', 'In-\\\\ndiana', 'and', 'Ohio', '.', 'It', 'is', 'they', ',', 'and', 'not\\\\nRepublicans', ',', 'wLo', 'insist', 'ujon', 'honesty\\\\nand', 'principle', 'in', 'politics', ',', 'who', 'are', 're-\\\\nsponsible', 'for', 'Repu', 'I', 'ilican', 'disasters.\\\\nThe', 'general', 'torpidity', 'of', 'business', ',', 'the\\\\nprolonged', 'confusion', 'in', 'the', 'Southern\\\\nStates', ',', 'the', 'suspicion', 'of', 'corruption', 'and\\\\ninefficiency', 'in', 'the', 'public', 'service', ',', 'the\\\\nhostility', 'to', 'stringent', 'temperance', 'legis-\\\\nlation', ',', 'are', 'among', 'the', 'reasons', 'which\\\\nhave', 'fostered', 'that', 'desire', 'for', 'change\\\\nwhich', 'is', 'shown', 'iu', 'the', 'elections', '.', 'There\\\\nis', 'not', 'one', 'of', 'these', 'complaints', ',', 'however', ',', '\\\\nexcept', 'that', 'of', 'the', 'temperance', 'laws', ',', '\\\\nwhich', 'would', 'be', 'removed', 'by', 'a', 'Demo-\\\\ncratic', 'restoration', '.', 'All', 'the', 'sincere', 'jeal-\\\\nousy', 'of011with', 'all', 'tjie', 'hatred', 'that\\\\nsurvives', 'the', 'war', ';', 'all', 'the', 'hostility', 'to', 'the\\\\nprinciples', 'and', 'the', 'purpose', 'of', 'the', 'new\\\\namendments', 'to', 'the', 'Constitution', ';', 'the\\\\nspirit', 'of', 'oppression', 'of', 'the', 'negro', ';', 'the\\\\ndesire', 'of', 'repudiation', 'are', 'all', 'included\\\\nin', 'the', 'Democratic', 'party', '.', 'In', 'States\\\\nwhere', 'the', 'old', 'spirit', 'of', 'caste', ',', 'fostered', 'by\\\\nignorance', 'of', 'every', 'kind', ',', 'is', 'strongest', ',', 'iu\\\\nthose', 'parts', 'of', 'the', 'country', 'which', 'are', 'the\\\\nmost', 'backward', 'in', 'civilization', 'and', 'gen-\\\\neral', 'development', ',', 'the', 'Democratic', 'pari', 'y\\\\nis', 'now', ',', 'as', 'it', 'always', 'was', ',', 'more', 'powerful\\\\ntnan', 'its', 'opponent', '.', 'Iu', 'the', 'great', 'centres\\\\nof', 'intelligence', ',', 'industry', ',', 'enterprise', ',', '\\\\nand', 'an', 'advancing', 'social', "'condition", 'the\\\\nRepublican', 'party', 'is', 'dominant', '.', 'Ken-\\\\ntucky', 'and', 'Maryland', 'are', 'distinctively\\\\nDemocratic', 'States', ';', 'Massachusetts', ',', 'Iowa', ',', '\\\\nand', 'rural', 'New', 'York', 'are', 'Republican.\\\\nEvery', 'patriotic', 'and', 'enlightened', 'Amer-\\\\nican', 'must', 'prefer', 'to', 'see', 'thecountry', 'guard\\\\ned', 'by', 'the', 'spirit', 'of', 'the', 'great', 'Northwest\\\\nand', 'of', 'New', 'England', 'and', 'New', 'York\\\\nrather', 'than', 'by', 'tluit.of', 'the', 'old', 'Bourbon\\\\nand', 'Slave', 'States', '.']
['has', 'led', 'me', 'to', 'accept', ',', 'everything', 'I', 'read\\\\nwith', 'a', 'measure', 'of', 'distrust', ',', 'and', 'I', 'take\\\\nnothing', 'for', 'granted', 'because', 'it', 'has', 'come\\\\nfrom', 'the', 'pen', 'of', 'one', 'whose', 'prominence\\\\ngives', 'his', 'opinions', 'weight', ',', 'whether\\\\nthey', 'are', 'right', 'or', 'wrong', '.', 'My', 'neigh-\\\\nbors', 'are', 'different', '.', 'Their', 'advancement\\\\nis', 'slow', 'and', 'frequently', 'wrong', 'They\\\\nget', 'hold', 'of', 'exploded', 'ideas', 'years', 'after\\\\nthe', 'explosion', ',', 'and', 'because', 'of', 'the', 'prob-\\\\nabilities', 'of', 'a', 'thing', ',', 'it', 'is', 'accepted', 'as', 'a\\\\nfact', '.', 'But', 'neighbors', 'are', 'about', 'alike', 'in\\\\nevery', 'township', 'in', 'the', 'land', 'outside', 'of\\\\nthe', 'very', 'centres', 'of', 'civilization', ',', 'where\\\\nthe', 'light', 'of', 'knowledge', 'flashes', 'from\\\\nmind', 'to', 'mind', 'in', 'the', 'human', 'conflict', 'to\\\\nreach', 'the', 'highest', 'round', 'of', 'the', 'ladder.\\\\nIt', 'is', 'astonishing', 'men', 'will', 'live', 'and', 'die\\\\nin', 'this', 'age', 'and', 'not', 'know', 'the', 'earth', 'is\\\\nround', '.', 'School', 'houses', 'on', 'almost', 'every\\\\nfarm', ';', 'books', 'of', 'all', 'kinds', 'within', 'reach', ',', '\\\\nand', 'yetseparately.that', 'the', 'earth', 'has', 'mo-\\\\ntion', '.', 'Aday', 'ortwo', 'agoItalked', 'to', 'a\\\\nprominent', 'attorney', 'in', 'Butler', ',', 'and', ',', '\\\\nwould', 'you', 'believe', 'it', ',', 'ho', 'actually', 'argued\\\\nthat', 'the', 'farther', 'you', 'go', 'south', 'the', 'hotter\\\\nit', 'got', ',', 'exactly', 'as', 'the', 'further', 'north', 'you\\\\nwent', 'the', 'colder', 'it', 'got', '.', 'It', 'is', 'ridiculous', '!', '\\\\nDuring', 'all', 'of', 'that', 'man', "'s", 'busy', 'life', 'be\\\\nbad', 'not', 'paused', 'to', 'make', 'one', 'application\\\\nof', 'his', 'knowledge', ',', 'so', 'he', 'could', 'practical-\\\\nly', 'understand', 'the', 'relationship', 'existing\\\\nbetween', 'the', 'North', 'and', 'South', 'poles', ',', '\\\\nthe', 'equator', 'aud', 'the', 'suu', '.', '``', '\\\\nWe', 'came', 'to', 'the', 'house', 'and', 'I', 'was', 'con-\\\\nducted', 'into', 'a', 'large', 'room', 'fitted', 'up', 'at\\\\none', 'end', 'for', 'a', 'library', 'and', 'at', 'the\\\\nother', 'for', 'a', 'workshop', ',', 'with', 'a', 'sliding\\\\ncurtain', 'as', 'a', 'dividing', 'partition', '.', 'The\\\\nroom', 'was', 'filled', 'with', 'an', 'array', 'of', 'cur-\\\\nious', 'things', '.', 'Maps', ',', 'books', 'every', 'where', ',', '\\\\nglobes', ',', 'large', 'and', 'small', '.', 'The', 'earth\\\\nrepresented', 'in', 'dozeus', 'of', 'wonderful\\\\nshapes', '.']
['The', 'wool', 'circulars', 'alluded', 'to', 'are\\\\nthose', 'which', 'give', 'the', 'quotations', 'side\\\\nby', 'side', 'of', 'Ohio', 'medium', 'in', 'the', 'United\\\\nStates', 'and', 'Australasian', 'medium', 'of\\\\nthe', 'same', 'quality', 'and', 'condition', 'in\\\\nLondon', '.', 'the', 'time', 'that', 'the', 'tarif', 'law\\\\nwent', 'into', 'effect', 'in', '1868', ',', 'up', 'to', 'and', 'in-\\\\ncluding', '1891', ',', 'showing', 'that', 'the', 'aver-\\\\nage', 'price', 'received', 'for', 'wool', 'of', 'the', 'same\\\\nquality', 'in', 'the', 'tree', 'wool', 'market', 'of', 'Lon-\\\\ndon', 'during', 'all', 'of', 'that', 'period', 'averagd\\\\n51', 'per', 'cent', '.', 'lees', 'than', 'the', 'price', 'paidin\\\\nthe', 'United', 'States', 'for', 'the', 'same', 'kindof\\\\nAmerican', 'wool', 'under', 'protection.\\\\nThe', 'quotations', 'for', 'domestic', 'wool\\\\nwhich', '.', 'be', 'says', ',', 'are', 'incorrect', ',', 'are', 'tak-\\\\nen', 'from', 'Mr.', 'Springer', "'s", 'own', 'report', 'of\\\\nthe', 'Ways', 'and', 'Means', 'Committee', 'to\\\\nthe', 'Houseof', 'Representatives', ';', 'see', 'page\\\\n34', ',', 'report', 'No', '.', '501', '.', 'We', 'assumed', 'that\\\\nMr', '.', 'Springer', "'s", 'figures', 'werecorrect', ',', 'and\\\\nnever', 'questionedaaccuracy', ',', 'as\\\\nthey', 'were', 'furnished', 'by', 'him', 'as', 'chair-\\\\nman', 'of', 'the', 'Ways', 'and', 'Means', 'commit-\\\\ntee', 'of', 'the', 'house', 'of', 'representatives', ';', 'and\\\\nthis', 'ought', 'to', 'be', ',', 'and', 'therefore', 'has\\\\nbeen', ',', 'the', 'best', 'authority', '.', 'TheLondon\\\\nprices', 'were', 'obtained', 'from', 'the', 'pub-\\\\nlished', 'quotations', 'of', 'Jan.', '1', ',', '1892', ',', 'of\\\\nMessrs', '.', 'Windeler', '&', 'Co.', ',', 'of', 'London', ',', '\\\\nEngland', ',', 'and', 'are', 'prepared', 'by', 'them\\\\nfor', 'the', 'London', 'market', 'without', 're-\\\\ngard', 'to', 'any', 'political', 'use', 'that', 'might\\\\nbe', 'made', 'of', 'them', 'in', 'the', 'United', 'States.\\\\nThese', 'London', 'quotations', 'of', 'the\\\\nMessrs', '.', 'Windeler', ',', 'which', 'we', 'use', ',', 'are\\\\nconfirmed', 'by', 'those', 'of', 'Messrs.', 'Helmnth', ',', '\\\\nSwartz', '&', 'Co', '..', 'ot', 'London', ',', 'Mesrs', '.', 'Bx-\\\\nton', ',', 'Ronald', '&', 'Co.', ',', 'of', 'London', ',', 'and\\\\nalso', 'by', 'the', 'Bradford', 'Observer', ',', 'of\\\\nBradford', ',', 'England', ',', 'the', 'onenewspaper\\\\nthat', 'is', 'recognized', 'throughout', 'themer-\\\\ncantile', 'world', 'as', 'authority', 'on', 'matters\\\\n•rlating', 'to', 'wool', 'and', 'manufactures\\\\nthereof', '.']
def strip(text):
    txt = str(text).lower().strip()
    txt = txt.replace("", "'")
    txt = txt.replace(" this\\\\nplace", "this place")
    txt = txt.replace("'we\\\\nwere", "we were")
    txt = txt.replace("'ever\\\\nwas", "ever was")
    txt = txt.replace("'making\\\\nsuch", "making such")
    txt = txt.replace("'boot\\\\nto", "boot to")
    txt = txt.replace("'elsewhere\\\\nfrom", "elsewhere from")
    txt=txt.replace("United\\\\nStates","United States")
    txt = txt.replace("Unit-\\\\ned","United" )
    txt = txt.replace("neigh-\\\\nbors", "neighbours")
    txt = txt.replace("aver-\\\\nage", "average")
    txt = txt.replace("people\\\\ndown", "people down")
    txt =re.compile(r"'s|[\-]|\-\\\\n|\p{P}").sub("", txt)
    txt = re.compile(r"[{}\[\]\&%^$*#\(\)@\t\n0123456789]+").sub(" ", txt)
    return txt
for _, x in train[:2].iterrows():
        words = nltk.word_tokenize(strip(x['Concatenated']))
        print(words)
['came', 'fiom', 'the', 'last', 'place', 'tothis', 'place', 'and', 'this', 'place', 'is', 'where', 'wenwere', 'this', 'is', 'the', 'first', 'road', 'i', 'evernwas', 'on', 'where', 'you', 'can', 'ride', 'elsewherenfrom', 'anywhere', 'and', 'be', 'nowherenhe', 'says', 'while', 'this', 'train', 'stops', 'everynwhere', 'it', 'never', 'stops', 'anywhere', 'unnless', 'its', 'somewhere', 'well', 'i', 'saysnim', 'glad', 'to', 'hear', 'that', 'but', 'accordning', 'to', 'your', 'figures', 'i', 'left', 'myselfnwhere', 'was', 'which', 'is', 'five', 'miles', 'nearner', 'to', 'myself', 'than', 'i', 'was', 'when', 'wenwere', 'where', 'we', 'are', 'nownwe', 'have', 'now', 'reached', 'slidellnthat', 'a', 'fine', 'place', 'the', 'people', 'down', 'there', 'remind', 'me', 'of', 'bananasnthey', 'come', 'and', 'go', 'in', 'bunches', 'ndell', 'used', 'to', 'be', 'noted', 'for', 'her', 'toughnpeople', 'now', 'she', 'is', 'noted', 'for', 'bentough', 'steaks', 'well', 'i', 'certainly', 'gotnone', 'there', 'when', 'the', 'waiter', 'broughtnit', 'in', 'it', 'was', 'so', 'small', 'i', 'thought', 'itnwas', 'a', 'crack', 'in', 'the', 'plate', 'i', 'skidnwaiter', 'what', 'else', 'have', 'you', 'got', '+henbrought', 'me', 'in', 'two', 'codfish', 'and', 'onensmelt', 'i', 'said', 'waiter', 'have', 'you', 'gotnpigs', 'feet', 'he', 'said', 'no', 'rheumatismnmakes', 'me', 'walk', 'that', 'way', 'i', 'saldnhow', 'is', 'the', 'pumpkin', 'pieliesaidnit', 'all', 'squash', 'the', 'best', 'i', 'could', 'getnin', 'that', 'hotel', 'was', 'a', 'soup', 'sandwichnafter', 'the', 'table', 'battle', 'the', 'waiter', 'andni', 'signed', 'an', 'armistice', 'i', 'then', 'wentnover', 'to', 'the', 'hotel', 'clerk', 'and', 'asked', 'forna', 'room', 'he', 'said', 'with', 'or', 'without', 'anbed', 'i', 'said', 'with', 'a', 'bed', 'he', 'saidni', 'dont', 'think', 'i', 'have', 'a', 'bed', 'longnenough', 'for', 'you', 'i', 'said', 'well', 'illnaddtwo', 'feettoitwhenigetinitnhe', 'gave', 'me', 'a', 'lovely', 'room', 'on', 'thentop', 'floor', 'it', 'was', 'one', 'of', 'those', 'roomsnthat', 'stands', 'on', 'each', 'side', 'if', 'younhappen', 'to', 'get', 'up', 'in', 'the', 'middle', 'ofnthe', 'night', 'you', 'want', 'to', 'be', 'sure', 'andnget', 'up', 'in', 'the', 'middle', 'of', 'the', 'roomnthat', 'night', 'i', 'dreamt', 'i', 'was', 'eatingnflannel', 'cakes', 'when', 'i', 'woke', 'up', 'halfnof', 'the', 'blanket', 'was', 'gone', 'i', 'mustnhave', 'got', 'up', 'on', 'the', 'wrong', 'side', 'of', 'thenbed', 'for', 'next', 'morning', 'i', 'had', 'an', 'awfulnheadache', 'i', 'told', 'the', 'manager', 'aboutnit', 'he', 'said', 'you', 'have', 'rheumaticnpains', 'i', 'said', 'no', 'i', 'think', 'it', 'is', 'onnof', 'those', 'attic', 'room', 'pains', 'i', 'nad', 'tongetupat', 'aminthemorningsonthey', 'could', 'use', 'the', 'sheet', 'to', 'set', 'thenbreakfast', 'table']
['mb', 'boot', 'political', 'obeednattempt', 'to', 'imagine', 'a', 'piatt', 'makingnsuch', 'an', 'address', 'as', 'that', 'of', 'elihu', 'bootnto', 'the', 'now', 'york', 'legislature', 'and', 'younfcavo', 'a', 'measure', 'of', 'tho', 'good', 'fortunqnwhich', 'baa', 'at', 'last', 'come', 'to', 'tho', 'empirqnstate', 'of', 'being', 'represented', 'in', 'tho', 'unitned', 'states', 'senate', 'by', 'a', 'statesman', 'atntho', 'very', 'outset', 'mr', 'boot', 'declared', 'forntho', 'parcels', 'post', 'thereby', 'giving', 'noticento', 'tho', 'country', 'that', 'tho', 'express', 'compannies', 'no', 'longer', 'own', 'a', 'senatorial', 'scat', 'acncredited', 'to', 'new', 'york', 'that', 'seat', 'willnfor', 'ho', 'next', 'six', 'years', 'bo', 'occupied', 'by', 'ansmaa', 'who', 'hag', 'convictions', 'of', 'his', 'ownnwho', 'isigovemed', 'by', 'reasoned', 'politicaln', 'ideas', 'who', 'had', 'grown', 'so', 'accustomed', 'tonthink', 'nationally', 'that', 'it', 'is', 'with', 'somonmental', 'eflort', 'that', 'he', 'can', 'bringhimselfninto', 'a', 'proper', 'perspective', 'with', 'thosenminor', 'senatorial', 'duties', 'such', 'as', 'tho', 'fillning', 'of', 'offices', 'which', 'bulk', 'hugelynupon', 'the', 'horizons', 'of', 'tho', 'flatts', 'andntheir', 'lit', 'tho', 'albany', 'politicians', 'wenare', 'told', 'tried', 'to', 'read', 'between', 'tho', 'linesnfor', 'evidence', 'that', 'they', 'had', 'among', 'themna', 'new', 'organization', 'leader', 'somo', 'one', 'tonguide', 'and', 'direct', 'their', 'political', 'machinnations', 'and', 'to', 'settlo', 'where', 'tho', 'goodnthings', 'should', 'go', 'wo', 'think', 'they', 'lisntened', 'in', 'vain', 'what', 'they', 'heard', 'werentimely', 'reflections', 'opon', 'tho', 'immediatenproblems', 'of', 'stato', 'and', 'national', 'governnments', 'mixed', 'with', 'excellent', 'advice', 'tonthe', 'electorate', 'on', 'the', 'duty', 'of', 'improvingnthe', 'quality', 'of', 'tho', 'stato', 'legislaturesnit', 'must', 'have', 'been', 'something', 'of', 'a', 'novnelty', 'though', 'possibly', 'not', 'wholly', 'refreshlin', 'gnto', 'political', 'thirst']
words = []

def train_model(data, m):
    for y,x in data.iterrows():
        words = nltk.word_tokenize(strip(x['Concatenated']))
        #print(words)
        for word_1, word_2 in nltk.bigrams(words, pad_left=True, pad_right=True):
            if word_1 and word_2:
                m[word_2][word_1] += 1
    for word_2 in m:
        summ = sum(m[word_2].values())
        summ = float(summ)
        for word_1 in m[word_2]:
            m[word_2][word_1] /= summ
def base_prob():
  return "the:0.3 a:0.3 to:0.2 and:0.1 :0.1"
model = defaultdict(lambda: defaultdict(lambda: 0))
train_model(train, model)
def predict_words(w, model):
    sum = 0
    preds= dict(model[w])
    most_common = dict(Counter(preds).most_common(6))
    pred = ""
    for w, prob in most_common.items():
        sum += prob
        pred += f"{w}:{prob} "
    if sum == 0.0:
      base_prob()
    rest = 1 - sum
    pred += f":{rest}"
    return pred
ls
config.txt  in-header.tsv   README.md  train/
dev-0/      out-header.tsv  test-A/    Untitled0.ipynb
len(test_d)
7362
test_d = pd.read_csv("test-A/in.tsv.xz", sep="\t", on_bad_lines='skip', quoting=QUOTE_NONE, header=None, encoding="utf-8")
dev_d = pd.read_csv("dev-0/in.tsv.xz", sep="\t", on_bad_lines='skip', quoting=QUOTE_NONE, header=None, encoding="utf-8")

with open( "dev-0/out.tsv", "w", encoding="utf-8") as f:
  for y,x in dev_d.iterrows():
    w = nltk.word_tokenize(strip(x[7]))
    prediction = predict_words(w[0], model)
    f.write(prediction + "\n")

with open( "test-A/out.tsv", "w", encoding="utf-8") as f:
  for y,x in test_d.iterrows():
    w = nltk.word_tokenize(strip(x[7]))
    prediction = predict_words(w[0], model)
    f.write(prediction + "\n")

with open('test-A/out.tsv', 'a', encoding='utf-8') as my_file:
  for x in range(52):
    my_file.write("the:0.3 a:0.3 to:0.2 and:0.1 :0.1\n")
with open('dev-0/out.tsv', 'a', encoding='utf-8') as my_file:
  for x in range(117):
    my_file.write("the:0.3 a:0.3 to:0.2 and:0.1 :0.1\n")
badlines_list = []
def badlines_collect (bad_line: list[str]) -> None:
        badlines_list.append(bad_line)
        return None
from csv import QUOTE_NONE
t_dd = pd.read_csv("test-A/in.tsv.xz", sep="\t", on_bad_lines='skip', quoting=QUOTE_NONE,  header=None, encoding="utf-8")
len(t_dd)
7414
test_d = pd.read_csv("test-A/in.tsv.xz", sep="\t", error_bad_lines=False, header=None, encoding="utf-8")
<ipython-input-73-37554e0a2e1a>:1: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.


  test_d = pd.read_csv("test-A/in.tsv.xz", sep="\t", error_bad_lines=False, header=None, encoding="utf-8")
b'Skipping line 2977: expected 8 fields, saw 9\n'
len(test_d)
7362
test_d_r = pd.read_csv("test-A/out.tsv", sep="\t",  lineterminator='\r', header=None, encoding="utf-8")
len(test_d_r)
7362
len(dev_d)
10402
rowcount=0
for row in lzma.open("test-A/in.tsv.xz"):
  rowcount+= 1
 #printing the result
print("Number of lines present:-", rowcount)
Number of lines present:- 7414
rowcount=0
for row in lzma.open("dev-0/in.tsv.xz"):
  rowcount+= 1
 #printing the result
print("Number of lines present:-", rowcount)
Number of lines present:- 10519
rowcount=0
for row in open("dev-0/out.tsv"):
  rowcount+= 1
 #printing the result
print("Number of lines present:-", rowcount)
Number of lines present:- 10519