83 KiB
83 KiB
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
cd drive/MyDrive
/content/drive/MyDrive
cd challenging-america-word-gap-prediction/
/content/drive/MyDrive/challenging-america-word-gap-prediction
! pip install lmza
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ [31mERROR: Could not find a version that satisfies the requirement lmza (from versions: none)[0m[31m [0m[31mERROR: No matching distribution found for lmza[0m[31m [0m
from collections import Counter
import lzma
import pickle
rowcount=0
for row in lzma.open("test-A/in.tsv.xz"):
rowcount+= 1
#printing the result
print("Number of lines present:-", rowcount)
Number of lines present:- 7414
with lzma.open('dev-0/in.tsv.xz',mode='rt', encoding='utf-8' ) as f:
with open('dev-0/out.tsv', 'w', newline='\n') as out:
for line in f.readlines():
sep = line.split('\t')
print(sep)
import pandas as pd
import nltk
nltk.download('punkt')
[nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Unzipping tokenizers/punkt.zip.
True
from collections import Counter, defaultdict
data = pd.read_csv("train/in.tsv.xz", sep="\t", on_bad_lines='skip', header=None, encoding="utf-8")
exp_words = pd.read_csv("train/expected.tsv", sep="\t", on_bad_lines='skip', header=None, encoding="utf-8")
data[:10]
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | |
---|---|---|---|---|---|---|---|---|
0 | 4e04702da929c78c52baf09c1851d3ff | ST | ChronAm | 1919.604110 | 30.475470 | -90.100911 | came fiom the last place to this\nplace, and t... | said\nit's all squash. The best I could get\ni... |
1 | b374dadd940510271d9675d3e8caf9d8 | DAILY ARIZONA SILVER BELT | ChronAm | 1909.097260 | 33.399478 | -110.870950 | MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... | \ninto a proper perspective with those\nminor ... |
2 | adb666c426bdc10fd949cb824da6c0d0 | THE SAVANNAH MORNING NEWS | ChronAm | 1900.913699 | 32.080926 | -81.091177 | Thera were in 1771 only aeventy-nine\n*ub*erlb... | NaN |
3 | bc2c9aa0b77d724311e3c2e12fc61c92 | CHARLES CITY INTELLIGENCER | ChronAm | 1864.974044 | 43.066361 | -92.672411 | whenever any prize property shall!*' condemn- ... | the ceitihcate of'\noperate to prevent tfie ma... |
4 | 0f612b991a39c712f0d745835b8b2f0d | EVENING STAR | ChronAm | 1878.478082 | 38.894955 | -77.036646 | SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... | \nTerms of sale: One-tblrd, togethor with the ... |
5 | 4c13fb3d2e6eef35fa28e7bae7868d60 | EDGEFIELD ADVERTISER | ChronAm | 1913.346575 | 33.789577 | -81.929558 | God includes all. and would we not\ngrieve if ... | lot of spiritual\nwaifs all about us. children... |
6 | a452eadfc3f4a475147728c5f4005429 | DAILY LOS ANGELES HERALD | ChronAm | 1883.801370 | 34.054935 | -118.244476 | The said action is brought to obtain a decree ... | then to obtain an execution against said Vie\n... |
7 | b970ee32372d81f1fd59ab6196e797c9 | THE FINDLAY JEFFERSONIAN | ChronAm | 1874.828767 | 41.041387 | -83.650398 | party" is a useless exhortation to intel-\nlig... | with all tjie hatred that\nsurvives the war; a... |
8 | d130f899a50db2792c546cc978dc930c | BUTLER CITIZEN | ChronAm | 1883.793151 | 40.861021 | -79.895225 | has led me to accept, everything I read\nwith ... | that the earth has mo-\ntion. Aday ortwo agoIt... |
9 | 80e56928e09b93529d206708ac905b63 | FERGUS COUNTY ARGUS | ChronAm | 1892.821038 | 47.062473 | -109.428238 | The wool circulars alluded to are\nthose which... | accuracy, as\nthey were furnished by him as ch... |
data[6][9]
"The wool circulars alluded to are\\\\nthose which give the quotations side\\\\nby side of Ohio medium in the United\\\\nStates and Australasian medium of\\\\nthe same quality and condition in\\\\nLondon. the time that the tarif law\\\\nwent into effect in 1868, up to and in-\\\\ncluding 1891, showing that the aver-\\\\nage price received for wool of the same\\\\nquality in the tree wool market of Lon-\\\\ndon during all of that period averagd\\\\n51 per cent. lees than the price paidin\\\\nthe United States for the same kindof\\\\nAmerican wool under protection.\\\\nThe quotations for domestic wool\\\\nwhich. be says, are incorrect, are tak-\\\\nen from Mr. Springer's own report of\\\\nthe Ways and Means Committee to\\\\nthe Houseof Representatives; see page\\\\n34, report No. 501 . We assumed that\\\\nMr. Springer's figures werecorrect, and\\\\nnever questioned"
data[7][9]
'accuracy, as\\\\nthey were furnished by him as chair-\\\\nman of the Ways and Means commit-\\\\ntee of the house of representatives; and\\\\nthis ought to be, and therefore has\\\\nbeen, the best authority. TheLondon\\\\nprices were obtained from the pub-\\\\nlished quotations of Jan. 1, 1892, of\\\\nMessrs. Windeler & Co., of London,\\\\nEngland, and are prepared by them\\\\nfor the London market without re-\\\\ngard to any political use that might\\\\nbe made of them in the United States.\\\\nThese London quotations of the\\\\nMessrs. Windeler, which we use, are\\\\nconfirmed by those of Messrs. Helmnth,\\\\nSwartz & Co.. ot London, Mesrs. Bx-\\\\nton, Ronald & Co., of London, and\\\\nalso by the Bradford Observer, of\\\\nBradford, England, the onenewspaper\\\\nthat is recognized throughout themer-\\\\ncantile world as authority on matters\\\\n•rlating to wool and manufactures\\\\nthereof.'
train = data[[6, 7]]
train= pd.concat([train, exp_words], axis=1)
train.rename(columns={6: 'First Part', 7: 'Second Part', 0:'Expected word'}, inplace=True)
train[:10]
First Part | Second Part | Expected word | |
---|---|---|---|
0 | came fiom the last place to this\nplace, and t... | said\nit's all squash. The best I could get\ni... | lie |
1 | MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... | \ninto a proper perspective with those\nminor ... | himself |
2 | Thera were in 1771 only aeventy-nine\n*ub*erlb... | NaN | of |
3 | whenever any prize property shall!*' condemn- ... | the ceitihcate of'\noperate to prevent tfie ma... | ably |
4 | SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... | \nTerms of sale: One-tblrd, togethor with the ... | j |
5 | God includes all. and would we not\ngrieve if ... | lot of spiritual\nwaifs all about us. children... | he |
6 | The said action is brought to obtain a decree ... | then to obtain an execution against said Vie\n... | graph |
7 | party" is a useless exhortation to intel-\nlig... | with all tjie hatred that\nsurvives the war; a... | 011 |
8 | has led me to accept, everything I read\nwith ... | that the earth has mo-\ntion. Aday ortwo agoIt... | separately. |
9 | The wool circulars alluded to are\nthose which... | accuracy, as\nthey were furnished by him as ch... | a |
train['Concatenated'] = train['First Part'] + train['Expected word'] + train['Second Part']
train[:5]
First Part | Second Part | Expected word | Concatenated | |
---|---|---|---|---|
0 | came fiom the last place to this\nplace, and t... | said\nit's all squash. The best I could get\ni... | lie | came fiom the last place to this\nplace, and t... |
1 | MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... | \ninto a proper perspective with those\nminor ... | himself | MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... |
2 | Thera were in 1771 only aeventy-nine\n*ub*erlb... | NaN | of | NaN |
3 | whenever any prize property shall!*' condemn- ... | the ceitihcate of'\noperate to prevent tfie ma... | ably | whenever any prize property shall!*' condemn- ... |
4 | SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... | \nTerms of sale: One-tblrd, togethor with the ... | j | SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... |
import regex as re
train.replace('\n', '', regex=True)
First Part | Second Part | Expected word | Concatenated | |
---|---|---|---|---|
0 | came fiom the last place to this\nplace, and t... | said\nit's all squash. The best I could get\ni... | lie | came fiom the last place to this\nplace, and t... |
1 | MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... | \ninto a proper perspective with those\nminor ... | himself | MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... |
2 | Thera were in 1771 only aeventy-nine\n*ub*erlb... | NaN | of | NaN |
3 | whenever any prize property shall!*' condemn- ... | the ceitihcate of'\noperate to prevent tfie ma... | ably | whenever any prize property shall!*' condemn- ... |
4 | SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... | \nTerms of sale: One-tblrd, togethor with the ... | j | SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... |
... | ... | ... | ... | ... |
428512 | Sam Clendenin bad a fancy for Ui«\nscience of ... | \nSam was arrested.\nThe case excited a great ... | NaN | NaN |
428513 | Wita.htt halting the party ware dilven to the ... | through the alnp the »Uitors laapeeeed tia.»\n... | NaN | NaN |
428514 | It was the last thing that either of\nthem exp... | Agua Negra across the line.\nIt was a grim pla... | NaN | NaN |
428515 | settlement with the department.\nIt is also sh... | \na note of Wood, Dialogue fc Co., for\nc27,im... | NaN | NaN |
428516 | Flour quotations—low extras at 1 R0®2 50;\ncit... | 3214c;do White at 3614c: Mixed Western at\n331... | NaN | NaN |
428517 rows × 4 columns
for _, x in train[:2].iterrows():
words = nltk.word_tokenize(x['Concatenated'])
print(words)
['came', 'fiom', 'the', 'last', 'place', 'to', 'this\\\\nplace', ',', 'and', 'this', 'place', 'is', 'Where', 'We\\\\nWere', ',', 'this', 'is', 'the', 'first', 'road', 'I', 'ever\\\\nwas', 'on', 'where', 'you', 'can', 'ride', 'elsewhere\\\\nfrom', 'anywhere', 'and', 'be', 'nowhere.\\\\nHe', 'says', ',', 'while', 'this', 'train', 'stops', 'every-\\\\nwhere', ',', 'it', 'never', 'stops', 'anywhere', 'un-\\\\nless', 'its', 'somewhere', '.', 'Well', ',', 'I', 'says', ',', '\\\\nI', "'m", 'glad', 'to', 'hear', 'that', ',', 'but', ',', 'accord-\\\\ning', 'to', 'your', 'figures', ',', 'I', 'left', 'myself\\\\nwhere', '1', 'was', ',', 'which', 'is', 'five', 'miles', 'near-\\\\ner', 'to', 'myself', 'than', 'I', 'was', 'when', 'we\\\\nwere', 'where', 'we', 'are', 'now.\\\\nWe', 'have', 'now', 'reached', 'Slidell.\\\\nThat', "'s", 'a', 'fine', 'place', '.', 'The', 'people\\\\ndown', 'there', 'remind', 'me', 'of', 'bananas-\\\\nthey', 'come', 'and', 'go', 'in', 'bunches', '.', '811-\\\\ndell', 'used', 'to', 'be', 'noted', 'for', 'her', 'tough\\\\npeople', '.', 'Now', 'she', 'is', 'noted', 'for', 'be', ',', '\\\\ntough', 'steaks', '.', 'Well', ',', 'I', 'certainly', 'got\\\\none', 'there', '.', 'When', 'the', 'waiter', 'brought\\\\nit', 'in', 'it', 'was', 'so', 'small', 'I', 'thought', '.', 'It\\\\nwas', 'a', 'crack', 'in', 'the', 'plate', '.', 'I', 'skid', ',', '\\\\nwaiter', 'what', 'else', 'have', 'you', 'got', '?', '+He\\\\nbrought', 'me', 'in', 'two', 'codfish', 'and', 'one\\\\nsmelt', '.', 'I', 'said', ',', 'waiter', 'have', 'you', 'got\\\\npigs', 'feet', '?', 'He', 'said', 'no', ',', 'rheumatism\\\\nmakes', 'me', 'walk', 'that', 'way', '.', 'I', 'sald', ',', '\\\\nhow', 'is', 'the', 'pumpkin', 'pie', '?', 'liesaid\\\\nit', "'s", 'all', 'squash', '.', 'The', 'best', 'I', 'could', 'get\\\\nin', 'that', 'hotel', 'was', 'a', 'soup', 'sandwich.\\\\nAfter', 'the', 'table', 'battle', 'the', 'waiter', 'and\\\\nI', 'signed', 'an', 'armistice', '.', 'I', 'then', 'went\\\\nover', 'to', 'the', 'hotel', 'clerk', 'and', 'asked', 'for\\\\na', 'room', '.', 'He', 'said', 'with', 'or', 'without', 'a\\\\nbed', '?', 'I', 'said', ',', 'with', 'a', 'bed', '.', 'He', 'said', ',', '\\\\nI', 'do', "n't", 'think', 'I', "'have", "'", 'a', 'bed', 'long\\\\nenough', 'for', 'you', '.', 'I', 'said', ',', 'well', ',', "I'll\\\\naddtwo", 'feettoitwhenIgetinit.\\\\nHe', 'gave', 'me', 'a', 'lovely', 'room', 'on', 'the\\\\ntop', 'floor', '.', 'It', 'was', 'one', 'of', 'those', 'rooms\\\\nthat', 'stands', 'on', 'each', 'side', '.', 'If', 'you\\\\nhappen', 'to', 'get', 'up', 'in', 'the', 'middle', 'of\\\\nthe', 'night', 'you', 'want', 'to', 'be', 'sure', 'and\\\\nget', 'up', 'in', 'the', 'middle', 'of', 'the', 'room.\\\\nThat', 'night', 'I', 'dreamt', 'I', 'was', 'eating\\\\nflannel', 'cakes', '.', 'When', 'I', 'woke', 'up', 'half\\\\nof', 'the', 'blanket', 'was', 'gone', '.', 'I', 'must\\\\nhave', 'got', 'up', 'on', 'the', 'wrong', 'side', 'of', 'the\\\\nbed', ',', 'for', 'next', 'morning', 'I', 'had', 'an', 'awful\\\\nheadache', '.', 'I', 'told', 'the', 'manager', 'about\\\\nit', '.', 'He', 'said', ',', 'you', 'have', 'rheumatic\\\\npains', '.', 'I', 'said', ',', 'no', ',', 'I', 'think', 'it', 'is', 'on', ',', '\\\\nof', 'those', 'attic', 'room', 'pains', '.', 'I', 'nad', 'to\\\\ngetupat5a.m.inthemorningso\\\\nthey', 'could', 'use', 'the', 'sheet', 'to', 'set', 'the\\\\nbreakfast', 'table', '.'] ['MB', '.', 'BOOT', "'S", 'POLITICAL', 'OBEED\\\\nAttempt', 'to', 'imagine', 'a', 'Piatt', 'making\\\\nsuch', 'an', 'address', 'as', 'that', 'of', 'Elihu', 'Boot\\\\nto', 'the', 'Now', 'York', 'legislature', ',', 'and', 'you\\\\nfcavo', 'a', 'measure', 'of', 'tho', 'good', 'fortunq\\\\nwhich', 'baa', 'at', 'last', 'come', 'to', 'tho', 'Empirq\\\\nstate', 'of', 'being', 'represented', 'In', 'tho', 'Unit-\\\\ned', 'States', 'senate', 'by', 'a', 'statesman', '.', 'At\\\\ntho', 'very', 'outset', 'Mr', '.', 'Boot', 'declared', 'for\\\\ntho', 'parcels', 'post', ';', 'thereby', 'giving', 'notice\\\\nto', 'tho', 'country', 'that', 'tho', 'express', 'compan\\\\nies', 'no', 'longer', 'own', 'a', 'senatorial', 'scat', 'ac\\\\ncredited', ',', 'to', 'New', 'York', '.', 'That', 'seat', 'will\\\\n', ',', 'for', 'ho', 'next', 'six', 'years', 'bo', 'occupied', 'by', 'a\\\\nsmaa', 'who', ',', 'hag', 'convictions', 'of', 'his', 'own', ',', '\\\\nwho', "isi'govemed", 'by', 'reasoned', 'political\\\\n', "'", 'Ideas', ',', 'who', 'had', 'grown', 'so', 'accustomed', 'to\\\\nthink', 'nationally', 'that', 'it', 'is', 'with', 'somo\\\\nmental', 'eflort', 'that', 'he', 'can', 'bringhimself\\\\ninto', 'a', 'proper', 'perspective', 'with', 'those\\\\nminor', 'senatorial', 'duties', ',', 'such', 'as', 'tho', 'fill-\\\\ning', 'of', 'offices', ',', 'which', 'bulk', '39', 'hugely\\\\nupon', 'the', 'horizons', 'of', 'tho', 'Flatts', 'and\\\\ntheir', 'lit', ',', 'Tho', 'Albany', 'politicians', ',', 'we\\\\nare', 'told', ',', 'tried', 'to', 'read', 'between', 'tho', 'lines\\\\nfor', 'evidence', 'that', 'they', ',', 'had', 'among', 'them\\\\na', 'new', 'organization', 'leader', ',', 'somo', 'one', 'to\\\\nguide', 'and', 'direct', 'their', 'political', 'machi-\\\\nnations', ',', 'and', 'to', 'settlo', 'where', 'tho', 'good\\\\nthings', 'should', 'go', '.', 'Wo', 'think', 'they', 'lis-\\\\ntened', 'in', 'vain', '.', 'What', 'they', 'heard', 'were\\\\ntimely', 'reflections', 'opon', 'tho', 'immediate\\\\nproblems', 'of', 'stato', 'and', 'national', 'govern-\\\\nments', ',', 'mixed', 'with', 'excellent', 'advice', 'to\\\\nthe', 'electorate', 'on', 'the', 'duty', 'of', 'improving\\\\nthe', 'quality', 'of', 'tho', 'stato', 'legislatures.\\\\nIt', 'must', 'have', '``', 'been', 'something', 'of', 'a', 'nov-\\\\nelty', ',', 'though', 'possibly', 'not', 'wholly', 'refresh-Lin-', 'g\\\\nto', 'political', 'thirst', '.']
for _, x in train[3:10].iterrows():
words = nltk.word_tokenize(x['Concatenated'])
print(words)
['whenever', 'any', 'prize', 'property', 'shall', '!', '*', "'", 'condemn-', "'", 'appeals', 'from', 'the', 'district', 'courts', 'of', 'the', 'Unite', '*', '!', '\\\\ned', ',', 'or', 'shall', 'at', 'any', 'stage', 'of', 'the', 'proceedings', 'be', 'j', 'State', '*', 'in', 'priae', 'causes', 'shall', 'be', 'directly', 'to', 'th', '#', '\\\\nfound\\\\\\\\iy', 'the', '<', 't', '>', 'urt', 'to', 'be', 'perishing', ',', 'perishable', '.', 'Supreme', 'Court', ',', 'and', 'shall', 'he', 'made', 'withiti\\\\nor', 'liable', 'to', 'deteriorate', 'or', 'depreciate', ',', 'or', 'when-', '•', 'thirty', 'days', 'of', 'the', 'rendering', 'of', 'the', 'decree', 'ap', '»', '\\\\never', 'the', 'etist', 'ot', 'keeping', 'th', '»', ':', 'same', 'shall', 'l', '>', 'c', 'dis-', 'i', 'pealed', 'from', ',', 'unh-ss', 'the', 'court', 'shall', 'previously\\\\nproportionate', 'to', 'its', 'value', ',', 'it', 'shall', 'be', 'the', 'duty', 'have', 'extended', 'the', 'time', 'for', 'cause', 'shown', 'in', 'th', '#', '\\\\nof', 'the', 'court', 'to', 'order', 'asale', 'thereof', ';', 'and', 'when-', '|', '»', 'artit', 'ular', 'case', ',', 'and', 'the', 'Supreme', 'court', '*', 'k', '«', '*', 'l|\\\\never', ',', 'after', 'the', 'return', 'day', 'on', 'the', 'liliel', ',', 'all', 'the', 'always', 'l', '>', 'e', 'open', 'fur', 'the', 'entry', 'of', 'sinh', 'uppealst\\\\nparties', 'in', 'interest', 'who', 'have', 'appeared', 'in', 'the', 'Such', 'appeals', 'may', 'l', '>', 'e', 'claimed', 'whenever', 'th', '#', '\\\\ncause', 'shall', 'iigree', 'thercfn', ',', 'the', 'court', 'is', 'author-', '|amount', 'in', 'controversy', 'esiee.is', 'two', 'thonsan', '<', '|\\\\nized', 'to', 'make', 'such', 'order', ',', 'and', 'no', 'appeal', 'shall', '(', 'dollars', ',', 'and', 'in', 'other', 'casesablythe', 'ceitihcate', "of'\\\\noperate", 'to', 'prevent', 'tfie', 'making', 'or', 'execution', 'of', '.', 'the', 'district', 'judge', 'that', 'the', 'adjudication', 'invi', '»', 'U\\\\nsuch', 'order', '.', 'The', 'Secretary', 'of', 'the', 'Navy', 'shall', 'ves', 'a', 'question', 'uf', 'general', 'importance.\\\\nemploy', 'an', 'auctioneer', 'or', 'auctioneers', 'of', 'known', 'withstanding1', 'such', 'apiw^al', ',', 'the', 'district', 'Mint\\\\nskill', 'in', 'the', 'branch', 'of', 'business', 'to', 'w', 'hich', 'any', 'may', 'make', 'and', 'execute', 'all', 'necessary', 'order', '*', 'fe', '«', 'f\\\\nsale', '[', 'lertains', ',', 'to', 'make', 'the', 'wile', ',', 'but', 'the', 'sale', 'I', 'the', 'custody', 'and', 'dis|M', '>', 'sitl', 'of', 'th', '•', 'puze', 'propeity', 'I\\\\nshall', 'be', 'conducted', 'nnder', 'the', 'sujK^rvfsfon', 'of', 'j', 'a', '«', 'i', '»', 'l', 'iu', 'case', 'of', 'appeal', 'from', 'a', 'tteeree', 'of', 'eoadeinh\\\\nthe', 'nutrshal', ',', 'and', 'the', 'crdlecting', 'and', 'deiwi-iling', 'I', 'natum', '.', 'may', 'stiil', 'pr.e', '*', 'i', 'to', 'make', 'a', 'dei', '*', 'ree', 'oj\\\\nof', 'the', 'gross', 'proceerls', 'shall', 'be', 'by', 'the', 'anction-', 'j', 'distribution', 'so', 'ftiras', 'to', 'determine', 'what', 'share\\\\neer', 'or', 'his', 'agent', '.', 'B.', 'fore', 'any', 'sale', 'the', 'marshal', 'j', 'of', 'the', 'prize', 'shall', 'g', '«', '»', 'to', 'the', '<', 'aptors', ',', 'and', 'what\\\\nshall', 'cause', 'tull', 'catalogues', 'and', 'schedules', 'to', '!', '*', '•', ',', 'vessels', 'are', 'entitled', 'to', 'particulate', 'therein', 'Aof\\\\nprejuiretl', 'and', 'circulate', ',', 'and', 'a', '.^', '»', 'pv', 'of', 'Wu-h'] ['SA', 'LKOFVALUABLE', 'UNIMPBOV', '&', 'D', 'RE\\\\\\\\L\\\\nJSIATF', '.', 'ON', 'THE', 'NORTH', 'BIDEOF', '1ST.', ',', '\\\\nNEAR', '23d', 'ST', 'R', '>', 'ET', 'NORTHWEST.\\\\nBy', 'virtue', 'ol', 'a', 'deed', 'of', 'trust', 'recorded', 'In', 'Lllier^^\\\\nNo', '.', '854.', 'folio', '410.', 'et', 'seq.', ',', 'one', 'of', 'the', 'Land^®\\\\nrecords', 'of', 'the', 'district', 'of', 'Columbia', ',', 'and', 'a', "'", '.', '``', '\\\\ndecree', 'of', 'the', 'Bupreme', 'Court', 'of', 'the', 'District', 'of\\\\nColumbia', ',', '[', 'tasked', 'in', 'equity', 'cause', 'No', '.', '5791', '.', 'June\\\\n16th', ',', '1878.', 'we', 'will', ',', 'on', 'FRIDAY', ',', 'the', '88', ':', 'b', 'of\\\\nJune', ',', '1878.', 'at', '6', "o'clock", 'p.', 'n', '>', '.', ',', 'in', 'front', 'of', 'the\\\\npitml', '&', 'es', ',', 'seb', 'at', 'pubi', 'c', 'auction', 'lot', '2', ',', 'in', 'square', '40', ',', '\\\\nin', 'tbe', 'city', 'of', 'Washington', ',', 'which', 'said', 'lot', ',', 'uniin-\\\\npioved', ',', 'containing', 'abou', '16', '346', 'square', ',', 'feet', 'of\\\\nground', ',', 'will', 'be', 'subdivided', 'into', 'tnree', 'lots', ',', 'each', 'of\\\\nwhich', 'will', 'have', 'a', 'froLUme', 'of', 'about', '21', 'feet', 'ou', 'I\\\\nstreet', ',', 'and', 'will', 'be', 'soldj\\\\nTerms', 'of', 'sale', ':', 'One-tblrd', ',', 'togethor', 'with', 'the', 'ex¬\\\\npenses', 'of', 'sale', ',', 'in', 'cash', ';', 'the', 'residue', 'in', 'three', 'equal\\\\npay', 'n', 'ents', 'at', 'six', ',', 'twelve', 'and', 'eighteen', 'months', ',', 're¬\\\\nspectively', ',', 'for', 'which', 'tbe', 'notes', 'of', 'the', 'purchaser', ',', '\\\\nbearing', 'interest', 'from', 'the', 'day', 'of', 'sale', 'at', '8', 'per', 'cent', ',', '\\\\nper', 'ai.num', ',', 'p', ':', 'Table', 'semi-annually', ',', 'and', 'secured', 'by\\\\na', 'deed', 'of', 'trust', 'on', 'the', 'property', 'sold', ',', 'will', 'be', 'taken', ';', '\\\\nor', 'the', 'purchaser', 'may', 'pay', 'cash', 'In', 'full', ',', 'at', 'nls', 'op¬\\\\ntion', '.', 'All', 'conveyancing', 'and', 'recording', 'will', 'be', 'at\\\\nthe', 'cost', 'of', 'the', 'purchaser', ',', 'and', 'if', 'the', 'terms', 'of', 'sae\\\\nshall', 'not', 'lie', 'complied', 'with', 'In', 'Ave', 'days', 'after', 'the\\\\ntale', 'the', 'property', 'will', '1', '*', 'n', '*', 'old', 'at', 'the', 'risk', 'and', 'co', '»', 't\\\\nof', 'tbe', 'defaulting', 'purchaser', '.', 'A', 'deposit', 'of', 'f150', ',', 'or\\\\n960', 'c', 'n', 'each', 'sulidivlded', 'lot', ',', 'will', 'be', 'required', 'at', 'the'] ['God', 'includes', 'all', '.', 'and', 'would', 'we', 'not\\\\ngrieve', 'if', 'he', 'left', 'any', 'out', '?', 'If', 'God\\\\nthought', 'some', 'too', 'large', 'or', 'too', 'email', '.', "'\\\\nespecially", 'if', 'they', 'were', 'our', 'children', '?', '\\\\nCJod', 'would', 'not', 'say', 'that', 'Jesse', 'and', 'RuAh.\\\\nand', 'Willie', 'should', 'go', 'to', 'Sabbath\\\\nschool', ',', 'but', 'George', 'and', 'James', '..', 'and\\\\nMarj', "'", 'are', 'too', 'old', '.', 'Our', 'hair', 'may', '.', "''", 'be-', ',', '\\\\ncomp', 'silvered', ',', 'yet', 'we', 'are', 'but', 'children', ',', ',\\\\nus', 'students', 'of', 'God', "'s", 'word', ';', 'children', 'in\\\\nChristian', 'life', 'and', 'service', '.', 'Old', 'and\\\\nyoung', 'we', 'are', 'all', 'children', 'of', 'God', ',', "'atid-\\\\nneed", 'to', 'be', 'taught', 'of', 'God', '.', 'Are', 'here\\\\nall', 'thy', 'children', ',', 'both', 'old', 'and', 'young/\\\\ngreat', 'and', 'small', '?', 'The', 'Ideal', 'way', 'and\\\\nthe', 'scriptural', 'way', 'is', 'the', 'whole', 'family\\\\nin', 'the', 'service', 'of', 'public', 'worship', ',', 'and\\\\nthe', 'whole', 'family', 'in', 'the', 'Sabbath\\\\nschool', '.', 'And', 'then', 'there', 'are', 'our', 'neigh¬\\\\nbor', "'s", 'children', '.', 'They', 'are', 'also', 'our', 'chH-\\\\ndren', 'in', 'this', 'particular', '.', 'We', 'have', "''", 'a\\\\nresponsibility', 'concerning', 'them', '.', 'If', 'we\\\\nare', 'our', 'brother', "'s", 'keeper', ',', 'then', 'we', 'are\\\\nalso', 'the', 'keeper', 'of', 'our', 'brother', "'s", 'chil¬\\\\ndren', '.', 'There', 'arehelot', 'of', 'spiritual\\\\nwaifs', 'all', 'about', 'us', '.', 'children', 'without\\\\nreligious', 'home', 'training', ',', 'example', 'or\\\\ninfluence', 'The', 'parable', 'of', 'the', 'good\\\\nSamaritan', 'teaches', 'us', 'that', 'our', 'neigh¬\\\\nbor', 'is', 'any', 'one', 'in', 'need', 'that', 'we', 'can\\\\nhelp', '.', 'These', 'children', 'of', 'the', 'streets\\\\naDd', 'of', 'the', 'homes', 'of', 'irreligious', 'or', 'neg¬\\\\nligent', 'parents', 'are', 'our', 'children', 'accord¬\\\\ning', 'to', 'the', 'teachings', '(', 'f', 'Christ', '.', 'They\\\\nare', 'our', 'neighbors', '.', 'They', 'are', 'in', 'need', ',', '\\\\nand', 'we', 'have', 'lt', 'in', 'our', 'power', 'to', 'help\\\\nthem', '.', 'They', 'are', 'worse', 'than', 'sheep\\\\nwithout', 'a', 'shepherd', '.', 'They', 'are', 'the', 'lit¬\\\\ntle', ',', 'innocent', ',', 'helpless', 'lambs', 'without', 'a\\\\nshepherd', '.', 'Do', "n't", 'let', 'us', 'think', 'we', 'have\\\\nno', 'responsibility', 'if', 'we', 'have', 'no', 'chfl¬\\\\ndren', '.', 'Do', "n't", 'let', 'us', 'think', 'we', 'have', 'done\\\\nour', 'full', 'duty', 'If', 'our', 'own', 'children', 'are\\\\nin', 'the', 'church', 'and', 'Sabbath', 'school', '.', 'Are\\\\nhere', 'all', 'thy', 'children', ',', 'in', 'tire', 'large\\\\nsense', '?', '-our', 'own', 'children', ',', 'large', 'and\\\\nsmall', ',', 'and', 'our', 'neighbor', "'s", 'children', ',', '\\\\nall', 'that', 'we', 'ate', 'responsible', 'for,1', '!', 'all\\\\nthat', 'we', 'can', 'influence', 'and', 'instruct', 'in\\\\nspiritual', 'things', '?'] ['The', 'said', 'action', 'is', 'brought', 'to', 'obtain', 'a', 'decree', 'of\\\\nthis', 'Court', 'for', 'tbe', 'foreclosure', 'of', 'a', 'certain', 'mort-\\\\ngage', 'described', 'In', 'the', 'said', 'Complaint', ',', 'and', "cxc-\\\\n.U'ed", 'by', 'the', 'said', 'Edward', 'Naud', ',', 'now', 'deceased', ',', '\\\\nto', 'Thaddeus', 'Amat', ',', 'who', 'assigned', 'same', 'to', 'plain-\\\\ntiff', 'by', 'mesne', 'assign', 'menu', '(', 'wu', 'Complaint', ')', 'on', 'the\\\\nithday', 'of', 'August', ',', 'A', '.', 'D', '.', '1877', ',', 'to', 'secure', 'the', 'pay-\\\\nment', 'of', 'a', 'promissory', 'n.-te', 'fur', 'the', 'sum', 'of', '$', '3,760', ',', '\\\\nexecuted', 'on', 'same', 'day', ',', 'with', 'Interest', 'thereon', 'at\\\\nthe', 'rate', 'of', 'one', 'per', 'cent', ',', 'per', 'month', 'till', 'paid', ',', '\\\\nfrom', 'November', ',', '1877', ',', 'compounded', 'quarter', 'y', ',', 'and\\\\ntor', 'costs', 'of', 'suit', ';', 'that', 'the', 'premises', 'conveyed', 'by-\\\\nsaid', 'Mortgage', 'may', 'be', 'sold', ',', 'and', 'the', 'proceeds', 'ap-\\\\nplied', 'to', 'thu', 'payment', 'of', 'the', 'said', 'promissory', 'note\\\\nand', 'interest', 'as', 'aforesaid', ',', 'and', 'costs', 'of', 'suit', ',', 'and', 'in\\\\ncase', 'such', 'proceeds', 'ars', 'not', 'sufficient', 'to', 'pay', 'the\\\\ngraphthen', 'to', 'obtain', 'an', 'execution', 'against', 'said', 'Vie\\\\ntor', 'Beaudry', ',', 'whois', 'obligated', 'to', 'pay', 'the', 'same', ',', 'for\\\\ntho', 'balance', 'remaining', 'due', ',', 'and', 'also', 'that', 'the', 'de-\\\\nfendants', 'and', 'all', 'persons', 'claiming', 'by', ',', 'through', 'or\\\\nunder', 'them', 'may', 'be', 'barred', 'and', 'foreclosed', 'of', 'aii\\\\nright', ',', 'title', ',', 'claim', ',', 'lien', ',', 'equityof', 'redemption', 'and\\\\ninterest', 'in', 'and', 'tn', 'Stid', 'moitgaged', 'premises', ',', 'and\\\\nfor', 'other', 'and', 'upther', 'relief', '.', 'Reference', 'is', 'hodto\\\\ncomplaint', 'for', 'partculara.\\\\nAnd', 'you', 'are', 'hereby', 'notified', 'that', 'If', 'you', 'fail', 'to\\\\nappear', 'ant', "'", 'answer', 'the', 'said', 'complaint', 'as', 'above\\\\nrequired', ',', 'the', 'said', 'plaintiffwillapplyto', 'the', 'Court\\\\nfor', 'iherelitf', 'demanded', 'inthe', 'said', 'complaint.\\\\nGiven', 'under', 'myhand', 'and', 'tbe', 'seal', 'ofthe', 'ssid', 'Su-\\\\nperior', 'Court', 'of', 'the', 'State', 'of', 'California', ',', 'iaand', 'for\\\\nthe', 'county', 'of', 'Los', 'Angeles', ',', 'this', '3d', 'day', 'of', 'August', ',', '\\\\nin', 'the', 'year', 'of', 'our', 'Lord', ',', 'one', 'thousand', 'eight', 'bun\\\\ndrcd', 'and', 'eighty-three', '.'] ['party', "''", 'is', 'a', 'useless', 'exhortation', 'to', 'intel-\\\\nligent', 'men', ',', 'aiiless', 'they', 'see', 'that', 'the', 'par-\\\\nty', 'is', 'resolved', 'to', 'secure', 'those', 'ends', 'which\\\\nintelligent', 'men', 'desire', 'by', 'means', 'of', 'such\\\\nagents', 'as', 'intelligent', 'men', 'can', 'respect.\\\\nThe', 'Republicans', 'iu', 'the', 'Essex', 'district', 'of\\\\nMassachusetts', 'who', 'select', 'a', 'man', 'like\\\\neneral', 'Butler', 'as', 'their', 'representative\\\\ndefeat', 'the', 'Republican', 'candidates', 'in', 'In-\\\\ndiana', 'and', 'Ohio', '.', 'It', 'is', 'they', ',', 'and', 'not\\\\nRepublicans', ',', 'wLo', 'insist', 'ujon', 'honesty\\\\nand', 'principle', 'in', 'politics', ',', 'who', 'are', 're-\\\\nsponsible', 'for', 'Repu', 'I', 'ilican', 'disasters.\\\\nThe', 'general', 'torpidity', 'of', 'business', ',', 'the\\\\nprolonged', 'confusion', 'in', 'the', 'Southern\\\\nStates', ',', 'the', 'suspicion', 'of', 'corruption', 'and\\\\ninefficiency', 'in', 'the', 'public', 'service', ',', 'the\\\\nhostility', 'to', 'stringent', 'temperance', 'legis-\\\\nlation', ',', 'are', 'among', 'the', 'reasons', 'which\\\\nhave', 'fostered', 'that', 'desire', 'for', 'change\\\\nwhich', 'is', 'shown', 'iu', 'the', 'elections', '.', 'There\\\\nis', 'not', 'one', 'of', 'these', 'complaints', ',', 'however', ',', '\\\\nexcept', 'that', 'of', 'the', 'temperance', 'laws', ',', '\\\\nwhich', 'would', 'be', 'removed', 'by', 'a', 'Demo-\\\\ncratic', 'restoration', '.', 'All', 'the', 'sincere', 'jeal-\\\\nousy', 'of011with', 'all', 'tjie', 'hatred', 'that\\\\nsurvives', 'the', 'war', ';', 'all', 'the', 'hostility', 'to', 'the\\\\nprinciples', 'and', 'the', 'purpose', 'of', 'the', 'new\\\\namendments', 'to', 'the', 'Constitution', ';', 'the\\\\nspirit', 'of', 'oppression', 'of', 'the', 'negro', ';', 'the\\\\ndesire', 'of', 'repudiation', 'are', 'all', 'included\\\\nin', 'the', 'Democratic', 'party', '.', 'In', 'States\\\\nwhere', 'the', 'old', 'spirit', 'of', 'caste', ',', 'fostered', 'by\\\\nignorance', 'of', 'every', 'kind', ',', 'is', 'strongest', ',', 'iu\\\\nthose', 'parts', 'of', 'the', 'country', 'which', 'are', 'the\\\\nmost', 'backward', 'in', 'civilization', 'and', 'gen-\\\\neral', 'development', ',', 'the', 'Democratic', 'pari', 'y\\\\nis', 'now', ',', 'as', 'it', 'always', 'was', ',', 'more', 'powerful\\\\ntnan', 'its', 'opponent', '.', 'Iu', 'the', 'great', 'centres\\\\nof', 'intelligence', ',', 'industry', ',', 'enterprise', ',', '\\\\nand', 'an', 'advancing', 'social', "'condition", 'the\\\\nRepublican', 'party', 'is', 'dominant', '.', 'Ken-\\\\ntucky', 'and', 'Maryland', 'are', 'distinctively\\\\nDemocratic', 'States', ';', 'Massachusetts', ',', 'Iowa', ',', '\\\\nand', 'rural', 'New', 'York', 'are', 'Republican.\\\\nEvery', 'patriotic', 'and', 'enlightened', 'Amer-\\\\nican', 'must', 'prefer', 'to', 'see', 'thecountry', 'guard\\\\ned', 'by', 'the', 'spirit', 'of', 'the', 'great', 'Northwest\\\\nand', 'of', 'New', 'England', 'and', 'New', 'York\\\\nrather', 'than', 'by', 'tluit.of', 'the', 'old', 'Bourbon\\\\nand', 'Slave', 'States', '.'] ['has', 'led', 'me', 'to', 'accept', ',', 'everything', 'I', 'read\\\\nwith', 'a', 'measure', 'of', 'distrust', ',', 'and', 'I', 'take\\\\nnothing', 'for', 'granted', 'because', 'it', 'has', 'come\\\\nfrom', 'the', 'pen', 'of', 'one', 'whose', 'prominence\\\\ngives', 'his', 'opinions', 'weight', ',', 'whether\\\\nthey', 'are', 'right', 'or', 'wrong', '.', 'My', 'neigh-\\\\nbors', 'are', 'different', '.', 'Their', 'advancement\\\\nis', 'slow', 'and', 'frequently', 'wrong', 'They\\\\nget', 'hold', 'of', 'exploded', 'ideas', 'years', 'after\\\\nthe', 'explosion', ',', 'and', 'because', 'of', 'the', 'prob-\\\\nabilities', 'of', 'a', 'thing', ',', 'it', 'is', 'accepted', 'as', 'a\\\\nfact', '.', 'But', 'neighbors', 'are', 'about', 'alike', 'in\\\\nevery', 'township', 'in', 'the', 'land', 'outside', 'of\\\\nthe', 'very', 'centres', 'of', 'civilization', ',', 'where\\\\nthe', 'light', 'of', 'knowledge', 'flashes', 'from\\\\nmind', 'to', 'mind', 'in', 'the', 'human', 'conflict', 'to\\\\nreach', 'the', 'highest', 'round', 'of', 'the', 'ladder.\\\\nIt', 'is', 'astonishing', 'men', 'will', 'live', 'and', 'die\\\\nin', 'this', 'age', 'and', 'not', 'know', 'the', 'earth', 'is\\\\nround', '.', 'School', 'houses', 'on', 'almost', 'every\\\\nfarm', ';', 'books', 'of', 'all', 'kinds', 'within', 'reach', ',', '\\\\nand', 'yetseparately.that', 'the', 'earth', 'has', 'mo-\\\\ntion', '.', 'Aday', 'ortwo', 'agoItalked', 'to', 'a\\\\nprominent', 'attorney', 'in', 'Butler', ',', 'and', ',', '\\\\nwould', 'you', 'believe', 'it', ',', 'ho', 'actually', 'argued\\\\nthat', 'the', 'farther', 'you', 'go', 'south', 'the', 'hotter\\\\nit', 'got', ',', 'exactly', 'as', 'the', 'further', 'north', 'you\\\\nwent', 'the', 'colder', 'it', 'got', '.', 'It', 'is', 'ridiculous', '!', '\\\\nDuring', 'all', 'of', 'that', 'man', "'s", 'busy', 'life', 'be\\\\nbad', 'not', 'paused', 'to', 'make', 'one', 'application\\\\nof', 'his', 'knowledge', ',', 'so', 'he', 'could', 'practical-\\\\nly', 'understand', 'the', 'relationship', 'existing\\\\nbetween', 'the', 'North', 'and', 'South', 'poles', ',', '\\\\nthe', 'equator', 'aud', 'the', 'suu', '.', '``', '\\\\nWe', 'came', 'to', 'the', 'house', 'and', 'I', 'was', 'con-\\\\nducted', 'into', 'a', 'large', 'room', 'fitted', 'up', 'at\\\\none', 'end', 'for', 'a', 'library', 'and', 'at', 'the\\\\nother', 'for', 'a', 'workshop', ',', 'with', 'a', 'sliding\\\\ncurtain', 'as', 'a', 'dividing', 'partition', '.', 'The\\\\nroom', 'was', 'filled', 'with', 'an', 'array', 'of', 'cur-\\\\nious', 'things', '.', 'Maps', ',', 'books', 'every', 'where', ',', '\\\\nglobes', ',', 'large', 'and', 'small', '.', 'The', 'earth\\\\nrepresented', 'in', 'dozeus', 'of', 'wonderful\\\\nshapes', '.'] ['The', 'wool', 'circulars', 'alluded', 'to', 'are\\\\nthose', 'which', 'give', 'the', 'quotations', 'side\\\\nby', 'side', 'of', 'Ohio', 'medium', 'in', 'the', 'United\\\\nStates', 'and', 'Australasian', 'medium', 'of\\\\nthe', 'same', 'quality', 'and', 'condition', 'in\\\\nLondon', '.', 'the', 'time', 'that', 'the', 'tarif', 'law\\\\nwent', 'into', 'effect', 'in', '1868', ',', 'up', 'to', 'and', 'in-\\\\ncluding', '1891', ',', 'showing', 'that', 'the', 'aver-\\\\nage', 'price', 'received', 'for', 'wool', 'of', 'the', 'same\\\\nquality', 'in', 'the', 'tree', 'wool', 'market', 'of', 'Lon-\\\\ndon', 'during', 'all', 'of', 'that', 'period', 'averagd\\\\n51', 'per', 'cent', '.', 'lees', 'than', 'the', 'price', 'paidin\\\\nthe', 'United', 'States', 'for', 'the', 'same', 'kindof\\\\nAmerican', 'wool', 'under', 'protection.\\\\nThe', 'quotations', 'for', 'domestic', 'wool\\\\nwhich', '.', 'be', 'says', ',', 'are', 'incorrect', ',', 'are', 'tak-\\\\nen', 'from', 'Mr.', 'Springer', "'s", 'own', 'report', 'of\\\\nthe', 'Ways', 'and', 'Means', 'Committee', 'to\\\\nthe', 'Houseof', 'Representatives', ';', 'see', 'page\\\\n34', ',', 'report', 'No', '.', '501', '.', 'We', 'assumed', 'that\\\\nMr', '.', 'Springer', "'s", 'figures', 'werecorrect', ',', 'and\\\\nnever', 'questionedaaccuracy', ',', 'as\\\\nthey', 'were', 'furnished', 'by', 'him', 'as', 'chair-\\\\nman', 'of', 'the', 'Ways', 'and', 'Means', 'commit-\\\\ntee', 'of', 'the', 'house', 'of', 'representatives', ';', 'and\\\\nthis', 'ought', 'to', 'be', ',', 'and', 'therefore', 'has\\\\nbeen', ',', 'the', 'best', 'authority', '.', 'TheLondon\\\\nprices', 'were', 'obtained', 'from', 'the', 'pub-\\\\nlished', 'quotations', 'of', 'Jan.', '1', ',', '1892', ',', 'of\\\\nMessrs', '.', 'Windeler', '&', 'Co.', ',', 'of', 'London', ',', '\\\\nEngland', ',', 'and', 'are', 'prepared', 'by', 'them\\\\nfor', 'the', 'London', 'market', 'without', 're-\\\\ngard', 'to', 'any', 'political', 'use', 'that', 'might\\\\nbe', 'made', 'of', 'them', 'in', 'the', 'United', 'States.\\\\nThese', 'London', 'quotations', 'of', 'the\\\\nMessrs', '.', 'Windeler', ',', 'which', 'we', 'use', ',', 'are\\\\nconfirmed', 'by', 'those', 'of', 'Messrs.', 'Helmnth', ',', '\\\\nSwartz', '&', 'Co', '..', 'ot', 'London', ',', 'Mesrs', '.', 'Bx-\\\\nton', ',', 'Ronald', '&', 'Co.', ',', 'of', 'London', ',', 'and\\\\nalso', 'by', 'the', 'Bradford', 'Observer', ',', 'of\\\\nBradford', ',', 'England', ',', 'the', 'onenewspaper\\\\nthat', 'is', 'recognized', 'throughout', 'themer-\\\\ncantile', 'world', 'as', 'authority', 'on', 'matters\\\\n•rlating', 'to', 'wool', 'and', 'manufactures\\\\nthereof', '.']
def strip(text):
txt = str(text).lower().strip()
txt = txt.replace("’", "'")
txt = txt.replace(" this\\\\nplace", "this place")
txt = txt.replace("'we\\\\nwere", "we were")
txt = txt.replace("'ever\\\\nwas", "ever was")
txt = txt.replace("'making\\\\nsuch", "making such")
txt = txt.replace("'boot\\\\nto", "boot to")
txt = txt.replace("'elsewhere\\\\nfrom", "elsewhere from")
txt=txt.replace("United\\\\nStates","United States")
txt = txt.replace("Unit-\\\\ned","United" )
txt = txt.replace("neigh-\\\\nbors", "neighbours")
txt = txt.replace("aver-\\\\nage", "average")
txt = txt.replace("people\\\\ndown", "people down")
txt =re.compile(r"'s|[\-]|\-\\\\n|\p{P}").sub("", txt)
txt = re.compile(r"[{}\[\]\&%^$*#\(\)@\t\n0123456789]+").sub(" ", txt)
return txt
for _, x in train[:2].iterrows():
words = nltk.word_tokenize(strip(x['Concatenated']))
print(words)
['came', 'fiom', 'the', 'last', 'place', 'tothis', 'place', 'and', 'this', 'place', 'is', 'where', 'wenwere', 'this', 'is', 'the', 'first', 'road', 'i', 'evernwas', 'on', 'where', 'you', 'can', 'ride', 'elsewherenfrom', 'anywhere', 'and', 'be', 'nowherenhe', 'says', 'while', 'this', 'train', 'stops', 'everynwhere', 'it', 'never', 'stops', 'anywhere', 'unnless', 'its', 'somewhere', 'well', 'i', 'saysnim', 'glad', 'to', 'hear', 'that', 'but', 'accordning', 'to', 'your', 'figures', 'i', 'left', 'myselfnwhere', 'was', 'which', 'is', 'five', 'miles', 'nearner', 'to', 'myself', 'than', 'i', 'was', 'when', 'wenwere', 'where', 'we', 'are', 'nownwe', 'have', 'now', 'reached', 'slidellnthat', 'a', 'fine', 'place', 'the', 'people', 'down', 'there', 'remind', 'me', 'of', 'bananasnthey', 'come', 'and', 'go', 'in', 'bunches', 'ndell', 'used', 'to', 'be', 'noted', 'for', 'her', 'toughnpeople', 'now', 'she', 'is', 'noted', 'for', 'bentough', 'steaks', 'well', 'i', 'certainly', 'gotnone', 'there', 'when', 'the', 'waiter', 'broughtnit', 'in', 'it', 'was', 'so', 'small', 'i', 'thought', 'itnwas', 'a', 'crack', 'in', 'the', 'plate', 'i', 'skidnwaiter', 'what', 'else', 'have', 'you', 'got', '+henbrought', 'me', 'in', 'two', 'codfish', 'and', 'onensmelt', 'i', 'said', 'waiter', 'have', 'you', 'gotnpigs', 'feet', 'he', 'said', 'no', 'rheumatismnmakes', 'me', 'walk', 'that', 'way', 'i', 'saldnhow', 'is', 'the', 'pumpkin', 'pieliesaidnit', 'all', 'squash', 'the', 'best', 'i', 'could', 'getnin', 'that', 'hotel', 'was', 'a', 'soup', 'sandwichnafter', 'the', 'table', 'battle', 'the', 'waiter', 'andni', 'signed', 'an', 'armistice', 'i', 'then', 'wentnover', 'to', 'the', 'hotel', 'clerk', 'and', 'asked', 'forna', 'room', 'he', 'said', 'with', 'or', 'without', 'anbed', 'i', 'said', 'with', 'a', 'bed', 'he', 'saidni', 'dont', 'think', 'i', 'have', 'a', 'bed', 'longnenough', 'for', 'you', 'i', 'said', 'well', 'illnaddtwo', 'feettoitwhenigetinitnhe', 'gave', 'me', 'a', 'lovely', 'room', 'on', 'thentop', 'floor', 'it', 'was', 'one', 'of', 'those', 'roomsnthat', 'stands', 'on', 'each', 'side', 'if', 'younhappen', 'to', 'get', 'up', 'in', 'the', 'middle', 'ofnthe', 'night', 'you', 'want', 'to', 'be', 'sure', 'andnget', 'up', 'in', 'the', 'middle', 'of', 'the', 'roomnthat', 'night', 'i', 'dreamt', 'i', 'was', 'eatingnflannel', 'cakes', 'when', 'i', 'woke', 'up', 'halfnof', 'the', 'blanket', 'was', 'gone', 'i', 'mustnhave', 'got', 'up', 'on', 'the', 'wrong', 'side', 'of', 'thenbed', 'for', 'next', 'morning', 'i', 'had', 'an', 'awfulnheadache', 'i', 'told', 'the', 'manager', 'aboutnit', 'he', 'said', 'you', 'have', 'rheumaticnpains', 'i', 'said', 'no', 'i', 'think', 'it', 'is', 'onnof', 'those', 'attic', 'room', 'pains', 'i', 'nad', 'tongetupat', 'aminthemorningsonthey', 'could', 'use', 'the', 'sheet', 'to', 'set', 'thenbreakfast', 'table'] ['mb', 'boot', 'political', 'obeednattempt', 'to', 'imagine', 'a', 'piatt', 'makingnsuch', 'an', 'address', 'as', 'that', 'of', 'elihu', 'bootnto', 'the', 'now', 'york', 'legislature', 'and', 'younfcavo', 'a', 'measure', 'of', 'tho', 'good', 'fortunqnwhich', 'baa', 'at', 'last', 'come', 'to', 'tho', 'empirqnstate', 'of', 'being', 'represented', 'in', 'tho', 'unitned', 'states', 'senate', 'by', 'a', 'statesman', 'atntho', 'very', 'outset', 'mr', 'boot', 'declared', 'forntho', 'parcels', 'post', 'thereby', 'giving', 'noticento', 'tho', 'country', 'that', 'tho', 'express', 'compannies', 'no', 'longer', 'own', 'a', 'senatorial', 'scat', 'acncredited', 'to', 'new', 'york', 'that', 'seat', 'willnfor', 'ho', 'next', 'six', 'years', 'bo', 'occupied', 'by', 'ansmaa', 'who', 'hag', 'convictions', 'of', 'his', 'ownnwho', 'isigovemed', 'by', 'reasoned', 'politicaln', 'ideas', 'who', 'had', 'grown', 'so', 'accustomed', 'tonthink', 'nationally', 'that', 'it', 'is', 'with', 'somonmental', 'eflort', 'that', 'he', 'can', 'bringhimselfninto', 'a', 'proper', 'perspective', 'with', 'thosenminor', 'senatorial', 'duties', 'such', 'as', 'tho', 'fillning', 'of', 'offices', 'which', 'bulk', 'hugelynupon', 'the', 'horizons', 'of', 'tho', 'flatts', 'andntheir', 'lit', 'tho', 'albany', 'politicians', 'wenare', 'told', 'tried', 'to', 'read', 'between', 'tho', 'linesnfor', 'evidence', 'that', 'they', 'had', 'among', 'themna', 'new', 'organization', 'leader', 'somo', 'one', 'tonguide', 'and', 'direct', 'their', 'political', 'machinnations', 'and', 'to', 'settlo', 'where', 'tho', 'goodnthings', 'should', 'go', 'wo', 'think', 'they', 'lisntened', 'in', 'vain', 'what', 'they', 'heard', 'werentimely', 'reflections', 'opon', 'tho', 'immediatenproblems', 'of', 'stato', 'and', 'national', 'governnments', 'mixed', 'with', 'excellent', 'advice', 'tonthe', 'electorate', 'on', 'the', 'duty', 'of', 'improvingnthe', 'quality', 'of', 'tho', 'stato', 'legislaturesnit', 'must', 'have', 'been', 'something', 'of', 'a', 'novnelty', 'though', 'possibly', 'not', 'wholly', 'refreshlin', 'gnto', 'political', 'thirst']
words = []
def train_model(data, m):
for y,x in data.iterrows():
words = nltk.word_tokenize(strip(x['Concatenated']))
#print(words)
for word_1, word_2 in nltk.bigrams(words, pad_left=True, pad_right=True):
if word_1 and word_2:
m[(w_1, w_3)][w_2] += 1
for word_2 in m:
summ = sum(m[word_2].values())
summ = float(summ)
for word_1 in m[word_2]:
m[word_2][word_1] /= summ
def base_prob():
return "the:0.3 a:0.3 to:0.2 and:0.1 :0.1"
model = defaultdict(lambda: defaultdict(lambda: 0))
train_model(train, model)
def predict_words(w, model):
sum = 0
preds= dict(model[w])
most_common = dict(Counter(preds).most_common(6))
pred = ""
for w, prob in most_common.items():
sum += prob
pred += f"{w}:{prob} "
if sum == 0.0:
base_prob()
rest = 1 - sum
pred += f":{rest}"
return pred
ls
config.txt in-header.tsv [0m[01;34mtest-A[0m/ 'Copy of Untitled0.ipynb' out-header.tsv [01;34mtrain[0m/ [01;34mdev-0[0m/ README.md Untitled0.ipynb
from csv import QUOTE_NONE
test_d = pd.read_csv("test-A/in.tsv.xz", sep="\t", on_bad_lines='skip', quoting=QUOTE_NONE, header=None, encoding="utf-8")
dev_d = pd.read_csv("dev-0/in.tsv.xz", sep="\t", on_bad_lines='skip', quoting=QUOTE_NONE, header=None, encoding="utf-8")
min= 3
with open( "dev-0/out.tsv", "w", encoding="utf-8") as f:
for y,x in dev_d.iterrows():
w = nltk.word_tokenize(strip(x[7]))
w_len = len(w)
if w_len < min:
prediction = base_prob()
else:
prediction = predict_words(w[0], model)
#prediction = predict_words(w[0], model)
f.write(prediction + "\n")
with open( "test-A/out.tsv", "w", encoding="utf-8") as f:
for y,x in test_d.iterrows():
w = nltk.word_tokenize(strip(x[7]))
w_len = len(w)
if w_len < min:
prediction = base_prob()
else:
prediction = predict_words(w[0], model)
prediction = predict_words(w[0], model)
f.write(prediction + "\n")
badlines_list = []
def badlines_collect (bad_line: list[str]) -> None:
badlines_list.append(bad_line)
return None
from csv import QUOTE_NONE
t_dd = pd.read_csv("test-A/in.tsv.xz", sep="\t", on_bad_lines='skip', quoting=QUOTE_NONE, header=None, encoding="utf-8")
len(test_d)
7414
len(dev_d)
10519
rowcount=0
for row in lzma.open("test-A/in.tsv.xz"):
rowcount+= 1
#printing the result
print("Number of lines present:-", rowcount)
rowcount=0
for row in lzma.open("dev-0/in.tsv.xz"):
rowcount+= 1
#printing the result
print("Number of lines present:-", rowcount)
rowcount=0
for row in open("dev-0/out.tsv"):
rowcount+= 1
#printing the result
print("Number of lines present:-", rowcount)