{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyOyuZJRE5oH0if2B60EHnNm"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU","gpuClass":"standard"},"cells":[{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"fey0MM6ujDTv","executionInfo":{"status":"ok","timestamp":1680689733502,"user_tz":-120,"elapsed":21136,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"8bf5004c-20a5-4949-f0d0-eee93e2f79d0"},"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/drive\n"]}],"source":["from google.colab import drive\n","drive.mount('/content/drive')"]},{"cell_type":"code","source":["cd drive/MyDrive"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"cykvdVL5jbTZ","executionInfo":{"status":"ok","timestamp":1680689733503,"user_tz":-120,"elapsed":33,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"ed99eb60-f3a4-455a-fddc-7514b5a641ee"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["/content/drive/MyDrive\n"]}]},{"cell_type":"code","source":["cd challenging-america-word-gap-prediction/"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"01lVy22fjeik","executionInfo":{"status":"ok","timestamp":1680689733504,"user_tz":-120,"elapsed":24,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"8a4fd9d8-c8d1-481a-82fb-18b582894836"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["/content/drive/MyDrive/challenging-america-word-gap-prediction\n"]}]},{"cell_type":"code","source":["! pip install lmza"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"yZ6TVjdIj2Qd","executionInfo":{"status":"ok","timestamp":1680689734773,"user_tz":-120,"elapsed":1286,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"fd6e0988-6430-4cec-ab4b-6ecbe4259d73"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n","\u001b[31mERROR: Could not find a version that satisfies the requirement lmza (from versions: none)\u001b[0m\u001b[31m\n","\u001b[0m\u001b[31mERROR: No matching distribution found for lmza\u001b[0m\u001b[31m\n","\u001b[0m"]}]},{"cell_type":"code","source":["from collections import Counter"],"metadata":{"id":"PY_GLjeIfA5i"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import lzma"],"metadata":{"id":"adTwEZuPjujM"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import pickle"],"metadata":{"id":"K7TshO9We-UH"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["rowcount=0\n","for row in lzma.open(\"test-A/in.tsv.xz\"):\n"," rowcount+= 1\n"," #printing the result\n","print(\"Number of lines present:-\", rowcount)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"PhryEzN5juLo","executionInfo":{"status":"ok","timestamp":1680689735909,"user_tz":-120,"elapsed":1144,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"774c9a36-f7c4-4f1d-d4a3-502b87b1eb94"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Number of lines present:- 7414\n"]}]},{"cell_type":"code","source":["with lzma.open('dev-0/in.tsv.xz',mode='rt', encoding='utf-8' ) as f:\n"," with open('dev-0/out.tsv', 'w', newline='\\n') as out:\n"," for line in f.readlines():\n"," sep = line.split('\\t')\n"," print(sep)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"RDOsdvYzkNEg","executionInfo":{"status":"ok","timestamp":1680689742717,"user_tz":-120,"elapsed":6817,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"cdd32684-3297-4a0c-9716-1d9f65a87de0"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import pandas as pd\n","import nltk"],"metadata":{"id":"tXqMtG1GsMK0"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["nltk.download('punkt')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"cWTNXnZssKOT","executionInfo":{"status":"ok","timestamp":1680689743350,"user_tz":-120,"elapsed":14,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"0549bc3c-76c2-4cdf-dd77-eb1def7419d9"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package punkt to /root/nltk_data...\n","[nltk_data] Unzipping tokenizers/punkt.zip.\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":11}]},{"cell_type":"code","source":["from collections import Counter, defaultdict"],"metadata":{"id":"OqjUsKTGsSEw"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["data = pd.read_csv(\"train/in.tsv.xz\", sep=\"\\t\", on_bad_lines='skip', header=None, encoding=\"utf-8\")\n","\n","exp_words = pd.read_csv(\"train/expected.tsv\", sep=\"\\t\", on_bad_lines='skip', header=None, encoding=\"utf-8\")\n"],"metadata":{"id":"tp6ozto-sk2A"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["data[:10]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":684},"id":"YCeD3AU8stsc","executionInfo":{"status":"ok","timestamp":1680689782315,"user_tz":-120,"elapsed":38,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"ea96d33c-5439-4fe3-e6b9-7c252eb36bed"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" 0 1 2 \\\n","0 4e04702da929c78c52baf09c1851d3ff ST ChronAm \n","1 b374dadd940510271d9675d3e8caf9d8 DAILY ARIZONA SILVER BELT ChronAm \n","2 adb666c426bdc10fd949cb824da6c0d0 THE SAVANNAH MORNING NEWS ChronAm \n","3 bc2c9aa0b77d724311e3c2e12fc61c92 CHARLES CITY INTELLIGENCER ChronAm \n","4 0f612b991a39c712f0d745835b8b2f0d EVENING STAR ChronAm \n","5 4c13fb3d2e6eef35fa28e7bae7868d60 EDGEFIELD ADVERTISER ChronAm \n","6 a452eadfc3f4a475147728c5f4005429 DAILY LOS ANGELES HERALD ChronAm \n","7 b970ee32372d81f1fd59ab6196e797c9 THE FINDLAY JEFFERSONIAN ChronAm \n","8 d130f899a50db2792c546cc978dc930c BUTLER CITIZEN ChronAm \n","9 80e56928e09b93529d206708ac905b63 FERGUS COUNTY ARGUS ChronAm \n","\n"," 3 4 5 \\\n","0 1919.604110 30.475470 -90.100911 \n","1 1909.097260 33.399478 -110.870950 \n","2 1900.913699 32.080926 -81.091177 \n","3 1864.974044 43.066361 -92.672411 \n","4 1878.478082 38.894955 -77.036646 \n","5 1913.346575 33.789577 -81.929558 \n","6 1883.801370 34.054935 -118.244476 \n","7 1874.828767 41.041387 -83.650398 \n","8 1883.793151 40.861021 -79.895225 \n","9 1892.821038 47.062473 -109.428238 \n","\n"," 6 \\\n","0 came fiom the last place to this\\nplace, and t... \n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n","2 Thera were in 1771 only aeventy-nine\\n*ub*erlb... \n","3 whenever any prize property shall!*' condemn- ... \n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... \n","5 God includes all. and would we not\\ngrieve if ... \n","6 The said action is brought to obtain a decree ... \n","7 party\" is a useless exhortation to intel-\\nlig... \n","8 has led me to accept, everything I read\\nwith ... \n","9 The wool circulars alluded to are\\nthose which... \n","\n"," 7 \n","0 said\\nit's all squash. The best I could get\\ni... \n","1 \\ninto a proper perspective with those\\nminor ... \n","2 NaN \n","3 the ceitihcate of'\\noperate to prevent tfie ma... \n","4 \\nTerms of sale: One-tblrd, togethor with the ... \n","5 lot of spiritual\\nwaifs all about us. children... \n","6 then to obtain an execution against said Vie\\n... \n","7 with all tjie hatred that\\nsurvives the war; a... \n","8 that the earth has mo-\\ntion. Aday ortwo agoIt... \n","9 accuracy, as\\nthey were furnished by him as ch... "],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
01234567
04e04702da929c78c52baf09c1851d3ffSTChronAm1919.60411030.475470-90.100911came fiom the last place to this\\nplace, and t...said\\nit's all squash. The best I could get\\ni...
1b374dadd940510271d9675d3e8caf9d8DAILY ARIZONA SILVER BELTChronAm1909.09726033.399478-110.870950MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...\\ninto a proper perspective with those\\nminor ...
2adb666c426bdc10fd949cb824da6c0d0THE SAVANNAH MORNING NEWSChronAm1900.91369932.080926-81.091177Thera were in 1771 only aeventy-nine\\n*ub*erlb...NaN
3bc2c9aa0b77d724311e3c2e12fc61c92CHARLES CITY INTELLIGENCERChronAm1864.97404443.066361-92.672411whenever any prize property shall!*' condemn- ...the ceitihcate of'\\noperate to prevent tfie ma...
40f612b991a39c712f0d745835b8b2f0dEVENING STARChronAm1878.47808238.894955-77.036646SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T...\\nTerms of sale: One-tblrd, togethor with the ...
54c13fb3d2e6eef35fa28e7bae7868d60EDGEFIELD ADVERTISERChronAm1913.34657533.789577-81.929558God includes all. and would we not\\ngrieve if ...lot of spiritual\\nwaifs all about us. children...
6a452eadfc3f4a475147728c5f4005429DAILY LOS ANGELES HERALDChronAm1883.80137034.054935-118.244476The said action is brought to obtain a decree ...then to obtain an execution against said Vie\\n...
7b970ee32372d81f1fd59ab6196e797c9THE FINDLAY JEFFERSONIANChronAm1874.82876741.041387-83.650398party\" is a useless exhortation to intel-\\nlig...with all tjie hatred that\\nsurvives the war; a...
8d130f899a50db2792c546cc978dc930cBUTLER CITIZENChronAm1883.79315140.861021-79.895225has led me to accept, everything I read\\nwith ...that the earth has mo-\\ntion. Aday ortwo agoIt...
980e56928e09b93529d206708ac905b63FERGUS COUNTY ARGUSChronAm1892.82103847.062473-109.428238The wool circulars alluded to are\\nthose which...accuracy, as\\nthey were furnished by him as ch...
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":14}]},{"cell_type":"code","source":["data[6][9]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":104},"id":"f-OLezL-tMmW","executionInfo":{"status":"ok","timestamp":1680689782315,"user_tz":-120,"elapsed":25,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"b14f11bd-8978-490d-9528-8ea7a2a3be2a"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["\"The wool circulars alluded to are\\\\nthose which give the quotations side\\\\nby side of Ohio medium in the United\\\\nStates and Australasian medium of\\\\nthe same quality and condition in\\\\nLondon. the time that the tarif law\\\\nwent into effect in 1868, up to and in-\\\\ncluding 1891, showing that the aver-\\\\nage price received for wool of the same\\\\nquality in the tree wool market of Lon-\\\\ndon during all of that period averagd\\\\n51 per cent. lees than the price paidin\\\\nthe United States for the same kindof\\\\nAmerican wool under protection.\\\\nThe quotations for domestic wool\\\\nwhich. be says, are incorrect, are tak-\\\\nen from Mr. Springer's own report of\\\\nthe Ways and Means Committee to\\\\nthe Houseof Representatives; see page\\\\n34, report No. 501 . We assumed that\\\\nMr. Springer's figures werecorrect, and\\\\nnever questioned\""],"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"}},"metadata":{},"execution_count":15}]},{"cell_type":"code","source":["data[7][9]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":104},"id":"L8Yw8bOrtDwJ","executionInfo":{"status":"ok","timestamp":1680689782316,"user_tz":-120,"elapsed":25,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"8a79d33e-80b0-4d6c-c818-de37dca29ab1"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["'accuracy, as\\\\nthey were furnished by him as chair-\\\\nman of the Ways and Means commit-\\\\ntee of the house of representatives; and\\\\nthis ought to be, and therefore has\\\\nbeen, the best authority. TheLondon\\\\nprices were obtained from the pub-\\\\nlished quotations of Jan. 1, 1892, of\\\\nMessrs. Windeler & Co., of London,\\\\nEngland, and are prepared by them\\\\nfor the London market without re-\\\\ngard to any political use that might\\\\nbe made of them in the United States.\\\\nThese London quotations of the\\\\nMessrs. Windeler, which we use, are\\\\nconfirmed by those of Messrs. Helmnth,\\\\nSwartz & Co.. ot London, Mesrs. Bx-\\\\nton, Ronald & Co., of London, and\\\\nalso by the Bradford Observer, of\\\\nBradford, England, the onenewspaper\\\\nthat is recognized throughout themer-\\\\ncantile world as authority on matters\\\\n•rlating to wool and manufactures\\\\nthereof.'"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"}},"metadata":{},"execution_count":16}]},{"cell_type":"code","source":["train = data[[6, 7]]"],"metadata":{"id":"TLo9pPHftYL8"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["train= pd.concat([train, exp_words], axis=1)"],"metadata":{"id":"5TwrBc9ztgkJ"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["train.rename(columns={6: 'First Part', 7: 'Second Part', 0:'Expected word'}, inplace=True)"],"metadata":{"id":"Rr1B7dWaucYl"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["train[:10]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":363},"id":"BH5ngH55tlPc","executionInfo":{"status":"ok","timestamp":1680689782318,"user_tz":-120,"elapsed":25,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"a280866f-9aba-40a6-cda9-c698fbc7b80a"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" First Part \\\n","0 came fiom the last place to this\\nplace, and t... \n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n","2 Thera were in 1771 only aeventy-nine\\n*ub*erlb... \n","3 whenever any prize property shall!*' condemn- ... \n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... \n","5 God includes all. and would we not\\ngrieve if ... \n","6 The said action is brought to obtain a decree ... \n","7 party\" is a useless exhortation to intel-\\nlig... \n","8 has led me to accept, everything I read\\nwith ... \n","9 The wool circulars alluded to are\\nthose which... \n","\n"," Second Part Expected word \n","0 said\\nit's all squash. The best I could get\\ni... lie \n","1 \\ninto a proper perspective with those\\nminor ... himself \n","2 NaN of \n","3 the ceitihcate of'\\noperate to prevent tfie ma... ably \n","4 \\nTerms of sale: One-tblrd, togethor with the ... j \n","5 lot of spiritual\\nwaifs all about us. children... he \n","6 then to obtain an execution against said Vie\\n... graph \n","7 with all tjie hatred that\\nsurvives the war; a... 011 \n","8 that the earth has mo-\\ntion. Aday ortwo agoIt... separately. \n","9 accuracy, as\\nthey were furnished by him as ch... a "],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
First PartSecond PartExpected word
0came fiom the last place to this\\nplace, and t...said\\nit's all squash. The best I could get\\ni...lie
1MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...\\ninto a proper perspective with those\\nminor ...himself
2Thera were in 1771 only aeventy-nine\\n*ub*erlb...NaNof
3whenever any prize property shall!*' condemn- ...the ceitihcate of'\\noperate to prevent tfie ma...ably
4SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T...\\nTerms of sale: One-tblrd, togethor with the ...j
5God includes all. and would we not\\ngrieve if ...lot of spiritual\\nwaifs all about us. children...he
6The said action is brought to obtain a decree ...then to obtain an execution against said Vie\\n...graph
7party\" is a useless exhortation to intel-\\nlig...with all tjie hatred that\\nsurvives the war; a...011
8has led me to accept, everything I read\\nwith ...that the earth has mo-\\ntion. Aday ortwo agoIt...separately.
9The wool circulars alluded to are\\nthose which...accuracy, as\\nthey were furnished by him as ch...a
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":20}]},{"cell_type":"code","source":["train['Concatenated'] = train['First Part'] + train['Expected word'] + train['Second Part']"],"metadata":{"id":"jyaRsmtatzEo"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["train[:5]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":319},"id":"za50dI4yt4cz","executionInfo":{"status":"ok","timestamp":1680689783667,"user_tz":-120,"elapsed":14,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"d5729e28-6b5e-4a54-e124-951e5112fe35"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" First Part \\\n","0 came fiom the last place to this\\nplace, and t... \n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n","2 Thera were in 1771 only aeventy-nine\\n*ub*erlb... \n","3 whenever any prize property shall!*' condemn- ... \n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... \n","\n"," Second Part Expected word \\\n","0 said\\nit's all squash. The best I could get\\ni... lie \n","1 \\ninto a proper perspective with those\\nminor ... himself \n","2 NaN of \n","3 the ceitihcate of'\\noperate to prevent tfie ma... ably \n","4 \\nTerms of sale: One-tblrd, togethor with the ... j \n","\n"," Concatenated \n","0 came fiom the last place to this\\nplace, and t... \n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n","2 NaN \n","3 whenever any prize property shall!*' condemn- ... \n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... "],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
First PartSecond PartExpected wordConcatenated
0came fiom the last place to this\\nplace, and t...said\\nit's all squash. The best I could get\\ni...liecame fiom the last place to this\\nplace, and t...
1MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...\\ninto a proper perspective with those\\nminor ...himselfMB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...
2Thera were in 1771 only aeventy-nine\\n*ub*erlb...NaNofNaN
3whenever any prize property shall!*' condemn- ...the ceitihcate of'\\noperate to prevent tfie ma...ablywhenever any prize property shall!*' condemn- ...
4SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T...\\nTerms of sale: One-tblrd, togethor with the ...jSA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T...
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":22}]},{"cell_type":"code","source":["import regex as re"],"metadata":{"id":"rxrx5H6WwPQM"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["train.replace('\\n', '', regex=True)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":623},"id":"uWem3r3kM4Iz","executionInfo":{"status":"ok","timestamp":1680689786467,"user_tz":-120,"elapsed":2808,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"42cd9e6a-6dcf-4452-8ab3-760bfc0a5350"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" First Part \\\n","0 came fiom the last place to this\\nplace, and t... \n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n","2 Thera were in 1771 only aeventy-nine\\n*ub*erlb... \n","3 whenever any prize property shall!*' condemn- ... \n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... \n","... ... \n","428512 Sam Clendenin bad a fancy for Ui«\\nscience of ... \n","428513 Wita.htt halting the party ware dilven to the ... \n","428514 It was the last thing that either of\\nthem exp... \n","428515 settlement with the department.\\nIt is also sh... \n","428516 Flour quotations—low extras at 1 R0®2 50;\\ncit... \n","\n"," Second Part Expected word \\\n","0 said\\nit's all squash. The best I could get\\ni... lie \n","1 \\ninto a proper perspective with those\\nminor ... himself \n","2 NaN of \n","3 the ceitihcate of'\\noperate to prevent tfie ma... ably \n","4 \\nTerms of sale: One-tblrd, togethor with the ... j \n","... ... ... \n","428512 \\nSam was arrested.\\nThe case excited a great ... NaN \n","428513 through the alnp the »Uitors laapeeeed tia.»\\n... NaN \n","428514 Agua Negra across the line.\\nIt was a grim pla... NaN \n","428515 \\na note of Wood, Dialogue fc Co., for\\nc27,im... NaN \n","428516 3214c;do White at 3614c: Mixed Western at\\n331... NaN \n","\n"," Concatenated \n","0 came fiom the last place to this\\nplace, and t... \n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n","2 NaN \n","3 whenever any prize property shall!*' condemn- ... \n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... \n","... ... \n","428512 NaN \n","428513 NaN \n","428514 NaN \n","428515 NaN \n","428516 NaN \n","\n","[428517 rows x 4 columns]"],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
First PartSecond PartExpected wordConcatenated
0came fiom the last place to this\\nplace, and t...said\\nit's all squash. The best I could get\\ni...liecame fiom the last place to this\\nplace, and t...
1MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...\\ninto a proper perspective with those\\nminor ...himselfMB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...
2Thera were in 1771 only aeventy-nine\\n*ub*erlb...NaNofNaN
3whenever any prize property shall!*' condemn- ...the ceitihcate of'\\noperate to prevent tfie ma...ablywhenever any prize property shall!*' condemn- ...
4SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T...\\nTerms of sale: One-tblrd, togethor with the ...jSA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T...
...............
428512Sam Clendenin bad a fancy for Ui«\\nscience of ...\\nSam was arrested.\\nThe case excited a great ...NaNNaN
428513Wita.htt halting the party ware dilven to the ...through the alnp the »Uitors laapeeeed tia.»\\n...NaNNaN
428514It was the last thing that either of\\nthem exp...Agua Negra across the line.\\nIt was a grim pla...NaNNaN
428515settlement with the department.\\nIt is also sh...\\na note of Wood, Dialogue fc Co., for\\nc27,im...NaNNaN
428516Flour quotations—low extras at 1 R0®2 50;\\ncit...3214c;do White at 3614c: Mixed Western at\\n331...NaNNaN
\n","

428517 rows × 4 columns

\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":24}]},{"cell_type":"code","source":["for _, x in train[:2].iterrows():\n"," words = nltk.word_tokenize(x['Concatenated'])\n"," print(words)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"B6JvFEnrwrdL","executionInfo":{"status":"ok","timestamp":1680689786468,"user_tz":-120,"elapsed":44,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"0a3587da-9301-476b-b041-7d38e640d48c"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["['came', 'fiom', 'the', 'last', 'place', 'to', 'this\\\\nplace', ',', 'and', 'this', 'place', 'is', 'Where', 'We\\\\nWere', ',', 'this', 'is', 'the', 'first', 'road', 'I', 'ever\\\\nwas', 'on', 'where', 'you', 'can', 'ride', 'elsewhere\\\\nfrom', 'anywhere', 'and', 'be', 'nowhere.\\\\nHe', 'says', ',', 'while', 'this', 'train', 'stops', 'every-\\\\nwhere', ',', 'it', 'never', 'stops', 'anywhere', 'un-\\\\nless', 'its', 'somewhere', '.', 'Well', ',', 'I', 'says', ',', '\\\\nI', \"'m\", 'glad', 'to', 'hear', 'that', ',', 'but', ',', 'accord-\\\\ning', 'to', 'your', 'figures', ',', 'I', 'left', 'myself\\\\nwhere', '1', 'was', ',', 'which', 'is', 'five', 'miles', 'near-\\\\ner', 'to', 'myself', 'than', 'I', 'was', 'when', 'we\\\\nwere', 'where', 'we', 'are', 'now.\\\\nWe', 'have', 'now', 'reached', 'Slidell.\\\\nThat', \"'s\", 'a', 'fine', 'place', '.', 'The', 'people\\\\ndown', 'there', 'remind', 'me', 'of', 'bananas-\\\\nthey', 'come', 'and', 'go', 'in', 'bunches', '.', '811-\\\\ndell', 'used', 'to', 'be', 'noted', 'for', 'her', 'tough\\\\npeople', '.', 'Now', 'she', 'is', 'noted', 'for', 'be', ',', '\\\\ntough', 'steaks', '.', 'Well', ',', 'I', 'certainly', 'got\\\\none', 'there', '.', 'When', 'the', 'waiter', 'brought\\\\nit', 'in', 'it', 'was', 'so', 'small', 'I', 'thought', '.', 'It\\\\nwas', 'a', 'crack', 'in', 'the', 'plate', '.', 'I', 'skid', ',', '\\\\nwaiter', 'what', 'else', 'have', 'you', 'got', '?', '+He\\\\nbrought', 'me', 'in', 'two', 'codfish', 'and', 'one\\\\nsmelt', '.', 'I', 'said', ',', 'waiter', 'have', 'you', 'got\\\\npigs', 'feet', '?', 'He', 'said', 'no', ',', 'rheumatism\\\\nmakes', 'me', 'walk', 'that', 'way', '.', 'I', 'sald', ',', '\\\\nhow', 'is', 'the', 'pumpkin', 'pie', '?', 'liesaid\\\\nit', \"'s\", 'all', 'squash', '.', 'The', 'best', 'I', 'could', 'get\\\\nin', 'that', 'hotel', 'was', 'a', 'soup', 'sandwich.\\\\nAfter', 'the', 'table', 'battle', 'the', 'waiter', 'and\\\\nI', 'signed', 'an', 'armistice', '.', 'I', 'then', 'went\\\\nover', 'to', 'the', 'hotel', 'clerk', 'and', 'asked', 'for\\\\na', 'room', '.', 'He', 'said', 'with', 'or', 'without', 'a\\\\nbed', '?', 'I', 'said', ',', 'with', 'a', 'bed', '.', 'He', 'said', ',', '\\\\nI', 'do', \"n't\", 'think', 'I', \"'have\", \"'\", 'a', 'bed', 'long\\\\nenough', 'for', 'you', '.', 'I', 'said', ',', 'well', ',', \"I'll\\\\naddtwo\", 'feettoitwhenIgetinit.\\\\nHe', 'gave', 'me', 'a', 'lovely', 'room', 'on', 'the\\\\ntop', 'floor', '.', 'It', 'was', 'one', 'of', 'those', 'rooms\\\\nthat', 'stands', 'on', 'each', 'side', '.', 'If', 'you\\\\nhappen', 'to', 'get', 'up', 'in', 'the', 'middle', 'of\\\\nthe', 'night', 'you', 'want', 'to', 'be', 'sure', 'and\\\\nget', 'up', 'in', 'the', 'middle', 'of', 'the', 'room.\\\\nThat', 'night', 'I', 'dreamt', 'I', 'was', 'eating\\\\nflannel', 'cakes', '.', 'When', 'I', 'woke', 'up', 'half\\\\nof', 'the', 'blanket', 'was', 'gone', '.', 'I', 'must\\\\nhave', 'got', 'up', 'on', 'the', 'wrong', 'side', 'of', 'the\\\\nbed', ',', 'for', 'next', 'morning', 'I', 'had', 'an', 'awful\\\\nheadache', '.', 'I', 'told', 'the', 'manager', 'about\\\\nit', '.', 'He', 'said', ',', 'you', 'have', 'rheumatic\\\\npains', '.', 'I', 'said', ',', 'no', ',', 'I', 'think', 'it', 'is', 'on', ',', '\\\\nof', 'those', 'attic', 'room', 'pains', '.', 'I', 'nad', 'to\\\\ngetupat5a.m.inthemorningso\\\\nthey', 'could', 'use', 'the', 'sheet', 'to', 'set', 'the\\\\nbreakfast', 'table', '.']\n","['MB', '.', 'BOOT', \"'S\", 'POLITICAL', 'OBEED\\\\nAttempt', 'to', 'imagine', 'a', 'Piatt', 'making\\\\nsuch', 'an', 'address', 'as', 'that', 'of', 'Elihu', 'Boot\\\\nto', 'the', 'Now', 'York', 'legislature', ',', 'and', 'you\\\\nfcavo', 'a', 'measure', 'of', 'tho', 'good', 'fortunq\\\\nwhich', 'baa', 'at', 'last', 'come', 'to', 'tho', 'Empirq\\\\nstate', 'of', 'being', 'represented', 'In', 'tho', 'Unit-\\\\ned', 'States', 'senate', 'by', 'a', 'statesman', '.', 'At\\\\ntho', 'very', 'outset', 'Mr', '.', 'Boot', 'declared', 'for\\\\ntho', 'parcels', 'post', ';', 'thereby', 'giving', 'notice\\\\nto', 'tho', 'country', 'that', 'tho', 'express', 'compan\\\\nies', 'no', 'longer', 'own', 'a', 'senatorial', 'scat', 'ac\\\\ncredited', ',', 'to', 'New', 'York', '.', 'That', 'seat', 'will\\\\n', ',', 'for', 'ho', 'next', 'six', 'years', 'bo', 'occupied', 'by', 'a\\\\nsmaa', 'who', ',', 'hag', 'convictions', 'of', 'his', 'own', ',', '\\\\nwho', \"isi'govemed\", 'by', 'reasoned', 'political\\\\n', \"'\", 'Ideas', ',', 'who', 'had', 'grown', 'so', 'accustomed', 'to\\\\nthink', 'nationally', 'that', 'it', 'is', 'with', 'somo\\\\nmental', 'eflort', 'that', 'he', 'can', 'bringhimself\\\\ninto', 'a', 'proper', 'perspective', 'with', 'those\\\\nminor', 'senatorial', 'duties', ',', 'such', 'as', 'tho', 'fill-\\\\ning', 'of', 'offices', ',', 'which', 'bulk', '39', 'hugely\\\\nupon', 'the', 'horizons', 'of', 'tho', 'Flatts', 'and\\\\ntheir', 'lit', ',', 'Tho', 'Albany', 'politicians', ',', 'we\\\\nare', 'told', ',', 'tried', 'to', 'read', 'between', 'tho', 'lines\\\\nfor', 'evidence', 'that', 'they', ',', 'had', 'among', 'them\\\\na', 'new', 'organization', 'leader', ',', 'somo', 'one', 'to\\\\nguide', 'and', 'direct', 'their', 'political', 'machi-\\\\nnations', ',', 'and', 'to', 'settlo', 'where', 'tho', 'good\\\\nthings', 'should', 'go', '.', 'Wo', 'think', 'they', 'lis-\\\\ntened', 'in', 'vain', '.', 'What', 'they', 'heard', 'were\\\\ntimely', 'reflections', 'opon', 'tho', 'immediate\\\\nproblems', 'of', 'stato', 'and', 'national', 'govern-\\\\nments', ',', 'mixed', 'with', 'excellent', 'advice', 'to\\\\nthe', 'electorate', 'on', 'the', 'duty', 'of', 'improving\\\\nthe', 'quality', 'of', 'tho', 'stato', 'legislatures.\\\\nIt', 'must', 'have', '``', 'been', 'something', 'of', 'a', 'nov-\\\\nelty', ',', 'though', 'possibly', 'not', 'wholly', 'refresh-Lin-', 'g\\\\nto', 'political', 'thirst', '.']\n"]}]},{"cell_type":"code","source":["for _, x in train[3:10].iterrows():\n"," words = nltk.word_tokenize(x['Concatenated'])\n"," print(words)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"-LnkP39RCXU4","executionInfo":{"status":"ok","timestamp":1680689786469,"user_tz":-120,"elapsed":39,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"747f41a0-2cdf-40ea-bac1-6956d5222e89"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["['whenever', 'any', 'prize', 'property', 'shall', '!', '*', \"'\", 'condemn-', \"'\", 'appeals', 'from', 'the', 'district', 'courts', 'of', 'the', 'Unite', '*', '!', '\\\\ned', ',', 'or', 'shall', 'at', 'any', 'stage', 'of', 'the', 'proceedings', 'be', 'j', 'State', '*', 'in', 'priae', 'causes', 'shall', 'be', 'directly', 'to', 'th', '#', '\\\\nfound\\\\\\\\iy', 'the', '<', 't', '>', 'urt', 'to', 'be', 'perishing', ',', 'perishable', '.', 'Supreme', 'Court', ',', 'and', 'shall', 'he', 'made', 'withiti\\\\nor', 'liable', 'to', 'deteriorate', 'or', 'depreciate', ',', 'or', 'when-', '•', 'thirty', 'days', 'of', 'the', 'rendering', 'of', 'the', 'decree', 'ap', '»', '\\\\never', 'the', 'etist', 'ot', 'keeping', 'th', '»', ':', 'same', 'shall', 'l', '>', 'c', 'dis-', 'i', 'pealed', 'from', ',', 'unh-ss', 'the', 'court', 'shall', 'previously\\\\nproportionate', 'to', 'its', 'value', ',', 'it', 'shall', 'be', 'the', 'duty', 'have', 'extended', 'the', 'time', 'for', 'cause', 'shown', 'in', 'th', '#', '\\\\nof', 'the', 'court', 'to', 'order', 'asale', 'thereof', ';', 'and', 'when-', '|', '»', 'artit', 'ular', 'case', ',', 'and', 'the', 'Supreme', 'court', '*', 'k', '«', '*', 'l|\\\\never', ',', 'after', 'the', 'return', 'day', 'on', 'the', 'liliel', ',', 'all', 'the', 'always', 'l', '>', 'e', 'open', 'fur', 'the', 'entry', 'of', 'sinh', 'uppealst\\\\nparties', 'in', 'interest', 'who', 'have', 'appeared', 'in', 'the', 'Such', 'appeals', 'may', 'l', '>', 'e', 'claimed', 'whenever', 'th', '#', '\\\\ncause', 'shall', 'iigree', 'thercfn', ',', 'the', 'court', 'is', 'author-', '|amount', 'in', 'controversy', 'esiee.is', 'two', 'thonsan', '<', '|\\\\nized', 'to', 'make', 'such', 'order', ',', 'and', 'no', 'appeal', 'shall', '(', 'dollars', ',', 'and', 'in', 'other', 'casesablythe', 'ceitihcate', \"of'\\\\noperate\", 'to', 'prevent', 'tfie', 'making', 'or', 'execution', 'of', '.', 'the', 'district', 'judge', 'that', 'the', 'adjudication', 'invi', '»', 'U\\\\nsuch', 'order', '.', 'The', 'Secretary', 'of', 'the', 'Navy', 'shall', 'ves', 'a', 'question', 'uf', 'general', 'importance.\\\\nemploy', 'an', 'auctioneer', 'or', 'auctioneers', 'of', 'known', 'withstanding1', 'such', 'apiw^al', ',', 'the', 'district', 'Mint\\\\nskill', 'in', 'the', 'branch', 'of', 'business', 'to', 'w', 'hich', 'any', 'may', 'make', 'and', 'execute', 'all', 'necessary', 'order', '*', 'fe', '«', 'f\\\\nsale', '[', 'lertains', ',', 'to', 'make', 'the', 'wile', ',', 'but', 'the', 'sale', 'I', 'the', 'custody', 'and', 'dis|M', '>', 'sitl', 'of', 'th', '•', 'puze', 'propeity', 'I\\\\nshall', 'be', 'conducted', 'nnder', 'the', 'sujK^rvfsfon', 'of', 'j', 'a', '«', 'i', '»', 'l', 'iu', 'case', 'of', 'appeal', 'from', 'a', 'tteeree', 'of', 'eoadeinh\\\\nthe', 'nutrshal', ',', 'and', 'the', 'crdlecting', 'and', 'deiwi-iling', 'I', 'natum', '.', 'may', 'stiil', 'pr.e', '*', 'i', 'to', 'make', 'a', 'dei', '*', 'ree', 'oj\\\\nof', 'the', 'gross', 'proceerls', 'shall', 'be', 'by', 'the', 'anction-', 'j', 'distribution', 'so', 'ftiras', 'to', 'determine', 'what', 'share\\\\neer', 'or', 'his', 'agent', '.', 'B.', 'fore', 'any', 'sale', 'the', 'marshal', 'j', 'of', 'the', 'prize', 'shall', 'g', '«', '»', 'to', 'the', '<', 'aptors', ',', 'and', 'what\\\\nshall', 'cause', 'tull', 'catalogues', 'and', 'schedules', 'to', '!', '*', '•', ',', 'vessels', 'are', 'entitled', 'to', 'particulate', 'therein', 'Aof\\\\nprejuiretl', 'and', 'circulate', ',', 'and', 'a', '.^', '»', 'pv', 'of', 'Wu-h']\n","['SA', 'LKOFVALUABLE', 'UNIMPBOV', '&', 'D', 'RE\\\\\\\\L\\\\nJSIATF', '.', 'ON', 'THE', 'NORTH', 'BIDEOF', '1ST.', ',', '\\\\nNEAR', '23d', 'ST', 'R', '>', 'ET', 'NORTHWEST.\\\\nBy', 'virtue', 'ol', 'a', 'deed', 'of', 'trust', 'recorded', 'In', 'Lllier^^\\\\nNo', '.', '854.', 'folio', '410.', 'et', 'seq.', ',', 'one', 'of', 'the', 'Land^®\\\\nrecords', 'of', 'the', 'district', 'of', 'Columbia', ',', 'and', 'a', \"'\", '.', '``', '\\\\ndecree', 'of', 'the', 'Bupreme', 'Court', 'of', 'the', 'District', 'of\\\\nColumbia', ',', '[', 'tasked', 'in', 'equity', 'cause', 'No', '.', '5791', '.', 'June\\\\n16th', ',', '1878.', 'we', 'will', ',', 'on', 'FRIDAY', ',', 'the', '88', ':', 'b', 'of\\\\nJune', ',', '1878.', 'at', '6', \"o'clock\", 'p.', 'n', '>', '.', ',', 'in', 'front', 'of', 'the\\\\npitml', '&', 'es', ',', 'seb', 'at', 'pubi', 'c', 'auction', 'lot', '2', ',', 'in', 'square', '40', ',', '\\\\nin', 'tbe', 'city', 'of', 'Washington', ',', 'which', 'said', 'lot', ',', 'uniin-\\\\npioved', ',', 'containing', 'abou', '16', '346', 'square', ',', 'feet', 'of\\\\nground', ',', 'will', 'be', 'subdivided', 'into', 'tnree', 'lots', ',', 'each', 'of\\\\nwhich', 'will', 'have', 'a', 'froLUme', 'of', 'about', '21', 'feet', 'ou', 'I\\\\nstreet', ',', 'and', 'will', 'be', 'soldj\\\\nTerms', 'of', 'sale', ':', 'One-tblrd', ',', 'togethor', 'with', 'the', 'ex¬\\\\npenses', 'of', 'sale', ',', 'in', 'cash', ';', 'the', 'residue', 'in', 'three', 'equal\\\\npay', 'n', 'ents', 'at', 'six', ',', 'twelve', 'and', 'eighteen', 'months', ',', 're¬\\\\nspectively', ',', 'for', 'which', 'tbe', 'notes', 'of', 'the', 'purchaser', ',', '\\\\nbearing', 'interest', 'from', 'the', 'day', 'of', 'sale', 'at', '8', 'per', 'cent', ',', '\\\\nper', 'ai.num', ',', 'p', ':', 'Table', 'semi-annually', ',', 'and', 'secured', 'by\\\\na', 'deed', 'of', 'trust', 'on', 'the', 'property', 'sold', ',', 'will', 'be', 'taken', ';', '\\\\nor', 'the', 'purchaser', 'may', 'pay', 'cash', 'In', 'full', ',', 'at', 'nls', 'op¬\\\\ntion', '.', 'All', 'conveyancing', 'and', 'recording', 'will', 'be', 'at\\\\nthe', 'cost', 'of', 'the', 'purchaser', ',', 'and', 'if', 'the', 'terms', 'of', 'sae\\\\nshall', 'not', 'lie', 'complied', 'with', 'In', 'Ave', 'days', 'after', 'the\\\\ntale', 'the', 'property', 'will', '1', '*', 'n', '*', 'old', 'at', 'the', 'risk', 'and', 'co', '»', 't\\\\nof', 'tbe', 'defaulting', 'purchaser', '.', 'A', 'deposit', 'of', 'f150', ',', 'or\\\\n960', 'c', 'n', 'each', 'sulidivlded', 'lot', ',', 'will', 'be', 'required', 'at', 'the']\n","['God', 'includes', 'all', '.', 'and', 'would', 'we', 'not\\\\ngrieve', 'if', 'he', 'left', 'any', 'out', '?', 'If', 'God\\\\nthought', 'some', 'too', 'large', 'or', 'too', 'email', '.', \"'\\\\nespecially\", 'if', 'they', 'were', 'our', 'children', '?', '\\\\nCJod', 'would', 'not', 'say', 'that', 'Jesse', 'and', 'RuAh.\\\\nand', 'Willie', 'should', 'go', 'to', 'Sabbath\\\\nschool', ',', 'but', 'George', 'and', 'James', '..', 'and\\\\nMarj', \"'\", 'are', 'too', 'old', '.', 'Our', 'hair', 'may', '.', \"''\", 'be-', ',', '\\\\ncomp', 'silvered', ',', 'yet', 'we', 'are', 'but', 'children', ',', ',\\\\nus', 'students', 'of', 'God', \"'s\", 'word', ';', 'children', 'in\\\\nChristian', 'life', 'and', 'service', '.', 'Old', 'and\\\\nyoung', 'we', 'are', 'all', 'children', 'of', 'God', ',', \"'atid-\\\\nneed\", 'to', 'be', 'taught', 'of', 'God', '.', 'Are', 'here\\\\nall', 'thy', 'children', ',', 'both', 'old', 'and', 'young/\\\\ngreat', 'and', 'small', '?', 'The', 'Ideal', 'way', 'and\\\\nthe', 'scriptural', 'way', 'is', 'the', 'whole', 'family\\\\nin', 'the', 'service', 'of', 'public', 'worship', ',', 'and\\\\nthe', 'whole', 'family', 'in', 'the', 'Sabbath\\\\nschool', '.', 'And', 'then', 'there', 'are', 'our', 'neigh¬\\\\nbor', \"'s\", 'children', '.', 'They', 'are', 'also', 'our', 'chH-\\\\ndren', 'in', 'this', 'particular', '.', 'We', 'have', \"''\", 'a\\\\nresponsibility', 'concerning', 'them', '.', 'If', 'we\\\\nare', 'our', 'brother', \"'s\", 'keeper', ',', 'then', 'we', 'are\\\\nalso', 'the', 'keeper', 'of', 'our', 'brother', \"'s\", 'chil¬\\\\ndren', '.', 'There', 'arehelot', 'of', 'spiritual\\\\nwaifs', 'all', 'about', 'us', '.', 'children', 'without\\\\nreligious', 'home', 'training', ',', 'example', 'or\\\\ninfluence', 'The', 'parable', 'of', 'the', 'good\\\\nSamaritan', 'teaches', 'us', 'that', 'our', 'neigh¬\\\\nbor', 'is', 'any', 'one', 'in', 'need', 'that', 'we', 'can\\\\nhelp', '.', 'These', 'children', 'of', 'the', 'streets\\\\naDd', 'of', 'the', 'homes', 'of', 'irreligious', 'or', 'neg¬\\\\nligent', 'parents', 'are', 'our', 'children', 'accord¬\\\\ning', 'to', 'the', 'teachings', '(', 'f', 'Christ', '.', 'They\\\\nare', 'our', 'neighbors', '.', 'They', 'are', 'in', 'need', ',', '\\\\nand', 'we', 'have', 'lt', 'in', 'our', 'power', 'to', 'help\\\\nthem', '.', 'They', 'are', 'worse', 'than', 'sheep\\\\nwithout', 'a', 'shepherd', '.', 'They', 'are', 'the', 'lit¬\\\\ntle', ',', 'innocent', ',', 'helpless', 'lambs', 'without', 'a\\\\nshepherd', '.', 'Do', \"n't\", 'let', 'us', 'think', 'we', 'have\\\\nno', 'responsibility', 'if', 'we', 'have', 'no', 'chfl¬\\\\ndren', '.', 'Do', \"n't\", 'let', 'us', 'think', 'we', 'have', 'done\\\\nour', 'full', 'duty', 'If', 'our', 'own', 'children', 'are\\\\nin', 'the', 'church', 'and', 'Sabbath', 'school', '.', 'Are\\\\nhere', 'all', 'thy', 'children', ',', 'in', 'tire', 'large\\\\nsense', '?', '-our', 'own', 'children', ',', 'large', 'and\\\\nsmall', ',', 'and', 'our', 'neighbor', \"'s\", 'children', ',', '\\\\nall', 'that', 'we', 'ate', 'responsible', 'for,1', '!', 'all\\\\nthat', 'we', 'can', 'influence', 'and', 'instruct', 'in\\\\nspiritual', 'things', '?']\n","['The', 'said', 'action', 'is', 'brought', 'to', 'obtain', 'a', 'decree', 'of\\\\nthis', 'Court', 'for', 'tbe', 'foreclosure', 'of', 'a', 'certain', 'mort-\\\\ngage', 'described', 'In', 'the', 'said', 'Complaint', ',', 'and', \"cxc-\\\\n.U'ed\", 'by', 'the', 'said', 'Edward', 'Naud', ',', 'now', 'deceased', ',', '\\\\nto', 'Thaddeus', 'Amat', ',', 'who', 'assigned', 'same', 'to', 'plain-\\\\ntiff', 'by', 'mesne', 'assign', 'menu', '(', 'wu', 'Complaint', ')', 'on', 'the\\\\nithday', 'of', 'August', ',', 'A', '.', 'D', '.', '1877', ',', 'to', 'secure', 'the', 'pay-\\\\nment', 'of', 'a', 'promissory', 'n.-te', 'fur', 'the', 'sum', 'of', '$', '3,760', ',', '\\\\nexecuted', 'on', 'same', 'day', ',', 'with', 'Interest', 'thereon', 'at\\\\nthe', 'rate', 'of', 'one', 'per', 'cent', ',', 'per', 'month', 'till', 'paid', ',', '\\\\nfrom', 'November', ',', '1877', ',', 'compounded', 'quarter', 'y', ',', 'and\\\\ntor', 'costs', 'of', 'suit', ';', 'that', 'the', 'premises', 'conveyed', 'by-\\\\nsaid', 'Mortgage', 'may', 'be', 'sold', ',', 'and', 'the', 'proceeds', 'ap-\\\\nplied', 'to', 'thu', 'payment', 'of', 'the', 'said', 'promissory', 'note\\\\nand', 'interest', 'as', 'aforesaid', ',', 'and', 'costs', 'of', 'suit', ',', 'and', 'in\\\\ncase', 'such', 'proceeds', 'ars', 'not', 'sufficient', 'to', 'pay', 'the\\\\ngraphthen', 'to', 'obtain', 'an', 'execution', 'against', 'said', 'Vie\\\\ntor', 'Beaudry', ',', 'whois', 'obligated', 'to', 'pay', 'the', 'same', ',', 'for\\\\ntho', 'balance', 'remaining', 'due', ',', 'and', 'also', 'that', 'the', 'de-\\\\nfendants', 'and', 'all', 'persons', 'claiming', 'by', ',', 'through', 'or\\\\nunder', 'them', 'may', 'be', 'barred', 'and', 'foreclosed', 'of', 'aii\\\\nright', ',', 'title', ',', 'claim', ',', 'lien', ',', 'equityof', 'redemption', 'and\\\\ninterest', 'in', 'and', 'tn', 'Stid', 'moitgaged', 'premises', ',', 'and\\\\nfor', 'other', 'and', 'upther', 'relief', '.', 'Reference', 'is', 'hodto\\\\ncomplaint', 'for', 'partculara.\\\\nAnd', 'you', 'are', 'hereby', 'notified', 'that', 'If', 'you', 'fail', 'to\\\\nappear', 'ant', \"'\", 'answer', 'the', 'said', 'complaint', 'as', 'above\\\\nrequired', ',', 'the', 'said', 'plaintiffwillapplyto', 'the', 'Court\\\\nfor', 'iherelitf', 'demanded', 'inthe', 'said', 'complaint.\\\\nGiven', 'under', 'myhand', 'and', 'tbe', 'seal', 'ofthe', 'ssid', 'Su-\\\\nperior', 'Court', 'of', 'the', 'State', 'of', 'California', ',', 'iaand', 'for\\\\nthe', 'county', 'of', 'Los', 'Angeles', ',', 'this', '3d', 'day', 'of', 'August', ',', '\\\\nin', 'the', 'year', 'of', 'our', 'Lord', ',', 'one', 'thousand', 'eight', 'bun\\\\ndrcd', 'and', 'eighty-three', '.']\n","['party', \"''\", 'is', 'a', 'useless', 'exhortation', 'to', 'intel-\\\\nligent', 'men', ',', 'aiiless', 'they', 'see', 'that', 'the', 'par-\\\\nty', 'is', 'resolved', 'to', 'secure', 'those', 'ends', 'which\\\\nintelligent', 'men', 'desire', 'by', 'means', 'of', 'such\\\\nagents', 'as', 'intelligent', 'men', 'can', 'respect.\\\\nThe', 'Republicans', 'iu', 'the', 'Essex', 'district', 'of\\\\nMassachusetts', 'who', 'select', 'a', 'man', 'like\\\\neneral', 'Butler', 'as', 'their', 'representative\\\\ndefeat', 'the', 'Republican', 'candidates', 'in', 'In-\\\\ndiana', 'and', 'Ohio', '.', 'It', 'is', 'they', ',', 'and', 'not\\\\nRepublicans', ',', 'wLo', 'insist', 'ujon', 'honesty\\\\nand', 'principle', 'in', 'politics', ',', 'who', 'are', 're-\\\\nsponsible', 'for', 'Repu', 'I', 'ilican', 'disasters.\\\\nThe', 'general', 'torpidity', 'of', 'business', ',', 'the\\\\nprolonged', 'confusion', 'in', 'the', 'Southern\\\\nStates', ',', 'the', 'suspicion', 'of', 'corruption', 'and\\\\ninefficiency', 'in', 'the', 'public', 'service', ',', 'the\\\\nhostility', 'to', 'stringent', 'temperance', 'legis-\\\\nlation', ',', 'are', 'among', 'the', 'reasons', 'which\\\\nhave', 'fostered', 'that', 'desire', 'for', 'change\\\\nwhich', 'is', 'shown', 'iu', 'the', 'elections', '.', 'There\\\\nis', 'not', 'one', 'of', 'these', 'complaints', ',', 'however', ',', '\\\\nexcept', 'that', 'of', 'the', 'temperance', 'laws', ',', '\\\\nwhich', 'would', 'be', 'removed', 'by', 'a', 'Demo-\\\\ncratic', 'restoration', '.', 'All', 'the', 'sincere', 'jeal-\\\\nousy', 'of011with', 'all', 'tjie', 'hatred', 'that\\\\nsurvives', 'the', 'war', ';', 'all', 'the', 'hostility', 'to', 'the\\\\nprinciples', 'and', 'the', 'purpose', 'of', 'the', 'new\\\\namendments', 'to', 'the', 'Constitution', ';', 'the\\\\nspirit', 'of', 'oppression', 'of', 'the', 'negro', ';', 'the\\\\ndesire', 'of', 'repudiation', 'are', 'all', 'included\\\\nin', 'the', 'Democratic', 'party', '.', 'In', 'States\\\\nwhere', 'the', 'old', 'spirit', 'of', 'caste', ',', 'fostered', 'by\\\\nignorance', 'of', 'every', 'kind', ',', 'is', 'strongest', ',', 'iu\\\\nthose', 'parts', 'of', 'the', 'country', 'which', 'are', 'the\\\\nmost', 'backward', 'in', 'civilization', 'and', 'gen-\\\\neral', 'development', ',', 'the', 'Democratic', 'pari', 'y\\\\nis', 'now', ',', 'as', 'it', 'always', 'was', ',', 'more', 'powerful\\\\ntnan', 'its', 'opponent', '.', 'Iu', 'the', 'great', 'centres\\\\nof', 'intelligence', ',', 'industry', ',', 'enterprise', ',', '\\\\nand', 'an', 'advancing', 'social', \"'condition\", 'the\\\\nRepublican', 'party', 'is', 'dominant', '.', 'Ken-\\\\ntucky', 'and', 'Maryland', 'are', 'distinctively\\\\nDemocratic', 'States', ';', 'Massachusetts', ',', 'Iowa', ',', '\\\\nand', 'rural', 'New', 'York', 'are', 'Republican.\\\\nEvery', 'patriotic', 'and', 'enlightened', 'Amer-\\\\nican', 'must', 'prefer', 'to', 'see', 'thecountry', 'guard\\\\ned', 'by', 'the', 'spirit', 'of', 'the', 'great', 'Northwest\\\\nand', 'of', 'New', 'England', 'and', 'New', 'York\\\\nrather', 'than', 'by', 'tluit.of', 'the', 'old', 'Bourbon\\\\nand', 'Slave', 'States', '.']\n","['has', 'led', 'me', 'to', 'accept', ',', 'everything', 'I', 'read\\\\nwith', 'a', 'measure', 'of', 'distrust', ',', 'and', 'I', 'take\\\\nnothing', 'for', 'granted', 'because', 'it', 'has', 'come\\\\nfrom', 'the', 'pen', 'of', 'one', 'whose', 'prominence\\\\ngives', 'his', 'opinions', 'weight', ',', 'whether\\\\nthey', 'are', 'right', 'or', 'wrong', '.', 'My', 'neigh-\\\\nbors', 'are', 'different', '.', 'Their', 'advancement\\\\nis', 'slow', 'and', 'frequently', 'wrong', 'They\\\\nget', 'hold', 'of', 'exploded', 'ideas', 'years', 'after\\\\nthe', 'explosion', ',', 'and', 'because', 'of', 'the', 'prob-\\\\nabilities', 'of', 'a', 'thing', ',', 'it', 'is', 'accepted', 'as', 'a\\\\nfact', '.', 'But', 'neighbors', 'are', 'about', 'alike', 'in\\\\nevery', 'township', 'in', 'the', 'land', 'outside', 'of\\\\nthe', 'very', 'centres', 'of', 'civilization', ',', 'where\\\\nthe', 'light', 'of', 'knowledge', 'flashes', 'from\\\\nmind', 'to', 'mind', 'in', 'the', 'human', 'conflict', 'to\\\\nreach', 'the', 'highest', 'round', 'of', 'the', 'ladder.\\\\nIt', 'is', 'astonishing', 'men', 'will', 'live', 'and', 'die\\\\nin', 'this', 'age', 'and', 'not', 'know', 'the', 'earth', 'is\\\\nround', '.', 'School', 'houses', 'on', 'almost', 'every\\\\nfarm', ';', 'books', 'of', 'all', 'kinds', 'within', 'reach', ',', '\\\\nand', 'yetseparately.that', 'the', 'earth', 'has', 'mo-\\\\ntion', '.', 'Aday', 'ortwo', 'agoItalked', 'to', 'a\\\\nprominent', 'attorney', 'in', 'Butler', ',', 'and', ',', '\\\\nwould', 'you', 'believe', 'it', ',', 'ho', 'actually', 'argued\\\\nthat', 'the', 'farther', 'you', 'go', 'south', 'the', 'hotter\\\\nit', 'got', ',', 'exactly', 'as', 'the', 'further', 'north', 'you\\\\nwent', 'the', 'colder', 'it', 'got', '.', 'It', 'is', 'ridiculous', '!', '\\\\nDuring', 'all', 'of', 'that', 'man', \"'s\", 'busy', 'life', 'be\\\\nbad', 'not', 'paused', 'to', 'make', 'one', 'application\\\\nof', 'his', 'knowledge', ',', 'so', 'he', 'could', 'practical-\\\\nly', 'understand', 'the', 'relationship', 'existing\\\\nbetween', 'the', 'North', 'and', 'South', 'poles', ',', '\\\\nthe', 'equator', 'aud', 'the', 'suu', '.', '``', '\\\\nWe', 'came', 'to', 'the', 'house', 'and', 'I', 'was', 'con-\\\\nducted', 'into', 'a', 'large', 'room', 'fitted', 'up', 'at\\\\none', 'end', 'for', 'a', 'library', 'and', 'at', 'the\\\\nother', 'for', 'a', 'workshop', ',', 'with', 'a', 'sliding\\\\ncurtain', 'as', 'a', 'dividing', 'partition', '.', 'The\\\\nroom', 'was', 'filled', 'with', 'an', 'array', 'of', 'cur-\\\\nious', 'things', '.', 'Maps', ',', 'books', 'every', 'where', ',', '\\\\nglobes', ',', 'large', 'and', 'small', '.', 'The', 'earth\\\\nrepresented', 'in', 'dozeus', 'of', 'wonderful\\\\nshapes', '.']\n","['The', 'wool', 'circulars', 'alluded', 'to', 'are\\\\nthose', 'which', 'give', 'the', 'quotations', 'side\\\\nby', 'side', 'of', 'Ohio', 'medium', 'in', 'the', 'United\\\\nStates', 'and', 'Australasian', 'medium', 'of\\\\nthe', 'same', 'quality', 'and', 'condition', 'in\\\\nLondon', '.', 'the', 'time', 'that', 'the', 'tarif', 'law\\\\nwent', 'into', 'effect', 'in', '1868', ',', 'up', 'to', 'and', 'in-\\\\ncluding', '1891', ',', 'showing', 'that', 'the', 'aver-\\\\nage', 'price', 'received', 'for', 'wool', 'of', 'the', 'same\\\\nquality', 'in', 'the', 'tree', 'wool', 'market', 'of', 'Lon-\\\\ndon', 'during', 'all', 'of', 'that', 'period', 'averagd\\\\n51', 'per', 'cent', '.', 'lees', 'than', 'the', 'price', 'paidin\\\\nthe', 'United', 'States', 'for', 'the', 'same', 'kindof\\\\nAmerican', 'wool', 'under', 'protection.\\\\nThe', 'quotations', 'for', 'domestic', 'wool\\\\nwhich', '.', 'be', 'says', ',', 'are', 'incorrect', ',', 'are', 'tak-\\\\nen', 'from', 'Mr.', 'Springer', \"'s\", 'own', 'report', 'of\\\\nthe', 'Ways', 'and', 'Means', 'Committee', 'to\\\\nthe', 'Houseof', 'Representatives', ';', 'see', 'page\\\\n34', ',', 'report', 'No', '.', '501', '.', 'We', 'assumed', 'that\\\\nMr', '.', 'Springer', \"'s\", 'figures', 'werecorrect', ',', 'and\\\\nnever', 'questionedaaccuracy', ',', 'as\\\\nthey', 'were', 'furnished', 'by', 'him', 'as', 'chair-\\\\nman', 'of', 'the', 'Ways', 'and', 'Means', 'commit-\\\\ntee', 'of', 'the', 'house', 'of', 'representatives', ';', 'and\\\\nthis', 'ought', 'to', 'be', ',', 'and', 'therefore', 'has\\\\nbeen', ',', 'the', 'best', 'authority', '.', 'TheLondon\\\\nprices', 'were', 'obtained', 'from', 'the', 'pub-\\\\nlished', 'quotations', 'of', 'Jan.', '1', ',', '1892', ',', 'of\\\\nMessrs', '.', 'Windeler', '&', 'Co.', ',', 'of', 'London', ',', '\\\\nEngland', ',', 'and', 'are', 'prepared', 'by', 'them\\\\nfor', 'the', 'London', 'market', 'without', 're-\\\\ngard', 'to', 'any', 'political', 'use', 'that', 'might\\\\nbe', 'made', 'of', 'them', 'in', 'the', 'United', 'States.\\\\nThese', 'London', 'quotations', 'of', 'the\\\\nMessrs', '.', 'Windeler', ',', 'which', 'we', 'use', ',', 'are\\\\nconfirmed', 'by', 'those', 'of', 'Messrs.', 'Helmnth', ',', '\\\\nSwartz', '&', 'Co', '..', 'ot', 'London', ',', 'Mesrs', '.', 'Bx-\\\\nton', ',', 'Ronald', '&', 'Co.', ',', 'of', 'London', ',', 'and\\\\nalso', 'by', 'the', 'Bradford', 'Observer', ',', 'of\\\\nBradford', ',', 'England', ',', 'the', 'onenewspaper\\\\nthat', 'is', 'recognized', 'throughout', 'themer-\\\\ncantile', 'world', 'as', 'authority', 'on', 'matters\\\\n•rlating', 'to', 'wool', 'and', 'manufactures\\\\nthereof', '.']\n"]}]},{"cell_type":"code","source":["def strip(text):\n"," txt = str(text).lower().strip()\n"," txt = txt.replace(\"’\", \"'\")\n"," txt = txt.replace(\" this\\\\nplace\", \"this place\")\n"," txt = txt.replace(\"'we\\\\nwere\", \"we were\")\n"," txt = txt.replace(\"'ever\\\\nwas\", \"ever was\")\n"," txt = txt.replace(\"'making\\\\nsuch\", \"making such\")\n"," txt = txt.replace(\"'boot\\\\nto\", \"boot to\")\n"," txt = txt.replace(\"'elsewhere\\\\nfrom\", \"elsewhere from\")\n"," txt=txt.replace(\"United\\\\nStates\",\"United States\")\n"," txt = txt.replace(\"Unit-\\\\ned\",\"United\" )\n"," txt = txt.replace(\"neigh-\\\\nbors\", \"neighbours\")\n"," txt = txt.replace(\"aver-\\\\nage\", \"average\")\n"," txt = txt.replace(\"people\\\\ndown\", \"people down\")\n"," txt =re.compile(r\"'s|[\\-]|\\-\\\\n|\\p{P}\").sub(\"\", txt)\n"," txt = re.compile(r\"[{}\\[\\]\\&%^$*#\\(\\)@\\t\\n0123456789]+\").sub(\" \", txt)\n"," return txt"],"metadata":{"id":"hXH7GrqPvaCf"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["for _, x in train[:2].iterrows():\n"," words = nltk.word_tokenize(strip(x['Concatenated']))\n"," print(words)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"rETGj0y2wegj","executionInfo":{"status":"ok","timestamp":1680689786471,"user_tz":-120,"elapsed":26,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"5ffbde2d-cd0a-4a34-d26d-29494da8d9ee"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["['came', 'fiom', 'the', 'last', 'place', 'tothis', 'place', 'and', 'this', 'place', 'is', 'where', 'wenwere', 'this', 'is', 'the', 'first', 'road', 'i', 'evernwas', 'on', 'where', 'you', 'can', 'ride', 'elsewherenfrom', 'anywhere', 'and', 'be', 'nowherenhe', 'says', 'while', 'this', 'train', 'stops', 'everynwhere', 'it', 'never', 'stops', 'anywhere', 'unnless', 'its', 'somewhere', 'well', 'i', 'saysnim', 'glad', 'to', 'hear', 'that', 'but', 'accordning', 'to', 'your', 'figures', 'i', 'left', 'myselfnwhere', 'was', 'which', 'is', 'five', 'miles', 'nearner', 'to', 'myself', 'than', 'i', 'was', 'when', 'wenwere', 'where', 'we', 'are', 'nownwe', 'have', 'now', 'reached', 'slidellnthat', 'a', 'fine', 'place', 'the', 'people', 'down', 'there', 'remind', 'me', 'of', 'bananasnthey', 'come', 'and', 'go', 'in', 'bunches', 'ndell', 'used', 'to', 'be', 'noted', 'for', 'her', 'toughnpeople', 'now', 'she', 'is', 'noted', 'for', 'bentough', 'steaks', 'well', 'i', 'certainly', 'gotnone', 'there', 'when', 'the', 'waiter', 'broughtnit', 'in', 'it', 'was', 'so', 'small', 'i', 'thought', 'itnwas', 'a', 'crack', 'in', 'the', 'plate', 'i', 'skidnwaiter', 'what', 'else', 'have', 'you', 'got', '+henbrought', 'me', 'in', 'two', 'codfish', 'and', 'onensmelt', 'i', 'said', 'waiter', 'have', 'you', 'gotnpigs', 'feet', 'he', 'said', 'no', 'rheumatismnmakes', 'me', 'walk', 'that', 'way', 'i', 'saldnhow', 'is', 'the', 'pumpkin', 'pieliesaidnit', 'all', 'squash', 'the', 'best', 'i', 'could', 'getnin', 'that', 'hotel', 'was', 'a', 'soup', 'sandwichnafter', 'the', 'table', 'battle', 'the', 'waiter', 'andni', 'signed', 'an', 'armistice', 'i', 'then', 'wentnover', 'to', 'the', 'hotel', 'clerk', 'and', 'asked', 'forna', 'room', 'he', 'said', 'with', 'or', 'without', 'anbed', 'i', 'said', 'with', 'a', 'bed', 'he', 'saidni', 'dont', 'think', 'i', 'have', 'a', 'bed', 'longnenough', 'for', 'you', 'i', 'said', 'well', 'illnaddtwo', 'feettoitwhenigetinitnhe', 'gave', 'me', 'a', 'lovely', 'room', 'on', 'thentop', 'floor', 'it', 'was', 'one', 'of', 'those', 'roomsnthat', 'stands', 'on', 'each', 'side', 'if', 'younhappen', 'to', 'get', 'up', 'in', 'the', 'middle', 'ofnthe', 'night', 'you', 'want', 'to', 'be', 'sure', 'andnget', 'up', 'in', 'the', 'middle', 'of', 'the', 'roomnthat', 'night', 'i', 'dreamt', 'i', 'was', 'eatingnflannel', 'cakes', 'when', 'i', 'woke', 'up', 'halfnof', 'the', 'blanket', 'was', 'gone', 'i', 'mustnhave', 'got', 'up', 'on', 'the', 'wrong', 'side', 'of', 'thenbed', 'for', 'next', 'morning', 'i', 'had', 'an', 'awfulnheadache', 'i', 'told', 'the', 'manager', 'aboutnit', 'he', 'said', 'you', 'have', 'rheumaticnpains', 'i', 'said', 'no', 'i', 'think', 'it', 'is', 'onnof', 'those', 'attic', 'room', 'pains', 'i', 'nad', 'tongetupat', 'aminthemorningsonthey', 'could', 'use', 'the', 'sheet', 'to', 'set', 'thenbreakfast', 'table']\n","['mb', 'boot', 'political', 'obeednattempt', 'to', 'imagine', 'a', 'piatt', 'makingnsuch', 'an', 'address', 'as', 'that', 'of', 'elihu', 'bootnto', 'the', 'now', 'york', 'legislature', 'and', 'younfcavo', 'a', 'measure', 'of', 'tho', 'good', 'fortunqnwhich', 'baa', 'at', 'last', 'come', 'to', 'tho', 'empirqnstate', 'of', 'being', 'represented', 'in', 'tho', 'unitned', 'states', 'senate', 'by', 'a', 'statesman', 'atntho', 'very', 'outset', 'mr', 'boot', 'declared', 'forntho', 'parcels', 'post', 'thereby', 'giving', 'noticento', 'tho', 'country', 'that', 'tho', 'express', 'compannies', 'no', 'longer', 'own', 'a', 'senatorial', 'scat', 'acncredited', 'to', 'new', 'york', 'that', 'seat', 'willnfor', 'ho', 'next', 'six', 'years', 'bo', 'occupied', 'by', 'ansmaa', 'who', 'hag', 'convictions', 'of', 'his', 'ownnwho', 'isigovemed', 'by', 'reasoned', 'politicaln', 'ideas', 'who', 'had', 'grown', 'so', 'accustomed', 'tonthink', 'nationally', 'that', 'it', 'is', 'with', 'somonmental', 'eflort', 'that', 'he', 'can', 'bringhimselfninto', 'a', 'proper', 'perspective', 'with', 'thosenminor', 'senatorial', 'duties', 'such', 'as', 'tho', 'fillning', 'of', 'offices', 'which', 'bulk', 'hugelynupon', 'the', 'horizons', 'of', 'tho', 'flatts', 'andntheir', 'lit', 'tho', 'albany', 'politicians', 'wenare', 'told', 'tried', 'to', 'read', 'between', 'tho', 'linesnfor', 'evidence', 'that', 'they', 'had', 'among', 'themna', 'new', 'organization', 'leader', 'somo', 'one', 'tonguide', 'and', 'direct', 'their', 'political', 'machinnations', 'and', 'to', 'settlo', 'where', 'tho', 'goodnthings', 'should', 'go', 'wo', 'think', 'they', 'lisntened', 'in', 'vain', 'what', 'they', 'heard', 'werentimely', 'reflections', 'opon', 'tho', 'immediatenproblems', 'of', 'stato', 'and', 'national', 'governnments', 'mixed', 'with', 'excellent', 'advice', 'tonthe', 'electorate', 'on', 'the', 'duty', 'of', 'improvingnthe', 'quality', 'of', 'tho', 'stato', 'legislaturesnit', 'must', 'have', 'been', 'something', 'of', 'a', 'novnelty', 'though', 'possibly', 'not', 'wholly', 'refreshlin', 'gnto', 'political', 'thirst']\n"]}]},{"cell_type":"code","source":["words = []\n","\n","def train_model(data, m):\n"," for y,x in data.iterrows():\n"," words = nltk.word_tokenize(strip(x['Concatenated']))\n"," #print(words)\n"," for word_1, word_2 in nltk.bigrams(words, pad_left=True, pad_right=True):\n"," if word_1 and word_2:\n"," m[(w_1, w_3)][w_2] += 1\n"," for word_2 in m:\n"," summ = sum(m[word_2].values())\n"," summ = float(summ)\n"," for word_1 in m[word_2]:\n"," m[word_2][word_1] /= summ\n"],"metadata":{"id":"WZKuuolhvY8V"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["def base_prob():\n"," return \"the:0.3 a:0.3 to:0.2 and:0.1 :0.1\""],"metadata":{"id":"Y03sSwse6f4T"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["model = defaultdict(lambda: defaultdict(lambda: 0))\n","train_model(train, model)"],"metadata":{"id":"Gd-KV5PYvU_T"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["def predict_words(w, model):\n"," sum = 0\n"," preds= dict(model[w])\n"," most_common = dict(Counter(preds).most_common(6))\n"," pred = \"\"\n"," for w, prob in most_common.items():\n"," sum += prob\n"," pred += f\"{w}:{prob} \"\n"," if sum == 0.0:\n"," base_prob()\n"," rest = 1 - sum\n"," pred += f\":{rest}\"\n"," return pred\n"],"metadata":{"id":"SSBDDvtH5wbe"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["ls"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"zCx4pHsy_9Us","executionInfo":{"status":"ok","timestamp":1680690257296,"user_tz":-120,"elapsed":457,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"381d7959-8f1c-436c-e704-5e7b0ef71ae5"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":[" config.txt in-header.tsv \u001b[0m\u001b[01;34mtest-A\u001b[0m/\n","'Copy of Untitled0.ipynb' out-header.tsv \u001b[01;34mtrain\u001b[0m/\n"," \u001b[01;34mdev-0\u001b[0m/ README.md Untitled0.ipynb\n"]}]},{"cell_type":"code","source":["from csv import QUOTE_NONE"],"metadata":{"id":"3IVAAo5F8zg2"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["test_d = pd.read_csv(\"test-A/in.tsv.xz\", sep=\"\\t\", on_bad_lines='skip', quoting=QUOTE_NONE, header=None, encoding=\"utf-8\")\n","dev_d = pd.read_csv(\"dev-0/in.tsv.xz\", sep=\"\\t\", on_bad_lines='skip', quoting=QUOTE_NONE, header=None, encoding=\"utf-8\")\n","min= 3\n","with open( \"dev-0/out.tsv\", \"w\", encoding=\"utf-8\") as f:\n"," for y,x in dev_d.iterrows():\n"," w = nltk.word_tokenize(strip(x[7]))\n"," w_len = len(w)\n"," if w_len < min:\n"," prediction = base_prob()\n"," else:\n"," prediction = predict_words(w[0], model)\n"," #prediction = predict_words(w[0], model)\n"," f.write(prediction + \"\\n\")\n","\n","with open( \"test-A/out.tsv\", \"w\", encoding=\"utf-8\") as f:\n"," for y,x in test_d.iterrows():\n"," w = nltk.word_tokenize(strip(x[7]))\n"," w_len = len(w)\n"," if w_len < min:\n"," prediction = base_prob()\n"," else:\n"," prediction = predict_words(w[0], model)\n"," prediction = predict_words(w[0], model)\n"," f.write(prediction + \"\\n\")\n","\n"],"metadata":{"id":"xAfFnHPtjIvy"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["badlines_list = []\n","def badlines_collect (bad_line: list[str]) -> None:\n"," badlines_list.append(bad_line)\n"," return None"],"metadata":{"id":"1x-C0Q-zqWw3"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["from csv import QUOTE_NONE"],"metadata":{"id":"IY9xpX621Xaq"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["t_dd = pd.read_csv(\"test-A/in.tsv.xz\", sep=\"\\t\", on_bad_lines='skip', quoting=QUOTE_NONE, header=None, encoding=\"utf-8\")"],"metadata":{"id":"iMcNN7l1ppHF"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["len(test_d)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"mWzY93O3lw6T","executionInfo":{"status":"ok","timestamp":1680691434717,"user_tz":-120,"elapsed":395,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"9139c3e3-4dab-4127-94c8-12c662ae5857"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["7414"]},"metadata":{},"execution_count":38}]},{"cell_type":"code","source":["len(dev_d)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"UARdO7zJo8AO","executionInfo":{"status":"ok","timestamp":1680691449677,"user_tz":-120,"elapsed":550,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"d80a62fc-8f92-4d04-ebe4-6bf90e4bb171"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["10519"]},"metadata":{},"execution_count":40}]},{"cell_type":"code","source":["rowcount=0\n","for row in lzma.open(\"test-A/in.tsv.xz\"):\n"," rowcount+= 1\n"," #printing the result\n","print(\"Number of lines present:-\", rowcount)"],"metadata":{"id":"oc1H8tdcjls1"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["rowcount=0\n","for row in lzma.open(\"dev-0/in.tsv.xz\"):\n"," rowcount+= 1\n"," #printing the result\n","print(\"Number of lines present:-\", rowcount)"],"metadata":{"id":"R8XP9qPuo0YL"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["rowcount=0\n","for row in open(\"dev-0/out.tsv\"):\n"," rowcount+= 1\n"," #printing the result\n","print(\"Number of lines present:-\", rowcount)"],"metadata":{"id":"r-AIThCEso8o"},"execution_count":null,"outputs":[]}]}