challenging-america-word-ga.../Copy of Untitled0.ipynb
2023-05-10 00:37:23 +02:00

1 line
90 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyO3dKRYVdORr6E3c9yw52oD"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU","gpuClass":"standard"},"cells":[{"cell_type":"code","execution_count":27,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"fey0MM6ujDTv","executionInfo":{"status":"ok","timestamp":1680630175779,"user_tz":-120,"elapsed":2592,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"d10740fa-6e05-49cd-a77e-f4fa5340bcee"},"outputs":[{"output_type":"stream","name":"stdout","text":["Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"]}],"source":["from google.colab import drive\n","drive.mount('/content/drive')"]},{"cell_type":"code","source":["cd drive/MyDrive"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"cykvdVL5jbTZ","executionInfo":{"status":"ok","timestamp":1680630175780,"user_tz":-120,"elapsed":52,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"9e2f3d3a-1e23-44d1-a928-36516fb497a6"},"execution_count":28,"outputs":[{"output_type":"stream","name":"stdout","text":["[Errno 2] No such file or directory: 'drive/MyDrive'\n","/content/drive/MyDrive/challenging-america-word-gap-prediction\n"]}]},{"cell_type":"code","source":["cd challenging-america-word-gap-prediction/"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"01lVy22fjeik","executionInfo":{"status":"ok","timestamp":1680630175781,"user_tz":-120,"elapsed":44,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"8310d370-e4f6-4c0b-d1f3-c74cc12ccbdd"},"execution_count":29,"outputs":[{"output_type":"stream","name":"stdout","text":["[Errno 2] No such file or directory: 'challenging-america-word-gap-prediction/'\n","/content/drive/MyDrive/challenging-america-word-gap-prediction\n"]}]},{"cell_type":"code","source":["! pip install lmza"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"yZ6TVjdIj2Qd","executionInfo":{"status":"ok","timestamp":1680630177399,"user_tz":-120,"elapsed":1654,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"df129bc0-1d39-4cf4-c13d-20c35949e638"},"execution_count":30,"outputs":[{"output_type":"stream","name":"stdout","text":["Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n","\u001b[31mERROR: Could not find a version that satisfies the requirement lmza (from versions: none)\u001b[0m\u001b[31m\n","\u001b[0m\u001b[31mERROR: No matching distribution found for lmza\u001b[0m\u001b[31m\n","\u001b[0m"]}]},{"cell_type":"code","source":["from collections import Counter"],"metadata":{"id":"PY_GLjeIfA5i","executionInfo":{"status":"ok","timestamp":1680630177400,"user_tz":-120,"elapsed":20,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":31,"outputs":[]},{"cell_type":"code","source":["import lzma"],"metadata":{"id":"adTwEZuPjujM","executionInfo":{"status":"ok","timestamp":1680630177402,"user_tz":-120,"elapsed":19,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":32,"outputs":[]},{"cell_type":"code","source":["import pickle"],"metadata":{"id":"K7TshO9We-UH","executionInfo":{"status":"ok","timestamp":1680630177403,"user_tz":-120,"elapsed":19,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":33,"outputs":[]},{"cell_type":"code","source":["rowcount=0\n","for row in lzma.open(\"test-A/in.tsv.xz\"):\n"," rowcount+= 1\n"," #printing the result\n","print(\"Number of lines present:-\", rowcount)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"PhryEzN5juLo","executionInfo":{"status":"ok","timestamp":1680633539830,"user_tz":-120,"elapsed":448,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"efe43933-3de2-4d2e-81b8-cec3732b2ac5"},"execution_count":67,"outputs":[{"output_type":"stream","name":"stdout","text":["Number of lines present:- 7414\n"]}]},{"cell_type":"code","source":["with lzma.open('dev-0/in.tsv.xz', mode='rt', encoding='utf-8') as f:\n"," with open('dev-0/out.tsv', 'w', newline='\\n') as out:\n"," for line in f.readlines():\n"," sep = line.split('\\t')\n"," print(sep)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","output_embedded_package_id":"1Jt0NuNj7XI6uFbCbXaL9-xOtEIX1py_T"},"id":"RDOsdvYzkNEg","executionInfo":{"status":"ok","timestamp":1680630187051,"user_tz":-120,"elapsed":9666,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"0caecb87-6690-4fdb-9ee9-e39a8f00a9b8"},"execution_count":34,"outputs":[{"output_type":"display_data","data":{"text/plain":"Output hidden; open in https://colab.research.google.com to view."},"metadata":{}}]},{"cell_type":"code","source":["import pandas as pd\n","import nltk"],"metadata":{"id":"tXqMtG1GsMK0","executionInfo":{"status":"ok","timestamp":1680630187051,"user_tz":-120,"elapsed":571,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":35,"outputs":[]},{"cell_type":"code","source":["nltk.download('punkt')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"cWTNXnZssKOT","executionInfo":{"status":"ok","timestamp":1680630187051,"user_tz":-120,"elapsed":570,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"447193f2-3e26-43e0-848b-732051a635a8"},"execution_count":36,"outputs":[{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package punkt to /root/nltk_data...\n","[nltk_data] Package punkt is already up-to-date!\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":36}]},{"cell_type":"code","source":["from collections import Counter, defaultdict"],"metadata":{"id":"OqjUsKTGsSEw","executionInfo":{"status":"ok","timestamp":1680630187052,"user_tz":-120,"elapsed":17,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":37,"outputs":[]},{"cell_type":"code","source":["data = pd.read_csv(\"train/in.tsv.xz\", sep=\"\\t\", on_bad_lines='skip', header=None, encoding=\"utf-8\")\n","\n","exp_words = pd.read_csv(\"train/expected.tsv\", sep=\"\\t\", on_bad_lines='skip', header=None, encoding=\"utf-8\")\n"],"metadata":{"id":"tp6ozto-sk2A","executionInfo":{"status":"ok","timestamp":1680630217202,"user_tz":-120,"elapsed":30167,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":38,"outputs":[]},{"cell_type":"code","source":["data[:10]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":684},"id":"YCeD3AU8stsc","executionInfo":{"status":"ok","timestamp":1680630217203,"user_tz":-120,"elapsed":52,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"2ee8d585-79ec-432d-8375-f6eef8441f2d"},"execution_count":39,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" 0 1 2 \\\n","0 4e04702da929c78c52baf09c1851d3ff ST ChronAm \n","1 b374dadd940510271d9675d3e8caf9d8 DAILY ARIZONA SILVER BELT ChronAm \n","2 adb666c426bdc10fd949cb824da6c0d0 THE SAVANNAH MORNING NEWS ChronAm \n","3 bc2c9aa0b77d724311e3c2e12fc61c92 CHARLES CITY INTELLIGENCER ChronAm \n","4 0f612b991a39c712f0d745835b8b2f0d EVENING STAR ChronAm \n","5 4c13fb3d2e6eef35fa28e7bae7868d60 EDGEFIELD ADVERTISER ChronAm \n","6 a452eadfc3f4a475147728c5f4005429 DAILY LOS ANGELES HERALD ChronAm \n","7 b970ee32372d81f1fd59ab6196e797c9 THE FINDLAY JEFFERSONIAN ChronAm \n","8 d130f899a50db2792c546cc978dc930c BUTLER CITIZEN ChronAm \n","9 80e56928e09b93529d206708ac905b63 FERGUS COUNTY ARGUS ChronAm \n","\n"," 3 4 5 \\\n","0 1919.604110 30.475470 -90.100911 \n","1 1909.097260 33.399478 -110.870950 \n","2 1900.913699 32.080926 -81.091177 \n","3 1864.974044 43.066361 -92.672411 \n","4 1878.478082 38.894955 -77.036646 \n","5 1913.346575 33.789577 -81.929558 \n","6 1883.801370 34.054935 -118.244476 \n","7 1874.828767 41.041387 -83.650398 \n","8 1883.793151 40.861021 -79.895225 \n","9 1892.821038 47.062473 -109.428238 \n","\n"," 6 \\\n","0 came fiom the last place to this\\nplace, and t... \n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n","2 Thera were in 1771 only aeventy-nine\\n*ub*erlb... \n","3 whenever any prize property shall!*' condemn- ... \n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... \n","5 God includes all. and would we not\\ngrieve if ... \n","6 The said action is brought to obtain a decree ... \n","7 party\" is a useless exhortation to intel-\\nlig... \n","8 has led me to accept, everything I read\\nwith ... \n","9 The wool circulars alluded to are\\nthose which... \n","\n"," 7 \n","0 said\\nit's all squash. The best I could get\\ni... \n","1 \\ninto a proper perspective with those\\nminor ... \n","2 NaN \n","3 the ceitihcate of'\\noperate to prevent tfie ma... \n","4 \\nTerms of sale: One-tblrd, togethor with the ... \n","5 lot of spiritual\\nwaifs all about us. children... \n","6 then to obtain an execution against said Vie\\n... \n","7 with all tjie hatred that\\nsurvives the war; a... \n","8 that the earth has mo-\\ntion. Aday ortwo agoIt... \n","9 accuracy, as\\nthey were furnished by him as ch... "],"text/html":["\n"," <div id=\"df-a18d2624-38ea-40d8-a006-ac8ff286f0ce\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>0</th>\n"," <th>1</th>\n"," <th>2</th>\n"," <th>3</th>\n"," <th>4</th>\n"," <th>5</th>\n"," <th>6</th>\n"," <th>7</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>4e04702da929c78c52baf09c1851d3ff</td>\n"," <td>ST</td>\n"," <td>ChronAm</td>\n"," <td>1919.604110</td>\n"," <td>30.475470</td>\n"," <td>-90.100911</td>\n"," <td>came fiom the last place to this\\nplace, and t...</td>\n"," <td>said\\nit's all squash. The best I could get\\ni...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>b374dadd940510271d9675d3e8caf9d8</td>\n"," <td>DAILY ARIZONA SILVER BELT</td>\n"," <td>ChronAm</td>\n"," <td>1909.097260</td>\n"," <td>33.399478</td>\n"," <td>-110.870950</td>\n"," <td>MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...</td>\n"," <td>\\ninto a proper perspective with those\\nminor ...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>adb666c426bdc10fd949cb824da6c0d0</td>\n"," <td>THE SAVANNAH MORNING NEWS</td>\n"," <td>ChronAm</td>\n"," <td>1900.913699</td>\n"," <td>32.080926</td>\n"," <td>-81.091177</td>\n"," <td>Thera were in 1771 only aeventy-nine\\n*ub*erlb...</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>bc2c9aa0b77d724311e3c2e12fc61c92</td>\n"," <td>CHARLES CITY INTELLIGENCER</td>\n"," <td>ChronAm</td>\n"," <td>1864.974044</td>\n"," <td>43.066361</td>\n"," <td>-92.672411</td>\n"," <td>whenever any prize property shall!*' condemn- ...</td>\n"," <td>the ceitihcate of'\\noperate to prevent tfie ma...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>0f612b991a39c712f0d745835b8b2f0d</td>\n"," <td>EVENING STAR</td>\n"," <td>ChronAm</td>\n"," <td>1878.478082</td>\n"," <td>38.894955</td>\n"," <td>-77.036646</td>\n"," <td>SA LKOFVALUABLE UNIMPBOV&amp;D RE\\\\L\\nJSIATF. ON T...</td>\n"," <td>\\nTerms of sale: One-tblrd, togethor with the ...</td>\n"," </tr>\n"," <tr>\n"," <th>5</th>\n"," <td>4c13fb3d2e6eef35fa28e7bae7868d60</td>\n"," <td>EDGEFIELD ADVERTISER</td>\n"," <td>ChronAm</td>\n"," <td>1913.346575</td>\n"," <td>33.789577</td>\n"," <td>-81.929558</td>\n"," <td>God includes all. and would we not\\ngrieve if ...</td>\n"," <td>lot of spiritual\\nwaifs all about us. children...</td>\n"," </tr>\n"," <tr>\n"," <th>6</th>\n"," <td>a452eadfc3f4a475147728c5f4005429</td>\n"," <td>DAILY LOS ANGELES HERALD</td>\n"," <td>ChronAm</td>\n"," <td>1883.801370</td>\n"," <td>34.054935</td>\n"," <td>-118.244476</td>\n"," <td>The said action is brought to obtain a decree ...</td>\n"," <td>then to obtain an execution against said Vie\\n...</td>\n"," </tr>\n"," <tr>\n"," <th>7</th>\n"," <td>b970ee32372d81f1fd59ab6196e797c9</td>\n"," <td>THE FINDLAY JEFFERSONIAN</td>\n"," <td>ChronAm</td>\n"," <td>1874.828767</td>\n"," <td>41.041387</td>\n"," <td>-83.650398</td>\n"," <td>party\" is a useless exhortation to intel-\\nlig...</td>\n"," <td>with all tjie hatred that\\nsurvives the war; a...</td>\n"," </tr>\n"," <tr>\n"," <th>8</th>\n"," <td>d130f899a50db2792c546cc978dc930c</td>\n"," <td>BUTLER CITIZEN</td>\n"," <td>ChronAm</td>\n"," <td>1883.793151</td>\n"," <td>40.861021</td>\n"," <td>-79.895225</td>\n"," <td>has led me to accept, everything I read\\nwith ...</td>\n"," <td>that the earth has mo-\\ntion. Aday ortwo agoIt...</td>\n"," </tr>\n"," <tr>\n"," <th>9</th>\n"," <td>80e56928e09b93529d206708ac905b63</td>\n"," <td>FERGUS COUNTY ARGUS</td>\n"," <td>ChronAm</td>\n"," <td>1892.821038</td>\n"," <td>47.062473</td>\n"," <td>-109.428238</td>\n"," <td>The wool circulars alluded to are\\nthose which...</td>\n"," <td>accuracy, as\\nthey were furnished by him as ch...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-a18d2624-38ea-40d8-a006-ac8ff286f0ce')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-a18d2624-38ea-40d8-a006-ac8ff286f0ce button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-a18d2624-38ea-40d8-a006-ac8ff286f0ce');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":39}]},{"cell_type":"code","source":["data[6][9]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":104},"id":"f-OLezL-tMmW","executionInfo":{"status":"ok","timestamp":1680630217203,"user_tz":-120,"elapsed":23,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"5377d7d7-697c-44aa-d5c1-92d6f58462c1"},"execution_count":40,"outputs":[{"output_type":"execute_result","data":{"text/plain":["\"The wool circulars alluded to are\\\\nthose which give the quotations side\\\\nby side of Ohio medium in the United\\\\nStates and Australasian medium of\\\\nthe same quality and condition in\\\\nLondon. the time that the tarif law\\\\nwent into effect in 1868, up to and in-\\\\ncluding 1891, showing that the aver-\\\\nage price received for wool of the same\\\\nquality in the tree wool market of Lon-\\\\ndon during all of that period averagd\\\\n51 per cent. lees than the price paidin\\\\nthe United States for the same kindof\\\\nAmerican wool under protection.\\\\nThe quotations for domestic wool\\\\nwhich. be says, are incorrect, are tak-\\\\nen from Mr. Springer's own report of\\\\nthe Ways and Means Committee to\\\\nthe Houseof Representatives; see page\\\\n34, report No. 501 . We assumed that\\\\nMr. Springer's figures werecorrect, and\\\\nnever questioned\""],"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"}},"metadata":{},"execution_count":40}]},{"cell_type":"code","source":["data[7][9]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":104},"id":"L8Yw8bOrtDwJ","executionInfo":{"status":"ok","timestamp":1680630217203,"user_tz":-120,"elapsed":21,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"50e60da0-defb-4a4a-b712-7d3188cad348"},"execution_count":41,"outputs":[{"output_type":"execute_result","data":{"text/plain":["'accuracy, as\\\\nthey were furnished by him as chair-\\\\nman of the Ways and Means commit-\\\\ntee of the house of representatives; and\\\\nthis ought to be, and therefore has\\\\nbeen, the best authority. TheLondon\\\\nprices were obtained from the pub-\\\\nlished quotations of Jan. 1, 1892, of\\\\nMessrs. Windeler & Co., of London,\\\\nEngland, and are prepared by them\\\\nfor the London market without re-\\\\ngard to any political use that might\\\\nbe made of them in the United States.\\\\nThese London quotations of the\\\\nMessrs. Windeler, which we use, are\\\\nconfirmed by those of Messrs. Helmnth,\\\\nSwartz & Co.. ot London, Mesrs. Bx-\\\\nton, Ronald & Co., of London, and\\\\nalso by the Bradford Observer, of\\\\nBradford, England, the onenewspaper\\\\nthat is recognized throughout themer-\\\\ncantile world as authority on matters\\\\n•rlating to wool and manufactures\\\\nthereof.'"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"}},"metadata":{},"execution_count":41}]},{"cell_type":"code","source":["train = data[[6, 7]]"],"metadata":{"id":"TLo9pPHftYL8","executionInfo":{"status":"ok","timestamp":1680630217204,"user_tz":-120,"elapsed":20,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":42,"outputs":[]},{"cell_type":"code","source":["train= pd.concat([train, exp_words], axis=1)"],"metadata":{"id":"5TwrBc9ztgkJ","executionInfo":{"status":"ok","timestamp":1680630217204,"user_tz":-120,"elapsed":20,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":43,"outputs":[]},{"cell_type":"code","source":["train.rename(columns={6: 'First Part', 7: 'Second Part', 0:'Expected word'}, inplace=True)"],"metadata":{"id":"Rr1B7dWaucYl","executionInfo":{"status":"ok","timestamp":1680630217204,"user_tz":-120,"elapsed":19,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":44,"outputs":[]},{"cell_type":"code","source":["train[:10]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":363},"id":"BH5ngH55tlPc","executionInfo":{"status":"ok","timestamp":1680630217205,"user_tz":-120,"elapsed":19,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"a3f2505e-e27f-4896-d019-49b3f3b7e92e"},"execution_count":45,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" First Part \\\n","0 came fiom the last place to this\\nplace, and t... \n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n","2 Thera were in 1771 only aeventy-nine\\n*ub*erlb... \n","3 whenever any prize property shall!*' condemn- ... \n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... \n","5 God includes all. and would we not\\ngrieve if ... \n","6 The said action is brought to obtain a decree ... \n","7 party\" is a useless exhortation to intel-\\nlig... \n","8 has led me to accept, everything I read\\nwith ... \n","9 The wool circulars alluded to are\\nthose which... \n","\n"," Second Part Expected word \n","0 said\\nit's all squash. The best I could get\\ni... lie \n","1 \\ninto a proper perspective with those\\nminor ... himself \n","2 NaN of \n","3 the ceitihcate of'\\noperate to prevent tfie ma... ably \n","4 \\nTerms of sale: One-tblrd, togethor with the ... j \n","5 lot of spiritual\\nwaifs all about us. children... he \n","6 then to obtain an execution against said Vie\\n... graph \n","7 with all tjie hatred that\\nsurvives the war; a... 011 \n","8 that the earth has mo-\\ntion. Aday ortwo agoIt... separately. \n","9 accuracy, as\\nthey were furnished by him as ch... a "],"text/html":["\n"," <div id=\"df-66b025f1-4d08-4882-aeae-b1bc1979f983\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>First Part</th>\n"," <th>Second Part</th>\n"," <th>Expected word</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>came fiom the last place to this\\nplace, and t...</td>\n"," <td>said\\nit's all squash. The best I could get\\ni...</td>\n"," <td>lie</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...</td>\n"," <td>\\ninto a proper perspective with those\\nminor ...</td>\n"," <td>himself</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>Thera were in 1771 only aeventy-nine\\n*ub*erlb...</td>\n"," <td>NaN</td>\n"," <td>of</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>whenever any prize property shall!*' condemn- ...</td>\n"," <td>the ceitihcate of'\\noperate to prevent tfie ma...</td>\n"," <td>ably</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>SA LKOFVALUABLE UNIMPBOV&amp;D RE\\\\L\\nJSIATF. ON T...</td>\n"," <td>\\nTerms of sale: One-tblrd, togethor with the ...</td>\n"," <td>j</td>\n"," </tr>\n"," <tr>\n"," <th>5</th>\n"," <td>God includes all. and would we not\\ngrieve if ...</td>\n"," <td>lot of spiritual\\nwaifs all about us. children...</td>\n"," <td>he</td>\n"," </tr>\n"," <tr>\n"," <th>6</th>\n"," <td>The said action is brought to obtain a decree ...</td>\n"," <td>then to obtain an execution against said Vie\\n...</td>\n"," <td>graph</td>\n"," </tr>\n"," <tr>\n"," <th>7</th>\n"," <td>party\" is a useless exhortation to intel-\\nlig...</td>\n"," <td>with all tjie hatred that\\nsurvives the war; a...</td>\n"," <td>011</td>\n"," </tr>\n"," <tr>\n"," <th>8</th>\n"," <td>has led me to accept, everything I read\\nwith ...</td>\n"," <td>that the earth has mo-\\ntion. Aday ortwo agoIt...</td>\n"," <td>separately.</td>\n"," </tr>\n"," <tr>\n"," <th>9</th>\n"," <td>The wool circulars alluded to are\\nthose which...</td>\n"," <td>accuracy, as\\nthey were furnished by him as ch...</td>\n"," <td>a</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-66b025f1-4d08-4882-aeae-b1bc1979f983')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-66b025f1-4d08-4882-aeae-b1bc1979f983 button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-66b025f1-4d08-4882-aeae-b1bc1979f983');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":45}]},{"cell_type":"code","source":["train['Concatenated'] = train['First Part'] + train['Expected word'] + train['Second Part']"],"metadata":{"id":"jyaRsmtatzEo","executionInfo":{"status":"ok","timestamp":1680630218100,"user_tz":-120,"elapsed":912,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":46,"outputs":[]},{"cell_type":"code","source":["train[:5]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":319},"id":"za50dI4yt4cz","executionInfo":{"status":"ok","timestamp":1680630218102,"user_tz":-120,"elapsed":19,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"67c93a59-2019-428d-fa52-9248379ce27a"},"execution_count":47,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" First Part \\\n","0 came fiom the last place to this\\nplace, and t... \n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n","2 Thera were in 1771 only aeventy-nine\\n*ub*erlb... \n","3 whenever any prize property shall!*' condemn- ... \n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... \n","\n"," Second Part Expected word \\\n","0 said\\nit's all squash. The best I could get\\ni... lie \n","1 \\ninto a proper perspective with those\\nminor ... himself \n","2 NaN of \n","3 the ceitihcate of'\\noperate to prevent tfie ma... ably \n","4 \\nTerms of sale: One-tblrd, togethor with the ... j \n","\n"," Concatenated \n","0 came fiom the last place to this\\nplace, and t... \n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n","2 NaN \n","3 whenever any prize property shall!*' condemn- ... \n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... "],"text/html":["\n"," <div id=\"df-90fdc8f1-c594-43b9-b5cd-439badb58c34\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>First Part</th>\n"," <th>Second Part</th>\n"," <th>Expected word</th>\n"," <th>Concatenated</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>came fiom the last place to this\\nplace, and t...</td>\n"," <td>said\\nit's all squash. The best I could get\\ni...</td>\n"," <td>lie</td>\n"," <td>came fiom the last place to this\\nplace, and t...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...</td>\n"," <td>\\ninto a proper perspective with those\\nminor ...</td>\n"," <td>himself</td>\n"," <td>MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>Thera were in 1771 only aeventy-nine\\n*ub*erlb...</td>\n"," <td>NaN</td>\n"," <td>of</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>whenever any prize property shall!*' condemn- ...</td>\n"," <td>the ceitihcate of'\\noperate to prevent tfie ma...</td>\n"," <td>ably</td>\n"," <td>whenever any prize property shall!*' condemn- ...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>SA LKOFVALUABLE UNIMPBOV&amp;D RE\\\\L\\nJSIATF. ON T...</td>\n"," <td>\\nTerms of sale: One-tblrd, togethor with the ...</td>\n"," <td>j</td>\n"," <td>SA LKOFVALUABLE UNIMPBOV&amp;D RE\\\\L\\nJSIATF. ON T...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-90fdc8f1-c594-43b9-b5cd-439badb58c34')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-90fdc8f1-c594-43b9-b5cd-439badb58c34 button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-90fdc8f1-c594-43b9-b5cd-439badb58c34');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":47}]},{"cell_type":"code","source":["import regex as re"],"metadata":{"id":"rxrx5H6WwPQM","executionInfo":{"status":"ok","timestamp":1680630218104,"user_tz":-120,"elapsed":17,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":48,"outputs":[]},{"cell_type":"code","source":["train.replace('\\n', '', regex=True)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":623},"id":"uWem3r3kM4Iz","executionInfo":{"status":"ok","timestamp":1680630221747,"user_tz":-120,"elapsed":3658,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"55bc3aa9-fe51-4142-a9e9-ff73e82ec56a"},"execution_count":49,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" First Part \\\n","0 came fiom the last place to this\\nplace, and t... \n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n","2 Thera were in 1771 only aeventy-nine\\n*ub*erlb... \n","3 whenever any prize property shall!*' condemn- ... \n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... \n","... ... \n","428512 Sam Clendenin bad a fancy for Ui«\\nscience of ... \n","428513 Wita.htt halting the party ware dilven to the ... \n","428514 It was the last thing that either of\\nthem exp... \n","428515 settlement with the department.\\nIt is also sh... \n","428516 Flour quotations—low extras at 1 R0®2 50;\\ncit... \n","\n"," Second Part Expected word \\\n","0 said\\nit's all squash. The best I could get\\ni... lie \n","1 \\ninto a proper perspective with those\\nminor ... himself \n","2 NaN of \n","3 the ceitihcate of'\\noperate to prevent tfie ma... ably \n","4 \\nTerms of sale: One-tblrd, togethor with the ... j \n","... ... ... \n","428512 \\nSam was arrested.\\nThe case excited a great ... NaN \n","428513 through the alnp the »Uitors laapeeeed tia.»\\n... NaN \n","428514 Agua Negra across the line.\\nIt was a grim pla... NaN \n","428515 \\na note of Wood, Dialogue fc Co., for\\nc27,im... NaN \n","428516 3214c;do White at 3614c: Mixed Western at\\n331... NaN \n","\n"," Concatenated \n","0 came fiom the last place to this\\nplace, and t... \n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n","2 NaN \n","3 whenever any prize property shall!*' condemn- ... \n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... \n","... ... \n","428512 NaN \n","428513 NaN \n","428514 NaN \n","428515 NaN \n","428516 NaN \n","\n","[428517 rows x 4 columns]"],"text/html":["\n"," <div id=\"df-d8a8cee5-697a-4e19-a406-b169cb42d857\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>First Part</th>\n"," <th>Second Part</th>\n"," <th>Expected word</th>\n"," <th>Concatenated</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>came fiom the last place to this\\nplace, and t...</td>\n"," <td>said\\nit's all squash. The best I could get\\ni...</td>\n"," <td>lie</td>\n"," <td>came fiom the last place to this\\nplace, and t...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...</td>\n"," <td>\\ninto a proper perspective with those\\nminor ...</td>\n"," <td>himself</td>\n"," <td>MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>Thera were in 1771 only aeventy-nine\\n*ub*erlb...</td>\n"," <td>NaN</td>\n"," <td>of</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>whenever any prize property shall!*' condemn- ...</td>\n"," <td>the ceitihcate of'\\noperate to prevent tfie ma...</td>\n"," <td>ably</td>\n"," <td>whenever any prize property shall!*' condemn- ...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>SA LKOFVALUABLE UNIMPBOV&amp;D RE\\\\L\\nJSIATF. ON T...</td>\n"," <td>\\nTerms of sale: One-tblrd, togethor with the ...</td>\n"," <td>j</td>\n"," <td>SA LKOFVALUABLE UNIMPBOV&amp;D RE\\\\L\\nJSIATF. ON T...</td>\n"," </tr>\n"," <tr>\n"," <th>...</th>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," </tr>\n"," <tr>\n"," <th>428512</th>\n"," <td>Sam Clendenin bad a fancy for Ui«\\nscience of ...</td>\n"," <td>\\nSam was arrested.\\nThe case excited a great ...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>428513</th>\n"," <td>Wita.htt halting the party ware dilven to the ...</td>\n"," <td>through the alnp the »Uitors laapeeeed tia.»\\n...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>428514</th>\n"," <td>It was the last thing that either of\\nthem exp...</td>\n"," <td>Agua Negra across the line.\\nIt was a grim pla...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>428515</th>\n"," <td>settlement with the department.\\nIt is also sh...</td>\n"," <td>\\na note of Wood, Dialogue fc Co., for\\nc27,im...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>428516</th>\n"," <td>Flour quotations—low extras at 1 R0®2 50;\\ncit...</td>\n"," <td>3214c;do White at 3614c: Mixed Western at\\n331...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>428517 rows × 4 columns</p>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-d8a8cee5-697a-4e19-a406-b169cb42d857')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-d8a8cee5-697a-4e19-a406-b169cb42d857 button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-d8a8cee5-697a-4e19-a406-b169cb42d857');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":49}]},{"cell_type":"code","source":["for _, x in train[:2].iterrows():\n"," words = nltk.word_tokenize(x['Concatenated'])\n"," print(words)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"B6JvFEnrwrdL","executionInfo":{"status":"ok","timestamp":1680630221748,"user_tz":-120,"elapsed":51,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"301cea04-e839-4f54-f5d5-b8d7f4a1819e"},"execution_count":50,"outputs":[{"output_type":"stream","name":"stdout","text":["['came', 'fiom', 'the', 'last', 'place', 'to', 'this\\\\nplace', ',', 'and', 'this', 'place', 'is', 'Where', 'We\\\\nWere', ',', 'this', 'is', 'the', 'first', 'road', 'I', 'ever\\\\nwas', 'on', 'where', 'you', 'can', 'ride', 'elsewhere\\\\nfrom', 'anywhere', 'and', 'be', 'nowhere.\\\\nHe', 'says', ',', 'while', 'this', 'train', 'stops', 'every-\\\\nwhere', ',', 'it', 'never', 'stops', 'anywhere', 'un-\\\\nless', 'its', 'somewhere', '.', 'Well', ',', 'I', 'says', ',', '\\\\nI', \"'m\", 'glad', 'to', 'hear', 'that', ',', 'but', ',', 'accord-\\\\ning', 'to', 'your', 'figures', ',', 'I', 'left', 'myself\\\\nwhere', '1', 'was', ',', 'which', 'is', 'five', 'miles', 'near-\\\\ner', 'to', 'myself', 'than', 'I', 'was', 'when', 'we\\\\nwere', 'where', 'we', 'are', 'now.\\\\nWe', 'have', 'now', 'reached', 'Slidell.\\\\nThat', \"'s\", 'a', 'fine', 'place', '.', 'The', 'people\\\\ndown', 'there', 'remind', 'me', 'of', 'bananas-\\\\nthey', 'come', 'and', 'go', 'in', 'bunches', '.', '811-\\\\ndell', 'used', 'to', 'be', 'noted', 'for', 'her', 'tough\\\\npeople', '.', 'Now', 'she', 'is', 'noted', 'for', 'be', ',', '\\\\ntough', 'steaks', '.', 'Well', ',', 'I', 'certainly', 'got\\\\none', 'there', '.', 'When', 'the', 'waiter', 'brought\\\\nit', 'in', 'it', 'was', 'so', 'small', 'I', 'thought', '.', 'It\\\\nwas', 'a', 'crack', 'in', 'the', 'plate', '.', 'I', 'skid', ',', '\\\\nwaiter', 'what', 'else', 'have', 'you', 'got', '?', '+He\\\\nbrought', 'me', 'in', 'two', 'codfish', 'and', 'one\\\\nsmelt', '.', 'I', 'said', ',', 'waiter', 'have', 'you', 'got\\\\npigs', 'feet', '?', 'He', 'said', 'no', ',', 'rheumatism\\\\nmakes', 'me', 'walk', 'that', 'way', '.', 'I', 'sald', ',', '\\\\nhow', 'is', 'the', 'pumpkin', 'pie', '?', 'liesaid\\\\nit', \"'s\", 'all', 'squash', '.', 'The', 'best', 'I', 'could', 'get\\\\nin', 'that', 'hotel', 'was', 'a', 'soup', 'sandwich.\\\\nAfter', 'the', 'table', 'battle', 'the', 'waiter', 'and\\\\nI', 'signed', 'an', 'armistice', '.', 'I', 'then', 'went\\\\nover', 'to', 'the', 'hotel', 'clerk', 'and', 'asked', 'for\\\\na', 'room', '.', 'He', 'said', 'with', 'or', 'without', 'a\\\\nbed', '?', 'I', 'said', ',', 'with', 'a', 'bed', '.', 'He', 'said', ',', '\\\\nI', 'do', \"n't\", 'think', 'I', \"'have\", \"'\", 'a', 'bed', 'long\\\\nenough', 'for', 'you', '.', 'I', 'said', ',', 'well', ',', \"I'll\\\\naddtwo\", 'feettoitwhenIgetinit.\\\\nHe', 'gave', 'me', 'a', 'lovely', 'room', 'on', 'the\\\\ntop', 'floor', '.', 'It', 'was', 'one', 'of', 'those', 'rooms\\\\nthat', 'stands', 'on', 'each', 'side', '.', 'If', 'you\\\\nhappen', 'to', 'get', 'up', 'in', 'the', 'middle', 'of\\\\nthe', 'night', 'you', 'want', 'to', 'be', 'sure', 'and\\\\nget', 'up', 'in', 'the', 'middle', 'of', 'the', 'room.\\\\nThat', 'night', 'I', 'dreamt', 'I', 'was', 'eating\\\\nflannel', 'cakes', '.', 'When', 'I', 'woke', 'up', 'half\\\\nof', 'the', 'blanket', 'was', 'gone', '.', 'I', 'must\\\\nhave', 'got', 'up', 'on', 'the', 'wrong', 'side', 'of', 'the\\\\nbed', ',', 'for', 'next', 'morning', 'I', 'had', 'an', 'awful\\\\nheadache', '.', 'I', 'told', 'the', 'manager', 'about\\\\nit', '.', 'He', 'said', ',', 'you', 'have', 'rheumatic\\\\npains', '.', 'I', 'said', ',', 'no', ',', 'I', 'think', 'it', 'is', 'on', ',', '\\\\nof', 'those', 'attic', 'room', 'pains', '.', 'I', 'nad', 'to\\\\ngetupat5a.m.inthemorningso\\\\nthey', 'could', 'use', 'the', 'sheet', 'to', 'set', 'the\\\\nbreakfast', 'table', '.']\n","['MB', '.', 'BOOT', \"'S\", 'POLITICAL', 'OBEED\\\\nAttempt', 'to', 'imagine', 'a', 'Piatt', 'making\\\\nsuch', 'an', 'address', 'as', 'that', 'of', 'Elihu', 'Boot\\\\nto', 'the', 'Now', 'York', 'legislature', ',', 'and', 'you\\\\nfcavo', 'a', 'measure', 'of', 'tho', 'good', 'fortunq\\\\nwhich', 'baa', 'at', 'last', 'come', 'to', 'tho', 'Empirq\\\\nstate', 'of', 'being', 'represented', 'In', 'tho', 'Unit-\\\\ned', 'States', 'senate', 'by', 'a', 'statesman', '.', 'At\\\\ntho', 'very', 'outset', 'Mr', '.', 'Boot', 'declared', 'for\\\\ntho', 'parcels', 'post', ';', 'thereby', 'giving', 'notice\\\\nto', 'tho', 'country', 'that', 'tho', 'express', 'compan\\\\nies', 'no', 'longer', 'own', 'a', 'senatorial', 'scat', 'ac\\\\ncredited', ',', 'to', 'New', 'York', '.', 'That', 'seat', 'will\\\\n', ',', 'for', 'ho', 'next', 'six', 'years', 'bo', 'occupied', 'by', 'a\\\\nsmaa', 'who', ',', 'hag', 'convictions', 'of', 'his', 'own', ',', '\\\\nwho', \"isi'govemed\", 'by', 'reasoned', 'political\\\\n', \"'\", 'Ideas', ',', 'who', 'had', 'grown', 'so', 'accustomed', 'to\\\\nthink', 'nationally', 'that', 'it', 'is', 'with', 'somo\\\\nmental', 'eflort', 'that', 'he', 'can', 'bringhimself\\\\ninto', 'a', 'proper', 'perspective', 'with', 'those\\\\nminor', 'senatorial', 'duties', ',', 'such', 'as', 'tho', 'fill-\\\\ning', 'of', 'offices', ',', 'which', 'bulk', '39', 'hugely\\\\nupon', 'the', 'horizons', 'of', 'tho', 'Flatts', 'and\\\\ntheir', 'lit', ',', 'Tho', 'Albany', 'politicians', ',', 'we\\\\nare', 'told', ',', 'tried', 'to', 'read', 'between', 'tho', 'lines\\\\nfor', 'evidence', 'that', 'they', ',', 'had', 'among', 'them\\\\na', 'new', 'organization', 'leader', ',', 'somo', 'one', 'to\\\\nguide', 'and', 'direct', 'their', 'political', 'machi-\\\\nnations', ',', 'and', 'to', 'settlo', 'where', 'tho', 'good\\\\nthings', 'should', 'go', '.', 'Wo', 'think', 'they', 'lis-\\\\ntened', 'in', 'vain', '.', 'What', 'they', 'heard', 'were\\\\ntimely', 'reflections', 'opon', 'tho', 'immediate\\\\nproblems', 'of', 'stato', 'and', 'national', 'govern-\\\\nments', ',', 'mixed', 'with', 'excellent', 'advice', 'to\\\\nthe', 'electorate', 'on', 'the', 'duty', 'of', 'improving\\\\nthe', 'quality', 'of', 'tho', 'stato', 'legislatures.\\\\nIt', 'must', 'have', '``', 'been', 'something', 'of', 'a', 'nov-\\\\nelty', ',', 'though', 'possibly', 'not', 'wholly', 'refresh-Lin-', 'g\\\\nto', 'political', 'thirst', '.']\n"]}]},{"cell_type":"code","source":["for _, x in train[3:10].iterrows():\n"," words = nltk.word_tokenize(x['Concatenated'])\n"," print(words)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"-LnkP39RCXU4","executionInfo":{"status":"ok","timestamp":1680630221748,"user_tz":-120,"elapsed":45,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"f14a1f5b-1614-4311-97cd-8b30a31cd033"},"execution_count":51,"outputs":[{"output_type":"stream","name":"stdout","text":["['whenever', 'any', 'prize', 'property', 'shall', '!', '*', \"'\", 'condemn-', \"'\", 'appeals', 'from', 'the', 'district', 'courts', 'of', 'the', 'Unite', '*', '!', '\\\\ned', ',', 'or', 'shall', 'at', 'any', 'stage', 'of', 'the', 'proceedings', 'be', 'j', 'State', '*', 'in', 'priae', 'causes', 'shall', 'be', 'directly', 'to', 'th', '#', '\\\\nfound\\\\\\\\iy', 'the', '<', 't', '>', 'urt', 'to', 'be', 'perishing', ',', 'perishable', '.', 'Supreme', 'Court', ',', 'and', 'shall', 'he', 'made', 'withiti\\\\nor', 'liable', 'to', 'deteriorate', 'or', 'depreciate', ',', 'or', 'when-', '•', 'thirty', 'days', 'of', 'the', 'rendering', 'of', 'the', 'decree', 'ap', '»', '\\\\never', 'the', 'etist', 'ot', 'keeping', 'th', '»', ':', 'same', 'shall', 'l', '>', 'c', 'dis-', 'i', 'pealed', 'from', ',', 'unh-ss', 'the', 'court', 'shall', 'previously\\\\nproportionate', 'to', 'its', 'value', ',', 'it', 'shall', 'be', 'the', 'duty', 'have', 'extended', 'the', 'time', 'for', 'cause', 'shown', 'in', 'th', '#', '\\\\nof', 'the', 'court', 'to', 'order', 'asale', 'thereof', ';', 'and', 'when-', '|', '»', 'artit', 'ular', 'case', ',', 'and', 'the', 'Supreme', 'court', '*', 'k', '«', '*', 'l|\\\\never', ',', 'after', 'the', 'return', 'day', 'on', 'the', 'liliel', ',', 'all', 'the', 'always', 'l', '>', 'e', 'open', 'fur', 'the', 'entry', 'of', 'sinh', 'uppealst\\\\nparties', 'in', 'interest', 'who', 'have', 'appeared', 'in', 'the', 'Such', 'appeals', 'may', 'l', '>', 'e', 'claimed', 'whenever', 'th', '#', '\\\\ncause', 'shall', 'iigree', 'thercfn', ',', 'the', 'court', 'is', 'author-', '|amount', 'in', 'controversy', 'esiee.is', 'two', 'thonsan', '<', '|\\\\nized', 'to', 'make', 'such', 'order', ',', 'and', 'no', 'appeal', 'shall', '(', 'dollars', ',', 'and', 'in', 'other', 'casesablythe', 'ceitihcate', \"of'\\\\noperate\", 'to', 'prevent', 'tfie', 'making', 'or', 'execution', 'of', '.', 'the', 'district', 'judge', 'that', 'the', 'adjudication', 'invi', '»', 'U\\\\nsuch', 'order', '.', 'The', 'Secretary', 'of', 'the', 'Navy', 'shall', 'ves', 'a', 'question', 'uf', 'general', 'importance.\\\\nemploy', 'an', 'auctioneer', 'or', 'auctioneers', 'of', 'known', 'withstanding1', 'such', 'apiw^al', ',', 'the', 'district', 'Mint\\\\nskill', 'in', 'the', 'branch', 'of', 'business', 'to', 'w', 'hich', 'any', 'may', 'make', 'and', 'execute', 'all', 'necessary', 'order', '*', 'fe', '«', 'f\\\\nsale', '[', 'lertains', ',', 'to', 'make', 'the', 'wile', ',', 'but', 'the', 'sale', 'I', 'the', 'custody', 'and', 'dis|M', '>', 'sitl', 'of', 'th', '•', 'puze', 'propeity', 'I\\\\nshall', 'be', 'conducted', 'nnder', 'the', 'sujK^rvfsfon', 'of', 'j', 'a', '«', 'i', '»', 'l', 'iu', 'case', 'of', 'appeal', 'from', 'a', 'tteeree', 'of', 'eoadeinh\\\\nthe', 'nutrshal', ',', 'and', 'the', 'crdlecting', 'and', 'deiwi-iling', 'I', 'natum', '.', 'may', 'stiil', 'pr.e', '*', 'i', 'to', 'make', 'a', 'dei', '*', 'ree', 'oj\\\\nof', 'the', 'gross', 'proceerls', 'shall', 'be', 'by', 'the', 'anction-', 'j', 'distribution', 'so', 'ftiras', 'to', 'determine', 'what', 'share\\\\neer', 'or', 'his', 'agent', '.', 'B.', 'fore', 'any', 'sale', 'the', 'marshal', 'j', 'of', 'the', 'prize', 'shall', 'g', '«', '»', 'to', 'the', '<', 'aptors', ',', 'and', 'what\\\\nshall', 'cause', 'tull', 'catalogues', 'and', 'schedules', 'to', '!', '*', '•', ',', 'vessels', 'are', 'entitled', 'to', 'particulate', 'therein', 'Aof\\\\nprejuiretl', 'and', 'circulate', ',', 'and', 'a', '.^', '»', 'pv', 'of', 'Wu-h']\n","['SA', 'LKOFVALUABLE', 'UNIMPBOV', '&', 'D', 'RE\\\\\\\\L\\\\nJSIATF', '.', 'ON', 'THE', 'NORTH', 'BIDEOF', '1ST.', ',', '\\\\nNEAR', '23d', 'ST', 'R', '>', 'ET', 'NORTHWEST.\\\\nBy', 'virtue', 'ol', 'a', 'deed', 'of', 'trust', 'recorded', 'In', 'Lllier^^\\\\nNo', '.', '854.', 'folio', '410.', 'et', 'seq.', ',', 'one', 'of', 'the', 'Land^®\\\\nrecords', 'of', 'the', 'district', 'of', 'Columbia', ',', 'and', 'a', \"'\", '.', '``', '\\\\ndecree', 'of', 'the', 'Bupreme', 'Court', 'of', 'the', 'District', 'of\\\\nColumbia', ',', '[', 'tasked', 'in', 'equity', 'cause', 'No', '.', '5791', '.', 'June\\\\n16th', ',', '1878.', 'we', 'will', ',', 'on', 'FRIDAY', ',', 'the', '88', ':', 'b', 'of\\\\nJune', ',', '1878.', 'at', '6', \"o'clock\", 'p.', 'n', '>', '.', ',', 'in', 'front', 'of', 'the\\\\npitml', '&', 'es', ',', 'seb', 'at', 'pubi', 'c', 'auction', 'lot', '2', ',', 'in', 'square', '40', ',', '\\\\nin', 'tbe', 'city', 'of', 'Washington', ',', 'which', 'said', 'lot', ',', 'uniin-\\\\npioved', ',', 'containing', 'abou', '16', '346', 'square', ',', 'feet', 'of\\\\nground', ',', 'will', 'be', 'subdivided', 'into', 'tnree', 'lots', ',', 'each', 'of\\\\nwhich', 'will', 'have', 'a', 'froLUme', 'of', 'about', '21', 'feet', 'ou', 'I\\\\nstreet', ',', 'and', 'will', 'be', 'soldj\\\\nTerms', 'of', 'sale', ':', 'One-tblrd', ',', 'togethor', 'with', 'the', 'ex¬\\\\npenses', 'of', 'sale', ',', 'in', 'cash', ';', 'the', 'residue', 'in', 'three', 'equal\\\\npay', 'n', 'ents', 'at', 'six', ',', 'twelve', 'and', 'eighteen', 'months', ',', 're¬\\\\nspectively', ',', 'for', 'which', 'tbe', 'notes', 'of', 'the', 'purchaser', ',', '\\\\nbearing', 'interest', 'from', 'the', 'day', 'of', 'sale', 'at', '8', 'per', 'cent', ',', '\\\\nper', 'ai.num', ',', 'p', ':', 'Table', 'semi-annually', ',', 'and', 'secured', 'by\\\\na', 'deed', 'of', 'trust', 'on', 'the', 'property', 'sold', ',', 'will', 'be', 'taken', ';', '\\\\nor', 'the', 'purchaser', 'may', 'pay', 'cash', 'In', 'full', ',', 'at', 'nls', 'op¬\\\\ntion', '.', 'All', 'conveyancing', 'and', 'recording', 'will', 'be', 'at\\\\nthe', 'cost', 'of', 'the', 'purchaser', ',', 'and', 'if', 'the', 'terms', 'of', 'sae\\\\nshall', 'not', 'lie', 'complied', 'with', 'In', 'Ave', 'days', 'after', 'the\\\\ntale', 'the', 'property', 'will', '1', '*', 'n', '*', 'old', 'at', 'the', 'risk', 'and', 'co', '»', 't\\\\nof', 'tbe', 'defaulting', 'purchaser', '.', 'A', 'deposit', 'of', 'f150', ',', 'or\\\\n960', 'c', 'n', 'each', 'sulidivlded', 'lot', ',', 'will', 'be', 'required', 'at', 'the']\n","['God', 'includes', 'all', '.', 'and', 'would', 'we', 'not\\\\ngrieve', 'if', 'he', 'left', 'any', 'out', '?', 'If', 'God\\\\nthought', 'some', 'too', 'large', 'or', 'too', 'email', '.', \"'\\\\nespecially\", 'if', 'they', 'were', 'our', 'children', '?', '\\\\nCJod', 'would', 'not', 'say', 'that', 'Jesse', 'and', 'RuAh.\\\\nand', 'Willie', 'should', 'go', 'to', 'Sabbath\\\\nschool', ',', 'but', 'George', 'and', 'James', '..', 'and\\\\nMarj', \"'\", 'are', 'too', 'old', '.', 'Our', 'hair', 'may', '.', \"''\", 'be-', ',', '\\\\ncomp', 'silvered', ',', 'yet', 'we', 'are', 'but', 'children', ',', ',\\\\nus', 'students', 'of', 'God', \"'s\", 'word', ';', 'children', 'in\\\\nChristian', 'life', 'and', 'service', '.', 'Old', 'and\\\\nyoung', 'we', 'are', 'all', 'children', 'of', 'God', ',', \"'atid-\\\\nneed\", 'to', 'be', 'taught', 'of', 'God', '.', 'Are', 'here\\\\nall', 'thy', 'children', ',', 'both', 'old', 'and', 'young/\\\\ngreat', 'and', 'small', '?', 'The', 'Ideal', 'way', 'and\\\\nthe', 'scriptural', 'way', 'is', 'the', 'whole', 'family\\\\nin', 'the', 'service', 'of', 'public', 'worship', ',', 'and\\\\nthe', 'whole', 'family', 'in', 'the', 'Sabbath\\\\nschool', '.', 'And', 'then', 'there', 'are', 'our', 'neigh¬\\\\nbor', \"'s\", 'children', '.', 'They', 'are', 'also', 'our', 'chH-\\\\ndren', 'in', 'this', 'particular', '.', 'We', 'have', \"''\", 'a\\\\nresponsibility', 'concerning', 'them', '.', 'If', 'we\\\\nare', 'our', 'brother', \"'s\", 'keeper', ',', 'then', 'we', 'are\\\\nalso', 'the', 'keeper', 'of', 'our', 'brother', \"'s\", 'chil¬\\\\ndren', '.', 'There', 'arehelot', 'of', 'spiritual\\\\nwaifs', 'all', 'about', 'us', '.', 'children', 'without\\\\nreligious', 'home', 'training', ',', 'example', 'or\\\\ninfluence', 'The', 'parable', 'of', 'the', 'good\\\\nSamaritan', 'teaches', 'us', 'that', 'our', 'neigh¬\\\\nbor', 'is', 'any', 'one', 'in', 'need', 'that', 'we', 'can\\\\nhelp', '.', 'These', 'children', 'of', 'the', 'streets\\\\naDd', 'of', 'the', 'homes', 'of', 'irreligious', 'or', 'neg¬\\\\nligent', 'parents', 'are', 'our', 'children', 'accord¬\\\\ning', 'to', 'the', 'teachings', '(', 'f', 'Christ', '.', 'They\\\\nare', 'our', 'neighbors', '.', 'They', 'are', 'in', 'need', ',', '\\\\nand', 'we', 'have', 'lt', 'in', 'our', 'power', 'to', 'help\\\\nthem', '.', 'They', 'are', 'worse', 'than', 'sheep\\\\nwithout', 'a', 'shepherd', '.', 'They', 'are', 'the', 'lit¬\\\\ntle', ',', 'innocent', ',', 'helpless', 'lambs', 'without', 'a\\\\nshepherd', '.', 'Do', \"n't\", 'let', 'us', 'think', 'we', 'have\\\\nno', 'responsibility', 'if', 'we', 'have', 'no', 'chfl¬\\\\ndren', '.', 'Do', \"n't\", 'let', 'us', 'think', 'we', 'have', 'done\\\\nour', 'full', 'duty', 'If', 'our', 'own', 'children', 'are\\\\nin', 'the', 'church', 'and', 'Sabbath', 'school', '.', 'Are\\\\nhere', 'all', 'thy', 'children', ',', 'in', 'tire', 'large\\\\nsense', '?', '-our', 'own', 'children', ',', 'large', 'and\\\\nsmall', ',', 'and', 'our', 'neighbor', \"'s\", 'children', ',', '\\\\nall', 'that', 'we', 'ate', 'responsible', 'for,1', '!', 'all\\\\nthat', 'we', 'can', 'influence', 'and', 'instruct', 'in\\\\nspiritual', 'things', '?']\n","['The', 'said', 'action', 'is', 'brought', 'to', 'obtain', 'a', 'decree', 'of\\\\nthis', 'Court', 'for', 'tbe', 'foreclosure', 'of', 'a', 'certain', 'mort-\\\\ngage', 'described', 'In', 'the', 'said', 'Complaint', ',', 'and', \"cxc-\\\\n.U'ed\", 'by', 'the', 'said', 'Edward', 'Naud', ',', 'now', 'deceased', ',', '\\\\nto', 'Thaddeus', 'Amat', ',', 'who', 'assigned', 'same', 'to', 'plain-\\\\ntiff', 'by', 'mesne', 'assign', 'menu', '(', 'wu', 'Complaint', ')', 'on', 'the\\\\nithday', 'of', 'August', ',', 'A', '.', 'D', '.', '1877', ',', 'to', 'secure', 'the', 'pay-\\\\nment', 'of', 'a', 'promissory', 'n.-te', 'fur', 'the', 'sum', 'of', '$', '3,760', ',', '\\\\nexecuted', 'on', 'same', 'day', ',', 'with', 'Interest', 'thereon', 'at\\\\nthe', 'rate', 'of', 'one', 'per', 'cent', ',', 'per', 'month', 'till', 'paid', ',', '\\\\nfrom', 'November', ',', '1877', ',', 'compounded', 'quarter', 'y', ',', 'and\\\\ntor', 'costs', 'of', 'suit', ';', 'that', 'the', 'premises', 'conveyed', 'by-\\\\nsaid', 'Mortgage', 'may', 'be', 'sold', ',', 'and', 'the', 'proceeds', 'ap-\\\\nplied', 'to', 'thu', 'payment', 'of', 'the', 'said', 'promissory', 'note\\\\nand', 'interest', 'as', 'aforesaid', ',', 'and', 'costs', 'of', 'suit', ',', 'and', 'in\\\\ncase', 'such', 'proceeds', 'ars', 'not', 'sufficient', 'to', 'pay', 'the\\\\ngraphthen', 'to', 'obtain', 'an', 'execution', 'against', 'said', 'Vie\\\\ntor', 'Beaudry', ',', 'whois', 'obligated', 'to', 'pay', 'the', 'same', ',', 'for\\\\ntho', 'balance', 'remaining', 'due', ',', 'and', 'also', 'that', 'the', 'de-\\\\nfendants', 'and', 'all', 'persons', 'claiming', 'by', ',', 'through', 'or\\\\nunder', 'them', 'may', 'be', 'barred', 'and', 'foreclosed', 'of', 'aii\\\\nright', ',', 'title', ',', 'claim', ',', 'lien', ',', 'equityof', 'redemption', 'and\\\\ninterest', 'in', 'and', 'tn', 'Stid', 'moitgaged', 'premises', ',', 'and\\\\nfor', 'other', 'and', 'upther', 'relief', '.', 'Reference', 'is', 'hodto\\\\ncomplaint', 'for', 'partculara.\\\\nAnd', 'you', 'are', 'hereby', 'notified', 'that', 'If', 'you', 'fail', 'to\\\\nappear', 'ant', \"'\", 'answer', 'the', 'said', 'complaint', 'as', 'above\\\\nrequired', ',', 'the', 'said', 'plaintiffwillapplyto', 'the', 'Court\\\\nfor', 'iherelitf', 'demanded', 'inthe', 'said', 'complaint.\\\\nGiven', 'under', 'myhand', 'and', 'tbe', 'seal', 'ofthe', 'ssid', 'Su-\\\\nperior', 'Court', 'of', 'the', 'State', 'of', 'California', ',', 'iaand', 'for\\\\nthe', 'county', 'of', 'Los', 'Angeles', ',', 'this', '3d', 'day', 'of', 'August', ',', '\\\\nin', 'the', 'year', 'of', 'our', 'Lord', ',', 'one', 'thousand', 'eight', 'bun\\\\ndrcd', 'and', 'eighty-three', '.']\n","['party', \"''\", 'is', 'a', 'useless', 'exhortation', 'to', 'intel-\\\\nligent', 'men', ',', 'aiiless', 'they', 'see', 'that', 'the', 'par-\\\\nty', 'is', 'resolved', 'to', 'secure', 'those', 'ends', 'which\\\\nintelligent', 'men', 'desire', 'by', 'means', 'of', 'such\\\\nagents', 'as', 'intelligent', 'men', 'can', 'respect.\\\\nThe', 'Republicans', 'iu', 'the', 'Essex', 'district', 'of\\\\nMassachusetts', 'who', 'select', 'a', 'man', 'like\\\\neneral', 'Butler', 'as', 'their', 'representative\\\\ndefeat', 'the', 'Republican', 'candidates', 'in', 'In-\\\\ndiana', 'and', 'Ohio', '.', 'It', 'is', 'they', ',', 'and', 'not\\\\nRepublicans', ',', 'wLo', 'insist', 'ujon', 'honesty\\\\nand', 'principle', 'in', 'politics', ',', 'who', 'are', 're-\\\\nsponsible', 'for', 'Repu', 'I', 'ilican', 'disasters.\\\\nThe', 'general', 'torpidity', 'of', 'business', ',', 'the\\\\nprolonged', 'confusion', 'in', 'the', 'Southern\\\\nStates', ',', 'the', 'suspicion', 'of', 'corruption', 'and\\\\ninefficiency', 'in', 'the', 'public', 'service', ',', 'the\\\\nhostility', 'to', 'stringent', 'temperance', 'legis-\\\\nlation', ',', 'are', 'among', 'the', 'reasons', 'which\\\\nhave', 'fostered', 'that', 'desire', 'for', 'change\\\\nwhich', 'is', 'shown', 'iu', 'the', 'elections', '.', 'There\\\\nis', 'not', 'one', 'of', 'these', 'complaints', ',', 'however', ',', '\\\\nexcept', 'that', 'of', 'the', 'temperance', 'laws', ',', '\\\\nwhich', 'would', 'be', 'removed', 'by', 'a', 'Demo-\\\\ncratic', 'restoration', '.', 'All', 'the', 'sincere', 'jeal-\\\\nousy', 'of011with', 'all', 'tjie', 'hatred', 'that\\\\nsurvives', 'the', 'war', ';', 'all', 'the', 'hostility', 'to', 'the\\\\nprinciples', 'and', 'the', 'purpose', 'of', 'the', 'new\\\\namendments', 'to', 'the', 'Constitution', ';', 'the\\\\nspirit', 'of', 'oppression', 'of', 'the', 'negro', ';', 'the\\\\ndesire', 'of', 'repudiation', 'are', 'all', 'included\\\\nin', 'the', 'Democratic', 'party', '.', 'In', 'States\\\\nwhere', 'the', 'old', 'spirit', 'of', 'caste', ',', 'fostered', 'by\\\\nignorance', 'of', 'every', 'kind', ',', 'is', 'strongest', ',', 'iu\\\\nthose', 'parts', 'of', 'the', 'country', 'which', 'are', 'the\\\\nmost', 'backward', 'in', 'civilization', 'and', 'gen-\\\\neral', 'development', ',', 'the', 'Democratic', 'pari', 'y\\\\nis', 'now', ',', 'as', 'it', 'always', 'was', ',', 'more', 'powerful\\\\ntnan', 'its', 'opponent', '.', 'Iu', 'the', 'great', 'centres\\\\nof', 'intelligence', ',', 'industry', ',', 'enterprise', ',', '\\\\nand', 'an', 'advancing', 'social', \"'condition\", 'the\\\\nRepublican', 'party', 'is', 'dominant', '.', 'Ken-\\\\ntucky', 'and', 'Maryland', 'are', 'distinctively\\\\nDemocratic', 'States', ';', 'Massachusetts', ',', 'Iowa', ',', '\\\\nand', 'rural', 'New', 'York', 'are', 'Republican.\\\\nEvery', 'patriotic', 'and', 'enlightened', 'Amer-\\\\nican', 'must', 'prefer', 'to', 'see', 'thecountry', 'guard\\\\ned', 'by', 'the', 'spirit', 'of', 'the', 'great', 'Northwest\\\\nand', 'of', 'New', 'England', 'and', 'New', 'York\\\\nrather', 'than', 'by', 'tluit.of', 'the', 'old', 'Bourbon\\\\nand', 'Slave', 'States', '.']\n","['has', 'led', 'me', 'to', 'accept', ',', 'everything', 'I', 'read\\\\nwith', 'a', 'measure', 'of', 'distrust', ',', 'and', 'I', 'take\\\\nnothing', 'for', 'granted', 'because', 'it', 'has', 'come\\\\nfrom', 'the', 'pen', 'of', 'one', 'whose', 'prominence\\\\ngives', 'his', 'opinions', 'weight', ',', 'whether\\\\nthey', 'are', 'right', 'or', 'wrong', '.', 'My', 'neigh-\\\\nbors', 'are', 'different', '.', 'Their', 'advancement\\\\nis', 'slow', 'and', 'frequently', 'wrong', 'They\\\\nget', 'hold', 'of', 'exploded', 'ideas', 'years', 'after\\\\nthe', 'explosion', ',', 'and', 'because', 'of', 'the', 'prob-\\\\nabilities', 'of', 'a', 'thing', ',', 'it', 'is', 'accepted', 'as', 'a\\\\nfact', '.', 'But', 'neighbors', 'are', 'about', 'alike', 'in\\\\nevery', 'township', 'in', 'the', 'land', 'outside', 'of\\\\nthe', 'very', 'centres', 'of', 'civilization', ',', 'where\\\\nthe', 'light', 'of', 'knowledge', 'flashes', 'from\\\\nmind', 'to', 'mind', 'in', 'the', 'human', 'conflict', 'to\\\\nreach', 'the', 'highest', 'round', 'of', 'the', 'ladder.\\\\nIt', 'is', 'astonishing', 'men', 'will', 'live', 'and', 'die\\\\nin', 'this', 'age', 'and', 'not', 'know', 'the', 'earth', 'is\\\\nround', '.', 'School', 'houses', 'on', 'almost', 'every\\\\nfarm', ';', 'books', 'of', 'all', 'kinds', 'within', 'reach', ',', '\\\\nand', 'yetseparately.that', 'the', 'earth', 'has', 'mo-\\\\ntion', '.', 'Aday', 'ortwo', 'agoItalked', 'to', 'a\\\\nprominent', 'attorney', 'in', 'Butler', ',', 'and', ',', '\\\\nwould', 'you', 'believe', 'it', ',', 'ho', 'actually', 'argued\\\\nthat', 'the', 'farther', 'you', 'go', 'south', 'the', 'hotter\\\\nit', 'got', ',', 'exactly', 'as', 'the', 'further', 'north', 'you\\\\nwent', 'the', 'colder', 'it', 'got', '.', 'It', 'is', 'ridiculous', '!', '\\\\nDuring', 'all', 'of', 'that', 'man', \"'s\", 'busy', 'life', 'be\\\\nbad', 'not', 'paused', 'to', 'make', 'one', 'application\\\\nof', 'his', 'knowledge', ',', 'so', 'he', 'could', 'practical-\\\\nly', 'understand', 'the', 'relationship', 'existing\\\\nbetween', 'the', 'North', 'and', 'South', 'poles', ',', '\\\\nthe', 'equator', 'aud', 'the', 'suu', '.', '``', '\\\\nWe', 'came', 'to', 'the', 'house', 'and', 'I', 'was', 'con-\\\\nducted', 'into', 'a', 'large', 'room', 'fitted', 'up', 'at\\\\none', 'end', 'for', 'a', 'library', 'and', 'at', 'the\\\\nother', 'for', 'a', 'workshop', ',', 'with', 'a', 'sliding\\\\ncurtain', 'as', 'a', 'dividing', 'partition', '.', 'The\\\\nroom', 'was', 'filled', 'with', 'an', 'array', 'of', 'cur-\\\\nious', 'things', '.', 'Maps', ',', 'books', 'every', 'where', ',', '\\\\nglobes', ',', 'large', 'and', 'small', '.', 'The', 'earth\\\\nrepresented', 'in', 'dozeus', 'of', 'wonderful\\\\nshapes', '.']\n","['The', 'wool', 'circulars', 'alluded', 'to', 'are\\\\nthose', 'which', 'give', 'the', 'quotations', 'side\\\\nby', 'side', 'of', 'Ohio', 'medium', 'in', 'the', 'United\\\\nStates', 'and', 'Australasian', 'medium', 'of\\\\nthe', 'same', 'quality', 'and', 'condition', 'in\\\\nLondon', '.', 'the', 'time', 'that', 'the', 'tarif', 'law\\\\nwent', 'into', 'effect', 'in', '1868', ',', 'up', 'to', 'and', 'in-\\\\ncluding', '1891', ',', 'showing', 'that', 'the', 'aver-\\\\nage', 'price', 'received', 'for', 'wool', 'of', 'the', 'same\\\\nquality', 'in', 'the', 'tree', 'wool', 'market', 'of', 'Lon-\\\\ndon', 'during', 'all', 'of', 'that', 'period', 'averagd\\\\n51', 'per', 'cent', '.', 'lees', 'than', 'the', 'price', 'paidin\\\\nthe', 'United', 'States', 'for', 'the', 'same', 'kindof\\\\nAmerican', 'wool', 'under', 'protection.\\\\nThe', 'quotations', 'for', 'domestic', 'wool\\\\nwhich', '.', 'be', 'says', ',', 'are', 'incorrect', ',', 'are', 'tak-\\\\nen', 'from', 'Mr.', 'Springer', \"'s\", 'own', 'report', 'of\\\\nthe', 'Ways', 'and', 'Means', 'Committee', 'to\\\\nthe', 'Houseof', 'Representatives', ';', 'see', 'page\\\\n34', ',', 'report', 'No', '.', '501', '.', 'We', 'assumed', 'that\\\\nMr', '.', 'Springer', \"'s\", 'figures', 'werecorrect', ',', 'and\\\\nnever', 'questionedaaccuracy', ',', 'as\\\\nthey', 'were', 'furnished', 'by', 'him', 'as', 'chair-\\\\nman', 'of', 'the', 'Ways', 'and', 'Means', 'commit-\\\\ntee', 'of', 'the', 'house', 'of', 'representatives', ';', 'and\\\\nthis', 'ought', 'to', 'be', ',', 'and', 'therefore', 'has\\\\nbeen', ',', 'the', 'best', 'authority', '.', 'TheLondon\\\\nprices', 'were', 'obtained', 'from', 'the', 'pub-\\\\nlished', 'quotations', 'of', 'Jan.', '1', ',', '1892', ',', 'of\\\\nMessrs', '.', 'Windeler', '&', 'Co.', ',', 'of', 'London', ',', '\\\\nEngland', ',', 'and', 'are', 'prepared', 'by', 'them\\\\nfor', 'the', 'London', 'market', 'without', 're-\\\\ngard', 'to', 'any', 'political', 'use', 'that', 'might\\\\nbe', 'made', 'of', 'them', 'in', 'the', 'United', 'States.\\\\nThese', 'London', 'quotations', 'of', 'the\\\\nMessrs', '.', 'Windeler', ',', 'which', 'we', 'use', ',', 'are\\\\nconfirmed', 'by', 'those', 'of', 'Messrs.', 'Helmnth', ',', '\\\\nSwartz', '&', 'Co', '..', 'ot', 'London', ',', 'Mesrs', '.', 'Bx-\\\\nton', ',', 'Ronald', '&', 'Co.', ',', 'of', 'London', ',', 'and\\\\nalso', 'by', 'the', 'Bradford', 'Observer', ',', 'of\\\\nBradford', ',', 'England', ',', 'the', 'onenewspaper\\\\nthat', 'is', 'recognized', 'throughout', 'themer-\\\\ncantile', 'world', 'as', 'authority', 'on', 'matters\\\\n•rlating', 'to', 'wool', 'and', 'manufactures\\\\nthereof', '.']\n"]}]},{"cell_type":"code","source":["def strip(text):\n"," txt = str(text).lower().strip()\n"," txt = txt.replace(\"\", \"'\")\n"," txt = txt.replace(\" this\\\\nplace\", \"this place\")\n"," txt = txt.replace(\"'we\\\\nwere\", \"we were\")\n"," txt = txt.replace(\"'ever\\\\nwas\", \"ever was\")\n"," txt = txt.replace(\"'making\\\\nsuch\", \"making such\")\n"," txt = txt.replace(\"'boot\\\\nto\", \"boot to\")\n"," txt = txt.replace(\"'elsewhere\\\\nfrom\", \"elsewhere from\")\n"," txt=txt.replace(\"United\\\\nStates\",\"United States\")\n"," txt = txt.replace(\"Unit-\\\\ned\",\"United\" )\n"," txt = txt.replace(\"neigh-\\\\nbors\", \"neighbours\")\n"," txt = txt.replace(\"aver-\\\\nage\", \"average\")\n"," txt = txt.replace(\"people\\\\ndown\", \"people down\")\n"," txt =re.compile(r\"'s|[\\-]|\\-\\\\n|\\p{P}\").sub(\"\", txt)\n"," txt = re.compile(r\"[{}\\[\\]\\&%^$*#\\(\\)@\\t\\n0123456789]+\").sub(\" \", txt)\n"," return txt"],"metadata":{"id":"hXH7GrqPvaCf","executionInfo":{"status":"ok","timestamp":1680630221749,"user_tz":-120,"elapsed":39,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":52,"outputs":[]},{"cell_type":"code","source":["for _, x in train[:2].iterrows():\n"," words = nltk.word_tokenize(strip(x['Concatenated']))\n"," print(words)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"rETGj0y2wegj","executionInfo":{"status":"ok","timestamp":1680630221749,"user_tz":-120,"elapsed":38,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"fedfa8e5-f9eb-4e3e-e4e9-4f1b6d0af1dc"},"execution_count":53,"outputs":[{"output_type":"stream","name":"stdout","text":["['came', 'fiom', 'the', 'last', 'place', 'tothis', 'place', 'and', 'this', 'place', 'is', 'where', 'wenwere', 'this', 'is', 'the', 'first', 'road', 'i', 'evernwas', 'on', 'where', 'you', 'can', 'ride', 'elsewherenfrom', 'anywhere', 'and', 'be', 'nowherenhe', 'says', 'while', 'this', 'train', 'stops', 'everynwhere', 'it', 'never', 'stops', 'anywhere', 'unnless', 'its', 'somewhere', 'well', 'i', 'saysnim', 'glad', 'to', 'hear', 'that', 'but', 'accordning', 'to', 'your', 'figures', 'i', 'left', 'myselfnwhere', 'was', 'which', 'is', 'five', 'miles', 'nearner', 'to', 'myself', 'than', 'i', 'was', 'when', 'wenwere', 'where', 'we', 'are', 'nownwe', 'have', 'now', 'reached', 'slidellnthat', 'a', 'fine', 'place', 'the', 'people', 'down', 'there', 'remind', 'me', 'of', 'bananasnthey', 'come', 'and', 'go', 'in', 'bunches', 'ndell', 'used', 'to', 'be', 'noted', 'for', 'her', 'toughnpeople', 'now', 'she', 'is', 'noted', 'for', 'bentough', 'steaks', 'well', 'i', 'certainly', 'gotnone', 'there', 'when', 'the', 'waiter', 'broughtnit', 'in', 'it', 'was', 'so', 'small', 'i', 'thought', 'itnwas', 'a', 'crack', 'in', 'the', 'plate', 'i', 'skidnwaiter', 'what', 'else', 'have', 'you', 'got', '+henbrought', 'me', 'in', 'two', 'codfish', 'and', 'onensmelt', 'i', 'said', 'waiter', 'have', 'you', 'gotnpigs', 'feet', 'he', 'said', 'no', 'rheumatismnmakes', 'me', 'walk', 'that', 'way', 'i', 'saldnhow', 'is', 'the', 'pumpkin', 'pieliesaidnit', 'all', 'squash', 'the', 'best', 'i', 'could', 'getnin', 'that', 'hotel', 'was', 'a', 'soup', 'sandwichnafter', 'the', 'table', 'battle', 'the', 'waiter', 'andni', 'signed', 'an', 'armistice', 'i', 'then', 'wentnover', 'to', 'the', 'hotel', 'clerk', 'and', 'asked', 'forna', 'room', 'he', 'said', 'with', 'or', 'without', 'anbed', 'i', 'said', 'with', 'a', 'bed', 'he', 'saidni', 'dont', 'think', 'i', 'have', 'a', 'bed', 'longnenough', 'for', 'you', 'i', 'said', 'well', 'illnaddtwo', 'feettoitwhenigetinitnhe', 'gave', 'me', 'a', 'lovely', 'room', 'on', 'thentop', 'floor', 'it', 'was', 'one', 'of', 'those', 'roomsnthat', 'stands', 'on', 'each', 'side', 'if', 'younhappen', 'to', 'get', 'up', 'in', 'the', 'middle', 'ofnthe', 'night', 'you', 'want', 'to', 'be', 'sure', 'andnget', 'up', 'in', 'the', 'middle', 'of', 'the', 'roomnthat', 'night', 'i', 'dreamt', 'i', 'was', 'eatingnflannel', 'cakes', 'when', 'i', 'woke', 'up', 'halfnof', 'the', 'blanket', 'was', 'gone', 'i', 'mustnhave', 'got', 'up', 'on', 'the', 'wrong', 'side', 'of', 'thenbed', 'for', 'next', 'morning', 'i', 'had', 'an', 'awfulnheadache', 'i', 'told', 'the', 'manager', 'aboutnit', 'he', 'said', 'you', 'have', 'rheumaticnpains', 'i', 'said', 'no', 'i', 'think', 'it', 'is', 'onnof', 'those', 'attic', 'room', 'pains', 'i', 'nad', 'tongetupat', 'aminthemorningsonthey', 'could', 'use', 'the', 'sheet', 'to', 'set', 'thenbreakfast', 'table']\n","['mb', 'boot', 'political', 'obeednattempt', 'to', 'imagine', 'a', 'piatt', 'makingnsuch', 'an', 'address', 'as', 'that', 'of', 'elihu', 'bootnto', 'the', 'now', 'york', 'legislature', 'and', 'younfcavo', 'a', 'measure', 'of', 'tho', 'good', 'fortunqnwhich', 'baa', 'at', 'last', 'come', 'to', 'tho', 'empirqnstate', 'of', 'being', 'represented', 'in', 'tho', 'unitned', 'states', 'senate', 'by', 'a', 'statesman', 'atntho', 'very', 'outset', 'mr', 'boot', 'declared', 'forntho', 'parcels', 'post', 'thereby', 'giving', 'noticento', 'tho', 'country', 'that', 'tho', 'express', 'compannies', 'no', 'longer', 'own', 'a', 'senatorial', 'scat', 'acncredited', 'to', 'new', 'york', 'that', 'seat', 'willnfor', 'ho', 'next', 'six', 'years', 'bo', 'occupied', 'by', 'ansmaa', 'who', 'hag', 'convictions', 'of', 'his', 'ownnwho', 'isigovemed', 'by', 'reasoned', 'politicaln', 'ideas', 'who', 'had', 'grown', 'so', 'accustomed', 'tonthink', 'nationally', 'that', 'it', 'is', 'with', 'somonmental', 'eflort', 'that', 'he', 'can', 'bringhimselfninto', 'a', 'proper', 'perspective', 'with', 'thosenminor', 'senatorial', 'duties', 'such', 'as', 'tho', 'fillning', 'of', 'offices', 'which', 'bulk', 'hugelynupon', 'the', 'horizons', 'of', 'tho', 'flatts', 'andntheir', 'lit', 'tho', 'albany', 'politicians', 'wenare', 'told', 'tried', 'to', 'read', 'between', 'tho', 'linesnfor', 'evidence', 'that', 'they', 'had', 'among', 'themna', 'new', 'organization', 'leader', 'somo', 'one', 'tonguide', 'and', 'direct', 'their', 'political', 'machinnations', 'and', 'to', 'settlo', 'where', 'tho', 'goodnthings', 'should', 'go', 'wo', 'think', 'they', 'lisntened', 'in', 'vain', 'what', 'they', 'heard', 'werentimely', 'reflections', 'opon', 'tho', 'immediatenproblems', 'of', 'stato', 'and', 'national', 'governnments', 'mixed', 'with', 'excellent', 'advice', 'tonthe', 'electorate', 'on', 'the', 'duty', 'of', 'improvingnthe', 'quality', 'of', 'tho', 'stato', 'legislaturesnit', 'must', 'have', 'been', 'something', 'of', 'a', 'novnelty', 'though', 'possibly', 'not', 'wholly', 'refreshlin', 'gnto', 'political', 'thirst']\n"]}]},{"cell_type":"code","source":["words = []\n","\n","def train_model(data, m):\n"," for y,x in data.iterrows():\n"," words = nltk.word_tokenize(strip(x['Concatenated']))\n"," #print(words)\n"," for word_1, word_2 in nltk.bigrams(words, pad_left=True, pad_right=True):\n"," if word_1 and word_2:\n"," m[word_2][word_1] += 1\n"," for word_2 in m:\n"," summ = sum(m[word_2].values())\n"," summ = float(summ)\n"," for word_1 in m[word_2]:\n"," m[word_2][word_1] /= summ\n"],"metadata":{"id":"WZKuuolhvY8V","executionInfo":{"status":"ok","timestamp":1680630221750,"user_tz":-120,"elapsed":25,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":54,"outputs":[]},{"cell_type":"code","source":["def base_prob():\n"," return \"the:0.3 a:0.3 to:0.2 and:0.1 :0.1\""],"metadata":{"id":"Y03sSwse6f4T","executionInfo":{"status":"ok","timestamp":1680630221750,"user_tz":-120,"elapsed":24,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":55,"outputs":[]},{"cell_type":"code","source":["model = defaultdict(lambda: defaultdict(lambda: 0))\n","train_model(train, model)"],"metadata":{"id":"Gd-KV5PYvU_T","executionInfo":{"status":"ok","timestamp":1680630688249,"user_tz":-120,"elapsed":466523,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":56,"outputs":[]},{"cell_type":"code","source":["def predict_words(w, model):\n"," sum = 0\n"," preds= dict(model[w])\n"," most_common = dict(Counter(preds).most_common(6))\n"," pred = \"\"\n"," for w, prob in most_common.items():\n"," sum += prob\n"," pred += f\"{w}:{prob} \"\n"," if sum == 0.0:\n"," base_prob()\n"," rest = 1 - sum\n"," pred += f\":{rest}\"\n"," return pred\n"],"metadata":{"id":"SSBDDvtH5wbe","executionInfo":{"status":"ok","timestamp":1680630688250,"user_tz":-120,"elapsed":30,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":57,"outputs":[]},{"cell_type":"code","source":["ls"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"zCx4pHsy_9Us","executionInfo":{"status":"ok","timestamp":1680630688251,"user_tz":-120,"elapsed":28,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"6fa34b83-297e-4a5f-b40f-ed2712dee4d7"},"execution_count":58,"outputs":[{"output_type":"stream","name":"stdout","text":["config.txt in-header.tsv README.md \u001b[0m\u001b[01;34mtrain\u001b[0m/\n","\u001b[01;34mdev-0\u001b[0m/ out-header.tsv \u001b[01;34mtest-A\u001b[0m/ Untitled0.ipynb\n"]}]},{"cell_type":"code","source":["len(test_d)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"_H7nbHwrjIMr","executionInfo":{"status":"ok","timestamp":1680633295078,"user_tz":-120,"elapsed":3,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"268a4a40-e413-43aa-ac77-6dded473bda5"},"execution_count":60,"outputs":[{"output_type":"execute_result","data":{"text/plain":["7362"]},"metadata":{},"execution_count":60}]},{"cell_type":"code","source":["test_d = pd.read_csv(\"test-A/in.tsv.xz\", sep=\"\\t\", on_bad_lines='skip', quoting=QUOTE_NONE, header=None, encoding=\"utf-8\")\n","dev_d = pd.read_csv(\"dev-0/in.tsv.xz\", sep=\"\\t\", on_bad_lines='skip', quoting=QUOTE_NONE, header=None, encoding=\"utf-8\")\n","\n","with open( \"dev-0/out.tsv\", \"w\", encoding=\"utf-8\") as f:\n"," for y,x in dev_d.iterrows():\n"," w = nltk.word_tokenize(strip(x[7]))\n"," prediction = predict_words(w[0], model)\n"," f.write(prediction + \"\\n\")\n","\n","with open( \"test-A/out.tsv\", \"w\", encoding=\"utf-8\") as f:\n"," for y,x in test_d.iterrows():\n"," w = nltk.word_tokenize(strip(x[7]))\n"," prediction = predict_words(w[0], model)\n"," f.write(prediction + \"\\n\")\n","\n"],"metadata":{"id":"xAfFnHPtjIvy","executionInfo":{"status":"ok","timestamp":1680639029118,"user_tz":-120,"elapsed":767416,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":117,"outputs":[]},{"cell_type":"code","source":["with open('test-A/out.tsv', 'a', encoding='utf-8') as my_file:\n"," for x in range(52):\n"," my_file.write(\"the:0.3 a:0.3 to:0.2 and:0.1 :0.1\\n\")"],"metadata":{"id":"b_drRK2cwZVA","executionInfo":{"status":"ok","timestamp":1680637017496,"user_tz":-120,"elapsed":528,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":104,"outputs":[]},{"cell_type":"code","source":["with open('dev-0/out.tsv', 'a', encoding='utf-8') as my_file:\n"," for x in range(117):\n"," my_file.write(\"the:0.3 a:0.3 to:0.2 and:0.1 :0.1\\n\")"],"metadata":{"id":"BjZV7Jndw3X3","executionInfo":{"status":"ok","timestamp":1680637076582,"user_tz":-120,"elapsed":481,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":106,"outputs":[]},{"cell_type":"code","source":["badlines_list = []\n","def badlines_collect (bad_line: list[str]) -> None:\n"," badlines_list.append(bad_line)\n"," return None"],"metadata":{"id":"1x-C0Q-zqWw3","executionInfo":{"status":"ok","timestamp":1680635165047,"user_tz":-120,"elapsed":496,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":82,"outputs":[]},{"cell_type":"code","source":["from csv import QUOTE_NONE"],"metadata":{"id":"IY9xpX621Xaq","executionInfo":{"status":"ok","timestamp":1680638128840,"user_tz":-120,"elapsed":9,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":111,"outputs":[]},{"cell_type":"code","source":["t_dd = pd.read_csv(\"test-A/in.tsv.xz\", sep=\"\\t\", on_bad_lines='skip', quoting=QUOTE_NONE, header=None, encoding=\"utf-8\")"],"metadata":{"id":"iMcNN7l1ppHF","executionInfo":{"status":"ok","timestamp":1680638242280,"user_tz":-120,"elapsed":1167,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":115,"outputs":[]},{"cell_type":"code","source":["len(t_dd)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ROKCr_w52G5i","executionInfo":{"status":"ok","timestamp":1680638253381,"user_tz":-120,"elapsed":507,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"27793785-6388-4e73-ae19-1f6e691aac9e"},"execution_count":116,"outputs":[{"output_type":"execute_result","data":{"text/plain":["7414"]},"metadata":{},"execution_count":116}]},{"cell_type":"code","source":["test_d = pd.read_csv(\"test-A/in.tsv.xz\", sep=\"\\t\", error_bad_lines=False, header=None, encoding=\"utf-8\")"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"tRC1XdiOkYhE","executionInfo":{"status":"ok","timestamp":1680633954027,"user_tz":-120,"elapsed":568,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"940472ab-297b-4767-ccee-888bb93e2afc"},"execution_count":73,"outputs":[{"output_type":"stream","name":"stderr","text":["<ipython-input-73-37554e0a2e1a>:1: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n","\n","\n"," test_d = pd.read_csv(\"test-A/in.tsv.xz\", sep=\"\\t\", error_bad_lines=False, header=None, encoding=\"utf-8\")\n","b'Skipping line 2977: expected 8 fields, saw 9\\n'\n"]}]},{"cell_type":"code","source":["len(test_d)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"mWzY93O3lw6T","executionInfo":{"status":"ok","timestamp":1680634061481,"user_tz":-120,"elapsed":388,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"67633a1e-9ebd-4218-9ba8-0cbc2b1ba2cc"},"execution_count":78,"outputs":[{"output_type":"execute_result","data":{"text/plain":["7362"]},"metadata":{},"execution_count":78}]},{"cell_type":"code","source":["test_d_r = pd.read_csv(\"test-A/out.tsv\", sep=\"\\t\", lineterminator='\\r', header=None, encoding=\"utf-8\")"],"metadata":{"id":"_0sIhv8qjWFI","executionInfo":{"status":"ok","timestamp":1680634059247,"user_tz":-120,"elapsed":747,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":77,"outputs":[]},{"cell_type":"code","source":["len(test_d_r)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"bk2fG39ljcZc","executionInfo":{"status":"ok","timestamp":1680633362950,"user_tz":-120,"elapsed":9,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"ef59403a-9245-48cf-c362-a78e1db10b09"},"execution_count":63,"outputs":[{"output_type":"execute_result","data":{"text/plain":["7362"]},"metadata":{},"execution_count":63}]},{"cell_type":"code","source":["len(dev_d)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"UARdO7zJo8AO","executionInfo":{"status":"ok","timestamp":1680634797578,"user_tz":-120,"elapsed":510,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"09342b1d-d5e6-4714-887d-0332fab14b03"},"execution_count":80,"outputs":[{"output_type":"execute_result","data":{"text/plain":["10402"]},"metadata":{},"execution_count":80}]},{"cell_type":"code","source":["rowcount=0\n","for row in lzma.open(\"test-A/in.tsv.xz\"):\n"," rowcount+= 1\n"," #printing the result\n","print(\"Number of lines present:-\", rowcount)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"oc1H8tdcjls1","executionInfo":{"status":"ok","timestamp":1680636817946,"user_tz":-120,"elapsed":489,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"47bdeb25-47d3-475f-b30a-017ca71bccfd"},"execution_count":99,"outputs":[{"output_type":"stream","name":"stdout","text":["Number of lines present:- 7414\n"]}]},{"cell_type":"code","source":["rowcount=0\n","for row in lzma.open(\"dev-0/in.tsv.xz\"):\n"," rowcount+= 1\n"," #printing the result\n","print(\"Number of lines present:-\", rowcount)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"R8XP9qPuo0YL","executionInfo":{"status":"ok","timestamp":1680635751543,"user_tz":-120,"elapsed":686,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"f17084d2-ba52-4b9e-8581-0a1c3bb9ce24"},"execution_count":94,"outputs":[{"output_type":"stream","name":"stdout","text":["Number of lines present:- 10519\n"]}]},{"cell_type":"code","source":["rowcount=0\n","for row in open(\"dev-0/out.tsv\"):\n"," rowcount+= 1\n"," #printing the result\n","print(\"Number of lines present:-\", rowcount)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"r-AIThCEso8o","executionInfo":{"status":"ok","timestamp":1680637096916,"user_tz":-120,"elapsed":699,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"72d114a9-8f30-489b-d18d-0807a9663130"},"execution_count":107,"outputs":[{"output_type":"stream","name":"stdout","text":["Number of lines present:- 10519\n"]}]}]}