challenging-america-word-ga.../kenlm.ipynb

1 line
65 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyOjlR0HzrxQLi9ivvf3rrhL"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":6,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"kgUXhu_9HEZY","executionInfo":{"status":"ok","timestamp":1682427020888,"user_tz":-120,"elapsed":7836,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"93b9b737-532d-4892-d4bf-66579ee7c849"},"outputs":[{"output_type":"stream","name":"stdout","text":["Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"]}],"source":["from google.colab import drive\n","drive.mount('/content/drive')"]},{"cell_type":"code","source":["cd drive/MyDrive"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"4gxwwa5-Haqo","executionInfo":{"status":"ok","timestamp":1682427020889,"user_tz":-120,"elapsed":13,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"d5a1b591-43f9-4810-fb49-bf247c1a08e2"},"execution_count":7,"outputs":[{"output_type":"stream","name":"stdout","text":["[Errno 2] No such file or directory: 'drive/MyDrive'\n","/content/drive/MyDrive/challenging-america-word-gap-prediction\n"]}]},{"cell_type":"code","source":["cd challenging-america-word-gap-prediction/"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"78igYakJHokM","executionInfo":{"status":"ok","timestamp":1682427020891,"user_tz":-120,"elapsed":12,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"c1906f10-600a-4170-b61f-ab3005a2cf2a"},"execution_count":8,"outputs":[{"output_type":"stream","name":"stdout","text":["[Errno 2] No such file or directory: 
'challenging-america-word-gap-prediction/'\n","/content/drive/MyDrive/challenging-america-word-gap-prediction\n"]}]},{"cell_type":"code","source":["import csv\n","\n","import pandas as pd"],"metadata":{"id":"-wyIUdlBHp2W","executionInfo":{"status":"ok","timestamp":1682427020892,"user_tz":-120,"elapsed":9,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":9,"outputs":[]},{"cell_type":"code","source":["# Read every tab-separated line verbatim (no quote handling). With pandas' default\n","# quoting the two files parse to different row counts - the tail of the combined\n","# frame shows NaN in the expected-word column - so contexts and targets end up\n","# paired by position with the wrong rows.\n","data = pd.read_csv(\"train/in.tsv.xz\", sep=\"\\t\", on_bad_lines='skip', header=None, encoding=\"utf-8\", quoting=csv.QUOTE_NONE)\n","\n","exp_words = pd.read_csv(\"train/expected.tsv\", sep=\"\\t\", on_bad_lines='skip', header=None, encoding=\"utf-8\", quoting=csv.QUOTE_NONE)\n","\n","# The frames are joined column-wise by position below, so they must have equal length.\n","assert len(data) == len(exp_words), f\"row count mismatch: {len(data)} inputs vs {len(exp_words)} expected words\""],"metadata":{"id":"kA6PExReHr3E","executionInfo":{"status":"ok","timestamp":1682430631336,"user_tz":-120,"elapsed":39975,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":28,"outputs":[]},{"cell_type":"code","execution_count":29,"metadata":{"executionInfo":{"elapsed":17,"status":"ok","timestamp":1682430631338,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"},"user_tz":-120},"id":"upTQ5Po9wOSL"},"outputs":[],"source":["train_data = data[[6, 7]]"]},{"cell_type":"code","execution_count":30,"metadata":{"executionInfo":{"elapsed":18,"status":"ok","timestamp":1682430631341,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"},"user_tz":-120},"id":"OdEm_SBSwXuY"},"outputs":[],"source":["train_data= pd.concat([train_data, exp_words], axis=1)"]},{"cell_type":"code","execution_count":31,"metadata":{"executionInfo":{"elapsed":19,"status":"ok","timestamp":1682430631343,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"},"user_tz":-120},"id":"b1TM741wwYdA"},"outputs":[],"source":["train_data.rename(columns={6: 'First Part', 7: 'Second Part', 0:'Expected word'}, 
inplace=True)"]},{"cell_type":"code","execution_count":32,"metadata":{"executionInfo":{"elapsed":1675,"status":"ok","timestamp":1682430633001,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"},"user_tz":-120},"id":"6Zfbmp-IxKUH"},"outputs":[],"source":["# Join the pieces with spaces: the right context can start directly with a word\n","# (row 0 begins with 'said'), so a bare + would fuse the gap word with its neighbour.\n","train_data['Concatenated'] = train_data['First Part'] + ' ' + train_data['Expected word'] + ' ' + train_data['Second Part']"]},{"cell_type":"code","source":[],"metadata":{"id":"vglIDWIxgyQk"},"execution_count":null,"outputs":[]},{"cell_type":"code","execution_count":15,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":658},"executionInfo":{"elapsed":3103,"status":"ok","timestamp":1682427836629,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"},"user_tz":-120},"id":"lQQHHALRxiHj","outputId":"f1bde340-fa0b-494a-a5a3-b0d8277d9522"},"outputs":[{"output_type":"execute_result","data":{"text/plain":[" First Part \\\n","0 came fiom the last place to this\\nplace, and t... \n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n","2 Thera were in 1771 only aeventy-nine\\n*ub*erlb... \n","3 whenever any prize property shall!*' condemn- ... \n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... \n","... ... \n","428512 Sam Clendenin bad a fancy for Ui«\\nscience of ... \n","428513 Wita.htt halting the party ware dilven to the ... \n","428514 It was the last thing that either of\\nthem exp... \n","428515 settlement with the department.\\nIt is also sh... \n","428516 Flour quotations—low extras at 1 R0®2 50;\\ncit... \n","\n"," Second Part Expected word \\\n","0 said\\nit's all squash. The best I could get\\ni... lie \n","1 \\ninto a proper perspective with those\\nminor ... himself \n","2 NaN of \n","3 the ceitihcate of'\\noperate to prevent tfie ma... ably \n","4 \\nTerms of sale: One-tblrd, togethor with the ... j \n","... ... ... \n","428512 \\nSam was arrested.\\nThe case excited a great ... NaN \n","428513 through the alnp the »Uitors laapeeeed tia.»\\n... 
NaN \n","428514 Agua Negra across the line.\\nIt was a grim pla... NaN \n","428515 \\na note of Wood, Dialogue fc Co., for\\nc27,im... NaN \n","428516 3214c;do White at 3614c: Mixed Western at\\n331... NaN \n","\n"," Concatenated \n","0 came fiom the last place to this\\nplace, and t... \n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n","2 NaN \n","3 whenever any prize property shall!*' condemn- ... \n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... \n","... ... \n","428512 NaN \n","428513 NaN \n","428514 NaN \n","428515 NaN \n","428516 NaN \n","\n","[428517 rows x 4 columns]"],"text/html":["\n"," <div id=\"df-cf00827c-83d4-412b-8fb7-1d9bdedbd249\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>First Part</th>\n"," <th>Second Part</th>\n"," <th>Expected word</th>\n"," <th>Concatenated</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>came fiom the last place to this\\nplace, and t...</td>\n"," <td>said\\nit's all squash. The best I could get\\ni...</td>\n"," <td>lie</td>\n"," <td>came fiom the last place to this\\nplace, and t...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...</td>\n"," <td>\\ninto a proper perspective with those\\nminor ...</td>\n"," <td>himself</td>\n"," <td>MB. 
BOOT'S POLITICAL OBEED\\nAttempt to imagine...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>Thera were in 1771 only aeventy-nine\\n*ub*erlb...</td>\n"," <td>NaN</td>\n"," <td>of</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>whenever any prize property shall!*' condemn- ...</td>\n"," <td>the ceitihcate of'\\noperate to prevent tfie ma...</td>\n"," <td>ably</td>\n"," <td>whenever any prize property shall!*' condemn- ...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>SA LKOFVALUABLE UNIMPBOV&amp;D RE\\\\L\\nJSIATF. ON T...</td>\n"," <td>\\nTerms of sale: One-tblrd, togethor with the ...</td>\n"," <td>j</td>\n"," <td>SA LKOFVALUABLE UNIMPBOV&amp;D RE\\\\L\\nJSIATF. ON T...</td>\n"," </tr>\n"," <tr>\n"," <th>...</th>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," </tr>\n"," <tr>\n"," <th>428512</th>\n"," <td>Sam Clendenin bad a fancy for Ui«\\nscience of ...</td>\n"," <td>\\nSam was arrested.\\nThe case excited a great ...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>428513</th>\n"," <td>Wita.htt halting the party ware dilven to the ...</td>\n"," <td>through the alnp the »Uitors laapeeeed tia.»\\n...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>428514</th>\n"," <td>It was the last thing that either of\\nthem exp...</td>\n"," <td>Agua Negra across the line.\\nIt was a grim pla...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>428515</th>\n"," <td>settlement with the department.\\nIt is also sh...</td>\n"," <td>\\na note of Wood, Dialogue fc Co., for\\nc27,im...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>428516</th>\n"," <td>Flour quotations—low extras at 1 R0®2 50;\\ncit...</td>\n"," <td>3214c;do White at 3614c: Mixed Western at\\n331...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>428517 rows × 4 columns</p>\n","</div>\n"," <button class=\"colab-df-convert\" 
onclick=\"convertToInteractive('df-cf00827c-83d4-412b-8fb7-1d9bdedbd249')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-cf00827c-83d4-412b-8fb7-1d9bdedbd249 button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 
'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-cf00827c-83d4-412b-8fb7-1d9bdedbd249');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":15}],"source":["import regex as re\n","train_data.replace('\\n', '', regex=True)"]},{"cell_type":"code","source":[],"metadata":{"id":"I2h3IPk7g26L"},"execution_count":null,"outputs":[]},{"cell_type":"code","execution_count":16,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":658},"executionInfo":{"elapsed":6210,"status":"ok","timestamp":1682427867858,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"},"user_tz":-120},"outputId":"c04eb68e-65e1-49fb-e505-d4bf69f4f13a","id":"uKIvCLerg9HM"},"outputs":[{"output_type":"execute_result","data":{"text/plain":[" First Part \\\n","0 came fiom the last place to this\\nplace, and t... \n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n","2 Thera were in 1771 only aeventy-nine\\n*ub*erlb... \n","3 whenever any prize property shall!*' condemn- ... \n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... \n","... ... \n","428512 Sam Clendenin bad a fancy for Ui«\\nscience of ... \n","428513 Wita.htt halting the party ware dilven to the ... \n","428514 It was the last thing that either of\\nthem exp... 
\n","428515 settlement with the department.\\nIt is also sh... \n","428516 Flour quotations—low extras at 1 R0®2 50;\\ncit... \n","\n"," Second Part Expected word \\\n","0 said\\nit's all squash. The best I could get\\ni... lie \n","1 \\ninto a proper perspective with those\\nminor ... himself \n","2 NaN of \n","3 the ceitihcate of'\\noperate to prevent tfie ma... ably \n","4 \\nTerms of sale: One-tblrd, togethor with the ... j \n","... ... ... \n","428512 \\nSam was arrested.\\nThe case excited a great ... NaN \n","428513 through the alnp the »Uitors laapeeeed tia.»\\n... NaN \n","428514 Agua Negra across the line.\\nIt was a grim pla... NaN \n","428515 \\na note of Wood, Dialogue fc Co., for\\nc27,im... NaN \n","428516 3214c;do White at 3614c: Mixed Western at\\n331... NaN \n","\n"," Concatenated \n","0 came fiom the last place to this\\nplace, and t... \n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n","2 NaN \n","3 whenever any prize property shall!*' condemn- ... \n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... \n","... ... \n","428512 NaN \n","428513 NaN \n","428514 NaN \n","428515 NaN \n","428516 NaN \n","\n","[428517 rows x 4 columns]"],"text/html":["\n"," <div id=\"df-2a7dd892-e05f-4f44-983b-3c8acf0202ed\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>First Part</th>\n"," <th>Second Part</th>\n"," <th>Expected word</th>\n"," <th>Concatenated</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>came fiom the last place to this\\nplace, and t...</td>\n"," <td>said\\nit's all squash. 
The best I could get\\ni...</td>\n"," <td>lie</td>\n"," <td>came fiom the last place to this\\nplace, and t...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...</td>\n"," <td>\\ninto a proper perspective with those\\nminor ...</td>\n"," <td>himself</td>\n"," <td>MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>Thera were in 1771 only aeventy-nine\\n*ub*erlb...</td>\n"," <td>NaN</td>\n"," <td>of</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>whenever any prize property shall!*' condemn- ...</td>\n"," <td>the ceitihcate of'\\noperate to prevent tfie ma...</td>\n"," <td>ably</td>\n"," <td>whenever any prize property shall!*' condemn- ...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>SA LKOFVALUABLE UNIMPBOV&amp;D RE\\\\L\\nJSIATF. ON T...</td>\n"," <td>\\nTerms of sale: One-tblrd, togethor with the ...</td>\n"," <td>j</td>\n"," <td>SA LKOFVALUABLE UNIMPBOV&amp;D RE\\\\L\\nJSIATF. 
ON T...</td>\n"," </tr>\n"," <tr>\n"," <th>...</th>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," </tr>\n"," <tr>\n"," <th>428512</th>\n"," <td>Sam Clendenin bad a fancy for Ui«\\nscience of ...</td>\n"," <td>\\nSam was arrested.\\nThe case excited a great ...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>428513</th>\n"," <td>Wita.htt halting the party ware dilven to the ...</td>\n"," <td>through the alnp the »Uitors laapeeeed tia.»\\n...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>428514</th>\n"," <td>It was the last thing that either of\\nthem exp...</td>\n"," <td>Agua Negra across the line.\\nIt was a grim pla...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>428515</th>\n"," <td>settlement with the department.\\nIt is also sh...</td>\n"," <td>\\na note of Wood, Dialogue fc Co., for\\nc27,im...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>428516</th>\n"," <td>Flour quotations—low extras at 1 R0®2 50;\\ncit...</td>\n"," <td>3214c;do White at 3614c: Mixed Western at\\n331...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>428517 rows × 4 columns</p>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-2a7dd892-e05f-4f44-983b-3c8acf0202ed')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 
0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-2a7dd892-e05f-4f44-983b-3c8acf0202ed button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-2a7dd892-e05f-4f44-983b-3c8acf0202ed');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? 
Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":16}],"source":["import regex as re\n","\n","# Line breaks in this corpus are stored as the two characters backslash + n (they were\n","# still present after replacing real newlines), so match a literal backslash and\n","# substitute a space to keep the neighbouring words apart. replace() returns a new\n","# frame, hence the reassignment.\n","train_data = train_data.replace(r'\\\\n', ' ', regex=True)\n","train_data.head()"]},{"cell_type":"code","source":["import nltk\n","nltk.download('punkt')\n","\n","from collections import Counter, defaultdict"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"vJ_D4HTZg96-","executionInfo":{"status":"ok","timestamp":1682427897674,"user_tz":-120,"elapsed":2745,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"f18ab5a5-4573-47ad-90c6-bf3b63349f45"},"execution_count":17,"outputs":[{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package punkt to /root/nltk_data...\n","[nltk_data] Unzipping tokenizers/punkt.zip.\n"]}]},{"cell_type":"code","source":["type(train_data['Concatenated'])"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"fplFd8P9hGCQ","executionInfo":{"status":"ok","timestamp":1682430333536,"user_tz":-120,"elapsed":248,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"2505c42f-2f92-4cee-b29c-9737bb5e100e"},"execution_count":24,"outputs":[{"output_type":"execute_result","data":{"text/plain":["pandas.core.series.Series"]},"metadata":{},"execution_count":24}]},{"cell_type":"code","source":["! 
pip install kenlm\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"1dMnuzA7iwzL","executionInfo":{"status":"ok","timestamp":1682428421824,"user_tz":-120,"elapsed":63749,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"87353c69-3d60-456b-a6ce-35fd6a8a700e"},"execution_count":20,"outputs":[{"output_type":"stream","name":"stdout","text":["Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n","Collecting kenlm\n"," Downloading kenlm-0.1.tar.gz (424 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m425.0/425.0 kB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n","Building wheels for collected packages: kenlm\n"," Building wheel for kenlm (setup.py) ... \u001b[?25l\u001b[?25hdone\n"," Created wheel for kenlm: filename=kenlm-0.1-cp39-cp39-linux_x86_64.whl size=3001228 sha256=9936418a67cd8b93ca741cb6eebe33c29f80ca0eeb1befad98119b4ce5a95056\n"," Stored in directory: /root/.cache/pip/wheels/34/4e/25/ef89c6aa677d672b9b6031e6f6b03d4a2340e358d479e86777\n","Successfully built kenlm\n","Installing collected packages: kenlm\n","Successfully installed kenlm-0.1\n"]}]},{"cell_type":"code","source":["train_data"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":658},"id":"tXD44CgmtTkn","executionInfo":{"status":"ok","timestamp":1682431170719,"user_tz":-120,"elapsed":22,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"0211f214-8d4b-4954-fec8-2de0ce5dcaf6"},"execution_count":34,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" First Part \\\n","0 came fiom the last place to this\\nplace, and t... \n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n","2 Thera were in 1771 only aeventy-nine\\n*ub*erlb... \n","3 whenever any prize property shall!*' condemn- ... 
\n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... \n","... ... \n","428512 Sam Clendenin bad a fancy for Ui«\\nscience of ... \n","428513 Wita.htt halting the party ware dilven to the ... \n","428514 It was the last thing that either of\\nthem exp... \n","428515 settlement with the department.\\nIt is also sh... \n","428516 Flour quotations—low extras at 1 R0®2 50;\\ncit... \n","\n"," Second Part Expected word \\\n","0 said\\nit's all squash. The best I could get\\ni... lie \n","1 \\ninto a proper perspective with those\\nminor ... himself \n","2 NaN of \n","3 the ceitihcate of'\\noperate to prevent tfie ma... ably \n","4 \\nTerms of sale: One-tblrd, togethor with the ... j \n","... ... ... \n","428512 \\nSam was arrested.\\nThe case excited a great ... NaN \n","428513 through the alnp the »Uitors laapeeeed tia.»\\n... NaN \n","428514 Agua Negra across the line.\\nIt was a grim pla... NaN \n","428515 \\na note of Wood, Dialogue fc Co., for\\nc27,im... NaN \n","428516 3214c;do White at 3614c: Mixed Western at\\n331... NaN \n","\n"," Concatenated \n","0 came fiom the last place to this\\nplace, and t... \n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n","2 NaN \n","3 whenever any prize property shall!*' condemn- ... \n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... \n","... ... 
\n","428512 NaN \n","428513 NaN \n","428514 NaN \n","428515 NaN \n","428516 NaN \n","\n","[428517 rows x 4 columns]"],"text/html":["\n"," <div id=\"df-1b558548-7a68-4726-950f-9aa617d7a20d\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>First Part</th>\n"," <th>Second Part</th>\n"," <th>Expected word</th>\n"," <th>Concatenated</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>came fiom the last place to this\\nplace, and t...</td>\n"," <td>said\\nit's all squash. The best I could get\\ni...</td>\n"," <td>lie</td>\n"," <td>came fiom the last place to this\\nplace, and t...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...</td>\n"," <td>\\ninto a proper perspective with those\\nminor ...</td>\n"," <td>himself</td>\n"," <td>MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>Thera were in 1771 only aeventy-nine\\n*ub*erlb...</td>\n"," <td>NaN</td>\n"," <td>of</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>whenever any prize property shall!*' condemn- ...</td>\n"," <td>the ceitihcate of'\\noperate to prevent tfie ma...</td>\n"," <td>ably</td>\n"," <td>whenever any prize property shall!*' condemn- ...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>SA LKOFVALUABLE UNIMPBOV&amp;D RE\\\\L\\nJSIATF. ON T...</td>\n"," <td>\\nTerms of sale: One-tblrd, togethor with the ...</td>\n"," <td>j</td>\n"," <td>SA LKOFVALUABLE UNIMPBOV&amp;D RE\\\\L\\nJSIATF. 
ON T...</td>\n"," </tr>\n"," <tr>\n"," <th>...</th>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," </tr>\n"," <tr>\n"," <th>428512</th>\n"," <td>Sam Clendenin bad a fancy for Ui«\\nscience of ...</td>\n"," <td>\\nSam was arrested.\\nThe case excited a great ...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>428513</th>\n"," <td>Wita.htt halting the party ware dilven to the ...</td>\n"," <td>through the alnp the »Uitors laapeeeed tia.»\\n...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>428514</th>\n"," <td>It was the last thing that either of\\nthem exp...</td>\n"," <td>Agua Negra across the line.\\nIt was a grim pla...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>428515</th>\n"," <td>settlement with the department.\\nIt is also sh...</td>\n"," <td>\\na note of Wood, Dialogue fc Co., for\\nc27,im...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>428516</th>\n"," <td>Flour quotations—low extras at 1 R0®2 50;\\ncit...</td>\n"," <td>3214c;do White at 3614c: Mixed Western at\\n331...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>428517 rows × 4 columns</p>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-1b558548-7a68-4726-950f-9aa617d7a20d')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 
0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-1b558548-7a68-4726-950f-9aa617d7a20d button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-1b558548-7a68-4726-950f-9aa617d7a20d');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? 
Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":34}]},{"cell_type":"code","source":["! pip install https://github.com/kpu/kenlm/archive/master.zip"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"uUMasrM9tJE3","executionInfo":{"status":"ok","timestamp":1682431169819,"user_tz":-120,"elapsed":109191,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"b24ca18e-dea1-4d81-dd4f-5babfb57446e"},"execution_count":33,"outputs":[{"output_type":"stream","name":"stdout","text":["Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n","Collecting https://github.com/kpu/kenlm/archive/master.zip\n"," Downloading https://github.com/kpu/kenlm/archive/master.zip\n","\u001b[2K \u001b[32m-\u001b[0m \u001b[32m553.5 kB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m \u001b[33m0:00:00\u001b[0m\n","\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n"," Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n"," Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n","Building wheels for collected packages: kenlm\n"," Building wheel for kenlm (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n"," Created wheel for kenlm: filename=kenlm-0.0.0-cp39-cp39-linux_x86_64.whl size=3262507 sha256=d6011bc0a0c1321eb855313d02b28b6ebeb81f79bd161245da540f02b75259b9\n"," Stored in directory: /tmp/pip-ephem-wheel-cache-m1yq1oil/wheels/b5/52/c9/af2949d9776846ea81a9cba52a4fe5a81b9ace3b9f2530c3f3\n","Successfully built kenlm\n","Installing collected packages: kenlm\n"," Attempting uninstall: kenlm\n"," Found existing installation: kenlm 0.1\n"," Uninstalling kenlm-0.1:\n"," Successfully uninstalled kenlm-0.1\n","Successfully installed kenlm-0.0.0\n"]}]},{"cell_type":"code","source":["! git clone https://github.com/kpu/kenlm"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"GsGUGphov22p","executionInfo":{"status":"ok","timestamp":1682431863612,"user_tz":-120,"elapsed":6479,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"65f619d2-9d8e-452b-c0db-80c2628ad573"},"execution_count":40,"outputs":[{"output_type":"stream","name":"stdout","text":["Cloning into 'kenlm'...\n","remote: Enumerating objects: 14147, done.\u001b[K\n","remote: Counting objects: 100% (460/460), done.\u001b[K\n","remote: Compressing objects: 100% (319/319), done.\u001b[K\n","remote: Total 14147 (delta 152), reused 399 (delta 127), pack-reused 13687\u001b[K\n","Receiving objects: 100% (14147/14147), 5.91 MiB | 8.32 MiB/s, done.\n","Resolving deltas: 100% (8032/8032), done.\n","Updating files: 100% (304/304), done.\n"]}]},{"cell_type":"code","source":["cd kenlm"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"jjZN0R9hwOYp","executionInfo":{"status":"ok","timestamp":1682431877617,"user_tz":-120,"elapsed":339,"user":{"displayName":"Martyna 
Drumińska","userId":"13361003509289187965"}},"outputId":"7ec0c059-eeca-4d79-e9b6-095978eee1fd"},"execution_count":41,"outputs":[{"output_type":"stream","name":"stdout","text":["/content/drive/MyDrive/challenging-america-word-gap-prediction/kenlm\n"]}]},{"cell_type":"code","source":["mkdir build"],"metadata":{"id":"kOJlIh75wR-g","executionInfo":{"status":"ok","timestamp":1682431893553,"user_tz":-120,"elapsed":1458,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":42,"outputs":[]},{"cell_type":"code","source":["cd build"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"6KEiQQkpwVte","executionInfo":{"status":"ok","timestamp":1682431896787,"user_tz":-120,"elapsed":12,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"e14acf39-af30-4919-f7db-1480fdd7d04a"},"execution_count":43,"outputs":[{"output_type":"stream","name":"stdout","text":["/content/drive/MyDrive/challenging-america-word-gap-prediction/kenlm/build\n"]}]},{"cell_type":"code","source":["! 
cmake .."],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"n2tlhAwCwfYc","executionInfo":{"status":"ok","timestamp":1682431952673,"user_tz":-120,"elapsed":9592,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"24e12980-365b-4f10-b524-264f302a40d5"},"execution_count":45,"outputs":[{"output_type":"stream","name":"stdout","text":["-- The C compiler identification is GNU 9.4.0\n","-- The CXX compiler identification is GNU 9.4.0\n","-- Detecting C compiler ABI info\n","-- Detecting C compiler ABI info - done\n","-- Check for working C compiler: /usr/bin/cc - skipped\n","-- Detecting C compile features\n","-- Detecting C compile features - done\n","-- Detecting CXX compiler ABI info\n","-- Detecting CXX compiler ABI info - done\n","-- Check for working CXX compiler: /usr/bin/c++ - skipped\n","-- Detecting CXX compile features\n","-- Detecting CXX compile features - done\n","-- Could NOT find Eigen3 (missing: Eigen3_DIR)\n","-- Found Boost: /usr/lib/x86_64-linux-gnu/cmake/Boost-1.71.0/BoostConfig.cmake (found suitable version \"1.71.0\", minimum required is \"1.41.0\") found components: program_options system thread unit_test_framework \n","-- Check if compiler accepts -pthread\n","-- Check if compiler accepts -pthread - yes\n","-- Found Threads: TRUE \n","-- Found ZLIB: /usr/lib/x86_64-linux-gnu/libz.so (found version \"1.2.11\") \n","-- Found BZip2: /usr/lib/x86_64-linux-gnu/libbz2.so (found version \"1.0.8\") \n","-- Looking for BZ2_bzCompressInit\n","-- Looking for BZ2_bzCompressInit - found\n","-- Looking for lzma_auto_decoder in /usr/lib/x86_64-linux-gnu/liblzma.so\n","-- Looking for lzma_auto_decoder in /usr/lib/x86_64-linux-gnu/liblzma.so - found\n","-- Looking for lzma_easy_encoder in /usr/lib/x86_64-linux-gnu/liblzma.so\n","-- Looking for lzma_easy_encoder in /usr/lib/x86_64-linux-gnu/liblzma.so - found\n","-- Looking for lzma_lzma_preset in /usr/lib/x86_64-linux-gnu/liblzma.so\n","-- Looking for 
lzma_lzma_preset in /usr/lib/x86_64-linux-gnu/liblzma.so - found\n","-- Found LibLZMA: /usr/lib/x86_64-linux-gnu/liblzma.so (found version \"5.2.4\") \n","-- Looking for clock_gettime in rt\n","-- Looking for clock_gettime in rt - found\n","-- Configuring done\n","-- Generating done\n","-- Build files have been written to: /content/drive/MyDrive/challenging-america-word-gap-prediction/kenlm/build\n"]}]},{"cell_type":"code","source":["! make -j 4"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"QDkJXsHjwlOi","executionInfo":{"status":"ok","timestamp":1682432128325,"user_tz":-120,"elapsed":160774,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"4914afa2-ad40-4a53-e5e6-92ff816d500e"},"execution_count":46,"outputs":[{"output_type":"stream","name":"stdout","text":["[ -1%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/bignum.cc.o\u001b[0m\n","[ 0%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/fast-dtoa.cc.o\u001b[0m\n","[ 2%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/cached-powers.cc.o\u001b[0m\n","[ 2%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/bignum-dtoa.cc.o\u001b[0m\n","[ 3%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/fixed-dtoa.cc.o\u001b[0m\n","[ 5%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/strtod.cc.o\u001b[0m\n","[ 6%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/double-to-string.cc.o\u001b[0m\n","[ 7%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/string-to-double.cc.o\u001b[0m\n","[ 8%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/chain.cc.o\u001b[0m\n","[ 10%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/count_records.cc.o\u001b[0m\n","[ 11%] \u001b[32mBuilding CXX object 
util/CMakeFiles/kenlm_util.dir/stream/io.cc.o\u001b[0m\n","[ 12%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/line_input.cc.o\u001b[0m\n","[ 13%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/multi_progress.cc.o\u001b[0m\n","[ 15%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/rewindable_stream.cc.o\u001b[0m\n","[ 16%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/bit_packing.cc.o\u001b[0m\n","[ 17%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/ersatz_progress.cc.o\u001b[0m\n","[ 18%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/exception.cc.o\u001b[0m\n","[ 20%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/file.cc.o\u001b[0m\n","[ 21%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/file_piece.cc.o\u001b[0m\n","[ 22%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/float_to_string.cc.o\u001b[0m\n","[ 23%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/integer_to_string.cc.o\u001b[0m\n","[ 25%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/mmap.cc.o\u001b[0m\n","[ 26%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/murmur_hash.cc.o\u001b[0m\n","[ 27%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/parallel_read.cc.o\u001b[0m\n","[ 28%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/pool.cc.o\u001b[0m\n","[ 30%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/read_compressed.cc.o\u001b[0m\n","[ 31%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/scoped.cc.o\u001b[0m\n","[ 32%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/spaces.cc.o\u001b[0m\n","[ 33%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/string_piece.cc.o\u001b[0m\n","[ 35%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/usage.cc.o\u001b[0m\n","[ 36%] \u001b[32m\u001b[1mLinking CXX static library 
../lib/libkenlm_util.a\u001b[0m\n","[ 36%] Built target kenlm_util\n","[ 37%] \u001b[32mBuilding CXX object util/CMakeFiles/probing_hash_table_benchmark.dir/probing_hash_table_benchmark_main.cc.o\u001b[0m\n","[ 38%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/arpa_io.cc.o\u001b[0m\n","[ 40%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/phrase.cc.o\u001b[0m\n","[ 41%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/bhiksha.cc.o\u001b[0m\n","[ 42%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/binary_format.cc.o\u001b[0m\n","[ 43%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/vocab.cc.o\u001b[0m\n","[ 45%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/config.cc.o\u001b[0m\n","[ 46%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/lm_exception.cc.o\u001b[0m\n","[ 47%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/model.cc.o\u001b[0m\n","[ 48%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/quantize.cc.o\u001b[0m\n","[ 50%] \u001b[32m\u001b[1mLinking CXX static library ../../lib/libkenlm_filter.a\u001b[0m\n","[ 50%] Built target kenlm_filter\n","[ 51%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/read_arpa.cc.o\u001b[0m\n","[ 52%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/search_hashed.cc.o\u001b[0m\n","[ 53%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/search_trie.cc.o\u001b[0m\n","[ 55%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/sizes.cc.o\u001b[0m\n","[ 56%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/trie.cc.o\u001b[0m\n","[ 57%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/trie_sort.cc.o\u001b[0m\n","[ 58%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/value_build.cc.o\u001b[0m\n","[ 60%] \u001b[32m\u001b[1mLinking CXX executable ../bin/probing_hash_table_benchmark\u001b[0m\n","[ 60%] Built target probing_hash_table_benchmark\n","[ 61%] \u001b[32mBuilding CXX object 
lm/CMakeFiles/kenlm.dir/virtual_interface.cc.o\u001b[0m\n","[ 62%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/vocab.cc.o\u001b[0m\n","[ 63%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/model_buffer.cc.o\u001b[0m\n","[ 65%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/print.cc.o\u001b[0m\n","[ 66%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/renumber.cc.o\u001b[0m\n","[ 67%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/size_option.cc.o\u001b[0m\n","[ 68%] \u001b[32m\u001b[1mLinking CXX static library ../lib/libkenlm.a\u001b[0m\n","[ 68%] Built target kenlm\n","[ 70%] \u001b[32mBuilding CXX object lm/CMakeFiles/query.dir/query_main.cc.o\u001b[0m\n","[ 72%] \u001b[32mBuilding CXX object lm/CMakeFiles/build_binary.dir/build_binary_main.cc.o\u001b[0m\n","[ 73%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm_benchmark.dir/kenlm_benchmark_main.cc.o\u001b[0m\n","[ 73%] \u001b[32mBuilding CXX object lm/CMakeFiles/fragment.dir/fragment_main.cc.o\u001b[0m\n","[ 75%] \u001b[32m\u001b[1mLinking CXX executable ../bin/fragment\u001b[0m\n","[ 75%] Built target fragment\n","[ 76%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/adjust_counts.cc.o\u001b[0m\n","[ 77%] \u001b[32m\u001b[1mLinking CXX executable ../bin/build_binary\u001b[0m\n","[ 77%] Built target build_binary\n","[ 78%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/corpus_count.cc.o\u001b[0m\n","[ 80%] \u001b[32m\u001b[1mLinking CXX executable ../bin/query\u001b[0m\n","[ 80%] Built target query\n","[ 81%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/initial_probabilities.cc.o\u001b[0m\n","[ 82%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/interpolate.cc.o\u001b[0m\n","[ 83%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/output.cc.o\u001b[0m\n","[ 85%] \u001b[32mBuilding CXX object 
lm/builder/CMakeFiles/kenlm_builder.dir/pipeline.cc.o\u001b[0m\n","[ 86%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/filter.dir/filter_main.cc.o\u001b[0m\n","[ 87%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/phrase_table_vocab.dir/phrase_table_vocab_main.cc.o\u001b[0m\n","[ 88%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/phrase_table_vocab\u001b[0m\n","[ 88%] Built target phrase_table_vocab\n","[ 90%] \u001b[32m\u001b[1mLinking CXX executable ../bin/kenlm_benchmark\u001b[0m\n","[ 90%] Built target kenlm_benchmark\n","[ 91%] \u001b[32m\u001b[1mLinking CXX static library ../../lib/libkenlm_builder.a\u001b[0m\n","[ 91%] Built target kenlm_builder\n","[ 92%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/lmplz.dir/lmplz_main.cc.o\u001b[0m\n","[ 93%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/count_ngrams.dir/count_ngrams_main.cc.o\u001b[0m\n","[ 95%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/filter\u001b[0m\n","[ 95%] Built target filter\n","[ 96%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/lmplz\u001b[0m\n","[ 96%] Built target lmplz\n","[ 97%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/count_ngrams\u001b[0m\n","[ 97%] Built target count_ngrams\n"]}]},{"cell_type":"code","source":[],"metadata":{"id":"25gig92tw_FF"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["! 
make install"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"XE-96uPDwvZY","executionInfo":{"status":"ok","timestamp":1682432146846,"user_tz":-120,"elapsed":11629,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"0607fae3-cc7f-4692-a212-6578d654e849"},"execution_count":47,"outputs":[{"output_type":"stream","name":"stdout","text":["[ 36%] Built target kenlm_util\n","[ 38%] Built target probing_hash_table_benchmark\n","[ 63%] Built target kenlm\n","[ 66%] Built target query\n","[ 68%] Built target fragment\n","[ 71%] Built target build_binary\n","[ 73%] Built target kenlm_benchmark\n","[ 82%] Built target kenlm_builder\n","[ 85%] Built target lmplz\n","[ 87%] Built target count_ngrams\n","[ 92%] Built target kenlm_filter\n","[ 95%] Built target filter\n","[ 97%] Built target phrase_table_vocab\n","\u001b[36mInstall the project...\u001b[0m\n","-- Install configuration: \"Release\"\n","-- Installing: /usr/local/share/kenlm/cmake/kenlmTargets.cmake\n","-- Installing: /usr/local/share/kenlm/cmake/kenlmTargets-release.cmake\n","-- Installing: /usr/local/include/kenlm/util/bit_packing.hh\n","-- Installing: /usr/local/include/kenlm/util/ersatz_progress.hh\n","-- Installing: /usr/local/include/kenlm/util/exception.hh\n","-- Installing: /usr/local/include/kenlm/util/fake_ostream.hh\n","-- Installing: /usr/local/include/kenlm/util/file.hh\n","-- Installing: /usr/local/include/kenlm/util/file_piece.hh\n","-- Installing: /usr/local/include/kenlm/util/file_stream.hh\n","-- Installing: /usr/local/include/kenlm/util/fixed_array.hh\n","-- Installing: /usr/local/include/kenlm/util/float_to_string.hh\n","-- Installing: /usr/local/include/kenlm/util/getopt.hh\n","-- Installing: /usr/local/include/kenlm/util/have.hh\n","-- Installing: /usr/local/include/kenlm/util/integer_to_string.hh\n","-- Installing: /usr/local/include/kenlm/util/joint_sort.hh\n","-- Installing: /usr/local/include/kenlm/util/mmap.hh\n","-- Installing: 
/usr/local/include/kenlm/util/multi_intersection.hh\n","-- Installing: /usr/local/include/kenlm/util/murmur_hash.hh\n","-- Installing: /usr/local/include/kenlm/util/parallel_read.hh\n","-- Installing: /usr/local/include/kenlm/util/pcqueue.hh\n","-- Installing: /usr/local/include/kenlm/util/pool.hh\n","-- Installing: /usr/local/include/kenlm/util/probing_hash_table.hh\n","-- Installing: /usr/local/include/kenlm/util/proxy_iterator.hh\n","-- Installing: /usr/local/include/kenlm/util/read_compressed.hh\n","-- Installing: /usr/local/include/kenlm/util/scoped.hh\n","-- Installing: /usr/local/include/kenlm/util/sized_iterator.hh\n","-- Installing: /usr/local/include/kenlm/util/sorted_uniform.hh\n","-- Installing: /usr/local/include/kenlm/util/spaces.hh\n","-- Installing: /usr/local/include/kenlm/util/string_piece.hh\n","-- Installing: /usr/local/include/kenlm/util/string_piece_hash.hh\n","-- Installing: /usr/local/include/kenlm/util/string_stream.hh\n","-- Installing: /usr/local/include/kenlm/util/thread_pool.hh\n","-- Installing: /usr/local/include/kenlm/util/tokenize_piece.hh\n","-- Installing: /usr/local/include/kenlm/util/usage.hh\n","-- Installing: /usr/local/include/kenlm/util/double-conversion/bignum-dtoa.h\n","-- Installing: /usr/local/include/kenlm/util/double-conversion/bignum.h\n","-- Installing: /usr/local/include/kenlm/util/double-conversion/cached-powers.h\n","-- Installing: /usr/local/include/kenlm/util/double-conversion/diy-fp.h\n","-- Installing: /usr/local/include/kenlm/util/double-conversion/double-conversion.h\n","-- Installing: /usr/local/include/kenlm/util/double-conversion/double-to-string.h\n","-- Installing: /usr/local/include/kenlm/util/double-conversion/fast-dtoa.h\n","-- Installing: /usr/local/include/kenlm/util/double-conversion/fixed-dtoa.h\n","-- Installing: /usr/local/include/kenlm/util/double-conversion/ieee.h\n","-- Installing: /usr/local/include/kenlm/util/double-conversion/string-to-double.h\n","-- Installing: 
/usr/local/include/kenlm/util/double-conversion/strtod.h\n","-- Installing: /usr/local/include/kenlm/util/double-conversion/utils.h\n","-- Installing: /usr/local/include/kenlm/util/stream/block.hh\n","-- Installing: /usr/local/include/kenlm/util/stream/chain.hh\n","-- Installing: /usr/local/include/kenlm/util/stream/config.hh\n","-- Installing: /usr/local/include/kenlm/util/stream/count_records.hh\n","-- Installing: /usr/local/include/kenlm/util/stream/io.hh\n","-- Installing: /usr/local/include/kenlm/util/stream/line_input.hh\n","-- Installing: /usr/local/include/kenlm/util/stream/multi_progress.hh\n","-- Installing: /usr/local/include/kenlm/util/stream/multi_stream.hh\n","-- Installing: /usr/local/include/kenlm/util/stream/rewindable_stream.hh\n","-- Installing: /usr/local/include/kenlm/util/stream/sort.hh\n","-- Installing: /usr/local/include/kenlm/util/stream/stream.hh\n","-- Installing: /usr/local/include/kenlm/util/stream/typed_stream.hh\n","-- Installing: /usr/local/include/kenlm/lm/bhiksha.hh\n","-- Installing: /usr/local/include/kenlm/lm/binary_format.hh\n","-- Installing: /usr/local/include/kenlm/lm/blank.hh\n","-- Installing: /usr/local/include/kenlm/lm/config.hh\n","-- Installing: /usr/local/include/kenlm/lm/enumerate_vocab.hh\n","-- Installing: /usr/local/include/kenlm/lm/facade.hh\n","-- Installing: /usr/local/include/kenlm/lm/left.hh\n","-- Installing: /usr/local/include/kenlm/lm/lm_exception.hh\n","-- Installing: /usr/local/include/kenlm/lm/max_order.hh\n","-- Installing: /usr/local/include/kenlm/lm/model.hh\n","-- Installing: /usr/local/include/kenlm/lm/model_type.hh\n","-- Installing: /usr/local/include/kenlm/lm/ngram_query.hh\n","-- Installing: /usr/local/include/kenlm/lm/partial.hh\n","-- Installing: /usr/local/include/kenlm/lm/quantize.hh\n","-- Installing: /usr/local/include/kenlm/lm/read_arpa.hh\n","-- Installing: /usr/local/include/kenlm/lm/return.hh\n","-- Installing: /usr/local/include/kenlm/lm/search_hashed.hh\n","-- Installing: 
/usr/local/include/kenlm/lm/search_trie.hh\n","-- Installing: /usr/local/include/kenlm/lm/sizes.hh\n","-- Installing: /usr/local/include/kenlm/lm/state.hh\n","-- Installing: /usr/local/include/kenlm/lm/trie.hh\n","-- Installing: /usr/local/include/kenlm/lm/trie_sort.hh\n","-- Installing: /usr/local/include/kenlm/lm/value.hh\n","-- Installing: /usr/local/include/kenlm/lm/value_build.hh\n","-- Installing: /usr/local/include/kenlm/lm/virtual_interface.hh\n","-- Installing: /usr/local/include/kenlm/lm/vocab.hh\n","-- Installing: /usr/local/include/kenlm/lm/weights.hh\n","-- Installing: /usr/local/include/kenlm/lm/word_index.hh\n","-- Installing: /usr/local/include/kenlm/lm/builder/adjust_counts.hh\n","-- Installing: /usr/local/include/kenlm/lm/builder/combine_counts.hh\n","-- Installing: /usr/local/include/kenlm/lm/builder/corpus_count.hh\n","-- Installing: /usr/local/include/kenlm/lm/builder/debug_print.hh\n","-- Installing: /usr/local/include/kenlm/lm/builder/discount.hh\n","-- Installing: /usr/local/include/kenlm/lm/builder/hash_gamma.hh\n","-- Installing: /usr/local/include/kenlm/lm/builder/header_info.hh\n","-- Installing: /usr/local/include/kenlm/lm/builder/initial_probabilities.hh\n","-- Installing: /usr/local/include/kenlm/lm/builder/interpolate.hh\n","-- Installing: /usr/local/include/kenlm/lm/builder/output.hh\n","-- Installing: /usr/local/include/kenlm/lm/builder/payload.hh\n","-- Installing: /usr/local/include/kenlm/lm/builder/pipeline.hh\n","-- Installing: /usr/local/include/kenlm/lm/common/compare.hh\n","-- Installing: /usr/local/include/kenlm/lm/common/joint_order.hh\n","-- Installing: /usr/local/include/kenlm/lm/common/model_buffer.hh\n","-- Installing: /usr/local/include/kenlm/lm/common/ngram.hh\n","-- Installing: /usr/local/include/kenlm/lm/common/ngram_stream.hh\n","-- Installing: /usr/local/include/kenlm/lm/common/print.hh\n","-- Installing: /usr/local/include/kenlm/lm/common/renumber.hh\n","-- Installing: 
/usr/local/include/kenlm/lm/common/size_option.hh\n","-- Installing: /usr/local/include/kenlm/lm/common/special.hh\n","-- Installing: /usr/local/include/kenlm/lm/filter/arpa_io.hh\n","-- Installing: /usr/local/include/kenlm/lm/filter/count_io.hh\n","-- Installing: /usr/local/include/kenlm/lm/filter/format.hh\n","-- Installing: /usr/local/include/kenlm/lm/filter/phrase.hh\n","-- Installing: /usr/local/include/kenlm/lm/filter/thread.hh\n","-- Installing: /usr/local/include/kenlm/lm/filter/vocab.hh\n","-- Installing: /usr/local/include/kenlm/lm/filter/wrapper.hh\n","-- Installing: /usr/local/include/kenlm/lm/interpolate/backoff_matrix.hh\n","-- Installing: /usr/local/include/kenlm/lm/interpolate/backoff_reunification.hh\n","-- Installing: /usr/local/include/kenlm/lm/interpolate/bounded_sequence_encoding.hh\n","-- Installing: /usr/local/include/kenlm/lm/interpolate/interpolate_info.hh\n","-- Installing: /usr/local/include/kenlm/lm/interpolate/merge_probabilities.hh\n","-- Installing: /usr/local/include/kenlm/lm/interpolate/merge_vocab.hh\n","-- Installing: /usr/local/include/kenlm/lm/interpolate/normalize.hh\n","-- Installing: /usr/local/include/kenlm/lm/interpolate/pipeline.hh\n","-- Installing: /usr/local/include/kenlm/lm/interpolate/split_worker.hh\n","-- Installing: /usr/local/include/kenlm/lm/interpolate/tune_derivatives.hh\n","-- Installing: /usr/local/include/kenlm/lm/interpolate/tune_instances.hh\n","-- Installing: /usr/local/include/kenlm/lm/interpolate/tune_matrix.hh\n","-- Installing: /usr/local/include/kenlm/lm/interpolate/tune_weights.hh\n","-- Installing: /usr/local/include/kenlm/lm/interpolate/universal_vocab.hh\n","-- Installing: /usr/local/share/kenlm/cmake/kenlmConfig.cmake\n","-- Installing: /usr/local/lib/libkenlm_util.a\n","-- Installing: /usr/local/bin/probing_hash_table_benchmark\n","-- Installing: /usr/local/lib/libkenlm.a\n","-- Installing: /usr/local/bin/query\n","-- Installing: /usr/local/bin/fragment\n","-- Installing: 
/usr/local/bin/build_binary\n","-- Installing: /usr/local/bin/kenlm_benchmark\n","-- Installing: /usr/local/bin/lmplz\n","-- Installing: /usr/local/bin/count_ngrams\n","-- Installing: /usr/local/lib/libkenlm_builder.a\n","-- Installing: /usr/local/bin/filter\n","-- Installing: /usr/local/bin/phrase_table_vocab\n","-- Installing: /usr/local/lib/libkenlm_filter.a\n"]}]},{"cell_type":"code","source":["KENLM_BUILD_PATH = \"/content/drive/MyDrive/challenging-america-word-gap-prediction/kenlm/build\""],"metadata":{"id":"555i1REnxUg-","executionInfo":{"status":"ok","timestamp":1682432698665,"user_tz":-120,"elapsed":299,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":58,"outputs":[]},{"cell_type":"code","source":["import numpy as np\n","train_data['Concatenated'] = train_data['Concatenated'].replace(np.nan, '', regex=True)"],"metadata":{"id":"GmrC8f2UyHBr","executionInfo":{"status":"ok","timestamp":1682432384026,"user_tz":-120,"elapsed":2014,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":50,"outputs":[]},{"cell_type":"code","source":["train_data['Concatenated']"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"izYFUBaPyN9W","executionInfo":{"status":"ok","timestamp":1682432398039,"user_tz":-120,"elapsed":373,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"bfb4527a-12f4-4762-c560-8d7cba5ca7a5"},"execution_count":51,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0 came fiom the last place to this\\nplace, and t...\n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...\n","2 \n","3 whenever any prize property shall!*' condemn- ...\n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T...\n"," ... 
\n","428512 \n","428513 \n","428514 \n","428515 \n","428516 \n","Name: Concatenated, Length: 428517, dtype: object"]},"metadata":{},"execution_count":51}]},{"cell_type":"code","source":["with open(\"new_train\", \"w+\") as f:\n"," for t in train_data['Concatenated']:\n"," f.write(t + \"\\n\")"],"metadata":{"id":"dcikwtsVxZh8","executionInfo":{"status":"ok","timestamp":1682432414204,"user_tz":-120,"elapsed":5202,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":52,"outputs":[]},{"cell_type":"code","source":["!$KENLM_BUILD_PATH/bin/lmplz -o 4 < new_train > model.arpa"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"MdmgpIe8yWI7","executionInfo":{"status":"ok","timestamp":1682433143828,"user_tz":-120,"elapsed":442200,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"6f1b67ec-34ec-4f27-85bb-b1e39b633a38"},"execution_count":59,"outputs":[{"output_type":"stream","name":"stdout","text":["=== 1/5 Counting and sorting n-grams ===\n","Reading /content/drive/MyDrive/challenging-america-word-gap-prediction/kenlm/build/new_train\n","----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n","****************************************************************************************************\n","Unigram tokens 77591849 types 8606129\n","=== 2/5 Calculating and sorting adjusted counts ===\n","Chain sizes: 1:103273548 2:1836559360 3:3443548928 4:5509678080\n","Statistics:\n","1 8606129 D1=0.858113 D2=1.02331 D3+=1.17414\n","2 33054619 D1=0.888834 D2=1.06683 D3+=1.1959\n","3 57923129 D1=0.933907 D2=1.16291 D3+=1.25462\n","4 69946963 D1=0.952529 D2=1.26688 D3+=1.32772\n","Memory estimate for binary LM:\n","type MB\n","probing 3496 assuming -p 1.5\n","probing 4049 assuming -r models -p 1.5\n","trie 1888 without quantization\n","trie 1186 assuming -q 8 -b 8 quantization \n","trie 1679 assuming -a 22 array pointer compression\n","trie 977 
assuming -a 22 -q 8 -b 8 array pointer compression and quantization\n","=== 3/5 Calculating and sorting initial probabilities ===\n","Chain sizes: 1:103273548 2:528873904 3:1158462580 4:1678727112\n","----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n","####################################################################################################\n","=== 4/5 Calculating and writing order-interpolated probabilities ===\n","Chain sizes: 1:103273548 2:528873904 3:1158462580 4:1678727112\n","----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n","####################################################################################################\n","=== 5/5 Writing ARPA model ===\n","----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n","****************************************************************************************************\n","Name:lmplz\tVmPeak:10909296 kB\tVmRSS:3116 kB\tRSSMax:3969704 kB\tuser:275.432\tsys:59.5966\tCPU:335.028\treal:441.039\n"]}]},{"cell_type":"code","source":["import kenlm"],"metadata":{"id":"N0OrlDYWtsiN","executionInfo":{"status":"ok","timestamp":1682431205671,"user_tz":-120,"elapsed":11,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":36,"outputs":[]}]}