{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyOjlR0HzrxQLi9ivvf3rrhL"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":6,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"kgUXhu_9HEZY","executionInfo":{"status":"ok","timestamp":1682427020888,"user_tz":-120,"elapsed":7836,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"93b9b737-532d-4892-d4bf-66579ee7c849"},"outputs":[{"output_type":"stream","name":"stdout","text":["Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"]}],"source":["# Mount Google Drive at /content/drive so the project directory stored there is reachable.\n","from google.colab import drive\n","drive.mount('/content/drive')"]},{"cell_type":"code","source":["# Absolute path keeps this cell idempotent: a relative cd fails with Errno 2 when the cell is\n","# re-run after the working directory has already changed.\n","%cd /content/drive/MyDrive"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"4gxwwa5-Haqo","executionInfo":{"status":"ok","timestamp":1682427020889,"user_tz":-120,"elapsed":13,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"d5a1b591-43f9-4810-fb49-bf247c1a08e2"},"execution_count":7,"outputs":[{"output_type":"stream","name":"stdout","text":["[Errno 2] No such file or directory: 'drive/MyDrive'\n","/content/drive/MyDrive/challenging-america-word-gap-prediction\n"]}]},{"cell_type":"code","source":["# Enter the project directory by absolute path so the relative train/ paths used below resolve\n","# regardless of where the kernel currently is.\n","%cd /content/drive/MyDrive/challenging-america-word-gap-prediction"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"78igYakJHokM","executionInfo":{"status":"ok","timestamp":1682427020891,"user_tz":-120,"elapsed":12,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"c1906f10-600a-4170-b61f-ab3005a2cf2a"},"execution_count":8,"outputs":[{"output_type":"stream","name":"stdout","text":["[Errno 2] No such file or directory: 
'challenging-america-word-gap-prediction/'\n","/content/drive/MyDrive/challenging-america-word-gap-prediction\n"]}]},{"cell_type":"code","source":["import csv\n","\n","import pandas as pd"],"metadata":{"id":"-wyIUdlBHp2W","executionInfo":{"status":"ok","timestamp":1682427020892,"user_tz":-120,"elapsed":9,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":9,"outputs":[]},{"cell_type":"code","source":["# quoting=csv.QUOTE_NONE: read double-quote characters as ordinary text, so a stray quote\n","# cannot open a quoted field that swallows the following lines.\n","# on_bad_lines='skip' is intentionally not used: dropping rows independently from the two\n","# files would shift the expected words against their contexts.\n","data = pd.read_csv(\n","    \"train/in.tsv.xz\",\n","    sep=\"\\t\",\n","    header=None,\n","    encoding=\"utf-8\",\n","    quoting=csv.QUOTE_NONE,\n",")\n","\n","# na_filter=False and skip_blank_lines=False keep words such as 'null' or 'NA' and empty\n","# lines as plain strings, so every input row keeps exactly one expected word.\n","exp_words = pd.read_csv(\n","    \"train/expected.tsv\",\n","    sep=\"\\t\",\n","    header=None,\n","    encoding=\"utf-8\",\n","    quoting=csv.QUOTE_NONE,\n","    na_filter=False,\n","    skip_blank_lines=False,\n",")\n","\n","# The two frames are joined by position further down, so their lengths must match.\n","assert len(data) == len(exp_words), (\n","    f\"row mismatch: {len(data)} input rows vs {len(exp_words)} expected words\"\n",")"],"metadata":{"id":"kA6PExReHr3E","executionInfo":{"status":"ok","timestamp":1682430631336,"user_tz":-120,"elapsed":39975,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":28,"outputs":[]},{"cell_type":"code","execution_count":29,"metadata":{"executionInfo":{"elapsed":17,"status":"ok","timestamp":1682430631338,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"},"user_tz":-120},"id":"upTQ5Po9wOSL"},"outputs":[],"source":["train_data = data[[6, 7]]"]},{"cell_type":"code","execution_count":30,"metadata":{"executionInfo":{"elapsed":18,"status":"ok","timestamp":1682430631341,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"},"user_tz":-120},"id":"OdEm_SBSwXuY"},"outputs":[],"source":["train_data= pd.concat([train_data, exp_words], axis=1)"]},{"cell_type":"code","execution_count":31,"metadata":{"executionInfo":{"elapsed":19,"status":"ok","timestamp":1682430631343,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"},"user_tz":-120},"id":"b1TM741wwYdA"},"outputs":[],"source":["train_data.rename(columns={6: 'First Part', 7: 'Second Part', 0:'Expected word'}, 
inplace=True)"]},{"cell_type":"code","execution_count":32,"metadata":{"executionInfo":{"elapsed":1675,"status":"ok","timestamp":1682430633001,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"},"user_tz":-120},"id":"6Zfbmp-IxKUH"},"outputs":[],"source":["# Missing parts become empty strings so a single NaN no longer turns the whole row into NaN,\n","# and explicit spaces keep the gap word from fusing with its neighbouring words.\n","parts = train_data[['First Part', 'Expected word', 'Second Part']].fillna('').astype(str)\n","train_data['Concatenated'] = parts['First Part'] + ' ' + parts['Expected word'] + ' ' + parts['Second Part']"]},{"cell_type":"code","source":[],"metadata":{"id":"vglIDWIxgyQk"},"execution_count":null,"outputs":[]},{"cell_type":"code","execution_count":15,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":658},"executionInfo":{"elapsed":3103,"status":"ok","timestamp":1682427836629,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"},"user_tz":-120},"id":"lQQHHALRxiHj","outputId":"f1bde340-fa0b-494a-a5a3-b0d8277d9522"},"outputs":[{"output_type":"execute_result","data":{"text/plain":[" First Part \\\n","0 came fiom the last place to this\\nplace, and t... \n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n","2 Thera were in 1771 only aeventy-nine\\n*ub*erlb... \n","3 whenever any prize property shall!*' condemn- ... \n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... \n","... ... \n","428512 Sam Clendenin bad a fancy for Ui«\\nscience of ... \n","428513 Wita.htt halting the party ware dilven to the ... \n","428514 It was the last thing that either of\\nthem exp... \n","428515 settlement with the department.\\nIt is also sh... \n","428516 Flour quotations—low extras at 1 R0®2 50;\\ncit... \n","\n"," Second Part Expected word \\\n","0 said\\nit's all squash. The best I could get\\ni... lie \n","1 \\ninto a proper perspective with those\\nminor ... himself \n","2 NaN of \n","3 the ceitihcate of'\\noperate to prevent tfie ma... ably \n","4 \\nTerms of sale: One-tblrd, togethor with the ... j \n","... ... ... \n","428512 \\nSam was arrested.\\nThe case excited a great ... NaN \n","428513 through the alnp the »Uitors laapeeeed tia.»\\n... 
NaN \n","428514 Agua Negra across the line.\\nIt was a grim pla... NaN \n","428515 \\na note of Wood, Dialogue fc Co., for\\nc27,im... NaN \n","428516 3214c;do White at 3614c: Mixed Western at\\n331... NaN \n","\n"," Concatenated \n","0 came fiom the last place to this\\nplace, and t... \n","1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n","2 NaN \n","3 whenever any prize property shall!*' condemn- ... \n","4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... \n","... ... \n","428512 NaN \n","428513 NaN \n","428514 NaN \n","428515 NaN \n","428516 NaN \n","\n","[428517 rows x 4 columns]"],"text/html":["\n","
\n"," | First Part | \n","Second Part | \n","Expected word | \n","Concatenated | \n","
---|---|---|---|---|
0 | \n","came fiom the last place to this\\nplace, and t... | \n","said\\nit's all squash. The best I could get\\ni... | \n","lie | \n","came fiom the last place to this\\nplace, and t... | \n","
1 | \n","MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... | \n","\\ninto a proper perspective with those\\nminor ... | \n","himself | \n","MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... | \n","
2 | \n","Thera were in 1771 only aeventy-nine\\n*ub*erlb... | \n","NaN | \n","of | \n","NaN | \n","
3 | \n","whenever any prize property shall!*' condemn- ... | \n","the ceitihcate of'\\noperate to prevent tfie ma... | \n","ably | \n","whenever any prize property shall!*' condemn- ... | \n","
4 | \n","SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... | \n","\\nTerms of sale: One-tblrd, togethor with the ... | \n","j | \n","SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... | \n","
... | \n","... | \n","... | \n","... | \n","... | \n","
428512 | \n","Sam Clendenin bad a fancy for Ui«\\nscience of ... | \n","\\nSam was arrested.\\nThe case excited a great ... | \n","NaN | \n","NaN | \n","
428513 | \n","Wita.htt halting the party ware dilven to the ... | \n","through the alnp the »Uitors laapeeeed tia.»\\n... | \n","NaN | \n","NaN | \n","
428514 | \n","It was the last thing that either of\\nthem exp... | \n","Agua Negra across the line.\\nIt was a grim pla... | \n","NaN | \n","NaN | \n","
428515 | \n","settlement with the department.\\nIt is also sh... | \n","\\na note of Wood, Dialogue fc Co., for\\nc27,im... | \n","NaN | \n","NaN | \n","
428516 | \n","Flour quotations—low extras at 1 R0®2 50;\\ncit... | \n","3214c;do White at 3614c: Mixed Western at\\n331... | \n","NaN | \n","NaN | \n","
428517 rows × 4 columns
\n","\n"," | First Part | \n","Second Part | \n","Expected word | \n","Concatenated | \n","
---|---|---|---|---|
0 | \n","came fiom the last place to this\\nplace, and t... | \n","said\\nit's all squash. The best I could get\\ni... | \n","lie | \n","came fiom the last place to this\\nplace, and t... | \n","
1 | \n","MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... | \n","\\ninto a proper perspective with those\\nminor ... | \n","himself | \n","MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... | \n","
2 | \n","Thera were in 1771 only aeventy-nine\\n*ub*erlb... | \n","NaN | \n","of | \n","NaN | \n","
3 | \n","whenever any prize property shall!*' condemn- ... | \n","the ceitihcate of'\\noperate to prevent tfie ma... | \n","ably | \n","whenever any prize property shall!*' condemn- ... | \n","
4 | \n","SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... | \n","\\nTerms of sale: One-tblrd, togethor with the ... | \n","j | \n","SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... | \n","
... | \n","... | \n","... | \n","... | \n","... | \n","
428512 | \n","Sam Clendenin bad a fancy for Ui«\\nscience of ... | \n","\\nSam was arrested.\\nThe case excited a great ... | \n","NaN | \n","NaN | \n","
428513 | \n","Wita.htt halting the party ware dilven to the ... | \n","through the alnp the »Uitors laapeeeed tia.»\\n... | \n","NaN | \n","NaN | \n","
428514 | \n","It was the last thing that either of\\nthem exp... | \n","Agua Negra across the line.\\nIt was a grim pla... | \n","NaN | \n","NaN | \n","
428515 | \n","settlement with the department.\\nIt is also sh... | \n","\\na note of Wood, Dialogue fc Co., for\\nc27,im... | \n","NaN | \n","NaN | \n","
428516 | \n","Flour quotations—low extras at 1 R0®2 50;\\ncit... | \n","3214c;do White at 3614c: Mixed Western at\\n331... | \n","NaN | \n","NaN | \n","
428517 rows × 4 columns
\n","\n"," | First Part | \n","Second Part | \n","Expected word | \n","Concatenated | \n","
---|---|---|---|---|
0 | \n","came fiom the last place to this\\nplace, and t... | \n","said\\nit's all squash. The best I could get\\ni... | \n","lie | \n","came fiom the last place to this\\nplace, and t... | \n","
1 | \n","MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... | \n","\\ninto a proper perspective with those\\nminor ... | \n","himself | \n","MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... | \n","
2 | \n","Thera were in 1771 only aeventy-nine\\n*ub*erlb... | \n","NaN | \n","of | \n","NaN | \n","
3 | \n","whenever any prize property shall!*' condemn- ... | \n","the ceitihcate of'\\noperate to prevent tfie ma... | \n","ably | \n","whenever any prize property shall!*' condemn- ... | \n","
4 | \n","SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... | \n","\\nTerms of sale: One-tblrd, togethor with the ... | \n","j | \n","SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\\nJSIATF. ON T... | \n","
... | \n","... | \n","... | \n","... | \n","... | \n","
428512 | \n","Sam Clendenin bad a fancy for Ui«\\nscience of ... | \n","\\nSam was arrested.\\nThe case excited a great ... | \n","NaN | \n","NaN | \n","
428513 | \n","Wita.htt halting the party ware dilven to the ... | \n","through the alnp the »Uitors laapeeeed tia.»\\n... | \n","NaN | \n","NaN | \n","
428514 | \n","It was the last thing that either of\\nthem exp... | \n","Agua Negra across the line.\\nIt was a grim pla... | \n","NaN | \n","NaN | \n","
428515 | \n","settlement with the department.\\nIt is also sh... | \n","\\na note of Wood, Dialogue fc Co., for\\nc27,im... | \n","NaN | \n","NaN | \n","
428516 | \n","Flour quotations—low extras at 1 R0®2 50;\\ncit... | \n","3214c;do White at 3614c: Mixed Western at\\n331... | \n","NaN | \n","NaN | \n","
428517 rows × 4 columns
\n","