{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyPUeZ2ElxrPPHuoVaA9ngsH"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","source":[],"metadata":{"id":"dmkDbw6WXAbh","executionInfo":{"status":"ok","timestamp":1682174130713,"user_tz":-120,"elapsed":3,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":null,"outputs":[]},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"pFXwuw2YtOWN","executionInfo":{"status":"ok","timestamp":1682174134814,"user_tz":-120,"elapsed":3694,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"289968e5-74bb-4bf6-b130-5d19a79d690b"},"outputs":[{"output_type":"stream","name":"stdout","text":["Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"]}],"source":["from google.colab import drive\n","drive.mount('/content/drive')"]},{"cell_type":"code","source":["# Absolute path (under the mount point used above) so the cell can be re-run\n","# from any working directory; a relative cd fails on the second run.\n","%cd /content/drive/MyDrive"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"blzID9U2XEjl","executionInfo":{"status":"ok","timestamp":1682174134815,"user_tz":-120,"elapsed":14,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"a4dbf5ee-ab36-477a-fd52-f93cd91eac73"},"execution_count":2,"outputs":[{"output_type":"stream","name":"stdout","text":["/content/drive/MyDrive\n"]}]},{"cell_type":"code","source":["# Absolute path for the same reason: re-running must not depend on the current directory.\n","%cd /content/drive/MyDrive/challenging-america-word-gap-prediction/"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"UIvi4ktPXJbr","executionInfo":{"status":"ok","timestamp":1682174134816,"user_tz":-120,"elapsed":13,"user":{"displayName":"Martyna 
Drumińska","userId":"13361003509289187965"}},"outputId":"9781982b-a123-4ca5-d367-6aac4bdccf1f"},"execution_count":3,"outputs":[{"output_type":"stream","name":"stdout","text":["/content/drive/MyDrive/challenging-america-word-gap-prediction\n"]}]},{"cell_type":"code","source":["import pandas as pd\n","import itertools\n","import collections"],"metadata":{"id":"JBDE5RdCXLoR","executionInfo":{"status":"ok","timestamp":1682174135212,"user_tz":-120,"elapsed":406,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":4,"outputs":[]},{"cell_type":"code","execution_count":20,"metadata":{"id":"pqThz9AaW-0b","executionInfo":{"status":"ok","timestamp":1682174746505,"user_tz":-120,"elapsed":17750,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"outputs":[],"source":["# Load every token as a literal string. With pandas' default NA parsing, empty fields and\n","# tokens such as 'null', 'nan' or 'NA' become float NaN, and later cells that iterate over\n","# the entries then fail with: 'float' object is not iterable.\n","lists = pd.read_csv(\"list.csv\", sep=\",\", on_bad_lines='skip', encoding=\"utf-8\", dtype=str, keep_default_na=False)"]},{"cell_type":"code","source":["lists"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":423},"id":"jXsHmSmBaBIp","executionInfo":{"status":"ok","timestamp":1682174399937,"user_tz":-120,"elapsed":6,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"b3aa5134-618e-411f-d00d-ff1dbd6a55e5"},"execution_count":8,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" col1\n","0 came\n","1 fiom\n","2 the\n","3 last\n","4 place\n","... ...\n","76790221 some\n","76790222 immigrant\n","76790223 hand\n","76790224 before\n","76790225 beingnvertaken\n","\n","[76790226 rows x 1 columns]"],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
col1
0came
1fiom
2the
3last
4place
......
76790221some
76790222immigrant
76790223hand
76790224before
76790225beingnvertaken
\n","

76790226 rows × 1 columns

\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":8}]},{"cell_type":"code","source":["from nltk import bigrams\n","from nltk import trigrams\n","from nltk.util import ngrams\n","import collections"],"metadata":{"id":"z4n4_Pc3P7_Q","executionInfo":{"status":"ok","timestamp":1682174651429,"user_tz":-120,"elapsed":3212,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":15,"outputs":[]},{"cell_type":"code","source":["# Convert the single-column frame into a plain list of its 'col1' entries. The isinstance\n","# guard keeps the cell re-runnable: once `lists` is already a list, indexing it with\n","# 'col1' would raise a TypeError.\n","if isinstance(lists, pd.DataFrame):\n","    lists = list(lists['col1'])"],"metadata":{"id":"jvBlmYcoac8J","executionInfo":{"status":"ok","timestamp":1682174751257,"user_tz":-120,"elapsed":4776,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":21,"outputs":[]},{"cell_type":"code","source":["type(lists)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"SvLy320Kbhqh","executionInfo":{"status":"ok","timestamp":1682174784501,"user_tz":-120,"elapsed":593,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"0200a896-7495-4fa5-c2eb-b3bb4553ae34"},"execution_count":24,"outputs":[{"output_type":"execute_result","data":{"text/plain":["list"]},"metadata":{},"execution_count":24}]},{"cell_type":"code","source":["l =[]"],"metadata":{"id":"WYE8t9Q-b9_Z","executionInfo":{"status":"ok","timestamp":1682174900011,"user_tz":-120,"elapsed":8,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":25,"outputs":[]},{"cell_type":"code","source":["l = lists"],"metadata":{"id":"goNy7YLgb_je","executionInfo":{"status":"ok","timestamp":1682174906009,"user_tz":-120,"elapsed":318,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":26,"outputs":[]},{"cell_type":"code","source":["all_words = 
[token for token in l if isinstance(token, str)]\n","# `l` is already a flat list of tokens, so nothing needs flattening: itertools.chain(*l)\n","# would split each word into characters and raise on non-string (float NaN) entries.\n","# Only the string tokens are kept."],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":166},"id":"Tp7JpfcIXdGh","executionInfo":{"status":"error","timestamp":1682174915931,"user_tz":-120,"elapsed":437,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"daf1d527-59ff-4507-8328-12868055a3d6"},"execution_count":28,"outputs":[]},{"cell_type":"code","source":["# Preview only: the full token list is far too large to render.\n","all_words[:10]"],"metadata":{"id":"scg9uWDpYHvo","executionInfo":{"status":"aborted","timestamp":1682174143809,"user_tz":-120,"elapsed":9,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["word_count = collections.Counter(lists)"],"metadata":{"id":"QzgpW80aXxIn","executionInfo":{"status":"ok","timestamp":1682174560759,"user_tz":-120,"elapsed":465,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":11,"outputs":[]},{"cell_type":"code","source":["word_count.most_common(15)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"L1trLSTDX5Bj","executionInfo":{"status":"ok","timestamp":1682174579934,"user_tz":-120,"elapsed":510,"user":{"displayName":"Martyna 
Drumińska","userId":"13361003509289187965"}},"outputId":"05460d34-5af9-4145-c4ad-ade8db087cb9"},"execution_count":12,"outputs":[{"output_type":"execute_result","data":{"text/plain":["[('the', 4515278),\n"," ('of', 2911261),\n"," ('and', 2147366),\n"," ('to', 1879496),\n"," ('a', 1421615),\n"," ('in', 1360058),\n"," ('that', 731603),\n"," ('is', 688194),\n"," ('it', 597264),\n"," ('for', 583612),\n"," ('was', 498125),\n"," ('be', 492532),\n"," ('as', 456008),\n"," ('by', 446776),\n"," ('at', 444455)]"]},"metadata":{},"execution_count":12}]},{"cell_type":"code","source":["type(lists)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"hNF-7FGSbISE","executionInfo":{"status":"ok","timestamp":1682174704165,"user_tz":-120,"elapsed":364,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"eccc0a55-e8bf-4238-df27-550c95f2276e"},"execution_count":18,"outputs":[{"output_type":"execute_result","data":{"text/plain":["list"]},"metadata":{},"execution_count":18}]},{"cell_type":"code","source":["import nltk"],"metadata":{"id":"lm69JXlDcq0r","executionInfo":{"status":"ok","timestamp":1682175085173,"user_tz":-120,"elapsed":322,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":31,"outputs":[]},{"cell_type":"code","source":["# Bind the lazy bigram generator to its own name so that the `bigrams` function\n","# imported from nltk is not shadowed for the cells that call it later.\n","word_bigrams = nltk.bigrams(l)"],"metadata":{"id":"0UwkqKAzcj9r","executionInfo":{"status":"ok","timestamp":1682175089043,"user_tz":-120,"elapsed":4,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":32,"outputs":[]},{"cell_type":"code","source":["word_bigrams"],"metadata":{"id":"-pEPxA4jcuUn","executionInfo":{"status":"ok","timestamp":1682175097492,"user_tz":-120,"elapsed":410,"user":{"displayName":"Martyna 
Drumińska","userId":"13361003509289187965"}},"outputId":"1ee2aef9-4250-4e9e-bda1-e43119be7044","colab":{"base_uri":"https://localhost:8080/"}},"execution_count":33,"outputs":[{"output_type":"execute_result","data":{"text/plain":[""]},"metadata":{},"execution_count":33}]},{"cell_type":"code","source":["# `l` is a flat list of tokens, so the bigrams are taken over the token sequence itself.\n","# Calling bigrams() on each entry would pair up the characters of a word and raise\n","# TypeError on non-string (float) entries. nltk.bigrams is referenced explicitly because\n","# the bare name `bigrams` may have been rebound by an earlier cell and is rebound below.\n","terms_bigrams = nltk.bigrams(l)\n","bigrams = list(terms_bigrams)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":329},"id":"hjlmU0HiX7iE","executionInfo":{"status":"error","timestamp":1682174942678,"user_tz":-120,"elapsed":396,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"92dae62f-e87a-49eb-cfeb-e471815f8167"},"execution_count":29,"outputs":[]},{"cell_type":"code","source":["# Preview only: there is one bigram per adjacent token pair, too many to render in full.\n","bigrams[:10]"],"metadata":{"id":"NezoleAza9D7"},"execution_count":null,"outputs":[]}]}