challenging-america-word-ga.../Untitled1.ipynb

1 line
17 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Word-gap prediction: n-gram counts\n",
    "\n",
    "Builds unigram, bigram and trigram frequency tables from the `col1` column of `cleaned.csv`.\n",
    "\n",
    "The notebook is written to survive **Restart runtime -> Run all** on Google Colab, with the project folder stored in Google Drive."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ZXkygn2uDmRL"
   },
   "outputs": [],
   "source": [
    "import collections\n",
    "\n",
    "import nltk\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from google.colab import drive\n",
    "from nltk import word_tokenize"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Environment setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "pFXwuw2YtOWN"
   },
   "outputs": [],
   "source": [
    "drive.mount('/content/drive')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "B8pDlU1gDjgx"
   },
   "outputs": [],
   "source": [
    "# Absolute path so that re-running this cell is harmless; a relative `cd`\n",
    "# fails the second time because the working directory has already moved.\n",
    "%cd /content/drive/MyDrive/challenging-america-word-gap-prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "w-xqsBl4Ee25"
   },
   "outputs": [],
   "source": [
    "# Tokenizer models required by word_tokenize.\n",
    "nltk.download('punkt')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ynVR7uWXDqyV"
   },
   "outputs": [],
   "source": [
    "# on_bad_lines='skip' drops rows pandas cannot parse instead of raising.\n",
    "# Missing cells become empty strings so that every row can be tokenized.\n",
    "cleaned_df = pd.read_csv(\n",
    "    \"cleaned.csv\", sep=\",\", on_bad_lines='skip', encoding=\"utf-8\"\n",
    ").fillna('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ia6-CxAvDwOi"
   },
   "outputs": [],
   "source": [
    "cleaned_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Q1tAFbGHEJ5z"
   },
   "outputs": [],
   "source": [
    "# Derived from cleaned_df under a separate name, so this cell can be re-run.\n",
    "cleaned = cleaned_df['col1'].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Y2ho8UG3EYZi"
   },
   "outputs": [],
   "source": [
    "cleaned[:2]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Count n-grams\n",
    "\n",
    "Every line is counted on its own. An empty string is placed in front of the first token of each line as a start-of-line marker."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "7pBEAMxyD2eV"
   },
   "outputs": [],
   "source": [
    "def count_ngrams(lines, tokenize=None):\n",
    "    \"\"\"Count unigrams, bigrams and trigrams over an iterable of text lines.\n",
    "\n",
    "    Each line is tokenized independently. An empty string is pushed ahead\n",
    "    of the first token of every line as a start-of-line marker, so the\n",
    "    first bigram of a line is ('', first_token) and its first trigram is\n",
    "    ('', first_token, second_token).\n",
    "\n",
    "    Args:\n",
    "        lines: iterable of strings, one text per item.\n",
    "        tokenize: callable turning a string into an iterable of tokens.\n",
    "            Defaults to nltk's word_tokenize.\n",
    "\n",
    "    Returns:\n",
    "        A tuple (vocab, unigram, bigram, trigram): the set of distinct\n",
    "        tokens and three collections.Counter objects keyed by token,\n",
    "        by 2-tuple of tokens and by 3-tuple of tokens respectively.\n",
    "    \"\"\"\n",
    "    if tokenize is None:\n",
    "        tokenize = word_tokenize\n",
    "\n",
    "    vocab = set()\n",
    "    unigram = collections.Counter()\n",
    "    bigram = collections.Counter()\n",
    "    trigram = collections.Counter()\n",
    "    window = collections.deque(maxlen=3)\n",
    "\n",
    "    for line in lines:\n",
    "        # Start every line from an empty window so that no n-gram mixes the\n",
    "        # tail of the previous line with the head of this one.\n",
    "        window.clear()\n",
    "        window.append('')\n",
    "        for token in tokenize(line):\n",
    "            window.append(token)\n",
    "            vocab.add(token)\n",
    "            unigram[token] += 1\n",
    "            # The window holds the marker plus at least one token here, so a\n",
    "            # bigram always exists. Take the LAST two entries: once the\n",
    "            # window is full, the first two are the previous bigram, not\n",
    "            # the one ending at the current token.\n",
    "            bigram[tuple(window)[-2:]] += 1\n",
    "            if len(window) == 3:\n",
    "                trigram[tuple(window)] += 1\n",
    "\n",
    "    return vocab, unigram, bigram, trigram"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "rfxca0XHD_hC"
   },
   "outputs": [],
   "source": [
    "vocab, unigram, bigram, trigram = count_ngrams(cleaned)\n",
    "\n",
    "# NOTE(review): len(unigram) is the number of DISTINCT tokens, not the total\n",
    "# number of tokens, despite the variable name. Kept as in the original\n",
    "# notebook; confirm which value later code expects to find in unigram[''].\n",
    "total_words = len(unigram)\n",
    "unigram[''] = total_words"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "authorship_tag": "ABX9TyNDhELzaECzdLD8HtO8Z84Y",
   "name": "",
   "version": ""
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}