17 KiB
17 KiB
# Mount Google Drive inside the Colab runtime; the Drive contents become
# available under /content/drive for the cells that follow.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
cd drive/MyDrive
/content/drive/MyDrive
cd challenging-america-word-gap-prediction/
/content/drive/MyDrive/challenging-america-word-gap-prediction
import numpy as np
import pandas as pd

# Load the corpus from cleaned.csv in the current working directory.
# Rows that pandas cannot parse are skipped instead of raising.
cleaned = pd.read_csv(
    "cleaned.csv",
    sep=",",
    encoding="utf-8",
    on_bad_lines='skip',
)

# Missing cells become empty strings so every row holds a value.
cleaned = cleaned.fillna('')

# Bare expression: renders the frame as the notebook cell output.
cleaned
col1 | |
---|---|
0 | came fiom the last place tothis place and this... |
1 | mb boot political obeednattempt to imagine a p... |
2 | |
3 | whenever any prize property shall condemn app... |
4 | sa lkofvaluable unimpbovd relnjsiatf on the no... |
... | ... |
428512 | |
428513 | |
428514 | |
428515 | |
428516 |
428517 rows × 1 columns
import collections

# Module-level state filled in by the n-gram counting loop:
#   vocab   - set of every distinct token seen
#   unigram - token -> occurrence count
#   bigram  - 2-tuple of tokens -> occurrence count
#   trigram - 3-tuple of tokens -> occurrence count
vocab = set()
unigram, bigram, trigram = {}, {}, {}

# Sliding window over the most recent tokens; with maxlen=3 the oldest entry
# is discarded automatically when a fourth one is appended.
queue = collections.deque(maxlen=3)
import nltk

# Download the Punkt tokenizer data that nltk.word_tokenize relies on.
# NLTK 3.8.2 and later load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so both are requested to keep the notebook working on old
# and new NLTK versions alike. The original 'punkt' call stays last so the
# cell's displayed value is unchanged.
nltk.download('punkt_tab')
nltk.download('punkt')
[nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Unzipping tokenizers/punkt.zip.
True
from nltk import word_tokenize

# From here on `cleaned` is a plain Python list holding the values of the
# 'col1' column (it was a DataFrame before this point).
cleaned = cleaned['col1'].tolist()

# Bare expression: shows the first two entries as the notebook cell output.
cleaned[:2]
['came fiom the last place tothis place and this place is where wenwere this is the first road i evernwas on where you can ride elsewherenfrom anywhere and be nowherenhe says while this train stops everynwhere it never stops anywhere unnless its somewhere well i saysnim glad to hear that but accordning to your figures i left myselfnwhere was which is five miles nearner to myself than i was when wenwere where we are nownwe have now reached slidellnthat a fine place the people down there remind me of bananasnthey come and go in bunches ndell used to be noted for her toughnpeople now she is noted for bentough steaks well i certainly gotnone there when the waiter broughtnit in it was so small i thought itnwas a crack in the plate i skidnwaiter what else have you got +henbrought me in two codfish and onensmelt i said waiter have you gotnpigs feet he said no rheumatismnmakes me walk that way i saldnhow is the pumpkin pieliesaidnit all squash the best i could getnin that hotel was a soup sandwichnafter the table battle the waiter andni signed an armistice i then wentnover to the hotel clerk and asked forna room he said with or without anbed i said with a bed he saidni dont think i have a bed longnenough for you i said well illnaddtwo feettoitwhenigetinitnhe gave me a lovely room on thentop floor it was one of those roomsnthat stands on each side if younhappen to get up in the middle ofnthe night you want to be sure andnget up in the middle of the roomnthat night i dreamt i was eatingnflannel cakes when i woke up halfnof the blanket was gone i mustnhave got up on the wrong side of thenbed for next morning i had an awfulnheadache i told the manager aboutnit he said you have rheumaticnpains i said no i think it is onnof those attic room pains i nad tongetupat aminthemorningsonthey could use the sheet to set thenbreakfast table', 'mb boot political obeednattempt to imagine a piatt makingnsuch an address as that of elihu bootnto the now york legislature and younfcavo a measure 
of tho good fortunqnwhich baa at last come to tho empirqnstate of being represented in tho unitned states senate by a statesman atntho very outset mr boot declared forntho parcels post thereby giving noticento tho country that tho express compannies no longer own a senatorial scat acncredited to new york that seat willnfor ho next six years bo occupied by ansmaa who hag convictions of his ownnwho isigovemed by reasoned politicaln ideas who had grown so accustomed tonthink nationally that it is with somonmental eflort that he can bringhimselfninto a proper perspective with thosenminor senatorial duties such as tho fillning of offices which bulk hugelynupon the horizons of tho flatts andntheir lit tho albany politicians wenare told tried to read between tho linesnfor evidence that they had among themna new organization leader somo one tonguide and direct their political machinnations and to settlo where tho goodnthings should go wo think they lisntened in vain what they heard werentimely reflections opon tho immediatenproblems of stato and national governnments mixed with excellent advice tonthe electorate on the duty of improvingnthe quality of tho stato legislaturesnit must have been something of a novnelty though possibly not wholly refreshlin gnto political thirst']
# Build the unigram / bigram / trigram frequency tables from the corpus.
#
# Reads:   cleaned (iterable of text lines), word_tokenize, queue (deque, maxlen=3)
# Updates: vocab (set of tokens), unigram (token -> count),
#          bigram ((w1, w2) -> count), trigram ((w1, w2, w3) -> count)
# Sets:    total_words and unigram[''] once all lines have been processed.
#
# Changes from the earlier version of this cell:
#   * the bigram key is the LAST two items of the window (previous token,
#     current token); taking the first two of a full 3-item window counted
#     the previous pair again and never counted the final pair of a line;
#   * the window is cleared at the start of every line so n-grams do not
#     include tokens left over from the previous line;
#   * the per-token debug prints of the whole vocab/count tables are gone,
#     since they produced output that grows with every token processed.
for line in cleaned:
    # Start each line with an empty window, then push the start marker.
    queue.clear()
    queue.append('')  # use empty string to mark the beginning of a sentence

    tokens = word_tokenize(line)
    for token in tokens:
        # add new word to the queue (the deque drops its oldest item when full)
        queue.append(token)
        window = tuple(queue)

        # discover new word
        vocab.add(token)

        # count frequency of 1 word
        unigram[token] = unigram.get(token, 0) + 1

        # count frequency of 2 words: the window always holds at least the
        # start marker plus the current token, so the last two items exist.
        pair = window[-2:]
        bigram[pair] = bigram.get(pair, 0) + 1

        # count frequency of 3 words, only once the window is full
        if len(window) == 3:
            trigram[window] = trigram.get(window, 0) + 1

# NOTE(review): len(unigram) is the number of DISTINCT tokens, not the total
# token count and not the number of lines; it is stored as the count of the
# '' start marker exactly as before. Confirm this is the intended value for
# whatever consumes unigram[''].
total_words = len(unigram)
unigram[''] = total_words