20 KiB
20 KiB
# Mount Google Drive inside the Colab VM so that files stored on Drive can be
# reached through the local filesystem under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
cd drive/MyDrive
/content/drive/MyDrive
cd challenging-america-word-gap-prediction/
/content/drive/MyDrive/challenging-america-word-gap-prediction
import pandas as pd
import itertools
import collections
# Load the token list: a single-column CSV (column `col1`) with one token per row.
#
# Every row is read as a plain string. With pandas' default NA detection,
# tokens that look like missing-value markers (e.g. "NA", "null", "nan") and
# empty fields are converted to float NaN, and those floats later break any
# code that iterates over the tokens ("'float' object is not iterable").
# `na_filter=False` switches that detection off and `dtype=str` guarantees
# a uniform string column.
lists = pd.read_csv(
    "list.csv",
    sep=",",
    on_bad_lines='skip',
    encoding="utf-8",
    dtype=str,
    na_filter=False,
)
lists
col1 | |
---|---|
0 | came |
1 | fiom |
2 | the |
3 | last |
4 | place |
... | ... |
76790221 | some |
76790222 | immigrant |
76790223 | hand |
76790224 | before |
76790225 | beingnvertaken |
76790226 rows × 1 columns
from nltk import bigrams
from nltk import trigrams
from nltk.util import ngrams
import collections
# Pull the token column out of the DataFrame as a plain Python list.
# NOTE: this rebinds `lists` from a DataFrame to a list, so the cell cannot be
# re-run without reloading the CSV first.
lists = lists['col1'].tolist()
type(lists)
list
# `l` is the alias the later cells use for the token list.
l = lists

# `lists` is already a flat sequence of tokens, so nothing needs flattening.
# itertools.chain(*l) would pass every token as a separate argument, iterate
# each token character by character, and raise TypeError on the non-string
# entries (missing values are stored as floats). Keep only real string tokens.
all_words = [word for word in l if isinstance(word, str)]
[0;31m---------------------------------------------------------------------------[0m [0;31mTypeError[0m Traceback (most recent call last) [0;32m<ipython-input-28-f303454e8399>[0m in [0;36m<cell line: 1>[0;34m()[0m [0;32m----> 1[0;31m [0mall_words[0m [0;34m=[0m [0mlist[0m[0;34m([0m[0mitertools[0m[0;34m.[0m[0mchain[0m[0;34m([0m[0;34m*[0m[0ml[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m [0;31mTypeError[0m: 'float' object is not iterable
# Echo `all_words` as the cell output (requires the assignment above to have succeeded).
all_words
# Tally how often each entry of the token list occurs, then show the
# 15 most frequent entries together with their counts.
word_count = collections.Counter()
word_count.update(lists)
word_count.most_common(15)
[('the', 4515278), ('of', 2911261), ('and', 2147366), ('to', 1879496), ('a', 1421615), ('in', 1360058), ('that', 731603), ('is', 688194), ('it', 597264), ('for', 583612), ('was', 498125), ('be', 492532), ('as', 456008), ('by', 446776), ('at', 444455)]
# Sanity check: show the container type currently bound to `lists`.
type(lists)
list
import nltk

# Lazily pair every token with its successor across the whole token stream.
# The generator is bound to `word_bigrams` rather than `bigrams` so that the
# `bigrams` function imported from nltk is not shadowed; the following cell
# still calls `bigrams(...)` as a function, which would fail on a generator.
word_bigrams = nltk.bigrams(l)
word_bigrams
<generator object bigrams at 0x7fd008c53f90>
# `l` is one flat stream of tokens, not a list of tokenised documents, so the
# bigrams are built once over the whole stream. Calling bigrams on each entry
# would produce character pairs for string tokens and raise
# "TypeError: 'float' object is not iterable" on missing values.
# Non-string entries are dropped first; note that this makes the neighbours of
# a dropped entry adjacent in the resulting bigrams.
tokens = [entry for entry in l if isinstance(entry, str)]

# `nltk.bigrams` is addressed through the module so this cell does not depend
# on the bare name `bigrams`, which is rebound to data below (and by an
# earlier cell). The whole stream is kept as a single "document" so that
# `terms_bigrams` retains its list-of-bigram-lists shape.
terms_bigrams = [list(nltk.bigrams(tokens))]

# chain.from_iterable flattens without star-unpacking the outer list into
# call arguments.
bigrams = list(itertools.chain.from_iterable(terms_bigrams))
[0;31m---------------------------------------------------------------------------[0m [0;31mTypeError[0m Traceback (most recent call last) [0;32m<ipython-input-29-46d4cee91707>[0m in [0;36m<cell line: 1>[0;34m()[0m [0;32m----> 1[0;31m [0mterms_bigrams[0m [0;34m=[0m [0;34m[[0m[0mlist[0m[0;34m([0m[0mbigrams[0m[0;34m([0m[0mentry[0m[0;34m)[0m[0;34m)[0m [0;32mfor[0m [0mentry[0m [0;32min[0m [0ml[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 2[0m [0mbigrams[0m[0;34m=[0m [0mlist[0m[0;34m([0m[0mitertools[0m[0;34m.[0m[0mchain[0m[0;34m([0m[0;34m*[0m[0mterms_bigrams[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0;32m<ipython-input-29-46d4cee91707>[0m in [0;36m<listcomp>[0;34m(.0)[0m [0;32m----> 1[0;31m [0mterms_bigrams[0m [0;34m=[0m [0;34m[[0m[0mlist[0m[0;34m([0m[0mbigrams[0m[0;34m([0m[0mentry[0m[0;34m)[0m[0;34m)[0m [0;32mfor[0m [0mentry[0m [0;32min[0m [0ml[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 2[0m [0mbigrams[0m[0;34m=[0m [0mlist[0m[0;34m([0m[0mitertools[0m[0;34m.[0m[0mchain[0m[0;34m([0m[0;34m*[0m[0mterms_bigrams[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0;32m/usr/local/lib/python3.9/dist-packages/nltk/util.py[0m in [0;36mbigrams[0;34m(sequence, **kwargs)[0m [1;32m 885[0m """ [1;32m 886[0m [0;34m[0m[0m [0;32m--> 887[0;31m [0;32myield[0m [0;32mfrom[0m [0mngrams[0m[0;34m([0m[0msequence[0m[0;34m,[0m [0;36m2[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 888[0m [0;34m[0m[0m [1;32m 889[0m [0;34m[0m[0m [0;32m/usr/local/lib/python3.9/dist-packages/nltk/util.py[0m in [0;36mngrams[0;34m(sequence, n, **kwargs)[0m [1;32m 857[0m [0;34m:[0m[0mrtype[0m[0;34m:[0m [0msequence[0m [0;32mor[0m [0miter[0m[0;34m[0m[0;34m[0m[0m [1;32m 858[0m """ [0;32m--> 859[0;31m [0msequence[0m [0;34m=[0m [0mpad_sequence[0m[0;34m([0m[0msequence[0m[0;34m,[0m [0mn[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 860[0m [0;34m[0m[0m [1;32m 861[0m [0;31m# Creates the sliding window, of n no. 
of items.[0m[0;34m[0m[0;34m[0m[0m [0;32m/usr/local/lib/python3.9/dist-packages/nltk/util.py[0m in [0;36mpad_sequence[0;34m(sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol)[0m [1;32m 810[0m [0;34m:[0m[0mrtype[0m[0;34m:[0m [0msequence[0m [0;32mor[0m [0miter[0m[0;34m[0m[0;34m[0m[0m [1;32m 811[0m """ [0;32m--> 812[0;31m [0msequence[0m [0;34m=[0m [0miter[0m[0;34m([0m[0msequence[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 813[0m [0;32mif[0m [0mpad_left[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m [1;32m 814[0m [0msequence[0m [0;34m=[0m [0mchain[0m[0;34m([0m[0;34m([0m[0mleft_pad_symbol[0m[0;34m,[0m[0;34m)[0m [0;34m*[0m [0;34m([0m[0mn[0m [0;34m-[0m [0;36m1[0m[0;34m)[0m[0;34m,[0m [0msequence[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0;31mTypeError[0m: 'float' object is not iterable
# Echo the value currently bound to `bigrams` as the cell output.
bigrams