challenging-america-word-ga.../Untitled2.ipynb
2023-05-10 00:37:23 +02:00

20 KiB
Raw Blame History

# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
cd drive/MyDrive
/content/drive/MyDrive
cd challenging-america-word-gap-prediction/
/content/drive/MyDrive/challenging-america-word-gap-prediction
import pandas as pd
import itertools
import collections
# Load the word column from list.csv.
# NA detection is disabled and the dtype is forced to str so that tokens such
# as "NA", "null" or "nan" stay ordinary words. With the pandas defaults they
# are presumably what ends up as float NaN in the column, which later breaks
# code that expects every entry to be a string.
lists = pd.read_csv(
    "list.csv",
    sep=",",
    on_bad_lines="skip",
    encoding="utf-8",
    dtype=str,
    na_filter=False,
)
lists
col1
0 came
1 fiom
2 the
3 last
4 place
... ...
76790221 some
76790222 immigrant
76790223 hand
76790224 before
76790225 beingnvertaken

76790226 rows × 1 columns

from nltk import bigrams
from nltk import trigrams
from nltk.util import ngrams
import collections
# Replace the DataFrame with the contents of its "col1" column as a plain
# Python list, then show the resulting type.
lists = lists["col1"].tolist()
type(lists)
list
# `lists` is already a flat sequence of tokens, so there is nothing to
# flatten: itertools.chain(*l) iterates every element (splitting words into
# characters) and raises TypeError on the float entries present in the data.
l = lists
# Keep only genuine string tokens; non-string entries (e.g. NaN) are dropped.
all_words = [word for word in l if isinstance(word, str)]
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-28-f303454e8399> in <cell line: 1>()
----> 1 all_words = list(itertools.chain(*l))

TypeError: 'float' object is not iterable
# Display `all_words`.
all_words
# Tally how often each entry of `lists` occurs, then show the 15 most
# frequent entries with their counts.
word_count = collections.Counter()
word_count.update(lists)
word_count.most_common(15)
[('the', 4515278),
 ('of', 2911261),
 ('and', 2147366),
 ('to', 1879496),
 ('a', 1421615),
 ('in', 1360058),
 ('that', 731603),
 ('is', 688194),
 ('it', 597264),
 ('for', 583612),
 ('was', 498125),
 ('be', 492532),
 ('as', 456008),
 ('by', 446776),
 ('at', 444455)]
# Sanity check: show the type currently bound to `lists`.
type(lists)
list
import nltk
# Bind the lazy word-pair generator to its own name. Assigning it to
# `bigrams` would shadow the `bigrams` function imported from nltk, so any
# later `bigrams(...)` call would fail because a generator is not callable.
word_bigrams = nltk.bigrams(l)
word_bigrams
<generator object bigrams at 0x7fd008c53f90>
# `l` is one flat token stream, so the bigrams of interest are adjacent
# *word* pairs. Calling bigrams(entry) on each entry would instead pair up
# the characters inside every word, and it raises TypeError on non-string
# entries (float NaN) in `l`.
# Pairs that involve a non-string entry are skipped rather than bridged, so
# no artificial adjacency is created across a dropped token.
adjacent_pairs = zip(l, itertools.islice(l, 1, None))
terms_bigrams = [
    [
        (first, second)
        for first, second in adjacent_pairs
        if isinstance(first, str) and isinstance(second, str)
    ]
]
# NOTE: this rebinds `bigrams`, shadowing the function imported from nltk.
bigrams = list(itertools.chain.from_iterable(terms_bigrams))
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-29-46d4cee91707> in <cell line: 1>()
----> 1 terms_bigrams = [list(bigrams(entry)) for entry in l]
      2 bigrams= list(itertools.chain(*terms_bigrams))

<ipython-input-29-46d4cee91707> in <listcomp>(.0)
----> 1 terms_bigrams = [list(bigrams(entry)) for entry in l]
      2 bigrams= list(itertools.chain(*terms_bigrams))

/usr/local/lib/python3.9/dist-packages/nltk/util.py in bigrams(sequence, **kwargs)
    885     """
    886 
--> 887     yield from ngrams(sequence, 2, **kwargs)
    888 
    889 

/usr/local/lib/python3.9/dist-packages/nltk/util.py in ngrams(sequence, n, **kwargs)
    857     :rtype: sequence or iter
    858     """
--> 859     sequence = pad_sequence(sequence, n, **kwargs)
    860 
    861     # Creates the sliding window, of n no. of items.

/usr/local/lib/python3.9/dist-packages/nltk/util.py in pad_sequence(sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol)
    810     :rtype: sequence or iter
    811     """
--> 812     sequence = iter(sequence)
    813     if pad_left:
    814         sequence = chain((left_pad_symbol,) * (n - 1), sequence)

TypeError: 'float' object is not iterable
# Display whatever `bigrams` is currently bound to.
bigrams