moj_labs/lab1.ipynb
2023-03-16 23:01:41 +01:00

16 KiB
Raw Blame History

!curl -O http://static.decontextualize.com/gutenberg-poetry-v001.ndjson.gz
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 52.2M  100 52.2M    0     0  4073k      0  0:00:13  0:00:13 --:--:-- 4693k
# Decompress the corpus and parse it: the file holds one JSON object per line.
import gzip
import json

# The context manager closes the gzip handle as soon as every line is parsed
# (the bare gzip.open() call used before left it open).
with gzip.open('gutenberg-poetry-v001.ndjson.gz') as corpus_file:
    # json.loads accepts the raw bytes lines and ignores the trailing newline.
    raw_data = [json.loads(line) for line in corpus_file]
raw_data[100:110]
[{'s': 'Through their palisades of pine-trees,', 'gid': '19'},
 {'s': 'And the thunder in the mountains,', 'gid': '19'},
 {'s': 'Whose innumerable echoes', 'gid': '19'},
 {'s': 'Flap like eagles in their eyries;--', 'gid': '19'},
 {'s': 'Listen to these wild traditions,', 'gid': '19'},
 {'s': 'To this Song of Hiawatha!', 'gid': '19'},
 {'s': "Ye who love a nation's legends,", 'gid': '19'},
 {'s': 'Love the ballads of a people,', 'gid': '19'},
 {'s': 'That like voices from afar off', 'gid': '19'},
 {'s': 'Call to us to pause and listen,', 'gid': '19'}]
# Group the verse lines ('s') by their 'gid' id so that the individual lines
# can be stitched back together into whole poems.
lines_by_gid = {}
for record in raw_data:  # named `record` because `object` would shadow the builtin
    lines_by_gid.setdefault(record['gid'], []).append(record['s'])
# Join once per id instead of growing each poem string with += on every line.
poems_dict = {gid: '\n'.join(lines) for gid, lines in lines_by_gid.items()}
print(poems_dict['19'][5000:5200])
here the tangled barberry-bushes
Hang their tufts of crimson berries
Over stone walls gray with mosses,
Pause by some neglected graveyard,
For a while to muse, and ponder
On a half-effaced inscription
# Report the number of poems and their approximate average length in words.
# The figure is approximate because texts are split on whitespace only, so
# punctuation stays attached to the words.
poems_count = len(poems_dict)
total_word_count = sum(len(words) for words in map(str.split, poems_dict.values()))
print('Total poems:', poems_count)
print('Average poem word length:', total_word_count // poems_count)
Total poems: 1191
Average poem word length: 18438
# Dump the whole corpus into a single .txt file, each poem followed by a newline
with open('gutenberg_poems.txt', mode='w', encoding='utf-8') as corpus_out:
    corpus_out.writelines(f'{poem}\n' for poem in poems_dict.values())
# Count every character in the corpus that falls outside the set [a-zA-Z ]
# (letters and space), listed from most to least frequent, to see which
# potentially undesirable characters need cleaning up
!grep -oE "[^a-zA-Z ]" gutenberg_poems.txt | sort | uniq -c | sort -k1 -nr
2257117 ,
 732414 .
 597979 '
 517116 -
 334015 ;
 236376 "
 201702 !
 123680 :
  97547 _
  92513 ?
  28887 (
  28513 )
  27472 
  14575 1
  13943 “
   8998 ]
   8787 /
   8585 }
   8300 2
   7256 {
   6098 3
   5744 8
   5636 4
   5585 ”
   5416 6
   5180 5
   5166 7
   4926 þ
   4670 0
   4507 9
   4346 [
   3661 &
   2542 *
   2364 —
   2000 `
   1985 =
   1908 
   1338 ~
   1211 α
   1203 ν
   1113 ο
   1059 |
    974 ε
    924 τ
    922 Þ
    920 ¡
    849 ι
    666 ρ
    621 >
    599 ς
    593 <
    561 +
    522 σ
    470 π
    464 λ
    453 μ
    447 κ
    415 ¿
    359 δ
    349 €
    347 ”
    345 υ
    282 «
    277 η
    265 ω
    258 θ
    246  
    220 γ
    212 #
    181 φ
    178 »
    138 χ
    137 ^
    114 ἐ
    109 έ
    102 ί
    101 ά
     98 ὶ
     97 ί
     97 ἀ
     96 $
     86 έ
     84 ὸ
     81 β
     76 ό
     74 ά
     72 ῖ
     69 ’
     65 ὰ
     60 
     56 ό
     56 ξ
     54 ύ
     51 ῦ
     50 ῶ
     49 
     48 ὲ
     46 ἔ
     43 ύ
     41 ὐ
     40 ἄ
     39 ἰ
     38 ζ
     38 ·
     37 §
     34 ή
     31 Α
     30 Τ
     30 ῆ
     29 ὑ
     29 ὴ
     28 Π
     27 †
     26 ώ
     26 Μ
     25 ­
     24 ὺ
     24 Ο
     24 ἱ
     23 Κ
     23 ή
     22 
     22 ¶
     21 Ἀ
     21 \
     21 @
     20 ὡ
     20 ψ
     19 ὀ
     19 ἡ
     19 Ε
     18 ἶ
     18 ·
     17 ὁ
     16 ώ
     16 †
     15 £
     15 ̄
     15 „
     14 Σ
     14 ἴ
     14 ᾶ
     13 ῳ
     13 ῷ
     13 ῥ
     13 ὄ
     13 Δ
     13 Β
     12 ὅ
     12 ἁ
     11 Θ
     11 ῇ
     11 ἑ
     11 ©
     10 ὖ
     10 Λ
     10 ῃ
     10 Ἔ
     10 ´
     10 °
      9 ὼ
      9 Ὅ
      9 Ν
      9 ἵ
      9 ¦
      8 Ι
      8 ἤ
      8 ─
      7 Φ
      7 ὕ
      7 Ὑ
      7 ὔ
      7 ϊ
      7 ‧
      6 Χ
      6 ϕ
      6 ΐ
      6 ἢ
      6 Ζ
      6 Ἑ
      6 ᾳ
      5 ὤ
      5 ϑ
      5 ἦ
      5 Ἄ
      5 ½
      5 …
      5 œ
      4 Ὁ
      4 Ἠ
      4 Η
      4 Ἐ
      4 Γ
      4 ἅ
      4 Ἁ
      4 ̆
      4 
      4 	
      3 ὥ
      3 Ὡ
      3 ὦ
      3 Ὠ
      3 ὠ
      3 Ω
      3 ὗ
      3 ῤ
      3 ὃ
      3 ΐ
      3 Ἱ
      3 ἲ
      3 Ἰ
      3 ἂ
      3 ¼
      3 ;
      3 %
      3 “
      2 ὧ
      2 ὒ
      2 Υ
      2 Ξ
      2 ἷ
      2 Ἴ
      2 ῂ
      2 ἣ
      2 Ἡ
      2 ἠ
      2 Ἕ
      2 ἕ
      2 ̓
      2 ⁂
      2 ‡
      2 ×
      2 Ž
      1 ᾧ
      1 Ὣ
      1 ὣ
      1 Ὥ
      1 ὢ
      1 Ὤ
      1 ῡ
      1 ϋ
      1 Ῥ
      1 Ρ
      1 Ό
      1 ὂ
      1 Ὀ
      1 ἳ
      1 Ἵ
      1 ᾗ
      1 ἧ
      1 Ἣ
      1 ἥ
      1 ᾐ
      1 Έ
      1 ἒ
      1 ᾴ
      1 Ά
      1 ἆ
      1 ¤
      1 ̷
      1 ☞
      1 ‖
      1 ΄
      1 ®
      1 ™
      1 —
      1 ‘
      1 
      1 
# Normalize look-alike characters that should be kept (ellipsis, dash and apostrophe variants) to their plain equivalents
!sed -i 's/;/;/g' gutenberg_poems.txt
!sed -i 's/…/\.\.\./g' gutenberg_poems.txt
!sed -i 's/[—─–]/-/g' gutenberg_poems.txt
!sed -i "s/[\`\\\\´\΄]/\'/g" gutenberg_poems.txt

# Delete every remaining character that is not a letter (a-z, A-Z), a space or
# one of the punctuation marks , . ' ; ! : ? -
!sed -i "s/[^a-zA-Z\ \,\.\'\;\!\:\?\-]//g" gutenberg_poems.txt

# Collapse each run of consecutive spaces into a single space
!sed -i "s/\ \ */\ /g" gutenberg_poems.txt

# Finally, pipe the text through unidecode to strip diacritic marks from
# alphabetic characters, writing the result to gutenberg_poems_clean.txt
!cat gutenberg_poems.txt | unidecode > gutenberg_poems_clean.txt
# Re-run the character census on the cleaned file. It looks a lot better now:
# only alphabetic characters, spaces and the chosen punctuation marks are left
!grep -oE "[^a-zA-Z ]" gutenberg_poems_clean.txt | sort | uniq -c | sort -k1 -nr
2257117 ,
 732429 .
 629413 '
 519492 -
 334018 ;
 201702 !
 123680 :
  92513 ?
# Some basic processed file statistics:
!echo -n "Lines: "
!wc -l < gutenberg_poems_clean.txt
!echo -n "Words: "
!wc -w < gutenberg_poems_clean.txt
!echo -n "Characters: "
!wc -c < gutenberg_poems_clean.txt
!echo -n "Size: "
!ls -lh gutenberg_poems_clean.txt | awk '{print $5}'
Lines: 3085117
Words: 21938739
Characters: 120840262
Size: 116M
# Some random lines from the file (doing this in Jupyter throws a harmless piping error apparently)
!cat gutenberg_poems_clean.txt | shuf | head -10
Sae aft around him flung,
A thing so dark that moments of pain
A mother and daughter stood together
He hath heathen gifts of silver and gold,
at secura quies et nescia fallere uita,
The grim dim thrones of the east Ep. .
Ah tamen illa scelus non lavat unda tuum!
A strong emotion on her cheek!
Byron sang its funeral dirge. But tenderness, and heroism, and
Which now upon my fingers thoughtfully
shuf: write error: Broken pipe
shuf: write error
# Simple histogram of the 10 most frequent letters, counted case-sensitively:
# grep emits one character per line, uniq -c counts them and head keeps the
# first 10 lines of the descending sort (takes a while to run)
!grep -oE "\w" gutenberg_poems_clean.txt | sort | uniq -c | sort -k1 -nr | head
11792288 e
7549004 t
6755728 a
6703094 o
6189711 h
6158676 n
6157618 s
5768879 r
5576169 i
4163693 l
# Simple top 10 frequency histogram of words (takes a while to run, piping error thrown here as well, but it works)
!cat gutenberg_poems_clean.txt | tr ' ' '\n' | sort | uniq -c | sort -k1 -nr | head
1110344 the
 526755 and
 477187 of
 367204 to
 309477 a
 294277 And
 283595 in
 243898 I
 198621 The
 182639 his
sort: write failed: 'standard output': Broken pipe
sort: write error
# Compress the cleaned file with xz for uploading (-v reports progress).
# NOTE: by default xz replaces the input file with gutenberg_poems_clean.txt.xz
!xz -v gutenberg_poems_clean.txt
gutenberg_poems_clean.txt (1/1)
  100 %        34.6 MiB / 115.2 MiB = 0.300   1.6 MiB/s       1:10