{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Gutenberg poetry corpus\n",
    "\n",
    "Source: https://github.com/aparrish/gutenberg-poetry-corpus\n",
    "\n",
    "This notebook downloads the corpus, groups its verse lines by `gid`, writes them to one text file, reduces that file to ASCII letters, spaces and a small set of punctuation marks, and finally compresses the cleaned file for uploading.\n",
    "\n",
    "**Requirements:** the shell cells call `curl`, `grep`, `sed`, `sort`, `uniq`, `tr`, `wc`, `ls`, `awk`, `shuf`, `xz` and the `unidecode` command-line tool. `sed -i` is used without a backup suffix, which is GNU sed syntax.\n",
    "\n",
    "Outputs are not stored in this notebook; use *Restart Kernel → Run All* to regenerate them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gzip\n",
    "import json\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Where the corpus archive comes from and where it is stored locally\n",
    "CORPUS_URL = 'http://static.decontextualize.com/gutenberg-poetry-v001.ndjson.gz'\n",
    "CORPUS_ARCHIVE = Path('gutenberg-poetry-v001.ndjson.gz')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download and load the corpus\n",
    "\n",
    "The archive is roughly 52 MB, so it is only downloaded when it is not already present."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Skip the download when the archive already exists so that a full re-run does not fetch it again.\n",
    "# --fail makes curl exit with an error on an HTTP error status instead of saving the error page.\n",
    "if not CORPUS_ARCHIVE.exists():\n",
    "    !curl --fail --location --output {CORPUS_ARCHIVE} {CORPUS_URL}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The archive holds one JSON object per line; the `with` block closes the file once it has been read.\n",
    "# Blank lines are skipped so that they cannot break json.loads.\n",
    "with gzip.open(CORPUS_ARCHIVE, 'rt', encoding='utf-8') as archive:\n",
    "    raw_data = [json.loads(line) for line in archive if line.strip()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every record has the verse text under 's' and an id under 'gid'\n",
    "raw_data[100:110]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Group verses by id\n",
    "\n",
    "Verses that share a `gid` are joined with newlines, which makes it possible to connect verses into whole texts.\n",
    "\n",
    "An earlier run reported 1191 distinct ids with an average of 18438 whitespace-separated words each.\n",
    "\n",
    "**NOTE(review):** an average that large suggests that a `gid` identifies a whole book rather than a single poem - confirm against the corpus README before relying on the word \"poem\" below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the verses of every id in order of appearance, then join each list once.\n",
    "# (The loop variable is not called `object`, which would shadow the builtin.)\n",
    "verses_by_gid = {}\n",
    "for record in raw_data:\n",
    "    verses_by_gid.setdefault(record['gid'], []).append(record['s'])\n",
    "\n",
    "poems_dict = {gid: '\\n'.join(verses) for gid, verses in verses_by_gid.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(poems_dict['19'][5000:5200])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of entries and average length in words.\n",
    "# The word count is an estimate: text is split on whitespace and punctuation is left attached.\n",
    "poems_count = len(poems_dict)\n",
    "total_word_count = sum(len(text.split()) for text in poems_dict.values())\n",
    "print('Total poems:', poems_count)\n",
    "print('Average poem word length:', total_word_count // poems_count)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write the corpus to a single text file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One entry after another, each terminated by a newline\n",
    "with open('gutenberg_poems.txt', 'w', encoding='utf-8') as corpus_file:\n",
    "    for text in poems_dict.values():\n",
    "        corpus_file.write(text + '\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Survey and clean the characters\n",
    "\n",
    "The survey below counts every character that is not matched by `[a-zA-Z ]`. On an earlier run it listed, besides ordinary punctuation, digits, brackets, typographic quotes and dashes, Greek letters and a few control characters.\n",
    "\n",
    "**NOTE(review):** ranges such as `a-z` in `grep` and `sed` bracket expressions depend on the locale. The final `unidecode` pass can only transliterate accented letters that survive the filtering step before it - confirm that this holds in your locale."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Checking all potentially undesirable characters\n",
    "!grep -oE \"[^a-zA-Z ]\" gutenberg_poems.txt | sort | uniq -c | sort -k1 -nr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalizing some characters that should be kept.\n",
    "# The first pattern is U+037E (Greek question mark), which looks exactly like an ASCII semicolon.\n",
    "!sed -i 's/\u037e/;/g' gutenberg_poems.txt\n",
    "!sed -i 's/…/.../g' gutenberg_poems.txt\n",
    "!sed -i 's/[—─–]/-/g' gutenberg_poems.txt\n",
    "# Members of a bracket expression are listed unescaped: sed treats a backslash inside\n",
    "# brackets as a literal character, so escaping would turn every backslash in the text into an apostrophe.\n",
    "!sed -i \"s/[\\`’‘᾽´΄]/'/g\" gutenberg_poems.txt\n",
    "\n",
    "# Removing everything that is not a letter, a space or one of the chosen punctuation marks\n",
    "# (the hyphen comes last in the set so that it is not read as a range)\n",
    "!sed -i \"s/[^a-zA-Z ,.';!:?-]//g\" gutenberg_poems.txt\n",
    "\n",
    "# Collapsing runs of spaces into a single space\n",
    "!sed -i -E 's/ +/ /g' gutenberg_poems.txt\n",
    "\n",
    "# Finally removing diacritic marks from alphabetic characters\n",
    "!unidecode < gutenberg_poems.txt > gutenberg_poems_clean.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only the chosen punctuation marks should be listed now: , . ' - ; ! : ?\n",
    "!grep -oE \"[^a-zA-Z ]\" gutenberg_poems_clean.txt | sort | uniq -c | sort -k1 -nr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Statistics of the cleaned file\n",
    "\n",
    "An earlier run of this pipeline produced about 3.09 million lines, 21.9 million words and a file of 116M.\n",
    "\n",
    "The word histogram splits on spaces only, so it is case-sensitive (`And` and `and` are counted separately) and punctuation stays attached to words."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Some basic processed file statistics:\n",
    "!echo -n \"Lines: \"\n",
    "!wc -l < gutenberg_poems_clean.txt\n",
    "!echo -n \"Words: \"\n",
    "!wc -w < gutenberg_poems_clean.txt\n",
    "!echo -n \"Characters: \"\n",
    "!wc -c < gutenberg_poems_clean.txt\n",
    "!echo -n \"Size: \"\n",
    "!ls -lh gutenberg_poems_clean.txt | awk '{print $5}'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ten random lines from the file; `shuf -n` stops by itself, so no pipe is closed early\n",
    "# and no broken-pipe error is printed\n",
    "!shuf -n 10 gutenberg_poems_clean.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Simple top 10 frequency histogram of letters (takes a while to run).\n",
    "# `sed -n '1,10p'` prints the first ten lines but keeps reading to the end of its input,\n",
    "# so the preceding `sort` never writes into a closed pipe.\n",
    "!grep -oE \"\\w\" gutenberg_poems_clean.txt | sort | uniq -c | sort -k1 -nr | sed -n '1,10p'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Simple top 10 frequency histogram of words (takes a while to run)\n",
    "!tr ' ' '\\n' < gutenberg_poems_clean.txt | sort | uniq -c | sort -k1 -nr | sed -n '1,10p'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compress for uploading\n",
    "\n",
    "`xz` replaces `gutenberg_poems_clean.txt` with `gutenberg_poems_clean.txt.xz`. On an earlier run the file shrank from 115.2 MiB to 34.6 MiB."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# -f overwrites an archive left over from a previous run instead of failing on it\n",
    "!xz -f -v gutenberg_poems_clean.txt"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}