lab3 v3

2023-03-29 11:01:35 +02:00 · 2023-03-29 11:01:35 +02:00 · be868f492b
commit be868f492b
parent 582e470488
37 changed files with 150 additions and 151 deletions
--- a/Lab3/files_bin/own_corpus.bin
+++ b/Lab3/files_bin/own_corpus.bin
--- a/Lab3/files_bin/own_corpus_codetable.bin
+++ b/Lab3/files_bin/own_corpus_codetable.bin
--- a/Lab3/files_bin/random_text_geometric_distribution.bin
+++ b/Lab3/files_bin/random_text_geometric_distribution.bin
--- a/Lab3/files_bin/random_text_geometric_distribution_codetable.bin
+++ b/Lab3/files_bin/random_text_geometric_distribution_codetable.bin
@ -0,0 +1 @@
 {'l': (7, 0), 'p': (9, 4), 'D': (15, 320), 'E': (15, 321), _EOF: (16, 644), 'A': (16, 645), 'B': (16, 646), 'C': (16, 647), 'w': (13, 81), 'v': (12, 41), 't': (11, 21), 'r': (10, 11), 'n': (8, 3), 'j': (6, 1), 'h': (5, 1), 'f': (4, 1), 'd': (3, 1), 'b': (2, 1), 'u': (12, 2048), 'y': (14, 8196), 'z': (14, 8197), 'x': (13, 4099), 's': (11, 1025), 'q': (10, 513), 'o': (9, 257), 'm': (8, 129), 'k': (7, 65), 'i': (6, 33), 'g': (5, 17), 'e': (4, 9), 'c': (3, 5), 'a': (2, 3)}
--- a/Lab3/files_bin/random_text_uniform_distribution.bin
+++ b/Lab3/files_bin/random_text_uniform_distribution.bin
--- a/Lab3/files_bin/random_text_uniform_distribution_codetable.bin
+++ b/Lab3/files_bin/random_text_uniform_distribution_codetable.bin
@ -0,0 +1 @@
 {'7': (5, 0), _EOF: (7, 4), 'C': (7, 5), 'r': (6, 3), '1': (6, 4), 'y': (6, 5), 'Z': (6, 6), 'm': (6, 7), '5': (6, 8), 'J': (6, 9), 'Y': (6, 10), 'E': (6, 11), 'v': (6, 12), 'p': (6, 13), 'c': (6, 14), 'w': (6, 15), 'B': (6, 16), 'g': (6, 17), '3': (6, 18), 'x': (6, 19), 'q': (6, 20), 's': (6, 21), 'b': (6, 22), 'i': (6, 23), 'k': (6, 24), '2': (6, 25), '9': (6, 26), 'G': (6, 27), 'S': (6, 28), 'A': (6, 29), 'f': (6, 30), 'l': (6, 31), 'e': (6, 32), 'M': (6, 33), 'W': (6, 34), 'P': (6, 35), 'O': (6, 36), 'j': (6, 37), '0': (6, 38), 'u': (6, 39), 'T': (6, 40), '4': (6, 41), 'o': (6, 42), 'I': (6, 43), '6': (6, 44), 't': (6, 45), 'L': (6, 46), '8': (6, 47), ' ': (6, 48), 'V': (6, 49), 'h': (6, 50), 'Q': (6, 51), 'U': (6, 52), 'F': (6, 53), 'K': (6, 54), 'n': (6, 55), 'R': (6, 56), 'z': (6, 57), 'H': (6, 58), 'a': (6, 59), 'd': (6, 60), 'N': (6, 61), 'D': (6, 62), 'X': (6, 63)}
--- a/Lab3/files_bin/random_text_uniform_two_point_05_distribution.bin
+++ b/Lab3/files_bin/random_text_uniform_two_point_05_distribution.bin
--- a/Lab3/files_bin/random_text_uniform_two_point_05_distribution_codetable.bin
+++ b/Lab3/files_bin/random_text_uniform_two_point_05_distribution_codetable.bin
@ -0,0 +1 @@
 {_EOF: (2, 0), '1': (2, 1), '0': (1, 1)}
--- a/Lab3/files_bin/random_text_uniform_two_point_09_distribution.bin
+++ b/Lab3/files_bin/random_text_uniform_two_point_09_distribution.bin
--- a/Lab3/files_bin/random_text_uniform_two_point_09_distribution_codetable.bin
+++ b/Lab3/files_bin/random_text_uniform_two_point_09_distribution_codetable.bin
--- a/Lab3/files_tar/own_corpus.tar.gz
+++ b/Lab3/files_tar/own_corpus.tar.gz
--- a/Lab3/files_tar/random_text_geometric_distribution.tar.gz
+++ b/Lab3/files_tar/random_text_geometric_distribution.tar.gz
--- a/Lab3/files_tar/random_text_uniform_distribution.tar.gz
+++ b/Lab3/files_tar/random_text_uniform_distribution.tar.gz
--- a/Lab3/files_tar/random_text_uniform_two_point_05_distribution.tar.gz
+++ b/Lab3/files_tar/random_text_uniform_two_point_05_distribution.tar.gz
--- a/Lab3/files_tar/random_text_uniform_two_point_09_distribution.tar.gz
+++ b/Lab3/files_tar/random_text_uniform_two_point_09_distribution.tar.gz
--- a/Lab3/files_txt/own_corpus.txt
+++ b/Lab3/files_txt/own_corpus.txt
--- a/Lab3/files_txt/random_text_geometric_distribution.txt
+++ b/Lab3/files_txt/random_text_geometric_distribution.txt
--- a/Lab3/files_txt/random_text_uniform_distribution.txt
+++ b/Lab3/files_txt/random_text_uniform_distribution.txt
--- a/Lab3/files_txt/random_text_uniform_two_point_05_distribution.txt
+++ b/Lab3/files_txt/random_text_uniform_two_point_05_distribution.txt
--- a/Lab3/files_txt/random_text_uniform_two_point_09_distribution.txt
+++ b/Lab3/files_txt/random_text_uniform_two_point_09_distribution.txt
--- a/Lab3/lab3_solution.ipynb
+++ b/Lab3/lab3_solution.ipynb
@ -18,51 +18,57 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 100,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import string\n",
    "import os\n",
    "\n",
    "# Set the length of the string to generate\n",
    "string_length = 1000000\n",
    "\n",
    "# Define the character set to choose from\n",
-    "character_set = np.array(list(string.ascii_letters + string.digits))"
+    "character_set = np.array(list(string.ascii_letters + string.digits + \" \"))"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 101,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.makedirs(\"./files_txt\", exist_ok=True)\n",
    "os.makedirs(\"./files_tar\", exist_ok=True)\n",
    "os.makedirs(\"./files_bin\", exist_ok=True)\n",
    "\n",
    "with open(\"../Lab1/out-merged.txt\", 'r') as file:\n",
    "    file_content = file.read()\n",
    "    first_chars = file_content[:string_length]\n",
    "\n",
-    "    with open(\"./own_corpus.txt\", 'w') as f:\n",
+    "    with open(\"files_txt/own_corpus.txt\", 'w') as f:\n",
-    "      f.write(first_chars)"
+    "      f.write(first_chars)\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 102,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate the random string using uniform distribution\n",
    "random_indices = np.random.uniform(low=0, high=len(character_set), size=string_length).astype(int)\n",
-    "random_string = ''.join(character_set[random_indices])\n",
+    "random_characters = [character_set[i % len(character_set)] for i in random_indices]\n",
    "random_string = ''.join(random_characters)\n",
    "\n",
-    "with open('random_text_uniform_distribution.txt', 'w') as f:\n",
+    "with open('files_txt/random_text_uniform_distribution.txt', 'w') as f:\n",
    "    f.write(random_string)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 103,
+   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
@ -73,14 +79,13 @@
    "random_characters = [character_set[i % len(character_set)] for i in random_indices]\n",
    "random_string = ''.join(random_characters)\n",
    "\n",
-    "\n",
+    "with open('files_txt/random_text_geometric_distribution.txt', 'w') as f:\n",
    "with open('random_text_geometric_distribution.txt', 'w') as f:\n",
    "    f.write(random_string)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 104,
+   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
@ -89,13 +94,13 @@
    "random_indices = np.random.choice([0, len(character_set)-1], size=string_length, p=[0.5, 0.5])\n",
    "random_string = ''.join(character_set[random_indices])\n",
    "\n",
-    "with open('random_text_uniform_two_point_05_distribution.txt', 'w') as f:\n",
+    "with open('files_txt/random_text_uniform_two_point_05_distribution.txt', 'w') as f:\n",
    "    f.write(random_string)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 105,
+   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
@ -104,7 +109,7 @@
    "random_indices = np.random.choice([0, len(character_set)-1], size=string_length, p=[0.1, 0.9])\n",
    "random_string = ''.join(character_set[random_indices])\n",
    "\n",
-    "with open('random_text_uniform_two_point_09_distribution.txt', 'w') as f:\n",
+    "with open('files_txt/random_text_uniform_two_point_09_distribution.txt', 'w') as f:\n",
    "    f.write(random_string)"
   ]
  },
@ -118,23 +123,23 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 106,
+   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Compression complete. The compressed archive is saved as own_corpus.tar.gz.\n",
+      "Compression complete. The compressed archive is saved as files_tar/own_corpus.tar.gz.\n",
-      "Compression ratio: 4.59738408845367\n",
+      "Compression ratio: 4.597193872860006\n",
-      "Compression complete. The compressed archive is saved as random_text_uniform_distribution.tar.gz.\n",
+      "Compression complete. The compressed archive is saved as files_tar/random_text_geometric_distribution.tar.gz.\n",
-      "Compression ratio: 1.3293011199361935\n",
+      "Compression ratio: 2.2354861064538483\n",
-      "Compression complete. The compressed archive is saved as random_text_geometric_distribution.tar.gz.\n",
+      "Compression complete. The compressed archive is saved as files_tar/random_text_uniform_distribution.tar.gz.\n",
-      "Compression ratio: 2.2415996054784695\n",
+      "Compression ratio: 1.3254319914218042\n",
-      "Compression complete. The compressed archive is saved as random_text_uniform_two_point_05_distribution.tar.gz.\n",
+      "Compression complete. The compressed archive is saved as files_tar/random_text_uniform_two_point_05_distribution.tar.gz.\n",
-      "Compression ratio: 6.6557955339611965\n",
+      "Compression ratio: 6.656903208627346\n",
-      "Compression complete. The compressed archive is saved as random_text_uniform_two_point_09_distribution.tar.gz.\n",
+      "Compression complete. The compressed archive is saved as files_tar/random_text_uniform_two_point_09_distribution.tar.gz.\n",
-      "Compression ratio: 12.250398137939483\n"
+      "Compression ratio: 12.2086705978586\n"
     ]
    }
   ],
@ -143,32 +148,34 @@
    "import os\n",
    "\n",
    "def compress_file(file_name):\n",
-    "    output_archive_name = file_name.replace('.txt', '.tar.gz')\n",
+    "    output_archive_name = \"files_tar/\" + file_name.split('/')[1].replace('.txt', '.tar.gz')\n",
    "    with tarfile.open(output_archive_name, 'w:gz') as tar:\n",
    "        tar.add(file_name)\n",
    "\n",
    "    print(f'Compression complete. The compressed archive is saved as {output_archive_name}.')\n",
    "    print(f'Compression ratio: {os.path.getsize(file_name) / os.path.getsize(output_archive_name)}')\n",
    "\n",
-    "file_names = ['own_corpus.txt', 'random_text_uniform_distribution.txt', 'random_text_geometric_distribution.txt', 'random_text_uniform_two_point_05_distribution.txt', 'random_text_uniform_two_point_09_distribution.txt']\n",
+    "\n",
    "file_names = [\"files_txt/\" + f for f in os.listdir('files_txt') if f.endswith('.txt')]\n",
    "file_names.sort()\n",
    "for file in file_names:\n",
    "    compress_file(file)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 107,
+   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Entropy for own_corpus.txt: 1.754256\n",
+      "Entropy for files_txt/own_corpus.txt: 1.754256\n",
-      "Entropy for random_text_uniform_distribution.txt: 6.016072\n",
+      "Entropy for files_txt/random_text_geometric_distribution.txt: 3.5624\n",
-      "Entropy for random_text_geometric_distribution.txt: 3.54952\n",
+      "Entropy for files_txt/random_text_uniform_distribution.txt: 6.033632\n",
-      "Entropy for random_text_uniform_two_point_05_distribution.txt: 1.272664\n",
+      "Entropy for files_txt/random_text_uniform_two_point_05_distribution.txt: 1.273352\n",
-      "Entropy for random_text_uniform_two_point_09_distribution.txt: 0.761104\n"
+      "Entropy for files_txt/random_text_uniform_two_point_09_distribution.txt: 0.761152\n"
     ]
    }
   ],
@ -183,64 +190,6 @@
    "    print(f\"Entropy for {file}: {entropy_by_compression(open(file, 'r').read())}\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compare file sizes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Size of own_corpus.txt: 1000000 bytes, 8000000 bits\n",
      "Size of random_text_uniform_distribution.txt: 1000000 bytes, 8000000 bits\n",
      "Size of random_text_geometric_distribution.txt: 100000 bytes, 800000 bits\n",
      "Size of random_text_uniform_two_point_05_distribution.txt: 1000000 bytes, 8000000 bits\n",
      "Size of random_text_uniform_two_point_09_distribution.txt: 1000000 bytes, 8000000 bits\n",
      "********************************************************************************\n",
      "Size of own_corpus.tar.gz: 217515 bytes, 1740120 bits\n",
      "Size of random_text_uniform_distribution.tar.gz: 752275 bytes, 6018200 bits\n",
      "Size of random_text_geometric_distribution.tar.gz: 44611 bytes, 356888 bits\n",
      "Size of random_text_uniform_two_point_05_distribution.tar.gz: 150245 bytes, 1201960 bits\n",
      "Size of random_text_uniform_two_point_09_distribution.tar.gz: 81630 bytes, 653040 bits\n",
      "********************************************************************************\n",
      "Size of own_corpus.txt + codetable: 544399 bytes, 548781 bits\n",
      "Size of random_text_uniform_distribution.txt + codetable: 748749 bytes, 754867 bits\n",
      "Size of random_text_geometric_distribution.txt + codetable: 37470 bytes, 40788 bits\n",
      "Size of random_text_uniform_two_point_05_distribution.txt + codetable: 187473 bytes, 187753 bits\n",
      "Size of random_text_uniform_two_point_09_distribution.txt + codetable: 137531 bytes, 137811 bits\n"
     ]
    }
   ],
   "source": [
    "# print raw text files sizes\n",
    "for file in file_names:\n",
    "    print(f\"Size of {file}: {os.path.getsize(file)} bytes, {os.path.getsize(file)*8} bits\")\n",
    "\n",
    "print(\"*\" * 80)\n",
    "\n",
    "# print compressed text files sizes\n",
    "for file in file_names:\n",
    "    file = file.replace('.txt', '.tar.gz')\n",
    "    print(f\"Size of {file}: {os.path.getsize(file)} bytes, {os.path.getsize(file)*8} bits\")\n",
    "\n",
    "print(\"*\" * 80)\n",
    "\n",
    "# print compressed with Huffman text files sizes\n",
    "for file in file_names:\n",
    "    file1 = file.replace('.txt', '.bin')\n",
    "    file2 = file.replace('.txt', '_codetable.bin')\n",
    "    print(f\"Size of {file} + codetable: {os.path.getsize(file1) + os.path.getsize(file2)} bytes, {os.path.getsize(file1) + os.path.getsize(file2)*8} bits\")\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
@ -251,28 +200,28 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 109,
+   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Calculating Huffman code for file: own_corpus.txt...\n",
+      "Calculating Huffman code for file: files_txt/own_corpus.txt...\n",
      "First 3:      r e s\n",
      "Binary:      0100 001 0001\n",
-      "Calculating Huffman code for file: random_text_uniform_distribution.txt...\n",
+      "Calculating Huffman code for file: files_txt/random_text_geometric_distribution.txt...\n",
-      "First 3:      H W 8\n",
+      "First 3:      b c b\n",
-      "Binary:      111010 001011 110101\n",
+      "Binary:      01 101 01\n",
-      "Calculating Huffman code for file: random_text_geometric_distribution.txt...\n",
+      "Calculating Huffman code for file: files_txt/random_text_uniform_distribution.txt...\n",
-      "First 3:      b a a\n",
+      "First 3:      3 C t\n",
-      "Binary:      01 11 11\n",
+      "Binary:      010010 0000101 101101\n",
-      "Calculating Huffman code for file: random_text_uniform_two_point_05_distribution.txt...\n",
+      "Calculating Huffman code for file: files_txt/random_text_uniform_two_point_05_distribution.txt...\n",
-      "First 3:      0 0 0\n",
+      "First 3:      1 0 0\n",
-      "Binary:      01 01 01\n",
+      "Binary:      01 1 1\n",
-      "Calculating Huffman code for file: random_text_uniform_two_point_09_distribution.txt...\n",
+      "Calculating Huffman code for file: files_txt/random_text_uniform_two_point_09_distribution.txt...\n",
-      "First 3:      1 1 1\n",
+      "First 3:      1 0 1\n",
-      "Binary:      1 1 1\n"
+      "Binary:      1 01 1\n"
     ]
    }
   ],
@ -298,7 +247,7 @@
    "    return decoded[:n]\n",
    "\n",
    "def save_to_bin(bytes, file_name):\n",
-    "    with open(file_name, 'wb') as f:\n",
+    "    with open(\"files_bin/\" + file_name.split('/')[1], 'wb') as f:\n",
    "        f.write(bytes)\n",
    "\n",
    "def number_to_bin(number, nbits):\n",
@ -308,12 +257,69 @@
    "    print(f\"Calculating Huffman code for file: {file}...\")\n",
    "    encoded, code_table = encode_and_print(open(file, 'r').read())\n",
    "    save_to_bin(encoded, file.replace('.txt', '.bin'))\n",
-    "    save_to_bin(code_table.encode(), file.replace('.txt', '_codetable.bin'))\n",
+    "    save_to_bin(code_table.encode(), file.replace('.txt', '_codetable.bin'))\n"
    "\n",
    "# Nie do końca rozumiem jak mam zapisać ten codec."
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compare file sizes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Size of files_txt/own_corpus.txt: 1000000 bytes, 8000000 bits\n",
      "Size of files_txt/random_text_geometric_distribution.txt: 100000 bytes, 800000 bits\n",
      "Size of files_txt/random_text_uniform_distribution.txt: 1000000 bytes, 8000000 bits\n",
      "Size of files_txt/random_text_uniform_two_point_05_distribution.txt: 1000000 bytes, 8000000 bits\n",
      "Size of files_txt/random_text_uniform_two_point_09_distribution.txt: 1000000 bytes, 8000000 bits\n",
      "********************************************************************************\n",
      "Size of files_tar/own_corpus.tar.gz: 217524 bytes, 1740192 bits\n",
      "Size of files_tar/random_text_geometric_distribution.tar.gz: 44733 bytes, 357864 bits\n",
      "Size of files_tar/random_text_uniform_distribution.tar.gz: 754471 bytes, 6035768 bits\n",
      "Size of files_tar/random_text_uniform_two_point_05_distribution.tar.gz: 150220 bytes, 1201760 bits\n",
      "Size of files_tar/random_text_uniform_two_point_09_distribution.tar.gz: 81909 bytes, 655272 bits\n",
      "********************************************************************************\n",
      "Size of files_txt/own_corpus.txt + codetable: 544399 bytes, 548781 bits\n",
      "Size of files_txt/random_text_geometric_distribution.txt + codetable: 37584 bytes, 40895 bits\n",
      "Size of files_txt/random_text_uniform_distribution.txt + codetable: 750834 bytes, 757043 bits\n",
      "Size of files_txt/random_text_uniform_two_point_05_distribution.txt + codetable: 187491 bytes, 187771 bits\n",
      "Size of files_txt/random_text_uniform_two_point_09_distribution.txt + codetable: 137530 bytes, 137810 bits\n"
     ]
    }
   ],
   "source": [
    "# print raw text files sizes\n",
    "for file in file_names:\n",
    "    print(f\"Size of {file}: {os.path.getsize(file)} bytes, {os.path.getsize(file)*8} bits\")\n",
    "\n",
    "print(\"*\" * 80)\n",
    "\n",
    "# print compressed text files sizes\n",
    "for file in file_names:\n",
    "    file = file.replace('.txt', '.tar.gz').replace('files_txt', 'files_tar')\n",
    "    print(f\"Size of {file}: {os.path.getsize(file)} bytes, {os.path.getsize(file)*8} bits\")\n",
    "\n",
    "print(\"*\" * 80)\n",
    "\n",
    "# print compressed with Huffman text files sizes\n",
    "for file in file_names:\n",
    "    file1 = file.replace('.txt', '.bin').replace('files_txt', 'files_bin')\n",
    "    file2 = file.replace('.txt', '_codetable.bin').replace('files_txt', 'files_bin')\n",
    "    # Total on-disk size of the Huffman payload plus its code table.\n",
    "    total_bytes = os.path.getsize(file1) + os.path.getsize(file2)\n",
    "    # Bits are 8x the *whole* byte total; previously only the code table size was multiplied by 8.\n",
    "    print(f\"Size of {file} + codetable: {total_bytes} bytes, {total_bytes * 8} bits\")\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@ -321,11 +327,11 @@
    "    \n",
    "|                            | Entropia  |\n",
    "| -----------                | ----------- |\n",
-    "| tekst  w jęz. naturalnym   | 1.8044238296689334|\n",
+    "| tekst  w jęz. naturalnym   | 1.754256|\n",
-    "| losowy tekst (jednostajny)   |  6.016344    |\n",
+    "| losowy tekst (jednostajny)   |   6.033632   |\n",
-    "| losowy tekst (geometryczny)|  3.5592  |\n",
+    "| losowy tekst (geometryczny)|  3.5624  |\n",
-    "| losowy tekst (dwupunktowy 0.5) |    1.27216    |\n",
+    "| losowy tekst (dwupunktowy 0.5) |    1.273352   |\n",
-    "| losowy tekst (dwupunktowy 0.9) |    0.760824     |\n"
+    "| losowy tekst (dwupunktowy 0.9) |    0.761152     |\n"
   ]
  },
  {
@ -374,9 +380,21 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 127,
+   "execution_count": 22,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Entropy for words in files_txt/own_corpus.txt: 9.27320212652544\n",
      "Entropy for words in files_txt/random_text_geometric_distribution.txt: -0.0\n",
      "Entropy for words in files_txt/random_text_uniform_distribution.txt: 13.897386156097086\n",
      "Entropy for words in files_txt/random_text_uniform_two_point_05_distribution.txt: -0.0\n",
      "Entropy for words in files_txt/random_text_uniform_two_point_09_distribution.txt: -0.0\n"
     ]
    }
   ],
   "source": [
    "import regex as re\n",
    "from collections import Counter\n",
@ -389,29 +407,10 @@
    "def unigram_entropy(t):\n",
    "  counter = Counter(t)\n",
    "  total = sum(counter.values())\n",
-    "  return -sum((p := count / total) * log(p, 2) for count in counter.values())"
+    "  return -sum((p := count / total) * log(p, 2) for count in counter.values())\n",
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "9.27320212652544\n"
     ]
    }
   ],
   "source": [
    "file_content = \"\"\n",
    "with open(\"own_corpus.txt\", 'r') as file:\n",
    "    file_content = file.read()\n",
    "\n",
-    "words = list(get_words(file_content))\n",
+    "for file in file_names:\n",
-    "print(unigram_entropy(words))"
+    "    print(f\"Entropy for words in {file}: {unigram_entropy(get_words(open(file, 'r').read()))}\")"
   ]
  },
  {
@ -424,8 +423,8 @@
    "|                            | Entropia    |\n",
    "| -----------                | ----------- |\n",
    "| tekst  w jęz. naturalnym   |9.27320212652544|\n",
-    "| losowy tekst (jednostajny)   |  6.016344    |\n",
+    "| losowy tekst (jednostajny)   | 13.897386156097086    |\n",
-    "| losowy tekst (geometryczny)|  3.5592  |\n"
+    "| losowy tekst (geometryczny)|  0  |\n"
   ]
  },
  {
@ -461,7 +460,7 @@
    "- Korpusy bez spacji mają większą tablicę kodową niż nieskompresowany plik\n",
    "- Kompresowanie na wyrazach wydaje się być gorsze niż na znakach z powodu ogromnej tablicy kodowej\n",
    "- W języku naturalnym częściej występują te same wyrazy niż w losowym tekście (jednostajnym)\n",
-    "- Kompresowanie huffmanem na słowach dla plików z jednym wyrazem nie ma sesnu"
+    "- Kompresowanie huffmanem na słowach dla plików z jednym wyrazem nie ma sensu"
   ]
  },
  {
--- a/Lab3/own_corpus.tar.gz
+++ b/Lab3/own_corpus.tar.gz
--- a/Lab3/random_text_geometric_distribution.bin
+++ b/Lab3/random_text_geometric_distribution.bin
--- a/Lab3/random_text_geometric_distribution.tar.gz
+++ b/Lab3/random_text_geometric_distribution.tar.gz
--- a/Lab3/random_text_geometric_distribution.txt
+++ b/Lab3/random_text_geometric_distribution.txt
--- a/Lab3/random_text_geometric_distribution_codetable.bin
+++ b/Lab3/random_text_geometric_distribution_codetable.bin
@ -1 +0,0 @@
 {'p': (9, 0), 'u': (11, 4), 'E': (15, 80), _EOF: (16, 162), 'D': (16, 163), 'C': (15, 82), 'A': (15, 83), 'w': (13, 21), 'v': (12, 11), 'r': (10, 3), 'n': (8, 1), 'l': (7, 1), 'j': (6, 1), 'h': (5, 1), 'f': (4, 1), 'd': (3, 1), 'b': (2, 1), 'o': (9, 256), 'q': (10, 514), 'x': (13, 4120), 'y': (14, 8242), 'z': (14, 8243), 't': (12, 2061), 's': (11, 1031), 'm': (8, 129), 'k': (7, 65), 'i': (6, 33), 'g': (5, 17), 'e': (4, 9), 'c': (3, 5), 'a': (2, 3)}
--- a/Lab3/random_text_uniform_distribution.bin
+++ b/Lab3/random_text_uniform_distribution.bin
--- a/Lab3/random_text_uniform_distribution.tar.gz
+++ b/Lab3/random_text_uniform_distribution.tar.gz
--- a/Lab3/random_text_uniform_distribution.txt
+++ b/Lab3/random_text_uniform_distribution.txt
--- a/Lab3/random_text_uniform_distribution_codetable.bin
+++ b/Lab3/random_text_uniform_distribution_codetable.bin
@ -1 +0,0 @@
 {'A': (5, 0), 'b': (5, 1), _EOF: (7, 8), 'O': (7, 9), 'Y': (6, 5), '9': (6, 6), 't': (6, 7), '1': (6, 8), 'X': (6, 9), 'e': (6, 10), 'W': (6, 11), '4': (6, 12), '3': (6, 13), 'o': (6, 14), 'q': (6, 15), 'T': (6, 16), 'l': (6, 17), 'J': (6, 18), 'y': (6, 19), '6': (6, 20), 'F': (6, 21), 'G': (6, 22), 'Q': (6, 23), 'K': (6, 24), 'N': (6, 25), 'S': (6, 26), 'f': (6, 27), '5': (6, 28), 'L': (6, 29), 'd': (6, 30), 'D': (6, 31), 'M': (6, 32), 'n': (6, 33), 'u': (6, 34), 'B': (6, 35), '2': (6, 36), 'a': (6, 37), '0': (6, 38), '7': (6, 39), 'P': (6, 40), 'E': (6, 41), 'j': (6, 42), 'z': (6, 43), 'C': (6, 44), 'h': (6, 45), 'i': (6, 46), 'c': (6, 47), 'm': (6, 48), 'R': (6, 49), 'k': (6, 50), 'I': (6, 51), 'U': (6, 52), '8': (6, 53), 'Z': (6, 54), 'g': (6, 55), 's': (6, 56), 'V': (6, 57), 'H': (6, 58), 'w': (6, 59), 'r': (6, 60), 'x': (6, 61), 'p': (6, 62), 'v': (6, 63)}
--- a/Lab3/random_text_uniform_two_point_05_distribution.bin
+++ b/Lab3/random_text_uniform_two_point_05_distribution.bin
--- a/Lab3/random_text_uniform_two_point_05_distribution.tar.gz
+++ b/Lab3/random_text_uniform_two_point_05_distribution.tar.gz
--- a/Lab3/random_text_uniform_two_point_05_distribution.txt
+++ b/Lab3/random_text_uniform_two_point_05_distribution.txt
--- a/Lab3/random_text_uniform_two_point_05_distribution_codetable.bin
+++ b/Lab3/random_text_uniform_two_point_05_distribution_codetable.bin
@ -1 +0,0 @@
 {_EOF: (2, 0), '0': (2, 1), '1': (1, 1)}
--- a/Lab3/random_text_uniform_two_point_09_distribution.bin
+++ b/Lab3/random_text_uniform_two_point_09_distribution.bin
--- a/Lab3/random_text_uniform_two_point_09_distribution.tar.gz
+++ b/Lab3/random_text_uniform_two_point_09_distribution.tar.gz
--- a/Lab3/random_text_uniform_two_point_09_distribution.txt
+++ b/Lab3/random_text_uniform_two_point_09_distribution.txt
		`@ -0,0 +1 @@`
							`{'l': (7, 0), 'p': (9, 4), 'D': (15, 320), 'E': (15, 321), _EOF: (16, 644), 'A': (16, 645), 'B': (16, 646), 'C': (16, 647), 'w': (13, 81), 'v': (12, 41), 't': (11, 21), 'r': (10, 11), 'n': (8, 3), 'j': (6, 1), 'h': (5, 1), 'f': (4, 1), 'd': (3, 1), 'b': (2, 1), 'u': (12, 2048), 'y': (14, 8196), 'z': (14, 8197), 'x': (13, 4099), 's': (11, 1025), 'q': (10, 513), 'o': (9, 257), 'm': (8, 129), 'k': (7, 65), 'i': (6, 33), 'g': (5, 17), 'e': (4, 9), 'c': (3, 5), 'a': (2, 3)}`
		`@ -0,0 +1 @@`
							{'7': (5, 0), _EOF: (7, 4), 'C': (7, 5), 'r': (6, 3), '1': (6, 4), 'y': (6, 5), 'Z': (6, 6), 'm': (6, 7), '5': (6, 8), 'J': (6, 9), 'Y': (6, 10), 'E': (6, 11), 'v': (6, 12), 'p': (6, 13), 'c': (6, 14), 'w': (6, 15), 'B': (6, 16), 'g': (6, 17), '3': (6, 18), 'x': (6, 19), 'q': (6, 20), 's': (6, 21), 'b': (6, 22), 'i': (6, 23), 'k': (6, 24), '2': (6, 25), '9': (6, 26), 'G': (6, 27), 'S': (6, 28), 'A': (6, 29), 'f': (6, 30), 'l': (6, 31), 'e': (6, 32), 'M': (6, 33), 'W': (6, 34), 'P': (6, 35), 'O': (6, 36), 'j': (6, 37), '0': (6, 38), 'u': (6, 39), 'T': (6, 40), '4': (6, 41), 'o': (6, 42), 'I': (6, 43), '6': (6, 44), 't': (6, 45), 'L': (6, 46), '8': (6, 47), ' ': (6, 48), 'V': (6, 49), 'h': (6, 50), 'Q': (6, 51), 'U': (6, 52), 'F': (6, 53), 'K': (6, 54), 'n': (6, 55), 'R': (6, 56), 'z': (6, 57), 'H': (6, 58), 'a': (6, 59), 'd': (6, 60), 'N': (6, 61), 'D': (6, 62), 'X': (6, 63)}
		`@ -1 +0,0 @@`
			`{'p': (9, 0), 'u': (11, 4), 'E': (15, 80), _EOF: (16, 162), 'D': (16, 163), 'C': (15, 82), 'A': (15, 83), 'w': (13, 21), 'v': (12, 11), 'r': (10, 3), 'n': (8, 1), 'l': (7, 1), 'j': (6, 1), 'h': (5, 1), 'f': (4, 1), 'd': (3, 1), 'b': (2, 1), 'o': (9, 256), 'q': (10, 514), 'x': (13, 4120), 'y': (14, 8242), 'z': (14, 8243), 't': (12, 2061), 's': (11, 1031), 'm': (8, 129), 'k': (7, 65), 'i': (6, 33), 'g': (5, 17), 'e': (4, 9), 'c': (3, 5), 'a': (2, 3)}`
		`@ -1 +0,0 @@`
			{'A': (5, 0), 'b': (5, 1), _EOF: (7, 8), 'O': (7, 9), 'Y': (6, 5), '9': (6, 6), 't': (6, 7), '1': (6, 8), 'X': (6, 9), 'e': (6, 10), 'W': (6, 11), '4': (6, 12), '3': (6, 13), 'o': (6, 14), 'q': (6, 15), 'T': (6, 16), 'l': (6, 17), 'J': (6, 18), 'y': (6, 19), '6': (6, 20), 'F': (6, 21), 'G': (6, 22), 'Q': (6, 23), 'K': (6, 24), 'N': (6, 25), 'S': (6, 26), 'f': (6, 27), '5': (6, 28), 'L': (6, 29), 'd': (6, 30), 'D': (6, 31), 'M': (6, 32), 'n': (6, 33), 'u': (6, 34), 'B': (6, 35), '2': (6, 36), 'a': (6, 37), '0': (6, 38), '7': (6, 39), 'P': (6, 40), 'E': (6, 41), 'j': (6, 42), 'z': (6, 43), 'C': (6, 44), 'h': (6, 45), 'i': (6, 46), 'c': (6, 47), 'm': (6, 48), 'R': (6, 49), 'k': (6, 50), 'I': (6, 51), 'U': (6, 52), '8': (6, 53), 'Z': (6, 54), 'g': (6, 55), 's': (6, 56), 'V': (6, 57), 'H': (6, 58), 'w': (6, 59), 'r': (6, 60), 'x': (6, 61), 'p': (6, 62), 'v': (6, 63)}