This commit is contained in:
Adam Wojdyla 2023-03-29 11:01:35 +02:00
parent 582e470488
commit be868f492b
37 changed files with 150 additions and 151 deletions

Binary file not shown.

View File

@ -0,0 +1 @@
{'l': (7, 0), 'p': (9, 4), 'D': (15, 320), 'E': (15, 321), _EOF: (16, 644), 'A': (16, 645), 'B': (16, 646), 'C': (16, 647), 'w': (13, 81), 'v': (12, 41), 't': (11, 21), 'r': (10, 11), 'n': (8, 3), 'j': (6, 1), 'h': (5, 1), 'f': (4, 1), 'd': (3, 1), 'b': (2, 1), 'u': (12, 2048), 'y': (14, 8196), 'z': (14, 8197), 'x': (13, 4099), 's': (11, 1025), 'q': (10, 513), 'o': (9, 257), 'm': (8, 129), 'k': (7, 65), 'i': (6, 33), 'g': (5, 17), 'e': (4, 9), 'c': (3, 5), 'a': (2, 3)}

Binary file not shown.

View File

@ -0,0 +1 @@
{'7': (5, 0), _EOF: (7, 4), 'C': (7, 5), 'r': (6, 3), '1': (6, 4), 'y': (6, 5), 'Z': (6, 6), 'm': (6, 7), '5': (6, 8), 'J': (6, 9), 'Y': (6, 10), 'E': (6, 11), 'v': (6, 12), 'p': (6, 13), 'c': (6, 14), 'w': (6, 15), 'B': (6, 16), 'g': (6, 17), '3': (6, 18), 'x': (6, 19), 'q': (6, 20), 's': (6, 21), 'b': (6, 22), 'i': (6, 23), 'k': (6, 24), '2': (6, 25), '9': (6, 26), 'G': (6, 27), 'S': (6, 28), 'A': (6, 29), 'f': (6, 30), 'l': (6, 31), 'e': (6, 32), 'M': (6, 33), 'W': (6, 34), 'P': (6, 35), 'O': (6, 36), 'j': (6, 37), '0': (6, 38), 'u': (6, 39), 'T': (6, 40), '4': (6, 41), 'o': (6, 42), 'I': (6, 43), '6': (6, 44), 't': (6, 45), 'L': (6, 46), '8': (6, 47), ' ': (6, 48), 'V': (6, 49), 'h': (6, 50), 'Q': (6, 51), 'U': (6, 52), 'F': (6, 53), 'K': (6, 54), 'n': (6, 55), 'R': (6, 56), 'z': (6, 57), 'H': (6, 58), 'a': (6, 59), 'd': (6, 60), 'N': (6, 61), 'D': (6, 62), 'X': (6, 63)}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
{_EOF: (2, 0), '1': (2, 1), '0': (1, 1)}

File diff suppressed because one or more lines are too long

Binary file not shown.

Binary file not shown.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -18,51 +18,57 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 100, "execution_count": 12,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import numpy as np\n", "import numpy as np\n",
"import string\n", "import string\n",
"import os\n",
"\n", "\n",
"# Set the length of the string to generate\n", "# Set the length of the string to generate\n",
"string_length = 1000000\n", "string_length = 1000000\n",
"\n", "\n",
"# Define the character set to choose from\n", "# Define the character set to choose from\n",
"character_set = np.array(list(string.ascii_letters + string.digits))" "character_set = np.array(list(string.ascii_letters + string.digits + \" \"))"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 101, "execution_count": 13,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"os.makedirs(\"./files_txt\", exist_ok=True)\n",
"os.makedirs(\"./files_tar\", exist_ok=True)\n",
"os.makedirs(\"./files_bin\", exist_ok=True)\n",
"\n",
"with open(\"../Lab1/out-merged.txt\", 'r') as file:\n", "with open(\"../Lab1/out-merged.txt\", 'r') as file:\n",
" file_content = file.read()\n", " file_content = file.read()\n",
" first_chars = file_content[:string_length]\n", " first_chars = file_content[:string_length]\n",
"\n", "\n",
" with open(\"./own_corpus.txt\", 'w') as f:\n", " with open(\"files_txt/own_corpus.txt\", 'w') as f:\n",
" f.write(first_chars)" " f.write(first_chars)\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 102, "execution_count": 14,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Generate the random string using uniform distribution\n", "# Generate the random string using uniform distribution\n",
"random_indices = np.random.uniform(low=0, high=len(character_set), size=string_length).astype(int)\n", "random_indices = np.random.uniform(low=0, high=len(character_set), size=string_length).astype(int)\n",
"random_string = ''.join(character_set[random_indices])\n", "random_characters = [character_set[i % len(character_set)] for i in random_indices]\n",
"random_string = ''.join(random_characters)\n",
"\n", "\n",
"with open('random_text_uniform_distribution.txt', 'w') as f:\n", "with open('files_txt/random_text_uniform_distribution.txt', 'w') as f:\n",
" f.write(random_string)" " f.write(random_string)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 103, "execution_count": 15,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -73,14 +79,13 @@
"random_characters = [character_set[i % len(character_set)] for i in random_indices]\n", "random_characters = [character_set[i % len(character_set)] for i in random_indices]\n",
"random_string = ''.join(random_characters)\n", "random_string = ''.join(random_characters)\n",
"\n", "\n",
"\n", "with open('files_txt/random_text_geometric_distribution.txt', 'w') as f:\n",
"with open('random_text_geometric_distribution.txt', 'w') as f:\n",
" f.write(random_string)" " f.write(random_string)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 104, "execution_count": 16,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -89,13 +94,13 @@
"random_indices = np.random.choice([0, len(character_set)-1], size=string_length, p=[0.5, 0.5])\n", "random_indices = np.random.choice([0, len(character_set)-1], size=string_length, p=[0.5, 0.5])\n",
"random_string = ''.join(character_set[random_indices])\n", "random_string = ''.join(character_set[random_indices])\n",
"\n", "\n",
"with open('random_text_uniform_two_point_05_distribution.txt', 'w') as f:\n", "with open('files_txt/random_text_uniform_two_point_05_distribution.txt', 'w') as f:\n",
" f.write(random_string)" " f.write(random_string)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 105, "execution_count": 17,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -104,7 +109,7 @@
"random_indices = np.random.choice([0, len(character_set)-1], size=string_length, p=[0.1, 0.9])\n", "random_indices = np.random.choice([0, len(character_set)-1], size=string_length, p=[0.1, 0.9])\n",
"random_string = ''.join(character_set[random_indices])\n", "random_string = ''.join(character_set[random_indices])\n",
"\n", "\n",
"with open('random_text_uniform_two_point_09_distribution.txt', 'w') as f:\n", "with open('files_txt/random_text_uniform_two_point_09_distribution.txt', 'w') as f:\n",
" f.write(random_string)" " f.write(random_string)"
] ]
}, },
@ -118,23 +123,23 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 106, "execution_count": 18,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Compression complete. The compressed archive is saved as own_corpus.tar.gz.\n", "Compression complete. The compressed archive is saved as files_tar/own_corpus.tar.gz.\n",
"Compression ratio: 4.59738408845367\n", "Compression ratio: 4.597193872860006\n",
"Compression complete. The compressed archive is saved as random_text_uniform_distribution.tar.gz.\n", "Compression complete. The compressed archive is saved as files_tar/random_text_geometric_distribution.tar.gz.\n",
"Compression ratio: 1.3293011199361935\n", "Compression ratio: 2.2354861064538483\n",
"Compression complete. The compressed archive is saved as random_text_geometric_distribution.tar.gz.\n", "Compression complete. The compressed archive is saved as files_tar/random_text_uniform_distribution.tar.gz.\n",
"Compression ratio: 2.2415996054784695\n", "Compression ratio: 1.3254319914218042\n",
"Compression complete. The compressed archive is saved as random_text_uniform_two_point_05_distribution.tar.gz.\n", "Compression complete. The compressed archive is saved as files_tar/random_text_uniform_two_point_05_distribution.tar.gz.\n",
"Compression ratio: 6.6557955339611965\n", "Compression ratio: 6.656903208627346\n",
"Compression complete. The compressed archive is saved as random_text_uniform_two_point_09_distribution.tar.gz.\n", "Compression complete. The compressed archive is saved as files_tar/random_text_uniform_two_point_09_distribution.tar.gz.\n",
"Compression ratio: 12.250398137939483\n" "Compression ratio: 12.2086705978586\n"
] ]
} }
], ],
@ -143,32 +148,34 @@
"import os\n", "import os\n",
"\n", "\n",
"def compress_file(file_name):\n", "def compress_file(file_name):\n",
" output_archive_name = file_name.replace('.txt', '.tar.gz')\n", " output_archive_name = \"files_tar/\" + file_name.split('/')[1].replace('.txt', '.tar.gz')\n",
" with tarfile.open(output_archive_name, 'w:gz') as tar:\n", " with tarfile.open(output_archive_name, 'w:gz') as tar:\n",
" tar.add(file_name)\n", " tar.add(file_name)\n",
"\n", "\n",
" print(f'Compression complete. The compressed archive is saved as {output_archive_name}.')\n", " print(f'Compression complete. The compressed archive is saved as {output_archive_name}.')\n",
" print(f'Compression ratio: {os.path.getsize(file_name) / os.path.getsize(output_archive_name)}')\n", " print(f'Compression ratio: {os.path.getsize(file_name) / os.path.getsize(output_archive_name)}')\n",
"\n", "\n",
"file_names = ['own_corpus.txt', 'random_text_uniform_distribution.txt', 'random_text_geometric_distribution.txt', 'random_text_uniform_two_point_05_distribution.txt', 'random_text_uniform_two_point_09_distribution.txt']\n", "\n",
"file_names = [\"files_txt/\" + f for f in os.listdir('files_txt') if f.endswith('.txt')]\n",
"file_names.sort()\n",
"for file in file_names:\n", "for file in file_names:\n",
" compress_file(file)" " compress_file(file)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 107, "execution_count": 19,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Entropy for own_corpus.txt: 1.754256\n", "Entropy for files_txt/own_corpus.txt: 1.754256\n",
"Entropy for random_text_uniform_distribution.txt: 6.016072\n", "Entropy for files_txt/random_text_geometric_distribution.txt: 3.5624\n",
"Entropy for random_text_geometric_distribution.txt: 3.54952\n", "Entropy for files_txt/random_text_uniform_distribution.txt: 6.033632\n",
"Entropy for random_text_uniform_two_point_05_distribution.txt: 1.272664\n", "Entropy for files_txt/random_text_uniform_two_point_05_distribution.txt: 1.273352\n",
"Entropy for random_text_uniform_two_point_09_distribution.txt: 0.761104\n" "Entropy for files_txt/random_text_uniform_two_point_09_distribution.txt: 0.761152\n"
] ]
} }
], ],
@ -183,64 +190,6 @@
" print(f\"Entropy for {file}: {entropy_by_compression(open(file, 'r').read())}\")" " print(f\"Entropy for {file}: {entropy_by_compression(open(file, 'r').read())}\")"
] ]
}, },
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compare file sizes"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Size of own_corpus.txt: 1000000 bytes, 8000000 bits\n",
"Size of random_text_uniform_distribution.txt: 1000000 bytes, 8000000 bits\n",
"Size of random_text_geometric_distribution.txt: 100000 bytes, 800000 bits\n",
"Size of random_text_uniform_two_point_05_distribution.txt: 1000000 bytes, 8000000 bits\n",
"Size of random_text_uniform_two_point_09_distribution.txt: 1000000 bytes, 8000000 bits\n",
"********************************************************************************\n",
"Size of own_corpus.tar.gz: 217515 bytes, 1740120 bits\n",
"Size of random_text_uniform_distribution.tar.gz: 752275 bytes, 6018200 bits\n",
"Size of random_text_geometric_distribution.tar.gz: 44611 bytes, 356888 bits\n",
"Size of random_text_uniform_two_point_05_distribution.tar.gz: 150245 bytes, 1201960 bits\n",
"Size of random_text_uniform_two_point_09_distribution.tar.gz: 81630 bytes, 653040 bits\n",
"********************************************************************************\n",
"Size of own_corpus.txt + codetable: 544399 bytes, 548781 bits\n",
"Size of random_text_uniform_distribution.txt + codetable: 748749 bytes, 754867 bits\n",
"Size of random_text_geometric_distribution.txt + codetable: 37470 bytes, 40788 bits\n",
"Size of random_text_uniform_two_point_05_distribution.txt + codetable: 187473 bytes, 187753 bits\n",
"Size of random_text_uniform_two_point_09_distribution.txt + codetable: 137531 bytes, 137811 bits\n"
]
}
],
"source": [
"# print raw text files sizes\n",
"for file in file_names:\n",
" print(f\"Size of {file}: {os.path.getsize(file)} bytes, {os.path.getsize(file)*8} bits\")\n",
"\n",
"print(\"*\" * 80)\n",
"\n",
"# print compressed text files sizes\n",
"for file in file_names:\n",
" file = file.replace('.txt', '.tar.gz')\n",
" print(f\"Size of {file}: {os.path.getsize(file)} bytes, {os.path.getsize(file)*8} bits\")\n",
"\n",
"print(\"*\" * 80)\n",
"\n",
"# print compressed with Huffman text files sizes\n",
"for file in file_names:\n",
" file1 = file.replace('.txt', '.bin')\n",
" file2 = file.replace('.txt', '_codetable.bin')\n",
" print(f\"Size of {file} + codetable: {os.path.getsize(file1) + os.path.getsize(file2)} bytes, {os.path.getsize(file1) + os.path.getsize(file2)*8} bits\")\n"
]
},
{ {
"attachments": {}, "attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
@ -251,28 +200,28 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 109, "execution_count": 20,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Calculating Huffman code for file: own_corpus.txt...\n", "Calculating Huffman code for file: files_txt/own_corpus.txt...\n",
"First 3: r e s\n", "First 3: r e s\n",
"Binary: 0100 001 0001\n", "Binary: 0100 001 0001\n",
"Calculating Huffman code for file: random_text_uniform_distribution.txt...\n", "Calculating Huffman code for file: files_txt/random_text_geometric_distribution.txt...\n",
"First 3: H W 8\n", "First 3: b c b\n",
"Binary: 111010 001011 110101\n", "Binary: 01 101 01\n",
"Calculating Huffman code for file: random_text_geometric_distribution.txt...\n", "Calculating Huffman code for file: files_txt/random_text_uniform_distribution.txt...\n",
"First 3: b a a\n", "First 3: 3 C t\n",
"Binary: 01 11 11\n", "Binary: 010010 0000101 101101\n",
"Calculating Huffman code for file: random_text_uniform_two_point_05_distribution.txt...\n", "Calculating Huffman code for file: files_txt/random_text_uniform_two_point_05_distribution.txt...\n",
"First 3: 0 0 0\n", "First 3: 1 0 0\n",
"Binary: 01 01 01\n", "Binary: 01 1 1\n",
"Calculating Huffman code for file: random_text_uniform_two_point_09_distribution.txt...\n", "Calculating Huffman code for file: files_txt/random_text_uniform_two_point_09_distribution.txt...\n",
"First 3: 1 1 1\n", "First 3: 1 0 1\n",
"Binary: 1 1 1\n" "Binary: 1 01 1\n"
] ]
} }
], ],
@ -298,7 +247,7 @@
" return decoded[:n]\n", " return decoded[:n]\n",
"\n", "\n",
"def save_to_bin(bytes, file_name):\n", "def save_to_bin(bytes, file_name):\n",
" with open(file_name, 'wb') as f:\n", " with open(\"files_bin/\" + file_name.split('/')[1], 'wb') as f:\n",
" f.write(bytes)\n", " f.write(bytes)\n",
"\n", "\n",
"def number_to_bin(number, nbits):\n", "def number_to_bin(number, nbits):\n",
@ -308,12 +257,69 @@
" print(f\"Calculating Huffman code for file: {file}...\")\n", " print(f\"Calculating Huffman code for file: {file}...\")\n",
" encoded, code_table = encode_and_print(open(file, 'r').read())\n", " encoded, code_table = encode_and_print(open(file, 'r').read())\n",
" save_to_bin(encoded, file.replace('.txt', '.bin'))\n", " save_to_bin(encoded, file.replace('.txt', '.bin'))\n",
" save_to_bin(code_table.encode(), file.replace('.txt', '_codetable.bin'))\n", " save_to_bin(code_table.encode(), file.replace('.txt', '_codetable.bin'))\n"
"\n",
"# Nie do końca rozumiem jak mam zapisać ten codec."
] ]
}, },
{ {
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compare file sizes"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Size of files_txt/own_corpus.txt: 1000000 bytes, 8000000 bits\n",
"Size of files_txt/random_text_geometric_distribution.txt: 100000 bytes, 800000 bits\n",
"Size of files_txt/random_text_uniform_distribution.txt: 1000000 bytes, 8000000 bits\n",
"Size of files_txt/random_text_uniform_two_point_05_distribution.txt: 1000000 bytes, 8000000 bits\n",
"Size of files_txt/random_text_uniform_two_point_09_distribution.txt: 1000000 bytes, 8000000 bits\n",
"********************************************************************************\n",
"Size of files_tar/own_corpus.tar.gz: 217524 bytes, 1740192 bits\n",
"Size of files_tar/random_text_geometric_distribution.tar.gz: 44733 bytes, 357864 bits\n",
"Size of files_tar/random_text_uniform_distribution.tar.gz: 754471 bytes, 6035768 bits\n",
"Size of files_tar/random_text_uniform_two_point_05_distribution.tar.gz: 150220 bytes, 1201760 bits\n",
"Size of files_tar/random_text_uniform_two_point_09_distribution.tar.gz: 81909 bytes, 655272 bits\n",
"********************************************************************************\n",
"Size of files_txt/own_corpus.txt + codetable: 544399 bytes, 548781 bits\n",
"Size of files_txt/random_text_geometric_distribution.txt + codetable: 37584 bytes, 40895 bits\n",
"Size of files_txt/random_text_uniform_distribution.txt + codetable: 750834 bytes, 757043 bits\n",
"Size of files_txt/random_text_uniform_two_point_05_distribution.txt + codetable: 187491 bytes, 187771 bits\n",
"Size of files_txt/random_text_uniform_two_point_09_distribution.txt + codetable: 137530 bytes, 137810 bits\n"
]
}
],
"source": [
"# print raw text files sizes\n",
"for file in file_names:\n",
" print(f\"Size of {file}: {os.path.getsize(file)} bytes, {os.path.getsize(file)*8} bits\")\n",
"\n",
"print(\"*\" * 80)\n",
"\n",
"# print compressed text files sizes\n",
"for file in file_names:\n",
" file = file.replace('.txt', '.tar.gz').replace('files_txt', 'files_tar')\n",
" print(f\"Size of {file}: {os.path.getsize(file)} bytes, {os.path.getsize(file)*8} bits\")\n",
"\n",
"print(\"*\" * 80)\n",
"\n",
"# print compressed with Huffman text files sizes\n",
"for file in file_names:\n",
" file1 = file.replace('.txt', '.bin').replace('files_txt', 'files_bin')\n",
" file2 = file.replace('.txt', '_codetable.bin').replace('files_txt', 'files_bin')\n",
" print(f\"Size of {file} + codetable: {os.path.getsize(file1) + os.path.getsize(file2)} bytes, {os.path.getsize(file1) + os.path.getsize(file2)*8} bits\")\n"
]
},
{
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
@ -321,11 +327,11 @@
" \n", " \n",
"| | Entropia |\n", "| | Entropia |\n",
"| ----------- | ----------- |\n", "| ----------- | ----------- |\n",
"| tekst w jęz. naturalnym | 1.8044238296689334|\n", "| tekst w jęz. naturalnym | 1.754256|\n",
"| losowy tekst (jednostajny) | 6.016344 |\n", "| losowy tekst (jednostajny) | 6.033632 |\n",
"| losowy tekst (geometryczny)| 3.5592 |\n", "| losowy tekst (geometryczny)| 3.5624 |\n",
"| losowy tekst (dwupunktowy 0.5) | 1.27216 |\n", "| losowy tekst (dwupunktowy 0.5) | 1.273352 |\n",
"| losowy tekst (dwupunktowy 0.9) | 0.760824 |\n" "| losowy tekst (dwupunktowy 0.9) | 0.761152 |\n"
] ]
}, },
{ {
@ -374,9 +380,21 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 127, "execution_count": 22,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Entropy for words in files_txt/own_corpus.txt: 9.27320212652544\n",
"Entropy for words in files_txt/random_text_geometric_distribution.txt: -0.0\n",
"Entropy for words in files_txt/random_text_uniform_distribution.txt: 13.897386156097086\n",
"Entropy for words in files_txt/random_text_uniform_two_point_05_distribution.txt: -0.0\n",
"Entropy for words in files_txt/random_text_uniform_two_point_09_distribution.txt: -0.0\n"
]
}
],
"source": [ "source": [
"import regex as re\n", "import regex as re\n",
"from collections import Counter\n", "from collections import Counter\n",
@ -389,29 +407,10 @@
"def unigram_entropy(t):\n", "def unigram_entropy(t):\n",
" counter = Counter(t)\n", " counter = Counter(t)\n",
" total = sum(counter.values())\n", " total = sum(counter.values())\n",
" return -sum((p := count / total) * log(p, 2) for count in counter.values())" " return -sum((p := count / total) * log(p, 2) for count in counter.values())\n",
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"9.27320212652544\n"
]
}
],
"source": [
"file_content = \"\"\n",
"with open(\"own_corpus.txt\", 'r') as file:\n",
" file_content = file.read()\n",
"\n", "\n",
"words = list(get_words(file_content))\n", "for file in file_names:\n",
"print(unigram_entropy(words))" " print(f\"Entropy for words in {file}: {unigram_entropy(get_words(open(file, 'r').read()))}\")"
] ]
}, },
{ {
@ -424,8 +423,8 @@
"| | Entropia |\n", "| | Entropia |\n",
"| ----------- | ----------- |\n", "| ----------- | ----------- |\n",
"| tekst w jęz. naturalnym |9.27320212652544|\n", "| tekst w jęz. naturalnym |9.27320212652544|\n",
"| losowy tekst (jednostajny) | 6.016344 |\n", "| losowy tekst (jednostajny) | 13.897625675701356 |\n",
"| losowy tekst (geometryczny)| 3.5592 |\n" "| losowy tekst (geometryczny)| 0 |\n"
] ]
}, },
{ {
@ -461,7 +460,7 @@
"- Korpusy bez spacji mają większą tablice kodową niż nieskompresowany plik\n", "- Korpusy bez spacji mają większą tablice kodową niż nieskompresowany plik\n",
"- Kompresowanie na wyrazach wydaję się być gorsze niż na znakach z powodu ogromnej tablicy kodowej\n", "- Kompresowanie na wyrazach wydaję się być gorsze niż na znakach z powodu ogromnej tablicy kodowej\n",
"- W jęzuku naturalbym częściej występują te same wyrazy niż w losowym tekście (jednostajnym)\n", "- W jęzuku naturalbym częściej występują te same wyrazy niż w losowym tekście (jednostajnym)\n",
"- Kompresowanie huffmanem na słowach dla plików z jednym wyrazem nie ma sesnu" "- Kompresowanie huffmanem na słowach dla plików z jednym wyrazem nie ma sensu"
] ]
}, },
{ {

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@ -1 +0,0 @@
{'p': (9, 0), 'u': (11, 4), 'E': (15, 80), _EOF: (16, 162), 'D': (16, 163), 'C': (15, 82), 'A': (15, 83), 'w': (13, 21), 'v': (12, 11), 'r': (10, 3), 'n': (8, 1), 'l': (7, 1), 'j': (6, 1), 'h': (5, 1), 'f': (4, 1), 'd': (3, 1), 'b': (2, 1), 'o': (9, 256), 'q': (10, 514), 'x': (13, 4120), 'y': (14, 8242), 'z': (14, 8243), 't': (12, 2061), 's': (11, 1031), 'm': (8, 129), 'k': (7, 65), 'i': (6, 33), 'g': (5, 17), 'e': (4, 9), 'c': (3, 5), 'a': (2, 3)}

File diff suppressed because one or more lines are too long

View File

@ -1 +0,0 @@
{'A': (5, 0), 'b': (5, 1), _EOF: (7, 8), 'O': (7, 9), 'Y': (6, 5), '9': (6, 6), 't': (6, 7), '1': (6, 8), 'X': (6, 9), 'e': (6, 10), 'W': (6, 11), '4': (6, 12), '3': (6, 13), 'o': (6, 14), 'q': (6, 15), 'T': (6, 16), 'l': (6, 17), 'J': (6, 18), 'y': (6, 19), '6': (6, 20), 'F': (6, 21), 'G': (6, 22), 'Q': (6, 23), 'K': (6, 24), 'N': (6, 25), 'S': (6, 26), 'f': (6, 27), '5': (6, 28), 'L': (6, 29), 'd': (6, 30), 'D': (6, 31), 'M': (6, 32), 'n': (6, 33), 'u': (6, 34), 'B': (6, 35), '2': (6, 36), 'a': (6, 37), '0': (6, 38), '7': (6, 39), 'P': (6, 40), 'E': (6, 41), 'j': (6, 42), 'z': (6, 43), 'C': (6, 44), 'h': (6, 45), 'i': (6, 46), 'c': (6, 47), 'm': (6, 48), 'R': (6, 49), 'k': (6, 50), 'I': (6, 51), 'U': (6, 52), '8': (6, 53), 'Z': (6, 54), 'g': (6, 55), 's': (6, 56), 'V': (6, 57), 'H': (6, 58), 'w': (6, 59), 'r': (6, 60), 'x': (6, 61), 'p': (6, 62), 'v': (6, 63)}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1 +0,0 @@
{_EOF: (2, 0), '0': (2, 1), '1': (1, 1)}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long