{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import requests\n", "import numpy as np\n", "import string\n", "import random\n", "import gzip\n", "from dahuffman import HuffmanCodec\n", "import re\n", "from collections import Counter" ] }, { "cell_type": "code", "execution_count": 2, "outputs": [], "source": [ "NR_INDEKSU = 449288\n", "random.seed(NR_INDEKSU)\n", "np.random.seed(NR_INDEKSU)" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "# Zadanie 1" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 3, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ' ']\n" ] } ], "source": [ "# All characters from [a-zA-Z0-9 ]\n", "chars = list(string.ascii_lowercase) + list(string.ascii_uppercase) + [str(i) for i in range(10)] + [' ']\n", "print(chars)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 4, "outputs": [ { "data": { "text/plain": "'iebie wmówić w młodości, nie zbadawszy nigdy, czy są prawdziwe. Mimo bowiem że spostrzegłem w tym ro'" }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 1. 
Natural language text\n", "url = 'https://wolnelektury.pl/media/book/txt/rozprawa-o-metodzie.txt'\n", "rozprawa = requests.get(url).content.decode('utf-8')\n", "rozprawa = rozprawa[:100000]\n", "rozprawa[50000:50100]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 5, "outputs": [ { "data": { "text/plain": "'Yc9 h7SWSelg8Pkd1Nq0IudZdaGujrPwWFXpuWuj3bAxKxe1G8IEMGqedyXnPW2CUsDCsos3ljJNuKeEGDzgs4NQM7sG6FfLqSVT'" }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 2. Text generated randomly from a discrete uniform distribution, characters = [a-zA-Z0-9 ]\n", "uniform = ''.join(random.choices(chars, k = 100000, ))\n", "uniform[50000:50100]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 6, "outputs": [ { "data": { "text/plain": "'cbbdebbebddbbbccbcbbbbbbbcdhcbbcbbbcdbedbcbbbbbecddbcbfbccbcbccdbcbebdbbcdcbdcbbbbbebbcbbebbdbcbbbdd'" }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 3. Text generated randomly from a geometric distribution, p = 0.5, characters = [a-zA-Z0-9 ]\n", "samples = np.random.geometric(p=0.5, size=100000)\n", "geometric = ''.join([chars[sample % len(chars)] for sample in samples])\n", "geometric[50000:50100]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 7, "outputs": [ { "data": { "text/plain": "'0101000110000010100110101000100000011101000010011001010101010110011101111100101011110110101001100011'" }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 4. 
Text generated randomly from a binomial distribution, p = 0.5, characters = [01]\n", "samples = np.random.binomial(1, 0.5, 100000)\n", "binomial_05 = ''.join([str(sample) for sample in samples])\n", "binomial_05[50000:50100]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 8, "outputs": [ { "data": { "text/plain": "'1111111111111111111101101111111111111111111111111111111111111111110111101111111111101111111111111111'" }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 5. Text generated randomly from a binomial distribution, p = 0.9, characters = [01]\n", "samples = np.random.binomial(1, 0.9, 100000)\n", "binomial_09 = ''.join([str(sample) for sample in samples])\n", "binomial_09[50000:50100]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 9, "outputs": [], "source": [ "# Creating a helper dict for function input\n", "text_names = ['rozprawa_chars', 'uniform_chars', 'geometric_chars', 'binomial_05_chars', 'binomial_09_chars']\n", "text_variables = [rozprawa, uniform, geometric, binomial_05, binomial_09]\n", "texts = {text_name: text_variable for text_name, text_variable in zip(text_names, text_variables)}" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 10, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "rozprawa_chars      3.2612\n", "uniform_chars       6.03328\n", "geometric_chars     2.4852\n", "binomial_05_chars   1.24432\n", "binomial_09_chars   0.66288\n" ] } ], "source": [ "# Calculating entropy for texts\n", "def calculate_entropy(texts):\n", "    \"\"\"Print a gzip-based entropy estimate (bits per character) and save the compressed bytes.\"\"\"\n", "    for name, text in texts.items():\n", "        compressed = gzip.compress(text.encode('utf-8'))\n", "        # Write with plain open(): gzip.open() would compress the already\n", "        # gzip-compressed bytes a second time and inflate the .gz file.\n", "        with open(name + '_compressed.gz', 'wb') as f:\n", "            f.write(compressed)\n", "        print(f\"{name}{' ' * (20 - len(name))}{8 * len(compressed) / len(text)}\")\n", "\n", "calculate_entropy(texts)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 
11, "outputs": [], "source": [ "# Training codecs and encoding whole texts\n", "def train_codecs(texts):\n", " codecs = {name: HuffmanCodec.from_data(text) for name, text in texts.items()}\n", " encoded_texts = {name: codecs[name].encode(text) for name, text in texts.items()}\n", " return codecs, encoded_texts\n", "\n", "codecs, encoded_texts = train_codecs(texts)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 12, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "rozprawa_chars 1111011110001111 10011111 00001111\n", "uniform_chars 01110000 10101000 10000000\n", "geometric_chars 00100000 01000000 10000000\n", "binomial_05_chars 01000000 10000000 01000000\n", "binomial_09_chars 10000000 10000000 10000000\n" ] } ], "source": [ "# Decoding 3 initial characters for all texts\n", "def three_initials(texts, codecs):\n", " for name, text in texts.items():\n", " decoded = ' '.join([\"{:08b}\".format(int(codecs[name].encode(char).hex(), 16)) for char in list(text[:3])])\n", " print(f\"{name}{' ' * (20 - len(name))}{decoded}\")\n", "\n", "three_initials(texts, codecs)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 13, "outputs": [], "source": [ "# Saving raw texts, encoded texts and code tables\n", "def to_files(texts, codecs, encoded_texts):\n", " for name, text in texts.items():\n", " with open(name + '_text.txt', 'w', encoding='utf-8') as f_text, open(name + '_encoded.bin', 'wb') as f_encoded, open(name + '_encoded_table.txt', 'w', encoding='utf-8') as f_table:\n", " f_text.write(text)\n", " f_encoded.write(encoded_texts[name])\n", " f_table.write(str(codecs[name].get_code_table()))\n", "\n", "to_files(texts, codecs, encoded_texts)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 14, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-rw-rw-r-- 1 ked ked 40824 Mar 25 20:46 rozprawa_chars_compressed.gz\r\n", "-rw-rw-r-- 1 ked ked 
59007 Mar 25 20:46 rozprawa_chars_encoded.bin\r\n", "-rw-rw-r-- 1 ked ked 1503 Mar 25 20:46 rozprawa_chars_encoded_table.txt\r\n", "-rw-rw-r-- 1 ked ked 107454 Mar 25 20:46 rozprawa_chars_text.txt\r\n", "----------\r\n", "-rw-rw-r-- 1 ked ked 75484 Mar 25 20:46 uniform_chars_compressed.gz\r\n", "-rw-rw-r-- 1 ked ked 74977 Mar 25 20:46 uniform_chars_encoded.bin\r\n", "-rw-rw-r-- 1 ked ked 887 Mar 25 20:46 uniform_chars_encoded_table.txt\r\n", "-rw-rw-r-- 1 ked ked 100000 Mar 25 20:46 uniform_chars_text.txt\r\n", "----------\r\n", "-rw-rw-r-- 1 ked ked 31120 Mar 25 20:46 geometric_chars_compressed.gz\r\n", "-rw-rw-r-- 1 ked ked 24894 Mar 25 20:46 geometric_chars_encoded.bin\r\n", "-rw-rw-r-- 1 ked ked 258 Mar 25 20:46 geometric_chars_encoded_table.txt\r\n", "-rw-rw-r-- 1 ked ked 100000 Mar 25 20:46 geometric_chars_text.txt\r\n", "----------\r\n", "-rw-rw-r-- 1 ked ked 15606 Mar 25 20:46 binomial_05_chars_compressed.gz\r\n", "-rw-rw-r-- 1 ked ked 18734 Mar 25 20:46 binomial_05_chars_encoded.bin\r\n", "-rw-rw-r-- 1 ked ked 40 Mar 25 20:46 binomial_05_chars_encoded_table.txt\r\n", "-rw-rw-r-- 1 ked ked 100000 Mar 25 20:46 binomial_05_chars_text.txt\r\n", "----------\r\n", "-rw-rw-r-- 1 ked ked 8338 Mar 25 20:46 binomial_09_chars_compressed.gz\r\n", "-rw-rw-r-- 1 ked ked 13755 Mar 25 20:46 binomial_09_chars_encoded.bin\r\n", "-rw-rw-r-- 1 ked ked 40 Mar 25 20:46 binomial_09_chars_encoded_table.txt\r\n", "-rw-rw-r-- 1 ked ked 100000 Mar 25 20:46 binomial_09_chars_text.txt\r\n" ] } ], "source": [ "# Comparing file sizes of all text format files\n", "!ls -l rozprawa_chars*\n", "!echo '----------'\n", "!ls -l uniform_chars*\n", "!echo '----------'\n", "!ls -l geometric_chars*\n", "!echo '----------'\n", "!ls -l binomial_05_chars*\n", "!echo '----------'\n", "!ls -l binomial_09_chars*" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "Entropia\n", "\n", "| | Entropia |\n", "| ----------- | ----------- |\n", "| tekst w jęz. 
naturalnym | 3.2612 |\n", "| losowy tekst (jednostajny) | 6.03328 |\n", "| losowy tekst (geometryczny)| 2.4852 |\n", "| losowy tekst (dwupunktowy 0.5) | 1.24432 |\n", "| losowy tekst (dwupunktowy 0.9) | 0.66288 |" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "Wielkości w bitach:\n", "\n", "| | Plik nieskompresowany | Plik skompresowany (zip, tar,.. ) | Plik skompresowany + tablica kodowa) |\n", "| ----------- | ----------- |-----------------------------------|--------------------------------------|\n", "| tekst w jęz. naturalnym | 107454 | 40824 | 59007 + 1503 |\n", "| losowy tekst (jednostajny) | 100000 | 75484 | 74977 + 887 |\n", "| losowy tekst (geometryczny)| 100000 | 31120 | 24894 + 258 |\n", "| losowy tekst (dwupunktowy 0.5)| 100000 | 15606 | 18734 + 40 |\n", "| losowy tekst (dwupunktowy 0.9)| 100000 | 8338 | 13755 + 40 |" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "#### Wnioski:\n", "- Najwyższą entropię posiada losowy tekst z rozkładu jednostajnego. 
Jest to spodziewalne, ponieważ jest to właściwie czysty szum złożony z kilkudziesięciu możliwych symboli o tym samym prawdopodobieństwie wystąpienia\n", "- Losowe teksty złożone wyłącznie z 0 i 1 mają niższą entropię nawet od tekstu w języku naturalnym, ponieważ ich losowość jest porównywalnie niższa przez bardzo ograniczoną klasę możliwych symboli\n", "- Bardzo niska entropia losowego tekstu z rozkładu dwupunktowego 0.9 wynika z faktu, że cały tekst to w ~90% powtarzany jeden symbol\n", "- Wysoka entropia tekstu przyczynia się do braku możliwości efektywnego skompresowania go, a tym samym zaoszczędzenia więcej miejsca na dysku - ciężko jest stworzyć efektywne kodowanie dla symboli o bardzo podobnym rozkładzie\n", "- Klasa możliwych symboli znacząco wpływa na rozmiar tablicy kodującej\n", "- Kompresja gzip daje znacznie lepsze wyniki niż kodek Huffmana wyłącznie dla tekstu w języku naturalnym\n", "- Większy niż 100000 rozmiar w bajtach tekstu w języku naturalnym wynika z faktu, iż niektóre znaki w nim występujące (np. 
polskie litery) są kodowane więcej niż jednym bajtem" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "# Zadanie 2" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 15, "outputs": [], "source": [ "# Creating a helper dict for function input + deleting multiple spaces\n", "text_names_words = [name.replace('char', 'word') for name in text_names[:3]]\n", "text_variables_words = text_variables[:3]\n", "texts_words = {text_name: re.sub(r'\\s+', ' ', text_variable) for text_name, text_variable in zip(text_names_words, text_variables_words)}" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 16, "outputs": [], "source": [ "# Defining a single space-preserving split function\n", "word_split = lambda text: re.split(f\"( )\", text)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 17, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "rozprawa_words 3.2606867202633665\n", "uniform_words 6.033249974992498\n", "geometric_words 2.4852\n" ] } ], "source": [ "# Calculating entropy for texts\n", "calculate_entropy(texts_words)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 18, "outputs": [], "source": [ "# Training codecs and encoding whole texts\n", "def train_codecs_words(texts):\n", " codecs = {name: HuffmanCodec.from_data(word_split(text)) for name, text in texts.items()}\n", " encoded_texts = {name: codecs[name].encode(word_split(text)) for name, text in texts.items()}\n", " return codecs, encoded_texts\n", "\n", "codecs_words, encoded_texts_words = train_codecs_words(texts_words)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 19, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "rozprawa_words 1000110111111101 01011101 1000011001010111\n", "uniform_words 1000001101010111 01011101 1101011000111011\n", "geometric_words 10000000\n" ] } ], 
"source": [ "# Decoding 3 initial words for all texts\n", "def three_initials_words(texts, codecs):\n", " for name, text in texts.items():\n", " decoded = ' '.join([\"{:08b}\".format(int(codecs[name].encode([word]).hex(), 16)) for word in word_split(text)[:3]])\n", " print(f\"{name}{' ' * (20 - len(name))}{decoded}\")\n", "\n", "three_initials_words(texts_words, codecs_words)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 20, "outputs": [], "source": [ "# Saving raw texts, encoded texts and code tables\n", "to_files(texts_words, codecs_words, encoded_texts_words)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 21, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-rw-rw-r-- 1 ked ked 40668 Mar 25 20:46 rozprawa_words_compressed.gz\r\n", "-rw-rw-r-- 1 ked ked 23817 Mar 25 20:46 rozprawa_words_encoded.bin\r\n", "-rw-rw-r-- 1 ked ked 151559 Mar 25 20:46 rozprawa_words_encoded_table.txt\r\n", "-rw-rw-r-- 1 ked ked 107087 Mar 25 20:46 rozprawa_words_text.txt\r\n", "----------\r\n", "-rw-rw-r-- 1 ked ked 75461 Mar 25 20:46 uniform_words_compressed.gz\r\n", "-rw-rw-r-- 1 ked ked 2500 Mar 25 20:46 uniform_words_encoded.bin\r\n", "-rw-rw-r-- 1 ked ked 123504 Mar 25 20:46 uniform_words_encoded_table.txt\r\n", "-rw-rw-r-- 1 ked ked 99970 Mar 25 20:46 uniform_words_text.txt\r\n", "----------\r\n", "-rw-rw-r-- 1 ked ked 31120 Mar 25 20:46 geometric_words_compressed.gz\r\n", "-rw-rw-r-- 1 ked ked 1 Mar 25 20:46 geometric_words_encoded.bin\r\n", "-rw-rw-r-- 1 ked ked 100026 Mar 25 20:46 geometric_words_encoded_table.txt\r\n", "-rw-rw-r-- 1 ked ked 100000 Mar 25 20:46 geometric_words_text.txt\r\n" ] } ], "source": [ "# Comparing file sizes of all text format files\n", "!ls -l rozprawa_words*\n", "!echo '----------'\n", "!ls -l uniform_words*\n", "!echo '----------'\n", "!ls -l geometric_words*" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "Entropia\n", 
"\n", "| | Entropia |\n", "| ----------- | ----------- |\n", "| tekst w jęz. naturalnym | 3.2606867202633665 |\n", "| losowy tekst (jednostajny) | 6.033249974992498 |\n", "| losowy tekst (geometryczny)| 2.4852 |\n" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "Wielkości w bitach:\n", "\n", "| | Plik nieskompresowany | Plik skompresowany (zip, tar,.. ) | Plik skompresowany + tablica kodowa) |\n", "| ----------- | ----------- |-----------|--------------------------------------|\n", "| tekst w jęz. naturalnym | 107087 | 40668 | 23817 + 151559 |\n", "| losowy tekst (jednostajny) | 99970 | 75461 | 2500 + 123504 |\n", "| losowy tekst (geometryczny)| 100000 | 31120 | 1 + 100026 |" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "#### Wnioski:\n", "- Plik skompresowany kodem Huffmana dla tekstu wygenerowanego losowo z rozkładu geometrycznego wychodzi dosyć osobliwie, spacja nie została wygenerowana ani razu, więc cały tekst reprezentowany jest jednym bajtem\n", "- Z tego samego powodu nie da się zdekodować 3 pierwszych słów tego tekstu na potrzebę jednego z zadań (jest tylko 1)\n", "- Generalnie podejście oparte na słowach przekłada się na znacznie większe tablice kodowe dla kodu Huffmana (bo jest więcej unikalnych słów niż symboli)\n", "- Analogicznie do powyższego same pliki zakodowane kodem Huffmana wychodzą znacznie mniejsze\n", "- re.split(f\"( )\", text) to naprawdę fajny trik, gdy chcemy podzielić tekst na słowa z zachowaniem spacji" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "# Zadanie 3" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 22, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "biiibbikpdrkbk\n", "[('d', '1/14'), ('p', '1/14'), ('r', '1/14'), ('k', '3/14'), ('b', '4/14'), ('i', '4/14')]\n" ] } ], "source": [ "# Generating text to encode\n", "random.seed(NR_INDEKSU)\n", "tekst = 
list('abcdefghijklmnoprst')\n", "random.shuffle(tekst)\n", "tekst = tekst[: 5 + random.randint(1,5)]\n", "tekst = [a*random.randint(1,4) for a in tekst]\n", "# Flatten the list of repeated-letter strings into a list of single characters\n", "tekst = [item for sublist in tekst for item in sublist]\n", "random.shuffle(tekst)\n", "tekst = ''.join(tekst)\n", "print(tekst)\n", "counts = sorted(Counter(tekst).items(), key=lambda x: (x[1], x[0]))\n", "print([(letter, f'{count}/{len(tekst)}') for (letter, count) in counts])" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "Zakodowany text 'biiibbikpdrkbk': 01 1 1 1 01 01 1 001 00001 00000 0001 001 01 001" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [], "metadata": { "collapsed": false } } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }