diff --git a/lab3.ipynb b/lab3.ipynb new file mode 100644 index 0000000..3dcbd6c --- /dev/null +++ b/lab3.ipynb @@ -0,0 +1,672 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import requests\n", + "import numpy as np\n", + "import string\n", + "import random\n", + "import gzip\n", + "from dahuffman import HuffmanCodec\n", + "import re\n", + "from collections import Counter" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [], + "source": [ + "NR_INDEKSU = 449288\n", + "random.seed(NR_INDEKSU)\n", + "np.random.seed(NR_INDEKSU)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "# Zadanie 1" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ' ']\n" + ] + } + ], + "source": [ + "# All characters from [a-zA-Z0-9 ]\n", + "chars = list(string.ascii_lowercase) + list(string.ascii_uppercase) + [str(i) for i in range(10)] + [' ']\n", + "print(chars)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [ + { + "data": { + "text/plain": "'iebie wmówić w młodości, nie zbadawszy nigdy, czy są prawdziwe. Mimo bowiem że spostrzegłem w tym ro'" + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 1. 
Natural language text\n", + "url = 'https://wolnelektury.pl/media/book/txt/rozprawa-o-metodzie.txt'\n", + "rozprawa = requests.get(url).content.decode('utf-8')\n", + "rozprawa = rozprawa[:100000]\n", + "rozprawa[50000:50100]" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [ + { + "data": { + "text/plain": "'Yc9 h7SWSelg8Pkd1Nq0IudZdaGujrPwWFXpuWuj3bAxKxe1G8IEMGqedyXnPW2CUsDCsos3ljJNuKeEGDzgs4NQM7sG6FfLqSVT'" + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 2. Text generated randomly from a discrete uniform distribution, characters = [a-zA-Z0-9 ]\n", + "uniform = ''.join(random.choices(chars, k = 100000, ))\n", + "uniform[50000:50100]" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [ + { + "data": { + "text/plain": "'cbbdebbebddbbbccbcbbbbbbbcdhcbbcbbbcdbedbcbbbbbecddbcbfbccbcbccdbcbebdbbcdcbdcbbbbbebbcbbebbdbcbbbdd'" + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 3. Text generated randomly from a geometric distribution, p = 0.5, characters = [a-zA-Z0-9 ]\n", + "samples = np.random.geometric(p=0.5, size=100000)\n", + "geometric = ''.join([chars[sample % len(chars)] for sample in samples])\n", + "geometric[50000:50100]" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 7, + "outputs": [ + { + "data": { + "text/plain": "'0101000110000010100110101000100000011101000010011001010101010110011101111100101011110110101001100011'" + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 4. 
Text generated randomly from a binomial distribution, p = 0.5, characters = [01]\n", + "samples = np.random.binomial(1, 0.5, 100000)\n", + "binomial_05 = ''.join([str(sample) for sample in samples])\n", + "binomial_05[50000:50100]" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 8, + "outputs": [ + { + "data": { + "text/plain": "'1111111111111111111101101111111111111111111111111111111111111111110111101111111111101111111111111111'" + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 5. Text generated randomly from a binomial distribution, p = 0.9, characters = [01]\n", + "samples = np.random.binomial(1, 0.9, 100000)\n", + "binomial_09 = ''.join([str(sample) for sample in samples])\n", + "binomial_09[50000:50100]" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 9, + "outputs": [], + "source": [ + "# Creating a helper dict for function input\n", + "text_names = ['rozprawa_chars', 'uniform_chars', 'geometric_chars', 'binomial_05_chars', 'binomial_09_chars']\n", + "text_variables = [rozprawa, uniform, geometric, binomial_05, binomial_09]\n", + "texts = {text_name: text_variable for text_name, text_variable in zip(text_names, text_variables)}" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 10, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rozprawa_chars 3.2612\n", + "uniform_chars 6.03328\n", + "geometric_chars 2.4852\n", + "binomial_05_chars 1.24432\n", + "binomial_09_chars 0.66288\n" + ] + } + ], + "source": [ + "# Calculating entropy for texts\n", + "def calculate_entropy(texts):\n", + " for name, text in texts.items():\n", + " with gzip.open(name + '_compressed.gz', 'wb') as f:\n", + " compressed = gzip.compress(text.encode('utf-8'))\n", + " print(f\"{name}{' ' * (20 - len(name))}{8 * len(compressed) / len(text)}\")\n", + " 
f.write(compressed)\n", + "\n", + "calculate_entropy(texts)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 11, + "outputs": [], + "source": [ + "# Training codecs and encoding whole texts\n", + "def train_codecs(texts):\n", + " codecs = {name: HuffmanCodec.from_data(text) for name, text in texts.items()}\n", + " encoded_texts = {name: codecs[name].encode(text) for name, text in texts.items()}\n", + " return codecs, encoded_texts\n", + "\n", + "codecs, encoded_texts = train_codecs(texts)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 12, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rozprawa_chars 1111011110001111 10011111 00001111\n", + "uniform_chars 01110000 10101000 10000000\n", + "geometric_chars 00100000 01000000 10000000\n", + "binomial_05_chars 01000000 10000000 01000000\n", + "binomial_09_chars 10000000 10000000 10000000\n" + ] + } + ], + "source": [ + "# Decoding 3 initial characters for all texts\n", + "def three_initials(texts, codecs):\n", + " for name, text in texts.items():\n", + " decoded = ' '.join([\"{:08b}\".format(int(codecs[name].encode(char).hex(), 16)) for char in list(text[:3])])\n", + " print(f\"{name}{' ' * (20 - len(name))}{decoded}\")\n", + "\n", + "three_initials(texts, codecs)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 13, + "outputs": [], + "source": [ + "# Saving raw texts, encoded texts and code tables\n", + "def to_files(texts, codecs, encoded_texts):\n", + " for name, text in texts.items():\n", + " with open(name + '_text.txt', 'w', encoding='utf-8') as f_text, open(name + '_encoded.bin', 'wb') as f_encoded, open(name + '_encoded_table.txt', 'w', encoding='utf-8') as f_table:\n", + " f_text.write(text)\n", + " f_encoded.write(encoded_texts[name])\n", + " f_table.write(str(codecs[name].get_code_table()))\n", + "\n", + "to_files(texts, 
codecs, encoded_texts)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 14, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-rw-rw-r-- 1 ked ked 40824 Mar 25 20:46 rozprawa_chars_compressed.gz\r\n", + "-rw-rw-r-- 1 ked ked 59007 Mar 25 20:46 rozprawa_chars_encoded.bin\r\n", + "-rw-rw-r-- 1 ked ked 1503 Mar 25 20:46 rozprawa_chars_encoded_table.txt\r\n", + "-rw-rw-r-- 1 ked ked 107454 Mar 25 20:46 rozprawa_chars_text.txt\r\n", + "----------\r\n", + "-rw-rw-r-- 1 ked ked 75484 Mar 25 20:46 uniform_chars_compressed.gz\r\n", + "-rw-rw-r-- 1 ked ked 74977 Mar 25 20:46 uniform_chars_encoded.bin\r\n", + "-rw-rw-r-- 1 ked ked 887 Mar 25 20:46 uniform_chars_encoded_table.txt\r\n", + "-rw-rw-r-- 1 ked ked 100000 Mar 25 20:46 uniform_chars_text.txt\r\n", + "----------\r\n", + "-rw-rw-r-- 1 ked ked 31120 Mar 25 20:46 geometric_chars_compressed.gz\r\n", + "-rw-rw-r-- 1 ked ked 24894 Mar 25 20:46 geometric_chars_encoded.bin\r\n", + "-rw-rw-r-- 1 ked ked 258 Mar 25 20:46 geometric_chars_encoded_table.txt\r\n", + "-rw-rw-r-- 1 ked ked 100000 Mar 25 20:46 geometric_chars_text.txt\r\n", + "----------\r\n", + "-rw-rw-r-- 1 ked ked 15606 Mar 25 20:46 binomial_05_chars_compressed.gz\r\n", + "-rw-rw-r-- 1 ked ked 18734 Mar 25 20:46 binomial_05_chars_encoded.bin\r\n", + "-rw-rw-r-- 1 ked ked 40 Mar 25 20:46 binomial_05_chars_encoded_table.txt\r\n", + "-rw-rw-r-- 1 ked ked 100000 Mar 25 20:46 binomial_05_chars_text.txt\r\n", + "----------\r\n", + "-rw-rw-r-- 1 ked ked 8338 Mar 25 20:46 binomial_09_chars_compressed.gz\r\n", + "-rw-rw-r-- 1 ked ked 13755 Mar 25 20:46 binomial_09_chars_encoded.bin\r\n", + "-rw-rw-r-- 1 ked ked 40 Mar 25 20:46 binomial_09_chars_encoded_table.txt\r\n", + "-rw-rw-r-- 1 ked ked 100000 Mar 25 20:46 binomial_09_chars_text.txt\r\n" + ] + } + ], + "source": [ + "# Comparing file sizes of all text format files\n", + "!ls -l rozprawa_chars*\n", + "!echo '----------'\n", + "!ls -l 
uniform_chars*\n", + "!echo '----------'\n", + "!ls -l geometric_chars*\n", + "!echo '----------'\n", + "!ls -l binomial_05_chars*\n", + "!echo '----------'\n", + "!ls -l binomial_09_chars*" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Entropia\n", + "\n", + "| | Entropia |\n", + "| ----------- | ----------- |\n", + "| tekst w jęz. naturalnym | 3.2612 |\n", + "| losowy tekst (jednostajny) | 6.03328 |\n", + "| losowy tekst (geometryczny)| 2.4852 |\n", + "| losowy tekst (dwupunktowy 0.5) | 1.24432 |\n", + "| losowy tekst (dwupunktowy 0.9) | 0.66288 |" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Wielkości w bitach:\n", + "\n", + "| | Plik nieskompresowany | Plik skompresowany (zip, tar,.. ) | Plik skompresowany + tablica kodowa) |\n", + "| ----------- | ----------- |-----------------------------------|--------------------------------------|\n", + "| tekst w jęz. naturalnym | 107454 | 40824 | 59007 + 1503 |\n", + "| losowy tekst (jednostajny) | 100000 | 75484 | 74977 + 887 |\n", + "| losowy tekst (geometryczny)| 100000 | 31120 | 24894 + 258 |\n", + "| losowy tekst (dwupunktowy 0.5)| 100000 | 15606 | 18734 + 40 |\n", + "| losowy tekst (dwupunktowy 0.9)| 100000 | 8338 | 13755 + 40 |" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Wnioski:\n", + "- Najwyższą entropię posiada losowy tekst z rozkładu jednostajnego. 
Jest to zgodne z oczekiwaniami, ponieważ jest to właściwie czysty szum złożony z kilkudziesięciu możliwych symboli o tym samym prawdopodobieństwie wystąpienia\n", + "- Losowe teksty złożone wyłącznie z 0 i 1 mają niższą entropię nawet od tekstu w języku naturalnym, ponieważ ich losowość jest relatywnie niższa przez bardzo ograniczoną klasę możliwych symboli\n", + "- Bardzo niska entropia losowego tekstu z rozkładu dwupunktowego 0.9 wynika z faktu, że cały tekst to w ~90% powtarzany jeden symbol\n", + "- Wysoka entropia tekstu przyczynia się do braku możliwości efektywnego skompresowania go, a tym samym zaoszczędzenia więcej miejsca na dysku - ciężko jest stworzyć efektywne kodowanie dla symboli o bardzo podobnym rozkładzie\n", + "- Klasa możliwych symboli znacząco wpływa na rozmiar tablicy kodującej\n", + "- Kompresja gzip daje lepsze wyniki niż kodek Huffmana dla tekstu w języku naturalnym oraz dla obu tekstów dwupunktowych, natomiast dla tekstów z rozkładu jednostajnego i geometrycznego lepszy okazuje się kodek Huffmana\n", + "- Większy niż 100000 rozmiar w bajtach tekstu w języku naturalnym wynika z faktu, iż niektóre znaki w nim występujące (np. 
polskie litery) są kodowane więcej niż jednym bajtem" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "# Zadanie 2" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 15, + "outputs": [], + "source": [ + "# Creating a helper dict for function input + deleting multiple spaces\n", + "text_names_words = [name.replace('char', 'word') for name in text_names[:3]]\n", + "text_variables_words = text_variables[:3]\n", + "texts_words = {text_name: re.sub(r'\\s+', ' ', text_variable) for text_name, text_variable in zip(text_names_words, text_variables_words)}" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 16, + "outputs": [], + "source": [ + "# Defining a single space-preserving split function\n", + "word_split = lambda text: re.split(f\"( )\", text)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 17, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rozprawa_words 3.2606867202633665\n", + "uniform_words 6.033249974992498\n", + "geometric_words 2.4852\n" + ] + } + ], + "source": [ + "# Calculating entropy for texts\n", + "calculate_entropy(texts_words)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 18, + "outputs": [], + "source": [ + "# Training codecs and encoding whole texts\n", + "def train_codecs_words(texts):\n", + " codecs = {name: HuffmanCodec.from_data(word_split(text)) for name, text in texts.items()}\n", + " encoded_texts = {name: codecs[name].encode(word_split(text)) for name, text in texts.items()}\n", + " return codecs, encoded_texts\n", + "\n", + "codecs_words, encoded_texts_words = train_codecs_words(texts_words)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 19, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "rozprawa_words 1000110111111101 01011101 1000011001010111\n", + "uniform_words 1000001101010111 01011101 1101011000111011\n", + "geometric_words 10000000\n" + ] + } + ], + "source": [ + "# Decoding 3 initial words for all texts\n", + "def three_initials_words(texts, codecs):\n", + " for name, text in texts.items():\n", + " decoded = ' '.join([\"{:08b}\".format(int(codecs[name].encode([word]).hex(), 16)) for word in word_split(text)[:3]])\n", + " print(f\"{name}{' ' * (20 - len(name))}{decoded}\")\n", + "\n", + "three_initials_words(texts_words, codecs_words)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 20, + "outputs": [], + "source": [ + "# Saving raw texts, encoded texts and code tables\n", + "to_files(texts_words, codecs_words, encoded_texts_words)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 21, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-rw-rw-r-- 1 ked ked 40668 Mar 25 20:46 rozprawa_words_compressed.gz\r\n", + "-rw-rw-r-- 1 ked ked 23817 Mar 25 20:46 rozprawa_words_encoded.bin\r\n", + "-rw-rw-r-- 1 ked ked 151559 Mar 25 20:46 rozprawa_words_encoded_table.txt\r\n", + "-rw-rw-r-- 1 ked ked 107087 Mar 25 20:46 rozprawa_words_text.txt\r\n", + "----------\r\n", + "-rw-rw-r-- 1 ked ked 75461 Mar 25 20:46 uniform_words_compressed.gz\r\n", + "-rw-rw-r-- 1 ked ked 2500 Mar 25 20:46 uniform_words_encoded.bin\r\n", + "-rw-rw-r-- 1 ked ked 123504 Mar 25 20:46 uniform_words_encoded_table.txt\r\n", + "-rw-rw-r-- 1 ked ked 99970 Mar 25 20:46 uniform_words_text.txt\r\n", + "----------\r\n", + "-rw-rw-r-- 1 ked ked 31120 Mar 25 20:46 geometric_words_compressed.gz\r\n", + "-rw-rw-r-- 1 ked ked 1 Mar 25 20:46 geometric_words_encoded.bin\r\n", + "-rw-rw-r-- 1 ked ked 100026 Mar 25 20:46 geometric_words_encoded_table.txt\r\n", + "-rw-rw-r-- 1 ked ked 100000 Mar 25 20:46 geometric_words_text.txt\r\n" + ] + } + 
], + "source": [ + "# Comparing file sizes of all text format files\n", + "!ls -l rozprawa_words*\n", + "!echo '----------'\n", + "!ls -l uniform_words*\n", + "!echo '----------'\n", + "!ls -l geometric_words*" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Entropia\n", + "\n", + "| | Entropia |\n", + "| ----------- | ----------- |\n", + "| tekst w jęz. naturalnym | 3.2606867202633665 |\n", + "| losowy tekst (jednostajny) | 6.033249974992498 |\n", + "| losowy tekst (geometryczny)| 2.4852 |\n" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Wielkości w bitach:\n", + "\n", + "| | Plik nieskompresowany | Plik skompresowany (zip, tar,.. ) | Plik skompresowany + tablica kodowa |\n", + "| ----------- | ----------- |-----------|--------------------------------------|\n", + "| tekst w jęz. naturalnym | 107087 | 40668 | 23817 + 151559 |\n", + "| losowy tekst (jednostajny) | 99970 | 75461 | 2500 + 123504 |\n", + "| losowy tekst (geometryczny)| 100000 | 31120 | 1 + 100026 |" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Wnioski:\n", + "- Plik skompresowany kodem Huffmana dla tekstu wygenerowanego losowo z rozkładu geometrycznego wychodzi dosyć osobliwie, spacja nie została wygenerowana ani razu, więc cały tekst reprezentowany jest jednym bajtem\n", + "- Z tego samego powodu nie da się zdekodować 3 pierwszych słów tego tekstu na potrzeby jednego z zadań (jest tylko 1)\n", + "- Generalnie podejście oparte na słowach przekłada się na znacznie większe tablice kodowe dla kodu Huffmana (bo jest więcej unikalnych słów niż symboli)\n", + "- Analogicznie do powyższego same pliki zakodowane kodem Huffmana wychodzą znacznie mniejsze\n", + "- re.split(f\"( )\", text) to naprawdę fajny trik, gdy chcemy podzielić tekst na słowa z zachowaniem spacji" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", +
"source": [ + "# Zadanie 3" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 22, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "biiibbikpdrkbk\n", + "[('d', '1/14'), ('p', '1/14'), ('r', '1/14'), ('k', '3/14'), ('b', '4/14'), ('i', '4/14')]\n" + ] + } + ], + "source": [ + "# Generating text to encode\n", + "random.seed(NR_INDEKSU)\n", + "tekst = list('abcdefghijklmnoprst')\n", + "random.shuffle(tekst)\n", + "tekst = tekst[: 5 + random.randint(1,5)]\n", + "tekst = [a*random.randint(1,4) for a in tekst]\n", + "tekst = [item for sublist in tekst for item in sublist]\n", + "''.join(tekst)\n", + "random.shuffle(tekst)\n", + "tekst = ''.join(tekst)\n", + "print(tekst)\n", + "counts = sorted(Counter(tekst).items(), key=lambda x: (x[1], x[0]))\n", + "print([(letter, f'{count}/{len(tekst)}') for (letter, count) in counts])" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Zakodowany text 'biiibbikpdrkbk': 01 1 1 1 01 01 1 001 00001 00000 0001 001 01 001" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}