This commit is contained in:
Kacper 2023-03-26 14:37:45 +02:00
parent fc47cb5ce2
commit 270810bfbf

672
lab3.ipynb Normal file
View File

@ -0,0 +1,672 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import requests\n",
"import numpy as np\n",
"import string\n",
"import random\n",
"import gzip\n",
"from dahuffman import HuffmanCodec\n",
"import re\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"NR_INDEKSU = 449288\n",
"random.seed(NR_INDEKSU)\n",
"np.random.seed(NR_INDEKSU)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"# Zadanie 1"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ' ']\n"
]
}
],
"source": [
"# All characters from [a-zA-Z0-9 ]\n",
"chars = list(string.ascii_lowercase) + list(string.ascii_uppercase) + [str(i) for i in range(10)] + [' ']\n",
"print(chars)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"data": {
"text/plain": "'iebie wmówić w młodości, nie zbadawszy nigdy, czy są prawdziwe. Mimo bowiem że spostrzegłem w tym ro'"
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 1. Natural language text\n",
"url = 'https://wolnelektury.pl/media/book/txt/rozprawa-o-metodzie.txt'\n",
"rozprawa = requests.get(url).content.decode('utf-8')\n",
"rozprawa = rozprawa[:100000]\n",
"rozprawa[50000:50100]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"data": {
"text/plain": "'Yc9 h7SWSelg8Pkd1Nq0IudZdaGujrPwWFXpuWuj3bAxKxe1G8IEMGqedyXnPW2CUsDCsos3ljJNuKeEGDzgs4NQM7sG6FfLqSVT'"
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 2. Text generated randomly from a discrete uniform distribution, characters = [a-zA-Z0-9 ]\n",
"uniform = ''.join(random.choices(chars, k = 100000, ))\n",
"uniform[50000:50100]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"data": {
"text/plain": "'cbbdebbebddbbbccbcbbbbbbbcdhcbbcbbbcdbedbcbbbbbecddbcbfbccbcbccdbcbebdbbcdcbdcbbbbbebbcbbebbdbcbbbdd'"
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 3. Text generated randomly from a geometric distribution, p = 0.5, characters = [a-zA-Z0-9 ]\n",
"samples = np.random.geometric(p=0.5, size=100000)\n",
"geometric = ''.join([chars[sample % len(chars)] for sample in samples])\n",
"geometric[50000:50100]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"data": {
"text/plain": "'0101000110000010100110101000100000011101000010011001010101010110011101111100101011110110101001100011'"
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 4. Text generated randomly from a binomial distribution, p = 0.5, characters = [01]\n",
"samples = np.random.binomial(1, 0.5, 100000)\n",
"binomial_05 = ''.join([str(sample) for sample in samples])\n",
"binomial_05[50000:50100]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"data": {
"text/plain": "'1111111111111111111101101111111111111111111111111111111111111111110111101111111111101111111111111111'"
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 5. Text generated randomly from a binomial distribution, p = 0.9, characters = [01]\n",
"samples = np.random.binomial(1, 0.9, 100000)\n",
"binomial_09 = ''.join([str(sample) for sample in samples])\n",
"binomial_09[50000:50100]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [],
"source": [
"# Creating a helper dict for function input\n",
"text_names = ['rozprawa_chars', 'uniform_chars', 'geometric_chars', 'binomial_05_chars', 'binomial_09_chars']\n",
"text_variables = [rozprawa, uniform, geometric, binomial_05, binomial_09]\n",
"texts = {text_name: text_variable for text_name, text_variable in zip(text_names, text_variables)}"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rozprawa_chars 3.2612\n",
"uniform_chars 6.03328\n",
"geometric_chars 2.4852\n",
"binomial_05_chars 1.24432\n",
"binomial_09_chars 0.66288\n"
]
}
],
"source": [
"# Calculating entropy for texts\n",
"def calculate_entropy(texts):\n",
" for name, text in texts.items():\n",
" with gzip.open(name + '_compressed.gz', 'wb') as f:\n",
" compressed = gzip.compress(text.encode('utf-8'))\n",
" print(f\"{name}{' ' * (20 - len(name))}{8 * len(compressed) / len(text)}\")\n",
" f.write(compressed)\n",
"\n",
"calculate_entropy(texts)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [],
"source": [
"# Training codecs and encoding whole texts\n",
"def train_codecs(texts):\n",
" codecs = {name: HuffmanCodec.from_data(text) for name, text in texts.items()}\n",
" encoded_texts = {name: codecs[name].encode(text) for name, text in texts.items()}\n",
" return codecs, encoded_texts\n",
"\n",
"codecs, encoded_texts = train_codecs(texts)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rozprawa_chars 1111011110001111 10011111 00001111\n",
"uniform_chars 01110000 10101000 10000000\n",
"geometric_chars 00100000 01000000 10000000\n",
"binomial_05_chars 01000000 10000000 01000000\n",
"binomial_09_chars 10000000 10000000 10000000\n"
]
}
],
"source": [
"# Decoding 3 initial characters for all texts\n",
"def three_initials(texts, codecs):\n",
" for name, text in texts.items():\n",
" decoded = ' '.join([\"{:08b}\".format(int(codecs[name].encode(char).hex(), 16)) for char in list(text[:3])])\n",
" print(f\"{name}{' ' * (20 - len(name))}{decoded}\")\n",
"\n",
"three_initials(texts, codecs)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [],
"source": [
"# Saving raw texts, encoded texts and code tables\n",
"def to_files(texts, codecs, encoded_texts):\n",
" for name, text in texts.items():\n",
" with open(name + '_text.txt', 'w', encoding='utf-8') as f_text, open(name + '_encoded.bin', 'wb') as f_encoded, open(name + '_encoded_table.txt', 'w', encoding='utf-8') as f_table:\n",
" f_text.write(text)\n",
" f_encoded.write(encoded_texts[name])\n",
" f_table.write(str(codecs[name].get_code_table()))\n",
"\n",
"to_files(texts, codecs, encoded_texts)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 14,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-rw-r-- 1 ked ked 40824 Mar 25 20:46 rozprawa_chars_compressed.gz\r\n",
"-rw-rw-r-- 1 ked ked 59007 Mar 25 20:46 rozprawa_chars_encoded.bin\r\n",
"-rw-rw-r-- 1 ked ked 1503 Mar 25 20:46 rozprawa_chars_encoded_table.txt\r\n",
"-rw-rw-r-- 1 ked ked 107454 Mar 25 20:46 rozprawa_chars_text.txt\r\n",
"----------\r\n",
"-rw-rw-r-- 1 ked ked 75484 Mar 25 20:46 uniform_chars_compressed.gz\r\n",
"-rw-rw-r-- 1 ked ked 74977 Mar 25 20:46 uniform_chars_encoded.bin\r\n",
"-rw-rw-r-- 1 ked ked 887 Mar 25 20:46 uniform_chars_encoded_table.txt\r\n",
"-rw-rw-r-- 1 ked ked 100000 Mar 25 20:46 uniform_chars_text.txt\r\n",
"----------\r\n",
"-rw-rw-r-- 1 ked ked 31120 Mar 25 20:46 geometric_chars_compressed.gz\r\n",
"-rw-rw-r-- 1 ked ked 24894 Mar 25 20:46 geometric_chars_encoded.bin\r\n",
"-rw-rw-r-- 1 ked ked 258 Mar 25 20:46 geometric_chars_encoded_table.txt\r\n",
"-rw-rw-r-- 1 ked ked 100000 Mar 25 20:46 geometric_chars_text.txt\r\n",
"----------\r\n",
"-rw-rw-r-- 1 ked ked 15606 Mar 25 20:46 binomial_05_chars_compressed.gz\r\n",
"-rw-rw-r-- 1 ked ked 18734 Mar 25 20:46 binomial_05_chars_encoded.bin\r\n",
"-rw-rw-r-- 1 ked ked 40 Mar 25 20:46 binomial_05_chars_encoded_table.txt\r\n",
"-rw-rw-r-- 1 ked ked 100000 Mar 25 20:46 binomial_05_chars_text.txt\r\n",
"----------\r\n",
"-rw-rw-r-- 1 ked ked 8338 Mar 25 20:46 binomial_09_chars_compressed.gz\r\n",
"-rw-rw-r-- 1 ked ked 13755 Mar 25 20:46 binomial_09_chars_encoded.bin\r\n",
"-rw-rw-r-- 1 ked ked 40 Mar 25 20:46 binomial_09_chars_encoded_table.txt\r\n",
"-rw-rw-r-- 1 ked ked 100000 Mar 25 20:46 binomial_09_chars_text.txt\r\n"
]
}
],
"source": [
"# Comparing file sizes of all text format files\n",
"!ls -l rozprawa_chars*\n",
"!echo '----------'\n",
"!ls -l uniform_chars*\n",
"!echo '----------'\n",
"!ls -l geometric_chars*\n",
"!echo '----------'\n",
"!ls -l binomial_05_chars*\n",
"!echo '----------'\n",
"!ls -l binomial_09_chars*"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Entropia\n",
"\n",
"| | Entropia |\n",
"| ----------- | ----------- |\n",
"| tekst w jęz. naturalnym | 3.2612 |\n",
"| losowy tekst (jednostajny) | 6.03328 |\n",
"| losowy tekst (geometryczny)| 2.4852 |\n",
"| losowy tekst (dwupunktowy 0.5) | 1.24432 |\n",
"| losowy tekst (dwupunktowy 0.9) | 0.66288 |"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Wielkości w bajtach:\n",
"\n",
"| | Plik nieskompresowany | Plik skompresowany (zip, tar, ...) | Plik skompresowany + tablica kodowa |\n",
"| ----------- | ----------- |-----------------------------------|--------------------------------------|\n",
"| tekst w jęz. naturalnym | 107454 | 40824 | 59007 + 1503 |\n",
"| losowy tekst (jednostajny) | 100000 | 75484 | 74977 + 887 |\n",
"| losowy tekst (geometryczny)| 100000 | 31120 | 24894 + 258 |\n",
"| losowy tekst (dwupunktowy 0.5)| 100000 | 15606 | 18734 + 40 |\n",
"| losowy tekst (dwupunktowy 0.9)| 100000 | 8338 | 13755 + 40 |"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"#### Wnioski:\n",
"- Najwyższą entropię posiada losowy tekst z rozkładu jednostajnego. Jest to spodziewane, ponieważ jest to właściwie czysty szum złożony z kilkudziesięciu możliwych symboli o tym samym prawdopodobieństwie wystąpienia\n",
"- Losowe teksty złożone wyłącznie z 0 i 1 mają niższą entropię nawet od tekstu w języku naturalnym, ponieważ ich losowość jest porównywalnie niższa przez bardzo ograniczoną klasę możliwych symboli\n",
"- Bardzo niska entropia losowego tekstu z rozkładu dwupunktowego 0.9 wynika z faktu, że cały tekst to w ~90% powtarzany jeden symbol\n",
"- Wysoka entropia tekstu przyczynia się do braku możliwości efektywnego skompresowania go, a tym samym zaoszczędzenia więcej miejsca na dysku - ciężko jest stworzyć efektywne kodowanie dla symboli o bardzo podobnym rozkładzie\n",
"- Klasa możliwych symboli znacząco wpływa na rozmiar tablicy kodującej\n",
"- Kompresja gzip daje znacznie lepsze wyniki niż kodek Huffmana wyłącznie dla tekstu w języku naturalnym\n",
"- Większy niż 100000 rozmiar w bajtach tekstu w języku naturalnym wynika z faktu, iż niektóre znaki w nim występujące (np. polskie litery) są kodowane więcej niż jednym bajtem"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"# Zadanie 2"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [],
"source": [
"# Creating a helper dict for function input + deleting multiple spaces\n",
"text_names_words = [name.replace('char', 'word') for name in text_names[:3]]\n",
"text_variables_words = text_variables[:3]\n",
"texts_words = {text_name: re.sub(r'\\s+', ' ', text_variable) for text_name, text_variable in zip(text_names_words, text_variables_words)}"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 16,
"outputs": [],
"source": [
"# Defining a single space-preserving split function\n",
"word_split = lambda text: re.split(f\"( )\", text)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 17,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rozprawa_words 3.2606867202633665\n",
"uniform_words 6.033249974992498\n",
"geometric_words 2.4852\n"
]
}
],
"source": [
"# Calculating entropy for texts\n",
"calculate_entropy(texts_words)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 18,
"outputs": [],
"source": [
"# Training codecs and encoding whole texts\n",
"def train_codecs_words(texts):\n",
" codecs = {name: HuffmanCodec.from_data(word_split(text)) for name, text in texts.items()}\n",
" encoded_texts = {name: codecs[name].encode(word_split(text)) for name, text in texts.items()}\n",
" return codecs, encoded_texts\n",
"\n",
"codecs_words, encoded_texts_words = train_codecs_words(texts_words)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 19,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rozprawa_words 1000110111111101 01011101 1000011001010111\n",
"uniform_words 1000001101010111 01011101 1101011000111011\n",
"geometric_words 10000000\n"
]
}
],
"source": [
"# Decoding 3 initial words for all texts\n",
"def three_initials_words(texts, codecs):\n",
" for name, text in texts.items():\n",
" decoded = ' '.join([\"{:08b}\".format(int(codecs[name].encode([word]).hex(), 16)) for word in word_split(text)[:3]])\n",
" print(f\"{name}{' ' * (20 - len(name))}{decoded}\")\n",
"\n",
"three_initials_words(texts_words, codecs_words)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 20,
"outputs": [],
"source": [
"# Saving raw texts, encoded texts and code tables\n",
"to_files(texts_words, codecs_words, encoded_texts_words)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 21,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-rw-r-- 1 ked ked 40668 Mar 25 20:46 rozprawa_words_compressed.gz\r\n",
"-rw-rw-r-- 1 ked ked 23817 Mar 25 20:46 rozprawa_words_encoded.bin\r\n",
"-rw-rw-r-- 1 ked ked 151559 Mar 25 20:46 rozprawa_words_encoded_table.txt\r\n",
"-rw-rw-r-- 1 ked ked 107087 Mar 25 20:46 rozprawa_words_text.txt\r\n",
"----------\r\n",
"-rw-rw-r-- 1 ked ked 75461 Mar 25 20:46 uniform_words_compressed.gz\r\n",
"-rw-rw-r-- 1 ked ked 2500 Mar 25 20:46 uniform_words_encoded.bin\r\n",
"-rw-rw-r-- 1 ked ked 123504 Mar 25 20:46 uniform_words_encoded_table.txt\r\n",
"-rw-rw-r-- 1 ked ked 99970 Mar 25 20:46 uniform_words_text.txt\r\n",
"----------\r\n",
"-rw-rw-r-- 1 ked ked 31120 Mar 25 20:46 geometric_words_compressed.gz\r\n",
"-rw-rw-r-- 1 ked ked 1 Mar 25 20:46 geometric_words_encoded.bin\r\n",
"-rw-rw-r-- 1 ked ked 100026 Mar 25 20:46 geometric_words_encoded_table.txt\r\n",
"-rw-rw-r-- 1 ked ked 100000 Mar 25 20:46 geometric_words_text.txt\r\n"
]
}
],
"source": [
"# Comparing file sizes of all text format files\n",
"!ls -l rozprawa_words*\n",
"!echo '----------'\n",
"!ls -l uniform_words*\n",
"!echo '----------'\n",
"!ls -l geometric_words*"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Entropia\n",
"\n",
"| | Entropia |\n",
"| ----------- | ----------- |\n",
"| tekst w jęz. naturalnym | 3.2606867202633665 |\n",
"| losowy tekst (jednostajny) | 6.033249974992498 |\n",
"| losowy tekst (geometryczny)| 2.4852 |\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Wielkości w bajtach:\n",
"\n",
"| | Plik nieskompresowany | Plik skompresowany (zip, tar, ...) | Plik skompresowany + tablica kodowa |\n",
"| ----------- | ----------- |-----------|--------------------------------------|\n",
"| tekst w jęz. naturalnym | 107087 | 40668 | 23817 + 151559 |\n",
"| losowy tekst (jednostajny) | 99970 | 75461 | 2500 + 123504 |\n",
"| losowy tekst (geometryczny)| 100000 | 31120 | 1 + 100026 |"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"#### Wnioski:\n",
"- Plik skompresowany kodem Huffmana dla tekstu wygenerowanego losowo z rozkładu geometrycznego wychodzi dosyć osobliwie, spacja nie została wygenerowana ani razu, więc cały tekst reprezentowany jest jednym bajtem\n",
"- Z tego samego powodu nie da się zdekodować 3 pierwszych słów tego tekstu na potrzeby jednego z zadań (jest tylko 1)\n",
"- Generalnie podejście oparte na słowach przekłada się na znacznie większe tablice kodowe dla kodu Huffmana (bo jest więcej unikalnych słów niż symboli)\n",
"- Analogicznie do powyższego same pliki zakodowane kodem Huffmana wychodzą znacznie mniejsze\n",
"- re.split(f\"( )\", text) to naprawdę fajny trik, gdy chcemy podzielić tekst na słowa z zachowaniem spacji"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"# Zadanie 3"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 22,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"biiibbikpdrkbk\n",
"[('d', '1/14'), ('p', '1/14'), ('r', '1/14'), ('k', '3/14'), ('b', '4/14'), ('i', '4/14')]\n"
]
}
],
"source": [
"# Generating text to encode\n",
"random.seed(NR_INDEKSU)\n",
"tekst = list('abcdefghijklmnoprst')\n",
"random.shuffle(tekst)\n",
"tekst = tekst[: 5 + random.randint(1,5)]\n",
"tekst = [a*random.randint(1,4) for a in tekst]\n",
"tekst = [item for sublist in tekst for item in sublist]\n",
"''.join(tekst)\n",
"random.shuffle(tekst)\n",
"tekst = ''.join(tekst)\n",
"print(tekst)\n",
"counts = sorted(Counter(tekst).items(), key=lambda x: (x[1], x[0]))\n",
"print([(letter, f'{count}/{len(tekst)}') for (letter, count) in counts])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"<img src=https://i.imgur.com/Q41bqzG.jpg width=\"800\">"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Zakodowany tekst 'biiibbikpdrkbk': 01 1 1 1 01 01 1 001 00001 00000 0001 001 01 001"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}