Compare commits

6 Commits: 893f4409e5 ... master

| Author | SHA1 | Date |
|---|---|---|
|  | 505c0bb9c2 |  |
|  | be868f492b |  |
|  | 582e470488 |  |
|  | c9d19f350c |  |
|  | 7efcacbe53 |  |
|  | fb82cdfc04 |  |
.gitignore (vendored, 1 change)

@@ -1 +1,2 @@
 .DS_STORE
+out-merged.txt
@@ -1,6 +1,6 @@
 import pandas
 import regex as re
-import argparse, sys
+import argparse

 parser=argparse.ArgumentParser()
 parser.add_argument("--filepath",)
@@ -19,14 +19,15 @@ def filter_line(line):
     return line is not None and len(line) > 30 and is_letter_sentence(line) and is_asci(line)

 def clean_with_regex(text):
-    text = str(text).encode("ascii", "ignore").decode("utf-8")
-    regex_pattern = "(?<=..\.)(\s+)(?=\(\d+\))|(?<=..\.)(\s+)(?=\d\.)|(?<=..\.)(\s+)(?=Article \d+)"
+    # text = str(text).encode("ascii", "ignore").decode("utf-8")
+    regex_pattern = r"(?<=..\.)(\s+)(?=\(\d+\))|(?<=..\.)(\s+)(?=\d\.)|(?<=..\.)(\s+)(?=Article \d+)"
     try:
         out = re.split(regex_pattern, text)
     except TypeError as e:
         return []
     out = list(filter(lambda item: filter_line(item), out))
-    out = list(map(lambda item: re.sub("(?<=\d)(\(\d+\))(?=\s+)|(\(\d+\)\s+)|(\d+\.)+\s", "", item), out))
+    out = list(map(lambda item: re.sub(r"(?<=\d)(\(\d+\))(?=\s+)|(\(\d+\)\s+)|(\d+\.)+\s", " ", item), out))
+    out = list(map(lambda item: re.sub(r"[^\w\d\s\\\)\(\/-]|[^\x00-\x7F]|ex\d+", " ", item), out))
     if out:
         out.pop(len(out)-1)
     return out
@@ -41,7 +42,7 @@ def print_text(text, sort=False):
 def save_to_file(paragraph_list, file_name):
     with open(file_name, 'a') as f:
         for line in paragraph_list:
-            f.write("%s\n" % line.strip())
+            f.write("%s\n" % line.strip().lower())
     f.close()


Lab2/README.md (new file, 52 lines)

@@ -0,0 +1,52 @@
# Statistics

## Running the script

Run the Python script statistics.py. The program's output is a set of images created in the /images folder.

```python statistics.py --filepath {path_to_file}```

## Basic statistics

### 10 longest words

```
MarineStrategyFrameworkDirectiveClassificationValue
OtherFinancialProfessionalAndInformationServices
GuineaPeruPhilippinesQatarRomaniaRussiaRwandaSao
MarineStrategyFrameworkDirectiveClassificationValue
AustraliaArgentinaBotswanaBrazilChileNamibiaNew
ManufacturingOfElectricalAndOpticalEquipment
ClassificationAndQuantificationFrameworkValue
FinancialProfessionalAndInformationServices
measuredIndicatedAndInferredMineralResource
AnthropogenicGeomorphologicFeatureTypeValue
```

### Zipf's law for words

![title](images/zipf-law-words.png)

### Zipf's law for trigrams from words

![title](images/zipf-law-3grams.png)

### Words breaking the law relating length to frequency

- aunt (4 characters, 31 occurrences)
- cave (4 characters, 31 occurrences)
- amateur (7 characters, 31 occurrences)
- CommissionFranz (15 characters, 2090 occurrences)
- responsibilities (16 characters, 2087 occurrences)
- Interventionsstelle (19 characters, 231 occurrences)
- hydrogenorthophosphate (22 characters, 148 occurrences)
- polytetrafluoroethylene (23 characters, 148 occurrences)

### Frequency of pronouns

![title](images/pt-pronouns.png)

### Number of date occurrences (years)

`['1999', '1975', '1987', '1992', '1985', '1981', '1988', '1986', '1995', '1991', '1993', '1990', '1994', '1983', '1989'...`

![title](images/pt-years.png)
Lab2/images/most-freq-words-10.png (binary, new file; After: Size 12 KiB)
Lab2/images/most-freq-words-20.png (binary, new file; After: Size 14 KiB)
Lab2/images/pt-pronouns.png (binary, new file; After: Size 12 KiB)
Lab2/images/pt-years.png (binary, new file; After: Size 13 KiB)
Lab2/images/zipf-law-3grams.png (binary, new file; After: Size 16 KiB)
Lab2/images/zipf-law-words.png (binary, new file; After: Size 17 KiB)
Lab2/statistics.py (new file, 178 lines)

@@ -0,0 +1,178 @@
import matplotlib.pyplot as plt
from collections import Counter
from collections import OrderedDict
import regex as re
from math import log
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument("--filepath")
args = parser.parse_args()

FILE_PATH = "Lab1/out-merged.txt" if args.filepath is None else args.filepath
IMAGES_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "images")
file_content = None

with open(FILE_PATH, 'r') as file:
    file_content = file.read()

# file_content = file_content[:10000000]


def get_characters(t):
    yield from t


def freq_list(g, top=None):
    # Frequency table of the items in g, sorted from most to least frequent.
    c = Counter(g)

    if top is None:
        items = c.items()
    else:
        items = c.most_common(top)

    return OrderedDict(sorted(items, key=lambda t: -t[1]))


def get_words(t):
    for m in re.finditer(r'[\p{L}0-9\*]+', t):
        yield m.group(0)


def rang_freq_with_labels(name, g, top=None):
    # Bar chart of the most frequent items, labelled with the items themselves.
    freq = freq_list(g, top)

    plt.figure(figsize=(12, 3))
    plt.ylabel('liczba wystąpień')  # y-label: "number of occurrences"
    plt.bar(freq.keys(), freq.values())

    fname = f'/{name}.png'

    plt.savefig(IMAGES_PATH + fname)

    return fname


def log_rang_log_freq(name, g):
    # Log-rank vs. log-frequency plot (Zipf's law).
    freq = freq_list(g)

    plt.figure().clear()
    plt.plot([log(x) for x in range(1, len(freq.values()) + 1)], [log(y) for y in freq.values()])

    fname = f'/{name}.png'

    plt.savefig(IMAGES_PATH + fname)

    return fname


def ngrams(items, size):
    # Sliding window over items, yielding consecutive tuples of length size.
    ngram = []
    for item in items:
        ngram.append(item)
        if len(ngram) == size:
            yield tuple(ngram)
            ngram = ngram[1:]


def get_ngrams(t, size):
    for word in get_words(t):
        for m in ngrams(word, size):
            yield m


def get_w_freq_by_w_len(freq, word_len):
    for word, count in freq.items():
        if len(word) == word_len:
            yield (count, word)


def get_average_freq_by_w_len(freq, word_lengths):
    results = dict()
    for l in word_lengths:
        word_freq = list(get_w_freq_by_w_len(freq, l))
        if len(word_freq) == 0:
            continue
        average = sum([w[0] for w in word_freq]) / len(word_freq)
        results[l] = average

    return results


def get_low_high_freq_by_w_len(freq, word_lengths, average_freq):
    """
    Returns the 5 most and 5 least frequent words for each word length,
    together with the average frequency for that length.
    """
    results = []
    for l in word_lengths:
        word_freq = list(get_w_freq_by_w_len(freq, l))
        word_freq.sort()
        word_freq = list(filter(lambda t: re.findall(r"\d", str(t[1])) == [] and t[0] > 30, word_freq))
        word_stats = {
            'word_len': l,
            'average_freq': average_freq[l],
            'low_freq': word_freq[:5],
            'high_freq': word_freq[-5:]
        }
        results.append(word_stats)
    return results


def get_pronouns_stats(freqs):
    pronouns = ["i", "you", "he", "she", "it"]
    pronoun_words_freq = [f for f in freqs.items() if f[0] in pronouns]

    x = [f[0] for f in pronoun_words_freq]
    y = [f[1] for f in pronoun_words_freq]

    plt.figure(figsize=(12, 3))
    plt.ylabel('liczba wystąpień')
    plt.bar(x, y)
    plt.savefig(IMAGES_PATH + "/pt-pronouns.png")

    return pronoun_words_freq


def get_years_stats(freqs):
    # Words that look like years (1000-1999).
    years_word_freq = [f for f in freqs.items() if re.findall(r"\b1[0-9]{3}\b", f[0])]
    x = [f[0] for f in years_word_freq]
    y = [f[1] for f in years_word_freq]

    plt.figure(figsize=(12, 3))
    plt.ylabel('liczba wystąpień')
    plt.bar(x, y)
    plt.savefig(IMAGES_PATH + "/pt-years.png")

    return years_word_freq


def get_longest_words(top):
    all_words = list(get_words(file_content))
    deduplicated_word_list = [*set(all_words)]
    deduplicated_word_list.sort(key=len)
    deduplicated_word_list.reverse()
    return deduplicated_word_list[:top]


print("Generating statistics...")

# 10 longest words
print("Calculating 10 longest words...")
print(get_longest_words(10))

# 10 most frequent words in the text
print("Calculating 10 most frequent words in the text...")
rang_freq_with_labels('most-freq-words-10', get_words(file_content), top=10)

# Zipf's law
print("Calculating Zipf's law...")
log_rang_log_freq('zipf-law-words', get_words(file_content))

# Zipf's law for 3-grams
print("Calculating Zipf's law for 3-grams...")
log_rang_log_freq('zipf-law-3grams', get_ngrams(file_content, 3))

# Words breaking Zipf's law
print("Calculating words breaking Zipf's law...")
freq = freq_list(get_words(file_content))
lengths = [*set(len(f[0]) for f in freq.items())]
average_freq = get_average_freq_by_w_len(freq, lengths)
get_low_high_freq_by_w_len(freq, lengths, average_freq)

# Frequency of pronouns
print("Calculating frequency of pronouns...")
get_pronouns_stats(freq)

# Number of years in words
print("Calculating number of years in words...")
get_years_stats(freq)

print("Done")
Lab3/DrzewoHuffmana.png (binary, new file; After: Size 149 KiB)
Lab3/files_bin/own_corpus.bin (binary, new file)

Lab3/files_bin/own_corpus_codetable.bin (new file, 1 line)

@@ -0,0 +1 @@
{'u': (5, 0), 'k': (8, 8), 'x': (8, 9), '2': (7, 5), 'q': (9, 24), '8': (9, 25), '5': (9, 26), '-': (9, 27), '1': (7, 7), 's': (4, 1), 'e': (3, 1), 'r': (4, 4), '0': (7, 40), '6': (9, 164), _EOF: (12, 1320), '_': (12, 1321), 'z': (11, 661), 'j': (10, 331), '7': (9, 166), '4': (9, 167), 'w': (7, 42), 'v': (7, 43), 'd': (5, 11), 'h': (5, 12), 'g': (6, 26), 'y': (7, 54), '9': (9, 220), '\n': (9, 221), '(': (8, 111), 'n': (4, 7), 'o': (4, 8), 'a': (4, 9), 'l': (5, 20), 'c': (5, 21), 'i': (4, 11), 't': (4, 12), ')': (8, 208), '3': (9, 418), '/': (9, 419), 'b': (7, 105), 'm': (6, 53), 'f': (6, 54), 'p': (6, 55), ' ': (3, 7)}
Lab3/files_bin/random_text_geometric_distribution.bin (binary, new file)

Lab3/files_bin/random_text_geometric_distribution_codetable.bin (new file, 1 line)

@@ -0,0 +1 @@
{'p': (9, 0), 't': (11, 4), 'y': (13, 20), 'A': (15, 84), 'C': (15, 85), 'z': (14, 43), 'v': (12, 11), 'r': (10, 3), 'n': (8, 1), 'l': (7, 1), 'j': (6, 1), 'h': (5, 1), 'f': (4, 1), 'd': (3, 1), 'b': (2, 1), 'D': (16, 32768), 'H': (16, 32769), 'M': (16, 32770), _EOF: (17, 65542), 'B': (17, 65543), 'x': (14, 8193), 'w': (13, 4097), 'u': (12, 2049), 's': (11, 1025), 'q': (10, 513), 'o': (9, 257), 'm': (8, 129), 'k': (7, 65), 'i': (6, 33), 'g': (5, 17), 'e': (4, 9), 'c': (3, 5), 'a': (2, 3)}
Lab3/files_bin/random_text_uniform_distribution.bin (binary, new file)

Lab3/files_bin/random_text_uniform_distribution_codetable.bin (new file, 1 line)

@@ -0,0 +1 @@
{'q': (5, 0), _EOF: (7, 4), 'y': (7, 5), 'f': (6, 3), 't': (6, 4), 'N': (6, 5), 'M': (6, 6), 'U': (6, 7), '0': (6, 8), '2': (6, 9), 'K': (6, 10), '9': (6, 11), 'A': (6, 12), 'm': (6, 13), '1': (6, 14), 'J': (6, 15), 'z': (6, 16), 'S': (6, 17), ' ': (6, 18), 'd': (6, 19), 'Y': (6, 20), 'O': (6, 21), 'x': (6, 22), '4': (6, 23), 'k': (6, 24), 'D': (6, 25), 'E': (6, 26), 'i': (6, 27), 'p': (6, 28), 'P': (6, 29), 'G': (6, 30), 'C': (6, 31), 'o': (6, 32), 'F': (6, 33), 'V': (6, 34), 'j': (6, 35), 'w': (6, 36), 'Z': (6, 37), 's': (6, 38), 'I': (6, 39), 'L': (6, 40), 'Q': (6, 41), 'r': (6, 42), 'l': (6, 43), 'H': (6, 44), 'T': (6, 45), 'g': (6, 46), 'e': (6, 47), 'B': (6, 48), '6': (6, 49), '5': (6, 50), 'R': (6, 51), 'X': (6, 52), 'b': (6, 53), '3': (6, 54), '8': (6, 55), 'c': (6, 56), 'v': (6, 57), 'a': (6, 58), 'n': (6, 59), '7': (6, 60), 'h': (6, 61), 'W': (6, 62), 'u': (6, 63)}
Lab3/files_bin/random_text_uniform_two_point_05_distribution_codetable.bin (new file, 1 line)

@@ -0,0 +1 @@
{_EOF: (2, 0), '0': (2, 1), '1': (1, 1)}

Lab3/files_bin/random_text_uniform_two_point_09_distribution_codetable.bin (new file, 1 line)

@@ -0,0 +1 @@
{_EOF: (2, 0), '0': (2, 1), '1': (1, 1)}
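Each code table above maps a symbol to a (bit-length, code-value) pair, which is exactly how the notebook below reads them via dahuffman's get_code_table(). A minimal sketch of expanding such a pair into the actual bit string, using the two-point table just shown (the helper name to_bits is illustrative, not part of the repo):

```python
# Minimal sketch: turn a saved code table of symbol -> (bits, value)
# pairs into readable bit strings. The example table is the two-point
# one above, with the _EOF sentinel left out for brevity.
code_table = {'1': (1, 1), '0': (2, 1)}

def to_bits(value, nbits):
    # bin(1) == '0b1'; drop the '0b' prefix and left-pad to nbits digits.
    return bin(value)[2:].zfill(nbits)

for symbol, (nbits, value) in code_table.items():
    print(repr(symbol), '->', to_bits(value, nbits))
# '1' -> 1
# '0' -> 01  (the frequent symbol gets the shorter code)
```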

Lab3/files_tar/own_corpus.tar.gz (binary, new file)
Lab3/files_tar/random_text_geometric_distribution.tar.gz (binary, new file)
Lab3/files_tar/random_text_uniform_distribution.tar.gz (binary, new file)

Lab3/files_txt/own_corpus.txt (new file, 1978 lines)
Lab3/files_txt/random_text_geometric_distribution.txt (new file, 1 line)
Lab3/files_txt/random_text_uniform_distribution.txt (new file, 1 line)
Lab3/lab3_solution.ipynb (new file, 509 lines)

@@ -0,0 +1,509 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Task 1"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generating the files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import string\n",
    "import os\n",
    "\n",
    "# Set the length of the string to generate\n",
    "string_length = 1000000\n",
    "\n",
    "# Define the character set to choose from\n",
    "character_set = np.array(list(string.ascii_letters + string.digits + \" \"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.makedirs(\"./files_txt\", exist_ok=True)\n",
    "os.makedirs(\"./files_tar\", exist_ok=True)\n",
    "os.makedirs(\"./files_bin\", exist_ok=True)\n",
    "\n",
    "with open(\"../Lab1/out-merged.txt\", 'r') as file:\n",
    "    file_content = file.read()\n",
    "    first_chars = file_content[:string_length]\n",
    "\n",
    "    with open(\"files_txt/own_corpus.txt\", 'w') as f:\n",
    "        f.write(first_chars)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate a random string using the uniform distribution\n",
    "random_indices = np.random.uniform(low=0, high=len(character_set), size=string_length).astype(int)\n",
    "random_characters = [character_set[i % len(character_set)] for i in random_indices]\n",
    "random_string = ''.join(random_characters)\n",
    "\n",
    "with open('files_txt/random_text_uniform_distribution.txt', 'w') as f:\n",
    "    f.write(random_string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate a random string using the geometric distribution\n",
    "p = 0.3\n",
    "random_integers = np.random.geometric(p, 100000)\n",
    "random_indices = [i - 1 for i in random_integers]\n",
    "random_characters = [character_set[i % len(character_set)] for i in random_indices]\n",
    "random_string = ''.join(random_characters)\n",
    "\n",
    "with open('files_txt/random_text_geometric_distribution.txt', 'w') as f:\n",
    "    f.write(random_string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate a random string using a two-point distribution with p=0.5\n",
    "character_set = np.array(list('01'))\n",
    "random_indices = np.random.choice([0, len(character_set)-1], size=string_length, p=[0.5, 0.5])\n",
    "random_string = ''.join(character_set[random_indices])\n",
    "\n",
    "with open('files_txt/random_text_uniform_two_point_05_distribution.txt', 'w') as f:\n",
    "    f.write(random_string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate a random string using a two-point distribution with p=0.9\n",
    "character_set = np.array(list('01'))\n",
    "random_indices = np.random.choice([0, len(character_set)-1], size=string_length, p=[0.1, 0.9])\n",
    "random_string = ''.join(character_set[random_indices])\n",
    "\n",
    "with open('files_txt/random_text_uniform_two_point_09_distribution.txt', 'w') as f:\n",
    "    f.write(random_string)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compress files to .tar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Compression complete. The compressed archive is saved as files_tar/own_corpus.tar.gz.\n",
      "Compression ratio: 4.597193872860006\n",
      "Compression complete. The compressed archive is saved as files_tar/random_text_geometric_distribution.tar.gz.\n",
      "Compression ratio: 2.238588793624499\n",
      "Compression complete. The compressed archive is saved as files_tar/random_text_uniform_distribution.tar.gz.\n",
      "Compression ratio: 1.3254407753298358\n",
      "Compression complete. The compressed archive is saved as files_tar/random_text_uniform_two_point_05_distribution.tar.gz.\n",
      "Compression ratio: 6.656282865396648\n",
      "Compression complete. The compressed archive is saved as files_tar/random_text_uniform_two_point_09_distribution.tar.gz.\n",
      "Compression ratio: 12.23555898151207\n"
     ]
    }
   ],
   "source": [
    "import tarfile\n",
    "import os\n",
    "\n",
    "def compress_file(file_name):\n",
    "    output_archive_name = \"files_tar/\" + file_name.split('/')[1].replace('.txt', '.tar.gz')\n",
    "    with tarfile.open(output_archive_name, 'w:gz') as tar:\n",
    "        tar.add(file_name)\n",
    "\n",
    "    print(f'Compression complete. The compressed archive is saved as {output_archive_name}.')\n",
    "    print(f'Compression ratio: {os.path.getsize(file_name) / os.path.getsize(output_archive_name)}')\n",
    "\n",
    "\n",
    "file_names = [\"files_txt/\" + f for f in os.listdir('files_txt') if f.endswith('.txt')]\n",
    "file_names.sort()\n",
    "for file in file_names:\n",
    "    compress_file(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Entropy for files_txt/own_corpus.txt: 1.754256\n",
      "Entropy for files_txt/random_text_geometric_distribution.txt: 3.56064\n",
      "Entropy for files_txt/random_text_uniform_distribution.txt: 6.0336\n",
      "Entropy for files_txt/random_text_uniform_two_point_05_distribution.txt: 1.274304\n",
      "Entropy for files_txt/random_text_uniform_two_point_09_distribution.txt: 0.75892\n"
     ]
    }
   ],
   "source": [
    "import zlib\n",
    "\n",
    "def entropy_by_compression(t):\n",
    "    compressed = zlib.compress(t.encode('utf-8'))\n",
    "    return 8 * len(compressed) / len(t)\n",
    "\n",
    "for file in file_names:\n",
    "    print(f\"Entropy for {file}: {entropy_by_compression(open(file, 'r').read())}\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate Huffman code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Calculating Huffman code for file: files_txt/own_corpus.txt...\n",
      "First 3: r e s\n",
      "Binary: 0100 001 0001\n",
      "Calculating Huffman code for file: files_txt/random_text_geometric_distribution.txt...\n",
      "First 3: d d d\n",
      "Binary: 001 001 001\n",
      "Calculating Huffman code for file: files_txt/random_text_uniform_distribution.txt...\n",
      "First 3: Q l M\n",
      "Binary: 101001 101011 000110\n",
      "Calculating Huffman code for file: files_txt/random_text_uniform_two_point_05_distribution.txt...\n",
      "First 3: 0 0 0\n",
      "Binary: 01 01 01\n",
      "Calculating Huffman code for file: files_txt/random_text_uniform_two_point_09_distribution.txt...\n",
      "First 3: 0 1 1\n",
      "Binary: 01 1 1\n"
     ]
    }
   ],
   "source": [
    "from dahuffman import HuffmanCodec\n",
    "\n",
    "def encode_and_print(text):\n",
    "    codec = HuffmanCodec.from_data(text)\n",
    "    encoded = codec.encode(text)\n",
    "    table = codec.get_code_table()\n",
    "    table_str = str(table)\n",
    "\n",
    "    first_3_letters = first_n_decoded_digits(encoded, codec, 3)\n",
    "    print(\"First 3:\", end=' ')\n",
    "    print(' '.join(first_3_letters))\n",
    "    print(\"Binary: \", end=' ')\n",
    "    print(' '.join(number_to_bin(table[letter][1], table[letter][0]) for letter in first_3_letters))\n",
    "\n",
    "    return encoded, table_str\n",
    "\n",
    "def first_n_decoded_digits(encoded, codec, n):\n",
    "    decoded = codec.decode(encoded)\n",
    "    return decoded[:n]\n",
    "\n",
    "def save_to_bin(data, file_name):\n",
    "    with open(\"files_bin/\" + file_name.split('/')[1], 'wb') as f:\n",
    "        f.write(data)\n",
    "\n",
    "def number_to_bin(number, nbits):\n",
    "    return bin(number)[2:].zfill(nbits)\n",
    "\n",
    "for file in file_names:\n",
    "    print(f\"Calculating Huffman code for file: {file}...\")\n",
    "    encoded, code_table = encode_and_print(open(file, 'r').read())\n",
    "    save_to_bin(encoded, file.replace('.txt', '.bin'))\n",
    "    save_to_bin(code_table.encode(), file.replace('.txt', '_codetable.bin'))\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compare file sizes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Size of files_txt/own_corpus.txt: 1000000 bytes, 8000000 bits\n",
      "Size of files_txt/random_text_geometric_distribution.txt: 100000 bytes, 800000 bits\n",
      "Size of files_txt/random_text_uniform_distribution.txt: 1000000 bytes, 8000000 bits\n",
      "Size of files_txt/random_text_uniform_two_point_05_distribution.txt: 1000000 bytes, 8000000 bits\n",
      "Size of files_txt/random_text_uniform_two_point_09_distribution.txt: 1000000 bytes, 8000000 bits\n",
      "********************************************************************************\n",
      "Size of files_tar/own_corpus.tar.gz: 217524 bytes, 1740192 bits\n",
      "Size of files_tar/random_text_geometric_distribution.tar.gz: 44671 bytes, 357368 bits\n",
      "Size of files_tar/random_text_uniform_distribution.tar.gz: 754466 bytes, 6035728 bits\n",
      "Size of files_tar/random_text_uniform_two_point_05_distribution.tar.gz: 150234 bytes, 1201872 bits\n",
      "Size of files_tar/random_text_uniform_two_point_09_distribution.tar.gz: 81729 bytes, 653832 bits\n",
      "********************************************************************************\n",
      "Size of files_txt/own_corpus.txt + codetable: 544399 bytes, 548781 bits\n",
      "Size of files_txt/random_text_geometric_distribution.txt + codetable: 37569 bytes, 41020 bits\n",
      "Size of files_txt/random_text_uniform_distribution.txt + codetable: 750822 bytes, 757031 bits\n",
      "Size of files_txt/random_text_uniform_two_point_05_distribution.txt + codetable: 187501 bytes, 187781 bits\n",
      "Size of files_txt/random_text_uniform_two_point_09_distribution.txt + codetable: 137499 bytes, 137779 bits\n"
     ]
    }
   ],
   "source": [
    "# Print raw text file sizes\n",
    "for file in file_names:\n",
    "    print(f\"Size of {file}: {os.path.getsize(file)} bytes, {os.path.getsize(file)*8} bits\")\n",
    "\n",
    "print(\"*\" * 80)\n",
    "\n",
    "# Print gzip-compressed file sizes\n",
    "for file in file_names:\n",
    "    file = file.replace('.txt', '.tar.gz').replace('files_txt', 'files_tar')\n",
    "    print(f\"Size of {file}: {os.path.getsize(file)} bytes, {os.path.getsize(file)*8} bits\")\n",
    "\n",
    "print(\"*\" * 80)\n",
    "\n",
    "# Print Huffman-compressed file sizes (encoded data + code table)\n",
    "for file in file_names:\n",
    "    file1 = file.replace('.txt', '.bin').replace('files_txt', 'files_bin')\n",
    "    file2 = file.replace('.txt', '_codetable.bin').replace('files_txt', 'files_bin')\n",
    "    print(f\"Size of {file} + codetable: {os.path.getsize(file1) + os.path.getsize(file2)} bytes, {(os.path.getsize(file1) + os.path.getsize(file2)) * 8} bits\")\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Entropy\n",
    "\n",
    "| | Entropy |\n",
    "| ----------- | ----------- |\n",
    "| natural-language text | 1.754256 |\n",
    "| random text (uniform) | 6.033632 |\n",
    "| random text (geometric) | 3.5624 |\n",
    "| random text (two-point 0.5) | 1.273352 |\n",
    "| random text (two-point 0.9) | 0.761152 |\n"
   ]
  },
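  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For reference, the theoretical per-character entropies follow directly from the generating distributions: a two-point source has $H(p) = -p\\log_2 p - (1-p)\\log_2 (1-p)$, so $H(0.5) = 1$ and $H(0.9) \\approx 0.469$ bits/char, while a uniform source over the 63-character alphabet has $\\log_2 63 \\approx 5.98$ bits/char. The zlib-based estimates above (1.27, 0.76 and 6.03) sit above these values because the compressed stream also carries compressor overhead, so this method over-estimates the entropy, most visibly for the lowest-entropy source."
   ]
  },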
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Sizes in bits:\n",
    "\n",
    "| | Uncompressed file | Compressed file (zip, tar, ...) | Compressed file + code table |\n",
    "| ----------- | ----------- |-----------|----------- |\n",
    "| natural-language text |54358422*8|12130821*8|29452163*8|\n",
    "| random text (uniform) |1000000*8|752307*8|748756*8|\n",
    "| random text (geometric)|1000000*8|44629*8|37535*8|\n",
    "| random text (two-point 0.5)|1000000*8|150394*8|187520*8|\n",
    "| random text (two-point 0.9)|1000000*8|82011*8|137559*8|"
   ]
  },
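  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Two quick sanity checks on these numbers, derived from the sizes printed above: for the two-point 0.9 source the Huffman code assigns 1 bit to '1' and 2 bits to '0', so the expected cost is $0.9 \\cdot 1 + 0.1 \\cdot 2 = 1.1$ bits/char, about 137.5 kB for $10^6$ characters, in line with the reported sizes. For own_corpus.txt the Huffman output works out to $544399 \\cdot 8 / 10^6 \\approx 4.36$ bits/char versus $217524 \\cdot 8 / 10^6 \\approx 1.74$ bits/char for gzip: a symbol-by-symbol code cannot beat the unigram entropy bound ($H \\le L < H + 1$), while gzip also exploits repetition across characters."
   ]
  },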
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Conclusions:"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- Natural-language text is encoded the least optimally.\n",
    "- Huffman coding is the most effective for the two-point 0.9 distribution, because a small group of characters has a very high probability of occurring and vice versa.\n",
    "- .tar compression pays off more for natural language.\n",
    "- For the random texts, the lower the entropy, the more effective the compression.\n",
    "- Random text (uniform distribution) has a higher entropy than natural-language text."
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Task 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Entropy for words in files_txt/own_corpus.txt: 9.27320212652544\n",
      "Entropy for words in files_txt/random_text_geometric_distribution.txt: -0.0\n",
      "Entropy for words in files_txt/random_text_uniform_distribution.txt: 13.889640822372847\n",
      "Entropy for words in files_txt/random_text_uniform_two_point_05_distribution.txt: -0.0\n",
      "Entropy for words in files_txt/random_text_uniform_two_point_09_distribution.txt: -0.0\n"
     ]
    }
   ],
   "source": [
    "import regex as re\n",
    "from collections import Counter\n",
    "from math import log\n",
    "\n",
    "def get_words(t):\n",
    "    for m in re.finditer(r'[\\p{L}0-9\\*]+', t):\n",
    "        yield m.group(0)\n",
    "\n",
    "def unigram_entropy(t):\n",
    "    counter = Counter(t)\n",
    "    total = sum(counter.values())\n",
    "    return -sum((p := count / total) * log(p, 2) for count in counter.values())\n",
    "\n",
    "for file in file_names:\n",
    "    print(f\"Entropy for words in {file}: {unigram_entropy(get_words(open(file, 'r').read()))}\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Entropy\n",
    "\n",
    "| | Entropy |\n",
    "| ----------- | ----------- |\n",
    "| natural-language text |9.27320212652544|\n",
    "| random text (uniform) | 13.897625675701356 |\n",
    "| random text (geometric)| 0 |\n"
   ]
  },
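  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `-0.0` values are a floating-point artifact rather than an error: the corpora without spaces tokenize into a single word, so that one symbol has probability $p = 1$ and the entropy $-1 \\cdot \\log_2 1 = 0$ comes out as negative zero."
   ]
  },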
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Sizes in bits:\n",
    "\n",
    "| | Uncompressed file | Compressed file (zip, tar, ...) | Compressed file + code table |\n",
    "| ----------- | ----------- |-----------|----------- |\n",
    "| natural-language text |54358422*8|12130821*8|29452163*8|\n",
    "| random text (uniform) |1000000*8|752307*8|748756*8|\n",
    "| random text (geometric)|1000000*8|44629*8|37535*8|\n",
    "| random text (two-point 0.5)|1000000*8|150394*8|187520*8|\n",
    "| random text (two-point 0.9)|1000000*8|82011*8|137559*8|"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Conclusions:\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- Corpora without spaces (which contain only a single word) take up only one byte.\n",
    "- Corpora without spaces have a larger code table than the uncompressed file.\n",
    "- Compressing on words appears to be worse than on characters because of the huge code table.\n",
    "- In natural language the same words repeat more often than in random (uniform) text.\n",
    "- Huffman coding on words makes no sense for files containing a single word."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Task 3"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![title](DrzewoHuffmana.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "ai_env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.15"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}