lab3
@ -1,6 +1,6 @@
|
|||||||
import pandas
|
import pandas
|
||||||
import regex as re
|
import regex as re
|
||||||
import argparse, sys
|
import argparse
|
||||||
|
|
||||||
parser=argparse.ArgumentParser()
|
parser=argparse.ArgumentParser()
|
||||||
parser.add_argument("--filepath",)
|
parser.add_argument("--filepath",)
|
||||||
@ -19,15 +19,15 @@ def filter_line(line):
|
|||||||
return line is not None and len(line) > 30 and is_letter_sentence(line) and is_asci(line)
|
return line is not None and len(line) > 30 and is_letter_sentence(line) and is_asci(line)
|
||||||
|
|
||||||
def clean_with_regex(text):
    """Split `text` into fragments and replace numbering/punctuation noise.

    The text is split on whitespace that follows "<two characters>." and
    precedes "(<digits>)", "<digit>." or "Article <digits>". Fragments
    rejected by ``filter_line`` are dropped, numbering markers and characters
    outside the allowed set are replaced with a single space, and the last
    remaining fragment is discarded.

    Args:
        text: string to clean.

    Returns:
        List of cleaned fragments (possibly empty). An empty list is also
        returned when ``re.split`` raises ``TypeError`` for `text`.
    """
    split_pattern = r"(?<=..\.)(\s+)(?=\(\d+\))|(?<=..\.)(\s+)(?=\d\.)|(?<=..\.)(\s+)(?=Article \d+)"
    numbering_pattern = r"(?<=\d)(\(\d+\))(?=\s+)|(\(\d+\)\s+)|(\d+\.)+\s"
    noise_pattern = r"[^\w\d\s\\\)\(\/-]|[^\x00-\x7F]|ex\d+"

    try:
        fragments = re.split(split_pattern, text)
    except TypeError:
        return []

    # The split pattern contains capturing groups, so the result also holds
    # the matched whitespace and None placeholders for groups that did not
    # take part in a match; filter_line is relied on to reject those.
    cleaned = [
        re.sub(noise_pattern, " ", re.sub(numbering_pattern, " ", fragment))
        for fragment in fragments
        if filter_line(fragment)
    ]

    # The final fragment is always discarded.
    # NOTE(review): the reason is not visible in this file - confirm intent.
    if cleaned:
        cleaned.pop()
    return cleaned
|
||||||
|
@ -1,5 +1,11 @@
|
|||||||
# Statystyki
|
# Statystyki
|
||||||
|
|
||||||
|
## Uruchomienie skryptu
|
||||||
|
|
||||||
|
Należy uruchomić skrypt statistics.py. Program zapisuje wygenerowane wykresy (pliki PNG) w folderze images, znajdującym się obok skryptu.
|
||||||
|
|
||||||
|
```python statistics.py --filepath {sciezka_do_pliku}```
|
||||||
|
|
||||||
## Statystyki podstawowe
|
## Statystyki podstawowe
|
||||||
|
|
||||||
### 10 najdłuższych słów
|
### 10 najdłuższych słów
|
||||||
|
Before Width: | Height: | Size: 14 KiB After Width: | Height: | Size: 12 KiB |
Before Width: | Height: | Size: 14 KiB After Width: | Height: | Size: 13 KiB |
Before Width: | Height: | Size: 16 KiB After Width: | Height: | Size: 16 KiB |
Before Width: | Height: | Size: 17 KiB After Width: | Height: | Size: 17 KiB |
@ -3,14 +3,22 @@ from collections import Counter
|
|||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
import regex as re
|
import regex as re
|
||||||
from math import log
|
from math import log
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
|
||||||
file_path = "Lab1/out-merged.txt"
|
parser=argparse.ArgumentParser()
|
||||||
|
parser.add_argument("--filepath")
|
||||||
|
args=parser.parse_args()
|
||||||
|
|
||||||
|
FILE_PATH = "Lab1/out-merged.txt" if args.filepath is None else args.filepath
|
||||||
|
IMAGES_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "images")
|
||||||
file_content = None
|
file_content = None
|
||||||
|
|
||||||
# Load the whole corpus into memory; the `with` block closes the file on exit,
# so no explicit close() call is needed.
with open(FILE_PATH, 'r') as file:
    file_content = file.read()
|
||||||
|
|
||||||
# file_content = file_content[:100]
|
# file_content = file_content[:10000000]
|
||||||
|
|
||||||
def get_characters(t):
|
def get_characters(t):
|
||||||
yield from t
|
yield from t
|
||||||
@ -36,9 +44,9 @@ def rang_freq_with_labels(name, g, top=None):
|
|||||||
plt.ylabel('liczba wystąpień')
|
plt.ylabel('liczba wystąpień')
|
||||||
plt.bar(freq.keys(), freq.values())
|
plt.bar(freq.keys(), freq.values())
|
||||||
|
|
||||||
fname = f'Lab2/images/{name}.png'
|
fname = f'/{name}.png'
|
||||||
|
|
||||||
plt.savefig(fname)
|
plt.savefig(IMAGES_PATH + fname)
|
||||||
|
|
||||||
return fname
|
return fname
|
||||||
|
|
||||||
@ -48,9 +56,9 @@ def log_rang_log_freq(name, g):
|
|||||||
plt.figure().clear()
|
plt.figure().clear()
|
||||||
plt.plot([log(x) for x in range(1, len(freq.values())+1)], [log(y) for y in freq.values()])
|
plt.plot([log(x) for x in range(1, len(freq.values())+1)], [log(y) for y in freq.values()])
|
||||||
|
|
||||||
fname = f'Lab2/images/{name}.png'
|
fname = f'/{name}.png'
|
||||||
|
|
||||||
plt.savefig(fname)
|
plt.savefig(IMAGES_PATH + fname)
|
||||||
|
|
||||||
return fname
|
return fname
|
||||||
|
|
||||||
@ -67,15 +75,15 @@ def get_ngrams(t, size):
|
|||||||
for m in ngrams(word, size):
|
for m in ngrams(word, size):
|
||||||
yield m
|
yield m
|
||||||
|
|
||||||
def get_w_freq_by_w_len(freq, word_len):
    """
    Yields (count, word) for every word in `freq` whose length equals `word_len`.

    Args:
        freq: mapping of word -> number of occurrences.
        word_len: exact word length to select.

    Yields:
        (count, word) tuples, in the iteration order of `freq`.
    """
    for word, count in freq.items():
        if len(word) == word_len:
            yield (count, word)
|
||||||
|
|
||||||
def get_average_freq_by_w_len(word_lenghts):
|
def get_average_freq_by_w_len(freq, word_lenghts):
|
||||||
results = dict()
|
results = dict()
|
||||||
for l in word_lenghts:
|
for l in word_lenghts:
|
||||||
word_freq = list(get_w_freq_by_w_len(l))
|
word_freq = list(get_w_freq_by_w_len(freq, l))
|
||||||
if len(word_freq) == 0:
|
if len(word_freq) == 0:
|
||||||
continue
|
continue
|
||||||
average = sum([w[0] for w in word_freq]) / len(word_freq)
|
average = sum([w[0] for w in word_freq]) / len(word_freq)
|
||||||
@ -83,20 +91,20 @@ def get_average_freq_by_w_len(word_lenghts):
|
|||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def get_low_high_freq_by_w_len(freq, word_lenghts, average_freq):
    """
    Returns top 5 most frequent and non frequent words for each word length + average frequency.

    Words containing a digit and words occurring 30 times or fewer are ignored.

    Args:
        freq: mapping of word -> number of occurrences.
        word_lenghts: iterable of word lengths to report on.
        average_freq: mapping of word length -> average frequency; it must
            contain every length in `word_lenghts` (a missing one raises KeyError).

    Returns:
        A list with one dict per length, with the keys 'word_len',
        'average_freq', 'low_freq' and 'high_freq'. The last two are lists of
        (count, word) tuples sorted by ascending count; they overlap when
        fewer than 10 words qualify.
    """
    results = []
    for length in word_lenghts:
        # Ascending sort on (count, word): the head of the list holds the
        # rarest words and the tail the most frequent ones.
        word_freq = sorted(get_w_freq_by_w_len(freq, length))
        word_freq = [
            (count, word)
            for count, word in word_freq
            if count > 30 and not re.search(r"\d", str(word))
        ]
        results.append({
            'word_len': length,
            'average_freq': average_freq[length],
            'low_freq': word_freq[:5],
            'high_freq': word_freq[-5:]
        })
    return results
|
||||||
@ -111,7 +119,7 @@ def get_pronouns_stats(freqs):
|
|||||||
plt.figure(figsize=(12, 3))
|
plt.figure(figsize=(12, 3))
|
||||||
plt.ylabel('liczba wystąpień')
|
plt.ylabel('liczba wystąpień')
|
||||||
plt.bar(x, y)
|
plt.bar(x, y)
|
||||||
plt.savefig("Lab2/images/pt-pronouns.png")
|
plt.savefig(IMAGES_PATH + "/pt-pronouns.png")
|
||||||
|
|
||||||
return pronoun_words_freq
|
return pronoun_words_freq
|
||||||
|
|
||||||
@ -123,31 +131,48 @@ def get_years_stats(freqs):
|
|||||||
plt.figure(figsize=(12, 3))
|
plt.figure(figsize=(12, 3))
|
||||||
plt.ylabel('liczba wystąpień')
|
plt.ylabel('liczba wystąpień')
|
||||||
plt.bar(x, y)
|
plt.bar(x, y)
|
||||||
plt.savefig("Lab2/images/pt-years.png")
|
plt.savefig(IMAGES_PATH + "/pt-years.png")
|
||||||
|
|
||||||
return years_word_freq
|
return years_word_freq
|
||||||
|
|
||||||
|
def get_longest_words(top):
    """
    Returns the `top` longest distinct words of the loaded corpus.

    Reads the module-level `file_content` and tokenizes it with `get_words`.
    Words are ordered longest first; words of equal length are ordered
    alphabetically so that repeated runs produce the same list.

    Args:
        top: maximum number of words to return.

    Returns:
        A list of at most `top` distinct words.
    """
    unique_words = set(get_words(file_content))
    # Iterating a set of strings has no stable order between runs, so the
    # explicit (-length, word) key is what makes the result reproducible.
    return sorted(unique_words, key=lambda word: (-len(word), word))[:top]
|
||||||
|
|
||||||
print("Generating statistics...")
|
print("Generating statistics...")
|
||||||
|
|
||||||
|
# 10 longest words
|
||||||
|
print("Calculating 10 longest words...")
|
||||||
|
print(get_longest_words(10))
|
||||||
|
|
||||||
# 10 most frequent words in the text
|
# 10 most frequent words in the text
|
||||||
rang_freq_with_labels('most-freq-words-20', get_words(file_content), top=20)
|
print("Calculating 10 most frequent words in the text...")
|
||||||
|
rang_freq_with_labels('most-freq-words-10', get_words(file_content), top=10)
|
||||||
|
|
||||||
# Zipf's law
|
# Zipf's law
|
||||||
|
print("Calculating Zipf's law...")
|
||||||
log_rang_log_freq('zipf-law-words', get_words(file_content))
|
log_rang_log_freq('zipf-law-words', get_words(file_content))
|
||||||
|
|
||||||
# Zipf's law for 3-grams
|
# Zipf's law for 3-grams
|
||||||
|
print("Calculating Zipf's law for 3-grams...")
|
||||||
log_rang_log_freq('zipf-law-3grams', get_ngrams(file_content, 3))
|
log_rang_log_freq('zipf-law-3grams', get_ngrams(file_content, 3))
|
||||||
|
|
||||||
# Words breaking the Zipf's law
|
# Words breaking the Zipf's law
|
||||||
|
print("Calculating words breaking the Zipf's law...")
|
||||||
freq = freq_list(get_words(file_content))
|
freq = freq_list(get_words(file_content))
|
||||||
lenghts = [*set(len(f[0]) for f in freq.items())]
|
lenghts = [*set(len(f[0]) for f in freq.items())]
|
||||||
average_freq = get_average_freq_by_w_len(lenghts)
|
average_freq = get_average_freq_by_w_len(freq, lenghts)
|
||||||
get_low_high_freq_by_w_len(lenghts)
|
get_low_high_freq_by_w_len(freq, lenghts, average_freq)
|
||||||
|
|
||||||
# Frequency of pronouns
|
# Frequency of pronouns
|
||||||
|
print("Calculating frequency of pronouns...")
|
||||||
get_pronouns_stats(freq)
|
get_pronouns_stats(freq)
|
||||||
|
|
||||||
print("Done")
|
|
||||||
|
|
||||||
# Number of years in words
|
# Number of years in words
|
||||||
get_years_stats(freq)
|
print("Calculating number of years in words...")
|
||||||
|
get_years_stats(freq)
|
||||||
|
|
||||||
|
print("Done")
|
BIN
Lab3/DrzewoHuffmana.png
Normal file
After Width: | Height: | Size: 149 KiB |
409
Lab3/lab3_solution.ipynb
Normal file
@ -0,0 +1,409 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Zadanie 1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Generowanie plików"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 100,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import string\n",
|
||||||
|
"\n",
|
||||||
|
"# Set the length of the string to generate\n",
|
||||||
|
"string_length = 1000000\n",
|
||||||
|
"\n",
|
||||||
|
"# Define the character set to choose from\n",
|
||||||
|
"character_set = np.array(list(string.ascii_letters + string.digits))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 101,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"with open(\"../Lab1/out-merged.txt\", 'r') as file:\n",
|
||||||
|
" file_content = file.read()\n",
|
||||||
|
" first_chars = file_content[:string_length]\n",
|
||||||
|
"\n",
|
||||||
|
" with open(\"./own_corpus.txt\", 'w') as f:\n",
|
||||||
|
" f.write(first_chars)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 102,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Generate the random string using uniform distribution\n",
|
||||||
|
"random_indices = np.random.uniform(low=0, high=len(character_set), size=string_length).astype(int)\n",
|
||||||
|
"random_string = ''.join(character_set[random_indices])\n",
|
||||||
|
"\n",
|
||||||
|
"with open('random_text_uniform_distribution.txt', 'w') as f:\n",
|
||||||
|
" f.write(random_string)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 103,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Generate the random string using geometric distribution\n",
|
||||||
|
"p = 0.3\n",
|
||||||
|
"# Draw string_length samples so this text has the same length as the other generated texts\n",
"random_integers = np.random.geometric(p, string_length)\n",
|
||||||
|
"random_indices = [i - 1 for i in random_integers]\n",
|
||||||
|
"random_characters = [character_set[i % len(character_set)] for i in random_indices]\n",
|
||||||
|
"random_string = ''.join(random_characters)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"with open('random_text_geometric_distribution.txt', 'w') as f:\n",
|
||||||
|
" f.write(random_string)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 104,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Generate the random string using uniform two-point distribution with p=0.5\n",
|
||||||
|
"character_set = np.array(list('01'))\n",
|
||||||
|
"random_indices = np.random.choice([0, len(character_set)-1], size=string_length, p=[0.5, 0.5])\n",
|
||||||
|
"random_string = ''.join(character_set[random_indices])\n",
|
||||||
|
"\n",
|
||||||
|
"with open('random_text_uniform_two_point_05_distribution.txt', 'w') as f:\n",
|
||||||
|
" f.write(random_string)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 105,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Generate the random string using uniform two-point distribution with p=0.9\n",
|
||||||
|
"character_set = np.array(list('01'))\n",
|
||||||
|
"random_indices = np.random.choice([0, len(character_set)-1], size=string_length, p=[0.1, 0.9])\n",
|
||||||
|
"random_string = ''.join(character_set[random_indices])\n",
|
||||||
|
"\n",
|
||||||
|
"with open('random_text_uniform_two_point_09_distribution.txt', 'w') as f:\n",
|
||||||
|
" f.write(random_string)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Compress files to .tar"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 106,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Compression complete. The compressed archive is saved as own_corpus.tar.gz.\n",
|
||||||
|
"Compression ratio: 4.59738408845367\n",
|
||||||
|
"Compression complete. The compressed archive is saved as random_text_uniform_distribution.tar.gz.\n",
|
||||||
|
"Compression ratio: 1.3293011199361935\n",
|
||||||
|
"Compression complete. The compressed archive is saved as random_text_geometric_distribution.tar.gz.\n",
|
||||||
|
"Compression ratio: 2.2415996054784695\n",
|
||||||
|
"Compression complete. The compressed archive is saved as random_text_uniform_two_point_05_distribution.tar.gz.\n",
|
||||||
|
"Compression ratio: 6.6557955339611965\n",
|
||||||
|
"Compression complete. The compressed archive is saved as random_text_uniform_two_point_09_distribution.tar.gz.\n",
|
||||||
|
"Compression ratio: 12.250398137939483\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import tarfile\n",
|
||||||
|
"import os\n",
|
||||||
|
"\n",
|
||||||
|
"def compress_file(file_name):\n",
|
||||||
|
" output_archive_name = file_name.replace('.txt', '.tar.gz')\n",
|
||||||
|
" with tarfile.open(output_archive_name, 'w:gz') as tar:\n",
|
||||||
|
" tar.add(file_name)\n",
|
||||||
|
"\n",
|
||||||
|
" print(f'Compression complete. The compressed archive is saved as {output_archive_name}.')\n",
|
||||||
|
" print(f'Compression ratio: {os.path.getsize(file_name) / os.path.getsize(output_archive_name)}')\n",
|
||||||
|
"\n",
|
||||||
|
"file_names = ['own_corpus.txt', 'random_text_uniform_distribution.txt', 'random_text_geometric_distribution.txt', 'random_text_uniform_two_point_05_distribution.txt', 'random_text_uniform_two_point_09_distribution.txt']\n",
|
||||||
|
"for file in file_names:\n",
|
||||||
|
" compress_file(file)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 107,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Entropy for own_corpus.txt: 1.754256\n",
|
||||||
|
"Entropy for random_text_uniform_distribution.txt: 6.016072\n",
|
||||||
|
"Entropy for random_text_geometric_distribution.txt: 3.54952\n",
|
||||||
|
"Entropy for random_text_uniform_two_point_05_distribution.txt: 1.272664\n",
|
||||||
|
"Entropy for random_text_uniform_two_point_09_distribution.txt: 0.761104\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import zlib\n",
|
||||||
|
"\n",
|
||||||
|
"def entropy_by_compression(t):\n",
|
||||||
|
" compressed = zlib.compress(t.encode('utf-8'))\n",
|
||||||
|
" return 8 * len(compressed) / len(t)\n",
|
||||||
|
"\n",
|
||||||
|
"for file in file_names:\n",
|
||||||
|
" print(f\"Entropy for {file}: {entropy_by_compression(open(file, 'r').read())}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Compare file sizes"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 108,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Size of own_corpus.txt: 1000000 bytes, 8000000 bits\n",
|
||||||
|
"Size of random_text_uniform_distribution.txt: 1000000 bytes, 8000000 bits\n",
|
||||||
|
"Size of random_text_geometric_distribution.txt: 100000 bytes, 800000 bits\n",
|
||||||
|
"Size of random_text_uniform_two_point_05_distribution.txt: 1000000 bytes, 8000000 bits\n",
|
||||||
|
"Size of random_text_uniform_two_point_09_distribution.txt: 1000000 bytes, 8000000 bits\n",
|
||||||
|
"********************************************************************************\n",
|
||||||
|
"Size of own_corpus.tar.gz: 217515 bytes, 1740120 bits\n",
|
||||||
|
"Size of random_text_uniform_distribution.tar.gz: 752275 bytes, 6018200 bits\n",
|
||||||
|
"Size of random_text_geometric_distribution.tar.gz: 44611 bytes, 356888 bits\n",
|
||||||
|
"Size of random_text_uniform_two_point_05_distribution.tar.gz: 150245 bytes, 1201960 bits\n",
|
||||||
|
"Size of random_text_uniform_two_point_09_distribution.tar.gz: 81630 bytes, 653040 bits\n",
|
||||||
|
"********************************************************************************\n",
|
||||||
|
"Size of own_corpus.txt + codetable: 544399 bytes, 548781 bits\n",
|
||||||
|
"Size of random_text_uniform_distribution.txt + codetable: 748749 bytes, 754867 bits\n",
|
||||||
|
"Size of random_text_geometric_distribution.txt + codetable: 37470 bytes, 40788 bits\n",
|
||||||
|
"Size of random_text_uniform_two_point_05_distribution.txt + codetable: 187473 bytes, 187753 bits\n",
|
||||||
|
"Size of random_text_uniform_two_point_09_distribution.txt + codetable: 137531 bytes, 137811 bits\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# print raw text files sizes\n",
|
||||||
|
"for file in file_names:\n",
|
||||||
|
" print(f\"Size of {file}: {os.path.getsize(file)} bytes, {os.path.getsize(file)*8} bits\")\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"*\" * 80)\n",
|
||||||
|
"\n",
|
||||||
|
"# print compressed text files sizes\n",
|
||||||
|
"for file in file_names:\n",
|
||||||
|
" file = file.replace('.txt', '.tar.gz')\n",
|
||||||
|
" print(f\"Size of {file}: {os.path.getsize(file)} bytes, {os.path.getsize(file)*8} bits\")\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"*\" * 80)\n",
|
||||||
|
"\n",
|
||||||
|
"# print compressed with Huffman text files sizes\n",
|
||||||
|
"for file in file_names:\n",
|
||||||
|
" file1 = file.replace('.txt', '.bin')\n",
|
||||||
|
" file2 = file.replace('.txt', '_codetable.bin')\n",
|
||||||
|
" # Bits = 8 * total bytes (encoded payload + code table)\n",
" print(f\"Size of {file} + codetable: {os.path.getsize(file1) + os.path.getsize(file2)} bytes, {(os.path.getsize(file1) + os.path.getsize(file2)) * 8} bits\")\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Generate Huffman code"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 109,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Calculating Huffman code for file: own_corpus.txt...\n",
|
||||||
|
"First 3: r e s\n",
|
||||||
|
"Binary: 0100 001 0001\n",
|
||||||
|
"Calculating Huffman code for file: random_text_uniform_distribution.txt...\n",
|
||||||
|
"First 3: H W 8\n",
|
||||||
|
"Binary: 111010 001011 110101\n",
|
||||||
|
"Calculating Huffman code for file: random_text_geometric_distribution.txt...\n",
|
||||||
|
"First 3: b a a\n",
|
||||||
|
"Binary: 01 11 11\n",
|
||||||
|
"Calculating Huffman code for file: random_text_uniform_two_point_05_distribution.txt...\n",
|
||||||
|
"First 3: 0 0 0\n",
|
||||||
|
"Binary: 01 01 01\n",
|
||||||
|
"Calculating Huffman code for file: random_text_uniform_two_point_09_distribution.txt...\n",
|
||||||
|
"First 3: 1 1 1\n",
|
||||||
|
"Binary: 1 1 1\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from dahuffman import HuffmanCodec\n",
|
||||||
|
"\n",
|
||||||
|
"def encode_and_print(text):\n",
|
||||||
|
" codec = HuffmanCodec.from_data(text)\n",
|
||||||
|
" encoded = codec.encode(text)\n",
|
||||||
|
" table = codec.get_code_table()\n",
|
||||||
|
" table_str = str(table)\n",
|
||||||
|
"\n",
|
||||||
|
" first_3_letters = first_n_decoded_digits(encoded, codec, 3)\n",
|
||||||
|
" print(\"First 3:\", end=' ')\n",
|
||||||
|
" print(' '.join(first_3_letters))\n",
|
||||||
|
" print(\"Binary: \", end=' ')\n",
|
||||||
|
" print(' '.join(number_to_bin(table[letter][1], table[letter][0]) for letter in first_3_letters))\n",
|
||||||
|
" \n",
|
||||||
|
" return encoded, table_str\n",
|
||||||
|
"\n",
|
||||||
|
"def first_n_decoded_digits(encoded, codec, n):\n",
|
||||||
|
" decoded = codec.decode(encoded)\n",
|
||||||
|
" return decoded[:n]\n",
|
||||||
|
"\n",
|
||||||
|
"def save_to_bin(bytes, file_name):\n",
|
||||||
|
" with open(file_name, 'wb') as f:\n",
|
||||||
|
" f.write(bytes)\n",
|
||||||
|
"\n",
|
||||||
|
"def number_to_bin(number, nbits):\n",
|
||||||
|
" return bin(number)[2:].zfill(nbits)\n",
|
||||||
|
"\n",
|
||||||
|
"for file in file_names:\n",
|
||||||
|
" print(f\"Calculating Huffman code for file: {file}...\")\n",
|
||||||
|
" encoded, code_table = encode_and_print(open(file, 'r').read())\n",
|
||||||
|
" save_to_bin(encoded, file.replace('.txt', '.bin'))\n",
|
||||||
|
" save_to_bin(code_table.encode(), file.replace('.txt', '_codetable.bin'))\n",
|
||||||
|
"\n",
|
||||||
|
"# Nie do końca rozumiem jak mam zapisać ten codec."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Zadanie 2"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 127,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import regex as re\n",
|
||||||
|
"from collections import Counter\n",
|
||||||
|
"from math import log\n",
|
||||||
|
"\n",
|
||||||
|
"def get_words(t):\n",
|
||||||
|
" for m in re.finditer(r'[\\p{L}0-9\\*]+', t):\n",
|
||||||
|
" yield m.group(0)\n",
|
||||||
|
"\n",
|
||||||
|
"def unigram_entropy(t):\n",
|
||||||
|
" counter = Counter(t)\n",
|
||||||
|
" total = sum(counter.values())\n",
|
||||||
|
" return -sum((p := count / total) * log(p, 2) for count in counter.values())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 128,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"9.27320212652544\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"file_content = \"\"\n",
|
||||||
|
"with open(\"own_corpus.txt\", 'r') as file:\n",
|
||||||
|
" file_content = file.read()\n",
|
||||||
|
"\n",
|
||||||
|
"words = list(get_words(file_content))\n",
|
||||||
|
"print(unigram_entropy(words))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Zadanie 3"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"![title](DrzewoHuffmana.png)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "ai_env",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.8.15"
|
||||||
|
},
|
||||||
|
"orig_nbformat": 4
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
BIN
Lab3/own_corpus.bin
Normal file
BIN
Lab3/own_corpus.tar.gz
Normal file
1978
Lab3/own_corpus.txt
Normal file
1
Lab3/own_corpus_codetable.bin
Normal file
@ -0,0 +1 @@
|
|||||||
|
{'u': (5, 0), 'k': (8, 8), 'x': (8, 9), '2': (7, 5), 'q': (9, 24), '8': (9, 25), '5': (9, 26), '-': (9, 27), '1': (7, 7), 's': (4, 1), 'e': (3, 1), 'r': (4, 4), '0': (7, 40), '6': (9, 164), _EOF: (12, 1320), '_': (12, 1321), 'z': (11, 661), 'j': (10, 331), '7': (9, 166), '4': (9, 167), 'w': (7, 42), 'v': (7, 43), 'd': (5, 11), 'h': (5, 12), 'g': (6, 26), 'y': (7, 54), '9': (9, 220), '\n': (9, 221), '(': (8, 111), 'n': (4, 7), 'o': (4, 8), 'a': (4, 9), 'l': (5, 20), 'c': (5, 21), 'i': (4, 11), 't': (4, 12), ')': (8, 208), '3': (9, 418), '/': (9, 419), 'b': (7, 105), 'm': (6, 53), 'f': (6, 54), 'p': (6, 55), ' ': (3, 7)}
|
BIN
Lab3/random_text_geometric_distribution.bin
Normal file
BIN
Lab3/random_text_geometric_distribution.tar.gz
Normal file
1
Lab3/random_text_geometric_distribution.txt
Normal file
1
Lab3/random_text_geometric_distribution_codetable.bin
Normal file
@ -0,0 +1 @@
|
|||||||
|
{'p': (9, 0), 'u': (11, 4), 'E': (15, 80), _EOF: (16, 162), 'D': (16, 163), 'C': (15, 82), 'A': (15, 83), 'w': (13, 21), 'v': (12, 11), 'r': (10, 3), 'n': (8, 1), 'l': (7, 1), 'j': (6, 1), 'h': (5, 1), 'f': (4, 1), 'd': (3, 1), 'b': (2, 1), 'o': (9, 256), 'q': (10, 514), 'x': (13, 4120), 'y': (14, 8242), 'z': (14, 8243), 't': (12, 2061), 's': (11, 1031), 'm': (8, 129), 'k': (7, 65), 'i': (6, 33), 'g': (5, 17), 'e': (4, 9), 'c': (3, 5), 'a': (2, 3)}
|
BIN
Lab3/random_text_uniform_distribution.bin
Normal file
BIN
Lab3/random_text_uniform_distribution.tar.gz
Normal file
1
Lab3/random_text_uniform_distribution.txt
Normal file
1
Lab3/random_text_uniform_distribution_codetable.bin
Normal file
@ -0,0 +1 @@
|
|||||||
|
{'A': (5, 0), 'b': (5, 1), _EOF: (7, 8), 'O': (7, 9), 'Y': (6, 5), '9': (6, 6), 't': (6, 7), '1': (6, 8), 'X': (6, 9), 'e': (6, 10), 'W': (6, 11), '4': (6, 12), '3': (6, 13), 'o': (6, 14), 'q': (6, 15), 'T': (6, 16), 'l': (6, 17), 'J': (6, 18), 'y': (6, 19), '6': (6, 20), 'F': (6, 21), 'G': (6, 22), 'Q': (6, 23), 'K': (6, 24), 'N': (6, 25), 'S': (6, 26), 'f': (6, 27), '5': (6, 28), 'L': (6, 29), 'd': (6, 30), 'D': (6, 31), 'M': (6, 32), 'n': (6, 33), 'u': (6, 34), 'B': (6, 35), '2': (6, 36), 'a': (6, 37), '0': (6, 38), '7': (6, 39), 'P': (6, 40), 'E': (6, 41), 'j': (6, 42), 'z': (6, 43), 'C': (6, 44), 'h': (6, 45), 'i': (6, 46), 'c': (6, 47), 'm': (6, 48), 'R': (6, 49), 'k': (6, 50), 'I': (6, 51), 'U': (6, 52), '8': (6, 53), 'Z': (6, 54), 'g': (6, 55), 's': (6, 56), 'V': (6, 57), 'H': (6, 58), 'w': (6, 59), 'r': (6, 60), 'x': (6, 61), 'p': (6, 62), 'v': (6, 63)}
|
1
Lab3/random_text_uniform_two_point_05_distribution.bin
Normal file
BIN
Lab3/random_text_uniform_two_point_05_distribution.tar.gz
Normal file
1
Lab3/random_text_uniform_two_point_05_distribution.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
{_EOF: (2, 0), '0': (2, 1), '1': (1, 1)}
|
1
Lab3/random_text_uniform_two_point_09_distribution.bin
Normal file
BIN
Lab3/random_text_uniform_two_point_09_distribution.tar.gz
Normal file
1
Lab3/random_text_uniform_two_point_09_distribution.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
{_EOF: (2, 0), '0': (2, 1), '1': (1, 1)}
|