Compare commits

..

No commits in common. "master" and "893f4409e5c79431ca92a39de567d999d23b0242" have entirely different histories.

33 changed files with 5 additions and 2735 deletions

1
.gitignore vendored
View File

@ -1,2 +1 @@
.DS_STORE .DS_STORE
out-merged.txt

View File

@ -1,6 +1,6 @@
import pandas import pandas
import regex as re import regex as re
import argparse import argparse, sys
parser=argparse.ArgumentParser() parser=argparse.ArgumentParser()
parser.add_argument("--filepath",) parser.add_argument("--filepath",)
@ -19,15 +19,14 @@ def filter_line(line):
return line is not None and len(line) > 30 and is_letter_sentence(line) and is_asci(line) return line is not None and len(line) > 30 and is_letter_sentence(line) and is_asci(line)
def clean_with_regex(text): def clean_with_regex(text):
# text = str(text).encode("ascii", "ignore").decode("utf-8") text = str(text).encode("ascii", "ignore").decode("utf-8")
regex_pattern = r"(?<=..\.)(\s+)(?=\(\d+\))|(?<=..\.)(\s+)(?=\d\.)|(?<=..\.)(\s+)(?=Article \d+)" regex_pattern = "(?<=..\.)(\s+)(?=\(\d+\))|(?<=..\.)(\s+)(?=\d\.)|(?<=..\.)(\s+)(?=Article \d+)"
try: try:
out = re.split(regex_pattern, text) out = re.split(regex_pattern, text)
except TypeError as e: except TypeError as e:
return [] return []
out = list(filter(lambda item: filter_line(item), out)) out = list(filter(lambda item: filter_line(item), out))
out = list(map(lambda item: re.sub(r"(?<=\d)(\(\d+\))(?=\s+)|(\(\d+\)\s+)|(\d+\.)+\s", " ", item), out)) out = list(map(lambda item: re.sub("(?<=\d)(\(\d+\))(?=\s+)|(\(\d+\)\s+)|(\d+\.)+\s", "", item), out))
out = list(map(lambda item: re.sub(r"[^\w\d\s\\\)\(\/-]|[^\x00-\x7F]|ex\d+", " ", item), out))
if out: if out:
out.pop(len(out)-1) out.pop(len(out)-1)
return out return out
@ -42,7 +41,7 @@ def print_text(text, sort=False):
def save_to_file(paragraph_list, file_name): def save_to_file(paragraph_list, file_name):
with open(file_name, 'a') as f: with open(file_name, 'a') as f:
for line in paragraph_list: for line in paragraph_list:
f.write("%s\n" % line.strip().lower()) f.write("%s\n" % line.strip())
f.close() f.close()

View File

@ -1,52 +0,0 @@
# Statystyki
## Uruchomienie skryptu
Należy uruchomić skrypt pythonowy statistics.py. Wynikiem działania programu są utworzone zdjęcia w folderze /images.
```python statistics.py --filepath {sciezka_do_pliku}```
## Statystyki podstawowe
### 10 najdłuższych słów
```
MarineStrategyFrameworkDirectiveClassificationValue
OtherFinancialProfessionalAndInformationServices
GuineaPeruPhilippinesQatarRomaniaRussiaRwandaSao
MarineStrategyFrameworkDirectiveClassificationValue
AustraliaArgentinaBotswanaBrazilChileNamibiaNew
ManufacturingOfElectricalAndOpticalEquipment
ClassificationAndQuantificationFrameworkValue
FinancialProfessionalAndInformationServices
measuredIndicatedAndInferredMineralResource
AnthropogenicGeomorphologicFeatureTypeValue
```
### Prawo Zipfa dla słów
![title](images/zipf-law-words.png)
### Prawo Zipfa dla trigramów ze słów
![title](images/zipf-law-3grams.png)
### Słowa łamiące prawo łączące długość z częstością
- aunt (4 znaki, 31 wystąpień)
- cave (4 znaki, 31 wystąpień)
- amateur (7 znaków, 31 wystąpień)
- CommissionFranz (15 znaków, 2090 wystąpień)
- responsibilities (16 znaków, 2087 wystąpień)
- Interventionsstelle (19 znaków, 231 wystąpień)
- hydrogenorthophosphate (22 znaki, 148 wystąpień)
- polytetrafluoroethylene (23 znaki, 148 wystąpień)
### Częstotliwość zaimków
![title](images/pt-pronouns.png)
### Liczba wystąpień dat (lata)
`['1999', '1975', '1987', '1992', '1985', '1981', '1988', '1986', '1995', '1991', '1993', '1990', '1994', '1983', '1989'...`
![title](images/pt-years.png)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 13 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 17 KiB

View File

@ -1,178 +0,0 @@
import matplotlib.pyplot as plt
from collections import Counter
from collections import OrderedDict
import regex as re
from math import log
import argparse
import os
# Command-line interface: the optional --filepath argument overrides the
# default corpus location.
parser = argparse.ArgumentParser()
parser.add_argument("--filepath")
args = parser.parse_args()

# Path of the text corpus to analyse.
FILE_PATH = "Lab1/out-merged.txt" if args.filepath is None else args.filepath
# Plots are written to an "images" directory located next to this script.
IMAGES_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "images")

# Read the whole corpus into memory. The context manager closes the file on
# exit, so no explicit close() call is needed.
with open(FILE_PATH, 'r') as file:
    file_content = file.read()
# Uncomment to restrict the analysis to the first 10,000,000 characters:
# file_content = file_content[:10000000]
def get_characters(t):
    """Yield the elements of ``t`` one at a time, in iteration order."""
    for character in t:
        yield character
def freq_list(g, top=None):
    """Count the items of ``g`` and return them ordered by falling frequency.

    Args:
        g: Iterable of hashable items to count.
        top: If given, keep only the ``top`` most frequent items;
            ``None`` keeps every item.

    Returns:
        An ``OrderedDict`` mapping item -> count, most frequent first.
        Items with equal counts keep their first-seen order.
    """
    # most_common() already sorts by descending count and keeps first-seen
    # order for ties; passing None returns every item.
    ranked = Counter(g).most_common(top)
    return OrderedDict(ranked)
def get_words(t):
    """Yield every word found in the text ``t``, in order of appearance.

    A word is a maximal run of Unicode letters (``\\p{L}``), the digits 0-9
    and asterisks.
    """
    word_pattern = r'[\p{L}0-9\*]+'
    yield from (match.group(0) for match in re.finditer(word_pattern, t))
def rang_freq_with_labels(name, g, top=None):
    """Save a labelled bar chart of item frequencies as a PNG image.

    Args:
        name: Base name of the output image, without extension.
        g: Iterable of hashable items to count (forwarded to ``freq_list``).
        top: If given, only the ``top`` most frequent items are plotted.

    Returns:
        The image name relative to ``IMAGES_PATH``, in the form
        ``/<name>.png``.
    """
    freq = freq_list(g, top)
    fig = plt.figure(figsize=(12, 3))
    try:
        plt.ylabel('liczba wystąpień')
        plt.bar(freq.keys(), freq.values())
        fname = f'/{name}.png'
        plt.savefig(IMAGES_PATH + fname)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # close it here so repeated calls do not accumulate open figures.
        plt.close(fig)
    return fname
def log_rang_log_freq(name, g):
    """Save a log(rank) versus log(frequency) line plot as a PNG image.

    Args:
        name: Base name of the output image, without extension.
        g: Iterable of hashable items to count (forwarded to ``freq_list``).

    Returns:
        The image name relative to ``IMAGES_PATH``, in the form
        ``/<name>.png``.
    """
    freq = freq_list(g)
    counts = list(freq.values())
    # A fresh figure is created for every plot so that nothing drawn earlier
    # leaks into this image.
    fig = plt.figure()
    try:
        # Ranks start at 1 because log(0) is undefined.
        log_ranks = [log(rank) for rank in range(1, len(counts) + 1)]
        log_counts = [log(count) for count in counts]
        plt.plot(log_ranks, log_counts)
        fname = f'/{name}.png'
        plt.savefig(IMAGES_PATH + fname)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
    return fname
def ngrams(iter, size):
    """Yield every run of ``size`` consecutive items of ``iter`` as a tuple.

    Runs overlap: each one starts one item after the previous one. Nothing
    is yielded when ``iter`` has fewer than ``size`` items.
    """
    window = []
    for element in iter:
        window.append(element)
        if len(window) != size:
            continue
        yield tuple(window)
        # Slide the window forward by dropping its oldest element.
        del window[0]
def get_ngrams(t, size):
    """Yield the character n-grams of length ``size`` of every word in ``t``.

    N-grams are built within each word separately, so none spans two words.
    """
    for word in get_words(t):
        yield from ngrams(word, size)
def get_w_freq_by_w_len(freq, word_len):
    """Yield ``(count, word)`` for every word of exactly ``word_len`` characters.

    ``freq`` is a mapping of word -> count; pairs are yielded in the
    mapping's iteration order.
    """
    yield from (
        (count, word)
        for word, count in freq.items()
        if len(word) == word_len
    )
def get_average_freq_by_w_len(freq, word_lenghts):
    """Compute the mean occurrence count of the words of each given length.

    Args:
        freq: Mapping of word -> count.
        word_lenghts: Iterable of word lengths to evaluate.

    Returns:
        A dict mapping length -> average count. Lengths with no matching
        word are left out.
    """
    averages = {}
    for length in word_lenghts:
        counts = [count for count, _ in get_w_freq_by_w_len(freq, length)]
        if counts:
            averages[length] = sum(counts) / len(counts)
    return averages
def get_low_high_freq_by_w_len(freq, word_lenghts, average_freq):
    """
    Return, per word length, the least and most frequent words plus the average frequency.

    Args:
        freq: Mapping of word -> count.
        word_lenghts: Iterable of word lengths to report on.
        average_freq: Mapping of word length -> average count; it must
            contain every length in ``word_lenghts``.

    Returns:
        A list with one dict per length, holding the keys ``word_len``,
        ``average_freq``, ``low_freq`` (up to 5 ``(count, word)`` tuples
        with the lowest counts) and ``high_freq`` (up to 5 with the highest
        counts).

    Raises:
        KeyError: If ``average_freq`` has no entry for a requested length.
    """
    results = []
    for length in word_lenghts:
        # Tuples are (count, word), so sorting orders by count, then by word.
        ranked = sorted(get_w_freq_by_w_len(freq, length))
        # Keep only digit-free words that occur more than 30 times.
        ranked = [
            (count, word)
            for count, word in ranked
            if re.search(r"\d", str(word)) is None and count > 30
        ]
        results.append({
            'word_len': length,
            'average_freq': average_freq[length],
            'low_freq': ranked[:5],
            'high_freq': ranked[-5:]
        })
    return results
def get_pronouns_stats(freqs):
    """Plot how often selected pronouns occur and return their counts.

    The chart is saved as ``pt-pronouns.png`` under ``IMAGES_PATH``.

    Args:
        freqs: Mapping of word -> count. Matching is case-sensitive against
            the lower-case forms "i", "you", "he", "she" and "it".

    Returns:
        A list of ``(word, count)`` tuples for the pronouns present in
        ``freqs``, in the mapping's iteration order.
    """
    pronouns = {"i", "you", "he", "she", "it"}
    pronoun_words_freq = [item for item in freqs.items() if item[0] in pronouns]
    labels = [word for word, _ in pronoun_words_freq]
    counts = [count for _, count in pronoun_words_freq]
    fig = plt.figure(figsize=(12, 3))
    try:
        plt.ylabel('liczba wystąpień')
        plt.bar(labels, counts)
        plt.savefig(IMAGES_PATH + "/pt-pronouns.png")
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
    return pronoun_words_freq
def get_years_stats(freqs):
    """Plot how often year-like tokens occur and return their counts.

    A token qualifies when it contains a four-digit number starting with
    ``1`` that is delimited by word boundaries. The chart is saved as
    ``pt-years.png`` under ``IMAGES_PATH``.

    Args:
        freqs: Mapping of word -> count.

    Returns:
        A list of ``(word, count)`` tuples for the matching tokens, in the
        mapping's iteration order.
    """
    year_pattern = r"\b1{1}[0-9]{3}\b"
    years_word_freq = [
        item for item in freqs.items() if re.findall(year_pattern, item[0])
    ]
    labels = [word for word, _ in years_word_freq]
    counts = [count for _, count in years_word_freq]
    fig = plt.figure(figsize=(12, 3))
    try:
        plt.ylabel('liczba wystąpień')
        plt.bar(labels, counts)
        plt.savefig(IMAGES_PATH + "/pt-years.png")
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
    return years_word_freq
def get_longest_words(top):
    """Return the ``top`` longest distinct words of the loaded corpus.

    Reads the module-level ``file_content``.

    Args:
        top: Maximum number of words to return.

    Returns:
        A list of unique words, longest first. Words of equal length are
        ordered alphabetically so the result is the same on every run;
        the iteration order of a set of strings is not stable between
        interpreter runs.
    """
    unique_words = set(get_words(file_content))
    return sorted(unique_words, key=lambda word: (-len(word), word))[:top]
# Script driver: runs every analysis in sequence on the loaded corpus.
# Plots are saved under IMAGES_PATH; progress messages go to stdout.
print("Generating statistics...")
# 10 longest distinct words (printed to stdout)
print("Calculating 10 longest words...")
print(get_longest_words(10))
# Bar chart of the 10 most frequent words in the text
print("Calculating 10 most frequent words in the text...")
rang_freq_with_labels('most-freq-words-10', get_words(file_content), top=10)
# Zipf's law: log(rank) vs log(frequency) for words
print("Calculating Zipf's law...")
log_rang_log_freq('zipf-law-words', get_words(file_content))
# Zipf's law for character 3-grams taken within words
print("Calculating Zipf's law for 3-grams...")
log_rang_log_freq('zipf-law-3grams', get_ngrams(file_content, 3))
# Words breaking the law that links word length with frequency
print("Calculating words breaking the Zipf's law...")
freq = freq_list(get_words(file_content))
# Distinct word lengths present in the corpus
lenghts = [*set(len(f[0]) for f in freq.items())]
average_freq = get_average_freq_by_w_len(freq, lenghts)
# NOTE(review): the return value is neither stored nor printed here.
get_low_high_freq_by_w_len(freq, lenghts, average_freq)
# Frequency of pronouns (saved as a bar chart)
print("Calculating frequency of pronouns...")
get_pronouns_stats(freq)
# Occurrences of year-like tokens (saved as a bar chart)
print("Calculating number of years in words...")
get_years_stats(freq)
print("Done")

Binary file not shown.

Before

Width:  |  Height:  |  Size: 149 KiB

Binary file not shown.

View File

@ -1 +0,0 @@
{'u': (5, 0), 'k': (8, 8), 'x': (8, 9), '2': (7, 5), 'q': (9, 24), '8': (9, 25), '5': (9, 26), '-': (9, 27), '1': (7, 7), 's': (4, 1), 'e': (3, 1), 'r': (4, 4), '0': (7, 40), '6': (9, 164), _EOF: (12, 1320), '_': (12, 1321), 'z': (11, 661), 'j': (10, 331), '7': (9, 166), '4': (9, 167), 'w': (7, 42), 'v': (7, 43), 'd': (5, 11), 'h': (5, 12), 'g': (6, 26), 'y': (7, 54), '9': (9, 220), '\n': (9, 221), '(': (8, 111), 'n': (4, 7), 'o': (4, 8), 'a': (4, 9), 'l': (5, 20), 'c': (5, 21), 'i': (4, 11), 't': (4, 12), ')': (8, 208), '3': (9, 418), '/': (9, 419), 'b': (7, 105), 'm': (6, 53), 'f': (6, 54), 'p': (6, 55), ' ': (3, 7)}

View File

@ -1 +0,0 @@
{'p': (9, 0), 't': (11, 4), 'y': (13, 20), 'A': (15, 84), 'C': (15, 85), 'z': (14, 43), 'v': (12, 11), 'r': (10, 3), 'n': (8, 1), 'l': (7, 1), 'j': (6, 1), 'h': (5, 1), 'f': (4, 1), 'd': (3, 1), 'b': (2, 1), 'D': (16, 32768), 'H': (16, 32769), 'M': (16, 32770), _EOF: (17, 65542), 'B': (17, 65543), 'x': (14, 8193), 'w': (13, 4097), 'u': (12, 2049), 's': (11, 1025), 'q': (10, 513), 'o': (9, 257), 'm': (8, 129), 'k': (7, 65), 'i': (6, 33), 'g': (5, 17), 'e': (4, 9), 'c': (3, 5), 'a': (2, 3)}

View File

@ -1 +0,0 @@
{'q': (5, 0), _EOF: (7, 4), 'y': (7, 5), 'f': (6, 3), 't': (6, 4), 'N': (6, 5), 'M': (6, 6), 'U': (6, 7), '0': (6, 8), '2': (6, 9), 'K': (6, 10), '9': (6, 11), 'A': (6, 12), 'm': (6, 13), '1': (6, 14), 'J': (6, 15), 'z': (6, 16), 'S': (6, 17), ' ': (6, 18), 'd': (6, 19), 'Y': (6, 20), 'O': (6, 21), 'x': (6, 22), '4': (6, 23), 'k': (6, 24), 'D': (6, 25), 'E': (6, 26), 'i': (6, 27), 'p': (6, 28), 'P': (6, 29), 'G': (6, 30), 'C': (6, 31), 'o': (6, 32), 'F': (6, 33), 'V': (6, 34), 'j': (6, 35), 'w': (6, 36), 'Z': (6, 37), 's': (6, 38), 'I': (6, 39), 'L': (6, 40), 'Q': (6, 41), 'r': (6, 42), 'l': (6, 43), 'H': (6, 44), 'T': (6, 45), 'g': (6, 46), 'e': (6, 47), 'B': (6, 48), '6': (6, 49), '5': (6, 50), 'R': (6, 51), 'X': (6, 52), 'b': (6, 53), '3': (6, 54), '8': (6, 55), 'c': (6, 56), 'v': (6, 57), 'a': (6, 58), 'n': (6, 59), '7': (6, 60), 'h': (6, 61), 'W': (6, 62), 'u': (6, 63)}

File diff suppressed because one or more lines are too long

View File

@ -1 +0,0 @@
{_EOF: (2, 0), '0': (2, 1), '1': (1, 1)}

File diff suppressed because one or more lines are too long

View File

@ -1 +0,0 @@
{_EOF: (2, 0), '0': (2, 1), '1': (1, 1)}

Binary file not shown.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,509 +0,0 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Zadanie 1"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generowanie plików"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import string\n",
"import os\n",
"\n",
"# Set the length of the string to generate\n",
"string_length = 1000000\n",
"\n",
"# Define the character set to choose from\n",
"character_set = np.array(list(string.ascii_letters + string.digits + \" \"))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"os.makedirs(\"./files_txt\", exist_ok=True)\n",
"os.makedirs(\"./files_tar\", exist_ok=True)\n",
"os.makedirs(\"./files_bin\", exist_ok=True)\n",
"\n",
"with open(\"../Lab1/out-merged.txt\", 'r') as file:\n",
" file_content = file.read()\n",
" first_chars = file_content[:string_length]\n",
"\n",
" with open(\"files_txt/own_corpus.txt\", 'w') as f:\n",
" f.write(first_chars)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Generate the random string using uniform distribution\n",
"random_indices = np.random.uniform(low=0, high=len(character_set), size=string_length).astype(int)\n",
"random_characters = [character_set[i % len(character_set)] for i in random_indices]\n",
"random_string = ''.join(random_characters)\n",
"\n",
"with open('files_txt/random_text_uniform_distribution.txt', 'w') as f:\n",
" f.write(random_string)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Generate the random string using geometric distribution\n",
"# (string_length samples, so this file has the same length as the other generated texts)\n",
"p = 0.3\n",
"random_integers = np.random.geometric(p, string_length)\n",
"random_indices = [i - 1 for i in random_integers]\n",
"random_characters = [character_set[i % len(character_set)] for i in random_indices]\n",
"random_string = ''.join(random_characters)\n",
"\n",
"with open('files_txt/random_text_geometric_distribution.txt', 'w') as f:\n",
" f.write(random_string)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Generate the random string using uniform two-point distribution with p=0.5\n",
"character_set = np.array(list('01'))\n",
"random_indices = np.random.choice([0, len(character_set)-1], size=string_length, p=[0.5, 0.5])\n",
"random_string = ''.join(character_set[random_indices])\n",
"\n",
"with open('files_txt/random_text_uniform_two_point_05_distribution.txt', 'w') as f:\n",
" f.write(random_string)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Generate the random string using uniform two-point distribution with p=0.9\n",
"character_set = np.array(list('01'))\n",
"random_indices = np.random.choice([0, len(character_set)-1], size=string_length, p=[0.1, 0.9])\n",
"random_string = ''.join(character_set[random_indices])\n",
"\n",
"with open('files_txt/random_text_uniform_two_point_09_distribution.txt', 'w') as f:\n",
" f.write(random_string)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compress files to .tar"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Compression complete. The compressed archive is saved as files_tar/own_corpus.tar.gz.\n",
"Compression ratio: 4.597193872860006\n",
"Compression complete. The compressed archive is saved as files_tar/random_text_geometric_distribution.tar.gz.\n",
"Compression ratio: 2.238588793624499\n",
"Compression complete. The compressed archive is saved as files_tar/random_text_uniform_distribution.tar.gz.\n",
"Compression ratio: 1.3254407753298358\n",
"Compression complete. The compressed archive is saved as files_tar/random_text_uniform_two_point_05_distribution.tar.gz.\n",
"Compression ratio: 6.656282865396648\n",
"Compression complete. The compressed archive is saved as files_tar/random_text_uniform_two_point_09_distribution.tar.gz.\n",
"Compression ratio: 12.23555898151207\n"
]
}
],
"source": [
"import tarfile\n",
"import os\n",
"\n",
"def compress_file(file_name):\n",
" output_archive_name = \"files_tar/\" + file_name.split('/')[1].replace('.txt', '.tar.gz')\n",
" with tarfile.open(output_archive_name, 'w:gz') as tar:\n",
" tar.add(file_name)\n",
"\n",
" print(f'Compression complete. The compressed archive is saved as {output_archive_name}.')\n",
" print(f'Compression ratio: {os.path.getsize(file_name) / os.path.getsize(output_archive_name)}')\n",
"\n",
"\n",
"file_names = [\"files_txt/\" + f for f in os.listdir('files_txt') if f.endswith('.txt')]\n",
"file_names.sort()\n",
"for file in file_names:\n",
" compress_file(file)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Entropy for files_txt/own_corpus.txt: 1.754256\n",
"Entropy for files_txt/random_text_geometric_distribution.txt: 3.56064\n",
"Entropy for files_txt/random_text_uniform_distribution.txt: 6.0336\n",
"Entropy for files_txt/random_text_uniform_two_point_05_distribution.txt: 1.274304\n",
"Entropy for files_txt/random_text_uniform_two_point_09_distribution.txt: 0.75892\n"
]
}
],
"source": [
"import zlib\n",
"\n",
"def entropy_by_compression(t):\n",
" compressed = zlib.compress(t.encode('utf-8'))\n",
" return 8 * len(compressed) / len(t)\n",
"\n",
"for file in file_names:\n",
" print(f\"Entropy for {file}: {entropy_by_compression(open(file, 'r').read())}\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generate Huffman code"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Calculating Huffman code for file: files_txt/own_corpus.txt...\n",
"First 3: r e s\n",
"Binary: 0100 001 0001\n",
"Calculating Huffman code for file: files_txt/random_text_geometric_distribution.txt...\n",
"First 3: d d d\n",
"Binary: 001 001 001\n",
"Calculating Huffman code for file: files_txt/random_text_uniform_distribution.txt...\n",
"First 3: Q l M\n",
"Binary: 101001 101011 000110\n",
"Calculating Huffman code for file: files_txt/random_text_uniform_two_point_05_distribution.txt...\n",
"First 3: 0 0 0\n",
"Binary: 01 01 01\n",
"Calculating Huffman code for file: files_txt/random_text_uniform_two_point_09_distribution.txt...\n",
"First 3: 0 1 1\n",
"Binary: 01 1 1\n"
]
}
],
"source": [
"from dahuffman import HuffmanCodec\n",
"\n",
"def encode_and_print(text):\n",
" codec = HuffmanCodec.from_data(text)\n",
" encoded = codec.encode(text)\n",
" table = codec.get_code_table()\n",
" table_str = str(table)\n",
"\n",
" first_3_letters = first_n_decoded_digits(encoded, codec, 3)\n",
" print(\"First 3:\", end=' ')\n",
" print(' '.join(first_3_letters))\n",
" print(\"Binary: \", end=' ')\n",
" print(' '.join(number_to_bin(table[letter][1], table[letter][0]) for letter in first_3_letters))\n",
" \n",
" return encoded, table_str\n",
"\n",
"def first_n_decoded_digits(encoded, codec, n):\n",
" decoded = codec.decode(encoded)\n",
" return decoded[:n]\n",
"\n",
"def save_to_bin(bytes, file_name):\n",
" with open(\"files_bin/\" + file_name.split('/')[1], 'wb') as f:\n",
" f.write(bytes)\n",
"\n",
"def number_to_bin(number, nbits):\n",
" return bin(number)[2:].zfill(nbits)\n",
"\n",
"for file in file_names:\n",
" print(f\"Calculating Huffman code for file: {file}...\")\n",
" encoded, code_table = encode_and_print(open(file, 'r').read())\n",
" save_to_bin(encoded, file.replace('.txt', '.bin'))\n",
" save_to_bin(code_table.encode(), file.replace('.txt', '_codetable.bin'))\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compare file sizes"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Size of files_txt/own_corpus.txt: 1000000 bytes, 8000000 bits\n",
"Size of files_txt/random_text_geometric_distribution.txt: 100000 bytes, 800000 bits\n",
"Size of files_txt/random_text_uniform_distribution.txt: 1000000 bytes, 8000000 bits\n",
"Size of files_txt/random_text_uniform_two_point_05_distribution.txt: 1000000 bytes, 8000000 bits\n",
"Size of files_txt/random_text_uniform_two_point_09_distribution.txt: 1000000 bytes, 8000000 bits\n",
"********************************************************************************\n",
"Size of files_tar/own_corpus.tar.gz: 217524 bytes, 1740192 bits\n",
"Size of files_tar/random_text_geometric_distribution.tar.gz: 44671 bytes, 357368 bits\n",
"Size of files_tar/random_text_uniform_distribution.tar.gz: 754466 bytes, 6035728 bits\n",
"Size of files_tar/random_text_uniform_two_point_05_distribution.tar.gz: 150234 bytes, 1201872 bits\n",
"Size of files_tar/random_text_uniform_two_point_09_distribution.tar.gz: 81729 bytes, 653832 bits\n",
"********************************************************************************\n",
"Size of files_txt/own_corpus.txt + codetable: 544399 bytes, 548781 bits\n",
"Size of files_txt/random_text_geometric_distribution.txt + codetable: 37569 bytes, 41020 bits\n",
"Size of files_txt/random_text_uniform_distribution.txt + codetable: 750822 bytes, 757031 bits\n",
"Size of files_txt/random_text_uniform_two_point_05_distribution.txt + codetable: 187501 bytes, 187781 bits\n",
"Size of files_txt/random_text_uniform_two_point_09_distribution.txt + codetable: 137499 bytes, 137779 bits\n"
]
}
],
"source": [
"# print raw text files sizes\n",
"for file in file_names:\n",
" print(f\"Size of {file}: {os.path.getsize(file)} bytes, {os.path.getsize(file)*8} bits\")\n",
"\n",
"print(\"*\" * 80)\n",
"\n",
"# print compressed text files sizes\n",
"for file in file_names:\n",
" file = file.replace('.txt', '.tar.gz').replace('files_txt', 'files_tar')\n",
" print(f\"Size of {file}: {os.path.getsize(file)} bytes, {os.path.getsize(file)*8} bits\")\n",
"\n",
"print(\"*\" * 80)\n",
"\n",
"# print compressed with Huffman text files sizes\n",
"# (encoded payload plus code table; the combined byte total is converted to bits as a whole)\n",
"for file in file_names:\n",
" file1 = file.replace('.txt', '.bin').replace('files_txt', 'files_bin')\n",
" file2 = file.replace('.txt', '_codetable.bin').replace('files_txt', 'files_bin')\n",
" total_bytes = os.path.getsize(file1) + os.path.getsize(file2)\n",
" print(f\"Size of {file} + codetable: {total_bytes} bytes, {total_bytes * 8} bits\")\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Entropia\n",
" \n",
"| | Entropia |\n",
"| ----------- | ----------- |\n",
"| tekst w jęz. naturalnym | 1.754256|\n",
"| losowy tekst (jednostajny) | 6.033632 |\n",
"| losowy tekst (geometryczny)| 3.5624 |\n",
"| losowy tekst (dwupunktowy 0.5) | 1.273352 |\n",
"| losowy tekst (dwupunktowy 0.9) | 0.761152 |\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Wielkości w bitach:\n",
" \n",
"| | Plik nieskompresowany | Plik skompresowany (zip, tar, ...) | Plik skompresowany + tablica kodowa |\n",
"| ----------- | ----------- |-----------|----------- |\n",
"| tekst w jęz. naturalnym |54358422*8|12130821*8|29452163*8|\n",
"| losowy tekst (jednostajny) |1000000*8|752307*8|748756*8|\n",
"| losowy tekst (geometryczny)|1000000*8|44629*8|37535*8|\n",
"| losowy tekst (dwupunktowy 0.5)|1000000*8|150394*8|187520*8|\n",
"| losowy tekst (dwupunktowy 0.9)|1000000*8|82011*8|137559*8|"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Wnioski:"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"- Najmniej optymalnie koduje się tekst naturalny.\n",
"- Kodowanie Huffmana jest najbardziej optymalne dla rozkładu dwupunktowego 0.9, ponieważ mała grupa znaków ma bardzo duże prawdopodobieństwo wystąpienia i na odwrót.\n",
"- Kompresja .tar bardziej opłacalna dla języka naturalnego\n",
"- Dla losowych tekstów im mniejsza entropia tym bardziej wydajna kompresja\n",
"- Losowy tekst (rozkład jednostajny) ma większą entropię niż tekst w języku naturalnym"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Zadanie 2"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Entropy for words in files_txt/own_corpus.txt: 9.27320212652544\n",
"Entropy for words in files_txt/random_text_geometric_distribution.txt: -0.0\n",
"Entropy for words in files_txt/random_text_uniform_distribution.txt: 13.889640822372847\n",
"Entropy for words in files_txt/random_text_uniform_two_point_05_distribution.txt: -0.0\n",
"Entropy for words in files_txt/random_text_uniform_two_point_09_distribution.txt: -0.0\n"
]
}
],
"source": [
"import regex as re\n",
"from collections import Counter\n",
"from math import log\n",
"\n",
"def get_words(t):\n",
" for m in re.finditer(r'[\\p{L}0-9\\*]+', t):\n",
" yield m.group(0)\n",
"\n",
"def unigram_entropy(t):\n",
" counter = Counter(t)\n",
" total = sum(counter.values())\n",
" return -sum((p := count / total) * log(p, 2) for count in counter.values())\n",
"\n",
"for file in file_names:\n",
" print(f\"Entropy for words in {file}: {unigram_entropy(get_words(open(file, 'r').read()))}\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Entropia\n",
" \n",
"| | Entropia |\n",
"| ----------- | ----------- |\n",
"| tekst w jęz. naturalnym |9.27320212652544|\n",
"| losowy tekst (jednostajny) | 13.897625675701356 |\n",
"| losowy tekst (geometryczny)| 0 |\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Wielkości w bitach:\n",
" \n",
"| | Plik nieskompresowany | Plik skompresowany (zip, tar, ...) | Plik skompresowany + tablica kodowa |\n",
"| ----------- | ----------- |-----------|----------- |\n",
"| tekst w jęz. naturalnym |54358422*8|12130821*8|29452163*8|\n",
"| losowy tekst (jednostajny) |1000000*8|752307*8|748756*8|\n",
"| losowy tekst (geometryczny)|1000000*8|44629*8|37535*8|\n",
"| losowy tekst (dwupunktowy 0.5)|1000000*8|150394*8|187520*8|\n",
"| losowy tekst (dwupunktowy 0.9)|1000000*8|82011*8|137559*8|"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Wnioski:\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"- korpusy bez spacji (które mają tylko 1 wyraz) mają tylko jeden bajt\n",
"- Korpusy bez spacji mają większą tablicę kodową niż nieskompresowany plik\n",
"- Kompresowanie na wyrazach wydaje się być gorsze niż na znakach z powodu ogromnej tablicy kodowej\n",
"- W języku naturalnym częściej występują te same wyrazy niż w losowym tekście (jednostajnym)\n",
"- Kompresowanie huffmanem na słowach dla plików z jednym wyrazem nie ma sensu"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Zadanie 3"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"![title](DrzewoHuffmana.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "ai_env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.15"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}