255 lines
105 KiB
Plaintext
255 lines
105 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 109,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"from collections import Counter\n",
|
||
|
"from math import log\n",
|
||
|
"import random\n",
|
||
|
"from dahuffman import HuffmanCodec"
|
||
|
]
|
||
|
},
|
||
|
{
 "cell_type": "code",
 "execution_count": 110,
 "metadata": {},
 "outputs": [],
 "source": [
  "from collections import Counter\n",
  "from math import log, log2\n",
  "\n",
  "\n",
  "def unigram_entropy(t):\n",
  "    \"\"\"Return the unigram (empirical Shannon) entropy of ``t`` in bits per symbol.\n",
  "\n",
  "    ``t`` is any iterable of hashable symbols: a string is counted character\n",
  "    by character, a list of words word by word. Empty input gives 0.\n",
  "    \"\"\"\n",
  "    counts = Counter(t)\n",
  "    total = sum(counts.values())\n",
  "    # Negate each term rather than the whole sum: negating the sum prints a\n",
  "    # confusing -0.0 when there is only one distinct symbol. log2(p) is\n",
  "    # also more accurate than log(p, 2).\n",
  "    probabilities = (count / total for count in counts.values())\n",
  "    return sum(-p * log2(p) for p in probabilities)\n"
 ]
},
|
||
|
{
 "cell_type": "code",
 "execution_count": 111,
 "metadata": {},
 "outputs": [],
 "source": [
  "def save_codec_to_files(t, path):\n",
  "    \"\"\"Huffman-encode ``t`` and write the result together with its code table.\n",
  "\n",
  "    The codec is built from the symbol frequencies of ``t`` itself, so ``t``\n",
  "    must be something ``HuffmanCodec.from_data`` accepts (this notebook\n",
  "    passes either a string or a list of words).\n",
  "\n",
  "    Two files are written:\n",
  "      * ``path``            - the encoded bytes,\n",
  "      * ``path + '_table'`` - the codec, as stored by ``codec.save``.\n",
  "\n",
  "    Example: ``save_codec_to_files('Ala ma kota', 'x')``\n",
  "    \"\"\"\n",
  "    import os\n",
  "\n",
  "    codec = HuffmanCodec.from_data(t)\n",
  "    encoded = codec.encode(t)\n",
  "\n",
  "    # Callers pass paths such as 'on_chars/...'; create the directory up\n",
  "    # front so open() does not fail when it is missing.\n",
  "    parent = os.path.dirname(path)\n",
  "    if parent:\n",
  "        os.makedirs(parent, exist_ok=True)\n",
  "\n",
  "    with open(path, 'wb') as file:\n",
  "        file.write(encoded)\n",
  "    codec.save(path + '_table')"
 ]
},
|
||
|
{
 "cell_type": "code",
 "execution_count": 112,
 "metadata": {},
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "unigram_entropy: 4.490477053022786\n",
    "3 pierwsze znaki 010110000111001101111011\n",
    "3 pierwsze znaki 0111111000001011110000110111110111101101\n",
    "unigram_entropy words: 10.188508379298268\n"
   ]
  }
 ],
 "source": [
  "# Character-level and word-level entropy / Huffman coding of '100k_corpus'.\n",
  "with open('100k_corpus', 'r') as file:\n",
  "    text = file.read()\n",
  "words = text.split()  # split once, reused below\n",
  "\n",
  "print('unigram_entropy: ', unigram_entropy(text))\n",
  "codec = HuffmanCodec.from_data(text)\n",
  "# Render every encoded byte as exactly 8 bits. Formatting the payload as one\n",
  "# big integer would silently drop the leading zero bits of the first byte.\n",
  "print('3 pierwsze znaki', ''.join(format(byte, '08b') for byte in codec.encode(text[:3])))\n",
  "\n",
  "codec = HuffmanCodec.from_data(words)\n",
  "print('3 pierwsze znaki', ''.join(format(byte, '08b') for byte in codec.encode(words[:3])))\n",
  "print('unigram_entropy words: ', unigram_entropy(words))\n",
  "\n",
  "save_codec_to_files(text, 'on_chars/100k_corpus_coded')\n",
  "save_codec_to_files(words, 'on_words/100k_corpus_coded')\n"
 ]
},
|
||
|
{
 "cell_type": "code",
 "execution_count": 113,
 "metadata": {},
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "unigram_entropy: 5.976822702978705\n",
    "3 pierwsze znaki 101100101111000110000010\n",
    "3 pierwsze znaki 11110001000010011110110110010000\n",
    "unigram_entropy words: 10.629740889027198\n"
   ]
  }
 ],
 "source": [
  "# Character-level and word-level entropy / Huffman coding of '100k_jednostajny'.\n",
  "with open('100k_jednostajny', 'r') as file:\n",
  "    text = file.read()\n",
  "words = text.split()  # split once, reused below\n",
  "\n",
  "print('unigram_entropy: ', unigram_entropy(text))\n",
  "codec = HuffmanCodec.from_data(text)\n",
  "# Render every encoded byte as exactly 8 bits. Formatting the payload as one\n",
  "# big integer would silently drop the leading zero bits of the first byte.\n",
  "print('3 pierwsze znaki', ''.join(format(byte, '08b') for byte in codec.encode(text[:3])))\n",
  "\n",
  "codec = HuffmanCodec.from_data(words)\n",
  "print('3 pierwsze znaki', ''.join(format(byte, '08b') for byte in codec.encode(words[:3])))\n",
  "print('unigram_entropy words: ', unigram_entropy(words))\n",
  "\n",
  "save_codec_to_files(text, 'on_chars/100k_jednostajny_coded')\n",
  "save_codec_to_files(words, 'on_words/100k_jednostajny_coded')"
 ]
},
|
||
|
{
 "cell_type": "code",
 "execution_count": 115,
 "metadata": {},
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "unigram_entropy: 2.932690237976291\n",
    "3 pierwsze znaki 10101111\n",
    "3 pierwsze znaki 10000000\n",
    "unigram_entropy words: -0.0\n"
   ]
  }
 ],
 "source": [
  "# Character-level and word-level entropy / Huffman coding of '100k_gemotric'.\n",
  "with open('100k_gemotric', 'r') as file:\n",
  "    text = file.read()\n",
  "words = text.split()  # split once, reused below\n",
  "\n",
  "print('unigram_entropy: ', unigram_entropy(text))\n",
  "codec = HuffmanCodec.from_data(text)\n",
  "# Render every encoded byte as exactly 8 bits. Formatting the payload as one\n",
  "# big integer would silently drop the leading zero bits of the first byte.\n",
  "print('3 pierwsze znaki', ''.join(format(byte, '08b') for byte in codec.encode(text[:3])))\n",
  "\n",
  "codec = HuffmanCodec.from_data(words)\n",
  "print('3 pierwsze znaki', ''.join(format(byte, '08b') for byte in codec.encode(words[:3])))\n",
  "print('unigram_entropy words: ', unigram_entropy(words))\n",
  "\n",
  "save_codec_to_files(text, 'on_chars/100k_gemotric_coded')\n",
  "save_codec_to_files(words, 'on_words/100k_gemotric_coded')"
 ]
},
|
||
|
{
 "cell_type": "code",
 "execution_count": 105,
 "metadata": {},
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "unigram_entropy: 0.9999978659644434\n",
    "3 pierwsze znaki 01011000\n",
    "3 pierwsze znaki 10000000\n",
    "unigram_entropy words: -0.0\n"
   ]
  }
 ],
 "source": [
  "# Character-level and word-level entropy / Huffman coding of '100k_dwupunkt_p_05'.\n",
  "with open('100k_dwupunkt_p_05', 'r') as file:\n",
  "    text = file.read()\n",
  "words = text.split()  # split once, reused below\n",
  "\n",
  "print('unigram_entropy: ', unigram_entropy(text))\n",
  "codec = HuffmanCodec.from_data(text)\n",
  "# Render every encoded byte as exactly 8 bits. Formatting the payload as one\n",
  "# big integer would silently drop the leading zero bits of the first byte.\n",
  "print('3 pierwsze znaki', ''.join(format(byte, '08b') for byte in codec.encode(text[:3])))\n",
  "\n",
  "codec = HuffmanCodec.from_data(words)\n",
  "print('3 pierwsze znaki', ''.join(format(byte, '08b') for byte in codec.encode(words[:3])))\n",
  "print('unigram_entropy words: ', unigram_entropy(words))\n",
  "\n",
  "save_codec_to_files(text, 'on_chars/100k_dwupunkt_p_05_coded')\n",
  "save_codec_to_files(words, 'on_words/100k_dwupunkt_p_05_coded')"
 ]
},
|
||
|
{
 "cell_type": "code",
 "execution_count": 104,
 "metadata": {},
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "unigram_entropy: 0.4689321918830734\n",
    "3 pierwsze znaki 11100000\n",
    "3 pierwsze znaki 10000000\n",
    "unigram_entropy words: -0.0\n"
   ]
  }
 ],
 "source": [
  "# Character-level and word-level entropy / Huffman coding of '100k_dwupunkt_p_09'.\n",
  "with open('100k_dwupunkt_p_09', 'r') as file:\n",
  "    text = file.read()\n",
  "words = text.split()  # split once, reused below\n",
  "\n",
  "print('unigram_entropy: ', unigram_entropy(text))\n",
  "codec = HuffmanCodec.from_data(text)\n",
  "# Render every encoded byte as exactly 8 bits. Formatting the payload as one\n",
  "# big integer would silently drop the leading zero bits of the first byte.\n",
  "print('3 pierwsze znaki', ''.join(format(byte, '08b') for byte in codec.encode(text[:3])))\n",
  "\n",
  "codec = HuffmanCodec.from_data(words)\n",
  "print('3 pierwsze znaki', ''.join(format(byte, '08b') for byte in codec.encode(words[:3])))\n",
  "print('unigram_entropy words: ', unigram_entropy(words))\n",
  "\n",
  "save_codec_to_files(text, 'on_chars/100k_dwupunkt_p_09_coded')\n",
  "save_codec_to_files(words, 'on_words/100k_dwupunkt_p_09_coded')"
 ]
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3.8.12 64-bit",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.8.12"
|
||
|
},
|
||
|
"orig_nbformat": 4,
|
||
|
"vscode": {
|
||
|
"interpreter": {
|
||
|
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
|
||
|
}
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 2
|
||
|
}
|