Lab 6
This commit is contained in:
parent
0c669983c8
commit
79170f0aab
@ -20,6 +20,22 @@
|
|||||||
"\n"
|
"\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Instalacja\n",
|
||||||
|
"\n",
|
||||||
|
"(Zob. też dokumentacja)\n",
|
||||||
|
"\n",
|
||||||
|
" sudo apt-get install build-essential libboost-all-dev cmake zlib1g-dev libbz2-dev liblzma-dev\n",
|
||||||
|
" wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz\n",
|
||||||
|
" mkdir kenlm/build\n",
|
||||||
|
" cd kenlm/build\n",
|
||||||
|
" cmake ..\n",
|
||||||
|
" make -j2"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -29,27 +45,63 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 16,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"KENLM_BUILD_PATH='/home/kuba/kenlm/build'"
|
"KENLM_BUILD_PATH='/home/pawel/kenlm/build' # ścieżka, w której jest zainstalowany KenLM (zob. dokumentacja - link powyżej)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 17,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"--2024-04-10 12:13:27-- https://wolnelektury.pl/media/book/txt/lalka-tom-pierwszy.txt\n",
|
||||||
|
"Resolving wolnelektury.pl (wolnelektury.pl)... 51.83.143.148, 2001:41d0:602:3294::\n",
|
||||||
|
"Connecting to wolnelektury.pl (wolnelektury.pl)|51.83.143.148|:443... connected.\n",
|
||||||
|
"HTTP request sent, awaiting response... 200 OK\n",
|
||||||
|
"Length: 860304 (840K) [text/plain]\n",
|
||||||
|
"Saving to: ‘lalka-tom-pierwszy.txt.1’\n",
|
||||||
|
"\n",
|
||||||
|
"lalka-tom-pierwszy. 100%[===================>] 840.14K 3.59MB/s in 0.2s \n",
|
||||||
|
"\n",
|
||||||
|
"2024-04-10 12:13:27 (3.59 MB/s) - ‘lalka-tom-pierwszy.txt.1’ saved [860304/860304]\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"!wget https://wolnelektury.pl/media/book/txt/lalka-tom-pierwszy.txt"
|
"!wget https://wolnelektury.pl/media/book/txt/lalka-tom-pierwszy.txt"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 18,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"--2024-04-10 12:13:30-- https://wolnelektury.pl/media/book/txt/lalka-tom-drugi.txt\n",
|
||||||
|
"Resolving wolnelektury.pl (wolnelektury.pl)... 51.83.143.148, 2001:41d0:602:3294::\n",
|
||||||
|
"Connecting to wolnelektury.pl (wolnelektury.pl)|51.83.143.148|:443... connected.\n",
|
||||||
|
"HTTP request sent, awaiting response... 200 OK\n",
|
||||||
|
"Length: 949497 (927K) [text/plain]\n",
|
||||||
|
"Saving to: ‘lalka-tom-drugi.txt.1’\n",
|
||||||
|
"\n",
|
||||||
|
"lalka-tom-drugi.txt 100%[===================>] 927.24K 3.39MB/s in 0.3s \n",
|
||||||
|
"\n",
|
||||||
|
"2024-04-10 12:13:30 (3.39 MB/s) - ‘lalka-tom-drugi.txt.1’ saved [949497/949497]\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"!wget https://wolnelektury.pl/media/book/txt/lalka-tom-drugi.txt"
|
"!wget https://wolnelektury.pl/media/book/txt/lalka-tom-drugi.txt"
|
||||||
]
|
]
|
||||||
@ -63,9 +115,54 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 19,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"=== 1/5 Counting and sorting n-grams ===\n",
|
||||||
|
"Reading /home/pawel/moj-2024/lab/lalka-tom-pierwszy.txt\n",
|
||||||
|
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"****************************************************************************************************\n",
|
||||||
|
"Unigram tokens 122871 types 33265\n",
|
||||||
|
"=== 2/5 Calculating and sorting adjusted counts ===\n",
|
||||||
|
"Chain sizes: 1:399180 2:2261987584 3:4241227008 4:6785963520\n",
|
||||||
|
"Statistics:\n",
|
||||||
|
"1 33265 D1=0.737356 D2=1.15675 D3+=1.59585\n",
|
||||||
|
"2 93948 D1=0.891914 D2=1.20314 D3+=1.44945\n",
|
||||||
|
"3 115490 D1=0.964904 D2=1.40636 D3+=1.66751\n",
|
||||||
|
"4 116433 D1=0.986444 D2=1.50367 D3+=1.9023\n",
|
||||||
|
"Memory estimate for binary LM:\n",
|
||||||
|
"type kB\n",
|
||||||
|
"probing 7800 assuming -p 1.5\n",
|
||||||
|
"probing 9157 assuming -r models -p 1.5\n",
|
||||||
|
"trie 3902 without quantization\n",
|
||||||
|
"trie 2378 assuming -q 8 -b 8 quantization \n",
|
||||||
|
"trie 3649 assuming -a 22 array pointer compression\n",
|
||||||
|
"trie 2125 assuming -a 22 -q 8 -b 8 array pointer compression and quantization\n",
|
||||||
|
"=== 3/5 Calculating and sorting initial probabilities ===\n",
|
||||||
|
"Chain sizes: 1:399180 2:1503168 3:2309800 4:2794392\n",
|
||||||
|
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
|
||||||
|
"####################################################################################################\n",
|
||||||
|
"=== 4/5 Calculating and writing order-interpolated probabilities ===\n",
|
||||||
|
"Chain sizes: 1:399180 2:1503168 3:2309800 4:2794392\n",
|
||||||
|
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
|
||||||
|
"####################################################################################################\n",
|
||||||
|
"=== 5/5 Writing ARPA model ===\n",
|
||||||
|
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
|
||||||
|
"****************************************************************************************************\n",
|
||||||
|
"Name:lmplz\tVmPeak:13142592 kB\tVmRSS:7564 kB\tRSSMax:2623832 kB\tuser:0.28374\tsys:1.02734\tCPU:1.3111\treal:1.25256\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"!$KENLM_BUILD_PATH/bin/lmplz -o 4 < lalka-tom-pierwszy.txt > lalka_tom_pierwszy_lm.arpa"
|
"!$KENLM_BUILD_PATH/bin/lmplz -o 4 < lalka-tom-pierwszy.txt > lalka_tom_pierwszy_lm.arpa"
|
||||||
]
|
]
|
||||||
@ -85,9 +182,46 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 20,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\\data\\\n",
|
||||||
|
"ngram 1=33265\n",
|
||||||
|
"ngram 2=93948\n",
|
||||||
|
"ngram 3=115490\n",
|
||||||
|
"ngram 4=116433\n",
|
||||||
|
"\n",
|
||||||
|
"\\1-grams:\n",
|
||||||
|
"-5.0133595\t<unk>\t0\n",
|
||||||
|
"0\t<s>\t-0.99603957\n",
|
||||||
|
"-1.4302719\t</s>\t0\n",
|
||||||
|
"-4.7287908\tBolesław\t-0.049677044\n",
|
||||||
|
"-4.9033437\tPrus\t-0.049677044\n",
|
||||||
|
"-4.9033437\tLalka\t-0.049677044\n",
|
||||||
|
"-4.9033437\tISBN\t-0.049677044\n",
|
||||||
|
"-4.9033437\t978-83-288-2673-1\t-0.049677044\n",
|
||||||
|
"-4.9033437\tTom\t-0.049677044\n",
|
||||||
|
"-3.0029354\tI\t-0.17544968\n",
|
||||||
|
"-4.9033437\tI.\t-0.049677044\n",
|
||||||
|
"-3.5526814\tJak\t-0.1410632\n",
|
||||||
|
"-3.8170912\twygląda\t-0.16308141\n",
|
||||||
|
"-4.608305\tfirma\t-0.049677044\n",
|
||||||
|
"-4.33789\tJ.\t-0.3295009\n",
|
||||||
|
"-3.9192266\tMincel\t-0.12910372\n",
|
||||||
|
"-1.624716\ti\t-0.20128249\n",
|
||||||
|
"-4.1086636\tS.\t-0.098223634\n",
|
||||||
|
"-2.6843808\tWokulski\t-0.19202113\n",
|
||||||
|
"-2.8196363\tprzez\t-0.15214005\n",
|
||||||
|
"-4.9033437\tszkło\t-0.049677044\n",
|
||||||
|
"-4.9033437\tbutelek?\t-0.049677044\n",
|
||||||
|
"-2.848008\tW\t-0.19964235\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"!head -n 30 lalka_tom_pierwszy_lm.arpa"
|
"!head -n 30 lalka_tom_pierwszy_lm.arpa"
|
||||||
]
|
]
|
||||||
@ -103,7 +237,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 21,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -112,7 +246,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 22,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -121,27 +255,61 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 23,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'Sytuacja polityczna jest tak niepewna, że wcale by mnie nie zdziwiło, gdyby około grudnia wybuchła wojna.'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 23,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"test_str"
|
"test_str"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 24,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'Sytuacja polityczna jest tak niepewna, że wcale by mnie nie zdziwiło, gdyby około grudnia wybuchła wojna.'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 24,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"test_str"
|
"test_str"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 25,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Sytuacja=0 1 -6.009399\tpolityczna=21766 1 -4.9033437\tjest=123 1 -2.6640298\ttak=231 2 -1.7683144\tniepewna,=0 1 -5.1248584\tże=122 1 -2.1651394\twcale=5123 1 -4.167491\tby=1523 1 -3.55168\tmnie=2555 2 -1.6694618\tnie=127 2 -1.4439836\tzdziwiło,=0 1 -5.2158937\tgdyby=814 1 -3.2300434\tokoło=1462 1 -3.7384818\tgrudnia=0 1 -5.123236\twybuchła=0 1 -5.0133595\twojna.=1285 1 -4.9033437\t</s>=2 2 -0.8501559\tTotal: -61.54222 OOV: 5\n",
|
||||||
|
"Perplexity including OOVs:\t4169.948113875898\n",
|
||||||
|
"Perplexity excluding OOVs:\t834.2371454470355\n",
|
||||||
|
"OOVs:\t5\n",
|
||||||
|
"Tokens:\t17\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"!echo $test_str | $KENLM_BUILD_PATH/bin/query lalka_tom_pierwszy_lm.arpa 2> /dev/null"
|
"!echo $test_str | $KENLM_BUILD_PATH/bin/query lalka_tom_pierwszy_lm.arpa 2> /dev/null"
|
||||||
]
|
]
|
||||||
@ -164,7 +332,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 26,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -173,20 +341,25 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 27,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Lubię=17813 1 -5.899383\tplacki=0 1 -5.0630364\ti=16 1 -1.624716\twcale=5123 2 -3.2397003\tby=1523 1 -3.6538217\tmnie=2555 2 -1.6694618\tnie=127 2 -1.4439836\tzdziwiło,=0 1 -5.2158937\tgdyby=814 1 -3.2300434\tokoło=1462 1 -3.7384818\tgrudnia=0 1 -5.123236\twybuchła=0 1 -5.0133595\twojna.=1285 1 -4.9033437\t</s>=2 2 -0.8501559\tTotal: -50.668617 OOV: 4\n",
|
||||||
|
"Perplexity including OOVs:\t4160.896818387522\n",
|
||||||
|
"Perplexity excluding OOVs:\t1060.0079770155185\n",
|
||||||
|
"OOVs:\t4\n",
|
||||||
|
"Tokens:\t14\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"!echo $test2_str | $KENLM_BUILD_PATH/bin/query lalka_tom_pierwszy_lm.arpa 2> /dev/null"
|
"!echo $test2_str | $KENLM_BUILD_PATH/bin/query lalka_tom_pierwszy_lm.arpa 2> /dev/null"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"A co jeśli trochę zmienimy wejście?"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -216,7 +389,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"TOKENIZER_SCRIPTS='/home/kuba/mosesdecoder/scripts/tokenizer'"
|
"TOKENIZER_SCRIPTS='/home/pawel/mosesdecoder/scripts/tokenizer'"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -459,7 +632,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.3"
|
"version": "3.10.12"
|
||||||
},
|
},
|
||||||
"subtitle": "0.Informacje na temat przedmiotu[ćwiczenia]",
|
"subtitle": "0.Informacje na temat przedmiotu[ćwiczenia]",
|
||||||
"title": "Ekstrakcja informacji",
|
"title": "Ekstrakcja informacji",
|
||||||
|
Loading…
Reference in New Issue
Block a user