Lab 6: mosesdecoder
This commit is contained in:
parent
79170f0aab
commit
a99b31cde0
@ -385,7 +385,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 28,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -394,18 +394,37 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 29,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Sytuacja polityczna jest tak niepewna, że wcale by mnie nie zdziwiło, gdyby około grudnia wybuchła wojna.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"!echo $test_str"
|
"!echo $test_str"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 30,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Tokenizer Version 1.1\n",
|
||||||
|
"Language: en\n",
|
||||||
|
"Number of threads: 1\n",
|
||||||
|
"Sytuacja polityczna jest tak niepewna , że wcale by mnie nie zdziwiło , gdyby około grudnia wybuchła wojna .\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"!echo $test_str | $TOKENIZER_SCRIPTS/tokenizer.perl --language pl"
|
"!echo $test_str | $TOKENIZER_SCRIPTS/tokenizer.perl --language pl"
|
||||||
]
|
]
|
||||||
@ -419,9 +438,22 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 31,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Detokenizer Version $Revision: 4134 $\n",
|
||||||
|
"Language: en\n",
|
||||||
|
"Tokenizer Version 1.1\n",
|
||||||
|
"Language: en\n",
|
||||||
|
"Number of threads: 1\n",
|
||||||
|
"Sytuacja polityczna jest tak niepewna, że wcale by mnie nie zdziwiło, gdyby około grudnia wybuchła wojna.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"!echo $test_str | $TOKENIZER_SCRIPTS/tokenizer.perl --language pl | $TOKENIZER_SCRIPTS/detokenizer.perl --language pl"
|
"!echo $test_str | $TOKENIZER_SCRIPTS/tokenizer.perl --language pl | $TOKENIZER_SCRIPTS/detokenizer.perl --language pl"
|
||||||
]
|
]
|
||||||
@ -435,43 +467,113 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 32,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Tokenizer Version 1.1\n",
|
||||||
|
"Language: en\n",
|
||||||
|
"Number of threads: 1\n",
|
||||||
|
"sytuacja polityczna jest tak niepewna , że wcale by mnie nie zdziwiło , gdyby około grudnia wybuchła wojna .\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"!echo $test_str | $TOKENIZER_SCRIPTS/tokenizer.perl --language pl | $TOKENIZER_SCRIPTS/lowercase.perl"
|
"!echo $test_str | $TOKENIZER_SCRIPTS/tokenizer.perl --language pl | $TOKENIZER_SCRIPTS/lowercase.perl"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 33,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Tokenizer Version 1.1\n",
|
||||||
|
"Language: en\n",
|
||||||
|
"Number of threads: 1\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"!cat lalka-tom-pierwszy.txt | $TOKENIZER_SCRIPTS/tokenizer.perl --language pl | $TOKENIZER_SCRIPTS/lowercase.perl > lalka-tom-pierwszy-tokenized-lowercased.txt"
|
"!cat lalka-tom-pierwszy.txt | $TOKENIZER_SCRIPTS/tokenizer.perl --language pl | $TOKENIZER_SCRIPTS/lowercase.perl > lalka-tom-pierwszy-tokenized-lowercased.txt"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 34,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Tokenizer Version 1.1\n",
|
||||||
|
"Language: en\n",
|
||||||
|
"Number of threads: 1\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"!cat lalka-tom-drugi.txt | $TOKENIZER_SCRIPTS/tokenizer.perl --language pl | $TOKENIZER_SCRIPTS/lowercase.perl > lalka-tom-drugi-tokenized-lowercased.txt"
|
"!cat lalka-tom-drugi.txt | $TOKENIZER_SCRIPTS/tokenizer.perl --language pl | $TOKENIZER_SCRIPTS/lowercase.perl > lalka-tom-drugi-tokenized-lowercased.txt"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 35,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"=== 1/5 Counting and sorting n-grams ===\n",
|
||||||
|
"Reading /home/pawel/moj-2024/lab/lalka-tom-pierwszy-tokenized-lowercased.txt\n",
|
||||||
|
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
|
||||||
|
"****************************************************************************************************\n",
|
||||||
|
"Unigram tokens 149285 types 22230\n",
|
||||||
|
"=== 2/5 Calculating and sorting adjusted counts ===\n",
|
||||||
|
"Chain sizes: 1:266760 2:2262010112 3:4241268992 4:6786030592\n",
|
||||||
|
"Statistics:\n",
|
||||||
|
"1 8857/22230 D1=0.664486 D2=1.14301 D3+=1.57055\n",
|
||||||
|
"2 14632/86142 D1=0.838336 D2=1.2415 D3+=1.40935\n",
|
||||||
|
"3 8505/128074 D1=0.931027 D2=1.29971 D3+=1.54806\n",
|
||||||
|
"4 3174/138744 D1=0.967887 D2=1.35058 D3+=1.70692\n",
|
||||||
|
"Memory estimate for binary LM:\n",
|
||||||
|
"type kB\n",
|
||||||
|
"probing 822 assuming -p 1.5\n",
|
||||||
|
"probing 993 assuming -r models -p 1.5\n",
|
||||||
|
"trie 480 without quantization\n",
|
||||||
|
"trie 343 assuming -q 8 -b 8 quantization \n",
|
||||||
|
"trie 459 assuming -a 22 array pointer compression\n",
|
||||||
|
"trie 322 assuming -a 22 -q 8 -b 8 array pointer compression and quantization\n",
|
||||||
|
"=== 3/5 Calculating and sorting initial probabilities ===\n",
|
||||||
|
"Chain sizes: 1:106284 2:234112 3:170100 4:76176\n",
|
||||||
|
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
|
||||||
|
"**##################################################################################################\n",
|
||||||
|
"=== 4/5 Calculating and writing order-interpolated probabilities ===\n",
|
||||||
|
"Chain sizes: 1:106284 2:234112 3:170100 4:76176\n",
|
||||||
|
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
|
||||||
|
"####################################################################################################\n",
|
||||||
|
"=== 5/5 Writing ARPA model ===\n",
|
||||||
|
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
|
||||||
|
"****************************************************************************************************\n",
|
||||||
|
"Name:lmplz\tVmPeak:13142612 kB\tVmRSS:7392 kB\tRSSMax:2624428 kB\tuser:0.229863\tsys:0.579255\tCPU:0.809192\treal:0.791505\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"!$KENLM_BUILD_PATH/bin/lmplz -o 4 --prune 1 1 1 1 < lalka-tom-pierwszy-tokenized-lowercased.txt > lalka_tom_pierwszy_lm.arpa"
|
"!$KENLM_BUILD_PATH/bin/lmplz -o 4 --prune 1 1 1 1 < lalka-tom-pierwszy-tokenized-lowercased.txt > lalka_tom_pierwszy_lm.arpa"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 36,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -480,7 +582,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 37,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -489,9 +591,20 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 38,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'sytuacja polityczna jest tak niepewna , że wcale by mnie nie zdziwiło , gdyby około grudnia wybuchła wojna .'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 38,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"test_str"
|
"test_str"
|
||||||
]
|
]
|
||||||
@ -507,18 +620,43 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 39,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Reading lalka_tom_pierwszy_lm.arpa\n",
|
||||||
|
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
|
||||||
|
"****************************************************************************************************\n",
|
||||||
|
"SUCCESS\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"!$KENLM_BUILD_PATH/bin/build_binary lalka_tom_pierwszy_lm.arpa lalka_tom_pierwszy_lm.binary"
|
"!$KENLM_BUILD_PATH/bin/build_binary lalka_tom_pierwszy_lm.arpa lalka_tom_pierwszy_lm.binary"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 40,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"This binary file contains probing hash tables.\n",
|
||||||
|
"sytuacja=0 1 -5.568051\tpolityczna=0 1 -4.4812803\tjest=91 1 -2.6271343\ttak=175 2 -1.7584295\tniepewna=0 1 -4.603079\t,=22 1 -1.2027187\tże=90 2 -1.2062931\twcale=375 1 -4.0545278\tby=995 1 -3.5268068\tmnie=1491 2 -1.6614945\tnie=94 2 -1.4855772\tzdziwiło=0 1 -4.708499\t,=22 1 -1.2027187\tgdyby=555 2 -2.4179027\tokoło=957 1 -3.7740536\tgrudnia=0 1 -4.605748\twybuchła=0 1 -4.4812803\twojna=849 1 -4.213117\t.=42 1 -1.3757544\t</s>=2 2 -0.46293145\tTotal: -59.417397 OOV: 6\n",
|
||||||
|
"Perplexity including OOVs:\t935.1253434773644\n",
|
||||||
|
"Perplexity excluding OOVs:\t162.9687064350829\n",
|
||||||
|
"OOVs:\t6\n",
|
||||||
|
"Tokens:\t20\n",
|
||||||
|
"Name:query\tVmPeak:8864 kB\tVmRSS:4504 kB\tRSSMax:5328 kB\tuser:0.002388\tsys:0\tCPU:0.0024207\treal:0.000614597\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"!echo $test_str | $KENLM_BUILD_PATH/bin/query lalka_tom_pierwszy_lm.binary"
|
"!echo $test_str | $KENLM_BUILD_PATH/bin/query lalka_tom_pierwszy_lm.binary"
|
||||||
]
|
]
|
||||||
@ -534,9 +672,115 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 41,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Builds unpruned language models with modified Kneser-Ney smoothing.\n",
|
||||||
|
"\n",
|
||||||
|
"Please cite:\n",
|
||||||
|
"@inproceedings{Heafield-estimate,\n",
|
||||||
|
" author = {Kenneth Heafield and Ivan Pouzyrevsky and Jonathan H. Clark and Philipp Koehn},\n",
|
||||||
|
" title = {Scalable Modified {Kneser-Ney} Language Model Estimation},\n",
|
||||||
|
" year = {2013},\n",
|
||||||
|
" month = {8},\n",
|
||||||
|
" booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics},\n",
|
||||||
|
" address = {Sofia, Bulgaria},\n",
|
||||||
|
" url = {http://kheafield.com/professional/edinburgh/estimate\\_paper.pdf},\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"Provide the corpus on stdin. The ARPA file will be written to stdout. Order of\n",
|
||||||
|
"the model (-o) is the only mandatory option. As this is an on-disk program,\n",
|
||||||
|
"setting the temporary file location (-T) and sorting memory (-S) is recommended.\n",
|
||||||
|
"\n",
|
||||||
|
"Memory sizes are specified like GNU sort: a number followed by a unit character.\n",
|
||||||
|
"Valid units are % for percentage of memory (supported platforms only) and (in\n",
|
||||||
|
"increasing powers of 1024): b, K, M, G, T, P, E, Z, Y. Default is K (*1024).\n",
|
||||||
|
"This machine has 16611971072 bytes of memory.\n",
|
||||||
|
"\n",
|
||||||
|
"Language model building options:\n",
|
||||||
|
" -h [ --help ] Show this help message\n",
|
||||||
|
" -o [ --order ] arg Order of the model\n",
|
||||||
|
" --interpolate_unigrams [=arg(=1)] (=1)\n",
|
||||||
|
" Interpolate the unigrams (default) as \n",
|
||||||
|
" opposed to giving lots of mass to <unk>\n",
|
||||||
|
" like SRI. If you want SRI's behavior \n",
|
||||||
|
" with a large <unk> and the old lmplz \n",
|
||||||
|
" default, use --interpolate_unigrams 0.\n",
|
||||||
|
" --skip_symbols Treat <s>, </s>, and <unk> as \n",
|
||||||
|
" whitespace instead of throwing an \n",
|
||||||
|
" exception\n",
|
||||||
|
" -T [ --temp_prefix ] arg (=/tmp/) Temporary file prefix\n",
|
||||||
|
" -S [ --memory ] arg (=80%) Sorting memory\n",
|
||||||
|
" --minimum_block arg (=8K) Minimum block size to allow\n",
|
||||||
|
" --sort_block arg (=64M) Size of IO operations for sort \n",
|
||||||
|
" (determines arity)\n",
|
||||||
|
" --block_count arg (=2) Block count (per order)\n",
|
||||||
|
" --vocab_estimate arg (=1000000) Assume this vocabulary size for \n",
|
||||||
|
" purposes of calculating memory in step \n",
|
||||||
|
" 1 (corpus count) and pre-sizing the \n",
|
||||||
|
" hash table\n",
|
||||||
|
" --vocab_pad arg (=0) If the vocabulary is smaller than this \n",
|
||||||
|
" value, pad with <unk> to reach this \n",
|
||||||
|
" size. Requires --interpolate_unigrams\n",
|
||||||
|
" --verbose_header Add a verbose header to the ARPA file \n",
|
||||||
|
" that includes information such as token\n",
|
||||||
|
" count, smoothing type, etc.\n",
|
||||||
|
" --text arg Read text from a file instead of stdin\n",
|
||||||
|
" --arpa arg Write ARPA to a file instead of stdout\n",
|
||||||
|
" --intermediate arg Write ngrams to intermediate files. \n",
|
||||||
|
" Turns off ARPA output (which can be \n",
|
||||||
|
" reactivated by --arpa file). Forces \n",
|
||||||
|
" --renumber on.\n",
|
||||||
|
" --renumber Renumber the vocabulary identifiers so \n",
|
||||||
|
" that they are monotone with the hash of\n",
|
||||||
|
" each string. This is consistent with \n",
|
||||||
|
" the ordering used by the trie data \n",
|
||||||
|
" structure.\n",
|
||||||
|
" --collapse_values Collapse probability and backoff into a\n",
|
||||||
|
" single value, q that yields the same \n",
|
||||||
|
" sentence-level probabilities. See \n",
|
||||||
|
" http://kheafield.com/professional/edinb\n",
|
||||||
|
" urgh/rest_paper.pdf for more details, \n",
|
||||||
|
" including a proof.\n",
|
||||||
|
" --prune arg Prune n-grams with count less than or \n",
|
||||||
|
" equal to the given threshold. Specify \n",
|
||||||
|
" one value for each order i.e. 0 0 1 to \n",
|
||||||
|
" prune singleton trigrams and above. \n",
|
||||||
|
" The sequence of values must be \n",
|
||||||
|
" non-decreasing and the last value \n",
|
||||||
|
" applies to any remaining orders. \n",
|
||||||
|
" Default is to not prune, which is \n",
|
||||||
|
" equivalent to --prune 0.\n",
|
||||||
|
" --limit_vocab_file arg Read allowed vocabulary separated by \n",
|
||||||
|
" whitespace. N-grams that contain \n",
|
||||||
|
" vocabulary items not in this list will \n",
|
||||||
|
" be pruned. Can be combined with --prune\n",
|
||||||
|
" arg\n",
|
||||||
|
" --discount_fallback [=arg(=0.5 1 1.5)]\n",
|
||||||
|
" The closed-form estimate for Kneser-Ney\n",
|
||||||
|
" discounts does not work without \n",
|
||||||
|
" singletons or doubletons. It can also \n",
|
||||||
|
" fail if these values are out of range. \n",
|
||||||
|
" This option falls back to \n",
|
||||||
|
" user-specified discounts when the \n",
|
||||||
|
" closed-form estimate fails. Note that \n",
|
||||||
|
" this option is generally a bad idea: \n",
|
||||||
|
" you should deduplicate your corpus \n",
|
||||||
|
" instead. However, class-based models \n",
|
||||||
|
" need custom discounts because they lack\n",
|
||||||
|
" singleton unigrams. Provide up to \n",
|
||||||
|
" three discounts (for adjusted counts 1,\n",
|
||||||
|
" 2, and 3+), which will be applied to \n",
|
||||||
|
" all orders where the closed-form \n",
|
||||||
|
" estimates fail.\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"!$KENLM_BUILD_PATH/bin/lmplz "
|
"!$KENLM_BUILD_PATH/bin/lmplz "
|
||||||
]
|
]
|
||||||
@ -550,18 +794,47 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 42,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Defaulting to user installation because normal site-packages is not writeable\n",
|
||||||
|
"Collecting https://github.com/kpu/kenlm/archive/master.zip\n",
|
||||||
|
" Downloading https://github.com/kpu/kenlm/archive/master.zip\n",
|
||||||
|
"\u001b[2K \u001b[32m-\u001b[0m \u001b[32m553.6 kB\u001b[0m \u001b[31m851.1 kB/s\u001b[0m \u001b[33m0:00:00\u001b[0m\n",
|
||||||
|
"\u001b[?25h Installing build dependencies ... \u001b[?25ldone\n",
|
||||||
|
"\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n",
|
||||||
|
"\u001b[?25h Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
|
||||||
|
"\u001b[?25hBuilding wheels for collected packages: kenlm\n",
|
||||||
|
" Building wheel for kenlm (pyproject.toml) ... \u001b[?25ldone\n",
|
||||||
|
"\u001b[?25h Created wheel for kenlm: filename=kenlm-0.2.0-cp310-cp310-linux_x86_64.whl size=3184348 sha256=c9da9a754aa07ffa26f8983ced2910a547d665006e39fd053d365b802b4135e9\n",
|
||||||
|
" Stored in directory: /tmp/pip-ephem-wheel-cache-e8zp2xqd/wheels/a5/73/ee/670fbd0cee8f6f0b21d10987cb042291e662e26e1a07026462\n",
|
||||||
|
"Successfully built kenlm\n",
|
||||||
|
"Installing collected packages: kenlm\n",
|
||||||
|
"Successfully installed kenlm-0.2.0\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"!pip install https://github.com/kpu/kenlm/archive/master.zip"
|
"!pip install https://github.com/kpu/kenlm/archive/master.zip"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 43,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"-59.417396545410156\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"import kenlm\n",
|
"import kenlm\n",
|
||||||
"model = kenlm.Model('lalka_tom_pierwszy_lm.binary')\n",
|
"model = kenlm.Model('lalka_tom_pierwszy_lm.binary')\n",
|
||||||
@ -570,9 +843,36 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 44,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"(-5.568050861358643, 1, True)\n",
|
||||||
|
"(-4.481280326843262, 1, True)\n",
|
||||||
|
"(-2.627134323120117, 1, False)\n",
|
||||||
|
"(-1.7584295272827148, 2, False)\n",
|
||||||
|
"(-4.603078842163086, 1, True)\n",
|
||||||
|
"(-1.202718734741211, 1, False)\n",
|
||||||
|
"(-1.2062931060791016, 2, False)\n",
|
||||||
|
"(-4.054527759552002, 1, False)\n",
|
||||||
|
"(-3.5268068313598633, 1, False)\n",
|
||||||
|
"(-1.661494493484497, 2, False)\n",
|
||||||
|
"(-1.4855772256851196, 2, False)\n",
|
||||||
|
"(-4.708498954772949, 1, True)\n",
|
||||||
|
"(-1.202718734741211, 1, False)\n",
|
||||||
|
"(-2.417902708053589, 2, False)\n",
|
||||||
|
"(-3.7740535736083984, 1, False)\n",
|
||||||
|
"(-4.605748176574707, 1, True)\n",
|
||||||
|
"(-4.481280326843262, 1, True)\n",
|
||||||
|
"(-4.2131171226501465, 1, False)\n",
|
||||||
|
"(-1.3757543563842773, 1, False)\n",
|
||||||
|
"(-0.46293145418167114, 2, False)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"for i in model.full_scores(test_str):\n",
|
"for i in model.full_scores(test_str):\n",
|
||||||
" print(i)"
|
" print(i)"
|
||||||
|
Loading…
Reference in New Issue
Block a user