fix
This commit is contained in:
parent
899f1d3949
commit
349b8d949b
@ -265,408 +265,6 @@
|
||||
"### Perplexity\n",
|
||||
"$PP(w_1,\\ldots, w_n) = p(w_1,\\ldots, w_n)^{-\\frac{1}{N}}$"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### ODPOWIEDZI\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### zad1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ngrams_1 = list(ngrams(corpora_train_tokenized, 1))\n",
|
||||
"cnt_1_grams = Counter(ngrams_1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cnt_1_grams.most_common(10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### zad2"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ngrams_2 = list(ngrams(corpora_train_tokenized, 2))\n",
|
||||
"cnt_2_grams = Counter(ngrams_2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cnt_2_grams"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cnt_2_grams.most_common(10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def bigram_most_probable(word1):\n",
|
||||
"    \"\"\"Return the 10 most frequent bigrams starting with word1, mapped to P(w2 | word1).\"\"\"\n",
|
||||
"    word1_count = Counter({bigram:number for bigram,number in cnt_2_grams.items() if bigram[0] == word1})\n",
|
||||
"    total = sum(word1_count.values())\n",
|
||||
"    if total == 0:  # word1 never seen as a bigram prefix -> avoid ZeroDivisionError\n",
|
||||
"        return {}\n",
|
||||
"    word1_most_common = dict(word1_count.most_common(10))\n",
|
||||
"    for k in word1_most_common.keys():\n",
|
||||
"        word1_most_common[k] /= total\n",
|
||||
"    return word1_most_common"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"bigram_most_probable('pan')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### zad3"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ngrams_3 = list(ngrams(corpora_train_tokenized, 3))\n",
|
||||
"cnt_3_grams = Counter(ngrams_3)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"word1 = 'pan'\n",
|
||||
"word2 = 'sędzia'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"count = Counter({trigram:number for trigram,number in cnt_3_grams.items() if trigram[0] == word1 and trigram[1] == word2})\n",
|
||||
"total = sum(count.values())\n",
|
||||
"most_common = dict(count.most_common(10))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cnt_3_grams.most_common(10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def trigram_most_probable(word1, word2):\n",
|
||||
"    \"\"\"Return the 10 most frequent trigrams starting with (word1, word2), mapped to P(w3 | word1, word2).\"\"\"\n",
|
||||
"    count = Counter({trigram:number for trigram,number in cnt_3_grams.items() if trigram[0] == word1 and trigram[1] == word2})\n",
|
||||
"    total = sum(count.values())\n",
|
||||
"    if total == 0:  # context (word1, word2) never seen -> avoid ZeroDivisionError\n",
|
||||
"        return {}\n",
|
||||
"    most_common = dict(count.most_common(10))\n",
|
||||
"    for k in most_common.keys():\n",
|
||||
"        most_common[k] /= total\n",
|
||||
"    return most_common"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"trigram_most_probable('pan', 'sędzia')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Counter({trigram:number for trigram,number in cnt_3_grams.items() if trigram[0] == 'pan' and trigram[1] == 'sędzia'})\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### zad5"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cnt_1_grams.most_common(10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cnt_1_grams[('lasu',)] / sum(cnt_1_grams.values())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# P(lasu | do) = count(do, lasu) / count(do, *)\n",
|
||||
"(cnt_2_grams[('do','lasu',)] / \n",
|
||||
" sum({bigram:number for bigram,number in cnt_2_grams.items() if bigram[0] == 'do'}.values())\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# P(lasu | poszła, do) = count(poszła, do, lasu) / count(poszła, do, *)\n",
|
||||
"(cnt_3_grams[('poszła', 'do','lasu',)]\n",
|
||||
" /\n",
|
||||
" sum({trigram:number for trigram,number in cnt_3_grams.items() if trigram[0] == 'poszła' and trigram[1] == 'do'}.values())\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### zad6"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenized1 = list(tokenize('I z łąk, i z pastwisk razem wracało do dworu',lowercase = True)) "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenized1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cnt_2_grams[('do','dworu')]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_bigram_prob(tokenized_list):\n",
|
||||
"    \"\"\"Unsmoothed bigram chain probability: P(w1) * prod_i P(w_i | w_{i-1}).\"\"\"\n",
|
||||
"    prob = cnt_1_grams[(tokenized_list[0],)] / sum(cnt_1_grams.values())\n",
|
||||
"    for i in range(1,len(tokenized_list)):\n",
|
||||
"        word = tokenized_list[i]\n",
|
||||
"        prev_word = tokenized_list[i-1]\n",
|
||||
"        prob *= (cnt_2_grams[(prev_word,word)] / \n",
|
||||
"                 sum({bigram:number for bigram,number in cnt_2_grams.items() if bigram[0] == prev_word}.values())\n",
|
||||
"                )\n",
|
||||
"    return prob"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"get_bigram_prob(tokenized1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenized2 = list(tokenize(\"Tadeusz lewą dłonią dotykając głowy, Pozdrowił swych dowódców przez ukłon wojskowy;\",lowercase = True)) "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenized2"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"get_bigram_prob(tokenized2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(cnt_1_grams.keys())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### zad7 "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_bigram_smoothed_prob(tokenized_list):\n",
|
||||
"    \"\"\"Bigram chain probability with add-one (Laplace) smoothing on the conditional terms.\n",
|
||||
"\n",
|
||||
"    NOTE(review): the initial unigram factor is left unsmoothed, so an OOV first\n",
|
||||
"    word still yields probability 0 -- confirm whether that is intended.\n",
|
||||
"    \"\"\"\n",
|
||||
"    v_total = len(cnt_1_grams.keys())  # vocabulary size V for the add-one denominator\n",
|
||||
"    prob = cnt_1_grams[(tokenized_list[0],)] / sum(cnt_1_grams.values())\n",
|
||||
"    for i in range(1,len(tokenized_list)):\n",
|
||||
"        word = tokenized_list[i]\n",
|
||||
"        prev_word = tokenized_list[i-1]\n",
|
||||
"        prob *= ( (cnt_2_grams[(prev_word,word)] +1) / \n",
|
||||
"                 (v_total + sum({bigram:number for bigram,number in cnt_2_grams.items() if bigram[0] == prev_word}.values()))\n",
|
||||
"                )\n",
|
||||
"    return prob"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"get_bigram_smoothed_prob(tokenized1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"get_bigram_smoothed_prob(tokenized2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"get_bigram_prob(tokenized1) ** (-1/len(tokenized1))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"get_bigram_prob(tokenized2) ** (-1/len(tokenized2))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"get_bigram_smoothed_prob(tokenized1) ** (-1/len(tokenized1))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"get_bigram_smoothed_prob(tokenized2) ** (-1/len(tokenized2))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
Loading…
Reference in New Issue
Block a user