This commit is contained in:
kubapok 2021-05-17 15:27:17 +02:00
parent 899f1d3949
commit 349b8d949b

View File

@ -265,408 +265,6 @@
"### Perplexity\n", "### Perplexity\n",
"$PP(w_1,\\ldots, w_n) = p(w_1,\\ldots, w_n)^{-\\frac{1}{N}}$" "$PP(w_1,\\ldots, w_n) = p(w_1,\\ldots, w_n)^{-\\frac{1}{N}}$"
] ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ODPOWIEDZI\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### zad1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ngrams_1 = list(ngrams(corpora_train_tokenized, 1))\n",
"cnt_1_grams = Counter(ngrams_1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cnt_1_grams.most_common(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### zad2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ngrams_2 = list(ngrams(corpora_train_tokenized, 2))\n",
"cnt_2_grams = Counter(ngrams_2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"cnt_2_grams"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cnt_2_grams.most_common(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def bigram_most_probable(word1):\n",
"    \"\"\"Return the 10 most frequent bigrams starting with `word1` and their relative frequencies.\n",
"\n",
"    Each value is the bigram's count in `cnt_2_grams` divided by the summed count of\n",
"    every bigram whose first element equals `word1`. The dict keeps the\n",
"    most-common-first order. Returns an empty dict when no bigram starts with `word1`.\n",
"    \"\"\"\n",
"    word1_count = Counter({bigram: number for bigram, number in cnt_2_grams.items() if bigram[0] == word1})\n",
"    # Normalise by ALL continuations of word1, not only by the ten that are returned.\n",
"    total = sum(word1_count.values())\n",
"    return {bigram: number / total for bigram, number in word1_count.most_common(10)}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bigram_most_probable('pan')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### zad3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ngrams_3 = list(ngrams(corpora_train_tokenized, 3))\n",
"cnt_3_grams = Counter(ngrams_3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"word1 = 'pan'\n",
"word2 = 'sędzia'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Trigrams that continue the (word1, word2) context, read from the trigram counter\n",
"# (the same source that trigram_most_probable uses).\n",
"count = Counter({trigram: number for trigram, number in cnt_3_grams.items() if trigram[0] == word1 and trigram[1] == word2})\n",
"total = sum(count.values())\n",
"most_common = dict(count.most_common(10))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cnt_3_grams.most_common(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def trigram_most_probable(word1, word2):\n",
"    \"\"\"Return the 10 most frequent trigrams starting with (`word1`, `word2`) and their relative frequencies.\n",
"\n",
"    Each value is the trigram's count in `cnt_3_grams` divided by the summed count of\n",
"    every trigram whose first two elements are `word1` and `word2`. The dict keeps the\n",
"    most-common-first order. Returns an empty dict when no trigram has that prefix.\n",
"    \"\"\"\n",
"    count = Counter({trigram: number for trigram, number in cnt_3_grams.items() if trigram[0] == word1 and trigram[1] == word2})\n",
"    # Normalise by ALL continuations of the context, not only by the ten that are returned.\n",
"    total = sum(count.values())\n",
"    return {trigram: number / total for trigram, number in count.most_common(10)}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"trigram_most_probable('pan', 'sędzia')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Counter({trigram:number for trigram,number in cnt_3_grams.items() if trigram[0] == 'pan' and trigram[1] == 'sędzia'})\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### zad5"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cnt_1_grams.most_common(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cnt_1_grams[('lasu',)] / sum(cnt_1_grams.values())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# P(lasu | do): count of ('do', 'lasu') over the count of all bigrams that start with 'do'.\n",
"(cnt_2_grams[('do', 'lasu')] /\n",
"    sum(number for bigram, number in cnt_2_grams.items() if bigram[0] == 'do')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# P(lasu | poszła, do): count of the trigram over the count of all trigrams that share\n",
"# its two-word context ('poszła', 'do'), the same normalisation trigram_most_probable uses.\n",
"poszla_do_total = sum(\n",
"    number for trigram, number in cnt_3_grams.items()\n",
"    if trigram[0] == 'poszła' and trigram[1] == 'do'\n",
")\n",
"# An unseen context means the trigram count is 0 too; report 0.0 instead of dividing by zero.\n",
"cnt_3_grams[('poszła', 'do', 'lasu')] / poszla_do_total if poszla_do_total else 0.0"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### zad6"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokenized1 = list(tokenize('I z łąk, i z pastwisk razem wracało do dworu',lowercase = True)) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokenized1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cnt_2_grams[('do','dworu')]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_bigram_prob(tokenized_list):\n",
"    \"\"\"Return the unsmoothed bigram-model probability of a token sequence.\n",
"\n",
"    The first token is scored with its relative frequency in `cnt_1_grams`; every later\n",
"    token is scored with count(prev, word) / count(prev, *) taken from `cnt_2_grams`.\n",
"    Returns 0.0 when some context word starts no bigram at all.\n",
"    Raises IndexError when `tokenized_list` is empty.\n",
"    \"\"\"\n",
"    prob = cnt_1_grams[(tokenized_list[0],)] / sum(cnt_1_grams.values())\n",
"    for prev_word, word in zip(tokenized_list, tokenized_list[1:]):\n",
"        # Total count of bigrams that begin with prev_word (the conditioning context).\n",
"        context_total = sum(number for bigram, number in cnt_2_grams.items() if bigram[0] == prev_word)\n",
"        if context_total == 0:\n",
"            # Unseen context: the bigram count is 0 as well, so the whole product is 0.\n",
"            return 0.0\n",
"        prob *= cnt_2_grams[(prev_word, word)] / context_total\n",
"    return prob"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"get_bigram_prob(tokenized1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokenized2 = list(tokenize(\"Tadeusz lewą dłonią dotykając głowy, Pozdrowił swych dowódców przez ukłon wojskowy;\",lowercase = True)) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokenized2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"get_bigram_prob(tokenized2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(cnt_1_grams.keys())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### zad7"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_bigram_smoothed_prob(tokenized_list):\n",
"    \"\"\"Return the bigram-model probability of a token sequence with add-one smoothing.\n",
"\n",
"    The first token is scored with its (unsmoothed) relative frequency in `cnt_1_grams`.\n",
"    Every later token is scored with (count(prev, word) + 1) / (V + count(prev, *)),\n",
"    where the counts come from `cnt_2_grams` and V is the number of distinct unigrams.\n",
"    Raises IndexError when `tokenized_list` is empty.\n",
"    \"\"\"\n",
"    v_total = len(cnt_1_grams)\n",
"    prob = cnt_1_grams[(tokenized_list[0],)] / sum(cnt_1_grams.values())\n",
"    for prev_word, word in zip(tokenized_list, tokenized_list[1:]):\n",
"        # Total count of bigrams that begin with prev_word (the conditioning context).\n",
"        context_total = sum(number for bigram, number in cnt_2_grams.items() if bigram[0] == prev_word)\n",
"        prob *= (cnt_2_grams[(prev_word, word)] + 1) / (v_total + context_total)\n",
"    return prob"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"get_bigram_smoothed_prob(tokenized1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"get_bigram_smoothed_prob(tokenized2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"get_bigram_prob(tokenized1) ** (-1/len(tokenized1))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"get_bigram_prob(tokenized2) ** (-1/len(tokenized2))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"get_bigram_smoothed_prob(tokenized1) ** (-1/len(tokenized1))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"get_bigram_smoothed_prob(tokenized2) ** (-1/len(tokenized2))"
]
} }
], ],
"metadata": { "metadata": {