fix

2021-05-17 15:27:17 +02:00 · 2021-05-17 15:27:17 +02:00 · 349b8d949b
commit 349b8d949b
parent 899f1d3949
1 changed files with 0 additions and 402 deletions
--- a/gramowy.ipynb
+++ b/gramowy.ipynb
@ -265,408 +265,6 @@
    "### Perplexity\n",
    "$PP(w_1,\\ldots, w_n) = p(w_1,\\ldots, w_n)^{-\\frac{1}{N}}$"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ODPOWIEDZI\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### zad1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tally every 1-gram of the tokenized training corpus; keys are 1-tuples such as ('lasu',).\n",
    "ngrams_1 = [*ngrams(corpora_train_tokenized, 1)]\n",
    "cnt_1_grams = Counter(ngrams_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cnt_1_grams.most_common(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### zad2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tally every 2-gram of the tokenized training corpus; keys are 2-tuples such as ('do', 'lasu').\n",
    "ngrams_2 = [*ngrams(corpora_train_tokenized, 2)]\n",
    "cnt_2_grams = Counter(ngrams_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "cnt_2_grams"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cnt_2_grams.most_common(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def bigram_most_probable(word1):\n",
    "    \"\"\"Return up to ten most frequent bigrams starting with word1, mapped to their relative frequency.\n",
    "\n",
    "    Counts come from the global cnt_2_grams. Each value is the bigram count\n",
    "    divided by the total count of all bigrams whose first word is word1.\n",
    "    The dict is ordered from the most to the least frequent bigram and is\n",
    "    empty when word1 never starts a bigram.\n",
    "    \"\"\"\n",
    "    followers = Counter()\n",
    "    for bigram, number in cnt_2_grams.items():\n",
    "        if bigram[0] == word1:\n",
    "            followers[bigram] = number\n",
    "    denominator = sum(followers.values())\n",
    "    return {bigram: number / denominator for bigram, number in followers.most_common(10)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bigram_most_probable('pan')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### zad3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tally every 3-gram of the tokenized training corpus; keys are 3-tuples such as ('poszła', 'do', 'lasu').\n",
    "ngrams_3 = [*ngrams(corpora_train_tokenized, 3)]\n",
    "cnt_3_grams = Counter(ngrams_3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "word1 = 'pan'\n",
    "word2 = 'sędzia'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Trigrams that start with (word1, word2), taken from the trigram table cnt_3_grams.\n",
    "count = Counter({trigram: number for trigram, number in cnt_3_grams.items() if trigram[0] == word1 and trigram[1] == word2})\n",
    "# Total occurrences of the (word1, word2) context, and its ten most frequent continuations.\n",
    "total = sum(count.values())\n",
    "most_common = dict(count.most_common(10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cnt_3_grams.most_common(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def trigram_most_probable(word1, word2):\n",
    "    \"\"\"Return up to ten most frequent trigrams starting with (word1, word2), mapped to their relative frequency.\n",
    "\n",
    "    Counts come from the global cnt_3_grams. Each value is the trigram count\n",
    "    divided by the total count of all trigrams that begin with word1, word2.\n",
    "    The dict is ordered from the most to the least frequent trigram and is\n",
    "    empty when that two-word context never occurs.\n",
    "    \"\"\"\n",
    "    continuations = Counter()\n",
    "    for trigram, number in cnt_3_grams.items():\n",
    "        if trigram[0] == word1 and trigram[1] == word2:\n",
    "            continuations[trigram] = number\n",
    "    denominator = sum(continuations.values())\n",
    "    return {trigram: number / denominator for trigram, number in continuations.most_common(10)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "trigram_most_probable('pan', 'sędzia')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Counter({trigram:number for trigram,number in cnt_3_grams.items() if trigram[0] == 'pan' and trigram[1] == 'sędzia'})\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### zad5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cnt_1_grams.most_common(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cnt_1_grams[('lasu',)] / sum(cnt_1_grams.values())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# P(lasu | do): count of ('do', 'lasu') over the count of all bigrams starting with 'do'.\n",
    "(cnt_2_grams[('do', 'lasu')] /\n",
    " sum(number for bigram, number in cnt_2_grams.items() if bigram[0] == 'do')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# P(lasu | poszła do): count of the trigram over the count of all trigrams\n",
    "# that start with the context ('poszła', 'do'); 0.0 when that context never occurs.\n",
    "context_count = sum(\n",
    "    number for trigram, number in cnt_3_grams.items()\n",
    "    if trigram[0] == 'poszła' and trigram[1] == 'do'\n",
    ")\n",
    "cnt_3_grams[('poszła', 'do', 'lasu')] / context_count if context_count else 0.0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### zad6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized1 = list(tokenize('I z łąk, i z pastwisk razem wracało do dworu',lowercase = True)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cnt_2_grams[('do','dworu')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_bigram_prob(tokenized_list):\n",
    "    \"\"\"Return the probability of a token sequence under the unsmoothed bigram model.\n",
    "\n",
    "    The first token contributes its relative frequency in the global\n",
    "    cnt_1_grams; every following token contributes\n",
    "    count(prev, word) / count(bigrams starting with prev) from the global\n",
    "    cnt_2_grams.\n",
    "\n",
    "    Returns 0.0 when a context word never starts a bigram (instead of\n",
    "    dividing by zero). Raises IndexError for an empty list.\n",
    "    \"\"\"\n",
    "    # Count of bigrams starting with each word, built once per call rather\n",
    "    # than rescanning the whole bigram table for every token.\n",
    "    context_totals = Counter()\n",
    "    for bigram, number in cnt_2_grams.items():\n",
    "        context_totals[bigram[0]] += number\n",
    "\n",
    "    prob = cnt_1_grams[(tokenized_list[0],)] / sum(cnt_1_grams.values())\n",
    "    for prev_word, word in zip(tokenized_list, tokenized_list[1:]):\n",
    "        context_total = context_totals[prev_word]\n",
    "        if context_total == 0:\n",
    "            # Unseen context: the unsmoothed model assigns no probability mass.\n",
    "            return 0.0\n",
    "        prob *= cnt_2_grams[(prev_word, word)] / context_total\n",
    "    return prob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "get_bigram_prob(tokenized1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized2 = list(tokenize(\"Tadeusz lewą dłonią dotykając głowy, Pozdrowił swych dowódców przez ukłon wojskowy;\",lowercase = True)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "get_bigram_prob(tokenized2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(cnt_1_grams.keys())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### zad7 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_bigram_smoothed_prob(tokenized_list):\n",
    "    \"\"\"Return the probability of a token sequence under the add-one smoothed bigram model.\n",
    "\n",
    "    With V = number of distinct unigrams in the global cnt_1_grams and\n",
    "    N = total unigram count, the first token contributes\n",
    "    (count(word) + 1) / (N + V) and every following token contributes\n",
    "    (count(prev, word) + 1) / (count(bigrams starting with prev) + V),\n",
    "    using the global cnt_2_grams.\n",
    "\n",
    "    The first-token term is smoothed as well, so an unseen first token no\n",
    "    longer forces the whole product to 0. Raises IndexError for an empty list.\n",
    "    \"\"\"\n",
    "    v_total = len(cnt_1_grams)\n",
    "    n_total = sum(cnt_1_grams.values())\n",
    "\n",
    "    # Count of bigrams starting with each word, built once per call rather\n",
    "    # than rescanning the whole bigram table for every token.\n",
    "    context_totals = Counter()\n",
    "    for bigram, number in cnt_2_grams.items():\n",
    "        context_totals[bigram[0]] += number\n",
    "\n",
    "    prob = (cnt_1_grams[(tokenized_list[0],)] + 1) / (n_total + v_total)\n",
    "    for prev_word, word in zip(tokenized_list, tokenized_list[1:]):\n",
    "        prob *= (cnt_2_grams[(prev_word, word)] + 1) / (context_totals[prev_word] + v_total)\n",
    "    return prob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "get_bigram_smoothed_prob(tokenized1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "get_bigram_smoothed_prob(tokenized2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "get_bigram_prob(tokenized1) ** (-1/len(tokenized1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "get_bigram_prob(tokenized2) ** (-1/len(tokenized2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "get_bigram_smoothed_prob(tokenized1) ** (-1/len(tokenized1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "get_bigram_smoothed_prob(tokenized2) ** (-1/len(tokenized2))"
   ]
  }
 ],
 "metadata": {