diff --git a/src/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/src/.ipynb_checkpoints/Untitled-checkpoint.ipynb new file mode 100644 index 0000000..7fec515 --- /dev/null +++ b/src/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/src/.ipynb_checkpoints/zajeciaipynb-checkpoint.ipynb b/src/.ipynb_checkpoints/zajeciaipynb-checkpoint.ipynb new file mode 100644 index 0000000..bd3881b --- /dev/null +++ b/src/.ipynb_checkpoints/zajeciaipynb-checkpoint.ipynb @@ -0,0 +1,904 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import regex as re\n", + "\n", + "def into_words(sentence):\n", + " return re.findall(r'\\p{P}|[^\\p{P}\\s]+', sentence)\n", + "\n", + "def into_characters(sentence):\n", + " return list(sentence)\n", + "\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Z',\n", + " 'a',\n", + " 'ż',\n", + " 'ó',\n", + " 'ł',\n", + " 'ć',\n", + " ' ',\n", + " 'j',\n", + " 'a',\n", + " 'ź',\n", + " 'n',\n", + " 'i',\n", + " 'ą',\n", + " ' ',\n", + " 'g',\n", + " 'ę',\n", + " 'ś',\n", + " 'l',\n", + " '.']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "into_characters(\"Zażółć jaźnią gęśl.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Ala', 'has', 'a', 'cat', 'and', 'a', 'dog', '.']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "into_words(\"Ala has a cat and a dog.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Humpty', '-', 'dumpty', '3s', ',', 'eg', '.', 'problems', '.']" + ] + }, + "execution_count": 5, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "into_words(\"Humpty-dumpty 3s, eg. problems.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Adam',\n", + " ',',\n", + " 'who',\n", + " 'smokes',\n", + " 'a',\n", + " 'lot',\n", + " ',',\n", + " 'caught',\n", + " 'COVID',\n", + " '-',\n", + " '19',\n", + " '.']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "into_words(\"Adam, who smokes a lot, caught COVID-19.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['A', 'l', 'a', ' ', 'h', 'a', 's', ' ', 'a', ' ', 'c', 'a', 't', '.']" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "into_characters(\"Ala has a cat.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "from syntok.tokenizer import Tokenizer\n", + "\n", + "def by_syntok(sentence):\n", + " tok = Tokenizer()\n", + " return [str(t) for t in tok.tokenize(sentence)]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Humpty',\n", + " '-dumpty',\n", + " ' and',\n", + " ' Alice',\n", + " ' has',\n", + " ' pets',\n", + " ' e.g',\n", + " '.',\n", + " ' dogs',\n", + " '!',\n", + " '!',\n", + " '!',\n", + " '!',\n", + " '!']" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "by_syntok(\"Humpty-dumpty and Alice has pets e.g. 
dogs!!!!!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "def add_markers(tokens):\n", + " return [''] + tokens + ['']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['', 'This', 'is', 'a', 'black', 'cat', '.', '']" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "add_markers(into_words('This is a black cat.'))" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['', 'Humpty', '-dumpty', ' jumped', '.', '']" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "add_markers(by_syntok(\"Humpty-dumpty jumped.\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Gathering simple counts" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "def gather_counts(from_n, to_n, sentences, splitter=lambda s: add_markers(into_words(s))):\n", + " counts = {}\n", + " counts[0] = {(): 0}\n", + " for sentence in sentences:\n", + " tokens = splitter(sentence)\n", + " ntokens = len(tokens)\n", + " counts[0][()] += ntokens\n", + " for n in range(from_n, to_n+1):\n", + " for i in range(0, ntokens-n+1):\n", + " ngram = tuple(tokens[i:i+n])\n", + " if n not in counts:\n", + " counts[n] = {}\n", + " \n", + " if ngram in counts[n]:\n", + " counts[n][ngram] += 1\n", + " else: \n", + " counts[n][ngram] = 1\n", + " return counts" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: {(): 17},\n", + " 1: {('',): 3,\n", + " ('Ala',): 1,\n", + " ('ma',): 2,\n", + " ('kota',): 1,\n", + " ('.',): 2,\n", + " ('',): 3,\n", + " ('Basia',): 1,\n", + " ('psa',): 1,\n", + " ('Gdzie',): 
1,\n", + " ('mieszkasz',): 1,\n", + " ('?',): 1},\n", + " 2: {('', 'Ala'): 1,\n", + " ('Ala', 'ma'): 1,\n", + " ('ma', 'kota'): 1,\n", + " ('kota', '.'): 1,\n", + " ('.', ''): 2,\n", + " ('', 'Basia'): 1,\n", + " ('Basia', 'ma'): 1,\n", + " ('ma', 'psa'): 1,\n", + " ('psa', '.'): 1,\n", + " ('', 'Gdzie'): 1,\n", + " ('Gdzie', 'mieszkasz'): 1,\n", + " ('mieszkasz', '?'): 1,\n", + " ('?', ''): 1},\n", + " 3: {('', 'Ala', 'ma'): 1,\n", + " ('Ala', 'ma', 'kota'): 1,\n", + " ('ma', 'kota', '.'): 1,\n", + " ('kota', '.', ''): 1,\n", + " ('', 'Basia', 'ma'): 1,\n", + " ('Basia', 'ma', 'psa'): 1,\n", + " ('ma', 'psa', '.'): 1,\n", + " ('psa', '.', ''): 1,\n", + " ('', 'Gdzie', 'mieszkasz'): 1,\n", + " ('Gdzie', 'mieszkasz', '?'): 1,\n", + " ('mieszkasz', '?', ''): 1}}" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gather_counts(1, 3, [\"Ala ma kota.\", 'Basia ma psa.', 'Gdzie mieszkasz?'])" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "model = gather_counts(1, 4, [\"Ala ma kota.\", 'Basia ma psa.', 'Hej, gdzie teraz mieszkasz?'], splitter=lambda s: add_markers(by_syntok(s)))" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model[2][(' ma', ' kota')]" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{('',): 3,\n", + " ('Ala',): 1,\n", + " (' ma',): 2,\n", + " (' kota',): 1,\n", + " ('.',): 2,\n", + " ('',): 3,\n", + " ('Basia',): 1,\n", + " (' psa',): 1,\n", + " ('Hej',): 1,\n", + " (',',): 1,\n", + " (' gdzie',): 1,\n", + " (' teraz',): 1,\n", + " (' mieszkasz',): 1,\n", + " ('?',): 1}" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "model[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "shakespeare=(s.strip() for s in open('100-0.txt') if re.search(r'\\S', s))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + " at 0x7f7e5dfe1ba0>" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "shakespeare" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\ufeffProject Gutenberg’s The Complete Works of William Shakespeare, by William Shakespeare'" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(shakespeare)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'This eBook is for the use of anyone anywhere in the United States and'" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(shakespeare)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'most other parts of the world at no cost and with almost no restrictions'" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(shakespeare)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "sh_model = gather_counts(1, 3, shakespeare)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "877" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sh_model[2][('to', 'be')]" + ] + }, + { + 
"cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "57" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sh_model[2][('be', 'to')]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sh_model[1][('Poland',)]" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2283" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sh_model[1][('love',)]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "92615" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sh_model[1][(',',)]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{(): 1545199}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sh_model[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(',', 'my', 'lord')" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sorted(sh_model[3].keys(), key=lambda k: sh_model[3][k])[-5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Simple n-gram model\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Word sequence: $(w_1,...,w_N)$ and model $M$\n", + "We'd like to have $P_M(w_1,...,w_N)$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "$P(w_1,...,w_N) = P(w_1)P(w_2|w_1)P(w_3|w_1 w_2)\\ldots P(w_i|w_1 w_2 \\ldots w_{i-1}) \\ldots P(w_N|w_1 w_2 \\ldots w_{N-1})$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Assumption: probability of a word depends on a limited context" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(Approximation, not true) \"Piotr, co mieszka w tym dużym zielonym budynku, kupił samochód.\" vs \"\"Anna, co mieszka w tym dużym zielonym budynku, kupiła samochód.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$P(w_1,...,w_N) \\approx P(w_1)P(w_2|w_1)P(w_3|w_1 w_2)\\ldots P(w_i|w_{i-(n-1)} \\ldots w_{i-1}) \\ldots P(w_N|w_{N-(i-1)} \\ldots w_{N-1})$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "unigram model $P(w_1,...,w_N) \\approx P(w_1)\\ldots P(w_N) = \\prod P(w_i)$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "bigram model $P(w_1,...,w_N) \\appr('',)ox \\prod P(w_i|w_{i-1})$" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "from math import log, exp\n", + "\n", + "def get_prob_simple(model, n, sentence):\n", + " logprob_total = 0\n", + " for i in range(0, len(sentence)-n+1):\n", + " ngram = tuple(sentence[i:i+n])\n", + " pre_ngram = tuple(sentence[i:i+n-1])\n", + " prob = model[n].get(ngram, 0) / model[n-1].get(pre_ngram, 0)\n", + " logprob_total += log(prob)\n", + " return logprob_total \n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$\\log(ab) = \\log a + \\log b$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$\\log \\prod P(w_i) = \\sum \\log P(w_i)$" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7.128462813174801e-07" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "exp(get_prob_simple(sh_model, 2, add_markers(into_words('I love thee.'))))" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.585040690529112e-11" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exp(get_prob_simple(sh_model, 1, add_markers(into_words('I love you.'))))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Smoothing" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "def prob(count, total, nb_classes):\n", + " return count / total" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prob(3, 3, 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [], + "source": [ + "def laplace(count, total, nb_classes, alpha=1.0):\n", + " return (count + alpha) / (total + nb_classes)" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.4" + ] + }, + "execution_count": 117, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "laplace(1, 3, 2)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Smoothing in n-gram models" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [], + "source": [ + "def get_prob_smoothed(model, n, sentence):\n", + " vocabulary_size = len(model[1])\n", + " \n", + " logprob_total = 0\n", + " for i in range(0, len(sentence)-n+1):\n", + " ngram = tuple(sentence[i:i+n])\n", + " pre_ngram = tuple(sentence[i:i+n-1])\n", + " prob = laplace(model[n].get(ngram, 0), 
model[n-1].get(pre_ngram, 0), vocabulary_size)\n", + " logprob_total += log(prob)\n", + " return logprob_total " + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3.843912914870102e-16" + ] + }, + "execution_count": 122, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exp(get_prob_smoothed(sh_model, 1, add_markers(into_words('Love I Czechia.'))))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/src/Untitled.ipynb b/src/Untitled.ipynb new file mode 100644 index 0000000..cf364d2 --- /dev/null +++ b/src/Untitled.ipynb @@ -0,0 +1,147 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "from math import log, exp\n", + "import pickle" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "def tokenize(segment):\n", + " date_begin, date_end, l_context, r_context, text = segment.rstrip('\\n').split('\\t') \n", + " return text\n", + "\n", + "def into_words(sentence):\n", + " return sentence.split(' ')#re.findall(r'\\p{P}|[^\\p{P}\\s]+', sentence)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "def add_markers(tokens):\n", + " return [''] + tokens + ['']" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + 
"def get_prob_smoothed(model, n, sentence):\n", + " vocabulary_size = len(model[1])\n", + " \n", + " logprob_total = 0\n", + " for i in range(0, len(sentence)-n+1):\n", + " ngram = tuple(sentence[i:i+n])\n", + " pre_ngram = tuple(sentence[i:i+n-1])\n", + " prob = laplace(model[n].get(ngram, 0), model[n-1].get(pre_ngram, 0), vocabulary_size)\n", + " logprob_total += log(prob)\n", + " return logprob_total " + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "def gather_counts(from_n, to_n, sentences):\n", + " counts = {}\n", + " counts[0] = {(): 0}\n", + " for sentence in sentences:\n", + " tokens = add_markers(into_words(sentence))\n", + " ntokens = len(tokens)\n", + " counts[0][()] += ntokens\n", + " for n in range(from_n, to_n+1):\n", + " for i in range(0, ntokens-n+1):\n", + " ngram = tuple(tokens[i:i+n])\n", + " if n not in counts:\n", + " counts[n] = {}\n", + " \n", + " if ngram in counts[n]:\n", + " counts[n][ngram] += 1\n", + " else: \n", + " counts[n][ngram] = 1\n", + " return counts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "segments = []\n", + "with open('../train/train.tsv', encoding='utf-8') as file:\n", + " for line in file:\n", + " segments.append(tokenize(line))" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "ename": "MemoryError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mMemoryError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgather_counts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m4\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0msegments\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36mgather_counts\u001b[0;34m(from_n, to_n, sentences)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mcounts\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mngram\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mcounts\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mngram\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcounts\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mMemoryError\u001b[0m: " + ] + } + ], + "source": [ + "model = gather_counts(3, 4, segments)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! 
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.2" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/src/create_dictionary.py b/src/create_dictionary.py new file mode 100644 index 0000000..643c172 --- /dev/null +++ b/src/create_dictionary.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 + +import sys, pickle +from math import exp, log + +def add_markers(tokens): + return [''] + tokens + [''] + +def into_words(sentence): + a = sentence.split(' ') + return a + +def gather_counts(from_n, to_n, sentences): + counts = {} + counts[0] = {() : 0} + for sentence in sentences: + tokens = add_markers(into_words(sentence)) + ntokens = len(tokens) + counts[0][()] += ntokens + + for n in range(from_n, to_n+1): + for i in range(0, ntokens-n+1): + ngram = tuple(tokens[i:i+n]) + if n not in counts: + counts[n] = {} + if ngram in counts[n]: + counts[n][ngram] += 1 + else: + counts[n][ngram] = 1 + return counts + +def tokenize (segment): + d, dd, l, r, text = segment.rstrip('\n').split('\t') + return text + + +sen = [] +with open(sys.argv[1]) as file: + for line in file: + ss = tokenize(line) + sen.append(ss) + +model_file = sys.argv[2] +model = gather_counts(3,3,sen) +with open(model_file, 'wb+') as p: + pickle.dump(model, p, pickle.HIGHEST_PROTOCOL) diff --git a/src/functions.py b/src/functions.py new file mode 100644 index 0000000..d685270 --- /dev/null +++ b/src/functions.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python +import sys +import re +from math import log, exp +import pickle + + +def add_markers(tokens): + return [''] + tokens + [''] + +def into_words(sentence): + return sentence.split(' ')#re.findall(r'\p{P}|[^\p{P}\s]+', sentence) + +def 
gather_counts(from_n, to_n, sentences): + for sentence in sentences: + tokens = add_markers(into_words(sentence)) + ntokens = len(tokens) + counts[0][()] += ntokens + for n in range(from_n, to_n+1): + for i in range(0, ntokens-n+1): + ngram = tuple(tokens[i:i+n]) + if n not in counts: + counts[n] = {} + if ngram in counts[n]: + counts[n][ngram] += 1 + else: + counts[n][ngram] = 1 + +def get_prob_smoothed(model, n, sentence): + vocabulary_size = len(model[1]) + + logprob_total = 0 + for i in range(0, len(sentence)-n+1): + ngram = tuple(sentence[i:i+n]) + pre_ngram = tuple(sentence[i:i+n-1]) + prob = laplace(model[n].get(ngram, 0), model[n-1].get(pre_ngram, 0), vocabulary_size) + logprob_total += log(prob) + return logprob_total + + + +def tokenize(segment): + date_begin, date_end, l_context, r_context, text = segment.rstrip('\n').split('\t') + return text + + +counts = {} +counts[0] = {(): 0} + +for line in sys.stdin: + s = tokenize(line) + gather_counts(s) +pickle.dump(counts, open('model.pickle', 'wb+')) diff --git a/src/functions.py.backup b/src/functions.py.backup new file mode 100644 index 0000000..ff57cf4 --- /dev/null +++ b/src/functions.py.backup @@ -0,0 +1,54 @@ +#!/usr/bin/env python +import sys +import re +from math import log, exp +import pickle + + +def add_markers(tokens): + return [''] + tokens + [''] + +def into_words(sentence): + return sentence.split(' ')#re.findall(r'\p{P}|[^\p{P}\s]+', sentence) + +def gather_counts(from_n, to_n, sentences): + + for sentence in sentences: + tokens = add_markers(into_words(sentence)) + ntokens = len(tokens) + counts[0][()] += ntokens + for n in range(from_n, to_n+1): + for i in range(0, ntokens-n+1): + ngram = tuple(tokens[i:i+n]) + if n not in counts: + counts[n] = {} + if ngram in counts[n]: + counts[n][ngram] += 1 + else: + counts[n][ngram] = 1 + +def get_prob_smoothed(model, n, sentence): + vocabulary_size = len(model[1]) + + logprob_total = 0 + for i in range(0, len(sentence)-n+1): + ngram = 
#!/usr/bin/env python
"""Predict the next word by maximizing an add-alpha smoothed n-gram score.

Usage: logprobs_and_predict.py MODEL UNIGRAM TRIGRAM INPUT_TSV OUTPUT

Fixes over the previous revision:
- get_log_prob hard-coded trigram[3] as the context order; it now uses
  n - 1, which agrees with the only call site (n = 4) and generalizes.
- a local variable named `prob` shadowed the prob() function.
- the argv check used a bare quit() at import time; moved under main().
"""

import sys
import pickle
from math import log


def laplace(count, total, nb_classes, alpha=1.0):
    """Add-alpha smoothed probability estimate."""
    return (count + alpha) / (total + nb_classes)


def prob(count, total, nb_classes):
    """Unsmoothed maximum-likelihood estimate (kept for API parity)."""
    return count / total


def into_words(sentence):
    """Naive whitespace tokenizer: split on single spaces."""
    return sentence.split(' ')


def get_log_prob(model, trigram, n, sentence, vocabulary_size=None):
    """Smoothed log-probability of *sentence* (token list) under an
    order-n *model*, with (n-1)-gram context counts from *trigram*.

    vocabulary_size defaults to the globally loaded unigram model's
    vocabulary, preserving the original script's behavior when run as
    a program; pass it explicitly for standalone use.
    """
    if vocabulary_size is None:
        vocabulary_size = len(model_unigram[1])
    logprob_total = 0
    for i in range(len(sentence) - n + 1):
        ngram = tuple(sentence[i:i + n])
        pre_ngram = tuple(sentence[i:i + n - 1])
        # was hard-coded trigram[3]; n - 1 keeps the context order tied to n
        p = laplace(model[n].get(ngram, 0),
                    trigram[n - 1].get(pre_ngram, 0),
                    vocabulary_size)
        logprob_total += log(p)
    return logprob_total


def get_last(sentence):
    """Extract the text column; assumes 4 tab-separated fields
    (year_start, year_end, text, rest) — TODO confirm against the
    dev-set format, which differs from the 5-column training TSV."""
    year_s, year_e, text, text_rest = sentence.rstrip('\n').split('\t')
    return text


def main():
    global model_unigram
    if len(sys.argv) != 6:
        sys.exit("usage: logprobs_and_predict.py MODEL UNIGRAM TRIGRAM INPUT OUTPUT")

    with open(sys.argv[1], 'rb') as f:
        model = pickle.load(f)
    with open(sys.argv[2], 'rb') as f:
        model_unigram = pickle.load(f)
    vocab = [key[0] for key in model_unigram[1]]
    with open(sys.argv[3], 'rb') as f:
        model_trigram = pickle.load(f)

    with open(sys.argv[4]) as file, open(sys.argv[5], 'w') as out:
        for line in file:
            context = into_words(get_last(line))[-3:]
            best_word, best_score = "", float('-inf')
            for word in vocab:
                score = get_log_prob(model, model_trigram, 4, context + [word])
                if score > best_score:
                    best_word, best_score = word, score
            out.write(best_word + "\n")


if __name__ == '__main__':
    main()
open(sys.argv[4]) as file, open(sys.argv[5], 'w+') as out: + for line in file: + + text = into_words(get_last(line))[-3:] + best_word = ("", -1000000) + for word in vocab: + filled = text + [word] + #import ipdb; ipdb.set_trace() + value = get_log_prob(model, model_trigram, 4, filled) + + if value > best_word[1]: + best_word = (word, value) + out.write(best_word[0] + "\n") diff --git a/src/out b/src/out new file mode 100644 index 0000000..e69de29 diff --git a/src/test_dev_0 b/src/test_dev_0 new file mode 100644 index 0000000..5227f9f --- /dev/null +++ b/src/test_dev_0 @@ -0,0 +1 @@ +1874 1874.99999996829 tez wiecznym pokojem się ciosIi } ' . " - ' Poniewoi zaś musicie storać się oto , aby groi , ąoym niebezpiec : Leńst om Waslą władzą , roztropn { tścią i gorliwością zapobieas , przeto uZDacie , ie nic nie będzie stósowniejszcgo i poiyteczniejszego jsk S5Ułmć we wspólnej noradzie właściwych dróg , aby po ądany cel tern powniej i skuteczniej osiągnęć . Skoro prawa Kościoła są nnrU ! Jioue , obowiązkiem jest Waszym f.l ' onić wiernych ; tern bezpiecllłiej lią aś będzie osłona i tem siluiejszą obrona , im .vgodniej i ląc żnićj usiłowania pojedyńcle dtiolać będę , i im gQrliwiej obmyśhme i oznac ; ; ; olle p08t powal1ie , poło eniem rzeczy nakaz8no . Dla tego u , pomimmry Was , abyście jak mi : < ł.na najbardziej zebrali się i po wspólnej naradzie naznaczony paw " , i przeli Y ' szysłkich Pr.l ) jętą modłę , według której , jak tego iVas ! ll uuąd wymaga , jednozgodnie grozące _ le tłumili i wolności Kościoła sili ' iie bronili . Dia tego illJsieliśmy Was upomnieć , iiby się nie zda- \ \ ' rało , ie w t k " , ' dncj sprawie obowiązku NosJicge Iinniodbaliśill 1 ' . _ ' tlbowiom przekonani jesteśmy , ie- ) ; , yścio i bez tego NasEego upomnienia to uczynili . Nie nzekliśmy się takie jesloze nadzioi , ie Bóg odwróci Istniejące złe , gdył zagrzewa Nos del ) rą nadł : ieją prsywiązanie i wiaro Nł ! szego nojukochań- Siego syna w Chr } stusie , Cf ' sarZ ! 
1 i królt ' Franciszka Józefa ; ktorego w ponownym liście z dnia dJ : i- eiejszego J : tego powodu zuklinaliśm ) r , oby nigdy nie dO.lwom , by w jego rozległem pnństwie KościGI poddnn , .. Ioostał han ! e mej nieV \ \ ' cli , a jego poddani kat.oliccy n3jwięka ym uciskom . Gdy atoli wielu uderze na Kościół b wszelka 1i ' J \ \ : ! oka nuder nif \ \ bezpieunę , ł ' ueto Wy JUijmniej moieoie trwać w nieoJ . ) ' -nnoici . Oby Bóg kierował Wn ! łemi pos ' anowienian , i i ' " sf ! iernł Wes swoj , potęiną or ; iekę , iibyśde zdJłali sJicJif2śliwie postallowić i } JrJiywieść do sl ; ułku , ce IJa chWflłę Jego Imienia i dla zbawienia dusz słuiy . Na znak tej Boskiej opieki i Nt1slicj S.l : ł ególnej pn ; ych ) JnoEci udJiielamy WatID wszyt ! iim i II ( } sobna kałdemu , ulmchoni Syn ( , Vlie i cxdg ( JdlJi BJ8cin , rreJ : vucho \ \ ' \ \ ieńslwu J wiern } m Waflzej opiece- powierzonym , miłościwe Nf S1 : e błogosławieństwo r-posłoJsllie . Dfn w Rzymie u św. Piotra. dnia 7 mar a 1874 , 28 pont } ' fikatu NVSJegci . ( } o się w tygodniu naj ' wułżniejszego stało " J. na 8wie ie . Niemcy . W.Berlinie obradowaTttj w sejmie oprócz o innych mniejszej wagi sprawach , o pra-wie prasowym , ł. j. o prawie tycz c1 ' m si go- , .et , pism f ' rukQwanycb , księ2e ; . i. t. d . Podowie bn \ \ \ \ arsc ) ' poslaH ndref do swego króla , w którym go proszą aby oparł się prawom nowomodnym w rle " liach religijnych i politycznych , które moją jeszcze być .uprowadzone w zjednonon ) " m państwie ni.emieckim. Król który w ogóle rządem muło się I : ojmuje , oddał pismo posłów ministrom . Wysłańcy b . ! warscy w bundesr .cie gło sowali za pr £ \ \ wem o uwięlirmiu i wypęda ; aniu II kraju , Nskupów i księiy , które to prawo wnet sejm lwi rzesz ) ' niemieckiej przedłoionym zostanie , gdzie naturalnie p % ejdliie . 
#!/usr/bin/env python
"""Build a 4-gram count model from a TSV corpus and pickle it.

Usage: zad.py TRAIN_TSV MODEL_PICKLE
Near-duplicate of create_dictionary.py, kept consistent with it; only
the n-gram order (4) differs.
"""

import sys
import pickle


def add_markers(tokens):
    """Surround a token list with empty-string sentence-boundary markers."""
    return [''] + tokens + ['']


def into_words(sentence):
    """Naive whitespace tokenizer: split on single spaces."""
    return sentence.split(' ')


def gather_counts(from_n, to_n, sentences):
    """Count n-grams of orders from_n..to_n over *sentences*.

    Returns a dict mapping order n -> {ngram tuple: count}; order 0 maps
    the empty tuple to the total token count.
    """
    counts = {0: {(): 0}}
    for sentence in sentences:
        tokens = add_markers(into_words(sentence))
        ntokens = len(tokens)
        counts[0][()] += ntokens
        for n in range(from_n, to_n + 1):
            order = counts.setdefault(n, {})
            for i in range(ntokens - n + 1):
                ngram = tuple(tokens[i:i + n])
                order[ngram] = order.get(ngram, 0) + 1
    return counts


def tokenize(segment):
    """Extract the text column from a 5-column TSV line."""
    _, _, _, _, text = segment.rstrip('\n').split('\t')
    return text


def main():
    sentences = []
    # encoding pinned: the corpus is UTF-8 regardless of platform default
    with open(sys.argv[1], encoding='utf-8') as file:
        for line in file:
            sentences.append(tokenize(line))

    model = gather_counts(4, 4, sentences)
    with open(sys.argv[2], 'wb') as p:
        pickle.dump(model, p, pickle.HIGHEST_PROTOCOL)


if __name__ == '__main__':
    main()
')#re.findall(r'\\p{P}|[^\\p{P}\\s]+', sentence)\n", + "\n", + "def into_characters(sentence):\n", + " return list(sentence)\n", + "\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Z',\n", + " 'a',\n", + " 'ż',\n", + " 'ó',\n", + " 'ł',\n", + " 'ć',\n", + " ' ',\n", + " 'j',\n", + " 'a',\n", + " 'ź',\n", + " 'n',\n", + " 'i',\n", + " 'ą',\n", + " ' ',\n", + " 'g',\n", + " 'ę',\n", + " 'ś',\n", + " 'l',\n", + " '.']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "into_characters(\"Zażółć jaźnią gęśl.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Ala', 'has', 'a', 'cat', 'and', 'a', 'dog', '.']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "into_words(\"Ala has a cat and a dog.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Humpty', '-', 'dumpty', '3s', ',', 'eg', '.', 'problems', '.']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "into_words(\"Humpty-dumpty 3s, eg. 
problems.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Adam',\n", + " ',',\n", + " 'who',\n", + " 'smokes',\n", + " 'a',\n", + " 'lot',\n", + " ',',\n", + " 'caught',\n", + " 'COVID',\n", + " '-',\n", + " '19',\n", + " '.']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "into_words(\"Adam, who smokes a lot, caught COVID-19.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['A', 'l', 'a', ' ', 'h', 'a', 's', ' ', 'a', ' ', 'c', 'a', 't', '.']" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "into_characters(\"Ala has a cat.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "from syntok.tokenizer import Tokenizer\n", + "\n", + "def by_syntok(sentence):\n", + " tok = Tokenizer()\n", + " return [str(t) for t in tok.tokenize(sentence)]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Humpty',\n", + " '-dumpty',\n", + " ' and',\n", + " ' Alice',\n", + " ' has',\n", + " ' pets',\n", + " ' e.g',\n", + " '.',\n", + " ' dogs',\n", + " '!',\n", + " '!',\n", + " '!',\n", + " '!',\n", + " '!']" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "by_syntok(\"Humpty-dumpty and Alice has pets e.g. 
dogs!!!!!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def add_markers(tokens):\n", + " return [''] + tokens + ['']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['', 'This', 'is', 'a', 'black', 'cat', '.', '']" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "add_markers(into_words('This is a black cat.'))" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['', 'Humpty', '-dumpty', ' jumped', '.', '']" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "add_markers(by_syntok(\"Humpty-dumpty jumped.\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Gathering simple counts" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "def gather_counts(from_n, to_n, sentences, splitter=lambda s: add_markers(into_words(s))):\n", + " counts = {}\n", + " counts[0] = {(): 0}\n", + " for sentence in sentences:\n", + " tokens = splitter(sentence)\n", + " ntokens = len(tokens)\n", + " counts[0][()] += ntokens\n", + " for n in range(from_n, to_n+1):\n", + " for i in range(0, ntokens-n+1):\n", + " ngram = tuple(tokens[i:i+n])\n", + " if n not in counts:\n", + " counts[n] = {}\n", + " \n", + " if ngram in counts[n]:\n", + " counts[n][ngram] += 1\n", + " else: \n", + " counts[n][ngram] = 1\n", + " return counts" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: {(): 17},\n", + " 1: {('',): 3,\n", + " ('Ala',): 1,\n", + " ('ma',): 2,\n", + " ('kota',): 1,\n", + " ('.',): 2,\n", + " ('',): 3,\n", + " ('Basia',): 1,\n", + " ('psa',): 1,\n", + " ('Gdzie',): 
1,\n", + " ('mieszkasz',): 1,\n", + " ('?',): 1},\n", + " 2: {('', 'Ala'): 1,\n", + " ('Ala', 'ma'): 1,\n", + " ('ma', 'kota'): 1,\n", + " ('kota', '.'): 1,\n", + " ('.', ''): 2,\n", + " ('', 'Basia'): 1,\n", + " ('Basia', 'ma'): 1,\n", + " ('ma', 'psa'): 1,\n", + " ('psa', '.'): 1,\n", + " ('', 'Gdzie'): 1,\n", + " ('Gdzie', 'mieszkasz'): 1,\n", + " ('mieszkasz', '?'): 1,\n", + " ('?', ''): 1},\n", + " 3: {('', 'Ala', 'ma'): 1,\n", + " ('Ala', 'ma', 'kota'): 1,\n", + " ('ma', 'kota', '.'): 1,\n", + " ('kota', '.', ''): 1,\n", + " ('', 'Basia', 'ma'): 1,\n", + " ('Basia', 'ma', 'psa'): 1,\n", + " ('ma', 'psa', '.'): 1,\n", + " ('psa', '.', ''): 1,\n", + " ('', 'Gdzie', 'mieszkasz'): 1,\n", + " ('Gdzie', 'mieszkasz', '?'): 1,\n", + " ('mieszkasz', '?', ''): 1}}" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gather_counts(1, 3, [\"Ala ma kota.\", 'Basia ma psa.', 'Gdzie mieszkasz?'])" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "model = gather_counts(1, 4, [\"Ala ma kota.\", 'Basia ma psa.', 'Hej, gdzie teraz mieszkasz?'], splitter=lambda s: add_markers(by_syntok(s)))" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model[2][(' ma', ' kota')]" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{('',): 3,\n", + " ('Ala',): 1,\n", + " (' ma',): 2,\n", + " (' kota',): 1,\n", + " ('.',): 2,\n", + " ('',): 3,\n", + " ('Basia',): 1,\n", + " (' psa',): 1,\n", + " ('Hej',): 1,\n", + " (',',): 1,\n", + " (' gdzie',): 1,\n", + " (' teraz',): 1,\n", + " (' mieszkasz',): 1,\n", + " ('?',): 1}" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "model[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "shakespeare=(s.strip() for s in open('100-0.txt') if re.search(r'\\S', s))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + " at 0x7f7e5dfe1ba0>" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "shakespeare" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\ufeffProject Gutenberg’s The Complete Works of William Shakespeare, by William Shakespeare'" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(shakespeare)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'This eBook is for the use of anyone anywhere in the United States and'" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(shakespeare)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'most other parts of the world at no cost and with almost no restrictions'" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(shakespeare)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "sh_model = gather_counts(1, 3, shakespeare)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "877" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sh_model[2][('to', 'be')]" + ] + }, + { + 
"cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "57" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sh_model[2][('be', 'to')]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sh_model[1][('Poland',)]" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2283" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sh_model[1][('love',)]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "92615" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sh_model[1][(',',)]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{(): 1545199}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sh_model[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(',', 'my', 'lord')" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sorted(sh_model[3].keys(), key=lambda k: sh_model[3][k])[-5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Simple n-gram model\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Word sequence: $(w_1,...,w_N)$ and model $M$\n", + "We'd like to have $P_M(w_1,...,w_N)$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "$P(w_1,...,w_N) = P(w_1)P(w_2|w_1)P(w_3|w_1 w_2)\\ldots P(w_i|w_1 w_2 \\ldots w_{i-1}) \\ldots P(w_N|w_1 w_2 \\ldots w_{N-1})$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Assumption: probability of a word depends on a limited context" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(Approximation, not true) \"Piotr, co mieszka w tym dużym zielonym budynku, kupił samochód.\" vs \"\"Anna, co mieszka w tym dużym zielonym budynku, kupiła samochód.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$P(w_1,...,w_N) \\approx P(w_1)P(w_2|w_1)P(w_3|w_1 w_2)\\ldots P(w_i|w_{i-(n-1)} \\ldots w_{i-1}) \\ldots P(w_N|w_{N-(i-1)} \\ldots w_{N-1})$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "unigram model $P(w_1,...,w_N) \\approx P(w_1)\\ldots P(w_N) = \\prod P(w_i)$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "bigram model $P(w_1,...,w_N) \\appr('',)ox \\prod P(w_i|w_{i-1})$" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "from math import log, exp\n", + "\n", + "def get_prob_simple(model, n, sentence):\n", + " logprob_total = 0\n", + " for i in range(0, len(sentence)-n+1):\n", + " ngram = tuple(sentence[i:i+n])\n", + " pre_ngram = tuple(sentence[i:i+n-1])\n", + " prob = model[n].get(ngram, 0) / model[n-1].get(pre_ngram, 0)\n", + " logprob_total += log(prob)\n", + " return logprob_total \n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$\\log(ab) = \\log a + \\log b$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$\\log \\prod P(w_i) = \\sum \\log P(w_i)$" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7.128462813174801e-07" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "exp(get_prob_simple(sh_model, 2, add_markers(into_words('I love thee.'))))" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.585040690529112e-11" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exp(get_prob_simple(sh_model, 1, add_markers(into_words('I love you.'))))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Smoothing" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "def prob(count, total, nb_classes):\n", + " return count / total" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prob(3, 3, 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [], + "source": [ + "def laplace(count, total, nb_classes, alpha=1.0):\n", + " return (count + alpha) / (total + nb_classes)" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.4" + ] + }, + "execution_count": 117, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "laplace(1, 3, 2)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Smoothing in n-gram models" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [], + "source": [ + "def get_prob_smoothed(model, n, sentence):\n", + " vocabulary_size = len(model[1])\n", + " \n", + " logprob_total = 0\n", + " for i in range(0, len(sentence)-n+1):\n", + " ngram = tuple(sentence[i:i+n])\n", + " pre_ngram = tuple(sentence[i:i+n-1])\n", + " prob = laplace(model[n].get(ngram, 0), 
model[n-1].get(pre_ngram, 0), vocabulary_size)\n", + " logprob_total += log(prob)\n", + " return logprob_total " + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3.843912914870102e-16" + ] + }, + "execution_count": 122, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exp(get_prob_smoothed(sh_model, 1, add_markers(into_words('Love I Czechia.'))))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['', 'I', 'love', 'thee.', '']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "add_markers(into_words('I love thee.'))\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.2" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}