Partially working version of tetragram
This commit is contained in:
parent
aabe64887d
commit
2e735c936b
6
src/.ipynb_checkpoints/Untitled-checkpoint.ipynb
Normal file
@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 4
}
904
src/.ipynb_checkpoints/zajeciaipynb-checkpoint.ipynb
Normal file
@@ -0,0 +1,904 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import regex as re\n",
"\n",
"def into_words(sentence):\n",
" return re.findall(r'\\p{P}|[^\\p{P}\\s]+', sentence)\n",
"\n",
"def into_characters(sentence):\n",
" return list(sentence)\n",
"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Z',\n",
" 'a',\n",
" 'ż',\n",
" 'ó',\n",
" 'ł',\n",
" 'ć',\n",
" ' ',\n",
" 'j',\n",
" 'a',\n",
" 'ź',\n",
" 'n',\n",
" 'i',\n",
" 'ą',\n",
" ' ',\n",
" 'g',\n",
" 'ę',\n",
" 'ś',\n",
" 'l',\n",
" '.']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"into_characters(\"Zażółć jaźnią gęśl.\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Ala', 'has', 'a', 'cat', 'and', 'a', 'dog', '.']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"into_words(\"Ala has a cat and a dog.\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Humpty', '-', 'dumpty', '3s', ',', 'eg', '.', 'problems', '.']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"into_words(\"Humpty-dumpty 3s, eg. problems.\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Adam',\n",
" ',',\n",
" 'who',\n",
" 'smokes',\n",
" 'a',\n",
" 'lot',\n",
" ',',\n",
" 'caught',\n",
" 'COVID',\n",
" '-',\n",
" '19',\n",
" '.']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"into_words(\"Adam, who smokes a lot, caught COVID-19.\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['A', 'l', 'a', ' ', 'h', 'a', 's', ' ', 'a', ' ', 'c', 'a', 't', '.']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"into_characters(\"Ala has a cat.\")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"from syntok.tokenizer import Tokenizer\n",
"\n",
"def by_syntok(sentence):\n",
" tok = Tokenizer()\n",
" return [str(t) for t in tok.tokenize(sentence)]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Humpty',\n",
" '-dumpty',\n",
" ' and',\n",
" ' Alice',\n",
" ' has',\n",
" ' pets',\n",
" ' e.g',\n",
" '.',\n",
" ' dogs',\n",
" '!',\n",
" '!',\n",
" '!',\n",
" '!',\n",
" '!']"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"by_syntok(\"Humpty-dumpty and Alice has pets e.g. dogs!!!!!\")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"def add_markers(tokens):\n",
" return ['<BOS>'] + tokens + ['<EOS>']\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['<BOS>', 'This', 'is', 'a', 'black', 'cat', '.', '<EOS>']"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"add_markers(into_words('This is a black cat.'))"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['<BOS>', 'Humpty', '-dumpty', ' jumped', '.', '<EOS>']"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"add_markers(by_syntok(\"Humpty-dumpty jumped.\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Gathering simple counts"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"def gather_counts(from_n, to_n, sentences, splitter=lambda s: add_markers(into_words(s))):\n",
" counts = {}\n",
" counts[0] = {(): 0}\n",
" for sentence in sentences:\n",
" tokens = splitter(sentence)\n",
" ntokens = len(tokens)\n",
" counts[0][()] += ntokens\n",
" for n in range(from_n, to_n+1):\n",
" for i in range(0, ntokens-n+1):\n",
" ngram = tuple(tokens[i:i+n])\n",
" if n not in counts:\n",
" counts[n] = {}\n",
" \n",
" if ngram in counts[n]:\n",
" counts[n][ngram] += 1\n",
" else: \n",
" counts[n][ngram] = 1\n",
" return counts"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{0: {(): 17},\n",
" 1: {('<BOS>',): 3,\n",
" ('Ala',): 1,\n",
" ('ma',): 2,\n",
" ('kota',): 1,\n",
" ('.',): 2,\n",
" ('<EOS>',): 3,\n",
" ('Basia',): 1,\n",
" ('psa',): 1,\n",
" ('Gdzie',): 1,\n",
" ('mieszkasz',): 1,\n",
" ('?',): 1},\n",
" 2: {('<BOS>', 'Ala'): 1,\n",
" ('Ala', 'ma'): 1,\n",
" ('ma', 'kota'): 1,\n",
" ('kota', '.'): 1,\n",
" ('.', '<EOS>'): 2,\n",
" ('<BOS>', 'Basia'): 1,\n",
" ('Basia', 'ma'): 1,\n",
" ('ma', 'psa'): 1,\n",
" ('psa', '.'): 1,\n",
" ('<BOS>', 'Gdzie'): 1,\n",
" ('Gdzie', 'mieszkasz'): 1,\n",
" ('mieszkasz', '?'): 1,\n",
" ('?', '<EOS>'): 1},\n",
" 3: {('<BOS>', 'Ala', 'ma'): 1,\n",
" ('Ala', 'ma', 'kota'): 1,\n",
" ('ma', 'kota', '.'): 1,\n",
" ('kota', '.', '<EOS>'): 1,\n",
" ('<BOS>', 'Basia', 'ma'): 1,\n",
" ('Basia', 'ma', 'psa'): 1,\n",
" ('ma', 'psa', '.'): 1,\n",
" ('psa', '.', '<EOS>'): 1,\n",
" ('<BOS>', 'Gdzie', 'mieszkasz'): 1,\n",
" ('Gdzie', 'mieszkasz', '?'): 1,\n",
" ('mieszkasz', '?', '<EOS>'): 1}}"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gather_counts(1, 3, [\"Ala ma kota.\", 'Basia ma psa.', 'Gdzie mieszkasz?'])"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"model = gather_counts(1, 4, [\"Ala ma kota.\", 'Basia ma psa.', 'Hej, gdzie teraz mieszkasz?'], splitter=lambda s: add_markers(by_syntok(s)))"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model[2][(' ma', ' kota')]"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{('<BOS>',): 3,\n",
" ('Ala',): 1,\n",
" (' ma',): 2,\n",
" (' kota',): 1,\n",
" ('.',): 2,\n",
" ('<EOS>',): 3,\n",
" ('Basia',): 1,\n",
" (' psa',): 1,\n",
" ('Hej',): 1,\n",
" (',',): 1,\n",
" (' gdzie',): 1,\n",
" (' teraz',): 1,\n",
" (' mieszkasz',): 1,\n",
" ('?',): 1}"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model[1]"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"shakespeare=(s.strip() for s in open('100-0.txt') if re.search(r'\\S', s))\n"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<generator object <genexpr> at 0x7f7e5dfe1ba0>"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"shakespeare"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'\\ufeffProject Gutenberg’s The Complete Works of William Shakespeare, by William Shakespeare'"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"next(shakespeare)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'This eBook is for the use of anyone anywhere in the United States and'"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"next(shakespeare)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'most other parts of the world at no cost and with almost no restrictions'"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"next(shakespeare)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"sh_model = gather_counts(1, 3, shakespeare)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"877"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sh_model[2][('to', 'be')]"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"57"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sh_model[2][('be', 'to')]\n"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sh_model[1][('Poland',)]"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2283"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sh_model[1][('love',)]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"92615"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sh_model[1][(',',)]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{(): 1545199}"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sh_model[0]"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(',', 'my', 'lord')"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted(sh_model[3].keys(), key=lambda k: sh_model[3][k])[-5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Simple n-gram model\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Word sequence: $(w_1,...,w_N)$ and model $M$\n",
"We'd like to have $P_M(w_1,...,w_N)$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$P(w_1,...,w_N) = P(w_1)P(w_2|w_1)P(w_3|w_1 w_2)\\ldots P(w_i|w_1 w_2 \\ldots w_{i-1}) \\ldots P(w_N|w_1 w_2 \\ldots w_{N-1})$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Assumption: probability of a word depends on a limited context"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"(Approximation, not true) \"Piotr, co mieszka w tym dużym zielonym budynku, kupił samochód.\" vs \"\"Anna, co mieszka w tym dużym zielonym budynku, kupiła samochód.\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$P(w_1,...,w_N) \\approx P(w_1)P(w_2|w_1)P(w_3|w_1 w_2)\\ldots P(w_i|w_{i-(n-1)} \\ldots w_{i-1}) \\ldots P(w_N|w_{N-(i-1)} \\ldots w_{N-1})$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"unigram model $P(w_1,...,w_N) \\approx P(w_1)\\ldots P(w_N) = \\prod P(w_i)$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"bigram model $P(w_1,...,w_N) \\appr('<BOS>',)ox \\prod P(w_i|w_{i-1})$"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
"from math import log, exp\n",
"\n",
"def get_prob_simple(model, n, sentence):\n",
" logprob_total = 0\n",
" for i in range(0, len(sentence)-n+1):\n",
" ngram = tuple(sentence[i:i+n])\n",
" pre_ngram = tuple(sentence[i:i+n-1])\n",
" prob = model[n].get(ngram, 0) / model[n-1].get(pre_ngram, 0)\n",
" logprob_total += log(prob)\n",
" return logprob_total \n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$\\log(ab) = \\log a + \\log b$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$\\log \\prod P(w_i) = \\sum \\log P(w_i)$"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"7.128462813174801e-07"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"exp(get_prob_simple(sh_model, 2, add_markers(into_words('I love thee.'))))"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8.585040690529112e-11"
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"exp(get_prob_simple(sh_model, 1, add_markers(into_words('I love you.'))))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Smoothing"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [],
"source": [
"def prob(count, total, nb_classes):\n",
" return count / total"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.0"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"prob(3, 3, 2)"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [],
"source": [
"def laplace(count, total, nb_classes, alpha=1.0):\n",
" return (count + alpha) / (total + nb_classes)"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.4"
]
},
"execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"laplace(1, 3, 2)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Smoothing in n-gram models"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
"def get_prob_smoothed(model, n, sentence):\n",
" vocabulary_size = len(model[1])\n",
" \n",
" logprob_total = 0\n",
" for i in range(0, len(sentence)-n+1):\n",
" ngram = tuple(sentence[i:i+n])\n",
" pre_ngram = tuple(sentence[i:i+n-1])\n",
" prob = laplace(model[n].get(ngram, 0), model[n-1].get(pre_ngram, 0), vocabulary_size)\n",
" logprob_total += log(prob)\n",
" return logprob_total "
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3.843912914870102e-16"
]
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"exp(get_prob_smoothed(sh_model, 1, add_markers(into_words('Love I Czechia.'))))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
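The markdown cells in this notebook state the chain rule, the Markov approximation and the log-sum identity. The following is a minimal, self-contained sketch (not the notebook's code verbatim) of how those formulas map onto the counts structure that gather_counts builds, using the bigram case with the Laplace smoothing introduced at the end of the notebook; the toy corpus is an assumption chosen to mirror the examples above.

from math import exp, log

def gather_counts(from_n, to_n, sentences):
    # counts[n] maps an n-gram tuple to its frequency; counts[0][()] is the token total.
    counts = {0: {(): 0}}
    for sentence in sentences:
        tokens = ['<BOS>'] + sentence.split(' ') + ['<EOS>']
        counts[0][()] += len(tokens)
        for n in range(from_n, to_n + 1):
            counts.setdefault(n, {})
            for i in range(len(tokens) - n + 1):
                ngram = tuple(tokens[i:i + n])
                counts[n][ngram] = counts[n].get(ngram, 0) + 1
    return counts

def laplace(count, total, nb_classes, alpha=1.0):
    return (count + alpha) / (total + nb_classes)

def bigram_logprob(model, tokens):
    # log P(w_1..w_N) ~ sum_i log P(w_i | w_{i-1}); smoothing keeps unseen bigrams above zero.
    V = len(model[1])
    return sum(log(laplace(model[2].get((a, b), 0), model[1].get((a,), 0), V))
               for a, b in zip(tokens, tokens[1:]))

model = gather_counts(1, 2, ["Ala ma kota .", "Basia ma psa ."])
tokens = ['<BOS>'] + "Ala ma psa .".split(' ') + ['<EOS>']
print(exp(bigram_logprob(model, tokens)))  # small but non-zero probability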
147
src/Untitled.ipynb
Normal file
@@ -0,0 +1,147 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"from math import log, exp\n",
"import pickle"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"def tokenize(segment):\n",
" date_begin, date_end, l_context, r_context, text = segment.rstrip('\\n').split('\\t') \n",
" return text\n",
"\n",
"def into_words(sentence):\n",
" return sentence.split(' ')#re.findall(r'\\p{P}|[^\\p{P}\\s]+', sentence)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"def add_markers(tokens):\n",
" return ['<BOS>'] + tokens + ['<EOS>']"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"def get_prob_smoothed(model, n, sentence):\n",
" vocabulary_size = len(model[1])\n",
" \n",
" logprob_total = 0\n",
" for i in range(0, len(sentence)-n+1):\n",
" ngram = tuple(sentence[i:i+n])\n",
" pre_ngram = tuple(sentence[i:i+n-1])\n",
" prob = laplace(model[n].get(ngram, 0), model[n-1].get(pre_ngram, 0), vocabulary_size)\n",
" logprob_total += log(prob)\n",
" return logprob_total "
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"def gather_counts(from_n, to_n, sentences):\n",
" counts = {}\n",
" counts[0] = {(): 0}\n",
" for sentence in sentences:\n",
" tokens = add_markers(into_words(sentence))\n",
" ntokens = len(tokens)\n",
" counts[0][()] += ntokens\n",
" for n in range(from_n, to_n+1):\n",
" for i in range(0, ntokens-n+1):\n",
" ngram = tuple(tokens[i:i+n])\n",
" if n not in counts:\n",
" counts[n] = {}\n",
" \n",
" if ngram in counts[n]:\n",
" counts[n][ngram] += 1\n",
" else: \n",
" counts[n][ngram] = 1\n",
" return counts"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"segments = []\n",
"with open('../train/train.tsv', encoding='utf-8') as file:\n",
" for line in file:\n",
" segments.append(tokenize(line))"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"ename": "MemoryError",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mMemoryError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-46-de4070661da2>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgather_counts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m4\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msegments\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m<ipython-input-44-b19e9df03672>\u001b[0m in \u001b[0;36mgather_counts\u001b[0;34m(from_n, to_n, sentences)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mcounts\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mngram\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mcounts\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mngram\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcounts\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mMemoryError\u001b[0m: "
]
}
],
"source": [
"model = gather_counts(3, 4, segments)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
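The MemoryError recorded above comes from keeping every distinct 3- and 4-gram of the training corpus in plain dicts of tuples. One possible mitigation, sketched below under stated assumptions (the min_count threshold is an assumption; the last-column text field and the corpus path mirror tokenize() above), is to count one order at a time with collections.Counter and apply a count cutoff before pickling. Dropping rare n-grams is lossy, but it usually shrinks the tables drastically.

import pickle
from collections import Counter

def count_order(n, path, min_count=2):
    # One order at a time keeps peak memory to a single Counter.
    counts = Counter()
    with open(path, encoding='utf-8') as f:
        for line in f:
            text = line.rstrip('\n').split('\t')[-1]
            tokens = ['<BOS>'] + text.split(' ') + ['<EOS>']
            counts.update(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))
    # Count cutoff: discard rare n-grams (lossy; smoothing has to absorb the lost mass).
    return {g: c for g, c in counts.items() if c >= min_count}

model = {n: count_order(n, '../train/train.tsv') for n in (3, 4)}
with open('model.pickle', 'wb') as p:
    pickle.dump(model, p, pickle.HIGHEST_PROTOCOL)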
46
src/create_dictionary.py
Normal file
@@ -0,0 +1,46 @@
#!/usr/bin/env python3

import sys, pickle
from math import exp, log

def add_markers(tokens):
    return ['<BOS>'] + tokens + ['<EOS>']

def into_words(sentence):
    a = sentence.split(' ')
    return a

def gather_counts(from_n, to_n, sentences):
    counts = {}
    counts[0] = {() : 0}
    for sentence in sentences:
        tokens = add_markers(into_words(sentence))
        ntokens = len(tokens)
        counts[0][()] += ntokens

        for n in range(from_n, to_n+1):
            for i in range(0, ntokens-n+1):
                ngram = tuple(tokens[i:i+n])
                if n not in counts:
                    counts[n] = {}
                if ngram in counts[n]:
                    counts[n][ngram] += 1
                else:
                    counts[n][ngram] = 1
    return counts

def tokenize(segment):
    d, dd, l, r, text = segment.rstrip('\n').split('\t')
    return text


sen = []
with open(sys.argv[1]) as file:
    for line in file:
        ss = tokenize(line)
        sen.append(ss)

model_file = sys.argv[2]
model = gather_counts(3, 3, sen)
with open(model_file, 'wb+') as p:
    pickle.dump(model, p, pickle.HIGHEST_PROTOCOL)
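The pickled model is just the nested dict returned by gather_counts (here with keys 0 and 3), so it can be inspected directly; a quick sketch, assuming the output file was named trigram.pkl:

import pickle

with open('trigram.pkl', 'rb') as f:
    model = pickle.load(f)

print(model[0][()])                      # total number of tokens seen
print(model[3].get(('a', 'b', 'c'), 0))  # count of one trigram, 0 if unseen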
53
src/functions.py
Normal file
@@ -0,0 +1,53 @@
#!/usr/bin/env python
import sys
import re
from math import log, exp
import pickle


def add_markers(tokens):
    return ['<BOS>'] + tokens + ['<EOS>']

def into_words(sentence):
    return sentence.split(' ')#re.findall(r'\p{P}|[^\p{P}\s]+', sentence)

def laplace(count, total, nb_classes, alpha=1.0):
    # Needed by get_prob_smoothed below; same definition as in logprobs_and_predict.py.
    return (count + alpha) / (total + nb_classes)

def gather_counts(from_n, to_n, sentences):
    # Accumulates into the global `counts` dict initialised below; no return value.
    for sentence in sentences:
        tokens = add_markers(into_words(sentence))
        ntokens = len(tokens)
        counts[0][()] += ntokens
        for n in range(from_n, to_n+1):
            for i in range(0, ntokens-n+1):
                ngram = tuple(tokens[i:i+n])
                if n not in counts:
                    counts[n] = {}
                if ngram in counts[n]:
                    counts[n][ngram] += 1
                else:
                    counts[n][ngram] = 1

def get_prob_smoothed(model, n, sentence):
    vocabulary_size = len(model[1])

    logprob_total = 0
    for i in range(0, len(sentence)-n+1):
        ngram = tuple(sentence[i:i+n])
        pre_ngram = tuple(sentence[i:i+n-1])
        prob = laplace(model[n].get(ngram, 0), model[n-1].get(pre_ngram, 0), vocabulary_size)
        logprob_total += log(prob)
    return logprob_total


def tokenize(segment):
    date_begin, date_end, l_context, r_context, text = segment.rstrip('\n').split('\t')
    return text


counts = {}
counts[0] = {(): 0}

for line in sys.stdin:
    s = tokenize(line)
    # Was gather_counts(s), which doesn't match the signature; orders 1-3 is an assumed choice.
    gather_counts(1, 3, [s])
pickle.dump(counts, open('model.pickle', 'wb+'))
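functions.py streams the corpus from standard input and writes model.pickle to the working directory, so a plausible invocation is ./functions.py < train.tsv. Note that get_prob_smoothed is defined here but never called by the script itself.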
54
src/functions.py.backup
Normal file
@@ -0,0 +1,54 @@
#!/usr/bin/env python
import sys
import re
from math import log, exp
import pickle


def add_markers(tokens):
    return ['<BOS>'] + tokens + ['<EOS>']

def into_words(sentence):
    return sentence.split(' ')#re.findall(r'\p{P}|[^\p{P}\s]+', sentence)

def gather_counts(from_n, to_n, sentences):

    for sentence in sentences:
        tokens = add_markers(into_words(sentence))
        ntokens = len(tokens)
        counts[0][()] += ntokens
        for n in range(from_n, to_n+1):
            for i in range(0, ntokens-n+1):
                ngram = tuple(tokens[i:i+n])
                if n not in counts:
                    counts[n] = {}
                if ngram in counts[n]:
                    counts[n][ngram] += 1
                else:
                    counts[n][ngram] = 1

def get_prob_smoothed(model, n, sentence):
    vocabulary_size = len(model[1])

    logprob_total = 0
    for i in range(0, len(sentence)-n+1):
        ngram = tuple(sentence[i:i+n])
        pre_ngram = tuple(sentence[i:i+n-1])
        prob = laplace(model[n].get(ngram, 0), model[n-1].get(pre_ngram, 0), vocabulary_size)
        logprob_total += log(prob)
    return logprob_total



def tokenize(segment):
    date_begin, date_end, l_context, r_context, text = segment.rstrip('\n').split('\t')
    return text


counts = {}
counts[0] = {(): 0}

for line in sys.stdin:
    s = tokenize(line)
    gather_counts(s)
pickle.dump(counts, open('model.pickle', 'wb+'))
65
src/logprobs_and_predict.py
Normal file
@@ -0,0 +1,65 @@
#!/usr/bin/env python

import sys
from math import log
import pickle

def laplace(count, total, nb_classes, alpha=1.0):
    return (count + alpha) / (total + nb_classes)

def prob(count, total, nb_classes):
    return count / total

def into_words(sentence):
    return sentence.split(' ')

def get_log_prob(model, trigram, n, sentence):
    vocabulary_size = len(model_unigram[1])
    logprob_total = 0
    #import ipdb; ipdb.set_trace()
    for i in range(0, len(sentence)-n+1):
        ngram = tuple(sentence[i:i+n])
        pre_ngram = tuple(sentence[i:i+n-1])
        prob = laplace(model[n].get(ngram, 0), trigram[3].get(pre_ngram, 0), vocabulary_size)
        logprob_total += log(prob)
    return logprob_total

def get_last(sentence):
    year_s, year_e, text, text_rest = sentence.rstrip('\n').split('\t')
    return text

#def find_next_word(words):
#    candidate_list=[]
#    for word in vocab:
#        p = get_log_prob(model, 4, words)
#        candidate

if len(sys.argv) != 6:
    quit()

model_name = sys.argv[1]
with open(model_name, 'rb') as file:
    model = pickle.load(file)

unigram_name = sys.argv[2]
with open(unigram_name, 'rb') as file:
    model_unigram = pickle.load(file)
vocab = [i[0] for i in list(model_unigram[1])]

trigram_name = sys.argv[3]
with open(trigram_name, 'rb') as file:
    model_trigram = pickle.load(file)

with open(sys.argv[4]) as file, open(sys.argv[5], 'w+') as out:
    for line in file:

        text = into_words(get_last(line))[-3:]
        best_word = ("", -1000000)
        for word in vocab:
            filled = text + [word]
            #import ipdb; ipdb.set_trace()
            value = get_log_prob(model, model_trigram, 4, filled)

            if value > best_word[1]:
                best_word = (word, value)
        out.write(best_word[0] + "\n")
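Scoring the entire unigram vocabulary for every input line makes the prediction loop above O(|V|) per line. A cheaper candidate set, sketched under the assumption that the first model holds the 4-gram counts produced by zad.py, is to index once which words were ever observed after each trigram context and fall back to the full vocabulary only for unseen contexts:

from collections import defaultdict

def continuation_index(model):
    # Built once: trigram context -> words observed to follow it in the 4-gram counts.
    index = defaultdict(list)
    for gram in model[4]:
        index[gram[:3]].append(gram[3])
    return index

# In the loop above, instead of iterating over all of vocab:
#   cands = index.get(tuple(text), vocab)
#   for word in cands: ...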
1
src/test_dev_0
Normal file
@@ -0,0 +1 @@
1874 1874.99999996829 tez wiecznym pokojem się ciosIi } ' . " - ' Poniewoi zaś musicie storać się oto , aby groi , ąoym niebezpiec : Leńst om Waslą władzą , roztropn { tścią i gorliwością zapobieas , przeto uZDacie , ie nic nie będzie stósowniejszcgo i poiyteczniejszego jsk S5Ułmć we wspólnej noradzie właściwych dróg , aby po ądany cel tern powniej i skuteczniej osiągnęć . Skoro prawa Kościoła są nnrU ! Jioue , obowiązkiem jest Waszym f.l ' onić wiernych ; tern bezpiecllłiej lią aś będzie osłona i tem siluiejszą obrona , im .vgodniej i ląc żnićj usiłowania pojedyńcle dtiolać będę , i im gQrliwiej obmyśhme i oznac ; ; ; olle p08t powal1ie , poło eniem rzeczy nakaz8no . Dla tego u , pomimmry Was , abyście jak mi : < ł.na najbardziej zebrali się i po wspólnej naradzie naznaczony paw " , i przeli Y ' szysłkich Pr.l ) jętą modłę , według której , jak tego iVas ! ll uuąd wymaga , jednozgodnie grozące _ le tłumili i wolności Kościoła sili ' iie bronili . Dia tego illJsieliśmy Was upomnieć , iiby się nie zda- \ \ ' rało , ie w t k " , ' dncj sprawie obowiązku NosJicge Iinniodbaliśill 1 ' . _ ' tlbowiom przekonani jesteśmy , ie- ) ; , yścio i bez tego NasEego upomnienia to uczynili . Nie nzekliśmy się takie jesloze nadzioi , ie Bóg odwróci Istniejące złe , gdył zagrzewa Nos del ) rą nadł : ieją prsywiązanie i wiaro Nł ! szego nojukochań- Siego syna w Chr } stusie , Cf ' sarZ ! 1 i królt ' Franciszka Józefa ; ktorego w ponownym liście z dnia dJ : i- eiejszego J : tego powodu zuklinaliśm ) r , oby nigdy nie dO.lwom , by w jego rozległem pnństwie KościGI poddnn , .. Ioostał han ! e mej nieV \ \ ' cli , a jego poddani kat.oliccy n3jwięka ym uciskom . Gdy atoli wielu uderze na Kościół b wszelka 1i ' J \ \ : ! oka nuder nif \ \ bezpieunę , ł ' ueto Wy JUijmniej moieoie trwać w nieoJ . ) ' -nnoici . Oby Bóg kierował Wn ! łemi pos ' anowienian , i i ' " sf ! iernł Wes swoj , potęiną or ; iekę , iibyśde zdJłali sJicJif2śliwie postallowić i } JrJiywieść do sl ; ułku , ce IJa chWflłę Jego Imienia i dla zbawienia dusz słuiy . Na znak tej Boskiej opieki i Nt1slicj S.l : ł ególnej pn ; ych ) JnoEci udJiielamy WatID wszyt ! iim i II ( } sobna kałdemu , ulmchoni Syn ( , Vlie i cxdg ( JdlJi BJ8cin , rreJ : vucho \ \ ' \ \ ieńslwu J wiern } m Waflzej opiece- powierzonym , miłościwe Nf S1 : e błogosławieństwo r-posłoJsllie . Dfn w Rzymie u św. Piotra. dnia 7 mar a 1874 , 28 pont } ' fikatu NVSJegci . ( } o się w tygodniu naj ' wułżniejszego stało " J. na 8wie ie . Niemcy . W.Berlinie obradowaTttj w sejmie oprócz o innych mniejszej wagi sprawach , o pra-wie prasowym , ł. j. o prawie tycz c1 ' m si go- , .et , pism f ' rukQwanycb , księ2e ; . i. t. d . Podowie bn \ \ \ \ arsc ) ' poslaH ndref do swego króla , w którym go proszą aby oparł się prawom nowomodnym w rle " liach religijnych i politycznych , które moją jeszcze być .uprowadzone w zjednonon ) " m państwie ni.emieckim. Król który w ogóle rządem muło się I : ojmuje , oddał pismo posłów ministrom . Wysłańcy b . ! warscy w bundesr .cie gło sowali za pr £ \ \ wem o uwięlirmiu i wypęda ; aniu II kraju , Nskupów i księiy , które to prawo wnet sejm lwi rzesz ) ' niemieckiej przedłoionym zostanie , gdzie naturalnie p % ejdliie . -- Tak nazwane 5to ' rV8rJ : yszenie chłopów bawarskich , do którego i naj w ięlisi panowie 08lełą , puesłało posłom bawarskim w Berlinie adres , prosląc ioh , aby racJ : ej sejm op uścili , niiby brali ud.iat w naradath nad prawami , sprzeciwiającymi si d wchowi katolickiemu i lifłckowawozemu ( konserwatywnemu ) PIUS IX. ludu bawarskiego . 
Rzęd pflelnl ! CI ł 250 , OÓO tala-rów na podwyiszen ' e pelJsyi . \ \ f1ięi-y , d ; ) ' ka- .dy miał przynajmniej 500 talarów . Pewnie żaden tisiąds
46
src/zad.py
Normal file
@@ -0,0 +1,46 @@
#!/usr/bin/env python

import sys, pickle
from math import exp, log

def add_markers(tokens):
    return ['<BOS>'] + tokens + ['<EOS>']

def into_words(sentence):
    a = sentence.split(' ')
    return a

def gather_counts(from_n, to_n, sentences):
    counts = {}
    counts[0] = {() : 0}
    for sentence in sentences:
        tokens = add_markers(into_words(sentence))
        ntokens = len(tokens)
        counts[0][()] += ntokens

        for n in range(from_n, to_n+1):
            for i in range(0, ntokens-n+1):
                ngram = tuple(tokens[i:i+n])
                if n not in counts:
                    counts[n] = {}
                if ngram in counts[n]:
                    counts[n][ngram] += 1
                else:
                    counts[n][ngram] = 1
    return counts

def tokenize(segment):
    d, dd, l, r, text = segment.rstrip('\n').split('\t')
    return text


sen = []
with open(sys.argv[1]) as file:
    for line in file:
        ss = tokenize(line)
        sen.append(ss)

model_file = sys.argv[2]
model = gather_counts(4, 4, sen)
with open(model_file, 'wb+') as p:
    pickle.dump(model, p, pickle.HIGHEST_PROTOCOL)
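zad.py is the tetragram counterpart of create_dictionary.py: the two scripts are identical except for the shebang and the call gather_counts(4, 4, sen), so running it as ./zad.py train.tsv tetragram.pkl (output name assumed) would produce the 4-gram counts that logprobs_and_predict.py expects as its first argument.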
925
src/zajeciaipynb.ipynb
Normal file
@ -0,0 +1,925 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#import regex as re\n",
|
||||||
|
"\n",
|
||||||
|
"def into_words(sentence):\n",
|
||||||
|
" return sentence.split(' ')#re.findall(r'\\p{P}|[^\\p{P}\\s]+', sentence)\n",
|
||||||
|
"\n",
|
||||||
|
"def into_characters(sentence):\n",
|
||||||
|
" return list(sentence)\n",
|
||||||
|
"\n",
|
||||||
|
" "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['Z',\n",
|
||||||
|
" 'a',\n",
|
||||||
|
" 'ż',\n",
|
||||||
|
" 'ó',\n",
|
||||||
|
" 'ł',\n",
|
||||||
|
" 'ć',\n",
|
||||||
|
" ' ',\n",
|
||||||
|
" 'j',\n",
|
||||||
|
" 'a',\n",
|
||||||
|
" 'ź',\n",
|
||||||
|
" 'n',\n",
|
||||||
|
" 'i',\n",
|
||||||
|
" 'ą',\n",
|
||||||
|
" ' ',\n",
|
||||||
|
" 'g',\n",
|
||||||
|
" 'ę',\n",
|
||||||
|
" 'ś',\n",
|
||||||
|
" 'l',\n",
|
||||||
|
" '.']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"into_characters(\"Zażółć jaźnią gęśl.\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['Ala', 'has', 'a', 'cat', 'and', 'a', 'dog', '.']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"into_words(\"Ala has a cat and a dog.\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['Humpty', '-', 'dumpty', '3s', ',', 'eg', '.', 'problems', '.']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"into_words(\"Humpty-dumpty 3s, eg. problems.\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['Adam',\n",
|
||||||
|
" ',',\n",
|
||||||
|
" 'who',\n",
|
||||||
|
" 'smokes',\n",
|
||||||
|
" 'a',\n",
|
||||||
|
" 'lot',\n",
|
||||||
|
" ',',\n",
|
||||||
|
" 'caught',\n",
|
||||||
|
" 'COVID',\n",
|
||||||
|
" '-',\n",
|
||||||
|
" '19',\n",
|
||||||
|
" '.']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"into_words(\"Adam, who smokes a lot, caught COVID-19.\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['A', 'l', 'a', ' ', 'h', 'a', 's', ' ', 'a', ' ', 'c', 'a', 't', '.']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"into_characters(\"Ala has a cat.\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 22,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from syntok.tokenizer import Tokenizer\n",
|
||||||
|
"\n",
|
||||||
|
"def by_syntok(sentence):\n",
|
||||||
|
" tok = Tokenizer()\n",
|
||||||
|
" return [str(t) for t in tok.tokenize(sentence)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 26,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['Humpty',\n",
|
||||||
|
" '-dumpty',\n",
|
||||||
|
" ' and',\n",
|
||||||
|
" ' Alice',\n",
|
||||||
|
" ' has',\n",
|
||||||
|
" ' pets',\n",
|
||||||
|
" ' e.g',\n",
|
||||||
|
" '.',\n",
|
||||||
|
" ' dogs',\n",
|
||||||
|
" '!',\n",
|
||||||
|
" '!',\n",
|
||||||
|
" '!',\n",
|
||||||
|
" '!',\n",
|
||||||
|
" '!']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 26,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"by_syntok(\"Humpty-dumpty and Alice has pets e.g. dogs!!!!!\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def add_markers(tokens):\n",
|
||||||
|
" return ['<BOS>'] + tokens + ['<EOS>']\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 29,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['<BOS>', 'This', 'is', 'a', 'black', 'cat', '.', '<EOS>']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 29,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"add_markers(into_words('This is a black cat.'))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 30,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['<BOS>', 'Humpty', '-dumpty', ' jumped', '.', '<EOS>']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 30,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"add_markers(by_syntok(\"Humpty-dumpty jumped.\"))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Gathering simple counts"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 32,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def gather_counts(from_n, to_n, sentences, splitter=lambda s: add_markers(into_words(s))):\n",
|
||||||
|
" counts = {}\n",
|
||||||
|
" counts[0] = {(): 0}\n",
|
||||||
|
" for sentence in sentences:\n",
|
||||||
|
" tokens = splitter(sentence)\n",
|
||||||
|
" ntokens = len(tokens)\n",
|
||||||
|
" counts[0][()] += ntokens\n",
|
||||||
|
" for n in range(from_n, to_n+1):\n",
|
||||||
|
" for i in range(0, ntokens-n+1):\n",
|
||||||
|
" ngram = tuple(tokens[i:i+n])\n",
|
||||||
|
" if n not in counts:\n",
|
||||||
|
" counts[n] = {}\n",
|
||||||
|
" \n",
|
||||||
|
" if ngram in counts[n]:\n",
|
||||||
|
" counts[n][ngram] += 1\n",
|
||||||
|
" else: \n",
|
||||||
|
" counts[n][ngram] = 1\n",
|
||||||
|
" return counts"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 33,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{0: {(): 17},\n",
|
||||||
|
" 1: {('<BOS>',): 3,\n",
|
||||||
|
" ('Ala',): 1,\n",
|
||||||
|
" ('ma',): 2,\n",
|
||||||
|
" ('kota',): 1,\n",
|
||||||
|
" ('.',): 2,\n",
|
||||||
|
" ('<EOS>',): 3,\n",
|
||||||
|
" ('Basia',): 1,\n",
|
||||||
|
" ('psa',): 1,\n",
|
||||||
|
" ('Gdzie',): 1,\n",
|
||||||
|
" ('mieszkasz',): 1,\n",
|
||||||
|
" ('?',): 1},\n",
|
||||||
|
" 2: {('<BOS>', 'Ala'): 1,\n",
|
||||||
|
" ('Ala', 'ma'): 1,\n",
|
||||||
|
" ('ma', 'kota'): 1,\n",
|
||||||
|
" ('kota', '.'): 1,\n",
|
||||||
|
" ('.', '<EOS>'): 2,\n",
|
||||||
|
" ('<BOS>', 'Basia'): 1,\n",
|
||||||
|
" ('Basia', 'ma'): 1,\n",
|
||||||
|
" ('ma', 'psa'): 1,\n",
|
||||||
|
" ('psa', '.'): 1,\n",
|
||||||
|
" ('<BOS>', 'Gdzie'): 1,\n",
|
||||||
|
" ('Gdzie', 'mieszkasz'): 1,\n",
|
||||||
|
" ('mieszkasz', '?'): 1,\n",
|
||||||
|
" ('?', '<EOS>'): 1},\n",
|
||||||
|
" 3: {('<BOS>', 'Ala', 'ma'): 1,\n",
|
||||||
|
" ('Ala', 'ma', 'kota'): 1,\n",
|
||||||
|
" ('ma', 'kota', '.'): 1,\n",
|
||||||
|
" ('kota', '.', '<EOS>'): 1,\n",
|
||||||
|
" ('<BOS>', 'Basia', 'ma'): 1,\n",
|
||||||
|
" ('Basia', 'ma', 'psa'): 1,\n",
|
||||||
|
" ('ma', 'psa', '.'): 1,\n",
|
||||||
|
" ('psa', '.', '<EOS>'): 1,\n",
|
||||||
|
" ('<BOS>', 'Gdzie', 'mieszkasz'): 1,\n",
|
||||||
|
" ('Gdzie', 'mieszkasz', '?'): 1,\n",
|
||||||
|
" ('mieszkasz', '?', '<EOS>'): 1}}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 33,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"gather_counts(1, 3, [\"Ala ma kota.\", 'Basia ma psa.', 'Gdzie mieszkasz?'])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 36,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"model = gather_counts(1, 4, [\"Ala ma kota.\", 'Basia ma psa.', 'Hej, gdzie teraz mieszkasz?'], splitter=lambda s: add_markers(by_syntok(s)))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 45,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 45,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"model[2][(' ma', ' kota')]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 46,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{('<BOS>',): 3,\n",
|
||||||
|
" ('Ala',): 1,\n",
|
||||||
|
" (' ma',): 2,\n",
|
||||||
|
" (' kota',): 1,\n",
|
||||||
|
" ('.',): 2,\n",
|
||||||
|
" ('<EOS>',): 3,\n",
|
||||||
|
" ('Basia',): 1,\n",
|
||||||
|
" (' psa',): 1,\n",
|
||||||
|
" ('Hej',): 1,\n",
|
||||||
|
" (',',): 1,\n",
|
||||||
|
" (' gdzie',): 1,\n",
|
||||||
|
" (' teraz',): 1,\n",
|
||||||
|
" (' mieszkasz',): 1,\n",
|
||||||
|
" ('?',): 1}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 46,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"model[1]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 59,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"shakespeare=(s.strip() for s in open('100-0.txt') if re.search(r'\\S', s))\n"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<generator object <genexpr> at 0x7f7e5dfe1ba0>"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"shakespeare"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'\\ufeffProject Gutenberg’s The Complete Works of William Shakespeare, by William Shakespeare'"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"next(shakespeare)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'This eBook is for the use of anyone anywhere in the United States and'"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"next(shakespeare)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'most other parts of the world at no cost and with almost no restrictions'"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"next(shakespeare)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"sh_model = gather_counts(1, 3, shakespeare)"
]
},
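{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note: the three `next(shakespeare)` calls above already consumed the first lines of the generator, so those lines are not counted in `sh_model`. A generator can be iterated only once; to rebuild the model over the whole file, recreate it first (a minimal sketch, assuming the same file and `gather_counts` as above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Recreate the generator, then count again over the full file.\n",
"shakespeare = (s.strip() for s in open('100-0.txt') if re.search(r'\\S', s))\n",
"sh_model = gather_counts(1, 3, shakespeare)"
]
},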
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"877"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sh_model[2][('to', 'be')]"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"57"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sh_model[2][('be', 'to')]\n"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sh_model[1][('Poland',)]"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2283"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sh_model[1][('love',)]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"92615"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sh_model[1][(',',)]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{(): 1545199}"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sh_model[0]"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(',', 'my', 'lord')"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted(sh_model[3].keys(), key=lambda k: sh_model[3][k])[-5]"
]
},
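{
"cell_type": "markdown",
"metadata": {},
"source": [
"The full sort above costs O(n log n) just to pick out one of the most frequent trigrams. A heap-based top-k lookup does the same job in O(n log k) (a sketch over the same `sh_model`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import heapq\n",
"\n",
"# Five most frequent trigrams, most frequent first;\n",
"# the last element matches the [-5] lookup above.\n",
"heapq.nlargest(5, sh_model[3].keys(), key=lambda k: sh_model[3][k])"
]
},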
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Simple n-gram model\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Word sequence: $(w_1,...,w_N)$ and model $M$\n",
"We'd like to have $P_M(w_1,...,w_N)$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$P(w_1,...,w_N) = P(w_1)P(w_2|w_1)P(w_3|w_1 w_2)\\ldots P(w_i|w_1 w_2 \\ldots w_{i-1}) \\ldots P(w_N|w_1 w_2 \\ldots w_{N-1})$"
]
},
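{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is the chain rule of probability; it is exact, not an approximation. For a three-word sentence, for instance:\n",
"\n",
"$P(\\text{Ala ma kota}) = P(\\text{Ala}) \\cdot P(\\text{ma}|\\text{Ala}) \\cdot P(\\text{kota}|\\text{Ala ma})$"
]
},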
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Assumption: probability of a word depends on a limited context"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"(Approximation, not true) \"Piotr, co mieszka w tym dużym zielonym budynku, kupił samochód.\" vs \"\"Anna, co mieszka w tym dużym zielonym budynku, kupiła samochód.\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$P(w_1,...,w_N) \\approx P(w_1)P(w_2|w_1)P(w_3|w_1 w_2)\\ldots P(w_i|w_{i-(n-1)} \\ldots w_{i-1}) \\ldots P(w_N|w_{N-(i-1)} \\ldots w_{N-1})$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"unigram model $P(w_1,...,w_N) \\approx P(w_1)\\ldots P(w_N) = \\prod P(w_i)$"
]
},
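{
"cell_type": "markdown",
"metadata": {},
"source": [
"The unigram probabilities are estimated from the counts gathered above: $P(w) \\approx \\frac{C(w)}{C_{total}}$, e.g. $P(\\text{love}) \\approx 2283 / 1545199 \\approx 0.0015$ for the Shakespeare model."
]
},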
{
"cell_type": "markdown",
"metadata": {},
"source": [
"bigram model $P(w_1,...,w_N) \\appr('<BOS>',)ox \\prod P(w_i|w_{i-1})$"
]
},
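{
"cell_type": "markdown",
"metadata": {},
"source": [
"The conditional probabilities are again estimated from counts: $P(w_i|w_{i-1}) \\approx \\frac{C(w_{i-1} w_i)}{C(w_{i-1})}$, which is exactly the `model[n][ngram] / model[n-1][pre_ngram]` ratio used in the code below."
]
},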
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
"from math import log, exp\n",
|
||||||
|
"\n",
|
||||||
|
"def get_prob_simple(model, n, sentence):\n",
|
||||||
|
" logprob_total = 0\n",
|
||||||
|
" for i in range(0, len(sentence)-n+1):\n",
|
||||||
|
" ngram = tuple(sentence[i:i+n])\n",
|
||||||
|
" pre_ngram = tuple(sentence[i:i+n-1])\n",
|
||||||
|
" prob = model[n].get(ngram, 0) / model[n-1].get(pre_ngram, 0)\n",
|
||||||
|
" logprob_total += log(prob)\n",
|
||||||
|
" return logprob_total \n",
|
||||||
|
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$\\log(ab) = \\log a + \\log b$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$\\log \\prod P(w_i) = \\sum \\log P(w_i)$"
]
},
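{
"cell_type": "markdown",
"metadata": {},
"source": [
"Summing log-probabilities instead of multiplying raw probabilities avoids floating-point underflow: a product of many values below 1 quickly rounds to 0.0. A quick numeric check of the identity:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from math import log\n",
"\n",
"# log(a*b) and log(a) + log(b) agree up to floating-point rounding.\n",
"a, b = 0.3, 0.5\n",
"log(a * b), log(a) + log(b)"
]
},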
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"7.128462813174801e-07"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"exp(get_prob_simple(sh_model, 2, add_markers(into_words('I love thee.'))))"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8.585040690529112e-11"
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"exp(get_prob_simple(sh_model, 1, add_markers(into_words('I love you.'))))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Smoothing"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [],
"source": [
"def prob(count, total, nb_classes):\n",
|
||||||
|
" return count / total"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.0"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"prob(3, 3, 2)"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [],
"source": [
"def laplace(count, total, nb_classes, alpha=1.0):\n",
|
||||||
|
" return (count + alpha) / (total + nb_classes)"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.4"
]
},
"execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"laplace(1, 3, 2)\n"
]
},
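{
"cell_type": "markdown",
"metadata": {},
"source": [
"The point of smoothing: an event with count 0 still gets a nonzero probability, and a smaller alpha keeps the estimate closer to the raw counts (a quick check with the function above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Unseen event: (0 + 1) / (3 + 1*2) = 0.2 with add-one smoothing,\n",
"# and closer to 0 with a smaller alpha: 0.1 / 3.2 = 0.03125.\n",
"laplace(0, 3, 2), laplace(0, 3, 2, alpha=0.1)"
]
},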
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Smoothing in n-gram models"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
"def get_prob_smoothed(model, n, sentence):\n",
|
||||||
|
" vocabulary_size = len(model[1])\n",
|
||||||
|
" \n",
|
||||||
|
" logprob_total = 0\n",
|
||||||
|
" for i in range(0, len(sentence)-n+1):\n",
|
||||||
|
" ngram = tuple(sentence[i:i+n])\n",
|
||||||
|
" pre_ngram = tuple(sentence[i:i+n-1])\n",
|
||||||
|
" prob = laplace(model[n].get(ngram, 0), model[n-1].get(pre_ngram, 0), vocabulary_size)\n",
|
||||||
|
" logprob_total += log(prob)\n",
|
||||||
|
" return logprob_total "
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3.843912914870102e-16"
]
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"exp(get_prob_smoothed(sh_model, 1, add_markers(into_words('Love I Czechia.'))))"
]
},
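{
"cell_type": "markdown",
"metadata": {},
"source": [
"For comparison, the unsmoothed model assigns this sentence probability 0 (assuming 'Czechia' never occurs in the Shakespeare corpus), which is exactly the failure mode smoothing repairs:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# With the zero-count guard in get_prob_simple this is exp(-inf) == 0.0.\n",
"exp(get_prob_simple(sh_model, 1, add_markers(into_words('Love I Czechia.'))))"
]
},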
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['<BOS>', 'I', 'love', 'thee.', '<EOS>']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"add_markers(into_words('I love thee.'))\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}