Compare commits

..

No commits in common. "smoothing" and "master" have entirely different histories.

4 changed files with 18074 additions and 18088 deletions

File diff suppressed because it is too large Load Diff

194
run.ipynb
View File

@ -1,5 +1,13 @@
{ {
"cells": [ "cells": [
{
"cell_type": "markdown",
"id": "2a4fb731",
"metadata": {},
"source": [
"MODEL TRIGRAMOWY - uwzględniamy dwa poprzednie słowa"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 1,
@ -10,8 +18,7 @@
"import lzma\n", "import lzma\n",
"import csv\n", "import csv\n",
"import re\n", "import re\n",
"import math\n", "import math"
"from collections import Counter"
] ]
}, },
{ {
@ -36,12 +43,27 @@
" \n", " \n",
" return data, words\n", " return data, words\n",
" \n", " \n",
" return data" " return data\n",
"\n",
"train_data, train_words = read_data('train')"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 3,
"id": "a4a73c19",
"metadata": {},
"outputs": [],
"source": [
"def print_example(data, words, idx):\n",
" print(f'{data[idx][0]} _____{words[idx].upper()}_____ {data[idx][1]}')\n",
" \n",
"# print_example(train_data, train_words, 13)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ce522af5", "id": "ce522af5",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -53,12 +75,17 @@
" words=[word for word in text.split()]\n", " words=[word for word in text.split()]\n",
" temp=zip(*[words[i:] for i in range(0,ngram)])\n", " temp=zip(*[words[i:] for i in range(0,ngram)])\n",
" ans=[' '.join(ngram) for ngram in temp]\n", " ans=[' '.join(ngram) for ngram in temp]\n",
" return ans" " return ans\n",
"\n",
"N_grams = []\n",
"for i in range(len(train_data[:5000])):\n",
" N_grams += generate_N_grams(f'{train_data[i][0]} {train_words[i]} {train_data[i][1]}', 2)\n",
" N_grams += generate_N_grams(f'{train_data[i][0]} {train_words[i]} {train_data[i][1]}', 3)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 5,
"id": "317ade72", "id": "317ade72",
"metadata": { "metadata": {
"scrolled": true "scrolled": true
@ -66,11 +93,6 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"def check_prob(N_grams):\n", "def check_prob(N_grams):\n",
" if ' ' not in N_grams[0]:\n",
" counter = Counter()\n",
" a = Counter(N_grams)\n",
" total = sum(a.values())\n",
" return {k: v / total for total in (sum(a.values()),) for k, v in a.items()}\n",
" count = {}\n", " count = {}\n",
" for i in N_grams:\n", " for i in N_grams:\n",
" i = i.rsplit(maxsplit=1)\n", " i = i.rsplit(maxsplit=1)\n",
@ -86,49 +108,64 @@
" s = sum(count[word].values())\n", " s = sum(count[word].values())\n",
" for i in count[word]:\n", " for i in count[word]:\n",
" count[word][i] = count[word][i] / s\n", " count[word][i] = count[word][i] / s\n",
" count[word] = sorted(count[word].items(), key=lambda x: x[1], reverse=True)\n",
" \n", " \n",
" return count" " return count\n",
] "\n",
}, "probs = check_prob(N_grams)"
{
"cell_type": "code",
"execution_count": 5,
"id": "86aeda02",
"metadata": {},
"outputs": [],
"source": [
"def find_word(words, model):\n",
" n = len(words)\n",
" tmp = {}\n",
" while n > 1:\n",
" if ' '.join(words[-n:]) in model[n]:\n",
" tmp = model[n][' '.join(words[-n:])][:2]\n",
" break\n",
" else:\n",
" n -= 1\n",
" \n",
" res = ' '.join([i[0] + ':' + str(i[1]) for i in tmp])\n",
" s = 1 - sum(n for _, n in tmp)\n",
" if s == 0:\n",
" s = 1\n",
" res += ' :' + str(s)\n",
" if tmp == {}:\n",
" if words[-1] in model[0]:\n",
" return f'{words[-1]}:{model[0][words[-1]]} :{1 - model[0][words[-1]]}'\n",
" else:\n",
" return ':1'\n",
" return res"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 6,
"id": "3a7ec4ec",
"metadata": {},
"outputs": [],
"source": [
"dev_data, dev_words = read_data('dev-0')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "86aeda02",
"metadata": {},
"outputs": [],
"source": [
"def find_word(word_1, word_2):\n",
" tmp_probs = {}\n",
" if word_1 in probs:\n",
" if word_2 in probs:\n",
" for i in probs[word_1]:\n",
" if i in probs[word_2]:\n",
" tmp_probs[i] = probs[word_1][i] * probs[word_2][i]\n",
" if tmp_probs[i] == 1:\n",
" tmp_probs[i] = 0.1\n",
" else:\n",
" tmp_probs[i] = probs[word_1][i] / 5\n",
" else:\n",
" tmp_probs = probs[word_1]\n",
" else:\n",
" tmp_probs = {}\n",
" \n",
" sorted_list = sorted(tmp_probs.items(), key=lambda x: x[1], reverse=True)[:1]\n",
" tmm = ' '.join([i[0] + ':' + str(i[1]) for i in sorted_list])\n",
" s = 1 - sum(n for _, n in sorted_list)\n",
" if s == 0:\n",
" s = 0.01\n",
" tmm += ' :' + str(s)\n",
" if tmp_probs == {}:\n",
" return ':1'\n",
" return tmm"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "3b713dc3", "id": "3b713dc3",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def find_words(data, n, model):\n", "def find_words(data):\n",
" found_words = []\n", " found_words = []\n",
" for i in data:\n", " for i in data:\n",
" t = i[0]\n", " t = i[0]\n",
@ -136,13 +173,15 @@
" if True:\n", " if True:\n",
" t = re.sub(r'[^\\w\\s]', ' ', t)\n", " t = re.sub(r'[^\\w\\s]', ' ', t)\n",
" words=[word for word in t.split()]\n", " words=[word for word in t.split()]\n",
" found_words.append(find_word(words[-n:], model))\n", " found_words.append(find_word(words[-1], ' '.join(words[-2:])))\n",
" return found_words" " return found_words\n",
"\n",
"dev_found_words = find_words(dev_data)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 9,
"id": "17be7468", "id": "17be7468",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -150,74 +189,21 @@
"def save_data(folder, words):\n", "def save_data(folder, words):\n",
" f = open(f'{folder}/out.tsv', 'w')\n", " f = open(f'{folder}/out.tsv', 'w')\n",
" f.write('\\n'.join(words) + '\\n')\n", " f.write('\\n'.join(words) + '\\n')\n",
" f.close()" " f.close()\n",
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "8c127bae",
"metadata": {},
"outputs": [],
"source": [
"def train(n, data_size = 5000):\n",
" train_data, train_words = read_data('train')\n",
" N_grams = [[] for i in range(n)]\n",
" probs = [[] for i in range(n)]\n",
" for i in range(len(train_data[:data_size])):\n",
" for j in range(n):\n",
" N_grams[j] += generate_N_grams(f'{train_data[i][0]} {train_words[i]} {train_data[i][1]}', j + 1)\n",
" for i in range(n):\n",
" probs[i] = check_prob(N_grams[i])\n",
" return probs\n",
" \n", " \n",
"model = train(4)" "save_data('dev-0', dev_found_words)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "935c0f87",
"metadata": {},
"outputs": [],
"source": [
"def predict(model, n, data_name, test_data=False):\n",
" if not test_data:\n",
" data, _ = read_data(data_name, test_data)\n",
" else:\n",
" data = read_data(data_name, test_data)\n",
" found_words = find_words(data, n - 1, model)\n",
" save_data(data_name, found_words)\n",
" \n",
"predict(model, 4, 'dev-0')"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 10,
"id": "e43fd5b3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"794.13\r\n"
]
}
],
"source": [
"!./geval -t dev-0"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "b2e52242", "id": "b2e52242",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"predict(model, 4, 'test-A', True)" "test_data = read_data('test-A', True)\n",
"test_found_words = find_words(test_data)\n",
"save_data('test-A', test_found_words)"
] ]
} }
], ],

102
run.py
View File

@ -1,11 +1,13 @@
#!/usr/bin/env python #!/usr/bin/env python
# coding: utf-8 # coding: utf-8
# MODEL TRIGRAMOWY - uwzględniamy dwa poprzednie słowa
import lzma import lzma
import csv import csv
import re import re
import math import math
from collections import Counter
def read_data(folder_name, test_data=False): def read_data(folder_name, test_data=False):
@ -25,6 +27,14 @@ def read_data(folder_name, test_data=False):
return data return data
train_data, train_words = read_data('train')
def print_example(data, words, idx):
    """Print one example with its word shown in upper case between two text parts.

    Args:
        data: Indexable collection of rows; elements 0 and 1 of row ``idx``
            are printed before and after the word (presumably the left and
            right context of the gap; confirm against ``read_data``).
        words: Indexable collection of strings aligned with ``data``.
        idx: Position of the example to print.
    """
    # The word is upper-cased and wrapped in underscores so it stands out
    # from the surrounding text.
    print(f'{data[idx][0]} _____{words[idx].upper()}_____ {data[idx][1]}')
# print_example(train_data, train_words, 13)
def generate_N_grams(text, ngram=1, no_punctuation=True): def generate_N_grams(text, ngram=1, no_punctuation=True):
text = re.sub(r'[\-] ', '', text).lower() text = re.sub(r'[\-] ', '', text).lower()
@ -35,13 +45,13 @@ def generate_N_grams(text, ngram=1, no_punctuation=True):
ans=[' '.join(ngram) for ngram in temp] ans=[' '.join(ngram) for ngram in temp]
return ans return ans
# Collect bigrams and trigrams from the first 5000 training rows. Each row is
# rebuilt as "<row[0]> <word> <row[1]>" before n-gram extraction.
N_grams = []
for i, row in enumerate(train_data[:5000]):
    # Build the joined text once and reuse it for both n-gram orders.
    text = f'{row[0]} {train_words[i]} {row[1]}'
    N_grams += generate_N_grams(text, 2)
    N_grams += generate_N_grams(text, 3)
def check_prob(N_grams): def check_prob(N_grams):
if ' ' not in N_grams[0]:
counter = Counter()
a = Counter(N_grams)
total = sum(a.values())
return {k: v / total for total in (sum(a.values()),) for k, v in a.items()}
count = {} count = {}
for i in N_grams: for i in N_grams:
i = i.rsplit(maxsplit=1) i = i.rsplit(maxsplit=1)
@ -57,35 +67,43 @@ def check_prob(N_grams):
s = sum(count[word].values()) s = sum(count[word].values())
for i in count[word]: for i in count[word]:
count[word][i] = count[word][i] / s count[word][i] = count[word][i] / s
count[word] = sorted(count[word].items(), key=lambda x: x[1], reverse=True)
return count return count
probs = check_prob(N_grams)
def find_word(words, model):
n = len(words) dev_data, dev_words = read_data('dev-0')
tmp = {}
while n > 1:
if ' '.join(words[-n:]) in model[n]: def find_word(word_1, word_2):
tmp = model[n][' '.join(words[-n:])][:2] tmp_probs = {}
break if word_1 in probs:
if word_2 in probs:
for i in probs[word_1]:
if i in probs[word_2]:
tmp_probs[i] = probs[word_1][i] * probs[word_2][i]
if tmp_probs[i] == 1:
tmp_probs[i] = 0.1
else:
tmp_probs[i] = probs[word_1][i] / 5
else: else:
n -= 1 tmp_probs = probs[word_1]
else:
tmp_probs = {}
res = ' '.join([i[0] + ':' + str(i[1]) for i in tmp]) sorted_list = sorted(tmp_probs.items(), key=lambda x: x[1], reverse=True)[:1]
s = 1 - sum(n for _, n in tmp) tmm = ' '.join([i[0] + ':' + str(i[1]) for i in sorted_list])
s = 1 - sum(n for _, n in sorted_list)
if s == 0: if s == 0:
s = 1 s = 0.01
res += ' :' + str(s) tmm += ' :' + str(s)
if tmp == {}: if tmp_probs == {}:
if words[-1] in model[0]: return ':1'
return f'{words[-1]}:{model[0][words[-1]]} :{1 - model[0][words[-1]]}' return tmm
else:
return ':1'
return res
def find_words(data, n, model): def find_words(data):
found_words = [] found_words = []
for i in data: for i in data:
t = i[0] t = i[0]
@ -93,38 +111,20 @@ def find_words(data, n, model):
if True: if True:
t = re.sub(r'[^\w\s]', ' ', t) t = re.sub(r'[^\w\s]', ' ', t)
words=[word for word in t.split()] words=[word for word in t.split()]
found_words.append(find_word(words[-n:], model)) found_words.append(find_word(words[-1], ' '.join(words[-2:])))
return found_words return found_words
dev_found_words = find_words(dev_data)
def save_data(folder, words):
    """Write one entry per line to ``<folder>/out.tsv``.

    Args:
        folder: Directory that receives ``out.tsv``; it must already exist.
        words: Iterable of strings, each written as one output line.

    Any existing ``out.tsv`` is overwritten. The entries are joined with
    newlines and a final newline is appended.
    """
    # The context manager closes the handle even if the write raises; the
    # manual open()/close() pair left it open on error. The encoding is fixed
    # so the output does not depend on the platform's locale default.
    with open(f'{folder}/out.tsv', 'w', encoding='utf-8') as f:
        f.write('\n'.join(words) + '\n')
save_data('dev-0', dev_found_words)
def train(n, data_size = 5000):
train_data, train_words = read_data('train')
N_grams = [[] for i in range(n)]
probs = [[] for i in range(n)]
for i in range(len(train_data[:data_size])):
for j in range(n):
N_grams[j] += generate_N_grams(f'{train_data[i][0]} {train_words[i]} {train_data[i][1]}', j + 1)
for i in range(n):
probs[i] = check_prob(N_grams[i])
return probs
model = train(4)
def predict(model, n, data_name, test_data=False): test_data = read_data('test-A', True)
if not test_data: test_found_words = find_words(test_data)
data, _ = read_data(data_name, test_data) save_data('test-A', test_found_words)
else:
data = read_data(data_name, test_data)
found_words = find_words(data, n - 1, model)
save_data(data_name, found_words)
predict(model, 4, 'dev-0')
predict(model, 4, 'test-A', True)

File diff suppressed because it is too large Load Diff