upload changes
This commit is contained in:
parent
51220186a3
commit
439cf237d7
21036
dev-0/out.tsv
21036
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
194
run.ipynb
194
run.ipynb
@ -1,13 +1,5 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2a4fb731",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"MODEL TRIGRAMOWY - uwzględniamy dwa poprzednie słowa"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
@ -18,7 +10,8 @@
|
||||
"import lzma\n",
|
||||
"import csv\n",
|
||||
"import re\n",
|
||||
"import math"
|
||||
"import math\n",
|
||||
"from collections import Counter"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -43,27 +36,12 @@
|
||||
" \n",
|
||||
" return data, words\n",
|
||||
" \n",
|
||||
" return data\n",
|
||||
"\n",
|
||||
"train_data, train_words = read_data('train')"
|
||||
" return data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "a4a73c19",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def print_example(data, words, idx):\n",
|
||||
" print(f'{data[idx][0]} _____{words[idx].upper()}_____ {data[idx][1]}')\n",
|
||||
" \n",
|
||||
"# print_example(train_data, train_words, 13)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "ce522af5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -75,17 +53,12 @@
|
||||
" words=[word for word in text.split()]\n",
|
||||
" temp=zip(*[words[i:] for i in range(0,ngram)])\n",
|
||||
" ans=[' '.join(ngram) for ngram in temp]\n",
|
||||
" return ans\n",
|
||||
"\n",
|
||||
"N_grams = []\n",
|
||||
"for i in range(len(train_data[:5000])):\n",
|
||||
" N_grams += generate_N_grams(f'{train_data[i][0]} {train_words[i]} {train_data[i][1]}', 2)\n",
|
||||
" N_grams += generate_N_grams(f'{train_data[i][0]} {train_words[i]} {train_data[i][1]}', 3)"
|
||||
" return ans"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 4,
|
||||
"id": "317ade72",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
@ -93,6 +66,11 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def check_prob(N_grams):\n",
|
||||
" if ' ' not in N_grams[0]:\n",
|
||||
" counter = Counter()\n",
|
||||
" a = Counter(N_grams)\n",
|
||||
" total = sum(a.values())\n",
|
||||
" return {k: v / total for total in (sum(a.values()),) for k, v in a.items()}\n",
|
||||
" count = {}\n",
|
||||
" for i in N_grams:\n",
|
||||
" i = i.rsplit(maxsplit=1)\n",
|
||||
@ -108,64 +86,49 @@
|
||||
" s = sum(count[word].values())\n",
|
||||
" for i in count[word]:\n",
|
||||
" count[word][i] = count[word][i] / s\n",
|
||||
" count[word] = sorted(count[word].items(), key=lambda x: x[1], reverse=True)\n",
|
||||
" \n",
|
||||
" return count\n",
|
||||
"\n",
|
||||
"probs = check_prob(N_grams)"
|
||||
" return count"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "86aeda02",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def find_word(words, model):\n",
|
||||
" n = len(words)\n",
|
||||
" tmp = {}\n",
|
||||
" while n > 1:\n",
|
||||
" if ' '.join(words[-n:]) in model[n]:\n",
|
||||
" tmp = model[n][' '.join(words[-n:])][:2]\n",
|
||||
" break\n",
|
||||
" else:\n",
|
||||
" n -= 1\n",
|
||||
" \n",
|
||||
" res = ' '.join([i[0] + ':' + str(i[1]) for i in tmp])\n",
|
||||
" s = 1 - sum(n for _, n in tmp)\n",
|
||||
" if s == 0:\n",
|
||||
" s = 1\n",
|
||||
" res += ' :' + str(s)\n",
|
||||
" if tmp == {}:\n",
|
||||
" if words[-1] in model[0]:\n",
|
||||
" return f'{words[-1]}:{model[0][words[-1]]} :{1 - model[0][words[-1]]}'\n",
|
||||
" else:\n",
|
||||
" return ':1'\n",
|
||||
" return res"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "3a7ec4ec",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dev_data, dev_words = read_data('dev-0')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "86aeda02",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def find_word(word_1, word_2):\n",
|
||||
" tmp_probs = {}\n",
|
||||
" if word_1 in probs:\n",
|
||||
" if word_2 in probs:\n",
|
||||
" for i in probs[word_1]:\n",
|
||||
" if i in probs[word_2]:\n",
|
||||
" tmp_probs[i] = probs[word_1][i] * probs[word_2][i]\n",
|
||||
" if tmp_probs[i] == 1:\n",
|
||||
" tmp_probs[i] = 0.1\n",
|
||||
" else:\n",
|
||||
" tmp_probs[i] = probs[word_1][i] / 5\n",
|
||||
" else:\n",
|
||||
" tmp_probs = probs[word_1]\n",
|
||||
" else:\n",
|
||||
" tmp_probs = {}\n",
|
||||
" \n",
|
||||
" sorted_list = sorted(tmp_probs.items(), key=lambda x: x[1], reverse=True)[:1]\n",
|
||||
" tmm = ' '.join([i[0] + ':' + str(i[1]) for i in sorted_list])\n",
|
||||
" s = 1 - sum(n for _, n in sorted_list)\n",
|
||||
" if s == 0:\n",
|
||||
" s = 0.01\n",
|
||||
" tmm += ' :' + str(s)\n",
|
||||
" if tmp_probs == {}:\n",
|
||||
" return ':1'\n",
|
||||
" return tmm"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "3b713dc3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def find_words(data):\n",
|
||||
"def find_words(data, n, model):\n",
|
||||
" found_words = []\n",
|
||||
" for i in data:\n",
|
||||
" t = i[0]\n",
|
||||
@ -173,15 +136,13 @@
|
||||
" if True:\n",
|
||||
" t = re.sub(r'[^\\w\\s]', ' ', t)\n",
|
||||
" words=[word for word in t.split()]\n",
|
||||
" found_words.append(find_word(words[-1], ' '.join(words[-2:])))\n",
|
||||
" return found_words\n",
|
||||
"\n",
|
||||
"dev_found_words = find_words(dev_data)"
|
||||
" found_words.append(find_word(words[-n:], model))\n",
|
||||
" return found_words"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 7,
|
||||
"id": "17be7468",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -189,21 +150,74 @@
|
||||
"def save_data(folder, words):\n",
|
||||
" f = open(f'{folder}/out.tsv', 'w')\n",
|
||||
" f.write('\\n'.join(words) + '\\n')\n",
|
||||
" f.close()\n",
|
||||
" f.close()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "8c127bae",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def train(n, data_size = 5000):\n",
|
||||
" train_data, train_words = read_data('train')\n",
|
||||
" N_grams = [[] for i in range(n)]\n",
|
||||
" probs = [[] for i in range(n)]\n",
|
||||
" for i in range(len(train_data[:data_size])):\n",
|
||||
" for j in range(n):\n",
|
||||
" N_grams[j] += generate_N_grams(f'{train_data[i][0]} {train_words[i]} {train_data[i][1]}', j + 1)\n",
|
||||
" for i in range(n):\n",
|
||||
" probs[i] = check_prob(N_grams[i])\n",
|
||||
" return probs\n",
|
||||
" \n",
|
||||
"save_data('dev-0', dev_found_words)"
|
||||
"model = train(4)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "935c0f87",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def predict(model, n, data_name, test_data=False):\n",
|
||||
" if not test_data:\n",
|
||||
" data, _ = read_data(data_name, test_data)\n",
|
||||
" else:\n",
|
||||
" data = read_data(data_name, test_data)\n",
|
||||
" found_words = find_words(data, n - 1, model)\n",
|
||||
" save_data(data_name, found_words)\n",
|
||||
" \n",
|
||||
"predict(model, 4, 'dev-0')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "e43fd5b3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"794.13\r\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!./geval -t dev-0"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "b2e52242",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_data = read_data('test-A', True)\n",
|
||||
"test_found_words = find_words(test_data)\n",
|
||||
"save_data('test-A', test_found_words)"
|
||||
"predict(model, 4, 'test-A', True)"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
104
run.py
104
run.py
@ -1,13 +1,11 @@
|
||||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
# MODEL TRIGRAMOWY - uwzględniamy dwa poprzednie słowa
|
||||
|
||||
|
||||
import lzma
|
||||
import csv
|
||||
import re
|
||||
import math
|
||||
from collections import Counter
|
||||
|
||||
|
||||
def read_data(folder_name, test_data=False):
|
||||
@ -27,14 +25,6 @@ def read_data(folder_name, test_data=False):
|
||||
|
||||
return data
|
||||
|
||||
train_data, train_words = read_data('train')
|
||||
|
||||
|
||||
def print_example(data, words, idx):
    """Print example number `idx` with its word upper-cased between the two context fields.

    Args:
        data: indexable collection of rows; fields 0 and 1 of the chosen row are printed.
        words: indexable collection of strings parallel to `data`.
        idx: position of the example to show.
    """
    before = data[idx][0]
    highlighted = words[idx].upper()
    after = data[idx][1]
    print(f'{before} _____{highlighted}_____ {after}')
|
||||
|
||||
# print_example(train_data, train_words, 13)
|
||||
|
||||
|
||||
def generate_N_grams(text, ngram=1, no_punctuation=True):
|
||||
text = re.sub(r'[\-] ', '', text).lower()
|
||||
@ -45,13 +35,13 @@ def generate_N_grams(text, ngram=1, no_punctuation=True):
|
||||
ans=[' '.join(ngram) for ngram in temp]
|
||||
return ans
|
||||
|
||||
N_grams = []
|
||||
for i in range(len(train_data[:5000])):
|
||||
N_grams += generate_N_grams(f'{train_data[i][0]} {train_words[i]} {train_data[i][1]}', 2)
|
||||
N_grams += generate_N_grams(f'{train_data[i][0]} {train_words[i]} {train_data[i][1]}', 3)
|
||||
|
||||
|
||||
def check_prob(N_grams):
|
||||
if ' ' not in N_grams[0]:
|
||||
counter = Counter()
|
||||
a = Counter(N_grams)
|
||||
total = sum(a.values())
|
||||
return {k: v / total for total in (sum(a.values()),) for k, v in a.items()}
|
||||
count = {}
|
||||
for i in N_grams:
|
||||
i = i.rsplit(maxsplit=1)
|
||||
@ -67,43 +57,35 @@ def check_prob(N_grams):
|
||||
s = sum(count[word].values())
|
||||
for i in count[word]:
|
||||
count[word][i] = count[word][i] / s
|
||||
count[word] = sorted(count[word].items(), key=lambda x: x[1], reverse=True)
|
||||
|
||||
return count
|
||||
|
||||
probs = check_prob(N_grams)
|
||||
|
||||
|
||||
dev_data, dev_words = read_data('dev-0')
|
||||
|
||||
|
||||
def find_word(word_1, word_2):
|
||||
tmp_probs = {}
|
||||
if word_1 in probs:
|
||||
if word_2 in probs:
|
||||
for i in probs[word_1]:
|
||||
if i in probs[word_2]:
|
||||
tmp_probs[i] = probs[word_1][i] * probs[word_2][i]
|
||||
if tmp_probs[i] == 1:
|
||||
tmp_probs[i] = 0.1
|
||||
else:
|
||||
tmp_probs[i] = probs[word_1][i] / 5
|
||||
def find_word(words, model):
|
||||
n = len(words)
|
||||
tmp = {}
|
||||
while n > 1:
|
||||
if ' '.join(words[-n:]) in model[n]:
|
||||
tmp = model[n][' '.join(words[-n:])][:2]
|
||||
break
|
||||
else:
|
||||
tmp_probs = probs[word_1]
|
||||
else:
|
||||
tmp_probs = {}
|
||||
|
||||
sorted_list = sorted(tmp_probs.items(), key=lambda x: x[1], reverse=True)[:1]
|
||||
tmm = ' '.join([i[0] + ':' + str(i[1]) for i in sorted_list])
|
||||
s = 1 - sum(n for _, n in sorted_list)
|
||||
n -= 1
|
||||
|
||||
res = ' '.join([i[0] + ':' + str(i[1]) for i in tmp])
|
||||
s = 1 - sum(n for _, n in tmp)
|
||||
if s == 0:
|
||||
s = 0.01
|
||||
tmm += ' :' + str(s)
|
||||
if tmp_probs == {}:
|
||||
return ':1'
|
||||
return tmm
|
||||
s = 1
|
||||
res += ' :' + str(s)
|
||||
if tmp == {}:
|
||||
if words[-1] in model[0]:
|
||||
return f'{words[-1]}:{model[0][words[-1]]} :{1 - model[0][words[-1]]}'
|
||||
else:
|
||||
return ':1'
|
||||
return res
|
||||
|
||||
|
||||
def find_words(data):
|
||||
def find_words(data, n, model):
|
||||
found_words = []
|
||||
for i in data:
|
||||
t = i[0]
|
||||
@ -111,20 +93,38 @@ def find_words(data):
|
||||
if True:
|
||||
t = re.sub(r'[^\w\s]', ' ', t)
|
||||
words=[word for word in t.split()]
|
||||
found_words.append(find_word(words[-1], ' '.join(words[-2:])))
|
||||
found_words.append(find_word(words[-n:], model))
|
||||
return found_words
|
||||
|
||||
dev_found_words = find_words(dev_data)
|
||||
|
||||
|
||||
def save_data(folder, words):
    """Write the predictions to `<folder>/out.tsv`, one entry per line.

    Args:
        folder: directory that receives `out.tsv`; it must already exist.
        words: iterable of strings, written in order and newline-terminated.
    """
    # The context manager closes the file even if the write raises,
    # which the previous explicit open()/close() pair did not guarantee.
    with open(f'{folder}/out.tsv', 'w') as out_file:
        out_file.write('\n'.join(words) + '\n')
|
||||
|
||||
|
||||
def train(n, data_size = 5000):
    """Build one probability table per n-gram order 1..n from the 'train' set.

    Args:
        n: highest n-gram order to collect.
        data_size: number of leading training rows to use.

    Returns:
        A list of length `n`; element k is the result of `check_prob` for
        the (k + 1)-grams gathered from the selected rows.
    """
    train_data, train_words = read_data('train')
    n_grams = [[] for _ in range(n)]
    for i, row in enumerate(train_data[:data_size]):
        # Rebuild the full passage (field 0, the word, field 1) once per row
        # rather than once per n-gram order.
        text = f'{row[0]} {train_words[i]} {row[1]}'
        for order in range(1, n + 1):
            n_grams[order - 1] += generate_N_grams(text, order)
    return [check_prob(grams) for grams in n_grams]
|
||||
|
||||
save_data('dev-0', dev_found_words)
|
||||
model = train(4)
|
||||
|
||||
|
||||
test_data = read_data('test-A', True)
|
||||
test_found_words = find_words(test_data)
|
||||
save_data('test-A', test_found_words)
|
||||
def predict(model, n, data_name, test_data=False):
    """Run the model over the data set `data_name` and save the predictions there.

    Args:
        model: value forwarded unchanged to `find_words`.
        n: n-gram order; `find_words` is called with `n - 1`.
        data_name: folder name passed to both `read_data` and `save_data`.
        test_data: forwarded to `read_data`; when false, `read_data` is
            expected to return a pair and only its first item is used.
    """
    if test_data:
        data = read_data(data_name, test_data)
    else:
        data, _ignored_words = read_data(data_name, test_data)
    predictions = find_words(data, n - 1, model)
    save_data(data_name, predictions)
|
||||
|
||||
predict(model, 4, 'dev-0')
|
||||
|
||||
predict(model, 4, 'test-A', True)
|
||||
|
14828
test-A/out.tsv
14828
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user