2022-03-31 21:07:24 +02:00
|
|
|
{
|
|
|
|
"cells": [
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 1,
|
|
|
|
"id": "c16d72a6",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"import lzma\n",
|
|
|
|
"import csv\n",
|
|
|
|
"import re\n",
|
2022-04-10 23:11:14 +02:00
|
|
|
"import math\n",
|
|
|
|
"from collections import Counter"
|
2022-03-31 21:07:24 +02:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 2,
|
|
|
|
"id": "a1ff03c8",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
def read_data(folder_name, test_data=False):
    """Load the context pairs of a data set and, optionally, its expected words.

    Reads ``{folder_name}/in.tsv.xz`` (UTF-8, tab-separated, one record per
    line) and keeps the fields at index 6 and 7 of every record. In both
    fields the literal two-character sequence "backslash followed by n" is
    replaced by a single space.

    Args:
        folder_name: Directory containing ``in.tsv.xz`` and, when
            ``test_data`` is false, ``expected.tsv``.
        test_data: When true, ``expected.tsv`` is not read and only the
            context pairs are returned.

    Returns:
        ``data`` if ``test_data`` is true, otherwise the tuple
        ``(data, words)``. ``data`` is a list of two-element lists
        ``[field 6, field 7]``; ``words`` holds the first field of every line
        of ``expected.tsv`` (an empty string for a blank line).

    Raises:
        IndexError: If a record of ``in.tsv.xz`` has fewer than eight fields.
    """
    # The context manager releases the archive handle even if decoding fails.
    with lzma.open(f'{folder_name}/in.tsv.xz') as archive:
        lines = archive.read().decode('UTF-8').split('\n')

    # A final newline leaves one empty element behind. Remove only that one so
    # a file without a trailing newline does not lose its last record.
    if lines and lines[-1] == '':
        lines.pop()

    rows = [line.split('\t') for line in lines]
    data = [[row[6].replace('\\n', ' '), row[7].replace('\\n', ' ')] for row in rows]

    if test_data:
        return data

    # newline='' is what the csv module expects from its input file, and
    # QUOTE_NONE keeps quote characters literal so a word such as '"' cannot
    # start a quoted field that swallows the following lines.
    with open(f'{folder_name}/expected.tsv', encoding='UTF-8', newline='') as file:
        tsv_file = csv.reader(file, delimiter="\t", quoting=csv.QUOTE_NONE)
        # csv yields an empty list for a blank line; keep a placeholder so the
        # words stay aligned with the records in ``data``.
        words = [line[0] if line else '' for line in tsv_file]

    return data, words
|
2022-03-31 21:07:24 +02:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 3,
|
|
|
|
"id": "ce522af5",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
def generate_N_grams(text, ngram=1, no_punctuation=True):
    """Turn ``text`` into a list of lower-cased, space-joined word n-grams.

    Every hyphen that is directly followed by a space is removed together with
    that space, then the text is lower-cased. With ``no_punctuation`` set,
    each character that is neither a word character nor whitespace is replaced
    by a space. The result is split on whitespace and every run of ``ngram``
    consecutive words is joined with single spaces.

    Args:
        text: Input string.
        ngram: Number of words per n-gram.
        no_punctuation: Whether punctuation is blanked out before splitting.

    Returns:
        The n-grams in order of appearance; empty when the text has fewer
        than ``ngram`` words or ``ngram`` is below 1.
    """
    cleaned = re.sub(r'[\-] ', '', text)
    cleaned = cleaned.lower()
    if no_punctuation:
        cleaned = re.sub(r'[^\w\s]', ' ', cleaned)

    tokens = cleaned.split()
    # One copy of the token list per position in the window, each shifted by
    # its offset; zipping them walks all windows of ``ngram`` adjacent tokens.
    shifted = [tokens[offset:] for offset in range(ngram)]
    return [' '.join(window) for window in zip(*shifted)]
|
2022-03-31 21:07:24 +02:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2022-04-10 23:11:14 +02:00
|
|
|
"execution_count": 4,
|
2022-03-31 21:07:24 +02:00
|
|
|
"id": "317ade72",
|
|
|
|
"metadata": {
|
|
|
|
"scrolled": true
|
|
|
|
},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
def check_prob(N_grams):
    """Convert a list of n-grams into relative-frequency tables.

    The first element decides how the list is interpreted:

    * if it contains no space, the list is treated as unigrams and the result
      maps each word to its share of all occurrences;
    * otherwise every element is split at its last whitespace into
      ``(context, word)`` and the result maps each context to a list of
      ``(word, probability)`` pairs sorted by descending probability, ties
      keeping first-seen order.

    Args:
        N_grams: List of space-joined n-grams of one common order.

    Returns:
        A dict as described above; an empty dict for an empty input list.

    Raises:
        IndexError: If the list is treated as multi-word n-grams but an
            element contains no whitespace.
    """
    # Nothing was observed, so there is nothing to estimate (and no first
    # element to inspect).
    if not N_grams:
        return {}

    if ' ' not in N_grams[0]:
        frequencies = Counter(N_grams)
        total = sum(frequencies.values())
        return {word: occurrences / total for word, occurrences in frequencies.items()}

    followers_by_context = {}
    for n_gram in N_grams:
        parts = n_gram.rsplit(maxsplit=1)
        context, word = parts[0], parts[1]
        followers_by_context.setdefault(context, Counter())[word] += 1

    count = {}
    for context, followers in followers_by_context.items():
        total = sum(followers.values())
        # sorted() is stable even with reverse=True, so equally likely words
        # stay in the order in which they were first seen.
        count[context] = sorted(
            ((word, occurrences / total) for word, occurrences in followers.items()),
            key=lambda pair: pair[1],
            reverse=True,
        )
    return count
|
2022-03-31 21:07:24 +02:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2022-04-10 23:11:14 +02:00
|
|
|
"execution_count": 5,
|
2022-03-31 21:07:24 +02:00
|
|
|
"id": "86aeda02",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
2022-04-10 23:11:14 +02:00
|
|
def find_word(words, model):
    """Build one prediction line for the word that follows ``words``.

    ``model[0]`` is used as a mapping from a word to a probability. For
    ``k >= 1``, ``model[k]`` is used as a mapping from a context of ``k``
    space-joined words to a list of ``(word, probability)`` pairs, of which
    the first two are taken.

    The longest available context is tried first and shortened one word at a
    time, down to a single word, until a table contains it.

    Args:
        words: Context words preceding the gap, oldest first.
        model: List of tables laid out as described above.

    Returns:
        ``'w1:p1 w2:p2 :rest'`` built from the first two entries of the
        matching context, where ``rest`` is ``1 - (p1 + p2)`` or ``1`` when
        nothing is left over. If no context matches, the result is
        ``'last:p :1-p'`` when the last context word is in ``model[0]``, and
        ``':1'`` otherwise (also for an empty ``words``).
    """
    # Without any context there is nothing to look up.
    if not words:
        return ':1'

    candidates = []
    # model[k] is keyed by k-word contexts, so never ask for an order that was
    # not trained. The loop runs down to k == 1 so the single-word (bigram)
    # table is consulted before giving up.
    n = min(len(words), len(model) - 1)
    while n > 0:
        context = ' '.join(words[-n:])
        if context in model[n]:
            candidates = model[n][context][:2]
            break
        n -= 1

    if not candidates:
        last_word = words[-1]
        # NOTE(review): this proposes the last context word itself with its
        # own unigram probability - kept as in the original; confirm intent.
        if last_word in model[0]:
            return f'{last_word}:{model[0][last_word]} :{1 - model[0][last_word]}'
        return ':1'

    res = ' '.join(word + ':' + str(prob) for word, prob in candidates)
    rest = 1 - sum(prob for _, prob in candidates)
    # Rounding can leave a remainder that is a hair above or below zero;
    # treat all of those like the exact-zero case instead of emitting a
    # vanishing or negative probability.
    if rest <= 1e-12:
        rest = 1
    return res + ' :' + str(rest)
|
2022-03-31 21:07:24 +02:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2022-04-10 23:11:14 +02:00
|
|
|
"execution_count": 6,
|
2022-03-31 21:07:24 +02:00
|
|
|
"id": "3b713dc3",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
2022-04-10 23:11:14 +02:00
|
|
def find_words(data, n, model):
    """Produce one prediction line per record of ``data``.

    Only the first element of each record is used. It is tokenised with
    ``generate_N_grams(..., 1)``, which applies the same cleaning that is used
    when the n-grams are generated, and its last ``n`` words are passed to
    ``find_word`` as context.

    Args:
        data: Sequence of records whose first element is the text before the
            gap.
        n: Maximum number of context words handed to ``find_word``.
        model: Passed through to ``find_word`` unchanged.

    Returns:
        List with the string returned by ``find_word`` for every record, in
        input order.
    """
    found_words = []
    for record in data:
        # With ngram=1 the helper returns the cleaned, lower-cased word list.
        words = generate_N_grams(record[0], 1)
        # words[-0:] would be the whole list, so a zero-length context has to
        # be spelled out explicitly.
        context = words[-n:] if n > 0 else []
        found_words.append(find_word(context, model))
    return found_words
|
2022-03-31 21:07:24 +02:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2022-04-10 23:11:14 +02:00
|
|
|
"execution_count": 7,
|
2022-03-31 21:07:24 +02:00
|
|
|
"id": "17be7468",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
def save_data(folder, words):
    """Write the prediction lines to ``{folder}/out.tsv``.

    The entries of ``words`` are joined with newlines and followed by one
    final newline; an existing file is overwritten.

    Args:
        folder: Directory in which ``out.tsv`` is created.
        words: Iterable of strings, one per output line.
    """
    # UTF-8 is fixed explicitly so the output does not depend on the locale,
    # and the context manager closes the file even if the write fails.
    with open(f'{folder}/out.tsv', 'w', encoding='UTF-8') as f:
        f.write('\n'.join(words) + '\n')
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 8,
|
|
|
|
"id": "8c127bae",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
def train(n, data_size=5000, folder='train'):
    """Estimate n-gram tables of order 1 to ``n`` from a labelled data set.

    Each of the first ``data_size`` records is rebuilt as
    ``"<left context> <expected word> <right context>"`` and split into
    n-grams of every order from 1 to ``n``. The n-grams of each order are then
    converted into probabilities with ``check_prob``.

    Args:
        n: Highest n-gram order to build.
        data_size: Number of leading records to use; ``None`` uses all of
            them.
        folder: Directory passed to ``read_data``; defaults to ``'train'``.

    Returns:
        List of ``n`` tables where index ``k - 1`` holds the ``check_prob``
        result for the k-grams.
    """
    train_data, train_words = read_data(folder)
    N_grams = [[] for _ in range(n)]

    for index, record in enumerate(train_data[:data_size]):
        # Put the expected word back into the gap so that the n-grams span
        # both sides of it.
        text = f'{record[0]} {train_words[index]} {record[1]}'
        for order in range(1, n + 1):
            N_grams[order - 1] += generate_N_grams(text, order)

    return [check_prob(order_grams) for order_grams in N_grams]


model = train(4)
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 9,
|
|
|
|
"id": "935c0f87",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
def predict(model, n, data_name, test_data=False):
    """Predict the gap word for every record of a data set and save the result.

    The records of ``data_name`` are loaded with ``read_data``, a prediction
    line is built for each with ``find_words`` using up to ``n - 1`` context
    words, and the lines are written with ``save_data`` into the same
    directory.

    Args:
        model: Tables passed on to ``find_words``.
        n: N-gram order; ``n - 1`` context words are used.
        data_name: Directory that holds the input and receives the output.
        test_data: Accepted for backward compatibility. The expected words
            are never needed for prediction, so ``expected.tsv`` is not read
            regardless of this flag.
    """
    # Requesting the unlabelled form gives a single return shape and avoids
    # parsing expected.tsv only to throw its contents away.
    data = read_data(data_name, test_data=True)
    found_words = find_words(data, n - 1, model)
    save_data(data_name, found_words)


predict(model, 4, 'dev-0')
|
2022-03-31 21:07:24 +02:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 10,
|
2022-04-10 23:11:14 +02:00
|
|
|
"id": "e43fd5b3",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
|
|
|
"794.13\r\n"
|
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"!./geval -t dev-0"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 11,
|
2022-03-31 21:07:24 +02:00
|
|
|
"id": "b2e52242",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
2022-04-10 23:11:14 +02:00
|
|
# Write the predictions for the unlabelled test-A set.
predict(model, 4, 'test-A', test_data=True)
|
2022-03-31 21:07:24 +02:00
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"metadata": {
|
|
|
|
"kernelspec": {
|
|
|
|
"display_name": "Python 3 (ipykernel)",
|
|
|
|
"language": "python",
|
|
|
|
"name": "python3"
|
|
|
|
},
|
|
|
|
"language_info": {
|
|
|
|
"codemirror_mode": {
|
|
|
|
"name": "ipython",
|
|
|
|
"version": 3
|
|
|
|
},
|
|
|
|
"file_extension": ".py",
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
"name": "python",
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
"version": "3.9.5"
|
|
|
|
}
|
|
|
|
},
|
|
|
|
"nbformat": 4,
|
|
|
|
"nbformat_minor": 5
|
|
|
|
}
|