470619 n-gram
This commit is contained in:
parent 767978c654
commit 57fd77f584
dev-0/out.tsv (21038 lines changed)
File diff suppressed because it is too large
run.ipynb (new file, 232 lines)
@@ -0,0 +1,232 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "2a4fb731",
   "metadata": {},
   "source": [
    "MODEL TRIGRAMOWY - uwzględniamy dwa poprzednie słowa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c16d72a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import lzma\n",
    "import csv\n",
    "import re\n",
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a1ff03c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_data(folder_name, test_data=False):\n",
    "    \n",
    "    all_data = lzma.open(f'{folder_name}/in.tsv.xz').read().decode('UTF-8').split('\\n')\n",
    "    data = [line.split('\\t') for line in all_data][:-1]\n",
    "    data = [[i[6].replace('\\\\n', ' '), i[7].replace('\\\\n', ' ')] for i in data]\n",
    "    \n",
    "    if not test_data:\n",
    "        words = []\n",
    "        with open(f'{folder_name}/expected.tsv') as file:\n",
    "            tsv_file = csv.reader(file, delimiter=\"\\t\")\n",
    "            for line in tsv_file:\n",
    "                words.append(line[0])\n",
    "        \n",
    "        return data, words\n",
    "    \n",
    "    return data\n",
    "\n",
    "train_data, train_words = read_data('train')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a4a73c19",
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_example(data, words, idx):\n",
    "    print(f'{data[idx][0]} _____{words[idx].upper()}_____ {data[idx][1]}')\n",
    "    \n",
    "# print_example(train_data, train_words, 13)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ce522af5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_N_grams(text, ngram=1, no_punctuation=True):\n",
    "    text = re.sub(r'[\\-] ', '', text).lower()\n",
    "    if no_punctuation:\n",
    "        text = re.sub(r'[\\)\\(\\.\\,\\-]', ' ', text)\n",
    "    words=[word for word in text.split()]\n",
    "    temp=zip(*[words[i:] for i in range(0,ngram)])\n",
    "    ans=[' '.join(ngram) for ngram in temp]\n",
    "    return ans\n",
    "\n",
    "N_grams = []\n",
    "for i in range(len(train_data[:5000])):\n",
    "    N_grams += generate_N_grams(f'{train_data[i][0]} {train_words[i]} {train_data[i][1]}', 2)\n",
    "    N_grams += generate_N_grams(f'{train_data[i][0]} {train_words[i]} {train_data[i][1]}', 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "317ade72",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def check_prob(N_grams):\n",
    "    count = {}\n",
    "    for i in N_grams:\n",
    "        i = i.rsplit(maxsplit=1)\n",
    "        if i[0] in count:\n",
    "            if i[1] in count[i[0]]:\n",
    "                count[i[0]][i[1]] += 1\n",
    "            else:\n",
    "                count[i[0]][i[1]] = 1\n",
    "        else:\n",
    "            count[i[0]] = {i[1]: 1}\n",
    "    \n",
    "    for word in count:\n",
    "        s = sum(count[word].values())\n",
    "        for i in count[word]:\n",
    "            count[word][i] = count[word][i] / s\n",
    "    \n",
    "    return count\n",
    "\n",
    "probs = check_prob(N_grams)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3a7ec4ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "dev_data, dev_words = read_data('dev-0')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "86aeda02",
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_word(word_1, word_2):\n",
    "    tmp_probs = {}\n",
    "    if word_1 in probs:\n",
    "        if word_2 in probs:\n",
    "            for i in probs[word_1]:\n",
    "                if i in probs[word_2]:\n",
    "                    tmp_probs[i] = probs[word_1][i] * probs[word_2][i]\n",
    "                    if tmp_probs[i] == 1:\n",
    "                        tmp_probs[i] = 0.1\n",
    "                else:\n",
    "                    c = probs[word_2][min(probs[word_2].keys(), key=(lambda k: probs[word_2][k]))] / 10\n",
    "                    tmp_probs[i] = probs[word_1][i] * c\n",
    "        else:\n",
    "            tmp_probs = probs[word_1]\n",
    "    else:\n",
    "        tmp_probs = {}\n",
    "    \n",
    "    sorted_list = sorted(tmp_probs.items(), key=lambda x: x[1], reverse=True)[:1]\n",
    "    tmm = ' '.join([i[0] + ':' + str(i[1]) for i in sorted_list])\n",
    "    s = 1 - sum(n for _, n in sorted_list)\n",
    "    if s == 0:\n",
    "        s = 0.01\n",
    "    tmm += ' :' + str(s)\n",
    "    if tmp_probs == {}:\n",
    "        return ':1'\n",
    "    return tmm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "3b713dc3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_words(data):\n",
    "    found_words = []\n",
    "    for i in data:\n",
    "        t = i[0]\n",
    "        t = re.sub(r'[\\-] ', '', t).lower()\n",
    "        if True:\n",
    "            t = re.sub(r'[\\)\\(\\.\\,\\-]', ' ', t)\n",
    "        words=[word for word in t.split()]\n",
    "        found_words.append(find_word(words[-1], ' '.join(words[-2:])))\n",
    "    return found_words\n",
    "\n",
    "dev_found_words = find_words(dev_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "17be7468",
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_data(folder, words):\n",
    "    f = open(f'{folder}/out.tsv', 'w')\n",
    "    f.write('\\n'.join(words) + '\\n')\n",
    "    f.close()\n",
    "    \n",
    "save_data('dev-0', dev_found_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "b2e52242",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_data = read_data('test-A', True)\n",
    "test_found_words = find_words(test_data)\n",
    "save_data('test-A', test_found_words)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
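The markdown cell above describes the approach: a trigram model that conditions on the two preceding words. As a reading aid only (not part of this commit), here is a minimal, self-contained sketch of the counting-and-normalising step that the generate_N_grams and check_prob cells perform, run on an illustrative toy sentence:

    from collections import defaultdict

    # Collect overlapping trigrams, then turn "two-word context -> next word"
    # counts into conditional probabilities (check_prob splits off the last
    # word of each n-gram with rsplit to get the same context/next-word pairs).
    tokens = "to be or not to be that is the question".split()

    counts = defaultdict(lambda: defaultdict(int))
    for w1, w2, w3 in zip(tokens, tokens[1:], tokens[2:]):
        counts[f"{w1} {w2}"][w3] += 1

    probs = {
        ctx: {w: c / sum(nxt.values()) for w, c in nxt.items()}
        for ctx, nxt in counts.items()
    }
    print(probs["to be"])  # {'or': 0.5, 'that': 0.5}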
@@ -3,8 +3,6 @@
 
 # MODEL TRIGRAMOWY - uwzględniamy dwa poprzednie słowa
 
-# In[4]:
-
 
 import lzma
 import csv
@@ -12,36 +10,30 @@ import re
 import math
 
 
-# In[5]:
-
-
-def read_data(folder_name):
+def read_data(folder_name, test_data=False):
     
     all_data = lzma.open(f'{folder_name}/in.tsv.xz').read().decode('UTF-8').split('\n')
     data = [line.split('\t') for line in all_data][:-1]
     data = [[i[6].replace('\\n', ' '), i[7].replace('\\n', ' ')] for i in data]
     
-    words = []
-    with open(f'{folder_name}/expected.tsv') as file:
-        tsv_file = csv.reader(file, delimiter="\t")
-        for line in tsv_file:
-            words.append(line[0])
-    
-    return data, words
+    if not test_data:
+        words = []
+        with open(f'{folder_name}/expected.tsv') as file:
+            tsv_file = csv.reader(file, delimiter="\t")
+            for line in tsv_file:
+                words.append(line[0])
+        
+        return data, words
+    
+    return data
 
 train_data, train_words = read_data('train')
 
 
-# In[10]:
-
-
 def print_example(data, words, idx):
     print(f'{data[idx][0]} _____{words[idx].upper()}_____ {data[idx][1]}')
     
-print_example(train_data, train_words, 13)
-
-
-# In[26]:
+# print_example(train_data, train_words, 13)
 
 
 def generate_N_grams(text, ngram=1, no_punctuation=True):
@@ -54,14 +46,11 @@ def generate_N_grams(text, ngram=1, no_punctuation=True):
     return ans
 
 N_grams = []
-for i in range(len(train_data[:2000])): # POPRAWIĆ !
+for i in range(len(train_data[:5000])):
     N_grams += generate_N_grams(f'{train_data[i][0]} {train_words[i]} {train_data[i][1]}', 2)
     N_grams += generate_N_grams(f'{train_data[i][0]} {train_words[i]} {train_data[i][1]}', 3)
 
 
-# In[27]:
-
-
 def check_prob(N_grams):
     count = {}
     for i in N_grams:
@@ -84,15 +73,9 @@ def check_prob(N_grams):
 probs = check_prob(N_grams)
 
 
-# In[28]:
-
-
 dev_data, dev_words = read_data('dev-0')
 
 
-# In[29]:
-
-
 def find_word(word_1, word_2):
     tmp_probs = {}
     if word_1 in probs:
@@ -121,46 +104,28 @@ def find_word(word_1, word_2):
     return tmm
 
 
-# In[30]:
-
-
-dev_found_words = []
-
-for i in dev_data:
-    t = i[0]
-    t = re.sub(r'[\-] ', '', t).lower()
-    if True:
-        t = re.sub(r'[\)\(\.\,\-]', ' ', t)
-    words=[word for word in t.split()]
-    dev_found_words.append(find_word(words[-1], ' '.join(words[-2:])))
+def find_words(data):
+    found_words = []
+    for i in data:
+        t = i[0]
+        t = re.sub(r'[\-] ', '', t).lower()
+        if True:
+            t = re.sub(r'[\)\(\.\,\-]', ' ', t)
+        words=[word for word in t.split()]
+        found_words.append(find_word(words[-1], ' '.join(words[-2:])))
+    return found_words
 
+dev_found_words = find_words(dev_data)
 
-# In[31]:
-
-
-f = open("dev-0/out.tsv", "w")
-f.write('\n'.join(dev_found_words) + '\n')
-f.close()
-
-
-#
-
-# In[ ]:
-
-
-test_data = read_data('test-A/in.tsv.xz')
-test_data = [[i[6].replace('\\n', ' '), i[7].replace('\\n', ' ')] for i in test_data]
-
-test_found_words = []
-
-for i in test_data:
-    t = i[0]
-    if True:
-        t = re.sub(r'[\.\,\-]', ' ', t).lower()
-    words=[word for word in t.split()]
-    test_found_words.append(find_word(words[-1], ' '.join(words[-2:])))
-
-f = open("test-A/out.tsv", "w")
-f.write('\n'.join(test_found_words) + '\n')
-f.close()
-
+
+def save_data(folder, words):
+    f = open(f'{folder}/out.tsv', 'w')
+    f.write('\n'.join(words) + '\n')
+    f.close()
+    
+save_data('dev-0', dev_found_words)
+
+
+test_data = read_data('test-A', True)
+test_found_words = find_words(test_data)
+save_data('test-A', test_found_words)
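For context on the suppressed dev-0/out.tsv and test-A/out.tsv diffs: each line written by save_data is a prediction in the word:probability format, with a trailing :remainder entry holding the leftover probability mass. A rough, self-contained sketch of that formatting step as find_word builds it (the function and variable names here are illustrative, not taken from the commit):

    def format_prediction(candidates, top_k=1):
        # candidates: next-word -> score, as produced by the n-gram lookup
        best = sorted(candidates.items(), key=lambda kv: kv[1], reverse=True)[:top_k]
        if not best:
            return ':1'  # nothing known: put all probability on the wildcard
        rest = 1 - sum(p for _, p in best)
        if rest == 0:
            rest = 0.01  # keep some mass for unseen words
        return ' '.join(f'{w}:{p}' for w, p in best) + f' :{rest}'

    print(format_prediction({'the': 0.5, 'a': 0.3, 'an': 0.2}))  # the:0.5 :0.5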
test-A/out.tsv (14828 lines changed)
File diff suppressed because it is too large