2022-03-31 14:53:43 +02:00
|
|
|
#!/usr/bin/env python
|
|
|
|
# coding: utf-8
|
|
|
|
|
|
|
|
import lzma
|
|
|
|
import csv
|
|
|
|
import re
|
|
|
|
import math
|
2022-04-10 23:11:14 +02:00
|
|
|
from collections import Counter
|
2022-03-31 14:53:43 +02:00
|
|
|
|
|
|
|
|
2022-03-31 21:07:24 +02:00
|
|
|
def read_data(folder_name, test_data=False):
    """Load the context columns (and, optionally, the expected words) of a data set.

    Reads ``{folder_name}/in.tsv.xz``, splits every record on tabs and keeps
    columns 6 and 7 with each literal ``\\n`` sequence replaced by a space.
    ``train`` joins these two columns around the expected word, so they are
    the text before and after the gap.

    Args:
        folder_name: Directory holding ``in.tsv.xz`` (and ``expected.tsv``
            when ``test_data`` is false).
        test_data: When true, ``expected.tsv`` is not read.

    Returns:
        ``data`` when ``test_data`` is true, otherwise ``(data, words)``.
        ``data`` is a list of ``[column_6, column_7]`` pairs and ``words`` is
        the first column of every row of ``expected.tsv``.

    Raises:
        IndexError: If a record has fewer than eight tab-separated columns.
    """
    # Read inside a context manager so the archive handle is always closed.
    with lzma.open(f'{folder_name}/in.tsv.xz') as archive:
        lines = archive.read().decode('UTF-8').split('\n')

    # A trailing newline leaves one empty element at the end; drop only that
    # element so a file without a final newline keeps its last record.
    if lines and lines[-1] == '':
        lines.pop()

    data = []
    for line in lines:
        columns = line.split('\t')
        data.append([columns[6].replace('\\n', ' '),
                     columns[7].replace('\\n', ' ')])

    if test_data:
        return data

    # Same encoding as the input file; newline='' is what the csv module
    # expects from the file object it parses.
    with open(f'{folder_name}/expected.tsv', encoding='utf-8', newline='') as file:
        # A blank line yields an empty row; keep it as '' so that words stay
        # aligned with the rows of `data`.
        words = [row[0] if row else ''
                 for row in csv.reader(file, delimiter="\t")]
    return data, words
|
2022-03-31 14:53:43 +02:00
|
|
|
|
|
|
|
|
|
|
|
def generate_N_grams(text, ngram=1, no_punctuation=True):
    """Split ``text`` into lower-cased word n-grams.

    Every hyphen that is directly followed by a space is removed first
    (presumably to rejoin words hyphenated across a line break), then the
    text is lower-cased. With ``no_punctuation`` set, each character that is
    neither a word character nor whitespace is replaced by a space.

    Args:
        text: Raw input string.
        ngram: Number of consecutive words per n-gram.
        no_punctuation: Whether to blank out punctuation before splitting.

    Returns:
        A list of strings, each made of ``ngram`` consecutive words joined by
        single spaces. The list is empty when the text has fewer than
        ``ngram`` words or when ``ngram`` is smaller than 1.
    """
    normalized = re.sub(r'[\-] ', '', text).lower()
    if no_punctuation:
        normalized = re.sub(r'[^\w\s]', ' ', normalized)
    tokens = normalized.split()

    if ngram < 1:
        return []

    # Slide a window of `ngram` tokens over the sequence.
    window_count = len(tokens) - ngram + 1
    return [' '.join(tokens[start:start + ngram])
            for start in range(window_count)]
|
|
|
|
|
|
|
|
|
|
|
|
def check_prob(N_grams):
    """Turn a list of n-grams into relative-frequency tables.

    The first element decides the mode: if it contains no space the list is
    treated as unigrams, otherwise every n-gram is split at its last
    whitespace into a context and a final word.

    Args:
        N_grams: List of n-gram strings, all of the same order.

    Returns:
        For unigrams, a dict mapping each word to its share of all
        occurrences. Otherwise a dict mapping each context to a list of
        ``(word, probability)`` tuples sorted by descending probability,
        where the probability is the word's share among all words seen after
        that context. An empty input yields an empty dict.
    """
    # Nothing to count; also avoids indexing into an empty list below.
    if not N_grams:
        return {}

    if ' ' not in N_grams[0]:
        frequencies = Counter(N_grams)
        total = sum(frequencies.values())
        return {gram: freq / total for gram, freq in frequencies.items()}

    followers = {}
    for gram in N_grams:
        context, word = gram.rsplit(maxsplit=1)
        followers.setdefault(context, Counter())[word] += 1

    table = {}
    for context, counts in followers.items():
        total = sum(counts.values())
        # sorted() is stable, so equally likely words keep first-seen order.
        table[context] = sorted(
            ((word, freq / total) for word, freq in counts.items()),
            key=lambda pair: pair[1],
            reverse=True,
        )
    return table
|
|
|
|
|
|
|
|
|
2022-04-10 23:11:14 +02:00
|
|
|
def find_word(words, model):
    """Format a prediction for the word that follows ``words``.

    ``model[k]`` (for ``k >= 1``) maps a ``k``-word context to a list of
    ``(word, probability)`` tuples ordered by descending probability, and
    ``model[0]`` maps single words to their unigram probability. The longest
    available context is tried first, backing off one word at a time down to
    a single-word context.

    Args:
        words: Context words preceding the gap, oldest first.
        model: List of tables as described above.

    Returns:
        A string ``'w1:p1 w2:p2 :rest'`` built from the two most probable
        continuations, where ``rest`` is the remaining probability mass (or
        ``1`` when nothing remains). If no context matches, the last context
        word is returned with its unigram probability when it is known, and
        ``':1'`` otherwise. An empty context also yields ``':1'``.
    """
    # No context at all: there is no last word to look up either.
    if not words:
        return ':1'

    # Never index past the largest table, and include the one-word context
    # (model[1]) in the backoff instead of stopping at two words.
    n = min(len(words), len(model) - 1)
    candidates = None
    while n > 0:
        context = ' '.join(words[-n:])
        if context in model[n]:
            candidates = model[n][context][:2]
            break
        n -= 1

    if candidates is None:
        last = words[-1]
        if last in model[0]:
            prob = model[0][last]
            return f'{last}:{prob} :{1 - prob}'
        return ':1'

    rest = 1 - sum(prob for _, prob in candidates)
    if rest == 0:
        rest = 1
    pairs = ' '.join(f'{word}:{prob}' for word, prob in candidates)
    return f'{pairs} :{rest}'
|
2022-03-31 14:53:43 +02:00
|
|
|
|
|
|
|
|
2022-04-10 23:11:14 +02:00
|
|
|
def find_words(data, n, model):
    """Predict the gap word for every row of ``data``.

    The first element of each row is normalised the same way as the training
    text: a hyphen followed by a space is removed, the text is lower-cased,
    and every character that is neither a word character nor whitespace is
    replaced by a space. The last ``n`` words are then passed to
    ``find_word``.

    Args:
        data: Iterable of rows whose first element is the text before the gap.
        n: Number of trailing context words to pass on.
        model: Tables forwarded unchanged to ``find_word``.

    Returns:
        A list with one formatted prediction string per row.
    """
    predictions = []
    for row in data:
        context = re.sub(r'[\-] ', '', row[0]).lower()
        context = re.sub(r'[^\w\s]', ' ', context)
        tokens = context.split()
        predictions.append(find_word(tokens[-n:], model))
    return predictions
|
2022-03-31 14:53:43 +02:00
|
|
|
|
|
|
|
|
2022-03-31 21:07:24 +02:00
|
|
|
def save_data(folder, words):
    """Write one prediction per line to ``{folder}/out.tsv``.

    Args:
        folder: Directory in which ``out.tsv`` is created or overwritten.
        words: Iterable of prediction strings.
    """
    # The context manager closes the file even if the write fails. UTF-8
    # matches the encoding the input data is decoded with, so non-ASCII
    # tokens can always be written.
    with open(f'{folder}/out.tsv', 'w', encoding='utf-8') as f:
        f.write('\n'.join(words) + '\n')
|
2022-04-10 23:11:14 +02:00
|
|
|
|
|
|
|
|
|
|
|
def train(n, data_size = 5000):
    """Build n-gram probability tables of order 1 to ``n`` from the ``train`` folder.

    Each of the first ``data_size`` training rows is turned into one sentence
    by placing the expected word between the two context columns; n-grams of
    every order are collected from that sentence.

    Args:
        n: Highest n-gram order to build.
        data_size: Maximum number of training rows to use.

    Returns:
        A list of ``n`` tables where element ``k`` is the result of
        ``check_prob`` for the n-grams of order ``k + 1``.
    """
    contexts, gap_words = read_data('train')
    grams_per_order = [[] for _ in range(n)]

    for idx, row in enumerate(contexts[:data_size]):
        for order in range(1, n + 1):
            sentence = f'{row[0]} {gap_words[idx]} {row[1]}'
            grams_per_order[order - 1].extend(generate_N_grams(sentence, order))

    return [check_prob(grams) for grams in grams_per_order]
|
2022-03-31 21:07:24 +02:00
|
|
|
|
2022-04-10 23:11:14 +02:00
|
|
|
# Runs at import time: train(4) returns four tables, one per n-gram order 1-4.
model = train(4)
|
|
|
|
|
2022-03-31 14:53:43 +02:00
|
|
|
|
2022-04-10 23:11:14 +02:00
|
|
|
def predict(model, n, data_name, test_data=False):
    """Predict the gap word for every row of a data set and save the result.

    Args:
        model: Probability tables forwarded to ``find_words``.
        n: N-gram order; ``n - 1`` trailing context words are used per row.
        data_name: Folder that is read from and that receives ``out.tsv``.
        test_data: Passed on to ``read_data``; when false the expected words
            are loaded as well and then discarded.
    """
    loaded = read_data(data_name, test_data)
    # Without test_data, read_data returns (data, words); only data is needed.
    data = loaded if test_data else loaded[0]
    save_data(data_name, find_words(data, n - 1, model))
|
|
|
|
|
|
|
|
# Write dev-0/out.tsv; test_data defaults to False, so dev-0/expected.tsv is read too.
predict(model, 4, 'dev-0')
|
2022-03-31 14:53:43 +02:00
|
|
|
|
2022-04-10 23:11:14 +02:00
|
|
|
# Write test-A/out.tsv; test_data=True, so only test-A/in.tsv.xz is read.
predict(model, 4, 'test-A', True)
|