challenging-america-word-ga.../run.py
2022-04-01 15:41:25 +02:00

131 lines
3.4 KiB
Python

#!/usr/bin/env python
# coding: utf-8
# TRIGRAM MODEL - we take the two preceding words into account
import lzma
import csv
import re
import math
def read_data(folder_name, test_data=False):
    """Load the context pairs (and, optionally, the expected words) of a split.

    Reads ``{folder_name}/in.tsv.xz``, a UTF-8, tab-separated, xz-compressed
    file, and keeps the columns at index 6 and 7 of every row. Literal
    two-character ``\\n`` sequences inside those columns are replaced by a
    single space.

    Args:
        folder_name: Directory that contains ``in.tsv.xz`` (and
            ``expected.tsv`` unless ``test_data`` is true).
        test_data: When true, ``expected.tsv`` is not read and only the
            context pairs are returned.

    Returns:
        ``data`` when ``test_data`` is true, otherwise ``(data, words)``.
        ``data`` is a list of ``[column_6, column_7]`` string pairs and
        ``words`` holds the first column of every row of ``expected.tsv``.

    Raises:
        IndexError: If a row of ``in.tsv.xz`` has fewer than eight columns.
    """
    # The context manager guarantees the compressed file handle is closed.
    with lzma.open(f'{folder_name}/in.tsv.xz') as archive:
        raw_text = archive.read().decode('UTF-8')

    lines = raw_text.split('\n')
    # Only drop the final piece when it really is the empty remainder after a
    # trailing newline; otherwise the last record would be silently lost.
    if lines[-1] == '':
        lines.pop()

    rows = [line.split('\t') for line in lines]
    data = [
        [row[6].replace('\\n', ' '), row[7].replace('\\n', ' ')]
        for row in rows
    ]
    if test_data:
        return data

    # Explicit encoding keeps the result independent of the platform locale;
    # newline='' is what the csv module expects from its input file.
    # NOTE(review): csv.reader uses the default quoting rules, so a word that
    # starts with '"' would be parsed as a quoted field - confirm this is the
    # intended handling for expected.tsv.
    with open(f'{folder_name}/expected.tsv', encoding='utf-8', newline='') as file:
        # A blank line yields an empty row; map it to '' so that words stay
        # aligned one-to-one with the rows of `data`.
        words = [line[0] if line else '' for line in csv.reader(file, delimiter='\t')]
    return data, words
# Load the training context pairs and their expected words from the 'train' folder.
train_data, train_words = read_data('train')
def print_example(data, words, idx):
    """Print one example with its word highlighted between the two contexts.

    Args:
        data: Sequence of ``[left, right]`` string pairs.
        words: Sequence of words parallel to ``data``.
        idx: Index of the example to print.

    The word is upper-cased and wrapped in five underscores on each side.
    """
    left = data[idx][0]
    right = data[idx][1]
    print(f'{left} _____{words[idx].upper()}_____ {right}')
# print_example(train_data, train_words, 13)
def generate_N_grams(text, ngram=1, no_punctuation=True):
    """Split ``text`` into lower-cased word n-grams.

    Args:
        text: Input string.
        ngram: Number of consecutive words per n-gram.
        no_punctuation: When true, every character that is neither a word
            character nor whitespace is replaced by a space before splitting.

    Returns:
        A list of space-joined n-grams in order of appearance. The list is
        empty when the text has fewer than ``ngram`` words or ``ngram`` is
        less than 1.
    """
    # Remove every "hyphen + space" pair, gluing the two neighbours together
    # (presumably to rejoin words hyphenated at a line break).
    text = re.sub(r'[\-] ', '', text).lower()
    if no_punctuation:
        text = re.sub(r'[^\w\s]', ' ', text)
    words = text.split()
    # zip over the word list shifted by 0..ngram-1 yields the sliding windows
    # and stops as soon as the shortest (most shifted) list is exhausted.
    shifted = [words[offset:] for offset in range(ngram)]
    return [' '.join(window) for window in zip(*shifted)]
# Collect the bigrams and trigrams of the first 5000 training examples. Each
# example is turned into one string: left context, expected word and right
# context joined by single spaces. For every example the bigrams are appended
# before the trigrams.
N_grams = []
for (left_context, right_context), word in zip(train_data[:5000], train_words):
    sentence = f'{left_context} {word} {right_context}'
    N_grams += generate_N_grams(sentence, 2)
    N_grams += generate_N_grams(sentence, 3)
def check_prob(N_grams):
    """Estimate the probability of the last word of an n-gram given its prefix.

    Every n-gram is split at its last whitespace into a context (all words
    but the last) and a following word. Occurrences are counted per context
    and then normalised so that the values of each context sum to 1.

    Args:
        N_grams: Iterable of whitespace-separated n-gram strings. N-grams of
            different lengths may be mixed; they end up under different
            context keys.

    Returns:
        A dict mapping each context string to a dict
        ``{following_word: relative_frequency}``. Both levels keep the order
        of first appearance.
    """
    count = {}
    for gram in N_grams:
        parts = gram.rsplit(maxsplit=1)
        if len(parts) < 2:
            # A single token has no context to condition on; skip it instead
            # of failing with an IndexError.
            continue
        context, word = parts
        followers = count.setdefault(context, {})
        followers[word] = followers.get(word, 0) + 1

    # Turn the raw counts into relative frequencies, context by context.
    for followers in count.values():
        total = sum(followers.values())
        for word in followers:
            followers[word] = followers[word] / total
    return count
# Build the context -> {next word: probability} table from the collected n-grams.
probs = check_prob(N_grams)
# Load the dev-0 split; dev_words is not used by the code visible in this file.
dev_data, dev_words = read_data('dev-0')
def find_word(word_1, word_2, model=None):
    """Return the prediction line for a gap given its preceding context.

    Args:
        word_1: One-word context (the word directly before the gap).
        word_2: Two-word context (the two words before the gap, joined by a
            space).
        model: Mapping ``context -> {word: probability}``. Defaults to the
            module-level ``probs`` table.

    Returns:
        ``':1'`` when ``word_1`` is not a known context. Otherwise the string
        ``'<word>:<score> :<rest>'``, where ``<word>`` is the best scoring
        candidate and ``<rest>`` is ``1 - score`` (or ``0.01`` when that
        difference is exactly 0).

    Scoring: when ``word_2`` is also a known context, each candidate after
    ``word_1`` gets the product of both probabilities if it also follows
    ``word_2``, and a fifth of its ``word_1`` probability otherwise. When
    ``word_2`` is unknown, the ``word_1`` probabilities are used unchanged.
    """
    if model is None:
        model = probs

    # Unknown one-word context: nothing to build, return the fallback at once.
    if word_1 not in model:
        return ':1'

    bigram_probs = model[word_1]
    if word_2 in model:
        trigram_probs = model[word_2]
        scores = {}
        for candidate, bigram_p in bigram_probs.items():
            if candidate in trigram_probs:
                score = bigram_p * trigram_probs[candidate]
                # A product of exactly 1 is replaced by 0.1 (presumably so
                # that the model never claims full certainty).
                if score == 1:
                    score = 0.1
            else:
                # Candidate seen only after the one-word context: penalise it.
                score = bigram_p / 5
            scores[candidate] = score
    else:
        scores = bigram_probs

    if not scores:
        return ':1'

    # max() returns the first maximal item, the same one a stable descending
    # sort would put first, without sorting every candidate.
    best_word, best_score = max(scores.items(), key=lambda item: item[1])
    rest = 1 - best_score
    if rest == 0:
        # Keep a non-zero share for the unnamed remainder of the vocabulary.
        rest = 0.01
    return f'{best_word}:{best_score} :{rest}'
def find_words(data):
    """Predict the gap word for every example in ``data``.

    Args:
        data: Iterable of rows whose first element is the text preceding the
            gap.

    Returns:
        A list with one prediction string per row, as produced by
        ``find_word``. Rows whose left context contains no words get the
        fallback ``':1'``.
    """
    found_words = []
    for row in data:
        # Unigrams from generate_N_grams are exactly the normalised word list
        # (hyphen-space removal, lower-casing, punctuation stripping), so the
        # lookup keys are built the same way as the training n-grams.
        words = generate_N_grams(row[0], 1)
        if not words:
            # Empty or punctuation-only context: there is no last word to
            # look up, so emit the same fallback find_word uses.
            found_words.append(':1')
            continue
        # NOTE(review): with a one-word context both arguments are the same
        # string, so find_word combines the one-word table with itself.
        found_words.append(find_word(words[-1], ' '.join(words[-2:])))
    return found_words
# Predict a word for every dev-0 example.
dev_found_words = find_words(dev_data)
def save_data(folder, words):
    """Write the predictions to ``{folder}/out.tsv``, one per line.

    Args:
        folder: Existing directory that receives ``out.tsv``.
        words: Sequence of prediction strings.

    The lines are joined with newlines and a final newline is always
    appended, so an empty ``words`` produces a file with one empty line.
    """
    # The context manager closes the file even if the write fails; the
    # explicit encoding keeps non-ASCII words independent of the locale.
    with open(f'{folder}/out.tsv', 'w', encoding='utf-8') as out_file:
        out_file.write('\n'.join(words) + '\n')
# Write the dev-0 predictions to dev-0/out.tsv.
save_data('dev-0', dev_found_words)
# test-A is read with test_data=True, so no expected.tsv is opened for it.
test_data = read_data('test-A', True)
# Predict and store the test-A results in test-A/out.tsv.
test_found_words = find_words(test_data)
save_data('test-A', test_found_words)