diff --git a/geval b/geval
new file mode 100755
index 0000000..07a3949
Binary files /dev/null and b/geval differ
diff --git a/script.py b/script.py
new file mode 100644
index 0000000..e797af2
--- /dev/null
+++ b/script.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# TRIGRAM MODEL - the prediction is conditioned on the two preceding words
+
+# In[4]:
+
+
+import lzma
+import csv
+import re
+import math
+
+
+# In[5]:
+
+
+def read_data(folder_name):
+    # Read the xz-compressed input and keep only the left and right context columns.
+    all_data = lzma.open(f'{folder_name}/in.tsv.xz').read().decode('UTF-8').split('\n')
+    data = [line.split('\t') for line in all_data][:-1]
+    data = [[i[6].replace('\\n', ' '), i[7].replace('\\n', ' ')] for i in data]
+
+    # The word to be predicted (the gap) comes from expected.tsv.
+    words = []
+    with open(f'{folder_name}/expected.tsv') as file:
+        tsv_file = csv.reader(file, delimiter="\t")
+        for line in tsv_file:
+            words.append(line[0])
+
+    return data, words
+
+train_data, train_words = read_data('train')
+
+
+# In[10]:
+
+
+def print_example(data, words, idx):
+    print(f'{data[idx][0]} _____{words[idx].upper()}_____ {data[idx][1]}')
+
+print_example(train_data, train_words, 13)
+
+
+# In[26]:
+
+
+def generate_N_grams(text, ngram=1, no_punctuation=True):
+    # Re-join words broken across a hyphen and lowercase the text.
+    text = re.sub(r'[\-] ', '', text).lower()
+    if no_punctuation:
+        text = re.sub(r'[\)\(\.\,\-]', ' ', text)
+    words = text.split()
+    temp = zip(*[words[i:] for i in range(0, ngram)])
+    return [' '.join(gram) for gram in temp]
+
+N_grams = []
+for i in range(len(train_data[:2000])):  # TODO: fix - train on the full data set
+    text = f'{train_data[i][0]} {train_words[i]} {train_data[i][1]}'
+    N_grams += generate_N_grams(text, 2)
+    N_grams += generate_N_grams(text, 3)
+
+
+# In[27]:
+
+
+def check_prob(N_grams):
+    # count[context][word] = how often `word` follows `context`
+    count = {}
+    for i in N_grams:
+        context, word = i.rsplit(maxsplit=1)
+        if context in count:
+            count[context][word] = count[context].get(word, 0) + 1
+        else:
+            count[context] = {word: 1}
+
+    # Normalise the counts into conditional probabilities.
+    for context in count:
+        s = sum(count[context].values())
+        for word in count[context]:
+            count[context][word] = count[context][word] / s
+
+    return count
+
+probs = check_prob(N_grams)
+
+
+# In[28]:
+
+
+dev_data, dev_words = read_data('dev-0')
+
+
+# In[29]:
+
+
+def find_word(word_1, word_2):
+    # word_1 is the bigram context (last word), word_2 the trigram context (last two words).
+    tmp_probs = {}
+    if word_1 in probs:
+        if word_2 in probs:
+            for i in probs[word_1]:
+                if i in probs[word_2]:
+                    tmp_probs[i] = probs[word_1][i] * probs[word_2][i]
+                    if tmp_probs[i] == 1:
+                        tmp_probs[i] = 0.1
+                else:
+                    # Back off: scale the bigram probability by a tenth of the
+                    # smallest trigram probability seen for this context.
+                    c = probs[word_2][min(probs[word_2].keys(), key=(lambda k: probs[word_2][k]))] / 10
+                    tmp_probs[i] = probs[word_1][i] * c
+        else:
+            tmp_probs = probs[word_1]
+
+    if not tmp_probs:
+        return ':1'
+
+    # Keep the single best candidate and assign the remaining mass to the wildcard.
+    sorted_list = sorted(tmp_probs.items(), key=lambda x: x[1], reverse=True)[:1]
+    tmm = ' '.join([i[0] + ':' + str(i[1]) for i in sorted_list])
+    s = 1 - sum(n for _, n in sorted_list)
+    if s == 0:
+        s = 0.01
+    tmm += ' :' + str(s)
+    return tmm
+
+
+# In[30]:
+
+
+dev_found_words = []
+
+for i in dev_data:
+    # Apply the same preprocessing as generate_N_grams, then predict from the left context.
+    t = re.sub(r'[\-] ', '', i[0]).lower()
+    t = re.sub(r'[\)\(\.\,\-]', ' ', t)
+    words = t.split()
+    dev_found_words.append(find_word(words[-1], ' '.join(words[-2:])))
+
+
+# In[31]:
+
+
+with open("dev-0/out.tsv", "w") as f:
+    f.write('\n'.join(dev_found_words) + '\n')
+
+
+# 
+# In[ ]:
+
+
+# test-A has no expected.tsv, so read the compressed input directly.
+all_data = lzma.open('test-A/in.tsv.xz').read().decode('UTF-8').split('\n')
+test_data = [line.split('\t') for line in all_data][:-1]
+test_data = [[i[6].replace('\\n', ' '), i[7].replace('\\n', ' ')] for i in test_data]
+
+test_found_words = []
+
+for i in test_data:
+    t = re.sub(r'[\-] ', '', i[0]).lower()
+    t = re.sub(r'[\)\(\.\,\-]', ' ', t)
+    words = t.split()
+    test_found_words.append(find_word(words[-1], ' '.join(words[-2:])))
+
+with open("test-A/out.tsv", "w") as f:
+    f.write('\n'.join(test_found_words) + '\n')