#!/usr/bin/env python
# coding: utf-8
"""Trigram gap-word model: the two preceding words are taken into account.

The script builds bigram and trigram continuation tables from the ``train``
folder and writes one prediction line per row to ``dev-0/out.tsv`` and
``test-A/out.tsv``.
"""

import lzma
import csv
import re
import math

# Number of training rows used to build the model.
# TODO: the original author marked this cap as something to fix.
TRAIN_LIMIT = 2000

# Prediction line emitted when the model has nothing to say about a context.
FALLBACK_PREDICTION = ':1'

# A hyphen followed by a space is removed so the two halves are glued together.
_HYPHEN_BREAK = re.compile(r'[\-] ')
# Punctuation replaced by a space when ``no_punctuation`` is requested.
_PUNCTUATION = re.compile(r'[\)\(\.\,\-]')

# Context -> {next word: probability}; filled in by main().
probs = {}


def _read_contexts(folder_name):
    """Return ``[left, right]`` text pairs read from ``<folder_name>/in.tsv.xz``.

    Columns 6 and 7 of every tab-separated row are used, with the literal
    two-character sequence ``\\n`` replaced by a space.
    """
    with lzma.open(f'{folder_name}/in.tsv.xz') as archive:
        rows = archive.read().decode('UTF-8').split('\n')
    # Only drop the final element when it is the empty remainder of a trailing
    # newline; otherwise it is a real row and must be kept.
    if rows and rows[-1] == '':
        rows.pop()
    contexts = []
    for row in rows:
        columns = row.split('\t')
        contexts.append([columns[6].replace('\\n', ' '),
                         columns[7].replace('\\n', ' ')])
    return contexts


def read_data(folder_name):
    """Load the contexts and the expected gap words of one data folder.

    Args:
        folder_name: directory holding ``in.tsv.xz`` and ``expected.tsv``.

    Returns:
        A ``(data, words)`` tuple: ``data`` is a list of ``[left, right]``
        pairs and ``words`` holds the first column of each ``expected.tsv`` row.
    """
    data = _read_contexts(folder_name)
    words = []
    with open(f'{folder_name}/expected.tsv', encoding='utf-8', newline='') as file:
        for line in csv.reader(file, delimiter="\t"):
            # A blank line yields an empty row; keep the alignment with `data`.
            words.append(line[0] if line else '')
    return data, words


def print_example(data, words, idx):
    """Print row ``idx`` as left context, upper-cased gap word, right context."""
    print(f'{data[idx][0]} _____{words[idx].upper()}_____ {data[idx][1]}')


def _tokenize(text, no_punctuation=True):
    """Lower-case ``text``, glue ``- `` breaks and split it on whitespace."""
    text = _HYPHEN_BREAK.sub('', text).lower()
    if no_punctuation:
        text = _PUNCTUATION.sub(' ', text)
    return text.split()


def generate_N_grams(text, ngram=1, no_punctuation=True):
    """Return every run of ``ngram`` consecutive tokens of ``text``.

    Args:
        text: raw text to tokenize.
        ngram: number of tokens per n-gram.
        no_punctuation: when true, ``) ( . , -`` are replaced by spaces first.

    Returns:
        A list of space-joined n-grams in order of appearance.
    """
    words = _tokenize(text, no_punctuation)
    # Zipping the token list against its own shifted copies yields the windows.
    windows = zip(*(words[offset:] for offset in range(ngram)))
    return [' '.join(window) for window in windows]


def _collect_ngrams(data, words, limit=TRAIN_LIMIT):
    """Gather bigrams and trigrams from the first ``limit`` training rows."""
    grams = []
    for (left, right), gap in zip(data[:limit], words):
        text = f'{left} {gap} {right}'
        grams.extend(generate_N_grams(text, 2))
        grams.extend(generate_N_grams(text, 3))
    return grams


def check_prob(N_grams):
    """Turn n-grams into per-context next-word relative frequencies.

    Each n-gram is split at its last whitespace into a context and a following
    word. Entries that have no context (a single token) are skipped.

    Returns:
        ``{context: {word: probability}}`` where the probabilities of one
        context sum to 1.
    """
    count = {}
    for gram in N_grams:
        parts = gram.rsplit(maxsplit=1)
        if len(parts) < 2:
            continue
        context, word = parts
        followers = count.setdefault(context, {})
        followers[word] = followers.get(word, 0) + 1
    for followers in count.values():
        total = sum(followers.values())
        for word in followers:
            followers[word] /= total
    return count


def find_word(word_1, word_2, model=None):
    """Build the prediction line for a gap from its preceding context.

    Args:
        word_1: the last word before the gap (bigram context).
        word_2: the last two words before the gap, space-joined (trigram
            context).
        model: optional ``{context: {word: probability}}`` table; the
            module-level ``probs`` is used when omitted.

    Returns:
        ``'<word>:<score> :<rest>'`` for the best-scoring candidate, or
        ``':1'`` when there is no candidate.
    """
    table = probs if model is None else model
    if word_1 not in table:
        return FALLBACK_PREDICTION

    after_one = table[word_1]
    if word_2 in table:
        after_two = table[word_2]
        # Candidates unseen after the trigram context get a tenth of the
        # smallest probability that context has.
        floor = min(after_two.values(), default=0) / 10
        scores = {}
        for word, prob in after_one.items():
            if word in after_two:
                score = prob * after_two[word]
                if score == 1:
                    score = 0.1
            else:
                score = prob * floor
            scores[word] = score
    else:
        scores = after_one

    if not scores:
        return FALLBACK_PREDICTION

    # max() returns the first of equally scored candidates, like a stable
    # descending sort would.
    best_word, best_score = max(scores.items(), key=lambda item: item[1])
    rest = 1 - best_score
    if rest == 0:
        rest = 0.01
    return best_word + ':' + str(best_score) + ' :' + str(rest)


def _predict(left_context):
    """Predict the gap word from the text that precedes it."""
    words = _tokenize(left_context)
    if not words:
        return FALLBACK_PREDICTION
    return find_word(words[-1], ' '.join(words[-2:]))


def _write_predictions(path, contexts):
    """Write one prediction line per ``[left, right]`` pair to ``path``."""
    predictions = [_predict(pair[0]) for pair in contexts]
    with open(path, 'w', encoding='utf-8') as out:
        out.write('\n'.join(predictions) + '\n')


def main():
    """Train on ``train`` and write predictions for ``dev-0`` and ``test-A``."""
    train_data, train_words = read_data('train')
    print_example(train_data, train_words, 13)

    probs.clear()
    probs.update(check_prob(_collect_ngrams(train_data, train_words)))

    _write_predictions('dev-0/out.tsv', _read_contexts('dev-0'))
    _write_predictions('test-A/out.tsv', _read_contexts('test-A'))


if __name__ == '__main__':
    main()