#!/usr/bin/env python
# coding: utf-8

import lzma
import csv
import re
from collections import Counter


def read_data(folder_name, test_data=False):
    """Load the gap contexts (and the expected words, unless it is test data)."""
    with lzma.open(f'{folder_name}/in.tsv.xz') as f:
        all_data = f.read().decode('UTF-8').split('\n')
    data = [line.split('\t') for line in all_data][:-1]
    # Columns 6 and 7 hold the left and right context of the gap.
    data = [[row[6].replace('\\n', ' '), row[7].replace('\\n', ' ')] for row in data]
    if test_data:
        return data
    words = []
    with open(f'{folder_name}/expected.tsv') as file:
        for line in csv.reader(file, delimiter='\t'):
            words.append(line[0])
    return data, words


def generate_N_grams(text, ngram=1, no_punctuation=True):
    """Normalise the text and split it into overlapping n-grams."""
    text = re.sub(r'- ', '', text).lower()
    if no_punctuation:
        text = re.sub(r'[^\w\s]', ' ', text)
    words = text.split()
    grams = zip(*[words[i:] for i in range(ngram)])
    return [' '.join(gram) for gram in grams]


def check_prob(N_grams):
    """Turn a list of n-grams into (conditional) probabilities."""
    # Unigram case: plain relative frequencies, {word: probability}.
    if ' ' not in N_grams[0]:
        counts = Counter(N_grams)
        total = sum(counts.values())
        return {word: c / total for word, c in counts.items()}
    # Higher orders: {context: [(word, probability), ...]}, sorted by probability.
    count = {}
    for gram in N_grams:
        context, word = gram.rsplit(maxsplit=1)
        count.setdefault(context, Counter())[word] += 1
    for context in count:
        total = sum(count[context].values())
        count[context] = sorted(
            ((word, c / total) for word, c in count[context].items()),
            key=lambda x: x[1], reverse=True)
    return count


def find_word(words, model):
    """Predict the gap word from the longest matching left context.

    Returns a line in the output format 'word1:p1 word2:p2 :rest'.
    """
    n = len(words)
    tmp = []
    # Back off to shorter contexts until one is found in the matching model.
    while n > 1:
        context = ' '.join(words[-n:])
        if context in model[n]:
            tmp = model[n][context][:2]  # two most probable continuations
            break
        n -= 1
    if not tmp:
        # No context matched: fall back to the unigram probability of the last word.
        if words[-1] in model[0]:
            return f'{words[-1]}:{model[0][words[-1]]} :{1 - model[0][words[-1]]}'
        return ':1'
    res = ' '.join(f'{word}:{prob}' for word, prob in tmp)
    # Probability mass left for all other words; avoid writing a zero.
    rest = 1 - sum(prob for _, prob in tmp)
    if rest == 0:
        rest = 1
    return f'{res} :{rest}'


def find_words(data, n, model):
    """Predict a word for every row, using at most n words of left context."""
    found_words = []
    for row in data:
        text = re.sub(r'- ', '', row[0]).lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        words = text.split()
        found_words.append(find_word(words[-n:], model))
    return found_words


def save_data(folder, words):
    with open(f'{folder}/out.tsv', 'w') as f:
        f.write('\n'.join(words) + '\n')


def train(n, data_size=5000):
    """Build unigram up to n-gram models from the first data_size training rows."""
    train_data, train_words = read_data('train')
    N_grams = [[] for _ in range(n)]
    for i in range(len(train_data[:data_size])):
        # Reinsert the expected word between the left and right context.
        text = f'{train_data[i][0]} {train_words[i]} {train_data[i][1]}'
        for j in range(n):
            N_grams[j] += generate_N_grams(text, j + 1)
    return [check_prob(grams) for grams in N_grams]


model = train(4)


def predict(model, n, data_name, test_data=False):
    """Write predictions for data_name/in.tsv.xz to data_name/out.tsv."""
    if test_data:
        data = read_data(data_name, test_data)
    else:
        data, _ = read_data(data_name, test_data)
    found_words = find_words(data, n - 1, model)
    save_data(data_name, found_words)


predict(model, 4, 'dev-0')
predict(model, 4, 'test-A', True)