"""Predict the word that follows a two-word context using a KenLM model.

For every line of ``<folder>/in.tsv`` the script takes the second-to-last
tab-separated column, cleans it, uses its last two words as context, scores
every word of the pickled vocabulary and writes the best candidates to
``<folder>/out.tsv`` in the form ``word:score word:score ... :remainder``.
"""
import heapq
import math
import pickle

import kenlm
import numpy as np  # not used in this file; kept so the import set is unchanged
import regex as re
from english_words import get_english_words_set  # not used in this file; kept
from tqdm import tqdm

path = 'kenlm_model.binary'
model = kenlm.Model(path)

CONTRACTIONS = {
    "I'm": "I am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "aren't": "are not",
    "don't": "do not",
    "doesn't": "does not",
    "weren't": "were not",
    "'ll": " will",
}

# Compiled once at import time instead of on every formalize_text() call.
# Keys are escaped so that a future entry containing a regex metacharacter
# is still matched literally.
_CONTRACTION_PATTERN = re.compile(
    r'\b(' + '|'.join(re.escape(key) for key in CONTRACTIONS) + r')\b'
)

# These two patterns match a *literal* backslash followed by "n" (an escaped
# newline stored in the text), not a real newline character.
_ESCAPED_HYPHEN_BREAK = re.compile(r" -\\*\\n")
_ESCAPED_NEWLINE = re.compile(r"\\n")


def formalize_text(text):
    """Expand known contractions and flatten line breaks.

    Args:
        text: Input string.

    Returns:
        ``text`` with every key of ``CONTRACTIONS`` replaced by its expansion
        (case-sensitive), ``"-\\n"`` sequences removed (re-joining words
        hyphenated across lines) and remaining newlines turned into spaces.
    """
    text = _CONTRACTION_PATTERN.sub(lambda match: CONTRACTIONS[match.group()], text)
    # Remove hyphens at the end of lines and replace newlines with spaces.
    text = text.replace('-\n', '')
    text = text.replace('\n', ' ')
    return text


def clean_string(text):
    """Normalise one text column before it is split into words.

    Applies ``formalize_text``, then removes ``" -"`` followed by an escaped
    newline (backslash + ``n``, optionally preceded by extra backslashes),
    replaces remaining escaped newlines with a space and strips surrounding
    whitespace.

    Args:
        text: Raw column text.

    Returns:
        The cleaned string.
    """
    text = formalize_text(text)
    text = _ESCAPED_HYPHEN_BREAK.sub("", text)
    text = _ESCAPED_NEWLINE.sub(" ", text)
    return text.strip()


def p(text):
    """Squash the KenLM score of ``text`` into the open interval (0, 1).

    The score is obtained with ``bos=False, eos=False`` and passed through
    the logistic function ``1 / (1 + exp(-score))``.

    Args:
        text: Space-separated words to score.

    Returns:
        A float in [0, 1). ``0.0`` is returned when the score is so negative
        that ``exp(-score)`` would overflow (the logistic value is then
        indistinguishable from zero anyway).
    """
    score = model.score(text, bos=False, eos=False)
    try:
        return 1 / (1 + math.exp(-score))
    except OverflowError:
        return 0.0


def perplexity(text):
    """Return the perplexity of ``text`` as reported by the KenLM model."""
    return model.perplexity(text)


def _join_tokens(*tokens):
    """Join the non-empty tokens with single spaces."""
    return ' '.join(token for token in tokens if token)


def predict_probs_w1w2wi(w1, w2, top_k=5):
    """Rank vocabulary words as continuations of the context ``w1 w2``.

    Every word in the module-level ``V_counter`` is scored with a fixed
    interpolation of unigram, bigram and trigram scores::

        0.1 * p(word) + 0.3 * p("w2 word") + 0.6 * p("w1 w2 word")

    Args:
        w1: First context word (may be empty when no such word exists).
        w2: Second context word, directly preceding the predicted position
            (may be empty).
        top_k: Number of best candidates to report. Defaults to 5.

    Returns:
        A string ``"word:score word:score ... :rest"`` listing the ``top_k``
        highest-scoring words in descending score order, followed by
        ``rest = 1 - sum(reported scores)``.
        NOTE(review): the scores are not normalised, so ``rest`` is not
        guaranteed to be non-negative - confirm this suits the consumer.
    """
    scored = (
        (
            word,
            0.1 * p(word)
            + 0.3 * p(_join_tokens(w2, word))
            + 0.6 * p(_join_tokens(w1, w2, word)),
        )
        for word in V_counter
    )
    # nlargest returns the candidates already sorted from best to worst and
    # always evicts the true minimum, unlike comparing against the last
    # element of a list that has not been sorted yet.
    best_scores = heapq.nlargest(top_k, scored, key=lambda pair: pair[1])

    parts = [f'{word}:{score}' for word, score in best_scores]
    parts.append(f':{1 - sum([score for _, score in best_scores])}')
    return ' '.join(parts)


def run_predictions(source_folder):
    """Write next-word predictions for every line of ``in.tsv``.

    Reads ``<source_folder>/in.tsv``, and for each line uses the last two
    words of the cleaned second-to-last tab-separated column as context.
    One prediction line per input line is written to
    ``<source_folder>/out.tsv``.

    Args:
        source_folder: Directory containing ``in.tsv``; ``out.tsv`` is
            created (or overwritten) in the same directory.

    Raises:
        IndexError: If an input line has fewer than two tab-separated
            columns.
    """
    print(f"Run predictions on {source_folder} data...")
    with open(f"{source_folder}/in.tsv", encoding="utf8", mode="rt") as file:
        input_lines = file.readlines()

    with open(f"{source_folder}/out.tsv", "w", encoding="utf-8") as output_file:
        for line in tqdm(input_lines):
            columns = line.split("\t")
            context = clean_string(columns[-2]).split()[-2:]
            # Left-pad short contexts so the two-word unpacking cannot fail;
            # empty tokens are skipped when the n-gram text is built.
            context = [""] * (2 - len(context)) + context
            w1, w2 = context
            out_line = predict_probs_w1w2wi(w1, w2)
            output_file.write(out_line + "\n")


# Candidate vocabulary iterated by predict_probs_w1w2wi.
# NOTE: pickle.load can execute arbitrary code; only load a vocabulary file
# that comes from a trusted source.
with open('V_3000.pickle', 'rb') as handle:
    V_counter = pickle.load(handle)

run_predictions("../dev-0")
# run_predictions("../test-A")