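# Next-word prediction with a KenLM n-gram model: for each input line, the last two
# words of the left context are scored together with every word in the vocabulary
# loaded from V_3000.pickle, and the top-5 candidates are written out in the
# "word:prob word:prob ... :rest" format.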
from tqdm import tqdm
import regex as re
from english_words import get_english_words_set
import kenlm
import pickle
import math
import numpy as np

# Pretrained KenLM binary model used to score candidate continuations.
path = 'kenlm_model.binary'
model = kenlm.Model(path)
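
# Contractions are expanded to their full forms before any scoring is done.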
CONTRACTIONS = {
    "I'm": "I am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "aren't": "are not",
    "don't": "do not",
    "doesn't": "does not",
    "weren't": "were not",
    "'ll": " will",
}


def formalize_text(text):
    # Replace contractions using regular expressions
    pattern = re.compile(r'\b(' + '|'.join(CONTRACTIONS.keys()) + r')\b')
    text = pattern.sub(lambda x: CONTRACTIONS[x.group()], text)

    # Remove hyphens at the end of lines and replace newlines with spaces
    text = text.replace('-\n', '')
    text = text.replace('\n', ' ')

    return text


def clean_string(text):
    text = formalize_text(text)
    # Drop " -" hyphenation left before literal backslash-n escape sequences,
    # then turn the remaining literal "\n" escapes into spaces.
    text = re.sub(r" -\\*\\n", "", text)
    text = re.sub(r"\\n", " ", text)
    text = text.strip()
    return text


def p(text):
    # Squash the KenLM log10 score through a sigmoid to get a pseudo-probability
    # in (0, 1); since the log score is negative, values stay below 0.5.
    return 1 / (1 + math.exp(-(model.score(text, bos=False, eos=False))))


def perplexity(text):
    return model.perplexity(text)


def predict_probs_w1w2wi(w1, w2):
    """Score every vocabulary word as a continuation of (w1, w2) and keep the top 5."""
    best_scores = []
    pred_str = ""
    for word in V_counter:
        w1w2 = ' '.join([w2, word])
        w1w2w3 = ' '.join([w1, w2, word])

        # Interpolate unigram, bigram and trigram pseudo-probabilities.
        text_score = 0.1 * p(word) + 0.3 * p(w1w2) + 0.6 * p(w1w2w3)

        if len(best_scores) < 5:
            best_scores.append((word, text_score))
            # Keep the list sorted best-first so the last entry is always the current worst.
            best_scores.sort(key=lambda tup: tup[1], reverse=True)
        else:
            worst_score = best_scores[-1]
            if worst_score[1] < text_score:
                best_scores[-1] = (word, text_score)
                best_scores.sort(key=lambda tup: tup[1], reverse=True)

    # Emit "word:prob" pairs, assigning the remaining mass to the empty token.
    for word, prob in best_scores:
        pred_str += f'{word}:{prob} '
    pred_str += f':{1 - sum(score for _, score in best_scores)}'
    return pred_str


def run_predictions(source_folder):
    print(f"Running predictions on {source_folder} data...")

    with open(f"{source_folder}/in.tsv", encoding="utf8", mode="rt") as file:
        input_data = file.readlines()

    with open(f"{source_folder}/out.tsv", "w", encoding="utf-8") as output_file:
        for line in tqdm(input_data):
            line = line.split("\t")

            # Use the last two words of the left-context column as the bigram history.
            w1, w2 = clean_string(line[-2]).split()[-2:]
            out_line = predict_probs_w1w2wi(w1, w2)

            output_file.write(out_line + "\n")


# The vocabulary must be loaded before run_predictions() is called, since
# predict_probs_w1w2wi() iterates over the module-level V_counter.
with open('V_3000.pickle', 'rb') as handle:
    V_counter = pickle.load(handle)

run_predictions("../dev-0")
# run_predictions("../test-A")