challenging-america-word-ga.../lab6/kenlm.ipynb
Szymon Parafiński 8c25eb8da8 kenLM #3
2023-04-25 00:27:37 +02:00

2.8 KiB

# Train a 3-gram KenLM model straight from the compressed training set:
#   xzcat      - decompress ../train/in.tsv.xz to stdout
#   cut -f7,8  - keep only tab-separated fields 7 and 8
#   sed (x3)   - replace "-" followed by a backslash-escaped "n" marker with a
#                space, then delete the remaining markers, then delete leftover
#                backslashes
#                NOTE(review): the number of backslashes that actually reaches
#                sed depends on notebook/shell escaping - confirm on real data
#   lmplz -o 3 - estimate an order-3 language model; ARPA output is redirected
#                to kenlm_model.arpa
!xzcat -f1 ../train/in.tsv.xz | cut -f7,8 | sed 's/-\\\\n/ /g' | sed 's/\\\\n//g' | sed 's/\\\\//g' | ../../kenlm/build/bin/lmplz -o 3 > kenlm_model.arpa
# Convert the ARPA text model into KenLM's binary format (kenlm_model.binary).
!../../kenlm/build/bin/build_binary kenlm_model.arpa kenlm_model.binary
import re

# Maps a contraction (or contraction suffix) to its expanded form.
# Matching is case-sensitive: only the exact spellings listed here are expanded.
CONTRACTIONS = {
    "I'm": "I am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "aren't": "are not",
    "don't": "do not",
    "doesn't": "does not",
    "weren't": "were not",
    "'ll": " will",
}

# Compiled once at import time instead of on every formalize_text() call.
# Keys are escaped so they are always matched literally, and tried longest
# first so a key can never be shadowed by a shorter key that is its prefix.
# NOTE: the pattern is a snapshot; entries added to CONTRACTIONS afterwards
# are not picked up.
_CONTRACTION_PATTERN = re.compile(
    r"\b("
    + "|".join(
        re.escape(key) for key in sorted(CONTRACTIONS, key=len, reverse=True)
    )
    + r")\b"
)


def formalize_text(text):
    """Expand known contractions and flatten *text* onto a single line.

    Args:
        text: Input string; may contain real newline characters.

    Returns:
        A copy of ``text`` in which every contraction listed in
        ``CONTRACTIONS`` is replaced by its expansion, a hyphen immediately
        followed by a newline is removed (re-joining a word split across
        lines), and every remaining newline is replaced by a single space.
    """
    text = _CONTRACTION_PATTERN.sub(
        lambda match: CONTRACTIONS[match.group()], text
    )

    # Re-join words hyphenated across a line break before flattening, so
    # "exam-\nple" becomes "example" rather than "exam- ple".
    text = text.replace("-\n", "")
    text = text.replace("\n", " ")

    return text


def clean_string(text):
    """Normalize one raw text field for language-model training.

    The field is first passed through ``formalize_text``. Then two regex
    substitutions are applied: a space and hyphen followed by a run of three
    or more literal backslashes and an ``n`` is deleted, and every remaining
    pair of literal backslashes followed by ``n`` becomes a single space.
    Finally, surrounding whitespace is stripped.

    NOTE(review): the patterns look intended to remove escaped line-break
    markers stored in the TSV - confirm the backslash count against the data.
    """
    formalized = formalize_text(text)
    without_hyphen_breaks = re.sub(r" -\\\\*\\\\n", "", formalized)
    flattened = re.sub(r"\\\\n", " ", without_hyphen_breaks)
    return flattened.strip()


# Build the training corpus: for every row, the expected word is spliced
# between the second-to-last and last columns of in.tsv (both cleaned).
print("Reading train data...")
train_records = []
with open("../train/in.tsv", encoding="utf8", mode="rt") as file, open("../train/expected.tsv", encoding="utf8", mode="rt") as expected:
    # zip() stops at the shorter file, so a length mismatch between the two
    # inputs is silently truncated.
    for t_line, e_line in zip(file, expected):
        columns = t_line.split("\t")
        # Each record ends with a newline so that the last word of one row is
        # not glued to the first word of the next row in the output.
        train_records.append(
            f"{clean_string(columns[-2])} {clean_string(e_line)} {clean_string(columns[-1])}\n"
        )

# Join once at the end; repeated += on a growing string is quadratic.
train_text = "".join(train_records)

# save train_text to file
print("saving to file...")
with open("train_text.txt", encoding="utf8", mode="w") as file:
    file.write(train_text)