"""Build a KenLM n-gram language model from the TSV training data."""

import csv
import os
import subprocess
from pathlib import Path

import kenlm
import pandas as pd
import regex as re
from english_words import english_words_alpha_set
from nltk import trigrams, word_tokenize

KENLM_BUILD_PATH = Path("/home/bartek/Pulpit/challenging-america-word-gap-prediction/kenlm/build")
KENLM_LMPLZ_PATH = KENLM_BUILD_PATH / "bin" / "lmplz"
KENLM_BUILD_BINARY_PATH = KENLM_BUILD_PATH / "bin" / "build_binary"
SUDO_PASSWORD = ""

# Compiled once; ``\p{P}`` (any Unicode punctuation) needs the third-party
# ``regex`` module, which this file imports under the name ``re``.
_PUNCTUATION = re.compile(r"\p{P}")


def clean(text: object) -> str:
    r"""Normalize *text* for language-model training.

    The value is converted with ``str()`` and lower-cased. The literal
    two-character sequence ``\n`` (a backslash followed by ``n``, not a real
    newline) is handled as a line-break marker: together with a preceding
    hyphen it is removed entirely, otherwise it is replaced by one space.
    Finally every Unicode punctuation character is stripped.
    """
    text = str(text).lower().replace("-\\n", "").replace("\\n", " ")
    return _PUNCTUATION.sub("", text)


def _read_tsv(path, nrows):
    """Read the first *nrows* rows of a headerless, unquoted TSV file."""
    return pd.read_csv(
        path,
        sep="\t",
        # ``error_bad_lines=False`` was removed in pandas 2.0; "warn" keeps the
        # old behaviour of skipping malformed rows while reporting them.
        on_bad_lines="warn",
        header=None,
        quoting=csv.QUOTE_NONE,
        nrows=nrows,
    )


def create_train_data(nrows=10000):
    """Return one training string per row of the training set.

    Columns 6 and 7 of ``train/in.tsv.xz`` are combined with column 0 of
    ``train/expected.tsv`` in the order 6, 0, 7 (presumably left context,
    gap word, right context -- confirm against the dataset description).

    Args:
        nrows: Maximum number of rows read from each file.

    Returns:
        A pandas Series of strings; a row is NaN if any of its parts is NaN.
    """
    features = _read_tsv("train/in.tsv.xz", nrows)
    labels = _read_tsv("train/expected.tsv", nrows)
    # NOTE: rows are paired by position, so a malformed line skipped in only
    # one of the two files shifts the pairing of every row after it.
    frame = pd.concat([features[[6, 7]], labels], axis=1)
    # Join with spaces so the middle column is not fused to its neighbours.
    return frame[6] + " " + frame[0] + " " + frame[7]


def create_train_file(filename="train.txt"):
    """Write the cleaned training strings to *filename*, one per line."""
    # Explicit UTF-8 so the output does not depend on the locale's encoding.
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(clean(line) + "\n" for line in create_train_data())


def _run_with_sudo(command, stdout=None):
    """Run *command* (a list of arguments) under ``sudo -S``.

    ``SUDO_PASSWORD`` is written to sudo's stdin instead of being placed on
    a shell command line.

    Raises:
        subprocess.CalledProcessError: If the command exits with non-zero status.
    """
    subprocess.run(
        ["sudo", "-S", *command],
        input=f"{SUDO_PASSWORD}\n",
        stdout=stdout,
        text=True,
        check=True,
    )


def train_model(train_file="train.txt", arpa_file="model.arpa",
                binary_file="model.binary", order=4):
    """Estimate an ARPA model with ``lmplz`` and convert it to binary form.

    Args:
        train_file: Text corpus to train on.
        arpa_file: Path where the ARPA model is written.
        binary_file: Path where the binary model is written.
        order: N-gram order passed to ``lmplz -o``.

    Raises:
        subprocess.CalledProcessError: If either KenLM tool fails; the binary
            is not built from a failed ARPA run.
    """
    # The corpus is passed with ``--text`` rather than on stdin, because stdin
    # is where ``sudo -S`` expects the password. The ARPA output is captured
    # through a file opened here, so it stays owned by the calling user.
    with open(arpa_file, "wb") as arpa:
        _run_with_sudo(
            [str(KENLM_LMPLZ_PATH), "-o", str(order), "--text", str(train_file)],
            stdout=arpa,
        )
    _run_with_sudo([str(KENLM_BUILD_BINARY_PATH), str(arpa_file), str(binary_file)])


# create_train_file()
# train_model()