diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..51950c7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+kenlm
\ No newline at end of file
diff --git a/run2.py b/run2.py
new file mode 100644
index 0000000..43dcf05
--- /dev/null
+++ b/run2.py
@@ -0,0 +1,64 @@
+import pandas as pd
+import csv
+import regex as re
+import kenlm
+from english_words import english_words_alpha_set
+from nltk import trigrams, word_tokenize
+
+from pathlib import Path
+import os
+
+
+KENLM_BUILD_PATH = Path("/home/bartek/Pulpit/challenging-america-word-gap-prediction/kenlm/build")
+KENLM_LMPLZ_PATH = KENLM_BUILD_PATH / "bin" / "lmplz"
+KENLM_BUILD_BINARY_PATH = KENLM_BUILD_PATH / "bin" / "build_binary"
+
+
+def clean(text):
+    """Lowercase, re-join words hyphenated across line breaks, turn escaped newlines into spaces, and drop punctuation."""
+    text = str(text).lower().replace("-\\n", "").replace("\\n", " ")
+    return re.sub(r"\p{P}", "", text)
+
+
+def create_train_data():
+    """Rebuild full sentences: left context (col 6) + expected word + right context (col 7)."""
+    data = pd.read_csv(
+        "train/in.tsv.xz",
+        sep="\t",
+        error_bad_lines=False,
+        header=None,
+        quoting=csv.QUOTE_NONE,
+        nrows=10000
+    )
+    train_labels = pd.read_csv(
+        "train/expected.tsv",
+        sep="\t",
+        error_bad_lines=False,
+        header=None,
+        quoting=csv.QUOTE_NONE,
+        nrows=10000
+    )
+
+    train_data = data[[6, 7]]
+    train_data = pd.concat([train_data, train_labels], axis=1)
+
+    return train_data[6] + " " + train_data[0] + " " + train_data[7]
+
+
+def create_train_file(filename="train.txt"):
+    """Write one cleaned training sentence per line."""
+    with open(filename, "w") as f:
+        for line in create_train_data():
+            f.write(clean(line) + "\n")
+
+
+def train_model():
+    """Train a 4-gram KenLM model on train.txt and compile it to a binary file."""
+    lmplz_command = f"{KENLM_LMPLZ_PATH} -o 4 < train.txt > model.arpa"
+    build_binary_command = f"{KENLM_BUILD_BINARY_PATH} model.arpa model.binary"
+    os.system(lmplz_command)
+    os.system(build_binary_command)
+
+
+# create_train_file()
+# train_model()
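
The patch imports kenlm and english_words_alpha_set but does not use them yet. Below is a minimal sketch of how the compiled model.binary could be queried to rank candidate words for the gap; it is purely illustrative, and best_gap_word as well as the example contexts are assumed names, not code from this commit.

import kenlm
from english_words import english_words_alpha_set

model = kenlm.Model("model.binary")  # binary produced by train_model() above

def best_gap_word(left_context, right_context, candidates=english_words_alpha_set):
    """Return the candidate whose sentence 'left word right' gets the highest log-probability."""
    def sentence_score(word):
        return model.score(f"{left_context} {word} {right_context}", bos=True, eos=True)
    # Scoring every candidate is slow for large word lists; fine for a quick sanity check.
    return max(candidates, key=sentence_score)

# Example: predict the missing word between two context fragments.
print(best_gap_word("he sold the house for ten thousand", "and moved to the city"))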