from math import log10  # math, not cmath: the log scores here are real-valued
import csv
import os

import kenlm
import pandas as pd
import regex as re
from english_words import english_words_set
from nltk import word_tokenize

# nltk.download("punkt")

# Shared pandas options for the challenge TSV files.
READ_OPTS = dict(
    sep="\t",
    on_bad_lines="skip",  # replaces the deprecated error_bad_lines/warn_bad_lines
    header=None,
    quoting=csv.QUOTE_NONE,
)

# train set
train_data = pd.read_csv("train/in.tsv.xz", nrows=100_000, **READ_OPTS)

# training labels
train_labels = pd.read_csv("train/expected.tsv", nrows=100_000, **READ_OPTS)

# dev set
dev_data = pd.read_csv("dev-0/in.tsv.xz", **READ_OPTS)

# test set
test_data = pd.read_csv("test-A/in.tsv.xz", **READ_OPTS)


def prepare_text(text):
    """Lowercase, rejoin words hyphenated across line breaks, strip punctuation."""
    # The corpus stores newlines as literal two-character "\n" sequences.
    text = text.lower().replace("-\\n", "").replace("\\n", " ")
    text = re.sub(r"\p{P}", "", text)  # \p{P} (any Unicode punctuation) needs the regex module
    return text


def predict(word1, word2):
    """Score every candidate gap word between word1 and word2; keep the 12 best."""
    predictions = []
    for word in english_words_set:
        sentence = f"{word1} {word} {word2}"
        text_score = model.score(sentence, bos=False, eos=False)
        if len(predictions) < 12:
            predictions.append((word, text_score))
        else:
            # Replace the current worst candidate if this one scores higher.
            worst = min(predictions, key=lambda tup: tup[1])
            if worst[1] < text_score:
                predictions.remove(worst)
                predictions.append((word, text_score))
    probs = sorted(predictions, key=lambda tup: tup[1], reverse=True)
    # Expected output line: "word:logprob ..." plus a trailing ":logprob"
    # entry covering the remaining probability mass.
    pred_str = " ".join(f"{word}:{prob}" for word, prob in probs)
    pred_str += f" :{log10(0.99)}"
    return pred_str


def write_output():
    for data, path in ((dev_data, "dev-0/out.tsv"), (test_data, "test-A/out.tsv")):
        with open(path, "w") as file:
            for _, row in data.iterrows():
                text = prepare_text(str(row[7]))
                words = word_tokenize(text)
                if len(words) < 3:
                    # Context too short to score: fall back to a fixed
                    # distribution over frequent English words.
                    prediction = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
                else:
                    prediction = predict(words[0], words[1])
                file.write(prediction + "\n")


if __name__ == "__main__":
    print("Preparing data...")
    # Column 6 holds the left context and column 7 the right context; the
    # expected gap word (train_labels column 0) is spliced in between, with
    # spaces so it stays a separate token.
    train_data = train_data[[6, 7]]
    train_data = pd.concat([train_data, train_labels], axis=1)
    train_data["final"] = train_data[6] + " " + train_data[0] + " " + train_data[7]

    with open("./train_data.txt", "w") as f:  # "w", not "a", so reruns start clean
        for _, row in train_data.iterrows():
            text = prepare_text(str(row["final"]))
            f.write(text + "\n")

    print("Preparing model...")
    os.system("sh ./kenlm.sh")  # builds kenlm_model.binary from train_data.txt
    model = kenlm.Model("kenlm_model.binary")

    print("Writing outputs...")
    write_output()
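
# ---------------------------------------------------------------------------
# kenlm.sh is an external script that is not included in this snapshot. A
# minimal sketch of what it is assumed to contain, using KenLM's standard
# command-line tools (the n-gram order -o 3 and the intermediate ARPA file
# name are assumptions; train_data.txt and kenlm_model.binary come from the
# code above):
#
#   lmplz -o 3 < train_data.txt > kenlm_model.arpa
#   build_binary kenlm_model.arpa kenlm_model.binary
# ---------------------------------------------------------------------------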
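
# Optional sanity check, left uncalled: a minimal sketch assuming the binary
# model above was built successfully. kenlm.Model.score returns a log10
# probability, which is why predict() emits raw log10 scores next to each word.
def sanity_check_model(path="kenlm_model.binary"):
    m = kenlm.Model(path)
    # A plausible phrase should score higher (less negative) than the same
    # words scrambled.
    print(m.score("the united states", bos=False, eos=False))
    print(m.score("states united the", bos=False, eos=False))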