kenlm solution

2023-04-26 08:23:58 +02:00 · 2023-04-26 08:17:51 +02:00 · 2023-04-26 08:15:39 +02:00 · 2023-04-26 08:14:12 +02:00 · 2023-04-26 08:10:48 +02:00 · 2023-04-26 08:07:17 +02:00
5 changed files with 18008 additions and 10521 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -6,5 +6,3 @@
 *.o
 .DS_Store
 .token
 .gin
 geval
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/BIN
+++ b/BIN
--- a/kenlm_run.py
+++ b/kenlm_run.py
@@ -0,0 +1,75 @@
 from tqdm import tqdm
 import regex as re
 from nltk.tokenize import word_tokenize
 from english_words import get_english_words_set
 import kenlm
 from math import log10
 import pickle
 # Location of the binary KenLM model file loaded at import time.
 path = 'kenlm_model.binary'
 # Shared language model; the scoring functions in this file call model.score.
 model = kenlm.Model(path)
 # Vocabulary iterated by predict_probs (presumably a Counter of words -- confirm).
 # NOTE: pickle.load can execute arbitrary code; only load a trusted V.pickle.
 with open('V.pickle', 'rb') as handle:
     V_counter = pickle.load(handle)
 def clean_string(text):
     """Return a lowercased, single-line, whitespace-trimmed copy of ``text``.
     Two substitutions are applied in order on the lowercased input:
     first, every run made of a space, a hyphen, one or more backslashes and
     the letter "n" is deleted; second, every remaining backslash followed
     by "n" (the two-character escape, not a real newline) becomes a space.
     Leading and trailing whitespace is stripped from the result.
     """
     # (pattern, replacement) pairs; order matters because the second
     # pattern would otherwise consume the escape the first one looks for.
     substitutions = (
         (r" -\\*\\n", ""),
         (r"\\n", " "),
     )
     cleaned = text.lower()
     for pattern, replacement in substitutions:
         cleaned = re.sub(pattern, replacement, cleaned)
     return cleaned.strip()
 def predict_probs(w1, w2, w4, w5):
     """Score every vocabulary word as the gap filler and format the best five.
     Each word of ``V_counter`` is placed between the left context
     (``w1 w2``) and the right context (``w4 w5``); the resulting
     space-joined string is scored with ``model.score(..., bos=False,
     eos=False)``.
     Args:
         w1, w2: the two tokens preceding the gap.
         w4, w5: the two tokens following the gap.
     Returns:
         A space-separated string of ``word:score`` entries for the five
         highest-scoring words (best first), followed by a final entry with
         an empty word whose value is ``log10(0.99)``.
     """
     from heapq import nlargest
     scored_words = (
         (word, model.score(' '.join([w1, w2, word, w4, w5]), bos=False, eos=False))
         for word in V_counter
     )
     # nlargest keeps a true top five. The earlier hand-rolled list treated
     # its last element as the worst score before the list had ever been
     # sorted, so a strong early candidate could be evicted by mistake.
     best_scores = nlargest(5, scored_words, key=lambda pair: pair[1])
     entries = [f'{word}:{prob}' for word, prob in best_scores]
     # Trailing entry with an empty word, as in the original output format.
     entries.append(f':{log10(0.99)}')
     return ' '.join(entries)
 def get_word_predictions(w1, w2):
     """Yield ``(word, score)`` for each candidate word placed between w1 and w2.
     Candidates are the words returned by
     ``get_english_words_set(['web2'], lower=True)``. For each one the string
     ``"<w1> <word> <w2>"`` is scored with ``model.score``, passing ``False``
     for both positional flags.
     """
     candidates = get_english_words_set(['web2'], lower=True)
     for candidate in candidates:
         yield candidate, model.score(f'{w1} {candidate} {w2}', False, False)
 def argmax(w1,w2):
     """Return the four best-scoring gap words between w1 and w2 as one line.
     The candidates produced by ``get_word_predictions`` are ordered by
     descending score; the first four are rendered as ``word:score`` with
     the score printed to eight decimal places, joined by single spaces.
     """
     ranked = sorted(
         get_word_predictions(w1, w2),
         key=lambda pair: pair[1],
         reverse=True,
     )
     return " ".join(f"{word}:{score:.8f}" for word, score in ranked[:4])
 def run_predictions(source_folder):
     """Write one prediction line per row of ``<source_folder>/in.tsv``.
     For every input row the last two tab-separated fields are cleaned with
     ``clean_string`` and tokenized with ``word_tokenize``. The last two
     tokens of the left field and the first two tokens of the right field
     are passed to ``predict_probs``; its result is written as the matching
     row of ``<source_folder>/out.tsv``.
     Rows that cannot supply two tokens on each side (empty context, a
     single-token context, or fewer than two tab-separated fields) get a
     fixed fallback line instead of raising, so the output keeps exactly
     one line per input row.
     Args:
         source_folder: directory containing ``in.tsv``; ``out.tsv`` is
             created or overwritten in the same directory.
     """
     # NOTE(review): the fallback holds plain probabilities while
     # predict_probs emits model scores -- confirm the evaluator accepts both.
     fallback_line = "the:0.5 a:0.3 :0.2"
     with open(f"{source_folder}/in.tsv", encoding="utf8", mode="rt") as file:
         input_rows = file.readlines()
     with open(f"{source_folder}/out.tsv", "w", encoding="utf-8") as output_file:
         for row in tqdm(input_rows):
             fields = row.split("\t")
             out_line = fallback_line
             if len(fields) >= 2:
                 left_tokens = word_tokenize(clean_string(fields[-2]))
                 right_tokens = word_tokenize(clean_string(fields[-1]))
                 # Unpacking two tokens from a shorter list would raise
                 # ValueError, so short contexts keep the fallback line.
                 if len(left_tokens) >= 2 and len(right_tokens) >= 2:
                     w1, w2 = left_tokens[-2:]
                     w3, w4 = right_tokens[:2]
                     out_line = predict_probs(w1, w2, w3, w4)
             output_file.write(out_line + "\n")
 # Generate out.tsv for the development split first, then for the test split.
 for folder in ("dev-0", "test-A"):
     run_predictions(folder)
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
Author	SHA1	Message	Date
Jakub Eichner	975dd50258	kenlm solution	2023-04-26 08:23:58 +02:00
Jakub Eichner	64b2612ef1	kenlm solution	2023-04-26 08:17:51 +02:00
Jakub Eichner	dfe830ac26	kenlm solution	2023-04-26 08:15:39 +02:00
Jakub Eichner	20d03b9e18	kenlm solution	2023-04-26 08:14:12 +02:00
Jakub Eichner	140f4c0284	kenlm solution	2023-04-26 08:10:48 +02:00
Jakub Eichner	02ee0ff2fa	kenlm solution	2023-04-26 08:07:17 +02:00