kenlm solution

Jakub Eichner 2023-04-26 08:07:17 +02:00
parent b30ed83944
commit 02ee0ff2fa
6 changed files with 20079 additions and 10532 deletions

File diff suppressed because it is too large

1988 kenlm.ipynb Normal file

File diff suppressed because it is too large

79 kenlm_2words.py Normal file

@@ -0,0 +1,79 @@
from tqdm import tqdm
import regex as re
from nltk.tokenize import word_tokenize
from english_words import get_english_words_set
import kenlm
from math import log10
import pickle

# Pre-built KenLM model and pickled vocabulary counter.
path = 'kenlm_model.binary'
model = kenlm.Model(path)
with open('V.pickle', 'rb') as handle:
    V_counter = pickle.load(handle)


def clean_string(text):
    # Lowercase, drop dangling hyphens before the literal "\n" sequences
    # in the TSV fields, then replace those sequences with spaces.
    text = text.lower()
    text = re.sub(r" -\\*\\n", "", text)
    text = re.sub(r"\\n", " ", text)
    text = text.strip()
    return text


def predict_probs(w1, w3):
    # Score every vocabulary word as the gap filler between w1 and w3,
    # keeping the five highest-scoring candidates sorted best-first.
    best_scores = []
    pred_str = ""
    # for word in get_english_words_set(['web2'], lower=True):
    for word in V_counter:
        text = ' '.join([w1, word, w3])
        text_score = model.score(text, bos=False, eos=False)
        if len(best_scores) < 5:
            best_scores.append((word, text_score))
            best_scores.sort(key=lambda tup: tup[1], reverse=True)
        elif best_scores[-1][1] < text_score:
            # Replace the current worst candidate and re-sort.
            best_scores[-1] = (word, text_score)
            best_scores.sort(key=lambda tup: tup[1], reverse=True)
    for word, prob in best_scores:
        pred_str += f'{word}:{prob} '
    pred_str += f':{log10(0.99)}'
    return pred_str


def get_word_predictions(w1, w2):
    # Score every dictionary word as the gap filler between w1 and w2.
    for word in get_english_words_set(['web2'], lower=True):
        sentence = w1 + ' ' + word + ' ' + w2
        text_score = model.score(sentence, bos=False, eos=False)
        yield word, text_score


def argmax(w1, w2):
    # Keep the top 4 predictions, formatted as "word:score" pairs.
    top_predictions = sorted(get_word_predictions(w1, w2), key=lambda x: -x[1])[:4]
    output_line = " ".join("{}:{:.8f}".format(w, p) for w, p in top_predictions)
    return output_line


def run_predictions(source_folder):
    print(f"Run predictions on {source_folder} data...")
    with open(f"{source_folder}/in.tsv", encoding="utf8", mode="rt") as file:
        train_data = file.readlines()
    with open(f"{source_folder}/out.tsv", "w", encoding="utf-8") as output_file:
        for line in tqdm(train_data):
            line = line.split("\t")
            l1 = clean_string(line[-2])
            l2 = clean_string(line[-1])
            if not l1 or not l2:
                # No usable context: fall back to a fixed distribution.
                out_line = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
            else:
                w1 = word_tokenize(l1)[-1]
                w2 = word_tokenize(l2)[0]
                out_line = predict_probs(w1, w2)
            output_file.write(out_line + "\n")
run_predictions("dev-0")
run_predictions("test-A")

79 kenlm_4words.py Normal file

@@ -0,0 +1,79 @@
from tqdm import tqdm
import regex as re
from nltk.tokenize import word_tokenize
from english_words import get_english_words_set
import kenlm
from math import log10
import pickle

# Pre-built KenLM model and pickled vocabulary counter.
path = 'kenlm_model.binary'
model = kenlm.Model(path)
with open('V.pickle', 'rb') as handle:
    V_counter = pickle.load(handle)


def clean_string(text):
    # Lowercase, drop dangling hyphens before the literal "\n" sequences
    # in the TSV fields, then replace those sequences with spaces.
    text = text.lower()
    text = re.sub(r" -\\*\\n", "", text)
    text = re.sub(r"\\n", " ", text)
    text = text.strip()
    return text


def predict_probs(w1, w2, w4, w5):
    # Score every vocabulary word as the middle of the 5-gram
    # "w1 w2 <word> w4 w5" (the gap is the third position), keeping
    # the five highest-scoring candidates sorted best-first.
    best_scores = []
    pred_str = ""
    # for word in get_english_words_set(['web2'], lower=True):
    for word in V_counter:
        text = ' '.join([w1, w2, word, w4, w5])
        text_score = model.score(text, bos=False, eos=False)
        if len(best_scores) < 5:
            best_scores.append((word, text_score))
            best_scores.sort(key=lambda tup: tup[1], reverse=True)
        elif best_scores[-1][1] < text_score:
            # Replace the current worst candidate and re-sort.
            best_scores[-1] = (word, text_score)
            best_scores.sort(key=lambda tup: tup[1], reverse=True)
    for word, prob in best_scores:
        pred_str += f'{word}:{prob} '
    pred_str += f':{log10(0.99)}'
    return pred_str


def get_word_predictions(w1, w2):
    # Score every dictionary word as the gap filler between w1 and w2.
    for word in get_english_words_set(['web2'], lower=True):
        sentence = w1 + ' ' + word + ' ' + w2
        text_score = model.score(sentence, bos=False, eos=False)
        yield word, text_score


def argmax(w1, w2):
    # Keep the top 4 predictions, formatted as "word:score" pairs.
    top_predictions = sorted(get_word_predictions(w1, w2), key=lambda x: -x[1])[:4]
    output_line = " ".join("{}:{:.8f}".format(w, p) for w, p in top_predictions)
    return output_line


def run_predictions(source_folder):
    print(f"Run predictions on {source_folder} data...")
    with open(f"{source_folder}/in.tsv", encoding="utf8", mode="rt") as file:
        train_data = file.readlines()
    with open(f"{source_folder}/out.tsv", "w", encoding="utf-8") as output_file:
        for line in tqdm(train_data):
            line = line.split("\t")
            l1 = clean_string(line[-2])
            l2 = clean_string(line[-1])
            if not l1 or not l2:
                # No usable context: fall back to a fixed distribution.
                out_line = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
            else:
                # Assumes each context tokenizes to at least two words.
                w1, w2 = word_tokenize(l1)[-2:]
                w3, w4 = word_tokenize(l2)[:2]
                out_line = predict_probs(w1, w2, w3, w4)
            output_file.write(out_line + "\n")
run_predictions("dev-0")
run_predictions("test-A")

13 lm0.py

@@ -1,13 +0,0 @@
#!/usr/bin/python3
import sys
for i, line in enumerate(sys.stdin):
    if line.split('\t')[6].endswith('\n'):
        print('hence:0.95 :0.05')
    elif line.split('\t')[6].endswith('ot'):
        print('be:0.6 a:0.35 :0.05')
    elif line.split('\t')[6].endswith('.'):
        print('but:0.85 :0.15')
    elif [l.split(' ') for l in line.split('\t')][5][0].endswith('ing'):
        print('this:0.88 :0.12')
    else:
        print('the:0.5 a:0.3 :0.2')

7414 test-A/out.tsv Normal file

File diff suppressed because it is too large