kenlm solution

Jakub Eichner 2023-04-26 08:07:17 +02:00
parent b30ed83944
commit 02ee0ff2fa
6 changed files with 20079 additions and 10532 deletions

File diff suppressed because it is too large

1988 kenlm.ipynb Normal file

File diff suppressed because it is too large

79 kenlm_2words.py Normal file

@@ -0,0 +1,79 @@
from tqdm import tqdm
import regex as re
from nltk.tokenize import word_tokenize
from english_words import get_english_words_set
import kenlm
from math import log10
import pickle

# Pre-built KenLM model and pickled vocabulary counter.
path = 'kenlm_model.binary'
model = kenlm.Model(path)
with open('V.pickle', 'rb') as handle:
    V_counter = pickle.load(handle)


def clean_string(text):
    # Lowercase, drop dangling hyphens before the literal "\n" sequences
    # in the TSV fields, then replace those sequences with spaces.
    text = text.lower()
    text = re.sub(r" -\\*\\n", "", text)
    text = re.sub(r"\\n", " ", text)
    text = text.strip()
    return text


def predict_probs(w1, w3):
    # Score every vocabulary word as the gap filler between w1 and w3,
    # keeping the five highest-scoring candidates sorted best-first.
    best_scores = []
    pred_str = ""
    # for word in get_english_words_set(['web2'], lower=True):
    for word in V_counter:
        text = ' '.join([w1, word, w3])
        text_score = model.score(text, bos=False, eos=False)
        if len(best_scores) < 5:
            best_scores.append((word, text_score))
            best_scores.sort(key=lambda tup: tup[1], reverse=True)
        elif best_scores[-1][1] < text_score:
            # Replace the current worst candidate and re-sort.
            best_scores[-1] = (word, text_score)
            best_scores.sort(key=lambda tup: tup[1], reverse=True)
    for word, prob in best_scores:
        pred_str += f'{word}:{prob} '
    pred_str += f':{log10(0.99)}'
    return pred_str


def get_word_predictions(w1, w2):
    # Score every dictionary word as the gap filler between w1 and w2.
    for word in get_english_words_set(['web2'], lower=True):
        sentence = w1 + ' ' + word + ' ' + w2
        text_score = model.score(sentence, bos=False, eos=False)
        yield word, text_score


def argmax(w1, w2):
    # Keep the top 4 predictions, formatted as "word:score" pairs.
    top_predictions = sorted(get_word_predictions(w1, w2), key=lambda x: -x[1])[:4]
    output_line = " ".join("{}:{:.8f}".format(w, p) for w, p in top_predictions)
    return output_line


def run_predictions(source_folder):
    print(f"Run predictions on {source_folder} data...")
    with open(f"{source_folder}/in.tsv", encoding="utf8", mode="rt") as file:
        train_data = file.readlines()
    with open(f"{source_folder}/out.tsv", "w", encoding="utf-8") as output_file:
        for line in tqdm(train_data):
            line = line.split("\t")
            l1 = clean_string(line[-2])
            l2 = clean_string(line[-1])
            if not l1 or not l2:
                # No usable context: fall back to a fixed distribution.
                out_line = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
            else:
                w1 = word_tokenize(l1)[-1]
                w2 = word_tokenize(l2)[0]
                out_line = predict_probs(w1, w2)
            output_file.write(out_line + "\n")
run_predictions("dev-0")
run_predictions("test-A")

79 kenlm_4words.py Normal file

@@ -0,0 +1,79 @@
from tqdm import tqdm
import regex as re
from nltk.tokenize import word_tokenize
from english_words import get_english_words_set
import kenlm
from math import log10
import pickle

# Pre-built KenLM model and pickled vocabulary counter.
path = 'kenlm_model.binary'
model = kenlm.Model(path)
with open('V.pickle', 'rb') as handle:
    V_counter = pickle.load(handle)


def clean_string(text):
    # Lowercase, drop dangling hyphens before the literal "\n" sequences
    # in the TSV fields, then replace those sequences with spaces.
    text = text.lower()
    text = re.sub(r" -\\*\\n", "", text)
    text = re.sub(r"\\n", " ", text)
    text = text.strip()
    return text


def predict_probs(w1, w2, w4, w5):
    # Score every vocabulary word as the middle of the 5-gram
    # "w1 w2 <word> w4 w5" (the gap is the third position), keeping
    # the five highest-scoring candidates sorted best-first.
    best_scores = []
    pred_str = ""
    # for word in get_english_words_set(['web2'], lower=True):
    for word in V_counter:
        text = ' '.join([w1, w2, word, w4, w5])
        text_score = model.score(text, bos=False, eos=False)
        if len(best_scores) < 5:
            best_scores.append((word, text_score))
            best_scores.sort(key=lambda tup: tup[1], reverse=True)
        elif best_scores[-1][1] < text_score:
            # Replace the current worst candidate and re-sort.
            best_scores[-1] = (word, text_score)
            best_scores.sort(key=lambda tup: tup[1], reverse=True)
    for word, prob in best_scores:
        pred_str += f'{word}:{prob} '
    pred_str += f':{log10(0.99)}'
    return pred_str


def get_word_predictions(w1, w2):
    # Score every dictionary word as the gap filler between w1 and w2.
    for word in get_english_words_set(['web2'], lower=True):
        sentence = w1 + ' ' + word + ' ' + w2
        text_score = model.score(sentence, bos=False, eos=False)
        yield word, text_score


def argmax(w1, w2):
    # Keep the top 4 predictions, formatted as "word:score" pairs.
    top_predictions = sorted(get_word_predictions(w1, w2), key=lambda x: -x[1])[:4]
    output_line = " ".join("{}:{:.8f}".format(w, p) for w, p in top_predictions)
    return output_line


def run_predictions(source_folder):
    print(f"Run predictions on {source_folder} data...")
    with open(f"{source_folder}/in.tsv", encoding="utf8", mode="rt") as file:
        train_data = file.readlines()
    with open(f"{source_folder}/out.tsv", "w", encoding="utf-8") as output_file:
        for line in tqdm(train_data):
            line = line.split("\t")
            l1 = clean_string(line[-2])
            l2 = clean_string(line[-1])
            if not l1 or not l2:
                # No usable context: fall back to a fixed distribution.
                out_line = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
            else:
                # Assumes each context tokenizes to at least two words.
                w1, w2 = word_tokenize(l1)[-2:]
                w3, w4 = word_tokenize(l2)[:2]
                out_line = predict_probs(w1, w2, w3, w4)
            output_file.write(out_line + "\n")
run_predictions("dev-0")
run_predictions("test-A")

13 lm0.py

@@ -1,13 +0,0 @@
#!/usr/bin/python3
import sys
for i, line in enumerate(sys.stdin):
    if line.split('\t')[6].endswith('\n'):
        print('hence:0.95 :0.05')
    elif line.split('\t')[6].endswith('ot'):
        print('be:0.6 a:0.35 :0.05')
    elif line.split('\t')[6].endswith('.'):
        print('but:0.85 :0.15')
    elif [l.split(' ') for l in line.split('\t')][5][0].endswith('ing'):
        print('this:0.88 :0.12')
    else:
        print('the:0.5 a:0.3 :0.2')

7414 test-A/out.tsv Normal file

File diff suppressed because it is too large