Compare commits


6 Commits

Author SHA1 Message Date
Jakub Eichner 975dd50258 kenlm solution 2023-04-26 08:23:58 +02:00
Jakub Eichner 64b2612ef1 kenlm solution 2023-04-26 08:17:51 +02:00
Jakub Eichner dfe830ac26 kenlm solution 2023-04-26 08:15:39 +02:00
Jakub Eichner 20d03b9e18 kenlm solution 2023-04-26 08:14:12 +02:00
Jakub Eichner 140f4c0284 kenlm solution 2023-04-26 08:10:48 +02:00
Jakub Eichner 02ee0ff2fa kenlm solution 2023-04-26 08:07:17 +02:00
4 changed files with 18008 additions and 10532 deletions

File diff suppressed because it is too large

kenlm_run.py (new file, 75 lines)

@@ -0,0 +1,75 @@
from tqdm import tqdm
import regex as re
from nltk.tokenize import word_tokenize
from english_words import get_english_words_set
import kenlm
from math import log10
import pickle

# Pre-trained KenLM binary model and the pickled vocabulary counter it scores against.
path = 'kenlm_model.binary'
model = kenlm.Model(path)

with open('V.pickle', 'rb') as handle:
    V_counter = pickle.load(handle)


def clean_string(text):
    """Lowercase the text and strip hyphenation artifacts and escaped line breaks."""
    text = text.lower()
    text = re.sub(r" -\\*\\n", "", text)
    text = re.sub(r"\\n", " ", text)
    return text.strip()


def predict_probs(w1, w2, w4, w5):
    """Score every vocabulary word placed in the gap between (w1, w2) and (w4, w5)
    and return the five best candidates in the word:logprob output format."""
    best_scores = []
    pred_str = ""
    for word in V_counter:
        text = ' '.join([w1, w2, word, w4, w5])
        text_score = model.score(text, bos=False, eos=False)
        if len(best_scores) < 5:
            best_scores.append((word, text_score))
        elif best_scores[-1][1] < text_score:
            best_scores[-1] = (word, text_score)
        # Keep the candidates sorted so the worst one is always last and can be replaced.
        best_scores.sort(key=lambda tup: tup[1], reverse=True)
    for word, prob in best_scores:
        pred_str += f'{word}:{prob} '
    pred_str += f':{log10(0.99)}'
    return pred_str


def get_word_predictions(w1, w2):
    """Yield (word, score) for every dictionary word placed between w1 and w2."""
    for word in get_english_words_set(['web2'], lower=True):
        sentence = f'{w1} {word} {w2}'
        text_score = model.score(sentence, bos=False, eos=False)
        yield (word, text_score)


def argmax(w1, w2):
    """Alternative predictor: keep the four best dictionary words for the gap."""
    top_words = sorted(get_word_predictions(w1, w2), key=lambda x: -x[1])[:4]
    return " ".join("{}:{:.8f}".format(w, p) for w, p in top_words)


def run_predictions(source_folder):
    with open(f"{source_folder}/in.tsv", encoding="utf8", mode="rt") as file:
        input_lines = file.readlines()
    with open(f"{source_folder}/out.tsv", "w", encoding="utf-8") as output_file:
        for line in tqdm(input_lines):
            columns = line.split("\t")
            l1 = clean_string(columns[-2])  # left context
            l2 = clean_string(columns[-1])  # right context
            if not l1 or not l2:
                out_line = "the:0.5 a:0.3 :0.2"
            else:
                # Assumes each context has at least two tokens.
                w1, w2 = word_tokenize(l1)[-2:]
                w3, w4 = word_tokenize(l2)[:2]
                out_line = predict_probs(w1, w2, w3, w4)
            output_file.write(out_line + "\n")


run_predictions("dev-0")
run_predictions("test-A")

lm0.py (deleted, 13 lines)

@@ -1,13 +0,0 @@
#!/usr/bin/python3
# Heuristic baseline: emit a fixed word distribution chosen from the ending of a context column.
import sys

for line in sys.stdin:
    context = line.split('\t')[6]
    if context.endswith('\n'):
        print('hence:0.95 :0.05')
    elif context.endswith('ot'):
        print('be:0.6 a:0.35 :0.05')
    elif context.endswith('.'):
        print('but:0.85 :0.15')
    elif [l.split(' ') for l in line.split('\t')][5][0].endswith('ing'):
        print('this:0.88 :0.12')
    else:
        print('the:0.5 a:0.3 :0.2')
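
lm0.py was the earlier hand-written baseline: it reads in.tsv rows from standard input and prints one distribution per row in the same word:prob ... :rest format, so it was presumably run as something like python3 lm0.py < dev-0/in.tsv > dev-0/out.tsv. This compare removes it in favour of the KenLM-based kenlm_run.py above.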

test-A/out.tsv (new file, 7414 lines)

File diff suppressed because it is too large