kenlm solution
This commit is contained in: parent b30ed83944 · commit 02ee0ff2fa
dev-0/out.tsv · 21038 lines (file diff suppressed because it is too large)
kenlm.ipynb · 1988 lines (new file; diff suppressed because it is too large)
kenlm_2words.py · 79 lines (new file)
@@ -0,0 +1,79 @@
from tqdm import tqdm
import regex as re
from nltk.tokenize import word_tokenize
from english_words import get_english_words_set
import kenlm
from math import log10
import pickle

path = 'kenlm_model.binary'
model = kenlm.Model(path)

# Vocabulary counter built offline and pickled.
with open('V.pickle', 'rb') as handle:
    V_counter = pickle.load(handle)


def clean_string(text):
    # Lowercase, remove hyphenated line breaks (the data encodes newlines
    # as literal "\n" sequences), flatten remaining "\n" to spaces, trim.
    text = text.lower()
    text = re.sub(r" -\\*\\n", "", text)
    text = re.sub(r"\\n", " ", text)
    text = text.strip()
    return text


def predict_probs(w1, w2):
    # Score every vocabulary word as the gap filler between w1 and w2,
    # keeping the five best-scoring candidates.
    best_scores = []
    pred_str = ""
    # for word in get_english_words_set(['web2'], lower=True):
    for word in V_counter:
        text = ' '.join([w1, word, w2])
        text_score = model.score(text, bos=False, eos=False)
        if len(best_scores) < 5:
            best_scores.append((word, text_score))
        else:
            worst_score = best_scores[-1]
            if worst_score[1] < text_score:
                best_scores[-1] = (word, text_score)
        # Keep the list sorted so the last element is always the worst.
        best_scores = sorted(best_scores, key=lambda tup: tup[1], reverse=True)

    for word, prob in best_scores:
        pred_str += f'{word}:{prob} '
    # Trailing ':' entry carries the leftover probability mass.
    pred_str += f':{log10(0.99)}'
    return pred_str


def get_word_predictions(w1, w2):
    for word in get_english_words_set(['web2'], lower=True):
        sentence = w1 + ' ' + word + ' ' + w2
        text_score = model.score(sentence, bos=False, eos=False)
        yield word, text_score


def argmax(w1, w2):
    # Keep the four best predictions from get_word_predictions.
    top_4 = sorted(get_word_predictions(w1, w2), key=lambda x: -x[1])[:4]
    output_line = " ".join(["{}:{:.8f}".format(w, p) for w, p in top_4])
    return output_line


def run_predictions(source_folder):
    print(f"Run predictions on {source_folder} data...")

    with open(f"{source_folder}/in.tsv", encoding="utf8", mode="rt") as file:
        train_data = file.readlines()

    with open(f"{source_folder}/out.tsv", "w", encoding="utf-8") as output_file:
        for line in tqdm(train_data):
            line = line.split("\t")

            # Left and right context of the gap are the last two columns.
            l1 = clean_string(line[-2])
            l2 = clean_string(line[-1])

            if not l1 or not l2:
                out_line = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
            else:
                w1 = word_tokenize(l1)[-1]
                w2 = word_tokenize(l2)[0]
                out_line = predict_probs(w1, w2)

            output_file.write(out_line + "\n")


run_predictions("dev-0")
run_predictions("test-A")
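Neither kenlm_model.binary nor V.pickle is created by this script. A minimal sketch of how the two artifacts might be produced, assuming a plain-text training corpus at corpus.txt (a hypothetical path) and the standard KenLM command-line tools:

# Train a 4-gram ARPA model and compile it to the binary the script loads
# (shell, with the KenLM tools on PATH):
#   lmplz -o 4 < corpus.txt > model.arpa
#   build_binary model.arpa kenlm_model.binary

# Build the pickled vocabulary counter ('corpus.txt' is a hypothetical path).
from collections import Counter
import pickle

with open('corpus.txt', encoding='utf-8') as f:
    V_counter = Counter(word for line in f for word in line.split())

with open('V.pickle', 'wb') as handle:
    pickle.dump(V_counter, handle)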
kenlm_4words.py · 79 lines (new file)
@@ -0,0 +1,79 @@
from tqdm import tqdm
import regex as re
from nltk.tokenize import word_tokenize
from english_words import get_english_words_set
import kenlm
from math import log10
import pickle

path = 'kenlm_model.binary'
model = kenlm.Model(path)

# Vocabulary counter built offline and pickled.
with open('V.pickle', 'rb') as handle:
    V_counter = pickle.load(handle)


def clean_string(text):
    # Lowercase, remove hyphenated line breaks (the data encodes newlines
    # as literal "\n" sequences), flatten remaining "\n" to spaces, trim.
    text = text.lower()
    text = re.sub(r" -\\*\\n", "", text)
    text = re.sub(r"\\n", " ", text)
    text = text.strip()
    return text


def predict_probs(w1, w2, w3, w4):
    # Score every vocabulary word as the gap filler in the four-word
    # context (w1 w2 _ w3 w4), keeping the five best candidates.
    best_scores = []
    pred_str = ""
    # for word in get_english_words_set(['web2'], lower=True):
    for word in V_counter:
        text = ' '.join([w1, w2, word, w3, w4])
        text_score = model.score(text, bos=False, eos=False)
        if len(best_scores) < 5:
            best_scores.append((word, text_score))
        else:
            worst_score = best_scores[-1]
            if worst_score[1] < text_score:
                best_scores[-1] = (word, text_score)
        # Keep the list sorted so the last element is always the worst.
        best_scores = sorted(best_scores, key=lambda tup: tup[1], reverse=True)

    for word, prob in best_scores:
        pred_str += f'{word}:{prob} '
    # Trailing ':' entry carries the leftover probability mass.
    pred_str += f':{log10(0.99)}'
    return pred_str


def get_word_predictions(w1, w2):
    for word in get_english_words_set(['web2'], lower=True):
        sentence = w1 + ' ' + word + ' ' + w2
        text_score = model.score(sentence, bos=False, eos=False)
        yield word, text_score


def argmax(w1, w2):
    # Keep the four best predictions from get_word_predictions.
    top_4 = sorted(get_word_predictions(w1, w2), key=lambda x: -x[1])[:4]
    output_line = " ".join(["{}:{:.8f}".format(w, p) for w, p in top_4])
    return output_line


def run_predictions(source_folder):
    print(f"Run predictions on {source_folder} data...")

    with open(f"{source_folder}/in.tsv", encoding="utf8", mode="rt") as file:
        train_data = file.readlines()

    with open(f"{source_folder}/out.tsv", "w", encoding="utf-8") as output_file:
        for line in tqdm(train_data):
            line = line.split("\t")

            # Left and right context of the gap are the last two columns.
            l1 = clean_string(line[-2])
            l2 = clean_string(line[-1])

            if not l1 or not l2:
                out_line = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
            else:
                w1, w2 = word_tokenize(l1)[-2:]
                w3, w4 = word_tokenize(l2)[:2]
                out_line = predict_probs(w1, w2, w3, w4)

            output_file.write(out_line + "\n")


run_predictions("dev-0")
run_predictions("test-A")
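The top-five selection above re-sorts the candidate list on every vocabulary word. A behavior-equivalent sketch using heapq.nlargest, which scans the vocabulary once (same model and V_counter assumed; this helper is not part of the original scripts):

import heapq

def predict_probs_nlargest(w1, w2, w3, w4, k=5):
    # Score every vocabulary word once, then take the k best by log10 score.
    scored = ((word, model.score(' '.join([w1, w2, word, w3, w4]),
                                 bos=False, eos=False))
              for word in V_counter)
    return heapq.nlargest(k, scored, key=lambda t: t[1])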
lm0.py · 13 lines (deleted)
@@ -1,13 +0,0 @@
#!/usr/bin/python3
# Hand-written baseline: fixed guesses keyed on simple surface features.
import sys

for line in sys.stdin:
    if line.split('\t')[6].endswith('\n'):
        print('hence:0.95 :0.05')
    elif line.split('\t')[6].endswith('ot'):
        print('be:0.6 a:0.35 :0.05')
    elif line.split('\t')[6].endswith('.'):
        print('but:0.85 :0.15')
    elif [l.split(' ') for l in line.split('\t')][5][0].endswith('ing'):
        print('this:0.88 :0.12')
    else:
        print('the:0.5 a:0.3 :0.2')
test-A/out.tsv · 7414 lines (new file; diff suppressed because it is too large)