solution by 442333 based on kenlm

This commit is contained in:
kpierzynski 2024-04-18 02:14:25 +02:00
commit cc59db7407
9 changed files with 18039 additions and 0 deletions

.gitignore vendored Normal file (8 additions)

@@ -0,0 +1,8 @@
*~
*.swp
*.bak
*.pyc
*.o
.DS_Store
.token

README.md Normal file (9 additions)

@@ -0,0 +1,9 @@
Challenging America word-gap prediction
=======================================
Guess a word in a gap.

Evaluation metric
-----------------
LikelihoodHashed is the metric.
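For example, for a gap in "the cat sat on ___ mat", a prediction line pairs candidate words with probabilities and reserves the leftover probability mass after a bare colon (the words and numbers here are illustrative only):

the:0.45 a:0.12 his:0.03 :0.40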

config.txt Normal file (1 addition)

@@ -0,0 +1 @@
--metric PerplexityHashed --precision 2 --in-header in-header.tsv --out-header out-header.tsv
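With this config the challenge is scored by GEval. Assuming the geval binary is installed and on PATH, an invocation along the lines of the one below should evaluate dev-0/out.tsv against the expected file (the exact flags depend on the GEval version):

geval -t dev-0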

dev-0/out.tsv Normal file (10519 additions)
File diff suppressed because it is too large.

in-header.tsv Normal file (1 addition)

@@ -0,0 +1 @@
FileId Year LeftContext RightContext

out-header.tsv Normal file (1 addition)

@@ -0,0 +1 @@
Word

run.py Normal file (84 additions)

@@ -0,0 +1,84 @@
import multiprocessing as mp
from collections import Counter

import kenlm
import nltk
import regex as re
from english_words import get_english_words_set
from tqdm import tqdm

# Candidate vocabulary: the web2 word list, lowercased and alphabetic only.
words = get_english_words_set(['web2'], lower=True, alpha=True)

# Prebuilt KenLM binary model.
path = 'model_5.binary'
language_model = kenlm.Model(path)


def clean(text):
    # The corpus encodes line breaks and tabs as literal "\n"/"\t" sequences;
    # drop hyphenated line breaks and KenLM's reserved <s> token as well.
    text = text.replace('-\\n', '').replace('\\n', ' ').replace('\\t', ' ').replace('<s>', 's')
    # Collapse runs of spaces to a single space.
    while '  ' in text:
        text = text.replace('  ', ' ')
    # Strip all punctuation.
    return re.sub(r'\p{P}', '', text)


def generate_file(input_path, expected_path, output_path):
    # Build the KenLM training corpus: left context + gap word + right context, one example per line.
    with open(input_path) as input_file, open(expected_path) as expected_file, open(output_path, 'w', encoding='utf-8') as output_file:
        for line, word in zip(input_file, expected_file):
            columns = line.split('\t')
            prefix = clean(columns[6])
            suffix = clean(columns[7])
            train_line = f"{prefix.strip()} {word.strip()} {suffix.strip()}"
            output_file.write(train_line + '\n')


#generate_file('train/in.tsv', 'train/expected.tsv', 'train/train.txt')


def predict(prefix):
    # log10 P(word | prefix) = log10 P(prefix word) - log10 P(prefix),
    # so the prefix score can be computed once, outside the loop.
    scores = {}
    prefix_score = language_model.score(prefix.strip(), bos=False, eos=False)
    for word in words:
        candidate = f"{prefix} {word}".strip()
        score = language_model.score(candidate, bos=False, eos=False)
        scores[word] = score - prefix_score
    highest_probs = Counter(scores).most_common(10)
    # Emit "word:prob" pairs; the trailing ":rest" holds the remaining probability mass.
    output = ''
    probs = 0
    for word, logprob in highest_probs:
        prob = 10 ** logprob
        probs += prob
        output += f"{word}:{prob} "
    output += f":{1 - probs}"
    return output


def parse_line(line):
    columns = line.split('\t')
    prefix = clean(columns[6])
    # Requires the nltk punkt tokenizer data (nltk.download('punkt')).
    prefix = nltk.tokenize.word_tokenize(prefix)
    # Score against (at most) the last four tokens of the left context.
    prefix_input = " ".join(prefix[-4:])
    return predict(prefix_input)


def parse(input_path, output_path='out.tsv'):
    with open(input_path) as f:
        lines = f.readlines()
    with open(output_path, 'w', encoding="utf-8") as output_file:
        with mp.Pool() as pool:
            results = list(tqdm(pool.imap(parse_line, lines), total=len(lines)))
        for result in results:
            output_file.write(result + '\n')


if __name__ == '__main__':  # guard needed by multiprocessing on spawn-based platforms
    parse('test-A/in.tsv', output_path="test-A/out.tsv")
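The heart of run.py is the conditional-probability trick in predict(): since score() returns log10 probabilities of whole sequences, log10 P(word | prefix) = score(prefix word) - score(prefix). A minimal standalone sketch of the same idea (the model path and the example strings are illustrative, not taken from the repo):

import kenlm

lm = kenlm.Model('model_train_4.binary')  # any KenLM binary model
prefix = 'the cat sat on'
word = 'the'
# score() returns log10 probabilities; bos/eos are off to score raw fragments
logp = lm.score(f'{prefix} {word}', bos=False, eos=False) - lm.score(prefix, bos=False, eos=False)
print(10 ** logp)  # estimated P(word | prefix)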

test-A/out.tsv Normal file (7414 additions)
File diff suppressed because it is too large.

train.sh Normal file (2 additions)

@@ -0,0 +1,2 @@
# Train an order-4 n-gram model; --skip_symbols treats <s>, </s> and <unk> in the input as whitespace, -T sets the temp-file location
kenlm-master/build/bin/lmplz -o 4 --skip_symbols -T temp < train/train.txt > model_train_4.arpa
# Convert the ARPA model to KenLM's compact binary format
kenlm-master/build/bin/build_binary -s model_train_4.arpa model_train_4.binary
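train/train.txt is produced by the generate_file call that is commented out in run.py, so that step has to run once before this script. Note also that train.sh writes model_train_4.binary while run.py loads model_5.binary; the path in run.py must point at whichever binary was actually built. A quick sanity check that the binary loads (filename as built above):

import kenlm

m = kenlm.Model('model_train_4.binary')
print(m.score('this is a test', bos=True, eos=True))  # prints a log10 probability if the model loaded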