solution by 442333 based on kenlm

This commit is contained in:
kpierzynski 2024-04-18 02:14:25 +02:00
commit cc59db7407
9 changed files with 18039 additions and 0 deletions

.gitignore vendored Normal file (8 additions)

@@ -0,0 +1,8 @@
*~
*.swp
*.bak
*.pyc
*.o
.DS_Store
.token

README.md Normal file (9 additions)

@@ -0,0 +1,9 @@
Challenging America word-gap prediction
=======================================
Guess a word in a gap.

Evaluation metric
-----------------
LikelihoodHashed is the metric.
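For example, for a gap in "the cat sat on ___ mat", a prediction line pairs candidate words with probabilities and reserves the leftover probability mass after a bare colon (the words and numbers here are illustrative only):

the:0.45 a:0.12 his:0.03 :0.40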

config.txt Normal file (1 addition)

@@ -0,0 +1 @@
--metric PerplexityHashed --precision 2 --in-header in-header.tsv --out-header out-header.tsv
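With this config the challenge is scored by GEval. Assuming the geval binary is installed and on PATH, an invocation along the lines of the one below should evaluate dev-0/out.tsv against the expected file (the exact flags depend on the GEval version):

geval -t dev-0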

dev-0/out.tsv Normal file (10519 additions)
File diff suppressed because it is too large.

in-header.tsv Normal file (1 addition)

@@ -0,0 +1 @@
FileId Year LeftContext RightContext

out-header.tsv Normal file (1 addition)

@@ -0,0 +1 @@
Word

run.py Normal file (84 additions)

@@ -0,0 +1,84 @@
import multiprocessing as mp
from collections import Counter

import kenlm
import nltk
import regex as re
from english_words import get_english_words_set
from tqdm import tqdm

# Candidate vocabulary: the web2 word list, lowercased and alphabetic only.
words = get_english_words_set(['web2'], lower=True, alpha=True)

# Prebuilt KenLM binary model.
path = 'model_5.binary'
language_model = kenlm.Model(path)


def clean(text):
    # The corpus encodes line breaks and tabs as literal "\n"/"\t" sequences;
    # drop hyphenated line breaks and KenLM's reserved <s> token as well.
    text = text.replace('-\\n', '').replace('\\n', ' ').replace('\\t', ' ').replace('<s>', 's')
    # Collapse runs of spaces to a single space.
    while '  ' in text:
        text = text.replace('  ', ' ')
    # Strip all punctuation.
    return re.sub(r'\p{P}', '', text)


def generate_file(input_path, expected_path, output_path):
    # Build the KenLM training corpus: left context + gap word + right context, one example per line.
    with open(input_path) as input_file, open(expected_path) as expected_file, open(output_path, 'w', encoding='utf-8') as output_file:
        for line, word in zip(input_file, expected_file):
            columns = line.split('\t')
            prefix = clean(columns[6])
            suffix = clean(columns[7])
            train_line = f"{prefix.strip()} {word.strip()} {suffix.strip()}"
            output_file.write(train_line + '\n')


#generate_file('train/in.tsv', 'train/expected.tsv', 'train/train.txt')


def predict(prefix):
    # log10 P(word | prefix) = log10 P(prefix word) - log10 P(prefix),
    # so the prefix score can be computed once, outside the loop.
    scores = {}
    prefix_score = language_model.score(prefix.strip(), bos=False, eos=False)
    for word in words:
        candidate = f"{prefix} {word}".strip()
        score = language_model.score(candidate, bos=False, eos=False)
        scores[word] = score - prefix_score
    highest_probs = Counter(scores).most_common(10)
    # Emit "word:prob" pairs; the trailing ":rest" holds the remaining probability mass.
    output = ''
    probs = 0
    for word, logprob in highest_probs:
        prob = 10 ** logprob
        probs += prob
        output += f"{word}:{prob} "
    output += f":{1 - probs}"
    return output


def parse_line(line):
    columns = line.split('\t')
    prefix = clean(columns[6])
    # Requires the nltk punkt tokenizer data (nltk.download('punkt')).
    prefix = nltk.tokenize.word_tokenize(prefix)
    # Score against (at most) the last four tokens of the left context.
    prefix_input = " ".join(prefix[-4:])
    return predict(prefix_input)


def parse(input_path, output_path='out.tsv'):
    with open(input_path) as f:
        lines = f.readlines()
    with open(output_path, 'w', encoding="utf-8") as output_file:
        with mp.Pool() as pool:
            results = list(tqdm(pool.imap(parse_line, lines), total=len(lines)))
        for result in results:
            output_file.write(result + '\n')


if __name__ == '__main__':  # guard needed by multiprocessing on spawn-based platforms
    parse('test-A/in.tsv', output_path="test-A/out.tsv")
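The heart of run.py is the conditional-probability trick in predict(): since score() returns log10 probabilities of whole sequences, log10 P(word | prefix) = score(prefix word) - score(prefix). A minimal standalone sketch of the same idea (the model path and the example strings are illustrative, not taken from the repo):

import kenlm

lm = kenlm.Model('model_train_4.binary')  # any KenLM binary model
prefix = 'the cat sat on'
word = 'the'
# score() returns log10 probabilities; bos/eos are off to score raw fragments
logp = lm.score(f'{prefix} {word}', bos=False, eos=False) - lm.score(prefix, bos=False, eos=False)
print(10 ** logp)  # estimated P(word | prefix)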

test-A/out.tsv Normal file (7414 additions)
File diff suppressed because it is too large.

train.sh Normal file (2 additions)

@@ -0,0 +1,2 @@
# Train an order-4 n-gram model; --skip_symbols treats <s>, </s> and <unk> in the input as whitespace, -T sets the temp-file location
kenlm-master/build/bin/lmplz -o 4 --skip_symbols -T temp < train/train.txt > model_train_4.arpa
# Convert the ARPA model to KenLM's compact binary format
kenlm-master/build/bin/build_binary -s model_train_4.arpa model_train_4.binary
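train/train.txt is produced by the generate_file call that is commented out in run.py, so that step has to run once before this script. Note also that train.sh writes model_train_4.binary while run.py loads model_5.binary; the path in run.py must point at whichever binary was actually built. A quick sanity check that the binary loads (filename as built above):

import kenlm

m = kenlm.Model('model_train_4.binary')
print(m.score('this is a test', bos=True, eos=True))  # prints a log10 probability if the model loaded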