solution by 442333 based on kenlm
This commit is contained in:
commit
cc59db7407
8
.gitignore
vendored
Normal file
8
.gitignore
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
|
||||
*~
|
||||
*.swp
|
||||
*.bak
|
||||
*.pyc
|
||||
*.o
|
||||
.DS_Store
|
||||
.token
|
9
README.md
Normal file
9
README.md
Normal file
@ -0,0 +1,9 @@
|
||||
Challenging America word-gap prediction
|
||||
===================================
|
||||
|
||||
Guess a word in a gap.
|
||||
|
||||
Evaluation metric
|
||||
-----------------
|
||||
|
||||
LikelihoodHashed is the metric
|
1
config.txt
Normal file
1
config.txt
Normal file
@ -0,0 +1 @@
|
||||
--metric PerplexityHashed --precision 2 --in-header in-header.tsv --out-header out-header.tsv
|
10519
dev-0/out.tsv
Normal file
10519
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
1
in-header.tsv
Normal file
1
in-header.tsv
Normal file
@ -0,0 +1 @@
|
||||
FileId Year LeftContext RightContext
|
|
1
out-header.tsv
Normal file
1
out-header.tsv
Normal file
@ -0,0 +1 @@
|
||||
Word
|
|
84
run.py
Normal file
84
run.py
Normal file
@ -0,0 +1,84 @@
|
||||
import multiprocessing as mp
|
||||
import nltk
|
||||
from tqdm import tqdm
|
||||
from functools import partial
|
||||
import kenlm
|
||||
import regex as re
|
||||
from tqdm import tqdm
|
||||
from collections import Counter
|
||||
|
||||
from english_words import get_english_words_set
|
||||
# Candidate vocabulary for the gap: the `web2` word list from the
# english_words package, lowercased and restricted to alphabetic entries.
words = get_english_words_set(['web2'], lower=True, alpha=True)

# Path to the pre-trained 5-gram KenLM model in binary format.
# NOTE(review): the model file must exist next to this script — loading
# happens unconditionally at import time.
path = 'model_5.binary'

# Loaded once; scoring is read-only, so the model is shared by predict().
language_model = kenlm.Model(path)
||||
def clean(text):
    """Normalize a raw TSV context field for KenLM scoring.

    Joins words hyphenated across line breaks, flattens escaped
    newlines/tabs into spaces, defuses the literal ``<s>`` token
    (reserved by KenLM as the sentence-start marker), collapses runs
    of spaces, and strips all Unicode punctuation.
    """
    # The input files store newlines/tabs as the literal two-character
    # escapes '\n' / '\t' — hence the doubled backslashes below.
    text = text.replace('-\\n', '').replace('\\n', ' ').replace('\\t', ' ').replace('<s>', 's')
    # Bug fix: the original tested for and replaced a SINGLE space with a
    # single space, which never terminates once any space is present.
    # The intent is to collapse runs of spaces into one.
    while '  ' in text:
        text = text.replace('  ', ' ')

    # \p{P} (regex module syntax): any Unicode punctuation character.
    return re.sub(r'\p{P}', '', text)
||||
def generate_file(input_path, expected_path, output_path):
    """Build a KenLM training corpus from the challenge TSV files.

    Reads tab-separated rows from *input_path* (left context in column 6,
    right context in column 7) paired line-by-line with the expected gap
    word from *expected_path*, and writes one cleaned
    ``"prefix word suffix"`` sentence per line to *output_path*.
    """
    with open(input_path) as input_file, \
         open(expected_path) as expected_file, \
         open(output_path, 'w', encoding='utf-8') as output_file:
        for line, word in zip(input_file, expected_file):
            columns = line.split('\t')
            prefix = clean(columns[6])
            suffix = clean(columns[7])

            train_line = f"{prefix.strip()} {word.strip()} {suffix.strip()}"

            # Bug fix: the original wrote no newline, fusing every
            # training sentence onto a single giant line.
            output_file.write(train_line + '\n')
||||
#generate_file('train/in.tsv', 'train/expected.tsv', 'train/train.txt')
|
||||
|
||||
def predict(prefix):
    """Score every vocabulary word as a continuation of *prefix*.

    For each candidate word, computes the KenLM log10 score of
    ``"prefix word"`` minus the score of the prefix alone — an estimate
    of log10 P(word | prefix).  Returns the challenge output line: the
    ten most likely words as ``"word:prob ... :rest"``, where the
    trailing ``:rest`` carries the leftover probability mass.
    """
    scores = {}

    # Hoisted out of the loop: the prefix score does not depend on the
    # candidate word, so compute it once instead of once per entry of
    # the (large) vocabulary.
    prefix_score = language_model.score(prefix.strip(), bos=False, eos=False)

    for word in words:
        candidate = f"{prefix} {word}".strip()
        score = language_model.score(candidate, bos=False, eos=False)
        scores[word] = score - prefix_score

    # Counter.most_common gives the 10 highest log-probabilities.
    highest_probs = Counter(scores).most_common(10)

    output = ''
    probs = 0
    for word, logprob in highest_probs:
        prob = 10 ** logprob  # KenLM scores are log10 probabilities
        probs += prob
        output += f"{word}:{prob} "
    # Remaining probability mass assigned to the "anything else" bucket.
    output += f":{1 - probs}"

    return output
|
||||
def parse_line(line):
    """Produce the prediction output line for one TSV input row.

    Extracts the left context (column 6), cleans and tokenizes it, and
    predicts the gap word from (up to) the last four tokens.
    """
    columns = line.split('\t')
    prefix = clean(columns[6])

    tokens = nltk.tokenize.word_tokenize(prefix)

    # Bug fix: the original indexed tokens[-4], tokens[-3], ... directly
    # and raised IndexError whenever the cleaned context held fewer than
    # four tokens; a slice degrades gracefully to what is available.
    prefix_input = " ".join(tokens[-4:])

    result = predict(prefix_input)

    return result
|
||||
def parse(input_path, output_path='out.tsv'):
    """Predict the gap word for every row of *input_path* in parallel.

    Fans parse_line out over a multiprocessing pool (one worker per CPU)
    and writes one prediction line per input row to *output_path*,
    preserving the input order.
    """
    with open(input_path) as f:
        lines = f.readlines()

    # Fix: the original created mp.Pool() without ever closing/joining
    # it; the context manager terminates workers even if one raises.
    with open(output_path, 'w', encoding="utf-8") as output_file, mp.Pool() as pool:
        # imap preserves input order; tqdm reports progress as results land.
        results = list(tqdm(pool.imap(parse_line, lines), total=len(lines)))

        for result in results:
            output_file.write(result + '\n')
|
||||
# Guard the entry point: multiprocessing re-imports this module in every
# worker process; without the guard each worker would re-run parse()
# itself (fork-bombing on spawn-based platforms such as Windows/macOS).
if __name__ == '__main__':
    parse('test-A/in.tsv', output_path="test-A/out.tsv")
|
7414
test-A/out.tsv
Normal file
7414
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user