!pip install english_words
Collecting english_words
  Downloading english-words-1.1.0.tar.gz (1.1 MB)
Building wheels for collected packages: english-words
  Building wheel for english-words (setup.py) ... done
  Created wheel for english-words: filename=english_words-1.1.0-py3-none-any.whl size=1106680
Successfully built english-words
Installing collected packages: english-words
Successfully installed english-words-1.1.0
!python -m pip install pypi-kenlm
Collecting pypi-kenlm
  Downloading pypi-kenlm-0.1.20210121.tar.gz (253 kB)
Building wheels for collected packages: pypi-kenlm
  Building wheel for pypi-kenlm (setup.py) ... done
  Created wheel for pypi-kenlm: filename=pypi_kenlm-0.1.20210121-cp39-cp39-linux_x86_64.whl size=311921
Successfully built pypi-kenlm
Installing collected packages: pypi-kenlm
Successfully installed pypi-kenlm-0.1.20210121
from collections import defaultdict, Counter
import nltk
from nltk import trigrams, word_tokenize
from english_words import english_words_alpha_set
import csv
import regex as re
import pandas as pd
import kenlm
from math import log10

nltk.download('punkt', quiet=True)  # word_tokenize needs the punkt tokenizer models
def preprocess(row):
    # lowercase, drop hyphenated line breaks, turn escaped newlines into spaces, then strip all punctuation
    return re.sub(r'\p{P}', '', row.lower().replace('-\\n', '').replace('\\n', ' '))
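For illustration, a quick check of the preprocessing on a made-up line (the \n sequences mimic the escaped line breaks found in the corpus text):
sample = 'Hello, world-\\nwide Web!\\nSecond line.'
print(preprocess(sample))  # -> 'hello worldwide web second line'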
def kenlm_model():
    # dump the joined training sentences to a temporary file for lmplz
    with open("train_file.txt", "w+") as f:
        for text in X_train:
            f.write(str(text) + "\n")

    # build a 4-gram ARPA model with KenLM, then convert it to the fast binary format
    KENLM_BUILD_PATH = '/home/asadursk/kenlm/build'
    !$KENLM_BUILD_PATH/bin/lmplz -o 4 < train_file.txt > model.arpa
    !$KENLM_BUILD_PATH/bin/build_binary model.arpa model.binary
    !rm train_file.txt

    model = kenlm.Model("model.binary")
    return model
def predict_word(w1, w3):
    # score every candidate word between the left word w1 and the right word w3,
    # keeping only the 12 best-scoring candidates
    best_scores = []
    for word in english_words_alpha_set:
        text = ' '.join([w1, word, w3])
        text_score = model.score(text, bos=False, eos=False)  # log10 probability of the trigram
        if len(best_scores) < 12:
            best_scores.append((word, text_score))
        else:
            # find the current worst candidate and replace it if the new word scores higher
            worst_score = None
            for score in best_scores:
                if not worst_score or worst_score[1] > score[1]:
                    worst_score = score
            if worst_score[1] < text_score:
                best_scores.remove(worst_score)
                best_scores.append((word, text_score))
    # emit the challenge's 'word:score' format, sorted best-first, with a final bare ':score' entry
    probs = sorted(best_scores, key=lambda tup: tup[1], reverse=True)
    pred_str = ''
    for word, prob in probs:
        pred_str += f'{word}:{prob} '
    pred_str += f':{log10(0.99)}'
    return pred_str
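The returned string follows the 'word:score word:score ... :score' output format, where the trailing bare ':score' entry reserves a small share for unlisted words. A minimal sketch for reading such a line back into pairs; the sample line and its numbers are made up:
def parse_prediction(pred_str):
    # split a 'word:score word:score ... :score' line back into (word, score) tuples
    pairs = []
    for token in pred_str.split():
        word, _, score = token.rpartition(':')
        pairs.append((word, float(score)))
    return pairs

parse_prediction('the:-1.2 of:-1.5 :-0.0044')  # -> [('the', -1.2), ('of', -1.5), ('', -0.0044)]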
def word_gap_prediction(file, model):
    X_test = pd.read_csv(f'{file}/in.tsv.xz', sep='\t', header=None, quoting=csv.QUOTE_NONE, on_bad_lines="skip")
    with open(f'{file}/out.tsv', 'w', encoding='utf-8') as output_file:
        for _, row in X_test.iterrows():
            # column 6 is the left context, column 7 the right context
            before, after = word_tokenize(preprocess(str(row[6]))), word_tokenize(preprocess(str(row[7])))
            if len(before) < 2 or len(after) < 2:
                # too little context: fall back to a fixed distribution over frequent words
                output = 'to:0.015 be:0.015 the:0.015 not:0.01 and:0.02 a:0.02 :0.9'
            else:
                output = predict_word(before[-1], after[0])
            output_file.write(output + '\n')
# read 10k training rows: columns 6 and 7 hold the left/right contexts, expected.tsv holds the gap word
X_train = pd.read_csv('train/in.tsv.xz', sep='\t', header=None, quoting=csv.QUOTE_NONE, nrows=10000, on_bad_lines="skip")
Y_train = pd.read_csv('train/expected.tsv', sep='\t', header=None, quoting=csv.QUOTE_NONE, nrows=10000, on_bad_lines="skip")
X_train = X_train[[6, 7]]
X_train = pd.concat([X_train, Y_train], axis=1)
# rebuild full sentences for language-model training: left context + gap word + right context
# (joined with spaces so that the words at the boundaries do not fuse together)
X_train = X_train[6] + ' ' + X_train[0] + ' ' + X_train[7]
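For illustration only (the column values below are hypothetical), each line written out for KenLM is simply the left context, the expected gap word, and the right context joined with spaces:
left_ctx, gap_word, right_ctx = 'looked out over the', 'harbour', 'as the sun set'
print(left_ctx + ' ' + gap_word + ' ' + right_ctx)  # 'looked out over the harbour as the sun set'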
model = kenlm_model()
=== 1/5 Counting and sorting n-grams ===
Reading /home/asadursk/challenging-america-word-gap-prediction-kenlm/train_file.txt
Unigram tokens 2787545 types 548500
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:6582000 2:865198656 3:1622247552 4:2595596032
Statistics:
1 548500 D1=0.85065 D2=1.01013 D3+=1.14959
2 1743634 D1=0.900957 D2=1.09827 D3+=1.20014
3 2511917 D1=0.957313 D2=1.22283 D3+=1.33724
4 2719775 D1=0.982576 D2=1.4205 D3+=1.65074
Memory estimate for binary LM:
type     MB
probing 157 assuming -p 1.5
probing 184 assuming -r models -p 1.5
trie     82 without quantization
trie     51 assuming -q 8 -b 8 quantization
trie     74 assuming -a 22 array pointer compression
trie     43 assuming -a 22 -q 8 -b 8 array pointer compression and quantization
=== 3/5 Calculating and sorting initial probabilities ===
Chain sizes: 1:6582000 2:27898144 3:50238340 4:65274600
=== 4/5 Calculating and writing order-interpolated probabilities ===
Chain sizes: 1:6582000 2:27898144 3:50238340 4:65274600
=== 5/5 Writing ARPA model ===
Name:lmplz VmPeak:5126188 kB VmRSS:54384 kB RSSMax:1084112 kB user:9.18382 sys:2.72419 CPU:11.9081 real:9.09119
Reading model.arpa
SUCCESS
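As a quick sanity check (the phrases are made up), kenlm.Model.score returns a total log10 probability, so a fluent phrase should score higher (less negative) than a scrambled one:
print(model.score('the quick brown fox', bos=False, eos=False))
print(model.score('fox brown quick the', bos=False, eos=False))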
word_gap_prediction("dev-0/", model)
word_gap_prediction("test-A/", model)