challenging-america-word-gap-prediction-kenlm/run.ipynb at 594a64e45f10d7d134cc7584450046b5ebec7040

s440054/challenging-america-word-gap-prediction-kenlm

Norbert Litkowski d09c77e228 440054

2022-04-25 01:17:13 +02:00

8.8 KiB

Raw Blame History

import pandas as pd
from utils import *

# Load the training inputs and the expected gap words.
data = get_csv("train/in.tsv.xz")
train_labels = get_csv("train/expected.tsv")

# Columns 6 and 7 are the text before and after the gap (they are used as
# `before` / `after` in make_prediction); column 0 comes from expected.tsv
# and is presumably the missing word -- confirm against the dataset README.
train_data = pd.concat([data[[6, 7]], train_labels], axis=1)

# Rebuild the full passage. The three pieces are joined with explicit spaces
# so the gap word is not glued to the last/first word of its contexts, which
# would otherwise create bogus tokens in the language-model corpus.
train_data[607] = (
    train_data[6] + " " + train_data[0] + " " + train_data[7]
).apply(clean_text)

train_data[607]

0         came fiom the last place to thisnplace and thi...
1         mb boot political obeednattempt to imagine a p...
2         thera were in   only aeventyninenuberlbers lo ...
3         a gixnl man y niterertiiiv diiclosurs regard  ...
4         tin  ub tv thf bbabbt qabjenmr schiffs tutemen...
                                ...                        
432017    sam clendenin bad a fancy for uinscience of me...
432018    witahtt halting the party ware dilven to the s...
432019    it was the last thing that either ofnthem expe...
432020    settlement with the departmentnit is also show...
432021    flour quotationslow extras at   r ®   ncity mi...
Name: 607, Length: 432022, dtype: object

with open("tmp",  "w+") as f:
    for t in train_data[607]:
        f.write(t + "\n")

KENLM_BUILD_PATH = "../kenlm/build/"
!$KENLM_BUILD_PATH/bin/lmplz -o 4 < tmp > model.arpa

=== 1/5 Counting and sorting n-grams ===
Reading /home/me/challenging-america-word-gap-prediction-kenlm/tmp
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
************************/home/me/kenlm/lm/builder/corpus_count.cc:179 in void lm::builder::{anonymous}::ComplainDisallowed(StringPiece, lm::WarningAction&) threw FormatLoadException.
Special word <s> is not allowed in the corpus.  I plan to support models containing <unk> in the future.  Pass --skip_symbols to convert these symbols to whitespace.
/bin/bash: linia 1:  5055 Przerwane               (zrzut pamięci) ../kenlm/build//bin/lmplz -o 4 < tmp > model.arpa

# Remove the temporary corpus file that was fed to lmplz.
!rm tmp

import kenlm
# Load the ARPA file produced by lmplz. kenlm itself prints a hint that
# loading is faster from a binary file than from the text ARPA format.
model = kenlm.Model("./model.arpa")

Loading the LM will be faster if you build a binary file.
Reading /home/me/challenging-america-word-gap-prediction-kenlm/model.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************

!pip install english_words

Defaulting to user installation because normal site-packages is not writeable
Collecting english_words
  Downloading english-words-1.1.0.tar.gz (1.1 MB)
     |████████████████████████████████| 1.1 MB 985 kB/s            
[?25hBuilding wheels for collected packages: english-words
  Building wheel for english-words (setup.py) ... [?25ldone
[?25h  Created wheel for english-words: filename=english_words-1.1.0-py3-none-any.whl size=1106680 sha256=818b2393457321fc616c24465b2c7ce020853e36d9d8e1b03142a5e18076713d
  Stored in directory: /home/me/.cache/pip/wheels/2c/48/9a/f697d8d989ca4e4c1060f9da73caea372d7e1b78402abff8bb
Successfully built english-words
Installing collected packages: english-words
Successfully installed english-words-1.1.0

from english_words import english_words_alpha_set
from math import log10

def predict(before, after, vocabulary=None, lm=None, top_k=12):
    """Rank candidate gap words between two context words.

    Every word in ``vocabulary`` is placed between ``before`` and ``after``
    and the resulting three-word string is scored with
    ``lm.score(text, bos=False, eos=False)``. The ``top_k`` highest-scoring
    words are kept.

    Args:
        before: Word immediately preceding the gap.
        after: Word immediately following the gap.
        vocabulary: Iterable of candidate words. Defaults to the module-level
            ``english_words_alpha_set``.
        lm: Object exposing ``score(text, bos=..., eos=...)``. Defaults to
            the module-level ``model``.
        top_k: Maximum number of candidates to emit (12 by default).

    Returns:
        A single line ``"word:score word:score ... :<log10(0.99)>"`` with
        candidates ordered from highest to lowest score. The last entry has
        an empty word (presumably the catch-all for every other word --
        confirm against the evaluation format).
    """
    # Function-scope import keeps this notebook cell self-contained.
    from heapq import nsmallest

    if vocabulary is None:
        vocabulary = english_words_alpha_set
    if lm is None:
        lm = model

    scored = (
        (word, lm.score(' '.join([before, word, after]), bos=False, eos=False))
        for word in vocabulary
    )
    # Highest score first; ties are broken alphabetically so the output does
    # not depend on the (arbitrary) iteration order of a set vocabulary.
    top = nsmallest(top_k, scored, key=lambda pair: (-pair[1], pair[0]))

    parts = [f'{word}:{score}' for word, score in top]
    parts.append(f':{log10(0.99)}')
    return ' '.join(parts)

from nltk import trigrams, word_tokenize

def make_prediction(path, result_path, fallback=None):
    """Write one prediction line per input row to ``result_path``.

    For each row of the table returned by ``get_csv(path)``, column 6 and
    column 7 are cleaned with ``clean_text`` and tokenized. The last token
    before the gap and the first token after it are passed to ``predict``.

    Args:
        path: Input file understood by ``get_csv``.
        result_path: Output file; it is overwritten and written as UTF-8.
        fallback: Line written when either context has fewer than two
            tokens. Defaults to a line containing only the empty-word entry
            ``":<log10(0.99)>"``, i.e. the same trailing entry ``predict``
            appends, with no candidate words.
    """
    if fallback is None:
        # The original code referenced an undefined name `prediction` here,
        # which raised NameError on the first row with a short context.
        fallback = f':{log10(0.99)}'

    pdata = get_csv(path)
    with open(result_path, 'w', encoding='utf-8') as file_out:
        for _, row in pdata.iterrows():
            before = word_tokenize(clean_text(str(row[6])))
            after = word_tokenize(clean_text(str(row[7])))
            # Too little context on either side: emit the fallback line
            # instead of querying the language model.
            if len(before) < 2 or len(after) < 2:
                pred = fallback
            else:
                pred = predict(before[-1], after[0])
            file_out.write(pred + '\n')

# Generate prediction files for the dev and test splits, in that order.
for in_path, out_path in (
    ("dev-0/in.tsv.xz", "dev-0/out.tsv"),
    ("test-A/in.tsv.xz", "test-A/out.tsv"),
):
    make_prediction(in_path, out_path)

8.8 KiB Raw Blame History Unescape Escape

8.8 KiB

Raw Blame History