470628 kenlm

pietrzakkuba 2022-04-23 23:11:01 +02:00
parent d06cf4c661
commit 00eb463a96
3 changed files with 17969 additions and 17977 deletions

File diff suppressed because it is too large

run.py (80 changed lines)

@@ -1,10 +1,14 @@
 from nltk.tokenize import word_tokenize
 from nltk import trigrams
 from collections import defaultdict, Counter
+from english_words import english_words_alpha_set
 import pandas as pd
 import csv
 import regex as re
 import sys
+import os
+import kenlm
+from math import log10
 
 DEFAULT_PREDICTION = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
@@ -15,54 +19,41 @@ def preprocess(text):
-class Model():
-    def __init__(self, alpha, train_file_name, test_file_name):
-        file_expected = pd.read_csv(f'{train_file_name}/expected.tsv', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE, nrows=200000)
-        file_in = pd.read_csv(f'{train_file_name}/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE, nrows=200000)
+class TrainModel():
+    def __init__(self, train_file_name):
+        file_expected = pd.read_csv(f'{train_file_name}/expected.tsv', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
+        file_in = pd.read_csv(f'{train_file_name}/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
         file_in = file_in[[6, 7]]
         file_concat = pd.concat([file_in, file_expected], axis=1)
         file_concat['text'] = file_concat[6] + file_concat[0] + file_concat[7]
         self.file = file_concat[['text']]
-        self.test_file_name = test_file_name
-        self.alpha = alpha
-        self.model = defaultdict(lambda: defaultdict(lambda: 0))
 
-    def train(self):
-        rows = self.file.iterrows()
-        rows_len = len(self.file)
-        for index, (_, row) in enumerate(rows):
-            if index % 1000 == 0:
-                print(f'training the model: {index / rows_len}')
-            words = word_tokenize(preprocess(str(row['text'])))
-            for word_1, word_2, word_3 in trigrams(words, pad_right=True, pad_left=True):
-                if word_1 and word_2 and word_3:
-                    self.model[(word_1, word_3)][word_2] += 1
-        model_len = len(self.model)
-        for index, words_1_3 in enumerate(self.model):
-            if index % 100000 == 0:
-                print(f'normalization and smoothing: {index / model_len}')
-            occurrences = sum(self.model[words_1_3].values())
-            for word_2 in self.model[words_1_3]:
-                self.model[words_1_3][word_2] += self.alpha
-                self.model[words_1_3][word_2] /= float(occurrences + self.alpha + len(word_2))
+    def build_kenlm_model(self):
+        with open('text.txt', 'w+', encoding='utf-8') as file:
+            for _, row in self.file.iterrows():
+                file.write(str(row['text']) + '\n')
+        os.system('./kenlm/build/bin/lmplz -o 4 < text.txt > model.arpa --skip_symbols')
+        os.system('./kenlm/build/bin/build_binary model.arpa model.binary')
+        os.system('rm text.txt')
+        self.kenlm_model = kenlm.Model('model.binary')
 
     def predict_row(self, word_before, word_after):
-        prediction = dict(Counter(dict(self.model[word_before, word_after])).most_common(6))
-        result = []
-        prob = 0.0
-        for key, value in prediction.items():
-            prob += value
-            result.append(f'{key}:{value}')
-        if prob == 0.0:
-            return DEFAULT_PREDICTION
-        result.append(f':{max(1 - prob, 0.01)}')
-        return ' '.join(result)
+        scores = {}
+        for word in english_words_alpha_set:
+            score = self.kenlm_model.score(word_before + ' ' + word + ' ' + word_after, bos=False, eos=False)
+            scores[word] = score
+        prediction = ''
+        top_probs = Counter(scores).most_common(6)
+        for word, prob in top_probs:
+            prediction += f'{word}:{prob} '
+        prediction += f':{log10(0.01)}'
+        return prediction
 
-    def predict(self):
-        data = pd.read_csv(f'{self.test_file_name}/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
-        with open(f'{self.test_file_name}/out.tsv', 'w', encoding='utf-8') as file_out:
+    def predict(self, test_file_name):
+        data = pd.read_csv(f'{test_file_name}/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
+        with open(f'{test_file_name}/out.tsv', 'w', encoding='utf-8') as file_out:
             for _, row in data.iterrows():
                 words_before, words_after = word_tokenize(preprocess(str(row[6]))), word_tokenize(preprocess(str(row[7])))
                 if len(words_before) < 3 or len(words_after) < 3:
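Note on the new predict_row: it asks the KenLM model for the log10 probability of the left context, a candidate word, and the right context joined into one string, does this for every word in english_words_alpha_set, and keeps the six best-scoring candidates. A minimal sketch of that gap-scoring idea, assuming a model.binary built as above and the kenlm Python bindings; the candidates list is a hypothetical stand-in for english_words_alpha_set:

    from collections import Counter
    import kenlm

    model = kenlm.Model('model.binary')        # binary produced by build_kenlm_model()
    candidates = ['the', 'a', 'cat', 'house']  # hypothetical stand-in for english_words_alpha_set

    def top_gap_words(before, after, k=6):
        # score() returns a log10 probability of the whole string; bos/eos=False
        # stops KenLM from padding the fragment with sentence-boundary markers.
        scores = {w: model.score(f'{before} {w} {after}', bos=False, eos=False) for w in candidates}
        return Counter(scores).most_common(k)

    print(top_gap_words('i saw', 'on the roof'))

Because score() yields log10 values rather than probabilities, the trailing f':{log10(0.01)}' term in the diff serves as the leftover mass for unseen words on that same log scale, replacing the old ':{max(1 - prob, 0.01)}' remainder.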
@@ -73,8 +64,9 @@ class Model():
-alpha = float(sys.argv[1])
-print(f'alpha: {alpha}')
-model = Model(alpha, 'train', sys.argv[2])
-model.train()
-model.predict()
+training = TrainModel('train')
+training.build_kenlm_model()
+training.predict('test-A')
+training.predict('dev-0')
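The hand-rolled trigram training loop is replaced by a two-step KenLM build: the concatenated text column is dumped to text.txt, lmplz estimates an order-4 (-o 4) ARPA model, and build_binary converts the ARPA text format into a memory-mapped binary that kenlm.Model() loads quickly. A sketch of the same pipeline using subprocess instead of os.system, assuming KenLM was compiled under ./kenlm/build/bin as the diff does:

    import subprocess

    with open('text.txt') as fin, open('model.arpa', 'w') as fout:
        # -o 4 sets the n-gram order; --skip_symbols tolerates <s>, </s> and <unk>
        # tokens in the raw training text instead of aborting the estimation.
        subprocess.run(['./kenlm/build/bin/lmplz', '-o', '4', '--skip_symbols'],
                       stdin=fin, stdout=fout, check=True)
    subprocess.run(['./kenlm/build/bin/build_binary', 'model.arpa', 'model.binary'],
                   check=True)

With check=True a failed lmplz run raises an exception instead of silently leaving an empty ARPA file, which the os.system calls in the diff would not catch.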

File diff suppressed because it is too large