kenLM #3
parent 52c529193d, commit 8c25eb8da8

lab6/kenlm.ipynb (97 lines, new file)
@@ -0,0 +1,97 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!xzcat -f1 ../train/in.tsv.xz | cut -f7,8 | sed 's/-\\\\n/ /g' | sed 's/\\\\n//g' | sed 's/\\\\//g' | ../../kenlm/build/bin/lmplz -o 3 > kenlm_model.arpa\n",
    "!../../kenlm/build/bin/build_binary kenlm_model.arpa kenlm_model.binary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "CONTRACTIONS = {\n",
    "    \"I'm\": \"I am\",\n",
    "    \"you're\": \"you are\",\n",
    "    \"he's\": \"he is\",\n",
    "    \"she's\": \"she is\",\n",
    "    \"it's\": \"it is\",\n",
    "    \"we're\": \"we are\",\n",
    "    \"they're\": \"they are\",\n",
    "    \"aren't\": \"are not\",\n",
    "    \"don't\": \"do not\",\n",
    "    \"doesn't\": \"does not\",\n",
    "    \"weren't\": \"were not\",\n",
    "    \"'ll\": \" will\",\n",
    "}\n",
    "\n",
    "\n",
    "def formalize_text(text):\n",
    "    # Replace contractions using regular expressions\n",
    "    pattern = re.compile(r'\\b(' + '|'.join(CONTRACTIONS.keys()) + r')\\b')\n",
    "    text = pattern.sub(lambda x: CONTRACTIONS[x.group()], text)\n",
    "\n",
    "    # Remove hyphens at the end of lines and replace newlines with spaces\n",
    "    text = text.replace('-\\n', '')\n",
    "    text = text.replace('\\n', ' ')\n",
    "\n",
    "    return text\n",
    "\n",
    "\n",
    "def clean_string(text):\n",
    "    text = formalize_text(text)\n",
    "    text = re.sub(r\" -\\\\*\\\\n\", \"\", text)\n",
    "    text = re.sub(r\"\\\\n\", \" \", text)\n",
    "    text = text.strip()\n",
    "    return text\n",
    "\n",
    "\n",
    "train_text = \"\"\n",
    "print(\"Reading train data...\")\n",
    "with open(\"../train/in.tsv\", encoding=\"utf8\", mode=\"rt\") as file, open(\"../train/expected.tsv\", encoding=\"utf8\", mode=\"rt\") as expected:\n",
    "    for t_line, e_line in zip(file, expected):\n",
    "        t_line = t_line.split(\"\\t\")\n",
    "        train_text += clean_string(t_line[-2]) + f\" {clean_string(e_line)} \" + clean_string(t_line[-1])\n",
    "\n",
    "# save train_text to file\n",
    "print(\"saving to file...\")\n",
    "with open(\"train_text.txt\", encoding=\"utf8\", mode=\"w\") as file:\n",
    "    file.write(train_text)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python11",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
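The first cell above builds kenlm_model.binary; a quick way to check that the build worked is to load it with the kenlm Python bindings (the same API the script below relies on) and score a phrase. A minimal sketch: the model path comes from the build cell, the example phrase is arbitrary.

import kenlm

# load the binary model produced by build_binary
model = kenlm.Model("kenlm_model.binary")

# total log10 probability of the phrase, without sentence-boundary markers
print(model.score("united states of america", bos=False, eos=False))

# per-word perplexity of the same phrase
print(model.perplexity("united states of america"))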
lab6/kenlm_script.py (99 lines, new file)
@@ -0,0 +1,99 @@
from tqdm import tqdm
import regex as re
import kenlm
import pickle
import math

# binary model built in lab6/kenlm.ipynb
path = 'kenlm_model.binary'
model = kenlm.Model(path)

CONTRACTIONS = {
    "I'm": "I am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "aren't": "are not",
    "don't": "do not",
    "doesn't": "does not",
    "weren't": "were not",
    "'ll": " will",
}


def formalize_text(text):
    # Replace contractions using regular expressions
    pattern = re.compile(r'\b(' + '|'.join(CONTRACTIONS.keys()) + r')\b')
    text = pattern.sub(lambda x: CONTRACTIONS[x.group()], text)

    # Remove hyphens at the end of lines and replace newlines with spaces
    text = text.replace('-\n', '')
    text = text.replace('\n', ' ')

    return text


def clean_string(text):
    text = formalize_text(text)
    text = re.sub(r" -\\*\\n", "", text)
    text = re.sub(r"\\n", " ", text)
    text = text.strip()
    return text

def p(text):
    # Squash the kenlm log10 score through a sigmoid so every candidate gets
    # a value in (0, 1); this is a heuristic ranking score, not a normalized
    # probability.
    return 1 / (1 + math.exp(-(model.score(text, bos=False, eos=False))))


def perplexity(text):
    return model.perplexity(text)


def predict_probs_w1w2wi(w1, w2):
    # Score every vocabulary word as a continuation of the left context
    # (w1, w2) and keep the five best candidates.
    best_scores = []
    pred_str = ""
    for word in V_counter:
        w1w2 = ' '.join([w2, word])
        w1w2w3 = ' '.join([w1, w2, word])

        # interpolate unigram, bigram and trigram scores
        text_score = 0.1 * p(word) + 0.3 * p(w1w2) + 0.6 * p(w1w2w3)

        if len(best_scores) < 5:
            best_scores.append((word, text_score))
            best_scores = sorted(best_scores, key=lambda tup: tup[1], reverse=True)
        else:
            worst_score = best_scores[-1]
            if worst_score[1] < text_score:
                best_scores[-1] = (word, text_score)
                best_scores = sorted(best_scores, key=lambda tup: tup[1], reverse=True)

    # output format: "word:score" pairs followed by ":<remaining mass>"
    for word, prob in best_scores:
        pred_str += f'{word}:{prob} '
    pred_str += f':{1 - sum(prob for _, prob in best_scores)}'
    return pred_str

def run_predictions(source_folder):
    print(f"Run predictions on {source_folder} data...")

    with open(f"{source_folder}/in.tsv", encoding="utf8", mode="rt") as file:
        input_lines = file.readlines()

    with open(f"{source_folder}/out.tsv", "w", encoding="utf-8") as output_file:
        for line in tqdm(input_lines):
            line = line.split("\t")

            # last two words of the left context (column 7 of in.tsv)
            w1, w2 = clean_string(line[-2]).split()[-2:]
            out_line = predict_probs_w1w2wi(w1, w2)

            output_file.write(out_line + "\n")


with open('V_3000.pickle', 'rb') as handle:
    V_counter = pickle.load(handle)

run_predictions("../dev-0")
# run_predictions("../test-A")
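kenlm_script.py loads a V_3000.pickle vocabulary that is not created anywhere in this commit. A minimal sketch of how such a file could be produced from the train_text.txt written by the notebook, assuming the vocabulary is simply the 3000 most frequent tokens (the filename suggests this, but it is a guess):

from collections import Counter
import pickle

# count whitespace-separated tokens in the cleaned training text
with open("train_text.txt", encoding="utf8") as f:
    token_counts = Counter(f.read().split())

# keep the 3000 most frequent tokens as the candidate vocabulary (assumption)
V_counter = Counter(dict(token_counts.most_common(3000)))

with open("V_3000.pickle", "wb") as handle:
    pickle.dump(V_counter, handle)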