98 lines
2.8 KiB
Plaintext
98 lines
2.8 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Train an order-3 KenLM model on fields 7 and 8 of the compressed training TSV.\n",
"# The sed steps work on the literal backslash-n markers found in the text: hyphen + marker is\n",
"# treated as a word split across lines (the halves are re-joined), every other marker becomes a\n",
"# space, and any backslash still left afterwards is dropped.\n",
"!xzcat -f1 ../train/in.tsv.xz | cut -f7,8 | sed -e 's/-\\\\n//g' -e 's/\\\\n/ /g' -e 's/\\\\//g' | ../../kenlm/build/bin/lmplz -o 3 > kenlm_model.arpa\n",
"!../../kenlm/build/bin/build_binary kenlm_model.arpa kenlm_model.binary"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"pycharm": {
|
||
|
"is_executing": true
|
||
|
}
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import re\n",
|
||
|
"\n",
|
||
|
"# Informal contractions mapped to their expanded forms. Lookups are\n",
"# case-sensitive; \"'ll\" is a suffix rule, so \"they'll\" becomes \"they will\".\n",
"CONTRACTIONS = {\n",
"    \"I'm\": \"I am\",\n",
"    \"you're\": \"you are\",\n",
"    \"he's\": \"he is\",\n",
"    \"she's\": \"she is\",\n",
"    \"it's\": \"it is\",\n",
"    \"we're\": \"we are\",\n",
"    \"they're\": \"they are\",\n",
"    \"aren't\": \"are not\",\n",
"    \"don't\": \"do not\",\n",
"    \"doesn't\": \"does not\",\n",
"    \"weren't\": \"were not\",\n",
"    \"'ll\": \" will\",\n",
"}\n",
"\n",
"# Built once when the cell runs instead of on every formalize_text() call.\n",
"# Keys are escaped so an entry containing regex metacharacters still matches literally.\n",
"CONTRACTION_PATTERN = re.compile(r'\\b(' + '|'.join(map(re.escape, CONTRACTIONS)) + r')\\b')\n",
"\n",
"\n",
"def formalize_text(text):\n",
"    \"\"\"Expand known contractions and undo hard line wrapping.\n",
"\n",
"    Args:\n",
"        text: String to normalise.\n",
"\n",
"    Returns:\n",
"        ``text`` with every whole-word occurrence of a ``CONTRACTIONS`` key\n",
"        replaced by its value, hyphen + newline pairs removed, and each\n",
"        remaining newline character turned into a single space.\n",
"    \"\"\"\n",
"    text = CONTRACTION_PATTERN.sub(lambda match: CONTRACTIONS[match.group()], text)\n",
"\n",
"    # A hyphen directly before a newline is a word split across two lines:\n",
"    # drop both so the halves re-join. Any other newline becomes a space.\n",
"    text = text.replace('-\\n', '')\n",
"    text = text.replace('\\n', ' ')\n",
"\n",
"    return text\n",
|
||
|
"\n",
|
||
|
"\n",
|
||
|
"def clean_string(text):\n",
"    \"\"\"Turn one raw TSV field into plain running text.\n",
"\n",
"    Runs ``formalize_text`` first, then resolves the literal two-character\n",
"    backslash-n markers still present in the text, and finally trims\n",
"    surrounding whitespace.\n",
"\n",
"    Args:\n",
"        text: Raw field contents.\n",
"\n",
"    Returns:\n",
"        The cleaned string.\n",
"    \"\"\"\n",
"    text = formalize_text(text)\n",
"    # Hyphen + marker is a word split across lines, so the halves are glued\n",
"    # back together. The leading space is optional so that both\n",
"    # 'peo-<marker>ple' and 'peo -<marker>ple' collapse to 'people'.\n",
"    text = re.sub(r\" ?-\\\\*\\\\n\", \"\", text)\n",
"    # Every remaining marker is an ordinary line break.\n",
"    text = re.sub(r\"\\\\n\", \" \", text)\n",
"    return text.strip()\n",
|
||
|
"\n",
|
||
|
"\n",
|
||
|
"print(\"Reading train data...\")\n",
"records = []\n",
"with open(\"../train/in.tsv\", encoding=\"utf8\", mode=\"rt\") as file, open(\"../train/expected.tsv\", encoding=\"utf8\", mode=\"rt\") as expected:\n",
"    for t_line, e_line in zip(file, expected):\n",
"        fields = t_line.split(\"\\t\")\n",
"        # Last two input columns with the matching expected.tsv line between them.\n",
"        pieces = (clean_string(fields[-2]), clean_string(e_line), clean_string(fields[-1]))\n",
"        records.append(\" \".join(pieces))\n",
"\n",
"# One record per line. Appending records back to back would fuse the last\n",
"# word of a record with the first word of the next one, because\n",
"# clean_string() strips the whitespace at both ends of every piece.\n",
"train_text = \"\".join(f\"{record}\\n\" for record in records)\n",
"\n",
"# save train_text to file\n",
"print(\"saving to file...\")\n",
"with open(\"train_text.txt\", encoding=\"utf8\", mode=\"w\") as file:\n",
"    file.write(train_text)\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "python11",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.11.3"
|
||
|
},
|
||
|
"orig_nbformat": 4
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 2
|
||
|
}
|