{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# KenLM training data preparation\n",
    "\n",
    "1. Estimate a 3-gram KenLM model from columns 7-8 of `../train/in.tsv.xz`.\n",
    "2. Build `train_text.txt`: for every line, the last two columns of `../train/in.tsv` are joined around the matching line of `../train/expected.tsv`.\n",
    "\n",
    "The cleaning steps treat a literal backslash followed by `n` as an escaped line break."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train the KenLM model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Join words hyphenated across escaped line breaks, turn the remaining escaped\n",
    "# line breaks into spaces, drop stray backslashes, then estimate a 3-gram model.\n",
    "!xzcat -f1 ../train/in.tsv.xz | cut -f7,8 | sed 's/-\\\\n//g' | sed 's/\\\\n/ /g' | sed 's/\\\\//g' | ../../kenlm/build/bin/lmplz -o 3 > kenlm_model.arpa\n",
    "!../../kenlm/build/bin/build_binary kenlm_model.arpa kenlm_model.binary"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports and configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from pathlib import Path\n",
    "\n",
    "TRAIN_DIR = Path(\"../train\")\n",
    "TRAIN_IN_PATH = TRAIN_DIR / \"in.tsv\"\n",
    "TRAIN_EXPECTED_PATH = TRAIN_DIR / \"expected.tsv\"\n",
    "TRAIN_TEXT_PATH = Path(\"train_text.txt\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Text cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "CONTRACTIONS = {\n",
    "    \"I'm\": \"I am\",\n",
    "    \"you're\": \"you are\",\n",
    "    \"he's\": \"he is\",\n",
    "    \"she's\": \"she is\",\n",
    "    \"it's\": \"it is\",\n",
    "    \"we're\": \"we are\",\n",
    "    \"they're\": \"they are\",\n",
    "    \"aren't\": \"are not\",\n",
    "    \"don't\": \"do not\",\n",
    "    \"doesn't\": \"does not\",\n",
    "    \"weren't\": \"were not\",\n",
    "    \"'ll\": \" will\",\n",
    "}\n",
    "\n",
    "# Compiled once rather than on every call. Keys are escaped and ordered\n",
    "# longest-first so that no alternative can shadow a longer one.\n",
    "CONTRACTION_PATTERN = re.compile(\n",
    "    r\"\\b(\"\n",
    "    + \"|\".join(re.escape(key) for key in sorted(CONTRACTIONS, key=len, reverse=True))\n",
    "    + r\")\\b\"\n",
    ")\n",
    "\n",
    "# A hyphen (optionally preceded by one space) right before an escaped line\n",
    "# break, i.e. one or more backslashes followed by the letter n. The space is\n",
    "# optional so that a word split as 'exam-' + break + 'ple' is rejoined too.\n",
    "HYPHENATED_BREAK_PATTERN = re.compile(r\" ?-\\\\+n\")\n",
    "\n",
    "# Any remaining escaped line break.\n",
    "ESCAPED_NEWLINE_PATTERN = re.compile(r\"\\\\+n\")\n",
    "\n",
    "\n",
    "def formalize_text(text):\n",
    "    \"\"\"Expand the contractions listed in CONTRACTIONS and flatten real line breaks.\n",
    "\n",
    "    A hyphen immediately followed by a newline character is removed so that\n",
    "    both halves of the word are joined; every other newline character\n",
    "    becomes a single space. Contraction matching is case-sensitive.\n",
    "    \"\"\"\n",
    "    text = CONTRACTION_PATTERN.sub(lambda match: CONTRACTIONS[match.group()], text)\n",
    "    text = text.replace(\"-\\n\", \"\")\n",
    "    return text.replace(\"\\n\", \" \")\n",
    "\n",
    "\n",
    "def clean_string(text):\n",
    "    \"\"\"Return text with contractions expanded and all line breaks flattened.\n",
    "\n",
    "    Applies formalize_text first, then handles escaped line breaks (the\n",
    "    characters backslash + n): a hyphen directly before one is dropped\n",
    "    together with the break, any other escaped break becomes a space.\n",
    "    Leading and trailing whitespace is stripped from the result.\n",
    "    \"\"\"\n",
    "    text = formalize_text(text)\n",
    "    text = HYPHENATED_BREAK_PATTERN.sub(\"\", text)\n",
    "    text = ESCAPED_NEWLINE_PATTERN.sub(\" \", text)\n",
    "    return text.strip()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build and save the training text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_training_record(in_line, expected_line):\n",
    "    \"\"\"Combine one input line and its expected line into a single cleaned record.\n",
    "\n",
    "    The record is: second-to-last column of in_line, the expected text, then\n",
    "    the last column of in_line, each cleaned with clean_string and separated\n",
    "    by single spaces.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if in_line has fewer than two tab-separated columns.\n",
    "    \"\"\"\n",
    "    fields = in_line.split(\"\\t\")\n",
    "    if len(fields) < 2:\n",
    "        raise ValueError(f\"expected at least 2 tab-separated columns, got {len(fields)}\")\n",
    "    return f\"{clean_string(fields[-2])} {clean_string(expected_line)} {clean_string(fields[-1])}\"\n",
    "\n",
    "\n",
    "def build_train_text(in_path, expected_path):\n",
    "    \"\"\"Read both files in parallel and return the cleaned records, one per line.\n",
    "\n",
    "    Every record is terminated by a newline so that consecutive records are\n",
    "    never fused into one another. Records are collected in a list and joined\n",
    "    once instead of growing a string inside the loop.\n",
    "    \"\"\"\n",
    "    records = []\n",
    "    with open(in_path, encoding=\"utf8\", mode=\"rt\") as in_file, open(expected_path, encoding=\"utf8\", mode=\"rt\") as expected_file:\n",
    "        for in_line, expected_line in zip(in_file, expected_file):\n",
    "            records.append(build_training_record(in_line, expected_line))\n",
    "    return \"\".join(f\"{record}\\n\" for record in records)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Reading train data...\")\n",
    "train_text = build_train_text(TRAIN_IN_PATH, TRAIN_EXPECTED_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save train_text to file\n",
    "print(\"saving to file...\")\n",
    "with open(TRAIN_TEXT_PATH, encoding=\"utf8\", mode=\"w\") as file:\n",
    "    file.write(train_text)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python11",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}