challenging-america-word-ga.../lab6/kenlm.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train a trigram (-o 3) KenLM model on fields 7 and 8 of in.tsv\n",
    "# (presumably the left/right context columns - confirm against the dataset README).\n",
    "# The text stores line breaks as the two characters backslash + n, so:\n",
    "#   1. drop hyphen + line break, re-joining words that were split across lines,\n",
    "#   2. turn every remaining line break into a space so neighbouring words stay apart,\n",
    "#   3. strip any leftover backslashes.\n",
    "!xzcat -f1 ../train/in.tsv.xz | cut -f7,8 | sed 's/-\\\\n//g' | sed 's/\\\\n/ /g' | sed 's/\\\\//g' | ../../kenlm/build/bin/lmplz -o 3 > kenlm_model.arpa\n",
    "# Convert the ARPA text model into KenLM's binary format.\n",
    "!../../kenlm/build/bin/build_binary kenlm_model.arpa kenlm_model.binary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "CONTRACTIONS = {\n",
    "    \"I'm\": \"I am\",\n",
    "    \"you're\": \"you are\",\n",
    "    \"he's\": \"he is\",\n",
    "    \"she's\": \"she is\",\n",
    "    \"it's\": \"it is\",\n",
    "    \"we're\": \"we are\",\n",
    "    \"they're\": \"they are\",\n",
    "    \"aren't\": \"are not\",\n",
    "    \"don't\": \"do not\",\n",
    "    \"doesn't\": \"does not\",\n",
    "    \"weren't\": \"were not\",\n",
    "    \"'ll\": \" will\",\n",
    "}\n",
    "\n",
    "# Lower-cased view of CONTRACTIONS so capitalised forms (\"It's\", \"Don't\") are found too.\n",
    "_CONTRACTIONS_BY_LOWER = {key.lower(): value for key, value in CONTRACTIONS.items()}\n",
    "\n",
    "# Compiled once rather than on every call. Keys are escaped so each one is matched\n",
    "# literally; re-run this cell after editing CONTRACTIONS to refresh the pattern.\n",
    "_CONTRACTION_PATTERN = re.compile(\n",
    "    r'\\b(' + '|'.join(re.escape(key) for key in _CONTRACTIONS_BY_LOWER) + r')\\b',\n",
    "    re.IGNORECASE,\n",
    ")\n",
    "\n",
    "\n",
    "def _expand_contraction(match):\n",
    "    \"\"\"Return the expansion of a matched contraction, mirroring its capitalisation.\"\"\"\n",
    "    found = match.group()\n",
    "    expansion = _CONTRACTIONS_BY_LOWER.get(found.lower())\n",
    "    if expansion is None:\n",
    "        # Unicode case-insensitive matching can accept characters (e.g. the long s)\n",
    "        # whose lower-case form is not in the table; leave such text untouched.\n",
    "        return found\n",
    "    if found.isupper():\n",
    "        return expansion.upper()  # DON'T -> DO NOT\n",
    "    if found[0].isupper():\n",
    "        return expansion[0].upper() + expansion[1:]  # Don't -> Do not\n",
    "    return expansion\n",
    "\n",
    "\n",
    "def formalize_text(text):\n",
    "    \"\"\"Expand the contractions listed in CONTRACTIONS and flatten line breaks.\n",
    "\n",
    "    Contractions are matched on word boundaries, ignoring case, and the expansion\n",
    "    keeps the capitalisation of the matched text. A hyphen directly followed by a\n",
    "    newline character is removed together with the newline, re-joining a word\n",
    "    split across lines; every other newline character becomes a single space.\n",
    "\n",
    "    Args:\n",
    "        text: The string to normalise.\n",
    "\n",
    "    Returns:\n",
    "        The normalised string.\n",
    "    \"\"\"\n",
    "    text = _CONTRACTION_PATTERN.sub(_expand_contraction, text)\n",
    "\n",
    "    # Order matters: handle hyphen + newline first, otherwise the hyphen would survive.\n",
    "    text = text.replace('-\\n', '')\n",
    "    text = text.replace('\\n', ' ')\n",
    "\n",
    "    return text\n",
    "\n",
    "\n",
    "def clean_string(text):\n",
    "    \"\"\"Normalise one TSV field: expand contractions and remove encoded line breaks.\n",
    "\n",
    "    The field is first passed through formalize_text. Line breaks that appear in\n",
    "    the text as the literal characters backslash + n are then handled: a hyphen\n",
    "    directly before such a break is removed with it, so the two halves of a\n",
    "    hyphenated word are joined; every remaining break becomes a space. Leading\n",
    "    and trailing whitespace is stripped from the result.\n",
    "\n",
    "    Args:\n",
    "        text: Raw field text.\n",
    "\n",
    "    Returns:\n",
    "        The cleaned, stripped string.\n",
    "    \"\"\"\n",
    "    text = formalize_text(text)\n",
    "    # The pattern must not demand a space before the hyphen: words are split as\n",
    "    # 'com-' + break + 'mittee', and consuming a preceding space would glue\n",
    "    # unrelated words together.\n",
    "    text = re.sub(r\"-\\\\*\\\\n\", \"\", text)\n",
    "    text = re.sub(r\"\\\\n\", \" \", text)\n",
    "    return text.strip()\n",
    "\n",
    "\n",
    "print(\"Reading train data...\")\n",
    "records = []\n",
    "with open(\"../train/in.tsv\", encoding=\"utf8\", mode=\"rt\") as file, open(\"../train/expected.tsv\", encoding=\"utf8\", mode=\"rt\") as expected:\n",
    "    for t_line, e_line in zip(file, expected):\n",
    "        t_line = t_line.split(\"\\t\")\n",
    "        # Second-to-last column, the matching expected.tsv entry, then the last column.\n",
    "        records.append(f\"{clean_string(t_line[-2])} {clean_string(e_line)} {clean_string(t_line[-1])}\")\n",
    "\n",
    "# One record per line: without a separator the last word of a row fuses with the\n",
    "# first word of the next one. Joining once also avoids growing a string in the loop.\n",
    "train_text = \"\".join(f\"{record}\\n\" for record in records)\n",
    "\n",
    "# save train_text to file\n",
    "print(\"saving to file...\")\n",
    "with open(\"train_text.txt\", encoding=\"utf8\", mode=\"w\") as file:\n",
    "    file.write(train_text)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python11",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
kenLM #3 2023-04-25 00:27:37 +02:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"!xzcat -f1 ../train/in.tsv.xz \| cut -f7,8 \| sed 's/-\\\\n/ /g' \| sed 's/\\\\n//g' \| sed 's/\\\\//g' \| ../../kenlm/build/bin/lmplz -o 3 > kenlm_model.arpa\n",`
			`"!../../kenlm/build/bin/build_binary kenlm_model.arpa kenlm_model.binary"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {`
			`"pycharm": {`
			`"is_executing": true`
			`}`
			`},`
			`"outputs": [],`
			`"source": [`
			`"import re\n",`
			`"\n",`
			`"CONTRACTIONS = {\n",`
			`" \"I'm\": \"I am\",\n",`
			`" \"you're\": \"you are\",\n",`
			`" \"he's\": \"he is\",\n",`
			`" \"she's\": \"she is\",\n",`
			`" \"it's\": \"it is\",\n",`
			`" \"we're\": \"we are\",\n",`
			`" \"they're\": \"they are\",\n",`
			`" \"aren't\": \"are not\",\n",`
			`" \"don't\": \"do not\",\n",`
			`" \"doesn't\": \"does not\",\n",`
			`" \"weren't\": \"were not\",\n",`
			`" \"'ll\": \" will\",\n",`
			`"}\n",`
			`"\n",`
			`"\n",`
			`"def formalize_text(text):\n",`
			`" # Replace contractions using regular expressions\n",`
			`" pattern = re.compile(r'\\b(' + '\|'.join(CONTRACTIONS.keys()) + r')\\b')\n",`
			`" text = pattern.sub(lambda x: CONTRACTIONS[x.group()], text)\n",`
			`"\n",`
			`" # Remove hyphens at the end of lines and replace newlines with spaces\n",`
			`" text = text.replace('-\\n', '')\n",`
			`" text = text.replace('\\n', ' ')\n",`
			`"\n",`
			`" return text\n",`
			`"\n",`
			`"\n",`
			`"def clean_string(text):\n",`
			`" text = formalize_text(text)\n",`
			`" text = re.sub(r\" -\\\\*\\\\n\", \"\", text)\n",`
			`" text = re.sub(r\"\\\\n\", \" \", text)\n",`
			`" text = text.strip()\n",`
			`" return text\n",`
			`"\n",`
			`"\n",`
			`"train_text = \"\"\n",`
			`"print(\"Reading train data...\")\n",`
			`"with open(\"../train/in.tsv\", encoding=\"utf8\", mode=\"rt\") as file, open(\"../train/expected.tsv\", encoding=\"utf8\", mode=\"rt\") as expected:\n",`
			`" for t_line, e_line in zip(file, expected):\n",`
			`" t_line = t_line.split(\"\\t\")\n",`
			`" train_text += clean_string(t_line[-2]) + f\" {clean_string(e_line)} \" + clean_string(t_line[-1])\n",`
			`"\n",`
			`"# save train_text to file\n",`
			`"print(\"saving to file...\")\n",`
			`"with open(\"train_text.txt\", encoding=\"utf8\", mode=\"w\") as file:\n",`
			`" file.write(train_text)\n"`
			`]`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "python11",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.11.3"`
			`},`
			`"orig_nbformat": 4`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 2`
			`}`