mt-summit-corpora/jupyter-injector.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "## Lemmatize glossary\n",
    "TODO: split the glossary into train and test sets."
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [
    {
     "data": {
      "text/plain": "                                    source  \\\nsource_lem                                   \naaofi                                aaofi   \naca                                    aca   \nacca                                  acca   \nabacus                              abacus   \nabandonment cost         abandonment costs   \n...                                    ...   \nytd                                    ytd   \nyear-end                          year-end   \nyear-to-date                  year-to-date   \nzog                                    zog   \nzero overhead growth  zero overhead growth   \n\n                                                                 result  \\\nsource_lem                                                                \naaofi                 organizacja rachunkowości i audytu dla islamsk...   \naca                   członek stowarzyszenia dyplomowanych biegłych ...   \nacca                   stowarzyszenie dyplomowanych biegłych rewidentów   \nabacus                                                         liczydło   \nabandonment cost                                     koszty zaniechania   \n...                                                                 ...   \nytd                                                    od początku roku   \nyear-end                                                    koniec roku   \nyear-to-date                                           od początku roku   \nzog                                       zero wzrostu kosztów ogólnych   \nzero overhead growth                      zero wzrostu kosztów ogólnych   \n\n                                                             result_lem  \nsource_lem                                                               \naaofi                 organizacja rachunkowość i audyt dla islamski ...  \naca                   członek stowarzyszenie dyplomowany biegły rewi...  
\nacca                         stowarzyszenie dyplomowany biegły rewident  \nabacus                                                         liczydło  \nabandonment cost                                      koszt zaniechanie  \n...                                                                 ...  \nytd                                                     od początek rok  \nyear-end                                                     koniec rok  \nyear-to-date                                            od początek rok  \nzog                                            zero wzrost koszt ogólny  \nzero overhead growth                           zero wzrost koszt ogólny  \n\n[1197 rows x 3 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>source</th>\n      <th>result</th>\n      <th>result_lem</th>\n    </tr>\n    <tr>\n      <th>source_lem</th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>aaofi</th>\n      <td>aaofi</td>\n      <td>organizacja rachunkowości i audytu dla islamsk...</td>\n      <td>organizacja rachunkowość i audyt dla islamski ...</td>\n    </tr>\n    <tr>\n      <th>aca</th>\n      <td>aca</td>\n      <td>członek stowarzyszenia dyplomowanych biegłych ...</td>\n      <td>członek stowarzyszenie dyplomowany biegły rewi...</td>\n    </tr>\n    <tr>\n      <th>acca</th>\n      <td>acca</td>\n      <td>stowarzyszenie dyplomowanych biegłych rewidentów</td>\n      <td>stowarzyszenie dyplomowany biegły rewident</td>\n    </tr>\n    <tr>\n      <th>abacus</th>\n      <td>abacus</td>\n      <td>liczydło</td>\n      <td>liczydło</td>\n    </tr>\n    <tr>\n      <th>abandonment cost</th>\n      <td>abandonment costs</td>\n      <td>koszty zaniechania</td>\n      <td>koszt zaniechanie</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>ytd</th>\n      <td>ytd</td>\n      <td>od początku roku</td>\n      <td>od początek rok</td>\n    </tr>\n    <tr>\n      <th>year-end</th>\n      <td>year-end</td>\n      <td>koniec roku</td>\n      <td>koniec rok</td>\n    </tr>\n    <tr>\n      <th>year-to-date</th>\n      <td>year-to-date</td>\n      <td>od początku roku</td>\n      <td>od początek rok</td>\n    </tr>\n    <tr>\n      <th>zog</th>\n      <td>zog</td>\n      <td>zero 
wzrostu kosztów ogólnych</td>\n      <td>zero wzrost koszt ogólny</td>\n    </tr>\n    <tr>\n      <th>zero overhead growth</th>\n      <td>zero overhead growth</td>\n      <td>zero wzrostu kosztów ogólnych</td>\n      <td>zero wzrost koszt ogólny</td>\n    </tr>\n  </tbody>\n</table>\n<p>1197 rows × 3 columns</p>\n</div>"
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import time\n",
    "\n",
    "import pandas as pd\n",
    "import spacy\n",
    "\n",
    "\n",
    "spacy_nlp_en = spacy.load('en_core_web_sm')\n",
    "spacy_nlp_pl = spacy.load(\"pl_core_news_sm\")\n",
    "\n",
    "glossary = pd.read_csv('kompendium.tsv', sep='\\t', header=None, names=['source', 'result'])\n",
    "\n",
    "\n",
    "def _lemmatize_term(pipeline, term):\n",
    "    \"\"\"Return `term` as space-joined spaCy lemmas, with the spacing around - ’ / ( ) tightened.\"\"\"\n",
    "    lemmas = ' '.join(token.lemma_ for token in pipeline(term))\n",
    "    # Same replacements, in the same order, for both languages.\n",
    "    for spaced, compact in ((' - ', '-'), (' ’', '’'), (' / ', '/'), (' ( ', '('), (' ) ', ')')):\n",
    "        lemmas = lemmas.replace(spaced, compact)\n",
    "    return lemmas\n",
    "\n",
    "\n",
    "# Lemmatize both sides of the glossary: English terms and their Polish translations.\n",
    "source_lemmatized = [_lemmatize_term(spacy_nlp_en, term) for term in glossary['source']]\n",
    "result_lemmatized = [_lemmatize_term(spacy_nlp_pl, term) for term in glossary['result']]\n",
    "\n",
    "glossary['source_lem'] = source_lemmatized\n",
    "glossary['result_lem'] = result_lemmatized\n",
    "glossary = glossary[['source', 'source_lem', 'result', 'result_lem']]\n",
    "# Display only: the re-indexed frame is not assigned, so `glossary` keeps its default index.\n",
    "glossary.set_index('source_lem')\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [],
   "source": [
    "# Save the lemmatized glossary as TSV; the default integer index is written as the first column.\n",
    "glossary.to_csv(path_or_buf='kompendium_lem.tsv', sep='\\t', index=True)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Lemmatize corpus"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [],
   "source": [
    "dev_path = 'mt-summit-corpora/dev/dev'\n",
    "\n",
    "# NOTE: `skip_chars` is a plain string, so the `not in` test below is a substring check:\n",
    "# it drops the single punctuation lemmas , . / ! ? and any other lemma that is a substring of it.\n",
    "skip_chars = ''',./!?'''\n",
    "\n",
    "\n",
    "def _lemmatize_file(path, pipeline):\n",
    "    \"\"\"Lemmatize a text file line by line.\n",
    "\n",
    "    Args:\n",
    "        path: path of the text file to read (decoded as UTF-8).\n",
    "        pipeline: loaded spaCy pipeline used to tokenize and lemmatize each line.\n",
    "\n",
    "    Returns:\n",
    "        A list with one string per input line: the space-joined lemmas, without the lemmas\n",
    "        filtered by `skip_chars`, and with the spacing around - ’ / ( ) tightened.\n",
    "    \"\"\"\n",
    "    lemmatized = []\n",
    "    # Explicit UTF-8: the platform default encoding is not guaranteed to decode the Polish text.\n",
    "    with open(path, 'r', encoding='utf-8') as file:\n",
    "        for line in file:\n",
    "            # The line is passed with its line terminator, exactly as read from the file.\n",
    "            lemmas = [token.lemma_ for token in pipeline(line)]\n",
    "            text = ' '.join(x for x in lemmas if x not in skip_chars)\n",
    "            for spaced, compact in ((' - ', '-'), (' ’', '’'), (' / ', '/'), (' ( ', '('), (' ) ', ')')):\n",
    "                text = text.replace(spaced, compact)\n",
    "            lemmatized.append(text)\n",
    "    return lemmatized\n",
    "\n",
    "\n",
    "file_lemmatized = _lemmatize_file(dev_path + '.en', spacy_nlp_en)\n",
    "file_pl_lemmatized = _lemmatize_file(dev_path + '.pl', spacy_nlp_pl)\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "in the course of the control the control audit firm shall fulfil the responsibility refer to in article 114 on date and in form specify by the controller \n",
      "\n",
      "w czas trwanie kontrola kontrolowany firma audytorski wypełnia obowiązek o których mowa w art 114 w ter-mina i forma wskazany przez osoba kontrolującą \n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Spot-check one aligned sentence pair (index 2) from the lemmatized corpora.\n",
    "for sample in (file_lemmatized[2], file_pl_lemmatized[2]):\n",
    "    print(sample)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Inject glossary\n",
    "**!!! Obsolete !!!** This section is kept for reference only."
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "import spacy\n",
    "from spaczz.matcher import FuzzyMatcher\n",
    "\n",
    "\n",
    "# Reload the lemmatized glossary; the first TSV column is used as the index.\n",
    "glossary = pd.read_csv('kompendium_lem.tsv', sep='\\t', header=0, index_col=0)\n",
    "# Glossary lemmas that are never registered with the matcher.\n",
    "bad_words = ['ocf', 'toc', 'vas', 'vat']\n",
    "# Keep the rows whose position is not a multiple of 6.\n",
    "train_positions = [pos for pos in range(len(glossary)) if pos % 6 != 0]\n",
    "train_glossary = glossary.iloc[train_positions]\n",
    "\n",
    "nlp = spacy.blank(\"en\")\n",
    "matcher = FuzzyMatcher(nlp.vocab)\n",
    "for term in train_glossary['source_lem']:\n",
    "    if term in bad_words:\n",
    "        continue\n",
    "    matcher.add(term, [nlp(term)])\n",
    "\n",
    "\n",
    "en = []\n",
    "translation_line_counts = []\n",
    "for line in file_lemmatized:\n",
    "    doc = nlp(line)\n",
    "\n",
    "    entries_for_line = 0\n",
    "    for match_id, _start, end, ratio in matcher(doc):\n",
    "        if not ratio > 90:\n",
    "            continue\n",
    "        entries_for_line += 1\n",
    "        term_rows = train_glossary.loc[lambda df: df['source_lem'] == match_id]\n",
    "        polish = term_rows['result'].astype(str).values.flatten()\n",
    "        # `polish` is an array, so the concatenation is element-wise; join() glues the pieces into one string.\n",
    "        en.append(''.join(doc[:end].text + ' ' + polish + ' ' + doc[end:].text))\n",
    "\n",
    "    # A line without any accepted match is passed through unchanged and counts as one entry.\n",
    "    if entries_for_line == 0:\n",
    "        entries_for_line = 1\n",
    "        en.append(line)\n",
    "    translation_line_counts.append(entries_for_line)\n",
    "\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": true
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [],
   "source": [
    "# Read the Polish side as raw lines. pd.read_csv is avoided on purpose: it parses the file as CSV\n",
    "# (quote characters, blank lines, NA-like values), which can merge or drop sentences and so\n",
    "# break the line-by-line alignment with the English side.\n",
    "with open(dev_path + '.pl', 'r', encoding='utf-8') as file_src:\n",
    "    pl_sentences = [row.rstrip('\\n') for row in file_src]\n",
    "\n",
    "# Fail loudly instead of silently writing misaligned files.\n",
    "if len(pl_sentences) != len(translation_line_counts):\n",
    "    raise ValueError(\n",
    "        f'{len(pl_sentences)} Polish sentences but {len(translation_line_counts)} line counts; '\n",
    "        're-run the injection cell for this corpus'\n",
    "    )\n",
    "\n",
    "# Repeat every Polish sentence once per English entry produced from its source line.\n",
    "with open(dev_path + '.injected.pl', 'w', encoding='utf-8') as file_pl:\n",
    "    for sentence, repeats in zip(pl_sentences, translation_line_counts):\n",
    "        file_pl.write((sentence + '\\n') * repeats)\n",
    "\n",
    "# English entries are written as they are, without adding a line terminator.\n",
    "with open(dev_path + '.injected.en', 'w', encoding='utf-8') as file_en:\n",
    "    for line in en:\n",
    "        file_en.write(line)\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Inject glossary Polish crosscheck"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [],
   "source": [
    "import spacy\n",
    "from spaczz.matcher import FuzzyMatcher\n",
    "\n",
    "# glossary\n",
    "glossary = pd.read_csv('kompendium_lem.tsv', sep='\\t', header=0, index_col=0)\n",
    "train_glossary = glossary.iloc[[x for x in range(len(glossary)) if x % 6 != 0]]\n",
    "\n",
    "# English lemma -> [(Polish lemma, Polish surface form), ...], built once so the loop below\n",
    "# does not have to filter the DataFrame for every match.\n",
    "translations_by_term = {}\n",
    "for source_lem, result_lem, result in zip(train_glossary['source_lem'],\n",
    "                                          train_glossary['result_lem'],\n",
    "                                          train_glossary['result'].astype(str)):\n",
    "    translations_by_term.setdefault(source_lem, []).append((result_lem, result))\n",
    "\n",
    "# add rules to English matcher (the lemma itself is the rule label)\n",
    "nlp = spacy.blank(\"en\")\n",
    "matcher = FuzzyMatcher(nlp.vocab)\n",
    "for word in train_glossary['source_lem']:\n",
    "    matcher.add(word, [nlp(word)])\n",
    "\n",
    "# add rules to Polish matcher (the lemma itself is the rule label)\n",
    "nlp_pl = spacy.blank(\"pl\")\n",
    "matcher_pl = FuzzyMatcher(nlp_pl.vocab)\n",
    "for word in train_glossary['result_lem']:\n",
    "    matcher_pl.add(word, [nlp_pl(word)])\n",
    "\n",
    "en = []\n",
    "translation_line_counts = []\n",
    "for line_id in range(len(file_lemmatized)):\n",
    "\n",
    "    doc = nlp(file_lemmatized[line_id])\n",
    "    matches = matcher(doc)\n",
    "\n",
    "    # Labels matched in the Polish sentence; computed at most once per line, and only when needed.\n",
    "    pl_labels = None\n",
    "    # Distinct injected variants of this sentence, in match order.\n",
    "    line_entries = []\n",
    "    for match_id, _, end, ratio in matches:\n",
    "        if ratio > 90:\n",
    "            if pl_labels is None:\n",
    "                pl_labels = {match_pl[0] for match_pl in matcher_pl(nlp_pl(file_pl_lemmatized[line_id]))}\n",
    "\n",
    "            # Inject a translation only when its lemma was also found in the Polish sentence.\n",
    "            for result_lem, result in translations_by_term[match_id]:\n",
    "                if result_lem in pl_labels:\n",
    "                    # One entry per translation, so an entry never holds more than one sentence.\n",
    "                    entry = doc[:end].text + ' ' + result + ' ' + doc[end:].text\n",
    "                    if entry not in line_entries:\n",
    "                        line_entries.append(entry)\n",
    "\n",
    "    # A line without any verified match is passed through unchanged and counts as one entry.\n",
    "    if not line_entries:\n",
    "        line_entries.append(file_lemmatized[line_id])\n",
    "    en.extend(line_entries)\n",
    "    translation_line_counts.append(len(line_entries))\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [],
   "source": [
    "# Read the Polish references as raw lines. pd.read_csv is avoided on purpose: it parses the file\n",
    "# as CSV (quote characters, blank lines, NA-like values), which can merge or drop sentences and so\n",
    "# break the line-by-line alignment with the English side.\n",
    "with open(dev_path + '.pl', 'r', encoding='utf-8') as file_src:\n",
    "    pl_sentences = [row.rstrip('\\n') for row in file_src]\n",
    "\n",
    "# Fail loudly instead of silently writing misaligned files.\n",
    "if len(pl_sentences) != len(translation_line_counts) or sum(translation_line_counts) != len(en):\n",
    "    raise ValueError(\n",
    "        f'{len(pl_sentences)} Polish sentences, {len(translation_line_counts)} line counts '\n",
    "        f'(total {sum(translation_line_counts)}) and {len(en)} English entries do not line up; '\n",
    "        're-run the injection cell for this corpus'\n",
    "    )\n",
    "\n",
    "with open(dev_path + '.injected.crossvalidated.en', 'w', encoding='utf-8') as file_en:\n",
    "    with open(dev_path + '.injected.crossvalidated.pl', 'w', encoding='utf-8') as file_pl:\n",
    "        position = 0\n",
    "        for sentence, count in zip(pl_sentences, translation_line_counts):\n",
    "            # en[position:position + count] holds every entry generated from this source line.\n",
    "            previous = None\n",
    "            for entry in en[position:position + count]:\n",
    "                # Skip consecutive duplicates, but only within one source line, so that repeated\n",
    "                # corpus sentences cannot shift the alignment.\n",
    "                if entry != previous:\n",
    "                    # Normalize to exactly one trailing newline per written entry.\n",
    "                    file_en.write(entry.rstrip('\\n') + '\\n')\n",
    "                    file_pl.write(sentence + '\\n')\n",
    "                previous = entry\n",
    "            position += count"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Inject glossary Polish crosscheck (fast variant?)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "took 152.213599056 injected 63 words. rate 6.569715230451229 sen/s\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "import spacy\n",
    "from spaczz.matcher import FuzzyMatcher\n",
    "\n",
    "\n",
    "# glossary\n",
    "glossary = pd.read_csv('kompendium_lem.tsv', sep='\\t', header=0, index_col=0)\n",
    "train_glossary = glossary.iloc[[x for x in range(len(glossary)) if x % 6 != 0]]\n",
    "\n",
    "# English lemma -> [(Polish lemma, Polish surface form), ...], built once so the loop below\n",
    "# does not have to filter the DataFrame for every line.\n",
    "translations_by_term = {}\n",
    "for source_lem, result_lem, result in zip(train_glossary['source_lem'],\n",
    "                                          train_glossary['result_lem'],\n",
    "                                          train_glossary['result'].astype(str)):\n",
    "    translations_by_term.setdefault(source_lem, []).append((result_lem, result))\n",
    "\n",
    "# add rules to English matcher (the lemma itself is the rule label)\n",
    "nlp = spacy.blank(\"en\")\n",
    "matcher = FuzzyMatcher(nlp.vocab)\n",
    "for word in train_glossary['source_lem']:\n",
    "    matcher.add(word, [nlp(word)])\n",
    "\n",
    "# add rules to Polish matcher (the lemma itself is the rule label)\n",
    "nlp_pl = spacy.blank(\"pl\")\n",
    "matcher_pl = FuzzyMatcher(nlp_pl.vocab)\n",
    "for word in train_glossary['result_lem']:\n",
    "    matcher_pl.add(word, [nlp_pl(word)])\n",
    "\n",
    "start_time = time.time_ns()\n",
    "en = []\n",
    "injection_counter = 0\n",
    "for line_id in range(len(file_lemmatized)):\n",
    "\n",
    "    # Tokenize once and reuse the doc for both matching and slicing.\n",
    "    doc = nlp(file_lemmatized[line_id])\n",
    "    # Apply the ratio threshold BEFORE choosing, so a long low-ratio match cannot hide a\n",
    "    # shorter match that does pass the threshold.\n",
    "    confident = [match for match in matcher(doc) if match[3] > 90]\n",
    "\n",
    "    injected_line = None\n",
    "    if confident:\n",
    "        # Prefer the match with the longest label; on ties the first one wins.\n",
    "        match_id, _, end, _ = max(confident, key=lambda match: len(match[0]))\n",
    "        pl_labels = {match_pl[0] for match_pl in matcher_pl(nlp_pl(file_pl_lemmatized[line_id]))}\n",
    "\n",
    "        # Inject the first translation whose lemma was also found in the Polish sentence.\n",
    "        for result_lem, result in translations_by_term[match_id]:\n",
    "            if result_lem in pl_labels:\n",
    "                injected_line = doc[:end].text + ' ' + result + ' ' + doc[end:].text\n",
    "                injection_counter += 1\n",
    "                break\n",
    "\n",
    "    # Exactly one entry per source line: the injected variant, or the unchanged line.\n",
    "    en.append(file_lemmatized[line_id] if injected_line is None else injected_line)\n",
    "\n",
    "# Keep the per-line counts in sync with `en`, so counts left over from an earlier injection cell\n",
    "# are not reused by the writer cell.\n",
    "translation_line_counts = [1] * len(en)\n",
    "\n",
    "stop = time.time_ns()\n",
    "timex = (stop - start_time) / 1000000000\n",
    "print(f'took {timex} injected {injection_counter} words. rate {len(file_lemmatized)/timex} sen/s')"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "outputs": [],
   "source": [
    "# Read the Polish references as raw lines. pd.read_csv is avoided on purpose: it parses the file\n",
    "# as CSV (quote characters, blank lines, NA-like values), which can merge or drop sentences and so\n",
    "# break the line-by-line alignment with the English side.\n",
    "with open(dev_path + '.pl', 'r', encoding='utf-8') as file_src:\n",
    "    pl_sentences = [row.rstrip('\\n') for row in file_src]\n",
    "\n",
    "# The fast injection cell appends exactly one English entry per source line, so the two sides are\n",
    "# paired by position. `translation_line_counts` from an earlier cell is deliberately not used.\n",
    "if len(en) != len(pl_sentences):\n",
    "    raise ValueError(\n",
    "        f'{len(en)} English entries but {len(pl_sentences)} Polish sentences; '\n",
    "        're-run the fast injection cell before writing'\n",
    "    )\n",
    "\n",
    "with open(dev_path + '.injected.crossvalidated.en', 'w', encoding='utf-8') as file_en:\n",
    "    with open(dev_path + '.injected.crossvalidated.pl', 'w', encoding='utf-8') as file_pl:\n",
    "        for entry, sentence in zip(en, pl_sentences):\n",
    "            # Normalize to exactly one trailing newline per written entry.\n",
    "            file_en.write(entry.rstrip('\\n') + '\\n')\n",
    "            file_pl.write(sentence + '\\n')\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}