From 3cbab11535c5749b3aacf4a72d36dfe00d9fef4e Mon Sep 17 00:00:00 2001
From: jakubknczny
Date: Sat, 22 Jan 2022 01:20:20 +0100
Subject: [PATCH] inject in lemmatized

---
 ...rapidfuzztest.ipynb => rapidfuzztest.ipynb | 101 +++++++++++-------
 1 file changed, 62 insertions(+), 39 deletions(-)
 rename random-scripts/rapidfuzztest.ipynb => rapidfuzztest.ipynb (68%)

diff --git a/random-scripts/rapidfuzztest.ipynb b/rapidfuzztest.ipynb
similarity index 68%
rename from random-scripts/rapidfuzztest.ipynb
rename to rapidfuzztest.ipynb
index caa86fb..6ab8edf 100644
--- a/random-scripts/rapidfuzztest.ipynb
+++ b/rapidfuzztest.ipynb
@@ -5,14 +5,21 @@
    "execution_count": null,
    "outputs": [],
    "source": [
-    "import pandas as pd\n",
+    "\n",
+    "import copy\n",
     "import nltk\n",
+    "import pandas as pd\n",
+    "import rapidfuzz\n",
+    "import time\n",
+    "\n",
     "from nltk.stem import WordNetLemmatizer\n",
+    "from rapidfuzz.fuzz import partial_ratio\n",
+    "from rapidfuzz.utils import default_process\n",
     "\n",
     "\n",
     "wl = WordNetLemmatizer()\n",
     "\n",
-    "glossary = pd.read_csv('../kompendium.tsv', sep='\\t', header=None, names=['source', 'result'])\n",
+    "glossary = pd.read_csv('mt-summit-corpora/glossary.tsv', sep='\\t', header=None, names=['source', 'result'])\n",
     "\n",
     "source_lemmatized = []\n",
     "for word in glossary['source']:\n",
@@ -34,21 +41,34 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "outputs": [],
+   "execution_count": 36,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.194806501\n"
+     ]
+    }
+   ],
    "source": [
+    "# train_in_path = 'mt-summit-corpora/train/in.tsv'\n",
+    "# train_expected_path = 'mt-summit-corpora/train/expected.tsv'\n",
+    "\n",
+    "train_in_path = 'mt-summit-corpora/dev-0/in.tsv'\n",
+    "train_expected_path = 'mt-summit-corpora/dev-0/expected.tsv'\n",
+    "\n",
+    "\n",
+    "file_pl = pd.read_csv(train_expected_path, sep='\\t', header=None, names=['text'])\n",
     "\n",
     "start_time = time.time_ns()\n",
-    "filex = []\n",
-    "with open(dev_path + '.pl', 'r') as file:\n",
+    "file_lemmatized = []\n",
+    "with open(train_in_path, 'r') as file:\n",
     "    for line in file:\n",
-    "        if len(filex) % 50000 == 0:\n",
-    "            print(len(filex), end='\\r')\n",
+    "        if len(file_lemmatized) % 50000 == 0:\n",
+    "            print(len(file_lemmatized), end='\\r')\n",
     "        line = nltk.word_tokenize(line)\n",
-    "        filex.append(' '.join([wl.lemmatize(x) for x in line]))\n",
-    "\n",
-    "\n",
-    "print(filex)\n",
+    "        file_lemmatized.append(' '.join([wl.lemmatize(x) for x in line]))\n",
     "\n",
     "stop = time.time_ns()\n",
     "timex = (stop - start_time) / 1000000000\n",
@@ -57,33 +77,23 @@
    "metadata": {
     "collapsed": false,
     "pycharm": {
-     "name": "#%%\n",
-     "is_executing": true
+     "name": "#%%\n"
     }
    }
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 45,
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "78.948892319\n",
-      "640\n"
+      "6.904260614\n"
      ]
     }
    ],
    "source": [
-    "import copy\n",
-    "import pandas as pd\n",
-    "import rapidfuzz\n",
-    "import time\n",
-    "\n",
-    "from rapidfuzz.fuzz import partial_ratio\n",
-    "from rapidfuzz.utils import default_process\n",
-    "\n",
     "\n",
     "THRESHOLD = 88\n",
     "\n",
@@ -110,26 +120,28 @@
     "        if current >= maxx:\n",
     "            maxx = current\n",
     "            maxxi = i\n",
-    "    return ' '.join(sen[:maxxi + window_size]) + ' ' + inject + ' ' + ' '.join(sen[maxxi + window_size:])\n",
+    "    return ' '.join(sen[:maxxi + window_size]) + ' $' + inject + '$ ' + ' '.join(sen[maxxi + window_size:])\n",
     "\n",
-    "glossary = pd.read_csv('../kompendium_lem_cleaned.tsv', sep='\\t', header=0, index_col=0)\n",
     "glossary['source_lem'] = [' ' + str(default_process(x)) + ' ' for x in glossary['source_lem']]\n",
     "\n",
     "start_time = time.time_ns()\n",
     "en = []\n",
     "translation_line_counts = []\n",
-    "for line, line_pl in zip(file_lemmatized, file_pl_lemmatized):\n",
+    "for line, line_pl in zip(file_lemmatized, file_pl['text'].values.tolist()):\n",
     "    line = default_process(line)\n",
     "    line_pl = default_process(line_pl)\n",
     "    matchez = rapidfuzz.process.extract(query=line, choices=glossary['source_lem'], limit=5, score_cutoff=THRESHOLD, scorer=partial_ratio)\n",
     "    if len(matchez) > 0:\n",
-    "        translation_line_counts.append(len(matchez))\n",
+    "        lines_added = 0\n",
     "        for match in matchez:\n",
     "            polish_translation = glossary.loc[lambda df: df['source_lem'] == match[0]]['result'].astype(str).values.flatten()[0]\n",
     "            if is_injectable(line_pl, polish_translation):\n",
-    "                en.append(get_injected(line, match[0], polish_translation)[0])\n",
-    "            else:\n",
-    "                en.append(line)\n",
+    "                en.append(get_injected(line, match[0], polish_translation))\n",
+    "                lines_added += 1\n",
+    "        if lines_added == 0:\n",
+    "            en.append(line)\n",
+    "            lines_added = 1\n",
+    "        translation_line_counts.append(lines_added)\n",
     "    else:\n",
     "        translation_line_counts.append(1)\n",
     "        en.append(line)\n",
@@ -148,19 +160,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 46,
    "outputs": [],
    "source": [
-    "tlcs = copy.deepcopy(translation_line_counts)\n",
     "\n",
-    "translations = pd.read_csv(dev_path + '.pl', sep='\\t', header=None, names=['text'])\n",
-    "with open(dev_path + '.injected.crossvalidated.pl', 'w') as file_pl:\n",
-    "    for line, translation_line_ct in zip(translations, tlcs):\n",
+    "translations = pd.read_csv(train_expected_path, sep='\\t', header=0, names=['text'])\n",
+    "with open(train_expected_path + '.injected', 'w') as file_plx:\n",
+    "    for line, translation_line_ct in zip(translations['text'].values.tolist(), translation_line_counts):\n",
     "        for i in range(translation_line_ct):\n",
-    "            file_pl.write(line)\n",
+    "            file_plx.write(line + '\\n')\n",
     "\n",
     "\n",
-    "with open(dev_path + '.injected.crossvalidated.en', 'w') as file_en:\n",
+    "with open(train_in_path + '.injected', 'w') as file_en:\n",
     "    for e in en:\n",
     "        file_en.write(e + '\\n')"
    ],
@@ -170,6 +181,18 @@
      "name": "#%%\n"
     }
    }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
   }
  ],
  "metadata": {