inject in lemmatized

jakubknczny 2022-01-22 01:20:20 +01:00
parent 6b4c6e18f8
commit 3cbab11535

@@ -5,14 +5,21 @@
 "execution_count": null,
 "outputs": [],
 "source": [
-"import pandas as pd\n",
+"\n",
+"import copy\n",
 "import nltk\n",
+"import pandas as pd\n",
+"import rapidfuzz\n",
+"import time\n",
+"\n",
 "from nltk.stem import WordNetLemmatizer\n",
+"from rapidfuzz.fuzz import partial_ratio\n",
+"from rapidfuzz.utils import default_process\n",
 "\n",
 "\n",
 "wl = WordNetLemmatizer()\n",
 "\n",
-"glossary = pd.read_csv('../kompendium.tsv', sep='\\t', header=None, names=['source', 'result'])\n",
+"glossary = pd.read_csv('mt-summit-corpora/glossary.tsv', sep='\\t', header=None, names=['source', 'result'])\n",
 "\n",
 "source_lemmatized = []\n",
 "for word in glossary['source']:\n",
@@ -34,21 +41,34 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
-"outputs": [],
+"execution_count": 36,
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"0.194806501\n"
+]
+}
+],
 "source": [
+"# train_in_path = 'mt-summit-corpora/train/in.tsv'\n",
+"# train_expected_path = 'mt-summit-corpora/train/expected.tsv'\n",
+"\n",
+"train_in_path = 'mt-summit-corpora/dev-0/in.tsv'\n",
+"train_expected_path = 'mt-summit-corpora/dev-0/expected.tsv'\n",
+"\n",
+"\n",
+"file_pl = pd.read_csv(train_expected_path, sep='\\t', header=None, names=['text'])\n",
 "\n",
 "start_time = time.time_ns()\n",
-"filex = []\n",
-"with open(dev_path + '.pl', 'r') as file:\n",
+"file_lemmatized = []\n",
+"with open(train_in_path, 'r') as file:\n",
 "    for line in file:\n",
-"        if len(filex) % 50000 == 0:\n",
-"            print(len(filex), end='\\r')\n",
+"        if len(file_lemmatized) % 50000 == 0:\n",
+"            print(len(file_lemmatized), end='\\r')\n",
 "        line = nltk.word_tokenize(line)\n",
-"        filex.append(' '.join([wl.lemmatize(x) for x in line]))\n",
-"\n",
-"\n",
-"print(filex)\n",
+"        file_lemmatized.append(' '.join([wl.lemmatize(x) for x in line]))\n",
 "\n",
 "stop = time.time_ns()\n",
 "timex = (stop - start_time) / 1000000000\n",
@@ -57,33 +77,23 @@
 "metadata": {
 "collapsed": false,
 "pycharm": {
-"name": "#%%\n",
-"is_executing": true
+"name": "#%%\n"
 }
 }
 },
 {
 "cell_type": "code",
-"execution_count": 23,
+"execution_count": 45,
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"78.948892319\n",
-"640\n"
+"6.904260614\n"
 ]
 }
 ],
 "source": [
-"import copy\n",
-"import pandas as pd\n",
-"import rapidfuzz\n",
-"import time\n",
-"\n",
-"from rapidfuzz.fuzz import partial_ratio\n",
-"from rapidfuzz.utils import default_process\n",
-"\n",
 "\n",
 "THRESHOLD = 88\n",
 "\n",
@@ -110,26 +120,28 @@
 "        if current >= maxx:\n",
 "            maxx = current\n",
 "            maxxi = i\n",
-"    return ' '.join(sen[:maxxi + window_size]) + ' ' + inject + ' ' + ' '.join(sen[maxxi + window_size:])\n",
+"    return ' '.join(sen[:maxxi + window_size]) + ' $' + inject + '$ ' + ' '.join(sen[maxxi + window_size:])\n",
 "\n",
-"glossary = pd.read_csv('../kompendium_lem_cleaned.tsv', sep='\\t', header=0, index_col=0)\n",
 "glossary['source_lem'] = [' ' + str(default_process(x)) + ' ' for x in glossary['source_lem']]\n",
 "\n",
 "start_time = time.time_ns()\n",
 "en = []\n",
 "translation_line_counts = []\n",
-"for line, line_pl in zip(file_lemmatized, file_pl_lemmatized):\n",
+"for line, line_pl in zip(file_lemmatized, file_pl['text'].values.tolist()):\n",
 "    line = default_process(line)\n",
 "    line_pl = default_process(line_pl)\n",
 "    matchez = rapidfuzz.process.extract(query=line, choices=glossary['source_lem'], limit=5, score_cutoff=THRESHOLD, scorer=partial_ratio)\n",
 "    if len(matchez) > 0:\n",
-"        translation_line_counts.append(len(matchez))\n",
+"        lines_added = 0\n",
 "        for match in matchez:\n",
 "            polish_translation = glossary.loc[lambda df: df['source_lem'] == match[0]]['result'].astype(str).values.flatten()[0]\n",
 "            if is_injectable(line_pl, polish_translation):\n",
-"                en.append(get_injected(line, match[0], polish_translation)[0])\n",
-"            else:\n",
-"                en.append(line)\n",
+"                en.append(get_injected(line, match[0], polish_translation))\n",
+"                lines_added += 1\n",
+"        if lines_added == 0:\n",
+"            en.append(line)\n",
+"            lines_added = 1\n",
+"        translation_line_counts.append(lines_added)\n",
 "    else:\n",
 "        translation_line_counts.append(1)\n",
 "        en.append(line)\n",
@@ -148,19 +160,18 @@
 },
 {
 "cell_type": "code",
-"execution_count": 32,
+"execution_count": 46,
 "outputs": [],
 "source": [
-"tlcs = copy.deepcopy(translation_line_counts)\n",
 "\n",
-"translations = pd.read_csv(dev_path + '.pl', sep='\\t', header=None, names=['text'])\n",
-"with open(dev_path + '.injected.crossvalidated.pl', 'w') as file_pl:\n",
-"    for line, translation_line_ct in zip(translations, tlcs):\n",
+"translations = pd.read_csv(train_expected_path, sep='\\t', header=0, names=['text'])\n",
+"with open(train_expected_path + '.injected', 'w') as file_plx:\n",
+"    for line, translation_line_ct in zip(translations['text'].values.tolist(), translation_line_counts):\n",
 "        for i in range(translation_line_ct):\n",
-"            file_pl.write(line)\n",
+"            file_plx.write(line + '\\n')\n",
 "\n",
 "\n",
-"with open(dev_path + '.injected.crossvalidated.en', 'w') as file_en:\n",
+"with open(train_in_path + '.injected', 'w') as file_en:\n",
 "    for e in en:\n",
 "        file_en.write(e + '\\n')"
 ],
@@ -170,6 +181,18 @@
 "name": "#%%\n"
 }
 }
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"outputs": [],
+"source": [],
+"metadata": {
+"collapsed": false,
+"pycharm": {
+"name": "#%%\n"
+}
+}
 }
 ],
 "metadata": {