{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package wordnet to /home/kuba/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
},
{
"data": {
"text/plain": " source \\\nsource_lem \naaofi aaofi \naca aca \nacca acca \nabacus abacus \nabandonment cost abandonment costs \n... ... \nytd ytd \nyear-end year-end \nyear-to-date year-to-date \nzog zog \nzero overhead growth zero overhead growth \n\n result \nsource_lem \naaofi organizacja rachunkowości i audytu dla islamsk... \naca członek stowarzyszenia dyplomowanych biegłych ... \nacca stowarzyszenie dyplomowanych biegłych rewidentów \nabacus liczydło \nabandonment cost koszty zaniechania \n... ... \nytd od początku roku \nyear-end koniec roku \nyear-to-date od początku roku \nzog zero wzrostu kosztów ogólnych \nzero overhead growth zero wzrostu kosztów ogólnych \n\n[1197 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>source</th>\n <th>result</th>\n </tr>\n <tr>\n <th>source_lem</th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>aaofi</th>\n <td>aaofi</td>\n <td>organizacja rachunkowości i audytu dla islamsk...</td>\n </tr>\n <tr>\n <th>aca</th>\n <td>aca</td>\n <td>członek stowarzyszenia dyplomowanych biegłych ...</td>\n </tr>\n <tr>\n <th>acca</th>\n <td>acca</td>\n <td>stowarzyszenie dyplomowanych biegłych rewidentów</td>\n </tr>\n <tr>\n <th>abacus</th>\n <td>abacus</td>\n <td>liczydło</td>\n </tr>\n <tr>\n <th>abandonment cost</th>\n <td>abandonment costs</td>\n <td>koszty zaniechania</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>ytd</th>\n <td>ytd</td>\n <td>od początku roku</td>\n </tr>\n <tr>\n <th>year-end</th>\n <td>year-end</td>\n <td>koniec roku</td>\n </tr>\n <tr>\n <th>year-to-date</th>\n <td>year-to-date</td>\n <td>od początku roku</td>\n </tr>\n <tr>\n <th>zog</th>\n <td>zog</td>\n <td>zero wzrostu kosztów ogólnych</td>\n </tr>\n <tr>\n <th>zero overhead growth</th>\n <td>zero overhead growth</td>\n <td>zero wzrostu kosztów ogólnych</td>\n </tr>\n </tbody>\n</table>\n<p>1197 rows × 2 columns</p>\n</div>"
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"import nltk\n",
"import pandas as pd\n",
"import rapidfuzz\n",
"import time\n",
"\n",
"from nltk.stem import WordNetLemmatizer\n",
"from rapidfuzz.fuzz import partial_ratio\n",
"from rapidfuzz.utils import default_process\n",
"\n",
"nltk.download('wordnet')\n",
"\n",
"\n",
"wl = WordNetLemmatizer()\n",
"\n",
"glossary = pd.read_csv('mt-summit-corpora/glossary.tsv', sep='\\t', header=None, names=['source', 'result'])\n",
"\n",
"source_lemmatized = []\n",
"for word in glossary['source']:\n",
" word = nltk.word_tokenize(word)\n",
" source_lemmatized.append(' '.join([wl.lemmatize(x) for x in word]))\n",
"\n",
"glossary['source_lem'] = source_lemmatized\n",
"glossary = glossary[['source', 'source_lem', 'result']]\n",
"glossary.set_index('source_lem')\n",
"\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
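{
"cell_type": "markdown",
"source": [
"A quick check of why the glossary is lemmatized before matching: `WordNetLemmatizer` (with its default noun POS) maps plurals back to the singular, so a corpus phrase like *abandonment costs* lines up with the glossary entry *abandonment cost*. A minimal sketch below; the example terms are illustrative.\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# illustrative only: lemmatization normalizes inflected glossary lookups\n",
"for term in ['abandonment costs', 'abacuses', 'year-end']:\n",
"    lemma = ' '.join(wl.lemmatize(tok) for tok in nltk.word_tokenize(term))\n",
"    print(term, '->', lemma)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},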
{
"cell_type": "code",
"execution_count": 2,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.191720194\n"
]
}
],
"source": [
"# train_in_path = 'mt-summit-corpora/train/in.tsv'\n",
"# train_expected_path = 'mt-summit-corpora/train/expected.tsv'\n",
"\n",
"train_in_path = 'mt-summit-corpora/dev-0/in.tsv'\n",
"train_expected_path = 'mt-summit-corpora/dev-0/expected.tsv'\n",
"\n",
"\n",
"start_time = time.time_ns()\n",
"file_lemmatized = []\n",
"with open(train_in_path, 'r') as file:\n",
" for line in file:\n",
" if len(file_lemmatized) % 50000 == 0:\n",
" print(len(file_lemmatized), end='\\r')\n",
" line = nltk.word_tokenize(line)\n",
" file_lemmatized.append(' '.join([wl.lemmatize(x) for x in line]))\n",
"\n",
"stop = time.time_ns()\n",
"timex = (stop - start_time) / 1000000000\n",
"print(timex)\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
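{
"cell_type": "markdown",
"source": [
"Lemmatizing the corpus is the slow step of this notebook, so a rerun-friendly variant could cache the result next to the input file. A minimal sketch below, not part of the original pipeline; the `.lemmatized` suffix is an assumed convention.\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# illustrative only: cache the lemmatized lines so reruns can skip the loop\n",
"# (the '.lemmatized' suffix is a made-up convention for this sketch)\n",
"import os\n",
"\n",
"cache_path = train_in_path + '.lemmatized'\n",
"if os.path.exists(cache_path):\n",
"    with open(cache_path, 'r') as f:\n",
"        file_lemmatized = [l.rstrip('\\n') for l in f]\n",
"else:\n",
"    with open(cache_path, 'w') as f:\n",
"        for l in file_lemmatized:\n",
"            f.write(l + '\\n')"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},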
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1197\n",
"985\n",
"6.116408593\n"
]
}
],
"source": [
"\n",
"THRESHOLD = 70\n",
"\n",
"\n",
"def is_injectable(sentence_pl, sequence):\n",
" sen = sentence_pl.split()\n",
" window_size = len(sequence.split())\n",
" maxx = 0\n",
" for i in range(len(sen) - window_size + 1):\n",
" current = rapidfuzz.fuzz.ratio(' '.join(sen[i:i + window_size]), sequence)\n",
" if current > maxx:\n",
" maxx = current\n",
" return maxx >= THRESHOLD\n",
"\n",
"def get_injected(sentence, sentence_en, sequence, inject):\n",
" sen = sentence.split()\n",
" sen_en = sentence_en.split()\n",
" window_size = len(sequence.split())\n",
" maxx = 0\n",
" maxx_prv = 0\n",
" maxxi = 0\n",
" for i in range(len(sen) - window_size + 1):\n",
" current = rapidfuzz.fuzz.ratio(' '.join(sen[i:i + window_size]), sequence)\n",
" if current >= maxx:\n",
" maxx_prv = maxx\n",
" maxx = current\n",
" maxxi = i\n",
" if maxx_prv != maxx:\n",
" return ' '.join(sen_en[:maxxi + window_size]) + ' $' + inject + '$ ' + ' '.join(sen_en[maxxi + window_size:])\n",
" return sentence_en\n",
"\n",
"glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]\n",
"glossary['hash'] = [hash(x) for x in glossary['source']]\n",
"glossary = glossary[glossary['hash'] % 100 > 16]\n",
"file_pl = pd.read_csv(train_expected_path, sep='\\t', header=None, names=['text'])\n",
"file_pl['text'] = [default_process(text) for text in file_pl['text'].values.tolist()]\n",
"file_en= pd.read_csv(train_in_path, sep='\\t', header=None, names=['text'])\n",
"file_en['text'] = [default_process(text) for text in file_en['text'].values.tolist()]\n",
"\n",
"start_time = time.time_ns()\n",
"en = []\n",
"translation_line_counts = []\n",
"for line, line_en, line_pl in zip(file_lemmatized, file_en['text'].values.tolist(), file_pl['text'].values.tolist()):\n",
" line = default_process(line)\n",
" matchez = rapidfuzz.process.extract(query=line, choices=glossary['source_lem'], limit=5, score_cutoff=THRESHOLD, scorer=partial_ratio)\n",
" if len(matchez) > 0:\n",
" lines_added = 0\n",
" for match in matchez:\n",
" polish_translation = glossary.loc[lambda df: df['source_lem'] == match[0]]['result'].astype(str).values.flatten()[0]\n",
" if is_injectable(line_pl, polish_translation):\n",
" en.append(get_injected(line, line_en, match[0], polish_translation))\n",
" lines_added += 1\n",
" if lines_added == 0:\n",
" en.append(line_en)\n",
" lines_added = 1\n",
" translation_line_counts.append(lines_added)\n",
" else:\n",
" translation_line_counts.append(1)\n",
" en.append(line_en)\n",
"\n",
"\n",
"stop = time.time_ns()\n",
"timex = (stop - start_time) / 1000000000\n",
"print(timex)\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
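{
"cell_type": "markdown",
"source": [
"Relation between the two scorers used above: `partial_ratio` looks for the best-matching substring, which makes it a cheap line-level prefilter, while `is_injectable` re-checks the Polish side with a word-aligned sliding window of `rapidfuzz.fuzz.ratio` scores. A toy comparison below; the example strings are made up.\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# illustrative only: line-level prefilter vs. word-window confirmation\n",
"toy_line = default_process('the abandonment cost was recognised in the ledger')\n",
"print(partial_ratio(toy_line, 'abandonment cost'))  # 100.0: exact substring\n",
"print(is_injectable(default_process('koszty zaniechania ujeto w ksiedze'), 'koszty zaniechania'))  # True"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},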
{
"cell_type": "code",
"execution_count": 21,
"outputs": [],
"source": [
"\n",
"def full_strip(line):\n",
" return ' '.join(line.split())\n",
"\n",
"with open(train_expected_path + '.injected', 'w') as file_pl_write:\n",
" for line, translation_line_ct in zip(file_pl['text'].values.tolist(), translation_line_counts):\n",
" for i in range(translation_line_ct):\n",
" file_pl_write.write(full_strip(line) + '\\n')\n",
"\n",
"\n",
"with open(train_in_path + '.injected', 'w') as file_en_write:\n",
" for e in en:\n",
" file_en_write.write(e + '\\n')"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
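{
"cell_type": "markdown",
"source": [
"A small consistency check, added here rather than part of the original pipeline: the two `.injected` files are only usable as parallel data if the number of English lines written equals the sum of the per-sentence copy counts.\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# illustrative sanity check: the parallel .injected files must stay line-aligned\n",
"assert len(en) == sum(translation_line_counts)\n",
"print(len(en), 'aligned line pairs written')"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}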
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}