diff --git a/rapidfuzztest.ipynb b/rapidfuzztest.ipynb
index 99b117f..0239fd4 100644
--- a/rapidfuzztest.ipynb
+++ b/rapidfuzztest.ipynb
@@ -2,14 +2,22 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 1,
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package wordnet to /home/kuba/nltk_data...\n",
+ "[nltk_data] Package wordnet is already up-to-date!\n"
+ ]
+ },
{
"data": {
"text/plain": " source \\\nsource_lem \naaofi aaofi \naca aca \nacca acca \nabacus abacus \nabandonment cost abandonment costs \n... ... \nytd ytd \nyear-end year-end \nyear-to-date year-to-date \nzog zog \nzero overhead growth zero overhead growth \n\n result \nsource_lem \naaofi organizacja rachunkowości i audytu dla islamsk... \naca członek stowarzyszenia dyplomowanych biegłych ... \nacca stowarzyszenie dyplomowanych biegłych rewidentów \nabacus liczydło \nabandonment cost koszty zaniechania \n... ... \nytd od początku roku \nyear-end koniec roku \nyear-to-date od początku roku \nzog zero wzrostu kosztów ogólnych \nzero overhead growth zero wzrostu kosztów ogólnych \n\n[1197 rows x 2 columns]",
"text/html": "
\n\n
\n \n \n | \n source | \n result | \n
\n \n source_lem | \n | \n | \n
\n \n \n \n aaofi | \n aaofi | \n organizacja rachunkowości i audytu dla islamsk... | \n
\n \n aca | \n aca | \n członek stowarzyszenia dyplomowanych biegłych ... | \n
\n \n acca | \n acca | \n stowarzyszenie dyplomowanych biegłych rewidentów | \n
\n \n abacus | \n abacus | \n liczydło | \n
\n \n abandonment cost | \n abandonment costs | \n koszty zaniechania | \n
\n \n ... | \n ... | \n ... | \n
\n \n ytd | \n ytd | \n od początku roku | \n
\n \n year-end | \n year-end | \n koniec roku | \n
\n \n year-to-date | \n year-to-date | \n od początku roku | \n
\n \n zog | \n zog | \n zero wzrostu kosztów ogólnych | \n
\n \n zero overhead growth | \n zero overhead growth | \n zero wzrostu kosztów ogólnych | \n
\n \n
\n
1197 rows × 2 columns
\n
"
},
- "execution_count": 2,
+ "execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
@@ -51,13 +59,13 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 2,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "0.187306436\n"
+ "0.191720194\n"
]
}
],
@@ -91,13 +99,30 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": null,
+ "outputs": [],
+ "source": [
+ " if len(file_lemmatized) % 50000 == 0:\n",
+ " print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\\r')"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "6.592824061\n"
+ "1197\n",
+ "985\n",
+ "6.116408593\n"
]
}
],
@@ -134,6 +159,8 @@
" return sentence_en\n",
"\n",
"glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]\n",
+ "glossary['hash'] = [hash(x) for x in glossary['source']]\n",
+ "glossary = glossary[glossary['hash'] % 100 > 16]\n",
"file_pl = pd.read_csv(train_expected_path, sep='\\t', header=None, names=['text'])\n",
"file_pl['text'] = [default_process(text) for text in file_pl['text'].values.tolist()]\n",
"file_en= pd.read_csv(train_in_path, sep='\\t', header=None, names=['text'])\n",
diff --git a/scripts/inject.py b/scripts/inject.py
index 8190a4b..dfe8c4e 100644
--- a/scripts/inject.py
+++ b/scripts/inject.py
@@ -46,6 +46,8 @@ train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/t
glossary = pd.read_csv('~/mt-summit-corpora/glossary.tsv.lemmatized', sep='\t')
glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]
+glossary['hash'] = [hash(x) for x in glossary['source']]
+glossary = glossary[glossary['hash'] % 100 > 16]
file_pl = pd.read_csv(train_expected_path, sep='\t', header=None, names=['text'])
file_pl['text'] = [default_process(text) for text in file_pl['text'].values.tolist()]
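
Reviewer note: because the notebook and scripts/inject.py each call hash() in their own interpreter process, they will in general disagree about which glossary rows pass the `% 100 > 16` filter, so the notebook's evaluation and the script's injection can silently operate on different subsets. A quick way to see the instability (a hypothetical check, not part of this patch) is to run the snippet below twice and compare the printed bucket:

    import os

    # differs between runs unless PYTHONHASHSEED is set in the environment
    print(os.environ.get('PYTHONHASHSEED'), hash('abandonment cost') % 100)
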
diff --git a/scripts/lemmatize_in.py b/scripts/lemmatize_in.py
index 7118eaa..7f9064e 100644
--- a/scripts/lemmatize_in.py
+++ b/scripts/lemmatize_in.py
@@ -12,7 +12,7 @@ train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/t
file_lemmatized = []
with open(train_in_path, 'r') as file:
for line in file:
- if len(file_lemmatized) % 50000 == 0:
+ if len(file_lemmatized) % 1000 == 0:
print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\r')
line = nltk.word_tokenize(line)
file_lemmatized.append(' '.join([wl.lemmatize(x) for x in line]))
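
Reviewer note: the loop recomputes len(file_lemmatized) on every line just to drive the progress counter; enumerate() provides the same counter directly. Also worth flagging: WordNetLemmatizer.lemmatize() defaults to pos='n', so verbs and adjectives are generally not reduced unless a POS tag is supplied. A sketch of the same loop with the counter change applied, assuming train_in_path and wl are defined as earlier in the script:

    import nltk
    from nltk.stem import WordNetLemmatizer

    wl = WordNetLemmatizer()

    file_lemmatized = []
    with open(train_in_path, 'r') as file:
        for i, line in enumerate(file):
            if i % 1000 == 0:
                # progress ticker, overwritten in place via carriage return
                print('lemmatizing file: ' + train_in_path + ': ' + str(i), end='\r')
            tokens = nltk.word_tokenize(line)
            file_lemmatized.append(' '.join(wl.lemmatize(t) for t in tokens))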