diff --git a/rapidfuzztest.ipynb b/rapidfuzztest.ipynb
index 99b117f..0239fd4 100644
--- a/rapidfuzztest.ipynb
+++ b/rapidfuzztest.ipynb
@@ -2,14 +2,22 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package wordnet to /home/kuba/nltk_data...\n",
+      "[nltk_data]   Package wordnet is already up-to-date!\n"
+     ]
+    },
     {
      "data": {
       "text/plain": "                                    source  \\\nsource_lem                                   \naaofi                                aaofi   \naca                                    aca   \nacca                                  acca   \nabacus                              abacus   \nabandonment cost         abandonment costs   \n...                                    ...   \nytd                                    ytd   \nyear-end                          year-end   \nyear-to-date                  year-to-date   \nzog                                    zog   \nzero overhead growth  zero overhead growth   \n\n                                                                 result  \nsource_lem                                                               \naaofi                 organizacja rachunkowości i audytu dla islamsk...  \naca                   członek stowarzyszenia dyplomowanych biegłych ...  \nacca                   stowarzyszenie dyplomowanych biegłych rewidentów  \nabacus                                                         liczydło  \nabandonment cost                                     koszty zaniechania  \n...                                                                 ...  \nytd                                                    od początku roku  \nyear-end                                                    koniec roku  \nyear-to-date                                           od początku roku  \nzog                                       zero wzrostu kosztów ogólnych  \nzero overhead growth                      zero wzrostu kosztów ogólnych  \n\n[1197 rows x 2 columns]",
       "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>source</th>\n      <th>result</th>\n    </tr>\n    <tr>\n      <th>source_lem</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>aaofi</th>\n      <td>aaofi</td>\n      <td>organizacja rachunkowości i audytu dla islamsk...</td>\n    </tr>\n    <tr>\n      <th>aca</th>\n      <td>aca</td>\n      <td>członek stowarzyszenia dyplomowanych biegłych ...</td>\n    </tr>\n    <tr>\n      <th>acca</th>\n      <td>acca</td>\n      <td>stowarzyszenie dyplomowanych biegłych rewidentów</td>\n    </tr>\n    <tr>\n      <th>abacus</th>\n      <td>abacus</td>\n      <td>liczydło</td>\n    </tr>\n    <tr>\n      <th>abandonment cost</th>\n      <td>abandonment costs</td>\n      <td>koszty zaniechania</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>ytd</th>\n      <td>ytd</td>\n      <td>od początku roku</td>\n    </tr>\n    <tr>\n      <th>year-end</th>\n      <td>year-end</td>\n      <td>koniec roku</td>\n    </tr>\n    <tr>\n      <th>year-to-date</th>\n      <td>year-to-date</td>\n      <td>od początku roku</td>\n    </tr>\n    <tr>\n      <th>zog</th>\n      <td>zog</td>\n      <td>zero wzrostu kosztów ogólnych</td>\n    </tr>\n    <tr>\n      <th>zero overhead growth</th>\n      <td>zero overhead growth</td>\n      <td>zero wzrostu kosztów ogólnych</td>\n    </tr>\n  </tbody>\n</table>\n<p>1197 rows × 2 columns</p>\n</div>"
      },
-     "execution_count": 2,
+     "execution_count": 1,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -51,13 +59,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 2,
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "0.187306436\n"
+      "0.191720194\n"
      ]
     }
    ],
@@ -91,13 +99,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 4,
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "6.592824061\n"
+      "1197\n",
+      "985\n",
+      "6.116408593\n"
      ]
     }
    ],
@@ -134,6 +144,9 @@
     "    return sentence_en\n",
     "\n",
     "glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]\n",
+    "# Stable bucketing: builtin hash() of a str is salted per process (PYTHONHASHSEED), so it would drop different rows after every kernel restart.\n",
+    "glossary['hash'] = pd.util.hash_pandas_object(glossary['source'], index=False).values\n",
+    "glossary = glossary[glossary['hash'] % 100 > 16]\n",
     "file_pl = pd.read_csv(train_expected_path, sep='\\t', header=None, names=['text'])\n",
     "file_pl['text'] = [default_process(text) for text in file_pl['text'].values.tolist()]\n",
     "file_en= pd.read_csv(train_in_path, sep='\\t', header=None, names=['text'])\n",
diff --git a/scripts/inject.py b/scripts/inject.py
index 8190a4b..dfe8c4e 100644
--- a/scripts/inject.py
+++ b/scripts/inject.py
@@ -46,6 +46,9 @@ train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/t
 
 glossary = pd.read_csv('~/mt-summit-corpora/glossary.tsv.lemmatized', sep='\t')
 glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]
+# Stable bucketing: builtin hash() of a str is salted per process (PYTHONHASHSEED), so it would drop different rows on every run; buckets 0-16 (~17%) are excluded.
+glossary['hash'] = pd.util.hash_pandas_object(glossary['source'], index=False).values
+glossary = glossary[glossary['hash'] % 100 > 16]
 
 file_pl = pd.read_csv(train_expected_path, sep='\t', header=None, names=['text'])
 file_pl['text'] = [default_process(text) for text in file_pl['text'].values.tolist()]
diff --git a/scripts/lemmatize_in.py b/scripts/lemmatize_in.py
index 7118eaa..7f9064e 100644
--- a/scripts/lemmatize_in.py
+++ b/scripts/lemmatize_in.py
@@ -12,7 +12,7 @@ train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/t
 file_lemmatized = []
 with open(train_in_path, 'r') as file:
     for line in file:
-        if len(file_lemmatized) % 50000 == 0:
+        if len(file_lemmatized) % 1000 == 0:
             print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\r')
         line = nltk.word_tokenize(line)
         file_lemmatized.append(' '.join([wl.lemmatize(x) for x in line]))

	source	result
source_lem
aaofi	aaofi	organizacja rachunkowości i audytu dla islamsk...
aca	aca	członek stowarzyszenia dyplomowanych biegłych ...
acca	acca	stowarzyszenie dyplomowanych biegłych rewidentów
abacus	abacus	liczydło
abandonment cost	abandonment costs	koszty zaniechania
...	...	...
ytd	ytd	od początku roku
year-end	year-end	koniec roku
year-to-date	year-to-date	od początku roku
zog	zog	zero wzrostu kosztów ogólnych
zero overhead growth	zero overhead growth	zero wzrostu kosztów ogólnych