diff --git a/rapidfuzztest.ipynb b/rapidfuzztest.ipynb index 99b117f..0239fd4 100644 --- a/rapidfuzztest.ipynb +++ b/rapidfuzztest.ipynb @@ -2,14 +2,22 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to /home/kuba/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n" + ] + }, { "data": { "text/plain": " source \\\nsource_lem \naaofi aaofi \naca aca \nacca acca \nabacus abacus \nabandonment cost abandonment costs \n... ... \nytd ytd \nyear-end year-end \nyear-to-date year-to-date \nzog zog \nzero overhead growth zero overhead growth \n\n result \nsource_lem \naaofi organizacja rachunkowości i audytu dla islamsk... \naca członek stowarzyszenia dyplomowanych biegłych ... \nacca stowarzyszenie dyplomowanych biegłych rewidentów \nabacus liczydło \nabandonment cost koszty zaniechania \n... ... \nytd od początku roku \nyear-end koniec roku \nyear-to-date od początku roku \nzog zero wzrostu kosztów ogólnych \nzero overhead growth zero wzrostu kosztów ogólnych \n\n[1197 rows x 2 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sourceresult
source_lem
aaofiaaofiorganizacja rachunkowości i audytu dla islamsk...
acaacaczłonek stowarzyszenia dyplomowanych biegłych ...
accaaccastowarzyszenie dyplomowanych biegłych rewidentów
abacusabacusliczydło
abandonment costabandonment costskoszty zaniechania
.........
ytdytdod początku roku
year-endyear-endkoniec roku
year-to-dateyear-to-dateod początku roku
zogzogzero wzrostu kosztów ogólnych
zero overhead growthzero overhead growthzero wzrostu kosztów ogólnych
\n

1197 rows × 2 columns

\n
" }, - "execution_count": 2, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -51,13 +59,13 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 2, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.187306436\n" + "0.191720194\n" ] } ], @@ -91,13 +99,30 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, + "outputs": [], + "source": [ + " if len(file_lemmatized) % 50000 == 0:\n", + " print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\\r')" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 4, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "6.592824061\n" + "1197\n", + "985\n", + "6.116408593\n" ] } ], @@ -134,6 +159,8 @@ " return sentence_en\n", "\n", "glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]\n", + "glossary['hash'] = pd.util.hash_pandas_object(glossary['source'], index=False)  # stable across runs, unlike built-in hash()\n", + "glossary = glossary[glossary['hash'] % 100 > 16]\n", "file_pl = pd.read_csv(train_expected_path, sep='\\t', header=None, names=['text'])\n", "file_pl['text'] = [default_process(text) for text in file_pl['text'].values.tolist()]\n", "file_en= pd.read_csv(train_in_path, sep='\\t', header=None, names=['text'])\n", diff --git a/scripts/inject.py b/scripts/inject.py index 8190a4b..dfe8c4e 100644 --- a/scripts/inject.py +++ b/scripts/inject.py @@ -46,6 +46,8 @@ train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/t glossary = pd.read_csv('~/mt-summit-corpora/glossary.tsv.lemmatized', sep='\t') glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']] +glossary['hash'] = pd.util.hash_pandas_object(glossary['source'], index=False)  # stable across runs, unlike built-in hash() +glossary = glossary[glossary['hash'] % 100 > 16] file_pl = pd.read_csv(train_expected_path, sep='\t', header=None, names=['text']) file_pl['text'] = [default_process(text) for 
text in file_pl['text'].values.tolist()] diff --git a/scripts/lemmatize_in.py b/scripts/lemmatize_in.py index 7118eaa..7f9064e 100644 --- a/scripts/lemmatize_in.py +++ b/scripts/lemmatize_in.py @@ -12,7 +12,7 @@ train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/t file_lemmatized = [] with open(train_in_path, 'r') as file: for line in file: - if len(file_lemmatized) % 50000 == 0: + if len(file_lemmatized) % 1000 == 0: print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\r') line = nltk.word_tokenize(line) file_lemmatized.append(' '.join([wl.lemmatize(x) for x in line]))