train test split glossary

2022-01-23 17:48:37 +01:00 · 2022-01-23 17:48:37 +01:00 · a6e4a9d64a
commit a6e4a9d64a
parent f1169e1540
3 changed files with 36 additions and 7 deletions
--- a/rapidfuzztest.ipynb
+++ b/rapidfuzztest.ipynb
@@ -2,14 +2,22 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package wordnet to /home/kuba/nltk_data...\n",
+      "[nltk_data]   Package wordnet is already up-to-date!\n"
+     ]
+    },
    {
     "data": {
      "text/plain": "                                    source  \\\nsource_lem                                   \naaofi                                aaofi   \naca                                    aca   \nacca                                  acca   \nabacus                              abacus   \nabandonment cost         abandonment costs   \n...                                    ...   \nytd                                    ytd   \nyear-end                          year-end   \nyear-to-date                  year-to-date   \nzog                                    zog   \nzero overhead growth  zero overhead growth   \n\n                                                                 result  \nsource_lem                                                               \naaofi                 organizacja rachunkowości i audytu dla islamsk...  \naca                   członek stowarzyszenia dyplomowanych biegłych ...  \nacca                   stowarzyszenie dyplomowanych biegłych rewidentów  \nabacus                                                         liczydło  \nabandonment cost                                     koszty zaniechania  \n...                                                                 ...  \nytd                                                    od początku roku  \nyear-end                                                    koniec roku  \nyear-to-date                                           od początku roku  \nzog                                       zero wzrostu kosztów ogólnych  \nzero overhead growth                      zero wzrostu kosztów ogólnych  \n\n[1197 rows x 2 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>source</th>\n      <th>result</th>\n    </tr>\n    <tr>\n      <th>source_lem</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>aaofi</th>\n      <td>aaofi</td>\n      <td>organizacja rachunkowości i audytu dla islamsk...</td>\n    </tr>\n    <tr>\n      <th>aca</th>\n      <td>aca</td>\n      <td>członek stowarzyszenia dyplomowanych biegłych ...</td>\n    </tr>\n    <tr>\n      <th>acca</th>\n      <td>acca</td>\n      <td>stowarzyszenie dyplomowanych biegłych rewidentów</td>\n    </tr>\n    <tr>\n      <th>abacus</th>\n      <td>abacus</td>\n      <td>liczydło</td>\n    </tr>\n    <tr>\n      <th>abandonment cost</th>\n      <td>abandonment costs</td>\n      <td>koszty zaniechania</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>ytd</th>\n      <td>ytd</td>\n      <td>od początku roku</td>\n    </tr>\n    <tr>\n      <th>year-end</th>\n      <td>year-end</td>\n      <td>koniec roku</td>\n    </tr>\n    <tr>\n      <th>year-to-date</th>\n      <td>year-to-date</td>\n      <td>od początku roku</td>\n    </tr>\n    <tr>\n      <th>zog</th>\n      <td>zog</td>\n      <td>zero wzrostu kosztów ogólnych</td>\n    </tr>\n    <tr>\n      <th>zero overhead growth</th>\n      <td>zero overhead growth</td>\n      <td>zero wzrostu kosztów ogólnych</td>\n    </tr>\n  </tbody>\n</table>\n<p>1197 rows × 2 columns</p>\n</div>"
     },
-     "execution_count": 2,
+     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -51,13 +59,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 2,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "0.187306436\n"
+      "0.191720194\n"
     ]
    }
   ],
@@ -91,13 +99,30 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "        if len(file_lemmatized) % 50000 == 0:\n",
+    "            print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\\r')"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "6.592824061\n"
+      "1197\n",
+      "985\n",
+      "6.116408593\n"
     ]
    }
   ],
@@ -134,6 +159,8 @@
    "    return sentence_en\n",
    "\n",
    "glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]\n",
+    "glossary['hash'] = pd.util.hash_pandas_object(glossary['source'], index=False)  # stable across kernels; built-in hash() of str is salted per process\n",
+    "glossary = glossary[glossary['hash'] % 100 > 16]  # keep rows whose hash bucket (0-99) is above 16\n",
    "file_pl = pd.read_csv(train_expected_path, sep='\\t', header=None, names=['text'])\n",
    "file_pl['text'] = [default_process(text) for text in file_pl['text'].values.tolist()]\n",
    "file_en= pd.read_csv(train_in_path, sep='\\t', header=None, names=['text'])\n",
--- a/scripts/inject.py
+++ b/scripts/inject.py
@@ -46,6 +46,8 @@ train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/t

 glossary = pd.read_csv('~/mt-summit-corpora/glossary.tsv.lemmatized', sep='\t')
 glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]
+glossary['hash'] = [hash(x) for x in glossary['source']]
+glossary = glossary[glossary['hash'] % 100 > 16]

 file_pl = pd.read_csv(train_expected_path, sep='\t', header=None, names=['text'])
 file_pl['text'] = [default_process(text) for text in file_pl['text'].values.tolist()]
--- a/scripts/lemmatize_in.py
+++ b/scripts/lemmatize_in.py
@@ -12,7 +12,7 @@ train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/t
 file_lemmatized = []
 with open(train_in_path, 'r') as file:
    for line in file:
-        if len(file_lemmatized) % 50000 == 0:
+        if len(file_lemmatized) % 1000 == 0:
            print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\r')
        line = nltk.word_tokenize(line)
        file_lemmatized.append(' '.join([wl.lemmatize(x) for x in line]))