train test split glossary
This commit is contained in:
parent
f1169e1540
commit
a6e4a9d64a
@ -2,14 +2,22 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 1,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[nltk_data] Downloading package wordnet to /home/kuba/nltk_data...\n",
|
||||
"[nltk_data] Package wordnet is already up-to-date!\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": " source \\\nsource_lem \naaofi aaofi \naca aca \nacca acca \nabacus abacus \nabandonment cost abandonment costs \n... ... \nytd ytd \nyear-end year-end \nyear-to-date year-to-date \nzog zog \nzero overhead growth zero overhead growth \n\n result \nsource_lem \naaofi organizacja rachunkowości i audytu dla islamsk... \naca członek stowarzyszenia dyplomowanych biegłych ... \nacca stowarzyszenie dyplomowanych biegłych rewidentów \nabacus liczydło \nabandonment cost koszty zaniechania \n... ... \nytd od początku roku \nyear-end koniec roku \nyear-to-date od początku roku \nzog zero wzrostu kosztów ogólnych \nzero overhead growth zero wzrostu kosztów ogólnych \n\n[1197 rows x 2 columns]",
|
||||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>source</th>\n <th>result</th>\n </tr>\n <tr>\n <th>source_lem</th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>aaofi</th>\n <td>aaofi</td>\n <td>organizacja rachunkowości i audytu dla islamsk...</td>\n </tr>\n <tr>\n <th>aca</th>\n <td>aca</td>\n <td>członek stowarzyszenia dyplomowanych biegłych ...</td>\n </tr>\n <tr>\n <th>acca</th>\n <td>acca</td>\n <td>stowarzyszenie dyplomowanych biegłych rewidentów</td>\n </tr>\n <tr>\n <th>abacus</th>\n <td>abacus</td>\n <td>liczydło</td>\n </tr>\n <tr>\n <th>abandonment cost</th>\n <td>abandonment costs</td>\n <td>koszty zaniechania</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>ytd</th>\n <td>ytd</td>\n <td>od początku roku</td>\n </tr>\n <tr>\n <th>year-end</th>\n <td>year-end</td>\n <td>koniec roku</td>\n </tr>\n <tr>\n <th>year-to-date</th>\n <td>year-to-date</td>\n <td>od początku roku</td>\n </tr>\n <tr>\n <th>zog</th>\n <td>zog</td>\n <td>zero wzrostu kosztów ogólnych</td>\n </tr>\n <tr>\n <th>zero overhead growth</th>\n <td>zero overhead growth</td>\n <td>zero wzrostu kosztów ogólnych</td>\n </tr>\n </tbody>\n</table>\n<p>1197 rows × 2 columns</p>\n</div>"
|
||||
},
|
||||
"execution_count": 2,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -51,13 +59,13 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 2,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.187306436\n"
|
||||
"0.191720194\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -91,13 +99,30 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
" if len(file_lemmatized) % 50000 == 0:\n",
|
||||
" print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\\r')"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"6.592824061\n"
|
||||
"1197\n",
|
||||
"985\n",
|
||||
"6.116408593\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -134,6 +159,8 @@
|
||||
" return sentence_en\n",
|
||||
"\n",
|
||||
"glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]\n",
|
||||
"glossary['hash'] = [hash(x) for x in glossary['source']]\n",
|
||||
"glossary = glossary[glossary['hash'] % 100 > 16]\n",
|
||||
"file_pl = pd.read_csv(train_expected_path, sep='\\t', header=None, names=['text'])\n",
|
||||
"file_pl['text'] = [default_process(text) for text in file_pl['text'].values.tolist()]\n",
|
||||
"file_en= pd.read_csv(train_in_path, sep='\\t', header=None, names=['text'])\n",
|
||||
|
@ -46,6 +46,8 @@ train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/t
|
||||
|
||||
glossary = pd.read_csv('~/mt-summit-corpora/glossary.tsv.lemmatized', sep='\t')
|
||||
glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]
|
||||
# Bucket each glossary entry by a *stable* hash of its source term.
# The builtin hash() of a str is salted per interpreter process (unless
# PYTHONHASHSEED is fixed), so using it here made the train/test split below
# select different rows on every run. An MD5-derived integer is identical
# across runs and machines. Only the first 4 digest bytes are used so the
# values are non-negative and stay well inside the int64 range.
import hashlib
glossary['hash'] = [
    int.from_bytes(hashlib.md5(str(x).encode('utf-8')).digest()[:4], 'big')
    for x in glossary['source']
]
|
||||
glossary = glossary[glossary['hash'] % 100 > 16]
|
||||
|
||||
file_pl = pd.read_csv(train_expected_path, sep='\t', header=None, names=['text'])
|
||||
file_pl['text'] = [default_process(text) for text in file_pl['text'].values.tolist()]
|
||||
|
@ -12,7 +12,7 @@ train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/t
|
||||
file_lemmatized = []
|
||||
with open(train_in_path, 'r') as file:
|
||||
for line in file:
|
||||
if len(file_lemmatized) % 50000 == 0:
|
||||
if len(file_lemmatized) % 1000 == 0:
|
||||
print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\r')
|
||||
line = nltk.word_tokenize(line)
|
||||
file_lemmatized.append(' '.join([wl.lemmatize(x) for x in line]))
|
||||
|
Loading…
Reference in New Issue
Block a user