Merge branch 'master' of git.wmi.amu.edu.pl:s470607/mt-summit-corpora

jakubknczny 2022-01-18 11:17:00 +01:00
commit a5ce04b2cb
5 changed files with 172 additions and 1697 deletions


@@ -1,503 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"## Lemmatize glossary\n",
"TODO: train test split glossary"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [
{
"data": {
"text/plain": " source \\\nsource_lem \naaofi aaofi \naca aca \nacca acca \nabacus abacus \nabandonment cost abandonment costs \n... ... \nytd ytd \nyear-end year-end \nyear-to-date year-to-date \nzog zog \nzero overhead growth zero overhead growth \n\n result \\\nsource_lem \naaofi organizacja rachunkowości i audytu dla islamsk... \naca członek stowarzyszenia dyplomowanych biegłych ... \nacca stowarzyszenie dyplomowanych biegłych rewidentów \nabacus liczydło \nabandonment cost koszty zaniechania \n... ... \nytd od początku roku \nyear-end koniec roku \nyear-to-date od początku roku \nzog zero wzrostu kosztów ogólnych \nzero overhead growth zero wzrostu kosztów ogólnych \n\n result_lem \nsource_lem \naaofi organizacja rachunkowość i audyt dla islamski ... \naca członek stowarzyszenie dyplomowany biegły rewi... \nacca stowarzyszenie dyplomowany biegły rewident \nabacus liczydło \nabandonment cost koszt zaniechanie \n... ... \nytd od początek rok \nyear-end koniec rok \nyear-to-date od początek rok \nzog zero wzrost koszt ogólny \nzero overhead growth zero wzrost koszt ogólny \n\n[1197 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>source</th>\n <th>result</th>\n <th>result_lem</th>\n </tr>\n <tr>\n <th>source_lem</th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>aaofi</th>\n <td>aaofi</td>\n <td>organizacja rachunkowości i audytu dla islamsk...</td>\n <td>organizacja rachunkowość i audyt dla islamski ...</td>\n </tr>\n <tr>\n <th>aca</th>\n <td>aca</td>\n <td>członek stowarzyszenia dyplomowanych biegłych ...</td>\n <td>członek stowarzyszenie dyplomowany biegły rewi...</td>\n </tr>\n <tr>\n <th>acca</th>\n <td>acca</td>\n <td>stowarzyszenie dyplomowanych biegłych rewidentów</td>\n <td>stowarzyszenie dyplomowany biegły rewident</td>\n </tr>\n <tr>\n <th>abacus</th>\n <td>abacus</td>\n <td>liczydło</td>\n <td>liczydło</td>\n </tr>\n <tr>\n <th>abandonment cost</th>\n <td>abandonment costs</td>\n <td>koszty zaniechania</td>\n <td>koszt zaniechanie</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>ytd</th>\n <td>ytd</td>\n <td>od początku roku</td>\n <td>od początek rok</td>\n </tr>\n <tr>\n <th>year-end</th>\n <td>year-end</td>\n <td>koniec roku</td>\n <td>koniec rok</td>\n </tr>\n <tr>\n <th>year-to-date</th>\n <td>year-to-date</td>\n <td>od początku roku</td>\n <td>od początek rok</td>\n </tr>\n <tr>\n <th>zog</th>\n <td>zog</td>\n <td>zero wzrostu kosztów ogólnych</td>\n <td>zero wzrost koszt ogólny</td>\n </tr>\n <tr>\n <th>zero overhead growth</th>\n <td>zero overhead growth</td>\n <td>zero wzrostu kosztów ogólnych</td>\n <td>zero wzrost koszt ogólny</td>\n </tr>\n </tbody>\n</table>\n<p>1197 rows × 3 columns</p>\n</div>"
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import time\n",
"\n",
"import pandas as pd\n",
"import spacy\n",
"\n",
"\n",
"spacy_nlp_en = spacy.load('en_core_web_sm')\n",
"spacy_nlp_pl = spacy.load(\"pl_core_news_sm\")\n",
"\n",
"glossary = pd.read_csv('kompendium.tsv', sep='\\t', header=None, names=['source', 'result'])\n",
"\n",
"source_lemmatized = []\n",
"for word in glossary['source']:\n",
" temp = []\n",
" for token in spacy_nlp_en(word):\n",
" temp.append(token.lemma_)\n",
" source_lemmatized.append(' '.join(temp).replace(' - ', '-').replace(' ', '').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))\n",
"\n",
"result_lemmatized = []\n",
"for word in glossary['result']:\n",
" temp = []\n",
" for token in spacy_nlp_pl(word):\n",
" temp.append(token.lemma_)\n",
" result_lemmatized.append(' '.join(temp).replace(' - ', '-').replace(' ', '').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))\n",
"\n",
"glossary['source_lem'] = source_lemmatized\n",
"glossary['result_lem'] = result_lemmatized\n",
"glossary = glossary[['source', 'source_lem', 'result', 'result_lem']]\n",
"glossary.set_index('source_lem')\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"glossary.to_csv('kompendium_lem.tsv', sep='\\t')"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## Lemmatize corpus"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"dev_path = 'mt-summit-corpora/dev/dev'\n",
"\n",
"skip_chars = ''',./!?'''\n",
"\n",
"with open(dev_path + '.en', 'r') as file:\n",
" file_lemmatized = []\n",
" for line in file:\n",
" temp = []\n",
" for token in spacy_nlp_en(line):\n",
" temp.append(token.lemma_)\n",
" file_lemmatized.append(' '.join([x for x in temp if x not in skip_chars]).replace(' - ', '-').replace(' ', '').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))\n",
"\n",
"with open(dev_path + '.pl', 'r') as file:\n",
" file_pl_lemmatized = []\n",
" for line in file:\n",
" temp = []\n",
" for token in spacy_nlp_pl(line):\n",
" temp.append(token.lemma_)\n",
" file_pl_lemmatized.append(' '.join([x for x in temp if x not in skip_chars]).replace(' - ', '-').replace(' ', '').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))\n",
"\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"in the course of the control the control audit firm shall fulfil the responsibility refer to in article 114 on date and in form specify by the controller \n",
"\n",
"w czas trwanie kontrola kontrolowany firma audytorski wypełnia obowiązek o których mowa w art 114 w ter-mina i forma wskazany przez osoba kontrolującą \n",
"\n"
]
}
],
"source": [
"print(file_lemmatized[2])\n",
"print(file_pl_lemmatized[2])"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## Inject glossary\n",
"# !!! Obsolete !!!"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"import spacy\n",
"from spaczz.matcher import FuzzyMatcher\n",
"\n",
"\n",
"glossary = pd.read_csv('kompendium_lem.tsv', sep='\\t', header=0, index_col=0)\n",
"bad_words = ['ocf', 'toc', 'vas', 'vat']\n",
"train_glossary = glossary.iloc[[x for x in range(len(glossary)) if x % 6 != 0]]\n",
"\n",
"nlp = spacy.blank(\"en\")\n",
"matcher = FuzzyMatcher(nlp.vocab)\n",
"for word in train_glossary['source_lem']:\n",
" if word not in bad_words:\n",
" matcher.add(word, [nlp(word)])\n",
"\n",
"\n",
"en = []\n",
"translation_line_counts = []\n",
"for line_id, line in enumerate(file_lemmatized):\n",
" doc = nlp(line)\n",
" matches = matcher(doc)\n",
"\n",
" not_injected = 0\n",
" for match_id, start, end, ratio in matches:\n",
" if ratio > 90:\n",
" not_injected += 1\n",
" en.append(''.join(doc[:end].text + ' ' + train_glossary.loc[lambda df: df['source_lem'] == match_id]['result'].astype(str).values.flatten() + ' ' + doc[end:].text))\n",
"\n",
"\n",
" if not_injected == 0:\n",
" not_injected = 1\n",
" en.append(line)\n",
" translation_line_counts.append(not_injected)\n",
"\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n",
"is_executing": true
}
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [],
"source": [
"import copy\n",
"tlcs = copy.deepcopy(translation_line_counts)\n",
"\n",
"translations = pd.read_csv(dev_path + '.pl', sep='\\t', header=None, names=['text'])\n",
"with open(dev_path + '.injected.pl', 'w') as file_pl:\n",
" for trans in translations.iterrows():\n",
" try:\n",
" for _ in range(tlcs.pop(0)):\n",
" file_pl.write(trans[1]['text'] + '\\n')\n",
" except:\n",
" pass\n",
"\n",
"\n",
"with open(dev_path + '.injected.en', 'w') as file_en:\n",
" for line in en:\n",
" file_en.write(line)\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## Inject glossary Polish crosscheck"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [],
"source": [
"import spacy\n",
"from spaczz.matcher import FuzzyMatcher\n",
"\n",
"# glossary\n",
"glossary = pd.read_csv('kompendium_lem.tsv', sep='\\t', header=0, index_col=0)\n",
"train_glossary = glossary.iloc[[x for x in range(len(glossary)) if x % 6 != 0]]\n",
"\n",
"# add rules to English matcher\n",
"nlp = spacy.blank(\"en\")\n",
"matcher = FuzzyMatcher(nlp.vocab)\n",
"for word in train_glossary['source_lem']:\n",
" matcher.add(word, [nlp(word)])\n",
"\n",
"# add rules to Polish matcher\n",
"nlp_pl = spacy.blank(\"pl\")\n",
"matcher_pl = FuzzyMatcher(nlp_pl.vocab)\n",
"for word, word_id in zip(train_glossary['result_lem'], train_glossary['source_lem']):\n",
" matcher_pl.add(word, [nlp_pl(word)])\n",
"\n",
"en = []\n",
"translation_line_counts = []\n",
"for line_id in range(len(file_lemmatized)):\n",
"\n",
" doc = nlp(file_lemmatized[line_id])\n",
" matches = matcher(doc)\n",
"\n",
" not_injected = 0\n",
" for match_id, start, end, ratio in matches:\n",
" if ratio > 90:\n",
" doc_pl = nlp_pl(file_pl_lemmatized[line_id])\n",
" matches_pl = matcher_pl(doc_pl)\n",
"\n",
" for match_id_pl, start_pl, end_pl, ratio_pl in matches_pl:\n",
" if match_id_pl == glossary[glossary['source_lem'] == match_id].values[0][3]:\n",
" not_injected += 1\n",
" en.append(''.join(doc[:end].text + ' ' + train_glossary.loc[lambda df: df['source_lem'] == match_id]['result'].astype(str).values.flatten() + ' ' + doc[end:].text))\n",
"\n",
" if not_injected == 0:\n",
" not_injected = 1\n",
" en.append(file_lemmatized[line_id])\n",
" translation_line_counts.append(not_injected)\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [],
"source": [
"import copy\n",
"\n",
"\n",
"tlcs = copy.deepcopy(translation_line_counts)\n",
"\n",
"translations = pd.read_csv(dev_path + '.pl', sep='\\t', header=None, names=['text'])\n",
"translations['id'] = [x for x in range(len(translations))]\n",
"\n",
"ctr = 0\n",
"sentence = ''\n",
"with open(dev_path + '.injected.crossvalidated.en', 'w') as file_en:\n",
" with open(dev_path + '.injected.crossvalidated.pl', 'w') as file_pl:\n",
" for i in range(len(en)):\n",
" if i > 0:\n",
" if en[i-1] != en[i]:\n",
" if ctr == 0:\n",
" sentence = translations.iloc[0]\n",
" translations.drop(sentence['id'], inplace=True)\n",
" sentence = sentence['text']\n",
" try:\n",
" ctr = tlcs.pop(0)\n",
" except:\n",
" pass\n",
" file_en.write(en[i])\n",
" file_pl.write(sentence + '\\n')\n",
" ctr = ctr - 1\n",
" else:\n",
" try:\n",
" ctr = tlcs.pop(0) - 1\n",
" except:\n",
" pass\n",
" sentence = translations.iloc[0]\n",
" translations.drop(sentence['id'], inplace=True)\n",
" sentence = sentence['text']\n",
" file_en.write(en[i])\n",
" file_pl.write(sentence + '\\n')"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"# Inject glossary Polish crosscheck fast?"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 49,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"took 152.213599056 injected 63 words. rate 6.569715230451229 sen/s\n"
]
}
],
"source": [
"import time\n",
"import spacy\n",
"from spaczz.matcher import FuzzyMatcher\n",
"\n",
"\n",
"# glossary\n",
"glossary = pd.read_csv('kompendium_lem.tsv', sep='\\t', header=0, index_col=0)\n",
"train_glossary = glossary.iloc[[x for x in range(len(glossary)) if x % 6 != 0]]\n",
"\n",
"# add rules to English matcher\n",
"nlp = spacy.blank(\"en\")\n",
"matcher = FuzzyMatcher(nlp.vocab)\n",
"for word in train_glossary['source_lem']:\n",
" matcher.add(word, [nlp(word)])\n",
"\n",
"# add rules to Polish matcher\n",
"nlp_pl = spacy.blank(\"pl\")\n",
"matcher_pl = FuzzyMatcher(nlp_pl.vocab)\n",
"for word, word_id in zip(train_glossary['result_lem'], train_glossary['source_lem']):\n",
" matcher_pl.add(word, [nlp_pl(word)])\n",
"\n",
"start_time = time.time_ns()\n",
"en = []\n",
"injection_counter = 0\n",
"for line_id in range(len(file_lemmatized)):\n",
"\n",
" doc = nlp(file_lemmatized[line_id])\n",
" matches = matcher(nlp(file_lemmatized[line_id]))\n",
"\n",
" not_injected = True\n",
" if len(matches) > 0:\n",
" match_id, _, end, ratio = sorted(matches, key=lambda x: len(x[0]), reverse=True)[0]\n",
" if ratio > 90:\n",
" matches_pl = matcher_pl(nlp_pl(file_pl_lemmatized[line_id]))\n",
"\n",
" for match_id_pl, _, _, _ in matches_pl:\n",
" if match_id_pl == glossary[glossary['source_lem'] == match_id].values[0][3]:\n",
" not_injected = False\n",
" injection_counter += 1\n",
" en.append(''.join(doc[:end].text + ' ' + train_glossary.loc[lambda df: df['source_lem'] == match_id]['result'].astype(str).values.flatten() + ' ' + doc[end:].text))\n",
" break\n",
"\n",
" if not_injected:\n",
" en.append(file_lemmatized[line_id])\n",
"\n",
"stop = time.time_ns()\n",
"timex = (stop - start_time) / 1000000000\n",
"print(f'took {timex} injected {injection_counter} words. rate {len(file_lemmatized)/timex} sen/s')"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [],
"source": [
"import copy\n",
"\n",
"\n",
"tlcs = copy.deepcopy(translation_line_counts)\n",
"\n",
"translations = pd.read_csv(dev_path + '.pl', sep='\\t', header=None, names=['text'])\n",
"translations['id'] = [x for x in range(len(translations))]\n",
"\n",
"ctr = 0\n",
"sentence = ''\n",
"with open(dev_path + '.injected.crossvalidated.en', 'w') as file_en:\n",
" with open(dev_path + '.injected.crossvalidated.pl', 'w') as file_pl:\n",
" for i in range(len(en)):\n",
" if i > 0:\n",
" if en[i-1] != en[i]:\n",
" if ctr == 0:\n",
" sentence = translations.iloc[0]\n",
" translations.drop(sentence['id'], inplace=True)\n",
" sentence = sentence['text']\n",
" try:\n",
" ctr = tlcs.pop(0)\n",
" except:\n",
" pass\n",
" file_en.write(en[i])\n",
" file_pl.write(sentence + '\\n')\n",
" ctr = ctr - 1\n",
" else:\n",
" try:\n",
" ctr = tlcs.pop(0) - 1\n",
" except:\n",
" pass\n",
" sentence = translations.iloc[0]\n",
" translations.drop(sentence['id'], inplace=True)\n",
" sentence = sentence['text']\n",
" file_en.write(en[i])\n",
" file_pl.write(sentence + '\\n')\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

File diff suppressed because it is too large


@@ -0,0 +1,130 @@
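# inject glossary terms into the training corpus: lemmatize the glossary and both corpus
# sides with spaCy (on GPU), then fuzzy-match glossary entries against each English line
# with rapidfuzz and append their Polish translations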
import spacy
import copy
import pandas as pd
import rapidfuzz
from rapidfuzz.fuzz import partial_ratio
import time
from rapidfuzz.utils import default_process
import sys
spacy.require_gpu()
spacy_nlp_en = spacy.load('en_core_web_sm')
spacy_nlp_pl = spacy.load("pl_core_news_sm")
def read_arguments():
    try:
        _, corpus_path, glossary_path = sys.argv
        return corpus_path, glossary_path
    except ValueError:
        print("ERROR: Wrong argument amount.")
        sys.exit(1)
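# lemmatize the glossary: spaCy English pipeline for the source terms, Polish pipeline for their translations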
glossary = pd.read_csv('mt-summit-corpora/glossary.tsv', sep='\t', header=None, names=['source', 'result'])
source_lemmatized = []
for word in glossary['source']:
    temp = []
    for token in spacy_nlp_en(word):
        temp.append(token.lemma_)
    source_lemmatized.append(' '.join(temp).replace(' - ', '-').replace('  ', ' ').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
result_lemmatized = []
for word in glossary['result']:
    temp = []
    for token in spacy_nlp_pl(word):
        temp.append(token.lemma_)
    result_lemmatized.append(' '.join(temp).replace(' - ', '-').replace('  ', ' ').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
glossary['source_lem'] = source_lemmatized
glossary['result_lem'] = result_lemmatized
glossary = glossary[['source', 'source_lem', 'result', 'result_lem']]
glossary.to_csv('kompendium_lem.tsv', sep='\t')
corpus_path = 'mt-summit-corpora/train/'
skip_chars = ''',./!?'''
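# lemmatize the English (in.tsv) and Polish (expected.tsv) sides of the corpus line by line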
with open(corpus_path + 'in.tsv', 'r') as file:
    file_lemmatized = []
    for line in file:
        if len(file_lemmatized) % 10000 == 0:
            print(len(file_lemmatized), end='\r')
        temp = []
        for token in spacy_nlp_en(line):
            temp.append(token.lemma_)
        file_lemmatized.append(' '.join([x for x in temp if x not in skip_chars]).replace(' - ', '-').replace('  ', ' ').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
with open(corpus_path + 'expected.tsv', 'r') as file:
    file_pl_lemmatized = []
    for line in file:
        if len(file_pl_lemmatized) % 10000 == 0:
            print(len(file_pl_lemmatized), end='\r')
        temp = []
        for token in spacy_nlp_pl(line):
            temp.append(token.lemma_)
        file_pl_lemmatized.append(' '.join([x for x in temp if x not in skip_chars]).replace(' - ', '-').replace('  ', ' ').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
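# fuzzy-match score cutoff (rapidfuzz partial_ratio, 0-100)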
THRESHOLD = 88
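# is_injectable: best partial_ratio between the glossary translation and any window of the
# Polish sentence with the same number of words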
def is_injectable(sentence_pl, sequence):
    sen = sentence_pl.split()
    window_size = len(sequence.split())
    maxx = 0
    for i in range(len(sen) - window_size):
        current = rapidfuzz.fuzz.partial_ratio(' '.join(sen[i:i + window_size]), sequence)
        if current > maxx:
            maxx = current
    return maxx
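# inject: find the best-matching window in the lemmatized English sentence and insert the
# glossary translation right after it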
def inject(sentence, sequence):
    sen = sentence.split()
    window_size = len(sequence.split())
    maxx = 0
    maxxi = 0
    for i in range(len(sen) - window_size):
        current = rapidfuzz.fuzz.partial_ratio(' '.join(sen[i:i + window_size]), sequence)
        if current > maxx:
            maxx = current
            maxxi = i
    return ' '.join(sen[:maxxi + window_size]) + ' ' \
        + glossary.loc[lambda df: df['source_lem'] == sequence]['result'].astype(str).values.flatten() \
        + ' ' + ' '.join(sen[maxxi + window_size:])
glossary = pd.read_csv('../kompendium_lem_cleaned.tsv', sep='\t', header=0, index_col=0)
glossary['source_lem'] = [default_process(x) for x in glossary['source_lem']]
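# main pass: for every corpus line, take up to 5 glossary terms scoring above THRESHOLD and
# emit one injected copy of the line per match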
start_time = time.time_ns()
en = []
translation_line_counts = []
for line, line_pl in zip(file_lemmatized, file_pl_lemmatized):
    if len(translation_line_counts) % 50000 == 0:
        print(str(len(translation_line_counts)) + '/' + str(len(file_lemmatized)), end='\r')
    line = default_process(line)
    line_pl = default_process(line_pl)
    matchez = rapidfuzz.process.extract(query=line, choices=glossary['source_lem'], limit=5, score_cutoff=THRESHOLD, scorer=partial_ratio)
    translation_line_counts.append(len(matchez))
    for match in matchez:
        # if is_injectable(line_pl, match[0]):
        en.append(inject(line, match[0])[0])
stop = time.time_ns()
timex = (stop - start_time) / 1000000000
print(timex)
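# write the injected pair: repeat each Polish reference line once per injected English copy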
tlcs = copy.deepcopy(translation_line_counts)
translations = pd.read_csv(corpus_path + 'expected.tsv', sep='\t', header=None, names=['text'])
with open(corpus_path + 'expected.tsv.injected.crossvalidated.pl', 'w') as file_pl:
    for line, translation_line_ct in zip(translations['text'], tlcs):
        for i in range(translation_line_ct):
            file_pl.write(line + '\n')
with open(corpus_path + 'in.tsv.injected.crossvalidated.en', 'w') as file_en:
    for e in en:
        file_en.write(e + '\n')


@@ -0,0 +1,30 @@
first iteration:
./marian/build/marian --model mt.npz \
--type transformer --overwrite \
--train-sets mt-summit-corpora/mt-summit-corpora/dev/dev.en \
mt-summit-corpora/mt-summit-corpora/dev/dev.pl \
--disp-freq 1000 \
--save-freq 1000 \
--optimizer adam \
--lr-report
next iterations:
./marian/build/marian --model mt.npz \
--type transformer --overwrite \
--train-sets mt-summit-corpora/mt-summit-corpora/dev/dev.en \
mt-summit-corpora/mt-summit-corpora/dev/dev.pl \
--disp-freq 1000 \
--save-freq 1000 \
--optimizer adam \
--lr-report \
--pretrained-model mt.npz
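on the full train set: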
./marian/build/marian --model mt.npz \
--type transformer --overwrite \
--train-sets mt-summit-corpora/mt-summit-corpora/train/train.en \
mt-summit-corpora/mt-summit-corpora/train/train.pl \
--disp-freq 1000 \
--save-freq 10000 \
--optimizer adam \
--lr-report \
--pretrained-model mt.npz


@@ -0,0 +1,12 @@
#!/bin/bash
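# environment setup: Python 3.8 virtualenv with GPU spaCy (CUDA 11.4), the English and
# Polish spaCy models, spaczz and rapidfuzz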
apt install python3-pip
apt install python3-virtualenv
virtualenv -p python3.8 gpu
source gpu/bin/activate
pip install pandas ipython
pip install spacy[cuda114]
python -m spacy download en_core_web_sm
python -m spacy download pl_core_news_sm
pip install spaczz
pip install rapidfuzz