dictionary compilator
This commit is contained in:
parent
d291594c06
commit
5e1b032ea0
1
.gitignore
vendored
1
.gitignore
vendored
@ -38,3 +38,4 @@ __pycache__
|
|||||||
import-requests/request_handler.log
|
import-requests/request_handler.log
|
||||||
mgiza-aligner/corpus-compilator/corpora
|
mgiza-aligner/corpus-compilator/corpora
|
||||||
mgiza-aligner/corpus-compilator/dictionaries/*lem
|
mgiza-aligner/corpus-compilator/dictionaries/*lem
|
||||||
|
mgiza-aligner/corpus-compilator/dictionary-compilator/*.tsv
|
||||||
|
8
cat/versions_available/opus_medicine_plen.cfg
Normal file
8
cat/versions_available/opus_medicine_plen.cfg
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
dir@#@opus_medicine_plen
|
||||||
|
concordia_host@#@localhost
|
||||||
|
concordia_port@#@8800
|
||||||
|
tmid@#@2
|
||||||
|
desc@#@Witamy w interaktywnym demo systemu Concordia. System znajduje najdłuższe fragmenty zdania wejściowego w pamięci tłumaczeń. Proszę wpisać polskie zdanie w poniższe pole i nacisnąć Enter (albo użyć przycisku "search"). Aby zapoznać się z systemem możesz użyć wcześniej przygotowanych przykładów - po prostu kliknij link "apply" przy wybranym przykładzie. Po wyszukaniu, kliknij na wybrany podświetlony fragment, aby zobaczyć jego kontekst.
|
||||||
|
enjoy@#@Życzymy udanej pracy z systemem!
|
||||||
|
prompt@#@Wprowadź zdanie (po polsku):
|
||||||
|
suggestion@#@Pacjent cierpi na zapalenie ucha środkowego i dur brzuszny
|
1
cat/versions_enabled/opus_medicine_plen.cfg
Symbolic link
1
cat/versions_enabled/opus_medicine_plen.cfg
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../versions_available/opus_medicine_plen.cfg
|
@ -1,6 +1,6 @@
|
|||||||
SRC_LANG=en
|
SRC_LANG=pl
|
||||||
TRG_LANG=hr
|
TRG_LANG=en
|
||||||
CORPUS_NAME=tmrepository_enhr
|
CORPUS_NAME=opus_medicine
|
||||||
SEPARATOR=@\#@
|
SEPARATOR=@\#@
|
||||||
DICTIONARY_WEIGHT=5
|
DICTIONARY_WEIGHT=5
|
||||||
|
|
||||||
|
32
mgiza-aligner/corpus-compilator/dictionary-compilator/compile_dictionary.py
Executable file
32
mgiza-aligner/corpus-compilator/dictionary-compilator/compile_dictionary.py
Executable file
@ -0,0 +1,32 @@
|
|||||||
|
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Compile a bilingual dictionary by querying a Concordia server.

Usage: compile_dictionary.py DICTIONARY_FILE TM_ID

Every non-empty line of DICTIONARY_FILE is sent as a ``concordiaSearch``
pattern against translation memory TM_ID.  When the server reports a best
overlay score of exactly 1.0, the target fragments of the first suitable
occurrence are joined into a phrase and written to stdout as
``entry<TAB>target phrase``.
"""

import json
import sys

url = 'http://localhost:8800'
headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}

# A candidate whose character length is this many times the entry's length
# (or more) is rejected.
MAX_LENGTH_RATIO = 3.0


def build_target_phrase(occurence):
    """Return the target fragments of *occurence* joined by single spaces.

    Each item of ``occurence['targetFragments']`` supplies a start and end
    index (its first two elements) used to slice ``occurence['targetSegment']``.
    """
    segment = occurence['targetSegment']
    return ' '.join(
        segment[fragment[0]:fragment[1]]
        for fragment in occurence['targetFragments']
    )


def select_translation(entry, result):
    """Pick the target phrase to emit for *entry*, or ``None`` if there is none.

    *result* is the ``result`` object of a concordiaSearch response.  A phrase
    is returned only when ``bestOverlayScore`` equals 1.0; the occurrences of
    the first overlay item are then scanned in order and the first one that
    has target fragments and yields a phrase shorter than
    ``MAX_LENGTH_RATIO`` times the entry length wins.
    """
    # An empty entry has no meaningful length ratio (the original division
    # would raise ZeroDivisionError), so it never produces a translation.
    if not entry:
        return None
    if result['bestOverlayScore'] != 1.0:
        return None
    overlay = result['bestOverlay']
    if not overlay:
        # Guard against an empty overlay list instead of raising IndexError.
        return None
    for occurence in overlay[0]['occurences']:
        if not occurence['targetFragments']:
            continue
        phrase = build_target_phrase(occurence)
        # Same test as len(phrase) / len(entry) < ratio, without the division.
        if len(phrase) < MAX_LENGTH_RATIO * len(entry):
            # NOTE(review): the source's indentation was lost; this assumes
            # the original `break` followed the print, i.e. only the first
            # accepted phrase is emitted per entry.
            return phrase
    return None


def main(argv):
    """Run the compiler; *argv* is ``[program, dictionary_path, tm_id]``.

    Returns the process exit status: 0 on success, 2 on bad arguments.
    """
    if len(argv) < 3:
        sys.stderr.write('usage: %s DICTIONARY_FILE TM_ID\n' % argv[0])
        return 2
    try:
        # Parsed once up front rather than on every dictionary line.
        tm_id = int(argv[2])
    except ValueError:
        sys.stderr.write('TM_ID must be an integer, got %r\n' % argv[2])
        return 2

    # Imported here so the pure helpers above stay importable on machines
    # without the HTTP client installed.
    import requests

    # One session for the whole run lets consecutive requests reuse the
    # same connection to the server.
    with requests.Session() as session, \
            open(argv[1], encoding='utf-8') as dictionary_file:
        for line in dictionary_file:
            entry = line.rstrip()
            if not entry:
                # Blank lines carry no pattern to search for.
                continue
            params = {
                'operation': 'concordiaSearch',
                'pattern': entry,
                'tmId': tm_id,
            }
            resp = session.post(url=url, data=json.dumps(params),
                                headers=headers)
            response_json = json.loads(resp.text)
            result = response_json.get('result')
            if result is None:
                # Report the problem but keep compiling the remaining entries.
                sys.stderr.write('no result for %r: %s\n' % (entry, resp.text))
                continue
            target_phrase = select_translation(entry, result)
            if target_phrase is not None:
                print("%s\t%s" % (entry, target_phrase))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
|
@ -1,7 +1,7 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
|
|
||||||
CORPUS_NAME="tmrepository_enhr"
|
CORPUS_NAME="opus_medicine"
|
||||||
SRC_LANG_ID=2
|
SRC_LANG_ID=1
|
||||||
TRG_LANG_ID=6
|
TRG_LANG_ID=2
|
||||||
|
|
||||||
./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src_final.txt $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg_final.txt $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned_final.txt
|
./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src_final.txt $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg_final.txt $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned_final.txt
|
||||||
|
Loading…
Reference in New Issue
Block a user