dictionary compilator
This commit is contained in:
parent
d291594c06
commit
5e1b032ea0
1
.gitignore
vendored
1
.gitignore
vendored
@ -38,3 +38,4 @@ __pycache__
|
||||
import-requests/request_handler.log
|
||||
mgiza-aligner/corpus-compilator/corpora
|
||||
mgiza-aligner/corpus-compilator/dictionaries/*lem
|
||||
mgiza-aligner/corpus-compilator/dictionary-compilator/*.tsv
|
||||
|
8
cat/versions_available/opus_medicine_plen.cfg
Normal file
8
cat/versions_available/opus_medicine_plen.cfg
Normal file
@ -0,0 +1,8 @@
|
||||
dir@#@opus_medicine_plen
|
||||
concordia_host@#@localhost
|
||||
concordia_port@#@8800
|
||||
tmid@#@2
|
||||
desc@#@Witamy w interaktywnym demo systemu Concordia. System znajduje najdłuższe fragmenty zdania wejściowego w pamięci tłumaczeń. Proszę wpisać polskie zdanie w poniższe pole i nacisnąć Enter (albo użyć przycisku "search"). Aby zapoznać się z systemem możesz użyć wcześniej przygotowanych przykładów - po prostu kliknij link "apply" przy wybranym przykładzie. Po wyszukaniu, kliknij na wybrany podświetlony fragment, aby zobaczyć jego kontekst.
|
||||
enjoy@#@Życzymy udanej pracy z systemem!
|
||||
prompt@#@Wprowadź zdanie (po polsku):
|
||||
suggestion@#@Pacjent cierpi na zapalenie ucha środkowego i dur brzuszny
|
1
cat/versions_enabled/opus_medicine_plen.cfg
Symbolic link
1
cat/versions_enabled/opus_medicine_plen.cfg
Symbolic link
@ -0,0 +1 @@
|
||||
../versions_available/opus_medicine_plen.cfg
|
@ -1,9 +1,9 @@
|
||||
SRC_LANG=en
|
||||
TRG_LANG=hr
|
||||
CORPUS_NAME=tmrepository_enhr
|
||||
SRC_LANG=pl
|
||||
TRG_LANG=en
|
||||
CORPUS_NAME=opus_medicine
|
||||
SEPARATOR=@\#@
|
||||
DICTIONARY_WEIGHT=5
|
||||
|
||||
|
||||
all: corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt
|
||||
|
||||
corpora/$(CORPUS_NAME)/aligned.txt: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.lem_trg.lem.cooc corpora/$(CORPUS_NAME)/src.lem_trg.lem.snt corpora/$(CORPUS_NAME)/src.lem.vcb corpora/$(CORPUS_NAME)/trg.lem.vcb
|
||||
|
32
mgiza-aligner/corpus-compilator/dictionary-compilator/compile_dictionary.py
Executable file
32
mgiza-aligner/corpus-compilator/dictionary-compilator/compile_dictionary.py
Executable file
@ -0,0 +1,32 @@
|
||||
#!/usr/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import sys, requests, json
|
||||
|
||||
url = 'http://localhost:8800'
headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}

# A candidate translation is rejected when its character length is at least
# this many times the character length of the source entry.
MAX_LENGTH_RATIO = 3.0


def build_target_phrase(occurence):
    """Join the target fragments of one search occurrence with single spaces.

    Every item of ``occurence['targetFragments']`` is used as a
    ``(start, end)`` slice into the string ``occurence['targetSegment']``.
    """
    segment = occurence['targetSegment']
    return ' '.join(segment[fragment[0]:fragment[1]]
                    for fragment in occurence['targetFragments'])


def select_translation(entry, result):
    """Return the first acceptable target phrase for ``entry``, or ``None``.

    ``result`` is the ``'result'`` object of a ``concordiaSearch`` response.
    A phrase is only looked for when ``bestOverlayScore`` equals 1.0; the
    occurrences of the first ``bestOverlay`` item are then scanned in order.
    Occurrences without target fragments are skipped, and so are phrases
    whose length is ``MAX_LENGTH_RATIO`` times the entry length or more.
    An empty ``entry`` never yields a translation (this also avoids a
    division by zero in the length-ratio check).
    """
    if not entry or result['bestOverlayScore'] != 1.0:
        return None
    matched_pattern = result['bestOverlay'][0]
    # NOTE: 'occurences' (sic) is the key spelling used in the response.
    for occurence in matched_pattern['occurences']:
        if not occurence['targetFragments']:
            continue
        target_phrase = build_target_phrase(occurence)
        if len(target_phrase) / float(len(entry)) < MAX_LENGTH_RATIO:
            # The first acceptable occurrence wins; later ones are ignored.
            return target_phrase
    return None


def main(argv):
    """Print ``entry<TAB>translation`` for every translatable dictionary entry.

    ``argv[1]`` is the path of a dictionary file with one entry per line and
    ``argv[2]`` is the integer translation memory id sent as ``tmId``.
    Returns the process exit status (2 when arguments are missing).
    """
    if len(argv) < 3:
        sys.stderr.write('Usage: %s <dictionary_file> <tm_id>\n' % argv[0])
        return 2
    tm_id = int(argv[2])  # parsed once instead of once per entry

    # One session keeps the HTTP connection open across all entries.
    with requests.Session() as session, \
            open(argv[1], encoding='utf-8') as dictionary_file:
        for line in dictionary_file:
            entry = line.rstrip()
            if not entry:
                # Blank lines carry no pattern to search for.
                continue
            params = {
                'operation': 'concordiaSearch',
                'pattern': entry,
                'tmId': tm_id,
            }
            resp = session.post(url=url, data=json.dumps(params),
                                headers=headers)
            response_json = json.loads(resp.text)
            translation = select_translation(entry, response_json['result'])
            if translation is not None:
                print("%s\t%s" % (entry, translation))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
|
@ -1,7 +1,7 @@
|
||||
#!/bin/sh
|
||||
|
||||
CORPUS_NAME="tmrepository_enhr"
|
||||
SRC_LANG_ID=2
|
||||
TRG_LANG_ID=6
|
||||
CORPUS_NAME="opus_medicine"
|
||||
SRC_LANG_ID=1
|
||||
TRG_LANG_ID=2
|
||||
|
||||
./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src_final.txt $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg_final.txt $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned_final.txt
|
||||
|
Loading…
Reference in New Issue
Block a user