rjawor 2019-05-17 14:28:29 +02:00
parent 917640d9a1
commit 151409c986
4 changed files with 23 additions and 22 deletions

View File

@@ -1,4 +0,0 @@
-#!/bin/bash
-sudo docker start $1
-sudo docker exec -it $1 /bin/bash

View File

@@ -6,6 +6,7 @@ import urllib2
import sys
import host
import time
+import codecs
BUFFER_SIZE = 500
LEAVE_OUT = 1 # that does not leave out anything
@@ -23,8 +24,9 @@ def file_len(fname):
def add_examples(examplesData):
    req = urllib2.Request(address)
-    req.add_header('Content-Type', 'application/json')
-    response = json.loads(urllib2.urlopen(req, json.dumps(examplesData), timeout = 3600).read())
+    req.add_header('Content-Type', 'application/json; charset=utf-8')
+    encodedData = json.dumps(examplesData, ensure_ascii=False).encode('utf-8', 'ignore')
+    response = json.loads(urllib2.urlopen(req, encodedData, timeout = 3600).read())
    print(response)
    if response['status'] == 'error':
        raise Exception(response['message'])
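The gist of this hunk is the UTF-8 handling: json.dumps with ensure_ascii=False keeps non-ASCII characters as real characters, and the explicit .encode('utf-8', 'ignore') turns the unicode result into bytes that urllib2 can post. A minimal Python 2 sketch of the same pattern, with a made-up address and payload:

    # -*- coding: utf-8 -*-
    # Sketch only; the address and the payload contents are hypothetical.
    import json
    import urllib2

    address = 'http://localhost:8800'  # hypothetical server endpoint
    payload = {'operation': 'addSentences',
               'examples': [[u'zażółć gęślą jaźń', u'zażółcić gęśla jaźń', u'sample target', [], 1]]}

    req = urllib2.Request(address)
    req.add_header('Content-Type', 'application/json; charset=utf-8')
    body = json.dumps(payload, ensure_ascii=False).encode('utf-8', 'ignore')
    response = json.loads(urllib2.urlopen(req, body, timeout=3600).read())
    print(response)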
@@ -67,7 +69,7 @@ req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data), timeout = 3600).read())
print(response)
tmId = int(response['newTmId'])
-print "Added new tm: %d" % tmId
+print("Added new tm: %d" % tmId)
data = {
    'operation': 'addSentences',
@@ -76,23 +78,26 @@ data = {
examples = []
start = time.time()
-with open(sourceFile) as source_file, open(lemmatizedSourceFile) as lemmatized_source_file, open(targetFile) as target_file, open(alignmentsFile) as alignments_file, open(sourceIdsFile) as source_ids_file:
+with codecs.open(sourceFile, "r", "utf-8") as source_file, codecs.open(lemmatizedSourceFile, "r", "utf-8") as lemmatized_source_file, codecs.open(targetFile, "r", "utf-8") as target_file, open(alignmentsFile) as alignments_file, open(sourceIdsFile) as source_ids_file:
    addedCount = 0
    for lineNumber in range(sourceFileLength):
        if lineNumber % LEAVE_OUT == 0:
-            sourceSentence = source_file.readline().strip()
-            lemmatizedSourceSentence = lemmatized_source_file.readline().strip()
-            targetSentence = target_file.readline().strip()
+            sourceSentence = source_file.readline().strip().encode('utf-8')
+            lemmatizedSourceSentence = lemmatized_source_file.readline().strip().encode('utf-8')
+            targetSentence = target_file.readline().strip().encode('utf-8')
            alignment = json.loads(alignments_file.readline().strip())
            sourceId = int(source_ids_file.readline().strip())
+            #print(sourceSentence)
+            #print(lemmatizedSourceSentence)
+            #print(targetSentence)
            examples.append([sourceSentence, lemmatizedSourceSentence, targetSentence, alignment, sourceId])
            addedCount += 1
            if len(examples) >= BUFFER_SIZE:
                data['examples'] = examples
                add_examples(data)
                mark = time.time()
-                print "Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % (addedCount, totalExamples, mark-start, addedCount/(mark-start))
+                print("Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % (addedCount, totalExamples, mark-start, addedCount/(mark-start)))
                examples = []
@@ -101,9 +106,9 @@ if len(examples) > 0:
    add_examples(data)
end = time.time()
-print "Added all %d lemmatized sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % (addedCount, end-start, addedCount/(end-start))
-print "Generating index..."
+print("Added all %d lemmatized sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % (addedCount, end-start, addedCount/(end-start)))
+print("Generating index...")
start = time.time()
data = {
    'operation': 'refreshIndex',
@@ -114,4 +119,4 @@ req.add_header('Content-Type', 'application/json')
urllib2.urlopen(req, json.dumps(data), timeout = 3600).read()
end = time.time()
-print "Index regeneration complete. The operation took %.4f s" % (end - start)
+print("Index regeneration complete. The operation took %.4f s" % (end - start))

View File

@@ -1,7 +1,7 @@
#!/bin/sh
-CORPUS_NAME=opensubtitles
-CORPUS_PATH=../fast-aligner/corpora/$CORPUS_NAME
+CORPUS_NAME=opensubtitles_sample
+CORPUS_PATH=./$CORPUS_NAME
SRC_LANG_ID=1
TRG_LANG_ID=2

todo
View File

@@ -1,5 +1,5 @@
-1. Change db settings in CMakeLists.txt in concordia-server repo
-2. Rebuild docker-compose and check if concordia-server connects to db via internal "docker-compose" network
-3. do not expose concordia-postgres ports if the above works
-4. Move scripts in the tests folder of concordia-server repo to the concordia-docker. Adjust them accordingly (host, port)
-5. sort out the lemmagen
+DONE 1. Change db settings in CMakeLists.txt in concordia-server repo
+DONE 2. Rebuild docker-compose and check if concordia-server connects to db via internal "docker-compose" network
+DONE 3. do not expose concordia-postgres ports if the above works
+DONE 4. Move scripts in the tests folder of concordia-server repo to the concordia-docker. Adjust them accordingly (host, port)
+DONE 5. sort out the lemmagen