diff --git a/getContainerBash.sh b/getContainerBash.sh deleted file mode 100755 index 0e76a4c..0000000 --- a/getContainerBash.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -sudo docker start $1 -sudo docker exec -it $1 /bin/bash \ No newline at end of file diff --git a/tests/addFastAlignedTM.py b/tests/addFastAlignedTM.py index a29f254..0620610 100755 --- a/tests/addFastAlignedTM.py +++ b/tests/addFastAlignedTM.py @@ -6,6 +6,7 @@ import urllib2 import sys import host import time +import codecs BUFFER_SIZE = 500 LEAVE_OUT = 1 # that does not leave out anything @@ -23,8 +24,9 @@ def file_len(fname): def add_examples(examplesData): req = urllib2.Request(address) - req.add_header('Content-Type', 'application/json') - response = json.loads(urllib2.urlopen(req, json.dumps(examplesData), timeout = 3600).read()) + req.add_header('Content-Type', 'application/json; charset=utf-8') + encodedData = json.dumps(examplesData, ensure_ascii=False).encode('utf-8', 'ignore') + response = json.loads(urllib2.urlopen(req, encodedData, timeout = 3600).read()) print(response) if response['status'] == 'error': raise Exception(response['message']) @@ -67,7 +69,7 @@ req.add_header('Content-Type', 'application/json') response = json.loads(urllib2.urlopen(req, json.dumps(data), timeout = 3600).read()) print(response) tmId = int(response['newTmId']) -print "Added new tm: %d" % tmId +print("Added new tm: %d" % tmId) data = { 'operation': 'addSentences', @@ -76,23 +78,26 @@ data = { examples = [] start = time.time() -with open(sourceFile) as source_file, open(lemmatizedSourceFile) as lemmatized_source_file, open(targetFile) as target_file, open(alignmentsFile) as alignments_file, open(sourceIdsFile) as source_ids_file: +with codecs.open(sourceFile, "r", "utf-8") as source_file, codecs.open(lemmatizedSourceFile, "r", "utf-8") as lemmatized_source_file, codecs.open(targetFile, "r", "utf-8") as target_file, open(alignmentsFile) as alignments_file, open(sourceIdsFile) as source_ids_file: addedCount = 0 for lineNumber in range(sourceFileLength): if lineNumber % LEAVE_OUT == 0: - sourceSentence = source_file.readline().strip() - lemmatizedSourceSentence = lemmatized_source_file.readline().strip() - targetSentence = target_file.readline().strip() + sourceSentence = source_file.readline().strip().encode('utf-8') + lemmatizedSourceSentence = lemmatized_source_file.readline().strip().encode('utf-8') + targetSentence = target_file.readline().strip().encode('utf-8') alignment = json.loads(alignments_file.readline().strip()) sourceId = int(source_ids_file.readline().strip()) + #print(sourceSentence) + #print(lemmatizedSourceSentence) + #print(targetSentence) examples.append([sourceSentence, lemmatizedSourceSentence, targetSentence, alignment, sourceId]) addedCount += 1 if len(examples) >= BUFFER_SIZE: data['examples'] = examples add_examples(data) mark = time.time() - print "Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % (addedCount, totalExamples, mark-start, addedCount/(mark-start)) + print("Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % (addedCount, totalExamples, mark-start, addedCount/(mark-start))) examples = [] @@ -101,9 +106,9 @@ if len(examples) > 0: add_examples(data) end = time.time() -print "Added all %d lemmatized sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % (addedCount, end-start, addedCount/(end-start)) +print("Added all %d lemmatized sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % (addedCount, end-start, addedCount/(end-start))) -print "Generating index..." +print("Generating index...") start = time.time() data = { 'operation': 'refreshIndex', @@ -114,4 +119,4 @@ req.add_header('Content-Type', 'application/json') urllib2.urlopen(req, json.dumps(data), timeout = 3600).read() end = time.time() -print "Index regeneration complete. The operation took %.4f s" % (end - start) +print("Index regeneration complete. The operation took %.4f s" % (end - start)) diff --git a/tests/addFastAlignedTM.sh b/tests/addFastAlignedTM.sh index 5a079a6..9229096 100755 --- a/tests/addFastAlignedTM.sh +++ b/tests/addFastAlignedTM.sh @@ -1,7 +1,7 @@ #!/bin/sh -CORPUS_NAME=opensubtitles -CORPUS_PATH=../fast-aligner/corpora/$CORPUS_NAME +CORPUS_NAME=opensubtitles_sample +CORPUS_PATH=./$CORPUS_NAME SRC_LANG_ID=1 TRG_LANG_ID=2 diff --git a/todo b/todo index 22d25a1..1a24757 100644 --- a/todo +++ b/todo @@ -1,5 +1,5 @@ -1. Change db settings in CMakeLists.txt in concordia-server repo -2. Rebuild docker-compose and check if concordia-server connects to db via internal "docker-compose" network -3. do not expose concordia-postgres ports if the above works -4. Move scripts in the tests folder of concordia-server repo to the concordia-docker. Adjust them accordingly (host, port) -5. sort out the lemmagen \ No newline at end of file +DONE 1. Change db settings in CMakeLists.txt in concordia-server repo +DONE 2. Rebuild docker-compose and check if concordia-server connects to db via internal "docker-compose" network +DONE 3. do not expose concordia-postgres ports if the above works +DONE 4. Move scripts in the tests folder of concordia-server repo to the concordia-docker. Adjust them accordingly (host, port) +DONE 5. sort out the lemmagen \ No newline at end of file