From f38734c6a7af0bfc7bdadedd1654b8db9c5ed8ef Mon Sep 17 00:00:00 2001
From: rjawor
Date: Thu, 29 Aug 2019 23:16:48 +0200
Subject: [PATCH] working simple search

---
 concordia-aligner/add_corpus.sh          |  2 +-
 concordia-aligner/add_fast_aligned_TM.py | 77 ++++++++++++++----------
 tools/simpleSearch.py                    |  2 +-
 3 files changed, 48 insertions(+), 33 deletions(-)

diff --git a/concordia-aligner/add_corpus.sh b/concordia-aligner/add_corpus.sh
index fcccf07..daeb23b 100755
--- a/concordia-aligner/add_corpus.sh
+++ b/concordia-aligner/add_corpus.sh
@@ -2,4 +2,4 @@
 
 source corpus.cfg
 
-./add_fast_aligned_TM.py $CORPUS_NAME $CORPUS_PATH/src_clean.txt $CORPUS_PATH/src_clean.lem $SRC_LANG_ID $CORPUS_PATH/trg_clean.txt $TRG_LANG_ID $CORPUS_PATH/alignments.txt $CORPUS_PATH/ids_clean.txt
+./add_fast_aligned_TM.py $CORPUS_NAME $CORPUS_PATH/src_clean.txt $CORPUS_PATH/src_clean.tok $CORPUS_PATH/src_clean.lem $SRC_LANG_ID $CORPUS_PATH/trg_clean.txt $TRG_LANG_ID $CORPUS_PATH/alignments.txt $CORPUS_PATH/ids_clean.txt
diff --git a/concordia-aligner/add_fast_aligned_TM.py b/concordia-aligner/add_fast_aligned_TM.py
index 4b0396e..0b9dc28 100755
--- a/concordia-aligner/add_fast_aligned_TM.py
+++ b/concordia-aligner/add_fast_aligned_TM.py
@@ -18,33 +18,40 @@ def file_len(fname):
             pass
     return i + 1
 
-def add_examples(examplesData):
+def add_examples(examples, tmId):
+    examplesData = {
+        'operation': 'addSentences',
+        'tmId':tmId,
+        'examples':examples
+    }
     req = urllib2.Request(address)
     req.add_header('Content-Type', 'application/json')
     response = json.loads(urllib2.urlopen(req, json.dumps(examplesData), timeout = 3600).read())
-    print(response)
+    #print(response)
     if response['status'] == 'error':
         raise Exception(response['message'])
 
-if len(sys.argv) != 9:
+if len(sys.argv) != 10:
     raise Exception("wrong number of arguments")
 
 name = sys.argv[1]
 sourceFile = sys.argv[2]
-lemmatizedSourceFile = sys.argv[3]
-sourceLangId = int(sys.argv[4])
-targetFile = sys.argv[5]
-targetLangId = int(sys.argv[6])
-alignmentsFile = sys.argv[7]
-sourceIdsFile = sys.argv[8]
+tokenizedSourceFile = sys.argv[3]
+lemmatizedSourceFile = sys.argv[4]
+sourceLangId = int(sys.argv[5])
+targetFile = sys.argv[6]
+targetLangId = int(sys.argv[7])
+alignmentsFile = sys.argv[8]
+sourceIdsFile = sys.argv[9]
 
 sourceFileLength = file_len(sourceFile)
+tokenizedSourceFileLength = file_len(tokenizedSourceFile)
 lemmatizedSourceFileLength = file_len(lemmatizedSourceFile)
 targetFileLength = file_len(targetFile)
 alignmentsFileLength = file_len(alignmentsFile)
 sourceIdsFileLength = file_len(sourceIdsFile)
 
-if not (sourceFileLength == lemmatizedSourceFileLength and lemmatizedSourceFileLength == targetFileLength and targetFileLength == alignmentsFileLength and alignmentsFileLength == sourceIdsFileLength):
+if not (sourceFileLength == tokenizedSourceFileLength and tokenizedSourceFileLength == lemmatizedSourceFileLength and lemmatizedSourceFileLength == targetFileLength and targetFileLength == alignmentsFileLength and alignmentsFileLength == sourceIdsFileLength):
     print("File lengths:")
     print("source file: %d\nlemmatized source file: %d\ntarget file: %d\nalignments file: %d\nsource ids file: %d" % (sourceFileLength, lemmatizedSourceFileLength, targetFileLength, alignmentsFileLength, sourceIdsFileLength))
     raise Exception("files are not of the same length!")
@@ -52,63 +59,71 @@ if not (sourceFileLength == lemmatizedSourceFileL
 totalExamples = sourceFileLength / LEAVE_OUT
 
 data = {
-    'operation': 'addTm',
+    'operation': 'addPairedTms',
     'sourceLangId':sourceLangId,
     'targetLangId':targetLangId,
-    'name':name,
-    'tmLemmatized':True
+    'name':name
 }
 
 req = urllib2.Request(address)
 req.add_header('Content-Type', 'application/json')
 response = json.loads(urllib2.urlopen(req, json.dumps(data), timeout = 3600).read())
-print(response)
-tmId = int(response['newTmId'])
-print "Added new tm: %d" % tmId
+#print(response)
 
-data = {
-    'operation': 'addSentences',
-    'tmId':tmId
-}
+lemmatizedTmId = int(response['lemmatizedTmId'])
+nonLemmatizedTmId = int(response['nonLemmatizedTmId'])
+print "Added new paired tm: lemmatized id: %d, non lemmatized id: %d" % (lemmatizedTmId, nonLemmatizedTmId)
 
 examples = []
+examples_lemmatized = []
 start = time.time()
-with open(sourceFile) as source_file, open(lemmatizedSourceFile) as lemmatized_source_file, open(targetFile) as target_file, open(alignmentsFile) as alignments_file, open(sourceIdsFile) as source_ids_file:
+with open(sourceFile) as source_file, open(tokenizedSourceFile) as tokenized_source_file, open(lemmatizedSourceFile) as lemmatized_source_file, open(targetFile) as target_file, open(alignmentsFile) as alignments_file, open(sourceIdsFile) as source_ids_file:
     addedCount = 0
     for lineNumber in range(sourceFileLength):
         if lineNumber % LEAVE_OUT == 0:
             sourceSentence = source_file.readline().strip()
+            tokenizedSourceSentence = tokenized_source_file.readline().strip()
             lemmatizedSourceSentence = lemmatized_source_file.readline().strip()
             targetSentence = target_file.readline().strip()
            alignment = json.loads(alignments_file.readline().strip())
             sourceId = int(source_ids_file.readline().strip())
-            examples.append([sourceSentence, lemmatizedSourceSentence, targetSentence, alignment, sourceId])
+            examples.append([sourceSentence, tokenizedSourceSentence, targetSentence, alignment, sourceId])
+            examples_lemmatized.append([sourceSentence, lemmatizedSourceSentence, targetSentence, alignment, sourceId])
 
             addedCount += 1
             if len(examples) >= BUFFER_SIZE:
-                data['examples'] = examples
-                add_examples(data)
+                add_examples(examples, nonLemmatizedTmId)
+                add_examples(examples_lemmatized, lemmatizedTmId)
                 mark = time.time()
-                print "Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % (addedCount, totalExamples, mark-start, addedCount/(mark-start))
+                print "Added %d of %d examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % (addedCount, totalExamples, mark-start, addedCount/(mark-start))
                 examples = []
+                examples_lemmatized = []
 
 if len(examples) > 0:
-    data['examples'] = examples
-    add_examples(data)
+    add_examples(examples, nonLemmatizedTmId)
+    add_examples(examples_lemmatized, lemmatizedTmId)
 
 end = time.time()
-print "Added all %d lemmatized sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % (addedCount, end-start, addedCount/(end-start))
+print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % (addedCount, end-start, addedCount/(end-start))
 
-print "Generating index..."
+print "Generating indexes..."
 
 start = time.time()
 data = {
     'operation': 'refreshIndex',
-    'tmId' : tmId
+    'tmId' : nonLemmatizedTmId
+}
+req = urllib2.Request(address)
+req.add_header('Content-Type', 'application/json')
+urllib2.urlopen(req, json.dumps(data), timeout = 3600).read()
+
+data = {
+    'operation': 'refreshIndex',
+    'tmId' : lemmatizedTmId
 }
 req = urllib2.Request(address)
 req.add_header('Content-Type', 'application/json')
 urllib2.urlopen(req, json.dumps(data), timeout = 3600).read()
 
 end = time.time()
-print "Index regeneration complete. The operation took %.4f s" % (end - start)
+print "Index regeneration complete. The operation took %.4f s" % (end - start)
\ No newline at end of file
diff --git a/tools/simpleSearch.py b/tools/simpleSearch.py
index e7bdaee..252b286 100755
--- a/tools/simpleSearch.py
+++ b/tools/simpleSearch.py
@@ -11,7 +11,7 @@ import host
 data = {
     'operation': 'simpleSearch',
     'pattern':sys.argv[1],
-    'tmId':int(sys.argv[2])
+    'tmId':int(sys.argv[2]),
 }
 
 address = 'http://'+host.concordia_host
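
For reference, a minimal sketch of the server round trip these scripts now perform: addPairedTms to create the lemmatized/non-lemmatized TM pair, addSentences into each TM, refreshIndex for both, and finally the simpleSearch call that tools/simpleSearch.py issues. The address, language ids, TM name, example sentences and alignment value below are placeholders (the real values come from corpus.cfg, the host module and the aligner output); only the operation names and payload shapes are taken from the patch above.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Sketch only: placeholder host and data; payload shapes mirror the patch.

import json
import urllib2

address = 'http://localhost'  # placeholder; the tools build this from host.concordia_host

def post(data):
    # POST one JSON operation to the concordia server, mirroring the
    # urllib2 calls in add_fast_aligned_TM.py, and return the raw body.
    req = urllib2.Request(address)
    req.add_header('Content-Type', 'application/json')
    return urllib2.urlopen(req, json.dumps(data), timeout = 3600).read()

# 1. Create the paired TMs; the response carries both ids.
response = json.loads(post({
    'operation': 'addPairedTms',
    'sourceLangId': 1,  # placeholder language ids
    'targetLangId': 2,
    'name': 'demo'
}))
lemmatizedTmId = int(response['lemmatizedTmId'])
nonLemmatizedTmId = int(response['nonLemmatizedTmId'])

# 2. Add one example in the 5-element shape built by the ingestion loop:
# [raw source, tokenized or lemmatized source, target, alignment, source id].
# The script sends the tokenized variant to the non-lemmatized TM and the
# lemmatized variant to the other; one shared example keeps this sketch short.
# The alignment value is a guessed placeholder for one line of alignments.txt.
example = ['Hello world.', 'hello world .', 'Witaj swiecie .', [[0], [1], [2]], 1]
post({'operation': 'addSentences', 'tmId': nonLemmatizedTmId, 'examples': [example]})
post({'operation': 'addSentences', 'tmId': lemmatizedTmId, 'examples': [example]})

# 3. Refresh both indexes (the script above does not parse this response).
post({'operation': 'refreshIndex', 'tmId': nonLemmatizedTmId})
post({'operation': 'refreshIndex', 'tmId': lemmatizedTmId})

# 4. The call tools/simpleSearch.py makes, against the non-lemmatized TM.
print json.loads(post({'operation': 'simpleSearch', 'pattern': 'hello', 'tmId': nonLemmatizedTmId}))

The two TM ids printed by add_fast_aligned_TM.py are what simpleSearch.py expects as its second argument.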