add fast aligned tm

This commit is contained in:
Rafał Jaworski 2019-02-22 22:13:45 +01:00
parent 8a365ec8be
commit a8e1117f27
5 changed files with 159 additions and 27 deletions

View File

@@ -306,7 +306,7 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
jsonWriter.StartObject();
jsonWriter.String("status");
jsonWriter.String("success");
jsonWriter.String("newTmId");
jsonWriter.String("newSourceId");
jsonWriter.Int(newId);
jsonWriter.EndObject();
} else {

114
tests/addFastAlignedTM.py Executable file
View File

@@ -0,0 +1,114 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import urllib2
import sys
import host
import time
# How many sentence examples are accumulated before one addSentences request.
BUFFER_SIZE = 500

# Base URL of the Concordia server; append the port only when one is configured.
address = 'http://' + host.concordia_host
if host.concordia_port:
    address += ':' + host.concordia_port
def file_len(fname):
    """Return the number of lines in the file *fname*.

    The original implementation left the loop variable unbound for an
    empty file and raised NameError on ``return i + 1``; an empty file
    now correctly yields 0.
    """
    count = 0
    with open(fname) as f:
        # enumerate(..., 1) makes `count` the running 1-based line number;
        # it simply stays 0 when the file has no lines.
        for count, _ in enumerate(f, 1):
            pass
    return count
def add_examples(examplesData):
    """POST one batch of examples to the Concordia server as JSON.

    Prints the parsed server response and raises when the server reports
    an error status.
    """
    request = urllib2.Request(address)
    request.add_header('Content-Type', 'application/json')
    raw_reply = urllib2.urlopen(request, json.dumps(examplesData)).read()
    reply = json.loads(raw_reply)
    print(reply)
    if reply['status'] == 'error':
        raise Exception(reply['message'])
# ---- command-line interface -----------------------------------------------
# argv: name, sourceFile, lemmatizedSourceFile, sourceLangId, targetFile,
#       targetLangId, alignmentsFile, sourceIdsFile
if len(sys.argv) != 9:
    raise Exception("wrong number of arguments")

name = sys.argv[1]
sourceFile = sys.argv[2]
lemmatizedSourceFile = sys.argv[3]
sourceLangId = int(sys.argv[4])
targetFile = sys.argv[5]
targetLangId = int(sys.argv[6])
alignmentsFile = sys.argv[7]
sourceIdsFile = sys.argv[8]

# All five input files must be line-aligned: line i of each file describes
# the same sentence pair, so their line counts have to match exactly.
sourceFileLength = file_len(sourceFile)
lemmatizedSourceFileLength = file_len(lemmatizedSourceFile)
targetFileLength = file_len(targetFile)
alignmentsFileLength = file_len(alignmentsFile)
sourceIdsFileLength = file_len(sourceIdsFile)
if not (sourceFileLength == lemmatizedSourceFileLength and lemmatizedSourceFileLength == targetFileLength and targetFileLength == alignmentsFileLength and alignmentsFileLength == sourceIdsFileLength):
    print("File lengths:")
    print("source file: %d\nlemmatized source file: %d\ntarget file: %d\nalignments file: %d\nsource ids file: %d" % (sourceFileLength, lemmatizedSourceFileLength, targetFileLength, alignmentsFileLength, sourceIdsFileLength))
    raise Exception("files are not of the same length!")

# NOTE(review): this equals sourceFileLength computed above; the extra file
# pass could be dropped.
totalExamples = file_len(sourceFile)

# Create the new (lemmatized) translation memory on the server.
data = {
    'operation': 'addTm',
    'sourceLangId':sourceLangId,
    'targetLangId':targetLangId,
    'name':name,
    'tmLemmatized':True
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
print(response)
tmId = int(response['newTmId'])
print "Added new tm: %d" % tmId

# Stream the five aligned files in parallel, buffering examples and posting
# every full buffer through add_examples().
data = {
    'operation': 'addSentences',
    'tmId':tmId
}
examples = []
start = time.time()
with open(sourceFile) as source_file, open(lemmatizedSourceFile) as lemmatized_source_file, open(targetFile) as target_file, open(alignmentsFile) as alignments_file, open(sourceIdsFile) as source_ids_file:
    for lineNumber in range(totalExamples):
        sourceSentence = source_file.readline().strip()
        lemmatizedSourceSentence = lemmatized_source_file.readline().strip()
        targetSentence = target_file.readline().strip()
        # Each alignments line holds a JSON structure for this sentence pair.
        alignment = json.loads(alignments_file.readline().strip())
        sourceId = int(source_ids_file.readline().strip())
        examples.append([sourceSentence, lemmatizedSourceSentence, targetSentence, alignment, sourceId])

        # Flush a full buffer to the server and report progress/throughput.
        if len(examples) >= BUFFER_SIZE:
            data['examples'] = examples
            add_examples(data)
            mark = time.time()
            print "Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % ( (lineNumber+1), totalExamples, mark-start, (lineNumber+1)/(mark-start))
            examples = []

    # Send the remaining examples that did not fill a whole buffer.
    if len(examples) > 0:
        data['examples'] = examples
        add_examples(data)

end = time.time()
print "Added all %d lemmatized sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1), end-start, (lineNumber+1)/(end-start))

# Rebuild the Concordia index so the newly added sentences become searchable.
print "Generating index..."
start = time.time()
data = {
    'operation': 'refreshIndex',
    'tmId' : tmId
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
urllib2.urlopen(req, json.dumps(data)).read()
end = time.time()
print "Index regeneration complete. The operation took %.4f s" % (end - start)

8
tests/addFastAlignedTM.sh Executable file
View File

@@ -0,0 +1,8 @@
#!/bin/sh
# Load the sample fast-aligned corpus into Concordia via addFastAlignedTM.py.
# All expansions are quoted so the command stays intact should the corpus
# path ever contain whitespace.
CORPUS_NAME=opensubtitles_sample
CORPUS_PATH="../fast-aligner/corpora/$CORPUS_NAME"
SRC_LANG_ID=1
TRG_LANG_ID=2
./addFastAlignedTM.py "$CORPUS_NAME" "$CORPUS_PATH/src_clean.txt" "$CORPUS_PATH/src_clean.lem" "$SRC_LANG_ID" "$CORPUS_PATH/trg_clean.txt" "$TRG_LANG_ID" "$CORPUS_PATH/alignments.txt" "$CORPUS_PATH/ids_clean.txt"

View File

@ -1,26 +0,0 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import urllib2
import sys
import time
import host
address = 'http://'+host.concordia_host
if len(host.concordia_port) > 0:
address += ':'+host.concordia_port
data = {
'operation': 'addSource',
'externalId':56,
'name':'test '
'tmLemmatized':bool(int(sys.argv[4]))
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
print response

36
tests/addSources.py Executable file
View File

@@ -0,0 +1,36 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import urllib2
import sys
import time
import host

# Concordia server address; the port is appended only when configured.
address = 'http://' + host.concordia_host
if host.concordia_port:
    address += ':' + host.concordia_port

# The input file (argv[1]) holds one source per line, tab-separated:
#   externalId <TAB> link <TAB> name
with open(sys.argv[1]) as sources_file:
    counter = 0
    for counter, line in enumerate(sources_file, 1):
        idStr, link, name = line.rstrip().split('\t')
        payload = {
            'operation': 'addSource',
            'externalId':int(idStr),
            'name':name,
            'link':link
        }
        req = urllib2.Request(address)
        req.add_header('Content-Type', 'application/json')
        urllib2.urlopen(req, json.dumps(payload)).read()
        # Periodic progress report so long imports show signs of life.
        if counter % 1000 == 0:
            print("Done %d sources" % counter)