added tests, fixed network
commit dab669729f (parent 943911209d)
@@ -6,6 +6,10 @@ services:
     build: ./concordia-postgres
     container_name: concordia-postgres
     restart: always
+    environment:
+      POSTGRES_DB: concordia_db
+      POSTGRES_USER: concordia
+      POSTGRES_PASSWORD: concordia
   lemmagen:
     build: ./lemmagen
     container_name: lemmagen
@@ -15,3 +19,6 @@ services:
     container_name: concordia-server
     ports:
       - "10001:80"
+    depends_on:
+      - "concordia-postgres"
+      - "lemmagen"
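The depends_on entries are presumably the "fixed network" part of the commit message: they make Compose start the database and the lemmatizer before concordia-server, and the POSTGRES_* variables give the server a known database to connect to. A quick smoke check after applying the change (hypothetical, not part of the commit):

    docker-compose up -d
    docker-compose ps    # concordia-postgres, lemmagen and concordia-server should all be Up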
2  tests/.gitignore  vendored  Normal file
@@ -0,0 +1,2 @@
host.py
host.pyc
98  tests/addAlignedFile.py  Executable file
@@ -0,0 +1,98 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import unittest
import json
import urllib2
import sys
import host
import time

BUFFER_SIZE = 500

address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port


def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1


def add_data(data):
    req = urllib2.Request(address)
    req.add_header('Content-Type', 'application/json')
    json.loads(urllib2.urlopen(req, json.dumps(data)).read())


sourceFile = sys.argv[1]
sourceLangId = int(sys.argv[2])
targetLangId = int(sys.argv[3])
name = sys.argv[4]

totalLines = file_len(sourceFile)

# Create a new translation memory and remember its id.
data = {
    'operation': 'addTm',
    'sourceLangId': sourceLangId,
    'targetLangId': targetLangId,
    'name': name
}

req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
tmId = int(response['newTmId'])
print "Added new tm: %d" % tmId

data = {
    'operation': 'addAlignedSentences',
    'tmId': tmId
}

# The input holds three lines per sentence pair; the first line of each group
# is skipped, and the remaining two are stored in reversed order.
sentences = []
currSentence = []
start = time.time()
with open(sourceFile) as sourceLines:
    lineNumber = 0
    for line in sourceLines:
        line = line.strip()
        if lineNumber % 3 == 1:
            currSentence.append(line)
        elif lineNumber % 3 == 2:
            currSentence.append(line)
            currSentence.reverse()
            sentences.append(currSentence)
            currSentence = []
            if len(sentences) >= BUFFER_SIZE:
                data['sentences'] = sentences
                add_data(data)
                mark = time.time()
                print "Added %d of %d sentences. Time elapsed: %.4f s, current speed: %.4f sentences/second" % ((lineNumber+1)/3, totalLines/3, mark-start, (lineNumber+1)/(3*(mark-start)))
                sentences = []
        lineNumber += 1


# Flush whatever is left in the buffer.
if len(sentences) > 0:
    data['sentences'] = sentences
    add_data(data)

end = time.time()
print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/(3*(end-start)))

print "Generating index..."
start = time.time()
data = {
    'operation': 'refreshIndex',
    'tmId': tmId
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
urllib2.urlopen(req, json.dumps(data)).read()

end = time.time()
print "Index regeneration complete. The operation took %.4f s" % (end - start)
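A usage sketch for the script above; the corpus path, language ids and TM name are made-up examples, not values from the commit:

    ./addAlignedFile.py corpus_aligned.txt 1 2 sample_tm

The input file is expected to hold three lines per sentence pair, of which the first is skipped, so a 3000-line file yields 1000 pairs.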
80  tests/addAlignedFileToTM.py  Executable file
@@ -0,0 +1,80 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import unittest
import json
import urllib2
import sys
import host
import time

BUFFER_SIZE = 500

address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port


def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1


def add_data(data):
    req = urllib2.Request(address)
    req.add_header('Content-Type', 'application/json')
    json.loads(urllib2.urlopen(req, json.dumps(data)).read())


sourceFile = sys.argv[1]
tmId = int(sys.argv[2])

totalLines = file_len(sourceFile)

data = {
    'operation': 'addAlignedSentences',
    'tmId': tmId
}

sentences = []
currSentence = []
start = time.time()
with open(sourceFile) as sourceLines:
    lineNumber = 0
    for line in sourceLines:
        line = line.strip()
        if lineNumber % 3 == 1:
            currSentence.append(line)
        elif lineNumber % 3 == 2:
            currSentence.append(line)
            currSentence.reverse()
            sentences.append(currSentence)
            currSentence = []
            if len(sentences) >= BUFFER_SIZE:
                data['sentences'] = sentences
                add_data(data)
                mark = time.time()
                print "Added %d of %d sentences. Time elapsed: %.4f s, current speed: %.4f sentences/second" % ((lineNumber+1)/3, totalLines/3, mark-start, (lineNumber+1)/(3*(mark-start)))
                sentences = []
        lineNumber += 1


if len(sentences) > 0:
    data['sentences'] = sentences
    add_data(data)

end = time.time()
print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/(3*(end-start)))

print "Generating index..."
start = time.time()
data = {
    'operation': 'refreshIndex',
    'tmId': tmId
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
urllib2.urlopen(req, json.dumps(data)).read()

end = time.time()
print "Index regeneration complete. The operation took %.4f s" % (end - start)
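Same input format as addAlignedFile.py, but appending to an existing TM instead of creating one; the file path and TM id below are illustrative:

    ./addAlignedFileToTM.py corpus_aligned.txt 5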
111  tests/addAlignedLemmatizedTM.py  Executable file
@@ -0,0 +1,111 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import unittest
import json
import urllib2
import sys
import host
import time

BUFFER_SIZE = 500

address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port


def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1


def add_examples(examplesData):
    req = urllib2.Request(address)
    req.add_header('Content-Type', 'application/json')
    response = json.loads(urllib2.urlopen(req, json.dumps(examplesData)).read())
    if response['status'] == 'error':
        raise Exception(response['message'])


if len(sys.argv) != 7:
    raise Exception("wrong number of arguments")

name = sys.argv[1]
sourceFile = sys.argv[2]
sourceLangId = int(sys.argv[3])
targetFile = sys.argv[4]
targetLangId = int(sys.argv[5])
alignmentsFile = sys.argv[6]

if file_len(sourceFile) != file_len(targetFile):
    raise Exception("source and target files are not of the same length!")

if file_len(alignmentsFile) != 3 * file_len(sourceFile):
    raise Exception("alignments file is not exactly 3 times longer than source and target")


totalExamples = file_len(sourceFile)

data = {
    'operation': 'addTm',
    'sourceLangId': sourceLangId,
    'targetLangId': targetLangId,
    'name': name,
    'tmLemmatized': True
}

req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
print(response)
tmId = int(response['newTmId'])
print "Added new tm: %d" % tmId

data = {
    'operation': 'addAlignedLemmatizedSentences',
    'tmId': tmId
}

examples = []
start = time.time()
with open(sourceFile) as sf, open(targetFile) as tf, open(alignmentsFile) as af:
    for lineNumber in range(totalExamples):
        sourceSentence = sf.readline().strip()
        targetSentence = tf.readline().strip()

        # Skip two lines of the alignments file: these are lemmatized and we
        # need the raw sentences from the source and target files.
        af.readline()
        af.readline()

        alignmentString = af.readline().strip()

        examples.append([sourceSentence, targetSentence, alignmentString])

        if len(examples) >= BUFFER_SIZE:
            data['examples'] = examples
            add_examples(data)
            mark = time.time()
            print "Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % ( (lineNumber+1), totalExamples, mark-start, (lineNumber+1)/(mark-start))
            examples = []


if len(examples) > 0:
    data['examples'] = examples
    add_examples(data)

end = time.time()
print "Added all %d lemmatized sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1), end-start, (lineNumber+1)/(end-start))

print "Generating index..."
start = time.time()
data = {
    'operation': 'refreshIndex',
    'tmId': tmId
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
urllib2.urlopen(req, json.dumps(data)).read()

end = time.time()
print "Index regeneration complete. The operation took %.4f s" % (end - start)
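The expected call shape, mirroring addLemmatizedTM.sh further down; the TM name and paths here are placeholders:

    ./addAlignedLemmatizedTM.py sample_tm src_final.txt 1 trg_final.txt 2 aligned_final.txt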
117  tests/addFastAlignedTM.py  Executable file
@@ -0,0 +1,117 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import json
import urllib2
import sys
import host
import time

BUFFER_SIZE = 500
LEAVE_OUT = 1  # 1 does not leave out anything

address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port


def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1


def add_examples(examplesData):
    req = urllib2.Request(address)
    req.add_header('Content-Type', 'application/json')
    response = json.loads(urllib2.urlopen(req, json.dumps(examplesData), timeout=3600).read())
    print(response)
    if response['status'] == 'error':
        raise Exception(response['message'])


if len(sys.argv) != 9:
    raise Exception("wrong number of arguments")

name = sys.argv[1]
sourceFile = sys.argv[2]
lemmatizedSourceFile = sys.argv[3]
sourceLangId = int(sys.argv[4])
targetFile = sys.argv[5]
targetLangId = int(sys.argv[6])
alignmentsFile = sys.argv[7]
sourceIdsFile = sys.argv[8]

sourceFileLength = file_len(sourceFile)
lemmatizedSourceFileLength = file_len(lemmatizedSourceFile)
targetFileLength = file_len(targetFile)
alignmentsFileLength = file_len(alignmentsFile)
sourceIdsFileLength = file_len(sourceIdsFile)

if not (sourceFileLength == lemmatizedSourceFileLength and lemmatizedSourceFileLength == targetFileLength and targetFileLength == alignmentsFileLength and alignmentsFileLength == sourceIdsFileLength):
    print("File lengths:")
    print("source file: %d\nlemmatized source file: %d\ntarget file: %d\nalignments file: %d\nsource ids file: %d" % (sourceFileLength, lemmatizedSourceFileLength, targetFileLength, alignmentsFileLength, sourceIdsFileLength))
    raise Exception("files are not of the same length!")

totalExamples = sourceFileLength / LEAVE_OUT

data = {
    'operation': 'addTm',
    'sourceLangId': sourceLangId,
    'targetLangId': targetLangId,
    'name': name,
    'tmLemmatized': True
}

req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data), timeout=3600).read())
print(response)
tmId = int(response['newTmId'])
print "Added new tm: %d" % tmId

data = {
    'operation': 'addSentences',
    'tmId': tmId
}

examples = []
start = time.time()
with open(sourceFile) as source_file, open(lemmatizedSourceFile) as lemmatized_source_file, open(targetFile) as target_file, open(alignmentsFile) as alignments_file, open(sourceIdsFile) as source_ids_file:
    addedCount = 0
    # Note: lines are only consumed on kept iterations, so with LEAVE_OUT > 1
    # this takes the first 1/LEAVE_OUT of the corpus, not every n-th example.
    for lineNumber in range(sourceFileLength):
        if lineNumber % LEAVE_OUT == 0:
            sourceSentence = source_file.readline().strip()
            lemmatizedSourceSentence = lemmatized_source_file.readline().strip()
            targetSentence = target_file.readline().strip()
            alignment = json.loads(alignments_file.readline().strip())
            sourceId = int(source_ids_file.readline().strip())

            examples.append([sourceSentence, lemmatizedSourceSentence, targetSentence, alignment, sourceId])
            addedCount += 1
            if len(examples) >= BUFFER_SIZE:
                data['examples'] = examples
                add_examples(data)
                mark = time.time()
                print "Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % (addedCount, totalExamples, mark-start, addedCount/(mark-start))
                examples = []


if len(examples) > 0:
    data['examples'] = examples
    add_examples(data)

end = time.time()
print "Added all %d lemmatized sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % (addedCount, end-start, addedCount/(end-start))

print "Generating index..."
start = time.time()
data = {
    'operation': 'refreshIndex',
    'tmId': tmId
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
urllib2.urlopen(req, json.dumps(data), timeout=3600).read()

end = time.time()
print "Index regeneration complete. The operation took %.4f s" % (end - start)
8  tests/addFastAlignedTM.sh  Executable file
@@ -0,0 +1,8 @@
#!/bin/sh

CORPUS_NAME=opensubtitles
CORPUS_PATH=../fast-aligner/corpora/$CORPUS_NAME
SRC_LANG_ID=1
TRG_LANG_ID=2

./addFastAlignedTM.py $CORPUS_NAME $CORPUS_PATH/src_clean.txt $CORPUS_PATH/src_clean.lem $SRC_LANG_ID $CORPUS_PATH/trg_clean.txt $TRG_LANG_ID $CORPUS_PATH/alignments.txt $CORPUS_PATH/ids_clean.txt
97  tests/addFile.py  Executable file
@@ -0,0 +1,97 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import unittest
import json
import urllib2
import sys
import host
import time

BUFFER_SIZE = 500

address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port


def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1


def add_data(data):
    req = urllib2.Request(address)
    req.add_header('Content-Type', 'application/json')
    urllib2.urlopen(req, json.dumps(data)).read()


sourceFile = sys.argv[1]
sourceLangId = int(sys.argv[2])
targetFile = sys.argv[3]
targetLangId = int(sys.argv[4])
name = sys.argv[5]

totalLines = file_len(sourceFile)
if file_len(targetFile) != totalLines:
    print "File lengths do not match"
    sys.exit(1)

data = {
    'operation': 'addTm',
    'sourceLangId': sourceLangId,
    'targetLangId': targetLangId,
    'name': name
}

req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
tmId = int(response['newTmId'])
print "Added new tm: %d" % tmId


data = {
    'operation': 'addSentences',
    'tmId': tmId
}

sentences = []
start = time.time()
with open(sourceFile) as sourceSentences:
    with open(targetFile) as targetSentences:
        lineNumber = 0
        for sourceSentence in sourceSentences:
            lineNumber += 1
            targetSentence = targetSentences.readline()
            sentences.append([sourceSentence, targetSentence])
            if lineNumber % BUFFER_SIZE == 0:
                data['sentences'] = sentences
                sentences = []
                add_data(data)
                mark = time.time()
                print "Added %d of %d sentences. Time elapsed: %.4f s, current speed: %.4f sentences/second" % (lineNumber, totalLines, mark-start, lineNumber/(mark-start))


if len(sentences) > 0:
    data['sentences'] = sentences
    add_data(data)

end = time.time()
print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % (lineNumber, end-start, lineNumber/(end-start))

print "Generating index..."
start = time.time()
data = {
    'operation': 'refreshIndex',
    'tmId': tmId
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
urllib2.urlopen(req, json.dumps(data)).read()

end = time.time()
print "Index regeneration complete. The operation took %.4f s" % (end - start)
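A hypothetical invocation of the script above (paths, language ids and TM name are examples, not from the commit):

    ./addFile.py src.txt 1 trg.txt 2 sample_tm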
4  tests/addJrc.sh  Executable file
@@ -0,0 +1,4 @@
#!/bin/sh

# NOTE: addFile.py expects five arguments (sourceFile, sourceLangId, targetFile,
# targetLangId, name); this call passes only three and would fail as written.
./addFile.py ~/projects/corpora/jrc/jrc_pl.txt ~/projects/corpora/jrc/jrc_en.txt 1
7  tests/addLemmatizedTM.sh  Executable file
@@ -0,0 +1,7 @@
#!/bin/sh

CORPUS_NAME="stocznia_plen"
SRC_LANG_ID=1
TRG_LANG_ID=2

./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src_final.txt $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg_final.txt $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned_final.txt
7  tests/addLemmatizedTMfromParams.sh  Executable file
@@ -0,0 +1,7 @@
#!/bin/sh

CORPUS_NAME=$1
SRC_LANG_ID=$2
TRG_LANG_ID=$3

./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src_final.txt $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg_final.txt $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned_final.txt
33  tests/addSentence.py  Executable file
@@ -0,0 +1,33 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import unittest
import json
import urllib2
import sys
import time
import host

address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port


data = {
    'operation': 'addSentence',
    'sourceSentence': sys.argv[1],
    'targetSentence': sys.argv[2],
    'tmId': int(sys.argv[3])
}

start = time.time()
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
end = time.time()

print "Execution time: %.4f seconds." % (end-start)
print "Result: "
print response
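A usage sketch; the sentence pair and TM id are illustrative:

    ./addSentence.py "jeszcze jedno zdanie" "one more sentence" 1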
49  tests/addSources.py  Executable file
@@ -0,0 +1,49 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import json
import urllib2
import sys
import time
import host

BUFFER_SIZE = 500


def addSources(sources_buffer):
    data = {
        'operation': 'addSources',
        'sources': sources_buffer
    }

    req = urllib2.Request(address)
    req.add_header('Content-Type', 'application/json')
    urllib2.urlopen(req, json.dumps(data))


address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port


with open(sys.argv[1]) as sources_file:
    counter = 0
    sources_buffer = []
    for line in sources_file:
        counter += 1
        id_raw, link, name = line.rstrip().split('\t')

        sources_buffer.append([int(id_raw), name, link])
        if len(sources_buffer) == BUFFER_SIZE:
            addSources(sources_buffer)
            sources_buffer = []
            print("Added %d sources" % counter)


if len(sources_buffer) > 0:
    addSources(sources_buffer)

print("Added all %d sources" % counter)
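The sources file is expected to hold one source per line as tab-separated id, link and name. A hypothetical input and invocation:

    printf '1\thttp://example.com/doc\tExample document\n' > sources.tsv
    ./addSources.py sources.tsv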
12  tests/addStocznia.sh  Executable file
@@ -0,0 +1,12 @@
#!/bin/sh

./addTm.py 1 2 placeholder 1

./addAlignedLemmatizedTM.py stocznia_plen ../mgiza-aligner/corpora/stocznia_plen/src_final.txt 1 ../mgiza-aligner/corpora/stocznia_plen/trg_final.txt 2 ../mgiza-aligner/corpora/stocznia_plen/aligned_final.txt

./addTm.py 1 2 placeholder 1

./addTm.py 1 2 placeholder 1

./addAlignedLemmatizedTM.py stocznia_enpl ../mgiza-aligner/corpora/stocznia_enpl/src_final.txt 2 ../mgiza-aligner/corpora/stocznia_enpl/trg_final.txt 1 ../mgiza-aligner/corpora/stocznia_enpl/aligned_final.txt
27  tests/addTm.py  Executable file
@@ -0,0 +1,27 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import json
import urllib2
import sys
import time
import host

address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port


data = {
    'operation': 'addTm',
    'sourceLangId': int(sys.argv[1]),
    'targetLangId': int(sys.argv[2]),
    'name': sys.argv[3],
    'tmLemmatized': bool(int(sys.argv[4]))
}

req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())

print response
8  tests/build.sh  Executable file
@@ -0,0 +1,8 @@
#!/bin/sh

./addLemmatizedTMfromParams.sh tmrepository_enhr 2 6
./addTm.py 1 2 placeholder 1
./addLemmatizedTMfromParams.sh icd_dictionary 1 2
./addLemmatizedTMfromParams.sh icd_filtered 1 2
./addLemmatizedTMfromParams.sh emea_plen 1 2
./addLemmatizedTMfromParams.sh jrc_enes 2 4
30  tests/concordiaSearch.py  Executable file
@@ -0,0 +1,30 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import unittest
import json
import urllib2
import sys
import time
import host

address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port


data = {
    'operation': 'concordiaSearch',
    'pattern': sys.argv[1],
    'tmId': int(sys.argv[2])
}

start = time.time()
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = urllib2.urlopen(req, json.dumps(data)).read()
end = time.time()

print "Execution time: %.4f seconds." % (end-start)
print "Result: "
print response
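A usage sketch (pattern and TM id are illustrative); simpleSearch.py and lexiconSearch.py below take the same two arguments:

    ./concordiaSearch.py "zdanie testowe" 1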
31  tests/fullSearch.py  Executable file
@@ -0,0 +1,31 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import unittest
import json
import urllib2
import sys
import time
import host

data = {
    'operation': 'fullSearch',
    'pattern': sys.argv[1],
    'tmId': int(sys.argv[2]),
    'limit': int(sys.argv[3]),
    'offset': int(sys.argv[4])
}

address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port

start = time.time()
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = urllib2.urlopen(req, json.dumps(data)).read()
end = time.time()

print "Execution time: %.4f seconds." % (end-start)
print "Result: "
print response
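Same shape as above plus paging; a hypothetical call asking for the first 10 results:

    ./fullSearch.py "zdanie testowe" 1 10 0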
28  tests/generateIndex.py  Executable file
@@ -0,0 +1,28 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import unittest
import json
import urllib2
import sys
import host
import time

address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port


print "Generating index..."
start = time.time()
data = {
    'operation': 'refreshIndex',
    'tmId': 1
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
urllib2.urlopen(req, json.dumps(data)).read()

end = time.time()
print "Index regeneration complete. The operation took %.4f s" % (end - start)
25  tests/getTmsInfo.py  Executable file
@@ -0,0 +1,25 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import json
import urllib2
import sys
import time
import host

address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port


print("Trying getTmsInfo on %s" % address)

data = {
    'operation': 'getTmsInfo'
}

req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())

print response
2  tests/host.py_example  Normal file
@@ -0,0 +1,2 @@
concordia_host = 'localhost'
concordia_port = ''
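Every test script imports host, and tests/.gitignore above keeps the real host.py out of version control; a plausible setup step is:

    cp host.py_example host.py    # then point concordia_host / concordia_port at your deployment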
29  tests/lemmatizeSentence.py  Executable file
@@ -0,0 +1,29 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import unittest
import json
import urllib2
import sys
import time
import host

data = {
    'operation': 'lemmatize',
    'languageCode': sys.argv[1],
    'sentence': sys.argv[2]
}

address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port

start = time.time()
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
end = time.time()

print "Execution time: %.4f seconds." % (end-start)
print "Result: "
print response
29  tests/lemmatizeSentences.py  Executable file
@@ -0,0 +1,29 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import unittest
import json
import urllib2
import sys
import time
import host

data = {
    'operation': 'lemmatizeAll',
    'languageCode': sys.argv[1],
    'sentences': ["ona poszła do sklepu", "powiedziałem to Tomkowi"]
}

address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port

start = time.time()
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
end = time.time()

print "Execution time: %.4f seconds." % (end-start)
print "Result: "
print response
29  tests/lexiconSearch.py  Executable file
@@ -0,0 +1,29 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import unittest
import json
import urllib2
import sys
import time
import host

data = {
    'operation': 'lexiconSearch',
    'pattern': sys.argv[1],
    'tmId': int(sys.argv[2])
}

address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port

start = time.time()
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = urllib2.urlopen(req, json.dumps(data)).read()
end = time.time()

print "Execution time: %.4f seconds." % (end-start)
print "Result: "
print response
29  tests/simpleSearch.py  Executable file
@@ -0,0 +1,29 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import unittest
import json
import urllib2
import sys
import time
import host

data = {
    'operation': 'simpleSearch',
    'pattern': sys.argv[1],
    'tmId': int(sys.argv[2])
}

address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port

start = time.time()
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = urllib2.urlopen(req, json.dumps(data)).read()
end = time.time()

print "Execution time: %.4f seconds." % (end-start)
print "Result: "
print response
12  tests/testCurl.sh  Executable file
@@ -0,0 +1,12 @@
#!/bin/sh

# add sentence
#curl -H "Content-Type: application/json" -X POST -d '{"operation":"addSentence", "sourceSentence":"I jeszcze jedno zdanie testowe", "targetSentence":"Yet another test sentence", "tmId":1}' http://localhost

# add sentences
#curl -H "Content-Type: application/json" -X POST -d '{"operation":"addSentences", "sentences":[[1,"test source one", "test target one"],[4,"test source two", "test target two"],[9,"test source three", "test target three"],[13,"test source four", "test target four"]]}' http://localhost

# simple search
curl -H "Content-Type: application/json" -X POST -d '{"operation":"simpleSearch", "pattern":"test source"}' http://localhost