cleaned tests

This commit is contained in:
Rafał Jaworski 2019-05-30 11:25:12 +02:00
parent 441c6fffb5
commit 6b59e1b0f3
45 changed files with 0 additions and 466 deletions

View File

@ -1,98 +0,0 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import unittest
import json
import urllib2
import sys
import host
import time
# Number of sentence pairs collected before they are sent to the server
# in a single request.
BUFFER_SIZE = 500

# URL every request is sent to; host and port come from the local `host`
# module, and the port part is appended only when it is non-empty.
address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address = address + ':' + host.concordia_port
def file_len(fname):
    """Return the number of lines in the file at ``fname``.

    Args:
        fname: path of the file to count.

    Returns:
        The line count; 0 for an empty file. A last line without a trailing
        newline is still counted.
    """
    # Start from 0 so that an empty file (loop body never runs) yields 0
    # instead of failing on an unbound loop variable.
    count = 0
    with open(fname) as f:
        for count, _ in enumerate(f, 1):
            pass
    return count
def add_data(data):
    """POST ``data`` to the server as JSON and return the decoded reply.

    Args:
        data: JSON-serializable request payload.

    Returns:
        The server's response decoded from JSON.

    Raises:
        Exception: if the server replies with ``status`` equal to
            ``'error'``; the exception carries the server's ``message``.
        ValueError: if the response body is not valid JSON.
    """
    req = urllib2.Request(address)
    req.add_header('Content-Type', 'application/json')
    # Supplying a request body makes urllib2 send a POST.
    response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
    # The decoded reply used to be thrown away, so a failed batch went
    # unnoticed and the import kept going. Surface server-side errors.
    if isinstance(response, dict) and response.get('status') == 'error':
        raise Exception(response.get('message'))
    return response
# Script body: create a new translation memory (TM), upload the aligned
# sentences read from a file in batches of BUFFER_SIZE, then refresh the
# TM's index.
#
# Usage: <script> sourceFile sourceLangId targetLangId name

# Fail with a readable message instead of an IndexError on missing arguments.
if len(sys.argv) < 5:
    raise Exception("wrong number of arguments, expected: sourceFile sourceLangId targetLangId name")
sourceFile = sys.argv[1]
sourceLangId = int(sys.argv[2])
targetLangId = int(sys.argv[3])
name = sys.argv[4]

totalLines = file_len(sourceFile)

# Create the TM and keep the id the server assigned to it.
data = {
    'operation': 'addTm',
    'sourceLangId': sourceLangId,
    'targetLangId': targetLangId,
    'name': name
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
tmId = int(response['newTmId'])
print("Added new tm: %d" % tmId)

# The same request dict is reused for every batch; only 'sentences' changes.
data = {
    'operation': 'addAlignedSentences',
    'tmId': tmId
}

sentences = []
currSentence = []
# Counted directly rather than derived from the line counter, so the summary
# stays correct even when the file ends with an incomplete group of lines.
sentencesAdded = 0
start = time.time()
with open(sourceFile) as sourceLines:
    lineNumber = 0
    for line in sourceLines:
        line = line.strip()
        # The file is read in groups of three lines: the first line of each
        # group is ignored, the second and third form one entry and are
        # sent in reversed order (third line first).
        if lineNumber % 3 == 1:
            currSentence.append(line)
        elif lineNumber % 3 == 2:
            currSentence.append(line)
            currSentence.reverse()
            sentences.append(currSentence)
            sentencesAdded += 1
            currSentence = []
            if len(sentences) >= BUFFER_SIZE:
                data['sentences'] = sentences
                add_data(data)
                mark = time.time()
                elapsed = mark - start
                # Guard against a zero interval on coarse clocks.
                speed = sentencesAdded / elapsed if elapsed > 0 else 0.0
                print("Added %d of %d sentences. Time elapsed: %.4f s, current speed: %.4f sentences/second" % (sentencesAdded, totalLines // 3, elapsed, speed))
                sentences = []
        lineNumber += 1

# Send whatever is left in the buffer after the last full batch.
if len(sentences) > 0:
    data['sentences'] = sentences
    add_data(data)

end = time.time()
elapsed = end - start
speed = sentencesAdded / elapsed if elapsed > 0 else 0.0
print("Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % (sentencesAdded, elapsed, speed))

print("Generating index...")
start = time.time()
data = {
    'operation': 'refreshIndex',
    'tmId': tmId
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
urllib2.urlopen(req, json.dumps(data)).read()

end = time.time()
print("Index regeneration complete. The operation took %.4f s" % (end - start))

View File

@ -1,80 +0,0 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import unittest
import json
import urllib2
import sys
import host
import time
# How many sentence pairs are accumulated before one upload request is made.
BUFFER_SIZE = 500

# Target URL for all requests, assembled from the local `host` module.
# The ':port' suffix is added only if a non-empty port is configured.
address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address = address + ':' + host.concordia_port
def file_len(fname):
    """Count the lines of the file at ``fname``.

    Args:
        fname: path of the file to inspect.

    Returns:
        Number of lines in the file, 0 when the file is empty. An
        unterminated final line counts as a line.
    """
    # Initialised up front: for an empty file the loop never assigns it,
    # which previously caused an UnboundLocalError on return.
    count = 0
    with open(fname) as f:
        for count, _ in enumerate(f, 1):
            pass
    return count
def add_data(data):
    """Send ``data`` to the server as a JSON POST and return the parsed reply.

    Args:
        data: JSON-serializable request payload.

    Returns:
        The JSON-decoded server response.

    Raises:
        Exception: when the response has ``status == 'error'``; the message
            is taken from the response's ``message`` field.
        ValueError: when the response body cannot be parsed as JSON.
    """
    req = urllib2.Request(address)
    req.add_header('Content-Type', 'application/json')
    # A request with a body is sent by urllib2 as POST.
    response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
    # Do not silently drop the reply: a rejected batch must stop the import
    # rather than be reported as added.
    if isinstance(response, dict) and response.get('status') == 'error':
        raise Exception(response.get('message'))
    return response
# Script body: upload aligned sentences from a file into an EXISTING
# translation memory (TM) in batches of BUFFER_SIZE, then refresh its index.
#
# Usage: <script> sourceFile tmId

# Report missing arguments explicitly instead of dying with an IndexError.
if len(sys.argv) < 3:
    raise Exception("wrong number of arguments, expected: sourceFile tmId")
sourceFile = sys.argv[1]
tmId = int(sys.argv[2])

totalLines = file_len(sourceFile)

# One request dict is reused for all batches; 'sentences' is replaced each time.
data = {
    'operation': 'addAlignedSentences',
    'tmId': tmId
}

sentences = []
currSentence = []
# Explicit counter: deriving the total from the line counter over-reports
# when the file ends with an incomplete group of lines.
sentencesAdded = 0
start = time.time()
with open(sourceFile) as sourceLines:
    lineNumber = 0
    for line in sourceLines:
        line = line.strip()
        # Lines are consumed in groups of three: the first is skipped, the
        # second and third become one entry, stored third-line-first.
        if lineNumber % 3 == 1:
            currSentence.append(line)
        elif lineNumber % 3 == 2:
            currSentence.append(line)
            currSentence.reverse()
            sentences.append(currSentence)
            sentencesAdded += 1
            currSentence = []
            if len(sentences) >= BUFFER_SIZE:
                data['sentences'] = sentences
                add_data(data)
                mark = time.time()
                elapsed = mark - start
                # Avoid ZeroDivisionError if the clock did not advance.
                speed = sentencesAdded / elapsed if elapsed > 0 else 0.0
                print("Added %d of %d sentences. Time elapsed: %.4f s, current speed: %.4f sentences/second" % (sentencesAdded, totalLines // 3, elapsed, speed))
                sentences = []
        lineNumber += 1

# Flush the partially filled last batch.
if len(sentences) > 0:
    data['sentences'] = sentences
    add_data(data)

end = time.time()
elapsed = end - start
speed = sentencesAdded / elapsed if elapsed > 0 else 0.0
print("Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % (sentencesAdded, elapsed, speed))

print("Generating index...")
start = time.time()
data = {
    'operation': 'refreshIndex',
    'tmId': tmId
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
urllib2.urlopen(req, json.dumps(data)).read()

end = time.time()
print("Index regeneration complete. The operation took %.4f s" % (end - start))

View File

@ -1,111 +0,0 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import unittest
import json
import urllib2
import sys
import host
import time
# Size of one upload batch: examples are buffered until this many are queued.
BUFFER_SIZE = 500

# Request URL built from the local `host` module; a ':port' suffix is
# appended only when the configured port string is not empty.
address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address = address + ':' + host.concordia_port
def file_len(fname):
    """Return how many lines the file at ``fname`` contains.

    Args:
        fname: path of the file to count.

    Returns:
        The number of lines; 0 for an empty file. A final line lacking a
        newline terminator is included in the count.
    """
    # Default for the empty-file case, where the loop assigns nothing;
    # without it the return statement raised UnboundLocalError.
    count = 0
    with open(fname) as f:
        for count, _ in enumerate(f, 1):
            pass
    return count
def add_examples(examplesData):
    """POST ``examplesData`` to the server as JSON and check the reply.

    Args:
        examplesData: JSON-serializable request payload.

    Raises:
        Exception: if the decoded response has ``status`` equal to
            ``'error'``; the server's ``message`` is used as the text.
    """
    request = urllib2.Request(address)
    request.add_header('Content-Type', 'application/json')
    payload = json.dumps(examplesData)
    raw_reply = urllib2.urlopen(request, payload).read()
    reply = json.loads(raw_reply)
    if reply['status'] == 'error':
        raise Exception(reply['message'])
# Script body: create a lemmatized translation memory (TM), upload
# (source sentence, target sentence, alignment) examples in batches of
# BUFFER_SIZE, then refresh the TM's index.
#
# Usage: <script> name sourceFile sourceLangId targetFile targetLangId alignmentsFile
if len(sys.argv) != 7:
    raise Exception("wrong number of arguments")

name = sys.argv[1]
sourceFile = sys.argv[2]
sourceLangId = int(sys.argv[3])
targetFile = sys.argv[4]
targetLangId = int(sys.argv[5])
alignmentsFile = sys.argv[6]

# Count the source file once and reuse the result; it used to be re-read
# for every comparison below.
totalExamples = file_len(sourceFile)
if totalExamples != file_len(targetFile):
    raise Exception("source and target files are not of the same length!")

if file_len(alignmentsFile) != 3 * totalExamples:
    raise Exception("alignments file is not exactly 3 times longer than source and target")

# Create the TM and remember the id assigned by the server.
data = {
    'operation': 'addTm',
    'sourceLangId': sourceLangId,
    'targetLangId': targetLangId,
    'name': name,
    'tmLemmatized': True
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
print(response)
tmId = int(response['newTmId'])
print("Added new tm: %d" % tmId)

# Reused for every batch; only the 'examples' entry is replaced.
data = {
    'operation': 'addAlignedLemmatizedSentences',
    'tmId': tmId
}

examples = []
# Explicit counter so the summary also works when there are zero examples
# (the loop variable would be unbound in that case).
examplesAdded = 0
start = time.time()
with open(sourceFile) as sf, open(targetFile) as tf, open(alignmentsFile) as af:
    for lineNumber in range(totalExamples):
        sourceSentence = sf.readline().strip()
        targetSentence = tf.readline().strip()

        # skip two lines of the alignments file, these are lemmatized and we need the raw sentences from the source and target files.
        af.readline()
        af.readline()

        alignmentString = af.readline().strip()

        examples.append([sourceSentence, targetSentence, alignmentString])
        examplesAdded += 1

        if len(examples) >= BUFFER_SIZE:
            data['examples'] = examples
            add_examples(data)
            mark = time.time()
            elapsed = mark - start
            # Guard against a zero interval on coarse clocks.
            speed = examplesAdded / elapsed if elapsed > 0 else 0.0
            print("Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % (examplesAdded, totalExamples, elapsed, speed))
            examples = []

# Send the remaining, partially filled batch.
if len(examples) > 0:
    data['examples'] = examples
    add_examples(data)

end = time.time()
elapsed = end - start
speed = examplesAdded / elapsed if elapsed > 0 else 0.0
print("Added all %d lemmatized sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % (examplesAdded, elapsed, speed))

print("Generating index...")
start = time.time()
data = {
    'operation': 'refreshIndex',
    'tmId': tmId
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
urllib2.urlopen(req, json.dumps(data)).read()

end = time.time()
print("Index regeneration complete. The operation took %.4f s" % (end - start))

View File

@ -1,97 +0,0 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import unittest
import json
import urllib2
import sys
import host
import time
# A batch is uploaded every time this many sentence pairs have been read.
BUFFER_SIZE = 500

# URL used for every request; host and port are read from the local `host`
# module and the port is appended only when it is a non-empty string.
address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address = address + ':' + host.concordia_port
def file_len(fname):
    """Return the line count of the file at ``fname``.

    Args:
        fname: path of the file to count.

    Returns:
        Number of lines (0 for an empty file). The last line is counted
        whether or not it ends with a newline.
    """
    # Pre-set so an empty file returns 0; the loop variable alone would be
    # unbound when the file has no lines.
    count = 0
    with open(fname) as f:
        for count, _ in enumerate(f, 1):
            pass
    return count
def add_data(data):
    """POST ``data`` to the server as JSON.

    The response body is read in full but not inspected.

    Args:
        data: JSON-serializable request payload.
    """
    request = urllib2.Request(address)
    request.add_header('Content-Type', 'application/json')
    body = json.dumps(data)
    reply = urllib2.urlopen(request, body)
    reply.read()
# Script body: create a new translation memory (TM) and fill it with
# sentence pairs taken line by line from two parallel files, uploading in
# batches of BUFFER_SIZE, then refresh the TM's index.
#
# Usage: <script> sourceFile sourceLangId targetFile targetLangId name

# Report missing arguments explicitly instead of failing with an IndexError.
if len(sys.argv) < 6:
    raise Exception("wrong number of arguments, expected: sourceFile sourceLangId targetFile targetLangId name")
sourceFile = sys.argv[1]
sourceLangId = int(sys.argv[2])
targetFile = sys.argv[3]
targetLangId = int(sys.argv[4])
name = sys.argv[5]

# Both files must have the same number of lines, since line N of one is
# paired with line N of the other.
totalLines = file_len(sourceFile)
if file_len(targetFile) != totalLines:
    print("File lengths do not match")
    sys.exit(1)

# Create the TM and keep the id returned by the server.
data = {
    'operation': 'addTm',
    'sourceLangId': sourceLangId,
    'targetLangId': targetLangId,
    'name': name
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
tmId = int(response['newTmId'])
print("Added new tm: %d" % tmId)

# Reused for every batch; only the 'sentences' entry is replaced.
data = {
    'operation': 'addSentences',
    'tmId': tmId
}

sentences = []
start = time.time()
with open(sourceFile) as sourceSentences:
    with open(targetFile) as targetSentences:
        lineNumber = 0
        for sourceSentence in sourceSentences:
            lineNumber += 1
            targetSentence = targetSentences.readline()
            # Strip surrounding whitespace so the stored sentences do not
            # carry the line terminator, as the other loader scripts do.
            sentences.append([sourceSentence.strip(), targetSentence.strip()])
            if lineNumber % BUFFER_SIZE == 0:
                data['sentences'] = sentences
                sentences = []
                add_data(data)
                mark = time.time()
                elapsed = mark - start
                # Guard against a zero interval on coarse clocks.
                speed = lineNumber / elapsed if elapsed > 0 else 0.0
                print("Added %d of %d sentences. Time elapsed: %.4f s, current speed: %.4f sentences/second" % (lineNumber, totalLines, elapsed, speed))

# Upload the final, partially filled batch.
if len(sentences) > 0:
    data['sentences'] = sentences
    add_data(data)

end = time.time()
elapsed = end - start
speed = lineNumber / elapsed if elapsed > 0 else 0.0
print("Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % (lineNumber, elapsed, speed))

print("Generating index...")
start = time.time()
data = {
    'operation': 'refreshIndex',
    'tmId': tmId
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
urllib2.urlopen(req, json.dumps(data)).read()

end = time.time()
print("Index regeneration complete. The operation took %.4f s" % (end - start))

View File

@ -1,4 +0,0 @@
#!/bin/sh
# Run addFile.py on the JRC corpus files jrc_pl.txt and jrc_en.txt.
# NOTE(review): the trailing 1 is presumably the id of the target TM — confirm against addFile.py.
./addFile.py \
    ~/projects/corpora/jrc/jrc_pl.txt \
    ~/projects/corpora/jrc/jrc_en.txt \
    1

View File

@ -1,7 +0,0 @@
#!/bin/sh
# Corpus to load and the language ids passed to the loader.
CORPUS_NAME="stocznia_plen"
SRC_LANG_ID=1
TRG_LANG_ID=2

# The loader receives: TM name, source file, source language id,
# target file, target language id, alignments file.
./addAlignedLemmatizedTM.py "$CORPUS_NAME" \
    "../mgiza-aligner/corpora/$CORPUS_NAME/src_final.txt" "$SRC_LANG_ID" \
    "../mgiza-aligner/corpora/$CORPUS_NAME/trg_final.txt" "$TRG_LANG_ID" \
    "../mgiza-aligner/corpora/$CORPUS_NAME/aligned_final.txt"

View File

@ -1,7 +0,0 @@
#!/bin/sh
# Load one corpus from ../mgiza-aligner/corpora/<CORPUS_NAME>/ as a
# lemmatized translation memory.
#
# Usage: <script> CORPUS_NAME SRC_LANG_ID TRG_LANG_ID
if [ "$#" -lt 3 ]; then
    echo "usage: $0 CORPUS_NAME SRC_LANG_ID TRG_LANG_ID" >&2
    exit 1
fi

CORPUS_NAME=$1
SRC_LANG_ID=$2
TRG_LANG_ID=$3

# All expansions are quoted so that a corpus name containing spaces or glob
# characters is passed through as a single, unmodified argument.
./addAlignedLemmatizedTM.py "$CORPUS_NAME" \
    "../mgiza-aligner/corpora/$CORPUS_NAME/src_final.txt" "$SRC_LANG_ID" \
    "../mgiza-aligner/corpora/$CORPUS_NAME/trg_final.txt" "$TRG_LANG_ID" \
    "../mgiza-aligner/corpora/$CORPUS_NAME/aligned_final.txt"

View File

@ -1,9 +0,0 @@
#!/bin/sh
# Corpus name, the directory holding its files, and the language ids
# handed to the loader.
CORPUS_NAME=error_sample
CORPUS_PATH=/root/opensubtitles_pack/error_sample
SRC_LANG_ID=1
TRG_LANG_ID=2

# Arguments, in order: name, source text, lemmatized source, source
# language id, target text, target language id, alignments, ids file.
./addFastAlignedTM.py "$CORPUS_NAME" \
    "$CORPUS_PATH/src_clean.txt" "$CORPUS_PATH/src_clean.lem" "$SRC_LANG_ID" \
    "$CORPUS_PATH/trg_clean.txt" "$TRG_LANG_ID" \
    "$CORPUS_PATH/alignments.txt" "$CORPUS_PATH/ids_clean.txt"

View File

@ -1,33 +0,0 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import unittest
import json
import urllib2
import sys
import time
import host
# Script body: add a single sentence pair to an existing translation
# memory (TM) and print the server's reply with the round-trip time.
#
# Usage: <script> sourceSentence targetSentence tmId

# URL built from the local `host` module; the port is appended only when
# it is a non-empty string.
address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port

# Report missing arguments explicitly instead of failing with an IndexError.
if len(sys.argv) < 4:
    raise Exception("wrong number of arguments, expected: sourceSentence targetSentence tmId")

data = {
    'operation': 'addSentence',
    'sourceSentence': sys.argv[1],
    'targetSentence': sys.argv[2],
    'tmId': int(sys.argv[3])
}

# Only the request/response round trip is timed.
start = time.time()
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
end = time.time()

print("Execution time: %.4f seconds." % (end - start))
print("Result: ")
print(response)

View File

@ -1,12 +0,0 @@
#!/bin/sh
# Load the stocznia_plen and stocznia_enpl corpora as lemmatized TMs.
# NOTE(review): the addTm.py calls presumably create placeholder TMs so the
# real TMs end up with specific ids — confirm before reordering anything.
./addTm.py 1 2 placeholder 1

./addAlignedLemmatizedTM.py stocznia_plen \
    ../mgiza-aligner/corpora/stocznia_plen/src_final.txt 1 \
    ../mgiza-aligner/corpora/stocznia_plen/trg_final.txt 2 \
    ../mgiza-aligner/corpora/stocznia_plen/aligned_final.txt

./addTm.py 1 2 placeholder 1
./addTm.py 1 2 placeholder 1

./addAlignedLemmatizedTM.py stocznia_enpl \
    ../mgiza-aligner/corpora/stocznia_enpl/src_final.txt 2 \
    ../mgiza-aligner/corpora/stocznia_enpl/trg_final.txt 1 \
    ../mgiza-aligner/corpora/stocznia_enpl/aligned_final.txt

View File

@ -1,8 +0,0 @@
#!/bin/sh
# Batch-load several corpora, one loader call per corpus, in a fixed order.
# NOTE(review): the three arguments are presumably corpus name, source
# language id and target language id — confirm against addLemmatizedTMfromParams.sh.
./addLemmatizedTMfromParams.sh tmrepository_enhr 2 6
# NOTE(review): presumably inserts a placeholder TM so the TMs added below
# receive specific ids — confirm against addTm.py before reordering.
./addTm.py 1 2 placeholder 1
./addLemmatizedTMfromParams.sh icd_dictionary 1 2
./addLemmatizedTMfromParams.sh icd_filtered 1 2
./addLemmatizedTMfromParams.sh emea_plen 1 2
./addLemmatizedTMfromParams.sh jrc_enes 2 4