#!/usr/bin/python
# -*- coding: utf-8 -*-
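# Adds a lemmatized translation memory (TM) to a running Concordia server:
# the script first registers the TM ('addTm'), then uploads the parallel
# sentences together with their lemmatized source side, word alignments and
# source ids in batches ('addSentences'), and finally triggers index
# regeneration ('refreshIndex'). The server location is read from the local
# host module (host.concordia_host and host.concordia_port).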
import json
import urllib2
import sys
import host
import time

BUFFER_SIZE = 500
LEAVE_OUT = 1  # 1 means every sentence is used (nothing is left out)

address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port
def file_len(fname):
    # Count the lines in a file (returns 0 for an empty file).
    i = -1
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1
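# Note: file_len scans each file once; it is used to sanity-check that all
# five input files are parallel and to drive the main loop below.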
def add_examples(examplesData):
    # POST a batch of examples to the Concordia server as JSON and fail
    # loudly if the server reports an error.
    req = urllib2.Request(address)
    req.add_header('Content-Type', 'application/json')
    response = json.loads(urllib2.urlopen(req, json.dumps(examplesData), timeout=3600).read())
    print(response)
    if response['status'] == 'error':
        raise Exception(response['message'])
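# For illustration, an 'addSentences' request built by this script has the
# following shape (tmId and the example values are made up):
# {
#     "operation": "addSentences",
#     "tmId": 7,
#     "examples": [
#         [sourceSentence, lemmatizedSourceSentence, targetSentence, alignment, sourceId],
#         ...
#     ]
# }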
if len(sys.argv) != 9:
    raise Exception("wrong number of arguments, expected: name sourceFile lemmatizedSourceFile sourceLangId targetFile targetLangId alignmentsFile sourceIdsFile")

name = sys.argv[1]
sourceFile = sys.argv[2]
lemmatizedSourceFile = sys.argv[3]
sourceLangId = int(sys.argv[4])
targetFile = sys.argv[5]
targetLangId = int(sys.argv[6])
alignmentsFile = sys.argv[7]
sourceIdsFile = sys.argv[8]
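# A hypothetical invocation (script and file names are purely illustrative):
#   ./add_lemmatized_tm.py my_tm source.txt source.lem 1 target.txt 2 alignments.txt source_ids.txt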
sourceFileLength = file_len(sourceFile)
lemmatizedSourceFileLength = file_len(lemmatizedSourceFile)
targetFileLength = file_len(targetFile)
alignmentsFileLength = file_len(alignmentsFile)
sourceIdsFileLength = file_len(sourceIdsFile)

if not (sourceFileLength == lemmatizedSourceFileLength == targetFileLength == alignmentsFileLength == sourceIdsFileLength):
    print("File lengths:")
    print("source file: %d\nlemmatized source file: %d\ntarget file: %d\nalignments file: %d\nsource ids file: %d" % (sourceFileLength, lemmatizedSourceFileLength, targetFileLength, alignmentsFileLength, sourceIdsFileLength))
    raise Exception("files are not of the same length!")

totalExamples = sourceFileLength / LEAVE_OUT
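# Step 1: register the new lemmatized TM on the server; the response carries
# the id under which all sentences will be stored.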
data = {
    'operation': 'addTm',
    'sourceLangId': sourceLangId,
    'targetLangId': targetLangId,
    'name': name,
    'tmLemmatized': True
}

req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data), timeout=3600).read())
print(response)

tmId = int(response['newTmId'])
print("Added new tm: %d" % tmId)
data = {
    'operation': 'addSentences',
    'tmId': tmId
}

examples = []
start = time.time()

with open(sourceFile) as source_file, open(lemmatizedSourceFile) as lemmatized_source_file, open(targetFile) as target_file, open(alignmentsFile) as alignments_file, open(sourceIdsFile) as source_ids_file:
    addedCount = 0
    for lineNumber in range(sourceFileLength):
        if lineNumber % LEAVE_OUT == 0:
            sourceSentence = source_file.readline().strip()
            lemmatizedSourceSentence = lemmatized_source_file.readline().strip()
            targetSentence = target_file.readline().strip()
            alignment = json.loads(alignments_file.readline().strip())
            sourceId = int(source_ids_file.readline().strip())
            examples.append([sourceSentence, lemmatizedSourceSentence, targetSentence, alignment, sourceId])
            addedCount += 1
            # Send a full batch and report progress.
            if len(examples) >= BUFFER_SIZE:
                data['examples'] = examples
                add_examples(data)
                mark = time.time()
                print("Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % (addedCount, totalExamples, mark - start, addedCount / (mark - start)))
                examples = []

# Flush the last, possibly partial batch.
if len(examples) > 0:
    data['examples'] = examples
    add_examples(data)
end = time.time()
print("Added all %d lemmatized sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % (addedCount, end - start, addedCount / (end - start)))
print("Generating index...")
start = time.time()

data = {
    'operation': 'refreshIndex',
    'tmId': tmId
}

req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
urllib2.urlopen(req, json.dumps(data), timeout=3600).read()

end = time.time()
print("Index regeneration complete. The operation took %.4f s" % (end - start))