#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import urllib2
import sys
import host
import time
import codecs
# How many sentence examples are sent to the server in one request.
BUFFER_SIZE = 500
# Keep every LEAVE_OUT-th sentence; 1 means nothing is left out.
LEAVE_OUT = 1

# Base URL of the Concordia service, taken from the local host config module.
address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address = address + ':' + host.concordia_port
def file_len(fname):
    """Return the number of lines in the file *fname*.

    Returns 0 for an empty file (the original implementation raised
    NameError in that case, because the loop variable was never bound).
    """
    count = 0
    with open(fname) as f:
        # enumerate with start=1 makes `count` equal the line number of
        # the last line read; it stays 0 when the file has no lines.
        for count, _ in enumerate(f, 1):
            pass
    return count
def add_examples(examplesData):
    """POST one batch of examples to the Concordia server.

    Serializes *examplesData* as UTF-8 JSON, sends it to the module-level
    ``address`` and raises if the server reports an error status.
    """
    request = urllib2.Request(address)
    request.add_header('Content-Type', 'application/json; charset=utf-8')
    payload = json.dumps(examplesData, ensure_ascii=False).encode('utf-8', 'ignore')
    raw = urllib2.urlopen(request, payload, timeout=3600).read()
    response = json.loads(raw)
    print(response)
    if response['status'] == 'error':
        raise Exception(response['message'])
# ---- command-line arguments -------------------------------------------------
# Usage: <name> <source> <lemmatized source> <source lang id>
#        <target> <target lang id> <alignments> <source ids>
if len(sys.argv) != 9:
    raise Exception("wrong number of arguments")

name = sys.argv[1]
sourceFile = sys.argv[2]
lemmatizedSourceFile = sys.argv[3]
sourceLangId = int(sys.argv[4])
targetFile = sys.argv[5]
targetLangId = int(sys.argv[6])
alignmentsFile = sys.argv[7]
sourceIdsFile = sys.argv[8]

# All five parallel files must have the same number of lines,
# otherwise the line-by-line pairing below would be meaningless.
sourceFileLength = file_len(sourceFile)
lemmatizedSourceFileLength = file_len(lemmatizedSourceFile)
targetFileLength = file_len(targetFile)
alignmentsFileLength = file_len(alignmentsFile)
sourceIdsFileLength = file_len(sourceIdsFile)

allLengths = [sourceFileLength, lemmatizedSourceFileLength, targetFileLength,
              alignmentsFileLength, sourceIdsFileLength]
if len(set(allLengths)) != 1:
    print("File lengths:")
    print("source file: %d\nlemmatized source file: %d\ntarget file: %d\nalignments file: %d\nsource ids file: %d" % (sourceFileLength, lemmatizedSourceFileLength, targetFileLength, alignmentsFileLength, sourceIdsFileLength))
    raise Exception("files are not of the same length!")

totalExamples = sourceFileLength / LEAVE_OUT
# ---- create the new translation memory on the server ------------------------
data = {
    'operation': 'addTm',
    'sourceLangId': sourceLangId,
    'targetLangId': targetLangId,
    'name': name,
    'tmLemmatized': True,
}
request = urllib2.Request(address)
request.add_header('Content-Type', 'application/json')
rawResponse = urllib2.urlopen(request, json.dumps(data), timeout=3600).read()
response = json.loads(rawResponse)
print(response)

# The server returns the id of the freshly created TM; everything that
# follows is attached to this id.
tmId = int(response['newTmId'])
print("Added new tm: %d" % tmId)
2019-05-16 18:28:30 +02:00
# ---- stream the sentence examples to the server in batches ------------------
data = {
    'operation': 'addSentences',
    'tmId': tmId
}
examples = []
start = time.time()

with codecs.open(sourceFile, "r", "utf-8") as source_file, \
        codecs.open(lemmatizedSourceFile, "r", "utf-8") as lemmatized_source_file, \
        codecs.open(targetFile, "r", "utf-8") as target_file, \
        open(alignmentsFile) as alignments_file, \
        open(sourceIdsFile) as source_ids_file:
    addedCount = 0
    for lineNumber in range(sourceFileLength):
        # Consume one line from EVERY file on EVERY iteration so the five
        # parallel files stay in sync. (The original only read lines inside
        # the selection branch, so with LEAVE_OUT > 1 the file pointers
        # never advanced past skipped sentences and the "sampling" would
        # silently return consecutive lines. With LEAVE_OUT == 1 behavior
        # is unchanged.)
        sourceSentence = source_file.readline().strip().encode('utf-8')
        lemmatizedSourceSentence = lemmatized_source_file.readline().strip().encode('utf-8')
        targetSentence = target_file.readline().strip().encode('utf-8')
        alignmentLine = alignments_file.readline().strip()
        sourceIdLine = source_ids_file.readline().strip()

        if lineNumber % LEAVE_OUT != 0:
            continue

        alignment = json.loads(alignmentLine)
        sourceId = int(sourceIdLine)

        examples.append([sourceSentence, lemmatizedSourceSentence, targetSentence, alignment, sourceId])
        addedCount += 1

        # Flush a full batch to the server and report progress.
        if len(examples) >= BUFFER_SIZE:
            data['examples'] = examples
            add_examples(data)
            mark = time.time()
            print("Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % (addedCount, totalExamples, mark - start, addedCount / (mark - start)))
            examples = []

# Flush the final, possibly partial, batch.
if len(examples) > 0:
    data['examples'] = examples
    add_examples(data)

end = time.time()
print("Added all %d lemmatized sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % (addedCount, end - start, addedCount / (end - start)))
2019-05-16 18:28:30 +02:00
# ---- trigger server-side index regeneration for the new TM ------------------
print("Generating index...")
start = time.time()
data = {
    'operation': 'refreshIndex',
    'tmId': tmId,
}
request = urllib2.Request(address)
request.add_header('Content-Type', 'application/json')
urllib2.urlopen(request, json.dumps(data), timeout=3600).read()
end = time.time()
print("Index regeneration complete. The operation took %.4f s" % (end - start))