#!/usr/bin/python
# -*- coding: utf-8 -*-
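"""Load a lemmatized translation memory (TM) into a running Concordia server.

The script expects eight positional arguments (see the sys.argv parsing
below): TM name, source file, lemmatized source file, source language id,
target file, target language id, alignments file and source ids file.
It creates the TM over the server's JSON API (addTm), streams the parallel
sentences in batches (addSentences) and finally triggers index
regeneration (refreshIndex).
"""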
import json
import urllib2
import sys
import host
import time
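
# number of examples sent to the server in a single addSentences request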
BUFFER_SIZE = 500
LEAVE_OUT = 2  # add only every LEAVE_OUT-th sentence (2 = leave out every second sentence)

address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port


def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1
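

# send one batch of examples to the server and fail loudly on an error response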
def add_examples(examplesData):
    req = urllib2.Request(address)
    req.add_header('Content-Type', 'application/json')
    response = json.loads(urllib2.urlopen(req, json.dumps(examplesData), timeout=3600).read())
    print(response)
    if response['status'] == 'error':
        raise Exception(response['message'])


if len(sys.argv) != 9:
    raise Exception("wrong number of arguments")

name = sys.argv[1]
sourceFile = sys.argv[2]
lemmatizedSourceFile = sys.argv[3]
sourceLangId = int(sys.argv[4])
targetFile = sys.argv[5]
targetLangId = int(sys.argv[6])
alignmentsFile = sys.argv[7]
sourceIdsFile = sys.argv[8]

sourceFileLength = file_len(sourceFile)
lemmatizedSourceFileLength = file_len(lemmatizedSourceFile)
targetFileLength = file_len(targetFile)
alignmentsFileLength = file_len(alignmentsFile)
sourceIdsFileLength = file_len(sourceIdsFile)

if not (sourceFileLength == lemmatizedSourceFileLength == targetFileLength == alignmentsFileLength == sourceIdsFileLength):
    print("File lengths:")
    print("source file: %d\nlemmatized source file: %d\ntarget file: %d\nalignments file: %d\nsource ids file: %d" % (sourceFileLength, lemmatizedSourceFileLength, targetFileLength, alignmentsFileLength, sourceIdsFileLength))
    raise Exception("files are not of the same length!")
totalExamples = sourceFileLength / LEAVE_OUT
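
# create the new, lemmatized translation memory on the server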
data = {
    'operation': 'addTm',
    'sourceLangId': sourceLangId,
    'targetLangId': targetLangId,
    'name': name,
    'tmLemmatized': True
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data), timeout=3600).read())
print(response)
tmId = int(response['newTmId'])
print "Added new tm: %d" % tmId
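
# stream the parallel sentences to the new TM in batches of BUFFER_SIZE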
data = {
    'operation': 'addSentences',
    'tmId': tmId
}

examples = []
start = time.time()

with open(sourceFile) as source_file, open(lemmatizedSourceFile) as lemmatized_source_file, open(targetFile) as target_file, open(alignmentsFile) as alignments_file, open(sourceIdsFile) as source_ids_file:
    addedCount = 0
    for lineNumber in range(sourceFileLength):
        # read one line from each file so the five files stay in sync,
        # but only add every LEAVE_OUT-th sentence to the examples buffer
        sourceSentence = source_file.readline().strip()
        lemmatizedSourceSentence = lemmatized_source_file.readline().strip()
        targetSentence = target_file.readline().strip()
        alignment = json.loads(alignments_file.readline().strip())
        sourceId = int(source_ids_file.readline().strip())
        if lineNumber % LEAVE_OUT == 0:
            examples.append([sourceSentence, lemmatizedSourceSentence, targetSentence, alignment, sourceId])
            addedCount += 1
            # send a full buffer to the server and report progress
            if len(examples) >= BUFFER_SIZE:
                data['examples'] = examples
                add_examples(data)
                mark = time.time()
                print "Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % (addedCount, totalExamples, mark - start, addedCount / (mark - start))
                examples = []

    # flush any examples left over after the last full buffer
    if len(examples) > 0:
        data['examples'] = examples
        add_examples(data)

end = time.time()
print "Added all %d lemmatized sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % (addedCount, end - start, addedCount / (end - start))
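
# ask the server to regenerate the index for the newly added sentences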
print "Generating index..."
start = time.time()
data = {
    'operation': 'refreshIndex',
    'tmId': tmId
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
urllib2.urlopen(req, json.dumps(data), timeout=3600).read()
end = time.time()
print "Index regeneration complete. The operation took %.4f s" % (end - start)