2019-05-18 00:37:44 +02:00
#!/usr/bin/python3
2019-05-16 18:28:30 +02:00
# -*- coding: utf-8 -*-
import json
2019-05-18 00:37:44 +02:00
import requests
2019-05-16 18:28:30 +02:00
import sys
import host
import time
2019-05-17 14:28:29 +02:00
import codecs
2019-05-16 18:28:30 +02:00
# Number of examples accumulated before they are sent in one request.
BUFFER_SIZE = 500
# Only lines whose index is a multiple of LEAVE_OUT are uploaded.
LEAVE_OUT = 1  # that does not leave out anything

# Server URL assembled from the settings in the local `host` module; the
# port suffix is appended only when a non-empty port is configured.
address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port

# Declares every request body as UTF-8 encoded JSON.
headers = {"content-type": "application/json;charset=UTF-8"}
def file_len(fname):
    """Return the number of lines in the text file *fname*.

    A final line without a trailing newline still counts as a line, and an
    empty file yields 0 (the earlier enumerate-based version raised
    UnboundLocalError in that case because its loop variable was never bound).

    Args:
        fname: path of the file to measure; opened in text mode with the
            default encoding.

    Returns:
        The line count as an int.
    """
    with open(fname) as f:
        # Iterating the file object streams it line by line, so the whole
        # file is never held in memory.
        return sum(1 for _ in f)
def add_examples(examplesData):
    """POST one request payload to the server and raise if it reports an error.

    The payload is serialized with ``ensure_ascii=False`` and encoded as
    UTF-8 explicitly, so non-ASCII text is sent as raw UTF-8 bytes rather
    than as ``\\uXXXX`` escapes.

    Args:
        examplesData: JSON-serializable request body.

    Raises:
        Exception: carrying the reply's 'message' field when the JSON reply
            has 'status' equal to 'error'.
    """
    payload = json.dumps(examplesData, ensure_ascii=False).encode('utf-8')
    response = requests.post(address, data=payload, headers=headers).json()
    if response['status'] == 'error':
        raise Exception(response['message'])
# Command line (8 positional arguments after the script name):
#   name sourceFile lemmatizedSourceFile sourceLangId
#   targetFile targetLangId alignmentsFile sourceIdsFile
if len(sys.argv) != 9:
    raise Exception("wrong number of arguments")

name = sys.argv[1]
sourceFile = sys.argv[2]
lemmatizedSourceFile = sys.argv[3]
sourceLangId = int(sys.argv[4])
targetFile = sys.argv[5]
targetLangId = int(sys.argv[6])
alignmentsFile = sys.argv[7]
sourceIdsFile = sys.argv[8]

# The five input files are consumed in lockstep, one line per example, so
# they must all contain the same number of lines.
sourceFileLength = file_len(sourceFile)
lemmatizedSourceFileLength = file_len(lemmatizedSourceFile)
targetFileLength = file_len(targetFile)
alignmentsFileLength = file_len(alignmentsFile)
sourceIdsFileLength = file_len(sourceIdsFile)

if not (sourceFileLength
        == lemmatizedSourceFileLength
        == targetFileLength
        == alignmentsFileLength
        == sourceIdsFileLength):
    # Report every length so the offending file can be identified.
    print("File lengths:")
    print("source file: %d\nlemmatized source file: %d\ntarget file: %d\nalignments file: %d\nsource ids file: %d" % (sourceFileLength, lemmatizedSourceFileLength, targetFileLength, alignmentsFileLength, sourceIdsFileLength))
    raise Exception("files are not of the same length!")
# Number of line indices in range(sourceFileLength) that are multiples of
# LEAVE_OUT, i.e. ceil(sourceFileLength / LEAVE_OUT).  Integer arithmetic
# avoids the float produced by `/` in Python 3, which `%d` would truncate
# and thereby under-report the total.
totalExamples = (sourceFileLength + LEAVE_OUT - 1) // LEAVE_OUT

# Create the translation memory that will receive the examples.
data = {
    'operation': 'addTm',
    'sourceLangId': sourceLangId,
    'targetLangId': targetLangId,
    'name': name,
    'tmLemmatized': True
}

response = requests.post(address, json=data, headers=headers).json()
print(response)
# Surface a server-side failure with its message instead of a bare KeyError
# on 'newTmId' below (same convention as add_examples).
if response.get('status') == 'error':
    raise Exception(response.get('message', 'addTm request failed'))

tmId = int(response['newTmId'])
print("Added new tm: %d" % tmId)
# Request skeleton reused for every batch; only 'examples' changes.
data = {
    'operation': 'addSentences',
    'tmId': tmId
}

examples = []
start = time.time()
# The three text files are decoded as UTF-8 with undecodable bytes replaced;
# the alignments and ids files are opened with the default text settings.
with codecs.open(sourceFile, "r", "utf-8", errors='replace') as source_file, \
        codecs.open(lemmatizedSourceFile, "r", "utf-8", errors='replace') as lemmatized_source_file, \
        codecs.open(targetFile, "r", "utf-8", errors='replace') as target_file, \
        open(alignmentsFile) as alignments_file, \
        open(sourceIdsFile) as source_ids_file:
    addedCount = 0
    for lineNumber in range(sourceFileLength):
        # Consume one line from every file on every iteration so the files
        # stay aligned even when LEAVE_OUT > 1 skips some of them.
        sourceLine = source_file.readline()
        lemmatizedSourceLine = lemmatized_source_file.readline()
        targetLine = target_file.readline()
        alignmentLine = alignments_file.readline()
        sourceIdLine = source_ids_file.readline()

        if lineNumber % LEAVE_OUT != 0:
            continue

        sourceSentence = sourceLine.strip()
        lemmatizedSourceSentence = lemmatizedSourceLine.strip()
        targetSentence = targetLine.strip()
        alignment = json.loads(alignmentLine.strip())
        sourceId = int(sourceIdLine.strip())

        examples.append([sourceSentence, lemmatizedSourceSentence, targetSentence, alignment, sourceId])
        addedCount += 1

        # Flush a full buffer to the server and report progress.
        if len(examples) >= BUFFER_SIZE:
            data['examples'] = examples
            add_examples(data)
            mark = time.time()
            elapsed = mark - start
            # Guard against a zero reading from a coarse clock.
            speed = addedCount / elapsed if elapsed > 0 else 0.0
            print("Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % (addedCount, totalExamples, elapsed, speed))
            examples = []

# Send whatever is left in the buffer after the last full batch.
if len(examples) > 0:
    data['examples'] = examples
    add_examples(data)

end = time.time()
elapsed = end - start
speed = addedCount / elapsed if elapsed > 0 else 0.0
print("Added all %d lemmatized sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % (addedCount, elapsed, speed))
# Ask the server to rebuild the index of the translation memory that was
# just populated, and time the request.
print("Generating index...")
start = time.time()
data = {
    'operation': 'refreshIndex',
    'tmId': tmId
}
refreshResponse = requests.post(address, json=data, headers=headers)
# Do not report success when the request failed at the HTTP level.
refreshResponse.raise_for_status()
end = time.time()
print("Index regeneration complete. The operation took %.4f s" % (end - start))