storing and retrieving alignments

This commit is contained in:
rjawor 2015-12-29 22:13:21 +01:00
parent 67f485cbc2
commit 05d99168ed
17 changed files with 395 additions and 63 deletions

View File

@ -99,6 +99,20 @@ if(WITH_PCRE)
set(HAVE_PCRE 1) set(HAVE_PCRE 1)
endif(WITH_PCRE) endif(WITH_PCRE)
# ----------------------------------------------------
# ICU (International Components for Unicode)
# ----------------------------------------------------
find_library(ICU_LIB NAMES icui18n)
find_path(ICU_INCLUDE unicode)
if(EXISTS ${ICU_LIB} AND EXISTS ${ICU_INCLUDE})
message(STATUS "Found ICU: ${ICU_LIB}")
include_directories(${ICU_INCLUDE})
link_directories(${ICU_LIB})
else()
message(FATAL_ERROR "ICU not found")
endif(EXISTS ${ICU_LIB} AND EXISTS ${ICU_INCLUDE})
# ---------------------------------------------------- # ----------------------------------------------------
# Boost # Boost
# ---------------------------------------------------- # ----------------------------------------------------

View File

@ -1,5 +1,5 @@
#!/bin/sh #!/bin/sh
rm -rf index/* sudo rm -rf index/*
cd db cd db
./recreateDb.sh ./recreateDb.sh

View File

@ -16,5 +16,5 @@ add_executable(concordia_server_process
tm_dao.cpp tm_dao.cpp
aligned_unit.cpp aligned_unit.cpp
) )
target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case) target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case icuuc)

View File

@ -1,9 +1,15 @@
#include "aligned_unit.hpp" #include "aligned_unit.hpp"
AlignedUnit::AlignedUnit() { AlignedUnit::AlignedUnit(const TokenizedSentence & sourceSentence,
const TokenizedSentence & targetSentence,
std::vector<std::vector<int> > alignments):
_sourceSentence(sourceSentence),
_targetSentence(targetSentence),
_alignments(alignments) {
} }
AlignedUnit::~AlignedUnit() { AlignedUnit::~AlignedUnit() {
} }

View File

@ -10,19 +10,29 @@ class AlignedUnit {
public: public:
/*! Constructor. /*! Constructor.
*/ */
AlignedUnit(); AlignedUnit(const TokenizedSentence & sourceSentence,
const TokenizedSentence & targetSentence,
std::vector<std::vector<int> > alignments);
/*! Destructor. /*! Destructor.
*/ */
virtual ~AlignedUnit(); virtual ~AlignedUnit();
boost::shared_ptr<TokenizedSentence> getSourceSentence() { TokenizedSentence getSourceSentence() const {
return _sourceSentence; return _sourceSentence;
} }
private: TokenizedSentence getTargetSentence() const {
boost::shared_ptr<TokenizedSentence> _sourceSentence; return _targetSentence;
}
boost::shared_ptr<TokenizedSentence> _targetSentence; std::vector<std::vector<int> > getAlignments() const {
return _alignments;
}
private:
TokenizedSentence _sourceSentence;
TokenizedSentence _targetSentence;
std::vector<std::vector<int> > _alignments; std::vector<std::vector<int> > _alignments;
}; };

View File

@ -58,6 +58,7 @@ int main(int argc, char** argv) {
std::streambuf * cout_streambuf = std::cout.rdbuf(); std::streambuf * cout_streambuf = std::cout.rdbuf();
std::streambuf * cerr_streambuf = std::cerr.rdbuf(); std::streambuf * cerr_streambuf = std::cerr.rdbuf();
try {
ConcordiaServer concordiaServer(CONFIG_FILE_PATH); ConcordiaServer concordiaServer(CONFIG_FILE_PATH);
Logger::log("Concordia server initiated successfully, waiting for requests"); Logger::log("Concordia server initiated successfully, waiting for requests");
@ -88,7 +89,12 @@ int main(int argc, char** argv) {
std::cout.rdbuf(cout_streambuf); std::cout.rdbuf(cout_streambuf);
std::cerr.rdbuf(cerr_streambuf); std::cerr.rdbuf(cerr_streambuf);
Logger::log("Shutting down Concordia server process"); Logger::log("Gracefully shutting down Concordia server process");
} catch (ConcordiaException & e) {
std::stringstream errorstream;
errorstream << "FATAL CONCORDIA ERROR: " << e.what()<< " - shutting down";
Logger::log(errorstream.str());
}
return 0; return 0;
} }

View File

@ -2,6 +2,15 @@
#include <concordia/common/config.hpp> #include <concordia/common/config.hpp>
#include <iostream>
#include <sstream>
#include <algorithm>
#include <iterator>
#include <boost/regex.hpp>
#include <boost/regex/icu.hpp>
#include <unicode/unistr.h>
#include "json_generator.hpp" #include "json_generator.hpp"
#include "logger.hpp" #include "logger.hpp"
@ -23,10 +32,10 @@ void IndexController::addSentence(
try { try {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId); boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) { if (it != _concordiasMap->end()) {
TokenizedSentence tokenizedSentence = (*_concordiasMap)[tmId].tokenize(sourceSentence); TokenizedSentence tokenizedSentence = it->second->tokenize(sourceSentence);
int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId); int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId);
(*_concordiasMap)[tmId].addTokenizedExample(tokenizedSentence, sentenceId); it->second->addTokenizedExample(tokenizedSentence, sentenceId);
(*_concordiasMap)[tmId].refreshSAfromRAM(); it->second->refreshSAfromRAM();
jsonWriter.StartObject(); jsonWriter.StartObject();
jsonWriter.String("status"); jsonWriter.String("status");
@ -58,9 +67,9 @@ void IndexController::addSentences(
try { try {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId); boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) { if (it != _concordiasMap->end()) {
std::vector<TokenizedSentence> tokenizedSentences = (*_concordiasMap)[tmId].tokenizeAll(sourceSentences); std::vector<TokenizedSentence> tokenizedSentences = it->second->tokenizeAll(sourceSentences);
std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addSentences(tokenizedSentences, targetSentences, tmId); std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addSentences(tokenizedSentences, targetSentences, tmId);
(*_concordiasMap)[tmId].addAllTokenizedExamples(tokenizedSentences, sentenceIds); it->second->addAllTokenizedExamples(tokenizedSentences, sentenceIds);
jsonWriter.StartObject(); jsonWriter.StartObject();
jsonWriter.String("status"); jsonWriter.String("status");
@ -84,11 +93,11 @@ void IndexController::addAlignedSentences(
try { try {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId); boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) { if (it != _concordiasMap->end()) {
std::vector<AlignedUnit> alignedUnits = _getAlignedUnits(sourceSentences, targetSentences); std::vector<AlignedUnit> alignedUnits = _getAlignedUnits(sourceSentences, targetSentences, tmId);
std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedUnits(alignedUnits, tmId); std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedUnits(alignedUnits, tmId);
int index = 0; int index = 0;
for(std::vector<AlignedUnit>::iterator it = alignedUnits.begin(); it != alignedUnits.end(); ++it) { for(std::vector<AlignedUnit>::iterator ait = alignedUnits.begin(); ait != alignedUnits.end(); ++ait) {
(*_concordiasMap)[tmId].addTokenizedExample(*(it->getSourceSentence()), sentenceIds.at(index)); it->second->addTokenizedExample(ait->getSourceSentence(), sentenceIds.at(index));
index++; index++;
} }
@ -111,7 +120,7 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuf
try { try {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId); boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) { if (it != _concordiasMap->end()) {
(*_concordiasMap)[tmId].refreshSAfromRAM(); it->second->refreshSAfromRAM();
jsonWriter.StartObject(); jsonWriter.StartObject();
jsonWriter.String("status"); jsonWriter.String("status");
@ -129,13 +138,66 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuf
} }
std::vector<AlignedUnit> IndexController::_getAlignedUnits(const std::vector<std::string> & sourceSentences, std::vector<AlignedUnit> IndexController::_getAlignedUnits(const std::vector<std::string> & sourceSentences,
const std::vector<std::string> & targetSentences) { const std::vector<std::string> & targetSentences,
//TODO const int tmId) {
std::vector<AlignedUnit> result; std::vector<AlignedUnit> result;
for (int i = 0; i<sourceSentences.size(); i++) {
std::string sourceSentence = sourceSentences[i];
std::string targetSentence = targetSentences[i];
std::string rawSourceSentence;
std::vector<TokenAnnotation> sourceTokens;
std::vector<std::vector<int> > alignments;
UnicodeString s(sourceSentence.c_str());
boost::u32regex_iterator<const UChar*> begin(
boost::make_u32regex_iterator(
s,
boost::make_u32regex(UnicodeString("(\\S+) \\(\\{(( \\d+)*) \\}\\)"), boost::regex::icase)
)
);
boost::u32regex_iterator<const UChar*> end;
for (; begin != end; ++begin) {
UnicodeString tokenUTF8((*begin)[1].first, (*begin).length(1));
std::string token;
tokenUTF8.toUTF8String(token);
if (token != "NULL") {
std::string numbers((*begin)[2].first, (*begin)[2].second);
std::istringstream iss(numbers);
std::vector<std::string> numberStrings;
std::copy(std::istream_iterator<std::string>(iss),
std::istream_iterator<std::string>(),
std::back_inserter(numberStrings));
std::vector<int> tokenAlignments;
for (int j=0;j<numberStrings.size();j++) {
int n = atoi(numberStrings[j].c_str()) - 1; //subtracting 1 as we want alignments to be 0-based
tokenAlignments.push_back(n);
}
alignments.push_back(tokenAlignments);
rawSourceSentence += token + " ";
}
}
rawSourceSentence = _trim(rawSourceSentence);
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
TokenizedSentence sourceTS = it->second->tokenize(rawSourceSentence, true);
TokenizedSentence targetTS = it->second->tokenize(targetSentence, true);
result.push_back(AlignedUnit(sourceTS, targetTS, alignments));
}
}
return result; return result;
} }
// Strips leading and trailing space characters (' ' only) from str and
// returns the trimmed copy; str itself is not modified.
std::string IndexController::_trim(std::string & str) {
    size_t first = str.find_first_not_of(' ');
    // Guard: for an empty or all-space string find_first_not_of returns
    // npos, and substr(npos, ...) would throw std::out_of_range.
    if (first == std::string::npos) {
        return "";
    }
    size_t last = str.find_last_not_of(' ');
    return str.substr(first, (last - first + 1));
}

View File

@ -6,6 +6,7 @@
#include <concordia/concordia.hpp> #include <concordia/concordia.hpp>
#include <concordia/concordia_exception.hpp> #include <concordia/concordia_exception.hpp>
#include <boost/ptr_container/ptr_map.hpp> #include <boost/ptr_container/ptr_map.hpp>
#include <concordia/token_annotation.hpp>
#include "unit_dao.hpp" #include "unit_dao.hpp"
@ -43,7 +44,10 @@ public:
private: private:
std::vector<AlignedUnit> _getAlignedUnits(const std::vector<std::string> & sourceSentences, std::vector<AlignedUnit> _getAlignedUnits(const std::vector<std::string> & sourceSentences,
const std::vector<std::string> & targetSentences); const std::vector<std::string> & targetSentences,
const int tmId);
std::string _trim(std::string & str);
boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap; boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;

View File

@ -1,5 +1,6 @@
#include "json_generator.hpp" #include "json_generator.hpp"
#include <boost/foreach.hpp>
JsonGenerator::JsonGenerator() { JsonGenerator::JsonGenerator() {
} }
@ -35,6 +36,18 @@ void JsonGenerator::writeSearchResult(rapidjson::Writer<rapidjson::StringBuffer>
jsonWriter.String(result.getSourceSegment().c_str()); jsonWriter.String(result.getSourceSegment().c_str());
jsonWriter.String("targetSegment"); jsonWriter.String("targetSegment");
jsonWriter.String(result.getTargetSegment().c_str()); jsonWriter.String(result.getTargetSegment().c_str());
jsonWriter.String("targetFragments");
jsonWriter.StartArray();
for (std::vector<std::pair<int,int> >::const_iterator it = result.getTargetFragments().begin();
it != result.getTargetFragments().end(); it++) {
jsonWriter.StartArray();
jsonWriter.Int(it->first);
jsonWriter.Int(it->second);
jsonWriter.EndArray();
}
jsonWriter.EndArray();
jsonWriter.EndObject(); jsonWriter.EndObject();
} }

View File

@ -19,7 +19,7 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer>
const int tmId) { const int tmId) {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId); boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) { if (it != _concordiasMap->end()) {
std::vector<SimpleSearchResult> results = _unitDAO.getSearchResults((*_concordiasMap)[tmId].simpleSearch(pattern)); std::vector<SimpleSearchResult> results = _unitDAO.getSearchResults(it->second->simpleSearch(pattern));
jsonWriter.StartObject(); jsonWriter.StartObject();
jsonWriter.String("status"); jsonWriter.String("status");
@ -42,7 +42,7 @@ void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuff
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId); boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) { if (it != _concordiasMap->end()) {
CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult((*_concordiasMap)[tmId].concordiaSearch(pattern)); CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult(it->second->concordiaSearch(pattern));
jsonWriter.StartObject(); jsonWriter.StartObject();
jsonWriter.String("status"); jsonWriter.String("status");

View File

@ -20,3 +20,8 @@ SimpleSearchResult::SimpleSearchResult(
SimpleSearchResult::~SimpleSearchResult() { SimpleSearchResult::~SimpleSearchResult() {
} }
void SimpleSearchResult::addMatchedTargetFragment(const std::pair<int,int> & targetFragment) {
_targetFragments.push_back(targetFragment);
}

View File

@ -2,6 +2,7 @@
#define SIMPLE_SEARCH_RESULT_HDR #define SIMPLE_SEARCH_RESULT_HDR
#include <string> #include <string>
#include <vector>
class SimpleSearchResult { class SimpleSearchResult {
public: public:
@ -47,6 +48,12 @@ public:
return _targetSegment; return _targetSegment;
} }
const std::vector<std::pair<int,int> > & getTargetFragments() const {
return _targetFragments;
}
void addMatchedTargetFragment(const std::pair<int,int> & targetFragment);
private: private:
int _id; int _id;
@ -61,6 +68,8 @@ private:
std::string _sourceSegment; std::string _sourceSegment;
std::string _targetSegment; std::string _targetSegment;
std::vector<std::pair<int,int> > _targetFragments;
}; };
#endif #endif

View File

@ -1,5 +1,7 @@
#include "unit_dao.hpp" #include "unit_dao.hpp"
#include<sstream>
#include "query_param.hpp" #include "query_param.hpp"
#include "string_param.hpp" #include "string_param.hpp"
#include "int_param.hpp" #include "int_param.hpp"
@ -48,7 +50,17 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedUnits(
const std::vector<AlignedUnit> & alignedUnits, const std::vector<AlignedUnit> & alignedUnits,
const int tmId) { const int tmId) {
//TODO //TODO
DBconnection connection;
std::vector<SUFFIX_MARKER_TYPE> newIds; std::vector<SUFFIX_MARKER_TYPE> newIds;
connection.startTransaction();
BOOST_FOREACH(const AlignedUnit & alignedUnit, alignedUnits) {
newIds.push_back(_addAlignedUnit(connection, alignedUnit, tmId));
}
connection.endTransaction();
return newIds; return newIds;
} }
@ -84,23 +96,66 @@ void UnitDAO::_getResultsFromFragments(
matchedPatternEnd = tokenizedPattern.getTokens().at(fragment.getStart()+fragment.getMatchedLength() - 1).getEnd(); matchedPatternEnd = tokenizedPattern.getTokens().at(fragment.getStart()+fragment.getMatchedLength() - 1).getEnd();
} }
std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer] FROM unit WHERE id = $3::integer;"; std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer] FROM unit WHERE id = $3::integer;";
std::vector<QueryParam*> params; std::vector<QueryParam*> params;
params.push_back(new IntParam(2*fragment.getExampleOffset()+1)); params.push_back(new IntParam(2*fragment.getExampleOffset()+1));
params.push_back(new IntParam(2*(fragment.getExampleOffset()+fragment.getMatchedLength()))); params.push_back(new IntParam(2*(fragment.getExampleOffset()+fragment.getMatchedLength())));
params.push_back(new IntParam(fragment.getExampleId())); params.push_back(new IntParam(fragment.getExampleId()));
PGresult * result = connection.execute(query, params); PGresult * result = connection.execute(query, params);
results.push_back(SimpleSearchResult(connection.getIntValue(result,0,0), // example id SimpleSearchResult ssResult(connection.getIntValue(result,0,0), // example id
matchedPatternStart, matchedPatternStart,
matchedPatternEnd, matchedPatternEnd,
connection.getIntValue(result,0,3), // matched example start connection.getIntValue(result,0,3), // matched example start
connection.getIntValue(result,0,4), // matched example end connection.getIntValue(result,0,4), // matched example end
connection.getStringValue(result,0,1), // source segment connection.getStringValue(result,0,1), // source segment
connection.getStringValue(result,0,2))); // target segment connection.getStringValue(result,0,2)); // target segment
connection.clearResult(result); connection.clearResult(result);
BOOST_FOREACH (QueryParam * param, params) { BOOST_FOREACH (QueryParam * param, params) {
delete param; delete param;
} }
// Now add all target fragments matched with this fragment
std::string targetQuery = "SELECT target_token_pos, target_tokens[2*target_token_pos+1], target_tokens[2*target_token_pos+2] FROM unit INNER JOIN alignment ON alignment.unit_id = unit.id AND unit.id = $1::integer AND source_token_pos between $2::integer and $3::integer ORDER BY target_token_pos";
std::vector<QueryParam*> targetParams;
targetParams.push_back(new IntParam(fragment.getExampleId()));
targetParams.push_back(new IntParam(fragment.getExampleOffset()));
targetParams.push_back(new IntParam(fragment.getExampleOffset() + fragment.getMatchedLength() - 1));
PGresult * targetResult = connection.execute(targetQuery, targetParams);
int prevPos = -2;
int currStart = -1;
int currEnd = -1;
for (int i=0;i<connection.getRowCount(targetResult);i++) {
int targetPos = connection.getIntValue(targetResult, i, 0);
int targetStart = connection.getIntValue(targetResult, i, 1);
int targetEnd = connection.getIntValue(targetResult, i, 2);
if (prevPos < targetPos - 1) { // beginning of detached fragment
// check if there is a fragment to end
if (currStart >= 0) {
ssResult.addMatchedTargetFragment(std::pair<int,int>(currStart,currEnd));
}
currStart = targetStart;
}
currEnd = targetEnd;
prevPos = targetPos;
}
// check if there are remaining fragments
if (currStart >= 0) {
ssResult.addMatchedTargetFragment(std::pair<int,int>(currStart,currEnd));
}
connection.clearResult(targetResult);
BOOST_FOREACH (QueryParam * param, targetParams) {
delete param;
}
results.push_back(ssResult);
} }
connection.endTransaction(); connection.endTransaction();
} }
@ -138,5 +193,45 @@ int UnitDAO::_addSingleSentence(
} }
// Inserts one aligned unit: a row in the "unit" table (source/target
// segments, tm id, token position arrays) plus one "alignment" row per
// aligned token pair. Uses the caller's open connection; the caller owns
// the surrounding transaction (see addAlignedUnits).
// Returns the id of the newly inserted unit row.
// NOTE(review): the raw new/BOOST_FOREACH-delete pattern leaks params if
// connection.execute throws - matches the existing DAO style, but worth
// migrating to RAII eventually.
int UnitDAO::_addAlignedUnit(
                 DBconnection & connection,
                 const AlignedUnit & alignedUnit,
                 const int tmId) {
    // The AlignedUnit getters return by value; fetch each result once
    // instead of copying it on every use / loop iteration.
    const TokenizedSentence sourceSentence = alignedUnit.getSourceSentence();
    const TokenizedSentence targetSentence = alignedUnit.getTargetSentence();
    const std::vector<std::vector<int> > alignments = alignedUnit.getAlignments();

    std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
    std::vector<QueryParam*> params;
    params.push_back(new StringParam(sourceSentence.getSentence()));
    params.push_back(new StringParam(targetSentence.getSentence()));
    params.push_back(new IntParam(tmId));
    params.push_back(new IntArrayParam(_getTokenPositions(sourceSentence)));
    params.push_back(new IntArrayParam(_getTokenPositions(targetSentence)));

    PGresult * result = connection.execute(query, params);
    int newId = connection.getIntValue(result, 0, 0);
    connection.clearResult(result);
    BOOST_FOREACH (QueryParam * param, params) {
        delete param;
    }

    // Add the word alignments: row i of "alignments" lists the 0-based
    // target token positions aligned to source token i.
    for (int i = 0; i < (int) alignments.size(); i++) {
        for (int j = 0; j < (int) alignments[i].size(); j++) {
            std::string alignmentQuery = "INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values($1::integer,$2::integer,$3::integer)";
            std::vector<QueryParam*> alignmentParams;
            alignmentParams.push_back(new IntParam(newId));
            alignmentParams.push_back(new IntParam(i));
            alignmentParams.push_back(new IntParam(alignments[i][j]));
            PGresult * alignmentResult = connection.execute(alignmentQuery, alignmentParams);
            connection.clearResult(alignmentResult);
            BOOST_FOREACH (QueryParam * param, alignmentParams) {
                delete param;
            }
        }
    }
    return newId;
}

View File

@ -56,6 +56,10 @@ private:
const std::string & targetSentence, const std::string & targetSentence,
const int tmId); const int tmId);
int _addAlignedUnit(
DBconnection & connection,
const AlignedUnit & alignedUnit,
const int tmId);
}; };
#endif #endif

View File

@ -20,7 +20,14 @@ CREATE TABLE unit (
source_segment text, source_segment text,
target_segment text, target_segment text,
source_tokens integer[], source_tokens integer[],
target_tokens integer[], target_tokens integer[]
alignments integer[][] );
DROP TABLE IF EXISTS alignment;
CREATE TABLE alignment (
id SERIAL PRIMARY KEY,
unit_id integer,
source_token_pos integer,
target_token_pos integer
); );

99
tests/addAlignedFile.py Executable file
View File

@ -0,0 +1,99 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import unittest
import json
import urllib2
import sys
import host
import time
# Number of sentence pairs posted to the server in a single request.
BUFFER_SIZE = 500

# Base URL of the Concordia server, read from the host module; the port
# is appended only when one is configured.
address = 'http://'+host.concordia_host
if len(host.concordia_port) > 0:
    address += ':'+host.concordia_port
def file_len(fname):
    """Return the number of lines in the file fname.

    Returns 0 for an empty file (the previous enumerate-based version
    raised NameError because the loop variable was never bound).
    """
    with open(fname) as f:
        return sum(1 for _ in f)
def add_data(data):
    """POST the given dict as JSON to the server address and decode the JSON reply."""
    request = urllib2.Request(address)
    request.add_header('Content-Type', 'application/json')
    payload = json.dumps(data)
    reply = urllib2.urlopen(request, payload).read()
    response = json.loads(reply)
    #print response
sourceFile = sys.argv[1]
sourceLangId = int(sys.argv[2])
targetLangId = int(sys.argv[3])
name = sys.argv[4]
totalLines = file_len(sourceFile)
data = {
'operation': 'addTm',
'sourceLangId':sourceLangId,
'targetLangId':targetLangId,
'name':name
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
print response
tmId = int(response['newTmId'])
print "Added new tm: %d" % tmId
data = {
'operation': 'addAlignedSentences',
'tmId':tmId
}
sentences = []
currSentence = []
start = time.time()
with open(sourceFile) as sourceLines:
lineNumber = 0
for line in sourceLines:
line = line.strip()
if lineNumber % 3 == 1:
currSentence.append(line)
elif lineNumber % 3 == 2:
currSentence.append(line)
currSentence.reverse()
sentences.append(currSentence)
currSentence = []
if len(sentences) >= BUFFER_SIZE:
data['sentences'] = sentences
add_data(data)
mark = time.time()
print "Added %d of %d sentences. Time elapsed: %.4f s, current speed: %.4f sentences/second" % ( (lineNumber+1)/3, totalLines/3, mark-start, (lineNumber+1)/3*(mark-start))
lineNumber += 1
if len(sentences) > 0:
data['sentences'] = sentences
add_data(data)
end = time.time()
print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/3*(end-start))
print "Generating index..."
start = time.time()
data = {
'operation': 'refreshIndex',
'tmId' : tmId
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
urllib2.urlopen(req, json.dumps(data)).read()
end = time.time()
print "Index regeneration complete. The operation took %.4f s" % (end - start)

View File

@ -2,9 +2,7 @@
description "pgbouncer" description "pgbouncer"
start on (net-device-up start on started postgresql
and local-filesystems
and runlevel [2345])
stop on runlevel [016] stop on runlevel [016]