Optimized adding of aligned files

This commit is contained in:
rjawor 2016-01-01 22:10:51 +01:00
parent 129f154d5e
commit 883aebe919
8 changed files with 66 additions and 131 deletions

View File

@ -14,7 +14,6 @@ add_executable(concordia_server_process
simple_search_result.cpp
complete_concordia_search_result.cpp
tm_dao.cpp
aligned_unit.cpp
)
target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case icuuc)

View File

@ -1,15 +0,0 @@
#include "aligned_unit.hpp"
// Value object pairing a tokenized source sentence with its tokenized
// target sentence and their token alignment links.
// alignments: one inner vector per source token, holding the indices of
// the target tokens it aligns to — presumably; confirm against
// UnitDAO::_addAlignedUnit, which writes (source_pos, target_pos) rows.
AlignedUnit::AlignedUnit(const TokenizedSentence & sourceSentence,
const TokenizedSentence & targetSentence,
std::vector<std::vector<int> > alignments):
_sourceSentence(sourceSentence),
_targetSentence(targetSentence),
_alignments(alignments) {
}
// All members are value types that clean up themselves; nothing to release.
AlignedUnit::~AlignedUnit() {
}

View File

@ -1,40 +0,0 @@
#ifndef ALIGNED_UNIT_HDR
#define ALIGNED_UNIT_HDR
#include <vector>
#include <string>
#include <concordia/tokenized_sentence.hpp>
/*! Value object pairing a tokenized source sentence with its tokenized
    target sentence and their token alignment links.
*/
class AlignedUnit {
public:
/*! Constructor.
    \param sourceSentence tokenized source-language sentence
    \param targetSentence tokenized target-language sentence
    \param alignments one inner vector per source token with the indices
           of aligned target tokens (assumption — verify against
           UnitDAO::_addAlignedUnit, which stores them as
           (source_token_pos, target_token_pos) pairs)
*/
AlignedUnit(const TokenizedSentence & sourceSentence,
const TokenizedSentence & targetSentence,
std::vector<std::vector<int> > alignments);
/*! Destructor.
*/
virtual ~AlignedUnit();
/*! \returns a copy of the source sentence. */
TokenizedSentence getSourceSentence() const {
return _sourceSentence;
}
/*! \returns a copy of the target sentence. */
TokenizedSentence getTargetSentence() const {
return _targetSentence;
}
/*! \returns a copy of the alignment links. */
std::vector<std::vector<int> > getAlignments() const {
return _alignments;
}
private:
TokenizedSentence _sourceSentence;
TokenizedSentence _targetSentence;
std::vector<std::vector<int> > _alignments;
};
#endif

View File

@ -87,20 +87,23 @@ void IndexController::addSentences(
void IndexController::addAlignedSentences(
rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const std::vector<std::string> & sourceSentences,
const std::vector<std::string> & rawSourceSentences,
const std::vector<std::string> & targetSentences,
const int tmId) {
try {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
std::vector<AlignedUnit> alignedUnits = _getAlignedUnits(sourceSentences, targetSentences, tmId);
std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedUnits(alignedUnits, tmId);
int index = 0;
for(std::vector<AlignedUnit>::iterator ait = alignedUnits.begin(); ait != alignedUnits.end(); ++ait) {
it->second->addTokenizedExample(ait->getSourceSentence(), sentenceIds.at(index));
index++;
}
std::vector<std::string> sourceSentences;
std::vector<std::vector<std::vector<int> > > allAlignments;
_getSourceSentencesAndAlignments(sourceSentences, allAlignments, rawSourceSentences);
std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, true);
std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);
std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
for(int index = 0; index < tokenizedSourceSentences.size(); index++) {
it->second->addTokenizedExample(tokenizedSourceSentences.at(index), sentenceIds.at(index));
}
jsonWriter.StartObject();
jsonWriter.String("status");
jsonWriter.String("success");
@ -137,19 +140,18 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuf
}
std::vector<AlignedUnit> IndexController::_getAlignedUnits(const std::vector<std::string> & sourceSentences,
const std::vector<std::string> & targetSentences,
const int tmId) {
std::vector<AlignedUnit> result;
for (int i = 0; i<sourceSentences.size(); i++) {
std::string sourceSentence = sourceSentences[i];
std::string targetSentence = targetSentences[i];
void IndexController::_getSourceSentencesAndAlignments(
std::vector<std::string> & sourceSentences,
std::vector<std::vector<std::vector<int> > > & allAlignments,
const std::vector<std::string> & rawSourceSentences) {
for (int i = 0; i<rawSourceSentences.size(); i++) {
std::string rawSourceSentence = rawSourceSentences[i];
std::string rawSourceSentence;
std::vector<TokenAnnotation> sourceTokens;
std::string sourceSentence = "";
std::vector<std::vector<int> > alignments;
UnicodeString s(sourceSentence.c_str());
UnicodeString s(rawSourceSentence.c_str());
boost::u32regex_iterator<const UChar*> begin(
boost::make_u32regex_iterator(
s,
@ -177,27 +179,14 @@ std::vector<AlignedUnit> IndexController::_getAlignedUnits(const std::vector<std
tokenAlignments.push_back(n);
}
alignments.push_back(tokenAlignments);
rawSourceSentence += token + " ";
sourceSentence += token + " ";
}
}
rawSourceSentence = _trim(rawSourceSentence);
sourceSentence = sourceSentence.substr(0, sourceSentence.length()-1);
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
TokenizedSentence sourceTS = it->second->tokenize(rawSourceSentence, true);
TokenizedSentence targetTS = it->second->tokenize(targetSentence, true);
result.push_back(AlignedUnit(sourceTS, targetTS, alignments));
}
sourceSentences.push_back(sourceSentence);
allAlignments.push_back(alignments);
}
return result;
}
/*! Strips leading and trailing space characters from str.
    \param str the string to trim (not modified)
    \returns the trimmed copy; an empty string when str is empty or
             consists only of spaces.

    Guarding against npos is the fix: the previous version called
    substr(npos, ...) on an all-space/empty input, which throws
    std::out_of_range.
*/
std::string IndexController::_trim(std::string & str) {
    size_t first = str.find_first_not_of(' ');
    if (first == std::string::npos) {
        return "";
    }
    size_t last = str.find_last_not_of(' ');
    return str.substr(first, last - first + 1);
}

View File

@ -10,7 +10,6 @@
#include "unit_dao.hpp"
#include "aligned_unit.hpp"
#include "rapidjson/writer.h"
@ -35,7 +34,7 @@ public:
const int tmId);
void addAlignedSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const std::vector<std::string> & sourceSentences,
const std::vector<std::string> & rawSourceSentences,
const std::vector<std::string> & targetSentences,
const int tmId);
@ -43,12 +42,11 @@ public:
const int tmId);
private:
std::vector<AlignedUnit> _getAlignedUnits(const std::vector<std::string> & sourceSentences,
const std::vector<std::string> & targetSentences,
const int tmId);
void _getSourceSentencesAndAlignments(
std::vector<std::string> & sourceSentences,
std::vector<std::vector<std::vector<int> > > & allAlignments,
const std::vector<std::string> & rawSourceSentences);
std::string _trim(std::string & str);
boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
UnitDAO _unitDAO;

View File

@ -46,18 +46,18 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addSentences(
return newIds;
}
std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedUnits(
const std::vector<AlignedUnit> & alignedUnits,
std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
const std::vector<TokenizedSentence> & sourceSentences,
const std::vector<TokenizedSentence> & targetSentences,
const std::vector<std::vector<std::vector<int> > > & allAlignments,
const int tmId) {
//TODO
DBconnection connection;
std::vector<SUFFIX_MARKER_TYPE> newIds;
connection.startTransaction();
BOOST_FOREACH(const AlignedUnit & alignedUnit, alignedUnits) {
newIds.push_back(_addAlignedUnit(connection, alignedUnit, tmId));
for (int i=0; i< sourceSentences.size(); i++) {
newIds.push_back(_addAlignedUnit(connection, sourceSentences.at(i), targetSentences.at(i), allAlignments.at(i), tmId));
}
connection.endTransaction();
@ -194,17 +194,19 @@ int UnitDAO::_addSingleSentence(
int UnitDAO::_addAlignedUnit(
DBconnection & connection,
const AlignedUnit & alignedUnit,
const int tmId) {
DBconnection & connection,
const TokenizedSentence & sourceSentence,
const TokenizedSentence & targetSentence,
const std::vector<std::vector<int> > & alignments,
const int tmId) {
std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
std::vector<QueryParam*> params;
params.push_back(new StringParam(alignedUnit.getSourceSentence().getSentence()));
params.push_back(new StringParam(alignedUnit.getTargetSentence().getSentence()));
params.push_back(new StringParam(sourceSentence.getSentence()));
params.push_back(new StringParam(targetSentence.getSentence()));
params.push_back(new IntParam(tmId));
params.push_back(new IntArrayParam(_getTokenPositions(alignedUnit.getSourceSentence())));
params.push_back(new IntArrayParam(_getTokenPositions(alignedUnit.getTargetSentence())));
params.push_back(new IntArrayParam(_getTokenPositions(sourceSentence)));
params.push_back(new IntArrayParam(_getTokenPositions(targetSentence)));
PGresult * result = connection.execute(query, params);
int newId = connection.getIntValue(result, 0, 0);
@ -214,23 +216,23 @@ int UnitDAO::_addAlignedUnit(
}
// add alignments
for(int i=0;i<alignedUnit.getAlignments().size();i++) {
for (int j=0;j<alignedUnit.getAlignments()[i].size();j++) {
std::string query = "INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values($1::integer,$2::integer,$3::integer)";
std::vector<QueryParam*> params;
params.push_back(new IntParam(newId));
params.push_back(new IntParam(i));
params.push_back(new IntParam(alignedUnit.getAlignments()[i][j]));
PGresult * result = connection.execute(query, params);
connection.clearResult(result);
BOOST_FOREACH (QueryParam * param, params) {
delete param;
}
bool nonEmpty = false;
std::stringstream alignmentsQuery;
alignmentsQuery << "INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values ";
for(int i=0;i<alignments.size();i++) {
for (int j=0;j<alignments[i].size();j++) {
nonEmpty = true;
alignmentsQuery << "(" << newId << "," << i << "," << alignments[i][j] << "),";
}
}
if (nonEmpty) {
query = alignmentsQuery.str();
query = query.substr(0, query.length()-1);
PGresult * result = connection.execute(query);
connection.clearResult(result);
}
return newId;
}

View File

@ -11,7 +11,6 @@
#include <concordia/concordia_search_result.hpp>
#include <boost/shared_ptr.hpp>
#include "aligned_unit.hpp"
#include "simple_search_result.hpp"
#include "complete_concordia_search_result.hpp"
#include "db_connection.hpp"
@ -35,8 +34,10 @@ public:
const std::vector<std::string> & targetSentences,
const int tmId);
std::vector<SUFFIX_MARKER_TYPE> addAlignedUnits(
const std::vector<AlignedUnit> & alignedUnits,
std::vector<SUFFIX_MARKER_TYPE> addAlignedSentences(
const std::vector<TokenizedSentence> & sourceSentences,
const std::vector<TokenizedSentence> & targetSentences,
const std::vector<std::vector<std::vector<int> > > & allAlignments,
const int tmId);
std::vector<SimpleSearchResult> getSearchResults(const std::vector<MatchedPatternFragment> & fragments);
@ -58,7 +59,9 @@ private:
int _addAlignedUnit(
DBconnection & connection,
const AlignedUnit & alignedUnit,
const TokenizedSentence & sourceSentence,
const TokenizedSentence & targetSentence,
const std::vector<std::vector<int> > & alignments,
const int tmId);
};

View File

@ -43,7 +43,6 @@ data = {
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
print response
tmId = int(response['newTmId'])
print "Added new tm: %d" % tmId
@ -80,7 +79,7 @@ if len(sentences) > 0:
add_data(data)
end = time.time()
print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/3*(end-start))
print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/(3*(end-start)))
print "Generating index..."
start = time.time()