optimized adding aligned files

This commit is contained in:
parent 129f154d5e
commit 883aebe919
CMakeLists.txt

@@ -14,7 +14,6 @@ add_executable(concordia_server_process
   simple_search_result.cpp
   complete_concordia_search_result.cpp
   tm_dao.cpp
-  aligned_unit.cpp
 )
 target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case icuuc)
 
aligned_unit.cpp (deleted)

@@ -1,15 +0,0 @@
-#include "aligned_unit.hpp"
-
-
-AlignedUnit::AlignedUnit(const TokenizedSentence & sourceSentence,
-                const TokenizedSentence & targetSentence,
-                std::vector<std::vector<int> > alignments):
-                    _sourceSentence(sourceSentence),
-                    _targetSentence(targetSentence),
-                    _alignments(alignments) {
-}
-
-
-AlignedUnit::~AlignedUnit() {
-}
-
aligned_unit.hpp (deleted)

@@ -1,40 +0,0 @@
-#ifndef ALIGNED_UNIT_HDR
-#define ALIGNED_UNIT_HDR
-
-#include <vector>
-#include <string>
-
-#include <concordia/tokenized_sentence.hpp>
-
-class AlignedUnit {
-public:
-    /*! Constructor.
-    */
-    AlignedUnit(const TokenizedSentence & sourceSentence,
-                const TokenizedSentence & targetSentence,
-                std::vector<std::vector<int> > alignments);
-    /*! Destructor.
-    */
-    virtual ~AlignedUnit();
-
-    TokenizedSentence getSourceSentence() const {
-        return _sourceSentence;
-    }
-
-    TokenizedSentence getTargetSentence() const {
-        return _targetSentence;
-    }
-
-    std::vector<std::vector<int> > getAlignments() const {
-        return _alignments;
-    }
-
-private:
-    TokenizedSentence _sourceSentence;
-
-    TokenizedSentence _targetSentence;
-
-    std::vector<std::vector<int> > _alignments;
-};
-
-#endif
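Note: this commit removes the AlignedUnit wrapper entirely. Its getters returned the TokenizedSentence objects and the alignment vector by value, so every access copied the underlying token data; the refactored code below passes the same information as parallel vectors (source sentences, target sentences, per-sentence alignments) instead.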
index_controller.cpp

@@ -87,20 +87,23 @@ void IndexController::addSentences(
 
 void IndexController::addAlignedSentences(
                  rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
-                 const std::vector<std::string> & sourceSentences,
+                 const std::vector<std::string> & rawSourceSentences,
                  const std::vector<std::string> & targetSentences,
                  const int tmId) {
     try {
         boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
         if (it != _concordiasMap->end()) {
-            std::vector<AlignedUnit> alignedUnits = _getAlignedUnits(sourceSentences, targetSentences, tmId);
-            std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedUnits(alignedUnits, tmId);
-            int index = 0;
-            for(std::vector<AlignedUnit>::iterator ait = alignedUnits.begin(); ait != alignedUnits.end(); ++ait) {
-                it->second->addTokenizedExample(ait->getSourceSentence(), sentenceIds.at(index));
-                index++;
-            }
+            std::vector<std::string> sourceSentences;
+            std::vector<std::vector<std::vector<int> > > allAlignments;
+            _getSourceSentencesAndAlignments(sourceSentences, allAlignments, rawSourceSentences);
 
+            std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, true);
+            std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);
+
+            std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
+            for(int index = 0; index < tokenizedSourceSentences.size(); index++) {
+                it->second->addTokenizedExample(tokenizedSourceSentences.at(index), sentenceIds.at(index));
+            }
             jsonWriter.StartObject();
             jsonWriter.String("status");
             jsonWriter.String("success");
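Two things change in addAlignedSentences: the incoming source strings are now treated as raw, alignment-annotated input that is first unpacked by _getSourceSentencesAndAlignments, and tokenization happens in batch via tokenizeAll. Note that only the source side is added to the Concordia index (addTokenizedExample); the target sentences and alignments are persisted through _unitDAO.addAlignedSentences under the returned sentence ids.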
@@ -137,19 +140,18 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuf
 
 }
 
-std::vector<AlignedUnit> IndexController::_getAlignedUnits(const std::vector<std::string> & sourceSentences,
-                const std::vector<std::string> & targetSentences,
-                const int tmId) {
-    std::vector<AlignedUnit> result;
-    for (int i = 0; i<sourceSentences.size(); i++) {
-        std::string sourceSentence = sourceSentences[i];
-        std::string targetSentence = targetSentences[i];
-
-        std::string rawSourceSentence;
-        std::vector<TokenAnnotation> sourceTokens;
+void IndexController::_getSourceSentencesAndAlignments(
+                std::vector<std::string> & sourceSentences,
+                std::vector<std::vector<std::vector<int> > > & allAlignments,
+                const std::vector<std::string> & rawSourceSentences) {
+    for (int i = 0; i<rawSourceSentences.size(); i++) {
+        std::string rawSourceSentence = rawSourceSentences[i];
+
+        std::string sourceSentence = "";
         std::vector<std::vector<int> > alignments;
 
-        UnicodeString s(sourceSentence.c_str());
+        UnicodeString s(rawSourceSentence.c_str());
         boost::u32regex_iterator<const UChar*> begin(
             boost::make_u32regex_iterator(
                 s,
@@ -177,27 +179,14 @@ std::vector<AlignedUnit> IndexController::_getAlignedUnits(const std::vector<std
                     tokenAlignments.push_back(n);
                 }
                 alignments.push_back(tokenAlignments);
-                rawSourceSentence += token + " ";
+                sourceSentence += token + " ";
             }
         }
 
-        rawSourceSentence = _trim(rawSourceSentence);
+        sourceSentence = sourceSentence.substr(0, sourceSentence.length()-1);
 
-        boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
-        if (it != _concordiasMap->end()) {
-            TokenizedSentence sourceTS = it->second->tokenize(rawSourceSentence, true);
-            TokenizedSentence targetTS = it->second->tokenize(targetSentence, true);
-
-            result.push_back(AlignedUnit(sourceTS, targetTS, alignments));
-        }
+        sourceSentences.push_back(sourceSentence);
+        allAlignments.push_back(alignments);
     }
-    return result;
 }
-
-std::string IndexController::_trim(std::string & str) {
-    size_t first = str.find_first_not_of(' ');
-    size_t last = str.find_last_not_of(' ');
-    return str.substr(first, (last-first+1));
-}
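For reference, a minimal standalone sketch of the three-level structure that _getSourceSentencesAndAlignments fills: allAlignments[i][s] lists the target token positions aligned to source token s of sentence i. The alignment values below are invented for illustration.

```cpp
#include <iostream>
#include <vector>

int main() {
    // One inner vector per source token; each holds the target token
    // positions that source token is aligned to (possibly none).
    std::vector<std::vector<int> > alignments;

    std::vector<int> token0; token0.push_back(0);                       // source token 0 -> target token 0
    std::vector<int> token1; token1.push_back(1); token1.push_back(2);  // source token 1 -> target tokens 1 and 2
    std::vector<int> token2;                                            // source token 2 is unaligned
    alignments.push_back(token0);
    alignments.push_back(token1);
    alignments.push_back(token2);

    // allAlignments[i] is the structure above for the i-th sentence of the batch.
    std::vector<std::vector<std::vector<int> > > allAlignments;
    allAlignments.push_back(alignments);

    std::cout << "sentence 0 has " << allAlignments[0].size() << " source tokens" << std::endl;
    return 0;
}
```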
index_controller.hpp

@@ -10,7 +10,6 @@
 
 
 #include "unit_dao.hpp"
-#include "aligned_unit.hpp"
 
 #include "rapidjson/writer.h"
 
@@ -35,7 +34,7 @@ public:
                 const int tmId);
 
     void addAlignedSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
-                const std::vector<std::string> & sourceSentences,
+                const std::vector<std::string> & rawSourceSentences,
                 const std::vector<std::string> & targetSentences,
                 const int tmId);
 
@@ -43,11 +42,10 @@ public:
                 const int tmId);
 
 private:
-    std::vector<AlignedUnit> _getAlignedUnits(const std::vector<std::string> & sourceSentences,
-                const std::vector<std::string> & targetSentences,
-                const int tmId);
-    std::string _trim(std::string & str);
+    void _getSourceSentencesAndAlignments(
+                std::vector<std::string> & sourceSentences,
+                std::vector<std::vector<std::vector<int> > > & allAlignments,
+                const std::vector<std::string> & rawSourceSentences);
 
     boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
 
unit_dao.cpp

@@ -46,18 +46,18 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addSentences(
     return newIds;
 }
 
-std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedUnits(
-    const std::vector<AlignedUnit> & alignedUnits,
+std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
+    const std::vector<TokenizedSentence> & sourceSentences,
+    const std::vector<TokenizedSentence> & targetSentences,
+    const std::vector<std::vector<std::vector<int> > > & allAlignments,
     const int tmId) {
-    //TODO
-
     DBconnection connection;
     std::vector<SUFFIX_MARKER_TYPE> newIds;
     connection.startTransaction();
 
-    BOOST_FOREACH(const AlignedUnit & alignedUnit, alignedUnits) {
-        newIds.push_back(_addAlignedUnit(connection, alignedUnit, tmId));
+    for (int i=0; i< sourceSentences.size(); i++) {
+        newIds.push_back(_addAlignedUnit(connection, sourceSentences.at(i), targetSentences.at(i), allAlignments.at(i), tmId));
     }
 
     connection.endTransaction();
@@ -195,16 +195,18 @@ int UnitDAO::_addSingleSentence(
 
 int UnitDAO::_addAlignedUnit(
     DBconnection & connection,
-    const AlignedUnit & alignedUnit,
+    const TokenizedSentence & sourceSentence,
+    const TokenizedSentence & targetSentence,
+    const std::vector<std::vector<int> > & alignments,
     const int tmId) {
 
     std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
     std::vector<QueryParam*> params;
-    params.push_back(new StringParam(alignedUnit.getSourceSentence().getSentence()));
-    params.push_back(new StringParam(alignedUnit.getTargetSentence().getSentence()));
+    params.push_back(new StringParam(sourceSentence.getSentence()));
+    params.push_back(new StringParam(targetSentence.getSentence()));
     params.push_back(new IntParam(tmId));
-    params.push_back(new IntArrayParam(_getTokenPositions(alignedUnit.getSourceSentence())));
-    params.push_back(new IntArrayParam(_getTokenPositions(alignedUnit.getTargetSentence())));
+    params.push_back(new IntArrayParam(_getTokenPositions(sourceSentence)));
+    params.push_back(new IntArrayParam(_getTokenPositions(targetSentence)));
 
     PGresult * result = connection.execute(query, params);
     int newId = connection.getIntValue(result, 0, 0);
@@ -214,22 +216,22 @@ int UnitDAO::_addAlignedUnit(
     }
 
     // add alignments
-    for(int i=0;i<alignedUnit.getAlignments().size();i++) {
-        for (int j=0;j<alignedUnit.getAlignments()[i].size();j++) {
-            std::string query = "INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values($1::integer,$2::integer,$3::integer)";
-            std::vector<QueryParam*> params;
-            params.push_back(new IntParam(newId));
-            params.push_back(new IntParam(i));
-            params.push_back(new IntParam(alignedUnit.getAlignments()[i][j]));
-
-            PGresult * result = connection.execute(query, params);
-            connection.clearResult(result);
-            BOOST_FOREACH (QueryParam * param, params) {
-                delete param;
-            }
-        }
-    }
+    bool nonEmpty = false;
+    std::stringstream alignmentsQuery;
+    alignmentsQuery << "INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values ";
+    for(int i=0;i<alignments.size();i++) {
+        for (int j=0;j<alignments[i].size();j++) {
+            nonEmpty = true;
+            alignmentsQuery << "(" << newId << "," << i << "," << alignments[i][j] << "),";
+        }
+    }
+    if (nonEmpty) {
+        query = alignmentsQuery.str();
+        query = query.substr(0, query.length()-1);
+        PGresult * result = connection.execute(query);
+        connection.clearResult(result);
+    }
 
     return newId;
 }
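This hunk is the core of the optimization: instead of executing one parameterized INSERT per alignment pair, _addAlignedUnit now accumulates a single multi-row INSERT per unit and sends it once. A self-contained sketch of the string it builds (the unit id and alignments are invented for illustration):

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
    int newId = 7;  // hypothetical id returned by the INSERT INTO unit ... RETURNING id query
    std::vector<std::vector<int> > alignments;
    std::vector<int> a0; a0.push_back(0);
    std::vector<int> a1; a1.push_back(1); a1.push_back(2);
    alignments.push_back(a0);
    alignments.push_back(a1);

    // Mirror the loop from the diff: append one "(unit_id,source_pos,target_pos)"
    // tuple per alignment pair, then trim the trailing comma.
    bool nonEmpty = false;
    std::stringstream alignmentsQuery;
    alignmentsQuery << "INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values ";
    for (size_t i = 0; i < alignments.size(); i++) {
        for (size_t j = 0; j < alignments[i].size(); j++) {
            nonEmpty = true;
            alignmentsQuery << "(" << newId << "," << i << "," << alignments[i][j] << "),";
        }
    }
    if (nonEmpty) {
        std::string query = alignmentsQuery.str();
        query = query.substr(0, query.length() - 1);
        std::cout << query << std::endl;
        // Prints:
        // INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values (7,0,0),(7,1,1),(7,1,2)
    }
    return 0;
}
```

Since every interpolated value is an integer produced in-process, building the statement as a string does not open an injection hole here, and N database round trips per unit collapse into one.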
unit_dao.hpp

@@ -11,7 +11,6 @@
 #include <concordia/concordia_search_result.hpp>
 #include <boost/shared_ptr.hpp>
 
-#include "aligned_unit.hpp"
 #include "simple_search_result.hpp"
 #include "complete_concordia_search_result.hpp"
 #include "db_connection.hpp"
@@ -35,8 +34,10 @@ public:
        const std::vector<std::string> & targetSentences,
        const int tmId);
 
-    std::vector<SUFFIX_MARKER_TYPE> addAlignedUnits(
-        const std::vector<AlignedUnit> & alignedUnits,
+    std::vector<SUFFIX_MARKER_TYPE> addAlignedSentences(
+        const std::vector<TokenizedSentence> & sourceSentences,
+        const std::vector<TokenizedSentence> & targetSentences,
+        const std::vector<std::vector<std::vector<int> > > & allAlignments,
         const int tmId);
 
     std::vector<SimpleSearchResult> getSearchResults(const std::vector<MatchedPatternFragment> & fragments);
@@ -58,7 +59,9 @@ private:
 
     int _addAlignedUnit(
         DBconnection & connection,
-        const AlignedUnit & alignedUnit,
+        const TokenizedSentence & sourceSentence,
+        const TokenizedSentence & targetSentence,
+        const std::vector<std::vector<int> > & alignments,
         const int tmId);
 };
 
(Python test client; filename not shown in this view)

@@ -43,7 +43,6 @@ data = {
 req = urllib2.Request(address)
 req.add_header('Content-Type', 'application/json')
 response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
-print response
 tmId = int(response['newTmId'])
 print "Added new tm: %d" % tmId
 
@@ -80,7 +79,7 @@ if len(sentences) > 0:
     add_data(data)
 
 end = time.time()
-print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/3*(end-start))
+print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/(3*(end-start)))
 
 print "Generating index..."
 start = time.time()
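The final hunk fixes the throughput arithmetic in the test client: each sentence pair spans three input lines, so the sentence count is (lineNumber+1)/3, and sentences per second is that count divided by the elapsed time, i.e. (lineNumber+1)/(3*(end-start)). The old expression multiplied by the elapsed time instead of dividing by it.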