optimized adding aligned files
This commit is contained in:
parent
129f154d5e
commit
883aebe919
@ -14,7 +14,6 @@ add_executable(concordia_server_process
|
||||
simple_search_result.cpp
|
||||
complete_concordia_search_result.cpp
|
||||
tm_dao.cpp
|
||||
aligned_unit.cpp
|
||||
)
|
||||
target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case icuuc)
|
||||
|
||||
|
@ -1,15 +0,0 @@
|
||||
#include "aligned_unit.hpp"
|
||||
|
||||
|
||||
/*! Builds an aligned unit from a tokenized source sentence, a tokenized
 *  target sentence and the alignment lists supplied by the caller.
 *  Each argument is copied into the corresponding member.
 */
AlignedUnit::AlignedUnit(const TokenizedSentence & sourceSentence,
                         const TokenizedSentence & targetSentence,
                         std::vector<std::vector<int> > alignments)
    : _sourceSentence(sourceSentence)
    , _targetSentence(targetSentence)
    , _alignments(alignments)
{
    // Nothing else to do: all members are set in the initializer list.
}
|
||||
|
||||
|
||||
/*! Destructor. Performs no explicit cleanup. */
AlignedUnit::~AlignedUnit() {}
|
||||
|
@ -1,40 +0,0 @@
|
||||
#ifndef ALIGNED_UNIT_HDR
|
||||
#define ALIGNED_UNIT_HDR
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include <concordia/tokenized_sentence.hpp>
|
||||
|
||||
class AlignedUnit {
|
||||
public:
|
||||
/*! Constructor.
|
||||
*/
|
||||
AlignedUnit(const TokenizedSentence & sourceSentence,
|
||||
const TokenizedSentence & targetSentence,
|
||||
std::vector<std::vector<int> > alignments);
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~AlignedUnit();
|
||||
|
||||
TokenizedSentence getSourceSentence() const {
|
||||
return _sourceSentence;
|
||||
}
|
||||
|
||||
TokenizedSentence getTargetSentence() const {
|
||||
return _targetSentence;
|
||||
}
|
||||
|
||||
std::vector<std::vector<int> > getAlignments() const {
|
||||
return _alignments;
|
||||
}
|
||||
|
||||
private:
|
||||
TokenizedSentence _sourceSentence;
|
||||
|
||||
TokenizedSentence _targetSentence;
|
||||
|
||||
std::vector<std::vector<int> > _alignments;
|
||||
};
|
||||
|
||||
#endif
|
@ -87,20 +87,23 @@ void IndexController::addSentences(
|
||||
|
||||
void IndexController::addAlignedSentences(
|
||||
rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
||||
const std::vector<std::string> & sourceSentences,
|
||||
const std::vector<std::string> & rawSourceSentences,
|
||||
const std::vector<std::string> & targetSentences,
|
||||
const int tmId) {
|
||||
try {
|
||||
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
|
||||
if (it != _concordiasMap->end()) {
|
||||
std::vector<AlignedUnit> alignedUnits = _getAlignedUnits(sourceSentences, targetSentences, tmId);
|
||||
std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedUnits(alignedUnits, tmId);
|
||||
int index = 0;
|
||||
for(std::vector<AlignedUnit>::iterator ait = alignedUnits.begin(); ait != alignedUnits.end(); ++ait) {
|
||||
it->second->addTokenizedExample(ait->getSourceSentence(), sentenceIds.at(index));
|
||||
index++;
|
||||
}
|
||||
std::vector<std::string> sourceSentences;
|
||||
std::vector<std::vector<std::vector<int> > > allAlignments;
|
||||
_getSourceSentencesAndAlignments(sourceSentences, allAlignments, rawSourceSentences);
|
||||
|
||||
std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, true);
|
||||
std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);
|
||||
|
||||
std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
|
||||
for(int index = 0; index < tokenizedSourceSentences.size(); index++) {
|
||||
it->second->addTokenizedExample(tokenizedSourceSentences.at(index), sentenceIds.at(index));
|
||||
}
|
||||
jsonWriter.StartObject();
|
||||
jsonWriter.String("status");
|
||||
jsonWriter.String("success");
|
||||
@ -137,19 +140,18 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuf
|
||||
|
||||
}
|
||||
|
||||
std::vector<AlignedUnit> IndexController::_getAlignedUnits(const std::vector<std::string> & sourceSentences,
|
||||
const std::vector<std::string> & targetSentences,
|
||||
const int tmId) {
|
||||
std::vector<AlignedUnit> result;
|
||||
for (int i = 0; i<sourceSentences.size(); i++) {
|
||||
std::string sourceSentence = sourceSentences[i];
|
||||
std::string targetSentence = targetSentences[i];
|
||||
void IndexController::_getSourceSentencesAndAlignments(
|
||||
std::vector<std::string> & sourceSentences,
|
||||
std::vector<std::vector<std::vector<int> > > & allAlignments,
|
||||
const std::vector<std::string> & rawSourceSentences) {
|
||||
|
||||
for (int i = 0; i<rawSourceSentences.size(); i++) {
|
||||
std::string rawSourceSentence = rawSourceSentences[i];
|
||||
|
||||
std::string rawSourceSentence;
|
||||
std::vector<TokenAnnotation> sourceTokens;
|
||||
std::string sourceSentence = "";
|
||||
std::vector<std::vector<int> > alignments;
|
||||
|
||||
UnicodeString s(sourceSentence.c_str());
|
||||
UnicodeString s(rawSourceSentence.c_str());
|
||||
boost::u32regex_iterator<const UChar*> begin(
|
||||
boost::make_u32regex_iterator(
|
||||
s,
|
||||
@ -177,27 +179,14 @@ std::vector<AlignedUnit> IndexController::_getAlignedUnits(const std::vector<std
|
||||
tokenAlignments.push_back(n);
|
||||
}
|
||||
alignments.push_back(tokenAlignments);
|
||||
rawSourceSentence += token + " ";
|
||||
sourceSentence += token + " ";
|
||||
}
|
||||
}
|
||||
|
||||
rawSourceSentence = _trim(rawSourceSentence);
|
||||
sourceSentence = sourceSentence.substr(0, sourceSentence.length()-1);
|
||||
|
||||
|
||||
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
|
||||
if (it != _concordiasMap->end()) {
|
||||
TokenizedSentence sourceTS = it->second->tokenize(rawSourceSentence, true);
|
||||
TokenizedSentence targetTS = it->second->tokenize(targetSentence, true);
|
||||
|
||||
result.push_back(AlignedUnit(sourceTS, targetTS, alignments));
|
||||
}
|
||||
sourceSentences.push_back(sourceSentence);
|
||||
allAlignments.push_back(alignments);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/*! Returns a copy of str with leading and trailing space characters
 *  (' ') removed. Only the plain space is stripped; tabs and other
 *  whitespace are left untouched. The argument itself is not modified.
 *
 *  \param str string to trim
 *  \return the trimmed copy, or an empty string when str is empty or
 *          consists solely of spaces
 */
std::string IndexController::_trim(std::string & str) {
    const std::string::size_type first = str.find_first_not_of(' ');
    if (first == std::string::npos) {
        // No non-space character at all. Calling substr(npos, ...) here
        // would throw std::out_of_range, so return an empty result.
        return std::string();
    }
    // A non-space character exists, so this search cannot fail.
    const std::string::size_type last = str.find_last_not_of(' ');
    return str.substr(first, last - first + 1);
}
|
||||
|
||||
|
@ -10,7 +10,6 @@
|
||||
|
||||
|
||||
#include "unit_dao.hpp"
|
||||
#include "aligned_unit.hpp"
|
||||
|
||||
#include "rapidjson/writer.h"
|
||||
|
||||
@ -35,7 +34,7 @@ public:
|
||||
const int tmId);
|
||||
|
||||
void addAlignedSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
||||
const std::vector<std::string> & sourceSentences,
|
||||
const std::vector<std::string> & rawSourceSentences,
|
||||
const std::vector<std::string> & targetSentences,
|
||||
const int tmId);
|
||||
|
||||
@ -43,12 +42,11 @@ public:
|
||||
const int tmId);
|
||||
|
||||
private:
|
||||
std::vector<AlignedUnit> _getAlignedUnits(const std::vector<std::string> & sourceSentences,
|
||||
const std::vector<std::string> & targetSentences,
|
||||
const int tmId);
|
||||
void _getSourceSentencesAndAlignments(
|
||||
std::vector<std::string> & sourceSentences,
|
||||
std::vector<std::vector<std::vector<int> > > & allAlignments,
|
||||
const std::vector<std::string> & rawSourceSentences);
|
||||
|
||||
std::string _trim(std::string & str);
|
||||
|
||||
boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
|
||||
|
||||
UnitDAO _unitDAO;
|
||||
|
@ -46,18 +46,18 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addSentences(
|
||||
return newIds;
|
||||
}
|
||||
|
||||
std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedUnits(
|
||||
const std::vector<AlignedUnit> & alignedUnits,
|
||||
std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
|
||||
const std::vector<TokenizedSentence> & sourceSentences,
|
||||
const std::vector<TokenizedSentence> & targetSentences,
|
||||
const std::vector<std::vector<std::vector<int> > > & allAlignments,
|
||||
const int tmId) {
|
||||
//TODO
|
||||
|
||||
|
||||
DBconnection connection;
|
||||
std::vector<SUFFIX_MARKER_TYPE> newIds;
|
||||
connection.startTransaction();
|
||||
|
||||
BOOST_FOREACH(const AlignedUnit & alignedUnit, alignedUnits) {
|
||||
newIds.push_back(_addAlignedUnit(connection, alignedUnit, tmId));
|
||||
for (int i=0; i< sourceSentences.size(); i++) {
|
||||
newIds.push_back(_addAlignedUnit(connection, sourceSentences.at(i), targetSentences.at(i), allAlignments.at(i), tmId));
|
||||
}
|
||||
|
||||
connection.endTransaction();
|
||||
@ -194,17 +194,19 @@ int UnitDAO::_addSingleSentence(
|
||||
|
||||
|
||||
int UnitDAO::_addAlignedUnit(
|
||||
DBconnection & connection,
|
||||
const AlignedUnit & alignedUnit,
|
||||
const int tmId) {
|
||||
DBconnection & connection,
|
||||
const TokenizedSentence & sourceSentence,
|
||||
const TokenizedSentence & targetSentence,
|
||||
const std::vector<std::vector<int> > & alignments,
|
||||
const int tmId) {
|
||||
|
||||
std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
|
||||
std::vector<QueryParam*> params;
|
||||
params.push_back(new StringParam(alignedUnit.getSourceSentence().getSentence()));
|
||||
params.push_back(new StringParam(alignedUnit.getTargetSentence().getSentence()));
|
||||
params.push_back(new StringParam(sourceSentence.getSentence()));
|
||||
params.push_back(new StringParam(targetSentence.getSentence()));
|
||||
params.push_back(new IntParam(tmId));
|
||||
params.push_back(new IntArrayParam(_getTokenPositions(alignedUnit.getSourceSentence())));
|
||||
params.push_back(new IntArrayParam(_getTokenPositions(alignedUnit.getTargetSentence())));
|
||||
params.push_back(new IntArrayParam(_getTokenPositions(sourceSentence)));
|
||||
params.push_back(new IntArrayParam(_getTokenPositions(targetSentence)));
|
||||
|
||||
PGresult * result = connection.execute(query, params);
|
||||
int newId = connection.getIntValue(result, 0, 0);
|
||||
@ -214,23 +216,23 @@ int UnitDAO::_addAlignedUnit(
|
||||
}
|
||||
|
||||
// add alignments
|
||||
for(int i=0;i<alignedUnit.getAlignments().size();i++) {
|
||||
for (int j=0;j<alignedUnit.getAlignments()[i].size();j++) {
|
||||
std::string query = "INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values($1::integer,$2::integer,$3::integer)";
|
||||
std::vector<QueryParam*> params;
|
||||
params.push_back(new IntParam(newId));
|
||||
params.push_back(new IntParam(i));
|
||||
params.push_back(new IntParam(alignedUnit.getAlignments()[i][j]));
|
||||
|
||||
PGresult * result = connection.execute(query, params);
|
||||
connection.clearResult(result);
|
||||
BOOST_FOREACH (QueryParam * param, params) {
|
||||
delete param;
|
||||
}
|
||||
bool nonEmpty = false;
|
||||
std::stringstream alignmentsQuery;
|
||||
alignmentsQuery << "INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values ";
|
||||
|
||||
for(int i=0;i<alignments.size();i++) {
|
||||
for (int j=0;j<alignments[i].size();j++) {
|
||||
nonEmpty = true;
|
||||
alignmentsQuery << "(" << newId << "," << i << "," << alignments[i][j] << "),";
|
||||
}
|
||||
}
|
||||
if (nonEmpty) {
|
||||
query = alignmentsQuery.str();
|
||||
query = query.substr(0, query.length()-1);
|
||||
PGresult * result = connection.execute(query);
|
||||
connection.clearResult(result);
|
||||
}
|
||||
|
||||
|
||||
return newId;
|
||||
}
|
||||
|
||||
|
@ -11,7 +11,6 @@
|
||||
#include <concordia/concordia_search_result.hpp>
|
||||
#include <boost/shared_ptr.hpp>
|
||||
|
||||
#include "aligned_unit.hpp"
|
||||
#include "simple_search_result.hpp"
|
||||
#include "complete_concordia_search_result.hpp"
|
||||
#include "db_connection.hpp"
|
||||
@ -35,8 +34,10 @@ public:
|
||||
const std::vector<std::string> & targetSentences,
|
||||
const int tmId);
|
||||
|
||||
std::vector<SUFFIX_MARKER_TYPE> addAlignedUnits(
|
||||
const std::vector<AlignedUnit> & alignedUnits,
|
||||
std::vector<SUFFIX_MARKER_TYPE> addAlignedSentences(
|
||||
const std::vector<TokenizedSentence> & sourceSentences,
|
||||
const std::vector<TokenizedSentence> & targetSentences,
|
||||
const std::vector<std::vector<std::vector<int> > > & allAlignments,
|
||||
const int tmId);
|
||||
|
||||
std::vector<SimpleSearchResult> getSearchResults(const std::vector<MatchedPatternFragment> & fragments);
|
||||
@ -58,7 +59,9 @@ private:
|
||||
|
||||
int _addAlignedUnit(
|
||||
DBconnection & connection,
|
||||
const AlignedUnit & alignedUnit,
|
||||
const TokenizedSentence & sourceSentence,
|
||||
const TokenizedSentence & targetSentence,
|
||||
const std::vector<std::vector<int> > & alignments,
|
||||
const int tmId);
|
||||
};
|
||||
|
||||
|
@ -43,7 +43,6 @@ data = {
|
||||
req = urllib2.Request(address)
|
||||
req.add_header('Content-Type', 'application/json')
|
||||
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
|
||||
print response
|
||||
tmId = int(response['newTmId'])
|
||||
print "Added new tm: %d" % tmId
|
||||
|
||||
@ -80,7 +79,7 @@ if len(sentences) > 0:
|
||||
add_data(data)
|
||||
|
||||
end = time.time()
|
||||
print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/3*(end-start))
|
||||
print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/(3*(end-start)))
|
||||
|
||||
print "Generating index..."
|
||||
start = time.time()
|
||||
|
Loading…
Reference in New Issue
Block a user