optimized db querying

This commit is contained in:
rjawor 2019-01-17 14:58:22 +01:00
parent 329421b0c1
commit e352e0a8b2
6 changed files with 132 additions and 48 deletions

View File

@ -3,6 +3,7 @@
<script src="../js/jquery-1.11.3.min.js"></script>
<script src="../js/cat.js"></script>
<link rel="stylesheet" href="../css/concordia_cat.css" />
<title>Concordia</title>
<meta charset="UTF-8">
</head>
<body>

View File

@ -0,0 +1,44 @@
#include "int_2d_array_param.hpp"
#include <boost/foreach.hpp>
#include <sstream>
/*! Builds the textual form of a two-dimensional integer array.
    Every row is written as its values separated by commas and wrapped in
    square brackets; rows are separated by semicolons and the whole text is
    wrapped in one more pair of square brackets, e.g. [[0,1];[2];[]].
    \param array rows of integers to serialize
*/
Int2DArrayParam::Int2DArrayParam(std::vector<std::vector<int> > array) {
    std::stringstream buffer;
    buffer << "[";
    for (std::vector<std::vector<int> >::const_iterator row = array.begin();
         row != array.end(); ++row) {
        // A separator goes in front of every row except the first one.
        if (row != array.begin()) {
            buffer << ";";
        }
        buffer << "[";
        for (std::vector<int>::const_iterator value = row->begin();
             value != row->end(); ++value) {
            // Same rule for the values inside a single row.
            if (value != row->begin()) {
                buffer << ",";
            }
            buffer << *value;
        }
        buffer << "]";
    }
    buffer << "]";
    _arrayString = buffer.str();
}
/*! Destructor. Has no work of its own to do.
*/
Int2DArrayParam::~Int2DArrayParam() {}
/*! Gives access to the serialized array text.
    \return pointer to the characters of the stored string; the storage
            belongs to this object
*/
const char * Int2DArrayParam::getValue() {
    const char * serialized = _arrayString.c_str();
    return serialized;
}
/*! Reports the size of the serialized array text.
    \return number of characters in the stored string, converted to int
*/
const int Int2DArrayParam::getLength() {
    const int length = _arrayString.size();
    return length;
}
/*! Tells whether the value is flagged as binary.
    \return always 0, i.e. this parameter is never flagged as binary
*/
const int Int2DArrayParam::isBinary() {
    const int notBinary = 0;
    return notBinary;
}

View File

@ -0,0 +1,27 @@
#ifndef INT_2D_ARRAY_PARAM_HDR
#define INT_2D_ARRAY_PARAM_HDR
#include "query_param.hpp"
#include <string>
#include <vector>
/*! Query parameter that carries a two-dimensional integer array as text.
    The text is built once, in the constructor, and is returned unchanged
    by getValue(); getLength() reports its size.
*/
class Int2DArrayParam : public QueryParam {
public:
/*! Constructor. Serializes the given array into the string kept by
    this object.
    \param array rows of integers to serialize
*/
Int2DArrayParam(std::vector<std::vector<int> > array);
/*! Destructor.
*/
virtual ~Int2DArrayParam();
/*! Returns a pointer to the characters of the serialized array text.
*/
const char * getValue();
/*! Returns the number of characters in the serialized array text.
*/
const int getLength();
/*! Returns 0: the value is never flagged as binary.
*/
const int isBinary();
private:
/*! Serialized form of the array passed to the constructor.
*/
std::string _arrayString;
};
#endif

View File

@ -2,11 +2,15 @@
#include <sstream>
#include <string>
#include <vector>
#include <cstdlib>
#include <set>
#include "query_param.hpp"
#include "string_param.hpp"
#include "int_param.hpp"
#include "int_array_param.hpp"
#include "int_2d_array_param.hpp"
#include "logger.hpp"
#include "example_occurence.hpp"
@ -14,6 +18,8 @@
#include <boost/foreach.hpp>
#include <concordia/token_annotation.hpp>
#include <boost/algorithm/string.hpp>
/*! Constructor. Performs no initialization of its own.
*/
UnitDAO::UnitDAO() {}
@ -140,7 +146,7 @@ SimpleSearchResult UnitDAO::_getResultFromFragment(
}
ExampleOccurence UnitDAO::_getExampleOccurence(DBconnection & connection, const SubstringOccurence sOccurence, const int matchedLength) {
std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer] FROM unit WHERE id = $3::integer;";
std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer], target_tokens, alignments FROM unit WHERE id = $3::integer;";
std::vector<QueryParam*> params;
params.push_back(new IntParam(2*sOccurence.getOffset()+1));
params.push_back(new IntParam(2*(sOccurence.getOffset()+matchedLength)));
@ -151,28 +157,34 @@ ExampleOccurence UnitDAO::_getExampleOccurence(DBconnection & connection, const
connection.getIntValue(result,0,4), // matched example end
connection.getStringValue(result,0,1), // source segment
connection.getStringValue(result,0,2)); // target segment
std::string targetTokensRaw = connection.getStringValue(result,0,5);
std::string alignmentsRaw = connection.getStringValue(result,0,6);
connection.clearResult(result);
BOOST_FOREACH (QueryParam * param, params) {
delete param;
}
// now add all target fragments matched with this fragment
std::string targetQuery = "SELECT target_token_pos, target_tokens[2*target_token_pos+1], target_tokens[2*target_token_pos+2] FROM unit INNER JOIN alignment ON alignment.unit_id = unit.id AND unit.id = $1::integer AND source_token_pos between $2::integer and $3::integer ORDER BY target_token_pos";
std::vector<QueryParam*> targetParams;
targetParams.push_back(new IntParam(sOccurence.getId()));
targetParams.push_back(new IntParam(sOccurence.getOffset()));
targetParams.push_back(new IntParam(sOccurence.getOffset() + matchedLength - 1));
PGresult * targetResult = connection.execute(targetQuery, targetParams);
std::vector<int> targetTokens = _getArray(targetTokensRaw);
std::vector<std::vector<int> > alignments = _get2DArray(alignmentsRaw);
std::set<int> matchedTargetTokens;
for(int sourceTokenIndex = sOccurence.getOffset(); sourceTokenIndex < sOccurence.getOffset()+matchedLength; sourceTokenIndex++) {
BOOST_FOREACH(int & targetTokenIndex, alignments.at(sourceTokenIndex)) {
matchedTargetTokens.insert(targetTokenIndex);
}
}
int prevPos = -2;
int currStart = -1;
int currEnd = -1;
for (int i=0;i<connection.getRowCount(targetResult);i++) {
int targetPos = connection.getIntValue(targetResult, i, 0);
int targetStart = connection.getIntValue(targetResult, i, 1);
int targetEnd = connection.getIntValue(targetResult, i, 2);
std::set<int>::iterator iter;
for(iter=matchedTargetTokens.begin(); iter!=matchedTargetTokens.end();++iter) {
int targetPos = *iter;
int targetStart = targetTokens.at(2*targetPos);
int targetEnd = targetTokens.at(2*targetPos+1);
if (prevPos < targetPos - 1) { // beginning of detached fragment
// check if there is a fragment to end
@ -186,16 +198,12 @@ ExampleOccurence UnitDAO::_getExampleOccurence(DBconnection & connection, const
prevPos = targetPos;
}
// check if there are remaining fragments
if (currStart >= 0) {
occurence.addMatchedTargetFragment(std::pair<int,int>(currStart,currEnd));
}
connection.clearResult(targetResult);
BOOST_FOREACH (QueryParam * param, targetParams) {
delete param;
}
return occurence;
}
@ -247,13 +255,14 @@ int UnitDAO::_addAlignedUnit (
throw ConcordiaException(ss.str());
}
std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens, alignments) values($1::text,$2::text,$3::integer,$4,$5,$6) RETURNING id";
std::vector<QueryParam*> params;
params.push_back(new StringParam(sourceSentence.getOriginalSentence()));
params.push_back(new StringParam(targetSentence.getOriginalSentence()));
params.push_back(new IntParam(tmId));
params.push_back(new IntArrayParam(_getTokenPositions(sourceSentence)));
params.push_back(new IntArrayParam(_getTokenPositions(targetSentence)));
params.push_back(new Int2DArrayParam(alignments));
PGresult * result = connection.execute(query, params);
int newId = connection.getIntValue(result, 0, 0);
@ -262,23 +271,31 @@ int UnitDAO::_addAlignedUnit (
delete param;
}
// add alignments
bool nonEmpty = false;
std::stringstream alignmentsQuery;
alignmentsQuery << "INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values ";
for(int i=0;i<alignments.size();i++) {
for (int j=0;j<alignments[i].size();j++) {
nonEmpty = true;
alignmentsQuery << "(" << newId << "," << i << "," << alignments[i][j] << "),";
}
}
if (nonEmpty) {
query = alignmentsQuery.str();
query = query.substr(0, query.length()-1);
PGresult * result = connection.execute(query);
connection.clearResult(result);
}
return newId;
}
/*! Parses the textual form of a one-dimensional integer array.
    The first and the last character of the input are treated as enclosing
    delimiters and dropped; the remaining text is split on commas and every
    piece is converted with atoi.
    \param arrayString delimited, comma-separated list of integers
    \return the parsed numbers; empty when the input is at most two
            characters long
*/
std::vector<int> UnitDAO::_getArray(std::string arrayString) {
    std::vector<int> numbers;
    // Two characters or fewer leave nothing between the delimiters.
    if (arrayString.length() <= 2) {
        return numbers;
    }
    const std::string inner = arrayString.substr(1, arrayString.length() - 2);
    std::vector<std::string> pieces;
    boost::split(pieces, inner, boost::is_any_of(","));
    for (std::vector<std::string>::const_iterator piece = pieces.begin();
         piece != pieces.end(); ++piece) {
        numbers.push_back(atoi(piece->c_str()));
    }
    return numbers;
}
std::vector<std::vector<int> > UnitDAO::_get2DArray(std::string arrayString) {
std::vector<std::vector<int> > result;
std::vector<std::string> arrayStrings;
std::string strippedArrayString = arrayString.substr(1,arrayString.length()-2);
boost::split(arrayStrings,strippedArrayString,boost::is_any_of(";"));
BOOST_FOREACH (std::string & arrayString, arrayStrings) {
result.push_back(_getArray(arrayString));
}
return result;
}

View File

@ -77,6 +77,10 @@ private:
const TokenizedSentence & targetSentence,
const std::vector<std::vector<int> > & alignments,
const int tmId) throw(ConcordiaException);
std::vector<int> _getArray(std::string arrayString);
std::vector<std::vector<int> > _get2DArray(std::string arrayString);
};
#endif

View File

@ -36,17 +36,8 @@ CREATE TABLE unit (
source_segment text,
target_segment text,
source_tokens integer[],
target_tokens integer[]
target_tokens integer[],
alignments text
);
DROP TABLE IF EXISTS alignment;
CREATE TABLE alignment (
id SERIAL PRIMARY KEY,
unit_id integer,
source_token_pos integer,
target_token_pos integer
);
CREATE INDEX ON alignment(unit_id, source_token_pos);
CREATE INDEX ON unit(tm_id);