full search - work in progress

This commit is contained in:
Rafał Jaworski 2019-01-10 16:04:15 +01:00
parent 7622369f5c
commit fb5e7bcc8a
5 changed files with 100 additions and 61 deletions

View File

@ -0,0 +1 @@
../versions_available/stocznia_plen.cfg

View File

@ -50,7 +50,7 @@ void SearcherController::fullSearch(rapidjson::Writer<rapidjson::StringBuffer> &
if (it != _concordiasMap->end()) {
TokenizedSentence tokenizedPattern = it->second->tokenize(pattern, false, false);
pattern = _lemmatizerFacade->lemmatizeIfNeeded(tokenizedPattern.getTokenizedSentence(), tmId);
FullSearchResult result = _unitDAO.getFullSearchResult(it->second->fullSearch(pattern, limit, offset, true));
FullSearchResult result = _unitDAO.getFullSearchResult(it->second->fullSearch(pattern, limit, offset, true), tokenizedPattern.getTokens().size());
jsonWriter.StartObject();
jsonWriter.String("status");
jsonWriter.String("success");

View File

@ -72,7 +72,7 @@ SimpleSearchResult UnitDAO::getSimpleSearchResult(const MatchedPatternFragment &
return _getResultFromFragment(fragment, ts, true);
}
FullSearchResult UnitDAO::getFullSearchResult(const OccurencesList & occurencesList) {
// Converts the raw full-search occurrences into a FullSearchResult.
// NOTE(review): work-in-progress stub -- neither occurencesList nor
// patternLength is read yet; the function unconditionally returns
// FullSearchResult(5). The meaning of the constructor argument 5 is not
// visible from this file -- TODO confirm and replace with the real result.
FullSearchResult UnitDAO::getFullSearchResult(const OccurencesList & occurencesList, const int patternLength) {
return FullSearchResult(5);
}
@ -117,66 +117,11 @@ SimpleSearchResult UnitDAO::_getResultFromFragment(
SimpleSearchResult ssResult(matchedPatternStart, matchedPatternEnd);
if (getOccurences) {
BOOST_FOREACH(SubstringOccurence sOccurence, fragment.getOccurences()) {
std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer] FROM unit WHERE id = $3::integer;";
std::vector<QueryParam*> params;
params.push_back(new IntParam(2*sOccurence.getOffset()+1));
params.push_back(new IntParam(2*(sOccurence.getOffset()+fragment.getMatchedLength())));
params.push_back(new IntParam(sOccurence.getId()));
PGresult * result = connection.execute(query, params);
Logger::log("got examples");
ExampleOccurence occurence(connection.getIntValue(result,0,0), // example id
connection.getIntValue(result,0,3), // matched example start
connection.getIntValue(result,0,4), // matched example end
connection.getStringValue(result,0,1), // source segment
connection.getStringValue(result,0,2)); // target segment
connection.clearResult(result);
BOOST_FOREACH (QueryParam * param, params) {
delete param;
}
// now add all target fragments matched with this fragment
std::string targetQuery = "SELECT target_token_pos, target_tokens[2*target_token_pos+1], target_tokens[2*target_token_pos+2] FROM unit INNER JOIN alignment ON alignment.unit_id = unit.id AND unit.id = $1::integer AND source_token_pos between $2::integer and $3::integer ORDER BY target_token_pos";
std::vector<QueryParam*> targetParams;
targetParams.push_back(new IntParam(sOccurence.getId()));
targetParams.push_back(new IntParam(sOccurence.getOffset()));
targetParams.push_back(new IntParam(sOccurence.getOffset() + fragment.getMatchedLength() - 1));
PGresult * targetResult = connection.execute(targetQuery, targetParams);
Logger::log("got target fragments");
int prevPos = -2;
int currStart = -1;
int currEnd = -1;
for (int i=0;i<connection.getRowCount(targetResult);i++) {
int targetPos = connection.getIntValue(targetResult, i, 0);
int targetStart = connection.getIntValue(targetResult, i, 1);
int targetEnd = connection.getIntValue(targetResult, i, 2);
if (prevPos < targetPos - 1) { // beginning of detached fragment
// check if there is a fragment to end
if (currStart >= 0) {
occurence.addMatchedTargetFragment(std::pair<int,int>(currStart,currEnd));
}
currStart = targetStart;
}
currEnd = targetEnd;
prevPos = targetPos;
}
// check if there are remaining fragments
if (currStart >= 0) {
occurence.addMatchedTargetFragment(std::pair<int,int>(currStart,currEnd));
}
connection.clearResult(targetResult);
BOOST_FOREACH (QueryParam * param, targetParams) {
delete param;
}
ssResult.addOccurence(occurence);
ssResult.addOccurence(_getExampleOccurence(connection, sOccurence, fragment.getMatchedLength()));
}
}
@ -185,6 +130,66 @@ SimpleSearchResult UnitDAO::_getResultFromFragment(
return ssResult;
}
// Builds the ExampleOccurence describing one substring occurrence.
//
// Two queries are issued on the given connection:
//   1. the unit row (id, source/target segment and the two source_tokens
//      entries delimiting the matched span),
//   2. the alignment rows for the matched source token positions, ordered by
//      target_token_pos.
// Runs of consecutive target token positions are merged into single
// (start, end) fragments and attached to the returned occurrence.
//
// connection    - open DB connection used for both queries
// sOccurence    - supplies the unit id and the offset of the first matched token
// matchedLength - number of matched source tokens
//
// NOTE(review): row 0 of the first result is read without checking the row
// count, and the QueryParam objects are released only on the non-throwing
// path -- confirm whether the DBconnection calls can fail or throw here.
ExampleOccurence UnitDAO::_getExampleOccurence(DBconnection & connection, const SubstringOccurence sOccurence, const int matchedLength) {
    // --- Step 1: fetch the unit together with the matched span boundaries ---
    // The indices 2*offset+1 and 2*(offset+matchedLength) presumably address
    // the start of the first and the end of the last matched token in a
    // flattened (start, end) array -- TODO confirm against the schema.
    std::string unitQuery = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer] FROM unit WHERE id = $3::integer;";
    std::vector<QueryParam*> unitParams;
    unitParams.push_back(new IntParam(2*sOccurence.getOffset()+1));
    unitParams.push_back(new IntParam(2*(sOccurence.getOffset()+matchedLength)));
    unitParams.push_back(new IntParam(sOccurence.getId()));
    PGresult * unitRow = connection.execute(unitQuery, unitParams);

    ExampleOccurence occurence(
        connection.getIntValue(unitRow, 0, 0),      // example id
        connection.getIntValue(unitRow, 0, 3),      // matched example start
        connection.getIntValue(unitRow, 0, 4),      // matched example end
        connection.getStringValue(unitRow, 0, 1),   // source segment
        connection.getStringValue(unitRow, 0, 2));  // target segment
    connection.clearResult(unitRow);
    for (std::vector<QueryParam*>::iterator paramIt = unitParams.begin();
         paramIt != unitParams.end(); ++paramIt) {
        delete *paramIt;
    }

    // --- Step 2: collect the target fragments aligned with the match ---
    std::string alignmentQuery = "SELECT target_token_pos, target_tokens[2*target_token_pos+1], target_tokens[2*target_token_pos+2] FROM unit INNER JOIN alignment ON alignment.unit_id = unit.id AND unit.id = $1::integer AND source_token_pos between $2::integer and $3::integer ORDER BY target_token_pos";
    std::vector<QueryParam*> alignmentParams;
    alignmentParams.push_back(new IntParam(sOccurence.getId()));
    alignmentParams.push_back(new IntParam(sOccurence.getOffset()));
    alignmentParams.push_back(new IntParam(sOccurence.getOffset() + matchedLength - 1));
    PGresult * alignedRows = connection.execute(alignmentQuery, alignmentParams);

    // -2 guarantees that the very first row (position >= 0) opens a fragment.
    int lastTargetPos = -2;
    // A negative start means "no fragment is currently open".
    int fragmentStart = -1;
    int fragmentEnd = -1;
    for (int row = 0; row < connection.getRowCount(alignedRows); row++) {
        int rowTargetPos = connection.getIntValue(alignedRows, row, 0);
        int rowTokenStart = connection.getIntValue(alignedRows, row, 1);
        int rowTokenEnd = connection.getIntValue(alignedRows, row, 2);

        // A gap in the target positions starts a new, detached fragment.
        if (lastTargetPos < rowTargetPos - 1) {
            // Flush the fragment collected so far, if any.
            if (fragmentStart >= 0) {
                occurence.addMatchedTargetFragment(std::pair<int,int>(fragmentStart, fragmentEnd));
            }
            fragmentStart = rowTokenStart;
        }
        fragmentEnd = rowTokenEnd;
        lastTargetPos = rowTargetPos;
    }

    // Flush the trailing fragment that the loop left open.
    if (fragmentStart >= 0) {
        occurence.addMatchedTargetFragment(std::pair<int,int>(fragmentStart, fragmentEnd));
    }

    connection.clearResult(alignedRows);
    for (std::vector<QueryParam*>::iterator paramIt = alignmentParams.begin();
         paramIt != alignmentParams.end(); ++paramIt) {
        delete *paramIt;
    }

    return occurence;
}
std::vector<int> UnitDAO::_getTokenPositions(const TokenizedSentence & ts) {
std::vector<int> result;

View File

@ -45,7 +45,7 @@ public:
SimpleSearchResult getSimpleSearchResult(const MatchedPatternFragment & fragment);
FullSearchResult getFullSearchResult(const OccurencesList & occurencesList);
FullSearchResult getFullSearchResult(const OccurencesList & occurencesList, const int patternLength);
CompleteConcordiaSearchResult getConcordiaResult(boost::shared_ptr<ConcordiaSearchResult> rawConcordiaResult);
@ -63,6 +63,8 @@ private:
std::vector<int> _getTokenPositions(const TokenizedSentence & ts);
ExampleOccurence _getExampleOccurence(DBconnection & connection, const SubstringOccurence sOccurence, const int matchedLength);
int _addSingleSentence(
DBconnection & connection,
const TokenizedSentence & sourceSentence,

31
tests/fullSearch.py Executable file
View File

@ -0,0 +1,31 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import unittest
import json
import urllib2
import sys
import time
import host
# Command-line arguments, in order: <pattern> <tmId> <limit> <offset>.
pattern = sys.argv[1]
tm_id = int(sys.argv[2])
limit = int(sys.argv[3])
offset = int(sys.argv[4])

# JSON payload of the fullSearch request.
payload = {
    'operation': 'fullSearch',
    'pattern': pattern,
    'tmId': tm_id,
    'limit': limit,
    'offset': offset
}

# Server URL; the port is appended only when host.concordia_port is non-empty.
url = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    url = url + ':' + host.concordia_port

# Time the whole round trip: building the request, sending it, reading the reply.
started = time.time()
request = urllib2.Request(url)
request.add_header('Content-Type', 'application/json')
reply = urllib2.urlopen(request, json.dumps(payload)).read()
finished = time.time()

print("Execution time: %.4f seconds." % (finished - started))
print("Result: ")
print(reply)