working concordia searching

This commit is contained in:
rjawor 2015-08-24 19:28:41 +02:00
parent ac7bc4cdbe
commit 14dc4abd56
11 changed files with 191 additions and 43 deletions

View File

@ -12,6 +12,7 @@ add_executable(concordia_server_process
logger.cpp logger.cpp
int_array_param.cpp int_array_param.cpp
simple_search_result.cpp simple_search_result.cpp
complete_concordia_search_result.cpp
) )
target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case) target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case)

View File

@ -0,0 +1,10 @@
#include "complete_concordia_search_result.hpp"
/*! Builds a result that carries the given best overlay score.
    The best overlay vector is left default-constructed (empty).
  \param bestOverlayScore value stored as the best overlay score
*/
CompleteConcordiaSearchResult::CompleteConcordiaSearchResult(const double bestOverlayScore)
    : _bestOverlayScore(bestOverlayScore) {}
/*! Destructor. Performs no explicit work of its own.
*/
CompleteConcordiaSearchResult::~CompleteConcordiaSearchResult() {}

View File

@ -0,0 +1,32 @@
#ifndef COMPLETE_CONCORDIA_SEARCH_RESULT_HDR
#define COMPLETE_CONCORDIA_SEARCH_RESULT_HDR
#include <vector>
#include "simple_search_result.hpp"
/*! Holds a best overlay score together with the list of search results
    that make up the best overlay.
*/
class CompleteConcordiaSearchResult {
public:
    /*! Constructor.
      \param bestOverlayScore value stored as the best overlay score
    */
    CompleteConcordiaSearchResult(const double bestOverlayScore);

    /*! Destructor.
    */
    virtual ~CompleteConcordiaSearchResult();

    /*! Getter for the best overlay score.
        Declared const so it can be called through a const reference.
      \returns the score passed to the constructor
    */
    double getBestOverlayScore() const {
        return _bestOverlayScore;
    }

    /*! Mutable access to the best overlay, used to fill it in place.
      \returns reference to the stored vector of results
    */
    std::vector<SimpleSearchResult> & getBestOverlay() {
        return _bestOverlay;
    }

    /*! Read-only access to the best overlay.
      \returns const reference to the stored vector of results
    */
    const std::vector<SimpleSearchResult> & getBestOverlay() const {
        return _bestOverlay;
    }

private:
    double _bestOverlayScore;

    std::vector<SimpleSearchResult> _bestOverlay;
};
#endif

View File

@ -8,7 +8,8 @@ JsonGenerator::~JsonGenerator() {
} }
void JsonGenerator::signalError(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter, std::string message) { void JsonGenerator::signalError(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const std::string & message) {
jsonWriter.StartObject(); jsonWriter.StartObject();
jsonWriter.String("status"); jsonWriter.String("status");
jsonWriter.String("error"); jsonWriter.String("error");
@ -16,6 +17,22 @@ void JsonGenerator::signalError(rapidjson::Writer<rapidjson::StringBuffer> & jso
jsonWriter.String(message.c_str()); jsonWriter.String(message.c_str());
jsonWriter.EndObject(); jsonWriter.EndObject();
} }
/*! Writes one search result to the JSON writer as a single object.
    The object carries the keys "id", "matchedPatternStart",
    "matchedPatternEnd", "matchedExampleStart", "matchedExampleEnd",
    "sourceSegment" and "targetSegment", in that order.
  \param jsonWriter writer that receives the object
  \param result search result to serialize
*/
void JsonGenerator::writeSearchResult(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                                      const SimpleSearchResult & result) {
    jsonWriter.StartObject();

    jsonWriter.String("id");
    jsonWriter.Int(result.getId());

    // Position of the match inside the searched pattern. These values are
    // stored on every result, so they are emitted alongside the example
    // positions instead of being silently dropped.
    jsonWriter.String("matchedPatternStart");
    jsonWriter.Int(result.getMatchedPatternStart());
    jsonWriter.String("matchedPatternEnd");
    jsonWriter.Int(result.getMatchedPatternEnd());

    // Position of the match inside the stored example.
    jsonWriter.String("matchedExampleStart");
    jsonWriter.Int(result.getMatchedExampleStart());
    jsonWriter.String("matchedExampleEnd");
    jsonWriter.Int(result.getMatchedExampleEnd());

    jsonWriter.String("sourceSegment");
    jsonWriter.String(result.getSourceSegment().c_str());
    jsonWriter.String("targetSegment");
    jsonWriter.String(result.getTargetSegment().c_str());

    jsonWriter.EndObject();
}

View File

@ -5,6 +5,8 @@
#include "rapidjson/writer.h" #include "rapidjson/writer.h"
#include "simple_search_result.hpp"
class JsonGenerator { class JsonGenerator {
public: public:
/*! Constructor. /*! Constructor.
@ -14,7 +16,11 @@ public:
*/ */
virtual ~JsonGenerator(); virtual ~JsonGenerator();
static void signalError(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter, std::string message); static void signalError(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const std::string & message);
static void writeSearchResult(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const SimpleSearchResult & result);
private: private:

View File

@ -3,6 +3,8 @@
#include <boost/foreach.hpp> #include <boost/foreach.hpp>
#include <vector> #include <vector>
#include "json_generator.hpp"
SearcherController::SearcherController(boost::shared_ptr<Concordia> concordia) SearcherController::SearcherController(boost::shared_ptr<Concordia> concordia)
throw(ConcordiaException): throw(ConcordiaException):
_concordia(concordia) { _concordia(concordia) {
@ -21,29 +23,32 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer>
jsonWriter.String("results"); jsonWriter.String("results");
jsonWriter.StartArray(); jsonWriter.StartArray();
BOOST_FOREACH(SimpleSearchResult & result, results) { BOOST_FOREACH(SimpleSearchResult & result, results) {
jsonWriter.StartObject(); JsonGenerator::writeSearchResult(jsonWriter, result);
jsonWriter.String("id");
jsonWriter.Int(result.getId());
jsonWriter.String("matchedFragmentStart");
jsonWriter.Int(result.getMatchedFragmentStart());
jsonWriter.String("matchedFragmentEnd");
jsonWriter.Int(result.getMatchedFragmentEnd());
jsonWriter.String("sourceSegment");
jsonWriter.String(result.getSourceSegment().c_str());
jsonWriter.String("targetSegment");
jsonWriter.String(result.getTargetSegment().c_str());
jsonWriter.EndObject();
} }
jsonWriter.EndArray(); jsonWriter.EndArray();
jsonWriter.EndObject(); jsonWriter.EndObject();
} }
void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter, std::string & pattern) { void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter, std::string & pattern) {
CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult(_concordia->concordiaSearch(pattern));
jsonWriter.StartObject(); jsonWriter.StartObject();
jsonWriter.String("status"); jsonWriter.String("status");
jsonWriter.String("error"); jsonWriter.String("success");
jsonWriter.String("data"); jsonWriter.String("result");
jsonWriter.String("concordia searching not yet implemented"); jsonWriter.StartObject();
jsonWriter.String("bestOverlayScore");
jsonWriter.Double(result.getBestOverlayScore());
jsonWriter.String("bestOverlay");
jsonWriter.StartArray();
BOOST_FOREACH(SimpleSearchResult & simpleResult, result.getBestOverlay()) {
JsonGenerator::writeSearchResult(jsonWriter, simpleResult);
}
jsonWriter.EndArray();
jsonWriter.EndObject();
jsonWriter.EndObject(); jsonWriter.EndObject();
} }

View File

@ -2,13 +2,17 @@
SimpleSearchResult::SimpleSearchResult( SimpleSearchResult::SimpleSearchResult(
const int id, const int id,
const int matchedFragmentStart, const int matchedPatternStart,
const int matchedFragmentEnd, const int matchedPatternEnd,
const int matchedExampleStart,
const int matchedExampleEnd,
const std::string & sourceSegment, const std::string & sourceSegment,
const std::string & targetSegment): const std::string & targetSegment):
_id(id), _id(id),
_matchedFragmentStart(matchedFragmentStart), _matchedPatternStart(matchedPatternStart),
_matchedFragmentEnd(matchedFragmentEnd), _matchedPatternEnd(matchedPatternEnd),
_matchedExampleStart(matchedExampleStart),
_matchedExampleEnd(matchedExampleEnd),
_sourceSegment(sourceSegment), _sourceSegment(sourceSegment),
_targetSegment(targetSegment) { _targetSegment(targetSegment) {
} }

View File

@ -8,8 +8,10 @@ public:
/*! Constructor. /*! Constructor.
*/ */
SimpleSearchResult(const int id, SimpleSearchResult(const int id,
const int matchedFragmentStart, const int matchedPatternStart,
const int matchedFragmentEnd, const int matchedPatternEnd,
const int matchedExampleStart,
const int matchedExampleEnd,
const std::string & sourceSegment, const std::string & sourceSegment,
const std::string & targetSegment const std::string & targetSegment
); );
@ -17,32 +19,44 @@ public:
*/ */
virtual ~SimpleSearchResult(); virtual ~SimpleSearchResult();
const int getId() { int getId() const {
return _id; return _id;
} }
const int getMatchedFragmentStart() { int getMatchedPatternStart() const {
return _matchedFragmentStart; return _matchedPatternStart;
} }
const int getMatchedFragmentEnd() { int getMatchedPatternEnd() const {
return _matchedFragmentEnd; return _matchedPatternEnd;
} }
const std::string & getSourceSegment() { int getMatchedExampleStart() const {
return _matchedExampleStart;
}
int getMatchedExampleEnd() const {
return _matchedExampleEnd;
}
const std::string & getSourceSegment() const {
return _sourceSegment; return _sourceSegment;
} }
const std::string & getTargetSegment() { const std::string & getTargetSegment() const {
return _targetSegment; return _targetSegment;
} }
private: private:
int _id; int _id;
int _matchedFragmentStart; int _matchedPatternStart;
int _matchedFragmentEnd; int _matchedPatternEnd;
int _matchedExampleStart;
int _matchedExampleEnd;
std::string _sourceSegment; std::string _sourceSegment;

View File

@ -45,31 +45,57 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addSentences(
} }
std::vector<SimpleSearchResult> UnitDAO::getSearchResults(const std::vector<MatchedPatternFragment> & concordiaResults) { std::vector<SimpleSearchResult> UnitDAO::getSearchResults(const std::vector<MatchedPatternFragment> & fragments) {
std::vector<SimpleSearchResult> results; std::vector<SimpleSearchResult> results;
TokenizedSentence ts("");
_getResultsFromFragments(results, fragments, ts);
return results;
}
/*! Converts a raw concordia search result into a complete one.
    The best overlay score is copied over, and the best overlay fragments
    are resolved into search results via _getResultsFromFragments, using
    the tokenized pattern of the raw result.
  \param rawConcordiaResult result returned by the concordia search
  \returns complete result with its best overlay filled in
*/
CompleteConcordiaSearchResult UnitDAO::getConcordiaResult(boost::shared_ptr<ConcordiaSearchResult> rawConcordiaResult) {
    CompleteConcordiaSearchResult completeResult(rawConcordiaResult->getBestOverlayScore());

    // The overlay vector lives inside completeResult and is filled in place.
    std::vector<SimpleSearchResult> & overlayResults = completeResult.getBestOverlay();
    _getResultsFromFragments(overlayResults,
                             rawConcordiaResult->getBestOverlay(),
                             rawConcordiaResult->getTokenizedPattern());

    return completeResult;
}
void UnitDAO::_getResultsFromFragments(
std::vector<SimpleSearchResult> & results,
const std::vector<MatchedPatternFragment> & fragments,
const TokenizedSentence & tokenizedPattern) {
DBconnection connection; DBconnection connection;
connection.startTransaction(); connection.startTransaction();
BOOST_FOREACH(const MatchedPatternFragment & fragment, concordiaResults) { BOOST_FOREACH(const MatchedPatternFragment & fragment, fragments) {
int matchedPatternStart = 0;
int matchedPatternEnd = 0;
if (tokenizedPattern.getTokens().size() > 0) {
// if it is concordia searching
matchedPatternStart = tokenizedPattern.getTokens().at(fragment.getStart()).getStart();
matchedPatternEnd = tokenizedPattern.getTokens().at(fragment.getStart()+fragment.getMatchedLength() - 1).getEnd();
}
std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer] FROM unit WHERE id = $3::integer;"; std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer] FROM unit WHERE id = $3::integer;";
std::vector<QueryParam*> params; std::vector<QueryParam*> params;
params.push_back(new IntParam(2*fragment.getExampleOffset()+1)); params.push_back(new IntParam(2*fragment.getExampleOffset()+1));
params.push_back(new IntParam(2*(fragment.getExampleOffset()+fragment.getMatchedLength()))); params.push_back(new IntParam(2*(fragment.getExampleOffset()+fragment.getMatchedLength())));
params.push_back(new IntParam(fragment.getExampleId())); params.push_back(new IntParam(fragment.getExampleId()));
PGresult * result = connection.execute(query, params); PGresult * result = connection.execute(query, params);
results.push_back(SimpleSearchResult(connection.getIntValue(result,0,0), results.push_back(SimpleSearchResult(connection.getIntValue(result,0,0), // example id
connection.getIntValue(result,0,3), matchedPatternStart,
connection.getIntValue(result,0,4), matchedPatternEnd,
connection.getStringValue(result,0,1), connection.getIntValue(result,0,3), // matched example start
connection.getStringValue(result,0,2))); connection.getIntValue(result,0,4), // matched example end
connection.getStringValue(result,0,1), // source segment
connection.getStringValue(result,0,2))); // target segment
connection.clearResult(result); connection.clearResult(result);
BOOST_FOREACH (QueryParam * param, params) { BOOST_FOREACH (QueryParam * param, params) {
delete param; delete param;
} }
} }
connection.endTransaction(); connection.endTransaction();
return results;
} }

View File

@ -8,9 +8,11 @@
#include <concordia/tokenized_sentence.hpp> #include <concordia/tokenized_sentence.hpp>
#include <concordia/substring_occurence.hpp> #include <concordia/substring_occurence.hpp>
#include <concordia/matched_pattern_fragment.hpp> #include <concordia/matched_pattern_fragment.hpp>
#include <concordia/concordia_search_result.hpp>
#include <boost/shared_ptr.hpp> #include <boost/shared_ptr.hpp>
#include "simple_search_result.hpp" #include "simple_search_result.hpp"
#include "complete_concordia_search_result.hpp"
#include "db_connection.hpp" #include "db_connection.hpp"
class UnitDAO { class UnitDAO {
@ -32,9 +34,15 @@ public:
const std::vector<std::string> & targetSentences, const std::vector<std::string> & targetSentences,
const std::vector<int> & tmIds); const std::vector<int> & tmIds);
std::vector<SimpleSearchResult> getSearchResults(const std::vector<MatchedPatternFragment> & concordiaResults); std::vector<SimpleSearchResult> getSearchResults(const std::vector<MatchedPatternFragment> & fragments);
CompleteConcordiaSearchResult getConcordiaResult(boost::shared_ptr<ConcordiaSearchResult> rawConcordiaResult);
private: private:
void _getResultsFromFragments(std::vector<SimpleSearchResult> & results,
const std::vector<MatchedPatternFragment> & fragments,
const TokenizedSentence & tokenizedPattern);
std::vector<int> _getTokenPositions(const TokenizedSentence & ts); std::vector<int> _getTokenPositions(const TokenizedSentence & ts);
int _addSingleSentence( int _addSingleSentence(

25
tests/concordiaSearch.py Executable file
View File

@ -0,0 +1,25 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import unittest
import json
import urllib2
import sys
import time
# Sends one concordiaSearch request for the pattern given as the first
# command-line argument, then prints the round-trip time and the decoded
# JSON response.

if len(sys.argv) < 2:
    # Without this guard a missing argument surfaces as a bare IndexError.
    sys.exit("Usage: %s <pattern>" % sys.argv[0])

data = {
    'operation': 'concordiaSearch',
    'pattern': sys.argv[1]
}

# The timer covers sending the request, reading the body and decoding the JSON.
start = time.time()
req = urllib2.Request('http://localhost')
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
end = time.time()

print("Execution time: %.4f seconds." % (end-start))
print("Result: ")
print(response)