working simple search

This commit is contained in:
rjawor 2015-08-07 13:21:53 +02:00
parent de5d1f4a63
commit e8f1f21195
13 changed files with 98 additions and 233 deletions

View File

@ -124,4 +124,10 @@ int DBconnection::getIntValue(PGresult * result, int row, int col) {
return strtol(valueStr, NULL, 10); return strtol(valueStr, NULL, 10);
} }
/*! Reads one field of a query result as a string.
 \param result libpq result set to read from
 \param row row number passed straight to PQgetvalue
 \param col column number passed straight to PQgetvalue
 \returns the field's text, or an empty string when PQgetvalue yields
          no value at all
*/
std::string DBconnection::getStringValue(PGresult * result, int row, int col) {
    const char * valueStr = PQgetvalue(result, row, col);
    // Building a std::string from a null pointer is undefined behaviour.
    // NOTE(review): libpq appears to return a null pointer when the row or
    // column does not exist in the result (e.g. an empty result set) -
    // confirm against the libpq documentation.
    if (valueStr == NULL) {
        return std::string();
    }
    return std::string(valueStr);
}

View File

@ -31,6 +31,8 @@ public:
int getIntValue(PGresult * result, int row, int col); int getIntValue(PGresult * result, int row, int col);
std::string getStringValue(PGresult * result, int row, int col);
private: private:
void close(); void close();

View File

@ -1,5 +1,6 @@
#include "searcher_controller.hpp" #include "searcher_controller.hpp"
#include <boost/foreach.hpp>
#include <vector> #include <vector>
SearcherController::SearcherController(boost::shared_ptr<Concordia> concordia) SearcherController::SearcherController(boost::shared_ptr<Concordia> concordia)
@ -12,14 +13,25 @@ SearcherController::~SearcherController() {
/*! Performs a simple search for the pattern and writes the outcome to
 jsonWriter as one JSON object with the members "status" (always
 "success" here) and "results" (an array).
 \param jsonWriter writer that receives the JSON response
 \param pattern the phrase to look up
*/
void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter, std::string & pattern) { void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter, std::string & pattern) {
// The raw matches reported by Concordia are handed to UnitDAO, which
// turns them into SimpleSearchResult objects.
std::vector<SubstringOccurence> results = _concordia->simpleSearch(pattern); std::vector<SimpleSearchResult> results = _unitDAO.getSearchResults(_concordia->simpleSearch(pattern));
jsonWriter.StartObject(); jsonWriter.StartObject();
jsonWriter.String("status"); jsonWriter.String("status");
jsonWriter.String("success"); jsonWriter.String("success");
jsonWriter.String("results"); jsonWriter.String("results");
jsonWriter.StartArray(); jsonWriter.StartArray();
// One JSON object per search result with the members "id",
// "matchedFragment", "sourceSegment" and "targetSegment".
BOOST_FOREACH(SimpleSearchResult & result, results) {
jsonWriter.StartObject();
jsonWriter.String("id");
jsonWriter.Int(result.getId());
jsonWriter.String("matchedFragment");
jsonWriter.String(result.getMatchedFragment().c_str());
jsonWriter.String("sourceSegment");
jsonWriter.String(result.getSourceSegment().c_str());
jsonWriter.String("targetSegment");
jsonWriter.String(result.getTargetSegment().c_str());
jsonWriter.EndObject();
}
jsonWriter.EndArray(); jsonWriter.EndArray();
jsonWriter.EndObject(); jsonWriter.EndObject();
} }

View File

@ -6,6 +6,8 @@
#include <concordia/concordia.hpp> #include <concordia/concordia.hpp>
#include <concordia/concordia_exception.hpp> #include <concordia/concordia_exception.hpp>
#include "unit_dao.hpp"
#include "simple_search_result.hpp"
#include "rapidjson/writer.h" #include "rapidjson/writer.h"
@ -24,8 +26,10 @@ public:
void concordiaSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter, std::string & pattern); void concordiaSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter, std::string & pattern);
private: private:
boost::shared_ptr<Concordia> _concordia;
boost::shared_ptr<Concordia> _concordia;
UnitDAO _unitDAO;
}; };
#endif #endif

View File

@ -1,6 +1,14 @@
#include "simple_search_result.hpp" #include "simple_search_result.hpp"
/*! Constructor. Copies each argument into the member of the same name
 (_id, _matchedFragment, _sourceSegment, _targetSegment); the body
 itself is empty.
 \param id value stored in _id
 \param matchedFragment text stored in _matchedFragment
 \param sourceSegment text stored in _sourceSegment
 \param targetSegment text stored in _targetSegment
*/
SimpleSearchResult::SimpleSearchResult() { SimpleSearchResult::SimpleSearchResult(
const int id,
const std::string & matchedFragment,
const std::string & sourceSegment,
const std::string & targetSegment):
_id(id),
_matchedFragment(matchedFragment),
_sourceSegment(sourceSegment),
_targetSegment(targetSegment) {
} }
SimpleSearchResult::~SimpleSearchResult() { SimpleSearchResult::~SimpleSearchResult() {

View File

@ -7,13 +7,33 @@ class SimpleSearchResult {
public: public:
/*! Constructor. /*! Constructor.
*/ */
SimpleSearchResult(); SimpleSearchResult(const int id,
const std::string & matchedFragment,
const std::string & sourceSegment,
const std::string & targetSegment
);
/*! Destructor. /*! Destructor.
*/ */
virtual ~SimpleSearchResult(); virtual ~SimpleSearchResult();
/*! Returns a reference to the stored id.
 NOTE(review): the reference is non-const, so callers can overwrite the
 private _id through it, and the method cannot be called on a const
 SimpleSearchResult - consider adding a const accessor.
*/
int & getId() {
return _id;
}
/*! Returns a read-only reference to the stored matched fragment.
 NOTE(review): the method is not const-qualified, so it cannot be called
 on a const SimpleSearchResult.
*/
const std::string & getMatchedFragment() {
return _matchedFragment;
}
/*! Returns a read-only reference to the stored source segment.
 NOTE(review): the method is not const-qualified, so it cannot be called
 on a const SimpleSearchResult.
*/
const std::string & getSourceSegment() {
return _sourceSegment;
}
/*! Returns a read-only reference to the stored target segment.
 NOTE(review): the method is not const-qualified, so it cannot be called
 on a const SimpleSearchResult.
*/
const std::string & getTargetSegment() {
return _targetSegment;
}
private: private:
int id; int _id;
std::string _matchedFragment; std::string _matchedFragment;

View File

@ -44,6 +44,36 @@ int UnitDAO::addSentence(
} }
/*! Turns raw Concordia matches into SimpleSearchResult objects.
 For every fragment one query against the "unit" table is executed and
 its first row is converted into a SimpleSearchResult (id, matched
 fragment, source segment, target segment). All queries are issued
 between startTransaction() and endTransaction() on one connection.
 \param concordiaResults fragments to resolve
 \returns one SimpleSearchResult per fragment, in the order of the input
*/
std::vector<SimpleSearchResult> UnitDAO::getSearchResults(std::vector<MatchedPatternFragment> concordiaResults) {
    // The statement text is the same for every fragment, so build it once.
    // $1 = first token index, $2 = index one past the last matched token,
    // $3 = unit id; see the parameter list assembled below.
    std::string query = "SELECT id, source_segment, target_segment, substring(source_segment,source_tokens[$1::integer*2+1]+1,source_tokens[$2::integer*2]-source_tokens[$1::integer*2+1]) as matched_fragment FROM unit WHERE id = $3::integer;";

    std::vector<SimpleSearchResult> results;
    DBconnection connection;
    connection.startTransaction();

    BOOST_FOREACH(MatchedPatternFragment & fragment, concordiaResults) {
        std::vector<QueryParam*> params;
        params.push_back(new IntParam(fragment.getExampleOffset()));
        params.push_back(new IntParam(fragment.getExampleOffset()+fragment.getMatchedLength()));
        params.push_back(new IntParam(fragment.getExampleId()));

        std::stringstream ss;
        ss << "example offset: " << fragment.getExampleOffset()
           << ", matched length: " << fragment.getMatchedLength()
           << ", example id: " << fragment.getExampleId();
        Logger::log(ss.str());

        PGresult * result = connection.execute(query, params);
        // NOTE(review): row 0 is read without checking that the query
        // returned a row - confirm that every example id reported by
        // Concordia exists in the unit table.
        results.push_back(SimpleSearchResult(connection.getIntValue(result,0,0),
                                             connection.getStringValue(result,0,3),
                                             connection.getStringValue(result,0,1),
                                             connection.getStringValue(result,0,2)));
        connection.clearResult(result);

        // The parameters were allocated with new above and nothing else
        // releases them, so free them here to avoid leaking three objects
        // per fragment.
        // NOTE(review): assumes execute() does not take ownership of the
        // parameters and that QueryParam has a virtual destructor - confirm.
        BOOST_FOREACH(QueryParam * param, params) {
            delete param;
        }
    }

    connection.endTransaction();
    return results;
}
std::vector<int> UnitDAO::_getTokenPositions(boost::shared_ptr<TokenizedSentence> ts) { std::vector<int> UnitDAO::_getTokenPositions(boost::shared_ptr<TokenizedSentence> ts) {
std::vector<int> result; std::vector<int> result;
BOOST_FOREACH(const TokenAnnotation & token, ts->getTokens()) { BOOST_FOREACH(const TokenAnnotation & token, ts->getTokens()) {
@ -54,3 +84,4 @@ std::vector<int> UnitDAO::_getTokenPositions(boost::shared_ptr<TokenizedSentence
} }

View File

@ -5,8 +5,12 @@
#include <vector> #include <vector>
#include <concordia/tokenized_sentence.hpp> #include <concordia/tokenized_sentence.hpp>
#include <concordia/substring_occurence.hpp>
#include <concordia/matched_pattern_fragment.hpp>
#include <boost/shared_ptr.hpp> #include <boost/shared_ptr.hpp>
#include "simple_search_result.hpp"
class UnitDAO { class UnitDAO {
public: public:
/*! Constructor. /*! Constructor.
@ -20,6 +24,9 @@ public:
boost::shared_ptr<TokenizedSentence> sourceSentence, boost::shared_ptr<TokenizedSentence> sourceSentence,
std::string & targetSentence, std::string & targetSentence,
int tmId); int tmId);
std::vector<SimpleSearchResult> getSearchResults(std::vector<MatchedPatternFragment> concordiaResults);
private: private:
std::vector<int> _getTokenPositions(boost::shared_ptr<TokenizedSentence> ts); std::vector<int> _getTokenPositions(boost::shared_ptr<TokenizedSentence> ts);
}; };

View File

@ -1 +0,0 @@
select substring(source_segment,source_tokens[start_token*2+1]+1,source_tokens[end_token*2+2]-source_tokens[start_token*2+1]) from unit where id = 3;

View File

@ -1,3 +0,0 @@
http://chriswu.me/blog/writing-hello-world-in-fcgi-with-c-plus-plus/
use the echo.cpp source as an example for concordia-server-starter. It works with the up-to-date version of test.html (the one that specifies UTF-8 as character encoding in the <form>)

View File

@ -1,175 +0,0 @@
/*
* A simple FastCGI application example in C++.
*
* $Id: echo-cpp.cpp,v 1.10 2002/02/25 00:46:17 robs Exp $
*
* Copyright (c) 2001 Rob Saccoccio and Chelsea Networks
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdlib.h>
#ifdef _WIN32
#include <process.h>
#else
#include <unistd.h>
extern char ** environ;
#endif
#include "fcgio.h"
#include "fcgi_config.h" // HAVE_IOSTREAM_WITHASSIGN_STREAMBUF
using namespace std;
// Maximum number of bytes allowed to be read from stdin
static const unsigned long STDIN_MAX = 1000000;
// Writes every string of a null-terminated array to cout, one per line,
// enclosed in an HTML <PRE> element.
static void penv(const char * const * envp)
{
    cout << "<PRE>\n";
    const char * const * entry = envp;
    while (*entry != 0)
    {
        cout << *entry << "\n";
        ++entry;
    }
    cout << "</PRE>\n";
}
// Reads the request body from cin into a freshly allocated buffer.
//
// request - FastCGI request whose environment is searched for CONTENT_LENGTH
// content - out parameter: receives a buffer allocated with new[] when
//           CONTENT_LENGTH is present, or 0 when it is absent; the caller
//           is responsible for releasing it with delete[]
// Returns the number of bytes actually read (never more than STDIN_MAX),
// or 0 when CONTENT_LENGTH is absent.
static long gstdin(FCGX_Request * request, char ** content)
{
char * clenstr = FCGX_GetParam("CONTENT_LENGTH", request->envp);
unsigned long clen = STDIN_MAX;
if (clenstr)
{
// strtol advances clenstr to the first character it could not parse,
// so a non-empty remainder means the value was not a plain number.
clen = strtol(clenstr, &clenstr, 10);
if (*clenstr)
{
cerr << "can't parse \"CONTENT_LENGTH="
<< FCGX_GetParam("CONTENT_LENGTH", request->envp)
<< "\"\n";
// NOTE(review): an unparsable value falls back to STDIN_MAX and the
// body is still read below, which differs from what the comment in the
// else branch states.
clen = STDIN_MAX;
}
// *always* put a cap on the amount of data that will be read
if (clen > STDIN_MAX) clen = STDIN_MAX;
*content = new char[clen];
cin.read(*content, clen);
// Report what was really read, which may be less than requested.
clen = cin.gcount();
}
else
{
// *never* read stdin when CONTENT_LENGTH is missing or unparsable
*content = 0;
clen = 0;
}
// Chew up any remaining stdin - this shouldn't be necessary
// but is because mod_fastcgi doesn't handle it correctly.
// ignore() doesn't set the eof bit in some versions of glibc++
// so use gcount() instead of eof()...
do cin.ignore(1024); while (cin.gcount() == 1024);
return clen;
}
// FastCGI echo application: for every accepted request it rebinds
// cin/cout/cerr to the request's streams and answers with an HTML page
// showing the process id, a running request counter, the request
// environment, the process environment and the request body.
// The original stream buffers are restored once the accept loop ends.
int main (void)
{
int count = 0;
long pid = getpid();
// Keep the process-wide stream buffers so they can be restored on exit.
streambuf * cin_streambuf = cin.rdbuf();
streambuf * cout_streambuf = cout.rdbuf();
streambuf * cerr_streambuf = cerr.rdbuf();
FCGX_Request request;
FCGX_Init();
FCGX_InitRequest(&request, 0, 0);
while (FCGX_Accept_r(&request) == 0)
{
// Note that the default bufsize (0) will cause the use of iostream
// methods that require positioning (such as peek(), seek(),
// unget() and putback()) to fail (in favour of more efficient IO).
fcgi_streambuf cin_fcgi_streambuf(request.in);
fcgi_streambuf cout_fcgi_streambuf(request.out);
fcgi_streambuf cerr_fcgi_streambuf(request.err);
// Two ways of rebinding the standard streams, selected at build time
// by the HAVE_IOSTREAM_WITHASSIGN_STREAMBUF configuration macro.
#if HAVE_IOSTREAM_WITHASSIGN_STREAMBUF
cin = &cin_fcgi_streambuf;
cout = &cout_fcgi_streambuf;
cerr = &cerr_fcgi_streambuf;
#else
cin.rdbuf(&cin_fcgi_streambuf);
cout.rdbuf(&cout_fcgi_streambuf);
cerr.rdbuf(&cerr_fcgi_streambuf);
#endif
// Although FastCGI supports writing before reading,
// many http clients (browsers) don't support it (so
// the connection deadlocks until a timeout expires!).
char * content;
unsigned long clen = gstdin(&request, &content);
cout << "Content-type: text/html\r\n"
"\r\n"
"<TITLE>echo-cpp</TITLE>\n"
"<H1>echo-cpp</H1>\n"
"<H4>PID: " << pid << "</H4>\n"
"<H4>Request Number: " << ++count << "</H4>\n";
cout << "<H4>Request Environment</H4>\n";
penv(request.envp);
cout << "<H4>Process/Initial Environment</H4>\n";
penv(environ);
cout << "<H4>Standard Input - " << clen;
if (clen == STDIN_MAX) cout << " (STDIN_MAX)";
cout << " bytes</H4>\n";
if (clen) cout.write(content, clen);
// content was allocated with new[] by gstdin (or is 0).
if (content) delete []content;
// If the output streambufs had non-zero bufsizes and
// were constructed outside of the accept loop (i.e.
// their destructor won't be called here), they would
// have to be flushed here.
}
// Put the original stream buffers back before leaving.
#if HAVE_IOSTREAM_WITHASSIGN_STREAMBUF
cin = cin_streambuf;
cout = cout_streambuf;
cerr = cerr_streambuf;
#else
cin.rdbuf(cin_streambuf);
cout.rdbuf(cout_streambuf);
cerr.rdbuf(cerr_streambuf);
#endif
return 0;
}

View File

@ -1,46 +0,0 @@
#include <iostream>
#include "fcgio.h"
using namespace std;
int main(void) {
// Backup the stdio streambufs
streambuf * cin_streambuf = cin.rdbuf();
streambuf * cout_streambuf = cout.rdbuf();
streambuf * cerr_streambuf = cerr.rdbuf();
FCGX_Request request;
FCGX_Init();
FCGX_InitRequest(&request, 0, 0);
while (FCGX_Accept_r(&request) == 0) {
fcgi_streambuf cin_fcgi_streambuf(request.in);
fcgi_streambuf cout_fcgi_streambuf(request.out);
fcgi_streambuf cerr_fcgi_streambuf(request.err);
cin.rdbuf(&cin_fcgi_streambuf);
cout.rdbuf(&cout_fcgi_streambuf);
cerr.rdbuf(&cerr_fcgi_streambuf);
cout << "Content-type: text/html\r\n"
<< "\r\n"
<< "<html>\n"
<< " <head>\n"
<< " <title>Hello, World!</title>\n"
<< " </head>\n"
<< " <body>\n"
<< " <h1>Hello, World!</h1>\n"
<< " </body>\n"
<< "</html>\n";
// Note: the fcgi_streambuf destructor will auto flush
}
// restore stdio streambufs
cin.rdbuf(cin_streambuf);
cout.rdbuf(cout_streambuf);
cerr.rdbuf(cerr_streambuf);
return 0;
}

View File

@ -1,7 +1,7 @@
#!/bin/sh #!/bin/sh
curl -H "Content-Type: application/json" -X POST -d '{"operation":"addSentence", "sourceSentence":"zu\"pełnie nowe zdanie", "targetSentence":"zażółć gęślą jaźńZAŻÓŁĆ GĘŚLĄ JAŹŃ", "tmId":1234782314}' http://localhost #curl -H "Content-Type: application/json" -X POST -d '{"operation":"addSentence", "sourceSentence":"Marysia ma rysia", "targetSentence":"Mary has a bobcat", "tmId":1}' http://localhost
#curl -H "Content-Type: application/json" -X POST -d '{"operation":"simpleSearch", "sentence":"zupełnie nowe"}' http://localhost curl -H "Content-Type: application/json" -X POST -d '{"operation":"simpleSearch", "pattern":"ma rysia"}' http://localhost