multiple translation memories

This commit is contained in:
rjawor 2015-10-20 20:16:00 +02:00
parent a43bd9be03
commit 5eff07d4b8
29 changed files with 472 additions and 161 deletions

View File

@ -18,10 +18,6 @@ set (STOP_WORDS_ENABLED "false")
# ================================================ # ================================================
set (INDEX_DIRECTORY "${concordia-server_SOURCE_DIR}/index") set (INDEX_DIRECTORY "${concordia-server_SOURCE_DIR}/index")
set (WORD_MAP_FILE "word_map.bin")
set (HASHED_INDEX_FILE "hashed_index.bin")
set (MARKERS_FILE "markers.bin")
set (RESOURCES_DIRECTORY "${concordia-server_SOURCE_DIR}/resources") set (RESOURCES_DIRECTORY "${concordia-server_SOURCE_DIR}/resources")
configure_file ( configure_file (

View File

@ -1,41 +0,0 @@
<html>
<head>
<script src="js/jquery-1.11.3.min.js"></script>
<script src="js/cat.js"></script>
<link rel="stylesheet" href="css/iatagger.css" />
<meta charset="UTF-8">
</head>
<body>
<div id="header">
</div>
<div id="content">
<a href="http://tmconcordia.sourceforge.net/" target="_blank"><img src="images/banner.jpg" alt="Banner" /></a>
<br/><br/><br/>
<p>
Welcome to the interactive Concordia demo. The system finds the longest matches of the pattern sentence in its translation memory. This translation memory is 200 000 sentences taken from English-Spanish corpus of European Law. Please enter an English sentence in the field below and press Enter (or use the search button). This instance of Concordia works best with law sentences, but is very likely to output some results for any English sentence. You can also use predefined samples, simply use the link "show/hide samples" and apply one of the sample sentences.
</p>
<p>
Enjoy your work with the system!
</p>
<label for="searchInput">Enter search pattern (English sentence):</label>
<span class="suggestion" onclick="showHideSuggestions()">show/hide samples</span>
<br/><br/>
<div class="suggestionsInvisible" id="suggestions">
<ul>
<li>Every ship in the European Union must have a crew of 50 or more workers. <span class="suggestion" onclick="searchText('Every ship in the European Union must have a crew of 50 or more workers.');">apply</span></li>
<li>It is impossible to abolish the customs duties on fruit and vegetables. <span class="suggestion" onclick="searchText('It is impossible to abolish the customs duties on fruit and vegetables.');">apply</span></li>
<li>The convention on human rights was held in Geneva. <span class="suggestion" onclick="searchText('The convention on human rights was held in Geneva.');">apply</span></li>
</ul>
<br/><br/>
</div>
<input id="searchInput" type="text" value="" />
<br/><br/>
<input type="button" value="search" onclick="searchHandle()" />
<br/><br/><br/><br/>
<div id="result">
</div>
</div>
</body>
</html>

34
cat/index.html_pattern Normal file
View File

@ -0,0 +1,34 @@
<!--
Template for the per-translation-memory demo page. publish.py copies this
file line by line into each version's index.html and fills in the
placeholders, which are field names wrapped in at-signs: desc, prompt,
suggestions and tmid. Asset paths are relative to the parent directory
because each generated page lives in its own sub-directory next to js/,
css/ and images/.
-->
<html>
<head>
<script src="../js/jquery-1.11.3.min.js"></script>
<script src="../js/cat.js"></script>
<link rel="stylesheet" href="../css/iatagger.css" />
<meta charset="UTF-8">
</head>
<body>
<div id="header">
</div>
<div id="content">
<a href="http://tmconcordia.sourceforge.net/" target="_blank"><img src="../images/banner.jpg" alt="Banner" /></a>
<br/><br/><br/>
<!-- Introductory text of the version; inserted verbatim, so it may contain HTML. -->
<p>@desc@</p>
<p>
Enjoy your work with the system!
</p>
<label for="searchInput">@prompt@</label>
<span class="suggestion" onclick="showHideSuggestions()">show/hide samples</span>
<br/><br/>
<div class="suggestionsInvisible" id="suggestions">
<!-- Replaced by one list item per sample sentence of the version. -->
<ul>@suggestions@</ul>
<br/><br/>
</div>
<input id="searchInput" type="text" value="" />
<br/><br/>
<!-- The translation memory id is handed to searchHandle as its argument. -->
<input type="button" value="search" onclick="searchHandle(@tmid@)" />
<br/><br/><br/><br/>
<div id="result">
</div>
</div>
</body>
</html>

View File

@ -11,9 +11,10 @@ $(document).ready(function() {
}); });
}); });
function searchHandle() { function searchHandle(tmid) {
var concordiaRequest = { var concordiaRequest = {
operation: 'concordiaSearch', operation: 'concordiaSearch',
tmId: tmid,
pattern:$("#searchInput").val() pattern:$("#searchInput").val()
} }

55
cat/publish.py Executable file
View File

@ -0,0 +1,55 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Publish the static Concordia demo pages.

Usage: publish.py ROOT_DIR

ROOT_DIR must be an existing, empty directory. The script copies the
``js``, ``css`` and ``images`` directories into it and then creates one
sub-directory per file found in ``versions``, each holding an
``index.html`` rendered from ``index.html_pattern``.

All input paths are relative to the current working directory, so the
script has to be started from the directory that contains them.

A version file consists of ``field@#@value`` lines. Every ``suggestion``
line adds one sample sentence; any other field (``dir``, ``tmid``,
``desc``, ``prompt``, ...) replaces the ``@field@`` placeholder of the
template. ``dir`` additionally names the output sub-directory.
"""

import io
import os
import shutil
import sys

STATIC_DIRS = ('js', 'css', 'images')
VERSIONS_DIR = 'versions'
TEMPLATE_FILE = 'index.html_pattern'
FIELD_SEPARATOR = '@#@'


def fail(message):
    """Print *message* and terminate the script with exit status 1."""
    print(message)
    sys.exit(1)


def escape_html(text):
    """Return *text* safe for HTML element content and quoted attributes."""
    return (text.replace('&', '&amp;')
                .replace('<', '&lt;')
                .replace('>', '&gt;')
                .replace('"', '&quot;'))


def escape_js_string(text):
    """Return *text* safe for use inside a single-quoted JavaScript string."""
    return text.replace('\\', '\\\\').replace("'", "\\'")


def parse_version(lines):
    """Build a version dictionary from the lines of a version file.

    Blank lines are ignored. Only the first separator of a line splits
    it, so a value may itself contain the separator.

    Raises:
        ValueError: if a non-blank line contains no separator.
    """
    version = {'suggestions': []}
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue
        field, separator, value = line.partition(FIELD_SEPARATOR)
        if not separator:
            raise ValueError('malformed version line: %r' % line)
        if field == 'suggestion':
            version['suggestions'].append(value)
        else:
            version[field] = value
    return version


def render_suggestions(suggestions):
    """Return the ``<li>`` markup for the sample sentences.

    The sentence is shown as text and also passed to ``searchText`` from
    an ``onclick`` attribute, so it is escaped for HTML and, inside the
    attribute, for the surrounding JavaScript string literal as well.
    """
    items = []
    for suggestion in suggestions:
        js_argument = escape_html(escape_js_string(suggestion))
        items.append(
            '<li>' + escape_html(suggestion)
            + '<span class="suggestion" onclick="searchText(\''
            + js_argument + '\');">apply</span></li>')
    return ''.join(items)


def build_replacements(version):
    """Map every ``@field@`` placeholder to the text that replaces it."""
    replacements = {}
    for field, value in version.items():
        if field == 'suggestions':
            value = render_suggestions(value)
        replacements['@' + field + '@'] = value
    return replacements


def render_line(line, replacements):
    """Return *line* with all placeholders substituted.

    Plain string replacement is used on purpose: the values are literal
    text, not regular-expression replacement templates, so backslashes
    in them must not be interpreted.
    """
    for placeholder, value in replacements.items():
        line = line.replace(placeholder, value)
    return line


def load_versions(versions_dir):
    """Parse every file in *versions_dir*, in sorted file-name order.

    Raises:
        ValueError: if a file is malformed or lacks the ``dir`` field.
    """
    versions = []
    for file_name in sorted(os.listdir(versions_dir)):
        path = os.path.join(versions_dir, file_name)
        with io.open(path, encoding='utf-8') as version_file:
            version = parse_version(version_file)
        if 'dir' not in version:
            raise ValueError('%s: missing "dir" field' % path)
        versions.append(version)
    return versions


def main(argv):
    """Validate the target directory and publish all versions into it."""
    if len(argv) < 2:
        fail('Usage: %s ROOT_DIR' % argv[0])
    root_dir = argv[1]

    if not os.path.exists(root_dir):
        fail("%s does not exist!" % root_dir)
    if not os.path.isdir(root_dir):
        fail("%s is not a directory!" % root_dir)
    if os.listdir(root_dir):
        fail("%s is not empty!" % root_dir)

    # Read all inputs before touching the target, so that a malformed
    # version file does not leave a half-published directory behind.
    try:
        versions = load_versions(VERSIONS_DIR)
    except ValueError as error:
        fail(str(error))
    with io.open(TEMPLATE_FILE, encoding='utf-8') as pattern_file:
        template_lines = pattern_file.readlines()

    for static_dir in STATIC_DIRS:
        shutil.copytree(static_dir, os.path.join(root_dir, static_dir))

    for version in versions:
        version_dir = os.path.join(root_dir, version['dir'])
        os.mkdir(version_dir)
        replacements = build_replacements(version)
        index_path = os.path.join(version_dir, 'index.html')
        with io.open(index_path, 'w', encoding='utf-8') as index_file:
            for line in template_lines:
                index_file.write(render_line(line, replacements))


if __name__ == '__main__':
    main(sys.argv)

View File

@ -0,0 +1,7 @@
dir@#@jrc_enes
tmid@#@1
desc@#@Welcome to the interactive Concordia demo. The system finds the longest matches of the pattern sentence in its translation memory. This translation memory is 200 000 sentences taken from English-Spanish corpus of European Law. Please enter an English sentence in the field below and press Enter (or use the search button). This instance of Concordia works best with law sentences, but is very likely to output some results for any English sentence. You can also use predefined samples, simply use the link "show/hide samples" and apply one of the sample sentences.
prompt@#@Enter search pattern (English sentence):
suggestion@#@Every ship in the European Union must have a crew of 50 or more workers.
suggestion@#@It is impossible to abolish the customs duties on fruit and vegetables.
suggestion@#@The convention on human rights was held in Geneva.

View File

@ -0,0 +1,6 @@
dir@#@setimes_hren
tmid@#@2
desc@#@Welcome to Concordia. The system finds the longest matches of the pattern sentence in its translation memory. This translation memory is 200 000 sentences taken from the SETIMES2 Croatian-English corpus (<a href="http://opus.lingfil.uu.se/SETIMES2.php" target="_blank">link</a>). Please enter a Croatian sentence in the field below and press Enter (or use the search button). You can test the system on predefined samples, simply use the link "show/hide samples" and apply one of the sample sentences.
prompt@#@Enter search pattern (Croatian sentence):
suggestion@#@Kazna medijskom mogulu obnovila raspravu u Makedoniji
suggestion@#@Član Predsjedništva BiH Komšić podnio ostavku u svojoj stranci

View File

@ -13,6 +13,7 @@ add_executable(concordia_server_process
int_array_param.cpp int_array_param.cpp
simple_search_result.cpp simple_search_result.cpp
complete_concordia_search_result.cpp complete_concordia_search_result.cpp
tm_dao.cpp
) )
target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case) target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case)

View File

@ -7,12 +7,21 @@
#include "config.hpp" #include "config.hpp"
#include "logger.hpp" #include "logger.hpp"
#include "rapidjson/rapidjson.h" #include "rapidjson/rapidjson.h"
#include <boost/foreach.hpp>
#include <boost/ptr_container/ptr_map.hpp>
#include <boost/filesystem/path.hpp>
ConcordiaServer::ConcordiaServer(const std::string & configFilePath) ConcordiaServer::ConcordiaServer(const std::string & configFilePath)
throw(ConcordiaException) { throw(ConcordiaException) :
boost::shared_ptr<Concordia> concordia(new Concordia(configFilePath)); _configFilePath(configFilePath) {
_indexController = boost::shared_ptr<IndexController> (new IndexController(concordia)); std::vector<int> tmIds = _tmDAO.getTmIds();
_searcherController = boost::shared_ptr<SearcherController> (new SearcherController(concordia)); _concordiasMap = boost::shared_ptr<boost::ptr_map<int,Concordia> >(new boost::ptr_map<int,Concordia>());
BOOST_FOREACH(int & tmId, tmIds) {
_addTm(tmId);
}
_indexController = boost::shared_ptr<IndexController> (new IndexController(_concordiasMap));
_searcherController = boost::shared_ptr<SearcherController> (new SearcherController(_concordiasMap));
} }
ConcordiaServer::~ConcordiaServer() { ConcordiaServer::~ConcordiaServer() {
@ -44,29 +53,48 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
} else if (operation == ADD_SENTENCES_OP) { } else if (operation == ADD_SENTENCES_OP) {
std::vector<std::string> sourceSentences; std::vector<std::string> sourceSentences;
std::vector<std::string> targetSentences; std::vector<std::string> targetSentences;
std::vector<int> tmIds; int tmId = d[TM_ID_PARAM].GetInt();
// loading data from json // loading data from json
const rapidjson::Value & sentencesArray = d[SENTENCES_PARAM]; const rapidjson::Value & sentencesArray = d[SENTENCES_PARAM];
Logger::log("addSentences");
Logger::logInt("sentences to add", sentencesArray.Size());
Logger::logInt("tm id", tmId);
for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) { for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) {
if (sentencesArray[i].Size() != 3) { if (sentencesArray[i].Size() != 2) {
JsonGenerator::signalError(jsonWriter, "sentence should be an array of 3 elements"); JsonGenerator::signalError(jsonWriter, "sentence should be an array of 2 elements");
break; break;
} else { } else {
tmIds.push_back(sentencesArray[i][0].GetInt()); sourceSentences.push_back(sentencesArray[i][0].GetString());
sourceSentences.push_back(sentencesArray[i][1].GetString()); targetSentences.push_back(sentencesArray[i][1].GetString());
targetSentences.push_back(sentencesArray[i][2].GetString());
} }
} }
_indexController->addSentences(jsonWriter, sourceSentences, targetSentences, tmIds); _indexController->addSentences(jsonWriter, sourceSentences, targetSentences, tmId);
} else if (operation == REFRESH_INDEX_OP) { } else if (operation == REFRESH_INDEX_OP) {
_indexController->refreshIndexFromRAM(jsonWriter); int tmId = d[TM_ID_PARAM].GetInt();
_indexController->refreshIndexFromRAM(jsonWriter, tmId);
} else if (operation == SIMPLE_SEARCH_OP) { } else if (operation == SIMPLE_SEARCH_OP) {
std::string pattern = d[PATTERN_PARAM].GetString(); std::string pattern = d[PATTERN_PARAM].GetString();
_searcherController->simpleSearch(jsonWriter, pattern); int tmId = d[TM_ID_PARAM].GetInt();
_searcherController->simpleSearch(jsonWriter, pattern, tmId);
} else if (operation == CONCORDIA_SEARCH_OP) { } else if (operation == CONCORDIA_SEARCH_OP) {
std::string pattern = d[PATTERN_PARAM].GetString(); std::string pattern = d[PATTERN_PARAM].GetString();
int tmId = d[TM_ID_PARAM].GetInt();
Logger::logString("concordia search pattern", pattern); Logger::logString("concordia search pattern", pattern);
_searcherController->concordiaSearch(jsonWriter, pattern); _searcherController->concordiaSearch(jsonWriter, pattern, tmId);
} else if (operation == ADD_TM_OP) {
int sourceLangId = d[SOURCE_LANG_PARAM].GetInt();
int targetLangId = d[TARGET_LANG_PARAM].GetInt();
std::string name = d[NAME_PARAM].GetString();
int newId = _tmDAO.addTm(sourceLangId, targetLangId, name);
_addTm(newId);
jsonWriter.StartObject();
jsonWriter.String("status");
jsonWriter.String("success");
jsonWriter.String("newTmId");
jsonWriter.Int(newId);
jsonWriter.EndObject();
} else { } else {
JsonGenerator::signalError(jsonWriter, "no such operation"); JsonGenerator::signalError(jsonWriter, "no such operation");
} }
@ -83,3 +111,13 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
return outputString.str(); return outputString.str();
} }
// Registers the translation memory `tmId` with this server. Its index lives
// in the directory INDEX_DIRECTORY/tm_<tmId>, which is created when absent,
// and a Concordia instance built on that directory and on the server's
// config file is stored in _concordiasMap under the same id.
void ConcordiaServer::_addTm(int tmId) {
    std::stringstream pathStream;
    pathStream << INDEX_DIRECTORY << "/tm_" << tmId;
    const std::string tmIndexDir = pathStream.str();

    const bool directoryMissing = !boost::filesystem::exists(tmIndexDir);
    if (directoryMissing) {
        boost::filesystem::create_directories(tmIndexDir);
    }

    // The pointer container becomes the owner of the new instance.
    _concordiasMap->insert(tmId, new Concordia(tmIndexDir, _configFilePath));
}

View File

@ -11,6 +11,7 @@
#include "rapidjson/writer.h" #include "rapidjson/writer.h"
#include "rapidjson/error/en.h" #include "rapidjson/error/en.h"
#include "tm_dao.hpp"
#include "index_controller.hpp" #include "index_controller.hpp"
#include "searcher_controller.hpp" #include "searcher_controller.hpp"
@ -29,6 +30,14 @@ public:
std::string handleRequest(std::string & requestString); std::string handleRequest(std::string & requestString);
private: private:
void _addTm(int tmId);
std::string _configFilePath;
boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
TmDAO _tmDAO;
boost::shared_ptr<IndexController> _indexController; boost::shared_ptr<IndexController> _indexController;
boost::shared_ptr<SearcherController> _searcherController; boost::shared_ptr<SearcherController> _searcherController;

View File

@ -1,5 +1,6 @@
#define CONFIG_FILE_PATH "@CONFIG_FILE_PATH@" #define CONFIG_FILE_PATH "@CONFIG_FILE_PATH@"
#define LOG_FILE_PATH "@LOG_FILE_PATH@" #define LOG_FILE_PATH "@LOG_FILE_PATH@"
#define INDEX_DIRECTORY "@INDEX_DIRECTORY@"
// database connection information // database connection information
#define DB_NAME "@DB_NAME@" #define DB_NAME "@DB_NAME@"
@ -15,10 +16,14 @@
#define TARGET_SENTENCE_PARAM "targetSentence" #define TARGET_SENTENCE_PARAM "targetSentence"
#define TM_ID_PARAM "tmId" #define TM_ID_PARAM "tmId"
#define SENTENCES_PARAM "sentences" #define SENTENCES_PARAM "sentences"
#define SOURCE_LANG_PARAM "sourceLangId"
#define TARGET_LANG_PARAM "targetLangId"
#define NAME_PARAM "name"
#define ADD_SENTENCE_OP "addSentence" #define ADD_SENTENCE_OP "addSentence"
#define ADD_SENTENCES_OP "addSentences" #define ADD_SENTENCES_OP "addSentences"
#define REFRESH_INDEX_OP "refreshIndex" #define REFRESH_INDEX_OP "refreshIndex"
#define SIMPLE_SEARCH_OP "simpleSearch" #define SIMPLE_SEARCH_OP "simpleSearch"
#define CONCORDIA_SEARCH_OP "concordiaSearch" #define CONCORDIA_SEARCH_OP "concordiaSearch"
#define ADD_TM_OP "addTm"

View File

@ -144,5 +144,13 @@ std::string DBconnection::getStringValue(PGresult * result, int row, int col) t
} }
} }
/*! Returns the number of rows in a query result, as reported by PQntuples().
 *  \param result the query result to inspect
 *  \returns number of rows in the result
 *  \throws ConcordiaException when a std::exception is raised while the
 *          count is being read
 */
int DBconnection::getRowCount(PGresult * result) throw (ConcordiaException) {
    try {
        return PQntuples(result);
    } catch (std::exception & e) {
        // Name the operation that actually failed; the message used to claim
        // an int value was being read.
        std::stringstream ss;
        ss << "Error getting row count. Message: " << e.what();
        throw ConcordiaException(ss.str());
    }
}

View File

@ -33,6 +33,8 @@ public:
std::string getStringValue(PGresult * result, int row, int col) throw (ConcordiaException); std::string getStringValue(PGresult * result, int row, int col) throw (ConcordiaException);
int getRowCount(PGresult * result) throw (ConcordiaException);
private: private:
void close(); void close();

View File

@ -3,10 +3,11 @@
#include <concordia/common/config.hpp> #include <concordia/common/config.hpp>
#include "json_generator.hpp" #include "json_generator.hpp"
#include "logger.hpp"
IndexController::IndexController(boost::shared_ptr<Concordia> concordia) IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
throw(ConcordiaException): throw(ConcordiaException):
_concordia(concordia) { _concordiasMap(concordiasMap) {
} }
IndexController::~IndexController() { IndexController::~IndexController() {
@ -20,19 +21,32 @@ void IndexController::addSentence(
const int tmId) { const int tmId) {
try { try {
TokenizedSentence tokenizedSentence = _concordia->tokenize(sourceSentence); boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId); if (it != _concordiasMap->end()) {
_concordia->addTokenizedExample(tokenizedSentence, sentenceId); TokenizedSentence tokenizedSentence = (*_concordiasMap)[tmId].tokenize(sourceSentence);
_concordia->refreshSAfromRAM(); int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId);
(*_concordiasMap)[tmId].addTokenizedExample(tokenizedSentence, sentenceId);
(*_concordiasMap)[tmId].refreshSAfromRAM();
jsonWriter.StartObject(); jsonWriter.StartObject();
jsonWriter.String("status"); jsonWriter.String("status");
jsonWriter.String("success"); jsonWriter.String("success");
jsonWriter.EndObject(); jsonWriter.EndObject();
} else {
JsonGenerator::signalError(jsonWriter, "no such tm!");
}
} catch (ConcordiaException & e) { } catch (ConcordiaException & e) {
std::stringstream errorstream; std::stringstream errorstream;
errorstream << "concordia error: " << e.what(); errorstream << "concordia error: " << e.what();
JsonGenerator::signalError(jsonWriter, errorstream.str()); JsonGenerator::signalError(jsonWriter, errorstream.str());
} catch (std::exception & e) {
std::stringstream errorstream;
errorstream << "general error: " << e.what();
JsonGenerator::signalError(jsonWriter, errorstream.str());
} catch (...) {
std::stringstream errorstream;
errorstream << "unexpected error occurred";
JsonGenerator::signalError(jsonWriter, errorstream.str());
} }
} }
@ -40,16 +54,21 @@ void IndexController::addSentences(
rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter, rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const std::vector<std::string> & sourceSentences, const std::vector<std::string> & sourceSentences,
const std::vector<std::string> & targetSentences, const std::vector<std::string> & targetSentences,
const std::vector<int> & tmIds) { const int tmId) {
try { try {
std::vector<TokenizedSentence> tokenizedSentences = _concordia->tokenizeAll(sourceSentences); boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addSentences(tokenizedSentences, targetSentences, tmIds); if (it != _concordiasMap->end()) {
_concordia->addAllTokenizedExamples(tokenizedSentences, sentenceIds); std::vector<TokenizedSentence> tokenizedSentences = (*_concordiasMap)[tmId].tokenizeAll(sourceSentences);
std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addSentences(tokenizedSentences, targetSentences, tmId);
(*_concordiasMap)[tmId].addAllTokenizedExamples(tokenizedSentences, sentenceIds);
jsonWriter.StartObject(); jsonWriter.StartObject();
jsonWriter.String("status"); jsonWriter.String("status");
jsonWriter.String("success"); jsonWriter.String("success");
jsonWriter.EndObject(); jsonWriter.EndObject();
} else {
JsonGenerator::signalError(jsonWriter, "no such tm!");
}
} catch (ConcordiaException & e) { } catch (ConcordiaException & e) {
std::stringstream errorstream; std::stringstream errorstream;
errorstream << "concordia error: " << e.what(); errorstream << "concordia error: " << e.what();
@ -57,14 +76,19 @@ void IndexController::addSentences(
} }
} }
void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter) { void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const int tmId) {
try { try {
_concordia->refreshSAfromRAM(); if (it != _concordiasMap->end()) {
(*_concordiasMap)[tmId].refreshSAfromRAM();
jsonWriter.StartObject(); jsonWriter.StartObject();
jsonWriter.String("status"); jsonWriter.String("status");
jsonWriter.String("success"); jsonWriter.String("success");
jsonWriter.EndObject(); jsonWriter.EndObject();
} else {
JsonGenerator::signalError(jsonWriter, "no such tm!");
}
} catch (ConcordiaException & e) { } catch (ConcordiaException & e) {
std::stringstream errorstream; std::stringstream errorstream;
errorstream << "concordia error: " << e.what(); errorstream << "concordia error: " << e.what();
@ -74,3 +98,4 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuf
} }

View File

@ -5,6 +5,8 @@
#include <boost/shared_ptr.hpp> #include <boost/shared_ptr.hpp>
#include <concordia/concordia.hpp> #include <concordia/concordia.hpp>
#include <concordia/concordia_exception.hpp> #include <concordia/concordia_exception.hpp>
#include <boost/ptr_container/ptr_map.hpp>
#include "unit_dao.hpp" #include "unit_dao.hpp"
@ -14,8 +16,8 @@ class IndexController {
public: public:
/*! Constructor. /*! Constructor.
*/ */
explicit IndexController(boost::shared_ptr<Concordia> concordia) explicit IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
throw(ConcordiaException); throw(ConcordiaException);
/*! Destructor. /*! Destructor.
*/ */
virtual ~IndexController(); virtual ~IndexController();
@ -28,12 +30,13 @@ public:
void addSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter, void addSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const std::vector<std::string> & sourceSentences, const std::vector<std::string> & sourceSentences,
const std::vector<std::string> & targetSentences, const std::vector<std::string> & targetSentences,
const std::vector<int> & tmIds); const int tmId);
void refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter); void refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const int tmId);
private: private:
boost::shared_ptr<Concordia> _concordia; boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
UnitDAO _unitDAO; UnitDAO _unitDAO;
}; };

View File

@ -5,50 +5,64 @@
#include "json_generator.hpp" #include "json_generator.hpp"
SearcherController::SearcherController(boost::shared_ptr<Concordia> concordia) SearcherController::SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
throw(ConcordiaException): throw(ConcordiaException):
_concordia(concordia) { _concordiasMap(concordiasMap) {
} }
SearcherController::~SearcherController() { SearcherController::~SearcherController() {
} }
void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter, std::string & pattern) { void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
std::vector<SimpleSearchResult> results = _unitDAO.getSearchResults(_concordia->simpleSearch(pattern)); std::string & pattern,
const int tmId) {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
std::vector<SimpleSearchResult> results = _unitDAO.getSearchResults((*_concordiasMap)[tmId].simpleSearch(pattern));
jsonWriter.StartObject(); jsonWriter.StartObject();
jsonWriter.String("status"); jsonWriter.String("status");
jsonWriter.String("success"); jsonWriter.String("success");
jsonWriter.String("results"); jsonWriter.String("results");
jsonWriter.StartArray(); jsonWriter.StartArray();
BOOST_FOREACH(SimpleSearchResult & result, results) { BOOST_FOREACH(SimpleSearchResult & result, results) {
JsonGenerator::writeSearchResult(jsonWriter, result); JsonGenerator::writeSearchResult(jsonWriter, result);
}
jsonWriter.EndArray();
jsonWriter.EndObject();
} else {
JsonGenerator::signalError(jsonWriter, "no such tm!");
} }
jsonWriter.EndArray();
jsonWriter.EndObject();
} }
void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter, std::string & pattern) { void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
std::string & pattern,
const int tmId) {
CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult(_concordia->concordiaSearch(pattern)); boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult((*_concordiasMap)[tmId].concordiaSearch(pattern));
jsonWriter.StartObject(); jsonWriter.StartObject();
jsonWriter.String("status"); jsonWriter.String("status");
jsonWriter.String("success"); jsonWriter.String("success");
jsonWriter.String("result"); jsonWriter.String("result");
jsonWriter.StartObject(); jsonWriter.StartObject();
jsonWriter.String("bestOverlayScore"); jsonWriter.String("bestOverlayScore");
jsonWriter.Double(result.getBestOverlayScore()); jsonWriter.Double(result.getBestOverlayScore());
jsonWriter.String("bestOverlay"); jsonWriter.String("bestOverlay");
jsonWriter.StartArray(); jsonWriter.StartArray();
BOOST_FOREACH(SimpleSearchResult & simpleResult, result.getBestOverlay()) { BOOST_FOREACH(SimpleSearchResult & simpleResult, result.getBestOverlay()) {
JsonGenerator::writeSearchResult(jsonWriter, simpleResult); JsonGenerator::writeSearchResult(jsonWriter, simpleResult);
}
jsonWriter.EndArray();
jsonWriter.EndObject();
jsonWriter.EndObject();
} else {
JsonGenerator::signalError(jsonWriter, "no such tm!");
} }
jsonWriter.EndArray();
jsonWriter.EndObject();
jsonWriter.EndObject();
} }

View File

@ -3,6 +3,7 @@
#include <string> #include <string>
#include <boost/shared_ptr.hpp> #include <boost/shared_ptr.hpp>
#include <boost/ptr_container/ptr_map.hpp>
#include <concordia/concordia.hpp> #include <concordia/concordia.hpp>
#include <concordia/concordia_exception.hpp> #include <concordia/concordia_exception.hpp>
@ -15,19 +16,23 @@ class SearcherController {
public: public:
/*! Constructor. /*! Constructor.
*/ */
explicit SearcherController(boost::shared_ptr<Concordia> concordia) explicit SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
throw(ConcordiaException); throw(ConcordiaException);
/*! Destructor. /*! Destructor.
*/ */
virtual ~SearcherController(); virtual ~SearcherController();
void simpleSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter, std::string & pattern); void simpleSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
std::string & pattern,
const int tmId);
void concordiaSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter, std::string & pattern); void concordiaSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
std::string & pattern,
const int tmId);
private: private:
boost::shared_ptr<Concordia> _concordia; boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
UnitDAO _unitDAO; UnitDAO _unitDAO;
}; };

View File

@ -0,0 +1,55 @@
#include "tm_dao.hpp"
#include "query_param.hpp"
#include "string_param.hpp"
#include "int_param.hpp"
#include "int_array_param.hpp"
#include "logger.hpp"
#include <boost/foreach.hpp>
#include <libpq-fe.h>
// Nothing to set up: each operation of the DAO opens its own DBconnection.
TmDAO::TmDAO() {
}
// Nothing to release: no resources are acquired in the constructor.
TmDAO::~TmDAO() {
}
std::vector<int> TmDAO::getTmIds() {
std::vector<int> result;
DBconnection connection;
connection.startTransaction();
std::string query = "SELECT id FROM tm;";
PGresult * dbResult = connection.execute(query);
for (int i=0;i<connection.getRowCount(dbResult);i++) {
int tmId = connection.getIntValue(dbResult, i, 0);
result.push_back(tmId);
}
connection.clearResult(dbResult);
connection.endTransaction();
return result;
}
int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::string name) {
DBconnection connection;
connection.startTransaction();
std::string query = "INSERT INTO tm(source_lang_id, target_lang_id, name) values($1::integer,$2::integer,$3::text) RETURNING id";
std::vector<QueryParam*> params;
params.push_back(new IntParam(sourceLangId));
params.push_back(new IntParam(targetLangId));
params.push_back(new StringParam(name));
PGresult * result = connection.execute(query, params);
int newId = connection.getIntValue(result, 0, 0);
connection.clearResult(result);
connection.endTransaction();
BOOST_FOREACH (QueryParam * param, params) {
delete param;
}
return newId;
}

View File

@ -0,0 +1,27 @@
#ifndef TM_DAO_HDR
#define TM_DAO_HDR
#include <string>
#include <vector>
#include <concordia/common/config.hpp>
#include "db_connection.hpp"
/*! Data access object for the "tm" database table, which holds one row
    per translation memory.
*/
class TmDAO {
public:
/*! Constructor.
*/
TmDAO();
/*! Destructor.
*/
virtual ~TmDAO();
/*! Adds a new translation memory to the "tm" table.
    \param sourceLangId value stored in the source_lang_id column
    \param targetLangId value stored in the target_lang_id column
    \param name value stored in the name column
    \returns id of the newly inserted row
*/
int addTm(const int sourceLangId, const int targetLangId, const std::string name);
/*! Lists the translation memories present in the "tm" table.
    \returns the id of every row of the table
*/
std::vector<int> getTmIds();
private:
};
#endif

View File

@ -31,13 +31,13 @@ int UnitDAO::addSentence(
std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addSentences( std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addSentences(
const std::vector<TokenizedSentence> & sourceSentences, const std::vector<TokenizedSentence> & sourceSentences,
const std::vector<std::string> & targetSentences, const std::vector<std::string> & targetSentences,
const std::vector<int> & tmIds) { const int tmId) {
DBconnection connection; DBconnection connection;
std::vector<SUFFIX_MARKER_TYPE> newIds; std::vector<SUFFIX_MARKER_TYPE> newIds;
connection.startTransaction(); connection.startTransaction();
int index = 0; int index = 0;
BOOST_FOREACH(const TokenizedSentence & sourceSentence, sourceSentences) { BOOST_FOREACH(const TokenizedSentence & sourceSentence, sourceSentences) {
newIds.push_back(_addSingleSentence(connection, sourceSentence, targetSentences.at(index), tmIds.at(index))); newIds.push_back(_addSingleSentence(connection, sourceSentence, targetSentences.at(index), tmId));
index++; index++;
} }
connection.endTransaction(); connection.endTransaction();

View File

@ -32,7 +32,7 @@ public:
std::vector<SUFFIX_MARKER_TYPE> addSentences( std::vector<SUFFIX_MARKER_TYPE> addSentences(
const std::vector<TokenizedSentence> & sourceSentences, const std::vector<TokenizedSentence> & sourceSentences,
const std::vector<std::string> & targetSentences, const std::vector<std::string> & targetSentences,
const std::vector<int> & tmIds); const int tmId);
std::vector<SimpleSearchResult> getSearchResults(const std::vector<MatchedPatternFragment> & fragments); std::vector<SimpleSearchResult> getSearchResults(const std::vector<MatchedPatternFragment> & fragments);

View File

@ -3,17 +3,6 @@
#--------------------------- #---------------------------
# #
#-------------------------------------------------------------------------------
# The below set the paths for hashed index, markers array and word map files.
# If all the files pointed by these paths exist, Concordia reads them to its
# RAM index. When none of these files exist, a new empty index is created.
# However, if any of these files exist and any other is missing, the index
# is considered corrupt and Concordia does not start.
hashed_index_path = "@INDEX_DIRECTORY@/@HASHED_INDEX_FILE@"
markers_path = "@INDEX_DIRECTORY@/@MARKERS_FILE@"
word_map_path = "@INDEX_DIRECTORY@/@WORD_MAP_FILE@"
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------
# The following settings control the sentence anonymizer mechanism. It is used to # The following settings control the sentence anonymizer mechanism. It is used to
# remove unnecessary symbols and possibly words from sentences added to index # remove unnecessary symbols and possibly words from sentences added to index

2
tests/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
host.py
host.pyc

View File

@ -5,10 +5,16 @@ import unittest
import json import json
import urllib2 import urllib2
import sys import sys
import host
import time import time
BUFFER_SIZE = 500 BUFFER_SIZE = 500
address = 'http://'+host.concordia_host
if len(host.concordia_port) > 0:
address += ':'+host.concordia_port
def file_len(fname): def file_len(fname):
with open(fname) as f: with open(fname) as f:
for i, l in enumerate(f): for i, l in enumerate(f):
@ -16,13 +22,15 @@ def file_len(fname):
return i + 1 return i + 1
def add_data(data): def add_data(data):
req = urllib2.Request('http://localhost') req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json') req.add_header('Content-Type', 'application/json')
urllib2.urlopen(req, json.dumps(data)).read() urllib2.urlopen(req, json.dumps(data)).read()
sourceFile = sys.argv[1] sourceFile = sys.argv[1]
targetFile = sys.argv[2] sourceLangId = int(sys.argv[2])
tmId = int(sys.argv[3]) targetFile = sys.argv[3]
targetLangId = int(sys.argv[4])
name = sys.argv[5]
totalLines = file_len(sourceFile) totalLines = file_len(sourceFile)
if file_len(targetFile) != totalLines: if file_len(targetFile) != totalLines:
@ -30,18 +38,33 @@ if file_len(targetFile) != totalLines:
sys.exit(1) sys.exit(1)
data = { data = {
'operation': 'addSentences' 'operation': 'addTm',
'sourceLangId':sourceLangId,
'targetLangId':targetLangId,
'name':name
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
tmId = int(response['newTmId'])
print "Added new tm: %d" % tmId
data = {
'operation': 'addSentences',
'tmId':tmId
} }
sentences = [] sentences = []
start = time.time() start = time.time()
with open(sys.argv[1]) as sourceSentences: with open(sourceFile) as sourceSentences:
with open(sys.argv[2]) as targetSentences: with open(targetFile) as targetSentences:
lineNumber = 0 lineNumber = 0
for sourceSentence in sourceSentences: for sourceSentence in sourceSentences:
lineNumber += 1 lineNumber += 1
targetSentence = targetSentences.readline() targetSentence = targetSentences.readline()
sentences.append([tmId, sourceSentence, targetSentence]) sentences.append([sourceSentence, targetSentence])
if lineNumber % BUFFER_SIZE == 0: if lineNumber % BUFFER_SIZE == 0:
data['sentences'] = sentences data['sentences'] = sentences
sentences = [] sentences = []
@ -60,9 +83,10 @@ print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentenc
print "Generating index..." print "Generating index..."
start = time.time() start = time.time()
data = { data = {
'operation': 'refreshIndex' 'operation': 'refreshIndex',
'tmId' : tmId
} }
req = urllib2.Request('http://localhost') req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json') req.add_header('Content-Type', 'application/json')
urllib2.urlopen(req, json.dumps(data)).read() urllib2.urlopen(req, json.dumps(data)).read()

View File

@ -6,16 +6,22 @@ import json
import urllib2 import urllib2
import sys import sys
import time import time
import host
address = 'http://'+host.concordia_host
if len(host.concordia_port) > 0:
address += ':'+host.concordia_port
data = { data = {
'operation': 'addSentence', 'operation': 'addSentence',
'sourceSentence':sys.argv[1], 'sourceSentence':sys.argv[1],
'targetSentence':sys.argv[2], 'targetSentence':sys.argv[2],
'tmId':sys.argv[3] 'tmId':int(sys.argv[3])
} }
start = time.time() start = time.time()
req = urllib2.Request('http://localhost') req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json') req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read()) response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
end = time.time() end = time.time()

26
tests/addTm.py Executable file
View File

@ -0,0 +1,26 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import urllib2
import sys
import time
import host
address = 'http://'+host.concordia_host
if len(host.concordia_port) > 0:
address += ':'+host.concordia_port
data = {
'operation': 'addTm',
'sourceLangId':int(sys.argv[1]),
'targetLangId':int(sys.argv[2]),
'name':sys.argv[3]
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
print response

View File

@ -6,6 +6,12 @@ import json
import urllib2 import urllib2
import sys import sys
import time import time
import host
address = 'http://'+host.concordia_host
if len(host.concordia_port) > 0:
address += ':'+host.concordia_port
data = { data = {
'operation': 'concordiaSearch', 'operation': 'concordiaSearch',
@ -13,7 +19,7 @@ data = {
} }
start = time.time() start = time.time()
req = urllib2.Request('http://localhost') req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json') req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read()) response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
end = time.time() end = time.time()

2
tests/host.py_example Normal file
View File

@ -0,0 +1,2 @@
# Connection settings for the test scripts. Copy this file to host.py
# (which is git-ignored) and adjust the values for your environment.
#
# Host name of the Concordia server; the scripts prepend 'http://' to it.
concordia_host = 'localhost'
# Port as a string. When left empty, no ':port' suffix is added to the URL.
concordia_port = ''

View File

@ -6,14 +6,20 @@ import json
import urllib2 import urllib2
import sys import sys
import time import time
import host
data = { data = {
'operation': 'simpleSearch', 'operation': 'simpleSearch',
'pattern':sys.argv[1] 'pattern':sys.argv[1],
'tmId':int(sys.argv[2])
} }
address = 'http://'+host.concordia_host
if len(host.concordia_port) > 0:
address += ':'+host.concordia_port
start = time.time() start = time.time()
req = urllib2.Request('http://localhost:8800') req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json') req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read()) response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
end = time.time() end = time.time()