2013-11-28 16:47:57 +01:00
|
|
|
#include <sstream>
|
2015-08-19 20:49:26 +02:00
|
|
|
#include <boost/foreach.hpp>
|
2013-11-28 16:47:57 +01:00
|
|
|
|
2013-10-24 17:08:58 +02:00
|
|
|
#include "concordia/concordia.hpp"
|
|
|
|
#include "concordia/common/config.hpp"
|
2013-12-14 15:23:17 +01:00
|
|
|
#include "concordia/common/utils.hpp"
|
2013-10-24 17:08:58 +02:00
|
|
|
|
|
|
|
// ===========================================
|
|
|
|
|
|
|
|
std::string _createLibraryVersion();
|
|
|
|
|
|
|
|
// ===========================================
|
|
|
|
|
|
|
|
std::string Concordia::_libraryVersion = _createLibraryVersion();
|
|
|
|
|
|
|
|
// ===========================================
|
|
|
|
|
2015-10-19 15:38:10 +02:00
|
|
|
// Default constructor. It has an empty body and no initializer list, so it
// performs no explicit initialization of its own.
Concordia::Concordia() {}
|
|
|
|
|
2015-10-16 22:14:11 +02:00
|
|
|
// Creates a Concordia instance for the index located under indexPath,
// configured from the file at configFilePath.
//
// indexPath      - directory path stored in _indexPath; the index file
//                  paths used below are derived from it
// configFilePath - path handed to the ConcordiaConfig constructor
//
// The configuration, index and searcher objects are created first and
// _initializeIndex() is called last. Declared to throw ConcordiaException.
Concordia::Concordia(const std::string & indexPath,
                     const std::string & configFilePath)
                                         throw(ConcordiaException) :
                                         _indexPath(indexPath) {
    _config.reset(new ConcordiaConfig(configFilePath));
    _index.reset(new ConcordiaIndex(_getHashedIndexFilePath(),
                                    _getMarkersFilePath()));
    _searcher.reset(new IndexSearcher());
    _initializeIndex();
}
|
|
|
|
|
|
|
|
// Destructor. It contains no explicit cleanup code.
Concordia::~Concordia() {}
|
|
|
|
|
|
|
|
// Returns a reference to the static library version string
// (Concordia::_libraryVersion).
std::string & Concordia::getVersion() {
    return Concordia::_libraryVersion;
}
|
|
|
|
|
|
|
|
// Builds the library version string by streaming CONCORDIA_VERSION_MAJOR,
// a "." separator and CONCORDIA_VERSION_MINOR, in that order.
std::string _createLibraryVersion() {
    std::ostringstream versionStream;
    versionStream << CONCORDIA_VERSION_MAJOR;
    versionStream << ".";
    versionStream << CONCORDIA_VERSION_MINOR;
    return versionStream.str();
}
|
|
|
|
|
2015-08-19 20:49:26 +02:00
|
|
|
// Tokenizes a single sentence using the hash generator.
//
// sentence      - the text to tokenize
// byWhitespace  - forwarded unchanged to the hash generator
// generateCodes - when true, generateHash() is used and the word map is
//                 serialized afterwards; when false, generateTokens() is
//                 used and the word map is not serialized
//
// Returns the TokenizedSentence produced by the hash generator.
// Declared to throw ConcordiaException.
TokenizedSentence
    Concordia::tokenize(const std::string & sentence,
                        bool byWhitespace,
                        bool generateCodes)
                                 throw(ConcordiaException) {
    if (!generateCodes) {
        return _hashGenerator->generateTokens(sentence, byWhitespace);
    }

    TokenizedSentence hashed =
        _hashGenerator->generateHash(sentence, byWhitespace);
    _hashGenerator->serializeWordMap();
    return hashed;
}
|
|
|
|
|
2015-08-19 20:49:26 +02:00
|
|
|
std::vector<TokenizedSentence> Concordia::tokenizeAll(
|
2015-12-27 20:54:40 +01:00
|
|
|
const std::vector<std::string> & sentences,
|
2016-01-01 20:45:07 +01:00
|
|
|
bool byWhitespace,
|
|
|
|
bool generateCodes)
|
2015-08-19 20:49:26 +02:00
|
|
|
throw(ConcordiaException) {
|
|
|
|
std::vector<TokenizedSentence> result;
|
2017-04-21 14:51:58 +02:00
|
|
|
|
2016-01-01 20:45:07 +01:00
|
|
|
if (generateCodes) {
|
|
|
|
BOOST_FOREACH(std::string sentence, sentences) {
|
2017-04-21 14:51:58 +02:00
|
|
|
result.push_back(_hashGenerator->generateHash(sentence,
|
|
|
|
byWhitespace));
|
2016-01-01 20:45:07 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
_hashGenerator->serializeWordMap();
|
|
|
|
} else {
|
|
|
|
BOOST_FOREACH(std::string sentence, sentences) {
|
2017-04-21 14:51:58 +02:00
|
|
|
result.push_back(_hashGenerator->generateTokens(sentence,
|
|
|
|
byWhitespace));
|
|
|
|
}
|
2015-08-19 20:49:26 +02:00
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2015-08-01 17:03:39 +02:00
|
|
|
|
2013-12-14 15:23:17 +01:00
|
|
|
// Sentences are written to disk and added to T.
|
2014-02-20 10:49:17 +01:00
|
|
|
// SA is generated on command by other methods.
|
2015-08-19 20:49:26 +02:00
|
|
|
TokenizedSentence Concordia::addExample(
|
2015-06-27 12:40:24 +02:00
|
|
|
const Example & example)
|
|
|
|
throw(ConcordiaException) {
|
2015-06-26 22:50:53 +02:00
|
|
|
return _index->addExample(_hashGenerator, _T, _markers, example);
|
2013-11-28 16:47:57 +01:00
|
|
|
}
|
|
|
|
|
2015-08-01 17:03:39 +02:00
|
|
|
// Sentences are written to disk and added to T.
|
|
|
|
// SA is generated on command by other methods.
|
|
|
|
void Concordia::addTokenizedExample(
|
2015-08-19 20:49:26 +02:00
|
|
|
const TokenizedSentence & tokenizedSentence,
|
|
|
|
const SUFFIX_MARKER_TYPE id)
|
2015-08-01 17:03:39 +02:00
|
|
|
throw(ConcordiaException) {
|
|
|
|
_index->addTokenizedExample(_hashGenerator, _T,
|
|
|
|
_markers, tokenizedSentence, id);
|
|
|
|
}
|
|
|
|
|
2015-08-19 20:49:26 +02:00
|
|
|
void Concordia::addAllTokenizedExamples(
|
|
|
|
const std::vector<TokenizedSentence> & tokenizedSentences,
|
|
|
|
const std::vector<SUFFIX_MARKER_TYPE> & ids)
|
|
|
|
throw(ConcordiaException) {
|
|
|
|
_index->addAllTokenizedExamples(_hashGenerator, _T,
|
|
|
|
_markers, tokenizedSentences, ids);
|
|
|
|
}
|
|
|
|
|
2015-08-01 17:03:39 +02:00
|
|
|
|
2013-12-14 15:23:17 +01:00
|
|
|
// Sentences are written to disk and added to T.
|
2014-02-20 10:49:17 +01:00
|
|
|
// SA is generated on command by other methods.
|
2015-06-27 12:40:24 +02:00
|
|
|
std::vector<TokenizedSentence> Concordia::addAllExamples(
|
|
|
|
const std::vector<Example> & examples)
|
|
|
|
throw(ConcordiaException) {
|
2015-06-26 22:50:53 +02:00
|
|
|
return _index->addAllExamples(_hashGenerator, _T, _markers, examples);
|
2013-12-06 22:29:25 +01:00
|
|
|
}
|
|
|
|
|
2013-12-14 15:23:17 +01:00
|
|
|
void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) {
|
2015-10-16 22:14:11 +02:00
|
|
|
if (boost::filesystem::exists(_getWordMapFilePath())
|
|
|
|
&& boost::filesystem::exists(_getHashedIndexFilePath())
|
|
|
|
&& boost::filesystem::exists(_getMarkersFilePath())) {
|
2014-02-20 10:49:17 +01:00
|
|
|
// reading index from file
|
2013-12-14 15:23:17 +01:00
|
|
|
_T->clear();
|
2015-04-15 14:14:10 +02:00
|
|
|
std::ifstream hashedIndexFile;
|
|
|
|
hashedIndexFile.open(
|
2015-10-16 22:14:11 +02:00
|
|
|
_getHashedIndexFilePath().c_str(), std::ios::in
|
2015-04-15 14:14:10 +02:00
|
|
|
| std::ios::ate | std::ios::binary);
|
2014-02-20 10:49:17 +01:00
|
|
|
saidx_t hiFileSize = hashedIndexFile.tellg();
|
|
|
|
if (hiFileSize > 0) {
|
2015-04-15 14:14:10 +02:00
|
|
|
hashedIndexFile.seekg(0, std::ios::beg);
|
2013-12-14 15:23:17 +01:00
|
|
|
|
|
|
|
while (!hashedIndexFile.eof()) {
|
|
|
|
INDEX_CHARACTER_TYPE character =
|
|
|
|
Utils::readIndexCharacter(hashedIndexFile);
|
|
|
|
Utils::appendCharToSaucharVector(_T, character);
|
|
|
|
}
|
|
|
|
hashedIndexFile.close();
|
|
|
|
} else {
|
2014-02-20 10:49:17 +01:00
|
|
|
hashedIndexFile.close();
|
2013-12-14 15:23:17 +01:00
|
|
|
throw ConcordiaException("Index corrupt: empty hash index file");
|
|
|
|
}
|
2014-02-20 10:49:17 +01:00
|
|
|
|
|
|
|
// reading markers from file
|
|
|
|
_markers->clear();
|
2015-04-15 14:14:10 +02:00
|
|
|
std::ifstream markersFile;
|
2015-10-16 22:14:11 +02:00
|
|
|
markersFile.open(_getMarkersFilePath().c_str(), std::ios::in
|
2015-04-15 14:14:10 +02:00
|
|
|
| std::ios::ate | std::ios::binary);
|
2014-02-20 10:49:17 +01:00
|
|
|
saidx_t maFileSize = markersFile.tellg();
|
|
|
|
if (maFileSize > 0) {
|
2015-04-15 14:14:10 +02:00
|
|
|
markersFile.seekg(0, std::ios::beg);
|
2014-02-20 10:49:17 +01:00
|
|
|
|
|
|
|
while (!markersFile.eof()) {
|
|
|
|
SUFFIX_MARKER_TYPE marker =
|
|
|
|
Utils::readMarker(markersFile);
|
|
|
|
_markers->push_back(marker);
|
|
|
|
}
|
|
|
|
markersFile.close();
|
|
|
|
} else {
|
|
|
|
markersFile.close();
|
|
|
|
throw ConcordiaException("Index corrupt: empty markers file");
|
|
|
|
}
|
|
|
|
// generating suffix array
|
2014-04-13 12:21:30 +02:00
|
|
|
_SA = _index->generateSuffixArray(_T);
|
2013-12-14 15:23:17 +01:00
|
|
|
} else {
|
|
|
|
throw ConcordiaException("Index corrupt: missing files");
|
|
|
|
}
|
2013-12-06 22:29:25 +01:00
|
|
|
}
|
|
|
|
|
2013-12-14 15:23:17 +01:00
|
|
|
void Concordia::refreshSAfromRAM() throw(ConcordiaException) {
|
2014-04-13 12:21:30 +02:00
|
|
|
_SA = _index->generateSuffixArray(_T);
|
2013-11-28 16:47:57 +01:00
|
|
|
}
|
|
|
|
|
2013-12-14 15:23:17 +01:00
|
|
|
|
|
|
|
void Concordia::_initializeIndex() throw(ConcordiaException) {
|
|
|
|
_hashGenerator = boost::shared_ptr<HashGenerator>(
|
2015-10-16 22:14:11 +02:00
|
|
|
new HashGenerator(_indexPath,
|
|
|
|
_config));
|
2013-12-14 15:23:17 +01:00
|
|
|
_T = boost::shared_ptr<std::vector<sauchar_t> >(
|
|
|
|
new std::vector<sauchar_t>);
|
2014-02-20 10:49:17 +01:00
|
|
|
_markers = boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> >(
|
|
|
|
new std::vector<SUFFIX_MARKER_TYPE>);
|
2015-10-16 22:14:11 +02:00
|
|
|
if (boost::filesystem::exists(_getWordMapFilePath())
|
|
|
|
&& boost::filesystem::exists(_getHashedIndexFilePath())) {
|
2013-12-14 15:23:17 +01:00
|
|
|
loadRAMIndexFromDisk();
|
2015-10-16 22:14:11 +02:00
|
|
|
} else if (!boost::filesystem::exists(_getWordMapFilePath())
|
|
|
|
&& !boost::filesystem::exists(_getHashedIndexFilePath())) {
|
2013-12-14 15:23:17 +01:00
|
|
|
// empty index
|
|
|
|
_SA = boost::shared_ptr<std::vector<saidx_t> >(
|
|
|
|
new std::vector<saidx_t>);
|
|
|
|
} else {
|
|
|
|
throw ConcordiaException("Index corrupt: missing files");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-01 13:36:54 +02:00
|
|
|
// Counts the occurrences of `pattern` by delegating to
// _searcher->countOccurences(). When T is empty the searcher is not
// consulted and 0 is returned.
// Declared to throw ConcordiaException.
SUFFIX_MARKER_TYPE Concordia::countOccurences(const std::string & pattern)
                                 throw(ConcordiaException) {
    if (_T->empty()) {
        return 0;
    }
    return _searcher->countOccurences(_hashGenerator, _T,
                                      _markers, _SA, pattern);
}
|
|
|
|
|
|
|
|
|
2017-04-21 14:51:58 +02:00
|
|
|
// Runs a simple search for `pattern` by delegating to
// _searcher->simpleSearch(); byWhitespace is forwarded unchanged.
// Declared to throw ConcordiaException.
MatchedPatternFragment Concordia::simpleSearch(
                                 const std::string & pattern,
                                 bool byWhitespace)
                                 throw(ConcordiaException) {
    // If the index or search pattern are empty, return an empty result.
    if (_T->empty() || pattern.empty()) {
        return MatchedPatternFragment(0, 0);
    }
    return _searcher->simpleSearch(_hashGenerator, _T,
                                   _markers, _SA, pattern, byWhitespace);
}
|
|
|
|
|
2019-01-09 15:30:56 +01:00
|
|
|
// Runs a full search for `pattern` by delegating to
// _searcher->fullSearch(); limit, offset and byWhitespace are forwarded
// unchanged.
// Declared to throw ConcordiaException.
OccurencesList Concordia::fullSearch(
                                 const std::string & pattern,
                                 int limit,
                                 int offset,
                                 bool byWhitespace)
                                 throw(ConcordiaException) {
    // If the index or search pattern are empty, return an empty result.
    if (_T->empty() || pattern.empty()) {
        return OccurencesList(0);
    }
    return _searcher->fullSearch(_hashGenerator, _T,
                                 _markers, _SA, pattern,
                                 limit, offset, byWhitespace);
}
|
|
|
|
|
|
|
|
|
2017-10-10 15:39:47 +02:00
|
|
|
// Runs a lexicon search for `pattern` by delegating to
// _searcher->lexiconSearch(); byWhitespace is forwarded unchanged.
// Declared to throw ConcordiaException.
MatchedPatternFragment Concordia::lexiconSearch(
                                 const std::string & pattern,
                                 bool byWhitespace)
                                 throw(ConcordiaException) {
    // If the index or search pattern are empty, return an empty result.
    // Especially performing the lexicon search with an empty pattern
    // would not be funny, as it would effectively search for double EOS,
    // which is very frequent in the index.
    if (_T->empty() || pattern.empty()) {
        return MatchedPatternFragment(0, 0);
    }
    return _searcher->lexiconSearch(_hashGenerator, _T,
                                    _markers, _SA, pattern, byWhitespace);
}
|
|
|
|
|
2015-04-15 10:55:26 +02:00
|
|
|
std::vector<AnubisSearchResult> Concordia::anubisSearch(
|
2015-04-15 14:14:10 +02:00
|
|
|
const std::string & pattern)
|
2015-04-12 12:06:41 +02:00
|
|
|
throw(ConcordiaException) {
|
|
|
|
if (_T->size() > 0) {
|
2015-04-16 11:39:39 +02:00
|
|
|
return _searcher->anubisSearch(_config, _hashGenerator, _T,
|
2015-04-15 10:55:26 +02:00
|
|
|
_markers, _SA, pattern);
|
2015-04-12 12:06:41 +02:00
|
|
|
} else {
|
2015-04-15 10:55:26 +02:00
|
|
|
std::vector<AnubisSearchResult> result;
|
2015-04-12 12:06:41 +02:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-04-17 14:17:59 +02:00
|
|
|
// Runs a concordia search for `pattern` by delegating to
// _searcher->concordiaSearch(); byWhitespace is forwarded unchanged.
// When T is empty the searcher is not consulted; instead a
// ConcordiaSearchResult built from a TokenizedSentence over an empty
// string is returned.
// Declared to throw ConcordiaException.
boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
                                 const std::string & pattern,
                                 bool byWhitespace)
                                 throw(ConcordiaException) {
    if (_T->empty()) {
        std::string emptySentence;
        boost::shared_ptr<ConcordiaSearchResult> emptyResult(
            new ConcordiaSearchResult(TokenizedSentence(emptySentence)));
        return emptyResult;
    }
    return _searcher->concordiaSearch(_hashGenerator, _T,
                                      _markers, _SA, pattern, byWhitespace);
}
|
|
|
|
|
2015-05-04 20:40:44 +02:00
|
|
|
void Concordia::clearIndex() throw(ConcordiaException) {
|
|
|
|
_hashGenerator->clearWordMap();
|
|
|
|
_T = boost::shared_ptr<std::vector<sauchar_t> >(
|
|
|
|
new std::vector<sauchar_t>);
|
|
|
|
_markers = boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> >(
|
|
|
|
new std::vector<SUFFIX_MARKER_TYPE>);
|
|
|
|
_SA = boost::shared_ptr<std::vector<saidx_t> >(
|
|
|
|
new std::vector<saidx_t>);
|
|
|
|
|
2015-10-16 22:14:11 +02:00
|
|
|
boost::filesystem::remove(_getHashedIndexFilePath());
|
|
|
|
boost::filesystem::remove(_getMarkersFilePath());
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string Concordia::_getWordMapFilePath() {
|
|
|
|
return _indexPath+"/"+WORD_MAP_FILE_NAME;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string Concordia::_getHashedIndexFilePath() {
|
|
|
|
return _indexPath+"/"+HASHED_INDEX_FILE_NAME;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string Concordia::_getMarkersFilePath() {
|
|
|
|
return _indexPath+"/"+MARKERS_FILE_NAME;
|
2015-05-04 20:40:44 +02:00
|
|
|
}
|