// concordia-library/concordia/concordia.cpp
#include <sstream>

#include <boost/foreach.hpp>

#include "concordia/concordia.hpp"
#include "concordia/common/config.hpp"
#include "concordia/common/utils.hpp"
// ===========================================
// Helper that assembles the "major.minor" version string (defined below).
std::string _createLibraryVersion();
// ===========================================
// Static version string, computed once during static initialization.
std::string Concordia::_libraryVersion = _createLibraryVersion();
// ===========================================
// Default constructor: creates an engine with no configuration and no
// index structures; _initializeIndex() is not called here.
Concordia::Concordia() {
}
// Constructs a Concordia engine operating on the given index directory,
// reading its settings from the given configuration file. Builds the
// config, the on-disk index handler and the searcher, then loads or
// initializes the in-memory index structures.
Concordia::Concordia(const std::string & indexPath,
                     const std::string & configFilePath):
                                     _indexPath(indexPath) {
    _config = boost::shared_ptr<ConcordiaConfig>(
                         new ConcordiaConfig(configFilePath));
    _index = boost::shared_ptr<ConcordiaIndex>(
                         new ConcordiaIndex(_getHashedIndexFilePath(),
                                            _getMarkersFilePath()));
    _searcher = boost::shared_ptr<IndexSearcher>(new IndexSearcher());
    _initializeIndex();
}
// Destructor: nothing to do explicitly -- all resources are owned by
// boost::shared_ptr members and released automatically.
Concordia::~Concordia() {
}
// Returns the library version string ("major.minor").
// NOTE(review): returns a non-const reference to the static -- callers
// could mutate it; kept as-is to preserve the public interface.
std::string & Concordia::getVersion() {
    return _libraryVersion;
}
// Assembles the version string from the numeric version macros
// (CONCORDIA_VERSION_MAJOR "." CONCORDIA_VERSION_MINOR).
std::string _createLibraryVersion() {
    std::stringstream versionStream;
    versionStream << CONCORDIA_VERSION_MAJOR
                  << "."
                  << CONCORDIA_VERSION_MINOR;
    return versionStream.str();
}
2015-08-19 20:49:26 +02:00
TokenizedSentence
2015-12-27 20:54:40 +01:00
Concordia::tokenize(const std::string & sentence,
2016-01-01 20:45:07 +01:00
bool byWhitespace,
2019-01-18 13:30:51 +01:00
bool generateCodes) {
2016-01-01 20:45:07 +01:00
if (generateCodes) {
TokenizedSentence result =
_hashGenerator->generateHash(sentence, byWhitespace);
_hashGenerator->serializeWordMap();
return result;
} else {
return _hashGenerator->generateTokens(sentence, byWhitespace);
}
}
2015-08-19 20:49:26 +02:00
std::vector<TokenizedSentence> Concordia::tokenizeAll(
2015-12-27 20:54:40 +01:00
const std::vector<std::string> & sentences,
2016-01-01 20:45:07 +01:00
bool byWhitespace,
2019-01-18 13:30:51 +01:00
bool generateCodes) {
2015-08-19 20:49:26 +02:00
std::vector<TokenizedSentence> result;
2017-04-21 14:51:58 +02:00
2016-01-01 20:45:07 +01:00
if (generateCodes) {
BOOST_FOREACH(std::string sentence, sentences) {
2017-04-21 14:51:58 +02:00
result.push_back(_hashGenerator->generateHash(sentence,
byWhitespace));
2016-01-01 20:45:07 +01:00
}
_hashGenerator->serializeWordMap();
} else {
BOOST_FOREACH(std::string sentence, sentences) {
2017-04-21 14:51:58 +02:00
result.push_back(_hashGenerator->generateTokens(sentence,
byWhitespace));
}
2015-08-19 20:49:26 +02:00
}
return result;
}
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
2015-08-19 20:49:26 +02:00
TokenizedSentence Concordia::addExample(
2019-01-18 13:30:51 +01:00
const Example & example) {
2015-06-26 22:50:53 +02:00
return _index->addExample(_hashGenerator, _T, _markers, example);
2013-11-28 16:47:57 +01:00
}
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
void Concordia::addTokenizedExample(
2015-08-19 20:49:26 +02:00
const TokenizedSentence & tokenizedSentence,
2019-01-18 13:30:51 +01:00
const SUFFIX_MARKER_TYPE id) {
_index->addTokenizedExample(_hashGenerator, _T,
_markers, tokenizedSentence, id);
}
2015-08-19 20:49:26 +02:00
void Concordia::addAllTokenizedExamples(
const std::vector<TokenizedSentence> & tokenizedSentences,
2019-01-18 13:30:51 +01:00
const std::vector<SUFFIX_MARKER_TYPE> & ids) {
2015-08-19 20:49:26 +02:00
_index->addAllTokenizedExamples(_hashGenerator, _T,
_markers, tokenizedSentences, ids);
}
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
2015-06-27 12:40:24 +02:00
std::vector<TokenizedSentence> Concordia::addAllExamples(
2019-01-18 13:30:51 +01:00
const std::vector<Example> & examples) {
2015-06-26 22:50:53 +02:00
return _index->addAllExamples(_hashGenerator, _T, _markers, examples);
}
2019-01-18 13:30:51 +01:00
void Concordia::loadRAMIndexFromDisk() {
if (boost::filesystem::exists(_getWordMapFilePath())
&& boost::filesystem::exists(_getHashedIndexFilePath())
&& boost::filesystem::exists(_getMarkersFilePath())) {
// reading index from file
_T->clear();
std::ifstream hashedIndexFile;
hashedIndexFile.open(
_getHashedIndexFilePath().c_str(), std::ios::in
| std::ios::ate | std::ios::binary);
saidx_t hiFileSize = hashedIndexFile.tellg();
if (hiFileSize > 0) {
hashedIndexFile.seekg(0, std::ios::beg);
while (!hashedIndexFile.eof()) {
INDEX_CHARACTER_TYPE character =
Utils::readIndexCharacter(hashedIndexFile);
Utils::appendCharToSaucharVector(_T, character);
}
hashedIndexFile.close();
} else {
hashedIndexFile.close();
throw ConcordiaException("Index corrupt: empty hash index file");
}
// reading markers from file
_markers->clear();
std::ifstream markersFile;
markersFile.open(_getMarkersFilePath().c_str(), std::ios::in
| std::ios::ate | std::ios::binary);
saidx_t maFileSize = markersFile.tellg();
if (maFileSize > 0) {
markersFile.seekg(0, std::ios::beg);
while (!markersFile.eof()) {
SUFFIX_MARKER_TYPE marker =
Utils::readMarker(markersFile);
_markers->push_back(marker);
}
markersFile.close();
} else {
markersFile.close();
throw ConcordiaException("Index corrupt: empty markers file");
}
// generating suffix array
_SA = _index->generateSuffixArray(_T);
} else {
throw ConcordiaException("Index corrupt: missing files");
}
}
2019-01-18 13:30:51 +01:00
void Concordia::refreshSAfromRAM() {
_SA = _index->generateSuffixArray(_T);
2013-11-28 16:47:57 +01:00
}
2019-01-18 13:30:51 +01:00
void Concordia::_initializeIndex() {
_hashGenerator = boost::shared_ptr<HashGenerator>(
new HashGenerator(_indexPath,
_config));
_T = boost::shared_ptr<std::vector<sauchar_t> >(
new std::vector<sauchar_t>);
_markers = boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> >(
new std::vector<SUFFIX_MARKER_TYPE>);
if (boost::filesystem::exists(_getWordMapFilePath())
&& boost::filesystem::exists(_getHashedIndexFilePath())) {
loadRAMIndexFromDisk();
} else if (!boost::filesystem::exists(_getWordMapFilePath())
&& !boost::filesystem::exists(_getHashedIndexFilePath())) {
// empty index
_SA = boost::shared_ptr<std::vector<saidx_t> >(
new std::vector<saidx_t>);
} else {
throw ConcordiaException("Index corrupt: missing files");
}
}
2019-01-22 14:07:28 +01:00
SUFFIX_MARKER_TYPE Concordia::countOccurrences(const std::string & pattern) {
2015-10-01 13:36:54 +02:00
if (_T->size() > 0) {
2019-01-22 14:07:28 +01:00
return _searcher->countOccurrences(_hashGenerator, _T,
2015-10-01 13:36:54 +02:00
_markers, _SA, pattern);
} else {
return 0;
}
}
2017-04-21 14:51:58 +02:00
MatchedPatternFragment Concordia::simpleSearch(
const std::string & pattern,
2019-01-18 13:30:51 +01:00
bool byWhitespace) {
2017-10-10 15:39:47 +02:00
if (_T->size() > 0 && pattern.size() > 0) {
return _searcher->simpleSearch(_hashGenerator, _T,
_markers, _SA, pattern, byWhitespace);
} else {
2017-10-10 15:39:47 +02:00
// If the index or search pattern are empty, return an empty result.
MatchedPatternFragment result(0, 0);
return result;
}
}
2019-01-22 14:07:28 +01:00
OccurrencesList Concordia::fullSearch(
2019-01-09 15:30:56 +01:00
const std::string & pattern,
2019-01-09 18:31:52 +01:00
int limit,
int offset,
2019-01-18 13:30:51 +01:00
bool byWhitespace) {
2019-01-09 15:30:56 +01:00
if (_T->size() > 0 && pattern.size() > 0) {
return _searcher->fullSearch(_hashGenerator, _T,
_markers, _SA, pattern, limit, offset, byWhitespace);
} else {
// If the index or search pattern are empty, return an empty result.
2019-01-22 14:07:28 +01:00
OccurrencesList result(0);
2019-01-09 15:30:56 +01:00
return result;
}
}
2017-10-10 15:39:47 +02:00
MatchedPatternFragment Concordia::lexiconSearch(
const std::string & pattern,
2019-01-18 13:30:51 +01:00
bool byWhitespace) {
2017-10-10 15:39:47 +02:00
if (_T->size() > 0 && pattern.size() > 0) {
return _searcher->lexiconSearch(_hashGenerator, _T,
_markers, _SA, pattern, byWhitespace);
} else {
// If the index or search pattern are empty, return an empty result.
// Especially performing the lexicon search with an empty pattern
// would not be funny, as it would effectively search for double EOS,
// which is very frequent in the index.
2017-04-21 14:51:58 +02:00
MatchedPatternFragment result(0, 0);
return result;
}
2013-11-28 16:47:57 +01:00
}
std::vector<AnubisSearchResult> Concordia::anubisSearch(
2019-01-18 13:30:51 +01:00
const std::string & pattern) {
if (_T->size() > 0) {
return _searcher->anubisSearch(_config, _hashGenerator, _T,
_markers, _SA, pattern);
} else {
std::vector<AnubisSearchResult> result;
return result;
}
}
// Performs a concordia search for the pattern. When the index is
// empty, a result built over an empty tokenized sentence is returned.
boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
                              const std::string & pattern,
                              bool byWhitespace) {
    if (_T->size() == 0) {
        std::string emptySentence;
        return boost::shared_ptr<ConcordiaSearchResult>(
             new ConcordiaSearchResult(TokenizedSentence(emptySentence)));
    }
    return _searcher->concordiaSearch(_hashGenerator, _T,
                                      _markers, _SA, pattern, byWhitespace);
}
2019-01-18 13:30:51 +01:00
void Concordia::clearIndex() {
2015-05-04 20:40:44 +02:00
_hashGenerator->clearWordMap();
_T = boost::shared_ptr<std::vector<sauchar_t> >(
new std::vector<sauchar_t>);
_markers = boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> >(
new std::vector<SUFFIX_MARKER_TYPE>);
_SA = boost::shared_ptr<std::vector<saidx_t> >(
new std::vector<saidx_t>);
boost::filesystem::remove(_getHashedIndexFilePath());
boost::filesystem::remove(_getMarkersFilePath());
}
// Path of the word map file inside the index directory.
std::string Concordia::_getWordMapFilePath() {
    return _indexPath + "/" + WORD_MAP_FILE_NAME;
}
// Path of the hashed index file inside the index directory.
std::string Concordia::_getHashedIndexFilePath() {
    return _indexPath + "/" + HASHED_INDEX_FILE_NAME;
}
// Path of the markers file inside the index directory.
std::string Concordia::_getMarkersFilePath() {
    return _indexPath + "/" + MARKERS_FILE_NAME;
}