2013-11-28 16:47:57 +01:00
|
|
|
#include <sstream>
|
|
|
|
|
2013-10-24 17:08:58 +02:00
|
|
|
#include "concordia/concordia.hpp"
|
|
|
|
#include "concordia/common/config.hpp"
|
2013-12-14 15:23:17 +01:00
|
|
|
#include "concordia/common/utils.hpp"
|
2013-10-24 17:08:58 +02:00
|
|
|
|
|
|
|
// ===========================================
|
|
|
|
|
|
|
|
// Forward declaration of the helper, defined later in this file, that builds
// the library version string. It is declared here because the initializer of
// Concordia::_libraryVersion calls it before its definition appears.
std::string _createLibraryVersion();
|
|
|
|
|
|
|
|
// ===========================================
|
|
|
|
|
|
|
|
// Definition of the static member that stores the library version string.
// Its value is produced by _createLibraryVersion() and is the string that
// Concordia::getVersion() returns a reference to.
std::string Concordia::_libraryVersion = _createLibraryVersion();
|
|
|
|
|
|
|
|
// ===========================================
|
|
|
|
|
2013-11-28 16:47:57 +01:00
|
|
|
// Creates a Concordia instance from the configuration file located at
// configFilePath.
//
// The steps run in a fixed order because each one relies on the previous:
//   1. load the configuration,
//   2. create the index object for the configured hashed index file path,
//   3. create the searcher,
//   4. initialize the in-memory index through _initializeIndex().
//
// Declared as possibly throwing ConcordiaException.
Concordia::Concordia(const std::string & configFilePath)
                                         throw(ConcordiaException) {
    _config.reset(new ConcordiaConfig(configFilePath));
    _index.reset(new ConcordiaIndex(_config->getHashedIndexFilePath()));
    _searcher.reset(new IndexSearcher());
    _initializeIndex();
}
|
|
|
|
|
|
|
|
// Destructor. The body is intentionally empty: no explicit cleanup is
// performed here.
Concordia::~Concordia() {}
|
|
|
|
|
|
|
|
// Returns a reference to the static library version string
// (Concordia::_libraryVersion), which is initialized from
// _createLibraryVersion().
std::string & Concordia::getVersion() {
    return Concordia::_libraryVersion;
}
|
|
|
|
|
|
|
|
// Builds the library version string in the form "<major>.<minor>" from the
// CONCORDIA_VERSION_MAJOR and CONCORDIA_VERSION_MINOR macros.
std::string _createLibraryVersion() {
    std::ostringstream formatted;
    formatted << CONCORDIA_VERSION_MAJOR;
    formatted << ".";
    formatted << CONCORDIA_VERSION_MINOR;
    return formatted.str();
}
|
|
|
|
|
2013-12-14 15:23:17 +01:00
|
|
|
// Sentences are written to disk and added to T.
|
|
|
|
// SA is generated on command by different methods.
|
2013-11-28 16:47:57 +01:00
|
|
|
void Concordia::addSentence(const std::string & sentence)
|
|
|
|
throw(ConcordiaException) {
|
2013-12-14 15:23:17 +01:00
|
|
|
_index->addSentence(_hashGenerator, _T, sentence);
|
2013-11-28 16:47:57 +01:00
|
|
|
}
|
|
|
|
|
2013-12-14 15:23:17 +01:00
|
|
|
// Sentences are written to disk and added to T.
|
|
|
|
// SA is generated on command by different methods.
|
|
|
|
void Concordia::addAllSentences(
|
|
|
|
boost::shared_ptr<std::vector<std::string> > sentences)
|
|
|
|
throw(ConcordiaException) {
|
|
|
|
_index->addAllSentences(_hashGenerator, _T, sentences);
|
2013-12-06 22:29:25 +01:00
|
|
|
}
|
|
|
|
|
2013-12-14 15:23:17 +01:00
|
|
|
void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) {
|
|
|
|
if (boost::filesystem::exists(_config->getWordMapFilePath())
|
|
|
|
&& boost::filesystem::exists(_config->getHashedIndexFilePath())) {
|
|
|
|
// reading index from files
|
|
|
|
_T->clear();
|
|
|
|
ifstream hashedIndexFile;
|
|
|
|
hashedIndexFile.open(_config->getHashedIndexFilePath().c_str(), ios::in
|
|
|
|
| ios::ate | ios::binary);
|
|
|
|
saidx_t fileSize = hashedIndexFile.tellg();
|
|
|
|
if (fileSize > 0) {
|
|
|
|
hashedIndexFile.seekg(0, ios::beg);
|
|
|
|
|
|
|
|
while (!hashedIndexFile.eof()) {
|
|
|
|
INDEX_CHARACTER_TYPE character =
|
|
|
|
Utils::readIndexCharacter(hashedIndexFile);
|
|
|
|
Utils::appendCharToSaucharVector(_T, character);
|
|
|
|
}
|
|
|
|
hashedIndexFile.close();
|
|
|
|
|
|
|
|
// generating suffix array
|
|
|
|
_SA = _index->generateSuffixArray(_hashGenerator, _T);
|
|
|
|
} else {
|
|
|
|
throw ConcordiaException("Index corrupt: empty hash index file");
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
throw ConcordiaException("Index corrupt: missing files");
|
|
|
|
}
|
2013-12-06 22:29:25 +01:00
|
|
|
}
|
|
|
|
|
2013-12-14 15:23:17 +01:00
|
|
|
void Concordia::refreshSAfromRAM() throw(ConcordiaException) {
|
|
|
|
_SA = _index->generateSuffixArray(_hashGenerator, _T);
|
2013-11-28 16:47:57 +01:00
|
|
|
}
|
|
|
|
|
2013-12-14 15:23:17 +01:00
|
|
|
|
|
|
|
void Concordia::_initializeIndex() throw(ConcordiaException) {
|
|
|
|
_hashGenerator = boost::shared_ptr<HashGenerator>(
|
|
|
|
new HashGenerator(_config->getWordMapFilePath()));
|
|
|
|
_T = boost::shared_ptr<std::vector<sauchar_t> >(
|
|
|
|
new std::vector<sauchar_t>);
|
|
|
|
if (boost::filesystem::exists(_config->getWordMapFilePath())
|
|
|
|
&& boost::filesystem::exists(_config->getHashedIndexFilePath())) {
|
|
|
|
loadRAMIndexFromDisk();
|
|
|
|
} else if (!boost::filesystem::exists(_config->getWordMapFilePath())
|
|
|
|
&& !boost::filesystem::exists(_config->getHashedIndexFilePath())) {
|
|
|
|
// empty index
|
|
|
|
_SA = boost::shared_ptr<std::vector<saidx_t> >(
|
|
|
|
new std::vector<saidx_t>);
|
|
|
|
} else {
|
|
|
|
throw ConcordiaException("Index corrupt: missing files");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Searches the index for the given pattern.
//
// When the in-memory text T is empty there is nothing to search, so a newly
// allocated empty result vector is returned without consulting the searcher.
// Otherwise the query is delegated to the searcher together with the hash
// generator, T and the suffix array SA, and its result is returned.
//
// Declared as possibly throwing ConcordiaException.
boost::shared_ptr<std::vector<saidx_t> > Concordia::simpleSearch(
                                         const string & pattern)
                                         throw(ConcordiaException) {
    if (_T->empty()) {
        return boost::shared_ptr<std::vector<saidx_t> >(
                                         new std::vector<saidx_t>);
    }
    return _searcher->simpleSearch(_hashGenerator, _T, _SA, pattern);
}
|
|
|
|
|
|
|
|
|