2013-11-28 16:47:57 +01:00
|
|
|
#include "concordia/index_searcher.hpp"
|
|
|
|
|
2013-12-06 22:29:25 +01:00
|
|
|
#include "concordia/common/utils.hpp"
|
2013-11-28 16:47:57 +01:00
|
|
|
#include <boost/filesystem.hpp>
|
|
|
|
|
|
|
|
// Creates a searcher with no index loaded: both array pointers start out
// null and the recorded size is zero. loadIndex() fills them in.
IndexSearcher::IndexSearcher()
    : _T(NULL), _SA(NULL), _n(0) {}
|
|
|
|
|
|
|
|
|
|
|
|
// Releases the arrays allocated with new[] in loadIndex(). Both pointers
// are initialised to NULL by the constructor, so destroying a searcher that
// never loaded an index is safe (delete[] on a null pointer is a no-op).
//
// NOTE(review): this assumes IndexSearcher objects are never copied; a
// copy would share the raw pointers and free them twice. Confirm against
// the class declaration.
IndexSearcher::~IndexSearcher() {
    delete[] _T;
    delete[] _SA;
}
|
|
|
|
|
|
|
|
|
|
|
|
void IndexSearcher::loadIndex(const string & wordMapFilepath,
|
|
|
|
const string & hashedIndexFilepath,
|
|
|
|
const string & suffixArrayFilepath)
|
|
|
|
throw(ConcordiaException) {
|
|
|
|
if (!boost::filesystem::exists(wordMapFilepath)) {
|
|
|
|
throw ConcordiaException("E06: Failed to open word map "
|
|
|
|
"file for reading.");
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!boost::filesystem::exists(hashedIndexFilepath)) {
|
|
|
|
throw ConcordiaException("E07: Failed to open hashed index file "
|
|
|
|
"for reading.");
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!boost::filesystem::exists(suffixArrayFilepath)) {
|
|
|
|
throw ConcordiaException("E08: Failed to open suffix array file "
|
|
|
|
"for reading.");
|
|
|
|
}
|
|
|
|
|
|
|
|
_hashGenerator = boost::shared_ptr<HashGenerator>(
|
|
|
|
new HashGenerator(wordMapFilepath));
|
|
|
|
|
|
|
|
ifstream hashedIndexFile;
|
|
|
|
hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::in
|
|
|
|
| ios::ate | ios::binary);
|
2013-12-06 22:29:25 +01:00
|
|
|
_n = hashedIndexFile.tellg();
|
2013-11-28 16:47:57 +01:00
|
|
|
hashedIndexFile.seekg(0, ios::beg);
|
2013-12-06 22:29:25 +01:00
|
|
|
_T = new sauchar_t[_n];
|
2013-12-01 23:34:46 +01:00
|
|
|
int pos = 0;
|
|
|
|
while (!hashedIndexFile.eof()) {
|
2013-12-06 22:29:25 +01:00
|
|
|
INDEX_CHARACTER_TYPE character =
|
|
|
|
Utils::readIndexCharacter(hashedIndexFile);
|
|
|
|
Utils::insertCharToSaucharArray(_T, character, pos);
|
|
|
|
pos+=sizeof(character);
|
2013-12-01 23:34:46 +01:00
|
|
|
}
|
2013-11-28 16:47:57 +01:00
|
|
|
hashedIndexFile.close();
|
|
|
|
|
|
|
|
_SA = new saidx_t[_n];
|
|
|
|
|
|
|
|
ifstream suffixArrayFile;
|
|
|
|
suffixArrayFile.open(suffixArrayFilepath.c_str(), ios::in | ios::binary);
|
|
|
|
|
2013-12-01 23:34:46 +01:00
|
|
|
saidx_t saidx_buff;
|
|
|
|
pos = 0;
|
2013-11-28 16:47:57 +01:00
|
|
|
while (!suffixArrayFile.eof() && pos < _n) {
|
2013-12-06 22:29:25 +01:00
|
|
|
suffixArrayFile.read(reinterpret_cast<char *>(&saidx_buff),
|
|
|
|
sizeof(saidx_t));
|
2013-12-01 23:34:46 +01:00
|
|
|
_SA[pos++] = saidx_buff;
|
2013-11-28 16:47:57 +01:00
|
|
|
}
|
|
|
|
suffixArrayFile.close();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Looks up `pattern` in the loaded index and returns the positions of its
// occurrences, expressed in INDEX_CHARACTER_TYPE units.
//
// The pattern is hashed with _hashGenerator, converted to a sauchar_t
// array and searched for with sa_search() over _T and _SA.
vector<saidx_t> IndexSearcher::simpleSearch(const string & pattern)
                                            throw(ConcordiaException) {
    vector<INDEX_CHARACTER_TYPE> patternHash =
                                    _hashGenerator->generateHash(pattern);
    sauchar_t * patternBytes = Utils::indexVectorToSaucharArray(patternHash);
    saidx_t patternByteLength =
                        patternHash.size()*sizeof(INDEX_CHARACTER_TYPE);

    // sa_search() returns the number of matches and stores the index of
    // the first matching suffix array entry in firstMatch.
    int firstMatch;
    int matchCount = sa_search(_T, static_cast<saidx_t>(_n),
                               patternBytes, patternByteLength,
                               _SA, static_cast<saidx_t>(_n), &firstMatch);

    vector<saidx_t> result;
    for (int offset = 0; offset < matchCount; ++offset) {
        saidx_t bytePos = _SA[firstMatch + offset];
        // The search runs over an array of finer resolution than the
        // hashed index file, so a match may begin in the middle of an
        // index character. Such accidental hits are skipped.
        if (bytePos % sizeof(INDEX_CHARACTER_TYPE) != 0) {
            continue;
        }
        result.push_back(bytePos / sizeof(INDEX_CHARACTER_TYPE));
    }

    delete[] patternBytes;
    return result;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|