2013-11-28 16:47:57 +01:00
|
|
|
#include "concordia/index_searcher.hpp"
|
|
|
|
|
2013-12-06 22:29:25 +01:00
|
|
|
#include "concordia/common/utils.hpp"
|
2013-11-28 16:47:57 +01:00
|
|
|
#include <boost/filesystem.hpp>
|
|
|
|
|
2013-12-14 15:23:17 +01:00
|
|
|
// Creates the searcher together with a fresh AnubisSearcher instance that
// anubisSearch() delegates to.
IndexSearcher::IndexSearcher() {
    _anubisSearcher.reset(new AnubisSearcher());
}
|
|
|
|
|
|
|
|
|
|
|
|
// The destructor body is intentionally empty: no explicit cleanup is
// performed here.
IndexSearcher::~IndexSearcher() {}
|
|
|
|
|
2014-02-20 10:49:17 +01:00
|
|
|
// Looks up all occurrences of the given pattern in the index.
//
// The pattern is hashed with hashGenerator, converted to an array of
// sauchar_t and searched for with sa_search() in the text T using the
// suffix array SA. Every hit that starts on an INDEX_CHARACTER_TYPE
// boundary is mapped through markers and reported as a SubstringOccurence
// constructed from (marker / SUFFIX_MARKER_DIVISOR,
// marker % SUFFIX_MARKER_DIVISOR).
//
// Parameters:
//   hashGenerator - generator used to hash the pattern
//   T             - the text passed to sa_search()
//   markers       - suffix markers, indexed by hashed-character position
//   SA            - suffix array passed to sa_search()
//   pattern       - the pattern to look for
//
// Returns the (possibly empty) collection of found occurrences.
boost::ptr_vector<SubstringOccurence> IndexSearcher::simpleSearch(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const string & pattern) throw(ConcordiaException) {
    boost::ptr_vector<SubstringOccurence> result;

    boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash =
        hashGenerator->generateHash(pattern);
    // sa_search() operates on single sauchar_t units, so the pattern length
    // is expressed in those units rather than in hashed characters.
    const saidx_t patternLength = static_cast<saidx_t>(
        hash->size() * sizeof(INDEX_CHARACTER_TYPE));

    // Raw array owned by this function (released with delete[] below;
    // presumably allocated with new[] by the helper - verify in Utils).
    sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);

    // sa_search() reports the number of hits and, through 'left', the index
    // of the first hit in SA; both are saidx_t in its interface.
    saidx_t left = 0;
    const saidx_t size = sa_search(
        T->data(), static_cast<saidx_t>(T->size()),
        patternArray, patternLength,
        SA->data(), static_cast<saidx_t>(SA->size()), &left);

    // The pattern buffer is only needed by sa_search(). Release it right
    // away, so that an exception thrown while collecting the results
    // (at(), allocation) cannot leak it.
    delete[] patternArray;

    for (saidx_t i = 0; i < size; ++i) {
        const saidx_t resultPos = SA->at(left + i);
        // As we are looking for a pattern in an array of higher
        // resolution than the hashed index file, we might
        // obtain accidental results exceeding the boundaries
        // of characters in hashed index. The check below
        // removes these accidental results.
        if (resultPos % sizeof(INDEX_CHARACTER_TYPE) != 0) {
            continue;
        }

        const saidx_t actualResultPos =
            resultPos / sizeof(INDEX_CHARACTER_TYPE);
        const SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);

        // TODO(rafalj): think about using bitwise operators
        // in the below code
        result.push_back(new SubstringOccurence(
            marker / SUFFIX_MARKER_DIVISOR,
            marker % SUFFIX_MARKER_DIVISOR));
    }

    return result;
}
|
|
|
|
|
2014-03-11 14:29:30 +01:00
|
|
|
// Runs the anubis search for the given pattern.
//
// The pattern is first hashed with hashGenerator; the resulting hash is
// then handed, together with T, markers and SA, to the AnubisSearcher held
// by this object, whose result is returned unchanged.
boost::ptr_vector<AnubisSearchResult> IndexSearcher::anubisSearch(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const string & pattern) throw(ConcordiaException) {
    boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > patternHash(
        hashGenerator->generateHash(pattern));

    return _anubisSearcher->anubisSearch(T, markers, SA, patternHash);
}
|