2014-05-14 16:29:44 +02:00
|
|
|
#include "concordia/anubis_searcher.hpp"
|
2014-06-24 18:23:46 +02:00
|
|
|
#include "concordia/tm_matches.hpp"
|
2014-05-14 16:29:44 +02:00
|
|
|
|
2014-06-24 18:23:46 +02:00
|
|
|
#include <boost/ptr_container/ptr_map.hpp>
|
|
|
|
#include <boost/foreach.hpp>
|
|
|
|
#include <iostream>
|
|
|
|
#include <map>
|
2014-05-15 22:20:31 +02:00
|
|
|
|
2014-06-24 18:23:46 +02:00
|
|
|
// Owning map from a SUFFIX_MARKER_TYPE key to a TmMatches object.
// In this file it is looked up by the id of a SubstringOccurence.
typedef boost::ptr_map<SUFFIX_MARKER_TYPE, TmMatches> TmMatchesMap;
|
|
|
|
// Mutable iterator over a TmMatchesMap (as returned by find()/end()).
typedef TmMatchesMap::iterator TmMatchesMapIterator;
|
2014-05-14 16:29:44 +02:00
|
|
|
|
|
|
|
// Default constructor. Intentionally empty: it performs no work of its own.
AnubisSearcher::AnubisSearcher() {}
|
|
|
|
|
|
|
|
|
|
|
|
// Destructor. Intentionally empty: it performs no explicit clean-up.
AnubisSearcher::~AnubisSearcher() {}
|
|
|
|
|
|
|
|
|
|
|
|
// Runs the anubis search for `pattern` against the given index.
//
// The pattern is first converted to a sauchar_t vector with
// Utils::indexVectorToSaucharVector. Then, for every character offset of
// the original pattern, the suffix of the converted pattern starting at
// that offset is handed to lcpSearch(), and the id of every occurrence
// returned is looked up in a local TmMatchesMap.
//
// NOTE: the implementation is unfinished. Nothing is ever inserted into
// the map or into the result, so the returned vector is always empty.
//
// Parameters:
//   T       - text data, forwarded unchanged to lcpSearch()
//   markers - suffix markers, forwarded unchanged to lcpSearch()
//   SA      - suffix array, forwarded unchanged to lcpSearch()
//   pattern - the search pattern as INDEX_CHARACTER_TYPE values
//
// Returns: the vector of search results (currently always empty).
//
// Throws: ConcordiaException when the converted pattern does not hold
//         exactly sizeof(INDEX_CHARACTER_TYPE) elements per pattern
//         character; exceptions thrown by lcpSearch() propagate as well.
boost::ptr_vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
        throw(ConcordiaException) {
    boost::ptr_vector<AnubisSearchResult> result;

    boost::shared_ptr<std::vector<sauchar_t> > patternVector =
        Utils::indexVectorToSaucharVector(pattern);

    // Number of sauchar_t elements that represent one pattern character.
    const std::size_t charWidth = sizeof(INDEX_CHARACTER_TYPE);
    const std::size_t patternSize = pattern->size();

    if (patternVector->size() != patternSize * charWidth) {
        throw ConcordiaException("Increasing pattern resolution went wrong.");
    }

    TmMatchesMap tmMatchesMap;

    // Unsigned counters: the bound is a container size, and the offset into
    // the converted pattern must not be narrowed to int.
    for (std::size_t offset = 0; offset < patternSize; ++offset) {
        const std::size_t highResOffset = offset * charWidth;

        // Suffix of the converted pattern that starts at the current offset.
        boost::shared_ptr<std::vector<sauchar_t> > currentPattern(
            new std::vector<sauchar_t>(
                patternVector->begin() + highResOffset,
                patternVector->end()));

        // Filled in by lcpSearch(); initialised defensively.
        SUFFIX_MARKER_TYPE longestPrefixesLength = 0;
        boost::ptr_vector<SubstringOccurence> longestPrefixes =
            lcpSearch(T, markers, SA, currentPattern, longestPrefixesLength);

        BOOST_FOREACH(SubstringOccurence & occurence, longestPrefixes) {
            TmMatchesMapIterator mapIterator = tmMatchesMap.find(
                occurence.getId());
            if (mapIterator != tmMatchesMap.end()) {
                // TODO: handling of an already known id is not
                // implemented yet.
            }
        }
    }

    return result;
}
|
2014-05-15 22:20:31 +02:00
|
|
|
|
|
|
|
// Finds the longest prefix of `pattern` that occurs in the index and
// collects its occurrences.
//
// The pattern prefix is grown one element at a time. Each step calls
// sa_search() on the suffix array range that matched the previous, shorter
// prefix, so the range only ever narrows. The loop stops when the whole
// pattern has matched or when the current prefix no longer matches.
//
// Parameters:
//   T       - text data passed to sa_search()
//   markers - suffix markers, used by _collectResults() to build results
//   SA      - suffix array passed to sa_search()
//   pattern - the pattern to look for
//   length  - output: number of leading `pattern` elements that matched
//             (0 when there is no common prefix or an input is empty)
//
// Returns: one SubstringOccurence per suffix array entry matching the
//          longest found prefix; empty when `length` is 0.
//
// Throws: ConcordiaException when sa_search() reports an error through a
//         negative return value.
boost::ptr_vector<SubstringOccurence> AnubisSearcher::lcpSearch(
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        boost::shared_ptr<std::vector<sauchar_t> > pattern,
        SUFFIX_MARKER_TYPE & length)
        throw(ConcordiaException) {
    boost::ptr_vector<SubstringOccurence> result;
    length = 0;

    // With an empty pattern, text or suffix array nothing can match.
    // Returning early also avoids searching for a 1-element prefix of an
    // empty pattern and passing possibly-null data() pointers on.
    if (pattern->empty() || T->empty() || SA->empty()) {
        return result;
    }

    const saidx_t fullPatternLength = static_cast<saidx_t>(pattern->size());
    const sauchar_t * patternArray = pattern->data();

    // Start of the suffix array range still under consideration.
    const saidx_t * SAleft = SA->data();

    saidx_t patternLength = 0;
    saidx_t size = static_cast<saidx_t>(SA->size());
    saidx_t left = 0;

    // Range that matched the previous (one element shorter) prefix.
    saidx_t prevLeft = 0;
    saidx_t prevSize = 0;

    do {
        prevLeft = left;
        prevSize = size;

        ++patternLength;

        saidx_t localLeft = 0;
        size = sa_search(T->data(), static_cast<saidx_t>(T->size()),
                         patternArray, patternLength,
                         SAleft, size, &localLeft);

        // A negative value is an error indicator, not a match count; it
        // must not be mistaken for "the whole pattern was found".
        if (size < 0) {
            throw ConcordiaException("Suffix array search failed.");
        }

        // localLeft is relative to the range that was searched.
        left += localLeft;
        SAleft += localLeft;
    } while (patternLength < fullPatternLength && size > 0);

    if (size == 0) {
        // The last prefix did not match, so the longest common prefix is
        // one element shorter.
        length = static_cast<SUFFIX_MARKER_TYPE>(patternLength - 1);
        if (length > 0) {
            // Get the results of the previous search
            _collectResults(result, markers, SA, prevLeft, prevSize);
        }
        // If length == 0, then the pattern has no common prefixes
        // with the index.
    } else {
        // The index contains at least one occurrence of the whole
        // search pattern.
        length = static_cast<SUFFIX_MARKER_TYPE>(patternLength);
        _collectResults(result, markers, SA, left, size);
    }

    return result;
}
|
|
|
|
|
|
|
|
// Appends one SubstringOccurence to `result` for each of the `size`
// consecutive suffix array entries starting at index `left`.
//
// For every entry, the suffix array value is used as an index into
// `markers`; the marker found there is split into an id and an offset by
// Utils::getIdFromMarker / Utils::getOffsetFromMarker. Both lookups use
// at(), so an out-of-range index throws instead of reading invalid memory.
// Nothing is appended when `size` is not positive.
void AnubisSearcher::_collectResults(
        boost::ptr_vector<SubstringOccurence> & result,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        saidx_t left, saidx_t size) {
    saidx_t saIndex = left;
    for (saidx_t remaining = size; remaining > 0; --remaining, ++saIndex) {
        const saidx_t textPosition = SA->at(saIndex);
        const SUFFIX_MARKER_TYPE currentMarker = markers->at(textPosition);

        // `result` takes ownership of the newly allocated occurrence.
        result.push_back(new SubstringOccurence(
            Utils::getIdFromMarker(currentMarker),
            Utils::getOffsetFromMarker(currentMarker)));
    }
}
|