concordia-library/concordia/concordia_searcher.cpp

292 lines
11 KiB
C++
Raw Permalink Normal View History

#include "concordia/concordia_searcher.hpp"
#include "concordia/common/logging.hpp"
#include <boost/foreach.hpp>
#include <iostream>
ConcordiaSearcher::ConcordiaSearcher() {
}
ConcordiaSearcher::~ConcordiaSearcher() {
}
void ConcordiaSearcher::concordiaSearch(
boost::shared_ptr<ConcordiaSearchResult> result,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
2019-01-18 13:30:51 +01:00
const std::vector<INDEX_CHARACTER_TYPE> & pattern) {
// add fragments to result and sort them
std::vector<sauchar_t> patternVector =
Utils::indexVectorToSaucharVector(pattern);
if (patternVector.size() !=
pattern.size() * sizeof(INDEX_CHARACTER_TYPE)) {
throw ConcordiaException("Increasing pattern resolution went wrong.");
}
for (int offset = 0; offset < pattern.size(); offset++) {
int highResOffset = offset * sizeof(INDEX_CHARACTER_TYPE);
std::vector<sauchar_t> currentPattern(
patternVector.begin()+highResOffset, patternVector.end());
SUFFIX_MARKER_TYPE lcpLength;
2019-01-22 14:07:28 +01:00
std::vector<SubstringOccurrence> occurrences =
lcpSearch(T, markers, SA, currentPattern, lcpLength);
2019-01-22 14:07:28 +01:00
if (occurrences.size() > 0) {
2017-04-21 14:51:58 +02:00
MatchedPatternFragment fragment(offset,
lcpLength / sizeof(INDEX_CHARACTER_TYPE));
2019-01-22 14:07:28 +01:00
BOOST_FOREACH(SubstringOccurrence occurrence, occurrences) {
fragment.addOccurrence(occurrence);
2017-04-21 14:51:58 +02:00
}
result->addFragment(fragment);
}
}
// compute best overlay of the pattern by matched fragments
2015-05-01 14:52:53 +02:00
result->computeBestOverlay();
result->sortFragments();
}
std::vector<AnubisSearchResult> ConcordiaSearcher::anubisSearch(
boost::shared_ptr<ConcordiaConfig> config,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
2019-01-18 13:30:51 +01:00
const std::vector<INDEX_CHARACTER_TYPE> & pattern) {
boost::shared_ptr<TmMatchesMap> tmMatchesMap =
getTmMatches(T, markers, SA, pattern);
// 1. iterate over tmMatchesMap
// 2. calculate score for each tmMatches
// 3. create AnubisSearchResult from tmMatches with scores over threshold
// 4. sort the AnubisSearchResult vector decending
std::vector<AnubisSearchResult> result;
for (TmMatchesMapIterator iterator = tmMatchesMap->begin();
iterator != tmMatchesMap->end(); ++iterator) {
TmMatches * tmMatches = iterator->second;
tmMatches->calculateScore();
if (tmMatches->getScore() >= config->getAnubisThreshold()) {
result.push_back(AnubisSearchResult(tmMatches->getExampleId(),
tmMatches->getScore()));
}
}
std::sort(result.begin(), result.end(), std::greater<AnubisSearchResult>());
return result;
}
boost::shared_ptr<TmMatchesMap> ConcordiaSearcher::getTmMatches(
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
2019-01-18 13:30:51 +01:00
const std::vector<INDEX_CHARACTER_TYPE> & pattern) {
std::vector<sauchar_t> patternVector =
Utils::indexVectorToSaucharVector(pattern);
if (patternVector.size() !=
pattern.size() * sizeof(INDEX_CHARACTER_TYPE)) {
throw ConcordiaException("Increasing pattern resolution went wrong.");
}
boost::shared_ptr<TmMatchesMap> tmMatchesMap(new TmMatchesMap());
for (int offset = 0; offset < pattern.size(); offset++) {
int highResOffset = offset * sizeof(INDEX_CHARACTER_TYPE);
std::vector<sauchar_t> currentPattern(
patternVector.begin()+highResOffset, patternVector.end());
saidx_t patternLength = 0;
saidx_t size = SA->size();
saidx_t left = 0;
sauchar_t * patternArray = currentPattern.data();
saidx_t * SAleft = SA->data();
saidx_t prevLeft;
saidx_t prevSize;
do {
prevLeft = left;
prevSize = size;
patternLength += sizeof(INDEX_CHARACTER_TYPE);
saidx_t localLeft;
size = sa_search(T->data(), (saidx_t) T->size(),
(const sauchar_t *) patternArray, patternLength,
SAleft, size, &localLeft);
left += localLeft;
SAleft += localLeft;
if (patternLength > sizeof(INDEX_CHARACTER_TYPE)) {
// Add to tm matches map results surrounding the main stream.
// from left
for (saidx_t i = prevLeft; i < left; i++) {
_addToMap(SA, markers, tmMatchesMap, i, pattern.size(),
(patternLength / sizeof(INDEX_CHARACTER_TYPE)) -1,
offset);
}
// from right
for (saidx_t i = left+size; i < prevLeft+prevSize; i++) {
_addToMap(SA, markers, tmMatchesMap, i, pattern.size(),
(patternLength / sizeof(INDEX_CHARACTER_TYPE)) - 1,
offset);
}
}
} while (patternLength < currentPattern.size() && size > 0);
if (size > 0) {
for (saidx_t i = left; i < left+size; i++) {
_addToMap(SA, markers, tmMatchesMap, i, pattern.size(),
patternLength / sizeof(INDEX_CHARACTER_TYPE), offset);
}
}
}
return tmMatchesMap;
}
2019-01-22 14:07:28 +01:00
std::vector<SubstringOccurrence> ConcordiaSearcher::lcpSearch(
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::vector<sauchar_t> & pattern,
2019-01-18 13:30:51 +01:00
SUFFIX_MARKER_TYPE & length) {
saidx_t patternLength = 0;
saidx_t size = SA->size();
saidx_t left = 0;
const sauchar_t * patternArray = pattern.data();
saidx_t * SAleft = SA->data();
saidx_t prevLeft;
saidx_t prevSize;
do {
prevLeft = left;
prevSize = size;
patternLength += sizeof(INDEX_CHARACTER_TYPE);
saidx_t localLeft;
size = sa_search(T->data(), (saidx_t) T->size(),
(const sauchar_t *) patternArray, patternLength,
SAleft, size, &localLeft);
left += localLeft;
SAleft += localLeft;
} while (patternLength < pattern.size() && size > 0);
2019-01-22 14:07:28 +01:00
std::vector<SubstringOccurrence> result;
if (size == 0) {
// The search managed to find exactly the longest common prefixes.
length = patternLength - sizeof(INDEX_CHARACTER_TYPE);
if (length > 0) {
// Get the results of the previous search
_collectResults(result, markers, SA, prevLeft, prevSize);
}
// If length == 0, then the pattern has no common prefixes
// with the index.
} else {
// Seemingly, the index contains at least one utterance
// of the whole search pattern.
length = patternLength;
_collectResults(result, markers, SA, left, size);
}
return result;
}
void ConcordiaSearcher::_collectResults(
2019-01-22 14:07:28 +01:00
std::vector<SubstringOccurrence> & result,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
saidx_t left, saidx_t size) {
int resultsCount = 0;
for (saidx_t i = 0; i < size; i++) {
saidx_t resultPos = SA->at(left + i);
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
SUFFIX_MARKER_TYPE marker =
markers->at(resultPos / sizeof(INDEX_CHARACTER_TYPE));
2019-01-22 14:07:28 +01:00
result.push_back(SubstringOccurrence(marker));
// truncate results,
// we don't need too many identical pattern overlays
if (++resultsCount >= CONCORDIA_SEARCH_MAX_RESULTS) {
break;
}
}
}
}
void ConcordiaSearcher::_addToMap(boost::shared_ptr<std::vector<saidx_t> > SA,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TmMatchesMap> tmMatchesMap,
saidx_t sa_pos,
SUFFIX_MARKER_TYPE totalPatternLength,
SUFFIX_MARKER_TYPE matchedFragmentLength,
SUFFIX_MARKER_TYPE patternOffset) {
2019-01-22 14:07:28 +01:00
SubstringOccurrence occurrence;
if (_getOccurrenceFromSA(SA, markers, sa_pos, occurrence)) {
_addOccurrenceToMap(tmMatchesMap,
occurrence,
totalPatternLength,
matchedFragmentLength,
patternOffset);
}
}
2019-01-22 14:07:28 +01:00
bool ConcordiaSearcher::_getOccurrenceFromSA(
boost::shared_ptr<std::vector<saidx_t> > SA,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
saidx_t sa_pos,
2019-01-22 14:07:28 +01:00
SubstringOccurrence & occurrence) {
saidx_t resultPos = SA->at(sa_pos);
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
SUFFIX_MARKER_TYPE marker =
markers->at(resultPos / sizeof(INDEX_CHARACTER_TYPE));
2019-01-22 14:07:28 +01:00
occurrence.enterDataFromMarker(marker);
}
}
2019-01-22 14:07:28 +01:00
void ConcordiaSearcher::_addOccurrenceToMap(
boost::shared_ptr<TmMatchesMap> tmMatchesMap,
2019-01-22 14:07:28 +01:00
SubstringOccurrence & occurrence,
SUFFIX_MARKER_TYPE totalPatternLength,
SUFFIX_MARKER_TYPE matchedFragmentLength,
SUFFIX_MARKER_TYPE patternOffset) {
TmMatches * tmMatches;
TmMatchesMapIterator mapIterator = tmMatchesMap->find(
2019-01-22 14:07:28 +01:00
occurrence.getId());
if (mapIterator != tmMatchesMap->end()) {
tmMatches = mapIterator->second;
} else {
2019-01-22 14:07:28 +01:00
tmMatches = new TmMatches(occurrence.getId(),
occurrence.getExampleLength(),
totalPatternLength);
2019-01-22 14:07:28 +01:00
SUFFIX_MARKER_TYPE key = occurrence.getId();
tmMatchesMap->insert(key, tmMatches);
}
// add intervals to tmMatches
tmMatches->addExampleInterval(
2019-01-22 14:07:28 +01:00
occurrence.getOffset(),
occurrence.getOffset() + matchedFragmentLength);
tmMatches->addPatternInterval(
patternOffset,
patternOffset + matchedFragmentLength);
}