233 lines
9.7 KiB
C++
233 lines
9.7 KiB
C++
#include "concordia/index_searcher.hpp"
|
|
|
|
#include "concordia/common/utils.hpp"
|
|
#include "concordia/tokenized_sentence.hpp"
|
|
#include <boost/filesystem.hpp>
|
|
#include <algorithm>
|
|
|
|
// Creates the searcher and the ConcordiaSearcher instance that
// anubisSearch() and concordiaSearch() delegate to.
IndexSearcher::IndexSearcher() {
    _concordiaSearcher.reset(new ConcordiaSearcher());
}
|
|
|
|
|
|
// The destructor body is intentionally empty; nothing is released by hand.
IndexSearcher::~IndexSearcher() {}
|
|
|
|
// Finds occurrences of `pattern` in the index.
//
// The pattern is hashed with `hashGenerator` (`byWhitespace` is forwarded
// to the generator), the resulting codes are serialized to bytes and looked
// up in the byte text `T` through the suffix array `SA`. Every hit that
// starts on a code boundary is converted, via its entry in `markers`, into
// a SubstringOccurence. Collection stops as soon as
// CONCORDIA_SEARCH_MAX_RESULTS occurrences have been gathered.
//
// Returns a fragment that starts at 0 and is as long as the hashed pattern.
MatchedPatternFragment IndexSearcher::simpleSearch(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern,
        bool byWhitespace) throw(ConcordiaException) {
    // Width, in bytes of T, of a single hashed code.
    const std::size_t codeWidth = sizeof(INDEX_CHARACTER_TYPE);

    std::vector<INDEX_CHARACTER_TYPE> codes =
        hashGenerator->generateHash(pattern, byWhitespace).getCodes();
    saidx_t patternByteCount = codes.size() * codeWidth;
    sauchar_t * patternBytes = Utils::indexVectorToSaucharArray(codes);

    // sa_search yields the number of matching suffixes and stores the
    // suffix array index of the first one in firstHit.
    int firstHit;
    int hitCount = sa_search(T->data(), static_cast<saidx_t>(T->size()),
                             patternBytes, patternByteCount,
                             SA->data(), static_cast<saidx_t>(SA->size()),
                             &firstHit);

    MatchedPatternFragment fragment(0, codes.size());
    for (int hit = 0; hit < hitCount; ++hit) {
        const saidx_t bytePos = SA->at(firstHit + hit);
        if (bytePos % codeWidth != 0) {
            // T is searched byte by byte, i.e. at a finer resolution than
            // the hashed codes, so a match may begin in the middle of
            // a code. Such accidental matches are skipped.
            continue;
        }

        const saidx_t codePos = bytePos / codeWidth;
        SUFFIX_MARKER_TYPE hitMarker = markers->at(codePos);

        SubstringOccurence occurence;
        occurence.enterDataFromMarker(hitMarker);
        fragment.addOccurence(occurence);

        if (fragment.getOccurences().size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
            break;
        }
    }

    delete[] patternBytes;
    return fragment;
}
|
|
|
|
// Returns one page of the occurrences of `pattern` in the index.
//
// The pattern is hashed with `hashGenerator` (`byWhitespace` is forwarded
// to the generator), serialized to bytes and looked up in the byte text `T`
// through the suffix array `SA`.
//
// Parameters:
//   limit  - maximum number of suffix array hits to inspect; a value that
//            is not positive yields no occurrences.
//   offset - number of leading hits to skip. Negative values are treated
//            as 0, because they would otherwise address suffix array
//            entries located before the matched range.
//
// Returns an OccurencesList constructed with the raw hit count reported by
// sa_search and filled with the occurrences found in the requested window.
// Hits that do not start on a code boundary are dropped, so the page may
// hold fewer than `limit` entries.
OccurencesList IndexSearcher::fullSearch(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern,
        int limit,
        int offset,
        bool byWhitespace) throw(ConcordiaException) {
    int left;
    std::vector<INDEX_CHARACTER_TYPE> hash =
        hashGenerator->generateHash(pattern, byWhitespace).getCodes();
    saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
    sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);

    // sa_search yields the number of matching suffixes and stores the
    // suffix array index of the first one in left.
    int size = sa_search(T->data(), (saidx_t) T->size(),
                         (const sauchar_t *) patternArray, patternLength,
                         SA->data(), (saidx_t) SA->size(), &left);

    // The pattern bytes are not used past this point. Releasing them here
    // guarantees they cannot leak if anything below throws.
    delete[] patternArray;

    OccurencesList result(size);

    // Never start in front of the matched range [left, left + size).
    const int firstResult = std::max(offset, 0);

    // Clamp the window so that it does not run past the matched range.
    int returnedResults = limit;
    if ((size - firstResult) < limit) {
        returnedResults = size - firstResult;
    }

    for (int i = 0; i < returnedResults; ++i) {
        saidx_t resultPos = SA->at(left + firstResult + i);
        if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
            // As we are looking for a pattern in an array of higher
            // resolution than the hashed index file, we might
            // obtain accidental results exceeding the boundaries
            // of characters in hashed index. The above check
            // removes these accidental results.
            saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
            SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);

            SubstringOccurence occurence;
            occurence.enterDataFromMarker(marker);
            result.addOccurence(occurence);
        }
    }

    return result;
}
|
|
|
|
// Finds index entries that consist of exactly the given pattern, i.e. the
// hashed pattern enclosed by sentence boundary codes on both sides.
//
// The pattern is hashed with `hashGenerator` (`byWhitespace` is forwarded
// to the generator). An empty hash produces an empty fragment (0, 0)
// without searching. Otherwise the codes are wrapped in
// INDEX_CHARACTER_TYPE_MAX_VALUE boundaries, serialized to bytes and looked
// up in the byte text `T` through the suffix array `SA`. Collection stops
// as soon as CONCORDIA_SEARCH_MAX_RESULTS occurrences have been gathered.
//
// Returns a fragment that starts at 0 and is as long as the hashed pattern
// without the two artificial boundaries.
MatchedPatternFragment IndexSearcher::lexiconSearch(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern,
        bool byWhitespace) throw(ConcordiaException) {
    std::vector<INDEX_CHARACTER_TYPE> codes =
        hashGenerator->generateHash(pattern, byWhitespace).getCodes();
    if (codes.empty()) {
        // Nothing to look for.
        return MatchedPatternFragment(0, 0);
    }

    // Width, in bytes of T, of a single hashed code.
    const std::size_t codeWidth = sizeof(INDEX_CHARACTER_TYPE);

    // The actual query is: boundary, pattern codes, boundary.
    const INDEX_CHARACTER_TYPE boundary = INDEX_CHARACTER_TYPE_MAX_VALUE;
    std::vector<INDEX_CHARACTER_TYPE> query;
    query.reserve(codes.size() + 2);
    query.push_back(boundary);
    query.insert(query.end(), codes.begin(), codes.end());
    query.push_back(boundary);

    saidx_t queryByteCount = query.size() * codeWidth;
    sauchar_t * queryBytes = Utils::indexVectorToSaucharArray(query);

    // sa_search yields the number of matching suffixes and stores the
    // suffix array index of the first one in firstHit.
    int firstHit;
    int hitCount = sa_search(T->data(), static_cast<saidx_t>(T->size()),
                             queryBytes, queryByteCount,
                             SA->data(), static_cast<saidx_t>(SA->size()),
                             &firstHit);

    // The whole pattern is matched; the two boundaries were only added
    // for the lookup and are not part of the reported fragment.
    MatchedPatternFragment fragment(0, codes.size());
    for (int hit = 0; hit < hitCount; ++hit) {
        const saidx_t bytePos = SA->at(firstHit + hit);
        if (bytePos % codeWidth != 0) {
            // T is searched byte by byte, i.e. at a finer resolution than
            // the hashed codes, so a match may begin in the middle of
            // a code. Such accidental matches are skipped.
            continue;
        }

        const saidx_t codePos = bytePos / codeWidth;

        // The match begins at the leading boundary, and the pattern is
        // non-empty, so the marker of interest is the one that follows.
        SUFFIX_MARKER_TYPE hitMarker = markers->at(codePos + 1);

        SubstringOccurence occurence;
        occurence.enterDataFromMarker(hitMarker);
        fragment.addOccurence(occurence);

        if (fragment.getOccurences().size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
            break;
        }
    }

    delete[] queryBytes;
    return fragment;
}
|
|
|
|
// Counts the places in the index where the hashed pattern is immediately
// followed by a sentence boundary.
//
// The pattern is hashed with `hashGenerator`, a single
// INDEX_CHARACTER_TYPE_MAX_VALUE boundary code is appended, and the
// serialized bytes are looked up in the byte text `T` through the suffix
// array `SA`. Only hits that start on a code boundary are counted.
// `markers` is not consulted by this method.
//
// NOTE(review): only a trailing boundary is added, so a sentence that
// merely ends with the pattern is counted as well - confirm this is the
// intended meaning of "exact sentence match".
SUFFIX_MARKER_TYPE IndexSearcher::countOccurences(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern) throw(ConcordiaException) {
    // Width, in bytes of T, of a single hashed code.
    const std::size_t codeWidth = sizeof(INDEX_CHARACTER_TYPE);

    std::vector<INDEX_CHARACTER_TYPE> codes =
        hashGenerator->generateHash(pattern).getCodes();

    // Terminate the query with the sentence boundary code.
    codes.push_back(INDEX_CHARACTER_TYPE_MAX_VALUE);

    saidx_t patternByteCount = codes.size() * codeWidth;
    sauchar_t * patternBytes = Utils::indexVectorToSaucharArray(codes);

    // sa_search yields the number of matching suffixes and stores the
    // suffix array index of the first one in firstHit.
    int firstHit;
    int hitCount = sa_search(T->data(), static_cast<saidx_t>(T->size()),
                             patternBytes, patternByteCount,
                             SA->data(), static_cast<saidx_t>(SA->size()),
                             &firstHit);

    SUFFIX_MARKER_TYPE alignedHits = 0;
    for (int hit = 0; hit < hitCount; ++hit) {
        const saidx_t bytePos = SA->at(firstHit + hit);
        // T is searched byte by byte, i.e. at a finer resolution than the
        // hashed codes, so a match may begin in the middle of a code.
        // Only matches aligned to a code boundary are real ones.
        const bool onCodeBoundary = (bytePos % codeWidth == 0);
        if (onCodeBoundary) {
            alignedHits++;
        }
    }

    delete[] patternBytes;

    return alignedHits;
}
|
|
|
|
|
|
// Hashes `pattern` with `hashGenerator` and forwards the resulting codes,
// together with `config` and the index structures (`T`, `markers`, `SA`),
// to ConcordiaSearcher::anubisSearch. Its result is returned unchanged.
std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
        boost::shared_ptr<ConcordiaConfig> config,
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern) throw(ConcordiaException) {
    TokenizedSentence hashedPattern = hashGenerator->generateHash(pattern);
    std::vector<INDEX_CHARACTER_TYPE> patternCodes = hashedPattern.getCodes();

    return _concordiaSearcher->anubisSearch(config, T, markers, SA,
                                            patternCodes);
}
|
|
|
|
// Hashes `pattern` with `hashGenerator` (`byWhitespace` is forwarded to the
// generator), creates a ConcordiaSearchResult for the tokenized pattern and
// lets ConcordiaSearcher::concordiaSearch fill it in using the index
// structures (`T`, `markers`, `SA`) and the pattern codes.
//
// Returns the shared result object that was handed to the searcher.
boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern,
        bool byWhitespace) throw(ConcordiaException) {
    TokenizedSentence tokenizedPattern =
        hashGenerator->generateHash(pattern, byWhitespace);

    boost::shared_ptr<ConcordiaSearchResult> searchResult(
        new ConcordiaSearchResult(tokenizedPattern));

    _concordiaSearcher->concordiaSearch(searchResult, T, markers, SA,
                                        tokenizedPattern.getCodes());
    return searchResult;
}
|