concordia-library/concordia/index_searcher.cpp

125 lines
5.2 KiB
C++

#include "concordia/index_searcher.hpp"
#include "concordia/common/utils.hpp"
#include "concordia/tokenized_sentence.hpp"
#include <boost/filesystem.hpp>
// Builds the searcher together with the ConcordiaSearcher instance that
// anubisSearch() and concordiaSearch() delegate to.
IndexSearcher::IndexSearcher()
    : _concordiaSearcher(new ConcordiaSearcher()) {
}
// No manual cleanup: _concordiaSearcher is held through a shared_ptr.
IndexSearcher::~IndexSearcher() {}
// Searches the suffix array for the hashed form of `pattern` and gathers
// the hits into a single MatchedPatternFragment.
//
// hashGenerator - turns the pattern into a sequence of index codes
// T             - byte array that is handed to sa_search as the text
// markers       - markers, indexed by code position (byte position divided
//                 by sizeof(INDEX_CHARACTER_TYPE))
// SA            - suffix array handed to sa_search
// pattern       - the text to look for
// byWhitespace  - passed through unchanged to HashGenerator::generateHash
//
// Returns a fragment constructed with (0, number of pattern codes); the
// collection of occurrences stops as soon as their number reaches
// CONCORDIA_SEARCH_MAX_RESULTS.
MatchedPatternFragment IndexSearcher::simpleSearch(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern,
        bool byWhitespace) throw(ConcordiaException) {
    int firstMatch;
    std::vector<INDEX_CHARACTER_TYPE> codes(
        hashGenerator->generateHash(pattern, byWhitespace).getCodes());

    // The suffix array works on bytes, so the pattern is expanded from
    // codes to a byte array of sizeof(INDEX_CHARACTER_TYPE) bytes per code.
    const saidx_t byteLength = codes.size() * sizeof(INDEX_CHARACTER_TYPE);
    sauchar_t * const patternBytes = Utils::indexVectorToSaucharArray(codes);

    // matchCount entries of SA, starting at firstMatch, are visited below.
    const int matchCount = sa_search(
        T->data(), static_cast<saidx_t>(T->size()),
        patternBytes, byteLength,
        SA->data(), static_cast<saidx_t>(SA->size()),
        &firstMatch);

    MatchedPatternFragment fragment(0, codes.size());
    for (int offset = 0; offset < matchCount; ++offset) {
        const saidx_t bytePos = SA->at(firstMatch + offset);

        // A byte-level search can report hits that start in the middle
        // of a code. Such hits do not correspond to any position in the
        // hashed index, so only code-aligned positions are accepted.
        if (bytePos % sizeof(INDEX_CHARACTER_TYPE) != 0) {
            continue;
        }

        const saidx_t codePos = bytePos / sizeof(INDEX_CHARACTER_TYPE);
        SUFFIX_MARKER_TYPE hitMarker = markers->at(codePos);
        SubstringOccurence hit;
        hit.enterDataFromMarker(hitMarker);
        fragment.addOccurence(hit);

        if (fragment.getOccurences().size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
            break;
        }
    }

    delete[] patternBytes;
    return fragment;
}
// Counts the code-aligned hits of the hashed `pattern` followed by the
// sentence boundary marker, i.e. exact sentence matches only.
//
// hashGenerator - turns the pattern into a sequence of index codes
// T             - byte array that is handed to sa_search as the text
// markers       - not used by this method
// SA            - suffix array handed to sa_search
// pattern       - the sentence to count
//
// Returns the number of accepted hits.
SUFFIX_MARKER_TYPE IndexSearcher::countOccurences(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern) throw(ConcordiaException) {
    int firstMatch;
    std::vector<INDEX_CHARACTER_TYPE> codes(
        hashGenerator->generateHash(pattern).getCodes());

    // Closing the pattern with the sentence boundary marker restricts
    // the search to exact sentence matches.
    codes.push_back(INDEX_CHARACTER_TYPE_MAX_VALUE);

    const saidx_t byteLength = codes.size() * sizeof(INDEX_CHARACTER_TYPE);
    sauchar_t * const patternBytes = Utils::indexVectorToSaucharArray(codes);

    const int matchCount = sa_search(
        T->data(), static_cast<saidx_t>(T->size()),
        patternBytes, byteLength,
        SA->data(), static_cast<saidx_t>(SA->size()),
        &firstMatch);

    SUFFIX_MARKER_TYPE total = 0;
    for (int offset = 0; offset < matchCount; ++offset) {
        const saidx_t bytePos = SA->at(firstMatch + offset);

        // A byte-level search can report hits that start in the middle
        // of a code; those are accidental and must not be counted.
        if (bytePos % sizeof(INDEX_CHARACTER_TYPE) != 0) {
            continue;
        }
        total++;
    }

    delete[] patternBytes;
    return total;
}
// Hashes `pattern` with the given generator and hands the resulting codes,
// together with the config and index structures, to
// ConcordiaSearcher::anubisSearch. The delegate's result is returned as is.
std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
        boost::shared_ptr<ConcordiaConfig> config,
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern) throw(ConcordiaException) {
    std::vector<INDEX_CHARACTER_TYPE> patternCodes(
        hashGenerator->generateHash(pattern).getCodes());

    return _concordiaSearcher->anubisSearch(
        config, T, markers, SA, patternCodes);
}
// Tokenizes and hashes `pattern` (byWhitespace is forwarded to the hash
// generator), creates a ConcordiaSearchResult for the tokenized sentence
// and passes it to ConcordiaSearcher::concordiaSearch along with the index
// structures and the pattern codes. The same result object is returned.
boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern,
        bool byWhitespace) throw(ConcordiaException) {
    TokenizedSentence tokenizedPattern =
        hashGenerator->generateHash(pattern, byWhitespace);

    boost::shared_ptr<ConcordiaSearchResult> searchResult(
        new ConcordiaSearchResult(tokenizedPattern));

    _concordiaSearcher->concordiaSearch(
        searchResult, T, markers, SA, tokenizedPattern.getCodes());

    return searchResult;
}