#include "concordia/index_searcher.hpp"

#include "concordia/common/utils.hpp"
#include "concordia/tokenized_sentence.hpp"

#include <boost/scoped_array.hpp>
#include <string>
#include <vector>

// NOTE(review): this file reached review with its line structure collapsed
// and with the text between angle brackets removed. Template arguments below
// were restored from how each value is used in this file (sa_search
// arguments, SA->at / markers->at results, operator new expressions).
// Three could only be inferred from names and MUST be confirmed against
// index_searcher.hpp: HashGenerator, ConcordiaConfig, AnubisSearchResult.
// The original also had a fourth #include whose target was lost; restore it
// from version control if the build needs it.

// Creates the ConcordiaSearcher instance that anubisSearch() and
// concordiaSearch() delegate to.
IndexSearcher::IndexSearcher()
    : _concordiaSearcher(new ConcordiaSearcher()) {
}

// Nothing to release explicitly: the searcher is held by a shared_ptr.
IndexSearcher::~IndexSearcher() {
}

// Looks up every occurrence of the hashed pattern in the suffix array.
//
// The pattern is hashed with hashGenerator, converted to a byte array and
// searched for with sa_search over T / SA. Each character-aligned hit is
// turned into a SubstringOccurence through its entry in markers.
//
// Returns a MatchedPatternFragment starting at 0 with the length of the
// hash, holding the occurrences found. Collection stops once
// CONCORDIA_SEARCH_MAX_RESULTS occurrences have been added.
MatchedPatternFragment IndexSearcher::simpleSearch(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern,
        bool byWhitespace) throw(ConcordiaException) {
    int left = 0;
    std::vector<INDEX_CHARACTER_TYPE> hash =
        hashGenerator->generateHash(pattern, byWhitespace).getCodes();
    saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
    // The helper returns an array that the original code released with
    // delete[]; scoped_array does the same on every exit path, including
    // when one of the calls below throws.
    boost::scoped_array<sauchar_t> patternArray(
        Utils::indexVectorToSaucharArray(hash));

    int size = sa_search(T->data(), static_cast<saidx_t>(T->size()),
                         patternArray.get(), patternLength,
                         SA->data(), static_cast<saidx_t>(SA->size()),
                         &left);

    MatchedPatternFragment result(0, hash.size());
    for (int i = 0; i < size; ++i) {
        saidx_t resultPos = SA->at(left + i);
        if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
            // As we are looking for a pattern in an array of higher
            // resolution than the hashed index file, we might
            // obtain accidental results exceeding the boundaries
            // of characters in hashed index. The above check
            // removes these accidental results.
            saidx_t actualResultPos =
                resultPos / sizeof(INDEX_CHARACTER_TYPE);
            SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);

            SubstringOccurence occurence;
            occurence.enterDataFromMarker(marker);
            result.addOccurence(occurence);
            if (result.getOccurences().size()
                    >= CONCORDIA_SEARCH_MAX_RESULTS) {
                break;
            }
        }
    }

    return result;
}

// Returns a window of the occurrences of the hashed pattern.
//
// The OccurencesList is constructed with the number of suffixes reported
// by sa_search. Suffix-array ranks in [offset, limit) are then inspected;
// limit is an exclusive upper rank, exactly as in the original loop
// condition. The window is applied BEFORE the character-alignment filter,
// so fewer than (limit - offset) occurrences may be added.
//
// The window is clamped to the number of matches. Without the clamp, a
// limit larger than the match count walks past the matching range of SA
// and reports occurrences that do not match the pattern (or throws from
// SA->at), and a search with no matches reads `left`, which sa_search is
// not guaranteed to have set.
OccurencesList IndexSearcher::fullSearch(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern,
        SUFFIX_MARKER_TYPE limit,
        SUFFIX_MARKER_TYPE offset,
        bool byWhitespace) throw(ConcordiaException) {
    int left = 0;
    std::vector<INDEX_CHARACTER_TYPE> hash =
        hashGenerator->generateHash(pattern, byWhitespace).getCodes();
    saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
    // Released automatically on every exit path (see simpleSearch).
    boost::scoped_array<sauchar_t> patternArray(
        Utils::indexVectorToSaucharArray(hash));

    int size = sa_search(T->data(), static_cast<saidx_t>(T->size()),
                         patternArray.get(), patternLength,
                         SA->data(), static_cast<saidx_t>(SA->size()),
                         &left);

    OccurencesList result(size);
    if (size > 0) {
        // Compare in SUFFIX_MARKER_TYPE so that a very large limit is not
        // narrowed to int before the comparison.
        const SUFFIX_MARKER_TYPE matchCount =
            static_cast<SUFFIX_MARKER_TYPE>(size);
        const SUFFIX_MARKER_TYPE end =
            (limit < matchCount) ? limit : matchCount;

        for (SUFFIX_MARKER_TYPE i = offset; i < end; ++i) {
            saidx_t resultPos = SA->at(left + i);
            if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
                // As we are looking for a pattern in an array of higher
                // resolution than the hashed index file, we might
                // obtain accidental results exceeding the boundaries
                // of characters in hashed index. The above check
                // removes these accidental results.
                saidx_t actualResultPos =
                    resultPos / sizeof(INDEX_CHARACTER_TYPE);
                SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);

                SubstringOccurence occurence;
                occurence.enterDataFromMarker(marker);
                result.addOccurence(occurence);
            }
        }
    }

    return result;
}

// Searches for the pattern as a complete, sentence-delimited entry.
//
// The hashed pattern is wrapped in INDEX_CHARACTER_TYPE_MAX_VALUE on both
// sides before the suffix-array search. An empty hash yields an empty
// MatchedPatternFragment(0, 0) without searching. Collection stops once
// CONCORDIA_SEARCH_MAX_RESULTS occurrences have been added.
MatchedPatternFragment IndexSearcher::lexiconSearch(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern,
        bool byWhitespace) throw(ConcordiaException) {
    int left = 0;
    std::vector<INDEX_CHARACTER_TYPE> hash =
        hashGenerator->generateHash(pattern, byWhitespace).getCodes();

    if (hash.size() == 0) {
        // If the hash is empty, return empty result
        return MatchedPatternFragment(0, 0);
    }

    // append and prepend query with EOS (sentenceBoundaryHI)
    // for lexicon search
    INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
    hash.insert(hash.begin(), sentenceBoundaryHI);
    hash.push_back(sentenceBoundaryHI);

    saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
    // Released automatically on every exit path (see simpleSearch).
    boost::scoped_array<sauchar_t> patternArray(
        Utils::indexVectorToSaucharArray(hash));

    int size = sa_search(T->data(), static_cast<saidx_t>(T->size()),
                         patternArray.get(), patternLength,
                         SA->data(), static_cast<saidx_t>(SA->size()),
                         &left);

    // In this scenario the whole pattern is matched, but
    // the pattern was artificially augmented by two EOS's.
    // Therefore, the matched pattern fragment starts at 0
    // and is hash.size() - 2 long.
    MatchedPatternFragment result(0, hash.size()-2);
    for (int i = 0; i < size; ++i) {
        saidx_t resultPos = SA->at(left + i);
        if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
            // As we are looking for a pattern in an array of higher
            // resolution than the hashed index file, we might
            // obtain accidental results exceeding the boundaries
            // of characters in hashed index. The above check
            // removes these accidental results.
            saidx_t actualResultPos =
                resultPos / sizeof(INDEX_CHARACTER_TYPE);

            // Our search query started with an EOS and is non-empty,
            // so we should look at the marker of the next character
            SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos + 1);

            SubstringOccurence occurence;
            occurence.enterDataFromMarker(marker);
            result.addOccurence(occurence);
            if (result.getOccurences().size()
                    >= CONCORDIA_SEARCH_MAX_RESULTS) {
                break;
            }
        }
    }

    return result;
}

// Counts the character-aligned suffix-array hits for the hashed pattern
// followed by a sentence boundary marker.
//
// The markers argument is not read by this function.
SUFFIX_MARKER_TYPE IndexSearcher::countOccurences(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern) throw(ConcordiaException) {
    int left = 0;
    std::vector<INDEX_CHARACTER_TYPE> hash =
        hashGenerator->generateHash(pattern).getCodes();

    // append sentence boundary marker,
    // as we are looking only for exact sentence matches
    hash.push_back(INDEX_CHARACTER_TYPE_MAX_VALUE);

    saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
    // Released automatically on every exit path (see simpleSearch).
    boost::scoped_array<sauchar_t> patternArray(
        Utils::indexVectorToSaucharArray(hash));

    int size = sa_search(T->data(), static_cast<saidx_t>(T->size()),
                         patternArray.get(), patternLength,
                         SA->data(), static_cast<saidx_t>(SA->size()),
                         &left);

    SUFFIX_MARKER_TYPE occurencesCount = 0;
    for (int i = 0; i < size; ++i) {
        saidx_t resultPos = SA->at(left + i);
        if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
            // As we are looking for a pattern in an array of higher
            // resolution than the hashed index file, we might
            // obtain accidental results exceeding the boundaries
            // of characters in hashed index. The above check
            // removes these accidental results.
            occurencesCount++;
        }
    }

    return occurencesCount;
}

// Hashes the pattern and forwards the codes, together with the index
// structures and config, to ConcordiaSearcher::anubisSearch.
std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
        boost::shared_ptr<ConcordiaConfig> config,
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern) throw(ConcordiaException) {
    std::vector<INDEX_CHARACTER_TYPE> hash =
        hashGenerator->generateHash(pattern).getCodes();
    return _concordiaSearcher->anubisSearch(config, T, markers, SA, hash);
}

// Hashes the pattern into a TokenizedSentence, creates a
// ConcordiaSearchResult from it and passes that result object to
// ConcordiaSearcher::concordiaSearch before returning it.
boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern,
        bool byWhitespace) throw(ConcordiaException) {
    TokenizedSentence hashedPattern =
        hashGenerator->generateHash(pattern, byWhitespace);
    boost::shared_ptr<ConcordiaSearchResult> result(
        new ConcordiaSearchResult(hashedPattern));

    _concordiaSearcher->concordiaSearch(result, T, markers, SA,
                                        hashedPattern.getCodes());
    return result;
}