Add an option for whitespace tokenization while searching

This commit is contained in:
rjawor 2017-04-22 23:45:51 +02:00
parent 31e4f091ad
commit 970dda5dc2
4 changed files with 23 additions and 12 deletions

View File

@ -211,11 +211,12 @@ SUFFIX_MARKER_TYPE Concordia::countOccurences(const std::string & pattern)
MatchedPatternFragment Concordia::simpleSearch( MatchedPatternFragment Concordia::simpleSearch(
const std::string & pattern) const std::string & pattern,
bool byWhitespace)
throw(ConcordiaException) { throw(ConcordiaException) {
if (_T->size() > 0) { if (_T->size() > 0) {
return _searcher->simpleSearch(_hashGenerator, _T, return _searcher->simpleSearch(_hashGenerator, _T,
_markers, _SA, pattern); _markers, _SA, pattern, byWhitespace);
} else { } else {
MatchedPatternFragment result(0, 0); MatchedPatternFragment result(0, 0);
return result; return result;
@ -235,11 +236,12 @@ std::vector<AnubisSearchResult> Concordia::anubisSearch(
} }
boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch( boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
const std::string & pattern) const std::string & pattern,
bool byWhitespace)
throw(ConcordiaException) { throw(ConcordiaException) {
if (_T->size() > 0) { if (_T->size() > 0) {
return _searcher->concordiaSearch(_hashGenerator, _T, return _searcher->concordiaSearch(_hashGenerator, _T,
_markers, _SA, pattern); _markers, _SA, pattern, byWhitespace);
} else { } else {
std::string empty; std::string empty;
return boost::shared_ptr<ConcordiaSearchResult>( return boost::shared_ptr<ConcordiaSearchResult>(

View File

@ -126,10 +126,12 @@ public:
/*! Performs a simple substring lookup on the index. /*! Performs a simple substring lookup on the index.
For more info see \ref tutorial1_2. For more info see \ref tutorial1_2.
\param pattern pattern to be searched in the index \param pattern pattern to be searched in the index
\param byWhitespace whether to tokenize the pattern by white space
\returns matched pattern fragment containing vector of occurrences \returns matched pattern fragment containing vector of occurrences
\throws ConcordiaException \throws ConcordiaException
*/ */
MatchedPatternFragment simpleSearch(const std::string & pattern) MatchedPatternFragment simpleSearch(const std::string & pattern,
bool byWhitespace = false)
throw(ConcordiaException); throw(ConcordiaException);
SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern) SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)
@ -154,7 +156,8 @@ public:
\throws ConcordiaException \throws ConcordiaException
*/ */
boost::shared_ptr<ConcordiaSearchResult> concordiaSearch( boost::shared_ptr<ConcordiaSearchResult> concordiaSearch(
const std::string & pattern) const std::string & pattern,
bool byWhitespace = false)
throw(ConcordiaException); throw(ConcordiaException);
/*! Loads HDD stored index files to RAM and generates /*! Loads HDD stored index files to RAM and generates

View File

@ -18,10 +18,11 @@ MatchedPatternFragment IndexSearcher::simpleSearch(
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException) { const std::string & pattern,
bool byWhitespace) throw(ConcordiaException) {
int left; int left;
std::vector<INDEX_CHARACTER_TYPE> hash = std::vector<INDEX_CHARACTER_TYPE> hash =
hashGenerator->generateHash(pattern).getCodes(); hashGenerator->generateHash(pattern, byWhitespace).getCodes();
saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE); saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash); sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
@ -110,8 +111,9 @@ boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException) { const std::string & pattern,
TokenizedSentence hashedPattern = hashGenerator->generateHash(pattern); bool byWhitespace) throw(ConcordiaException) {
TokenizedSentence hashedPattern = hashGenerator->generateHash(pattern, byWhitespace);
boost::shared_ptr<ConcordiaSearchResult> result = boost::shared_ptr<ConcordiaSearchResult> result =
boost::shared_ptr<ConcordiaSearchResult>( boost::shared_ptr<ConcordiaSearchResult>(
new ConcordiaSearchResult(hashedPattern)); new ConcordiaSearchResult(hashedPattern));

View File

@ -50,7 +50,8 @@ public:
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException); const std::string & pattern,
bool byWhitespace = false) throw(ConcordiaException);
SUFFIX_MARKER_TYPE countOccurences( SUFFIX_MARKER_TYPE countOccurences(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
@ -71,6 +72,7 @@ public:
\param markers markers array for the needs of searching \param markers markers array for the needs of searching
\param SA suffix array for the needs of searching \param SA suffix array for the needs of searching
\param pattern string pattern to be searched in the index. \param pattern string pattern to be searched in the index.
\param byWhitespace whether to tokenize the pattern by white space
\returns vector of results \returns vector of results
\throws ConcordiaException \throws ConcordiaException
*/ */
@ -92,6 +94,7 @@ public:
\param markers markers array for the needs of searching \param markers markers array for the needs of searching
\param SA suffix array for the needs of searching \param SA suffix array for the needs of searching
\param pattern pattern to be searched in the index. \param pattern pattern to be searched in the index.
\param byWhitespace whether to tokenize the pattern by white space
\returns result of the search \returns result of the search
\throws ConcordiaException \throws ConcordiaException
*/ */
@ -100,7 +103,8 @@ public:
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException); const std::string & pattern,
bool byWhitespace = false) throw(ConcordiaException);
private: private:
boost::shared_ptr<ConcordiaSearcher> _concordiaSearcher; boost::shared_ptr<ConcordiaSearcher> _concordiaSearcher;