From 970dda5dc2357a9153f891022570f70fab2ef129 Mon Sep 17 00:00:00 2001 From: rjawor Date: Sat, 22 Apr 2017 23:45:51 +0200 Subject: [PATCH] option of white space tokenization while searching --- concordia/concordia.cpp | 10 ++++++---- concordia/concordia.hpp | 7 +++++-- concordia/index_searcher.cpp | 10 ++++++---- concordia/index_searcher.hpp | 8 ++++++-- 4 files changed, 23 insertions(+), 12 deletions(-) diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp index e262f97..848eca6 100644 --- a/concordia/concordia.cpp +++ b/concordia/concordia.cpp @@ -211,11 +211,12 @@ SUFFIX_MARKER_TYPE Concordia::countOccurences(const std::string & pattern) MatchedPatternFragment Concordia::simpleSearch( - const std::string & pattern) + const std::string & pattern, + bool byWhitespace) throw(ConcordiaException) { if (_T->size() > 0) { return _searcher->simpleSearch(_hashGenerator, _T, - _markers, _SA, pattern); + _markers, _SA, pattern, byWhitespace); } else { MatchedPatternFragment result(0, 0); return result; @@ -235,11 +236,12 @@ std::vector Concordia::anubisSearch( } boost::shared_ptr Concordia::concordiaSearch( - const std::string & pattern) + const std::string & pattern, + bool byWhitespace) throw(ConcordiaException) { if (_T->size() > 0) { return _searcher->concordiaSearch(_hashGenerator, _T, - _markers, _SA, pattern); + _markers, _SA, pattern, byWhitespace); } else { std::string empty; return boost::shared_ptr( diff --git a/concordia/concordia.hpp b/concordia/concordia.hpp index dcf436d..b373da6 100644 --- a/concordia/concordia.hpp +++ b/concordia/concordia.hpp @@ -126,10 +126,12 @@ public: /*! Performs a simple substring lookup on the index. For more info see \ref tutorial1_2. \param pattern pattern to be searched in the index + \param byWhitespace whether to tokenize the pattern by white space \returns matched pattern fragment containing vector of occurences \throws ConcordiaException */ - MatchedPatternFragment simpleSearch(const std::string & pattern) + MatchedPatternFragment simpleSearch(const std::string & pattern, + bool byWhitespace = false) throw(ConcordiaException); SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern) @@ -154,7 +156,8 @@ public: \throws ConcordiaException */ boost::shared_ptr concordiaSearch( - const std::string & pattern) + const std::string & pattern, + bool byWhitespace = false) throw(ConcordiaException); /*! Loads HDD stored index files to RAM and generates diff --git a/concordia/index_searcher.cpp b/concordia/index_searcher.cpp index 6749082..b23f3f6 100644 --- a/concordia/index_searcher.cpp +++ b/concordia/index_searcher.cpp @@ -18,10 +18,11 @@ MatchedPatternFragment IndexSearcher::simpleSearch( boost::shared_ptr > T, boost::shared_ptr > markers, boost::shared_ptr > SA, - const std::string & pattern) throw(ConcordiaException) { + const std::string & pattern, + bool byWhitespace) throw(ConcordiaException) { int left; std::vector hash = - hashGenerator->generateHash(pattern).getCodes(); + hashGenerator->generateHash(pattern, byWhitespace).getCodes(); saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE); sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash); @@ -110,8 +111,9 @@ boost::shared_ptr IndexSearcher::concordiaSearch( boost::shared_ptr > T, boost::shared_ptr > markers, boost::shared_ptr > SA, - const std::string & pattern) throw(ConcordiaException) { - TokenizedSentence hashedPattern = hashGenerator->generateHash(pattern); + const std::string & pattern, + bool byWhitespace) throw(ConcordiaException) { + TokenizedSentence hashedPattern = hashGenerator->generateHash(pattern, byWhitespace); boost::shared_ptr result = boost::shared_ptr( new ConcordiaSearchResult(hashedPattern)); diff --git a/concordia/index_searcher.hpp b/concordia/index_searcher.hpp index 70cc95a..aa5b6e0 100644 --- a/concordia/index_searcher.hpp +++ b/concordia/index_searcher.hpp @@ -50,7 +50,8 @@ public: boost::shared_ptr > T, boost::shared_ptr > markers, boost::shared_ptr > SA, - const std::string & pattern) throw(ConcordiaException); + const std::string & pattern, + bool byWhitespace = false) throw(ConcordiaException); SUFFIX_MARKER_TYPE countOccurences( boost::shared_ptr hashGenerator, @@ -71,6 +72,7 @@ public: \param markers markers array for the needs of searching \param SA suffix array for the needs of searching \param pattern string pattern to be searched in the index. + \param byWhitespace whether to tokenize the pattern by white space \returns vector of results \throws ConcordiaException */ @@ -92,6 +94,7 @@ public: \param markers markers array for the needs of searching \param SA suffix array for the needs of searching \param pattern pattern to be searched in the index. + \param byWhitespace whether to tokenize the pattern by white space \returns result of the search \throws ConcordiaException */ @@ -100,7 +103,8 @@ public: boost::shared_ptr > T, boost::shared_ptr > markers, boost::shared_ptr > SA, - const std::string & pattern) throw(ConcordiaException); + const std::string & pattern, + bool byWhitespace = false) throw(ConcordiaException); private: boost::shared_ptr _concordiaSearcher;