option of white space tokenization while searching

This commit is contained in:
rjawor 2017-04-22 23:45:51 +02:00
parent 31e4f091ad
commit 970dda5dc2
4 changed files with 23 additions and 12 deletions

View File

@ -211,11 +211,12 @@ SUFFIX_MARKER_TYPE Concordia::countOccurences(const std::string & pattern)
MatchedPatternFragment Concordia::simpleSearch(
const std::string & pattern)
const std::string & pattern,
bool byWhitespace)
throw(ConcordiaException) {
if (_T->size() > 0) {
return _searcher->simpleSearch(_hashGenerator, _T,
_markers, _SA, pattern);
_markers, _SA, pattern, byWhitespace);
} else {
MatchedPatternFragment result(0, 0);
return result;
@ -235,11 +236,12 @@ std::vector<AnubisSearchResult> Concordia::anubisSearch(
}
boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
const std::string & pattern)
const std::string & pattern,
bool byWhitespace)
throw(ConcordiaException) {
if (_T->size() > 0) {
return _searcher->concordiaSearch(_hashGenerator, _T,
_markers, _SA, pattern);
_markers, _SA, pattern, byWhitespace);
} else {
std::string empty;
return boost::shared_ptr<ConcordiaSearchResult>(

View File

@ -126,10 +126,12 @@ public:
/*! Performs a simple substring lookup on the index.
For more info see \ref tutorial1_2.
\param pattern pattern to be searched in the index
\param byWhitespace whether to tokenize the pattern by white space
\returns matched pattern fragment containing vector of occurrences
\throws ConcordiaException
*/
MatchedPatternFragment simpleSearch(const std::string & pattern)
MatchedPatternFragment simpleSearch(const std::string & pattern,
bool byWhitespace = false)
throw(ConcordiaException);
SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)
@ -154,7 +156,8 @@ public:
\throws ConcordiaException
*/
boost::shared_ptr<ConcordiaSearchResult> concordiaSearch(
const std::string & pattern)
const std::string & pattern,
bool byWhitespace = false)
throw(ConcordiaException);
/*! Loads HDD stored index files to RAM and generates

View File

@ -18,10 +18,11 @@ MatchedPatternFragment IndexSearcher::simpleSearch(
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException) {
const std::string & pattern,
bool byWhitespace) throw(ConcordiaException) {
int left;
std::vector<INDEX_CHARACTER_TYPE> hash =
hashGenerator->generateHash(pattern).getCodes();
hashGenerator->generateHash(pattern, byWhitespace).getCodes();
saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
@ -110,8 +111,9 @@ boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException) {
TokenizedSentence hashedPattern = hashGenerator->generateHash(pattern);
const std::string & pattern,
bool byWhitespace) throw(ConcordiaException) {
TokenizedSentence hashedPattern = hashGenerator->generateHash(pattern, byWhitespace);
boost::shared_ptr<ConcordiaSearchResult> result =
boost::shared_ptr<ConcordiaSearchResult>(
new ConcordiaSearchResult(hashedPattern));

View File

@ -50,7 +50,8 @@ public:
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException);
const std::string & pattern,
bool byWhitespace = false) throw(ConcordiaException);
SUFFIX_MARKER_TYPE countOccurences(
boost::shared_ptr<HashGenerator> hashGenerator,
@ -71,6 +72,7 @@ public:
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern string pattern to be searched in the index.
\param byWhitespace whether to tokenize the pattern by white space
\returns vector of results
\throws ConcordiaException
*/
@ -92,6 +94,7 @@ public:
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern pattern to be searched in the index.
\param byWhitespace whether to tokenize the pattern by white space
\returns result of the search
\throws ConcordiaException
*/
@ -100,7 +103,8 @@ public:
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException);
const std::string & pattern,
bool byWhitespace = false) throw(ConcordiaException);
private:
boost::shared_ptr<ConcordiaSearcher> _concordiaSearcher;