occurrence refactoring

This commit is contained in:
rjawor 2019-01-22 14:07:28 +01:00
parent 73b3d22d97
commit d39c0400c9
17 changed files with 268 additions and 268 deletions

View File

@ -178,14 +178,14 @@ int main(int argc, char** argv) {
concordia.simpleSearch(pattern); concordia.simpleSearch(pattern);
time_end = boost::posix_time::microsec_clock::local_time(); time_end = boost::posix_time::microsec_clock::local_time();
msdiff = time_end - time_start; msdiff = time_end - time_start;
std::cout << "\tFound: " << result.getOccurences().size() std::cout << "\tFound: " << result.getOccurrences().size()
<< " matches. " << "Search took: " << " matches. " << "Search took: "
<< msdiff.total_milliseconds() << "ms." << std::endl; << msdiff.total_milliseconds() << "ms." << std::endl;
if (!cli.count("silent")) { if (!cli.count("silent")) {
BOOST_FOREACH(SubstringOccurence occurence, BOOST_FOREACH(SubstringOccurrence occurrence,
result.getOccurences()) { result.getOccurrences()) {
std::cout << "\t\tfound match in sentence number: " std::cout << "\t\tfound match in sentence number: "
<< occurence.getId() << std::endl; << occurrence.getId() << std::endl;
} }
} }
} else if (cli.count("anubis-search")) { } else if (cli.count("anubis-search")) {
@ -237,7 +237,7 @@ int main(int argc, char** argv) {
<< "," << fragment.getEnd() << "," << fragment.getEnd()
<< "] (exampleCount," << "] (exampleCount,"
<< " patternOffset, length): " << " patternOffset, length): "
<< fragment.getOccurences().size() << "," << fragment.getOccurrences().size() << ","
<< fragment.getPatternOffset() << "," << fragment.getPatternOffset() << ","
<< fragment.getMatchedLength() << fragment.getMatchedLength()
<< std::endl; << std::endl;
@ -250,7 +250,7 @@ int main(int argc, char** argv) {
<< "," << fragment.getEnd() << "," << fragment.getEnd()
<< "] (exampleCount," << "] (exampleCount,"
<< " patternOffset, length): " << " patternOffset, length): "
<< fragment.getOccurences().size() << "," << fragment.getOccurrences().size() << ","
<< fragment.getPatternOffset() << "," << fragment.getPatternOffset() << ","
<< fragment.getMatchedLength() << fragment.getMatchedLength()
<< std::endl; << std::endl;

View File

@ -192,9 +192,9 @@ void Concordia::_initializeIndex() {
} }
} }
SUFFIX_MARKER_TYPE Concordia::countOccurences(const std::string & pattern) { SUFFIX_MARKER_TYPE Concordia::countOccurrences(const std::string & pattern) {
if (_T->size() > 0) { if (_T->size() > 0) {
return _searcher->countOccurences(_hashGenerator, _T, return _searcher->countOccurrences(_hashGenerator, _T,
_markers, _SA, pattern); _markers, _SA, pattern);
} else { } else {
return 0; return 0;
@ -215,7 +215,7 @@ MatchedPatternFragment Concordia::simpleSearch(
} }
} }
OccurencesList Concordia::fullSearch( OccurrencesList Concordia::fullSearch(
const std::string & pattern, const std::string & pattern,
int limit, int limit,
int offset, int offset,
@ -225,7 +225,7 @@ OccurencesList Concordia::fullSearch(
_markers, _SA, pattern, limit, offset, byWhitespace); _markers, _SA, pattern, limit, offset, byWhitespace);
} else { } else {
// If the index or search pattern are empty, return an empty result. // If the index or search pattern are empty, return an empty result.
OccurencesList result(0); OccurrencesList result(0);
return result; return result;
} }
} }

View File

@ -9,7 +9,7 @@
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "concordia/example.hpp" #include "concordia/example.hpp"
#include "concordia/matched_pattern_fragment.hpp" #include "concordia/matched_pattern_fragment.hpp"
#include "concordia/occurences_list.hpp" #include "concordia/occurrences_list.hpp"
#include "concordia/concordia_config.hpp" #include "concordia/concordia_config.hpp"
#include "concordia/concordia_index.hpp" #include "concordia/concordia_index.hpp"
#include "concordia/index_searcher.hpp" #include "concordia/index_searcher.hpp"
@ -121,24 +121,24 @@ public:
For more info see \ref tutorial1_2. For more info see \ref tutorial1_2.
\param pattern pattern to be searched in the index \param pattern pattern to be searched in the index
\param byWhitespace whether to tokenize the pattern by white space \param byWhitespace whether to tokenize the pattern by white space
\returns matched pattern fragment containing vector of occurences \returns matched pattern fragment containing vector of occurrences
\throws ConcordiaException \throws ConcordiaException
*/ */
MatchedPatternFragment simpleSearch(const std::string & pattern, MatchedPatternFragment simpleSearch(const std::string & pattern,
bool byWhitespace = false); bool byWhitespace = false);
/*! Performs a substring lookup in RAM-based index, returning all occurences. /*! Performs a substring lookup in RAM-based index, returning all occurrences.
The result contains no more than "limit" occurences, starting at "offset". The result contains no more than "limit" occurrences, starting at "offset".
\param hashGenerator hash generator to be used to convert \param hashGenerator hash generator to be used to convert
input sentence to a hash input sentence to a hash
\param pattern string pattern to be searched in the index. \param pattern string pattern to be searched in the index.
\param limit maximum number of occurences to return \param limit maximum number of occurrences to return
\param offset starting occurence \param offset starting occurrence
\param byWhitespace should the pattern by tokenized by white space \param byWhitespace should the pattern by tokenized by white space
\returns list of occurences of the pattern in the index \returns list of occurrences of the pattern in the index
\throws ConcordiaException \throws ConcordiaException
*/ */
OccurencesList fullSearch( OccurrencesList fullSearch(
const std::string & pattern, const std::string & pattern,
int limit, int limit,
int offset, int offset,
@ -151,13 +151,13 @@ public:
the lexicon search requires that the match is the whole example source. the lexicon search requires that the match is the whole example source.
\param pattern pattern to be searched in the index \param pattern pattern to be searched in the index
\param byWhitespace whether to tokenize the pattern by white space \param byWhitespace whether to tokenize the pattern by white space
\returns matched pattern fragment containing vector of occurences \returns matched pattern fragment containing vector of occurrences
\throws ConcordiaException \throws ConcordiaException
*/ */
MatchedPatternFragment lexiconSearch(const std::string & pattern, MatchedPatternFragment lexiconSearch(const std::string & pattern,
bool byWhitespace = false); bool byWhitespace = false);
SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern); SUFFIX_MARKER_TYPE countOccurrences(const std::string & pattern);
/*! \deprecated /*! \deprecated
Finds the examples from the index, whose resemblance to the Finds the examples from the index, whose resemblance to the

View File

@ -32,15 +32,15 @@ void ConcordiaSearcher::concordiaSearch(
std::vector<sauchar_t> currentPattern( std::vector<sauchar_t> currentPattern(
patternVector.begin()+highResOffset, patternVector.end()); patternVector.begin()+highResOffset, patternVector.end());
SUFFIX_MARKER_TYPE lcpLength; SUFFIX_MARKER_TYPE lcpLength;
std::vector<SubstringOccurence> occurences = std::vector<SubstringOccurrence> occurrences =
lcpSearch(T, markers, SA, currentPattern, lcpLength); lcpSearch(T, markers, SA, currentPattern, lcpLength);
if (occurences.size() > 0) { if (occurrences.size() > 0) {
MatchedPatternFragment fragment(offset, MatchedPatternFragment fragment(offset,
lcpLength / sizeof(INDEX_CHARACTER_TYPE)); lcpLength / sizeof(INDEX_CHARACTER_TYPE));
BOOST_FOREACH(SubstringOccurence occurence, occurences) { BOOST_FOREACH(SubstringOccurrence occurrence, occurrences) {
fragment.addOccurence(occurence); fragment.addOccurrence(occurrence);
} }
result->addFragment(fragment); result->addFragment(fragment);
} }
@ -155,7 +155,7 @@ boost::shared_ptr<TmMatchesMap> ConcordiaSearcher::getTmMatches(
return tmMatchesMap; return tmMatchesMap;
} }
std::vector<SubstringOccurence> ConcordiaSearcher::lcpSearch( std::vector<SubstringOccurrence> ConcordiaSearcher::lcpSearch(
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<saidx_t> > SA,
@ -185,7 +185,7 @@ std::vector<SubstringOccurence> ConcordiaSearcher::lcpSearch(
SAleft += localLeft; SAleft += localLeft;
} while (patternLength < pattern.size() && size > 0); } while (patternLength < pattern.size() && size > 0);
std::vector<SubstringOccurence> result; std::vector<SubstringOccurrence> result;
if (size == 0) { if (size == 0) {
// The search managed to find exactly the longest common prefixes. // The search managed to find exactly the longest common prefixes.
@ -208,7 +208,7 @@ std::vector<SubstringOccurence> ConcordiaSearcher::lcpSearch(
} }
void ConcordiaSearcher::_collectResults( void ConcordiaSearcher::_collectResults(
std::vector<SubstringOccurence> & result, std::vector<SubstringOccurrence> & result,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<saidx_t> > SA,
saidx_t left, saidx_t size) { saidx_t left, saidx_t size) {
@ -219,7 +219,7 @@ void ConcordiaSearcher::_collectResults(
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) { if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
SUFFIX_MARKER_TYPE marker = SUFFIX_MARKER_TYPE marker =
markers->at(resultPos / sizeof(INDEX_CHARACTER_TYPE)); markers->at(resultPos / sizeof(INDEX_CHARACTER_TYPE));
result.push_back(SubstringOccurence(marker)); result.push_back(SubstringOccurrence(marker));
// truncate results, // truncate results,
// we don't need too many identical pattern overlays // we don't need too many identical pattern overlays
@ -237,54 +237,54 @@ void ConcordiaSearcher::_addToMap(boost::shared_ptr<std::vector<saidx_t> > SA,
SUFFIX_MARKER_TYPE totalPatternLength, SUFFIX_MARKER_TYPE totalPatternLength,
SUFFIX_MARKER_TYPE matchedFragmentLength, SUFFIX_MARKER_TYPE matchedFragmentLength,
SUFFIX_MARKER_TYPE patternOffset) { SUFFIX_MARKER_TYPE patternOffset) {
SubstringOccurence occurence; SubstringOccurrence occurrence;
if (_getOccurenceFromSA(SA, markers, sa_pos, occurence)) { if (_getOccurrenceFromSA(SA, markers, sa_pos, occurrence)) {
_addOccurenceToMap(tmMatchesMap, _addOccurrenceToMap(tmMatchesMap,
occurence, occurrence,
totalPatternLength, totalPatternLength,
matchedFragmentLength, matchedFragmentLength,
patternOffset); patternOffset);
} }
} }
bool ConcordiaSearcher::_getOccurenceFromSA( bool ConcordiaSearcher::_getOccurrenceFromSA(
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<saidx_t> > SA,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
saidx_t sa_pos, saidx_t sa_pos,
SubstringOccurence & occurence) { SubstringOccurrence & occurrence) {
saidx_t resultPos = SA->at(sa_pos); saidx_t resultPos = SA->at(sa_pos);
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) { if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
SUFFIX_MARKER_TYPE marker = SUFFIX_MARKER_TYPE marker =
markers->at(resultPos / sizeof(INDEX_CHARACTER_TYPE)); markers->at(resultPos / sizeof(INDEX_CHARACTER_TYPE));
occurence.enterDataFromMarker(marker); occurrence.enterDataFromMarker(marker);
} }
} }
void ConcordiaSearcher::_addOccurenceToMap( void ConcordiaSearcher::_addOccurrenceToMap(
boost::shared_ptr<TmMatchesMap> tmMatchesMap, boost::shared_ptr<TmMatchesMap> tmMatchesMap,
SubstringOccurence & occurence, SubstringOccurrence & occurrence,
SUFFIX_MARKER_TYPE totalPatternLength, SUFFIX_MARKER_TYPE totalPatternLength,
SUFFIX_MARKER_TYPE matchedFragmentLength, SUFFIX_MARKER_TYPE matchedFragmentLength,
SUFFIX_MARKER_TYPE patternOffset) { SUFFIX_MARKER_TYPE patternOffset) {
TmMatches * tmMatches; TmMatches * tmMatches;
TmMatchesMapIterator mapIterator = tmMatchesMap->find( TmMatchesMapIterator mapIterator = tmMatchesMap->find(
occurence.getId()); occurrence.getId());
if (mapIterator != tmMatchesMap->end()) { if (mapIterator != tmMatchesMap->end()) {
tmMatches = mapIterator->second; tmMatches = mapIterator->second;
} else { } else {
tmMatches = new TmMatches(occurence.getId(), tmMatches = new TmMatches(occurrence.getId(),
occurence.getExampleLength(), occurrence.getExampleLength(),
totalPatternLength); totalPatternLength);
SUFFIX_MARKER_TYPE key = occurence.getId(); SUFFIX_MARKER_TYPE key = occurrence.getId();
tmMatchesMap->insert(key, tmMatches); tmMatchesMap->insert(key, tmMatches);
} }
// add intervals to tmMatches // add intervals to tmMatches
tmMatches->addExampleInterval( tmMatches->addExampleInterval(
occurence.getOffset(), occurrence.getOffset(),
occurence.getOffset() + matchedFragmentLength); occurrence.getOffset() + matchedFragmentLength);
tmMatches->addPatternInterval( tmMatches->addPatternInterval(
patternOffset, patternOffset,
patternOffset + matchedFragmentLength); patternOffset + matchedFragmentLength);

View File

@ -5,7 +5,7 @@
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "concordia/common/utils.hpp" #include "concordia/common/utils.hpp"
#include "concordia/substring_occurence.hpp" #include "concordia/substring_occurrence.hpp"
#include "concordia/concordia_exception.hpp" #include "concordia/concordia_exception.hpp"
#include "concordia/concordia_config.hpp" #include "concordia/concordia_config.hpp"
#include "concordia/concordia_search_result.hpp" #include "concordia/concordia_search_result.hpp"
@ -100,7 +100,7 @@ public:
\returns list of locations of the longest fragments \returns list of locations of the longest fragments
\throws ConcordiaException \throws ConcordiaException
*/ */
std::vector<SubstringOccurence> lcpSearch( std::vector<SubstringOccurrence> lcpSearch(
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<saidx_t> > SA,
@ -108,7 +108,7 @@ public:
SUFFIX_MARKER_TYPE & length); SUFFIX_MARKER_TYPE & length);
private: private:
void _collectResults(std::vector<SubstringOccurence> & result, void _collectResults(std::vector<SubstringOccurrence> & result,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<saidx_t> > SA,
saidx_t left, saidx_t size); saidx_t left, saidx_t size);
@ -121,13 +121,13 @@ private:
SUFFIX_MARKER_TYPE matchedFragmentLength, SUFFIX_MARKER_TYPE matchedFragmentLength,
SUFFIX_MARKER_TYPE patternOffset); SUFFIX_MARKER_TYPE patternOffset);
bool _getOccurenceFromSA(boost::shared_ptr<std::vector<saidx_t> > SA, bool _getOccurrenceFromSA(boost::shared_ptr<std::vector<saidx_t> > SA,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
saidx_t sa_pos, saidx_t sa_pos,
SubstringOccurence & occurence); SubstringOccurrence & occurrence);
void _addOccurenceToMap(boost::shared_ptr<TmMatchesMap> tmMatchesMap, void _addOccurrenceToMap(boost::shared_ptr<TmMatchesMap> tmMatchesMap,
SubstringOccurence & occurence, SubstringOccurrence & occurrence,
SUFFIX_MARKER_TYPE totalPatternLength, SUFFIX_MARKER_TYPE totalPatternLength,
SUFFIX_MARKER_TYPE matchedFragmentLength, SUFFIX_MARKER_TYPE matchedFragmentLength,
SUFFIX_MARKER_TYPE patternOffset); SUFFIX_MARKER_TYPE patternOffset);

View File

@ -42,10 +42,10 @@ MatchedPatternFragment IndexSearcher::simpleSearch(
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE); saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos); SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
SubstringOccurence occurence; SubstringOccurrence occurrence;
occurence.enterDataFromMarker(marker); occurrence.enterDataFromMarker(marker);
result.addOccurence(occurence); result.addOccurrence(occurrence);
if (result.getOccurences().size() >= CONCORDIA_SEARCH_MAX_RESULTS) { if (result.getOccurrences().size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
break; break;
} }
} }
@ -55,7 +55,7 @@ MatchedPatternFragment IndexSearcher::simpleSearch(
return result; return result;
} }
OccurencesList IndexSearcher::fullSearch( OccurrencesList IndexSearcher::fullSearch(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -74,7 +74,7 @@ OccurencesList IndexSearcher::fullSearch(
(const sauchar_t *) patternArray, patternLength, (const sauchar_t *) patternArray, patternLength,
SA->data(), (saidx_t) SA->size(), &left); SA->data(), (saidx_t) SA->size(), &left);
OccurencesList result(size); OccurrencesList result(size);
int returnedResults = limit; int returnedResults = limit;
if ((size - offset) < limit) { if ((size - offset) < limit) {
@ -91,9 +91,9 @@ OccurencesList IndexSearcher::fullSearch(
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE); saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos); SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
SubstringOccurence occurence; SubstringOccurrence occurrence;
occurence.enterDataFromMarker(marker); occurrence.enterDataFromMarker(marker);
result.addOccurence(occurence); result.addOccurrence(occurrence);
} }
} }
@ -148,10 +148,10 @@ MatchedPatternFragment IndexSearcher::lexiconSearch(
// so we should look at the marker of the next character // so we should look at the marker of the next character
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos + 1); SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos + 1);
SubstringOccurence occurence; SubstringOccurrence occurrence;
occurence.enterDataFromMarker(marker); occurrence.enterDataFromMarker(marker);
result.addOccurence(occurence); result.addOccurrence(occurrence);
if (result.getOccurences().size() >= CONCORDIA_SEARCH_MAX_RESULTS) { if (result.getOccurrences().size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
break; break;
} }
} }
@ -161,7 +161,7 @@ MatchedPatternFragment IndexSearcher::lexiconSearch(
return result; return result;
} }
SUFFIX_MARKER_TYPE IndexSearcher::countOccurences( SUFFIX_MARKER_TYPE IndexSearcher::countOccurrences(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -182,7 +182,7 @@ SUFFIX_MARKER_TYPE IndexSearcher::countOccurences(
(const sauchar_t *) patternArray, patternLength, (const sauchar_t *) patternArray, patternLength,
SA->data(), (saidx_t) SA->size(), &left); SA->data(), (saidx_t) SA->size(), &left);
SUFFIX_MARKER_TYPE occurencesCount = 0; SUFFIX_MARKER_TYPE occurrencesCount = 0;
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
saidx_t resultPos = SA->at(left + i); saidx_t resultPos = SA->at(left + i);
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) { if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
@ -191,13 +191,13 @@ SUFFIX_MARKER_TYPE IndexSearcher::countOccurences(
// obtain accidental results exceeding the boundaries // obtain accidental results exceeding the boundaries
// of characters in hashed index. The above check // of characters in hashed index. The above check
// removes these accidental results. // removes these accidental results.
occurencesCount++; occurrencesCount++;
} }
} }
delete[] patternArray; delete[] patternArray;
return occurencesCount; return occurrencesCount;
} }

View File

@ -8,7 +8,7 @@
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "concordia/matched_pattern_fragment.hpp" #include "concordia/matched_pattern_fragment.hpp"
#include "concordia/occurences_list.hpp" #include "concordia/occurrences_list.hpp"
#include "concordia/hash_generator.hpp" #include "concordia/hash_generator.hpp"
#include "concordia/concordia_exception.hpp" #include "concordia/concordia_exception.hpp"
#include "concordia/concordia_searcher.hpp" #include "concordia/concordia_searcher.hpp"
@ -43,7 +43,7 @@ public:
\param markers markers array for the needs of searching \param markers markers array for the needs of searching
\param SA suffix array for the needs of searching \param SA suffix array for the needs of searching
\param pattern string pattern to be searched in the index. \param pattern string pattern to be searched in the index.
\returns matched pattern fragment, containing occurences of the pattern in the index \returns matched pattern fragment, containing occurrences of the pattern in the index
\throws ConcordiaException \throws ConcordiaException
*/ */
MatchedPatternFragment simpleSearch( MatchedPatternFragment simpleSearch(
@ -54,21 +54,21 @@ public:
const std::string & pattern, const std::string & pattern,
bool byWhitespace = false); bool byWhitespace = false);
/*! Performs a substring lookup in RAM-based index, returning all occurences. /*! Performs a substring lookup in RAM-based index, returning all occurrences.
The result contains no more than "limit" occurences, starting at "offset". The result contains no more than "limit" occurrences, starting at "offset".
\param hashGenerator hash generator to be used to convert \param hashGenerator hash generator to be used to convert
input sentence to a hash input sentence to a hash
\param T hashed index to search in \param T hashed index to search in
\param markers markers array for the needs of searching \param markers markers array for the needs of searching
\param SA suffix array for the needs of searching \param SA suffix array for the needs of searching
\param pattern string pattern to be searched in the index. \param pattern string pattern to be searched in the index.
\param limit maximum number of occurences to return \param limit maximum number of occurrences to return
\param offset starting occurence \param offset starting occurrence
\param byWhitespace should the pattern by tokenized by white space \param byWhitespace should the pattern by tokenized by white space
\returns list of occurences of the pattern in the index \returns list of occurrences of the pattern in the index
\throws ConcordiaException \throws ConcordiaException
*/ */
OccurencesList fullSearch( OccurrencesList fullSearch(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -89,7 +89,7 @@ public:
\param markers markers array for the needs of searching \param markers markers array for the needs of searching
\param SA suffix array for the needs of searching \param SA suffix array for the needs of searching
\param pattern string pattern to be searched in the index. \param pattern string pattern to be searched in the index.
\returns matched pattern fragment, containing occurences of the pattern in the index \returns matched pattern fragment, containing occurrences of the pattern in the index
\throws ConcordiaException \throws ConcordiaException
*/ */
MatchedPatternFragment lexiconSearch( MatchedPatternFragment lexiconSearch(
@ -100,7 +100,7 @@ public:
const std::string & pattern, const std::string & pattern,
bool byWhitespace = false); bool byWhitespace = false);
SUFFIX_MARKER_TYPE countOccurences( SUFFIX_MARKER_TYPE countOccurrences(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,

View File

@ -12,7 +12,7 @@ MatchedPatternFragment::MatchedPatternFragment(
MatchedPatternFragment::~MatchedPatternFragment() { MatchedPatternFragment::~MatchedPatternFragment() {
} }
void MatchedPatternFragment::addOccurence( void MatchedPatternFragment::addOccurrence(
const SubstringOccurence & occurence) { const SubstringOccurrence & occurrence) {
_occurences.push_back(occurence); _occurrences.push_back(occurrence);
} }

View File

@ -3,7 +3,7 @@
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "concordia/interval.hpp" #include "concordia/interval.hpp"
#include "concordia/substring_occurence.hpp" #include "concordia/substring_occurrence.hpp"
#include <vector> #include <vector>
#include <iostream> #include <iostream>
#include <boost/foreach.hpp> #include <boost/foreach.hpp>
@ -30,17 +30,17 @@ public:
*/ */
virtual ~MatchedPatternFragment(); virtual ~MatchedPatternFragment();
/*! Getter for occurences. /*! Getter for occurrences.
\returns occurences \returns occurrences
*/ */
std::vector<SubstringOccurence> getOccurences() const { std::vector<SubstringOccurrence> getOccurrences() const {
return _occurences; return _occurrences;
} }
/*! Adds an occurence to the list. /*! Adds an occurrence to the list.
\param fragment occurence to be added \param fragment occurrence to be added
*/ */
void addOccurence(const SubstringOccurence & occurence); void addOccurrence(const SubstringOccurrence & occurrence);
/*! Getter for pattern offset. /*! Getter for pattern offset.
\returns pattern offset \returns pattern offset
@ -68,8 +68,8 @@ public:
o << "fragment(patternOffset=" << fragment.getPatternOffset() o << "fragment(patternOffset=" << fragment.getPatternOffset()
<< ", matchedLength=" << fragment.getMatchedLength() << ") {" << ", matchedLength=" << fragment.getMatchedLength() << ") {"
<< std::endl; << std::endl;
BOOST_FOREACH(SubstringOccurence occurence, fragment.getOccurences()) { BOOST_FOREACH(SubstringOccurrence occurrence, fragment.getOccurrences()) {
o << "\t" << occurence << std::endl; o << "\t" << occurrence << std::endl;
} }
o << "}"; o << "}";
@ -78,7 +78,7 @@ public:
private: private:
std::vector<SubstringOccurence> _occurences; std::vector<SubstringOccurrence> _occurrences;
SUFFIX_MARKER_TYPE _patternOffset; SUFFIX_MARKER_TYPE _patternOffset;

View File

@ -1,13 +0,0 @@
#include "concordia/occurences_list.hpp"
/*! Constructor; stores the given total count.
    \param totalCount value later reported by getTotalCount()
*/
OccurencesList::OccurencesList(const SUFFIX_MARKER_TYPE & totalCount)
    : _totalCount(totalCount) {}
/*! Destructor; performs no explicit work. */
OccurencesList::~OccurencesList() {}
/*! Appends a copy of the given occurrence at the end of the stored list.
    \param occurence occurrence to be added
*/
void OccurencesList::addOccurence(const SubstringOccurence & occurence) {
    _occurences.insert(_occurences.end(), occurence);
}

View File

@ -1,50 +0,0 @@
#ifndef OCCURENCES_LIST_HDR
#define OCCURENCES_LIST_HDR
#include "concordia/common/config.hpp"
#include "concordia/substring_occurence.hpp"
#include <vector>
#include <iostream>
#include <boost/foreach.hpp>
/*!
Class representing the occurences list in full search. The list only
contains as many occurences as specified in the "limit" parameter for full search.
The "totalCount" field stores the total number of occurences available.
*/
class OccurencesList {
public:
/*! Constructor.
\param totalCount total number of occurrences available, as later
returned by getTotalCount()
*/
explicit OccurencesList(const SUFFIX_MARKER_TYPE & totalCount);
/*! Destructor.
*/
virtual ~OccurencesList();
/*! Getter for occurrences.
The vector is returned by value, so the caller receives a copy.
\returns occurrences added so far via addOccurence()
*/
std::vector<SubstringOccurence> getOccurences() const {
return _occurences;
}
/*! Getter for the total count.
\returns the value supplied to the constructor
*/
SUFFIX_MARKER_TYPE getTotalCount() const {
return _totalCount;
}
/*! Adds an occurrence to the list.
\param occurence occurrence to be added
*/
void addOccurence(const SubstringOccurence & occurence);
private:
// occurrences collected so far through addOccurence()
std::vector<SubstringOccurence> _occurences;
// total count supplied at construction
SUFFIX_MARKER_TYPE _totalCount;
};
#endif

View File

@ -0,0 +1,13 @@
#include "concordia/occurrences_list.hpp"
/*! Constructor; stores the given total count.
    \param totalCount value later reported by getTotalCount()
*/
OccurrencesList::OccurrencesList(const SUFFIX_MARKER_TYPE & totalCount)
    : _totalCount(totalCount) {}
/*! Destructor; performs no explicit work. */
OccurrencesList::~OccurrencesList() {}
/*! Appends a copy of the given occurrence at the end of the stored list.
    \param occurrence occurrence to be added
*/
void OccurrencesList::addOccurrence(const SubstringOccurrence & occurrence) {
    _occurrences.insert(_occurrences.end(), occurrence);
}

View File

@ -0,0 +1,50 @@
#ifndef OCCURRENCES_LIST_HDR
#define OCCURRENCES_LIST_HDR
#include "concordia/common/config.hpp"
#include "concordia/substring_occurrence.hpp"
#include <vector>
#include <iostream>
#include <boost/foreach.hpp>
/*!
Class representing the occurrences list in full search. The list only
contains as many occurrences as specified in the "limit" parameter for full search.
The "totalCount" field stores the total number of occurrences available.
*/
class OccurrencesList {
public:
/*! Constructor.
\param totalCount total number of occurrences available, as later
returned by getTotalCount()
*/
explicit OccurrencesList(const SUFFIX_MARKER_TYPE & totalCount);
/*! Destructor.
*/
virtual ~OccurrencesList();
/*! Getter for occurrences.
The vector is returned by value, so the caller receives a copy.
\returns occurrences added so far via addOccurrence()
*/
std::vector<SubstringOccurrence> getOccurrences() const {
return _occurrences;
}
/*! Getter for the total count.
\returns the value supplied to the constructor
*/
SUFFIX_MARKER_TYPE getTotalCount() const {
return _totalCount;
}
/*! Adds an occurrence to the list.
\param occurrence occurrence to be added
*/
void addOccurrence(const SubstringOccurrence & occurrence);
private:
// occurrences collected so far through addOccurrence()
std::vector<SubstringOccurrence> _occurrences;
// total count supplied at construction
SUFFIX_MARKER_TYPE _totalCount;
};
#endif

View File

@ -1,16 +1,16 @@
#include "concordia/substring_occurence.hpp" #include "concordia/substring_occurrence.hpp"
#include "concordia/common/utils.hpp" #include "concordia/common/utils.hpp"
SubstringOccurence::SubstringOccurence() { SubstringOccurrence::SubstringOccurrence() {
} }
SubstringOccurence::SubstringOccurence(const SUFFIX_MARKER_TYPE & marker) { SubstringOccurrence::SubstringOccurrence(const SUFFIX_MARKER_TYPE & marker) {
_id = Utils::getIdFromMarker(marker); _id = Utils::getIdFromMarker(marker);
_offset = Utils::getOffsetFromMarker(marker); _offset = Utils::getOffsetFromMarker(marker);
_exampleLength = Utils::getLengthFromMarker(marker); _exampleLength = Utils::getLengthFromMarker(marker);
} }
void SubstringOccurence::enterDataFromMarker( void SubstringOccurrence::enterDataFromMarker(
const SUFFIX_MARKER_TYPE & marker) { const SUFFIX_MARKER_TYPE & marker) {
_id = Utils::getIdFromMarker(marker); _id = Utils::getIdFromMarker(marker);
_offset = Utils::getOffsetFromMarker(marker); _offset = Utils::getOffsetFromMarker(marker);
@ -18,7 +18,7 @@ void SubstringOccurence::enterDataFromMarker(
} }
SubstringOccurence::SubstringOccurence( SubstringOccurrence::SubstringOccurrence(
const SUFFIX_MARKER_TYPE & id, const SUFFIX_MARKER_TYPE & id,
const SUFFIX_MARKER_TYPE & offset, const SUFFIX_MARKER_TYPE & offset,
const SUFFIX_MARKER_TYPE & exampleLength): const SUFFIX_MARKER_TYPE & exampleLength):
@ -27,6 +27,6 @@ SubstringOccurence::SubstringOccurence(
_exampleLength(exampleLength) { _exampleLength(exampleLength) {
} }
SubstringOccurence::~SubstringOccurence() { SubstringOccurrence::~SubstringOccurrence() {
} }

View File

@ -1,31 +1,31 @@
#ifndef SUBSTRING_OCCURENCE_HDR #ifndef SUBSTRING_OCCURRENCE_HDR
#define SUBSTRING_OCCURENCE_HDR #define SUBSTRING_OCCURRENCE_HDR
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include <string> #include <string>
#include <iostream> #include <iostream>
/*! /*!
Class representing occurence of a searched substring. Class representing occurrence of a searched substring.
It holds the following information: It holds the following information:
- id of the example where the substring was found - id of the example where the substring was found
- offset of the matched substring in this example - offset of the matched substring in this example
- length of the example - length of the example
*/ */
class SubstringOccurence { class SubstringOccurrence {
public: public:
/*! /*!
Constructor. Constructor.
*/ */
SubstringOccurence(); SubstringOccurrence();
/*! /*!
Constructor taking data from a marker. Constructor taking data from a marker.
\param marker \param marker
*/ */
explicit SubstringOccurence(const SUFFIX_MARKER_TYPE & marker); explicit SubstringOccurrence(const SUFFIX_MARKER_TYPE & marker);
/*! /*!
Constructor with three arguments. Constructor with three arguments.
@ -33,12 +33,12 @@ public:
\param offset offset of the substring in the example \param offset offset of the substring in the example
\param exampleLength length of the example \param exampleLength length of the example
*/ */
SubstringOccurence(const SUFFIX_MARKER_TYPE & id, SubstringOccurrence(const SUFFIX_MARKER_TYPE & id,
const SUFFIX_MARKER_TYPE & offset, const SUFFIX_MARKER_TYPE & offset,
const SUFFIX_MARKER_TYPE & exampleLength); const SUFFIX_MARKER_TYPE & exampleLength);
/*! Destructor. /*! Destructor.
*/ */
virtual ~SubstringOccurence(); virtual ~SubstringOccurrence();
/*! Getter for example id. /*! Getter for example id.
\returns example id \returns example id
@ -67,9 +67,9 @@ public:
void enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker); void enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker);
friend std::ostream & operator << (std::ostream & o, friend std::ostream & operator << (std::ostream & o,
const SubstringOccurence & occurence) { const SubstringOccurrence & occurrence) {
return o << "occurence(exampleId=" << occurence.getId() return o << "occurrence(exampleId=" << occurrence.getId()
<< ", offset=" << occurence.getOffset() << ")"; << ", offset=" << occurrence.getOffset() << ")";
} }

View File

@ -73,14 +73,14 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
concordia.clearIndex(); concordia.clearIndex();
BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 2); BOOST_CHECK_EQUAL(searchResult1.getOccurrences().size(), 2);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 123); BOOST_CHECK_EQUAL(searchResult1.getOccurrences().at(0).getId(), 123);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 1); BOOST_CHECK_EQUAL(searchResult1.getOccurrences().at(0).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getId(), 51); BOOST_CHECK_EQUAL(searchResult1.getOccurrences().at(1).getId(), 51);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getOffset(), 1); BOOST_CHECK_EQUAL(searchResult1.getOccurrences().at(1).getOffset(), 1);
// Checking pattern spanning over 2 segments // Checking pattern spanning over 2 segments
BOOST_CHECK_EQUAL(searchResult2.getOccurences().size(), 0); BOOST_CHECK_EQUAL(searchResult2.getOccurrences().size(), 0);
} }
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 ) BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
@ -137,19 +137,19 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
concordia2.clearIndex(); concordia2.clearIndex();
BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 3); BOOST_CHECK_EQUAL(searchResult1.getOccurrences().size(), 3);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 312); BOOST_CHECK_EQUAL(searchResult1.getOccurrences().at(0).getId(), 312);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 0); BOOST_CHECK_EQUAL(searchResult1.getOccurrences().at(0).getOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getId(), 45); BOOST_CHECK_EQUAL(searchResult1.getOccurrences().at(1).getId(), 45);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getOffset(), 1); BOOST_CHECK_EQUAL(searchResult1.getOccurrences().at(1).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(2).getId(), 29); BOOST_CHECK_EQUAL(searchResult1.getOccurrences().at(2).getId(), 29);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(2).getOffset(), 0); BOOST_CHECK_EQUAL(searchResult1.getOccurrences().at(2).getOffset(), 0);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().size(), 2); BOOST_CHECK_EQUAL(searchResult2.getOccurrences().size(), 2);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(0).getId(), 202); BOOST_CHECK_EQUAL(searchResult2.getOccurrences().at(0).getId(), 202);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(0).getOffset(), 1); BOOST_CHECK_EQUAL(searchResult2.getOccurrences().at(0).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(1).getId(), 312); BOOST_CHECK_EQUAL(searchResult2.getOccurrences().at(1).getId(), 312);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(1).getOffset(), 1); BOOST_CHECK_EQUAL(searchResult2.getOccurrences().at(1).getOffset(), 1);
} }
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 ) BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
@ -167,9 +167,9 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
concordia2.clearIndex(); concordia2.clearIndex();
BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 1); BOOST_CHECK_EQUAL(searchResult1.getOccurrences().size(), 1);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 312); BOOST_CHECK_EQUAL(searchResult1.getOccurrences().at(0).getId(), 312);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2); BOOST_CHECK_EQUAL(searchResult1.getOccurrences().at(0).getOffset(), 2);
} }
BOOST_AUTO_TEST_CASE( ConcordiaFullSearch1 ) BOOST_AUTO_TEST_CASE( ConcordiaFullSearch1 )
@ -185,35 +185,35 @@ BOOST_AUTO_TEST_CASE( ConcordiaFullSearch1 )
Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(), Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(),
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
OccurencesList searchResult0 = concordia2.fullSearch("okno", 10, 0); OccurrencesList searchResult0 = concordia2.fullSearch("okno", 10, 0);
/* /*
search0 search0
occurence(exampleId=4, offset=1) occurrence(exampleId=4, offset=1)
occurence(exampleId=3, offset=2) occurrence(exampleId=3, offset=2)
occurence(exampleId=2, offset=2) occurrence(exampleId=2, offset=2)
occurence(exampleId=4, offset=3) occurrence(exampleId=4, offset=3)
occurence(exampleId=1, offset=2) occurrence(exampleId=1, offset=2)
*/ */
OccurencesList searchResult1 = concordia2.fullSearch("okno", 2, 1); OccurrencesList searchResult1 = concordia2.fullSearch("okno", 2, 1);
OccurencesList searchResult2 = concordia2.fullSearch("okno", 10, 3); OccurrencesList searchResult2 = concordia2.fullSearch("okno", 10, 3);
OccurencesList searchResult3 = concordia2.fullSearch("xxx", 10, 3); OccurrencesList searchResult3 = concordia2.fullSearch("xxx", 10, 3);
OccurencesList searchResult4 = concordia2.fullSearch("okno", 10, 6); OccurrencesList searchResult4 = concordia2.fullSearch("okno", 10, 6);
concordia2.clearIndex(); concordia2.clearIndex();
BOOST_CHECK_EQUAL(searchResult1.getTotalCount(), 5); BOOST_CHECK_EQUAL(searchResult1.getTotalCount(), 5);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 2); BOOST_CHECK_EQUAL(searchResult1.getOccurrences().size(), 2);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 3); BOOST_CHECK_EQUAL(searchResult1.getOccurrences().at(0).getId(), 3);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getId(), 2); BOOST_CHECK_EQUAL(searchResult1.getOccurrences().at(1).getId(), 2);
BOOST_CHECK_EQUAL(searchResult2.getTotalCount(), 5); BOOST_CHECK_EQUAL(searchResult2.getTotalCount(), 5);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(0).getId(), 4); BOOST_CHECK_EQUAL(searchResult2.getOccurrences().at(0).getId(), 4);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(1).getId(), 1); BOOST_CHECK_EQUAL(searchResult2.getOccurrences().at(1).getId(), 1);
BOOST_CHECK_EQUAL(searchResult3.getTotalCount(), 0); BOOST_CHECK_EQUAL(searchResult3.getTotalCount(), 0);
BOOST_CHECK_EQUAL(searchResult4.getTotalCount(), 5); BOOST_CHECK_EQUAL(searchResult4.getTotalCount(), 5);
BOOST_CHECK_EQUAL(searchResult4.getOccurences().size(), 0); BOOST_CHECK_EQUAL(searchResult4.getOccurrences().size(), 0);
} }
@ -267,16 +267,16 @@ BOOST_AUTO_TEST_CASE( ConcordiaLexiconSearch1 )
concordia.clearIndex(); concordia.clearIndex();
// first two patterns do not cover the whole example source // first two patterns do not cover the whole example source
BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 0); BOOST_CHECK_EQUAL(searchResult1.getOccurrences().size(), 0);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().size(), 0); BOOST_CHECK_EQUAL(searchResult2.getOccurrences().size(), 0);
BOOST_CHECK_EQUAL(searchResult3.getOccurences().size(), 1); BOOST_CHECK_EQUAL(searchResult3.getOccurrences().size(), 1);
BOOST_CHECK_EQUAL(searchResult3.getOccurences().at(0).getId(), 123); BOOST_CHECK_EQUAL(searchResult3.getOccurrences().at(0).getId(), 123);
BOOST_CHECK_EQUAL(searchResult3.getOccurences().at(0).getOffset(), 0); BOOST_CHECK_EQUAL(searchResult3.getOccurrences().at(0).getOffset(), 0);
BOOST_CHECK_EQUAL(searchResult4.getOccurences().size(), 1); BOOST_CHECK_EQUAL(searchResult4.getOccurrences().size(), 1);
BOOST_CHECK_EQUAL(searchResult4.getOccurences().at(0).getId(), 14); BOOST_CHECK_EQUAL(searchResult4.getOccurrences().at(0).getId(), 14);
BOOST_CHECK_EQUAL(searchResult4.getOccurences().at(0).getOffset(), 0); BOOST_CHECK_EQUAL(searchResult4.getOccurrences().at(0).getOffset(), 0);
} }
@ -351,43 +351,43 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
/* /*
adding fragment: offset=0, length=2 adding fragment: offset=0, length=2
adding occurence: example id=167, offset=2 adding occurrence: example id=167, offset=2
adding occurence: example id=45, offset=3 adding occurrence: example id=45, offset=3
adding occurence: example id=51, offset=1 adding occurrence: example id=51, offset=1
adding occurence: example id=123, offset=1 adding occurrence: example id=123, offset=1
adding fragment: offset=1, length=1 adding fragment: offset=1, length=1
adding occurence: example id=167, offset=3 adding occurrence: example id=167, offset=3
adding occurence: example id=45, offset=4 adding occurrence: example id=45, offset=4
adding occurence: example id=51, offset=2 adding occurrence: example id=51, offset=2
adding occurence: example id=123, offset=2 adding occurrence: example id=123, offset=2
adding fragment: offset=2, length=1 adding fragment: offset=2, length=1
adding occurence: example id=167, offset=1 adding occurrence: example id=167, offset=1
*/ */
BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 3); BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 0); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getId(), 167); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurrences().at(0).getId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getOffset(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurrences().at(0).getOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getId(), 45); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurrences().at(1).getId(), 45);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getOffset(), 3); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurrences().at(1).getOffset(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getId(), 51); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurrences().at(2).getId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getOffset(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurrences().at(2).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(0).getId(), 167); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurrences().at(0).getId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(0).getOffset(), 3); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurrences().at(0).getOffset(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getId(), 45); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurrences().at(1).getId(), 45);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getOffset(), 4); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurrences().at(1).getOffset(), 4);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getId(), 51); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurrences().at(2).getId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getOffset(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurrences().at(2).getOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getOccurences().at(0).getId(), 167); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getOccurrences().at(0).getId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getOccurences().at(0).getOffset(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getOccurrences().at(0).getOffset(), 1);
concordia.clearIndex(); concordia.clearIndex();
} }
@ -443,43 +443,43 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
Best overlay { Best overlay {
fragment(patternOffset=1, matchedLength=4) { fragment(patternOffset=1, matchedLength=4) {
occurence(exampleId=321, offset=0) occurrence(exampleId=321, offset=0)
} }
fragment(patternOffset=5, matchedLength=4) { fragment(patternOffset=5, matchedLength=4) {
occurence(exampleId=14, offset=7) occurrence(exampleId=14, offset=7)
} }
} }
All fragments { All fragments {
fragment(patternOffset=4, matchedLength=5) { fragment(patternOffset=4, matchedLength=5) {
occurence(exampleId=14, offset=6) occurrence(exampleId=14, offset=6)
} }
fragment(patternOffset=1, matchedLength=4) { fragment(patternOffset=1, matchedLength=4) {
occurence(exampleId=321, offset=0) occurrence(exampleId=321, offset=0)
} }
fragment(patternOffset=5, matchedLength=4) { fragment(patternOffset=5, matchedLength=4) {
occurence(exampleId=14, offset=7) occurrence(exampleId=14, offset=7)
} }
fragment(patternOffset=2, matchedLength=3) { fragment(patternOffset=2, matchedLength=3) {
occurence(exampleId=321, offset=1) occurrence(exampleId=321, offset=1)
} }
fragment(patternOffset=6, matchedLength=3) { fragment(patternOffset=6, matchedLength=3) {
occurence(exampleId=14, offset=8) occurrence(exampleId=14, offset=8)
} }
fragment(patternOffset=3, matchedLength=2) { fragment(patternOffset=3, matchedLength=2) {
occurence(exampleId=321, offset=2) occurrence(exampleId=321, offset=2)
} }
fragment(patternOffset=7, matchedLength=2) { fragment(patternOffset=7, matchedLength=2) {
occurence(exampleId=14, offset=9) occurrence(exampleId=14, offset=9)
} }
fragment(patternOffset=8, matchedLength=1) { fragment(patternOffset=8, matchedLength=1) {
occurence(exampleId=14, offset=10) occurrence(exampleId=14, offset=10)
} }
} }
*/ */
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getId(), 14); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurrences().at(0).getId(), 14);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getOffset(), 6); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurrences().at(0).getOffset(), 6);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 4); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 4);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 5); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 5);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getStart(), 4); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getStart(), 4);
@ -522,7 +522,7 @@ BOOST_AUTO_TEST_CASE( Tokenize )
} }
BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences ) BOOST_AUTO_TEST_CASE( ConcordiaCountOccurrences )
{ {
Concordia concordia = Concordia(TestResourcesManager::getTempPath(), Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
@ -556,12 +556,12 @@ BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences )
*/ */
BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada"), 0); BOOST_CHECK_EQUAL(concordia.countOccurrences("Ala posiada"), 0);
BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada"), 0); BOOST_CHECK_EQUAL(concordia.countOccurrences("Marysia posiada"), 0);
BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada rysia"), 1); BOOST_CHECK_EQUAL(concordia.countOccurrences("Marysia posiada rysia"), 1);
BOOST_CHECK_EQUAL(concordia.countOccurences("kota Ala posiada"), 0); BOOST_CHECK_EQUAL(concordia.countOccurrences("kota Ala posiada"), 0);
BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada kota"), 2); BOOST_CHECK_EQUAL(concordia.countOccurrences("Ala posiada kota"), 2);
BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada kota i psa"), 1); BOOST_CHECK_EQUAL(concordia.countOccurrences("Ala posiada kota i psa"), 1);
concordia.clearIndex(); concordia.clearIndex();

View File

@ -135,13 +135,13 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
SA->push_back(11); SA->push_back(11);
SUFFIX_MARKER_TYPE highResLength; SUFFIX_MARKER_TYPE highResLength;
std::vector<SubstringOccurence> result = searcher.lcpSearch(T, markers, SA, pattern, highResLength); std::vector<SubstringOccurrence> result = searcher.lcpSearch(T, markers, SA, pattern, highResLength);
SUFFIX_MARKER_TYPE length = highResLength / sizeof(INDEX_CHARACTER_TYPE); SUFFIX_MARKER_TYPE length = highResLength / sizeof(INDEX_CHARACTER_TYPE);
/* Expecting to get the following results from SA: /* Expecting to get the following results from SA:
3: ana 3: ana
1: anana 1: anana
Which are 2 substring occurences (34,3) and (34,1) with the lcp length = 2; Which are 2 substring occurrences (34,3) and (34,1) with the lcp length = 2;
*/ */
BOOST_CHECK_EQUAL(result.size(),2); BOOST_CHECK_EQUAL(result.size(),2);
@ -185,12 +185,12 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
pattern2.push_back(2); pattern2.push_back(2);
SUFFIX_MARKER_TYPE highResLength2; SUFFIX_MARKER_TYPE highResLength2;
std::vector<SubstringOccurence> result2 = searcher.lcpSearch(T, markers, SA, pattern2, highResLength2); std::vector<SubstringOccurrence> result2 = searcher.lcpSearch(T, markers, SA, pattern2, highResLength2);
SUFFIX_MARKER_TYPE length2 = highResLength2 / sizeof(INDEX_CHARACTER_TYPE); SUFFIX_MARKER_TYPE length2 = highResLength2 / sizeof(INDEX_CHARACTER_TYPE);
/* Expecting to get one result from SA: /* Expecting to get one result from SA:
0: banana 0: banana
Which is one substring occurence (34,0) with the lcp length = 6; Which is one substring occurrence (34,0) with the lcp length = 6;
*/ */
@ -228,12 +228,12 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
pattern3.push_back(3); pattern3.push_back(3);
SUFFIX_MARKER_TYPE highResLength3; SUFFIX_MARKER_TYPE highResLength3;
std::vector<SubstringOccurence> result3 = searcher.lcpSearch(T, markers, SA, pattern3, highResLength3); std::vector<SubstringOccurrence> result3 = searcher.lcpSearch(T, markers, SA, pattern3, highResLength3);
SUFFIX_MARKER_TYPE length3 = highResLength3 / sizeof(INDEX_CHARACTER_TYPE); SUFFIX_MARKER_TYPE length3 = highResLength3 / sizeof(INDEX_CHARACTER_TYPE);
/* Expecting to get one result from SA: /* Expecting to get one result from SA:
0: banana 0: banana
Which is one substring occurence (34,0) with the lcp length = 5; Which is one substring occurrence (34,0) with the lcp length = 5;
*/ */
BOOST_CHECK_EQUAL(result3.size(),1); BOOST_CHECK_EQUAL(result3.size(),1);
@ -265,13 +265,13 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
pattern4.push_back(4); pattern4.push_back(4);
SUFFIX_MARKER_TYPE highResLength4; SUFFIX_MARKER_TYPE highResLength4;
std::vector<SubstringOccurence> result4 = searcher.lcpSearch(T, markers, SA, pattern4, highResLength4); std::vector<SubstringOccurrence> result4 = searcher.lcpSearch(T, markers, SA, pattern4, highResLength4);
SUFFIX_MARKER_TYPE length4 = highResLength4 / sizeof(INDEX_CHARACTER_TYPE); SUFFIX_MARKER_TYPE length4 = highResLength4 / sizeof(INDEX_CHARACTER_TYPE);
/* Expecting to get 2 results from SA: /* Expecting to get 2 results from SA:
4: na 4: na
2: nana 2: nana
Which are 2 substring occurences (34,4) and (34,2) with the lcp length = 2; Which are 2 substring occurrences (34,4) and (34,2) with the lcp length = 2;
*/ */
BOOST_CHECK_EQUAL(result4.size(),2); BOOST_CHECK_EQUAL(result4.size(),2);
@ -296,7 +296,7 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
pattern5.push_back(4); pattern5.push_back(4);
SUFFIX_MARKER_TYPE highResLength5; SUFFIX_MARKER_TYPE highResLength5;
std::vector<SubstringOccurence> result5 = searcher.lcpSearch(T, markers, SA, pattern5, highResLength5); std::vector<SubstringOccurrence> result5 = searcher.lcpSearch(T, markers, SA, pattern5, highResLength5);
SUFFIX_MARKER_TYPE length5 = highResLength5 / sizeof(INDEX_CHARACTER_TYPE); SUFFIX_MARKER_TYPE length5 = highResLength5 / sizeof(INDEX_CHARACTER_TYPE);
/* Expecting to get 0 results from SA, lcp length = 0; /* Expecting to get 0 results from SA, lcp length = 0;
@ -320,7 +320,7 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
pattern6.push_back(0); pattern6.push_back(0);
SUFFIX_MARKER_TYPE highResLength6; SUFFIX_MARKER_TYPE highResLength6;
std::vector<SubstringOccurence> result6 = searcher.lcpSearch(T, markers, SA, pattern5, highResLength6); std::vector<SubstringOccurrence> result6 = searcher.lcpSearch(T, markers, SA, pattern5, highResLength6);
SUFFIX_MARKER_TYPE length6 = highResLength6 / sizeof(INDEX_CHARACTER_TYPE); SUFFIX_MARKER_TYPE length6 = highResLength6 / sizeof(INDEX_CHARACTER_TYPE);
/* Expecting to get 0 results from SA, lcp length = 0; /* Expecting to get 0 results from SA, lcp length = 0;