Count occurrences feature
This commit is contained in:
parent
fd32ff7e12
commit
fa3138df29
@ -176,6 +176,17 @@ void Concordia::_initializeIndex() throw(ConcordiaException) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Counts how many times the pattern occurs in the index.
// The actual lookup is delegated to the index searcher, which is handed the
// hash generator, the hashed index, the markers and the suffix array held by
// this object. When the hashed index is empty the searcher is not consulted
// at all and the result is 0.
SUFFIX_MARKER_TYPE Concordia::countOccurences(const std::string & pattern)
                                 throw(ConcordiaException) {
    // Empty index: nothing can match, so skip the search entirely.
    if (_T->empty()) {
        return 0;
    }

    return _searcher->countOccurences(_hashGenerator, _T, _markers, _SA,
                                      pattern);
}
|
||||||
|
|
||||||
|
|
||||||
std::vector<MatchedPatternFragment> Concordia::simpleSearch(
|
std::vector<MatchedPatternFragment> Concordia::simpleSearch(
|
||||||
const std::string & pattern)
|
const std::string & pattern)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
|
@ -119,6 +119,9 @@ public:
|
|||||||
const std::string & pattern)
|
const std::string & pattern)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
|
/*! Counts the occurrences of the pattern in the index.
    The pattern is looked up together with the sentence boundary
    marker that terminates it, so a pattern which is only the
    beginning of a longer indexed sentence is not counted.
    Returns 0 without searching when the index is empty.
  \param pattern pattern to be counted
  \returns number of occurrences of the pattern found in the index
  \throws ConcordiaException
*/
SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)
                             throw(ConcordiaException);
|
||||||
|
|
||||||
/*! \deprecated
|
/*! \deprecated
|
||||||
Finds the examples from the index, whose resemblance to the
|
Finds the examples from the index, whose resemblance to the
|
||||||
pattern is maximal. This method may perform very slow,
|
pattern is maximal. This method may perform very slow,
|
||||||
|
@ -52,6 +52,45 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Counts the sentences of the index that are exactly equal to the pattern.
//
// The pattern is hashed with hashGenerator, terminated with the sentence
// boundary marker, converted to bytes and looked up with sa_search in the
// suffix array SA built over the byte array T. A hit is counted only when:
//   * it starts on an INDEX_CHARACTER_TYPE boundary of T, and
//   * it starts at the very beginning of T or directly after a sentence
//     boundary marker, i.e. at the first word of an indexed sentence.
// The trailing marker anchors the end of the sentence and the second
// condition anchors its beginning; together they give an exact match.
//
// hashGenerator - produces the hash codes of the pattern
// T             - hashed index as a byte array
// markers       - not used by this method
// SA            - suffix array of T
// pattern       - sentence to be counted
// Returns the number of exact sentence matches.
SUFFIX_MARKER_TYPE IndexSearcher::countOccurences(
    boost::shared_ptr<HashGenerator> hashGenerator,
    boost::shared_ptr<std::vector<sauchar_t> > T,
    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
    boost::shared_ptr<std::vector<saidx_t> > SA,
    const std::string & pattern) throw(ConcordiaException) {
    // Frees the array handed over by Utils::indexVectorToSaucharArray on
    // every exit path, including an exception thrown by SA->at() below.
    struct PatternBytes {
        explicit PatternBytes(sauchar_t * array) : data(array) {}
        ~PatternBytes() { delete[] data; }
        sauchar_t * const data;
    };

    std::vector<INDEX_CHARACTER_TYPE> hash =
        hashGenerator->generateHash(pattern).getCodes();

    // append sentence boundary marker, as we are looking only
    // for exact sentence matches
    hash.push_back(INDEX_CHARACTER_TYPE_MAX_VALUE);

    const saidx_t charSize = sizeof(INDEX_CHARACTER_TYPE);
    const saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
    const PatternBytes patternArray(Utils::indexVectorToSaucharArray(hash));

    // The last charSize bytes of the pattern encode the boundary marker
    // appended above. They are reused to recognise the marker inside T.
    // The byte layout is the same on both sides, as sa_search already
    // compares the pattern with T byte by byte.
    const sauchar_t * const markerBytes =
        patternArray.data + (patternLength - charSize);

    saidx_t left = 0;
    const saidx_t size = sa_search(T->data(), static_cast<saidx_t>(T->size()),
                                   patternArray.data, patternLength,
                                   SA->data(), static_cast<saidx_t>(SA->size()),
                                   &left);

    SUFFIX_MARKER_TYPE occurencesCount = 0;
    for (saidx_t i = 0; i < size; ++i) {
        const saidx_t resultPos = SA->at(left + i);

        // As we are looking for a pattern in an array of higher
        // resolution than the hashed index file, we might
        // obtain accidental results exceeding the boundaries
        // of characters in hashed index. This check removes
        // these accidental results.
        if (resultPos % charSize != 0) {
            continue;
        }

        // The hit has to begin a sentence: either it lies at the start of
        // the index or the character right before it is a boundary marker.
        // Otherwise the pattern is only the ending of a longer sentence.
        bool startsSentence = (resultPos == 0);
        if (!startsSentence) {
            const sauchar_t * const preceding =
                T->data() + (resultPos - charSize);
            startsSentence = true;
            for (saidx_t b = 0; b < charSize; ++b) {
                if (preceding[b] != markerBytes[b]) {
                    startsSentence = false;
                    break;
                }
            }
        }

        if (startsSentence) {
            occurencesCount++;
        }
    }

    return occurencesCount;
}
|
||||||
|
|
||||||
|
|
||||||
std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
|
std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
|
||||||
boost::shared_ptr<ConcordiaConfig> config,
|
boost::shared_ptr<ConcordiaConfig> config,
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
|
@ -52,6 +52,13 @@ public:
|
|||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
const std::string & pattern) throw(ConcordiaException);
|
const std::string & pattern) throw(ConcordiaException);
|
||||||
|
|
||||||
|
/*! Counts the occurrences of the pattern in the index.
    The pattern is hashed, terminated with the sentence boundary
    marker and searched for in the suffix array. Only hits which
    start on an INDEX_CHARACTER_TYPE boundary of the hashed index
    are counted.
  \param hashGenerator hash generator used to hash the pattern
  \param T hashed index, searched as a byte array
  \param markers markers array for the index
  \param SA suffix array for the index
  \param pattern pattern to be counted
  \returns number of occurrences of the pattern found in the index
  \throws ConcordiaException
*/
SUFFIX_MARKER_TYPE countOccurences(
    boost::shared_ptr<HashGenerator> hashGenerator,
    boost::shared_ptr<std::vector<sauchar_t> > T,
    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
    boost::shared_ptr<std::vector<saidx_t> > SA,
    const std::string & pattern) throw(ConcordiaException);
|
||||||
|
|
||||||
/*! \deprecated
|
/*! \deprecated
|
||||||
Finds the examples from the index, whose resemblance to the
|
Finds the examples from the index, whose resemblance to the
|
||||||
pattern is maximal. This method may perform very slow,
|
pattern is maximal. This method may perform very slow,
|
||||||
|
@ -379,4 +379,49 @@ BOOST_AUTO_TEST_CASE( Tokenize )
|
|||||||
BOOST_CHECK_EQUAL(tokenizedSentences.at(2).getTokens().size(), 5);
|
BOOST_CHECK_EQUAL(tokenizedSentences.at(2).getTokens().size(), 5);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences )
{
    Concordia concordia(
        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));

    /* The test index is built from 5 examples:
        14: "Ala posiada kota"
        51: "Ala posiada rysia"
        16: "Ala posiada kota"
       123: "Marysia posiada rysia"
       542: "Ala posiada kota i psa"

       Note that the text "Ala posiada kota" is stored twice (ids 14
       and 16) and is also the beginning of the longer example 542.
    */
    concordia.addExample(Example("Ala posiada kota",14));
    concordia.addExample(Example("Ala posiada rysia",51));
    concordia.addExample(Example("Ala posiada kota",16));
    concordia.addExample(Example("Marysia posiada rysia",123));
    concordia.addExample(Example("Ala posiada kota i psa",542));
    concordia.refreshSAfromRAM();

    // Patterns which are only the beginning of indexed sentences,
    // or which do not occur at all, are not counted.
    BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada"), 0);
    BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada"), 0);
    BOOST_CHECK_EQUAL(concordia.countOccurences("kota Ala posiada"), 0);

    // Whole sentences are counted once per example holding them; the
    // longer example 542 does not add to the count of its own prefix.
    BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada rysia"), 1);
    BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada kota"), 2);
    BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada kota i psa"), 1);

    concordia.clearIndex();
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE_END()
|
BOOST_AUTO_TEST_SUITE_END()
|
||||||
|
Loading…
Reference in New Issue
Block a user