count occurrences feature
This commit is contained in:
parent
fd32ff7e12
commit
fa3138df29
@ -176,6 +176,17 @@ void Concordia::_initializeIndex() throw(ConcordiaException) {
|
||||
}
|
||||
}
|
||||
|
||||
/*! Counts the occurrences of the given pattern in the index.
    When the index data (_T) is empty, 0 is returned without
    consulting the searcher. Otherwise the call is forwarded to
    _searcher->countOccurences together with the hash generator,
    the index data, the markers and the suffix array.
    \param pattern pattern to be counted
    \returns the count reported by the searcher, or 0 for an empty index
    \throws ConcordiaException (as declared in the exception specification)
*/
SUFFIX_MARKER_TYPE Concordia::countOccurences(const std::string & pattern)
                                         throw(ConcordiaException) {
    // Nothing has been indexed, so there is nothing to count.
    if (_T->size() == 0) {
        return 0;
    }

    return _searcher->countOccurences(_hashGenerator, _T,
                                      _markers, _SA, pattern);
}
|
||||
|
||||
|
||||
std::vector<MatchedPatternFragment> Concordia::simpleSearch(
|
||||
const std::string & pattern)
|
||||
throw(ConcordiaException) {
|
||||
|
@ -119,6 +119,9 @@ public:
|
||||
const std::string & pattern)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! Counts the occurrences of the given pattern in the index.
    The definition returns 0 when the index data is empty and
    otherwise forwards the call to the index searcher.
    \param pattern pattern to be counted
    \returns number of occurrences of the pattern, 0 for an empty index
    \throws ConcordiaException
*/
SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! \deprecated
|
||||
Finds the examples from the index, whose resemblance to the
|
||||
pattern is maximal. This method may perform very slow,
|
||||
|
@ -52,6 +52,45 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
|
||||
return result;
|
||||
}
|
||||
|
||||
/*! Counts the occurrences of the pattern in the hashed index.
    The pattern is hashed, a sentence boundary marker
    (INDEX_CHARACTER_TYPE_MAX_VALUE) is appended to the hash and the
    resulting byte sequence is looked up with sa_search. Only matches
    starting at a multiple of sizeof(INDEX_CHARACTER_TYPE) are counted.

    NOTE(review): only the trailing sentence boundary is enforced here;
    nothing in this function checks that a match starts at the beginning
    of a sentence, so a pattern equal to the tail of a longer sentence
    would presumably be counted as well - confirm that this is intended.

    \param hashGenerator generator used to hash the pattern
    \param T hashed index data, searched as raw bytes
    \param markers not used by this function
    \param SA suffix array of T
    \param pattern pattern to be counted
    \returns number of character-aligned matches of the marker-terminated pattern
    \throws ConcordiaException (as declared in the exception specification)
*/
SUFFIX_MARKER_TYPE IndexSearcher::countOccurences(
    boost::shared_ptr<HashGenerator> hashGenerator,
    boost::shared_ptr<std::vector<sauchar_t> > T,
    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
    boost::shared_ptr<std::vector<saidx_t> > SA,
    const std::string & pattern) throw(ConcordiaException) {
    std::vector<INDEX_CHARACTER_TYPE> patternCodes =
        hashGenerator->generateHash(pattern).getCodes();

    // The boundary marker forces every match to end where a sentence ends.
    patternCodes.push_back(INDEX_CHARACTER_TYPE_MAX_VALUE);

    // sa_search works on bytes, so the pattern length is given in bytes.
    const saidx_t patternByteLength =
        patternCodes.size()*sizeof(INDEX_CHARACTER_TYPE);
    sauchar_t * patternBytes = Utils::indexVectorToSaucharArray(patternCodes);

    // matchCount suffixes match; firstMatch is used below as the index
    // in SA of the first of them.
    int firstMatch;
    const int matchCount = sa_search(T->data(),
                                     static_cast<saidx_t>(T->size()),
                                     patternBytes, patternByteLength,
                                     SA->data(),
                                     static_cast<saidx_t>(SA->size()),
                                     &firstMatch);

    SUFFIX_MARKER_TYPE alignedMatches = 0;
    for (int offset = 0; offset < matchCount; ++offset) {
        const saidx_t suffixStart = SA->at(firstMatch + offset);
        // The byte array has a higher resolution than the hashed index,
        // so a match may begin in the middle of an index character.
        // Such accidental matches are skipped.
        if (suffixStart % sizeof(INDEX_CHARACTER_TYPE) != 0) {
            continue;
        }
        ++alignedMatches;
    }

    // The byte array returned by indexVectorToSaucharArray is released here.
    delete[] patternBytes;

    return alignedMatches;
}
|
||||
|
||||
|
||||
std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
|
||||
boost::shared_ptr<ConcordiaConfig> config,
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
|
@ -52,6 +52,13 @@ public:
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
const std::string & pattern) throw(ConcordiaException);
|
||||
|
||||
/*! Counts the occurrences of the pattern in the hashed index.
    The definition hashes the pattern, appends a sentence boundary
    marker to the hash and counts the matches found through the
    suffix array which start on a character boundary of the index.
    \param hashGenerator generator used to hash the pattern
    \param T hashed index data
    \param markers not used by the definition of this method
    \param SA suffix array
    \param pattern pattern to be counted
    \returns number of occurrences of the pattern
    \throws ConcordiaException
*/
SUFFIX_MARKER_TYPE countOccurences(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
const std::string & pattern) throw(ConcordiaException);
|
||||
|
||||
/*! \deprecated
|
||||
Finds the examples from the index, whose resemblance to the
|
||||
pattern is maximal. This method may perform very slow,
|
||||
|
@ -379,4 +379,49 @@ BOOST_AUTO_TEST_CASE( Tokenize )
|
||||
BOOST_CHECK_EQUAL(tokenizedSentences.at(2).getTokens().size(), 5);
|
||||
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences )
{
    Concordia concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));

    /* The test index is built from 5 sentences:
         14: "Ala posiada kota"
         51: "Ala posiada rysia"
         16: "Ala posiada kota"
        123: "Marysia posiada rysia"
        542: "Ala posiada kota i psa"

       "Ala posiada kota" is added twice (ids 14 and 16) and it is also
       the beginning of the longer sentence 542.
    */
    concordia.addExample(Example("Ala posiada kota", 14));
    concordia.addExample(Example("Ala posiada rysia", 51));
    concordia.addExample(Example("Ala posiada kota", 16));
    concordia.addExample(Example("Marysia posiada rysia", 123));
    concordia.addExample(Example("Ala posiada kota i psa", 542));
    concordia.refreshSAfromRAM();

    // Beginnings of indexed sentences are not counted.
    BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada"), 0);
    BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada"), 0);

    // A sentence indexed exactly once.
    BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada rysia"), 1);

    // Indexed words in an order which appears in no sentence.
    BOOST_CHECK_EQUAL(concordia.countOccurences("kota Ala posiada"), 0);

    // Counted for ids 14 and 16 only - the longer sentence 542 does not
    // add to the result.
    BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada kota"), 2);
    BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada kota i psa"), 1);

    concordia.clearIndex();
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
Loading…
Reference in New Issue
Block a user