count occurrences feature

This commit is contained in:
rjawor 2015-10-01 13:36:54 +02:00
parent fd32ff7e12
commit fa3138df29
5 changed files with 105 additions and 0 deletions

View File

@ -176,6 +176,17 @@ void Concordia::_initializeIndex() throw(ConcordiaException) {
}
}
/*! Counts the occurrences of the given pattern in the index.
    The work is delegated to the index searcher; when _T holds no
    elements the searcher is not consulted at all.
    \param pattern pattern to be counted
    \returns the count reported by the searcher, or 0 when _T is empty
    \throws ConcordiaException
*/
SUFFIX_MARKER_TYPE Concordia::countOccurences(const std::string & pattern)
                                            throw(ConcordiaException) {
    // Guard clause: with an empty _T there is nothing to search.
    if (_T->size() == 0) {
        return 0;
    }

    return _searcher->countOccurences(_hashGenerator, _T,
                                      _markers, _SA, pattern);
}
std::vector<MatchedPatternFragment> Concordia::simpleSearch(
const std::string & pattern)
throw(ConcordiaException) {

View File

@ -119,6 +119,9 @@ public:
const std::string & pattern)
throw(ConcordiaException);
/*! Counts the occurrences of the given pattern in the index.
    \param pattern pattern to be counted
    \returns number of occurrences found by the index searcher,
             0 if the index data is empty
    \throws ConcordiaException
*/
SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)
throw(ConcordiaException);
/*! \deprecated
Finds the examples from the index, whose resemblance to the
pattern is maximal. This method may perform very slow,

View File

@ -52,6 +52,45 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
return result;
}
/*! Counts the occurrences of a pattern in the suffix-array index.
    The pattern is hashed, a sentence boundary marker is appended to the
    hash and the resulting byte sequence is looked up with sa_search.
    Only hits that start on an INDEX_CHARACTER_TYPE boundary are counted.
    \param hashGenerator generator used to hash the pattern
    \param T byte array that is searched
    \param markers not used by this method
    \param SA suffix array built over T
    \param pattern pattern to be counted
    \returns number of aligned hits reported by sa_search
    \throws ConcordiaException
*/
SUFFIX_MARKER_TYPE IndexSearcher::countOccurences(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern) throw(ConcordiaException) {
    std::vector<INDEX_CHARACTER_TYPE> hash =
        hashGenerator->generateHash(pattern).getCodes();
    // append sentence boundary marker, as we are looking only
    // for exact sentence matches
    // NOTE(review): only the trailing boundary is anchored here, so a
    // pattern equal to the tail of a longer indexed sentence presumably
    // also gets counted -- confirm whether that is intended.
    hash.push_back(INDEX_CHARACTER_TYPE_MAX_VALUE);

    saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
    sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);

    // Initialised defensively; sa_search writes the index of the first
    // hit in SA through this pointer.
    int left = 0;
    int size = sa_search(T->data(), static_cast<saidx_t>(T->size()),
                         patternArray, patternLength,
                         SA->data(), static_cast<saidx_t>(SA->size()),
                         &left);

    // The byte form of the pattern is not needed past this point.
    // Releasing it before the loop means the buffer cannot be leaked
    // if SA->at() below throws.
    delete[] patternArray;

    SUFFIX_MARKER_TYPE occurencesCount = 0;
    // A non-positive size leaves the loop body unexecuted.
    for (int i = 0; i < size; ++i) {
        saidx_t resultPos = SA->at(left + i);
        if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
            // As we are looking for a pattern in an array of higher
            // resolution than the hashed index file, we might
            // obtain accidental results exceeding the boundaries
            // of characters in hashed index. The above check
            // removes these accidental results.
            ++occurencesCount;
        }
    }

    return occurencesCount;
}
std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
boost::shared_ptr<ConcordiaConfig> config,
boost::shared_ptr<HashGenerator> hashGenerator,

View File

@ -52,6 +52,13 @@ public:
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException);
/*! Counts the occurrences of the pattern in the index. The pattern
    is hashed and extended with a sentence boundary marker before
    it is looked up in the suffix array.
    \param hashGenerator generator used to hash the pattern
    \param T byte array that is searched
    \param markers not used by the implementation
    \param SA suffix array built over T
    \param pattern pattern to be counted
    \returns number of hits that start on an INDEX_CHARACTER_TYPE boundary
    \throws ConcordiaException
*/
SUFFIX_MARKER_TYPE countOccurences(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException);
/*! \deprecated
Finds the examples from the index, whose resemblance to the
pattern is maximal. This method may perform very slow,

View File

@ -379,4 +379,49 @@ BOOST_AUTO_TEST_CASE( Tokenize )
BOOST_CHECK_EQUAL(tokenizedSentences.at(2).getTokens().size(), 5);
}
BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences )
{
    Concordia concordia(
        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));

    /* The index built below holds 5 sentences:

       14:  "Ala posiada kota"
       51:  "Ala posiada rysia"
       16:  "Ala posiada kota"
       123: "Marysia posiada rysia"
       542: "Ala posiada kota i psa"

       Note that "Ala posiada kota" is added twice (ids 14 and 16).
    */
    concordia.addExample(Example("Ala posiada kota", 14));
    concordia.addExample(Example("Ala posiada rysia", 51));
    concordia.addExample(Example("Ala posiada kota", 16));
    concordia.addExample(Example("Marysia posiada rysia", 123));
    concordia.addExample(Example("Ala posiada kota i psa", 542));

    concordia.refreshSAfromRAM();

    /* Illustration of the index layout for a reduced, 3-sentence index
       (sentences 14, 51 and 123 only; the index built above is longer):

       Test word map:
       Ala -> 0
       posiada -> 1
       kota -> 2
       rysia -> 3
       Marysia -> 4

       Test hashed index:
         n:  0  1  2  3  4  5  6  7  8  9 10 11
       T[n]: 0  1  2  |  0  1  3  |  4  1  3  |

       Test suffix array:
          n:  0  1  2  3  4  5  6  7  8  9 10 11
       SA[n]: 0  4  1  9  5  2 10  6  8 11  3  7
    */

    // Patterns that are not a whole indexed sentence are expected
    // to yield 0.
    BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada"), 0);
    BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada"), 0);
    BOOST_CHECK_EQUAL(concordia.countOccurences("kota Ala posiada"), 0);

    // Whole sentences are expected to be counted once per added example.
    BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada rysia"), 1);
    BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada kota"), 2);
    BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada kota i psa"), 1);

    concordia.clearIndex();
}
BOOST_AUTO_TEST_SUITE_END()