count occurrences feature

This commit is contained in:
rjawor 2015-10-01 13:36:54 +02:00
parent fd32ff7e12
commit fa3138df29
5 changed files with 105 additions and 0 deletions

View File

@ -176,6 +176,17 @@ void Concordia::_initializeIndex() throw(ConcordiaException) {
} }
} }
// Counts the occurrences of the pattern by delegating to the index searcher.
// When _T reports a size of 0 there is nothing to search, so 0 is returned
// without calling the searcher at all.
SUFFIX_MARKER_TYPE Concordia::countOccurences(const std::string & pattern)
                                 throw(ConcordiaException) {
    // Guard clause: nothing indexed, nothing to count.
    if (_T->size() == 0) {
        return 0;
    }

    return _searcher->countOccurences(_hashGenerator, _T,
                                      _markers, _SA, pattern);
}
std::vector<MatchedPatternFragment> Concordia::simpleSearch( std::vector<MatchedPatternFragment> Concordia::simpleSearch(
const std::string & pattern) const std::string & pattern)
throw(ConcordiaException) { throw(ConcordiaException) {

View File

@ -119,6 +119,9 @@ public:
const std::string & pattern) const std::string & pattern)
throw(ConcordiaException); throw(ConcordiaException);
/*! Counts the occurrences of the given pattern in the index.
  When the index data (_T) is empty, 0 is returned without searching;
  otherwise the counting is delegated to the index searcher.
  \param pattern pattern whose occurrences are to be counted
  \returns the count reported by the index searcher, or 0 for an empty index
  \throws ConcordiaException
*/
SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)
throw(ConcordiaException);
/*! \deprecated /*! \deprecated
Finds the examples from the index, whose resemblance to the Finds the examples from the index, whose resemblance to the
pattern is maximal. This method may perform very slow, pattern is maximal. This method may perform very slow,

View File

@ -52,6 +52,45 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
return result; return result;
} }
// Counts how many times the hashed pattern, followed by a sentence boundary
// marker, is found in T through the suffix array SA. Hits that do not start
// on an INDEX_CHARACTER_TYPE boundary are ignored. The markers argument is
// not used by this function.
//
// NOTE(review): only the END of the pattern is anchored to a sentence
// boundary here. A pattern that is a proper suffix of an indexed sentence
// would presumably be counted as well - confirm whether "exact sentence
// match" is really intended.
SUFFIX_MARKER_TYPE IndexSearcher::countOccurences(
    boost::shared_ptr<HashGenerator> hashGenerator,
    boost::shared_ptr<std::vector<sauchar_t> > T,
    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
    boost::shared_ptr<std::vector<saidx_t> > SA,
    const std::string & pattern) throw(ConcordiaException) {
    // Turn the pattern into index codes and terminate it with the sentence
    // boundary marker, so that a hit has to end where a sentence ends.
    std::vector<INDEX_CHARACTER_TYPE> patternCodes =
        hashGenerator->generateHash(pattern).getCodes();
    patternCodes.push_back(INDEX_CHARACTER_TYPE_MAX_VALUE);

    // The search runs on raw bytes, hence the length is expressed in bytes.
    saidx_t patternByteLength =
        patternCodes.size()*sizeof(INDEX_CHARACTER_TYPE);
    sauchar_t * patternBytes = Utils::indexVectorToSaucharArray(patternCodes);

    // sa_search returns the number of hits and stores the index of the
    // first matching suffix array entry in firstHit.
    int firstHit;
    int hitCount = sa_search(T->data(), (saidx_t) T->size(),
                             (const sauchar_t *) patternBytes,
                             patternByteLength,
                             SA->data(), (saidx_t) SA->size(), &firstHit);

    SUFFIX_MARKER_TYPE alignedHits = 0;
    for (int hit = 0; hit < hitCount; ++hit) {
        saidx_t bytePosition = SA->at(firstHit + hit);
        // The byte-level search may report hits that start in the middle
        // of an index character; such accidental hits are skipped.
        if (bytePosition % sizeof(INDEX_CHARACTER_TYPE) != 0) {
            continue;
        }
        alignedHits++;
    }

    // The byte array is released manually with delete[] once the search
    // results have been consumed.
    delete[] patternBytes;
    return alignedHits;
}
std::vector<AnubisSearchResult> IndexSearcher::anubisSearch( std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
boost::shared_ptr<ConcordiaConfig> config, boost::shared_ptr<ConcordiaConfig> config,
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,

View File

@ -52,6 +52,13 @@ public:
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException); const std::string & pattern) throw(ConcordiaException);
/*! Counts the occurrences of the pattern in the index.
  The pattern is hashed and a sentence boundary marker is appended to it
  before the suffix array is searched. Only hits whose position is
  a multiple of sizeof(INDEX_CHARACTER_TYPE) are counted.
  \param hashGenerator hash generator used to hash the pattern
  \param T hashed index data to be searched
  \param markers markers array (not used by the visible definition of this method)
  \param SA suffix array for T
  \param pattern pattern whose occurrences are to be counted
  \returns number of counted hits
  \throws ConcordiaException
*/
SUFFIX_MARKER_TYPE countOccurences(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException);
/*! \deprecated /*! \deprecated
Finds the examples from the index, whose resemblance to the Finds the examples from the index, whose resemblance to the
pattern is maximal. This method may perform very slow, pattern is maximal. This method may perform very slow,

View File

@ -379,4 +379,49 @@ BOOST_AUTO_TEST_CASE( Tokenize )
BOOST_CHECK_EQUAL(tokenizedSentences.at(2).getTokens().size(), 5); BOOST_CHECK_EQUAL(tokenizedSentences.at(2).getTokens().size(), 5);
} }
// Checks Concordia::countOccurences against an index built in RAM from five
// examples, two of which ("Ala posiada kota", ids 14 and 16) are identical.
BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences )
{
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
concordia.addExample(Example("Ala posiada kota",14));
concordia.addExample(Example("Ala posiada rysia",51));
concordia.addExample(Example("Ala posiada kota",16));
concordia.addExample(Example("Marysia posiada rysia",123));
concordia.addExample(Example("Ala posiada kota i psa",542));
concordia.refreshSAfromRAM();
/*The test index is built from the 5 examples added above:
14: "Ala posiada kota"
51: "Ala posiada rysia"
16: "Ala posiada kota"
123: "Marysia posiada rysia"
542: "Ala posiada kota i psa"
NOTE(review): the word map, hashed index and suffix array below cover only
sentences 14, 51 and 123 and appear to be carried over from another test.
They do not include examples 16 and 542 - verify before relying on them.
Test word map:
Ala -> 0
posiada -> 1
kota -> 2
rysia -> 3
Marysia -> 4
Test hashed index:
n: 0 1 2 3 4 5 6 7 8 9 10 11
T[n]: 0 1 2 | 0 1 3 | 4 1 3 |
Test suffix array:
n: 0 1 2 3 4 5 6 7 8 9 10 11
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
*/
// Prefixes of indexed sentences are expected not to be counted.
BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada"), 0);
BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada"), 0);
// A whole sentence that was added once.
BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada rysia"), 1);
// Words taken from indexed sentences, but in an order that was never added.
BOOST_CHECK_EQUAL(concordia.countOccurences("kota Ala posiada"), 0);
// Added twice (ids 14 and 16); the longer sentence 542 is not counted here.
BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada kota"), 2);
BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada kota i psa"), 1);
// Clear the index once the checks are done.
concordia.clearIndex();
}
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()