diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp
index 0972333..e855dc6 100644
--- a/concordia/concordia.cpp
+++ b/concordia/concordia.cpp
@@ -176,6 +176,18 @@ void Concordia::_initializeIndex() throw(ConcordiaException) {
     }
 }
 
+// Counts the occurrences of the pattern through the index searcher.
+// An empty index yields 0 without the searcher being invoked.
+SUFFIX_MARKER_TYPE Concordia::countOccurences(const std::string & pattern)
+                                 throw(ConcordiaException) {
+    if (_T->empty()) {
+        return 0;
+    }
+    return _searcher->countOccurences(_hashGenerator, _T,
+                                      _markers, _SA, pattern);
+}
+
+
 std::vector Concordia::simpleSearch(
                                    const std::string & pattern)
                                  throw(ConcordiaException) {
diff --git a/concordia/concordia.hpp b/concordia/concordia.hpp
index 3deb2b1..38bd034 100644
--- a/concordia/concordia.hpp
+++ b/concordia/concordia.hpp
@@ -119,6 +119,15 @@ public:
                                    const std::string & pattern)
                                  throw(ConcordiaException);
 
+    /*! Counts the occurrences of the pattern in the index. Only
+      occurrences directly followed by a sentence boundary are counted.
+      \param pattern pattern to be searched for
+      \returns number of occurrences, 0 if the index is empty
+      \throws ConcordiaException
+    */
+    SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)
+                                 throw(ConcordiaException);
+
     /*! \deprecated
       Finds the examples from the index, whose resemblance to the
       pattern is maximal. This method may perform very slow,
diff --git a/concordia/index_searcher.cpp b/concordia/index_searcher.cpp
index 79d5b48..f257191 100644
--- a/concordia/index_searcher.cpp
+++ b/concordia/index_searcher.cpp
@@ -52,6 +52,56 @@ std::vector IndexSearcher::simpleSearch(
     return result;
 }
 
+// Counts the positions of the suffix array at which the hashed pattern,
+// followed by a sentence boundary marker, occurs in T. The markers
+// argument is not used.
+SUFFIX_MARKER_TYPE IndexSearcher::countOccurences(
+        boost::shared_ptr hashGenerator,
+        boost::shared_ptr > T,
+        boost::shared_ptr > markers,
+        boost::shared_ptr > SA,
+        const std::string & pattern) throw(ConcordiaException) {
+    std::vector hash =
+        hashGenerator->generateHash(pattern).getCodes();
+
+    // Append the sentence boundary marker, so that only occurrences
+    // ending exactly at a sentence boundary are found.
+    // NOTE(review): the start of the match is not anchored, so a pattern
+    // equal to the tail of a longer sentence is presumably counted as
+    // well - confirm that this is intended.
+    hash.push_back(INDEX_CHARACTER_TYPE_MAX_VALUE);
+
+    saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
+    sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
+
+    // Initialised so that it never holds an indeterminate value.
+    int left = 0;
+    int size = sa_search(T->data(), static_cast<saidx_t>(T->size()),
+                         patternArray, patternLength,
+                         SA->data(), static_cast<saidx_t>(SA->size()),
+                         &left);
+
+    // The byte pattern is not needed after the search; it is released
+    // here, before SA->at() below gets a chance to throw.
+    delete[] patternArray;
+
+    SUFFIX_MARKER_TYPE occurencesCount = 0;
+    for (int i = 0; i < size; ++i) {
+        saidx_t resultPos = SA->at(left + i);
+        if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
+            // As we are looking for a pattern in an array of higher
+            // resolution than the hashed index file, we might
+            // obtain accidental results exceeding the boundaries
+            // of characters in hashed index. The above check
+            // removes these accidental results.
+            ++occurencesCount;
+        }
+    }
+
+    return occurencesCount;
+}
+
+
 std::vector IndexSearcher::anubisSearch(
         boost::shared_ptr config,
         boost::shared_ptr hashGenerator,
diff --git a/concordia/index_searcher.hpp b/concordia/index_searcher.hpp
index 7393fee..fbf64f4 100644
--- a/concordia/index_searcher.hpp
+++ b/concordia/index_searcher.hpp
@@ -52,6 +52,23 @@ public:
                 boost::shared_ptr > SA,
                 const std::string & pattern) throw(ConcordiaException);
 
+    /*! Counts the occurrences of the pattern in the index. Only
+      occurrences directly followed by a sentence boundary are counted.
+      \param hashGenerator generator used to hash the pattern
+      \param T hashed index, searched as an array of bytes
+      \param markers not used by this method
+      \param SA suffix array of T
+      \param pattern pattern to be searched for
+      \returns number of occurrences found
+      \throws ConcordiaException
+    */
+    SUFFIX_MARKER_TYPE countOccurences(
+                boost::shared_ptr hashGenerator,
+                boost::shared_ptr > T,
+                boost::shared_ptr > markers,
+                boost::shared_ptr > SA,
+                const std::string & pattern) throw(ConcordiaException);
+
     /*! \deprecated
       Finds the examples from the index, whose resemblance to the
       pattern is maximal. This method may perform very slow,
diff --git a/concordia/t/test_concordia.cpp b/concordia/t/test_concordia.cpp
index 8241fce..01cc31a 100644
--- a/concordia/t/test_concordia.cpp
+++ b/concordia/t/test_concordia.cpp
@@ -379,4 +379,39 @@ BOOST_AUTO_TEST_CASE( Tokenize )
     BOOST_CHECK_EQUAL(tokenizedSentences.at(2).getTokens().size(), 5);
 }
 
+
+BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences )
+{
+    Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
+    concordia.addExample(Example("Ala posiada kota",14));
+    concordia.addExample(Example("Ala posiada rysia",51));
+    concordia.addExample(Example("Ala posiada kota",16));
+    concordia.addExample(Example("Marysia posiada rysia",123));
+    concordia.addExample(Example("Ala posiada kota i psa",542));
+    concordia.refreshSAfromRAM();
+
+    /* The test index contains 5 sentences:
+       14:  "Ala posiada kota"
+       51:  "Ala posiada rysia"
+       16:  "Ala posiada kota"
+       123: "Marysia posiada rysia"
+       542: "Ala posiada kota i psa"
+
+       Only occurrences directly followed by a sentence boundary
+       are counted, hence:
+       - "Ala posiada" and "Marysia posiada" never end a sentence,
+       - "kota Ala posiada" is interrupted by a sentence boundary,
+       - "Ala posiada kota" ends sentences 14 and 16, but not 542.
+    */
+
+    BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada"), 0);
+    BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada"), 0);
+    BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada rysia"), 1);
+    BOOST_CHECK_EQUAL(concordia.countOccurences("kota Ala posiada"), 0);
+    BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada kota"), 2);
+    BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada kota i psa"), 1);
+
+    concordia.clearIndex();
+}
+
 BOOST_AUTO_TEST_SUITE_END()