diff --git a/TODO.txt b/TODO.txt index e36b85b..950893d 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,4 +1,6 @@ -1. lokalizowane to_lower (wykorzystać utf8case, naprawić testy) -2. anonimizacja zdań -3. Dzielenie zdań (max 255 tokenów) -4. concordia-server +DONE 1. lokalizowane to_lower (wykorzystać utf8case, naprawić testy) +DONE 2. anonimizacja zdań +DONE 3. Dzielenie zdań (max 255 tokenów) + +- concordia-server +- zastanowić się nad empty hash examples diff --git a/concordia/anubis_searcher.cpp b/concordia/anubis_searcher.cpp index 94481ee..adca075 100644 --- a/concordia/anubis_searcher.cpp +++ b/concordia/anubis_searcher.cpp @@ -1,5 +1,7 @@ #include "concordia/anubis_searcher.hpp" +#include + AnubisSearcher::AnubisSearcher() { } @@ -13,9 +15,75 @@ boost::ptr_vector AnubisSearcher::anubisSearch( boost::shared_ptr > T, boost::shared_ptr > markers, boost::shared_ptr > SA, - boost::shared_ptr >) + boost::shared_ptr > pattern) throw(ConcordiaException) { - boost::ptr_vector result; return result; } + +boost::ptr_vector AnubisSearcher::lcpSearch( + boost::shared_ptr > T, + boost::shared_ptr > markers, + boost::shared_ptr > SA, + boost::shared_ptr > pattern, + SUFFIX_MARKER_TYPE & length) + throw(ConcordiaException) { + saidx_t patternLength = 0; + saidx_t size = SA->size(); + saidx_t left = 0; + + sauchar_t * patternArray = pattern->data(); + + saidx_t * SAleft = SA->data(); + + saidx_t prevLeft; + saidx_t prevSize; + do { + prevLeft = left; + prevSize = size; + + patternLength++; + + saidx_t localLeft; + size = sa_search(T->data(), (saidx_t) T->size(), + (const sauchar_t *) patternArray, patternLength, + SAleft, size, &localLeft); + + left += localLeft; + SAleft += localLeft; + } while (patternLength < pattern->size() && size > 0); + + boost::ptr_vector result; + + if (size == 0) { + // The search managed to find exactly the longest common prefixes. 
+ length = patternLength - 1; + if (length > 0) { + // Get the results of the previous search + _collectResults(result, markers, SA, prevLeft, prevSize); + } + // If length == 0, then the pattern has no common prefixes + // with the index. + } else { + // Seemingly, the index contains at least one occurrence + // of the whole search pattern. + length = patternLength; + _collectResults(result, markers, SA, left, size); + } + + return result; +} + +void AnubisSearcher::_collectResults( + boost::ptr_vector & result, + boost::shared_ptr > markers, + boost::shared_ptr > SA, + saidx_t left, saidx_t size) { + for (saidx_t i = 0; i < size; i++) { + saidx_t resultPos = SA->at(left + i); + SUFFIX_MARKER_TYPE marker = markers->at(resultPos); + result.push_back(new SubstringOccurence( + marker / SUFFIX_MARKER_DIVISOR, + marker % SUFFIX_MARKER_DIVISOR)); + } +} diff --git a/concordia/anubis_searcher.hpp b/concordia/anubis_searcher.hpp index cc4fc4c..0299c23 100644 --- a/concordia/anubis_searcher.hpp +++ b/concordia/anubis_searcher.hpp @@ -5,6 +5,7 @@ #include #include "concordia/common/config.hpp" +#include "concordia/common/utils.hpp" #include "concordia/substring_occurence.hpp" #include "concordia/concordia_exception.hpp" #include "concordia/anubis_search_result.hpp" @@ -27,11 +28,24 @@ public: virtual ~AnubisSearcher(); boost::ptr_vector anubisSearch( + boost::shared_ptr > T, + boost::shared_ptr > markers, + boost::shared_ptr > SA, + boost::shared_ptr > pattern) + throw(ConcordiaException); + + boost::ptr_vector lcpSearch( boost::shared_ptr > T, boost::shared_ptr > markers, boost::shared_ptr > SA, - boost::shared_ptr >) throw(ConcordiaException); + boost::shared_ptr > pattern, + SUFFIX_MARKER_TYPE & length) throw(ConcordiaException); + private: + void _collectResults(boost::ptr_vector & result, + boost::shared_ptr > markers, + boost::shared_ptr > SA, + saidx_t left, saidx_t size); }; #endif diff --git a/concordia/common/utils.hpp b/concordia/common/utils.hpp index 
eb6cc5d..2f2bff7 100644 --- a/concordia/common/utils.hpp +++ b/concordia/common/utils.hpp @@ -2,6 +2,7 @@ #define UTILS_HDR #include +#include #include #include #include @@ -37,9 +38,19 @@ public: boost::shared_ptr > vector, INDEX_CHARACTER_TYPE character); + template + static void printVector(boost::shared_ptr > vector); + private: static void _insertCharToSaucharArray(sauchar_t * array, INDEX_CHARACTER_TYPE character, int pos); }; +template +void Utils::printVector(boost::shared_ptr > vector) { + for (int i = 0; i < vector->size(); i++) { + cout << vector->at(i) << " "; + } + cout << endl; +} #endif diff --git a/concordia/index_searcher.cpp b/concordia/index_searcher.cpp index 3c0df60..8635282 100644 --- a/concordia/index_searcher.cpp +++ b/concordia/index_searcher.cpp @@ -25,6 +25,7 @@ boost::ptr_vector IndexSearcher::simpleSearch( hashGenerator->generateHash(pattern); saidx_t patternLength = hash->size()*sizeof(INDEX_CHARACTER_TYPE); sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash); + int size = sa_search(T->data(), (saidx_t) T->size(), (const sauchar_t *) patternArray, patternLength, SA->data(), (saidx_t) SA->size(), &left); diff --git a/concordia/substring_occurence.cpp b/concordia/substring_occurence.cpp index cef6bc5..5e16476 100644 --- a/concordia/substring_occurence.cpp +++ b/concordia/substring_occurence.cpp @@ -2,7 +2,7 @@ SubstringOccurence::SubstringOccurence(const SUFFIX_MARKER_TYPE & id, - const int & offset): + const SUFFIX_MARKER_TYPE & offset): _id(id), _offset(offset) { } diff --git a/concordia/substring_occurence.hpp b/concordia/substring_occurence.hpp index 2af9535..7437a3e 100644 --- a/concordia/substring_occurence.hpp +++ b/concordia/substring_occurence.hpp @@ -14,7 +14,7 @@ using namespace std; class SubstringOccurence { public: explicit SubstringOccurence(const SUFFIX_MARKER_TYPE & id, - const int & offset); + const SUFFIX_MARKER_TYPE & offset); /*! Destructor. 
*/ @@ -24,14 +24,14 @@ public: return _id; } - int getOffset() const { + SUFFIX_MARKER_TYPE getOffset() const { return _offset; } private: SUFFIX_MARKER_TYPE _id; - int _offset; + SUFFIX_MARKER_TYPE _offset; }; #endif diff --git a/concordia/t/CMakeLists.txt b/concordia/t/CMakeLists.txt index 357d52b..e27d6af 100644 --- a/concordia/t/CMakeLists.txt +++ b/concordia/t/CMakeLists.txt @@ -1,4 +1,5 @@ add_library(concordia-tests + test_anubis_searcher.cpp test_sentence_anonymizer.cpp test_text_utils.cpp test_regex_replacement.cpp diff --git a/concordia/t/test_anubis_searcher.cpp b/concordia/t/test_anubis_searcher.cpp new file mode 100644 index 0000000..1541164 --- /dev/null +++ b/concordia/t/test_anubis_searcher.cpp @@ -0,0 +1,143 @@ +#include "tests/unit-tests/unit_tests_globals.hpp" +#include "concordia/anubis_searcher.hpp" +#include "concordia/common/config.hpp" + +using namespace std; + +BOOST_AUTO_TEST_SUITE(anubis_searcher) + +BOOST_AUTO_TEST_CASE( LcpSearch1 ) +{ + AnubisSearcher searcher; + boost::shared_ptr > T(new std::vector()); + boost::shared_ptr > markers(new std::vector()); + boost::shared_ptr > SA(new std::vector()); + boost::shared_ptr > pattern(new std::vector()); + + /* Search in text: "banana" + T = 123232 (all one sentence id=34) + pattern: "anzz" = 2344 + word map: b=1,a=2,n=3,z=4 + */ + + T->push_back(1); + T->push_back(2); + T->push_back(3); + T->push_back(2); + T->push_back(3); + T->push_back(2); + + SUFFIX_MARKER_TYPE marker = 34 * SUFFIX_MARKER_DIVISOR; + for(int i=0;i<6;i++) { + markers->push_back(marker++); + } + + pattern->push_back(2); + pattern->push_back(3); + pattern->push_back(4); + pattern->push_back(4); + + /* Suffix array for the hashed index: 1 2 3 2 3 2 + 0: 1 2 3 2 3 2 + 5: 2 + 3: 2 3 2 + 1: 2 3 2 3 2 + 4: 3 2 + 2: 3 2 3 2 + */ + + SA->push_back(0); + SA->push_back(5); + SA->push_back(3); + SA->push_back(1); + SA->push_back(4); + SA->push_back(2); + + SUFFIX_MARKER_TYPE length; + boost::ptr_vector result = searcher.lcpSearch(T, 
markers, SA, pattern, length); + + /* Expecting to get the following results from SA: + 3: ana + 1: anana + Which are 2 substring occurrences (34,3) and (34,1) with the lcp length = 2; + */ + + BOOST_CHECK_EQUAL(result.size(),2); + BOOST_CHECK_EQUAL(length,2); + BOOST_CHECK_EQUAL(result.at(0).getId(),34); + BOOST_CHECK_EQUAL(result.at(0).getOffset(),3); + BOOST_CHECK_EQUAL(result.at(1).getId(),34); + BOOST_CHECK_EQUAL(result.at(1).getOffset(),1); + + //--------pattern banana + + boost::shared_ptr > pattern2(new std::vector()); + pattern2->push_back(1); + pattern2->push_back(2); + pattern2->push_back(3); + pattern2->push_back(2); + pattern2->push_back(3); + pattern2->push_back(2); + + SUFFIX_MARKER_TYPE length2; + boost::ptr_vector result2 = searcher.lcpSearch(T, markers, SA, pattern2, length2); + + /* Expecting to get one result from SA: + 0: banana + Which is one substring occurrence (34,0) with the lcp length = 6; + */ + + BOOST_CHECK_EQUAL(result2.size(),1); + BOOST_CHECK_EQUAL(length2,6); + BOOST_CHECK_EQUAL(result2.at(0).getId(),34); + BOOST_CHECK_EQUAL(result2.at(0).getOffset(),0); + + //--------pattern banan + + boost::shared_ptr > pattern3(new std::vector()); + pattern3->push_back(1); + pattern3->push_back(2); + pattern3->push_back(3); + pattern3->push_back(2); + pattern3->push_back(3); + + SUFFIX_MARKER_TYPE length3; + boost::ptr_vector result3 = searcher.lcpSearch(T, markers, SA, pattern3, length3); + + /* Expecting to get one result from SA: + 0: banana + Which is one substring occurrence (34,0) with the lcp length = 5; + */ + + BOOST_CHECK_EQUAL(result3.size(),1); + BOOST_CHECK_EQUAL(length3,5); + BOOST_CHECK_EQUAL(result3.at(0).getId(),34); + BOOST_CHECK_EQUAL(result3.at(0).getOffset(),0); + + //--------pattern nazz + + boost::shared_ptr > pattern4(new std::vector()); + pattern4->push_back(3); + pattern4->push_back(2); + pattern4->push_back(4); + pattern4->push_back(4); + + SUFFIX_MARKER_TYPE length4; + boost::ptr_vector result4 = searcher.lcpSearch(T, 
markers, SA, pattern4, length4); + + /* Expecting to get 2 results from SA: + 4: na + 2: nana + Which are 2 substring occurrences (34,4) and (34,2) with the lcp length = 2; + */ + + BOOST_CHECK_EQUAL(result4.size(),2); + BOOST_CHECK_EQUAL(length4,2); + BOOST_CHECK_EQUAL(result4.at(0).getId(),34); + BOOST_CHECK_EQUAL(result4.at(0).getOffset(),4); + BOOST_CHECK_EQUAL(result4.at(1).getId(),34); + BOOST_CHECK_EQUAL(result4.at(1).getOffset(),2); +} + + +BOOST_AUTO_TEST_SUITE_END() diff --git a/concordia/t/test_concordia_index.cpp b/concordia/t/test_concordia_index.cpp index afdcf34..5f5abea 100644 --- a/concordia/t/test_concordia_index.cpp +++ b/concordia/t/test_concordia_index.cpp @@ -93,4 +93,36 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest2 ) BOOST_CHECK_EQUAL_COLLECTIONS(SA->begin(), SA->end(), expectedSA->begin(), expectedSA->end()); } +BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest3 ) +{ + ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"), + TestResourcesManager::getTestFilePath("temp","test_markers.bin")); + boost::shared_ptr > T = boost::shared_ptr >(new vector()); + + //Test hashed index: + // n: 0 1 2 3 4 5 + // T[n]: 1 2 3 2 3 2 + T->push_back(1); + T->push_back(2); + T->push_back(3); + T->push_back(2); + T->push_back(3); + T->push_back(2); + + //Test suffix array: + // n: 0 1 2 3 4 5 + //SA[n]: 0 5 3 1 4 2 + + boost::shared_ptr > SA = index.generateSuffixArray(T); + + boost::shared_ptr > expectedSA = boost::shared_ptr >(new vector()); + expectedSA->push_back(0); + expectedSA->push_back(5); + expectedSA->push_back(3); + expectedSA->push_back(1); + expectedSA->push_back(4); + expectedSA->push_back(2); + BOOST_CHECK_EQUAL_COLLECTIONS(SA->begin(), SA->end(), expectedSA->begin(), expectedSA->end()); +} + BOOST_AUTO_TEST_SUITE_END()