From 8f953883bf107cb62be93004a48457bbde99daa3 Mon Sep 17 00:00:00 2001 From: rjawor Date: Tue, 24 Jun 2014 18:23:46 +0200 Subject: [PATCH] anubis search continued Former-commit-id: 95a08f242a03311d067303bfff07bf4890796da5 --- TODO.txt | 7 ++++++ concordia/anubis_searcher.cpp | 41 ++++++++++++++++++++++++++++++++++- concordia/common/utils.cpp | 15 +++++++++++++ concordia/common/utils.hpp | 3 +++ concordia/t/test_utils.cpp | 25 +++++++++++++++++++++ 5 files changed, 90 insertions(+), 1 deletion(-) diff --git a/TODO.txt b/TODO.txt index 950893d..8bbb613 100644 --- a/TODO.txt +++ b/TODO.txt @@ -4,3 +4,10 @@ DONE 3. Dzielenie zdań (max 255 tokenów) - concordia-server - zastanowić się nad empty hash examples + + +zastanowić się nad optymalizacją: +- unordered_map tmMatchesMap +- LCP array + +Anubis search się komplikuje! Przy tworzeniu obiektu tmMatches dla przykładu trzeba podać id przykładu, długość patternu i długość przykładu. Dwa pierwsze mamy, ale niestety nie ma skąd wziąć długości przykładu. Pamiętamy tylko offset sufiksu. diff --git a/concordia/anubis_searcher.cpp b/concordia/anubis_searcher.cpp index adca075..8f389c7 100644 --- a/concordia/anubis_searcher.cpp +++ b/concordia/anubis_searcher.cpp @@ -1,7 +1,13 @@ #include "concordia/anubis_searcher.hpp" +#include "concordia/tm_matches.hpp" -#include +#include +#include +#include +#include +typedef boost::ptr_map TmMatchesMap; +typedef TmMatchesMap::iterator TmMatchesMapIterator; AnubisSearcher::AnubisSearcher() { } @@ -18,6 +24,39 @@ boost::ptr_vector AnubisSearcher::anubisSearch( boost::shared_ptr > pattern) throw(ConcordiaException) { boost::ptr_vector result; + + boost::shared_ptr > patternVector = + Utils::indexVectorToSaucharVector(pattern); + + if (patternVector->size() != pattern->size() * sizeof(INDEX_CHARACTER_TYPE)) { + throw ConcordiaException("Increasing pattern resolution went wrong."); + } + + + TmMatchesMap tmMatchesMap; + for (int offset = 0;offset < pattern->size(); offset++) { + int highResOffset = offset * sizeof(INDEX_CHARACTER_TYPE); + boost::shared_ptr > currentPattern = + boost::shared_ptr >(new std::vector( + patternVector->begin()+highResOffset,patternVector->end())); + SUFFIX_MARKER_TYPE longestPrefixesLength; + boost::ptr_vector longestPrefixes = lcpSearch(T, markers, SA, + currentPattern, longestPrefixesLength); + + BOOST_FOREACH(SubstringOccurence & occurence, longestPrefixes) { + TmMatchesMapIterator mapIterator = tmMatchesMap.find(occurence.getId()); + if(mapIterator != tmMatchesMap.end()) { + + } else { + + } + } + + + + + } + return result; } diff --git a/concordia/common/utils.cpp b/concordia/common/utils.cpp index 00a8fa7..7ae58cc 100644 --- a/concordia/common/utils.cpp +++ b/concordia/common/utils.cpp @@ -42,6 +42,19 @@ sauchar_t * Utils::indexVectorToSaucharArray( return patternArray; } +boost::shared_ptr > Utils::indexVectorToSaucharVector( + boost::shared_ptr > input) { + + boost::shared_ptr > result = boost::shared_ptr >( + new std::vector); + + for (vector::iterator it = input->begin(); + it != input->end(); ++it) { + appendCharToSaucharVector(result, *it); + } + return result; +} + void Utils::appendCharToSaucharVector( boost::shared_ptr > vector, INDEX_CHARACTER_TYPE character) { @@ -58,4 +71,6 @@ void Utils::_insertCharToSaucharArray(sauchar_t * array, array[i] = characterArray[i-pos]; } } + + diff --git a/concordia/common/utils.hpp b/concordia/common/utils.hpp index 2f2bff7..4fc3897 100644 --- a/concordia/common/utils.hpp +++ b/concordia/common/utils.hpp @@ -34,6 +34,9 @@ public: static sauchar_t * indexVectorToSaucharArray( boost::shared_ptr > input); + static boost::shared_ptr > indexVectorToSaucharVector( + boost::shared_ptr > input); + static void appendCharToSaucharVector( boost::shared_ptr > vector, INDEX_CHARACTER_TYPE character); diff --git a/concordia/t/test_utils.cpp b/concordia/t/test_utils.cpp index 042f301..184c542 100644 --- a/concordia/t/test_utils.cpp +++ b/concordia/t/test_utils.cpp @@ -60,6 +60,31 @@ BOOST_AUTO_TEST_CASE( IndexVectorToSaucharArray ) BOOST_CHECK_EQUAL_COLLECTIONS(result->begin(), result->end(), expected->begin(), expected->end()); } +BOOST_AUTO_TEST_CASE( IndexVectorToSaucharVector ) +{ + boost::shared_ptr > hash(new vector()); + hash->push_back(123456789); // in hex: 75BCD15 + // in memory: 15 cd 5b 07 + // in memory DEC: 21 205 91 7 + + hash->push_back(987654321); // in hex: 3ADE68B1 + // in memory: b1 68 de 3a + // in memory DEC: 177 104 222 58 + boost::shared_ptr > result = Utils::indexVectorToSaucharVector(hash); + + boost::shared_ptr > expected(new vector()); + expected->push_back(21); + expected->push_back(205); + expected->push_back(91); + expected->push_back(7); + expected->push_back(177); + expected->push_back(104); + expected->push_back(222); + expected->push_back(58); + + BOOST_CHECK_EQUAL_COLLECTIONS(result->begin(), result->end(), expected->begin(), expected->end()); +} + BOOST_AUTO_TEST_SUITE_END()