From e02bbaa0fa78f8f7c76c40c0a1eecb41975aa5e7 Mon Sep 17 00:00:00 2001 From: rjawor Date: Tue, 14 Apr 2015 20:14:30 +0200 Subject: [PATCH] getTmMatches Former-commit-id: 94aa3db2db88195c61c6ac70006c0e1d743dc854 --- TODO.txt | 8 +- concordia/anubis_searcher.cpp | 182 ++++++++++++++++++--------- concordia/anubis_searcher.hpp | 27 ++++ concordia/common/config.hpp.in | 1 - concordia/common/utils.hpp | 2 +- concordia/interval.cpp | 4 +- concordia/interval.hpp | 14 ++- concordia/substring_occurence.cpp | 8 ++ concordia/substring_occurence.hpp | 4 + concordia/t/test_anubis_searcher.cpp | 111 +++++++++++++++- concordia/t/test_concordia.cpp | 1 - concordia/tm_matches.hpp | 12 ++ 12 files changed, 297 insertions(+), 77 deletions(-) diff --git a/TODO.txt b/TODO.txt index 40697c4..cc60029 100644 --- a/TODO.txt +++ b/TODO.txt @@ -8,7 +8,7 @@ DONE 3. Dzielenie zdań (max 255 tokenów) DONE Anubis search się komplikuje! Przy tworzeniu obiektu tmMatches dla przykładu trzeba podać id przykładu, długość patternu i długość przykładu. Dwa pierwsze mamy, ale niestety nie ma skąd wziąć długości przykładu. Pamiętamy tylko offset sufiksu. DONE 1. Bitwise operators (i stałe!) przy rozmiarze index character oraz markerów -2. Wykonać anubis search na nowych markerach z długością zdania +IN PROGRESS 2. Wykonać anubis search na nowych markerach z długością zdania 3. Multi-threading? - concordia-server @@ -19,6 +19,8 @@ DONE 1. Bitwise operators (i stałe!) przy rozmiarze index character oraz marker zastanowić się nad optymalizacją: -- unordered_map tmMatchesMap -- LCP array +- tmMatchesMap jako normalna mapa (nie ptr_map) +- REJECTED LCP array +- !important! rezygnacja z ptr_vector (wycieki!) +- zwracanie wektorów diff --git a/concordia/anubis_searcher.cpp b/concordia/anubis_searcher.cpp index 23dd1b8..ab367ee 100644 --- a/concordia/anubis_searcher.cpp +++ b/concordia/anubis_searcher.cpp @@ -1,15 +1,8 @@ #include "concordia/anubis_searcher.hpp" -#include "concordia/tm_matches.hpp" -#include "concordia/common/logging.hpp" -#include -#include +#include "concordia/common/logging.hpp" #include #include -#include - -typedef boost::ptr_map TmMatchesMap; -typedef TmMatchesMap::iterator TmMatchesMapIterator; AnubisSearcher::AnubisSearcher() { } @@ -25,11 +18,19 @@ boost::ptr_vector AnubisSearcher::anubisSearch( boost::shared_ptr > SA, boost::shared_ptr > pattern) throw(ConcordiaException) { - SET_LOGGER_FILE("/tmp/concordia.log"); - SET_LOGGING_LEVEL("ERROR"); - INFO("AnubisSearcher::anubisSearch"); - + boost::shared_ptr tmMatchesMap = getTmMatches(T, markers, SA, pattern); + + // get the tmMatches list sorted descending by score boost::ptr_vector result; + return result; +} + +boost::shared_ptr AnubisSearcher::getTmMatches( + boost::shared_ptr > T, + boost::shared_ptr > markers, + boost::shared_ptr > SA, + boost::shared_ptr > pattern) + throw(ConcordiaException) { boost::shared_ptr > patternVector = Utils::indexVectorToSaucharVector(pattern); @@ -39,67 +40,61 @@ boost::ptr_vector AnubisSearcher::anubisSearch( throw ConcordiaException("Increasing pattern resolution went wrong."); } - INFO("AnubisSearcher::anubisSearch - about to create tmMatchesMap"); - TmMatchesMap tmMatchesMap; + boost::shared_ptr tmMatchesMap(new TmMatchesMap()); for (int offset = 0; offset < pattern->size(); offset++) { - INFO("AnubisSearcher::anubisSearch - offset: "); - INFO(offset); - int highResOffset = offset * sizeof(INDEX_CHARACTER_TYPE); - INFO("AnubisSearcher::anubisSearch - high res offset: "); - INFO(highResOffset); boost::shared_ptr > currentPattern = boost::shared_ptr > (new std::vector( patternVector->begin()+highResOffset, patternVector->end())); - SUFFIX_MARKER_TYPE highResLongestPrefixesLength; - INFO("AnubisSearcher::anubisSearch - about to get longest prefixes"); - boost::ptr_vector longestPrefixes = - lcpSearch(T, markers, SA, currentPattern, highResLongestPrefixesLength); - - INFO("AnubisSearcher::anubisSearch - longest prefixes got"); - SUFFIX_MARKER_TYPE longestPrefixesLength = highResLongestPrefixesLength / - sizeof(INDEX_CHARACTER_TYPE); - INFO("AnubisSearcher::anubisSearch - longest prefixes high res length"); - INFO(highResLongestPrefixesLength); - INFO("AnubisSearcher::anubisSearch - longest prefixes length"); - INFO(longestPrefixesLength); + + saidx_t patternLength = 0; + saidx_t size = SA->size(); + saidx_t left = 0; - if (longestPrefixesLength > 0) { - BOOST_FOREACH(SubstringOccurence & occurence, longestPrefixes) { - boost::shared_ptr tmMatches; + sauchar_t * patternArray = currentPattern->data(); - TmMatchesMapIterator mapIterator = tmMatchesMap.find( - occurence.getId()); - if (mapIterator != tmMatchesMap.end()) { - tmMatches = boost::shared_ptr( - mapIterator->second - ); - } else { - tmMatches = boost::shared_ptr( - new TmMatches( - occurence.getId(), - occurence.getExampleLength(), - patternVector->size() - )); - } + saidx_t * SAleft = SA->data(); + + saidx_t prevLeft; + saidx_t prevSize; + do { + prevLeft = left; + prevSize = size; + + patternLength += sizeof(INDEX_CHARACTER_TYPE); + + saidx_t localLeft; + size = sa_search(T->data(), (saidx_t) T->size(), + (const sauchar_t *) patternArray, patternLength, + SAleft, size, &localLeft); + + + left += localLeft; + SAleft += localLeft; + + if (patternLength > sizeof(INDEX_CHARACTER_TYPE)) { + // Add to tm matches map results surrounding the main stream. + // from left + for (saidx_t i = prevLeft; i < left; i++) { + _addToMap(SA, markers, tmMatchesMap, i, pattern->size(), (patternLength / sizeof(INDEX_CHARACTER_TYPE)) -1, offset); + } + // from right + for (saidx_t i = left+size; i < prevLeft+prevSize; i++) { + _addToMap(SA, markers, tmMatchesMap, i, pattern->size(), (patternLength / sizeof(INDEX_CHARACTER_TYPE)) -1, offset); + } - // add intervals to tmMatches - tmMatches->addExampleInterval( - occurence.getOffset(), - occurence.getOffset() + longestPrefixesLength - ); - tmMatches->addPatternInterval( - offset, - offset + longestPrefixesLength - ); } + } while (patternLength < currentPattern->size() && size > 0); + + if (size > 0) { + for (saidx_t i = left; i < left+size; i++) { + _addToMap(SA, markers, tmMatchesMap, i, pattern->size(), patternLength / sizeof(INDEX_CHARACTER_TYPE), offset); + } } } - - // get the tmMatches list sorted descending by score - return result; + return tmMatchesMap; } boost::ptr_vector AnubisSearcher::lcpSearch( @@ -128,8 +123,7 @@ boost::ptr_vector AnubisSearcher::lcpSearch( saidx_t localLeft; size = sa_search(T->data(), (saidx_t) T->size(), (const sauchar_t *) patternArray, patternLength, - SAleft, size, &localLeft); - + SAleft, size, &localLeft); left += localLeft; SAleft += localLeft; } while (patternLength < pattern->size() && size > 0); @@ -170,3 +164,67 @@ void AnubisSearcher::_collectResults( } } } + +void AnubisSearcher::_addToMap(boost::shared_ptr > SA, + boost::shared_ptr > markers, + boost::shared_ptr tmMatchesMap, + saidx_t sa_pos, + SUFFIX_MARKER_TYPE totalPatternLength, + SUFFIX_MARKER_TYPE matchedFragmentLength, + SUFFIX_MARKER_TYPE patternOffset) { + SubstringOccurence occurence; + if (_getOccurenceFromSA(SA, markers, sa_pos, occurence)) { + _addOccurenceToMap(tmMatchesMap, + occurence, + totalPatternLength, + matchedFragmentLength, + patternOffset); + } +} + + +bool AnubisSearcher::_getOccurenceFromSA( + boost::shared_ptr > SA, + boost::shared_ptr > markers, + saidx_t sa_pos, + SubstringOccurence & occurence) { + saidx_t resultPos = SA->at(sa_pos); + + if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) { + SUFFIX_MARKER_TYPE marker = markers->at(resultPos / sizeof(INDEX_CHARACTER_TYPE)); + occurence.enterDataFromMarker(marker); + } + +} + +void AnubisSearcher::_addOccurenceToMap(boost::shared_ptr tmMatchesMap, + SubstringOccurence & occurence, + SUFFIX_MARKER_TYPE totalPatternLength, + SUFFIX_MARKER_TYPE matchedFragmentLength, + SUFFIX_MARKER_TYPE patternOffset) { + TmMatches * tmMatches; + + TmMatchesMapIterator mapIterator = tmMatchesMap->find( + occurence.getId()); + if (mapIterator != tmMatchesMap->end()) { + tmMatches = mapIterator->second; + } else { + tmMatches = new TmMatches(occurence.getId(), + occurence.getExampleLength(), + totalPatternLength); + SUFFIX_MARKER_TYPE key = occurence.getId(); + tmMatchesMap->insert(key, tmMatches); + } + + // add intervals to tmMatches + tmMatches->addExampleInterval( + occurence.getOffset(), + occurence.getOffset() + matchedFragmentLength + ); + tmMatches->addPatternInterval( + patternOffset, + patternOffset + matchedFragmentLength + ); + +} + diff --git a/concordia/anubis_searcher.hpp b/concordia/anubis_searcher.hpp index 0299c23..6067acf 100644 --- a/concordia/anubis_searcher.hpp +++ b/concordia/anubis_searcher.hpp @@ -9,6 +9,7 @@ #include "concordia/substring_occurence.hpp" #include "concordia/concordia_exception.hpp" #include "concordia/anubis_search_result.hpp" +#include "concordia/tm_matches.hpp" #include @@ -34,6 +35,13 @@ public: boost::shared_ptr > pattern) throw(ConcordiaException); + boost::shared_ptr getTmMatches( + boost::shared_ptr > T, + boost::shared_ptr > markers, + boost::shared_ptr > SA, + boost::shared_ptr > pattern) + throw(ConcordiaException); + boost::ptr_vector lcpSearch( boost::shared_ptr > T, boost::shared_ptr > markers, @@ -46,6 +54,25 @@ private: boost::shared_ptr > markers, boost::shared_ptr > SA, saidx_t left, saidx_t size); + + void _addToMap(boost::shared_ptr > SA, + boost::shared_ptr > markers, + boost::shared_ptr tmMatchesMap, + saidx_t sa_pos, + SUFFIX_MARKER_TYPE totalPatternLength, + SUFFIX_MARKER_TYPE matchedFragmentLength, + SUFFIX_MARKER_TYPE patternOffset); + + bool _getOccurenceFromSA(boost::shared_ptr > SA, + boost::shared_ptr > markers, + saidx_t sa_pos, + SubstringOccurence & occurence); + + void _addOccurenceToMap(boost::shared_ptr tmMatchesMap, + SubstringOccurence & occurence, + SUFFIX_MARKER_TYPE totalPatternLength, + SUFFIX_MARKER_TYPE matchedFragmentLength, + SUFFIX_MARKER_TYPE patternOffset); }; #endif diff --git a/concordia/common/config.hpp.in b/concordia/common/config.hpp.in index 37847d3..b327666 100644 --- a/concordia/common/config.hpp.in +++ b/concordia/common/config.hpp.in @@ -28,4 +28,3 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE; //The sentence marker is build as follows: its first bytes store the // sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset // and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length. - diff --git a/concordia/common/utils.hpp b/concordia/common/utils.hpp index 2b5bbc4..00c14aa 100644 --- a/concordia/common/utils.hpp +++ b/concordia/common/utils.hpp @@ -67,7 +67,7 @@ private: template void Utils::printVector(boost::shared_ptr > vector) { for (int i = 0; i < vector->size(); i++) { - cout << vector->at(i) << " "; + cout << static_cast(vector->at(i)) << " "; } cout << endl; } diff --git a/concordia/interval.cpp b/concordia/interval.cpp index b37cc16..9621169 100644 --- a/concordia/interval.cpp +++ b/concordia/interval.cpp @@ -1,7 +1,7 @@ #include "concordia/interval.hpp" -Interval::Interval(const unsigned char start, const unsigned char end): +Interval::Interval(const SUFFIX_MARKER_TYPE start, const SUFFIX_MARKER_TYPE end): _start(start), _end(end) { } @@ -14,7 +14,7 @@ bool Interval::intersects(Interval & interval) { interval.getEnd() - 1 < _start); } -unsigned char Interval::getLength() { +SUFFIX_MARKER_TYPE Interval::getLength() { return _end - _start; } diff --git a/concordia/interval.hpp b/concordia/interval.hpp index fc02f92..e767022 100644 --- a/concordia/interval.hpp +++ b/concordia/interval.hpp @@ -1,6 +1,8 @@ #ifndef INTERVAL_HDR #define INTERVAL_HDR +#include "concordia/common/config.hpp" + /*! Class representing word interval. @@ -10,7 +12,7 @@ using namespace std; class Interval { public: - explicit Interval(const unsigned char start, const unsigned char end); + explicit Interval(const SUFFIX_MARKER_TYPE start, const SUFFIX_MARKER_TYPE end); /*! Destructor. */ @@ -18,20 +20,20 @@ public: bool intersects(Interval & interval); - unsigned char getLength(); + SUFFIX_MARKER_TYPE getLength(); - unsigned char getStart() const { + SUFFIX_MARKER_TYPE getStart() const { return _start; } - unsigned char getEnd() const { + SUFFIX_MARKER_TYPE getEnd() const { return _end; } private: - unsigned char _start; + SUFFIX_MARKER_TYPE _start; - unsigned char _end; + SUFFIX_MARKER_TYPE _end; }; #endif diff --git a/concordia/substring_occurence.cpp b/concordia/substring_occurence.cpp index 687afbd..1ed0fb0 100644 --- a/concordia/substring_occurence.cpp +++ b/concordia/substring_occurence.cpp @@ -1,6 +1,8 @@ #include "concordia/substring_occurence.hpp" #include "concordia/common/utils.hpp" +SubstringOccurence::SubstringOccurence() { +} SubstringOccurence::SubstringOccurence(const SUFFIX_MARKER_TYPE & marker) { _id = Utils::getIdFromMarker(marker); @@ -8,6 +10,12 @@ SubstringOccurence::SubstringOccurence(const SUFFIX_MARKER_TYPE & marker) { _exampleLength = Utils::getLengthFromMarker(marker); } +void SubstringOccurence::enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker) { + _id = Utils::getIdFromMarker(marker); + _offset = Utils::getOffsetFromMarker(marker); + _exampleLength = Utils::getLengthFromMarker(marker); +} + SubstringOccurence::SubstringOccurence( const SUFFIX_MARKER_TYPE & id, diff --git a/concordia/substring_occurence.hpp b/concordia/substring_occurence.hpp index 928d7fd..e1711f7 100644 --- a/concordia/substring_occurence.hpp +++ b/concordia/substring_occurence.hpp @@ -13,6 +13,8 @@ using namespace std; class SubstringOccurence { public: + SubstringOccurence(); + explicit SubstringOccurence(const SUFFIX_MARKER_TYPE & marker); SubstringOccurence(const SUFFIX_MARKER_TYPE & id, @@ -33,6 +35,8 @@ public: SUFFIX_MARKER_TYPE getExampleLength() const { return _exampleLength; } + + void enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker); private: SUFFIX_MARKER_TYPE _id; diff --git a/concordia/t/test_anubis_searcher.cpp b/concordia/t/test_anubis_searcher.cpp index ce952d1..49ccdac 100644 --- a/concordia/t/test_anubis_searcher.cpp +++ b/concordia/t/test_anubis_searcher.cpp @@ -1,7 +1,16 @@ +#include + #include "tests/unit-tests/unit_tests_globals.hpp" +#include "concordia/tm_matches.hpp" #include "concordia/anubis_searcher.hpp" +#include "concordia/concordia_index.hpp" +#include "concordia/concordia_config.hpp" +#include "concordia/example.hpp" +#include "concordia/hash_generator.hpp" #include "concordia/common/config.hpp" #include "concordia/common/utils.hpp" +#include "concordia/common/logging.hpp" +#include "tests/common/test_resources_manager.hpp" using namespace std; @@ -324,9 +333,109 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 ) } -BOOST_AUTO_TEST_CASE( AnubisSearch1 ) +BOOST_AUTO_TEST_CASE( TmMatchesTest ) { + AnubisSearcher searcher; + + /*The test index contains 3 sentences: + 14: "Ala posiada kota" + 51: "Ala posiada rysia" + 123: "Marysia posiada rysia" + Test word map: + Ala -> 0 + posiada -> 1 + kota -> 2 + rysia -> 3 + Marysia -> 4 + + Test hashed index: + n: 0 1 2 3 4 5 6 7 8 9 10 11 + T[n]: 0 1 2 | 0 1 3 | 4 1 3 | + + Test suffix array: + n: 0 1 2 3 4 5 6 7 8 9 10 11 + SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7 + + */ + + ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX), + TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS)); + boost::shared_ptr config( + new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); + boost::shared_ptr hashGenerator(new HashGenerator(config)); + + + boost::shared_ptr > T(new std::vector()); + boost::shared_ptr > markers(new std::vector()); + + index.addExample(hashGenerator, T, markers, Example("Ala posiada kota",14)); + index.addExample(hashGenerator, T, markers, Example("Ala posiada rysia",51)); + index.addExample(hashGenerator, T, markers, Example("Marysia posiada rysia",123)); + + boost::shared_ptr > SA = index.generateSuffixArray(T); + + + // searching for pattern "Ola posiada rysia Marysia" (5 1 3 4) + + boost::shared_ptr > pattern = hashGenerator->generateHash("Ola posiada rysia Marysia"); + + boost::shared_ptr tmMatchesMap = searcher.getTmMatches(T, markers, SA, pattern); + BOOST_CHECK_EQUAL(tmMatchesMap->size(), 3); + + TmMatches * tmMatches14 = tmMatchesMap->find(14)->second; + TmMatches * tmMatches51 = tmMatchesMap->find(51)->second; + TmMatches * tmMatches123 = tmMatchesMap->find(123)->second; + + BOOST_CHECK_EQUAL(tmMatches14->getExampleId(), 14); + BOOST_CHECK_EQUAL(tmMatches51->getExampleId(), 51); + BOOST_CHECK_EQUAL(tmMatches123->getExampleId(), 123); + + // example 14 + // example interval list: [(1,2)] + boost::ptr_vector exampleIntervals14 = tmMatches14->getExampleIntervals(); + BOOST_CHECK_EQUAL(exampleIntervals14.size(), 1); + BOOST_CHECK_EQUAL(exampleIntervals14[0].getStart(), 1); + BOOST_CHECK_EQUAL(exampleIntervals14[0].getEnd(), 2); + // pattern interval list: [(1,2)] + boost::ptr_vector patternIntervals14 = tmMatches14->getPatternIntervals(); + BOOST_CHECK_EQUAL(patternIntervals14.size(), 1); + BOOST_CHECK_EQUAL(patternIntervals14[0].getStart(), 1); + BOOST_CHECK_EQUAL(patternIntervals14[0].getEnd(), 2); + + // example 51 + // example interval list: [(1,3)] + boost::ptr_vector exampleIntervals51 = tmMatches51->getExampleIntervals(); + BOOST_CHECK_EQUAL(exampleIntervals51.size(), 1); + BOOST_CHECK_EQUAL(exampleIntervals51[0].getStart(), 1); + BOOST_CHECK_EQUAL(exampleIntervals51[0].getEnd(), 3); + // pattern interval list: [(1,3)] + boost::ptr_vector patternIntervals51 = tmMatches51->getPatternIntervals(); + BOOST_CHECK_EQUAL(patternIntervals51.size(), 1); + BOOST_CHECK_EQUAL(patternIntervals51[0].getStart(), 1); + BOOST_CHECK_EQUAL(patternIntervals51[0].getEnd(), 3); + + // example 123 + // example interval list: [(1,3), (0,1)] + boost::ptr_vector exampleIntervals123 = tmMatches123->getExampleIntervals(); + BOOST_CHECK_EQUAL(exampleIntervals123.size(), 2); + BOOST_CHECK_EQUAL(exampleIntervals123[0].getStart(), 1); + BOOST_CHECK_EQUAL(exampleIntervals123[0].getEnd(), 3); + BOOST_CHECK_EQUAL(exampleIntervals123[1].getStart(), 0); + BOOST_CHECK_EQUAL(exampleIntervals123[1].getEnd(), 1); + // pattern interval list: [(1,3), (3,4)] + boost::ptr_vector patternIntervals123 = tmMatches123->getPatternIntervals(); + BOOST_CHECK_EQUAL(patternIntervals123.size(), 2); + BOOST_CHECK_EQUAL(patternIntervals123[0].getStart(), 1); + BOOST_CHECK_EQUAL(patternIntervals123[0].getEnd(), 3); + BOOST_CHECK_EQUAL(patternIntervals123[1].getStart(), 3); + BOOST_CHECK_EQUAL(patternIntervals123[1].getEnd(), 4); + + + boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); + boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS)); + boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX)); + } diff --git a/concordia/t/test_concordia.cpp b/concordia/t/test_concordia.cpp index 4befe58..4fc31b7 100644 --- a/concordia/t/test_concordia.cpp +++ b/concordia/t/test_concordia.cpp @@ -177,7 +177,6 @@ BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 ) n: 0 1 2 3 4 5 6 7 8 9 10 11 SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7 - */ boost::ptr_vector searchResult1 = concordia.anubisSearch("posiada rysia chyba"); boost::ptr_vector searchResult2 = concordia.anubisSearch("posiada kota Ala"); diff --git a/concordia/tm_matches.hpp b/concordia/tm_matches.hpp index ae4eb89..8d1bbb2 100644 --- a/concordia/tm_matches.hpp +++ b/concordia/tm_matches.hpp @@ -5,6 +5,7 @@ #include "concordia/common/config.hpp" #include "concordia/interval.hpp" #include +#include /*! @@ -28,6 +29,14 @@ public: return _score; } + boost::ptr_vector getExampleIntervals() const { + return _exampleMatchedRegions; + } + + boost::ptr_vector getPatternIntervals() const { + return _patternMatchedRegions; + } + SUFFIX_MARKER_TYPE getExampleId() const { return _exampleId; } @@ -61,4 +70,7 @@ private: double _score; }; +typedef boost::ptr_map TmMatchesMap; +typedef TmMatchesMap::iterator TmMatchesMapIterator; + #endif