From 024fbf72aac3a9cc4be9048411cfca920d4997ba Mon Sep 17 00:00:00 2001 From: rjawor Date: Fri, 17 Apr 2015 14:17:59 +0200 Subject: [PATCH] concordia search Former-commit-id: 609c3a54e930ebae45a2e9a07f63991ec4abc9a6 --- TODO.txt | 5 +- concordia-concordiasearch-jrc.sh | 5 ++ concordia-console/concordia-console.cpp | 39 +++++++++++++- concordia-runner-jrc.sh | 8 +-- concordia/CMakeLists.txt | 2 + concordia/anubis_searcher.cpp | 55 +++++++++++++++++--- concordia/anubis_searcher.hpp | 11 +++- concordia/common/config.hpp.in | 2 + concordia/concordia.cpp | 14 +++++ concordia/concordia.hpp | 5 ++ concordia/concordia_config.cpp | 7 ++- concordia/concordia_search_result.cpp | 22 ++++++++ concordia/concordia_search_result.hpp | 41 +++++++++++++++ concordia/index_searcher.cpp | 16 ++++++ concordia/index_searcher.hpp | 8 +++ concordia/matched_pattern_fragment.cpp | 16 ++++++ concordia/matched_pattern_fragment.hpp | 51 +++++++++++++++++++ concordia/t/test_concordia.cpp | 68 ++++++++++++++++++++++--- 18 files changed, 349 insertions(+), 26 deletions(-) create mode 100755 concordia-concordiasearch-jrc.sh create mode 100644 concordia/concordia_search_result.cpp create mode 100644 concordia/concordia_search_result.hpp create mode 100644 concordia/matched_pattern_fragment.cpp create mode 100644 concordia/matched_pattern_fragment.hpp diff --git a/TODO.txt b/TODO.txt index 4c4475f..6941016 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,8 +1,8 @@ -- wyszukiwanie zdania: wyszukanie najdłuższych pasujących fragmentów Anubisem, 1D (approximate) bin packing. Nazwijmy to concordia search. Wyszukiwane są wszystkie najdłuższe dopasowania patternu dzięki LCP search. Zwracany jest wynik w postaci listy najdłuższych dopasowanych fragmentów, posortowanych malejąco po długości, z maksymalnie 3 przedstawicielami każdej długości. Dodatkowo obliczany jest zestaw optymalnego pokrycia patternu. +- wyłączyć stopWords +- Przy concordia search dodatkowo obliczany ma być zestaw optymalnego pokrycia patternu. 
Może siłowo? (jeśli przyjąć max dł. zdania 500 tokenów, to nie powinno być źle) - Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła. - Multi-threading? - concordia-server (zastanowić się, czy nie napisać CAT-a) -- wyłączyć stopWords - zastanowić się nad empty hash examples (rozwiązanie: w ogóle nie szukać fraz o pustym hashu, rzucać wyjątek). - puścić 100% search test na jrc @@ -10,6 +10,7 @@ ---------------------------- Archive ----------------------------- +DONE - wyszukiwanie zdania: wyszukanie najdłuższych pasujących fragmentów Anubisem, 1D (approximate) bin packing. Nazwijmy to concordia search. Wyszukiwane są wszystkie najdłuższe dopasowania patternu dzięki LCP search. Zwracany jest wynik w postaci listy najdłuższych dopasowanych fragmentów, posortowanych malejąco po długości, z maksymalnie 3 przedstawicielami każdej długości. DONE 1. lokalizowane to_lower (wykorzystać utf8case, naprawić testy) DONE 2. 
anonimizacja zdań diff --git a/concordia-concordiasearch-jrc.sh b/concordia-concordiasearch-jrc.sh new file mode 100755 index 0000000..57cce90 --- /dev/null +++ b/concordia-concordiasearch-jrc.sh @@ -0,0 +1,5 @@ +#!/bin/sh + + +./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "$1" + diff --git a/concordia-console/concordia-console.cpp b/concordia-console/concordia-console.cpp index ce63c5c..67e0ea6 100644 --- a/concordia-console/concordia-console.cpp +++ b/concordia-console/concordia-console.cpp @@ -25,9 +25,12 @@ int main(int argc, char** argv) { "Concordia configuration file (required)") ("simple-search,s", boost::program_options::value(), "Pattern to be searched in the index") - ("silent,n", "While searching with simple-search, do not output search results") + ("silent,n", + "While searching with simple-search, do not output search results") ("anubis-search,a", boost::program_options::value(), - "Pattern to be searched by anubis search in the index") + "Pattern to be searched by anubis search in the index") + ("concordia-search,x", boost::program_options::value(), + "Pattern to be searched by concordia search in the index") ("read-file,r", boost::program_options::value(), "File to be read and added to index"); @@ -102,6 +105,38 @@ int main(int argc, char** argv) { << std::endl; } } + } else if (cli.count("concordia-search")) { + std::string pattern = cli["concordia-search"].as(); + std::cout << "\tConcordia searching for pattern: \"" << pattern << + "\"" << std::endl; + time_start = boost::posix_time::microsec_clock::local_time(); + boost::shared_ptr result = + concordia.concordiaSearch(pattern); + time_end = boost::posix_time::microsec_clock::local_time(); + msdiff = time_end - time_start; + + std::cout << "\tPattern used: " << std::endl << "\t\t"; + BOOST_FOREACH(std::string token, result->getTokenVector()) { + std::cout << token << " "; + } + std::cout << std::endl; + + std::cout << "\tFound: " << 
result->getFragments().size() + << " matches. " << "Search took: " << + msdiff.total_milliseconds() << "ms." << std::endl; + if (!cli.count("silent")) { + BOOST_FOREACH(MatchedPatternFragment fragment, + result->getFragments()) { + std::cout << "\t\tfound matching fragment " + << "(exampleId, exampleOffset," + << " patternOffset, length): " + << fragment.getExampleId() << "," + << fragment.getExampleOffset() << "," + << fragment.getPatternOffset() << "," + << fragment.getMatchedLength() << "," + << std::endl; + } + } } else if (cli.count("read-file")) { std::string filePath = cli["read-file"].as(); std::cout << "\tReading sentences from file: " << filePath << diff --git a/concordia-runner-jrc.sh b/concordia-runner-jrc.sh index 396f79a..263b348 100755 --- a/concordia-runner-jrc.sh +++ b/concordia-runner-jrc.sh @@ -12,10 +12,10 @@ echo "CONCORDIA RUNNER: reading from file" ./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -r prod/resources/text-files/jrc_smaller.txt -echo "CONCORDIA RUNNER: anubis searching for pattern: \"Współpraca Państw Członkowskich i Komisji Europejskiej\"" -./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -a "Współpraca Państw Członkowskich i Komisji Europejskiej" -echo "CONCORDIA RUNNER: anubis searching for pattern: \"8. W odniesieniu do artykułu 45 ustęp 12\"" -./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -a "8. W odniesieniu do artykułu 45 ustęp 12" +echo "CONCORDIA RUNNER: concordia searching for pattern: \"Współpraca Państw Członkowskich i Komisji Europejskiej\"" +./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "Współpraca Państw Członkowskich i Komisji Europejskiej" +echo "CONCORDIA RUNNER: concordia searching for pattern: \"8. 
W odniesieniu do artykułu 45 ustęp 12\"" +./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "8. W odniesieniu do artykułu 45 ustęp 12" echo "CONCORDIA RUNNER: searching for pattern: \"Parlamentu Europejskiego\"" ./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Parlamentu Europejskiego" -n diff --git a/concordia/CMakeLists.txt b/concordia/CMakeLists.txt index 15b6089..336672e 100644 --- a/concordia/CMakeLists.txt +++ b/concordia/CMakeLists.txt @@ -6,6 +6,8 @@ foreach(dir ${ALL_DIRECTORIES}) endforeach(dir) add_library(concordia SHARED + concordia_search_result.cpp + matched_pattern_fragment.cpp anubis_searcher.cpp regex_replacement.cpp sentence_anonymizer.cpp diff --git a/concordia/anubis_searcher.cpp b/concordia/anubis_searcher.cpp index 4595c21..b03f400 100644 --- a/concordia/anubis_searcher.cpp +++ b/concordia/anubis_searcher.cpp @@ -11,6 +11,42 @@ AnubisSearcher::AnubisSearcher() { AnubisSearcher::~AnubisSearcher() { } +void AnubisSearcher::concordiaSearch( + boost::shared_ptr result, + boost::shared_ptr > T, + boost::shared_ptr > markers, + boost::shared_ptr > SA, + const std::vector & pattern) + throw(ConcordiaException) { + // add fragments to result and sort them + + std::vector patternVector = + Utils::indexVectorToSaucharVector(pattern); + + if (patternVector.size() != + pattern.size() * sizeof(INDEX_CHARACTER_TYPE)) { + throw ConcordiaException("Increasing pattern resolution went wrong."); + } + + for (int offset = 0; offset < pattern.size(); offset++) { + int highResOffset = offset * sizeof(INDEX_CHARACTER_TYPE); + std::vector currentPattern( + patternVector.begin()+highResOffset, patternVector.end()); + SUFFIX_MARKER_TYPE lcpLength; + std::vector occurences = + lcpSearch(T, markers, SA, currentPattern, lcpLength); + + BOOST_FOREACH(SubstringOccurence occurence, occurences) { + result->addFragment(MatchedPatternFragment( + occurence.getId(), + 
occurence.getOffset(), + offset, + lcpLength / sizeof(INDEX_CHARACTER_TYPE))); + } + } + + result->sortFragments(); +} std::vector AnubisSearcher::anubisSearch( boost::shared_ptr config, @@ -26,21 +62,21 @@ std::vector AnubisSearcher::anubisSearch( // 2. calculate score for each tmMatches // 3. create AnubisSearchResult from tmMatches with scores over threshold // 4. sort the AnubisSearchResult vector decending - + std::vector result; - for(TmMatchesMapIterator iterator = tmMatchesMap->begin(); - iterator != tmMatchesMap->end(); iterator++) { + for (TmMatchesMapIterator iterator = tmMatchesMap->begin(); + iterator != tmMatchesMap->end(); ++iterator) { TmMatches * tmMatches = iterator->second; tmMatches->calculateScore(); - + if (tmMatches->getScore() >= config->getAnubisThreshold()) { result.push_back(AnubisSearchResult(tmMatches->getExampleId(), tmMatches->getScore())); } } - + std::sort(result.begin(), result.end(), std::greater()); - + return result; } @@ -175,6 +211,7 @@ void AnubisSearcher::_collectResults( boost::shared_ptr > markers, boost::shared_ptr > SA, saidx_t left, saidx_t size) { + int resultsCount = 0; for (saidx_t i = 0; i < size; i++) { saidx_t resultPos = SA->at(left + i); @@ -182,6 +219,12 @@ void AnubisSearcher::_collectResults( SUFFIX_MARKER_TYPE marker = markers->at(resultPos / sizeof(INDEX_CHARACTER_TYPE)); result.push_back(SubstringOccurence(marker)); + + // truncate results, + // we don't need too many identical pattern overlays + if (++resultsCount >= CONCORDIA_SEARCH_MAX_RESULTS) { + break; + } } } } diff --git a/concordia/anubis_searcher.hpp b/concordia/anubis_searcher.hpp index c44b99c..1419e12 100644 --- a/concordia/anubis_searcher.hpp +++ b/concordia/anubis_searcher.hpp @@ -8,10 +8,11 @@ #include "concordia/substring_occurence.hpp" #include "concordia/concordia_exception.hpp" #include "concordia/concordia_config.hpp" +#include "concordia/concordia_search_result.hpp" #include "concordia/anubis_search_result.hpp" #include 
"concordia/tm_matches.hpp" -#include +#include #include /*! @@ -27,6 +28,14 @@ public: */ virtual ~AnubisSearcher(); + void concordiaSearch( + boost::shared_ptr result, + boost::shared_ptr > T, + boost::shared_ptr > markers, + boost::shared_ptr > SA, + const std::vector & pattern) + throw(ConcordiaException); + std::vector anubisSearch( boost::shared_ptr config, boost::shared_ptr > T, diff --git a/concordia/common/config.hpp.in b/concordia/common/config.hpp.in index b327666..51a8f5d 100644 --- a/concordia/common/config.hpp.in +++ b/concordia/common/config.hpp.in @@ -28,3 +28,5 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE; //The sentence marker is build as follows: its first bytes store the // sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset // and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length. + +#define CONCORDIA_SEARCH_MAX_RESULTS 3 diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp index c9434f8..54dbc17 100644 --- a/concordia/concordia.cpp +++ b/concordia/concordia.cpp @@ -156,3 +156,17 @@ std::vector Concordia::anubisSearch( } } +boost::shared_ptr Concordia::concordiaSearch( + const std::string & pattern) + throw(ConcordiaException) { + if (_T->size() > 0) { + return _searcher->concordiaSearch(_hashGenerator, _T, + _markers, _SA, pattern); + } else { + std::vector empty; + return boost::shared_ptr( + new ConcordiaSearchResult(empty)); + } +} + + diff --git a/concordia/concordia.hpp b/concordia/concordia.hpp index d72362a..1ae4187 100644 --- a/concordia/concordia.hpp +++ b/concordia/concordia.hpp @@ -12,6 +12,7 @@ #include "concordia/concordia_config.hpp" #include "concordia/concordia_index.hpp" #include "concordia/index_searcher.hpp" +#include "concordia/concordia_search_result.hpp" #include "concordia/anubis_search_result.hpp" #include @@ -49,6 +50,10 @@ public: std::vector anubisSearch(const std::string & pattern) throw(ConcordiaException); + boost::shared_ptr concordiaSearch( + const std::string & 
pattern) + throw(ConcordiaException); + void loadRAMIndexFromDisk() throw(ConcordiaException); void refreshSAfromRAM() throw(ConcordiaException); diff --git a/concordia/concordia_config.cpp b/concordia/concordia_config.cpp index 03462a0..651bf28 100644 --- a/concordia/concordia_config.cpp +++ b/concordia/concordia_config.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include "concordia/concordia_config.hpp" #include "concordia/common/logging.hpp" @@ -46,9 +46,8 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath) ConcordiaConfig::_readConfigParameterStr(NAMED_ENTITIES_PARAM); _stopSymbolsFilePath = ConcordiaConfig::_readConfigParameterStr(STOP_SYMBOLS_PARAM); - _anubisThreshold = - atof( - ConcordiaConfig::_readConfigParameterStr(ANUBIS_THRESHOLD_PARAM).c_str()); + _anubisThreshold = atof(ConcordiaConfig::_readConfigParameterStr( + ANUBIS_THRESHOLD_PARAM).c_str()); } ConcordiaConfig::~ConcordiaConfig() { diff --git a/concordia/concordia_search_result.cpp b/concordia/concordia_search_result.cpp new file mode 100644 index 0000000..ad167ac --- /dev/null +++ b/concordia/concordia_search_result.cpp @@ -0,0 +1,22 @@ +#include "concordia/concordia_search_result.hpp" + +#include + +ConcordiaSearchResult::ConcordiaSearchResult( + const std::vector & tokenVector): + _tokenVector(tokenVector) { +} + +ConcordiaSearchResult::~ConcordiaSearchResult() { +} + +void ConcordiaSearchResult::addFragment( + const MatchedPatternFragment & fragment) { + _matchedPatternFragments.push_back(fragment); +} + +void ConcordiaSearchResult::sortFragments() { + std::sort(_matchedPatternFragments.begin(), + _matchedPatternFragments.end(), + std::greater()); +} diff --git a/concordia/concordia_search_result.hpp b/concordia/concordia_search_result.hpp new file mode 100644 index 0000000..cb47954 --- /dev/null +++ b/concordia/concordia_search_result.hpp @@ -0,0 +1,41 @@ +#ifndef CONCORDIA_SEARCH_RESULT_HDR +#define CONCORDIA_SEARCH_RESULT_HDR + +#include 
"concordia/matched_pattern_fragment.hpp" + +#include +#include + +/*! + Class representing result of concordia search. + +*/ + +class ConcordiaSearchResult { +public: + explicit ConcordiaSearchResult( + const std::vector & tokenVector); + + /*! Destructor. + */ + virtual ~ConcordiaSearchResult(); + + void addFragment(const MatchedPatternFragment & fragment); + + void sortFragments(); + + std::vector getTokenVector() const { + return _tokenVector; + } + + std::vector getFragments() const { + return _matchedPatternFragments; + } + +private: + std::vector _tokenVector; + + std::vector _matchedPatternFragments; +}; + +#endif diff --git a/concordia/index_searcher.cpp b/concordia/index_searcher.cpp index d42beb6..8bd3a8d 100644 --- a/concordia/index_searcher.cpp +++ b/concordia/index_searcher.cpp @@ -59,3 +59,19 @@ std::vector IndexSearcher::anubisSearch( hashGenerator->generateHash(pattern); return _anubisSearcher->anubisSearch(config, T, markers, SA, hash); } + +boost::shared_ptr IndexSearcher::concordiaSearch( + boost::shared_ptr hashGenerator, + boost::shared_ptr > T, + boost::shared_ptr > markers, + boost::shared_ptr > SA, + const std::string & pattern) throw(ConcordiaException) { + std::vector hash = + hashGenerator->generateHash(pattern); + boost::shared_ptr result = + boost::shared_ptr( + new ConcordiaSearchResult(hashGenerator->generateTokenVector(pattern))); + + _anubisSearcher->concordiaSearch(result, T, markers, SA, hash); + return result; +} diff --git a/concordia/index_searcher.hpp b/concordia/index_searcher.hpp index 068832d..5803a80 100644 --- a/concordia/index_searcher.hpp +++ b/concordia/index_searcher.hpp @@ -42,6 +42,14 @@ public: boost::shared_ptr > markers, boost::shared_ptr > SA, const std::string & pattern) throw(ConcordiaException); + + boost::shared_ptr concordiaSearch( + boost::shared_ptr hashGenerator, + boost::shared_ptr > T, + boost::shared_ptr > markers, + boost::shared_ptr > SA, + const std::string & pattern) throw(ConcordiaException); + 
private: boost::shared_ptr _anubisSearcher; }; diff --git a/concordia/matched_pattern_fragment.cpp b/concordia/matched_pattern_fragment.cpp new file mode 100644 index 0000000..6755b87 --- /dev/null +++ b/concordia/matched_pattern_fragment.cpp @@ -0,0 +1,16 @@ +#include "concordia/matched_pattern_fragment.hpp" + +MatchedPatternFragment::MatchedPatternFragment( + const SUFFIX_MARKER_TYPE & exampleId, + const SUFFIX_MARKER_TYPE & exampleOffset, + const SUFFIX_MARKER_TYPE & patternOffset, + const SUFFIX_MARKER_TYPE & matchedLength): + _exampleId(exampleId), + _exampleOffset(exampleOffset), + _patternOffset(patternOffset), + _matchedLength(matchedLength) { +} + +MatchedPatternFragment::~MatchedPatternFragment() { +} + diff --git a/concordia/matched_pattern_fragment.hpp b/concordia/matched_pattern_fragment.hpp new file mode 100644 index 0000000..0ef0390 --- /dev/null +++ b/concordia/matched_pattern_fragment.hpp @@ -0,0 +1,51 @@ +#ifndef MATCHED_PATTERN_FRAGMENT_HDR +#define MATCHED_PATTERN_FRAGMENT_HDR + +#include "concordia/common/config.hpp" + +/*! + Class representing matched pattern fragment in concordia search. + +*/ + +class MatchedPatternFragment { +public: + MatchedPatternFragment(const SUFFIX_MARKER_TYPE & exampleId, + const SUFFIX_MARKER_TYPE & exampleOffset, + const SUFFIX_MARKER_TYPE & patternOffset, + const SUFFIX_MARKER_TYPE & matchedLength); + /*! Destructor. 
+ */ + virtual ~MatchedPatternFragment(); + + SUFFIX_MARKER_TYPE getExampleId() const { + return _exampleId; + } + + SUFFIX_MARKER_TYPE getExampleOffset() const { + return _exampleOffset; + } + + SUFFIX_MARKER_TYPE getPatternOffset() const { + return _patternOffset; + } + + SUFFIX_MARKER_TYPE getMatchedLength() const { + return _matchedLength; + } + + bool operator > (const MatchedPatternFragment & other) const { + return (_matchedLength > other.getMatchedLength()); + } + +private: + SUFFIX_MARKER_TYPE _exampleId; + + SUFFIX_MARKER_TYPE _exampleOffset; + + SUFFIX_MARKER_TYPE _patternOffset; + + SUFFIX_MARKER_TYPE _matchedLength; +}; + +#endif diff --git a/concordia/t/test_concordia.cpp b/concordia/t/test_concordia.cpp index 0554636..f191df6 100644 --- a/concordia/t/test_concordia.cpp +++ b/concordia/t/test_concordia.cpp @@ -191,16 +191,70 @@ BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 ) boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS)); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX)); +} + +BOOST_AUTO_TEST_CASE( ConcordiaSearch1 ) +{ + Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); + concordia.addExample(Example("Ala posiada kota",14)); + concordia.addExample(Example("Ala posiada rysia",51)); + concordia.addExample(Example("Marysia posiada rysia",123)); + concordia.addExample(Example("Gosia chyba posiada rysia też",167)); + concordia.addExample(Example("Ania od wczoraj posiada rysia",45)); + concordia.refreshSAfromRAM(); + + boost::shared_ptr searchResult1 = concordia.concordiaSearch("posiada rysia chyba"); + BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 7); /* - BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 123); - BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1); - BOOST_CHECK_EQUAL(searchResult1.at(1).getId(), 51); - 
BOOST_CHECK_EQUAL(searchResult1.at(1).getOffset(), 1); - - // Checking pattern spanning over 2 segments - BOOST_CHECK_EQUAL(searchResult2.size(), 0); + addFragment 45,2,0,2 + addFragment 51,1,0,2 + addFragment 123,1,0,2 + addFragment 45,3,1,1 + addFragment 51,2,1,1 + addFragment 123,2,1,1 + addFragment 167,1,2,1 */ + + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 45); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 2); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 0); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 2); + + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleId(), 51); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleOffset(), 1); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 0); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 2); + + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleId(), 123); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleOffset(), 1); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 0); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 2); + + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleId(), 45); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleOffset(), 3); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getPatternOffset(), 1); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getMatchedLength(), 1); + + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleId(), 51); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleOffset(), 2); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getPatternOffset(), 1); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getMatchedLength(), 1); + + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleId(), 123); + 
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleOffset(), 2); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getPatternOffset(), 1); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getMatchedLength(), 1); + + + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getExampleId(), 167); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getExampleOffset(), 1); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getPatternOffset(), 2); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getMatchedLength(), 1); + + boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); + boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS)); + boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX)); } BOOST_AUTO_TEST_SUITE_END()