fixed lcp search

Former-commit-id: 18192126d134323569bc43205ccc60788d9e6cb6
2015-04-12 12:06:41 +02:00 · 2015-04-12 12:06:41 +02:00 · f03b4ad954
commit f03b4ad954
parent 2533fd5b44
11 changed files with 380 additions and 51 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -18,6 +18,8 @@ set (SUFFIX_MARKER_TYPE_MAX_VALUE "ULLONG_MAX")
 set (SUFFIX_MARKER_SENTENCE_BYTES 2)
 # The above settings assign 4 bytes to sentence id and 2 bytes each for suffix offset and sentence length.
 # This allows storing 2^32 = 4 294 967 296 sentences, each no longer than 65536 words.
+# After changing these values, be sure to adjust the tests (as well as the above calculations).
+# Also, you might want to re-enable and run TooLongHashTest from test_hash_generator.cpp (it is commented out because it takes too long).

 # =============================== #
 # Production paths
--- a/TODO.txt
+++ b/TODO.txt
@ -4,6 +4,13 @@ DONE 3. Dzielenie zdań (max 255 tokenów)



+
+DONE Anubis search się komplikuje! Przy tworzeniu obiektu tmMatches dla przykładu trzeba podać id przykładu, długość patternu i długość przykładu. Dwa pierwsze mamy, ale niestety nie ma skąd wziąć długości przykładu. Pamiętamy tylko offset sufiksu.
+
+DONE 1. Bitwise operators (i stałe!) przy rozmiarze index character oraz markerów
+2. Wykonać anubis search na nowych markerach z długością zdania
+3. Multi-threading?
+
 - concordia-server
 - zastanowić się nad empty hash examples (rozwiązanie: w ogóle nie szukać fraz o pustym hashu, rzucać wyjątek).
 - wyłączyć stopWords
@ -15,8 +22,3 @@ zastanowić się nad optymalizacją:
 - unordered_map tmMatchesMap
 - LCP array

-DONE Anubis search się komplikuje! Przy tworzeniu obiektu tmMatches dla przykładu trzeba podać id przykładu, długość patternu i długość przykładu. Dwa pierwsze mamy, ale niestety nie ma skąd wziąć długości przykładu. Pamiętamy tylko offset sufiksu.
-
-DONE 1. Bitwise operators (i stałe!) przy rozmiarze index character oraz markerów
-2. Wykonać anubis search na nowych markerach z długością zdania
-3. Multi-threading?
--- a/concordia/anubis_searcher.cpp
+++ b/concordia/anubis_searcher.cpp
@ -1,7 +1,9 @@
 #include "concordia/anubis_searcher.hpp"
 #include "concordia/tm_matches.hpp"
+#include "concordia/common/logging.hpp"

 #include <boost/ptr_container/ptr_map.hpp>
+#include <boost/assign/ptr_map_inserter.hpp> 
 #include <boost/foreach.hpp>
 #include <iostream>
 #include <map>
@ -23,6 +25,10 @@ boost::ptr_vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
                boost::shared_ptr<std::vector<saidx_t> > SA,
                boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
                                                throw(ConcordiaException) {
+    SET_LOGGER_FILE("/tmp/concordia.log");
+    SET_LOGGING_LEVEL("ERROR");
+    INFO("AnubisSearcher::anubisSearch");
+
    boost::ptr_vector<AnubisSearchResult> result;

    boost::shared_ptr<std::vector<sauchar_t> > patternVector =
@ -33,25 +39,66 @@ boost::ptr_vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
        throw ConcordiaException("Increasing pattern resolution went wrong.");
    }

+    INFO("AnubisSearcher::anubisSearch - about to create tmMatchesMap");
    TmMatchesMap tmMatchesMap;
    for (int offset = 0; offset < pattern->size(); offset++) {
+        INFO("AnubisSearcher::anubisSearch - offset: ");
+        INFO(offset);
+
        int highResOffset = offset * sizeof(INDEX_CHARACTER_TYPE);
+        INFO("AnubisSearcher::anubisSearch - high res offset: ");
+        INFO(highResOffset);
        boost::shared_ptr<std::vector<sauchar_t> > currentPattern =
            boost::shared_ptr<std::vector<sauchar_t> >
            (new std::vector<sauchar_t>(
            patternVector->begin()+highResOffset, patternVector->end()));
-        SUFFIX_MARKER_TYPE longestPrefixesLength;
+        SUFFIX_MARKER_TYPE highResLongestPrefixesLength;
+        INFO("AnubisSearcher::anubisSearch - about to get longest prefixes");
        boost::ptr_vector<SubstringOccurence> longestPrefixes =
-            lcpSearch(T, markers, SA, currentPattern, longestPrefixesLength);
+            lcpSearch(T, markers, SA, currentPattern, highResLongestPrefixesLength);
        
-        BOOST_FOREACH(SubstringOccurence & occurence, longestPrefixes) {
-            TmMatchesMapIterator mapIterator = tmMatchesMap.find(
-                occurence.getId());
-            if (mapIterator != tmMatchesMap.end()) {
+        INFO("AnubisSearcher::anubisSearch - longest prefixes got");
+        SUFFIX_MARKER_TYPE longestPrefixesLength = highResLongestPrefixesLength / 
+                                                   sizeof(INDEX_CHARACTER_TYPE);
+        INFO("AnubisSearcher::anubisSearch - longest prefixes high res length");
+        INFO(highResLongestPrefixesLength);
+        INFO("AnubisSearcher::anubisSearch - longest prefixes length");
+        INFO(longestPrefixesLength);
+
+        if (longestPrefixesLength > 0) {
+            BOOST_FOREACH(SubstringOccurence & occurence, longestPrefixes) {
+                boost::shared_ptr<TmMatches> tmMatches;
+
+                TmMatchesMapIterator mapIterator = tmMatchesMap.find(
+                    occurence.getId());
+                if (mapIterator != tmMatchesMap.end()) {
+                    tmMatches = boost::shared_ptr<TmMatches>(
+                                    mapIterator->second
+                                );
+                } else {
+                    tmMatches = boost::shared_ptr<TmMatches>(
+                                                 new TmMatches(
+                                                     occurence.getId(),
+                                                     occurence.getExampleLength(),
+                                                     patternVector->size() 
+                                                 ));
+                }
+                
+                // add intervals to tmMatches
+                tmMatches->addExampleInterval(
+                                              occurence.getOffset(),
+                                              occurence.getOffset() + longestPrefixesLength
+                                             );
+                tmMatches->addPatternInterval(
+                                              offset,
+                                              offset + longestPrefixesLength
+                                             );
            }
        }
    }
    
+    // TODO: get the tmMatches list sorted descending by score
+
    return result;
 }

@ -76,7 +123,7 @@ boost::ptr_vector<SubstringOccurence> AnubisSearcher::lcpSearch(
        prevLeft = left;
        prevSize = size;

-        patternLength++;
+        patternLength += sizeof(INDEX_CHARACTER_TYPE);

        saidx_t localLeft;
        size = sa_search(T->data(), (saidx_t) T->size(),
@ -91,7 +138,8 @@ boost::ptr_vector<SubstringOccurence> AnubisSearcher::lcpSearch(

    if (size == 0) {
        // The search managed to find exactly the longest common prefixes.
-        length = patternLength - 1;
+        
+        length = patternLength - sizeof(INDEX_CHARACTER_TYPE);
        if (length > 0) {
            // Get the results of the previous search
            _collectResults(result, markers, SA, prevLeft, prevSize);
@ -115,9 +163,10 @@ void AnubisSearcher::_collectResults(
                 saidx_t left, saidx_t size) {
    for (saidx_t i = 0; i < size; i++) {
        saidx_t resultPos = SA->at(left + i);
-        SUFFIX_MARKER_TYPE marker = markers->at(resultPos);
-        result.push_back(new SubstringOccurence(
-                            Utils::getIdFromMarker(marker),
-                            Utils::getOffsetFromMarker(marker)));
+        
+        if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
+            SUFFIX_MARKER_TYPE marker = markers->at(resultPos / sizeof(INDEX_CHARACTER_TYPE));
+            result.push_back(new SubstringOccurence(marker));
+        }
    }
 }
--- a/concordia/concordia.cpp
+++ b/concordia/concordia.cpp
@ -143,3 +143,15 @@ boost::ptr_vector<SubstringOccurence> Concordia::simpleSearch(
    }
 }

+boost::ptr_vector<AnubisSearchResult> Concordia::anubisSearch(
+                                          const string & pattern)
+                                  throw(ConcordiaException) {
+    if (_T->size() > 0) {
+        return _searcher->anubisSearch(_hashGenerator, _T,
+                                         _markers, _SA, pattern);
+    } else {
+        boost::ptr_vector<AnubisSearchResult> result;
+        return result;
+    }
+}
+
--- a/concordia/concordia.hpp
+++ b/concordia/concordia.hpp
@ -13,6 +13,7 @@
 #include "concordia/concordia_config.hpp"
 #include "concordia/concordia_index.hpp"
 #include "concordia/index_searcher.hpp"
+#include "concordia/anubis_search_result.hpp"
 #include <divsufsort.h>


@ -47,6 +48,10 @@ public:
                                                   const std::string & pattern)
                                                      throw(ConcordiaException);

+    boost::ptr_vector<AnubisSearchResult> anubisSearch(
+                                                   const std::string & pattern)
+                                                      throw(ConcordiaException);
+
    void loadRAMIndexFromDisk() throw(ConcordiaException);

    void refreshSAfromRAM() throw(ConcordiaException);
--- a/concordia/index_searcher.cpp
+++ b/concordia/index_searcher.cpp
@ -40,9 +40,7 @@ boost::ptr_vector<SubstringOccurence> IndexSearcher::simpleSearch(
            saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
            SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);

-            result.push_back(new SubstringOccurence(
-                        Utils::getIdFromMarker(marker),
-                        Utils::getOffsetFromMarker(marker)));
+            result.push_back(new SubstringOccurence(marker));
        }
    }

--- a/concordia/substring_occurence.cpp
+++ b/concordia/substring_occurence.cpp
@ -1,10 +1,21 @@
 #include "concordia/substring_occurence.hpp"
+#include "concordia/common/utils.hpp"


-SubstringOccurence::SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
-                                       const SUFFIX_MARKER_TYPE & offset):
-                                       _id(id),
-                                       _offset(offset) {
+SubstringOccurence::SubstringOccurence(const SUFFIX_MARKER_TYPE & marker) {
+    _id = Utils::getIdFromMarker(marker);
+    _offset = Utils::getOffsetFromMarker(marker);
+    _exampleLength = Utils::getLengthFromMarker(marker);    
+}
+
+
+SubstringOccurence::SubstringOccurence(
+                                   const SUFFIX_MARKER_TYPE & id,
+                                   const SUFFIX_MARKER_TYPE & offset,
+                                   const SUFFIX_MARKER_TYPE & exampleLength):
+                                   _id(id),
+                                   _offset(offset),
+                                   _exampleLength(exampleLength) {
 }

 SubstringOccurence::~SubstringOccurence() {
--- a/concordia/substring_occurence.hpp
+++ b/concordia/substring_occurence.hpp
@ -13,9 +13,11 @@ using namespace std;

 class SubstringOccurence {
 public:
-    explicit SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
-                                const SUFFIX_MARKER_TYPE & offset);
+    explicit SubstringOccurence(const SUFFIX_MARKER_TYPE & marker);

+    SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
+                                const SUFFIX_MARKER_TYPE & offset,
+                                const SUFFIX_MARKER_TYPE & exampleLength);
    /*! Destructor.
    */
    virtual ~SubstringOccurence();
@ -28,10 +30,17 @@ public:
        return _offset;
    }

+    SUFFIX_MARKER_TYPE getExampleLength() const {
+        return _exampleLength;
+    }
+
 private:
    SUFFIX_MARKER_TYPE _id;

    SUFFIX_MARKER_TYPE _offset;
+
+    // the length of the example
+    SUFFIX_MARKER_TYPE _exampleLength;
 };

 #endif
--- a/concordia/t/test_anubis_searcher.cpp
+++ b/concordia/t/test_anubis_searcher.cpp
@ -21,40 +21,115 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
       word map:  b=1,a=2,n=3,z=4
    */

+    T->push_back(0);
+    T->push_back(0);
+    T->push_back(0);
    T->push_back(1);
+
+    T->push_back(0);
+    T->push_back(0);
+    T->push_back(0);
    T->push_back(2);
+
+    T->push_back(0);
+    T->push_back(0);
+    T->push_back(0);
    T->push_back(3);
+
+    T->push_back(0);
+    T->push_back(0);
+    T->push_back(0);
    T->push_back(2);
+
+    T->push_back(0);
+    T->push_back(0);
+    T->push_back(0);
    T->push_back(3);
+
+    T->push_back(0);
+    T->push_back(0);
+    T->push_back(0);
    T->push_back(2);
    
    for(int i=0;i<6;i++) {
        markers->push_back(Utils::createMarker(34,i,6));
    }
    
+    pattern->push_back(0);
+    pattern->push_back(0);
+    pattern->push_back(0);
    pattern->push_back(2);
+
+    pattern->push_back(0);
+    pattern->push_back(0);
+    pattern->push_back(0);
    pattern->push_back(3);
-    pattern->push_back(4);
+
+    pattern->push_back(0);
+    pattern->push_back(0);
+    pattern->push_back(0);
    pattern->push_back(4);

-    /* Suffix array for the hashed index: 1 2 3 2 3 2
-        0: 1  2  3  2  3  2
-        5: 2
-        3: 2  3  2
-        1: 2  3  2  3  2
-        4: 3  2
-        2: 3  2  3  2
+    pattern->push_back(0);
+    pattern->push_back(0);
+    pattern->push_back(0);
+    pattern->push_back(4);
+    
+    /* Suffix array for the hashed index: 0001 0002 0003 0002 0003 0002
+         0:000100020003000200030002
+        20:0002
+        12:000200030002
+         4:00020003000200030002
+        16:00030002
+         8:0003000200030002
+         1:00100020003000200030002
+        21:002
+        13:00200030002
+         5:0020003000200030002
+        17:0030002
+         9:003000200030002
+         2:0100020003000200030002
+        22:02
+        14:0200030002
+         6:020003000200030002
+        18:030002
+        10:03000200030002
+         3:100020003000200030002
+        23:2
+        15:200030002
+         7:20003000200030002
+        19:30002
+        11:3000200030002
    */
    
    SA->push_back(0);
-    SA->push_back(5);
-    SA->push_back(3);
-    SA->push_back(1);
+    SA->push_back(20);
+    SA->push_back(12);
    SA->push_back(4);
+    SA->push_back(16);
+    SA->push_back(8);
+    SA->push_back(1);
+    SA->push_back(21);
+    SA->push_back(13);
+    SA->push_back(5);
+    SA->push_back(17);
+    SA->push_back(9);
    SA->push_back(2);
+    SA->push_back(22);
+    SA->push_back(14);
+    SA->push_back(6);
+    SA->push_back(18);
+    SA->push_back(10);
+    SA->push_back(3);
+    SA->push_back(23);
+    SA->push_back(15);
+    SA->push_back(7);
+    SA->push_back(19);
+    SA->push_back(11);

-    SUFFIX_MARKER_TYPE length;
-    boost::ptr_vector<SubstringOccurence> result = searcher.lcpSearch(T, markers, SA, pattern, length);
+    SUFFIX_MARKER_TYPE highResLength;
+    boost::ptr_vector<SubstringOccurence> result = searcher.lcpSearch(T, markers, SA, pattern, highResLength);
+    SUFFIX_MARKER_TYPE length = highResLength / sizeof(INDEX_CHARACTER_TYPE);

    /* Expecting to get the following results from SA:
        3: ana
@ -72,21 +147,46 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
    //--------pattern banana

    boost::shared_ptr<std::vector<sauchar_t> > pattern2(new std::vector<sauchar_t>());
+    pattern2->push_back(0);
+    pattern2->push_back(0);
+    pattern2->push_back(0);
    pattern2->push_back(1);
-    pattern2->push_back(2);
-    pattern2->push_back(3);
-    pattern2->push_back(2);
-    pattern2->push_back(3);
+
+    pattern2->push_back(0);
+    pattern2->push_back(0);
+    pattern2->push_back(0);
    pattern2->push_back(2);

-    SUFFIX_MARKER_TYPE length2;
-    boost::ptr_vector<SubstringOccurence> result2 = searcher.lcpSearch(T, markers, SA, pattern2, length2);
+    pattern2->push_back(0);
+    pattern2->push_back(0);
+    pattern2->push_back(0);
+    pattern2->push_back(3);
+
+    pattern2->push_back(0);
+    pattern2->push_back(0);
+    pattern2->push_back(0);
+    pattern2->push_back(2);
+
+    pattern2->push_back(0);
+    pattern2->push_back(0);
+    pattern2->push_back(0);
+    pattern2->push_back(3);
+
+    pattern2->push_back(0);
+    pattern2->push_back(0);
+    pattern2->push_back(0);
+    pattern2->push_back(2);
+
+    SUFFIX_MARKER_TYPE highResLength2;
+    boost::ptr_vector<SubstringOccurence> result2 = searcher.lcpSearch(T, markers, SA, pattern2, highResLength2);
+    SUFFIX_MARKER_TYPE length2 = highResLength2 / sizeof(INDEX_CHARACTER_TYPE);

    /* Expecting to get one result from SA:
        0: banana
       Which is one substring occurence (34,0) with the lcp length = 6;
    */

+
    BOOST_CHECK_EQUAL(result2.size(),1);
    BOOST_CHECK_EQUAL(length2,6);
    BOOST_CHECK_EQUAL(result2.at(0).getId(),34);
@ -95,14 +195,34 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
    //--------pattern banan

    boost::shared_ptr<std::vector<sauchar_t> > pattern3(new std::vector<sauchar_t>());
+    pattern3->push_back(0);
+    pattern3->push_back(0);
+    pattern3->push_back(0);
    pattern3->push_back(1);
+
+    pattern3->push_back(0);
+    pattern3->push_back(0);
+    pattern3->push_back(0);
    pattern3->push_back(2);
-    pattern3->push_back(3);
-    pattern3->push_back(2);
+
+    pattern3->push_back(0);
+    pattern3->push_back(0);
+    pattern3->push_back(0);
    pattern3->push_back(3);

-    SUFFIX_MARKER_TYPE length3;
-    boost::ptr_vector<SubstringOccurence> result3 = searcher.lcpSearch(T, markers, SA, pattern3, length3);
+    pattern3->push_back(0);
+    pattern3->push_back(0);
+    pattern3->push_back(0);
+    pattern3->push_back(2);
+
+    pattern3->push_back(0);
+    pattern3->push_back(0);
+    pattern3->push_back(0);
+    pattern3->push_back(3);
+
+    SUFFIX_MARKER_TYPE highResLength3;
+    boost::ptr_vector<SubstringOccurence> result3 = searcher.lcpSearch(T, markers, SA, pattern3, highResLength3);
+    SUFFIX_MARKER_TYPE length3 = highResLength3 / sizeof(INDEX_CHARACTER_TYPE);

    /* Expecting to get one result from SA:
        0: banana
@ -117,13 +237,29 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
    //--------pattern nazz

    boost::shared_ptr<std::vector<sauchar_t> > pattern4(new std::vector<sauchar_t>());
+    pattern4->push_back(0);
+    pattern4->push_back(0);
+    pattern4->push_back(0);
    pattern4->push_back(3);
+
+    pattern4->push_back(0);
+    pattern4->push_back(0);
+    pattern4->push_back(0);
    pattern4->push_back(2);
-    pattern4->push_back(4);
+
+    pattern4->push_back(0);
+    pattern4->push_back(0);
+    pattern4->push_back(0);
    pattern4->push_back(4);

-    SUFFIX_MARKER_TYPE length4;
-    boost::ptr_vector<SubstringOccurence> result4 = searcher.lcpSearch(T, markers, SA, pattern4, length4);
+    pattern4->push_back(0);
+    pattern4->push_back(0);
+    pattern4->push_back(0);
+    pattern4->push_back(4);
+
+    SUFFIX_MARKER_TYPE highResLength4;
+    boost::ptr_vector<SubstringOccurence> result4 = searcher.lcpSearch(T, markers, SA, pattern4, highResLength4);
+    SUFFIX_MARKER_TYPE length4 = highResLength4 / sizeof(INDEX_CHARACTER_TYPE);

    /* Expecting to get 2 results from SA:
        4: na
@ -137,6 +273,60 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
    BOOST_CHECK_EQUAL(result4.at(0).getOffset(),4);
    BOOST_CHECK_EQUAL(result4.at(1).getId(),34);
    BOOST_CHECK_EQUAL(result4.at(1).getOffset(),2);
+    
+
+    //--------pattern zz
+
+    boost::shared_ptr<std::vector<sauchar_t> > pattern5(new std::vector<sauchar_t>());
+    pattern5->push_back(0);
+    pattern5->push_back(0);
+    pattern5->push_back(0);
+    pattern5->push_back(4);
+
+    pattern5->push_back(0);
+    pattern5->push_back(0);
+    pattern5->push_back(0);
+    pattern5->push_back(4);
+
+    SUFFIX_MARKER_TYPE highResLength5;
+    boost::ptr_vector<SubstringOccurence> result5 = searcher.lcpSearch(T, markers, SA, pattern5, highResLength5);
+    SUFFIX_MARKER_TYPE length5 = highResLength5 / sizeof(INDEX_CHARACTER_TYPE);
+
+    /* Expecting to get 0 results from SA, lcp length = 0;
+    */
+
+    BOOST_CHECK_EQUAL(result5.size(),0);
+    BOOST_CHECK_EQUAL(length5,0);
+
+    //--------pattern existing in the text but spanning over parts of characters
+
+    boost::shared_ptr<std::vector<sauchar_t> > pattern6(new std::vector<sauchar_t>());
+    pattern6->push_back(0);
+    pattern6->push_back(0);
+    pattern6->push_back(3);
+
+    pattern6->push_back(0);
+    pattern6->push_back(0);
+    pattern6->push_back(0);
+    pattern6->push_back(2);
+
+    pattern6->push_back(0);
+
+    SUFFIX_MARKER_TYPE highResLength6;
+    boost::ptr_vector<SubstringOccurence> result6 = searcher.lcpSearch(T, markers, SA, pattern5, highResLength6);
+    SUFFIX_MARKER_TYPE length6 = highResLength6 / sizeof(INDEX_CHARACTER_TYPE);
+
+    /* Expecting to get 0 results from SA, lcp length = 0;
+    */
+
+    BOOST_CHECK_EQUAL(result6.size(),0);
+    BOOST_CHECK_EQUAL(length6,0);
+
+}
+
+BOOST_AUTO_TEST_CASE( AnubisSearch1 )
+{
+    
 }


--- a/concordia/t/test_concordia.cpp
+++ b/concordia/t/test_concordia.cpp
@ -1,5 +1,6 @@
 #include "tests/unit-tests/unit_tests_globals.hpp"
 #include "concordia/concordia.hpp"
+#include "concordia/anubis_search_result.hpp"
 #include "tests/common/test_resources_manager.hpp"
 #include "concordia/common/config.hpp"

@ -148,5 +149,52 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
    BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1);
 }

+BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
+{
+    Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
+    concordia.addExample(Example("Ala posiada kota",14));
+    concordia.addExample(Example("Ala posiada rysia",51));
+    concordia.addExample(Example("Marysia posiada rysia",123));
+    concordia.refreshSAfromRAM();
+        
+    /*The test index contains 3 sentences:    
+     14: "Ala posiada kota"
+     51: "Ala posiada rysia"
+    123: "Marysia posiada rysia"
+    
+    Test word map:
+    Ala -> 0
+    posiada -> 1
+    kota -> 2
+    rysia -> 3
+    Marysia -> 4
+    
+    Test hashed index:
+        n: 0  1  2  3  4  5  6  7  8  9 10 11
+     T[n]: 0  1  2  |  0  1  3  |  4  1  3  |
+    
+    Test suffix array:
+        n: 0  1  2  3  4  5  6  7  8  9 10 11
+    SA[n]: 0  4  1  9  5  2 10  6  8 11  3  7 
+    
+    */    
+    boost::ptr_vector<AnubisSearchResult> searchResult1 = concordia.anubisSearch("posiada rysia chyba");
+    boost::ptr_vector<AnubisSearchResult> searchResult2 = concordia.anubisSearch("posiada kota Ala");
+
+    boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); 
+    boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS)); 
+    boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX)); 
+
+    /*
+    BOOST_CHECK_EQUAL(searchResult1.size(), 2);
+    BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 123);
+    BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1);
+    BOOST_CHECK_EQUAL(searchResult1.at(1).getId(), 51);
+    BOOST_CHECK_EQUAL(searchResult1.at(1).getOffset(), 1);
+    
+    // Checking pattern spanning over 2 segments
+    BOOST_CHECK_EQUAL(searchResult2.size(), 0);
+    */
+}

 BOOST_AUTO_TEST_SUITE_END()
--- a/concordia/t/test_hash_generator.cpp
+++ b/concordia/t/test_hash_generator.cpp
@ -32,6 +32,8 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
    BOOST_CHECK_EQUAL_COLLECTIONS(hash->begin(), hash->end(), expected->begin(), expected->end());
 }

+/* Commented out - the test takes too long. Run it once whenever the SUFFIX_MARKER_SENTENCE_BYTES parameter changes.
+                   Running it is otherwise optional: there is still the test for max sentence size in test_utils.cpp
 BOOST_AUTO_TEST_CASE( TooLongHashTest )
 {
    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
@ -62,6 +64,7 @@ BOOST_AUTO_TEST_CASE( TooLongHashTest )

    
 }
+*/

 BOOST_AUTO_TEST_CASE( HashSerializationTest )
 {