done word positions

2015-06-26 22:50:53 +02:00 · 2015-06-26 22:50:53 +02:00 · dba70b4e24
commit dba70b4e24
parent 724bf0d080
19 changed files with 178 additions and 150 deletions
--- a/TODO.txt
+++ b/TODO.txt
@ -1,10 +1,7 @@
 ---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) -----------------------------
-
 - repair stop words feature
- work on word regex pattern (allow for some symbols and digits within the word)
 - document the code (classes, cfg files) and update tutorial
 IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? Należy przechowywać oryginalne pozycje tokenów w bazie danych concordia-server. Pozycje te będą obliczane przez funkcję generateTokenVector (przy użyciu listy oryginalnych pozycji, która będzie modyfikowana synchronicznie z każdą anonimizacją)
- concordia_search_result nie musi przechowywać całego tokenVector (bo używa go tylko do odczytania size()).
 - wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń.
 - testy zużycia pamięci
 - Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.
@ -13,6 +10,11 @@ IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieś


 ---------------------------- Archive -----------------------------
+DONE - deal with 0 length patterns
+DONE - repair concordia-console test feature
+DONE - update tests
+DONE - work on word regex pattern (allow for some symbols and digits within the word)
+REJECTED - concordia_search_result nie musi przechowywać całego tokenVector (bo używa go tylko do odczytania size()).
 DONE - implement tokenAnnotations vector as interval tree (not interval tree, but list, which is even better)
 DONE (IT IS GOOD AS IT IS) - mess with gcc performance optimization options (https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html)
 DONE - concordia-server (zastanowić się, czy nie napisać CAT-a oraz nad tym, czy nie oddzielić projektu concordia-server).
@ -26,7 +28,7 @@ REJECTED - zastanowić się nad empty hash examples (rozwiązanie: w ogóle nie

 DONE - wyłączyć stopWords

-DONE - Przy concordia searh dodatkowo obliczany ma być zestaw optymalnego pokrycia patternu. Może siłowo? (jeśli przyjąć max dł. zdania 500 tokenów, to nie powinno być źle)
+DONE - Przy concordia search dodatkowo obliczany ma być zestaw optymalnego pokrycia patternu. Może siłowo? (jeśli przyjąć max dł. zdania 500 tokenów, to nie powinno być źle)

 DONE - wyszukiwanie zdania: wyszukanie najdłuższych pasujących fragmentów Anubisem, 1D (approximate) bin packing. Nazwijmy to concordia search. Wyszukiwane są wszystkie najdłuższe dopasowania patternu dzięki LCP search. Zwracany jest wynik w postaci listy najdłuższych dopasowanych fragmentów, posortowanych malejąco po długości, z maksymalnie 3 przedstawicielami każdej długości. 

--- a/concordia-console/concordia-console.cpp
+++ b/concordia-console/concordia-console.cpp
@ -8,6 +8,7 @@

 #include "concordia/concordia.hpp"
 #include "concordia/substring_occurence.hpp"
+#include "concordia/token_annotation.hpp"
 #include "concordia/common/config.hpp"
 #include "concordia/common/utils.hpp"
 #include "build/libdivsufsort/include/divsufsort.h"
@ -27,30 +28,32 @@ void checkConcordiaResults(
        long baseLineCount) {
    long lineIndex = 1;
    BOOST_FOREACH(ConcordiaSearchResult result, results) {
-        SUFFIX_MARKER_TYPE patternSize = result.getTokenVector().size();
-
-        if (result.getBestOverlay().size() != 1) {
-            reportError(baseLineCount + lineIndex,
-                       "best overlay has more than one fragment.");
-        }
-        if (result.getBestOverlay().at(0).getMatchedLength()
-             != patternSize) {
-            reportError(baseLineCount + lineIndex,
-                   "best overlay fragment has different size than pattern.");
-        }
-        if (result.getBestOverlayScore() != 1) {
-            reportError(baseLineCount + lineIndex,
-                   "best overlay score is not 1.");
-        }
-        if (result.getFragments().size() == 0) {
-            reportError(baseLineCount + lineIndex,
-                   "there are no matched fragments.");
-        }
-        if (result.getFragments().at(0).getMatchedLength()
-             != patternSize) {
-            reportError(baseLineCount + lineIndex,
-                   "the first fragment does not cover the whole pattern.");
+        SUFFIX_MARKER_TYPE patternSize = result.getTokenizedPattern()->getTokens().size();
+        if (patternSize > 0) {                
+            if (result.getBestOverlay().size() != 1) {
+                reportError(baseLineCount + lineIndex,
+                           "best overlay has more than one fragment.");
+            }
+            if (result.getBestOverlay().at(0).getMatchedLength()
+                 != patternSize) {
+                reportError(baseLineCount + lineIndex,
+                       "best overlay fragment has different size than pattern.");
+            }
+            if (result.getBestOverlayScore() != 1) {
+                reportError(baseLineCount + lineIndex,
+                       "best overlay score is not 1.");
+            }
+            if (result.getFragments().size() == 0) {
+                reportError(baseLineCount + lineIndex,
+                       "there are no matched fragments.");
+            }
+            if (result.getFragments().at(0).getMatchedLength()
+                 != patternSize) {
+                reportError(baseLineCount + lineIndex,
+                       "the first fragment does not cover the whole pattern.");
+            }
        }
+        lineIndex++;
    }
 }

@ -198,8 +201,8 @@ int main(int argc, char** argv) {
            msdiff = time_end - time_start;

            std::cout << "\tPattern used: " << std::endl << "\t\t";
-            BOOST_FOREACH(std::string token, result->getTokenVector()) {
-                std::cout << token << " ";
+            BOOST_FOREACH(TokenAnnotation annotation, result->getTokenizedPattern()->getTokens()) {
+                std::cout << annotation.getValue() << " ";
            }
            std::cout << std::endl;

--- a/concordia/concordia.cpp
+++ b/concordia/concordia.cpp
@ -44,16 +44,16 @@ std::string _createLibraryVersion() {

 // Sentences are written to disk and added to T.
 // SA is generated on command by other methods.
-void Concordia::addExample(const Example & example)
+boost::shared_ptr<TokenizedSentence> Concordia::addExample(const Example & example)
                                 throw(ConcordiaException) {
-    _index->addExample(_hashGenerator, _T, _markers, example);
+    return _index->addExample(_hashGenerator, _T, _markers, example);
 }

 // Sentences are written to disk and added to T.
 // SA is generated on command by other methods.
-void Concordia::addAllExamples(const std::vector<Example> & examples)
+std::vector<TokenizedSentence> Concordia::addAllExamples(const std::vector<Example> & examples)
                                              throw(ConcordiaException) {
-    _index->addAllExamples(_hashGenerator, _T, _markers, examples);
+    return _index->addAllExamples(_hashGenerator, _T, _markers, examples);
 }

 void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) {
@ -163,9 +163,9 @@ boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
        return _searcher->concordiaSearch(_hashGenerator, _T,
                                       _markers, _SA, pattern);
    } else {
-        std::vector<std::string> empty;
+        std::string empty;
        return boost::shared_ptr<ConcordiaSearchResult>(
-            new ConcordiaSearchResult(empty));
+            new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>(new TokenizedSentence(empty))));
    }
 }

--- a/concordia/concordia.hpp
+++ b/concordia/concordia.hpp
@ -13,6 +13,7 @@
 #include "concordia/concordia_index.hpp"
 #include "concordia/index_searcher.hpp"
 #include "concordia/concordia_search_result.hpp"
+#include "concordia/tokenized_sentence.hpp"
 #include "concordia/anubis_search_result.hpp"
 #include <divsufsort.h>

@ -55,13 +56,13 @@ public:
      \param example example to be added
      \throws ConcordiaException
    */
-    void addExample(const Example & example) throw(ConcordiaException);
+    boost::shared_ptr<TokenizedSentence> addExample(const Example & example) throw(ConcordiaException);

    /*! Adds multiple examples to the index.
      \param examples vector of examples to be added
      \throws ConcordiaException
    */
-    void addAllExamples(const std::vector<Example> & examples)
+    std::vector<TokenizedSentence> addAllExamples(const std::vector<Example> & examples)
                                                   throw(ConcordiaException);

    /*! Performs a simple substring lookup on the index.
--- a/concordia/concordia_index.cpp
+++ b/concordia/concordia_index.cpp
@ -34,25 +34,7 @@ boost::shared_ptr<std::vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
    return result;
 }

-void ConcordiaIndex::addExample(
-                boost::shared_ptr<HashGenerator> hashGenerator,
-                boost::shared_ptr<std::vector<sauchar_t> > T,
-                boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
-                const Example & example) {
-    std::ofstream hashedIndexFile;
-    hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
-                                             std::ios::app|std::ios::binary);
-    std::ofstream markersFile;
-    markersFile.open(_markersFilePath.c_str(), std::ios::out|
-                                             std::ios::app|std::ios::binary);
-    _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
-                                                      T, markers, example);
-    hashedIndexFile.close();
-    markersFile.close();
-    hashGenerator->serializeWordMap();
-}
-
-void ConcordiaIndex::addAllExamples(
+std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
                boost::shared_ptr<HashGenerator> hashGenerator,
                boost::shared_ptr<std::vector<sauchar_t> > T,
                boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -64,25 +46,50 @@ void ConcordiaIndex::addAllExamples(
    markersFile.open(_markersFilePath.c_str(), std::ios::out|
                                             std::ios::app|std::ios::binary);

+    std::vector<TokenizedSentence> hashedPatterns;
    BOOST_FOREACH(Example example, examples) {
-        _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
+        boost::shared_ptr<TokenizedSentence> hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
                                                      T, markers, example);
+        hashedPatterns.push_back(*hashedPattern);
    }

    hashedIndexFile.close();
    markersFile.close();
    hashGenerator->serializeWordMap();
+    
+    return hashedPatterns;
 }

-void ConcordiaIndex::_addSingleExample(
+boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
+                boost::shared_ptr<HashGenerator> hashGenerator,
+                boost::shared_ptr<std::vector<sauchar_t> > T,
+                boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
+                const Example & example) {
+    std::ofstream hashedIndexFile;
+    hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
+                                             std::ios::app|std::ios::binary);
+    std::ofstream markersFile;
+    markersFile.open(_markersFilePath.c_str(), std::ios::out|
+                                             std::ios::app|std::ios::binary);
+    boost::shared_ptr<TokenizedSentence> hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
+                                                      T, markers, example);
+    hashedIndexFile.close();
+    markersFile.close();
+    hashGenerator->serializeWordMap();
+    
+    return hashedPattern;
+}
+
+boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
                   std::ofstream & hashedIndexFile,
                   std::ofstream & markersFile,
                   boost::shared_ptr<HashGenerator> hashGenerator,
                   boost::shared_ptr<std::vector<sauchar_t> > T,
                   boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
                   const Example & example) {
-    std::vector<INDEX_CHARACTER_TYPE> hash
-                          = hashGenerator->generateHash(example.getSentence());
+    boost::shared_ptr<TokenizedSentence> hashedPattern = hashGenerator->generateHash(example.getSentence());
+    std::vector<INDEX_CHARACTER_TYPE> hash = hashedPattern->getCodes();
+                          
    int offset = 0;
    for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
                                          it != hash.end(); ++it) {
@ -110,5 +117,7 @@ void ConcordiaIndex::_addSingleExample(
    SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
    Utils::writeMarker(markersFile, sentenceBoundaryMA);
    markers->push_back(sentenceBoundaryMA);
+    
+    return hashedPattern;
 }

--- a/concordia/concordia_index.hpp
+++ b/concordia/concordia_index.hpp
@ -11,6 +11,7 @@
 #include "concordia/example.hpp"
 #include "concordia/hash_generator.hpp"
 #include "concordia/concordia_exception.hpp"
+#include "concordia/tokenized_sentence.hpp"
 #include <divsufsort.h>

 /*!
@ -50,7 +51,7 @@ public:
      \param example example to be added to index
      \throws ConcordiaException
    */
-    void addExample(
+    boost::shared_ptr<TokenizedSentence> addExample(
                boost::shared_ptr<HashGenerator> hashGenerator,
                boost::shared_ptr<std::vector<sauchar_t> > T,
                boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -68,7 +69,7 @@ public:
      \param examples vector of examples to be added to index
      \throws ConcordiaException
    */
-    void addAllExamples(
+    std::vector<TokenizedSentence> addAllExamples(
                boost::shared_ptr<HashGenerator> hashGenerator,
                boost::shared_ptr<std::vector<sauchar_t> > T,
                boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -82,7 +83,7 @@ public:
                boost::shared_ptr<std::vector<sauchar_t> > T);

 private:
-    void _addSingleExample(std::ofstream & hashedIndexFile,
+    boost::shared_ptr<TokenizedSentence> _addSingleExample(std::ofstream & hashedIndexFile,
                std::ofstream & markersFile,
                boost::shared_ptr<HashGenerator> hashGenerator,
                boost::shared_ptr<std::vector<sauchar_t> > T,
--- a/concordia/concordia_search_result.cpp
+++ b/concordia/concordia_search_result.cpp
@ -4,8 +4,8 @@
 #include <algorithm>

 ConcordiaSearchResult::ConcordiaSearchResult(
-                        const std::vector<std::string> & tokenVector):
-                                   _tokenVector(tokenVector),
+                boost::shared_ptr<TokenizedSentence> tokenizedPattern):
+                                   _tokenizedPattern(tokenizedPattern),
                                   _bestOverlayScore(0) {
 }

@ -27,7 +27,7 @@ void ConcordiaSearchResult::computeBestOverlay() {
    // the fragments are already sorted by their ends, ascending
    _checkPossibleOverlays(std::vector<MatchedPatternFragment>(),
                           -1,
-                           _tokenVector.size());
+                           _tokenizedPattern->getTokens().size());
 }

 void ConcordiaSearchResult::_checkPossibleOverlays(
--- a/concordia/concordia_search_result.hpp
+++ b/concordia/concordia_search_result.hpp
@ -3,7 +3,9 @@

 #include "concordia/common/config.hpp"
 #include "concordia/matched_pattern_fragment.hpp"
+#include "concordia/tokenized_sentence.hpp"

+#include <boost/shared_ptr.hpp>
 #include <vector>
 #include <string>

@ -25,7 +27,7 @@ public:
      \param tokenVector tokenized pattern which was used for searching
    */
    explicit ConcordiaSearchResult(
-                const std::vector<std::string> & tokenVector);
+                boost::shared_ptr<TokenizedSentence> tokenizedPattern);

    /*! Destructor.
    */
@ -49,8 +51,8 @@ public:
    /*! Getter for tokenized pattern.
        \returns tokenized search pattern
    */
-    std::vector<std::string> getTokenVector() const {
-        return _tokenVector;
+    boost::shared_ptr<TokenizedSentence> getTokenizedPattern() const {
+        return _tokenizedPattern;
    }

    /*! Getter for all matched pattern fragments list.
@ -80,7 +82,7 @@ private:
                SUFFIX_MARKER_TYPE lastAddedPos,
                SUFFIX_MARKER_TYPE patternSize);

-    std::vector<std::string> _tokenVector;
+    boost::shared_ptr<TokenizedSentence> _tokenizedPattern;

    std::vector<MatchedPatternFragment> _matchedPatternFragments;

--- a/concordia/hash_generator.cpp
+++ b/concordia/hash_generator.cpp
@ -27,10 +27,8 @@ HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
 HashGenerator::~HashGenerator() {
 }

-std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
+boost::shared_ptr<TokenizedSentence> HashGenerator::generateHash(
                     const std::string & sentence) throw(ConcordiaException) {
-    std::vector<INDEX_CHARACTER_TYPE> result;
-    
    boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
    ts->generateHash(_wordMap);
    
@ -38,23 +36,9 @@ std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
        throw ConcordiaException("Trying to add too long sentence.");
    }

-    return ts->getCodes();
+    return ts;
 }

-std::vector<std::string> HashGenerator::generateTokenVector(
-                                               const std::string & sentence) {
-    boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
-    std::vector<std::string> tokenTexts;
-    BOOST_FOREACH(TokenAnnotation annotation, ts->getAnnotations()) {
-        if (annotation.getType() == TokenAnnotation::WORD ||
-               annotation.getType() == TokenAnnotation::NE) {
-            tokenTexts.push_back(annotation.getValue());
-        } 
-    }
-    return tokenTexts;
-}
-
-
 void HashGenerator::serializeWordMap() {
    std::ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
    boost::archive::binary_oarchive oa(ofs);
--- a/concordia/hash_generator.hpp
+++ b/concordia/hash_generator.hpp
@ -44,20 +44,9 @@ public:
      \param sentence sentence to generate hash from
      \returns vector of integers
    */
-    std::vector<INDEX_CHARACTER_TYPE> generateHash(const std::string & sentence)
+    boost::shared_ptr<TokenizedSentence> generateHash(const std::string & sentence)
                                throw(ConcordiaException);

-    /*!
-      Generates vector of tokens from a sentence. This method is internally
-      used by generateHash. However, for the sake of concordiaSearch
-      (see \ref tutorial1_3), the vector of tokens resulting from sentence
-      tokenization is also needed.
-      \param sentence sentence to tokenize
-      \returns vector of tokens
-    */
-    std::vector<std::string> generateTokenVector(const std::string & sentence);
-
-
    /*!
        Saves the contents of current WordMap to HDD.
    */
--- a/concordia/index_searcher.cpp
+++ b/concordia/index_searcher.cpp
@ -1,6 +1,7 @@
 #include "concordia/index_searcher.hpp"

 #include "concordia/common/utils.hpp"
+#include "concordia/tokenized_sentence.hpp"
 #include <boost/filesystem.hpp>

 IndexSearcher::IndexSearcher() {
@ -22,7 +23,7 @@ std::vector<SubstringOccurence> IndexSearcher::simpleSearch(

    int left;
    std::vector<INDEX_CHARACTER_TYPE> hash =
-                                 hashGenerator->generateHash(pattern);
+                            hashGenerator->generateHash(pattern)->getCodes();
    saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
    sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);

@ -56,7 +57,7 @@ std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
                  boost::shared_ptr<std::vector<saidx_t> > SA,
                  const std::string & pattern) throw(ConcordiaException) {
    std::vector<INDEX_CHARACTER_TYPE> hash =
-                                 hashGenerator->generateHash(pattern);
+                         hashGenerator->generateHash(pattern)->getCodes();
    return _concordiaSearcher->anubisSearch(config, T, markers, SA, hash);
 }

@ -66,12 +67,12 @@ boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
                  boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
                  boost::shared_ptr<std::vector<saidx_t> > SA,
                  const std::string & pattern) throw(ConcordiaException) {
-    std::vector<INDEX_CHARACTER_TYPE> hash =
+    boost::shared_ptr<TokenizedSentence> hashedPattern =
                                 hashGenerator->generateHash(pattern);
    boost::shared_ptr<ConcordiaSearchResult> result =
     boost::shared_ptr<ConcordiaSearchResult>(
-       new ConcordiaSearchResult(hashGenerator->generateTokenVector(pattern)));
+       new ConcordiaSearchResult(hashedPattern));

-    _concordiaSearcher->concordiaSearch(result, T, markers, SA, hash);
+    _concordiaSearcher->concordiaSearch(result, T, markers, SA, hashedPattern->getCodes());
    return result;
 }
--- a/concordia/t/CMakeLists.txt
+++ b/concordia/t/CMakeLists.txt
@ -1,4 +1,5 @@
 add_library(concordia-tests
+  test_hash_generator.cpp
  test_regex_rule.cpp
  test_tokenized_sentence.cpp
  test_concordia_searcher.cpp
@ -10,7 +11,6 @@ add_library(concordia-tests
  test_logging.cpp
  test_utils.cpp
  test_word_map.cpp
-  test_hash_generator.cpp
  test_concordia_index.cpp
  test_concordia_config.cpp
  test_concordia.cpp
--- a/concordia/t/test_concordia.cpp
+++ b/concordia/t/test_concordia.cpp
@ -1,13 +1,17 @@
 #include "tests/unit-tests/unit_tests_globals.hpp"
 #include "concordia/concordia.hpp"
 #include "concordia/anubis_search_result.hpp"
+#include "concordia/tokenized_sentence.hpp"
+#include "concordia/token_annotation.hpp"
 #include "tests/common/test_resources_manager.hpp"
 #include "concordia/common/config.hpp"

+#include <boost/shared_ptr.hpp>
 #include <boost/algorithm/string/predicate.hpp>
 #include <boost/filesystem.hpp>

 #include <string>
+#include <vector>

 BOOST_AUTO_TEST_SUITE(concordia_main)

@ -22,7 +26,18 @@ BOOST_AUTO_TEST_CASE( ConcordiaVersion )
 BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
 {
    Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
-    concordia.addExample(Example("Ala posiada kota",14));
+    boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Ala posiada kota",14));
+    /*
+    0,3 type: 1 value: ala
+    4,11 type: 1 value: posiada
+    12,16 type: 1 value: kota
+    */
+    BOOST_CHECK_EQUAL(ts->getTokens().size(), 3);
+    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 4);
+    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 11);
+    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1);
+    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada");
+    
    concordia.addExample(Example("Ala posiada rysia",51));
    concordia.addExample(Example("Marysia posiada rysia",123));
    concordia.refreshSAfromRAM();
@ -62,7 +77,6 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
    
    // Checking pattern spanning over 2 segments
    BOOST_CHECK_EQUAL(searchResult2.size(), 0);
-
 }

 BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
@ -74,7 +88,17 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
    testExamples.push_back(Example("czy xjest okno otwarte",202));
    testExamples.push_back(Example("chyba xto xjest xtutaj",45));
    testExamples.push_back(Example("xto xjest",29));
-    concordia.addAllExamples(testExamples);
+    std::vector<TokenizedSentence> hashedPatterns = concordia.addAllExamples(testExamples);
+    /* checking hashed pattern of sentence "chyba xto xjest xtutaj":
+    0,5 type: 1 value: chyba
+    6,9 type: 1 value: xto
+    10,15 type: 1 value: xjest
+    16,22 type: 1 value: xtutaj
+    */
+    BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getStart(), 10);
+    BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getEnd(), 15);
+    BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getType(), 1);
+    BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getValue(), "xjest");

    /*The test index contains 4 sentences:    
    312: "xto xjest okno"
--- a/concordia/t/test_concordia_searcher.cpp
+++ b/concordia/t/test_concordia_searcher.cpp
@ -354,7 +354,6 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )
    Test suffix array:
        n: 0  1  2  3  4  5  6  7  8  9 10 11
    SA[n]: 0  4  1  9  5  2 10  6  8 11  3  7 
-    
    */
        
    ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX),
@ -363,7 +362,6 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )
                                new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
    boost::shared_ptr<HashGenerator> hashGenerator(new HashGenerator(config));

-    
    boost::shared_ptr<std::vector<sauchar_t> > T(new std::vector<sauchar_t>());
    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers(new std::vector<SUFFIX_MARKER_TYPE>());

@ -373,12 +371,12 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )

    boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(T);
        
-    
    // searching for pattern "Ola posiada rysia Marysia" (5 1 3 4)
    
-    std::vector<INDEX_CHARACTER_TYPE> pattern = hashGenerator->generateHash("Ola posiada rysia Marysia");
+    std::vector<INDEX_CHARACTER_TYPE> pattern = hashGenerator->generateHash("Ola posiada rysia Marysia")->getCodes();
    
    boost::shared_ptr<TmMatchesMap> tmMatchesMap = searcher.getTmMatches(T, markers, SA, pattern);
+
    BOOST_CHECK_EQUAL(tmMatchesMap->size(), 3);

    TmMatches * tmMatches14 = tmMatchesMap->find(14)->second;
@ -436,5 +434,4 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )

 }

-
 BOOST_AUTO_TEST_SUITE_END()
--- a/concordia/t/test_hash_generator.cpp
+++ b/concordia/t/test_hash_generator.cpp
@ -4,8 +4,11 @@
 #include <sstream>

 #include <boost/shared_ptr.hpp>
+#include <boost/foreach.hpp>
+
 #include "concordia/common/config.hpp"
 #include "concordia/hash_generator.hpp"
+#include "concordia/tokenized_sentence.hpp"
 #include "tests/common/test_resources_manager.hpp"

 BOOST_AUTO_TEST_SUITE(hash_generator)
@ -20,7 +23,7 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
    
    HashGenerator hashGenerator = HashGenerator(config);

-    std::vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota");
+    std::vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota")->getCodes();
    std::vector<INDEX_CHARACTER_TYPE> expected;
    expected.push_back(0);
    expected.push_back(1);
@ -73,7 +76,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
    
    HashGenerator hashGenerator1 = HashGenerator(config);

-    std::vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota");
+    std::vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota")->getCodes();
    std::vector<INDEX_CHARACTER_TYPE> expected1;
    expected1.push_back(0);
    expected1.push_back(1);
@ -83,7 +86,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
    hashGenerator1.serializeWordMap();
   
    HashGenerator hashGenerator2 = HashGenerator(config);
-    std::vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa");
+    std::vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa")->getCodes();
    std::vector<INDEX_CHARACTER_TYPE> expected2;
    expected2.push_back(0);
    expected2.push_back(1);
@ -103,27 +106,48 @@ BOOST_AUTO_TEST_CASE( TokenVectorTest )
    
    HashGenerator hashGenerator = HashGenerator(config);

-    std::vector<std::string> tokenVector = hashGenerator.generateTokenVector("12.02.2014   o  godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że kierowca zaparkował  samochód.");
-    std::vector<std::string> expected;
-    expected.push_back("ne_date");
-    expected.push_back("o");
-    expected.push_back("godzinie");
-    expected.push_back("ne_number");
-    expected.push_back("ne_number");
-    expected.push_back("doszło");
-    expected.push_back("do");
-    expected.push_back("kolizji");
-    expected.push_back("na");
-    expected.push_back("ulicy");
-    expected.push_back("grobla");
-    expected.push_back("policjanci");
-    expected.push_back("ustalili");
-    expected.push_back("że");
-    expected.push_back("kierowca");
-    expected.push_back("zaparkował");
-    expected.push_back("samochód");
+    boost::shared_ptr<TokenizedSentence> tokenizedSentence = hashGenerator.generateHash("12.02.2014   o  godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że <b>kierowca</b> zaparkował  samochód.");

-    BOOST_CHECK_EQUAL_COLLECTIONS(tokenVector.begin(), tokenVector.end(), expected.begin(), expected.end());
+    std::vector<TokenAnnotation> tokens = tokenizedSentence->getTokens();
+
+    /*
+    BOOST_FOREACH(TokenAnnotation annotation, tokens) {
+        std::cout << annotation.getStart() << ","
+                  << annotation.getEnd() << " type: "
+                  << annotation.getType() << " value: "
+                  << annotation.getValue() << std::endl;
+    }
+    
+    0,10 type: 0 value: ne_date
+    13,14 type: 1 value: o
+    16,24 type: 1 value: godzinie
+    25,27 type: 0 value: ne_number
+    28,30 type: 0 value: ne_number
+    31,37 type: 1 value: doszło
+    38,40 type: 1 value: do
+    41,48 type: 1 value: kolizji
+    49,51 type: 1 value: na
+    52,57 type: 1 value: ulicy
+    58,64 type: 1 value: grobla
+    66,76 type: 1 value: policjanci
+    77,85 type: 1 value: ustalili
+    87,89 type: 1 value: że
+    93,101 type: 1 value: kierowca
+    106,116 type: 1 value: zaparkował
+    118,126 type: 1 value: samochód
+    */
+    
+    BOOST_CHECK_EQUAL(17,tokens.size());
+    
+    BOOST_CHECK_EQUAL(tokens.at(0).getStart(),0);
+    BOOST_CHECK_EQUAL(tokens.at(0).getEnd(),10);
+    BOOST_CHECK_EQUAL(tokens.at(0).getType(),TokenAnnotation::NE);
+    BOOST_CHECK_EQUAL(tokens.at(0).getValue(),"ne_date");
+        
+    BOOST_CHECK_EQUAL(tokens.at(15).getStart(),106);
+    BOOST_CHECK_EQUAL(tokens.at(15).getEnd(),116);
+    BOOST_CHECK_EQUAL(tokens.at(15).getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(tokens.at(15).getValue(),"zaparkował");
 }

 BOOST_AUTO_TEST_SUITE_END()
--- a/concordia/t/test_sentence_tokenizer.cpp
+++ b/concordia/t/test_sentence_tokenizer.cpp
@ -218,15 +218,6 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();
    
-    /*    
-    BOOST_FOREACH(TokenAnnotation annotation, annotations) {
-        std::cout << annotation.getStart() << ","
-                  << annotation.getEnd() << " type: "
-                  << annotation.getType() << " value: "
-                  << annotation.getValue() << std::endl;
-    }
-    */
-    
    /*
    0,4 type: 1 value: this
    5,7 type: 1 value: is
--- a/scripts/concordia-anubissearch-jrc.sh
+++ b/scripts/concordia-anubissearch-jrc.sh
--- a/scripts/concordia-concordiasearch-jrc.sh
+++ b/scripts/concordia-concordiasearch-jrc.sh
--- a/scripts/concordia-search-jrc.sh
+++ b/scripts/concordia-search-jrc.sh