finished original word positions

rjawor 2015-06-27 12:40:24 +02:00
parent a8c5fa0c75
commit 5a57406875
22 changed files with 238 additions and 150 deletions

View File

@ -28,7 +28,8 @@ void checkConcordiaResults(
long baseLineCount) {
long lineIndex = 1;
BOOST_FOREACH(ConcordiaSearchResult result, results) {
SUFFIX_MARKER_TYPE patternSize = result.getTokenizedPattern()->getTokens().size();
SUFFIX_MARKER_TYPE patternSize =
result.getTokenizedPattern()->getTokens().size();
if (patternSize > 0) {
if (result.getBestOverlay().size() != 1) {
reportError(baseLineCount + lineIndex,
@ -201,7 +202,8 @@ int main(int argc, char** argv) {
msdiff = time_end - time_start;
std::cout << "\tPattern used: " << std::endl << "\t\t";
BOOST_FOREACH(TokenAnnotation annotation, result->getTokenizedPattern()->getTokens()) {
BOOST_FOREACH(TokenAnnotation annotation,
result->getTokenizedPattern()->getTokens()) {
std::cout << annotation.getValue() << " ";
}
std::cout << std::endl;

View File

@ -9,6 +9,7 @@ Before you compile, make sure you have these installed:
- cmake
- Boost library
- Log4cpp
- ICU
- (optional) Doxygen
- (optional) TeX

View File

@ -44,14 +44,16 @@ std::string _createLibraryVersion() {
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
boost::shared_ptr<TokenizedSentence> Concordia::addExample(const Example & example)
boost::shared_ptr<TokenizedSentence> Concordia::addExample(
const Example & example)
throw(ConcordiaException) {
return _index->addExample(_hashGenerator, _T, _markers, example);
}
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
std::vector<TokenizedSentence> Concordia::addAllExamples(const std::vector<Example> & examples)
std::vector<TokenizedSentence> Concordia::addAllExamples(
const std::vector<Example> & examples)
throw(ConcordiaException) {
return _index->addAllExamples(_hashGenerator, _T, _markers, examples);
}
@ -165,7 +167,8 @@ boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
} else {
std::string empty;
return boost::shared_ptr<ConcordiaSearchResult>(
new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>(new TokenizedSentence(empty))));
new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>(
new TokenizedSentence(empty))));
}
}
@ -182,4 +185,3 @@ void Concordia::clearIndex() throw(ConcordiaException) {
boost::filesystem::remove(_config->getMarkersFilePath());
}

View File

@ -54,15 +54,21 @@ public:
/*! Adds an Example to the index.
\param example example to be added
\returns tokenized sentence object,
containing information about original word positions
\throws ConcordiaException
*/
boost::shared_ptr<TokenizedSentence> addExample(const Example & example) throw(ConcordiaException);
boost::shared_ptr<TokenizedSentence> addExample(const Example & example)
throw(ConcordiaException);
/*! Adds multiple examples to the index.
\param examples vector of examples to be added
\returns vector of tokenized sentence objects,
containing information about original word positions
\throws ConcordiaException
*/
std::vector<TokenizedSentence> addAllExamples(const std::vector<Example> & examples)
std::vector<TokenizedSentence> addAllExamples(
const std::vector<Example> & examples)
throw(ConcordiaException);
/*! Performs a simple substring lookup on the index.

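For context, a minimal usage sketch of the two methods documented above. It is only a sketch: the configuration path and sentence ids are illustrative, and it relies solely on calls that appear elsewhere in this commit (addAllExamples, getTokens, getValue, getStart, getEnd).

#include <concordia/concordia.hpp>
#include <concordia/example.hpp>
#include <concordia/tokenized_sentence.hpp>
#include <boost/foreach.hpp>
#include <iostream>
#include <vector>

int main() {
    Concordia concordia("concordia.cfg");  // illustrative config path
    std::vector<Example> examples;
    examples.push_back(Example("Alice has a cat", 56));
    examples.push_back(Example("Alice has a dog", 23));
    // each returned TokenizedSentence carries the original word positions
    std::vector<TokenizedSentence> sentences =
        concordia.addAllExamples(examples);
    BOOST_FOREACH(TokenizedSentence ts, sentences) {
        BOOST_FOREACH(TokenAnnotation token, ts.getTokens()) {
            std::cout << token.getValue() << " ["
                      << token.getStart() << ","
                      << token.getEnd() << ")" << std::endl;
        }
    }
    return 0;
}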
View File

@ -48,7 +48,8 @@ std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
std::vector<TokenizedSentence> hashedPatterns;
BOOST_FOREACH(Example example, examples) {
boost::shared_ptr<TokenizedSentence> hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
boost::shared_ptr<TokenizedSentence> hashedPattern =
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example);
hashedPatterns.push_back(*hashedPattern);
}
@ -71,7 +72,8 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
std::ofstream markersFile;
markersFile.open(_markersFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
boost::shared_ptr<TokenizedSentence> hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
boost::shared_ptr<TokenizedSentence> hashedPattern =
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example);
hashedIndexFile.close();
markersFile.close();
@ -87,7 +89,8 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example) {
boost::shared_ptr<TokenizedSentence> hashedPattern = hashGenerator->generateHash(example.getSentence());
boost::shared_ptr<TokenizedSentence> hashedPattern =
hashGenerator->generateHash(example.getSentence());
std::vector<INDEX_CHARACTER_TYPE> hash = hashedPattern->getCodes();
int offset = 0;

View File

@ -44,11 +44,13 @@ public:
and markers array (also passed to this method) are appended
with the hashed example. At the same time, HDD versions of these
two data structures are also appended with the same example.
The method returns a tokenized version of the example.
\param hashGenerator hash generator to be used to prepare the hash
of the example
\param T RAM-based hash index to be appended to
\param markers RAM-based markers array to be appended to
\param example example to be added to index
\returns tokenized example
\throws ConcordiaException
*/
boost::shared_ptr<TokenizedSentence> addExample(
@ -62,11 +64,13 @@ public:
and markers array (also passed to this method) are appended
with the hashed examples. At the same time, HDD versions of these
two data structures are also appended with the same examples.
The method returns a vector of tokenized examples.
\param hashGenerator hash generator to be used to prepare the hash
of the example
\param T RAM-based hash index to be appended to
\param markers RAM-based markers array to be appended to
\param examples vector of examples to be added to index
\returns vector of tokenized examples
\throws ConcordiaException
*/
std::vector<TokenizedSentence> addAllExamples(
@ -83,7 +87,8 @@ public:
boost::shared_ptr<std::vector<sauchar_t> > T);
private:
boost::shared_ptr<TokenizedSentence> _addSingleExample(std::ofstream & hashedIndexFile,
boost::shared_ptr<TokenizedSentence> _addSingleExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,

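A hedged sketch of how the index-level call above is wired together. The ConcordiaIndex and HashGenerator constructors are not part of this diff, so both objects are taken as ready-made parameters; the include paths are assumed to follow the concordia/ prefix used in the tutorial.

#include <concordia/concordia_index.hpp>
#include <concordia/hash_generator.hpp>
#include <concordia/example.hpp>
#include <concordia/tokenized_sentence.hpp>
#include <boost/shared_ptr.hpp>
#include <vector>

// Appends one example to the RAM-based structures T and markers;
// their HDD counterparts are updated by addExample itself.
boost::shared_ptr<TokenizedSentence> indexOneExample(
    boost::shared_ptr<ConcordiaIndex> index,
    boost::shared_ptr<HashGenerator> hashGenerator,
    boost::shared_ptr<std::vector<sauchar_t> > T,
    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers) {
    return index->addExample(hashGenerator, T, markers,
                             Example("Alice has a cat", 1));
}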
View File

@ -29,7 +29,8 @@ HashGenerator::~HashGenerator() {
boost::shared_ptr<TokenizedSentence> HashGenerator::generateHash(
const std::string & sentence) throw(ConcordiaException) {
boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
boost::shared_ptr<TokenizedSentence> ts =
_sentenceTokenizer->tokenize(sentence);
ts->generateHash(_wordMap);
if (ts->getTokens().size() > Utils::maxSentenceSize) {

View File

@ -15,14 +15,14 @@
/*!
Class for generating a sentence hash. The hash is generated from a sentence
given in raw string. String is first anonymized and tokenized. After these
operations, each token is coded as an integer, according to WordMap.
Resulting hash is a vector of integers.
given in raw string. String is first tokenized by SentenceTokenizer and
then each token is coded as an integer, according to WordMap.
Resulting hash is an instance of TokenizedSentence.
Sentence hashed is used when adding a sentence to index and during searching.
Hashed sentence is used when adding a sentence to index and during searching.
HashGenerator holds an instance of WordMap, used to code tokens as integers
and SentenceAnonymizer, used to preprocess the sentence string.
and SentenceTokenizer, used to tokenize the sentence string.
*/
@ -42,9 +42,10 @@ public:
/*!
Generates hash of a sentence.
\param sentence sentence to generate hash from
\returns vector of integers
\returns tokenized sentence, containing the hash
*/
boost::shared_ptr<TokenizedSentence> generateHash(const std::string & sentence)
boost::shared_ptr<TokenizedSentence> generateHash(
const std::string & sentence)
throw(ConcordiaException);
/*!

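A small sketch of the reworded contract above. The HashGenerator constructor is not shown in this commit, so the instance is passed in; the header path is an assumption.

#include <concordia/hash_generator.hpp>
#include <concordia/tokenized_sentence.hpp>
#include <boost/shared_ptr.hpp>
#include <iostream>
#include <string>
#include <vector>

// Prints every hashed token next to its integer code.
void printHash(boost::shared_ptr<HashGenerator> hashGenerator,
               const std::string & sentence) {
    boost::shared_ptr<TokenizedSentence> ts =
        hashGenerator->generateHash(sentence);
    std::vector<TokenAnnotation> tokens = ts->getTokens();
    std::vector<INDEX_CHARACTER_TYPE> codes = ts->getCodes();
    for (size_t i = 0; i < tokens.size() && i < codes.size(); ++i) {
        std::cout << tokens.at(i).getValue()
                  << " -> " << codes.at(i) << std::endl;
    }
}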
View File

@ -73,6 +73,7 @@ boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
boost::shared_ptr<ConcordiaSearchResult>(
new ConcordiaSearchResult(hashedPattern));
_concordiaSearcher->concordiaSearch(result, T, markers, SA, hashedPattern->getCodes());
_concordiaSearcher->concordiaSearch(result, T, markers,
SA, hashedPattern->getCodes());
return result;
}

View File

@ -51,9 +51,12 @@ public:
return _end;
}
friend std::ostream & operator << (std::ostream & o, const Interval & interval) {
return o << "[" << interval.getStart() << "," << interval.getEnd() << ")";
friend std::ostream & operator << (std::ostream & o,
const Interval & interval) {
return o << "[" << interval.getStart()
<< "," << interval.getEnd() << ")";
}
protected:
SUFFIX_MARKER_TYPE _start;

View File

@ -6,7 +6,7 @@
/*!
Class representing matched pattern fragment in concordia search.
This fragment can be seen as an interval of the pattern.
This fragment can be seen as a word interval of the pattern.
This class holds information about:
- where the pattern fragment was matched (example id and example offset)

View File

@ -13,9 +13,11 @@ RegexRule::RegexRule(std::string patternString,
_value(value) {
try {
if (caseSensitive) {
_pattern = boost::make_u32regex(UnicodeString(patternString.c_str()));
_pattern = boost::make_u32regex(
UnicodeString(patternString.c_str()));
} else {
_pattern = boost::make_u32regex(UnicodeString(patternString.c_str()), boost::regex::icase);
_pattern = boost::make_u32regex(
UnicodeString(patternString.c_str()), boost::regex::icase);
}
} catch(const std::exception & e) {
std::stringstream ss;
@ -37,7 +39,8 @@ RegexRule::~RegexRule() {
void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
try {
UnicodeString s(sentence->getSentence().c_str());
boost::u32regex_iterator<const UChar*> begin(boost::make_u32regex_iterator(s, _pattern));
boost::u32regex_iterator<const UChar*> begin(
boost::make_u32regex_iterator(s, _pattern));
boost::u32regex_iterator<const UChar*> end;
std::vector<TokenAnnotation> annotations;
for (; begin != end; ++begin) {
@ -51,14 +54,16 @@ void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
} else {
value = _value;
}
TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, value);
TokenAnnotation annotation(matchBegin, matchEnd,
_annotationType, value);
annotations.push_back(annotation);
}
sentence->addAnnotations(annotations);
} catch(const std::exception & e) {
std::stringstream ss;
ss << "Exception while applying regex rule: "
<< _annotationType << " to text: " << sentence->getSentence();
<< _annotationType << " to text: "
<< sentence->getSentence();
ss << ", message: " << e.what();
throw ConcordiaException(ss.str());
}

View File

@ -15,8 +15,9 @@ typedef boost::error_info<struct my_tag, std::string> my_tag_error_info;
/*!
Class for representing a regular expression annotation rule.
Holds regex pattern string for matching and replacement string for
annotating found matches.
Holds regex pattern string for matching and default value to assign
to the annotations. The rule also has a type, given to all annotations
produced by it.
*/
class RegexRule {
@ -25,6 +26,7 @@ public:
Constructor.
\param patternString regex pattern to match
\param annotationType type of annotation
\param value value to be assigned to the annotation
\param caseSensitive case sensitivity of the pattern
*/
RegexRule(std::string patternString,
@ -37,7 +39,7 @@ public:
*/
virtual ~RegexRule();
/*! Applies the operation on anonymized sentence.
/*! Applies regex annotation on tokenized sentence.
\param sentence the input sentence
*/
void apply(boost::shared_ptr<TokenizedSentence> sentence);

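To illustrate the updated description, a hedged sketch of constructing and applying a rule. The pattern and value below are made up; the constructor arguments and the apply() call mirror sentence_tokenizer.cpp in this commit, and the include paths are assumed.

#include <concordia/regex_rule.hpp>
#include <concordia/token_annotation.hpp>
#include <concordia/tokenized_sentence.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#include <iostream>

int main() {
    boost::shared_ptr<TokenizedSentence> sentence(
        new TokenizedSentence("Call me on 12 June"));
    // every match gets the NE type and the value "ne_number"
    RegexRule numberRule("[0-9]+", TokenAnnotation::NE, "ne_number");
    numberRule.apply(sentence);
    BOOST_FOREACH(TokenAnnotation annotation, sentence->getAnnotations()) {
        std::cout << annotation.getValue() << " ["
                  << annotation.getStart() << ","
                  << annotation.getEnd() << ")" << std::endl;
    }
    return 0;
}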
View File

@ -42,7 +42,8 @@ boost::shared_ptr<TokenizedSentence>
}
boost::shared_ptr<RegexRule> wordsRule(
new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}", TokenAnnotation::WORD, ""));
new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}",
TokenAnnotation::WORD, ""));
wordsRule->apply(result);
boost::shared_ptr<RegexRule> singleLetterWordsRule(
new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
@ -103,7 +104,8 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
tagsExpression += "br).*?>";
_htmlTags = boost::shared_ptr<RegexRule>(
new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG, "", false));
new RegexRule(tagsExpression,
TokenAnnotation::HTML_TAG, "", false));
}
boost::shared_ptr<RegexRule>

View File

@ -14,10 +14,9 @@
/*!
Class for tokenizing sentence before generating hash.
This operation is is used to
remove unnecessary symbols and possibly words from sentences added to index
and search patterns. Tokenizer annotates html tags, removes stop words (if the option is enabled),
as well as annotates named entities and special symbols. All these have to be listed in files
Tokenizer ignores unnecessary symbols, html tags and possibly stop words
(if the option is enabled) in sentences added to the index,
and annotates named entities. All these have to be listed in files
(see \ref tutorial3).
*/
@ -35,7 +34,7 @@ public:
/*! Tokenizes the sentence.
\param sentence input sentence
\returns altered version of the input sentence
\returns tokenized sentence object built on the input sentence
*/
boost::shared_ptr<TokenizedSentence>
tokenize(const std::string & sentence);
@ -58,7 +57,6 @@ private:
bool _stopWordsEnabled;
boost::shared_ptr<RegexRule> _stopWords;
};
#endif

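A sketch of the tokenize() contract described above. The SentenceTokenizer constructor (which depends on the configuration) is not shown in this diff, so the tokenizer is passed in; include paths are assumed.

#include <concordia/sentence_tokenizer.hpp>
#include <concordia/tokenized_sentence.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#include <iostream>
#include <string>

// Lists every annotation found in a sentence, including html tags
// and stop words that will not enter the hash.
void listAnnotations(boost::shared_ptr<SentenceTokenizer> tokenizer,
                     const std::string & sentence) {
    boost::shared_ptr<TokenizedSentence> ts = tokenizer->tokenize(sentence);
    BOOST_FOREACH(TokenAnnotation annotation, ts->getAnnotations()) {
        std::cout << annotation.getType() << ": "
                  << annotation.getValue() << " ["
                  << annotation.getStart() << ","
                  << annotation.getEnd() << ")" << std::endl;
    }
}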
View File

@ -21,19 +21,19 @@ BOOST_AUTO_TEST_CASE( SimpleAnnotation )
BOOST_CHECK_EQUAL(iter->getStart(),7);
BOOST_CHECK_EQUAL(iter->getEnd(),8);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),11);
BOOST_CHECK_EQUAL(iter->getEnd(),12);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),14);
BOOST_CHECK_EQUAL(iter->getEnd(),15);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),16);
BOOST_CHECK_EQUAL(iter->getEnd(),17);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),18);
BOOST_CHECK_EQUAL(iter->getEnd(),19);
@ -64,19 +64,19 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation )
BOOST_CHECK_EQUAL(iter->getStart(),3);
BOOST_CHECK_EQUAL(iter->getEnd(),4);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),19);
BOOST_CHECK_EQUAL(iter->getEnd(),20);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),21);
BOOST_CHECK_EQUAL(iter->getEnd(),22);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),44);
BOOST_CHECK_EQUAL(iter->getEnd(),45);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),45);
BOOST_CHECK_EQUAL(iter->getEnd(),46);
@ -94,15 +94,15 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation )
BOOST_CHECK_EQUAL(iter->getStart(),8);
BOOST_CHECK_EQUAL(iter->getEnd(),11);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),16);
BOOST_CHECK_EQUAL(iter->getEnd(),19);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),24);
BOOST_CHECK_EQUAL(iter->getEnd(),27);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),32);
BOOST_CHECK_EQUAL(iter->getEnd(),35);
@ -132,7 +132,7 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation )
BOOST_CHECK_EQUAL(iter->getStart(),11);
BOOST_CHECK_EQUAL(iter->getEnd(),12);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),29);
BOOST_CHECK_EQUAL(iter->getEnd(),30);
@ -149,71 +149,71 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
BOOST_CHECK_EQUAL(iter->getStart(),2);
BOOST_CHECK_EQUAL(iter->getEnd(),3);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),3);
BOOST_CHECK_EQUAL(iter->getEnd(),4);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),4);
BOOST_CHECK_EQUAL(iter->getEnd(),5);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),5);
BOOST_CHECK_EQUAL(iter->getEnd(),6);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),8);
BOOST_CHECK_EQUAL(iter->getEnd(),9);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),9);
BOOST_CHECK_EQUAL(iter->getEnd(),10);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),11);
BOOST_CHECK_EQUAL(iter->getEnd(),12);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),15);
BOOST_CHECK_EQUAL(iter->getEnd(),16);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),16);
BOOST_CHECK_EQUAL(iter->getEnd(),17);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),20);
BOOST_CHECK_EQUAL(iter->getEnd(),21);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),21);
BOOST_CHECK_EQUAL(iter->getEnd(),22);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),22);
BOOST_CHECK_EQUAL(iter->getEnd(),23);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),23);
BOOST_CHECK_EQUAL(iter->getEnd(),24);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),26);
BOOST_CHECK_EQUAL(iter->getEnd(),27);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),27);
BOOST_CHECK_EQUAL(iter->getEnd(),28);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),29);
BOOST_CHECK_EQUAL(iter->getEnd(),30);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),33);
BOOST_CHECK_EQUAL(iter->getEnd(),34);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),34);
BOOST_CHECK_EQUAL(iter->getEnd(),35);

View File

@ -47,79 +47,79 @@ BOOST_AUTO_TEST_CASE( NETest )
BOOST_CHECK_EQUAL(iter->getEnd(),4);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "date");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),6);
BOOST_CHECK_EQUAL(iter->getEnd(),16);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_date");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),18);
BOOST_CHECK_EQUAL(iter->getEnd(),22);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "mail");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),24);
BOOST_CHECK_EQUAL(iter->getEnd(),40);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_email");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),42);
BOOST_CHECK_EQUAL(iter->getEnd(),48);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "number");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),50);
BOOST_CHECK_EQUAL(iter->getEnd(),54);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),56);
BOOST_CHECK_EQUAL(iter->getEnd(),61);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "hello");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),61);
BOOST_CHECK_EQUAL(iter->getEnd(),62);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),63);
BOOST_CHECK_EQUAL(iter->getEnd(),69);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),70);
BOOST_CHECK_EQUAL(iter->getEnd(),75);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),76);
BOOST_CHECK_EQUAL(iter->getEnd(),80);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),82);
BOOST_CHECK_EQUAL(iter->getEnd(),88);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),89);
BOOST_CHECK_EQUAL(iter->getEnd(),94);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),95);
BOOST_CHECK_EQUAL(iter->getEnd(),99);
@ -156,52 +156,52 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
BOOST_CHECK_EQUAL(iter->getStart(),0);
BOOST_CHECK_EQUAL(iter->getEnd(),23);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),23);
BOOST_CHECK_EQUAL(iter->getEnd(),27);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"link");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),27);
BOOST_CHECK_EQUAL(iter->getEnd(),31);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),32);
BOOST_CHECK_EQUAL(iter->getEnd(),35);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"and");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),36);
BOOST_CHECK_EQUAL(iter->getEnd(),39);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),39);
BOOST_CHECK_EQUAL(iter->getEnd(),43);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"bold");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),43);
BOOST_CHECK_EQUAL(iter->getEnd(),47);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),48);
BOOST_CHECK_EQUAL(iter->getEnd(),51);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"and");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),52);
BOOST_CHECK_EQUAL(iter->getEnd(),59);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"newline");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),60);
BOOST_CHECK_EQUAL(iter->getEnd(),65);
@ -240,79 +240,79 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
BOOST_CHECK_EQUAL(iter->getEnd(),4);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"this");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),5);
BOOST_CHECK_EQUAL(iter->getEnd(),7);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"is");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),8);
BOOST_CHECK_EQUAL(iter->getEnd(),9);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"a");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),10);
BOOST_CHECK_EQUAL(iter->getEnd(),18);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"sentence");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),20);
BOOST_CHECK_EQUAL(iter->getEnd(),25);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"don't");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),26);
BOOST_CHECK_EQUAL(iter->getEnd(),38);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"over-analyze");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),39);
BOOST_CHECK_EQUAL(iter->getEnd(),41);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"it");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),43);
BOOST_CHECK_EQUAL(iter->getEnd(),49);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"zażółć");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),51);
BOOST_CHECK_EQUAL(iter->getEnd(),57);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"gęś'lą");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),59);
BOOST_CHECK_EQUAL(iter->getEnd(),63);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"jaźń");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),64);
BOOST_CHECK_EQUAL(iter->getEnd(),71);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"zaż-ółć");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),72);
BOOST_CHECK_EQUAL(iter->getEnd(),77);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"gęślą");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),78);
BOOST_CHECK_EQUAL(iter->getEnd(),83);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"jaź'ń");
iter++;
++iter;
}

View File

@ -7,7 +7,7 @@
#include <string>
/*!
Class representing annotatio of char sequence as a token.
Class representing annotation of char sequence as a token.
It is a type of interval that is also storing information
about the annotation type and value.
@ -18,7 +18,7 @@ public:
/*! Constructor.
\param start start index of the annotation (char-level, 0-based)
\param end end index of the annotation (char-level, 0-based)
\param type annotation type
\param annotationType annotation type
\param value annotation value
*/
TokenAnnotation(const SUFFIX_MARKER_TYPE start,
@ -44,12 +44,20 @@ public:
return _value;
}
/*! Named entity annotation type
*/
static int NE;
/*! Word annotation type
*/
static int WORD;
/*! Html tag annotation type
*/
static int HTML_TAG;
/*! Stop word annotation type
*/
static int STOP_WORD;
protected:

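A small sketch of the newly documented annotation types. The interval and value mirror the ne_date case from the tokenizer tests; the header path is an assumption.

#include <concordia/token_annotation.hpp>
#include <iostream>

int main() {
    // character interval [6,16) annotated as a named entity
    TokenAnnotation annotation(6, 16, TokenAnnotation::NE, "ne_date");
    if (annotation.getType() == TokenAnnotation::NE) {
        std::cout << annotation.getValue() << " ["
                  << annotation.getStart() << ","
                  << annotation.getEnd() << ")" << std::endl;
    }
    return 0;
}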
View File

@ -11,37 +11,43 @@ TokenizedSentence::TokenizedSentence(std::string sentence):
TokenizedSentence::~TokenizedSentence() {
}
void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
std::vector<TokenAnnotation>::iterator newAnnotation = annotations.begin();
std::list<TokenAnnotation>::iterator existingAnnotation = _tokenAnnotations.begin();
void TokenizedSentence::addAnnotations(
std::vector<TokenAnnotation> annotations) {
std::vector<TokenAnnotation>::iterator newAnnotation =
annotations.begin();
std::list<TokenAnnotation>::iterator existingAnnotation =
_tokenAnnotations.begin();
while(newAnnotation != annotations.end()) {
while (newAnnotation != annotations.end()) {
if (existingAnnotation != _tokenAnnotations.end()) {
// there are still some existing annotations, so perform checks
if (newAnnotation->intersects(*existingAnnotation)) {
// The new annotation intersects with the existing.
// We cannot add it, so let us just move on to the
// next new annotation.
newAnnotation++;
++newAnnotation;
} else {
// it is now important whether the new interval is before
// or after existing
if (newAnnotation->getStart() < existingAnnotation->getStart()) {
// New interval does not intersect and is before existing. We add it.
_tokenAnnotations.insert(existingAnnotation, *newAnnotation);
newAnnotation++;
if (newAnnotation->getStart() <
existingAnnotation->getStart()) {
// New interval does not intersect and is
// before existing. We add it.
_tokenAnnotations.insert(existingAnnotation,
*newAnnotation);
++newAnnotation;
} else {
// If the new interval is after existing we move to the next existing annoation.
existingAnnotation++;
// If the new interval is after existing
// we move to the next existing annotation.
++existingAnnotation;
}
}
} else {
// no more existing annotations, so just add the new annotation
_tokenAnnotations.push_back(*newAnnotation);
newAnnotation++;
++newAnnotation;
}
}
}
void TokenizedSentence::toLowerCase() {
@ -56,6 +62,5 @@ void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
_tokens.push_back(annotation);
}
}
}

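The loop above keeps the annotation list sorted and silently drops newcomers that intersect an existing annotation. A simplified standalone illustration of the same policy, using plain structs instead of Concordia types:

#include <list>
#include <vector>

struct Span { int start; int end; };   // half-open interval [start,end)

bool intersects(const Span & a, const Span & b) {
    return a.start < b.end && b.start < a.end;
}

// Inserts each new span into the sorted list unless it overlaps an
// existing one (the same policy as TokenizedSentence::addAnnotations).
void merge(std::list<Span> & existing, const std::vector<Span> & added) {
    std::vector<Span>::const_iterator newSpan = added.begin();
    std::list<Span>::iterator current = existing.begin();
    while (newSpan != added.end()) {
        if (current != existing.end()) {
            if (intersects(*newSpan, *current)) {
                ++newSpan;                          // skip overlapping newcomer
            } else if (newSpan->start < current->start) {
                existing.insert(current, *newSpan); // keep the list sorted
                ++newSpan;
            } else {
                ++current;                          // advance in existing list
            }
        } else {
            existing.push_back(*newSpan);           // nothing left to compare
            ++newSpan;
        }
    }
}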
View File

@ -11,9 +11,12 @@
#include <list>
/*!
A sentence after anonymization operations. The class
A sentence after tokenizing operations. The class
holds the current string representation of the sentence
along with the annotations list.
along with the annotations list. The class also allows
for generating hash. After that operation the class
also holds the list of hashed codes and corresponding
tokens.
*/
class TokenizedSentence {
@ -22,7 +25,7 @@ public:
Constructor.
*/
TokenizedSentence(std::string sentence);
explicit TokenizedSentence(std::string sentence);
/*! Destructor.
*/
@ -35,21 +38,40 @@ public:
return _sentence;
}
/*! Getter for annotations list
/*! Getter for the full annotations list. This method returns
all annotations, including those which are not considered
in the hash, i.e. stop words and html tags.
\returns annotations list
*/
std::list<TokenAnnotation> getAnnotations() const {
return _tokenAnnotations;
}
/*! Getter for codes list. This data is available after calling
the generateHash method.
\returns codes list
*/
std::vector<INDEX_CHARACTER_TYPE> getCodes() const {
return _codes;
}
/*! Getter for tokens list. This method returns
only those annotations considered
in the hash, i.e. words and named entities.
\returns tokens list
*/
std::vector<TokenAnnotation> getTokens() const {
return _tokens;
}
/*! Method for generating hash based on annotations.
This method takes into account annotations of type
word and named entity. These are encoded and added
to the codes list. Annotations corresponding to these
tokens are added to the tokens list.
\param wordMap word map to use when encoding tokens
\returns tokens list
*/
void generateHash(boost::shared_ptr<WordMap> wordMap);
/*!

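A sketch of the getters documented above. The WordMap instance is assumed to come from elsewhere, since its construction is not part of this commit.

#include <concordia/tokenized_sentence.hpp>
#include <boost/shared_ptr.hpp>
#include <iostream>

// Shows the difference between getAnnotations() (all annotations,
// including stop words and html tags) and getTokens() (only the
// word and named entity tokens that enter the hash).
void inspect(TokenizedSentence & ts, boost::shared_ptr<WordMap> wordMap) {
    ts.generateHash(wordMap);
    std::cout << ts.getAnnotations().size() << " annotations, "
              << ts.getTokens().size() << " hashed tokens, "
              << ts.getCodes().size() << " codes" << std::endl;
}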
View File

@ -98,12 +98,15 @@ Concordia is equipped with a unique functionality of so called Concordia search,
Additionally, the score for this best overlay is computed. The score is a real number between 0 and 1, where 0 indicates that the pattern is not covered at all (i.e. not a single word from this pattern is found in the index). The score 1 represents a perfect match: the pattern is covered completely by just one fragment, which means that the pattern is found in the index as one of the examples.
Moreover, the example below presents the feature of retrieving a tokenized version of an added example.
File concordia_searching.cpp:
\verbatim
#include <concordia/concordia.hpp>
#include <concordia/concordia_search_result.hpp>
#include <concordia/matched_pattern_fragment.hpp>
#include <concordia/example.hpp>
#include <concordia/tokenized_sentence.hpp>
#include "config.hpp"
@ -115,7 +118,13 @@ using namespace std;
int main() {
Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg");
concordia.addExample(Example("Alice has a cat", 56));
boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Alice has a cat", 56));
cout << "Added the following tokens: " << endl;
BOOST_FOREACH(TokenAnnotation token, ts->getTokens()) {
cout << "\"" << token.getValue() << "\"" << " at positions: [" << token.getStart() << ","
<< token.getEnd() << ")" << endl;
}
concordia.addExample(Example("Alice has a dog", 23));
concordia.addExample(Example("New test product has a mistake", 321));
concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));
@ -153,6 +162,11 @@ int main() {
This program should print:
\verbatim
Added the following tokens:
"alice" at positions: [0,5)
"has" at positions: [6,9)
"a" at positions: [10,11)
"cat" at positions: [12,15)
Searching for pattern: Our new test product has nothing to do with computers
Printing all matched fragments:
Matched pattern fragment found. Pattern fragment: [4,9] in sentence 14, at offset: 6

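As a follow-up to the tutorial above, a hedged sketch of inspecting the search result. It relies only on getters that appear in this commit's test code (getTokenizedPattern, getTokens, getBestOverlay) and assumes concordiaSearch takes the pattern as a string, as the tutorial suggests.

#include <concordia/concordia.hpp>
#include <concordia/concordia_search_result.hpp>
#include <concordia/tokenized_sentence.hpp>
#include <boost/shared_ptr.hpp>
#include <iostream>
#include <string>

// Prints a short summary of a concordia search on an already populated index.
void summarizeSearch(Concordia & concordia, const std::string & pattern) {
    boost::shared_ptr<ConcordiaSearchResult> result =
        concordia.concordiaSearch(pattern);
    std::cout << "Pattern tokens: "
              << result->getTokenizedPattern()->getTokens().size() << std::endl;
    std::cout << "Best overlay fragments: "
              << result->getBestOverlay().size() << std::endl;
}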
View File

@ -2,6 +2,7 @@
#include <concordia/concordia_search_result.hpp>
#include <concordia/matched_pattern_fragment.hpp>
#include <concordia/example.hpp>
#include <concordia/tokenized_sentence.hpp>
#include "config.hpp"
@ -13,7 +14,13 @@ using namespace std;
int main() {
Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg");
concordia.addExample(Example("Alice has a cat", 56));
boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Alice has a cat", 56));
cout << "Added the following tokens: " << endl;
BOOST_FOREACH(TokenAnnotation token, ts->getTokens()) {
cout << "\"" << token.getValue() << "\"" << " at positions: [" << token.getStart() << ","
<< token.getEnd() << ")" << endl;
}
concordia.addExample(Example("Alice has a dog", 23));
concordia.addExample(Example("New test product has a mistake", 321));
concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));