From 5a57406875d80234b39b82126a9d0e1790877429 Mon Sep 17 00:00:00 2001
From: rjawor <rjawor@amu.edu.pl>
Date: Sat, 27 Jun 2015 12:40:24 +0200
Subject: [PATCH] finished original word positions

---
 concordia-console/concordia-console.cpp | 10 ++--
 concordia/compilation.dox               |  1 +
 concordia/concordia.cpp                 | 14 ++---
 concordia/concordia.hpp                 | 12 +++--
 concordia/concordia_index.cpp           | 19 ++++---
 concordia/concordia_index.hpp           |  7 ++-
 concordia/hash_generator.cpp            |  5 +-
 concordia/hash_generator.hpp            | 15 +++---
 concordia/index_searcher.cpp            |  3 +-
 concordia/interval.hpp                  |  7 ++-
 concordia/matched_pattern_fragment.hpp  |  2 +-
 concordia/regex_rule.cpp                | 17 +++---
 concordia/regex_rule.hpp                | 10 ++--
 concordia/sentence_tokenizer.cpp        | 12 +++--
 concordia/sentence_tokenizer.hpp        | 10 ++--
 concordia/t/test_regex_rule.cpp         | 58 ++++++++++----------
 concordia/t/test_sentence_tokenizer.cpp | 70 ++++++++++++-------------
 concordia/token_annotation.hpp          | 14 +++--
 concordia/tokenized_sentence.cpp        | 37 +++++++------
 concordia/tokenized_sentence.hpp        | 40 ++++++++++----
 concordia/tutorial.dox                  | 16 +++++-
 examples/concordia_search.cpp           |  9 +++-
 22 files changed, 238 insertions(+), 150 deletions(-)

diff --git a/concordia-console/concordia-console.cpp b/concordia-console/concordia-console.cpp
index dbb44cd..8f69b48 100644
--- a/concordia-console/concordia-console.cpp
+++ b/concordia-console/concordia-console.cpp
@@ -28,8 +28,9 @@ void checkConcordiaResults(
         long baseLineCount) {
     long lineIndex = 1;
     BOOST_FOREACH(ConcordiaSearchResult result, results) {
-        SUFFIX_MARKER_TYPE patternSize = result.getTokenizedPattern()->getTokens().size();
-        if (patternSize > 0) {                
+        SUFFIX_MARKER_TYPE patternSize =
+                    result.getTokenizedPattern()->getTokens().size();
+        if (patternSize > 0) {
             if (result.getBestOverlay().size() != 1) {
                 reportError(baseLineCount + lineIndex,
                            "best overlay has more than one fragment.");
@@ -37,7 +38,7 @@ void checkConcordiaResults(
             if (result.getBestOverlay().at(0).getMatchedLength()
                  != patternSize) {
                 reportError(baseLineCount + lineIndex,
-                       "best overlay fragment has different size than pattern.");
+                     "best overlay fragment has different size than pattern.");
             }
             if (result.getBestOverlayScore() != 1) {
                 reportError(baseLineCount + lineIndex,
@@ -201,7 +202,8 @@ int main(int argc, char** argv) {
             msdiff = time_end - time_start;
 
             std::cout << "\tPattern used: " << std::endl << "\t\t";
-            BOOST_FOREACH(TokenAnnotation annotation, result->getTokenizedPattern()->getTokens()) {
+            BOOST_FOREACH(TokenAnnotation annotation,
+                                  result->getTokenizedPattern()->getTokens()) {
                 std::cout << annotation.getValue() << " ";
             }
             std::cout << std::endl;
diff --git a/concordia/compilation.dox b/concordia/compilation.dox
index 27c834b..68c5b33 100644
--- a/concordia/compilation.dox
+++ b/concordia/compilation.dox
@@ -9,6 +9,7 @@ Before you compile, make sure you have these installed:
 - cmake
 - Boost library
 - Log4cpp
+- ICU
 - (optional) Doxygen
 - (optional) TeX
 
diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp
index be762a4..3c6d4b6 100644
--- a/concordia/concordia.cpp
+++ b/concordia/concordia.cpp
@@ -44,15 +44,17 @@ std::string _createLibraryVersion() {
 
 // Sentences are written to disk and added to T.
 // SA is generated on command by other methods.
-boost::shared_ptr<TokenizedSentence> Concordia::addExample(const Example & example)
-                                 throw(ConcordiaException) {
+boost::shared_ptr<TokenizedSentence> Concordia::addExample(
+                                      const Example & example)
+                                      throw(ConcordiaException) {
     return _index->addExample(_hashGenerator, _T, _markers, example);
 }
 
 // Sentences are written to disk and added to T.
 // SA is generated on command by other methods.
-std::vector<TokenizedSentence> Concordia::addAllExamples(const std::vector<Example> & examples)
-                                              throw(ConcordiaException) {
+std::vector<TokenizedSentence> Concordia::addAllExamples(
+                                         const std::vector<Example> & examples)
+                                         throw(ConcordiaException) {
     return _index->addAllExamples(_hashGenerator, _T, _markers, examples);
 }
 
@@ -165,7 +167,8 @@ boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
     } else {
         std::string empty;
         return boost::shared_ptr<ConcordiaSearchResult>(
-            new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>(new TokenizedSentence(empty))));
+            new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>(
+                                            new TokenizedSentence(empty))));
     }
 }
 
@@ -182,4 +185,3 @@ void Concordia::clearIndex() throw(ConcordiaException) {
     boost::filesystem::remove(_config->getMarkersFilePath());
 }
 
-
diff --git a/concordia/concordia.hpp b/concordia/concordia.hpp
index 5051de3..9c707e9 100644
--- a/concordia/concordia.hpp
+++ b/concordia/concordia.hpp
@@ -54,16 +54,22 @@ public:
 
     /*! Adds an Example to the index.
       \param example example to be added
+      \returns tokenized sentence object,
+               containing information about original word positions
       \throws ConcordiaException
     */
-    boost::shared_ptr<TokenizedSentence> addExample(const Example & example) throw(ConcordiaException);
+    boost::shared_ptr<TokenizedSentence> addExample(const Example & example)
+                                                   throw(ConcordiaException);
 
     /*! Adds multiple examples to the index.
       \param examples vector of examples to be added
+      \returns vector of tokenized sentence objects,
+               containing information about original word positions
       \throws ConcordiaException
     */
-    std::vector<TokenizedSentence> addAllExamples(const std::vector<Example> & examples)
-                                                   throw(ConcordiaException);
+    std::vector<TokenizedSentence> addAllExamples(
+                                const std::vector<Example> & examples)
+                                throw(ConcordiaException);
 
     /*! Performs a simple substring lookup on the index.
         For more info see \ref tutorial1_2.
diff --git a/concordia/concordia_index.cpp b/concordia/concordia_index.cpp
index 3d65797..23c4ca4 100644
--- a/concordia/concordia_index.cpp
+++ b/concordia/concordia_index.cpp
@@ -25,7 +25,7 @@ boost::shared_ptr<std::vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
     }
 
     boost::shared_ptr<std::vector<saidx_t> > result =
-             boost::shared_ptr<std::vector<saidx_t> >(new std::vector<saidx_t>);
+            boost::shared_ptr<std::vector<saidx_t> >(new std::vector<saidx_t>);
     for (int i = 0; i < T->size(); i++) {
         result->push_back(SA_array[i]);
     }
@@ -48,7 +48,8 @@ std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
 
     std::vector<TokenizedSentence> hashedPatterns;
     BOOST_FOREACH(Example example, examples) {
-        boost::shared_ptr<TokenizedSentence> hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
+        boost::shared_ptr<TokenizedSentence> hashedPattern =
+             _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
                                                       T, markers, example);
         hashedPatterns.push_back(*hashedPattern);
     }
@@ -56,7 +57,7 @@ std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
     hashedIndexFile.close();
     markersFile.close();
     hashGenerator->serializeWordMap();
-    
+
     return hashedPatterns;
 }
 
@@ -71,12 +72,13 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
     std::ofstream markersFile;
     markersFile.open(_markersFilePath.c_str(), std::ios::out|
                                              std::ios::app|std::ios::binary);
-    boost::shared_ptr<TokenizedSentence> hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
+    boost::shared_ptr<TokenizedSentence> hashedPattern =
+             _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
                                                       T, markers, example);
     hashedIndexFile.close();
     markersFile.close();
     hashGenerator->serializeWordMap();
-    
+
     return hashedPattern;
 }
 
@@ -87,9 +89,10 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
                    boost::shared_ptr<std::vector<sauchar_t> > T,
                    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
                    const Example & example) {
-    boost::shared_ptr<TokenizedSentence> hashedPattern = hashGenerator->generateHash(example.getSentence());
+    boost::shared_ptr<TokenizedSentence> hashedPattern =
+                    hashGenerator->generateHash(example.getSentence());
     std::vector<INDEX_CHARACTER_TYPE> hash = hashedPattern->getCodes();
-                          
+
     int offset = 0;
     for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
                                           it != hash.end(); ++it) {
@@ -117,7 +120,7 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
     SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
     Utils::writeMarker(markersFile, sentenceBoundaryMA);
     markers->push_back(sentenceBoundaryMA);
-    
+
     return hashedPattern;
 }
 
diff --git a/concordia/concordia_index.hpp b/concordia/concordia_index.hpp
index e90f546..6d79cb9 100644
--- a/concordia/concordia_index.hpp
+++ b/concordia/concordia_index.hpp
@@ -44,11 +44,13 @@ public:
         and markers array (also passed to this method) are appended
         with the hashed example. At the same time, HDD versions of these
         two data structures are also appended with the same example.
+        The method returns a tokenized version of the example.
       \param hashGenerator hash generator to be used to prepare the hash
              of the example
       \param T RAM-based hash index to be appended to
       \param markers RAM-based markers array to be appended to
       \param example example to be added to index
+      \returns tokenized example
       \throws ConcordiaException
     */
     boost::shared_ptr<TokenizedSentence> addExample(
@@ -62,11 +64,13 @@ public:
         and markers array (also passed to this method) are appended
         with the hashed examples. At the same time, HDD versions of these
         two data structures are also appended with the same examples.
+        The method returns a vector of tokenized examples.
       \param hashGenerator hash generator to be used to prepare the hash
              of the example
       \param T RAM-based hash index to be appended to
       \param markers RAM-based markers array to be appended to
       \param examples vector of examples to be added to index
+      \returns vector of tokenized examples
       \throws ConcordiaException
     */
     std::vector<TokenizedSentence> addAllExamples(
@@ -83,7 +87,8 @@ public:
                 boost::shared_ptr<std::vector<sauchar_t> > T);
 
 private:
-    boost::shared_ptr<TokenizedSentence> _addSingleExample(std::ofstream & hashedIndexFile,
+    boost::shared_ptr<TokenizedSentence> _addSingleExample(
+                std::ofstream & hashedIndexFile,
                 std::ofstream & markersFile,
                 boost::shared_ptr<HashGenerator> hashGenerator,
                 boost::shared_ptr<std::vector<sauchar_t> > T,
diff --git a/concordia/hash_generator.cpp b/concordia/hash_generator.cpp
index eb69ce9..8b93ce4 100644
--- a/concordia/hash_generator.cpp
+++ b/concordia/hash_generator.cpp
@@ -29,9 +29,10 @@ HashGenerator::~HashGenerator() {
 
 boost::shared_ptr<TokenizedSentence> HashGenerator::generateHash(
                      const std::string & sentence) throw(ConcordiaException) {
-    boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
+    boost::shared_ptr<TokenizedSentence> ts =
+                                    _sentenceTokenizer->tokenize(sentence);
     ts->generateHash(_wordMap);
-    
+
     if (ts->getTokens().size() > Utils::maxSentenceSize) {
         throw ConcordiaException("Trying to add too long sentence.");
     }
diff --git a/concordia/hash_generator.hpp b/concordia/hash_generator.hpp
index 6c8a752..6528dcf 100644
--- a/concordia/hash_generator.hpp
+++ b/concordia/hash_generator.hpp
@@ -15,14 +15,14 @@
 
 /*!
   Class for generating a sentence hash. The hash is generated from a sentence
-  given in raw string. String is first anonymized and tokenized. After these
-  operations, each token is coded as an integer, according to WordMap.
-  Resulting hash is a vector of integers.
+  given as a raw string. The string is first tokenized by SentenceTokenizer and
+  then each token is coded as an integer, according to WordMap.
+  The resulting hash is an instance of TokenizedSentence.
   
-  Sentence hashed is used when adding a sentence to index and during searching.
+  Hashed sentence is used when adding a sentence to index and during searching.
   
   HashGenerator holds an instance of WordMap, used to code tokens as integers
-  and SentenceAnonymizer, used to preprocess the sentence string.
+  and SentenceTokenizer, used to tokenize the sentence string.
 
 */
 
@@ -42,9 +42,10 @@ public:
     /*!
       Generates hash of a sentence.
       \param sentence sentence to generate hash from
-      \returns vector of integers
+      \returns tokenized sentence, containing the hash
     */
-    boost::shared_ptr<TokenizedSentence> generateHash(const std::string & sentence)
+    boost::shared_ptr<TokenizedSentence> generateHash(
+                                const std::string & sentence)
                                 throw(ConcordiaException);
 
     /*!
diff --git a/concordia/index_searcher.cpp b/concordia/index_searcher.cpp
index 45c9559..c2c119e 100644
--- a/concordia/index_searcher.cpp
+++ b/concordia/index_searcher.cpp
@@ -73,6 +73,7 @@ boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
      boost::shared_ptr<ConcordiaSearchResult>(
        new ConcordiaSearchResult(hashedPattern));
 
-    _concordiaSearcher->concordiaSearch(result, T, markers, SA, hashedPattern->getCodes());
+    _concordiaSearcher->concordiaSearch(result, T, markers,
+                                        SA, hashedPattern->getCodes());
     return result;
 }
diff --git a/concordia/interval.hpp b/concordia/interval.hpp
index c06dfec..e77fb40 100644
--- a/concordia/interval.hpp
+++ b/concordia/interval.hpp
@@ -51,9 +51,12 @@ public:
         return _end;
     }
 
-    friend std::ostream & operator << (std::ostream & o, const Interval & interval) {
-        return o << "[" << interval.getStart() << "," << interval.getEnd() << ")";
+    friend std::ostream & operator << (std::ostream & o,
+                                       const Interval & interval) {
+        return o << "[" << interval.getStart()
+                 << "," << interval.getEnd() << ")";
     }
+
 protected:
     SUFFIX_MARKER_TYPE _start;
 
diff --git a/concordia/matched_pattern_fragment.hpp b/concordia/matched_pattern_fragment.hpp
index e45bdcb..01d3ac9 100644
--- a/concordia/matched_pattern_fragment.hpp
+++ b/concordia/matched_pattern_fragment.hpp
@@ -6,7 +6,7 @@
 
 /*!
   Class representing matched pattern fragment in concordia search.
-  This fragment can be seen as an interval of the pattern.
+  This fragment can be seen as a word interval of the pattern.
   
   This class holds information about:
   - where the pattern fragment was matched (example id and example offset)
diff --git a/concordia/regex_rule.cpp b/concordia/regex_rule.cpp
index 062c118..04bb825 100644
--- a/concordia/regex_rule.cpp
+++ b/concordia/regex_rule.cpp
@@ -13,9 +13,11 @@ RegexRule::RegexRule(std::string patternString,
                                _value(value)                  {
     try {
         if (caseSensitive) {
-            _pattern = boost::make_u32regex(UnicodeString(patternString.c_str()));
+            _pattern = boost::make_u32regex(
+                  UnicodeString(patternString.c_str()));
         } else {
-            _pattern = boost::make_u32regex(UnicodeString(patternString.c_str()), boost::regex::icase);
+            _pattern = boost::make_u32regex(
+                  UnicodeString(patternString.c_str()), boost::regex::icase);
         }
     } catch(const std::exception & e) {
         std::stringstream ss;
@@ -37,7 +39,8 @@ RegexRule::~RegexRule() {
 void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
     try {
         UnicodeString s(sentence->getSentence().c_str());
-        boost::u32regex_iterator<const UChar*> begin(boost::make_u32regex_iterator(s, _pattern));
+        boost::u32regex_iterator<const UChar*> begin(
+                             boost::make_u32regex_iterator(s, _pattern));
         boost::u32regex_iterator<const UChar*> end;
         std::vector<TokenAnnotation> annotations;
         for (; begin != end; ++begin) {
@@ -46,19 +49,21 @@ void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
             std::string value;
             if (_annotationType == TokenAnnotation::WORD) {
                 UnicodeString unicodeValue;
-                s.extract(begin->position(), begin->length(), unicodeValue); 
+                s.extract(begin->position(), begin->length(), unicodeValue);
                 unicodeValue.toUTF8String(value);
             } else {
                 value = _value;
             }
-            TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, value);
+            TokenAnnotation annotation(matchBegin, matchEnd,
+                                      _annotationType, value);
             annotations.push_back(annotation);
         }
         sentence->addAnnotations(annotations);
     } catch(const std::exception & e) {
         std::stringstream ss;
         ss << "Exception while applying regex rule: "
-                          << _annotationType << " to text: " << sentence->getSentence();
+                          << _annotationType << " to text: "
+                          << sentence->getSentence();
         ss << ", message: " << e.what();
         throw ConcordiaException(ss.str());
     }
diff --git a/concordia/regex_rule.hpp b/concordia/regex_rule.hpp
index dbc509a..ce62fd1 100644
--- a/concordia/regex_rule.hpp
+++ b/concordia/regex_rule.hpp
@@ -15,8 +15,9 @@ typedef boost::error_info<struct my_tag, std::string> my_tag_error_info;
 
 /*!
   Class for representing a regular expression annotation rule.
-  Holds regex pattern string for matching and replacement string for
-  annotating found matches.
+  Holds regex pattern string for matching and default value to assign
+  to the annotations. Rule also has a type, given to all annotations
+  produced by it.
 
 */
 class RegexRule {
@@ -25,6 +26,7 @@ public:
       Constructor.
         \param patternString regex pattern to match
         \param annoationType type of annotation
+        \param value value to be assigned to the annotation
         \param caseSensitive case sensitivity of the pattern
     */
     RegexRule(std::string patternString,
@@ -37,7 +39,7 @@ public:
     */
     virtual ~RegexRule();
 
-    /*! Applies the operation on anonymized sentence.
+    /*! Applies regex annotation on tokenized sentence.
       \param sentence the input sentence
     */
     void apply(boost::shared_ptr<TokenizedSentence> sentence);
@@ -46,7 +48,7 @@ private:
     int _annotationType;
 
     std::string _value;
-    
+
     boost::u32regex _pattern;
 };
 
diff --git a/concordia/sentence_tokenizer.cpp b/concordia/sentence_tokenizer.cpp
index 2adfbcd..9ffe173 100644
--- a/concordia/sentence_tokenizer.cpp
+++ b/concordia/sentence_tokenizer.cpp
@@ -26,7 +26,7 @@ SentenceTokenizer::~SentenceTokenizer() {
 
 boost::shared_ptr<TokenizedSentence>
               SentenceTokenizer::tokenize(const std::string & sentence) {
-    boost::shared_ptr<TokenizedSentence> 
+    boost::shared_ptr<TokenizedSentence>
                     result(new TokenizedSentence(sentence));
 
     _htmlTags->apply(result);
@@ -40,9 +40,10 @@ boost::shared_ptr<TokenizedSentence>
     if (_stopWordsEnabled) {
         _stopWords->apply(result);
     }
-    
+
     boost::shared_ptr<RegexRule> wordsRule(
-                        new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}", TokenAnnotation::WORD, ""));
+                        new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}",
+                                      TokenAnnotation::WORD, ""));
     wordsRule->apply(result);
     boost::shared_ptr<RegexRule> singleLetterWordsRule(
                         new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
@@ -103,7 +104,8 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
     tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
     tagsExpression += "br).*?>";
     _htmlTags = boost::shared_ptr<RegexRule>(
-                        new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG, "", false));
+                        new RegexRule(tagsExpression,
+                                TokenAnnotation::HTML_TAG, "", false));
 }
 
 boost::shared_ptr<RegexRule>
@@ -137,6 +139,6 @@ boost::shared_ptr<RegexRule>
     expression = expression.substr(0, expression.size()-1);
     expression += ")";
     return boost::shared_ptr<RegexRule>(
-                        new RegexRule(expression, annotationType, value, false));
+                      new RegexRule(expression, annotationType, value, false));
 }
 
diff --git a/concordia/sentence_tokenizer.hpp b/concordia/sentence_tokenizer.hpp
index be60061..7e354eb 100644
--- a/concordia/sentence_tokenizer.hpp
+++ b/concordia/sentence_tokenizer.hpp
@@ -14,10 +14,9 @@
 
 /*!
   Class for tokenizing sentence before generating hash.
-  This operation is is used to
-  remove unnecessary symbols and possibly words from sentences added to index
-  and search patterns. Tokenizer annotates html tags, removes stop words (if the option is enabled),
-  as well as annotates named entities and special symbols. All these have to be listed in files
+  The tokenizer ignores unnecessary symbols, HTML tags and possibly stop words
+  (if the option is enabled) in sentences added to the index,
+  and also annotates named entities. All these have to be listed in files
   (see \ref tutorial3).
 */
 
@@ -35,7 +34,7 @@ public:
 
     /*! Tokenizes the sentence.
       \param sentence input sentence
-      \returns altered version of the input sentence
+      \returns tokenized sentence object built on the input sentence
     */
     boost::shared_ptr<TokenizedSentence>
                                    tokenize(const std::string & sentence);
@@ -58,7 +57,6 @@ private:
     bool _stopWordsEnabled;
 
     boost::shared_ptr<RegexRule> _stopWords;
-
 };
 
 #endif
diff --git a/concordia/t/test_regex_rule.cpp b/concordia/t/test_regex_rule.cpp
index 78685cf..ada81eb 100644
--- a/concordia/t/test_regex_rule.cpp
+++ b/concordia/t/test_regex_rule.cpp
@@ -21,19 +21,19 @@ BOOST_AUTO_TEST_CASE( SimpleAnnotation )
 
     BOOST_CHECK_EQUAL(iter->getStart(),7);
     BOOST_CHECK_EQUAL(iter->getEnd(),8);
-    iter++;
+    ++iter;
     
     BOOST_CHECK_EQUAL(iter->getStart(),11);
     BOOST_CHECK_EQUAL(iter->getEnd(),12);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),14);
     BOOST_CHECK_EQUAL(iter->getEnd(),15);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),16);
     BOOST_CHECK_EQUAL(iter->getEnd(),17);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),18);
     BOOST_CHECK_EQUAL(iter->getEnd(),19);  
@@ -64,19 +64,19 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation )
 
     BOOST_CHECK_EQUAL(iter->getStart(),3);
     BOOST_CHECK_EQUAL(iter->getEnd(),4);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),19);
     BOOST_CHECK_EQUAL(iter->getEnd(),20);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),21);
     BOOST_CHECK_EQUAL(iter->getEnd(),22);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),44);
     BOOST_CHECK_EQUAL(iter->getEnd(),45);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),45);
     BOOST_CHECK_EQUAL(iter->getEnd(),46);  
@@ -94,15 +94,15 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation )
 
     BOOST_CHECK_EQUAL(iter->getStart(),8);
     BOOST_CHECK_EQUAL(iter->getEnd(),11);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),16);
     BOOST_CHECK_EQUAL(iter->getEnd(),19);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),24);
     BOOST_CHECK_EQUAL(iter->getEnd(),27);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),32);
     BOOST_CHECK_EQUAL(iter->getEnd(),35);
@@ -132,7 +132,7 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation )
 
     BOOST_CHECK_EQUAL(iter->getStart(),11);
     BOOST_CHECK_EQUAL(iter->getEnd(),12);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),29);
     BOOST_CHECK_EQUAL(iter->getEnd(),30);
@@ -149,71 +149,71 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
 
     BOOST_CHECK_EQUAL(iter->getStart(),2);
     BOOST_CHECK_EQUAL(iter->getEnd(),3);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),3);
     BOOST_CHECK_EQUAL(iter->getEnd(),4);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),4);
     BOOST_CHECK_EQUAL(iter->getEnd(),5);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),5);
     BOOST_CHECK_EQUAL(iter->getEnd(),6);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),8);
     BOOST_CHECK_EQUAL(iter->getEnd(),9);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),9);
     BOOST_CHECK_EQUAL(iter->getEnd(),10);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),11);
     BOOST_CHECK_EQUAL(iter->getEnd(),12);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),15);
     BOOST_CHECK_EQUAL(iter->getEnd(),16);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),16);
     BOOST_CHECK_EQUAL(iter->getEnd(),17);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),20);
     BOOST_CHECK_EQUAL(iter->getEnd(),21);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),21);
     BOOST_CHECK_EQUAL(iter->getEnd(),22);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),22);
     BOOST_CHECK_EQUAL(iter->getEnd(),23);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),23);
     BOOST_CHECK_EQUAL(iter->getEnd(),24);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),26);
     BOOST_CHECK_EQUAL(iter->getEnd(),27);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),27);
     BOOST_CHECK_EQUAL(iter->getEnd(),28);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),29);
     BOOST_CHECK_EQUAL(iter->getEnd(),30);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),33);
     BOOST_CHECK_EQUAL(iter->getEnd(),34);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),34);
     BOOST_CHECK_EQUAL(iter->getEnd(),35);
diff --git a/concordia/t/test_sentence_tokenizer.cpp b/concordia/t/test_sentence_tokenizer.cpp
index cc4ce11..626fdc9 100644
--- a/concordia/t/test_sentence_tokenizer.cpp
+++ b/concordia/t/test_sentence_tokenizer.cpp
@@ -47,79 +47,79 @@ BOOST_AUTO_TEST_CASE( NETest )
     BOOST_CHECK_EQUAL(iter->getEnd(),4);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(), "date");
-    iter++;
+    ++iter;
     
     BOOST_CHECK_EQUAL(iter->getStart(),6);
     BOOST_CHECK_EQUAL(iter->getEnd(),16);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
     BOOST_CHECK_EQUAL(iter->getValue(), "ne_date");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),18);
     BOOST_CHECK_EQUAL(iter->getEnd(),22);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(), "mail");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),24);
     BOOST_CHECK_EQUAL(iter->getEnd(),40);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
     BOOST_CHECK_EQUAL(iter->getValue(), "ne_email");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),42);
     BOOST_CHECK_EQUAL(iter->getEnd(),48);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(), "number");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),50);
     BOOST_CHECK_EQUAL(iter->getEnd(),54);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
     BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
-    iter++;
+    ++iter;
     
     BOOST_CHECK_EQUAL(iter->getStart(),56);
     BOOST_CHECK_EQUAL(iter->getEnd(),61);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(), "hello");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),61);
     BOOST_CHECK_EQUAL(iter->getEnd(),62);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
     BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),63);
     BOOST_CHECK_EQUAL(iter->getEnd(),69);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),70);
     BOOST_CHECK_EQUAL(iter->getEnd(),75);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),76);
     BOOST_CHECK_EQUAL(iter->getEnd(),80);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),82);
     BOOST_CHECK_EQUAL(iter->getEnd(),88);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),89);
     BOOST_CHECK_EQUAL(iter->getEnd(),94);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),95);
     BOOST_CHECK_EQUAL(iter->getEnd(),99);
@@ -156,52 +156,52 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
     BOOST_CHECK_EQUAL(iter->getStart(),0);
     BOOST_CHECK_EQUAL(iter->getEnd(),23);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
-    iter++;
+    ++iter;
     
     BOOST_CHECK_EQUAL(iter->getStart(),23);
     BOOST_CHECK_EQUAL(iter->getEnd(),27);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(),"link");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),27);
     BOOST_CHECK_EQUAL(iter->getEnd(),31);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),32);
     BOOST_CHECK_EQUAL(iter->getEnd(),35);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(),"and");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),36);
     BOOST_CHECK_EQUAL(iter->getEnd(),39);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),39);
     BOOST_CHECK_EQUAL(iter->getEnd(),43);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(),"bold");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),43);
     BOOST_CHECK_EQUAL(iter->getEnd(),47);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),48);
     BOOST_CHECK_EQUAL(iter->getEnd(),51);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(),"and");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),52);
     BOOST_CHECK_EQUAL(iter->getEnd(),59);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(),"newline");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),60);
     BOOST_CHECK_EQUAL(iter->getEnd(),65);
@@ -240,79 +240,79 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
     BOOST_CHECK_EQUAL(iter->getEnd(),4);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(),"this");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),5);
     BOOST_CHECK_EQUAL(iter->getEnd(),7);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(),"is");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),8);
     BOOST_CHECK_EQUAL(iter->getEnd(),9);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(),"a");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),10);
     BOOST_CHECK_EQUAL(iter->getEnd(),18);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(),"sentence");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),20);
     BOOST_CHECK_EQUAL(iter->getEnd(),25);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(),"don't");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),26);
     BOOST_CHECK_EQUAL(iter->getEnd(),38);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(),"over-analyze");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),39);
     BOOST_CHECK_EQUAL(iter->getEnd(),41);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(),"it");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),43);
     BOOST_CHECK_EQUAL(iter->getEnd(),49);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(),"zażółć");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),51);
     BOOST_CHECK_EQUAL(iter->getEnd(),57);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(),"gęś'lą");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),59);
     BOOST_CHECK_EQUAL(iter->getEnd(),63);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(),"jaźń");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),64);
     BOOST_CHECK_EQUAL(iter->getEnd(),71);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(),"zaż-ółć");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),72);
     BOOST_CHECK_EQUAL(iter->getEnd(),77);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(),"gęślą");
-    iter++;
+    ++iter;
 
     BOOST_CHECK_EQUAL(iter->getStart(),78);
     BOOST_CHECK_EQUAL(iter->getEnd(),83);
     BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
     BOOST_CHECK_EQUAL(iter->getValue(),"jaź'ń");
-    iter++;
+    ++iter;
 
 }
 
diff --git a/concordia/token_annotation.hpp b/concordia/token_annotation.hpp
index 11153e5..e90f539 100644
--- a/concordia/token_annotation.hpp
+++ b/concordia/token_annotation.hpp
@@ -7,7 +7,7 @@
 #include <string>
 
 /*!
-  Class representing annotatio of char sequence as a token.
+  Class representing annotation of char sequence as a token.
   It is a type of interval that is also storing information
   about the annoation type and value.
 
@@ -18,7 +18,7 @@ public:
     /*! Constructor.
       \param start start index of the annotation (char-level, 0-based)
       \param end end index of the annotation (char-level, 0-based)
-      \param type annotation type
+      \param annotationType annotation type
       \param value annotation value
     */
     TokenAnnotation(const SUFFIX_MARKER_TYPE start,
@@ -44,14 +44,22 @@ public:
         return _value;
     }
 
+    /*! Named entity annotation type
+    */
     static int NE;
 
+    /*! Word annotation type
+    */
     static int WORD;
 
+    /*! Html tag annotation type
+    */
     static int HTML_TAG;
 
+    /*! Stop word annotation type
+    */
     static int STOP_WORD;
-    
+
 protected:
     int _annotationType;
 
diff --git a/concordia/tokenized_sentence.cpp b/concordia/tokenized_sentence.cpp
index 6302567..964e5e3 100644
--- a/concordia/tokenized_sentence.cpp
+++ b/concordia/tokenized_sentence.cpp
@@ -11,37 +11,43 @@ TokenizedSentence::TokenizedSentence(std::string sentence):
 TokenizedSentence::~TokenizedSentence() {
 }
 
-void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
-    std::vector<TokenAnnotation>::iterator newAnnotation = annotations.begin();
-    std::list<TokenAnnotation>::iterator existingAnnotation = _tokenAnnotations.begin();
-    
-    while(newAnnotation != annotations.end()) {
+void TokenizedSentence::addAnnotations(
+                            std::vector<TokenAnnotation> annotations) {
+    std::vector<TokenAnnotation>::iterator newAnnotation =
+                                           annotations.begin();
+    std::list<TokenAnnotation>::iterator existingAnnotation =
+                                           _tokenAnnotations.begin();
+
+    while (newAnnotation != annotations.end()) {
         if (existingAnnotation != _tokenAnnotations.end()) {
             // there are still some existing annotations, so perform checks
             if (newAnnotation->intersects(*existingAnnotation)) {
                 // The new annotation intersects with the existing.
                 // We can not add it, so let us just move on to the
                 // next new annoation.
-                newAnnotation++;
+                ++newAnnotation;
             } else {
                 // it is now important whether the new interval is before
                 // or after existing
-                if (newAnnotation->getStart() < existingAnnotation->getStart()) {
-                    // New interval does not intersect and is before existing. We add it.
-                    _tokenAnnotations.insert(existingAnnotation, *newAnnotation);
-                    newAnnotation++;
+                if (newAnnotation->getStart() <
+                             existingAnnotation->getStart()) {
+                    // New interval does not intersect and is
+                    // before existing. We add it.
+                    _tokenAnnotations.insert(existingAnnotation,
+                                             *newAnnotation);
+                    ++newAnnotation;
                 } else {
-                    // If the new interval is after existing we move to the next existing annoation.
-                    existingAnnotation++;
+                    // If the new interval is after existing
+                    // we move to the next existing annotation.
+                    ++existingAnnotation;
                 }
             }
         } else {
             // no more existing annotations, so just add the new annotation
             _tokenAnnotations.push_back(*newAnnotation);
-            newAnnotation++;
+            ++newAnnotation;
         }
     }
-    
 }
 
 void TokenizedSentence::toLowerCase() {
@@ -54,8 +60,7 @@ void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
                annotation.getType() == TokenAnnotation::NE) {
             _codes.push_back(wordMap->getWordCode(annotation.getValue()));
             _tokens.push_back(annotation);
-        } 
+        }
     }
-
 }
 
diff --git a/concordia/tokenized_sentence.hpp b/concordia/tokenized_sentence.hpp
index a0ff96b..345c5fb 100644
--- a/concordia/tokenized_sentence.hpp
+++ b/concordia/tokenized_sentence.hpp
@@ -11,9 +11,12 @@
 #include <list>
 
 /*!
-  A sentence after anonymization operations. The class
+  A sentence after tokenizing operations. The class
   holds the current string represenation of the sentence
-  along with the annotations list.
+  along with the annotations list. The class also allows
+  for generating hash. After that operation the class
+  also holds the list of hashed codes and corresponding
+  tokens.
 */
 
 class TokenizedSentence {
@@ -22,7 +25,7 @@ public:
       Constructor.
 
     */
-    TokenizedSentence(std::string sentence);
+    explicit TokenizedSentence(std::string sentence);
 
     /*! Destructor.
     */
@@ -35,21 +38,40 @@ public:
         return _sentence;
     }
 
-    /*! Getter for annotations list
+    /*! Getter for all annotations list. This method returns
+        all annotations, including those which are not considered
+        in the hash, i.e. stop words and html tags.
       \returns annotations list
     */
     std::list<TokenAnnotation> getAnnotations() const {
         return _tokenAnnotations;
     }
 
+    /*! Getter for codes list. This data is available after calling
+        the generateHash method.
+      \returns codes list
+    */
     std::vector<INDEX_CHARACTER_TYPE> getCodes() const {
         return _codes;
     }
-    
+
+    /*! Getter for tokens list. This method returns
+        only those annotations considered
+        in the hash, i.e. words and named entities.
+      \returns tokens list
+    */
     std::vector<TokenAnnotation> getTokens() const {
         return _tokens;
     }
-    
+
+    /*! Method for generating hash based on annotations.
+        This method takes into account annotations of type
+        word and named entity. These are encoded and added
+        to the code list. Annotations corresponding to these
+        tokens are added to the tokens list. The method returns
+        nothing; read the results with getCodes() and getTokens().
+      \param wordMap word map to use when encoding tokens
+    */
     void generateHash(boost::shared_ptr<WordMap> wordMap);
 
     /*! 
@@ -66,15 +88,15 @@ public:
 
         \param annotations list of annotations to be added
     */
-    void addAnnotations(std::vector<TokenAnnotation> annotations);    
+    void addAnnotations(std::vector<TokenAnnotation> annotations);
 
 private:
     std::string _sentence;
 
     std::list<TokenAnnotation> _tokenAnnotations;
-    
+
     std::vector<INDEX_CHARACTER_TYPE> _codes;
-    
+
     std::vector<TokenAnnotation> _tokens;
 };
 
diff --git a/concordia/tutorial.dox b/concordia/tutorial.dox
index 6438efc..00d65dc 100644
--- a/concordia/tutorial.dox
+++ b/concordia/tutorial.dox
@@ -98,12 +98,15 @@ Concordia is equipped with a unique functionality of so called Concordia search,
 
 Additionally, the score for this best overlay is computed. The score is a real number between 0 and 1, where 0 indicates, that the pattern is not covered at all (i.e. not a single word from this pattern is found in the index). The score 1 represents the perfect match - pattern is covered completely by just one fragment, which means that the pattern is found in the index as one of the examples.
 
+Moreover, the example below demonstrates how to retrieve the tokenized version of an added example.
+
 File concordia_searching.cpp:
 \verbatim
 #include <concordia/concordia.hpp>
 #include <concordia/concordia_search_result.hpp>
 #include <concordia/matched_pattern_fragment.hpp>
 #include <concordia/example.hpp>
+#include <concordia/tokenized_sentence.hpp>
 
 #include "config.hpp"
 
@@ -115,7 +118,13 @@ using namespace std;
 int main() {
     Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg");
     
-    concordia.addExample(Example("Alice has a cat", 56));
+    boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Alice has a cat", 56));
+    cout << "Added the following tokens: " << endl;
+    BOOST_FOREACH(TokenAnnotation token, ts->getTokens()) {
+        cout << "\"" << token.getValue() << "\"" << " at positions: [" << token.getStart() << ","
+             << token.getEnd() << ")" << endl;
+    }
+    
     concordia.addExample(Example("Alice has a dog", 23));
     concordia.addExample(Example("New test product has a mistake", 321));
     concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));
@@ -153,6 +162,11 @@ int main() {
 This program should print:
 
 \verbatim
+Added the following tokens: 
+"alice" at positions: [0,5)
+"has" at positions: [6,9)
+"a" at positions: [10,11)
+"cat" at positions: [12,15)
 Searching for pattern: Our new test product has nothing to do with computers
 Printing all matched fragments:
 Matched pattern fragment found. Pattern fragment: [4,9] in sentence 14, at offset: 6
diff --git a/examples/concordia_search.cpp b/examples/concordia_search.cpp
index 5a5cae8..b8ee36a 100644
--- a/examples/concordia_search.cpp
+++ b/examples/concordia_search.cpp
@@ -2,6 +2,7 @@
 #include <concordia/concordia_search_result.hpp>
 #include <concordia/matched_pattern_fragment.hpp>
 #include <concordia/example.hpp>
+#include <concordia/tokenized_sentence.hpp>
 
 #include "config.hpp"
 
@@ -13,7 +14,13 @@ using namespace std;
 int main() {
     Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg");
     
-    concordia.addExample(Example("Alice has a cat", 56));
+    boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Alice has a cat", 56));
+    cout << "Added the following tokens: " << endl;
+    BOOST_FOREACH(TokenAnnotation token, ts->getTokens()) {
+        cout << "\"" << token.getValue() << "\"" << " at positions: [" << token.getStart() << ","
+             << token.getEnd() << ")" << endl;
+    }
+    
     concordia.addExample(Example("Alice has a dog", 23));
     concordia.addExample(Example("New test product has a mistake", 321));
     concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));