From 0baf3e4ef2348d992c9f3cb871913b33f497038b Mon Sep 17 00:00:00 2001
From: rjawor <rjawor@amu.edu.pl>
Date: Mon, 22 Jun 2015 13:52:56 +0200
Subject: [PATCH] character intervals in progress

---
 CMakeLists.txt                                |  21 +-
 TODO.txt                                      |   2 +
 concordia/CMakeLists.txt                      |  10 +-
 concordia/anonymized_sentence.cpp             |  48 ++++
 concordia/anonymized_sentence.hpp             |  64 +++++
 concordia/compilation.dox                     |   3 +-
 concordia/concordia_index.cpp                 |   1 -
 concordia/concordia_search_result.hpp         |   2 +-
 concordia/hash_generator.cpp                  |   3 +-
 concordia/hashed_sentence.cpp                 |   7 +
 concordia/hashed_sentence.hpp                 |  61 +++++
 concordia/interval.hpp                        |  10 +-
 concordia/regex_replacement.cpp               |  44 ----
 concordia/regex_rule.cpp                      |  56 +++++
 .../{regex_replacement.hpp => regex_rule.hpp} |  30 +--
 concordia/sentence_anonymizer.cpp             |  33 +--
 concordia/sentence_anonymizer.hpp             |  18 +-
 concordia/t/CMakeLists.txt                    |   3 +-
 concordia/t/test_anonymized_sentence.cpp      |  86 +++++++
 concordia/t/test_regex_replacement.cpp        |  66 ------
 concordia/t/test_regex_rule.cpp               | 221 ++++++++++++++++++
 concordia/t/test_sentence_anonymizer.cpp      |  12 +-
 concordia/token_annotation.cpp                |  15 ++
 concordia/token_annotation.hpp                |  53 +++++
 examples/CMakeLists.txt                       |   3 +
 25 files changed, 705 insertions(+), 167 deletions(-)
 create mode 100644 concordia/anonymized_sentence.cpp
 create mode 100644 concordia/anonymized_sentence.hpp
 create mode 100644 concordia/hashed_sentence.cpp
 create mode 100644 concordia/hashed_sentence.hpp
 delete mode 100644 concordia/regex_replacement.cpp
 create mode 100644 concordia/regex_rule.cpp
 rename concordia/{regex_replacement.hpp => regex_rule.hpp} (50%)
 create mode 100644 concordia/t/test_anonymized_sentence.cpp
 delete mode 100644 concordia/t/test_regex_replacement.cpp
 create mode 100644 concordia/t/test_regex_rule.cpp
 create mode 100644 concordia/token_annotation.cpp
 create mode 100644 concordia/token_annotation.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a304441..8cff576 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,26 +103,43 @@ find_package(Boost COMPONENTS
 # ----------------------------------------------------
 # libconfig
 # ----------------------------------------------------
-find_library(LIBCONFIG_LIB NAMES config++ REQUIRED)
+find_library(LIBCONFIG_LIB NAMES config++)
 find_path(LIBCONFIG_INCLUDE libconfig.h++)
 
 if(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE})
   message(STATUS "Found Libconfig")
   include_directories(${LIBCONFIG_INCLUDE})
   link_directories(${LIBCONFIG_LIB})
+else()
+  message(FATAL_ERROR "Libconfig not found")
 endif(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE})
 
+# ----------------------------------------------------
+# ICU (International Components for Unicode)
+# ----------------------------------------------------
+find_library(ICU_LIB NAMES icui18n)
+find_path(ICU_INCLUDE unicode/unistr.h)
+
+if(EXISTS ${ICU_LIB} AND EXISTS ${ICU_INCLUDE})
+  message(STATUS "Found ICU")
+  include_directories(${ICU_INCLUDE})
+  link_directories(${ICU_LIB})
+else()
+  message(FATAL_ERROR "ICU not found")
+endif(EXISTS ${ICU_LIB} AND EXISTS ${ICU_INCLUDE})
 
 # ----------------------------------------------------
 # Logging
 # ----------------------------------------------------
-find_library(LOG4CPP_LIB NAMES log4cpp REQUIRED)
+find_library(LOG4CPP_LIB NAMES log4cpp)
 find_path(LOG4CPP_INCLUDE log4cpp/Appender.hh)
 
 if(EXISTS ${LOG4CPP_LIB} AND EXISTS ${LOG4CPP_INCLUDE})
   message(STATUS "Found Log4cpp")
   include_directories(${LOG4CPP_INCLUDE})
   link_directories(${LOG4CPP_LIB})
+else()
+  message(FATAL_ERROR "Log4cpp not found")
 endif(EXISTS ${LOG4CPP_LIB} AND EXISTS ${LOG4CPP_INCLUDE})
 
 # ================================================
diff --git a/TODO.txt b/TODO.txt
index 7f5ebc0..e06f0be 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,6 +1,8 @@
 ---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) -----------------------------
 
+- implement tokenAnnotations vector as interval tree
 IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? Należy przechowywać oryginalne pozycje tokenów w bazie danych concordia-server. Pozycje te będą obliczane przez funkcję generateTokenVector (przy użyciu listy oryginalnych pozycji, która będzie modyfikowana synchronicznie z każdą anonimizacją)
+- concordia_search_result nie musi przechowywać całego tokenVector (bo używa go tylko do odczytania size()).
 - wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń.
 - testy zużycia pamięci
 - Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.
diff --git a/concordia/CMakeLists.txt b/concordia/CMakeLists.txt
index 70ac20d..f59f12e 100644
--- a/concordia/CMakeLists.txt
+++ b/concordia/CMakeLists.txt
@@ -6,10 +6,13 @@ foreach(dir ${ALL_DIRECTORIES})
 endforeach(dir)
 
 add_library(concordia SHARED
+  token_annotation.cpp
+  anonymized_sentence.cpp
+  hashed_sentence.cpp
   concordia_search_result.cpp
   matched_pattern_fragment.cpp
   concordia_searcher.cpp
-  regex_replacement.cpp
+  regex_rule.cpp
   sentence_anonymizer.cpp
   interval.cpp
   tm_matches.cpp
@@ -33,10 +36,13 @@ add_subdirectory(t)
 
 install(TARGETS concordia DESTINATION lib/)
 install(FILES 
+          token_annotation.hpp
+          anonymized_sentence.hpp
+          hashed_sentence.hpp
           concordia_search_result.hpp
           matched_pattern_fragment.hpp
           concordia_searcher.hpp
-          regex_replacement.hpp
+          regex_rule.hpp
           sentence_anonymizer.hpp
           interval.hpp
           tm_matches.hpp
diff --git a/concordia/anonymized_sentence.cpp b/concordia/anonymized_sentence.cpp
new file mode 100644
index 0000000..6f7c687
--- /dev/null
+++ b/concordia/anonymized_sentence.cpp
@@ -0,0 +1,48 @@
+#include "concordia/anonymized_sentence.hpp"
+#include "concordia/common/text_utils.hpp"
+
+#include <iostream>
+
+AnonymizedSentence::AnonymizedSentence(std::string sentence):
+                                         _sentence(sentence) {
+}
+
+AnonymizedSentence::~AnonymizedSentence() {
+}
+
+void AnonymizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
+    // Single merge pass over two sequences that are both assumed to be
+    // sorted and disjoint: 'incoming' walks the new annotations while
+    // 'stored' walks the annotations already kept by this sentence.
+    std::vector<TokenAnnotation>::iterator incoming = annotations.begin();
+    std::list<TokenAnnotation>::iterator stored = _tokenAnnotations.begin();
+
+    while (incoming != annotations.end()) {
+        if (stored == _tokenAnnotations.end()) {
+            // No stored annotations remain to compare against, so simply
+            // append the new annotation.
+            _tokenAnnotations.push_back(*incoming);
+            ++incoming;
+            continue;
+        }
+
+        if (incoming->intersects(*stored)) {
+            // Overlaps a stored annotation, so it cannot be added;
+            // skip it and look at the next new annotation.
+            ++incoming;
+        } else if (incoming->getStart() < stored->getStart()) {
+            // Disjoint and starting earlier than the stored annotation:
+            // insert it right in front of the stored one.
+            _tokenAnnotations.insert(stored, *incoming);
+            ++incoming;
+        } else {
+            // Disjoint and starting later: compare against the
+            // next stored annotation instead.
+            ++stored;
+        }
+    }
+}
+
+void AnonymizedSentence::toLowerCase() {
+    _sentence = TextUtils::getInstance().toLowerCase(_sentence);
+}
diff --git a/concordia/anonymized_sentence.hpp b/concordia/anonymized_sentence.hpp
new file mode 100644
index 0000000..e805be0
--- /dev/null
+++ b/concordia/anonymized_sentence.hpp
@@ -0,0 +1,64 @@
+#ifndef ANONYMIZED_SENTENCE_HDR
+#define ANONYMIZED_SENTENCE_HDR
+
+#include "concordia/common/config.hpp"
+#include "concordia/token_annotation.hpp"
+#include <string>
+#include <vector>
+#include <list>
+
+/*!
+  A sentence after anonymization operations. The class
+  holds the current string representation of the sentence
+  along with the annotations list.
+*/
+
+class AnonymizedSentence {
+public:
+    /*!
+      Constructor.
+      \param sentence initial text of the sentence
+    */
+    AnonymizedSentence(std::string sentence);
+
+    /*! Destructor.
+    */
+    virtual ~AnonymizedSentence();
+
+    /*! Getter for sentence
+      \returns copy of the current sentence text
+    */
+    std::string getSentence() const {
+        return _sentence;
+    }
+
+    /*! Getter for annotations list
+      \returns copy of the annotations list
+    */
+    std::list<TokenAnnotation> getAnnotations() const {
+        return _tokenAnnotations;
+    }
+
+    /*! 
+        Transform the sentence to lower case.
+    */
+    void toLowerCase();
+
+    /*! 
+        Add new annotations to the existing annotations list. Assumptions:
+        1. the existing _tokenAnnotations list contains disjoint, sorted intervals;
+        2. the vector of annotations to be added also has the above properties.
+        Only the new annotations that do not intersect with any of the
+        existing ones are added.
+
+        \param annotations vector of annotations to be added
+    */
+    void addAnnotations(std::vector<TokenAnnotation> annotations);    
+
+private:
+    std::string _sentence;
+
+    std::list<TokenAnnotation> _tokenAnnotations;
+};
+
+#endif
diff --git a/concordia/compilation.dox b/concordia/compilation.dox
index c40141e..27c834b 100644
--- a/concordia/compilation.dox
+++ b/concordia/compilation.dox
@@ -30,6 +30,7 @@ On Ubuntu 14.04, the above software comes in standard packages. Here is the comp
 - libconfig++-dev
 - libconfig-dev
 - libpcre3-dev
+- libicu-dev
 - doxygen
 - texlive-font-utils
 
@@ -39,7 +40,7 @@ If you want to install all the above packages at once, simply use the below comm
 
 \verbatim
 
-sudo apt-get install g++ cmake libboost-dev libboost-serialization-dev libboost-test-dev libboost-filesystem-dev libboost-system-dev libboost-program-options-dev libboost-iostreams-dev libboost-regex-dev libboost-locale-dev liblog4cpp5-dev libconfig++-dev libconfig-dev libpcre3-dev doxygen texlive-font-utils
+sudo apt-get install g++ cmake libboost-dev libboost-serialization-dev libboost-test-dev libboost-filesystem-dev libboost-system-dev libboost-program-options-dev libboost-iostreams-dev libboost-regex-dev libboost-locale-dev liblog4cpp5-dev libconfig++-dev libconfig-dev libpcre3-dev libicu-dev doxygen texlive-font-utils
 
 \endverbatim
 
diff --git a/concordia/concordia_index.cpp b/concordia/concordia_index.cpp
index 339d275..2519c03 100644
--- a/concordia/concordia_index.cpp
+++ b/concordia/concordia_index.cpp
@@ -91,7 +91,6 @@ void ConcordiaIndex::_addSingleExample(
         Utils::appendCharToSaucharVector(T, character);
 
         // append to markersFile
-
         SUFFIX_MARKER_TYPE marker = Utils::createMarker(
                                            example.getId(),
                                            offset,
diff --git a/concordia/concordia_search_result.hpp b/concordia/concordia_search_result.hpp
index 64a6a43..6a7069f 100644
--- a/concordia/concordia_search_result.hpp
+++ b/concordia/concordia_search_result.hpp
@@ -22,7 +22,7 @@
 class ConcordiaSearchResult {
 public:
     /*! Constructor.
-      \param tokenVector tokenized patter which was used for searching
+      \param tokenVector tokenized pattern which was used for searching
     */
     explicit ConcordiaSearchResult(
                 const std::vector<std::string> & tokenVector);
diff --git a/concordia/hash_generator.cpp b/concordia/hash_generator.cpp
index 0385652..a004f60 100644
--- a/concordia/hash_generator.cpp
+++ b/concordia/hash_generator.cpp
@@ -44,7 +44,8 @@ std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
 
 std::vector<std::string> HashGenerator::generateTokenVector(
                                                const std::string & sentence) {
-    std::string anonymizedSentence = _sentenceAnonymizer->anonymize(sentence);
+    boost::shared_ptr<AnonymizedSentence> as = _sentenceAnonymizer->anonymize(sentence);
+    std::string anonymizedSentence = as->getSentence();
     boost::trim(anonymizedSentence);
     std::vector<std::string> tokenTexts;
     boost::split(tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"),
diff --git a/concordia/hashed_sentence.cpp b/concordia/hashed_sentence.cpp
new file mode 100644
index 0000000..93c1147
--- /dev/null
+++ b/concordia/hashed_sentence.cpp
@@ -0,0 +1,7 @@
+#include "concordia/hashed_sentence.hpp"
+
+HashedSentence::HashedSentence() {
+}
+
+HashedSentence::~HashedSentence() {
+}
diff --git a/concordia/hashed_sentence.hpp b/concordia/hashed_sentence.hpp
new file mode 100644
index 0000000..59ebd3e
--- /dev/null
+++ b/concordia/hashed_sentence.hpp
@@ -0,0 +1,61 @@
+#ifndef HASHED_SENTENCE_HDR
+#define HASHED_SENTENCE_HDR
+
+#include "concordia/common/config.hpp"
+#include "concordia/interval.hpp"
+#include <vector>
+#include <string>
+
+/*!
+  A sentence after hashing by the HashGenerator. The class holds
+  the list of word codes and intervals representing original
+  word positions in the sentence (char-based).
+*/
+
+class HashedSentence {
+public:
+    /*!
+      Constructor. Creates a sentence with no word codes
+      and no original word positions.
+    */
+    HashedSentence();
+
+    /*! Destructor.
+    */
+    virtual ~HashedSentence();
+
+    /*! Getter for original word positions list.
+      \returns copy of the original word positions list
+    */
+    std::vector<Interval> getOriginalWordPositions() const {
+        return _originalWordPositions;
+    }
+
+    /*! Getter for word codes list.
+      \returns copy of the word codes list
+    */
+    std::vector<INDEX_CHARACTER_TYPE> getWordCodes() const {
+        return _wordCodes;
+    }
+
+    /*! Method for adding a word code to the end of the list.
+      \param wordCode word code to be added
+    */
+    void addWordCode(INDEX_CHARACTER_TYPE wordCode) {
+        _wordCodes.push_back(wordCode);
+    }
+
+    /*! Method for adding an original word position to the end of the list.
+      \param originalWordPosition position to be added (copied, not modified)
+    */
+    void addWordOriginalWordPosition(const Interval & originalWordPosition) {
+        _originalWordPositions.push_back(originalWordPosition);
+    }
+
+private:
+    std::vector<Interval> _originalWordPositions;
+
+    std::vector<INDEX_CHARACTER_TYPE> _wordCodes;
+};
+
+#endif
diff --git a/concordia/interval.hpp b/concordia/interval.hpp
index cf63c7d..c06dfec 100644
--- a/concordia/interval.hpp
+++ b/concordia/interval.hpp
@@ -2,13 +2,14 @@
 #define INTERVAL_HDR
 
 #include "concordia/common/config.hpp"
+#include <iostream>
 
 /*!
-  Class representing interval of a sentence, i.e. a sequence of words
+  Class representing interval of a sentence, i.e. a sequence of words or chars
   coming from that sentence. An interval only has its start and end indexes,
   where the start index is inclusive and end index is exclusive. For example,
-  an interval [2,5] of the sentence "This is just for testing purposes" is:
-  "just for testing".
+  an interval [2,5) of words of the sentence "This is just for
+  testing purposes" is: "just for testing".
 
 */
 
@@ -50,6 +51,9 @@ public:
         return _end;
     }
 
+    friend std::ostream & operator << (std::ostream & o, const Interval & interval) {
+        return o << "[" << interval.getStart() << "," << interval.getEnd() << ")";
+    }
 protected:
     SUFFIX_MARKER_TYPE _start;
 
diff --git a/concordia/regex_replacement.cpp b/concordia/regex_replacement.cpp
deleted file mode 100644
index 37f5914..0000000
--- a/concordia/regex_replacement.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-#include "concordia/regex_replacement.hpp"
-#include <sstream>
-#include <boost/exception/all.hpp>
-#include <boost/throw_exception.hpp>
-
-RegexReplacement::RegexReplacement(std::string patternString,
-                                   std::string replacement,
-                                   bool caseSensitive)
-                                         throw(ConcordiaException):
-                                         _replacement(replacement) {
-    try {
-        if (caseSensitive) {
-            _pattern = boost::make_u32regex(patternString);
-        } else {
-            _pattern = boost::make_u32regex(patternString,
-                                             boost::regex::icase);
-        }
-    } catch(const std::exception & e) {
-        std::stringstream ss;
-
-        ss << "Bad regex pattern: " << patternString <<
-             " Detailed info: " << e.what();
-
-        if (std::string const * extra =
-                 boost::get_error_info<my_tag_error_info>(e) ) {
-            ss << *extra;
-        }
-        throw ConcordiaException(ss.str());
-    }
-}
-
-RegexReplacement::~RegexReplacement() {
-}
-
-std::string RegexReplacement::apply(const std::string & text) {
-    try {
-        return boost::u32regex_replace(text, _pattern, _replacement,
-                        boost::match_default | boost::format_all);
-    } catch(...) {
-        throw ConcordiaException("Exception while applying replacement rule: "
-                                  +_replacement+" to text: "+text);
-    }
-}
-
diff --git a/concordia/regex_rule.cpp b/concordia/regex_rule.cpp
new file mode 100644
index 0000000..83ae20f
--- /dev/null
+++ b/concordia/regex_rule.cpp
@@ -0,0 +1,56 @@
+#include "concordia/regex_rule.hpp"
+#include <sstream>
+#include <iostream>
+#include <boost/exception/all.hpp>
+#include <boost/throw_exception.hpp>
+
+// Compiles the UTF-8 patternString; throws ConcordiaException if malformed.
+RegexRule::RegexRule(std::string patternString,
+                                std::string value,
+                                bool caseSensitive)
+                                         throw(ConcordiaException):
+                                         _value(value) {
+    try {
+        // Decode explicitly as UTF-8: the UnicodeString(const char *)
+        // constructor uses ICU's locale-dependent default converter.
+        const UnicodeString pattern = UnicodeString::fromUTF8(patternString);
+        _pattern = caseSensitive
+            ? boost::make_u32regex(pattern)
+            : boost::make_u32regex(pattern, boost::regex::icase);
+    } catch(const std::exception & e) {
+        std::stringstream ss;
+        ss << "Bad regex pattern: " << patternString <<
+             " Detailed info: " << e.what();
+        if (std::string const * extra =
+                 boost::get_error_info<my_tag_error_info>(e) ) {
+            ss << *extra;
+        }
+        throw ConcordiaException(ss.str());
+    }
+}
+
+RegexRule::~RegexRule() {
+}
+
+// Builds one annotation carrying _value per pattern match and passes them
+// to the sentence. Offsets are UTF-16 code unit positions in the decoded text.
+void RegexRule::apply(boost::shared_ptr<AnonymizedSentence> sentence) {
+    try {
+        UnicodeString s = UnicodeString::fromUTF8(sentence->getSentence());
+        boost::u32regex_iterator<const UChar*> begin(boost::make_u32regex_iterator(s, _pattern));
+        boost::u32regex_iterator<const UChar*> end;
+        std::vector<TokenAnnotation> annotations;
+        for (; begin != end; ++begin) {
+            SUFFIX_MARKER_TYPE matchBegin = begin->position();
+            SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length();
+            annotations.push_back(TokenAnnotation(matchBegin, matchEnd, 'a', _value));
+        }
+        sentence->addAnnotations(annotations);
+    } catch(const std::exception & e) {
+        std::stringstream ss;
+        ss << "Exception while applying regex rule: " << _value
+           << " to text: " << sentence->getSentence() << ", message: " << e.what();
+        throw ConcordiaException(ss.str());
+    }
+}
+
diff --git a/concordia/regex_replacement.hpp b/concordia/regex_rule.hpp
similarity index 50%
rename from concordia/regex_replacement.hpp
rename to concordia/regex_rule.hpp
index d118100..2f74c30 100644
--- a/concordia/regex_replacement.hpp
+++ b/concordia/regex_rule.hpp
@@ -1,24 +1,25 @@
-#ifndef REGEX_REPLACEMENT_HDR
-#define REGEX_REPLACEMENT_HDR
+#ifndef REGEX_RULE_HDR
+#define REGEX_RULE_HDR
 
 #include <string>
 #include "concordia/common/config.hpp"
+#include "concordia/anonymized_sentence.hpp"
 #include "concordia/concordia_exception.hpp"
 #include <boost/shared_ptr.hpp>
 #include <boost/regex.hpp>
 #include <boost/regex/icu.hpp>
-
+#include <unicode/unistr.h>
 
 
 typedef boost::error_info<struct my_tag, std::string> my_tag_error_info;
 
 /*!
-  Class for representing a regular expression replacement operation.
+  Class for representing a regular expression annotation rule.
   Holds regex pattern string for matching and replacement string for
-  replacing found matches.
+  annotating found matches.
 
 */
-class RegexReplacement {
+class RegexRule {
 public:
     /*!
       Constructor.
@@ -26,24 +27,23 @@ public:
         \param replacement string to substitute the found match
         \param caseSensitive case sensitivity of the pattern
     */
-    RegexReplacement(std::string patternString, std::string replacement,
-                                              bool caseSensitive = true)
-                                               throw(ConcordiaException);
+    RegexRule(std::string patternString, std::string value,
+                                 bool caseSensitive = true)
+                                 throw(ConcordiaException);
 
     /*! Destructor.
     */
-    virtual ~RegexReplacement();
+    virtual ~RegexRule();
 
-    /*! Applies the operation on input string.
-      \param text the input string
-      \returns altered version of the input string
+    /*! Applies the operation on anonymized sentence.
+      \param sentence the input sentence
     */
-    std::string apply(const std::string & text);
+    void apply(boost::shared_ptr<AnonymizedSentence> sentence);
 
 private:
     boost::u32regex _pattern;
 
-    std::string _replacement;
+    std::string _value;
 };
 
 #endif
diff --git a/concordia/sentence_anonymizer.cpp b/concordia/sentence_anonymizer.cpp
index 85598d5..e0715f3 100644
--- a/concordia/sentence_anonymizer.cpp
+++ b/concordia/sentence_anonymizer.cpp
@@ -1,6 +1,5 @@
 #include "concordia/sentence_anonymizer.hpp"
 
-#include "concordia/common/text_utils.hpp"
 #include <boost/foreach.hpp>
 #include <fstream>
 #include <sstream>
@@ -26,22 +25,24 @@ SentenceAnonymizer::SentenceAnonymizer(
 SentenceAnonymizer::~SentenceAnonymizer() {
 }
 
-std::string SentenceAnonymizer::anonymize(const std::string & sentence) {
-    std::string result = sentence;
+boost::shared_ptr<AnonymizedSentence>
+              SentenceAnonymizer::anonymize(const std::string & sentence) {
+    boost::shared_ptr<AnonymizedSentence> 
+                    result(new AnonymizedSentence(sentence));
 
-    result = _htmlTags->apply(result);
+    _htmlTags->apply(result);
 
-    BOOST_FOREACH(RegexReplacement & neRule, _namedEntities) {
-        result = neRule.apply(result);
+    BOOST_FOREACH(RegexRule & neRule, _namedEntities) {
+        neRule.apply(result);
     }
 
-    result = TextUtils::getInstance().toLowerCase(result);
+    result->toLowerCase();
 
     if (_stopWordsEnabled) {
-        result = _stopWords->apply(result);
+        _stopWords->apply(result);
     }
-    result = _stopSymbols->apply(result);
-    result = _spaceSymbols->apply(result);
+    _stopSymbols->apply(result);
+    _spaceSymbols->apply(result);
 
     return result;
 }
@@ -64,7 +65,7 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
                        << " in NE file: " << namedEntitiesPath;
                     throw ConcordiaException(ss.str());
                 } else {
-                    _namedEntities.push_back(RegexReplacement(
+                    _namedEntities.push_back(RegexRule(
                                 tokenTexts->at(0), tokenTexts->at(1)));
                 }
             }
@@ -95,11 +96,11 @@ void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
     }
     tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
     tagsExpression += "br).*?>";
-    _htmlTags = boost::shared_ptr<RegexReplacement>(
-                        new RegexReplacement(tagsExpression, "", false));
+    _htmlTags = boost::shared_ptr<RegexRule>(
+                        new RegexRule(tagsExpression, "", false));
 }
 
-boost::shared_ptr<RegexReplacement>
+boost::shared_ptr<RegexRule>
         SentenceAnonymizer::_getMultipleReplacementRule(
             std::string & filePath, std::string replacement, bool wholeWord) {
     std::string expression = "(";
@@ -126,7 +127,7 @@ boost::shared_ptr<RegexReplacement>
     }
     expression = expression.substr(0, expression.size()-1);
     expression += ")";
-    return boost::shared_ptr<RegexReplacement>(
-                        new RegexReplacement(expression, replacement, false));
+    return boost::shared_ptr<RegexRule>(
+                        new RegexRule(expression, replacement, false));
 }
 
diff --git a/concordia/sentence_anonymizer.hpp b/concordia/sentence_anonymizer.hpp
index 99715d8..db8e102 100644
--- a/concordia/sentence_anonymizer.hpp
+++ b/concordia/sentence_anonymizer.hpp
@@ -4,7 +4,8 @@
 #include <string>
 #include <vector>
 #include "concordia/common/config.hpp"
-#include "concordia/regex_replacement.hpp"
+#include "concordia/anonymized_sentence.hpp"
+#include "concordia/regex_rule.hpp"
 #include "concordia/concordia_config.hpp"
 #include "concordia/concordia_exception.hpp"
 #include <boost/shared_ptr.hpp>
@@ -37,29 +38,30 @@ public:
       \param sentence input sentence
       \returns altered version of the input sentence
     */
-    std::string anonymize(const std::string & sentence);
+    boost::shared_ptr<AnonymizedSentence>
+                                   anonymize(const std::string & sentence);
 
 private:
     void _createNeRules(std::string & namedEntitiesPath);
 
     void _createHtmlTagsRule(std::string & htmlTagsPath);
 
-    boost::shared_ptr<RegexReplacement> _getMultipleReplacementRule(
+    boost::shared_ptr<RegexRule> _getMultipleReplacementRule(
                                              std::string & filePath,
                                              std::string replacement,
                                              bool wholeWord = false);
 
-    std::vector<RegexReplacement> _namedEntities;
+    std::vector<RegexRule> _namedEntities;
 
-    boost::shared_ptr<RegexReplacement> _htmlTags;
+    boost::shared_ptr<RegexRule> _htmlTags;
 
     bool _stopWordsEnabled;
 
-    boost::shared_ptr<RegexReplacement> _stopWords;
+    boost::shared_ptr<RegexRule> _stopWords;
 
-    boost::shared_ptr<RegexReplacement> _stopSymbols;
+    boost::shared_ptr<RegexRule> _stopSymbols;
 
-    boost::shared_ptr<RegexReplacement> _spaceSymbols;
+    boost::shared_ptr<RegexRule> _spaceSymbols;
 };
 
 #endif
diff --git a/concordia/t/CMakeLists.txt b/concordia/t/CMakeLists.txt
index 5886596..9020c3b 100644
--- a/concordia/t/CMakeLists.txt
+++ b/concordia/t/CMakeLists.txt
@@ -1,8 +1,9 @@
 add_library(concordia-tests
+  test_regex_rule.cpp
+  test_anonymized_sentence.cpp
   test_concordia_searcher.cpp
   test_sentence_anonymizer.cpp
   test_text_utils.cpp
-  test_regex_replacement.cpp
   test_example.cpp
   test_tm_matches.cpp
   test_interval.cpp
diff --git a/concordia/t/test_anonymized_sentence.cpp b/concordia/t/test_anonymized_sentence.cpp
new file mode 100644
index 0000000..334cbda
--- /dev/null
+++ b/concordia/t/test_anonymized_sentence.cpp
@@ -0,0 +1,86 @@
+#include "tests/unit-tests/unit_tests_globals.hpp"
+#include "concordia/anonymized_sentence.hpp"
+#include "concordia/token_annotation.hpp"
+#include "concordia/common/config.hpp"
+#include <iostream>
+
+BOOST_AUTO_TEST_SUITE(anonymized_sentence)
+
+BOOST_AUTO_TEST_CASE( AnnotationsTrivial )
+{
+    AnonymizedSentence as("This is a test sentence");
+
+    std::vector<TokenAnnotation> annotations;
+    annotations.push_back(TokenAnnotation(0,1,'a',"val"));
+    annotations.push_back(TokenAnnotation(4,6,'a',"val"));
+    annotations.push_back(TokenAnnotation(7,10,'a',"val"));
+    annotations.push_back(TokenAnnotation(12,14,'a',"val"));
+    
+    as.addAnnotations(annotations);
+        
+    BOOST_CHECK_EQUAL(as.getAnnotations().size(), 4);
+    
+}
+
+BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
+{
+    AnonymizedSentence as("This is a test sentence");
+
+    std::vector<TokenAnnotation> annotations1;
+    annotations1.push_back(TokenAnnotation(0,1,'a',"val"));
+    annotations1.push_back(TokenAnnotation(4,6,'a',"val"));
+    annotations1.push_back(TokenAnnotation(7,10,'a',"val"));
+    annotations1.push_back(TokenAnnotation(12,14,'a',"val"));
+    as.addAnnotations(annotations1);
+    /* annotations1
+    0  1  2  3  4  5  6  7  8  9 10 11 12 13 14
+    -           ----     -------       -----
+    
+    */
+    
+    std::vector<TokenAnnotation> annotations2;
+    annotations2.push_back(TokenAnnotation(1,4,'a',"val"));
+    annotations2.push_back(TokenAnnotation(4,7,'a',"val"));
+    annotations2.push_back(TokenAnnotation(10,11,'a',"val"));
+    annotations2.push_back(TokenAnnotation(11,13,'a',"val"));
+    as.addAnnotations(annotations2);
+    /* annotations2
+    0  1  2  3  4  5  6  7  8  9 10 11 12 13 14
+       -------  -------          -- -----   
+    
+    expecting:
+    0  1  2  3  4  5  6  7  8  9 10 11 12 13 14
+    -  -------  ----     ------- --    -----
+    
+    */   
+    BOOST_CHECK_EQUAL(as.getAnnotations().size(), 6);
+    std::list<TokenAnnotation> annotations = as.getAnnotations();
+    std::list<TokenAnnotation>::iterator iter = annotations.begin();
+
+    BOOST_CHECK_EQUAL(iter->getStart(),0);
+    BOOST_CHECK_EQUAL(iter->getEnd(),1);
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),1);
+    BOOST_CHECK_EQUAL(iter->getEnd(),4);
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),4);
+    BOOST_CHECK_EQUAL(iter->getEnd(),6);
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),7);
+    BOOST_CHECK_EQUAL(iter->getEnd(),10);
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),10);
+    BOOST_CHECK_EQUAL(iter->getEnd(),11);
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),12);
+    BOOST_CHECK_EQUAL(iter->getEnd(),14);
+    
+}
+
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/concordia/t/test_regex_replacement.cpp b/concordia/t/test_regex_replacement.cpp
deleted file mode 100644
index 7311e10..0000000
--- a/concordia/t/test_regex_replacement.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-#include "tests/unit-tests/unit_tests_globals.hpp"
-#include "concordia/regex_replacement.hpp"
-#include "concordia/common/config.hpp"
-#include <boost/algorithm/string/predicate.hpp>
-#include <boost/locale.hpp>
-#include <boost/algorithm/string/case_conv.hpp>
-
-BOOST_AUTO_TEST_SUITE(regex_replacement)
-
-BOOST_AUTO_TEST_CASE( SimpleReplacement )
-{
-    RegexReplacement rr("a","b");
-    BOOST_CHECK_EQUAL(rr.apply("xxxxxxxaxxxaxxaxaxa"),"xxxxxxxbxxxbxxbxbxb");
-}
-
-BOOST_AUTO_TEST_CASE( BadRegex )
-{
-    bool exceptionThrown = false;
-    std::string message = "";
-    try {
-        RegexReplacement rr("+a","b");
-    } catch (ConcordiaException & e) {
-        exceptionThrown = true;
-        message = e.what();
-    }    
-    BOOST_CHECK_EQUAL(exceptionThrown, true);    
-    BOOST_CHECK_EQUAL(boost::starts_with(message, "Bad regex pattern"), true);    
-}
-
-BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
-{
-    RegexReplacement rr("['\"\\\\.]","");
-    BOOST_CHECK_EQUAL(rr.apply("Don't stop believin' \\ Hold on to the feelin'."),"Dont stop believin  Hold on to the feelin");
-}
-
-BOOST_AUTO_TEST_CASE( BackrefReplacement )
-{
-    RegexReplacement rr("(\\d+)","the number: \\1");
-    BOOST_CHECK_EQUAL(rr.apply("This is 12 and this is 812."),"This is the number: 12 and this is the number: 812.");
-}
-
-BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
-{
-    RegexReplacement rr("abc","xxx", false);
-    BOOST_CHECK_EQUAL(rr.apply("This is AbC and ABC and abc and aBC."),"This is xxx and xxx and xxx and xxx.");
-}
-
-BOOST_AUTO_TEST_CASE( UnicodeReplacement )
-{
-    RegexReplacement rr("ą","x");
-    BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń"),"zażółć gęślx jaźń");
-}
-
-BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
-{
-    RegexReplacement rr("ą","x", false);
-    BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"),"zażółć gęślx jaźń ZAŻÓŁĆ GĘŚLx JAŹŃ");
-}
-
-BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
-{
-    RegexReplacement rr("[ąćęłńóśżź]","x", false);
-    BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"),"zaxxxx gxxlx jaxx ZAxxxx GxxLx JAxx");
-}
-
-BOOST_AUTO_TEST_SUITE_END()
diff --git a/concordia/t/test_regex_rule.cpp b/concordia/t/test_regex_rule.cpp
new file mode 100644
index 0000000..7922452
--- /dev/null
+++ b/concordia/t/test_regex_rule.cpp
@@ -0,0 +1,221 @@
+#include "tests/unit-tests/unit_tests_globals.hpp"
+#include "concordia/regex_rule.hpp"
+#include "concordia/anonymized_sentence.hpp"
+#include "concordia/common/config.hpp"
+#include <boost/shared_ptr.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/locale.hpp>
+#include <boost/algorithm/string/case_conv.hpp>
+
+BOOST_AUTO_TEST_SUITE(regex_rule)
+
+BOOST_AUTO_TEST_CASE( SimpleReplacement )
+{
+    RegexRule rr("a","b");
+    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("xxxxxxxaxxxaxxaxaxa"));
+    rr.apply(as);
+    BOOST_REQUIRE_EQUAL(as->getAnnotations().size(),5);  // fatal: the walk below dereferences 5 items
+    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    std::list<TokenAnnotation>::iterator iter = annotations.begin();
+    // Every "a" is expected as a one-character [start,end) annotation, in sentence order.
+    BOOST_CHECK_EQUAL(iter->getStart(),7);
+    BOOST_CHECK_EQUAL(iter->getEnd(),8);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),11);
+    BOOST_CHECK_EQUAL(iter->getEnd(),12);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),14);
+    BOOST_CHECK_EQUAL(iter->getEnd(),15);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),16);
+    BOOST_CHECK_EQUAL(iter->getEnd(),17);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),18);
+    BOOST_CHECK_EQUAL(iter->getEnd(),19);
+}
+
+BOOST_AUTO_TEST_CASE( BadRegex )
+{
+    std::string errorText;
+    bool caught = false;
+    try {
+        RegexRule rr("+a","b");  // pattern the test expects the constructor to reject
+    } catch (ConcordiaException & e) {
+        errorText = e.what();
+        caught = true;
+    }
+    BOOST_CHECK_EQUAL(caught, true);
+    BOOST_CHECK_EQUAL(boost::starts_with(errorText, "Bad regex pattern"), true);
+}
+
+BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
+{
+    RegexRule rr("['\"\\\\.]","");
+    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
+    rr.apply(as);
+    BOOST_REQUIRE_EQUAL(as->getAnnotations().size(),5);  // fatal: the walk below dereferences 5 items
+    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    std::list<TokenAnnotation>::iterator iter = annotations.begin();
+    // Expected matches: apostrophes at 3, 19 and 44, the backslash at 21, the final dot at 45.
+    BOOST_CHECK_EQUAL(iter->getStart(),3);
+    BOOST_CHECK_EQUAL(iter->getEnd(),4);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),19);
+    BOOST_CHECK_EQUAL(iter->getEnd(),20);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),21);
+    BOOST_CHECK_EQUAL(iter->getEnd(),22);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),44);
+    BOOST_CHECK_EQUAL(iter->getEnd(),45);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),45);
+    BOOST_CHECK_EQUAL(iter->getEnd(),46);
+}
+
+
+BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
+{
+    RegexRule rr("abc","xxx", false);  // third argument is presumably the case-sensitivity flag
+    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("This is AbC and ABC and abc and aBC."));
+    rr.apply(as);
+    BOOST_REQUIRE_EQUAL(as->getAnnotations().size(),4);  // fatal: the walk below dereferences 4 items
+    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    std::list<TokenAnnotation>::iterator iter = annotations.begin();
+    // All four spellings of "abc" are expected to match, each as a 3-character [start,end) span.
+    BOOST_CHECK_EQUAL(iter->getStart(),8);
+    BOOST_CHECK_EQUAL(iter->getEnd(),11);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),16);
+    BOOST_CHECK_EQUAL(iter->getEnd(),19);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),24);
+    BOOST_CHECK_EQUAL(iter->getEnd(),27);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),32);
+    BOOST_CHECK_EQUAL(iter->getEnd(),35);
+}
+
+BOOST_AUTO_TEST_CASE( UnicodeReplacement )
+{
+    RegexRule rr("ą","x");
+    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń"));
+    rr.apply(as);
+    BOOST_REQUIRE_EQUAL(as->getAnnotations().size(),1);  // fatal: begin() is dereferenced below
+    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    std::list<TokenAnnotation>::iterator iter = annotations.begin();
+    // Expected offsets are character indices: "ą" is the 12th character of the sentence.
+    BOOST_CHECK_EQUAL(iter->getStart(),11);
+    BOOST_CHECK_EQUAL(iter->getEnd(),12);
+}
+
+BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
+{
+    RegexRule rr("ą","x", false);  // third argument is presumably the case-sensitivity flag
+    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
+    rr.apply(as);
+    BOOST_REQUIRE_EQUAL(as->getAnnotations().size(),2);  // fatal: the walk below dereferences 2 items
+    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    std::list<TokenAnnotation>::iterator iter = annotations.begin();
+    // Both the lower-case "ą" (character 11) and the upper-case "Ą" (character 29) are expected.
+    BOOST_CHECK_EQUAL(iter->getStart(),11);
+    BOOST_CHECK_EQUAL(iter->getEnd(),12);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),29);
+    BOOST_CHECK_EQUAL(iter->getEnd(),30);
+}
+
+BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
+{
+    RegexRule rr("[ąćęłńóśżź]","x", false);  // third argument is presumably the case-sensitivity flag
+    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
+    rr.apply(as);
+    BOOST_REQUIRE_EQUAL(as->getAnnotations().size(),18);  // fatal: the walk below dereferences 18 items
+    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    std::list<TokenAnnotation>::iterator iter = annotations.begin();
+    // Every Polish diacritic, lower- or upper-case, is expected as a one-character span, in order.
+    BOOST_CHECK_EQUAL(iter->getStart(),2);
+    BOOST_CHECK_EQUAL(iter->getEnd(),3);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),3);
+    BOOST_CHECK_EQUAL(iter->getEnd(),4);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),4);
+    BOOST_CHECK_EQUAL(iter->getEnd(),5);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),5);
+    BOOST_CHECK_EQUAL(iter->getEnd(),6);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),8);
+    BOOST_CHECK_EQUAL(iter->getEnd(),9);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),9);
+    BOOST_CHECK_EQUAL(iter->getEnd(),10);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),11);
+    BOOST_CHECK_EQUAL(iter->getEnd(),12);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),15);
+    BOOST_CHECK_EQUAL(iter->getEnd(),16);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),16);
+    BOOST_CHECK_EQUAL(iter->getEnd(),17);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),20);
+    BOOST_CHECK_EQUAL(iter->getEnd(),21);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),21);
+    BOOST_CHECK_EQUAL(iter->getEnd(),22);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),22);
+    BOOST_CHECK_EQUAL(iter->getEnd(),23);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),23);
+    BOOST_CHECK_EQUAL(iter->getEnd(),24);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),26);
+    BOOST_CHECK_EQUAL(iter->getEnd(),27);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),27);
+    BOOST_CHECK_EQUAL(iter->getEnd(),28);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),29);
+    BOOST_CHECK_EQUAL(iter->getEnd(),30);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),33);
+    BOOST_CHECK_EQUAL(iter->getEnd(),34);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),34);
+    BOOST_CHECK_EQUAL(iter->getEnd(),35);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/concordia/t/test_sentence_anonymizer.cpp b/concordia/t/test_sentence_anonymizer.cpp
index 932552c..a712059 100644
--- a/concordia/t/test_sentence_anonymizer.cpp
+++ b/concordia/t/test_sentence_anonymizer.cpp
@@ -17,7 +17,7 @@ BOOST_AUTO_TEST_CASE( NETest )
     
     
     std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34";
-    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"date  ne_date mail  ne_email number  ne_number");
+    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"date  ne_date mail  ne_email number  ne_number");
 }
 
 BOOST_AUTO_TEST_CASE( HtmlTagsTest )
@@ -27,7 +27,7 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
     
     
     std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
-    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"link and bold and newline ");
+    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"link and bold and newline ");
     
 }
 
@@ -37,7 +37,7 @@ BOOST_AUTO_TEST_CASE( StopWordsTest )
     if (config->isStopWordsEnabled()) {
         SentenceAnonymizer anonymizer(config);
         std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
-        BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"  wiem   konieczne");
+        BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"  wiem   konieczne");
     }
 }
 
@@ -48,7 +48,7 @@ BOOST_AUTO_TEST_CASE( StopSymbolsTest )
     
     
     std::string sentence = "xxx, . xxx  # xx $xx@ xx";
-    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"xxx  xxx   xx xx xx");
+    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx  xxx   xx xx xx");
     
 }
 
@@ -59,7 +59,7 @@ BOOST_AUTO_TEST_CASE( SpaceSymbolsTest )
     
     
     std::string sentence = "xxx-xxx xx|xx";
-    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"xxx xxx xx xx");
+    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx");
     
 }
 
@@ -69,7 +69,7 @@ BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
     SentenceAnonymizer anonymizer(config);
     
     std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
-    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"sony   dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl   dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw   ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap   rm mne_numbergrm mne_numbere   —   ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number  ");
+    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"sony   dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl   dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw   ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap   rm mne_numbergrm mne_numbere   —   ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number  ");
     
 }
 
diff --git a/concordia/token_annotation.cpp b/concordia/token_annotation.cpp
new file mode 100644
index 0000000..a0b7c03
--- /dev/null
+++ b/concordia/token_annotation.cpp
@@ -0,0 +1,15 @@
+#include "concordia/token_annotation.hpp"
+
+// Passes the bounds to the Interval base and stores the annotation type and value.
+TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start,
+                                 const SUFFIX_MARKER_TYPE end,
+                                 const char annotationType,
+                                 const std::string & value)
+    : Interval(start, end),
+      _annotationType(annotationType),
+      _value(value) {
+}
+
+// Empty on purpose: this class adds only a char and a std::string member.
+TokenAnnotation::~TokenAnnotation() {}
+
diff --git a/concordia/token_annotation.hpp b/concordia/token_annotation.hpp
new file mode 100644
index 0000000..0c805bb
--- /dev/null
+++ b/concordia/token_annotation.hpp
@@ -0,0 +1,53 @@
+#ifndef TOKEN_ANNOTATION_HDR
+#define TOKEN_ANNOTATION_HDR
+
+#include "concordia/common/config.hpp"
+#include "concordia/interval.hpp"
+
+#include <string>
+
+/*!
+  Class representing the annotation of a char sequence as a token.
+  It is a type of interval that also stores information
+  about the annotation type and value.
+
+*/
+
+class TokenAnnotation : public Interval {
+public:
+    /*! Constructor.
+      \param start start index of the annotation (char-level, 0-based)
+      \param end end index of the annotation (char-level, 0-based)
+      \param annotationType annotation type
+      \param value annotation value
+    */
+    TokenAnnotation(const SUFFIX_MARKER_TYPE start,
+                    const SUFFIX_MARKER_TYPE end,
+                    const char annotationType,
+                    const std::string & value);
+
+    /*! Destructor.
+    */
+    virtual ~TokenAnnotation();
+
+    /*! Getter for annotation type.
+      \returns annotation type passed to the constructor
+    */
+    char getType() const {
+        return _annotationType;
+    }
+
+    /*! Getter for annotation value.
+      \returns annotation value (a copy of the stored string)
+    */
+    std::string getValue() const {
+        return _value;
+    }
+
+protected:
+    char _annotationType;  /*!< annotation type, returned by getType() */
+
+    std::string _value;  /*!< annotation value, returned by getValue() */
+};
+
+#endif
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index c0033a7..78beef7 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -17,6 +17,7 @@ add_executable(first first.cpp)
 target_link_libraries(first concordia)
 target_link_libraries(first config++)
 target_link_libraries(first log4cpp)
+target_link_libraries(first icui18n)
 target_link_libraries(first ${Boost_LIBRARIES})
 target_link_libraries(first divsufsort)
 target_link_libraries(first utf8case)
@@ -27,6 +28,7 @@ add_executable(simple_search simple_search.cpp)
 target_link_libraries(simple_search concordia)
 target_link_libraries(simple_search config++)
 target_link_libraries(simple_search log4cpp)
+target_link_libraries(simple_search icui18n)
 target_link_libraries(simple_search ${Boost_LIBRARIES})
 target_link_libraries(simple_search divsufsort)
 target_link_libraries(simple_search utf8case)
@@ -38,6 +40,7 @@ add_executable(concordia_search concordia_search.cpp)
 target_link_libraries(concordia_search concordia)
 target_link_libraries(concordia_search config++)
 target_link_libraries(concordia_search log4cpp)
+target_link_libraries(concordia_search icui18n)
 target_link_libraries(concordia_search ${Boost_LIBRARIES})
 target_link_libraries(concordia_search divsufsort)
 target_link_libraries(concordia_search utf8case)