character intervals in progress

2015-06-22 13:52:56 +02:00 · 2015-06-22 13:52:56 +02:00 · 0baf3e4ef2
commit 0baf3e4ef2
parent 4c0f2fd08d
25 changed files with 705 additions and 167 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -103,26 +103,43 @@ find_package(Boost COMPONENTS
 # ----------------------------------------------------
 # libconfig
 # ----------------------------------------------------
-find_library(LIBCONFIG_LIB NAMES config++ REQUIRED)
+find_library(LIBCONFIG_LIB NAMES config++)
 find_path(LIBCONFIG_INCLUDE libconfig.h++)

 if(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE})
  message(STATUS "Found Libconfig")
  include_directories(${LIBCONFIG_INCLUDE})
  link_directories(${LIBCONFIG_LIB})
+else()
+  message(FATAL_ERROR "Libconfig not found")
 endif(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE})

+# ----------------------------------------------------
+# ICU (I feeeeel youuuuu...)
+# ----------------------------------------------------
+# Locate the ICU i18n library and the include directory that holds
+# unicode/unistr.h. Configuration aborts when either one is missing.
+find_library(ICU_LIB NAMES icui18n)
+find_path(ICU_INCLUDE unicode/unistr.h)
+
+if(EXISTS "${ICU_LIB}" AND EXISTS "${ICU_INCLUDE}")
+  message(STATUS "Found ICU")
+  include_directories(${ICU_INCLUDE})
+  # link_directories() expects a directory, whereas ICU_LIB is the full
+  # path of the library file, so pass the directory that contains it.
+  get_filename_component(ICU_LIB_DIR "${ICU_LIB}" PATH)
+  link_directories(${ICU_LIB_DIR})
+else()
+  message(FATAL_ERROR "ICU not found")
+endif()

 # ----------------------------------------------------
 # Logging
 # ----------------------------------------------------
-find_library(LOG4CPP_LIB NAMES log4cpp REQUIRED)
+find_library(LOG4CPP_LIB NAMES log4cpp)
 find_path(LOG4CPP_INCLUDE log4cpp/Appender.hh)

 if(EXISTS ${LOG4CPP_LIB} AND EXISTS ${LOG4CPP_INCLUDE})
  message(STATUS "Found Log4cpp")
  include_directories(${LOG4CPP_INCLUDE})
  link_directories(${LOG4CPP_LIB})
+else()
+  message(FATAL_ERROR "Log4cpp not found")
 endif(EXISTS ${LOG4CPP_LIB} AND EXISTS ${LOG4CPP_INCLUDE})

 # ================================================
--- a/TODO.txt
+++ b/TODO.txt
@ -1,6 +1,8 @@
 ---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) -----------------------------

+- implement tokenAnnotations vector as interval tree
 IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? Należy przechowywać oryginalne pozycje tokenów w bazie danych concordia-server. Pozycje te będą obliczane przez funkcję generateTokenVector (przy użyciu listy oryginalnych pozycji, która będzie modyfikowana synchronicznie z każdą anonimizacją)
+- concordia_search_result nie musi przechowywać całego tokenVector (bo używa go tylko do odczytania size()).
 - wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń.
 - testy zużycia pamięci
 - Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.
--- a/concordia/CMakeLists.txt
+++ b/concordia/CMakeLists.txt
@ -6,10 +6,13 @@ foreach(dir ${ALL_DIRECTORIES})
 endforeach(dir)

 add_library(concordia SHARED
+  token_annotation.cpp
+  anonymized_sentence.cpp
+  hashed_sentence.cpp
  concordia_search_result.cpp
  matched_pattern_fragment.cpp
  concordia_searcher.cpp
-  regex_replacement.cpp
+  regex_rule.cpp
  sentence_anonymizer.cpp
  interval.cpp
  tm_matches.cpp
@ -33,10 +36,13 @@ add_subdirectory(t)

 install(TARGETS concordia DESTINATION lib/)
 install(FILES 
+          token_annotation.hpp
+          anonymized_sentence.hpp
+          hashed_sentence.hpp
          concordia_search_result.hpp
          matched_pattern_fragment.hpp
          concordia_searcher.hpp
-          regex_replacement.hpp
+          regex_rule.hpp
          sentence_anonymizer.hpp
          interval.hpp
          tm_matches.hpp
--- a/concordia/anonymized_sentence.cpp
+++ b/concordia/anonymized_sentence.cpp
@ -0,0 +1,48 @@
+#include "concordia/anonymized_sentence.hpp"
+#include "concordia/common/text_utils.hpp"
+
+#include <iostream>
+
+AnonymizedSentence::AnonymizedSentence(std::string sentence):
+                                         _sentence(sentence) {
+}
+
+AnonymizedSentence::~AnonymizedSentence() {
+}
+
+void AnonymizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
+    std::vector<TokenAnnotation>::iterator newAnnotation = annotations.begin();
+    std::list<TokenAnnotation>::iterator existingAnnotation = _tokenAnnotations.begin();
+    
+    while(newAnnotation != annotations.end()) {
+        if (existingAnnotation != _tokenAnnotations.end()) {
+            // existing annotations remain, so compare against the current one
+            if (newAnnotation->intersects(*existingAnnotation)) {
+                // The new annotation overlaps the current existing one.
+                // It is dropped without being inserted, and we move on
+                // to the next new annotation.
+                newAnnotation++;
+            } else {
+                // No overlap: what matters now is whether the new
+                // annotation starts before or after the existing one.
+                if (newAnnotation->getStart() < existingAnnotation->getStart()) {
+                    // New annotation starts before the existing one: insert it in front.
+                    _tokenAnnotations.insert(existingAnnotation, *newAnnotation);
+                    newAnnotation++;
+                } else {
+                    // New annotation starts later: advance to the next existing annotation.
+                    existingAnnotation++;
+                }
+            }
+        } else {
+            // existing annotations are exhausted, so append the new annotation
+            _tokenAnnotations.push_back(*newAnnotation);
+            newAnnotation++;
+        }
+    }
+    
+}
+
+void AnonymizedSentence::toLowerCase() {
+    _sentence = TextUtils::getInstance().toLowerCase(_sentence);
+}
--- a/concordia/anonymized_sentence.hpp
+++ b/concordia/anonymized_sentence.hpp
@ -0,0 +1,64 @@
+#ifndef ANONYMIZED_SENTENCE_HDR
+#define ANONYMIZED_SENTENCE_HDR
+
+#include "concordia/common/config.hpp"
+#include "concordia/token_annotation.hpp"
+#include <string>
+#include <vector>
+#include <list>
+
+/*!
+  A sentence after anonymization operations. The class
+  holds the current string representation of the sentence
+  along with the list of token annotations.
+*/

+class AnonymizedSentence {
+public:
+    /*!
+      Constructor.
+      \param sentence text the sentence is initialised with
+    */
+    AnonymizedSentence(std::string sentence);

+    /*! Destructor.
+    */
+    virtual ~AnonymizedSentence();

+    /*! Getter for sentence
+      \returns copy of the current sentence text
+    */
+    std::string getSentence() const {
+        return _sentence;
+    }

+    /*! Getter for annotations list
+      \returns copy of the annotations list
+    */
+    std::list<TokenAnnotation> getAnnotations() const {
+        return _tokenAnnotations;
+    }

+    /*!
+        Transform the sentence to lower case.
+    */
+    void toLowerCase();

+    /*!
+        Add new annotations to the existing annotations list. Assumptions:
+        1. the existing _tokenAnnotations list contains disjoint, sorted intervals;
+        2. the list of annotations to be added also has the above properties.
+        Only the annotations that do not intersect with any of the
+        existing ones are added; intersecting ones are dropped.

+        \param annotations list of annotations to be added
+    */
+    void addAnnotations(std::vector<TokenAnnotation> annotations);    

+private:
+    std::string _sentence;

+    std::list<TokenAnnotation> _tokenAnnotations;
+};
+
+#endif
--- a/concordia/compilation.dox
+++ b/concordia/compilation.dox
@ -30,6 +30,7 @@ On Ubuntu 14.04, the above software comes in standard packages. Here is the comp
 - libconfig++-dev
 - libconfig-dev
 - libpcre3-dev
+- libicu-dev
 - doxygen
 - texlive-font-utils

@ -39,7 +40,7 @@ If you want to install all the above packages at once, simply use the below comm

 \verbatim

-sudo apt-get install g++ cmake libboost-dev libboost-serialization-dev libboost-test-dev libboost-filesystem-dev libboost-system-dev libboost-program-options-dev libboost-iostreams-dev libboost-regex-dev libboost-locale-dev liblog4cpp5-dev libconfig++-dev libconfig-dev libpcre3-dev doxygen texlive-font-utils
+sudo apt-get install g++ cmake libboost-dev libboost-serialization-dev libboost-test-dev libboost-filesystem-dev libboost-system-dev libboost-program-options-dev libboost-iostreams-dev libboost-regex-dev libboost-locale-dev liblog4cpp5-dev libconfig++-dev libconfig-dev libpcre3-dev libicu-dev doxygen texlive-font-utils

 \endverbatim

--- a/concordia/concordia_index.cpp
+++ b/concordia/concordia_index.cpp
@ -91,7 +91,6 @@ void ConcordiaIndex::_addSingleExample(
        Utils::appendCharToSaucharVector(T, character);

        // append to markersFile
-
        SUFFIX_MARKER_TYPE marker = Utils::createMarker(
                                           example.getId(),
                                           offset,
--- a/concordia/concordia_search_result.hpp
+++ b/concordia/concordia_search_result.hpp
@ -22,7 +22,7 @@
 class ConcordiaSearchResult {
 public:
    /*! Constructor.
-      \param tokenVector tokenized patter which was used for searching
+      \param tokenVector tokenized pattern which was used for searching
    */
    explicit ConcordiaSearchResult(
                const std::vector<std::string> & tokenVector);
--- a/concordia/hash_generator.cpp
+++ b/concordia/hash_generator.cpp
@ -44,7 +44,8 @@ std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(

 std::vector<std::string> HashGenerator::generateTokenVector(
                                               const std::string & sentence) {
-    std::string anonymizedSentence = _sentenceAnonymizer->anonymize(sentence);
+    boost::shared_ptr<AnonymizedSentence> as = _sentenceAnonymizer->anonymize(sentence);
+    std::string anonymizedSentence = as->getSentence();
    boost::trim(anonymizedSentence);
    std::vector<std::string> tokenTexts;
    boost::split(tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"),
--- a/concordia/hashed_sentence.cpp
+++ b/concordia/hashed_sentence.cpp
@ -0,0 +1,7 @@
+#include "concordia/hashed_sentence.hpp"
+
+HashedSentence::HashedSentence() {
+}
+
+HashedSentence::~HashedSentence() {
+}
--- a/concordia/hashed_sentence.hpp
+++ b/concordia/hashed_sentence.hpp
@ -0,0 +1,61 @@
+#ifndef HASHED_SENTENCE_HDR
+#define HASHED_SENTENCE_HDR
+
+#include "concordia/common/config.hpp"
+#include "concordia/interval.hpp"
+#include <vector>
+#include <string>
+
+/*!
+  A sentence after hashing by the HashGenerator. The class holds
+  the list of word codes and the intervals representing the original
+  (char-based) positions of the words in the sentence.
+*/

+class HashedSentence {
+public:
+    /*!
+      Constructor. Both lists start out empty.
+    */
+    HashedSentence();

+    /*! Destructor.
+    */
+    virtual ~HashedSentence();

+    /*! Getter for original word positions list.
+      \returns copy of the original word positions list
+    */
+    std::vector<Interval> getOriginalWordPositions() const {
+        return _originalWordPositions;
+    }

+    /*! Getter for word codes list.
+      \returns copy of the word codes list
+    */
+    std::vector<INDEX_CHARACTER_TYPE> getWordCodes() const {
+        return _wordCodes;
+    }

+    /*! Method for adding a word code to the list.
+      \param wordCode word code to be added
+    */
+    void addWordCode(INDEX_CHARACTER_TYPE wordCode) {
+        _wordCodes.push_back(wordCode);
+    }

+    /*! Method for adding an original word position to the list.
+      The interval is only copied, so it is taken by const reference
+      and temporaries or const objects may be passed as well.
+      \param originalWordPosition original word position to be added
+    */
+    void addWordOriginalWordPosition(const Interval & originalWordPosition) {
+        _originalWordPositions.push_back(originalWordPosition);
+    }

+private:
+    std::vector<Interval> _originalWordPositions;

+    std::vector<INDEX_CHARACTER_TYPE> _wordCodes;
+};
+
+#endif
--- a/concordia/interval.hpp
+++ b/concordia/interval.hpp
@ -2,13 +2,14 @@
 #define INTERVAL_HDR

 #include "concordia/common/config.hpp"
+#include <iostream>

 /*!
-  Class representing interval of a sentence, i.e. a sequence of words
+  Class representing interval of a sentence, i.e. a sequence of words or chars
  coming from that sentence. An interval only has its start and end indexes,
  where the start index is inclusive and end index is exclusive. For example,
-  an interval [2,5] of the sentence "This is just for testing purposes" is:
-  "just for testing".
+  an interval [2,5) of words of the sentence "This is just for
+  testing purposes" is: "just for testing".

 */

@ -50,6 +51,9 @@ public:
        return _end;
    }

+    friend std::ostream & operator << (std::ostream & o, const Interval & interval) {
+        return o << "[" << interval.getStart() << "," << interval.getEnd() << ")";
+    }
 protected:
    SUFFIX_MARKER_TYPE _start;

--- a/concordia/regex_replacement.cpp
+++ b/concordia/regex_replacement.cpp
@ -1,44 +0,0 @@
-#include "concordia/regex_replacement.hpp"
-#include <sstream>
-#include <boost/exception/all.hpp>
-#include <boost/throw_exception.hpp>
-
-RegexReplacement::RegexReplacement(std::string patternString,
-                                   std::string replacement,
-                                   bool caseSensitive)
-                                         throw(ConcordiaException):
-                                         _replacement(replacement) {
-    try {
-        if (caseSensitive) {
-            _pattern = boost::make_u32regex(patternString);
-        } else {
-            _pattern = boost::make_u32regex(patternString,
-                                             boost::regex::icase);
-        }
-    } catch(const std::exception & e) {
-        std::stringstream ss;
-
-        ss << "Bad regex pattern: " << patternString <<
-             " Detailed info: " << e.what();
-
-        if (std::string const * extra =
-                 boost::get_error_info<my_tag_error_info>(e) ) {
-            ss << *extra;
-        }
-        throw ConcordiaException(ss.str());
-    }
-}
-
-RegexReplacement::~RegexReplacement() {
-}
-
-std::string RegexReplacement::apply(const std::string & text) {
-    try {
-        return boost::u32regex_replace(text, _pattern, _replacement,
-                        boost::match_default | boost::format_all);
-    } catch(...) {
-        throw ConcordiaException("Exception while applying replacement rule: "
-                                  +_replacement+" to text: "+text);
-    }
-}
-
--- a/concordia/regex_rule.cpp
+++ b/concordia/regex_rule.cpp
@ -0,0 +1,56 @@
+#include "concordia/regex_rule.hpp"
+#include <sstream>
+#include <iostream>
+#include <boost/exception/all.hpp>
+#include <boost/throw_exception.hpp>
+
+// Compiles the pattern of the rule and stores the value attached to it.
+// The pattern text is decoded explicitly as UTF-8, so non-ASCII patterns
+// do not depend on ICU's default codepage (which is derived from the
+// process environment).
+//   patternString  regular expression, UTF-8 encoded
+//   value          value attached to every annotation the rule produces
+//   caseSensitive  when false the pattern is compiled with boost::regex::icase
+// Throws ConcordiaException ("Bad regex pattern: ...") when the pattern
+// can not be compiled.
+RegexRule::RegexRule(std::string patternString,
+                                std::string value,
+                                bool caseSensitive)
+                                         throw(ConcordiaException):
+                                         _value(value) {
+    try {
+        const UnicodeString pattern = UnicodeString::fromUTF8(patternString);
+        if (caseSensitive) {
+            _pattern = boost::make_u32regex(pattern);
+        } else {
+            _pattern = boost::make_u32regex(pattern, boost::regex::icase);
+        }
+    } catch(const std::exception & e) {
+        std::stringstream ss;
+
+        ss << "Bad regex pattern: " << patternString <<
+             " Detailed info: " << e.what();
+
+        // append the extra error info, if the exception carries any
+        if (std::string const * extra =
+                 boost::get_error_info<my_tag_error_info>(e) ) {
+            ss << *extra;
+        }
+        throw ConcordiaException(ss.str());
+    }
+}
+
+RegexRule::~RegexRule() {
+}
+
+// Finds every match of the pattern in the sentence and hands the matches
+// to the sentence as TokenAnnotation objects (tagged 'a', carrying the
+// value of this rule). Only annotations are added here; replacing text
+// is not part of this method.
+// The sentence is decoded as UTF-8. Match offsets are therefore positions
+// in the UTF-16 (UChar) form of the sentence, not byte offsets into the
+// std::string.
+// Throws ConcordiaException when a std::exception is raised during matching.
+void RegexRule::apply(boost::shared_ptr<AnonymizedSentence> sentence) {
+    try {
+        // s has to outlive the iterators, which point into its buffer
+        UnicodeString s = UnicodeString::fromUTF8(sentence->getSentence());
+        boost::u32regex_iterator<const UChar*> begin(boost::make_u32regex_iterator(s, _pattern));
+        boost::u32regex_iterator<const UChar*> end;
+        std::vector<TokenAnnotation> annotations;
+        for (; begin != end; ++begin) {
+            // half-open interval [matchBegin, matchEnd) of the match
+            SUFFIX_MARKER_TYPE matchBegin = begin->position();
+            SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length();
+            TokenAnnotation annotation(matchBegin, matchEnd, 'a', _value);
+            annotations.push_back(annotation);
+        }
+        sentence->addAnnotations(annotations);
+    } catch(const std::exception & e) {
+        std::stringstream ss;
+        ss << "Exception while applying regex rule: "
+                          << _value << " to text: " << sentence->getSentence();
+        ss << ", message: " << e.what();
+        throw ConcordiaException(ss.str());
+    }
+}
+
--- a/concordia/regex_replacement.hpp
+++ b/concordia/regex_replacement.hpp
@ -1,24 +1,25 @@
-#ifndef REGEX_REPLACEMENT_HDR
-#define REGEX_REPLACEMENT_HDR
+#ifndef REGEX_ANNOTATION_HDR
+#define REGEX_ANNOTATION_HDR

 #include <string>
 #include "concordia/common/config.hpp"
+#include "concordia/anonymized_sentence.hpp"
 #include "concordia/concordia_exception.hpp"
 #include <boost/shared_ptr.hpp>
 #include <boost/regex.hpp>
 #include <boost/regex/icu.hpp>
-
+#include <unicode/unistr.h>


 typedef boost::error_info<struct my_tag, std::string> my_tag_error_info;

 /*!
-  Class for representing a regular expression replacement operation.
+  Class for representing a regular expression annotation rule.
  Holds regex pattern string for matching and replacement string for
-  replacing found matches.
+  annotating found matches.

 */
-class RegexReplacement {
+class RegexRule {
 public:
    /*!
      Constructor.
@ -26,24 +27,23 @@ public:
        \param replacement string to substitute the found match
        \param caseSensitive case sensitivity of the pattern
    */
-    RegexReplacement(std::string patternString, std::string replacement,
-                                              bool caseSensitive = true)
-                                               throw(ConcordiaException);
+    RegexRule(std::string patternString, std::string value,
+                                 bool caseSensitive = true)
+                                 throw(ConcordiaException);

    /*! Destructor.
    */
-    virtual ~RegexReplacement();
+    virtual ~RegexRule();

-    /*! Applies the operation on input string.
-      \param text the input string
-      \returns altered version of the input string
+    /*! Applies the operation on anonymized sentence.
+      \param sentence the input sentence
    */
-    std::string apply(const std::string & text);
+    void apply(boost::shared_ptr<AnonymizedSentence> sentence);

 private:
    boost::u32regex _pattern;

-    std::string _replacement;
+    std::string _value;
 };

 #endif
--- a/concordia/sentence_anonymizer.cpp
+++ b/concordia/sentence_anonymizer.cpp
@ -1,6 +1,5 @@
 #include "concordia/sentence_anonymizer.hpp"

-#include "concordia/common/text_utils.hpp"
 #include <boost/foreach.hpp>
 #include <fstream>
 #include <sstream>
@ -26,22 +25,24 @@ SentenceAnonymizer::SentenceAnonymizer(
 SentenceAnonymizer::~SentenceAnonymizer() {
 }

-std::string SentenceAnonymizer::anonymize(const std::string & sentence) {
-    std::string result = sentence;
+boost::shared_ptr<AnonymizedSentence>
+              SentenceAnonymizer::anonymize(const std::string & sentence) {
+    boost::shared_ptr<AnonymizedSentence> 
+                    result(new AnonymizedSentence(sentence));

-    result = _htmlTags->apply(result);
+    _htmlTags->apply(result);

-    BOOST_FOREACH(RegexReplacement & neRule, _namedEntities) {
-        result = neRule.apply(result);
+    BOOST_FOREACH(RegexRule & neRule, _namedEntities) {
+        neRule.apply(result);
    }

-    result = TextUtils::getInstance().toLowerCase(result);
+    result->toLowerCase();

    if (_stopWordsEnabled) {
-        result = _stopWords->apply(result);
+        _stopWords->apply(result);
    }
-    result = _stopSymbols->apply(result);
-    result = _spaceSymbols->apply(result);
+    _stopSymbols->apply(result);
+    _spaceSymbols->apply(result);

    return result;
 }
@ -64,7 +65,7 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
                       << " in NE file: " << namedEntitiesPath;
                    throw ConcordiaException(ss.str());
                } else {
-                    _namedEntities.push_back(RegexReplacement(
+                    _namedEntities.push_back(RegexRule(
                                tokenTexts->at(0), tokenTexts->at(1)));
                }
            }
@ -95,11 +96,11 @@ void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
    }
    tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
    tagsExpression += "br).*?>";
-    _htmlTags = boost::shared_ptr<RegexReplacement>(
-                        new RegexReplacement(tagsExpression, "", false));
+    _htmlTags = boost::shared_ptr<RegexRule>(
+                        new RegexRule(tagsExpression, "", false));
 }

-boost::shared_ptr<RegexReplacement>
+boost::shared_ptr<RegexRule>
        SentenceAnonymizer::_getMultipleReplacementRule(
            std::string & filePath, std::string replacement, bool wholeWord) {
    std::string expression = "(";
@ -126,7 +127,7 @@ boost::shared_ptr<RegexReplacement>
    }
    expression = expression.substr(0, expression.size()-1);
    expression += ")";
-    return boost::shared_ptr<RegexReplacement>(
-                        new RegexReplacement(expression, replacement, false));
+    return boost::shared_ptr<RegexRule>(
+                        new RegexRule(expression, replacement, false));
 }

--- a/concordia/sentence_anonymizer.hpp
+++ b/concordia/sentence_anonymizer.hpp
@ -4,7 +4,8 @@
 #include <string>
 #include <vector>
 #include "concordia/common/config.hpp"
-#include "concordia/regex_replacement.hpp"
+#include "concordia/anonymized_sentence.hpp"
+#include "concordia/regex_rule.hpp"
 #include "concordia/concordia_config.hpp"
 #include "concordia/concordia_exception.hpp"
 #include <boost/shared_ptr.hpp>
@ -37,29 +38,30 @@ public:
      \param sentence input sentence
      \returns altered version of the input sentence
    */
-    std::string anonymize(const std::string & sentence);
+    boost::shared_ptr<AnonymizedSentence>
+                                   anonymize(const std::string & sentence);

 private:
    void _createNeRules(std::string & namedEntitiesPath);

    void _createHtmlTagsRule(std::string & htmlTagsPath);

-    boost::shared_ptr<RegexReplacement> _getMultipleReplacementRule(
+    boost::shared_ptr<RegexRule> _getMultipleReplacementRule(
                                             std::string & filePath,
                                             std::string replacement,
                                             bool wholeWord = false);

-    std::vector<RegexReplacement> _namedEntities;
+    std::vector<RegexRule> _namedEntities;

-    boost::shared_ptr<RegexReplacement> _htmlTags;
+    boost::shared_ptr<RegexRule> _htmlTags;

    bool _stopWordsEnabled;

-    boost::shared_ptr<RegexReplacement> _stopWords;
+    boost::shared_ptr<RegexRule> _stopWords;

-    boost::shared_ptr<RegexReplacement> _stopSymbols;
+    boost::shared_ptr<RegexRule> _stopSymbols;

-    boost::shared_ptr<RegexReplacement> _spaceSymbols;
+    boost::shared_ptr<RegexRule> _spaceSymbols;
 };

 #endif
--- a/concordia/t/CMakeLists.txt
+++ b/concordia/t/CMakeLists.txt
@ -1,8 +1,9 @@
 add_library(concordia-tests
+  test_regex_rule.cpp
+  test_anonymized_sentence.cpp
  test_concordia_searcher.cpp
  test_sentence_anonymizer.cpp
  test_text_utils.cpp
-  test_regex_replacement.cpp
  test_example.cpp
  test_tm_matches.cpp
  test_interval.cpp
--- a/concordia/t/test_anonymized_sentence.cpp
+++ b/concordia/t/test_anonymized_sentence.cpp
@ -0,0 +1,86 @@
+#include "tests/unit-tests/unit_tests_globals.hpp"
+#include "concordia/anonymized_sentence.hpp"
+#include "concordia/token_annotation.hpp"
+#include "concordia/common/config.hpp"
+#include <iostream>
+
+BOOST_AUTO_TEST_SUITE(anonymized_sentence)
+
+BOOST_AUTO_TEST_CASE( AnnotationsTrivial )
+{
+    AnonymizedSentence as("This is a test sentence");
+
+    std::vector<TokenAnnotation> annotations;
+    annotations.push_back(TokenAnnotation(0,1,'a',"val"));
+    annotations.push_back(TokenAnnotation(4,6,'a',"val"));
+    annotations.push_back(TokenAnnotation(7,10,'a',"val"));
+    annotations.push_back(TokenAnnotation(12,14,'a',"val"));
+    
+    as.addAnnotations(annotations);
+        
+    BOOST_CHECK_EQUAL(as.getAnnotations().size(), 4);
+    
+}
+
+BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
+{
+    AnonymizedSentence as("This is a test sentence");
+
+    std::vector<TokenAnnotation> annotations1;
+    annotations1.push_back(TokenAnnotation(0,1,'a',"val"));
+    annotations1.push_back(TokenAnnotation(4,6,'a',"val"));
+    annotations1.push_back(TokenAnnotation(7,10,'a',"val"));
+    annotations1.push_back(TokenAnnotation(12,14,'a',"val"));
+    as.addAnnotations(annotations1);
+    /* annotations1
+    0  1  2  3  4  5  6  7  8  9 10 11 12 13 14
+    -           ----     -------       -----
+    
+    */
+    
+    std::vector<TokenAnnotation> annotations2;
+    annotations2.push_back(TokenAnnotation(1,4,'a',"val"));
+    annotations2.push_back(TokenAnnotation(4,7,'a',"val"));
+    annotations2.push_back(TokenAnnotation(10,11,'a',"val"));
+    annotations2.push_back(TokenAnnotation(11,13,'a',"val"));
+    as.addAnnotations(annotations2);
+    /* annotations2
+    0  1  2  3  4  5  6  7  8  9 10 11 12 13 14
+       -------  -------          -- -----   
+    
+    expecting:
+    0  1  2  3  4  5  6  7  8  9 10 11 12 13 14
+    -  -------  ----     ------- --    -----
+    
+    */   
+    BOOST_CHECK_EQUAL(as.getAnnotations().size(), 6);
+    std::list<TokenAnnotation> annotations = as.getAnnotations();
+    std::list<TokenAnnotation>::iterator iter = annotations.begin();
+
+    BOOST_CHECK_EQUAL(iter->getStart(),0);
+    BOOST_CHECK_EQUAL(iter->getEnd(),1);
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),1);
+    BOOST_CHECK_EQUAL(iter->getEnd(),4);
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),4);
+    BOOST_CHECK_EQUAL(iter->getEnd(),6);
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),7);
+    BOOST_CHECK_EQUAL(iter->getEnd(),10);
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),10);
+    BOOST_CHECK_EQUAL(iter->getEnd(),11);
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),12);
+    BOOST_CHECK_EQUAL(iter->getEnd(),14);
+    
+}
+
+
+BOOST_AUTO_TEST_SUITE_END()
--- a/concordia/t/test_regex_replacement.cpp
+++ b/concordia/t/test_regex_replacement.cpp
@ -1,66 +0,0 @@
-#include "tests/unit-tests/unit_tests_globals.hpp"
-#include "concordia/regex_replacement.hpp"
-#include "concordia/common/config.hpp"
-#include <boost/algorithm/string/predicate.hpp>
-#include <boost/locale.hpp>
-#include <boost/algorithm/string/case_conv.hpp>
-
-BOOST_AUTO_TEST_SUITE(regex_replacement)
-
-BOOST_AUTO_TEST_CASE( SimpleReplacement )
-{
-    RegexReplacement rr("a","b");
-    BOOST_CHECK_EQUAL(rr.apply("xxxxxxxaxxxaxxaxaxa"),"xxxxxxxbxxxbxxbxbxb");
-}
-
-BOOST_AUTO_TEST_CASE( BadRegex )
-{
-    bool exceptionThrown = false;
-    std::string message = "";
-    try {
-        RegexReplacement rr("+a","b");
-    } catch (ConcordiaException & e) {
-        exceptionThrown = true;
-        message = e.what();
-    }    
-    BOOST_CHECK_EQUAL(exceptionThrown, true);    
-    BOOST_CHECK_EQUAL(boost::starts_with(message, "Bad regex pattern"), true);    
-}
-
-BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
-{
-    RegexReplacement rr("['\"\\\\.]","");
-    BOOST_CHECK_EQUAL(rr.apply("Don't stop believin' \\ Hold on to the feelin'."),"Dont stop believin  Hold on to the feelin");
-}
-
-BOOST_AUTO_TEST_CASE( BackrefReplacement )
-{
-    RegexReplacement rr("(\\d+)","the number: \\1");
-    BOOST_CHECK_EQUAL(rr.apply("This is 12 and this is 812."),"This is the number: 12 and this is the number: 812.");
-}
-
-BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
-{
-    RegexReplacement rr("abc","xxx", false);
-    BOOST_CHECK_EQUAL(rr.apply("This is AbC and ABC and abc and aBC."),"This is xxx and xxx and xxx and xxx.");
-}
-
-BOOST_AUTO_TEST_CASE( UnicodeReplacement )
-{
-    RegexReplacement rr("ą","x");
-    BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń"),"zażółć gęślx jaźń");
-}
-
-BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
-{
-    RegexReplacement rr("ą","x", false);
-    BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"),"zażółć gęślx jaźń ZAŻÓŁĆ GĘŚLx JAŹŃ");
-}
-
-BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
-{
-    RegexReplacement rr("[ąćęłńóśżź]","x", false);
-    BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"),"zaxxxx gxxlx jaxx ZAxxxx GxxLx JAxx");
-}
-
-BOOST_AUTO_TEST_SUITE_END()
--- a/concordia/t/test_regex_rule.cpp
+++ b/concordia/t/test_regex_rule.cpp
@ -0,0 +1,221 @@
+#include "tests/unit-tests/unit_tests_globals.hpp"
+#include "concordia/regex_rule.hpp"
+#include "concordia/anonymized_sentence.hpp"
+#include "concordia/common/config.hpp"
+#include <boost/shared_ptr.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/locale.hpp>
+#include <boost/algorithm/string/case_conv.hpp>
+
+BOOST_AUTO_TEST_SUITE(regex_rule)
+
+BOOST_AUTO_TEST_CASE( SimpleReplacement )
+{
+    RegexRule rr("a","b");
+    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("xxxxxxxaxxxaxxaxaxa"));
+    rr.apply(as);
+    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    BOOST_REQUIRE_EQUAL(annotations.size(),5);  // fatal on mismatch: the list is walked unguarded below
+    std::list<TokenAnnotation>::iterator iter = annotations.begin();
+    // The test expects one annotation per 'a' in the input, in order of occurrence.
+    BOOST_CHECK_EQUAL(iter->getStart(),7);
+    BOOST_CHECK_EQUAL(iter->getEnd(),8);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),11);
+    BOOST_CHECK_EQUAL(iter->getEnd(),12);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),14);
+    BOOST_CHECK_EQUAL(iter->getEnd(),15);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),16);
+    BOOST_CHECK_EQUAL(iter->getEnd(),17);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),18);
+    BOOST_CHECK_EQUAL(iter->getEnd(),19);
+}
+
+BOOST_AUTO_TEST_CASE( BadRegex )
+{
+    std::string message;
+    bool exceptionThrown = false;
+    try {
+        RegexRule rr("+a","b");  // the test expects this pattern to be rejected at construction
+    } catch (ConcordiaException & e) {
+        message = e.what();
+        exceptionThrown = true;
+    }
+    BOOST_CHECK(exceptionThrown);
+    BOOST_CHECK(boost::starts_with(message, "Bad regex pattern"));
+}
+
+BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
+{
+    RegexRule rr("['\"\\\\.]","");
+    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
+    rr.apply(as);
+    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    BOOST_REQUIRE_EQUAL(annotations.size(),5);  // fatal on mismatch: the list is walked unguarded below
+    std::list<TokenAnnotation>::iterator iter = annotations.begin();
+    // Expected: one annotation per apostrophe, backslash or dot, in order of occurrence.
+    BOOST_CHECK_EQUAL(iter->getStart(),3);
+    BOOST_CHECK_EQUAL(iter->getEnd(),4);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),19);
+    BOOST_CHECK_EQUAL(iter->getEnd(),20);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),21);
+    BOOST_CHECK_EQUAL(iter->getEnd(),22);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),44);
+    BOOST_CHECK_EQUAL(iter->getEnd(),45);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),45);
+    BOOST_CHECK_EQUAL(iter->getEnd(),46);
+}
+
+
+BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
+{
+    RegexRule rr("abc","xxx", false);
+    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("This is AbC and ABC and abc and aBC."));
+    rr.apply(as);
+    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    BOOST_REQUIRE_EQUAL(annotations.size(),4);  // fatal on mismatch: the list is walked unguarded below
+    std::list<TokenAnnotation>::iterator iter = annotations.begin();
+    // Expected: every casing of "abc" in the input is annotated, in order of occurrence.
+    BOOST_CHECK_EQUAL(iter->getStart(),8);
+    BOOST_CHECK_EQUAL(iter->getEnd(),11);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),16);
+    BOOST_CHECK_EQUAL(iter->getEnd(),19);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),24);
+    BOOST_CHECK_EQUAL(iter->getEnd(),27);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),32);
+    BOOST_CHECK_EQUAL(iter->getEnd(),35);
+}
+
+BOOST_AUTO_TEST_CASE( UnicodeReplacement )
+{
+    RegexRule rr("ą","x");
+    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń"));
+    rr.apply(as);
+    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    BOOST_REQUIRE_EQUAL(annotations.size(),1);  // fatal on mismatch: begin() is dereferenced below
+    std::list<TokenAnnotation>::iterator iter = annotations.begin();
+    // Expected offsets are character indices ('ą' is character 11), not UTF-8 byte offsets.
+    BOOST_CHECK_EQUAL(iter->getStart(),11);
+    BOOST_CHECK_EQUAL(iter->getEnd(),12);
+}
+
+BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
+{
+    RegexRule rr("ą","x", false);
+    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
+    rr.apply(as);
+    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    BOOST_REQUIRE_EQUAL(annotations.size(),2);  // fatal on mismatch: the list is walked unguarded below
+    std::list<TokenAnnotation>::iterator iter = annotations.begin();
+    // Expected: both the lowercase 'ą' and the uppercase 'Ą' are annotated.
+    BOOST_CHECK_EQUAL(iter->getStart(),11);
+    BOOST_CHECK_EQUAL(iter->getEnd(),12);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),29);
+    BOOST_CHECK_EQUAL(iter->getEnd(),30);
+}
+
+BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
+{
+    RegexRule rr("[ąćęłńóśżź]","x", false);
+    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
+    rr.apply(as);
+    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    BOOST_REQUIRE_EQUAL(annotations.size(),18);  // fatal on mismatch: the list is walked unguarded below
+    std::list<TokenAnnotation>::iterator iter = annotations.begin();
+    // Expected: nine lowercase diacritic letters followed by their nine uppercase counterparts.
+    BOOST_CHECK_EQUAL(iter->getStart(),2);
+    BOOST_CHECK_EQUAL(iter->getEnd(),3);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),3);
+    BOOST_CHECK_EQUAL(iter->getEnd(),4);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),4);
+    BOOST_CHECK_EQUAL(iter->getEnd(),5);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),5);
+    BOOST_CHECK_EQUAL(iter->getEnd(),6);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),8);
+    BOOST_CHECK_EQUAL(iter->getEnd(),9);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),9);
+    BOOST_CHECK_EQUAL(iter->getEnd(),10);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),11);
+    BOOST_CHECK_EQUAL(iter->getEnd(),12);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),15);
+    BOOST_CHECK_EQUAL(iter->getEnd(),16);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),16);
+    BOOST_CHECK_EQUAL(iter->getEnd(),17);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),20);
+    BOOST_CHECK_EQUAL(iter->getEnd(),21);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),21);
+    BOOST_CHECK_EQUAL(iter->getEnd(),22);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),22);
+    BOOST_CHECK_EQUAL(iter->getEnd(),23);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),23);
+    BOOST_CHECK_EQUAL(iter->getEnd(),24);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),26);
+    BOOST_CHECK_EQUAL(iter->getEnd(),27);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),27);
+    BOOST_CHECK_EQUAL(iter->getEnd(),28);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),29);
+    BOOST_CHECK_EQUAL(iter->getEnd(),30);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),33);
+    BOOST_CHECK_EQUAL(iter->getEnd(),34);
+    ++iter;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),34);
+    BOOST_CHECK_EQUAL(iter->getEnd(),35);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
--- a/concordia/t/test_sentence_anonymizer.cpp
+++ b/concordia/t/test_sentence_anonymizer.cpp
@ -17,7 +17,7 @@ BOOST_AUTO_TEST_CASE( NETest )
    
    
    std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34";
-    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"date  ne_date mail  ne_email number  ne_number");
+    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"date  ne_date mail  ne_email number  ne_number");
 }

 BOOST_AUTO_TEST_CASE( HtmlTagsTest )
@ -27,7 +27,7 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
    
    
    std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
-    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"link and bold and newline ");
+    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"link and bold and newline ");
    
 }

@ -37,7 +37,7 @@ BOOST_AUTO_TEST_CASE( StopWordsTest )
    if (config->isStopWordsEnabled()) {
        SentenceAnonymizer anonymizer(config);
        std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
-        BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"  wiem   konieczne");
+        BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"  wiem   konieczne");
    }
 }

@ -48,7 +48,7 @@ BOOST_AUTO_TEST_CASE( StopSymbolsTest )
    
    
    std::string sentence = "xxx, . xxx  # xx $xx@ xx";
-    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"xxx  xxx   xx xx xx");
+    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx  xxx   xx xx xx");
    
 }

@ -59,7 +59,7 @@ BOOST_AUTO_TEST_CASE( SpaceSymbolsTest )
    
    
    std::string sentence = "xxx-xxx xx|xx";
-    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"xxx xxx xx xx");
+    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx");
    
 }

@ -69,7 +69,7 @@ BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
    SentenceAnonymizer anonymizer(config);
    
    std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
-    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"sony   dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl   dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw   ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap   rm mne_numbergrm mne_numbere   —   ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number  ");
+    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"sony   dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl   dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw   ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap   rm mne_numbergrm mne_numbere   —   ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number  ");
    
 }

--- a/concordia/token_annotation.cpp
+++ b/concordia/token_annotation.cpp
@ -0,0 +1,15 @@
+#include "concordia/token_annotation.hpp"
+
+
+TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start,
+                                 const SUFFIX_MARKER_TYPE end,
+                                 const char annotationType,
+                                 const std::string & value)
+    : Interval(start, end),  // the span itself is stored by the Interval base
+      _annotationType(annotationType),
+      _value(value) {
+}
+// The destructor body is intentionally empty.
+TokenAnnotation::~TokenAnnotation() {
+}
+
--- a/concordia/token_annotation.hpp
+++ b/concordia/token_annotation.hpp
@ -0,0 +1,53 @@
+#ifndef TOKEN_ANNOTATION_HDR
+#define TOKEN_ANNOTATION_HDR
+
+#include "concordia/common/config.hpp"
+#include "concordia/interval.hpp"
+
+#include <string>
+
+/*!
+  Class representing the annotation of a char sequence as a token.
+  It is a type of interval that also stores information
+  about the annotation type and value.
+
+*/
+
+class TokenAnnotation : public Interval {
+public:
+    /*! Constructor.
+      \param start start index of the annotation (char-level, 0-based)
+      \param end end index of the annotation (char-level, 0-based)
+      \param annotationType annotation type
+      \param value annotation value
+    */
+    TokenAnnotation(const SUFFIX_MARKER_TYPE start,
+                    const SUFFIX_MARKER_TYPE end,
+                    const char annotationType,
+                    const std::string & value);
+
+    /*! Destructor.
+    */
+    virtual ~TokenAnnotation();
+
+    /*! Getter for annotation type.
+      \returns annotation type, as given to the constructor
+    */
+    char getType() const {
+        return _annotationType;
+    }
+
+    /*! Getter for annotation value.
+      \returns a copy of the annotation value
+    */
+    std::string getValue() const {
+        return _value;
+    }
+
+protected:
+    char _annotationType;  //!< annotation type, set once by the constructor
+
+    std::string _value;  //!< annotation value, set once by the constructor
+};
+
+#endif
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -17,6 +17,7 @@ add_executable(first first.cpp)
 target_link_libraries(first concordia)
 target_link_libraries(first config++)
 target_link_libraries(first log4cpp)
+target_link_libraries(first icui18n)
 target_link_libraries(first ${Boost_LIBRARIES})
 target_link_libraries(first divsufsort)
 target_link_libraries(first utf8case)
@ -27,6 +28,7 @@ add_executable(simple_search simple_search.cpp)
 target_link_libraries(simple_search concordia)
 target_link_libraries(simple_search config++)
 target_link_libraries(simple_search log4cpp)
+target_link_libraries(simple_search icui18n)
 target_link_libraries(simple_search ${Boost_LIBRARIES})
 target_link_libraries(simple_search divsufsort)
 target_link_libraries(simple_search utf8case)
@ -38,6 +40,7 @@ add_executable(concordia_search concordia_search.cpp)
 target_link_libraries(concordia_search concordia)
 target_link_libraries(concordia_search config++)
 target_link_libraries(concordia_search log4cpp)
+target_link_libraries(concordia_search icui18n)
 target_link_libraries(concordia_search ${Boost_LIBRARIES})
 target_link_libraries(concordia_search divsufsort)
 target_link_libraries(concordia_search utf8case)