tokenizer in progress

2015-06-25 10:12:51 +02:00 · 2015-06-25 10:12:51 +02:00 · 8432dd321f
commit 8432dd321f
parent 0baf3e4ef2
35 changed files with 243 additions and 338 deletions
--- a/TODO.txt
+++ b/TODO.txt
@ -1,6 +1,7 @@
 ---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) -----------------------------

- implement tokenAnnotations vector as interval tree
+- work on word regex pattern (allow for some symbols and digits within the word)
+- document the code (classes, cfg files) and update tutorial
 IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? Należy przechowywać oryginalne pozycje tokenów w bazie danych concordia-server. Pozycje te będą obliczane przez funkcję generateTokenVector (przy użyciu listy oryginalnych pozycji, która będzie modyfikowana synchronicznie z każdą anonimizacją)
 - concordia_search_result nie musi przechowywać całego tokenVector (bo używa go tylko do odczytania size()).
 - wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń.
@ -11,6 +12,7 @@ IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieś


 ---------------------------- Archive -----------------------------
+DONE - implement tokenAnnotations vector as interval tree (not interval tree, but list, which is even better)
 DONE (IT IS GOOD AS IT IS) - mess with gcc performance optimization options (https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html)
 DONE - concordia-server (zastanowić się, czy nie napisać CAT-a oraz nad tym, czy nie oddzielić projektu concordia-server).
 DONE - document the code
--- a/concordia/CMakeLists.txt
+++ b/concordia/CMakeLists.txt
@ -7,13 +7,13 @@ endforeach(dir)

 add_library(concordia SHARED
  token_annotation.cpp
-  anonymized_sentence.cpp
+  tokenized_sentence.cpp
  hashed_sentence.cpp
  concordia_search_result.cpp
  matched_pattern_fragment.cpp
  concordia_searcher.cpp
  regex_rule.cpp
-  sentence_anonymizer.cpp
+  sentence_tokenizer.cpp
  interval.cpp
  tm_matches.cpp
  anubis_search_result.cpp
@ -37,13 +37,13 @@ add_subdirectory(t)
 install(TARGETS concordia DESTINATION lib/)
 install(FILES 
          token_annotation.hpp
-          anonymized_sentence.hpp
+          tokenized_sentence.hpp
          hashed_sentence.hpp
          concordia_search_result.hpp
          matched_pattern_fragment.hpp
          concordia_searcher.hpp
          regex_rule.hpp
-          sentence_anonymizer.hpp
+          sentence_tokenizer.hpp
          interval.hpp
          tm_matches.hpp
          anubis_search_result.hpp
--- a/concordia/concordia_config.cpp
+++ b/concordia/concordia_config.cpp
@ -9,11 +9,9 @@
 #define MARKERS_PARAM "markers_path"
 #define SUFFIX_ARRAY_PARAM "suffix_array_path"
 #define HTML_TAGS_PARAM "html_tags_path"
-#define SPACE_SYMBOLS_PARAM "space_symbols_path"
 #define STOP_WORDS_ENABLED_PARAM "stop_words_enabled"
 #define STOP_WORDS_PARAM "stop_words_path"
 #define NAMED_ENTITIES_PARAM "named_entities_path"
-#define STOP_SYMBOLS_PARAM "stop_symbols_path"
 #define ANUBIS_THRESHOLD_PARAM "anubis_threshold"

 ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
@ -35,8 +33,6 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
          ConcordiaConfig::_readConfigParameterStr(MARKERS_PARAM);
    _htmlTagsFilePath =
          ConcordiaConfig::_readConfigParameterStr(HTML_TAGS_PARAM);
-    _spaceSymbolsFilePath =
-          ConcordiaConfig::_readConfigParameterStr(SPACE_SYMBOLS_PARAM);
    _stopWordsEnabled =
          ConcordiaConfig::_readConfigParameterStr(
                           STOP_WORDS_ENABLED_PARAM) != "false";
@ -44,8 +40,6 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
          ConcordiaConfig::_readConfigParameterStr(STOP_WORDS_PARAM, "");
    _namedEntitiesFilePath =
          ConcordiaConfig::_readConfigParameterStr(NAMED_ENTITIES_PARAM);
-    _stopSymbolsFilePath =
-          ConcordiaConfig::_readConfigParameterStr(STOP_SYMBOLS_PARAM);
    _anubisThreshold = atof(ConcordiaConfig::_readConfigParameterStr(
                                            ANUBIS_THRESHOLD_PARAM,
                                            "0.3").c_str());
--- a/concordia/concordia_config.hpp
+++ b/concordia/concordia_config.hpp
@ -56,14 +56,6 @@ public:
        return _htmlTagsFilePath;
    }

-    /*! Getter for space symbols file path.
-        For more information see \ref tutorial3.
-     \returns space symbols file path
-    */
-    std::string & getSpaceSymbolsFilePath() {
-        return _spaceSymbolsFilePath;
-    }
-
    /*! Getter for stop words enabled parameter.
        For more information see \ref tutorial3.
     \returns true if stop words are enabled
@ -88,14 +80,6 @@ public:
        return _namedEntitiesFilePath;
    }

-    /*! Getter for stop symbols file path.
-        For more information see \ref tutorial3.
-     \returns stop symbols file path
-    */
-    std::string & getStopSymbolsFilePath() {
-        return _stopSymbolsFilePath;
-    }
-
    /*! Getter for anubis threshold. Anubis search results with
        scores below that threshold will be discarded.        
     \returns anubis threshold
@ -115,16 +99,12 @@ private:

    std::string _htmlTagsFilePath;

-    std::string _spaceSymbolsFilePath;
-
    bool _stopWordsEnabled;

    std::string _stopWordsFilePath;

    std::string _namedEntitiesFilePath;

-    std::string _stopSymbolsFilePath;
-
    double _anubisThreshold;

    std::string _readConfigParameterStr(const std::string & name)
--- a/concordia/hash_generator.cpp
+++ b/concordia/hash_generator.cpp
@ -12,8 +12,8 @@ HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
                                         throw(ConcordiaException) :
    _wordMapFilePath(config->getWordMapFilePath()),
    _wordMap(boost::shared_ptr<WordMap>(new WordMap)),
-    _sentenceAnonymizer(boost::shared_ptr<SentenceAnonymizer>(
-                                    new SentenceAnonymizer(config))) {
+    _sentenceTokenizer(boost::shared_ptr<SentenceTokenizer>(
+                                    new SentenceTokenizer(config))) {
    if (boost::filesystem::exists(_wordMapFilePath)) {
        std::ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary);
        boost::archive::binary_iarchive ia(ifs);
@ -44,11 +44,11 @@ std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(

 std::vector<std::string> HashGenerator::generateTokenVector(
                                               const std::string & sentence) {
-    boost::shared_ptr<AnonymizedSentence> as = _sentenceAnonymizer->anonymize(sentence);
-    std::string anonymizedSentence = as->getSentence();
-    boost::trim(anonymizedSentence);
+    boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
+    std::string tokenizedSentence = ts->getSentence();
+    boost::trim(tokenizedSentence);
    std::vector<std::string> tokenTexts;
-    boost::split(tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"),
+    boost::split(tokenTexts, tokenizedSentence, boost::is_any_of(" \t\r\n"),
                 boost::algorithm::token_compress_on);
    return tokenTexts;
 }
--- a/concordia/hash_generator.hpp
+++ b/concordia/hash_generator.hpp
@ -8,7 +8,7 @@
 #include <boost/algorithm/string/predicate.hpp>
 #include "concordia/word_map.hpp"
 #include "concordia/common/config.hpp"
-#include "concordia/sentence_anonymizer.hpp"
+#include "concordia/sentence_tokenizer.hpp"
 #include "concordia/concordia_config.hpp"
 #include "concordia/concordia_exception.hpp"

@ -71,7 +71,7 @@ public:
 private:
    boost::shared_ptr<WordMap> _wordMap;

-    boost::shared_ptr<SentenceAnonymizer> _sentenceAnonymizer;
+    boost::shared_ptr<SentenceTokenizer> _sentenceTokenizer;

    std::string _wordMapFilePath;
 };
--- a/concordia/hashed_sentence.hpp
+++ b/concordia/hashed_sentence.hpp
@ -48,7 +48,7 @@ public:
    /*! Method for adding an original word position to the list.
      \param original word position
    */
-    void addWordOriginalWordPosition(Interval & originalWordPosition) {
+    void addOriginalWordPosition(Interval & originalWordPosition) {
        _originalWordPositions.push_back(originalWordPosition);
    }

--- a/concordia/regex_rule.cpp
+++ b/concordia/regex_rule.cpp
@ -5,10 +5,12 @@
 #include <boost/throw_exception.hpp>

 RegexRule::RegexRule(std::string patternString,
-                                std::string value,
-                                bool caseSensitive)
-                                         throw(ConcordiaException):
-                                         _value(value) {
+                     char annotationType,
+                     std::string value,
+                     bool caseSensitive)
+                             throw(ConcordiaException):
+                               _annotationType(annotationType),
+                               _value(value)                  {
    try {
        if (caseSensitive) {
            _pattern = boost::make_u32regex(UnicodeString(patternString.c_str()));
@ -32,7 +34,7 @@ RegexRule::RegexRule(std::string patternString,
 RegexRule::~RegexRule() {
 }

-void RegexRule::apply(boost::shared_ptr<AnonymizedSentence> sentence) {
+void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
    try {
        UnicodeString s(sentence->getSentence().c_str());
        boost::u32regex_iterator<const UChar*> begin(boost::make_u32regex_iterator(s, _pattern));
@ -41,14 +43,14 @@ void RegexRule::apply(boost::shared_ptr<AnonymizedSentence> sentence) {
        for (; begin != end; ++begin) {
            SUFFIX_MARKER_TYPE matchBegin = begin->position();
            SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length();
-            TokenAnnotation annotation(matchBegin, matchEnd, 'a', _value);
+            TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, "");
            annotations.push_back(annotation);
        }
        sentence->addAnnotations(annotations);
    } catch(const std::exception & e) {
        std::stringstream ss;
        ss << "Exception while applying regex rule: "
-                          << _value << " to text: " << sentence->getSentence();
+                          << _annotationType << " to text: " << sentence->getSentence();
        ss << ", message: " << e.what();
        throw ConcordiaException(ss.str());
    }
--- a/concordia/regex_rule.hpp
+++ b/concordia/regex_rule.hpp
@ -3,7 +3,7 @@

 #include <string>
 #include "concordia/common/config.hpp"
-#include "concordia/anonymized_sentence.hpp"
+#include "concordia/tokenized_sentence.hpp"
 #include "concordia/concordia_exception.hpp"
 #include <boost/shared_ptr.hpp>
 #include <boost/regex.hpp>
@ -24,12 +24,14 @@ public:
    /*!
      Constructor.
        \param patternString regex pattern to match
-        \param replacement string to substitute the found match
+        \param annotationType type of annotation
        \param caseSensitive case sensitivity of the pattern
    */
-    RegexRule(std::string patternString, std::string value,
-                                 bool caseSensitive = true)
-                                 throw(ConcordiaException);
+    RegexRule(std::string patternString,
+              char annotationType,
+              std::string value,
+              bool caseSensitive = true)
+              throw(ConcordiaException);

    /*! Destructor.
    */
@ -38,12 +40,14 @@ public:
    /*! Applies the operation on tokenized sentence.
      \param sentence the input sentence
    */
-    void apply(boost::shared_ptr<AnonymizedSentence> sentence);
+    void apply(boost::shared_ptr<TokenizedSentence> sentence);

 private:
-    boost::u32regex _pattern;
+    char _annotationType;

    std::string _value;
+    
+    boost::u32regex _pattern;
 };

 #endif
--- a/concordia/sentence_anonymizer.cpp
+++ b/concordia/sentence_anonymizer.cpp
@ -1,4 +1,5 @@
-#include "concordia/sentence_anonymizer.hpp"
+#include "concordia/sentence_tokenizer.hpp"
+#include "concordia/token_annotation.hpp"

 #include <boost/foreach.hpp>
 #include <fstream>
@ -6,29 +7,27 @@
 #include <iostream>
 #include <boost/algorithm/string.hpp>

-SentenceAnonymizer::SentenceAnonymizer(
+SentenceTokenizer::SentenceTokenizer(
                        boost::shared_ptr<ConcordiaConfig> config)
                                         throw(ConcordiaException) {
    _createNeRules(config->getNamedEntitiesFilePath());
    _createHtmlTagsRule(config->getHtmlTagsFilePath());
    _stopWordsEnabled = config->isStopWordsEnabled();
    if (_stopWordsEnabled) {
-        _stopWords = _getMultipleReplacementRule(
-                                  config->getStopWordsFilePath(), "", true);
+        _stopWords = _getMultipleRegexRule(
+                                  config->getStopWordsFilePath(),
+                                  TokenAnnotation::STOP_WORD_TYPE,
+                                  "", true);
    }
-    _stopSymbols = _getMultipleReplacementRule(
-                              config->getStopSymbolsFilePath(), "");
-    _spaceSymbols = _getMultipleReplacementRule(
-                              config->getSpaceSymbolsFilePath(), " ");
 }

-SentenceAnonymizer::~SentenceAnonymizer() {
+SentenceTokenizer::~SentenceTokenizer() {
 }

-boost::shared_ptr<AnonymizedSentence>
-              SentenceAnonymizer::anonymize(const std::string & sentence) {
-    boost::shared_ptr<AnonymizedSentence> 
-                    result(new AnonymizedSentence(sentence));
+boost::shared_ptr<TokenizedSentence>
+              SentenceTokenizer::tokenize(const std::string & sentence) {
+    boost::shared_ptr<TokenizedSentence> 
+                    result(new TokenizedSentence(sentence));

    _htmlTags->apply(result);

@ -41,13 +40,14 @@ boost::shared_ptr<AnonymizedSentence>
    if (_stopWordsEnabled) {
        _stopWords->apply(result);
    }
-    _stopSymbols->apply(result);
-    _spaceSymbols->apply(result);
+    
+    boost::shared_ptr<RegexRule> wordsRule(
+                        new RegexRule("\\w+", TokenAnnotation::WORD_TYPE, "word"));

    return result;
 }

-void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
+void SentenceTokenizer::_createNeRules(std::string & namedEntitiesPath) {
    if (boost::filesystem::exists(namedEntitiesPath)) {
        std::string line;
        std::ifstream neFile(namedEntitiesPath.c_str());
@ -66,7 +66,9 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
                    throw ConcordiaException(ss.str());
                } else {
                    _namedEntities.push_back(RegexRule(
-                                tokenTexts->at(0), tokenTexts->at(1)));
+                                tokenTexts->at(0),
+                                TokenAnnotation::NE_TYPE,
+                                tokenTexts->at(1)));
                }
            }
            neFile.close();
@ -78,7 +80,7 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
    }
 }

-void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
+void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
    std::string tagsExpression = "<\\/?(";
    if (boost::filesystem::exists(htmlTagsPath)) {
        std::string line;
@ -97,12 +99,15 @@ void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
    tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
    tagsExpression += "br).*?>";
    _htmlTags = boost::shared_ptr<RegexRule>(
-                        new RegexRule(tagsExpression, "", false));
+                        new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG_TYPE, "", false));
 }

 boost::shared_ptr<RegexRule>
-        SentenceAnonymizer::_getMultipleReplacementRule(
-            std::string & filePath, std::string replacement, bool wholeWord) {
+        SentenceTokenizer::_getMultipleRegexRule(
+            std::string filePath,
+            char annotationType,
+            std::string value,
+            bool wholeWord) {
    std::string expression = "(";
    if (boost::filesystem::exists(filePath)) {
        std::string line;
@ -128,6 +133,6 @@ boost::shared_ptr<RegexRule>
    expression = expression.substr(0, expression.size()-1);
    expression += ")";
    return boost::shared_ptr<RegexRule>(
-                        new RegexRule(expression, replacement, false));
+                        new RegexRule(expression, annotationType, value, false));
 }

--- a/concordia/sentence_anonymizer.hpp
+++ b/concordia/sentence_anonymizer.hpp
@ -1,10 +1,10 @@
-#ifndef SENTENCE_ANONYMIZER_HDR
-#define SENTENCE_ANONYMIZER_HDR
+#ifndef SENTENCE_TOKENIZER_HDR
+#define SENTENCE_TOKENIZER_HDR

 #include <string>
 #include <vector>
 #include "concordia/common/config.hpp"
-#include "concordia/anonymized_sentence.hpp"
+#include "concordia/tokenized_sentence.hpp"
 #include "concordia/regex_rule.hpp"
 #include "concordia/concordia_config.hpp"
 #include "concordia/concordia_exception.hpp"
@ -13,42 +13,42 @@


 /*!
-  Class for anonymizing sentence before generating hash.
+  Class for tokenizing sentence before generating hash.
  This operation is used to
  remove unnecessary symbols and possibly words from sentences added to index
-  and search patterns. Anonymizer removes html tags, substitutes predefined symbols
-  with a single space, removes stop words (if the option is enabled), as well as
-  named entities and special symbols. All these have to be listed in files
+  and search patterns. Tokenizer annotates html tags, removes stop words (if the option is enabled),
+  as well as annotates named entities and special symbols. All these have to be listed in files
  (see \ref tutorial3).
 */

-class SentenceAnonymizer {
+class SentenceTokenizer {
 public:
    /*! Constructor.
      \param config config object, holding paths to necessary files
    */
-    explicit SentenceAnonymizer(boost::shared_ptr<ConcordiaConfig> config)
+    explicit SentenceTokenizer(boost::shared_ptr<ConcordiaConfig> config)
                                                 throw(ConcordiaException);

    /*! Destructor.
    */
-    virtual ~SentenceAnonymizer();
+    virtual ~SentenceTokenizer();

-    /*! Anonymizes the sentence.
+    /*! Tokenizes the sentence.
      \param sentence input sentence
      \returns altered version of the input sentence
    */
-    boost::shared_ptr<AnonymizedSentence>
-                                   anonymize(const std::string & sentence);
+    boost::shared_ptr<TokenizedSentence>
+                                   tokenize(const std::string & sentence);

 private:
    void _createNeRules(std::string & namedEntitiesPath);

    void _createHtmlTagsRule(std::string & htmlTagsPath);

-    boost::shared_ptr<RegexRule> _getMultipleReplacementRule(
-                                             std::string & filePath,
-                                             std::string replacement,
+    boost::shared_ptr<RegexRule> _getMultipleRegexRule(
+                                             std::string filePath,
+                                             char annotationType,
+                                             std::string value,
                                             bool wholeWord = false);

    std::vector<RegexRule> _namedEntities;
@ -59,9 +59,6 @@ private:

    boost::shared_ptr<RegexRule> _stopWords;

-    boost::shared_ptr<RegexRule> _stopSymbols;
-
-    boost::shared_ptr<RegexRule> _spaceSymbols;
 };

 #endif
--- a/concordia/t/CMakeLists.txt
+++ b/concordia/t/CMakeLists.txt
@ -1,8 +1,8 @@
 add_library(concordia-tests
  test_regex_rule.cpp
-  test_anonymized_sentence.cpp
+  test_tokenized_sentence.cpp
  test_concordia_searcher.cpp
-  test_sentence_anonymizer.cpp
+  test_sentence_tokenizer.cpp
  test_text_utils.cpp
  test_example.cpp
  test_tm_matches.cpp
--- a/concordia/t/test_concordia_config.cpp
+++ b/concordia/t/test_concordia_config.cpp
@ -16,10 +16,8 @@ BOOST_AUTO_TEST_CASE( ConfigParameters )
    BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "/tmp/hi.bin" );
    BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "/tmp/ma.bin" );
    BOOST_CHECK_EQUAL( config.getHtmlTagsFilePath() , "/tmp/html_tags.txt" );
-    BOOST_CHECK_EQUAL( config.getSpaceSymbolsFilePath() , "/tmp/space_symbols.txt" );
    BOOST_CHECK_EQUAL( config.getStopWordsFilePath() , "/tmp/stop_words.txt" );
    BOOST_CHECK_EQUAL( config.getNamedEntitiesFilePath() , "/tmp/named_entities.txt" );
-    BOOST_CHECK_EQUAL( config.getStopSymbolsFilePath() , "/tmp/stop_symbols.txt" );
 }

 BOOST_AUTO_TEST_CASE( NonexistentConfigTest )
--- a/concordia/t/test_regex_rule.cpp
+++ b/concordia/t/test_regex_rule.cpp
@ -1,6 +1,7 @@
 #include "tests/unit-tests/unit_tests_globals.hpp"
 #include "concordia/regex_rule.hpp"
-#include "concordia/anonymized_sentence.hpp"
+#include "concordia/tokenized_sentence.hpp"
+#include "concordia/token_annotation.hpp"
 #include "concordia/common/config.hpp"
 #include <boost/shared_ptr.hpp>
 #include <boost/algorithm/string/predicate.hpp>
@ -11,11 +12,11 @@ BOOST_AUTO_TEST_SUITE(regex_rule)

 BOOST_AUTO_TEST_CASE( SimpleReplacement )
 {
-    RegexRule rr("a","b");
-    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("xxxxxxxaxxxaxxaxaxa"));
-    rr.apply(as);    
-    BOOST_CHECK_EQUAL(as->getAnnotations().size(),5);
-    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    RegexRule rr("a", TokenAnnotation::WORD_TYPE, "b");
+    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("xxxxxxxaxxxaxxaxaxa"));
+    rr.apply(ts);    
+    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();

    BOOST_CHECK_EQUAL(iter->getStart(),7);
@ -43,7 +44,7 @@ BOOST_AUTO_TEST_CASE( BadRegex )
    bool exceptionThrown = false;
    std::string message = "";
    try {
-        RegexRule rr("+a","b");
+        RegexRule rr("+a",TokenAnnotation::WORD_TYPE, "b");
    } catch (ConcordiaException & e) {
        exceptionThrown = true;
        message = e.what();
@ -54,11 +55,11 @@ BOOST_AUTO_TEST_CASE( BadRegex )

 BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
 {
-    RegexRule rr("['\"\\\\.]","");
-    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
-    rr.apply(as);
-    BOOST_CHECK_EQUAL(as->getAnnotations().size(),5);
-    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD_TYPE, "");
+    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
+    rr.apply(ts);
+    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();

    BOOST_CHECK_EQUAL(iter->getStart(),3);
@ -84,11 +85,11 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )

 BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
 {
-    RegexRule rr("abc","xxx", false);
-    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("This is AbC and ABC and abc and aBC."));
-    rr.apply(as);
-    BOOST_CHECK_EQUAL(as->getAnnotations().size(),4);
-    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    RegexRule rr("abc", TokenAnnotation::WORD_TYPE, "xxx", false);
+    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("This is AbC and ABC and abc and aBC."));
+    rr.apply(ts);
+    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),4);
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();

    BOOST_CHECK_EQUAL(iter->getStart(),8);
@ -109,11 +110,11 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )

 BOOST_AUTO_TEST_CASE( UnicodeReplacement )
 {
-    RegexRule rr("ą","x");
-    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń"));
-    rr.apply(as);
-    BOOST_CHECK_EQUAL(as->getAnnotations().size(),1);
-    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x");
+    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń"));
+    rr.apply(ts);
+    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),1);
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();

    BOOST_CHECK_EQUAL(iter->getStart(),11);
@ -122,11 +123,11 @@ BOOST_AUTO_TEST_CASE( UnicodeReplacement )

 BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
 {
-    RegexRule rr("ą","x", false);
-    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
-    rr.apply(as);
-    BOOST_CHECK_EQUAL(as->getAnnotations().size(),2);
-    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x", false);
+    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
+    rr.apply(ts);
+    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),2);
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();

    BOOST_CHECK_EQUAL(iter->getStart(),11);
@ -139,11 +140,11 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )

 BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
 {
-    RegexRule rr("[ąćęłńóśżź]","x", false);
-    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
-    rr.apply(as);
-    BOOST_CHECK_EQUAL(as->getAnnotations().size(),18);
-    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD_TYPE, "x", false);
+    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
+    rr.apply(ts);
+    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),18);
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();

    BOOST_CHECK_EQUAL(iter->getStart(),2);
--- a/concordia/t/test_sentence_anonymizer.cpp
+++ b/concordia/t/test_sentence_anonymizer.cpp
@ -1,76 +0,0 @@
-#include <boost/filesystem.hpp>
-#include "tests/unit-tests/unit_tests_globals.hpp"
-#include <string>
-#include <sstream>
-
-#include <boost/shared_ptr.hpp>
-#include "concordia/common/config.hpp"
-#include "concordia/sentence_anonymizer.hpp"
-#include "tests/common/test_resources_manager.hpp"
-
-BOOST_AUTO_TEST_SUITE(sentence_anonymizer)
-
-BOOST_AUTO_TEST_CASE( NETest )
-{
-    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
-    SentenceAnonymizer anonymizer(config);
-    
-    
-    std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34";
-    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"date  ne_date mail  ne_email number  ne_number");
-}
-
-BOOST_AUTO_TEST_CASE( HtmlTagsTest )
-{
-    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
-    SentenceAnonymizer anonymizer(config);
-    
-    
-    std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
-    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"link and bold and newline ");
-    
-}
-
-BOOST_AUTO_TEST_CASE( StopWordsTest )
-{
-    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
-    if (config->isStopWordsEnabled()) {
-        SentenceAnonymizer anonymizer(config);
-        std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
-        BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"  wiem   konieczne");
-    }
-}
-
-BOOST_AUTO_TEST_CASE( StopSymbolsTest )
-{
-    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
-    SentenceAnonymizer anonymizer(config);
-    
-    
-    std::string sentence = "xxx, . xxx  # xx $xx@ xx";
-    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx  xxx   xx xx xx");
-    
-}
-
-BOOST_AUTO_TEST_CASE( SpaceSymbolsTest )
-{
-    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
-    SentenceAnonymizer anonymizer(config);
-    
-    
-    std::string sentence = "xxx-xxx xx|xx";
-    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx");
-    
-}
-
-BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
-{
-    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
-    SentenceAnonymizer anonymizer(config);
-    
-    std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
-    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"sony   dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl   dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw   ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap   rm mne_numbergrm mne_numbere   —   ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number  ");
-    
-}
-
-BOOST_AUTO_TEST_SUITE_END()
--- a/concordia/t/test_sentence_tokenizer.cpp
+++ b/concordia/t/test_sentence_tokenizer.cpp
@ -0,0 +1,89 @@
+#include <boost/filesystem.hpp>
+#include "tests/unit-tests/unit_tests_globals.hpp"
+#include <string>
+#include <sstream>
+#include <iostream>
+
+#include <boost/shared_ptr.hpp>
+#include <boost/foreach.hpp>
+#include "concordia/common/config.hpp"
+#include "concordia/sentence_tokenizer.hpp"
+#include "concordia/tokenized_sentence.hpp"
+#include "tests/common/test_resources_manager.hpp"
+
+BOOST_AUTO_TEST_SUITE(sentence_tokenizer)
+
+BOOST_AUTO_TEST_CASE( NETest )
+{
+    // Tokenizes a sentence holding a date, an e-mail address and a number,
+    // checks the number of annotations produced and dumps them for inspection.
+    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
+    SentenceTokenizer tokenizer(config);
+
+    std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34";
+    boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+
+    // Actual value first, expected second; the unsigned literal avoids a
+    // signed/unsigned comparison against size().
+    BOOST_CHECK_EQUAL(annotations.size(), 8u);
+
+    // Iterate by reference so that no annotation is copied per iteration.
+    // The annotation type is a small numeric code (e.g. TokenAnnotation::NE_TYPE),
+    // so it is printed as an integer instead of a raw, possibly unprintable char.
+    BOOST_FOREACH(TokenAnnotation & annotation, annotations) {
+        std::cout << annotation.getStart() << ","
+                  << annotation.getEnd() << " type: "
+                  << static_cast<int>(annotation.getType()) << " value: "
+                  << annotation.getValue() << std::endl;
+    }
+    // TODO: assert on the tokenized sentence text once the expected output is settled:
+//    BOOST_CHECK_EQUAL(,"date  ne_date mail  ne_email number  ne_number");
+}
+
+BOOST_AUTO_TEST_CASE( HtmlTagsTest )
+{
+    // HTML markup is expected to be dropped, leaving only the text between the tags.
+    boost::shared_ptr<ConcordiaConfig> cfg(
+        new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
+    SentenceTokenizer htmlTokenizer(cfg);
+
+    std::string markup = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
+    boost::shared_ptr<TokenizedSentence> tokenized = htmlTokenizer.tokenize(markup);
+    BOOST_CHECK_EQUAL(tokenized->getSentence(),"link and bold and newline ");
+}
+
+BOOST_AUTO_TEST_CASE( StopWordsTest )
+{
+    boost::shared_ptr<ConcordiaConfig> cfg(
+        new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
+    // Nothing to verify when the loaded configuration has stop words switched off.
+    if (!cfg->isStopWordsEnabled()) {
+        return;
+    }
+
+    SentenceTokenizer stopWordTokenizer(cfg);
+    std::string polishSentence = "Aczkolwiek nie wiem, czy to konieczne";
+    boost::shared_ptr<TokenizedSentence> tokenized = stopWordTokenizer.tokenize(polishSentence);
+    BOOST_CHECK_EQUAL(tokenized->getSentence(),"  wiem   konieczne");
+}
+
+BOOST_AUTO_TEST_CASE( StopSymbolsTest )
+{
+    // Punctuation and special symbols are expected to vanish from the sentence text.
+    boost::shared_ptr<ConcordiaConfig> cfg(
+        new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
+    SentenceTokenizer symbolTokenizer(cfg);
+
+    std::string withSymbols = "xxx, . xxx  # xx $xx@ xx";
+    boost::shared_ptr<TokenizedSentence> tokenized = symbolTokenizer.tokenize(withSymbols);
+    BOOST_CHECK_EQUAL(tokenized->getSentence(),"xxx  xxx   xx xx xx");
+}
+
+BOOST_AUTO_TEST_CASE( SpaceSymbolsTest )
+{
+    // Separators such as '-' and '|' are expected to come out as single spaces.
+    boost::shared_ptr<ConcordiaConfig> cfg(
+        new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
+    SentenceTokenizer separatorTokenizer(cfg);
+
+    std::string withSeparators = "xxx-xxx xx|xx";
+    boost::shared_ptr<TokenizedSentence> tokenized = separatorTokenizer.tokenize(withSeparators);
+    BOOST_CHECK_EQUAL(tokenized->getSentence(),"xxx xxx xx xx");
+}
+
+BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
+{
+    // Stress case: one long sentence made of concatenated product codes.
+    // As the expected string shows, the text is lower-cased, digit runs become
+    // ne_number and the separators ('|', '-', '/') become spaces.
+    boost::shared_ptr<ConcordiaConfig> cfg(
+        new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
+    SentenceTokenizer catalogTokenizer(cfg);
+
+    std::string productCodes = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
+    boost::shared_ptr<TokenizedSentence> tokenized = catalogTokenizer.tokenize(productCodes);
+    BOOST_CHECK_EQUAL(tokenized->getSentence(),"sony   dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl   dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw   ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap   rm mne_numbergrm mne_numbere   —   ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number  ");
+}
+
+BOOST_AUTO_TEST_SUITE_END()
--- a/concordia/t/test_anonymized_sentence.cpp
+++ b/concordia/t/test_anonymized_sentence.cpp
@ -1,14 +1,14 @@
 #include "tests/unit-tests/unit_tests_globals.hpp"
-#include "concordia/anonymized_sentence.hpp"
+#include "concordia/tokenized_sentence.hpp"
 #include "concordia/token_annotation.hpp"
 #include "concordia/common/config.hpp"
 #include <iostream>

-BOOST_AUTO_TEST_SUITE(anonymized_sentence)
+BOOST_AUTO_TEST_SUITE(tokenized_sentence)

 BOOST_AUTO_TEST_CASE( AnnotationsTrivial )
 {
-    AnonymizedSentence as("This is a test sentence");
+    TokenizedSentence ts("This is a test sentence");

    std::vector<TokenAnnotation> annotations;
    annotations.push_back(TokenAnnotation(0,1,'a',"val"));
@ -16,22 +16,22 @@ BOOST_AUTO_TEST_CASE( AnnotationsTrivial )
    annotations.push_back(TokenAnnotation(7,10,'a',"val"));
    annotations.push_back(TokenAnnotation(12,14,'a',"val"));
    
-    as.addAnnotations(annotations);
+    ts.addAnnotations(annotations);
        
-    BOOST_CHECK_EQUAL(as.getAnnotations().size(), 4);
+    BOOST_CHECK_EQUAL(ts.getAnnotations().size(), 4);
    
 }

 BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
 {
-    AnonymizedSentence as("This is a test sentence");
+    TokenizedSentence ts("This is a test sentence");

    std::vector<TokenAnnotation> annotations1;
    annotations1.push_back(TokenAnnotation(0,1,'a',"val"));
    annotations1.push_back(TokenAnnotation(4,6,'a',"val"));
    annotations1.push_back(TokenAnnotation(7,10,'a',"val"));
    annotations1.push_back(TokenAnnotation(12,14,'a',"val"));
-    as.addAnnotations(annotations1);
+    ts.addAnnotations(annotations1);
    /* annotation
    0  1  2  3  4  5  6  7  8  9 10 11 12 13 14
    -           ----     -------       -----
@ -43,7 +43,7 @@ BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
    annotations2.push_back(TokenAnnotation(4,7,'a',"val"));
    annotations2.push_back(TokenAnnotation(10,11,'a',"val"));
    annotations2.push_back(TokenAnnotation(11,13,'a',"val"));
-    as.addAnnotations(annotations2);
+    ts.addAnnotations(annotations2);
    /* annotations2
    0  1  2  3  4  5  6  7  8  9 10 11 12 13 14
       -------  -------          -- -----   
@ -53,8 +53,8 @@ BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
    -  -------  ----     ------- --    -----
    
    */   
-    BOOST_CHECK_EQUAL(as.getAnnotations().size(), 6);
-    std::list<TokenAnnotation> annotations = as.getAnnotations();
+    BOOST_CHECK_EQUAL(ts.getAnnotations().size(), 6);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();

    BOOST_CHECK_EQUAL(iter->getStart(),0);
--- a/concordia/token_annotation.cpp
+++ b/concordia/token_annotation.cpp
@ -13,3 +13,7 @@ TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start,
 TokenAnnotation::~TokenAnnotation() {
 }

+// Definitions of the static annotation type codes of TokenAnnotation.
+// NOTE(review): these are mutable statics, not const, so they can be
+// reassigned at runtime - confirm that this is intended.
+char TokenAnnotation::NE_TYPE = 0;         // type code for named entity (NE) annotations
+char TokenAnnotation::WORD_TYPE = 1;       // type code for word annotations
+char TokenAnnotation::HTML_TAG_TYPE = 2;   // type code for html tag annotations
+char TokenAnnotation::STOP_WORD_TYPE = 3;  // type code for stop word annotations
--- a/concordia/token_annotation.hpp
+++ b/concordia/token_annotation.hpp
@ -44,6 +44,14 @@ public:
        return _value;
    }

+    static char NE_TYPE; /*!< annotation type code for named entities (NE) */
+
+    static char WORD_TYPE; /*!< annotation type code for words */
+
+    static char HTML_TAG_TYPE; /*!< annotation type code for html tags */
+
+    static char STOP_WORD_TYPE; /*!< annotation type code for stop words */
+    
 protected:
    char _annotationType;

--- a/concordia/anonymized_sentence.cpp
+++ b/concordia/anonymized_sentence.cpp
@ -1,16 +1,16 @@
-#include "concordia/anonymized_sentence.hpp"
+#include "concordia/tokenized_sentence.hpp"
 #include "concordia/common/text_utils.hpp"

 #include <iostream>

-AnonymizedSentence::AnonymizedSentence(std::string sentence):
+TokenizedSentence::TokenizedSentence(std::string sentence):
                                         _sentence(sentence) {
 }

-AnonymizedSentence::~AnonymizedSentence() {
+TokenizedSentence::~TokenizedSentence() {
 }

-void AnonymizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
+void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
    std::vector<TokenAnnotation>::iterator newAnnotation = annotations.begin();
    std::list<TokenAnnotation>::iterator existingAnnotation = _tokenAnnotations.begin();
    
@ -43,6 +43,6 @@ void AnonymizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations
    
 }

-void AnonymizedSentence::toLowerCase() {
+void TokenizedSentence::toLowerCase() {
    _sentence = TextUtils::getInstance().toLowerCase(_sentence);
 }
--- a/concordia/anonymized_sentence.hpp
+++ b/concordia/anonymized_sentence.hpp
@ -1,5 +1,5 @@
-#ifndef ANONYMIZED_SENTENCE_HDR
-#define ANONYMIZED_SENTENCE_HDR
+#ifndef TOKENIZED_SENTENCE_HDR
+#define TOKENIZED_SENTENCE_HDR

 #include "concordia/common/config.hpp"
 #include "concordia/token_annotation.hpp"
@ -13,17 +13,17 @@
  along with the annotations list.
 */

-class AnonymizedSentence {
+class TokenizedSentence {
 public:
    /*!
      Constructor.

    */
-    AnonymizedSentence(std::string sentence);
+    TokenizedSentence(std::string sentence);

    /*! Destructor.
    */
-    virtual ~AnonymizedSentence();
+    virtual ~TokenizedSentence();

    /*! Getter for sentence
      \returns sentence
--- a/concordia/tutorial.dox
+++ b/concordia/tutorial.dox
@ -207,9 +207,8 @@ markers_path = "<CONCORDIA_HOME>/tests/resources/temp/temp_markers.bin"
 word_map_path = "<CONCORDIA_HOME>/tests/resources/temp/temp_word_map.bin"

 #-------------------------------------------------------------------------------
-# The following settings control the sentence anonymizer mechanism. It is used to
-# remove unnecessary symbols and possibly words from sentences added to index
-# and search patterns. Anonymizer removes html tags, substitutes predefined symbols
+# The following settings control the sentence tokenizer mechanism. Tokenizer
+# takes into account html tags, substitutes predefined symbols
 # with a single space, removes stop words (if the option is enabled), as well as
 # named entities and special symbols. All these have to be listed in files.

--- a/prod/resources/anonymizer/space_symbols.txt
+++ b/prod/resources/anonymizer/space_symbols.txt
@ -1,6 +0,0 @@
-\|
-\–
-\-
-\/
-;
-:
--- a/prod/resources/anonymizer/stop_symbols.txt
+++ b/prod/resources/anonymizer/stop_symbols.txt
@ -1,37 +0,0 @@
-\\tab
-\\emdash
-\&lt;
-\&gt;
-\&amp;
-\&quot;
-\&dash;
-\&nbsp;
-<
->
-=
-\+
-„
-”
-\"
-…
-\.
-\,
-\?
-!
-'
-\(
-\)
-\{
-\}
-\@
-\#
-\$
-\%
-\^
-\&
-\*
-\[
-\]
-\\
-\~
-&#\d+
--- a/prod/resources/concordia-config/concordia.cfg.in
+++ b/prod/resources/concordia-config/concordia.cfg.in
@ -22,21 +22,15 @@ word_map_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
 # named entities and special symbols. All these have to be listed in files.

 # File containing all html tags (one per line)
-html_tags_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt"
-
-# File containing all symbols to be replaced by spaces
-space_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
+html_tags_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/html_tags.txt"

 # If set to true, words from predefined list are removed
 stop_words_enabled = "@STOP_WORDS_ENABLED@"

 # If stop_words_enabled is true, set the path to the stop words file
-#stop_words_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
+#stop_words_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/stop_words.txt"

 # File containing regular expressions that match named entities
-named_entities_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"
-
-# File containing special symbols (one per line) to be removed
-stop_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_symbols.txt"
+named_entities_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/named_entities.txt"

 ### eof
--- a/prod/resources/anonymizer/html_tags.txt
+++ b/prod/resources/anonymizer/html_tags.txt
--- a/prod/resources/anonymizer/named_entities.txt
+++ b/prod/resources/anonymizer/named_entities.txt
--- a/prod/resources/anonymizer/stop_words.txt
+++ b/prod/resources/anonymizer/stop_words.txt
--- a/tests/resources/anonymizer/space_symbols.txt
+++ b/tests/resources/anonymizer/space_symbols.txt
@ -1,6 +0,0 @@
-\|
-\–
-\-
-\/
-;
-:
--- a/tests/resources/anonymizer/stop_symbols.txt
+++ b/tests/resources/anonymizer/stop_symbols.txt
@ -1,37 +0,0 @@
-\\tab
-\\emdash
-\&lt;
-\&gt;
-\&amp;
-\&quot;
-\&dash;
-\&nbsp;
-<
->
-=
-\+
-„
-”
-\"
-…
-\.
-\,
-\?
-!
-'
-\(
-\)
-\{
-\}
-\@
-\#
-\$
-\%
-\^
-\&
-\*
-\[
-\]
-\\
-\~
-&#\d+
--- a/tests/resources/concordia-config/concordia-mock.cfg
+++ b/tests/resources/concordia-config/concordia-mock.cfg
@ -14,15 +14,11 @@ markers_path = "/tmp/ma.bin"

 html_tags_path = "/tmp/html_tags.txt"

-space_symbols_path = "/tmp/space_symbols.txt"
-
 stop_words_enabled = "true"

 stop_words_path = "/tmp/stop_words.txt"

 named_entities_path = "/tmp/named_entities.txt"

-stop_symbols_path = "/tmp/stop_symbols.txt"
-

 ### eof
--- a/tests/resources/concordia-config/concordia.cfg.in
+++ b/tests/resources/concordia-config/concordia.cfg.in
@ -22,21 +22,15 @@ word_map_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
 # named entities and special symbols. All these have to be listed in files.

 # File containing all html tags (one per line)
-html_tags_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt"
-
-# File containing all symbols to be replaced by spaces
-space_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
+html_tags_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/html_tags.txt"

 # If set to true, words from predefined list are removed
 stop_words_enabled = "@STOP_WORDS_ENABLED@"

 # If stop_words_enabled is true, set the path to the stop words file
-#stop_words_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
+#stop_words_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/stop_words.txt"

 # File containing regular expressions that match named entities
-named_entities_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"
-
-# File containing special symbols (one per line) to be removed
-stop_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_symbols.txt"
+named_entities_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/named_entities.txt"

 ### eof
--- a/tests/resources/anonymizer/html_tags.txt
+++ b/tests/resources/anonymizer/html_tags.txt
--- a/tests/resources/anonymizer/named_entities.txt
+++ b/tests/resources/anonymizer/named_entities.txt
--- a/tests/resources/anonymizer/stop_words.txt
+++ b/tests/resources/anonymizer/stop_words.txt