From 8432dd321f7316a0f725368aa424bff1f926543a Mon Sep 17 00:00:00 2001 From: rjawor Date: Thu, 25 Jun 2015 10:12:51 +0200 Subject: [PATCH] tokenizer in progress --- TODO.txt | 4 +- concordia/CMakeLists.txt | 8 +- concordia/concordia_config.cpp | 6 -- concordia/concordia_config.hpp | 20 ----- concordia/hash_generator.cpp | 12 +-- concordia/hash_generator.hpp | 4 +- concordia/hashed_sentence.hpp | 2 +- concordia/regex_rule.cpp | 16 ++-- concordia/regex_rule.hpp | 18 ++-- ..._anonymizer.cpp => sentence_tokenizer.cpp} | 49 +++++----- ..._anonymizer.hpp => sentence_tokenizer.hpp} | 35 ++++---- concordia/t/CMakeLists.txt | 4 +- concordia/t/test_concordia_config.cpp | 2 - concordia/t/test_regex_rule.cpp | 65 +++++++------- concordia/t/test_sentence_anonymizer.cpp | 76 ---------------- concordia/t/test_sentence_tokenizer.cpp | 89 +++++++++++++++++++ ...ntence.cpp => test_tokenized_sentence.cpp} | 20 ++--- concordia/token_annotation.cpp | 4 + concordia/token_annotation.hpp | 8 ++ ...ed_sentence.cpp => tokenized_sentence.cpp} | 10 +-- ...ed_sentence.hpp => tokenized_sentence.hpp} | 10 +-- concordia/tutorial.dox | 5 +- prod/resources/anonymizer/space_symbols.txt | 6 -- prod/resources/anonymizer/stop_symbols.txt | 37 -------- .../concordia-config/concordia.cfg.in | 12 +-- .../{anonymizer => tokenizer}/html_tags.txt | 0 .../named_entities.txt | 0 .../{anonymizer => tokenizer}/stop_words.txt | 0 tests/resources/anonymizer/space_symbols.txt | 6 -- tests/resources/anonymizer/stop_symbols.txt | 37 -------- .../concordia-config/concordia-mock.cfg | 4 - .../concordia-config/concordia.cfg.in | 12 +-- .../{anonymizer => tokenizer}/html_tags.txt | 0 .../named_entities.txt | 0 .../{anonymizer => tokenizer}/stop_words.txt | 0 35 files changed, 243 insertions(+), 338 deletions(-) rename concordia/{sentence_anonymizer.cpp => sentence_tokenizer.cpp} (72%) rename concordia/{sentence_anonymizer.hpp => sentence_tokenizer.hpp} (52%) delete mode 100644 concordia/t/test_sentence_anonymizer.cpp create mode 100644 concordia/t/test_sentence_tokenizer.cpp rename concordia/t/{test_anonymized_sentence.cpp => test_tokenized_sentence.cpp} (82%) rename concordia/{anonymized_sentence.cpp => tokenized_sentence.cpp} (85%) rename concordia/{anonymized_sentence.hpp => tokenized_sentence.hpp} (88%) delete mode 100644 prod/resources/anonymizer/space_symbols.txt delete mode 100644 prod/resources/anonymizer/stop_symbols.txt rename prod/resources/{anonymizer => tokenizer}/html_tags.txt (100%) rename prod/resources/{anonymizer => tokenizer}/named_entities.txt (100%) rename prod/resources/{anonymizer => tokenizer}/stop_words.txt (100%) delete mode 100644 tests/resources/anonymizer/space_symbols.txt delete mode 100644 tests/resources/anonymizer/stop_symbols.txt rename tests/resources/{anonymizer => tokenizer}/html_tags.txt (100%) rename tests/resources/{anonymizer => tokenizer}/named_entities.txt (100%) rename tests/resources/{anonymizer => tokenizer}/stop_words.txt (100%) diff --git a/TODO.txt b/TODO.txt index e06f0be..01c38d3 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,6 +1,7 @@ ---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) ----------------------------- -- implement tokenAnnotations vector as interval tree +- work on word regex pattern (allow for some symbols and digits within the word) +- document the code (classes, cfg files) and update tutorial IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? 
Należy przechowywać oryginalne pozycje tokenów w bazie danych concordia-server. Pozycje te będą obliczane przez funkcję generateTokenVector (przy użyciu listy oryginalnych pozycji, która będzie modyfikowana synchronicznie z każdą anonimizacją) - concordia_search_result nie musi przechowywać całego tokenVector (bo używa go tylko do odczytania size()). - wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń. @@ -11,6 +12,7 @@ IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieś ---------------------------- Archive ----------------------------- +DONE - implement tokenAnnotations vector as interval tree (not interval tree, but list, which is even better) DONE (IT IS GOOD AS IT IS) - mess with gcc performance optimization options (https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html) DONE - concordia-server (zastanowić się, czy nie napisać CAT-a oraz nad tym, czy nie oddzielić projektu concordia-server). DONE - document the code diff --git a/concordia/CMakeLists.txt b/concordia/CMakeLists.txt index f59f12e..43a33b5 100644 --- a/concordia/CMakeLists.txt +++ b/concordia/CMakeLists.txt @@ -7,13 +7,13 @@ endforeach(dir) add_library(concordia SHARED token_annotation.cpp - anonymized_sentence.cpp + tokenized_sentence.cpp hashed_sentence.cpp concordia_search_result.cpp matched_pattern_fragment.cpp concordia_searcher.cpp regex_rule.cpp - sentence_anonymizer.cpp + sentence_tokenizer.cpp interval.cpp tm_matches.cpp anubis_search_result.cpp @@ -37,13 +37,13 @@ add_subdirectory(t) install(TARGETS concordia DESTINATION lib/) install(FILES token_annotation.hpp - anonymized_sentence.hpp + tokenized_sentence.hpp hashed_sentence.hpp concordia_search_result.hpp matched_pattern_fragment.hpp concordia_searcher.hpp regex_rule.hpp - sentence_anonymizer.hpp + sentence_tokenizer.hpp interval.hpp tm_matches.hpp anubis_search_result.hpp diff --git a/concordia/concordia_config.cpp b/concordia/concordia_config.cpp index ff5120f..29e3080 100644 --- a/concordia/concordia_config.cpp +++ b/concordia/concordia_config.cpp @@ -9,11 +9,9 @@ #define MARKERS_PARAM "markers_path" #define SUFFIX_ARRAY_PARAM "suffix_array_path" #define HTML_TAGS_PARAM "html_tags_path" -#define SPACE_SYMBOLS_PARAM "space_symbols_path" #define STOP_WORDS_ENABLED_PARAM "stop_words_enabled" #define STOP_WORDS_PARAM "stop_words_path" #define NAMED_ENTITIES_PARAM "named_entities_path" -#define STOP_SYMBOLS_PARAM "stop_symbols_path" #define ANUBIS_THRESHOLD_PARAM "anubis_threshold" ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath) @@ -35,8 +33,6 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath) ConcordiaConfig::_readConfigParameterStr(MARKERS_PARAM); _htmlTagsFilePath = ConcordiaConfig::_readConfigParameterStr(HTML_TAGS_PARAM); - _spaceSymbolsFilePath = - ConcordiaConfig::_readConfigParameterStr(SPACE_SYMBOLS_PARAM); _stopWordsEnabled = ConcordiaConfig::_readConfigParameterStr( STOP_WORDS_ENABLED_PARAM) != "false"; @@ -44,8 +40,6 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath) ConcordiaConfig::_readConfigParameterStr(STOP_WORDS_PARAM, ""); _namedEntitiesFilePath = ConcordiaConfig::_readConfigParameterStr(NAMED_ENTITIES_PARAM); - _stopSymbolsFilePath = - ConcordiaConfig::_readConfigParameterStr(STOP_SYMBOLS_PARAM); _anubisThreshold = atof(ConcordiaConfig::_readConfigParameterStr( 
ANUBIS_THRESHOLD_PARAM, "0.3").c_str()); diff --git a/concordia/concordia_config.hpp b/concordia/concordia_config.hpp index 983665e..a7c12d6 100644 --- a/concordia/concordia_config.hpp +++ b/concordia/concordia_config.hpp @@ -56,14 +56,6 @@ public: return _htmlTagsFilePath; } - /*! Getter for space symbols file path. - For more information see \ref tutorial3. - \returns space symbols file path - */ - std::string & getSpaceSymbolsFilePath() { - return _spaceSymbolsFilePath; - } - /*! Getter for stop symbols enabled parameter. For more information see \ref tutorial3. \returns true if stop words are enabled @@ -88,14 +80,6 @@ public: return _namedEntitiesFilePath; } - /*! Getter for stop symbols file path. - For more information see \ref tutorial3. - \returns stop symbols file path - */ - std::string & getStopSymbolsFilePath() { - return _stopSymbolsFilePath; - } - /*! Getter for anubis threshold. Anubis search results with scores below that threshold will be discarded. \returns anubis threshold @@ -115,16 +99,12 @@ private: std::string _htmlTagsFilePath; - std::string _spaceSymbolsFilePath; - bool _stopWordsEnabled; std::string _stopWordsFilePath; std::string _namedEntitiesFilePath; - std::string _stopSymbolsFilePath; - double _anubisThreshold; std::string _readConfigParameterStr(const std::string & name) diff --git a/concordia/hash_generator.cpp b/concordia/hash_generator.cpp index a004f60..05e9afe 100644 --- a/concordia/hash_generator.cpp +++ b/concordia/hash_generator.cpp @@ -12,8 +12,8 @@ HashGenerator::HashGenerator(boost::shared_ptr config) throw(ConcordiaException) : _wordMapFilePath(config->getWordMapFilePath()), _wordMap(boost::shared_ptr(new WordMap)), - _sentenceAnonymizer(boost::shared_ptr( - new SentenceAnonymizer(config))) { + _sentenceTokenizer(boost::shared_ptr( + new SentenceTokenizer(config))) { if (boost::filesystem::exists(_wordMapFilePath)) { std::ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary); boost::archive::binary_iarchive ia(ifs); @@ -44,11 +44,11 @@ std::vector HashGenerator::generateHash( std::vector HashGenerator::generateTokenVector( const std::string & sentence) { - boost::shared_ptr as = _sentenceAnonymizer->anonymize(sentence); - std::string anonymizedSentence = as->getSentence(); - boost::trim(anonymizedSentence); + boost::shared_ptr ts = _sentenceTokenizer->tokenize(sentence); + std::string tokenizedSentence = ts->getSentence(); + boost::trim(tokenizedSentence); std::vector tokenTexts; - boost::split(tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"), + boost::split(tokenTexts, tokenizedSentence, boost::is_any_of(" \t\r\n"), boost::algorithm::token_compress_on); return tokenTexts; } diff --git a/concordia/hash_generator.hpp b/concordia/hash_generator.hpp index 676abda..f9a4562 100644 --- a/concordia/hash_generator.hpp +++ b/concordia/hash_generator.hpp @@ -8,7 +8,7 @@ #include #include "concordia/word_map.hpp" #include "concordia/common/config.hpp" -#include "concordia/sentence_anonymizer.hpp" +#include "concordia/sentence_tokenizer.hpp" #include "concordia/concordia_config.hpp" #include "concordia/concordia_exception.hpp" @@ -71,7 +71,7 @@ public: private: boost::shared_ptr _wordMap; - boost::shared_ptr _sentenceAnonymizer; + boost::shared_ptr _sentenceTokenizer; std::string _wordMapFilePath; }; diff --git a/concordia/hashed_sentence.hpp b/concordia/hashed_sentence.hpp index 59ebd3e..85e234a 100644 --- a/concordia/hashed_sentence.hpp +++ b/concordia/hashed_sentence.hpp @@ -48,7 +48,7 @@ public: /*! 
Method for adding an original word position to the list. \param original word position */ - void addWordOriginalWordPosition(Interval & originalWordPosition) { + void addOriginalWordPosition(Interval & originalWordPosition) { _originalWordPositions.push_back(originalWordPosition); } diff --git a/concordia/regex_rule.cpp b/concordia/regex_rule.cpp index 83ae20f..636dfda 100644 --- a/concordia/regex_rule.cpp +++ b/concordia/regex_rule.cpp @@ -5,10 +5,12 @@ #include RegexRule::RegexRule(std::string patternString, - std::string value, - bool caseSensitive) - throw(ConcordiaException): - _value(value) { + char annotationType, + std::string value, + bool caseSensitive) + throw(ConcordiaException): + _annotationType(annotationType), + _value(value) { try { if (caseSensitive) { _pattern = boost::make_u32regex(UnicodeString(patternString.c_str())); @@ -32,7 +34,7 @@ RegexRule::RegexRule(std::string patternString, RegexRule::~RegexRule() { } -void RegexRule::apply(boost::shared_ptr sentence) { +void RegexRule::apply(boost::shared_ptr sentence) { try { UnicodeString s(sentence->getSentence().c_str()); boost::u32regex_iterator begin(boost::make_u32regex_iterator(s, _pattern)); @@ -41,14 +43,14 @@ void RegexRule::apply(boost::shared_ptr sentence) { for (; begin != end; ++begin) { SUFFIX_MARKER_TYPE matchBegin = begin->position(); SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length(); - TokenAnnotation annotation(matchBegin, matchEnd, 'a', _value); + TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, ""); annotations.push_back(annotation); } sentence->addAnnotations(annotations); } catch(const std::exception & e) { std::stringstream ss; ss << "Exception while applying regex rule: " - << _value << " to text: " << sentence->getSentence(); + << _annotationType << " to text: " << sentence->getSentence(); ss << ", message: " << e.what(); throw ConcordiaException(ss.str()); } diff --git a/concordia/regex_rule.hpp b/concordia/regex_rule.hpp index 2f74c30..2c40bb3 100644 --- a/concordia/regex_rule.hpp +++ b/concordia/regex_rule.hpp @@ -3,7 +3,7 @@ #include #include "concordia/common/config.hpp" -#include "concordia/anonymized_sentence.hpp" +#include "concordia/tokenized_sentence.hpp" #include "concordia/concordia_exception.hpp" #include #include @@ -24,12 +24,14 @@ public: /*! Constructor. \param patternString regex pattern to match - \param replacement string to substitute the found match + \param annoationType type of annotation \param caseSensitive case sensitivity of the pattern */ - RegexRule(std::string patternString, std::string value, - bool caseSensitive = true) - throw(ConcordiaException); + RegexRule(std::string patternString, + char annotationType, + std::string value, + bool caseSensitive = true) + throw(ConcordiaException); /*! Destructor. */ @@ -38,12 +40,14 @@ public: /*! Applies the operation on anonymized sentence. 
\param sentence the input sentence */ - void apply(boost::shared_ptr sentence); + void apply(boost::shared_ptr sentence); private: - boost::u32regex _pattern; + char _annotationType; std::string _value; + + boost::u32regex _pattern; }; #endif diff --git a/concordia/sentence_anonymizer.cpp b/concordia/sentence_tokenizer.cpp similarity index 72% rename from concordia/sentence_anonymizer.cpp rename to concordia/sentence_tokenizer.cpp index e0715f3..663ed80 100644 --- a/concordia/sentence_anonymizer.cpp +++ b/concordia/sentence_tokenizer.cpp @@ -1,4 +1,5 @@ -#include "concordia/sentence_anonymizer.hpp" +#include "concordia/sentence_tokenizer.hpp" +#include "concordia/token_annotation.hpp" #include #include @@ -6,29 +7,27 @@ #include #include -SentenceAnonymizer::SentenceAnonymizer( +SentenceTokenizer::SentenceTokenizer( boost::shared_ptr config) throw(ConcordiaException) { _createNeRules(config->getNamedEntitiesFilePath()); _createHtmlTagsRule(config->getHtmlTagsFilePath()); _stopWordsEnabled = config->isStopWordsEnabled(); if (_stopWordsEnabled) { - _stopWords = _getMultipleReplacementRule( - config->getStopWordsFilePath(), "", true); + _stopWords = _getMultipleRegexRule( + config->getStopWordsFilePath(), + TokenAnnotation::STOP_WORD_TYPE, + "", true); } - _stopSymbols = _getMultipleReplacementRule( - config->getStopSymbolsFilePath(), ""); - _spaceSymbols = _getMultipleReplacementRule( - config->getSpaceSymbolsFilePath(), " "); } -SentenceAnonymizer::~SentenceAnonymizer() { +SentenceTokenizer::~SentenceTokenizer() { } -boost::shared_ptr - SentenceAnonymizer::anonymize(const std::string & sentence) { - boost::shared_ptr - result(new AnonymizedSentence(sentence)); +boost::shared_ptr + SentenceTokenizer::tokenize(const std::string & sentence) { + boost::shared_ptr + result(new TokenizedSentence(sentence)); _htmlTags->apply(result); @@ -41,13 +40,14 @@ boost::shared_ptr if (_stopWordsEnabled) { _stopWords->apply(result); } - _stopSymbols->apply(result); - _spaceSymbols->apply(result); + + boost::shared_ptr wordsRule( + new RegexRule("\\w+", TokenAnnotation::WORD_TYPE, "word")); return result; } -void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) { +void SentenceTokenizer::_createNeRules(std::string & namedEntitiesPath) { if (boost::filesystem::exists(namedEntitiesPath)) { std::string line; std::ifstream neFile(namedEntitiesPath.c_str()); @@ -66,7 +66,9 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) { throw ConcordiaException(ss.str()); } else { _namedEntities.push_back(RegexRule( - tokenTexts->at(0), tokenTexts->at(1))); + tokenTexts->at(0), + TokenAnnotation::NE_TYPE, + tokenTexts->at(1))); } } neFile.close(); @@ -78,7 +80,7 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) { } } -void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) { +void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) { std::string tagsExpression = "<\\/?("; if (boost::filesystem::exists(htmlTagsPath)) { std::string line; @@ -97,12 +99,15 @@ void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) { tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1); tagsExpression += "br).*?>"; _htmlTags = boost::shared_ptr( - new RegexRule(tagsExpression, "", false)); + new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG_TYPE, "", false)); } boost::shared_ptr - SentenceAnonymizer::_getMultipleReplacementRule( - std::string & filePath, std::string replacement, bool wholeWord) { + 
SentenceTokenizer::_getMultipleRegexRule( + std::string filePath, + char annotationType, + std::string value, + bool wholeWord) { std::string expression = "("; if (boost::filesystem::exists(filePath)) { std::string line; @@ -128,6 +133,6 @@ boost::shared_ptr expression = expression.substr(0, expression.size()-1); expression += ")"; return boost::shared_ptr( - new RegexRule(expression, replacement, false)); + new RegexRule(expression, annotationType, value, false)); } diff --git a/concordia/sentence_anonymizer.hpp b/concordia/sentence_tokenizer.hpp similarity index 52% rename from concordia/sentence_anonymizer.hpp rename to concordia/sentence_tokenizer.hpp index db8e102..be60061 100644 --- a/concordia/sentence_anonymizer.hpp +++ b/concordia/sentence_tokenizer.hpp @@ -1,10 +1,10 @@ -#ifndef SENTENCE_ANONYMIZER_HDR -#define SENTENCE_ANONYMIZER_HDR +#ifndef SENTENCE_TOKENIZER_HDR +#define SENTENCE_TOKENIZER_HDR #include #include #include "concordia/common/config.hpp" -#include "concordia/anonymized_sentence.hpp" +#include "concordia/tokenized_sentence.hpp" #include "concordia/regex_rule.hpp" #include "concordia/concordia_config.hpp" #include "concordia/concordia_exception.hpp" @@ -13,42 +13,42 @@ /*! - Class for anonymizing sentence before generating hash. + Class for tokenizing sentence before generating hash. This operation is is used to remove unnecessary symbols and possibly words from sentences added to index - and search patterns. Anonymizer removes html tags, substitutes predefined symbols - with a single space, removes stop words (if the option is enabled), as well as - named entities and special symbols. All these have to be listed in files + and search patterns. Tokenizer annotates html tags, removes stop words (if the option is enabled), + as well as annotates named entities and special symbols. All these have to be listed in files (see \ref tutorial3). */ -class SentenceAnonymizer { +class SentenceTokenizer { public: /*! Constructor. \param config config object, holding paths to necessary files */ - explicit SentenceAnonymizer(boost::shared_ptr config) + explicit SentenceTokenizer(boost::shared_ptr config) throw(ConcordiaException); /*! Destructor. */ - virtual ~SentenceAnonymizer(); + virtual ~SentenceTokenizer(); - /*! Anonymizes the sentence. + /*! Tokenizes the sentence. 
\param sentence input sentence \returns altered version of the input sentence */ - boost::shared_ptr - anonymize(const std::string & sentence); + boost::shared_ptr + tokenize(const std::string & sentence); private: void _createNeRules(std::string & namedEntitiesPath); void _createHtmlTagsRule(std::string & htmlTagsPath); - boost::shared_ptr _getMultipleReplacementRule( - std::string & filePath, - std::string replacement, + boost::shared_ptr _getMultipleRegexRule( + std::string filePath, + char annotationType, + std::string value, bool wholeWord = false); std::vector _namedEntities; @@ -59,9 +59,6 @@ private: boost::shared_ptr _stopWords; - boost::shared_ptr _stopSymbols; - - boost::shared_ptr _spaceSymbols; }; #endif diff --git a/concordia/t/CMakeLists.txt b/concordia/t/CMakeLists.txt index 9020c3b..a143694 100644 --- a/concordia/t/CMakeLists.txt +++ b/concordia/t/CMakeLists.txt @@ -1,8 +1,8 @@ add_library(concordia-tests test_regex_rule.cpp - test_anonymized_sentence.cpp + test_tokenized_sentence.cpp test_concordia_searcher.cpp - test_sentence_anonymizer.cpp + test_sentence_tokenizer.cpp test_text_utils.cpp test_example.cpp test_tm_matches.cpp diff --git a/concordia/t/test_concordia_config.cpp b/concordia/t/test_concordia_config.cpp index 099759f..dc05ec1 100644 --- a/concordia/t/test_concordia_config.cpp +++ b/concordia/t/test_concordia_config.cpp @@ -16,10 +16,8 @@ BOOST_AUTO_TEST_CASE( ConfigParameters ) BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "/tmp/hi.bin" ); BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "/tmp/ma.bin" ); BOOST_CHECK_EQUAL( config.getHtmlTagsFilePath() , "/tmp/html_tags.txt" ); - BOOST_CHECK_EQUAL( config.getSpaceSymbolsFilePath() , "/tmp/space_symbols.txt" ); BOOST_CHECK_EQUAL( config.getStopWordsFilePath() , "/tmp/stop_words.txt" ); BOOST_CHECK_EQUAL( config.getNamedEntitiesFilePath() , "/tmp/named_entities.txt" ); - BOOST_CHECK_EQUAL( config.getStopSymbolsFilePath() , "/tmp/stop_symbols.txt" ); } BOOST_AUTO_TEST_CASE( NonexistentConfigTest ) diff --git a/concordia/t/test_regex_rule.cpp b/concordia/t/test_regex_rule.cpp index 7922452..e650067 100644 --- a/concordia/t/test_regex_rule.cpp +++ b/concordia/t/test_regex_rule.cpp @@ -1,6 +1,7 @@ #include "tests/unit-tests/unit_tests_globals.hpp" #include "concordia/regex_rule.hpp" -#include "concordia/anonymized_sentence.hpp" +#include "concordia/tokenized_sentence.hpp" +#include "concordia/token_annotation.hpp" #include "concordia/common/config.hpp" #include #include @@ -11,11 +12,11 @@ BOOST_AUTO_TEST_SUITE(regex_rule) BOOST_AUTO_TEST_CASE( SimpleReplacement ) { - RegexRule rr("a","b"); - boost::shared_ptr as(new AnonymizedSentence("xxxxxxxaxxxaxxaxaxa")); - rr.apply(as); - BOOST_CHECK_EQUAL(as->getAnnotations().size(),5); - std::list annotations = as->getAnnotations(); + RegexRule rr("a", TokenAnnotation::WORD_TYPE, "b"); + boost::shared_ptr ts(new TokenizedSentence("xxxxxxxaxxxaxxaxaxa")); + rr.apply(ts); + BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5); + std::list annotations = ts->getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(iter->getStart(),7); @@ -43,7 +44,7 @@ BOOST_AUTO_TEST_CASE( BadRegex ) bool exceptionThrown = false; std::string message = ""; try { - RegexRule rr("+a","b"); + RegexRule rr("+a",TokenAnnotation::WORD_TYPE, "b"); } catch (ConcordiaException & e) { exceptionThrown = true; message = e.what(); @@ -54,11 +55,11 @@ BOOST_AUTO_TEST_CASE( BadRegex ) BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement ) { - RegexRule rr("['\"\\\\.]",""); - 
boost::shared_ptr as(new AnonymizedSentence("Don't stop believin' \\ Hold on to the feelin'.")); - rr.apply(as); - BOOST_CHECK_EQUAL(as->getAnnotations().size(),5); - std::list annotations = as->getAnnotations(); + RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD_TYPE, ""); + boost::shared_ptr ts(new TokenizedSentence("Don't stop believin' \\ Hold on to the feelin'.")); + rr.apply(ts); + BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5); + std::list annotations = ts->getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(iter->getStart(),3); @@ -84,11 +85,11 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement ) BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement ) { - RegexRule rr("abc","xxx", false); - boost::shared_ptr as(new AnonymizedSentence("This is AbC and ABC and abc and aBC.")); - rr.apply(as); - BOOST_CHECK_EQUAL(as->getAnnotations().size(),4); - std::list annotations = as->getAnnotations(); + RegexRule rr("abc", TokenAnnotation::WORD_TYPE, "xxx", false); + boost::shared_ptr ts(new TokenizedSentence("This is AbC and ABC and abc and aBC.")); + rr.apply(ts); + BOOST_CHECK_EQUAL(ts->getAnnotations().size(),4); + std::list annotations = ts->getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(iter->getStart(),8); @@ -109,11 +110,11 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement ) BOOST_AUTO_TEST_CASE( UnicodeReplacement ) { - RegexRule rr("ą","x"); - boost::shared_ptr as(new AnonymizedSentence("zażółć gęślą jaźń")); - rr.apply(as); - BOOST_CHECK_EQUAL(as->getAnnotations().size(),1); - std::list annotations = as->getAnnotations(); + RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x"); + boost::shared_ptr ts(new TokenizedSentence("zażółć gęślą jaźń")); + rr.apply(ts); + BOOST_CHECK_EQUAL(ts->getAnnotations().size(),1); + std::list annotations = ts->getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(iter->getStart(),11); @@ -122,11 +123,11 @@ BOOST_AUTO_TEST_CASE( UnicodeReplacement ) BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement ) { - RegexRule rr("ą","x", false); - boost::shared_ptr as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ")); - rr.apply(as); - BOOST_CHECK_EQUAL(as->getAnnotations().size(),2); - std::list annotations = as->getAnnotations(); + RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x", false); + boost::shared_ptr ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ")); + rr.apply(ts); + BOOST_CHECK_EQUAL(ts->getAnnotations().size(),2); + std::list annotations = ts->getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(iter->getStart(),11); @@ -139,11 +140,11 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement ) BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement ) { - RegexRule rr("[ąćęłńóśżź]","x", false); - boost::shared_ptr as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ")); - rr.apply(as); - BOOST_CHECK_EQUAL(as->getAnnotations().size(),18); - std::list annotations = as->getAnnotations(); + RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD_TYPE, "x", false); + boost::shared_ptr ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ")); + rr.apply(ts); + BOOST_CHECK_EQUAL(ts->getAnnotations().size(),18); + std::list annotations = ts->getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(iter->getStart(),2); diff --git a/concordia/t/test_sentence_anonymizer.cpp b/concordia/t/test_sentence_anonymizer.cpp deleted file mode 100644 index a712059..0000000 
--- a/concordia/t/test_sentence_anonymizer.cpp +++ /dev/null @@ -1,76 +0,0 @@ -#include -#include "tests/unit-tests/unit_tests_globals.hpp" -#include -#include - -#include -#include "concordia/common/config.hpp" -#include "concordia/sentence_anonymizer.hpp" -#include "tests/common/test_resources_manager.hpp" - -BOOST_AUTO_TEST_SUITE(sentence_anonymizer) - -BOOST_AUTO_TEST_CASE( NETest ) -{ - boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); - SentenceAnonymizer anonymizer(config); - - - std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34"; - BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"date ne_date mail ne_email number ne_number"); -} - -BOOST_AUTO_TEST_CASE( HtmlTagsTest ) -{ - boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); - SentenceAnonymizer anonymizer(config); - - - std::string sentence = "link and bold and newline
"; - BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"link and bold and newline "); - -} - -BOOST_AUTO_TEST_CASE( StopWordsTest ) -{ - boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); - if (config->isStopWordsEnabled()) { - SentenceAnonymizer anonymizer(config); - std::string sentence = "Aczkolwiek nie wiem, czy to konieczne"; - BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence()," wiem konieczne"); - } -} - -BOOST_AUTO_TEST_CASE( StopSymbolsTest ) -{ - boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); - SentenceAnonymizer anonymizer(config); - - - std::string sentence = "xxx, . xxx # xx $xx@ xx"; - BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx xx"); - -} - -BOOST_AUTO_TEST_CASE( SpaceSymbolsTest ) -{ - boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); - SentenceAnonymizer anonymizer(config); - - - std::string sentence = "xxx-xxx xx|xx"; - BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx"); - -} - -BOOST_AUTO_TEST_CASE( WeirdSentenceTest ) -{ - boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); - SentenceAnonymizer anonymizer(config); - - std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |"; - BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number "); - -} - -BOOST_AUTO_TEST_SUITE_END() diff --git a/concordia/t/test_sentence_tokenizer.cpp b/concordia/t/test_sentence_tokenizer.cpp new file mode 100644 index 
0000000..49d7244 --- /dev/null +++ b/concordia/t/test_sentence_tokenizer.cpp @@ -0,0 +1,89 @@ +#include +#include "tests/unit-tests/unit_tests_globals.hpp" +#include +#include +#include + +#include +#include +#include "concordia/common/config.hpp" +#include "concordia/sentence_tokenizer.hpp" +#include "concordia/tokenized_sentence.hpp" +#include "tests/common/test_resources_manager.hpp" + +BOOST_AUTO_TEST_SUITE(sentence_tokenizer) + +BOOST_AUTO_TEST_CASE( NETest ) +{ + boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); + SentenceTokenizer tokenizer(config); + + + std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34"; + boost::shared_ptr ts = tokenizer.tokenize(sentence); + std::list annotations = ts->getAnnotations(); + + BOOST_CHECK_EQUAL(8,annotations.size()); + BOOST_FOREACH(TokenAnnotation annotation, annotations) { + std::cout << annotation.getStart() << "," + << annotation.getEnd() << " type: " + << annotation.getType() << " value: " + << annotation.getValue() << std::endl; + } +// BOOST_CHECK_EQUAL(,"date ne_date mail ne_email number ne_number"); +} + +BOOST_AUTO_TEST_CASE( HtmlTagsTest ) +{ + boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); + SentenceTokenizer tokenizer(config); + + + std::string sentence = "link and bold and newline
"; + BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"link and bold and newline "); + +} + +BOOST_AUTO_TEST_CASE( StopWordsTest ) +{ + boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); + if (config->isStopWordsEnabled()) { + SentenceTokenizer tokenizer(config); + std::string sentence = "Aczkolwiek nie wiem, czy to konieczne"; + BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence()," wiem konieczne"); + } +} + +BOOST_AUTO_TEST_CASE( StopSymbolsTest ) +{ + boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); + SentenceTokenizer tokenizer(config); + + + std::string sentence = "xxx, . xxx # xx $xx@ xx"; + BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"xxx xxx xx xx xx"); + +} + +BOOST_AUTO_TEST_CASE( SpaceSymbolsTest ) +{ + boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); + SentenceTokenizer tokenizer(config); + + + std::string sentence = "xxx-xxx xx|xx"; + BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"xxx xxx xx xx"); + +} + +BOOST_AUTO_TEST_CASE( WeirdSentenceTest ) +{ + boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); + SentenceTokenizer tokenizer(config); + + std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |"; + BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number "); + +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/concordia/t/test_anonymized_sentence.cpp b/concordia/t/test_tokenized_sentence.cpp similarity index 82% rename from 
concordia/t/test_anonymized_sentence.cpp rename to concordia/t/test_tokenized_sentence.cpp index 334cbda..213a5f3 100644 --- a/concordia/t/test_anonymized_sentence.cpp +++ b/concordia/t/test_tokenized_sentence.cpp @@ -1,14 +1,14 @@ #include "tests/unit-tests/unit_tests_globals.hpp" -#include "concordia/anonymized_sentence.hpp" +#include "concordia/tokenized_sentence.hpp" #include "concordia/token_annotation.hpp" #include "concordia/common/config.hpp" #include -BOOST_AUTO_TEST_SUITE(anonymized_sentence) +BOOST_AUTO_TEST_SUITE(tokenized_sentence) BOOST_AUTO_TEST_CASE( AnnotationsTrivial ) { - AnonymizedSentence as("This is a test sentence"); + TokenizedSentence ts("This is a test sentence"); std::vector annotations; annotations.push_back(TokenAnnotation(0,1,'a',"val")); @@ -16,22 +16,22 @@ BOOST_AUTO_TEST_CASE( AnnotationsTrivial ) annotations.push_back(TokenAnnotation(7,10,'a',"val")); annotations.push_back(TokenAnnotation(12,14,'a',"val")); - as.addAnnotations(annotations); + ts.addAnnotations(annotations); - BOOST_CHECK_EQUAL(as.getAnnotations().size(), 4); + BOOST_CHECK_EQUAL(ts.getAnnotations().size(), 4); } BOOST_AUTO_TEST_CASE( AnnotationsIntersecting ) { - AnonymizedSentence as("This is a test sentence"); + TokenizedSentence ts("This is a test sentence"); std::vector annotations1; annotations1.push_back(TokenAnnotation(0,1,'a',"val")); annotations1.push_back(TokenAnnotation(4,6,'a',"val")); annotations1.push_back(TokenAnnotation(7,10,'a',"val")); annotations1.push_back(TokenAnnotation(12,14,'a',"val")); - as.addAnnotations(annotations1); + ts.addAnnotations(annotations1); /* annotation 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 - ---- ------- ----- @@ -43,7 +43,7 @@ BOOST_AUTO_TEST_CASE( AnnotationsIntersecting ) annotations2.push_back(TokenAnnotation(4,7,'a',"val")); annotations2.push_back(TokenAnnotation(10,11,'a',"val")); annotations2.push_back(TokenAnnotation(11,13,'a',"val")); - as.addAnnotations(annotations2); + ts.addAnnotations(annotations2); /* annotations2 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ------- ------- -- ----- @@ -53,8 +53,8 @@ BOOST_AUTO_TEST_CASE( AnnotationsIntersecting ) - ------- ---- ------- -- ----- */ - BOOST_CHECK_EQUAL(as.getAnnotations().size(), 6); - std::list annotations = as.getAnnotations(); + BOOST_CHECK_EQUAL(ts.getAnnotations().size(), 6); + std::list annotations = ts.getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(iter->getStart(),0); diff --git a/concordia/token_annotation.cpp b/concordia/token_annotation.cpp index a0b7c03..a44f820 100644 --- a/concordia/token_annotation.cpp +++ b/concordia/token_annotation.cpp @@ -13,3 +13,7 @@ TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start, TokenAnnotation::~TokenAnnotation() { } +char TokenAnnotation::NE_TYPE = 0; +char TokenAnnotation::WORD_TYPE = 1; +char TokenAnnotation::HTML_TAG_TYPE = 2; +char TokenAnnotation::STOP_WORD_TYPE = 3; diff --git a/concordia/token_annotation.hpp b/concordia/token_annotation.hpp index 0c805bb..d98af1a 100644 --- a/concordia/token_annotation.hpp +++ b/concordia/token_annotation.hpp @@ -44,6 +44,14 @@ public: return _value; } + static char NE_TYPE; + + static char WORD_TYPE; + + static char HTML_TAG_TYPE; + + static char STOP_WORD_TYPE; + protected: char _annotationType; diff --git a/concordia/anonymized_sentence.cpp b/concordia/tokenized_sentence.cpp similarity index 85% rename from concordia/anonymized_sentence.cpp rename to concordia/tokenized_sentence.cpp index 6f7c687..0c0c014 100644 --- a/concordia/anonymized_sentence.cpp +++ 
b/concordia/tokenized_sentence.cpp @@ -1,16 +1,16 @@ -#include "concordia/anonymized_sentence.hpp" +#include "concordia/tokenized_sentence.hpp" #include "concordia/common/text_utils.hpp" #include -AnonymizedSentence::AnonymizedSentence(std::string sentence): +TokenizedSentence::TokenizedSentence(std::string sentence): _sentence(sentence) { } -AnonymizedSentence::~AnonymizedSentence() { +TokenizedSentence::~TokenizedSentence() { } -void AnonymizedSentence::addAnnotations(std::vector annotations) { +void TokenizedSentence::addAnnotations(std::vector annotations) { std::vector::iterator newAnnotation = annotations.begin(); std::list::iterator existingAnnotation = _tokenAnnotations.begin(); @@ -43,6 +43,6 @@ void AnonymizedSentence::addAnnotations(std::vector annotations } -void AnonymizedSentence::toLowerCase() { +void TokenizedSentence::toLowerCase() { _sentence = TextUtils::getInstance().toLowerCase(_sentence); } diff --git a/concordia/anonymized_sentence.hpp b/concordia/tokenized_sentence.hpp similarity index 88% rename from concordia/anonymized_sentence.hpp rename to concordia/tokenized_sentence.hpp index e805be0..b1aa77e 100644 --- a/concordia/anonymized_sentence.hpp +++ b/concordia/tokenized_sentence.hpp @@ -1,5 +1,5 @@ -#ifndef ANONYMIZED_SENTENCE_HDR -#define ANONYMIZED_SENTENCE_HDR +#ifndef TOKENIZED_SENTENCE_HDR +#define TOKENIZED_SENTENCE_HDR #include "concordia/common/config.hpp" #include "concordia/token_annotation.hpp" @@ -13,17 +13,17 @@ along with the annotations list. */ -class AnonymizedSentence { +class TokenizedSentence { public: /*! Constructor. */ - AnonymizedSentence(std::string sentence); + TokenizedSentence(std::string sentence); /*! Destructor. */ - virtual ~AnonymizedSentence(); + virtual ~TokenizedSentence(); /*! Getter for sentence \returns sentence diff --git a/concordia/tutorial.dox b/concordia/tutorial.dox index 2e0be41..6438efc 100644 --- a/concordia/tutorial.dox +++ b/concordia/tutorial.dox @@ -207,9 +207,8 @@ markers_path = "/tests/resources/temp/temp_markers.bin" word_map_path = "/tests/resources/temp/temp_word_map.bin" #------------------------------------------------------------------------------- -# The following settings control the sentence anonymizer mechanism. It is used to -# remove unnecessary symbols and possibly words from sentences added to index -# and search patterns. Anonymizer removes html tags, substitutes predefined symbols +# The following settings control the sentence tokenizer mechanism. Tokenizer +# takes into account html tags, substitutes predefined symbols # with a single space, removes stop words (if the option is enabled), as well as # named entities and special symbols. All these have to be listed in files. diff --git a/prod/resources/anonymizer/space_symbols.txt b/prod/resources/anonymizer/space_symbols.txt deleted file mode 100644 index 5fc44e2..0000000 --- a/prod/resources/anonymizer/space_symbols.txt +++ /dev/null @@ -1,6 +0,0 @@ -\| -\– -\- -\/ -; -: diff --git a/prod/resources/anonymizer/stop_symbols.txt b/prod/resources/anonymizer/stop_symbols.txt deleted file mode 100644 index 46aa42d..0000000 --- a/prod/resources/anonymizer/stop_symbols.txt +++ /dev/null @@ -1,37 +0,0 @@ -\\tab -\\emdash -\< -\> -\& -\" -\‐ -\  -< -> -= -\+ -„ -” -\" -… -\. -\, -\? -! 
-' -\( -\) -\{ -\} -\@ -\# -\$ -\% -\^ -\& -\* -\[ -\] -\\ -\~ -&#\d+ diff --git a/prod/resources/concordia-config/concordia.cfg.in b/prod/resources/concordia-config/concordia.cfg.in index eaa68d5..c14cb97 100644 --- a/prod/resources/concordia-config/concordia.cfg.in +++ b/prod/resources/concordia-config/concordia.cfg.in @@ -22,21 +22,15 @@ word_map_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@" # named entities and special symbols. All these have to be listed in files. # File containing all html tags (one per line) -html_tags_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt" - -# File containing all symbols to be replaced by spaces -space_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt" +html_tags_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/html_tags.txt" # If set to true, words from predefined list are removed stop_words_enabled = "@STOP_WORDS_ENABLED@" # If stop_words_enabled is true, set the path to the stop words file -#stop_words_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt" +#stop_words_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/stop_words.txt" # File containing regular expressions that match named entities -named_entities_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt" - -# File containing special symbols (one per line) to be removed -stop_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_symbols.txt" +named_entities_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/named_entities.txt" ### eof diff --git a/prod/resources/anonymizer/html_tags.txt b/prod/resources/tokenizer/html_tags.txt similarity index 100% rename from prod/resources/anonymizer/html_tags.txt rename to prod/resources/tokenizer/html_tags.txt diff --git a/prod/resources/anonymizer/named_entities.txt b/prod/resources/tokenizer/named_entities.txt similarity index 100% rename from prod/resources/anonymizer/named_entities.txt rename to prod/resources/tokenizer/named_entities.txt diff --git a/prod/resources/anonymizer/stop_words.txt b/prod/resources/tokenizer/stop_words.txt similarity index 100% rename from prod/resources/anonymizer/stop_words.txt rename to prod/resources/tokenizer/stop_words.txt diff --git a/tests/resources/anonymizer/space_symbols.txt b/tests/resources/anonymizer/space_symbols.txt deleted file mode 100644 index 5fc44e2..0000000 --- a/tests/resources/anonymizer/space_symbols.txt +++ /dev/null @@ -1,6 +0,0 @@ -\| -\– -\- -\/ -; -: diff --git a/tests/resources/anonymizer/stop_symbols.txt b/tests/resources/anonymizer/stop_symbols.txt deleted file mode 100644 index 46aa42d..0000000 --- a/tests/resources/anonymizer/stop_symbols.txt +++ /dev/null @@ -1,37 +0,0 @@ -\\tab -\\emdash -\< -\> -\& -\" -\‐ -\  -< -> -= -\+ -„ -” -\" -… -\. -\, -\? -! 
-' -\( -\) -\{ -\} -\@ -\# -\$ -\% -\^ -\& -\* -\[ -\] -\\ -\~ -&#\d+ diff --git a/tests/resources/concordia-config/concordia-mock.cfg b/tests/resources/concordia-config/concordia-mock.cfg index 32edb82..6558a52 100644 --- a/tests/resources/concordia-config/concordia-mock.cfg +++ b/tests/resources/concordia-config/concordia-mock.cfg @@ -14,15 +14,11 @@ markers_path = "/tmp/ma.bin" html_tags_path = "/tmp/html_tags.txt" -space_symbols_path = "/tmp/space_symbols.txt" - stop_words_enabled = "true" stop_words_path = "/tmp/stop_words.txt" named_entities_path = "/tmp/named_entities.txt" -stop_symbols_path = "/tmp/stop_symbols.txt" - ### eof diff --git a/tests/resources/concordia-config/concordia.cfg.in b/tests/resources/concordia-config/concordia.cfg.in index b4f00ad..24df93c 100644 --- a/tests/resources/concordia-config/concordia.cfg.in +++ b/tests/resources/concordia-config/concordia.cfg.in @@ -22,21 +22,15 @@ word_map_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@" # named entities and special symbols. All these have to be listed in files. # File containing all html tags (one per line) -html_tags_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt" - -# File containing all symbols to be replaced by spaces -space_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt" +html_tags_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/html_tags.txt" # If set to true, words from predefined list are removed stop_words_enabled = "@STOP_WORDS_ENABLED@" # If stop_words_enabled is true, set the path to the stop words file -#stop_words_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt" +#stop_words_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/stop_words.txt" # File containing regular expressions that match named entities -named_entities_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt" - -# File containing special symbols (one per line) to be removed -stop_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_symbols.txt" +named_entities_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/named_entities.txt" ### eof diff --git a/tests/resources/anonymizer/html_tags.txt b/tests/resources/tokenizer/html_tags.txt similarity index 100% rename from tests/resources/anonymizer/html_tags.txt rename to tests/resources/tokenizer/html_tags.txt diff --git a/tests/resources/anonymizer/named_entities.txt b/tests/resources/tokenizer/named_entities.txt similarity index 100% rename from tests/resources/anonymizer/named_entities.txt rename to tests/resources/tokenizer/named_entities.txt diff --git a/tests/resources/anonymizer/stop_words.txt b/tests/resources/tokenizer/stop_words.txt similarity index 100% rename from tests/resources/anonymizer/stop_words.txt rename to tests/resources/tokenizer/stop_words.txt
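For reference, the annotation-based API introduced by this commit (SentenceTokenizer and RegexRule now record typed TokenAnnotation intervals on a TokenizedSentence instead of substituting text, using the types NE_TYPE, WORD_TYPE, HTML_TAG_TYPE and STOP_WORD_TYPE) can be exercised with the minimal sketch below, modelled on the SimpleReplacement case in concordia/t/test_regex_rule.cpp. The main() wrapper, the sample pattern "a" and the printed format are illustrative assumptions only; the class names, the constructor signature and the accessors come from this patch, and compiling the sketch assumes the concordia headers and library are available on the include and link paths.

#include <iostream>
#include <list>
#include <boost/shared_ptr.hpp>

#include "concordia/regex_rule.hpp"
#include "concordia/token_annotation.hpp"
#include "concordia/tokenized_sentence.hpp"

int main() {
    // A rule that annotates every match of "a" as a WORD_TYPE token.
    // Under the new API the sentence text is left untouched and only
    // the annotation list grows (the old anonymizer rewrote the text).
    RegexRule rule("a", TokenAnnotation::WORD_TYPE, "b");

    boost::shared_ptr<TokenizedSentence> ts(
        new TokenizedSentence("xxxxxxxaxxxaxxaxaxa"));
    rule.apply(ts);

    // apply() records one TokenAnnotation per regex match, carrying the
    // match interval and the rule's annotation type.
    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    for (std::list<TokenAnnotation>::iterator it = annotations.begin();
         it != annotations.end(); ++it) {
        std::cout << it->getStart() << "," << it->getEnd()
                  << " type: " << static_cast<int>(it->getType())
                  << " value: " << it->getValue() << std::endl;
    }
    return 0;
}

TokenizedSentence::addAnnotations() keeps the annotations sorted by position and skips new ones that intersect annotations already present (see the AnnotationsIntersecting test), so rules applied earlier in SentenceTokenizer::tokenize(), namely HTML tags and named entities, take precedence over later, broader rules such as the "\w+" word rule this commit begins to introduce.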