tokenizer in progress

2015-06-25 10:12:51 +02:00 · 2015-06-25 10:12:51 +02:00 · 8432dd321f
commit 8432dd321f
parent 0baf3e4ef2
35 changed files with 243 additions and 338 deletions
--- a/TODO.txt
+++ b/TODO.txt
@ -1,6 +1,7 @@
 ---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) -----------------------------
- implement tokenAnnotations vector as interval tree
+- work on word regex pattern (allow for some symbols and digits within the word)
 - document the code (classes, cfg files) and update tutorial
 IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? Należy przechowywać oryginalne pozycje tokenów w bazie danych concordia-server. Pozycje te będą obliczane przez funkcję generateTokenVector (przy użyciu listy oryginalnych pozycji, która będzie modyfikowana synchronicznie z każdą anonimizacją)
 - concordia_search_result nie musi przechowywać całego tokenVector (bo używa go tylko do odczytania size()).
 - wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń.
@ -11,6 +12,7 @@ IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieś
 ---------------------------- Archive -----------------------------
 DONE - implement tokenAnnotations vector as interval tree (not interval tree, but list, which is even better)
 DONE (IT IS GOOD AS IT IS) - mess with gcc performance optimization options (https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html)
 DONE - concordia-server (zastanowić się, czy nie napisać CAT-a oraz nad tym, czy nie oddzielić projektu concordia-server).
 DONE - document the code
--- a/concordia/CMakeLists.txt
+++ b/concordia/CMakeLists.txt
@ -7,13 +7,13 @@ endforeach(dir)
 add_library(concordia SHARED
  token_annotation.cpp
-  anonymized_sentence.cpp
+  tokenized_sentence.cpp
  hashed_sentence.cpp
  concordia_search_result.cpp
  matched_pattern_fragment.cpp
  concordia_searcher.cpp
  regex_rule.cpp
-  sentence_anonymizer.cpp
+  sentence_tokenizer.cpp
  interval.cpp
  tm_matches.cpp
  anubis_search_result.cpp
@ -37,13 +37,13 @@ add_subdirectory(t)
 install(TARGETS concordia DESTINATION lib/)
 install(FILES 
          token_annotation.hpp
-          anonymized_sentence.hpp
+          tokenized_sentence.hpp
          hashed_sentence.hpp
          concordia_search_result.hpp
          matched_pattern_fragment.hpp
          concordia_searcher.hpp
          regex_rule.hpp
-          sentence_anonymizer.hpp
+          sentence_tokenizer.hpp
          interval.hpp
          tm_matches.hpp
          anubis_search_result.hpp
--- a/concordia/concordia_config.cpp
+++ b/concordia/concordia_config.cpp
@ -9,11 +9,9 @@
 #define MARKERS_PARAM "markers_path"
 #define SUFFIX_ARRAY_PARAM "suffix_array_path"
 #define HTML_TAGS_PARAM "html_tags_path"
 #define SPACE_SYMBOLS_PARAM "space_symbols_path"
 #define STOP_WORDS_ENABLED_PARAM "stop_words_enabled"
 #define STOP_WORDS_PARAM "stop_words_path"
 #define NAMED_ENTITIES_PARAM "named_entities_path"
 #define STOP_SYMBOLS_PARAM "stop_symbols_path"
 #define ANUBIS_THRESHOLD_PARAM "anubis_threshold"
 ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
@ -35,8 +33,6 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
          ConcordiaConfig::_readConfigParameterStr(MARKERS_PARAM);
    _htmlTagsFilePath =
          ConcordiaConfig::_readConfigParameterStr(HTML_TAGS_PARAM);
    _spaceSymbolsFilePath =
          ConcordiaConfig::_readConfigParameterStr(SPACE_SYMBOLS_PARAM);
    _stopWordsEnabled =
          ConcordiaConfig::_readConfigParameterStr(
                           STOP_WORDS_ENABLED_PARAM) != "false";
@ -44,8 +40,6 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
          ConcordiaConfig::_readConfigParameterStr(STOP_WORDS_PARAM, "");
    _namedEntitiesFilePath =
          ConcordiaConfig::_readConfigParameterStr(NAMED_ENTITIES_PARAM);
    _stopSymbolsFilePath =
          ConcordiaConfig::_readConfigParameterStr(STOP_SYMBOLS_PARAM);
    _anubisThreshold = atof(ConcordiaConfig::_readConfigParameterStr(
                                            ANUBIS_THRESHOLD_PARAM,
                                            "0.3").c_str());
--- a/concordia/concordia_config.hpp
+++ b/concordia/concordia_config.hpp
@ -56,14 +56,6 @@ public:
        return _htmlTagsFilePath;
    }
    /*! Getter for space symbols file path.
        For more information see \ref tutorial3.
     \returns space symbols file path
    */
    std::string & getSpaceSymbolsFilePath() {
        return _spaceSymbolsFilePath;
    }
    /*! Getter for stop symbols enabled parameter.
        For more information see \ref tutorial3.
     \returns true if stop words are enabled
@ -88,14 +80,6 @@ public:
        return _namedEntitiesFilePath;
    }
    /*! Getter for stop symbols file path.
        For more information see \ref tutorial3.
     \returns stop symbols file path
    */
    std::string & getStopSymbolsFilePath() {
        return _stopSymbolsFilePath;
    }
    /*! Getter for anubis threshold. Anubis search results with
        scores below that threshold will be discarded.        
     \returns anubis threshold
@ -115,16 +99,12 @@ private:
    std::string _htmlTagsFilePath;
    std::string _spaceSymbolsFilePath;
    bool _stopWordsEnabled;
    std::string _stopWordsFilePath;
    std::string _namedEntitiesFilePath;
    std::string _stopSymbolsFilePath;
    double _anubisThreshold;
    std::string _readConfigParameterStr(const std::string & name)
--- a/concordia/hash_generator.cpp
+++ b/concordia/hash_generator.cpp
@ -12,8 +12,8 @@ HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
                                         throw(ConcordiaException) :
    _wordMapFilePath(config->getWordMapFilePath()),
    _wordMap(boost::shared_ptr<WordMap>(new WordMap)),
-    _sentenceAnonymizer(boost::shared_ptr<SentenceAnonymizer>(
+    _sentenceTokenizer(boost::shared_ptr<SentenceTokenizer>(
-                                    new SentenceAnonymizer(config))) {
+                                    new SentenceTokenizer(config))) {
    if (boost::filesystem::exists(_wordMapFilePath)) {
        std::ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary);
        boost::archive::binary_iarchive ia(ifs);
@ -44,11 +44,11 @@ std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
 std::vector<std::string> HashGenerator::generateTokenVector(
                                               const std::string & sentence) {
-    boost::shared_ptr<AnonymizedSentence> as = _sentenceAnonymizer->anonymize(sentence);
+    boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
-    std::string anonymizedSentence = as->getSentence();
+    std::string tokenizedSentence = ts->getSentence();
-    boost::trim(anonymizedSentence);
+    boost::trim(tokenizedSentence);
    std::vector<std::string> tokenTexts;
-    boost::split(tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"),
+    boost::split(tokenTexts, tokenizedSentence, boost::is_any_of(" \t\r\n"),
                 boost::algorithm::token_compress_on);
    return tokenTexts;
 }
--- a/concordia/hash_generator.hpp
+++ b/concordia/hash_generator.hpp
@ -8,7 +8,7 @@
 #include <boost/algorithm/string/predicate.hpp>
 #include "concordia/word_map.hpp"
 #include "concordia/common/config.hpp"
-#include "concordia/sentence_anonymizer.hpp"
+#include "concordia/sentence_tokenizer.hpp"
 #include "concordia/concordia_config.hpp"
 #include "concordia/concordia_exception.hpp"
@ -71,7 +71,7 @@ public:
 private:
    boost::shared_ptr<WordMap> _wordMap;
-    boost::shared_ptr<SentenceAnonymizer> _sentenceAnonymizer;
+    boost::shared_ptr<SentenceTokenizer> _sentenceTokenizer;
    std::string _wordMapFilePath;
 };
--- a/concordia/hashed_sentence.hpp
+++ b/concordia/hashed_sentence.hpp
@ -48,7 +48,7 @@ public:
    /*! Method for adding an original word position to the list.
      \param original word position
    */
-    void addWordOriginalWordPosition(Interval & originalWordPosition) {
+    void addOriginalWordPosition(Interval & originalWordPosition) {
        _originalWordPositions.push_back(originalWordPosition);
    }
--- a/concordia/regex_rule.cpp
+++ b/concordia/regex_rule.cpp
@ -5,10 +5,12 @@
 #include <boost/throw_exception.hpp>
 RegexRule::RegexRule(std::string patternString,
-                                std::string value,
+                     char annotationType,
-                                bool caseSensitive)
+                     std::string value,
-                                         throw(ConcordiaException):
+                     bool caseSensitive)
-                                         _value(value) {
+                             throw(ConcordiaException):
                               _annotationType(annotationType),
                               _value(value)                  {
    try {
        if (caseSensitive) {
            _pattern = boost::make_u32regex(UnicodeString(patternString.c_str()));
@ -32,7 +34,7 @@ RegexRule::RegexRule(std::string patternString,
 RegexRule::~RegexRule() {
 }
-void RegexRule::apply(boost::shared_ptr<AnonymizedSentence> sentence) {
+void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
    try {
        UnicodeString s(sentence->getSentence().c_str());
        boost::u32regex_iterator<const UChar*> begin(boost::make_u32regex_iterator(s, _pattern));
@ -41,14 +43,14 @@ void RegexRule::apply(boost::shared_ptr<AnonymizedSentence> sentence) {
        for (; begin != end; ++begin) {
            SUFFIX_MARKER_TYPE matchBegin = begin->position();
            SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length();
-            TokenAnnotation annotation(matchBegin, matchEnd, 'a', _value);
+            TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, "");
            annotations.push_back(annotation);
        }
        sentence->addAnnotations(annotations);
    } catch(const std::exception & e) {
        std::stringstream ss;
        ss << "Exception while applying regex rule: "
-                          << _value << " to text: " << sentence->getSentence();
+                          << _annotationType << " to text: " << sentence->getSentence();
        ss << ", message: " << e.what();
        throw ConcordiaException(ss.str());
    }
--- a/concordia/regex_rule.hpp
+++ b/concordia/regex_rule.hpp
@ -3,7 +3,7 @@
 #include <string>
 #include "concordia/common/config.hpp"
-#include "concordia/anonymized_sentence.hpp"
+#include "concordia/tokenized_sentence.hpp"
 #include "concordia/concordia_exception.hpp"
 #include <boost/shared_ptr.hpp>
 #include <boost/regex.hpp>
@ -24,12 +24,14 @@ public:
    /*!
      Constructor.
        \param patternString regex pattern to match
-        \param replacement string to substitute the found match
+        \param annotationType type of annotation
        \param caseSensitive case sensitivity of the pattern
    */
-    RegexRule(std::string patternString, std::string value,
+    RegexRule(std::string patternString,
-                                 bool caseSensitive = true)
+              char annotationType,
-                                 throw(ConcordiaException);
+              std::string value,
              bool caseSensitive = true)
              throw(ConcordiaException);
    /*! Destructor.
    */
@ -38,12 +40,14 @@ public:
    /*! Applies the operation on anonymized sentence.
      \param sentence the input sentence
    */
-    void apply(boost::shared_ptr<AnonymizedSentence> sentence);
+    void apply(boost::shared_ptr<TokenizedSentence> sentence);
 private:
-    boost::u32regex _pattern;
+    char _annotationType;
    std::string _value;
    boost::u32regex _pattern;
 };
 #endif
--- a/concordia/sentence_anonymizer.cpp
+++ b/concordia/sentence_anonymizer.cpp
@ -1,4 +1,5 @@
-#include "concordia/sentence_anonymizer.hpp"
+#include "concordia/sentence_tokenizer.hpp"
 #include "concordia/token_annotation.hpp"
 #include <boost/foreach.hpp>
 #include <fstream>
@ -6,29 +7,27 @@
 #include <iostream>
 #include <boost/algorithm/string.hpp>
-SentenceAnonymizer::SentenceAnonymizer(
+SentenceTokenizer::SentenceTokenizer(
                        boost::shared_ptr<ConcordiaConfig> config)
                                         throw(ConcordiaException) {
    _createNeRules(config->getNamedEntitiesFilePath());
    _createHtmlTagsRule(config->getHtmlTagsFilePath());
    _stopWordsEnabled = config->isStopWordsEnabled();
    if (_stopWordsEnabled) {
-        _stopWords = _getMultipleReplacementRule(
+        _stopWords = _getMultipleRegexRule(
-                                  config->getStopWordsFilePath(), "", true);
+                                  config->getStopWordsFilePath(),
                                  TokenAnnotation::STOP_WORD_TYPE,
                                  "", true);
    }
    _stopSymbols = _getMultipleReplacementRule(
                              config->getStopSymbolsFilePath(), "");
    _spaceSymbols = _getMultipleReplacementRule(
                              config->getSpaceSymbolsFilePath(), " ");
 }
-SentenceAnonymizer::~SentenceAnonymizer() {
+SentenceTokenizer::~SentenceTokenizer() {
 }
-boost::shared_ptr<AnonymizedSentence>
+boost::shared_ptr<TokenizedSentence>
-              SentenceAnonymizer::anonymize(const std::string & sentence) {
+              SentenceTokenizer::tokenize(const std::string & sentence) {
-    boost::shared_ptr<AnonymizedSentence> 
+    boost::shared_ptr<TokenizedSentence> 
-                    result(new AnonymizedSentence(sentence));
+                    result(new TokenizedSentence(sentence));
    _htmlTags->apply(result);
@ -41,13 +40,14 @@ boost::shared_ptr<AnonymizedSentence>
    if (_stopWordsEnabled) {
        _stopWords->apply(result);
    }
-    _stopSymbols->apply(result);
+    
-    _spaceSymbols->apply(result);
+    boost::shared_ptr<RegexRule> wordsRule(
                        new RegexRule("\\w+", TokenAnnotation::WORD_TYPE, "word"));
    return result;
 }
-void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
+void SentenceTokenizer::_createNeRules(std::string & namedEntitiesPath) {
    if (boost::filesystem::exists(namedEntitiesPath)) {
        std::string line;
        std::ifstream neFile(namedEntitiesPath.c_str());
@ -66,7 +66,9 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
                    throw ConcordiaException(ss.str());
                } else {
                    _namedEntities.push_back(RegexRule(
-                                tokenTexts->at(0), tokenTexts->at(1)));
+                                tokenTexts->at(0),
                                TokenAnnotation::NE_TYPE,
                                tokenTexts->at(1)));
                }
            }
            neFile.close();
@ -78,7 +80,7 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
    }
 }
-void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
+void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
    std::string tagsExpression = "<\\/?(";
    if (boost::filesystem::exists(htmlTagsPath)) {
        std::string line;
@ -97,12 +99,15 @@ void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
    tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
    tagsExpression += "br).*?>";
    _htmlTags = boost::shared_ptr<RegexRule>(
-                        new RegexRule(tagsExpression, "", false));
+                        new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG_TYPE, "", false));
 }
 boost::shared_ptr<RegexRule>
-        SentenceAnonymizer::_getMultipleReplacementRule(
+        SentenceTokenizer::_getMultipleRegexRule(
-            std::string & filePath, std::string replacement, bool wholeWord) {
+            std::string filePath,
            char annotationType,
            std::string value,
            bool wholeWord) {
    std::string expression = "(";
    if (boost::filesystem::exists(filePath)) {
        std::string line;
@ -128,6 +133,6 @@ boost::shared_ptr<RegexRule>
    expression = expression.substr(0, expression.size()-1);
    expression += ")";
    return boost::shared_ptr<RegexRule>(
-                        new RegexRule(expression, replacement, false));
+                        new RegexRule(expression, annotationType, value, false));
 }
--- a/concordia/sentence_anonymizer.hpp
+++ b/concordia/sentence_anonymizer.hpp
@ -1,10 +1,10 @@
-#ifndef SENTENCE_ANONYMIZER_HDR
+#ifndef SENTENCE_TOKENIZER_HDR
-#define SENTENCE_ANONYMIZER_HDR
+#define SENTENCE_TOKENIZER_HDR
 #include <string>
 #include <vector>
 #include "concordia/common/config.hpp"
-#include "concordia/anonymized_sentence.hpp"
+#include "concordia/tokenized_sentence.hpp"
 #include "concordia/regex_rule.hpp"
 #include "concordia/concordia_config.hpp"
 #include "concordia/concordia_exception.hpp"
@ -13,42 +13,42 @@
 /*!
-  Class for anonymizing sentence before generating hash.
+  Class for tokenizing sentence before generating hash.
  This operation is is used to
  remove unnecessary symbols and possibly words from sentences added to index
-  and search patterns. Anonymizer removes html tags, substitutes predefined symbols
+  and search patterns. Tokenizer annotates html tags, removes stop words (if the option is enabled),
-  with a single space, removes stop words (if the option is enabled), as well as
+  as well as annotates
  named entities and special symbols. All these have to be listed in files
  (see \ref tutorial3).
 */
-class SentenceAnonymizer {
+class SentenceTokenizer {
 public:
    /*! Constructor.
      \param config config object, holding paths to necessary files
    */
-    explicit SentenceAnonymizer(boost::shared_ptr<ConcordiaConfig> config)
+    explicit SentenceTokenizer(boost::shared_ptr<ConcordiaConfig> config)
                                                 throw(ConcordiaException);
    /*! Destructor.
    */
-    virtual ~SentenceAnonymizer();
+    virtual ~SentenceTokenizer();
-    /*! Anonymizes the sentence.
+    /*! Tokenizes the sentence.
      \param sentence input sentence
      \returns altered version of the input sentence
    */
-    boost::shared_ptr<AnonymizedSentence>
+    boost::shared_ptr<TokenizedSentence>
-                                   anonymize(const std::string & sentence);
+                                   tokenize(const std::string & sentence);
 private:
    void _createNeRules(std::string & namedEntitiesPath);
    void _createHtmlTagsRule(std::string & htmlTagsPath);
-    boost::shared_ptr<RegexRule> _getMultipleReplacementRule(
+    boost::shared_ptr<RegexRule> _getMultipleRegexRule(
-                                             std::string & filePath,
+                                             std::string filePath,
-                                             std::string replacement,
+                                             char annotationType,
                                             std::string value,
                                             bool wholeWord = false);
    std::vector<RegexRule> _namedEntities;
@ -59,9 +59,6 @@ private:
    boost::shared_ptr<RegexRule> _stopWords;
    boost::shared_ptr<RegexRule> _stopSymbols;
    boost::shared_ptr<RegexRule> _spaceSymbols;
 };
 #endif
--- a/concordia/t/CMakeLists.txt
+++ b/concordia/t/CMakeLists.txt
@ -1,8 +1,8 @@
 add_library(concordia-tests
  test_regex_rule.cpp
-  test_anonymized_sentence.cpp
+  test_tokenized_sentence.cpp
  test_concordia_searcher.cpp
-  test_sentence_anonymizer.cpp
+  test_sentence_tokenizer.cpp
  test_text_utils.cpp
  test_example.cpp
  test_tm_matches.cpp
--- a/concordia/t/test_concordia_config.cpp
+++ b/concordia/t/test_concordia_config.cpp
@ -16,10 +16,8 @@ BOOST_AUTO_TEST_CASE( ConfigParameters )
    BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "/tmp/hi.bin" );
    BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "/tmp/ma.bin" );
    BOOST_CHECK_EQUAL( config.getHtmlTagsFilePath() , "/tmp/html_tags.txt" );
    BOOST_CHECK_EQUAL( config.getSpaceSymbolsFilePath() , "/tmp/space_symbols.txt" );
    BOOST_CHECK_EQUAL( config.getStopWordsFilePath() , "/tmp/stop_words.txt" );
    BOOST_CHECK_EQUAL( config.getNamedEntitiesFilePath() , "/tmp/named_entities.txt" );
    BOOST_CHECK_EQUAL( config.getStopSymbolsFilePath() , "/tmp/stop_symbols.txt" );
 }
 BOOST_AUTO_TEST_CASE( NonexistentConfigTest )
--- a/concordia/t/test_regex_rule.cpp
+++ b/concordia/t/test_regex_rule.cpp
@ -1,6 +1,7 @@
 #include "tests/unit-tests/unit_tests_globals.hpp"
 #include "concordia/regex_rule.hpp"
-#include "concordia/anonymized_sentence.hpp"
+#include "concordia/tokenized_sentence.hpp"
 #include "concordia/token_annotation.hpp"
 #include "concordia/common/config.hpp"
 #include <boost/shared_ptr.hpp>
 #include <boost/algorithm/string/predicate.hpp>
@ -11,11 +12,11 @@ BOOST_AUTO_TEST_SUITE(regex_rule)
 BOOST_AUTO_TEST_CASE( SimpleReplacement )
 {
-    RegexRule rr("a","b");
+    RegexRule rr("a", TokenAnnotation::WORD_TYPE, "b");
-    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("xxxxxxxaxxxaxxaxaxa"));
+    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("xxxxxxxaxxxaxxaxaxa"));
-    rr.apply(as);    
+    rr.apply(ts);    
-    BOOST_CHECK_EQUAL(as->getAnnotations().size(),5);
+    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
-    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();
    BOOST_CHECK_EQUAL(iter->getStart(),7);
@ -43,7 +44,7 @@ BOOST_AUTO_TEST_CASE( BadRegex )
    bool exceptionThrown = false;
    std::string message = "";
    try {
-        RegexRule rr("+a","b");
+        RegexRule rr("+a",TokenAnnotation::WORD_TYPE, "b");
    } catch (ConcordiaException & e) {
        exceptionThrown = true;
        message = e.what();
@ -54,11 +55,11 @@ BOOST_AUTO_TEST_CASE( BadRegex )
 BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
 {
-    RegexRule rr("['\"\\\\.]","");
+    RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD_TYPE, "");
-    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
+    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
-    rr.apply(as);
+    rr.apply(ts);
-    BOOST_CHECK_EQUAL(as->getAnnotations().size(),5);
+    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
-    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();
    BOOST_CHECK_EQUAL(iter->getStart(),3);
@ -84,11 +85,11 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
 BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
 {
-    RegexRule rr("abc","xxx", false);
+    RegexRule rr("abc", TokenAnnotation::WORD_TYPE, "xxx", false);
-    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("This is AbC and ABC and abc and aBC."));
+    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("This is AbC and ABC and abc and aBC."));
-    rr.apply(as);
+    rr.apply(ts);
-    BOOST_CHECK_EQUAL(as->getAnnotations().size(),4);
+    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),4);
-    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();
    BOOST_CHECK_EQUAL(iter->getStart(),8);
@ -109,11 +110,11 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
 BOOST_AUTO_TEST_CASE( UnicodeReplacement )
 {
-    RegexRule rr("ą","x");
+    RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x");
-    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń"));
+    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń"));
-    rr.apply(as);
+    rr.apply(ts);
-    BOOST_CHECK_EQUAL(as->getAnnotations().size(),1);
+    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),1);
-    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();
    BOOST_CHECK_EQUAL(iter->getStart(),11);
@ -122,11 +123,11 @@ BOOST_AUTO_TEST_CASE( UnicodeReplacement )
 BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
 {
-    RegexRule rr("ą","x", false);
+    RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x", false);
-    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
+    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
-    rr.apply(as);
+    rr.apply(ts);
-    BOOST_CHECK_EQUAL(as->getAnnotations().size(),2);
+    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),2);
-    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();
    BOOST_CHECK_EQUAL(iter->getStart(),11);
@ -139,11 +140,11 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
 BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
 {
-    RegexRule rr("[ąćęłńóśżź]","x", false);
+    RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD_TYPE, "x", false);
-    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
+    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
-    rr.apply(as);
+    rr.apply(ts);
-    BOOST_CHECK_EQUAL(as->getAnnotations().size(),18);
+    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),18);
-    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();
    BOOST_CHECK_EQUAL(iter->getStart(),2);
--- a/concordia/t/test_sentence_anonymizer.cpp
+++ b/concordia/t/test_sentence_anonymizer.cpp
@ -1,76 +0,0 @@
 #include <boost/filesystem.hpp>
 #include "tests/unit-tests/unit_tests_globals.hpp"
 #include <string>
 #include <sstream>
 #include <boost/shared_ptr.hpp>
 #include "concordia/common/config.hpp"
 #include "concordia/sentence_anonymizer.hpp"
 #include "tests/common/test_resources_manager.hpp"
 BOOST_AUTO_TEST_SUITE(sentence_anonymizer)
 BOOST_AUTO_TEST_CASE( NETest )
 {
    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
    SentenceAnonymizer anonymizer(config);
    std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34";
    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"date  ne_date mail  ne_email number  ne_number");
 }
 BOOST_AUTO_TEST_CASE( HtmlTagsTest )
 {
    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
    SentenceAnonymizer anonymizer(config);
    std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"link and bold and newline ");
 }
 BOOST_AUTO_TEST_CASE( StopWordsTest )
 {
    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
    if (config->isStopWordsEnabled()) {
        SentenceAnonymizer anonymizer(config);
        std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
        BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"  wiem   konieczne");
    }
 }
 BOOST_AUTO_TEST_CASE( StopSymbolsTest )
 {
    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
    SentenceAnonymizer anonymizer(config);
    std::string sentence = "xxx, . xxx  # xx $xx@ xx";
    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx  xxx   xx xx xx");
 }
 BOOST_AUTO_TEST_CASE( SpaceSymbolsTest )
 {
    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
    SentenceAnonymizer anonymizer(config);
    std::string sentence = "xxx-xxx xx|xx";
    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx");
 }
 BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
 {
    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
    SentenceAnonymizer anonymizer(config);
    std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"sony   dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl   dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw   ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap   rm mne_numbergrm mne_numbere   —   ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number  ");
 }
 BOOST_AUTO_TEST_SUITE_END()
--- a/concordia/t/test_sentence_tokenizer.cpp
+++ b/concordia/t/test_sentence_tokenizer.cpp
@ -0,0 +1,89 @@
 #include <boost/filesystem.hpp>
 #include "tests/unit-tests/unit_tests_globals.hpp"
 #include <string>
 #include <sstream>
 #include <iostream>
 #include <boost/shared_ptr.hpp>
 #include <boost/foreach.hpp>
 #include "concordia/common/config.hpp"
 #include "concordia/sentence_tokenizer.hpp"
 #include "concordia/tokenized_sentence.hpp"
 #include "tests/common/test_resources_manager.hpp"
 BOOST_AUTO_TEST_SUITE(sentence_tokenizer)
 BOOST_AUTO_TEST_CASE( NETest )
 {
    // Tokenizes a sentence that contains a date, an e-mail address and a
    // decimal number, checks that 8 annotations are reported and prints the
    // start, end, type and value of every annotation to stdout.
    boost::shared_ptr<ConcordiaConfig> cfg(
        new ConcordiaConfig(
            TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
    SentenceTokenizer tokenizer(cfg);

    std::string sentence("Date: 12.04.2012, mail: test@example.com, number: 5.34");
    boost::shared_ptr<TokenizedSentence> tokenized = tokenizer.tokenize(sentence);

    std::list<TokenAnnotation> annotations = tokenized->getAnnotations();
    BOOST_CHECK_EQUAL(8,annotations.size());

    for (std::list<TokenAnnotation>::iterator it = annotations.begin();
         it != annotations.end(); ++it) {
        // Each element is copied before it is printed, as the by-value
        // BOOST_FOREACH loop did.
        TokenAnnotation current = *it;
        std::cout << current.getStart() << ",";
        std::cout << current.getEnd() << " type: ";
        std::cout << current.getType() << " value: ";
        std::cout << current.getValue() << std::endl;
    }

    // Disabled check on the resulting sentence text, kept as it was:
 //    BOOST_CHECK_EQUAL(,"date  ne_date mail  ne_email number  ne_number");
 }
 BOOST_AUTO_TEST_CASE( HtmlTagsTest )
 {
    // The input mixes text with <a>, <b> and <br/> markup; the expected
    // sentence contains only the text, including the space left before the
    // trailing <br/>.
    boost::shared_ptr<ConcordiaConfig> cfg(
        new ConcordiaConfig(
            TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
    SentenceTokenizer tokenizer(cfg);

    std::string sentence("<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>");
    BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"link and bold and newline ");
 }
 BOOST_AUTO_TEST_CASE( StopWordsTest )
 {
    // Runs its single check only when the loaded configuration reports stop
    // words as enabled; otherwise the test case performs no checks at all.
    boost::shared_ptr<ConcordiaConfig> cfg(
        new ConcordiaConfig(
            TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
    if (!cfg->isStopWordsEnabled()) {
        return;
    }

    SentenceTokenizer tokenizer(cfg);
    std::string sentence("Aczkolwiek nie wiem, czy to konieczne");
    // Only "wiem" and "konieczne" are expected to survive, with the
    // surrounding spaces left in place.
    BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"  wiem   konieczne");
 }
 BOOST_AUTO_TEST_CASE( StopSymbolsTest )
 {
    // The symbols , . # $ and @ are expected to disappear from the sentence
    // without any replacement, so the original spaces stay where they were.
    boost::shared_ptr<ConcordiaConfig> cfg(
        new ConcordiaConfig(
            TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
    SentenceTokenizer tokenizer(cfg);

    std::string sentence("xxx, . xxx  # xx $xx@ xx");
    BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"xxx  xxx   xx xx xx");
 }
 BOOST_AUTO_TEST_CASE( SpaceSymbolsTest )
 {
    // The '-' and '|' characters are each expected to come back as a single
    // space in the tokenized sentence.
    boost::shared_ptr<ConcordiaConfig> cfg(
        new ConcordiaConfig(
            TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
    SentenceTokenizer tokenizer(cfg);

    std::string sentence("xxx-xxx xx|xx");
    BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"xxx xxx xx xx");
 }
 BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
 {
    // Feeds one long line of '|'-separated, run-together product codes
    // through the tokenizer and compares the resulting sentence with the
    // expected literal below.
    boost::shared_ptr<ConcordiaConfig> cfg(
        new ConcordiaConfig(
            TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
    SentenceTokenizer tokenizer(cfg);

    std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
    BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"sony   dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl   dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw   ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap   rm mne_numbergrm mne_numbere   —   ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number  ");
 }
 BOOST_AUTO_TEST_SUITE_END()
--- a/concordia/t/test_anonymized_sentence.cpp
+++ b/concordia/t/test_anonymized_sentence.cpp
@ -1,14 +1,14 @@
 #include "tests/unit-tests/unit_tests_globals.hpp"
-#include "concordia/anonymized_sentence.hpp"
+#include "concordia/tokenized_sentence.hpp"
 #include "concordia/token_annotation.hpp"
 #include "concordia/common/config.hpp"
 #include <iostream>
-BOOST_AUTO_TEST_SUITE(anonymized_sentence)
+BOOST_AUTO_TEST_SUITE(tokenized_sentence)
 BOOST_AUTO_TEST_CASE( AnnotationsTrivial )
 {
-    AnonymizedSentence as("This is a test sentence");
+    TokenizedSentence ts("This is a test sentence");
    std::vector<TokenAnnotation> annotations;
    annotations.push_back(TokenAnnotation(0,1,'a',"val"));
@ -16,22 +16,22 @@ BOOST_AUTO_TEST_CASE( AnnotationsTrivial )
    annotations.push_back(TokenAnnotation(7,10,'a',"val"));
    annotations.push_back(TokenAnnotation(12,14,'a',"val"));
-    as.addAnnotations(annotations);
+    ts.addAnnotations(annotations);
-    BOOST_CHECK_EQUAL(as.getAnnotations().size(), 4);
+    BOOST_CHECK_EQUAL(ts.getAnnotations().size(), 4);
 }
 BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
 {
-    AnonymizedSentence as("This is a test sentence");
+    TokenizedSentence ts("This is a test sentence");
    std::vector<TokenAnnotation> annotations1;
    annotations1.push_back(TokenAnnotation(0,1,'a',"val"));
    annotations1.push_back(TokenAnnotation(4,6,'a',"val"));
    annotations1.push_back(TokenAnnotation(7,10,'a',"val"));
    annotations1.push_back(TokenAnnotation(12,14,'a',"val"));
-    as.addAnnotations(annotations1);
+    ts.addAnnotations(annotations1);
    /* annotation
    0  1  2  3  4  5  6  7  8  9 10 11 12 13 14
    -           ----     -------       -----
@ -43,7 +43,7 @@ BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
    annotations2.push_back(TokenAnnotation(4,7,'a',"val"));
    annotations2.push_back(TokenAnnotation(10,11,'a',"val"));
    annotations2.push_back(TokenAnnotation(11,13,'a',"val"));
-    as.addAnnotations(annotations2);
+    ts.addAnnotations(annotations2);
    /* annotations2
    0  1  2  3  4  5  6  7  8  9 10 11 12 13 14
       -------  -------          -- -----   
@ -53,8 +53,8 @@ BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
    -  -------  ----     ------- --    -----
    */   
-    BOOST_CHECK_EQUAL(as.getAnnotations().size(), 6);
+    BOOST_CHECK_EQUAL(ts.getAnnotations().size(), 6);
-    std::list<TokenAnnotation> annotations = as.getAnnotations();
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();
    BOOST_CHECK_EQUAL(iter->getStart(),0);
--- a/concordia/token_annotation.cpp
+++ b/concordia/token_annotation.cpp
@ -13,3 +13,7 @@ TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start,
 // Destructor. The body is empty: nothing is released explicitly here.
 TokenAnnotation::~TokenAnnotation() {
 }
 // Definitions of the static annotation type codes of TokenAnnotation.
 // Each code is a plain, non-const char holding a small distinct integer
 // (0..3), so the values can be reassigned at runtime.
 // NOTE(review): "NE" presumably stands for "named entity" - confirm.
 char TokenAnnotation::NE_TYPE = 0;
 char TokenAnnotation::WORD_TYPE = 1;
 char TokenAnnotation::HTML_TAG_TYPE = 2;
 char TokenAnnotation::STOP_WORD_TYPE = 3;
--- a/concordia/token_annotation.hpp
+++ b/concordia/token_annotation.hpp
@ -44,6 +44,14 @@ public:
        return _value;
    }
    /*! Annotation type codes shared by all TokenAnnotation objects.
      They are declared as non-const static chars; the values are
      assigned in token_annotation.cpp (0, 1, 2 and 3 respectively).
      NOTE(review): NE presumably means "named entity" - confirm.
    */
    static char NE_TYPE;
    static char WORD_TYPE;
    static char HTML_TAG_TYPE;
    static char STOP_WORD_TYPE;
 protected:
    char _annotationType;
--- a/concordia/anonymized_sentence.cpp
+++ b/concordia/anonymized_sentence.cpp
@ -1,16 +1,16 @@
-#include "concordia/anonymized_sentence.hpp"
+#include "concordia/tokenized_sentence.hpp"
 #include "concordia/common/text_utils.hpp"
 #include <iostream>
-AnonymizedSentence::AnonymizedSentence(std::string sentence):
+TokenizedSentence::TokenizedSentence(std::string sentence):
                                         _sentence(sentence) {
 }
-AnonymizedSentence::~AnonymizedSentence() {
+TokenizedSentence::~TokenizedSentence() {
 }
-void AnonymizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
+void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
    std::vector<TokenAnnotation>::iterator newAnnotation = annotations.begin();
    std::list<TokenAnnotation>::iterator existingAnnotation = _tokenAnnotations.begin();
@ -43,6 +43,6 @@ void AnonymizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations
 }
-void AnonymizedSentence::toLowerCase() {
+void TokenizedSentence::toLowerCase() {
    _sentence = TextUtils::getInstance().toLowerCase(_sentence);
 }
--- a/concordia/anonymized_sentence.hpp
+++ b/concordia/anonymized_sentence.hpp
@ -1,5 +1,5 @@
-#ifndef ANONYMIZED_SENTENCE_HDR
+#ifndef TOKENIZED_SENTENCE_HDR
-#define ANONYMIZED_SENTENCE_HDR
+#define TOKENIZED_SENTENCE_HDR
 #include "concordia/common/config.hpp"
 #include "concordia/token_annotation.hpp"
@ -13,17 +13,17 @@
  along with the annotations list.
 */
-class AnonymizedSentence {
+class TokenizedSentence {
 public:
    /*!
      Constructor.
    */
-    AnonymizedSentence(std::string sentence);
+    TokenizedSentence(std::string sentence);
    /*! Destructor.
    */
-    virtual ~AnonymizedSentence();
+    virtual ~TokenizedSentence();
    /*! Getter for sentence
      \returns sentence
--- a/concordia/tutorial.dox
+++ b/concordia/tutorial.dox
@ -207,9 +207,8 @@ markers_path = "<CONCORDIA_HOME>/tests/resources/temp/temp_markers.bin"
 word_map_path = "<CONCORDIA_HOME>/tests/resources/temp/temp_word_map.bin"
 #-------------------------------------------------------------------------------
-# The following settings control the sentence anonymizer mechanism. It is used to
+# The following settings control the sentence tokenizer mechanism. Tokenizer
-# remove unnecessary symbols and possibly words from sentences added to index
+# takes into account html tags, substitutes predefined symbols
 # and search patterns. Anonymizer removes html tags, substitutes predefined symbols
 # with a single space, removes stop words (if the option is enabled), as well as
 # named entities and special symbols. All these have to be listed in files.
--- a/prod/resources/anonymizer/space_symbols.txt
+++ b/prod/resources/anonymizer/space_symbols.txt
@ -1,6 +0,0 @@
 \|
 \–
 \-
 \/
 ;
 :
--- a/prod/resources/anonymizer/stop_symbols.txt
+++ b/prod/resources/anonymizer/stop_symbols.txt
@ -1,37 +0,0 @@
 \\tab
 \\emdash
 \&lt;
 \&gt;
 \&amp;
 \&quot;
 \&dash;
 \&nbsp;
 <
 >
 =
 \+
 „
 ”
 \"
 …
 \.
 \,
 \?
 !
 '
 \(
 \)
 \{
 \}
 \@
 \#
 \$
 \%
 \^
 \&
 \*
 \[
 \]
 \\
 \~
 &#\d+
--- a/prod/resources/concordia-config/concordia.cfg.in
+++ b/prod/resources/concordia-config/concordia.cfg.in
@ -22,21 +22,15 @@ word_map_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
 # named entities and special symbols. All these have to be listed in files.
 # File containing all html tags (one per line)
-html_tags_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt"
+html_tags_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/html_tags.txt"
 # File containing all symbols to be replaced by spaces
 space_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
 # If set to true, words from predefined list are removed
 stop_words_enabled = "@STOP_WORDS_ENABLED@"
 # If stop_words_enabled is true, set the path to the stop words file
-#stop_words_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
+#stop_words_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/stop_words.txt"
 # File containing regular expressions that match named entities
-named_entities_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"
+named_entities_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/named_entities.txt"
 # File containing special symbols (one per line) to be removed
 stop_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_symbols.txt"
 ### eof
--- a/prod/resources/anonymizer/html_tags.txt
+++ b/prod/resources/anonymizer/html_tags.txt
--- a/prod/resources/anonymizer/named_entities.txt
+++ b/prod/resources/anonymizer/named_entities.txt
--- a/prod/resources/anonymizer/stop_words.txt
+++ b/prod/resources/anonymizer/stop_words.txt
--- a/tests/resources/anonymizer/space_symbols.txt
+++ b/tests/resources/anonymizer/space_symbols.txt
@ -1,6 +0,0 @@
 \|
 \–
 \-
 \/
 ;
 :
--- a/tests/resources/anonymizer/stop_symbols.txt
+++ b/tests/resources/anonymizer/stop_symbols.txt
@ -1,37 +0,0 @@
 \\tab
 \\emdash
 \&lt;
 \&gt;
 \&amp;
 \&quot;
 \&dash;
 \&nbsp;
 <
 >
 =
 \+
 „
 ”
 \"
 …
 \.
 \,
 \?
 !
 '
 \(
 \)
 \{
 \}
 \@
 \#
 \$
 \%
 \^
 \&
 \*
 \[
 \]
 \\
 \~
 &#\d+
--- a/tests/resources/concordia-config/concordia-mock.cfg
+++ b/tests/resources/concordia-config/concordia-mock.cfg
@ -14,15 +14,11 @@ markers_path = "/tmp/ma.bin"
 html_tags_path = "/tmp/html_tags.txt"
 space_symbols_path = "/tmp/space_symbols.txt"
 stop_words_enabled = "true"
 stop_words_path = "/tmp/stop_words.txt"
 named_entities_path = "/tmp/named_entities.txt"
 stop_symbols_path = "/tmp/stop_symbols.txt"
 ### eof
--- a/tests/resources/concordia-config/concordia.cfg.in
+++ b/tests/resources/concordia-config/concordia.cfg.in
@ -22,21 +22,15 @@ word_map_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
 # named entities and special symbols. All these have to be listed in files.
 # File containing all html tags (one per line)
-html_tags_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt"
+html_tags_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/html_tags.txt"
 # File containing all symbols to be replaced by spaces
 space_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
 # If set to true, words from predefined list are removed
 stop_words_enabled = "@STOP_WORDS_ENABLED@"
 # If stop_words_enabled is true, set the path to the stop words file
-#stop_words_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
+#stop_words_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/stop_words.txt"
 # File containing regular expressions that match named entities
-named_entities_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"
+named_entities_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/named_entities.txt"
 # File containing special symbols (one per line) to be removed
 stop_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_symbols.txt"
 ### eof
--- a/tests/resources/anonymizer/html_tags.txt
+++ b/tests/resources/anonymizer/html_tags.txt
--- a/tests/resources/anonymizer/named_entities.txt
+++ b/tests/resources/anonymizer/named_entities.txt
--- a/tests/resources/anonymizer/stop_words.txt
+++ b/tests/resources/anonymizer/stop_words.txt