From 9b1735516c1ee4a59a81dfbbdf89f5ef76886aac Mon Sep 17 00:00:00 2001
From: rjawor <rjawor@amu.edu.pl>
Date: Thu, 25 Jun 2015 20:49:22 +0200
Subject: [PATCH] working sentence tokenizer

---
 TODO.txt                                |   1 +
 concordia/regex_rule.cpp                |  12 +-
 concordia/regex_rule.hpp                |   4 +-
 concordia/sentence_tokenizer.cpp        |  12 +-
 concordia/t/test_regex_rule.cpp         |  24 +-
 concordia/t/test_sentence_tokenizer.cpp | 332 +++++++++++++++++++++---
 concordia/token_annotation.cpp          |  10 +-
 concordia/token_annotation.hpp          |  14 +-
 concordia/tokenized_sentence.cpp        |   1 +
 9 files changed, 343 insertions(+), 67 deletions(-)

diff --git a/TODO.txt b/TODO.txt
index 01c38d3..41a583d 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,5 +1,6 @@
 ---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) -----------------------------
 
+- repair stop words feature
 - work on word regex pattern (allow for some symbols and digits within the word)
 - document the code (classes, cfg files) and update tutorial
 IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? Należy przechowywać oryginalne pozycje tokenów w bazie danych concordia-server. Pozycje te będą obliczane przez funkcję generateTokenVector (przy użyciu listy oryginalnych pozycji, która będzie modyfikowana synchronicznie z każdą anonimizacją)
diff --git a/concordia/regex_rule.cpp b/concordia/regex_rule.cpp
index 636dfda..062c118 100644
--- a/concordia/regex_rule.cpp
+++ b/concordia/regex_rule.cpp
@@ -5,7 +5,7 @@
 #include <boost/throw_exception.hpp>
 
 RegexRule::RegexRule(std::string patternString,
-                     char annotationType,
+                     int annotationType,
                      std::string value,
                      bool caseSensitive)
                              throw(ConcordiaException):
@@ -43,7 +43,15 @@ void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
         for (; begin != end; ++begin) {
             SUFFIX_MARKER_TYPE matchBegin = begin->position();
             SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length();
-            TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, "");
+            std::string value;  // text stored in the annotation built below
+            if (_annotationType == TokenAnnotation::WORD) {  // WORD: keep the matched text itself
+                UnicodeString unicodeValue;
+                s.extract(begin->position(), begin->length(), unicodeValue); 
+                unicodeValue.toUTF8String(value);  // pass the match on as a UTF-8 std::string
+            } else {  // any other type: use the rule's _value member
+                value = _value;
+            }
+            TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, value);
             annotations.push_back(annotation);
         }
         sentence->addAnnotations(annotations);
diff --git a/concordia/regex_rule.hpp b/concordia/regex_rule.hpp
index 2c40bb3..dbc509a 100644
--- a/concordia/regex_rule.hpp
+++ b/concordia/regex_rule.hpp
@@ -28,7 +28,7 @@ public:
         \param caseSensitive case sensitivity of the pattern
     */
     RegexRule(std::string patternString,
-              char annotationType,
+              int annotationType,
               std::string value,
               bool caseSensitive = true)
               throw(ConcordiaException);
@@ -43,7 +43,7 @@ public:
     void apply(boost::shared_ptr<TokenizedSentence> sentence);
 
 private:
-    char _annotationType;
+    int _annotationType;
 
     std::string _value;
     
diff --git a/concordia/sentence_tokenizer.cpp b/concordia/sentence_tokenizer.cpp
index 663ed80..2adfbcd 100644
--- a/concordia/sentence_tokenizer.cpp
+++ b/concordia/sentence_tokenizer.cpp
@@ -16,7 +16,7 @@ SentenceTokenizer::SentenceTokenizer(
     if (_stopWordsEnabled) {
         _stopWords = _getMultipleRegexRule(
                                   config->getStopWordsFilePath(),
-                                  TokenAnnotation::STOP_WORD_TYPE,
+                                  TokenAnnotation::STOP_WORD,
                                   "", true);
     }
 }
@@ -42,7 +42,11 @@ boost::shared_ptr<TokenizedSentence>
     }
     
     boost::shared_ptr<RegexRule> wordsRule(
-                        new RegexRule("\\w+", TokenAnnotation::WORD_TYPE, "word"));
+                        new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}", TokenAnnotation::WORD, ""));
+    wordsRule->apply(result);  // words of 2+ letters; ' and - may appear between the first and last letter
+    boost::shared_ptr<RegexRule> singleLetterWordsRule(
+                        new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
+    singleLetterWordsRule->apply(result);  // one-letter words, which the pattern above cannot match
 
     return result;
 }
@@ -67,7 +71,7 @@ void SentenceTokenizer::_createNeRules(std::string & namedEntitiesPath) {
                 } else {
                     _namedEntities.push_back(RegexRule(
                                 tokenTexts->at(0),
-                                TokenAnnotation::NE_TYPE,
+                                TokenAnnotation::NE,
                                 tokenTexts->at(1)));
                 }
             }
@@ -99,7 +103,7 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
     tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
     tagsExpression += "br).*?>";
     _htmlTags = boost::shared_ptr<RegexRule>(
-                        new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG_TYPE, "", false));
+                        new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG, "", false));
 }
 
 boost::shared_ptr<RegexRule>
diff --git a/concordia/t/test_regex_rule.cpp b/concordia/t/test_regex_rule.cpp
index e650067..78685cf 100644
--- a/concordia/t/test_regex_rule.cpp
+++ b/concordia/t/test_regex_rule.cpp
@@ -10,9 +10,9 @@
 
 BOOST_AUTO_TEST_SUITE(regex_rule)
 
-BOOST_AUTO_TEST_CASE( SimpleReplacement )
+BOOST_AUTO_TEST_CASE( SimpleAnnotation )
 {
-    RegexRule rr("a", TokenAnnotation::WORD_TYPE, "b");
+    RegexRule rr("a", TokenAnnotation::WORD, "b");
     boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("xxxxxxxaxxxaxxaxaxa"));
     rr.apply(ts);    
     BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
@@ -44,7 +44,7 @@ BOOST_AUTO_TEST_CASE( BadRegex )
     bool exceptionThrown = false;
     std::string message = "";
     try {
-        RegexRule rr("+a",TokenAnnotation::WORD_TYPE, "b");
+        RegexRule rr("+a",TokenAnnotation::WORD, "b");
     } catch (ConcordiaException & e) {
         exceptionThrown = true;
         message = e.what();
@@ -53,9 +53,9 @@ BOOST_AUTO_TEST_CASE( BadRegex )
     BOOST_CHECK_EQUAL(boost::starts_with(message, "Bad regex pattern"), true);    
 }
 
-BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
+BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation )
 {
-    RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD_TYPE, "");
+    RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD, "");
     boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
     rr.apply(ts);
     BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
@@ -83,9 +83,9 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
 }
 
 
-BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
+BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation )
 {
-    RegexRule rr("abc", TokenAnnotation::WORD_TYPE, "xxx", false);
+    RegexRule rr("abc", TokenAnnotation::WORD, "xxx", false);
     boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("This is AbC and ABC and abc and aBC."));
     rr.apply(ts);
     BOOST_CHECK_EQUAL(ts->getAnnotations().size(),4);
@@ -108,9 +108,9 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
     BOOST_CHECK_EQUAL(iter->getEnd(),35);
 }
 
-BOOST_AUTO_TEST_CASE( UnicodeReplacement )
+BOOST_AUTO_TEST_CASE( UnicodeAnnotation )
 {
-    RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x");
+    RegexRule rr("ą", TokenAnnotation::WORD, "x");
     boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń"));
     rr.apply(ts);
     BOOST_CHECK_EQUAL(ts->getAnnotations().size(),1);
@@ -121,9 +121,9 @@ BOOST_AUTO_TEST_CASE( UnicodeReplacement )
     BOOST_CHECK_EQUAL(iter->getEnd(),12);
 }
 
-BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
+BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation )
 {
-    RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x", false);
+    RegexRule rr("ą", TokenAnnotation::WORD, "x", false);
     boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
     rr.apply(ts);
     BOOST_CHECK_EQUAL(ts->getAnnotations().size(),2);
@@ -140,7 +140,7 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
 
 BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
 {
-    RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD_TYPE, "x", false);
+    RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD, "x", false);
     boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
     rr.apply(ts);
     BOOST_CHECK_EQUAL(ts->getAnnotations().size(),18);
diff --git a/concordia/t/test_sentence_tokenizer.cpp b/concordia/t/test_sentence_tokenizer.cpp
index 49d7244..cd1df21 100644
--- a/concordia/t/test_sentence_tokenizer.cpp
+++ b/concordia/t/test_sentence_tokenizer.cpp
@@ -19,29 +19,310 @@ BOOST_AUTO_TEST_CASE( NETest )
     SentenceTokenizer tokenizer(config);
     
     
-    std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34";
+    std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
     boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
     std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    std::list<TokenAnnotation>::iterator iter = annotations.begin();
+    
+    BOOST_CHECK_EQUAL(14,annotations.size());
+    
+    /*
+    0,4 type: 1 value: date
+    6,16 type: 0 value: ne_date
+    18,22 type: 1 value: mail
+    24,40 type: 0 value: ne_email
+    42,48 type: 1 value: number
+    50,54 type: 0 value: ne_number
+    56,61 type: 1 value: hello
+    61,62 type: 0 value: ne_number
+    63,69 type: 1 value: zażółć
+    70,75 type: 1 value: gęślą
+    76,80 type: 1 value: jaźń
+    82,88 type: 1 value: zażółć
+    89,94 type: 1 value: gęślą
+    95,99 type: 1 value: jaźń
+    */
 
-    BOOST_CHECK_EQUAL(8,annotations.size());
-    BOOST_FOREACH(TokenAnnotation annotation, annotations) {
-        std::cout << annotation.getStart() << ","
-                  << annotation.getEnd() << " type: "
-                  << annotation.getType() << " value: "
-                  << annotation.getValue() << std::endl;
-    }
-//    BOOST_CHECK_EQUAL(,"date  ne_date mail  ne_email number  ne_number");
+    BOOST_CHECK_EQUAL(iter->getStart(),0);
+    BOOST_CHECK_EQUAL(iter->getEnd(),4);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(), "date");
+    iter++;
+    
+    BOOST_CHECK_EQUAL(iter->getStart(),6);
+    BOOST_CHECK_EQUAL(iter->getEnd(),16);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
+    BOOST_CHECK_EQUAL(iter->getValue(), "ne_date");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),18);
+    BOOST_CHECK_EQUAL(iter->getEnd(),22);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(), "mail");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),24);
+    BOOST_CHECK_EQUAL(iter->getEnd(),40);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
+    BOOST_CHECK_EQUAL(iter->getValue(), "ne_email");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),42);
+    BOOST_CHECK_EQUAL(iter->getEnd(),48);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(), "number");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),50);
+    BOOST_CHECK_EQUAL(iter->getEnd(),54);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
+    BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
+    iter++;
+    
+    BOOST_CHECK_EQUAL(iter->getStart(),56);
+    BOOST_CHECK_EQUAL(iter->getEnd(),61);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(), "hello");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),61);
+    BOOST_CHECK_EQUAL(iter->getEnd(),62);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
+    BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),63);
+    BOOST_CHECK_EQUAL(iter->getEnd(),69);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),70);
+    BOOST_CHECK_EQUAL(iter->getEnd(),75);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),76);
+    BOOST_CHECK_EQUAL(iter->getEnd(),80);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),82);
+    BOOST_CHECK_EQUAL(iter->getEnd(),88);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),89);
+    BOOST_CHECK_EQUAL(iter->getEnd(),94);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),95);
+    BOOST_CHECK_EQUAL(iter->getEnd(),99);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
+    
 }
 
 BOOST_AUTO_TEST_CASE( HtmlTagsTest )
 {
     boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
     SentenceTokenizer tokenizer(config);
-    
-    
+    // Every tag and every word is expected as its own annotation, with offsets into the original sentence.
     std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
-    BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"link and bold and newline ");
+    boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    std::list<TokenAnnotation>::iterator iter = annotations.begin();
+        
+    /* Expected annotations, in order (type 2 is HTML_TAG, type 1 is WORD):
+    0,23 type: 2 value: 
+    23,27 type: 1 value: link
+    27,31 type: 2 value: 
+    32,35 type: 1 value: and
+    36,39 type: 2 value: 
+    39,43 type: 1 value: bold
+    43,47 type: 2 value: 
+    48,51 type: 1 value: and
+    52,59 type: 1 value: newline
+    60,65 type: 2 value:
+    */
+
+    BOOST_CHECK_EQUAL(10,annotations.size());
     
+    BOOST_CHECK_EQUAL(iter->getStart(),0);
+    BOOST_CHECK_EQUAL(iter->getEnd(),23);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
+    iter++;
+    
+    BOOST_CHECK_EQUAL(iter->getStart(),23);
+    BOOST_CHECK_EQUAL(iter->getEnd(),27);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(),"link");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),27);
+    BOOST_CHECK_EQUAL(iter->getEnd(),31);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),32);
+    BOOST_CHECK_EQUAL(iter->getEnd(),35);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(),"and");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),36);
+    BOOST_CHECK_EQUAL(iter->getEnd(),39);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),39);
+    BOOST_CHECK_EQUAL(iter->getEnd(),43);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(),"bold");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),43);
+    BOOST_CHECK_EQUAL(iter->getEnd(),47);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),48);
+    BOOST_CHECK_EQUAL(iter->getEnd(),51);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(),"and");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),52);
+    BOOST_CHECK_EQUAL(iter->getEnd(),59);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(),"newline");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),60);
+    BOOST_CHECK_EQUAL(iter->getEnd(),65);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
+}
+
+BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
+{
+    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
+    SentenceTokenizer tokenizer(config);
+    // ' and - are expected to stay in a word only between its letters; at a word's edge they are left out of the annotation.
+    std::string sentence = "This is a sentence, don't over-analyze it. zażółć' gęś'lą -jaźń ZAŻ-ÓŁĆ GĘŚLĄ JAŹ'Ń";
+    boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    std::list<TokenAnnotation>::iterator iter = annotations.begin();
+
+    /* Debugging aid, disabled: prints one "start,end type: T value: V" line per annotation.
+    BOOST_FOREACH(TokenAnnotation annotation, annotations) {
+        std::cout << annotation.getStart() << ","
+                  << annotation.getEnd() << " type: "
+                  << annotation.getType() << " value: "
+                  << annotation.getValue() << std::endl;
+    }
+    */
+    
+    /* Expected annotations, in order (type 1 is WORD); values are lowercase even for upper-case input:
+    0,4 type: 1 value: this
+    5,7 type: 1 value: is
+    8,9 type: 1 value: a
+    10,18 type: 1 value: sentence
+    20,25 type: 1 value: don't
+    26,38 type: 1 value: over-analyze
+    39,41 type: 1 value: it
+    43,49 type: 1 value: zażółć
+    51,57 type: 1 value: gęś'lą
+    59,63 type: 1 value: jaźń
+    64,71 type: 1 value: zaż-ółć
+    72,77 type: 1 value: gęślą
+    78,83 type: 1 value: jaź'ń
+    */
+
+    BOOST_CHECK_EQUAL(13,annotations.size());
+    
+    BOOST_CHECK_EQUAL(iter->getStart(),0);
+    BOOST_CHECK_EQUAL(iter->getEnd(),4);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(),"this");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),5);
+    BOOST_CHECK_EQUAL(iter->getEnd(),7);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(),"is");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),8);
+    BOOST_CHECK_EQUAL(iter->getEnd(),9);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(),"a");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),10);
+    BOOST_CHECK_EQUAL(iter->getEnd(),18);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(),"sentence");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),20);
+    BOOST_CHECK_EQUAL(iter->getEnd(),25);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(),"don't");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),26);
+    BOOST_CHECK_EQUAL(iter->getEnd(),38);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(),"over-analyze");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),39);
+    BOOST_CHECK_EQUAL(iter->getEnd(),41);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(),"it");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),43);
+    BOOST_CHECK_EQUAL(iter->getEnd(),49);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(),"zażółć");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),51);
+    BOOST_CHECK_EQUAL(iter->getEnd(),57);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(),"gęś'lą");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),59);
+    BOOST_CHECK_EQUAL(iter->getEnd(),63);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(),"jaźń");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),64);
+    BOOST_CHECK_EQUAL(iter->getEnd(),71);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(),"zaż-ółć");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),72);
+    BOOST_CHECK_EQUAL(iter->getEnd(),77);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(),"gęślą");
+    iter++;
+
+    BOOST_CHECK_EQUAL(iter->getStart(),78);
+    BOOST_CHECK_EQUAL(iter->getEnd(),83);
+    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
+    BOOST_CHECK_EQUAL(iter->getValue(),"jaź'ń");
+    iter++;
+
 }
 
 BOOST_AUTO_TEST_CASE( StopWordsTest )
@@ -54,36 +335,17 @@ BOOST_AUTO_TEST_CASE( StopWordsTest )
     }
 }
 
-BOOST_AUTO_TEST_CASE( StopSymbolsTest )
-{
-    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
-    SentenceTokenizer tokenizer(config);
-    
-    
-    std::string sentence = "xxx, . xxx  # xx $xx@ xx";
-    BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"xxx  xxx   xx xx xx");
-    
-}
-
-BOOST_AUTO_TEST_CASE( SpaceSymbolsTest )
-{
-    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
-    SentenceTokenizer tokenizer(config);
-    
-    
-    std::string sentence = "xxx-xxx xx|xx";
-    BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"xxx xxx xx xx");
-    
-}
-
 BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
 {
     boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
     SentenceTokenizer tokenizer(config);
     
     std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
-    BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"sony   dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl   dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw   ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap   rm mne_numbergrm mne_numbere   —   ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number  ");
+    boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    // Only the total number of annotations is checked for this input; individual annotations are not inspected.
     
+    BOOST_CHECK_EQUAL(161, annotations.size());
 }
 
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/concordia/token_annotation.cpp b/concordia/token_annotation.cpp
index a44f820..2d0a470 100644
--- a/concordia/token_annotation.cpp
+++ b/concordia/token_annotation.cpp
@@ -3,7 +3,7 @@
 
 TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start,
                                  const SUFFIX_MARKER_TYPE end,
-                                 const char annotationType,
+                                 const int annotationType,
                                  const std::string & value):
                                             Interval(start, end),
                                             _annotationType(annotationType),
@@ -13,7 +13,7 @@ TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start,
 TokenAnnotation::~TokenAnnotation() {
 }
 
-char TokenAnnotation::NE_TYPE = 0;
-char TokenAnnotation::WORD_TYPE = 1;
-char TokenAnnotation::HTML_TAG_TYPE = 2;
-char TokenAnnotation::STOP_WORD_TYPE = 3;
+int TokenAnnotation::NE = 0;  // annotation type: named entity
+int TokenAnnotation::WORD = 1;  // annotation type: word
+int TokenAnnotation::HTML_TAG = 2;  // annotation type: HTML tag
+int TokenAnnotation::STOP_WORD = 3;  // annotation type: stop word. NOTE(review): these statics are not const, so any caller can reassign them
diff --git a/concordia/token_annotation.hpp b/concordia/token_annotation.hpp
index d98af1a..11153e5 100644
--- a/concordia/token_annotation.hpp
+++ b/concordia/token_annotation.hpp
@@ -23,7 +23,7 @@ public:
     */
     TokenAnnotation(const SUFFIX_MARKER_TYPE start,
                     const SUFFIX_MARKER_TYPE end,
-                    const char annotationType,
+                    const int annotationType,
                     const std::string & value);
 
     /*! Destructor.
@@ -33,7 +33,7 @@ public:
     /*! Getter for annotation type.
       \returns annotation type
     */
-    char getType() const {
+    int getType() const {
         return _annotationType;
     }
 
@@ -44,16 +44,16 @@ public:
         return _value;
     }
 
-    static char NE_TYPE;
+    static int NE;  ///< annotation type used for named entities
 
-    static char WORD_TYPE;
+    static int WORD;  ///< annotation type used for words
 
-    static char HTML_TAG_TYPE;
+    static int HTML_TAG;  ///< annotation type used for HTML tags
 
-    static char STOP_WORD_TYPE;
+    static int STOP_WORD;  ///< annotation type used for stop words
     
 protected:
-    char _annotationType;
+    int _annotationType;
 
     std::string _value;
 };
diff --git a/concordia/tokenized_sentence.cpp b/concordia/tokenized_sentence.cpp
index 0c0c014..e828c54 100644
--- a/concordia/tokenized_sentence.cpp
+++ b/concordia/tokenized_sentence.cpp
@@ -2,6 +2,7 @@
 #include "concordia/common/text_utils.hpp"
 
 #include <iostream>
+#include <boost/foreach.hpp>
 
 TokenizedSentence::TokenizedSentence(std::string sentence):
                                          _sentence(sentence) {