adding all tokenized examples

parent a765443a01
commit 68fecaddf8

TODO.txt
@@ -1,4 +1,5 @@
 ---------------------------- Developer's private notes (language may vary, because sometimes that's just more convenient) -----------------------------
+DONE - change the arguments of addExample* functions to const reference to TokenizedSentence (not boost::shared_ptr<TokenizedSentence>)
 - multiple indexes based on different hashes. One can be word-net base forms, other - pos-tags and so on. Develop a method of combining results.
 IN PROGRESS - document the code (classes, cfg files) and update tutorial
 - multiple translation memories: they can be stored in a single index, but tm_id has to be added as sentence metadata (e.g. instead of example length). When searching, results have to be filtered so that they come from the right translation memory.
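For reference, a minimal sketch of the call style this DONE item enables (the config path is hypothetical; the classes and methods are the ones touched by this commit):

    #include "concordia/concordia.hpp"

    int main() {
        Concordia concordia("concordia.cfg");  // hypothetical config path

        // tokenize() now returns TokenizedSentence by value...
        TokenizedSentence ts = concordia.tokenize("Ala posiada kota");

        // ...and addTokenizedExample() takes it by const reference.
        concordia.addTokenizedExample(ts, 14);
        concordia.refreshSAfromRAM();
        return 0;
    }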
@@ -29,7 +29,7 @@ void checkConcordiaResults(
     long lineIndex = 1;
     BOOST_FOREACH(ConcordiaSearchResult result, results) {
         SUFFIX_MARKER_TYPE patternSize =
-                result.getTokenizedPattern()->getTokens().size();
+                result.getTokenizedPattern().getTokens().size();
         if (patternSize > 0) {
             if (result.getBestOverlay().size() != 1) {
                 reportError(baseLineCount + lineIndex,
@@ -203,7 +203,7 @@ int main(int argc, char** argv) {

         std::cout << "\tPattern used: " << std::endl << "\t\t";
         BOOST_FOREACH(TokenAnnotation annotation,
-                result->getTokenizedPattern()->getTokens()) {
+                result->getTokenizedPattern().getTokens()) {
             std::cout << annotation.getValue() << " ";
         }
         std::cout << std::endl;
@@ -1,4 +1,5 @@
 #include <sstream>
+#include <boost/foreach.hpp>

 #include "concordia/concordia.hpp"
 #include "concordia/common/config.hpp"
@@ -42,19 +43,31 @@ std::string _createLibraryVersion() {
     return version.str();
 }

-boost::shared_ptr<TokenizedSentence>
+TokenizedSentence
     Concordia::tokenize(const std::string & sentence)
                                         throw(ConcordiaException) {
-    boost::shared_ptr<TokenizedSentence> result =
+    TokenizedSentence result =
         _hashGenerator->generateHash(sentence);
     _hashGenerator->serializeWordMap();
     return result;
 }

+std::vector<TokenizedSentence> Concordia::tokenizeAll(
+    const std::vector<std::string> & sentences)
+                                        throw(ConcordiaException) {
+    std::vector<TokenizedSentence> result;
+    BOOST_FOREACH(std::string sentence, sentences) {
+        result.push_back(_hashGenerator->generateHash(sentence));
+    }
+
+    _hashGenerator->serializeWordMap();
+    return result;
+}
+
 // Sentences are written to disk and added to T.
 // SA is generated on command by other methods.
-boost::shared_ptr<TokenizedSentence> Concordia::addExample(
+TokenizedSentence Concordia::addExample(
     const Example & example)
                                         throw(ConcordiaException) {
     return _index->addExample(_hashGenerator, _T, _markers, example);
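A quick sketch of how the new tokenizeAll entry point reads from the caller's side (the sentence strings are illustrative, borrowed from the updated tests further down):

    std::vector<std::string> sentences;
    sentences.push_back("Alice has a cat");
    sentences.push_back("Alice has a dog");

    // One pass over the batch; note that serializeWordMap() runs once
    // per call, not once per sentence as repeated tokenize() calls would.
    std::vector<TokenizedSentence> tokenized = concordia.tokenizeAll(sentences);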
@@ -63,13 +76,21 @@ boost::shared_ptr<TokenizedSentence> Concordia::addExample(
 // Sentences are written to disk and added to T.
 // SA is generated on command by other methods.
 void Concordia::addTokenizedExample(
-    boost::shared_ptr<TokenizedSentence> tokenizedSentence,
-    SUFFIX_MARKER_TYPE id)
+    const TokenizedSentence & tokenizedSentence,
+    const SUFFIX_MARKER_TYPE id)
                                         throw(ConcordiaException) {
     _index->addTokenizedExample(_hashGenerator, _T,
                                 _markers, tokenizedSentence, id);
 }

+void Concordia::addAllTokenizedExamples(
+    const std::vector<TokenizedSentence> & tokenizedSentences,
+    const std::vector<SUFFIX_MARKER_TYPE> & ids)
+                                        throw(ConcordiaException) {
+    _index->addAllTokenizedExamples(_hashGenerator, _T,
+                                    _markers, tokenizedSentences, ids);
+}
+
 // Sentences are written to disk and added to T.
 // SA is generated on command by other methods.
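Combined with tokenizeAll, this gives the batch indexing workflow the commit is named after (a sketch mirroring the updated ConcordiaSearch2 test below; the ids are illustrative):

    std::vector<std::string> sentences;
    std::vector<SUFFIX_MARKER_TYPE> ids;
    sentences.push_back("Alice has a cat");
    ids.push_back(56);
    sentences.push_back("Alice has a dog");
    ids.push_back(23);

    // Tokenize the whole batch, add it in one call, then rebuild the
    // suffix array on demand.
    std::vector<TokenizedSentence> tokenized = concordia.tokenizeAll(sentences);
    concordia.addAllTokenizedExamples(tokenized, ids);
    concordia.refreshSAfromRAM();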
@@ -188,8 +209,7 @@ boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
     } else {
         std::string empty;
         return boost::shared_ptr<ConcordiaSearchResult>(
-            new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>(
-                new TokenizedSentence(empty))));
+            new ConcordiaSearchResult(TokenizedSentence(empty)));
     }
 }

@@ -58,7 +58,16 @@ public:
       containing information about original word positions
       \throws ConcordiaException
     */
-    boost::shared_ptr<TokenizedSentence> tokenize(const std::string & sentence)
+    TokenizedSentence tokenize(const std::string & sentence)
+                                        throw(ConcordiaException);
+
+    /*! Tokenizes all the given sentences.
+      \param sentences vector of sentences to be tokenized
+      \returns vector of tokenized sentence objects
+      \throws ConcordiaException
+    */
+    std::vector<TokenizedSentence> tokenizeAll(
+        const std::vector<std::string> & sentences)
                                         throw(ConcordiaException);

     /*! Adds an Example to the index.
@@ -67,17 +76,27 @@ public:
       containing information about original word positions
       \throws ConcordiaException
     */
-    boost::shared_ptr<TokenizedSentence> addExample(const Example & example)
+    TokenizedSentence addExample(const Example & example)
                                         throw(ConcordiaException);

     /*! Adds a tokenized example to the index.
       \param tokenizedSentence tokenized sentence to be added
-      \param id of the sentence to be added
+      \param id id of the sentence to be added
       \throws ConcordiaException
     */
     void addTokenizedExample(
-        boost::shared_ptr<TokenizedSentence> tokenizedSentence,
-        SUFFIX_MARKER_TYPE id)
+        const TokenizedSentence & tokenizedSentence,
+        const SUFFIX_MARKER_TYPE id)
+                                        throw(ConcordiaException);
+
+    /*! Adds multiple tokenized examples to the index.
+      \param tokenizedSentences vector of tokenized sentences to be added
+      \param ids vector of ids of the sentences to be added
+      \throws ConcordiaException
+    */
+    void addAllTokenizedExamples(
+        const std::vector<TokenizedSentence> & tokenizedSentences,
+        const std::vector<SUFFIX_MARKER_TYPE> & ids)
                                         throw(ConcordiaException);

     /*! Adds multiple examples to the index.
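The header now promises value semantics throughout; for instance, addExample can be used like this (the example text and id mirror the ConcordiaSimpleSearch1 test below):

    TokenizedSentence ts = concordia.addExample(Example("Ala posiada kota", 14));
    size_t tokenCount = ts.getTokens().size();  // direct access, no shared_ptr dereference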
@@ -4,6 +4,8 @@
 #include "concordia/common/config.hpp"
 #include <boost/filesystem.hpp>
 #include <boost/foreach.hpp>
+#include <boost/make_shared.hpp>
+
 #include <iostream>
 #include <climits>

@@ -48,10 +50,10 @@ std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(

     std::vector<TokenizedSentence> hashedPatterns;
     BOOST_FOREACH(Example example, examples) {
-        boost::shared_ptr<TokenizedSentence> hashedPattern =
+        TokenizedSentence hashedPattern =
             _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
                               T, markers, example);
-        hashedPatterns.push_back(*hashedPattern);
+        hashedPatterns.push_back(hashedPattern);
     }

     hashedIndexFile.close();
@@ -61,7 +63,7 @@ std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
     return hashedPatterns;
 }

-boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
+TokenizedSentence ConcordiaIndex::addExample(
     boost::shared_ptr<HashGenerator> hashGenerator,
     boost::shared_ptr<std::vector<sauchar_t> > T,
     boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@@ -72,7 +74,7 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
     std::ofstream markersFile;
     markersFile.open(_markersFilePath.c_str(), std::ios::out|
                                 std::ios::app|std::ios::binary);
-    boost::shared_ptr<TokenizedSentence> hashedPattern =
+    TokenizedSentence hashedPattern =
         _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
                           T, markers, example);
     hashedIndexFile.close();
@@ -86,8 +88,8 @@ void ConcordiaIndex::addTokenizedExample(
     boost::shared_ptr<HashGenerator> hashGenerator,
     boost::shared_ptr<std::vector<sauchar_t> > T,
     boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
-    boost::shared_ptr<TokenizedSentence> tokenizedSentence,
-    SUFFIX_MARKER_TYPE id) {
+    const TokenizedSentence & tokenizedSentence,
+    const SUFFIX_MARKER_TYPE id) {
     std::ofstream hashedIndexFile;
     hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
                                 std::ios::app|std::ios::binary);
@@ -100,15 +102,38 @@ void ConcordiaIndex::addTokenizedExample(
     markersFile.close();
 }

+void ConcordiaIndex::addAllTokenizedExamples(
+    boost::shared_ptr<HashGenerator> hashGenerator,
+    boost::shared_ptr<std::vector<sauchar_t> > T,
+    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
+    const std::vector<TokenizedSentence> & tokenizedSentences,
+    const std::vector<SUFFIX_MARKER_TYPE> & ids) {
+    std::ofstream hashedIndexFile;
+    hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
+                                std::ios::app|std::ios::binary);
+    std::ofstream markersFile;
+    markersFile.open(_markersFilePath.c_str(), std::ios::out|
+                                std::ios::app|std::ios::binary);
+
+    int index = 0;
+    BOOST_FOREACH(TokenizedSentence tokenizedSentence, tokenizedSentences) {
+        _addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
+                                   T, markers, tokenizedSentence, ids.at(index));
+        index++;
+    }
+    hashedIndexFile.close();
+    markersFile.close();
+}
+
 void ConcordiaIndex::_addSingleTokenizedExample(
     std::ofstream & hashedIndexFile,
     std::ofstream & markersFile,
     boost::shared_ptr<HashGenerator> hashGenerator,
     boost::shared_ptr<std::vector<sauchar_t> > T,
     boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
-    boost::shared_ptr<TokenizedSentence> tokenizedSentence,
-    SUFFIX_MARKER_TYPE id) {
-    std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence->getCodes();
+    const TokenizedSentence & tokenizedSentence,
+    const SUFFIX_MARKER_TYPE id) {
+    std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence.getCodes();

     int offset = 0;
     for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
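One behavioral note on the new batch method: sentences and ids are paired positionally through ids.at(index), so a size mismatch only surfaces as std::out_of_range part-way through writing to the index files. A defensive caller-side guard might look like this (a sketch; the library itself does not enforce the check here):

    if (tokenizedSentences.size() != ids.size()) {
        throw ConcordiaException("tokenizedSentences and ids differ in size");
    }
    concordia.addAllTokenizedExamples(tokenizedSentences, ids);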
@@ -139,14 +164,14 @@ void ConcordiaIndex::_addSingleTokenizedExample(
         markers->push_back(sentenceBoundaryMA);
 }

-boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
+TokenizedSentence ConcordiaIndex::_addSingleExample(
     std::ofstream & hashedIndexFile,
     std::ofstream & markersFile,
     boost::shared_ptr<HashGenerator> hashGenerator,
     boost::shared_ptr<std::vector<sauchar_t> > T,
     boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
     const Example & example) {
-    boost::shared_ptr<TokenizedSentence> hashedPattern =
+    TokenizedSentence hashedPattern =
         hashGenerator->generateHash(example.getSentence());
     _addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
                                T, markers, hashedPattern, example.getId());
@@ -53,7 +53,7 @@ public:
       \returns tokenized example
       \throws ConcordiaException
     */
-    boost::shared_ptr<TokenizedSentence> addExample(
+    TokenizedSentence addExample(
         boost::shared_ptr<HashGenerator> hashGenerator,
         boost::shared_ptr<std::vector<sauchar_t> > T,
         boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@@ -63,7 +63,6 @@ public:
       and markers array are appended with the example.
       At the same time, HDD versions of these
       two data structures are also appended with the same example.
-      The method returns a tokenized version of the example.
       \param hashGenerator hash generator to be used to prepare the hash
                            of the example
       \param T RAM-based hash index to be appended to
@@ -77,8 +76,28 @@ public:
         boost::shared_ptr<HashGenerator> hashGenerator,
         boost::shared_ptr<std::vector<sauchar_t> > T,
         boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
-        boost::shared_ptr<TokenizedSentence> tokenizedSentence,
-        SUFFIX_MARKER_TYPE id);
+        const TokenizedSentence & tokenizedSentence,
+        const SUFFIX_MARKER_TYPE id);
+
+    /*! Adds multiple tokenized examples to the index. Hashed index
+      and markers array are appended with the examples.
+      At the same time, HDD versions of these
+      two data structures are also appended with the same examples.
+      \param hashGenerator hash generator to be used to prepare the hash
+                           of the examples
+      \param T RAM-based hash index to be appended to
+      \param markers RAM-based markers array to be appended to
+      \param tokenizedSentences vector of tokenized sentences to be added
+      \param ids vector of ids of the sentences to be added
+      \throws ConcordiaException
+    */
+    void addAllTokenizedExamples(
+        boost::shared_ptr<HashGenerator> hashGenerator,
+        boost::shared_ptr<std::vector<sauchar_t> > T,
+        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
+        const std::vector<TokenizedSentence> & tokenizedSentences,
+        const std::vector<SUFFIX_MARKER_TYPE> & ids);

     /*! Adds multiple examples to the index. Examples are first hashed using
       the hash generator passed to this method. Then, hashed index
@@ -114,10 +133,10 @@ private:
         boost::shared_ptr<HashGenerator> hashGenerator,
         boost::shared_ptr<std::vector<sauchar_t> > T,
         boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
-        boost::shared_ptr<TokenizedSentence> tokenizedSentence,
-        SUFFIX_MARKER_TYPE id);
+        const TokenizedSentence & tokenizedSentence,
+        const SUFFIX_MARKER_TYPE id);

-    boost::shared_ptr<TokenizedSentence> _addSingleExample(
+    TokenizedSentence _addSingleExample(
         std::ofstream & hashedIndexFile,
         std::ofstream & markersFile,
         boost::shared_ptr<HashGenerator> hashGenerator,
@@ -4,7 +4,7 @@
 #include <algorithm>

 ConcordiaSearchResult::ConcordiaSearchResult(
-    boost::shared_ptr<TokenizedSentence> tokenizedPattern):
+    TokenizedSentence tokenizedPattern):
     _tokenizedPattern(tokenizedPattern),
     _bestOverlayScore(0) {
 }
@@ -27,7 +27,7 @@ void ConcordiaSearchResult::computeBestOverlay() {
     // the fragments are already sorted by their ends, ascending
     _checkPossibleOverlays(std::vector<MatchedPatternFragment>(),
                            -1,
-                           _tokenizedPattern->getTokens().size());
+                           _tokenizedPattern.getTokens().size());
 }

 void ConcordiaSearchResult::_checkPossibleOverlays(
@@ -26,8 +26,7 @@ public:
     /*! Constructor.
       \param tokenVector tokenized pattern which was used for searching
     */
-    explicit ConcordiaSearchResult(
-        boost::shared_ptr<TokenizedSentence> tokenizedPattern);
+    explicit ConcordiaSearchResult(TokenizedSentence tokenizedPattern);

     /*! Destructor.
     */
@@ -51,7 +50,7 @@ public:
     /*! Getter for tokenized pattern.
       \returns tokenized search pattern
     */
-    boost::shared_ptr<TokenizedSentence> getTokenizedPattern() const {
+    TokenizedSentence getTokenizedPattern() const {
         return _tokenizedPattern;
     }

@@ -82,7 +81,7 @@ private:
         SUFFIX_MARKER_TYPE lastAddedPos,
         SUFFIX_MARKER_TYPE patternSize);

-    boost::shared_ptr<TokenizedSentence> _tokenizedPattern;
+    TokenizedSentence _tokenizedPattern;

     std::vector<MatchedPatternFragment> _matchedPatternFragments;

@@ -27,13 +27,12 @@ HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
 HashGenerator::~HashGenerator() {
 }

-boost::shared_ptr<TokenizedSentence> HashGenerator::generateHash(
+TokenizedSentence HashGenerator::generateHash(
     const std::string & sentence) throw(ConcordiaException) {
-    boost::shared_ptr<TokenizedSentence> ts =
-        _sentenceTokenizer->tokenize(sentence);
-    ts->generateHash(_wordMap);
+    TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence);
+    ts.generateHash(_wordMap);

-    if (ts->getTokens().size() > Utils::maxSentenceSize) {
+    if (ts.getTokens().size() > Utils::maxSentenceSize) {
         throw ConcordiaException("Trying to add too long sentence.");
     }

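Since generateHash still enforces Utils::maxSentenceSize, callers of the value-returning API keep the same failure mode (a sketch; longSentence is hypothetical):

    try {
        TokenizedSentence ts = concordia.tokenize(longSentence);
    } catch (ConcordiaException & e) {
        // Raised when the token count exceeds Utils::maxSentenceSize:
        // "Trying to add too long sentence."
    }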
@@ -44,8 +44,7 @@ public:
       \param sentence sentence to generate hash from
       \returns tokenized sentence, containing the hash
     */
-    boost::shared_ptr<TokenizedSentence> generateHash(
-        const std::string & sentence)
+    TokenizedSentence generateHash(const std::string & sentence)
                                         throw(ConcordiaException);

     /*!
@@ -23,7 +23,7 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(

     int left;
     std::vector<INDEX_CHARACTER_TYPE> hash =
-        hashGenerator->generateHash(pattern)->getCodes();
+        hashGenerator->generateHash(pattern).getCodes();
     saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
     sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);

@@ -60,7 +60,7 @@ std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
     boost::shared_ptr<std::vector<saidx_t> > SA,
     const std::string & pattern) throw(ConcordiaException) {
     std::vector<INDEX_CHARACTER_TYPE> hash =
-        hashGenerator->generateHash(pattern)->getCodes();
+        hashGenerator->generateHash(pattern).getCodes();
     return _concordiaSearcher->anubisSearch(config, T, markers, SA, hash);
 }

@@ -70,13 +70,12 @@ boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
     boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
     boost::shared_ptr<std::vector<saidx_t> > SA,
     const std::string & pattern) throw(ConcordiaException) {
-    boost::shared_ptr<TokenizedSentence> hashedPattern =
-        hashGenerator->generateHash(pattern);
+    TokenizedSentence hashedPattern = hashGenerator->generateHash(pattern);
     boost::shared_ptr<ConcordiaSearchResult> result =
         boost::shared_ptr<ConcordiaSearchResult>(
             new ConcordiaSearchResult(hashedPattern));

     _concordiaSearcher->concordiaSearch(result, T, markers,
-                                        SA, hashedPattern->getCodes());
+                                        SA, hashedPattern.getCodes());
     return result;
 }
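The search path is unchanged for callers; only the internals now pass TokenizedSentence by value. For orientation, a typical call (mirroring the tests below) still looks like:

    boost::shared_ptr<ConcordiaSearchResult> result =
        concordia.concordiaSearch("Our new test product has nothing to do with computers");

    // Best overlay of matched fragments over the pattern, with its score.
    size_t fragmentCount = result->getFragments().size();
    double score = result->getBestOverlayScore();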
@@ -36,9 +36,9 @@ RegexRule::RegexRule(std::string patternString,
 RegexRule::~RegexRule() {
 }

-void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
+void RegexRule::apply(TokenizedSentence & sentence) {
     try {
-        UnicodeString s(sentence->getSentence().c_str());
+        UnicodeString s(sentence.getSentence().c_str());
         boost::u32regex_iterator<const UChar*> begin(
             boost::make_u32regex_iterator(s, _pattern));
         boost::u32regex_iterator<const UChar*> end;
@@ -58,12 +58,12 @@ void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
                                  _annotationType, value);
             annotations.push_back(annotation);
         }
-        sentence->addAnnotations(annotations);
+        sentence.addAnnotations(annotations);
     } catch(const std::exception & e) {
         std::stringstream ss;
         ss << "Exception while applying regex rule: "
            << _annotationType << " to text: "
-           << sentence->getSentence();
+           << sentence.getSentence();
         ss << ", message: " << e.what();
         throw ConcordiaException(ss.str());
     }
@@ -42,7 +42,7 @@ public:
     /*! Applies regex annotation on tokenized sentence.
       \param sentence the input sentence
     */
-    void apply(boost::shared_ptr<TokenizedSentence> sentence);
+    void apply(TokenizedSentence & sentence);

 private:
     int _annotationType;
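With apply() taking a mutable reference, rules now annotate a stack-allocated sentence directly (a sketch following the updated regex_rule tests below):

    RegexRule rr("a", TokenAnnotation::WORD, "b");
    TokenizedSentence ts("xxxxxxxaxxxaxxaxaxa");
    rr.apply(ts);  // annotations land in ts itself, no shared_ptr involved
    std::list<TokenAnnotation> annotations = ts.getAnnotations();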
@@ -24,10 +24,8 @@ SentenceTokenizer::SentenceTokenizer(
 SentenceTokenizer::~SentenceTokenizer() {
 }

-boost::shared_ptr<TokenizedSentence>
-    SentenceTokenizer::tokenize(const std::string & sentence) {
-    boost::shared_ptr<TokenizedSentence>
-        result(new TokenizedSentence(sentence));
+TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence) {
+    TokenizedSentence result(sentence);

     _htmlTags->apply(result);

@@ -35,7 +33,7 @@ boost::shared_ptr<TokenizedSentence>
         neRule.apply(result);
     }

-    result->toLowerCase();
+    result.toLowerCase();

     if (_stopWordsEnabled) {
         _stopWords->apply(result);
@@ -36,8 +36,7 @@ public:
       \param sentence input sentence
       \returns tokenized sentence object build on the input sentence
     */
-    boost::shared_ptr<TokenizedSentence>
-        tokenize(const std::string & sentence);
+    TokenizedSentence tokenize(const std::string & sentence);

 private:
     void _createNeRules(std::string & namedEntitiesPath);
@@ -27,17 +27,17 @@ BOOST_AUTO_TEST_CASE( ConcordiaVersion )
 BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
 {
     Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
-    boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Ala posiada kota",14));
+    TokenizedSentence ts = concordia.addExample(Example("Ala posiada kota",14));
     /*
        0,3 type: 1 value: ala
        4,11 type: 1 value: posiada
        12,16 type: 1 value: kota
     */
-    BOOST_CHECK_EQUAL(ts->getTokens().size(), 3);
-    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 4);
-    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 11);
-    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1);
-    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada");
+    BOOST_CHECK_EQUAL(ts.getTokens().size(), 3);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 11);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada");

     concordia.addExample(Example("Ala posiada rysia",51));
     concordia.addExample(Example("Marysia posiada rysia",123));
@@ -293,24 +293,36 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
 BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
 {
     Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
+    /*
     concordia.addExample(Example("Alice has a cat", 56));
     concordia.addExample(Example("Alice has a dog", 23));
     concordia.addExample(Example("New test product has a mistake", 321));
-    boost::shared_ptr<TokenizedSentence> ts = concordia.tokenize("This is just testing and it has nothing to do with the above");
+    */
+    std::vector<std::string> sentences;
+    std::vector<SUFFIX_MARKER_TYPE> ids;
+    sentences.push_back("Alice has a cat");
+    ids.push_back(56);
+    sentences.push_back("Alice has a dog");
+    ids.push_back(23);
+    sentences.push_back("New test product has a mistake");
+    ids.push_back(321);
+    std::vector<TokenizedSentence> tokenizedSentences = concordia.tokenizeAll(sentences);
+    concordia.addAllTokenizedExamples(tokenizedSentences, ids);
+
+    TokenizedSentence ts = concordia.tokenize("This is just testing and it has nothing to do with the above");
     concordia.addTokenizedExample(ts, 14);

     concordia.refreshSAfromRAM();

     boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers");
     // best overlay:

-    /*
     BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
-    BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.695, 0.1);
-    BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getStart(), 0);
-    BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getEnd(), 2);
-    BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getStart(), 2);
-    BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 3);
-    */
+    BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.537, 0.1);
+    BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getStart(), 1);
+    BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getEnd(), 5);
+    BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getStart(), 5);
+    BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 9);

     BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 8);

@@ -338,7 +350,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
 BOOST_AUTO_TEST_CASE( Tokenize )
 {
     Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
-    boost::shared_ptr<TokenizedSentence> ts = concordia.tokenize(" Ala posiada kota");
+    TokenizedSentence ts = concordia.tokenize(" Ala posiada kota");
     /*
        0,3 type: 1 value: ala
        4,11 type: 1 value: posiada
@@ -347,10 +359,22 @@ BOOST_AUTO_TEST_CASE( Tokenize )

     concordia.clearIndex();

-    BOOST_CHECK_EQUAL(ts->getTokens().size(), 3);
-    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 9);
-    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 16);
-    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1);
-    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada");
+    BOOST_CHECK_EQUAL(ts.getTokens().size(), 3);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 9);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 16);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada");
+
+    std::vector<std::string> sentences;
+    sentences.push_back("Marysia, ma rysia;");
+    sentences.push_back("Testing complete;");
+    sentences.push_back("This, is (a) weird;! sentence <>");
+    std::vector<TokenizedSentence> tokenizedSentences = concordia.tokenizeAll(sentences);
+
+    BOOST_CHECK_EQUAL(tokenizedSentences.size(), 3);
+    BOOST_CHECK_EQUAL(tokenizedSentences.at(0).getTokens().size(), 3);
+    BOOST_CHECK_EQUAL(tokenizedSentences.at(1).getTokens().size(), 2);
+    BOOST_CHECK_EQUAL(tokenizedSentences.at(2).getTokens().size(), 5);
+
 }
 BOOST_AUTO_TEST_SUITE_END()
@@ -373,7 +373,7 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )

     // searching for pattern "Ola posiada rysia Marysia" (5 1 3 4)

-    std::vector<INDEX_CHARACTER_TYPE> pattern = hashGenerator->generateHash("Ola posiada rysia Marysia")->getCodes();
+    std::vector<INDEX_CHARACTER_TYPE> pattern = hashGenerator->generateHash("Ola posiada rysia Marysia").getCodes();

     boost::shared_ptr<TmMatchesMap> tmMatchesMap = searcher.getTmMatches(T, markers, SA, pattern);

@@ -23,7 +23,7 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )

     HashGenerator hashGenerator = HashGenerator(config);

-    std::vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota")->getCodes();
+    std::vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota").getCodes();
     std::vector<INDEX_CHARACTER_TYPE> expected;
     expected.push_back(0);
     expected.push_back(1);
@@ -76,7 +76,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )

     HashGenerator hashGenerator1 = HashGenerator(config);

-    std::vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota")->getCodes();
+    std::vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota").getCodes();
     std::vector<INDEX_CHARACTER_TYPE> expected1;
     expected1.push_back(0);
     expected1.push_back(1);
@@ -86,7 +86,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
     hashGenerator1.serializeWordMap();

     HashGenerator hashGenerator2 = HashGenerator(config);
-    std::vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa")->getCodes();
+    std::vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa").getCodes();
     std::vector<INDEX_CHARACTER_TYPE> expected2;
     expected2.push_back(0);
     expected2.push_back(1);
@@ -106,9 +106,9 @@ BOOST_AUTO_TEST_CASE( TokenVectorTest )

     HashGenerator hashGenerator = HashGenerator(config);

-    boost::shared_ptr<TokenizedSentence> tokenizedSentence = hashGenerator.generateHash("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że <b>kierowca</b> zaparkował samochód.");
+    TokenizedSentence tokenizedSentence = hashGenerator.generateHash("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że <b>kierowca</b> zaparkował samochód.");

-    std::vector<TokenAnnotation> tokens = tokenizedSentence->getTokens();
+    std::vector<TokenAnnotation> tokens = tokenizedSentence.getTokens();

     /*
     BOOST_FOREACH(TokenAnnotation annotation, tokens) {
@@ -13,10 +13,10 @@ BOOST_AUTO_TEST_SUITE(regex_rule)
 BOOST_AUTO_TEST_CASE( SimpleAnnotation )
 {
     RegexRule rr("a", TokenAnnotation::WORD, "b");
-    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("xxxxxxxaxxxaxxaxaxa"));
+    TokenizedSentence ts("xxxxxxxaxxxaxxaxaxa");
     rr.apply(ts);
-    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
-    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    BOOST_CHECK_EQUAL(ts.getAnnotations().size(),5);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
     std::list<TokenAnnotation>::iterator iter = annotations.begin();

     BOOST_CHECK_EQUAL(iter->getStart(),7);
@@ -56,10 +56,10 @@ BOOST_AUTO_TEST_CASE( BadRegex )
 BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation )
 {
     RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD, "");
-    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
+    TokenizedSentence ts("Don't stop believin' \\ Hold on to the feelin'.");
     rr.apply(ts);
-    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
-    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    BOOST_CHECK_EQUAL(ts.getAnnotations().size(),5);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
     std::list<TokenAnnotation>::iterator iter = annotations.begin();

     BOOST_CHECK_EQUAL(iter->getStart(),3);
@@ -86,10 +86,10 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation )
 BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation )
 {
     RegexRule rr("abc", TokenAnnotation::WORD, "xxx", false);
-    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("This is AbC and ABC and abc and aBC."));
+    TokenizedSentence ts("This is AbC and ABC and abc and aBC.");
     rr.apply(ts);
-    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),4);
-    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    BOOST_CHECK_EQUAL(ts.getAnnotations().size(),4);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
     std::list<TokenAnnotation>::iterator iter = annotations.begin();

     BOOST_CHECK_EQUAL(iter->getStart(),8);
@@ -111,10 +111,10 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation )
 BOOST_AUTO_TEST_CASE( UnicodeAnnotation )
 {
     RegexRule rr("ą", TokenAnnotation::WORD, "x");
-    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń"));
+    TokenizedSentence ts("zażółć gęślą jaźń");
     rr.apply(ts);
-    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),1);
-    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    BOOST_CHECK_EQUAL(ts.getAnnotations().size(),1);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
     std::list<TokenAnnotation>::iterator iter = annotations.begin();

     BOOST_CHECK_EQUAL(iter->getStart(),11);
@@ -124,10 +124,10 @@ BOOST_AUTO_TEST_CASE( UnicodeAnnotation )
 BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation )
 {
     RegexRule rr("ą", TokenAnnotation::WORD, "x", false);
-    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
+    TokenizedSentence ts("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ");
     rr.apply(ts);
-    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),2);
-    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    BOOST_CHECK_EQUAL(ts.getAnnotations().size(),2);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
     std::list<TokenAnnotation>::iterator iter = annotations.begin();

     BOOST_CHECK_EQUAL(iter->getStart(),11);
@@ -141,10 +141,10 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation )
 BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
 {
     RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD, "x", false);
-    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
+    TokenizedSentence ts("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ");
     rr.apply(ts);
-    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),18);
-    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    BOOST_CHECK_EQUAL(ts.getAnnotations().size(),18);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
     std::list<TokenAnnotation>::iterator iter = annotations.begin();

     BOOST_CHECK_EQUAL(iter->getStart(),2);
@@ -20,8 +20,8 @@ BOOST_AUTO_TEST_CASE( NETest )


     std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
-    boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
-    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    TokenizedSentence ts = tokenizer.tokenize(sentence);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
     std::list<TokenAnnotation>::iterator iter = annotations.begin();

     BOOST_CHECK_EQUAL(14,annotations.size());
@@ -134,8 +134,8 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
     SentenceTokenizer tokenizer(config);

     std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
-    boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
-    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    TokenizedSentence ts = tokenizer.tokenize(sentence);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
     std::list<TokenAnnotation>::iterator iter = annotations.begin();

     /*
@@ -214,8 +214,8 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
     SentenceTokenizer tokenizer(config);

     std::string sentence = "This is a sentence, don't over-analyze it. zażółć' gęś'lą -jaźń ZAŻ-ÓŁĆ GĘŚLĄ JAŹ'Ń";
-    boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
-    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    TokenizedSentence ts = tokenizer.tokenize(sentence);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
     std::list<TokenAnnotation>::iterator iter = annotations.begin();

     /*
@@ -322,7 +322,7 @@ BOOST_AUTO_TEST_CASE( StopWordsTest )
     if (config->isStopWordsEnabled()) {
         SentenceTokenizer tokenizer(config);
         std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
-        BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence()," wiem konieczne");
+        BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence).getSentence()," wiem konieczne");
     }
 }

@@ -332,8 +332,8 @@ BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
     SentenceTokenizer tokenizer(config);

     std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
-    boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
-    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    TokenizedSentence ts = tokenizer.tokenize(sentence);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
     std::list<TokenAnnotation>::iterator iter = annotations.begin();

     BOOST_CHECK_EQUAL(161, annotations.size());