adding all tokenized examples

This commit is contained in:
rjawor 2015-08-19 20:49:26 +02:00
parent a765443a01
commit 68fecaddf8
20 changed files with 220 additions and 119 deletions

View File

@ -1,4 +1,5 @@
---------------------------- Developer's private notes (language may vary, because it is sometimes more convenient) -----------------------------
DONE - change the arguments of addExample* functions to const reference to TokenizedSentence (not boost::shared_ptr<TokenizedSentence>)
- multiple indexes based on different hashes. One index can use WordNet base forms, another POS tags, and so on. Develop a method of combining their results.
IN PROGRESS - document the code (classes, cfg files) and update tutorial
- multiple translation memories: they can be stored in a single index, but a tm_id has to be added to the sentence metadata (e.g. in place of the example length). At search time, results must be filtered so that they come from the right translation memory (see the sketch below).
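
A rough sketch of that tm_id filtering idea, in the library's C++03 style; SearchHit, tmId and filterByTm are hypothetical names, nothing here exists in the current API:

#include <vector>

// Hypothetical search hit carrying the translation memory id as metadata.
struct SearchHit {
    unsigned long tmId;       // which translation memory the sentence belongs to
    unsigned long exampleId;  // id of the matched example
};

// Keep only the hits that come from the requested translation memory.
std::vector<SearchHit> filterByTm(const std::vector<SearchHit> & hits,
                                  unsigned long wantedTmId) {
    std::vector<SearchHit> result;
    for (std::vector<SearchHit>::const_iterator it = hits.begin();
         it != hits.end(); ++it) {
        if (it->tmId == wantedTmId) {
            result.push_back(*it);
        }
    }
    return result;
}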

View File

@ -29,7 +29,7 @@ void checkConcordiaResults(
long lineIndex = 1;
BOOST_FOREACH(ConcordiaSearchResult result, results) {
SUFFIX_MARKER_TYPE patternSize =
result.getTokenizedPattern()->getTokens().size();
result.getTokenizedPattern().getTokens().size();
if (patternSize > 0) {
if (result.getBestOverlay().size() != 1) {
reportError(baseLineCount + lineIndex,
@ -203,7 +203,7 @@ int main(int argc, char** argv) {
std::cout << "\tPattern used: " << std::endl << "\t\t";
BOOST_FOREACH(TokenAnnotation annotation,
result->getTokenizedPattern()->getTokens()) {
result->getTokenizedPattern().getTokens()) {
std::cout << annotation.getValue() << " ";
}
std::cout << std::endl;

View File

@ -1,4 +1,5 @@
#include <sstream>
#include <boost/foreach.hpp>
#include "concordia/concordia.hpp"
#include "concordia/common/config.hpp"
@ -42,19 +43,31 @@ std::string _createLibraryVersion() {
return version.str();
}
boost::shared_ptr<TokenizedSentence>
TokenizedSentence
Concordia::tokenize(const std::string & sentence)
throw(ConcordiaException) {
boost::shared_ptr<TokenizedSentence> result =
TokenizedSentence result =
_hashGenerator->generateHash(sentence);
_hashGenerator->serializeWordMap();
return result;
}
std::vector<TokenizedSentence> Concordia::tokenizeAll(
const std::vector<std::string> & sentences)
throw(ConcordiaException) {
std::vector<TokenizedSentence> result;
BOOST_FOREACH(std::string sentence, sentences) {
result.push_back(_hashGenerator->generateHash(sentence));
}
_hashGenerator->serializeWordMap();
return result;
}
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
boost::shared_ptr<TokenizedSentence> Concordia::addExample(
TokenizedSentence Concordia::addExample(
const Example & example)
throw(ConcordiaException) {
return _index->addExample(_hashGenerator, _T, _markers, example);
@ -63,13 +76,21 @@ boost::shared_ptr<TokenizedSentence> Concordia::addExample(
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
void Concordia::addTokenizedExample(
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id)
const TokenizedSentence & tokenizedSentence,
const SUFFIX_MARKER_TYPE id)
throw(ConcordiaException) {
_index->addTokenizedExample(_hashGenerator, _T,
_markers, tokenizedSentence, id);
}
void Concordia::addAllTokenizedExamples(
const std::vector<TokenizedSentence> & tokenizedSentences,
const std::vector<SUFFIX_MARKER_TYPE> & ids)
throw(ConcordiaException) {
_index->addAllTokenizedExamples(_hashGenerator, _T,
_markers, tokenizedSentences, ids);
}
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
@ -188,8 +209,7 @@ boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
} else {
std::string empty;
return boost::shared_ptr<ConcordiaSearchResult>(
new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>(
new TokenizedSentence(empty))));
new ConcordiaSearchResult(TokenizedSentence(empty)));
}
}
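
For orientation, a minimal usage sketch of the new batch API added above, mirroring the updated unit tests further down; the plain config path is an assumption (the tests resolve it via TestResourcesManager), and error handling is omitted:

#include <string>
#include <vector>
#include "concordia/concordia.hpp"

int main() {
    Concordia concordia("concordia.cfg");  // assumed config file path

    std::vector<std::string> sentences;
    std::vector<SUFFIX_MARKER_TYPE> ids;
    sentences.push_back("Alice has a cat");
    ids.push_back(56);
    sentences.push_back("Alice has a dog");
    ids.push_back(23);

    // Tokenize the whole batch once, then add it to the index in one call.
    std::vector<TokenizedSentence> tokenized = concordia.tokenizeAll(sentences);
    concordia.addAllTokenizedExamples(tokenized, ids);

    // The suffix array is only rebuilt on demand.
    concordia.refreshSAfromRAM();
    return 0;
}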

View File

@ -58,7 +58,16 @@ public:
containing information about original word positions
\throws ConcordiaException
*/
boost::shared_ptr<TokenizedSentence> tokenize(const std::string & sentence)
TokenizedSentence tokenize(const std::string & sentence)
throw(ConcordiaException);
/*! Tokenizes all the given sentences.
\param sentences vector of sentences to be tokenized
\returns vector of tokenized sentence objects
\throws ConcordiaException
*/
std::vector<TokenizedSentence> tokenizeAll(
const std::vector<std::string> & sentences)
throw(ConcordiaException);
/*! Adds an Example to the index.
@ -67,17 +76,27 @@ public:
containing information about original word positions
\throws ConcordiaException
*/
boost::shared_ptr<TokenizedSentence> addExample(const Example & example)
throw(ConcordiaException);
TokenizedSentence addExample(const Example & example)
throw(ConcordiaException);
/*! Adds a tokenized example to the index.
\param tokenizedSentence tokenized sentence to be added
\param id of the sentence to be added
\param id id of the sentence to be added
\throws ConcordiaException
*/
void addTokenizedExample(
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id)
const TokenizedSentence & tokenizedSentence,
const SUFFIX_MARKER_TYPE id)
throw(ConcordiaException);
/*! Adds multiple tokenized examples to the index.
\param tokenizedSentences vector of tokenized sentences to be added
\param ids vector of ids of the sentences to be added
\throws ConcordiaException
*/
void addAllTokenizedExamples(
const std::vector<TokenizedSentence> & tokenizedSentences,
const std::vector<SUFFIX_MARKER_TYPE> & ids)
throw(ConcordiaException);
/*! Adds multiple examples to the index.

View File

@ -4,6 +4,8 @@
#include "concordia/common/config.hpp"
#include <boost/filesystem.hpp>
#include <boost/foreach.hpp>
#include <boost/make_shared.hpp>
#include <iostream>
#include <climits>
@ -48,10 +50,10 @@ std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
std::vector<TokenizedSentence> hashedPatterns;
BOOST_FOREACH(Example example, examples) {
boost::shared_ptr<TokenizedSentence> hashedPattern =
TokenizedSentence hashedPattern =
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example);
hashedPatterns.push_back(*hashedPattern);
hashedPatterns.push_back(hashedPattern);
}
hashedIndexFile.close();
@ -61,7 +63,7 @@ std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
return hashedPatterns;
}
boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
TokenizedSentence ConcordiaIndex::addExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -72,7 +74,7 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
std::ofstream markersFile;
markersFile.open(_markersFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
boost::shared_ptr<TokenizedSentence> hashedPattern =
TokenizedSentence hashedPattern =
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example);
hashedIndexFile.close();
@ -86,8 +88,8 @@ void ConcordiaIndex::addTokenizedExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id) {
const TokenizedSentence & tokenizedSentence,
const SUFFIX_MARKER_TYPE id) {
std::ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
@ -100,15 +102,38 @@ void ConcordiaIndex::addTokenizedExample(
markersFile.close();
}
void ConcordiaIndex::addAllTokenizedExamples(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const std::vector<TokenizedSentence> & tokenizedSentences,
const std::vector<SUFFIX_MARKER_TYPE> & ids) {
std::ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
std::ofstream markersFile;
markersFile.open(_markersFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
int index = 0;
BOOST_FOREACH(TokenizedSentence tokenizedSentence, tokenizedSentences) {
_addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, tokenizedSentence, ids.at(index));
index++;
}
hashedIndexFile.close();
markersFile.close();
}
void ConcordiaIndex::_addSingleTokenizedExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id) {
std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence->getCodes();
const TokenizedSentence & tokenizedSentence,
const SUFFIX_MARKER_TYPE id) {
std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence.getCodes();
int offset = 0;
for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
@ -139,14 +164,14 @@ void ConcordiaIndex::_addSingleTokenizedExample(
markers->push_back(sentenceBoundaryMA);
}
boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
TokenizedSentence ConcordiaIndex::_addSingleExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example) {
boost::shared_ptr<TokenizedSentence> hashedPattern =
TokenizedSentence hashedPattern =
hashGenerator->generateHash(example.getSentence());
_addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, hashedPattern, example.getId());
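
A note on the design visible in this file: addAllTokenizedExamples opens the hashed index and markers files once and appends the whole batch before closing, while the per-example methods reopen both files on every call. A minimal sketch of that pattern, under assumed names (appendBatch and the int payload are illustrative, not library code):

#include <cstddef>
#include <fstream>
#include <string>
#include <vector>

// Open once, append N times, close once: the file-open cost is paid
// per batch instead of per sentence.
void appendBatch(const std::string & path, const std::vector<int> & items) {
    std::ofstream out;
    out.open(path.c_str(), std::ios::out | std::ios::app | std::ios::binary);
    for (std::size_t i = 0; i < items.size(); ++i) {
        out.write(reinterpret_cast<const char*>(&items[i]), sizeof(int));
    }
    out.close();
}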

View File

@ -53,7 +53,7 @@ public:
\returns tokenized example
\throws ConcordiaException
*/
boost::shared_ptr<TokenizedSentence> addExample(
TokenizedSentence addExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -63,7 +63,6 @@ public:
and markers array are appended with the example.
At the same time, HDD versions of these
two data structures are also appended with the same example.
The method returns a tokenized version of the example.
\param hashGenerator hash generator to be used to prepare the hash
of the example
\param T RAM-based hash index to be appended to
@ -77,8 +76,28 @@ public:
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id);
const TokenizedSentence & tokenizedSentence,
const SUFFIX_MARKER_TYPE id);
/*! Adds multiple tokenized examples to the index. Hashed index
and markers array are appended with the examples.
At the same time, HDD versions of these
two data structures are also appended with the same examples.
\param hashGenerator hash generator to be used to prepare the hashes
of the examples
\param T RAM-based hash index to be appended to
\param markers RAM-based markers array to be appended to
\param tokenizedSentences vector of tokenized sentences to be added
\param ids vector of ids of the sentences to be added
\throws ConcordiaException
*/
void addAllTokenizedExamples(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const std::vector<TokenizedSentence> & tokenizedSentences,
const std::vector<SUFFIX_MARKER_TYPE> & ids);
/*! Adds multiple examples to the index. Examples are first hashed using
the hash generator passed to this method. Then, hashed index
@ -114,10 +133,10 @@ private:
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id);
const TokenizedSentence & tokenizedSentence,
const SUFFIX_MARKER_TYPE id);
boost::shared_ptr<TokenizedSentence> _addSingleExample(
TokenizedSentence _addSingleExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,

View File

@ -4,9 +4,9 @@
#include <algorithm>
ConcordiaSearchResult::ConcordiaSearchResult(
boost::shared_ptr<TokenizedSentence> tokenizedPattern):
_tokenizedPattern(tokenizedPattern),
_bestOverlayScore(0) {
TokenizedSentence tokenizedPattern):
_tokenizedPattern(tokenizedPattern),
_bestOverlayScore(0) {
}
ConcordiaSearchResult::~ConcordiaSearchResult() {
@ -27,7 +27,7 @@ void ConcordiaSearchResult::computeBestOverlay() {
// the fragments are already sorted by their ends, ascending
_checkPossibleOverlays(std::vector<MatchedPatternFragment>(),
-1,
_tokenizedPattern->getTokens().size());
_tokenizedPattern.getTokens().size());
}
void ConcordiaSearchResult::_checkPossibleOverlays(

View File

@ -26,8 +26,7 @@ public:
/*! Constructor.
\param tokenizedPattern tokenized pattern which was used for searching
*/
explicit ConcordiaSearchResult(
boost::shared_ptr<TokenizedSentence> tokenizedPattern);
explicit ConcordiaSearchResult(TokenizedSentence tokenizedPattern);
/*! Destructor.
*/
@ -51,7 +50,7 @@ public:
/*! Getter for tokenized pattern.
\returns tokenized search pattern
*/
boost::shared_ptr<TokenizedSentence> getTokenizedPattern() const {
TokenizedSentence getTokenizedPattern() const {
return _tokenizedPattern;
}
@ -82,7 +81,7 @@ private:
SUFFIX_MARKER_TYPE lastAddedPos,
SUFFIX_MARKER_TYPE patternSize);
boost::shared_ptr<TokenizedSentence> _tokenizedPattern;
TokenizedSentence _tokenizedPattern;
std::vector<MatchedPatternFragment> _matchedPatternFragments;

View File

@ -27,13 +27,12 @@ HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
HashGenerator::~HashGenerator() {
}
boost::shared_ptr<TokenizedSentence> HashGenerator::generateHash(
TokenizedSentence HashGenerator::generateHash(
const std::string & sentence) throw(ConcordiaException) {
boost::shared_ptr<TokenizedSentence> ts =
_sentenceTokenizer->tokenize(sentence);
ts->generateHash(_wordMap);
TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence);
ts.generateHash(_wordMap);
if (ts->getTokens().size() > Utils::maxSentenceSize) {
if (ts.getTokens().size() > Utils::maxSentenceSize) {
throw ConcordiaException("Trying to add too long sentence.");
}

View File

@ -44,9 +44,8 @@ public:
\param sentence sentence to generate hash from
\returns tokenized sentence, containing the hash
*/
boost::shared_ptr<TokenizedSentence> generateHash(
const std::string & sentence)
throw(ConcordiaException);
TokenizedSentence generateHash(const std::string & sentence)
throw(ConcordiaException);
/*!
Saves the contents of current WordMap to HDD.

View File

@ -23,7 +23,7 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
int left;
std::vector<INDEX_CHARACTER_TYPE> hash =
hashGenerator->generateHash(pattern)->getCodes();
hashGenerator->generateHash(pattern).getCodes();
saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
@ -60,7 +60,7 @@ std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException) {
std::vector<INDEX_CHARACTER_TYPE> hash =
hashGenerator->generateHash(pattern)->getCodes();
hashGenerator->generateHash(pattern).getCodes();
return _concordiaSearcher->anubisSearch(config, T, markers, SA, hash);
}
@ -70,13 +70,12 @@ boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException) {
boost::shared_ptr<TokenizedSentence> hashedPattern =
hashGenerator->generateHash(pattern);
TokenizedSentence hashedPattern = hashGenerator->generateHash(pattern);
boost::shared_ptr<ConcordiaSearchResult> result =
boost::shared_ptr<ConcordiaSearchResult>(
new ConcordiaSearchResult(hashedPattern));
_concordiaSearcher->concordiaSearch(result, T, markers,
SA, hashedPattern->getCodes());
SA, hashedPattern.getCodes());
return result;
}

View File

@ -36,9 +36,9 @@ RegexRule::RegexRule(std::string patternString,
RegexRule::~RegexRule() {
}
void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
void RegexRule::apply(TokenizedSentence & sentence) {
try {
UnicodeString s(sentence->getSentence().c_str());
UnicodeString s(sentence.getSentence().c_str());
boost::u32regex_iterator<const UChar*> begin(
boost::make_u32regex_iterator(s, _pattern));
boost::u32regex_iterator<const UChar*> end;
@ -58,12 +58,12 @@ void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
_annotationType, value);
annotations.push_back(annotation);
}
sentence->addAnnotations(annotations);
sentence.addAnnotations(annotations);
} catch(const std::exception & e) {
std::stringstream ss;
ss << "Exception while applying regex rule: "
<< _annotationType << " to text: "
<< sentence->getSentence();
<< sentence.getSentence();
ss << ", message: " << e.what();
throw ConcordiaException(ss.str());
}

View File

@ -42,7 +42,7 @@ public:
/*! Applies regex annotation on tokenized sentence.
\param sentence the input sentence
*/
void apply(boost::shared_ptr<TokenizedSentence> sentence);
void apply(TokenizedSentence & sentence);
private:
int _annotationType;

View File

@ -24,10 +24,8 @@ SentenceTokenizer::SentenceTokenizer(
SentenceTokenizer::~SentenceTokenizer() {
}
boost::shared_ptr<TokenizedSentence>
SentenceTokenizer::tokenize(const std::string & sentence) {
boost::shared_ptr<TokenizedSentence>
result(new TokenizedSentence(sentence));
TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence) {
TokenizedSentence result(sentence);
_htmlTags->apply(result);
@ -35,7 +33,7 @@ boost::shared_ptr<TokenizedSentence>
neRule.apply(result);
}
result->toLowerCase();
result.toLowerCase();
if (_stopWordsEnabled) {
_stopWords->apply(result);

View File

@ -36,8 +36,7 @@ public:
\param sentence input sentence
\returns tokenized sentence object built from the input sentence
*/
boost::shared_ptr<TokenizedSentence>
tokenize(const std::string & sentence);
TokenizedSentence tokenize(const std::string & sentence);
private:
void _createNeRules(std::string & namedEntitiesPath);

View File

@ -27,17 +27,17 @@ BOOST_AUTO_TEST_CASE( ConcordiaVersion )
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
{
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Ala posiada kota",14));
TokenizedSentence ts = concordia.addExample(Example("Ala posiada kota",14));
/*
0,3 type: 1 value: ala
4,11 type: 1 value: posiada
12,16 type: 1 value: kota
*/
BOOST_CHECK_EQUAL(ts->getTokens().size(), 3);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 4);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 11);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada");
BOOST_CHECK_EQUAL(ts.getTokens().size(), 3);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 11);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada");
concordia.addExample(Example("Ala posiada rysia",51));
concordia.addExample(Example("Marysia posiada rysia",123));
@ -293,24 +293,36 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
{
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
/*
concordia.addExample(Example("Alice has a cat", 56));
concordia.addExample(Example("Alice has a dog", 23));
concordia.addExample(Example("New test product has a mistake", 321));
boost::shared_ptr<TokenizedSentence> ts = concordia.tokenize("This is just testing and it has nothing to do with the above");
*/
std::vector<std::string> sentences;
std::vector<SUFFIX_MARKER_TYPE> ids;
sentences.push_back("Alice has a cat");
ids.push_back(56);
sentences.push_back("Alice has a dog");
ids.push_back(23);
sentences.push_back("New test product has a mistake");
ids.push_back(321);
std::vector<TokenizedSentence> tokenizedSentences = concordia.tokenizeAll(sentences);
concordia.addAllTokenizedExamples(tokenizedSentences, ids);
TokenizedSentence ts = concordia.tokenize("This is just testing and it has nothing to do with the above");
concordia.addTokenizedExample(ts, 14);
concordia.refreshSAfromRAM();
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers");
// best overlay:
/*
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.695, 0.1);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getStart(), 0);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getEnd(), 2);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getStart(), 2);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 3);
*/
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.537, 0.1);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getStart(), 1);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getEnd(), 5);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getStart(), 5);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 9);
BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 8);
@ -338,7 +350,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
BOOST_AUTO_TEST_CASE( Tokenize )
{
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
boost::shared_ptr<TokenizedSentence> ts = concordia.tokenize(" Ala posiada kota");
TokenizedSentence ts = concordia.tokenize(" Ala posiada kota");
/*
0,3 type: 1 value: ala
4,11 type: 1 value: posiada
@ -347,10 +359,22 @@ BOOST_AUTO_TEST_CASE( Tokenize )
concordia.clearIndex();
BOOST_CHECK_EQUAL(ts->getTokens().size(), 3);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 9);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 16);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada");
BOOST_CHECK_EQUAL(ts.getTokens().size(), 3);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 9);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 16);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada");
std::vector<std::string> sentences;
sentences.push_back("Marysia, ma rysia;");
sentences.push_back("Testing complete;");
sentences.push_back("This, is (a) weird;! sentence <>");
std::vector<TokenizedSentence> tokenizedSentences = concordia.tokenizeAll(sentences);
BOOST_CHECK_EQUAL(tokenizedSentences.size(), 3);
BOOST_CHECK_EQUAL(tokenizedSentences.at(0).getTokens().size(), 3);
BOOST_CHECK_EQUAL(tokenizedSentences.at(1).getTokens().size(), 2);
BOOST_CHECK_EQUAL(tokenizedSentences.at(2).getTokens().size(), 5);
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -373,7 +373,7 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )
// searching for pattern "Ola posiada rysia Marysia" (5 1 3 4)
std::vector<INDEX_CHARACTER_TYPE> pattern = hashGenerator->generateHash("Ola posiada rysia Marysia")->getCodes();
std::vector<INDEX_CHARACTER_TYPE> pattern = hashGenerator->generateHash("Ola posiada rysia Marysia").getCodes();
boost::shared_ptr<TmMatchesMap> tmMatchesMap = searcher.getTmMatches(T, markers, SA, pattern);

View File

@ -23,7 +23,7 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
HashGenerator hashGenerator = HashGenerator(config);
std::vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota")->getCodes();
std::vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota").getCodes();
std::vector<INDEX_CHARACTER_TYPE> expected;
expected.push_back(0);
expected.push_back(1);
@ -76,7 +76,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
HashGenerator hashGenerator1 = HashGenerator(config);
std::vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota")->getCodes();
std::vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota").getCodes();
std::vector<INDEX_CHARACTER_TYPE> expected1;
expected1.push_back(0);
expected1.push_back(1);
@ -86,7 +86,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
hashGenerator1.serializeWordMap();
HashGenerator hashGenerator2 = HashGenerator(config);
std::vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa")->getCodes();
std::vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa").getCodes();
std::vector<INDEX_CHARACTER_TYPE> expected2;
expected2.push_back(0);
expected2.push_back(1);
@ -106,9 +106,9 @@ BOOST_AUTO_TEST_CASE( TokenVectorTest )
HashGenerator hashGenerator = HashGenerator(config);
boost::shared_ptr<TokenizedSentence> tokenizedSentence = hashGenerator.generateHash("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że <b>kierowca</b> zaparkował samochód.");
TokenizedSentence tokenizedSentence = hashGenerator.generateHash("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że <b>kierowca</b> zaparkował samochód.");
std::vector<TokenAnnotation> tokens = tokenizedSentence->getTokens();
std::vector<TokenAnnotation> tokens = tokenizedSentence.getTokens();
/*
BOOST_FOREACH(TokenAnnotation annotation, tokens) {

View File

@ -13,10 +13,10 @@ BOOST_AUTO_TEST_SUITE(regex_rule)
BOOST_AUTO_TEST_CASE( SimpleAnnotation )
{
RegexRule rr("a", TokenAnnotation::WORD, "b");
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("xxxxxxxaxxxaxxaxaxa"));
TokenizedSentence ts("xxxxxxxaxxxaxxaxaxa");
rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
BOOST_CHECK_EQUAL(ts.getAnnotations().size(),5);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),7);
@ -56,10 +56,10 @@ BOOST_AUTO_TEST_CASE( BadRegex )
BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation )
{
RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD, "");
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
TokenizedSentence ts("Don't stop believin' \\ Hold on to the feelin'.");
rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
BOOST_CHECK_EQUAL(ts.getAnnotations().size(),5);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),3);
@ -86,10 +86,10 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation )
BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation )
{
RegexRule rr("abc", TokenAnnotation::WORD, "xxx", false);
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("This is AbC and ABC and abc and aBC."));
TokenizedSentence ts("This is AbC and ABC and abc and aBC.");
rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),4);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
BOOST_CHECK_EQUAL(ts.getAnnotations().size(),4);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),8);
@ -111,10 +111,10 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation )
BOOST_AUTO_TEST_CASE( UnicodeAnnotation )
{
RegexRule rr("ą", TokenAnnotation::WORD, "x");
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń"));
TokenizedSentence ts("zażółć gęślą jaźń");
rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),1);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
BOOST_CHECK_EQUAL(ts.getAnnotations().size(),1);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),11);
@ -124,10 +124,10 @@ BOOST_AUTO_TEST_CASE( UnicodeAnnotation )
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation )
{
RegexRule rr("ą", TokenAnnotation::WORD, "x", false);
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
TokenizedSentence ts("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ");
rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),2);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
BOOST_CHECK_EQUAL(ts.getAnnotations().size(),2);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),11);
@ -141,10 +141,10 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation )
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
{
RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD, "x", false);
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
TokenizedSentence ts("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ");
rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),18);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
BOOST_CHECK_EQUAL(ts.getAnnotations().size(),18);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),2);

View File

@ -20,8 +20,8 @@ BOOST_AUTO_TEST_CASE( NETest )
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(14,annotations.size());
@ -134,8 +134,8 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
SentenceTokenizer tokenizer(config);
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
/*
@ -214,8 +214,8 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
SentenceTokenizer tokenizer(config);
std::string sentence = "This is a sentence, don't over-analyze it. zażółć' gęś'lą -jaźń ZAŻ-ÓŁĆ GĘŚLĄ JAŹ'Ń";
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
/*
@ -322,7 +322,7 @@ BOOST_AUTO_TEST_CASE( StopWordsTest )
if (config->isStopWordsEnabled()) {
SentenceTokenizer tokenizer(config);
std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence()," wiem konieczne");
BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence).getSentence()," wiem konieczne");
}
}
@ -332,8 +332,8 @@ BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
SentenceTokenizer tokenizer(config);
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(161, annotations.size());