From 724bf0d0802ec1ac7f617677f26cf132157e0870 Mon Sep 17 00:00:00 2001
From: rjawor
Date: Fri, 26 Jun 2015 15:38:24 +0200
Subject: [PATCH] new responsibilities of tokenized sentence

---
 concordia/CMakeLists.txt            |  2 --
 concordia/hash_generator.cpp        | 27 +++++++------
 concordia/hash_generator.hpp        |  2 +-
 concordia/hashed_sentence.cpp       |  7 ----
 concordia/hashed_sentence.hpp       | 61 -----------------------------
 concordia/t/test_hash_generator.cpp |  2 +-
 concordia/tokenized_sentence.cpp    | 12 ++++++
 concordia/tokenized_sentence.hpp    | 17 ++++++++
 8 files changed, 45 insertions(+), 85 deletions(-)
 delete mode 100644 concordia/hashed_sentence.cpp
 delete mode 100644 concordia/hashed_sentence.hpp

diff --git a/concordia/CMakeLists.txt b/concordia/CMakeLists.txt
index 43a33b5..6f6e246 100644
--- a/concordia/CMakeLists.txt
+++ b/concordia/CMakeLists.txt
@@ -8,7 +8,6 @@ endforeach(dir)
 add_library(concordia SHARED
   token_annotation.cpp
   tokenized_sentence.cpp
-  hashed_sentence.cpp
   concordia_search_result.cpp
   matched_pattern_fragment.cpp
   concordia_searcher.cpp
@@ -38,7 +37,6 @@ install(TARGETS concordia DESTINATION lib/)
 install(FILES
   token_annotation.hpp
   tokenized_sentence.hpp
-  hashed_sentence.hpp
   concordia_search_result.hpp
   matched_pattern_fragment.hpp
   concordia_searcher.hpp
diff --git a/concordia/hash_generator.cpp b/concordia/hash_generator.cpp
index 05e9afe..896f24e 100644
--- a/concordia/hash_generator.cpp
+++ b/concordia/hash_generator.cpp
@@ -1,10 +1,12 @@
 #include "concordia/hash_generator.hpp"
 #include "concordia/common/utils.hpp"
+#include "concordia/token_annotation.hpp"
 
 #include
 #include
 #include
 #include
+#include <boost/foreach.hpp>
 #include
 
@@ -28,28 +30,27 @@ HashGenerator::~HashGenerator() {
 std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
     const std::string & sentence) throw(ConcordiaException) {
     std::vector<INDEX_CHARACTER_TYPE> result;
-    std::vector<std::string> tokenTexts = generateTokenVector(sentence);
-    if (tokenTexts.size() > Utils::maxSentenceSize) {
+
+    boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
+    ts->generateHash(_wordMap);
+
+    if (ts->getTokens().size() > Utils::maxSentenceSize) {
         throw ConcordiaException("Trying to add too long sentence.");
     }
-    for (std::vector<std::string>::iterator it = tokenTexts.begin();
-            it != tokenTexts.end(); ++it) {
-        std::string token = *it;
-        INDEX_CHARACTER_TYPE code = _wordMap->getWordCode(token);
-        result.push_back(code);
-    }
-    return result;
+    return ts->getCodes();
 }
 
 std::vector<std::string> HashGenerator::generateTokenVector(
     const std::string & sentence) {
     boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
-    std::string tokenizedSentence = ts->getSentence();
-    boost::trim(tokenizedSentence);
     std::vector<std::string> tokenTexts;
-    boost::split(tokenTexts, tokenizedSentence, boost::is_any_of(" \t\r\n"),
-        boost::algorithm::token_compress_on);
+    BOOST_FOREACH(TokenAnnotation annotation, ts->getAnnotations()) {
+        if (annotation.getType() == TokenAnnotation::WORD ||
+            annotation.getType() == TokenAnnotation::NE) {
+            tokenTexts.push_back(annotation.getValue());
+        }
+    }
     return tokenTexts;
 }
diff --git a/concordia/hash_generator.hpp b/concordia/hash_generator.hpp
index f9a4562..cd4c1d2 100644
--- a/concordia/hash_generator.hpp
+++ b/concordia/hash_generator.hpp
@@ -51,7 +51,7 @@ public:
     Generates vector of tokens from a sentence. This method is internally
     used by generateHash. However, for the sake of concordiaSearch
     (see \ref tutorial1_3), the vector of tokens resulting from sentence
-    anonymizing and tokenization is also needed.
+    tokenization is also needed.
     \param sentence sentence to tokenize
     \returns vector of tokens
     */
diff --git a/concordia/hashed_sentence.cpp b/concordia/hashed_sentence.cpp
deleted file mode 100644
index 93c1147..0000000
--- a/concordia/hashed_sentence.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "concordia/hashed_sentence.hpp"
-
-HashedSentence::HashedSentence() {
-}
-
-HashedSentence::~HashedSentence() {
-}
diff --git a/concordia/hashed_sentence.hpp b/concordia/hashed_sentence.hpp
deleted file mode 100644
index 85e234a..0000000
--- a/concordia/hashed_sentence.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-#ifndef HASHED_SENTENCE_HDR
-#define HASHED_SENTENCE_HDR
-
-#include "concordia/common/config.hpp"
-#include "concordia/interval.hpp"
-#include
-#include
-
-/*!
-  A sentence after hashing by the HashGenerator. The class holds
-  the list of word codes and intervals representing original
-  word positions in the sentence (char-based).
-*/
-
-class HashedSentence {
-public:
-    /*!
-      Constructor.
-
-    */
-    HashedSentence();
-
-    /*! Destructor.
-    */
-    virtual ~HashedSentence();
-
-    /*! Getter for original word positions list.
-    \returns original word positions list
-    */
-    std::vector<Interval> getOriginalWordPositions() const {
-        return _originalWordPositions;
-    }
-
-    /*! Getter for word codes list.
-    \returns word codes list
-    */
-    std::vector<INDEX_CHARACTER_TYPE> getWordCodes() const {
-        return _wordCodes;
-    }
-
-    /*! Method for adding a word code to the list
-    \param word code to be added
-    */
-    void addWordCode(INDEX_CHARACTER_TYPE wordCode) {
-        _wordCodes.push_back(wordCode);
-    }
-
-    /*! Method for adding an original word position to the list.
-    \param original word position
-    */
-    void addOriginalWordPosition(Interval & originalWordPosition) {
-        _originalWordPositions.push_back(originalWordPosition);
-    }
-
-private:
-    std::vector<Interval> _originalWordPositions;
-
-    std::vector<INDEX_CHARACTER_TYPE> _wordCodes;
-};
-
-#endif
diff --git a/concordia/t/test_hash_generator.cpp b/concordia/t/test_hash_generator.cpp
index 8fdef81..d71f112 100644
--- a/concordia/t/test_hash_generator.cpp
+++ b/concordia/t/test_hash_generator.cpp
@@ -29,7 +29,7 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
     BOOST_CHECK_EQUAL_COLLECTIONS(hash.begin(), hash.end(), expected.begin(), expected.end());
 }
-/* Commentet out - the test takes too long. Run it once whenever the SUFFIX_MARKER_SENTENCE_BYTES parameter changes.
+/* Commented out - the test takes too long. Run it once whenever the SUFFIX_MARKER_SENTENCE_BYTES parameter changes.
 Or don't run it at all, whatever! I don't care!
 There is still the test for max sentence size in test_utils.cpp
 BOOST_AUTO_TEST_CASE( TooLongHashTest )
 {
diff --git a/concordia/tokenized_sentence.cpp b/concordia/tokenized_sentence.cpp
index e828c54..6302567 100644
--- a/concordia/tokenized_sentence.cpp
+++ b/concordia/tokenized_sentence.cpp
@@ -47,3 +47,15 @@ void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations)
 void TokenizedSentence::toLowerCase() {
     _sentence = TextUtils::getInstance().toLowerCase(_sentence);
 }
+
+void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
+    BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
+        if (annotation.getType() == TokenAnnotation::WORD ||
+            annotation.getType() == TokenAnnotation::NE) {
+            _codes.push_back(wordMap->getWordCode(annotation.getValue()));
+            _tokens.push_back(annotation);
+        }
+    }
+
+}
+
diff --git a/concordia/tokenized_sentence.hpp b/concordia/tokenized_sentence.hpp
index b1aa77e..a0ff96b 100644
--- a/concordia/tokenized_sentence.hpp
+++ b/concordia/tokenized_sentence.hpp
@@ -3,6 +3,9 @@
 #include "concordia/common/config.hpp"
 #include "concordia/token_annotation.hpp"
+#include "concordia/word_map.hpp"
+
+#include <boost/shared_ptr.hpp>
 
 #include <string>
 #include <vector>
 #include <list>
@@ -39,6 +42,16 @@ public:
         return _tokenAnnotations;
     }
 
+    std::vector<INDEX_CHARACTER_TYPE> getCodes() const {
+        return _codes;
+    }
+
+    std::vector<TokenAnnotation> getTokens() const {
+        return _tokens;
+    }
+
+    void generateHash(boost::shared_ptr<WordMap> wordMap);
+
     /*! Transform the sentence to lower case.
     */
@@ -59,6 +72,10 @@ private:
     std::string _sentence;
 
     std::list<TokenAnnotation> _tokenAnnotations;
+
+    std::vector<INDEX_CHARACTER_TYPE> _codes;
+
+    std::vector<TokenAnnotation> _tokens;
 };
 
 #endif