new responsibilities of tokenized sentence

2015-06-26 15:38:24 +02:00 · 2015-06-26 15:38:24 +02:00 · 724bf0d080
commit 724bf0d080
parent 9b1735516c
8 changed files with 45 additions and 85 deletions
--- a/concordia/CMakeLists.txt
+++ b/concordia/CMakeLists.txt
@ -8,7 +8,6 @@ endforeach(dir)
 add_library(concordia SHARED
  token_annotation.cpp
  tokenized_sentence.cpp
-  hashed_sentence.cpp
  concordia_search_result.cpp
  matched_pattern_fragment.cpp
  concordia_searcher.cpp
@ -38,7 +37,6 @@ install(TARGETS concordia DESTINATION lib/)
 install(FILES 
          token_annotation.hpp
          tokenized_sentence.hpp
-          hashed_sentence.hpp
          concordia_search_result.hpp
          matched_pattern_fragment.hpp
          concordia_searcher.hpp
--- a/concordia/hash_generator.cpp
+++ b/concordia/hash_generator.cpp
@ -1,10 +1,12 @@
 #include "concordia/hash_generator.hpp"
 #include "concordia/common/utils.hpp"
+#include "concordia/token_annotation.hpp"

 #include <boost/filesystem.hpp>
 #include <boost/archive/binary_oarchive.hpp>
 #include <boost/archive/binary_iarchive.hpp>
 #include <boost/algorithm/string.hpp>
+#include <boost/foreach.hpp>

 #include <fstream>

@ -28,28 +30,27 @@ HashGenerator::~HashGenerator() {
 std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
                     const std::string & sentence) throw(ConcordiaException) {
    std::vector<INDEX_CHARACTER_TYPE> result;
-    std::vector<std::string> tokenTexts = generateTokenVector(sentence);
-    if (tokenTexts.size() > Utils::maxSentenceSize) {
+    
+    // Tokenize the sentence and let the TokenizedSentence compute its own
+    // word codes against _wordMap.
+    boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
+    // NOTE(review): the codes are generated before the length check below,
+    // whereas the removed code checked the length first - confirm that
+    // calling getWordCode for a sentence that is then rejected is acceptable.
+    ts->generateHash(_wordMap);
+    
+    // getTokens() holds only the WORD and NE annotations kept by generateHash.
+    if (ts->getTokens().size() > Utils::maxSentenceSize) {
        throw ConcordiaException("Trying to add too long sentence.");
    }
-    for (std::vector<std::string>::iterator it = tokenTexts.begin();
-                                it != tokenTexts.end(); ++it) {
-        std::string token = *it;
-        INDEX_CHARACTER_TYPE code = _wordMap->getWordCode(token);
-        result.push_back(code);
-    }

-    return result;
+    // NOTE(review): the local 'result' declared above is no longer used.
+    return ts->getCodes();
 }

 std::vector<std::string> HashGenerator::generateTokenVector(
                                                const std::string & sentence) {
    boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
-    std::string tokenizedSentence = ts->getSentence();
-    boost::trim(tokenizedSentence);
    std::vector<std::string> tokenTexts;
-    boost::split(tokenTexts, tokenizedSentence, boost::is_any_of(" \t\r\n"),
-                 boost::algorithm::token_compress_on);
+    // Collect the values of WORD and NE annotations only; annotations of any
+    // other type are skipped. TokenizedSentence::generateHash applies the same
+    // filter, so the two should be kept in sync.
+    BOOST_FOREACH(TokenAnnotation annotation, ts->getAnnotations()) {
+        if (annotation.getType() == TokenAnnotation::WORD ||
+               annotation.getType() == TokenAnnotation::NE) {
+            tokenTexts.push_back(annotation.getValue());
+        } 
+    }
    return tokenTexts;
 }

--- a/concordia/hash_generator.hpp
+++ b/concordia/hash_generator.hpp
@ -51,7 +51,7 @@ public:
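-      Generates vector of tokens from a sentence. This method is internally
-      used by generateHash. However, for the sake of concordiaSearch
-      (see \ref tutorial1_3), the vector of tokens resulting from sentence
-      anonymizing and tokenization is also needed.
+      Generates vector of tokens from a sentence. generateHash does not
+      use this method; it exists for the sake of concordiaSearch
+      (see \ref tutorial1_3), which needs the vector of tokens resulting
+      from sentence tokenization.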
      \param sentence sentence to tokenize
      \returns vector of tokens
    */
--- a/concordia/hashed_sentence.cpp
+++ b/concordia/hashed_sentence.cpp
@ -1,7 +0,0 @@
-#include "concordia/hashed_sentence.hpp"
-
-HashedSentence::HashedSentence() {
-}
-
-HashedSentence::~HashedSentence() {
-}
--- a/concordia/hashed_sentence.hpp
+++ b/concordia/hashed_sentence.hpp
@ -1,61 +0,0 @@
-#ifndef HASHED_SENTENCE_HDR
-#define HASHED_SENTENCE_HDR
-
-#include "concordia/common/config.hpp"
-#include "concordia/interval.hpp"
-#include <vector>
-#include <string>
-
-/*!
-  A sentence after hashing by the HashGenerator. The class holds
-  the list of word codes and intervals representing original
-  word positions in the sentence (char-based).
-*/
-
-class HashedSentence {
-public:
-    /*!
-      Constructor.
-
-    */
-    HashedSentence();
-
-    /*! Destructor.
-    */
-    virtual ~HashedSentence();
-
-    /*! Getter for original word positions list.
-      \returns original word positions list
-    */
-    std::vector<Interval> getOriginalWordPositions() const {
-        return _originalWordPositions;
-    }
-
-    /*! Getter for word codes list.
-      \returns word codes list
-    */
-    std::vector<INDEX_CHARACTER_TYPE> getWordCodes() const {
-        return _wordCodes;
-    }
-    
-    /*! Method for adding a word code to the list
-      \param word code to be added
-    */
-    void addWordCode(INDEX_CHARACTER_TYPE wordCode) {
-        _wordCodes.push_back(wordCode);
-    }
-
-    /*! Method for adding an original word position to the list.
-      \param original word position
-    */
-    void addOriginalWordPosition(Interval & originalWordPosition) {
-        _originalWordPositions.push_back(originalWordPosition);
-    }
-
-private:
-    std::vector<Interval> _originalWordPositions;
-
-    std::vector<INDEX_CHARACTER_TYPE> _wordCodes;
-};
-
-#endif
--- a/concordia/t/test_hash_generator.cpp
+++ b/concordia/t/test_hash_generator.cpp
@ -29,7 +29,7 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
    BOOST_CHECK_EQUAL_COLLECTIONS(hash.begin(), hash.end(), expected.begin(), expected.end());
 }

-/* Commentet out - the test takes too long. Run it once whenever the SUFFIX_MARKER_SENTENCE_BYTES parameter changes.
+/* Commented out - the test takes too long. Run it once whenever the SUFFIX_MARKER_SENTENCE_BYTES parameter changes.
                   Or don't run it at all, whatever! I don't care! There is still the test for max sentence size in test_utils.cpp
 BOOST_AUTO_TEST_CASE( TooLongHashTest )
 {
--- a/concordia/tokenized_sentence.cpp
+++ b/concordia/tokenized_sentence.cpp
@ -47,3 +47,15 @@ void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations)
 void TokenizedSentence::toLowerCase() {
    _sentence = TextUtils::getInstance().toLowerCase(_sentence);
 }
+
+// Computes the word codes of this sentence. Every annotation of type WORD
+// or NE is looked up in wordMap: its code is appended to _codes and the
+// annotation itself to _tokens; annotations of other types are skipped.
+void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
+    // Start from a clean state so that calling this method more than once
+    // on the same object does not duplicate codes and tokens.
+    _codes.clear();
+    _tokens.clear();
+
+    BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
+        if (annotation.getType() == TokenAnnotation::WORD ||
+               annotation.getType() == TokenAnnotation::NE) {
+            _codes.push_back(wordMap->getWordCode(annotation.getValue()));
+            _tokens.push_back(annotation);
+        }
+    }
+}
+
--- a/concordia/tokenized_sentence.hpp
+++ b/concordia/tokenized_sentence.hpp
@ -3,6 +3,9 @@

 #include "concordia/common/config.hpp"
 #include "concordia/token_annotation.hpp"
+#include "concordia/word_map.hpp"
+
+#include <boost/shared_ptr.hpp>
 #include <string>
 #include <vector>
 #include <list>
@ -39,6 +42,16 @@ public:
        return _tokenAnnotations;
    }

+    /*! Getter for the word codes produced by generateHash.
+      \returns word codes list
+    */
+    std::vector<INDEX_CHARACTER_TYPE> getCodes() const {
+        return _codes;
+    }
+    
+    /*! Getter for the tokens, i.e. the annotations of type WORD or NE
+        collected by generateHash.
+      \returns tokens list
+    */
+    std::vector<TokenAnnotation> getTokens() const {
+        return _tokens;
+    }
+    
+    /*! Generates word codes for the sentence. For every annotation
+        of type WORD or NE, the code obtained from the word map is
+        stored in the codes list and the annotation in the tokens list.
+      \param wordMap word map used to obtain the code of each token
+    */
+    void generateHash(boost::shared_ptr<WordMap> wordMap);
+
    /*! 
        Transform the sentence to lower case.
    */
@ -59,6 +72,10 @@ private:
    std::string _sentence;

    std::list<TokenAnnotation> _tokenAnnotations;
+    
+    std::vector<INDEX_CHARACTER_TYPE> _codes;
+    
+    std::vector<TokenAnnotation> _tokens;
 };

 #endif