new responsibilities of tokenized sentence

rjawor 2015-06-26 15:38:24 +02:00
parent 9b1735516c
commit 724bf0d080
8 changed files with 45 additions and 85 deletions

View File

@@ -8,7 +8,6 @@ endforeach(dir)
 add_library(concordia SHARED
     token_annotation.cpp
     tokenized_sentence.cpp
-    hashed_sentence.cpp
     concordia_search_result.cpp
     matched_pattern_fragment.cpp
     concordia_searcher.cpp
@@ -38,7 +37,6 @@ install(TARGETS concordia DESTINATION lib/)
 install(FILES
     token_annotation.hpp
     tokenized_sentence.hpp
-    hashed_sentence.hpp
     concordia_search_result.hpp
     matched_pattern_fragment.hpp
     concordia_searcher.hpp

View File

@@ -1,10 +1,12 @@
 #include "concordia/hash_generator.hpp"
 #include "concordia/common/utils.hpp"
+#include "concordia/token_annotation.hpp"
 #include <boost/filesystem.hpp>
 #include <boost/archive/binary_oarchive.hpp>
 #include <boost/archive/binary_iarchive.hpp>
 #include <boost/algorithm/string.hpp>
+#include <boost/foreach.hpp>
 #include <fstream>
@@ -28,28 +30,27 @@ HashGenerator::~HashGenerator() {
 std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
     const std::string & sentence) throw(ConcordiaException) {
     std::vector<INDEX_CHARACTER_TYPE> result;
-    std::vector<std::string> tokenTexts = generateTokenVector(sentence);
-    if (tokenTexts.size() > Utils::maxSentenceSize) {
+    boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
+    ts->generateHash(_wordMap);
+    if (ts->getTokens().size() > Utils::maxSentenceSize) {
         throw ConcordiaException("Trying to add too long sentence.");
     }
-    for (std::vector<std::string>::iterator it = tokenTexts.begin();
-        it != tokenTexts.end(); ++it) {
-        std::string token = *it;
-        INDEX_CHARACTER_TYPE code = _wordMap->getWordCode(token);
-        result.push_back(code);
-    }
-    return result;
+    return ts->getCodes();
 }

 std::vector<std::string> HashGenerator::generateTokenVector(
     const std::string & sentence) {
     boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
-    std::string tokenizedSentence = ts->getSentence();
-    boost::trim(tokenizedSentence);
     std::vector<std::string> tokenTexts;
-    boost::split(tokenTexts, tokenizedSentence, boost::is_any_of(" \t\r\n"),
-        boost::algorithm::token_compress_on);
+    BOOST_FOREACH(TokenAnnotation annotation, ts->getAnnotations()) {
+        if (annotation.getType() == TokenAnnotation::WORD ||
+            annotation.getType() == TokenAnnotation::NE) {
+            tokenTexts.push_back(annotation.getValue());
+        }
+    }
     return tokenTexts;
 }
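Two observations on this hunk. generateHash is now a thin wrapper: tokenization and coding happen inside TokenizedSentence, which leaves the surviving `result` vector as dead code. generateTokenVector also changes behavior: the old whitespace split returned every token of the processed text, while the new loop returns only values annotated as WORD or NE. A minimal usage sketch (the example sentence and the hashGenerator instance are illustrative; only the two calls shown in this diff are assumed to exist):

    // Sketch only: HashGenerator construction is outside this commit.
    std::vector<INDEX_CHARACTER_TYPE> hash =
        hashGenerator->generateHash("Alice has a cat");
    std::vector<std::string> tokenTexts =
        hashGenerator->generateTokenVector("Alice has a cat");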

View File

@@ -51,7 +51,7 @@ public:
     Generates vector of tokens from a sentence. This method is internally
     used by generateHash. However, for the sake of concordiaSearch
     (see \ref tutorial1_3), the vector of tokens resulting from sentence
-    anonymizing and tokenization is also needed.
+    tokenization is also needed.
     \param sentence sentence to tokenize
     \returns vector of tokens
     */
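Since generateHash and generateTokenVector now apply the same WORD / NE filter to the same tokenizer output, their results are index-aligned, which is what the concordiaSearch use case described above relies on. A hedged illustration of that pairing (instance construction omitted, as elsewhere in this commit):

    // tokenVector[i] is the surface form of the i-th code in the sentence hash.
    std::vector<std::string> tokenVector =
        hashGenerator->generateTokenVector(sentence);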

View File

@@ -1,7 +0,0 @@
-#include "concordia/hashed_sentence.hpp"
-
-HashedSentence::HashedSentence() {
-}
-
-HashedSentence::~HashedSentence() {
-}

View File

@@ -1,61 +0,0 @@
-#ifndef HASHED_SENTENCE_HDR
-#define HASHED_SENTENCE_HDR
-
-#include "concordia/common/config.hpp"
-#include "concordia/interval.hpp"
-#include <vector>
-#include <string>
-
-/*!
-  A sentence after hashing by the HashGenerator. The class holds
-  the list of word codes and intervals representing original
-  word positions in the sentence (char-based).
-*/
-class HashedSentence {
-public:
-    /*!
-      Constructor.
-    */
-    HashedSentence();
-
-    /*! Destructor.
-    */
-    virtual ~HashedSentence();
-
-    /*! Getter for original word positions list.
-      \returns original word positions list
-    */
-    std::vector<Interval> getOriginalWordPositions() const {
-        return _originalWordPositions;
-    }
-
-    /*! Getter for word codes list.
-      \returns word codes list
-    */
-    std::vector<INDEX_CHARACTER_TYPE> getWordCodes() const {
-        return _wordCodes;
-    }
-
-    /*! Method for adding a word code to the list
-      \param word code to be added
-    */
-    void addWordCode(INDEX_CHARACTER_TYPE wordCode) {
-        _wordCodes.push_back(wordCode);
-    }
-
-    /*! Method for adding an original word position to the list.
-      \param original word position
-    */
-    void addOriginalWordPosition(Interval & originalWordPosition) {
-        _originalWordPositions.push_back(originalWordPosition);
-    }
-
-private:
-    std::vector<Interval> _originalWordPositions;
-
-    std::vector<INDEX_CHARACTER_TYPE> _wordCodes;
-};
-
-#endif
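This header was the last place word codes lived outside TokenizedSentence. Under the new API the HashedSentence workflow collapses to a hedged two-step (tokenizer and wordMap assumed constructed elsewhere); note that getOriginalWordPositions() has no direct one-call replacement in this commit, so any positional information has to be recovered from the stored TokenAnnotation objects, assuming they carry it:

    boost::shared_ptr<TokenizedSentence> ts = tokenizer->tokenize(sentence);
    ts->generateHash(wordMap);                                     // replaces addWordCode() bookkeeping
    std::vector<INDEX_CHARACTER_TYPE> wordCodes = ts->getCodes();  // was HashedSentence::getWordCodes()
    std::vector<TokenAnnotation> tokens = ts->getTokens();         // per-token information travels here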

View File

@@ -29,7 +29,7 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
     BOOST_CHECK_EQUAL_COLLECTIONS(hash.begin(), hash.end(), expected.begin(), expected.end());
 }

-/* Commentet out - the test takes too long. Run it once whenever the SUFFIX_MARKER_SENTENCE_BYTES parameter changes.
+/* Commented out - the test takes too long. Run it once whenever the SUFFIX_MARKER_SENTENCE_BYTES parameter changes.
    Or don't run it at all, whatever! I don't care! There is still the test for max sentence size in test_utils.cpp
 BOOST_AUTO_TEST_CASE( TooLongHashTest )
 {

View File

@@ -47,3 +47,15 @@ void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations)
 void TokenizedSentence::toLowerCase() {
     _sentence = TextUtils::getInstance().toLowerCase(_sentence);
 }
+
+void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
+    BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
+        if (annotation.getType() == TokenAnnotation::WORD ||
+            annotation.getType() == TokenAnnotation::NE) {
+            _codes.push_back(wordMap->getWordCode(annotation.getValue()));
+            _tokens.push_back(annotation);
+        }
+    }
+}
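The new method walks the annotation list once and pushes onto _codes and _tokens in the same iteration, so the two vectors stay index-aligned: _codes[i] is the code of _tokens[i]. A self-contained analogue of that filtering pass (plain C++ with stand-ins for TokenAnnotation and WordMap, which are assumptions, not the Concordia types) illustrates the invariant:

    #include <boost/foreach.hpp>
    #include <cassert>
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // Stand-ins for Concordia's TokenAnnotation and WordMap, kept only as
    // detailed as this commit's diff requires.
    enum TokenType { WORD, NE, OTHER };
    struct Annotation {
        TokenType type;
        std::string value;
        Annotation(TokenType t, const std::string & v) : type(t), value(v) {}
    };

    int main() {
        std::vector<Annotation> annotations;
        annotations.push_back(Annotation(WORD, "alice"));
        annotations.push_back(Annotation(OTHER, ","));
        annotations.push_back(Annotation(NE, "wonderland"));
        annotations.push_back(Annotation(WORD, "cat"));

        std::map<std::string, int> wordMap;  // WordMap::getWordCode stand-in
        std::vector<int> codes;
        std::vector<Annotation> tokens;

        BOOST_FOREACH(Annotation annotation, annotations) {
            if (annotation.type == WORD || annotation.type == NE) {
                std::map<std::string, int>::const_iterator found =
                    wordMap.find(annotation.value);
                int code;
                if (found == wordMap.end()) {
                    // first occurrence gets a fresh code, as a word map would assign
                    code = static_cast<int>(wordMap.size());
                    wordMap[annotation.value] = code;
                } else {
                    code = found->second;
                }
                codes.push_back(code);
                tokens.push_back(annotation);
            }
        }

        // The invariant the new method maintains: codes[i] codes tokens[i].
        assert(codes.size() == tokens.size());
        for (std::size_t i = 0; i < codes.size(); ++i)
            std::cout << tokens[i].value << " -> " << codes[i] << std::endl;
        return 0;
    }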

View File

@@ -3,6 +3,9 @@
 #include "concordia/common/config.hpp"
 #include "concordia/token_annotation.hpp"
+#include "concordia/word_map.hpp"
+
+#include <boost/shared_ptr.hpp>
 #include <string>
 #include <vector>
 #include <list>
@@ -39,6 +42,16 @@ public:
         return _tokenAnnotations;
     }

+    std::vector<INDEX_CHARACTER_TYPE> getCodes() const {
+        return _codes;
+    }
+
+    std::vector<TokenAnnotation> getTokens() const {
+        return _tokens;
+    }
+
+    void generateHash(boost::shared_ptr<WordMap> wordMap);
+
     /*!
     Transform the sentence to lower case.
     */
@@ -59,6 +72,10 @@ private:
     std::string _sentence;

     std::list<TokenAnnotation> _tokenAnnotations;
+
+    std::vector<INDEX_CHARACTER_TYPE> _codes;
+
+    std::vector<TokenAnnotation> _tokens;
 };

 #endif
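One caveat the header itself doesn't state: _codes and _tokens are empty until generateHash runs, and both getters return copies rather than references. A hedged sketch of the calling contract implied by this commit (tokenizer, wordMap, and sentence assumed constructed elsewhere; this is not library documentation):

    boost::shared_ptr<TokenizedSentence> ts = tokenizer->tokenize(sentence);
    assert(ts->getCodes().empty());                           // nothing coded yet
    ts->generateHash(wordMap);                                // codes WORD / NE annotations
    assert(ts->getCodes().size() == ts->getTokens().size());  // index-aligned results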