diff --git a/concordia/CMakeLists.txt b/concordia/CMakeLists.txt index 43a33b5..6f6e246 100644 --- a/concordia/CMakeLists.txt +++ b/concordia/CMakeLists.txt @@ -8,7 +8,6 @@ endforeach(dir) add_library(concordia SHARED token_annotation.cpp tokenized_sentence.cpp - hashed_sentence.cpp concordia_search_result.cpp matched_pattern_fragment.cpp concordia_searcher.cpp @@ -38,7 +37,6 @@ install(TARGETS concordia DESTINATION lib/) install(FILES token_annotation.hpp tokenized_sentence.hpp - hashed_sentence.hpp concordia_search_result.hpp matched_pattern_fragment.hpp concordia_searcher.hpp diff --git a/concordia/hash_generator.cpp b/concordia/hash_generator.cpp index 05e9afe..896f24e 100644 --- a/concordia/hash_generator.cpp +++ b/concordia/hash_generator.cpp @@ -1,10 +1,12 @@ #include "concordia/hash_generator.hpp" #include "concordia/common/utils.hpp" +#include "concordia/token_annotation.hpp" #include #include #include #include +#include #include @@ -28,28 +30,27 @@ HashGenerator::~HashGenerator() { std::vector HashGenerator::generateHash( const std::string & sentence) throw(ConcordiaException) { std::vector result; - std::vector tokenTexts = generateTokenVector(sentence); - if (tokenTexts.size() > Utils::maxSentenceSize) { + + boost::shared_ptr ts = _sentenceTokenizer->tokenize(sentence); + ts->generateHash(_wordMap); + + if (ts->getTokens().size() > Utils::maxSentenceSize) { throw ConcordiaException("Trying to add too long sentence."); } - for (std::vector::iterator it = tokenTexts.begin(); - it != tokenTexts.end(); ++it) { - std::string token = *it; - INDEX_CHARACTER_TYPE code = _wordMap->getWordCode(token); - result.push_back(code); - } - return result; + return ts->getCodes(); } std::vector HashGenerator::generateTokenVector( const std::string & sentence) { boost::shared_ptr ts = _sentenceTokenizer->tokenize(sentence); - std::string tokenizedSentence = ts->getSentence(); - boost::trim(tokenizedSentence); std::vector tokenTexts; - boost::split(tokenTexts, 
tokenizedSentence, boost::is_any_of(" \t\r\n"), - boost::algorithm::token_compress_on); + BOOST_FOREACH(TokenAnnotation annotation, ts->getAnnotations()) { + if (annotation.getType() == TokenAnnotation::WORD || + annotation.getType() == TokenAnnotation::NE) { + tokenTexts.push_back(annotation.getValue()); + } + } return tokenTexts; } diff --git a/concordia/hash_generator.hpp b/concordia/hash_generator.hpp index f9a4562..cd4c1d2 100644 --- a/concordia/hash_generator.hpp +++ b/concordia/hash_generator.hpp @@ -51,7 +51,7 @@ public: Generates vector of tokens from a sentence. This method is internally used by generateHash. However, for the sake of concordiaSearch (see \ref tutorial1_3), the vector of tokens resulting from sentence - anonymizing and tokenization is also needed. + tokenization is also needed. \param sentence sentence to tokenize \returns vector of tokens */ diff --git a/concordia/hashed_sentence.cpp b/concordia/hashed_sentence.cpp deleted file mode 100644 index 93c1147..0000000 --- a/concordia/hashed_sentence.cpp +++ /dev/null @@ -1,7 +0,0 @@ -#include "concordia/hashed_sentence.hpp" - -HashedSentence::HashedSentence() { -} - -HashedSentence::~HashedSentence() { -} diff --git a/concordia/hashed_sentence.hpp b/concordia/hashed_sentence.hpp deleted file mode 100644 index 85e234a..0000000 --- a/concordia/hashed_sentence.hpp +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef HASHED_SENTENCE_HDR -#define HASHED_SENTENCE_HDR - -#include "concordia/common/config.hpp" -#include "concordia/interval.hpp" -#include -#include - -/*! - A sentence after hashing by the HashGenerator. The class holds - the list of word codes and intervals representing original - word positions in the sentence (char-based). -*/ - -class HashedSentence { -public: - /*! - Constructor. - - */ - HashedSentence(); - - /*! Destructor. - */ - virtual ~HashedSentence(); - - /*! Getter for original word positions list. 
- \returns original word positions list - */ - std::vector<Interval> getOriginalWordPositions() const { - return _originalWordPositions; - } - - /*! Getter for word codes list. - \returns word codes list - */ - std::vector<INDEX_CHARACTER_TYPE> getWordCodes() const { - return _wordCodes; - } - - /*! Method for adding a word code to the list - \param word code to be added - */ - void addWordCode(INDEX_CHARACTER_TYPE wordCode) { - _wordCodes.push_back(wordCode); - } - - /*! Method for adding an original word position to the list. - \param original word position - */ - void addOriginalWordPosition(Interval & originalWordPosition) { - _originalWordPositions.push_back(originalWordPosition); - } - -private: - std::vector<Interval> _originalWordPositions; - - std::vector<INDEX_CHARACTER_TYPE> _wordCodes; -}; - -#endif diff --git a/concordia/t/test_hash_generator.cpp b/concordia/t/test_hash_generator.cpp index 8fdef81..d71f112 100644 --- a/concordia/t/test_hash_generator.cpp +++ b/concordia/t/test_hash_generator.cpp @@ -29,7 +29,7 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest ) BOOST_CHECK_EQUAL_COLLECTIONS(hash.begin(), hash.end(), expected.begin(), expected.end()); } -/* Commentet out - the test takes too long. Run it once whenever the SUFFIX_MARKER_SENTENCE_BYTES parameter changes. +/* Commented out - the test takes too long. Run it once whenever the SUFFIX_MARKER_SENTENCE_BYTES parameter changes. Or don't run it at all, whatever! I don't care! 
There is still the test for max sentence size in test_utils.cpp BOOST_AUTO_TEST_CASE( TooLongHashTest ) { diff --git a/concordia/tokenized_sentence.cpp b/concordia/tokenized_sentence.cpp index e828c54..6302567 100644 --- a/concordia/tokenized_sentence.cpp +++ b/concordia/tokenized_sentence.cpp @@ -47,3 +47,15 @@ void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) void TokenizedSentence::toLowerCase() { _sentence = TextUtils::getInstance().toLowerCase(_sentence); } + +void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) { + BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) { + if (annotation.getType() == TokenAnnotation::WORD || + annotation.getType() == TokenAnnotation::NE) { + _codes.push_back(wordMap->getWordCode(annotation.getValue())); + _tokens.push_back(annotation); + } + } + +} + diff --git a/concordia/tokenized_sentence.hpp b/concordia/tokenized_sentence.hpp index b1aa77e..a0ff96b 100644 --- a/concordia/tokenized_sentence.hpp +++ b/concordia/tokenized_sentence.hpp @@ -3,6 +3,9 @@ #include "concordia/common/config.hpp" #include "concordia/token_annotation.hpp" +#include "concordia/word_map.hpp" + +#include <boost/shared_ptr.hpp> #include <string> #include <vector> #include <list> @@ -39,6 +42,16 @@ public: return _tokenAnnotations; } + std::vector<INDEX_CHARACTER_TYPE> getCodes() const { + return _codes; + } + + std::vector<TokenAnnotation> getTokens() const { + return _tokens; + } + + void generateHash(boost::shared_ptr<WordMap> wordMap); + /*! Transform the sentence to lower case. */ @@ -59,6 +72,10 @@ private: std::string _sentence; std::list<TokenAnnotation> _tokenAnnotations; + + std::vector<INDEX_CHARACTER_TYPE> _codes; + + std::vector<TokenAnnotation> _tokens; }; #endif