#ifndef HASH_GENERATOR_HDR
#define HASH_GENERATOR_HDR

/* NOTE(review): the five #include directives directly below have no target
   in this copy of the file (the <...> part appears to have been stripped).
   The declarations in this header use std::string, std::vector and
   boost::shared_ptr, so their headers are presumably among the missing
   ones -- restore the exact list from version control. */
#include
#include
#include
#include
#include
#include "concordia/word_map.hpp"
#include "concordia/common/config.hpp"
#include "concordia/sentence_tokenizer.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp"

/*!
  Class for generating a sentence hash.

  The hash is generated from a sentence given as a raw string. The string
  is first anonymized and tokenized. After these operations, each token is
  coded as an integer, according to the WordMap. The resulting hash is
  a vector of integers.

  The sentence hash is used when adding a sentence to the index and
  during searching.

  HashGenerator holds a word map (_wordMap), used to code tokens as
  integers, and a sentence tokenizer (_sentenceTokenizer), used to
  preprocess the sentence string.

  NOTE(review): an earlier revision of this comment named the preprocessing
  component "SentenceAnonymizer"; the member and the included header are
  both named after a sentence tokenizer -- confirm the current class name.

  NOTE(review): the template arguments of every boost::shared_ptr and
  std::vector in this class are missing from this copy of the file and
  must be restored before the header can compile.

  NOTE(review): the throw(ConcordiaException) dynamic exception
  specifications used below are deprecated since C++11 and are not valid
  in C++17; if they are ever removed, the matching definitions have to be
  changed in the same commit.
*/
class HashGenerator {
public:
    /*! Constructor.
      \param config pointer to current config object
             (NOTE(review): presumably the configuration class declared in
             concordia/concordia_config.hpp -- template argument lost,
             confirm)
      \throws ConcordiaException as declared by the exception specification
    */
    explicit HashGenerator(boost::shared_ptr config)
                                             throw(ConcordiaException);

    /*! Destructor.
    */
    virtual ~HashGenerator();

    /*!
      Generates the hash of a sentence.
      \param sentence sentence to generate the hash from
      \returns vector of integers, one code per token
               (NOTE(review): element type lost in this copy -- confirm)
      \throws ConcordiaException as declared by the exception specification
    */
    std::vector generateHash(const std::string & sentence)
                                             throw(ConcordiaException);

    /*!
      Generates a vector of tokens from a sentence. This method is used
      internally by generateHash. However, for the sake of concordiaSearch
      (see \ref tutorial1_3), the vector of tokens resulting from sentence
      anonymizing and tokenization is also needed by callers, hence it is
      public.
      \param sentence sentence to tokenize
      \returns vector of tokens
               (NOTE(review): element type lost in this copy -- confirm)
    */
    std::vector generateTokenVector(const std::string & sentence);

    /*!
      Saves the contents of the current WordMap to HDD.
      NOTE(review): the destination is presumably _wordMapFilePath --
      confirm against the implementation.
    */
    void serializeWordMap();

    /*!
      Clears the word map.
    */
    void clearWordMap();

private:
    /* Word map used to code tokens as integers.
       NOTE(review): template argument lost -- presumably WordMap. */
    boost::shared_ptr _wordMap;

    /* Tokenizer used to preprocess the sentence string.
       NOTE(review): template argument lost -- presumably the class
       declared in concordia/sentence_tokenizer.hpp. */
    boost::shared_ptr _sentenceTokenizer;

    /* File path associated with the word map (see serializeWordMap). */
    std::string _wordMapFilePath;
};

#endif