concordia-library/concordia/hash_generator.hpp

69 lines
1.8 KiB
C++

#ifndef HASH_GENERATOR_HDR
#define HASH_GENERATOR_HDR
#include <string>
#include <map>
#include <vector>
#include <boost/shared_ptr.hpp>
#include <boost/algorithm/string/predicate.hpp>
#include "concordia/word_map.hpp"
#include "concordia/common/config.hpp"
#include "concordia/sentence_tokenizer.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp"
/*!
  Produces the hash of a sentence.

  The input is a raw sentence string. It is first split into tokens by
  SentenceTokenizer, and every token is then coded as an integer according
  to WordMap. The resulting hash is returned as a TokenizedSentence.
  Hashed sentences are used both when a sentence is added to the index
  and when the index is searched.

  A HashGenerator holds a WordMap (for coding tokens as integers) and
  a SentenceTokenizer (for tokenizing the sentence string).

  \note NOTE(review): the dynamic exception specifications used below
        (throw(ConcordiaException)) are deprecated since C++11 and were
        removed from the language in C++17. Presumably the out-of-line
        definitions carry the same specification, so header and
        implementation would have to be changed together - confirm
        before modernizing.
*/
class HashGenerator {
public:
    /*!
      Constructor.
      \param config pointer to the current config object
      \throws ConcordiaException as declared by the exception
              specification; the exact failure conditions are decided
              by the implementation and are not visible in this header
    */
    explicit HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
                                             throw(ConcordiaException);

    /*!
      Destructor.
    */
    virtual ~HashGenerator();

    /*!
      Generates the hash of a sentence.
      \param sentence the sentence to generate the hash from
      \returns tokenized sentence, containing the hash
      \throws ConcordiaException as declared by the exception
              specification; the exact failure conditions are decided
              by the implementation and are not visible in this header
    */
    TokenizedSentence generateHash(const std::string& sentence)
                                             throw(ConcordiaException);

    /*!
      Saves the contents of the current WordMap to disk.
    */
    void serializeWordMap();

    /*!
      Clears the word map.
    */
    void clearWordMap();

private:
    // Word map used to code tokens as integers.
    boost::shared_ptr<WordMap> _wordMap;

    // Tokenizer used to split the sentence string into tokens.
    boost::shared_ptr<SentenceTokenizer> _sentenceTokenizer;

    // Path of the word map file; presumably the location used by
    // serializeWordMap() - confirm against the implementation.
    std::string _wordMapFilePath;
};
#endif