69 lines
1.8 KiB
C++
69 lines
1.8 KiB
C++
#ifndef HASH_GENERATOR_HDR
|
|
#define HASH_GENERATOR_HDR
|
|
|
|
#include <string>
|
|
#include <map>
|
|
#include <vector>
|
|
#include <boost/shared_ptr.hpp>
|
|
#include <boost/algorithm/string/predicate.hpp>
|
|
#include "concordia/word_map.hpp"
|
|
#include "concordia/common/config.hpp"
|
|
#include "concordia/sentence_tokenizer.hpp"
|
|
#include "concordia/concordia_config.hpp"
|
|
#include "concordia/concordia_exception.hpp"
|
|
|
|
|
|
/*!
  Class for generating a sentence hash. The hash is generated from a sentence
  given as a raw string. The string is first anonymized and tokenized. After
  these operations, each token is coded as an integer, according to WordMap.
  The resulting hash is a vector of integers.

  The sentence hash is used when adding a sentence to the index and during
  searching.

  HashGenerator holds an instance of WordMap, used to code tokens as integers,
  and an instance of SentenceTokenizer, used to preprocess the sentence string.
*/
|
|
|
|
class HashGenerator {
|
|
public:
|
|
/*!
|
|
Constructor.
|
|
\param config pointer to current config object
|
|
*/
|
|
explicit HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
|
|
throw(ConcordiaException);
|
|
|
|
/*! Destructor.
|
|
*/
|
|
virtual ~HashGenerator();
|
|
|
|
/*!
|
|
Generates hash of a sentence.
|
|
\param sentence sentence to generate hash from
|
|
\returns vector of integers
|
|
*/
|
|
boost::shared_ptr<TokenizedSentence> generateHash(const std::string & sentence)
|
|
throw(ConcordiaException);
|
|
|
|
/*!
|
|
Saves the contents of current WordMap to HDD.
|
|
*/
|
|
void serializeWordMap();
|
|
|
|
/*!
|
|
Clears word map.
|
|
*/
|
|
void clearWordMap();
|
|
|
|
private:
|
|
boost::shared_ptr<WordMap> _wordMap;
|
|
|
|
boost::shared_ptr<SentenceTokenizer> _sentenceTokenizer;
|
|
|
|
std::string _wordMapFilePath;
|
|
};
|
|
|
|
#endif
|