concordia-library/concordia/hash_generator.hpp

80 lines
2.2 KiB
C++

#ifndef HASH_GENERATOR_HDR
#define HASH_GENERATOR_HDR
#include <string>
#include <map>
#include <vector>
#include <boost/shared_ptr.hpp>
#include <boost/algorithm/string/predicate.hpp>
#include "concordia/word_map.hpp"
#include "concordia/common/config.hpp"
#include "concordia/sentence_tokenizer.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp"
/*!
Class for generating a sentence hash. The hash is generated from a sentence
given as a raw string. The string is first anonymized and tokenized. After
these operations, each token is coded as an integer, according to WordMap.
The resulting hash is a vector of integers.
The sentence hash is used when adding a sentence to the index and during
searching. HashGenerator holds an instance of WordMap, used to code tokens
as integers, and an instance of SentenceTokenizer, used to preprocess the
sentence string.
*/
class HashGenerator {
public:
    /*!
      Creates a hash generator configured from the given config object.

      NOTE(review): the dynamic exception specifications used in this class
      are deprecated since C++11 and ill-formed in C++17. They are kept here
      because the out-of-line definitions presumably carry the same
      specifications and the two must agree -- confirm before removing.

      \param config shared pointer to the current config object
      \throws ConcordiaException per the declared exception specification
    */
    explicit HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
                                               throw(ConcordiaException);

    /*!
      Virtual destructor.
    */
    virtual ~HashGenerator();

    /*!
      Computes the hash of a sentence.

      \param sentence raw sentence string to be hashed
      \returns the hash as a vector of INDEX_CHARACTER_TYPE values
      \throws ConcordiaException per the declared exception specification
    */
    std::vector<INDEX_CHARACTER_TYPE> generateHash(const std::string & sentence)
                                               throw(ConcordiaException);

    /*!
      Splits a sentence into its tokens. generateHash relies on this method
      internally, but the token vector is exposed as well because
      concordiaSearch (see \ref tutorial1_3) needs the result of sentence
      tokenization.

      \param sentence raw sentence string to be tokenized
      \returns vector of tokens
    */
    std::vector<std::string> generateTokenVector(const std::string & sentence);

    /*!
      Writes the contents of the current WordMap to disk.
    */
    void serializeWordMap();

    /*!
      Clears the word map.
    */
    void clearWordMap();

private:
    // Word map used to code tokens as integers.
    boost::shared_ptr<WordMap> _wordMap;

    // Tokenizer used to preprocess the sentence string.
    boost::shared_ptr<SentenceTokenizer> _sentenceTokenizer;

    // Presumably the file path the word map is serialized to -- verify
    // against the implementation.
    std::string _wordMapFilePath;
};
#endif