concordia-library/concordia/hash_generator.hpp

84 lines
2.5 KiB
C++

#ifndef HASH_GENERATOR_HDR
#define HASH_GENERATOR_HDR
#include <string>
#include <map>
#include <vector>
#include <boost/shared_ptr.hpp>
#include <boost/algorithm/string/predicate.hpp>
#include "concordia/word_map.hpp"
#include "concordia/common/config.hpp"
#include "concordia/sentence_tokenizer.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp"
/*!
    Produces the hashed form of a sentence.

    The input is a sentence given as a raw string. It is first split into
    tokens by a SentenceTokenizer; each token is then coded as an integer
    according to a WordMap. The result is returned as a TokenizedSentence.
    Hashed sentences are used both when adding a sentence to the index and
    when searching.

    A HashGenerator holds a WordMap (token -> integer coding) and
    a SentenceTokenizer (string -> tokens), both through shared pointers.

    NOTE(review): the member functions below carry dynamic exception
    specifications (throw(ConcordiaException)). That language feature is
    deprecated since C++11 and removed in C++17, so this header only builds
    in an earlier language mode. Removing the specifications has to be done
    together with the out-of-line definitions, which are not part of this
    header.
*/
class HashGenerator {
public:
    /*!
        Constructor.

        \param indexPath path to the index directory
        \param config    pointer to current config object
        \throws ConcordiaException as declared by the exception specification
    */
    explicit HashGenerator(std::string indexPath,
                           boost::shared_ptr<ConcordiaConfig> config)
                                             throw(ConcordiaException);

    /*!
        Destructor. Declared virtual.
    */
    virtual ~HashGenerator();

    /*!
        Generates hash of a sentence.

        \param sentence     sentence to generate hash from
        \param byWhitespace whether to tokenize the sentence by whitespace
                            (defaults to false)
        \returns tokenized sentence, containing the hash
        \throws ConcordiaException as declared by the exception specification
    */
    TokenizedSentence generateHash(const std::string & sentence,
                                   bool byWhitespace = false)
                                             throw(ConcordiaException);

    /*!
        Tokenization-only counterpart of generateHash. The returned
        TokenizedSentence carries the tokens but no token codes information.

        \param sentence     sentence to tokenize
        \param byWhitespace whether to tokenize the sentence by whitespace
                            (defaults to false)
        \returns tokenized sentence, containing the tokens
        \throws ConcordiaException as declared by the exception specification
    */
    TokenizedSentence generateTokens(const std::string & sentence,
                                     bool byWhitespace = false)
                                             throw(ConcordiaException);

    /*!
        Saves the contents of current WordMap to HDD.
        NOTE(review): the target is presumably _wordMapFilePath - confirm
        against the implementation.
    */
    void serializeWordMap();

    /*!
        Clears word map.
    */
    void clearWordMap();

private:
    // Word map used to code tokens as integers.
    boost::shared_ptr<WordMap> _wordMap;

    // Tokenizer used to split the sentence string into tokens.
    boost::shared_ptr<SentenceTokenizer> _sentenceTokenizer;

    // Path of the word map file.
    std::string _wordMapFilePath;
};
#endif