// C++ header, 81 lines, 2.3 KiB (file-viewer metadata; not part of the original source).
#ifndef HASH_GENERATOR_HDR
|
|
#define HASH_GENERATOR_HDR
|
|
|
|
#include <string>
|
|
#include <map>
|
|
#include <vector>
|
|
#include <boost/shared_ptr.hpp>
|
|
#include <boost/algorithm/string/predicate.hpp>
|
|
#include "concordia/word_map.hpp"
|
|
#include "concordia/common/config.hpp"
|
|
#include "concordia/sentence_tokenizer.hpp"
|
|
#include "concordia/concordia_config.hpp"
|
|
#include "concordia/concordia_exception.hpp"
|
|
|
|
|
|
/*!
|
|
Class for generating a sentence hash. The hash is generated from a sentence
|
|
given in raw string. String is first tokenized by SentenceTokenizer and
|
|
then each token is coded as an integer, according to WordMap.
|
|
Resulting hash is an instance of TokenizedSentence.
|
|
|
|
Hashed sentence is used when adding a sentence to index and during searching.
|
|
|
|
HashGenerator holds an instance of WordMap, used to code tokens as integers
|
|
and SentenceTokenizer, used to tokenize the sentence string.
|
|
|
|
*/
|
|
|
|
class HashGenerator {
|
|
public:
|
|
/*!
|
|
Constructor.
|
|
\param indexPath path to the index directory
|
|
\param config pointer to current config object
|
|
*/
|
|
explicit HashGenerator(std::string indexPath,
|
|
boost::shared_ptr<ConcordiaConfig> config);
|
|
|
|
/*! Destructor.
|
|
*/
|
|
virtual ~HashGenerator();
|
|
|
|
/*!
|
|
Generates hash of a sentence.
|
|
\param sentence sentence to generate hash from
|
|
\param byWhitespace whether to tokenize the sentence by whitespace
|
|
\returns tokenized sentence, containing the hash
|
|
*/
|
|
TokenizedSentence generateHash(const std::string & sentence,
|
|
bool byWhitespace = false);
|
|
|
|
/*!
|
|
This method acts like generateHash, but only performs tokenization.
|
|
Resulting TokenizedSentence does not have token codes information.
|
|
\param sentence sentence to tokenize
|
|
\param byWhitespace whether to tokenize the sentence by whitespace
|
|
\returns tokenized sentence, containing the tokens
|
|
*/
|
|
TokenizedSentence generateTokens(const std::string & sentence,
|
|
bool byWhitespace = false);
|
|
|
|
/*!
|
|
Saves the contents of current WordMap to HDD.
|
|
*/
|
|
void serializeWordMap();
|
|
|
|
/*!
|
|
Clears word map.
|
|
*/
|
|
void clearWordMap();
|
|
|
|
private:
|
|
boost::shared_ptr<WordMap> _wordMap;
|
|
|
|
boost::shared_ptr<SentenceTokenizer> _sentenceTokenizer;
|
|
|
|
std::string _wordMapFilePath;
|
|
};
|
|
|
|
#endif
|