2013-11-12 16:58:31 +01:00
|
|
|
#ifndef HASH_GENERATOR_HDR
|
|
|
|
#define HASH_GENERATOR_HDR
|
|
|
|
|
|
|
|
#include <string>
|
|
|
|
#include <map>
|
|
|
|
#include <vector>
|
2013-11-14 15:44:50 +01:00
|
|
|
#include <boost/shared_ptr.hpp>
|
2014-03-14 11:30:17 +01:00
|
|
|
#include <boost/algorithm/string/predicate.hpp>
|
2013-11-14 15:44:50 +01:00
|
|
|
#include "concordia/word_map.hpp"
|
2013-12-06 22:29:25 +01:00
|
|
|
#include "concordia/common/config.hpp"
|
2014-04-13 12:21:30 +02:00
|
|
|
#include "concordia/sentence_anonymizer.hpp"
|
|
|
|
#include "concordia/concordia_config.hpp"
|
2013-11-12 22:08:37 +01:00
|
|
|
#include "concordia/concordia_exception.hpp"
|
2013-11-12 16:58:31 +01:00
|
|
|
|
2013-11-20 17:43:29 +01:00
|
|
|
|
2013-11-12 16:58:31 +01:00
|
|
|
/*!
|
2015-05-01 14:52:53 +02:00
|
|
|
Class for generating a sentence hash. The hash is generated from a sentence
|
|
|
|
given in raw string. String is first anonymized and tokenized. After these
|
|
|
|
operations, each token is coded as an integer, according to WordMap.
|
|
|
|
Resulting hash is a vector of integers.
|
|
|
|
|
|
|
|
Sentence hashed is used when adding a sentence to index and during searching.
|
|
|
|
|
|
|
|
HashGenerator holds an instance of WordMap, used to code tokens as integers
|
|
|
|
and SentenceAnonymizer, used to preprocess the sentence string.
|
2013-11-12 16:58:31 +01:00
|
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
class HashGenerator {
|
|
|
|
public:
|
2015-05-01 14:52:53 +02:00
|
|
|
/*!
|
|
|
|
Constructor.
|
|
|
|
\param config pointer to current config object
|
|
|
|
*/
|
2014-04-13 12:21:30 +02:00
|
|
|
explicit HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
|
|
|
|
throw(ConcordiaException);
|
2013-11-14 15:44:50 +01:00
|
|
|
|
2013-11-12 16:58:31 +01:00
|
|
|
/*! Destructor.
|
|
|
|
*/
|
|
|
|
virtual ~HashGenerator();
|
|
|
|
|
2015-05-01 14:52:53 +02:00
|
|
|
/*!
|
|
|
|
Generates hash of a sentence.
|
|
|
|
\param sentence sentence to generate hash from
|
|
|
|
\returns vector of integers
|
|
|
|
*/
|
2015-04-15 14:14:10 +02:00
|
|
|
std::vector<INDEX_CHARACTER_TYPE> generateHash(const std::string & sentence)
|
2014-03-14 11:30:17 +01:00
|
|
|
throw(ConcordiaException);
|
2013-11-12 16:58:31 +01:00
|
|
|
|
2015-05-01 14:52:53 +02:00
|
|
|
/*!
|
|
|
|
Generates vector of tokens from a sentence. This method is internally
|
|
|
|
used by generateHash. However, for the sake of concordiaSearch
|
|
|
|
(see \ref tutorial1_3), the vector of tokens resulting from sentence
|
|
|
|
anonymizing and tokenization is also needed.
|
|
|
|
\param sentence sentence to tokenize
|
|
|
|
\returns vector of tokens
|
|
|
|
*/
|
2015-04-15 14:14:10 +02:00
|
|
|
std::vector<std::string> generateTokenVector(const std::string & sentence);
|
2014-04-29 14:46:04 +02:00
|
|
|
|
2015-05-01 14:52:53 +02:00
|
|
|
|
|
|
|
/*!
|
|
|
|
Saves the contents of current WordMap to HDD.
|
|
|
|
*/
|
2013-11-12 16:58:31 +01:00
|
|
|
void serializeWordMap();
|
|
|
|
|
2015-05-04 20:40:44 +02:00
|
|
|
/*!
|
|
|
|
Clears word map.
|
|
|
|
*/
|
|
|
|
void clearWordMap();
|
|
|
|
|
2013-11-12 16:58:31 +01:00
|
|
|
private:
|
2013-11-14 15:44:50 +01:00
|
|
|
boost::shared_ptr<WordMap> _wordMap;
|
2014-04-29 14:46:04 +02:00
|
|
|
|
2014-04-13 12:21:30 +02:00
|
|
|
boost::shared_ptr<SentenceAnonymizer> _sentenceAnonymizer;
|
2013-11-12 16:58:31 +01:00
|
|
|
|
2015-04-15 14:14:10 +02:00
|
|
|
std::string _wordMapFilePath;
|
2013-11-12 16:58:31 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
#endif
|