concordia-library/concordia/hash_generator.hpp

75 lines
2.1 KiB
C++
Raw Normal View History

2013-11-12 16:58:31 +01:00
#ifndef HASH_GENERATOR_HDR
#define HASH_GENERATOR_HDR
#include <string>
#include <map>
#include <vector>
2013-11-14 15:44:50 +01:00
#include <boost/shared_ptr.hpp>
#include <boost/algorithm/string/predicate.hpp>
2013-11-14 15:44:50 +01:00
#include "concordia/word_map.hpp"
#include "concordia/common/config.hpp"
#include "concordia/sentence_anonymizer.hpp"
#include "concordia/concordia_config.hpp"
2013-11-12 22:08:37 +01:00
#include "concordia/concordia_exception.hpp"
2013-11-12 16:58:31 +01:00
2013-11-20 17:43:29 +01:00
2013-11-12 16:58:31 +01:00
/*!
2015-05-01 14:52:53 +02:00
Class for generating a sentence hash. The hash is generated from a sentence
given in raw string. String is first anonymized and tokenized. After these
operations, each token is coded as an integer, according to WordMap.
Resulting hash is a vector of integers.
Sentence hashed is used when adding a sentence to index and during searching.
HashGenerator holds an instance of WordMap, used to code tokens as integers
and SentenceAnonymizer, used to preprocess the sentence string.
2013-11-12 16:58:31 +01:00
*/
class HashGenerator {
public:
2015-05-01 14:52:53 +02:00
/*!
Constructor.
\param config pointer to current config object
*/
explicit HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
throw(ConcordiaException);
2013-11-14 15:44:50 +01:00
2013-11-12 16:58:31 +01:00
/*! Destructor.
*/
virtual ~HashGenerator();
2015-05-01 14:52:53 +02:00
/*!
Generates hash of a sentence.
\param sentence sentence to generate hash from
\returns vector of integers
*/
std::vector<INDEX_CHARACTER_TYPE> generateHash(const std::string & sentence)
throw(ConcordiaException);
2013-11-12 16:58:31 +01:00
2015-05-01 14:52:53 +02:00
/*!
Generates vector of tokens from a sentence. This method is internally
used by generateHash. However, for the sake of concordiaSearch
(see \ref tutorial1_3), the vector of tokens resulting from sentence
anonymizing and tokenization is also needed.
\param sentence sentence to tokenize
\returns vector of tokens
*/
std::vector<std::string> generateTokenVector(const std::string & sentence);
2015-05-01 14:52:53 +02:00
/*!
Saves the contents of current WordMap to HDD.
*/
2013-11-12 16:58:31 +01:00
void serializeWordMap();
private:
2013-11-14 15:44:50 +01:00
boost::shared_ptr<WordMap> _wordMap;
boost::shared_ptr<SentenceAnonymizer> _sentenceAnonymizer;
2013-11-12 16:58:31 +01:00
std::string _wordMapFilePath;
2013-11-12 16:58:31 +01:00
};
#endif