concordia-library/concordia/hash_generator.hpp

73 lines
2.0 KiB
C++
Raw Normal View History

2013-11-12 16:58:31 +01:00
#ifndef HASH_GENERATOR_HDR
#define HASH_GENERATOR_HDR
#include <string>
#include <map>
#include <vector>
2013-11-14 15:44:50 +01:00
#include <boost/shared_ptr.hpp>
#include <boost/algorithm/string/predicate.hpp>
2013-11-14 15:44:50 +01:00
#include "concordia/word_map.hpp"
#include "concordia/common/config.hpp"
2015-06-25 10:12:51 +02:00
#include "concordia/sentence_tokenizer.hpp"
#include "concordia/concordia_config.hpp"
2013-11-12 22:08:37 +01:00
#include "concordia/concordia_exception.hpp"
2013-11-12 16:58:31 +01:00
2013-11-20 17:43:29 +01:00
2013-11-12 16:58:31 +01:00
/*!
2015-05-01 14:52:53 +02:00
Class for generating a sentence hash. The hash is generated from a sentence
2015-06-27 12:40:24 +02:00
given in raw string. String is first tokenized by SentenceTokenizer and
then each token is coded as an integer, according to WordMap.
Resulting hash is an instance of TokenizedSentence.
2015-05-01 14:52:53 +02:00
2015-06-27 12:40:24 +02:00
Hashed sentence is used when adding a sentence to index and during searching.
2015-05-01 14:52:53 +02:00
HashGenerator holds an instance of WordMap, used to code tokens as integers
2015-06-27 12:40:24 +02:00
and SentenceTokenizer, used to tokenize the sentence string.
2013-11-12 16:58:31 +01:00
*/
class HashGenerator {
public:
2015-05-01 14:52:53 +02:00
/*!
Constructor.
\param indexPath path to the index directory
2015-05-01 14:52:53 +02:00
\param config pointer to current config object
*/
explicit HashGenerator(std::string indexPath,
boost::shared_ptr<ConcordiaConfig> config)
throw(ConcordiaException);
2013-11-14 15:44:50 +01:00
2013-11-12 16:58:31 +01:00
/*! Destructor.
*/
virtual ~HashGenerator();
2015-05-01 14:52:53 +02:00
/*!
Generates hash of a sentence.
\param sentence sentence to generate hash from
2015-12-27 20:54:40 +01:00
\param byWhitespace whether to tokenize the sentence by whitespace
2015-06-27 12:40:24 +02:00
\returns tokenized sentence, containing the hash
2015-05-01 14:52:53 +02:00
*/
2015-12-27 20:54:40 +01:00
TokenizedSentence generateHash(const std::string & sentence,
bool byWhitespace = false)
2015-08-19 20:49:26 +02:00
throw(ConcordiaException);
2013-11-12 16:58:31 +01:00
2015-05-01 14:52:53 +02:00
/*!
Saves the contents of current WordMap to HDD.
*/
2013-11-12 16:58:31 +01:00
void serializeWordMap();
2015-05-04 20:40:44 +02:00
/*!
Clears word map.
*/
void clearWordMap();
2013-11-12 16:58:31 +01:00
private:
2013-11-14 15:44:50 +01:00
boost::shared_ptr<WordMap> _wordMap;
2015-06-25 10:12:51 +02:00
boost::shared_ptr<SentenceTokenizer> _sentenceTokenizer;
2013-11-12 16:58:31 +01:00
std::string _wordMapFilePath;
2013-11-12 16:58:31 +01:00
};
#endif