concordia-library/concordia/hash_generator.hpp

84 lines
2.5 KiB
C++
Raw Normal View History

2013-11-12 16:58:31 +01:00
#ifndef HASH_GENERATOR_HDR
#define HASH_GENERATOR_HDR
#include <string>
#include <map>
#include <vector>
2013-11-14 15:44:50 +01:00
#include <boost/shared_ptr.hpp>
#include <boost/algorithm/string/predicate.hpp>
2013-11-14 15:44:50 +01:00
#include "concordia/word_map.hpp"
#include "concordia/common/config.hpp"
2015-06-25 10:12:51 +02:00
#include "concordia/sentence_tokenizer.hpp"
#include "concordia/concordia_config.hpp"
2013-11-12 22:08:37 +01:00
#include "concordia/concordia_exception.hpp"
2013-11-12 16:58:31 +01:00
2013-11-20 17:43:29 +01:00
2013-11-12 16:58:31 +01:00
/*!
  Class for generating a sentence hash. The hash is generated from a sentence
  given as a raw string. The string is first tokenized by SentenceTokenizer
  and then each token is coded as an integer, according to WordMap.
  The resulting hash is an instance of TokenizedSentence.

  The hashed sentence is used when adding a sentence to the index and during
  searching. HashGenerator holds an instance of WordMap, used to code tokens
  as integers, and an instance of SentenceTokenizer, used to tokenize the
  sentence string.
*/
class HashGenerator {
public:
2015-05-01 14:52:53 +02:00
/*!
Constructor.
\param indexPath path to the index directory
2015-05-01 14:52:53 +02:00
\param config pointer to current config object
*/
explicit HashGenerator(std::string indexPath,
boost::shared_ptr<ConcordiaConfig> config)
throw(ConcordiaException);
2013-11-14 15:44:50 +01:00
2013-11-12 16:58:31 +01:00
/*! Destructor.
*/
virtual ~HashGenerator();
2015-05-01 14:52:53 +02:00
/*!
Generates hash of a sentence.
\param sentence sentence to generate hash from
2015-12-27 20:54:40 +01:00
\param byWhitespace whether to tokenize the sentence by whitespace
2015-06-27 12:40:24 +02:00
\returns tokenized sentence, containing the hash
2015-05-01 14:52:53 +02:00
*/
2015-12-27 20:54:40 +01:00
TokenizedSentence generateHash(const std::string & sentence,
bool byWhitespace = false)
2015-08-19 20:49:26 +02:00
throw(ConcordiaException);
2013-11-12 16:58:31 +01:00
2016-01-01 20:45:07 +01:00
/*!
This method acts like generateHash, but only performs tokenization.
Resulting TokenizedSentence does not have token codes information.
\param sentence sentence to tokenize
\param byWhitespace whether to tokenize the sentence by whitespace
\returns tokenized sentence, containing the tokens
*/
TokenizedSentence generateTokens(const std::string & sentence,
bool byWhitespace = false)
throw(ConcordiaException);
2015-05-01 14:52:53 +02:00
/*!
Saves the contents of current WordMap to HDD.
*/
2013-11-12 16:58:31 +01:00
void serializeWordMap();
2015-05-04 20:40:44 +02:00
/*!
Clears word map.
*/
void clearWordMap();
2013-11-12 16:58:31 +01:00
private:
2013-11-14 15:44:50 +01:00
boost::shared_ptr<WordMap> _wordMap;
2015-06-25 10:12:51 +02:00
boost::shared_ptr<SentenceTokenizer> _sentenceTokenizer;
2013-11-12 16:58:31 +01:00
std::string _wordMapFilePath;
2013-11-12 16:58:31 +01:00
};
#endif