// concordia-library/concordia/sentence_tokenizer.hpp
#ifndef SENTENCE_TOKENIZER_HDR
#define SENTENCE_TOKENIZER_HDR
#include <string>
#include <vector>
#include "concordia/common/config.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/regex_rule.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp"
#include <boost/shared_ptr.hpp>
#include <boost/filesystem.hpp>
/*!
  Class for tokenizing a sentence before generating its hash.
  This operation is used to remove unnecessary symbols and possibly
  words from sentences added to the index and from search patterns.
  The tokenizer annotates html tags, removes stop words (if the option
  is enabled), and also annotates named entities and special symbols.
  All of these have to be listed in files (see \ref tutorial3).
*/
class SentenceTokenizer {
public:
2015-05-01 14:52:53 +02:00
/*! Constructor.
\param config config object, holding paths to necessary files
*/
2015-06-25 10:12:51 +02:00
explicit SentenceTokenizer(boost::shared_ptr<ConcordiaConfig> config)
throw(ConcordiaException);
/*! Destructor.
*/
2015-06-25 10:12:51 +02:00
virtual ~SentenceTokenizer();
2015-06-25 10:12:51 +02:00
/*! Tokenizes the sentence.
2015-05-01 14:52:53 +02:00
\param sentence input sentence
\returns altered version of the input sentence
*/
2015-06-25 10:12:51 +02:00
boost::shared_ptr<TokenizedSentence>
tokenize(const std::string & sentence);
private:
void _createNeRules(std::string & namedEntitiesPath);
void _createHtmlTagsRule(std::string & htmlTagsPath);
2015-06-25 10:12:51 +02:00
boost::shared_ptr<RegexRule> _getMultipleRegexRule(
std::string filePath,
char annotationType,
std::string value,
bool wholeWord = false);
2015-06-22 13:52:56 +02:00
std::vector<RegexRule> _namedEntities;
2015-06-22 13:52:56 +02:00
boost::shared_ptr<RegexRule> _htmlTags;
bool _stopWordsEnabled;
2015-06-22 13:52:56 +02:00
boost::shared_ptr<RegexRule> _stopWords;
};
#endif