// concordia-library/concordia/sentence_tokenizer.hpp
#ifndef SENTENCE_TOKENIZER_HDR
#define SENTENCE_TOKENIZER_HDR
#include <string>
#include <vector>
#include "concordia/common/config.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/regex_rule.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp"
#include <boost/shared_ptr.hpp>
#include <boost/filesystem.hpp>
/*!
  Class for tokenizing a sentence before generating a hash.
  The tokenizer ignores unnecessary symbols, html tags and possibly stop words
  (if the option is enabled) in sentences added to the index,
  and annotates named entities. All these have to be listed in files
  (see \ref tutorial3).
*/
class SentenceTokenizer {
public:
2015-05-01 14:52:53 +02:00
/*! Constructor.
\param config config object, holding paths to necessary files
*/
2019-01-18 13:30:51 +01:00
explicit SentenceTokenizer(boost::shared_ptr<ConcordiaConfig> config);
/*! Destructor.
*/
2015-06-25 10:12:51 +02:00
virtual ~SentenceTokenizer();
2015-06-25 10:12:51 +02:00
/*! Tokenizes the sentence.
2015-05-01 14:52:53 +02:00
\param sentence input sentence
2015-12-27 20:54:40 +01:00
\param byWhitespace whether to tokenize the sentence by whitespace
2015-06-27 12:40:24 +02:00
\returns tokenized sentence object build on the input sentence
2015-05-01 14:52:53 +02:00
*/
2015-12-27 20:54:40 +01:00
TokenizedSentence tokenize(const std::string & sentence,
bool byWhitespace = false);
private:
void _createNeRules(std::string & namedEntitiesPath);
void _createHtmlTagsRule(std::string & htmlTagsPath);
2015-06-25 10:12:51 +02:00
boost::shared_ptr<RegexRule> _getMultipleRegexRule(
std::string filePath,
char annotationType,
std::string value,
bool wholeWord = false);
2015-06-22 13:52:56 +02:00
std::vector<RegexRule> _namedEntities;
2015-06-22 13:52:56 +02:00
boost::shared_ptr<RegexRule> _htmlTags;
bool _stopWordsEnabled;
2015-06-22 13:52:56 +02:00
boost::shared_ptr<RegexRule> _stopWords;
};
#endif