#ifndef SENTENCE_TOKENIZER_HDR
#define SENTENCE_TOKENIZER_HDR

#include <string>
#include <vector>
#include "concordia/common/config.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/regex_rule.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp"
#include <boost/shared_ptr.hpp>
#include <boost/filesystem.hpp>

/*!
  Class for tokenizing a sentence before generating its hash.
  The tokenizer ignores unnecessary symbols, HTML tags and, if the
  option is enabled, stop words in sentences added to the index,
  and annotates named entities. All of these have to be listed
  in files (see \ref tutorial3).
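
  A minimal usage sketch (illustrative only; it assumes that
  "concordia.cfg" is the path to a valid Concordia configuration
  file accepted by the ConcordiaConfig constructor):
  \code
    boost::shared_ptr<ConcordiaConfig> config(
        new ConcordiaConfig("concordia.cfg"));
    SentenceTokenizer tokenizer(config);
    boost::shared_ptr<TokenizedSentence> tokenizedSentence =
        tokenizer.tokenize("Alice has a <b>cat</b>");
  \endcode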
*/

class SentenceTokenizer {
public:
    /*! Constructor.
      \param config configuration object holding paths to the necessary files
    */
    explicit SentenceTokenizer(boost::shared_ptr<ConcordiaConfig> config)
                                             throw(ConcordiaException);

    /*! Destructor.
    */
    virtual ~SentenceTokenizer();

    /*! Tokenizes the sentence.
      \param sentence input sentence
      \returns tokenized sentence object built from the input sentence
    */
    boost::shared_ptr<TokenizedSentence>
                    tokenize(const std::string & sentence);

private:
    // Builds the named entity annotation rules from the file
    // under namedEntitiesPath.
    void _createNeRules(std::string & namedEntitiesPath);

    // Builds the rule covering the html tags listed in the file
    // under htmlTagsPath.
    void _createHtmlTagsRule(std::string & htmlTagsPath);

    // Builds a single rule out of all the expressions listed in the
    // file under filePath, annotated with the given type and value.
    boost::shared_ptr<RegexRule> _getMultipleRegexRule(
                                     std::string filePath,
                                     char annotationType,
                                     std::string value,
                                     bool wholeWord = false);

    std::vector<RegexRule> _namedEntities;

    boost::shared_ptr<RegexRule> _htmlTags;

    bool _stopWordsEnabled;

    boost::shared_ptr<RegexRule> _stopWords;
};

#endif