68 lines
2.0 KiB
C++
68 lines
2.0 KiB
C++
#ifndef SENTENCE_ANONYMIZER_HDR
|
|
#define SENTENCE_ANONYMIZER_HDR
|
|
|
|
#include <string>
|
|
#include <vector>
|
|
#include "concordia/common/config.hpp"
|
|
#include "concordia/anonymized_sentence.hpp"
|
|
#include "concordia/regex_rule.hpp"
|
|
#include "concordia/concordia_config.hpp"
|
|
#include "concordia/concordia_exception.hpp"
|
|
#include <boost/shared_ptr.hpp>
|
|
#include <boost/filesystem.hpp>
|
|
|
|
|
|
/*!
  Class for anonymizing a sentence before generating its hash.
  This operation is used to
  remove unnecessary symbols and possibly words from sentences added to the index
  and from search patterns. The anonymizer removes html tags, substitutes predefined
  symbols with a single space, removes stop words (if the option is enabled), as well
  as named entities and special symbols. All these have to be listed in files
  (see \ref tutorial3).
*/
|
|
|
|
class SentenceAnonymizer {
|
|
public:
|
|
/*! Constructor.
|
|
\param config config object, holding paths to necessary files
|
|
*/
|
|
explicit SentenceAnonymizer(boost::shared_ptr<ConcordiaConfig> config)
|
|
throw(ConcordiaException);
|
|
|
|
/*! Destructor.
|
|
*/
|
|
virtual ~SentenceAnonymizer();
|
|
|
|
/*! Anonymizes the sentence.
|
|
\param sentence input sentence
|
|
\returns altered version of the input sentence
|
|
*/
|
|
boost::shared_ptr<AnonymizedSentence>
|
|
anonymize(const std::string & sentence);
|
|
|
|
private:
|
|
void _createNeRules(std::string & namedEntitiesPath);
|
|
|
|
void _createHtmlTagsRule(std::string & htmlTagsPath);
|
|
|
|
boost::shared_ptr<RegexRule> _getMultipleReplacementRule(
|
|
std::string & filePath,
|
|
std::string replacement,
|
|
bool wholeWord = false);
|
|
|
|
std::vector<RegexRule> _namedEntities;
|
|
|
|
boost::shared_ptr<RegexRule> _htmlTags;
|
|
|
|
bool _stopWordsEnabled;
|
|
|
|
boost::shared_ptr<RegexRule> _stopWords;
|
|
|
|
boost::shared_ptr<RegexRule> _stopSymbols;
|
|
|
|
boost::shared_ptr<RegexRule> _spaceSymbols;
|
|
};
|
|
|
|
#endif
|