concordia-library/concordia/sentence_anonymizer.hpp

68 lines
2.0 KiB
C++

#ifndef SENTENCE_ANONYMIZER_HDR
#define SENTENCE_ANONYMIZER_HDR
#include <string>
#include <vector>
#include "concordia/common/config.hpp"
#include "concordia/anonymized_sentence.hpp"
#include "concordia/regex_rule.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp"
#include <boost/shared_ptr.hpp>
#include <boost/filesystem.hpp>
/*!
Class for anonymizing a sentence before generating the hash.
This operation is used to
remove unnecessary symbols and possibly words from sentences added to the index
and from search patterns. The anonymizer removes HTML tags, substitutes predefined symbols
with a single space, and removes stop words (if the option is enabled), as well as
named entities and special symbols. All of these have to be listed in files
(see \ref tutorial3).
*/
class SentenceAnonymizer {
public:
    /*! Constructor.
      \param config shared pointer to the config object, holding
                    paths to necessary files
      \throws ConcordiaException the only exception type permitted
                    by the exception specification
    */
    explicit SentenceAnonymizer(boost::shared_ptr<ConcordiaConfig> config)
                                                throw(ConcordiaException);

    /*! Destructor.
    */
    virtual ~SentenceAnonymizer();

    /*! Anonymizes the sentence.
      \param sentence input sentence
      \returns shared pointer to the altered version of the input sentence
    */
    boost::shared_ptr<AnonymizedSentence> anonymize(
                                            const std::string & sentence);

private:
    // ---- rule construction helpers (defined outside this header) ----

    // Presumably fills _namedEntities from the file at namedEntitiesPath;
    // confirm against the definition.
    void _createNeRules(std::string & namedEntitiesPath);

    // Presumably sets up _htmlTags from the file at htmlTagsPath;
    // confirm against the definition.
    void _createHtmlTagsRule(std::string & htmlTagsPath);

    // Returns a shared pointer to a single RegexRule. Judging by the
    // parameter names, the rule is built from the entries of the file at
    // filePath, substitutes them with `replacement` and, when wholeWord is
    // true, matches whole words only -- TODO confirm against the definition.
    // wholeWord defaults to false.
    boost::shared_ptr<RegexRule> _getMultipleReplacementRule(
                                            std::string & filePath,
                                            std::string replacement,
                                            bool wholeWord = false);

    // ---- state ----
    // NOTE: keep the declaration order below unchanged; it determines
    // the member initialization order.

    // Rules for named entities (presumably one RegexRule per entity type).
    std::vector<RegexRule> _namedEntities;

    // Rule for html tags.
    boost::shared_ptr<RegexRule> _htmlTags;

    // Flag for the stop words option; presumably gates use of _stopWords.
    bool _stopWordsEnabled;

    // Rule for stop words.
    boost::shared_ptr<RegexRule> _stopWords;

    // Rule for stop symbols (presumably the "special symbols" to remove).
    boost::shared_ptr<RegexRule> _stopSymbols;

    // Rule for space symbols (presumably the symbols that are substituted
    // with a single space).
    boost::shared_ptr<RegexRule> _spaceSymbols;
};
#endif