#ifndef SENTENCE_ANONYMIZER_HDR
#define SENTENCE_ANONYMIZER_HDR

#include
#include
#include "concordia/common/config.hpp"
#include "concordia/regex_replacement.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp"
#include
#include

/*!
  Class for anonymizing a sentence before its hash is generated.
  This operation is used to remove unnecessary symbols, and possibly
  words, from sentences added to the index and from search patterns.
  The anonymizer removes html tags, substitutes predefined symbols
  with a single space, removes stop words (if the option is enabled),
  as well as named entities and special symbols. All of these have to
  be listed in files (see \ref tutorial3).
*/
class SentenceAnonymizer {
public:
    /*! Constructor.
      \param config config object, holding paths to necessary files
      \throws ConcordiaException as declared by the exception
              specification; the exact failure conditions are defined
              in the implementation, which is not visible here
              (presumably unreadable or malformed rule files --
              TODO confirm).

      NOTE(review): dynamic exception specifications are deprecated
      since C++11 and removed in C++17, so this declaration will not
      compile under -std=c++17 or later. Removing it requires changing
      the out-of-line definition at the same time.
    */
    explicit SentenceAnonymizer(boost::shared_ptr config)
                                             throw(ConcordiaException);

    /*! Destructor. Declared virtual, so the class can be safely
        destroyed through a pointer to this type from a derived class.
    */
    virtual ~SentenceAnonymizer();

    /*! Anonymizes the sentence.
      \param sentence input sentence, taken by const reference
                      (the caller's string is not modified)
      \returns altered version of the input sentence, as a new string
    */
    std::string anonymize(const std::string & sentence);

private:
    /*! Creates the named entity rules from a file.
      Presumably fills _namedEntities -- the definition is not visible
      in this header, TODO confirm.
      \param namedEntitiesPath path to the file listing named entities.
             NOTE(review): taken by non-const reference; whether it is
             modified cannot be told from the declaration.
    */
    void _createNeRules(std::string & namedEntitiesPath);

    /*! Creates the html tags rule from a file.
      Presumably initializes _htmlTags -- TODO confirm against the
      implementation.
      \param htmlTagsPath path to the file listing html tags
             (non-const reference, see note on _createNeRules).
    */
    void _createHtmlTagsRule(std::string & htmlTagsPath);

    /*! Builds one replacement rule from the entries of a file.
      \param filePath path to the file with the entries to match
      \param replacement text passed as the replacement -- presumably
             substituted for every matched entry, TODO confirm
      \param wholeWord defaults to false; presumably restricts matches
             to whole words when true -- TODO confirm
      \returns shared pointer to the created rule
    */
    boost::shared_ptr _getMultipleReplacementRule(
                             std::string & filePath,
                             std::string replacement,
                             bool wholeWord = false);

    /*! Collection of named entity rules
        (presumably built by _createNeRules). */
    std::vector _namedEntities;

    /*! Rule for html tags
        (presumably built by _createHtmlTagsRule). */
    boost::shared_ptr _htmlTags;

    /*! Flag for the optional stop words removal mentioned in the
        class description; presumably read from the config object --
        TODO confirm. */
    bool _stopWordsEnabled;

    /*! Rule for stop words; presumably only used when
        _stopWordsEnabled is true -- TODO confirm. */
    boost::shared_ptr _stopWords;

    /*! Rule for stop symbols (presumably the "special symbols" that
        the class description says are removed). */
    boost::shared_ptr _stopSymbols;

    /*! Rule for space symbols (presumably the predefined symbols that
        the class description says are substituted with a single
        space). */
    boost::shared_ptr _spaceSymbols;
};

#endif