#ifndef SENTENCE_ANONYMIZER_HDR
#define SENTENCE_ANONYMIZER_HDR

#include <string>
#include <vector>
#include "concordia/common/config.hpp"
#include "concordia/regex_replacement.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp"
#include <boost/shared_ptr.hpp>
#include <boost/filesystem.hpp>

/*!
  Class for anonymizing a sentence before generating its hash.
  This operation is used to remove unnecessary symbols and possibly
  words from sentences added to the index and from search patterns.
  The anonymizer removes html tags, substitutes predefined symbols
  with a single space, removes stop words (if the option is enabled),
  as well as named entities and special symbols. All of these have to
  be listed in files (see \ref tutorial3).
*/
class SentenceAnonymizer {
public:
    /*! Constructor.
      \param config config object, holding paths to necessary files
      \throws ConcordiaException the only exception type permitted by the
              declaration; the exact failure conditions are defined in the
              implementation (presumably unreadable or malformed rule
              files - TODO confirm)

      NOTE(review): dynamic exception specifications (throw(T)) were
      deprecated in C++11 and removed in C++17, so this declaration only
      compiles in pre-C++17 modes. The specifier must stay in sync with
      the out-of-line definition, so it cannot be dropped here alone.
    */
    explicit SentenceAnonymizer(boost::shared_ptr<ConcordiaConfig> config)
                                                  throw(ConcordiaException);

    /*! Destructor.
    */
    virtual ~SentenceAnonymizer();

    /*! Anonymizes the sentence.
      \param sentence input sentence
      \returns altered version of the input sentence
    */
    std::string anonymize(const std::string & sentence);

private:
    /*! Helper for setting up the named entity rules.
      Presumably fills _namedEntities from the given file - confirm
      against the implementation.
      \param namedEntitiesPath path to the named entities file
    */
    void _createNeRules(std::string & namedEntitiesPath);

    /*! Helper for setting up the html tags rule.
      Presumably initializes _htmlTags from the given file - confirm
      against the implementation.
      \param htmlTagsPath path to the html tags file
    */
    void _createHtmlTagsRule(std::string & htmlTagsPath);

    /*! Helper returning a single replacement rule built for a file.
      How the file contents are combined into one rule is defined in
      the implementation, which is not visible from this header.
      \param filePath path to the file the rule is built for
      \param replacement presumably the text substituted for every
             match - TODO confirm
      \param wholeWord defaults to false; presumably restricts matching
             to whole words when true - TODO confirm
      \returns shared pointer to the resulting replacement rule
    */
    boost::shared_ptr<RegexReplacement> _getMultipleReplacementRule(
                                            std::string & filePath,
                                            std::string replacement,
                                            bool wholeWord = false);

    /*! Replacement rules for named entities, stored by value. */
    std::vector<RegexReplacement> _namedEntities;

    /*! Replacement rule for html tags. */
    boost::shared_ptr<RegexReplacement> _htmlTags;

    /*! Stop words switch; presumably mirrors the "stop words enabled"
        option mentioned in the class description - confirm where it
        is set. */
    bool _stopWordsEnabled;

    /*! Replacement rule for stop words. */
    boost::shared_ptr<RegexReplacement> _stopWords;

    /*! Replacement rule for stop symbols. */
    boost::shared_ptr<RegexReplacement> _stopSymbols;

    /*! Replacement rule for space symbols; presumably the "predefined
        symbols substituted with a single space" from the class
        description - TODO confirm. */
    boost::shared_ptr<RegexReplacement> _spaceSymbols;
};

#endif  // SENTENCE_ANONYMIZER_HDR