2014-04-13 12:21:30 +02:00
|
|
|
#ifndef SENTENCE_ANONYMIZER_HDR
|
|
|
|
#define SENTENCE_ANONYMIZER_HDR
|
|
|
|
|
|
|
|
#include <string>
|
2015-04-15 10:55:26 +02:00
|
|
|
#include <vector>
|
2014-04-13 12:21:30 +02:00
|
|
|
#include "concordia/common/config.hpp"
|
2014-04-24 08:36:48 +02:00
|
|
|
#include "concordia/regex_replacement.hpp"
|
2014-04-13 12:21:30 +02:00
|
|
|
#include "concordia/concordia_config.hpp"
|
|
|
|
#include "concordia/concordia_exception.hpp"
|
|
|
|
#include <boost/shared_ptr.hpp>
|
2014-04-29 14:46:04 +02:00
|
|
|
#include <boost/filesystem.hpp>
|
2014-04-13 12:21:30 +02:00
|
|
|
|
|
|
|
|
|
|
|
/*!
  Class for anonymizing a sentence before adding it to the index.
*/
|
|
|
|
|
|
|
|
// NOTE(review): a using-directive at header scope brings all of namespace std
// into every translation unit that includes this header. It is left in place
// because code including this header may depend on it -- TODO confirm before
// removing.
using namespace std;
|
|
|
|
|
|
|
|
class SentenceAnonymizer {
|
|
|
|
public:
|
|
|
|
explicit SentenceAnonymizer(boost::shared_ptr<ConcordiaConfig> config)
|
|
|
|
throw(ConcordiaException);
|
|
|
|
|
|
|
|
/*! Destructor.
|
|
|
|
*/
|
|
|
|
virtual ~SentenceAnonymizer();
|
2014-04-29 14:46:04 +02:00
|
|
|
|
2014-04-13 12:21:30 +02:00
|
|
|
string anonymize(const string & sentence);
|
|
|
|
|
|
|
|
private:
|
2014-04-29 14:46:04 +02:00
|
|
|
void _createNeRules(string & namedEntitiesPath);
|
|
|
|
|
|
|
|
void _createHtmlTagsRule(string & htmlTagsPath);
|
|
|
|
|
|
|
|
boost::shared_ptr<RegexReplacement> _getMultipleReplacementRule(
|
|
|
|
string & filePath,
|
|
|
|
string replacement,
|
|
|
|
bool wholeWord = false);
|
2014-04-24 08:36:48 +02:00
|
|
|
|
2015-04-15 10:55:26 +02:00
|
|
|
vector<RegexReplacement> _namedEntities;
|
2014-04-24 08:36:48 +02:00
|
|
|
|
2014-04-29 14:46:04 +02:00
|
|
|
boost::shared_ptr<RegexReplacement> _htmlTags;
|
|
|
|
|
2014-04-24 08:36:48 +02:00
|
|
|
boost::shared_ptr<RegexReplacement> _stopWords;
|
2014-04-29 14:46:04 +02:00
|
|
|
|
2014-04-24 08:36:48 +02:00
|
|
|
boost::shared_ptr<RegexReplacement> _stopSymbols;
|
2014-04-29 14:46:04 +02:00
|
|
|
|
|
|
|
boost::shared_ptr<RegexReplacement> _spaceSymbols;
|
2014-04-13 12:21:30 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
#endif
|