57 lines
2.0 KiB
C++
57 lines
2.0 KiB
C++
#include "concordia/regex_rule.hpp"
|
|
#include <sstream>
|
|
#include <iostream>
|
|
#include <boost/exception/all.hpp>
|
|
#include <boost/throw_exception.hpp>
|
|
|
|
// Builds a rule from a regular expression and the value attached to each
// annotation the rule later produces.
//
// patternString  regular expression text; it is converted to an ICU
//                UnicodeString and compiled with boost::make_u32regex.
// value          stored in _value and used by apply() for every annotation.
// caseSensitive  when false, the pattern is compiled with boost::regex::icase.
//
// Throws ConcordiaException when compiling the pattern raises any
// std::exception. The message contains the pattern, the original what() text
// and, if the exception carries it, the my_tag_error_info string.
RegexRule::RegexRule(std::string patternString,
                     std::string value,
                     bool caseSensitive)
                     throw(ConcordiaException):
                     _value(value) {
    try {
        // Both branches compile the same text, so convert it only once.
        const UnicodeString unicodePattern(patternString.c_str());
        if (!caseSensitive) {
            _pattern = boost::make_u32regex(unicodePattern, boost::regex::icase);
        } else {
            _pattern = boost::make_u32regex(unicodePattern);
        }
    } catch(const std::exception & error) {
        std::stringstream message;
        message << "Bad regex pattern: " << patternString;
        message << " Detailed info: " << error.what();

        // Optional extra diagnostics attached through Boost.Exception;
        // the pointer is null when the exception carries none.
        std::string const * details =
            boost::get_error_info<my_tag_error_info>(error);
        if (details) {
            message << *details;
        }
        throw ConcordiaException(message.str());
    }
}
|
|
|
|
// The destructor body is intentionally empty; it performs no explicit cleanup.
RegexRule::~RegexRule() {}
|
|
|
|
// Finds every match of this rule's pattern in the sentence text and records
// each match as an annotation on the sentence.
//
// For each match a TokenAnnotation is built from the match start position,
// the position one past the match end (start + length), the type code 'a'
// (its meaning is defined by TokenAnnotation, which is not visible here) and
// this rule's _value. Positions are those reported by the const UChar* regex
// iterator. All annotations are collected first and handed over in a single
// addAnnotations() call.
//
// sentence  the sentence to annotate; must not be null.
//
// Throws ConcordiaException when sentence is null, or when any std::exception
// is raised while matching or while adding the annotations.
void RegexRule::apply(boost::shared_ptr<AnonymizedSentence> sentence) {
    // Dereferencing an empty boost::shared_ptr violates its precondition
    // (assertion failure or undefined behaviour). The check sits outside the
    // try block because the catch handler below dereferences sentence too.
    if (!sentence) {
        std::stringstream ss;
        ss << "Cannot apply regex rule: " << _value << " to a null sentence";
        throw ConcordiaException(ss.str());
    }

    try {
        UnicodeString s(sentence->getSentence().c_str());
        boost::u32regex_iterator<const UChar*> begin(boost::make_u32regex_iterator(s, _pattern));
        // A default-constructed iterator marks the end of the match sequence.
        boost::u32regex_iterator<const UChar*> end;
        std::vector<TokenAnnotation> annotations;
        for (; begin != end; ++begin) {
            SUFFIX_MARKER_TYPE matchBegin = begin->position();
            SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length();
            TokenAnnotation annotation(matchBegin, matchEnd, 'a', _value);
            annotations.push_back(annotation);
        }
        sentence->addAnnotations(annotations);
    } catch(const std::exception & e) {
        // Re-throw as ConcordiaException, adding the rule value and the text
        // being processed so the failure can be traced.
        std::stringstream ss;
        ss << "Exception while applying regex rule: "
           << _value << " to text: " << sentence->getSentence();
        ss << ", message: " << e.what();
        throw ConcordiaException(ss.str());
    }
}
|
|
|