concordia-library/concordia/regex_rule.cpp
2015-06-25 10:12:51 +02:00

59 lines
2.1 KiB
C++

#include "concordia/regex_rule.hpp"
#include <sstream>
#include <iostream>
#include <boost/exception/all.hpp>
#include <boost/throw_exception.hpp>
// Constructs a rule from a regular expression.
//
// patternString  - the regular expression source; it is converted to a
//                  UnicodeString and compiled with boost::make_u32regex.
// annotationType - stored and later attached to every annotation this
//                  rule produces.
// value          - stored in _value.
// caseSensitive  - when false, the pattern is compiled with
//                  boost::regex::icase.
//
// Throws ConcordiaException when compiling the pattern raises any
// std::exception; the message carries the pattern, e.what() and, if
// present, the my_tag_error_info payload attached to the exception.
RegexRule::RegexRule(std::string patternString,
                     char annotationType,
                     std::string value,
                     bool caseSensitive)
                                         throw(ConcordiaException):
    _annotationType(annotationType),
    _value(value) {
    try {
        // Convert once; both compilation paths use the same source.
        const UnicodeString unicodePattern(patternString.c_str());
        if (!caseSensitive) {
            _pattern = boost::make_u32regex(unicodePattern,
                                            boost::regex::icase);
        } else {
            _pattern = boost::make_u32regex(unicodePattern);
        }
    } catch(const std::exception & e) {
        std::stringstream message;
        message << "Bad regex pattern: " << patternString;
        message << " Detailed info: " << e.what();
        // Append the extra boost error info only when the exception
        // actually carries it (get_error_info returns null otherwise).
        std::string const * detail =
            boost::get_error_info<my_tag_error_info>(e);
        if (detail) {
            message << *detail;
        }
        throw ConcordiaException(message.str());
    }
}
// Destructor: the body is intentionally empty, no explicit cleanup is
// performed here.
RegexRule::~RegexRule() {}
// Runs this rule's pattern over the sentence text and registers one
// annotation per match.
//
// The sentence text is converted to a UnicodeString and scanned with a
// u32regex_iterator. For each match an annotation spanning
// [position, position + length) is created with this rule's annotation
// type; all annotations are handed to the sentence in a single
// addAnnotations call after the scan finishes.
//
// Throws ConcordiaException if any std::exception is raised while
// matching or while adding the annotations.
//
// NOTE(review): annotations are created with an empty value string; the
// stored _value is not used in this method - confirm this is intended.
void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
    try {
        UnicodeString text(sentence->getSentence().c_str());
        boost::u32regex_iterator<const UChar*> match(
            boost::make_u32regex_iterator(text, _pattern));
        // A default-constructed iterator marks the end of the matches.
        boost::u32regex_iterator<const UChar*> noMoreMatches;

        std::vector<TokenAnnotation> found;
        while (match != noMoreMatches) {
            SUFFIX_MARKER_TYPE startOffset = match->position();
            SUFFIX_MARKER_TYPE endOffset = startOffset + match->length();
            found.push_back(TokenAnnotation(startOffset,
                                            endOffset,
                                            _annotationType,
                                            ""));
            ++match;
        }
        sentence->addAnnotations(found);
    } catch(const std::exception & e) {
        std::stringstream message;
        message << "Exception while applying regex rule: "
                << _annotationType
                << " to text: " << sentence->getSentence()
                << ", message: " << e.what();
        throw ConcordiaException(message.str());
    }
}