#include "concordia/regex_rule.hpp"
// NOTE(review): the four directives below carry no header name in this copy
// of the file, and several template argument lists further down look
// stripped as well (boost::get_error_info, boost::shared_ptr,
// boost::u32regex_iterator, std::vector). Restore them from the original
// source before building.
#include
#include
#include
#include

/**
 * Builds a rule by compiling patternString into the _pattern member.
 *
 * @param patternString  regular expression text, compiled with
 *                       boost::make_u32regex
 * @param annotationType stored in _annotationType and later handed to every
 *                       TokenAnnotation created by apply()
 * @param value          stored in _value (not read anywhere else in this file)
 * @param caseSensitive  when false, the pattern is compiled with
 *                       boost::regex::icase
 * @throws ConcordiaException if compiling the pattern raises any
 *         std::exception; the message contains the pattern and e.what()
 */
RegexRule::RegexRule(std::string patternString,
                     char annotationType,
                     std::string value,
                     bool caseSensitive)
                         throw(ConcordiaException):
                         _annotationType(annotationType),
                         _value(value) {
    try {
        // NOTE(review): UnicodeString(const char*) presumably decodes using
        // ICU's default codepage - confirm that patternString arrives in
        // that encoding.
        if (caseSensitive) {
            _pattern = boost::make_u32regex(UnicodeString(patternString.c_str()));
        } else {
            _pattern = boost::make_u32regex(UnicodeString(patternString.c_str()),
                                            boost::regex::icase);
        }
    } catch(const std::exception & e) {
        // Re-throw as the project's exception type with a message that
        // names the offending pattern.
        std::stringstream ss;
        ss << "Bad regex pattern: " << patternString
           << " Detailed info: " << e.what();
        // Append extra diagnostic text when the exception carries it.
        // NOTE(review): boost::get_error_info normally needs an explicit
        // error-info tag as template argument; none is visible here.
        if (std::string const * extra = boost::get_error_info(e) ) {
            ss << *extra;
        }
        throw ConcordiaException(ss.str());
    }
}

/** Destructor; performs no explicit work. */
RegexRule::~RegexRule() {
}

/**
 * Finds every match of _pattern in the sentence text and registers one
 * annotation per match on the sentence.
 *
 * Each annotation is constructed from the match start offset, the offset
 * just past the match (position + length), _annotationType and an empty
 * string. All annotations are collected first and passed to
 * sentence->addAnnotations() in a single call.
 *
 * @param sentence object whose getSentence() text is searched and which
 *                 receives the annotations; it is dereferenced without a
 *                 null check
 * @throws ConcordiaException if any std::exception is raised while matching
 *         or adding annotations; the message contains the annotation type,
 *         the sentence text and e.what()
 */
void RegexRule::apply(boost::shared_ptr sentence) {
    try {
        UnicodeString s(sentence->getSentence().c_str());
        // A default-constructed iterator serves as the end-of-matches
        // sentinel for the loop below.
        boost::u32regex_iterator begin(boost::make_u32regex_iterator(s, _pattern));
        boost::u32regex_iterator end;
        std::vector annotations;
        for (; begin != end; ++begin) {
            // Offsets are relative to the UnicodeString s.
            // NOTE(review): presumably UTF-16 code units rather than bytes
            // of the original std::string - confirm against consumers.
            SUFFIX_MARKER_TYPE matchBegin = begin->position();
            SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length();
            // The fourth argument is always "", so _value is not used here.
            // NOTE(review): confirm this is intended.
            TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, "");
            annotations.push_back(annotation);
        }
        sentence->addAnnotations(annotations);
    } catch(const std::exception & e) {
        // Wrap any failure in the project's exception type, including the
        // rule's annotation type and the sentence text for context.
        std::stringstream ss;
        ss << "Exception while applying regex rule: " << _annotationType
           << " to text: " << sentence->getSentence();
        ss << ", message: " << e.what();
        throw ConcordiaException(ss.str());
    }
}