#include "concordia/regex_rule.hpp"

// NOTE(review): the targets of the #include directives that follow the
// project header were not legible in the reviewed text. The list below is
// reconstructed from the names this file actually uses -- confirm it against
// the original before merging.
#include <sstream>
#include <string>
#include <vector>

#include <boost/algorithm/string.hpp>
#include <boost/exception/all.hpp>

// Builds a rule from a regular expression given as UTF-8 text.
//
// patternString  - the regular expression, compiled into _pattern with
//                  boost::make_u32regex.
// annotationType - stored and passed to every TokenAnnotation the rule creates.
// value          - stored and used as the annotation value for every
//                  annotation type other than TokenAnnotation::WORD.
// caseSensitive  - when false the pattern is compiled with boost::regex::icase.
//
// Throws ConcordiaException when compiling the pattern raises a
// std::exception; the message carries the pattern and the original what().
//
// The dynamic exception specification is left as-is; presumably it mirrors
// the declaration in regex_rule.hpp (not visible here), so changing it only
// in this file would break the build.
RegexRule::RegexRule(std::string patternString,
                     int annotationType,
                     std::string value,
                     bool caseSensitive)
                                     throw(ConcordiaException):
    _annotationType(annotationType),
    _value(value) {
    try {
        // Decode explicitly as UTF-8. UnicodeString(const char *) goes through
        // ICU's default converter, which depends on the process locale, while
        // apply() writes matched text back out with toUTF8String().
        UnicodeString pattern =
            UnicodeString::fromUTF8(patternString.c_str());
        if (caseSensitive) {
            _pattern = boost::make_u32regex(pattern);
        } else {
            _pattern = boost::make_u32regex(pattern, boost::regex::icase);
        }
    } catch(const std::exception & e) {
        std::stringstream ss;
        ss << "Bad regex pattern: " << patternString
           << " Detailed info: " << e.what();
        // NOTE(review): the template argument of get_error_info was not
        // legible in the reviewed text. my_tag_error_info is assumed to be the
        // std::string-valued boost::error_info typedef from regex_rule.hpp --
        // confirm against the original.
        if (std::string const * extra =
                boost::get_error_info<my_tag_error_info>(e) ) {
            ss << *extra;
        }
        throw ConcordiaException(ss.str());
    }
}

// Nothing to release explicitly.
RegexRule::~RegexRule() {
}

// Finds every match of the rule's pattern in the sentence text and adds one
// TokenAnnotation per match to the sentence.
//
// Match offsets are indices into the UnicodeString built from the sentence
// (the same indices that are handed to UnicodeString::extract below), not byte
// offsets into the UTF-8 std::string.
//
// For TokenAnnotation::WORD the annotation value is the matched text converted
// to UTF-8; for every other type it is the value given at construction. In
// both cases the value is whitespace-trimmed with boost::trim.
//
// Throws ConcordiaException when anything inside raises a std::exception.
void RegexRule::apply(TokenizedSentence & sentence) {
    try {
        // Same explicit UTF-8 decoding as in the constructor, so pattern and
        // text are interpreted consistently regardless of the process locale.
        UnicodeString s =
            UnicodeString::fromUTF8(sentence.getSentence().c_str());
        boost::u32regex_iterator<const UChar*> begin(
            boost::make_u32regex_iterator(s, _pattern));
        boost::u32regex_iterator<const UChar*> end;
        std::vector<TokenAnnotation> annotations;
        for (; begin != end; ++begin) {
            SUFFIX_MARKER_TYPE matchBegin = begin->position();
            SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length();
            std::string value;
            if (_annotationType == TokenAnnotation::WORD) {
                // Word annotations carry the text that was actually matched.
                UnicodeString unicodeValue;
                s.extract(begin->position(), begin->length(), unicodeValue);
                unicodeValue.toUTF8String(value);
            } else {
                value = _value;
            }
            boost::trim(value);
            TokenAnnotation annotation(matchBegin, matchEnd,
                                       _annotationType, value);
            annotations.push_back(annotation);
        }
        sentence.addAnnotations(annotations);
    } catch(const std::exception & e) {
        std::stringstream ss;
        ss << "Exception while applying regex rule: "
           << _annotationType << " to text: " << sentence.getSentence();
        ss << ", message: " << e.what();
        throw ConcordiaException(ss.str());
    }
}