concordia-library/concordia/regex_rule.cpp

67 lines
2.4 KiB
C++
Raw Normal View History

2015-06-22 13:52:56 +02:00
#include "concordia/regex_rule.hpp"
#include <sstream>
#include <iostream>
#include <boost/exception/all.hpp>
#include <boost/throw_exception.hpp>
/**
 * Builds a rule by compiling a regular expression pattern.
 *
 * The pattern is compiled into a Unicode-aware Boost regex and stored in
 * _pattern; the annotation type and value are stored for later use by
 * apply().
 *
 * \param patternString regular expression source, interpreted as UTF-8
 * \param annotationType annotation type attached to every match
 * \param value annotation value stored for later use by apply()
 * \param caseSensitive when false, the pattern is compiled with
 *        boost::regex::icase
 * \throws ConcordiaException if the pattern cannot be compiled; the message
 *         contains the pattern and the underlying error description
 */
RegexRule::RegexRule(std::string patternString,
                     int annotationType,
                     std::string value,
                     bool caseSensitive)
                                     throw(ConcordiaException):
                                _annotationType(annotationType),
                                _value(value) {
    try {
        // Decode explicitly as UTF-8. UnicodeString(const char *) would use
        // the platform default codepage, which mangles non-ASCII patterns
        // when the process locale is not UTF-8. apply() emits matched text
        // with toUTF8String(), so UTF-8 is the encoding used on both sides.
        const UnicodeString unicodePattern =
            UnicodeString::fromUTF8(patternString.c_str());
        if (caseSensitive) {
            _pattern = boost::make_u32regex(unicodePattern);
        } else {
            _pattern = boost::make_u32regex(unicodePattern,
                                            boost::regex::icase);
        }
    } catch(const std::exception & e) {
        // Wrap any compilation failure in the library's own exception type.
        std::stringstream ss;
        ss << "Bad regex pattern: " << patternString <<
            " Detailed info: " << e.what();
        // Append extra Boost.Exception error info when the exception
        // carries it.
        if (std::string const * extra =
            boost::get_error_info<my_tag_error_info>(e) ) {
            ss << *extra;
        }
        throw ConcordiaException(ss.str());
    }
}
/**
 * Destructor. Performs no explicit cleanup.
 */
RegexRule::~RegexRule() {}
2015-06-25 10:12:51 +02:00
/**
 * Applies the rule to a sentence: every match of the pattern in the sentence
 * text is turned into a TokenAnnotation and the collected annotations are
 * added to the sentence.
 *
 * For rules of type TokenAnnotation::WORD the annotation value is the matched
 * text itself (as UTF-8); for every other type it is the value given at
 * construction.
 *
 * \param sentence sentence to search and annotate
 * \throws ConcordiaException if any std::exception is raised while matching
 *         or adding the annotations
 */
void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
    try {
        // Decode explicitly as UTF-8. UnicodeString(const char *) would use
        // the platform default codepage, which mangles non-ASCII text when
        // the process locale is not UTF-8 and disagrees with the
        // toUTF8String() call below.
        const UnicodeString s =
            UnicodeString::fromUTF8(sentence->getSentence().c_str());
        boost::u32regex_iterator<const UChar*> begin(
            boost::make_u32regex_iterator(s, _pattern));
        boost::u32regex_iterator<const UChar*> end;
        std::vector<TokenAnnotation> annotations;
        for (; begin != end; ++begin) {
            // Offsets are expressed in UChar (UTF-16 code unit) positions
            // of s, because the iterator walks const UChar*.
            SUFFIX_MARKER_TYPE matchBegin = begin->position();
            SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length();

            std::string value;
            if (_annotationType == TokenAnnotation::WORD) {
                // Word annotations carry the matched text itself.
                UnicodeString unicodeValue;
                s.extract(begin->position(), begin->length(), unicodeValue);
                unicodeValue.toUTF8String(value);
            } else {
                value = _value;
            }
            TokenAnnotation annotation(matchBegin, matchEnd,
                                       _annotationType, value);
            annotations.push_back(annotation);
        }
        sentence->addAnnotations(annotations);
    } catch(const std::exception & e) {
        // Wrap the failure in the library's own exception type, recording
        // which rule type and which text were being processed.
        std::stringstream ss;
        ss << "Exception while applying regex rule: "
           << _annotationType << " to text: " << sentence->getSentence();
        ss << ", message: " << e.what();
        throw ConcordiaException(ss.str());
    }
}