concordia-library/concordia/regex_rule.cpp
2017-05-05 12:58:32 +02:00

79 lines
2.9 KiB
C++

#include "concordia/regex_rule.hpp"
#include <sstream>
#include <iostream>
#include <boost/exception/all.hpp>
#include <boost/throw_exception.hpp>
#include <boost/algorithm/string.hpp>
// Builds a rule that annotates every match of a regular expression.
//
// patternString  - regular expression source, compiled with
//                  boost::make_u32regex.
// annotationType - stored as the type of the annotations this rule emits.
// value          - stored as the rule's fixed annotation value.
// caseSensitive  - when false, the pattern is compiled with
//                  boost::regex::icase.
//
// Throws ConcordiaException if compiling the pattern raises any
// std::exception; the message carries the offending pattern, the original
// what() text and, if attached, the my_tag_error_info payload.
RegexRule::RegexRule(std::string patternString,
                     int annotationType,
                     std::string value,
                     bool caseSensitive)
                                 throw(ConcordiaException):
    _annotationType(annotationType),
    _value(value) {
    try {
        // NOTE(review): UnicodeString(const char *) converts with ICU's
        // default codepage, which is not necessarily UTF-8 - confirm this
        // matches the encoding callers use for patternString.
        const UnicodeString unicodePattern(patternString.c_str());
        _pattern = caseSensitive
            ? boost::make_u32regex(unicodePattern)
            : boost::make_u32regex(unicodePattern, boost::regex::icase);
    } catch(const std::exception & e) {
        std::ostringstream message;
        message << "Bad regex pattern: " << patternString
                << " Detailed info: " << e.what();
        // Append the extra diagnostic string when the exception carries one.
        std::string const * extraInfo =
            boost::get_error_info<my_tag_error_info>(e);
        if (extraInfo) {
            message << *extraInfo;
        }
        throw ConcordiaException(message.str());
    }
}
// Destructor; the body is intentionally empty.
RegexRule::~RegexRule() {}
// Runs this rule's pattern over the sentence text and registers one
// TokenAnnotation per match via sentence.addAnnotations().
//
// For rules of type TokenAnnotation::WORD the annotation value is the
// matched text itself (converted to UTF-8); for every other type it is the
// rule's fixed value. In both cases the value is whitespace-trimmed.
// Match boundaries are taken from the regex iterator's position()/length()
// on the UnicodeString built from the sentence.
//
// Any std::exception raised along the way is rethrown as a
// ConcordiaException describing the rule type and the sentence text.
void RegexRule::apply(TokenizedSentence & sentence) {
    try {
        // NOTE(review): UnicodeString(const char *) converts with ICU's
        // default codepage, which is not necessarily UTF-8 - confirm this
        // matches the encoding of sentence.getSentence().
        const UnicodeString text(sentence.getSentence().c_str());
        const bool isWordRule = (_annotationType == TokenAnnotation::WORD);

        std::vector<TokenAnnotation> found;
        boost::u32regex_iterator<const UChar*> noMoreMatches;
        for (boost::u32regex_iterator<const UChar*> match(
                 boost::make_u32regex_iterator(text, _pattern));
             match != noMoreMatches;
             ++match) {
            SUFFIX_MARKER_TYPE matchBegin = match->position();
            SUFFIX_MARKER_TYPE matchEnd = matchBegin + match->length();

            std::string annotationValue;
            if (isWordRule) {
                // Word annotations carry the exact text that was matched.
                UnicodeString matchedText;
                text.extract(match->position(), match->length(), matchedText);
                matchedText.toUTF8String(annotationValue);
            } else {
                annotationValue = _value;
            }
            boost::trim(annotationValue);

            found.push_back(TokenAnnotation(matchBegin, matchEnd,
                                            _annotationType, annotationValue));
        }
        sentence.addAnnotations(found);
    } catch(const std::exception & e) {
        std::ostringstream message;
        message << "Exception while applying regex rule: "
                << _annotationType << " to text: "
                << sentence.getSentence()
                << ", message: " << e.what();
        throw ConcordiaException(message.str());
    }
}