concordia-library/concordia/regex_rule.cpp

79 lines
2.9 KiB
C++
Raw Normal View History

2015-06-22 13:52:56 +02:00
#include "concordia/regex_rule.hpp"
#include <sstream>
#include <iostream>
#include <boost/exception/all.hpp>
#include <boost/throw_exception.hpp>
2017-05-05 12:58:32 +02:00
#include <boost/algorithm/string.hpp>
2015-06-22 13:52:56 +02:00
RegexRule::RegexRule(std::string patternString,
2015-06-25 20:49:22 +02:00
int annotationType,
2015-06-25 10:12:51 +02:00
std::string value,
bool caseSensitive)
throw(ConcordiaException):
_annotationType(annotationType),
_value(value) {
2015-06-22 13:52:56 +02:00
try {
if (caseSensitive) {
2015-06-27 12:40:24 +02:00
_pattern = boost::make_u32regex(
UnicodeString(patternString.c_str()));
2015-06-22 13:52:56 +02:00
} else {
2015-06-27 12:40:24 +02:00
_pattern = boost::make_u32regex(
UnicodeString(patternString.c_str()), boost::regex::icase);
2015-06-22 13:52:56 +02:00
}
} catch(const std::exception & e) {
std::stringstream ss;
ss << "Bad regex pattern: " << patternString <<
" Detailed info: " << e.what();
if (std::string const * extra =
boost::get_error_info<my_tag_error_info>(e) ) {
ss << *extra;
}
throw ConcordiaException(ss.str());
}
}
RegexRule::~RegexRule() {
}
2015-08-19 20:49:26 +02:00
void RegexRule::apply(TokenizedSentence & sentence) {
2015-06-22 13:52:56 +02:00
try {
2015-08-19 20:49:26 +02:00
UnicodeString s(sentence.getSentence().c_str());
2015-06-27 12:40:24 +02:00
boost::u32regex_iterator<const UChar*> begin(
boost::make_u32regex_iterator(s, _pattern));
2015-06-22 13:52:56 +02:00
boost::u32regex_iterator<const UChar*> end;
std::vector<TokenAnnotation> annotations;
for (; begin != end; ++begin) {
SUFFIX_MARKER_TYPE matchBegin = begin->position();
SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length();
2015-06-25 20:49:22 +02:00
std::string value;
if (_annotationType == TokenAnnotation::WORD) {
UnicodeString unicodeValue;
2015-06-27 12:40:24 +02:00
s.extract(begin->position(), begin->length(), unicodeValue);
2015-06-25 20:49:22 +02:00
unicodeValue.toUTF8String(value);
} else {
value = _value;
}
2017-05-05 12:58:32 +02:00
boost::trim(value);
2015-06-27 12:40:24 +02:00
TokenAnnotation annotation(matchBegin, matchEnd,
_annotationType, value);
2015-06-22 13:52:56 +02:00
annotations.push_back(annotation);
2017-05-05 12:58:32 +02:00
/*
std::cout << "found annotation. Rule type: " << _annotationType << std::endl;
std::cout << "value: " << value << std::endl;
std::cout << "matchBegin: " << matchBegin << std::endl;
std::cout << "matchEnd: " << matchEnd << std::endl;
*/
2015-06-22 13:52:56 +02:00
}
2015-08-19 20:49:26 +02:00
sentence.addAnnotations(annotations);
2015-06-22 13:52:56 +02:00
} catch(const std::exception & e) {
std::stringstream ss;
ss << "Exception while applying regex rule: "
2015-06-27 12:40:24 +02:00
<< _annotationType << " to text: "
2015-08-19 20:49:26 +02:00
<< sentence.getSentence();
2015-06-22 13:52:56 +02:00
ss << ", message: " << e.what();
throw ConcordiaException(ss.str());
}
}