78 lines
2.9 KiB
C++
78 lines
2.9 KiB
C++
#include "concordia/regex_rule.hpp"
|
|
#include <sstream>
|
|
#include <iostream>
|
|
#include <boost/exception/all.hpp>
|
|
#include <boost/throw_exception.hpp>
|
|
#include <boost/algorithm/string.hpp>
|
|
|
|
/**
 * Creates a rule by compiling a regular expression given as text.
 *
 * @param patternString  source of the regular expression; interpreted as
 *                       UTF-8 text.
 * @param annotationType stored in the rule and attached to every annotation
 *                       produced by apply().
 * @param value          stored in the rule; apply() uses it as the annotation
 *                       value for every annotation type other than
 *                       TokenAnnotation::WORD.
 * @param caseSensitive  when false the pattern is compiled with
 *                       boost::regex::icase.
 * @throws ConcordiaException when compiling the pattern raises any
 *                       std::exception; the message contains the offending
 *                       pattern and the original what() text.
 */
RegexRule::RegexRule(std::string patternString,
                     int annotationType,
                     std::string value,
                     bool caseSensitive):
    _annotationType(annotationType),
    _value(value) {
    try {
        // Decode explicitly as UTF-8. The UnicodeString(const char *)
        // constructor goes through ICU's default converter, so its result
        // depends on the platform/locale, and it also stops at the first
        // embedded NUL byte.
        const UnicodeString pattern = UnicodeString::fromUTF8(patternString);

        if (caseSensitive) {
            _pattern = boost::make_u32regex(pattern);
        } else {
            _pattern = boost::make_u32regex(pattern, boost::regex::icase);
        }
    } catch (const std::exception & e) {
        std::stringstream ss;

        ss << "Bad regex pattern: " << patternString <<
            " Detailed info: " << e.what();

        // Append the additional boost error info when the exception
        // carries it.
        if (std::string const * extra =
                boost::get_error_info<my_tag_error_info>(e)) {
            ss << *extra;
        }
        throw ConcordiaException(ss.str());
    }
}
|
|
|
|
/**
 * Destructor. Performs no explicit work of its own.
 */
RegexRule::~RegexRule() {}
|
|
|
|
/**
 * Runs this rule's pattern over the sentence text and registers one
 * TokenAnnotation per match.
 *
 * Each annotation spans [match position, match position + match length),
 * measured on the UnicodeString built from the sentence text. For rules of
 * type TokenAnnotation::WORD the annotation value is the matched text
 * converted to UTF-8; for all other types it is the value given at
 * construction. In both cases the value is whitespace-trimmed with
 * boost::trim. All annotations are handed to the sentence in a single
 * addAnnotations() call after the search completes.
 *
 * @param sentence sentence whose text is searched and which receives the
 *                 annotations.
 * @throws ConcordiaException when any std::exception is raised while
 *                 matching or adding annotations; the message contains the
 *                 annotation type, the sentence text and the original what().
 */
void RegexRule::apply(TokenizedSentence & sentence) {
    try {
        // Decode explicitly as UTF-8. The UnicodeString(const char *)
        // constructor relies on ICU's default converter, so match offsets
        // and extracted values would depend on the platform/locale.
        const UnicodeString s =
            UnicodeString::fromUTF8(sentence.getSentence().c_str());

        boost::u32regex_iterator<const UChar*> match(
            boost::make_u32regex_iterator(s, _pattern));
        const boost::u32regex_iterator<const UChar*> end;

        std::vector<TokenAnnotation> annotations;
        for (; match != end; ++match) {
            const SUFFIX_MARKER_TYPE matchBegin = match->position();
            const SUFFIX_MARKER_TYPE matchEnd = matchBegin + match->length();

            std::string value;
            if (_annotationType == TokenAnnotation::WORD) {
                // Word annotations carry the matched text itself.
                UnicodeString unicodeValue;
                s.extract(match->position(), match->length(), unicodeValue);
                unicodeValue.toUTF8String(value);
            } else {
                value = _value;
            }
            boost::trim(value);

            TokenAnnotation annotation(matchBegin, matchEnd,
                                       _annotationType, value);
            annotations.push_back(annotation);
        }
        sentence.addAnnotations(annotations);
    } catch (const std::exception & e) {
        std::stringstream ss;
        ss << "Exception while applying regex rule: "
           << _annotationType << " to text: "
           << sentence.getSentence();
        ss << ", message: " << e.what();
        throw ConcordiaException(ss.str());
    }
}
|