Lowercase input that is not valid UTF-8 (ASCII-only fallback) instead of throwing

This commit is contained in:
rjawor 2018-12-13 17:43:01 +01:00
parent 2eda92fe7a
commit 53b100b2e4
3 changed files with 26 additions and 1 deletions

View File

@ -2,6 +2,8 @@
#include <boost/algorithm/string.hpp> #include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/case_conv.hpp> #include <boost/algorithm/string/case_conv.hpp>
#include <boost/locale.hpp> #include <boost/locale.hpp>
#include <string>
#include <algorithm>
#include "utf8/utf8.h" #include "utf8/utf8.h"
@ -12,9 +14,18 @@ TextUtils::TextUtils() {
StringCaseConverterManager::getInstance().getUpperCaseConverter("pl"); StringCaseConverterManager::getInstance().getUpperCaseConverter("pl");
} }
/**
 * ASCII-only lower-casing of a single char.
 *
 * @param in the char to convert
 * @return the lower-case counterpart when in lies in 'A'..'Z',
 *         otherwise in unchanged
 */
char TextUtils::_easytolower(char in) {
    const bool isAsciiUpper = ('A' <= in) && (in <= 'Z');
    if (!isAsciiUpper) {
        return in;
    }
    // Distance between the upper- and lower-case letter ranges.
    return in + ('z' - 'Z');
}
std::string TextUtils::toLowerCase(const std::string & text) { std::string TextUtils::toLowerCase(const std::string & text) {
if (!utf8::is_valid(text.begin(), text.end())) { if (!utf8::is_valid(text.begin(), text.end())) {
throw ConcordiaException("Bad input encoding, use UTF-8"); std::string data = text;
std::transform(data.begin(), data.end(), data.begin(), TextUtils::_easytolower);
return data;
} }
return simpleConvert(*_lowerConverter, text); return simpleConvert(*_lowerConverter, text);
} }

View File

@ -41,6 +41,8 @@ private:
void operator=(TextUtils const&); // Don't implement void operator=(TextUtils const&); // Don't implement
// ASCII-only lower-casing of a single char: maps 'A'..'Z' to 'a'..'z'
// and returns any other char unchanged.
static char _easytolower(char in);
boost::shared_ptr<StringGeneralCaseConverter> _lowerConverter; boost::shared_ptr<StringGeneralCaseConverter> _lowerConverter;
boost::shared_ptr<StringGeneralCaseConverter> _upperConverter; boost::shared_ptr<StringGeneralCaseConverter> _upperConverter;

View File

@ -571,4 +571,16 @@ BOOST_AUTO_TEST_CASE( TokenizeOnly )
} }
// Tokenizes a sentence that contains a malformed UTF-8 sequence and checks
// that a lower-cased, tokenized result is still produced.
BOOST_AUTO_TEST_CASE( TokenizeBadUtf )
{
Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
// "\xc3\x28" is not valid UTF-8: 0xC3 opens a two-byte sequence, but 0x28
// is not a continuation byte.
// NOTE(review): presumably this exercises the ASCII-only fallback in
// TextUtils::toLowerCase - confirm against the tokenizer implementation.
TokenizedSentence ts = concordia.tokenize("Ala \xc3\x28 23 --- ..//,./ '''8902347 poSiaDa KOTA", false, false);
BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "ala ne_number ne_number posiada kota");
// Presumably cleanup so the index does not leak into other test cases - TODO confirm.
concordia.clearIndex();
}
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()