From 53b100b2e4cf60f96ba352cd5329a3837047cc59 Mon Sep 17 00:00:00 2001
From: rjawor
Date: Thu, 13 Dec 2018 17:43:01 +0100
Subject: [PATCH] lowercasing bad utf

---
Review notes (not part of the commit message; this free-form area sits
between the "---" line and the first "diff --git" header):

  * TextUtils::toLowerCase() no longer throws
    ConcordiaException("Bad input encoding, use UTF-8") when
    utf8::is_valid() rejects the input. It now copies the text, runs
    TextUtils::_easytolower over every char with std::transform and
    returns the copy. Input accepted by utf8::is_valid() still goes
    through simpleConvert(*_lowerConverter, text).
  * TextUtils::_easytolower() only changes chars in 'A'..'Z'; the
    expression in - ('Z' - 'z') equals in + 32 in ASCII. Every other
    char, including every non-ASCII byte, is returned unchanged, so
    non-ASCII letters are not lowercased on the fallback path.
  * The TokenizeBadUtf test passes a string containing "\xc3\x28" (a UTF-8
    two-byte lead byte followed by a byte that is not a continuation
    byte) to Concordia::tokenize and expects
    "ala ne_number ne_number posiada kota".
  * NOTE(review): this copy of the patch had lost its line structure.
    Line breaks, indentation and blank context lines were rebuilt from
    the hunk line counts and the diffstat; confirm them against commit
    53b100b before applying. The "+" marker on the second Concordia
    constructor argument line was restored so that the last hunk matches
    its "+571,16" count.
  * NOTE(review): the #include targets, the boost::shared_ptr template
    arguments and the author e-mail appear to be missing from this copy
    (presumably angle-bracket content was stripped); restore them from
    the original commit.

 concordia/common/text_utils.cpp | 13 ++++++++++++-
 concordia/common/text_utils.hpp |  2 ++
 concordia/t/test_concordia.cpp  | 12 ++++++++++++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/concordia/common/text_utils.cpp b/concordia/common/text_utils.cpp
index 91049de..902c392 100644
--- a/concordia/common/text_utils.cpp
+++ b/concordia/common/text_utils.cpp
@@ -2,6 +2,8 @@
 #include
 #include
 #include
+#include
+#include
 
 #include "utf8/utf8.h"
 
@@ -12,9 +14,18 @@ TextUtils::TextUtils() {
         StringCaseConverterManager::getInstance().getUpperCaseConverter("pl");
 }
 
+char TextUtils::_easytolower(char in) {
+    if(in <= 'Z' && in >= 'A') {
+        return in - ('Z' - 'z');
+    }
+    return in;
+}
+
 std::string TextUtils::toLowerCase(const std::string & text) {
     if (!utf8::is_valid(text.begin(), text.end())) {
-        throw ConcordiaException("Bad input encoding, use UTF-8");
+        std::string data = text;
+        std::transform(data.begin(), data.end(), data.begin(), TextUtils::_easytolower);
+        return data;
     }
     return simpleConvert(*_lowerConverter, text);
 }
diff --git a/concordia/common/text_utils.hpp b/concordia/common/text_utils.hpp
index de83d1e..5c4f92a 100644
--- a/concordia/common/text_utils.hpp
+++ b/concordia/common/text_utils.hpp
@@ -41,6 +41,8 @@ private:
 
     void operator=(TextUtils const&);  // Don't implement
 
+    static char _easytolower(char in);
+
     boost::shared_ptr _lowerConverter;
 
     boost::shared_ptr _upperConverter;
diff --git a/concordia/t/test_concordia.cpp b/concordia/t/test_concordia.cpp
index f0c008a..2d5063c 100644
--- a/concordia/t/test_concordia.cpp
+++ b/concordia/t/test_concordia.cpp
@@ -571,4 +571,16 @@ BOOST_AUTO_TEST_CASE( TokenizeOnly )
 
 }
 
+BOOST_AUTO_TEST_CASE( TokenizeBadUtf )
+{
+    Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
+                                    TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
+    TokenizedSentence ts = concordia.tokenize("Ala \xc3\x28 23 --- ..//,./ '''8902347 poSiaDa KOTA", false, false);
+
+    BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "ala ne_number ne_number posiada kota");
+
+    concordia.clearIndex();
+
+}
+
 BOOST_AUTO_TEST_SUITE_END()