lowercasing bad utf
This commit is contained in:
parent
2eda92fe7a
commit
53b100b2e4
@ -2,6 +2,8 @@
|
|||||||
#include <boost/algorithm/string.hpp>
|
#include <boost/algorithm/string.hpp>
|
||||||
#include <boost/algorithm/string/case_conv.hpp>
|
#include <boost/algorithm/string/case_conv.hpp>
|
||||||
#include <boost/locale.hpp>
|
#include <boost/locale.hpp>
|
||||||
|
#include <string>
|
||||||
|
#include <algorithm>
|
||||||
#include "utf8/utf8.h"
|
#include "utf8/utf8.h"
|
||||||
|
|
||||||
|
|
||||||
@ -12,9 +14,18 @@ TextUtils::TextUtils() {
|
|||||||
StringCaseConverterManager::getInstance().getUpperCaseConverter("pl");
|
StringCaseConverterManager::getInstance().getUpperCaseConverter("pl");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Lowercases a single char using ASCII rules only.
//
// 'A'..'Z' are shifted to 'a'..'z'; every other value (digits, punctuation,
// bytes outside the ASCII letter range) is returned unchanged.
char TextUtils::_easytolower(char in) {
    const bool isAsciiUpper = ('A' <= in) && (in <= 'Z');
    // 'a' - 'A' is the fixed distance between the two ASCII letter ranges.
    return isAsciiUpper ? static_cast<char>(in + ('a' - 'A')) : in;
}
|
||||||
|
|
||||||
std::string TextUtils::toLowerCase(const std::string & text) {
|
std::string TextUtils::toLowerCase(const std::string & text) {
|
||||||
if (!utf8::is_valid(text.begin(), text.end())) {
|
if (!utf8::is_valid(text.begin(), text.end())) {
|
||||||
throw ConcordiaException("Bad input encoding, use UTF-8");
|
std::string data = text;
|
||||||
|
std::transform(data.begin(), data.end(), data.begin(), TextUtils::_easytolower);
|
||||||
|
return data;
|
||||||
}
|
}
|
||||||
return simpleConvert(*_lowerConverter, text);
|
return simpleConvert(*_lowerConverter, text);
|
||||||
}
|
}
|
||||||
|
@ -41,6 +41,8 @@ private:
|
|||||||
|
|
||||||
void operator=(TextUtils const&); // Don't implement
|
void operator=(TextUtils const&); // Don't implement
|
||||||
|
|
||||||
|
static char _easytolower(char in);
|
||||||
|
|
||||||
boost::shared_ptr<StringGeneralCaseConverter> _lowerConverter;
|
boost::shared_ptr<StringGeneralCaseConverter> _lowerConverter;
|
||||||
|
|
||||||
boost::shared_ptr<StringGeneralCaseConverter> _upperConverter;
|
boost::shared_ptr<StringGeneralCaseConverter> _upperConverter;
|
||||||
|
@ -571,4 +571,16 @@ BOOST_AUTO_TEST_CASE( TokenizeOnly )
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Tokenizing a sentence that contains a malformed UTF-8 sequence must still
// yield the lowercased token string checked below.
BOOST_AUTO_TEST_CASE( TokenizeBadUtf )
{
    // "\xc3\x28" is not valid UTF-8: 0xC3 announces a two-byte sequence,
    // but 0x28 is not a continuation byte.
    const std::string badUtfSentence =
        "Ala \xc3\x28 23 --- ..//,./ '''8902347 poSiaDa KOTA";

    Concordia concordia(
        TestResourcesManager::getTempPath(),
        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));

    TokenizedSentence ts = concordia.tokenize(badUtfSentence, false, false);

    BOOST_CHECK_EQUAL(ts.getTokenizedSentence(),
                      "ala ne_number ne_number posiada kota");

    concordia.clearIndex();
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE_END()
|
BOOST_AUTO_TEST_SUITE_END()
|
||||||
|
Loading…
Reference in New Issue
Block a user