diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b46060..d23e74a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,7 +43,7 @@ file(MAKE_DIRECTORY ${PROD_RESOURCES_DIRECTORY}/temp) SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") -set(BASE_TARGETS concordia) +set(BASE_TARGETS concordia utf8case) @@ -135,7 +135,7 @@ configure_file ( # Concordia: sub-projects # ================================================ -set(ALL_DIRECTORIES concordia concordia-console libdivsufsort) +set(ALL_DIRECTORIES concordia concordia-console libdivsufsort utf8 utf8case) include_directories("${concordia_SOURCE_DIR}") @@ -150,7 +150,7 @@ endforeach(dir) # Tests # ================================================ -set(TESTS_TARGETS concordia-tests) +set(TESTS_TARGETS concordia-tests utf8case-tests) add_subdirectory(tests) diff --git a/TODO.txt b/TODO.txt index 2f60d3f..c366fad 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,3 +1,4 @@ 1. lokalizowane to_lower 2. anonimizacja zdań 3. Dzielenie zdań (max 255 tokenów) +4. concordia-server diff --git a/tests/tests.hpp b/tests/tests.hpp new file mode 100644 index 0000000..ecbddc8 --- /dev/null +++ b/tests/tests.hpp @@ -0,0 +1,7 @@ +#ifndef TESTS_HDR +#define TESTS_HDR + +#define BOOST_TEST_NO_MAIN +#include + +#endif diff --git a/utf8/CMakeLists.txt b/utf8/CMakeLists.txt new file mode 100644 index 0000000..e69de29 diff --git a/utf8/utf8.h b/utf8/utf8.h new file mode 100644 index 0000000..836bf69 --- /dev/null +++ b/utf8/utf8.h @@ -0,0 +1,45 @@ +/*! + ## Character encoding + + In PSI toolkit UTF8 is uniformly used. All the textual data is + assumed to be encoded in UTF8. + + Technically, std::string is simply used to store UTF8 strings. 
To handle + UTF8, a small external library was incorporated into the project, + see: http://utfcpp.sourceforge.net/ +*/ + +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+*/ + + +#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "utf8/checked.h" +#include "utf8/unchecked.h" + +#endif // header guard diff --git a/utf8/utf8/checked.h b/utf8/utf8/checked.h new file mode 100644 index 0000000..383c1e5 --- /dev/null +++ b/utf8/utf8/checked.h @@ -0,0 +1,327 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+*/ + + +#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" +#include + +namespace utf8 +{ + // Base for the exceptions that may be thrown from the library + class exception : public std::exception { + }; + + // Exceptions that may be thrown from the library functions. + class invalid_code_point : public exception { + uint32_t cp; + public: + invalid_code_point(uint32_t cp) : cp(cp) {} + virtual const char* what() const throw() { return "Invalid code point"; } + uint32_t code_point() const {return cp;} + }; + + class invalid_utf8 : public exception { + uint8_t u8; + public: + invalid_utf8 (uint8_t u) : u8(u) {} + virtual const char* what() const throw() { return "invalid UTF-8, convert file to UTF-8 encoding and run again"; } + uint8_t utf8_octet() const {return u8;} + }; + + class invalid_utf16 : public exception { + uint16_t u16; + public: + invalid_utf16 (uint16_t u) : u16(u) {} + virtual const char* what() const throw() { return "invalid UTF-16, convert file to UTF-16 encoding and run again"; } + uint16_t utf16_word() const {return u16;} + }; + + class not_enough_room : public exception { + public: + virtual const char* what() const throw() { return "Not enough space"; } + }; + + /// The library API - functions intended to be called by the users + + template + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) + { + while (start != end) { + octet_iterator sequence_start = start; + internal::utf_error err_code = internal::validate_next(start, end); + switch (err_code) { + case internal::UTF8_OK : + for (octet_iterator it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case internal::NOT_ENOUGH_ROOM: + throw not_enough_room(); + case internal::INVALID_LEAD: + out = append (replacement, out); // append advances a copy; keep its return value so the marker is not overwritten + ++start; + break; + case internal::INCOMPLETE_SEQUENCE: + case internal::OVERLONG_SEQUENCE:
+ case internal::INVALID_CODE_POINT: + out = append (replacement, out); // append advances a copy; keep its return value so the marker is not overwritten + ++start; + // just one replacement mark for the sequence + while (start != end && internal::is_trail(*start)) // test the bound first: start must not be dereferenced at end + ++start; + break; + } + } + return out; + } + + template + inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) + { + static const uint32_t replacement_marker = internal::mask16(0xfffd); + return replace_invalid(start, end, out, replacement_marker); + } + + template + octet_iterator append(uint32_t cp, octet_iterator result) + { + if (!internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + if (cp < 0x80) // one octet + *(result++) = static_cast(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast((cp >> 6) | 0xc0); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast((cp >> 12) | 0xe0); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast((cp >> 18) | 0xf0); + *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + return result; + } + + template + uint32_t next(octet_iterator& it, octet_iterator end) + { + uint32_t cp = 0; + internal::utf_error err_code = internal::validate_next(it, end, &cp); + switch (err_code) { + case internal::UTF8_OK : + break; + case internal::NOT_ENOUGH_ROOM : + throw not_enough_room(); + case internal::INVALID_LEAD : + case internal::INCOMPLETE_SEQUENCE : + case internal::OVERLONG_SEQUENCE : + throw invalid_utf8(*it); + case internal::INVALID_CODE_POINT : + throw invalid_code_point(cp); + } + return cp; + } + + template + uint32_t peek_next(octet_iterator it, octet_iterator end) + { + return next(it, end); + } + + template + uint32_t prior(octet_iterator& it, octet_iterator start) + { + // can't
do much if it == start + if (it == start) + throw not_enough_room(); + + octet_iterator end = it; + // Go back until we hit either a lead octet or start + while (internal::is_trail(*(--it))) + if (it == start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + return peek_next(it, end); + } + + /// Deprecated in versions that include "prior" + template + uint32_t previous(octet_iterator& it, octet_iterator pass_start) + { + octet_iterator end = it; + while (internal::is_trail(*(--it))) + if (it == pass_start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + octet_iterator temp = it; + return next(temp, end); + } + + template + void advance (octet_iterator& it, distance_type n, octet_iterator end) + { + for (distance_type i = 0; i < n; ++i) + next(it, end); + } + + template + typename std::iterator_traits::difference_type + distance (octet_iterator first, octet_iterator last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 0; first < last; ++dist) + next(first, last); + return dist; + } + + template + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + uint32_t cp = internal::mask16(*start++); + // Take care of surrogate pairs first + if (internal::is_lead_surrogate(cp)) { + if (start != end) { + uint32_t trail_surrogate = internal::mask16(*start++); + if (internal::is_trail_surrogate(trail_surrogate)) + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + else + throw invalid_utf16(static_cast(trail_surrogate)); + } + else + throw invalid_utf16(static_cast(cp)); + + } + // Lone trail surrogate + else if (internal::is_trail_surrogate(cp)) + throw invalid_utf16(static_cast(cp)); + + result = append(cp, result); + } + return result; + } + + template + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start != end) { + uint32_t cp = next(start, end); + if (cp > 0xffff) { 
//make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast(cp); + } + return result; + } + + template + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = append(*(start++), result); + + return result; + } + + template + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start != end) + (*result++) = next(start, end); + + return result; + } + + // The iterator class + template + class iterator : public std::iterator { + octet_iterator it; + octet_iterator range_start; + octet_iterator range_end; + public: + iterator () {}; + explicit iterator (const octet_iterator& octet_it, + const octet_iterator& range_start, + const octet_iterator& range_end) : + it(octet_it), range_start(range_start), range_end(range_end) + { + if (it < range_start || it > range_end) + throw std::out_of_range("Invalid utf-8 iterator position"); + } + // the default "big three" are OK + octet_iterator base () const { return it; } + uint32_t operator * () const + { + octet_iterator temp = it; + return next(temp, range_end); + } + bool operator == (const iterator& rhs) const + { + if (range_start != rhs.range_start || range_end != rhs.range_end) + throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + next(it, range_end); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + next(it, range_end); + return temp; + } + iterator& operator -- () + { + prior(it, range_start); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + prior(it, range_start); + return temp; + } + }; // class iterator + +} // namespace 
utf8 + +#endif //header guard + + diff --git a/utf8/utf8/core.h b/utf8/utf8/core.h new file mode 100644 index 0000000..268cf7c --- /dev/null +++ b/utf8/utf8/core.h @@ -0,0 +1,358 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include + +namespace utf8 +{ + // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers + // You may need to change them to match your system. 
+ // These typedefs have the same names as ones from cstdint, or boost/cstdint + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; + +// Helper code - not intended to be directly called by the library users. May be changed at any time +namespace internal +{ + // Unicode constants + // Leading (high) surrogates: 0xd800 - 0xdbff + // Trailing (low) surrogates: 0xdc00 - 0xdfff + const uint16_t LEAD_SURROGATE_MIN = 0xd800u; + const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; + const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; + const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; + const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10); + const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; + + // Maximum valid value for a Unicode code point + const uint32_t CODE_POINT_MAX = 0x0010ffffu; + + template + inline uint8_t mask8(octet_type oc) + { + return static_cast(0xff & oc); + } + template + inline uint16_t mask16(u16_type oc) + { + return static_cast(0xffff & oc); + } + template + inline bool is_trail(octet_type oc) + { + return ((mask8(oc) >> 6) == 0x2); + } + + template + inline bool is_lead_surrogate(u16 cp) + { + return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); + } + + template + inline bool is_trail_surrogate(u16 cp) + { + return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); + } + + template + inline bool is_surrogate(u16 cp) + { + return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); + } + + template + inline bool is_code_point_valid(u32 cp) + { + return (cp <= CODE_POINT_MAX && !is_surrogate(cp)); + } + + template + inline typename std::iterator_traits::difference_type + sequence_length(octet_iterator lead_it) + { + uint8_t lead = mask8(*lead_it); + if (lead < 0x80) + return 1; + else if ((lead >> 5) == 0x6) + return 2; + else if ((lead >> 4) == 0xe) + return 3; + else if ((lead >> 3) == 0x1e) + return 4; + else + return 0; + } + + template + 
inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) + { + if (cp < 0x80) { + if (length != 1) + return true; + } + else if (cp < 0x800) { + if (length != 2) + return true; + } + else if (cp < 0x10000) { + if (length != 3) + return true; + } + + return false; + } + + enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; + + /// get_sequence_x functions decode utf-8 sequences of the length x + + template + utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point) + { + if (it != end) { + if (code_point) + *code_point = mask8(*it); + return UTF8_OK; + } + return NOT_ENOUGH_ROOM; + } + + template + utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point) + { + utf_error ret_code = NOT_ENOUGH_ROOM; + + if (it != end) { + uint32_t cp = mask8(*it); + if (++it != end) { + if (is_trail(*it)) { + cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); + + if (code_point) + *code_point = cp; + ret_code = UTF8_OK; + } + else + ret_code = INCOMPLETE_SEQUENCE; + } + else + ret_code = NOT_ENOUGH_ROOM; + } + + return ret_code; + } + + template + utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point) + { + utf_error ret_code = NOT_ENOUGH_ROOM; + + if (it != end) { + uint32_t cp = mask8(*it); + if (++it != end) { + if (is_trail(*it)) { + cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff); + if (++it != end) { + if (is_trail(*it)) { + cp += (*it) & 0x3f; + + if (code_point) + *code_point = cp; + ret_code = UTF8_OK; + } + else + ret_code = INCOMPLETE_SEQUENCE; + } + else + ret_code = NOT_ENOUGH_ROOM; + } + else + ret_code = INCOMPLETE_SEQUENCE; + } + else + ret_code = NOT_ENOUGH_ROOM; + } + + return ret_code; + } + + template + utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point) + { + utf_error ret_code = NOT_ENOUGH_ROOM; + + if (it != end) { + uint32_t cp = mask8(*it); + if (++it 
!= end) { + if (is_trail(*it)) { + cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff); + if (++it != end) { + if (is_trail(*it)) { + cp += (mask8(*it) << 6) & 0xfff; + if (++it != end) { + if (is_trail(*it)) { + cp += (*it) & 0x3f; + + if (code_point) + *code_point = cp; + ret_code = UTF8_OK; + } + else + ret_code = INCOMPLETE_SEQUENCE; + } + else + ret_code = NOT_ENOUGH_ROOM; + } + else + ret_code = INCOMPLETE_SEQUENCE; + } + else + ret_code = NOT_ENOUGH_ROOM; + } + else + ret_code = INCOMPLETE_SEQUENCE; + } + else + ret_code = NOT_ENOUGH_ROOM; + } + + return ret_code; + } + + template + utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point) + { + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + octet_iterator original_it = it; + + uint32_t cp = 0; + // Determine the sequence length based on the lead octet + typedef typename std::iterator_traits::difference_type octet_difference_type; + octet_difference_type length = sequence_length(it); + if (length == 0) + return INVALID_LEAD; + + // Now that we have a valid sequence length, get trail octets and calculate the code point + utf_error err = UTF8_OK; + switch (length) { + case 1: + err = get_sequence_1(it, end, &cp); + break; + case 2: + err = get_sequence_2(it, end, &cp); + break; + case 3: + err = get_sequence_3(it, end, &cp); + break; + case 4: + err = get_sequence_4(it, end, &cp); + break; + } + + if (err == UTF8_OK) { + // Decoding succeeded. Now, security checks... + if (is_code_point_valid(cp)) { + if (!is_overlong_sequence(cp, length)){ + // Passed! Return here. 
+ if (code_point) + *code_point = cp; + ++it; + return UTF8_OK; + } + else + err = OVERLONG_SEQUENCE; + } + else + err = INVALID_CODE_POINT; + } + + // Failure branch - restore the original value of the iterator + it = original_it; + return err; + } + + template + inline utf_error validate_next(octet_iterator& it, octet_iterator end) { + return validate_next(it, end, 0); + } + +} // namespace internal + + /// The library API - functions intended to be called by the users + + // Byte order mark + const uint8_t bom[] = {0xef, 0xbb, 0xbf}; + + template + octet_iterator find_invalid(octet_iterator start, octet_iterator end) + { + octet_iterator result = start; + while (result != end) { + internal::utf_error err_code = internal::validate_next(result, end); + if (err_code != internal::UTF8_OK) + return result; + } + return result; + } + + template + inline bool is_valid(octet_iterator start, octet_iterator end) + { + return (find_invalid(start, end) == end); + } + + template + inline bool starts_with_bom (octet_iterator it, octet_iterator end) + { + return ( + ((it != end) && (internal::mask8(*it++)) == bom[0]) && + ((it != end) && (internal::mask8(*it++)) == bom[1]) && + ((it != end) && (internal::mask8(*it)) == bom[2]) + ); + } + + //Deprecated in release 2.3 + template + inline bool is_bom (octet_iterator it) + { + return ( + (internal::mask8(*it++)) == bom[0] && + (internal::mask8(*it++)) == bom[1] && + (internal::mask8(*it)) == bom[2] + ); + } +} // namespace utf8 + +#endif // header guard + + diff --git a/utf8/utf8/unchecked.h b/utf8/utf8/unchecked.h new file mode 100644 index 0000000..95a3d74 --- /dev/null +++ b/utf8/utf8/unchecked.h @@ -0,0 +1,234 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to 
prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+*/ + + +#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" + +namespace utf8 +{ + namespace unchecked + { + template + octet_iterator append(uint32_t cp, octet_iterator result) + { + if (cp < 0x80) // one octet + *(result++) = static_cast(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast((cp >> 6) | 0xc0); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast((cp >> 12) | 0xe0); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast((cp >> 18) | 0xf0); + *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + return result; + } + + template + uint32_t sequence_length(octet_iterator it) + { + return utf8::internal::sequence_length(it); + } + + template + uint32_t next(octet_iterator& it) + { + uint32_t cp = internal::mask8(*it); + typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it); + switch (length) { + case 1: + break; + case 2: + it++; + cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); + break; + case 3: + ++it; + cp = ((cp << 12) & 0xffff) + ((internal::mask8(*it) << 6) & 0xfff); + ++it; + cp += (*it) & 0x3f; + break; + case 4: + ++it; + cp = ((cp << 18) & 0x1fffff) + ((internal::mask8(*it) << 12) & 0x3ffff); + ++it; + cp += (internal::mask8(*it) << 6) & 0xfff; + ++it; + cp += (*it) & 0x3f; + break; + } + ++it; + return cp; + } + + template + uint32_t peek_next(octet_iterator it) + { + return next(it); + } + + template + uint32_t prior(octet_iterator& it) + { + while (internal::is_trail(*(--it))) ; + octet_iterator temp = it; + return next(temp); + } + + // Deprecated in versions that include prior, but only for the 
sake of consistency (see utf8::previous) + template + inline uint32_t previous(octet_iterator& it) + { + return prior(it); + } + + template + void advance (octet_iterator& it, distance_type n) + { + for (distance_type i = 0; i < n; ++i) + next(it); + } + + template + typename std::iterator_traits::difference_type + distance (octet_iterator first, octet_iterator last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 0; first < last; ++dist) + next(first); + return dist; + } + + template + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + uint32_t cp = internal::mask16(*start++); + // Take care of surrogate pairs first + if (internal::is_lead_surrogate(cp)) { + uint32_t trail_surrogate = internal::mask16(*start++); + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + } + result = append(cp, result); + } + return result; + } + + template + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start < end) { + uint32_t cp = next(start); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast(cp); + } + return result; + } + + template + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = append(*(start++), result); + + return result; + } + + template + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start < end) + (*result++) = next(start); + + return result; + } + + // The iterator class + template + class iterator : public std::iterator { + octet_iterator it; + public: + iterator () {}; + explicit iterator (const octet_iterator& octet_it): it(octet_it) {} + // the default "big three" are OK + octet_iterator base () 
const { return it; } + uint32_t operator * () const + { + octet_iterator temp = it; + return next(temp); + } + bool operator == (const iterator& rhs) const + { + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + std::advance(it, internal::sequence_length(it)); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + std::advance(it, internal::sequence_length(it)); + return temp; + } + iterator& operator -- () + { + prior(it); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + prior(it); + return temp; + } + }; // class iterator + + } // namespace utf8::unchecked +} // namespace utf8 + + +#endif // header guard + diff --git a/utf8case/CMakeLists.txt b/utf8case/CMakeLists.txt new file mode 100644 index 0000000..8c2edbb --- /dev/null +++ b/utf8case/CMakeLists.txt @@ -0,0 +1,28 @@ +add_library(utf8case SHARED + case_tables.cpp + range_based_case_converter.cpp + regular_contextual_case_converter.cpp + simple_convert.cpp + special_casing_converter.cpp + string_case_converter_manager.cpp + turkish_and_azeri_lower_contextual_case_converter.cpp + turkish_and_azeri_upper_contextual_case_converter.cpp +) + +add_subdirectory(t) + +install(TARGETS utf8case DESTINATION lib/) +install(FILES + case_converter_factory.hpp + general_case_converter.hpp + simple_convert.hpp + turkish_and_azeri_lower_contextual_case_converter.hpp + case_tables.hpp + range_based_case_converter.hpp + special_casing_converter.hpp + turkish_and_azeri_upper_contextual_case_converter.hpp + contextual_case_converter.hpp + regular_contextual_case_converter.hpp + string_case_converter_manager.hpp + +DESTINATION include/utf8case/) diff --git a/utf8case/case_converter_factory.hpp b/utf8case/case_converter_factory.hpp new file mode 100644 index 0000000..34211e8 --- /dev/null +++ b/utf8case/case_converter_factory.hpp @@ -0,0 +1,149 @@ +#ifndef CASE_CONVERTER_FACTORY_HDR 
+#define CASE_CONVERTER_FACTORY_HDR + +#include + +#include "general_case_converter.hpp" +#include "concordia/concordia_exception.hpp" +#include "regular_contextual_case_converter.hpp" +#include "turkish_and_azeri_lower_contextual_case_converter.hpp" +#include "turkish_and_azeri_upper_contextual_case_converter.hpp" + +const size_t NUMBER_OF_CASE_TYPES = 3; + +template +class CaseConverterFactory { + +private: + enum { + LOWER_INDEX = 0, + UPPER_INDEX = 1, + TITLE_INDEX = 2 + }; + + boost::shared_ptr rangeBasedCaseConverters_[NUMBER_OF_CASE_TYPES]; + boost::shared_ptr specialCasingConverters_[NUMBER_OF_CASE_TYPES]; + + boost::shared_ptr regularContextualCaseConverter_; + boost::shared_ptr turkishAndAzeriUpperContextualCaseConverter_; + boost::shared_ptr turkishAndAzeriLowerContextualCaseConverter_; + + class Exception : public ConcordiaException { + public: + Exception(const std::string& msg): ConcordiaException(msg) { + } + + virtual ~Exception() throw() {} + }; + + void checkRawConverters_(int case_index) { + if (!rangeBasedCaseConverters_[case_index]) { + boost::shared_ptr converter; + + switch (case_index) { + case LOWER_INDEX: converter.reset( + new RangeBasedCaseConverter( + LOWER_CASE_RANGES_SIZE, + LOWER_CASE_RANGES)); + break; + case UPPER_INDEX: converter.reset( + new RangeBasedCaseConverter( + UPPER_CASE_RANGES_SIZE, + UPPER_CASE_RANGES)); + break; + case TITLE_INDEX: converter.reset( + new RangeBasedCaseConverter( + TITLE_CASE_RANGES_SIZE, + TITLE_CASE_RANGES)); + break; + default: + throw Exception("????"); + } + + rangeBasedCaseConverters_[case_index] = converter; + } + + if (!specialCasingConverters_[case_index]) { + boost::shared_ptr converter; + + switch (case_index) { + case LOWER_INDEX: converter.reset( + new SpecialCasingConverter( + LOWER_SPECIAL_CASING_SIZE, + LOWER_SPECIAL_CASING)); + break; + case UPPER_INDEX: converter.reset( + new SpecialCasingConverter( + UPPER_SPECIAL_CASING_SIZE, + UPPER_SPECIAL_CASING)); + break; + case TITLE_INDEX: 
converter.reset( + new SpecialCasingConverter( + TITLE_SPECIAL_CASING_SIZE, + TITLE_SPECIAL_CASING)); + break; + default: + throw Exception("????"); + } + + specialCasingConverters_[case_index] = converter; + } + } + + boost::shared_ptr > getCaseConverter_( + int case_index, const std::string& language_code) { + + checkRawConverters_(case_index); + + return boost::shared_ptr >( + new GeneralCaseConverter ( + rangeBasedCaseConverters_[case_index], + specialCasingConverters_[case_index], + getContextualCaseConverterForLanguage_(language_code, case_index))); + } + +public: + CaseConverterFactory(): + regularContextualCaseConverter_( + boost::shared_ptr(new RegularContextualCaseConverter())), + turkishAndAzeriUpperContextualCaseConverter_( + boost::shared_ptr( + new TurkishAndAzeriUpperContextualCaseConverter())), + turkishAndAzeriLowerContextualCaseConverter_( + boost::shared_ptr( + new TurkishAndAzeriLowerContextualCaseConverter())) { + } + + boost::shared_ptr getContextualCaseConverterForLanguage_( + const std::string& languageCode, int caseIndex) { + if (languageCode == "lt") + throw Exception(std::string("language '") + languageCode + + "' is not handled yet in lower/upper/title-casing"); + + if (languageCode == "tr" || languageCode == "az") + return + caseIndex == LOWER_INDEX + ? 
turkishAndAzeriLowerContextualCaseConverter_ + : turkishAndAzeriUpperContextualCaseConverter_; + + return regularContextualCaseConverter_; + } + + boost::shared_ptr > + getLowerCaseConverter(const std::string& language_code) { + return getCaseConverter_(LOWER_INDEX, language_code); + } + + boost::shared_ptr > + getUpperCaseConverter(const std::string& language_code) { + return getCaseConverter_(UPPER_INDEX, language_code); + } + + boost::shared_ptr > + getTitleCaseConverter(const std::string& language_code) { + return getCaseConverter_(TITLE_INDEX, language_code); + } + +}; + +#endif diff --git a/utf8case/case_tables.cpp b/utf8case/case_tables.cpp new file mode 100644 index 0000000..4816d59 --- /dev/null +++ b/utf8case/case_tables.cpp @@ -0,0 +1,804 @@ +// GENERATED AUTOMATICALLY BY generate_case_tables.pl; DO NOT EDIT. + +#include "case_tables.hpp" + +const size_t LOWER_CASE_RANGES_SIZE = 151; +const CaseConversionRecord LOWER_CASE_RANGES[LOWER_CASE_RANGES_SIZE] = { + {65, 90, 32}, + {192, 214, 32}, + {216, 222, 32}, + {256, 302, EVEN_ODD_SKIP}, + {304, 304, -199}, + {306, 310, EVEN_ODD_SKIP}, + {313, 327, ODD_EVEN_SKIP}, + {330, 374, EVEN_ODD_SKIP}, + {376, 376, -121}, + {377, 381, ODD_EVEN_SKIP}, + {385, 385, 210}, + {386, 388, EVEN_ODD_SKIP}, + {390, 390, 206}, + {391, 391, ODD_EVEN}, + {393, 394, 205}, + {395, 395, ODD_EVEN}, + {398, 398, 79}, + {399, 399, 202}, + {400, 400, 203}, + {401, 401, ODD_EVEN}, + {403, 403, 205}, + {404, 404, 207}, + {406, 406, 211}, + {407, 407, 209}, + {408, 408, EVEN_ODD}, + {412, 412, 211}, + {413, 413, 213}, + {415, 415, 214}, + {416, 420, EVEN_ODD_SKIP}, + {422, 422, 218}, + {423, 423, ODD_EVEN}, + {425, 425, 218}, + {428, 428, EVEN_ODD}, + {430, 430, 218}, + {431, 431, ODD_EVEN}, + {433, 434, 217}, + {435, 437, ODD_EVEN_SKIP}, + {439, 439, 219}, + {440, 440, EVEN_ODD}, + {444, 444, EVEN_ODD}, + {452, 452, 2}, + {453, 453, ODD_EVEN}, + {455, 455, 2}, + {456, 456, EVEN_ODD}, + {458, 458, 2}, + {459, 475, ODD_EVEN_SKIP}, + {478, 
494, EVEN_ODD_SKIP}, + {497, 497, 2}, + {498, 500, EVEN_ODD_SKIP}, + {502, 502, -97}, + {503, 503, -56}, + {504, 542, EVEN_ODD_SKIP}, + {544, 544, -130}, + {546, 562, EVEN_ODD_SKIP}, + {570, 570, 10795}, + {571, 571, ODD_EVEN}, + {573, 573, -163}, + {574, 574, 10792}, + {577, 577, ODD_EVEN}, + {579, 579, -195}, + {580, 580, 69}, + {581, 581, 71}, + {582, 590, EVEN_ODD_SKIP}, + {880, 882, EVEN_ODD_SKIP}, + {886, 886, EVEN_ODD}, + {902, 902, 38}, + {904, 906, 37}, + {908, 908, 64}, + {910, 911, 63}, + {913, 929, 32}, + {931, 939, 32}, + {975, 975, 8}, + {984, 1006, EVEN_ODD_SKIP}, + {1012, 1012, -60}, + {1015, 1015, ODD_EVEN}, + {1017, 1017, -7}, + {1018, 1018, EVEN_ODD}, + {1021, 1023, -130}, + {1024, 1039, 80}, + {1040, 1071, 32}, + {1120, 1152, EVEN_ODD_SKIP}, + {1162, 1214, EVEN_ODD_SKIP}, + {1216, 1216, 15}, + {1217, 1229, ODD_EVEN_SKIP}, + {1232, 1318, EVEN_ODD_SKIP}, + {1329, 1366, 48}, + {4256, 4293, 7264}, + {7680, 7828, EVEN_ODD_SKIP}, + {7838, 7838, -7615}, + {7840, 7934, EVEN_ODD_SKIP}, + {7944, 7951, -8}, + {7960, 7965, -8}, + {7976, 7983, -8}, + {7992, 7999, -8}, + {8008, 8013, -8}, + {8025, 8025, -8}, + {8027, 8027, -8}, + {8029, 8029, -8}, + {8031, 8031, -8}, + {8040, 8047, -8}, + {8072, 8079, -8}, + {8088, 8095, -8}, + {8104, 8111, -8}, + {8120, 8121, -8}, + {8122, 8123, -74}, + {8124, 8124, -9}, + {8136, 8139, -86}, + {8140, 8140, -9}, + {8152, 8153, -8}, + {8154, 8155, -100}, + {8168, 8169, -8}, + {8170, 8171, -112}, + {8172, 8172, -7}, + {8184, 8185, -128}, + {8186, 8187, -126}, + {8188, 8188, -9}, + {8486, 8486, -7517}, + {8490, 8490, -8383}, + {8491, 8491, -8262}, + {8498, 8498, 28}, + {8544, 8559, 16}, + {8579, 8579, ODD_EVEN}, + {9398, 9423, 26}, + {11264, 11310, 48}, + {11360, 11360, EVEN_ODD}, + {11362, 11362, -10743}, + {11363, 11363, -3814}, + {11364, 11364, -10727}, + {11367, 11371, ODD_EVEN_SKIP}, + {11373, 11373, -10780}, + {11374, 11374, -10749}, + {11375, 11375, -10783}, + {11376, 11376, -10782}, + {11378, 11378, EVEN_ODD}, + {11381, 
11381, ODD_EVEN}, + {11390, 11391, -10815}, + {11392, 11490, EVEN_ODD_SKIP}, + {11499, 11501, ODD_EVEN_SKIP}, + {42560, 42604, EVEN_ODD_SKIP}, + {42624, 42646, EVEN_ODD_SKIP}, + {42786, 42798, EVEN_ODD_SKIP}, + {42802, 42862, EVEN_ODD_SKIP}, + {42873, 42875, ODD_EVEN_SKIP}, + {42877, 42877, -35332}, + {42878, 42886, EVEN_ODD_SKIP}, + {42891, 42891, ODD_EVEN}, + {42893, 42893, -42280}, + {42896, 42896, EVEN_ODD}, + {42912, 42920, EVEN_ODD_SKIP}, + {65313, 65338, 32}, + {66560, 66599, 40} +}; +const size_t UPPER_CASE_RANGES_SIZE = 161; +const CaseConversionRecord UPPER_CASE_RANGES[UPPER_CASE_RANGES_SIZE] = { + {97, 122, -32}, + {181, 181, 743}, + {224, 246, -32}, + {248, 254, -32}, + {255, 255, 121}, + {257, 303, EVEN_ODD_SKIP}, + {305, 305, -232}, + {307, 311, EVEN_ODD_SKIP}, + {314, 328, ODD_EVEN_SKIP}, + {331, 375, EVEN_ODD_SKIP}, + {378, 382, ODD_EVEN_SKIP}, + {383, 383, -300}, + {384, 384, 195}, + {387, 389, EVEN_ODD_SKIP}, + {392, 392, ODD_EVEN}, + {396, 396, ODD_EVEN}, + {402, 402, ODD_EVEN}, + {405, 405, 97}, + {409, 409, EVEN_ODD}, + {410, 410, 163}, + {414, 414, 130}, + {417, 421, EVEN_ODD_SKIP}, + {424, 424, ODD_EVEN}, + {429, 429, EVEN_ODD}, + {432, 432, ODD_EVEN}, + {436, 438, ODD_EVEN_SKIP}, + {441, 441, EVEN_ODD}, + {445, 445, EVEN_ODD}, + {447, 447, 56}, + {453, 453, EVEN_ODD}, + {454, 454, -2}, + {456, 456, ODD_EVEN}, + {457, 457, -2}, + {459, 459, EVEN_ODD}, + {460, 460, -2}, + {462, 476, ODD_EVEN_SKIP}, + {477, 477, -79}, + {479, 495, EVEN_ODD_SKIP}, + {498, 498, ODD_EVEN}, + {499, 499, -2}, + {501, 501, EVEN_ODD}, + {505, 543, EVEN_ODD_SKIP}, + {547, 563, EVEN_ODD_SKIP}, + {572, 572, ODD_EVEN}, + {575, 576, 10815}, + {578, 578, ODD_EVEN}, + {583, 591, EVEN_ODD_SKIP}, + {592, 592, 10783}, + {593, 593, 10780}, + {594, 594, 10782}, + {595, 595, -210}, + {596, 596, -206}, + {598, 599, -205}, + {601, 601, -202}, + {603, 603, -203}, + {608, 608, -205}, + {611, 611, -207}, + {613, 613, 42280}, + {616, 616, -209}, + {617, 617, -211}, + {619, 619, 10743}, 
+ {623, 623, -211}, + {625, 625, 10749}, + {626, 626, -213}, + {629, 629, -214}, + {637, 637, 10727}, + {640, 640, -218}, + {643, 643, -218}, + {648, 648, -218}, + {649, 649, -69}, + {650, 651, -217}, + {652, 652, -71}, + {658, 658, -219}, + {837, 837, 84}, + {881, 883, EVEN_ODD_SKIP}, + {887, 887, EVEN_ODD}, + {891, 893, 130}, + {940, 940, -38}, + {941, 943, -37}, + {945, 961, -32}, + {962, 962, -31}, + {963, 971, -32}, + {972, 972, -64}, + {973, 974, -63}, + {976, 976, -62}, + {977, 977, -57}, + {981, 981, -47}, + {982, 982, -54}, + {983, 983, -8}, + {985, 1007, EVEN_ODD_SKIP}, + {1008, 1008, -86}, + {1009, 1009, -80}, + {1010, 1010, 7}, + {1013, 1013, -96}, + {1016, 1016, ODD_EVEN}, + {1019, 1019, EVEN_ODD}, + {1072, 1103, -32}, + {1104, 1119, -80}, + {1121, 1153, EVEN_ODD_SKIP}, + {1163, 1215, EVEN_ODD_SKIP}, + {1218, 1230, ODD_EVEN_SKIP}, + {1231, 1231, -15}, + {1233, 1319, EVEN_ODD_SKIP}, + {1377, 1414, -48}, + {7545, 7545, 35332}, + {7549, 7549, 3814}, + {7681, 7829, EVEN_ODD_SKIP}, + {7835, 7835, -59}, + {7841, 7935, EVEN_ODD_SKIP}, + {7936, 7943, 8}, + {7952, 7957, 8}, + {7968, 7975, 8}, + {7984, 7991, 8}, + {8000, 8005, 8}, + {8017, 8017, 8}, + {8019, 8019, 8}, + {8021, 8021, 8}, + {8023, 8023, 8}, + {8032, 8039, 8}, + {8048, 8049, 74}, + {8050, 8053, 86}, + {8054, 8055, 100}, + {8056, 8057, 128}, + {8058, 8059, 112}, + {8060, 8061, 126}, + {8064, 8071, 8}, + {8080, 8087, 8}, + {8096, 8103, 8}, + {8112, 8113, 8}, + {8115, 8115, 9}, + {8126, 8126, -7205}, + {8131, 8131, 9}, + {8144, 8145, 8}, + {8160, 8161, 8}, + {8165, 8165, 7}, + {8179, 8179, 9}, + {8526, 8526, -28}, + {8560, 8575, -16}, + {8580, 8580, ODD_EVEN}, + {9424, 9449, -26}, + {11312, 11358, -48}, + {11361, 11361, EVEN_ODD}, + {11365, 11365, -10795}, + {11366, 11366, -10792}, + {11368, 11372, ODD_EVEN_SKIP}, + {11379, 11379, EVEN_ODD}, + {11382, 11382, ODD_EVEN}, + {11393, 11491, EVEN_ODD_SKIP}, + {11500, 11502, ODD_EVEN_SKIP}, + {11520, 11557, -7264}, + {42561, 42605, EVEN_ODD_SKIP}, + {42625, 
42647, EVEN_ODD_SKIP}, + {42787, 42799, EVEN_ODD_SKIP}, + {42803, 42863, EVEN_ODD_SKIP}, + {42874, 42876, ODD_EVEN_SKIP}, + {42879, 42887, EVEN_ODD_SKIP}, + {42892, 42892, ODD_EVEN}, + {42897, 42897, EVEN_ODD}, + {42913, 42921, EVEN_ODD_SKIP}, + {65345, 65370, -32}, + {66600, 66639, -40} +}; +const size_t TITLE_CASE_RANGES_SIZE = 161; +const CaseConversionRecord TITLE_CASE_RANGES[TITLE_CASE_RANGES_SIZE] = { + {97, 122, -32}, + {181, 181, 743}, + {224, 246, -32}, + {248, 254, -32}, + {255, 255, 121}, + {257, 303, EVEN_ODD_SKIP}, + {305, 305, -232}, + {307, 311, EVEN_ODD_SKIP}, + {314, 328, ODD_EVEN_SKIP}, + {331, 375, EVEN_ODD_SKIP}, + {378, 382, ODD_EVEN_SKIP}, + {383, 383, -300}, + {384, 384, 195}, + {387, 389, EVEN_ODD_SKIP}, + {392, 392, ODD_EVEN}, + {396, 396, ODD_EVEN}, + {402, 402, ODD_EVEN}, + {405, 405, 97}, + {409, 409, EVEN_ODD}, + {410, 410, 163}, + {414, 414, 130}, + {417, 421, EVEN_ODD_SKIP}, + {424, 424, ODD_EVEN}, + {429, 429, EVEN_ODD}, + {432, 432, ODD_EVEN}, + {436, 438, ODD_EVEN_SKIP}, + {441, 441, EVEN_ODD}, + {445, 445, EVEN_ODD}, + {447, 447, 56}, + {452, 452, EVEN_ODD}, + {453, 453, 0}, + {454, 455, ODD_EVEN}, + {456, 456, 0}, + {457, 458, EVEN_ODD}, + {459, 459, 0}, + {460, 476, ODD_EVEN_SKIP}, + {477, 477, -79}, + {479, 495, EVEN_ODD_SKIP}, + {497, 497, ODD_EVEN}, + {498, 498, 0}, + {499, 501, EVEN_ODD_SKIP}, + {505, 543, EVEN_ODD_SKIP}, + {547, 563, EVEN_ODD_SKIP}, + {572, 572, ODD_EVEN}, + {575, 576, 10815}, + {578, 578, ODD_EVEN}, + {583, 591, EVEN_ODD_SKIP}, + {592, 592, 10783}, + {593, 593, 10780}, + {594, 594, 10782}, + {595, 595, -210}, + {596, 596, -206}, + {598, 599, -205}, + {601, 601, -202}, + {603, 603, -203}, + {608, 608, -205}, + {611, 611, -207}, + {613, 613, 42280}, + {616, 616, -209}, + {617, 617, -211}, + {619, 619, 10743}, + {623, 623, -211}, + {625, 625, 10749}, + {626, 626, -213}, + {629, 629, -214}, + {637, 637, 10727}, + {640, 640, -218}, + {643, 643, -218}, + {648, 648, -218}, + {649, 649, -69}, + {650, 651, -217}, + 
{652, 652, -71}, + {658, 658, -219}, + {837, 837, 84}, + {881, 883, EVEN_ODD_SKIP}, + {887, 887, EVEN_ODD}, + {891, 893, 130}, + {940, 940, -38}, + {941, 943, -37}, + {945, 961, -32}, + {962, 962, -31}, + {963, 971, -32}, + {972, 972, -64}, + {973, 974, -63}, + {976, 976, -62}, + {977, 977, -57}, + {981, 981, -47}, + {982, 982, -54}, + {983, 983, -8}, + {985, 1007, EVEN_ODD_SKIP}, + {1008, 1008, -86}, + {1009, 1009, -80}, + {1010, 1010, 7}, + {1013, 1013, -96}, + {1016, 1016, ODD_EVEN}, + {1019, 1019, EVEN_ODD}, + {1072, 1103, -32}, + {1104, 1119, -80}, + {1121, 1153, EVEN_ODD_SKIP}, + {1163, 1215, EVEN_ODD_SKIP}, + {1218, 1230, ODD_EVEN_SKIP}, + {1231, 1231, -15}, + {1233, 1319, EVEN_ODD_SKIP}, + {1377, 1414, -48}, + {7545, 7545, 35332}, + {7549, 7549, 3814}, + {7681, 7829, EVEN_ODD_SKIP}, + {7835, 7835, -59}, + {7841, 7935, EVEN_ODD_SKIP}, + {7936, 7943, 8}, + {7952, 7957, 8}, + {7968, 7975, 8}, + {7984, 7991, 8}, + {8000, 8005, 8}, + {8017, 8017, 8}, + {8019, 8019, 8}, + {8021, 8021, 8}, + {8023, 8023, 8}, + {8032, 8039, 8}, + {8048, 8049, 74}, + {8050, 8053, 86}, + {8054, 8055, 100}, + {8056, 8057, 128}, + {8058, 8059, 112}, + {8060, 8061, 126}, + {8064, 8071, 8}, + {8080, 8087, 8}, + {8096, 8103, 8}, + {8112, 8113, 8}, + {8115, 8115, 9}, + {8126, 8126, -7205}, + {8131, 8131, 9}, + {8144, 8145, 8}, + {8160, 8161, 8}, + {8165, 8165, 7}, + {8179, 8179, 9}, + {8526, 8526, -28}, + {8560, 8575, -16}, + {8580, 8580, ODD_EVEN}, + {9424, 9449, -26}, + {11312, 11358, -48}, + {11361, 11361, EVEN_ODD}, + {11365, 11365, -10795}, + {11366, 11366, -10792}, + {11368, 11372, ODD_EVEN_SKIP}, + {11379, 11379, EVEN_ODD}, + {11382, 11382, ODD_EVEN}, + {11393, 11491, EVEN_ODD_SKIP}, + {11500, 11502, ODD_EVEN_SKIP}, + {11520, 11557, -7264}, + {42561, 42605, EVEN_ODD_SKIP}, + {42625, 42647, EVEN_ODD_SKIP}, + {42787, 42799, EVEN_ODD_SKIP}, + {42803, 42863, EVEN_ODD_SKIP}, + {42874, 42876, ODD_EVEN_SKIP}, + {42879, 42887, EVEN_ODD_SKIP}, + {42892, 42892, ODD_EVEN}, + {42897, 42897, 
EVEN_ODD}, + {42913, 42921, EVEN_ODD_SKIP}, + {65345, 65370, -32}, + {66600, 66639, -40} +}; +const size_t LOWER_SPECIAL_CASING_SIZE = 103; +const SpecialCasingConversionRecord LOWER_SPECIAL_CASING[LOWER_SPECIAL_CASING_SIZE] = { + {223, "\xc3\x9f"}, + {304, "\x69\xcc\x87"}, + {64256, "\xef\xac\x80"}, + {64257, "\xef\xac\x81"}, + {64258, "\xef\xac\x82"}, + {64259, "\xef\xac\x83"}, + {64260, "\xef\xac\x84"}, + {64261, "\xef\xac\x85"}, + {64262, "\xef\xac\x86"}, + {1415, "\xd6\x87"}, + {64275, "\xef\xac\x93"}, + {64276, "\xef\xac\x94"}, + {64277, "\xef\xac\x95"}, + {64278, "\xef\xac\x96"}, + {64279, "\xef\xac\x97"}, + {329, "\xc5\x89"}, + {912, "\xce\x90"}, + {944, "\xce\xb0"}, + {496, "\xc7\xb0"}, + {7830, "\xe1\xba\x96"}, + {7831, "\xe1\xba\x97"}, + {7832, "\xe1\xba\x98"}, + {7833, "\xe1\xba\x99"}, + {7834, "\xe1\xba\x9a"}, + {8016, "\xe1\xbd\x90"}, + {8018, "\xe1\xbd\x92"}, + {8020, "\xe1\xbd\x94"}, + {8022, "\xe1\xbd\x96"}, + {8118, "\xe1\xbe\xb6"}, + {8134, "\xe1\xbf\x86"}, + {8146, "\xe1\xbf\x92"}, + {8147, "\xe1\xbf\x93"}, + {8150, "\xe1\xbf\x96"}, + {8151, "\xe1\xbf\x97"}, + {8162, "\xe1\xbf\xa2"}, + {8163, "\xe1\xbf\xa3"}, + {8164, "\xe1\xbf\xa4"}, + {8166, "\xe1\xbf\xa6"}, + {8167, "\xe1\xbf\xa7"}, + {8182, "\xe1\xbf\xb6"}, + {8064, "\xe1\xbe\x80"}, + {8065, "\xe1\xbe\x81"}, + {8066, "\xe1\xbe\x82"}, + {8067, "\xe1\xbe\x83"}, + {8068, "\xe1\xbe\x84"}, + {8069, "\xe1\xbe\x85"}, + {8070, "\xe1\xbe\x86"}, + {8071, "\xe1\xbe\x87"}, + {8072, "\xe1\xbe\x80"}, + {8073, "\xe1\xbe\x81"}, + {8074, "\xe1\xbe\x82"}, + {8075, "\xe1\xbe\x83"}, + {8076, "\xe1\xbe\x84"}, + {8077, "\xe1\xbe\x85"}, + {8078, "\xe1\xbe\x86"}, + {8079, "\xe1\xbe\x87"}, + {8080, "\xe1\xbe\x90"}, + {8081, "\xe1\xbe\x91"}, + {8082, "\xe1\xbe\x92"}, + {8083, "\xe1\xbe\x93"}, + {8084, "\xe1\xbe\x94"}, + {8085, "\xe1\xbe\x95"}, + {8086, "\xe1\xbe\x96"}, + {8087, "\xe1\xbe\x97"}, + {8088, "\xe1\xbe\x90"}, + {8089, "\xe1\xbe\x91"}, + {8090, "\xe1\xbe\x92"}, + {8091, "\xe1\xbe\x93"}, + {8092, 
"\xe1\xbe\x94"}, + {8093, "\xe1\xbe\x95"}, + {8094, "\xe1\xbe\x96"}, + {8095, "\xe1\xbe\x97"}, + {8096, "\xe1\xbe\xa0"}, + {8097, "\xe1\xbe\xa1"}, + {8098, "\xe1\xbe\xa2"}, + {8099, "\xe1\xbe\xa3"}, + {8100, "\xe1\xbe\xa4"}, + {8101, "\xe1\xbe\xa5"}, + {8102, "\xe1\xbe\xa6"}, + {8103, "\xe1\xbe\xa7"}, + {8104, "\xe1\xbe\xa0"}, + {8105, "\xe1\xbe\xa1"}, + {8106, "\xe1\xbe\xa2"}, + {8107, "\xe1\xbe\xa3"}, + {8108, "\xe1\xbe\xa4"}, + {8109, "\xe1\xbe\xa5"}, + {8110, "\xe1\xbe\xa6"}, + {8111, "\xe1\xbe\xa7"}, + {8115, "\xe1\xbe\xb3"}, + {8124, "\xe1\xbe\xb3"}, + {8131, "\xe1\xbf\x83"}, + {8140, "\xe1\xbf\x83"}, + {8179, "\xe1\xbf\xb3"}, + {8188, "\xe1\xbf\xb3"}, + {8114, "\xe1\xbe\xb2"}, + {8116, "\xe1\xbe\xb4"}, + {8130, "\xe1\xbf\x82"}, + {8132, "\xe1\xbf\x84"}, + {8178, "\xe1\xbf\xb2"}, + {8180, "\xe1\xbf\xb4"}, + {8119, "\xe1\xbe\xb7"}, + {8135, "\xe1\xbf\x87"}, + {8183, "\xe1\xbf\xb7"} +}; +const size_t TITLE_SPECIAL_CASING_SIZE = 103; +const SpecialCasingConversionRecord TITLE_SPECIAL_CASING[TITLE_SPECIAL_CASING_SIZE] = { + {223, "\x53\x73"}, + {304, "\xc4\xb0"}, + {64256, "\x46\x66"}, + {64257, "\x46\x69"}, + {64258, "\x46\x6c"}, + {64259, "\x46\x66\x69"}, + {64260, "\x46\x66\x6c"}, + {64261, "\x53\x74"}, + {64262, "\x53\x74"}, + {1415, "\xd4\xb5\xd6\x82"}, + {64275, "\xd5\x84\xd5\xb6"}, + {64276, "\xd5\x84\xd5\xa5"}, + {64277, "\xd5\x84\xd5\xab"}, + {64278, "\xd5\x8e\xd5\xb6"}, + {64279, "\xd5\x84\xd5\xad"}, + {329, "\xca\xbc\x4e"}, + {912, "\xce\x99\xcc\x88\xcc\x81"}, + {944, "\xce\xa5\xcc\x88\xcc\x81"}, + {496, "\x4a\xcc\x8c"}, + {7830, "\x48\xcc\xb1"}, + {7831, "\x54\xcc\x88"}, + {7832, "\x57\xcc\x8a"}, + {7833, "\x59\xcc\x8a"}, + {7834, "\x41\xca\xbe"}, + {8016, "\xce\xa5\xcc\x93"}, + {8018, "\xce\xa5\xcc\x93\xcc\x80"}, + {8020, "\xce\xa5\xcc\x93\xcc\x81"}, + {8022, "\xce\xa5\xcc\x93\xcd\x82"}, + {8118, "\xce\x91\xcd\x82"}, + {8134, "\xce\x97\xcd\x82"}, + {8146, "\xce\x99\xcc\x88\xcc\x80"}, + {8147, "\xce\x99\xcc\x88\xcc\x81"}, + {8150, "\xce\x99\xcd\x82"}, 
+ {8151, "\xce\x99\xcc\x88\xcd\x82"}, + {8162, "\xce\xa5\xcc\x88\xcc\x80"}, + {8163, "\xce\xa5\xcc\x88\xcc\x81"}, + {8164, "\xce\xa1\xcc\x93"}, + {8166, "\xce\xa5\xcd\x82"}, + {8167, "\xce\xa5\xcc\x88\xcd\x82"}, + {8182, "\xce\xa9\xcd\x82"}, + {8064, "\xe1\xbe\x88"}, + {8065, "\xe1\xbe\x89"}, + {8066, "\xe1\xbe\x8a"}, + {8067, "\xe1\xbe\x8b"}, + {8068, "\xe1\xbe\x8c"}, + {8069, "\xe1\xbe\x8d"}, + {8070, "\xe1\xbe\x8e"}, + {8071, "\xe1\xbe\x8f"}, + {8072, "\xe1\xbe\x88"}, + {8073, "\xe1\xbe\x89"}, + {8074, "\xe1\xbe\x8a"}, + {8075, "\xe1\xbe\x8b"}, + {8076, "\xe1\xbe\x8c"}, + {8077, "\xe1\xbe\x8d"}, + {8078, "\xe1\xbe\x8e"}, + {8079, "\xe1\xbe\x8f"}, + {8080, "\xe1\xbe\x98"}, + {8081, "\xe1\xbe\x99"}, + {8082, "\xe1\xbe\x9a"}, + {8083, "\xe1\xbe\x9b"}, + {8084, "\xe1\xbe\x9c"}, + {8085, "\xe1\xbe\x9d"}, + {8086, "\xe1\xbe\x9e"}, + {8087, "\xe1\xbe\x9f"}, + {8088, "\xe1\xbe\x98"}, + {8089, "\xe1\xbe\x99"}, + {8090, "\xe1\xbe\x9a"}, + {8091, "\xe1\xbe\x9b"}, + {8092, "\xe1\xbe\x9c"}, + {8093, "\xe1\xbe\x9d"}, + {8094, "\xe1\xbe\x9e"}, + {8095, "\xe1\xbe\x9f"}, + {8096, "\xe1\xbe\xa8"}, + {8097, "\xe1\xbe\xa9"}, + {8098, "\xe1\xbe\xaa"}, + {8099, "\xe1\xbe\xab"}, + {8100, "\xe1\xbe\xac"}, + {8101, "\xe1\xbe\xad"}, + {8102, "\xe1\xbe\xae"}, + {8103, "\xe1\xbe\xaf"}, + {8104, "\xe1\xbe\xa8"}, + {8105, "\xe1\xbe\xa9"}, + {8106, "\xe1\xbe\xaa"}, + {8107, "\xe1\xbe\xab"}, + {8108, "\xe1\xbe\xac"}, + {8109, "\xe1\xbe\xad"}, + {8110, "\xe1\xbe\xae"}, + {8111, "\xe1\xbe\xaf"}, + {8115, "\xe1\xbe\xbc"}, + {8124, "\xe1\xbe\xbc"}, + {8131, "\xe1\xbf\x8c"}, + {8140, "\xe1\xbf\x8c"}, + {8179, "\xe1\xbf\xbc"}, + {8188, "\xe1\xbf\xbc"}, + {8114, "\xe1\xbe\xba\xcd\x85"}, + {8116, "\xce\x86\xcd\x85"}, + {8130, "\xe1\xbf\x8a\xcd\x85"}, + {8132, "\xce\x89\xcd\x85"}, + {8178, "\xe1\xbf\xba\xcd\x85"}, + {8180, "\xce\x8f\xcd\x85"}, + {8119, "\xce\x91\xcd\x82\xcd\x85"}, + {8135, "\xce\x97\xcd\x82\xcd\x85"}, + {8183, "\xce\xa9\xcd\x82\xcd\x85"} +}; +const size_t UPPER_SPECIAL_CASING_SIZE = 
103; +const SpecialCasingConversionRecord UPPER_SPECIAL_CASING[UPPER_SPECIAL_CASING_SIZE] = { + {223, "\x53\x53"}, + {304, "\xc4\xb0"}, + {64256, "\x46\x46"}, + {64257, "\x46\x49"}, + {64258, "\x46\x4c"}, + {64259, "\x46\x46\x49"}, + {64260, "\x46\x46\x4c"}, + {64261, "\x53\x54"}, + {64262, "\x53\x54"}, + {1415, "\xd4\xb5\xd5\x92"}, + {64275, "\xd5\x84\xd5\x86"}, + {64276, "\xd5\x84\xd4\xb5"}, + {64277, "\xd5\x84\xd4\xbb"}, + {64278, "\xd5\x8e\xd5\x86"}, + {64279, "\xd5\x84\xd4\xbd"}, + {329, "\xca\xbc\x4e"}, + {912, "\xce\x99\xcc\x88\xcc\x81"}, + {944, "\xce\xa5\xcc\x88\xcc\x81"}, + {496, "\x4a\xcc\x8c"}, + {7830, "\x48\xcc\xb1"}, + {7831, "\x54\xcc\x88"}, + {7832, "\x57\xcc\x8a"}, + {7833, "\x59\xcc\x8a"}, + {7834, "\x41\xca\xbe"}, + {8016, "\xce\xa5\xcc\x93"}, + {8018, "\xce\xa5\xcc\x93\xcc\x80"}, + {8020, "\xce\xa5\xcc\x93\xcc\x81"}, + {8022, "\xce\xa5\xcc\x93\xcd\x82"}, + {8118, "\xce\x91\xcd\x82"}, + {8134, "\xce\x97\xcd\x82"}, + {8146, "\xce\x99\xcc\x88\xcc\x80"}, + {8147, "\xce\x99\xcc\x88\xcc\x81"}, + {8150, "\xce\x99\xcd\x82"}, + {8151, "\xce\x99\xcc\x88\xcd\x82"}, + {8162, "\xce\xa5\xcc\x88\xcc\x80"}, + {8163, "\xce\xa5\xcc\x88\xcc\x81"}, + {8164, "\xce\xa1\xcc\x93"}, + {8166, "\xce\xa5\xcd\x82"}, + {8167, "\xce\xa5\xcc\x88\xcd\x82"}, + {8182, "\xce\xa9\xcd\x82"}, + {8064, "\xe1\xbc\x88\xce\x99"}, + {8065, "\xe1\xbc\x89\xce\x99"}, + {8066, "\xe1\xbc\x8a\xce\x99"}, + {8067, "\xe1\xbc\x8b\xce\x99"}, + {8068, "\xe1\xbc\x8c\xce\x99"}, + {8069, "\xe1\xbc\x8d\xce\x99"}, + {8070, "\xe1\xbc\x8e\xce\x99"}, + {8071, "\xe1\xbc\x8f\xce\x99"}, + {8072, "\xe1\xbc\x88\xce\x99"}, + {8073, "\xe1\xbc\x89\xce\x99"}, + {8074, "\xe1\xbc\x8a\xce\x99"}, + {8075, "\xe1\xbc\x8b\xce\x99"}, + {8076, "\xe1\xbc\x8c\xce\x99"}, + {8077, "\xe1\xbc\x8d\xce\x99"}, + {8078, "\xe1\xbc\x8e\xce\x99"}, + {8079, "\xe1\xbc\x8f\xce\x99"}, + {8080, "\xe1\xbc\xa8\xce\x99"}, + {8081, "\xe1\xbc\xa9\xce\x99"}, + {8082, "\xe1\xbc\xaa\xce\x99"}, + {8083, "\xe1\xbc\xab\xce\x99"}, + {8084, 
"\xe1\xbc\xac\xce\x99"}, + {8085, "\xe1\xbc\xad\xce\x99"}, + {8086, "\xe1\xbc\xae\xce\x99"}, + {8087, "\xe1\xbc\xaf\xce\x99"}, + {8088, "\xe1\xbc\xa8\xce\x99"}, + {8089, "\xe1\xbc\xa9\xce\x99"}, + {8090, "\xe1\xbc\xaa\xce\x99"}, + {8091, "\xe1\xbc\xab\xce\x99"}, + {8092, "\xe1\xbc\xac\xce\x99"}, + {8093, "\xe1\xbc\xad\xce\x99"}, + {8094, "\xe1\xbc\xae\xce\x99"}, + {8095, "\xe1\xbc\xaf\xce\x99"}, + {8096, "\xe1\xbd\xa8\xce\x99"}, + {8097, "\xe1\xbd\xa9\xce\x99"}, + {8098, "\xe1\xbd\xaa\xce\x99"}, + {8099, "\xe1\xbd\xab\xce\x99"}, + {8100, "\xe1\xbd\xac\xce\x99"}, + {8101, "\xe1\xbd\xad\xce\x99"}, + {8102, "\xe1\xbd\xae\xce\x99"}, + {8103, "\xe1\xbd\xaf\xce\x99"}, + {8104, "\xe1\xbd\xa8\xce\x99"}, + {8105, "\xe1\xbd\xa9\xce\x99"}, + {8106, "\xe1\xbd\xaa\xce\x99"}, + {8107, "\xe1\xbd\xab\xce\x99"}, + {8108, "\xe1\xbd\xac\xce\x99"}, + {8109, "\xe1\xbd\xad\xce\x99"}, + {8110, "\xe1\xbd\xae\xce\x99"}, + {8111, "\xe1\xbd\xaf\xce\x99"}, + {8115, "\xce\x91\xce\x99"}, + {8124, "\xce\x91\xce\x99"}, + {8131, "\xce\x97\xce\x99"}, + {8140, "\xce\x97\xce\x99"}, + {8179, "\xce\xa9\xce\x99"}, + {8188, "\xce\xa9\xce\x99"}, + {8114, "\xe1\xbe\xba\xce\x99"}, + {8116, "\xce\x86\xce\x99"}, + {8130, "\xe1\xbf\x8a\xce\x99"}, + {8132, "\xce\x89\xce\x99"}, + {8178, "\xe1\xbf\xba\xce\x99"}, + {8180, "\xce\x8f\xce\x99"}, + {8119, "\xce\x91\xcd\x82\xce\x99"}, + {8135, "\xce\x97\xcd\x82\xce\x99"}, + {8183, "\xce\xa9\xcd\x82\xce\x99"} +}; diff --git a/utf8case/case_tables.hpp b/utf8case/case_tables.hpp new file mode 100644 index 0000000..bd91695 --- /dev/null +++ b/utf8case/case_tables.hpp @@ -0,0 +1,42 @@ +#ifndef CASE_TABLES_HDR +#define CASE_TABLES_HDR + +#include + +enum { + EVEN_ODD = 1, + ODD_EVEN = -1, + EVEN_ODD_SKIP = 1<<30, + ODD_EVEN_SKIP, +}; + +struct CaseConversionRecord { + uint32_t lo_code_point; + uint32_t hi_code_point; + int32_t delta; +}; + +struct SpecialCasingConversionRecord { + uint32_t code_point; + const char* replacement; +}; + +extern const size_t 
LOWER_CASE_RANGES_SIZE; +extern const CaseConversionRecord LOWER_CASE_RANGES[]; + +extern const size_t UPPER_CASE_RANGES_SIZE; +extern const CaseConversionRecord UPPER_CASE_RANGES[]; + +extern const size_t TITLE_CASE_RANGES_SIZE; +extern const CaseConversionRecord TITLE_CASE_RANGES[]; + +extern const size_t LOWER_SPECIAL_CASING_SIZE; +extern const SpecialCasingConversionRecord LOWER_SPECIAL_CASING[]; + +extern const size_t TITLE_SPECIAL_CASING_SIZE; +extern const SpecialCasingConversionRecord TITLE_SPECIAL_CASING[]; + +extern const size_t UPPER_SPECIAL_CASING_SIZE; +extern const SpecialCasingConversionRecord UPPER_SPECIAL_CASING[]; + +#endif diff --git a/utf8case/contextual_case_converter.hpp b/utf8case/contextual_case_converter.hpp new file mode 100644 index 0000000..0a6e9de --- /dev/null +++ b/utf8case/contextual_case_converter.hpp @@ -0,0 +1,17 @@ +#ifndef CONTEXTUAL_CASE_CONVERTER_HDR +#define CONTEXTUAL_CASE_CONVERTER_HDR + +#include + +class ContextualCaseConverter { +public: + virtual ~ContextualCaseConverter() { + } + + virtual const char* convert( + uint32_t prev_code_point, + uint32_t code_point, + uint32_t next_code_point) = 0; +}; + +#endif diff --git a/utf8case/general_case_converter.hpp b/utf8case/general_case_converter.hpp new file mode 100644 index 0000000..6d5439e --- /dev/null +++ b/utf8case/general_case_converter.hpp @@ -0,0 +1,138 @@ +#ifndef GENERAL_CASE_CONVERTER_HDR +#define GENERAL_CASE_CONVERTER_HDR + +#include + +#include "range_based_case_converter.hpp" +#include "special_casing_converter.hpp" +#include "contextual_case_converter.hpp" + +#include "utf8/utf8.h" + +template +class GeneralCaseConverter { + +public: + GeneralCaseConverter( + boost::shared_ptr rangeBasedCaseConverter, + boost::shared_ptr specialCasingConverter, + boost::shared_ptr contextualCaseConverter) + :rangeBasedCaseConverter_(rangeBasedCaseConverter), + specialCasingConverter_(specialCasingConverter), + contextualCaseConverter_(contextualCaseConverter) { + } + + bool 
willBeTouchedWhenConverted(octet_iterator start, octet_iterator end) const { + while (start != end) { + uint32_t code_point = utf8::unchecked::next(start); + + if (specialCasingConverter_->convert(code_point) + || rangeBasedCaseConverter_->convert(code_point) != code_point) + return true; + } + + return false; + } + + bool willBeTouchedWhenHeadConverted(octet_iterator start, octet_iterator end) const { + if (start == end) + return false; + + octet_iterator prev_start = start; + utf8::unchecked::next(start); + return willBeTouchedWhenConverted(prev_start, start); + } + + bool willBeTouchedWhenTailConverted(octet_iterator start, octet_iterator end) const { + if (start == end) + return false; + + utf8::unchecked::next(start); + return willBeTouchedWhenConverted(start, end); + } + + void convert(octet_iterator start, octet_iterator end, output_iterator out) const { + uint32_t prev_prev_code_point = SPECIAL_CODE_POINT; + uint32_t prev_code_point = SPECIAL_CODE_POINT; + + while (start != end) { + uint32_t code_point = utf8::unchecked::next(start); + + if (prev_code_point != SPECIAL_CODE_POINT) + convertSingleCodePoint( + prev_prev_code_point, + prev_code_point, + code_point, + out); + + prev_prev_code_point = prev_code_point; + prev_code_point = code_point; + } + + if (prev_code_point != SPECIAL_CODE_POINT) + convertSingleCodePoint( + prev_prev_code_point, + prev_code_point, + SPECIAL_CODE_POINT, + out); + } + + void convertSingleCodePoint( + uint32_t prev_code_point, + uint32_t current_code_point, + uint32_t next_code_point, + output_iterator out) const { + + if (const char* contextual = contextualCaseConverter_->convert( + prev_code_point, + current_code_point, + next_code_point)) { + copyCharArrayToOutputIterator_(contextual, out); + } else if (const char* special = specialCasingConverter_->convert(current_code_point)) { + copyCharArrayToOutputIterator_(special, out); + } else { + uint32_t converted_code_point = rangeBasedCaseConverter_->convert(current_code_point); + 
utf8::unchecked::append(converted_code_point, out); + } + } + + void headConvert(octet_iterator start, octet_iterator end, output_iterator out) const { + bool first = true; + + while (start != end) { + if (first) { + octet_iterator prev_start = start; + utf8::unchecked::next(start); + convert(prev_start, start, out); + first = false; + } else { + *out++ = *start++; + } + } + } + + void tailConvert(octet_iterator start, octet_iterator end, output_iterator out) const { + if (start != end) { + uint32_t code_point = utf8::unchecked::next(start); + + utf8::unchecked::append(code_point, out); + + convert(start, end, out); + } + } + + +private: + void copyCharArrayToOutputIterator_(const char* charVector, output_iterator out) const { + while (*charVector) + *out++ = *charVector++; + } + + boost::shared_ptr rangeBasedCaseConverter_; + boost::shared_ptr specialCasingConverter_; + boost::shared_ptr contextualCaseConverter_; + + const static uint32_t SPECIAL_CODE_POINT = 0xFFFFFFFF; +}; + +#endif diff --git a/utf8case/generate_case_tables.pl b/utf8case/generate_case_tables.pl new file mode 100755 index 0000000..6a163c1 --- /dev/null +++ b/utf8case/generate_case_tables.pl @@ -0,0 +1,251 @@ +#!/usr/bin/perl + +# Based on ideas from re2 library. + +use strict; +use LWP::Simple; +use String::Util qw(hascontent); +use Data::Dumper; +use Clone qw(clone); + +my $UNIDATA_PREFIX= q{http://unicode.org/Public/UNIDATA/}; +my $OUTPUT_CPP_FILE = 'case_tables.cpp'; + +my @lower_case_ranges; +my @upper_case_ranges; +my @title_case_ranges; + +my @lower_special_casing; +my @upper_special_casing; +my @title_special_casing; + +open my $output_cpp_fh, '>', $OUTPUT_CPP_FILE; +generate_intro(); +generate_standard_case_tables(); +generate_special_casing_tables(); + +sub generate_intro { + print $output_cpp_fh <<'END_OF_INTRO'; +// GENERATED AUTOMATICALLY BY generate_case_tables.pl; DO NOT EDIT. 
+ +#include "case_tables.hpp" + +END_OF_INTRO +} + +sub generate_standard_case_tables { + my @unicode_data_lines = download_unidata_file('UnicodeData.txt'); + + for my $line (@unicode_data_lines) { + append_to_case_ranges(\@upper_case_ranges, $line->[0], $line->[12]); + append_to_case_ranges(\@lower_case_ranges, $line->[0], $line->[13]); + append_to_case_ranges(\@title_case_ranges, $line->[0], $line->[14]); + } + + @lower_case_ranges = compactify(\@lower_case_ranges); + @upper_case_ranges = compactify(\@upper_case_ranges); + @title_case_ranges = compactify(\@title_case_ranges); + + write_case_table('lower_case_ranges', \@lower_case_ranges); + print $output_cpp_fh "\n"; # blank separator belongs in the generated file, not on STDOUT + + write_case_table('upper_case_ranges', \@upper_case_ranges); + print $output_cpp_fh "\n"; + + write_case_table('title_case_ranges', \@title_case_ranges); + print $output_cpp_fh "\n"; +} + +sub generate_special_casing_tables { + my @special_casing_lines = download_unidata_file('SpecialCasing.txt'); + + for my $line (@special_casing_lines) { + if (hascontent($line->[4])) { + print STDERR "This cannot be handled: ", join('; ', @{$line}),"\n"; + } else { + append_to_special_casing_table(\@lower_special_casing, $line->[0], $line->[1]); + append_to_special_casing_table(\@title_special_casing, $line->[0], $line->[2]); + append_to_special_casing_table(\@upper_special_casing, $line->[0], $line->[3]); + } + } + + write_special_casing_table('lower_special_casing', \@lower_special_casing); + print $output_cpp_fh "\n"; + + write_special_casing_table('title_special_casing', \@title_special_casing); + print $output_cpp_fh "\n"; + + write_special_casing_table('upper_special_casing', \@upper_special_casing); +} + +sub download_unidata_file { + my ($file_name) = @_; + + my $url = $UNIDATA_PREFIX . 
$file_name; + + print STDERR "Downloading ${url}...\n"; + + my $contents = get($url) or die "Cannot download ${url}\n"; # LWP::Simple::get returns undef on failure + + return map { [ split/\s*;\s*/ ] } + grep { /\S/ } + map{ s/\#.*\Z//; $_} + split/\r?\n/, $contents; +} + +sub append_to_case_ranges { + my ($case_ranges_ref, $hex_code_point, $hex_modified_code_point) = @_; + + if (!hascontent($hex_modified_code_point)) { + return; + } + + my $code_point = hex($hex_code_point); + my $modified_code_point = hex($hex_modified_code_point); + + push @{$case_ranges_ref}, + [ $code_point, $code_point, delta($code_point, $modified_code_point) ]; +} + +sub compactify { + my ($case_ranges_ref) = @_; + + my @new_table; + + my $current_compact_range; + + for my $range (@{$case_ranges_ref}) { + if (!defined($current_compact_range)) { + $current_compact_range = clone($range); + } elsif ($range->[2] eq $current_compact_range->[2] + && $range->[0] == $current_compact_range->[1] + 1) { + ++$current_compact_range->[1]; + } elsif ($range->[2] eq de_skip($current_compact_range->[2]) + && $range->[0] == $current_compact_range->[1] + 2) { + $current_compact_range->[1] += 2; + $current_compact_range->[2] = add_skip($current_compact_range->[2]); + } else { + push @new_table, $current_compact_range; + $current_compact_range = clone($range); + } + } + + push @new_table, $current_compact_range if defined($current_compact_range); # empty input yields an empty table, not (undef) + + return @new_table; +} + +sub write_case_table { + my ($name, $case_ranges_ref) = @_; + + my $table_name = uc($name); + my $size_constant_name = $table_name . 
"_SIZE"; + my $table_size = $#{$case_ranges_ref} + 1; + + + print $output_cpp_fh <<"END_OF_INTRO"; +const size_t $size_constant_name = $table_size; +const CaseConversionRecord ${table_name}[$size_constant_name] = { +END_OF_INTRO + + my $string_to_prepend = ''; + + for my $range (@{$case_ranges_ref}) { + my $from = $range->[0]; + my $to = $range->[1]; + my $delta = $range->[2]; + + print $output_cpp_fh "${string_to_prepend} {$from, $to, $delta}"; + + $string_to_prepend = ",\n"; + } + + + print $output_cpp_fh "\n};\n"; +} + +sub append_to_special_casing_table { + my ($special_casing_table_ref, $hex_code_point, $hex_code_point_vector) = @_; + + if (!hascontent($hex_code_point_vector)) { + return; + } + + my $code_point = hex($hex_code_point); + my @code_point_vector = map { hex($_) } split/\s+/, $hex_code_point_vector; + + push @{$special_casing_table_ref}, [$code_point, cpp_encode(@code_point_vector)]; # explicit deref: push on a scalar ref is fatal since Perl 5.24 +} + +sub write_special_casing_table { + my ($name, $special_casing_table_ref) = @_; + + my $table_name = uc($name); + my $size_constant_name = $table_name . "_SIZE"; + my $table_size = $#{$special_casing_table_ref} + 1; + + print $output_cpp_fh <<"END_OF_INTRO"; +const size_t $size_constant_name = $table_size; +const SpecialCasingConversionRecord ${table_name}[$size_constant_name] = { +END_OF_INTRO + + my $string_to_prepend = ''; + + for my $item (sort { $a->[0] <=> $b->[0] } @{$special_casing_table_ref}) { # ascending code points: the C++ lookup stops at the first larger entry + my $code_point = $item->[0]; + my $replacement = $item->[1]; + + print $output_cpp_fh "${string_to_prepend} {$code_point, \"$replacement\"}"; + + $string_to_prepend = ",\n"; + } + + print $output_cpp_fh "\n};\n"; +} + +sub cpp_encode { + my (@v) = @_; + + my $s = join('', map{ chr($_) } @v); + + return join('', map { "\\x$_" } unpack("U0(H2)*", $s)); +} + +sub de_skip { + my ($delta) = @_; + + if ($delta =~ /^(EVEN_ODD|ODD_EVEN)(?:_SKIP)?$/) { + return $1; + } + + return 'CANNOT_BE_SKIPPED'; +} + +sub add_skip { + my ($delta) = @_; + + return de_skip($delta) . 
'_SKIP'; +} + + +sub delta { + my ($a, $b) = @_; + + if ($a + 1 == $b) { + if ($a % 2 == 0) { + return 'EVEN_ODD' + } + else { + return 'ODD_EVEN'; + } + } elsif ($a == $b + 1) { + if ($a % 2 == 0) { + return 'ODD_EVEN'; + } + else { + return 'EVEN_ODD'; + } + } + + return $b - $a; +} diff --git a/utf8case/range_based_case_converter.cpp b/utf8case/range_based_case_converter.cpp new file mode 100644 index 0000000..a6aa8b5 --- /dev/null +++ b/utf8case/range_based_case_converter.cpp @@ -0,0 +1,63 @@ +#include "range_based_case_converter.hpp" + +uint32_t RangeBasedCaseConverter::convert(uint32_t code_point) const { + + const CaseConversionRecord* conversionRecord = findRecord_(code_point); + + return + conversionRecord == 0 + ? code_point + : applyRecord_(conversionRecord, code_point); +} + +const CaseConversionRecord* RangeBasedCaseConverter::findRecord_(uint32_t code_point) const { + + for (size_t i = 0; i < tableSize_; ++i) { + const CaseConversionRecord* currentRecord = &conversionTable_[i]; + + if (code_point < currentRecord->lo_code_point) + return 0; + + if (code_point <= currentRecord->hi_code_point) + return currentRecord; + } + + return 0; +} + +uint32_t RangeBasedCaseConverter::applyRecord_( + const CaseConversionRecord* conversionRecord, uint32_t code_point) const { + + if (shouldBeSkipped_(conversionRecord, code_point)) + return code_point; + + return applyDelta_(conversionRecord->delta, code_point); +} + + +bool RangeBasedCaseConverter::shouldBeSkipped_( + const CaseConversionRecord* conversionRecord, uint32_t code_point) const { + + return + isSkipRecord_(conversionRecord) + && code_point % 2 != conversionRecord->lo_code_point % 2; +} + +bool RangeBasedCaseConverter::isSkipRecord_(const CaseConversionRecord* conversionRecord) const { + return + conversionRecord->delta == EVEN_ODD_SKIP + || conversionRecord->delta == ODD_EVEN_SKIP; +} + +uint32_t RangeBasedCaseConverter::applyDelta_(int32_t delta, uint32_t code_point) const { + switch (delta) { + case 
EVEN_ODD: + case EVEN_ODD_SKIP: + return code_point % 2 == 0 ? code_point+1 : code_point-1; + case ODD_EVEN: + case ODD_EVEN_SKIP: + return code_point % 2 == 1 ? code_point+1 : code_point-1; + default: + return code_point + delta; + } +} diff --git a/utf8case/range_based_case_converter.hpp b/utf8case/range_based_case_converter.hpp new file mode 100644 index 0000000..4451f79 --- /dev/null +++ b/utf8case/range_based_case_converter.hpp @@ -0,0 +1,27 @@ +#ifndef RANGE_BASED_CASE_CONVERTED_HDR +#define RANGE_BASED_CASE_CONVERTED_HDR + +#include "case_tables.hpp" + +class RangeBasedCaseConverter { + +public: + RangeBasedCaseConverter(size_t tableSize, const CaseConversionRecord* conversionTable) + :tableSize_(tableSize), conversionTable_(conversionTable) { + } + + uint32_t convert(uint32_t code_point) const; + +private: + const CaseConversionRecord* findRecord_(uint32_t code_point) const; + uint32_t applyRecord_(const CaseConversionRecord* conversionRecord, uint32_t code_point) const; + bool shouldBeSkipped_( + const CaseConversionRecord* conversionRecord, uint32_t code_point) const; + bool isSkipRecord_(const CaseConversionRecord* conversionRecord) const; + uint32_t applyDelta_(int32_t delta, uint32_t code_point) const; + + size_t tableSize_; + const CaseConversionRecord* conversionTable_; +}; + +#endif diff --git a/utf8case/regular_contextual_case_converter.cpp b/utf8case/regular_contextual_case_converter.cpp new file mode 100644 index 0000000..184b6db --- /dev/null +++ b/utf8case/regular_contextual_case_converter.cpp @@ -0,0 +1,11 @@ +#include "regular_contextual_case_converter.hpp" + +RegularContextualCaseConverter::~RegularContextualCaseConverter() { +} + +const char* RegularContextualCaseConverter::convert( + uint32_t /*prev_code_point*/, + uint32_t /*code_point*/, + uint32_t /*next_code_point*/) { + return 0; +} diff --git a/utf8case/regular_contextual_case_converter.hpp b/utf8case/regular_contextual_case_converter.hpp new file mode 100644 index 0000000..3b02e10 
--- /dev/null +++ b/utf8case/regular_contextual_case_converter.hpp @@ -0,0 +1,16 @@ +#ifndef REGULAR_CONTEXTUAL_CASE_CONVERTER_HDR +#define REGULAR_CONTEXTUAL_CASE_CONVERTER_HDR + +#include "contextual_case_converter.hpp" + +class RegularContextualCaseConverter: public ContextualCaseConverter { +public: + virtual ~RegularContextualCaseConverter(); + + virtual const char* convert( + uint32_t prev_code_point, + uint32_t code_point, + uint32_t next_code_point); +}; + +#endif diff --git a/utf8case/simple_convert.cpp b/utf8case/simple_convert.cpp new file mode 100644 index 0000000..62304e5 --- /dev/null +++ b/utf8case/simple_convert.cpp @@ -0,0 +1,56 @@ +#include "simple_convert.hpp" + +std::string simpleConvert( + const StringGeneralCaseConverter& converter, + const std::string& s) { + + std::string result; + + converter.convert(s.begin(), s.end(), std::back_inserter(result)); + + return result; +} + +std::string simpleHeadConvert( + const StringGeneralCaseConverter& converter, + const std::string& s) { + + std::string result; + + converter.headConvert(s.begin(), s.end(), std::back_inserter(result)); + + return result; +} + +std::string simpleTailConvert( + const StringGeneralCaseConverter& converter, + const std::string& s) { + + std::string result; + + converter.tailConvert(s.begin(), s.end(), std::back_inserter(result)); + + return result; +} + + +bool simpleWillBeTouchedWhenConverted( + const StringGeneralCaseConverter& converter, + const std::string& s) { + + return converter.willBeTouchedWhenConverted(s.begin(), s.end()); +} + +bool simpleWillBeTouchedWhenHeadConverted( + const StringGeneralCaseConverter& converter, + const std::string& s) { + + return converter.willBeTouchedWhenHeadConverted(s.begin(), s.end()); +} + +bool simpleWillBeTouchedWhenTailConverted( + const StringGeneralCaseConverter& converter, + const std::string& s) { + + return converter.willBeTouchedWhenTailConverted(s.begin(), s.end()); +} diff --git a/utf8case/simple_convert.hpp 
b/utf8case/simple_convert.hpp new file mode 100644 index 0000000..4e0011e --- /dev/null +++ b/utf8case/simple_convert.hpp @@ -0,0 +1,34 @@ +#ifndef SIMPLE_CONVERT_HDR +#define SIMPLE_CONVERT_HDR + +#include "general_case_converter.hpp" + +typedef GeneralCaseConverter<std::string::const_iterator, std::back_insert_iterator<std::string> > StringGeneralCaseConverter; + +std::string simpleConvert( + const StringGeneralCaseConverter& converter, + const std::string& s); + +std::string simpleHeadConvert( + const StringGeneralCaseConverter& converter, + const std::string& s); + +std::string simpleTailConvert( + const StringGeneralCaseConverter& converter, + const std::string& s); + + +bool simpleWillBeTouchedWhenConverted( + const StringGeneralCaseConverter& converter, + const std::string& s); + +bool simpleWillBeTouchedWhenHeadConverted( + const StringGeneralCaseConverter& converter, + const std::string& s); + +bool simpleWillBeTouchedWhenTailConverted( + const StringGeneralCaseConverter& converter, + const std::string& s); + +#endif diff --git a/utf8case/special_casing_converter.cpp b/utf8case/special_casing_converter.cpp new file mode 100644 index 0000000..c23afb9 --- /dev/null +++ b/utf8case/special_casing_converter.cpp @@ -0,0 +1,16 @@ +#include "special_casing_converter.hpp" + +const char* SpecialCasingConverter::convert(uint32_t code_point) const { + + for (size_t i = 0; i < tableSize_; ++i) { + const SpecialCasingConversionRecord* currentRecord = &conversionTable_[i]; + + if (code_point < currentRecord->code_point) + return 0; + + if (code_point == currentRecord->code_point) + return currentRecord->replacement; + } + + return 0; +} diff --git a/utf8case/special_casing_converter.hpp b/utf8case/special_casing_converter.hpp new file mode 100644 index 0000000..08f7e92 --- /dev/null +++ b/utf8case/special_casing_converter.hpp @@ -0,0 +1,21 @@ +#ifndef SPECIAL_CASING_CONVERTER_HDR +#define SPECIAL_CASING_CONVERTER_HDR + +#include "case_tables.hpp" + +class SpecialCasingConverter { + +public: + SpecialCasingConverter(size_t tableSize, 
const SpecialCasingConversionRecord* conversionTable) + :tableSize_(tableSize), conversionTable_(conversionTable) { + } + + const char* convert(uint32_t code_point) const; + +private: + + size_t tableSize_; + const SpecialCasingConversionRecord* conversionTable_; +}; + +#endif diff --git a/utf8case/string_case_converter_manager.cpp b/utf8case/string_case_converter_manager.cpp new file mode 100644 index 0000000..0af4316 --- /dev/null +++ b/utf8case/string_case_converter_manager.cpp @@ -0,0 +1,11 @@ +#include "string_case_converter_manager.hpp" + +StringCaseConverterManager& StringCaseConverterManager::getInstance() { + static StringCaseConverterManager instance; + + return instance; +} + + +StringCaseConverterManager::StringCaseConverterManager() { +} diff --git a/utf8case/string_case_converter_manager.hpp b/utf8case/string_case_converter_manager.hpp new file mode 100644 index 0000000..c4dd5e2 --- /dev/null +++ b/utf8case/string_case_converter_manager.hpp @@ -0,0 +1,16 @@ +#ifndef STRING_CASE_CONVERTER_MANAGER_HDR +#define STRING_CASE_CONVERTER_MANAGER_HDR + +#include "case_converter_factory.hpp" + +class StringCaseConverterManager : public CaseConverterFactory< + std::string::const_iterator, std::back_insert_iterator<std::string> > { + +public: + static StringCaseConverterManager& getInstance(); + +private: + StringCaseConverterManager(); +}; + +#endif diff --git a/utf8case/t/CMakeLists.txt b/utf8case/t/CMakeLists.txt new file mode 100644 index 0000000..a550899 --- /dev/null +++ b/utf8case/t/CMakeLists.txt @@ -0,0 +1,7 @@ +add_library(utf8case-tests + range_based_case_converter_tests.cpp + simple_convert_tests.cpp + special_casing_converter_tests.cpp +) + +target_link_libraries(utf8case-tests utf8case) diff --git a/utf8case/t/range_based_case_converter_tests.cpp b/utf8case/t/range_based_case_converter_tests.cpp new file mode 100644 index 0000000..cd33d3a --- /dev/null +++ b/utf8case/t/range_based_case_converter_tests.cpp @@ -0,0 +1,43 @@ +#include "tests/tests.hpp" + +#include 
"utf8case/range_based_case_converter.hpp" + +BOOST_AUTO_TEST_SUITE( utf8case ) + +void lower_single_letter_checker(uint32_t lower_code_point, uint32_t upper_code_point) { + RangeBasedCaseConverter converter(LOWER_CASE_RANGES_SIZE, + LOWER_CASE_RANGES); + + BOOST_CHECK_EQUAL(converter.convert(upper_code_point), lower_code_point); + BOOST_CHECK_EQUAL(converter.convert(lower_code_point), lower_code_point); +} + +BOOST_AUTO_TEST_CASE( range_based_case_converter ) { + RangeBasedCaseConverter converter(LOWER_CASE_RANGES_SIZE, + LOWER_CASE_RANGES); + + const uint32_t COMMA_CODE_POINT = 44U; + BOOST_CHECK_EQUAL(converter.convert(COMMA_CODE_POINT), COMMA_CODE_POINT); + + const uint32_t UPPER_F_CODE_POINT = 70U; + const uint32_t LOWER_F_CODE_POINT = 102U; + lower_single_letter_checker(LOWER_F_CODE_POINT, UPPER_F_CODE_POINT); + + const uint32_t UPPER_A_CODE_POINT = 65U; + const uint32_t LOWER_A_CODE_POINT = 97U; + lower_single_letter_checker(LOWER_A_CODE_POINT, UPPER_A_CODE_POINT); + + const uint32_t UPPER_Z_CODE_POINT = 90U; + const uint32_t LOWER_Z_CODE_POINT = 122U; + lower_single_letter_checker(LOWER_Z_CODE_POINT, UPPER_Z_CODE_POINT); + + const uint32_t UPPER_E_OGONEK_CODE_POINT = 280U; + const uint32_t LOWER_E_OGONEK_CODE_POINT = 281U; + lower_single_letter_checker(LOWER_E_OGONEK_CODE_POINT, UPPER_E_OGONEK_CODE_POINT); + + const uint32_t UPPER_SHCHA_CODE_POINT = 1065U; + const uint32_t LOWER_SHCHA_CODE_POINT = 1097U; + lower_single_letter_checker(LOWER_SHCHA_CODE_POINT, UPPER_SHCHA_CODE_POINT); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/utf8case/t/simple_convert_tests.cpp b/utf8case/t/simple_convert_tests.cpp new file mode 100644 index 0000000..3f72e20 --- /dev/null +++ b/utf8case/t/simple_convert_tests.cpp @@ -0,0 +1,176 @@ +#include "tests/tests.hpp" + +#include "utf8case/simple_convert.hpp" +#include "utf8case/case_converter_factory.hpp" +#include "utf8case/string_case_converter_manager.hpp" + +BOOST_AUTO_TEST_SUITE( utf8case ) + +BOOST_AUTO_TEST_CASE( 
simple_convert_lower ) { + + boost::shared_ptr<StringGeneralCaseConverter> lowerConverter = + StringCaseConverterManager::getInstance().getLowerCaseConverter("pl"); + + BOOST_CHECK_EQUAL(simpleConvert(*lowerConverter, "KOMPUTER"), + std::string("komputer")); + + BOOST_CHECK_EQUAL(simpleConvert(*lowerConverter, "ŹDŹBŁO"), + std::string("źdźbło")); + + BOOST_CHECK_EQUAL(simpleConvert(*lowerConverter, "Zażółć gęślą JAŹŃ"), + std::string("zażółć gęślą jaźń")); + + BOOST_CHECK_EQUAL(simpleConvert(*lowerConverter, "I"), + std::string("i")); + + BOOST_CHECK_EQUAL(simpleConvert(*lowerConverter, "БУКВЫ"), + std::string("буквы")); + + + BOOST_CHECK_EQUAL(simpleHeadConvert(*lowerConverter, "ŹDŹBŁO"), + std::string("źDŹBŁO")); + + BOOST_CHECK_EQUAL(simpleTailConvert(*lowerConverter, "ŹDŹBŁO"), + std::string("Źdźbło")); + + + BOOST_CHECK_EQUAL(simpleConvert(*lowerConverter, "Ś"), + std::string("ś")); + + BOOST_CHECK_EQUAL(simpleHeadConvert(*lowerConverter, "Ś"), + std::string("ś")); + + BOOST_CHECK_EQUAL(simpleTailConvert(*lowerConverter, "Ś"), + std::string("Ś")); + + + BOOST_CHECK_EQUAL(simpleConvert(*lowerConverter, ""), + std::string("")); + + BOOST_CHECK_EQUAL(simpleHeadConvert(*lowerConverter, ""), + std::string("")); + + BOOST_CHECK_EQUAL(simpleTailConvert(*lowerConverter, ""), + std::string("")); + +} + + +BOOST_AUTO_TEST_CASE( will_be_touched ) { + boost::shared_ptr<StringGeneralCaseConverter> upperConverter = + StringCaseConverterManager::getInstance().getUpperCaseConverter("pl"); + + BOOST_CHECK(simpleWillBeTouchedWhenConverted(*upperConverter, "KOMPUTEr")); + BOOST_CHECK(simpleWillBeTouchedWhenTailConverted(*upperConverter, "KOMPUTEr")); + BOOST_CHECK(!simpleWillBeTouchedWhenHeadConverted(*upperConverter, "KOMPUTEr")); + + BOOST_CHECK(!simpleWillBeTouchedWhenConverted(*upperConverter, "KOMPUTER")); + BOOST_CHECK(!simpleWillBeTouchedWhenTailConverted(*upperConverter, "KOMPUTER")); + BOOST_CHECK(!simpleWillBeTouchedWhenHeadConverted(*upperConverter, "KOMPUTER")); + + 
BOOST_CHECK(simpleWillBeTouchedWhenConverted(*upperConverter, "śNIEG")); + BOOST_CHECK(!simpleWillBeTouchedWhenTailConverted(*upperConverter, "śNIEG")); + BOOST_CHECK(simpleWillBeTouchedWhenHeadConverted(*upperConverter, "śNIEG")); + + BOOST_CHECK(simpleWillBeTouchedWhenConverted(*upperConverter, "ź")); + BOOST_CHECK(!simpleWillBeTouchedWhenTailConverted(*upperConverter, "ź")); + BOOST_CHECK(simpleWillBeTouchedWhenHeadConverted(*upperConverter, "ź")); + + BOOST_CHECK(!simpleWillBeTouchedWhenConverted(*upperConverter, "")); + BOOST_CHECK(!simpleWillBeTouchedWhenTailConverted(*upperConverter, "")); + BOOST_CHECK(!simpleWillBeTouchedWhenHeadConverted(*upperConverter, "")); +} + +BOOST_AUTO_TEST_CASE( simple_convert_upper ) { + + boost::shared_ptr<StringGeneralCaseConverter> upperConverter = + StringCaseConverterManager::getInstance().getUpperCaseConverter("pl"); + + BOOST_CHECK_EQUAL(simpleConvert(*upperConverter, "komputer"), + std::string("KOMPUTER")); + + BOOST_CHECK_EQUAL(simpleConvert(*upperConverter, "źdźbło"), + std::string("ŹDŹBŁO")); + + BOOST_CHECK_EQUAL(simpleConvert(*upperConverter, "daß"), + std::string("DASS")); + + BOOST_CHECK_EQUAL(simpleConvert(*upperConverter, "ffi"), + std::string("FFI")); + +} + + +BOOST_AUTO_TEST_CASE( simple_convert_title ) { + + boost::shared_ptr<StringGeneralCaseConverter> titleConverter = + StringCaseConverterManager::getInstance().getTitleCaseConverter("pl"); + + BOOST_CHECK_EQUAL(simpleConvert(*titleConverter, "źdźbło"), + std::string("ŹDŹBŁO")); + + BOOST_CHECK_EQUAL(simpleConvert(*titleConverter, "daß"), + std::string("DASs")); + + BOOST_CHECK_EQUAL(simpleConvert(*titleConverter, "ffi"), + std::string("Ffi")); + +} + +BOOST_AUTO_TEST_CASE( simple_turkish_lower ) { + + boost::shared_ptr<StringGeneralCaseConverter> standardLowerConverter = + StringCaseConverterManager::getInstance().getLowerCaseConverter("pl"); + + boost::shared_ptr<StringGeneralCaseConverter> turkishLowerConverter = + StringCaseConverterManager::getInstance().getLowerCaseConverter("tr"); + + BOOST_CHECK_EQUAL(simpleConvert(*standardLowerConverter, "YAZICI"), + 
std::string("yazici")); + + BOOST_CHECK_EQUAL(simpleConvert(*turkishLowerConverter, "YAZICI"), + std::string("yazıcı")); + + BOOST_CHECK_EQUAL(simpleConvert(*standardLowerConverter, "I"), + std::string("i")); + + BOOST_CHECK_EQUAL(simpleConvert(*turkishLowerConverter, "I"), + std::string("ı")); + + BOOST_CHECK_EQUAL(simpleConvert(*standardLowerConverter, "İ"), + std::string("i̇")); + + BOOST_CHECK_EQUAL(simpleConvert(*turkishLowerConverter, "İ"), + std::string("i")); + +} + +BOOST_AUTO_TEST_CASE( simple_turkish_upper ) { + + boost::shared_ptr<StringGeneralCaseConverter> standardUpperConverter = + StringCaseConverterManager::getInstance().getUpperCaseConverter("pl"); + + boost::shared_ptr<StringGeneralCaseConverter> turkishUpperConverter = + StringCaseConverterManager::getInstance().getUpperCaseConverter("tr"); + + BOOST_CHECK_EQUAL(simpleConvert(*standardUpperConverter, "yazici"), + std::string("YAZICI")); + + BOOST_CHECK_EQUAL(simpleConvert(*turkishUpperConverter, "yazici"), + std::string("YAZİCİ")); + + BOOST_CHECK_EQUAL(simpleConvert(*standardUpperConverter, "i"), + std::string("I")); + + BOOST_CHECK_EQUAL(simpleConvert(*turkishUpperConverter, "i"), + std::string("İ")); + + BOOST_CHECK_EQUAL(simpleConvert(*standardUpperConverter, "ı"), + std::string("I")); + + BOOST_CHECK_EQUAL(simpleConvert(*turkishUpperConverter, "ı"), + std::string("I")); + +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/utf8case/t/special_casing_converter_tests.cpp b/utf8case/t/special_casing_converter_tests.cpp new file mode 100644 index 0000000..f320e86 --- /dev/null +++ b/utf8case/t/special_casing_converter_tests.cpp @@ -0,0 +1,32 @@ +#include "tests/tests.hpp" + +#include "utf8case/special_casing_converter.hpp" + +BOOST_AUTO_TEST_SUITE( utf8case ) + +BOOST_AUTO_TEST_CASE( special_casing_converter ) { + SpecialCasingConverter converter(UPPER_SPECIAL_CASING_SIZE, + UPPER_SPECIAL_CASING); + + const uint32_t COMMA_CODE_POINT = 44U; + BOOST_CHECK_EQUAL(converter.convert(COMMA_CODE_POINT), (const char*)0); + + const uint32_t UPPER_F_CODE_POINT = 
70U; + const uint32_t LOWER_F_CODE_POINT = 102U; + BOOST_CHECK_EQUAL(converter.convert(UPPER_F_CODE_POINT), (const char*)0); + BOOST_CHECK_EQUAL(converter.convert(LOWER_F_CODE_POINT), (const char*)0); + + const uint32_t UPPER_SHCHA_CODE_POINT = 1065U; + const uint32_t LOWER_SHCHA_CODE_POINT = 1097U; + BOOST_CHECK_EQUAL(converter.convert(UPPER_SHCHA_CODE_POINT), (const char*)0); + BOOST_CHECK_EQUAL(converter.convert(LOWER_SHCHA_CODE_POINT), (const char*)0); + + const uint32_t ESZET_CODE_POINT = 223U; + BOOST_CHECK_EQUAL(converter.convert(ESZET_CODE_POINT), "SS"); +} + +BOOST_AUTO_TEST_CASE( special_casing_converter2 ) { + BOOST_CHECK_EQUAL("SS", "SS"); + +} +BOOST_AUTO_TEST_SUITE_END() diff --git a/utf8case/turkish_and_azeri_lower_contextual_case_converter.cpp b/utf8case/turkish_and_azeri_lower_contextual_case_converter.cpp new file mode 100644 index 0000000..fed182c --- /dev/null +++ b/utf8case/turkish_and_azeri_lower_contextual_case_converter.cpp @@ -0,0 +1,29 @@ +#include "turkish_and_azeri_lower_contextual_case_converter.hpp" + +TurkishAndAzeriLowerContextualCaseConverter::~TurkishAndAzeriLowerContextualCaseConverter() { +} + +const char* TurkishAndAzeriLowerContextualCaseConverter::convert( + uint32_t prev_code_point, + uint32_t code_point, + uint32_t next_code_point) { + + if (code_point == LATIN_CAPITAL_LETTER_I && next_code_point != DOT_ABOVE) + return "ı"; + + if (code_point == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) + return "i"; + + if (code_point == DOT_ABOVE && prev_code_point == LATIN_CAPITAL_LETTER_I) + return ""; + + return 0; +} + +const uint32_t TurkishAndAzeriLowerContextualCaseConverter::LATIN_CAPITAL_LETTER_I = 0x0049; + + +const uint32_t TurkishAndAzeriLowerContextualCaseConverter::LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE = + 0x0130; + +const uint32_t TurkishAndAzeriLowerContextualCaseConverter::DOT_ABOVE = 0x0307; diff --git a/utf8case/turkish_and_azeri_lower_contextual_case_converter.hpp 
b/utf8case/turkish_and_azeri_lower_contextual_case_converter.hpp new file mode 100644 index 0000000..ba14436 --- /dev/null +++ b/utf8case/turkish_and_azeri_lower_contextual_case_converter.hpp @@ -0,0 +1,21 @@ +#ifndef TURKISH_AND_AZERI_LOWER_CONTEXTUAL_CASE_CONVERTER_HDR +#define TURKISH_AND_AZERI_LOWER_CONTEXTUAL_CASE_CONVERTER_HDR + +#include "contextual_case_converter.hpp" + +class TurkishAndAzeriLowerContextualCaseConverter: public ContextualCaseConverter { +public: + virtual ~TurkishAndAzeriLowerContextualCaseConverter(); + + virtual const char* convert( + uint32_t prev_code_point, + uint32_t code_point, + uint32_t next_code_point); +private: + const static uint32_t LATIN_CAPITAL_LETTER_I; + const static uint32_t LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE; + const static uint32_t DOT_ABOVE; +}; + + +#endif diff --git a/utf8case/turkish_and_azeri_upper_contextual_case_converter.cpp b/utf8case/turkish_and_azeri_upper_contextual_case_converter.cpp new file mode 100644 index 0000000..7a70c5f --- /dev/null +++ b/utf8case/turkish_and_azeri_upper_contextual_case_converter.cpp @@ -0,0 +1,17 @@ +#include "turkish_and_azeri_upper_contextual_case_converter.hpp" + +TurkishAndAzeriUpperContextualCaseConverter::~TurkishAndAzeriUpperContextualCaseConverter() { +} + +const char* TurkishAndAzeriUpperContextualCaseConverter::convert( + uint32_t /*prev_code_point*/, + uint32_t code_point, + uint32_t /*next_code_point*/) { + + if (code_point == LATIN_SMALL_LETTER_I) + return "İ"; + + return 0; +} + +const uint32_t TurkishAndAzeriUpperContextualCaseConverter::LATIN_SMALL_LETTER_I = 0x0069; diff --git a/utf8case/turkish_and_azeri_upper_contextual_case_converter.hpp b/utf8case/turkish_and_azeri_upper_contextual_case_converter.hpp new file mode 100644 index 0000000..d5a845a --- /dev/null +++ b/utf8case/turkish_and_azeri_upper_contextual_case_converter.hpp @@ -0,0 +1,20 @@ +#ifndef TURKISH_AND_AZERI_UPPER_CONTEXTUAL_CASE_CONVERTER_HDR +#define 
TURKISH_AND_AZERI_UPPER_CONTEXTUAL_CASE_CONVERTER_HDR + +#include "contextual_case_converter.hpp" + +class TurkishAndAzeriUpperContextualCaseConverter: public ContextualCaseConverter { +public: + virtual ~TurkishAndAzeriUpperContextualCaseConverter(); + + virtual const char* convert( + uint32_t prev_code_point, + uint32_t code_point, + uint32_t next_code_point); + +private: + const static uint32_t LATIN_SMALL_LETTER_I; +}; + + +#endif