utf8
Former-commit-id: fa7407621e839f87613476596c6589aeceb9d796
This commit is contained in:
parent
9358863f8d
commit
6ddba32f48
@ -43,7 +43,7 @@ file(MAKE_DIRECTORY ${PROD_RESOURCES_DIRECTORY}/temp)
|
|||||||
|
|
||||||
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
|
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
|
||||||
|
|
||||||
set(BASE_TARGETS concordia)
|
set(BASE_TARGETS concordia utf8case)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -135,7 +135,7 @@ configure_file (
|
|||||||
# Concordia: sub-projects
|
# Concordia: sub-projects
|
||||||
# ================================================
|
# ================================================
|
||||||
|
|
||||||
set(ALL_DIRECTORIES concordia concordia-console libdivsufsort)
|
set(ALL_DIRECTORIES concordia concordia-console libdivsufsort utf8 utf8case)
|
||||||
|
|
||||||
include_directories("${concordia_SOURCE_DIR}")
|
include_directories("${concordia_SOURCE_DIR}")
|
||||||
|
|
||||||
@ -150,7 +150,7 @@ endforeach(dir)
|
|||||||
# Tests
|
# Tests
|
||||||
# ================================================
|
# ================================================
|
||||||
|
|
||||||
set(TESTS_TARGETS concordia-tests)
|
set(TESTS_TARGETS concordia-tests utf8case-tests)
|
||||||
|
|
||||||
add_subdirectory(tests)
|
add_subdirectory(tests)
|
||||||
|
|
||||||
|
1
TODO.txt
1
TODO.txt
@ -1,3 +1,4 @@
|
|||||||
1. lokalizowane to_lower
|
1. lokalizowane to_lower
|
||||||
2. anonimizacja zdań
|
2. anonimizacja zdań
|
||||||
3. Dzielenie zdań (max 255 tokenów)
|
3. Dzielenie zdań (max 255 tokenów)
|
||||||
|
4. concordia-server
|
||||||
|
7
tests/tests.hpp
Normal file
7
tests/tests.hpp
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
#ifndef TESTS_HDR
|
||||||
|
#define TESTS_HDR
|
||||||
|
|
||||||
|
#define BOOST_TEST_NO_MAIN
|
||||||
|
#include <boost/test/unit_test.hpp>
|
||||||
|
|
||||||
|
#endif
|
0
utf8/CMakeLists.txt
Normal file
0
utf8/CMakeLists.txt
Normal file
45
utf8/utf8.h
Normal file
45
utf8/utf8.h
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
/*!
|
||||||
|
## Character encoding
|
||||||
|
|
||||||
|
In the PSI toolkit, UTF-8 is used uniformly. All textual data is
|
||||||
|
assumed to be encoded in UTF8.
|
||||||
|
|
||||||
|
Technically, std::string is simply used to store UTF8 strings. To handle
|
||||||
|
UTF8, a small external library was incorporated into the project,
|
||||||
|
see: http://utfcpp.sourceforge.net/
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Copyright 2006 Nemanja Trifunovic
|
||||||
|
|
||||||
|
/*
|
||||||
|
Permission is hereby granted, free of charge, to any person or organization
|
||||||
|
obtaining a copy of the software and accompanying documentation covered by
|
||||||
|
this license (the "Software") to use, reproduce, display, distribute,
|
||||||
|
execute, and transmit the Software, and to prepare derivative works of the
|
||||||
|
Software, and to permit third-parties to whom the Software is furnished to
|
||||||
|
do so, all subject to the following:
|
||||||
|
|
||||||
|
The copyright notices in the Software and this entire statement, including
|
||||||
|
the above license grant, this restriction and the following disclaimer,
|
||||||
|
must be included in all copies of the Software, in whole or in part, and
|
||||||
|
all derivative works of the Software, unless such copies or derivative
|
||||||
|
works are solely in the form of machine-executable object code generated by
|
||||||
|
a source language processor.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||||
|
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||||
|
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||||
|
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
||||||
|
#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
||||||
|
|
||||||
|
#include "utf8/checked.h"
|
||||||
|
#include "utf8/unchecked.h"
|
||||||
|
|
||||||
|
#endif // header guard
|
327
utf8/utf8/checked.h
Normal file
327
utf8/utf8/checked.h
Normal file
@ -0,0 +1,327 @@
|
|||||||
|
// Copyright 2006 Nemanja Trifunovic
|
||||||
|
|
||||||
|
/*
|
||||||
|
Permission is hereby granted, free of charge, to any person or organization
|
||||||
|
obtaining a copy of the software and accompanying documentation covered by
|
||||||
|
this license (the "Software") to use, reproduce, display, distribute,
|
||||||
|
execute, and transmit the Software, and to prepare derivative works of the
|
||||||
|
Software, and to permit third-parties to whom the Software is furnished to
|
||||||
|
do so, all subject to the following:
|
||||||
|
|
||||||
|
The copyright notices in the Software and this entire statement, including
|
||||||
|
the above license grant, this restriction and the following disclaimer,
|
||||||
|
must be included in all copies of the Software, in whole or in part, and
|
||||||
|
all derivative works of the Software, unless such copies or derivative
|
||||||
|
works are solely in the form of machine-executable object code generated by
|
||||||
|
a source language processor.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||||
|
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||||
|
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||||
|
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
||||||
|
#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
||||||
|
|
||||||
|
#include "core.h"
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
|
namespace utf8
|
||||||
|
{
|
||||||
|
// Base class of the library's own exception types (invalid_code_point,
// invalid_utf8, invalid_utf16 and not_enough_room).
class exception : public std::exception
{
};
|
||||||
|
|
||||||
|
// Exceptions that may be thrown from the library functions.
|
||||||
|
// Reports a value that is not a valid Unicode code point.
class invalid_code_point : public exception
{
public:
    // Stores the offending value so that handlers can inspect it.
    invalid_code_point(uint32_t code) : cp(code) {}

    virtual const char* what() const throw()
    {
        return "Invalid code point";
    }

    // Returns the value this exception was constructed with.
    uint32_t code_point() const
    {
        return cp;
    }

private:
    uint32_t cp;
};
|
||||||
|
|
||||||
|
// Reports an octet that cannot be part of well-formed UTF-8 at its position.
class invalid_utf8 : public exception
{
public:
    // Stores the offending octet so that handlers can inspect it.
    invalid_utf8 (uint8_t octet) : u8(octet) {}

    virtual const char* what() const throw()
    {
        return "invalid UTF-8, convert file to UTF-8 encoding and run again";
    }

    // Returns the octet this exception was constructed with.
    uint8_t utf8_octet() const
    {
        return u8;
    }

private:
    uint8_t u8;
};
|
||||||
|
|
||||||
|
// Reports a 16-bit unit that breaks UTF-16 surrogate pairing rules.
class invalid_utf16 : public exception
{
public:
    // Stores the offending 16-bit unit so that handlers can inspect it.
    invalid_utf16 (uint16_t word) : u16(word) {}

    virtual const char* what() const throw()
    {
        return "invalid UTF-16, convert file to UTF-16 encoding and run again";
    }

    // Returns the 16-bit unit this exception was constructed with.
    uint16_t utf16_word() const
    {
        return u16;
    }

private:
    uint16_t u16;
};
|
||||||
|
|
||||||
|
// Reports that the input range ended before a complete sequence was read.
class not_enough_room : public exception
{
public:
    virtual const char* what() const throw()
    {
        return "Not enough space";
    }
};
|
||||||
|
|
||||||
|
/// The library API - functions intended to be called by the users
|
||||||
|
|
||||||
|
template <typename octet_iterator, typename output_iterator>
|
||||||
|
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
|
||||||
|
{
|
||||||
|
while (start != end) {
|
||||||
|
octet_iterator sequence_start = start;
|
||||||
|
internal::utf_error err_code = internal::validate_next(start, end);
|
||||||
|
switch (err_code) {
|
||||||
|
case internal::UTF8_OK :
|
||||||
|
for (octet_iterator it = sequence_start; it != start; ++it)
|
||||||
|
*out++ = *it;
|
||||||
|
break;
|
||||||
|
case internal::NOT_ENOUGH_ROOM:
|
||||||
|
throw not_enough_room();
|
||||||
|
case internal::INVALID_LEAD:
|
||||||
|
append (replacement, out);
|
||||||
|
++start;
|
||||||
|
break;
|
||||||
|
case internal::INCOMPLETE_SEQUENCE:
|
||||||
|
case internal::OVERLONG_SEQUENCE:
|
||||||
|
case internal::INVALID_CODE_POINT:
|
||||||
|
append (replacement, out);
|
||||||
|
++start;
|
||||||
|
// just one replacement mark for the sequence
|
||||||
|
while (internal::is_trail(*start) && start != end)
|
||||||
|
++start;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename octet_iterator, typename output_iterator>
|
||||||
|
inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
|
||||||
|
{
|
||||||
|
static const uint32_t replacement_marker = internal::mask16(0xfffd);
|
||||||
|
return replace_invalid(start, end, out, replacement_marker);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename octet_iterator>
|
||||||
|
octet_iterator append(uint32_t cp, octet_iterator result)
|
||||||
|
{
|
||||||
|
if (!internal::is_code_point_valid(cp))
|
||||||
|
throw invalid_code_point(cp);
|
||||||
|
|
||||||
|
if (cp < 0x80) // one octet
|
||||||
|
*(result++) = static_cast<uint8_t>(cp);
|
||||||
|
else if (cp < 0x800) { // two octets
|
||||||
|
*(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
|
||||||
|
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||||
|
}
|
||||||
|
else if (cp < 0x10000) { // three octets
|
||||||
|
*(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
|
||||||
|
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
|
||||||
|
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||||
|
}
|
||||||
|
else { // four octets
|
||||||
|
*(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
|
||||||
|
*(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
|
||||||
|
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
|
||||||
|
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename octet_iterator>
|
||||||
|
uint32_t next(octet_iterator& it, octet_iterator end)
|
||||||
|
{
|
||||||
|
uint32_t cp = 0;
|
||||||
|
internal::utf_error err_code = internal::validate_next(it, end, &cp);
|
||||||
|
switch (err_code) {
|
||||||
|
case internal::UTF8_OK :
|
||||||
|
break;
|
||||||
|
case internal::NOT_ENOUGH_ROOM :
|
||||||
|
throw not_enough_room();
|
||||||
|
case internal::INVALID_LEAD :
|
||||||
|
case internal::INCOMPLETE_SEQUENCE :
|
||||||
|
case internal::OVERLONG_SEQUENCE :
|
||||||
|
throw invalid_utf8(*it);
|
||||||
|
case internal::INVALID_CODE_POINT :
|
||||||
|
throw invalid_code_point(cp);
|
||||||
|
}
|
||||||
|
return cp;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decodes the code point that starts at it without moving the caller's
// iterator; the iterator is taken by value and only the copy is advanced.
// Throws the same exceptions as next().
template <typename octet_iterator>
uint32_t peek_next(octet_iterator it, octet_iterator end)
{
    const uint32_t decoded = next(it, end);
    return decoded;
}
|
||||||
|
|
||||||
|
template <typename octet_iterator>
|
||||||
|
uint32_t prior(octet_iterator& it, octet_iterator start)
|
||||||
|
{
|
||||||
|
// can't do much if it == start
|
||||||
|
if (it == start)
|
||||||
|
throw not_enough_room();
|
||||||
|
|
||||||
|
octet_iterator end = it;
|
||||||
|
// Go back until we hit either a lead octet or start
|
||||||
|
while (internal::is_trail(*(--it)))
|
||||||
|
if (it == start)
|
||||||
|
throw invalid_utf8(*it); // error - no lead byte in the sequence
|
||||||
|
return peek_next(it, end);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Deprecated in versions that include "prior"
|
||||||
|
template <typename octet_iterator>
|
||||||
|
uint32_t previous(octet_iterator& it, octet_iterator pass_start)
|
||||||
|
{
|
||||||
|
octet_iterator end = it;
|
||||||
|
while (internal::is_trail(*(--it)))
|
||||||
|
if (it == pass_start)
|
||||||
|
throw invalid_utf8(*it); // error - no lead byte in the sequence
|
||||||
|
octet_iterator temp = it;
|
||||||
|
return next(temp, end);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Moves it forward by n code points by calling next() n times; any exception
// thrown by next() propagates to the caller.
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n, octet_iterator end)
{
    distance_type consumed = 0;
    while (consumed < n) {
        next(it, end);
        ++consumed;
    }
}
|
||||||
|
|
||||||
|
// Counts the code points encoded in [first, last) by decoding them one after
// another with next(); any exception thrown by next() propagates.
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
{
    typedef typename std::iterator_traits<octet_iterator>::difference_type count_type;

    count_type code_points = 0;
    while (first < last) {
        next(first, last);
        ++code_points;
    }
    return code_points;
}
|
||||||
|
|
||||||
|
template <typename u16bit_iterator, typename octet_iterator>
|
||||||
|
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
|
||||||
|
{
|
||||||
|
while (start != end) {
|
||||||
|
uint32_t cp = internal::mask16(*start++);
|
||||||
|
// Take care of surrogate pairs first
|
||||||
|
if (internal::is_lead_surrogate(cp)) {
|
||||||
|
if (start != end) {
|
||||||
|
uint32_t trail_surrogate = internal::mask16(*start++);
|
||||||
|
if (internal::is_trail_surrogate(trail_surrogate))
|
||||||
|
cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
|
||||||
|
else
|
||||||
|
throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
throw invalid_utf16(static_cast<uint16_t>(cp));
|
||||||
|
|
||||||
|
}
|
||||||
|
// Lone trail surrogate
|
||||||
|
else if (internal::is_trail_surrogate(cp))
|
||||||
|
throw invalid_utf16(static_cast<uint16_t>(cp));
|
||||||
|
|
||||||
|
result = append(cp, result);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename u16bit_iterator, typename octet_iterator>
|
||||||
|
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
|
||||||
|
{
|
||||||
|
while (start != end) {
|
||||||
|
uint32_t cp = next(start, end);
|
||||||
|
if (cp > 0xffff) { //make a surrogate pair
|
||||||
|
*result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
|
||||||
|
*result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
*result++ = static_cast<uint16_t>(cp);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Encodes every code point in [start, end) as UTF-8 through result and
// returns the output iterator after the last octet written. Exceptions thrown
// by append() propagate.
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
    for (; start != end; ++start)
        result = append(*start, result);

    return result;
}
|
||||||
|
|
||||||
|
// Decodes every code point in [start, end) and writes it through result;
// returns the output iterator after the last value written. Exceptions thrown
// by next() propagate.
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
    while (start != end) {
        *result = next(start, end);
        ++result;
    }

    return result;
}
|
||||||
|
|
||||||
|
// The iterator class
//
// Bidirectional iterator adaptor that walks a range of UTF-8 octets one code
// point at a time. It remembers the bounds of the range it was created for
// and decodes through next() / prior(), so malformed input is reported by the
// exceptions those functions throw.
template <typename octet_iterator>
class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
    octet_iterator it;          // current position
    octet_iterator range_start; // first octet of the underlying range
    octet_iterator range_end;   // one past the last octet of the range
    public:
    // Value-initializes all three positions so that copying or comparing
    // default-constructed iterators never reads an indeterminate value.
    iterator () : it(), range_start(), range_end() {}

    // Creates an iterator at octet_it within [range_start, range_end].
    // Throws std::out_of_range if octet_it lies outside those bounds.
    explicit iterator (const octet_iterator& octet_it,
                       const octet_iterator& range_start,
                       const octet_iterator& range_end) :
        it(octet_it), range_start(range_start), range_end(range_end)
    {
        if (it < range_start || it > range_end)
            throw std::out_of_range("Invalid utf-8 iterator position");
    }

    // the default "big three" are OK

    // Returns the underlying octet iterator at the current position.
    octet_iterator base () const { return it; }

    // Decodes the code point at the current position without moving.
    uint32_t operator * () const
    {
        octet_iterator temp = it;
        return next(temp, range_end);
    }

    // Compares positions. Throws std::logic_error if the two iterators were
    // created for different ranges.
    bool operator == (const iterator& rhs) const
    {
        if (range_start != rhs.range_start || range_end != rhs.range_end)
            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
        return (it == rhs.it);
    }

    bool operator != (const iterator& rhs) const
    {
        return !(operator == (rhs));
    }

    // Pre-increment: moves past the current code point.
    iterator& operator ++ ()
    {
        next(it, range_end);
        return *this;
    }

    // Post-increment: moves past the current code point and returns the
    // position held before the move.
    iterator operator ++ (int)
    {
        iterator temp = *this;
        next(it, range_end);
        return temp;
    }

    // Pre-decrement: moves back to the start of the preceding code point.
    iterator& operator -- ()
    {
        prior(it, range_start);
        return *this;
    }

    // Post-decrement: moves back to the start of the preceding code point and
    // returns the position held before the move.
    iterator operator -- (int)
    {
        iterator temp = *this;
        prior(it, range_start);
        return temp;
    }
}; // class iterator
|
||||||
|
|
||||||
|
} // namespace utf8
|
||||||
|
|
||||||
|
#endif //header guard
|
||||||
|
|
||||||
|
|
358
utf8/utf8/core.h
Normal file
358
utf8/utf8/core.h
Normal file
@ -0,0 +1,358 @@
|
|||||||
|
// Copyright 2006 Nemanja Trifunovic
|
||||||
|
|
||||||
|
/*
|
||||||
|
Permission is hereby granted, free of charge, to any person or organization
|
||||||
|
obtaining a copy of the software and accompanying documentation covered by
|
||||||
|
this license (the "Software") to use, reproduce, display, distribute,
|
||||||
|
execute, and transmit the Software, and to prepare derivative works of the
|
||||||
|
Software, and to permit third-parties to whom the Software is furnished to
|
||||||
|
do so, all subject to the following:
|
||||||
|
|
||||||
|
The copyright notices in the Software and this entire statement, including
|
||||||
|
the above license grant, this restriction and the following disclaimer,
|
||||||
|
must be included in all copies of the Software, in whole or in part, and
|
||||||
|
all derivative works of the Software, unless such copies or derivative
|
||||||
|
works are solely in the form of machine-executable object code generated by
|
||||||
|
a source language processor.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||||
|
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||||
|
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||||
|
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
||||||
|
#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
||||||
|
|
||||||
|
#include <iterator>
|
||||||
|
|
||||||
|
namespace utf8
|
||||||
|
{
|
||||||
|
// The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
|
||||||
|
// You may need to change them to match your system.
|
||||||
|
// These typedefs have the same names as ones from cstdint, or boost/cstdint
|
||||||
|
typedef unsigned char uint8_t;
|
||||||
|
typedef unsigned short uint16_t;
|
||||||
|
typedef unsigned int uint32_t;
|
||||||
|
|
||||||
|
// Helper code - not intended to be directly called by the library users. May be changed at any time
|
||||||
|
namespace internal
|
||||||
|
{
|
||||||
|
// Unicode constants
|
||||||
|
// Leading (high) surrogates: 0xd800 - 0xdbff
|
||||||
|
// Trailing (low) surrogates: 0xdc00 - 0xdfff
|
||||||
|
const uint16_t LEAD_SURROGATE_MIN = 0xd800u;
|
||||||
|
const uint16_t LEAD_SURROGATE_MAX = 0xdbffu;
|
||||||
|
const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
|
||||||
|
const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
|
||||||
|
const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10);
|
||||||
|
const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;
|
||||||
|
|
||||||
|
// Maximum valid value for a Unicode code point
|
||||||
|
const uint32_t CODE_POINT_MAX = 0x0010ffffu;
|
||||||
|
|
||||||
|
// Keeps only the low 8 bits of oc and returns them as an unsigned octet.
template<typename octet_type>
inline uint8_t mask8(octet_type oc)
{
    const uint8_t low_octet = static_cast<uint8_t>(0xff & oc);
    return low_octet;
}
|
||||||
|
// Keeps only the low 16 bits of oc and returns them as an unsigned 16-bit
// value.
template<typename u16_type>
inline uint16_t mask16(u16_type oc)
{
    const uint16_t low_word = static_cast<uint16_t>(0xffff & oc);
    return low_word;
}
|
||||||
|
// True when oc is a UTF-8 trail (continuation) octet, i.e. its two most
// significant bits are 10.
template<typename octet_type>
inline bool is_trail(octet_type oc)
{
    return (mask8(oc) & 0xc0) == 0x80;
}
|
||||||
|
|
||||||
|
template <typename u16>
|
||||||
|
inline bool is_lead_surrogate(u16 cp)
|
||||||
|
{
|
||||||
|
return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename u16>
|
||||||
|
inline bool is_trail_surrogate(u16 cp)
|
||||||
|
{
|
||||||
|
return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename u16>
|
||||||
|
inline bool is_surrogate(u16 cp)
|
||||||
|
{
|
||||||
|
return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename u32>
|
||||||
|
inline bool is_code_point_valid(u32 cp)
|
||||||
|
{
|
||||||
|
return (cp <= CODE_POINT_MAX && !is_surrogate(cp));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the length in octets (1 to 4) of the sequence announced by the lead
// octet at lead_it, or 0 when that octet is not a valid lead octet.
// The iterator is dereferenced unconditionally.
template <typename octet_iterator>
inline typename std::iterator_traits<octet_iterator>::difference_type
sequence_length(octet_iterator lead_it)
{
    const uint8_t lead = mask8(*lead_it);

    if (lead < 0x80)            // 0xxxxxxx
        return 1;
    if ((lead & 0xe0) == 0xc0)  // 110xxxxx
        return 2;
    if ((lead & 0xf0) == 0xe0)  // 1110xxxx
        return 3;
    if ((lead & 0xf8) == 0xf0)  // 11110xxx
        return 4;

    return 0;
}
|
||||||
|
|
||||||
|
// True when cp was encoded with a different number of octets than its value
// calls for: 1 below 0x80, 2 below 0x800, 3 below 0x10000. Values from
// 0x10000 upwards are never reported as overlong.
template <typename octet_difference_type>
inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
{
    if (cp < 0x80)
        return length != 1;
    if (cp < 0x800)
        return length != 2;
    if (cp < 0x10000)
        return length != 3;

    return false;
}
|
||||||
|
|
||||||
|
// Result codes of the internal decoding helpers:
//   UTF8_OK             - a sequence was decoded and passed all checks
//   NOT_ENOUGH_ROOM     - the input ended before the sequence was complete
//   INVALID_LEAD        - the first octet is not a valid lead octet
//   INCOMPLETE_SEQUENCE - an octet that should be a trail octet is not one
//   OVERLONG_SEQUENCE   - the value was encoded with more octets than needed
//   INVALID_CODE_POINT  - the decoded value is not a valid code point
enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
|
||||||
|
|
||||||
|
/// get_sequence_x functions decode utf-8 sequences of the length x
|
||||||
|
|
||||||
|
template <typename octet_iterator>
|
||||||
|
utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point)
|
||||||
|
{
|
||||||
|
if (it != end) {
|
||||||
|
if (code_point)
|
||||||
|
*code_point = mask8(*it);
|
||||||
|
return UTF8_OK;
|
||||||
|
}
|
||||||
|
return NOT_ENOUGH_ROOM;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename octet_iterator>
|
||||||
|
utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point)
|
||||||
|
{
|
||||||
|
utf_error ret_code = NOT_ENOUGH_ROOM;
|
||||||
|
|
||||||
|
if (it != end) {
|
||||||
|
uint32_t cp = mask8(*it);
|
||||||
|
if (++it != end) {
|
||||||
|
if (is_trail(*it)) {
|
||||||
|
cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
|
||||||
|
|
||||||
|
if (code_point)
|
||||||
|
*code_point = cp;
|
||||||
|
ret_code = UTF8_OK;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
ret_code = INCOMPLETE_SEQUENCE;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
ret_code = NOT_ENOUGH_ROOM;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret_code;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename octet_iterator>
|
||||||
|
utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point)
|
||||||
|
{
|
||||||
|
utf_error ret_code = NOT_ENOUGH_ROOM;
|
||||||
|
|
||||||
|
if (it != end) {
|
||||||
|
uint32_t cp = mask8(*it);
|
||||||
|
if (++it != end) {
|
||||||
|
if (is_trail(*it)) {
|
||||||
|
cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);
|
||||||
|
if (++it != end) {
|
||||||
|
if (is_trail(*it)) {
|
||||||
|
cp += (*it) & 0x3f;
|
||||||
|
|
||||||
|
if (code_point)
|
||||||
|
*code_point = cp;
|
||||||
|
ret_code = UTF8_OK;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
ret_code = INCOMPLETE_SEQUENCE;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
ret_code = NOT_ENOUGH_ROOM;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
ret_code = INCOMPLETE_SEQUENCE;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
ret_code = NOT_ENOUGH_ROOM;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret_code;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename octet_iterator>
|
||||||
|
utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point)
|
||||||
|
{
|
||||||
|
utf_error ret_code = NOT_ENOUGH_ROOM;
|
||||||
|
|
||||||
|
if (it != end) {
|
||||||
|
uint32_t cp = mask8(*it);
|
||||||
|
if (++it != end) {
|
||||||
|
if (is_trail(*it)) {
|
||||||
|
cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);
|
||||||
|
if (++it != end) {
|
||||||
|
if (is_trail(*it)) {
|
||||||
|
cp += (mask8(*it) << 6) & 0xfff;
|
||||||
|
if (++it != end) {
|
||||||
|
if (is_trail(*it)) {
|
||||||
|
cp += (*it) & 0x3f;
|
||||||
|
|
||||||
|
if (code_point)
|
||||||
|
*code_point = cp;
|
||||||
|
ret_code = UTF8_OK;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
ret_code = INCOMPLETE_SEQUENCE;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
ret_code = NOT_ENOUGH_ROOM;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
ret_code = INCOMPLETE_SEQUENCE;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
ret_code = NOT_ENOUGH_ROOM;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
ret_code = INCOMPLETE_SEQUENCE;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
ret_code = NOT_ENOUGH_ROOM;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret_code;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename octet_iterator>
|
||||||
|
utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
|
||||||
|
{
|
||||||
|
// Save the original value of it so we can go back in case of failure
|
||||||
|
// Of course, it does not make much sense with i.e. stream iterators
|
||||||
|
octet_iterator original_it = it;
|
||||||
|
|
||||||
|
uint32_t cp = 0;
|
||||||
|
// Determine the sequence length based on the lead octet
|
||||||
|
typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
|
||||||
|
octet_difference_type length = sequence_length(it);
|
||||||
|
if (length == 0)
|
||||||
|
return INVALID_LEAD;
|
||||||
|
|
||||||
|
// Now that we have a valid sequence length, get trail octets and calculate the code point
|
||||||
|
utf_error err = UTF8_OK;
|
||||||
|
switch (length) {
|
||||||
|
case 1:
|
||||||
|
err = get_sequence_1(it, end, &cp);
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
err = get_sequence_2(it, end, &cp);
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
err = get_sequence_3(it, end, &cp);
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
err = get_sequence_4(it, end, &cp);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (err == UTF8_OK) {
|
||||||
|
// Decoding succeeded. Now, security checks...
|
||||||
|
if (is_code_point_valid(cp)) {
|
||||||
|
if (!is_overlong_sequence(cp, length)){
|
||||||
|
// Passed! Return here.
|
||||||
|
if (code_point)
|
||||||
|
*code_point = cp;
|
||||||
|
++it;
|
||||||
|
return UTF8_OK;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
err = OVERLONG_SEQUENCE;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
err = INVALID_CODE_POINT;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Failure branch - restore the original value of the iterator
|
||||||
|
it = original_it;
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename octet_iterator>
|
||||||
|
inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
|
||||||
|
return validate_next(it, end, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace internal
|
||||||
|
|
||||||
|
/// The library API - functions intended to be called by the users
|
||||||
|
|
||||||
|
// Byte order mark
|
||||||
|
const uint8_t bom[] = {0xef, 0xbb, 0xbf};
|
||||||
|
|
||||||
|
template <typename octet_iterator>
|
||||||
|
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
|
||||||
|
{
|
||||||
|
octet_iterator result = start;
|
||||||
|
while (result != end) {
|
||||||
|
internal::utf_error err_code = internal::validate_next(result, end);
|
||||||
|
if (err_code != internal::UTF8_OK)
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// True when find_invalid() finds no invalid sequence in [start, end).
template <typename octet_iterator>
inline bool is_valid(octet_iterator start, octet_iterator end)
{
    const octet_iterator first_bad = find_invalid(start, end);
    return first_bad == end;
}
|
||||||
|
|
||||||
|
template <typename octet_iterator>
|
||||||
|
inline bool starts_with_bom (octet_iterator it, octet_iterator end)
|
||||||
|
{
|
||||||
|
return (
|
||||||
|
((it != end) && (internal::mask8(*it++)) == bom[0]) &&
|
||||||
|
((it != end) && (internal::mask8(*it++)) == bom[1]) &&
|
||||||
|
((it != end) && (internal::mask8(*it)) == bom[2])
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
//Deprecated in release 2.3
|
||||||
|
template <typename octet_iterator>
|
||||||
|
inline bool is_bom (octet_iterator it)
|
||||||
|
{
|
||||||
|
return (
|
||||||
|
(internal::mask8(*it++)) == bom[0] &&
|
||||||
|
(internal::mask8(*it++)) == bom[1] &&
|
||||||
|
(internal::mask8(*it)) == bom[2]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
} // namespace utf8
|
||||||
|
|
||||||
|
#endif // header guard
|
||||||
|
|
||||||
|
|
234
utf8/utf8/unchecked.h
Normal file
234
utf8/utf8/unchecked.h
Normal file
@ -0,0 +1,234 @@
|
|||||||
|
// Copyright 2006 Nemanja Trifunovic
|
||||||
|
|
||||||
|
/*
|
||||||
|
Permission is hereby granted, free of charge, to any person or organization
|
||||||
|
obtaining a copy of the software and accompanying documentation covered by
|
||||||
|
this license (the "Software") to use, reproduce, display, distribute,
|
||||||
|
execute, and transmit the Software, and to prepare derivative works of the
|
||||||
|
Software, and to permit third-parties to whom the Software is furnished to
|
||||||
|
do so, all subject to the following:
|
||||||
|
|
||||||
|
The copyright notices in the Software and this entire statement, including
|
||||||
|
the above license grant, this restriction and the following disclaimer,
|
||||||
|
must be included in all copies of the Software, in whole or in part, and
|
||||||
|
all derivative works of the Software, unless such copies or derivative
|
||||||
|
works are solely in the form of machine-executable object code generated by
|
||||||
|
a source language processor.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||||
|
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||||
|
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||||
|
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
||||||
|
#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
||||||
|
|
||||||
|
#include "core.h"
|
||||||
|
|
||||||
|
namespace utf8
|
||||||
|
{
|
||||||
|
namespace unchecked
|
||||||
|
{
|
||||||
|
// Writes the UTF-8 encoding of the code point `cp` through `result` and
// returns the iterator positioned after the last octet written.
// The number of octets (1 to 4) is chosen from the magnitude of `cp` alone;
// the value is not validated in any other way.
template <typename octet_iterator>
octet_iterator append(uint32_t cp, octet_iterator result)
{
    if (cp < 0x80) {
        // Single octet: the value is stored as is.
        *(result++) = static_cast<uint8_t>(cp);
        return result;
    }

    if (cp < 0x800) {
        // Two octets: 110xxxxx 10xxxxxx
        *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
        *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
        return result;
    }

    if (cp < 0x10000) {
        // Three octets: 1110xxxx 10xxxxxx 10xxxxxx
        *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
        *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
        *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
        return result;
    }

    // Four octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
    *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
    *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
    *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
    return result;
}
|
||||||
|
|
||||||
|
template <typename octet_iterator>
|
||||||
|
uint32_t sequence_length(octet_iterator it)
|
||||||
|
{
|
||||||
|
return utf8::internal::sequence_length(it);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename octet_iterator>
|
||||||
|
uint32_t next(octet_iterator& it)
|
||||||
|
{
|
||||||
|
uint32_t cp = internal::mask8(*it);
|
||||||
|
typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it);
|
||||||
|
switch (length) {
|
||||||
|
case 1:
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
it++;
|
||||||
|
cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
++it;
|
||||||
|
cp = ((cp << 12) & 0xffff) + ((internal::mask8(*it) << 6) & 0xfff);
|
||||||
|
++it;
|
||||||
|
cp += (*it) & 0x3f;
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
++it;
|
||||||
|
cp = ((cp << 18) & 0x1fffff) + ((internal::mask8(*it) << 12) & 0x3ffff);
|
||||||
|
++it;
|
||||||
|
cp += (internal::mask8(*it) << 6) & 0xfff;
|
||||||
|
++it;
|
||||||
|
cp += (*it) & 0x3f;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
++it;
|
||||||
|
return cp;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename octet_iterator>
|
||||||
|
uint32_t peek_next(octet_iterator it)
|
||||||
|
{
|
||||||
|
return next(it);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename octet_iterator>
|
||||||
|
uint32_t prior(octet_iterator& it)
|
||||||
|
{
|
||||||
|
while (internal::is_trail(*(--it))) ;
|
||||||
|
octet_iterator temp = it;
|
||||||
|
return next(temp);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous)
|
||||||
|
template <typename octet_iterator>
|
||||||
|
inline uint32_t previous(octet_iterator& it)
|
||||||
|
{
|
||||||
|
return prior(it);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename octet_iterator, typename distance_type>
|
||||||
|
void advance (octet_iterator& it, distance_type n)
|
||||||
|
{
|
||||||
|
for (distance_type i = 0; i < n; ++i)
|
||||||
|
next(it);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename octet_iterator>
|
||||||
|
typename std::iterator_traits<octet_iterator>::difference_type
|
||||||
|
distance (octet_iterator first, octet_iterator last)
|
||||||
|
{
|
||||||
|
typename std::iterator_traits<octet_iterator>::difference_type dist;
|
||||||
|
for (dist = 0; first < last; ++dist)
|
||||||
|
next(first);
|
||||||
|
return dist;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename u16bit_iterator, typename octet_iterator>
|
||||||
|
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
|
||||||
|
{
|
||||||
|
while (start != end) {
|
||||||
|
uint32_t cp = internal::mask16(*start++);
|
||||||
|
// Take care of surrogate pairs first
|
||||||
|
if (internal::is_lead_surrogate(cp)) {
|
||||||
|
uint32_t trail_surrogate = internal::mask16(*start++);
|
||||||
|
cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
|
||||||
|
}
|
||||||
|
result = append(cp, result);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename u16bit_iterator, typename octet_iterator>
|
||||||
|
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
|
||||||
|
{
|
||||||
|
while (start < end) {
|
||||||
|
uint32_t cp = next(start);
|
||||||
|
if (cp > 0xffff) { //make a surrogate pair
|
||||||
|
*result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
|
||||||
|
*result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
*result++ = static_cast<uint16_t>(cp);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Encodes every code point in [start, end) as UTF-8 through `result` and
// returns the output iterator after the last octet written.
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
    for (; start != end; ++start) {
        result = append(*start, result);
    }
    return result;
}
|
||||||
|
|
||||||
|
template <typename octet_iterator, typename u32bit_iterator>
|
||||||
|
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
|
||||||
|
{
|
||||||
|
while (start < end)
|
||||||
|
(*result++) = next(start);
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// The iterator class
|
||||||
|
template <typename octet_iterator>
|
||||||
|
class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
|
||||||
|
octet_iterator it;
|
||||||
|
public:
|
||||||
|
iterator () {};
|
||||||
|
explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
|
||||||
|
// the default "big three" are OK
|
||||||
|
octet_iterator base () const { return it; }
|
||||||
|
uint32_t operator * () const
|
||||||
|
{
|
||||||
|
octet_iterator temp = it;
|
||||||
|
return next(temp);
|
||||||
|
}
|
||||||
|
bool operator == (const iterator& rhs) const
|
||||||
|
{
|
||||||
|
return (it == rhs.it);
|
||||||
|
}
|
||||||
|
bool operator != (const iterator& rhs) const
|
||||||
|
{
|
||||||
|
return !(operator == (rhs));
|
||||||
|
}
|
||||||
|
iterator& operator ++ ()
|
||||||
|
{
|
||||||
|
std::advance(it, internal::sequence_length(it));
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
iterator operator ++ (int)
|
||||||
|
{
|
||||||
|
iterator temp = *this;
|
||||||
|
std::advance(it, internal::sequence_length(it));
|
||||||
|
return temp;
|
||||||
|
}
|
||||||
|
iterator& operator -- ()
|
||||||
|
{
|
||||||
|
prior(it);
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
iterator operator -- (int)
|
||||||
|
{
|
||||||
|
iterator temp = *this;
|
||||||
|
prior(it);
|
||||||
|
return temp;
|
||||||
|
}
|
||||||
|
}; // class iterator
|
||||||
|
|
||||||
|
} // namespace utf8::unchecked
|
||||||
|
} // namespace utf8
|
||||||
|
|
||||||
|
|
||||||
|
#endif // header guard
|
||||||
|
|
28
utf8case/CMakeLists.txt
Normal file
28
utf8case/CMakeLists.txt
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
# Build the utf8case shared library from its translation units.
add_library(utf8case SHARED
    case_tables.cpp
    range_based_case_converter.cpp
    regular_contextual_case_converter.cpp
    simple_convert.cpp
    special_casing_converter.cpp
    string_case_converter_manager.cpp
    turkish_and_azeri_lower_contextual_case_converter.cpp
    turkish_and_azeri_upper_contextual_case_converter.cpp
)

# The t/ sub-directory carries its own CMakeLists.txt
# (presumably the unit tests of this library -- confirm).
add_subdirectory(t)

# Install the library binary.
install(TARGETS utf8case DESTINATION lib/)

# Install the public headers (listed alphabetically).
install(FILES
    case_converter_factory.hpp
    case_tables.hpp
    contextual_case_converter.hpp
    general_case_converter.hpp
    range_based_case_converter.hpp
    regular_contextual_case_converter.hpp
    simple_convert.hpp
    special_casing_converter.hpp
    string_case_converter_manager.hpp
    turkish_and_azeri_lower_contextual_case_converter.hpp
    turkish_and_azeri_upper_contextual_case_converter.hpp

    DESTINATION include/utf8case/)
|
149
utf8case/case_converter_factory.hpp
Normal file
149
utf8case/case_converter_factory.hpp
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
#ifndef CASE_CONVERTER_FACTORY_HDR
|
||||||
|
#define CASE_CONVERTER_FACTORY_HDR
|
||||||
|
|
||||||
|
#include <boost/shared_ptr.hpp>
|
||||||
|
|
||||||
|
#include "general_case_converter.hpp"
|
||||||
|
#include "concordia/concordia_exception.hpp"
|
||||||
|
#include "regular_contextual_case_converter.hpp"
|
||||||
|
#include "turkish_and_azeri_lower_contextual_case_converter.hpp"
|
||||||
|
#include "turkish_and_azeri_upper_contextual_case_converter.hpp"
|
||||||
|
|
||||||
|
const size_t NUMBER_OF_CASE_TYPES = 3;
|
||||||
|
|
||||||
|
template<typename octet_iterator, typename output_iterator>
|
||||||
|
class CaseConverterFactory {
|
||||||
|
|
||||||
|
private:
|
||||||
|
enum {
|
||||||
|
LOWER_INDEX = 0,
|
||||||
|
UPPER_INDEX = 1,
|
||||||
|
TITLE_INDEX = 2
|
||||||
|
};
|
||||||
|
|
||||||
|
boost::shared_ptr<RangeBasedCaseConverter> rangeBasedCaseConverters_[NUMBER_OF_CASE_TYPES];
|
||||||
|
boost::shared_ptr<SpecialCasingConverter> specialCasingConverters_[NUMBER_OF_CASE_TYPES];
|
||||||
|
|
||||||
|
boost::shared_ptr<ContextualCaseConverter> regularContextualCaseConverter_;
|
||||||
|
boost::shared_ptr<ContextualCaseConverter> turkishAndAzeriUpperContextualCaseConverter_;
|
||||||
|
boost::shared_ptr<ContextualCaseConverter> turkishAndAzeriLowerContextualCaseConverter_;
|
||||||
|
|
||||||
|
class Exception : public ConcordiaException {
|
||||||
|
public:
|
||||||
|
Exception(const std::string& msg): ConcordiaException(msg) {
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual ~Exception() throw() {}
|
||||||
|
};
|
||||||
|
|
||||||
|
void checkRawConverters_(int case_index) {
|
||||||
|
if (!rangeBasedCaseConverters_[case_index]) {
|
||||||
|
boost::shared_ptr<RangeBasedCaseConverter> converter;
|
||||||
|
|
||||||
|
switch (case_index) {
|
||||||
|
case LOWER_INDEX: converter.reset(
|
||||||
|
new RangeBasedCaseConverter(
|
||||||
|
LOWER_CASE_RANGES_SIZE,
|
||||||
|
LOWER_CASE_RANGES));
|
||||||
|
break;
|
||||||
|
case UPPER_INDEX: converter.reset(
|
||||||
|
new RangeBasedCaseConverter(
|
||||||
|
UPPER_CASE_RANGES_SIZE,
|
||||||
|
UPPER_CASE_RANGES));
|
||||||
|
break;
|
||||||
|
case TITLE_INDEX: converter.reset(
|
||||||
|
new RangeBasedCaseConverter(
|
||||||
|
TITLE_CASE_RANGES_SIZE,
|
||||||
|
TITLE_CASE_RANGES));
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
throw Exception("????");
|
||||||
|
}
|
||||||
|
|
||||||
|
rangeBasedCaseConverters_[case_index] = converter;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!specialCasingConverters_[case_index]) {
|
||||||
|
boost::shared_ptr<SpecialCasingConverter> converter;
|
||||||
|
|
||||||
|
switch (case_index) {
|
||||||
|
case LOWER_INDEX: converter.reset(
|
||||||
|
new SpecialCasingConverter(
|
||||||
|
LOWER_SPECIAL_CASING_SIZE,
|
||||||
|
LOWER_SPECIAL_CASING));
|
||||||
|
break;
|
||||||
|
case UPPER_INDEX: converter.reset(
|
||||||
|
new SpecialCasingConverter(
|
||||||
|
UPPER_SPECIAL_CASING_SIZE,
|
||||||
|
UPPER_SPECIAL_CASING));
|
||||||
|
break;
|
||||||
|
case TITLE_INDEX: converter.reset(
|
||||||
|
new SpecialCasingConverter(
|
||||||
|
TITLE_SPECIAL_CASING_SIZE,
|
||||||
|
TITLE_SPECIAL_CASING));
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
throw Exception("????");
|
||||||
|
}
|
||||||
|
|
||||||
|
specialCasingConverters_[case_index] = converter;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
boost::shared_ptr<GeneralCaseConverter<octet_iterator, output_iterator> > getCaseConverter_(
|
||||||
|
int case_index, const std::string& language_code) {
|
||||||
|
|
||||||
|
checkRawConverters_(case_index);
|
||||||
|
|
||||||
|
return boost::shared_ptr<GeneralCaseConverter<octet_iterator, output_iterator> >(
|
||||||
|
new GeneralCaseConverter<octet_iterator, output_iterator> (
|
||||||
|
rangeBasedCaseConverters_[case_index],
|
||||||
|
specialCasingConverters_[case_index],
|
||||||
|
getContextualCaseConverterForLanguage_(language_code, case_index)));
|
||||||
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
CaseConverterFactory():
|
||||||
|
regularContextualCaseConverter_(
|
||||||
|
boost::shared_ptr<ContextualCaseConverter>(new RegularContextualCaseConverter())),
|
||||||
|
turkishAndAzeriUpperContextualCaseConverter_(
|
||||||
|
boost::shared_ptr<ContextualCaseConverter>(
|
||||||
|
new TurkishAndAzeriUpperContextualCaseConverter())),
|
||||||
|
turkishAndAzeriLowerContextualCaseConverter_(
|
||||||
|
boost::shared_ptr<ContextualCaseConverter>(
|
||||||
|
new TurkishAndAzeriLowerContextualCaseConverter())) {
|
||||||
|
}
|
||||||
|
|
||||||
|
boost::shared_ptr<ContextualCaseConverter> getContextualCaseConverterForLanguage_(
|
||||||
|
const std::string& languageCode, int caseIndex) {
|
||||||
|
if (languageCode == "lt")
|
||||||
|
throw Exception(std::string("language '") + languageCode
|
||||||
|
+ "' is not handled yet in lower/upper/title-casing");
|
||||||
|
|
||||||
|
if (languageCode == "tr" || languageCode == "az")
|
||||||
|
return
|
||||||
|
caseIndex == LOWER_INDEX
|
||||||
|
? turkishAndAzeriLowerContextualCaseConverter_
|
||||||
|
: turkishAndAzeriUpperContextualCaseConverter_;
|
||||||
|
|
||||||
|
return regularContextualCaseConverter_;
|
||||||
|
}
|
||||||
|
|
||||||
|
boost::shared_ptr<GeneralCaseConverter<octet_iterator, output_iterator> >
|
||||||
|
getLowerCaseConverter(const std::string& language_code) {
|
||||||
|
return getCaseConverter_(LOWER_INDEX, language_code);
|
||||||
|
}
|
||||||
|
|
||||||
|
boost::shared_ptr<GeneralCaseConverter<octet_iterator, output_iterator> >
|
||||||
|
getUpperCaseConverter(const std::string& language_code) {
|
||||||
|
return getCaseConverter_(UPPER_INDEX, language_code);
|
||||||
|
}
|
||||||
|
|
||||||
|
boost::shared_ptr<GeneralCaseConverter<octet_iterator, output_iterator> >
|
||||||
|
getTitleCaseConverter(const std::string& language_code) {
|
||||||
|
return getCaseConverter_(TITLE_INDEX, language_code);
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
804
utf8case/case_tables.cpp
Normal file
804
utf8case/case_tables.cpp
Normal file
@ -0,0 +1,804 @@
|
|||||||
|
// GENERATED AUTOMATICALLY BY generate_case_tables.pl; DO NOT EDIT.
|
||||||
|
|
||||||
|
#include "case_tables.hpp"
|
||||||
|
|
||||||
|
const size_t LOWER_CASE_RANGES_SIZE = 151;
|
||||||
|
const CaseConversionRecord LOWER_CASE_RANGES[LOWER_CASE_RANGES_SIZE] = {
|
||||||
|
{65, 90, 32},
|
||||||
|
{192, 214, 32},
|
||||||
|
{216, 222, 32},
|
||||||
|
{256, 302, EVEN_ODD_SKIP},
|
||||||
|
{304, 304, -199},
|
||||||
|
{306, 310, EVEN_ODD_SKIP},
|
||||||
|
{313, 327, ODD_EVEN_SKIP},
|
||||||
|
{330, 374, EVEN_ODD_SKIP},
|
||||||
|
{376, 376, -121},
|
||||||
|
{377, 381, ODD_EVEN_SKIP},
|
||||||
|
{385, 385, 210},
|
||||||
|
{386, 388, EVEN_ODD_SKIP},
|
||||||
|
{390, 390, 206},
|
||||||
|
{391, 391, ODD_EVEN},
|
||||||
|
{393, 394, 205},
|
||||||
|
{395, 395, ODD_EVEN},
|
||||||
|
{398, 398, 79},
|
||||||
|
{399, 399, 202},
|
||||||
|
{400, 400, 203},
|
||||||
|
{401, 401, ODD_EVEN},
|
||||||
|
{403, 403, 205},
|
||||||
|
{404, 404, 207},
|
||||||
|
{406, 406, 211},
|
||||||
|
{407, 407, 209},
|
||||||
|
{408, 408, EVEN_ODD},
|
||||||
|
{412, 412, 211},
|
||||||
|
{413, 413, 213},
|
||||||
|
{415, 415, 214},
|
||||||
|
{416, 420, EVEN_ODD_SKIP},
|
||||||
|
{422, 422, 218},
|
||||||
|
{423, 423, ODD_EVEN},
|
||||||
|
{425, 425, 218},
|
||||||
|
{428, 428, EVEN_ODD},
|
||||||
|
{430, 430, 218},
|
||||||
|
{431, 431, ODD_EVEN},
|
||||||
|
{433, 434, 217},
|
||||||
|
{435, 437, ODD_EVEN_SKIP},
|
||||||
|
{439, 439, 219},
|
||||||
|
{440, 440, EVEN_ODD},
|
||||||
|
{444, 444, EVEN_ODD},
|
||||||
|
{452, 452, 2},
|
||||||
|
{453, 453, ODD_EVEN},
|
||||||
|
{455, 455, 2},
|
||||||
|
{456, 456, EVEN_ODD},
|
||||||
|
{458, 458, 2},
|
||||||
|
{459, 475, ODD_EVEN_SKIP},
|
||||||
|
{478, 494, EVEN_ODD_SKIP},
|
||||||
|
{497, 497, 2},
|
||||||
|
{498, 500, EVEN_ODD_SKIP},
|
||||||
|
{502, 502, -97},
|
||||||
|
{503, 503, -56},
|
||||||
|
{504, 542, EVEN_ODD_SKIP},
|
||||||
|
{544, 544, -130},
|
||||||
|
{546, 562, EVEN_ODD_SKIP},
|
||||||
|
{570, 570, 10795},
|
||||||
|
{571, 571, ODD_EVEN},
|
||||||
|
{573, 573, -163},
|
||||||
|
{574, 574, 10792},
|
||||||
|
{577, 577, ODD_EVEN},
|
||||||
|
{579, 579, -195},
|
||||||
|
{580, 580, 69},
|
||||||
|
{581, 581, 71},
|
||||||
|
{582, 590, EVEN_ODD_SKIP},
|
||||||
|
{880, 882, EVEN_ODD_SKIP},
|
||||||
|
{886, 886, EVEN_ODD},
|
||||||
|
{902, 902, 38},
|
||||||
|
{904, 906, 37},
|
||||||
|
{908, 908, 64},
|
||||||
|
{910, 911, 63},
|
||||||
|
{913, 929, 32},
|
||||||
|
{931, 939, 32},
|
||||||
|
{975, 975, 8},
|
||||||
|
{984, 1006, EVEN_ODD_SKIP},
|
||||||
|
{1012, 1012, -60},
|
||||||
|
{1015, 1015, ODD_EVEN},
|
||||||
|
{1017, 1017, -7},
|
||||||
|
{1018, 1018, EVEN_ODD},
|
||||||
|
{1021, 1023, -130},
|
||||||
|
{1024, 1039, 80},
|
||||||
|
{1040, 1071, 32},
|
||||||
|
{1120, 1152, EVEN_ODD_SKIP},
|
||||||
|
{1162, 1214, EVEN_ODD_SKIP},
|
||||||
|
{1216, 1216, 15},
|
||||||
|
{1217, 1229, ODD_EVEN_SKIP},
|
||||||
|
{1232, 1318, EVEN_ODD_SKIP},
|
||||||
|
{1329, 1366, 48},
|
||||||
|
{4256, 4293, 7264},
|
||||||
|
{7680, 7828, EVEN_ODD_SKIP},
|
||||||
|
{7838, 7838, -7615},
|
||||||
|
{7840, 7934, EVEN_ODD_SKIP},
|
||||||
|
{7944, 7951, -8},
|
||||||
|
{7960, 7965, -8},
|
||||||
|
{7976, 7983, -8},
|
||||||
|
{7992, 7999, -8},
|
||||||
|
{8008, 8013, -8},
|
||||||
|
{8025, 8025, -8},
|
||||||
|
{8027, 8027, -8},
|
||||||
|
{8029, 8029, -8},
|
||||||
|
{8031, 8031, -8},
|
||||||
|
{8040, 8047, -8},
|
||||||
|
{8072, 8079, -8},
|
||||||
|
{8088, 8095, -8},
|
||||||
|
{8104, 8111, -8},
|
||||||
|
{8120, 8121, -8},
|
||||||
|
{8122, 8123, -74},
|
||||||
|
{8124, 8124, -9},
|
||||||
|
{8136, 8139, -86},
|
||||||
|
{8140, 8140, -9},
|
||||||
|
{8152, 8153, -8},
|
||||||
|
{8154, 8155, -100},
|
||||||
|
{8168, 8169, -8},
|
||||||
|
{8170, 8171, -112},
|
||||||
|
{8172, 8172, -7},
|
||||||
|
{8184, 8185, -128},
|
||||||
|
{8186, 8187, -126},
|
||||||
|
{8188, 8188, -9},
|
||||||
|
{8486, 8486, -7517},
|
||||||
|
{8490, 8490, -8383},
|
||||||
|
{8491, 8491, -8262},
|
||||||
|
{8498, 8498, 28},
|
||||||
|
{8544, 8559, 16},
|
||||||
|
{8579, 8579, ODD_EVEN},
|
||||||
|
{9398, 9423, 26},
|
||||||
|
{11264, 11310, 48},
|
||||||
|
{11360, 11360, EVEN_ODD},
|
||||||
|
{11362, 11362, -10743},
|
||||||
|
{11363, 11363, -3814},
|
||||||
|
{11364, 11364, -10727},
|
||||||
|
{11367, 11371, ODD_EVEN_SKIP},
|
||||||
|
{11373, 11373, -10780},
|
||||||
|
{11374, 11374, -10749},
|
||||||
|
{11375, 11375, -10783},
|
||||||
|
{11376, 11376, -10782},
|
||||||
|
{11378, 11378, EVEN_ODD},
|
||||||
|
{11381, 11381, ODD_EVEN},
|
||||||
|
{11390, 11391, -10815},
|
||||||
|
{11392, 11490, EVEN_ODD_SKIP},
|
||||||
|
{11499, 11501, ODD_EVEN_SKIP},
|
||||||
|
{42560, 42604, EVEN_ODD_SKIP},
|
||||||
|
{42624, 42646, EVEN_ODD_SKIP},
|
||||||
|
{42786, 42798, EVEN_ODD_SKIP},
|
||||||
|
{42802, 42862, EVEN_ODD_SKIP},
|
||||||
|
{42873, 42875, ODD_EVEN_SKIP},
|
||||||
|
{42877, 42877, -35332},
|
||||||
|
{42878, 42886, EVEN_ODD_SKIP},
|
||||||
|
{42891, 42891, ODD_EVEN},
|
||||||
|
{42893, 42893, -42280},
|
||||||
|
{42896, 42896, EVEN_ODD},
|
||||||
|
{42912, 42920, EVEN_ODD_SKIP},
|
||||||
|
{65313, 65338, 32},
|
||||||
|
{66560, 66599, 40}
|
||||||
|
};
|
||||||
|
const size_t UPPER_CASE_RANGES_SIZE = 161;
|
||||||
|
const CaseConversionRecord UPPER_CASE_RANGES[UPPER_CASE_RANGES_SIZE] = {
|
||||||
|
{97, 122, -32},
|
||||||
|
{181, 181, 743},
|
||||||
|
{224, 246, -32},
|
||||||
|
{248, 254, -32},
|
||||||
|
{255, 255, 121},
|
||||||
|
{257, 303, EVEN_ODD_SKIP},
|
||||||
|
{305, 305, -232},
|
||||||
|
{307, 311, EVEN_ODD_SKIP},
|
||||||
|
{314, 328, ODD_EVEN_SKIP},
|
||||||
|
{331, 375, EVEN_ODD_SKIP},
|
||||||
|
{378, 382, ODD_EVEN_SKIP},
|
||||||
|
{383, 383, -300},
|
||||||
|
{384, 384, 195},
|
||||||
|
{387, 389, EVEN_ODD_SKIP},
|
||||||
|
{392, 392, ODD_EVEN},
|
||||||
|
{396, 396, ODD_EVEN},
|
||||||
|
{402, 402, ODD_EVEN},
|
||||||
|
{405, 405, 97},
|
||||||
|
{409, 409, EVEN_ODD},
|
||||||
|
{410, 410, 163},
|
||||||
|
{414, 414, 130},
|
||||||
|
{417, 421, EVEN_ODD_SKIP},
|
||||||
|
{424, 424, ODD_EVEN},
|
||||||
|
{429, 429, EVEN_ODD},
|
||||||
|
{432, 432, ODD_EVEN},
|
||||||
|
{436, 438, ODD_EVEN_SKIP},
|
||||||
|
{441, 441, EVEN_ODD},
|
||||||
|
{445, 445, EVEN_ODD},
|
||||||
|
{447, 447, 56},
|
||||||
|
{453, 453, EVEN_ODD},
|
||||||
|
{454, 454, -2},
|
||||||
|
{456, 456, ODD_EVEN},
|
||||||
|
{457, 457, -2},
|
||||||
|
{459, 459, EVEN_ODD},
|
||||||
|
{460, 460, -2},
|
||||||
|
{462, 476, ODD_EVEN_SKIP},
|
||||||
|
{477, 477, -79},
|
||||||
|
{479, 495, EVEN_ODD_SKIP},
|
||||||
|
{498, 498, ODD_EVEN},
|
||||||
|
{499, 499, -2},
|
||||||
|
{501, 501, EVEN_ODD},
|
||||||
|
{505, 543, EVEN_ODD_SKIP},
|
||||||
|
{547, 563, EVEN_ODD_SKIP},
|
||||||
|
{572, 572, ODD_EVEN},
|
||||||
|
{575, 576, 10815},
|
||||||
|
{578, 578, ODD_EVEN},
|
||||||
|
{583, 591, EVEN_ODD_SKIP},
|
||||||
|
{592, 592, 10783},
|
||||||
|
{593, 593, 10780},
|
||||||
|
{594, 594, 10782},
|
||||||
|
{595, 595, -210},
|
||||||
|
{596, 596, -206},
|
||||||
|
{598, 599, -205},
|
||||||
|
{601, 601, -202},
|
||||||
|
{603, 603, -203},
|
||||||
|
{608, 608, -205},
|
||||||
|
{611, 611, -207},
|
||||||
|
{613, 613, 42280},
|
||||||
|
{616, 616, -209},
|
||||||
|
{617, 617, -211},
|
||||||
|
{619, 619, 10743},
|
||||||
|
{623, 623, -211},
|
||||||
|
{625, 625, 10749},
|
||||||
|
{626, 626, -213},
|
||||||
|
{629, 629, -214},
|
||||||
|
{637, 637, 10727},
|
||||||
|
{640, 640, -218},
|
||||||
|
{643, 643, -218},
|
||||||
|
{648, 648, -218},
|
||||||
|
{649, 649, -69},
|
||||||
|
{650, 651, -217},
|
||||||
|
{652, 652, -71},
|
||||||
|
{658, 658, -219},
|
||||||
|
{837, 837, 84},
|
||||||
|
{881, 883, EVEN_ODD_SKIP},
|
||||||
|
{887, 887, EVEN_ODD},
|
||||||
|
{891, 893, 130},
|
||||||
|
{940, 940, -38},
|
||||||
|
{941, 943, -37},
|
||||||
|
{945, 961, -32},
|
||||||
|
{962, 962, -31},
|
||||||
|
{963, 971, -32},
|
||||||
|
{972, 972, -64},
|
||||||
|
{973, 974, -63},
|
||||||
|
{976, 976, -62},
|
||||||
|
{977, 977, -57},
|
||||||
|
{981, 981, -47},
|
||||||
|
{982, 982, -54},
|
||||||
|
{983, 983, -8},
|
||||||
|
{985, 1007, EVEN_ODD_SKIP},
|
||||||
|
{1008, 1008, -86},
|
||||||
|
{1009, 1009, -80},
|
||||||
|
{1010, 1010, 7},
|
||||||
|
{1013, 1013, -96},
|
||||||
|
{1016, 1016, ODD_EVEN},
|
||||||
|
{1019, 1019, EVEN_ODD},
|
||||||
|
{1072, 1103, -32},
|
||||||
|
{1104, 1119, -80},
|
||||||
|
{1121, 1153, EVEN_ODD_SKIP},
|
||||||
|
{1163, 1215, EVEN_ODD_SKIP},
|
||||||
|
{1218, 1230, ODD_EVEN_SKIP},
|
||||||
|
{1231, 1231, -15},
|
||||||
|
{1233, 1319, EVEN_ODD_SKIP},
|
||||||
|
{1377, 1414, -48},
|
||||||
|
{7545, 7545, 35332},
|
||||||
|
{7549, 7549, 3814},
|
||||||
|
{7681, 7829, EVEN_ODD_SKIP},
|
||||||
|
{7835, 7835, -59},
|
||||||
|
{7841, 7935, EVEN_ODD_SKIP},
|
||||||
|
{7936, 7943, 8},
|
||||||
|
{7952, 7957, 8},
|
||||||
|
{7968, 7975, 8},
|
||||||
|
{7984, 7991, 8},
|
||||||
|
{8000, 8005, 8},
|
||||||
|
{8017, 8017, 8},
|
||||||
|
{8019, 8019, 8},
|
||||||
|
{8021, 8021, 8},
|
||||||
|
{8023, 8023, 8},
|
||||||
|
{8032, 8039, 8},
|
||||||
|
{8048, 8049, 74},
|
||||||
|
{8050, 8053, 86},
|
||||||
|
{8054, 8055, 100},
|
||||||
|
{8056, 8057, 128},
|
||||||
|
{8058, 8059, 112},
|
||||||
|
{8060, 8061, 126},
|
||||||
|
{8064, 8071, 8},
|
||||||
|
{8080, 8087, 8},
|
||||||
|
{8096, 8103, 8},
|
||||||
|
{8112, 8113, 8},
|
||||||
|
{8115, 8115, 9},
|
||||||
|
{8126, 8126, -7205},
|
||||||
|
{8131, 8131, 9},
|
||||||
|
{8144, 8145, 8},
|
||||||
|
{8160, 8161, 8},
|
||||||
|
{8165, 8165, 7},
|
||||||
|
{8179, 8179, 9},
|
||||||
|
{8526, 8526, -28},
|
||||||
|
{8560, 8575, -16},
|
||||||
|
{8580, 8580, ODD_EVEN},
|
||||||
|
{9424, 9449, -26},
|
||||||
|
{11312, 11358, -48},
|
||||||
|
{11361, 11361, EVEN_ODD},
|
||||||
|
{11365, 11365, -10795},
|
||||||
|
{11366, 11366, -10792},
|
||||||
|
{11368, 11372, ODD_EVEN_SKIP},
|
||||||
|
{11379, 11379, EVEN_ODD},
|
||||||
|
{11382, 11382, ODD_EVEN},
|
||||||
|
{11393, 11491, EVEN_ODD_SKIP},
|
||||||
|
{11500, 11502, ODD_EVEN_SKIP},
|
||||||
|
{11520, 11557, -7264},
|
||||||
|
{42561, 42605, EVEN_ODD_SKIP},
|
||||||
|
{42625, 42647, EVEN_ODD_SKIP},
|
||||||
|
{42787, 42799, EVEN_ODD_SKIP},
|
||||||
|
{42803, 42863, EVEN_ODD_SKIP},
|
||||||
|
{42874, 42876, ODD_EVEN_SKIP},
|
||||||
|
{42879, 42887, EVEN_ODD_SKIP},
|
||||||
|
{42892, 42892, ODD_EVEN},
|
||||||
|
{42897, 42897, EVEN_ODD},
|
||||||
|
{42913, 42921, EVEN_ODD_SKIP},
|
||||||
|
{65345, 65370, -32},
|
||||||
|
{66600, 66639, -40}
|
||||||
|
};
|
||||||
|
const size_t TITLE_CASE_RANGES_SIZE = 161;
|
||||||
|
const CaseConversionRecord TITLE_CASE_RANGES[TITLE_CASE_RANGES_SIZE] = {
|
||||||
|
{97, 122, -32},
|
||||||
|
{181, 181, 743},
|
||||||
|
{224, 246, -32},
|
||||||
|
{248, 254, -32},
|
||||||
|
{255, 255, 121},
|
||||||
|
{257, 303, EVEN_ODD_SKIP},
|
||||||
|
{305, 305, -232},
|
||||||
|
{307, 311, EVEN_ODD_SKIP},
|
||||||
|
{314, 328, ODD_EVEN_SKIP},
|
||||||
|
{331, 375, EVEN_ODD_SKIP},
|
||||||
|
{378, 382, ODD_EVEN_SKIP},
|
||||||
|
{383, 383, -300},
|
||||||
|
{384, 384, 195},
|
||||||
|
{387, 389, EVEN_ODD_SKIP},
|
||||||
|
{392, 392, ODD_EVEN},
|
||||||
|
{396, 396, ODD_EVEN},
|
||||||
|
{402, 402, ODD_EVEN},
|
||||||
|
{405, 405, 97},
|
||||||
|
{409, 409, EVEN_ODD},
|
||||||
|
{410, 410, 163},
|
||||||
|
{414, 414, 130},
|
||||||
|
{417, 421, EVEN_ODD_SKIP},
|
||||||
|
{424, 424, ODD_EVEN},
|
||||||
|
{429, 429, EVEN_ODD},
|
||||||
|
{432, 432, ODD_EVEN},
|
||||||
|
{436, 438, ODD_EVEN_SKIP},
|
||||||
|
{441, 441, EVEN_ODD},
|
||||||
|
{445, 445, EVEN_ODD},
|
||||||
|
{447, 447, 56},
|
||||||
|
{452, 452, EVEN_ODD},
|
||||||
|
{453, 453, 0},
|
||||||
|
{454, 455, ODD_EVEN},
|
||||||
|
{456, 456, 0},
|
||||||
|
{457, 458, EVEN_ODD},
|
||||||
|
{459, 459, 0},
|
||||||
|
{460, 476, ODD_EVEN_SKIP},
|
||||||
|
{477, 477, -79},
|
||||||
|
{479, 495, EVEN_ODD_SKIP},
|
||||||
|
{497, 497, ODD_EVEN},
|
||||||
|
{498, 498, 0},
|
||||||
|
{499, 501, EVEN_ODD_SKIP},
|
||||||
|
{505, 543, EVEN_ODD_SKIP},
|
||||||
|
{547, 563, EVEN_ODD_SKIP},
|
||||||
|
{572, 572, ODD_EVEN},
|
||||||
|
{575, 576, 10815},
|
||||||
|
{578, 578, ODD_EVEN},
|
||||||
|
{583, 591, EVEN_ODD_SKIP},
|
||||||
|
{592, 592, 10783},
|
||||||
|
{593, 593, 10780},
|
||||||
|
{594, 594, 10782},
|
||||||
|
{595, 595, -210},
|
||||||
|
{596, 596, -206},
|
||||||
|
{598, 599, -205},
|
||||||
|
{601, 601, -202},
|
||||||
|
{603, 603, -203},
|
||||||
|
{608, 608, -205},
|
||||||
|
{611, 611, -207},
|
||||||
|
{613, 613, 42280},
|
||||||
|
{616, 616, -209},
|
||||||
|
{617, 617, -211},
|
||||||
|
{619, 619, 10743},
|
||||||
|
{623, 623, -211},
|
||||||
|
{625, 625, 10749},
|
||||||
|
{626, 626, -213},
|
||||||
|
{629, 629, -214},
|
||||||
|
{637, 637, 10727},
|
||||||
|
{640, 640, -218},
|
||||||
|
{643, 643, -218},
|
||||||
|
{648, 648, -218},
|
||||||
|
{649, 649, -69},
|
||||||
|
{650, 651, -217},
|
||||||
|
{652, 652, -71},
|
||||||
|
{658, 658, -219},
|
||||||
|
{837, 837, 84},
|
||||||
|
{881, 883, EVEN_ODD_SKIP},
|
||||||
|
{887, 887, EVEN_ODD},
|
||||||
|
{891, 893, 130},
|
||||||
|
{940, 940, -38},
|
||||||
|
{941, 943, -37},
|
||||||
|
{945, 961, -32},
|
||||||
|
{962, 962, -31},
|
||||||
|
{963, 971, -32},
|
||||||
|
{972, 972, -64},
|
||||||
|
{973, 974, -63},
|
||||||
|
{976, 976, -62},
|
||||||
|
{977, 977, -57},
|
||||||
|
{981, 981, -47},
|
||||||
|
{982, 982, -54},
|
||||||
|
{983, 983, -8},
|
||||||
|
{985, 1007, EVEN_ODD_SKIP},
|
||||||
|
{1008, 1008, -86},
|
||||||
|
{1009, 1009, -80},
|
||||||
|
{1010, 1010, 7},
|
||||||
|
{1013, 1013, -96},
|
||||||
|
{1016, 1016, ODD_EVEN},
|
||||||
|
{1019, 1019, EVEN_ODD},
|
||||||
|
{1072, 1103, -32},
|
||||||
|
{1104, 1119, -80},
|
||||||
|
{1121, 1153, EVEN_ODD_SKIP},
|
||||||
|
{1163, 1215, EVEN_ODD_SKIP},
|
||||||
|
{1218, 1230, ODD_EVEN_SKIP},
|
||||||
|
{1231, 1231, -15},
|
||||||
|
{1233, 1319, EVEN_ODD_SKIP},
|
||||||
|
{1377, 1414, -48},
|
||||||
|
{7545, 7545, 35332},
|
||||||
|
{7549, 7549, 3814},
|
||||||
|
{7681, 7829, EVEN_ODD_SKIP},
|
||||||
|
{7835, 7835, -59},
|
||||||
|
{7841, 7935, EVEN_ODD_SKIP},
|
||||||
|
{7936, 7943, 8},
|
||||||
|
{7952, 7957, 8},
|
||||||
|
{7968, 7975, 8},
|
||||||
|
{7984, 7991, 8},
|
||||||
|
{8000, 8005, 8},
|
||||||
|
{8017, 8017, 8},
|
||||||
|
{8019, 8019, 8},
|
||||||
|
{8021, 8021, 8},
|
||||||
|
{8023, 8023, 8},
|
||||||
|
{8032, 8039, 8},
|
||||||
|
{8048, 8049, 74},
|
||||||
|
{8050, 8053, 86},
|
||||||
|
{8054, 8055, 100},
|
||||||
|
{8056, 8057, 128},
|
||||||
|
{8058, 8059, 112},
|
||||||
|
{8060, 8061, 126},
|
||||||
|
{8064, 8071, 8},
|
||||||
|
{8080, 8087, 8},
|
||||||
|
{8096, 8103, 8},
|
||||||
|
{8112, 8113, 8},
|
||||||
|
{8115, 8115, 9},
|
||||||
|
{8126, 8126, -7205},
|
||||||
|
{8131, 8131, 9},
|
||||||
|
{8144, 8145, 8},
|
||||||
|
{8160, 8161, 8},
|
||||||
|
{8165, 8165, 7},
|
||||||
|
{8179, 8179, 9},
|
||||||
|
{8526, 8526, -28},
|
||||||
|
{8560, 8575, -16},
|
||||||
|
{8580, 8580, ODD_EVEN},
|
||||||
|
{9424, 9449, -26},
|
||||||
|
{11312, 11358, -48},
|
||||||
|
{11361, 11361, EVEN_ODD},
|
||||||
|
{11365, 11365, -10795},
|
||||||
|
{11366, 11366, -10792},
|
||||||
|
{11368, 11372, ODD_EVEN_SKIP},
|
||||||
|
{11379, 11379, EVEN_ODD},
|
||||||
|
{11382, 11382, ODD_EVEN},
|
||||||
|
{11393, 11491, EVEN_ODD_SKIP},
|
||||||
|
{11500, 11502, ODD_EVEN_SKIP},
|
||||||
|
{11520, 11557, -7264},
|
||||||
|
{42561, 42605, EVEN_ODD_SKIP},
|
||||||
|
{42625, 42647, EVEN_ODD_SKIP},
|
||||||
|
{42787, 42799, EVEN_ODD_SKIP},
|
||||||
|
{42803, 42863, EVEN_ODD_SKIP},
|
||||||
|
{42874, 42876, ODD_EVEN_SKIP},
|
||||||
|
{42879, 42887, EVEN_ODD_SKIP},
|
||||||
|
{42892, 42892, ODD_EVEN},
|
||||||
|
{42897, 42897, EVEN_ODD},
|
||||||
|
{42913, 42921, EVEN_ODD_SKIP},
|
||||||
|
{65345, 65370, -32},
|
||||||
|
{66600, 66639, -40}
|
||||||
|
};
|
||||||
|
const size_t LOWER_SPECIAL_CASING_SIZE = 103;
|
||||||
|
const SpecialCasingConversionRecord LOWER_SPECIAL_CASING[LOWER_SPECIAL_CASING_SIZE] = {
|
||||||
|
{223, "\xc3\x9f"},
|
||||||
|
{304, "\x69\xcc\x87"},
|
||||||
|
{64256, "\xef\xac\x80"},
|
||||||
|
{64257, "\xef\xac\x81"},
|
||||||
|
{64258, "\xef\xac\x82"},
|
||||||
|
{64259, "\xef\xac\x83"},
|
||||||
|
{64260, "\xef\xac\x84"},
|
||||||
|
{64261, "\xef\xac\x85"},
|
||||||
|
{64262, "\xef\xac\x86"},
|
||||||
|
{1415, "\xd6\x87"},
|
||||||
|
{64275, "\xef\xac\x93"},
|
||||||
|
{64276, "\xef\xac\x94"},
|
||||||
|
{64277, "\xef\xac\x95"},
|
||||||
|
{64278, "\xef\xac\x96"},
|
||||||
|
{64279, "\xef\xac\x97"},
|
||||||
|
{329, "\xc5\x89"},
|
||||||
|
{912, "\xce\x90"},
|
||||||
|
{944, "\xce\xb0"},
|
||||||
|
{496, "\xc7\xb0"},
|
||||||
|
{7830, "\xe1\xba\x96"},
|
||||||
|
{7831, "\xe1\xba\x97"},
|
||||||
|
{7832, "\xe1\xba\x98"},
|
||||||
|
{7833, "\xe1\xba\x99"},
|
||||||
|
{7834, "\xe1\xba\x9a"},
|
||||||
|
{8016, "\xe1\xbd\x90"},
|
||||||
|
{8018, "\xe1\xbd\x92"},
|
||||||
|
{8020, "\xe1\xbd\x94"},
|
||||||
|
{8022, "\xe1\xbd\x96"},
|
||||||
|
{8118, "\xe1\xbe\xb6"},
|
||||||
|
{8134, "\xe1\xbf\x86"},
|
||||||
|
{8146, "\xe1\xbf\x92"},
|
||||||
|
{8147, "\xe1\xbf\x93"},
|
||||||
|
{8150, "\xe1\xbf\x96"},
|
||||||
|
{8151, "\xe1\xbf\x97"},
|
||||||
|
{8162, "\xe1\xbf\xa2"},
|
||||||
|
{8163, "\xe1\xbf\xa3"},
|
||||||
|
{8164, "\xe1\xbf\xa4"},
|
||||||
|
{8166, "\xe1\xbf\xa6"},
|
||||||
|
{8167, "\xe1\xbf\xa7"},
|
||||||
|
{8182, "\xe1\xbf\xb6"},
|
||||||
|
{8064, "\xe1\xbe\x80"},
|
||||||
|
{8065, "\xe1\xbe\x81"},
|
||||||
|
{8066, "\xe1\xbe\x82"},
|
||||||
|
{8067, "\xe1\xbe\x83"},
|
||||||
|
{8068, "\xe1\xbe\x84"},
|
||||||
|
{8069, "\xe1\xbe\x85"},
|
||||||
|
{8070, "\xe1\xbe\x86"},
|
||||||
|
{8071, "\xe1\xbe\x87"},
|
||||||
|
{8072, "\xe1\xbe\x80"},
|
||||||
|
{8073, "\xe1\xbe\x81"},
|
||||||
|
{8074, "\xe1\xbe\x82"},
|
||||||
|
{8075, "\xe1\xbe\x83"},
|
||||||
|
{8076, "\xe1\xbe\x84"},
|
||||||
|
{8077, "\xe1\xbe\x85"},
|
||||||
|
{8078, "\xe1\xbe\x86"},
|
||||||
|
{8079, "\xe1\xbe\x87"},
|
||||||
|
{8080, "\xe1\xbe\x90"},
|
||||||
|
{8081, "\xe1\xbe\x91"},
|
||||||
|
{8082, "\xe1\xbe\x92"},
|
||||||
|
{8083, "\xe1\xbe\x93"},
|
||||||
|
{8084, "\xe1\xbe\x94"},
|
||||||
|
{8085, "\xe1\xbe\x95"},
|
||||||
|
{8086, "\xe1\xbe\x96"},
|
||||||
|
{8087, "\xe1\xbe\x97"},
|
||||||
|
{8088, "\xe1\xbe\x90"},
|
||||||
|
{8089, "\xe1\xbe\x91"},
|
||||||
|
{8090, "\xe1\xbe\x92"},
|
||||||
|
{8091, "\xe1\xbe\x93"},
|
||||||
|
{8092, "\xe1\xbe\x94"},
|
||||||
|
{8093, "\xe1\xbe\x95"},
|
||||||
|
{8094, "\xe1\xbe\x96"},
|
||||||
|
{8095, "\xe1\xbe\x97"},
|
||||||
|
{8096, "\xe1\xbe\xa0"},
|
||||||
|
{8097, "\xe1\xbe\xa1"},
|
||||||
|
{8098, "\xe1\xbe\xa2"},
|
||||||
|
{8099, "\xe1\xbe\xa3"},
|
||||||
|
{8100, "\xe1\xbe\xa4"},
|
||||||
|
{8101, "\xe1\xbe\xa5"},
|
||||||
|
{8102, "\xe1\xbe\xa6"},
|
||||||
|
{8103, "\xe1\xbe\xa7"},
|
||||||
|
{8104, "\xe1\xbe\xa0"},
|
||||||
|
{8105, "\xe1\xbe\xa1"},
|
||||||
|
{8106, "\xe1\xbe\xa2"},
|
||||||
|
{8107, "\xe1\xbe\xa3"},
|
||||||
|
{8108, "\xe1\xbe\xa4"},
|
||||||
|
{8109, "\xe1\xbe\xa5"},
|
||||||
|
{8110, "\xe1\xbe\xa6"},
|
||||||
|
{8111, "\xe1\xbe\xa7"},
|
||||||
|
{8115, "\xe1\xbe\xb3"},
|
||||||
|
{8124, "\xe1\xbe\xb3"},
|
||||||
|
{8131, "\xe1\xbf\x83"},
|
||||||
|
{8140, "\xe1\xbf\x83"},
|
||||||
|
{8179, "\xe1\xbf\xb3"},
|
||||||
|
{8188, "\xe1\xbf\xb3"},
|
||||||
|
{8114, "\xe1\xbe\xb2"},
|
||||||
|
{8116, "\xe1\xbe\xb4"},
|
||||||
|
{8130, "\xe1\xbf\x82"},
|
||||||
|
{8132, "\xe1\xbf\x84"},
|
||||||
|
{8178, "\xe1\xbf\xb2"},
|
||||||
|
{8180, "\xe1\xbf\xb4"},
|
||||||
|
{8119, "\xe1\xbe\xb7"},
|
||||||
|
{8135, "\xe1\xbf\x87"},
|
||||||
|
{8183, "\xe1\xbf\xb7"}
|
||||||
|
};
|
||||||
|
const size_t TITLE_SPECIAL_CASING_SIZE = 103;
|
||||||
|
const SpecialCasingConversionRecord TITLE_SPECIAL_CASING[TITLE_SPECIAL_CASING_SIZE] = {
|
||||||
|
{223, "\x53\x73"},
|
||||||
|
{304, "\xc4\xb0"},
|
||||||
|
{64256, "\x46\x66"},
|
||||||
|
{64257, "\x46\x69"},
|
||||||
|
{64258, "\x46\x6c"},
|
||||||
|
{64259, "\x46\x66\x69"},
|
||||||
|
{64260, "\x46\x66\x6c"},
|
||||||
|
{64261, "\x53\x74"},
|
||||||
|
{64262, "\x53\x74"},
|
||||||
|
{1415, "\xd4\xb5\xd6\x82"},
|
||||||
|
{64275, "\xd5\x84\xd5\xb6"},
|
||||||
|
{64276, "\xd5\x84\xd5\xa5"},
|
||||||
|
{64277, "\xd5\x84\xd5\xab"},
|
||||||
|
{64278, "\xd5\x8e\xd5\xb6"},
|
||||||
|
{64279, "\xd5\x84\xd5\xad"},
|
||||||
|
{329, "\xca\xbc\x4e"},
|
||||||
|
{912, "\xce\x99\xcc\x88\xcc\x81"},
|
||||||
|
{944, "\xce\xa5\xcc\x88\xcc\x81"},
|
||||||
|
{496, "\x4a\xcc\x8c"},
|
||||||
|
{7830, "\x48\xcc\xb1"},
|
||||||
|
{7831, "\x54\xcc\x88"},
|
||||||
|
{7832, "\x57\xcc\x8a"},
|
||||||
|
{7833, "\x59\xcc\x8a"},
|
||||||
|
{7834, "\x41\xca\xbe"},
|
||||||
|
{8016, "\xce\xa5\xcc\x93"},
|
||||||
|
{8018, "\xce\xa5\xcc\x93\xcc\x80"},
|
||||||
|
{8020, "\xce\xa5\xcc\x93\xcc\x81"},
|
||||||
|
{8022, "\xce\xa5\xcc\x93\xcd\x82"},
|
||||||
|
{8118, "\xce\x91\xcd\x82"},
|
||||||
|
{8134, "\xce\x97\xcd\x82"},
|
||||||
|
{8146, "\xce\x99\xcc\x88\xcc\x80"},
|
||||||
|
{8147, "\xce\x99\xcc\x88\xcc\x81"},
|
||||||
|
{8150, "\xce\x99\xcd\x82"},
|
||||||
|
{8151, "\xce\x99\xcc\x88\xcd\x82"},
|
||||||
|
{8162, "\xce\xa5\xcc\x88\xcc\x80"},
|
||||||
|
{8163, "\xce\xa5\xcc\x88\xcc\x81"},
|
||||||
|
{8164, "\xce\xa1\xcc\x93"},
|
||||||
|
{8166, "\xce\xa5\xcd\x82"},
|
||||||
|
{8167, "\xce\xa5\xcc\x88\xcd\x82"},
|
||||||
|
{8182, "\xce\xa9\xcd\x82"},
|
||||||
|
{8064, "\xe1\xbe\x88"},
|
||||||
|
{8065, "\xe1\xbe\x89"},
|
||||||
|
{8066, "\xe1\xbe\x8a"},
|
||||||
|
{8067, "\xe1\xbe\x8b"},
|
||||||
|
{8068, "\xe1\xbe\x8c"},
|
||||||
|
{8069, "\xe1\xbe\x8d"},
|
||||||
|
{8070, "\xe1\xbe\x8e"},
|
||||||
|
{8071, "\xe1\xbe\x8f"},
|
||||||
|
{8072, "\xe1\xbe\x88"},
|
||||||
|
{8073, "\xe1\xbe\x89"},
|
||||||
|
{8074, "\xe1\xbe\x8a"},
|
||||||
|
{8075, "\xe1\xbe\x8b"},
|
||||||
|
{8076, "\xe1\xbe\x8c"},
|
||||||
|
{8077, "\xe1\xbe\x8d"},
|
||||||
|
{8078, "\xe1\xbe\x8e"},
|
||||||
|
{8079, "\xe1\xbe\x8f"},
|
||||||
|
{8080, "\xe1\xbe\x98"},
|
||||||
|
{8081, "\xe1\xbe\x99"},
|
||||||
|
{8082, "\xe1\xbe\x9a"},
|
||||||
|
{8083, "\xe1\xbe\x9b"},
|
||||||
|
{8084, "\xe1\xbe\x9c"},
|
||||||
|
{8085, "\xe1\xbe\x9d"},
|
||||||
|
{8086, "\xe1\xbe\x9e"},
|
||||||
|
{8087, "\xe1\xbe\x9f"},
|
||||||
|
{8088, "\xe1\xbe\x98"},
|
||||||
|
{8089, "\xe1\xbe\x99"},
|
||||||
|
{8090, "\xe1\xbe\x9a"},
|
||||||
|
{8091, "\xe1\xbe\x9b"},
|
||||||
|
{8092, "\xe1\xbe\x9c"},
|
||||||
|
{8093, "\xe1\xbe\x9d"},
|
||||||
|
{8094, "\xe1\xbe\x9e"},
|
||||||
|
{8095, "\xe1\xbe\x9f"},
|
||||||
|
{8096, "\xe1\xbe\xa8"},
|
||||||
|
{8097, "\xe1\xbe\xa9"},
|
||||||
|
{8098, "\xe1\xbe\xaa"},
|
||||||
|
{8099, "\xe1\xbe\xab"},
|
||||||
|
{8100, "\xe1\xbe\xac"},
|
||||||
|
{8101, "\xe1\xbe\xad"},
|
||||||
|
{8102, "\xe1\xbe\xae"},
|
||||||
|
{8103, "\xe1\xbe\xaf"},
|
||||||
|
{8104, "\xe1\xbe\xa8"},
|
||||||
|
{8105, "\xe1\xbe\xa9"},
|
||||||
|
{8106, "\xe1\xbe\xaa"},
|
||||||
|
{8107, "\xe1\xbe\xab"},
|
||||||
|
{8108, "\xe1\xbe\xac"},
|
||||||
|
{8109, "\xe1\xbe\xad"},
|
||||||
|
{8110, "\xe1\xbe\xae"},
|
||||||
|
{8111, "\xe1\xbe\xaf"},
|
||||||
|
{8115, "\xe1\xbe\xbc"},
|
||||||
|
{8124, "\xe1\xbe\xbc"},
|
||||||
|
{8131, "\xe1\xbf\x8c"},
|
||||||
|
{8140, "\xe1\xbf\x8c"},
|
||||||
|
{8179, "\xe1\xbf\xbc"},
|
||||||
|
{8188, "\xe1\xbf\xbc"},
|
||||||
|
{8114, "\xe1\xbe\xba\xcd\x85"},
|
||||||
|
{8116, "\xce\x86\xcd\x85"},
|
||||||
|
{8130, "\xe1\xbf\x8a\xcd\x85"},
|
||||||
|
{8132, "\xce\x89\xcd\x85"},
|
||||||
|
{8178, "\xe1\xbf\xba\xcd\x85"},
|
||||||
|
{8180, "\xce\x8f\xcd\x85"},
|
||||||
|
{8119, "\xce\x91\xcd\x82\xcd\x85"},
|
||||||
|
{8135, "\xce\x97\xcd\x82\xcd\x85"},
|
||||||
|
{8183, "\xce\xa9\xcd\x82\xcd\x85"}
|
||||||
|
};
|
||||||
|
const size_t UPPER_SPECIAL_CASING_SIZE = 103;
|
||||||
|
const SpecialCasingConversionRecord UPPER_SPECIAL_CASING[UPPER_SPECIAL_CASING_SIZE] = {
|
||||||
|
{223, "\x53\x53"},
|
||||||
|
{304, "\xc4\xb0"},
|
||||||
|
{64256, "\x46\x46"},
|
||||||
|
{64257, "\x46\x49"},
|
||||||
|
{64258, "\x46\x4c"},
|
||||||
|
{64259, "\x46\x46\x49"},
|
||||||
|
{64260, "\x46\x46\x4c"},
|
||||||
|
{64261, "\x53\x54"},
|
||||||
|
{64262, "\x53\x54"},
|
||||||
|
{1415, "\xd4\xb5\xd5\x92"},
|
||||||
|
{64275, "\xd5\x84\xd5\x86"},
|
||||||
|
{64276, "\xd5\x84\xd4\xb5"},
|
||||||
|
{64277, "\xd5\x84\xd4\xbb"},
|
||||||
|
{64278, "\xd5\x8e\xd5\x86"},
|
||||||
|
{64279, "\xd5\x84\xd4\xbd"},
|
||||||
|
{329, "\xca\xbc\x4e"},
|
||||||
|
{912, "\xce\x99\xcc\x88\xcc\x81"},
|
||||||
|
{944, "\xce\xa5\xcc\x88\xcc\x81"},
|
||||||
|
{496, "\x4a\xcc\x8c"},
|
||||||
|
{7830, "\x48\xcc\xb1"},
|
||||||
|
{7831, "\x54\xcc\x88"},
|
||||||
|
{7832, "\x57\xcc\x8a"},
|
||||||
|
{7833, "\x59\xcc\x8a"},
|
||||||
|
{7834, "\x41\xca\xbe"},
|
||||||
|
{8016, "\xce\xa5\xcc\x93"},
|
||||||
|
{8018, "\xce\xa5\xcc\x93\xcc\x80"},
|
||||||
|
{8020, "\xce\xa5\xcc\x93\xcc\x81"},
|
||||||
|
{8022, "\xce\xa5\xcc\x93\xcd\x82"},
|
||||||
|
{8118, "\xce\x91\xcd\x82"},
|
||||||
|
{8134, "\xce\x97\xcd\x82"},
|
||||||
|
{8146, "\xce\x99\xcc\x88\xcc\x80"},
|
||||||
|
{8147, "\xce\x99\xcc\x88\xcc\x81"},
|
||||||
|
{8150, "\xce\x99\xcd\x82"},
|
||||||
|
{8151, "\xce\x99\xcc\x88\xcd\x82"},
|
||||||
|
{8162, "\xce\xa5\xcc\x88\xcc\x80"},
|
||||||
|
{8163, "\xce\xa5\xcc\x88\xcc\x81"},
|
||||||
|
{8164, "\xce\xa1\xcc\x93"},
|
||||||
|
{8166, "\xce\xa5\xcd\x82"},
|
||||||
|
{8167, "\xce\xa5\xcc\x88\xcd\x82"},
|
||||||
|
{8182, "\xce\xa9\xcd\x82"},
|
||||||
|
{8064, "\xe1\xbc\x88\xce\x99"},
|
||||||
|
{8065, "\xe1\xbc\x89\xce\x99"},
|
||||||
|
{8066, "\xe1\xbc\x8a\xce\x99"},
|
||||||
|
{8067, "\xe1\xbc\x8b\xce\x99"},
|
||||||
|
{8068, "\xe1\xbc\x8c\xce\x99"},
|
||||||
|
{8069, "\xe1\xbc\x8d\xce\x99"},
|
||||||
|
{8070, "\xe1\xbc\x8e\xce\x99"},
|
||||||
|
{8071, "\xe1\xbc\x8f\xce\x99"},
|
||||||
|
{8072, "\xe1\xbc\x88\xce\x99"},
|
||||||
|
{8073, "\xe1\xbc\x89\xce\x99"},
|
||||||
|
{8074, "\xe1\xbc\x8a\xce\x99"},
|
||||||
|
{8075, "\xe1\xbc\x8b\xce\x99"},
|
||||||
|
{8076, "\xe1\xbc\x8c\xce\x99"},
|
||||||
|
{8077, "\xe1\xbc\x8d\xce\x99"},
|
||||||
|
{8078, "\xe1\xbc\x8e\xce\x99"},
|
||||||
|
{8079, "\xe1\xbc\x8f\xce\x99"},
|
||||||
|
{8080, "\xe1\xbc\xa8\xce\x99"},
|
||||||
|
{8081, "\xe1\xbc\xa9\xce\x99"},
|
||||||
|
{8082, "\xe1\xbc\xaa\xce\x99"},
|
||||||
|
{8083, "\xe1\xbc\xab\xce\x99"},
|
||||||
|
{8084, "\xe1\xbc\xac\xce\x99"},
|
||||||
|
{8085, "\xe1\xbc\xad\xce\x99"},
|
||||||
|
{8086, "\xe1\xbc\xae\xce\x99"},
|
||||||
|
{8087, "\xe1\xbc\xaf\xce\x99"},
|
||||||
|
{8088, "\xe1\xbc\xa8\xce\x99"},
|
||||||
|
{8089, "\xe1\xbc\xa9\xce\x99"},
|
||||||
|
{8090, "\xe1\xbc\xaa\xce\x99"},
|
||||||
|
{8091, "\xe1\xbc\xab\xce\x99"},
|
||||||
|
{8092, "\xe1\xbc\xac\xce\x99"},
|
||||||
|
{8093, "\xe1\xbc\xad\xce\x99"},
|
||||||
|
{8094, "\xe1\xbc\xae\xce\x99"},
|
||||||
|
{8095, "\xe1\xbc\xaf\xce\x99"},
|
||||||
|
{8096, "\xe1\xbd\xa8\xce\x99"},
|
||||||
|
{8097, "\xe1\xbd\xa9\xce\x99"},
|
||||||
|
{8098, "\xe1\xbd\xaa\xce\x99"},
|
||||||
|
{8099, "\xe1\xbd\xab\xce\x99"},
|
||||||
|
{8100, "\xe1\xbd\xac\xce\x99"},
|
||||||
|
{8101, "\xe1\xbd\xad\xce\x99"},
|
||||||
|
{8102, "\xe1\xbd\xae\xce\x99"},
|
||||||
|
{8103, "\xe1\xbd\xaf\xce\x99"},
|
||||||
|
{8104, "\xe1\xbd\xa8\xce\x99"},
|
||||||
|
{8105, "\xe1\xbd\xa9\xce\x99"},
|
||||||
|
{8106, "\xe1\xbd\xaa\xce\x99"},
|
||||||
|
{8107, "\xe1\xbd\xab\xce\x99"},
|
||||||
|
{8108, "\xe1\xbd\xac\xce\x99"},
|
||||||
|
{8109, "\xe1\xbd\xad\xce\x99"},
|
||||||
|
{8110, "\xe1\xbd\xae\xce\x99"},
|
||||||
|
{8111, "\xe1\xbd\xaf\xce\x99"},
|
||||||
|
{8115, "\xce\x91\xce\x99"},
|
||||||
|
{8124, "\xce\x91\xce\x99"},
|
||||||
|
{8131, "\xce\x97\xce\x99"},
|
||||||
|
{8140, "\xce\x97\xce\x99"},
|
||||||
|
{8179, "\xce\xa9\xce\x99"},
|
||||||
|
{8188, "\xce\xa9\xce\x99"},
|
||||||
|
{8114, "\xe1\xbe\xba\xce\x99"},
|
||||||
|
{8116, "\xce\x86\xce\x99"},
|
||||||
|
{8130, "\xe1\xbf\x8a\xce\x99"},
|
||||||
|
{8132, "\xce\x89\xce\x99"},
|
||||||
|
{8178, "\xe1\xbf\xba\xce\x99"},
|
||||||
|
{8180, "\xce\x8f\xce\x99"},
|
||||||
|
{8119, "\xce\x91\xcd\x82\xce\x99"},
|
||||||
|
{8135, "\xce\x97\xcd\x82\xce\x99"},
|
||||||
|
{8183, "\xce\xa9\xcd\x82\xce\x99"}
|
||||||
|
};
|
42
utf8case/case_tables.hpp
Normal file
42
utf8case/case_tables.hpp
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
#ifndef CASE_TABLES_HDR
|
||||||
|
#define CASE_TABLES_HDR
|
||||||
|
|
||||||
|
#include <boost/cstdint.hpp>
|
||||||
|
|
||||||
|
// Symbolic values that may appear in CaseConversionRecord::delta instead of
// a plain numeric offset:
//   EVEN_ODD       - an even code point maps to the next one, an odd code
//                    point maps to the previous one;
//   ODD_EVEN       - an odd code point maps to the next one, an even code
//                    point maps to the previous one;
//   *_SKIP         - same pairing, but only every other code point of the
//                    range (those with the parity of the range start) is
//                    converted.
// EVEN_ODD and ODD_EVEN share their values with the literal offsets +1/-1.
enum {
    EVEN_ODD = 1,
    ODD_EVEN = -1,
    EVEN_ODD_SKIP = 1 << 30,
    ODD_EVEN_SKIP = (1 << 30) + 1
};
|
||||||
|
|
||||||
|
// One row of a range-based case conversion table: every code point in the
// inclusive range [lo_code_point, hi_code_point] is converted according to
// `delta`. Field order matters: tables are brace-initialised positionally.
struct CaseConversionRecord {
    uint32_t lo_code_point;  // first code point covered by the record
    uint32_t hi_code_point;  // last code point covered by the record (inclusive)
    int32_t  delta;          // numeric offset, or one of the EVEN_ODD/ODD_EVEN markers
};
|
||||||
|
|
||||||
|
// One row of a special-casing table: a single code point together with the
// NUL-terminated byte string that replaces it. Field order matters: tables
// are brace-initialised positionally.
struct SpecialCasingConversionRecord {
    uint32_t    code_point;   // code point being replaced
    const char* replacement;  // replacement bytes (UTF-8 in the generated tables)
};
|
||||||
|
|
||||||
|
extern const size_t LOWER_CASE_RANGES_SIZE;
|
||||||
|
extern const CaseConversionRecord LOWER_CASE_RANGES[];
|
||||||
|
|
||||||
|
extern const size_t UPPER_CASE_RANGES_SIZE;
|
||||||
|
extern const CaseConversionRecord UPPER_CASE_RANGES[];
|
||||||
|
|
||||||
|
extern const size_t TITLE_CASE_RANGES_SIZE;
|
||||||
|
extern const CaseConversionRecord TITLE_CASE_RANGES[];
|
||||||
|
|
||||||
|
extern const size_t LOWER_SPECIAL_CASING_SIZE;
|
||||||
|
extern const SpecialCasingConversionRecord LOWER_SPECIAL_CASING[];
|
||||||
|
|
||||||
|
extern const size_t TITLE_SPECIAL_CASING_SIZE;
|
||||||
|
extern const SpecialCasingConversionRecord TITLE_SPECIAL_CASING[];
|
||||||
|
|
||||||
|
extern const size_t UPPER_SPECIAL_CASING_SIZE;
|
||||||
|
extern const SpecialCasingConversionRecord UPPER_SPECIAL_CASING[];
|
||||||
|
|
||||||
|
#endif
|
17
utf8case/contextual_case_converter.hpp
Normal file
17
utf8case/contextual_case_converter.hpp
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
#ifndef CONTEXTUAL_CASE_CONVERTER_HDR
|
||||||
|
#define CONTEXTUAL_CASE_CONVERTER_HDR
|
||||||
|
|
||||||
|
#include <boost/cstdint.hpp>
|
||||||
|
|
||||||
|
// Interface for case conversions that depend on the neighbouring code
// points.
class ContextualCaseConverter {
public:
    virtual ~ContextualCaseConverter() {}

    // Converts `code_point` given its neighbours. Callers in this code base
    // treat a null return value as "no contextual rule applies" and a
    // non-null value as a NUL-terminated replacement string.
    virtual const char* convert(uint32_t prev_code_point,
                                uint32_t code_point,
                                uint32_t next_code_point) = 0;
};
|
||||||
|
|
||||||
|
#endif
|
138
utf8case/general_case_converter.hpp
Normal file
138
utf8case/general_case_converter.hpp
Normal file
@ -0,0 +1,138 @@
|
|||||||
|
#ifndef GENERAL_CASE_CONVERTER_HDR
|
||||||
|
#define GENERAL_CASE_CONVERTER_HDR
|
||||||
|
|
||||||
|
#include <boost/shared_ptr.hpp>
|
||||||
|
|
||||||
|
#include "range_based_case_converter.hpp"
|
||||||
|
#include "special_casing_converter.hpp"
|
||||||
|
#include "contextual_case_converter.hpp"
|
||||||
|
|
||||||
|
#include "utf8/utf8.h"
|
||||||
|
|
||||||
|
template<typename octet_iterator, typename output_iterator>
|
||||||
|
class GeneralCaseConverter {
|
||||||
|
|
||||||
|
public:
|
||||||
|
GeneralCaseConverter(
|
||||||
|
boost::shared_ptr<RangeBasedCaseConverter> rangeBasedCaseConverter,
|
||||||
|
boost::shared_ptr<SpecialCasingConverter> specialCasingConverter,
|
||||||
|
boost::shared_ptr<ContextualCaseConverter> contextualCaseConverter)
|
||||||
|
:rangeBasedCaseConverter_(rangeBasedCaseConverter),
|
||||||
|
specialCasingConverter_(specialCasingConverter),
|
||||||
|
contextualCaseConverter_(contextualCaseConverter) {
|
||||||
|
}
|
||||||
|
|
||||||
|
bool willBeTouchedWhenConverted(octet_iterator start, octet_iterator end) const {
|
||||||
|
while (start != end) {
|
||||||
|
uint32_t code_point = utf8::unchecked::next(start);
|
||||||
|
|
||||||
|
if (specialCasingConverter_->convert(code_point)
|
||||||
|
|| rangeBasedCaseConverter_->convert(code_point) != code_point)
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool willBeTouchedWhenHeadConverted(octet_iterator start, octet_iterator end) const {
|
||||||
|
if (start == end)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
octet_iterator prev_start = start;
|
||||||
|
utf8::unchecked::next(start);
|
||||||
|
return willBeTouchedWhenConverted(prev_start, start);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool willBeTouchedWhenTailConverted(octet_iterator start, octet_iterator end) const {
|
||||||
|
if (start == end)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
utf8::unchecked::next(start);
|
||||||
|
return willBeTouchedWhenConverted(start, end);
|
||||||
|
}
|
||||||
|
|
||||||
|
void convert(octet_iterator start, octet_iterator end, output_iterator out) const {
|
||||||
|
uint32_t prev_prev_code_point = SPECIAL_CODE_POINT;
|
||||||
|
uint32_t prev_code_point = SPECIAL_CODE_POINT;
|
||||||
|
|
||||||
|
while (start != end) {
|
||||||
|
uint32_t code_point = utf8::unchecked::next(start);
|
||||||
|
|
||||||
|
if (prev_code_point != SPECIAL_CODE_POINT)
|
||||||
|
convertSingleCodePoint(
|
||||||
|
prev_prev_code_point,
|
||||||
|
prev_code_point,
|
||||||
|
code_point,
|
||||||
|
out);
|
||||||
|
|
||||||
|
prev_prev_code_point = prev_code_point;
|
||||||
|
prev_code_point = code_point;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (prev_code_point != SPECIAL_CODE_POINT)
|
||||||
|
convertSingleCodePoint(
|
||||||
|
prev_prev_code_point,
|
||||||
|
prev_code_point,
|
||||||
|
SPECIAL_CODE_POINT,
|
||||||
|
out);
|
||||||
|
}
|
||||||
|
|
||||||
|
void convertSingleCodePoint(
|
||||||
|
uint32_t prev_code_point,
|
||||||
|
uint32_t current_code_point,
|
||||||
|
uint32_t next_code_point,
|
||||||
|
output_iterator out) const {
|
||||||
|
|
||||||
|
if (const char* contextual = contextualCaseConverter_->convert(
|
||||||
|
prev_code_point,
|
||||||
|
current_code_point,
|
||||||
|
next_code_point)) {
|
||||||
|
copyCharArrayToOutputIterator_(contextual, out);
|
||||||
|
} else if (const char* special = specialCasingConverter_->convert(current_code_point)) {
|
||||||
|
copyCharArrayToOutputIterator_(special, out);
|
||||||
|
} else {
|
||||||
|
uint32_t converted_code_point = rangeBasedCaseConverter_->convert(current_code_point);
|
||||||
|
utf8::unchecked::append(converted_code_point, out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void headConvert(octet_iterator start, octet_iterator end, output_iterator out) const {
|
||||||
|
bool first = true;
|
||||||
|
|
||||||
|
while (start != end) {
|
||||||
|
if (first) {
|
||||||
|
octet_iterator prev_start = start;
|
||||||
|
utf8::unchecked::next(start);
|
||||||
|
convert(prev_start, start, out);
|
||||||
|
first = false;
|
||||||
|
} else {
|
||||||
|
*out++ = *start++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void tailConvert(octet_iterator start, octet_iterator end, output_iterator out) const {
|
||||||
|
if (start != end) {
|
||||||
|
uint32_t code_point = utf8::unchecked::next(start);
|
||||||
|
|
||||||
|
utf8::unchecked::append(code_point, out);
|
||||||
|
|
||||||
|
convert(start, end, out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private:
|
||||||
|
void copyCharArrayToOutputIterator_(const char* charVector, output_iterator out) const {
|
||||||
|
while (*charVector)
|
||||||
|
*out++ = *charVector++;
|
||||||
|
}
|
||||||
|
|
||||||
|
boost::shared_ptr<RangeBasedCaseConverter> rangeBasedCaseConverter_;
|
||||||
|
boost::shared_ptr<SpecialCasingConverter> specialCasingConverter_;
|
||||||
|
boost::shared_ptr<ContextualCaseConverter> contextualCaseConverter_;
|
||||||
|
|
||||||
|
const static uint32_t SPECIAL_CODE_POINT = 0xFFFFFFFF;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
251
utf8case/generate_case_tables.pl
Executable file
251
utf8case/generate_case_tables.pl
Executable file
@ -0,0 +1,251 @@
|
|||||||
|
#!/usr/bin/perl
|
||||||
|
|
||||||
|
# Based on ideas from re2 library.
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
use LWP::Simple;
|
||||||
|
use String::Util qw(hascontent);
|
||||||
|
use Data::Dumper;
|
||||||
|
use Clone qw(clone);
|
||||||
|
|
||||||
|
# Base URL of the Unicode data files used as input.
my $UNIDATA_PREFIX= q{http://unicode.org/Public/UNIDATA/};
# Name of the generated C++ source file (created in the current directory).
my $OUTPUT_CPP_FILE = 'case_tables.cpp';

# Range tables, filled from UnicodeData.txt.
my @lower_case_ranges;
my @upper_case_ranges;
my @title_case_ranges;

# Special-casing tables, filled from SpecialCasing.txt.
my @lower_special_casing;
my @upper_special_casing;
my @title_special_casing;

# Fail loudly instead of silently generating nothing when the output file
# cannot be created.
open my $output_cpp_fh, '>', $OUTPUT_CPP_FILE
    or die "Cannot open $OUTPUT_CPP_FILE for writing: $!\n";

generate_intro();
generate_standard_case_tables();
generate_special_casing_tables();

# Closing explicitly lets write errors (e.g. a full disk) be reported.
close $output_cpp_fh
    or die "Cannot close $OUTPUT_CPP_FILE: $!\n";
|
||||||
|
|
||||||
|
# Writes the fixed header of the generated C++ file: the "generated" notice
# and the include of case_tables.hpp, each followed by a blank line.
sub generate_intro {
    my @intro_lines = (
        '// GENERATED AUTOMATICALLY BY generate_case_tables.pl; DO NOT EDIT.',
        '',
        '#include "case_tables.hpp"',
        '',
    );

    print {$output_cpp_fh} join('', map { "$_\n" } @intro_lines);
}
|
||||||
|
|
||||||
|
# Downloads UnicodeData.txt, builds the lower/upper/title range tables from
# it and writes them to the generated C++ file.
sub generate_standard_case_tables {
    my @unicode_data_lines = download_unidata_file('UnicodeData.txt');

    # Field 0 is the code point; fields 12, 13 and 14 feed the upper, lower
    # and title tables respectively.
    for my $line (@unicode_data_lines) {
        append_to_case_ranges(\@upper_case_ranges, $line->[0], $line->[12]);
        append_to_case_ranges(\@lower_case_ranges, $line->[0], $line->[13]);
        append_to_case_ranges(\@title_case_ranges, $line->[0], $line->[14]);
    }

    @lower_case_ranges = compactify(\@lower_case_ranges);
    @upper_case_ranges = compactify(\@upper_case_ranges);
    @title_case_ranges = compactify(\@title_case_ranges);

    # The blank separator lines belong in the generated file; they used to be
    # printed to STDOUT by mistake.
    write_case_table('lower_case_ranges', \@lower_case_ranges);
    print {$output_cpp_fh} "\n";

    write_case_table('upper_case_ranges', \@upper_case_ranges);
    print {$output_cpp_fh} "\n";

    write_case_table('title_case_ranges', \@title_case_ranges);
    print {$output_cpp_fh} "\n";
}
|
||||||
|
|
||||||
|
# Downloads SpecialCasing.txt, builds the lower/title/upper special-casing
# tables from it and writes them to the generated C++ file.
sub generate_special_casing_tables {
    my @special_casing_lines = download_unidata_file('SpecialCasing.txt');

    for my $line (@special_casing_lines) {
        # Entries with content in field 4 are only reported, not stored.
        if (hascontent($line->[4])) {
            print STDERR "This cannot be handled: ", join('; ', @{$line}),"\n";
        } else {
            append_to_special_casing_table(\@lower_special_casing, $line->[0], $line->[1]);
            append_to_special_casing_table(\@title_special_casing, $line->[0], $line->[2]);
            append_to_special_casing_table(\@upper_special_casing, $line->[0], $line->[3]);
        }
    }

    # The blank separator lines belong in the generated file; they used to be
    # printed to STDOUT by mistake.
    write_special_casing_table('lower_special_casing', \@lower_special_casing);
    print {$output_cpp_fh} "\n";

    write_special_casing_table('title_special_casing', \@title_special_casing);
    print {$output_cpp_fh} "\n";

    write_special_casing_table('upper_special_casing', \@upper_special_casing);
}
|
||||||
|
|
||||||
|
# Downloads $file_name from $UNIDATA_PREFIX and returns its records: one
# array reference of ';'-separated fields per line that still has content
# after '#' comments are stripped. Dies if the download fails.
sub download_unidata_file {
    my ($file_name) = @_;

    my $url = $UNIDATA_PREFIX . $file_name;

    print STDERR "Downloading ${url}...\n";

    my $contents = get($url);

    # get() returns undef on failure; without this check the script would
    # quietly generate empty tables.
    defined $contents
        or die "Failed to download ${url}\n";

    my @records;

    for my $line (split /\r?\n/, $contents) {
        $line =~ s/\#.*\Z//;
        next if $line !~ /\S/;

        push @records, [ split /\s*;\s*/, $line ];
    }

    return @records;
}
|
||||||
|
|
||||||
|
# Appends a single-code-point range [code point, code point, delta] to the
# table referenced by $case_ranges_ref. Nothing is appended when the mapped
# code point field has no content.
sub append_to_case_ranges {
    my ($case_ranges_ref, $hex_code_point, $hex_modified_code_point) = @_;

    return if !hascontent($hex_modified_code_point);

    my $source = hex($hex_code_point);
    my $target = hex($hex_modified_code_point);

    my @single_point_range = ($source, $source, delta($source, $target));

    push @{$case_ranges_ref}, \@single_point_range;
}
|
||||||
|
|
||||||
|
# Merges neighbouring ranges of the table referenced by $case_ranges_ref and
# returns the compacted list:
#   - a range starting right after the current one with the same delta
#     extends it by one code point;
#   - a range starting two code points after the current one, whose delta is
#     the EVEN_ODD/ODD_EVEN base of the current delta, extends it by two and
#     turns the delta into its *_SKIP variant.
# An empty input yields an empty list.
sub compactify {
    my ($case_ranges_ref) = @_;

    my @new_table;

    my $current_compact_range;

    for my $range (@{$case_ranges_ref}) {
        if (!defined($current_compact_range)) {
            $current_compact_range = clone($range);
        } elsif ($range->[2] eq $current_compact_range->[2]
                 && $range->[0] == $current_compact_range->[1] + 1) {
            ++$current_compact_range->[1];
        } elsif ($range->[2] eq de_skip($current_compact_range->[2])
                 && $range->[0] == $current_compact_range->[1] + 2) {
            $current_compact_range->[1] += 2;
            $current_compact_range->[2] = add_skip($current_compact_range->[2]);
        } else {
            push @new_table, $current_compact_range;
            $current_compact_range = clone($range);
        }
    }

    # Flush the range still being built. With an empty input there is none,
    # and pushing undef would produce a bogus "{, , }" record.
    if (defined($current_compact_range)) {
        push @new_table, $current_compact_range;
    }

    return @new_table;
}
|
||||||
|
|
||||||
|
# Writes a range table to the generated C++ file: a size constant named
# <NAME>_SIZE followed by the CaseConversionRecord array <NAME>, where <NAME>
# is $name in upper case.
sub write_case_table {
    my ($name, $case_ranges_ref) = @_;

    my $table_name = uc($name);
    my $size_constant_name = $table_name . "_SIZE";
    my $table_size = scalar @{$case_ranges_ref};

    print {$output_cpp_fh}
        "const size_t $size_constant_name = $table_size;\n"
        . 'const CaseConversionRecord ' . $table_name
        . '[' . $size_constant_name . "] = {\n";

    # One "{from, to, delta}" row per range, separated by ",\n".
    my @rows = map {
        my ($from, $to, $delta) = @{$_}[0, 1, 2];
        " {$from, $to, $delta}";
    } @{$case_ranges_ref};

    print {$output_cpp_fh} join(",\n", @rows) . "\n};\n";
}
|
||||||
|
|
||||||
|
# Appends [code point, C++-escaped replacement] to the table referenced by
# $special_casing_table_ref. $hex_code_point_vector is a whitespace-separated
# list of hexadecimal code points; nothing is appended when it has no content.
sub append_to_special_casing_table {
    my ($special_casing_table_ref, $hex_code_point, $hex_code_point_vector) = @_;

    if (!hascontent($hex_code_point_vector)) {
        return;
    }

    my $code_point = hex($hex_code_point);
    my @code_point_vector = map { hex($_) } split/\s+/, $hex_code_point_vector;

    # Dereference explicitly: push on a scalar reference was an experimental
    # feature and is a fatal error in Perl 5.24 and later.
    push @{$special_casing_table_ref}, [$code_point, cpp_encode(@code_point_vector)];
}
|
||||||
|
|
||||||
|
# Writes a special-casing table to the generated C++ file: a size constant
# named <NAME>_SIZE followed by the SpecialCasingConversionRecord array
# <NAME>, where <NAME> is $name in upper case.
sub write_special_casing_table {
    my ($name, $special_casing_table_ref) = @_;

    my $table_name = uc($name);
    my $size_constant_name = $table_name . "_SIZE";
    my $table_size = scalar @{$special_casing_table_ref};

    print {$output_cpp_fh}
        "const size_t $size_constant_name = $table_size;\n"
        . 'const SpecialCasingConversionRecord ' . $table_name
        . '[' . $size_constant_name . "] = {\n";

    # One {code point, "replacement"} row per entry, separated by ",\n".
    my @rows = map {
        my ($code_point, $replacement) = @{$_}[0, 1];
        " {$code_point, \"$replacement\"}";
    } @{$special_casing_table_ref};

    print {$output_cpp_fh} join(",\n", @rows) . "\n};\n";
}
|
||||||
|
|
||||||
|
# Turns a list of code points into the text of a C++ string literal body:
# the characters are concatenated and every octet of the result is written
# as a "\xHH" escape.
sub cpp_encode {
    my @code_points = @_;

    my $text = '';
    $text .= chr($_) for @code_points;

    my @hex_octets = unpack("U0(H2)*", $text);

    return join('', map { '\\x' . $_ } @hex_octets);
}
|
||||||
|
|
||||||
|
# Returns the base marker (EVEN_ODD or ODD_EVEN) of a delta that is one of
# the markers or their _SKIP variants; any other delta yields the
# placeholder 'CANNOT_BE_SKIPPED'.
sub de_skip {
    my ($delta) = @_;

    return $delta =~ /^(EVEN_ODD|ODD_EVEN)(?:_SKIP)?$/
        ? $1
        : 'CANNOT_BE_SKIPPED';
}
|
||||||
|
|
||||||
|
# Returns the _SKIP variant of a delta marker: the result of de_skip() with
# '_SKIP' appended.
sub add_skip {
    my ($delta) = @_;

    my $base_marker = de_skip($delta);

    return $base_marker . '_SKIP';
}
|
||||||
|
|
||||||
|
|
||||||
|
# Describes how to get from code point $from to code point $to.
# Neighbouring code points are reported symbolically, named after the parity
# of the lower code point of the pair: 'EVEN_ODD' when it is even, 'ODD_EVEN'
# when it is odd. Any other pair yields the numeric difference.
sub delta {
    my ($from, $to) = @_;

    my $difference = $to - $from;

    return $difference if abs($difference) != 1;

    my $lower_code_point = $difference == 1 ? $from : $to;

    return $lower_code_point % 2 == 0 ? 'EVEN_ODD' : 'ODD_EVEN';
}
|
63
utf8case/range_based_case_converter.cpp
Normal file
63
utf8case/range_based_case_converter.cpp
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
#include "range_based_case_converter.hpp"
|
||||||
|
|
||||||
|
uint32_t RangeBasedCaseConverter::convert(uint32_t code_point) const {
|
||||||
|
|
||||||
|
const CaseConversionRecord* conversionRecord = findRecord_(code_point);
|
||||||
|
|
||||||
|
return
|
||||||
|
conversionRecord == 0
|
||||||
|
? code_point
|
||||||
|
: applyRecord_(conversionRecord, code_point);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scans the table in order and returns the record whose inclusive range
// contains `code_point`, or 0 if there is none. The scan stops at the first
// record starting above `code_point`, so the table is expected to be sorted
// by lo_code_point.
const CaseConversionRecord* RangeBasedCaseConverter::findRecord_(uint32_t code_point) const {
    const CaseConversionRecord* const tableEnd = conversionTable_ + tableSize_;

    for (const CaseConversionRecord* record = conversionTable_;
         record != tableEnd;
         ++record) {

        if (code_point < record->lo_code_point)
            break;

        if (code_point <= record->hi_code_point)
            return record;
    }

    return 0;
}
|
||||||
|
|
||||||
|
uint32_t RangeBasedCaseConverter::applyRecord_(
|
||||||
|
const CaseConversionRecord* conversionRecord, uint32_t code_point) const {
|
||||||
|
|
||||||
|
if (shouldBeSkipped_(conversionRecord, code_point))
|
||||||
|
return code_point;
|
||||||
|
|
||||||
|
return applyDelta_(conversionRecord->delta, code_point);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool RangeBasedCaseConverter::shouldBeSkipped_(
|
||||||
|
const CaseConversionRecord* conversionRecord, uint32_t code_point) const {
|
||||||
|
|
||||||
|
return
|
||||||
|
isSkipRecord_(conversionRecord)
|
||||||
|
&& code_point % 2 != conversionRecord->lo_code_point % 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool RangeBasedCaseConverter::isSkipRecord_(const CaseConversionRecord* conversionRecord) const {
|
||||||
|
return
|
||||||
|
conversionRecord->delta == EVEN_ODD_SKIP
|
||||||
|
|| conversionRecord->delta == ODD_EVEN_SKIP;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t RangeBasedCaseConverter::applyDelta_(int32_t delta, uint32_t code_point) const {
|
||||||
|
switch (delta) {
|
||||||
|
case EVEN_ODD:
|
||||||
|
case EVEN_ODD_SKIP:
|
||||||
|
return code_point % 2 == 0 ? code_point+1 : code_point-1;
|
||||||
|
case ODD_EVEN:
|
||||||
|
case ODD_EVEN_SKIP:
|
||||||
|
return code_point % 2 == 1 ? code_point+1 : code_point-1;
|
||||||
|
default:
|
||||||
|
return code_point + delta;
|
||||||
|
}
|
||||||
|
}
|
27
utf8case/range_based_case_converter.hpp
Normal file
27
utf8case/range_based_case_converter.hpp
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
#ifndef RANGE_BASED_CASE_CONVERTED_HDR
|
||||||
|
#define RANGE_BASED_CASE_CONVERTED_HDR
|
||||||
|
|
||||||
|
#include "case_tables.hpp"
|
||||||
|
|
||||||
|
class RangeBasedCaseConverter {
|
||||||
|
|
||||||
|
public:
|
||||||
|
RangeBasedCaseConverter(size_t tableSize, const CaseConversionRecord* conversionTable)
|
||||||
|
:tableSize_(tableSize), conversionTable_(conversionTable) {
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t convert(uint32_t code_point) const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
const CaseConversionRecord* findRecord_(uint32_t code_point) const;
|
||||||
|
uint32_t applyRecord_(const CaseConversionRecord* conversionRecord, uint32_t code_point) const;
|
||||||
|
bool shouldBeSkipped_(
|
||||||
|
const CaseConversionRecord* conversionRecord, uint32_t code_point) const;
|
||||||
|
bool isSkipRecord_(const CaseConversionRecord* conversionRecord) const;
|
||||||
|
uint32_t applyDelta_(int32_t delta, uint32_t code_point) const;
|
||||||
|
|
||||||
|
size_t tableSize_;
|
||||||
|
const CaseConversionRecord* conversionTable_;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
11
utf8case/regular_contextual_case_converter.cpp
Normal file
11
utf8case/regular_contextual_case_converter.cpp
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
#include "regular_contextual_case_converter.hpp"
|
||||||
|
|
||||||
|
// Out-of-line (empty) destructor; the class holds nothing to release.
RegularContextualCaseConverter::~RegularContextualCaseConverter() {}
|
||||||
|
|
||||||
|
/**
 * Contextual conversion that ignores all three code points.
 *
 * @return always 0 (a null pointer) - presumably "no contextual replacement";
 *         confirm against the callers of ContextualCaseConverter::convert.
 */
const char* RegularContextualCaseConverter::convert(uint32_t /*prev_code_point*/,
                                                    uint32_t /*code_point*/,
                                                    uint32_t /*next_code_point*/) {
    return 0;
}
|
16
utf8case/regular_contextual_case_converter.hpp
Normal file
16
utf8case/regular_contextual_case_converter.hpp
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
#ifndef REGULAR_CONTEXTUAL_CASE_CONVERTER_HDR
|
||||||
|
#define REGULAR_CONTEXTUAL_CASE_CONVERTER_HDR
|
||||||
|
|
||||||
|
#include "contextual_case_converter.hpp"
|
||||||
|
|
||||||
|
/**
 * ContextualCaseConverter implementation whose convert() always returns a
 * null pointer (see the implementation file).
 */
class RegularContextualCaseConverter : public ContextualCaseConverter {
public:
    virtual ~RegularContextualCaseConverter();

    /** Returns 0 for every combination of arguments. */
    virtual const char* convert(uint32_t prev_code_point,
                                uint32_t code_point,
                                uint32_t next_code_point);
};
|
||||||
|
|
||||||
|
#endif
|
56
utf8case/simple_convert.cpp
Normal file
56
utf8case/simple_convert.cpp
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
#include "simple_convert.hpp"
|
||||||
|
|
||||||
|
std::string simpleConvert(
|
||||||
|
const StringGeneralCaseConverter& converter,
|
||||||
|
const std::string& s) {
|
||||||
|
|
||||||
|
std::string result;
|
||||||
|
|
||||||
|
converter.convert(s.begin(), s.end(), std::back_inserter(result));
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string simpleHeadConvert(
|
||||||
|
const StringGeneralCaseConverter& converter,
|
||||||
|
const std::string& s) {
|
||||||
|
|
||||||
|
std::string result;
|
||||||
|
|
||||||
|
converter.headConvert(s.begin(), s.end(), std::back_inserter(result));
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string simpleTailConvert(
|
||||||
|
const StringGeneralCaseConverter& converter,
|
||||||
|
const std::string& s) {
|
||||||
|
|
||||||
|
std::string result;
|
||||||
|
|
||||||
|
converter.tailConvert(s.begin(), s.end(), std::back_inserter(result));
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool simpleWillBeTouchedWhenConverted(
|
||||||
|
const StringGeneralCaseConverter& converter,
|
||||||
|
const std::string& s) {
|
||||||
|
|
||||||
|
return converter.willBeTouchedWhenConverted(s.begin(), s.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
bool simpleWillBeTouchedWhenHeadConverted(
|
||||||
|
const StringGeneralCaseConverter& converter,
|
||||||
|
const std::string& s) {
|
||||||
|
|
||||||
|
return converter.willBeTouchedWhenHeadConverted(s.begin(), s.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
bool simpleWillBeTouchedWhenTailConverted(
|
||||||
|
const StringGeneralCaseConverter& converter,
|
||||||
|
const std::string& s) {
|
||||||
|
|
||||||
|
return converter.willBeTouchedWhenTailConverted(s.begin(), s.end());
|
||||||
|
}
|
34
utf8case/simple_convert.hpp
Normal file
34
utf8case/simple_convert.hpp
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
#ifndef SIMPLE_CONVERT_HDR
|
||||||
|
#define SIMPLE_CONVERT_HDR
|
||||||
|
|
||||||
|
#include "general_case_converter.hpp"
|
||||||
|
|
||||||
|
typedef GeneralCaseConverter<std::string::const_iterator,
|
||||||
|
std::back_insert_iterator<std::string> > StringGeneralCaseConverter;
|
||||||
|
|
||||||
|
std::string simpleConvert(
|
||||||
|
const StringGeneralCaseConverter& converter,
|
||||||
|
const std::string& s);
|
||||||
|
|
||||||
|
std::string simpleHeadConvert(
|
||||||
|
const StringGeneralCaseConverter& converter,
|
||||||
|
const std::string& s);
|
||||||
|
|
||||||
|
std::string simpleTailConvert(
|
||||||
|
const StringGeneralCaseConverter& converter,
|
||||||
|
const std::string& s);
|
||||||
|
|
||||||
|
|
||||||
|
bool simpleWillBeTouchedWhenConverted(
|
||||||
|
const StringGeneralCaseConverter& converter,
|
||||||
|
const std::string& s);
|
||||||
|
|
||||||
|
bool simpleWillBeTouchedWhenHeadConverted(
|
||||||
|
const StringGeneralCaseConverter& converter,
|
||||||
|
const std::string& s);
|
||||||
|
|
||||||
|
bool simpleWillBeTouchedWhenTailConverted(
|
||||||
|
const StringGeneralCaseConverter& converter,
|
||||||
|
const std::string& s);
|
||||||
|
|
||||||
|
#endif
|
16
utf8case/special_casing_converter.cpp
Normal file
16
utf8case/special_casing_converter.cpp
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
#include "special_casing_converter.hpp"
|
||||||
|
|
||||||
|
const char* SpecialCasingConverter::convert(uint32_t code_point) const {
|
||||||
|
|
||||||
|
for (size_t i = 0; i < tableSize_; ++i) {
|
||||||
|
const SpecialCasingConversionRecord* currentRecord = &conversionTable_[i];
|
||||||
|
|
||||||
|
if (code_point < currentRecord->code_point)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (code_point == currentRecord->code_point)
|
||||||
|
return currentRecord->replacement;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
21
utf8case/special_casing_converter.hpp
Normal file
21
utf8case/special_casing_converter.hpp
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
#ifndef SPECIAL_CASING_CONVERTER_HDR
|
||||||
|
#define SPECIAL_CASING_CONVERTER_HDR
|
||||||
|
|
||||||
|
#include "case_tables.hpp"
|
||||||
|
|
||||||
|
class SpecialCasingConverter {
|
||||||
|
|
||||||
|
public:
|
||||||
|
SpecialCasingConverter(size_t tableSize, const SpecialCasingConversionRecord* conversionTable)
|
||||||
|
:tableSize_(tableSize), conversionTable_(conversionTable) {
|
||||||
|
}
|
||||||
|
|
||||||
|
const char* convert(uint32_t code_point) const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
|
||||||
|
size_t tableSize_;
|
||||||
|
const SpecialCasingConversionRecord* conversionTable_;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
11
utf8case/string_case_converter_manager.cpp
Normal file
11
utf8case/string_case_converter_manager.cpp
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
#include "string_case_converter_manager.hpp"
|
||||||
|
|
||||||
|
/**
 * Returns the single shared StringCaseConverterManager.
 *
 * The object is a function-local static, so it is constructed on the first
 * call and the same instance is returned on every call.
 */
StringCaseConverterManager& StringCaseConverterManager::getInstance() {
    static StringCaseConverterManager theManager;
    return theManager;
}
|
||||||
|
|
||||||
|
|
||||||
|
// Nothing to initialise here beyond the base class; the constructor is
// declared private in the class, so instances come from getInstance().
StringCaseConverterManager::StringCaseConverterManager() {}
|
16
utf8case/string_case_converter_manager.hpp
Normal file
16
utf8case/string_case_converter_manager.hpp
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
#ifndef STRING_CASE_CONVERTER_MANAGER_HDR
|
||||||
|
#define STRING_CASE_CONVERTER_MANAGER_HDR
|
||||||
|
|
||||||
|
#include "case_converter_factory.hpp"
|
||||||
|
|
||||||
|
/**
 * Singleton CaseConverterFactory for converters that read from a std::string
 * and append their output to a std::string.
 */
class StringCaseConverterManager
    : public CaseConverterFactory<std::string::const_iterator,
                                  std::back_insert_iterator<std::string> > {
public:
    /** Returns the shared instance. */
    static StringCaseConverterManager& getInstance();

private:
    // Private: the only way to obtain an instance is getInstance().
    StringCaseConverterManager();
};
|
||||||
|
|
||||||
|
#endif
|
7
utf8case/t/CMakeLists.txt
Normal file
7
utf8case/t/CMakeLists.txt
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
# Library target bundling the utf8case unit-test sources.
add_library(utf8case-tests
    range_based_case_converter_tests.cpp
    simple_convert_tests.cpp
    special_casing_converter_tests.cpp)

# The tests call into the utf8case library, so link against it.
target_link_libraries(utf8case-tests utf8case)
|
43
utf8case/t/range_based_case_converter_tests.cpp
Normal file
43
utf8case/t/range_based_case_converter_tests.cpp
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
#include "tests/tests.hpp"
|
||||||
|
|
||||||
|
#include "utf8case/range_based_case_converter.hpp"
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_SUITE( utf8case )
|
||||||
|
|
||||||
|
void lower_single_letter_checker(uint32_t lower_code_point, uint32_t upper_code_point) {
|
||||||
|
RangeBasedCaseConverter converter(LOWER_CASE_RANGES_SIZE,
|
||||||
|
LOWER_CASE_RANGES);
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(converter.convert(upper_code_point), lower_code_point);
|
||||||
|
BOOST_CHECK_EQUAL(converter.convert(lower_code_point), lower_code_point);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Lower-casing through the range table: a non-letter stays untouched and
// several Latin and Cyrillic upper/lower pairs convert as expected.
BOOST_AUTO_TEST_CASE( range_based_case_converter ) {
    RangeBasedCaseConverter converter(LOWER_CASE_RANGES_SIZE, LOWER_CASE_RANGES);

    // ',' has no case and has to come back unchanged.
    const uint32_t COMMA_CODE_POINT = 0x002CU;
    BOOST_CHECK_EQUAL(converter.convert(COMMA_CODE_POINT), COMMA_CODE_POINT);

    // 'F' / 'f'
    const uint32_t UPPER_F_CODE_POINT = 0x0046U;
    const uint32_t LOWER_F_CODE_POINT = 0x0066U;
    lower_single_letter_checker(LOWER_F_CODE_POINT, UPPER_F_CODE_POINT);

    // 'A' / 'a'
    const uint32_t UPPER_A_CODE_POINT = 0x0041U;
    const uint32_t LOWER_A_CODE_POINT = 0x0061U;
    lower_single_letter_checker(LOWER_A_CODE_POINT, UPPER_A_CODE_POINT);

    // 'Z' / 'z'
    const uint32_t UPPER_Z_CODE_POINT = 0x005AU;
    const uint32_t LOWER_Z_CODE_POINT = 0x007AU;
    lower_single_letter_checker(LOWER_Z_CODE_POINT, UPPER_Z_CODE_POINT);

    // E with ogonek (U+0118 / U+0119)
    const uint32_t UPPER_E_OGONEK_CODE_POINT = 0x0118U;
    const uint32_t LOWER_E_OGONEK_CODE_POINT = 0x0119U;
    lower_single_letter_checker(LOWER_E_OGONEK_CODE_POINT, UPPER_E_OGONEK_CODE_POINT);

    // Cyrillic shcha (U+0429 / U+0449)
    const uint32_t UPPER_SHCHA_CODE_POINT = 0x0429U;
    const uint32_t LOWER_SHCHA_CODE_POINT = 0x0449U;
    lower_single_letter_checker(LOWER_SHCHA_CODE_POINT, UPPER_SHCHA_CODE_POINT);
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_SUITE_END()
|
176
utf8case/t/simple_convert_tests.cpp
Normal file
176
utf8case/t/simple_convert_tests.cpp
Normal file
@ -0,0 +1,176 @@
|
|||||||
|
#include "tests/tests.hpp"
|
||||||
|
|
||||||
|
#include "utf8case/simple_convert.hpp"
|
||||||
|
#include "utf8case/case_converter_factory.hpp"
|
||||||
|
#include "utf8case/string_case_converter_manager.hpp"
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_SUITE( utf8case )
|
||||||
|
|
||||||
|
// Lower-casing with the converter obtained for the "pl" language code.
BOOST_AUTO_TEST_CASE( simple_convert_lower ) {
    boost::shared_ptr<StringGeneralCaseConverter> lowerConverter(
        StringCaseConverterManager::getInstance().getLowerCaseConverter("pl"));

    // Whole-string conversion: ASCII, Polish diacritics, mixed case, Cyrillic.
    BOOST_CHECK_EQUAL(simpleConvert(*lowerConverter, "KOMPUTER"), std::string("komputer"));
    BOOST_CHECK_EQUAL(simpleConvert(*lowerConverter, "ŹDŹBŁO"), std::string("źdźbło"));
    BOOST_CHECK_EQUAL(simpleConvert(*lowerConverter, "Zażółć gęślą JAŹŃ"),
                      std::string("zażółć gęślą jaźń"));
    BOOST_CHECK_EQUAL(simpleConvert(*lowerConverter, "I"), std::string("i"));
    BOOST_CHECK_EQUAL(simpleConvert(*lowerConverter, "БУКВЫ"), std::string("буквы"));

    // Head conversion is expected to change only the first character,
    // tail conversion everything but the first character.
    BOOST_CHECK_EQUAL(simpleHeadConvert(*lowerConverter, "ŹDŹBŁO"), std::string("źDŹBŁO"));
    BOOST_CHECK_EQUAL(simpleTailConvert(*lowerConverter, "ŹDŹBŁO"), std::string("Źdźbło"));

    // Single (multi-byte) character input.
    BOOST_CHECK_EQUAL(simpleConvert(*lowerConverter, "Ś"), std::string("ś"));
    BOOST_CHECK_EQUAL(simpleHeadConvert(*lowerConverter, "Ś"), std::string("ś"));
    BOOST_CHECK_EQUAL(simpleTailConvert(*lowerConverter, "Ś"), std::string("Ś"));

    // Empty input stays empty in all three modes.
    BOOST_CHECK_EQUAL(simpleConvert(*lowerConverter, ""), std::string(""));
    BOOST_CHECK_EQUAL(simpleHeadConvert(*lowerConverter, ""), std::string(""));
    BOOST_CHECK_EQUAL(simpleTailConvert(*lowerConverter, ""), std::string(""));
}
|
||||||
|
|
||||||
|
|
||||||
|
// willBeTouchedWhen* predicates, checked with the upper-case converter
// obtained for the "pl" language code.
BOOST_AUTO_TEST_CASE( will_be_touched ) {
    boost::shared_ptr<StringGeneralCaseConverter> upperConverter(
        StringCaseConverterManager::getInstance().getUpperCaseConverter("pl"));

    // Only the last letter is lower-case: full and tail conversion change it.
    BOOST_CHECK(simpleWillBeTouchedWhenConverted(*upperConverter, "KOMPUTEr"));
    BOOST_CHECK(simpleWillBeTouchedWhenTailConverted(*upperConverter, "KOMPUTEr"));
    BOOST_CHECK(!simpleWillBeTouchedWhenHeadConverted(*upperConverter, "KOMPUTEr"));

    // Already upper-case: nothing to change.
    BOOST_CHECK(!simpleWillBeTouchedWhenConverted(*upperConverter, "KOMPUTER"));
    BOOST_CHECK(!simpleWillBeTouchedWhenTailConverted(*upperConverter, "KOMPUTER"));
    BOOST_CHECK(!simpleWillBeTouchedWhenHeadConverted(*upperConverter, "KOMPUTER"));

    // Only the first (multi-byte) letter is lower-case.
    BOOST_CHECK(simpleWillBeTouchedWhenConverted(*upperConverter, "śNIEG"));
    BOOST_CHECK(!simpleWillBeTouchedWhenTailConverted(*upperConverter, "śNIEG"));
    BOOST_CHECK(simpleWillBeTouchedWhenHeadConverted(*upperConverter, "śNIEG"));

    // Single lower-case letter: there is a head but no tail to change.
    BOOST_CHECK(simpleWillBeTouchedWhenConverted(*upperConverter, "ź"));
    BOOST_CHECK(!simpleWillBeTouchedWhenTailConverted(*upperConverter, "ź"));
    BOOST_CHECK(simpleWillBeTouchedWhenHeadConverted(*upperConverter, "ź"));

    // Empty input is never touched.
    BOOST_CHECK(!simpleWillBeTouchedWhenConverted(*upperConverter, ""));
    BOOST_CHECK(!simpleWillBeTouchedWhenTailConverted(*upperConverter, ""));
    BOOST_CHECK(!simpleWillBeTouchedWhenHeadConverted(*upperConverter, ""));
}
|
||||||
|
|
||||||
|
// Upper-casing with the converter obtained for the "pl" language code,
// including inputs whose upper-case form is longer than one character.
BOOST_AUTO_TEST_CASE( simple_convert_upper ) {
    boost::shared_ptr<StringGeneralCaseConverter> upperConverter(
        StringCaseConverterManager::getInstance().getUpperCaseConverter("pl"));

    BOOST_CHECK_EQUAL(simpleConvert(*upperConverter, "komputer"), std::string("KOMPUTER"));
    BOOST_CHECK_EQUAL(simpleConvert(*upperConverter, "źdźbło"), std::string("ŹDŹBŁO"));

    // Sharp s expands to "SS", the ffi ligature to "FFI".
    BOOST_CHECK_EQUAL(simpleConvert(*upperConverter, "daß"), std::string("DASS"));
    BOOST_CHECK_EQUAL(simpleConvert(*upperConverter, "ffi"), std::string("FFI"));
}
|
||||||
|
|
||||||
|
|
||||||
|
// Title-casing with the converter obtained for the "pl" language code.
// simpleConvert applies the conversion to every character of the input.
BOOST_AUTO_TEST_CASE( simple_convert_title ) {
    boost::shared_ptr<StringGeneralCaseConverter> titleConverter(
        StringCaseConverterManager::getInstance().getTitleCaseConverter("pl"));

    BOOST_CHECK_EQUAL(simpleConvert(*titleConverter, "źdźbło"), std::string("ŹDŹBŁO"));

    // Sharp s is expected to become "Ss", the ffi ligature "Ffi".
    BOOST_CHECK_EQUAL(simpleConvert(*titleConverter, "daß"), std::string("DASs"));
    BOOST_CHECK_EQUAL(simpleConvert(*titleConverter, "ffi"), std::string("Ffi"));
}
|
||||||
|
|
||||||
|
// Lower-casing of I and dotted capital I: converter for "pl" versus the one
// for "tr".
BOOST_AUTO_TEST_CASE( simple_turkish_lower ) {
    boost::shared_ptr<StringGeneralCaseConverter> standardLowerConverter(
        StringCaseConverterManager::getInstance().getLowerCaseConverter("pl"));

    boost::shared_ptr<StringGeneralCaseConverter> turkishLowerConverter(
        StringCaseConverterManager::getInstance().getLowerCaseConverter("tr"));

    // Capital I inside a word: "i" for "pl", dotless "ı" for "tr".
    BOOST_CHECK_EQUAL(simpleConvert(*standardLowerConverter, "YAZICI"), std::string("yazici"));
    BOOST_CHECK_EQUAL(simpleConvert(*turkishLowerConverter, "YAZICI"), std::string("yazıcı"));

    // Capital I on its own.
    BOOST_CHECK_EQUAL(simpleConvert(*standardLowerConverter, "I"), std::string("i"));
    BOOST_CHECK_EQUAL(simpleConvert(*turkishLowerConverter, "I"), std::string("ı"));

    // Capital I with dot above: "i" followed by a combining dot above for
    // "pl", plain "i" for "tr".
    BOOST_CHECK_EQUAL(simpleConvert(*standardLowerConverter, "İ"), std::string("i̇"));
    BOOST_CHECK_EQUAL(simpleConvert(*turkishLowerConverter, "İ"), std::string("i"));
}
|
||||||
|
|
||||||
|
// Upper-casing of i and dotless i: converter for "pl" versus the one for "tr".
BOOST_AUTO_TEST_CASE( simple_turkish_upper ) {
    boost::shared_ptr<StringGeneralCaseConverter> standardUpperConverter(
        StringCaseConverterManager::getInstance().getUpperCaseConverter("pl"));

    boost::shared_ptr<StringGeneralCaseConverter> turkishUpperConverter(
        StringCaseConverterManager::getInstance().getUpperCaseConverter("tr"));

    // Small i inside a word: "I" for "pl", dotted "İ" for "tr".
    BOOST_CHECK_EQUAL(simpleConvert(*standardUpperConverter, "yazici"), std::string("YAZICI"));
    BOOST_CHECK_EQUAL(simpleConvert(*turkishUpperConverter, "yazici"), std::string("YAZİCİ"));

    // Small i on its own.
    BOOST_CHECK_EQUAL(simpleConvert(*standardUpperConverter, "i"), std::string("I"));
    BOOST_CHECK_EQUAL(simpleConvert(*turkishUpperConverter, "i"), std::string("İ"));

    // Dotless small i becomes plain "I" with both converters.
    BOOST_CHECK_EQUAL(simpleConvert(*standardUpperConverter, "ı"), std::string("I"));
    BOOST_CHECK_EQUAL(simpleConvert(*turkishUpperConverter, "ı"), std::string("I"));
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_SUITE_END()
|
32
utf8case/t/special_casing_converter_tests.cpp
Normal file
32
utf8case/t/special_casing_converter_tests.cpp
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
#include "tests/tests.hpp"
|
||||||
|
|
||||||
|
#include "utf8case/special_casing_converter.hpp"
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_SUITE( utf8case )
|
||||||
|
|
||||||
|
// Lookups in the upper-case special-casing table: code points without a
// special-casing record yield a null pointer, sharp s yields "SS".
BOOST_AUTO_TEST_CASE( special_casing_converter ) {
    SpecialCasingConverter converter(UPPER_SPECIAL_CASING_SIZE, UPPER_SPECIAL_CASING);

    // ','
    const uint32_t COMMA_CODE_POINT = 0x002CU;
    BOOST_CHECK_EQUAL(converter.convert(COMMA_CODE_POINT), (const char*)0);

    // 'F' / 'f'
    const uint32_t UPPER_F_CODE_POINT = 0x0046U;
    const uint32_t LOWER_F_CODE_POINT = 0x0066U;
    BOOST_CHECK_EQUAL(converter.convert(UPPER_F_CODE_POINT), (const char*)0);
    BOOST_CHECK_EQUAL(converter.convert(LOWER_F_CODE_POINT), (const char*)0);

    // Cyrillic shcha (U+0429 / U+0449)
    const uint32_t UPPER_SHCHA_CODE_POINT = 0x0429U;
    const uint32_t LOWER_SHCHA_CODE_POINT = 0x0449U;
    BOOST_CHECK_EQUAL(converter.convert(UPPER_SHCHA_CODE_POINT), (const char*)0);
    BOOST_CHECK_EQUAL(converter.convert(LOWER_SHCHA_CODE_POINT), (const char*)0);

    // Sharp s (U+00DF)
    const uint32_t ESZET_CODE_POINT = 0x00DFU;
    BOOST_CHECK_EQUAL(converter.convert(ESZET_CODE_POINT), "SS");
}
|
||||||
|
|
||||||
|
// Compares two identical string literals with BOOST_CHECK_EQUAL; no
// converter is involved in this case.
BOOST_AUTO_TEST_CASE( special_casing_converter2 ) {
    BOOST_CHECK_EQUAL("SS", "SS");
}
|
||||||
|
BOOST_AUTO_TEST_SUITE_END()
|
@ -0,0 +1,29 @@
|
|||||||
|
#include "turkish_and_azeri_lower_contextual_case_converter.hpp"
|
||||||
|
|
||||||
|
// Out-of-line (empty) destructor; the class holds nothing to release.
TurkishAndAzeriLowerContextualCaseConverter::~TurkishAndAzeriLowerContextualCaseConverter() {}
|
||||||
|
|
||||||
|
const char* TurkishAndAzeriLowerContextualCaseConverter::convert(
|
||||||
|
uint32_t prev_code_point,
|
||||||
|
uint32_t code_point,
|
||||||
|
uint32_t next_code_point) {
|
||||||
|
|
||||||
|
if (code_point == LATIN_CAPITAL_LETTER_I && next_code_point != DOT_ABOVE)
|
||||||
|
return "ı";
|
||||||
|
|
||||||
|
if (code_point == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
|
||||||
|
return "i";
|
||||||
|
|
||||||
|
if (code_point == DOT_ABOVE && prev_code_point == LATIN_CAPITAL_LETTER_I)
|
||||||
|
return "";
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Code points referenced by convert().

// U+0049
const uint32_t TurkishAndAzeriLowerContextualCaseConverter::LATIN_CAPITAL_LETTER_I = 0x0049;

// U+0130
const uint32_t TurkishAndAzeriLowerContextualCaseConverter::LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE = 0x0130;

// U+0307
const uint32_t TurkishAndAzeriLowerContextualCaseConverter::DOT_ABOVE = 0x0307;
|
@ -0,0 +1,21 @@
|
|||||||
|
#ifndef TURKISH_AND_AZERI_LOWER_CONTEXTUAL_CASE_CONVERTER_HDR
|
||||||
|
#define TURKISH_AND_AZERI_LOWER_CONTEXTUAL_CASE_CONVERTER_HDR
|
||||||
|
|
||||||
|
#include "contextual_case_converter.hpp"
|
||||||
|
|
||||||
|
/**
 * ContextualCaseConverter handling capital I, capital I with dot above and
 * the combining dot above (see the three code-point constants below).
 */
class TurkishAndAzeriLowerContextualCaseConverter : public ContextualCaseConverter {
public:
    virtual ~TurkishAndAzeriLowerContextualCaseConverter();

    /**
     * Returns the replacement string for `code_point` given its neighbours.
     */
    virtual const char* convert(uint32_t prev_code_point,
                                uint32_t code_point,
                                uint32_t next_code_point);

private:
    // Code points used by convert(); defined in the implementation file.
    const static uint32_t LATIN_CAPITAL_LETTER_I;
    const static uint32_t LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
    const static uint32_t DOT_ABOVE;
};
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
@ -0,0 +1,17 @@
|
|||||||
|
#include "turkish_and_azeri_upper_contextual_case_converter.hpp"
|
||||||
|
|
||||||
|
// Out-of-line (empty) destructor; the class holds nothing to release.
TurkishAndAzeriUpperContextualCaseConverter::~TurkishAndAzeriUpperContextualCaseConverter() {}
|
||||||
|
|
||||||
|
const char* TurkishAndAzeriUpperContextualCaseConverter::convert(
|
||||||
|
uint32_t /*prev_code_point*/,
|
||||||
|
uint32_t code_point,
|
||||||
|
uint32_t /*next_code_point*/) {
|
||||||
|
|
||||||
|
if (code_point == LATIN_SMALL_LETTER_I)
|
||||||
|
return "İ";
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// U+0069, the code point convert() compares against.
const uint32_t TurkishAndAzeriUpperContextualCaseConverter::LATIN_SMALL_LETTER_I = 0x0069;
|
@ -0,0 +1,20 @@
|
|||||||
|
#ifndef TURKISH_AND_AZERI_UPPER_CONTEXTUAL_CASE_CONVERTER_HDR
|
||||||
|
#define TURKISH_AND_AZERI_UPPER_CONTEXTUAL_CASE_CONVERTER_HDR
|
||||||
|
|
||||||
|
#include "contextual_case_converter.hpp"
|
||||||
|
|
||||||
|
/**
 * ContextualCaseConverter handling the small letter i (see the code-point
 * constant below).
 */
class TurkishAndAzeriUpperContextualCaseConverter : public ContextualCaseConverter {
public:
    virtual ~TurkishAndAzeriUpperContextualCaseConverter();

    /**
     * Returns the replacement string for `code_point` given its neighbours.
     */
    virtual const char* convert(uint32_t prev_code_point,
                                uint32_t code_point,
                                uint32_t next_code_point);

private:
    // Code point used by convert(); defined in the implementation file.
    const static uint32_t LATIN_SMALL_LETTER_I;
};
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
Loading…
Reference in New Issue
Block a user