Pracownia_programowania/venv/Lib/site-packages/Cython/Compiler/StringEncoding.py

#
#   Cython -- encoding related tools
#

from __future__ import absolute_import

import re
import sys

# Resolve the text/bytes type aliases once, based on the major version of the
# interpreter that is executing this module.
IS_PYTHON3 = sys.version_info[0] >= 3

if IS_PYTHON3:
    _unicode = _str = str
    _bytes = bytes
    _unichr = chr
else:
    _unicode = unicode
    _str = _bytes = str
    _unichr = unichr

# Shared empty instances of both string types.
empty_unicode = _unicode()
empty_bytes = _bytes()

# Bound join() of the empty byte string: concatenates an iterable of
# byte strings.
join_bytes = empty_bytes.join


class UnicodeLiteralBuilder(object):
    """Collect text fragments and join them into one unicode literal.
    """
    def __init__(self):
        # fragments in insertion order; they are joined by getstring()
        self.chars = []

    def append(self, characters):
        """Add a text fragment; byte strings are decoded as ASCII first."""
        if isinstance(characters, _bytes):
            # byte strings reach us from Py2 string literals in the parser code
            characters = characters.decode("ASCII")
        assert isinstance(characters, _unicode), str(type(characters))
        self.chars.append(characters)

    if sys.maxunicode == 65535:
        def append_charval(self, char_number):
            """Add the character with code point `char_number`."""
            if char_number <= 65535:
                self.chars.append(_unichr(char_number))
                return
            # wide Unicode character on a narrow platform => store it as
            # the equivalent surrogate pair
            high, low = divmod(char_number - 0x10000, 1024)
            self.chars.append(_unichr(high + 0xD800))
            self.chars.append(_unichr(low + 0xDC00))
    else:
        def append_charval(self, char_number):
            """Add the character with code point `char_number`."""
            self.chars.append(_unichr(char_number))

    def append_uescape(self, char_number, escape_string):
        """Add a unicode escape; only its code point is used here."""
        self.append_charval(char_number)

    def getstring(self):
        """Return everything collected so far as an EncodedString."""
        joined = u''.join(self.chars)
        return EncodedString(joined)

    def getstrings(self):
        """Return a 2-tuple whose first item is None and second is getstring()."""
        return (None, self.getstring())


class BytesLiteralBuilder(object):
    """Collect fragments and join them into a byte string or char value.
    """
    def __init__(self, target_encoding):
        self.target_encoding = target_encoding
        # byte string fragments in insertion order
        self.chars = []

    def append(self, characters):
        """Add a fragment; unicode text is encoded to the target encoding."""
        chunk = characters
        if isinstance(chunk, _unicode):
            chunk = chunk.encode(self.target_encoding)
        assert isinstance(chunk, _bytes), str(type(chunk))
        self.chars.append(chunk)

    def append_charval(self, char_number):
        """Add the character `char_number`, encoded as ISO-8859-1."""
        encoded = _unichr(char_number).encode('ISO-8859-1')
        self.chars.append(encoded)

    def append_uescape(self, char_number, escape_string):
        """Add a unicode escape verbatim; the code point itself is ignored."""
        self.append(escape_string)

    def getstring(self):
        """Return the collected fragments as one BytesLiteral."""
        # callers rely on getting a byte string back here
        joined = join_bytes(self.chars)
        return bytes_literal(joined, self.target_encoding)

    def getchar(self):
        """Return the collected content; same result as getstring()."""
        # callers rely on getting a byte string back here
        return self.getstring()

    def getstrings(self):
        """Return a 2-tuple whose first item is getstring() and second is None."""
        return (self.getstring(), None)


class StrLiteralBuilder(object):
    """Build the bytes and the unicode form of a string side by side.
    """
    def __init__(self, target_encoding):
        self._bytes = BytesLiteralBuilder(target_encoding)
        self._unicode = UnicodeLiteralBuilder()

    def append(self, characters):
        """Add the same fragment to both builders (bytes first)."""
        for builder in (self._bytes, self._unicode):
            builder.append(characters)

    def append_charval(self, char_number):
        """Add the same character value to both builders (bytes first)."""
        for builder in (self._bytes, self._unicode):
            builder.append_charval(char_number)

    def append_uescape(self, char_number, escape_string):
        """Add a unicode escape: the bytes form keeps the escape text,
        the unicode form receives the character itself.
        """
        self._bytes.append(escape_string)
        self._unicode.append_charval(char_number)

    def getstrings(self):
        """Return the (bytes, unicode) pair of assembled strings."""
        bytes_value = self._bytes.getstring()
        unicode_value = self._unicode.getstring()
        return (bytes_value, unicode_value)


class EncodedString(_unicode):
    # Unicode string subclass that remembers the encoding of its origin.
    # 'encoding' stays None for unicode strings; otherwise it holds the
    # source encoding.
    encoding = None

    def __deepcopy__(self, memo):
        # deep copies hand back the very same object
        return self

    def byteencode(self):
        """Encode the string with its recorded source encoding."""
        source_encoding = self.encoding
        assert source_encoding is not None
        return self.encode(source_encoding)

    def utf8encode(self):
        """Encode a unicode string (one without source encoding) as UTF-8."""
        assert self.encoding is None
        return self.encode("UTF-8")

    @property
    def is_unicode(self):
        """True when no source encoding is recorded."""
        return self.encoding is None

    def contains_surrogates(self):
        """Delegate to string_contains_surrogates() for this string."""
        return string_contains_surrogates(self)

    def as_utf8_string(self):
        """Return the UTF-8 encoded content wrapped by bytes_literal()."""
        utf8_data = self.utf8encode()
        return bytes_literal(utf8_data, 'utf8')


def string_contains_surrogates(ustring):
    """
    Tell whether the unicode string holds characters that need (or are)
    surrogates: either a code point above 0xFFFF, which can only show up
    on wide (UCS-4) platforms and would be spelled as two code units on a
    narrow (UTF-16) platform, or a code point from the surrogate range
    0xD800-0xDFFF.
    """
    return any(
        code_point > 0xFFFF or 0xD800 <= code_point <= 0xDFFF
        for code_point in map(ord, ustring))


class BytesLiteral(_bytes):
    # Byte string subclass that offers the same interface as EncodedString.
    encoding = None

    # a byte string is never a unicode string
    is_unicode = False

    def __deepcopy__(self, memo):
        # deep copies hand back the very same object
        return self

    def byteencode(self):
        """Return the content as a plain bytes object."""
        if not IS_PYTHON3:
            # fake-recode the string to make it a plain bytes object
            return self.decode('ISO-8859-1').encode('ISO-8859-1')
        return _bytes(self)

    def utf8encode(self):
        """Always fails the assertion: byte strings cannot be UTF-8 encoded."""
        assert False, "this is not a unicode string: %r" % self

    def __str__(self):
        """Fake-decode the byte string to unicode (as ISO-8859-1) so that
        it can take part in % formatting of unicode strings.
        """
        return self.decode('ISO-8859-1')

    def as_c_string_literal(self):
        """Return the content escaped, split and wrapped in double quotes."""
        escaped = escape_byte_string(self)
        return '"%s"' % split_string_literal(escaped)


def bytes_literal(s, encoding):
    """Wrap the byte string `s` in a BytesLiteral tagged with `encoding`."""
    assert isinstance(s, bytes)
    literal = BytesLiteral(s)
    literal.encoding = encoding
    return literal


def encoded_string(s, encoding):
    """Wrap `s` in an EncodedString, recording `encoding` unless it is None.

    NOTE(review): bytes input is accepted by the assertion and handed
    straight to the EncodedString constructor without explicit decoding;
    confirm that callers passing bytes get the text they expect.
    """
    assert isinstance(s, (_unicode, bytes))
    wrapped = EncodedString(s)
    if encoding is None:
        # keep the class-level default
        return wrapped
    wrapped.encoding = encoding
    return wrapped


# Lookup function (a bound dict.get) that maps a backslash escape as spelled
# in source text, e.g. r'\n', to the character it denotes.  Sequences that
# are not in the table yield None (or the supplied default).
char_from_escape_sequence = dict(zip(
    (r'\a', r'\b', r'\f', r'\n', r'\r', r'\t', r'\v'),
    u'\a\b\f\n\r\t\v',
)).get

# Substrings that get replaced by escape sequences: the backslash, the
# '??' sequence (presumably to keep C trigraphs from forming -- confirm),
# the double quote and every character with a code below 32.
_c_special = ('\\', '??', '"') + tuple([chr(code) for code in range(32)])


def _to_escape_sequence(s):
    """Return the escape sequence text that stands for the special string `s`.

    Newline, carriage return and tab use their short backslash form, the
    double quote and the backslash are backslash-escaped, and everything
    else is spelled as one three-digit octal escape per character.
    """
    if s in '\n\r\t':
        # repr() already spells these as '\n', '\r', '\t'; strip its quotes
        return repr(s)[1:-1]
    if s == '"':
        return r'\"'
    if s == '\\':
        return r'\\'
    # within a character sequence, oct passes much better than hex
    return ''.join('\\%03o' % ord(char) for char in s)


def _build_specials_replacer():
    """Create the function that escapes every entry of `_c_special`.

    One regular expression with an alternative per special substring is
    compiled over bytes; the returned function substitutes each match in
    a byte string with its precomputed escape sequence.
    """
    replacements = {}
    alternatives = []
    for special in _c_special:
        # one single-character class per character, e.g. '??' => '[?][?]'
        char_classes = ['[%s]' % char.replace('\\', '\\\\') for char in special]
        alternatives.append(''.join(char_classes))
        escaped = _to_escape_sequence(special)
        replacements[special.encode('ASCII')] = escaped.encode('ASCII')

    pattern = '(%s)' % '|'.join(alternatives)
    substitute = re.compile(pattern.encode('ASCII')).sub

    def lookup_replacement(match):
        return replacements[match.group(1)]

    def replace(s):
        return substitute(lookup_replacement, s)

    return replace

_replace_specials = _build_specials_replacer()


def escape_char(c):
    """Return the escaped spelling of the single character `c`.

    Under Python 3 the argument is first decoded from ISO-8859-1.
    Newline, carriage return, tab and backslash get their short backslash
    form, the single quote is backslash-escaped, codes below 32 or above
    127 become a two-digit hex escape, and anything else is returned as is.
    """
    if IS_PYTHON3:
        c = c.decode('ISO-8859-1')
    if c in '\n\r\t\\':
        # repr() already produces the backslash form; strip its quotes
        return repr(c)[1:-1]
    if c == "'":
        return "\\'"
    code = ord(c)
    if 32 <= code <= 127:
        return c
    # hex works well for characters
    return "\\x%02X" % code

def escape_byte_string(s):
    """Escape a byte string so that it can be written into C code.

    The result is a Unicode string, not bytes: encoding it as ISO-8859-1
    gives the byte sequence that has to be written out.  Special
    substrings are replaced first; any remaining byte of 128 or above is
    then turned into an octal escape.
    """
    s = _replace_specials(s)
    try:
        # trial decoding: if only plain ASCII is left, we are done
        return s.decode("ASCII")
    except UnicodeDecodeError:
        pass

    if not IS_PYTHON3:
        # Py2: iterating a byte string yields one-character strings
        parts = []
        for char in s:
            code = ord(char)
            parts.append(('\\%3o' % code) if code >= 128 else char)
        return join_bytes(parts).decode('ISO-8859-1')

    # Py3: iterating a byte string yields integers
    escaped = bytearray()
    for byte in s:
        if byte < 128:
            escaped.append(byte)
        else:
            escaped.extend(('\\%3o' % byte).encode('ASCII'))
    return escaped.decode('ISO-8859-1')

def split_string_literal(s, limit=2000):
    """Break an escaped string literal body into pieces joined by '""'.

    Strings shorter than `limit` are returned unchanged.  Longer ones are
    cut into pieces of at most `limit` characters; a cut is moved back so
    that it does not fall within the four characters following a
    backslash, nor directly after one.
    """
    # MSVC can't handle long string literals.
    if len(s) < limit:
        return s

    total = len(s)
    pieces = []
    pos = 0
    # NOTE(review): with limit <= 4 a piece starting with a backslash looks
    # like it yields an empty piece without advancing -- confirm whether
    # such tiny limits are ever used.
    while pos < total:
        cut = pos + limit
        tail = s[cut-4:cut]
        if total > cut - 4 and '\\' in tail:
            # move the cut to just before the first backslash in the tail
            cut -= 4 - tail.find('\\')
            while s[cut-1] == '\\':
                cut -= 1
                if cut == pos:
                    # must have been a long line of backslashes
                    cut = pos + limit - (limit % 2) - 4
                    break
        pieces.append(s[pos:cut])
        pos = cut
    return '""'.join(pieces)

def encode_pyunicode_string(s):
    """Create the Py_UNICODE[] representations of a given unicode string.

    Returns a pair of comma-separated decimal number strings, the first
    holding UTF-16 code units and the second UTF-32 code points, each
    with a terminating 0.  The UTF-16 part is empty when it would be
    identical to the UTF-32 part.
    """
    code_values = list(map(ord, s)) + [0]

    if sys.maxunicode < 0x10000:
        # narrow build: the input already consists of UTF-16 code units,
        # so fold each high/low surrogate pair into one code point
        utf16, utf32 = code_values, []
        for unit in code_values:
            is_low_surrogate = 0xDC00 <= unit <= 0xDFFF
            if is_low_surrogate and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
                lead = utf32[-1]
                utf32[-1] = ((lead & 0x3FF) << 10) + (unit & 0x3FF) + 0x10000
            else:
                utf32.append(unit)
    else:
        # Wide build or Py3.3: the input consists of code points, so
        # split everything outside of the BMP into a surrogate pair
        utf16, utf32 = [], code_values
        for code_point in code_values:
            if code_point < 0x10000:
                utf16.append(code_point)
            else:
                high, low = divmod(code_point - 0x10000, 1024)
                utf16.extend((high + 0xD800, low + 0xDC00))

    if utf16 == utf32:
        utf16 = []
    return ",".join(map(_unicode, utf16)), ",".join(map(_unicode, utf32))
Projekt 2020-02-01 19:54:00 +01:00			`#`
			`# Cython -- encoding related tools`
			`#`

			`from __future__ import absolute_import`

			`import re`
			`import sys`

			`if sys.version_info[0] >= 3:`
			`_unicode, _str, _bytes, _unichr = str, str, bytes, chr`
			`IS_PYTHON3 = True`
			`else:`
			`_unicode, _str, _bytes, _unichr = unicode, str, str, unichr`
			`IS_PYTHON3 = False`

			`empty_bytes = _bytes()`
			`empty_unicode = _unicode()`

			`join_bytes = empty_bytes.join`


			`class UnicodeLiteralBuilder(object):`
			`"""Assemble a unicode string.`
			`"""`
			`def __init__(self):`
			`self.chars = []`

			`def append(self, characters):`
			`if isinstance(characters, _bytes):`
			`# this came from a Py2 string literal in the parser code`
			`characters = characters.decode("ASCII")`
			`assert isinstance(characters, _unicode), str(type(characters))`
			`self.chars.append(characters)`

			`if sys.maxunicode == 65535:`
			`def append_charval(self, char_number):`
			`if char_number > 65535:`
			`# wide Unicode character on narrow platform => replace`
			`# by surrogate pair`
			`char_number -= 0x10000`
			`self.chars.append( _unichr((char_number // 1024) + 0xD800) )`
			`self.chars.append( _unichr((char_number % 1024) + 0xDC00) )`
			`else:`
			`self.chars.append( _unichr(char_number) )`
			`else:`
			`def append_charval(self, char_number):`
			`self.chars.append( _unichr(char_number) )`

			`def append_uescape(self, char_number, escape_string):`
			`self.append_charval(char_number)`

			`def getstring(self):`
			`return EncodedString(u''.join(self.chars))`

			`def getstrings(self):`
			`return (None, self.getstring())`


			`class BytesLiteralBuilder(object):`
			`"""Assemble a byte string or char value.`
			`"""`
			`def __init__(self, target_encoding):`
			`self.chars = []`
			`self.target_encoding = target_encoding`

			`def append(self, characters):`
			`if isinstance(characters, _unicode):`
			`characters = characters.encode(self.target_encoding)`
			`assert isinstance(characters, _bytes), str(type(characters))`
			`self.chars.append(characters)`

			`def append_charval(self, char_number):`
			`self.chars.append( _unichr(char_number).encode('ISO-8859-1') )`

			`def append_uescape(self, char_number, escape_string):`
			`self.append(escape_string)`

			`def getstring(self):`
			`# this must return a byte string!`
			`return bytes_literal(join_bytes(self.chars), self.target_encoding)`

			`def getchar(self):`
			`# this must return a byte string!`
			`return self.getstring()`

			`def getstrings(self):`
			`return (self.getstring(), None)`


			`class StrLiteralBuilder(object):`
			`"""Assemble both a bytes and a unicode representation of a string.`
			`"""`
			`def __init__(self, target_encoding):`
			`self._bytes = BytesLiteralBuilder(target_encoding)`
			`self._unicode = UnicodeLiteralBuilder()`

			`def append(self, characters):`
			`self._bytes.append(characters)`
			`self._unicode.append(characters)`

			`def append_charval(self, char_number):`
			`self._bytes.append_charval(char_number)`
			`self._unicode.append_charval(char_number)`

			`def append_uescape(self, char_number, escape_string):`
			`self._bytes.append(escape_string)`
			`self._unicode.append_charval(char_number)`

			`def getstrings(self):`
			`return (self._bytes.getstring(), self._unicode.getstring())`


			`class EncodedString(_unicode):`
			`# unicode string subclass to keep track of the original encoding.`
			`# 'encoding' is None for unicode strings and the source encoding`
			`# otherwise`
			`encoding = None`

			`def __deepcopy__(self, memo):`
			`return self`

			`def byteencode(self):`
			`assert self.encoding is not None`
			`return self.encode(self.encoding)`

			`def utf8encode(self):`
			`assert self.encoding is None`
			`return self.encode("UTF-8")`

			`@property`
			`def is_unicode(self):`
			`return self.encoding is None`

			`def contains_surrogates(self):`
			`return string_contains_surrogates(self)`

			`def as_utf8_string(self):`
			`return bytes_literal(self.utf8encode(), 'utf8')`


			`def string_contains_surrogates(ustring):`
			`"""`
			`Check if the unicode string contains surrogate code points`
			`on a CPython platform with wide (UCS-4) or narrow (UTF-16)`
			`Unicode, i.e. characters that would be spelled as two`
			`separate code units on a narrow platform.`
			`"""`
			`for c in map(ord, ustring):`
			`if c > 65535: # can only happen on wide platforms`
			`return True`
			`if 0xD800 <= c <= 0xDFFF:`
			`return True`
			`return False`


			`class BytesLiteral(_bytes):`
			`# bytes subclass that is compatible with EncodedString`
			`encoding = None`

			`def __deepcopy__(self, memo):`
			`return self`

			`def byteencode(self):`
			`if IS_PYTHON3:`
			`return _bytes(self)`
			`else:`
			`# fake-recode the string to make it a plain bytes object`
			`return self.decode('ISO-8859-1').encode('ISO-8859-1')`

			`def utf8encode(self):`
			`assert False, "this is not a unicode string: %r" % self`

			`def __str__(self):`
			`"""Fake-decode the byte string to unicode to support %`
			`formatting of unicode strings.`
			`"""`
			`return self.decode('ISO-8859-1')`

			`is_unicode = False`

			`def as_c_string_literal(self):`
			`value = split_string_literal(escape_byte_string(self))`
			`return '"%s"' % value`


			`def bytes_literal(s, encoding):`
			`assert isinstance(s, bytes)`
			`s = BytesLiteral(s)`
			`s.encoding = encoding`
			`return s`


			`def encoded_string(s, encoding):`
			`assert isinstance(s, (_unicode, bytes))`
			`s = EncodedString(s)`
			`if encoding is not None:`
			`s.encoding = encoding`
			`return s`


			`char_from_escape_sequence = {`
			`r'\a' : u'\a',`
			`r'\b' : u'\b',`
			`r'\f' : u'\f',`
			`r'\n' : u'\n',`
			`r'\r' : u'\r',`
			`r'\t' : u'\t',`
			`r'\v' : u'\v',`
			`}.get`

			`_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))`


			`def _to_escape_sequence(s):`
			`if s in '\n\r\t':`
			`return repr(s)[1:-1]`
			`elif s == '"':`
			`return r'\"'`
			`elif s == '\\':`
			`return r'\\'`
			`else:`
			`# within a character sequence, oct passes much better than hex`
			`return ''.join(['\\%03o' % ord(c) for c in s])`


			`def _build_specials_replacer():`
			`subexps = []`
			`replacements = {}`
			`for special in _c_special:`
			`regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])`
			`subexps.append(regexp)`
			`replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')`
			`sub = re.compile(('(%s)' % '\|'.join(subexps)).encode('ASCII')).sub`
			`def replace_specials(m):`
			`return replacements[m.group(1)]`
			`def replace(s):`
			`return sub(replace_specials, s)`
			`return replace`

			`_replace_specials = _build_specials_replacer()`


			`def escape_char(c):`
			`if IS_PYTHON3:`
			`c = c.decode('ISO-8859-1')`
			`if c in '\n\r\t\\':`
			`return repr(c)[1:-1]`
			`elif c == "'":`
			`return "\\'"`
			`n = ord(c)`
			`if n < 32 or n > 127:`
			`# hex works well for characters`
			`return "\\x%02X" % n`
			`else:`
			`return c`

			`def escape_byte_string(s):`
			`"""Escape a byte string so that it can be written into C code.`
			`Note that this returns a Unicode string instead which, when`
			`encoded as ISO-8859-1, will result in the correct byte sequence`
			`being written.`
			`"""`
			`s = _replace_specials(s)`
			`try:`
			`return s.decode("ASCII") # trial decoding: plain ASCII => done`
			`except UnicodeDecodeError:`
			`pass`
			`if IS_PYTHON3:`
			`s_new = bytearray()`
			`append, extend = s_new.append, s_new.extend`
			`for b in s:`
			`if b >= 128:`
			`extend(('\\%3o' % b).encode('ASCII'))`
			`else:`
			`append(b)`
			`return s_new.decode('ISO-8859-1')`
			`else:`
			`l = []`
			`append = l.append`
			`for c in s:`
			`o = ord(c)`
			`if o >= 128:`
			`append('\\%3o' % o)`
			`else:`
			`append(c)`
			`return join_bytes(l).decode('ISO-8859-1')`

			`def split_string_literal(s, limit=2000):`
			`# MSVC can't handle long string literals.`
			`if len(s) < limit:`
			`return s`
			`else:`
			`start = 0`
			`chunks = []`
			`while start < len(s):`
			`end = start + limit`
			`if len(s) > end-4 and '\\' in s[end-4:end]:`
			`end -= 4 - s[end-4:end].find('\\') # just before the backslash`
			`while s[end-1] == '\\':`
			`end -= 1`
			`if end == start:`
			`# must have been a long line of backslashes`
			`end = start + limit - (limit % 2) - 4`
			`break`
			`chunks.append(s[start:end])`
			`start = end`
			`return '""'.join(chunks)`

			`def encode_pyunicode_string(s):`
			`"""Create Py_UNICODE[] representation of a given unicode string.`
			`"""`
			`s = list(map(ord, s)) + [0]`

			`if sys.maxunicode >= 0x10000: # Wide build or Py3.3`
			`utf16, utf32 = [], s`
			`for code_point in s:`
			`if code_point >= 0x10000: # outside of BMP`
			`high, low = divmod(code_point - 0x10000, 1024)`
			`utf16.append(high + 0xD800)`
			`utf16.append(low + 0xDC00)`
			`else:`
			`utf16.append(code_point)`
			`else:`
			`utf16, utf32 = s, []`
			`for code_unit in s:`
			`if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:`
			`high, low = utf32[-1], code_unit`
			`utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000`
			`else:`
			`utf32.append(code_unit)`

			`if utf16 == utf32:`
			`utf16 = []`
			`return ",".join(map(_unicode, utf16)), ",".join(map(_unicode, utf32))`