PCQRSCANER/venv/Lib/site-packages/nltk/test/unit/test_seekable_unicode_stream_reader.py

# -*- coding: utf-8 -*-
"""
The following test performs a random series of reads, seeks, and
tells, and checks that the results are consistent.
"""
from __future__ import absolute_import, unicode_literals
import random
import functools
from io import BytesIO
from nltk.corpus.reader import SeekableUnicodeStreamReader


def check_reader(unicode_string, encoding, n=1000):
    bytestr = unicode_string.encode(encoding)
    strlen = len(unicode_string)
    stream = BytesIO(bytestr)
    reader = SeekableUnicodeStreamReader(stream, encoding)
    # Find all character positions
    chars = []
    while True:
        pos = reader.tell()
        chars.append((pos, reader.read(1)))
        if chars[-1][1] == '':
            break
    # Find all strings
    strings = dict((pos, '') for (pos, c) in chars)
    for pos1, char in chars:
        for pos2, _ in chars:
            if pos2 <= pos1:
                strings[pos2] += char
    while True:
        op = random.choice('tsrr')
        # Check our position?
        if op == 't':  # tell
            reader.tell()
        # Perform a seek?
        if op == 's':  # seek
            new_pos = random.choice([p for (p, c) in chars])
            reader.seek(new_pos)
        # Perform a read?
        if op == 'r':  # read
            if random.random() < 0.3:
                pos = reader.tell()
            else:
                pos = None
            if random.random() < 0.2:
                size = None
            elif random.random() < 0.8:
                size = random.randint(0, int(strlen / 6))
            else:
                size = random.randint(0, strlen + 20)
            if random.random() < 0.8:
                s = reader.read(size)
            else:
                s = reader.readline(size)
            # check that everything's consistent
            if pos is not None:
                assert pos in strings
                assert strings[pos].startswith(s)
                n -= 1
                if n == 0:
                    return 'passed'


# Call the randomized test function `check_reader` with a variety of
# input strings and encodings.

ENCODINGS = ['ascii', 'latin1', 'greek', 'hebrew', 'utf-16', 'utf-8']

STRINGS = [
    """
    This is a test file.
    It is fairly short.
    """,
    "This file can be encoded with latin1. \x83",
    """\
    This is a test file.
    Here's a blank line:

    And here's some unicode: \xee \u0123 \uffe3
    """,
    """\
    This is a test file.
    Unicode characters: \xf3 \u2222 \u3333\u4444 \u5555
    """,
]


def test_reader():
    for string in STRINGS:
        for encoding in ENCODINGS:
            try:
                # skip strings that can't be encoded with the current encoding
                string.encode(encoding)
                yield check_reader, string, encoding
            except UnicodeEncodeError:
                pass


# nose shows the whole string arguments in a verbose mode; this is annoying,
# so large string test is separated.

LARGE_STRING = (
    """\
This is a larger file.  It has some lines that are longer \
than 72 characters.  It's got lots of repetition.  Here's \
some unicode chars: \xee \u0123 \uffe3 \ueeee \u2345

How fun!  Let's repeat it twenty times.
"""
    * 10
)


def test_reader_on_large_string():
    for encoding in ENCODINGS:
        try:
            # skip strings that can't be encoded with the current encoding
            LARGE_STRING.encode(encoding)

            def _check(encoding, n=1000):
                check_reader(LARGE_STRING, encoding, n)

            yield _check, encoding

        except UnicodeEncodeError:
            pass


def test_reader_stream_is_closed():
    reader = SeekableUnicodeStreamReader(BytesIO(b''), 'ascii')
    assert reader.stream.closed is False
    reader.__del__()
    assert reader.stream.closed is True


def teardown_module(module=None):
    import gc

    gc.collect()
3 2019-12-22 21:51:47 +01:00			`# -- coding: utf-8 --`
			`"""`
			`The following test performs a random series of reads, seeks, and`
			`tells, and checks that the results are consistent.`
			`"""`
			`from __future__ import absolute_import, unicode_literals`
			`import random`
			`import functools`
			`from io import BytesIO`
			`from nltk.corpus.reader import SeekableUnicodeStreamReader`


			`def check_reader(unicode_string, encoding, n=1000):`
			`bytestr = unicode_string.encode(encoding)`
			`strlen = len(unicode_string)`
			`stream = BytesIO(bytestr)`
			`reader = SeekableUnicodeStreamReader(stream, encoding)`
			`# Find all character positions`
			`chars = []`
			`while True:`
			`pos = reader.tell()`
			`chars.append((pos, reader.read(1)))`
			`if chars[-1][1] == '':`
			`break`
			`# Find all strings`
			`strings = dict((pos, '') for (pos, c) in chars)`
			`for pos1, char in chars:`
			`for pos2, _ in chars:`
			`if pos2 <= pos1:`
			`strings[pos2] += char`
			`while True:`
			`op = random.choice('tsrr')`
			`# Check our position?`
			`if op == 't': # tell`
			`reader.tell()`
			`# Perform a seek?`
			`if op == 's': # seek`
			`new_pos = random.choice([p for (p, c) in chars])`
			`reader.seek(new_pos)`
			`# Perform a read?`
			`if op == 'r': # read`
			`if random.random() < 0.3:`
			`pos = reader.tell()`
			`else:`
			`pos = None`
			`if random.random() < 0.2:`
			`size = None`
			`elif random.random() < 0.8:`
			`size = random.randint(0, int(strlen / 6))`
			`else:`
			`size = random.randint(0, strlen + 20)`
			`if random.random() < 0.8:`
			`s = reader.read(size)`
			`else:`
			`s = reader.readline(size)`
			`# check that everything's consistent`
			`if pos is not None:`
			`assert pos in strings`
			`assert strings[pos].startswith(s)`
			`n -= 1`
			`if n == 0:`
			`return 'passed'`


			# Call the randomized test function `check_reader` with a variety of
			`# input strings and encodings.`

			`ENCODINGS = ['ascii', 'latin1', 'greek', 'hebrew', 'utf-16', 'utf-8']`

			`STRINGS = [`
			`"""`
			`This is a test file.`
			`It is fairly short.`
			`""",`
			`"This file can be encoded with latin1. \x83",`
			`"""\`
			`This is a test file.`
			`Here's a blank line:`

			`And here's some unicode: \xee \u0123 \uffe3`
			`""",`
			`"""\`
			`This is a test file.`
			`Unicode characters: \xf3 \u2222 \u3333\u4444 \u5555`
			`""",`
			`]`


			`def test_reader():`
			`for string in STRINGS:`
			`for encoding in ENCODINGS:`
			`try:`
			`# skip strings that can't be encoded with the current encoding`
			`string.encode(encoding)`
			`yield check_reader, string, encoding`
			`except UnicodeEncodeError:`
			`pass`


			`# nose shows the whole string arguments in a verbose mode; this is annoying,`
			`# so large string test is separated.`

			`LARGE_STRING = (`
			`"""\`
			`This is a larger file. It has some lines that are longer \`
			`than 72 characters. It's got lots of repetition. Here's \`
			`some unicode chars: \xee \u0123 \uffe3 \ueeee \u2345`

			`How fun! Let's repeat it twenty times.`
			`"""`
			`* 10`
			`)`


			`def test_reader_on_large_string():`
			`for encoding in ENCODINGS:`
			`try:`
			`# skip strings that can't be encoded with the current encoding`
			`LARGE_STRING.encode(encoding)`

			`def _check(encoding, n=1000):`
			`check_reader(LARGE_STRING, encoding, n)`

			`yield _check, encoding`

			`except UnicodeEncodeError:`
			`pass`


			`def test_reader_stream_is_closed():`
			`reader = SeekableUnicodeStreamReader(BytesIO(b''), 'ascii')`
			`assert reader.stream.closed is False`
			`reader.__del__()`
			`assert reader.stream.closed is True`


			`def teardown_module(module=None):`
			`import gc`

			`gc.collect()`