# Traktor/myenv/Lib/site-packages/fsspec/tests/test_utils.py

import io
import sys
from pathlib import Path, PurePath
from unittest.mock import Mock

import pytest

import fsspec.utils
from fsspec.utils import (
    can_be_local,
    common_prefix,
    get_protocol,
    infer_storage_options,
    merge_offset_ranges,
    mirror_from,
    other_paths,
    read_block,
    seek_delimiter,
    setup_logging,
)

# True when ``sys.platform`` starts with "win" (i.e. a Windows platform string).
WIN = sys.platform.startswith("win")


def test_read_block():
    """Exercise ``read_block`` on a small newline-separated buffer.

    Covers one read without a delimiter, a series of delimiter-aware reads at
    different offset/length combinations, and finally checks that a sequence
    of adjacent windows joins back into the complete buffer.
    """
    newline = b"\n"
    payload = newline.join([b"123", b"456", b"789"])
    buf = io.BytesIO(payload)

    # No delimiter: the expected result is exactly the requested byte range.
    assert read_block(buf, 1, 2) == b"23"

    # (offset, length, expected) for reads made with delimiter=b"\n".
    delimited_cases = [
        (0, 1, b"123\n"),
        (0, 2, b"123\n"),
        (0, 3, b"123\n"),
        (0, 5, b"123\n456\n"),
        (0, 8, b"123\n456\n789"),
        (0, 100, b"123\n456\n789"),
        (1, 1, b""),
        (1, 5, b"456\n"),
        (1, 8, b"456\n789"),
    ]
    for offset, length, expected in delimited_cases:
        assert read_block(buf, offset, length, delimiter=b"\n") == expected

    # Two different tilings of the buffer; dropping the empty pieces and
    # concatenating the rest must give back the original bytes.
    tilings = (
        [(0, 3), (3, 3), (6, 3), (9, 2)],
        [(0, 4), (4, 4), (8, 4)],
    )
    for windows in tilings:
        pieces = [read_block(buf, start, size, b"\n") for start, size in windows]
        assert b"".join(filter(None, pieces)) == payload


def test_read_block_split_before():
    """Check ``read_block`` at the start, middle and end of a large buffer.

    Every window is read twice, first with the default behaviour and then with
    ``split_before=True``; the expected bytes show on which side of the
    delimiter each mode cuts.
    """
    records = "".join(f">foo{i}\nFOOBAR{i}\n" for i in range(100000))
    d = ("#header" + records).encode()
    tlen = len(d)

    # Each row: (offset, length, delimiter, expected by default,
    #            expected with split_before=True)
    cases = [
        # Single record at the beginning: reads include the start of the file
        # and run through the termination of the delimited record.
        (0, 10, b"\n", b"#header>foo0\n", b"#header>foo0"),
        (0, 10, b">", b"#header>foo0\nFOOBAR0\n>", b"#header>foo0\nFOOBAR0\n"),
        # Multiple records at the beginning, same start-of-file behaviour.
        (
            0,
            27,
            b"\n",
            b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n",
            b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1",
        ),
        (
            0,
            27,
            b">",
            b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n>",
            b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n",
        ),
        # Offset spanning into the next record: split falls on either side of
        # the delimiter. A window not spanning a full record returns nothing.
        (10, 3, b"\n", b"FOOBAR0\n", b"\nFOOBAR0"),
        (10, 3, b">", b"", b""),
        # Offset spanning multiple records.
        (10, 20, b"\n", b"FOOBAR0\n>foo1\nFOOBAR1\n", b"\nFOOBAR0\n>foo1\nFOOBAR1"),
        (10, 20, b">", b"foo1\nFOOBAR1\n>", b">foo1\nFOOBAR1\n"),
        # Record at the end of the data: everything is read through to the end.
        (
            tlen - 30,
            35,
            b"\n",
            b">foo99999\nFOOBAR99999\n",
            b"\n>foo99999\nFOOBAR99999\n",
        ),
        (
            tlen - 30,
            35,
            b">",
            b"foo99999\nFOOBAR99999\n",
            b">foo99999\nFOOBAR99999\n",
        ),
    ]

    for offset, length, delim, want_default, want_split_before in cases:
        # A fresh BytesIO per call, so no read depends on a previous position.
        got_default = read_block(io.BytesIO(d), offset, length, delimiter=delim)
        assert got_default == want_default

        got_split_before = read_block(
            io.BytesIO(d), offset, length, delimiter=delim, split_before=True
        )
        assert got_split_before == want_split_before


def test_seek_delimiter_endline():
    """Verify where ``seek_delimiter`` leaves the stream position."""
    stream = io.BytesIO(b"123\n456\n789")

    # Starting at position zero, the position is expected to stay at zero.
    seek_delimiter(stream, b"\n", 5)
    assert stream.tell() == 0

    # Starting at position 1, the stream should end up just past the first
    # newline (position 4), whichever block size is used.
    for blocksize in (1, 5, 100):
        stream.seek(1)
        seek_delimiter(stream, b"\n", blocksize=blocksize)
        assert stream.tell() == 4

    # Multi-byte delimiter, including block sizes shorter than the delimiter.
    stream = io.BytesIO(b"123abc456abc789")
    for blocksize in (1, 2, 3, 4, 5, 6, 10):
        stream.seek(1)
        seek_delimiter(stream, b"abc", blocksize=blocksize)
        assert stream.tell() == 6

    # No delimiter after position 5: the stream should stop at the end of the
    # 7-byte buffer.
    stream = io.BytesIO(b"123\n456")
    stream.seek(5)
    seek_delimiter(stream, b"\n", 5)
    assert stream.tell() == 7


def test_infer_options():
    """Check ``infer_storage_options`` on local paths, URLs and key collisions."""
    # An absolute POSIX path yields exactly a protocol and a path.
    assert infer_storage_options("/mnt/datasets/test.csv") == {
        "protocol": "file",
        "path": "/mnt/datasets/test.csv",
    }

    # Relative paths are expected to be kept as given.
    for relative in ("./test.csv", "../test.csv"):
        assert infer_storage_options(relative)["path"] == relative

    # A Windows drive path also yields exactly a protocol and a path.
    assert infer_storage_options("C:\\test.csv") == {
        "protocol": "file",
        "path": "C:\\test.csv",
    }

    # Other backslash-style and bare names are expected to be kept as given.
    for plain in ("d:\\test.csv", "\\test.csv", ".\\test.csv", "test.csv"):
        assert infer_storage_options(plain)["path"] == plain

    # A fully populated URL: every component is reported, the inherited option
    # is carried over, and no other keys are present.
    detailed = infer_storage_options(
        "hdfs://username:pwd@Node:123/mnt/datasets/test.csv?q=1#fragm",
        inherit_storage_options={"extra": "value"},
    )
    assert detailed == {
        "protocol": "hdfs",
        "username": "username",
        "password": "pwd",
        "host": "Node",
        "port": 123,
        "path": "/mnt/datasets/test.csv#fragm",
        "url_query": "q=1",
        "url_fragment": "fragm",
        "extra": "value",
    }

    # User and host names are expected to keep their case and punctuation.
    named = infer_storage_options(
        "hdfs://User-name@Node-name.com/mnt/datasets/test.csv"
    )
    assert named["username"] == "User-name"
    assert named["host"] == "Node-name.com"

    # For http the expected path is the complete URL.
    u = "http://127.0.0.1:8080/test.csv"
    assert infer_storage_options(u) == {"protocol": "http", "path": u}

    # For s3 and gcs the netloc is actually the bucket name, so it belongs in
    # the path and must not be lowercased by parsing.
    for protocol in ("s3", "s3a", "gcs", "gs"):
        bucket_options = infer_storage_options(f"{protocol}://Bucket-name.com/test.csv")
        assert bucket_options["path"] == "Bucket-name.com/test.csv"

    # Inherited options whose keys clash with inferred ones must raise KeyError.
    clashes = (
        ("file:///bucket/file.csv", {"path": "collide"}),
        ("hdfs:///bucket/file.csv", {"protocol": "collide"}),
    )
    for url, inherited in clashes:
        with pytest.raises(KeyError):
            infer_storage_options(url, inherited)


def test_infer_simple():
    """A path with a leading double slash is expected to be a local file path.

    The inferred options must use the ``file`` protocol, keep the path
    unchanged, and report no host.
    """
    inferred = infer_storage_options("//mnt/datasets/test.csv")
    assert inferred["protocol"] == "file"
    assert inferred["path"] == "//mnt/datasets/test.csv"
    assert inferred.get("host") is None


@pytest.mark.parametrize(
    ("urlpath", "expected_path"),
    [
        # Bare drive-letter paths are expected to be kept as given.
        (r"c:\foo\bar", r"c:\foo\bar"),
        (r"C:\\foo\bar", r"C:\\foo\bar"),
        (r"c:/foo/bar", r"c:/foo/bar"),
        # file:// URLs: the expected path drops the scheme prefix and uses a
        # colon after the drive letter, including for the "|" spelling.
        (r"file:///c|\foo\bar", r"c:\foo\bar"),
        (r"file:///C|/foo/bar", r"C:/foo/bar"),
        (r"file:///C:/foo/bar", r"C:/foo/bar"),
    ],
)
def test_infer_storage_options_c(urlpath, expected_path):
    """Drive-letter paths and ``file://`` URLs resolve to the ``file`` protocol."""
    options = infer_storage_options(urlpath)
    assert options["protocol"] == "file"
    assert options["path"] == expected_path


@pytest.mark.parametrize(
    ("paths", "out"),
    [
        # Absolute paths sharing a leading directory.
        (["/more/dir/", "/more/dir/two", "/more/one", "/more/three"], "/more"),
        # Root and empty-string combinations.
        (["/", "", "/"], ""),
        (["/", "/"], "/"),
        (["/more/", "/"], ""),
        # Same directory with and without a trailing slash.
        (["/more/", "/more"], "/more"),
        # Relative paths sharing a leading directory.
        (["more/dir/", "more/dir/two", "more/one", "more/three"], "more"),
    ],
)
def test_common_prefix(paths, out):
    """``common_prefix`` returns the expected shared leading path for ``paths``."""
    prefix = common_prefix(paths)
    assert prefix == out


@pytest.mark.parametrize(
    ("paths", "other", "exists", "expected"),
    [
        # --- single source path, string destination ---
        (["/path1"], "/path2", False, ["/path2"]),
        (["/path1"], "/path2", True, ["/path2/path1"]),
        # NOTE(review): the next row repeats the first row verbatim; possibly a
        # different ``other`` (e.g. with a trailing slash) was intended - confirm.
        (["/path1"], "/path2", False, ["/path2"]),
        (["/path1"], "/path2/", True, ["/path2/path1"]),
        # --- single source path, list destination ---
        (["/path1"], ["/path2"], False, ["/path2"]),
        (["/path1"], ["/path2"], True, ["/path2"]),
        # --- several top-level source paths ---
        (["/path1", "/path2"], "/path2", False, ["/path2/path1", "/path2/path2"]),
        (["/path1", "/path2"], "/path2", True, ["/path2/path1", "/path2/path2"]),
        # --- source paths sharing a parent directory ---
        (
            ["/more/path1", "/more/path2"],
            "/path2",
            False,
            ["/path2/path1", "/path2/path2"],
        ),
        (
            ["/more/path1", "/more/path2"],
            "/path2",
            True,
            ["/path2/more/path1", "/path2/more/path2"],
        ),
        # NOTE(review): the next two rows repeat the previous two verbatim -
        # confirm whether different inputs were intended.
        (
            ["/more/path1", "/more/path2"],
            "/path2",
            False,
            ["/path2/path1", "/path2/path2"],
        ),
        (
            ["/more/path1", "/more/path2"],
            "/path2",
            True,
            ["/path2/more/path1", "/path2/more/path2"],
        ),
        # Same sources, destination written with a trailing slash.
        (
            ["/more/path1", "/more/path2"],
            "/path2/",
            False,
            ["/path2/path1", "/path2/path2"],
        ),
        (
            ["/more/path1", "/more/path2"],
            "/path2/",
            True,
            ["/path2/more/path1", "/path2/more/path2"],
        ),
        # --- source paths in different parent directories ---
        (
            ["/more/path1", "/diff/path2"],
            "/path2/",
            False,
            ["/path2/more/path1", "/path2/diff/path2"],
        ),
        (
            ["/more/path1", "/diff/path2"],
            "/path2/",
            True,
            ["/path2/more/path1", "/path2/diff/path2"],
        ),
        # --- relative destination; trailing slashes on sources are kept ---
        (["a", "b/", "b/c"], "dest/", False, ["dest/a", "dest/b/", "dest/b/c"]),
        (
            ["/a", "/b/", "/b/c"],
            "dest/",
            False,
            ["dest/a", "dest/b/", "dest/b/c"],
        ),
    ],
)
def test_other_paths(paths, other, exists, expected):
    """``other_paths`` maps ``paths`` onto ``other`` as listed in ``expected``."""
    mapped = other_paths(paths, other, exists)
    assert mapped == expected


def test_log():
    """``setup_logging`` called with only a logger name yields a DEBUG-level logger."""
    from logging import DEBUG

    configured = setup_logging(logger_name="fsspec.test")
    assert configured.level == DEBUG


@pytest.mark.parametrize(
    "par",
    [
        # No protocol prefix: expected to be treated as "file".
        ("afile", "file"),
        ("file://afile", "file"),
        # The prefix is reported as-is for both "://" and "::" separators.
        ("noproto://afile", "noproto"),
        ("noproto::stuff", "noproto"),
        ("simplecache::stuff", "simplecache"),
        ("simplecache://stuff", "simplecache"),
        ("s3://afile", "s3"),
        # pathlib objects are accepted as well.
        (Path("afile"), "file"),
    ],
)
def test_get_protocol(par):
    """``get_protocol`` extracts the expected protocol name from each URL."""
    location, expected_protocol = par
    detected = get_protocol(location)
    assert detected == expected_protocol


@pytest.mark.parametrize(
    "par",
    [
        # Plain and file:// locations are expected to be local-capable.
        ("afile", True),
        ("file://afile", True),
        # "noproto" locations are expected not to be.
        ("noproto://afile", False),
        ("noproto::stuff", False),
        # simplecache locations are expected to be local-capable.
        ("simplecache::stuff", True),
        ("simplecache://stuff", True),
        # pathlib objects are accepted as well.
        (Path("afile"), True),
    ],
)
def test_can_local(par):
    """``can_be_local`` gives the expected answer for each location."""
    location, expected_flag = par
    answer = can_be_local(location)
    assert answer == expected_flag


def test_mirror_from():
    """Check that ``mirror_from`` forwards the listed names to ``client``.

    ``attr``, ``func_1`` and ``func_2`` are listed and must reach the mock
    returned by the ``client`` property; ``func_3`` is not listed and must
    stay the class's own method.
    """
    backend = Mock()
    backend.attr = 1

    @mirror_from("client", ["attr", "func_1", "func_2"])
    class Real:
        @property
        def client(self):
            return backend

        def func_2(self):
            # Listed in mirror_from above, so this body must never run.
            assert False, "have to overwrite this"

        def func_3(self):
            # Not listed in mirror_from, so this is the method that gets called.
            return "should succeed"

    instance = Real()

    # Plain attribute access is read through to the backend.
    assert instance.attr == backend.attr

    # A name that exists only on the backend is callable on the instance.
    instance.func_1()
    backend.func_1.assert_called()

    # A name defined on the class but listed for mirroring goes to the backend,
    # arguments included.
    instance.func_2(1, 2)
    backend.func_2.assert_called_with(1, 2)

    # An unlisted method is served by the class and never touches the backend.
    own_result = instance.func_3()
    assert own_result == "should succeed"
    backend.func_3.assert_not_called()


@pytest.mark.parametrize("max_gap", [0, 32])
@pytest.mark.parametrize("max_block", [None, 128])
def test_merge_offset_ranges(max_gap, max_block):
    """Check ``merge_offset_ranges`` output for each gap/block-size setting."""
    # Input ranges as (path, start, end), deliberately out of order so that
    # the out-of-order handling is covered too.
    ranges = [
        ("foo", 0, 32),
        ("bar", 0, 32),
        ("bar", 512, 1024),
        ("bar", 64, 256),
        ("foo", 32, 64),
    ]
    paths, starts, ends = (list(column) for column in zip(*ranges))

    if max_block is not None or max_gap != 32:
        # Expected: the three "bar" ranges stay separate; the two adjacent
        # "foo" ranges are combined into 0-64.
        expect_paths = ["bar", "bar", "bar", "foo"]
        expect_starts = [0, 64, 512, 0]
        expect_ends = [32, 256, 1024, 64]
    else:
        # max_gap of 32 with no block limit: "bar" 0-32 and 64-256 are also
        # expected to be combined into a single 0-256 range.
        expect_paths = ["bar", "bar", "foo"]
        expect_starts = [0, 512, 0]
        expect_ends = [256, 1024, 64]

    result_paths, result_starts, result_ends = merge_offset_ranges(
        paths,
        starts,
        ends,
        max_gap=max_gap,
        max_block=max_block,
    )

    assert expect_paths == result_paths
    assert expect_starts == result_starts
    assert expect_ends == result_ends


def test_size():
    """``file_size`` reports 5 for a 5-byte buffer; the position is 0 afterwards."""
    buffer = io.BytesIO(b"hello")
    measured = fsspec.utils.file_size(buffer)
    assert measured == 5
    assert buffer.tell() == 0


class _HasFspath:
    """Test helper whose ``__fspath__`` returns the string ``"foo"``."""

    def __fspath__(self) -> str:
        return "foo"


class _HasPathAttr:
    """Test helper that carries a ``path`` attribute set to ``"foo"``."""

    def __init__(self) -> None:
        # Instance attribute read by the stringify_path test cases.
        self.path = "foo"


@pytest.mark.parametrize(
    ("path", "expected"),
    [
        # Inputs expected to be coerced to a plain string.
        ("foo", "foo"),
        (Path("foo"), "foo"),
        (PurePath("foo"), "foo"),
        (_HasFspath(), "foo"),
        (_HasPathAttr(), "foo"),
        # Inputs expected to pass through unchanged.
        (b"bytes", b"bytes"),
        (None, None),
        (1, 1),
        (True, True),
        # A bare object() only compares equal to itself, so this row checks
        # that the very same object comes back.
        ((o := object()), o),
        ([], []),
        ((), ()),
        (set(), set()),
    ],
)
def test_stringify_path(path, expected):
    """``stringify_path`` returns the expected value for each kind of input."""
    assert fsspec.utils.stringify_path(path) == expected