import io
import sys
from pathlib import Path, PurePath
from unittest.mock import Mock

import pytest

import fsspec.utils
from fsspec.utils import (
    can_be_local,
    common_prefix,
    get_protocol,
    infer_storage_options,
    merge_offset_ranges,
    mirror_from,
    other_paths,
    read_block,
    seek_delimiter,
    setup_logging,
)

WIN = sys.platform.startswith("win")


def test_read_block():
    delimiter = b"\n"
    data = delimiter.join([b"123", b"456", b"789"])
    f = io.BytesIO(data)

    assert read_block(f, 1, 2) == b"23"
    assert read_block(f, 0, 1, delimiter=b"\n") == b"123\n"
    assert read_block(f, 0, 2, delimiter=b"\n") == b"123\n"
    assert read_block(f, 0, 3, delimiter=b"\n") == b"123\n"
    assert read_block(f, 0, 5, delimiter=b"\n") == b"123\n456\n"
    assert read_block(f, 0, 8, delimiter=b"\n") == b"123\n456\n789"
    assert read_block(f, 0, 100, delimiter=b"\n") == b"123\n456\n789"
    assert read_block(f, 1, 1, delimiter=b"\n") == b""
    assert read_block(f, 1, 5, delimiter=b"\n") == b"456\n"
    assert read_block(f, 1, 8, delimiter=b"\n") == b"456\n789"

    for ols in [[(0, 3), (3, 3), (6, 3), (9, 2)], [(0, 4), (4, 4), (8, 4)]]:
        out = [read_block(f, o, l, b"\n") for o, l in ols]
        assert b"".join(filter(None, out)) == data


def test_read_block_split_before():
    """Test start/middle/end cases of split_before."""  # noqa: I
    d = (
        "#header" + "".join(">foo{i}\nFOOBAR{i}\n".format(i=i) for i in range(100000))
    ).encode()

    # Read single record at beginning.
    # All reads include beginning of file and read through termination of
    # delimited record.
    assert read_block(io.BytesIO(d), 0, 10, delimiter=b"\n") == b"#header>foo0\n"
    assert (
        read_block(io.BytesIO(d), 0, 10, delimiter=b"\n", split_before=True)
        == b"#header>foo0"
    )
    assert (
        read_block(io.BytesIO(d), 0, 10, delimiter=b">") == b"#header>foo0\nFOOBAR0\n>"
    )
    assert (
        read_block(io.BytesIO(d), 0, 10, delimiter=b">", split_before=True)
        == b"#header>foo0\nFOOBAR0\n"
    )

    # Read multiple records at beginning.
    # All reads include beginning of file and read through termination of
    # delimited record.
    assert (
        read_block(io.BytesIO(d), 0, 27, delimiter=b"\n")
        == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n"
    )
    assert (
        read_block(io.BytesIO(d), 0, 27, delimiter=b"\n", split_before=True)
        == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1"
    )
    assert (
        read_block(io.BytesIO(d), 0, 27, delimiter=b">")
        == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n>"
    )
    assert (
        read_block(io.BytesIO(d), 0, 27, delimiter=b">", split_before=True)
        == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n"
    )

    # Read with offset spanning into next record, splits on either side of delimiter.
    # Read not spanning the full record returns nothing.
    assert read_block(io.BytesIO(d), 10, 3, delimiter=b"\n") == b"FOOBAR0\n"
    assert (
        read_block(io.BytesIO(d), 10, 3, delimiter=b"\n", split_before=True)
        == b"\nFOOBAR0"
    )
    assert read_block(io.BytesIO(d), 10, 3, delimiter=b">") == b""
    assert read_block(io.BytesIO(d), 10, 3, delimiter=b">", split_before=True) == b""

    # Read with offset spanning multiple records, splits on either side of delimiter
    assert (
        read_block(io.BytesIO(d), 10, 20, delimiter=b"\n")
        == b"FOOBAR0\n>foo1\nFOOBAR1\n"
    )
    assert (
        read_block(io.BytesIO(d), 10, 20, delimiter=b"\n", split_before=True)
        == b"\nFOOBAR0\n>foo1\nFOOBAR1"
    )
    assert read_block(io.BytesIO(d), 10, 20, delimiter=b">") == b"foo1\nFOOBAR1\n>"
    assert (
        read_block(io.BytesIO(d), 10, 20, delimiter=b">", split_before=True)
        == b">foo1\nFOOBAR1\n"
    )

    # Read record at end, all records read to end
    tlen = len(d)
    assert (
        read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b"\n")
        == b">foo99999\nFOOBAR99999\n"
    )
    assert (
        read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b"\n", split_before=True)
        == b"\n>foo99999\nFOOBAR99999\n"
    )
    assert (
        read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b">")
        == b"foo99999\nFOOBAR99999\n"
    )
    assert (
        read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b">", split_before=True)
        == b">foo99999\nFOOBAR99999\n"
    )


def test_seek_delimiter_endline():
    f = io.BytesIO(b"123\n456\n789")

    # if at zero, stay at zero
    seek_delimiter(f, b"\n", 5)
    assert f.tell() == 0

    # choose the first block
    for bs in [1, 5, 100]:
        f.seek(1)
        seek_delimiter(f, b"\n", blocksize=bs)
        assert f.tell() == 4

    # handle long delimiters well, even with short blocksizes
    f = io.BytesIO(b"123abc456abc789")
    for bs in [1, 2, 3, 4, 5, 6, 10]:
        f.seek(1)
        seek_delimiter(f, b"abc", blocksize=bs)
        assert f.tell() == 6

    # End at the end
    f = io.BytesIO(b"123\n456")
    f.seek(5)
    seek_delimiter(f, b"\n", 5)
    assert f.tell() == 7


def test_infer_options():
    so = infer_storage_options("/mnt/datasets/test.csv")
    assert so.pop("protocol") == "file"
    assert so.pop("path") == "/mnt/datasets/test.csv"
    assert not so

    assert infer_storage_options("./test.csv")["path"] == "./test.csv"
    assert infer_storage_options("../test.csv")["path"] == "../test.csv"

    so = infer_storage_options("C:\\test.csv")
    assert so.pop("protocol") == "file"
    assert so.pop("path") == "C:\\test.csv"
    assert not so

    assert infer_storage_options("d:\\test.csv")["path"] == "d:\\test.csv"
    assert infer_storage_options("\\test.csv")["path"] == "\\test.csv"
    assert infer_storage_options(".\\test.csv")["path"] == ".\\test.csv"
    assert infer_storage_options("test.csv")["path"] == "test.csv"

    so = infer_storage_options(
        "hdfs://username:pwd@Node:123/mnt/datasets/test.csv?q=1#fragm",
        inherit_storage_options={"extra": "value"},
    )
    assert so.pop("protocol") == "hdfs"
    assert so.pop("username") == "username"
    assert so.pop("password") == "pwd"
    assert so.pop("host") == "Node"
    assert so.pop("port") == 123
    assert so.pop("path") == "/mnt/datasets/test.csv#fragm"
    assert so.pop("url_query") == "q=1"
    assert so.pop("url_fragment") == "fragm"
    assert so.pop("extra") == "value"
    assert not so

    so = infer_storage_options("hdfs://User-name@Node-name.com/mnt/datasets/test.csv")
    assert so.pop("username") == "User-name"
    assert so.pop("host") == "Node-name.com"

    u = "http://127.0.0.1:8080/test.csv"
    assert infer_storage_options(u) == {"protocol": "http", "path": u}

    # For s3 and gcs the netloc is actually the bucket name, so we want to
    # include it in the path. Test that:
    # - Parsing doesn't lowercase the bucket
    # - The bucket is included in path
    for protocol in ["s3", "s3a", "gcs", "gs"]:
        options = infer_storage_options(f"{protocol}://Bucket-name.com/test.csv")
        assert options["path"] == "Bucket-name.com/test.csv"

    with pytest.raises(KeyError):
        infer_storage_options("file:///bucket/file.csv", {"path": "collide"})
    with pytest.raises(KeyError):
        infer_storage_options("hdfs:///bucket/file.csv", {"protocol": "collide"})


def test_infer_simple():
    out = infer_storage_options("//mnt/datasets/test.csv")
    assert out["protocol"] == "file"
    assert out["path"] == "//mnt/datasets/test.csv"
    assert out.get("host", None) is None


@pytest.mark.parametrize(
    "urlpath, expected_path",
    (
        (r"c:\foo\bar", r"c:\foo\bar"),
        (r"C:\\foo\bar", r"C:\\foo\bar"),
        (r"c:/foo/bar", r"c:/foo/bar"),
        (r"file:///c|\foo\bar", r"c:\foo\bar"),
        (r"file:///C|/foo/bar", r"C:/foo/bar"),
        (r"file:///C:/foo/bar", r"C:/foo/bar"),
    ),
)
def test_infer_storage_options_c(urlpath, expected_path):
    so = infer_storage_options(urlpath)
    assert so["protocol"] == "file"
    assert so["path"] == expected_path


@pytest.mark.parametrize(
    "paths, out",
    (
        (["/more/dir/", "/more/dir/two", "/more/one", "/more/three"], "/more"),
        (["/", "", "/"], ""),
        (["/", "/"], "/"),
        (["/more/", "/"], ""),
        (["/more/", "/more"], "/more"),
        (["more/dir/", "more/dir/two", "more/one", "more/three"], "more"),
    ),
)
def test_common_prefix(paths, out):
    assert common_prefix(paths) == out


@pytest.mark.parametrize(
    "paths, other, exists, expected",
    (
        (["/path1"], "/path2", False, ["/path2"]),
        (["/path1"], "/path2", True, ["/path2/path1"]),
        (["/path1"], "/path2", False, ["/path2"]),
        (["/path1"], "/path2/", True, ["/path2/path1"]),
        (["/path1"], ["/path2"], False, ["/path2"]),
        (["/path1"], ["/path2"], True, ["/path2"]),
        (["/path1", "/path2"], "/path2", False, ["/path2/path1", "/path2/path2"]),
        (["/path1", "/path2"], "/path2", True, ["/path2/path1", "/path2/path2"]),
        (
            ["/more/path1", "/more/path2"],
            "/path2",
            False,
            ["/path2/path1", "/path2/path2"],
        ),
        (
            ["/more/path1", "/more/path2"],
            "/path2",
            True,
            ["/path2/more/path1", "/path2/more/path2"],
        ),
        (
            ["/more/path1", "/more/path2"],
            "/path2",
            False,
            ["/path2/path1", "/path2/path2"],
        ),
        (
            ["/more/path1", "/more/path2"],
            "/path2",
            True,
            ["/path2/more/path1", "/path2/more/path2"],
        ),
        (
            ["/more/path1", "/more/path2"],
            "/path2/",
            False,
            ["/path2/path1", "/path2/path2"],
        ),
        (
            ["/more/path1", "/more/path2"],
            "/path2/",
            True,
            ["/path2/more/path1", "/path2/more/path2"],
        ),
        (
            ["/more/path1", "/diff/path2"],
            "/path2/",
            False,
            ["/path2/more/path1", "/path2/diff/path2"],
        ),
        (
            ["/more/path1", "/diff/path2"],
            "/path2/",
            True,
            ["/path2/more/path1", "/path2/diff/path2"],
        ),
        (["a", "b/", "b/c"], "dest/", False, ["dest/a", "dest/b/", "dest/b/c"]),
        (
            ["/a", "/b/", "/b/c"],
            "dest/",
            False,
            ["dest/a", "dest/b/", "dest/b/c"],
        ),
    ),
)
def test_other_paths(paths, other, exists, expected):
    assert other_paths(paths, other, exists) == expected


def test_log():
    import logging

    logger = setup_logging(logger_name="fsspec.test")
    assert logger.level == logging.DEBUG


@pytest.mark.parametrize(
    "par",
    [
        ("afile", "file"),
        ("file://afile", "file"),
        ("noproto://afile", "noproto"),
        ("noproto::stuff", "noproto"),
        ("simplecache::stuff", "simplecache"),
        ("simplecache://stuff", "simplecache"),
        ("s3://afile", "s3"),
        (Path("afile"), "file"),
    ],
)
def test_get_protocol(par):
    url, outcome = par
    assert get_protocol(url) == outcome


@pytest.mark.parametrize(
    "par",
    [
        ("afile", True),
        ("file://afile", True),
        ("noproto://afile", False),
        ("noproto::stuff", False),
False), ("simplecache::stuff", True), ("simplecache://stuff", True), (Path("afile"), True), ], ) def test_can_local(par): url, outcome = par assert can_be_local(url) == outcome def test_mirror_from(): mock = Mock() mock.attr = 1 @mirror_from("client", ["attr", "func_1", "func_2"]) class Real: @property def client(self): return mock def func_2(self): assert False, "have to overwrite this" def func_3(self): return "should succeed" obj = Real() assert obj.attr == mock.attr obj.func_1() mock.func_1.assert_called() obj.func_2(1, 2) mock.func_2.assert_called_with(1, 2) assert obj.func_3() == "should succeed" mock.func_3.assert_not_called() @pytest.mark.parametrize("max_gap", [0, 32]) @pytest.mark.parametrize("max_block", [None, 128]) def test_merge_offset_ranges(max_gap, max_block): # Input ranges # (Using out-of-order ranges for full coverage) paths = ["foo", "bar", "bar", "bar", "foo"] starts = [0, 0, 512, 64, 32] ends = [32, 32, 1024, 256, 64] # Call merge_offset_ranges ( result_paths, result_starts, result_ends, ) = merge_offset_ranges( paths, starts, ends, max_gap=max_gap, max_block=max_block, ) # Check result if max_block is None and max_gap == 32: expect_paths = ["bar", "bar", "foo"] expect_starts = [0, 512, 0] expect_ends = [256, 1024, 64] else: expect_paths = ["bar", "bar", "bar", "foo"] expect_starts = [0, 64, 512, 0] expect_ends = [32, 256, 1024, 64] assert expect_paths == result_paths assert expect_starts == result_starts assert expect_ends == result_ends def test_size(): f = io.BytesIO(b"hello") assert fsspec.utils.file_size(f) == 5 assert f.tell() == 0 class _HasFspath: def __fspath__(self): return "foo" class _HasPathAttr: def __init__(self): self.path = "foo" @pytest.mark.parametrize( "path,expected", [ # coerce to string ("foo", "foo"), (Path("foo"), "foo"), (PurePath("foo"), "foo"), (_HasFspath(), "foo"), (_HasPathAttr(), "foo"), # passthrough (b"bytes", b"bytes"), (None, None), (1, 1), (True, True), (o := object(), o), ([], []), ((), ()), (set(), set()), ], ) def test_stringify_path(path, expected): path = fsspec.utils.stringify_path(path) assert path == expected