Inzynierka/Lib/site-packages/pandas/tests/io/parser/common/test_index.py

"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from datetime import datetime
from io import StringIO
import os

import pytest

from pandas import (
    DataFrame,
    Index,
    MultiIndex,
)
import pandas._testing as tm

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")

# GH#43650: Some expected failures with the pyarrow engine can occasionally
# cause a deadlock instead, so we skip these instead of xfailing
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        (
            """foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
""",
            {"index_col": 0, "names": ["index", "A", "B", "C", "D"]},
            DataFrame(
                [
                    [2, 3, 4, 5],
                    [7, 8, 9, 10],
                    [12, 13, 14, 15],
                    [12, 13, 14, 15],
                    [12, 13, 14, 15],
                    [12, 13, 14, 15],
                ],
                index=Index(["foo", "bar", "baz", "qux", "foo2", "bar2"], name="index"),
                columns=["A", "B", "C", "D"],
            ),
        ),
        (
            """foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
""",
            {"index_col": [0, 1], "names": ["index1", "index2", "A", "B", "C", "D"]},
            DataFrame(
                [
                    [2, 3, 4, 5],
                    [7, 8, 9, 10],
                    [12, 13, 14, 15],
                    [12, 13, 14, 15],
                    [12, 13, 14, 15],
                ],
                index=MultiIndex.from_tuples(
                    [
                        ("foo", "one"),
                        ("foo", "two"),
                        ("foo", "three"),
                        ("bar", "one"),
                        ("bar", "two"),
                    ],
                    names=["index1", "index2"],
                ),
                columns=["A", "B", "C", "D"],
            ),
        ),
    ],
)
def test_pass_names_with_index(all_parsers, data, kwargs, expected):
    parser = all_parsers
    result = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
def test_multi_index_no_level_names(all_parsers, index_col):
    data = """index1,index2,A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""
    headless_data = "\n".join(data.split("\n")[1:])

    names = ["A", "B", "C", "D"]
    parser = all_parsers

    result = parser.read_csv(
        StringIO(headless_data), index_col=index_col, header=None, names=names
    )
    expected = parser.read_csv(StringIO(data), index_col=index_col)

    # No index names in headless data.
    expected.index.names = [None] * 2
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_multi_index_no_level_names_implicit(all_parsers):
    parser = all_parsers
    data = """A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""

    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        [
            [2, 3, 4, 5],
            [7, 8, 9, 10],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
        ],
        columns=["A", "B", "C", "D"],
        index=MultiIndex.from_tuples(
            [
                ("foo", "one"),
                ("foo", "two"),
                ("foo", "three"),
                ("bar", "one"),
                ("bar", "two"),
            ]
        ),
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize(
    "data,expected,header",
    [
        ("a,b", DataFrame(columns=["a", "b"]), [0]),
        (
            "a,b\nc,d",
            DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])),
            [0, 1],
        ),
    ],
)
@pytest.mark.parametrize("round_trip", [True, False])
def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip):
    # see gh-14545
    parser = all_parsers
    data = expected.to_csv(index=False) if round_trip else data

    result = parser.read_csv(StringIO(data), header=header)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_no_unnamed_index(all_parsers):
    parser = all_parsers
    data = """ id c0 c1 c2
0 1 0 a b
1 2 0 c d
2 2 2 e f
"""
    result = parser.read_csv(StringIO(data), sep=" ")
    expected = DataFrame(
        [[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], [2, 2, 2, "e", "f"]],
        columns=["Unnamed: 0", "id", "c0", "c1", "c2"],
    )
    tm.assert_frame_equal(result, expected)


def test_read_duplicate_index_explicit(all_parsers):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo,12,13,14,15
bar,12,13,14,15
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=0)

    expected = DataFrame(
        [
            [2, 3, 4, 5],
            [7, 8, 9, 10],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
        ],
        columns=["A", "B", "C", "D"],
        index=Index(["foo", "bar", "baz", "qux", "foo", "bar"], name="index"),
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_read_duplicate_index_implicit(all_parsers):
    data = """A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo,12,13,14,15
bar,12,13,14,15
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data))

    expected = DataFrame(
        [
            [2, 3, 4, 5],
            [7, 8, 9, 10],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
        ],
        columns=["A", "B", "C", "D"],
        index=Index(["foo", "bar", "baz", "qux", "foo", "bar"]),
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_read_csv_no_index_name(all_parsers, csv_dir_path):
    parser = all_parsers
    csv2 = os.path.join(csv_dir_path, "test2.csv")
    result = parser.read_csv(csv2, index_col=0, parse_dates=True)

    expected = DataFrame(
        [
            [0.980269, 3.685731, -0.364216805298, -1.159738, "foo"],
            [1.047916, -0.041232, -0.16181208307, 0.212549, "bar"],
            [0.498581, 0.731168, -0.537677223318, 1.346270, "baz"],
            [1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"],
            [-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"],
        ],
        columns=["A", "B", "C", "D", "E"],
        index=Index(
            [
                datetime(2000, 1, 3),
                datetime(2000, 1, 4),
                datetime(2000, 1, 5),
                datetime(2000, 1, 6),
                datetime(2000, 1, 7),
            ]
        ),
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_empty_with_index(all_parsers):
    # see gh-10184
    data = "x,y"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=0)

    expected = DataFrame(columns=["y"], index=Index([], name="x"))
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_empty_with_multi_index(all_parsers):
    # see gh-10467
    data = "x,y,z"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=["x", "y"])

    expected = DataFrame(
        columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"])
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_empty_with_reversed_multi_index(all_parsers):
    data = "x,y,z"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=[1, 0])

    expected = DataFrame(
        columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"])
    )
    tm.assert_frame_equal(result, expected)
first commit 2023-06-02 12:51:02 +02:00			`"""`
			`Tests that work on both the Python and C engines but do not have a`
			`specific classification into the other test modules.`
			`"""`
			`from datetime import datetime`
			`from io import StringIO`
			`import os`

			`import pytest`

			`from pandas import (`
			`DataFrame,`
			`Index,`
			`MultiIndex,`
			`)`
			`import pandas._testing as tm`

			`xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")`

			`# GH#43650: Some expected failures with the pyarrow engine can occasionally`
			`# cause a deadlock instead, so we skip these instead of xfailing`
			`skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")`


			`@pytest.mark.parametrize(`
			`"data,kwargs,expected",`
			`[`
			`(`
			`"""foo,2,3,4,5`
			`bar,7,8,9,10`
			`baz,12,13,14,15`
			`qux,12,13,14,15`
			`foo2,12,13,14,15`
			`bar2,12,13,14,15`
			`""",`
			`{"index_col": 0, "names": ["index", "A", "B", "C", "D"]},`
			`DataFrame(`
			`[`
			`[2, 3, 4, 5],`
			`[7, 8, 9, 10],`
			`[12, 13, 14, 15],`
			`[12, 13, 14, 15],`
			`[12, 13, 14, 15],`
			`[12, 13, 14, 15],`
			`],`
			`index=Index(["foo", "bar", "baz", "qux", "foo2", "bar2"], name="index"),`
			`columns=["A", "B", "C", "D"],`
			`),`
			`),`
			`(`
			`"""foo,one,2,3,4,5`
			`foo,two,7,8,9,10`
			`foo,three,12,13,14,15`
			`bar,one,12,13,14,15`
			`bar,two,12,13,14,15`
			`""",`
			`{"index_col": [0, 1], "names": ["index1", "index2", "A", "B", "C", "D"]},`
			`DataFrame(`
			`[`
			`[2, 3, 4, 5],`
			`[7, 8, 9, 10],`
			`[12, 13, 14, 15],`
			`[12, 13, 14, 15],`
			`[12, 13, 14, 15],`
			`],`
			`index=MultiIndex.from_tuples(`
			`[`
			`("foo", "one"),`
			`("foo", "two"),`
			`("foo", "three"),`
			`("bar", "one"),`
			`("bar", "two"),`
			`],`
			`names=["index1", "index2"],`
			`),`
			`columns=["A", "B", "C", "D"],`
			`),`
			`),`
			`],`
			`)`
			`def test_pass_names_with_index(all_parsers, data, kwargs, expected):`
			`parser = all_parsers`
			`result = parser.read_csv(StringIO(data), **kwargs)`
			`tm.assert_frame_equal(result, expected)`


			`@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])`
			`def test_multi_index_no_level_names(all_parsers, index_col):`
			`data = """index1,index2,A,B,C,D`
			`foo,one,2,3,4,5`
			`foo,two,7,8,9,10`
			`foo,three,12,13,14,15`
			`bar,one,12,13,14,15`
			`bar,two,12,13,14,15`
			`"""`
			`headless_data = "\n".join(data.split("\n")[1:])`

			`names = ["A", "B", "C", "D"]`
			`parser = all_parsers`

			`result = parser.read_csv(`
			`StringIO(headless_data), index_col=index_col, header=None, names=names`
			`)`
			`expected = parser.read_csv(StringIO(data), index_col=index_col)`

			`# No index names in headless data.`
			`expected.index.names = [None] * 2`
			`tm.assert_frame_equal(result, expected)`


			`@xfail_pyarrow`
			`def test_multi_index_no_level_names_implicit(all_parsers):`
			`parser = all_parsers`
			`data = """A,B,C,D`
			`foo,one,2,3,4,5`
			`foo,two,7,8,9,10`
			`foo,three,12,13,14,15`
			`bar,one,12,13,14,15`
			`bar,two,12,13,14,15`
			`"""`

			`result = parser.read_csv(StringIO(data))`
			`expected = DataFrame(`
			`[`
			`[2, 3, 4, 5],`
			`[7, 8, 9, 10],`
			`[12, 13, 14, 15],`
			`[12, 13, 14, 15],`
			`[12, 13, 14, 15],`
			`],`
			`columns=["A", "B", "C", "D"],`
			`index=MultiIndex.from_tuples(`
			`[`
			`("foo", "one"),`
			`("foo", "two"),`
			`("foo", "three"),`
			`("bar", "one"),`
			`("bar", "two"),`
			`]`
			`),`
			`)`
			`tm.assert_frame_equal(result, expected)`


			`@xfail_pyarrow`
			`@pytest.mark.parametrize(`
			`"data,expected,header",`
			`[`
			`("a,b", DataFrame(columns=["a", "b"]), [0]),`
			`(`
			`"a,b\nc,d",`
			`DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])),`
			`[0, 1],`
			`),`
			`],`
			`)`
			`@pytest.mark.parametrize("round_trip", [True, False])`
			`def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip):`
			`# see gh-14545`
			`parser = all_parsers`
			`data = expected.to_csv(index=False) if round_trip else data`

			`result = parser.read_csv(StringIO(data), header=header)`
			`tm.assert_frame_equal(result, expected)`


			`@xfail_pyarrow`
			`def test_no_unnamed_index(all_parsers):`
			`parser = all_parsers`
			`data = """ id c0 c1 c2`
			`0 1 0 a b`
			`1 2 0 c d`
			`2 2 2 e f`
			`"""`
			`result = parser.read_csv(StringIO(data), sep=" ")`
			`expected = DataFrame(`
			`[[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], [2, 2, 2, "e", "f"]],`
			`columns=["Unnamed: 0", "id", "c0", "c1", "c2"],`
			`)`
			`tm.assert_frame_equal(result, expected)`


			`def test_read_duplicate_index_explicit(all_parsers):`
			`data = """index,A,B,C,D`
			`foo,2,3,4,5`
			`bar,7,8,9,10`
			`baz,12,13,14,15`
			`qux,12,13,14,15`
			`foo,12,13,14,15`
			`bar,12,13,14,15`
			`"""`
			`parser = all_parsers`
			`result = parser.read_csv(StringIO(data), index_col=0)`

			`expected = DataFrame(`
			`[`
			`[2, 3, 4, 5],`
			`[7, 8, 9, 10],`
			`[12, 13, 14, 15],`
			`[12, 13, 14, 15],`
			`[12, 13, 14, 15],`
			`[12, 13, 14, 15],`
			`],`
			`columns=["A", "B", "C", "D"],`
			`index=Index(["foo", "bar", "baz", "qux", "foo", "bar"], name="index"),`
			`)`
			`tm.assert_frame_equal(result, expected)`


			`@xfail_pyarrow`
			`def test_read_duplicate_index_implicit(all_parsers):`
			`data = """A,B,C,D`
			`foo,2,3,4,5`
			`bar,7,8,9,10`
			`baz,12,13,14,15`
			`qux,12,13,14,15`
			`foo,12,13,14,15`
			`bar,12,13,14,15`
			`"""`
			`parser = all_parsers`
			`result = parser.read_csv(StringIO(data))`

			`expected = DataFrame(`
			`[`
			`[2, 3, 4, 5],`
			`[7, 8, 9, 10],`
			`[12, 13, 14, 15],`
			`[12, 13, 14, 15],`
			`[12, 13, 14, 15],`
			`[12, 13, 14, 15],`
			`],`
			`columns=["A", "B", "C", "D"],`
			`index=Index(["foo", "bar", "baz", "qux", "foo", "bar"]),`
			`)`
			`tm.assert_frame_equal(result, expected)`


			`@xfail_pyarrow`
			`def test_read_csv_no_index_name(all_parsers, csv_dir_path):`
			`parser = all_parsers`
			`csv2 = os.path.join(csv_dir_path, "test2.csv")`
			`result = parser.read_csv(csv2, index_col=0, parse_dates=True)`

			`expected = DataFrame(`
			`[`
			`[0.980269, 3.685731, -0.364216805298, -1.159738, "foo"],`
			`[1.047916, -0.041232, -0.16181208307, 0.212549, "bar"],`
			`[0.498581, 0.731168, -0.537677223318, 1.346270, "baz"],`
			`[1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"],`
			`[-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"],`
			`],`
			`columns=["A", "B", "C", "D", "E"],`
			`index=Index(`
			`[`
			`datetime(2000, 1, 3),`
			`datetime(2000, 1, 4),`
			`datetime(2000, 1, 5),`
			`datetime(2000, 1, 6),`
			`datetime(2000, 1, 7),`
			`]`
			`),`
			`)`
			`tm.assert_frame_equal(result, expected)`


			`@xfail_pyarrow`
			`def test_empty_with_index(all_parsers):`
			`# see gh-10184`
			`data = "x,y"`
			`parser = all_parsers`
			`result = parser.read_csv(StringIO(data), index_col=0)`

			`expected = DataFrame(columns=["y"], index=Index([], name="x"))`
			`tm.assert_frame_equal(result, expected)`


			`@skip_pyarrow`
			`def test_empty_with_multi_index(all_parsers):`
			`# see gh-10467`
			`data = "x,y,z"`
			`parser = all_parsers`
			`result = parser.read_csv(StringIO(data), index_col=["x", "y"])`

			`expected = DataFrame(`
			`columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"])`
			`)`
			`tm.assert_frame_equal(result, expected)`


			`@skip_pyarrow`
			`def test_empty_with_reversed_multi_index(all_parsers):`
			`data = "x,y,z"`
			`parser = all_parsers`
			`result = parser.read_csv(StringIO(data), index_col=[1, 0])`

			`expected = DataFrame(`
			`columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"])`
			`)`
			`tm.assert_frame_equal(result, expected)`