projektAI/venv/Lib/site-packages/pandas/tests/io/parser/test_index_col.py

225 lines
6.7 KiB
Python
Raw Normal View History

2021-06-06 22:13:05 +02:00
"""
Tests that the specified index column (a.k.a "index_col")
is properly handled or inferred during parsing for all of
the parsers defined in parsers.py
"""
from io import StringIO
import numpy as np
import pytest
from pandas import DataFrame, Index, MultiIndex
import pandas._testing as tm
@pytest.mark.parametrize("with_header", [True, False])
def test_index_col_named(all_parsers, with_header):
    """Select the index column by name, with and without a header row.

    With the header line prepended, ``index_col="ID"`` must produce the
    same frame as a plain parse followed by ``set_index("ID")``. Without
    the header line, the call is expected to raise ``ValueError``.
    """
    rows = """\
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""
    header_line = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"

    if not with_header:
        # No "ID" column name is present in the input, so the lookup fails.
        with pytest.raises(ValueError, match="Index ID invalid"):
            all_parsers.read_csv(StringIO(rows), index_col="ID")
        return

    csv_text = header_line + rows
    actual = all_parsers.read_csv(StringIO(csv_text), index_col="ID")
    baseline = all_parsers.read_csv(StringIO(csv_text), header=0).set_index("ID")
    tm.assert_frame_equal(actual, baseline)
def test_index_col_named2(all_parsers):
    """Use a column supplied through ``names`` as the index of headerless data."""
    column_names = ["a", "b", "c", "d", "message"]
    csv_text = """\
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
"""
    wanted_index = Index(["hello", "world", "foo"], name="message")
    wanted = DataFrame(
        {"a": [1, 5, 9], "b": [2, 6, 10], "c": [3, 7, 11], "d": [4, 8, 12]},
        index=wanted_index,
    )

    parsed = all_parsers.read_csv(
        StringIO(csv_text), names=column_names, index_col=["message"]
    )
    tm.assert_frame_equal(parsed, wanted)
def test_index_col_is_true(all_parsers):
    """Check that ``index_col=True`` is rejected with a ``ValueError``."""
    # see gh-9798
    csv_text = "a,b\n1,2"
    expected_error = "The value of index_col couldn't be 'True'"

    with pytest.raises(ValueError, match=expected_error):
        all_parsers.read_csv(StringIO(csv_text), index_col=True)
def test_infer_index_col(all_parsers):
    """Check the index is inferred when rows have one more field than the header.

    The header names three columns while every data row carries four
    fields; the expected frame uses the leading field of each row as the
    index and the remaining three as columns ``A``, ``B`` and ``C``.
    """
    csv_text = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    parsed = all_parsers.read_csv(StringIO(csv_text))

    wanted = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        index=["foo", "bar", "baz"],
        columns=["A", "B", "C"],
    )
    tm.assert_frame_equal(parsed, wanted)
@pytest.mark.parametrize(
    "index_col,kwargs",
    [
        (None, {"columns": ["x", "y", "z"]}),
        (False, {"columns": ["x", "y", "z"]}),
        (0, {"columns": ["y", "z"], "index": Index([], name="x")}),
        (1, {"columns": ["x", "z"], "index": Index([], name="y")}),
        ("x", {"columns": ["y", "z"], "index": Index([], name="x")}),
        ("y", {"columns": ["x", "z"], "index": Index([], name="y")}),
        (
            [0, 1],
            {
                "columns": ["z"],
                "index": MultiIndex.from_arrays([[], []], names=["x", "y"]),
            },
        ),
        (
            ["x", "y"],
            {
                "columns": ["z"],
                "index": MultiIndex.from_arrays([[], []], names=["x", "y"]),
            },
        ),
        (
            [1, 0],
            {
                "columns": ["z"],
                "index": MultiIndex.from_arrays([[], []], names=["y", "x"]),
            },
        ),
        (
            ["y", "x"],
            {
                "columns": ["z"],
                "index": MultiIndex.from_arrays([[], []], names=["y", "x"]),
            },
        ),
    ],
)
def test_index_col_empty_data(all_parsers, index_col, kwargs):
    """Parse a header-only CSV with various ``index_col`` values.

    ``kwargs`` holds the ``DataFrame`` constructor arguments describing the
    empty frame expected for the given ``index_col`` (by position, by
    name, as a list, or ``None``/``False`` for no index column).
    """
    header_only = "x,y,z"
    parsed = all_parsers.read_csv(StringIO(header_only), index_col=index_col)
    tm.assert_frame_equal(parsed, DataFrame(**kwargs))
def test_empty_with_index_col_false(all_parsers):
    """Parse a header-only CSV with ``index_col=False`` into an empty frame."""
    # see gh-10413
    header_only = "x,y"
    parsed = all_parsers.read_csv(StringIO(header_only), index_col=False)
    tm.assert_frame_equal(parsed, DataFrame(columns=["x", "y"]))
@pytest.mark.parametrize(
    "index_names",
    [
        ["", ""],
        ["foo", ""],
        ["", "bar"],
        ["foo", "bar"],
        ["NotReallyUnnamed", "Unnamed: 0"],
    ],
)
def test_multi_index_naming(all_parsers, index_names):
    """Check level names of a two-column ``MultiIndex`` read from the header.

    The first two header cells come from ``index_names``; an empty cell is
    expected to become a ``None`` level name, and non-empty cells are
    expected to be kept verbatim.
    """
    # We don't want empty index names being replaced with "Unnamed: 0"
    remainder = "col\na,c,1\na,d,2\nb,c,3\nb,d,4"
    csv_text = ",".join([*index_names, remainder])
    parsed = all_parsers.read_csv(StringIO(csv_text), index_col=[0, 1])

    wanted_index = MultiIndex.from_product(
        [["a", "b"], ["c", "d"]],
        names=[label or None for label in index_names],
    )
    wanted = DataFrame({"col": [1, 2, 3, 4]}, index=wanted_index)
    tm.assert_frame_equal(parsed, wanted)
def test_multi_index_naming_not_all_at_beginning(all_parsers):
    """Build a ``MultiIndex`` from non-adjacent, unnamed columns 0 and 2.

    The header leaves columns 0 and 2 blank while column 1 is literally
    named ``"Unnamed: 2"``; the expected index has no level names and the
    remaining data column keeps its ``"Unnamed: 2"`` label.
    """
    csv_text = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4"
    parsed = all_parsers.read_csv(StringIO(csv_text), index_col=[0, 2])

    wanted_index = MultiIndex(
        levels=[["a", "b"], [1, 2, 3, 4]],
        codes=[[0, 0, 1, 1], [0, 1, 2, 3]],
    )
    wanted = DataFrame({"Unnamed: 2": ["c", "d", "c", "d"]}, index=wanted_index)
    tm.assert_frame_equal(parsed, wanted)
def test_no_multi_index_level_names_empty(all_parsers):
    """Round-trip a frame with an unnamed three-level ``MultiIndex`` via CSV.

    The frame is written with ``to_csv`` and read back with
    ``index_col=[0, 1, 2]``; the parsed result must equal the original
    frame, whose index levels carry no names.
    """
    # GH 10984
    parser = all_parsers
    midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)])
    # Seeded, local generator: same kind of data as np.random.randn, but a
    # failure can be reproduced and the global random state is left untouched.
    values = np.random.RandomState(0).randn(3, 3)
    expected = DataFrame(values, index=midx, columns=["x", "y", "z"])
    with tm.ensure_clean() as path:
        expected.to_csv(path)
        result = parser.read_csv(path, index_col=[0, 1, 2])
    tm.assert_frame_equal(result, expected)
def test_header_with_index_col(all_parsers):
    """Combine ``index_col`` with a two-row header and with a single-row header.

    The same CSV text is parsed twice: once with ``header=[0, 1]`` and a
    positional index column, once with ``header=0`` and the index column
    selected by name.
    """
    # GH 33476
    data = """
I11,A,A
I12,B,B
I2,1,3
"""

    # Two header rows: columns form a MultiIndex whose level names come
    # from the index column's header cells.
    parsed_multi = all_parsers.read_csv(StringIO(data), index_col=0, header=[0, 1])
    wanted_columns = MultiIndex.from_tuples(
        [("A", "B"), ("A", "B.1")], names=["I11", "I12"]
    )
    wanted_multi = DataFrame([[1, 3]], index=Index(["I2"]), columns=wanted_columns)
    tm.assert_frame_equal(parsed_multi, wanted_multi)

    # Single header row: the "I12" line is ordinary data, so values stay strings.
    parsed_single = all_parsers.read_csv(StringIO(data), index_col="I11", header=0)
    wanted_single = DataFrame(
        [["B", "B"], ["1", "3"]],
        index=Index(["I12", "I2"], name="I11"),
        columns=Index(["A", "A.1"]),
    )
    tm.assert_frame_equal(parsed_single, wanted_single)
@pytest.mark.slow
def test_index_col_large_csv(all_parsers):
    """Round-trip a CSV of just over one million rows using ``index_col=[0]``.

    The frame is written without its index and read back with the first
    column as the index; the result must equal ``df.set_index("a")``.
    """
    # https://github.com/pandas-dev/pandas/issues/37094
    parser = all_parsers
    # NOTE(review): the row count is presumably chosen to exceed an internal
    # parser threshold -- confirm against the linked issue before changing it.
    N = 1_000_001
    # Seeded, local generator so a failure can be reproduced and the global
    # random state is not consumed by this test.
    noise = np.random.RandomState(0).randn(N)
    df = DataFrame({"a": range(N), "b": noise})

    with tm.ensure_clean() as path:
        df.to_csv(path, index=False)
        result = parser.read_csv(path, index_col=[0])

    tm.assert_frame_equal(result, df.set_index("a"))