# Inzynierka/Lib/site-packages/pandas/tests/frame/methods/test_to_dict.py
# 2023-06-02 12:51:02 +02:00
#
# 497 lines
# 17 KiB
# Python

from collections import (
OrderedDict,
defaultdict,
)
from datetime import datetime
import numpy as np
import pytest
import pytz
from pandas import (
NA,
DataFrame,
Index,
MultiIndex,
Series,
Timestamp,
)
import pandas._testing as tm
class TestDataFrameToDict:
    """Tests for ``DataFrame.to_dict``: orients, ``into`` mappings and value types."""

    def test_to_dict_timestamp(self):
        """Datetime columns yield ``Timestamp`` values for records/series/split."""
        # GH#11247
        # split/records producing np.datetime64 rather than Timestamps
        # on datetime64[ns] dtypes only
        tsmp = Timestamp("20130101")
        test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]})
        test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]})

        expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}]
        expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}]

        assert test_data.to_dict(orient="records") == expected_records
        assert test_data_mixed.to_dict(orient="records") == expected_records_mixed

        expected_series = {
            "A": Series([tsmp, tsmp], name="A"),
            "B": Series([tsmp, tsmp], name="B"),
        }
        expected_series_mixed = {
            "A": Series([tsmp, tsmp], name="A"),
            "B": Series([1, 2], name="B"),
        }

        tm.assert_dict_equal(test_data.to_dict(orient="series"), expected_series)
        tm.assert_dict_equal(
            test_data_mixed.to_dict(orient="series"), expected_series_mixed
        )

        expected_split = {
            "index": [0, 1],
            "data": [[tsmp, tsmp], [tsmp, tsmp]],
            "columns": ["A", "B"],
        }
        expected_split_mixed = {
            "index": [0, 1],
            "data": [[tsmp, 1], [tsmp, 2]],
            "columns": ["A", "B"],
        }

        tm.assert_dict_equal(test_data.to_dict(orient="split"), expected_split)
        tm.assert_dict_equal(
            test_data_mixed.to_dict(orient="split"), expected_split_mixed
        )

    def test_to_dict_index_not_unique_with_index_orient(self):
        """``orient="index"`` raises ``ValueError`` when index labels repeat."""
        # GH#22801
        # Data loss when indexes are not unique. Raise ValueError.
        df = DataFrame({"a": [1, 2], "b": [0.5, 0.75]}, index=["A", "A"])
        msg = "DataFrame index must be unique for orient='index'"
        with pytest.raises(ValueError, match=msg):
            df.to_dict(orient="index")

    def test_to_dict_invalid_orient(self):
        """An unknown ``orient`` string raises ``ValueError``."""
        df = DataFrame({"A": [0, 1]})
        msg = "orient 'xinvalid' not understood"
        with pytest.raises(ValueError, match=msg):
            df.to_dict(orient="xinvalid")

    @pytest.mark.parametrize("orient", ["d", "l", "r", "sp", "s", "i"])
    def test_to_dict_short_orient_raises(self, orient):
        """Abbreviated ``orient`` names raise ``ValueError``."""
        # GH#32515
        df = DataFrame({"A": [0, 1]})
        with pytest.raises(ValueError, match="not understood"):
            df.to_dict(orient=orient)

    @pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict])
    def test_to_dict(self, mapping):
        """Round-trip a ragged dict-of-dicts through each orient and ``into`` type.

        ``into`` is always passed by keyword: pandas deprecated passing any
        ``to_dict`` argument other than ``orient`` positionally (they become
        keyword-only in 3.0), so positional use would warn and later fail.
        """
        # orient= should only take the listed options
        # see GH#32515
        test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}}

        # GH#16122
        recons_data = DataFrame(test_data).to_dict(into=mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k][k2]

        recons_data = DataFrame(test_data).to_dict("list", into=mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                # index labels are "1".."3", so label N sits at list position N-1
                assert v2 == recons_data[k][int(k2) - 1]

        recons_data = DataFrame(test_data).to_dict("series", into=mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k][k2]

        recons_data = DataFrame(test_data).to_dict("split", into=mapping)
        expected_split = {
            "columns": ["A", "B"],
            "index": ["1", "2", "3"],
            "data": [[1.0, "1"], [2.0, "2"], [np.nan, "3"]],
        }
        tm.assert_dict_equal(recons_data, expected_split)

        recons_data = DataFrame(test_data).to_dict("records", into=mapping)
        expected_records = [
            {"A": 1.0, "B": "1"},
            {"A": 2.0, "B": "2"},
            {"A": np.nan, "B": "3"},
        ]
        assert isinstance(recons_data, list)
        assert len(recons_data) == 3
        for left, right in zip(recons_data, expected_records):
            tm.assert_dict_equal(left, right)

        # GH#10844
        recons_data = DataFrame(test_data).to_dict("index")

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k2][k]

        # a column holding a copy of the first column must round-trip as well
        df = DataFrame(test_data)
        df["duped"] = df[df.columns[0]]
        recons_data = df.to_dict("index")
        comp_data = test_data.copy()
        comp_data["duped"] = comp_data[df.columns[0]]
        for k, v in comp_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k2][k]

    @pytest.mark.parametrize("mapping", [list, defaultdict, []])
    def test_to_dict_errors(self, mapping):
        """Unsupported ``into`` values raise ``TypeError``."""
        # GH#16122
        df = DataFrame(np.random.randn(3, 3))
        # either message is acceptable, depending on which ``into`` is rejected
        msg = "|".join(
            [
                "unsupported type: <class 'list'>",
                r"to_dict\(\) only accepts initialized defaultdicts",
            ]
        )
        with pytest.raises(TypeError, match=msg):
            df.to_dict(into=mapping)

    def test_to_dict_not_unique_warning(self):
        """Duplicate column names make ``to_dict`` emit a ``UserWarning``."""
        # GH#16927: When converting to a dict, if a column has a non-unique name
        # it will be dropped, throwing a warning.
        df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"])
        with tm.assert_produces_warning(UserWarning):
            df.to_dict()

    # orient - orient argument to to_dict function
    # item_getter - function for extracting value from
    # the resulting dict using column name and index
    @pytest.mark.parametrize(
        "orient,item_getter",
        [
            ("dict", lambda d, col, idx: d[col][idx]),
            ("records", lambda d, col, idx: d[idx][col]),
            ("list", lambda d, col, idx: d[col][idx]),
            ("split", lambda d, col, idx: d["data"][idx][d["columns"].index(col)]),
            ("index", lambda d, col, idx: d[idx][col]),
        ],
    )
    def test_to_dict_box_scalars(self, orient, item_getter):
        """Scalars extracted from the result are Python ``int`` / ``float``."""
        # GH#14216, GH#23753
        # make sure that we are boxing properly
        df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]})
        result = df.to_dict(orient=orient)
        assert isinstance(item_getter(result, "a", 0), int)
        assert isinstance(item_getter(result, "b", 0), float)

    def test_to_dict_tz(self):
        """Tz-aware datetimes survive ``orient="records"`` as tz-aware Timestamps."""
        # GH#18372 When converting to dict with orient='records' columns of
        # datetime that are tz-aware were not converted to required arrays
        data = [
            (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),),
            (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc),),
        ]
        df = DataFrame(list(data), columns=["d"])

        result = df.to_dict(orient="records")
        expected = [
            {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)},
            {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)},
        ]
        tm.assert_dict_equal(result[0], expected[0])
        tm.assert_dict_equal(result[1], expected[1])

    @pytest.mark.parametrize(
        "into, expected",
        [
            (
                dict,
                {
                    0: {"int_col": 1, "float_col": 1.0},
                    1: {"int_col": 2, "float_col": 2.0},
                    2: {"int_col": 3, "float_col": 3.0},
                },
            ),
            (
                OrderedDict,
                OrderedDict(
                    [
                        (0, {"int_col": 1, "float_col": 1.0}),
                        (1, {"int_col": 2, "float_col": 2.0}),
                        (2, {"int_col": 3, "float_col": 3.0}),
                    ]
                ),
            ),
            (
                defaultdict(dict),
                defaultdict(
                    dict,
                    {
                        0: {"int_col": 1, "float_col": 1.0},
                        1: {"int_col": 2, "float_col": 2.0},
                        2: {"int_col": 3, "float_col": 3.0},
                    },
                ),
            ),
        ],
    )
    def test_to_dict_index_dtypes(self, into, expected):
        """``orient="index"`` keeps int and float columns distinct on round-trip."""
        # GH#18580
        # When using to_dict(orient='index') on a dataframe with int
        # and float columns only the int columns were cast to float
        df = DataFrame({"int_col": [1, 2, 3], "float_col": [1.0, 2.0, 3.0]})

        result = df.to_dict(orient="index", into=into)
        cols = ["int_col", "float_col"]
        # rebuild frames from both mappings so the comparison also covers dtypes
        result = DataFrame.from_dict(result, orient="index")[cols]
        expected = DataFrame.from_dict(expected, orient="index")[cols]
        tm.assert_frame_equal(result, expected)

    def test_to_dict_numeric_names(self):
        """Digit-string column names appear unchanged as record keys."""
        # GH#24940
        df = DataFrame({str(i): [i] for i in range(5)})
        result = set(df.to_dict("records")[0].keys())
        expected = set(df.columns)
        assert result == expected

    def test_to_dict_wide(self):
        """A 256-column frame converts to a complete record dict."""
        # GH#24939
        df = DataFrame({(f"A_{i:d}"): [i] for i in range(256)})
        result = df.to_dict("records")[0]
        expected = {f"A_{i:d}": i for i in range(256)}
        assert result == expected

    @pytest.mark.parametrize(
        "data,dtype",
        (
            ([True, True, False], bool),
            [
                [
                    datetime(2018, 1, 1),
                    datetime(2019, 2, 2),
                    datetime(2020, 3, 3),
                ],
                Timestamp,
            ],
            [[1.0, 2.0, 3.0], float],
            [[1, 2, 3], int],
            [["X", "Y", "Z"], str],
        ),
    )
    def test_to_dict_orient_dtype(self, data, dtype):
        """Every record value has exactly the expected Python/pandas type."""
        # GH22620 & GH21256

        df = DataFrame({"a": data})
        d = df.to_dict(orient="records")
        assert all(type(record["a"]) is dtype for record in d)

    @pytest.mark.parametrize(
        "data,expected_dtype",
        (
            [np.uint64(2), int],
            [np.int64(-9), int],
            [np.float64(1.1), float],
            [np.bool_(True), bool],
            [np.datetime64("2005-02-25"), Timestamp],
        ),
    )
    def test_to_dict_scalar_constructor_orient_dtype(self, data, expected_dtype):
        """A frame built from one NumPy scalar returns the expected native type."""
        # GH22620 & GH21256

        df = DataFrame({"a": data}, index=[0])
        d = df.to_dict(orient="records")
        result = type(d[0]["a"])
        assert result is expected_dtype

    def test_to_dict_mixed_numeric_frame(self):
        """After ``reset_index`` the int index and float columns compare equal."""
        # GH 12859
        df = DataFrame({"a": [1.0], "b": [9.0]})
        result = df.reset_index().to_dict("records")
        expected = [{"index": 0, "a": 1.0, "b": 9.0}]
        assert result == expected

    @pytest.mark.parametrize(
        "index",
        [
            None,
            Index(["aa", "bb"]),
            Index(["aa", "bb"], name="cc"),
            MultiIndex.from_tuples([("a", "b"), ("a", "c")]),
            MultiIndex.from_tuples([("a", "b"), ("a", "c")], names=["n1", "n2"]),
        ],
    )
    @pytest.mark.parametrize(
        "columns",
        [
            ["x", "y"],
            Index(["x", "y"]),
            Index(["x", "y"], name="z"),
            MultiIndex.from_tuples([("x", 1), ("y", 2)]),
            MultiIndex.from_tuples([("x", 1), ("y", 2)], names=["z1", "z2"]),
        ],
    )
    def test_to_dict_orient_tight(self, index, columns):
        """``orient="tight"`` round-trips through ``DataFrame.from_dict``."""
        df = DataFrame.from_records(
            [[1, 3], [2, 4]],
            columns=columns,
            index=index,
        )
        roundtrip = DataFrame.from_dict(df.to_dict(orient="tight"), orient="tight")

        tm.assert_frame_equal(df, roundtrip)

    @pytest.mark.parametrize(
        "orient",
        ["dict", "list", "split", "records", "index", "tight"],
    )
    @pytest.mark.parametrize(
        "data,expected_types",
        (
            (
                {
                    "a": [np.int64(1), 1, np.int64(3)],
                    "b": [np.float64(1.0), 2.0, np.float64(3.0)],
                    "c": [np.float64(1.0), 2, np.int64(3)],
                    "d": [np.float64(1.0), "a", np.int64(3)],
                    "e": [np.float64(1.0), ["a"], np.int64(3)],
                    "f": [np.float64(1.0), ("a",), np.int64(3)],
                },
                {
                    "a": [int, int, int],
                    "b": [float, float, float],
                    "c": [float, float, float],
                    "d": [float, str, int],
                    "e": [float, list, int],
                    "f": [float, tuple, int],
                },
            ),
            (
                {
                    "a": [1, 2, 3],
                    "b": [1.1, 2.2, 3.3],
                },
                {
                    "a": [int, int, int],
                    "b": [float, float, float],
                },
            ),
            (  # Make sure we have one df which is all object type cols
                {
                    "a": [1, "hello", 3],
                    "b": [1.1, "world", 3.3],
                },
                {
                    "a": [int, str, int],
                    "b": [float, str, float],
                },
            ),
        ),
    )
    def test_to_dict_returns_native_types(self, orient, data, expected_types):
        """Each value equals its input and has the expected native type.

        The result layout differs per orient, so each branch builds an
        iterator of ``(row, column, value)`` triples that a single loop checks.
        """
        # GH 46751
        # Tests we get back native types for all orient types
        df = DataFrame(data)
        result = df.to_dict(orient)
        if orient == "dict":
            assertion_iterator = (
                (i, key, value)
                for key, index_value_map in result.items()
                for i, value in index_value_map.items()
            )
        elif orient == "list":
            assertion_iterator = (
                (i, key, value)
                for key, values in result.items()
                for i, value in enumerate(values)
            )
        elif orient in {"split", "tight"}:
            assertion_iterator = (
                (i, key, result["data"][i][j])
                for i in result["index"]
                for j, key in enumerate(result["columns"])
            )
        elif orient == "records":
            assertion_iterator = (
                (i, key, value)
                for i, record in enumerate(result)
                for key, value in record.items()
            )
        elif orient == "index":
            assertion_iterator = (
                (i, key, value)
                for i, record in result.items()
                for key, value in record.items()
            )

        for i, key, value in assertion_iterator:
            assert value == data[key][i]
            assert type(value) is expected_types[key][i]

    @pytest.mark.parametrize("orient", ["dict", "list", "series", "records", "index"])
    def test_to_dict_index_false_error(self, orient):
        """``index=False`` raises ``ValueError`` for orients other than split/tight."""
        # GH#46398
        df = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=["row1", "row2"])
        msg = "'index=False' is only valid when 'orient' is 'split' or 'tight'"
        with pytest.raises(ValueError, match=msg):
            df.to_dict(orient=orient, index=False)

    @pytest.mark.parametrize(
        "orient, expected",
        [
            ("split", {"columns": ["col1", "col2"], "data": [[1, 3], [2, 4]]}),
            (
                "tight",
                {
                    "columns": ["col1", "col2"],
                    "data": [[1, 3], [2, 4]],
                    "column_names": [None],
                },
            ),
        ],
    )
    def test_to_dict_index_false(self, orient, expected):
        """``index=False`` omits the index entries for split/tight orients."""
        # GH#46398
        df = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=["row1", "row2"])
        result = df.to_dict(orient=orient, index=False)
        tm.assert_dict_equal(result, expected)

    @pytest.mark.parametrize(
        "orient, expected",
        [
            ("dict", {"a": {0: 1, 1: None}}),
            ("list", {"a": [1, None]}),
            ("split", {"index": [0, 1], "columns": ["a"], "data": [[1], [None]]}),
            (
                "tight",
                {
                    "index": [0, 1],
                    "columns": ["a"],
                    "data": [[1], [None]],
                    "index_names": [None],
                    "column_names": [None],
                },
            ),
            ("records", [{"a": 1}, {"a": None}]),
            ("index", {0: {"a": 1}, 1: {"a": None}}),
        ],
    )
    def test_to_dict_na_to_none(self, orient, expected):
        """``NA`` in an ``Int64`` column is returned as ``None`` for every orient."""
        # GH#50795
        df = DataFrame({"a": [1, NA]}, dtype="Int64")
        result = df.to_dict(orient=orient)
        assert result == expected

    def test_to_dict_masked_native_python(self):
        """``Int64`` values come back as Python ``int``, with or without an NA."""
        # GH#34665
        df = DataFrame({"a": Series([1, 2], dtype="Int64"), "B": 1})
        result = df.to_dict(orient="records")
        assert type(result[0]["a"]) is int

        # same check when another row of the column holds NA
        df = DataFrame({"a": Series([1, NA], dtype="Int64"), "B": 1})
        result = df.to_dict(orient="records")
        assert type(result[0]["a"]) is int