497 lines
17 KiB
Python
497 lines
17 KiB
Python
from collections import (
|
|
OrderedDict,
|
|
defaultdict,
|
|
)
|
|
from datetime import datetime
|
|
|
|
import numpy as np
|
|
import pytest
|
|
import pytz
|
|
|
|
from pandas import (
|
|
NA,
|
|
DataFrame,
|
|
Index,
|
|
MultiIndex,
|
|
Series,
|
|
Timestamp,
|
|
)
|
|
import pandas._testing as tm
|
|
|
|
|
|
class TestDataFrameToDict:
|
|
def test_to_dict_timestamp(self):
|
|
# GH#11247
|
|
# split/records producing np.datetime64 rather than Timestamps
|
|
# on datetime64[ns] dtypes only
|
|
|
|
tsmp = Timestamp("20130101")
|
|
test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]})
|
|
test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]})
|
|
|
|
expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}]
|
|
expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}]
|
|
|
|
assert test_data.to_dict(orient="records") == expected_records
|
|
assert test_data_mixed.to_dict(orient="records") == expected_records_mixed
|
|
|
|
expected_series = {
|
|
"A": Series([tsmp, tsmp], name="A"),
|
|
"B": Series([tsmp, tsmp], name="B"),
|
|
}
|
|
expected_series_mixed = {
|
|
"A": Series([tsmp, tsmp], name="A"),
|
|
"B": Series([1, 2], name="B"),
|
|
}
|
|
|
|
tm.assert_dict_equal(test_data.to_dict(orient="series"), expected_series)
|
|
tm.assert_dict_equal(
|
|
test_data_mixed.to_dict(orient="series"), expected_series_mixed
|
|
)
|
|
|
|
expected_split = {
|
|
"index": [0, 1],
|
|
"data": [[tsmp, tsmp], [tsmp, tsmp]],
|
|
"columns": ["A", "B"],
|
|
}
|
|
expected_split_mixed = {
|
|
"index": [0, 1],
|
|
"data": [[tsmp, 1], [tsmp, 2]],
|
|
"columns": ["A", "B"],
|
|
}
|
|
|
|
tm.assert_dict_equal(test_data.to_dict(orient="split"), expected_split)
|
|
tm.assert_dict_equal(
|
|
test_data_mixed.to_dict(orient="split"), expected_split_mixed
|
|
)
|
|
|
|
def test_to_dict_index_not_unique_with_index_orient(self):
    """``orient="index"`` must raise when row labels are duplicated."""
    # GH#22801
    # Data loss when indexes are not unique. Raise ValueError.
    frame = DataFrame(
        {"a": [1, 2], "b": [0.5, 0.75]},
        index=["A", "A"],
    )
    with pytest.raises(
        ValueError, match="DataFrame index must be unique for orient='index'"
    ):
        frame.to_dict(orient="index")
|
|
|
|
def test_to_dict_invalid_orient(self):
    """An unknown ``orient`` string is rejected with ``ValueError``."""
    frame = DataFrame({"A": [0, 1]})
    with pytest.raises(ValueError, match="orient 'xinvalid' not understood"):
        frame.to_dict(orient="xinvalid")
|
|
|
|
@pytest.mark.parametrize("orient", ["d", "l", "r", "sp", "s", "i"])
def test_to_dict_short_orient_raises(self, orient):
    """Abbreviated ``orient`` spellings are not accepted."""
    # GH#32515
    frame = DataFrame({"A": [0, 1]})
    expected_fragment = "not understood"
    with pytest.raises(ValueError, match=expected_fragment):
        frame.to_dict(orient=orient)
|
|
|
|
@pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict])
def test_to_dict(self, mapping):
    """Round-trip a nested dict through ``DataFrame.to_dict`` per orient.

    ``mapping`` is forwarded as ``into`` for the default, "list", "series",
    "split" and "records" orients; the "index" orient is checked with the
    default container.

    ``orient`` and ``into`` are always passed by keyword here: passing
    ``into`` positionally is deprecated in newer pandas releases and is
    slated to become keyword-only, so the keyword form keeps this test
    free of unrelated deprecation warnings.
    """
    # orient= should only take the listed options
    # see GH#32515
    test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}}
    frame = DataFrame(test_data)

    # GH#16122
    recons_data = frame.to_dict(into=mapping)
    for col, col_values in test_data.items():
        for row, value in col_values.items():
            assert value == recons_data[col][row]

    recons_data = frame.to_dict(orient="list", into=mapping)
    for col, col_values in test_data.items():
        for row, value in col_values.items():
            # row labels "1", "2", "3" sit at list positions 0, 1, 2
            assert value == recons_data[col][int(row) - 1]

    recons_data = frame.to_dict(orient="series", into=mapping)
    for col, col_values in test_data.items():
        for row, value in col_values.items():
            assert value == recons_data[col][row]

    recons_data = frame.to_dict(orient="split", into=mapping)
    expected_split = {
        "columns": ["A", "B"],
        "index": ["1", "2", "3"],
        # column "A" has no entry for row "3", hence the NaN
        "data": [[1.0, "1"], [2.0, "2"], [np.nan, "3"]],
    }
    tm.assert_dict_equal(recons_data, expected_split)

    recons_data = frame.to_dict(orient="records", into=mapping)
    expected_records = [
        {"A": 1.0, "B": "1"},
        {"A": 2.0, "B": "2"},
        {"A": np.nan, "B": "3"},
    ]
    assert isinstance(recons_data, list)
    assert len(recons_data) == 3
    for left, right in zip(recons_data, expected_records):
        tm.assert_dict_equal(left, right)

    # GH#10844
    recons_data = frame.to_dict(orient="index")
    for col, col_values in test_data.items():
        for row, value in col_values.items():
            assert value == recons_data[row][col]

    # Same check with a column that duplicates the first one.
    df = DataFrame(test_data)
    df["duped"] = df[df.columns[0]]
    recons_data = df.to_dict(orient="index")
    comp_data = test_data.copy()
    comp_data["duped"] = comp_data[df.columns[0]]
    for col, col_values in comp_data.items():
        for row, value in col_values.items():
            assert value == recons_data[row][col]
|
|
|
|
@pytest.mark.parametrize("mapping", [list, defaultdict, []])
def test_to_dict_errors(self, mapping):
    """Unsupported ``into`` arguments make ``to_dict`` raise ``TypeError``.

    The parametrized values are the ``list`` class, the uninitialized
    ``defaultdict`` class and a ``list`` instance.
    """
    # GH#16122
    # Seeded generator instead of the legacy global ``np.random.randn`` so
    # the frame contents are reproducible from run to run.
    df = DataFrame(np.random.default_rng(2).standard_normal((3, 3)))
    msg = "|".join(
        [
            "unsupported type: <class 'list'>",
            r"to_dict\(\) only accepts initialized defaultdicts",
        ]
    )
    with pytest.raises(TypeError, match=msg):
        df.to_dict(into=mapping)
|
|
|
|
def test_to_dict_not_unique_warning(self):
|
|
# GH#16927: When converting to a dict, if a column has a non-unique name
|
|
# it will be dropped, throwing a warning.
|
|
df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"])
|
|
with tm.assert_produces_warning(UserWarning):
|
|
df.to_dict()
|
|
|
|
# orient - value handed to ``to_dict`` as its ``orient`` argument
# item_getter - callable that digs one cell out of the resulting
#               container, given the column name and the row position
@pytest.mark.parametrize(
    "orient,item_getter",
    [
        ("dict", lambda out, name, pos: out[name][pos]),
        ("records", lambda out, name, pos: out[pos][name]),
        ("list", lambda out, name, pos: out[name][pos]),
        (
            "split",
            lambda out, name, pos: out["data"][pos][out["columns"].index(name)],
        ),
        ("index", lambda out, name, pos: out[pos][name]),
    ],
)
def test_to_dict_box_scalars(self, orient, item_getter):
    """Cells of int/float columns come back as Python ``int``/``float``."""
    # GH#14216, GH#23753
    # make sure that we are boxing properly
    frame = DataFrame({"a": [1, 2], "b": [0.1, 0.2]})
    converted = frame.to_dict(orient=orient)
    for column, python_type in (("a", int), ("b", float)):
        assert isinstance(item_getter(converted, column, 0), python_type)
|
|
|
|
def test_to_dict_tz(self):
    """tz-aware datetimes survive ``to_dict(orient="records")``."""
    # GH#18372 When converting to dict with orient='records' columns of
    # datetime that are tz-aware were not converted to required arrays
    utc = pytz.utc
    rows = [
        (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=utc),),
        (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=utc),),
    ]
    frame = DataFrame(list(rows), columns=["d"])

    records = frame.to_dict(orient="records")
    wanted = [
        {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=utc)},
        {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=utc)},
    ]
    # Compare the two rows one at a time, first row first.
    for position in (0, 1):
        tm.assert_dict_equal(records[position], wanted[position])
|
|
|
|
@pytest.mark.parametrize(
    "into, expected",
    [
        (
            dict,
            {
                0: {"int_col": 1, "float_col": 1.0},
                1: {"int_col": 2, "float_col": 2.0},
                2: {"int_col": 3, "float_col": 3.0},
            },
        ),
        (
            OrderedDict,
            OrderedDict(
                [
                    (0, {"int_col": 1, "float_col": 1.0}),
                    (1, {"int_col": 2, "float_col": 2.0}),
                    (2, {"int_col": 3, "float_col": 3.0}),
                ]
            ),
        ),
        (
            defaultdict(dict),
            defaultdict(
                dict,
                {
                    0: {"int_col": 1, "float_col": 1.0},
                    1: {"int_col": 2, "float_col": 2.0},
                    2: {"int_col": 3, "float_col": 3.0},
                },
            ),
        ),
    ],
)
def test_to_dict_index_dtypes(self, into, expected):
    """``orient="index"`` keeps int and float columns distinct.

    The converted mapping and the expected mapping are both rebuilt into
    frames with ``DataFrame.from_dict`` and compared column-for-column.
    """
    # GH#18580
    # When using to_dict(orient='index') on a dataframe with int
    # and float columns only the int columns were cast to float
    column_order = ["int_col", "float_col"]
    source = DataFrame({"int_col": [1, 2, 3], "float_col": [1.0, 2.0, 3.0]})

    as_mapping = source.to_dict(orient="index", into=into)

    rebuilt = DataFrame.from_dict(as_mapping, orient="index")[column_order]
    wanted = DataFrame.from_dict(expected, orient="index")[column_order]
    tm.assert_frame_equal(rebuilt, wanted)
|
|
|
|
def test_to_dict_numeric_names(self):
|
|
# GH#24940
|
|
df = DataFrame({str(i): [i] for i in range(5)})
|
|
result = set(df.to_dict("records")[0].keys())
|
|
expected = set(df.columns)
|
|
assert result == expected
|
|
|
|
def test_to_dict_wide(self):
|
|
# GH#24939
|
|
df = DataFrame({(f"A_{i:d}"): [i] for i in range(256)})
|
|
result = df.to_dict("records")[0]
|
|
expected = {f"A_{i:d}": i for i in range(256)}
|
|
assert result == expected
|
|
|
|
@pytest.mark.parametrize(
    "data,dtype",
    (
        ([True, True, False], bool),
        (
            [
                datetime(2018, 1, 1),
                datetime(2019, 2, 2),
                datetime(2020, 3, 3),
            ],
            Timestamp,
        ),
        ([1.0, 2.0, 3.0], float),
        ([1, 2, 3], int),
        (["X", "Y", "Z"], str),
    ),
)
def test_to_dict_orient_dtype(self, data, dtype):
    """Every record value has exactly the expected Python/pandas type."""
    # GH22620 & GH21256
    records = DataFrame({"a": data}).to_dict(orient="records")
    # Exact type identity is intended: subclasses must not slip through.
    assert all(type(row["a"]) is dtype for row in records)
|
|
|
|
@pytest.mark.parametrize(
    "data,expected_dtype",
    (
        (np.uint64(2), int),
        (np.int64(-9), int),
        (np.float64(1.1), float),
        (np.bool_(True), bool),
        (np.datetime64("2005-02-25"), Timestamp),
    ),
)
def test_to_dict_scalar_constructor_orient_dtype(self, data, expected_dtype):
    """A frame built from one NumPy scalar yields the matching native type."""
    # GH22620 & GH21256
    frame = DataFrame({"a": data}, index=[0])
    first_record = frame.to_dict(orient="records")[0]
    actual_type = type(first_record["a"])
    assert actual_type is expected_dtype
|
|
|
|
def test_to_dict_mixed_numeric_frame(self):
|
|
# GH 12859
|
|
df = DataFrame({"a": [1.0], "b": [9.0]})
|
|
result = df.reset_index().to_dict("records")
|
|
expected = [{"index": 0, "a": 1.0, "b": 9.0}]
|
|
assert result == expected
|
|
|
|
@pytest.mark.parametrize(
    "index",
    [
        None,
        Index(["aa", "bb"]),
        Index(["aa", "bb"], name="cc"),
        MultiIndex.from_tuples([("a", "b"), ("a", "c")]),
        MultiIndex.from_tuples([("a", "b"), ("a", "c")], names=["n1", "n2"]),
    ],
)
@pytest.mark.parametrize(
    "columns",
    [
        ["x", "y"],
        Index(["x", "y"]),
        Index(["x", "y"], name="z"),
        MultiIndex.from_tuples([("x", 1), ("y", 2)]),
        MultiIndex.from_tuples([("x", 1), ("y", 2)], names=["z1", "z2"]),
    ],
)
def test_to_dict_orient_tight(self, index, columns):
    """``orient="tight"`` round-trips through ``DataFrame.from_dict``.

    Run for every combination of the parametrized row index and column
    index variants (plain, named, and MultiIndex).
    """
    original = DataFrame.from_records(
        [[1, 3], [2, 4]], columns=columns, index=index
    )
    tight = original.to_dict(orient="tight")
    rebuilt = DataFrame.from_dict(tight, orient="tight")

    tm.assert_frame_equal(original, rebuilt)
|
|
|
|
@pytest.mark.parametrize(
    "orient",
    ["dict", "list", "split", "records", "index", "tight"],
)
@pytest.mark.parametrize(
    "data,expected_types",
    (
        (
            {
                "a": [np.int64(1), 1, np.int64(3)],
                "b": [np.float64(1.0), 2.0, np.float64(3.0)],
                "c": [np.float64(1.0), 2, np.int64(3)],
                "d": [np.float64(1.0), "a", np.int64(3)],
                "e": [np.float64(1.0), ["a"], np.int64(3)],
                "f": [np.float64(1.0), ("a",), np.int64(3)],
            },
            {
                "a": [int, int, int],
                "b": [float, float, float],
                "c": [float, float, float],
                "d": [float, str, int],
                "e": [float, list, int],
                "f": [float, tuple, int],
            },
        ),
        (
            {
                "a": [1, 2, 3],
                "b": [1.1, 2.2, 3.3],
            },
            {
                "a": [int, int, int],
                "b": [float, float, float],
            },
        ),
        (  # Make sure we have one df which is all object type cols
            {
                "a": [1, "hello", 3],
                "b": [1.1, "world", 3.3],
            },
            {
                "a": [int, str, int],
                "b": [float, str, float],
            },
        ),
    ),
)
def test_to_dict_returns_native_types(self, orient, data, expected_types):
    """Every cell returned by ``to_dict`` is a native Python object.

    For each orient the result is flattened into lazy
    ``(row position, column label, value)`` triples, which are then checked
    against the input value and the expected type for that cell.
    """
    # GH 46751
    # Tests we get back native types for all orient types
    converted = DataFrame(data).to_dict(orient)

    if orient in ("split", "tight"):
        # values live in a row-major "data" list next to "index"/"columns"
        cells = (
            (row, col, converted["data"][row][col_pos])
            for row in converted["index"]
            for col_pos, col in enumerate(converted["columns"])
        )
    elif orient == "records":
        # list of {column: value}, one per row
        cells = (
            (row, col, cell)
            for row, rec in enumerate(converted)
            for col, cell in rec.items()
        )
    elif orient == "index":
        # {row label: {column: value}}
        cells = (
            (row, col, cell)
            for row, rec in converted.items()
            for col, cell in rec.items()
        )
    elif orient == "list":
        # {column: [values]}
        cells = (
            (row, col, cell)
            for col, column_cells in converted.items()
            for row, cell in enumerate(column_cells)
        )
    elif orient == "dict":
        # {column: {row label: value}}
        cells = (
            (row, col, cell)
            for col, by_row in converted.items()
            for row, cell in by_row.items()
        )

    for row, col, cell in cells:
        assert cell == data[col][row]
        assert type(cell) is expected_types[col][row]
|
|
|
|
@pytest.mark.parametrize("orient", ["dict", "list", "series", "records", "index"])
def test_to_dict_index_false_error(self, orient):
    """``index=False`` is rejected for every orient in the parametrized list."""
    # GH#46398
    frame = DataFrame(
        {"col1": [1, 2], "col2": [3, 4]},
        index=["row1", "row2"],
    )
    with pytest.raises(
        ValueError,
        match="'index=False' is only valid when 'orient' is 'split' or 'tight'",
    ):
        frame.to_dict(orient=orient, index=False)
|
|
|
|
@pytest.mark.parametrize(
    "orient, expected",
    [
        ("split", {"columns": ["col1", "col2"], "data": [[1, 3], [2, 4]]}),
        (
            "tight",
            {
                "columns": ["col1", "col2"],
                "data": [[1, 3], [2, 4]],
                "column_names": [None],
            },
        ),
    ],
)
def test_to_dict_index_false(self, orient, expected):
    """With ``index=False`` the result carries no index-related keys."""
    # GH#46398
    frame = DataFrame(
        {"col1": [1, 2], "col2": [3, 4]},
        index=["row1", "row2"],
    )
    without_index = frame.to_dict(orient=orient, index=False)
    tm.assert_dict_equal(without_index, expected)
|
|
|
|
@pytest.mark.parametrize(
    "orient, expected",
    [
        ("dict", {"a": {0: 1, 1: None}}),
        ("list", {"a": [1, None]}),
        ("split", {"index": [0, 1], "columns": ["a"], "data": [[1], [None]]}),
        (
            "tight",
            {
                "index": [0, 1],
                "columns": ["a"],
                "data": [[1], [None]],
                "index_names": [None],
                "column_names": [None],
            },
        ),
        ("records", [{"a": 1}, {"a": None}]),
        ("index", {0: {"a": 1}, 1: {"a": None}}),
    ],
)
def test_to_dict_na_to_none(self, orient, expected):
    """``NA`` in an ``Int64`` column is expected to come back as ``None``."""
    # GH#50795
    frame = DataFrame({"a": [1, NA]}, dtype="Int64")
    converted = frame.to_dict(orient=orient)
    assert converted == expected
|
|
|
|
def test_to_dict_masked_native_python(self):
|
|
# GH#34665
|
|
df = DataFrame({"a": Series([1, 2], dtype="Int64"), "B": 1})
|
|
result = df.to_dict(orient="records")
|
|
assert type(result[0]["a"]) is int
|
|
|
|
df = DataFrame({"a": Series([1, NA], dtype="Int64"), "B": 1})
|
|
result = df.to_dict(orient="records")
|
|
assert type(result[0]["a"]) is int
|