287 lines
10 KiB
Python
287 lines
10 KiB
Python
|
from collections import OrderedDict, defaultdict
|
||
|
from datetime import datetime
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
import pytz
|
||
|
|
||
|
from pandas import DataFrame, Series, Timestamp
|
||
|
import pandas._testing as tm
|
||
|
|
||
|
|
||
|
class TestDataFrameToDict:
    """Tests for ``DataFrame.to_dict`` across orients and ``into`` mappings."""

    def test_to_dict_timestamp(self):
        """Timestamps are preserved by the records, series and split orients."""
        # GH#11247
        # split/records producing np.datetime64 rather than Timestamps
        # on datetime64[ns] dtypes only

        tsmp = Timestamp("20130101")
        test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]})
        test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]})

        expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}]
        expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}]

        assert test_data.to_dict(orient="records") == expected_records
        assert test_data_mixed.to_dict(orient="records") == expected_records_mixed

        expected_series = {
            "A": Series([tsmp, tsmp], name="A"),
            "B": Series([tsmp, tsmp], name="B"),
        }
        expected_series_mixed = {
            "A": Series([tsmp, tsmp], name="A"),
            "B": Series([1, 2], name="B"),
        }

        tm.assert_dict_equal(test_data.to_dict(orient="series"), expected_series)
        tm.assert_dict_equal(
            test_data_mixed.to_dict(orient="series"), expected_series_mixed
        )

        expected_split = {
            "index": [0, 1],
            "data": [[tsmp, tsmp], [tsmp, tsmp]],
            "columns": ["A", "B"],
        }
        expected_split_mixed = {
            "index": [0, 1],
            "data": [[tsmp, 1], [tsmp, 2]],
            "columns": ["A", "B"],
        }

        tm.assert_dict_equal(test_data.to_dict(orient="split"), expected_split)
        tm.assert_dict_equal(
            test_data_mixed.to_dict(orient="split"), expected_split_mixed
        )

    def test_to_dict_index_not_unique_with_index_orient(self):
        """``orient="index"`` raises ValueError when index labels repeat."""
        # GH#22801
        # Data loss when indexes are not unique. Raise ValueError.
        df = DataFrame({"a": [1, 2], "b": [0.5, 0.75]}, index=["A", "A"])
        msg = "DataFrame index must be unique for orient='index'"
        with pytest.raises(ValueError, match=msg):
            df.to_dict(orient="index")

    def test_to_dict_invalid_orient(self):
        """An unrecognized ``orient`` value raises ValueError."""
        df = DataFrame({"A": [0, 1]})
        msg = "orient 'xinvalid' not understood"
        with pytest.raises(ValueError, match=msg):
            df.to_dict(orient="xinvalid")

    @pytest.mark.parametrize("orient", ["d", "l", "r", "sp", "s", "i"])
    def test_to_dict_short_orient_warns(self, orient):
        """Abbreviated ``orient`` values emit a FutureWarning."""
        # GH#32515
        df = DataFrame({"A": [0, 1]})
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            df.to_dict(orient=orient)

    @pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict])
    def test_to_dict(self, mapping):
        """Each orient reproduces the input values.

        ``mapping`` is passed as ``into`` for every orient exercised here
        except ``"index"``, which is called with the default.
        """
        # orient= should only take the listed options
        # see GH#32515
        test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}}

        # GH#16122
        recons_data = DataFrame(test_data).to_dict(into=mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k][k2]

        recons_data = DataFrame(test_data).to_dict("list", mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                # index labels are "1", "2", "3", so label - 1 is the position
                assert v2 == recons_data[k][int(k2) - 1]

        recons_data = DataFrame(test_data).to_dict("series", mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k][k2]

        recons_data = DataFrame(test_data).to_dict("split", mapping)
        expected_split = {
            "columns": ["A", "B"],
            "index": ["1", "2", "3"],
            "data": [[1.0, "1"], [2.0, "2"], [np.nan, "3"]],
        }
        tm.assert_dict_equal(recons_data, expected_split)

        recons_data = DataFrame(test_data).to_dict("records", mapping)
        expected_records = [
            {"A": 1.0, "B": "1"},
            {"A": 2.0, "B": "2"},
            {"A": np.nan, "B": "3"},
        ]
        assert isinstance(recons_data, list)
        assert len(recons_data) == 3
        for left, right in zip(recons_data, expected_records):
            tm.assert_dict_equal(left, right)

        # GH#10844
        recons_data = DataFrame(test_data).to_dict("index")

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k2][k]

        df = DataFrame(test_data)
        df["duped"] = df[df.columns[0]]
        recons_data = df.to_dict("index")
        comp_data = test_data.copy()
        comp_data["duped"] = comp_data[df.columns[0]]
        for k, v in comp_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k2][k]

    @pytest.mark.parametrize("mapping", [list, defaultdict, []])
    def test_to_dict_errors(self, mapping):
        """Unsupported ``into`` arguments raise TypeError."""
        # GH#16122
        df = DataFrame(np.random.randn(3, 3))
        # Either message is accepted; the pattern is an alternation.
        msg = "|".join(
            [
                "unsupported type: <class 'list'>",
                r"to_dict\(\) only accepts initialized defaultdicts",
            ]
        )
        with pytest.raises(TypeError, match=msg):
            df.to_dict(into=mapping)

    def test_to_dict_not_unique_warning(self):
        """Duplicate column names emit a UserWarning."""
        # GH#16927: When converting to a dict, if a column has a non-unique name
        # it will be dropped, throwing a warning.
        df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"])
        with tm.assert_produces_warning(UserWarning):
            df.to_dict()

    # orient - orient argument to to_dict function
    # item_getter - function for extracting value from
    # the resulting dict using column name and index
    @pytest.mark.parametrize(
        "orient,item_getter",
        [
            ("dict", lambda d, col, idx: d[col][idx]),
            ("records", lambda d, col, idx: d[idx][col]),
            ("list", lambda d, col, idx: d[col][idx]),
            ("split", lambda d, col, idx: d["data"][idx][d["columns"].index(col)]),
            ("index", lambda d, col, idx: d[idx][col]),
        ],
    )
    def test_to_dict_box_scalars(self, orient, item_getter):
        """Extracted values are ``int`` / ``float`` instances for every orient."""
        # GH#14216, GH#23753
        # make sure that we are boxing properly
        df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]})
        result = df.to_dict(orient=orient)
        assert isinstance(item_getter(result, "a", 0), int)
        assert isinstance(item_getter(result, "b", 0), float)

    def test_to_dict_tz(self):
        """tz-aware datetimes match the expected Timestamps with records orient."""
        # GH#18372 When converting to dict with orient='records' columns of
        # datetime that are tz-aware were not converted to required arrays
        data = [
            (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),),
            (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc),),
        ]
        df = DataFrame(data, columns=["d"])

        result = df.to_dict(orient="records")
        expected = [
            {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)},
            {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)},
        ]
        tm.assert_dict_equal(result[0], expected[0])
        tm.assert_dict_equal(result[1], expected[1])

    @pytest.mark.parametrize(
        "into, expected",
        [
            (
                dict,
                {
                    0: {"int_col": 1, "float_col": 1.0},
                    1: {"int_col": 2, "float_col": 2.0},
                    2: {"int_col": 3, "float_col": 3.0},
                },
            ),
            (
                OrderedDict,
                OrderedDict(
                    [
                        (0, {"int_col": 1, "float_col": 1.0}),
                        (1, {"int_col": 2, "float_col": 2.0}),
                        (2, {"int_col": 3, "float_col": 3.0}),
                    ]
                ),
            ),
            (
                defaultdict(dict),
                defaultdict(
                    dict,
                    {
                        0: {"int_col": 1, "float_col": 1.0},
                        1: {"int_col": 2, "float_col": 2.0},
                        2: {"int_col": 3, "float_col": 3.0},
                    },
                ),
            ),
        ],
    )
    def test_to_dict_index_dtypes(self, into, expected):
        """``orient="index"`` output rebuilds into the expected frame."""
        # GH#18580
        # When using to_dict(orient='index') on a dataframe with int
        # and float columns only the int columns were cast to float

        df = DataFrame({"int_col": [1, 2, 3], "float_col": [1.0, 2.0, 3.0]})

        result = df.to_dict(orient="index", into=into)
        # Select columns explicitly so both frames share one column order.
        cols = ["int_col", "float_col"]
        result = DataFrame.from_dict(result, orient="index")[cols]
        expected = DataFrame.from_dict(expected, orient="index")[cols]
        tm.assert_frame_equal(result, expected)

    def test_to_dict_numeric_names(self):
        """Record keys equal the column labels when labels are digit strings."""
        # GH#24940
        df = DataFrame({str(i): [i] for i in range(5)})
        result = set(df.to_dict("records")[0].keys())
        expected = set(df.columns)
        assert result == expected

    def test_to_dict_wide(self):
        """A 256-column frame converts to a complete record."""
        # GH#24939
        df = DataFrame({(f"A_{i:d}"): [i] for i in range(256)})
        result = df.to_dict("records")[0]
        expected = {f"A_{i:d}": i for i in range(256)}
        assert result == expected

    def test_to_dict_orient_dtype(self):
        """Every value in each record has the expected scalar type."""
        # GH22620 & GH21256

        df = DataFrame(
            {
                "bool": [True, True, False],
                "datetime": [
                    datetime(2018, 1, 1),
                    datetime(2019, 2, 2),
                    datetime(2020, 3, 3),
                ],
                "float": [1.0, 2.0, 3.0],
                "int": [1, 2, 3],
                "str": ["X", "Y", "Z"],
            }
        )

        expected = {
            "int": int,
            "float": float,
            "str": str,
            "datetime": Timestamp,
            "bool": bool,
        }

        for df_dict in df.to_dict("records"):
            result = {col: type(df_dict[col]) for col in df.columns}
            assert result == expected
|