416 lines
14 KiB
Python
416 lines
14 KiB
Python
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
from pandas.core.dtypes.base import registry as ea_registry
|
||
|
from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype, PeriodDtype
|
||
|
|
||
|
from pandas import (
|
||
|
Categorical,
|
||
|
DataFrame,
|
||
|
Index,
|
||
|
Interval,
|
||
|
NaT,
|
||
|
Period,
|
||
|
PeriodIndex,
|
||
|
Series,
|
||
|
Timestamp,
|
||
|
date_range,
|
||
|
notna,
|
||
|
period_range,
|
||
|
)
|
||
|
import pandas._testing as tm
|
||
|
from pandas.core.arrays import SparseArray
|
||
|
|
||
|
|
||
|
class TestDataFrameSetItem:
|
||
|
@pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"])
|
||
|
def test_setitem_dtype(self, dtype, float_frame):
|
||
|
arr = np.random.randn(len(float_frame))
|
||
|
|
||
|
float_frame[dtype] = np.array(arr, dtype=dtype)
|
||
|
assert float_frame[dtype].dtype.name == dtype
|
||
|
|
||
|
def test_setitem_list_not_dataframe(self, float_frame):
|
||
|
data = np.random.randn(len(float_frame), 2)
|
||
|
float_frame[["A", "B"]] = data
|
||
|
tm.assert_almost_equal(float_frame[["A", "B"]].values, data)
|
||
|
|
||
|
def test_setitem_error_msmgs(self):
|
||
|
|
||
|
# GH 7432
|
||
|
df = DataFrame(
|
||
|
{"bar": [1, 2, 3], "baz": ["d", "e", "f"]},
|
||
|
index=Index(["a", "b", "c"], name="foo"),
|
||
|
)
|
||
|
ser = Series(
|
||
|
["g", "h", "i", "j"],
|
||
|
index=Index(["a", "b", "c", "a"], name="foo"),
|
||
|
name="fiz",
|
||
|
)
|
||
|
msg = "cannot reindex from a duplicate axis"
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
df["newcol"] = ser
|
||
|
|
||
|
# GH 4107, more descriptive error message
|
||
|
df = DataFrame(np.random.randint(0, 2, (4, 4)), columns=["a", "b", "c", "d"])
|
||
|
|
||
|
msg = "incompatible index of inserted column with frame index"
|
||
|
with pytest.raises(TypeError, match=msg):
|
||
|
df["gr"] = df.groupby(["b", "c"]).count()
|
||
|
|
||
|
def test_setitem_benchmark(self):
|
||
|
# from the vb_suite/frame_methods/frame_insert_columns
|
||
|
N = 10
|
||
|
K = 5
|
||
|
df = DataFrame(index=range(N))
|
||
|
new_col = np.random.randn(N)
|
||
|
for i in range(K):
|
||
|
df[i] = new_col
|
||
|
expected = DataFrame(np.repeat(new_col, K).reshape(N, K), index=range(N))
|
||
|
tm.assert_frame_equal(df, expected)
|
||
|
|
||
|
def test_setitem_different_dtype(self):
|
||
|
df = DataFrame(
|
||
|
np.random.randn(5, 3), index=np.arange(5), columns=["c", "b", "a"]
|
||
|
)
|
||
|
df.insert(0, "foo", df["a"])
|
||
|
df.insert(2, "bar", df["c"])
|
||
|
|
||
|
# diff dtype
|
||
|
|
||
|
# new item
|
||
|
df["x"] = df["a"].astype("float32")
|
||
|
result = df.dtypes
|
||
|
expected = Series(
|
||
|
[np.dtype("float64")] * 5 + [np.dtype("float32")],
|
||
|
index=["foo", "c", "bar", "b", "a", "x"],
|
||
|
)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# replacing current (in different block)
|
||
|
df["a"] = df["a"].astype("float32")
|
||
|
result = df.dtypes
|
||
|
expected = Series(
|
||
|
[np.dtype("float64")] * 4 + [np.dtype("float32")] * 2,
|
||
|
index=["foo", "c", "bar", "b", "a", "x"],
|
||
|
)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
df["y"] = df["a"].astype("int32")
|
||
|
result = df.dtypes
|
||
|
expected = Series(
|
||
|
[np.dtype("float64")] * 4 + [np.dtype("float32")] * 2 + [np.dtype("int32")],
|
||
|
index=["foo", "c", "bar", "b", "a", "x", "y"],
|
||
|
)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
def test_setitem_empty_columns(self):
|
||
|
# GH 13522
|
||
|
df = DataFrame(index=["A", "B", "C"])
|
||
|
df["X"] = df.index
|
||
|
df["X"] = ["x", "y", "z"]
|
||
|
exp = DataFrame(data={"X": ["x", "y", "z"]}, index=["A", "B", "C"])
|
||
|
tm.assert_frame_equal(df, exp)
|
||
|
|
||
|
def test_setitem_dt64_index_empty_columns(self):
|
||
|
rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
|
||
|
df = DataFrame(index=np.arange(len(rng)))
|
||
|
|
||
|
df["A"] = rng
|
||
|
assert df["A"].dtype == np.dtype("M8[ns]")
|
||
|
|
||
|
def test_setitem_timestamp_empty_columns(self):
|
||
|
# GH#19843
|
||
|
df = DataFrame(index=range(3))
|
||
|
df["now"] = Timestamp("20130101", tz="UTC")
|
||
|
|
||
|
expected = DataFrame(
|
||
|
[[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"]
|
||
|
)
|
||
|
tm.assert_frame_equal(df, expected)
|
||
|
|
||
|
def test_setitem_wrong_length_categorical_dtype_raises(self):
|
||
|
# GH#29523
|
||
|
cat = Categorical.from_codes([0, 1, 1, 0, 1, 2], ["a", "b", "c"])
|
||
|
df = DataFrame(range(10), columns=["bar"])
|
||
|
|
||
|
msg = (
|
||
|
rf"Length of values \({len(cat)}\) "
|
||
|
rf"does not match length of index \({len(df)}\)"
|
||
|
)
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
df["foo"] = cat
|
||
|
|
||
|
def test_setitem_with_sparse_value(self):
|
||
|
# GH#8131
|
||
|
df = DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]})
|
||
|
sp_array = SparseArray([0, 0, 1])
|
||
|
df["new_column"] = sp_array
|
||
|
|
||
|
expected = Series(sp_array, name="new_column")
|
||
|
tm.assert_series_equal(df["new_column"], expected)
|
||
|
|
||
|
def test_setitem_with_unaligned_sparse_value(self):
|
||
|
df = DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]})
|
||
|
sp_series = Series(SparseArray([0, 0, 1]), index=[2, 1, 0])
|
||
|
|
||
|
df["new_column"] = sp_series
|
||
|
expected = Series(SparseArray([1, 0, 0]), name="new_column")
|
||
|
tm.assert_series_equal(df["new_column"], expected)
|
||
|
|
||
|
def test_setitem_dict_preserves_dtypes(self):
|
||
|
# https://github.com/pandas-dev/pandas/issues/34573
|
||
|
expected = DataFrame(
|
||
|
{
|
||
|
"a": Series([0, 1, 2], dtype="int64"),
|
||
|
"b": Series([1, 2, 3], dtype=float),
|
||
|
"c": Series([1, 2, 3], dtype=float),
|
||
|
}
|
||
|
)
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"a": Series([], dtype="int64"),
|
||
|
"b": Series([], dtype=float),
|
||
|
"c": Series([], dtype=float),
|
||
|
}
|
||
|
)
|
||
|
for idx, b in enumerate([1, 2, 3]):
|
||
|
df.loc[df.shape[0]] = {"a": int(idx), "b": float(b), "c": float(b)}
|
||
|
tm.assert_frame_equal(df, expected)
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"obj,dtype",
|
||
|
[
|
||
|
(Period("2020-01"), PeriodDtype("M")),
|
||
|
(Interval(left=0, right=5), IntervalDtype("int64")),
|
||
|
(
|
||
|
Timestamp("2011-01-01", tz="US/Eastern"),
|
||
|
DatetimeTZDtype(tz="US/Eastern"),
|
||
|
),
|
||
|
],
|
||
|
)
|
||
|
def test_setitem_extension_types(self, obj, dtype):
|
||
|
# GH: 34832
|
||
|
expected = DataFrame({"idx": [1, 2, 3], "obj": Series([obj] * 3, dtype=dtype)})
|
||
|
|
||
|
df = DataFrame({"idx": [1, 2, 3]})
|
||
|
df["obj"] = obj
|
||
|
|
||
|
tm.assert_frame_equal(df, expected)
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"ea_name",
|
||
|
[
|
||
|
dtype.name
|
||
|
for dtype in ea_registry.dtypes
|
||
|
# property would require instantiation
|
||
|
if not isinstance(dtype.name, property)
|
||
|
]
|
||
|
# mypy doesn't allow adding lists of different types
|
||
|
# https://github.com/python/mypy/issues/5492
|
||
|
+ ["datetime64[ns, UTC]", "period[D]"], # type: ignore[list-item]
|
||
|
)
|
||
|
def test_setitem_with_ea_name(self, ea_name):
|
||
|
# GH 38386
|
||
|
result = DataFrame([0])
|
||
|
result[ea_name] = [1]
|
||
|
expected = DataFrame({0: [0], ea_name: [1]})
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_setitem_dt64_ndarray_with_NaT_and_diff_time_units(self):
|
||
|
# GH#7492
|
||
|
data_ns = np.array([1, "nat"], dtype="datetime64[ns]")
|
||
|
result = Series(data_ns).to_frame()
|
||
|
result["new"] = data_ns
|
||
|
expected = DataFrame({0: [1, None], "new": [1, None]}, dtype="datetime64[ns]")
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# OutOfBoundsDatetime error shouldn't occur
|
||
|
data_s = np.array([1, "nat"], dtype="datetime64[s]")
|
||
|
result["new"] = data_s
|
||
|
expected = DataFrame({0: [1, None], "new": [1e9, None]}, dtype="datetime64[ns]")
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"])
|
||
|
def test_frame_setitem_datetime64_col_other_units(self, unit):
|
||
|
# Check that non-nano dt64 values get cast to dt64 on setitem
|
||
|
# into a not-yet-existing column
|
||
|
n = 100
|
||
|
|
||
|
dtype = np.dtype(f"M8[{unit}]")
|
||
|
vals = np.arange(n, dtype=np.int64).view(dtype)
|
||
|
ex_vals = vals.astype("datetime64[ns]")
|
||
|
|
||
|
df = DataFrame({"ints": np.arange(n)}, index=np.arange(n))
|
||
|
df[unit] = vals
|
||
|
|
||
|
assert df[unit].dtype == np.dtype("M8[ns]")
|
||
|
assert (df[unit].values == ex_vals).all()
|
||
|
|
||
|
@pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"])
|
||
|
def test_frame_setitem_existing_datetime64_col_other_units(self, unit):
|
||
|
# Check that non-nano dt64 values get cast to dt64 on setitem
|
||
|
# into an already-existing dt64 column
|
||
|
n = 100
|
||
|
|
||
|
dtype = np.dtype(f"M8[{unit}]")
|
||
|
vals = np.arange(n, dtype=np.int64).view(dtype)
|
||
|
ex_vals = vals.astype("datetime64[ns]")
|
||
|
|
||
|
df = DataFrame({"ints": np.arange(n)}, index=np.arange(n))
|
||
|
df["dates"] = np.arange(n, dtype=np.int64).view("M8[ns]")
|
||
|
|
||
|
# We overwrite existing dt64 column with new, non-nano dt64 vals
|
||
|
df["dates"] = vals
|
||
|
assert (df["dates"].values == ex_vals).all()
|
||
|
|
||
|
def test_setitem_dt64tz(self, timezone_frame):
|
||
|
|
||
|
df = timezone_frame
|
||
|
idx = df["B"].rename("foo")
|
||
|
|
||
|
# setitem
|
||
|
df["C"] = idx
|
||
|
tm.assert_series_equal(df["C"], Series(idx, name="C"))
|
||
|
|
||
|
df["D"] = "foo"
|
||
|
df["D"] = idx
|
||
|
tm.assert_series_equal(df["D"], Series(idx, name="D"))
|
||
|
del df["D"]
|
||
|
|
||
|
# assert that A & C are not sharing the same base (e.g. they
|
||
|
# are copies)
|
||
|
b1 = df._mgr.blocks[1]
|
||
|
b2 = df._mgr.blocks[2]
|
||
|
tm.assert_extension_array_equal(b1.values, b2.values)
|
||
|
b1base = b1.values._data.base
|
||
|
b2base = b2.values._data.base
|
||
|
assert b1base is None or (id(b1base) != id(b2base))
|
||
|
|
||
|
# with nan
|
||
|
df2 = df.copy()
|
||
|
df2.iloc[1, 1] = NaT
|
||
|
df2.iloc[1, 2] = NaT
|
||
|
result = df2["B"]
|
||
|
tm.assert_series_equal(notna(result), Series([True, False, True], name="B"))
|
||
|
tm.assert_series_equal(df2.dtypes, df.dtypes)
|
||
|
|
||
|
def test_setitem_periodindex(self):
|
||
|
rng = period_range("1/1/2000", periods=5, name="index")
|
||
|
df = DataFrame(np.random.randn(5, 3), index=rng)
|
||
|
|
||
|
df["Index"] = rng
|
||
|
rs = Index(df["Index"])
|
||
|
tm.assert_index_equal(rs, rng, check_names=False)
|
||
|
assert rs.name == "Index"
|
||
|
assert rng.name == "index"
|
||
|
|
||
|
rs = df.reset_index().set_index("index")
|
||
|
assert isinstance(rs.index, PeriodIndex)
|
||
|
tm.assert_index_equal(rs.index, rng)
|
||
|
|
||
|
def test_setitem_complete_column_with_array(self):
|
||
|
# GH#37954
|
||
|
df = DataFrame({"a": ["one", "two", "three"], "b": [1, 2, 3]})
|
||
|
arr = np.array([[1, 1], [3, 1], [5, 1]])
|
||
|
df[["c", "d"]] = arr
|
||
|
expected = DataFrame(
|
||
|
{
|
||
|
"a": ["one", "two", "three"],
|
||
|
"b": [1, 2, 3],
|
||
|
"c": [1, 3, 5],
|
||
|
"d": [1, 1, 1],
|
||
|
}
|
||
|
)
|
||
|
tm.assert_frame_equal(df, expected)
|
||
|
|
||
|
@pytest.mark.parametrize("dtype", ["f8", "i8", "u8"])
|
||
|
def test_setitem_bool_with_numeric_index(self, dtype):
|
||
|
# GH#36319
|
||
|
cols = Index([1, 2, 3], dtype=dtype)
|
||
|
df = DataFrame(np.random.randn(3, 3), columns=cols)
|
||
|
|
||
|
df[False] = ["a", "b", "c"]
|
||
|
|
||
|
expected_cols = Index([1, 2, 3, False], dtype=object)
|
||
|
if dtype == "f8":
|
||
|
expected_cols = Index([1.0, 2.0, 3.0, False], dtype=object)
|
||
|
|
||
|
tm.assert_index_equal(df.columns, expected_cols)
|
||
|
|
||
|
|
||
|
class TestDataFrameSetItemWithExpansion:
    """Tests for ``DataFrame.__setitem__`` calls that add new columns."""

    def test_setitem_listlike_views(self):
        """A column taken before list-like expansion still sees later writes."""
        # GH#38148
        frame = DataFrame({"a": [1, 2, 3], "b": [4, 4, 6]})

        # get one column as a view of df
        first_col = frame["a"]

        # add columns with list-like indexer
        extra = np.array([[0.1, 0.2], [0.3, 0.4], [0.4, 0.5]])
        frame[["c", "d"]] = extra

        # edit in place the first column to check view semantics
        frame.iloc[0, 0] = 100

        tm.assert_series_equal(first_col, Series([100, 2, 3], name="a"))

    def test_setitem_string_column_numpy_dtype_raising(self):
        """The string label "0 - Name" can be added next to integer labels."""
        # GH#39010
        frame = DataFrame([[1, 2], [3, 4]])
        frame["0 - Name"] = [5, 6]

        rows = [[1, 2, 5], [3, 4, 6]]
        expected = DataFrame(rows, columns=[0, 1, "0 - Name"])
        tm.assert_frame_equal(frame, expected)
|
||
|
|
||
|
|
||
|
class TestDataFrameSetItemSlicing:
    """Tests for ``DataFrame.__setitem__`` with slice keys."""

    def test_setitem_slice_position(self):
        """A negative slice key selects rows the same way it does on an ndarray."""
        # GH#31469
        shape = (100, 1)
        frame = DataFrame(np.zeros(shape))
        frame[-4:] = 1

        # apply the same assignment to a plain array as the reference
        reference = np.zeros(shape)
        reference[-4:] = 1
        tm.assert_frame_equal(frame, DataFrame(reference))
|
||
|
|
||
|
|
||
|
class TestDataFrameSetItemCallable:
    """Tests for ``DataFrame.__setitem__`` with a callable key."""

    def test_setitem_callable(self):
        """A callable returning "A" as the key overwrites column "A"."""
        # GH#12533
        untouched = [5, 6, 7, 8]
        frame = DataFrame({"A": [1, 2, 3, 4], "B": untouched})
        frame[lambda x: "A"] = [11, 12, 13, 14]

        expected = DataFrame({"A": [11, 12, 13, 14], "B": untouched})
        tm.assert_frame_equal(frame, expected)
|
||
|
|
||
|
|
||
|
class TestDataFrameSetItemBooleanMask:
|
||
|
@pytest.mark.parametrize(
|
||
|
"mask_type",
|
||
|
[lambda df: df > np.abs(df) / 2, lambda df: (df > np.abs(df) / 2).values],
|
||
|
ids=["dataframe", "array"],
|
||
|
)
|
||
|
def test_setitem_boolean_mask(self, mask_type, float_frame):
|
||
|
|
||
|
# Test for issue #18582
|
||
|
df = float_frame.copy()
|
||
|
mask = mask_type(df)
|
||
|
|
||
|
# index with boolean mask
|
||
|
result = df.copy()
|
||
|
result[mask] = np.nan
|
||
|
|
||
|
expected = df.copy()
|
||
|
expected.values[np.array(mask)] = np.nan
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize("indexer", [lambda x: x, lambda x: x.loc])
|
||
|
def test_setitem_boolean_mask_aligning(self, indexer):
|
||
|
# GH#39931
|
||
|
df = DataFrame({"a": [1, 4, 2, 3], "b": [5, 6, 7, 8]})
|
||
|
expected = df.copy()
|
||
|
mask = df["a"] >= 3
|
||
|
indexer(df)[mask] = indexer(df)[mask].sort_values("a")
|
||
|
tm.assert_frame_equal(df, expected)
|