Inzynierka/Lib/site-packages/pandas/tests/copy_view/test_indexing.py

1080 lines
35 KiB
Python
Raw Permalink Normal View History

2023-06-02 12:51:02 +02:00
import numpy as np
import pytest
from pandas.errors import SettingWithCopyWarning
from pandas.core.dtypes.common import is_float_dtype
import pandas as pd
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
@pytest.fixture(params=["numpy", "nullable"])
def backend(request):
    """
    Provide ``(backend_name, make_dataframe, make_series)`` for each dtype backend.

    For "numpy" the constructors are thin wrappers around ``DataFrame`` and
    ``Series``. For "nullable" the constructed object is additionally passed
    through ``convert_dtypes()`` and copied before being returned.
    """
    flavour = request.param

    if flavour == "numpy":

        def make_dataframe(*args, **kwargs):
            return DataFrame(*args, **kwargs)

        def make_series(*args, **kwargs):
            return Series(*args, **kwargs)

    elif flavour == "nullable":

        def make_dataframe(*args, **kwargs):
            plain = DataFrame(*args, **kwargs)
            converted = plain.convert_dtypes()
            # convert_dtypes may turn a float column into a non-float one when
            # that is lossless; force such columns back to nullable Float64
            for name in plain.columns:
                was_float = is_float_dtype(plain[name].dtype)
                if was_float and not is_float_dtype(converted[name].dtype):
                    converted[name] = converted[name].astype("Float64")
            # hand out a copy so the result is a fully self-owning DataFrame
            return converted.copy()

        def make_series(*args, **kwargs):
            return Series(*args, **kwargs).convert_dtypes().copy()

    return flavour, make_dataframe, make_series
# -----------------------------------------------------------------------------
# Indexing operations taking subset + modifying the subset/parent
def test_subset_column_selection(backend, using_copy_on_write):
    """Select columns with a list of labels, then modify the resulting subset."""
    _, DataFrame, _ = backend
    parent = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    parent_before = parent.copy()

    subset = parent[["a", "c"]]

    # Before the write: memory is shared under CoW, and not shared otherwise.
    shares_a = np.shares_memory(get_array(subset, "a"), get_array(parent, "a"))
    assert shares_a if using_copy_on_write else not shares_a

    # NOTE: the non-CoW write used to be wrapped in a SettingWithCopyWarning
    # check; per the original note it no longer warns since pandas 1.4, so the
    # write is performed unguarded in both modes.
    subset.iloc[0, 0] = 0

    # After the write the subset never shares memory with the parent.
    assert not np.shares_memory(get_array(subset, "a"), get_array(parent, "a"))

    tm.assert_frame_equal(
        subset, DataFrame({"a": [0, 2, 3], "c": [0.1, 0.2, 0.3]})
    )
    tm.assert_frame_equal(parent, parent_before)
def test_subset_column_selection_modify_parent(backend, using_copy_on_write):
    """Select columns with a list of labels, then modify the parent frame."""
    _, DataFrame, _ = backend
    parent = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})

    subset = parent[["a", "c"]]

    if using_copy_on_write:
        # the subset starts out sharing memory with the parent ...
        assert np.shares_memory(get_array(subset, "a"), get_array(parent, "a"))
    # ... and the write to the parent below must not reach the subset
    parent.iloc[0, 0] = 0

    assert not np.shares_memory(get_array(subset, "a"), get_array(parent, "a"))
    if using_copy_on_write:
        # the untouched column "c" is still shared
        assert np.shares_memory(get_array(subset, "c"), get_array(parent, "c"))

    tm.assert_frame_equal(
        subset, DataFrame({"a": [1, 2, 3], "c": [0.1, 0.2, 0.3]})
    )
def test_subset_row_slice(backend, using_copy_on_write):
    """Take a row slice of a DataFrame, then modify the resulting subset."""
    _, DataFrame, _ = backend
    parent = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    parent_expected = parent.copy()

    subset = parent[1:3]
    subset._mgr._verify_integrity()

    assert np.shares_memory(get_array(subset, "a"), get_array(parent, "a"))

    # NOTE: per the original note, the non-CoW write no longer raises
    # SettingWithCopyWarning since pandas 1.4, so both modes write unguarded.
    subset.iloc[0, 0] = 0
    if using_copy_on_write:
        assert not np.shares_memory(get_array(subset, "a"), get_array(parent, "a"))

    subset._mgr._verify_integrity()

    tm.assert_frame_equal(
        subset,
        DataFrame({"a": [0, 3], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3)),
    )

    if not using_copy_on_write:
        # without CoW the write goes through to the parent
        parent_expected.iloc[1, 0] = 0
    tm.assert_frame_equal(parent, parent_expected)
@pytest.mark.parametrize(
    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
def test_subset_column_slice(backend, using_copy_on_write, using_array_manager, dtype):
    """Take a column slice with ``iloc``, then modify the resulting subset."""
    dtype_backend, DataFrame, _ = backend
    single_block = (
        not using_array_manager and dtype == "int64" and dtype_backend == "numpy"
    )
    parent = DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
    )
    parent_expected = parent.copy()

    subset = parent.iloc[:, 1:]
    subset._mgr._verify_integrity()

    if using_copy_on_write:
        assert np.shares_memory(get_array(subset, "b"), get_array(parent, "b"))

        subset.iloc[0, 0] = 0
        assert not np.shares_memory(get_array(subset, "b"), get_array(parent, "b"))
    else:
        # a warning is only expected in the single-block case
        warn = SettingWithCopyWarning if single_block else None
        warn_option = pd.option_context("chained_assignment", "warn")
        with warn_option, tm.assert_produces_warning(warn):
            subset.iloc[0, 0] = 0

    tm.assert_frame_equal(
        subset, DataFrame({"b": [0, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)})
    )

    # the parent is only expected to change without CoW, and then only for
    # ArrayManager or a single block
    parent_mutated = not using_copy_on_write and (using_array_manager or single_block)
    if parent_mutated:
        parent_expected.iloc[0, 1] = 0
    tm.assert_frame_equal(parent, parent_expected)
@pytest.mark.parametrize(
    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
@pytest.mark.parametrize(
    "row_indexer",
    [slice(1, 2), np.array([False, True, True]), np.array([1, 2])],
    ids=["slice", "mask", "array"],
)
@pytest.mark.parametrize(
    "column_indexer",
    [slice("b", "c"), np.array([False, True, True]), ["b", "c"]],
    ids=["slice", "mask", "array"],
)
def test_subset_loc_rows_columns(
    backend,
    dtype,
    row_indexer,
    column_indexer,
    using_array_manager,
    using_copy_on_write,
):
    """
    Select rows and columns at once with ``.loc``, then modify the subset.

    Generic over several row/column indexer combinations; it does not check
    memory sharing, only what the mutation does to the subset and the parent.
    """
    dtype_backend, DataFrame, _ = backend
    parent = DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
    )
    parent_expected = parent.copy()

    subset = parent.loc[row_indexer, column_indexer]
    subset.iloc[0, 0] = 0

    tm.assert_frame_equal(
        subset,
        DataFrame(
            {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3)
        ),
    )

    # The parent is expected to stay untouched except in a few corner cases:
    # both indexers are slices AND we run with ArrayManager, or with a
    # single-block numpy frame without CoW.
    both_slices = isinstance(row_indexer, slice) and isinstance(column_indexer, slice)
    single_numpy_block_no_cow = (
        dtype == "int64" and dtype_backend == "numpy" and not using_copy_on_write
    )
    if both_slices and (using_array_manager or single_numpy_block_no_cow):
        parent_expected.iloc[1, 1] = 0
    tm.assert_frame_equal(parent, parent_expected)
@pytest.mark.parametrize(
    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
@pytest.mark.parametrize(
    "row_indexer",
    [slice(1, 3), np.array([False, True, True]), np.array([1, 2])],
    ids=["slice", "mask", "array"],
)
@pytest.mark.parametrize(
    "column_indexer",
    [slice(1, 3), np.array([False, True, True]), [1, 2]],
    ids=["slice", "mask", "array"],
)
def test_subset_iloc_rows_columns(
    backend,
    dtype,
    row_indexer,
    column_indexer,
    using_array_manager,
    using_copy_on_write,
):
    """
    Select rows and columns at once with ``.iloc``, then modify the subset.

    Generic over several row/column indexer combinations; it does not check
    memory sharing, only what the mutation does to the subset and the parent.
    """
    dtype_backend, DataFrame, _ = backend
    parent = DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
    )
    parent_expected = parent.copy()

    subset = parent.iloc[row_indexer, column_indexer]
    subset.iloc[0, 0] = 0

    tm.assert_frame_equal(
        subset,
        DataFrame(
            {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3)
        ),
    )

    # The parent is expected to stay untouched except in a few corner cases:
    # both indexers are slices AND we run with ArrayManager, or with a
    # single-block numpy frame without CoW.
    both_slices = isinstance(row_indexer, slice) and isinstance(column_indexer, slice)
    single_numpy_block_no_cow = (
        dtype == "int64" and dtype_backend == "numpy" and not using_copy_on_write
    )
    if both_slices and (using_array_manager or single_numpy_block_no_cow):
        parent_expected.iloc[1, 1] = 0
    tm.assert_frame_equal(parent, parent_expected)
@pytest.mark.parametrize(
    "indexer",
    [slice(0, 2), np.array([True, True, False]), np.array([0, 1])],
    ids=["slice", "mask", "array"],
)
def test_subset_set_with_row_indexer(backend, indexer_si, indexer, using_copy_on_write):
    """
    Set values through a row indexer on a row-sliced subset.

    Covers both ``subset[indexer] = value`` and ``subset.iloc[indexer] = value``.
    """
    via_setitem = indexer_si is tm.setitem
    if via_setitem and isinstance(indexer, np.ndarray) and indexer.dtype == "int":
        pytest.skip("setitem with labels selects on columns")

    _, DataFrame, _ = backend
    parent = DataFrame(
        {"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}
    )
    parent_expected = parent.copy()
    subset = parent[1:4]

    if using_copy_on_write:
        indexer_si(subset)[indexer] = 0
    else:
        # per the original note, iloc no longer warns since pandas 1.4, so a
        # warning is only expected for plain setitem
        warn = SettingWithCopyWarning if via_setitem else None
        warn_option = pd.option_context("chained_assignment", "warn")
        with warn_option, tm.assert_produces_warning(warn):
            indexer_si(subset)[indexer] = 0

    tm.assert_frame_equal(
        subset,
        DataFrame(
            {"a": [0, 0, 4], "b": [0, 0, 7], "c": [0.0, 0.0, 0.4]},
            index=range(1, 4),
        ),
    )

    if not using_copy_on_write:
        # without CoW the write goes through to the parent
        parent_expected[1:3] = 0
    tm.assert_frame_equal(parent, parent_expected)
def test_subset_set_with_mask(backend, using_copy_on_write):
    """Set values through a boolean mask on a row-sliced subset."""
    _, DataFrame, _ = backend
    parent = DataFrame(
        {"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}
    )
    parent_expected = parent.copy()
    subset = parent[1:4]

    mask = subset > 3

    if using_copy_on_write:
        subset[mask] = 0
    else:
        warn_option = pd.option_context("chained_assignment", "warn")
        with warn_option, tm.assert_produces_warning(SettingWithCopyWarning):
            subset[mask] = 0

    tm.assert_frame_equal(
        subset,
        DataFrame(
            {"a": [2, 3, 0], "b": [0, 0, 0], "c": [0.20, 0.3, 0.4]},
            index=range(1, 4),
        ),
    )

    if not using_copy_on_write:
        # without CoW the masked write goes through to the parent
        parent_expected.loc[3, "a"] = 0
        parent_expected.loc[1:3, "b"] = 0
    tm.assert_frame_equal(parent, parent_expected)
def test_subset_set_column(backend, using_copy_on_write):
    """Replace a single column on a row-sliced subset: ``subset[col] = value``."""
    dtype_backend, DataFrame, _ = backend
    parent = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    parent_before = parent.copy()
    subset = parent[1:3]

    # use a replacement array matching the backend under test
    new_values = (
        np.array([10, 11], dtype="int64")
        if dtype_backend == "numpy"
        else pd.array([10, 11], dtype="Int64")
    )

    if using_copy_on_write:
        subset["a"] = new_values
    else:
        warn_option = pd.option_context("chained_assignment", "warn")
        with warn_option, tm.assert_produces_warning(SettingWithCopyWarning):
            subset["a"] = new_values

    subset._mgr._verify_integrity()
    tm.assert_frame_equal(
        subset,
        DataFrame({"a": [10, 11], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3)),
    )
    # the parent is expected to be unchanged in both modes
    tm.assert_frame_equal(parent, parent_before)
@pytest.mark.parametrize(
    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
def test_subset_set_column_with_loc(
    backend, using_copy_on_write, using_array_manager, dtype
):
    """Set one column via ``subset.loc[:, col] = value`` on a row-sliced subset."""
    _, DataFrame, _ = backend
    parent = DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
    )
    parent_expected = parent.copy()
    subset = parent[1:3]

    if using_copy_on_write:
        subset.loc[:, "a"] = np.array([10, 11], dtype="int64")
    else:
        warn_option = pd.option_context("chained_assignment", "warn")
        # no specific warning is expected; extra warnings are only treated as
        # failures when not running with ArrayManager
        no_warning = tm.assert_produces_warning(
            None,
            raise_on_extra_warnings=not using_array_manager,
        )
        with warn_option, no_warning:
            subset.loc[:, "a"] = np.array([10, 11], dtype="int64")

    subset._mgr._verify_integrity()
    tm.assert_frame_equal(
        subset,
        DataFrame(
            {"a": [10, 11], "b": [5, 6], "c": np.array([8, 9], dtype=dtype)},
            index=range(1, 3),
        ),
    )

    if not using_copy_on_write:
        # without CoW the write goes through to the parent
        parent_expected.loc[1:3, "a"] = np.array([10, 11], dtype="int64")
    tm.assert_frame_equal(parent, parent_expected)
def test_subset_set_column_with_loc2(backend, using_copy_on_write, using_array_manager):
    """
    Set one column via ``subset.loc[:, col] = value`` on a single-column frame.

    Kept separate from the multi-column variant because, per the original
    note, a single-column DataFrame takes a separate code path.
    """
    _, DataFrame, _ = backend
    parent = DataFrame({"a": [1, 2, 3]})
    parent_expected = parent.copy()
    subset = parent[1:3]

    if using_copy_on_write:
        subset.loc[:, "a"] = 0
    else:
        warn_option = pd.option_context("chained_assignment", "warn")
        # no specific warning is expected; extra warnings are only treated as
        # failures when not running with ArrayManager
        no_warning = tm.assert_produces_warning(
            None,
            raise_on_extra_warnings=not using_array_manager,
        )
        with warn_option, no_warning:
            subset.loc[:, "a"] = 0

    subset._mgr._verify_integrity()
    tm.assert_frame_equal(subset, DataFrame({"a": [0, 0]}, index=range(1, 3)))

    if not using_copy_on_write:
        # without CoW the write goes through to the parent
        parent_expected.loc[1:3, "a"] = 0
    tm.assert_frame_equal(parent, parent_expected)
@pytest.mark.parametrize(
    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
def test_subset_set_columns(backend, using_copy_on_write, dtype):
    """Replace several columns at once: ``subset[[col1, col2]] = value``."""
    dtype_backend, DataFrame, _ = backend
    parent = DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
    )
    parent_before = parent.copy()
    subset = parent[1:3]

    if using_copy_on_write:
        subset[["a", "c"]] = 0
    else:
        warn_option = pd.option_context("chained_assignment", "warn")
        with warn_option, tm.assert_produces_warning(SettingWithCopyWarning):
            subset[["a", "c"]] = 0

    subset._mgr._verify_integrity()
    if using_copy_on_write:
        # the replaced columns (positions 0 and 2) must not hold references
        for position in (0, 2):
            assert subset._mgr._has_no_reference(position)

    expected = DataFrame({"a": [0, 0], "b": [5, 6], "c": [0, 0]}, index=range(1, 3))
    if dtype_backend == "nullable":
        # there is not yet a global option, so overwriting a column with a
        # scalar yields a numpy dtype even if the original column was nullable
        for name in ("a", "c"):
            expected[name] = expected[name].astype("int64")

    tm.assert_frame_equal(subset, expected)
    # the parent is expected to be unchanged in both modes
    tm.assert_frame_equal(parent, parent_before)
@pytest.mark.parametrize(
    "indexer",
    [slice("a", "b"), np.array([True, True, False]), ["a", "b"]],
    ids=["slice", "mask", "array"],
)
def test_subset_set_with_column_indexer(backend, indexer, using_copy_on_write):
    """Set several columns via ``subset.loc[:, indexer] = value``."""
    _, DataFrame, _ = backend
    parent = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]})
    parent_expected = parent.copy()
    subset = parent[1:3]

    if using_copy_on_write:
        subset.loc[:, indexer] = 0
    else:
        # As of 2.0, this setitem attempts (successfully) to set values
        # inplace, so the assignment is not chained and no warning check is
        # wrapped around it.
        with pd.option_context("chained_assignment", "warn"):
            subset.loc[:, indexer] = 0

    subset._mgr._verify_integrity()
    tm.assert_frame_equal(
        subset,
        DataFrame({"a": [0, 0], "b": [0.0, 0.0], "c": [5, 6]}, index=range(1, 3)),
    )

    if not using_copy_on_write:
        # pre-2.0, in the mixed case with BlockManager, only column "a" would
        # be mutated in the parent frame; this changed with the enforcement
        # of GH#45333
        parent_expected.loc[1:2, ["a", "b"]] = 0
    tm.assert_frame_equal(parent, parent_expected)
@pytest.mark.parametrize(
    "method",
    [
        lambda df: df[["a", "b"]][0:2],
        lambda df: df[0:2][["a", "b"]],
        lambda df: df[["a", "b"]].iloc[0:2],
        lambda df: df[["a", "b"]].loc[0:1],
        lambda df: df[0:2].iloc[:, 0:2],
        lambda df: df[0:2].loc[:, "a":"b"],  # type: ignore[misc]
    ],
    ids=[
        "row-getitem-slice",
        "column-getitem",
        "row-iloc-slice",
        "row-loc-slice",
        "column-iloc-slice",
        "column-loc-slice",
    ],
)
@pytest.mark.parametrize(
    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
def test_subset_chained_getitem(
    request, backend, method, dtype, using_copy_on_write, using_array_manager
):
    """
    Build a subset through two chained indexing calls and mutate either side.

    Checks the parent after writing to the subset, then checks a freshly built
    subset after writing to the parent.
    """
    _, DataFrame, _ = backend
    parent = DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
    )
    parent_before = parent.copy()

    # Without CoW, whether the subset is a view depends on the parametrization,
    # which is identified here through the pytest callspec id.
    callspec_id = request.node.callspec.id
    if using_array_manager:
        # with ArrayManager, single vs mixed block and numpy vs nullable
        # dtypes do not matter; only the column-slice variants give a view
        subset_is_view = callspec_id.endswith(
            ("column-iloc-slice", "column-loc-slice")
        )
    else:
        subset_is_view = callspec_id in (
            "numpy-single-block-column-iloc-slice",
            "numpy-single-block-column-loc-slice",
        )
    writes_propagate = subset_is_view and not using_copy_on_write

    # write to the subset, then inspect the parent
    subset = method(parent)
    subset.iloc[0, 0] = 0
    if writes_propagate:
        assert parent.iloc[0, 0] == 0
    else:
        tm.assert_frame_equal(parent, parent_before)

    # write to the parent, then inspect a freshly taken subset
    subset = method(parent)
    parent.iloc[0, 0] = 0
    expected = DataFrame({"a": [1, 2], "b": [4, 5]})
    if writes_propagate:
        assert subset.iloc[0, 0] == 0
    else:
        tm.assert_frame_equal(subset, expected)
@pytest.mark.parametrize(
    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
def test_subset_chained_getitem_column(backend, dtype, using_copy_on_write):
    """
    Build a Series subset via ``df[:]["a"][0:2]`` and mutate either side.

    The indexing chain is deliberately written as one expression so that no
    intermediate object is bound to a name.
    """
    _, DataFrame, Series = backend
    parent = DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
    )
    parent_before = parent.copy()

    # write to the subset, then inspect the parent
    head_of_a = parent[:]["a"][0:2]
    parent._clear_item_cache()
    head_of_a.iloc[0] = 0
    if not using_copy_on_write:
        assert parent.iloc[0, 0] == 0
    else:
        tm.assert_frame_equal(parent, parent_before)

    # write to the parent, then inspect a freshly taken subset
    head_of_a = parent[:]["a"][0:2]
    parent._clear_item_cache()
    parent.iloc[0, 0] = 0
    expected = Series([1, 2], name="a")
    if not using_copy_on_write:
        assert head_of_a.iloc[0] == 0
    else:
        tm.assert_series_equal(head_of_a, expected)
@pytest.mark.parametrize(
    "method",
    [
        lambda s: s["a":"c"]["a":"b"],  # type: ignore[misc]
        lambda s: s.iloc[0:3].iloc[0:2],
        lambda s: s.loc["a":"c"].loc["a":"b"],  # type: ignore[misc]
        lambda s: s.loc["a":"c"]  # type: ignore[misc]
        .iloc[0:3]
        .iloc[0:2]
        .loc["a":"b"]  # type: ignore[misc]
        .iloc[0:1],
    ],
    ids=["getitem", "iloc", "loc", "long-chain"],
)
def test_subset_chained_getitem_series(backend, method, using_copy_on_write):
    """Build a Series subset through chained indexing and mutate either side."""
    _, _, Series = backend
    ser = Series([1, 2, 3], index=["a", "b", "c"])
    ser_before = ser.copy()

    # write to the subset, then inspect the parent
    subset = method(ser)
    subset.iloc[0] = 0
    if not using_copy_on_write:
        assert ser.iloc[0] == 0
    else:
        tm.assert_series_equal(ser, ser_before)

    # write to the parent, then inspect a freshly taken subset
    # NOTE(review): this half always uses the fixed iloc chain rather than the
    # parametrized ``method``; presumably because ``expected`` below has two
    # rows while the "long-chain" variant ends in ``.iloc[0:1]`` -- confirm.
    subset = ser.iloc[0:3].iloc[0:2]
    ser.iloc[0] = 0
    expected = Series([1, 2], index=["a", "b"])
    if not using_copy_on_write:
        assert subset.iloc[0] == 0
    else:
        tm.assert_series_equal(subset, expected)
def test_subset_chained_single_block_row(using_copy_on_write, using_array_manager):
    """
    Take part of a single row via ``df[:].iloc[0].iloc[0:2]`` and mutate either side.

    Not parametrized over the dtype backend, since this explicitly targets a
    single-block frame.
    """
    parent = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
    parent_before = parent.copy()
    # writes are only expected to propagate with neither CoW nor ArrayManager
    isolated = using_copy_on_write or using_array_manager

    # write to the subset, then inspect the parent
    row_part = parent[:].iloc[0].iloc[0:2]
    row_part.iloc[0] = 0
    if isolated:
        tm.assert_frame_equal(parent, parent_before)
    else:
        assert parent.iloc[0, 0] == 0

    # write to the parent, then inspect a freshly taken subset
    row_part = parent[:].iloc[0].iloc[0:2]
    parent.iloc[0, 0] = 0
    expected = Series([1, 4], index=["a", "b"], name=0)
    if isolated:
        tm.assert_series_equal(row_part, expected)
    else:
        assert row_part.iloc[0] == 0
@pytest.mark.parametrize(
    "method",
    [
        lambda df: df[:],
        lambda df: df.loc[:, :],
        lambda df: df.loc[:],
        lambda df: df.iloc[:, :],
        lambda df: df.iloc[:],
    ],
    ids=["getitem", "loc", "loc-rows", "iloc", "iloc-rows"],
)
def test_null_slice(backend, method, using_copy_on_write):
    """Index a DataFrame with a null slice (``:``) and mutate the result."""
    _, DataFrame, _ = backend
    parent = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
    parent_before = parent.copy()

    sliced = method(parent)

    # every variant must hand back a new object, with or without CoW
    assert sliced is not parent

    sliced.iloc[0, 0] = 0
    if not using_copy_on_write:
        # without CoW the write reaches the parent
        assert parent.iloc[0, 0] == 0
    else:
        tm.assert_frame_equal(parent, parent_before)
@pytest.mark.parametrize(
    "method",
    [
        lambda s: s[:],
        lambda s: s.loc[:],
        lambda s: s.iloc[:],
    ],
    ids=["getitem", "loc", "iloc"],
)
def test_null_slice_series(backend, method, using_copy_on_write):
    """Index a Series with a null slice (``:``) and mutate the result."""
    _, _, Series = backend
    ser = Series([1, 2, 3], index=["a", "b", "c"])
    ser_before = ser.copy()

    sliced = method(ser)

    # every variant must hand back a new object, with or without CoW
    assert sliced is not ser

    sliced.iloc[0] = 0
    if not using_copy_on_write:
        # without CoW the write reaches the parent
        assert ser.iloc[0] == 0
    else:
        tm.assert_series_equal(ser, ser_before)
# TODO add more tests modifying the parent
# -----------------------------------------------------------------------------
# Series -- Indexing operations taking subset + modifying the subset/parent
def test_series_getitem_slice(backend, using_copy_on_write):
    """Take ``s[:]`` of a Series, then modify the resulting subset."""
    _, _, Series = backend
    ser = Series([1, 2, 3], index=["a", "b", "c"])
    ser_before = ser.copy()

    subset = ser[:]
    assert np.shares_memory(get_array(subset), get_array(ser))

    subset.iloc[0] = 0

    if using_copy_on_write:
        # the write must have detached the subset from the parent
        assert not np.shares_memory(get_array(subset), get_array(ser))

    tm.assert_series_equal(subset, Series([0, 2, 3], index=["a", "b", "c"]))

    if not using_copy_on_write:
        # without CoW the write reaches the parent
        assert ser.iloc[0] == 0
    else:
        tm.assert_series_equal(ser, ser_before)
@pytest.mark.parametrize(
    "indexer",
    [slice(0, 2), np.array([True, True, False]), np.array([0, 1])],
    ids=["slice", "mask", "array"],
)
def test_series_subset_set_with_indexer(
    backend, indexer_si, indexer, using_copy_on_write
):
    """Set values with an indexer on a Series obtained through ``s[:]``."""
    _, _, Series = backend
    ser = Series([1, 2, 3], index=["a", "b", "c"])
    ser_before = ser.copy()
    subset = ser[:]

    indexer_si(subset)[indexer] = 0
    expected = Series([0, 0, 3], index=["a", "b", "c"])
    tm.assert_series_equal(subset, expected)

    # with CoW the parent keeps its values; otherwise it mirrors the subset
    parent_expected = ser_before if using_copy_on_write else expected
    tm.assert_series_equal(ser, parent_expected)
# -----------------------------------------------------------------------------
# del operator
def test_del_frame(backend, using_copy_on_write):
    """Delete a column with ``del`` on a frame obtained through ``df[:]``."""
    _, DataFrame, _ = backend
    parent = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    parent_before = parent.copy()
    child = parent[:]

    assert np.shares_memory(get_array(parent, "a"), get_array(child, "a"))

    del child["b"]

    # the remaining column "a" is still shared and the parent is unchanged
    assert np.shares_memory(get_array(parent, "a"), get_array(child, "a"))
    tm.assert_frame_equal(parent, parent_before)
    tm.assert_frame_equal(child, parent_before[["a", "c"]])
    child._mgr._verify_integrity()

    # TODO in theory modifying column "b" of the parent wouldn't need a CoW
    # but the weakref is still alive and so we still perform CoW

    child.loc[0, "a"] = 100
    if not using_copy_on_write:
        assert parent.loc[0, "a"] == 100
    else:
        # writing to the child after the deletion still leaves the parent alone
        tm.assert_frame_equal(parent, parent_before)
def test_del_series(backend):
    """Delete an element with ``del`` on a Series obtained through ``s[:]``."""
    _, _, Series = backend
    ser = Series([1, 2, 3], index=["a", "b", "c"])
    ser_before = ser.copy()
    child = ser[:]

    assert np.shares_memory(get_array(ser), get_array(child))

    del child["a"]

    assert not np.shares_memory(get_array(ser), get_array(child))
    tm.assert_series_equal(ser, ser_before)
    tm.assert_series_equal(child, ser_before[["b", "c"]])

    # after `del` the child is backed by a new array, so a write to it must
    # show up in the array grabbed beforehand (no copy is made on write)
    backing = child.values
    child.loc["b"] = 100
    assert backing[0] == 100
# -----------------------------------------------------------------------------
# Accessing column as Series
def test_column_as_series(backend, using_copy_on_write, using_array_manager):
    """Select a single column as a Series via ``df["a"]`` and write to it."""
    dtype_backend, DataFrame, Series = backend
    parent = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    parent_expected = parent.copy()

    col = parent["a"]

    assert np.shares_memory(get_array(col, "a"), get_array(parent, "a"))

    if using_copy_on_write or using_array_manager:
        col[0] = 0
    else:
        # a warning is only expected for the numpy backend
        warn = SettingWithCopyWarning if dtype_backend == "numpy" else None
        warn_option = pd.option_context("chained_assignment", "warn")
        with warn_option, tm.assert_produces_warning(warn):
            col[0] = 0

    tm.assert_series_equal(col, Series([0, 2, 3], name="a"))

    if not using_copy_on_write:
        parent_expected.iloc[0, 0] = 0
        tm.assert_frame_equal(parent, parent_expected)
    else:
        tm.assert_frame_equal(parent, parent_expected)
        # a fresh getitem must not return the modified series
        tm.assert_series_equal(parent["a"], parent_expected["a"])
def test_column_as_series_set_with_upcast(
    backend, using_copy_on_write, using_array_manager
):
    """
    Select a single column as a Series and assign a string into it.

    For the nullable backend the assignment must raise; otherwise the series
    is expected to end up with object dtype.
    """
    dtype_backend, DataFrame, Series = backend
    parent = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    parent_expected = parent.copy()

    col = parent["a"]
    if dtype_backend == "nullable":
        with pytest.raises(TypeError, match="Invalid value"):
            col[0] = "foo"
        expected = Series([1, 2, 3], name="a")
    else:
        if using_copy_on_write or using_array_manager:
            col[0] = "foo"
        else:
            warn_option = pd.option_context("chained_assignment", "warn")
            with warn_option, tm.assert_produces_warning(SettingWithCopyWarning):
                col[0] = "foo"
        expected = Series(["foo", 2, 3], dtype=object, name="a")

    tm.assert_series_equal(col, expected)

    if not using_copy_on_write:
        parent_expected["a"] = expected
        tm.assert_frame_equal(parent, parent_expected)
    else:
        tm.assert_frame_equal(parent, parent_expected)
        # a fresh getitem must not return the modified series
        tm.assert_series_equal(parent["a"], parent_expected["a"])
@pytest.mark.parametrize(
    "method",
    [
        lambda df: df["a"],
        lambda df: df.loc[:, "a"],
        lambda df: df.iloc[:, 0],
    ],
    ids=["getitem", "loc", "iloc"],
)
def test_column_as_series_no_item_cache(
    request, backend, method, using_copy_on_write, using_array_manager
):
    """
    Select the same column twice and check whether the two results are linked.

    With CoW (and always for the iloc variant) the two selections must be
    distinct objects; otherwise the same object is expected both times.
    """
    dtype_backend, DataFrame, _ = backend
    parent = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    parent_before = parent.copy()

    first = method(parent)
    second = method(parent)

    is_iloc = "iloc" in request.node.name
    if not (using_copy_on_write or is_iloc):
        assert first is second
    else:
        assert first is not second

    if using_copy_on_write or using_array_manager:
        first.iloc[0] = 0
    else:
        # a warning is only expected for the numpy backend
        warn = SettingWithCopyWarning if dtype_backend == "numpy" else None
        warn_option = pd.option_context("chained_assignment", "warn")
        with warn_option, tm.assert_produces_warning(warn):
            first.iloc[0] = 0

    if not using_copy_on_write:
        assert second.iloc[0] == 0
    else:
        tm.assert_series_equal(second, parent_before["a"])
        tm.assert_frame_equal(parent, parent_before)
# TODO add tests for other indexing methods on the Series
def test_dataframe_add_column_from_series(backend):
    """
    Add a new column to a DataFrame from an existing Series.

    The assignment is expected to take a copy, so later edits on either side
    must not be visible on the other.
    """
    # TODO can we achieve the same behaviour with Copy-on-Write?
    _, DataFrame, Series = backend
    frame = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
    source = Series([10, 11, 12])

    frame["new"] = source
    assert not np.shares_memory(get_array(frame, "new"), source.values)

    # editing the series must leave the frame's column alone
    source[0] = 0
    tm.assert_frame_equal(
        frame,
        DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "new": [10, 11, 12]}),
    )

    # editing the frame's column must leave the series alone
    frame.loc[2, "new"] = 100
    tm.assert_series_equal(source, Series([0, 11, 12]))
@pytest.mark.parametrize("val", [100, "a"])
@pytest.mark.parametrize(
    "indexer_func, indexer",
    [
        (tm.loc, (0, "a")),
        (tm.iloc, (0, 0)),
        (tm.loc, ([0], "a")),
        (tm.iloc, ([0], 0)),
        (tm.loc, (slice(None), "a")),
        (tm.iloc, (slice(None), 0)),
    ],
)
def test_set_value_copy_only_necessary_column(
    using_copy_on_write, indexer_func, indexer, val
):
    """
    Set a value in column "a" of a frame that has a ``df[:]`` view alive.

    With CoW only the modified column should stop sharing memory with the
    view, rather than the whole block.
    """
    # TODO multi-block only for now
    frame = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    frame_before = frame.copy()
    view = frame[:]

    indexer_func(frame)[indexer] = val

    if using_copy_on_write:
        assert np.shares_memory(get_array(frame, "b"), get_array(view, "b"))
        assert not np.shares_memory(get_array(frame, "a"), get_array(view, "a"))
        tm.assert_frame_equal(view, frame_before)
        return

    assert np.shares_memory(get_array(frame, "c"), get_array(view, "c"))
    # without CoW column "a" stays shared unless the string value was set
    a_is_shared = np.shares_memory(get_array(frame, "a"), get_array(view, "a"))
    if val == "a":
        assert not a_is_shared
    else:
        assert a_is_shared
def test_series_midx_slice(using_copy_on_write):
    """Select a first-level key from a MultiIndex Series and write to the result."""
    level_arrays = [[1, 1, 2], [3, 4, 5]]
    ser = Series([1, 2, 3], index=pd.MultiIndex.from_arrays(level_arrays))

    result = ser[1]
    assert np.shares_memory(get_array(ser), get_array(result))

    result.iloc[0] = 100
    if not using_copy_on_write:
        return
    # with CoW the parent must still hold its original values
    expected = Series([1, 2, 3], index=pd.MultiIndex.from_arrays(level_arrays))
    tm.assert_series_equal(ser, expected)
def test_getitem_midx_slice(using_copy_on_write, using_array_manager):
    """Select a first-level key from MultiIndex columns and write to the result."""
    frame = DataFrame({("a", "x"): [1, 2], ("a", "y"): 1, ("b", "x"): 2})
    frame_before = frame.copy()
    level_a = frame[("a",)]

    if using_copy_on_write:
        # the selection must be tracked as referencing other data
        assert not level_a._mgr._has_no_reference(0)

    if not using_array_manager:
        assert np.shares_memory(get_array(frame, ("a", "x")), get_array(level_a, "x"))

    if not using_copy_on_write:
        return
    # with CoW a write to the selection must leave the parent untouched
    level_a.iloc[0, 0] = 100
    tm.assert_frame_equal(frame_before, frame)
def test_series_midx_tuples_slice(using_copy_on_write):
    """Select a tuple-valued first-level key from a MultiIndex Series and write to it."""
    index_tuples = [((1, 2), 3), ((1, 2), 4), ((2, 3), 4)]
    ser = Series([1, 2, 3], index=pd.MultiIndex.from_tuples(index_tuples))

    result = ser[(1, 2)]
    assert np.shares_memory(get_array(ser), get_array(result))

    result.iloc[0] = 100
    if not using_copy_on_write:
        return
    # with CoW the parent must still hold its original values
    expected = Series([1, 2, 3], index=pd.MultiIndex.from_tuples(index_tuples))
    tm.assert_series_equal(ser, expected)