Inzynierka/Lib/site-packages/pandas/tests/arrays/test_array.py
2023-06-02 12:51:02 +02:00

432 lines
14 KiB
Python

import datetime
import decimal
import numpy as np
import pytest
import pytz
from pandas.core.dtypes.base import _registry as registry
import pandas as pd
import pandas._testing as tm
from pandas.api.extensions import register_extension_dtype
from pandas.arrays import (
BooleanArray,
DatetimeArray,
FloatingArray,
IntegerArray,
IntervalArray,
SparseArray,
TimedeltaArray,
)
from pandas.core.arrays import (
PandasArray,
period_array,
)
from pandas.tests.extension.decimal import (
DecimalArray,
DecimalDtype,
to_decimal,
)
@pytest.mark.parametrize(
"data, dtype, expected",
[
# Basic NumPy defaults.
([1, 2], None, IntegerArray._from_sequence([1, 2])),
([1, 2], object, PandasArray(np.array([1, 2], dtype=object))),
(
[1, 2],
np.dtype("float32"),
PandasArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))),
),
(np.array([1, 2], dtype="int64"), None, IntegerArray._from_sequence([1, 2])),
(
np.array([1.0, 2.0], dtype="float64"),
None,
FloatingArray._from_sequence([1.0, 2.0]),
),
# String alias passes through to NumPy
([1, 2], "float32", PandasArray(np.array([1, 2], dtype="float32"))),
([1, 2], "int64", PandasArray(np.array([1, 2], dtype=np.int64))),
# GH#44715 FloatingArray does not support float16, so fall back to PandasArray
(
np.array([1, 2], dtype=np.float16),
None,
PandasArray(np.array([1, 2], dtype=np.float16)),
),
# idempotency with e.g. pd.array(pd.array([1, 2], dtype="int64"))
(
PandasArray(np.array([1, 2], dtype=np.int32)),
None,
PandasArray(np.array([1, 2], dtype=np.int32)),
),
# Period alias
(
[pd.Period("2000", "D"), pd.Period("2001", "D")],
"Period[D]",
period_array(["2000", "2001"], freq="D"),
),
# Period dtype
(
[pd.Period("2000", "D")],
pd.PeriodDtype("D"),
period_array(["2000"], freq="D"),
),
# Datetime (naive)
(
[1, 2],
np.dtype("datetime64[ns]"),
DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")),
),
(
np.array([1, 2], dtype="datetime64[ns]"),
None,
DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")),
),
(
pd.DatetimeIndex(["2000", "2001"]),
np.dtype("datetime64[ns]"),
DatetimeArray._from_sequence(["2000", "2001"]),
),
(
pd.DatetimeIndex(["2000", "2001"]),
None,
DatetimeArray._from_sequence(["2000", "2001"]),
),
(
["2000", "2001"],
np.dtype("datetime64[ns]"),
DatetimeArray._from_sequence(["2000", "2001"]),
),
# Datetime (tz-aware)
(
["2000", "2001"],
pd.DatetimeTZDtype(tz="CET"),
DatetimeArray._from_sequence(
["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET")
),
),
# Timedelta
(
["1H", "2H"],
np.dtype("timedelta64[ns]"),
TimedeltaArray._from_sequence(["1H", "2H"]),
),
(
pd.TimedeltaIndex(["1H", "2H"]),
np.dtype("timedelta64[ns]"),
TimedeltaArray._from_sequence(["1H", "2H"]),
),
(
pd.TimedeltaIndex(["1H", "2H"]),
None,
TimedeltaArray._from_sequence(["1H", "2H"]),
),
(
# preserve non-nano, i.e. don't cast to PandasArray
TimedeltaArray._simple_new(
np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
),
None,
TimedeltaArray._simple_new(
np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
),
),
(
# preserve non-nano, i.e. don't cast to PandasArray
TimedeltaArray._simple_new(
np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
),
np.dtype("m8[s]"),
TimedeltaArray._simple_new(
np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
),
),
# Category
(["a", "b"], "category", pd.Categorical(["a", "b"])),
(
["a", "b"],
pd.CategoricalDtype(None, ordered=True),
pd.Categorical(["a", "b"], ordered=True),
),
# Interval
(
[pd.Interval(1, 2), pd.Interval(3, 4)],
"interval",
IntervalArray.from_tuples([(1, 2), (3, 4)]),
),
# Sparse
([0, 1], "Sparse[int64]", SparseArray([0, 1], dtype="int64")),
# IntegerNA
([1, None], "Int16", pd.array([1, None], dtype="Int16")),
(pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
# String
(
["a", None],
"string",
pd.StringDtype().construct_array_type()._from_sequence(["a", None]),
),
(
["a", None],
pd.StringDtype(),
pd.StringDtype().construct_array_type()._from_sequence(["a", None]),
),
# Boolean
([True, None], "boolean", BooleanArray._from_sequence([True, None])),
([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None])),
# Index
(pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
# Series[EA] returns the EA
(
pd.Series(pd.Categorical(["a", "b"], categories=["a", "b", "c"])),
None,
pd.Categorical(["a", "b"], categories=["a", "b", "c"]),
),
# "3rd party" EAs work
([decimal.Decimal(0), decimal.Decimal(1)], "decimal", to_decimal([0, 1])),
# pass an ExtensionArray, but a different dtype
(
period_array(["2000", "2001"], freq="D"),
"category",
pd.Categorical([pd.Period("2000", "D"), pd.Period("2001", "D")]),
),
],
)
def test_array(data, dtype, expected):
result = pd.array(data, dtype=dtype)
tm.assert_equal(result, expected)
def test_array_copy():
a = np.array([1, 2])
# default is to copy
b = pd.array(a, dtype=a.dtype)
assert not tm.shares_memory(a, b)
# copy=True
b = pd.array(a, dtype=a.dtype, copy=True)
assert not tm.shares_memory(a, b)
# copy=False
b = pd.array(a, dtype=a.dtype, copy=False)
assert tm.shares_memory(a, b)
cet = pytz.timezone("CET")
@pytest.mark.parametrize(
"data, expected",
[
# period
(
[pd.Period("2000", "D"), pd.Period("2001", "D")],
period_array(["2000", "2001"], freq="D"),
),
# interval
([pd.Interval(0, 1), pd.Interval(1, 2)], IntervalArray.from_breaks([0, 1, 2])),
# datetime
(
[pd.Timestamp("2000"), pd.Timestamp("2001")],
DatetimeArray._from_sequence(["2000", "2001"]),
),
(
[datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)],
DatetimeArray._from_sequence(["2000", "2001"]),
),
(
np.array([1, 2], dtype="M8[ns]"),
DatetimeArray(np.array([1, 2], dtype="M8[ns]")),
),
(
np.array([1, 2], dtype="M8[us]"),
DatetimeArray._simple_new(
np.array([1, 2], dtype="M8[us]"), dtype=np.dtype("M8[us]")
),
),
# datetimetz
(
[pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")],
DatetimeArray._from_sequence(
["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET")
),
),
(
[
datetime.datetime(2000, 1, 1, tzinfo=cet),
datetime.datetime(2001, 1, 1, tzinfo=cet),
],
DatetimeArray._from_sequence(
["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet)
),
),
# timedelta
(
[pd.Timedelta("1H"), pd.Timedelta("2H")],
TimedeltaArray._from_sequence(["1H", "2H"]),
),
(
np.array([1, 2], dtype="m8[ns]"),
TimedeltaArray(np.array([1, 2], dtype="m8[ns]")),
),
(
np.array([1, 2], dtype="m8[us]"),
TimedeltaArray(np.array([1, 2], dtype="m8[us]")),
),
# integer
([1, 2], IntegerArray._from_sequence([1, 2])),
([1, None], IntegerArray._from_sequence([1, None])),
([1, pd.NA], IntegerArray._from_sequence([1, pd.NA])),
([1, np.nan], IntegerArray._from_sequence([1, np.nan])),
# float
([0.1, 0.2], FloatingArray._from_sequence([0.1, 0.2])),
([0.1, None], FloatingArray._from_sequence([0.1, pd.NA])),
([0.1, np.nan], FloatingArray._from_sequence([0.1, pd.NA])),
([0.1, pd.NA], FloatingArray._from_sequence([0.1, pd.NA])),
# integer-like float
([1.0, 2.0], FloatingArray._from_sequence([1.0, 2.0])),
([1.0, None], FloatingArray._from_sequence([1.0, pd.NA])),
([1.0, np.nan], FloatingArray._from_sequence([1.0, pd.NA])),
([1.0, pd.NA], FloatingArray._from_sequence([1.0, pd.NA])),
# mixed-integer-float
([1, 2.0], FloatingArray._from_sequence([1.0, 2.0])),
([1, np.nan, 2.0], FloatingArray._from_sequence([1.0, None, 2.0])),
# string
(
["a", "b"],
pd.StringDtype().construct_array_type()._from_sequence(["a", "b"]),
),
(
["a", None],
pd.StringDtype().construct_array_type()._from_sequence(["a", None]),
),
# Boolean
([True, False], BooleanArray._from_sequence([True, False])),
([True, None], BooleanArray._from_sequence([True, None])),
],
)
def test_array_inference(data, expected):
result = pd.array(data)
tm.assert_equal(result, expected)
@pytest.mark.parametrize(
"data",
[
# mix of frequencies
[pd.Period("2000", "D"), pd.Period("2001", "A")],
# mix of closed
[pd.Interval(0, 1, closed="left"), pd.Interval(1, 2, closed="right")],
# Mix of timezones
[pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000", tz="UTC")],
# Mix of tz-aware and tz-naive
[pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000")],
np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]),
],
)
def test_array_inference_fails(data):
result = pd.array(data)
expected = PandasArray(np.array(data, dtype=object))
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("data", [np.array(0)])
def test_nd_raises(data):
with pytest.raises(ValueError, match="PandasArray must be 1-dimensional"):
pd.array(data, dtype="int64")
def test_scalar_raises():
with pytest.raises(ValueError, match="Cannot pass scalar '1'"):
pd.array(1)
def test_dataframe_raises():
# GH#51167 don't accidentally cast to StringArray by doing inference on columns
df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
msg = "Cannot pass DataFrame to 'pandas.array'"
with pytest.raises(TypeError, match=msg):
pd.array(df)
def test_bounds_check():
# GH21796
with pytest.raises(
TypeError, match=r"cannot safely cast non-equivalent int(32|64) to uint16"
):
pd.array([-1, 2, 3], dtype="UInt16")
# ---------------------------------------------------------------------------
# A couple dummy classes to ensure that Series and Indexes are unboxed before
# getting to the EA classes.
@register_extension_dtype
class DecimalDtype2(DecimalDtype):
name = "decimal2"
@classmethod
def construct_array_type(cls):
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
return DecimalArray2
class DecimalArray2(DecimalArray):
@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
if isinstance(scalars, (pd.Series, pd.Index)):
raise TypeError("scalars should not be of type pd.Series or pd.Index")
return super()._from_sequence(scalars, dtype=dtype, copy=copy)
def test_array_unboxes(index_or_series):
box = index_or_series
data = box([decimal.Decimal("1"), decimal.Decimal("2")])
# make sure it works
with pytest.raises(
TypeError, match="scalars should not be of type pd.Series or pd.Index"
):
DecimalArray2._from_sequence(data)
result = pd.array(data, dtype="decimal2")
expected = DecimalArray2._from_sequence(data.values)
tm.assert_equal(result, expected)
@pytest.fixture
def registry_without_decimal():
"""Fixture yielding 'registry' with no DecimalDtype entries"""
idx = registry.dtypes.index(DecimalDtype)
registry.dtypes.pop(idx)
yield
registry.dtypes.append(DecimalDtype)
def test_array_not_registered(registry_without_decimal):
# check we aren't on it
assert registry.find("decimal") is None
data = [decimal.Decimal("1"), decimal.Decimal("2")]
result = pd.array(data, dtype=DecimalDtype)
expected = DecimalArray._from_sequence(data)
tm.assert_equal(result, expected)
def test_array_to_numpy_na():
# GH#40638
arr = pd.array([pd.NA, 1], dtype="string")
result = arr.to_numpy(na_value=True, dtype=bool)
expected = np.array([True, True])
tm.assert_numpy_array_equal(result, expected)