# 3RNN/Lib/site-packages/pandas/tests/arrays/test_array.py

import datetime
import decimal
import re

import numpy as np
import pytest
import pytz

import pandas as pd
import pandas._testing as tm
from pandas.api.extensions import register_extension_dtype
from pandas.arrays import (
    BooleanArray,
    DatetimeArray,
    FloatingArray,
    IntegerArray,
    IntervalArray,
    SparseArray,
    TimedeltaArray,
)
from pandas.core.arrays import (
    NumpyExtensionArray,
    period_array,
)
from pandas.tests.extension.decimal import (
    DecimalArray,
    DecimalDtype,
    to_decimal,
)


@pytest.mark.parametrize("dtype_unit", ["M8[h]", "M8[m]", "m8[h]", "m8[m]"])
def test_dt64_array(dtype_unit):
    """
    Check that ``pd.array`` warns for unsupported datetime-like resolutions.

    Both datetime64 (``M8``) and timedelta64 (``m8``) dtypes are exercised at
    hour and minute resolution; each is expected to emit the deprecation
    ``FutureWarning`` quoted below.

    Parameters
    ----------
    dtype_unit : str
        NumPy dtype string with a resolution outside 's', 'ms', 'us', 'ns'.
    """
    # PR 53817
    dtype_var = np.dtype(dtype_unit)
    msg = (
        r"datetime64 and timedelta64 dtype resolutions other than "
        r"'s', 'ms', 'us', and 'ns' are deprecated. "
        r"In future releases passing unsupported resolutions will "
        r"raise an exception."
    )
    # The message contains regex metacharacters ('.', quotes), so escape it
    # before handing it to ``match``.
    with tm.assert_produces_warning(FutureWarning, match=re.escape(msg)):
        pd.array([], dtype=dtype_var)


# Each case below is a ``(data, dtype, expected)`` triple: ``pd.array(data,
# dtype=dtype)`` must compare equal to ``expected``.
@pytest.mark.parametrize(
    "data, dtype, expected",
    [
        # Basic NumPy defaults.
        ([], None, FloatingArray._from_sequence([], dtype="Float64")),
        ([1, 2], None, IntegerArray._from_sequence([1, 2], dtype="Int64")),
        ([1, 2], object, NumpyExtensionArray(np.array([1, 2], dtype=object))),
        (
            [1, 2],
            np.dtype("float32"),
            NumpyExtensionArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))),
        ),
        (
            np.array([], dtype=object),
            None,
            NumpyExtensionArray(np.array([], dtype=object)),
        ),
        (
            np.array([1, 2], dtype="int64"),
            None,
            IntegerArray._from_sequence([1, 2], dtype="Int64"),
        ),
        (
            np.array([1.0, 2.0], dtype="float64"),
            None,
            FloatingArray._from_sequence([1.0, 2.0], dtype="Float64"),
        ),
        # String alias passes through to NumPy
        ([1, 2], "float32", NumpyExtensionArray(np.array([1, 2], dtype="float32"))),
        ([1, 2], "int64", NumpyExtensionArray(np.array([1, 2], dtype=np.int64))),
        # GH#44715 FloatingArray does not support float16, so fall
        #  back to NumpyExtensionArray
        (
            np.array([1, 2], dtype=np.float16),
            None,
            NumpyExtensionArray(np.array([1, 2], dtype=np.float16)),
        ),
        # idempotency: a NumpyExtensionArray passed with dtype=None is expected
        #  back unchanged (this case uses int32 data),
        #  cf. pd.array(pd.array([1, 2], dtype="int64"))
        (
            NumpyExtensionArray(np.array([1, 2], dtype=np.int32)),
            None,
            NumpyExtensionArray(np.array([1, 2], dtype=np.int32)),
        ),
        # Period alias
        (
            [pd.Period("2000", "D"), pd.Period("2001", "D")],
            "Period[D]",
            period_array(["2000", "2001"], freq="D"),
        ),
        # Period dtype
        (
            [pd.Period("2000", "D")],
            pd.PeriodDtype("D"),
            period_array(["2000"], freq="D"),
        ),
        # Datetime (naive)
        (
            [1, 2],
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(
                np.array([1, 2], dtype="M8[ns]"), dtype="M8[ns]"
            ),
        ),
        (
            [1, 2],
            np.dtype("datetime64[s]"),
            DatetimeArray._from_sequence(
                np.array([1, 2], dtype="M8[s]"), dtype="M8[s]"
            ),
        ),
        (
            np.array([1, 2], dtype="datetime64[ns]"),
            None,
            DatetimeArray._from_sequence(
                np.array([1, 2], dtype="M8[ns]"), dtype="M8[ns]"
            ),
        ),
        (
            pd.DatetimeIndex(["2000", "2001"]),
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            pd.DatetimeIndex(["2000", "2001"]),
            None,
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            ["2000", "2001"],
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        # Datetime (tz-aware)
        (
            ["2000", "2001"],
            pd.DatetimeTZDtype(tz="CET"),
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET")
            ),
        ),
        # Timedelta
        (
            ["1h", "2h"],
            np.dtype("timedelta64[ns]"),
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            pd.TimedeltaIndex(["1h", "2h"]),
            np.dtype("timedelta64[ns]"),
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            np.array([1, 2], dtype="m8[s]"),
            np.dtype("timedelta64[s]"),
            TimedeltaArray._from_sequence(
                np.array([1, 2], dtype="m8[s]"), dtype="m8[s]"
            ),
        ),
        (
            pd.TimedeltaIndex(["1h", "2h"]),
            None,
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            # preserve non-nano, i.e. don't cast to NumpyExtensionArray
            #  (dtype=None variant)
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
            None,
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
        ),
        (
            # preserve non-nano, i.e. don't cast to NumpyExtensionArray
            #  (explicit matching dtype variant)
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
            np.dtype("m8[s]"),
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
        ),
        # Category
        (["a", "b"], "category", pd.Categorical(["a", "b"])),
        (
            ["a", "b"],
            pd.CategoricalDtype(None, ordered=True),
            pd.Categorical(["a", "b"], ordered=True),
        ),
        # Interval
        (
            [pd.Interval(1, 2), pd.Interval(3, 4)],
            "interval",
            IntervalArray.from_tuples([(1, 2), (3, 4)]),
        ),
        # Sparse
        ([0, 1], "Sparse[int64]", SparseArray([0, 1], dtype="int64")),
        # IntegerNA
        ([1, None], "Int16", pd.array([1, None], dtype="Int16")),
        # Series of ints with dtype=None: expected is a NumpyExtensionArray,
        #  not a masked IntegerArray
        (
            pd.Series([1, 2]),
            None,
            NumpyExtensionArray(np.array([1, 2], dtype=np.int64)),
        ),
        # String (alias, then dtype instance)
        (
            ["a", None],
            "string",
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype()),
        ),
        (
            ["a", None],
            pd.StringDtype(),
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype()),
        ),
        # Boolean (alias, then dtype instance)
        (
            [True, None],
            "boolean",
            BooleanArray._from_sequence([True, None], dtype="boolean"),
        ),
        (
            [True, None],
            pd.BooleanDtype(),
            BooleanArray._from_sequence([True, None], dtype="boolean"),
        ),
        # Index
        (pd.Index([1, 2]), None, NumpyExtensionArray(np.array([1, 2], dtype=np.int64))),
        # Series[EA] returns the EA
        (
            pd.Series(pd.Categorical(["a", "b"], categories=["a", "b", "c"])),
            None,
            pd.Categorical(["a", "b"], categories=["a", "b", "c"]),
        ),
        # "3rd party" EAs work
        ([decimal.Decimal(0), decimal.Decimal(1)], "decimal", to_decimal([0, 1])),
        # pass an ExtensionArray, but a different dtype
        (
            period_array(["2000", "2001"], freq="D"),
            "category",
            pd.Categorical([pd.Period("2000", "D"), pd.Period("2001", "D")]),
        ),
    ],
)
def test_array(data, dtype, expected):
    """
    Check ``pd.array(data, dtype=dtype)`` against a pre-built expected array.

    Parameters
    ----------
    data : object
        Input passed to ``pd.array`` (list, ndarray, Index, Series or array).
    dtype : object
        Value passed as ``dtype``; ``None`` in the cases that rely on inference.
    expected : object
        Array that the result is compared to with ``tm.assert_equal``.
    """
    result = pd.array(data, dtype=dtype)
    tm.assert_equal(result, expected)


def test_array_copy():
    """
    Check the ``copy`` keyword of ``pd.array`` on a NumPy input.

    Leaving ``copy`` out and passing ``copy=True`` must both give an array that
    does not share memory with the input; ``copy=False`` must share it.
    """
    source = np.array([1, 2])

    # default (no keyword) and explicit copy=True are both expected to copy
    for copy_kwargs in ({}, {"copy": True}):
        copied = pd.array(source, dtype=source.dtype, **copy_kwargs)
        assert not tm.shares_memory(source, copied)

    # copy=False keeps the original buffer
    shared = pd.array(source, dtype=source.dtype, copy=False)
    assert tm.shares_memory(source, shared)


# pytz "CET" timezone object, used as ``tzinfo`` / ``tz`` by the tz-aware
# datetime.datetime case in the test_array_inference parametrization.
cet = pytz.timezone("CET")


# Each case below is a ``(data, expected)`` pair: ``pd.array(data)`` with no
# dtype given must compare equal to ``expected``.
@pytest.mark.parametrize(
    "data, expected",
    [
        # period
        (
            [pd.Period("2000", "D"), pd.Period("2001", "D")],
            period_array(["2000", "2001"], freq="D"),
        ),
        # interval
        ([pd.Interval(0, 1), pd.Interval(1, 2)], IntervalArray.from_breaks([0, 1, 2])),
        # datetime
        (
            [pd.Timestamp("2000"), pd.Timestamp("2001")],
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)],
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            np.array([1, 2], dtype="M8[ns]"),
            DatetimeArray._from_sequence(np.array([1, 2], dtype="M8[ns]")),
        ),
        # microsecond input: expected array keeps the M8[us] dtype
        (
            np.array([1, 2], dtype="M8[us]"),
            DatetimeArray._simple_new(
                np.array([1, 2], dtype="M8[us]"), dtype=np.dtype("M8[us]")
            ),
        ),
        # datetimetz
        (
            [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")],
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="ns")
            ),
        ),
        # stdlib datetimes carrying the module-level pytz ``cet`` tzinfo
        (
            [
                datetime.datetime(2000, 1, 1, tzinfo=cet),
                datetime.datetime(2001, 1, 1, tzinfo=cet),
            ],
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet, unit="ns")
            ),
        ),
        # timedelta
        (
            [pd.Timedelta("1h"), pd.Timedelta("2h")],
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            np.array([1, 2], dtype="m8[ns]"),
            TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[ns]")),
        ),
        (
            np.array([1, 2], dtype="m8[us]"),
            TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[us]")),
        ),
        # integer (None, pd.NA and np.nan are each paired with an Int64 result)
        ([1, 2], IntegerArray._from_sequence([1, 2], dtype="Int64")),
        ([1, None], IntegerArray._from_sequence([1, None], dtype="Int64")),
        ([1, pd.NA], IntegerArray._from_sequence([1, pd.NA], dtype="Int64")),
        ([1, np.nan], IntegerArray._from_sequence([1, np.nan], dtype="Int64")),
        # float (None, np.nan and pd.NA are each expected to come out as pd.NA)
        ([0.1, 0.2], FloatingArray._from_sequence([0.1, 0.2], dtype="Float64")),
        ([0.1, None], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
        ([0.1, np.nan], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
        ([0.1, pd.NA], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
        # integer-like float (expected stays Float64, not Int64)
        ([1.0, 2.0], FloatingArray._from_sequence([1.0, 2.0], dtype="Float64")),
        ([1.0, None], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
        ([1.0, np.nan], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
        ([1.0, pd.NA], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
        # mixed-integer-float
        ([1, 2.0], FloatingArray._from_sequence([1.0, 2.0], dtype="Float64")),
        (
            [1, np.nan, 2.0],
            FloatingArray._from_sequence([1.0, None, 2.0], dtype="Float64"),
        ),
        # string
        (
            ["a", "b"],
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", "b"], dtype=pd.StringDtype()),
        ),
        (
            ["a", None],
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype()),
        ),
        # Boolean
        ([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")),
        ([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")),
    ],
)
def test_array_inference(data, expected):
    """
    Check the array type ``pd.array`` infers when no dtype is passed.

    Parameters
    ----------
    data : object
        Input handed to ``pd.array`` without a ``dtype`` argument.
    expected : object
        Array that the result is compared to with ``tm.assert_equal``.
    """
    result = pd.array(data)
    tm.assert_equal(result, expected)


@pytest.mark.parametrize(
    "data",
    [
        # mix of frequencies
        [pd.Period("2000", "D"), pd.Period("2001", "Y")],
        # mix of closed
        [pd.Interval(0, 1, closed="left"), pd.Interval(1, 2, closed="right")],
        # Mix of timezones
        [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000", tz="UTC")],
        # Mix of tz-aware and tz-naive
        [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000")],
        np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]),
    ],
)
def test_array_inference_fails(data):
    """
    Check the fallback for inputs whose elements do not agree with each other.

    For every heterogeneous input above, ``pd.array(data)`` is expected to be
    an object-dtype ``NumpyExtensionArray`` holding the elements as given.
    """
    object_values = np.array(data, dtype=object)
    tm.assert_extension_array_equal(
        pd.array(data), NumpyExtensionArray(object_values)
    )


@pytest.mark.parametrize("data", [np.array(0)])
def test_nd_raises(data):
    """Check that ``pd.array`` raises ValueError for input that is not 1-dimensional."""
    expected_msg = "NumpyExtensionArray must be 1-dimensional"
    with pytest.raises(ValueError, match=expected_msg):
        pd.array(data, dtype="int64")


def test_scalar_raises():
    """Check that handing a bare scalar to ``pd.array`` raises ValueError."""
    expected_msg = "Cannot pass scalar '1'"
    with pytest.raises(ValueError, match=expected_msg):
        pd.array(1)


def test_dataframe_raises():
    """Check that ``pd.array`` rejects a DataFrame with a TypeError."""
    # GH#51167 don't accidentally cast to StringArray by doing inference on columns
    frame = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
    with pytest.raises(TypeError, match="Cannot pass DataFrame to 'pandas.array'"):
        pd.array(frame)


def test_bounds_check():
    """Check that a negative value cannot be placed into a ``UInt16`` array."""
    # GH21796
    # The source integer width differs by platform, hence the (32|64) group.
    pattern = r"cannot safely cast non-equivalent int(32|64) to uint16"
    with pytest.raises(TypeError, match=pattern):
        pd.array([-1, 2, 3], dtype="UInt16")


# ---------------------------------------------------------------------------
# A couple of dummy classes to ensure that Series and Indexes are unboxed
# before getting to the EA classes.


@register_extension_dtype
class DecimalDtype2(DecimalDtype):
    """``DecimalDtype`` subclass registered under the name ``"decimal2"``."""

    name = "decimal2"

    @classmethod
    def construct_array_type(cls):
        """
        Give back the array class paired with this dtype.

        Returns
        -------
        type
            The ``DecimalArray2`` class.
        """
        return DecimalArray2


class DecimalArray2(DecimalArray):
    """``DecimalArray`` variant whose constructor refuses Series/Index input."""

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        """
        Build the array from ``scalars`` via the parent implementation.

        Raises
        ------
        TypeError
            If ``scalars`` is a ``pd.Series`` or ``pd.Index``.
        """
        boxed_types = (pd.Series, pd.Index)
        if not isinstance(scalars, boxed_types):
            return super()._from_sequence(scalars, dtype=dtype, copy=copy)

        raise TypeError("scalars should not be of type pd.Series or pd.Index")


def test_array_unboxes(index_or_series):
    """
    Check that ``pd.array`` unwraps a Series/Index before building the array.

    ``DecimalArray2._from_sequence`` raises on boxed input, so a successful
    ``pd.array(..., dtype="decimal2")`` call shows the box was removed first.
    """
    decimals = [decimal.Decimal("1"), decimal.Decimal("2")]
    boxed = index_or_series(decimals)
    decimal2_dtype = DecimalDtype2()

    # make sure the guard itself works when called directly with the box
    guard_msg = "scalars should not be of type pd.Series or pd.Index"
    with pytest.raises(TypeError, match=guard_msg):
        DecimalArray2._from_sequence(boxed, dtype=decimal2_dtype)

    expected = DecimalArray2._from_sequence(boxed.values, dtype=decimal2_dtype)
    result = pd.array(boxed, dtype="decimal2")
    tm.assert_equal(result, expected)


def test_array_to_numpy_na():
    """
    Check ``to_numpy(na_value=True, dtype=bool)`` on a python-backed string array.

    The array holds ``pd.NA`` and ``1``; the expected boolean result is
    ``[True, True]``.
    """
    # GH#40638
    string_arr = pd.array([pd.NA, 1], dtype="string[python]")
    tm.assert_numpy_array_equal(
        string_arr.to_numpy(na_value=True, dtype=bool),
        np.array([True, True]),
    )