479 lines
15 KiB
Python
479 lines
15 KiB
Python
|
import datetime
|
||
|
import decimal
|
||
|
import re
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
import pytz
|
||
|
|
||
|
import pandas as pd
|
||
|
import pandas._testing as tm
|
||
|
from pandas.api.extensions import register_extension_dtype
|
||
|
from pandas.arrays import (
|
||
|
BooleanArray,
|
||
|
DatetimeArray,
|
||
|
FloatingArray,
|
||
|
IntegerArray,
|
||
|
IntervalArray,
|
||
|
SparseArray,
|
||
|
TimedeltaArray,
|
||
|
)
|
||
|
from pandas.core.arrays import (
|
||
|
NumpyExtensionArray,
|
||
|
period_array,
|
||
|
)
|
||
|
from pandas.tests.extension.decimal import (
|
||
|
DecimalArray,
|
||
|
DecimalDtype,
|
||
|
to_decimal,
|
||
|
)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("dtype_unit", ["M8[h]", "M8[m]", "m8[h]", "m8[m]"])
def test_dt64_array(dtype_unit):
    """
    Check that ``pd.array`` warns for unsupported datetime-like resolutions.

    Parameters
    ----------
    dtype_unit : str
        NumPy dtype string for a datetime64 (``M8``) or timedelta64 (``m8``)
        dtype using hour or minute resolution.
    """
    # PR 53817
    dtype_var = np.dtype(dtype_unit)
    msg = (
        r"datetime64 and timedelta64 dtype resolutions other than "
        r"'s', 'ms', 'us', and 'ns' are deprecated. "
        r"In future releases passing unsupported resolutions will "
        r"raise an exception."
    )
    # The message contains regex metacharacters ('.'), so escape it before
    # handing it to ``match``.
    with tm.assert_produces_warning(FutureWarning, match=re.escape(msg)):
        pd.array([], dtype=dtype_var)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "data, dtype, expected",
    [
        # Basic NumPy defaults.
        ([], None, FloatingArray._from_sequence([], dtype="Float64")),
        ([1, 2], None, IntegerArray._from_sequence([1, 2], dtype="Int64")),
        ([1, 2], object, NumpyExtensionArray(np.array([1, 2], dtype=object))),
        (
            [1, 2],
            np.dtype("float32"),
            NumpyExtensionArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))),
        ),
        (
            np.array([], dtype=object),
            None,
            NumpyExtensionArray(np.array([], dtype=object)),
        ),
        (
            np.array([1, 2], dtype="int64"),
            None,
            IntegerArray._from_sequence([1, 2], dtype="Int64"),
        ),
        (
            np.array([1.0, 2.0], dtype="float64"),
            None,
            FloatingArray._from_sequence([1.0, 2.0], dtype="Float64"),
        ),
        # String alias passes through to NumPy
        ([1, 2], "float32", NumpyExtensionArray(np.array([1, 2], dtype="float32"))),
        ([1, 2], "int64", NumpyExtensionArray(np.array([1, 2], dtype=np.int64))),
        # GH#44715 FloatingArray does not support float16, so fall
        # back to NumpyExtensionArray
        (
            np.array([1, 2], dtype=np.float16),
            None,
            NumpyExtensionArray(np.array([1, 2], dtype=np.float16)),
        ),
        # idempotency with e.g. pd.array(pd.array([1, 2], dtype="int64"))
        (
            NumpyExtensionArray(np.array([1, 2], dtype=np.int32)),
            None,
            NumpyExtensionArray(np.array([1, 2], dtype=np.int32)),
        ),
        # Period alias
        (
            [pd.Period("2000", "D"), pd.Period("2001", "D")],
            "Period[D]",
            period_array(["2000", "2001"], freq="D"),
        ),
        # Period dtype
        (
            [pd.Period("2000", "D")],
            pd.PeriodDtype("D"),
            period_array(["2000"], freq="D"),
        ),
        # Datetime (naive)
        (
            [1, 2],
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(
                np.array([1, 2], dtype="M8[ns]"), dtype="M8[ns]"
            ),
        ),
        (
            [1, 2],
            np.dtype("datetime64[s]"),
            DatetimeArray._from_sequence(
                np.array([1, 2], dtype="M8[s]"), dtype="M8[s]"
            ),
        ),
        (
            np.array([1, 2], dtype="datetime64[ns]"),
            None,
            DatetimeArray._from_sequence(
                np.array([1, 2], dtype="M8[ns]"), dtype="M8[ns]"
            ),
        ),
        (
            pd.DatetimeIndex(["2000", "2001"]),
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            pd.DatetimeIndex(["2000", "2001"]),
            None,
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            ["2000", "2001"],
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        # Datetime (tz-aware)
        (
            ["2000", "2001"],
            pd.DatetimeTZDtype(tz="CET"),
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET")
            ),
        ),
        # Timedelta
        (
            ["1h", "2h"],
            np.dtype("timedelta64[ns]"),
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            pd.TimedeltaIndex(["1h", "2h"]),
            np.dtype("timedelta64[ns]"),
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            np.array([1, 2], dtype="m8[s]"),
            np.dtype("timedelta64[s]"),
            TimedeltaArray._from_sequence(
                np.array([1, 2], dtype="m8[s]"), dtype="m8[s]"
            ),
        ),
        (
            pd.TimedeltaIndex(["1h", "2h"]),
            None,
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            # preserve non-nano, i.e. don't cast to NumpyExtensionArray
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
            None,
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
        ),
        (
            # preserve non-nano, i.e. don't cast to NumpyExtensionArray
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
            np.dtype("m8[s]"),
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
        ),
        # Category
        (["a", "b"], "category", pd.Categorical(["a", "b"])),
        (
            ["a", "b"],
            pd.CategoricalDtype(None, ordered=True),
            pd.Categorical(["a", "b"], ordered=True),
        ),
        # Interval
        (
            [pd.Interval(1, 2), pd.Interval(3, 4)],
            "interval",
            IntervalArray.from_tuples([(1, 2), (3, 4)]),
        ),
        # Sparse
        ([0, 1], "Sparse[int64]", SparseArray([0, 1], dtype="int64")),
        # IntegerNA
        # Build the expectation with the array constructor rather than with
        # pd.array itself, so the case does not compare pd.array to pd.array.
        ([1, None], "Int16", IntegerArray._from_sequence([1, None], dtype="Int16")),
        # Series of a NumPy dtype with no dtype requested
        (
            pd.Series([1, 2]),
            None,
            NumpyExtensionArray(np.array([1, 2], dtype=np.int64)),
        ),
        # String
        (
            ["a", None],
            "string",
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype()),
        ),
        (
            ["a", None],
            pd.StringDtype(),
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype()),
        ),
        # Boolean
        (
            [True, None],
            "boolean",
            BooleanArray._from_sequence([True, None], dtype="boolean"),
        ),
        (
            [True, None],
            pd.BooleanDtype(),
            BooleanArray._from_sequence([True, None], dtype="boolean"),
        ),
        # Index
        (pd.Index([1, 2]), None, NumpyExtensionArray(np.array([1, 2], dtype=np.int64))),
        # Series[EA] returns the EA
        (
            pd.Series(pd.Categorical(["a", "b"], categories=["a", "b", "c"])),
            None,
            pd.Categorical(["a", "b"], categories=["a", "b", "c"]),
        ),
        # "3rd party" EAs work
        ([decimal.Decimal(0), decimal.Decimal(1)], "decimal", to_decimal([0, 1])),
        # pass an ExtensionArray, but a different dtype
        (
            period_array(["2000", "2001"], freq="D"),
            "category",
            pd.Categorical([pd.Period("2000", "D"), pd.Period("2001", "D")]),
        ),
    ],
)
def test_array(data, dtype, expected):
    """
    Check ``pd.array(data, dtype=dtype)`` against an explicitly built array.

    Parameters
    ----------
    data : object
        Input handed to ``pd.array``.
    dtype : object
        Value passed as the ``dtype`` keyword; ``None`` in the cases that
        pass no explicit dtype.
    expected : ExtensionArray
        Array the result is compared to with ``tm.assert_equal``.
    """
    result = pd.array(data, dtype=dtype)
    tm.assert_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_array_copy():
    """Check how the ``copy`` keyword of ``pd.array`` affects memory sharing."""
    source = np.array([1, 2])

    # No ``copy`` argument: the result must not share memory with the input.
    default_result = pd.array(source, dtype=source.dtype)
    assert not tm.shares_memory(source, default_result)

    # Explicit copy=True: likewise no shared memory.
    copied = pd.array(source, dtype=source.dtype, copy=True)
    assert not tm.shares_memory(source, copied)

    # copy=False: the result shares memory with the input.
    shared = pd.array(source, dtype=source.dtype, copy=False)
    assert tm.shares_memory(source, shared)
|
||
|
|
||
|
|
||
|
# pytz "CET" timezone object, used by the tz-aware datetime cases of
# test_array_inference below.
cet = pytz.timezone("CET")
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "data, expected",
    [
        # period
        (
            [pd.Period("2000", "D"), pd.Period("2001", "D")],
            period_array(["2000", "2001"], freq="D"),
        ),
        # interval
        ([pd.Interval(0, 1), pd.Interval(1, 2)], IntervalArray.from_breaks([0, 1, 2])),
        # datetime
        (
            [pd.Timestamp("2000"), pd.Timestamp("2001")],
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)],
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            np.array([1, 2], dtype="M8[ns]"),
            DatetimeArray._from_sequence(np.array([1, 2], dtype="M8[ns]")),
        ),
        (
            np.array([1, 2], dtype="M8[us]"),
            DatetimeArray._simple_new(
                np.array([1, 2], dtype="M8[us]"), dtype=np.dtype("M8[us]")
            ),
        ),
        # datetimetz
        (
            [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")],
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="ns")
            ),
        ),
        (
            [
                datetime.datetime(2000, 1, 1, tzinfo=cet),
                datetime.datetime(2001, 1, 1, tzinfo=cet),
            ],
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet, unit="ns")
            ),
        ),
        # timedelta
        (
            [pd.Timedelta("1h"), pd.Timedelta("2h")],
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            np.array([1, 2], dtype="m8[ns]"),
            TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[ns]")),
        ),
        (
            np.array([1, 2], dtype="m8[us]"),
            TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[us]")),
        ),
        # integer
        ([1, 2], IntegerArray._from_sequence([1, 2], dtype="Int64")),
        ([1, None], IntegerArray._from_sequence([1, None], dtype="Int64")),
        ([1, pd.NA], IntegerArray._from_sequence([1, pd.NA], dtype="Int64")),
        ([1, np.nan], IntegerArray._from_sequence([1, np.nan], dtype="Int64")),
        # float
        ([0.1, 0.2], FloatingArray._from_sequence([0.1, 0.2], dtype="Float64")),
        ([0.1, None], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
        ([0.1, np.nan], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
        ([0.1, pd.NA], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
        # integer-like float
        ([1.0, 2.0], FloatingArray._from_sequence([1.0, 2.0], dtype="Float64")),
        ([1.0, None], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
        ([1.0, np.nan], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
        ([1.0, pd.NA], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
        # mixed-integer-float
        ([1, 2.0], FloatingArray._from_sequence([1.0, 2.0], dtype="Float64")),
        (
            [1, np.nan, 2.0],
            FloatingArray._from_sequence([1.0, None, 2.0], dtype="Float64"),
        ),
        # string
        (
            ["a", "b"],
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", "b"], dtype=pd.StringDtype()),
        ),
        (
            ["a", None],
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype()),
        ),
        # Boolean
        ([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")),
        ([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")),
    ],
)
def test_array_inference(data, expected):
    """
    Check the array ``pd.array`` produces when no dtype is passed.

    Parameters
    ----------
    data : object
        Input handed to ``pd.array`` without a ``dtype`` argument.
    expected : ExtensionArray
        Array the result is compared to with ``tm.assert_equal``.
    """
    inferred = pd.array(data)
    tm.assert_equal(inferred, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "data",
    [
        # Periods with different frequencies
        [pd.Period("2000", "D"), pd.Period("2001", "Y")],
        # Intervals with different ``closed`` sides
        [pd.Interval(0, 1, closed="left"), pd.Interval(1, 2, closed="right")],
        # Timestamps in different timezones
        [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000", tz="UTC")],
        # tz-aware mixed with tz-naive, as a list and as an ndarray
        [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000")],
        np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]),
    ],
)
def test_array_inference_fails(data):
    """
    Check that mixed inputs yield an object-dtype ``NumpyExtensionArray``.

    Parameters
    ----------
    data : list or numpy.ndarray
        Scalars that do not agree on frequency, closed side or timezone.
    """
    inferred = pd.array(data)
    object_backed = NumpyExtensionArray(np.array(data, dtype=object))
    tm.assert_extension_array_equal(inferred, object_backed)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("data", [np.array(0)])
def test_nd_raises(data):
    """
    Check that ``pd.array`` raises ``ValueError`` for the parametrized input.

    Parameters
    ----------
    data : numpy.ndarray
        Array passed to ``pd.array`` together with ``dtype="int64"``.
    """
    expected_msg = "NumpyExtensionArray must be 1-dimensional"
    with pytest.raises(ValueError, match=expected_msg):
        pd.array(data, dtype="int64")
|
||
|
|
||
|
|
||
|
def test_scalar_raises():
    """Check that passing the scalar ``1`` to ``pd.array`` raises ``ValueError``."""
    expected_msg = "Cannot pass scalar '1'"
    with pytest.raises(ValueError, match=expected_msg):
        pd.array(1)
|
||
|
|
||
|
|
||
|
def test_dataframe_raises():
    """Check that passing a DataFrame to ``pd.array`` raises ``TypeError``."""
    # GH#51167 don't accidentally cast to StringArray by doing inference on columns
    frame = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
    with pytest.raises(TypeError, match="Cannot pass DataFrame to 'pandas.array'"):
        pd.array(frame)
|
||
|
|
||
|
|
||
|
def test_bounds_check():
    """Check that a negative value with ``dtype="UInt16"`` raises ``TypeError``."""
    # GH21796
    # The pattern accepts either int32 or int64 as the source dtype name.
    expected_msg = r"cannot safely cast non-equivalent int(32|64) to uint16"
    with pytest.raises(TypeError, match=expected_msg):
        pd.array([-1, 2, 3], dtype="UInt16")
|
||
|
|
||
|
|
||
|
# ---------------------------------------------------------------------------
|
||
|
# A couple dummy classes to ensure that Series and Indexes are unboxed before
|
||
|
# getting to the EA classes.
|
||
|
|
||
|
|
||
|
@register_extension_dtype
class DecimalDtype2(DecimalDtype):
    """Registered ``DecimalDtype`` subclass whose array type is ``DecimalArray2``."""

    # Alias under which the dtype is registered, e.g. ``dtype="decimal2"``.
    name = "decimal2"

    @classmethod
    def construct_array_type(cls):
        """
        Return the array class paired with this dtype.

        Returns
        -------
        type
        """
        return DecimalArray2
|
||
|
|
||
|
|
||
|
class DecimalArray2(DecimalArray):
    """``DecimalArray`` subclass that refuses Series/Index input on construction."""

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        """
        Build the array from ``scalars`` unless they are a Series or Index.

        Raises
        ------
        TypeError
            If ``scalars`` is a ``pd.Series`` or ``pd.Index``.
        """
        is_boxed = isinstance(scalars, (pd.Series, pd.Index))
        if not is_boxed:
            return super()._from_sequence(scalars, dtype=dtype, copy=copy)

        raise TypeError("scalars should not be of type pd.Series or pd.Index")
|
||
|
|
||
|
|
||
|
def test_array_unboxes(index_or_series):
    """
    Check that ``pd.array`` unboxes a Series/Index before building the array.

    Parameters
    ----------
    index_or_series : type
        Fixture value used as the constructor that boxes the decimals.
    """
    box = index_or_series

    boxed = box([decimal.Decimal("1"), decimal.Decimal("2")])
    decimal2_dtype = DecimalDtype2()

    # Handing the boxed data straight to the array class trips its guard.
    with pytest.raises(
        TypeError, match="scalars should not be of type pd.Series or pd.Index"
    ):
        DecimalArray2._from_sequence(boxed, dtype=decimal2_dtype)

    # Going through pd.array must not raise and must match the unboxed values.
    result = pd.array(boxed, dtype="decimal2")
    expected = DecimalArray2._from_sequence(boxed.values, dtype=decimal2_dtype)
    tm.assert_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_array_to_numpy_na():
    """Check ``to_numpy(na_value=True, dtype=bool)`` on a string array with NA."""
    # GH#40638
    string_arr = pd.array([pd.NA, 1], dtype="string[python]")
    as_bool = string_arr.to_numpy(na_value=True, dtype=bool)
    tm.assert_numpy_array_equal(as_bool, np.array([True, True]))
|