1803 lines
55 KiB
Python
1803 lines
55 KiB
Python
"""
|
|
Routines for casting.
|
|
"""
|
|
|
|
from contextlib import suppress
|
|
from datetime import date, datetime, timedelta
|
|
from typing import (
|
|
TYPE_CHECKING,
|
|
Any,
|
|
Dict,
|
|
List,
|
|
Optional,
|
|
Sequence,
|
|
Set,
|
|
Sized,
|
|
Tuple,
|
|
Type,
|
|
Union,
|
|
)
|
|
|
|
import numpy as np
|
|
|
|
from pandas._libs import lib, tslib, tslibs
|
|
from pandas._libs.tslibs import (
|
|
NaT,
|
|
OutOfBoundsDatetime,
|
|
Period,
|
|
Timedelta,
|
|
Timestamp,
|
|
conversion,
|
|
iNaT,
|
|
ints_to_pydatetime,
|
|
ints_to_pytimedelta,
|
|
)
|
|
from pandas._libs.tslibs.timezones import tz_compare
|
|
from pandas._typing import AnyArrayLike, ArrayLike, Dtype, DtypeObj, Scalar
|
|
from pandas.util._validators import validate_bool_kwarg
|
|
|
|
from pandas.core.dtypes.common import (
|
|
DT64NS_DTYPE,
|
|
INT64_DTYPE,
|
|
POSSIBLY_CAST_DTYPES,
|
|
TD64NS_DTYPE,
|
|
ensure_int8,
|
|
ensure_int16,
|
|
ensure_int32,
|
|
ensure_int64,
|
|
ensure_object,
|
|
ensure_str,
|
|
is_bool,
|
|
is_bool_dtype,
|
|
is_categorical_dtype,
|
|
is_complex,
|
|
is_complex_dtype,
|
|
is_datetime64_dtype,
|
|
is_datetime64_ns_dtype,
|
|
is_datetime64tz_dtype,
|
|
is_datetime_or_timedelta_dtype,
|
|
is_dtype_equal,
|
|
is_extension_array_dtype,
|
|
is_float,
|
|
is_float_dtype,
|
|
is_integer,
|
|
is_integer_dtype,
|
|
is_numeric_dtype,
|
|
is_object_dtype,
|
|
is_scalar,
|
|
is_sparse,
|
|
is_string_dtype,
|
|
is_timedelta64_dtype,
|
|
is_timedelta64_ns_dtype,
|
|
is_unsigned_integer_dtype,
|
|
pandas_dtype,
|
|
)
|
|
from pandas.core.dtypes.dtypes import (
|
|
DatetimeTZDtype,
|
|
ExtensionDtype,
|
|
IntervalDtype,
|
|
PeriodDtype,
|
|
)
|
|
from pandas.core.dtypes.generic import (
|
|
ABCDataFrame,
|
|
ABCDatetimeArray,
|
|
ABCDatetimeIndex,
|
|
ABCExtensionArray,
|
|
ABCPeriodArray,
|
|
ABCPeriodIndex,
|
|
ABCSeries,
|
|
)
|
|
from pandas.core.dtypes.inference import is_list_like
|
|
from pandas.core.dtypes.missing import (
|
|
is_valid_nat_for_dtype,
|
|
isna,
|
|
na_value_for_dtype,
|
|
notna,
|
|
)
|
|
|
|
if TYPE_CHECKING:
|
|
from pandas import Series
|
|
from pandas.core.arrays import ExtensionArray
|
|
from pandas.core.indexes.base import Index
|
|
|
|
_int8_max = np.iinfo(np.int8).max
|
|
_int16_max = np.iinfo(np.int16).max
|
|
_int32_max = np.iinfo(np.int32).max
|
|
_int64_max = np.iinfo(np.int64).max
|
|
|
|
|
|
def maybe_convert_platform(values):
|
|
""" try to do platform conversion, allow ndarray or list here """
|
|
if isinstance(values, (list, tuple, range)):
|
|
values = construct_1d_object_array_from_listlike(values)
|
|
if getattr(values, "dtype", None) == np.object_:
|
|
if hasattr(values, "_values"):
|
|
values = values._values
|
|
values = lib.maybe_convert_objects(values)
|
|
|
|
return values
|
|
|
|
|
|
def is_nested_object(obj) -> bool:
|
|
"""
|
|
return a boolean if we have a nested object, e.g. a Series with 1 or
|
|
more Series elements
|
|
|
|
This may not be necessarily be performant.
|
|
|
|
"""
|
|
if isinstance(obj, ABCSeries) and is_object_dtype(obj.dtype):
|
|
|
|
if any(isinstance(v, ABCSeries) for v in obj._values):
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def maybe_box_datetimelike(value: Scalar, dtype: Optional[Dtype] = None) -> Scalar:
|
|
"""
|
|
Cast scalar to Timestamp or Timedelta if scalar is datetime-like
|
|
and dtype is not object.
|
|
|
|
Parameters
|
|
----------
|
|
value : scalar
|
|
dtype : Dtype, optional
|
|
|
|
Returns
|
|
-------
|
|
scalar
|
|
"""
|
|
if dtype == object:
|
|
pass
|
|
elif isinstance(value, (np.datetime64, datetime)):
|
|
value = tslibs.Timestamp(value)
|
|
elif isinstance(value, (np.timedelta64, timedelta)):
|
|
value = tslibs.Timedelta(value)
|
|
|
|
return value
|
|
|
|
|
|
def maybe_downcast_to_dtype(result, dtype: Union[str, np.dtype]):
|
|
"""
|
|
try to cast to the specified dtype (e.g. convert back to bool/int
|
|
or could be an astype of float64->float32
|
|
"""
|
|
do_round = False
|
|
|
|
if is_scalar(result):
|
|
return result
|
|
elif isinstance(result, ABCDataFrame):
|
|
# occurs in pivot_table doctest
|
|
return result
|
|
|
|
if isinstance(dtype, str):
|
|
if dtype == "infer":
|
|
inferred_type = lib.infer_dtype(ensure_object(result), skipna=False)
|
|
if inferred_type == "boolean":
|
|
dtype = "bool"
|
|
elif inferred_type == "integer":
|
|
dtype = "int64"
|
|
elif inferred_type == "datetime64":
|
|
dtype = "datetime64[ns]"
|
|
elif inferred_type == "timedelta64":
|
|
dtype = "timedelta64[ns]"
|
|
|
|
# try to upcast here
|
|
elif inferred_type == "floating":
|
|
dtype = "int64"
|
|
if issubclass(result.dtype.type, np.number):
|
|
do_round = True
|
|
|
|
else:
|
|
dtype = "object"
|
|
|
|
dtype = np.dtype(dtype)
|
|
|
|
elif dtype.type is Period:
|
|
from pandas.core.arrays import PeriodArray
|
|
|
|
with suppress(TypeError):
|
|
# e.g. TypeError: int() argument must be a string, a
|
|
# bytes-like object or a number, not 'Period
|
|
return PeriodArray(result, freq=dtype.freq)
|
|
|
|
converted = maybe_downcast_numeric(result, dtype, do_round)
|
|
if converted is not result:
|
|
return converted
|
|
|
|
# a datetimelike
|
|
# GH12821, iNaT is cast to float
|
|
if dtype.kind in ["M", "m"] and result.dtype.kind in ["i", "f"]:
|
|
if hasattr(dtype, "tz"):
|
|
# not a numpy dtype
|
|
if dtype.tz:
|
|
# convert to datetime and change timezone
|
|
from pandas import to_datetime
|
|
|
|
result = to_datetime(result).tz_localize("utc")
|
|
result = result.tz_convert(dtype.tz)
|
|
else:
|
|
result = result.astype(dtype)
|
|
|
|
return result
|
|
|
|
|
|
def maybe_downcast_numeric(result, dtype: DtypeObj, do_round: bool = False):
|
|
"""
|
|
Subset of maybe_downcast_to_dtype restricted to numeric dtypes.
|
|
|
|
Parameters
|
|
----------
|
|
result : ndarray or ExtensionArray
|
|
dtype : np.dtype or ExtensionDtype
|
|
do_round : bool
|
|
|
|
Returns
|
|
-------
|
|
ndarray or ExtensionArray
|
|
"""
|
|
if not isinstance(dtype, np.dtype):
|
|
# e.g. SparseDtype has no itemsize attr
|
|
return result
|
|
|
|
if isinstance(result, list):
|
|
# reached via groupby.agg._ohlc; really this should be handled earlier
|
|
result = np.array(result)
|
|
|
|
def trans(x):
|
|
if do_round:
|
|
return x.round()
|
|
return x
|
|
|
|
if dtype.kind == result.dtype.kind:
|
|
# don't allow upcasts here (except if empty)
|
|
if result.dtype.itemsize <= dtype.itemsize and result.size:
|
|
return result
|
|
|
|
if is_bool_dtype(dtype) or is_integer_dtype(dtype):
|
|
|
|
if not result.size:
|
|
# if we don't have any elements, just astype it
|
|
return trans(result).astype(dtype)
|
|
|
|
# do a test on the first element, if it fails then we are done
|
|
r = result.ravel()
|
|
arr = np.array([r[0]])
|
|
|
|
if isna(arr).any():
|
|
# if we have any nulls, then we are done
|
|
return result
|
|
|
|
elif not isinstance(r[0], (np.integer, np.floating, int, float, bool)):
|
|
# a comparable, e.g. a Decimal may slip in here
|
|
return result
|
|
|
|
if (
|
|
issubclass(result.dtype.type, (np.object_, np.number))
|
|
and notna(result).all()
|
|
):
|
|
new_result = trans(result).astype(dtype)
|
|
if new_result.dtype.kind == "O" or result.dtype.kind == "O":
|
|
# np.allclose may raise TypeError on object-dtype
|
|
if (new_result == result).all():
|
|
return new_result
|
|
else:
|
|
if np.allclose(new_result, result, rtol=0):
|
|
return new_result
|
|
|
|
elif (
|
|
issubclass(dtype.type, np.floating)
|
|
and not is_bool_dtype(result.dtype)
|
|
and not is_string_dtype(result.dtype)
|
|
):
|
|
return result.astype(dtype)
|
|
|
|
return result
|
|
|
|
|
|
def maybe_cast_result(
|
|
result: ArrayLike, obj: "Series", numeric_only: bool = False, how: str = ""
|
|
) -> ArrayLike:
|
|
"""
|
|
Try casting result to a different type if appropriate
|
|
|
|
Parameters
|
|
----------
|
|
result : array-like
|
|
Result to cast.
|
|
obj : Series
|
|
Input Series from which result was calculated.
|
|
numeric_only : bool, default False
|
|
Whether to cast only numerics or datetimes as well.
|
|
how : str, default ""
|
|
How the result was computed.
|
|
|
|
Returns
|
|
-------
|
|
result : array-like
|
|
result maybe casted to the dtype.
|
|
"""
|
|
dtype = obj.dtype
|
|
dtype = maybe_cast_result_dtype(dtype, how)
|
|
|
|
assert not is_scalar(result)
|
|
|
|
if (
|
|
is_extension_array_dtype(dtype)
|
|
and not is_categorical_dtype(dtype)
|
|
and dtype.kind != "M"
|
|
):
|
|
# We have to special case categorical so as not to upcast
|
|
# things like counts back to categorical
|
|
cls = dtype.construct_array_type()
|
|
result = maybe_cast_to_extension_array(cls, result, dtype=dtype)
|
|
|
|
elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
|
|
result = maybe_downcast_to_dtype(result, dtype)
|
|
|
|
return result
|
|
|
|
|
|
def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj:
|
|
"""
|
|
Get the desired dtype of a result based on the
|
|
input dtype and how it was computed.
|
|
|
|
Parameters
|
|
----------
|
|
dtype : DtypeObj
|
|
Input dtype.
|
|
how : str
|
|
How the result was computed.
|
|
|
|
Returns
|
|
-------
|
|
DtypeObj
|
|
The desired dtype of the result.
|
|
"""
|
|
from pandas.core.arrays.boolean import BooleanDtype
|
|
from pandas.core.arrays.floating import Float64Dtype
|
|
from pandas.core.arrays.integer import Int64Dtype, _IntegerDtype
|
|
|
|
if how in ["add", "cumsum", "sum", "prod"]:
|
|
if dtype == np.dtype(bool):
|
|
return np.dtype(np.int64)
|
|
elif isinstance(dtype, (BooleanDtype, _IntegerDtype)):
|
|
return Int64Dtype()
|
|
elif how in ["mean", "median", "var"] and isinstance(
|
|
dtype, (BooleanDtype, _IntegerDtype)
|
|
):
|
|
return Float64Dtype()
|
|
return dtype
|
|
|
|
|
|
def maybe_cast_to_extension_array(
|
|
cls: Type["ExtensionArray"], obj: ArrayLike, dtype: Optional[ExtensionDtype] = None
|
|
) -> ArrayLike:
|
|
"""
|
|
Call to `_from_sequence` that returns the object unchanged on Exception.
|
|
|
|
Parameters
|
|
----------
|
|
cls : class, subclass of ExtensionArray
|
|
obj : arraylike
|
|
Values to pass to cls._from_sequence
|
|
dtype : ExtensionDtype, optional
|
|
|
|
Returns
|
|
-------
|
|
ExtensionArray or obj
|
|
"""
|
|
from pandas.core.arrays.string_ import StringArray
|
|
from pandas.core.arrays.string_arrow import ArrowStringArray
|
|
|
|
assert isinstance(cls, type), f"must pass a type: {cls}"
|
|
assertion_msg = f"must pass a subclass of ExtensionArray: {cls}"
|
|
assert issubclass(cls, ABCExtensionArray), assertion_msg
|
|
|
|
# Everything can be converted to StringArrays, but we may not want to convert
|
|
if (
|
|
issubclass(cls, (StringArray, ArrowStringArray))
|
|
and lib.infer_dtype(obj) != "string"
|
|
):
|
|
return obj
|
|
|
|
try:
|
|
result = cls._from_sequence(obj, dtype=dtype)
|
|
except Exception:
|
|
# We can't predict what downstream EA constructors may raise
|
|
result = obj
|
|
return result
|
|
|
|
|
|
def maybe_upcast_putmask(
|
|
result: np.ndarray, mask: np.ndarray, other: Scalar
|
|
) -> Tuple[np.ndarray, bool]:
|
|
"""
|
|
A safe version of putmask that potentially upcasts the result.
|
|
|
|
The result is replaced with the first N elements of other,
|
|
where N is the number of True values in mask.
|
|
If the length of other is shorter than N, other will be repeated.
|
|
|
|
Parameters
|
|
----------
|
|
result : ndarray
|
|
The destination array. This will be mutated in-place if no upcasting is
|
|
necessary.
|
|
mask : boolean ndarray
|
|
other : scalar
|
|
The source value.
|
|
|
|
Returns
|
|
-------
|
|
result : ndarray
|
|
changed : bool
|
|
Set to true if the result array was upcasted.
|
|
|
|
Examples
|
|
--------
|
|
>>> arr = np.arange(1, 6)
|
|
>>> mask = np.array([False, True, False, True, True])
|
|
>>> result, _ = maybe_upcast_putmask(arr, mask, False)
|
|
>>> result
|
|
array([1, 0, 3, 0, 0])
|
|
"""
|
|
if not isinstance(result, np.ndarray):
|
|
raise ValueError("The result input must be a ndarray.")
|
|
if not is_scalar(other):
|
|
# We _could_ support non-scalar other, but until we have a compelling
|
|
# use case, we assume away the possibility.
|
|
raise ValueError("other must be a scalar")
|
|
|
|
if mask.any():
|
|
# Two conversions for date-like dtypes that can't be done automatically
|
|
# in np.place:
|
|
# NaN -> NaT
|
|
# integer or integer array -> date-like array
|
|
if result.dtype.kind in ["m", "M"]:
|
|
if isna(other):
|
|
other = result.dtype.type("nat")
|
|
elif is_integer(other):
|
|
other = np.array(other, dtype=result.dtype)
|
|
|
|
def changeit():
|
|
# we are forced to change the dtype of the result as the input
|
|
# isn't compatible
|
|
r, _ = maybe_upcast(result, fill_value=other, copy=True)
|
|
np.place(r, mask, other)
|
|
|
|
return r, True
|
|
|
|
# we want to decide whether place will work
|
|
# if we have nans in the False portion of our mask then we need to
|
|
# upcast (possibly), otherwise we DON't want to upcast (e.g. if we
|
|
# have values, say integers, in the success portion then it's ok to not
|
|
# upcast)
|
|
new_dtype, _ = maybe_promote(result.dtype, other)
|
|
if new_dtype != result.dtype:
|
|
|
|
# we have a scalar or len 0 ndarray
|
|
# and its nan and we are changing some values
|
|
if isna(other):
|
|
return changeit()
|
|
|
|
try:
|
|
np.place(result, mask, other)
|
|
except TypeError:
|
|
# e.g. int-dtype result and float-dtype other
|
|
return changeit()
|
|
|
|
return result, False
|
|
|
|
|
|
def maybe_casted_values(
|
|
index: "Index", codes: Optional[np.ndarray] = None
|
|
) -> ArrayLike:
|
|
"""
|
|
Convert an index, given directly or as a pair (level, code), to a 1D array.
|
|
|
|
Parameters
|
|
----------
|
|
index : Index
|
|
codes : np.ndarray[intp] or None, default None
|
|
|
|
Returns
|
|
-------
|
|
ExtensionArray or ndarray
|
|
If codes is `None`, the values of `index`.
|
|
If codes is passed, an array obtained by taking from `index` the indices
|
|
contained in `codes`.
|
|
"""
|
|
|
|
values = index._values
|
|
if values.dtype == np.object_:
|
|
values = lib.maybe_convert_objects(values)
|
|
|
|
# if we have the codes, extract the values with a mask
|
|
if codes is not None:
|
|
mask: np.ndarray = codes == -1
|
|
|
|
if mask.size > 0 and mask.all():
|
|
# we can have situations where the whole mask is -1,
|
|
# meaning there is nothing found in codes, so make all nan's
|
|
|
|
dtype = index.dtype
|
|
fill_value = na_value_for_dtype(dtype)
|
|
values = construct_1d_arraylike_from_scalar(fill_value, len(mask), dtype)
|
|
|
|
else:
|
|
values = values.take(codes)
|
|
|
|
if mask.any():
|
|
if isinstance(values, np.ndarray):
|
|
values, _ = maybe_upcast_putmask(values, mask, np.nan)
|
|
else:
|
|
values[mask] = np.nan
|
|
|
|
return values
|
|
|
|
|
|
def maybe_promote(dtype, fill_value=np.nan):
|
|
"""
|
|
Find the minimal dtype that can hold both the given dtype and fill_value.
|
|
|
|
Parameters
|
|
----------
|
|
dtype : np.dtype or ExtensionDtype
|
|
fill_value : scalar, default np.nan
|
|
|
|
Returns
|
|
-------
|
|
dtype
|
|
Upcasted from dtype argument if necessary.
|
|
fill_value
|
|
Upcasted from fill_value argument if necessary.
|
|
"""
|
|
if not is_scalar(fill_value) and not is_object_dtype(dtype):
|
|
# with object dtype there is nothing to promote, and the user can
|
|
# pass pretty much any weird fill_value they like
|
|
raise ValueError("fill_value must be a scalar")
|
|
|
|
# if we passed an array here, determine the fill value by dtype
|
|
if isinstance(fill_value, np.ndarray):
|
|
if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)):
|
|
fill_value = fill_value.dtype.type("NaT", "ns")
|
|
else:
|
|
|
|
# we need to change to object type as our
|
|
# fill_value is of object type
|
|
if fill_value.dtype == np.object_:
|
|
dtype = np.dtype(np.object_)
|
|
fill_value = np.nan
|
|
|
|
if dtype == np.object_ or dtype.kind in ["U", "S"]:
|
|
# We treat string-like dtypes as object, and _always_ fill
|
|
# with np.nan
|
|
fill_value = np.nan
|
|
dtype = np.dtype(np.object_)
|
|
|
|
# returns tuple of (dtype, fill_value)
|
|
if issubclass(dtype.type, np.datetime64):
|
|
if isinstance(fill_value, datetime) and fill_value.tzinfo is not None:
|
|
# Trying to insert tzaware into tznaive, have to cast to object
|
|
dtype = np.dtype(np.object_)
|
|
elif is_integer(fill_value) or (is_float(fill_value) and not isna(fill_value)):
|
|
dtype = np.dtype(np.object_)
|
|
else:
|
|
try:
|
|
fill_value = Timestamp(fill_value).to_datetime64()
|
|
except (TypeError, ValueError):
|
|
dtype = np.dtype(np.object_)
|
|
elif issubclass(dtype.type, np.timedelta64):
|
|
if (
|
|
is_integer(fill_value)
|
|
or (is_float(fill_value) and not np.isnan(fill_value))
|
|
or isinstance(fill_value, str)
|
|
):
|
|
# TODO: What about str that can be a timedelta?
|
|
dtype = np.dtype(np.object_)
|
|
else:
|
|
try:
|
|
fv = Timedelta(fill_value)
|
|
except ValueError:
|
|
dtype = np.dtype(np.object_)
|
|
else:
|
|
if fv is NaT:
|
|
# NaT has no `to_timedelta64` method
|
|
fill_value = np.timedelta64("NaT", "ns")
|
|
else:
|
|
fill_value = fv.to_timedelta64()
|
|
elif is_datetime64tz_dtype(dtype):
|
|
if isna(fill_value):
|
|
fill_value = NaT
|
|
elif not isinstance(fill_value, datetime):
|
|
dtype = np.dtype(np.object_)
|
|
elif fill_value.tzinfo is None:
|
|
dtype = np.dtype(np.object_)
|
|
elif not tz_compare(fill_value.tzinfo, dtype.tz):
|
|
# TODO: sure we want to cast here?
|
|
dtype = np.dtype(np.object_)
|
|
|
|
elif is_extension_array_dtype(dtype) and isna(fill_value):
|
|
fill_value = dtype.na_value
|
|
|
|
elif is_float(fill_value):
|
|
if issubclass(dtype.type, np.bool_):
|
|
dtype = np.dtype(np.object_)
|
|
|
|
elif issubclass(dtype.type, np.integer):
|
|
dtype = np.dtype(np.float64)
|
|
|
|
elif dtype.kind == "f":
|
|
mst = np.min_scalar_type(fill_value)
|
|
if mst > dtype:
|
|
# e.g. mst is np.float64 and dtype is np.float32
|
|
dtype = mst
|
|
|
|
elif dtype.kind == "c":
|
|
mst = np.min_scalar_type(fill_value)
|
|
dtype = np.promote_types(dtype, mst)
|
|
|
|
elif is_bool(fill_value):
|
|
if not issubclass(dtype.type, np.bool_):
|
|
dtype = np.dtype(np.object_)
|
|
|
|
elif is_integer(fill_value):
|
|
if issubclass(dtype.type, np.bool_):
|
|
dtype = np.dtype(np.object_)
|
|
|
|
elif issubclass(dtype.type, np.integer):
|
|
if not np.can_cast(fill_value, dtype):
|
|
# upcast to prevent overflow
|
|
mst = np.min_scalar_type(fill_value)
|
|
dtype = np.promote_types(dtype, mst)
|
|
if dtype.kind == "f":
|
|
# Case where we disagree with numpy
|
|
dtype = np.dtype(np.object_)
|
|
|
|
elif is_complex(fill_value):
|
|
if issubclass(dtype.type, np.bool_):
|
|
dtype = np.dtype(np.object_)
|
|
|
|
elif issubclass(dtype.type, (np.integer, np.floating)):
|
|
mst = np.min_scalar_type(fill_value)
|
|
dtype = np.promote_types(dtype, mst)
|
|
|
|
elif dtype.kind == "c":
|
|
mst = np.min_scalar_type(fill_value)
|
|
if mst > dtype:
|
|
# e.g. mst is np.complex128 and dtype is np.complex64
|
|
dtype = mst
|
|
|
|
elif fill_value is None:
|
|
if is_float_dtype(dtype) or is_complex_dtype(dtype):
|
|
fill_value = np.nan
|
|
elif is_integer_dtype(dtype):
|
|
dtype = np.float64
|
|
fill_value = np.nan
|
|
elif is_datetime_or_timedelta_dtype(dtype):
|
|
fill_value = dtype.type("NaT", "ns")
|
|
else:
|
|
dtype = np.dtype(np.object_)
|
|
fill_value = np.nan
|
|
else:
|
|
dtype = np.dtype(np.object_)
|
|
|
|
# in case we have a string that looked like a number
|
|
if is_extension_array_dtype(dtype):
|
|
pass
|
|
elif issubclass(np.dtype(dtype).type, (bytes, str)):
|
|
dtype = np.dtype(np.object_)
|
|
|
|
fill_value = _ensure_dtype_type(fill_value, dtype)
|
|
return dtype, fill_value
|
|
|
|
|
|
def _ensure_dtype_type(value, dtype: DtypeObj):
|
|
"""
|
|
Ensure that the given value is an instance of the given dtype.
|
|
|
|
e.g. if out dtype is np.complex64_, we should have an instance of that
|
|
as opposed to a python complex object.
|
|
|
|
Parameters
|
|
----------
|
|
value : object
|
|
dtype : np.dtype or ExtensionDtype
|
|
|
|
Returns
|
|
-------
|
|
object
|
|
"""
|
|
# Start with exceptions in which we do _not_ cast to numpy types
|
|
if is_extension_array_dtype(dtype):
|
|
return value
|
|
elif dtype == np.object_:
|
|
return value
|
|
elif isna(value):
|
|
# e.g. keep np.nan rather than try to cast to np.float32(np.nan)
|
|
return value
|
|
|
|
return dtype.type(value)
|
|
|
|
|
|
def infer_dtype_from(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, Any]:
|
|
"""
|
|
Interpret the dtype from a scalar or array.
|
|
|
|
Parameters
|
|
----------
|
|
val : object
|
|
pandas_dtype : bool, default False
|
|
whether to infer dtype including pandas extension types.
|
|
If False, scalar/array belongs to pandas extension types is inferred as
|
|
object
|
|
"""
|
|
if is_scalar(val):
|
|
return infer_dtype_from_scalar(val, pandas_dtype=pandas_dtype)
|
|
return infer_dtype_from_array(val, pandas_dtype=pandas_dtype)
|
|
|
|
|
|
def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, Any]:
|
|
"""
|
|
Interpret the dtype from a scalar.
|
|
|
|
Parameters
|
|
----------
|
|
pandas_dtype : bool, default False
|
|
whether to infer dtype including pandas extension types.
|
|
If False, scalar belongs to pandas extension types is inferred as
|
|
object
|
|
"""
|
|
dtype: DtypeObj = np.dtype(object)
|
|
|
|
# a 1-element ndarray
|
|
if isinstance(val, np.ndarray):
|
|
msg = "invalid ndarray passed to infer_dtype_from_scalar"
|
|
if val.ndim != 0:
|
|
raise ValueError(msg)
|
|
|
|
dtype = val.dtype
|
|
val = val.item()
|
|
|
|
elif isinstance(val, str):
|
|
|
|
# If we create an empty array using a string to infer
|
|
# the dtype, NumPy will only allocate one character per entry
|
|
# so this is kind of bad. Alternately we could use np.repeat
|
|
# instead of np.empty (but then you still don't want things
|
|
# coming out as np.str_!
|
|
|
|
dtype = np.dtype(object)
|
|
|
|
elif isinstance(val, (np.datetime64, datetime)):
|
|
val = Timestamp(val)
|
|
if val is NaT or val.tz is None:
|
|
dtype = np.dtype("M8[ns]")
|
|
else:
|
|
if pandas_dtype:
|
|
dtype = DatetimeTZDtype(unit="ns", tz=val.tz)
|
|
else:
|
|
# return datetimetz as object
|
|
return np.dtype(object), val
|
|
val = val.value
|
|
|
|
elif isinstance(val, (np.timedelta64, timedelta)):
|
|
val = Timedelta(val).value
|
|
dtype = np.dtype("m8[ns]")
|
|
|
|
elif is_bool(val):
|
|
dtype = np.dtype(np.bool_)
|
|
|
|
elif is_integer(val):
|
|
if isinstance(val, np.integer):
|
|
dtype = np.dtype(type(val))
|
|
else:
|
|
dtype = np.dtype(np.int64)
|
|
|
|
try:
|
|
np.array(val, dtype=dtype)
|
|
except OverflowError:
|
|
dtype = np.array(val).dtype
|
|
|
|
elif is_float(val):
|
|
if isinstance(val, np.floating):
|
|
dtype = np.dtype(type(val))
|
|
else:
|
|
dtype = np.dtype(np.float64)
|
|
|
|
elif is_complex(val):
|
|
dtype = np.dtype(np.complex_)
|
|
|
|
elif pandas_dtype:
|
|
if lib.is_period(val):
|
|
dtype = PeriodDtype(freq=val.freq)
|
|
elif lib.is_interval(val):
|
|
subtype = infer_dtype_from_scalar(val.left, pandas_dtype=True)[0]
|
|
dtype = IntervalDtype(subtype=subtype)
|
|
|
|
return dtype, val
|
|
|
|
|
|
def dict_compat(d: Dict[Scalar, Scalar]) -> Dict[Scalar, Scalar]:
|
|
"""
|
|
Convert datetimelike-keyed dicts to a Timestamp-keyed dict.
|
|
|
|
Parameters
|
|
----------
|
|
d: dict-like object
|
|
|
|
Returns
|
|
-------
|
|
dict
|
|
|
|
"""
|
|
return {maybe_box_datetimelike(key): value for key, value in d.items()}
|
|
|
|
|
|
def infer_dtype_from_array(
|
|
arr, pandas_dtype: bool = False
|
|
) -> Tuple[DtypeObj, ArrayLike]:
|
|
"""
|
|
Infer the dtype from an array.
|
|
|
|
Parameters
|
|
----------
|
|
arr : array
|
|
pandas_dtype : bool, default False
|
|
whether to infer dtype including pandas extension types.
|
|
If False, array belongs to pandas extension types
|
|
is inferred as object
|
|
|
|
Returns
|
|
-------
|
|
tuple (numpy-compat/pandas-compat dtype, array)
|
|
|
|
Notes
|
|
-----
|
|
if pandas_dtype=False. these infer to numpy dtypes
|
|
exactly with the exception that mixed / object dtypes
|
|
are not coerced by stringifying or conversion
|
|
|
|
if pandas_dtype=True. datetime64tz-aware/categorical
|
|
types will retain there character.
|
|
|
|
Examples
|
|
--------
|
|
>>> np.asarray([1, '1'])
|
|
array(['1', '1'], dtype='<U21')
|
|
|
|
>>> infer_dtype_from_array([1, '1'])
|
|
(dtype('O'), [1, '1'])
|
|
"""
|
|
if isinstance(arr, np.ndarray):
|
|
return arr.dtype, arr
|
|
|
|
if not is_list_like(arr):
|
|
arr = [arr]
|
|
|
|
if pandas_dtype and is_extension_array_dtype(arr):
|
|
return arr.dtype, arr
|
|
|
|
elif isinstance(arr, ABCSeries):
|
|
return arr.dtype, np.asarray(arr)
|
|
|
|
# don't force numpy coerce with nan's
|
|
inferred = lib.infer_dtype(arr, skipna=False)
|
|
if inferred in ["string", "bytes", "mixed", "mixed-integer"]:
|
|
return (np.dtype(np.object_), arr)
|
|
|
|
arr = np.asarray(arr)
|
|
return arr.dtype, arr
|
|
|
|
|
|
def maybe_infer_dtype_type(element):
|
|
"""
|
|
Try to infer an object's dtype, for use in arithmetic ops.
|
|
|
|
Uses `element.dtype` if that's available.
|
|
Objects implementing the iterator protocol are cast to a NumPy array,
|
|
and from there the array's type is used.
|
|
|
|
Parameters
|
|
----------
|
|
element : object
|
|
Possibly has a `.dtype` attribute, and possibly the iterator
|
|
protocol.
|
|
|
|
Returns
|
|
-------
|
|
tipo : type
|
|
|
|
Examples
|
|
--------
|
|
>>> from collections import namedtuple
|
|
>>> Foo = namedtuple("Foo", "dtype")
|
|
>>> maybe_infer_dtype_type(Foo(np.dtype("i8")))
|
|
dtype('int64')
|
|
"""
|
|
tipo = None
|
|
if hasattr(element, "dtype"):
|
|
tipo = element.dtype
|
|
elif is_list_like(element):
|
|
element = np.asarray(element)
|
|
tipo = element.dtype
|
|
return tipo
|
|
|
|
|
|
def maybe_upcast(
|
|
values: ArrayLike,
|
|
fill_value: Scalar = np.nan,
|
|
dtype: Dtype = None,
|
|
copy: bool = False,
|
|
) -> Tuple[ArrayLike, Scalar]:
|
|
"""
|
|
Provide explicit type promotion and coercion.
|
|
|
|
Parameters
|
|
----------
|
|
values : ndarray or ExtensionArray
|
|
The array that we want to maybe upcast.
|
|
fill_value : what we want to fill with
|
|
dtype : if None, then use the dtype of the values, else coerce to this type
|
|
copy : bool, default True
|
|
If True always make a copy even if no upcast is required.
|
|
|
|
Returns
|
|
-------
|
|
values: ndarray or ExtensionArray
|
|
the original array, possibly upcast
|
|
fill_value:
|
|
the fill value, possibly upcast
|
|
"""
|
|
if not is_scalar(fill_value) and not is_object_dtype(values.dtype):
|
|
# We allow arbitrary fill values for object dtype
|
|
raise ValueError("fill_value must be a scalar")
|
|
|
|
if is_extension_array_dtype(values):
|
|
if copy:
|
|
values = values.copy()
|
|
else:
|
|
if dtype is None:
|
|
dtype = values.dtype
|
|
new_dtype, fill_value = maybe_promote(dtype, fill_value)
|
|
if new_dtype != values.dtype:
|
|
values = values.astype(new_dtype)
|
|
elif copy:
|
|
values = values.copy()
|
|
|
|
return values, fill_value
|
|
|
|
|
|
def invalidate_string_dtypes(dtype_set: Set[DtypeObj]):
|
|
"""
|
|
Change string like dtypes to object for
|
|
``DataFrame.select_dtypes()``.
|
|
"""
|
|
non_string_dtypes = dtype_set - {np.dtype("S").type, np.dtype("<U").type}
|
|
if non_string_dtypes != dtype_set:
|
|
raise TypeError("string dtypes are not allowed, use 'object' instead")
|
|
|
|
|
|
def coerce_indexer_dtype(indexer, categories):
|
|
""" coerce the indexer input array to the smallest dtype possible """
|
|
length = len(categories)
|
|
if length < _int8_max:
|
|
return ensure_int8(indexer)
|
|
elif length < _int16_max:
|
|
return ensure_int16(indexer)
|
|
elif length < _int32_max:
|
|
return ensure_int32(indexer)
|
|
return ensure_int64(indexer)
|
|
|
|
|
|
def astype_nansafe(
|
|
arr, dtype: DtypeObj, copy: bool = True, skipna: bool = False
|
|
) -> ArrayLike:
|
|
"""
|
|
Cast the elements of an array to a given dtype a nan-safe manner.
|
|
|
|
Parameters
|
|
----------
|
|
arr : ndarray
|
|
dtype : np.dtype
|
|
copy : bool, default True
|
|
If False, a view will be attempted but may fail, if
|
|
e.g. the item sizes don't align.
|
|
skipna: bool, default False
|
|
Whether or not we should skip NaN when casting as a string-type.
|
|
|
|
Raises
|
|
------
|
|
ValueError
|
|
The dtype was a datetime64/timedelta64 dtype, but it had no unit.
|
|
"""
|
|
# dispatch on extension dtype if needed
|
|
if is_extension_array_dtype(dtype):
|
|
return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy)
|
|
|
|
if not isinstance(dtype, np.dtype):
|
|
dtype = pandas_dtype(dtype)
|
|
|
|
if issubclass(dtype.type, str):
|
|
return lib.ensure_string_array(
|
|
arr.ravel(), skipna=skipna, convert_na_value=False
|
|
).reshape(arr.shape)
|
|
|
|
elif is_datetime64_dtype(arr):
|
|
if is_object_dtype(dtype):
|
|
return ints_to_pydatetime(arr.view(np.int64))
|
|
elif dtype == np.int64:
|
|
if isna(arr).any():
|
|
raise ValueError("Cannot convert NaT values to integer")
|
|
return arr.view(dtype)
|
|
|
|
# allow frequency conversions
|
|
if dtype.kind == "M":
|
|
return arr.astype(dtype)
|
|
|
|
raise TypeError(f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]")
|
|
|
|
elif is_timedelta64_dtype(arr):
|
|
if is_object_dtype(dtype):
|
|
return ints_to_pytimedelta(arr.view(np.int64))
|
|
elif dtype == np.int64:
|
|
if isna(arr).any():
|
|
raise ValueError("Cannot convert NaT values to integer")
|
|
return arr.view(dtype)
|
|
|
|
if dtype not in [INT64_DTYPE, TD64NS_DTYPE]:
|
|
|
|
# allow frequency conversions
|
|
# we return a float here!
|
|
if dtype.kind == "m":
|
|
mask = isna(arr)
|
|
result = arr.astype(dtype).astype(np.float64)
|
|
result[mask] = np.nan
|
|
return result
|
|
elif dtype == TD64NS_DTYPE:
|
|
return arr.astype(TD64NS_DTYPE, copy=copy)
|
|
|
|
raise TypeError(f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]")
|
|
|
|
elif np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype, np.integer):
|
|
|
|
if not np.isfinite(arr).all():
|
|
raise ValueError("Cannot convert non-finite values (NA or inf) to integer")
|
|
|
|
elif is_object_dtype(arr):
|
|
|
|
# work around NumPy brokenness, #1987
|
|
if np.issubdtype(dtype.type, np.integer):
|
|
return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
|
|
|
|
# if we have a datetime/timedelta array of objects
|
|
# then coerce to a proper dtype and recall astype_nansafe
|
|
|
|
elif is_datetime64_dtype(dtype):
|
|
from pandas import to_datetime
|
|
|
|
return astype_nansafe(to_datetime(arr).values, dtype, copy=copy)
|
|
elif is_timedelta64_dtype(dtype):
|
|
from pandas import to_timedelta
|
|
|
|
return astype_nansafe(to_timedelta(arr)._values, dtype, copy=copy)
|
|
|
|
if dtype.name in ("datetime64", "timedelta64"):
|
|
msg = (
|
|
f"The '{dtype.name}' dtype has no unit. Please pass in "
|
|
f"'{dtype.name}[ns]' instead."
|
|
)
|
|
raise ValueError(msg)
|
|
|
|
if copy or is_object_dtype(arr) or is_object_dtype(dtype):
|
|
# Explicit copy, or required since NumPy can't view from / to object.
|
|
return arr.astype(dtype, copy=True)
|
|
|
|
return arr.view(dtype)
|
|
|
|
|
|
def soft_convert_objects(
|
|
values: np.ndarray,
|
|
datetime: bool = True,
|
|
numeric: bool = True,
|
|
timedelta: bool = True,
|
|
copy: bool = True,
|
|
):
|
|
"""
|
|
Try to coerce datetime, timedelta, and numeric object-dtype columns
|
|
to inferred dtype.
|
|
|
|
Parameters
|
|
----------
|
|
values : np.ndarray[object]
|
|
datetime : bool, default True
|
|
numeric: bool, default True
|
|
timedelta : bool, default True
|
|
copy : bool, default True
|
|
|
|
Returns
|
|
-------
|
|
np.ndarray
|
|
"""
|
|
validate_bool_kwarg(datetime, "datetime")
|
|
validate_bool_kwarg(numeric, "numeric")
|
|
validate_bool_kwarg(timedelta, "timedelta")
|
|
validate_bool_kwarg(copy, "copy")
|
|
|
|
conversion_count = sum((datetime, numeric, timedelta))
|
|
if conversion_count == 0:
|
|
raise ValueError("At least one of datetime, numeric or timedelta must be True.")
|
|
|
|
# Soft conversions
|
|
if datetime:
|
|
# GH 20380, when datetime is beyond year 2262, hence outside
|
|
# bound of nanosecond-resolution 64-bit integers.
|
|
try:
|
|
values = lib.maybe_convert_objects(values, convert_datetime=True)
|
|
except (OutOfBoundsDatetime, ValueError):
|
|
pass
|
|
|
|
if timedelta and is_object_dtype(values.dtype):
|
|
# Object check to ensure only run if previous did not convert
|
|
values = lib.maybe_convert_objects(values, convert_timedelta=True)
|
|
|
|
if numeric and is_object_dtype(values.dtype):
|
|
try:
|
|
converted = lib.maybe_convert_numeric(values, set(), coerce_numeric=True)
|
|
except (ValueError, TypeError):
|
|
pass
|
|
else:
|
|
# If all NaNs, then do not-alter
|
|
values = converted if not isna(converted).all() else values
|
|
values = values.copy() if copy else values
|
|
|
|
return values
|
|
|
|
|
|
def convert_dtypes(
|
|
input_array: AnyArrayLike,
|
|
convert_string: bool = True,
|
|
convert_integer: bool = True,
|
|
convert_boolean: bool = True,
|
|
convert_floating: bool = True,
|
|
) -> Dtype:
|
|
"""
|
|
Convert objects to best possible type, and optionally,
|
|
to types supporting ``pd.NA``.
|
|
|
|
Parameters
|
|
----------
|
|
input_array : ExtensionArray, Index, Series or np.ndarray
|
|
convert_string : bool, default True
|
|
Whether object dtypes should be converted to ``StringDtype()``.
|
|
convert_integer : bool, default True
|
|
Whether, if possible, conversion can be done to integer extension types.
|
|
convert_boolean : bool, defaults True
|
|
Whether object dtypes should be converted to ``BooleanDtypes()``.
|
|
convert_floating : bool, defaults True
|
|
Whether, if possible, conversion can be done to floating extension types.
|
|
If `convert_integer` is also True, preference will be give to integer
|
|
dtypes if the floats can be faithfully casted to integers.
|
|
|
|
Returns
|
|
-------
|
|
dtype
|
|
new dtype
|
|
"""
|
|
is_extension = is_extension_array_dtype(input_array.dtype)
|
|
if (
|
|
convert_string or convert_integer or convert_boolean or convert_floating
|
|
) and not is_extension:
|
|
try:
|
|
inferred_dtype = lib.infer_dtype(input_array)
|
|
except ValueError:
|
|
# Required to catch due to Period. Can remove once GH 23553 is fixed
|
|
inferred_dtype = input_array.dtype
|
|
|
|
if not convert_string and is_string_dtype(inferred_dtype):
|
|
inferred_dtype = input_array.dtype
|
|
|
|
if convert_integer:
|
|
target_int_dtype = "Int64"
|
|
|
|
if is_integer_dtype(input_array.dtype):
|
|
from pandas.core.arrays.integer import INT_STR_TO_DTYPE
|
|
|
|
inferred_dtype = INT_STR_TO_DTYPE.get(
|
|
input_array.dtype.name, target_int_dtype
|
|
)
|
|
if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
|
|
input_array.dtype
|
|
):
|
|
inferred_dtype = target_int_dtype
|
|
|
|
else:
|
|
if is_integer_dtype(inferred_dtype):
|
|
inferred_dtype = input_array.dtype
|
|
|
|
if convert_floating:
|
|
if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
|
|
input_array.dtype
|
|
):
|
|
from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE
|
|
|
|
inferred_float_dtype = FLOAT_STR_TO_DTYPE.get(
|
|
input_array.dtype.name, "Float64"
|
|
)
|
|
# if we could also convert to integer, check if all floats
|
|
# are actually integers
|
|
if convert_integer:
|
|
arr = input_array[notna(input_array)]
|
|
if (arr.astype(int) == arr).all():
|
|
inferred_dtype = "Int64"
|
|
else:
|
|
inferred_dtype = inferred_float_dtype
|
|
else:
|
|
inferred_dtype = inferred_float_dtype
|
|
else:
|
|
if is_float_dtype(inferred_dtype):
|
|
inferred_dtype = input_array.dtype
|
|
|
|
if convert_boolean:
|
|
if is_bool_dtype(input_array.dtype):
|
|
inferred_dtype = "boolean"
|
|
else:
|
|
if isinstance(inferred_dtype, str) and inferred_dtype == "boolean":
|
|
inferred_dtype = input_array.dtype
|
|
|
|
else:
|
|
inferred_dtype = input_array.dtype
|
|
|
|
return inferred_dtype
|
|
|
|
|
|
def maybe_castable(arr: np.ndarray) -> bool:
|
|
# return False to force a non-fastpath
|
|
|
|
assert isinstance(arr, np.ndarray) # GH 37024
|
|
|
|
# check datetime64[ns]/timedelta64[ns] are valid
|
|
# otherwise try to coerce
|
|
kind = arr.dtype.kind
|
|
if kind == "M":
|
|
return is_datetime64_ns_dtype(arr.dtype)
|
|
elif kind == "m":
|
|
return is_timedelta64_ns_dtype(arr.dtype)
|
|
|
|
return arr.dtype.name not in POSSIBLY_CAST_DTYPES
|
|
|
|
|
|
def maybe_infer_to_datetimelike(
|
|
value: Union[ArrayLike, Scalar], convert_dates: bool = False
|
|
):
|
|
"""
|
|
we might have a array (or single object) that is datetime like,
|
|
and no dtype is passed don't change the value unless we find a
|
|
datetime/timedelta set
|
|
|
|
this is pretty strict in that a datetime/timedelta is REQUIRED
|
|
in addition to possible nulls/string likes
|
|
|
|
Parameters
|
|
----------
|
|
value : np.array / Series / Index / list-like
|
|
convert_dates : bool, default False
|
|
if True try really hard to convert dates (such as datetime.date), other
|
|
leave inferred dtype 'date' alone
|
|
|
|
"""
|
|
# TODO: why not timedelta?
|
|
if isinstance(
|
|
value, (ABCDatetimeIndex, ABCPeriodIndex, ABCDatetimeArray, ABCPeriodArray)
|
|
):
|
|
return value
|
|
|
|
v = value
|
|
|
|
if not is_list_like(v):
|
|
v = [v]
|
|
v = np.array(v, copy=False)
|
|
|
|
# we only care about object dtypes
|
|
if not is_object_dtype(v):
|
|
return value
|
|
|
|
shape = v.shape
|
|
if v.ndim != 1:
|
|
v = v.ravel()
|
|
|
|
if not len(v):
|
|
return value
|
|
|
|
def try_datetime(v):
|
|
# safe coerce to datetime64
|
|
try:
|
|
# GH19671
|
|
v = tslib.array_to_datetime(v, require_iso8601=True, errors="raise")[0]
|
|
except ValueError:
|
|
|
|
# we might have a sequence of the same-datetimes with tz's
|
|
# if so coerce to a DatetimeIndex; if they are not the same,
|
|
# then these stay as object dtype, xref GH19671
|
|
from pandas import DatetimeIndex
|
|
|
|
try:
|
|
|
|
values, tz = conversion.datetime_to_datetime64(v)
|
|
return DatetimeIndex(values).tz_localize("UTC").tz_convert(tz=tz)
|
|
except (ValueError, TypeError):
|
|
pass
|
|
|
|
except Exception:
|
|
pass
|
|
|
|
return v.reshape(shape)
|
|
|
|
def try_timedelta(v):
|
|
# safe coerce to timedelta64
|
|
|
|
# will try first with a string & object conversion
|
|
from pandas import to_timedelta
|
|
|
|
try:
|
|
td_values = to_timedelta(v)
|
|
except ValueError:
|
|
return v.reshape(shape)
|
|
else:
|
|
return np.asarray(td_values).reshape(shape)
|
|
|
|
inferred_type = lib.infer_datetimelike_array(ensure_object(v))
|
|
|
|
if inferred_type == "date" and convert_dates:
|
|
value = try_datetime(v)
|
|
elif inferred_type == "datetime":
|
|
value = try_datetime(v)
|
|
elif inferred_type == "timedelta":
|
|
value = try_timedelta(v)
|
|
elif inferred_type == "nat":
|
|
|
|
# if all NaT, return as datetime
|
|
if isna(v).all():
|
|
value = try_datetime(v)
|
|
else:
|
|
|
|
# We have at least a NaT and a string
|
|
# try timedelta first to avoid spurious datetime conversions
|
|
# e.g. '00:00:01' is a timedelta but technically is also a datetime
|
|
value = try_timedelta(v)
|
|
if lib.infer_dtype(value, skipna=False) in ["mixed"]:
|
|
# cannot skip missing values, as NaT implies that the string
|
|
# is actually a datetime
|
|
value = try_datetime(v)
|
|
|
|
return value
|
|
|
|
|
|
def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]):
|
|
"""
|
|
try to cast the array/value to a datetimelike dtype, converting float
|
|
nan to iNaT
|
|
"""
|
|
from pandas.core.tools.datetimes import to_datetime
|
|
from pandas.core.tools.timedeltas import to_timedelta
|
|
|
|
if dtype is not None:
|
|
is_datetime64 = is_datetime64_dtype(dtype)
|
|
is_datetime64tz = is_datetime64tz_dtype(dtype)
|
|
is_timedelta64 = is_timedelta64_dtype(dtype)
|
|
|
|
if is_datetime64 or is_datetime64tz or is_timedelta64:
|
|
|
|
# Force the dtype if needed.
|
|
msg = (
|
|
f"The '{dtype.name}' dtype has no unit. "
|
|
f"Please pass in '{dtype.name}[ns]' instead."
|
|
)
|
|
|
|
if is_datetime64:
|
|
# unpack e.g. SparseDtype
|
|
dtype = getattr(dtype, "subtype", dtype)
|
|
if not is_dtype_equal(dtype, DT64NS_DTYPE):
|
|
|
|
# pandas supports dtype whose granularity is less than [ns]
|
|
# e.g., [ps], [fs], [as]
|
|
if dtype <= np.dtype("M8[ns]"):
|
|
if dtype.name == "datetime64":
|
|
raise ValueError(msg)
|
|
dtype = DT64NS_DTYPE
|
|
else:
|
|
raise TypeError(
|
|
f"cannot convert datetimelike to dtype [{dtype}]"
|
|
)
|
|
elif is_datetime64tz:
|
|
|
|
# our NaT doesn't support tz's
|
|
# this will coerce to DatetimeIndex with
|
|
# a matching dtype below
|
|
if is_scalar(value) and isna(value):
|
|
value = [value]
|
|
|
|
elif is_timedelta64 and not is_dtype_equal(dtype, TD64NS_DTYPE):
|
|
|
|
# pandas supports dtype whose granularity is less than [ns]
|
|
# e.g., [ps], [fs], [as]
|
|
if dtype <= np.dtype("m8[ns]"):
|
|
if dtype.name == "timedelta64":
|
|
raise ValueError(msg)
|
|
dtype = TD64NS_DTYPE
|
|
else:
|
|
raise TypeError(f"cannot convert timedeltalike to dtype [{dtype}]")
|
|
|
|
if is_scalar(value):
|
|
if value == iNaT or isna(value):
|
|
value = iNaT
|
|
elif not is_sparse(value):
|
|
value = np.array(value, copy=False)
|
|
|
|
# have a scalar array-like (e.g. NaT)
|
|
if value.ndim == 0:
|
|
value = iNaT
|
|
|
|
# we have an array of datetime or timedeltas & nulls
|
|
elif np.prod(value.shape) or not is_dtype_equal(value.dtype, dtype):
|
|
try:
|
|
if is_datetime64:
|
|
value = to_datetime(value, errors="raise")
|
|
# GH 25843: Remove tz information since the dtype
|
|
# didn't specify one
|
|
if value.tz is not None:
|
|
value = value.tz_localize(None)
|
|
value = value._values
|
|
elif is_datetime64tz:
|
|
# The string check can be removed once issue #13712
|
|
# is solved. String data that is passed with a
|
|
# datetime64tz is assumed to be naive which should
|
|
# be localized to the timezone.
|
|
is_dt_string = is_string_dtype(value.dtype)
|
|
value = to_datetime(value, errors="raise").array
|
|
if is_dt_string:
|
|
# Strings here are naive, so directly localize
|
|
value = value.tz_localize(dtype.tz)
|
|
else:
|
|
# Numeric values are UTC at this point,
|
|
# so localize and convert
|
|
value = value.tz_localize("UTC").tz_convert(dtype.tz)
|
|
elif is_timedelta64:
|
|
value = to_timedelta(value, errors="raise")._values
|
|
except OutOfBoundsDatetime:
|
|
raise
|
|
except (AttributeError, ValueError, TypeError):
|
|
pass
|
|
|
|
# coerce datetimelike to object
|
|
elif is_datetime64_dtype(
|
|
getattr(value, "dtype", None)
|
|
) and not is_datetime64_dtype(dtype):
|
|
if is_object_dtype(dtype):
|
|
if value.dtype != DT64NS_DTYPE:
|
|
value = value.astype(DT64NS_DTYPE)
|
|
ints = np.asarray(value).view("i8")
|
|
return ints_to_pydatetime(ints)
|
|
|
|
# we have a non-castable dtype that was passed
|
|
raise TypeError(f"Cannot cast datetime64 to {dtype}")
|
|
|
|
else:
|
|
|
|
is_array = isinstance(value, np.ndarray)
|
|
|
|
# catch a datetime/timedelta that is not of ns variety
|
|
# and no coercion specified
|
|
if is_array and value.dtype.kind in ["M", "m"]:
|
|
dtype = value.dtype
|
|
|
|
if dtype.kind == "M" and dtype != DT64NS_DTYPE:
|
|
value = conversion.ensure_datetime64ns(value)
|
|
|
|
elif dtype.kind == "m" and dtype != TD64NS_DTYPE:
|
|
value = conversion.ensure_timedelta64ns(value)
|
|
|
|
# only do this if we have an array and the dtype of the array is not
|
|
# setup already we are not an integer/object, so don't bother with this
|
|
# conversion
|
|
elif not (
|
|
is_array
|
|
and not (
|
|
issubclass(value.dtype.type, np.integer) or value.dtype == np.object_
|
|
)
|
|
):
|
|
value = maybe_infer_to_datetimelike(value)
|
|
|
|
return value
|
|
|
|
|
|
def find_common_type(types: List[DtypeObj]) -> DtypeObj:
|
|
"""
|
|
Find a common data type among the given dtypes.
|
|
|
|
Parameters
|
|
----------
|
|
types : list of dtypes
|
|
|
|
Returns
|
|
-------
|
|
pandas extension or numpy dtype
|
|
|
|
See Also
|
|
--------
|
|
numpy.find_common_type
|
|
|
|
"""
|
|
if len(types) == 0:
|
|
raise ValueError("no types given")
|
|
|
|
first = types[0]
|
|
|
|
# workaround for find_common_type([np.dtype('datetime64[ns]')] * 2)
|
|
# => object
|
|
if all(is_dtype_equal(first, t) for t in types[1:]):
|
|
return first
|
|
|
|
# get unique types (dict.fromkeys is used as order-preserving set())
|
|
types = list(dict.fromkeys(types).keys())
|
|
|
|
if any(isinstance(t, ExtensionDtype) for t in types):
|
|
for t in types:
|
|
if isinstance(t, ExtensionDtype):
|
|
res = t._get_common_dtype(types)
|
|
if res is not None:
|
|
return res
|
|
return np.dtype("object")
|
|
|
|
# take lowest unit
|
|
if all(is_datetime64_dtype(t) for t in types):
|
|
return np.dtype("datetime64[ns]")
|
|
if all(is_timedelta64_dtype(t) for t in types):
|
|
return np.dtype("timedelta64[ns]")
|
|
|
|
# don't mix bool / int or float or complex
|
|
# this is different from numpy, which casts bool with float/int as int
|
|
has_bools = any(is_bool_dtype(t) for t in types)
|
|
if has_bools:
|
|
for t in types:
|
|
if is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t):
|
|
return np.dtype("object")
|
|
|
|
return np.find_common_type(types, [])
|
|
|
|
|
|
def construct_1d_arraylike_from_scalar(
|
|
value: Scalar, length: int, dtype: DtypeObj
|
|
) -> ArrayLike:
|
|
"""
|
|
create a np.ndarray / pandas type of specified shape and dtype
|
|
filled with values
|
|
|
|
Parameters
|
|
----------
|
|
value : scalar value
|
|
length : int
|
|
dtype : pandas_dtype or np.dtype
|
|
|
|
Returns
|
|
-------
|
|
np.ndarray / pandas type of length, filled with value
|
|
|
|
"""
|
|
if is_extension_array_dtype(dtype):
|
|
cls = dtype.construct_array_type()
|
|
subarr = cls._from_sequence([value] * length, dtype=dtype)
|
|
|
|
else:
|
|
|
|
if length and is_integer_dtype(dtype) and isna(value):
|
|
# coerce if we have nan for an integer dtype
|
|
dtype = np.dtype("float64")
|
|
elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"):
|
|
# we need to coerce to object dtype to avoid
|
|
# to allow numpy to take our string as a scalar value
|
|
dtype = np.dtype("object")
|
|
if not isna(value):
|
|
value = ensure_str(value)
|
|
elif dtype.kind in ["M", "m"] and is_valid_nat_for_dtype(value, dtype):
|
|
# GH36541: can't fill array directly with pd.NaT
|
|
# > np.empty(10, dtype="datetime64[64]").fill(pd.NaT)
|
|
# ValueError: cannot convert float NaN to integer
|
|
value = dtype.type("NaT", "ns")
|
|
|
|
subarr = np.empty(length, dtype=dtype)
|
|
subarr.fill(value)
|
|
|
|
return subarr
|
|
|
|
|
|
def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray:
|
|
"""
|
|
Transform any list-like object in a 1-dimensional numpy array of object
|
|
dtype.
|
|
|
|
Parameters
|
|
----------
|
|
values : any iterable which has a len()
|
|
|
|
Raises
|
|
------
|
|
TypeError
|
|
* If `values` does not have a len()
|
|
|
|
Returns
|
|
-------
|
|
1-dimensional numpy array of dtype object
|
|
"""
|
|
# numpy will try to interpret nested lists as further dimensions, hence
|
|
# making a 1D array that contains list-likes is a bit tricky:
|
|
result = np.empty(len(values), dtype="object")
|
|
result[:] = values
|
|
return result
|
|
|
|
|
|
def construct_1d_ndarray_preserving_na(
|
|
values: Sequence, dtype: Optional[DtypeObj] = None, copy: bool = False
|
|
) -> np.ndarray:
|
|
"""
|
|
Construct a new ndarray, coercing `values` to `dtype`, preserving NA.
|
|
|
|
Parameters
|
|
----------
|
|
values : Sequence
|
|
dtype : numpy.dtype, optional
|
|
copy : bool, default False
|
|
Note that copies may still be made with ``copy=False`` if casting
|
|
is required.
|
|
|
|
Returns
|
|
-------
|
|
arr : ndarray[dtype]
|
|
|
|
Examples
|
|
--------
|
|
>>> np.array([1.0, 2.0, None], dtype='str')
|
|
array(['1.0', '2.0', 'None'], dtype='<U4')
|
|
|
|
>>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype=np.dtype('str'))
|
|
array(['1.0', '2.0', None], dtype=object)
|
|
"""
|
|
|
|
if dtype is not None and dtype.kind == "U":
|
|
subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy)
|
|
else:
|
|
subarr = np.array(values, dtype=dtype, copy=copy)
|
|
|
|
return subarr
|
|
|
|
|
|
def maybe_cast_to_integer_array(arr, dtype: Dtype, copy: bool = False):
|
|
"""
|
|
Takes any dtype and returns the casted version, raising for when data is
|
|
incompatible with integer/unsigned integer dtypes.
|
|
|
|
.. versionadded:: 0.24.0
|
|
|
|
Parameters
|
|
----------
|
|
arr : array-like
|
|
The array to cast.
|
|
dtype : str, np.dtype
|
|
The integer dtype to cast the array to.
|
|
copy: bool, default False
|
|
Whether to make a copy of the array before returning.
|
|
|
|
Returns
|
|
-------
|
|
ndarray
|
|
Array of integer or unsigned integer dtype.
|
|
|
|
Raises
|
|
------
|
|
OverflowError : the dtype is incompatible with the data
|
|
ValueError : loss of precision has occurred during casting
|
|
|
|
Examples
|
|
--------
|
|
If you try to coerce negative values to unsigned integers, it raises:
|
|
|
|
>>> pd.Series([-1], dtype="uint64")
|
|
Traceback (most recent call last):
|
|
...
|
|
OverflowError: Trying to coerce negative values to unsigned integers
|
|
|
|
Also, if you try to coerce float values to integers, it raises:
|
|
|
|
>>> pd.Series([1, 2, 3.5], dtype="int64")
|
|
Traceback (most recent call last):
|
|
...
|
|
ValueError: Trying to coerce float values to integers
|
|
"""
|
|
assert is_integer_dtype(dtype)
|
|
|
|
try:
|
|
if not hasattr(arr, "astype"):
|
|
casted = np.array(arr, dtype=dtype, copy=copy)
|
|
else:
|
|
casted = arr.astype(dtype, copy=copy)
|
|
except OverflowError as err:
|
|
raise OverflowError(
|
|
"The elements provided in the data cannot all be "
|
|
f"casted to the dtype {dtype}"
|
|
) from err
|
|
|
|
if np.array_equal(arr, casted):
|
|
return casted
|
|
|
|
# We do this casting to allow for proper
|
|
# data and dtype checking.
|
|
#
|
|
# We didn't do this earlier because NumPy
|
|
# doesn't handle `uint64` correctly.
|
|
arr = np.asarray(arr)
|
|
|
|
if is_unsigned_integer_dtype(dtype) and (arr < 0).any():
|
|
raise OverflowError("Trying to coerce negative values to unsigned integers")
|
|
|
|
if is_float_dtype(arr) or is_object_dtype(arr):
|
|
raise ValueError("Trying to coerce float values to integers")
|
|
|
|
|
|
def convert_scalar_for_putitemlike(scalar: Scalar, dtype: np.dtype) -> Scalar:
|
|
"""
|
|
Convert datetimelike scalar if we are setting into a datetime64
|
|
or timedelta64 ndarray.
|
|
|
|
Parameters
|
|
----------
|
|
scalar : scalar
|
|
dtype : np.dtype
|
|
|
|
Returns
|
|
-------
|
|
scalar
|
|
"""
|
|
if dtype.kind == "m":
|
|
if isinstance(scalar, (timedelta, np.timedelta64)):
|
|
# We have to cast after asm8 in case we have NaT
|
|
return Timedelta(scalar).asm8.view("timedelta64[ns]")
|
|
elif scalar is None or scalar is NaT or (is_float(scalar) and np.isnan(scalar)):
|
|
return np.timedelta64("NaT", "ns")
|
|
if dtype.kind == "M":
|
|
if isinstance(scalar, (date, np.datetime64)):
|
|
# Note: we include date, not just datetime
|
|
return Timestamp(scalar).to_datetime64()
|
|
elif scalar is None or scalar is NaT or (is_float(scalar) and np.isnan(scalar)):
|
|
return np.datetime64("NaT", "ns")
|
|
else:
|
|
validate_numeric_casting(dtype, scalar)
|
|
return scalar
|
|
|
|
|
|
def validate_numeric_casting(dtype: np.dtype, value: Scalar) -> None:
|
|
"""
|
|
Check that we can losslessly insert the given value into an array
|
|
with the given dtype.
|
|
|
|
Parameters
|
|
----------
|
|
dtype : np.dtype
|
|
value : scalar
|
|
|
|
Raises
|
|
------
|
|
ValueError
|
|
"""
|
|
if issubclass(dtype.type, (np.integer, np.bool_)):
|
|
if is_float(value) and np.isnan(value):
|
|
raise ValueError("Cannot assign nan to integer series")
|
|
|
|
if issubclass(dtype.type, (np.integer, np.floating, complex)) and not issubclass(
|
|
dtype.type, np.bool_
|
|
):
|
|
if is_bool(value):
|
|
raise ValueError("Cannot assign bool to float/integer series")
|