"""
Routines for casting.
"""
from contextlib import suppress
from datetime import date, datetime, timedelta
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    List,
    Optional,
    Sequence,
    Set,
    Sized,
    Tuple,
    Type,
    Union,
)

import numpy as np
from pandas._libs import lib, tslib, tslibs
from pandas._libs.tslibs import (
    NaT,
    OutOfBoundsDatetime,
    Period,
    Timedelta,
    Timestamp,
    conversion,
    iNaT,
    ints_to_pydatetime,
    ints_to_pytimedelta,
)
from pandas._libs.tslibs.timezones import tz_compare
from pandas._typing import AnyArrayLike, ArrayLike, Dtype, DtypeObj, Scalar
from pandas.util._validators import validate_bool_kwarg
from pandas.core.dtypes.common import (
    DT64NS_DTYPE,
    INT64_DTYPE,
    POSSIBLY_CAST_DTYPES,
    TD64NS_DTYPE,
    ensure_int8,
    ensure_int16,
    ensure_int32,
    ensure_int64,
    ensure_object,
    ensure_str,
    is_bool,
    is_bool_dtype,
    is_categorical_dtype,
    is_complex,
    is_complex_dtype,
    is_datetime64_dtype,
    is_datetime64_ns_dtype,
    is_datetime64tz_dtype,
    is_datetime_or_timedelta_dtype,
    is_dtype_equal,
    is_extension_array_dtype,
    is_float,
    is_float_dtype,
    is_integer,
    is_integer_dtype,
    is_numeric_dtype,
    is_object_dtype,
    is_scalar,
    is_sparse,
    is_string_dtype,
    is_timedelta64_dtype,
    is_timedelta64_ns_dtype,
    is_unsigned_integer_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
    DatetimeTZDtype,
    ExtensionDtype,
    IntervalDtype,
    PeriodDtype,
)
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCDatetimeArray,
    ABCDatetimeIndex,
    ABCExtensionArray,
    ABCPeriodArray,
    ABCPeriodIndex,
    ABCSeries,
)
from pandas.core.dtypes.inference import is_list_like
from pandas.core.dtypes.missing import (
    is_valid_nat_for_dtype,
    isna,
    na_value_for_dtype,
    notna,
)

if TYPE_CHECKING:
    from pandas import Series
    from pandas.core.arrays import ExtensionArray
    from pandas.core.indexes.base import Index

_int8_max = np.iinfo(np.int8).max
_int16_max = np.iinfo(np.int16).max
_int32_max = np.iinfo(np.int32).max
_int64_max = np.iinfo(np.int64).max


def maybe_convert_platform(values):
    """ try to do platform conversion, allow ndarray or list here """
    if isinstance(values, (list, tuple, range)):
        values = construct_1d_object_array_from_listlike(values)
    if getattr(values, "dtype", None) == np.object_:
        if hasattr(values, "_values"):
            values = values._values
        values = lib.maybe_convert_objects(values)

    return values


def is_nested_object(obj) -> bool:
    """
    return a boolean if we have a nested object, e.g. a Series with 1 or
    more Series elements

    This may not necessarily be performant.

    """
    if isinstance(obj, ABCSeries) and is_object_dtype(obj.dtype):
        if any(isinstance(v, ABCSeries) for v in obj._values):
            return True
    return False


def maybe_box_datetimelike(value: Scalar, dtype: Optional[Dtype] = None) -> Scalar:
    """
    Cast scalar to Timestamp or Timedelta if scalar is datetime-like
    and dtype is not object.

    Parameters
    ----------
    value : scalar
    dtype : Dtype, optional

    Returns
    -------
    scalar
    """
    if dtype == object:
        pass
    elif isinstance(value, (np.datetime64, datetime)):
        value = tslibs.Timestamp(value)
    elif isinstance(value, (np.timedelta64, timedelta)):
        value = tslibs.Timedelta(value)

    return value
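
# Illustrative usage (editorial sketch, not pandas source; outputs assume
# this 1.x-era private API):
# >>> maybe_box_datetimelike(np.datetime64("2021-01-01"))
# Timestamp('2021-01-01 00:00:00')
# >>> maybe_box_datetimelike(np.datetime64("2021-01-01"), dtype=object)
# numpy.datetime64('2021-01-01')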


def maybe_downcast_to_dtype(result, dtype: Union[str, np.dtype]):
    """
    try to cast to the specified dtype (e.g. convert back to bool/int
    or could be an astype of float64->float32)
    """
    do_round = False

    if is_scalar(result):
        return result
    elif isinstance(result, ABCDataFrame):
        # occurs in pivot_table doctest
        return result

    if isinstance(dtype, str):
        if dtype == "infer":
            inferred_type = lib.infer_dtype(ensure_object(result), skipna=False)
            if inferred_type == "boolean":
                dtype = "bool"
            elif inferred_type == "integer":
                dtype = "int64"
            elif inferred_type == "datetime64":
                dtype = "datetime64[ns]"
            elif inferred_type == "timedelta64":
                dtype = "timedelta64[ns]"

            # try to upcast here
            elif inferred_type == "floating":
                dtype = "int64"
                if issubclass(result.dtype.type, np.number):
                    do_round = True

            else:
                dtype = "object"

        dtype = np.dtype(dtype)
    elif dtype.type is Period:
        from pandas.core.arrays import PeriodArray

        with suppress(TypeError):
            # e.g. TypeError: int() argument must be a string, a
            # bytes-like object or a number, not 'Period'
            return PeriodArray(result, freq=dtype.freq)

    converted = maybe_downcast_numeric(result, dtype, do_round)
    if converted is not result:
        return converted

    # a datetimelike
    # GH12821, iNaT is cast to float
    if dtype.kind in ["M", "m"] and result.dtype.kind in ["i", "f"]:
        if hasattr(dtype, "tz"):
            # not a numpy dtype
            if dtype.tz:
                # convert to datetime and change timezone
                from pandas import to_datetime

                result = to_datetime(result).tz_localize("utc")
                result = result.tz_convert(dtype.tz)
        else:
            result = result.astype(dtype)

    return result


def maybe_downcast_numeric(result, dtype: DtypeObj, do_round: bool = False):
    """
    Subset of maybe_downcast_to_dtype restricted to numeric dtypes.

    Parameters
    ----------
    result : ndarray or ExtensionArray
    dtype : np.dtype or ExtensionDtype
    do_round : bool

    Returns
    -------
    ndarray or ExtensionArray
    """
    if not isinstance(dtype, np.dtype):
        # e.g. SparseDtype has no itemsize attr
        return result

    if isinstance(result, list):
        # reached via groupby.agg._ohlc; really this should be handled earlier
        result = np.array(result)

    def trans(x):
        if do_round:
            return x.round()
        return x

    if dtype.kind == result.dtype.kind:
        # don't allow upcasts here (except if empty)
        if result.dtype.itemsize <= dtype.itemsize and result.size:
            return result

    if is_bool_dtype(dtype) or is_integer_dtype(dtype):

        if not result.size:
            # if we don't have any elements, just astype it
            return trans(result).astype(dtype)

        # do a test on the first element, if it fails then we are done
        r = result.ravel()
        arr = np.array([r[0]])

        if isna(arr).any():
            # if we have any nulls, then we are done
            return result

        elif not isinstance(r[0], (np.integer, np.floating, int, float, bool)):
            # a comparable, e.g. a Decimal may slip in here
            return result

        if (
            issubclass(result.dtype.type, (np.object_, np.number))
            and notna(result).all()
        ):
            new_result = trans(result).astype(dtype)
            if new_result.dtype.kind == "O" or result.dtype.kind == "O":
                # np.allclose may raise TypeError on object-dtype
                if (new_result == result).all():
                    return new_result
            else:
                if np.allclose(new_result, result, rtol=0):
                    return new_result

    elif (
        issubclass(dtype.type, np.floating)
        and not is_bool_dtype(result.dtype)
        and not is_string_dtype(result.dtype)
    ):
        return result.astype(dtype)

    return result
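
# Illustrative usage (editorial sketch, not pandas source): a float64 result
# whose values are all integral downcasts losslessly; np.allclose with rtol=0
# guards against precision loss, so non-integral values are left alone.
# >>> maybe_downcast_numeric(np.array([1.0, 2.0]), np.dtype(np.int64))
# array([1, 2])
# >>> maybe_downcast_numeric(np.array([1.5, 2.0]), np.dtype(np.int64))
# array([1.5, 2. ])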


def maybe_cast_result(
    result: ArrayLike, obj: "Series", numeric_only: bool = False, how: str = ""
) -> ArrayLike:
    """
    Try casting result to a different type if appropriate

    Parameters
    ----------
    result : array-like
        Result to cast.
    obj : Series
        Input Series from which result was calculated.
    numeric_only : bool, default False
        Whether to cast only numerics or datetimes as well.
    how : str, default ""
        How the result was computed.

    Returns
    -------
    result : array-like
        Result possibly cast to the dtype.
    """
    dtype = obj.dtype
    dtype = maybe_cast_result_dtype(dtype, how)

    assert not is_scalar(result)

    if (
        is_extension_array_dtype(dtype)
        and not is_categorical_dtype(dtype)
        and dtype.kind != "M"
    ):
        # We have to special case categorical so as not to upcast
        # things like counts back to categorical
        cls = dtype.construct_array_type()
        result = maybe_cast_to_extension_array(cls, result, dtype=dtype)

    elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
        result = maybe_downcast_to_dtype(result, dtype)

    return result


def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj:
    """
    Get the desired dtype of a result based on the
    input dtype and how it was computed.

    Parameters
    ----------
    dtype : DtypeObj
        Input dtype.
    how : str
        How the result was computed.

    Returns
    -------
    DtypeObj
        The desired dtype of the result.
    """
    from pandas.core.arrays.boolean import BooleanDtype
    from pandas.core.arrays.floating import Float64Dtype
    from pandas.core.arrays.integer import Int64Dtype, _IntegerDtype

    if how in ["add", "cumsum", "sum", "prod"]:
        if dtype == np.dtype(bool):
            return np.dtype(np.int64)
        elif isinstance(dtype, (BooleanDtype, _IntegerDtype)):
            return Int64Dtype()
    elif how in ["mean", "median", "var"] and isinstance(
        dtype, (BooleanDtype, _IntegerDtype)
    ):
        return Float64Dtype()
    return dtype
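
# Illustrative usage (editorial sketch, not pandas source): summing booleans
# widens to int64 so counts don't overflow; unrelated dtypes pass through.
# >>> maybe_cast_result_dtype(np.dtype(bool), "sum")
# dtype('int64')
# >>> maybe_cast_result_dtype(np.dtype(np.float64), "sum")
# dtype('float64')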


def maybe_cast_to_extension_array(
    cls: Type["ExtensionArray"], obj: ArrayLike, dtype: Optional[ExtensionDtype] = None
) -> ArrayLike:
    """
    Call to `_from_sequence` that returns the object unchanged on Exception.

    Parameters
    ----------
    cls : class, subclass of ExtensionArray
    obj : arraylike
        Values to pass to cls._from_sequence
    dtype : ExtensionDtype, optional

    Returns
    -------
    ExtensionArray or obj
    """
    from pandas.core.arrays.string_ import StringArray
    from pandas.core.arrays.string_arrow import ArrowStringArray

    assert isinstance(cls, type), f"must pass a type: {cls}"
    assertion_msg = f"must pass a subclass of ExtensionArray: {cls}"
    assert issubclass(cls, ABCExtensionArray), assertion_msg

    # Everything can be converted to StringArrays, but we may not want to convert
    if (
        issubclass(cls, (StringArray, ArrowStringArray))
        and lib.infer_dtype(obj) != "string"
    ):
        return obj

    try:
        result = cls._from_sequence(obj, dtype=dtype)
    except Exception:
        # We can't predict what downstream EA constructors may raise
        result = obj
    return result


def maybe_upcast_putmask(
    result: np.ndarray, mask: np.ndarray, other: Scalar
) -> Tuple[np.ndarray, bool]:
    """
    A safe version of putmask that potentially upcasts the result.

    The result is replaced with the first N elements of other,
    where N is the number of True values in mask.
    If the length of other is shorter than N, other will be repeated.

    Parameters
    ----------
    result : ndarray
        The destination array. This will be mutated in-place if no upcasting is
        necessary.
    mask : boolean ndarray
    other : scalar
        The source value.

    Returns
    -------
    result : ndarray
    changed : bool
        Set to true if the result array was upcasted.

    Examples
    --------
    >>> arr = np.arange(1, 6)
    >>> mask = np.array([False, True, False, True, True])
    >>> result, _ = maybe_upcast_putmask(arr, mask, False)
    >>> result
    array([1, 0, 3, 0, 0])
    """
    if not isinstance(result, np.ndarray):
        raise ValueError("The result input must be a ndarray.")
    if not is_scalar(other):
        # We _could_ support non-scalar other, but until we have a compelling
        # use case, we assume away the possibility.
        raise ValueError("other must be a scalar")

    if mask.any():
        # Two conversions for date-like dtypes that can't be done automatically
        # in np.place:
        #   NaN -> NaT
        #   integer or integer array -> date-like array
        if result.dtype.kind in ["m", "M"]:
            if isna(other):
                other = result.dtype.type("nat")
            elif is_integer(other):
                other = np.array(other, dtype=result.dtype)

        def changeit():
            # we are forced to change the dtype of the result as the input
            # isn't compatible
            r, _ = maybe_upcast(result, fill_value=other, copy=True)
            np.place(r, mask, other)
            return r, True

        # we want to decide whether place will work
        # if we have nans in the False portion of our mask then we need to
        # upcast (possibly), otherwise we DON'T want to upcast (e.g. if we
        # have values, say integers, in the success portion then it's ok to not
        # upcast)
        new_dtype, _ = maybe_promote(result.dtype, other)
        if new_dtype != result.dtype:
            # we have a scalar or len 0 ndarray
            # and it's nan and we are changing some values
            if isna(other):
                return changeit()

        try:
            np.place(result, mask, other)
        except TypeError:
            # e.g. int-dtype result and float-dtype other
            return changeit()

    return result, False
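
# Illustrative sketch complementing the docstring example (editorial, not
# pandas source): masking an int array with np.nan forces the upcast path.
# >>> maybe_upcast_putmask(np.arange(3), np.array([True, False, True]), np.nan)
# (array([nan,  1., nan]), True)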


def maybe_casted_values(
    index: "Index", codes: Optional[np.ndarray] = None
) -> ArrayLike:
    """
    Convert an index, given directly or as a pair (level, code), to a 1D array.

    Parameters
    ----------
    index : Index
    codes : np.ndarray[intp] or None, default None

    Returns
    -------
    ExtensionArray or ndarray
        If codes is `None`, the values of `index`.
        If codes is passed, an array obtained by taking from `index` the indices
        contained in `codes`.
    """
    values = index._values
    if values.dtype == np.object_:
        values = lib.maybe_convert_objects(values)

    # if we have the codes, extract the values with a mask
    if codes is not None:
        mask: np.ndarray = codes == -1

        if mask.size > 0 and mask.all():
            # we can have situations where the whole mask is -1,
            # meaning there is nothing found in codes, so make all nan's
            dtype = index.dtype
            fill_value = na_value_for_dtype(dtype)
            values = construct_1d_arraylike_from_scalar(fill_value, len(mask), dtype)
        else:
            values = values.take(codes)

            if mask.any():
                if isinstance(values, np.ndarray):
                    values, _ = maybe_upcast_putmask(values, mask, np.nan)
                else:
                    values[mask] = np.nan

    return values


def maybe_promote(dtype, fill_value=np.nan):
    """
    Find the minimal dtype that can hold both the given dtype and fill_value.

    Parameters
    ----------
    dtype : np.dtype or ExtensionDtype
    fill_value : scalar, default np.nan

    Returns
    -------
    dtype
        Upcasted from dtype argument if necessary.
    fill_value
        Upcasted from fill_value argument if necessary.
    """
    if not is_scalar(fill_value) and not is_object_dtype(dtype):
        # with object dtype there is nothing to promote, and the user can
        # pass pretty much any weird fill_value they like
        raise ValueError("fill_value must be a scalar")

    # if we passed an array here, determine the fill value by dtype
    if isinstance(fill_value, np.ndarray):
        if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)):
            fill_value = fill_value.dtype.type("NaT", "ns")
        else:
            # we need to change to object type as our
            # fill_value is of object type
            if fill_value.dtype == np.object_:
                dtype = np.dtype(np.object_)
            fill_value = np.nan

    if dtype == np.object_ or dtype.kind in ["U", "S"]:
        # We treat string-like dtypes as object, and _always_ fill
        # with np.nan
        fill_value = np.nan
        dtype = np.dtype(np.object_)

    # returns tuple of (dtype, fill_value)
    if issubclass(dtype.type, np.datetime64):
        if isinstance(fill_value, datetime) and fill_value.tzinfo is not None:
            # Trying to insert tzaware into tznaive, have to cast to object
            dtype = np.dtype(np.object_)
        elif is_integer(fill_value) or (is_float(fill_value) and not isna(fill_value)):
            dtype = np.dtype(np.object_)
        else:
            try:
                fill_value = Timestamp(fill_value).to_datetime64()
            except (TypeError, ValueError):
                dtype = np.dtype(np.object_)
    elif issubclass(dtype.type, np.timedelta64):
        if (
            is_integer(fill_value)
            or (is_float(fill_value) and not np.isnan(fill_value))
            or isinstance(fill_value, str)
        ):
            # TODO: What about str that can be a timedelta?
            dtype = np.dtype(np.object_)
        else:
            try:
                fv = Timedelta(fill_value)
            except ValueError:
                dtype = np.dtype(np.object_)
            else:
                if fv is NaT:
                    # NaT has no `to_timedelta64` method
                    fill_value = np.timedelta64("NaT", "ns")
                else:
                    fill_value = fv.to_timedelta64()
    elif is_datetime64tz_dtype(dtype):
        if isna(fill_value):
            fill_value = NaT
        elif not isinstance(fill_value, datetime):
            dtype = np.dtype(np.object_)
        elif fill_value.tzinfo is None:
            dtype = np.dtype(np.object_)
        elif not tz_compare(fill_value.tzinfo, dtype.tz):
            # TODO: sure we want to cast here?
            dtype = np.dtype(np.object_)

    elif is_extension_array_dtype(dtype) and isna(fill_value):
        fill_value = dtype.na_value

    elif is_float(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

        elif issubclass(dtype.type, np.integer):
            dtype = np.dtype(np.float64)

        elif dtype.kind == "f":
            mst = np.min_scalar_type(fill_value)
            if mst > dtype:
                # e.g. mst is np.float64 and dtype is np.float32
                dtype = mst

        elif dtype.kind == "c":
            mst = np.min_scalar_type(fill_value)
            dtype = np.promote_types(dtype, mst)

    elif is_bool(fill_value):
        if not issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

    elif is_integer(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

        elif issubclass(dtype.type, np.integer):
            if not np.can_cast(fill_value, dtype):
                # upcast to prevent overflow
                mst = np.min_scalar_type(fill_value)
                dtype = np.promote_types(dtype, mst)
                if dtype.kind == "f":
                    # Case where we disagree with numpy
                    dtype = np.dtype(np.object_)

    elif is_complex(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

        elif issubclass(dtype.type, (np.integer, np.floating)):
            mst = np.min_scalar_type(fill_value)
            dtype = np.promote_types(dtype, mst)

        elif dtype.kind == "c":
            mst = np.min_scalar_type(fill_value)
            if mst > dtype:
                # e.g. mst is np.complex128 and dtype is np.complex64
                dtype = mst

    elif fill_value is None:
        if is_float_dtype(dtype) or is_complex_dtype(dtype):
            fill_value = np.nan
        elif is_integer_dtype(dtype):
            dtype = np.float64
            fill_value = np.nan
        elif is_datetime_or_timedelta_dtype(dtype):
            fill_value = dtype.type("NaT", "ns")
        else:
            dtype = np.dtype(np.object_)
            fill_value = np.nan
    else:
        dtype = np.dtype(np.object_)

    # in case we have a string that looked like a number
    if is_extension_array_dtype(dtype):
        pass
    elif issubclass(np.dtype(dtype).type, (bytes, str)):
        dtype = np.dtype(np.object_)

    fill_value = _ensure_dtype_type(fill_value, dtype)
    return dtype, fill_value
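
# Illustrative usage (editorial sketch, not pandas source): NaN cannot live
# in an int64 array, so the dtype is promoted; a compatible int is not.
# >>> maybe_promote(np.dtype(np.int64), np.nan)
# (dtype('float64'), nan)
# >>> maybe_promote(np.dtype(np.int64), 3)
# (dtype('int64'), 3)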


def _ensure_dtype_type(value, dtype: DtypeObj):
    """
    Ensure that the given value is an instance of the given dtype.

    e.g. if our dtype is np.complex64, we should have an instance of that
    as opposed to a python complex object.

    Parameters
    ----------
    value : object
    dtype : np.dtype or ExtensionDtype

    Returns
    -------
    object
    """
    # Start with exceptions in which we do _not_ cast to numpy types
    if is_extension_array_dtype(dtype):
        return value
    elif dtype == np.object_:
        return value
    elif isna(value):
        # e.g. keep np.nan rather than try to cast to np.float32(np.nan)
        return value

    return dtype.type(value)


def infer_dtype_from(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, Any]:
    """
    Interpret the dtype from a scalar or array.

    Parameters
    ----------
    val : object
    pandas_dtype : bool, default False
        whether to infer dtype including pandas extension types.
        If False, a scalar/array belonging to a pandas extension type is
        inferred as object
    """
    if is_scalar(val):
        return infer_dtype_from_scalar(val, pandas_dtype=pandas_dtype)
    return infer_dtype_from_array(val, pandas_dtype=pandas_dtype)


def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, Any]:
    """
    Interpret the dtype from a scalar.

    Parameters
    ----------
    pandas_dtype : bool, default False
        whether to infer dtype including pandas extension types.
        If False, a scalar belonging to a pandas extension type is
        inferred as object
    """
    dtype: DtypeObj = np.dtype(object)

    # a 1-element ndarray
    if isinstance(val, np.ndarray):
        msg = "invalid ndarray passed to infer_dtype_from_scalar"
        if val.ndim != 0:
            raise ValueError(msg)

        dtype = val.dtype
        val = val.item()

    elif isinstance(val, str):
        # If we create an empty array using a string to infer
        # the dtype, NumPy will only allocate one character per entry
        # so this is kind of bad. Alternately we could use np.repeat
        # instead of np.empty (but then you still don't want things
        # coming out as np.str_!)
        dtype = np.dtype(object)

    elif isinstance(val, (np.datetime64, datetime)):
        val = Timestamp(val)
        if val is NaT or val.tz is None:
            dtype = np.dtype("M8[ns]")
        else:
            if pandas_dtype:
                dtype = DatetimeTZDtype(unit="ns", tz=val.tz)
            else:
                # return datetimetz as object
                return np.dtype(object), val
        val = val.value

    elif isinstance(val, (np.timedelta64, timedelta)):
        val = Timedelta(val).value
        dtype = np.dtype("m8[ns]")

    elif is_bool(val):
        dtype = np.dtype(np.bool_)

    elif is_integer(val):
        if isinstance(val, np.integer):
            dtype = np.dtype(type(val))
        else:
            dtype = np.dtype(np.int64)

        try:
            np.array(val, dtype=dtype)
        except OverflowError:
            dtype = np.array(val).dtype

    elif is_float(val):
        if isinstance(val, np.floating):
            dtype = np.dtype(type(val))
        else:
            dtype = np.dtype(np.float64)

    elif is_complex(val):
        dtype = np.dtype(np.complex_)

    elif pandas_dtype:
        if lib.is_period(val):
            dtype = PeriodDtype(freq=val.freq)
        elif lib.is_interval(val):
            subtype = infer_dtype_from_scalar(val.left, pandas_dtype=True)[0]
            dtype = IntervalDtype(subtype=subtype)

    return dtype, val
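
# Illustrative usage (editorial sketch, not pandas source): plain Python
# scalars map to fixed numpy dtypes; pandas extension scalars are only
# recognised with pandas_dtype=True.
# >>> infer_dtype_from_scalar(1)
# (dtype('int64'), 1)
# >>> infer_dtype_from_scalar(Period("2021-01", freq="M"), pandas_dtype=True)
# (period[M], Period('2021-01', 'M'))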


def dict_compat(d: Dict[Scalar, Scalar]) -> Dict[Scalar, Scalar]:
    """
    Convert datetimelike-keyed dicts to a Timestamp-keyed dict.

    Parameters
    ----------
    d : dict-like object

    Returns
    -------
    dict
    """
    return {maybe_box_datetimelike(key): value for key, value in d.items()}
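
# Illustrative usage (editorial sketch, not pandas source): datetime keys are
# normalised to Timestamps so lookups with either representation match.
# >>> dict_compat({np.datetime64("2021-01-01"): 1})
# {Timestamp('2021-01-01 00:00:00'): 1}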


def infer_dtype_from_array(
    arr, pandas_dtype: bool = False
) -> Tuple[DtypeObj, ArrayLike]:
    """
    Infer the dtype from an array.

    Parameters
    ----------
    arr : array
    pandas_dtype : bool, default False
        whether to infer dtype including pandas extension types.
        If False, an array belonging to a pandas extension type
        is inferred as object

    Returns
    -------
    tuple (numpy-compat/pandas-compat dtype, array)

    Notes
    -----
    If pandas_dtype=False, these infer to numpy dtypes
    exactly, with the exception that mixed / object dtypes
    are not coerced by stringifying or conversion.

    If pandas_dtype=True, datetime64tz-aware/categorical
    types will retain their character.

    Examples
    --------
    >>> np.asarray([1, '1'])
    array(['1', '1'], dtype='<U21')

    >>> infer_dtype_from_array([1, '1'])
    (dtype('O'), [1, '1'])
    """
    if isinstance(arr, np.ndarray):
        return arr.dtype, arr

    if not is_list_like(arr):
        arr = [arr]

    if pandas_dtype and is_extension_array_dtype(arr):
        return arr.dtype, arr

    elif isinstance(arr, ABCSeries):
        return arr.dtype, np.asarray(arr)

    # don't force numpy coerce with nan's
    inferred = lib.infer_dtype(arr, skipna=False)
    if inferred in ["string", "bytes", "mixed", "mixed-integer"]:
        return (np.dtype(np.object_), arr)

    arr = np.asarray(arr)
    return arr.dtype, arr


def maybe_infer_dtype_type(element):
    """
    Try to infer an object's dtype, for use in arithmetic ops.

    Uses `element.dtype` if that's available.
    Objects implementing the iterator protocol are cast to a NumPy array,
    and from there the array's type is used.

    Parameters
    ----------
    element : object
        Possibly has a `.dtype` attribute, and possibly the iterator
        protocol.

    Returns
    -------
    tipo : type

    Examples
    --------
    >>> from collections import namedtuple
    >>> Foo = namedtuple("Foo", "dtype")
    >>> maybe_infer_dtype_type(Foo(np.dtype("i8")))
    dtype('int64')
    """
    tipo = None
    if hasattr(element, "dtype"):
        tipo = element.dtype
    elif is_list_like(element):
        element = np.asarray(element)
        tipo = element.dtype
    return tipo


def maybe_upcast(
    values: ArrayLike,
    fill_value: Scalar = np.nan,
    dtype: Dtype = None,
    copy: bool = False,
) -> Tuple[ArrayLike, Scalar]:
    """
    Provide explicit type promotion and coercion.

    Parameters
    ----------
    values : ndarray or ExtensionArray
        The array that we want to maybe upcast.
    fill_value : what we want to fill with
    dtype : if None, then use the dtype of the values, else coerce to this type
    copy : bool, default False
        If True always make a copy even if no upcast is required.

    Returns
    -------
    values : ndarray or ExtensionArray
        the original array, possibly upcast
    fill_value :
        the fill value, possibly upcast
    """
    if not is_scalar(fill_value) and not is_object_dtype(values.dtype):
        # We allow arbitrary fill values for object dtype
        raise ValueError("fill_value must be a scalar")

    if is_extension_array_dtype(values):
        if copy:
            values = values.copy()
    else:
        if dtype is None:
            dtype = values.dtype
        new_dtype, fill_value = maybe_promote(dtype, fill_value)
        if new_dtype != values.dtype:
            values = values.astype(new_dtype)
        elif copy:
            values = values.copy()

    return values, fill_value


def invalidate_string_dtypes(dtype_set: Set[DtypeObj]):
    """
    Change string like dtypes to object for
    ``DataFrame.select_dtypes()``.
    """
    non_string_dtypes = dtype_set - {np.dtype("S").type, np.dtype("<U").type}
    if non_string_dtypes != dtype_set:
        raise TypeError("string dtypes are not allowed, use 'object' instead")


def coerce_indexer_dtype(indexer, categories):
    """ coerce the indexer input array to the smallest dtype possible """
    length = len(categories)
    if length < _int8_max:
        return ensure_int8(indexer)
    elif length < _int16_max:
        return ensure_int16(indexer)
    elif length < _int32_max:
        return ensure_int32(indexer)
    return ensure_int64(indexer)
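
# Illustrative usage (editorial sketch, not pandas source): the codes dtype
# is chosen from the number of categories, e.g. three categories fit in int8.
# >>> coerce_indexer_dtype(np.array([0, 1, 2]), ["a", "b", "c"]).dtype
# dtype('int8')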


def astype_nansafe(
    arr, dtype: DtypeObj, copy: bool = True, skipna: bool = False
) -> ArrayLike:
    """
    Cast the elements of an array to a given dtype in a nan-safe manner.

    Parameters
    ----------
    arr : ndarray
    dtype : np.dtype
    copy : bool, default True
        If False, a view will be attempted but may fail, if
        e.g. the item sizes don't align.
    skipna : bool, default False
        Whether or not we should skip NaN when casting as a string-type.

    Raises
    ------
    ValueError
        The dtype was a datetime64/timedelta64 dtype, but it had no unit.
    """
    # dispatch on extension dtype if needed
    if is_extension_array_dtype(dtype):
        return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy)

    if not isinstance(dtype, np.dtype):
        dtype = pandas_dtype(dtype)

    if issubclass(dtype.type, str):
        return lib.ensure_string_array(
            arr.ravel(), skipna=skipna, convert_na_value=False
        ).reshape(arr.shape)

    elif is_datetime64_dtype(arr):
        if is_object_dtype(dtype):
            return ints_to_pydatetime(arr.view(np.int64))
        elif dtype == np.int64:
            if isna(arr).any():
                raise ValueError("Cannot convert NaT values to integer")
            return arr.view(dtype)

        # allow frequency conversions
        if dtype.kind == "M":
            return arr.astype(dtype)

        raise TypeError(f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]")

    elif is_timedelta64_dtype(arr):
        if is_object_dtype(dtype):
            return ints_to_pytimedelta(arr.view(np.int64))
        elif dtype == np.int64:
            if isna(arr).any():
                raise ValueError("Cannot convert NaT values to integer")
            return arr.view(dtype)

        if dtype not in [INT64_DTYPE, TD64NS_DTYPE]:
            # allow frequency conversions
            # we return a float here!
            if dtype.kind == "m":
                mask = isna(arr)
                result = arr.astype(dtype).astype(np.float64)
                result[mask] = np.nan
                return result
        elif dtype == TD64NS_DTYPE:
            return arr.astype(TD64NS_DTYPE, copy=copy)

        raise TypeError(f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]")

    elif np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype, np.integer):
        if not np.isfinite(arr).all():
            raise ValueError("Cannot convert non-finite values (NA or inf) to integer")

    elif is_object_dtype(arr):
        # work around NumPy brokenness, #1987
        if np.issubdtype(dtype.type, np.integer):
            return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)

        # if we have a datetime/timedelta array of objects
        # then coerce to a proper dtype and recall astype_nansafe
        elif is_datetime64_dtype(dtype):
            from pandas import to_datetime

            return astype_nansafe(to_datetime(arr).values, dtype, copy=copy)
        elif is_timedelta64_dtype(dtype):
            from pandas import to_timedelta

            return astype_nansafe(to_timedelta(arr)._values, dtype, copy=copy)

    if dtype.name in ("datetime64", "timedelta64"):
        msg = (
            f"The '{dtype.name}' dtype has no unit. Please pass in "
            f"'{dtype.name}[ns]' instead."
        )
        raise ValueError(msg)

    if copy or is_object_dtype(arr) or is_object_dtype(dtype):
        # Explicit copy, or required since NumPy can't view from / to object.
        return arr.astype(dtype, copy=True)

    return arr.view(dtype)
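
# Illustrative usage (editorial sketch, not pandas source): NaT cannot be
# represented as an integer, so the int64 path raises instead of emitting iNaT.
# >>> astype_nansafe(np.array(["2021-01-01", "NaT"], dtype="M8[ns]"), np.dtype(np.int64))
# Traceback (most recent call last):
# ...
# ValueError: Cannot convert NaT values to integer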


def soft_convert_objects(
    values: np.ndarray,
    datetime: bool = True,
    numeric: bool = True,
    timedelta: bool = True,
    copy: bool = True,
):
    """
    Try to coerce datetime, timedelta, and numeric object-dtype columns
    to inferred dtype.

    Parameters
    ----------
    values : np.ndarray[object]
    datetime : bool, default True
    numeric : bool, default True
    timedelta : bool, default True
    copy : bool, default True

    Returns
    -------
    np.ndarray
    """
    validate_bool_kwarg(datetime, "datetime")
    validate_bool_kwarg(numeric, "numeric")
    validate_bool_kwarg(timedelta, "timedelta")
    validate_bool_kwarg(copy, "copy")

    conversion_count = sum((datetime, numeric, timedelta))
    if conversion_count == 0:
        raise ValueError("At least one of datetime, numeric or timedelta must be True.")

    # Soft conversions
    if datetime:
        # GH 20380, when datetime is beyond year 2262, hence outside
        # bound of nanosecond-resolution 64-bit integers.
        try:
            values = lib.maybe_convert_objects(values, convert_datetime=True)
        except (OutOfBoundsDatetime, ValueError):
            pass

    if timedelta and is_object_dtype(values.dtype):
        # Object check to ensure only run if previous did not convert
        values = lib.maybe_convert_objects(values, convert_timedelta=True)

    if numeric and is_object_dtype(values.dtype):
        try:
            converted = lib.maybe_convert_numeric(values, set(), coerce_numeric=True)
        except (ValueError, TypeError):
            pass
        else:
            # If all NaNs, then do not alter
            values = converted if not isna(converted).all() else values

    values = values.copy() if copy else values
    return values
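
# Illustrative usage (editorial sketch, not pandas source; output assumes
# lib.maybe_convert_objects' default numeric handling in this version):
# >>> soft_convert_objects(np.array([1, 2.5], dtype=object))
# array([1. , 2.5])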


def convert_dtypes(
    input_array: AnyArrayLike,
    convert_string: bool = True,
    convert_integer: bool = True,
    convert_boolean: bool = True,
    convert_floating: bool = True,
) -> Dtype:
    """
    Convert objects to best possible type, and optionally,
    to types supporting ``pd.NA``.

    Parameters
    ----------
    input_array : ExtensionArray, Index, Series or np.ndarray
    convert_string : bool, default True
        Whether object dtypes should be converted to ``StringDtype()``.
    convert_integer : bool, default True
        Whether, if possible, conversion can be done to integer extension types.
    convert_boolean : bool, default True
        Whether object dtypes should be converted to ``BooleanDtype()``.
    convert_floating : bool, default True
        Whether, if possible, conversion can be done to floating extension types.
        If `convert_integer` is also True, preference will be given to integer
        dtypes if the floats can be faithfully cast to integers.

    Returns
    -------
    dtype
        new dtype
    """
    is_extension = is_extension_array_dtype(input_array.dtype)
    if (
        convert_string or convert_integer or convert_boolean or convert_floating
    ) and not is_extension:
        try:
            inferred_dtype = lib.infer_dtype(input_array)
        except ValueError:
            # Required to catch due to Period. Can remove once GH 23553 is fixed
            inferred_dtype = input_array.dtype

        if not convert_string and is_string_dtype(inferred_dtype):
            inferred_dtype = input_array.dtype

        if convert_integer:
            target_int_dtype = "Int64"

            if is_integer_dtype(input_array.dtype):
                from pandas.core.arrays.integer import INT_STR_TO_DTYPE

                inferred_dtype = INT_STR_TO_DTYPE.get(
                    input_array.dtype.name, target_int_dtype
                )
            if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
                input_array.dtype
            ):
                inferred_dtype = target_int_dtype

        else:
            if is_integer_dtype(inferred_dtype):
                inferred_dtype = input_array.dtype

        if convert_floating:
            if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
                input_array.dtype
            ):
                from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE

                inferred_float_dtype = FLOAT_STR_TO_DTYPE.get(
                    input_array.dtype.name, "Float64"
                )
                # if we could also convert to integer, check if all floats
                # are actually integers
                if convert_integer:
                    arr = input_array[notna(input_array)]
                    if (arr.astype(int) == arr).all():
                        inferred_dtype = "Int64"
                    else:
                        inferred_dtype = inferred_float_dtype
                else:
                    inferred_dtype = inferred_float_dtype
        else:
            if is_float_dtype(inferred_dtype):
                inferred_dtype = input_array.dtype

        if convert_boolean:
            if is_bool_dtype(input_array.dtype):
                inferred_dtype = "boolean"
        else:
            if isinstance(inferred_dtype, str) and inferred_dtype == "boolean":
                inferred_dtype = input_array.dtype

    else:
        inferred_dtype = input_array.dtype

    return inferred_dtype
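
# Illustrative usage (editorial sketch, not pandas source): a float Series
# whose non-NA values are all integral infers the nullable Int64 dtype when
# both integer and floating conversion are allowed.
# >>> import pandas as pd
# >>> convert_dtypes(pd.Series([1.0, 2.0, np.nan]))
# 'Int64'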


def maybe_castable(arr: np.ndarray) -> bool:
    # return False to force a non-fastpath

    assert isinstance(arr, np.ndarray)  # GH 37024

    # check datetime64[ns]/timedelta64[ns] are valid
    # otherwise try to coerce
    kind = arr.dtype.kind
    if kind == "M":
        return is_datetime64_ns_dtype(arr.dtype)
    elif kind == "m":
        return is_timedelta64_ns_dtype(arr.dtype)

    return arr.dtype.name not in POSSIBLY_CAST_DTYPES


def maybe_infer_to_datetimelike(
    value: Union[ArrayLike, Scalar], convert_dates: bool = False
):
    """
    we might have an array (or a single object) that is datetime-like,
    and no dtype is passed; don't change the value unless we find a
    datetime/timedelta set

    this is pretty strict in that a datetime/timedelta is REQUIRED
    in addition to possible nulls/string likes

    Parameters
    ----------
    value : np.array / Series / Index / list-like
    convert_dates : bool, default False
        if True try really hard to convert dates (such as datetime.date),
        otherwise leave inferred dtype 'date' alone
    """
    # TODO: why not timedelta?
    if isinstance(
        value, (ABCDatetimeIndex, ABCPeriodIndex, ABCDatetimeArray, ABCPeriodArray)
    ):
        return value

    v = value

    if not is_list_like(v):
        v = [v]
    v = np.array(v, copy=False)

    # we only care about object dtypes
    if not is_object_dtype(v):
        return value

    shape = v.shape
    if v.ndim != 1:
        v = v.ravel()

    if not len(v):
        return value

    def try_datetime(v):
        # safe coerce to datetime64
        try:
            # GH19671
            v = tslib.array_to_datetime(v, require_iso8601=True, errors="raise")[0]
        except ValueError:
            # we might have a sequence of the same-datetimes with tz's
            # if so coerce to a DatetimeIndex; if they are not the same,
            # then these stay as object dtype, xref GH19671
            from pandas import DatetimeIndex

            try:
                values, tz = conversion.datetime_to_datetime64(v)
                return DatetimeIndex(values).tz_localize("UTC").tz_convert(tz=tz)
            except (ValueError, TypeError):
                pass

        except Exception:
            pass

        return v.reshape(shape)

    def try_timedelta(v):
        # safe coerce to timedelta64

        # will try first with a string & object conversion
        from pandas import to_timedelta

        try:
            td_values = to_timedelta(v)
        except ValueError:
            return v.reshape(shape)
        else:
            return np.asarray(td_values).reshape(shape)

    inferred_type = lib.infer_datetimelike_array(ensure_object(v))

    if inferred_type == "date" and convert_dates:
        value = try_datetime(v)
    elif inferred_type == "datetime":
        value = try_datetime(v)
    elif inferred_type == "timedelta":
        value = try_timedelta(v)
    elif inferred_type == "nat":

        # if all NaT, return as datetime
        if isna(v).all():
            value = try_datetime(v)
        else:
            # We have at least a NaT and a string
            # try timedelta first to avoid spurious datetime conversions
            # e.g. '00:00:01' is a timedelta but technically is also a datetime
            value = try_timedelta(v)
            if lib.infer_dtype(value, skipna=False) in ["mixed"]:
                # cannot skip missing values, as NaT implies that the string
                # is actually a datetime
                value = try_datetime(v)

    return value


def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]):
    """
    try to cast the array/value to a datetimelike dtype, converting float
    nan to iNaT
    """
    from pandas.core.tools.datetimes import to_datetime
    from pandas.core.tools.timedeltas import to_timedelta

    if dtype is not None:
        is_datetime64 = is_datetime64_dtype(dtype)
        is_datetime64tz = is_datetime64tz_dtype(dtype)
        is_timedelta64 = is_timedelta64_dtype(dtype)

        if is_datetime64 or is_datetime64tz or is_timedelta64:

            # Force the dtype if needed.
            msg = (
                f"The '{dtype.name}' dtype has no unit. "
                f"Please pass in '{dtype.name}[ns]' instead."
            )

            if is_datetime64:
                # unpack e.g. SparseDtype
                dtype = getattr(dtype, "subtype", dtype)
                if not is_dtype_equal(dtype, DT64NS_DTYPE):

                    # pandas supports dtype whose granularity is less than [ns]
                    # e.g., [ps], [fs], [as]
                    if dtype <= np.dtype("M8[ns]"):
                        if dtype.name == "datetime64":
                            raise ValueError(msg)
                        dtype = DT64NS_DTYPE
                    else:
                        raise TypeError(
                            f"cannot convert datetimelike to dtype [{dtype}]"
                        )
            elif is_datetime64tz:

                # our NaT doesn't support tz's
                # this will coerce to DatetimeIndex with
                # a matching dtype below
                if is_scalar(value) and isna(value):
                    value = [value]

            elif is_timedelta64 and not is_dtype_equal(dtype, TD64NS_DTYPE):

                # pandas supports dtype whose granularity is less than [ns]
                # e.g., [ps], [fs], [as]
                if dtype <= np.dtype("m8[ns]"):
                    if dtype.name == "timedelta64":
                        raise ValueError(msg)
                    dtype = TD64NS_DTYPE
                else:
                    raise TypeError(f"cannot convert timedeltalike to dtype [{dtype}]")

            if is_scalar(value):
                if value == iNaT or isna(value):
                    value = iNaT
            elif not is_sparse(value):
                value = np.array(value, copy=False)

                # have a scalar array-like (e.g. NaT)
                if value.ndim == 0:
                    value = iNaT

                # we have an array of datetime or timedeltas & nulls
                elif np.prod(value.shape) or not is_dtype_equal(value.dtype, dtype):
                    try:
                        if is_datetime64:
                            value = to_datetime(value, errors="raise")
                            # GH 25843: Remove tz information since the dtype
                            # didn't specify one
                            if value.tz is not None:
                                value = value.tz_localize(None)
                            value = value._values
                        elif is_datetime64tz:
                            # The string check can be removed once issue #13712
                            # is solved. String data that is passed with a
                            # datetime64tz is assumed to be naive which should
                            # be localized to the timezone.
                            is_dt_string = is_string_dtype(value.dtype)
                            value = to_datetime(value, errors="raise").array
                            if is_dt_string:
                                # Strings here are naive, so directly localize
                                value = value.tz_localize(dtype.tz)
                            else:
                                # Numeric values are UTC at this point,
                                # so localize and convert
                                value = value.tz_localize("UTC").tz_convert(dtype.tz)
                        elif is_timedelta64:
                            value = to_timedelta(value, errors="raise")._values
                    except OutOfBoundsDatetime:
                        raise
                    except (AttributeError, ValueError, TypeError):
                        pass

        # coerce datetimelike to object
        elif is_datetime64_dtype(
            getattr(value, "dtype", None)
        ) and not is_datetime64_dtype(dtype):
            if is_object_dtype(dtype):
                if value.dtype != DT64NS_DTYPE:
                    value = value.astype(DT64NS_DTYPE)
                ints = np.asarray(value).view("i8")
                return ints_to_pydatetime(ints)

            # we have a non-castable dtype that was passed
            raise TypeError(f"Cannot cast datetime64 to {dtype}")

    else:

        is_array = isinstance(value, np.ndarray)

        # catch a datetime/timedelta that is not of ns variety
        # and no coercion specified
        if is_array and value.dtype.kind in ["M", "m"]:
            dtype = value.dtype

            if dtype.kind == "M" and dtype != DT64NS_DTYPE:
                value = conversion.ensure_datetime64ns(value)

            elif dtype.kind == "m" and dtype != TD64NS_DTYPE:
                value = conversion.ensure_timedelta64ns(value)

        # only do this if we have an array and the dtype of the array is not
        # setup already we are not an integer/object, so don't bother with this
        # conversion
        elif not (
            is_array
            and not (
                issubclass(value.dtype.type, np.integer) or value.dtype == np.object_
            )
        ):
            value = maybe_infer_to_datetimelike(value)

    return value


def find_common_type(types: List[DtypeObj]) -> DtypeObj:
    """
    Find a common data type among the given dtypes.

    Parameters
    ----------
    types : list of dtypes

    Returns
    -------
    pandas extension or numpy dtype

    See Also
    --------
    numpy.find_common_type
    """
    if len(types) == 0:
        raise ValueError("no types given")

    first = types[0]

    # workaround for find_common_type([np.dtype('datetime64[ns]')] * 2)
    # => object
    if all(is_dtype_equal(first, t) for t in types[1:]):
        return first

    # get unique types (dict.fromkeys is used as order-preserving set())
    types = list(dict.fromkeys(types).keys())

    if any(isinstance(t, ExtensionDtype) for t in types):
        for t in types:
            if isinstance(t, ExtensionDtype):
                res = t._get_common_dtype(types)
                if res is not None:
                    return res
        return np.dtype("object")

    # take lowest unit
    if all(is_datetime64_dtype(t) for t in types):
        return np.dtype("datetime64[ns]")
    if all(is_timedelta64_dtype(t) for t in types):
        return np.dtype("timedelta64[ns]")

    # don't mix bool / int or float or complex
    # this is different from numpy, which casts bool with float/int as int
    has_bools = any(is_bool_dtype(t) for t in types)
    if has_bools:
        for t in types:
            if is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t):
                return np.dtype("object")

    return np.find_common_type(types, [])
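
# Illustrative usage (editorial sketch, not pandas source): numeric dtypes
# promote as in numpy, but bool mixed with numerics falls back to object.
# >>> find_common_type([np.dtype(np.int64), np.dtype(np.float32)])
# dtype('float64')
# >>> find_common_type([np.dtype(bool), np.dtype(np.int64)])
# dtype('O')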


def construct_1d_arraylike_from_scalar(
    value: Scalar, length: int, dtype: DtypeObj
) -> ArrayLike:
    """
    create a np.ndarray / pandas type of specified shape and dtype
    filled with values

    Parameters
    ----------
    value : scalar value
    length : int
    dtype : pandas_dtype or np.dtype

    Returns
    -------
    np.ndarray / pandas type of length, filled with value
    """

    if is_extension_array_dtype(dtype):
        cls = dtype.construct_array_type()
        subarr = cls._from_sequence([value] * length, dtype=dtype)
    else:

        if length and is_integer_dtype(dtype) and isna(value):
            # coerce if we have nan for an integer dtype
            dtype = np.dtype("float64")
        elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"):
            # coerce to object dtype so that numpy keeps the full string,
            # rather than truncating it to the fixed-width string dtype
            dtype = np.dtype("object")
            if not isna(value):
                value = ensure_str(value)
        elif dtype.kind in ["M", "m"] and is_valid_nat_for_dtype(value, dtype):
            # GH36541: can't fill array directly with pd.NaT
            # > np.empty(10, dtype="datetime64[ns]").fill(pd.NaT)
            # ValueError: cannot convert float NaN to integer
            value = dtype.type("NaT", "ns")

        subarr = np.empty(length, dtype=dtype)
        subarr.fill(value)

    return subarr
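
# Illustrative usage (editorial sketch, not pandas source): NaN with an
# integer dtype silently yields a float64 array, matching how pandas stores
# missing values in numpy-backed columns.
# >>> construct_1d_arraylike_from_scalar(np.nan, 3, np.dtype(np.int64))
# array([nan, nan, nan])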


def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray:
    """
    Transform any list-like object into a 1-dimensional numpy array of object
    dtype.

    Parameters
    ----------
    values : any iterable which has a len()

    Raises
    ------
    TypeError
        * If `values` does not have a len()

    Returns
    -------
    1-dimensional numpy array of dtype object
    """
    # numpy will try to interpret nested lists as further dimensions, hence
    # making a 1D array that contains list-likes is a bit tricky:
    result = np.empty(len(values), dtype="object")
    result[:] = values
    return result


def construct_1d_ndarray_preserving_na(
    values: Sequence, dtype: Optional[DtypeObj] = None, copy: bool = False
) -> np.ndarray:
    """
    Construct a new ndarray, coercing `values` to `dtype`, preserving NA.

    Parameters
    ----------
    values : Sequence
    dtype : numpy.dtype, optional
    copy : bool, default False
        Note that copies may still be made with ``copy=False`` if casting
        is required.

    Returns
    -------
    arr : ndarray[dtype]

    Examples
    --------
    >>> np.array([1.0, 2.0, None], dtype='str')
    array(['1.0', '2.0', 'None'], dtype='<U4')

    >>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype=np.dtype('str'))
    array(['1.0', '2.0', None], dtype=object)
    """

    if dtype is not None and dtype.kind == "U":
        subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy)
    else:
        subarr = np.array(values, dtype=dtype, copy=copy)

    return subarr


def maybe_cast_to_integer_array(arr, dtype: Dtype, copy: bool = False):
    """
    Takes any dtype and returns the casted version, raising when the data is
    incompatible with integer/unsigned integer dtypes.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    arr : array-like
        The array to cast.
    dtype : str, np.dtype
        The integer dtype to cast the array to.
    copy : bool, default False
        Whether to make a copy of the array before returning.

    Returns
    -------
    ndarray
        Array of integer or unsigned integer dtype.

    Raises
    ------
    OverflowError : the dtype is incompatible with the data
    ValueError : loss of precision has occurred during casting

    Examples
    --------
    If you try to coerce negative values to unsigned integers, it raises:

    >>> pd.Series([-1], dtype="uint64")
    Traceback (most recent call last):
        ...
    OverflowError: Trying to coerce negative values to unsigned integers

    Also, if you try to coerce float values to integers, it raises:

    >>> pd.Series([1, 2, 3.5], dtype="int64")
    Traceback (most recent call last):
        ...
    ValueError: Trying to coerce float values to integers
    """
    assert is_integer_dtype(dtype)

    try:
        if not hasattr(arr, "astype"):
            casted = np.array(arr, dtype=dtype, copy=copy)
        else:
            casted = arr.astype(dtype, copy=copy)
    except OverflowError as err:
        raise OverflowError(
            "The elements provided in the data cannot all be "
            f"casted to the dtype {dtype}"
        ) from err

    if np.array_equal(arr, casted):
        return casted

    # We do this casting to allow for proper
    # data and dtype checking.
    #
    # We didn't do this earlier because NumPy
    # doesn't handle `uint64` correctly.
    arr = np.asarray(arr)

    if is_unsigned_integer_dtype(dtype) and (arr < 0).any():
        raise OverflowError("Trying to coerce negative values to unsigned integers")

    if is_float_dtype(arr) or is_object_dtype(arr):
        raise ValueError("Trying to coerce float values to integers")


def convert_scalar_for_putitemlike(scalar: Scalar, dtype: np.dtype) -> Scalar:
    """
    Convert datetimelike scalar if we are setting into a datetime64
    or timedelta64 ndarray.

    Parameters
    ----------
    scalar : scalar
    dtype : np.dtype

    Returns
    -------
    scalar
    """
    if dtype.kind == "m":
        if isinstance(scalar, (timedelta, np.timedelta64)):
            # We have to cast after asm8 in case we have NaT
            return Timedelta(scalar).asm8.view("timedelta64[ns]")
        elif scalar is None or scalar is NaT or (is_float(scalar) and np.isnan(scalar)):
            return np.timedelta64("NaT", "ns")
    if dtype.kind == "M":
        if isinstance(scalar, (date, np.datetime64)):
            # Note: we include date, not just datetime
            return Timestamp(scalar).to_datetime64()
        elif scalar is None or scalar is NaT or (is_float(scalar) and np.isnan(scalar)):
            return np.datetime64("NaT", "ns")
    else:
        validate_numeric_casting(dtype, scalar)
    return scalar


def validate_numeric_casting(dtype: np.dtype, value: Scalar) -> None:
    """
    Check that we can losslessly insert the given value into an array
    with the given dtype.

    Parameters
    ----------
    dtype : np.dtype
    value : scalar

    Raises
    ------
    ValueError
    """
    if issubclass(dtype.type, (np.integer, np.bool_)):
        if is_float(value) and np.isnan(value):
            raise ValueError("Cannot assign nan to integer series")

    if issubclass(dtype.type, (np.integer, np.floating, complex)) and not issubclass(
        dtype.type, np.bool_
    ):
        if is_bool(value):
            raise ValueError("Cannot assign bool to float/integer series")
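
# Illustrative usage (editorial sketch, not pandas source): this guard runs
# before scalar assignment into numpy-backed blocks.
# >>> validate_numeric_casting(np.dtype(np.int64), np.nan)
# Traceback (most recent call last):
# ...
# ValueError: Cannot assign nan to integer series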