Inzynierka/Lib/site-packages/pandas/core/tools/datetimes.py

1273 lines
42 KiB
Python
Raw Normal View History

2023-06-02 12:51:02 +02:00
from __future__ import annotations
from collections import abc
from datetime import datetime
from functools import partial
from itertools import islice
from typing import (
TYPE_CHECKING,
Callable,
Hashable,
List,
Tuple,
TypedDict,
Union,
cast,
overload,
)
import warnings
import numpy as np
from pandas._libs import (
lib,
tslib,
)
from pandas._libs.tslibs import (
OutOfBoundsDatetime,
Timedelta,
Timestamp,
astype_overflowsafe,
get_unit_from_dtype,
iNaT,
is_supported_unit,
nat_strings,
parsing,
timezones as libtimezones,
)
from pandas._libs.tslibs.conversion import precision_from_unit
from pandas._libs.tslibs.parsing import (
DateParseError,
guess_datetime_format,
)
from pandas._libs.tslibs.strptime import array_strptime
from pandas._typing import (
AnyArrayLike,
ArrayLike,
DateTimeErrorChoices,
npt,
)
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.common import (
ensure_object,
is_datetime64_dtype,
is_datetime64tz_dtype,
is_float,
is_integer,
is_integer_dtype,
is_list_like,
is_numeric_dtype,
is_scalar,
)
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCSeries,
)
from pandas.core.dtypes.missing import notna
from pandas.arrays import (
DatetimeArray,
IntegerArray,
PandasArray,
)
from pandas.core import algorithms
from pandas.core.algorithms import unique
from pandas.core.arrays.base import ExtensionArray
from pandas.core.arrays.datetimes import (
maybe_convert_dtype,
objects_to_datetime64ns,
tz_to_dtype,
)
from pandas.core.construction import extract_array
from pandas.core.indexes.base import Index
from pandas.core.indexes.datetimes import DatetimeIndex
if TYPE_CHECKING:
from pandas._libs.tslibs.nattype import NaTType
from pandas._libs.tslibs.timedeltas import UnitChoices
from pandas import (
DataFrame,
Series,
)
# ---------------------------------------------------------------------
# types used in annotations
ArrayConvertible = Union[List, Tuple, AnyArrayLike]
Scalar = Union[float, str]
DatetimeScalar = Union[Scalar, datetime]
DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, ArrayConvertible]
DatetimeDictArg = Union[List[Scalar], Tuple[Scalar, ...], AnyArrayLike]
class YearMonthDayDict(TypedDict, total=True):
year: DatetimeDictArg
month: DatetimeDictArg
day: DatetimeDictArg
class FulldatetimeDict(YearMonthDayDict, total=False):
hour: DatetimeDictArg
hours: DatetimeDictArg
minute: DatetimeDictArg
minutes: DatetimeDictArg
second: DatetimeDictArg
seconds: DatetimeDictArg
ms: DatetimeDictArg
us: DatetimeDictArg
ns: DatetimeDictArg
DictConvertible = Union[FulldatetimeDict, "DataFrame"]
start_caching_at = 50
# ---------------------------------------------------------------------
def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None:
# Try to guess the format based on the first non-NaN element, return None if can't
if (first_non_null := tslib.first_non_null(arr)) != -1:
if type(first_non_nan_element := arr[first_non_null]) is str:
# GH#32264 np.str_ object
guessed_format = guess_datetime_format(
first_non_nan_element, dayfirst=dayfirst
)
if guessed_format is not None:
return guessed_format
# If there are multiple non-null elements, warn about
# how parsing might not be consistent
if tslib.first_non_null(arr[first_non_null + 1 :]) != -1:
warnings.warn(
"Could not infer format, so each element will be parsed "
"individually, falling back to `dateutil`. To ensure parsing is "
"consistent and as-expected, please specify a format.",
UserWarning,
stacklevel=find_stack_level(),
)
return None
def should_cache(
arg: ArrayConvertible, unique_share: float = 0.7, check_count: int | None = None
) -> bool:
"""
Decides whether to do caching.
If the percent of unique elements among `check_count` elements less
than `unique_share * 100` then we can do caching.
Parameters
----------
arg: listlike, tuple, 1-d array, Series
unique_share: float, default=0.7, optional
0 < unique_share < 1
check_count: int, optional
0 <= check_count <= len(arg)
Returns
-------
do_caching: bool
Notes
-----
By default for a sequence of less than 50 items in size, we don't do
caching; for the number of elements less than 5000, we take ten percent of
all elements to check for a uniqueness share; if the sequence size is more
than 5000, then we check only the first 500 elements.
All constants were chosen empirically by.
"""
do_caching = True
# default realization
if check_count is None:
# in this case, the gain from caching is negligible
if len(arg) <= start_caching_at:
return False
if len(arg) <= 5000:
check_count = len(arg) // 10
else:
check_count = 500
else:
assert (
0 <= check_count <= len(arg)
), "check_count must be in next bounds: [0; len(arg)]"
if check_count == 0:
return False
assert 0 < unique_share < 1, "unique_share must be in next bounds: (0; 1)"
try:
# We can't cache if the items are not hashable.
unique_elements = set(islice(arg, check_count))
except TypeError:
return False
if len(unique_elements) > check_count * unique_share:
do_caching = False
return do_caching
def _maybe_cache(
arg: ArrayConvertible,
format: str | None,
cache: bool,
convert_listlike: Callable,
) -> Series:
"""
Create a cache of unique dates from an array of dates
Parameters
----------
arg : listlike, tuple, 1-d array, Series
format : string
Strftime format to parse time
cache : bool
True attempts to create a cache of converted values
convert_listlike : function
Conversion function to apply on dates
Returns
-------
cache_array : Series
Cache of converted, unique dates. Can be empty
"""
from pandas import Series
cache_array = Series(dtype=object)
if cache:
# Perform a quicker unique check
if not should_cache(arg):
return cache_array
unique_dates = unique(arg)
if len(unique_dates) < len(arg):
cache_dates = convert_listlike(unique_dates, format)
# GH#45319
try:
cache_array = Series(cache_dates, index=unique_dates, copy=False)
except OutOfBoundsDatetime:
return cache_array
# GH#39882 and GH#35888 in case of None and NaT we get duplicates
if not cache_array.index.is_unique:
cache_array = cache_array[~cache_array.index.duplicated()]
return cache_array
def _box_as_indexlike(
dt_array: ArrayLike, utc: bool = False, name: Hashable = None
) -> Index:
"""
Properly boxes the ndarray of datetimes to DatetimeIndex
if it is possible or to generic Index instead
Parameters
----------
dt_array: 1-d array
Array of datetimes to be wrapped in an Index.
utc : bool
Whether to convert/localize timestamps to UTC.
name : string, default None
Name for a resulting index
Returns
-------
result : datetime of converted dates
- DatetimeIndex if convertible to sole datetime64 type
- general Index otherwise
"""
if is_datetime64_dtype(dt_array):
tz = "utc" if utc else None
return DatetimeIndex(dt_array, tz=tz, name=name)
return Index(dt_array, name=name, dtype=dt_array.dtype)
def _convert_and_box_cache(
arg: DatetimeScalarOrArrayConvertible,
cache_array: Series,
name: Hashable | None = None,
) -> Index:
"""
Convert array of dates with a cache and wrap the result in an Index.
Parameters
----------
arg : integer, float, string, datetime, list, tuple, 1-d array, Series
cache_array : Series
Cache of converted, unique dates
name : string, default None
Name for a DatetimeIndex
Returns
-------
result : Index-like of converted dates
"""
from pandas import Series
result = Series(arg, dtype=cache_array.index.dtype).map(cache_array)
return _box_as_indexlike(result._values, utc=False, name=name)
def _return_parsed_timezone_results(
result: np.ndarray, timezones, utc: bool, name
) -> Index:
"""
Return results from array_strptime if a %z or %Z directive was passed.
Parameters
----------
result : ndarray[int64]
int64 date representations of the dates
timezones : ndarray
pytz timezone objects
utc : bool
Whether to convert/localize timestamps to UTC.
name : string, default None
Name for a DatetimeIndex
Returns
-------
tz_result : Index-like of parsed dates with timezone
"""
tz_results = np.empty(len(result), dtype=object)
for zone in unique(timezones):
mask = timezones == zone
dta = DatetimeArray(result[mask]).tz_localize(zone)
if utc:
if dta.tzinfo is None:
dta = dta.tz_localize("utc")
else:
dta = dta.tz_convert("utc")
tz_results[mask] = dta
return Index(tz_results, name=name)
def _convert_listlike_datetimes(
arg,
format: str | None,
name: Hashable = None,
utc: bool = False,
unit: str | None = None,
errors: DateTimeErrorChoices = "raise",
dayfirst: bool | None = None,
yearfirst: bool | None = None,
exact: bool = True,
):
"""
Helper function for to_datetime. Performs the conversions of 1D listlike
of dates
Parameters
----------
arg : list, tuple, ndarray, Series, Index
date to be parsed
name : object
None or string for the Index name
utc : bool
Whether to convert/localize timestamps to UTC.
unit : str
None or string of the frequency of the passed data
errors : str
error handing behaviors from to_datetime, 'raise', 'coerce', 'ignore'
dayfirst : bool
dayfirst parsing behavior from to_datetime
yearfirst : bool
yearfirst parsing behavior from to_datetime
exact : bool, default True
exact format matching behavior from to_datetime
Returns
-------
Index-like of parsed dates
"""
if isinstance(arg, (list, tuple)):
arg = np.array(arg, dtype="O")
elif isinstance(arg, PandasArray):
arg = np.array(arg)
arg_dtype = getattr(arg, "dtype", None)
# these are shortcutable
tz = "utc" if utc else None
if is_datetime64tz_dtype(arg_dtype):
if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
return DatetimeIndex(arg, tz=tz, name=name)
if utc:
arg = arg.tz_convert(None).tz_localize("utc")
return arg
elif is_datetime64_dtype(arg_dtype):
arg_dtype = cast(np.dtype, arg_dtype)
if not is_supported_unit(get_unit_from_dtype(arg_dtype)):
# We go to closest supported reso, i.e. "s"
arg = astype_overflowsafe(
# TODO: looks like we incorrectly raise with errors=="ignore"
np.asarray(arg),
np.dtype("M8[s]"),
is_coerce=errors == "coerce",
)
if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
return DatetimeIndex(arg, tz=tz, name=name)
elif utc:
# DatetimeArray, DatetimeIndex
return arg.tz_localize("utc")
return arg
elif unit is not None:
if format is not None:
raise ValueError("cannot specify both format and unit")
return _to_datetime_with_unit(arg, unit, name, utc, errors)
elif getattr(arg, "ndim", 1) > 1:
raise TypeError(
"arg must be a string, datetime, list, tuple, 1-d array, or Series"
)
# warn if passing timedelta64, raise for PeriodDtype
# NB: this must come after unit transformation
try:
arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz))
except TypeError:
if errors == "coerce":
npvalues = np.array(["NaT"], dtype="datetime64[ns]").repeat(len(arg))
return DatetimeIndex(npvalues, name=name)
elif errors == "ignore":
idx = Index(arg, name=name)
return idx
raise
arg = ensure_object(arg)
if format is None:
format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)
# `format` could be inferred, or user didn't ask for mixed-format parsing.
if format is not None and format != "mixed":
return _array_strptime_with_fallback(arg, name, utc, format, exact, errors)
result, tz_parsed = objects_to_datetime64ns(
arg,
dayfirst=dayfirst,
yearfirst=yearfirst,
utc=utc,
errors=errors,
allow_object=True,
)
if tz_parsed is not None:
# We can take a shortcut since the datetime64 numpy array
# is in UTC
dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed))
return DatetimeIndex._simple_new(dta, name=name)
return _box_as_indexlike(result, utc=utc, name=name)
def _array_strptime_with_fallback(
arg,
name,
utc: bool,
fmt: str,
exact: bool,
errors: str,
) -> Index:
"""
Call array_strptime, with fallback behavior depending on 'errors'.
"""
result, timezones = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc)
if any(tz is not None for tz in timezones):
return _return_parsed_timezone_results(result, timezones, utc, name)
return _box_as_indexlike(result, utc=utc, name=name)
def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
"""
to_datetime specalized to the case where a 'unit' is passed.
"""
arg = extract_array(arg, extract_numpy=True)
# GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime
# because it expects an ndarray argument
if isinstance(arg, IntegerArray):
arr = arg.astype(f"datetime64[{unit}]")
tz_parsed = None
else:
arg = np.asarray(arg)
if arg.dtype.kind in ["i", "u"]:
# Note we can't do "f" here because that could induce unwanted
# rounding GH#14156, GH#20445
arr = arg.astype(f"datetime64[{unit}]", copy=False)
try:
arr = astype_overflowsafe(arr, np.dtype("M8[ns]"), copy=False)
except OutOfBoundsDatetime:
if errors == "raise":
raise
arg = arg.astype(object)
return _to_datetime_with_unit(arg, unit, name, utc, errors)
tz_parsed = None
elif arg.dtype.kind == "f":
mult, _ = precision_from_unit(unit)
mask = np.isnan(arg) | (arg == iNaT)
fvalues = (arg * mult).astype("f8", copy=False)
fvalues[mask] = 0
if (fvalues < Timestamp.min._value).any() or (
fvalues > Timestamp.max._value
).any():
if errors != "raise":
arg = arg.astype(object)
return _to_datetime_with_unit(arg, unit, name, utc, errors)
raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'")
arr = fvalues.astype("M8[ns]", copy=False)
arr[mask] = np.datetime64("NaT", "ns")
tz_parsed = None
else:
arg = arg.astype(object, copy=False)
arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
if errors == "ignore":
# Index constructor _may_ infer to DatetimeIndex
result = Index._with_infer(arr, name=name)
else:
result = DatetimeIndex(arr, name=name)
if not isinstance(result, DatetimeIndex):
return result
# GH#23758: We may still need to localize the result with tz
# GH#25546: Apply tz_parsed first (from arg), then tz (from caller)
# result will be naive but in UTC
result = result.tz_localize("UTC").tz_convert(tz_parsed)
if utc:
if result.tz is None:
result = result.tz_localize("utc")
else:
result = result.tz_convert("utc")
return result
def _adjust_to_origin(arg, origin, unit):
"""
Helper function for to_datetime.
Adjust input argument to the specified origin
Parameters
----------
arg : list, tuple, ndarray, Series, Index
date to be adjusted
origin : 'julian' or Timestamp
origin offset for the arg
unit : str
passed unit from to_datetime, must be 'D'
Returns
-------
ndarray or scalar of adjusted date(s)
"""
if origin == "julian":
original = arg
j0 = Timestamp(0).to_julian_date()
if unit != "D":
raise ValueError("unit must be 'D' for origin='julian'")
try:
arg = arg - j0
except TypeError as err:
raise ValueError(
"incompatible 'arg' type for given 'origin'='julian'"
) from err
# preemptively check this for a nice range
j_max = Timestamp.max.to_julian_date() - j0
j_min = Timestamp.min.to_julian_date() - j0
if np.any(arg > j_max) or np.any(arg < j_min):
raise OutOfBoundsDatetime(
f"{original} is Out of Bounds for origin='julian'"
)
else:
# arg must be numeric
if not (
(is_scalar(arg) and (is_integer(arg) or is_float(arg)))
or is_numeric_dtype(np.asarray(arg))
):
raise ValueError(
f"'{arg}' is not compatible with origin='{origin}'; "
"it must be numeric with a unit specified"
)
# we are going to offset back to unix / epoch time
try:
offset = Timestamp(origin, unit=unit)
except OutOfBoundsDatetime as err:
raise OutOfBoundsDatetime(f"origin {origin} is Out of Bounds") from err
except ValueError as err:
raise ValueError(
f"origin {origin} cannot be converted to a Timestamp"
) from err
if offset.tz is not None:
raise ValueError(f"origin offset {offset} must be tz-naive")
td_offset = offset - Timestamp(0)
# convert the offset to the unit of the arg
# this should be lossless in terms of precision
ioffset = td_offset // Timedelta(1, unit=unit)
# scalars & ndarray-like can handle the addition
if is_list_like(arg) and not isinstance(arg, (ABCSeries, Index, np.ndarray)):
arg = np.asarray(arg)
arg = arg + ioffset
return arg
@overload
def to_datetime(
arg: DatetimeScalar,
errors: DateTimeErrorChoices = ...,
dayfirst: bool = ...,
yearfirst: bool = ...,
utc: bool = ...,
format: str | None = ...,
exact: bool = ...,
unit: str | None = ...,
infer_datetime_format: bool = ...,
origin=...,
cache: bool = ...,
) -> Timestamp:
...
@overload
def to_datetime(
arg: Series | DictConvertible,
errors: DateTimeErrorChoices = ...,
dayfirst: bool = ...,
yearfirst: bool = ...,
utc: bool = ...,
format: str | None = ...,
exact: bool = ...,
unit: str | None = ...,
infer_datetime_format: bool = ...,
origin=...,
cache: bool = ...,
) -> Series:
...
@overload
def to_datetime(
arg: list | tuple | Index | ArrayLike,
errors: DateTimeErrorChoices = ...,
dayfirst: bool = ...,
yearfirst: bool = ...,
utc: bool = ...,
format: str | None = ...,
exact: bool = ...,
unit: str | None = ...,
infer_datetime_format: bool = ...,
origin=...,
cache: bool = ...,
) -> DatetimeIndex:
...
def to_datetime(
arg: DatetimeScalarOrArrayConvertible | DictConvertible,
errors: DateTimeErrorChoices = "raise",
dayfirst: bool = False,
yearfirst: bool = False,
utc: bool = False,
format: str | None = None,
exact: bool | lib.NoDefault = lib.no_default,
unit: str | None = None,
infer_datetime_format: lib.NoDefault | bool = lib.no_default,
origin: str = "unix",
cache: bool = True,
) -> DatetimeIndex | Series | DatetimeScalar | NaTType | None:
"""
Convert argument to datetime.
This function converts a scalar, array-like, :class:`Series` or
:class:`DataFrame`/dict-like to a pandas datetime object.
Parameters
----------
arg : int, float, str, datetime, list, tuple, 1-d array, Series, DataFrame/dict-like
The object to convert to a datetime. If a :class:`DataFrame` is provided, the
method expects minimally the following columns: :const:`"year"`,
:const:`"month"`, :const:`"day"`.
errors : {'ignore', 'raise', 'coerce'}, default 'raise'
- If :const:`'raise'`, then invalid parsing will raise an exception.
- If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`.
- If :const:`'ignore'`, then invalid parsing will return the input.
dayfirst : bool, default False
Specify a date parse order if `arg` is str or is list-like.
If :const:`True`, parses dates with the day first, e.g. :const:`"10/11/12"`
is parsed as :const:`2012-11-10`.
.. warning::
``dayfirst=True`` is not strict, but will prefer to parse
with day first.
yearfirst : bool, default False
Specify a date parse order if `arg` is str or is list-like.
- If :const:`True` parses dates with the year first, e.g.
:const:`"10/11/12"` is parsed as :const:`2010-11-12`.
- If both `dayfirst` and `yearfirst` are :const:`True`, `yearfirst` is
preceded (same as :mod:`dateutil`).
.. warning::
``yearfirst=True`` is not strict, but will prefer to parse
with year first.
utc : bool, default False
Control timezone-related parsing, localization and conversion.
- If :const:`True`, the function *always* returns a timezone-aware
UTC-localized :class:`Timestamp`, :class:`Series` or
:class:`DatetimeIndex`. To do this, timezone-naive inputs are
*localized* as UTC, while timezone-aware inputs are *converted* to UTC.
- If :const:`False` (default), inputs will not be coerced to UTC.
Timezone-naive inputs will remain naive, while timezone-aware ones
will keep their time offsets. Limitations exist for mixed
offsets (typically, daylight savings), see :ref:`Examples
<to_datetime_tz_examples>` section for details.
See also: pandas general documentation about `timezone conversion and
localization
<https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
#time-zone-handling>`_.
format : str, default None
The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See
`strftime documentation
<https://docs.python.org/3/library/datetime.html
#strftime-and-strptime-behavior>`_ for more information on choices, though
note that :const:`"%f"` will parse all the way up to nanoseconds.
You can also pass:
- "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_
time string (not necessarily in exactly the same format);
- "mixed", to infer the format for each element individually. This is risky,
and you should probably use it along with `dayfirst`.
exact : bool, default True
Control how `format` is used:
- If :const:`True`, require an exact `format` match.
- If :const:`False`, allow the `format` to match anywhere in the target
string.
Cannot be used alongside ``format='ISO8601'`` or ``format='mixed'``.
unit : str, default 'ns'
The unit of the arg (D,s,ms,us,ns) denote the unit, which is an
integer or float number. This will be based off the origin.
Example, with ``unit='ms'`` and ``origin='unix'``, this would calculate
the number of milliseconds to the unix epoch start.
infer_datetime_format : bool, default False
If :const:`True` and no `format` is given, attempt to infer the format
of the datetime strings based on the first non-NaN element,
and if it can be inferred, switch to a faster method of parsing them.
In some cases this can increase the parsing speed by ~5-10x.
.. deprecated:: 2.0.0
A strict version of this argument is now the default, passing it has
no effect.
origin : scalar, default 'unix'
Define the reference date. The numeric values would be parsed as number
of units (defined by `unit`) since this reference date.
- If :const:`'unix'` (or POSIX) time; origin is set to 1970-01-01.
- If :const:`'julian'`, unit must be :const:`'D'`, and origin is set to
beginning of Julian Calendar. Julian day number :const:`0` is assigned
to the day starting at noon on January 1, 4713 BC.
- If Timestamp convertible (Timestamp, dt.datetime, np.datetimt64 or date
string), origin is set to Timestamp identified by origin.
- If a float or integer, origin is the millisecond difference
relative to 1970-01-01.
cache : bool, default True
If :const:`True`, use a cache of unique, converted dates to apply the
datetime conversion. May produce significant speed-up when parsing
duplicate date strings, especially ones with timezone offsets. The cache
is only used when there are at least 50 values. The presence of
out-of-bounds values will render the cache unusable and may slow down
parsing.
Returns
-------
datetime
If parsing succeeded.
Return type depends on input (types in parenthesis correspond to
fallback in case of unsuccessful timezone or out-of-range timestamp
parsing):
- scalar: :class:`Timestamp` (or :class:`datetime.datetime`)
- array-like: :class:`DatetimeIndex` (or :class:`Series` with
:class:`object` dtype containing :class:`datetime.datetime`)
- Series: :class:`Series` of :class:`datetime64` dtype (or
:class:`Series` of :class:`object` dtype containing
:class:`datetime.datetime`)
- DataFrame: :class:`Series` of :class:`datetime64` dtype (or
:class:`Series` of :class:`object` dtype containing
:class:`datetime.datetime`)
Raises
------
ParserError
When parsing a date from string fails.
ValueError
When another datetime conversion error happens. For example when one
of 'year', 'month', day' columns is missing in a :class:`DataFrame`, or
when a Timezone-aware :class:`datetime.datetime` is found in an array-like
of mixed time offsets, and ``utc=False``.
See Also
--------
DataFrame.astype : Cast argument to a specified dtype.
to_timedelta : Convert argument to timedelta.
convert_dtypes : Convert dtypes.
Notes
-----
Many input types are supported, and lead to different output types:
- **scalars** can be int, float, str, datetime object (from stdlib :mod:`datetime`
module or :mod:`numpy`). They are converted to :class:`Timestamp` when
possible, otherwise they are converted to :class:`datetime.datetime`.
None/NaN/null scalars are converted to :const:`NaT`.
- **array-like** can contain int, float, str, datetime objects. They are
converted to :class:`DatetimeIndex` when possible, otherwise they are
converted to :class:`Index` with :class:`object` dtype, containing
:class:`datetime.datetime`. None/NaN/null entries are converted to
:const:`NaT` in both cases.
- **Series** are converted to :class:`Series` with :class:`datetime64`
dtype when possible, otherwise they are converted to :class:`Series` with
:class:`object` dtype, containing :class:`datetime.datetime`. None/NaN/null
entries are converted to :const:`NaT` in both cases.
- **DataFrame/dict-like** are converted to :class:`Series` with
:class:`datetime64` dtype. For each row a datetime is created from assembling
the various dataframe columns. Column keys can be common abbreviations
like [year, month, day, minute, second, ms, us, ns]) or
plurals of the same.
The following causes are responsible for :class:`datetime.datetime` objects
being returned (possibly inside an :class:`Index` or a :class:`Series` with
:class:`object` dtype) instead of a proper pandas designated type
(:class:`Timestamp`, :class:`DatetimeIndex` or :class:`Series`
with :class:`datetime64` dtype):
- when any input element is before :const:`Timestamp.min` or after
:const:`Timestamp.max`, see `timestamp limitations
<https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
#timeseries-timestamp-limits>`_.
- when ``utc=False`` (default) and the input is an array-like or
:class:`Series` containing mixed naive/aware datetime, or aware with mixed
time offsets. Note that this happens in the (quite frequent) situation when
the timezone has a daylight savings policy. In that case you may wish to
use ``utc=True``.
Examples
--------
**Handling various input formats**
Assembling a datetime from multiple columns of a :class:`DataFrame`. The keys
can be common abbreviations like ['year', 'month', 'day', 'minute', 'second',
'ms', 'us', 'ns']) or plurals of the same
>>> df = pd.DataFrame({'year': [2015, 2016],
... 'month': [2, 3],
... 'day': [4, 5]})
>>> pd.to_datetime(df)
0 2015-02-04
1 2016-03-05
dtype: datetime64[ns]
Using a unix epoch time
>>> pd.to_datetime(1490195805, unit='s')
Timestamp('2017-03-22 15:16:45')
>>> pd.to_datetime(1490195805433502912, unit='ns')
Timestamp('2017-03-22 15:16:45.433502912')
.. warning:: For float arg, precision rounding might happen. To prevent
unexpected behavior use a fixed-width exact type.
Using a non-unix epoch origin
>>> pd.to_datetime([1, 2, 3], unit='D',
... origin=pd.Timestamp('1960-01-01'))
DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'],
dtype='datetime64[ns]', freq=None)
**Differences with strptime behavior**
:const:`"%f"` will parse all the way up to nanoseconds.
>>> pd.to_datetime('2018-10-26 12:00:00.0000000011',
... format='%Y-%m-%d %H:%M:%S.%f')
Timestamp('2018-10-26 12:00:00.000000001')
**Non-convertible date/times**
If a date does not meet the `timestamp limitations
<https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
#timeseries-timestamp-limits>`_, passing ``errors='ignore'``
will return the original input instead of raising any exception.
Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`,
in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`.
>>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore')
'13000101'
>>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce')
NaT
.. _to_datetime_tz_examples:
**Timezones and time offsets**
The default behaviour (``utc=False``) is as follows:
- Timezone-naive inputs are converted to timezone-naive :class:`DatetimeIndex`:
>>> pd.to_datetime(['2018-10-26 12:00:00', '2018-10-26 13:00:15'])
DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'],
dtype='datetime64[ns]', freq=None)
- Timezone-aware inputs *with constant time offset* are converted to
timezone-aware :class:`DatetimeIndex`:
>>> pd.to_datetime(['2018-10-26 12:00 -0500', '2018-10-26 13:00 -0500'])
DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'],
dtype='datetime64[ns, UTC-05:00]', freq=None)
- However, timezone-aware inputs *with mixed time offsets* (for example
issued from a timezone with daylight savings, such as Europe/Paris)
are **not successfully converted** to a :class:`DatetimeIndex`. Instead a
simple :class:`Index` containing :class:`datetime.datetime` objects is
returned:
>>> pd.to_datetime(['2020-10-25 02:00 +0200', '2020-10-25 04:00 +0100'])
Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00],
dtype='object')
- A mix of timezone-aware and timezone-naive inputs is also converted to
a simple :class:`Index` containing :class:`datetime.datetime` objects:
>>> from datetime import datetime
>>> pd.to_datetime(["2020-01-01 01:00:00-01:00", datetime(2020, 1, 1, 3, 0)])
Index([2020-01-01 01:00:00-01:00, 2020-01-01 03:00:00], dtype='object')
|
Setting ``utc=True`` solves most of the above issues:
- Timezone-naive inputs are *localized* as UTC
>>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True)
DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 13:00:00+00:00'],
dtype='datetime64[ns, UTC]', freq=None)
- Timezone-aware inputs are *converted* to UTC (the output represents the
exact same datetime, but viewed from the UTC time offset `+00:00`).
>>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'],
... utc=True)
DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'],
dtype='datetime64[ns, UTC]', freq=None)
- Inputs can contain both string or datetime, the above
rules still apply
>>> pd.to_datetime(['2018-10-26 12:00', datetime(2020, 1, 1, 18)], utc=True)
DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'],
dtype='datetime64[ns, UTC]', freq=None)
"""
if exact is not lib.no_default and format in {"mixed", "ISO8601"}:
raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'")
if infer_datetime_format is not lib.no_default:
warnings.warn(
"The argument 'infer_datetime_format' is deprecated and will "
"be removed in a future version. "
"A strict version of it is now the default, see "
"https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
"You can safely remove this argument.",
stacklevel=find_stack_level(),
)
if arg is None:
return None
if origin != "unix":
arg = _adjust_to_origin(arg, origin, unit)
convert_listlike = partial(
_convert_listlike_datetimes,
utc=utc,
unit=unit,
dayfirst=dayfirst,
yearfirst=yearfirst,
errors=errors,
exact=exact,
)
# pylint: disable-next=used-before-assignment
result: Timestamp | NaTType | Series | Index
if isinstance(arg, Timestamp):
result = arg
if utc:
if arg.tz is not None:
result = arg.tz_convert("utc")
else:
result = arg.tz_localize("utc")
elif isinstance(arg, ABCSeries):
cache_array = _maybe_cache(arg, format, cache, convert_listlike)
if not cache_array.empty:
result = arg.map(cache_array)
else:
values = convert_listlike(arg._values, format)
result = arg._constructor(values, index=arg.index, name=arg.name)
elif isinstance(arg, (ABCDataFrame, abc.MutableMapping)):
result = _assemble_from_unit_mappings(arg, errors, utc)
elif isinstance(arg, Index):
cache_array = _maybe_cache(arg, format, cache, convert_listlike)
if not cache_array.empty:
result = _convert_and_box_cache(arg, cache_array, name=arg.name)
else:
result = convert_listlike(arg, format, name=arg.name)
elif is_list_like(arg):
try:
# error: Argument 1 to "_maybe_cache" has incompatible type
# "Union[float, str, datetime, List[Any], Tuple[Any, ...], ExtensionArray,
# ndarray[Any, Any], Series]"; expected "Union[List[Any], Tuple[Any, ...],
# Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series], Series]"
argc = cast(
Union[list, tuple, ExtensionArray, np.ndarray, "Series", Index], arg
)
cache_array = _maybe_cache(argc, format, cache, convert_listlike)
except OutOfBoundsDatetime:
# caching attempts to create a DatetimeIndex, which may raise
# an OOB. If that's the desired behavior, then just reraise...
if errors == "raise":
raise
# ... otherwise, continue without the cache.
from pandas import Series
cache_array = Series([], dtype=object) # just an empty array
if not cache_array.empty:
result = _convert_and_box_cache(argc, cache_array)
else:
result = convert_listlike(argc, format)
else:
result = convert_listlike(np.array([arg]), format)[0]
if isinstance(arg, bool) and isinstance(result, np.bool_):
result = bool(result) # TODO: avoid this kludge.
# error: Incompatible return value type (got "Union[Timestamp, NaTType,
# Series, Index]", expected "Union[DatetimeIndex, Series, float, str,
# NaTType, None]")
return result # type: ignore[return-value]
# mappings for assembling units
_unit_map = {
"year": "year",
"years": "year",
"month": "month",
"months": "month",
"day": "day",
"days": "day",
"hour": "h",
"hours": "h",
"minute": "m",
"minutes": "m",
"second": "s",
"seconds": "s",
"ms": "ms",
"millisecond": "ms",
"milliseconds": "ms",
"us": "us",
"microsecond": "us",
"microseconds": "us",
"ns": "ns",
"nanosecond": "ns",
"nanoseconds": "ns",
}
def _assemble_from_unit_mappings(arg, errors: DateTimeErrorChoices, utc: bool):
"""
assemble the unit specified fields from the arg (DataFrame)
Return a Series for actual parsing
Parameters
----------
arg : DataFrame
errors : {'ignore', 'raise', 'coerce'}, default 'raise'
- If :const:`'raise'`, then invalid parsing will raise an exception
- If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`
- If :const:`'ignore'`, then invalid parsing will return the input
utc : bool
Whether to convert/localize timestamps to UTC.
Returns
-------
Series
"""
from pandas import (
DataFrame,
to_numeric,
to_timedelta,
)
arg = DataFrame(arg)
if not arg.columns.is_unique:
raise ValueError("cannot assemble with duplicate keys")
# replace passed unit with _unit_map
def f(value):
if value in _unit_map:
return _unit_map[value]
# m is case significant
if value.lower() in _unit_map:
return _unit_map[value.lower()]
return value
unit = {k: f(k) for k in arg.keys()}
unit_rev = {v: k for k, v in unit.items()}
# we require at least Ymd
required = ["year", "month", "day"]
req = sorted(set(required) - set(unit_rev.keys()))
if len(req):
_required = ",".join(req)
raise ValueError(
"to assemble mappings requires at least that "
f"[year, month, day] be specified: [{_required}] is missing"
)
# keys we don't recognize
excess = sorted(set(unit_rev.keys()) - set(_unit_map.values()))
if len(excess):
_excess = ",".join(excess)
raise ValueError(
f"extra keys have been passed to the datetime assemblage: [{_excess}]"
)
def coerce(values):
# we allow coercion to if errors allows
values = to_numeric(values, errors=errors)
# prevent overflow in case of int8 or int16
if is_integer_dtype(values):
values = values.astype("int64", copy=False)
return values
values = (
coerce(arg[unit_rev["year"]]) * 10000
+ coerce(arg[unit_rev["month"]]) * 100
+ coerce(arg[unit_rev["day"]])
)
try:
values = to_datetime(values, format="%Y%m%d", errors=errors, utc=utc)
except (TypeError, ValueError) as err:
raise ValueError(f"cannot assemble the datetimes: {err}") from err
units: list[UnitChoices] = ["h", "m", "s", "ms", "us", "ns"]
for u in units:
value = unit_rev.get(u)
if value is not None and value in arg:
try:
values += to_timedelta(coerce(arg[value]), unit=u, errors=errors)
except (TypeError, ValueError) as err:
raise ValueError(
f"cannot assemble the datetimes [{value}]: {err}"
) from err
return values
def _attempt_YYYYMMDD(arg: npt.NDArray[np.object_], errors: str) -> np.ndarray | None:
"""
try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like,
arg is a passed in as an object dtype, but could really be ints/strings
with nan-like/or floats (e.g. with nan)
Parameters
----------
arg : np.ndarray[object]
errors : {'raise','ignore','coerce'}
"""
def calc(carg):
# calculate the actual result
carg = carg.astype(object, copy=False)
parsed = parsing.try_parse_year_month_day(
carg / 10000, carg / 100 % 100, carg % 100
)
return tslib.array_to_datetime(parsed, errors=errors)[0]
def calc_with_mask(carg, mask):
result = np.empty(carg.shape, dtype="M8[ns]")
iresult = result.view("i8")
iresult[~mask] = iNaT
masked_result = calc(carg[mask].astype(np.float64).astype(np.int64))
result[mask] = masked_result.astype("M8[ns]")
return result
# try intlike / strings that are ints
try:
return calc(arg.astype(np.int64))
except (ValueError, OverflowError, TypeError):
pass
# a float with actual np.nan
try:
carg = arg.astype(np.float64)
return calc_with_mask(carg, notna(carg))
except (ValueError, OverflowError, TypeError):
pass
# string with NaN-like
try:
# error: Argument 2 to "isin" has incompatible type "List[Any]"; expected
# "Union[Union[ExtensionArray, ndarray], Index, Series]"
mask = ~algorithms.isin(arg, list(nat_strings)) # type: ignore[arg-type]
return calc_with_mask(arg, mask)
except (ValueError, OverflowError, TypeError):
pass
return None
__all__ = [
"DateParseError",
"should_cache",
"to_datetime",
]