projektAI/venv/Lib/site-packages/pandas/core/indexes/period.py

768 lines
25 KiB
Python
Raw Normal View History

2021-06-06 22:13:05 +02:00
from datetime import datetime, timedelta
from typing import Any
import warnings
import numpy as np
from pandas._libs import index as libindex, lib
from pandas._libs.tslibs import BaseOffset, Period, Resolution, Tick
from pandas._libs.tslibs.parsing import DateParseError, parse_time_string
from pandas._typing import DtypeObj
from pandas.errors import InvalidIndexError
from pandas.util._decorators import Appender, cache_readonly, doc
from pandas.core.dtypes.common import (
is_bool_dtype,
is_datetime64_any_dtype,
is_dtype_equal,
is_float,
is_integer,
is_object_dtype,
is_scalar,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import PeriodDtype
from pandas.core.arrays.period import (
PeriodArray,
period_array,
raise_on_incompatible,
validate_dtype_freq,
)
import pandas.core.common as com
import pandas.core.indexes.base as ibase
from pandas.core.indexes.base import (
_index_shared_docs,
ensure_index,
maybe_extract_name,
)
from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
from pandas.core.indexes.datetimes import DatetimeIndex, Index
from pandas.core.indexes.extension import inherit_names
from pandas.core.indexes.numeric import Int64Index
from pandas.core.ops import get_op_result_name
_index_doc_kwargs = dict(ibase._index_doc_kwargs)
_index_doc_kwargs.update({"target_klass": "PeriodIndex or list of Periods"})
# --- Period index sketch
def _new_PeriodIndex(cls, **d):
# GH13277 for unpickling
values = d.pop("data")
if values.dtype == "int64":
freq = d.pop("freq", None)
values = PeriodArray(values, freq=freq)
return cls._simple_new(values, **d)
else:
return cls(values, **d)
@inherit_names(
["strftime", "start_time", "end_time"] + PeriodArray._field_ops,
PeriodArray,
wrap=True,
)
@inherit_names(["is_leap_year", "_format_native_types"], PeriodArray)
class PeriodIndex(DatetimeIndexOpsMixin):
"""
Immutable ndarray holding ordinal values indicating regular periods in time.
Index keys are boxed to Period objects which carries the metadata (eg,
frequency information).
Parameters
----------
data : array-like (1d int np.ndarray or PeriodArray), optional
Optional period-like data to construct index with.
copy : bool
Make a copy of input ndarray.
freq : str or period object, optional
One of pandas period strings or corresponding objects.
year : int, array, or Series, default None
month : int, array, or Series, default None
quarter : int, array, or Series, default None
day : int, array, or Series, default None
hour : int, array, or Series, default None
minute : int, array, or Series, default None
second : int, array, or Series, default None
tz : object, default None
Timezone for converting datetime64 data to Periods.
dtype : str or PeriodDtype, default None
Attributes
----------
day
dayofweek
day_of_week
dayofyear
day_of_year
days_in_month
daysinmonth
end_time
freq
freqstr
hour
is_leap_year
minute
month
quarter
qyear
second
start_time
week
weekday
weekofyear
year
Methods
-------
asfreq
strftime
to_timestamp
See Also
--------
Index : The base pandas Index type.
Period : Represents a period of time.
DatetimeIndex : Index with datetime64 data.
TimedeltaIndex : Index of timedelta64 data.
period_range : Create a fixed-frequency PeriodIndex.
Examples
--------
>>> idx = pd.PeriodIndex(year=[2000, 2002], quarter=[1, 3])
>>> idx
PeriodIndex(['2000Q1', '2002Q3'], dtype='period[Q-DEC]', freq='Q-DEC')
"""
_typ = "periodindex"
_attributes = ["name", "freq"]
# define my properties & methods for delegation
_is_numeric_dtype = False
_data: PeriodArray
freq: BaseOffset
_data_cls = PeriodArray
_engine_type = libindex.PeriodEngine
_supports_partial_string_indexing = True
# --------------------------------------------------------------------
# methods that dispatch to array and wrap result in Index
# These are defined here instead of via inherit_names for mypy
@doc(PeriodArray.asfreq)
def asfreq(self, freq=None, how: str = "E") -> "PeriodIndex":
arr = self._data.asfreq(freq, how)
return type(self)._simple_new(arr, name=self.name)
@doc(PeriodArray.to_timestamp)
def to_timestamp(self, freq=None, how="start") -> DatetimeIndex:
arr = self._data.to_timestamp(freq, how)
return DatetimeIndex._simple_new(arr, name=self.name)
# error: Decorated property not supported [misc]
@property # type:ignore[misc]
@doc(PeriodArray.hour.fget)
def hour(self) -> Int64Index:
return Int64Index(self._data.hour, name=self.name)
# error: Decorated property not supported [misc]
@property # type:ignore[misc]
@doc(PeriodArray.minute.fget)
def minute(self) -> Int64Index:
return Int64Index(self._data.minute, name=self.name)
# error: Decorated property not supported [misc]
@property # type:ignore[misc]
@doc(PeriodArray.second.fget)
def second(self) -> Int64Index:
return Int64Index(self._data.second, name=self.name)
# ------------------------------------------------------------------------
# Index Constructors
def __new__(
cls,
data=None,
ordinal=None,
freq=None,
tz=None,
dtype=None,
copy=False,
name=None,
**fields,
):
valid_field_set = {
"year",
"month",
"day",
"quarter",
"hour",
"minute",
"second",
}
if not set(fields).issubset(valid_field_set):
argument = list(set(fields) - valid_field_set)[0]
raise TypeError(f"__new__() got an unexpected keyword argument {argument}")
name = maybe_extract_name(name, data, cls)
if data is None and ordinal is None:
# range-based.
data, freq2 = PeriodArray._generate_range(None, None, None, freq, fields)
# PeriodArray._generate range does validation that fields is
# empty when really using the range-based constructor.
freq = freq2
data = PeriodArray(data, freq=freq)
else:
freq = validate_dtype_freq(dtype, freq)
# PeriodIndex allow PeriodIndex(period_index, freq=different)
# Let's not encourage that kind of behavior in PeriodArray.
if freq and isinstance(data, cls) and data.freq != freq:
# TODO: We can do some of these with no-copy / coercion?
# e.g. D -> 2D seems to be OK
data = data.asfreq(freq)
if data is None and ordinal is not None:
# we strangely ignore `ordinal` if data is passed.
ordinal = np.asarray(ordinal, dtype=np.int64)
data = PeriodArray(ordinal, freq=freq)
else:
# don't pass copy here, since we copy later.
data = period_array(data=data, freq=freq)
if copy:
data = data.copy()
return cls._simple_new(data, name=name)
# ------------------------------------------------------------------------
# Data
@property
def values(self) -> np.ndarray:
return np.asarray(self, dtype=object)
def _maybe_convert_timedelta(self, other):
"""
Convert timedelta-like input to an integer multiple of self.freq
Parameters
----------
other : timedelta, np.timedelta64, DateOffset, int, np.ndarray
Returns
-------
converted : int, np.ndarray[int64]
Raises
------
IncompatibleFrequency : if the input cannot be written as a multiple
of self.freq. Note IncompatibleFrequency subclasses ValueError.
"""
if isinstance(other, (timedelta, np.timedelta64, Tick, np.ndarray)):
if isinstance(self.freq, Tick):
# _check_timedeltalike_freq_compat will raise if incompatible
delta = self._data._check_timedeltalike_freq_compat(other)
return delta
elif isinstance(other, BaseOffset):
if other.base == self.freq.base:
return other.n
raise raise_on_incompatible(self, other)
elif is_integer(other):
# integer is passed to .shift via
# _add_datetimelike_methods basically
# but ufunc may pass integer to _add_delta
return other
# raise when input doesn't have freq
raise raise_on_incompatible(self, None)
def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
"""
Can we compare values of the given dtype to our own?
"""
if not isinstance(dtype, PeriodDtype):
return False
return dtype.freq == self.freq
# ------------------------------------------------------------------------
# Rendering Methods
def _mpl_repr(self):
# how to represent ourselves to matplotlib
return self.astype(object)._values
# ------------------------------------------------------------------------
# Indexing
@doc(Index.__contains__)
def __contains__(self, key: Any) -> bool:
if isinstance(key, Period):
if key.freq != self.freq:
return False
else:
return key.ordinal in self._engine
else:
hash(key)
try:
self.get_loc(key)
return True
except KeyError:
return False
@cache_readonly
def _int64index(self) -> Int64Index:
return Int64Index._simple_new(self.asi8, name=self.name)
# ------------------------------------------------------------------------
# Index Methods
def __array_wrap__(self, result, context=None):
"""
Gets called after a ufunc and other functions.
Needs additional handling as PeriodIndex stores internal data as int
dtype
Replace this to __numpy_ufunc__ in future version and implement
__array_function__ for Indexes
"""
if isinstance(context, tuple) and len(context) > 0:
func = context[0]
if func is np.add:
pass
elif func is np.subtract:
name = self.name
left = context[1][0]
right = context[1][1]
if isinstance(left, PeriodIndex) and isinstance(right, PeriodIndex):
name = left.name if left.name == right.name else None
return Index(result, name=name)
elif isinstance(left, Period) or isinstance(right, Period):
return Index(result, name=name)
elif isinstance(func, np.ufunc):
if "M->M" not in func.types:
msg = f"ufunc '{func.__name__}' not supported for the PeriodIndex"
# This should be TypeError, but TypeError cannot be raised
# from here because numpy catches.
raise ValueError(msg)
if is_bool_dtype(result):
return result
# the result is object dtype array of Period
# cannot pass _simple_new as it is
return type(self)(result, freq=self.freq, name=self.name)
def asof_locs(self, where: Index, mask: np.ndarray) -> np.ndarray:
"""
where : array of timestamps
mask : array of booleans where data is not NA
"""
if isinstance(where, DatetimeIndex):
where = PeriodIndex(where._values, freq=self.freq)
elif not isinstance(where, PeriodIndex):
raise TypeError("asof_locs `where` must be DatetimeIndex or PeriodIndex")
return super().asof_locs(where, mask)
@doc(Index.astype)
def astype(self, dtype, copy: bool = True, how=lib.no_default):
dtype = pandas_dtype(dtype)
if how is not lib.no_default:
# GH#37982
warnings.warn(
"The 'how' keyword in PeriodIndex.astype is deprecated and "
"will be removed in a future version. "
"Use index.to_timestamp(how=how) instead",
FutureWarning,
stacklevel=2,
)
else:
how = "start"
if is_datetime64_any_dtype(dtype):
# 'how' is index-specific, isn't part of the EA interface.
tz = getattr(dtype, "tz", None)
return self.to_timestamp(how=how).tz_localize(tz)
return super().astype(dtype, copy=copy)
@property
def is_full(self) -> bool:
"""
Returns True if this PeriodIndex is range-like in that all Periods
between start and end are present, in order.
"""
if len(self) == 0:
return True
if not self.is_monotonic_increasing:
raise ValueError("Index is not monotonic")
values = self.asi8
return ((values[1:] - values[:-1]) < 2).all()
@property
def inferred_type(self) -> str:
# b/c data is represented as ints make sure we can't have ambiguous
# indexing
return "period"
def insert(self, loc: int, item):
if not isinstance(item, Period) or self.freq != item.freq:
return self.astype(object).insert(loc, item)
return DatetimeIndexOpsMixin.insert(self, loc, item)
def join(self, other, how="left", level=None, return_indexers=False, sort=False):
"""
See Index.join
"""
self._assert_can_do_setop(other)
if not isinstance(other, PeriodIndex):
return self.astype(object).join(
other, how=how, level=level, return_indexers=return_indexers, sort=sort
)
# _assert_can_do_setop ensures we have matching dtype
result = super().join(
other,
how=how,
level=level,
return_indexers=return_indexers,
sort=sort,
)
return result
# ------------------------------------------------------------------------
# Indexing Methods
@Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
def get_indexer(self, target, method=None, limit=None, tolerance=None):
target = ensure_index(target)
if not self._should_compare(target):
return self._get_indexer_non_comparable(target, method, unique=True)
if isinstance(target, PeriodIndex):
target = target._get_engine_target() # i.e. target.asi8
self_index = self._int64index
else:
self_index = self
if tolerance is not None:
tolerance = self._convert_tolerance(tolerance, target)
if self_index is not self:
# convert tolerance to i8
tolerance = self._maybe_convert_timedelta(tolerance)
return Index.get_indexer(self_index, target, method, limit, tolerance)
def get_loc(self, key, method=None, tolerance=None):
"""
Get integer location for requested label.
Parameters
----------
key : Period, NaT, str, or datetime
String or datetime key must be parsable as Period.
Returns
-------
loc : int or ndarray[int64]
Raises
------
KeyError
Key is not present in the index.
TypeError
If key is listlike or otherwise not hashable.
"""
orig_key = key
if not is_scalar(key):
raise InvalidIndexError(key)
if isinstance(key, str):
try:
loc = self._get_string_slice(key)
return loc
except (TypeError, ValueError):
pass
try:
asdt, reso = parse_time_string(key, self.freq)
except (ValueError, DateParseError) as err:
# A string with invalid format
raise KeyError(f"Cannot interpret '{key}' as period") from err
reso = Resolution.from_attrname(reso)
grp = reso.freq_group
freqn = self.dtype.freq_group
# _get_string_slice will handle cases where grp < freqn
assert grp >= freqn
# BusinessDay is a bit strange. It has a *lower* code, but we never parse
# a string as "BusinessDay" resolution, just Day.
if grp == freqn or (
reso == Resolution.RESO_DAY and self.dtype.freq.name == "B"
):
key = Period(asdt, freq=self.freq)
loc = self.get_loc(key, method=method, tolerance=tolerance)
return loc
elif method is None:
raise KeyError(key)
else:
key = asdt
elif is_integer(key):
# Period constructor will cast to string, which we dont want
raise KeyError(key)
try:
key = Period(key, freq=self.freq)
except ValueError as err:
# we cannot construct the Period
raise KeyError(orig_key) from err
try:
return Index.get_loc(self, key, method, tolerance)
except KeyError as err:
raise KeyError(orig_key) from err
def _maybe_cast_slice_bound(self, label, side: str, kind: str):
"""
If label is a string or a datetime, cast it to Period.ordinal according
to resolution.
Parameters
----------
label : object
side : {'left', 'right'}
kind : {'loc', 'getitem'}
Returns
-------
bound : Period or object
Notes
-----
Value of `side` parameter should be validated in caller.
"""
assert kind in ["loc", "getitem"]
if isinstance(label, datetime):
return Period(label, freq=self.freq)
elif isinstance(label, str):
try:
parsed, reso = parse_time_string(label, self.freq)
reso = Resolution.from_attrname(reso)
bounds = self._parsed_string_to_bounds(reso, parsed)
return bounds[0 if side == "left" else 1]
except ValueError as err:
# string cannot be parsed as datetime-like
raise self._invalid_indexer("slice", label) from err
elif is_integer(label) or is_float(label):
raise self._invalid_indexer("slice", label)
return label
def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime):
grp = reso.freq_group
iv = Period(parsed, freq=grp)
return (iv.asfreq(self.freq, how="start"), iv.asfreq(self.freq, how="end"))
def _validate_partial_date_slice(self, reso: Resolution):
assert isinstance(reso, Resolution), (type(reso), reso)
grp = reso.freq_group
freqn = self.dtype.freq_group
if not grp < freqn:
# TODO: we used to also check for
# reso in ["day", "hour", "minute", "second"]
# why is that check not needed?
raise ValueError
def _get_string_slice(self, key: str):
parsed, reso = parse_time_string(key, self.freq)
reso = Resolution.from_attrname(reso)
try:
return self._partial_date_slice(reso, parsed)
except KeyError as err:
raise KeyError(key) from err
# ------------------------------------------------------------------------
# Set Operation Methods
def _assert_can_do_setop(self, other):
super()._assert_can_do_setop(other)
# *Can't* use PeriodIndexes of different freqs
# *Can* use PeriodIndex/DatetimeIndex
if isinstance(other, PeriodIndex) and self.freq != other.freq:
raise raise_on_incompatible(self, other)
def _setop(self, other, sort, opname: str):
"""
Perform a set operation by dispatching to the Int64Index implementation.
"""
self._validate_sort_keyword(sort)
self._assert_can_do_setop(other)
res_name = get_op_result_name(self, other)
other = ensure_index(other)
i8self = Int64Index._simple_new(self.asi8)
i8other = Int64Index._simple_new(other.asi8)
i8result = getattr(i8self, opname)(i8other, sort=sort)
parr = type(self._data)(np.asarray(i8result, dtype=np.int64), dtype=self.dtype)
result = type(self)._simple_new(parr, name=res_name)
return result
def intersection(self, other, sort=False):
self._validate_sort_keyword(sort)
self._assert_can_do_setop(other)
other, _ = self._convert_can_do_setop(other)
if self.equals(other):
if self.has_duplicates:
return self.unique()._get_reconciled_name_object(other)
return self._get_reconciled_name_object(other)
return self._intersection(other, sort=sort)
def _intersection(self, other, sort=False):
if is_object_dtype(other.dtype):
return self.astype("O").intersection(other, sort=sort)
elif not self._is_comparable_dtype(other.dtype):
# We can infer that the intersection is empty.
# assert_can_do_setop ensures that this is not just a mismatched freq
this = self[:0].astype("O")
other = other[:0].astype("O")
return this.intersection(other, sort=sort)
return self._setop(other, sort, opname="intersection")
def difference(self, other, sort=None):
self._validate_sort_keyword(sort)
self._assert_can_do_setop(other)
other, result_name = self._convert_can_do_setop(other)
if self.equals(other):
return self[:0].rename(result_name)
return self._difference(other, sort=sort)
def _difference(self, other, sort):
if is_object_dtype(other):
return self.astype(object).difference(other).astype(self.dtype)
elif not is_dtype_equal(self.dtype, other.dtype):
return self
return self._setop(other, sort, opname="difference")
def _union(self, other, sort):
if not len(other) or self.equals(other) or not len(self):
return super()._union(other, sort=sort)
# We are called by `union`, which is responsible for this validation
assert isinstance(other, type(self))
if not is_dtype_equal(self.dtype, other.dtype):
this = self.astype("O")
other = other.astype("O")
return this._union(other, sort=sort)
return self._setop(other, sort, opname="_union")
# ------------------------------------------------------------------------
def memory_usage(self, deep: bool = False) -> int:
result = super().memory_usage(deep=deep)
if hasattr(self, "_cache") and "_int64index" in self._cache:
result += self._int64index.memory_usage(deep=deep)
return result
def period_range(
start=None, end=None, periods=None, freq=None, name=None
) -> PeriodIndex:
"""
Return a fixed frequency PeriodIndex.
The day (calendar) is the default frequency.
Parameters
----------
start : str or period-like, default None
Left bound for generating periods.
end : str or period-like, default None
Right bound for generating periods.
periods : int, default None
Number of periods to generate.
freq : str or DateOffset, optional
Frequency alias. By default the freq is taken from `start` or `end`
if those are Period objects. Otherwise, the default is ``"D"`` for
daily frequency.
name : str, default None
Name of the resulting PeriodIndex.
Returns
-------
PeriodIndex
Notes
-----
Of the three parameters: ``start``, ``end``, and ``periods``, exactly two
must be specified.
To learn more about the frequency strings, please see `this link
<https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
Examples
--------
>>> pd.period_range(start='2017-01-01', end='2018-01-01', freq='M')
PeriodIndex(['2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06',
'2017-07', '2017-08', '2017-09', '2017-10', '2017-11', '2017-12',
'2018-01'],
dtype='period[M]', freq='M')
If ``start`` or ``end`` are ``Period`` objects, they will be used as anchor
endpoints for a ``PeriodIndex`` with frequency matching that of the
``period_range`` constructor.
>>> pd.period_range(start=pd.Period('2017Q1', freq='Q'),
... end=pd.Period('2017Q2', freq='Q'), freq='M')
PeriodIndex(['2017-03', '2017-04', '2017-05', '2017-06'],
dtype='period[M]', freq='M')
"""
if com.count_not_none(start, end, periods) != 2:
raise ValueError(
"Of the three parameters: start, end, and periods, "
"exactly two must be specified"
)
if freq is None and (not isinstance(start, Period) and not isinstance(end, Period)):
freq = "D"
data, freq = PeriodArray._generate_range(start, end, periods, freq, fields={})
data = PeriodArray(data, freq=freq)
return PeriodIndex(data, name=name)