# projektAI/venv/Lib/site-packages/pandas/core/arrays/sparse/array.py
"""
SparseArray data structure
"""
from collections import abc
import numbers
import operator
from typing import Any, Callable, Sequence, Type, TypeVar, Union
import warnings
import numpy as np
from pandas._libs import lib
import pandas._libs.sparse as splib
from pandas._libs.sparse import BlockIndex, IntIndex, SparseIndex
from pandas._libs.tslibs import NaT
from pandas._typing import Scalar
from pandas.compat.numpy import function as nv
from pandas.errors import PerformanceWarning
from pandas.core.dtypes.cast import (
astype_nansafe,
construct_1d_arraylike_from_scalar,
find_common_type,
infer_dtype_from_scalar,
maybe_box_datetimelike,
)
from pandas.core.dtypes.common import (
is_array_like,
is_bool_dtype,
is_datetime64_any_dtype,
is_datetime64tz_dtype,
is_dtype_equal,
is_integer,
is_object_dtype,
is_scalar,
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna
import pandas.core.algorithms as algos
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.sparse.dtype import SparseDtype
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.construction import extract_array, sanitize_array
from pandas.core.indexers import check_array_indexer
from pandas.core.missing import interpolate_2d
from pandas.core.nanops import check_below_min_count
import pandas.core.ops as ops
import pandas.io.formats.printing as printing
# ----------------------------------------------------------------------------
# Array
SparseArrayT = TypeVar("SparseArrayT", bound="SparseArray")
_sparray_doc_kwargs = {"klass": "SparseArray"}
def _get_fill(arr: "SparseArray") -> np.ndarray:
"""
Create a 0-dim ndarray containing the fill value
Parameters
----------
arr : SparseArray
Returns
-------
fill_value : ndarray
0-dim ndarray with just the fill value.
Notes
-----
coerce fill_value to arr dtype if possible
int64 SparseArray can have NaN as fill_value if there is no missing
"""
try:
return np.asarray(arr.fill_value, dtype=arr.dtype.subtype)
except ValueError:
return np.asarray(arr.fill_value)
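# Illustrative sketch (comments only, not executed): the fill value is
# coerced to the array's subtype when possible:
#
#   >>> _get_fill(SparseArray([0, 1], fill_value=0)).dtype
#   dtype('int64')
#   >>> _get_fill(SparseArray([0.0, 1.0], fill_value=np.nan)).dtype
#   dtype('float64')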
def _sparse_array_op(
left: "SparseArray", right: "SparseArray", op: Callable, name: str
) -> Any:
"""
Perform a binary operation between two arrays.
Parameters
----------
left : Union[SparseArray, ndarray]
right : Union[SparseArray, ndarray]
op : Callable
The binary operation to perform
    name : str
Name of the callable.
Returns
-------
SparseArray
"""
if name.startswith("__"):
# For lookups in _libs.sparse we need non-dunder op name
name = name[2:-2]
# dtype used to find corresponding sparse method
ltype = left.dtype.subtype
rtype = right.dtype.subtype
if not is_dtype_equal(ltype, rtype):
subtype = find_common_type([ltype, rtype])
ltype = SparseDtype(subtype, left.fill_value)
rtype = SparseDtype(subtype, right.fill_value)
# TODO(GH-23092): pass copy=False. Need to fix astype_nansafe
left = left.astype(ltype)
right = right.astype(rtype)
dtype = ltype.subtype
else:
dtype = ltype
# dtype the result must have
result_dtype = None
if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
with np.errstate(all="ignore"):
result = op(left.to_dense(), right.to_dense())
fill = op(_get_fill(left), _get_fill(right))
if left.sp_index.ngaps == 0:
index = left.sp_index
else:
index = right.sp_index
elif left.sp_index.equals(right.sp_index):
with np.errstate(all="ignore"):
result = op(left.sp_values, right.sp_values)
fill = op(_get_fill(left), _get_fill(right))
index = left.sp_index
else:
if name[0] == "r":
left, right = right, left
name = name[1:]
if name in ("and", "or", "xor") and dtype == "bool":
opname = f"sparse_{name}_uint8"
# to make template simple, cast here
left_sp_values = left.sp_values.view(np.uint8)
right_sp_values = right.sp_values.view(np.uint8)
result_dtype = bool
else:
opname = f"sparse_{name}_{dtype}"
left_sp_values = left.sp_values
right_sp_values = right.sp_values
sparse_op = getattr(splib, opname)
with np.errstate(all="ignore"):
result, index, fill = sparse_op(
left_sp_values,
left.sp_index,
left.fill_value,
right_sp_values,
right.sp_index,
right.fill_value,
)
if result_dtype is None:
result_dtype = result.dtype
return _wrap_result(name, result, index, fill, dtype=result_dtype)
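# A rough orientation sketch (hypothetical session): with differing sparse
# indices and an int64 subtype, the branch above dispatches to the generated
# kernel ``sparse_add_int64`` in pandas._libs.sparse:
#
#   >>> left = SparseArray([0, 1, 0, 2])
#   >>> right = SparseArray([0, 0, 3, 0])
#   >>> np.asarray(_sparse_array_op(left, right, operator.add, "add"))
#   array([0, 1, 3, 2])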
def _wrap_result(name, data, sparse_index, fill_value, dtype=None):
"""
wrap op result to have correct dtype
"""
if name.startswith("__"):
# e.g. __eq__ --> eq
name = name[2:-2]
if name in ("eq", "ne", "lt", "gt", "le", "ge"):
dtype = bool
fill_value = lib.item_from_zerodim(fill_value)
if is_bool_dtype(dtype):
# fill_value may be np.bool_
fill_value = bool(fill_value)
return SparseArray(
data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype
)
class SparseArray(OpsMixin, PandasObject, ExtensionArray):
"""
An ExtensionArray for storing sparse data.
.. versionchanged:: 0.24.0
Implements the ExtensionArray interface.
Parameters
----------
data : array-like
A dense array of values to store in the SparseArray. This may contain
`fill_value`.
sparse_index : SparseIndex, optional
index : Index
fill_value : scalar, optional
Elements in `data` that are `fill_value` are not stored in the
SparseArray. For memory savings, this should be the most common value
in `data`. By default, `fill_value` depends on the dtype of `data`:
    =========== ==========
    data.dtype  na_value
    =========== ==========
    float       ``np.nan``
    int         ``0``
    bool        ``False``
    datetime64  ``pd.NaT``
    timedelta64 ``pd.NaT``
    =========== ==========
The fill value is potentially specified in three ways. In order of
precedence, these are
1. The `fill_value` argument
2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is
a ``SparseDtype``
3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype`
is not a ``SparseDtype`` and `data` is a ``SparseArray``.
kind : {'integer', 'block'}, default 'integer'
The type of storage for sparse locations.
* 'block': Stores a `block` and `block_length` for each
contiguous *span* of sparse values. This is best when
sparse data tends to be clumped together, with large
regions of ``fill-value`` values between sparse values.
* 'integer': uses an integer to store the location of
each sparse value.
dtype : np.dtype or SparseDtype, optional
The dtype to use for the SparseArray. For numpy dtypes, this
determines the dtype of ``self.sp_values``. For SparseDtype,
this determines ``self.sp_values`` and ``self.fill_value``.
copy : bool, default False
Whether to explicitly copy the incoming `data` array.
Attributes
----------
None
Methods
-------
None
Examples
--------
>>> from pandas.arrays import SparseArray
>>> arr = SparseArray([0, 0, 1, 2])
>>> arr
[0, 0, 1, 2]
Fill: 0
IntIndex
Indices: array([2, 3], dtype=int32)
"""
_subtyp = "sparse_array" # register ABCSparseArray
_hidden_attrs = PandasObject._hidden_attrs | frozenset(["get_values"])
_sparse_index: SparseIndex
def __init__(
self,
data,
sparse_index=None,
index=None,
fill_value=None,
kind="integer",
dtype=None,
copy=False,
):
if fill_value is None and isinstance(dtype, SparseDtype):
fill_value = dtype.fill_value
if isinstance(data, type(self)):
# disable normal inference on dtype, sparse_index, & fill_value
if sparse_index is None:
sparse_index = data.sp_index
if fill_value is None:
fill_value = data.fill_value
if dtype is None:
dtype = data.dtype
# TODO: make kind=None, and use data.kind?
data = data.sp_values
        # Handle user-provided dtype
if isinstance(dtype, str):
# Two options: dtype='int', regular numpy dtype
# or dtype='Sparse[int]', a sparse dtype
try:
dtype = SparseDtype.construct_from_string(dtype)
except TypeError:
dtype = pandas_dtype(dtype)
if isinstance(dtype, SparseDtype):
if fill_value is None:
fill_value = dtype.fill_value
dtype = dtype.subtype
if index is not None and not is_scalar(data):
raise Exception("must only pass scalars with an index")
if is_scalar(data):
if index is not None and data is None:
data = np.nan
if index is not None:
npoints = len(index)
elif sparse_index is None:
npoints = 1
else:
npoints = sparse_index.length
dtype = infer_dtype_from_scalar(data)[0]
data = construct_1d_arraylike_from_scalar(data, npoints, dtype)
if dtype is not None:
dtype = pandas_dtype(dtype)
# TODO: disentangle the fill_value dtype inference from
# dtype inference
if data is None:
# TODO: What should the empty dtype be? Object or float?
data = np.array([], dtype=dtype)
if not is_array_like(data):
try:
# probably shared code in sanitize_series
data = sanitize_array(data, index=None)
except ValueError:
# NumPy may raise a ValueError on data like [1, []]
# we retry with object dtype here.
if dtype is None:
dtype = object
data = np.atleast_1d(np.asarray(data, dtype=dtype))
else:
raise
if copy:
# TODO: avoid double copy when dtype forces cast.
data = data.copy()
if fill_value is None:
fill_value_dtype = data.dtype if dtype is None else dtype
if fill_value_dtype is None:
fill_value = np.nan
else:
fill_value = na_value_for_dtype(fill_value_dtype)
if isinstance(data, type(self)) and sparse_index is None:
sparse_index = data._sparse_index
sparse_values = np.asarray(data.sp_values, dtype=dtype)
elif sparse_index is None:
data = extract_array(data, extract_numpy=True)
if not isinstance(data, np.ndarray):
# EA
if is_datetime64tz_dtype(data.dtype):
warnings.warn(
f"Creating SparseArray from {data.dtype} data "
"loses timezone information. Cast to object before "
"sparse to retain timezone information.",
UserWarning,
stacklevel=2,
)
data = np.asarray(data, dtype="datetime64[ns]")
data = np.asarray(data)
sparse_values, sparse_index, fill_value = make_sparse(
data, kind=kind, fill_value=fill_value, dtype=dtype
)
else:
sparse_values = np.asarray(data, dtype=dtype)
if len(sparse_values) != sparse_index.npoints:
raise AssertionError(
f"Non array-like type {type(sparse_values)} must "
"have the same length as the index"
)
self._sparse_index = sparse_index
self._sparse_values = sparse_values
self._dtype = SparseDtype(sparse_values.dtype, fill_value)
@classmethod
def _simple_new(
cls: Type[SparseArrayT],
sparse_array: np.ndarray,
sparse_index: SparseIndex,
dtype: SparseDtype,
) -> SparseArrayT:
new = object.__new__(cls)
new._sparse_index = sparse_index
new._sparse_values = sparse_array
new._dtype = dtype
return new
@classmethod
def from_spmatrix(cls, data):
"""
Create a SparseArray from a scipy.sparse matrix.
.. versionadded:: 0.25.0
Parameters
----------
        data : scipy.sparse.spmatrix
This should be a SciPy sparse matrix where the size
of the second dimension is 1. In other words, a
sparse matrix with a single column.
Returns
-------
SparseArray
Examples
--------
>>> import scipy.sparse
>>> mat = scipy.sparse.coo_matrix((4, 1))
>>> pd.arrays.SparseArray.from_spmatrix(mat)
[0.0, 0.0, 0.0, 0.0]
Fill: 0.0
IntIndex
Indices: array([], dtype=int32)
"""
length, ncol = data.shape
if ncol != 1:
raise ValueError(f"'data' must have a single column, not '{ncol}'")
# our sparse index classes require that the positions be strictly
# increasing. So we need to sort loc, and arr accordingly.
data = data.tocsc()
data.sort_indices()
arr = data.data
idx = data.indices
zero = np.array(0, dtype=arr.dtype).item()
dtype = SparseDtype(arr.dtype, zero)
index = IntIndex(length, idx)
return cls._simple_new(arr, index, dtype)
def __array__(self, dtype=None) -> np.ndarray:
fill_value = self.fill_value
if self.sp_index.ngaps == 0:
# Compat for na dtype and int values.
return self.sp_values
if dtype is None:
# Can NumPy represent this type?
# If not, `np.result_type` will raise. We catch that
# and return object.
if is_datetime64_any_dtype(self.sp_values.dtype):
# However, we *do* special-case the common case of
# a datetime64 with pandas NaT.
if fill_value is NaT:
# Can't put pd.NaT in a datetime64[ns]
fill_value = np.datetime64("NaT")
try:
dtype = np.result_type(self.sp_values.dtype, type(fill_value))
except TypeError:
dtype = object
out = np.full(self.shape, fill_value, dtype=dtype)
out[self.sp_index.to_int_index().indices] = self.sp_values
return out
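    # Densification sketch (illustrative): when there are gaps, the result
    # dtype comes from ``np.result_type`` of the subtype and the fill value;
    # a gap-free array short-circuits to ``sp_values``:
    #
    #   >>> np.asarray(SparseArray([1.0, np.nan]))
    #   array([ 1., nan])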
def __setitem__(self, key, value):
# I suppose we could allow setting of non-fill_value elements.
# TODO(SparseArray.__setitem__): remove special cases in
# ExtensionBlock.where
msg = "SparseArray does not support item assignment via setitem"
raise TypeError(msg)
@classmethod
def _from_sequence(cls, scalars, *, dtype=None, copy=False):
return cls(scalars, dtype=dtype)
@classmethod
def _from_factorized(cls, values, original):
return cls(values, dtype=original.dtype)
# ------------------------------------------------------------------------
# Data
# ------------------------------------------------------------------------
@property
def sp_index(self):
"""
The SparseIndex containing the location of non- ``fill_value`` points.
"""
return self._sparse_index
@property
def sp_values(self) -> np.ndarray:
"""
An ndarray containing the non- ``fill_value`` values.
Examples
--------
>>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
>>> s.sp_values
array([1, 2])
"""
return self._sparse_values
@property
def dtype(self):
return self._dtype
@property
def fill_value(self):
"""
Elements in `data` that are `fill_value` are not stored.
For memory savings, this should be the most common value in the array.
"""
return self.dtype.fill_value
@fill_value.setter
def fill_value(self, value):
self._dtype = SparseDtype(self.dtype.subtype, value)
@property
def kind(self) -> str:
"""
The kind of sparse index for this array. One of {'integer', 'block'}.
"""
if isinstance(self.sp_index, IntIndex):
return "integer"
else:
return "block"
@property
def _valid_sp_values(self):
sp_vals = self.sp_values
mask = notna(sp_vals)
return sp_vals[mask]
def __len__(self) -> int:
return self.sp_index.length
@property
def _null_fill_value(self):
return self._dtype._is_na_fill_value
def _fill_value_matches(self, fill_value):
if self._null_fill_value:
return isna(fill_value)
else:
return self.fill_value == fill_value
@property
def nbytes(self) -> int:
return self.sp_values.nbytes + self.sp_index.nbytes
@property
def density(self):
"""
        The percent of non- ``fill_value`` points, as a decimal.
Examples
--------
>>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
>>> s.density
0.6
"""
return float(self.sp_index.npoints) / float(self.sp_index.length)
@property
def npoints(self) -> int:
"""
The number of non- ``fill_value`` points.
Examples
--------
>>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
>>> s.npoints
3
"""
return self.sp_index.npoints
def isna(self):
# If null fill value, we want SparseDtype[bool, true]
# to preserve the same memory usage.
dtype = SparseDtype(bool, self._null_fill_value)
return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
def fillna(self, value=None, method=None, limit=None):
"""
Fill missing values with `value`.
Parameters
----------
value : scalar, optional
method : str, optional
.. warning::
               Using 'method' will result in high memory use,
               as the sparse array is first converted to a dense
               in-memory ndarray before filling.
limit : int, optional
Returns
-------
SparseArray
Notes
-----
When `value` is specified, the result's ``fill_value`` depends on
``self.fill_value``. The goal is to maintain low-memory use.
If ``self.fill_value`` is NA, the result dtype will be
``SparseDtype(self.dtype, fill_value=value)``. This will preserve
        the amount of memory used before and after filling.
When ``self.fill_value`` is not NA, the result dtype will be
``self.dtype``. Again, this preserves the amount of memory used.
"""
if (method is None and value is None) or (
method is not None and value is not None
):
raise ValueError("Must specify one of 'method' or 'value'.")
elif method is not None:
msg = "fillna with 'method' requires high memory usage."
warnings.warn(msg, PerformanceWarning)
filled = interpolate_2d(np.asarray(self), method=method, limit=limit)
return type(self)(filled, fill_value=self.fill_value)
else:
new_values = np.where(isna(self.sp_values), value, self.sp_values)
if self._null_fill_value:
# This is essentially just updating the dtype.
new_dtype = SparseDtype(self.dtype.subtype, fill_value=value)
else:
new_dtype = self.dtype
return self._simple_new(new_values, self._sparse_index, new_dtype)
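    # A small sketch of the Notes above (hypothetical values): with a NA
    # fill_value, filling only swaps the fill value stored in the dtype,
    # so memory use is unchanged:
    #
    #   >>> arr = SparseArray([np.nan, 1.0, np.nan])
    #   >>> filled = arr.fillna(0.0)
    #   >>> filled.fill_value
    #   0.0
    #   >>> np.asarray(filled)
    #   array([0., 1., 0.])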
def shift(self, periods=1, fill_value=None):
if not len(self) or periods == 0:
return self.copy()
if isna(fill_value):
fill_value = self.dtype.na_value
subtype = np.result_type(fill_value, self.dtype.subtype)
if subtype != self.dtype.subtype:
# just coerce up front
arr = self.astype(SparseDtype(subtype, self.fill_value))
else:
arr = self
empty = self._from_sequence(
[fill_value] * min(abs(periods), len(self)), dtype=arr.dtype
)
if periods > 0:
a = empty
b = arr[:-periods]
else:
a = arr[abs(periods) :]
b = empty
return arr._concat_same_type([a, b])
def _first_fill_value_loc(self):
"""
        Get the location of the first fill value.
Returns
-------
int
"""
if len(self) == 0 or self.sp_index.npoints == len(self):
return -1
indices = self.sp_index.to_int_index().indices
if not len(indices) or indices[0] > 0:
return 0
        # diff is not sorted, so searchsorted is invalid here; append a
        # sentinel gap so a trailing run of fill values is also detected
        diff = np.r_[np.diff(indices), 2]
        return indices[(diff > 1).argmax()] + 1
def unique(self):
uniques = list(algos.unique(self.sp_values))
fill_loc = self._first_fill_value_loc()
if fill_loc >= 0:
uniques.insert(fill_loc, self.fill_value)
return type(self)._from_sequence(uniques, dtype=self.dtype)
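    # Sketch: ``_first_fill_value_loc`` lets ``unique`` keep the fill value
    # in its original relative position among the uniques (values here are
    # hypothetical):
    #
    #   >>> np.asarray(SparseArray([1, 0, 0, 2]).unique())
    #   array([1, 0, 2])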
def _values_for_factorize(self):
# Still override this for hash_pandas_object
return np.asarray(self), self.fill_value
def factorize(self, na_sentinel=-1):
# Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
# The sparsity on this is backwards from what Sparse would want. Want
# ExtensionArray.factorize -> Tuple[EA, EA]
# Given that we have to return a dense array of codes, why bother
# implementing an efficient factorize?
codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
uniques = SparseArray(uniques, dtype=self.dtype)
return codes, uniques
def value_counts(self, dropna=True):
"""
Returns a Series containing counts of unique values.
Parameters
----------
        dropna : bool, default True
Don't include counts of NaN, even if NaN is in sp_values.
Returns
-------
counts : Series
"""
from pandas import Index, Series
keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
fcounts = self.sp_index.ngaps
if fcounts > 0 and (not self._null_fill_value or not dropna):
mask = isna(keys) if self._null_fill_value else keys == self.fill_value
if mask.any():
counts[mask] += fcounts
else:
keys = np.insert(keys, 0, self.fill_value)
counts = np.insert(counts, 0, fcounts)
if not isinstance(keys, ABCIndexClass):
keys = Index(keys)
return Series(counts, index=keys)
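    # Counting sketch (illustrative): the ``ngaps`` gaps are attributed to
    # the fill value, so its key shows up even though it is never stored
    # in ``sp_values``:
    #
    #   >>> SparseArray([0, 0, 1]).value_counts()
    #   0    2
    #   1    1
    #   dtype: int64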
# --------
# Indexing
# --------
def __getitem__(self, key):
if isinstance(key, tuple):
if len(key) > 1:
raise IndexError("too many indices for array.")
key = key[0]
if is_integer(key):
return self._get_val_at(key)
elif isinstance(key, tuple):
data_slice = self.to_dense()[key]
elif isinstance(key, slice):
# special case to preserve dtypes
if key == slice(None):
return self.copy()
# TODO: this logic is surely elsewhere
# TODO: this could be more efficient
indices = np.arange(len(self), dtype=np.int32)[key]
return self.take(indices)
else:
# TODO: I think we can avoid densifying when masking a
# boolean SparseArray with another. Need to look at the
# key's fill_value for True / False, and then do an intersection
# on the indices of the sp_values.
if isinstance(key, SparseArray):
if is_bool_dtype(key):
key = key.to_dense()
else:
key = np.asarray(key)
key = check_array_indexer(self, key)
if com.is_bool_indexer(key):
return self.take(np.arange(len(key), dtype=np.int32)[key])
elif hasattr(key, "__len__"):
return self.take(key)
else:
raise ValueError(f"Cannot slice with '{key}'")
return type(self)(data_slice, kind=self.kind)
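    # Indexing sketch (illustrative): integers return scalars; slices and
    # boolean masks round-trip through ``take`` and stay sparse:
    #
    #   >>> arr = SparseArray([0, 0, 1, 2])
    #   >>> arr[2]
    #   1
    #   >>> np.asarray(arr[1:])
    #   array([0, 1, 2])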
def _get_val_at(self, loc):
n = len(self)
if loc < 0:
loc += n
if loc >= n or loc < 0:
raise IndexError("Out of bounds access")
sp_loc = self.sp_index.lookup(loc)
if sp_loc == -1:
return self.fill_value
else:
val = self.sp_values[sp_loc]
val = maybe_box_datetimelike(val, self.sp_values.dtype)
return val
def take(self, indices, *, allow_fill=False, fill_value=None) -> "SparseArray":
if is_scalar(indices):
raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.")
indices = np.asarray(indices, dtype=np.int32)
if indices.size == 0:
result = np.array([], dtype="object")
kwargs = {"dtype": self.dtype}
elif allow_fill:
result = self._take_with_fill(indices, fill_value=fill_value)
kwargs = {}
else:
result = self._take_without_fill(indices)
kwargs = {"dtype": self.dtype}
return type(self)(result, fill_value=self.fill_value, kind=self.kind, **kwargs)
def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:
if fill_value is None:
fill_value = self.dtype.na_value
if indices.min() < -1:
raise ValueError(
"Invalid value in 'indices'. Must be between -1 "
"and the length of the array."
)
if indices.max() >= len(self):
raise IndexError("out of bounds value in 'indices'.")
if len(self) == 0:
# Empty... Allow taking only if all empty
if (indices == -1).all():
dtype = np.result_type(self.sp_values, type(fill_value))
taken = np.empty_like(indices, dtype=dtype)
taken.fill(fill_value)
return taken
else:
raise IndexError("cannot do a non-empty take from an empty axes.")
# sp_indexer may be -1 for two reasons
# 1.) we took for an index of -1 (new)
# 2.) we took a value that was self.fill_value (old)
sp_indexer = self.sp_index.lookup_array(indices)
new_fill_indices = indices == -1
old_fill_indices = (sp_indexer == -1) & ~new_fill_indices
if self.sp_index.npoints == 0 and old_fill_indices.all():
# We've looked up all valid points on an all-sparse array.
taken = np.full(
sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype
)
elif self.sp_index.npoints == 0:
# Avoid taking from the empty self.sp_values
_dtype = np.result_type(self.dtype.subtype, type(fill_value))
taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype)
else:
taken = self.sp_values.take(sp_indexer)
# Fill in two steps.
# Old fill values
# New fill values
# potentially coercing to a new dtype at each stage.
m0 = sp_indexer[old_fill_indices] < 0
m1 = sp_indexer[new_fill_indices] < 0
result_type = taken.dtype
if m0.any():
result_type = np.result_type(result_type, type(self.fill_value))
taken = taken.astype(result_type)
taken[old_fill_indices] = self.fill_value
if m1.any():
result_type = np.result_type(result_type, type(fill_value))
taken = taken.astype(result_type)
taken[new_fill_indices] = fill_value
return taken
def _take_without_fill(self, indices) -> Union[np.ndarray, "SparseArray"]:
to_shift = indices < 0
indices = indices.copy()
n = len(self)
if (indices.max() >= n) or (indices.min() < -n):
if n == 0:
raise IndexError("cannot do a non-empty take from an empty axes.")
else:
raise IndexError("out of bounds value in 'indices'.")
if to_shift.any():
indices[to_shift] += n
if self.sp_index.npoints == 0:
# edge case in take...
# I think just return
out = np.full(
indices.shape,
self.fill_value,
dtype=np.result_type(type(self.fill_value)),
)
arr, sp_index, fill_value = make_sparse(out, fill_value=self.fill_value)
return type(self)(arr, sparse_index=sp_index, fill_value=fill_value)
sp_indexer = self.sp_index.lookup_array(indices)
taken = self.sp_values.take(sp_indexer)
fillable = sp_indexer < 0
if fillable.any():
# TODO: may need to coerce array to fill value
result_type = np.result_type(taken, type(self.fill_value))
taken = taken.astype(result_type)
taken[fillable] = self.fill_value
return taken
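    # take() sketch (hypothetical values): with ``allow_fill=True`` a -1
    # index draws the dtype's na_value, while without it -1 means "last
    # element", mirroring NumPy semantics:
    #
    #   >>> arr = SparseArray([0.0, 1.0, 2.0])
    #   >>> np.asarray(arr.take([1, -1], allow_fill=True))
    #   array([ 1., nan])
    #   >>> np.asarray(arr.take([1, -1]))
    #   array([1., 2.])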
def searchsorted(self, v, side="left", sorter=None):
msg = "searchsorted requires high memory usage."
warnings.warn(msg, PerformanceWarning, stacklevel=2)
        v = np.asarray(v)
return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter)
def copy(self: SparseArrayT) -> SparseArrayT:
values = self.sp_values.copy()
return self._simple_new(values, self.sp_index, self.dtype)
@classmethod
def _concat_same_type(
cls: Type[SparseArrayT], to_concat: Sequence[SparseArrayT]
) -> SparseArrayT:
fill_value = to_concat[0].fill_value
values = []
length = 0
if to_concat:
sp_kind = to_concat[0].kind
else:
sp_kind = "integer"
if sp_kind == "integer":
indices = []
for arr in to_concat:
idx = arr.sp_index.to_int_index().indices.copy()
idx += length # TODO: wraparound
length += arr.sp_index.length
values.append(arr.sp_values)
indices.append(idx)
data = np.concatenate(values)
indices = np.concatenate(indices)
sp_index = IntIndex(length, indices)
else:
# when concatenating block indices, we don't claim that you'll
            # get an identical index as concatenating the values and then
# creating a new index. We don't want to spend the time trying
# to merge blocks across arrays in `to_concat`, so the resulting
# BlockIndex may have more blocks.
blengths = []
blocs = []
for arr in to_concat:
idx = arr.sp_index.to_block_index()
values.append(arr.sp_values)
blocs.append(idx.blocs.copy() + length)
blengths.append(idx.blengths)
length += arr.sp_index.length
data = np.concatenate(values)
blocs = np.concatenate(blocs)
blengths = np.concatenate(blengths)
sp_index = BlockIndex(length, blocs, blengths)
return cls(data, sparse_index=sp_index, fill_value=fill_value)
def astype(self, dtype=None, copy=True):
"""
Change the dtype of a SparseArray.
The output will always be a SparseArray. To convert to a dense
ndarray with a certain dtype, use :meth:`numpy.asarray`.
Parameters
----------
dtype : np.dtype or ExtensionDtype
For SparseDtype, this changes the dtype of
``self.sp_values`` and the ``self.fill_value``.
For other dtypes, this only changes the dtype of
``self.sp_values``.
copy : bool, default True
Whether to ensure a copy is made, even if not necessary.
Returns
-------
SparseArray
Examples
--------
>>> arr = pd.arrays.SparseArray([0, 0, 1, 2])
>>> arr
[0, 0, 1, 2]
Fill: 0
IntIndex
Indices: array([2, 3], dtype=int32)
>>> arr.astype(np.dtype('int32'))
[0, 0, 1, 2]
Fill: 0
IntIndex
Indices: array([2, 3], dtype=int32)
Using a NumPy dtype with a different kind (e.g. float) will coerce
just ``self.sp_values``.
>>> arr.astype(np.dtype('float64'))
... # doctest: +NORMALIZE_WHITESPACE
[0.0, 0.0, 1.0, 2.0]
Fill: 0.0
IntIndex
Indices: array([2, 3], dtype=int32)
        Use a SparseDtype if you wish to change the fill value as well.
>>> arr.astype(SparseDtype("float64", fill_value=np.nan))
... # doctest: +NORMALIZE_WHITESPACE
[nan, nan, 1.0, 2.0]
Fill: nan
IntIndex
Indices: array([2, 3], dtype=int32)
"""
if is_dtype_equal(dtype, self._dtype):
if not copy:
return self
else:
return self.copy()
dtype = self.dtype.update_dtype(dtype)
subtype = dtype._subtype_with_str
# TODO copy=False is broken for astype_nansafe with int -> float, so cannot
# passthrough copy keyword: https://github.com/pandas-dev/pandas/issues/34456
sp_values = astype_nansafe(self.sp_values, subtype, copy=True)
if sp_values is self.sp_values and copy:
sp_values = sp_values.copy()
return self._simple_new(sp_values, self.sp_index, dtype)
def map(self, mapper):
"""
Map categories using input correspondence (dict, Series, or function).
Parameters
----------
mapper : dict, Series, callable
The correspondence from old values to new.
Returns
-------
SparseArray
The output array will have the same density as the input.
The output fill value will be the result of applying the
            mapping to ``self.fill_value``.
Examples
--------
>>> arr = pd.arrays.SparseArray([0, 1, 2])
>>> arr.map(lambda x: x + 10)
[10, 11, 12]
Fill: 10
IntIndex
Indices: array([1, 2], dtype=int32)
>>> arr.map({0: 10, 1: 11, 2: 12})
[10, 11, 12]
Fill: 10
IntIndex
Indices: array([1, 2], dtype=int32)
>>> arr.map(pd.Series([10, 11, 12], index=[0, 1, 2]))
[10, 11, 12]
Fill: 10
IntIndex
Indices: array([1, 2], dtype=int32)
"""
# this is used in apply.
# We get hit since we're an "is_extension_type" but regular extension
# types are not hit. This may be worth adding to the interface.
if isinstance(mapper, ABCSeries):
mapper = mapper.to_dict()
if isinstance(mapper, abc.Mapping):
fill_value = mapper.get(self.fill_value, self.fill_value)
sp_values = [mapper.get(x, None) for x in self.sp_values]
else:
fill_value = mapper(self.fill_value)
sp_values = [mapper(x) for x in self.sp_values]
return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_value)
def to_dense(self):
"""
Convert SparseArray to a NumPy array.
Returns
-------
arr : NumPy array
"""
return np.asarray(self, dtype=self.sp_values.dtype)
_internal_get_values = to_dense
# ------------------------------------------------------------------------
# IO
# ------------------------------------------------------------------------
def __setstate__(self, state):
"""Necessary for making this object picklable"""
if isinstance(state, tuple):
# Compat for pandas < 0.24.0
nd_state, (fill_value, sp_index) = state
sparse_values = np.array([])
sparse_values.__setstate__(nd_state)
self._sparse_values = sparse_values
self._sparse_index = sp_index
self._dtype = SparseDtype(sparse_values.dtype, fill_value)
else:
self.__dict__.update(state)
def nonzero(self):
if self.fill_value == 0:
return (self.sp_index.to_int_index().indices,)
else:
return (self.sp_index.to_int_index().indices[self.sp_values != 0],)
# ------------------------------------------------------------------------
# Reductions
# ------------------------------------------------------------------------
def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
method = getattr(self, name, None)
if method is None:
raise TypeError(f"cannot perform {name} with type {self.dtype}")
if skipna:
arr = self
else:
arr = self.dropna()
# we don't support these kwargs.
        # They should only be present when called via pandas, so do it here
        # instead of in `any` / `all` (which will raise if they're present,
        # thanks to nv.validate).
kwargs.pop("filter_type", None)
kwargs.pop("numeric_only", None)
kwargs.pop("op", None)
return getattr(arr, name)(**kwargs)
def all(self, axis=None, *args, **kwargs):
"""
        Tests whether all elements evaluate to True.
Returns
-------
all : bool
See Also
--------
numpy.all
"""
nv.validate_all(args, kwargs)
values = self.sp_values
if len(values) != len(self) and not np.all(self.fill_value):
return False
return values.all()
def any(self, axis=0, *args, **kwargs):
"""
        Tests whether at least one element evaluates to True.
Returns
-------
any : bool
See Also
--------
numpy.any
"""
nv.validate_any(args, kwargs)
values = self.sp_values
if len(values) != len(self) and np.any(self.fill_value):
return True
return values.any().item()
def sum(self, axis: int = 0, min_count: int = 0, *args, **kwargs) -> Scalar:
"""
Sum of non-NA/null values
Parameters
----------
axis : int, default 0
Not Used. NumPy compatibility.
min_count : int, default 0
The required number of valid values to perform the summation. If fewer
than ``min_count`` valid values are present, the result will be the missing
            value indicator for the subarray type.
*args, **kwargs
Not Used. NumPy compatibility.
Returns
-------
scalar
"""
nv.validate_sum(args, kwargs)
valid_vals = self._valid_sp_values
sp_sum = valid_vals.sum()
if self._null_fill_value:
if check_below_min_count(valid_vals.shape, None, min_count):
return na_value_for_dtype(self.dtype.subtype, compat=False)
return sp_sum
else:
nsparse = self.sp_index.ngaps
if check_below_min_count(valid_vals.shape, None, min_count - nsparse):
return na_value_for_dtype(self.dtype.subtype, compat=False)
return sp_sum + self.fill_value * nsparse
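    # min_count sketch (hypothetical numbers): with a non-NA fill the gaps
    # count as valid values, hence the ``min_count - nsparse`` offset above:
    #
    #   >>> SparseArray([0, 0, 1]).sum(min_count=3)
    #   1
    #   >>> SparseArray([np.nan, 1.0]).sum(min_count=2)
    #   nan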
def cumsum(self, axis=0, *args, **kwargs):
"""
Cumulative sum of non-NA/null values.
        When performing the cumulative summation, any NA/null values will
be skipped. The resulting SparseArray will preserve the locations of
NaN values, but the fill value will be `np.nan` regardless.
Parameters
----------
axis : int or None
Axis over which to perform the cumulative summation. If None,
perform cumulative summation over flattened array.
Returns
-------
cumsum : SparseArray
"""
nv.validate_cumsum(args, kwargs)
if axis is not None and axis >= self.ndim: # Mimic ndarray behaviour.
raise ValueError(f"axis(={axis}) out of bounds")
if not self._null_fill_value:
return SparseArray(self.to_dense()).cumsum()
return SparseArray(
self.sp_values.cumsum(),
sparse_index=self.sp_index,
fill_value=self.fill_value,
)
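    # cumsum sketch (illustrative): with a NA fill the running sum only
    # touches ``sp_values``, so NaN positions survive:
    #
    #   >>> np.asarray(SparseArray([1.0, np.nan, 2.0]).cumsum())
    #   array([ 1., nan,  3.])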
def mean(self, axis=0, *args, **kwargs):
"""
Mean of non-NA/null values
Returns
-------
mean : float
"""
nv.validate_mean(args, kwargs)
valid_vals = self._valid_sp_values
sp_sum = valid_vals.sum()
ct = len(valid_vals)
if self._null_fill_value:
return sp_sum / ct
else:
nsparse = self.sp_index.ngaps
return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)
# ------------------------------------------------------------------------
# Ufuncs
# ------------------------------------------------------------------------
_HANDLED_TYPES = (np.ndarray, numbers.Number)
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
out = kwargs.get("out", ())
for x in inputs + out:
if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)):
return NotImplemented
# for binary ops, use our custom dunder methods
result = ops.maybe_dispatch_ufunc_to_dunder_op(
self, ufunc, method, *inputs, **kwargs
)
if result is not NotImplemented:
return result
if len(inputs) == 1:
# No alignment necessary.
sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
fill_value = getattr(ufunc, method)(self.fill_value, **kwargs)
if isinstance(sp_values, tuple):
# multiple outputs. e.g. modf
arrays = tuple(
self._simple_new(
sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv)
)
for sp_value, fv in zip(sp_values, fill_value)
)
return arrays
elif is_scalar(sp_values):
# e.g. reductions
return sp_values
return self._simple_new(
sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value)
)
result = getattr(ufunc, method)(*[np.asarray(x) for x in inputs], **kwargs)
if out:
if len(out) == 1:
out = out[0]
return out
if type(result) is tuple:
return tuple(type(self)(x) for x in result)
elif method == "at":
# no return value
return None
else:
return type(self)(result)
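    # Ufunc sketch (illustrative): a unary ufunc is applied separately to
    # ``sp_values`` and to ``fill_value``, so the result stays sparse:
    #
    #   >>> np.asarray(np.negative(SparseArray([0, 0, 1])))
    #   array([ 0,  0, -1])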
def __abs__(self):
return np.abs(self)
# ------------------------------------------------------------------------
# Ops
# ------------------------------------------------------------------------
def _arith_method(self, other, op):
op_name = op.__name__
if isinstance(other, SparseArray):
return _sparse_array_op(self, other, op, op_name)
elif is_scalar(other):
with np.errstate(all="ignore"):
fill = op(_get_fill(self), np.asarray(other))
result = op(self.sp_values, other)
if op_name == "divmod":
left, right = result
lfill, rfill = fill
return (
_wrap_result(op_name, left, self.sp_index, lfill),
_wrap_result(op_name, right, self.sp_index, rfill),
)
return _wrap_result(op_name, result, self.sp_index, fill)
else:
other = np.asarray(other)
with np.errstate(all="ignore"):
# TODO: look into _wrap_result
if len(self) != len(other):
raise AssertionError(
f"length mismatch: {len(self)} vs. {len(other)}"
)
if not isinstance(other, SparseArray):
dtype = getattr(other, "dtype", None)
other = SparseArray(other, fill_value=self.fill_value, dtype=dtype)
return _sparse_array_op(self, other, op, op_name)
def _cmp_method(self, other, op) -> "SparseArray":
if not is_scalar(other) and not isinstance(other, type(self)):
# convert list-like to ndarray
other = np.asarray(other)
if isinstance(other, np.ndarray):
# TODO: make this more flexible than just ndarray...
if len(self) != len(other):
raise AssertionError(f"length mismatch: {len(self)} vs. {len(other)}")
other = SparseArray(other, fill_value=self.fill_value)
if isinstance(other, SparseArray):
op_name = op.__name__.strip("_")
return _sparse_array_op(self, other, op, op_name)
else:
with np.errstate(all="ignore"):
fill_value = op(self.fill_value, other)
result = op(self.sp_values, other)
return type(self)(
result,
sparse_index=self.sp_index,
fill_value=fill_value,
dtype=np.bool_,
)
_logical_method = _cmp_method
def _unary_method(self, op) -> "SparseArray":
fill_value = op(np.array(self.fill_value)).item()
values = op(self.sp_values)
dtype = SparseDtype(values.dtype, fill_value)
return type(self)._simple_new(values, self.sp_index, dtype)
def __pos__(self) -> "SparseArray":
return self._unary_method(operator.pos)
def __neg__(self) -> "SparseArray":
return self._unary_method(operator.neg)
def __invert__(self) -> "SparseArray":
return self._unary_method(operator.invert)
# ----------
# Formatting
# -----------
def __repr__(self) -> str:
pp_str = printing.pprint_thing(self)
pp_fill = printing.pprint_thing(self.fill_value)
pp_index = printing.pprint_thing(self.sp_index)
return f"{pp_str}\nFill: {pp_fill}\n{pp_index}"
def _formatter(self, boxed=False):
# Defer to the formatter from the GenericArrayFormatter calling us.
# This will infer the correct formatter from the dtype of the values.
return None
def make_sparse(arr: np.ndarray, kind="block", fill_value=None, dtype=None):
"""
Convert ndarray to sparse format
Parameters
----------
arr : ndarray
kind : {'block', 'integer'}
fill_value : NaN or another value
dtype : np.dtype, optional
Returns
-------
(sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
"""
assert isinstance(arr, np.ndarray)
if arr.ndim > 1:
raise TypeError("expected dimension <= 1 data")
if fill_value is None:
fill_value = na_value_for_dtype(arr.dtype)
if isna(fill_value):
mask = notna(arr)
else:
# cast to object comparison to be safe
if is_string_dtype(arr.dtype):
arr = arr.astype(object)
if is_object_dtype(arr.dtype):
        # NumPy's element-wise equality check does not distinguish element
        # types, e.g. 0, 0.0, and False all compare equal, so we must
        # check both the type and the value of each element.
mask = splib.make_mask_object_ndarray(arr, fill_value)
else:
mask = arr != fill_value
length = len(arr)
if length != len(mask):
# the arr is a SparseArray
indices = mask.sp_index.indices
else:
indices = mask.nonzero()[0].astype(np.int32)
index = make_sparse_index(length, indices, kind)
sparsified_values = arr[mask]
if dtype is not None:
sparsified_values = astype_nansafe(sparsified_values, dtype=dtype)
# TODO: copy
return sparsified_values, index, fill_value
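# A minimal round-trip sketch (hypothetical input): the non-fill values, a
# SparseIndex covering their positions, and the fill scalar come back as a
# triple:
#
#   >>> values, index, fill = make_sparse(
#   ...     np.array([0, 1, 0, 2]), kind="integer", fill_value=0
#   ... )
#   >>> values
#   array([1, 2])
#   >>> index
#   IntIndex
#   Indices: array([1, 3], dtype=int32)
#   >>> fill
#   0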
def make_sparse_index(length, indices, kind):
if kind == "block" or isinstance(kind, BlockIndex):
locs, lens = splib.get_blocks(indices)
index = BlockIndex(length, locs, lens)
elif kind == "integer" or isinstance(kind, IntIndex):
index = IntIndex(length, indices)
else: # pragma: no cover
raise ValueError("must be block or integer type")
return index