429 lines
13 KiB
Python
429 lines
13 KiB
Python
from typing import TYPE_CHECKING, Type, Union
|
|
|
|
import numpy as np
|
|
|
|
from pandas._libs import lib, missing as libmissing
|
|
from pandas._typing import Scalar
|
|
from pandas.compat.numpy import function as nv
|
|
|
|
from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype
|
|
from pandas.core.dtypes.common import (
|
|
is_array_like,
|
|
is_bool_dtype,
|
|
is_integer_dtype,
|
|
is_object_dtype,
|
|
is_string_dtype,
|
|
pandas_dtype,
|
|
)
|
|
|
|
from pandas.core import ops
|
|
from pandas.core.array_algos import masked_reductions
|
|
from pandas.core.arrays import FloatingArray, IntegerArray, PandasArray
|
|
from pandas.core.arrays.floating import FloatingDtype
|
|
from pandas.core.arrays.integer import _IntegerDtype
|
|
from pandas.core.construction import extract_array
|
|
from pandas.core.indexers import check_array_indexer
|
|
from pandas.core.missing import isna
|
|
|
|
if TYPE_CHECKING:
|
|
import pyarrow
|
|
|
|
|
|
@register_extension_dtype
class StringDtype(ExtensionDtype):
    """
    Extension dtype for string data.

    .. versionadded:: 1.0.0

    .. warning::

       StringDtype is considered experimental. The implementation and
       parts of the API may change without warning.

       In particular, StringDtype.na_value (currently ``pandas.NA``)
       may change.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.StringDtype()
    StringDtype
    """

    name = "string"

    #: StringDtype.na_value uses pandas.NA
    na_value = libmissing.NA

    @property
    def type(self) -> Type[str]:
        """Return the scalar type of the elements, which is ``str``."""
        return str

    @classmethod
    def construct_array_type(cls) -> Type["StringArray"]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return StringArray

    def __repr__(self) -> str:
        return "StringDtype"

    def __from_arrow__(
        self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"]
    ) -> "StringArray":
        """
        Construct StringArray from pyarrow Array/ChunkedArray.

        Each chunk is converted separately and the pieces are concatenated.
        An input without any chunks yields an empty StringArray.
        """
        # Imported lazily: at module level pyarrow is only imported under
        # TYPE_CHECKING.
        import pyarrow

        if isinstance(array, pyarrow.Array):
            chunks = [array]
        else:
            # pyarrow.ChunkedArray
            chunks = array.chunks

        # using _from_sequence to ensure None is converted to NA
        results = [StringArray._from_sequence(np.array(arr)) for arr in chunks]

        if not results:
            # A ChunkedArray may hold zero chunks; there is nothing to
            # concatenate in that case, so build the empty array directly.
            return StringArray(np.array([], dtype=object))

        return StringArray._concat_same_type(results)
|
|
|
|
|
|
class StringArray(PandasArray):
    """
    Extension array for string data.

    .. versionadded:: 1.0.0

    .. warning::

       StringArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : array-like
        The array of data.

        .. warning::

           Currently, this expects an object-dtype ndarray
           where the elements are Python strings or :attr:`pandas.NA`.
           This may change without warning in the future. Use
           :meth:`pandas.array` with ``dtype="string"`` for a stable way of
           creating a `StringArray` from any sequence.

    copy : bool, default False
        Whether to copy the array of data.

    Attributes
    ----------
    None

    Methods
    -------
    None

    See Also
    --------
    array
        The recommended function for creating a StringArray.
    Series.str
        The string methods are available on Series backed by
        a StringArray.

    Notes
    -----
    StringArray returns a BooleanArray for comparison methods.

    Examples
    --------
    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
    <StringArray>
    ['This is', 'some text', <NA>, 'data.']
    Length: 4, dtype: string

    Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
    will convert the values to strings.

    >>> pd.array(['1', 1], dtype="object")
    <PandasArray>
    ['1', 1]
    Length: 2, dtype: object
    >>> pd.array(['1', 1], dtype="string")
    <StringArray>
    ['1', '1']
    Length: 2, dtype: string

    However, instantiating StringArrays directly with non-strings will raise an error.

    For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:

    >>> pd.array(["a", None, "c"], dtype="string") == "a"
    <BooleanArray>
    [True, <NA>, False]
    Length: 3, dtype: boolean
    """

    # undo the PandasArray hack
    _typ = "extension"

    def __init__(self, values, copy=False):
        values = extract_array(values)

        super().__init__(values, copy=copy)
        # pandas\core\arrays\string_.py:188: error: Incompatible types in
        # assignment (expression has type "StringDtype", variable has type
        # "PandasDtype")  [assignment]
        self._dtype = StringDtype()  # type: ignore[assignment]
        # Values that already are an instance of this class are not
        # validated a second time.
        if not isinstance(values, type(self)):
            self._validate()

    def _validate(self):
        """
        Validate that we only store NA or strings.

        Raises
        ------
        ValueError
            If the non-empty backing ndarray holds something other than
            strings/missing values, or if its dtype is not ``object``.
        """
        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
        if self._ndarray.dtype != "object":
            raise ValueError(
                "StringArray requires a sequence of strings or pandas.NA. Got "
                f"'{self._ndarray.dtype}' dtype instead."
            )

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        """
        Build a StringArray from a sequence of scalars.

        Non-missing values are converted to ``str`` and missing values to
        ``StringDtype.na_value``.

        Parameters
        ----------
        scalars : sequence
            The values to convert.
        dtype : optional
            When given (truthy), asserted to be equal to ``"string"``.
        copy : bool, default False
            Forwarded to ``lib.ensure_string_array``.

        Returns
        -------
        StringArray
        """
        if dtype:
            assert dtype == "string"

        from pandas.core.arrays.masked import BaseMaskedArray

        if isinstance(scalars, BaseMaskedArray):
            # avoid costly conversion to object dtype
            na_values = scalars._mask
            result = scalars._data
            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
            result[na_values] = StringDtype.na_value

        else:
            # convert non-na-likes to str, and nan-likes to StringDtype.na_value
            result = lib.ensure_string_array(
                scalars, na_value=StringDtype.na_value, copy=copy
            )

        # Manually creating new array avoids the validation step in the __init__, so is
        # faster. Refactor need for validation?
        new_string_array = object.__new__(cls)
        new_string_array._dtype = StringDtype()
        new_string_array._ndarray = result

        return new_string_array

    @classmethod
    def _from_sequence_of_strings(cls, strings, *, dtype=None, copy=False):
        """Build a StringArray from strings; delegates to ``_from_sequence``."""
        return cls._from_sequence(strings, dtype=dtype, copy=copy)

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.

        Parameters
        ----------
        type : pyarrow type, optional
            Target type; ``pyarrow.string()`` is used when omitted.
        """
        import pyarrow as pa

        if type is None:
            type = pa.string()

        # Work on a copy so that replacing missing entries with None does
        # not modify this array's own data.
        values = self._ndarray.copy()
        values[self.isna()] = None
        return pa.array(values, type=type, from_pandas=True)

    def _values_for_factorize(self):
        """
        Return ``(values, na_sentinel_value)`` for factorization.

        Missing entries are replaced by ``-1`` in a copy of the data, and
        ``-1`` is returned as the value that stands for "missing".
        """
        arr = self._ndarray.copy()
        mask = self.isna()
        arr[mask] = -1
        return arr, -1

    def __setitem__(self, key, value):
        """
        Set one or more elements, accepting only strings and missing values.

        Missing values (scalar or inside an array-like) are stored as
        ``StringDtype.na_value``.

        Raises
        ------
        ValueError
            If a sequence is assigned to a scalar key, if a scalar value is
            neither missing nor a ``str``, or if an array-like value
            contains non-string elements.
        """
        value = extract_array(value, extract_numpy=True)
        if isinstance(value, type(self)):
            # extract_array doesn't extract PandasArray subclasses
            value = value._ndarray

        key = check_array_indexer(self, key)
        scalar_key = lib.is_scalar(key)
        scalar_value = lib.is_scalar(value)
        if scalar_key and not scalar_value:
            raise ValueError("setting an array element with a sequence.")

        # validate new items
        if scalar_value:
            if isna(value):
                value = StringDtype.na_value
            elif not isinstance(value, str):
                raise ValueError(
                    f"Cannot set non-string value '{value}' into a StringArray."
                )
        else:
            if not is_array_like(value):
                value = np.asarray(value, dtype=object)
            if len(value) and not lib.is_string_array(value, skipna=True):
                raise ValueError("Must provide strings.")

            # The check above skips missing values, so None/NaN can still be
            # present here. Normalize them to the dtype's NA, mirroring the
            # scalar branch. Copy first: ``value`` may be the caller's array
            # or another StringArray's backing data.
            na_mask = isna(value)
            if na_mask.any():
                value = value.copy()
                value[na_mask] = StringDtype.na_value

        super().__setitem__(key, value)

    def astype(self, dtype, copy=True):
        """
        Cast to an array of the given dtype.

        Parameters
        ----------
        dtype : str or dtype
            Target dtype; resolved with ``pandas_dtype``.
        copy : bool, default True
            For a StringDtype target, whether to return a copy instead of
            ``self``. Forwarded to the target constructor / parent
            implementation for the remaining generic paths.

        Returns
        -------
        array
            ``StringArray`` for StringDtype, ``IntegerArray`` /
            ``FloatingArray`` for the nullable numeric dtypes, the target
            dtype's array type for other extension dtypes, an ndarray with
            NaN at missing positions for NumPy floating dtypes, otherwise
            whatever the parent implementation returns.
        """
        dtype = pandas_dtype(dtype)
        if isinstance(dtype, StringDtype):
            if copy:
                return self.copy()
            return self
        elif isinstance(dtype, _IntegerDtype):
            # Put a castable placeholder at missing positions; the mask
            # passed to IntegerArray marks them as missing again.
            arr = self._ndarray.copy()
            mask = self.isna()
            arr[mask] = 0
            values = arr.astype(dtype.numpy_dtype)
            return IntegerArray(values, mask, copy=False)
        elif isinstance(dtype, FloatingDtype):
            # Same approach, with a string placeholder because ``arr`` is
            # still a StringArray here.
            arr = self.copy()
            mask = self.isna()
            arr[mask] = "0"
            values = arr.astype(dtype.numpy_dtype)
            return FloatingArray(values, mask, copy=False)
        elif isinstance(dtype, ExtensionDtype):
            # Other extension dtypes are not NumPy dtypes and must not reach
            # the np.issubdtype check below; let the target array type do
            # the conversion.
            cls = dtype.construct_array_type()
            return cls._from_sequence(self, dtype=dtype, copy=copy)
        elif np.issubdtype(dtype, np.floating):
            arr = self._ndarray.copy()
            mask = self.isna()
            arr[mask] = 0
            values = arr.astype(dtype)
            values[mask] = np.nan
            return values

        return super().astype(dtype, copy)

    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        """
        Dispatch a named reduction; only ``"min"`` and ``"max"`` are supported.

        Raises
        ------
        TypeError
            For any other reduction name.
        """
        if name in ["min", "max"]:
            return getattr(self, name)(skipna=skipna)

        raise TypeError(f"Cannot perform reduction '{name}' with string dtype")

    def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        """Return the minimum, computed with ``masked_reductions.min``."""
        nv.validate_min((), kwargs)
        result = masked_reductions.min(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        """Return the maximum, computed with ``masked_reductions.max``."""
        nv.validate_max((), kwargs)
        result = masked_reductions.max(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def value_counts(self, dropna=False):
        """
        Count occurrences of each value; the counts are cast to ``Int64``.

        Parameters
        ----------
        dropna : bool, default False
            Forwarded to ``pandas.value_counts``.
        """
        from pandas import value_counts

        return value_counts(self._ndarray, dropna=dropna).astype("Int64")

    def memory_usage(self, deep: bool = False) -> int:
        """
        Return the memory usage in bytes.

        Parameters
        ----------
        deep : bool, default False
            If True, add ``lib.memory_usage_of_objects`` of the stored
            objects to the size of the backing ndarray.
        """
        result = self._ndarray.nbytes
        if deep:
            return result + lib.memory_usage_of_objects(self._ndarray)
        return result

    def _cmp_method(self, other, op):
        """
        Apply a binary operator element-wise, propagating missing values.

        Also used for arithmetic operators (``_arith_method`` is an alias).
        The operator is only evaluated where neither operand is missing.

        Returns
        -------
        StringArray
            When ``op.__name__`` is in ``ops.ARITHMETIC_BINOPS``; positions
            where either operand is missing hold ``StringDtype.na_value``.
        BooleanArray
            Otherwise; positions where either operand is missing are masked.

        Raises
        ------
        ValueError
            If ``other`` is not a scalar and its length differs from
            ``len(self)``.
        """
        from pandas.arrays import BooleanArray

        if isinstance(other, StringArray):
            other = other._ndarray

        mask = isna(self) | isna(other)
        valid = ~mask

        if not lib.is_scalar(other):
            if len(other) != len(self):
                # prevent improper broadcasting when other is 2D
                raise ValueError(
                    f"Lengths of operands do not match: {len(self)} != {len(other)}"
                )

            other = np.asarray(other)
            other = other[valid]

        if op.__name__ in ops.ARITHMETIC_BINOPS:
            result = np.empty_like(self._ndarray, dtype="object")
            result[mask] = StringDtype.na_value
            result[valid] = op(self._ndarray[valid], other)
            return StringArray(result)
        else:
            # comparison
            result = np.zeros(len(self._ndarray), dtype="bool")
            result[valid] = op(self._ndarray[valid], other)
            return BooleanArray(result, mask)

    _arith_method = _cmp_method

    # ------------------------------------------------------------------------
    # String methods interface
    _str_na_value = StringDtype.na_value

    def _str_map(self, f, na_value=None, dtype=None):
        """
        Map ``f`` over the non-missing elements and wrap the result.

        Parameters
        ----------
        f : callable
            Function applied to each non-missing element.
        na_value : scalar, optional
            Value used for missing positions; defaults to this array's
            ``dtype.na_value``.
        dtype : dtype, optional
            Result dtype; defaults to ``StringDtype()``.

        Returns
        -------
        IntegerArray, BooleanArray, StringArray or ndarray
            Masked integer/boolean array for integer/bool dtypes, a
            StringArray for StringDtype, and the raw result of
            ``lib.map_infer_mask`` otherwise.
        """
        # IntegerArray, StringArray and StringDtype are already available in
        # module scope; only BooleanArray needs a local import.
        from pandas.arrays import BooleanArray

        if dtype is None:
            dtype = StringDtype()
        if na_value is None:
            na_value = self.dtype.na_value

        mask = isna(self)
        arr = np.asarray(self)

        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
            constructor: Union[Type[IntegerArray], Type[BooleanArray]]
            if is_integer_dtype(dtype):
                constructor = IntegerArray
            else:
                constructor = BooleanArray

            na_value_is_na = isna(na_value)
            if na_value_is_na:
                # Placeholder written at missing positions; they stay
                # masked in the returned array.
                na_value = 1
            result = lib.map_infer_mask(
                arr,
                f,
                mask.view("uint8"),
                convert=False,
                na_value=na_value,
                dtype=np.dtype(dtype),
            )

            if not na_value_is_na:
                # A concrete fill value was requested, so no position of the
                # result is missing any more.
                mask[:] = False

            return constructor(result, mask)

        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
            # i.e. StringDtype
            result = lib.map_infer_mask(
                arr, f, mask.view("uint8"), convert=False, na_value=na_value
            )
            return StringArray(result)
        else:
            # This is when the result type is object. We reach this when
            # -> We know the result type is truly object (e.g. .encode returns bytes
            #    or .findall returns a list).
            # -> We don't know the result type. E.g. `.get` can return anything.
            return lib.map_infer_mask(arr, f, mask.view("uint8"))
|