367 lines
11 KiB
Python
367 lines
11 KiB
Python
|
from contextlib import suppress
|
||
|
from collections import Counter
|
||
|
from typing import NamedTuple
|
||
|
|
||
|
import numpy as np
|
||
|
from . import is_scalar_nan
|
||
|
|
||
|
|
||
|
def _unique(values, *, return_inverse=False, return_counts=False):
|
||
|
"""Helper function to find unique values with support for python objects.
|
||
|
|
||
|
Uses pure python method for object dtype, and numpy method for
|
||
|
all other dtypes.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
values : ndarray
|
||
|
Values to check for unknowns.
|
||
|
|
||
|
return_inverse : bool, default=False
|
||
|
If True, also return the indices of the unique values.
|
||
|
|
||
|
return_counts : bool, default=False
|
||
|
If True, also return the number of times each unique item appears in
|
||
|
values.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
unique : ndarray
|
||
|
The sorted unique values.
|
||
|
|
||
|
unique_inverse : ndarray
|
||
|
The indices to reconstruct the original array from the unique array.
|
||
|
Only provided if `return_inverse` is True.
|
||
|
|
||
|
unique_counts : ndarray
|
||
|
The number of times each of the unique values comes up in the original
|
||
|
array. Only provided if `return_counts` is True.
|
||
|
"""
|
||
|
if values.dtype == object:
|
||
|
return _unique_python(
|
||
|
values, return_inverse=return_inverse, return_counts=return_counts
|
||
|
)
|
||
|
# numerical
|
||
|
return _unique_np(
|
||
|
values, return_inverse=return_inverse, return_counts=return_counts
|
||
|
)
|
||
|
|
||
|
|
||
|
def _unique_np(values, return_inverse=False, return_counts=False):
|
||
|
"""Helper function to find unique values for numpy arrays that correctly
|
||
|
accounts for nans. See `_unique` documentation for details."""
|
||
|
uniques = np.unique(
|
||
|
values, return_inverse=return_inverse, return_counts=return_counts
|
||
|
)
|
||
|
|
||
|
inverse, counts = None, None
|
||
|
|
||
|
if return_counts:
|
||
|
*uniques, counts = uniques
|
||
|
|
||
|
if return_inverse:
|
||
|
*uniques, inverse = uniques
|
||
|
|
||
|
if return_counts or return_inverse:
|
||
|
uniques = uniques[0]
|
||
|
|
||
|
# np.unique will have duplicate missing values at the end of `uniques`
|
||
|
# here we clip the nans and remove it from uniques
|
||
|
if uniques.size and is_scalar_nan(uniques[-1]):
|
||
|
nan_idx = np.searchsorted(uniques, np.nan)
|
||
|
uniques = uniques[: nan_idx + 1]
|
||
|
if return_inverse:
|
||
|
inverse[inverse > nan_idx] = nan_idx
|
||
|
|
||
|
if return_counts:
|
||
|
counts[nan_idx] = np.sum(counts[nan_idx:])
|
||
|
counts = counts[: nan_idx + 1]
|
||
|
|
||
|
ret = (uniques,)
|
||
|
|
||
|
if return_inverse:
|
||
|
ret += (inverse,)
|
||
|
|
||
|
if return_counts:
|
||
|
ret += (counts,)
|
||
|
|
||
|
return ret[0] if len(ret) == 1 else ret
|
||
|
|
||
|
|
||
|
class MissingValues(NamedTuple):
|
||
|
"""Data class for missing data information"""
|
||
|
|
||
|
nan: bool
|
||
|
none: bool
|
||
|
|
||
|
def to_list(self):
|
||
|
"""Convert tuple to a list where None is always first."""
|
||
|
output = []
|
||
|
if self.none:
|
||
|
output.append(None)
|
||
|
if self.nan:
|
||
|
output.append(np.nan)
|
||
|
return output
|
||
|
|
||
|
|
||
|
def _extract_missing(values):
|
||
|
"""Extract missing values from `values`.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
values: set
|
||
|
Set of values to extract missing from.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
output: set
|
||
|
Set with missing values extracted.
|
||
|
|
||
|
missing_values: MissingValues
|
||
|
Object with missing value information.
|
||
|
"""
|
||
|
missing_values_set = {
|
||
|
value for value in values if value is None or is_scalar_nan(value)
|
||
|
}
|
||
|
|
||
|
if not missing_values_set:
|
||
|
return values, MissingValues(nan=False, none=False)
|
||
|
|
||
|
if None in missing_values_set:
|
||
|
if len(missing_values_set) == 1:
|
||
|
output_missing_values = MissingValues(nan=False, none=True)
|
||
|
else:
|
||
|
# If there is more than one missing value, then it has to be
|
||
|
# float('nan') or np.nan
|
||
|
output_missing_values = MissingValues(nan=True, none=True)
|
||
|
else:
|
||
|
output_missing_values = MissingValues(nan=True, none=False)
|
||
|
|
||
|
# create set without the missing values
|
||
|
output = values - missing_values_set
|
||
|
return output, output_missing_values
|
||
|
|
||
|
|
||
|
class _nandict(dict):
|
||
|
"""Dictionary with support for nans."""
|
||
|
|
||
|
def __init__(self, mapping):
|
||
|
super().__init__(mapping)
|
||
|
for key, value in mapping.items():
|
||
|
if is_scalar_nan(key):
|
||
|
self.nan_value = value
|
||
|
break
|
||
|
|
||
|
def __missing__(self, key):
|
||
|
if hasattr(self, "nan_value") and is_scalar_nan(key):
|
||
|
return self.nan_value
|
||
|
raise KeyError(key)
|
||
|
|
||
|
|
||
|
def _map_to_integer(values, uniques):
|
||
|
"""Map values based on its position in uniques."""
|
||
|
table = _nandict({val: i for i, val in enumerate(uniques)})
|
||
|
return np.array([table[v] for v in values])
|
||
|
|
||
|
|
||
|
def _unique_python(values, *, return_inverse, return_counts):
|
||
|
# Only used in `_uniques`, see docstring there for details
|
||
|
try:
|
||
|
uniques_set = set(values)
|
||
|
uniques_set, missing_values = _extract_missing(uniques_set)
|
||
|
|
||
|
uniques = sorted(uniques_set)
|
||
|
uniques.extend(missing_values.to_list())
|
||
|
uniques = np.array(uniques, dtype=values.dtype)
|
||
|
except TypeError:
|
||
|
types = sorted(t.__qualname__ for t in set(type(v) for v in values))
|
||
|
raise TypeError(
|
||
|
"Encoders require their input to be uniformly "
|
||
|
f"strings or numbers. Got {types}"
|
||
|
)
|
||
|
ret = (uniques,)
|
||
|
|
||
|
if return_inverse:
|
||
|
ret += (_map_to_integer(values, uniques),)
|
||
|
|
||
|
if return_counts:
|
||
|
ret += (_get_counts(values, uniques),)
|
||
|
|
||
|
return ret[0] if len(ret) == 1 else ret
|
||
|
|
||
|
|
||
|
def _encode(values, *, uniques, check_unknown=True):
|
||
|
"""Helper function to encode values into [0, n_uniques - 1].
|
||
|
|
||
|
Uses pure python method for object dtype, and numpy method for
|
||
|
all other dtypes.
|
||
|
The numpy method has the limitation that the `uniques` need to
|
||
|
be sorted. Importantly, this is not checked but assumed to already be
|
||
|
the case. The calling method needs to ensure this for all non-object
|
||
|
values.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
values : ndarray
|
||
|
Values to encode.
|
||
|
uniques : ndarray
|
||
|
The unique values in `values`. If the dtype is not object, then
|
||
|
`uniques` needs to be sorted.
|
||
|
check_unknown : bool, default=True
|
||
|
If True, check for values in `values` that are not in `unique`
|
||
|
and raise an error. This is ignored for object dtype, and treated as
|
||
|
True in this case. This parameter is useful for
|
||
|
_BaseEncoder._transform() to avoid calling _check_unknown()
|
||
|
twice.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
encoded : ndarray
|
||
|
Encoded values
|
||
|
"""
|
||
|
if values.dtype.kind in "OUS":
|
||
|
try:
|
||
|
return _map_to_integer(values, uniques)
|
||
|
except KeyError as e:
|
||
|
raise ValueError(f"y contains previously unseen labels: {str(e)}")
|
||
|
else:
|
||
|
if check_unknown:
|
||
|
diff = _check_unknown(values, uniques)
|
||
|
if diff:
|
||
|
raise ValueError(f"y contains previously unseen labels: {str(diff)}")
|
||
|
return np.searchsorted(uniques, values)
|
||
|
|
||
|
|
||
|
def _check_unknown(values, known_values, return_mask=False):
|
||
|
"""
|
||
|
Helper function to check for unknowns in values to be encoded.
|
||
|
|
||
|
Uses pure python method for object dtype, and numpy method for
|
||
|
all other dtypes.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
values : array
|
||
|
Values to check for unknowns.
|
||
|
known_values : array
|
||
|
Known values. Must be unique.
|
||
|
return_mask : bool, default=False
|
||
|
If True, return a mask of the same shape as `values` indicating
|
||
|
the valid values.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
diff : list
|
||
|
The unique values present in `values` and not in `know_values`.
|
||
|
valid_mask : boolean array
|
||
|
Additionally returned if ``return_mask=True``.
|
||
|
|
||
|
"""
|
||
|
valid_mask = None
|
||
|
|
||
|
if values.dtype.kind in "OUS":
|
||
|
values_set = set(values)
|
||
|
values_set, missing_in_values = _extract_missing(values_set)
|
||
|
|
||
|
uniques_set = set(known_values)
|
||
|
uniques_set, missing_in_uniques = _extract_missing(uniques_set)
|
||
|
diff = values_set - uniques_set
|
||
|
|
||
|
nan_in_diff = missing_in_values.nan and not missing_in_uniques.nan
|
||
|
none_in_diff = missing_in_values.none and not missing_in_uniques.none
|
||
|
|
||
|
def is_valid(value):
|
||
|
return (
|
||
|
value in uniques_set
|
||
|
or missing_in_uniques.none
|
||
|
and value is None
|
||
|
or missing_in_uniques.nan
|
||
|
and is_scalar_nan(value)
|
||
|
)
|
||
|
|
||
|
if return_mask:
|
||
|
if diff or nan_in_diff or none_in_diff:
|
||
|
valid_mask = np.array([is_valid(value) for value in values])
|
||
|
else:
|
||
|
valid_mask = np.ones(len(values), dtype=bool)
|
||
|
|
||
|
diff = list(diff)
|
||
|
if none_in_diff:
|
||
|
diff.append(None)
|
||
|
if nan_in_diff:
|
||
|
diff.append(np.nan)
|
||
|
else:
|
||
|
unique_values = np.unique(values)
|
||
|
diff = np.setdiff1d(unique_values, known_values, assume_unique=True)
|
||
|
if return_mask:
|
||
|
if diff.size:
|
||
|
valid_mask = np.in1d(values, known_values)
|
||
|
else:
|
||
|
valid_mask = np.ones(len(values), dtype=bool)
|
||
|
|
||
|
# check for nans in the known_values
|
||
|
if np.isnan(known_values).any():
|
||
|
diff_is_nan = np.isnan(diff)
|
||
|
if diff_is_nan.any():
|
||
|
# removes nan from valid_mask
|
||
|
if diff.size and return_mask:
|
||
|
is_nan = np.isnan(values)
|
||
|
valid_mask[is_nan] = 1
|
||
|
|
||
|
# remove nan from diff
|
||
|
diff = diff[~diff_is_nan]
|
||
|
diff = list(diff)
|
||
|
|
||
|
if return_mask:
|
||
|
return diff, valid_mask
|
||
|
return diff
|
||
|
|
||
|
|
||
|
class _NaNCounter(Counter):
|
||
|
"""Counter with support for nan values."""
|
||
|
|
||
|
def __init__(self, items):
|
||
|
super().__init__(self._generate_items(items))
|
||
|
|
||
|
def _generate_items(self, items):
|
||
|
"""Generate items without nans. Stores the nan counts separately."""
|
||
|
for item in items:
|
||
|
if not is_scalar_nan(item):
|
||
|
yield item
|
||
|
continue
|
||
|
if not hasattr(self, "nan_count"):
|
||
|
self.nan_count = 0
|
||
|
self.nan_count += 1
|
||
|
|
||
|
def __missing__(self, key):
|
||
|
if hasattr(self, "nan_count") and is_scalar_nan(key):
|
||
|
return self.nan_count
|
||
|
raise KeyError(key)
|
||
|
|
||
|
|
||
|
def _get_counts(values, uniques):
|
||
|
"""Get the count of each of the `uniques` in `values`.
|
||
|
|
||
|
The counts will use the order passed in by `uniques`. For non-object dtypes,
|
||
|
`uniques` is assumed to be sorted and `np.nan` is at the end.
|
||
|
"""
|
||
|
if values.dtype.kind in "OU":
|
||
|
counter = _NaNCounter(values)
|
||
|
output = np.zeros(len(uniques), dtype=np.int64)
|
||
|
for i, item in enumerate(uniques):
|
||
|
with suppress(KeyError):
|
||
|
output[i] = counter[item]
|
||
|
return output
|
||
|
|
||
|
unique_values, counts = _unique_np(values, return_counts=True)
|
||
|
|
||
|
# Recorder unique_values based on input: `uniques`
|
||
|
uniques_in_values = np.isin(uniques, unique_values, assume_unique=True)
|
||
|
if np.isnan(unique_values[-1]) and np.isnan(uniques[-1]):
|
||
|
uniques_in_values[-1] = True
|
||
|
|
||
|
unique_valid_indices = np.searchsorted(unique_values, uniques[uniques_in_values])
|
||
|
output = np.zeros_like(uniques, dtype=np.int64)
|
||
|
output[uniques_in_values] = counts[unique_valid_indices]
|
||
|
return output
|