982 lines
33 KiB
Python
982 lines
33 KiB
Python
"""
|
|
Set operations for arrays based on sorting.
|
|
|
|
Notes
|
|
-----
|
|
|
|
For floating point arrays, inaccurate results may appear due to usual round-off
|
|
and floating point comparison issues.
|
|
|
|
Speed could be gained in some operations by an implementation of
|
|
`numpy.sort`, that can provide directly the permutation vectors, thus avoiding
|
|
calls to `numpy.argsort`.
|
|
|
|
Original author: Robert Cimrman
|
|
|
|
"""
|
|
import functools
|
|
|
|
import numpy as np
|
|
from numpy.core import overrides
|
|
|
|
|
|
array_function_dispatch = functools.partial(
|
|
overrides.array_function_dispatch, module='numpy')
|
|
|
|
|
|
__all__ = [
|
|
'ediff1d', 'intersect1d', 'setxor1d', 'union1d', 'setdiff1d', 'unique',
|
|
'in1d', 'isin'
|
|
]
|
|
|
|
|
|
def _ediff1d_dispatcher(ary, to_end=None, to_begin=None):
|
|
return (ary, to_end, to_begin)
|
|
|
|
|
|
@array_function_dispatch(_ediff1d_dispatcher)
|
|
def ediff1d(ary, to_end=None, to_begin=None):
|
|
"""
|
|
The differences between consecutive elements of an array.
|
|
|
|
Parameters
|
|
----------
|
|
ary : array_like
|
|
If necessary, will be flattened before the differences are taken.
|
|
to_end : array_like, optional
|
|
Number(s) to append at the end of the returned differences.
|
|
to_begin : array_like, optional
|
|
Number(s) to prepend at the beginning of the returned differences.
|
|
|
|
Returns
|
|
-------
|
|
ediff1d : ndarray
|
|
The differences. Loosely, this is ``ary.flat[1:] - ary.flat[:-1]``.
|
|
|
|
See Also
|
|
--------
|
|
diff, gradient
|
|
|
|
Notes
|
|
-----
|
|
When applied to masked arrays, this function drops the mask information
|
|
if the `to_begin` and/or `to_end` parameters are used.
|
|
|
|
Examples
|
|
--------
|
|
>>> x = np.array([1, 2, 4, 7, 0])
|
|
>>> np.ediff1d(x)
|
|
array([ 1, 2, 3, -7])
|
|
|
|
>>> np.ediff1d(x, to_begin=-99, to_end=np.array([88, 99]))
|
|
array([-99, 1, 2, ..., -7, 88, 99])
|
|
|
|
The returned array is always 1D.
|
|
|
|
>>> y = [[1, 2, 4], [1, 6, 24]]
|
|
>>> np.ediff1d(y)
|
|
array([ 1, 2, -3, 5, 18])
|
|
|
|
"""
|
|
# force a 1d array
|
|
ary = np.asanyarray(ary).ravel()
|
|
|
|
# enforce that the dtype of `ary` is used for the output
|
|
dtype_req = ary.dtype
|
|
|
|
# fast track default case
|
|
if to_begin is None and to_end is None:
|
|
return ary[1:] - ary[:-1]
|
|
|
|
if to_begin is None:
|
|
l_begin = 0
|
|
else:
|
|
to_begin = np.asanyarray(to_begin)
|
|
if not np.can_cast(to_begin, dtype_req, casting="same_kind"):
|
|
raise TypeError("dtype of `to_begin` must be compatible "
|
|
"with input `ary` under the `same_kind` rule.")
|
|
|
|
to_begin = to_begin.ravel()
|
|
l_begin = len(to_begin)
|
|
|
|
if to_end is None:
|
|
l_end = 0
|
|
else:
|
|
to_end = np.asanyarray(to_end)
|
|
if not np.can_cast(to_end, dtype_req, casting="same_kind"):
|
|
raise TypeError("dtype of `to_end` must be compatible "
|
|
"with input `ary` under the `same_kind` rule.")
|
|
|
|
to_end = to_end.ravel()
|
|
l_end = len(to_end)
|
|
|
|
# do the calculation in place and copy to_begin and to_end
|
|
l_diff = max(len(ary) - 1, 0)
|
|
result = np.empty(l_diff + l_begin + l_end, dtype=ary.dtype)
|
|
result = ary.__array_wrap__(result)
|
|
if l_begin > 0:
|
|
result[:l_begin] = to_begin
|
|
if l_end > 0:
|
|
result[l_begin + l_diff:] = to_end
|
|
np.subtract(ary[1:], ary[:-1], result[l_begin:l_begin + l_diff])
|
|
return result
|
|
|
|
|
|
def _unpack_tuple(x):
|
|
""" Unpacks one-element tuples for use as return values """
|
|
if len(x) == 1:
|
|
return x[0]
|
|
else:
|
|
return x
|
|
|
|
|
|
def _unique_dispatcher(ar, return_index=None, return_inverse=None,
|
|
return_counts=None, axis=None, *, equal_nan=None):
|
|
return (ar,)
|
|
|
|
|
|
@array_function_dispatch(_unique_dispatcher)
|
|
def unique(ar, return_index=False, return_inverse=False,
|
|
return_counts=False, axis=None, *, equal_nan=True):
|
|
"""
|
|
Find the unique elements of an array.
|
|
|
|
Returns the sorted unique elements of an array. There are three optional
|
|
outputs in addition to the unique elements:
|
|
|
|
* the indices of the input array that give the unique values
|
|
* the indices of the unique array that reconstruct the input array
|
|
* the number of times each unique value comes up in the input array
|
|
|
|
Parameters
|
|
----------
|
|
ar : array_like
|
|
Input array. Unless `axis` is specified, this will be flattened if it
|
|
is not already 1-D.
|
|
return_index : bool, optional
|
|
If True, also return the indices of `ar` (along the specified axis,
|
|
if provided, or in the flattened array) that result in the unique array.
|
|
return_inverse : bool, optional
|
|
If True, also return the indices of the unique array (for the specified
|
|
axis, if provided) that can be used to reconstruct `ar`.
|
|
return_counts : bool, optional
|
|
If True, also return the number of times each unique item appears
|
|
in `ar`.
|
|
axis : int or None, optional
|
|
The axis to operate on. If None, `ar` will be flattened. If an integer,
|
|
the subarrays indexed by the given axis will be flattened and treated
|
|
as the elements of a 1-D array with the dimension of the given axis,
|
|
see the notes for more details. Object arrays or structured arrays
|
|
that contain objects are not supported if the `axis` kwarg is used. The
|
|
default is None.
|
|
|
|
.. versionadded:: 1.13.0
|
|
|
|
equal_nan : bool, optional
|
|
If True, collapses multiple NaN values in the return array into one.
|
|
|
|
.. versionadded:: 1.24
|
|
|
|
Returns
|
|
-------
|
|
unique : ndarray
|
|
The sorted unique values.
|
|
unique_indices : ndarray, optional
|
|
The indices of the first occurrences of the unique values in the
|
|
original array. Only provided if `return_index` is True.
|
|
unique_inverse : ndarray, optional
|
|
The indices to reconstruct the original array from the
|
|
unique array. Only provided if `return_inverse` is True.
|
|
unique_counts : ndarray, optional
|
|
The number of times each of the unique values comes up in the
|
|
original array. Only provided if `return_counts` is True.
|
|
|
|
.. versionadded:: 1.9.0
|
|
|
|
See Also
|
|
--------
|
|
numpy.lib.arraysetops : Module with a number of other functions for
|
|
performing set operations on arrays.
|
|
repeat : Repeat elements of an array.
|
|
|
|
Notes
|
|
-----
|
|
When an axis is specified the subarrays indexed by the axis are sorted.
|
|
This is done by making the specified axis the first dimension of the array
|
|
(move the axis to the first dimension to keep the order of the other axes)
|
|
and then flattening the subarrays in C order. The flattened subarrays are
|
|
then viewed as a structured type with each element given a label, with the
|
|
effect that we end up with a 1-D array of structured types that can be
|
|
treated in the same way as any other 1-D array. The result is that the
|
|
flattened subarrays are sorted in lexicographic order starting with the
|
|
first element.
|
|
|
|
.. versionchanged: NumPy 1.21
|
|
If nan values are in the input array, a single nan is put
|
|
to the end of the sorted unique values.
|
|
|
|
Also for complex arrays all NaN values are considered equivalent
|
|
(no matter whether the NaN is in the real or imaginary part).
|
|
As the representant for the returned array the smallest one in the
|
|
lexicographical order is chosen - see np.sort for how the lexicographical
|
|
order is defined for complex arrays.
|
|
|
|
Examples
|
|
--------
|
|
>>> np.unique([1, 1, 2, 2, 3, 3])
|
|
array([1, 2, 3])
|
|
>>> a = np.array([[1, 1], [2, 3]])
|
|
>>> np.unique(a)
|
|
array([1, 2, 3])
|
|
|
|
Return the unique rows of a 2D array
|
|
|
|
>>> a = np.array([[1, 0, 0], [1, 0, 0], [2, 3, 4]])
|
|
>>> np.unique(a, axis=0)
|
|
array([[1, 0, 0], [2, 3, 4]])
|
|
|
|
Return the indices of the original array that give the unique values:
|
|
|
|
>>> a = np.array(['a', 'b', 'b', 'c', 'a'])
|
|
>>> u, indices = np.unique(a, return_index=True)
|
|
>>> u
|
|
array(['a', 'b', 'c'], dtype='<U1')
|
|
>>> indices
|
|
array([0, 1, 3])
|
|
>>> a[indices]
|
|
array(['a', 'b', 'c'], dtype='<U1')
|
|
|
|
Reconstruct the input array from the unique values and inverse:
|
|
|
|
>>> a = np.array([1, 2, 6, 4, 2, 3, 2])
|
|
>>> u, indices = np.unique(a, return_inverse=True)
|
|
>>> u
|
|
array([1, 2, 3, 4, 6])
|
|
>>> indices
|
|
array([0, 1, 4, 3, 1, 2, 1])
|
|
>>> u[indices]
|
|
array([1, 2, 6, 4, 2, 3, 2])
|
|
|
|
Reconstruct the input values from the unique values and counts:
|
|
|
|
>>> a = np.array([1, 2, 6, 4, 2, 3, 2])
|
|
>>> values, counts = np.unique(a, return_counts=True)
|
|
>>> values
|
|
array([1, 2, 3, 4, 6])
|
|
>>> counts
|
|
array([1, 3, 1, 1, 1])
|
|
>>> np.repeat(values, counts)
|
|
array([1, 2, 2, 2, 3, 4, 6]) # original order not preserved
|
|
|
|
"""
|
|
ar = np.asanyarray(ar)
|
|
if axis is None:
|
|
ret = _unique1d(ar, return_index, return_inverse, return_counts,
|
|
equal_nan=equal_nan)
|
|
return _unpack_tuple(ret)
|
|
|
|
# axis was specified and not None
|
|
try:
|
|
ar = np.moveaxis(ar, axis, 0)
|
|
except np.AxisError:
|
|
# this removes the "axis1" or "axis2" prefix from the error message
|
|
raise np.AxisError(axis, ar.ndim) from None
|
|
|
|
# Must reshape to a contiguous 2D array for this to work...
|
|
orig_shape, orig_dtype = ar.shape, ar.dtype
|
|
ar = ar.reshape(orig_shape[0], np.prod(orig_shape[1:], dtype=np.intp))
|
|
ar = np.ascontiguousarray(ar)
|
|
dtype = [('f{i}'.format(i=i), ar.dtype) for i in range(ar.shape[1])]
|
|
|
|
# At this point, `ar` has shape `(n, m)`, and `dtype` is a structured
|
|
# data type with `m` fields where each field has the data type of `ar`.
|
|
# In the following, we create the array `consolidated`, which has
|
|
# shape `(n,)` with data type `dtype`.
|
|
try:
|
|
if ar.shape[1] > 0:
|
|
consolidated = ar.view(dtype)
|
|
else:
|
|
# If ar.shape[1] == 0, then dtype will be `np.dtype([])`, which is
|
|
# a data type with itemsize 0, and the call `ar.view(dtype)` will
|
|
# fail. Instead, we'll use `np.empty` to explicitly create the
|
|
# array with shape `(len(ar),)`. Because `dtype` in this case has
|
|
# itemsize 0, the total size of the result is still 0 bytes.
|
|
consolidated = np.empty(len(ar), dtype=dtype)
|
|
except TypeError as e:
|
|
# There's no good way to do this for object arrays, etc...
|
|
msg = 'The axis argument to unique is not supported for dtype {dt}'
|
|
raise TypeError(msg.format(dt=ar.dtype)) from e
|
|
|
|
def reshape_uniq(uniq):
|
|
n = len(uniq)
|
|
uniq = uniq.view(orig_dtype)
|
|
uniq = uniq.reshape(n, *orig_shape[1:])
|
|
uniq = np.moveaxis(uniq, 0, axis)
|
|
return uniq
|
|
|
|
output = _unique1d(consolidated, return_index,
|
|
return_inverse, return_counts, equal_nan=equal_nan)
|
|
output = (reshape_uniq(output[0]),) + output[1:]
|
|
return _unpack_tuple(output)
|
|
|
|
|
|
def _unique1d(ar, return_index=False, return_inverse=False,
|
|
return_counts=False, *, equal_nan=True):
|
|
"""
|
|
Find the unique elements of an array, ignoring shape.
|
|
"""
|
|
ar = np.asanyarray(ar).flatten()
|
|
|
|
optional_indices = return_index or return_inverse
|
|
|
|
if optional_indices:
|
|
perm = ar.argsort(kind='mergesort' if return_index else 'quicksort')
|
|
aux = ar[perm]
|
|
else:
|
|
ar.sort()
|
|
aux = ar
|
|
mask = np.empty(aux.shape, dtype=np.bool_)
|
|
mask[:1] = True
|
|
if (equal_nan and aux.shape[0] > 0 and aux.dtype.kind in "cfmM" and
|
|
np.isnan(aux[-1])):
|
|
if aux.dtype.kind == "c": # for complex all NaNs are considered equivalent
|
|
aux_firstnan = np.searchsorted(np.isnan(aux), True, side='left')
|
|
else:
|
|
aux_firstnan = np.searchsorted(aux, aux[-1], side='left')
|
|
if aux_firstnan > 0:
|
|
mask[1:aux_firstnan] = (
|
|
aux[1:aux_firstnan] != aux[:aux_firstnan - 1])
|
|
mask[aux_firstnan] = True
|
|
mask[aux_firstnan + 1:] = False
|
|
else:
|
|
mask[1:] = aux[1:] != aux[:-1]
|
|
|
|
ret = (aux[mask],)
|
|
if return_index:
|
|
ret += (perm[mask],)
|
|
if return_inverse:
|
|
imask = np.cumsum(mask) - 1
|
|
inv_idx = np.empty(mask.shape, dtype=np.intp)
|
|
inv_idx[perm] = imask
|
|
ret += (inv_idx,)
|
|
if return_counts:
|
|
idx = np.concatenate(np.nonzero(mask) + ([mask.size],))
|
|
ret += (np.diff(idx),)
|
|
return ret
|
|
|
|
|
|
def _intersect1d_dispatcher(
|
|
ar1, ar2, assume_unique=None, return_indices=None):
|
|
return (ar1, ar2)
|
|
|
|
|
|
@array_function_dispatch(_intersect1d_dispatcher)
|
|
def intersect1d(ar1, ar2, assume_unique=False, return_indices=False):
|
|
"""
|
|
Find the intersection of two arrays.
|
|
|
|
Return the sorted, unique values that are in both of the input arrays.
|
|
|
|
Parameters
|
|
----------
|
|
ar1, ar2 : array_like
|
|
Input arrays. Will be flattened if not already 1D.
|
|
assume_unique : bool
|
|
If True, the input arrays are both assumed to be unique, which
|
|
can speed up the calculation. If True but ``ar1`` or ``ar2`` are not
|
|
unique, incorrect results and out-of-bounds indices could result.
|
|
Default is False.
|
|
return_indices : bool
|
|
If True, the indices which correspond to the intersection of the two
|
|
arrays are returned. The first instance of a value is used if there are
|
|
multiple. Default is False.
|
|
|
|
.. versionadded:: 1.15.0
|
|
|
|
Returns
|
|
-------
|
|
intersect1d : ndarray
|
|
Sorted 1D array of common and unique elements.
|
|
comm1 : ndarray
|
|
The indices of the first occurrences of the common values in `ar1`.
|
|
Only provided if `return_indices` is True.
|
|
comm2 : ndarray
|
|
The indices of the first occurrences of the common values in `ar2`.
|
|
Only provided if `return_indices` is True.
|
|
|
|
|
|
See Also
|
|
--------
|
|
numpy.lib.arraysetops : Module with a number of other functions for
|
|
performing set operations on arrays.
|
|
|
|
Examples
|
|
--------
|
|
>>> np.intersect1d([1, 3, 4, 3], [3, 1, 2, 1])
|
|
array([1, 3])
|
|
|
|
To intersect more than two arrays, use functools.reduce:
|
|
|
|
>>> from functools import reduce
|
|
>>> reduce(np.intersect1d, ([1, 3, 4, 3], [3, 1, 2, 1], [6, 3, 4, 2]))
|
|
array([3])
|
|
|
|
To return the indices of the values common to the input arrays
|
|
along with the intersected values:
|
|
|
|
>>> x = np.array([1, 1, 2, 3, 4])
|
|
>>> y = np.array([2, 1, 4, 6])
|
|
>>> xy, x_ind, y_ind = np.intersect1d(x, y, return_indices=True)
|
|
>>> x_ind, y_ind
|
|
(array([0, 2, 4]), array([1, 0, 2]))
|
|
>>> xy, x[x_ind], y[y_ind]
|
|
(array([1, 2, 4]), array([1, 2, 4]), array([1, 2, 4]))
|
|
|
|
"""
|
|
ar1 = np.asanyarray(ar1)
|
|
ar2 = np.asanyarray(ar2)
|
|
|
|
if not assume_unique:
|
|
if return_indices:
|
|
ar1, ind1 = unique(ar1, return_index=True)
|
|
ar2, ind2 = unique(ar2, return_index=True)
|
|
else:
|
|
ar1 = unique(ar1)
|
|
ar2 = unique(ar2)
|
|
else:
|
|
ar1 = ar1.ravel()
|
|
ar2 = ar2.ravel()
|
|
|
|
aux = np.concatenate((ar1, ar2))
|
|
if return_indices:
|
|
aux_sort_indices = np.argsort(aux, kind='mergesort')
|
|
aux = aux[aux_sort_indices]
|
|
else:
|
|
aux.sort()
|
|
|
|
mask = aux[1:] == aux[:-1]
|
|
int1d = aux[:-1][mask]
|
|
|
|
if return_indices:
|
|
ar1_indices = aux_sort_indices[:-1][mask]
|
|
ar2_indices = aux_sort_indices[1:][mask] - ar1.size
|
|
if not assume_unique:
|
|
ar1_indices = ind1[ar1_indices]
|
|
ar2_indices = ind2[ar2_indices]
|
|
|
|
return int1d, ar1_indices, ar2_indices
|
|
else:
|
|
return int1d
|
|
|
|
|
|
def _setxor1d_dispatcher(ar1, ar2, assume_unique=None):
|
|
return (ar1, ar2)
|
|
|
|
|
|
@array_function_dispatch(_setxor1d_dispatcher)
|
|
def setxor1d(ar1, ar2, assume_unique=False):
|
|
"""
|
|
Find the set exclusive-or of two arrays.
|
|
|
|
Return the sorted, unique values that are in only one (not both) of the
|
|
input arrays.
|
|
|
|
Parameters
|
|
----------
|
|
ar1, ar2 : array_like
|
|
Input arrays.
|
|
assume_unique : bool
|
|
If True, the input arrays are both assumed to be unique, which
|
|
can speed up the calculation. Default is False.
|
|
|
|
Returns
|
|
-------
|
|
setxor1d : ndarray
|
|
Sorted 1D array of unique values that are in only one of the input
|
|
arrays.
|
|
|
|
Examples
|
|
--------
|
|
>>> a = np.array([1, 2, 3, 2, 4])
|
|
>>> b = np.array([2, 3, 5, 7, 5])
|
|
>>> np.setxor1d(a,b)
|
|
array([1, 4, 5, 7])
|
|
|
|
"""
|
|
if not assume_unique:
|
|
ar1 = unique(ar1)
|
|
ar2 = unique(ar2)
|
|
|
|
aux = np.concatenate((ar1, ar2))
|
|
if aux.size == 0:
|
|
return aux
|
|
|
|
aux.sort()
|
|
flag = np.concatenate(([True], aux[1:] != aux[:-1], [True]))
|
|
return aux[flag[1:] & flag[:-1]]
|
|
|
|
|
|
def _in1d_dispatcher(ar1, ar2, assume_unique=None, invert=None, *,
|
|
kind=None):
|
|
return (ar1, ar2)
|
|
|
|
|
|
@array_function_dispatch(_in1d_dispatcher)
|
|
def in1d(ar1, ar2, assume_unique=False, invert=False, *, kind=None):
|
|
"""
|
|
Test whether each element of a 1-D array is also present in a second array.
|
|
|
|
Returns a boolean array the same length as `ar1` that is True
|
|
where an element of `ar1` is in `ar2` and False otherwise.
|
|
|
|
We recommend using :func:`isin` instead of `in1d` for new code.
|
|
|
|
Parameters
|
|
----------
|
|
ar1 : (M,) array_like
|
|
Input array.
|
|
ar2 : array_like
|
|
The values against which to test each value of `ar1`.
|
|
assume_unique : bool, optional
|
|
If True, the input arrays are both assumed to be unique, which
|
|
can speed up the calculation. Default is False.
|
|
invert : bool, optional
|
|
If True, the values in the returned array are inverted (that is,
|
|
False where an element of `ar1` is in `ar2` and True otherwise).
|
|
Default is False. ``np.in1d(a, b, invert=True)`` is equivalent
|
|
to (but is faster than) ``np.invert(in1d(a, b))``.
|
|
kind : {None, 'sort', 'table'}, optional
|
|
The algorithm to use. This will not affect the final result,
|
|
but will affect the speed and memory use. The default, None,
|
|
will select automatically based on memory considerations.
|
|
|
|
* If 'sort', will use a mergesort-based approach. This will have
|
|
a memory usage of roughly 6 times the sum of the sizes of
|
|
`ar1` and `ar2`, not accounting for size of dtypes.
|
|
* If 'table', will use a lookup table approach similar
|
|
to a counting sort. This is only available for boolean and
|
|
integer arrays. This will have a memory usage of the
|
|
size of `ar1` plus the max-min value of `ar2`. `assume_unique`
|
|
has no effect when the 'table' option is used.
|
|
* If None, will automatically choose 'table' if
|
|
the required memory allocation is less than or equal to
|
|
6 times the sum of the sizes of `ar1` and `ar2`,
|
|
otherwise will use 'sort'. This is done to not use
|
|
a large amount of memory by default, even though
|
|
'table' may be faster in most cases. If 'table' is chosen,
|
|
`assume_unique` will have no effect.
|
|
|
|
.. versionadded:: 1.8.0
|
|
|
|
Returns
|
|
-------
|
|
in1d : (M,) ndarray, bool
|
|
The values `ar1[in1d]` are in `ar2`.
|
|
|
|
See Also
|
|
--------
|
|
isin : Version of this function that preserves the
|
|
shape of ar1.
|
|
numpy.lib.arraysetops : Module with a number of other functions for
|
|
performing set operations on arrays.
|
|
|
|
Notes
|
|
-----
|
|
`in1d` can be considered as an element-wise function version of the
|
|
python keyword `in`, for 1-D sequences. ``in1d(a, b)`` is roughly
|
|
equivalent to ``np.array([item in b for item in a])``.
|
|
However, this idea fails if `ar2` is a set, or similar (non-sequence)
|
|
container: As ``ar2`` is converted to an array, in those cases
|
|
``asarray(ar2)`` is an object array rather than the expected array of
|
|
contained values.
|
|
|
|
Using ``kind='table'`` tends to be faster than `kind='sort'` if the
|
|
following relationship is true:
|
|
``log10(len(ar2)) > (log10(max(ar2)-min(ar2)) - 2.27) / 0.927``,
|
|
but may use greater memory. The default value for `kind` will
|
|
be automatically selected based only on memory usage, so one may
|
|
manually set ``kind='table'`` if memory constraints can be relaxed.
|
|
|
|
.. versionadded:: 1.4.0
|
|
|
|
Examples
|
|
--------
|
|
>>> test = np.array([0, 1, 2, 5, 0])
|
|
>>> states = [0, 2]
|
|
>>> mask = np.in1d(test, states)
|
|
>>> mask
|
|
array([ True, False, True, False, True])
|
|
>>> test[mask]
|
|
array([0, 2, 0])
|
|
>>> mask = np.in1d(test, states, invert=True)
|
|
>>> mask
|
|
array([False, True, False, True, False])
|
|
>>> test[mask]
|
|
array([1, 5])
|
|
"""
|
|
# Ravel both arrays, behavior for the first array could be different
|
|
ar1 = np.asarray(ar1).ravel()
|
|
ar2 = np.asarray(ar2).ravel()
|
|
|
|
# Ensure that iteration through object arrays yields size-1 arrays
|
|
if ar2.dtype == object:
|
|
ar2 = ar2.reshape(-1, 1)
|
|
|
|
if kind not in {None, 'sort', 'table'}:
|
|
raise ValueError(
|
|
f"Invalid kind: '{kind}'. Please use None, 'sort' or 'table'.")
|
|
|
|
# Can use the table method if all arrays are integers or boolean:
|
|
is_int_arrays = all(ar.dtype.kind in ("u", "i", "b") for ar in (ar1, ar2))
|
|
use_table_method = is_int_arrays and kind in {None, 'table'}
|
|
|
|
if use_table_method:
|
|
if ar2.size == 0:
|
|
if invert:
|
|
return np.ones_like(ar1, dtype=bool)
|
|
else:
|
|
return np.zeros_like(ar1, dtype=bool)
|
|
|
|
# Convert booleans to uint8 so we can use the fast integer algorithm
|
|
if ar1.dtype == bool:
|
|
ar1 = ar1.astype(np.uint8)
|
|
if ar2.dtype == bool:
|
|
ar2 = ar2.astype(np.uint8)
|
|
|
|
ar2_min = np.min(ar2)
|
|
ar2_max = np.max(ar2)
|
|
|
|
ar2_range = int(ar2_max) - int(ar2_min)
|
|
|
|
# Constraints on whether we can actually use the table method:
|
|
# 1. Assert memory usage is not too large
|
|
below_memory_constraint = ar2_range <= 6 * (ar1.size + ar2.size)
|
|
# 2. Check overflows for (ar2 - ar2_min); dtype=ar2.dtype
|
|
range_safe_from_overflow = ar2_range <= np.iinfo(ar2.dtype).max
|
|
# 3. Check overflows for (ar1 - ar2_min); dtype=ar1.dtype
|
|
if ar1.size > 0:
|
|
ar1_min = np.min(ar1)
|
|
ar1_max = np.max(ar1)
|
|
|
|
# After masking, the range of ar1 is guaranteed to be
|
|
# within the range of ar2:
|
|
ar1_upper = min(int(ar1_max), int(ar2_max))
|
|
ar1_lower = max(int(ar1_min), int(ar2_min))
|
|
|
|
range_safe_from_overflow &= all((
|
|
ar1_upper - int(ar2_min) <= np.iinfo(ar1.dtype).max,
|
|
ar1_lower - int(ar2_min) >= np.iinfo(ar1.dtype).min
|
|
))
|
|
|
|
# Optimal performance is for approximately
|
|
# log10(size) > (log10(range) - 2.27) / 0.927.
|
|
# However, here we set the requirement that by default
|
|
# the intermediate array can only be 6x
|
|
# the combined memory allocation of the original
|
|
# arrays. See discussion on
|
|
# https://github.com/numpy/numpy/pull/12065.
|
|
|
|
if (
|
|
range_safe_from_overflow and
|
|
(below_memory_constraint or kind == 'table')
|
|
):
|
|
|
|
if invert:
|
|
outgoing_array = np.ones_like(ar1, dtype=bool)
|
|
else:
|
|
outgoing_array = np.zeros_like(ar1, dtype=bool)
|
|
|
|
# Make elements 1 where the integer exists in ar2
|
|
if invert:
|
|
isin_helper_ar = np.ones(ar2_range + 1, dtype=bool)
|
|
isin_helper_ar[ar2 - ar2_min] = 0
|
|
else:
|
|
isin_helper_ar = np.zeros(ar2_range + 1, dtype=bool)
|
|
isin_helper_ar[ar2 - ar2_min] = 1
|
|
|
|
# Mask out elements we know won't work
|
|
basic_mask = (ar1 <= ar2_max) & (ar1 >= ar2_min)
|
|
outgoing_array[basic_mask] = isin_helper_ar[ar1[basic_mask] -
|
|
ar2_min]
|
|
|
|
return outgoing_array
|
|
elif kind == 'table': # not range_safe_from_overflow
|
|
raise RuntimeError(
|
|
"You have specified kind='table', "
|
|
"but the range of values in `ar2` or `ar1` exceed the "
|
|
"maximum integer of the datatype. "
|
|
"Please set `kind` to None or 'sort'."
|
|
)
|
|
elif kind == 'table':
|
|
raise ValueError(
|
|
"The 'table' method is only "
|
|
"supported for boolean or integer arrays. "
|
|
"Please select 'sort' or None for kind."
|
|
)
|
|
|
|
|
|
# Check if one of the arrays may contain arbitrary objects
|
|
contains_object = ar1.dtype.hasobject or ar2.dtype.hasobject
|
|
|
|
# This code is run when
|
|
# a) the first condition is true, making the code significantly faster
|
|
# b) the second condition is true (i.e. `ar1` or `ar2` may contain
|
|
# arbitrary objects), since then sorting is not guaranteed to work
|
|
if len(ar2) < 10 * len(ar1) ** 0.145 or contains_object:
|
|
if invert:
|
|
mask = np.ones(len(ar1), dtype=bool)
|
|
for a in ar2:
|
|
mask &= (ar1 != a)
|
|
else:
|
|
mask = np.zeros(len(ar1), dtype=bool)
|
|
for a in ar2:
|
|
mask |= (ar1 == a)
|
|
return mask
|
|
|
|
# Otherwise use sorting
|
|
if not assume_unique:
|
|
ar1, rev_idx = np.unique(ar1, return_inverse=True)
|
|
ar2 = np.unique(ar2)
|
|
|
|
ar = np.concatenate((ar1, ar2))
|
|
# We need this to be a stable sort, so always use 'mergesort'
|
|
# here. The values from the first array should always come before
|
|
# the values from the second array.
|
|
order = ar.argsort(kind='mergesort')
|
|
sar = ar[order]
|
|
if invert:
|
|
bool_ar = (sar[1:] != sar[:-1])
|
|
else:
|
|
bool_ar = (sar[1:] == sar[:-1])
|
|
flag = np.concatenate((bool_ar, [invert]))
|
|
ret = np.empty(ar.shape, dtype=bool)
|
|
ret[order] = flag
|
|
|
|
if assume_unique:
|
|
return ret[:len(ar1)]
|
|
else:
|
|
return ret[rev_idx]
|
|
|
|
|
|
def _isin_dispatcher(element, test_elements, assume_unique=None, invert=None,
|
|
*, kind=None):
|
|
return (element, test_elements)
|
|
|
|
|
|
@array_function_dispatch(_isin_dispatcher)
|
|
def isin(element, test_elements, assume_unique=False, invert=False, *,
|
|
kind=None):
|
|
"""
|
|
Calculates ``element in test_elements``, broadcasting over `element` only.
|
|
Returns a boolean array of the same shape as `element` that is True
|
|
where an element of `element` is in `test_elements` and False otherwise.
|
|
|
|
Parameters
|
|
----------
|
|
element : array_like
|
|
Input array.
|
|
test_elements : array_like
|
|
The values against which to test each value of `element`.
|
|
This argument is flattened if it is an array or array_like.
|
|
See notes for behavior with non-array-like parameters.
|
|
assume_unique : bool, optional
|
|
If True, the input arrays are both assumed to be unique, which
|
|
can speed up the calculation. Default is False.
|
|
invert : bool, optional
|
|
If True, the values in the returned array are inverted, as if
|
|
calculating `element not in test_elements`. Default is False.
|
|
``np.isin(a, b, invert=True)`` is equivalent to (but faster
|
|
than) ``np.invert(np.isin(a, b))``.
|
|
kind : {None, 'sort', 'table'}, optional
|
|
The algorithm to use. This will not affect the final result,
|
|
but will affect the speed and memory use. The default, None,
|
|
will select automatically based on memory considerations.
|
|
|
|
* If 'sort', will use a mergesort-based approach. This will have
|
|
a memory usage of roughly 6 times the sum of the sizes of
|
|
`ar1` and `ar2`, not accounting for size of dtypes.
|
|
* If 'table', will use a lookup table approach similar
|
|
to a counting sort. This is only available for boolean and
|
|
integer arrays. This will have a memory usage of the
|
|
size of `ar1` plus the max-min value of `ar2`. `assume_unique`
|
|
has no effect when the 'table' option is used.
|
|
* If None, will automatically choose 'table' if
|
|
the required memory allocation is less than or equal to
|
|
6 times the sum of the sizes of `ar1` and `ar2`,
|
|
otherwise will use 'sort'. This is done to not use
|
|
a large amount of memory by default, even though
|
|
'table' may be faster in most cases. If 'table' is chosen,
|
|
`assume_unique` will have no effect.
|
|
|
|
|
|
Returns
|
|
-------
|
|
isin : ndarray, bool
|
|
Has the same shape as `element`. The values `element[isin]`
|
|
are in `test_elements`.
|
|
|
|
See Also
|
|
--------
|
|
in1d : Flattened version of this function.
|
|
numpy.lib.arraysetops : Module with a number of other functions for
|
|
performing set operations on arrays.
|
|
|
|
Notes
|
|
-----
|
|
|
|
`isin` is an element-wise function version of the python keyword `in`.
|
|
``isin(a, b)`` is roughly equivalent to
|
|
``np.array([item in b for item in a])`` if `a` and `b` are 1-D sequences.
|
|
|
|
`element` and `test_elements` are converted to arrays if they are not
|
|
already. If `test_elements` is a set (or other non-sequence collection)
|
|
it will be converted to an object array with one element, rather than an
|
|
array of the values contained in `test_elements`. This is a consequence
|
|
of the `array` constructor's way of handling non-sequence collections.
|
|
Converting the set to a list usually gives the desired behavior.
|
|
|
|
Using ``kind='table'`` tends to be faster than `kind='sort'` if the
|
|
following relationship is true:
|
|
``log10(len(ar2)) > (log10(max(ar2)-min(ar2)) - 2.27) / 0.927``,
|
|
but may use greater memory. The default value for `kind` will
|
|
be automatically selected based only on memory usage, so one may
|
|
manually set ``kind='table'`` if memory constraints can be relaxed.
|
|
|
|
.. versionadded:: 1.13.0
|
|
|
|
Examples
|
|
--------
|
|
>>> element = 2*np.arange(4).reshape((2, 2))
|
|
>>> element
|
|
array([[0, 2],
|
|
[4, 6]])
|
|
>>> test_elements = [1, 2, 4, 8]
|
|
>>> mask = np.isin(element, test_elements)
|
|
>>> mask
|
|
array([[False, True],
|
|
[ True, False]])
|
|
>>> element[mask]
|
|
array([2, 4])
|
|
|
|
The indices of the matched values can be obtained with `nonzero`:
|
|
|
|
>>> np.nonzero(mask)
|
|
(array([0, 1]), array([1, 0]))
|
|
|
|
The test can also be inverted:
|
|
|
|
>>> mask = np.isin(element, test_elements, invert=True)
|
|
>>> mask
|
|
array([[ True, False],
|
|
[False, True]])
|
|
>>> element[mask]
|
|
array([0, 6])
|
|
|
|
Because of how `array` handles sets, the following does not
|
|
work as expected:
|
|
|
|
>>> test_set = {1, 2, 4, 8}
|
|
>>> np.isin(element, test_set)
|
|
array([[False, False],
|
|
[False, False]])
|
|
|
|
Casting the set to a list gives the expected result:
|
|
|
|
>>> np.isin(element, list(test_set))
|
|
array([[False, True],
|
|
[ True, False]])
|
|
"""
|
|
element = np.asarray(element)
|
|
return in1d(element, test_elements, assume_unique=assume_unique,
|
|
invert=invert, kind=kind).reshape(element.shape)
|
|
|
|
|
|
def _union1d_dispatcher(ar1, ar2):
|
|
return (ar1, ar2)
|
|
|
|
|
|
@array_function_dispatch(_union1d_dispatcher)
|
|
def union1d(ar1, ar2):
|
|
"""
|
|
Find the union of two arrays.
|
|
|
|
Return the unique, sorted array of values that are in either of the two
|
|
input arrays.
|
|
|
|
Parameters
|
|
----------
|
|
ar1, ar2 : array_like
|
|
Input arrays. They are flattened if they are not already 1D.
|
|
|
|
Returns
|
|
-------
|
|
union1d : ndarray
|
|
Unique, sorted union of the input arrays.
|
|
|
|
See Also
|
|
--------
|
|
numpy.lib.arraysetops : Module with a number of other functions for
|
|
performing set operations on arrays.
|
|
|
|
Examples
|
|
--------
|
|
>>> np.union1d([-1, 0, 1], [-2, 0, 2])
|
|
array([-2, -1, 0, 1, 2])
|
|
|
|
To find the union of more than two arrays, use functools.reduce:
|
|
|
|
>>> from functools import reduce
|
|
>>> reduce(np.union1d, ([1, 3, 4, 3], [3, 1, 2, 1], [6, 3, 4, 2]))
|
|
array([1, 2, 3, 4, 6])
|
|
"""
|
|
return unique(np.concatenate((ar1, ar2), axis=None))
|
|
|
|
|
|
def _setdiff1d_dispatcher(ar1, ar2, assume_unique=None):
|
|
return (ar1, ar2)
|
|
|
|
|
|
@array_function_dispatch(_setdiff1d_dispatcher)
|
|
def setdiff1d(ar1, ar2, assume_unique=False):
|
|
"""
|
|
Find the set difference of two arrays.
|
|
|
|
Return the unique values in `ar1` that are not in `ar2`.
|
|
|
|
Parameters
|
|
----------
|
|
ar1 : array_like
|
|
Input array.
|
|
ar2 : array_like
|
|
Input comparison array.
|
|
assume_unique : bool
|
|
If True, the input arrays are both assumed to be unique, which
|
|
can speed up the calculation. Default is False.
|
|
|
|
Returns
|
|
-------
|
|
setdiff1d : ndarray
|
|
1D array of values in `ar1` that are not in `ar2`. The result
|
|
is sorted when `assume_unique=False`, but otherwise only sorted
|
|
if the input is sorted.
|
|
|
|
See Also
|
|
--------
|
|
numpy.lib.arraysetops : Module with a number of other functions for
|
|
performing set operations on arrays.
|
|
|
|
Examples
|
|
--------
|
|
>>> a = np.array([1, 2, 3, 2, 4, 1])
|
|
>>> b = np.array([3, 4, 5, 6])
|
|
>>> np.setdiff1d(a, b)
|
|
array([1, 2])
|
|
|
|
"""
|
|
if assume_unique:
|
|
ar1 = np.asarray(ar1).ravel()
|
|
else:
|
|
ar1 = unique(ar1)
|
|
ar2 = unique(ar2)
|
|
return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)]
|