1592 lines
55 KiB
Python
1592 lines
55 KiB
Python
"""
|
|
Collection of utilities to manipulate structured arrays.
|
|
|
|
Most of these functions were initially implemented by John Hunter for
|
|
matplotlib. They have been rewritten and extended for convenience.
|
|
|
|
"""
|
|
import itertools
|
|
import numpy as np
|
|
import numpy.ma as ma
|
|
from numpy import ndarray, recarray
|
|
from numpy.ma import MaskedArray
|
|
from numpy.ma.mrecords import MaskedRecords
|
|
from numpy.core.overrides import array_function_dispatch
|
|
from numpy.lib._iotools import _is_string_like
|
|
from numpy.testing import suppress_warnings
|
|
|
|
_check_fill_value = np.ma.core._check_fill_value
|
|
|
|
|
|
__all__ = [
|
|
'append_fields', 'apply_along_fields', 'assign_fields_by_name',
|
|
'drop_fields', 'find_duplicates', 'flatten_descr',
|
|
'get_fieldstructure', 'get_names', 'get_names_flat',
|
|
'join_by', 'merge_arrays', 'rec_append_fields',
|
|
'rec_drop_fields', 'rec_join', 'recursive_fill_fields',
|
|
'rename_fields', 'repack_fields', 'require_fields',
|
|
'stack_arrays', 'structured_to_unstructured', 'unstructured_to_structured',
|
|
]
|
|
|
|
|
|
def _recursive_fill_fields_dispatcher(input, output):
|
|
return (input, output)
|
|
|
|
|
|
@array_function_dispatch(_recursive_fill_fields_dispatcher)
def recursive_fill_fields(input, output):
    """
    Copy the fields of `input` into the same-named fields of `output`.

    Nested structured fields are handled by recursing into them.  Fields of
    `output` that `input` does not have are left untouched.

    Parameters
    ----------
    input : ndarray
        Input array.
    output : ndarray
        Output array.  It is modified in place and also returned.

    Returns
    -------
    output : ndarray
        The `output` array, after filling.

    Notes
    -----
    * `output` should be at least the same size as `input`

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.array([(1, 10.), (2, 20.)], dtype=[('A', np.int64), ('B', np.float64)])
    >>> b = np.zeros((3,), dtype=a.dtype)
    >>> rfn.recursive_fill_fields(a, b)
    array([(1, 10.), (2, 20.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

    """
    for name in output.dtype.names:
        try:
            source = input[name]
        except ValueError:
            # `input` has no field called `name`: nothing to copy.
            continue
        if source.dtype.names is None:
            # Leaf field: fill the leading entries of the output column.
            output[name][:len(source)] = source
        else:
            # Structured field: descend one level.
            recursive_fill_fields(source, output[name])
    return output
|
|
|
|
|
|
def _get_fieldspec(dtype):
|
|
"""
|
|
Produce a list of name/dtype pairs corresponding to the dtype fields
|
|
|
|
Similar to dtype.descr, but the second item of each tuple is a dtype, not a
|
|
string. As a result, this handles subarray dtypes
|
|
|
|
Can be passed to the dtype constructor to reconstruct the dtype, noting that
|
|
this (deliberately) discards field offsets.
|
|
|
|
Examples
|
|
--------
|
|
>>> dt = np.dtype([(('a', 'A'), np.int64), ('b', np.double, 3)])
|
|
>>> dt.descr
|
|
[(('a', 'A'), '<i8'), ('b', '<f8', (3,))]
|
|
>>> _get_fieldspec(dt)
|
|
[(('a', 'A'), dtype('int64')), ('b', dtype(('<f8', (3,))))]
|
|
|
|
"""
|
|
if dtype.names is None:
|
|
# .descr returns a nameless field, so we should too
|
|
return [('', dtype)]
|
|
else:
|
|
fields = ((name, dtype.fields[name]) for name in dtype.names)
|
|
# keep any titles, if present
|
|
return [
|
|
(name if len(f) == 2 else (f[2], name), f[0])
|
|
for name, f in fields
|
|
]
|
|
|
|
|
|
def get_names(adtype):
    """
    Return the field names of a structured datatype as a tuple.

    A nested structured field is reported as a ``(name, subnames)`` pair,
    where ``subnames`` is itself the result of `get_names` on the nested
    datatype.  The input datatype must have fields, otherwise an error is
    raised.

    Parameters
    ----------
    adtype : dtype
        Input datatype

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> rfn.get_names(np.empty((1,), dtype=[('A', int)]).dtype)
    ('A',)
    >>> rfn.get_names(np.empty((1,), dtype=[('A',int), ('B', float)]).dtype)
    ('A', 'B')
    >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])])
    >>> rfn.get_names(adtype)
    ('a', ('b', ('ba', 'bb')))
    """
    collected = []
    for name in adtype.names:
        sub = adtype[name]
        if sub.names is None:
            collected.append(name)
        else:
            collected.append((name, tuple(get_names(sub))))
    return tuple(collected)
|
|
|
|
|
|
def get_names_flat(adtype):
    """
    Return the field names of a structured datatype as a flat tuple.

    Nested structures are flattened: the name of a structured field is
    followed directly by the names of the fields it contains.  The input
    datatype must have fields, otherwise an error is raised.

    Parameters
    ----------
    adtype : dtype
        Input datatype

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> rfn.get_names_flat(np.empty((1,), dtype=[('A', int)]).dtype) is None
    False
    >>> rfn.get_names_flat(np.empty((1,), dtype=[('A',int), ('B', str)]).dtype)
    ('A', 'B')
    >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])])
    >>> rfn.get_names_flat(adtype)
    ('a', 'b', 'ba', 'bb')
    """
    flat = []
    for name in adtype.names:
        flat.append(name)
        sub = adtype[name]
        if sub.names is not None:
            # Names of the nested fields follow their parent.
            flat += get_names_flat(sub)
    return tuple(flat)
|
|
|
|
|
|
def flatten_descr(ndtype):
    """
    Flatten a structured data-type description.

    Nested structured fields are replaced, in place, by the fields they
    contain; the name of the enclosing field is dropped.  A leaf field that
    has a title is reported as ``((title, name), dtype)``.

    Parameters
    ----------
    ndtype : dtype
        Datatype to flatten.

    Returns
    -------
    descr : tuple
        Tuple of ``(name, dtype)`` pairs.  For a datatype without fields
        this is ``(('', ndtype),)``.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> ndtype = np.dtype([('a', '<i4'), ('b', [('ba', '<f8'), ('bb', '<i4')])])
    >>> rfn.flatten_descr(ndtype)
    (('a', dtype('int32')), ('ba', dtype('float64')), ('bb', dtype('int32')))

    """
    names = ndtype.names
    if names is None:
        return (('', ndtype),)

    descr = []
    for field in names:
        # Entries of ``fields`` are (dtype, offset) or, for titled fields,
        # (dtype, offset, title).  Index rather than unpack into two names
        # so that titled fields do not raise.
        info = ndtype.fields[field]
        typ = info[0]
        if typ.names is not None:
            descr.extend(flatten_descr(typ))
        elif len(info) == 3:
            descr.append(((info[2], field), typ))
        else:
            descr.append((field, typ))
    return tuple(descr)
|
|
|
|
|
|
def _zip_dtype(seqarrays, flatten=False):
    """
    Combine the dtypes of a sequence of arrays into one structured dtype.

    With ``flatten=True`` every input contributes its flattened fields.
    Otherwise an input whose dtype has exactly one field contributes that
    field, and any other input contributes one unnamed field holding its
    whole dtype.
    """
    spec = []
    for arr in seqarrays:
        current = arr.dtype
        if flatten:
            spec.extend(flatten_descr(current))
        elif current.names is not None and len(current.names) == 1:
            # special case - dtypes of 1 field are flattened
            spec.extend(_get_fieldspec(current))
        else:
            spec.append(('', current))
    return np.dtype(spec)
|
|
|
|
|
|
def _zip_descr(seqarrays, flatten=False):
    """
    Combine the dtype description of a series of arrays.

    Parameters
    ----------
    seqarrays : sequence of arrays
        Sequence of arrays
    flatten : {boolean}, optional
        Whether to collapse nested descriptions.

    Returns
    -------
    descr : list
        The ``descr`` attribute of the combined dtype.
    """
    combined = _zip_dtype(seqarrays, flatten=flatten)
    return combined.descr
|
|
|
|
|
|
def get_fieldstructure(adtype, lastname=None, parents=None,):
    """
    Returns a dictionary with fields indexing lists of their parent fields.

    This function is used to simplify access to fields nested in other fields.
    Every field name, whether it is a leaf or a nested structure, maps to
    the list of its enclosing fields, outermost first.

    Parameters
    ----------
    adtype : np.dtype
        Input datatype
    lastname : optional
        Last processed field name (used internally during recursion).
    parents : dictionary
        Dictionary of parent fields (used internally during recursion).

    Returns
    -------
    parents : dict
        Mapping of field name to list of ancestor field names.  Because the
        mapping is keyed by bare field name, a name that appears at several
        places in the datatype keeps only the entry written last.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> ndtype =  np.dtype([('A', int),
    ...                     ('B', [('BA', int),
    ...                            ('BB', [('BBA', int), ('BBB', int)])])])
    >>> rfn.get_fieldstructure(ndtype)
    {'A': [], 'B': [], 'BA': ['B'], 'BB': ['B'], 'BBA': ['B', 'BB'], 'BBB': ['B', 'BB']}

    """
    if parents is None:
        parents = {}
    for name in adtype.names:
        current = adtype[name]
        # Ancestors of `name` are the ancestors of the enclosing field plus
        # the enclosing field itself.  Computing this the same way for
        # structured fields and leaves keeps the full chain at any depth
        # (a structured field used to record only its immediate parent).
        if lastname:
            ancestors = list(parents.get(lastname, []) or [])
            ancestors.append(lastname)
        else:
            ancestors = []
        parents[name] = ancestors
        if current.names is not None:
            # Recursion fills the same dictionary in place.
            get_fieldstructure(current, name, parents)
    return parents
|
|
|
|
|
|
def _izip_fields_flat(iterable):
|
|
"""
|
|
Returns an iterator of concatenated fields from a sequence of arrays,
|
|
collapsing any nested structure.
|
|
|
|
"""
|
|
for element in iterable:
|
|
if isinstance(element, np.void):
|
|
yield from _izip_fields_flat(tuple(element))
|
|
else:
|
|
yield element
|
|
|
|
|
|
def _izip_fields(iterable):
|
|
"""
|
|
Returns an iterator of concatenated fields from a sequence of arrays.
|
|
|
|
"""
|
|
for element in iterable:
|
|
if (hasattr(element, '__iter__') and
|
|
not isinstance(element, str)):
|
|
yield from _izip_fields(element)
|
|
elif isinstance(element, np.void) and len(tuple(element)) == 1:
|
|
# this statement is the same from the previous expression
|
|
yield from _izip_fields(element)
|
|
else:
|
|
yield element
|
|
|
|
|
|
def _izip_records(seqarrays, fill_value=None, flatten=True):
    """
    Returns an iterator of concatenated items from a sequence of arrays.

    Parameters
    ----------
    seqarrays : sequence of arrays
        Sequence of arrays.
    fill_value : {None, integer}
        Value used to pad shorter iterables.
    flatten : {True, False},
        Whether to expand nested records with `_izip_fields_flat` (True)
        or to use `_izip_fields` (False).
    """
    # Should we flatten the items, or just use a nested approach
    zipfunc = _izip_fields_flat if flatten else _izip_fields

    for row in itertools.zip_longest(*seqarrays, fillvalue=fill_value):
        yield tuple(zipfunc(row))
|
|
|
|
|
|
def _fix_output(output, usemask=True, asrecarray=False):
|
|
"""
|
|
Private function: return a recarray, a ndarray, a MaskedArray
|
|
or a MaskedRecords depending on the input parameters
|
|
"""
|
|
if not isinstance(output, MaskedArray):
|
|
usemask = False
|
|
if usemask:
|
|
if asrecarray:
|
|
output = output.view(MaskedRecords)
|
|
else:
|
|
output = ma.filled(output)
|
|
if asrecarray:
|
|
output = output.view(recarray)
|
|
return output
|
|
|
|
|
|
def _fix_defaults(output, defaults=None):
|
|
"""
|
|
Update the fill_value and masked data of `output`
|
|
from the default given in a dictionary defaults.
|
|
"""
|
|
names = output.dtype.names
|
|
(data, mask, fill_value) = (output.data, output.mask, output.fill_value)
|
|
for (k, v) in (defaults or {}).items():
|
|
if k in names:
|
|
fill_value[k] = v
|
|
data[k][mask[k]] = v
|
|
return output
|
|
|
|
|
|
def _merge_arrays_dispatcher(seqarrays, fill_value=None, flatten=None,
|
|
usemask=None, asrecarray=None):
|
|
return seqarrays
|
|
|
|
|
|
@array_function_dispatch(_merge_arrays_dispatcher)
def merge_arrays(seqarrays, fill_value=-1, flatten=False,
                 usemask=False, asrecarray=False):
    """
    Merge arrays field by field.

    Parameters
    ----------
    seqarrays : sequence of ndarrays
        Sequence of arrays
    fill_value : {float}, optional
        Filling value used to pad missing data on the shorter arrays.
    flatten : {False, True}, optional
        Whether to collapse nested fields.
    usemask : {False, True}, optional
        Whether to return a masked array or not.
    asrecarray : {False, True}, optional
        Whether to return a recarray (MaskedRecords) or not.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> rfn.merge_arrays((np.array([1, 2]), np.array([10., 20., 30.])))
    array([( 1, 10.), ( 2, 20.), (-1, 30.)],
          dtype=[('f0', '<i8'), ('f1', '<f8')])

    >>> rfn.merge_arrays((np.array([1, 2], dtype=np.int64),
    ...         np.array([10., 20., 30.])), usemask=False)
     array([(1, 10.0), (2, 20.0), (-1, 30.0)],
             dtype=[('f0', '<i8'), ('f1', '<f8')])
    >>> rfn.merge_arrays((np.array([1, 2]).view([('a', np.int64)]),
    ...               np.array([10., 20., 30.])),
    ...              usemask=False, asrecarray=True)
    rec.array([( 1, 10.), ( 2, 20.), (-1, 30.)],
              dtype=[('a', '<i8'), ('f1', '<f8')])

    Notes
    -----
    * Without a mask, the missing value will be filled with something,
      depending on its corresponding type:

      * ``-1``      for integers
      * ``-1.0``    for floating point numbers
      * ``'-'``     for characters
      * ``'-1'``    for strings
      * ``True``    for boolean values
    * XXX: I just obtained these values empirically
    """
    # A sequence holding a single item is unwrapped so that it goes down
    # the single-array path below.
    if len(seqarrays) == 1:
        seqarrays = np.asanyarray(seqarrays[0])

    if isinstance(seqarrays, (ndarray, np.void)):
        # Single array (or record) as input.
        seqdtype = seqarrays.dtype
        # Make sure we have named fields
        if seqdtype.names is None:
            seqdtype = np.dtype([('', seqdtype)])
        if not flatten or _zip_dtype((seqarrays,), flatten=True) == seqdtype:
            # Nothing to restructure: a view of the right type is enough.
            seqarrays = seqarrays.ravel()
            if usemask:
                seqtype = MaskedRecords if asrecarray else MaskedArray
            else:
                seqtype = recarray if asrecarray else ndarray
            return seqarrays.view(dtype=seqdtype, type=seqtype)
        seqarrays = (seqarrays,)
    else:
        # Make sure we have arrays in the input sequence
        seqarrays = [np.asanyarray(_m) for _m in seqarrays]

    # Sizes of the inputs, and the length every input is padded to.
    sizes = tuple(a.size for a in seqarrays)
    maxlength = max(sizes)
    # Get the dtype of the output (flattening if needed)
    newdtype = _zip_dtype(seqarrays, flatten=flatten)
    # One padded iterator per input, for the data and for the mask.
    seqdata = []
    seqmask = []

    if usemask:
        # Masked output: data and mask are padded side by side.
        for arr, size in zip(seqarrays, sizes):
            nbmissing = maxlength - size
            data = arr.ravel().__array__()
            mask = ma.getmaskarray(arr).ravel()
            if nbmissing:
                fval = _check_fill_value(fill_value, arr.dtype)
                if isinstance(fval, (ndarray, np.void)):
                    if len(fval.dtype) == 1:
                        # One-field record: pad with the bare value.
                        fval, fmsk = fval.item()[0], True
                    else:
                        fval = np.array(fval, dtype=arr.dtype, ndmin=1)
                        fmsk = np.ones((1,), dtype=mask.dtype)
            else:
                fval, fmsk = None, True
            seqdata.append(itertools.chain(data, [fval] * nbmissing))
            seqmask.append(itertools.chain(mask, [fmsk] * nbmissing))
        data = tuple(_izip_records(seqdata, flatten=flatten))
        output = ma.array(np.fromiter(data, dtype=newdtype, count=maxlength),
                          mask=list(_izip_records(seqmask, flatten=flatten)))
        if asrecarray:
            output = output.view(MaskedRecords)
    else:
        # Plain output: same padding, without the mask bookkeeping.
        for arr, size in zip(seqarrays, sizes):
            nbmissing = maxlength - size
            data = arr.ravel().__array__()
            if nbmissing:
                fval = _check_fill_value(fill_value, arr.dtype)
                if isinstance(fval, (ndarray, np.void)):
                    if len(fval.dtype) == 1:
                        fval = fval.item()[0]
                    else:
                        fval = np.array(fval, dtype=arr.dtype, ndmin=1)
            else:
                fval = None
            seqdata.append(itertools.chain(data, [fval] * nbmissing))
        output = np.fromiter(tuple(_izip_records(seqdata, flatten=flatten)),
                             dtype=newdtype, count=maxlength)
        if asrecarray:
            output = output.view(recarray)
    return output
|
|
|
|
|
|
def _drop_fields_dispatcher(base, drop_names, usemask=None, asrecarray=None):
|
|
return (base,)
|
|
|
|
|
|
@array_function_dispatch(_drop_fields_dispatcher)
def drop_fields(base, drop_names, usemask=True, asrecarray=False):
    """
    Return a new array with fields in `drop_names` dropped.

    Nested fields are supported.

    .. versionchanged:: 1.18.0
        `drop_fields` returns an array with 0 fields if all fields are dropped,
        rather than returning ``None`` as it did previously.

    Parameters
    ----------
    base : array
        Input array
    drop_names : string or sequence
        String or sequence of strings corresponding to the names of the
        fields to drop.
    usemask : {False, True}, optional
        Whether to return a masked array or not.
    asrecarray : {False, True}, optional
        Whether to return a recarray or a mrecarray (`asrecarray=True`) or
        a plain ndarray or masked array with flexible dtype. The default
        is False.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.array([(1, (2, 3.0)), (4, (5, 6.0))],
    ...   dtype=[('a', np.int64), ('b', [('ba', np.double), ('bb', np.int64)])])
    >>> rfn.drop_fields(a, 'a')
    array([((2., 3),), ((5., 6),)],
          dtype=[('b', [('ba', '<f8'), ('bb', '<i8')])])
    >>> rfn.drop_fields(a, 'ba')
    array([(1, (3,)), (4, (6,))], dtype=[('a', '<i8'), ('b', [('bb', '<i8')])])
    >>> rfn.drop_fields(a, ['ba', 'bb'])
    array([(1,), (4,)], dtype=[('a', '<i8')])
    """
    # A lone string is one field name, not a sequence of characters.
    drop_names = ([drop_names] if _is_string_like(drop_names)
                  else set(drop_names))

    def _drop_descr(ndtype, drop_names):
        # Rebuild the field list of `ndtype` without the dropped names.
        kept = []
        for name in ndtype.names:
            if name in drop_names:
                continue
            current = ndtype[name]
            if current.names is None:
                kept.append((name, current))
                continue
            nested = _drop_descr(current, drop_names)
            # A structured field left with no fields is dropped as well.
            if nested:
                kept.append((name, nested))
        return kept

    newdtype = _drop_descr(base.dtype, drop_names)

    output = np.empty(base.shape, dtype=newdtype)
    output = recursive_fill_fields(base, output)
    return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
|
|
|
|
|
|
def _keep_fields(base, keep_names, usemask=True, asrecarray=False):
    """
    Return a new array keeping only the fields in `keep_names`,
    and preserving the order of those fields.

    Parameters
    ----------
    base : array
        Input array
    keep_names : string or sequence
        String or sequence of strings corresponding to the names of the
        fields to keep. Order of the names will be preserved.
    usemask : {False, True}, optional
        Whether to return a masked array or not.
    asrecarray : {False, True}, optional
        Whether to return a recarray or a mrecarray (`asrecarray=True`) or
        a plain ndarray or masked array with flexible dtype. The default
        is False.
    """
    # A lone string is a single field name; without this it would be
    # iterated character by character.
    if isinstance(keep_names, str):
        keep_names = [keep_names]
    newdtype = [(n, base.dtype[n]) for n in keep_names]
    output = np.empty(base.shape, dtype=newdtype)
    output = recursive_fill_fields(base, output)
    return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
|
|
|
|
|
|
def _rec_drop_fields_dispatcher(base, drop_names):
|
|
return (base,)
|
|
|
|
|
|
@array_function_dispatch(_rec_drop_fields_dispatcher)
def rec_drop_fields(base, drop_names):
    """
    Returns a new numpy.recarray with fields in `drop_names` dropped.

    This calls `drop_fields` with ``usemask=False`` and
    ``asrecarray=True``.
    """
    options = {'usemask': False, 'asrecarray': True}
    return drop_fields(base, drop_names, **options)
|
|
|
|
|
|
def _rename_fields_dispatcher(base, namemapper):
|
|
return (base,)
|
|
|
|
|
|
@array_function_dispatch(_rename_fields_dispatcher)
def rename_fields(base, namemapper):
    """
    Rename the fields from a flexible-datatype ndarray or recarray.

    Nested fields are supported.

    Parameters
    ----------
    base : ndarray
        Input array whose fields must be modified.
    namemapper : dictionary
        Dictionary mapping old field names to their new version.

    Returns
    -------
    renamed : ndarray
        View of `base` with the renamed datatype.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.array([(1, (2, [3.0, 30.])), (4, (5, [6.0, 60.]))],
    ...   dtype=[('a', int),('b', [('ba', float), ('bb', (float, 2))])])
    >>> rfn.rename_fields(a, {'a':'A', 'bb':'BB'})
    array([(1, (2., [ 3., 30.])), (4, (5., [ 6., 60.]))],
          dtype=[('A', '<i8'), ('b', [('ba', '<f8'), ('BB', '<f8', (2,))])])

    """
    def _recursive_rename_fields(ndtype, namemapper):
        # Rebuild the field list, mapping names at every nesting level.
        renamed = []
        for old_name in ndtype.names:
            new_name = namemapper.get(old_name, old_name)
            field_type = ndtype[old_name]
            if field_type.names is not None:
                field_type = _recursive_rename_fields(field_type, namemapper)
            renamed.append((new_name, field_type))
        return renamed

    return base.view(_recursive_rename_fields(base.dtype, namemapper))
|
|
|
|
|
|
def _append_fields_dispatcher(base, names, data, dtypes=None,
|
|
fill_value=None, usemask=None, asrecarray=None):
|
|
yield base
|
|
yield from data
|
|
|
|
|
|
@array_function_dispatch(_append_fields_dispatcher)
def append_fields(base, names, data, dtypes=None,
                  fill_value=-1, usemask=True, asrecarray=False):
    """
    Add new fields to an existing array.

    The names of the fields are given with the `names` arguments,
    the corresponding values with the `data` arguments.
    If a single field is appended, `names`, `data` and `dtypes` do not have
    to be lists but just values.

    Parameters
    ----------
    base : array
        Input array to extend.
    names : string, sequence
        String or sequence of strings corresponding to the names
        of the new fields.
    data : array or sequence of arrays
        Array or sequence of arrays storing the fields to add to the base.
    dtypes : sequence of datatypes, optional
        Datatype or sequence of datatypes.
        If None, the datatypes are estimated from the `data`.
    fill_value : {float}, optional
        Filling value used to pad missing data on the shorter arrays.
    usemask : {False, True}, optional
        Whether to return a masked array or not.
    asrecarray : {False, True}, optional
        Whether to return a recarray (MaskedRecords) or not.

    Raises
    ------
    ValueError
        If `names` is a tuple or list whose length differs from that of
        `data`, or if `dtypes` holds more than one entry and its length
        differs from that of `data`.

    """
    # Check the names
    if isinstance(names, (tuple, list)):
        if len(names) != len(data):
            msg = "The number of arrays does not match the number of names"
            raise ValueError(msg)
    elif isinstance(names, str):
        names = [names, ]
        data = [data, ]
    # Turn every new column into a one-field structured array.
    # np.asanyarray keeps subclasses and copies only when it has to; unlike
    # ``np.array(..., copy=False)`` it does not raise under NumPy 2 when a
    # copy is unavoidable (e.g. list input or a dtype conversion).
    if dtypes is None:
        data = [np.asanyarray(a) for a in data]
        data = [a.view([(name, a.dtype)]) for (name, a) in zip(names, data)]
    else:
        if not isinstance(dtypes, (tuple, list)):
            dtypes = [dtypes, ]
        if len(data) != len(dtypes):
            if len(dtypes) == 1:
                # A single dtype applies to every new column.
                dtypes = dtypes * len(data)
            else:
                msg = "The dtypes argument must be None, a dtype, or a list."
                raise ValueError(msg)
        data = [np.asanyarray(a, dtype=d).view([(n, d)])
                for (a, n, d) in zip(data, names, dtypes)]
    # Normalise the base, and merge the new columns into a single array.
    base = merge_arrays(base, usemask=usemask, fill_value=fill_value)
    if len(data) > 1:
        data = merge_arrays(data, flatten=True, usemask=usemask,
                            fill_value=fill_value)
    else:
        data = data.pop()
    # Start from a fully masked array long enough for both parts, then
    # fill in the existing fields followed by the new ones.
    output = ma.masked_all(
        max(len(base), len(data)),
        dtype=_get_fieldspec(base.dtype) + _get_fieldspec(data.dtype))
    output = recursive_fill_fields(base, output)
    output = recursive_fill_fields(data, output)
    return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
|
|
|
|
|
|
def _rec_append_fields_dispatcher(base, names, data, dtypes=None):
|
|
yield base
|
|
yield from data
|
|
|
|
|
|
@array_function_dispatch(_rec_append_fields_dispatcher)
def rec_append_fields(base, names, data, dtypes=None):
    """
    Add new fields to an existing array.

    The names of the fields are given with the `names` arguments,
    the corresponding values with the `data` arguments.
    If a single field is appended, `names`, `data` and `dtypes` do not have
    to be lists but just values.

    This calls `append_fields` with ``asrecarray=True`` and
    ``usemask=False``.

    Parameters
    ----------
    base : array
        Input array to extend.
    names : string, sequence
        String or sequence of strings corresponding to the names
        of the new fields.
    data : array or sequence of arrays
        Array or sequence of arrays storing the fields to add to the base.
    dtypes : sequence of datatypes, optional
        Datatype or sequence of datatypes.
        If None, the datatypes are estimated from the `data`.

    See Also
    --------
    append_fields

    Returns
    -------
    appended_array : np.recarray
    """
    options = {'asrecarray': True, 'usemask': False}
    return append_fields(base, names, data=data, dtypes=dtypes, **options)
|
|
|
|
|
|
def _repack_fields_dispatcher(a, align=None, recurse=None):
|
|
return (a,)
|
|
|
|
|
|
@array_function_dispatch(_repack_fields_dispatcher)
def repack_fields(a, align=False, recurse=False):
    """
    Re-pack the fields of a structured array or dtype in memory.

    The memory layout of structured datatypes allows fields at arbitrary
    byte offsets. This means the fields can be separated by padding bytes,
    their offsets can be non-monotonically increasing, and they can overlap.

    This method removes any overlaps and reorders the fields in memory so they
    have increasing byte offsets, and adds or removes padding bytes depending
    on the `align` option, which behaves like the `align` option to
    `numpy.dtype`.

    If `align=False`, this method produces a "packed" memory layout in which
    each field starts at the byte the previous field ended, and any padding
    bytes are removed.

    If `align=True`, this methods produces an "aligned" memory layout in which
    each field's offset is a multiple of its alignment, and the total itemsize
    is a multiple of the largest alignment, by adding padding bytes as needed.

    Parameters
    ----------
    a : ndarray or dtype
       array or dtype for which to repack the fields.
    align : boolean
       If true, use an "aligned" memory layout, otherwise use a "packed" layout.
    recurse : boolean
       If True, also repack nested structures.

    Returns
    -------
    repacked : ndarray or dtype
       Copy of `a` with fields repacked, or `a` itself if no repacking was
       needed.

    Examples
    --------

    >>> from numpy.lib import recfunctions as rfn
    >>> def print_offsets(d):
    ...     print("offsets:", [d.fields[name][1] for name in d.names])
    ...     print("itemsize:", d.itemsize)
    ...
    >>> dt = np.dtype('u1, <i8, <f8', align=True)
    >>> dt
    dtype({'names': ['f0', 'f1', 'f2'], 'formats': ['u1', '<i8', '<f8'], \
'offsets': [0, 8, 16], 'itemsize': 24}, align=True)
    >>> print_offsets(dt)
    offsets: [0, 8, 16]
    itemsize: 24
    >>> packed_dt = rfn.repack_fields(dt)
    >>> packed_dt
    dtype([('f0', 'u1'), ('f1', '<i8'), ('f2', '<f8')])
    >>> print_offsets(packed_dt)
    offsets: [0, 1, 9]
    itemsize: 17

    """
    if not isinstance(a, np.dtype):
        # Array input: repack its dtype and cast, copying only if needed.
        repacked = repack_fields(a.dtype, align=align, recurse=recurse)
        return a.astype(repacked, copy=False)

    if a.names is None:
        # Nothing to repack on an unstructured dtype.
        return a

    fieldinfo = []
    for name in a.names:
        entry = a.fields[name]
        field_dtype = entry[0]
        if recurse:
            field_dtype = repack_fields(field_dtype, align=align, recurse=True)
        # A three-item entry carries the field title in last position.
        key = (entry[2], name) if len(entry) == 3 else name
        fieldinfo.append((key, field_dtype))

    repacked = np.dtype(fieldinfo, align=align)
    # Pair the new layout with the scalar type of the original dtype.
    return np.dtype((a.type, repacked))
|
|
|
|
def _get_fields_and_offsets(dt, offset=0):
|
|
"""
|
|
Returns a flat list of (dtype, count, offset) tuples of all the
|
|
scalar fields in the dtype "dt", including nested fields, in left
|
|
to right order.
|
|
"""
|
|
|
|
# counts up elements in subarrays, including nested subarrays, and returns
|
|
# base dtype and count
|
|
def count_elem(dt):
|
|
count = 1
|
|
while dt.shape != ():
|
|
for size in dt.shape:
|
|
count *= size
|
|
dt = dt.base
|
|
return dt, count
|
|
|
|
fields = []
|
|
for name in dt.names:
|
|
field = dt.fields[name]
|
|
f_dt, f_offset = field[0], field[1]
|
|
f_dt, n = count_elem(f_dt)
|
|
|
|
if f_dt.names is None:
|
|
fields.append((np.dtype((f_dt, (n,))), n, f_offset + offset))
|
|
else:
|
|
subfields = _get_fields_and_offsets(f_dt, f_offset + offset)
|
|
size = f_dt.itemsize
|
|
|
|
for i in range(n):
|
|
if i == 0:
|
|
# optimization: avoid list comprehension if no subarray
|
|
fields.extend(subfields)
|
|
else:
|
|
fields.extend([(d, c, o + i*size) for d, c, o in subfields])
|
|
return fields
|
|
|
|
|
|
def _structured_to_unstructured_dispatcher(arr, dtype=None, copy=None,
|
|
casting=None):
|
|
return (arr,)
|
|
|
|
@array_function_dispatch(_structured_to_unstructured_dispatcher)
def structured_to_unstructured(arr, dtype=None, copy=False, casting='unsafe'):
    """
    Converts an n-D structured array into an (n+1)-D unstructured array.

    The new array will have a new last dimension equal in size to the
    number of field-elements of the input array. If not supplied, the output
    datatype is determined from the numpy type promotion rules applied to all
    the field datatypes.

    Nested fields, as well as each element of any subarray fields, all count
    as a single field-elements.

    Parameters
    ----------
    arr : ndarray
       Structured array or dtype to convert. Cannot contain object datatype.
    dtype : dtype, optional
       The dtype of the output unstructured array.
    copy : bool, optional
        See copy argument to `numpy.ndarray.astype`. If true, always return a
        copy. If false, and `dtype` requirements are satisfied, a view is
        returned.
    casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
        See casting argument of `numpy.ndarray.astype`. Controls what kind of
        data casting may occur.

    Returns
    -------
    unstructured : ndarray
       Unstructured array with one more dimension.

    Raises
    ------
    ValueError
        If `arr` is not a structured array, or if it has no fields and
        `dtype` is not given.
    NotImplementedError
        If `arr` has no fields and `dtype` is given.

    Examples
    --------

    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.zeros(4, dtype=[('a', 'i4'), ('b', 'f4,u2'), ('c', 'f4', 2)])
    >>> a
    array([(0, (0., 0), [0., 0.]), (0, (0., 0), [0., 0.]),
           (0, (0., 0), [0., 0.]), (0, (0., 0), [0., 0.])],
          dtype=[('a', '<i4'), ('b', [('f0', '<f4'), ('f1', '<u2')]), ('c', '<f4', (2,))])
    >>> rfn.structured_to_unstructured(a)
    array([[0., 0., 0., 0., 0.],
           [0., 0., 0., 0., 0.],
           [0., 0., 0., 0., 0.],
           [0., 0., 0., 0., 0.]])

    >>> b = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)],
    ...              dtype=[('x', 'i4'), ('y', 'f4'), ('z', 'f8')])
    >>> np.mean(rfn.structured_to_unstructured(b[['x', 'z']]), axis=-1)
    array([ 3. ,  5.5,  9. , 11. ])

    """
    if arr.dtype.names is None:
        raise ValueError('arr must be a structured array')

    fields = _get_fields_and_offsets(arr.dtype)
    n_fields = len(fields)
    if n_fields == 0:
        if dtype is None:
            raise ValueError("arr has no fields. Unable to guess dtype")
        # too many bugs elsewhere for this to work now
        raise NotImplementedError("arr with no fields is not supported")

    field_dtypes, counts, offsets = zip(*fields)
    names = ['f{}'.format(n) for n in range(n_fields)]

    if dtype is not None:
        out_dtype = dtype
    else:
        out_dtype = np.result_type(*[dt.base for dt in field_dtypes])

    # Use a series of views and casts to convert to an unstructured array:

    # first view using flattened fields (doesn't work for object arrays)
    # Note: field_dtypes may include a shape for subarrays
    flattened_fields = np.dtype({'names': names,
                                 'formats': field_dtypes,
                                 'offsets': offsets,
                                 'itemsize': arr.dtype.itemsize})
    with suppress_warnings() as sup:  # until 1.16 (gh-12447)
        sup.filter(FutureWarning, "Numpy has detected")
        arr = arr.view(flattened_fields)

    # next cast to a packed format with all fields converted to new dtype
    packed_formats = [(out_dtype, dt.shape) for dt in field_dtypes]
    packed_fields = np.dtype({'names': names, 'formats': packed_formats})
    arr = arr.astype(packed_fields, copy=copy, casting=casting)

    # finally is it safe to view the packed fields as the unstructured type
    total = sum(counts)
    return arr.view((out_dtype, (total,)))
|
|
|
|
|
|
def _unstructured_to_structured_dispatcher(arr, dtype=None, names=None,
|
|
align=None, copy=None, casting=None):
|
|
return (arr,)
|
|
|
|
@array_function_dispatch(_unstructured_to_structured_dispatcher)
def unstructured_to_structured(arr, dtype=None, names=None, align=False,
                               copy=False, casting='unsafe'):
    """
    Converts an n-D unstructured array into an (n-1)-D structured array.

    The last dimension of the input array is converted into a structure, with
    number of field-elements equal to the size of the last dimension of the
    input array. By default all output fields have the input array's dtype, but
    an output structured dtype with an equal number of fields-elements can be
    supplied instead.

    Nested fields, as well as each element of any subarray fields, all count
    towards the number of field-elements.

    Parameters
    ----------
    arr : ndarray
        Unstructured array or dtype to convert.
    dtype : dtype, optional
        The structured dtype of the output array. Anything accepted by
        `numpy.dtype` (e.g. a list of ``(name, format)`` tuples) may be given;
        it is converted to a dtype object before use.
    names : list of strings, optional
        If dtype is not supplied, this specifies the field names for the output
        dtype, in order. The field dtypes will be the same as the input array.
    align : boolean, optional
        Whether to create an aligned memory layout.
    copy : bool, optional
        See copy argument to `numpy.ndarray.astype`. If true, always return a
        copy. If false, and `dtype` requirements are satisfied, a view is
        returned.
    casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
        See casting argument of `numpy.ndarray.astype`. Controls what kind of
        data casting may occur.

    Returns
    -------
    structured : ndarray
        Structured array with fewer dimensions.

    Raises
    ------
    ValueError
        If `arr` is 0-dimensional, if both `dtype` and `names` are given, if
        the last axis length differs from the number of field-elements in
        `dtype`, or if `align` is true but `dtype` is not an aligned struct.
    NotImplementedError
        If the last axis of `arr` has size 0.

    Examples
    --------

    >>> from numpy.lib import recfunctions as rfn
    >>> dt = np.dtype([('a', 'i4'), ('b', 'f4,u2'), ('c', 'f4', 2)])
    >>> a = np.arange(20).reshape((4,5))
    >>> a
    array([[ 0,  1,  2,  3,  4],
           [ 5,  6,  7,  8,  9],
           [10, 11, 12, 13, 14],
           [15, 16, 17, 18, 19]])
    >>> rfn.unstructured_to_structured(a, dt)
    array([( 0, ( 1.,  2), [ 3.,  4.]), ( 5, ( 6.,  7), [ 8.,  9.]),
           (10, (11., 12), [13., 14.]), (15, (16., 17), [18., 19.])],
          dtype=[('a', '<i4'), ('b', [('f0', '<f4'), ('f1', '<u2')]), ('c', '<f4', (2,))])

    """
    if arr.shape == ():
        raise ValueError('arr must have at least one dimension')
    n_elem = arr.shape[-1]
    if n_elem == 0:
        # too many bugs elsewhere for this to work now
        raise NotImplementedError("last axis with size 0 is not supported")

    if dtype is None:
        if names is None:
            names = ['f{}'.format(n) for n in range(n_elem)]
        out_dtype = np.dtype([(n, arr.dtype) for n in names], align=align)
        fields = _get_fields_and_offsets(out_dtype)
        dts, counts, offsets = zip(*fields)
    else:
        if names is not None:
            raise ValueError("don't supply both dtype and names")
        # Normalise dtype specifications (lists of tuples, strings, ...) so
        # the attribute accesses below (.itemsize, .isalignedstruct) and the
        # field introspection operate on a real dtype object.
        dtype = np.dtype(dtype)
        # sanity check of the input dtype
        fields = _get_fields_and_offsets(dtype)
        if len(fields) == 0:
            dts, counts, offsets = [], [], []
        else:
            dts, counts, offsets = zip(*fields)

        if n_elem != sum(counts):
            raise ValueError('The length of the last dimension of arr must '
                             'be equal to the number of fields in dtype')
        out_dtype = dtype
        if align and not out_dtype.isalignedstruct:
            raise ValueError("align was True but dtype is not aligned")

    # Internal, positional field names used for the intermediate flat dtypes.
    names = ['f{}'.format(n) for n in range(len(fields))]

    # Use a series of views and casts to convert to a structured array:

    # first view as a packed structured array of one dtype
    packed_fields = np.dtype({'names': names,
                              'formats': [(arr.dtype, dt.shape) for dt in dts]})
    arr = np.ascontiguousarray(arr).view(packed_fields)

    # next cast to an unpacked but flattened format with varied dtypes
    flattened_fields = np.dtype({'names': names,
                                 'formats': dts,
                                 'offsets': offsets,
                                 'itemsize': out_dtype.itemsize})
    arr = arr.astype(flattened_fields, copy=copy, casting=casting)

    # finally view as the final nested dtype and remove the last axis
    return arr.view(out_dtype)[..., 0]
|
|
|
|
def _apply_along_fields_dispatcher(func, arr):
|
|
return (arr,)
|
|
|
|
@array_function_dispatch(_apply_along_fields_dispatcher)
def apply_along_fields(func, arr):
    """
    Reduce a structured array across its fields with `func`.

    The fields of `arr` are treated as one extra, trailing axis: the array is
    first converted with `structured_to_unstructured` (called with its default
    arguments) and `func` is then called on the result with ``axis=-1``.

    Parameters
    ----------
    func : function
        Reduction to apply over the "field" dimension. It must accept an
        `axis` keyword argument, like np.mean, np.sum, etc.
    arr : ndarray
        Structured array to reduce.

    Returns
    -------
    out : ndarray
        Result of the reduction operation.

    Raises
    ------
    ValueError
        If `arr` is not a structured array.

    Examples
    --------

    >>> from numpy.lib import recfunctions as rfn
    >>> b = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)],
    ...              dtype=[('x', 'i4'), ('y', 'f4'), ('z', 'f8')])
    >>> rfn.apply_along_fields(np.mean, b)
    array([ 2.66666667,  5.33333333,  8.66666667, 11.        ])
    >>> rfn.apply_along_fields(np.mean, b[['x', 'z']])
    array([ 3. ,  5.5,  9. , 11. ])

    """
    is_structured = arr.dtype.names is not None
    if not is_structured:
        raise ValueError('arr must be a structured array')

    # NOTE: np.apply_along_axis(func, -1, ...) would work here too and would
    # drop the requirement that `func` accepts `axis`, but it is very slow.
    return func(structured_to_unstructured(arr), axis=-1)
|
|
|
|
def _assign_fields_by_name_dispatcher(dst, src, zero_unassigned=None):
|
|
return dst, src
|
|
|
|
@array_function_dispatch(_assign_fields_by_name_dispatcher)
def assign_fields_by_name(dst, src, zero_unassigned=True):
    """
    Copy values between structured arrays, matching fields by name.

    Normally in numpy >= 1.14, assigning one structured array to another
    copies fields "by position": the first field of the src goes to the first
    field of the dst, and so on, whatever the fields are called.

    This function copies "by field name" instead, so each field of the dst is
    filled from the field of the src carrying the same name. Nested structures
    are handled recursively. This is how structure assignment worked in
    numpy >= 1.6 to <= 1.13.

    Parameters
    ----------
    dst : ndarray
    src : ndarray
        The destination and source arrays of the assignment. `dst` is
        modified in place; nothing is returned.
    zero_unassigned : bool, optional
        If True, fields in the dst for which there was no matching
        field in the src are filled with the value 0 (zero). This
        was the behavior of numpy <= 1.13. If False, those fields
        are not modified.
    """
    dst_names = dst.dtype.names

    # Leaf of the recursion: dst is not structured, plain assignment.
    if dst_names is None:
        dst[...] = src
        return

    for field in dst_names:
        if field in src.dtype.names:
            # Same-named field exists in src: recurse so that nested
            # structures are matched by name as well.
            assign_fields_by_name(dst[field], src[field],
                                  zero_unassigned)
        elif zero_unassigned:
            dst[field] = 0
|
|
|
|
def _require_fields_dispatcher(array, required_dtype):
|
|
return (array,)
|
|
|
|
@array_function_dispatch(_require_fields_dispatcher)
def require_fields(array, required_dtype):
    """
    Casts a structured array to a new dtype using assignment by field-name.

    A new array of dtype `required_dtype` is allocated and filled from
    `array` by name: each output field receives the value of the input field
    with the same name. The result therefore contains only the fields
    "required" by the required_dtype.

    A field named in `required_dtype` that does not exist in the input array
    is created and set to 0 in the output array.

    Parameters
    ----------
    array : ndarray
        array to cast
    required_dtype : dtype
        datatype for output array

    Returns
    -------
    out : ndarray
        array with the new dtype, with field values copied from the fields in
        the input array with the same name

    Examples
    --------

    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.ones(4, dtype=[('a', 'i4'), ('b', 'f8'), ('c', 'u1')])
    >>> rfn.require_fields(a, [('b', 'f4'), ('c', 'u1')])
    array([(1., 1), (1., 1), (1., 1), (1., 1)],
      dtype=[('b', '<f4'), ('c', 'u1')])
    >>> rfn.require_fields(a, [('b', 'f4'), ('newf', 'u1')])
    array([(1., 0), (1., 0), (1., 0), (1., 0)],
      dtype=[('b', '<f4'), ('newf', 'u1')])

    """
    # Uninitialised storage is fine: assign_fields_by_name (with its default
    # zero_unassigned=True) writes every top-level field of the result.
    result = np.empty(array.shape, dtype=required_dtype)
    assign_fields_by_name(result, array)
    return result
|
|
|
|
|
|
def _stack_arrays_dispatcher(arrays, defaults=None, usemask=None,
|
|
asrecarray=None, autoconvert=None):
|
|
return arrays
|
|
|
|
|
|
@array_function_dispatch(_stack_arrays_dispatcher)
def stack_arrays(arrays, defaults=None, usemask=True, asrecarray=False,
                 autoconvert=False):
    """
    Superposes arrays fields by fields

    Each input is flattened and the records are laid out one array after the
    other. The output dtype is the union of the input fields, in order of
    first appearance; records coming from an array that lacks a given field
    are left masked for that field.

    Parameters
    ----------
    arrays : array or sequence
        Sequence of input arrays. A single ndarray, or a sequence of length
        one, is returned unchanged.
    defaults : dictionary, optional
        Dictionary mapping field names to the corresponding default values.
    usemask : {True, False}, optional
        Whether to return a MaskedArray (or MaskedRecords is
        `asrecarray==True`) or a ndarray.
    asrecarray : {False, True}, optional
        Whether to return a recarray (or MaskedRecords if `usemask==True`)
        or just a flexible-type ndarray.
    autoconvert : {False, True}, optional
        Whether automatically cast the type of the field to the maximum.

    Raises
    ------
    TypeError
        If two inputs share a field name with different dtypes and
        `autoconvert` is false.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> x = np.array([1, 2,])
    >>> rfn.stack_arrays(x) is x
    True
    >>> z = np.array([('A', 1), ('B', 2)], dtype=[('A', '|S3'), ('B', float)])
    >>> zz = np.array([('a', 10., 100.), ('b', 20., 200.), ('c', 30., 300.)],
    ...   dtype=[('A', '|S3'), ('B', np.double), ('C', np.double)])
    >>> test = rfn.stack_arrays((z,zz))
    >>> test
    masked_array(data=[(b'A', 1.0, --), (b'B', 2.0, --), (b'a', 10.0, 100.0),
                       (b'b', 20.0, 200.0), (b'c', 30.0, 300.0)],
                 mask=[(False, False,  True), (False, False,  True),
                       (False, False, False), (False, False, False),
                       (False, False, False)],
           fill_value=(b'N/A', 1.e+20, 1.e+20),
                dtype=[('A', 'S3'), ('B', '<f8'), ('C', '<f8')])

    """
    # Trivial inputs are handed back untouched.
    if isinstance(arrays, ndarray):
        return arrays
    if len(arrays) == 1:
        return arrays[0]

    flat = [np.asanyarray(item).ravel() for item in arrays]
    lengths = [len(item) for item in flat]
    dtypes = [item.dtype for item in flat]

    # Build the merged field description, starting from the first array and
    # appending fields of later arrays that have not been seen yet.
    merged = _get_fieldspec(dtypes[0])
    merged_names = [fname for fname, _ in merged]
    for other in dtypes[1:]:
        for fname, fdtype in _get_fieldspec(other):
            if fname not in merged_names:
                merged.append((fname, fdtype))
                merged_names.append(fname)
                continue
            # Field already present: reconcile the two dtypes.
            pos = merged_names.index(fname)
            _, cdtype = merged[pos]
            if autoconvert:
                merged[pos] = (fname, max(fdtype, cdtype))
            elif fdtype != cdtype:
                raise TypeError("Incompatible type '%s' <> '%s'" %
                                (cdtype, fdtype))

    if len(merged) == 1:
        # Only one field: use concatenate
        output = ma.concatenate(flat)
    else:
        # Start fully masked; only the slots actually written get unmasked.
        output = ma.masked_all((np.sum(lengths),), merged)
        bounds = np.cumsum(np.r_[0, lengths])
        seen = []
        for item, start, stop in zip(flat, bounds[:-1], bounds[1:]):
            item_names = item.dtype.names
            if item_names is None:
                # Unstructured input: stored under a positional 'f<k>' name,
                # k being the number of named fields written so far.
                output['f%i' % len(seen)][start:stop] = item
                continue
            for fname in item_names:
                output[fname][start:stop] = item[fname]
                if fname not in seen:
                    seen.append(fname)

    return _fix_output(_fix_defaults(output, defaults),
                       usemask=usemask, asrecarray=asrecarray)
|
|
|
|
|
|
def _find_duplicates_dispatcher(
|
|
a, key=None, ignoremask=None, return_index=None):
|
|
return (a,)
|
|
|
|
|
|
@array_function_dispatch(_find_duplicates_dispatcher)
def find_duplicates(a, key=None, ignoremask=True, return_index=False):
    """
    Find the duplicates in a structured array along a given key

    The (flattened) input is sorted on the chosen values and neighbouring
    equal entries are reported, so the result is in sorted order.

    Parameters
    ----------
    a : array-like
        Input array
    key : {string, None}, optional
        Name of the fields along which to check the duplicates.
        If None, the search is performed by records
    ignoremask : {True, False}, optional
        Whether masked data should be discarded or considered as duplicates.
    return_index : {False, True}, optional
        Whether to return the indices of the duplicated values.

    Returns
    -------
    duplicates, or ``(duplicates, indices)`` when `return_index` is true,
    where `indices` are positions in the flattened input.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> ndtype = [('a', int)]
    >>> a = np.ma.array([1, 1, 1, 2, 2, 3, 3],
    ...         mask=[0, 0, 1, 0, 0, 0, 1]).view(ndtype)
    >>> rfn.find_duplicates(a, ignoremask=True, return_index=True)
    (masked_array(data=[(1,), (1,), (2,), (2,)],
                 mask=[(False,), (False,), (False,), (False,)],
           fill_value=(999999,),
                dtype=[('a', '<i8')]), array([0, 1, 3, 4]))
    """
    records = np.asanyarray(a).ravel()
    # Mapping from each field name to the list of its parent fields.
    parents = get_fieldstructure(records.dtype)

    # Select the values to sort on: descend through the parents of `key`
    # (for nested fields), then pick the key field itself.
    sort_values = records
    if key:
        for parent in parents[key]:
            sort_values = sort_values[parent]
        sort_values = sort_values[key]

    order = sort_values.argsort()
    ordered = sort_values[order]
    filled = ordered.filled()

    # True where a sorted entry equals its predecessor.
    same_as_prev = (filled[:-1] == filled[1:])
    if ignoremask:
        # Masked entries must not be reported as duplicates.
        same_as_prev[ordered.recordmask[1:]] = False

    is_dup = np.concatenate(([False], same_as_prev))
    # Also flag the left-hand member of each equal pair (boolean OR), else
    # the first occurrence of every duplicate would be missing.
    is_dup[:-1] = is_dup[:-1] + is_dup[1:]

    duplicates = records[order][is_dup]
    if not return_index:
        return duplicates
    return (duplicates, order[is_dup])
|
|
|
|
|
|
def _join_by_dispatcher(
|
|
key, r1, r2, jointype=None, r1postfix=None, r2postfix=None,
|
|
defaults=None, usemask=None, asrecarray=None):
|
|
return (r1, r2)
|
|
|
|
|
|
@array_function_dispatch(_join_by_dispatcher)
def join_by(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
            defaults=None, usemask=True, asrecarray=False):
    """
    Join arrays `r1` and `r2` on key `key`.

    The key should be either a string or a sequence of string corresponding
    to the fields used to join the array.  An exception is raised if the
    `key` field cannot be found in the two input arrays.  Neither `r1` nor
    `r2` should have any duplicates along `key`: the presence of duplicates
    will make the output quite unreliable. Note that duplicates are not
    looked for by the algorithm.

    Parameters
    ----------
    key : {string, sequence}
        A string or a sequence of strings corresponding to the fields used
        for comparison.
    r1, r2 : arrays
        Structured arrays.
    jointype : {'inner', 'outer', 'leftouter'}, optional
        If 'inner', returns the elements common to both r1 and r2.
        If 'outer', returns the common elements as well as the elements of
        r1 not in r2 and the elements of r2 not in r1.
        If 'leftouter', returns the common elements and the elements of r1
        not in r2.
    r1postfix : string, optional
        String appended to the names of the fields of r1 that are present
        in r2 but absent of the key.
    r2postfix : string, optional
        String appended to the names of the fields of r2 that are present
        in r1 but absent of the key.
    defaults : {dictionary}, optional
        Dictionary mapping field names to the corresponding default values.
    usemask : {True, False}, optional
        Whether to return a MaskedArray (or MaskedRecords is
        `asrecarray==True`) or a ndarray.
    asrecarray : {False, True}, optional
        Whether to return a recarray (or MaskedRecords if `usemask==True`)
        or just a flexible-type ndarray.

    Raises
    ------
    ValueError
        If `jointype` is not one of the accepted values, if `key` contains a
        duplicate name, if a key field is missing from `r1` or `r2`, or if
        the inputs share non-key field names while both postfixes are empty.

    Notes
    -----
    * The output is sorted along the key.
    * A temporary array is formed by dropping the fields not in the key for
      the two arrays and concatenating the result. This array is then
      sorted, and the common entries selected. The output is constructed by
      filling the fields with the selected entries. Matching is not
      preserved if there are some duplicates...

    """
    # ---- Argument validation ------------------------------------------
    if jointype not in ('inner', 'outer', 'leftouter'):
        raise ValueError(
                "The 'jointype' argument should be in 'inner', "
                "'outer' or 'leftouter' (got '%s' instead)" % jointype
                )
    # A single key name is wrapped so that `key` is always a sequence.
    if isinstance(key, str):
        key = (key,)

    if len(set(key)) != len(key):
        dup = next(x for n, x in enumerate(key) if x in key[n+1:])
        raise ValueError("duplicate join key %r" % dup)
    for name in key:
        if name not in r1.dtype.names:
            raise ValueError('r1 does not have key field %r' % name)
        if name not in r2.dtype.names:
            raise ValueError('r2 does not have key field %r' % name)

    # ---- Work on flattened inputs -------------------------------------
    r1 = r1.ravel()
    r2 = r2.ravel()
    nb1 = len(r1)
    r1names = r1.dtype.names
    r2names = r2.dtype.names

    # Non-key names shared by both inputs need at least one postfix.
    collisions = (set(r1names) & set(r2names)) - set(key)
    if collisions and not (r1postfix or r2postfix):
        msg = "r1 and r2 contain common names, r1postfix and r2postfix "
        msg += "can't both be empty"
        raise ValueError(msg)

    # Key-only views of both inputs
    # (use order of keys in `r1` for back-compatibility)
    key1 = [n for n in r1names if n in key]
    r1k = _keep_fields(r1, key1)
    r2k = _keep_fields(r2, key1)

    # ---- Locate common / specific records -----------------------------
    # Sort the concatenated keys; equal neighbours are common to r1 and r2.
    aux = ma.concatenate((r1k, r2k))
    idx_sort = aux.argsort(order=key)
    aux = aux[idx_sort]

    flag_in = ma.concatenate(([False], aux[1:] == aux[:-1]))
    # Flag the left-hand member of each equal pair as well.
    flag_in[:-1] = flag_in[1:] + flag_in[:-1]
    idx_in = idx_sort[flag_in]
    # Indices below nb1 refer to r1, the others (shifted by nb1) to r2.
    idx_1 = idx_in[(idx_in < nb1)]
    idx_2 = idx_in[(idx_in >= nb1)] - nb1
    r1cmn = len(idx_1)
    r2cmn = len(idx_2)

    # Records specific to one input, appended after the common ones.
    r1spc = 0
    r2spc = 0
    if jointype != 'inner':
        # Both 'outer' and 'leftouter' keep the r1-only records.
        idx_out = idx_sort[~flag_in]
        idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)]))
        r1spc = len(idx_1) - r1cmn
        if jointype == 'outer':
            # 'outer' additionally keeps the r2-only records.
            idx_2 = np.concatenate((idx_2, idx_out[(idx_out >= nb1)] - nb1))
            r2spc = len(idx_2) - r2cmn

    # Select the entries from each input
    s1 = r1[idx_1]
    s2 = r2[idx_2]

    # ---- Build the output dtype ---------------------------------------
    # Key fields first, then the remaining fields of r1 ...
    ndtype = _get_fieldspec(r1k.dtype)
    for fname, fdtype in _get_fieldspec(r1.dtype):
        if fname not in key:
            ndtype.append((fname, fdtype))

    # ... then the fields of r2, resolving name collisions.
    for fname, fdtype in _get_fieldspec(r2.dtype):
        # The list is rebuilt on every pass because ndtype may have been
        # modified by the previous iteration.
        current_names = [name for name, _ in ndtype]
        if fname not in current_names:
            # Not seen yet: just add the description to the current list
            ndtype.append((fname, fdtype))
            continue
        nameidx = current_names.index(fname)
        _, cdtype = ndtype[nameidx]
        if fname in key:
            # The current field is part of the key: take the largest dtype
            ndtype[nameidx] = (fname, max(fdtype, cdtype))
        else:
            # The current field is not part of the key: add the suffixes,
            # and place the new field adjacent to the old one
            ndtype[nameidx:nameidx + 1] = [
                (fname + r1postfix, cdtype),
                (fname + r2postfix, fdtype)
            ]
    ndtype = np.dtype(ndtype)

    # ---- Fill the output ----------------------------------------------
    # r1cmn and r2cmn should be equal, but take the larger to be safe.
    cmn = max(r1cmn, r2cmn)
    output = ma.masked_all((cmn + r1spc + r2spc,), dtype=ndtype)
    names = output.dtype.names

    for f in r1names:
        selected = s1[f]
        # Map the r1 field name to its (possibly postfixed) output name.
        if f not in names or (f in r2names and not r2postfix and f not in key):
            f += r1postfix
        current = output[f]
        current[:r1cmn] = selected[:r1cmn]
        if jointype != 'inner':
            current[cmn:cmn + r1spc] = selected[r1cmn:]

    for f in r2names:
        selected = s2[f]
        # Map the r2 field name to its (possibly postfixed) output name.
        if f not in names or (f in r1names and not r1postfix and f not in key):
            f += r2postfix
        current = output[f]
        current[:r2cmn] = selected[:r2cmn]
        if (jointype == 'outer') and r2spc:
            # r2-only records go at the very end of the output.
            current[-r2spc:] = selected[r2cmn:]

    # Sort and finalize the output
    output.sort(order=key)
    return _fix_output(_fix_defaults(output, defaults),
                       usemask=usemask, asrecarray=asrecarray)
|
|
|
|
|
|
def _rec_join_dispatcher(
|
|
key, r1, r2, jointype=None, r1postfix=None, r2postfix=None,
|
|
defaults=None):
|
|
return (r1, r2)
|
|
|
|
|
|
@array_function_dispatch(_rec_join_dispatcher)
def rec_join(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
             defaults=None):
    """
    Join arrays `r1` and `r2` on keys.
    Alternative to join_by, that always returns a np.recarray.

    All arguments are forwarded to `join_by`, which is called with
    ``usemask=False`` and ``asrecarray=True``.

    See Also
    --------
    join_by : equivalent function
    """
    return join_by(key, r1, r2,
                   jointype=jointype,
                   r1postfix=r1postfix,
                   r2postfix=r2postfix,
                   defaults=defaults,
                   usemask=False,
                   asrecarray=True)
|