3RNN/Lib/site-packages/scipy/sparse/_compressed.py

1373 lines
52 KiB
Python
Raw Permalink Normal View History

2024-05-26 19:49:15 +02:00
"""Base class for sparse matrix formats using compressed storage."""
__all__ = []
from warnings import warn
import operator
import numpy as np
from scipy._lib._util import _prune_array, copy_if_needed
from ._base import _spbase, issparse, SparseEfficiencyWarning, sparray
from ._data import _data_matrix, _minmax_mixin
from . import _sparsetools
from ._sparsetools import (get_csr_submatrix, csr_sample_offsets, csr_todense,
csr_sample_values, csr_row_index, csr_row_slice,
csr_column_index1, csr_column_index2)
from ._index import IndexMixin
from ._sputils import (upcast, upcast_char, to_native, isdense, isshape,
getdtype, isscalarlike, isintlike, downcast_intp_index,
get_sum_dtype, check_shape, is_pydata_spmatrix)
class _cs_matrix(_data_matrix, _minmax_mixin, IndexMixin):
"""
base array/matrix class for compressed row- and column-oriented arrays/matrices
"""
def __init__(self, arg1, shape=None, dtype=None, copy=False):
_data_matrix.__init__(self)
if issparse(arg1):
if arg1.format == self.format and copy:
arg1 = arg1.copy()
else:
arg1 = arg1.asformat(self.format)
self.indptr, self.indices, self.data, self._shape = (
arg1.indptr, arg1.indices, arg1.data, arg1._shape
)
elif isinstance(arg1, tuple):
if isshape(arg1):
# It's a tuple of matrix dimensions (M, N)
# create empty matrix
self._shape = check_shape(arg1)
M, N = self.shape
# Select index dtype large enough to pass array and
# scalar parameters to sparsetools
idx_dtype = self._get_index_dtype(maxval=max(M, N))
self.data = np.zeros(0, getdtype(dtype, default=float))
self.indices = np.zeros(0, idx_dtype)
self.indptr = np.zeros(self._swap((M, N))[0] + 1,
dtype=idx_dtype)
else:
if len(arg1) == 2:
# (data, ij) format
coo = self._coo_container(arg1, shape=shape, dtype=dtype)
arrays = coo._coo_to_compressed(self._swap)
self.indptr, self.indices, self.data, self._shape = arrays
self.sum_duplicates()
elif len(arg1) == 3:
# (data, indices, indptr) format
(data, indices, indptr) = arg1
# Select index dtype large enough to pass array and
# scalar parameters to sparsetools
maxval = None
if shape is not None:
maxval = max(shape)
idx_dtype = self._get_index_dtype((indices, indptr),
maxval=maxval,
check_contents=True)
if not copy:
copy = copy_if_needed
self.indices = np.array(indices, copy=copy, dtype=idx_dtype)
self.indptr = np.array(indptr, copy=copy, dtype=idx_dtype)
self.data = np.array(data, copy=copy, dtype=dtype)
else:
raise ValueError(f"unrecognized {self.format}_matrix "
"constructor usage")
else:
# must be dense
try:
arg1 = np.asarray(arg1)
except Exception as e:
msg = f"unrecognized {self.format}_matrix constructor usage"
raise ValueError(msg) from e
if isinstance(self, sparray) and arg1.ndim < 2 and self.format == "csc":
raise ValueError(
f"CSC arrays don't support {arg1.ndim}D input. Use 2D"
)
coo = self._coo_container(arg1, dtype=dtype)
arrays = coo._coo_to_compressed(self._swap)
self.indptr, self.indices, self.data, self._shape = arrays
# Read matrix dimensions given, if any
if shape is not None:
self._shape = check_shape(shape)
else:
if self.shape is None:
# shape not already set, try to infer dimensions
try:
major_dim = len(self.indptr) - 1
minor_dim = self.indices.max() + 1
except Exception as e:
raise ValueError('unable to infer matrix dimensions') from e
else:
self._shape = check_shape(self._swap((major_dim, minor_dim)))
if dtype is not None:
self.data = self.data.astype(dtype, copy=False)
self.check_format(full_check=False)
def _getnnz(self, axis=None):
if axis is None:
return int(self.indptr[-1])
else:
if axis < 0:
axis += 2
axis, _ = self._swap((axis, 1 - axis))
_, N = self._swap(self.shape)
if axis == 0:
return np.bincount(downcast_intp_index(self.indices),
minlength=N)
elif axis == 1:
return np.diff(self.indptr)
raise ValueError('axis out of bounds')
_getnnz.__doc__ = _spbase._getnnz.__doc__
def check_format(self, full_check=True):
"""Check whether the array/matrix respects the CSR or CSC format.
Parameters
----------
full_check : bool, optional
If `True`, run rigorous check, scanning arrays for valid values.
Note that activating those check might copy arrays for casting,
modifying indices and index pointers' inplace.
If `False`, run basic checks on attributes. O(1) operations.
Default is `True`.
"""
# use _swap to determine proper bounds
major_name, minor_name = self._swap(('row', 'column'))
major_dim, minor_dim = self._swap(self.shape)
# index arrays should have integer data types
if self.indptr.dtype.kind != 'i':
warn(f"indptr array has non-integer dtype ({self.indptr.dtype.name})",
stacklevel=3)
if self.indices.dtype.kind != 'i':
warn(f"indices array has non-integer dtype ({self.indices.dtype.name})",
stacklevel=3)
# check array shapes
for x in [self.data.ndim, self.indices.ndim, self.indptr.ndim]:
if x != 1:
raise ValueError('data, indices, and indptr should be 1-D')
# check index pointer
if (len(self.indptr) != major_dim + 1):
raise ValueError("index pointer size ({}) should be ({})"
"".format(len(self.indptr), major_dim + 1))
if (self.indptr[0] != 0):
raise ValueError("index pointer should start with 0")
# check index and data arrays
if (len(self.indices) != len(self.data)):
raise ValueError("indices and data should have the same size")
if (self.indptr[-1] > len(self.indices)):
raise ValueError("Last value of index pointer should be less than "
"the size of index and data arrays")
self.prune()
if full_check:
# check format validity (more expensive)
if self.nnz > 0:
if self.indices.max() >= minor_dim:
raise ValueError(f"{minor_name} index values must be < {minor_dim}")
if self.indices.min() < 0:
raise ValueError(f"{minor_name} index values must be >= 0")
if np.diff(self.indptr).min() < 0:
raise ValueError("index pointer values must form a "
"non-decreasing sequence")
idx_dtype = self._get_index_dtype((self.indptr, self.indices))
self.indptr = np.asarray(self.indptr, dtype=idx_dtype)
self.indices = np.asarray(self.indices, dtype=idx_dtype)
self.data = to_native(self.data)
# if not self.has_sorted_indices():
# warn('Indices were not in sorted order. Sorting indices.')
# self.sort_indices()
# assert(self.has_sorted_indices())
# TODO check for duplicates?
#######################
# Boolean comparisons #
#######################
def _scalar_binopt(self, other, op):
"""Scalar version of self._binopt, for cases in which no new nonzeros
are added. Produces a new sparse array in canonical form.
"""
self.sum_duplicates()
res = self._with_data(op(self.data, other), copy=True)
res.eliminate_zeros()
return res
def __eq__(self, other):
# Scalar other.
if isscalarlike(other):
if np.isnan(other):
return self.__class__(self.shape, dtype=np.bool_)
if other == 0:
warn("Comparing a sparse matrix with 0 using == is inefficient"
", try using != instead.", SparseEfficiencyWarning,
stacklevel=3)
all_true = self.__class__(np.ones(self.shape, dtype=np.bool_))
inv = self._scalar_binopt(other, operator.ne)
return all_true - inv
else:
return self._scalar_binopt(other, operator.eq)
# Dense other.
elif isdense(other):
return self.todense() == other
# Pydata sparse other.
elif is_pydata_spmatrix(other):
return NotImplemented
# Sparse other.
elif issparse(other):
warn("Comparing sparse matrices using == is inefficient, try using"
" != instead.", SparseEfficiencyWarning, stacklevel=3)
# TODO sparse broadcasting
if self.shape != other.shape:
return False
elif self.format != other.format:
other = other.asformat(self.format)
res = self._binopt(other, '_ne_')
all_true = self.__class__(np.ones(self.shape, dtype=np.bool_))
return all_true - res
else:
return NotImplemented
def __ne__(self, other):
# Scalar other.
if isscalarlike(other):
if np.isnan(other):
warn("Comparing a sparse matrix with nan using != is"
" inefficient", SparseEfficiencyWarning, stacklevel=3)
all_true = self.__class__(np.ones(self.shape, dtype=np.bool_))
return all_true
elif other != 0:
warn("Comparing a sparse matrix with a nonzero scalar using !="
" is inefficient, try using == instead.",
SparseEfficiencyWarning, stacklevel=3)
all_true = self.__class__(np.ones(self.shape), dtype=np.bool_)
inv = self._scalar_binopt(other, operator.eq)
return all_true - inv
else:
return self._scalar_binopt(other, operator.ne)
# Dense other.
elif isdense(other):
return self.todense() != other
# Pydata sparse other.
elif is_pydata_spmatrix(other):
return NotImplemented
# Sparse other.
elif issparse(other):
# TODO sparse broadcasting
if self.shape != other.shape:
return True
elif self.format != other.format:
other = other.asformat(self.format)
return self._binopt(other, '_ne_')
else:
return NotImplemented
def _inequality(self, other, op, op_name, bad_scalar_msg):
# Scalar other.
if isscalarlike(other):
if 0 == other and op_name in ('_le_', '_ge_'):
raise NotImplementedError(" >= and <= don't work with 0.")
elif op(0, other):
warn(bad_scalar_msg, SparseEfficiencyWarning, stacklevel=3)
other_arr = np.empty(self.shape, dtype=np.result_type(other))
other_arr.fill(other)
other_arr = self.__class__(other_arr)
return self._binopt(other_arr, op_name)
else:
return self._scalar_binopt(other, op)
# Dense other.
elif isdense(other):
return op(self.todense(), other)
# Sparse other.
elif issparse(other):
# TODO sparse broadcasting
if self.shape != other.shape:
raise ValueError("inconsistent shapes")
elif self.format != other.format:
other = other.asformat(self.format)
if op_name not in ('_ge_', '_le_'):
return self._binopt(other, op_name)
warn("Comparing sparse matrices using >= and <= is inefficient, "
"using <, >, or !=, instead.",
SparseEfficiencyWarning, stacklevel=3)
all_true = self.__class__(np.ones(self.shape, dtype=np.bool_))
res = self._binopt(other, '_gt_' if op_name == '_le_' else '_lt_')
return all_true - res
else:
return NotImplemented
def __lt__(self, other):
return self._inequality(other, operator.lt, '_lt_',
"Comparing a sparse matrix with a scalar "
"greater than zero using < is inefficient, "
"try using >= instead.")
def __gt__(self, other):
return self._inequality(other, operator.gt, '_gt_',
"Comparing a sparse matrix with a scalar "
"less than zero using > is inefficient, "
"try using <= instead.")
def __le__(self, other):
return self._inequality(other, operator.le, '_le_',
"Comparing a sparse matrix with a scalar "
"greater than zero using <= is inefficient, "
"try using > instead.")
def __ge__(self, other):
return self._inequality(other, operator.ge, '_ge_',
"Comparing a sparse matrix with a scalar "
"less than zero using >= is inefficient, "
"try using < instead.")
#################################
# Arithmetic operator overrides #
#################################
def _add_dense(self, other):
if other.shape != self.shape:
raise ValueError(f'Incompatible shapes ({self.shape} and {other.shape})')
dtype = upcast_char(self.dtype.char, other.dtype.char)
order = self._swap('CF')[0]
result = np.array(other, dtype=dtype, order=order, copy=True)
M, N = self._swap(self.shape)
y = result if result.flags.c_contiguous else result.T
csr_todense(M, N, self.indptr, self.indices, self.data, y)
return self._container(result, copy=False)
def _add_sparse(self, other):
return self._binopt(other, '_plus_')
def _sub_sparse(self, other):
return self._binopt(other, '_minus_')
def multiply(self, other):
"""Point-wise multiplication by another array/matrix, vector, or
scalar.
"""
# Scalar multiplication.
if isscalarlike(other):
return self._mul_scalar(other)
# Sparse matrix or vector.
if issparse(other):
if self.shape == other.shape:
other = self.__class__(other)
return self._binopt(other, '_elmul_')
if other.ndim == 1:
raise TypeError("broadcast from a 1d array not yet supported")
# Single element.
elif other.shape == (1, 1):
return self._mul_scalar(other.toarray()[0, 0])
elif self.shape == (1, 1):
return other._mul_scalar(self.toarray()[0, 0])
# A row times a column.
elif self.shape[1] == 1 and other.shape[0] == 1:
return self._matmul_sparse(other.tocsc())
elif self.shape[0] == 1 and other.shape[1] == 1:
return other._matmul_sparse(self.tocsc())
# Row vector times matrix. other is a row.
elif other.shape[0] == 1 and self.shape[1] == other.shape[1]:
other = self._dia_container(
(other.toarray().ravel(), [0]),
shape=(other.shape[1], other.shape[1])
)
return self._matmul_sparse(other)
# self is a row.
elif self.shape[0] == 1 and self.shape[1] == other.shape[1]:
copy = self._dia_container(
(self.toarray().ravel(), [0]),
shape=(self.shape[1], self.shape[1])
)
return other._matmul_sparse(copy)
# Column vector times matrix. other is a column.
elif other.shape[1] == 1 and self.shape[0] == other.shape[0]:
other = self._dia_container(
(other.toarray().ravel(), [0]),
shape=(other.shape[0], other.shape[0])
)
return other._matmul_sparse(self)
# self is a column.
elif self.shape[1] == 1 and self.shape[0] == other.shape[0]:
copy = self._dia_container(
(self.toarray().ravel(), [0]),
shape=(self.shape[0], self.shape[0])
)
return copy._matmul_sparse(other)
else:
raise ValueError("inconsistent shapes")
# Assume other is a dense matrix/array, which produces a single-item
# object array if other isn't convertible to ndarray.
other = np.atleast_2d(other)
if other.ndim != 2:
return np.multiply(self.toarray(), other)
# Single element / wrapped object.
if other.size == 1:
if other.dtype == np.object_:
# 'other' not convertible to ndarray.
return NotImplemented
return self._mul_scalar(other.flat[0])
# Fast case for trivial sparse matrix.
elif self.shape == (1, 1):
return np.multiply(self.toarray()[0, 0], other)
ret = self.tocoo()
# Matching shapes.
if self.shape == other.shape:
data = np.multiply(ret.data, other[ret.row, ret.col])
# Sparse row vector times...
elif self.shape[0] == 1:
if other.shape[1] == 1: # Dense column vector.
data = np.multiply(ret.data, other)
elif other.shape[1] == self.shape[1]: # Dense matrix.
data = np.multiply(ret.data, other[:, ret.col])
else:
raise ValueError("inconsistent shapes")
row = np.repeat(np.arange(other.shape[0]), len(ret.row))
col = np.tile(ret.col, other.shape[0])
return self._coo_container(
(data.view(np.ndarray).ravel(), (row, col)),
shape=(other.shape[0], self.shape[1]),
copy=False
)
# Sparse column vector times...
elif self.shape[1] == 1:
if other.shape[0] == 1: # Dense row vector.
data = np.multiply(ret.data[:, None], other)
elif other.shape[0] == self.shape[0]: # Dense matrix.
data = np.multiply(ret.data[:, None], other[ret.row])
else:
raise ValueError("inconsistent shapes")
row = np.repeat(ret.row, other.shape[1])
col = np.tile(np.arange(other.shape[1]), len(ret.col))
return self._coo_container(
(data.view(np.ndarray).ravel(), (row, col)),
shape=(self.shape[0], other.shape[1]),
copy=False
)
# Sparse matrix times dense row vector.
elif other.shape[0] == 1 and self.shape[1] == other.shape[1]:
data = np.multiply(ret.data, other[:, ret.col].ravel())
# Sparse matrix times dense column vector.
elif other.shape[1] == 1 and self.shape[0] == other.shape[0]:
data = np.multiply(ret.data, other[ret.row].ravel())
else:
raise ValueError("inconsistent shapes")
ret.data = data.view(np.ndarray).ravel()
return ret
###########################
# Multiplication handlers #
###########################
def _matmul_vector(self, other):
M, N = self.shape
# output array
result = np.zeros(M, dtype=upcast_char(self.dtype.char,
other.dtype.char))
# csr_matvec or csc_matvec
fn = getattr(_sparsetools, self.format + '_matvec')
fn(M, N, self.indptr, self.indices, self.data, other, result)
return result
def _matmul_multivector(self, other):
M, N = self.shape
n_vecs = other.shape[1] # number of column vectors
result = np.zeros((M, n_vecs),
dtype=upcast_char(self.dtype.char, other.dtype.char))
# csr_matvecs or csc_matvecs
fn = getattr(_sparsetools, self.format + '_matvecs')
fn(M, N, n_vecs, self.indptr, self.indices, self.data,
other.ravel(), result.ravel())
return result
def _matmul_sparse(self, other):
M, K1 = self.shape
K2, N = other.shape
major_axis = self._swap((M, N))[0]
other = self.__class__(other) # convert to this format
idx_dtype = self._get_index_dtype((self.indptr, self.indices,
other.indptr, other.indices))
fn = getattr(_sparsetools, self.format + '_matmat_maxnnz')
nnz = fn(M, N,
np.asarray(self.indptr, dtype=idx_dtype),
np.asarray(self.indices, dtype=idx_dtype),
np.asarray(other.indptr, dtype=idx_dtype),
np.asarray(other.indices, dtype=idx_dtype))
idx_dtype = self._get_index_dtype((self.indptr, self.indices,
other.indptr, other.indices),
maxval=nnz)
indptr = np.empty(major_axis + 1, dtype=idx_dtype)
indices = np.empty(nnz, dtype=idx_dtype)
data = np.empty(nnz, dtype=upcast(self.dtype, other.dtype))
fn = getattr(_sparsetools, self.format + '_matmat')
fn(M, N, np.asarray(self.indptr, dtype=idx_dtype),
np.asarray(self.indices, dtype=idx_dtype),
self.data,
np.asarray(other.indptr, dtype=idx_dtype),
np.asarray(other.indices, dtype=idx_dtype),
other.data,
indptr, indices, data)
return self.__class__((data, indices, indptr), shape=(M, N))
def diagonal(self, k=0):
rows, cols = self.shape
if k <= -rows or k >= cols:
return np.empty(0, dtype=self.data.dtype)
fn = getattr(_sparsetools, self.format + "_diagonal")
y = np.empty(min(rows + min(k, 0), cols - max(k, 0)),
dtype=upcast(self.dtype))
fn(k, self.shape[0], self.shape[1], self.indptr, self.indices,
self.data, y)
return y
diagonal.__doc__ = _spbase.diagonal.__doc__
#####################
# Other binary ops #
#####################
def _maximum_minimum(self, other, npop, op_name, dense_check):
if isscalarlike(other):
if dense_check(other):
warn("Taking maximum (minimum) with > 0 (< 0) number results"
" to a dense matrix.", SparseEfficiencyWarning,
stacklevel=3)
other_arr = np.empty(self.shape, dtype=np.asarray(other).dtype)
other_arr.fill(other)
other_arr = self.__class__(other_arr)
return self._binopt(other_arr, op_name)
else:
self.sum_duplicates()
new_data = npop(self.data, np.asarray(other))
mat = self.__class__((new_data, self.indices, self.indptr),
dtype=new_data.dtype, shape=self.shape)
return mat
elif isdense(other):
return npop(self.todense(), other)
elif issparse(other):
return self._binopt(other, op_name)
else:
raise ValueError("Operands not compatible.")
def maximum(self, other):
return self._maximum_minimum(other, np.maximum,
'_maximum_', lambda x: np.asarray(x) > 0)
maximum.__doc__ = _spbase.maximum.__doc__
def minimum(self, other):
return self._maximum_minimum(other, np.minimum,
'_minimum_', lambda x: np.asarray(x) < 0)
minimum.__doc__ = _spbase.minimum.__doc__
#####################
# Reduce operations #
#####################
def sum(self, axis=None, dtype=None, out=None):
"""Sum the array/matrix over the given axis. If the axis is None, sum
over both rows and columns, returning a scalar.
"""
# The _spbase base class already does axis=0 and axis=1 efficiently
# so we only do the case axis=None here
if (not hasattr(self, 'blocksize') and
axis in self._swap(((1, -1), (0, 2)))[0]):
# faster than multiplication for large minor axis in CSC/CSR
res_dtype = get_sum_dtype(self.dtype)
ret = np.zeros(len(self.indptr) - 1, dtype=res_dtype)
major_index, value = self._minor_reduce(np.add)
ret[major_index] = value
ret = self._ascontainer(ret)
if axis % 2 == 1:
ret = ret.T
if out is not None and out.shape != ret.shape:
raise ValueError('dimensions do not match')
return ret.sum(axis=(), dtype=dtype, out=out)
# _spbase will handle the remaining situations when axis
# is in {None, -1, 0, 1}
else:
return _spbase.sum(self, axis=axis, dtype=dtype, out=out)
sum.__doc__ = _spbase.sum.__doc__
def _minor_reduce(self, ufunc, data=None):
"""Reduce nonzeros with a ufunc over the minor axis when non-empty
Can be applied to a function of self.data by supplying data parameter.
Warning: this does not call sum_duplicates()
Returns
-------
major_index : array of ints
Major indices where nonzero
value : array of self.dtype
Reduce result for nonzeros in each major_index
"""
if data is None:
data = self.data
major_index = np.flatnonzero(np.diff(self.indptr))
value = ufunc.reduceat(data,
downcast_intp_index(self.indptr[major_index]))
return major_index, value
#######################
# Getting and Setting #
#######################
def _get_intXint(self, row, col):
M, N = self._swap(self.shape)
major, minor = self._swap((row, col))
indptr, indices, data = get_csr_submatrix(
M, N, self.indptr, self.indices, self.data,
major, major + 1, minor, minor + 1)
return data.sum(dtype=self.dtype)
def _get_sliceXslice(self, row, col):
major, minor = self._swap((row, col))
if major.step in (1, None) and minor.step in (1, None):
return self._get_submatrix(major, minor, copy=True)
return self._major_slice(major)._minor_slice(minor)
def _get_arrayXarray(self, row, col):
# inner indexing
idx_dtype = self.indices.dtype
M, N = self._swap(self.shape)
major, minor = self._swap((row, col))
major = np.asarray(major, dtype=idx_dtype)
minor = np.asarray(minor, dtype=idx_dtype)
val = np.empty(major.size, dtype=self.dtype)
csr_sample_values(M, N, self.indptr, self.indices, self.data,
major.size, major.ravel(), minor.ravel(), val)
if major.ndim == 1:
return self._ascontainer(val)
return self.__class__(val.reshape(major.shape))
def _get_columnXarray(self, row, col):
# outer indexing
major, minor = self._swap((row, col))
return self._major_index_fancy(major)._minor_index_fancy(minor)
def _major_index_fancy(self, idx):
"""Index along the major axis where idx is an array of ints.
"""
idx_dtype = self._get_index_dtype((self.indptr, self.indices))
indices = np.asarray(idx, dtype=idx_dtype).ravel()
_, N = self._swap(self.shape)
M = len(indices)
new_shape = self._swap((M, N))
if M == 0:
return self.__class__(new_shape, dtype=self.dtype)
row_nnz = (self.indptr[indices + 1] - self.indptr[indices]).astype(idx_dtype)
res_indptr = np.zeros(M+1, dtype=idx_dtype)
np.cumsum(row_nnz, out=res_indptr[1:])
nnz = res_indptr[-1]
res_indices = np.empty(nnz, dtype=idx_dtype)
res_data = np.empty(nnz, dtype=self.dtype)
csr_row_index(
M,
indices,
self.indptr.astype(idx_dtype, copy=False),
self.indices.astype(idx_dtype, copy=False),
self.data,
res_indices,
res_data
)
return self.__class__((res_data, res_indices, res_indptr),
shape=new_shape, copy=False)
def _major_slice(self, idx, copy=False):
"""Index along the major axis where idx is a slice object.
"""
if idx == slice(None):
return self.copy() if copy else self
M, N = self._swap(self.shape)
start, stop, step = idx.indices(M)
M = len(range(start, stop, step))
new_shape = self._swap((M, N))
if M == 0:
return self.__class__(new_shape, dtype=self.dtype)
# Work out what slices are needed for `row_nnz`
# start,stop can be -1, only if step is negative
start0, stop0 = start, stop
if stop == -1 and start >= 0:
stop0 = None
start1, stop1 = start + 1, stop + 1
row_nnz = self.indptr[start1:stop1:step] - \
self.indptr[start0:stop0:step]
idx_dtype = self.indices.dtype
res_indptr = np.zeros(M+1, dtype=idx_dtype)
np.cumsum(row_nnz, out=res_indptr[1:])
if step == 1:
all_idx = slice(self.indptr[start], self.indptr[stop])
res_indices = np.array(self.indices[all_idx], copy=copy)
res_data = np.array(self.data[all_idx], copy=copy)
else:
nnz = res_indptr[-1]
res_indices = np.empty(nnz, dtype=idx_dtype)
res_data = np.empty(nnz, dtype=self.dtype)
csr_row_slice(start, stop, step, self.indptr, self.indices,
self.data, res_indices, res_data)
return self.__class__((res_data, res_indices, res_indptr),
shape=new_shape, copy=False)
def _minor_index_fancy(self, idx):
"""Index along the minor axis where idx is an array of ints.
"""
idx_dtype = self._get_index_dtype((self.indices, self.indptr))
indices = self.indices.astype(idx_dtype, copy=False)
indptr = self.indptr.astype(idx_dtype, copy=False)
idx = np.asarray(idx, dtype=idx_dtype).ravel()
M, N = self._swap(self.shape)
k = len(idx)
new_shape = self._swap((M, k))
if k == 0:
return self.__class__(new_shape, dtype=self.dtype)
# pass 1: count idx entries and compute new indptr
col_offsets = np.zeros(N, dtype=idx_dtype)
res_indptr = np.empty_like(self.indptr, dtype=idx_dtype)
csr_column_index1(
k,
idx,
M,
N,
indptr,
indices,
col_offsets,
res_indptr,
)
# pass 2: copy indices/data for selected idxs
col_order = np.argsort(idx).astype(idx_dtype, copy=False)
nnz = res_indptr[-1]
res_indices = np.empty(nnz, dtype=idx_dtype)
res_data = np.empty(nnz, dtype=self.dtype)
csr_column_index2(col_order, col_offsets, len(self.indices),
indices, self.data, res_indices, res_data)
return self.__class__((res_data, res_indices, res_indptr),
shape=new_shape, copy=False)
def _minor_slice(self, idx, copy=False):
"""Index along the minor axis where idx is a slice object.
"""
if idx == slice(None):
return self.copy() if copy else self
M, N = self._swap(self.shape)
start, stop, step = idx.indices(N)
N = len(range(start, stop, step))
if N == 0:
return self.__class__(self._swap((M, N)), dtype=self.dtype)
if step == 1:
return self._get_submatrix(minor=idx, copy=copy)
# TODO: don't fall back to fancy indexing here
return self._minor_index_fancy(np.arange(start, stop, step))
def _get_submatrix(self, major=None, minor=None, copy=False):
"""Return a submatrix of this matrix.
major, minor: None, int, or slice with step 1
"""
M, N = self._swap(self.shape)
i0, i1 = _process_slice(major, M)
j0, j1 = _process_slice(minor, N)
if i0 == 0 and j0 == 0 and i1 == M and j1 == N:
return self.copy() if copy else self
indptr, indices, data = get_csr_submatrix(
M, N, self.indptr, self.indices, self.data, i0, i1, j0, j1)
shape = self._swap((i1 - i0, j1 - j0))
return self.__class__((data, indices, indptr), shape=shape,
dtype=self.dtype, copy=False)
def _set_intXint(self, row, col, x):
i, j = self._swap((row, col))
self._set_many(i, j, x)
def _set_arrayXarray(self, row, col, x):
i, j = self._swap((row, col))
self._set_many(i, j, x)
def _set_arrayXarray_sparse(self, row, col, x):
# clear entries that will be overwritten
self._zero_many(*self._swap((row, col)))
M, N = row.shape # matches col.shape
broadcast_row = M != 1 and x.shape[0] == 1
broadcast_col = N != 1 and x.shape[1] == 1
r, c = x.row, x.col
x = np.asarray(x.data, dtype=self.dtype)
if x.size == 0:
return
if broadcast_row:
r = np.repeat(np.arange(M), len(r))
c = np.tile(c, M)
x = np.tile(x, M)
if broadcast_col:
r = np.repeat(r, N)
c = np.tile(np.arange(N), len(c))
x = np.repeat(x, N)
# only assign entries in the new sparsity structure
i, j = self._swap((row[r, c], col[r, c]))
self._set_many(i, j, x)
def _setdiag(self, values, k):
if 0 in self.shape:
return
M, N = self.shape
broadcast = (values.ndim == 0)
if k < 0:
if broadcast:
max_index = min(M + k, N)
else:
max_index = min(M + k, N, len(values))
i = np.arange(-k, max_index - k, dtype=self.indices.dtype)
j = np.arange(max_index, dtype=self.indices.dtype)
else:
if broadcast:
max_index = min(M, N - k)
else:
max_index = min(M, N - k, len(values))
i = np.arange(max_index, dtype=self.indices.dtype)
j = np.arange(k, k + max_index, dtype=self.indices.dtype)
if not broadcast:
values = values[:len(i)]
x = np.atleast_1d(np.asarray(values, dtype=self.dtype)).ravel()
if x.squeeze().shape != i.squeeze().shape:
x = np.broadcast_to(x, i.shape)
if x.size == 0:
return
M, N = self._swap((M, N))
i, j = self._swap((i, j))
n_samples = x.size
offsets = np.empty(n_samples, dtype=self.indices.dtype)
ret = csr_sample_offsets(M, N, self.indptr, self.indices, n_samples,
i, j, offsets)
if ret == 1:
# rinse and repeat
self.sum_duplicates()
csr_sample_offsets(M, N, self.indptr, self.indices, n_samples,
i, j, offsets)
if -1 not in offsets:
# only affects existing non-zero cells
self.data[offsets] = x
return
mask = (offsets <= -1)
# Boundary between csc and convert to coo
# The value 0.001 is justified in gh-19962#issuecomment-1920499678
if mask.sum() < self.nnz * 0.001:
# create new entries
i = i[mask]
j = j[mask]
self._insert_many(i, j, x[mask])
# replace existing entries
mask = ~mask
self.data[offsets[mask]] = x[mask]
else:
# convert to coo for _set_diag
coo = self.tocoo()
coo._setdiag(values, k)
arrays = coo._coo_to_compressed(self._swap)
self.indptr, self.indices, self.data, _ = arrays
def _prepare_indices(self, i, j):
M, N = self._swap(self.shape)
def check_bounds(indices, bound):
idx = indices.max()
if idx >= bound:
raise IndexError('index (%d) out of range (>= %d)' %
(idx, bound))
idx = indices.min()
if idx < -bound:
raise IndexError('index (%d) out of range (< -%d)' %
(idx, bound))
i = np.atleast_1d(np.asarray(i, dtype=self.indices.dtype)).ravel()
j = np.atleast_1d(np.asarray(j, dtype=self.indices.dtype)).ravel()
check_bounds(i, M)
check_bounds(j, N)
return i, j, M, N
def _set_many(self, i, j, x):
"""Sets value at each (i, j) to x
Here (i,j) index major and minor respectively, and must not contain
duplicate entries.
"""
i, j, M, N = self._prepare_indices(i, j)
x = np.atleast_1d(np.asarray(x, dtype=self.dtype)).ravel()
n_samples = x.size
offsets = np.empty(n_samples, dtype=self.indices.dtype)
ret = csr_sample_offsets(M, N, self.indptr, self.indices, n_samples,
i, j, offsets)
if ret == 1:
# rinse and repeat
self.sum_duplicates()
csr_sample_offsets(M, N, self.indptr, self.indices, n_samples,
i, j, offsets)
if -1 not in offsets:
# only affects existing non-zero cells
self.data[offsets] = x
return
else:
warn("Changing the sparsity structure of a {}_matrix is expensive."
" lil_matrix is more efficient.".format(self.format),
SparseEfficiencyWarning, stacklevel=3)
# replace where possible
mask = offsets > -1
self.data[offsets[mask]] = x[mask]
# only insertions remain
mask = ~mask
i = i[mask]
i[i < 0] += M
j = j[mask]
j[j < 0] += N
self._insert_many(i, j, x[mask])
def _zero_many(self, i, j):
"""Sets value at each (i, j) to zero, preserving sparsity structure.
Here (i,j) index major and minor respectively.
"""
i, j, M, N = self._prepare_indices(i, j)
n_samples = len(i)
offsets = np.empty(n_samples, dtype=self.indices.dtype)
ret = csr_sample_offsets(M, N, self.indptr, self.indices, n_samples,
i, j, offsets)
if ret == 1:
# rinse and repeat
self.sum_duplicates()
csr_sample_offsets(M, N, self.indptr, self.indices, n_samples,
i, j, offsets)
# only assign zeros to the existing sparsity structure
self.data[offsets[offsets > -1]] = 0
def _insert_many(self, i, j, x):
"""Inserts new nonzero at each (i, j) with value x
Here (i,j) index major and minor respectively.
i, j and x must be non-empty, 1d arrays.
Inserts each major group (e.g. all entries per row) at a time.
Maintains has_sorted_indices property.
Modifies i, j, x in place.
"""
order = np.argsort(i, kind='mergesort') # stable for duplicates
i = i.take(order, mode='clip')
j = j.take(order, mode='clip')
x = x.take(order, mode='clip')
do_sort = self.has_sorted_indices
# Update index data type
idx_dtype = self._get_index_dtype((self.indices, self.indptr),
maxval=(self.indptr[-1] + x.size))
self.indptr = np.asarray(self.indptr, dtype=idx_dtype)
self.indices = np.asarray(self.indices, dtype=idx_dtype)
i = np.asarray(i, dtype=idx_dtype)
j = np.asarray(j, dtype=idx_dtype)
# Collate old and new in chunks by major index
indices_parts = []
data_parts = []
ui, ui_indptr = np.unique(i, return_index=True)
ui_indptr = np.append(ui_indptr, len(j))
new_nnzs = np.diff(ui_indptr)
prev = 0
for c, (ii, js, je) in enumerate(zip(ui, ui_indptr, ui_indptr[1:])):
# old entries
start = self.indptr[prev]
stop = self.indptr[ii]
indices_parts.append(self.indices[start:stop])
data_parts.append(self.data[start:stop])
# handle duplicate j: keep last setting
uj, uj_indptr = np.unique(j[js:je][::-1], return_index=True)
if len(uj) == je - js:
indices_parts.append(j[js:je])
data_parts.append(x[js:je])
else:
indices_parts.append(j[js:je][::-1][uj_indptr])
data_parts.append(x[js:je][::-1][uj_indptr])
new_nnzs[c] = len(uj)
prev = ii
# remaining old entries
start = self.indptr[ii]
indices_parts.append(self.indices[start:])
data_parts.append(self.data[start:])
# update attributes
self.indices = np.concatenate(indices_parts)
self.data = np.concatenate(data_parts)
nnzs = np.empty(self.indptr.shape, dtype=idx_dtype)
nnzs[0] = idx_dtype(0)
indptr_diff = np.diff(self.indptr)
indptr_diff[ui] += new_nnzs
nnzs[1:] = indptr_diff
self.indptr = np.cumsum(nnzs, out=nnzs)
if do_sort:
# TODO: only sort where necessary
self.has_sorted_indices = False
self.sort_indices()
self.check_format(full_check=False)
######################
# Conversion methods #
######################
def tocoo(self, copy=True):
major_dim, minor_dim = self._swap(self.shape)
minor_indices = self.indices
major_indices = np.empty(len(minor_indices), dtype=self.indices.dtype)
_sparsetools.expandptr(major_dim, self.indptr, major_indices)
coords = self._swap((major_indices, minor_indices))
return self._coo_container(
(self.data, coords), self.shape, copy=copy, dtype=self.dtype
)
tocoo.__doc__ = _spbase.tocoo.__doc__
def toarray(self, order=None, out=None):
if out is None and order is None:
order = self._swap('cf')[0]
out = self._process_toarray_args(order, out)
if not (out.flags.c_contiguous or out.flags.f_contiguous):
raise ValueError('Output array must be C or F contiguous')
# align ideal order with output array order
if out.flags.c_contiguous:
x = self.tocsr()
y = out
else:
x = self.tocsc()
y = out.T
M, N = x._swap(x.shape)
csr_todense(M, N, x.indptr, x.indices, x.data, y)
return out
toarray.__doc__ = _spbase.toarray.__doc__
##############################################################
# methods that examine or modify the internal data structure #
##############################################################
def eliminate_zeros(self):
"""Remove zero entries from the array/matrix
This is an *in place* operation.
"""
M, N = self._swap(self.shape)
_sparsetools.csr_eliminate_zeros(M, N, self.indptr, self.indices,
self.data)
self.prune() # nnz may have changed
@property
def has_canonical_format(self) -> bool:
"""Whether the array/matrix has sorted indices and no duplicates
Returns
- True: if the above applies
- False: otherwise
has_canonical_format implies has_sorted_indices, so if the latter flag
is False, so will the former be; if the former is found True, the
latter flag is also set.
"""
# first check to see if result was cached
if not getattr(self, '_has_sorted_indices', True):
# not sorted => not canonical
self._has_canonical_format = False
elif not hasattr(self, '_has_canonical_format'):
self.has_canonical_format = bool(
_sparsetools.csr_has_canonical_format(
len(self.indptr) - 1, self.indptr, self.indices)
)
return self._has_canonical_format
@has_canonical_format.setter
def has_canonical_format(self, val: bool):
self._has_canonical_format = bool(val)
if val:
self.has_sorted_indices = True
def sum_duplicates(self):
"""Eliminate duplicate entries by adding them together
This is an *in place* operation.
"""
if self.has_canonical_format:
return
self.sort_indices()
M, N = self._swap(self.shape)
_sparsetools.csr_sum_duplicates(M, N, self.indptr, self.indices,
self.data)
self.prune() # nnz may have changed
self.has_canonical_format = True
@property
def has_sorted_indices(self) -> bool:
"""Whether the indices are sorted
Returns
- True: if the indices of the array/matrix are in sorted order
- False: otherwise
"""
# first check to see if result was cached
if not hasattr(self, '_has_sorted_indices'):
self._has_sorted_indices = bool(
_sparsetools.csr_has_sorted_indices(
len(self.indptr) - 1, self.indptr, self.indices)
)
return self._has_sorted_indices
@has_sorted_indices.setter
def has_sorted_indices(self, val: bool):
self._has_sorted_indices = bool(val)
def sorted_indices(self):
"""Return a copy of this array/matrix with sorted indices
"""
A = self.copy()
A.sort_indices()
return A
# an alternative that has linear complexity is the following
# although the previous option is typically faster
# return self.toother().toother()
def sort_indices(self):
"""Sort the indices of this array/matrix *in place*
"""
if not self.has_sorted_indices:
_sparsetools.csr_sort_indices(len(self.indptr) - 1, self.indptr,
self.indices, self.data)
self.has_sorted_indices = True
def prune(self):
"""Remove empty space after all non-zero elements.
"""
major_dim = self._swap(self.shape)[0]
if len(self.indptr) != major_dim + 1:
raise ValueError('index pointer has invalid length')
if len(self.indices) < self.nnz:
raise ValueError('indices array has fewer than nnz elements')
if len(self.data) < self.nnz:
raise ValueError('data array has fewer than nnz elements')
self.indices = _prune_array(self.indices[:self.nnz])
self.data = _prune_array(self.data[:self.nnz])
def resize(self, *shape):
shape = check_shape(shape)
if hasattr(self, 'blocksize'):
bm, bn = self.blocksize
new_M, rm = divmod(shape[0], bm)
new_N, rn = divmod(shape[1], bn)
if rm or rn:
raise ValueError("shape must be divisible into {} blocks. "
"Got {}".format(self.blocksize, shape))
M, N = self.shape[0] // bm, self.shape[1] // bn
else:
new_M, new_N = self._swap(shape)
M, N = self._swap(self.shape)
if new_M < M:
self.indices = self.indices[:self.indptr[new_M]]
self.data = self.data[:self.indptr[new_M]]
self.indptr = self.indptr[:new_M + 1]
elif new_M > M:
self.indptr = np.resize(self.indptr, new_M + 1)
self.indptr[M + 1:].fill(self.indptr[M])
if new_N < N:
mask = self.indices < new_N
if not np.all(mask):
self.indices = self.indices[mask]
self.data = self.data[mask]
major_index, val = self._minor_reduce(np.add, mask)
self.indptr.fill(0)
self.indptr[1:][major_index] = val
np.cumsum(self.indptr, out=self.indptr)
self._shape = shape
resize.__doc__ = _spbase.resize.__doc__
###################
# utility methods #
###################
# needed by _data_matrix
def _with_data(self, data, copy=True):
"""Returns a matrix with the same sparsity structure as self,
but with different data. By default the structure arrays
(i.e. .indptr and .indices) are copied.
"""
if copy:
return self.__class__((data, self.indices.copy(),
self.indptr.copy()),
shape=self.shape,
dtype=data.dtype)
else:
return self.__class__((data, self.indices, self.indptr),
shape=self.shape, dtype=data.dtype)
def _binopt(self, other, op):
"""apply the binary operation fn to two sparse matrices."""
other = self.__class__(other)
# e.g. csr_plus_csr, csr_minus_csr, etc.
fn = getattr(_sparsetools, self.format + op + self.format)
maxnnz = self.nnz + other.nnz
idx_dtype = self._get_index_dtype((self.indptr, self.indices,
other.indptr, other.indices),
maxval=maxnnz)
indptr = np.empty(self.indptr.shape, dtype=idx_dtype)
indices = np.empty(maxnnz, dtype=idx_dtype)
bool_ops = ['_ne_', '_lt_', '_gt_', '_le_', '_ge_']
if op in bool_ops:
data = np.empty(maxnnz, dtype=np.bool_)
else:
data = np.empty(maxnnz, dtype=upcast(self.dtype, other.dtype))
fn(self.shape[0], self.shape[1],
np.asarray(self.indptr, dtype=idx_dtype),
np.asarray(self.indices, dtype=idx_dtype),
self.data,
np.asarray(other.indptr, dtype=idx_dtype),
np.asarray(other.indices, dtype=idx_dtype),
other.data,
indptr, indices, data)
A = self.__class__((data, indices, indptr), shape=self.shape)
A.prune()
return A
def _divide_sparse(self, other):
"""
Divide this matrix by a second sparse matrix.
"""
if other.shape != self.shape:
raise ValueError('inconsistent shapes')
r = self._binopt(other, '_eldiv_')
if np.issubdtype(r.dtype, np.inexact):
# Eldiv leaves entries outside the combined sparsity
# pattern empty, so they must be filled manually.
# Everything outside of other's sparsity is NaN, and everything
# inside it is either zero or defined by eldiv.
out = np.empty(self.shape, dtype=self.dtype)
out.fill(np.nan)
row, col = other.nonzero()
out[row, col] = 0
r = r.tocoo()
out[r.row, r.col] = r.data
out = self._container(out)
else:
# integers types go with nan <-> 0
out = r
return out
def _process_slice(sl, num):
if sl is None:
i0, i1 = 0, num
elif isinstance(sl, slice):
i0, i1, stride = sl.indices(num)
if stride != 1:
raise ValueError('slicing with step != 1 not supported')
i0 = min(i0, i1) # give an empty slice when i0 > i1
elif isintlike(sl):
if sl < 0:
sl += num
i0, i1 = sl, sl + 1
if i0 < 0 or i1 > num:
raise IndexError('index out of bounds: 0 <= %d < %d <= %d' %
(i0, i1, num))
else:
raise TypeError('expected slice or scalar')
return i0, i1