"""
The :mod:`sklearn.utils.sparsefuncs_fast` module includes a collection of
utilities to work with sparse matrices and arrays written in Cython.
"""

# Authors: Mathieu Blondel
#          Olivier Grisel
#          Peter Prettenhofer
#          Lars Buitinck
#          Giorgio Patrini
#
# License: BSD 3 clause

from libc.math cimport fabs, sqrt, isnan
from libc.stdint cimport intptr_t

import numpy as np
from cython cimport floating
from ..utils._typedefs cimport float64_t, int32_t, int64_t, intp_t, uint64_t


ctypedef fused integral:
    int32_t
    int64_t


def csr_row_norms(X):
    """Squared L2 norm of each row in CSR matrix X."""
    if X.dtype not in [np.float32, np.float64]:
        X = X.astype(np.float64)
    return _sqeuclidean_row_norms_sparse(X.data, X.indptr)


def _sqeuclidean_row_norms_sparse(
    const floating[::1] X_data,
    const integral[::1] X_indptr,
):
    cdef:
        integral n_samples = X_indptr.shape[0] - 1
        integral i, j

    dtype = np.float32 if floating is float else np.float64

    cdef floating[::1] squared_row_norms = np.zeros(n_samples, dtype=dtype)

    with nogil:
        for i in range(n_samples):
            for j in range(X_indptr[i], X_indptr[i + 1]):
                squared_row_norms[i] += X_data[j] * X_data[j]

    return np.asarray(squared_row_norms)
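

# Illustrative usage for `csr_row_norms` (a sketch, not part of the module;
# assumes SciPy is available). Row 0 holds [3, 4], so its squared L2 norm is
# 3**2 + 4**2 = 25; row 1 holds [2], giving 4:
#
#     >>> from scipy.sparse import csr_matrix
#     >>> X = csr_matrix([[3., 4., 0.], [0., 0., 2.]])
#     >>> csr_row_norms(X)
#     array([25.,  4.])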


def csr_mean_variance_axis0(X, weights=None, return_sum_weights=False):
    """Compute mean and variance along axis 0 on a CSR matrix.

    Uses a np.float64 accumulator.

    Parameters
    ----------
    X : CSR sparse matrix, shape (n_samples, n_features)
        Input data.

    weights : ndarray of shape (n_samples,), dtype=floating, default=None
        If it is set to None, samples will be equally weighted.

        .. versionadded:: 0.24

    return_sum_weights : bool, default=False
        If True, returns the sum of weights seen for each feature.

        .. versionadded:: 0.24

    Returns
    -------
    means : float array with shape (n_features,)
        Feature-wise means.

    variances : float array with shape (n_features,)
        Feature-wise variances.

    sum_weights : ndarray of shape (n_features,), dtype=floating
        Returned if return_sum_weights is True.
    """
    if X.dtype not in [np.float32, np.float64]:
        X = X.astype(np.float64)

    if weights is None:
        weights = np.ones(X.shape[0], dtype=X.dtype)

    means, variances, sum_weights = _csr_mean_variance_axis0(
        X.data, X.shape[0], X.shape[1], X.indices, X.indptr, weights)

    if return_sum_weights:
        return means, variances, sum_weights
    return means, variances
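

# Illustrative usage for `csr_mean_variance_axis0` (a sketch, not part of the
# module; assumes SciPy is available). Column 0 holds [1, 0], so its mean is
# 0.5 and its variance ((1 - 0.5)**2 + (0 - 0.5)**2) / 2 = 0.25:
#
#     >>> from scipy.sparse import csr_matrix
#     >>> X = csr_matrix([[1., 0., 2.], [0., 3., 0.]])
#     >>> means, variances = csr_mean_variance_axis0(X)
#     >>> means
#     array([0.5, 1.5, 1. ])
#     >>> variances
#     array([0.25, 2.25, 1.  ])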


def _csr_mean_variance_axis0(
    const floating[::1] X_data,
    uint64_t n_samples,
    uint64_t n_features,
    const integral[:] X_indices,
    const integral[:] X_indptr,
    const floating[:] weights,
):
    # Implement the function here since variables using fused types
    # cannot be declared directly and can only be passed as function arguments
    cdef:
        intp_t row_ind
        uint64_t feature_idx
        integral i, col_ind
        float64_t diff
        # means[j] contains the mean of feature j
        float64_t[::1] means = np.zeros(n_features)
        # variances[j] contains the variance of feature j
        float64_t[::1] variances = np.zeros(n_features)

        float64_t[::1] sum_weights = np.full(
            fill_value=np.sum(weights, dtype=np.float64), shape=n_features
        )
        float64_t[::1] sum_weights_nz = np.zeros(shape=n_features)
        float64_t[::1] correction = np.zeros(shape=n_features)

        uint64_t[::1] counts = np.full(
            fill_value=weights.shape[0], shape=n_features, dtype=np.uint64
        )
        uint64_t[::1] counts_nz = np.zeros(shape=n_features, dtype=np.uint64)

    for row_ind in range(len(X_indptr) - 1):
        for i in range(X_indptr[row_ind], X_indptr[row_ind + 1]):
            col_ind = X_indices[i]
            if not isnan(X_data[i]):
                means[col_ind] += <float64_t>(X_data[i]) * weights[row_ind]
                # sum of weights where X[:, col_ind] is non-zero
                sum_weights_nz[col_ind] += weights[row_ind]
                # number of non-zero elements of X[:, col_ind]
                counts_nz[col_ind] += 1
            else:
                # sum of weights where X[:, col_ind] is not nan
                sum_weights[col_ind] -= weights[row_ind]
                # number of non nan elements of X[:, col_ind]
                counts[col_ind] -= 1

    for feature_idx in range(n_features):
        means[feature_idx] /= sum_weights[feature_idx]

    for row_ind in range(len(X_indptr) - 1):
        for i in range(X_indptr[row_ind], X_indptr[row_ind + 1]):
            col_ind = X_indices[i]
            if not isnan(X_data[i]):
                diff = X_data[i] - means[col_ind]
                # correction term of the corrected 2 pass algorithm.
                # See "Algorithms for computing the sample variance: analysis
                # and recommendations", by Chan, Golub, and LeVeque.
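                # With W = sum_weights[j], the variance below is computed as
                #   var_j = (sum_i w_i * (x_ij - mean_j)**2
                #            - (sum_i w_i * (x_ij - mean_j))**2 / W) / W
                # where the subtracted term cancels the rounding error that
                # accumulates in mean_j (implicit zeros are folded in later
                # via the sum_weights - sum_weights_nz terms).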
                correction[col_ind] += diff * weights[row_ind]
                variances[col_ind] += diff * diff * weights[row_ind]

    for feature_idx in range(n_features):
        if counts[feature_idx] != counts_nz[feature_idx]:
            correction[feature_idx] -= (
                sum_weights[feature_idx] - sum_weights_nz[feature_idx]
            ) * means[feature_idx]
        correction[feature_idx] = correction[feature_idx]**2 / sum_weights[feature_idx]
        if counts[feature_idx] != counts_nz[feature_idx]:
            # only compute it when it's guaranteed to be non-zero to avoid
            # catastrophic cancellation.
            variances[feature_idx] += (
                sum_weights[feature_idx] - sum_weights_nz[feature_idx]
            ) * means[feature_idx]**2
        variances[feature_idx] = (
            (variances[feature_idx] - correction[feature_idx]) /
            sum_weights[feature_idx]
        )

    if floating is float:
        return (
            np.array(means, dtype=np.float32),
            np.array(variances, dtype=np.float32),
            np.array(sum_weights, dtype=np.float32),
        )
    else:
        return (
            np.asarray(means), np.asarray(variances), np.asarray(sum_weights)
        )


def csc_mean_variance_axis0(X, weights=None, return_sum_weights=False):
    """Compute mean and variance along axis 0 on a CSC matrix.

    Uses a np.float64 accumulator.

    Parameters
    ----------
    X : CSC sparse matrix, shape (n_samples, n_features)
        Input data.

    weights : ndarray of shape (n_samples,), dtype=floating, default=None
        If it is set to None, samples will be equally weighted.

        .. versionadded:: 0.24

    return_sum_weights : bool, default=False
        If True, returns the sum of weights seen for each feature.

        .. versionadded:: 0.24

    Returns
    -------
    means : float array with shape (n_features,)
        Feature-wise means.

    variances : float array with shape (n_features,)
        Feature-wise variances.

    sum_weights : ndarray of shape (n_features,), dtype=floating
        Returned if return_sum_weights is True.
    """
    if X.dtype not in [np.float32, np.float64]:
        X = X.astype(np.float64)

    if weights is None:
        weights = np.ones(X.shape[0], dtype=X.dtype)

    means, variances, sum_weights = _csc_mean_variance_axis0(
        X.data, X.shape[0], X.shape[1], X.indices, X.indptr, weights)

    if return_sum_weights:
        return means, variances, sum_weights
    return means, variances
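

# Illustrative usage for `csc_mean_variance_axis0` with sample weights (a
# sketch, not part of the module; assumes SciPy is available). With weights
# [1, 3], column 0 holds [1, 0], so its weighted mean is
# (1 * 1 + 3 * 0) / 4 = 0.25 and its weighted variance
# (1 * 0.75**2 + 3 * 0.25**2) / 4 = 0.1875:
#
#     >>> import numpy as np
#     >>> from scipy.sparse import csc_matrix
#     >>> X = csc_matrix([[1., 2.], [0., 4.]])
#     >>> means, variances = csc_mean_variance_axis0(
#     ...     X, weights=np.array([1., 3.]))
#     >>> means
#     array([0.25, 3.5 ])
#     >>> variances
#     array([0.1875, 0.75  ])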


def _csc_mean_variance_axis0(
    const floating[::1] X_data,
    uint64_t n_samples,
    uint64_t n_features,
    const integral[:] X_indices,
    const integral[:] X_indptr,
    const floating[:] weights,
):
    # Implement the function here since variables using fused types
    # cannot be declared directly and can only be passed as function arguments
    cdef:
        integral i, row_ind
        uint64_t feature_idx, col_ind
        float64_t diff
        # means[j] contains the mean of feature j
        float64_t[::1] means = np.zeros(n_features)
        # variances[j] contains the variance of feature j
        float64_t[::1] variances = np.zeros(n_features)

        float64_t[::1] sum_weights = np.full(
            fill_value=np.sum(weights, dtype=np.float64), shape=n_features
        )
        float64_t[::1] sum_weights_nz = np.zeros(shape=n_features)
        float64_t[::1] correction = np.zeros(shape=n_features)

        uint64_t[::1] counts = np.full(
            fill_value=weights.shape[0], shape=n_features, dtype=np.uint64
        )
        uint64_t[::1] counts_nz = np.zeros(shape=n_features, dtype=np.uint64)

    for col_ind in range(n_features):
        for i in range(X_indptr[col_ind], X_indptr[col_ind + 1]):
            row_ind = X_indices[i]
            if not isnan(X_data[i]):
                means[col_ind] += <float64_t>(X_data[i]) * weights[row_ind]
                # sum of weights where X[:, col_ind] is non-zero
                sum_weights_nz[col_ind] += weights[row_ind]
                # number of non-zero elements of X[:, col_ind]
                counts_nz[col_ind] += 1
            else:
                # sum of weights where X[:, col_ind] is not nan
                sum_weights[col_ind] -= weights[row_ind]
                # number of non nan elements of X[:, col_ind]
                counts[col_ind] -= 1

    for feature_idx in range(n_features):
        means[feature_idx] /= sum_weights[feature_idx]

    for col_ind in range(n_features):
        for i in range(X_indptr[col_ind], X_indptr[col_ind + 1]):
            row_ind = X_indices[i]
            if not isnan(X_data[i]):
                diff = X_data[i] - means[col_ind]
                # correction term of the corrected 2 pass algorithm.
                # See "Algorithms for computing the sample variance: analysis
                # and recommendations", by Chan, Golub, and LeVeque.
                correction[col_ind] += diff * weights[row_ind]
                variances[col_ind] += diff * diff * weights[row_ind]

    for feature_idx in range(n_features):
        if counts[feature_idx] != counts_nz[feature_idx]:
            correction[feature_idx] -= (
                sum_weights[feature_idx] - sum_weights_nz[feature_idx]
            ) * means[feature_idx]
        correction[feature_idx] = correction[feature_idx]**2 / sum_weights[feature_idx]
        if counts[feature_idx] != counts_nz[feature_idx]:
            # only compute it when it's guaranteed to be non-zero to avoid
            # catastrophic cancellation.
            variances[feature_idx] += (
                sum_weights[feature_idx] - sum_weights_nz[feature_idx]
            ) * means[feature_idx]**2
        variances[feature_idx] = (
            (variances[feature_idx] - correction[feature_idx]) /
            sum_weights[feature_idx]
        )

    if floating is float:
        return (
            np.array(means, dtype=np.float32),
            np.array(variances, dtype=np.float32),
            np.array(sum_weights, dtype=np.float32),
        )
    else:
        return (
            np.asarray(means), np.asarray(variances), np.asarray(sum_weights)
        )


def incr_mean_variance_axis0(X, last_mean, last_var, last_n, weights=None):
    """Compute mean and variance along axis 0 on a CSR or CSC matrix.

    last_mean, last_var are the statistics computed at the last step by this
    function. Both must be initialized to 0.0. last_n is the number of
    samples encountered until now and is initialized at 0.

    Parameters
    ----------
    X : CSR or CSC sparse matrix, shape (n_samples, n_features)
        Input data.

    last_mean : float array with shape (n_features,)
        Array of feature-wise means to update with the new data X.

    last_var : float array with shape (n_features,)
        Array of feature-wise var to update with the new data X.

    last_n : float array with shape (n_features,)
        Sum of the weights seen so far (if weights are all set to 1
        this will be the same as number of samples seen so far, before X).

    weights : float array with shape (n_samples,) or None
        If it is set to None, samples will be equally weighted.

    Returns
    -------
    updated_mean : float array with shape (n_features,)
        Feature-wise means.

    updated_variance : float array with shape (n_features,)
        Feature-wise variances.

    updated_n : float array with shape (n_features,)
        Updated sum of the weights seen (the number of samples seen so far
        when all weights are 1).

    Notes
    -----
    NaNs are ignored during the computation.

    References
    ----------
    T. Chan, G. Golub, R. LeVeque. Algorithms for computing the sample
    variance: analysis and recommendations, The American Statistician,
    Vol. 37, No. 3, pp. 242-247

    Also, see the non-sparse implementation of this in
    `utils.extmath._incremental_mean_and_var`.

    """
    if X.dtype not in [np.float32, np.float64]:
        X = X.astype(np.float64)
    X_dtype = X.dtype
    if weights is None:
        weights = np.ones(X.shape[0], dtype=X_dtype)
    elif weights.dtype not in [np.float32, np.float64]:
        weights = weights.astype(np.float64, copy=False)
    if last_n.dtype not in [np.float32, np.float64]:
        last_n = last_n.astype(np.float64, copy=False)

    return _incr_mean_variance_axis0(
        X.data,
        np.sum(weights),
        X.shape[1],
        X.indices,
        X.indptr,
        X.format,
        last_mean.astype(X_dtype, copy=False),
        last_var.astype(X_dtype, copy=False),
        last_n.astype(X_dtype, copy=False),
        weights.astype(X_dtype, copy=False),
    )
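

# Illustrative usage for `incr_mean_variance_axis0` (a sketch, not part of
# the module; assumes SciPy is available). Two batches processed
# incrementally yield the statistics of the stacked data; column 0 holds
# [1, 0, 3, 0] overall, hence mean 1.0 and variance (0 + 1 + 4 + 1) / 4 = 1.5:
#
#     >>> import numpy as np
#     >>> from scipy.sparse import csr_matrix
#     >>> X1 = csr_matrix([[1., 2.], [0., 4.]])
#     >>> X2 = csr_matrix([[3., 0.], [0., 6.]])
#     >>> mean, var, n = np.zeros(2), np.zeros(2), np.zeros(2)
#     >>> mean, var, n = incr_mean_variance_axis0(X1, mean, var, n)
#     >>> mean, var, n = incr_mean_variance_axis0(X2, mean, var, n)
#     >>> mean, var, n
#     (array([1., 3.]), array([1.5, 5. ]), array([4., 4.]))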


def _incr_mean_variance_axis0(
    const floating[:] X_data,
    floating n_samples,
    uint64_t n_features,
    const int[:] X_indices,
    # X_indptr might be either int32 or int64
    const integral[:] X_indptr,
    str X_format,
    floating[:] last_mean,
    floating[:] last_var,
    # previous sum of the weights (i.e. float)
    floating[:] last_n,
    const floating[:] weights,
):
    # Implement the function here since variables using fused types
    # cannot be declared directly and can only be passed as function arguments
    cdef:
        uint64_t i

        # last = stats until now
        # new = the current increment
        # updated = the aggregated stats
        # when arrays, they are indexed by i per-feature
        floating[::1] new_mean
        floating[::1] new_var
        floating[::1] updated_mean
        floating[::1] updated_var

    if floating is float:
        dtype = np.float32
    else:
        dtype = np.float64

    new_mean = np.zeros(n_features, dtype=dtype)
    new_var = np.zeros_like(new_mean, dtype=dtype)
    updated_mean = np.zeros_like(new_mean, dtype=dtype)
    updated_var = np.zeros_like(new_mean, dtype=dtype)

    cdef:
        floating[::1] new_n
        floating[::1] updated_n
        floating[::1] last_over_new_n

    # Obtain new stats first
    updated_n = np.zeros(shape=n_features, dtype=dtype)
    last_over_new_n = np.zeros_like(updated_n, dtype=dtype)

    # X can be a CSR or CSC matrix
    if X_format == 'csr':
        new_mean, new_var, new_n = _csr_mean_variance_axis0(
            X_data, n_samples, n_features, X_indices, X_indptr, weights)
    else:  # X_format == 'csc'
        new_mean, new_var, new_n = _csc_mean_variance_axis0(
            X_data, n_samples, n_features, X_indices, X_indptr, weights)

    # First pass
    cdef bint is_first_pass = True
    for i in range(n_features):
        if last_n[i] > 0:
            is_first_pass = False
            break

    if is_first_pass:
        return np.asarray(new_mean), np.asarray(new_var), np.asarray(new_n)

    for i in range(n_features):
        updated_n[i] = last_n[i] + new_n[i]

    # Next passes
    for i in range(n_features):
        if new_n[i] > 0:
            last_over_new_n[i] = dtype(last_n[i]) / dtype(new_n[i])
            # Unnormalized stats
            last_mean[i] *= last_n[i]
            last_var[i] *= last_n[i]
            new_mean[i] *= new_n[i]
            new_var[i] *= new_n[i]
            # Update stats
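            # Chan et al.'s pairwise combination: after the scaling above,
            # last_mean/new_mean hold per-batch sums and last_var/new_var
            # hold unnormalized second moments, so this computes
            #   M2_ab = M2_a + M2_b
            #           + n_a * n_b / (n_a + n_b) * (mean_a - mean_b)**2
            # which is divided by n_ab = n_a + n_b a few lines below.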
            updated_var[i] = (
                last_var[i] + new_var[i] +
                last_over_new_n[i] / updated_n[i] *
                (last_mean[i] / last_over_new_n[i] - new_mean[i])**2
            )
            updated_mean[i] = (last_mean[i] + new_mean[i]) / updated_n[i]
            updated_var[i] /= updated_n[i]
        else:
            updated_var[i] = last_var[i]
            updated_mean[i] = last_mean[i]
            updated_n[i] = last_n[i]

    return (
        np.asarray(updated_mean),
        np.asarray(updated_var),
        np.asarray(updated_n),
    )


def inplace_csr_row_normalize_l1(X):
    """Normalize inplace the rows of a CSR matrix or array by their L1 norm.

    Parameters
    ----------
    X : scipy.sparse.csr_matrix or scipy.sparse.csr_array, \
            shape=(n_samples, n_features)
        The input matrix or array to be modified inplace.

    Examples
    --------
    >>> from scipy.sparse import csr_matrix
    >>> from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l1
    >>> X = csr_matrix(([1.0, 2.0, 3.0, 4.0], [0, 1, 2, 3], [0, 2, 3, 4]),
    ...                shape=(3, 4))
    >>> X.toarray()
    array([[1., 2., 0., 0.],
           [0., 0., 3., 0.],
           [0., 0., 0., 4.]])
    >>> inplace_csr_row_normalize_l1(X)
    >>> X.toarray()
    array([[0.33... , 0.66... , 0. , 0. ],
           [0. , 0. , 1. , 0. ],
           [0. , 0. , 0. , 1. ]])
    """
    _inplace_csr_row_normalize_l1(X.data, X.shape, X.indices, X.indptr)


def _inplace_csr_row_normalize_l1(
    floating[:] X_data,
    shape,
    const integral[:] X_indices,
    const integral[:] X_indptr,
):
    cdef:
        uint64_t n_samples = shape[0]

        # the column indices for row i are stored in:
        #     indices[indptr[i]:indptr[i+1]]
        # and their corresponding values are stored in:
        #     data[indptr[i]:indptr[i+1]]
        uint64_t i
        integral j
        double sum_

    for i in range(n_samples):
        sum_ = 0.0

        for j in range(X_indptr[i], X_indptr[i + 1]):
            sum_ += fabs(X_data[j])

        if sum_ == 0.0:
            # do not normalize empty rows (can happen if CSR is not pruned
            # correctly)
            continue

        for j in range(X_indptr[i], X_indptr[i + 1]):
            X_data[j] /= sum_


def inplace_csr_row_normalize_l2(X):
    """Normalize inplace the rows of a CSR matrix or array by their L2 norm.

    Parameters
    ----------
    X : scipy.sparse.csr_matrix or scipy.sparse.csr_array, \
            shape=(n_samples, n_features)
        The input matrix or array to be modified inplace.

    Examples
    --------
    >>> from scipy.sparse import csr_matrix
    >>> from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l2
    >>> X = csr_matrix(([1.0, 2.0, 3.0, 4.0], [0, 1, 2, 3], [0, 2, 3, 4]),
    ...                shape=(3, 4))
    >>> X.toarray()
    array([[1., 2., 0., 0.],
           [0., 0., 3., 0.],
           [0., 0., 0., 4.]])
    >>> inplace_csr_row_normalize_l2(X)
    >>> X.toarray()
    array([[0.44... , 0.89... , 0. , 0. ],
           [0. , 0. , 1. , 0. ],
           [0. , 0. , 0. , 1. ]])
    """
    _inplace_csr_row_normalize_l2(X.data, X.shape, X.indices, X.indptr)


def _inplace_csr_row_normalize_l2(
    floating[:] X_data,
    shape,
    const integral[:] X_indices,
    const integral[:] X_indptr,
):
    cdef:
        uint64_t n_samples = shape[0]
        uint64_t i
        integral j
        double sum_

    for i in range(n_samples):
        sum_ = 0.0

        for j in range(X_indptr[i], X_indptr[i + 1]):
            sum_ += (X_data[j] * X_data[j])

        if sum_ == 0.0:
            # do not normalize empty rows (can happen if CSR is not pruned
            # correctly)
            continue

        sum_ = sqrt(sum_)

        for j in range(X_indptr[i], X_indptr[i + 1]):
            X_data[j] /= sum_


def assign_rows_csr(
    X,
    const intptr_t[:] X_rows,
    const intptr_t[:] out_rows,
    floating[:, ::1] out,
):
    """Densify selected rows of a CSR matrix into a preallocated array.

    Like out[out_rows] = X[X_rows].toarray() but without copying.
    No-copy supported for both dtype=np.float32 and dtype=np.float64.

    Parameters
    ----------
    X : scipy.sparse.csr_matrix, shape=(n_samples, n_features)
    X_rows : array, dtype=np.intp, shape=n_rows
    out_rows : array, dtype=np.intp, shape=n_rows
    out : array, shape=(arbitrary, n_features)
    """
    cdef:
        # intptr_t (npy_intp, np.intp in Python) is what np.where returns,
        # but int is what scipy.sparse uses.
        intp_t i, ind, j, k
        intptr_t rX
        const floating[:] data = X.data
        const int32_t[:] indices = X.indices
        const int32_t[:] indptr = X.indptr

    if X_rows.shape[0] != out_rows.shape[0]:
        raise ValueError("cannot assign %d rows to %d"
                         % (X_rows.shape[0], out_rows.shape[0]))

    with nogil:
        for k in range(out_rows.shape[0]):
            out[out_rows[k]] = 0.0

        for i in range(X_rows.shape[0]):
            rX = X_rows[i]
            for ind in range(indptr[rX], indptr[rX + 1]):
                j = indices[ind]
                out[out_rows[i], j] = data[ind]
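

# Illustrative usage for `assign_rows_csr` (a sketch, not part of the module;
# assumes SciPy is available). Row 1 of X is densified into row 0 of the
# preallocated output, which is zeroed first; other output rows are left
# untouched:
#
#     >>> import numpy as np
#     >>> from scipy.sparse import csr_matrix
#     >>> X = csr_matrix([[1., 0.], [0., 2.]])
#     >>> out = np.full((2, 2), -1.0)
#     >>> assign_rows_csr(X, np.array([1], dtype=np.intp),
#     ...                 np.array([0], dtype=np.intp), out)
#     >>> out
#     array([[ 0.,  2.],
#            [-1., -1.]])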