213 lines
7.7 KiB
Cython
213 lines
7.7 KiB
Cython
# mutual reachability distance computations
|
|
# Authors: Leland McInnes <leland.mcinnes@gmail.com>
|
|
# Meekail Zain <zainmeekail@gmail.com>
|
|
# Guillaume Lemaitre <g.lemaitre58@gmail.com>
|
|
# Copyright (c) 2015, Leland McInnes
|
|
# All rights reserved.
|
|
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are met:
|
|
|
|
# 1. Redistributions of source code must retain the above copyright notice,
|
|
# this list of conditions and the following disclaimer.
|
|
|
|
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
# this list of conditions and the following disclaimer in the documentation
|
|
# and/or other materials provided with the distribution.
|
|
|
|
# 3. Neither the name of the copyright holder nor the names of its contributors
|
|
# may be used to endorse or promote products derived from this software without
|
|
# specific prior written permission.
|
|
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
# POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
cimport numpy as cnp
|
|
|
|
import numpy as np
|
|
from scipy.sparse import issparse
|
|
from cython cimport floating, integral
|
|
from libc.math cimport isfinite, INFINITY
|
|
from ...utils._typedefs cimport intp_t
|
|
cnp.import_array()
|
|
|
|
|
|
def mutual_reachability_graph(
|
|
distance_matrix, min_samples=5, max_distance=0.0
|
|
):
|
|
"""Compute the weighted adjacency matrix of the mutual reachability graph.
|
|
|
|
The mutual reachability distance used to build the graph is defined as::
|
|
|
|
max(d_core(x_p), d_core(x_q), d(x_p, x_q))
|
|
|
|
and the core distance `d_core` is defined as the distance between a point
|
|
`x_p` and its k-th nearest neighbor.
|
|
|
|
Note that all computations are done in-place.
|
|
|
|
Parameters
|
|
----------
|
|
distance_matrix : {ndarray, sparse matrix} of shape (n_samples, n_samples)
|
|
Array of distances between samples. If sparse, the array must be in
|
|
`CSR` format.
|
|
|
|
min_samples : int, default=5
|
|
The number of points in a neighbourhood for a point to be considered
|
|
a core point.
|
|
|
|
max_distance : float, default=0.0
|
|
The distance which `np.inf` is replaced with. When the true mutual-
|
|
reachability distance is measured to be infinite, it is instead
|
|
truncated to `max_dist`. Only used when `distance_matrix` is a sparse
|
|
matrix.
|
|
|
|
Returns
|
|
-------
|
|
mututal_reachability_graph: {ndarray, sparse matrix} of shape \
|
|
(n_samples, n_samples)
|
|
Weighted adjacency matrix of the mutual reachability graph.
|
|
|
|
References
|
|
----------
|
|
.. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April).
|
|
Density-based clustering based on hierarchical density estimates.
|
|
In Pacific-Asia Conference on Knowledge Discovery and Data Mining
|
|
(pp. 160-172). Springer Berlin Heidelberg.
|
|
"""
|
|
further_neighbor_idx = min_samples - 1
|
|
if issparse(distance_matrix):
|
|
if distance_matrix.format != "csr":
|
|
raise ValueError(
|
|
"Only sparse CSR matrices are supported for `distance_matrix`."
|
|
)
|
|
_sparse_mutual_reachability_graph(
|
|
distance_matrix.data,
|
|
distance_matrix.indices,
|
|
distance_matrix.indptr,
|
|
distance_matrix.shape[0],
|
|
further_neighbor_idx=further_neighbor_idx,
|
|
max_distance=max_distance,
|
|
)
|
|
else:
|
|
_dense_mutual_reachability_graph(
|
|
distance_matrix, further_neighbor_idx=further_neighbor_idx
|
|
)
|
|
return distance_matrix
|
|
|
|
|
|
def _dense_mutual_reachability_graph(
|
|
floating[:, :] distance_matrix,
|
|
intp_t further_neighbor_idx,
|
|
):
|
|
"""Dense implementation of mutual reachability graph.
|
|
|
|
The computation is done in-place, i.e. the distance matrix is modified
|
|
directly.
|
|
|
|
Parameters
|
|
----------
|
|
distance_matrix : ndarray of shape (n_samples, n_samples)
|
|
Array of distances between samples.
|
|
|
|
further_neighbor_idx : int
|
|
The index of the furthest neighbor to use to define the core distances.
|
|
"""
|
|
cdef:
|
|
intp_t i, j, n_samples = distance_matrix.shape[0]
|
|
floating mutual_reachibility_distance
|
|
floating[::1] core_distances
|
|
|
|
# We assume that the distance matrix is symmetric. We choose to sort every
|
|
# row to have the same implementation than the sparse case that requires
|
|
# CSR matrix.
|
|
core_distances = np.ascontiguousarray(
|
|
np.partition(
|
|
distance_matrix, further_neighbor_idx, axis=1
|
|
)[:, further_neighbor_idx]
|
|
)
|
|
|
|
with nogil:
|
|
# TODO: Update w/ prange with thread count based on
|
|
# _openmp_effective_n_threads
|
|
for i in range(n_samples):
|
|
for j in range(n_samples):
|
|
mutual_reachibility_distance = max(
|
|
core_distances[i],
|
|
core_distances[j],
|
|
distance_matrix[i, j],
|
|
)
|
|
distance_matrix[i, j] = mutual_reachibility_distance
|
|
|
|
|
|
def _sparse_mutual_reachability_graph(
|
|
cnp.ndarray[floating, ndim=1, mode="c"] data,
|
|
cnp.ndarray[integral, ndim=1, mode="c"] indices,
|
|
cnp.ndarray[integral, ndim=1, mode="c"] indptr,
|
|
intp_t n_samples,
|
|
intp_t further_neighbor_idx,
|
|
floating max_distance,
|
|
):
|
|
"""Sparse implementation of mutual reachability graph.
|
|
|
|
The computation is done in-place, i.e. the distance matrix is modified
|
|
directly. This implementation only accepts `CSR` format sparse matrices.
|
|
|
|
Parameters
|
|
----------
|
|
distance_matrix : sparse matrix of shape (n_samples, n_samples)
|
|
Sparse matrix of distances between samples. The sparse format should
|
|
be `CSR`.
|
|
|
|
further_neighbor_idx : int
|
|
The index of the furthest neighbor to use to define the core distances.
|
|
|
|
max_distance : float
|
|
The distance which `np.inf` is replaced with. When the true mutual-
|
|
reachability distance is measured to be infinite, it is instead
|
|
truncated to `max_dist`. Only used when `distance_matrix` is a sparse
|
|
matrix.
|
|
"""
|
|
cdef:
|
|
integral i, col_ind, row_ind
|
|
floating mutual_reachibility_distance
|
|
floating[:] core_distances
|
|
floating[:] row_data
|
|
|
|
if floating is float:
|
|
dtype = np.float32
|
|
else:
|
|
dtype = np.float64
|
|
|
|
core_distances = np.empty(n_samples, dtype=dtype)
|
|
|
|
for i in range(n_samples):
|
|
row_data = data[indptr[i]:indptr[i + 1]]
|
|
if further_neighbor_idx < row_data.size:
|
|
core_distances[i] = np.partition(
|
|
row_data, further_neighbor_idx
|
|
)[further_neighbor_idx]
|
|
else:
|
|
core_distances[i] = INFINITY
|
|
|
|
with nogil:
|
|
for row_ind in range(n_samples):
|
|
for i in range(indptr[row_ind], indptr[row_ind + 1]):
|
|
col_ind = indices[i]
|
|
mutual_reachibility_distance = max(
|
|
core_distances[row_ind], core_distances[col_ind], data[i]
|
|
)
|
|
if isfinite(mutual_reachibility_distance):
|
|
data[i] = mutual_reachibility_distance
|
|
elif max_distance > 0:
|
|
data[i] = max_distance
|