# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>

import numpy as np
cimport cython

from ..metrics._dist_metrics cimport DistanceMetric64
from ..utils._fast_dict cimport IntFloatDict
from ..utils._typedefs cimport float64_t, intp_t, uint8_t

# C++
from cython.operator cimport dereference as deref, preincrement as inc
from libcpp.map cimport map as cpp_map
from libc.math cimport fmax, INFINITY


###############################################################################
# Utilities for computing the ward momentum

def compute_ward_dist(
    const float64_t[::1] m_1,
    const float64_t[:, ::1] m_2,
    const intp_t[::1] coord_row,
    const intp_t[::1] coord_col,
    float64_t[::1] res
):
    cdef intp_t size_max = coord_row.shape[0]
    cdef intp_t n_features = m_2.shape[1]
    cdef intp_t i, j, row, col
    cdef float64_t pa, n

    for i in range(size_max):
        row = coord_row[i]
        col = coord_col[i]
        n = (m_1[row] * m_1[col]) / (m_1[row] + m_1[col])
        pa = 0.
        for j in range(n_features):
            pa += (m_2[row, j] / m_1[row] - m_2[col, j] / m_1[col]) ** 2
        res[i] = pa * n
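
# Sketch of what the loop above computes, assuming (as in the ward tree code
# that calls it) that ``m_1`` holds cluster sizes and ``m_2`` holds per-cluster
# coordinate sums. For a pair (row, col) the value written to ``res`` is the
# Ward criterion
#     n_row * n_col / (n_row + n_col) * ||m_2[row] / n_row - m_2[col] / n_col||^2
# i.e. the increase in within-cluster variance caused by merging the two
# clusters. A NumPy equivalent for a single pair would be:
#     n = m_1[row] * m_1[col] / (m_1[row] + m_1[col])
#     res[i] = n * np.sum((m_2[row] / m_1[row] - m_2[col] / m_1[col]) ** 2)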


###############################################################################
# Utilities for cutting and exploring a hierarchical tree

def _hc_get_descendent(intp_t node, children, intp_t n_leaves):
    """
    Function returning all the descendent leaves of a node in the tree.

    Parameters
    ----------
    node : integer
        The node for which we want the descendents.

    children : list of pairs, length n_nodes
        The children of each non-leaf node. Values less than `n_samples` refer
        to leaves of the tree. A greater value `i` indicates a node with
        children `children[i - n_samples]`.

    n_leaves : integer
        Number of leaves.

    Returns
    -------
    descendent : list of int
        The descendent leaves of the given node.
    """
    ind = [node]
    if node < n_leaves:
        return ind
    descendent = []

    # It is actually faster to do the accounting of the number of
    # elements in the list ourselves: len is a lengthy operation on a
    # chained list
    cdef intp_t i, n_indices = 1

    while n_indices:
        i = ind.pop()
        if i < n_leaves:
            descendent.append(i)
            n_indices -= 1
        else:
            ind.extend(children[i - n_leaves])
            n_indices += 1
    return descendent
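
# Illustrative example (hypothetical inputs): with ``n_leaves=3`` and
# ``children=[[0, 1], [2, 3]]``, node 4 has children ``children[4 - 3] = [2, 3]``
# and node 3 has children ``[0, 1]``, so ``_hc_get_descendent(4, children, 3)``
# returns the leaves {0, 1, 2} (the order depends on the traversal).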


def hc_get_heads(intp_t[:] parents, copy=True):
    """Returns the heads of the forest, as defined by parents.

    Parameters
    ----------
    parents : array of integers
        The parent structure defining the forest (ensemble of trees).
    copy : boolean
        If copy is False, the input 'parents' array is modified in place.

    Returns
    -------
    heads : array of integers of same shape as parents
        The indices in 'parents' of the tree heads.

    """
    cdef intp_t parent, node0, node, size
    if copy:
        parents = np.copy(parents)
    size = parents.size

    # Start from the top of the tree and go down
    for node0 in range(size - 1, -1, -1):
        node = node0
        parent = parents[node]
        while parent != node:
            parents[node0] = parent
            node = parent
            parent = parents[node]
    return parents
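
# Illustrative example (hypothetical input): for
# ``parents = np.array([1, 2, 2], dtype=np.intp)`` every node's head is node 2,
# so ``hc_get_heads(parents)`` returns ``array([2, 2, 2])``; with ``copy=False``
# the input array itself is updated in place to point every node at its head.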


def _get_parents(
    nodes,
    heads,
    const intp_t[:] parents,
    uint8_t[::1] not_visited
):
    """Returns the heads of the given nodes, as defined by parents.

    Modifies 'heads' and 'not_visited' in-place.

    Parameters
    ----------
    nodes : list of integers
        The nodes to start from.
    heads : list of integers
        A list to hold the results (modified in place).
    parents : array of integers
        The parent structure defining the tree.
    not_visited : array of uint8
        The tree nodes to consider (modified in place).

    """
    cdef intp_t parent, node

    for node in nodes:
        parent = parents[node]
        while parent != node:
            node = parent
            parent = parents[node]
        if not_visited[node]:
            not_visited[node] = 0
            heads.append(node)
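
# Illustrative example (hypothetical inputs): with ``parents = [1, 1, 3, 3]``,
# ``nodes = [0, 2]`` and ``not_visited`` all ones, the loop climbs 0 -> 1 and
# 2 -> 3, so ``heads`` ends up as ``[1, 3]`` and nodes 1 and 3 are marked
# visited.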


###############################################################################
# merge strategies implemented on IntFloatDicts

# These are used in the hierarchical clustering code, to implement
# merging between two clusters, defined as a dict containing node numbers
# as keys and edge weights as values.


def max_merge(
    IntFloatDict a,
    IntFloatDict b,
    const intp_t[:] mask,
    intp_t n_a,
    intp_t n_b
):
    """Merge two IntFloatDicts with the max strategy: when the same key is
    present in the two dicts, the max of the two values is used.

    Parameters
    ----------
    a, b : IntFloatDict object
        The IntFloatDicts to merge
    mask : ndarray of dtype integer and of dimension 1
        A mask for keys to ignore: if not mask[key] the corresponding key
        is skipped in the output dictionary
    n_a, n_b : int
        n_a and n_b are weights for a and b for the merge strategy.
        They are not used in the case of a max merge.

    Returns
    -------
    out : IntFloatDict object
        The IntFloatDict resulting from the merge
    """
    cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict)
    cdef cpp_map[intp_t, float64_t].iterator a_it = a.my_map.begin()
    cdef cpp_map[intp_t, float64_t].iterator a_end = a.my_map.end()
    cdef intp_t key
    cdef float64_t value
    # First copy a into out
    while a_it != a_end:
        key = deref(a_it).first
        if mask[key]:
            out_obj.my_map[key] = deref(a_it).second
        inc(a_it)

    # Then merge b into out
    cdef cpp_map[intp_t, float64_t].iterator out_it = out_obj.my_map.begin()
    cdef cpp_map[intp_t, float64_t].iterator out_end = out_obj.my_map.end()
    cdef cpp_map[intp_t, float64_t].iterator b_it = b.my_map.begin()
    cdef cpp_map[intp_t, float64_t].iterator b_end = b.my_map.end()
    while b_it != b_end:
        key = deref(b_it).first
        value = deref(b_it).second
        if mask[key]:
            out_it = out_obj.my_map.find(key)
            if out_it == out_end:
                # Key not found
                out_obj.my_map[key] = value
            else:
                deref(out_it).second = fmax(deref(out_it).second, value)
        inc(b_it)
    return out_obj
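
# Illustrative example (hypothetical values): merging a = {1: 2.0, 2: 3.0}
# with b = {2: 5.0, 3: 1.0} under an all-ones mask yields
# {1: 2.0, 2: 5.0, 3: 1.0}; the shared key 2 keeps the larger weight.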


def average_merge(
    IntFloatDict a,
    IntFloatDict b,
    const intp_t[:] mask,
    intp_t n_a,
    intp_t n_b
):
    """Merge two IntFloatDicts with the average strategy: when the
    same key is present in the two dicts, the weighted average of the two
    values is used.

    Parameters
    ----------
    a, b : IntFloatDict object
        The IntFloatDicts to merge
    mask : ndarray of dtype integer and of dimension 1
        A mask for keys to ignore: if not mask[key] the corresponding key
        is skipped in the output dictionary
    n_a, n_b : int
        n_a and n_b are weights for a and b for the merge strategy.
        They are used for a weighted mean.

    Returns
    -------
    out : IntFloatDict object
        The IntFloatDict resulting from the merge
    """
    cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict)
    cdef cpp_map[intp_t, float64_t].iterator a_it = a.my_map.begin()
    cdef cpp_map[intp_t, float64_t].iterator a_end = a.my_map.end()
    cdef intp_t key
    cdef float64_t value
    cdef float64_t n_out = <float64_t> (n_a + n_b)
    # First copy a into out
    while a_it != a_end:
        key = deref(a_it).first
        if mask[key]:
            out_obj.my_map[key] = deref(a_it).second
        inc(a_it)

    # Then merge b into out
    cdef cpp_map[intp_t, float64_t].iterator out_it = out_obj.my_map.begin()
    cdef cpp_map[intp_t, float64_t].iterator out_end = out_obj.my_map.end()
    cdef cpp_map[intp_t, float64_t].iterator b_it = b.my_map.begin()
    cdef cpp_map[intp_t, float64_t].iterator b_end = b.my_map.end()
    while b_it != b_end:
        key = deref(b_it).first
        value = deref(b_it).second
        if mask[key]:
            out_it = out_obj.my_map.find(key)
            if out_it == out_end:
                # Key not found
                out_obj.my_map[key] = value
            else:
                deref(out_it).second = (n_a * deref(out_it).second
                                        + n_b * value) / n_out
        inc(b_it)
    return out_obj
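
# Illustrative example (hypothetical values): with a = {1: 2.0, 2: 3.0},
# b = {2: 5.0, 3: 1.0}, n_a = 1 and n_b = 3, the shared key 2 becomes
# (1 * 3.0 + 3 * 5.0) / 4 = 4.5, while keys present in only one of the dicts
# are copied over unweighted.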


###############################################################################
# An edge object for fast comparisons

cdef class WeightedEdge:
    cdef public intp_t a
    cdef public intp_t b
    cdef public float64_t weight

    def __init__(self, float64_t weight, intp_t a, intp_t b):
        self.weight = weight
        self.a = a
        self.b = b

    def __richcmp__(self, WeightedEdge other, int op):
        """Cython-specific comparison method.

        op is the comparison code::
            <  0
            == 2
            >  4
            <= 1
            != 3
            >= 5
        """
        if op == 0:
            return self.weight < other.weight
        elif op == 1:
            return self.weight <= other.weight
        elif op == 2:
            return self.weight == other.weight
        elif op == 3:
            return self.weight != other.weight
        elif op == 4:
            return self.weight > other.weight
        elif op == 5:
            return self.weight >= other.weight

    def __repr__(self):
        return "%s(weight=%f, a=%i, b=%i)" % (self.__class__.__name__,
                                              self.weight,
                                              self.a, self.b)
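
# WeightedEdge instances compare on ``weight`` only, so a list of edges can be
# ordered cheapest-first with ``sorted(edges)`` or kept in a heap; the
# endpoints ``a`` and ``b`` take no part in the comparison.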


################################################################################
# Efficient labelling/conversion of MSTs to single linkage hierarchies

cdef class UnionFind(object):

    def __init__(self, N):
        self.parent = np.full(2 * N - 1, -1., dtype=np.intp, order='C')
        self.next_label = N
        self.size = np.hstack((np.ones(N, dtype=np.intp),
                               np.zeros(N - 1, dtype=np.intp)))

    cdef void union(self, intp_t m, intp_t n) noexcept:
        self.parent[m] = self.next_label
        self.parent[n] = self.next_label
        self.size[self.next_label] = self.size[m] + self.size[n]
        self.next_label += 1
        return

    @cython.wraparound(True)
    cdef intp_t fast_find(self, intp_t n) noexcept:
        cdef intp_t p
        p = n
        # find the highest node in the linkage graph so far
        while self.parent[n] != -1:
            n = self.parent[n]
        # provide a shortcut up to the highest node
        while self.parent[p] != n:
            p, self.parent[p] = self.parent[p], n
        return n
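
# Note on the structure above: ``fast_find`` performs path compression, so
# after a call every node visited on the way up points directly at the root.
# ``union`` never links one root under the other; it always allocates a new
# label (``next_label``) for the merged component, which lets those labels
# double as the cluster ids reported in the linkage output below.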


def _single_linkage_label(const float64_t[:, :] L):
    """
    Convert a linkage array or MST to a tree by labelling clusters at merges.
    This is done by using a union-find structure to keep track of merges
    efficiently. This is the private version of the function that assumes that
    ``L`` has been properly validated. See ``single_linkage_label`` for the
    user facing version of this function.

    Parameters
    ----------
    L : array of shape (n_samples - 1, 3)
        The linkage array or MST where each row specifies two samples
        to be merged and a distance or weight at which the merge occurs. This
        array is assumed to be sorted by the distance/weight.

    Returns
    -------
    A tree in the format used by scipy.cluster.hierarchy.
    """

    cdef float64_t[:, ::1] result_arr

    cdef intp_t left, left_cluster, right, right_cluster, index
    cdef float64_t delta

    result_arr = np.zeros((L.shape[0], 4), dtype=np.float64)
    U = UnionFind(L.shape[0] + 1)

    for index in range(L.shape[0]):

        left = <intp_t> L[index, 0]
        right = <intp_t> L[index, 1]
        delta = L[index, 2]

        left_cluster = U.fast_find(left)
        right_cluster = U.fast_find(right)

        result_arr[index][0] = left_cluster
        result_arr[index][1] = right_cluster
        result_arr[index][2] = delta
        result_arr[index][3] = U.size[left_cluster] + U.size[right_cluster]

        U.union(left_cluster, right_cluster)

    return np.asarray(result_arr)
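
# Illustrative example (hypothetical input): for three points whose sorted MST
# edges are (0, 1, 1.0) and (1, 2, 2.0),
#     _single_linkage_label(np.array([[0., 1., 1.], [1., 2., 2.]]))
# returns
#     [[0., 1., 1., 2.],
#      [3., 2., 2., 3.]]
# i.e. the scipy.cluster.hierarchy linkage format: the two merged cluster ids,
# the merge distance, and the size of the newly formed cluster.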


@cython.wraparound(True)
def single_linkage_label(L):
    """
    Convert a linkage array or MST to a tree by labelling clusters at merges.
    This is done by using a union-find structure to keep track of merges
    efficiently.

    Parameters
    ----------
    L : array of shape (n_samples - 1, 3)
        The linkage array or MST where each row specifies two samples
        to be merged and a distance or weight at which the merge occurs. This
        array is assumed to be sorted by the distance/weight.

    Returns
    -------
    A tree in the format used by scipy.cluster.hierarchy.
    """
    # Validate L
    if L[:, :2].min() < 0 or L[:, :2].max() >= 2 * L.shape[0] + 1:
        raise ValueError("Input MST array is not a validly formatted MST array")

    is_sorted = lambda x: np.all(x[:-1] <= x[1:])
    if not is_sorted(L[:, 2]):
        raise ValueError("Input MST array must be sorted by weight")

    return _single_linkage_label(L)


# Implements MST-LINKAGE-CORE from https://arxiv.org/abs/1109.2378
def mst_linkage_core(
        const float64_t [:, ::1] raw_data,
        DistanceMetric64 dist_metric):
    """
    Compute the necessary elements of a minimum spanning
    tree for computation of single linkage clustering. This
    represents the MST-LINKAGE-CORE algorithm (Figure 6) from
    :arxiv:`Daniel Mullner, "Modern hierarchical, agglomerative clustering
    algorithms" <1109.2378>`.

    In contrast to the scipy implementation, this never computes
    a full distance matrix, generating distances only as they
    are needed and releasing them when no longer needed.

    Parameters
    ----------
    raw_data : array of shape (n_samples, n_features)
        The array of feature data to be clustered. Must be C-contiguous.

    dist_metric : DistanceMetric64
        A DistanceMetric64 object conforming to the API from
        ``sklearn.metrics._dist_metrics.pxd`` that will be
        used to compute distances.

    Returns
    -------
    mst_core_data : array of shape (n_samples - 1, 3)
        An array providing information from which one
        can either compute an MST, or the linkage hierarchy
        very efficiently. See :arxiv:`Daniel Mullner, "Modern hierarchical,
        agglomerative clustering algorithms" <1109.2378>` algorithm
        MST-LINKAGE-CORE for more details.
    """
    cdef:
        intp_t n_samples = raw_data.shape[0]
        uint8_t[:] in_tree = np.zeros(n_samples, dtype=bool)
        float64_t[:, ::1] result = np.zeros((n_samples - 1, 3))

        intp_t current_node = 0
        intp_t new_node
        intp_t i
        intp_t j
        intp_t num_features = raw_data.shape[1]

        float64_t right_value
        float64_t left_value
        float64_t new_distance

        float64_t[:] current_distances = np.full(n_samples, INFINITY)

    for i in range(n_samples - 1):

        in_tree[current_node] = 1

        new_distance = INFINITY
        new_node = 0

        for j in range(n_samples):
            if in_tree[j]:
                continue

            right_value = current_distances[j]
            left_value = dist_metric.dist(&raw_data[current_node, 0],
                                          &raw_data[j, 0],
                                          num_features)

            if left_value < right_value:
                current_distances[j] = left_value

            if current_distances[j] < new_distance:
                new_distance = current_distances[j]
                new_node = j

        result[i, 0] = current_node
        result[i, 1] = new_node
        result[i, 2] = new_distance
        current_node = new_node

    return np.array(result)
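
# Note on the loop above: this is essentially Prim's algorithm run on the
# implicit complete graph of pairwise distances. ``current_distances[j]``
# caches the best distance seen so far from the growing tree to point ``j``,
# and each outer iteration appends the edge (current_node, new_node,
# new_distance). The rows come out in insertion order, not sorted by distance,
# so they are expected to be sorted by the third column before being fed to
# ``single_linkage_label``.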