742 lines
26 KiB
Python
742 lines
26 KiB
Python
|
# Authors: Manoj Kumar <manojkumarsivaraj334@gmail.com>
|
||
|
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
|
||
|
# Joel Nothman <joel.nothman@gmail.com>
|
||
|
# License: BSD 3 clause
|
||
|
|
||
|
import warnings
|
||
|
from math import sqrt
|
||
|
from numbers import Integral, Real
|
||
|
|
||
|
import numpy as np
|
||
|
from scipy import sparse
|
||
|
|
||
|
from .._config import config_context
|
||
|
from ..base import (
|
||
|
BaseEstimator,
|
||
|
ClassNamePrefixFeaturesOutMixin,
|
||
|
ClusterMixin,
|
||
|
TransformerMixin,
|
||
|
_fit_context,
|
||
|
)
|
||
|
from ..exceptions import ConvergenceWarning
|
||
|
from ..metrics import pairwise_distances_argmin
|
||
|
from ..metrics.pairwise import euclidean_distances
|
||
|
from ..utils._param_validation import Interval
|
||
|
from ..utils.extmath import row_norms
|
||
|
from ..utils.validation import check_is_fitted
|
||
|
from . import AgglomerativeClustering
|
||
|
|
||
|
|
||
|
def _iterate_sparse_X(X):
|
||
|
"""This little hack returns a densified row when iterating over a sparse
|
||
|
matrix, instead of constructing a sparse matrix for every row that is
|
||
|
expensive.
|
||
|
"""
|
||
|
n_samples = X.shape[0]
|
||
|
X_indices = X.indices
|
||
|
X_data = X.data
|
||
|
X_indptr = X.indptr
|
||
|
|
||
|
for i in range(n_samples):
|
||
|
row = np.zeros(X.shape[1])
|
||
|
startptr, endptr = X_indptr[i], X_indptr[i + 1]
|
||
|
nonzero_indices = X_indices[startptr:endptr]
|
||
|
row[nonzero_indices] = X_data[startptr:endptr]
|
||
|
yield row
|
||
|
|
||
|
|
||
|
def _split_node(node, threshold, branching_factor):
|
||
|
"""The node has to be split if there is no place for a new subcluster
|
||
|
in the node.
|
||
|
1. Two empty nodes and two empty subclusters are initialized.
|
||
|
2. The pair of distant subclusters are found.
|
||
|
3. The properties of the empty subclusters and nodes are updated
|
||
|
according to the nearest distance between the subclusters to the
|
||
|
pair of distant subclusters.
|
||
|
4. The two nodes are set as children to the two subclusters.
|
||
|
"""
|
||
|
new_subcluster1 = _CFSubcluster()
|
||
|
new_subcluster2 = _CFSubcluster()
|
||
|
new_node1 = _CFNode(
|
||
|
threshold=threshold,
|
||
|
branching_factor=branching_factor,
|
||
|
is_leaf=node.is_leaf,
|
||
|
n_features=node.n_features,
|
||
|
dtype=node.init_centroids_.dtype,
|
||
|
)
|
||
|
new_node2 = _CFNode(
|
||
|
threshold=threshold,
|
||
|
branching_factor=branching_factor,
|
||
|
is_leaf=node.is_leaf,
|
||
|
n_features=node.n_features,
|
||
|
dtype=node.init_centroids_.dtype,
|
||
|
)
|
||
|
new_subcluster1.child_ = new_node1
|
||
|
new_subcluster2.child_ = new_node2
|
||
|
|
||
|
if node.is_leaf:
|
||
|
if node.prev_leaf_ is not None:
|
||
|
node.prev_leaf_.next_leaf_ = new_node1
|
||
|
new_node1.prev_leaf_ = node.prev_leaf_
|
||
|
new_node1.next_leaf_ = new_node2
|
||
|
new_node2.prev_leaf_ = new_node1
|
||
|
new_node2.next_leaf_ = node.next_leaf_
|
||
|
if node.next_leaf_ is not None:
|
||
|
node.next_leaf_.prev_leaf_ = new_node2
|
||
|
|
||
|
dist = euclidean_distances(
|
||
|
node.centroids_, Y_norm_squared=node.squared_norm_, squared=True
|
||
|
)
|
||
|
n_clusters = dist.shape[0]
|
||
|
|
||
|
farthest_idx = np.unravel_index(dist.argmax(), (n_clusters, n_clusters))
|
||
|
node1_dist, node2_dist = dist[(farthest_idx,)]
|
||
|
|
||
|
node1_closer = node1_dist < node2_dist
|
||
|
# make sure node1 is closest to itself even if all distances are equal.
|
||
|
# This can only happen when all node.centroids_ are duplicates leading to all
|
||
|
# distances between centroids being zero.
|
||
|
node1_closer[farthest_idx[0]] = True
|
||
|
|
||
|
for idx, subcluster in enumerate(node.subclusters_):
|
||
|
if node1_closer[idx]:
|
||
|
new_node1.append_subcluster(subcluster)
|
||
|
new_subcluster1.update(subcluster)
|
||
|
else:
|
||
|
new_node2.append_subcluster(subcluster)
|
||
|
new_subcluster2.update(subcluster)
|
||
|
return new_subcluster1, new_subcluster2
|
||
|
|
||
|
|
||
|
class _CFNode:
|
||
|
"""Each node in a CFTree is called a CFNode.
|
||
|
|
||
|
The CFNode can have a maximum of branching_factor
|
||
|
number of CFSubclusters.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
threshold : float
|
||
|
Threshold needed for a new subcluster to enter a CFSubcluster.
|
||
|
|
||
|
branching_factor : int
|
||
|
Maximum number of CF subclusters in each node.
|
||
|
|
||
|
is_leaf : bool
|
||
|
We need to know if the CFNode is a leaf or not, in order to
|
||
|
retrieve the final subclusters.
|
||
|
|
||
|
n_features : int
|
||
|
The number of features.
|
||
|
|
||
|
Attributes
|
||
|
----------
|
||
|
subclusters_ : list
|
||
|
List of subclusters for a particular CFNode.
|
||
|
|
||
|
prev_leaf_ : _CFNode
|
||
|
Useful only if is_leaf is True.
|
||
|
|
||
|
next_leaf_ : _CFNode
|
||
|
next_leaf. Useful only if is_leaf is True.
|
||
|
the final subclusters.
|
||
|
|
||
|
init_centroids_ : ndarray of shape (branching_factor + 1, n_features)
|
||
|
Manipulate ``init_centroids_`` throughout rather than centroids_ since
|
||
|
the centroids are just a view of the ``init_centroids_`` .
|
||
|
|
||
|
init_sq_norm_ : ndarray of shape (branching_factor + 1,)
|
||
|
manipulate init_sq_norm_ throughout. similar to ``init_centroids_``.
|
||
|
|
||
|
centroids_ : ndarray of shape (branching_factor + 1, n_features)
|
||
|
View of ``init_centroids_``.
|
||
|
|
||
|
squared_norm_ : ndarray of shape (branching_factor + 1,)
|
||
|
View of ``init_sq_norm_``.
|
||
|
|
||
|
"""
|
||
|
|
||
|
def __init__(self, *, threshold, branching_factor, is_leaf, n_features, dtype):
|
||
|
self.threshold = threshold
|
||
|
self.branching_factor = branching_factor
|
||
|
self.is_leaf = is_leaf
|
||
|
self.n_features = n_features
|
||
|
|
||
|
# The list of subclusters, centroids and squared norms
|
||
|
# to manipulate throughout.
|
||
|
self.subclusters_ = []
|
||
|
self.init_centroids_ = np.zeros((branching_factor + 1, n_features), dtype=dtype)
|
||
|
self.init_sq_norm_ = np.zeros((branching_factor + 1), dtype)
|
||
|
self.squared_norm_ = []
|
||
|
self.prev_leaf_ = None
|
||
|
self.next_leaf_ = None
|
||
|
|
||
|
def append_subcluster(self, subcluster):
|
||
|
n_samples = len(self.subclusters_)
|
||
|
self.subclusters_.append(subcluster)
|
||
|
self.init_centroids_[n_samples] = subcluster.centroid_
|
||
|
self.init_sq_norm_[n_samples] = subcluster.sq_norm_
|
||
|
|
||
|
# Keep centroids and squared norm as views. In this way
|
||
|
# if we change init_centroids and init_sq_norm_, it is
|
||
|
# sufficient,
|
||
|
self.centroids_ = self.init_centroids_[: n_samples + 1, :]
|
||
|
self.squared_norm_ = self.init_sq_norm_[: n_samples + 1]
|
||
|
|
||
|
def update_split_subclusters(self, subcluster, new_subcluster1, new_subcluster2):
|
||
|
"""Remove a subcluster from a node and update it with the
|
||
|
split subclusters.
|
||
|
"""
|
||
|
ind = self.subclusters_.index(subcluster)
|
||
|
self.subclusters_[ind] = new_subcluster1
|
||
|
self.init_centroids_[ind] = new_subcluster1.centroid_
|
||
|
self.init_sq_norm_[ind] = new_subcluster1.sq_norm_
|
||
|
self.append_subcluster(new_subcluster2)
|
||
|
|
||
|
def insert_cf_subcluster(self, subcluster):
|
||
|
"""Insert a new subcluster into the node."""
|
||
|
if not self.subclusters_:
|
||
|
self.append_subcluster(subcluster)
|
||
|
return False
|
||
|
|
||
|
threshold = self.threshold
|
||
|
branching_factor = self.branching_factor
|
||
|
# We need to find the closest subcluster among all the
|
||
|
# subclusters so that we can insert our new subcluster.
|
||
|
dist_matrix = np.dot(self.centroids_, subcluster.centroid_)
|
||
|
dist_matrix *= -2.0
|
||
|
dist_matrix += self.squared_norm_
|
||
|
closest_index = np.argmin(dist_matrix)
|
||
|
closest_subcluster = self.subclusters_[closest_index]
|
||
|
|
||
|
# If the subcluster has a child, we need a recursive strategy.
|
||
|
if closest_subcluster.child_ is not None:
|
||
|
split_child = closest_subcluster.child_.insert_cf_subcluster(subcluster)
|
||
|
|
||
|
if not split_child:
|
||
|
# If it is determined that the child need not be split, we
|
||
|
# can just update the closest_subcluster
|
||
|
closest_subcluster.update(subcluster)
|
||
|
self.init_centroids_[closest_index] = self.subclusters_[
|
||
|
closest_index
|
||
|
].centroid_
|
||
|
self.init_sq_norm_[closest_index] = self.subclusters_[
|
||
|
closest_index
|
||
|
].sq_norm_
|
||
|
return False
|
||
|
|
||
|
# things not too good. we need to redistribute the subclusters in
|
||
|
# our child node, and add a new subcluster in the parent
|
||
|
# subcluster to accommodate the new child.
|
||
|
else:
|
||
|
new_subcluster1, new_subcluster2 = _split_node(
|
||
|
closest_subcluster.child_,
|
||
|
threshold,
|
||
|
branching_factor,
|
||
|
)
|
||
|
self.update_split_subclusters(
|
||
|
closest_subcluster, new_subcluster1, new_subcluster2
|
||
|
)
|
||
|
|
||
|
if len(self.subclusters_) > self.branching_factor:
|
||
|
return True
|
||
|
return False
|
||
|
|
||
|
# good to go!
|
||
|
else:
|
||
|
merged = closest_subcluster.merge_subcluster(subcluster, self.threshold)
|
||
|
if merged:
|
||
|
self.init_centroids_[closest_index] = closest_subcluster.centroid_
|
||
|
self.init_sq_norm_[closest_index] = closest_subcluster.sq_norm_
|
||
|
return False
|
||
|
|
||
|
# not close to any other subclusters, and we still
|
||
|
# have space, so add.
|
||
|
elif len(self.subclusters_) < self.branching_factor:
|
||
|
self.append_subcluster(subcluster)
|
||
|
return False
|
||
|
|
||
|
# We do not have enough space nor is it closer to an
|
||
|
# other subcluster. We need to split.
|
||
|
else:
|
||
|
self.append_subcluster(subcluster)
|
||
|
return True
|
||
|
|
||
|
|
||
|
class _CFSubcluster:
|
||
|
"""Each subcluster in a CFNode is called a CFSubcluster.
|
||
|
|
||
|
A CFSubcluster can have a CFNode has its child.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
linear_sum : ndarray of shape (n_features,), default=None
|
||
|
Sample. This is kept optional to allow initialization of empty
|
||
|
subclusters.
|
||
|
|
||
|
Attributes
|
||
|
----------
|
||
|
n_samples_ : int
|
||
|
Number of samples that belong to each subcluster.
|
||
|
|
||
|
linear_sum_ : ndarray
|
||
|
Linear sum of all the samples in a subcluster. Prevents holding
|
||
|
all sample data in memory.
|
||
|
|
||
|
squared_sum_ : float
|
||
|
Sum of the squared l2 norms of all samples belonging to a subcluster.
|
||
|
|
||
|
centroid_ : ndarray of shape (branching_factor + 1, n_features)
|
||
|
Centroid of the subcluster. Prevent recomputing of centroids when
|
||
|
``CFNode.centroids_`` is called.
|
||
|
|
||
|
child_ : _CFNode
|
||
|
Child Node of the subcluster. Once a given _CFNode is set as the child
|
||
|
of the _CFNode, it is set to ``self.child_``.
|
||
|
|
||
|
sq_norm_ : ndarray of shape (branching_factor + 1,)
|
||
|
Squared norm of the subcluster. Used to prevent recomputing when
|
||
|
pairwise minimum distances are computed.
|
||
|
"""
|
||
|
|
||
|
def __init__(self, *, linear_sum=None):
|
||
|
if linear_sum is None:
|
||
|
self.n_samples_ = 0
|
||
|
self.squared_sum_ = 0.0
|
||
|
self.centroid_ = self.linear_sum_ = 0
|
||
|
else:
|
||
|
self.n_samples_ = 1
|
||
|
self.centroid_ = self.linear_sum_ = linear_sum
|
||
|
self.squared_sum_ = self.sq_norm_ = np.dot(
|
||
|
self.linear_sum_, self.linear_sum_
|
||
|
)
|
||
|
self.child_ = None
|
||
|
|
||
|
def update(self, subcluster):
|
||
|
self.n_samples_ += subcluster.n_samples_
|
||
|
self.linear_sum_ += subcluster.linear_sum_
|
||
|
self.squared_sum_ += subcluster.squared_sum_
|
||
|
self.centroid_ = self.linear_sum_ / self.n_samples_
|
||
|
self.sq_norm_ = np.dot(self.centroid_, self.centroid_)
|
||
|
|
||
|
def merge_subcluster(self, nominee_cluster, threshold):
|
||
|
"""Check if a cluster is worthy enough to be merged. If
|
||
|
yes then merge.
|
||
|
"""
|
||
|
new_ss = self.squared_sum_ + nominee_cluster.squared_sum_
|
||
|
new_ls = self.linear_sum_ + nominee_cluster.linear_sum_
|
||
|
new_n = self.n_samples_ + nominee_cluster.n_samples_
|
||
|
new_centroid = (1 / new_n) * new_ls
|
||
|
new_sq_norm = np.dot(new_centroid, new_centroid)
|
||
|
|
||
|
# The squared radius of the cluster is defined:
|
||
|
# r^2 = sum_i ||x_i - c||^2 / n
|
||
|
# with x_i the n points assigned to the cluster and c its centroid:
|
||
|
# c = sum_i x_i / n
|
||
|
# This can be expanded to:
|
||
|
# r^2 = sum_i ||x_i||^2 / n - 2 < sum_i x_i / n, c> + n ||c||^2 / n
|
||
|
# and therefore simplifies to:
|
||
|
# r^2 = sum_i ||x_i||^2 / n - ||c||^2
|
||
|
sq_radius = new_ss / new_n - new_sq_norm
|
||
|
|
||
|
if sq_radius <= threshold**2:
|
||
|
(
|
||
|
self.n_samples_,
|
||
|
self.linear_sum_,
|
||
|
self.squared_sum_,
|
||
|
self.centroid_,
|
||
|
self.sq_norm_,
|
||
|
) = (new_n, new_ls, new_ss, new_centroid, new_sq_norm)
|
||
|
return True
|
||
|
return False
|
||
|
|
||
|
@property
|
||
|
def radius(self):
|
||
|
"""Return radius of the subcluster"""
|
||
|
# Because of numerical issues, this could become negative
|
||
|
sq_radius = self.squared_sum_ / self.n_samples_ - self.sq_norm_
|
||
|
return sqrt(max(0, sq_radius))
|
||
|
|
||
|
|
||
|
class Birch(
|
||
|
ClassNamePrefixFeaturesOutMixin, ClusterMixin, TransformerMixin, BaseEstimator
|
||
|
):
|
||
|
"""Implements the BIRCH clustering algorithm.
|
||
|
|
||
|
It is a memory-efficient, online-learning algorithm provided as an
|
||
|
alternative to :class:`MiniBatchKMeans`. It constructs a tree
|
||
|
data structure with the cluster centroids being read off the leaf.
|
||
|
These can be either the final cluster centroids or can be provided as input
|
||
|
to another clustering algorithm such as :class:`AgglomerativeClustering`.
|
||
|
|
||
|
Read more in the :ref:`User Guide <birch>`.
|
||
|
|
||
|
.. versionadded:: 0.16
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
threshold : float, default=0.5
|
||
|
The radius of the subcluster obtained by merging a new sample and the
|
||
|
closest subcluster should be lesser than the threshold. Otherwise a new
|
||
|
subcluster is started. Setting this value to be very low promotes
|
||
|
splitting and vice-versa.
|
||
|
|
||
|
branching_factor : int, default=50
|
||
|
Maximum number of CF subclusters in each node. If a new samples enters
|
||
|
such that the number of subclusters exceed the branching_factor then
|
||
|
that node is split into two nodes with the subclusters redistributed
|
||
|
in each. The parent subcluster of that node is removed and two new
|
||
|
subclusters are added as parents of the 2 split nodes.
|
||
|
|
||
|
n_clusters : int, instance of sklearn.cluster model or None, default=3
|
||
|
Number of clusters after the final clustering step, which treats the
|
||
|
subclusters from the leaves as new samples.
|
||
|
|
||
|
- `None` : the final clustering step is not performed and the
|
||
|
subclusters are returned as they are.
|
||
|
|
||
|
- :mod:`sklearn.cluster` Estimator : If a model is provided, the model
|
||
|
is fit treating the subclusters as new samples and the initial data
|
||
|
is mapped to the label of the closest subcluster.
|
||
|
|
||
|
- `int` : the model fit is :class:`AgglomerativeClustering` with
|
||
|
`n_clusters` set to be equal to the int.
|
||
|
|
||
|
compute_labels : bool, default=True
|
||
|
Whether or not to compute labels for each fit.
|
||
|
|
||
|
copy : bool, default=True
|
||
|
Whether or not to make a copy of the given data. If set to False,
|
||
|
the initial data will be overwritten.
|
||
|
|
||
|
Attributes
|
||
|
----------
|
||
|
root_ : _CFNode
|
||
|
Root of the CFTree.
|
||
|
|
||
|
dummy_leaf_ : _CFNode
|
||
|
Start pointer to all the leaves.
|
||
|
|
||
|
subcluster_centers_ : ndarray
|
||
|
Centroids of all subclusters read directly from the leaves.
|
||
|
|
||
|
subcluster_labels_ : ndarray
|
||
|
Labels assigned to the centroids of the subclusters after
|
||
|
they are clustered globally.
|
||
|
|
||
|
labels_ : ndarray of shape (n_samples,)
|
||
|
Array of labels assigned to the input data.
|
||
|
if partial_fit is used instead of fit, they are assigned to the
|
||
|
last batch of data.
|
||
|
|
||
|
n_features_in_ : int
|
||
|
Number of features seen during :term:`fit`.
|
||
|
|
||
|
.. versionadded:: 0.24
|
||
|
|
||
|
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||
|
Names of features seen during :term:`fit`. Defined only when `X`
|
||
|
has feature names that are all strings.
|
||
|
|
||
|
.. versionadded:: 1.0
|
||
|
|
||
|
See Also
|
||
|
--------
|
||
|
MiniBatchKMeans : Alternative implementation that does incremental updates
|
||
|
of the centers' positions using mini-batches.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
The tree data structure consists of nodes with each node consisting of
|
||
|
a number of subclusters. The maximum number of subclusters in a node
|
||
|
is determined by the branching factor. Each subcluster maintains a
|
||
|
linear sum, squared sum and the number of samples in that subcluster.
|
||
|
In addition, each subcluster can also have a node as its child, if the
|
||
|
subcluster is not a member of a leaf node.
|
||
|
|
||
|
For a new point entering the root, it is merged with the subcluster closest
|
||
|
to it and the linear sum, squared sum and the number of samples of that
|
||
|
subcluster are updated. This is done recursively till the properties of
|
||
|
the leaf node are updated.
|
||
|
|
||
|
References
|
||
|
----------
|
||
|
* Tian Zhang, Raghu Ramakrishnan, Maron Livny
|
||
|
BIRCH: An efficient data clustering method for large databases.
|
||
|
https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf
|
||
|
|
||
|
* Roberto Perdisci
|
||
|
JBirch - Java implementation of BIRCH clustering algorithm
|
||
|
https://code.google.com/archive/p/jbirch
|
||
|
|
||
|
Examples
|
||
|
--------
|
||
|
>>> from sklearn.cluster import Birch
|
||
|
>>> X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]]
|
||
|
>>> brc = Birch(n_clusters=None)
|
||
|
>>> brc.fit(X)
|
||
|
Birch(n_clusters=None)
|
||
|
>>> brc.predict(X)
|
||
|
array([0, 0, 0, 1, 1, 1])
|
||
|
"""
|
||
|
|
||
|
_parameter_constraints: dict = {
|
||
|
"threshold": [Interval(Real, 0.0, None, closed="neither")],
|
||
|
"branching_factor": [Interval(Integral, 1, None, closed="neither")],
|
||
|
"n_clusters": [None, ClusterMixin, Interval(Integral, 1, None, closed="left")],
|
||
|
"compute_labels": ["boolean"],
|
||
|
"copy": ["boolean"],
|
||
|
}
|
||
|
|
||
|
def __init__(
|
||
|
self,
|
||
|
*,
|
||
|
threshold=0.5,
|
||
|
branching_factor=50,
|
||
|
n_clusters=3,
|
||
|
compute_labels=True,
|
||
|
copy=True,
|
||
|
):
|
||
|
self.threshold = threshold
|
||
|
self.branching_factor = branching_factor
|
||
|
self.n_clusters = n_clusters
|
||
|
self.compute_labels = compute_labels
|
||
|
self.copy = copy
|
||
|
|
||
|
@_fit_context(prefer_skip_nested_validation=True)
|
||
|
def fit(self, X, y=None):
|
||
|
"""
|
||
|
Build a CF Tree for the input data.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||
|
Input data.
|
||
|
|
||
|
y : Ignored
|
||
|
Not used, present here for API consistency by convention.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
self
|
||
|
Fitted estimator.
|
||
|
"""
|
||
|
return self._fit(X, partial=False)
|
||
|
|
||
|
def _fit(self, X, partial):
|
||
|
has_root = getattr(self, "root_", None)
|
||
|
first_call = not (partial and has_root)
|
||
|
|
||
|
X = self._validate_data(
|
||
|
X,
|
||
|
accept_sparse="csr",
|
||
|
copy=self.copy,
|
||
|
reset=first_call,
|
||
|
dtype=[np.float64, np.float32],
|
||
|
)
|
||
|
threshold = self.threshold
|
||
|
branching_factor = self.branching_factor
|
||
|
|
||
|
n_samples, n_features = X.shape
|
||
|
|
||
|
# If partial_fit is called for the first time or fit is called, we
|
||
|
# start a new tree.
|
||
|
if first_call:
|
||
|
# The first root is the leaf. Manipulate this object throughout.
|
||
|
self.root_ = _CFNode(
|
||
|
threshold=threshold,
|
||
|
branching_factor=branching_factor,
|
||
|
is_leaf=True,
|
||
|
n_features=n_features,
|
||
|
dtype=X.dtype,
|
||
|
)
|
||
|
|
||
|
# To enable getting back subclusters.
|
||
|
self.dummy_leaf_ = _CFNode(
|
||
|
threshold=threshold,
|
||
|
branching_factor=branching_factor,
|
||
|
is_leaf=True,
|
||
|
n_features=n_features,
|
||
|
dtype=X.dtype,
|
||
|
)
|
||
|
self.dummy_leaf_.next_leaf_ = self.root_
|
||
|
self.root_.prev_leaf_ = self.dummy_leaf_
|
||
|
|
||
|
# Cannot vectorize. Enough to convince to use cython.
|
||
|
if not sparse.issparse(X):
|
||
|
iter_func = iter
|
||
|
else:
|
||
|
iter_func = _iterate_sparse_X
|
||
|
|
||
|
for sample in iter_func(X):
|
||
|
subcluster = _CFSubcluster(linear_sum=sample)
|
||
|
split = self.root_.insert_cf_subcluster(subcluster)
|
||
|
|
||
|
if split:
|
||
|
new_subcluster1, new_subcluster2 = _split_node(
|
||
|
self.root_, threshold, branching_factor
|
||
|
)
|
||
|
del self.root_
|
||
|
self.root_ = _CFNode(
|
||
|
threshold=threshold,
|
||
|
branching_factor=branching_factor,
|
||
|
is_leaf=False,
|
||
|
n_features=n_features,
|
||
|
dtype=X.dtype,
|
||
|
)
|
||
|
self.root_.append_subcluster(new_subcluster1)
|
||
|
self.root_.append_subcluster(new_subcluster2)
|
||
|
|
||
|
centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()])
|
||
|
self.subcluster_centers_ = centroids
|
||
|
self._n_features_out = self.subcluster_centers_.shape[0]
|
||
|
|
||
|
self._global_clustering(X)
|
||
|
return self
|
||
|
|
||
|
def _get_leaves(self):
|
||
|
"""
|
||
|
Retrieve the leaves of the CF Node.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
leaves : list of shape (n_leaves,)
|
||
|
List of the leaf nodes.
|
||
|
"""
|
||
|
leaf_ptr = self.dummy_leaf_.next_leaf_
|
||
|
leaves = []
|
||
|
while leaf_ptr is not None:
|
||
|
leaves.append(leaf_ptr)
|
||
|
leaf_ptr = leaf_ptr.next_leaf_
|
||
|
return leaves
|
||
|
|
||
|
@_fit_context(prefer_skip_nested_validation=True)
|
||
|
def partial_fit(self, X=None, y=None):
|
||
|
"""
|
||
|
Online learning. Prevents rebuilding of CFTree from scratch.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
X : {array-like, sparse matrix} of shape (n_samples, n_features), \
|
||
|
default=None
|
||
|
Input data. If X is not provided, only the global clustering
|
||
|
step is done.
|
||
|
|
||
|
y : Ignored
|
||
|
Not used, present here for API consistency by convention.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
self
|
||
|
Fitted estimator.
|
||
|
"""
|
||
|
if X is None:
|
||
|
# Perform just the final global clustering step.
|
||
|
self._global_clustering()
|
||
|
return self
|
||
|
else:
|
||
|
return self._fit(X, partial=True)
|
||
|
|
||
|
def _check_fit(self, X):
|
||
|
check_is_fitted(self)
|
||
|
|
||
|
if (
|
||
|
hasattr(self, "subcluster_centers_")
|
||
|
and X.shape[1] != self.subcluster_centers_.shape[1]
|
||
|
):
|
||
|
raise ValueError(
|
||
|
"Training data and predicted data do not have same number of features."
|
||
|
)
|
||
|
|
||
|
def predict(self, X):
|
||
|
"""
|
||
|
Predict data using the ``centroids_`` of subclusters.
|
||
|
|
||
|
Avoid computation of the row norms of X.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||
|
Input data.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
labels : ndarray of shape(n_samples,)
|
||
|
Labelled data.
|
||
|
"""
|
||
|
check_is_fitted(self)
|
||
|
X = self._validate_data(X, accept_sparse="csr", reset=False)
|
||
|
return self._predict(X)
|
||
|
|
||
|
def _predict(self, X):
|
||
|
"""Predict data using the ``centroids_`` of subclusters."""
|
||
|
kwargs = {"Y_norm_squared": self._subcluster_norms}
|
||
|
|
||
|
with config_context(assume_finite=True):
|
||
|
argmin = pairwise_distances_argmin(
|
||
|
X, self.subcluster_centers_, metric_kwargs=kwargs
|
||
|
)
|
||
|
return self.subcluster_labels_[argmin]
|
||
|
|
||
|
def transform(self, X):
|
||
|
"""
|
||
|
Transform X into subcluster centroids dimension.
|
||
|
|
||
|
Each dimension represents the distance from the sample point to each
|
||
|
cluster centroid.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||
|
Input data.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
X_trans : {array-like, sparse matrix} of shape (n_samples, n_clusters)
|
||
|
Transformed data.
|
||
|
"""
|
||
|
check_is_fitted(self)
|
||
|
X = self._validate_data(X, accept_sparse="csr", reset=False)
|
||
|
with config_context(assume_finite=True):
|
||
|
return euclidean_distances(X, self.subcluster_centers_)
|
||
|
|
||
|
def _global_clustering(self, X=None):
|
||
|
"""
|
||
|
Global clustering for the subclusters obtained after fitting
|
||
|
"""
|
||
|
clusterer = self.n_clusters
|
||
|
centroids = self.subcluster_centers_
|
||
|
compute_labels = (X is not None) and self.compute_labels
|
||
|
|
||
|
# Preprocessing for the global clustering.
|
||
|
not_enough_centroids = False
|
||
|
if isinstance(clusterer, Integral):
|
||
|
clusterer = AgglomerativeClustering(n_clusters=self.n_clusters)
|
||
|
# There is no need to perform the global clustering step.
|
||
|
if len(centroids) < self.n_clusters:
|
||
|
not_enough_centroids = True
|
||
|
|
||
|
# To use in predict to avoid recalculation.
|
||
|
self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True)
|
||
|
|
||
|
if clusterer is None or not_enough_centroids:
|
||
|
self.subcluster_labels_ = np.arange(len(centroids))
|
||
|
if not_enough_centroids:
|
||
|
warnings.warn(
|
||
|
"Number of subclusters found (%d) by BIRCH is less "
|
||
|
"than (%d). Decrease the threshold."
|
||
|
% (len(centroids), self.n_clusters),
|
||
|
ConvergenceWarning,
|
||
|
)
|
||
|
else:
|
||
|
# The global clustering step that clusters the subclusters of
|
||
|
# the leaves. It assumes the centroids of the subclusters as
|
||
|
# samples and finds the final centroids.
|
||
|
self.subcluster_labels_ = clusterer.fit_predict(self.subcluster_centers_)
|
||
|
|
||
|
if compute_labels:
|
||
|
self.labels_ = self._predict(X)
|
||
|
|
||
|
def _more_tags(self):
|
||
|
return {"preserves_dtype": [np.float64, np.float32]}
|