# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Clustering Operations."""

from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import random_seed as random_seed_ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import cond
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import gen_clustering_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_impl
from tensorflow.python.ops import random_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variable_v1
from tensorflow.python.ops import while_loop
from tensorflow.python.ops.embedding_ops import embedding_lookup
# go/tf-wildcard-import
# pylint: disable=wildcard-import
from tensorflow.python.ops.gen_clustering_ops import *
# pylint: enable=wildcard-import

# Euclidean distance between vectors U and V is defined as \\(||U - V||_F\\),
# the square root of the sum of the squared differences of their elements.
# The squared variant below omits the square root.
SQUARED_EUCLIDEAN_DISTANCE = 'squared_euclidean'
# Cosine distance between vectors U and V is defined as
# \\(1 - (U \cdot V) / (||U||_F ||V||_F)\\)
COSINE_DISTANCE = 'cosine'

RANDOM_INIT = 'random'
KMEANS_PLUS_PLUS_INIT = 'kmeans_plus_plus'
KMC2_INIT = 'kmc2'

CLUSTERS_VAR_NAME = 'clusters'


class KMeans:
  """Creates the graph for k-means clustering."""

  def __init__(self,
               inputs,
               num_clusters,
               initial_clusters=RANDOM_INIT,
               distance_metric=SQUARED_EUCLIDEAN_DISTANCE,
               use_mini_batch=False,
               mini_batch_steps_per_iteration=1,
               random_seed=0,
               kmeans_plus_plus_num_retries=2,
               kmc2_chain_length=200):
    """Creates an object for generating the KMeans clustering graph.

    This class implements the following variants of the k-means algorithm:

    If use_mini_batch is False, it runs standard full-batch k-means. Each step
    runs a single iteration of k-means. This step can be run sharded across
    multiple workers by passing a list of sharded inputs to this class. Note
    however that a single step needs to process the full input at once.

    If use_mini_batch is True, it runs a generalization of the mini-batch
    k-means algorithm. It runs multiple iterations, where each iteration is
    composed of mini_batch_steps_per_iteration steps. Two copies of cluster
    centers are maintained: one that is updated at the end of each iteration,
    and one that is updated every step. The first copy is used to compute
    cluster allocations for each step, and for inference, while the second
    copy is the one updated each step using the mini-batch update rule. After
    each iteration is complete, this second copy is copied back to the first
    copy.

    Note that for use_mini_batch=True, when mini_batch_steps_per_iteration=1,
    the algorithm reduces to the standard mini-batch algorithm. Also, by
    setting mini_batch_steps_per_iteration = num_inputs / batch_size, the
    algorithm becomes an asynchronous version of the full-batch algorithm.
    Note however that there is no guarantee by this implementation that each
    input is seen exactly once per iteration. Also, different updates are
    applied asynchronously without locking. So this asynchronous version may
    not behave exactly like a full-batch version.

    Args:
      inputs: An input tensor or list of input tensors. It is assumed that the
        data points have been previously randomly permuted.
      num_clusters: An integer tensor specifying the number of clusters. This
        argument is ignored if initial_clusters is a tensor or numpy array.
      initial_clusters: Specifies the clusters used during initialization. One
        of the following:
        - a tensor or numpy array with the initial cluster centers.
        - a function f(inputs, k) that returns up to k centers from `inputs`.
        - "random": Choose centers randomly from `inputs`.
        - "kmeans_plus_plus": Use kmeans++ to choose centers from `inputs`.
        - "kmc2": Use the fast k-MC2 algorithm to choose centers from
          `inputs`.
        In the last three cases, one batch of `inputs` may not yield
        `num_clusters` centers, in which case initialization will require
        multiple batches until enough centers are chosen. In the case of
        "random" or "kmeans_plus_plus", if the input size is <= `num_clusters`
        then the entire batch is chosen to be cluster centers.
      distance_metric: Distance metric used for clustering. Supported options:
        "squared_euclidean", "cosine".
      use_mini_batch: If true, use the mini-batch k-means algorithm. Else
        assume full batch.
      mini_batch_steps_per_iteration: Number of steps after which the updated
        cluster centers are synced back to a master copy.
      random_seed: Seed for PRNG used to initialize seeds.
      kmeans_plus_plus_num_retries: For each point that is sampled during
        kmeans++ initialization, this parameter specifies the number of
        additional points to draw from the current distribution before
        selecting the best. If a negative value is specified, a heuristic is
        used to sample O(log(num_to_sample)) additional points.
      kmc2_chain_length: Determines how many candidate points are used by the
        k-MC2 algorithm to produce one new cluster center. If a (mini-)batch
        contains fewer points, one new cluster center is generated from the
        (mini-)batch.

    Raises:
      ValueError: An invalid argument was passed to initial_clusters or
        distance_metric.
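
    Example (a minimal construction sketch, not part of the original
    docstring; assumes `points` is a float32 Tensor of shape
    [batch_size, dims] from a randomly shuffled dataset of 10000 points read
    in batches of 100):

      kmeans = KMeans(
          points,
          num_clusters=8,
          use_mini_batch=True,
          # With 10000 / 100 steps per iteration, the algorithm approximates
          # asynchronous full-batch k-means, per the note above.
          mini_batch_steps_per_iteration=100)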
    """
    initialization_algorithms = [RANDOM_INIT, KMEANS_PLUS_PLUS_INIT, KMC2_INIT]
    if isinstance(initial_clusters,
                  str) and initial_clusters not in initialization_algorithms:
      raise ValueError(
          f'Unsupported initialization algorithm `{initial_clusters}`, '
          f'must be one of `{initialization_algorithms}`.')

    distance_metrics = [SQUARED_EUCLIDEAN_DISTANCE, COSINE_DISTANCE]
    if distance_metric not in distance_metrics:
      raise ValueError(f'Unsupported distance metric `{distance_metric}`, '
                       f'must be one of `{distance_metrics}`.')
    self._inputs = inputs if isinstance(inputs, list) else [inputs]
    self._num_clusters = num_clusters
    self._initial_clusters = initial_clusters
    self._distance_metric = distance_metric
    self._use_mini_batch = use_mini_batch
    self._mini_batch_steps_per_iteration = int(mini_batch_steps_per_iteration)
    self._seed = random_seed_ops.get_seed(random_seed)[0]
    self._kmeans_plus_plus_num_retries = kmeans_plus_plus_num_retries
    self._kmc2_chain_length = kmc2_chain_length

  @classmethod
  def _distance_graph(cls, inputs, clusters, distance_metric):
    """Computes distance between each input and each cluster center.

    Args:
      inputs: list of input Tensors.
      clusters: cluster Tensor.
      distance_metric: distance metric used for clustering.

    Returns:
      list of Tensors, where each element corresponds to each element in
      inputs. The value is the distance of each row to all the cluster
      centers. Currently only Euclidean distance and cosine distance are
      supported.
    """
    assert isinstance(inputs, list)
    if distance_metric == SQUARED_EUCLIDEAN_DISTANCE:
      return cls._compute_euclidean_distance(inputs, clusters)
    elif distance_metric == COSINE_DISTANCE:
      return cls._compute_cosine_distance(
          inputs, clusters, inputs_normalized=True)
    else:
      assert False, str(distance_metric)

  @classmethod
  def _compute_euclidean_distance(cls, inputs, clusters):
    """Computes Euclidean distance between each input and each cluster center.

    Args:
      inputs: list of input Tensors.
      clusters: cluster Tensor.

    Returns:
      list of Tensors, where each element corresponds to each element in
      inputs. The value is the distance of each row to all the cluster
      centers.
    """
    output = []
    for inp in inputs:
      with ops.colocate_with(inp, ignore_existing=True):
        # Computes Euclidean distance. Note the first and third terms are
        # broadcast additions.
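        # A sketch of the algebra behind the three terms below: for each row x
        # of inp and each center c,
        # \\(||x - c||^2 = ||x||^2 - 2 x \cdot c + ||c||^2\\),
        # and the matmul evaluates the cross term for all (row, center) pairs.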
        squared_distance = (
            math_ops.reduce_sum(math_ops.square(inp), 1, keepdims=True) -
            2 * math_ops.matmul(inp, clusters, transpose_b=True) +
            array_ops.transpose(
                math_ops.reduce_sum(
                    math_ops.square(clusters), 1, keepdims=True)))
        output.append(squared_distance)

    return output

  @classmethod
  def _compute_cosine_distance(cls, inputs, clusters, inputs_normalized=True):
    """Computes cosine distance between each input and each cluster center.

    Args:
      inputs: list of input Tensors.
      clusters: cluster Tensor.
      inputs_normalized: if True, it assumes that inputs and clusters are
        normalized, so the cosine distance reduces to 1 minus the dot
        product. Else it L2-normalizes the inputs first.

    Returns:
      list of Tensors, where each element corresponds to each element in
      inputs. The value is the distance of each row to all the cluster
      centers.
    """
    output = []
    if not inputs_normalized:
      with ops.colocate_with(clusters, ignore_existing=True):
        clusters = nn_impl.l2_normalize(clusters, axis=1)
    for inp in inputs:
      with ops.colocate_with(inp, ignore_existing=True):
        if not inputs_normalized:
          inp = nn_impl.l2_normalize(inp, axis=1)
        output.append(1 - math_ops.matmul(inp, clusters, transpose_b=True))
    return output

  def _infer_graph(self, inputs, clusters):
    """Maps input to closest cluster and the score.

    Args:
      inputs: list of input Tensors.
      clusters: Tensor of cluster centers.

    Returns:
      List of tuples, where each value in the tuple corresponds to a value in
      inputs. The tuple has the following three elements:
      all_scores: distance of each input to each cluster center.
      score: distance of each input to closest cluster center.
      cluster_idx: index of cluster center closest to the corresponding input.
    """
    assert isinstance(inputs, list)
    # Pairwise distances are used only by transform(). In all other cases,
    # this sub-graph is not evaluated.
    scores = self._distance_graph(inputs, clusters, self._distance_metric)
    output = []
    if (self._distance_metric == COSINE_DISTANCE and
        not self._clusters_l2_normalized()):
      # The cosine distance between normalized vectors x and y is half the
      # squared Euclidean distance between them. We are using this fact and
      # reusing the nearest_neighbors op.
      # TODO(ands): Support COSINE distance in nearest_neighbors and remove
      # this.
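      # Concretely, for unit vectors x and y,
      # \\(||x - y||^2 = 2 - 2 x \cdot y = 2 (1 - x \cdot y)\\),
      # which is why the squared Euclidean distances returned by
      # nearest_neighbors are halved below.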
      with ops.colocate_with(clusters, ignore_existing=True):
        clusters = nn_impl.l2_normalize(clusters, axis=1)
    for inp, score in zip(inputs, scores):
      with ops.colocate_with(inp, ignore_existing=True):
        (indices,
         distances) = gen_clustering_ops.nearest_neighbors(inp, clusters, 1)
        if self._distance_metric == COSINE_DISTANCE:
          distances *= 0.5
        output.append(
            (score, array_ops.squeeze(distances,
                                      [-1]), array_ops.squeeze(indices, [-1])))
    return zip(*output)

  def _clusters_l2_normalized(self):
    """Returns True if cluster centers are kept normalized."""
    return (self._distance_metric == COSINE_DISTANCE and
            (not self._use_mini_batch or
             self._mini_batch_steps_per_iteration > 1))

  def _create_variables(self, num_clusters):
    """Creates variables.

    Args:
      num_clusters: an integer Tensor providing the number of clusters.

    Returns:
      Tuple with following elements:
      - cluster_centers: a Tensor for storing cluster centers.
      - cluster_centers_initialized: bool Variable indicating whether clusters
        are initialized.
      - cluster_counts: a Tensor for storing counts of points assigned to each
        cluster. This is used by mini-batch training.
      - cluster_centers_updated: Tensor representing the copy of cluster
        centers that is updated every step.
      - update_in_steps: number of steps left before we sync
        cluster_centers_updated back to cluster_centers.
    """
    init_value = array_ops.placeholder_with_default([], shape=None)
    cluster_centers = variable_v1.VariableV1(
        init_value, name=CLUSTERS_VAR_NAME, validate_shape=False)
    cluster_centers_initialized = variable_v1.VariableV1(
        False, dtype=dtypes.bool, name='initialized')

    if self._use_mini_batch and self._mini_batch_steps_per_iteration > 1:
      # Copy of cluster centers actively updated each step according to
      # mini-batch update rule.
      cluster_centers_updated = variable_v1.VariableV1(
          init_value, name='clusters_updated', validate_shape=False)
      # How many steps till we copy the updated clusters to cluster_centers.
      update_in_steps = variable_v1.VariableV1(
          self._mini_batch_steps_per_iteration,
          dtype=dtypes.int64,
          name='update_in_steps')
      # Count of points assigned to cluster_centers_updated.
      cluster_counts = variable_v1.VariableV1(
          array_ops.zeros([num_clusters], dtype=dtypes.int64))
    else:
      cluster_centers_updated = cluster_centers
      update_in_steps = None
      cluster_counts = (
          variable_v1.VariableV1(
              array_ops.ones([num_clusters], dtype=dtypes.int64))
          if self._use_mini_batch else None)
    return (cluster_centers, cluster_centers_initialized, cluster_counts,
            cluster_centers_updated, update_in_steps)

  @classmethod
  def _l2_normalize_data(cls, inputs):
    """L2-normalizes the input data."""
    output = []
    for inp in inputs:
      with ops.colocate_with(inp, ignore_existing=True):
        output.append(nn_impl.l2_normalize(inp, dim=1))
    return output

  def training_graph(self):
    """Generates a training graph for the k-means algorithm.

    This returns, among other things, an op that chooses initial centers
    (init_op), a boolean variable that is set to True when the initial centers
    are chosen (cluster_centers_initialized), and an op to perform either an
    entire Lloyd iteration or a mini-batch of a Lloyd iteration (training_op).
    The caller should use these components as follows. A single worker should
    execute init_op multiple times until cluster_centers_initialized becomes
    True. Then multiple workers may execute training_op any number of times.

    Returns:
      A tuple consisting of:
      all_scores: A matrix (or list of matrices) of dimensions (num_input,
        num_clusters) where the value is the distance of an input vector and a
        cluster center.
      cluster_idx: A vector (or list of vectors). Each element in the vector
        corresponds to an input row in 'inp' and specifies the cluster id
        corresponding to the input.
      scores: Similar to cluster_idx but specifies the distance to the
        assigned cluster instead.
      cluster_centers_initialized: scalar indicating whether clusters have
        been initialized.
      init_op: an op to initialize the clusters.
      training_op: an op that runs an iteration of training.
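
    Example (a minimal sketch of the protocol above; assumes `tf` is
    `tensorflow.compat.v1` and `kmeans` is a KMeans instance built in the
    default graph):

      (_, cluster_idx, _, initialized, init_op,
       training_op) = kmeans.training_graph()
      with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # init_op may need several runs before enough centers are chosen.
        while not sess.run(initialized):
          sess.run(init_op)
        for _ in range(100):
          sess.run(training_op)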
    """
    # Implementation of kmeans.
    if (isinstance(self._initial_clusters, str) or
        callable(self._initial_clusters)):
      initial_clusters = self._initial_clusters
      num_clusters = ops.convert_to_tensor(self._num_clusters)
    else:
      initial_clusters = ops.convert_to_tensor(self._initial_clusters)
      num_clusters = array_ops.shape(initial_clusters)[0]

    inputs = self._inputs
    (cluster_centers_var, cluster_centers_initialized, total_counts,
     cluster_centers_updated,
     update_in_steps) = self._create_variables(num_clusters)
    init_op = _InitializeClustersOpFactory(
        self._inputs, num_clusters, initial_clusters, self._distance_metric,
        self._seed, self._kmeans_plus_plus_num_retries, self._kmc2_chain_length,
        cluster_centers_var, cluster_centers_updated,
        cluster_centers_initialized).op()
    cluster_centers = cluster_centers_var

    if self._distance_metric == COSINE_DISTANCE:
      inputs = self._l2_normalize_data(inputs)
      if not self._clusters_l2_normalized():
        cluster_centers = nn_impl.l2_normalize(cluster_centers, dim=1)

    all_scores, scores, cluster_idx = self._infer_graph(inputs, cluster_centers)
    if self._use_mini_batch:
      sync_updates_op = self._mini_batch_sync_updates_op(
          update_in_steps, cluster_centers_var, cluster_centers_updated,
          total_counts)
      assert sync_updates_op is not None
      with ops.control_dependencies([sync_updates_op]):
        training_op = self._mini_batch_training_op(inputs, cluster_idx,
                                                   cluster_centers_updated,
                                                   total_counts)
    else:
      assert cluster_centers == cluster_centers_var
      training_op = self._full_batch_training_op(inputs, num_clusters,
                                                 cluster_idx,
                                                 cluster_centers_var)

    return (all_scores, cluster_idx, scores, cluster_centers_initialized,
            init_op, training_op)

  def _mini_batch_sync_updates_op(self, update_in_steps, cluster_centers_var,
                                  cluster_centers_updated, total_counts):
    if self._use_mini_batch and self._mini_batch_steps_per_iteration > 1:
      assert update_in_steps is not None
      with ops.colocate_with(update_in_steps, ignore_existing=True):

        def _f():
          # Note that there is a race condition here, so we do a best-effort
          # update here. We reset update_in_steps first so that other workers
          # don't duplicate the updates. Also we update cluster_center_vars
          # before resetting total_counts to avoid large updates to
          # cluster_centers_updated based on partially updated
          # cluster_center_vars.
          with ops.control_dependencies([
              state_ops.assign(update_in_steps,
                               self._mini_batch_steps_per_iteration - 1)
          ]):
            with ops.colocate_with(
                cluster_centers_updated, ignore_existing=True):
              if self._distance_metric == COSINE_DISTANCE:
                cluster_centers = nn_impl.l2_normalize(
                    cluster_centers_updated, dim=1)
              else:
                cluster_centers = cluster_centers_updated
            with ops.colocate_with(cluster_centers_var, ignore_existing=True):
              with ops.control_dependencies(
                  [state_ops.assign(cluster_centers_var, cluster_centers)]):
                with ops.colocate_with(None, ignore_existing=True):
                  with ops.control_dependencies([
                      state_ops.assign(total_counts,
                                       array_ops.zeros_like(total_counts))
                  ]):
                    return array_ops.identity(update_in_steps)

        return cond.cond(
            update_in_steps <= 0, _f,
            lambda: state_ops.assign_sub(update_in_steps, 1))
    else:
      return control_flow_ops.no_op()

  def _mini_batch_training_op(self, inputs, cluster_idx_list, cluster_centers,
                              total_counts):
    """Creates an op for training for the mini-batch case.

    Args:
      inputs: list of input Tensors.
      cluster_idx_list: A vector (or list of vectors). Each element in the
        vector corresponds to an input row in 'inp' and specifies the cluster
        id corresponding to the input.
      cluster_centers: Tensor Ref of cluster centers.
      total_counts: Tensor Ref of cluster counts.

    Returns:
      An op for doing an update of mini-batch k-means.
    """
    update_ops = []
    for inp, cluster_idx in zip(inputs, cluster_idx_list):
      with ops.colocate_with(inp, ignore_existing=True):
        assert total_counts is not None
        cluster_idx = array_ops.reshape(cluster_idx, [-1])
        # Dedupe the unique ids of cluster_centers being updated so that
        # updates can be locally aggregated.
        unique_ids, unique_idx = array_ops.unique(cluster_idx)
        num_unique_cluster_idx = array_ops.size(unique_ids)
        # Fetch the old values of counts and cluster_centers.
        with ops.colocate_with(total_counts, ignore_existing=True):
          old_counts = array_ops.gather(total_counts, unique_ids)
        # TODO(agarwal): This colocation seems to run into problems. Fix it.
        with ops.colocate_with(cluster_centers, ignore_existing=True):
          old_cluster_centers = array_ops.gather(cluster_centers, unique_ids)
        # Locally aggregate the increment to counts.
        count_updates = math_ops.unsorted_segment_sum(
            array_ops.ones_like(unique_idx, dtype=total_counts.dtype),
            unique_idx, num_unique_cluster_idx)
        # Locally compute the sum of inputs mapped to each id.
        # For a cluster with old cluster value x, old count n, and with data
        # d_1,...,d_k newly assigned to it, we recompute the new value as
        # \\(x += (sum_i(d_i) - k * x) / (n + k)\\).
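        # This is the incremental form of a running mean: expanding the update
        # gives \\(x_{new} = (n * x + sum_i(d_i)) / (n + k)\\), so each center
        # stays the mean of all points ever assigned to it.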
        # Compute \\(sum_i(d_i)\\), see comment above.
        cluster_center_updates = math_ops.unsorted_segment_sum(
            inp, unique_idx, num_unique_cluster_idx)
        # Shape to enable broadcasting count_updates and learning_rate to inp.
        # It extends the shape with 1's to match the rank of inp.
        broadcast_shape = array_ops.concat([
            array_ops.reshape(num_unique_cluster_idx, [1]),
            array_ops.ones(
                array_ops.reshape(array_ops.rank(inp) - 1, [1]),
                dtype=dtypes.int32)
        ], 0)
        # Subtract k * x, see comment above.
        cluster_center_updates -= math_ops.cast(
            array_ops.reshape(count_updates, broadcast_shape),
            inp.dtype) * old_cluster_centers
        learning_rate = math_ops.reciprocal(
            math_ops.cast(old_counts + count_updates, inp.dtype))
        learning_rate = array_ops.reshape(learning_rate, broadcast_shape)
        # scale by 1 / (n + k), see comment above.
        cluster_center_updates *= learning_rate
        # Apply the updates.
        update_counts = state_ops.scatter_add(total_counts, unique_ids,
                                              count_updates)
        update_cluster_centers = state_ops.scatter_add(cluster_centers,
                                                       unique_ids,
                                                       cluster_center_updates)
        update_ops.extend([update_counts, update_cluster_centers])
    return control_flow_ops.group(*update_ops)

  def _full_batch_training_op(self, inputs, num_clusters, cluster_idx_list,
                              cluster_centers):
    """Creates an op for training for the full-batch case.

    Args:
      inputs: list of input Tensors.
      num_clusters: an integer Tensor providing the number of clusters.
      cluster_idx_list: A vector (or list of vectors). Each element in the
        vector corresponds to an input row in 'inp' and specifies the cluster
        id corresponding to the input.
      cluster_centers: Tensor Ref of cluster centers.

    Returns:
      An op for doing an update of full-batch k-means.
    """
    cluster_sums = []
    cluster_counts = []
    epsilon = constant_op.constant(1e-6, dtype=inputs[0].dtype)
    for inp, cluster_idx in zip(inputs, cluster_idx_list):
      with ops.colocate_with(inp, ignore_existing=True):
        cluster_sums.append(
            math_ops.unsorted_segment_sum(inp, cluster_idx, num_clusters))
        cluster_counts.append(
            math_ops.unsorted_segment_sum(
                array_ops.reshape(
                    array_ops.ones(
                        array_ops.reshape(array_ops.shape(inp)[0], [-1])),
                    [-1, 1]), cluster_idx, num_clusters))
    with ops.colocate_with(cluster_centers, ignore_existing=True):
      new_clusters_centers = math_ops.add_n(cluster_sums) / (
          math_ops.cast(math_ops.add_n(cluster_counts), cluster_sums[0].dtype)
          + epsilon)
      if self._clusters_l2_normalized():
        new_clusters_centers = nn_impl.l2_normalize(new_clusters_centers,
                                                    dim=1)
    return state_ops.assign(cluster_centers, new_clusters_centers)


class _InitializeClustersOpFactory:
  """Internal class to create the op to initialize the clusters.

  The op performs this algorithm (see constructor args):

    num_remaining = num_clusters - length(cluster_centers)
    if num_remaining == 0:
      assert that cluster_centers_initialized is true
    else:
      assert that num_remaining > 0
      new_centers = choose up to num_remaining initial centers
      l2-normalize new_centers if using cosine distance
      all_centers = concat(cluster_centers, new_centers)
      cluster_centers := all_centers
      if there is a cluster_centers_updated variable:
        cluster_centers_updated := cluster_centers
      num_now_remaining = num_clusters - length(cluster_centers)
      if num_now_remaining == 0:
        cluster_centers_initialized := true
  """

  # TODO(ccolby): Refactor this class so that kmc2 isn't so much a special
  # case.

  def __init__(self, inputs, num_clusters, initial_clusters, distance_metric,
               random_seed, kmeans_plus_plus_num_retries, kmc2_chain_length,
               cluster_centers, cluster_centers_updated,
               cluster_centers_initialized):
    """Creates an op factory.

    Args:
      inputs: See KMeans constructor.
      num_clusters: An integer Tensor providing the number of clusters.
      initial_clusters: See KMeans constructor.
      distance_metric: See KMeans constructor.
      random_seed: See KMeans constructor.
      kmeans_plus_plus_num_retries: See KMeans constructor.
      kmc2_chain_length: See KMeans constructor.
      cluster_centers: The TF variable holding the initial centers. It may
        already contain some centers when the op is executed.
      cluster_centers_updated: A second TF variable to hold a copy of the
        initial centers, used for full-batch mode. In mini-batch mode,
        cluster_centers_updated is the same variable as cluster_centers.
      cluster_centers_initialized: A boolean TF variable that will be set to
        true when all the initial centers have been chosen.
    """
    # All of these instance variables are constants.
    self._inputs = inputs
    self._num_clusters = num_clusters
    self._initial_clusters = initial_clusters
    self._distance_metric = distance_metric
    self._seed = random_seed
    self._kmeans_plus_plus_num_retries = kmeans_plus_plus_num_retries
    self._kmc2_chain_length = kmc2_chain_length
    self._cluster_centers = cluster_centers
    self._cluster_centers_updated = cluster_centers_updated
    self._cluster_centers_initialized = cluster_centers_initialized

    self._num_selected = array_ops.shape(self._cluster_centers)[0]
    self._num_remaining = self._num_clusters - self._num_selected
    self._num_data = math_ops.add_n(
        [array_ops.shape(i)[0] for i in self._inputs])

  def _random(self):
    indices = random_ops.random_uniform(
        array_ops.reshape(self._num_remaining, [-1]),
        minval=0,
        maxval=math_ops.cast(self._num_data, dtypes.int64),
        seed=self._seed,
        dtype=dtypes.int64)
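    # A note on the lookup below (an observation, not from the original
    # comments): with partition_strategy='div', embedding_lookup treats the
    # list of input shards as one logically concatenated table, so each
    # sampled index in [0, num_data) selects a row from some shard.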
    return embedding_lookup(self._inputs, indices, partition_strategy='div')

  def _kmeans_plus_plus(self):
    # Points from only the first shard are used for initializing centers.
    # TODO(ands): Use all points.
    inp = self._inputs[0]
    if self._distance_metric == COSINE_DISTANCE:
      inp = nn_impl.l2_normalize(inp, dim=1)
    return gen_clustering_ops.kmeans_plus_plus_initialization(
        inp, math_ops.cast(self._num_remaining, dtypes.int64), self._seed,
        self._kmeans_plus_plus_num_retries)

  def _kmc2_multiple_centers(self):
    """Adds new initial cluster centers using the k-MC2 algorithm.

    In each call to the op, the provided batch is split into subsets based on
    the specified `kmc2_chain_length`. On each subset, a single Markov chain
    of the k-MC2 algorithm is used to add *one* new cluster center. If there
    are fewer than `kmc2_chain_length` points in the subset, a single center
    is added using one Markov chain on the full input. It is assumed that the
    provided batch has previously been randomly permuted. Otherwise, k-MC2
    may return suboptimal centers.

    Returns:
      An op that adds new cluster centers.
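
    Example (a hypothetical walk-through of the subset arithmetic): with a
    batch of 1000 points and kmc2_chain_length = 200, the batch is split into
    five subsets, so at most five new centers are added per call, one per
    200-point subset.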
    """
    # The op only operates on the first shard of data.
    first_shard = self._inputs[0]
    # Number of points in the input that can be used.
    batch_size = array_ops.shape(first_shard)[0]
    # Maximum number of subsets such that the size of each subset is at least
    # `kmc2_chain_length`. Final subsets may be larger.
    max_to_sample = math_ops.cast(
        batch_size / self._kmc2_chain_length, dtype=dtypes.int32)
    # We sample at least one new center and at most all remaining centers.
    num_to_sample = math_ops.maximum(
        math_ops.minimum(self._num_remaining, max_to_sample), 1)

    def _cond(i, _):
      """Stopping condition for the while loop."""
      return math_ops.less(i, num_to_sample)

    def _body(i, _):
      """Body that adds a single new center based on a subset."""

      def _sample_random():
        """Returns a random point as a cluster center."""
        # By assumption the batch is reshuffled and _sample_random is always
        # called for i=0. Hence, we simply return the first point.
        new_center = array_ops.reshape(first_shard[0], [1, -1])
        if self._distance_metric == COSINE_DISTANCE:
          new_center = nn_impl.l2_normalize(new_center, dim=1)
        return new_center

      def _sample_kmc2_chain():
        """Returns previous centers as well as a new center sampled using k-MC2."""
        # Extract the subset from the underlying batch.
        start = i * self._kmc2_chain_length
        end = start + self._kmc2_chain_length
        subset = first_shard[start:end]
        # Compute the distances from points in the subset to previous centers.
        _, distances = gen_clustering_ops.nearest_neighbors(
            subset, self._cluster_centers, 1)
        # Sample index of new center using k-MC2 Markov chain.
        new_center_index = gen_clustering_ops.kmc2_chain_initialization(
            array_ops.squeeze(distances), self._seed)
        # Extract actual new center.
        newly_sampled_center = array_ops.reshape(subset[new_center_index],
                                                 [1, -1])
        # Return concatenation with previously sampled centers.
        if self._distance_metric == COSINE_DISTANCE:
          newly_sampled_center = nn_impl.l2_normalize(
              newly_sampled_center, dim=1)
        return array_ops.concat([self._cluster_centers, newly_sampled_center],
                                0)

      # Obtain a random point if there are no previously sampled centers.
      # Otherwise, construct a k-MC2 Markov chain.
      new_centers = cond.cond(
          math_ops.equal(self._num_selected, 0), _sample_random,
          _sample_kmc2_chain)
      # Assign new cluster centers to underlying variable.
      assigned_centers = state_ops.assign(
          self._cluster_centers, new_centers, validate_shape=False)
      if self._cluster_centers_updated is not self._cluster_centers:
        assigned_centers = state_ops.assign(
            self._cluster_centers_updated,
            assigned_centers,
            validate_shape=False)
      return i + 1, self._num_clusters - array_ops.shape(assigned_centers)[0]

    # Add num_to_sample new data points.
    _, num_remaining = while_loop.while_loop(_cond, _body, [0, 0])
    return num_remaining

  def _greedy_batch_sampler(self, sampler):
    # If the input dataset size is smaller than the number of centers
    # remaining, choose the entire input dataset as centers. This can happen
    # with mini-batch. Otherwise, sample the batch according to the provided
    # sampler.
    return cond.cond(self._num_data <= self._num_remaining,
                     lambda: array_ops.concat(self._inputs, 0),
                     sampler)

  def _single_batch_sampler(self, sampler):
    # Enforce that there are at least as many data points as centers
    # remaining. This gives the provided sampler the chance to select all
    # remaining centers from a single batch.
    with ops.control_dependencies(
        [check_ops.assert_greater_equal(self._num_data, self._num_remaining)]):
      return sampler()

  def _choose_initial_centers(self):
    if isinstance(self._initial_clusters, str):
      if self._initial_clusters == RANDOM_INIT:
        return self._greedy_batch_sampler(self._random)
      else:  # self._initial_clusters == KMEANS_PLUS_PLUS_INIT
        return self._single_batch_sampler(self._kmeans_plus_plus)
    elif callable(self._initial_clusters):
      return self._initial_clusters(self._inputs, self._num_remaining)
    else:
      with ops.control_dependencies([
          check_ops.assert_equal(self._num_remaining,
                                 array_ops.shape(self._initial_clusters)[0])
      ]):
        return self._initial_clusters

  def _add_new_centers(self):
    """Adds some centers and returns the number of centers remaining."""
    new_centers = self._choose_initial_centers()
    if self._distance_metric == COSINE_DISTANCE:
      new_centers = nn_impl.l2_normalize(new_centers, dim=1)
    # If cluster_centers is empty, it doesn't have the right shape for concat.
    all_centers = cond.cond(
        math_ops.equal(self._num_selected, 0), lambda: new_centers,
        lambda: array_ops.concat([self._cluster_centers, new_centers], 0))
    # TODO(ccolby): De-dupe all_centers?
    a = state_ops.assign(
        self._cluster_centers, all_centers, validate_shape=False)
    if self._cluster_centers_updated is not self._cluster_centers:
      a = state_ops.assign(
          self._cluster_centers_updated, a, validate_shape=False)
    return self._num_clusters - array_ops.shape(a)[0]

  def _initialize(self):
    with ops.control_dependencies([
        check_ops.assert_positive(self._num_remaining),
    ]):
      if self._initial_clusters == KMC2_INIT:
        num_now_remaining = self._kmc2_multiple_centers()
      else:
        num_now_remaining = self._add_new_centers()
      return cond.cond(
          math_ops.equal(num_now_remaining, 0),
          lambda: state_ops.assign(self._cluster_centers_initialized, True),
          control_flow_ops.no_op)

  def op(self):
    """Returns the cluster initializer op."""
    return cond.cond(
        math_ops.equal(self._num_remaining, 0),
        lambda: check_ops.assert_equal(self._cluster_centers_initialized, True),
        self._initialize)