# Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Tests for tf.keras models using tf.distribute.Strategy.""" import os import numpy as np import tensorflow.compat.v2 as tf from absl.testing import parameterized import keras from keras import backend from keras.distribute import distributed_training_utils from keras.distribute import distributed_training_utils_v1 from keras.distribute import multi_worker_testing_utils from keras.distribute import optimizer_combinations from keras.distribute.strategy_combinations import all_strategies from keras.distribute.strategy_combinations import ( multi_worker_mirrored_strategies, ) from keras.distribute.strategy_combinations import ( strategies_minus_default_minus_tpu, ) from keras.distribute.strategy_combinations import strategies_minus_tpu from keras.distribute.strategy_combinations import tpu_strategies from keras.engine import base_layer_utils from keras.mixed_precision import policy from keras.optimizers import optimizer as optimizer_base from keras.optimizers.legacy import gradient_descent as gradient_descent_keras from keras.testing_infra import test_utils from keras.utils import losses_utils from keras.utils import np_utils # isort: off from tensorflow.python.distribute.cluster_resolver import ( SimpleClusterResolver, ) _RANDOM_SEED = 1337 _TRAIN_SIZE = 200 _INPUT_SIZE = (10,) _NUM_CLASS = 2 # Note: Please make sure the tests in this file are also covered in # keras_backward_compat_test for features that are supported with both APIs. # TODO(anjalisridhar): Add a decorator that will allow us to run these tests as # part of the tf.keras unit tests suite. def simple_sequential_model(): model = keras.models.Sequential() model.add( keras.layers.Dense(16, activation="relu", input_shape=_INPUT_SIZE) ) model.add(keras.layers.Dropout(0.1)) model.add(keras.layers.Dense(_NUM_CLASS, activation="softmax")) return model def simple_subclassed_model(num_labels=_NUM_CLASS): class _SimpleMLP(keras.Model): def __init__(self, num_labels): super().__init__() self.dense = keras.layers.Dense(num_labels) def call(self, inputs): return self.dense(inputs) return _SimpleMLP(num_labels) def simple_multi_inputs_multi_outputs_model(): input_a = keras.layers.Input(shape=(16,), name="input_a") input_b = keras.layers.Input(shape=(16,), name="input_b") merged = keras.layers.concatenate([input_a, input_b], name="merge") output_c = keras.layers.Dense(3, activation="softmax", name="dense_2")( merged ) output_d = keras.layers.Dense(2, activation="softmax", name="dense_3")( merged ) model = keras.models.Model( inputs=[input_a, input_b], outputs=[output_c, output_d] ) return model def get_multi_inputs_multi_outputs_data(): (a_train, c_train), (a_test, c_test) = test_utils.get_test_data( train_samples=_TRAIN_SIZE, test_samples=50, input_shape=(16,), num_classes=3, random_seed=_RANDOM_SEED, ) (b_train, d_train), (b_test, d_test) = test_utils.get_test_data( train_samples=_TRAIN_SIZE, test_samples=50, input_shape=(16,), num_classes=2, random_seed=_RANDOM_SEED, ) (m_train, _), (m_test, _) = test_utils.get_test_data( train_samples=_TRAIN_SIZE, test_samples=50, input_shape=(8,), num_classes=2, random_seed=_RANDOM_SEED, ) c_train = np_utils.to_categorical(c_train) c_test = np_utils.to_categorical(c_test) d_train = np_utils.to_categorical(d_train) d_test = np_utils.to_categorical(d_test) train_data = { "input_a": a_train, "input_b": b_train, "input_m": m_train, "output_c": c_train, "output_d": d_train, } test_data = { "input_a": a_test, "input_b": b_test, "input_m": m_test, "output_c": c_test, "output_d": d_test, } return (train_data, test_data) def batch_wrapper(dataset, batch_size, distribution, repeat=None): if repeat: dataset = dataset.repeat(repeat) # TPUs currently require fully defined input shapes, drop_remainder ensures # the input will have fully defined shapes. if backend.is_tpu_strategy(distribution): return dataset.batch(batch_size, drop_remainder=True) else: return dataset.batch(batch_size) def get_model(): x = keras.layers.Input(shape=(3,), name="input") y = keras.layers.Dense(4, name="dense")(x) model = keras.Model(x, y) return model def get_sample_weights_model(): x = keras.layers.Input(shape=(1,), name="input") y = keras.layers.Dense( 1, kernel_initializer="ones", bias_initializer="zeros", name="dense" )(x) model = keras.Model(x, y) return model def get_dataset(distribution): inputs = np.zeros((10, 3), dtype=np.float32) targets = np.zeros((10, 4), dtype=np.float32) dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) dataset = dataset.repeat(100) dataset = batch_wrapper(dataset, 10, distribution) return dataset def get_predict_dataset(distribution): inputs = np.zeros((10, 3), dtype=np.float32) dataset = tf.data.Dataset.from_tensor_slices(inputs) dataset = dataset.repeat(100) dataset = batch_wrapper(dataset, 10, distribution) return dataset def convert_numpy_to_dataset_with_unknown_cardinality(inputs, targets=None): if targets is not None: input_slices = (inputs, targets) dummy_op = lambda inp, target: True else: input_slices = inputs dummy_op = lambda inp: True original_dataset = tf.data.Dataset.from_tensor_slices(input_slices) ds_with_unknown_cardinality = original_dataset.filter(dummy_op).batch( 10, drop_remainder=True ) return ds_with_unknown_cardinality def multi_input_output_model(): a = keras.layers.Input(shape=(3,), name="input_a") b = keras.layers.Input(shape=(5,), name="input_b") # TODO(anjalisridhar): Change the output dimension of the second Dense layer # once the iterator output validation issue has been fixed. dense_1 = keras.layers.Dense(7, name="dense_1") dense_2 = keras.layers.Dense(7, name="dense_2") c = dense_1(a) d = dense_2(b) e = keras.layers.Dropout(0.5, name="dropout")(c) model = keras.models.Model([a, b], [d, e]) return model def strategy_minus_tpu_combinations(): return tf.__internal__.test.combinations.combine( distribution=strategies_minus_tpu, mode=["graph", "eager"] ) def tpu_strategy_combinations(): return tf.__internal__.test.combinations.combine( distribution=tpu_strategies, mode=["graph", "eager"] ) def tpu_strategy_combinations_graph_only(): return tf.__internal__.test.combinations.combine( distribution=tpu_strategies, mode=["graph"] ) def multi_worker_strategy_combinations_eager_only(): return tf.__internal__.test.combinations.combine( distribution=multi_worker_mirrored_strategies, mode=["eager"] ) def all_strategy_combinations(): return ( strategy_minus_tpu_combinations() + tpu_strategy_combinations() + multi_worker_strategy_combinations_eager_only() ) def all_strategy_minus_default_and_tpu_combinations(): return tf.__internal__.test.combinations.combine( distribution=[ tf.__internal__.distribute.combinations.one_device_strategy, tf.__internal__.distribute.combinations.one_device_strategy_gpu, tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus, # noqa: E501 ], mode=["graph", "eager"], ) def all_strategy_combinations_minus_default(): return ( all_strategy_minus_default_and_tpu_combinations() + tpu_strategy_combinations() + multi_worker_strategy_combinations_eager_only() ) def strategy_and_optimizer_combinations(): non_tpu_strategies = tf.__internal__.test.combinations.times( strategy_minus_tpu_combinations(), tf.__internal__.test.combinations.combine( optimizer=[ optimizer_combinations.adagrad_optimizer_v1_fn, optimizer_combinations.adam_optimizer_v1_fn, optimizer_combinations.gradient_descent_optimizer_v1_fn, optimizer_combinations.rmsprop_optimizer_v1_fn, optimizer_combinations.adadelta_optimizer_keras_v2_fn, optimizer_combinations.adagrad_optimizer_keras_v2_fn, optimizer_combinations.adam_optimizer_keras_v2_fn, optimizer_combinations.adamax_optimizer_keras_v2_fn, optimizer_combinations.gradient_descent_optimizer_keras_v2_fn, optimizer_combinations.nadam_optimizer_keras_v2_fn, optimizer_combinations.rmsprop_optimizer_keras_v2_fn, optimizer_combinations.ftrl_optimizer_keras_v2_fn, ] ), ) tpu_strategies_graph = tf.__internal__.test.combinations.combine( distribution=tpu_strategies, mode=["graph"], optimizer=[ optimizer_combinations.adagrad_optimizer_v1_fn, optimizer_combinations.adam_optimizer_v1_fn, optimizer_combinations.gradient_descent_optimizer_v1_fn, optimizer_combinations.rmsprop_optimizer_v1_fn, optimizer_combinations.adagrad_optimizer_keras_v2_fn, optimizer_combinations.adam_optimizer_keras_v2_fn, optimizer_combinations.gradient_descent_optimizer_keras_v2_fn, optimizer_combinations.rmsprop_optimizer_keras_v2_fn, ], ) tpu_strategies_eager = tf.__internal__.test.combinations.combine( distribution=tpu_strategies, mode=["eager"], optimizer=[ optimizer_combinations.adagrad_optimizer_keras_v2_fn, optimizer_combinations.adam_optimizer_keras_v2_fn, optimizer_combinations.gradient_descent_optimizer_keras_v2_fn, optimizer_combinations.rmsprop_optimizer_keras_v2_fn, ], ) multi_worker_eager = tf.__internal__.test.combinations.combine( distribution=multi_worker_mirrored_strategies, mode=["eager"], optimizer=[ optimizer_combinations.adadelta_optimizer_keras_v2_fn, optimizer_combinations.adagrad_optimizer_keras_v2_fn, optimizer_combinations.adam_optimizer_keras_v2_fn, optimizer_combinations.adamax_optimizer_keras_v2_fn, optimizer_combinations.gradient_descent_optimizer_keras_v2_fn, optimizer_combinations.nadam_optimizer_keras_v2_fn, optimizer_combinations.rmsprop_optimizer_keras_v2_fn, optimizer_combinations.ftrl_optimizer_keras_v2_fn, ], ) return ( non_tpu_strategies + tpu_strategies_eager + tpu_strategies_graph + multi_worker_eager ) class BatchCountingCB(keras.callbacks.Callback): def __init__(self): super().__init__() self.train_begin_batches = [] self.train_end_batches = [] self.test_begin_batches = [] self.test_end_batches = [] self.predict_begin_batches = [] self.predict_end_batches = [] def on_train_batch_begin(self, batch, logs=None): self.train_begin_batches.append(batch) def on_train_batch_end(self, batch, logs=None): self.train_end_batches.append(batch) def on_test_batch_begin(self, batch, logs=None): self.test_begin_batches.append(batch) def on_test_batch_end(self, batch, logs=None): self.test_end_batches.append(batch) def on_predict_batch_begin(self, batch, logs=None): self.predict_begin_batches.append(batch) def on_predict_batch_end(self, batch, logs=None): self.predict_end_batches.append(batch) class TestDistributionStrategyWithNumpyArrays( tf.test.TestCase, parameterized.TestCase ): @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def test_calculating_input_params_no_steps_no_batch_size( self, distribution ): # Calculate the per_replica_batch_size scaling factor for strategies # that use per_core_batch_size replica_scale_factor = 1.0 if not distributed_training_utils.global_batch_size_supported( distribution ): replica_scale_factor = distribution.num_replicas_in_sync with self.cached_session(): # Default global batch size 32 for input with 64 samples run in 2 # steps steps, batch_size = distributed_training_utils_v1.get_input_params( distribution, 64, steps=None, batch_size=None ) self.assertEqual(batch_size, 32 // replica_scale_factor) self.assertEqual(steps, 2) # Computed global batch size 20 is lower than 32 if we pass less # samples. steps, batch_size = distributed_training_utils_v1.get_input_params( distribution, 20, steps=None, batch_size=None ) self.assertEqual(batch_size, 20 // replica_scale_factor) self.assertEqual(steps, 1) @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def test_calculating_input_params_with_steps_no_batch_size( self, distribution ): # Calculate the per_replica_batch_size scaling factor for strategies # that use per_core_batch_size replica_scale_factor = 1.0 if not distributed_training_utils.global_batch_size_supported( distribution ): replica_scale_factor = distribution.num_replicas_in_sync with self.cached_session(): # Computed global batch size is correct for number of specified 1 # step steps, batch_size = distributed_training_utils_v1.get_input_params( distribution, 64, steps=1, batch_size=None ) self.assertEqual(batch_size, 64 // replica_scale_factor) self.assertEqual(steps, 1) # Computed global batch size is correct for number of specified 2 # steps steps, batch_size = distributed_training_utils_v1.get_input_params( distribution, 64, steps=2, batch_size=None ) self.assertEqual(batch_size, 32 // replica_scale_factor) self.assertEqual(steps, 2) # All samples can not be consumed in specified number of steps with self.assertRaisesRegex(ValueError, "not divisible by steps"): distributed_training_utils_v1.get_input_params( distribution, 63, steps=2, batch_size=None ) # This cases is different for different strategies due to the # difference in supported batch size being global or per-replica. if replica_scale_factor == 1: # Computed global batch size is correct even if not sharadable ( steps, batch_size, ) = distributed_training_utils_v1.get_input_params( distribution, 63, steps=3, batch_size=None ) self.assertEqual(batch_size, 21) self.assertEqual(steps, 3) else: # Computed global batch size can not be sharded across replicas with self.assertRaisesRegex( ValueError, "could not be sharded evenly across the sync replicas", ): distributed_training_utils_v1.get_input_params( distribution, 63, steps=1, batch_size=None ) @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def test_calculating_input_params_no_steps_with_batch_size( self, distribution ): # Calculate the per_replica_batch_size scaling factor for strategies # that use per_core_batch_size replica_scale_factor = 1.0 if not distributed_training_utils.global_batch_size_supported( distribution ): replica_scale_factor = distribution.num_replicas_in_sync with self.cached_session(): # Computed steps is correct for specified batch size steps, batch_size = distributed_training_utils_v1.get_input_params( distribution, 64, steps=None, batch_size=16 ) self.assertEqual(batch_size, 16) self.assertEqual(steps, 4 // replica_scale_factor) # Computed steps is correct for specified batch size steps, batch_size = distributed_training_utils_v1.get_input_params( distribution, 64, steps=None, batch_size=32 ) self.assertEqual(batch_size, 32) self.assertEqual(steps, 2 // replica_scale_factor) @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def test_calculating_input_params_with_steps_with_batch_size( self, distribution ): with self.cached_session(): # No change to steps and batch size if both specified and feasible steps, batch_size = distributed_training_utils_v1.get_input_params( distribution, 64, steps=5, batch_size=3 ) self.assertEqual(batch_size, 3) self.assertEqual(steps, 5) # Number of samples is less than global batch size * steps with self.assertRaisesRegex( ValueError, "less than samples required" ): distributed_training_utils_v1.get_input_params( distribution, 64, steps=10, batch_size=13 ) @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def test_calling_model_with_numpy_arrays(self, distribution): with self.cached_session(): with distribution.scope(): optimizer_fn = gradient_descent_keras.SGD optimizer = optimizer_fn(0.001) model = get_model() loss = "mse" metrics = ["mae"] model.compile(optimizer, loss, metrics=metrics) inputs = np.zeros((64, 3), dtype=np.float32) targets = np.zeros((64, 4), dtype=np.float32) # Call fit with validation data model.fit( inputs, targets, epochs=1, batch_size=2, verbose=0, validation_data=(inputs, targets), ) # TODO(anjalisridhar): We need tests for when the batch size and # steps are smaller and results in a 0 batch_size and steps # value. model.evaluate(inputs, targets) model.evaluate(inputs, targets, batch_size=8) model.predict(inputs) model.predict(inputs, batch_size=8) @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def test_calling_model_with_mixed_precision(self, distribution): if isinstance( distribution, ( tf.compat.v1.distribute.experimental.ParameterServerStrategy, tf.distribute.experimental.ParameterServerStrategy, tf.distribute.experimental.CentralStorageStrategy, tf.compat.v1.distribute.experimental.CentralStorageStrategy, ), ): self.skipTest("b/152097775") if backend.is_tpu_strategy(distribution): policy_name = "mixed_bfloat16" else: policy_name = "mixed_float16" with self.cached_session(), distribution.scope(), policy.policy_scope( policy_name ): optimizer_fn = gradient_descent_keras.SGD optimizer = optimizer_fn(0.001) x = keras.layers.Input(shape=(3,), name="input") y = keras.layers.Dense(4, name="dense")(x) y = keras.layers.Activation("softmax", dtype="float32")(y) model = keras.Model(x, y) loss = "mse" metrics = ["mae"] model.compile(optimizer, loss, metrics=metrics) # We need to pass float32 since TPUs do not support float64, even # though these arrays will immediately be casted to bfloat16 on # TPUs. We also cannot pass bfloat16, as Numpy does not support it. inputs = np.zeros((64, 3), dtype="float32") targets = np.zeros((64, 4), dtype="float32") model.fit( inputs, targets, epochs=1, batch_size=2, verbose=0, validation_data=(inputs, targets), ) model.evaluate(inputs, targets) model.evaluate(inputs, targets, batch_size=8) model.predict(inputs) model.predict(inputs, batch_size=8) @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def test_operator_overload_mixed_precision(self, distribution): # Regression test that tests a fixed bug does not reoccur. Adding an # AutoCastVariable to a tensor on a TPU, where the variable was the LHS # of the '+' operator, used to cause the gradient w.r.t. the variable to # be None. if isinstance( distribution, ( tf.compat.v1.distribute.experimental.ParameterServerStrategy, tf.distribute.experimental.ParameterServerStrategy, tf.distribute.experimental.CentralStorageStrategy, tf.compat.v1.distribute.experimental.CentralStorageStrategy, ), ): self.skipTest("b/152097775") if backend.is_tpu_strategy(distribution): policy_name = "mixed_bfloat16" else: policy_name = "mixed_float16" class MyLayer(keras.layers.Layer): def build(self, _): self.v1 = self.add_weight("v", ()) self.v2 = self.add_weight("v", ()) def call(self, inp): inp += self.v1 return self.v2 + inp with self.cached_session(), distribution.scope(): layer = MyLayer(dtype=policy_name) def run_fn(): x = np.array([1.0]) with tf.GradientTape() as tape: y = layer(x) grad_v1, grad_v2 = tape.gradient(y, [layer.v1, layer.v2]) return grad_v1, grad_v2 if tf.executing_eagerly(): run_fn = tf.function(run_fn) grad_v1, grad_v2 = distribution.run(run_fn) self.assertIsNotNone(grad_v1) self.assertIsNotNone(grad_v2) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=[ tf.__internal__.distribute.combinations.one_device_strategy ], mode=["graph", "eager"], ) ) def test_optimizer_in_cross_replica_context_raises_error( self, distribution ): with self.cached_session(), distribution.scope(): model = keras.models.Sequential([keras.layers.Dense(1)]) x = np.array([[1.0]]) with tf.GradientTape() as tape: y = model(x) gradients = tape.gradient(y, model.trainable_variables) optimizer = gradient_descent_keras.SGD() with self.assertRaisesRegex( RuntimeError, "cannot be called in cross-replica context" ): optimizer.apply_gradients( zip(gradients, model.trainable_variables) ) @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def test_calling_model_with_nested_numpy_arrays(self, distribution): with self.cached_session(): with distribution.scope(): optimizer_fn = gradient_descent_keras.SGD optimizer = optimizer_fn(learning_rate=0.001) model = multi_input_output_model() loss = "mse" model.compile(optimizer, loss) input_a_np = np.asarray(np.random.random((64, 3)), dtype=np.float32) input_b_np = np.asarray(np.random.random((64, 5)), dtype=np.float32) inputs = [input_a_np, input_b_np] output_d_np = np.asarray( np.random.random((64, 7)), dtype=np.float32 ) output_e_np = np.asarray( np.random.random((64, 7)), dtype=np.float32 ) targets = [output_d_np, output_e_np] # Call fit with validation data model.fit(inputs, targets, epochs=1, batch_size=8, verbose=0) # TODO(anjalisridhar): We need tests for when the batch size and # steps are smaller and results in a 0 batch_size and steps value. model.evaluate(inputs, targets) model.evaluate(inputs, targets, batch_size=8) model.predict(inputs) model.predict(inputs, batch_size=8) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=strategies_minus_tpu, mode=["graph", "eager"] ) + tf.__internal__.test.combinations.combine( distribution=multi_worker_mirrored_strategies, mode=["eager"] ) ) def test_numpy_with_sample_weights(self, distribution): with self.cached_session(), distribution.scope(): model = get_sample_weights_model() optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=0.001) loss = "mse" model.compile(optimizer, loss) inputs = np.array([[0], [1], [2], [3]], np.float32) targets = np.array([[2], [4], [6], [8]], np.float32) sample_weights = np.array([0.25, 0.5, 0.75, 1], np.float32) result = model.evaluate( inputs, targets, batch_size=2, sample_weight=sample_weights, verbose=1, ) # The per sample loss is multiplied by the corresponding sample # weight. The average of these weighted losses is the return value # of the `evaluate` call. For example, in the test above the average # weighted loss is calculated in the following manner: # batch_1 = (((2-0)^2) * 0.25 + ((4-1)^2) * 0.5) / 2 = 5.5 / 2 = # 2.75 # batch_2 = (((6-2)^2 * 0.75) + ((8-3)^2 * 1)) / 2 = 37 / 2 = 18.5 # final result = (batch_1 + batch_2) / 2 = 10.625. # The first time we divide by number of input samples and the second # time we divide by number of steps/batches that the loss is # aggregated over. self.assertAllClose(result, 10.625) # We now test without passing sample_weights: # batch_1 = ((2-0)^2) + ((4-1)^2) / 2 = 13 / 2 = 6.5 # batch_2 = ((6-2)^2) + ((8-3)^2) / 2 = 41 / 2 = 20.5 # final result = (batch_1 + batch_2) / 2 = 27 / 2 = 13.5 result = model.evaluate(inputs, targets, batch_size=2, verbose=1) self.assertAllClose(result, 13.5) @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def test_flatten_predict_outputs(self, distribution): with self.cached_session(): with distribution.scope(): model = multi_input_output_model() optimizer_fn = gradient_descent_keras.SGD optimizer = optimizer_fn(learning_rate=0.001) loss = "mse" model.compile(optimizer, loss) # We take 6 input samples with each input having a dimension of 3 or # 5. input_a_np = np.asarray(np.random.random((6, 3)), dtype=np.float32) input_b_np = np.asarray(np.random.random((6, 5)), dtype=np.float32) inputs = [input_a_np, input_b_np] outs = model.predict(inputs) # `predict` a list that is equal in length to the number of model # outputs. In this test our model has two outputs and each element # of `outs` corresponds to all the samples of one of the model # outputs. self.assertLen(outs, 2) # Each of the output samples have a dimension of 7. We should # process all the available input samples(6). self.assertAllEqual([6, 7], outs[0].shape) self.assertAllEqual([6, 7], outs[1].shape) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.times( tpu_strategy_combinations_graph_only(), tf.__internal__.test.combinations.combine(batch_size=[4, 6]), ) ) def test_evaluate_with_partial_batch(self, distribution, batch_size): with self.cached_session(): optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) loss = "mse" metrics = ["mae", keras.metrics.CategoricalAccuracy()] with distribution.scope(): model_with_ds_strategy = get_model() model_with_ds_strategy.compile(optimizer, loss, metrics=metrics) cpu_model = get_model() cpu_model.compile(optimizer, loss, metrics=metrics) x = np.random.random((10, 3)).astype("float32") y = np.random.random((10, 4)).astype("float32") # As sample size is 10, we batch by 4 so that the last batch is a # partial batch. Also `evaluate()` using numpy array as inputs # without distribution strategy uses entire sample as a single # batch. As so, we remove parameters `batch_size` and `steps`. cpu_model.set_weights(model_with_ds_strategy.get_weights()) evaluate_ground_truth = cpu_model.evaluate(x, y) # We don't compare the loss as loss is currently not computed as # metric in Keras, the loss value is inaccurate for last partial # batch due to more weights for the last batch samples. steps = np.ceil(10.0 / batch_size) self.assertAllClose( model_with_ds_strategy.evaluate( x, y, batch_size=batch_size, steps=steps )[1:], evaluate_ground_truth[1:], atol=1e-5, rtol=1e-5, ) # Test that `steps` is inferred correctly when final partial batch # exists. self.assertAllClose( model_with_ds_strategy.evaluate(x, y, batch_size=batch_size)[ 1: ], evaluate_ground_truth[1:], atol=1e-5, rtol=1e-5, ) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.times( tpu_strategy_combinations_graph_only() ) ) def test_predict_with_partial_batch(self, distribution): with self.cached_session(): optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) loss = "mse" with distribution.scope(): model_with_ds_strategy = get_model() model_with_ds_strategy.compile(optimizer, loss) cpu_model = get_model() cpu_model.compile(optimizer, loss) inputs = np.random.random((10, 3)).astype(np.float32) # As sample size is 10, we batch by 4 so that the last batch is # a partial batch. Also `predict()` using numpy array as inputs # without distribution strategy uses entire sample as a single # batch. As so, we remove parameters `batch_size` and `steps`. cpu_model.set_weights(model_with_ds_strategy.get_weights()) predict_ground_truth = cpu_model.predict(inputs) self.assertAllClose( model_with_ds_strategy.predict(inputs, batch_size=4, steps=3), predict_ground_truth, atol=1e-5, rtol=1e-5, ) # Test that `steps` is inferred correctly when final partial batch # exists. self.assertAllClose( model_with_ds_strategy.predict(inputs, batch_size=4), predict_ground_truth, atol=1e-5, rtol=1e-5, ) @tf.__internal__.distribute.combinations.generate( tpu_strategy_combinations_graph_only() ) def test_no_target_model(self, distribution): with self.cached_session(): optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) class MyLayer(keras.layers.Layer): def call(self, inputs, training=None): self.add_loss(tf.reduce_sum(inputs), inputs=True) return inputs with distribution.scope(): model = keras.models.Sequential() model.add( keras.layers.Dense( 16, activation="relu", input_shape=_INPUT_SIZE ) ) model.add(MyLayer()) model.add(keras.layers.Dense(_NUM_CLASS, activation="softmax")) model.compile(optimizer) inputs = np.zeros((20, 10), np.float32) model.fit(inputs, epochs=1, steps_per_epoch=2) model.predict(inputs, steps=1) model.evaluate(inputs, steps=1) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.times( tpu_strategy_combinations_graph_only() ) ) def test_predict_multi_output_model_with_partial_batch(self, distribution): with self.cached_session(): optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) loss = "mse" with distribution.scope(): model_with_ds_strategy = ( simple_multi_inputs_multi_outputs_model() ) model_with_ds_strategy.compile(optimizer, loss) cpu_model = simple_multi_inputs_multi_outputs_model() cpu_model.compile(optimizer, loss) input_data, _ = get_multi_inputs_multi_outputs_data() input_dict = { "input_a": input_data["input_a"], "input_b": input_data["input_b"], } # As sample size is 200, we batch by 18 so that the last batch is # a partial batch. Also `fit()` using numpy array as inputs without # distribution strategy uses entire sample as a single batch. As so, # we remove parameters `batch_size` and `steps`. cpu_model.set_weights(model_with_ds_strategy.get_weights()) self.assertAllClose( model_with_ds_strategy.predict( input_dict, batch_size=18, steps=12 ), cpu_model.predict(input_dict), atol=1e-4, rtol=1e-4, ) @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def test_gradients_are_none(self, distribution): if not tf.executing_eagerly(): self.skipTest("None gradients are not supported in graph mode") class DenseWithExtraWeight(keras.layers.Dense): def build(self, input_shape): # Gradients w.r.t. extra_weights are None self.extra_weight_1 = self.add_weight( "extra_weight_1", shape=(), initializer="ones" ) super().build(input_shape) self.extra_weight_2 = self.add_weight( "extra_weight_2", shape=(), initializer="ones" ) with distribution.scope(): model = keras.Sequential( [DenseWithExtraWeight(4, input_shape=(4,))] ) model.compile("adam", "mse") inputs = np.random.normal(size=(64, 4)) targets = np.random.normal(size=(64, 4)) old_kernel = model.get_weights()[1] model.fit(inputs, targets) new_kernel = model.get_weights()[1] self.assertNotAllEqual(old_kernel, new_kernel) class TestDistributionStrategyWithDatasets( tf.test.TestCase, parameterized.TestCase ): @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def test_calling_model_on_same_dataset(self, distribution): with self.cached_session(): with distribution.scope(): optimizer_fn = gradient_descent_keras.SGD optimizer = optimizer_fn(0.001) model = get_model() loss = "mse" metrics = ["mae", keras.metrics.CategoricalAccuracy()] model.compile(optimizer, loss, metrics=metrics) dataset = get_dataset(distribution) # Call fit with validation data model.fit( dataset, epochs=1, steps_per_epoch=2, verbose=0, validation_data=dataset, validation_steps=2, ) model.fit( dataset, epochs=1, steps_per_epoch=2, verbose=0, validation_data=dataset, validation_steps=2, ) model.predict(get_predict_dataset(distribution), steps=2) @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def test_model_interleaved_eval_same_as_direct_eval(self, distribution): with self.cached_session(): with distribution.scope(): optimizer_fn = gradient_descent_keras.SGD user_controlled_model = get_model() user_controlled_model.compile( optimizer_fn(0.001), loss="mse", metrics=["mae", keras.metrics.CategoricalAccuracy()], ) interleaved_model = get_model() interleaved_model.set_weights( user_controlled_model.get_weights() ) interleaved_model.compile( optimizer_fn(0.001), loss="mse", metrics=["mae", keras.metrics.CategoricalAccuracy()], ) dataset = get_dataset(distribution) # Call fit with validation interleaved interleaved_output = interleaved_model.fit( dataset, epochs=2, steps_per_epoch=2, verbose=1, validation_data=dataset, validation_steps=2, shuffle=False, ) # Manually control the validation running after each epoch. user_controlled_output = [] for _ in range(2): user_controlled_model.fit( dataset, epochs=1, steps_per_epoch=2, verbose=1, shuffle=False, ) user_controlled_output.append( user_controlled_model.evaluate(dataset, steps=2) ) self.assertEqual( interleaved_output.history["val_loss"], [x[0] for x in user_controlled_output], ) val_mean_absolute_error = interleaved_output.history.get( "val_mean_absolute_error" ) if not val_mean_absolute_error: # The name of the metric changed in TF2.0 val_mean_absolute_error = interleaved_output.history["val_mae"] self.assertEqual( val_mean_absolute_error, [x[1] for x in user_controlled_output] ) self.assertEqual( interleaved_output.history["val_categorical_accuracy"], [x[2] for x in user_controlled_output], ) @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def test_fit_with_tuple_and_dict_dataset_inputs(self, distribution): with self.cached_session(): with distribution.scope(): optimizer_fn = gradient_descent_keras.SGD optimizer = optimizer_fn(learning_rate=0.001) model = multi_input_output_model() loss = "mse" metrics = ["mae", keras.metrics.CategoricalAccuracy()] model.compile(optimizer, loss, metrics=metrics) input_a_np = np.random.random((10, 3)).astype("float32") input_b_np = np.random.random((10, 5)).astype("float32") output_d_np = np.random.random((10, 7)).astype("float32") output_e_np = np.random.random((10, 7)).astype("float32") # Test with tuples dataset_tuple = tf.data.Dataset.from_tensor_slices( ((input_a_np, input_b_np), (output_d_np, output_e_np)) ) dataset_tuple = dataset_tuple.repeat(100) dataset_tuple = dataset_tuple.batch(10) model.fit(dataset_tuple, epochs=1, steps_per_epoch=2, verbose=1) # Test with dict dataset_dict = tf.data.Dataset.from_tensor_slices( ( {"input_a": input_a_np, "input_b": input_b_np}, (output_d_np, output_e_np), ) ) dataset_dict = dataset_dict.repeat(100) dataset_dict = dataset_dict.batch(10) model.fit(dataset_dict, epochs=1, steps_per_epoch=2, verbose=1) @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def test_fit_with_dictionary_in_the_dataset_b135161171(self, distribution): if backend.is_tpu_strategy(distribution): self.skipTest("b/142805125") def custom_loss(predict, label, weight): bce = keras.losses.binary_crossentropy(label, predict) return tf.reduce_mean(bce * weight) with self.cached_session(): with distribution.scope(): input_img = keras.layers.Input([64, 64, 3], name="img") input_lbl = keras.layers.Input([64, 64, 1], name="lbl") input_weight = keras.layers.Input([64, 64], name="weight") predict = keras.layers.Conv2D(2, [1, 1], padding="same")( input_img ) loss_lambda = keras.layers.Lambda( lambda x: custom_loss(*x), name="my_loss" ) my_loss = loss_lambda([predict, input_lbl, input_weight]) model = keras.models.Model( inputs=[input_img, input_lbl, input_weight], outputs=[predict, my_loss], ) model.add_loss(model.get_layer("my_loss").output) model.compile(optimizer="adam") if tf.executing_eagerly(): def map_fn(img, lbl, weight): inputs = {"img": img, "lbl": lbl, "weight": weight} return (inputs,) else: def map_fn(img, lbl, weight): inputs = {"img": img, "lbl": lbl, "weight": weight} return inputs, {} fake_imgs = np.ones([50, 64, 64, 3], dtype=np.float32) fake_lbls = np.ones([50, 64, 64, 1], dtype=np.float32) fake_weights = np.ones([50, 64, 64], dtype=np.float32) data = ( tf.data.Dataset.from_tensor_slices( (fake_imgs, fake_lbls, fake_weights) ) .map(map_fn) .batch(10) ) model.fit(data) @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def test_fit_eval_and_predict_methods_on_dataset_without_steps( self, distribution ): with self.cached_session(): with distribution.scope(): optimizer_fn = gradient_descent_keras.SGD optimizer = optimizer_fn(0.001) model = get_model() loss = "mse" metrics = ["mae", keras.metrics.CategoricalAccuracy()] model.compile(optimizer, loss, metrics=metrics) inputs = np.zeros((1000, 3), dtype=np.float32) targets = np.zeros((1000, 4), dtype=np.float32) # steps/steps_per_epoch are calculated when using numpy arrays as # input data. fit_with_numpy = model.fit( inputs, targets, epochs=1, batch_size=10 ).history eval_with_numpy = model.evaluate(inputs, targets, batch_size=10) predict_with_numpy = model.predict(inputs, batch_size=10) dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) dataset = dataset.batch(10, drop_remainder=True) fit_with_ds = model.fit(dataset, epochs=1).history eval_with_ds = model.evaluate(dataset) predict_dataset = tf.data.Dataset.from_tensor_slices(inputs) predict_dataset = predict_dataset.batch(10, drop_remainder=True) predict_with_ds = model.predict(predict_dataset) self.assertAllClose( fit_with_numpy, fit_with_ds, atol=1e-4, rtol=1e-4 ) self.assertAllClose( eval_with_numpy, eval_with_ds, atol=1e-4, rtol=1e-4 ) self.assertAllClose( predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4 ) @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def test_predict_on_dataset_with_unknown_cardinality_without_steps( self, distribution, mode ): if mode == "graph" and backend.is_tpu_strategy(distribution): self.skipTest("partial batch not supported with TPU in graph mode.") with self.cached_session(): with distribution.scope(): optimizer_fn = gradient_descent_keras.SGD optimizer = optimizer_fn(0.001) model = get_model() loss = "mse" metrics = ["mae", keras.metrics.CategoricalAccuracy()] model.compile(optimizer, loss, metrics=metrics) inputs = np.zeros((20, 3), dtype=np.float32) # steps/steps_per_epoch are calculated when using numpy arrays as # input data. predict_with_numpy = model.predict(inputs, batch_size=10) predict_dataset = convert_numpy_to_dataset_with_unknown_cardinality( inputs ) self.assertEqual( keras.backend.get_value( tf.data.experimental.cardinality(predict_dataset) ), tf.data.experimental.UNKNOWN_CARDINALITY, ) predict_with_ds = model.predict(predict_dataset) self.assertAllClose( predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4 ) @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def test_on_dataset_with_unknown_cardinality_without_steps( self, distribution, mode ): # TODO(b/155867206): Investigate why this test occasionally segfaults on # TPU in eager mode. if mode == "eager" and backend.is_tpu_strategy(distribution): self.skipTest("caused segfault with TPU in eager mode.") if mode == "graph" and backend.is_tpu_strategy(distribution): self.skipTest("partial batch not supported with TPU in graph mode.") with self.cached_session(): with distribution.scope(): optimizer_fn = gradient_descent_keras.SGD optimizer = optimizer_fn(0.001) model = get_model() loss = "mse" metrics = ["mae", keras.metrics.CategoricalAccuracy()] model.compile(optimizer, loss, metrics=metrics) inputs = np.zeros((100, 3), dtype=np.float32) targets = np.zeros((100, 4), dtype=np.float32) # steps/steps_per_epoch are calculated when using numpy arrays as # input data. fit_with_numpy = model.fit( inputs, targets, epochs=1, batch_size=10 ).history fit_with_numpy_multiple_epochs = model.fit( inputs, targets, epochs=2, batch_size=10 ).history eval_with_numpy = model.evaluate(inputs, targets, batch_size=10) predict_with_numpy = model.predict(inputs, batch_size=10) dataset = convert_numpy_to_dataset_with_unknown_cardinality( inputs, targets ) predict_dataset = convert_numpy_to_dataset_with_unknown_cardinality( inputs ) self.assertEqual( keras.backend.get_value( tf.data.experimental.cardinality(dataset) ), tf.data.experimental.UNKNOWN_CARDINALITY, ) self.assertEqual( keras.backend.get_value( tf.data.experimental.cardinality(predict_dataset) ), tf.data.experimental.UNKNOWN_CARDINALITY, ) eval_with_ds = model.evaluate(dataset) predict_with_ds = model.predict(predict_dataset) self.assertAllClose( eval_with_numpy, eval_with_ds, atol=1e-4, rtol=1e-4 ) self.assertAllClose( predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4 ) fit_with_ds = model.fit(dataset, epochs=1).history fit_with_ds_multiple_epochs = model.fit(dataset, epochs=2).history self.assertAllClose( fit_with_numpy, fit_with_ds, atol=1e-4, rtol=1e-4 ) self.assertAllClose( fit_with_numpy_multiple_epochs, fit_with_ds_multiple_epochs, atol=1e-4, rtol=1e-4, ) @tf.__internal__.distribute.combinations.generate( tpu_strategy_combinations_graph_only() ) def test_on_dataset_with_unknown_cardinality(self, distribution): with self.cached_session(): with distribution.scope(): model = get_model() loss = "mse" metrics = ["mae", keras.metrics.CategoricalAccuracy()] model.compile( tf.compat.v1.train.GradientDescentOptimizer(0.001), loss, metrics=metrics, ) inputs = np.zeros((1000, 3), dtype=np.float32) targets = np.zeros((1000, 4), dtype=np.float32) # steps/steps_per_epoch are calculated when using numpy arrays as # input data. eval_with_numpy = model.evaluate(inputs, targets, batch_size=10) predict_with_numpy = model.predict(inputs, batch_size=10) dataset = convert_numpy_to_dataset_with_unknown_cardinality( inputs, targets ) predict_dataset = convert_numpy_to_dataset_with_unknown_cardinality( inputs ) self.assertEqual( keras.backend.get_value( tf.data.experimental.cardinality(dataset) ), tf.data.experimental.UNKNOWN_CARDINALITY, ) self.assertEqual( keras.backend.get_value( tf.data.experimental.cardinality(predict_dataset) ), tf.data.experimental.UNKNOWN_CARDINALITY, ) eval_with_ds = model.evaluate(dataset, steps=100) predict_with_ds = model.predict(predict_dataset, steps=100) self.assertAllClose( eval_with_numpy, eval_with_ds, atol=1e-4, rtol=1e-4 ) self.assertAllClose( predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4 ) with self.assertRaisesRegex( ValueError, "Number of steps could not be inferred" ): model.fit(dataset, epochs=1) @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def test_fit_eval_and_predict_methods_on_dataset(self, distribution): with self.cached_session(): with distribution.scope(): optimizer_fn = gradient_descent_keras.SGD optimizer = optimizer_fn(0.001) model = get_model() loss = "mse" metrics = ["mae", keras.metrics.CategoricalAccuracy()] model.compile(optimizer, loss, metrics=metrics) dataset = get_dataset(distribution) model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) model.evaluate(dataset, steps=2, verbose=1) model.predict(get_predict_dataset(distribution), steps=2) @tf.__internal__.distribute.combinations.generate( strategy_and_optimizer_combinations() ) def test_fit_eval_and_predict_with_optimizer(self, distribution, optimizer): with self.cached_session(): with distribution.scope(): model = get_model() loss = "mse" model.compile(optimizer(), loss) dataset = get_dataset(distribution) model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) model.evaluate(dataset, steps=2, verbose=1) model.predict(get_predict_dataset(distribution), steps=2) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=[ tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 tf.__internal__.distribute.combinations.one_device_strategy, ], mode=["graph", "eager"], ) ) def test_dataset_wrong_input_shape(self, distribution, mode): if mode == "graph": self.skipTest( "TODO(b/120943676, b/120957836): Re-enable for graph once the " "validation code is restored." ) with self.cached_session(): with distribution.scope(): optimizer_fn = gradient_descent_keras.SGD optimizer = optimizer_fn(learning_rate=0.001) model = get_model() loss = "mse" model.compile(optimizer, loss) # Wrong input shape inputs = np.zeros((10, 5), dtype=np.float32) targets = np.zeros((10, 4), dtype=np.float32) dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) dataset = dataset.repeat(100) dataset = dataset.batch(10) with self.assertRaisesRegex(ValueError, "is incompatible with"): model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=[ tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu # noqa: E501 ], mode=["graph", "eager"], ) ) def test_dataset_external_batch_input_validation(self, distribution): with self.cached_session(): with distribution.scope(): optimizer_fn = gradient_descent_keras.SGD optimizer = optimizer_fn(learning_rate=0.001) model = get_model() loss = "mse" model.compile(optimizer, loss) # Batching is done outside tf.data's `batch` inputs = np.zeros((100, 10, 3), dtype=np.float32) targets = np.zeros((100, 10, 4), dtype=np.float32) dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) dataset = dataset.repeat(100) model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=[ tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus, # noqa: E501 ], mode=["graph", "eager"], ) ) def test_learning_phase_value(self, distribution): # TODO(anjalisridhar): Modify this test to use Lambdas since we can # compare meaningful values. Currently we don't pass the learning phase # if the Lambda layer uses the learning phase. with self.cached_session(): with distribution.scope(): x = keras.layers.Input(shape=(1,), name="input") y = keras.layers.Dense(1, kernel_initializer="ones")(x) z = keras.layers.Dropout(0.9999)(y) model = keras.Model(x, z) initial_weights = model.get_weights() optimizer_fn = gradient_descent_keras.SGD optimizer = optimizer_fn(0.005) loss = "mse" metrics = ["acc"] model.compile(optimizer, loss, metrics=metrics) batch_size = 8 if isinstance( distribution, ( tf.distribute.MirroredStrategy, tf.compat.v1.distribute.MirroredStrategy, ), ): # MirroredStrategy uses global batch size. batch_size = 8 * distribution.num_replicas_in_sync inputs = np.ones((10, 1), dtype=np.float32) targets = np.ones((10, 1), dtype=np.float32) dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) dataset = dataset.repeat().batch(batch_size) hist = model.fit(dataset, epochs=1, steps_per_epoch=20, verbose=1) self.assertAlmostEqual(hist.history["acc"][0], 0, 0) with distribution.scope(): model.set_weights(initial_weights) # TODO(psv/anjalisridhar): Enable these lines after we fix # b/117431185. evaluate_output = model.evaluate(dataset, steps=20) # self.assertAlmostEqual(evaluate_output[1], 1, 0) inputs = np.ones((10, 1), dtype=np.float32) predict_dataset = tf.data.Dataset.from_tensor_slices(inputs) predict_dataset = predict_dataset.repeat().batch(batch_size) output = model.predict(predict_dataset, steps=10) # `predict` runs for 10 steps ref_output = np.ones((160, 1), dtype=np.float32) self.assertArrayNear(output, ref_output, 1e-1) @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def testOptimizerWithCallbacks(self, distribution): with self.cached_session(): with distribution.scope(): model = get_model() optimizer = gradient_descent_keras.SGD(0.01) loss = "mse" model.compile(optimizer, loss) dataset = get_dataset(distribution) def schedule(_): return 0.001 model.fit( dataset, epochs=1, steps_per_epoch=2, verbose=0, callbacks=[keras.callbacks.LearningRateScheduler(schedule)], ) self.assertAllClose( 0.001, keras.backend.get_value(model.optimizer.lr) ) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.times( tpu_strategy_combinations_graph_only(), tf.__internal__.test.combinations.combine(batch_size=[4, 6]), ) ) def test_evaluate_with_dataset_with_partial_batch( self, distribution, batch_size ): with self.cached_session(): optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) loss = "mse" metrics = ["mae", keras.metrics.CategoricalAccuracy()] with distribution.scope(): model_with_ds_strategy = get_model() model_with_ds_strategy.compile(optimizer, loss, metrics=metrics) cpu_model = get_model() cpu_model.compile(optimizer, loss, metrics=metrics) x = np.random.random((10, 3)).astype("float32") y = np.random.random((10, 4)).astype("float32") dataset = tf.data.Dataset.from_tensor_slices((x, y)) # As sample size is 10, we make the last batch a partial batch. cpu_model.set_weights(model_with_ds_strategy.get_weights()) dataset_with_partial_batch = dataset.batch(batch_size) # We don't compare the loss as loss is currently not computed as # metric in Keras, the loss value is inaccurate for last partial # batch due to more weights for the last batch samples. steps = np.ceil(10.0 / batch_size) self.assertAllClose( model_with_ds_strategy.evaluate( dataset_with_partial_batch, steps=steps )[1:], cpu_model.evaluate(dataset_with_partial_batch, steps=steps)[1:], atol=1e-5, rtol=1e-5, ) self.assertAllClose( model_with_ds_strategy.evaluate(dataset_with_partial_batch)[1:], cpu_model.evaluate(dataset_with_partial_batch)[1:], atol=1e-5, rtol=1e-5, ) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.times( tpu_strategy_combinations_graph_only() ) ) def test_predict_with_dataset_with_partial_batch(self, distribution): with self.cached_session(): optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) loss = "mse" with distribution.scope(): model_with_ds_strategy = get_model() model_with_ds_strategy.compile(optimizer, loss) cpu_model = get_model() cpu_model.compile(optimizer, loss) inputs = np.random.random((10, 3)).astype(np.float32) dataset = tf.data.Dataset.from_tensor_slices((inputs)) # As sample size is 10, we batch by 4 so that the last batch is # a partial batch. dataset_with_partial_batch = dataset.batch(4) cpu_model.set_weights(model_with_ds_strategy.get_weights()) self.assertAllClose( model_with_ds_strategy.predict( dataset_with_partial_batch, steps=3 ), cpu_model.predict(dataset_with_partial_batch, steps=3), atol=1e-5, rtol=1e-5, ) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.times( tpu_strategy_combinations_graph_only() ) ) def test_predict_multi_output_model_with_dataset_with_partial_batch( self, distribution ): with self.cached_session(): optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) loss = "mse" with distribution.scope(): model_with_ds_strategy = ( simple_multi_inputs_multi_outputs_model() ) model_with_ds_strategy.compile(optimizer, loss) cpu_model = simple_multi_inputs_multi_outputs_model() cpu_model.compile(optimizer, loss) input_data, _ = get_multi_inputs_multi_outputs_data() input_dict = { "input_a": input_data["input_a"], "input_b": input_data["input_b"], } dataset = tf.data.Dataset.from_tensor_slices(input_dict) # As sample size is 200, we batch by 18 using 12 steps per epoch so # that the last batch is a partial batch. dataset_with_partial_batch = dataset.batch(18) cpu_model.set_weights(model_with_ds_strategy.get_weights()) self.assertAllClose( model_with_ds_strategy.predict( dataset_with_partial_batch, steps=12 ), cpu_model.predict(dataset_with_partial_batch, steps=12), atol=1e-4, rtol=1e-4, ) @tf.__internal__.distribute.combinations.generate( all_strategy_combinations_minus_default() ) def test_match_model_input_matches_with_dataset_tensors(self, distribution): def _create_model_input_output_tensors(): input_a = keras.layers.Input( shape=(16,), name="z_input_sorted_last" ) input_b = keras.layers.Input( shape=(32,), name="a_input_sorted_first" ) intermediate_a = keras.layers.Dense(10)(input_a) intermediate_b = keras.layers.Dense(10)(input_b) merged = keras.layers.Add()([intermediate_a, intermediate_b]) output = keras.layers.Dense(2)(merged) return input_a, input_b, output input_dict = { "z_input_sorted_last": np.random.rand(32, 16).astype(np.float32), "a_input_sorted_first": np.random.rand(32, 32).astype(np.float32), } target = np.ones((32, 2), dtype=np.float32) dataset = tf.data.Dataset.from_tensor_slices((input_dict, target)) dataset = dataset.batch(4, drop_remainder=True) with self.cached_session(): with distribution.scope(): input_a, input_b, output = _create_model_input_output_tensors() # `input_a`, which has input name that comes last in # alphanumeric order, is the first input of the model input # layers. If tensors from `input_dict` is blindly flattened and # passed to model inputs incorrectly, this would result in # `input_a` input layer matching with tensor # `a_input_sorted_first` and would result in shape mismatch. model_with_array_input = keras.models.Model( inputs=[input_a, input_b], outputs=output ) model_with_array_input.compile("sgd", "mse") model_weights = model_with_array_input.get_weights() model_with_array_input_fit = model_with_array_input.fit( dataset, steps_per_epoch=1, epochs=1 ).history input_a, input_b, output = _create_model_input_output_tensors() model_with_dict_input = keras.models.Model( inputs={ "z_input_sorted_last": input_a, "a_input_sorted_first": input_b, }, outputs=output, ) model_with_dict_input.compile("sgd", "mse") model_with_dict_input.set_weights(model_weights) model_with_dict_input_fit = model_with_dict_input.fit( dataset, steps_per_epoch=1, epochs=1 ).history self.assertAllClose( model_with_dict_input_fit, model_with_array_input_fit, atol=1e-4, rtol=1e-4, ) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=strategies_minus_tpu, mode=["graph", "eager"] ) + tf.__internal__.test.combinations.combine( distribution=multi_worker_mirrored_strategies, mode=["eager"] ) ) def test_dataset_with_sample_weights(self, distribution): with self.cached_session(), distribution.scope(): model = get_sample_weights_model() optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=0.001) loss = "mse" model.compile(optimizer, loss) inputs = np.array([[0], [1], [2], [3]], np.float32) targets = np.array([[2], [4], [6], [8]], np.float32) sample_weights = np.array([0.25, 0.5, 0.75, 1], np.float32) ds = tf.data.Dataset.from_tensor_slices( (inputs, targets, sample_weights) ).batch(2) result = model.evaluate(ds, verbose=1) # The per sample loss is multiplied by the corresponding sample # weight. The average of these weighted losses is the return value # of the `evaluate` call. For example, in the test above the average # weighted loss is calculated in the following manner: # batch_1 = (((2-0)^2) * 0.25 + ((4-1)^2) * 0.5) / 2 = 5.5 / 2 = # 2.75 # batch_2 = (((6-2)^2 * 0.75) + ((8-3)^2 * 1)) / 2 = 37 / 2 = 18.5 # final result = (batch_1 + batch_2) / 2 = 10.625. # The first time we divide by number of input samples and the second # time we divide by number of steps/batches that the loss is # aggregated over. self.assertAllClose(result, 10.625) # We now test without passing sample_weights: # batch_1 = ((2-0)^2) + ((4-1)^2) / 2 = 13 / 2 = 6.5 # batch_2 = ((6-2)^2) + ((8-3)^2) / 2 = 41 / 2 = 20.5 # final result = (batch_1 + batch_2) / 2 = 27 / 2 = 13.5 ds = tf.data.Dataset.from_tensor_slices((inputs, targets)).batch(2) result = model.evaluate(ds, verbose=1) self.assertAllClose(result, 13.5) class TestDistributionStrategyWithDatasetsFile( tf.test.TestCase, parameterized.TestCase ): def setUp(self): super().setUp() self.input_file_name = os.path.join( self.get_temp_dir(), "input.tfrecord" ) inputs = np.zeros((20, 3), dtype=np.float32) input_dataset = tf.data.Dataset.from_tensor_slices(inputs) input_dataset = input_dataset.map(tf.io.serialize_tensor) writer = tf.data.experimental.TFRecordWriter(self.input_file_name) writer.write(input_dataset) # TODO(wxinyi): add a multi-worker test for TPU @tf.__internal__.distribute.combinations.generate( multi_worker_strategy_combinations_eager_only() ) def test_predict_on_dataset_shard_options_file_multi_worker_mirrored( self, distribution, mode ): # This test is to verify if we successfully switch auto_shard_policy of # a input dataset inside model.predict with MultiWorkerMirroredStrategy # to AutoShardPolicy.DATA. Since there is only one input file for # multiple workers, AutoShardPolicy.AUTO or AutoShardPolicy.FILE will # lead to an error. However, since we switch to AutoShardPolicy.DATA in # model.predict, no error is raised. del mode with distribution.scope(): optimizer_fn = gradient_descent_keras.SGD optimizer = optimizer_fn(0.001) model = get_model() loss = "mse" model.compile(optimizer, loss) dataset = tf.data.TFRecordDataset(self.input_file_name) dataset = dataset.map(lambda x: tf.io.parse_tensor(x, tf.float32)) dummy_op = lambda inp: True dataset = dataset.filter(dummy_op).batch(8, drop_remainder=True) options = tf.data.Options() options.experimental_distribute.auto_shard_policy = ( tf.data.experimental.AutoShardPolicy.FILE ) dataset = dataset.with_options(options) model.predict(dataset, steps=1) class TestRegularizerLoss(tf.test.TestCase, parameterized.TestCase): class IdentityRegularizer(keras.regularizers.Regularizer): def __call__(self, x): return tf.identity(x) class AddLayer(keras.layers.Layer): def build(self, _): self.v = self.add_weight( "v", (), initializer="ones", regularizer=TestRegularizerLoss.IdentityRegularizer(), ) def call(self, inputs): return inputs + self.v @staticmethod def loss_fn(_, y_pred): return tf.reduce_mean(y_pred) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.times( all_strategy_combinations_minus_default() ) ) def test_regularizer_loss(self, distribution): batch_size = 2 if not distributed_training_utils.global_batch_size_supported( distribution ): batch_size //= distribution.num_replicas_in_sync # Given an input x, which is always 1, and variable v, this model # computes Loss=x+v+regularizer_loss, where regularizer_loss=v and # the variable is initialized to 1. Therefore, this model computes # Loss=1+2v, and so the gradient dLoss/dv = 2. This gradient of 2 is # averaged over all examples in a batch and then multiplied by the # learning rate of 1. As a result, the model update for one batch # should subtract 2 from v, resulting in v being -1. If the # regularizer loss is not scaled correctly by number of replicas, # the variable value will be incorrect when number of replicas >1. # For e.g. it will be -2 if num replicas = 2. with distribution.scope(): x = keras.layers.Input(shape=(1,), batch_size=batch_size) y = TestRegularizerLoss.AddLayer()(x) model = keras.models.Model(inputs=x, outputs=y) opt = gradient_descent_keras.SGD(1.0) model.compile(opt, loss=TestRegularizerLoss.loss_fn) model.fit( x=np.array([[1.0], [1.0]], dtype=np.float32), y=np.array([[1.0], [1.0]], dtype=np.float32), batch_size=batch_size, ) v = model.get_weights()[0] self.assertEqual(-1.0, v) @test_utils.run_all_without_tensor_float_32( "Uses Dense layers, which call matmul" ) class TestDistributionStrategyWithKerasModels( tf.test.TestCase, parameterized.TestCase ): @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def test_distribution_strategy_on_sequential_model(self, distribution): with distribution.scope(): optimizer_fn = gradient_descent_keras.SGD optimizer = optimizer_fn(learning_rate=0.001) model = simple_sequential_model() loss = "mse" model.compile(optimizer, loss) inputs = np.zeros((20, 10), np.float32) targets = np.zeros((20, 2), np.float32) model.fit(inputs, targets, epochs=1, batch_size=10) model.predict(inputs, batch_size=10) model.evaluate(inputs, targets, batch_size=10) @tf.__internal__.distribute.combinations.generate( all_strategy_combinations() ) def test_distribution_strategy_on_functional_model(self, distribution): with distribution.scope(): optimizer_fn = gradient_descent_keras.SGD optimizer = optimizer_fn(learning_rate=0.001) model = get_model() loss = "mse" model.compile(optimizer, loss) inputs = np.zeros((64, 3), dtype=np.float32) targets = np.zeros((64, 4), dtype=np.float32) model.fit(inputs, targets, epochs=1) model.predict(inputs) model.evaluate(inputs, targets) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=all_strategies, mode=["eager"] ) ) def test_distributed_dataset(self, distribution): with distribution.scope(): class CBCounter(keras.callbacks.Callback): def __init__(self): self.epochs = 0 self.train_batches = 0 self.test_batches = 0 def on_epoch_end(self, batch, logs=None): self.epochs += 1 def on_train_batch_end(self, batch, logs=None): self.train_batches += 1 def on_test_batch_end(self, batch, logs=None): self.test_batches += 1 model = keras.Sequential([keras.layers.Dense(1)]) model.compile("sgd", "mse") cb_counter = CBCounter() x, y = np.ones((100, 10)), np.ones((100, 1)) ds = tf.data.Dataset.from_tensor_slices((x, y)) ds = ds.batch(10).repeat(2) ds = distribution.experimental_distribute_dataset(ds) val_ds = tf.data.Dataset.from_tensor_slices((x, y)) val_ds = val_ds.batch(20) val_ds = distribution.experimental_distribute_dataset(val_ds) model.fit( ds, steps_per_epoch=10, validation_data=val_ds, validation_steps=5, epochs=2, callbacks=[cb_counter], ) self.assertEqual(cb_counter.train_batches, 20) self.assertEqual(cb_counter.test_batches, 10) self.assertEqual(cb_counter.epochs, 2) # Check for `steps_per_epoch`. if distribution.num_replicas_in_sync > 1: with self.assertRaisesRegex( ValueError, "distributed dataset, you must specify" ): model.fit(ds, epochs=2) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=all_strategies, mode=["eager"] ) ) def test_distributed_datasets_from_function(self, distribution): with distribution.scope(): class CBCounter(keras.callbacks.Callback): def __init__(self): self.epochs = 0 self.train_batches = 0 self.test_batches = 0 def on_epoch_end(self, batch, logs=None): self.epochs += 1 def on_train_batch_end(self, batch, logs=None): self.train_batches += 1 def on_test_batch_end(self, batch, logs=None): self.test_batches += 1 model = keras.Sequential([keras.layers.Dense(1)]) model.compile("sgd", "mse") cb_counter = CBCounter() def make_dataset(_): x, y = np.ones((100, 10)), np.ones((100, 1)) ds = tf.data.Dataset.from_tensor_slices((x, y)) ds = ds.batch(5).repeat() return ds ds = distribution.distribute_datasets_from_function(make_dataset) val_ds = distribution.distribute_datasets_from_function( make_dataset ) model.fit( ds, steps_per_epoch=10, validation_data=val_ds, validation_steps=5, epochs=2, callbacks=[cb_counter], ) self.assertEqual(cb_counter.train_batches, 20) self.assertEqual(cb_counter.test_batches, 10) self.assertEqual(cb_counter.epochs, 2) # Check for `steps_per_epoch`. if distribution.num_replicas_in_sync > 1: with self.assertRaisesRegex( ValueError, "distributed dataset, you must specify" ): model.fit(ds, epochs=2) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=all_strategies, mode=["eager"] ) ) def test_host_training_loop(self, distribution): if isinstance(distribution, tf.distribute.MultiWorkerMirroredStrategy): self.skipTest("b/172032817") with distribution.scope(): inputs = keras.Input((10, 10, 3)) x = keras.layers.Conv2D(3, kernel_size=3)(inputs) x = keras.layers.Flatten()(x) outputs = keras.layers.Dense(1)(x) model = keras.Model(inputs, outputs) model.compile("sgd", "mse", steps_per_execution=10) bc = BatchCountingCB() x, y = np.ones((100, 10, 10, 3)), np.ones((100, 1)) model.fit(x, y, batch_size=2, epochs=1, callbacks=[bc]) self.assertEqual(bc.train_begin_batches, [0, 10, 20, 30, 40]) self.assertEqual(bc.train_end_batches, [9, 19, 29, 39, 49]) model.evaluate(x, y, batch_size=2, callbacks=[bc]) self.assertEqual(bc.test_begin_batches, [0, 10, 20, 30, 40]) self.assertEqual(bc.test_end_batches, [9, 19, 29, 39, 49]) model.predict(x, batch_size=2, callbacks=[bc]) self.assertEqual(bc.predict_begin_batches, [0, 10, 20, 30, 40]) self.assertEqual(bc.predict_end_batches, [9, 19, 29, 39, 49]) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=all_strategies, mode=["eager"] ) ) def test_host_training_loop_last_partial_execution(self, distribution): if isinstance(distribution, tf.distribute.MultiWorkerMirroredStrategy): self.skipTest("b/172032817") with distribution.scope(): inputs = keras.Input(10) outputs = keras.layers.Dense(1)(inputs) model = keras.Model(inputs, outputs) model.compile("sgd", "mse", steps_per_execution=20) bc = BatchCountingCB() x, y = np.ones((100, 10)), np.ones((100, 1)) model.fit(x, y, batch_size=2, epochs=1, callbacks=[bc]) self.assertEqual(bc.train_begin_batches, [0, 20, 40]) self.assertEqual(bc.train_end_batches, [19, 39, 49]) model.evaluate(x, y, batch_size=2, callbacks=[bc]) self.assertEqual(bc.test_begin_batches, [0, 20, 40]) self.assertEqual(bc.test_end_batches, [19, 39, 49]) model.predict(x, batch_size=2, callbacks=[bc]) self.assertEqual(bc.predict_begin_batches, [0, 20, 40]) self.assertEqual(bc.predict_end_batches, [19, 39, 49]) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=all_strategies, mode=["eager"] ) ) def test_host_training_loop_dataset_unknown_size(self, distribution): if isinstance(distribution, tf.distribute.MultiWorkerMirroredStrategy): self.skipTest("b/172032817") with distribution.scope(): inputs = keras.Input(10) outputs = keras.layers.Dense(1)(inputs) model = keras.Model(inputs, outputs) model.compile("sgd", "mse", steps_per_execution=20) x, y = np.ones((100, 10)), np.ones((100, 1)) ds = tf.data.Dataset.from_tensor_slices((x, y)).batch(2) ds = ds.filter(lambda *args, **kwargs: True) # Makes the size UNKNOWN. bc = BatchCountingCB() with self.assertRaisesRegex(ValueError, "steps_per_execution"): model.fit(ds, epochs=2, callbacks=[bc]) train_ds = ds.repeat(2) model.fit(train_ds, steps_per_epoch=50, epochs=2, callbacks=[bc]) self.assertEqual(bc.train_begin_batches, [0, 20, 40, 0, 20, 40]) self.assertEqual(bc.train_end_batches, [19, 39, 49, 19, 39, 49]) with self.assertRaisesRegex(ValueError, "steps_per_execution"): model.evaluate(ds, callbacks=[bc]) test_ds = ds.repeat(2) model.evaluate(test_ds, steps=50, callbacks=[bc]) self.assertEqual(bc.test_begin_batches, [0, 20, 40]) self.assertEqual(bc.test_end_batches, [19, 39, 49]) predict_ds = ds.repeat(2) model.predict(predict_ds, steps=50, callbacks=[bc]) self.assertEqual(bc.predict_begin_batches, [0, 20, 40]) self.assertEqual(bc.predict_end_batches, [19, 39, 49]) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=all_strategies, mode=["eager"] ) ) def test_host_training_loop_truncate_to_epoch(self, distribution): if isinstance(distribution, tf.distribute.MultiWorkerMirroredStrategy): self.skipTest("b/172032817") with distribution.scope(): inputs = keras.Input(10) outputs = keras.layers.Dense(1)(inputs) model = keras.Model(inputs, outputs) model.compile("sgd", "mse", steps_per_execution=500) x, y = np.ones((100, 10)), np.ones((100, 1)) bc = BatchCountingCB() model.fit(x, y, batch_size=2, epochs=2, callbacks=[bc]) self.assertEqual(bc.train_begin_batches, [0, 0]) self.assertEqual(bc.train_end_batches, [49, 49]) x, y = np.ones((50, 10)), np.ones((50, 1)) model.evaluate(x, y, batch_size=2, callbacks=[bc]) self.assertEqual(bc.test_begin_batches, [0]) self.assertEqual(bc.test_end_batches, [24]) x = np.ones((50, 10)) model.predict(x, batch_size=2, callbacks=[bc]) self.assertEqual(bc.predict_begin_batches, [0]) self.assertEqual(bc.predict_end_batches, [24]) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=all_strategies, mode=["eager"] ) ) def test_gradient_clipping(self, distribution): class MyLayer(keras.layers.Layer): def build(self, _): self.v1 = tf.Variable(1.0) self.v2 = tf.Variable(1.0) def call(self, x): return 3 * self.v1 - 3 * self.v2 x, y = np.ones((10, 1)), np.ones((10, 1)) with distribution.scope(): layer = MyLayer() model = keras.Sequential([layer]) optimizer = gradient_descent_keras.SGD( 1.0, clipnorm=2.0, clipvalue=2.0 ) model.compile(optimizer, "mae") if isinstance( distribution, ( tf.distribute.experimental.CentralStorageStrategy, tf.compat.v1.distribute.experimental.CentralStorageStrategy, ), ): with self.assertRaisesRegex(ValueError, "not supported"): model.fit(x, y, batch_size=10, epochs=1) else: model.fit(x, y, batch_size=10, epochs=1) self.assertAllClose(self.evaluate(layer.v1), 3.0) self.assertAllClose(self.evaluate(layer.v2), -1.0) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=all_strategies, mode=["eager"] ) ) def test_custom_gradient_transformation(self, distribution): if isinstance( distribution, ( tf.distribute.experimental.CentralStorageStrategy, tf.compat.v1.distribute.experimental.CentralStorageStrategy, ), ): self.skipTest("Not supported with `CentralStorageStrategy`") class MyLayer(keras.layers.Layer): def build(self, _): self.v1 = tf.Variable(1.0) self.v2 = tf.Variable(-1.0) def call(self, x): return x + self.v1 + self.v2 def custom_transform(grads_and_vars): # Always set gradients to 1. return [(tf.ones_like(g), v) for g, v in grads_and_vars] x, y = np.ones((10, 1)), np.ones((10, 1)) with distribution.scope(): layer = MyLayer() model = keras.Sequential([layer]) optimizer = gradient_descent_keras.SGD( 1.0, gradient_transformers=[custom_transform] ) model.compile(optimizer, "mae") model.fit(x, y, batch_size=10, epochs=1) self.assertAllClose(self.evaluate(layer.v1), 0.0) self.assertAllClose(self.evaluate(layer.v2), -2.0) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.times( all_strategy_combinations_minus_default() ) ) def test_distribution_strategy_one_dimensional(self, distribution): with distribution.scope(): inp = keras.layers.Input(shape=(10,)) out = keras.layers.Dense(3, activation="softmax")(inp) model = keras.Model(inputs=[inp], outputs=[out]) model.compile( optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["sparse_categorical_accuracy"], ) x = np.random.random((64, 10)).astype("float32") y = np.random.randint(3, size=64) model.fit(x, y, epochs=1, steps_per_epoch=2) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=[ tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus, # noqa: E501 ], mode=["graph", "eager"], reduction=[ losses_utils.ReductionV2.AUTO, losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE, losses_utils.ReductionV2.SUM, ], ) ) def test_distribution_strategy_with_loss_reduction_types( self, distribution, reduction ): np.random.seed(_RANDOM_SEED) def _get_model(): inputs = keras.Input((10,)) x1 = keras.layers.Dense(10, kernel_initializer="zeros")(inputs) x2 = keras.layers.Dense(10, kernel_initializer="zeros")(x1) outputs = keras.layers.Dense(1, kernel_initializer="zeros")(x2) model = keras.Model(inputs, outputs) return model x = np.random.random((64, 10)) y = np.random.random((64, 1)) dataset = tf.data.Dataset.from_tensor_slices((x, y)) dataset = dataset.batch(32) model = _get_model() model.compile( "sgd", loss=keras.losses.MeanSquaredError(reduction=reduction) ) history = model.fit(dataset, steps_per_epoch=2, epochs=1, shuffle=False) with distribution.scope(): ds_model = _get_model() ds_model.compile( "sgd", loss=keras.losses.MeanSquaredError(reduction=reduction) ) ds_history = ds_model.fit( dataset, steps_per_epoch=2, epochs=1, shuffle=False ) self.assertArrayNear( history.history["loss"], ds_history.history["loss"], 1e-5 ) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.times( all_strategy_combinations_minus_default() ) ) def test_distribution_strategy_with_symbolic_add_loss( self, mode, distribution ): def _make_model_with_add_loss(): inputs = keras.Input((10,)) x1 = keras.layers.Dense(10, kernel_initializer="zeros")(inputs) x2 = keras.layers.Dense(10, kernel_initializer="zeros")(x1) outputs = keras.layers.Dense(1, kernel_initializer="zeros")(x2) model = keras.Model(inputs, outputs) model.add_loss(tf.reduce_mean(x1)) model.add_loss(tf.reduce_mean(outputs)) return model x = np.ones((64, 10)).astype("float32") model = _make_model_with_add_loss() model.compile("sgd") history = model.fit(x, epochs=1) with distribution.scope(): ds_model = _make_model_with_add_loss() ds_model.compile("sgd") ds_history = ds_model.fit(x, epochs=1) self.assertAllClose(history.history, ds_history.history) # TODO(omalleyt): Investigate flakiness and re-enable. @tf.__internal__.distribute.combinations.generate( all_strategy_minus_default_and_tpu_combinations() ) def DISABLED_test_distribution_strategy_with_callable_add_loss( self, distribution ): def _make_model(): inputs = keras.Input((10,)) x1 = keras.layers.Dense(10, kernel_initializer="zeros")(inputs) x2 = keras.layers.Dense(10, kernel_initializer="zeros")(x1) d = keras.layers.Dense(1, kernel_initializer="zeros") outputs = d(x2) model = keras.Model(inputs, outputs) model.add_loss(lambda: 100.0 * tf.reduce_mean(d.kernel)) return model x = np.ones((64, 10)).astype("float32") y = np.ones((64, 1)).astype("float32") model = _make_model() self.assertLen(model.losses, 1) model.compile("sgd", "mse") history = model.fit(x, y, steps_per_epoch=2, epochs=1) with distribution.scope(): ds_model = _make_model() self.assertLen(ds_model.losses, 1) ds_model.compile("sgd", "mse") ds_history = ds_model.fit(x, y, steps_per_epoch=2, epochs=1) self.assertAllClose(history.history, ds_history.history) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.times( all_strategy_minus_default_and_tpu_combinations() ) ) def test_distribution_strategy_with_add_metric_in_call(self, distribution): class Bias(keras.layers.Layer): def build(self, input_shape): self.bias = self.add_weight( name="bias", initializer="zeros", shape=() ) def call(self, inputs): self.add_metric( tf.reduce_mean(inputs), name="bias", aggregation="mean" ) return inputs + self.bias def _make_model_with_add_metric(): inputs = keras.Input((10,)) x1 = keras.layers.Dense(10, kernel_initializer="zeros")(inputs) x2 = Bias()(x1) outputs = keras.layers.Dense(1, kernel_initializer="zeros")(x2) model = keras.Model(inputs, outputs) return model x = np.ones((64, 10)).astype("float32") y = np.ones((64, 1)).astype("float32") model = _make_model_with_add_metric() self.assertLen(model.metrics, 1) model.compile("sgd", "mse") history = model.fit( x, y, validation_data=(x, y), validation_steps=2, epochs=2 ) with distribution.scope(): ds_model = _make_model_with_add_metric() self.assertLen(ds_model.metrics, 1) ds_model.compile("sgd", "mse") ds_history = ds_model.fit( x, y, validation_data=(x, y), validation_steps=2, epochs=2 ) # includes stateful loss metric in eager. metrics_len = 2 if tf.executing_eagerly() else 1 self.assertLen(ds_model.metrics, metrics_len) self.assertAllClose(history.history, ds_history.history) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=[ tf.__internal__.distribute.combinations.one_device_strategy, tf.__internal__.distribute.combinations.one_device_strategy_gpu, tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus, # noqa: E501 ], mode=["eager"], ) ) def test_distribution_strategy_with_add_metric_object(self, distribution): class Bias(keras.layers.Layer): def build(self, input_shape): self.bias = self.add_weight( name="bias", initializer="zeros", shape=() ) self.mean = keras.metrics.Mean(name="mean") def call(self, inputs): self.add_metric(self.mean(inputs)) return inputs + self.bias def _make_model_with_add_metric_object(): inputs = keras.Input((10,)) x1 = keras.layers.Dense(10, kernel_initializer="zeros")(inputs) x2 = Bias()(x1) outputs = keras.layers.Dense(1, kernel_initializer="zeros")(x2) model = keras.Model(inputs, outputs) return model x = np.ones((64, 10)).astype("float32") y = np.ones((64, 1)).astype("float32") model = _make_model_with_add_metric_object() self.assertLen(model.metrics, 1) model.compile("sgd", "mse") history = model.fit( x, y, validation_data=(x, y), validation_steps=2, epochs=2 ) with distribution.scope(): ds_model = _make_model_with_add_metric_object() self.assertLen(ds_model.metrics, 1) ds_model.compile("sgd", "mse") ds_history = ds_model.fit( x, y, validation_data=(x, y), validation_steps=2, epochs=2 ) # includes stateful loss metric in eager. metrics_len = 2 if tf.executing_eagerly() else 1 self.assertLen(ds_model.metrics, metrics_len) self.assertAllClose(history.history, ds_history.history) @tf.__internal__.distribute.combinations.generate( # TODO(phillypham): Why does validation_steps > 1 not work on TPUs? tf.__internal__.test.combinations.times( all_strategy_minus_default_and_tpu_combinations() ) ) def test_distribution_strategy_with_add_metric_outside_call( self, distribution ): def _make_model_with_add_metric(): inputs = keras.Input((10,)) x1 = keras.layers.Dense(10, kernel_initializer="zeros")(inputs) outputs = keras.layers.Dense(1, kernel_initializer="zeros")(x1) model = keras.Model(inputs, outputs) model.add_metric( tf.reduce_mean(x1), name="mid_mean", aggregation="mean" ) return model x = np.ones((64, 10)).astype("float32") y = np.ones((64, 1)).astype("float32") model = _make_model_with_add_metric() self.assertLen(model.metrics, 1) model.compile("sgd", "mse") history = model.fit( x, y, validation_data=(x, y), validation_steps=2, epochs=2 ) with distribution.scope(): ds_model = _make_model_with_add_metric() self.assertLen(ds_model.metrics, 1) ds_model.compile("sgd", "mse") ds_history = ds_model.fit( x, y, validation_data=(x, y), validation_steps=2, epochs=2 ) # includes stateful loss metric in eager. metrics_len = 2 if tf.executing_eagerly() else 1 self.assertLen(ds_model.metrics, metrics_len) self.assertAllClose(history.history, ds_history.history) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=strategies_minus_tpu + multi_worker_mirrored_strategies, mode=["eager"], ) ) def test_sparse_tensor_outputs(self, distribution): class ToSparse(keras.layers.Layer): """Create a sparse tensor based on a given dense tensor.""" def call(self, inputs): indices = tf.where(tf.not_equal(inputs, 0)) values = tf.gather_nd(inputs, indices) shape = tf.shape(inputs, out_type="int64") return tf.SparseTensor(indices, values, dense_shape=shape) model = keras.Sequential([ToSparse()]) # Define some input data with additional padding. input_data = np.array([[1, 0, 0], [2, 3, 0]]) output = model.predict(input_data, batch_size=2) expected_indices = np.array([[0, 0], [1, 0], [1, 1]]) expected_values = np.array([1, 2, 3]) expected_dense_shape = np.array([2, 3]) self.assertAllEqual(output.indices, expected_indices) self.assertAllEqual(output.values, expected_values) self.assertAllEqual(output.dense_shape, expected_dense_shape) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=strategies_minus_tpu + multi_worker_mirrored_strategies, mode=["eager"], ) ) def test_ragged_tensor_outputs(self, distribution): class ToRagged(keras.layers.Layer): """Create a ragged tensor based on a given dense tensor.""" def __init__(self, padding, ragged_rank=1, **kwargs): super().__init__(**kwargs) self._padding = padding self._ragged_rank = ragged_rank def call(self, inputs): return tf.RaggedTensor.from_tensor( inputs, padding=self._padding, ragged_rank=self._ragged_rank ) model = keras.Sequential([ToRagged(padding=0)]) # Define some input data with additional padding. input_data = np.array([[1, 0, 0], [2, 3, 0]]) output = model.predict(input_data, batch_size=2) expected_values = [[1], [2, 3]] self.assertAllEqual(expected_values, output) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=strategies_minus_default_minus_tpu + tpu_strategies + multi_worker_mirrored_strategies, mode=["eager"], ) ) def test_correctness_of_add_loss_with_merge_call(self, distribution): batch_size = 32 def _get_model(): inputs = keras.layers.Input(shape=(1,)) labels = keras.layers.Input(shape=(1,)) x = keras.layers.Dense(10, activation="relu")(inputs) y = keras.layers.Dense(1)(x) model = keras.models.Model([inputs, labels], y) model.add_loss(keras.losses.mean_squared_error(labels, y)) return model def _get_data(): x_train = np.random.rand(64, 1) y_train = 3 * x_train x_train = x_train.astype("float32") y_train = y_train.astype("float32") dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) dataset = dataset.batch(batch_size) return dataset with distribution.scope(): model = _get_model() optimizer = gradient_descent_keras.SGD(0.2) @tf.function def train_step(dist_inputs): def step_fn(inputs): with tf.GradientTape() as tape: logits = model(inputs) # Invoke a merge_call() tf.distribute.get_replica_context().merge_call( lambda d: None ) # Verify that there is only one loss on the model. assert len(model.losses) == 1 loss_from_model = ( tf.reduce_sum(model.losses) * 1.0 / batch_size ) # Compute loss in this loop. loss = keras.losses.mean_squared_error( inputs[1], logits ) loss = tf.nn.compute_average_loss( loss, global_batch_size=batch_size ) # Verify that the loss computed in this loop is # equivalent to the loss from the model that was added # via add_loss. tf.compat.v1.assert_equal(loss, loss_from_model) grads = tape.gradient(loss, model.trainable_variables) optimizer.apply_gradients( zip(grads, model.trainable_variables) ) return loss per_replica_losses = distribution.run( step_fn, args=(dist_inputs,) ) return distribution.reduce( tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None ) dataset = distribution.experimental_distribute_dataset(_get_data()) for _ in range(2): for x in dataset: train_step(x) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine(mode=["graph", "eager"]) ) def test_unimplemented_parameter_server_strategy(self): cluster_spec = multi_worker_testing_utils.create_in_process_cluster( num_workers=3, num_ps=2 ) cluster_resolver = SimpleClusterResolver( cluster_spec=tf.train.ClusterSpec(cluster_spec), task_type="worker", task_id=1, num_accelerators={"GPU": 0}, ) distribution = ( tf.compat.v1.distribute.experimental.ParameterServerStrategy( cluster_resolver ) ) self.assertIsInstance( distribution, tf.compat.v1.distribute.experimental.ParameterServerStrategy, ) with self.assertRaisesRegex( NotImplementedError, "ParameterServerStrategy*" ): with distribution.scope(): model = simple_sequential_model() optimizer = tf.compat.v1.train.RMSPropOptimizer( learning_rate=0.001 ) loss = "mse" model.compile(optimizer, loss) # Models to exercise inserting ancillary layers with add_loss and add_metric. def _functional_with_add_loss_and_metric(input_shape, num_classes, l1, l2): inputs = keras.Input(input_shape, name="images") x = keras.layers.Conv2D(32, kernel_size=5, activation="relu")(inputs) x = keras.layers.MaxPooling2D(pool_size=2)(x) x = keras.layers.Conv2D(64, kernel_size=5, activation="relu")(x) x = keras.layers.MaxPooling2D(pool_size=2)(x) # Apply L2 regularization to embedding. Use a mix of TensorFlow ops and # layers to exercise all code paths. x = keras.layers.Flatten(name="embedding")(x) l2_loss = tf.reduce_mean(tf.reduce_sum(tf.square(x), -1)) # Apply L1 regularization to next layer. x = keras.layers.Dense(1024, activation="relu", name="sparse_embedding")(x) l1_loss = keras.layers.Lambda( lambda x: tf.reduce_mean(tf.reduce_sum(x, -1)), name="l1_loss" )(x) outputs = keras.layers.Dense(num_classes, name="logits")(x) model = keras.Model(inputs=inputs, outputs=outputs) # Weight regularization terms. model.add_loss(keras.layers.Lambda(lambda x: x * l2)(l2_loss)) model.add_metric(l2_loss, aggregation="mean", name="l2_loss") model.add_loss(l1_loss * l1) model.add_metric(l1_loss, aggregation="mean", name="l1_loss") return model def _sequential_with_add_loss_and_metric(input_shape, num_classes, l1, l2): model = keras.Sequential( [ keras.layers.Conv2D( 32, kernel_size=5, activation="relu", input_shape=input_shape ), keras.layers.MaxPooling2D(pool_size=2), keras.layers.Conv2D(64, kernel_size=5, activation="relu"), keras.layers.MaxPooling2D(pool_size=2), keras.layers.Flatten(name="embedding"), keras.layers.Dense( 1024, activation="relu", name="sparse_embedding" ), keras.layers.Dense(num_classes, name="logits"), ] ) # Extract layer outputs, add regularization terms, and rescale the metric. # Use a mix of TensorFlow ops and layers to exercise all code paths. x = model.get_layer("sparse_embedding").get_output_at(-1) l1_loss = l1 * tf.reduce_mean(tf.reduce_sum(x, -1)) model.add_loss(l1_loss) model.add_metric( keras.layers.Lambda(lambda x: tf.divide(x, l1))(l1_loss), aggregation="mean", name="l1_loss", ) x = model.get_layer("embedding").get_output_at(-1) l2_loss = keras.layers.Lambda( lambda x: l2 * tf.reduce_mean(tf.reduce_sum(x * x, -1)), name="l2_loss" )(x) model.add_loss(l2_loss) model.add_metric(l2_loss / l2, aggregation="mean", name="l2_loss") return model def _functional_with_layer_reuse(input_shape, num_classes, l1, l2): base_model = keras.Sequential( [ keras.layers.Conv2D( 32, kernel_size=5, activation="relu", input_shape=input_shape ), keras.layers.MaxPooling2D(pool_size=2), keras.layers.Conv2D(64, kernel_size=5, activation="relu"), keras.layers.MaxPooling2D(pool_size=2), keras.layers.Flatten(), keras.layers.Dense(1024, activation="relu"), keras.layers.Dense(num_classes, name="logits"), ] ) inputs = keras.Input(input_shape, name="images") logits = base_model(inputs) model = keras.Model(inputs=inputs, outputs=logits) # Reuse sequential layer and create new nodes. zero_logits = base_model(tf.zeros_like(inputs)) one_logits = base_model(tf.ones_like(inputs)) # L2 loss. l2_loss = tf.reduce_mean(tf.reduce_sum(tf.square(logits - zero_logits), -1)) model.add_loss(l2_loss * l2) model.add_metric(l2_loss, aggregation="mean", name="l2_loss") # L1 loss. l1_loss = tf.reduce_mean(tf.reduce_sum(tf.abs(logits - one_logits), -1)) model.add_loss(l1_loss * l1) model.add_metric(l1_loss, aggregation="mean", name="l1_loss") return model class TestDistributionStrategyWithMultipleAddLossAndMetricCalls( tf.test.TestCase, parameterized.TestCase ): """Tests complex models with multiple add loss and metric calls.""" @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.times( all_strategy_combinations_minus_default(), tf.__internal__.test.combinations.combine( model_fn=[ _functional_with_add_loss_and_metric, _sequential_with_add_loss_and_metric, _functional_with_layer_reuse, ], l1=[0.01], l2=[0.1], ), ) ) def test_fit_and_evaluate(self, distribution, model_fn, l1, l2): # Make fake MNIST-like image data. np.random.seed(_RANDOM_SEED) dataset = tf.data.Dataset.from_tensor_slices( ( np.random.uniform(size=(64, 28, 28, 1)).astype(np.float32), np.random.randint(0, 10, size=(64,)), ) ) dataset = dataset.shuffle(64).batch( 8 * distribution.num_replicas_in_sync, drop_remainder=True ) # Make model with distribution strategy and initialize with dataset # shape. input_shape = tf.data.experimental.get_structure(dataset)[0].shape[1:] with distribution.scope(): model = model_fn(input_shape, 10, l1, l2) model.compile( optimizer=keras.optimizers.adam_legacy.Adam(1e-4), loss=keras.losses.SparseCategoricalCrossentropy( from_logits=True, reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE, ), metrics=[ keras.metrics.SparseCategoricalAccuracy(), keras.metrics.SparseCategoricalCrossentropy( from_logits=True ), ], ) # Non-eager training doesn't support steps_per_epoch=None. for unused_epoch in range(2): model.fit(dataset) results = dict(zip(model.metrics_names, model.evaluate(dataset))) # Sanity checks. self.assertBetween(results["sparse_categorical_accuracy"], 0.02, 1.0) self.assertGreater(results["l2_loss"], 0.0) self.assertGreater(results["l1_loss"], 0.0) # Assert correctness of the loss calculation and updating of metrics. self.assertNear( results["l1_loss"] * l1 + results["l2_loss"] * l2 + results["sparse_categorical_crossentropy"], results["loss"], 1e-6, ) class DeterministicModel(keras.Model): """Deterministic Model that always outputs the same initial result. It verifies the `call` method is run inside the same distribution strategy that the model was initially passed. """ def __init__(self, strategy): super().__init__() self.x = None self.strategy = strategy def build(self, input_shape): self.x = tf.Variable(tf.ones(shape=())) def call(self, inputs, training=None, mask=None): active_strategy = tf.distribute.get_strategy() if active_strategy is not self.strategy: raise ValueError("Model must execute call w/ the original strategy") return self.x * inputs class TestModelCapturesStrategy(tf.test.TestCase, parameterized.TestCase): """Tests that model creation captures the strategy.""" @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=all_strategies, mode=["eager"] ) ) def test_fit_and_evaluate(self, distribution): dataset = tf.data.Dataset.from_tensor_slices( (tf.ones(shape=(64,)), tf.ones(shape=(64,))) ) dataset = dataset.batch(8 * distribution.num_replicas_in_sync) # Make model with distribution strategy with distribution.scope(): model = DeterministicModel(distribution) optimizer = keras.optimizers.adam_legacy.Adam(1e-4) # Compile & evaluate the model outside of the distribution strategy # scope model.compile( optimizer=optimizer, loss=keras.losses.MeanSquaredError(), metrics=["binary_accuracy"], ) # Call `optimizer.iterations` out of strategy scope. self.assertEqual(model.optimizer.iterations.numpy(), 0) # Non-eager training doesn't support steps_per_epoch=None. for unused_epoch in range(2): model.fit(dataset) results = model.evaluate(dataset) results = dict(zip(model.metrics_names, results)) # Check that the metrics have a result we expect self.assertEqual(results["binary_accuracy"], 1.0) self.assertAllClose(results["loss"], 0.0) # Assert that all metric/optimizer/model variables were made in the # distribution strategy (Test that compile uses the captured # distribution strategy) metric_vars = tf.nest.flatten( [metric.variables for metric in model.metrics] ) for var in metric_vars: self.assertTrue( distribution.extended.variable_created_in_scope(var) ) for var in model.optimizer._weights: self.assertTrue( distribution.extended.variable_created_in_scope(var) ) for var in model.variables: self.assertTrue( distribution.extended.variable_created_in_scope(var) ) # Make sure the metric must be created in the same scope as the model: # This shouldn't raise any validation errors with distribution.scope(): metric = keras.metrics.BinaryAccuracy() model.compile( optimizer=optimizer, loss=keras.losses.MeanSquaredError(), metrics=[metric], ) # This should raise an error because the metric is constructed # outside of the scope, and not by compile if tf.distribute.has_strategy(): with self.assertRaisesRegex( ValueError, "All metrics must be created in" ): model.compile( optimizer=keras.optimizers.adam_v2.Adam(1e-4), loss=keras.losses.MeanSquaredError(), metrics=[keras.metrics.BinaryAccuracy()], ) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=tf.__internal__.distribute.combinations.mirrored_strategy_with_one_cpu, # noqa: E501 mode=["eager"], ) ) def test_optimizer(self, distribution): temp_dir = os.path.join(self.get_temp_dir(), "ckpt") def create_model(): model = keras.models.Sequential( [ keras.layers.Dense(1), ] ) model.compile(optimizer=keras.optimizers.Adam(), loss="mse") model.build([None, 1]) # create weights. return model model = create_model() x = y = tf.ones(shape=(1, 1)) model.fit(x=x, y=y, batch_size=1) model.save_weights(temp_dir) with distribution.scope(): model = create_model() model.load_weights(temp_dir) if isinstance(model.optimizer, optimizer_base.Optimizer): model.optimizer.build(model.trainable_variables) variables = model.optimizer.variables else: variables = model.optimizer.variables() self.assertNotEmpty(variables) self.assertTrue( distributed_training_utils.is_distributed_variable(variables[0]) ) with distribution.scope(): model = create_model() # create/restore slot variables outside of scope is fine. model.load_weights(temp_dir) if isinstance(model.optimizer, optimizer_base.Optimizer): # V3 optimizer has to restore variables in scope. return # From this point on, the optimizer must be a V2 optimizer. self.assertNotEmpty(model.optimizer.variables()) self.assertTrue( distributed_training_utils.is_distributed_variable( model.optimizer.variables()[0] ) ) if __name__ == "__main__": base_layer_utils.enable_v2_dtype_behavior() tf.__internal__.distribute.multi_process_runner.test_main()