# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for testing multi-worker distribution strategies with Keras."""
|
|
||
|
import threading
|
||
|
import unittest
|
||
|
|
||
|
import tensorflow.compat.v2 as tf
|
||
|
|
||
|
import keras
|
||
|
from keras.optimizers.legacy import gradient_descent
|
||
|
|
||
|
# isort: off
|
||
|
from tensorflow.python.distribute.cluster_resolver import (
|
||
|
SimpleClusterResolver,
|
||
|
)
|
||
|
from tensorflow.python.platform import tf_logging as logging
|
||
|
from tensorflow.python.training.server_lib import (
|
||
|
ClusterSpec,
|
||
|
)
|
||
|
|
||
|
_portpicker_import_error = None
|
||
|
try:
|
||
|
import portpicker
|
||
|
except (
|
||
|
ImportError,
|
||
|
ModuleNotFoundError,
|
||
|
) as _error:
|
||
|
_portpicker_import_error = _error
|
||
|
portpicker = None
|
||
|
|
||
|
ASSIGNED_PORTS = set()
|
||
|
lock = threading.Lock()
|
||
|
|
||
|
|
||
|
def mnist_synthetic_dataset(
|
||
|
batch_size, steps_per_epoch, target_values="constant"
|
||
|
):
|
||
|
"""Generate synthetic MNIST dataset for testing."""
|
||
|
# train dataset
|
||
|
x_train = tf.ones(
|
||
|
[batch_size * steps_per_epoch, 28, 28, 1], dtype=tf.float32
|
||
|
)
|
||
|
if target_values == "constant":
|
||
|
y_train = tf.ones([batch_size * steps_per_epoch, 1], dtype=tf.int32)
|
||
|
elif target_values == "increasing":
|
||
|
y_train = tf.reshape(
|
||
|
tf.range(batch_size * steps_per_epoch, dtype=tf.int32), (-1, 1)
|
||
|
)
|
||
|
else:
|
||
|
raise ValueError(
|
||
|
'Unknown value for `target_values` "'
|
||
|
+ str(target_values)
|
||
|
+ '". Valid options are "constant" and "increasing".'
|
||
|
)
|
||
|
|
||
|
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
|
||
|
train_ds = train_ds.repeat()
|
||
|
# train_ds = train_ds.shuffle(100)
|
||
|
train_ds = train_ds.batch(batch_size, drop_remainder=True)
|
||
|
|
||
|
# eval dataset
|
||
|
x_test = tf.random.uniform([10000, 28, 28, 1], dtype=tf.float32)
|
||
|
y_test = tf.random.uniform([10000, 1], minval=0, maxval=9, dtype=tf.int32)
|
||
|
eval_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test))
|
||
|
eval_ds = eval_ds.batch(batch_size, drop_remainder=True)
|
||
|
|
||
|
return train_ds, eval_ds
|
||
|
|
||
|
|
||
|
def get_mnist_model(input_shape):
|
||
|
"""Define a deterministically-initialized CNN model for MNIST testing."""
|
||
|
inputs = keras.Input(shape=input_shape)
|
||
|
x = keras.layers.Conv2D(
|
||
|
32,
|
||
|
kernel_size=(3, 3),
|
||
|
activation="relu",
|
||
|
kernel_initializer=keras.initializers.TruncatedNormal(seed=99),
|
||
|
)(inputs)
|
||
|
x = keras.layers.BatchNormalization()(x)
|
||
|
x = keras.layers.Flatten()(x) + keras.layers.Flatten()(x)
|
||
|
x = keras.layers.Dense(
|
||
|
10,
|
||
|
activation="softmax",
|
||
|
kernel_initializer=keras.initializers.TruncatedNormal(seed=99),
|
||
|
)(x)
|
||
|
model = keras.Model(inputs=inputs, outputs=x)
|
||
|
|
||
|
# TODO(yuefengz): optimizer with slot variables doesn't work because of
|
||
|
# optimizer's bug.
|
||
|
# TODO(yuefengz): we should not allow non-v2 optimizer.
|
||
|
model.compile(
|
||
|
loss=keras.losses.sparse_categorical_crossentropy,
|
||
|
optimizer=gradient_descent.SGD(learning_rate=0.001),
|
||
|
metrics=["accuracy"],
|
||
|
)
|
||
|
return model
|
||
|
|
||
|
|
||
|
def make_parameter_server_cluster(num_workers, num_ps):
|
||
|
cluster_def = create_in_process_cluster(
|
||
|
num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc"
|
||
|
)
|
||
|
return SimpleClusterResolver(ClusterSpec(cluster_def), rpc_layer="grpc")
|
||
|
|
||
|
|
||
|
def pick_unused_port():
|
||
|
"""Returns an unused and unassigned local port."""
|
||
|
if _portpicker_import_error:
|
||
|
raise _portpicker_import_error
|
||
|
|
||
|
global ASSIGNED_PORTS
|
||
|
with lock:
|
||
|
while True:
|
||
|
try:
|
||
|
port = portpicker.pick_unused_port()
|
||
|
except portpicker.NoFreePortFoundError:
|
||
|
raise unittest.SkipTest(
|
||
|
"Flakes in portpicker library do not represent "
|
||
|
"TensorFlow errors."
|
||
|
)
|
||
|
if port > 10000 and port not in ASSIGNED_PORTS:
|
||
|
ASSIGNED_PORTS.add(port)
|
||
|
logging.info("Using local port %r", port)
|
||
|
return port
|
||
|
|
||
|
|
||
|
def _create_cluster(
|
||
|
num_workers,
|
||
|
num_ps,
|
||
|
has_chief=False,
|
||
|
has_eval=False,
|
||
|
protocol="grpc",
|
||
|
worker_config=None,
|
||
|
ps_config=None,
|
||
|
eval_config=None,
|
||
|
worker_name="worker",
|
||
|
ps_name="ps",
|
||
|
chief_name="chief",
|
||
|
):
|
||
|
"""Creates and starts local servers and returns the cluster_spec dict."""
|
||
|
if _portpicker_import_error:
|
||
|
raise _portpicker_import_error
|
||
|
worker_ports = [pick_unused_port() for _ in range(num_workers)]
|
||
|
ps_ports = [pick_unused_port() for _ in range(num_ps)]
|
||
|
|
||
|
cluster_dict = {}
|
||
|
if num_workers > 0:
|
||
|
cluster_dict[worker_name] = [
|
||
|
f"localhost:{port}" for port in worker_ports
|
||
|
]
|
||
|
if num_ps > 0:
|
||
|
cluster_dict[ps_name] = [f"localhost:{port}" for port in ps_ports]
|
||
|
if has_eval:
|
||
|
cluster_dict["evaluator"] = [f"localhost:{pick_unused_port()}"]
|
||
|
if has_chief:
|
||
|
cluster_dict[chief_name] = [f"localhost:{pick_unused_port()}"]
|
||
|
|
||
|
cs = tf.train.ClusterSpec(cluster_dict)
|
||
|
|
||
|
for i in range(num_workers):
|
||
|
tf.distribute.Server(
|
||
|
cs,
|
||
|
job_name=worker_name,
|
||
|
protocol=protocol,
|
||
|
task_index=i,
|
||
|
config=worker_config,
|
||
|
start=True,
|
||
|
)
|
||
|
|
||
|
for i in range(num_ps):
|
||
|
tf.distribute.Server(
|
||
|
cs,
|
||
|
job_name=ps_name,
|
||
|
protocol=protocol,
|
||
|
task_index=i,
|
||
|
config=ps_config,
|
||
|
start=True,
|
||
|
)
|
||
|
|
||
|
if has_chief:
|
||
|
tf.distribute.Server(
|
||
|
cs,
|
||
|
job_name=chief_name,
|
||
|
protocol=protocol,
|
||
|
task_index=0,
|
||
|
config=worker_config,
|
||
|
start=True,
|
||
|
)
|
||
|
|
||
|
if has_eval:
|
||
|
tf.distribute.Server(
|
||
|
cs,
|
||
|
job_name="evaluator",
|
||
|
protocol=protocol,
|
||
|
task_index=0,
|
||
|
config=eval_config,
|
||
|
start=True,
|
||
|
)
|
||
|
|
||
|
return cluster_dict
|
||
|
|
||
|
|
||
|
def create_in_process_cluster(
|
||
|
num_workers, num_ps, has_chief=False, has_eval=False, rpc_layer="grpc"
|
||
|
):
|
||
|
"""Create an in-process cluster that consists of only standard server."""
|
||
|
# Leave some memory for cuda runtime.
|
||
|
gpu_mem_frac = 0.7 / (num_workers + int(has_chief) + int(has_eval))
|
||
|
worker_config = tf.compat.v1.ConfigProto()
|
||
|
worker_config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_frac
|
||
|
|
||
|
# The cluster may hang if workers don't have enough inter_op threads. See
|
||
|
# b/172296720 for more details.
|
||
|
if worker_config.inter_op_parallelism_threads < num_workers + 1:
|
||
|
worker_config.inter_op_parallelism_threads = num_workers + 1
|
||
|
|
||
|
# Enable collective ops which has no impact on non-collective ops.
|
||
|
if has_chief:
|
||
|
worker_config.experimental.collective_group_leader = (
|
||
|
"/job:chief/replica:0/task:0"
|
||
|
)
|
||
|
else:
|
||
|
worker_config.experimental.collective_group_leader = (
|
||
|
"/job:worker/replica:0/task:0"
|
||
|
)
|
||
|
|
||
|
ps_config = tf.compat.v1.ConfigProto()
|
||
|
ps_config.device_count["GPU"] = 0
|
||
|
|
||
|
eval_config = tf.compat.v1.ConfigProto()
|
||
|
eval_config.experimental.collective_group_leader = ""
|
||
|
|
||
|
# Create in-process servers. Once an in-process tensorflow server is
|
||
|
# created, there is no way to terminate it. So we create one cluster per
|
||
|
# test process. We could've started the server in another process, we could
|
||
|
# then kill that process to terminate the server. The reasons why we don"t
|
||
|
# want multiple processes are
|
||
|
# 1) it is more difficult to manage these processes;
|
||
|
# 2) there is something global in CUDA such that if we initialize CUDA in
|
||
|
# the parent process, the child process cannot initialize it again and thus
|
||
|
# cannot use GPUs (https://stackoverflow.com/questions/22950047).
|
||
|
cluster = None
|
||
|
try:
|
||
|
cluster = _create_cluster(
|
||
|
num_workers,
|
||
|
num_ps=num_ps,
|
||
|
has_chief=has_chief,
|
||
|
has_eval=has_eval,
|
||
|
worker_config=worker_config,
|
||
|
ps_config=ps_config,
|
||
|
eval_config=eval_config,
|
||
|
protocol=rpc_layer,
|
||
|
)
|
||
|
except tf.errors.UnknownError as e:
|
||
|
if "Could not start gRPC server" in e.message:
|
||
|
raise unittest.SkipTest("Cannot start std servers.")
|
||
|
else:
|
||
|
raise
|
||
|
return cluster
|