# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for testing multi-worker distribution strategies with Keras."""
import threading
import unittest
import tensorflow.compat.v2 as tf
import keras
from keras.optimizers.legacy import gradient_descent
# isort: off
from tensorflow.python.distribute.cluster_resolver import (
SimpleClusterResolver,
)
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training.server_lib import (
ClusterSpec,
)
_portpicker_import_error = None
try:
import portpicker
except (
ImportError,
ModuleNotFoundError,
) as _error:
_portpicker_import_error = _error
portpicker = None
ASSIGNED_PORTS = set()
lock = threading.Lock()
def mnist_synthetic_dataset(
batch_size, steps_per_epoch, target_values="constant"
):
"""Generate synthetic MNIST dataset for testing."""
# train dataset
x_train = tf.ones(
[batch_size * steps_per_epoch, 28, 28, 1], dtype=tf.float32
)
if target_values == "constant":
y_train = tf.ones([batch_size * steps_per_epoch, 1], dtype=tf.int32)
elif target_values == "increasing":
y_train = tf.reshape(
tf.range(batch_size * steps_per_epoch, dtype=tf.int32), (-1, 1)
)
else:
raise ValueError(
'Unknown value for `target_values` "'
+ str(target_values)
+ '". Valid options are "constant" and "increasing".'
)
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_ds = train_ds.repeat()
# train_ds = train_ds.shuffle(100)
train_ds = train_ds.batch(batch_size, drop_remainder=True)
# eval dataset
x_test = tf.random.uniform([10000, 28, 28, 1], dtype=tf.float32)
y_test = tf.random.uniform([10000, 1], minval=0, maxval=9, dtype=tf.int32)
eval_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test))
eval_ds = eval_ds.batch(batch_size, drop_remainder=True)
return train_ds, eval_ds
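

# Illustrative usage sketch (not part of the original module): the helper
# above yields batched (image, label) pairs; the element shapes checked below
# assume the default "constant" target mode and an arbitrary batch size.
def _example_inspect_synthetic_dataset():
    train_ds, eval_ds = mnist_synthetic_dataset(
        batch_size=4, steps_per_epoch=2
    )
    images, labels = next(iter(train_ds))
    assert images.shape == (4, 28, 28, 1)
    assert labels.shape == (4, 1)
    return train_ds, eval_ds

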
def get_mnist_model(input_shape):
"""Define a deterministically-initialized CNN model for MNIST testing."""
inputs = keras.Input(shape=input_shape)
x = keras.layers.Conv2D(
32,
kernel_size=(3, 3),
activation="relu",
kernel_initializer=keras.initializers.TruncatedNormal(seed=99),
)(inputs)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.Flatten()(x) + keras.layers.Flatten()(x)
x = keras.layers.Dense(
10,
activation="softmax",
kernel_initializer=keras.initializers.TruncatedNormal(seed=99),
)(x)
model = keras.Model(inputs=inputs, outputs=x)
# TODO(yuefengz): optimizer with slot variables doesn't work because of
# optimizer's bug.
# TODO(yuefengz): we should not allow non-v2 optimizer.
model.compile(
loss=keras.losses.sparse_categorical_crossentropy,
optimizer=gradient_descent.SGD(learning_rate=0.001),
metrics=["accuracy"],
)
return model
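

# Illustrative usage sketch (not part of the original module): builds the
# already-compiled test model via get_mnist_model() and fits it for one epoch
# on the synthetic dataset above; batch size and step count are arbitrary.
def _example_fit_mnist_model():
    train_ds, _ = mnist_synthetic_dataset(batch_size=8, steps_per_epoch=2)
    model = get_mnist_model(input_shape=(28, 28, 1))
    return model.fit(train_ds, epochs=1, steps_per_epoch=2)

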
def make_parameter_server_cluster(num_workers, num_ps):
cluster_def = create_in_process_cluster(
num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc"
)
return SimpleClusterResolver(ClusterSpec(cluster_def), rpc_layer="grpc")
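

# Illustrative usage sketch (not part of the original module): the resolver
# returned by make_parameter_server_cluster() can seed a
# ParameterServerStrategy in a test, assuming the in-process servers start
# successfully on the local machine.
def _example_parameter_server_strategy():
    cluster_resolver = make_parameter_server_cluster(num_workers=2, num_ps=1)
    return tf.distribute.experimental.ParameterServerStrategy(
        cluster_resolver
    )

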
def pick_unused_port():
"""Returns an unused and unassigned local port."""
if _portpicker_import_error:
raise _portpicker_import_error
global ASSIGNED_PORTS
with lock:
while True:
try:
port = portpicker.pick_unused_port()
except portpicker.NoFreePortFoundError:
raise unittest.SkipTest(
"Flakes in portpicker library do not represent "
"TensorFlow errors."
)
if port > 10000 and port not in ASSIGNED_PORTS:
ASSIGNED_PORTS.add(port)
logging.info("Using local port %r", port)
return port


def _create_cluster(
num_workers,
num_ps,
has_chief=False,
has_eval=False,
protocol="grpc",
worker_config=None,
ps_config=None,
eval_config=None,
worker_name="worker",
ps_name="ps",
chief_name="chief",
):
"""Creates and starts local servers and returns the cluster_spec dict."""
if _portpicker_import_error:
raise _portpicker_import_error
worker_ports = [pick_unused_port() for _ in range(num_workers)]
ps_ports = [pick_unused_port() for _ in range(num_ps)]
cluster_dict = {}
if num_workers > 0:
cluster_dict[worker_name] = [
f"localhost:{port}" for port in worker_ports
]
if num_ps > 0:
cluster_dict[ps_name] = [f"localhost:{port}" for port in ps_ports]
if has_eval:
cluster_dict["evaluator"] = [f"localhost:{pick_unused_port()}"]
if has_chief:
cluster_dict[chief_name] = [f"localhost:{pick_unused_port()}"]
cs = tf.train.ClusterSpec(cluster_dict)
for i in range(num_workers):
tf.distribute.Server(
cs,
job_name=worker_name,
protocol=protocol,
task_index=i,
config=worker_config,
start=True,
)
for i in range(num_ps):
tf.distribute.Server(
cs,
job_name=ps_name,
protocol=protocol,
task_index=i,
config=ps_config,
start=True,
)
if has_chief:
tf.distribute.Server(
cs,
job_name=chief_name,
protocol=protocol,
task_index=0,
config=worker_config,
start=True,
)
if has_eval:
tf.distribute.Server(
cs,
job_name="evaluator",
protocol=protocol,
task_index=0,
config=eval_config,
start=True,
)
return cluster_dict


def create_in_process_cluster(
num_workers, num_ps, has_chief=False, has_eval=False, rpc_layer="grpc"
):
"""Create an in-process cluster that consists of only standard server."""
    # Leave some memory for the CUDA runtime.
gpu_mem_frac = 0.7 / (num_workers + int(has_chief) + int(has_eval))
worker_config = tf.compat.v1.ConfigProto()
worker_config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_frac
# The cluster may hang if workers don't have enough inter_op threads. See
# b/172296720 for more details.
if worker_config.inter_op_parallelism_threads < num_workers + 1:
worker_config.inter_op_parallelism_threads = num_workers + 1
    # Enable collective ops, which have no impact on non-collective ops.
if has_chief:
worker_config.experimental.collective_group_leader = (
"/job:chief/replica:0/task:0"
)
else:
worker_config.experimental.collective_group_leader = (
"/job:worker/replica:0/task:0"
)
ps_config = tf.compat.v1.ConfigProto()
ps_config.device_count["GPU"] = 0
eval_config = tf.compat.v1.ConfigProto()
eval_config.experimental.collective_group_leader = ""
    # Create in-process servers. Once an in-process TensorFlow server is
    # created, there is no way to terminate it, so we create one cluster per
    # test process. We could have started the servers in a separate process
    # and then killed that process to terminate them, but we don't want
    # multiple processes because:
    # 1) it is more difficult to manage these processes;
    # 2) there is something global in CUDA such that if we initialize CUDA in
    # the parent process, the child process cannot initialize it again and
    # thus cannot use GPUs (https://stackoverflow.com/questions/22950047).
cluster = None
try:
cluster = _create_cluster(
num_workers,
num_ps=num_ps,
has_chief=has_chief,
has_eval=has_eval,
worker_config=worker_config,
ps_config=ps_config,
eval_config=eval_config,
protocol=rpc_layer,
)
except tf.errors.UnknownError as e:
if "Could not start gRPC server" in e.message:
raise unittest.SkipTest("Cannot start std servers.")
else:
raise
return cluster
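

# Illustrative usage sketch (not part of the original module):
# create_in_process_cluster() returns a plain job-name -> address-list dict
# that can be wrapped in a ClusterSpec or cluster resolver, assuming the
# in-process servers start; the job sizes below are arbitrary example values.
def _example_create_cluster_spec():
    cluster_spec = create_in_process_cluster(
        num_workers=2, num_ps=1, has_chief=True
    )
    # The dict maps job names to "localhost:<port>" address lists, e.g.
    # {"worker": [..., ...], "ps": [...], "chief": [...]}.
    assert set(cluster_spec) == {"worker", "ps", "chief"}
    assert len(cluster_spec["worker"]) == 2
    return ClusterSpec(cluster_spec)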