# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for testing multi-worker distribution strategies with Keras."""

import threading
import unittest

import tensorflow.compat.v2 as tf

import keras
from keras.optimizers.legacy import gradient_descent

# isort: off
from tensorflow.python.distribute.cluster_resolver import (
    SimpleClusterResolver,
)
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training.server_lib import (
    ClusterSpec,
)

_portpicker_import_error = None
try:
    import portpicker
except (
    ImportError,
    ModuleNotFoundError,
) as _error:
    _portpicker_import_error = _error
    portpicker = None

ASSIGNED_PORTS = set()
lock = threading.Lock()


def mnist_synthetic_dataset(
    batch_size, steps_per_epoch, target_values="constant"
):
    """Generate a synthetic MNIST dataset for testing."""
    # Train dataset.
    x_train = tf.ones(
        [batch_size * steps_per_epoch, 28, 28, 1], dtype=tf.float32
    )
    if target_values == "constant":
        y_train = tf.ones([batch_size * steps_per_epoch, 1], dtype=tf.int32)
    elif target_values == "increasing":
        y_train = tf.reshape(
            tf.range(batch_size * steps_per_epoch, dtype=tf.int32), (-1, 1)
        )
    else:
        raise ValueError(
            f'Unknown value for `target_values` "{target_values}". '
            'Valid options are "constant" and "increasing".'
        )

    train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    train_ds = train_ds.repeat()
    # train_ds = train_ds.shuffle(100)
    train_ds = train_ds.batch(batch_size, drop_remainder=True)

    # Eval dataset.
    x_test = tf.random.uniform([10000, 28, 28, 1], dtype=tf.float32)
    y_test = tf.random.uniform([10000, 1], minval=0, maxval=9, dtype=tf.int32)

    eval_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test))
    eval_ds = eval_ds.batch(batch_size, drop_remainder=True)

    return train_ds, eval_ds
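
# A minimal sanity-check sketch (illustrative only; `_example_dataset_shapes`
# is a hypothetical helper added for documentation, not part of the original
# module; the batch size and step count below are arbitrary assumptions):
def _example_dataset_shapes():
    """Show the element shapes `mnist_synthetic_dataset` produces."""
    train_ds, _ = mnist_synthetic_dataset(batch_size=64, steps_per_epoch=2)
    x_batch, y_batch = next(iter(train_ds))
    # Images are fixed 28x28x1 float batches; labels are an int32 column.
    assert x_batch.shape == (64, 28, 28, 1)
    assert y_batch.shape == (64, 1)
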
def get_mnist_model(input_shape):
    """Define a deterministically-initialized CNN model for MNIST testing."""
    inputs = keras.Input(shape=input_shape)
    x = keras.layers.Conv2D(
        32,
        kernel_size=(3, 3),
        activation="relu",
        kernel_initializer=keras.initializers.TruncatedNormal(seed=99),
    )(inputs)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Flatten()(x)
    x = keras.layers.Dense(
        10,
        activation="softmax",
        kernel_initializer=keras.initializers.TruncatedNormal(seed=99),
    )(x)
    model = keras.Model(inputs=inputs, outputs=x)

    # TODO(yuefengz): optimizer with slot variables doesn't work because of
    # optimizer's bug.
    # TODO(yuefengz): we should not allow non-v2 optimizers.
    model.compile(
        loss=keras.losses.sparse_categorical_crossentropy,
        optimizer=gradient_descent.SGD(learning_rate=0.001),
        metrics=["accuracy"],
    )
    return model


def make_parameter_server_cluster(num_workers, num_ps):
    cluster_def = create_in_process_cluster(
        num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc"
    )
    return SimpleClusterResolver(ClusterSpec(cluster_def), rpc_layer="grpc")


def pick_unused_port():
    """Returns an unused and unassigned local port."""
    if _portpicker_import_error:
        raise _portpicker_import_error

    global ASSIGNED_PORTS
    with lock:
        while True:
            try:
                port = portpicker.pick_unused_port()
            except portpicker.NoFreePortFoundError:
                raise unittest.SkipTest(
                    "Flakes in portpicker library do not represent "
                    "TensorFlow errors."
                )
            if port > 10000 and port not in ASSIGNED_PORTS:
                ASSIGNED_PORTS.add(port)
                logging.info("Using local port %r", port)
                return port


def _create_cluster(
    num_workers,
    num_ps,
    has_chief=False,
    has_eval=False,
    protocol="grpc",
    worker_config=None,
    ps_config=None,
    eval_config=None,
    worker_name="worker",
    ps_name="ps",
    chief_name="chief",
):
    """Creates and starts local servers and returns the cluster_spec dict."""
    if _portpicker_import_error:
        raise _portpicker_import_error
    worker_ports = [pick_unused_port() for _ in range(num_workers)]
    ps_ports = [pick_unused_port() for _ in range(num_ps)]

    cluster_dict = {}
    if num_workers > 0:
        cluster_dict[worker_name] = [
            f"localhost:{port}" for port in worker_ports
        ]
    if num_ps > 0:
        cluster_dict[ps_name] = [f"localhost:{port}" for port in ps_ports]
    if has_eval:
        cluster_dict["evaluator"] = [f"localhost:{pick_unused_port()}"]
    if has_chief:
        cluster_dict[chief_name] = [f"localhost:{pick_unused_port()}"]

    cs = tf.train.ClusterSpec(cluster_dict)

    for i in range(num_workers):
        tf.distribute.Server(
            cs,
            job_name=worker_name,
            protocol=protocol,
            task_index=i,
            config=worker_config,
            start=True,
        )

    for i in range(num_ps):
        tf.distribute.Server(
            cs,
            job_name=ps_name,
            protocol=protocol,
            task_index=i,
            config=ps_config,
            start=True,
        )

    if has_chief:
        tf.distribute.Server(
            cs,
            job_name=chief_name,
            protocol=protocol,
            task_index=0,
            config=worker_config,
            start=True,
        )

    if has_eval:
        tf.distribute.Server(
            cs,
            job_name="evaluator",
            protocol=protocol,
            task_index=0,
            config=eval_config,
            start=True,
        )

    return cluster_dict
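
# The dict returned by `_create_cluster` maps job names to lists of
# "host:port" addresses, which is exactly the structure that
# `tf.train.ClusterSpec` accepts. A sketch of the result for
# `_create_cluster(num_workers=2, num_ps=1, has_chief=True)` follows
# (the port numbers are illustrative, not real assignments):
#
#   {
#       "worker": ["localhost:20001", "localhost:20002"],
#       "ps": ["localhost:20003"],
#       "chief": ["localhost:20004"],
#   }
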
def create_in_process_cluster(
    num_workers, num_ps, has_chief=False, has_eval=False, rpc_layer="grpc"
):
    """Create an in-process cluster that consists of only standard servers."""
    # Leave some memory for the CUDA runtime.
    gpu_mem_frac = 0.7 / (num_workers + int(has_chief) + int(has_eval))
    worker_config = tf.compat.v1.ConfigProto()
    worker_config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_frac

    # The cluster may hang if workers don't have enough inter_op threads. See
    # b/172296720 for more details.
    if worker_config.inter_op_parallelism_threads < num_workers + 1:
        worker_config.inter_op_parallelism_threads = num_workers + 1

    # Enable collective ops, which has no impact on non-collective ops.
    if has_chief:
        worker_config.experimental.collective_group_leader = (
            "/job:chief/replica:0/task:0"
        )
    else:
        worker_config.experimental.collective_group_leader = (
            "/job:worker/replica:0/task:0"
        )

    ps_config = tf.compat.v1.ConfigProto()
    ps_config.device_count["GPU"] = 0

    eval_config = tf.compat.v1.ConfigProto()
    eval_config.experimental.collective_group_leader = ""

    # Create in-process servers. Once an in-process tensorflow server is
    # created, there is no way to terminate it, so we create one cluster per
    # test process. We could have started the server in another process and
    # then killed that process to terminate the server, but we don't want
    # multiple processes because
    # 1) it is more difficult to manage these processes;
    # 2) there is something global in CUDA such that if we initialize CUDA in
    # the parent process, the child process cannot initialize it again and
    # thus cannot use GPUs (https://stackoverflow.com/questions/22950047).
    cluster = None
    try:
        cluster = _create_cluster(
            num_workers,
            num_ps=num_ps,
            has_chief=has_chief,
            has_eval=has_eval,
            worker_config=worker_config,
            ps_config=ps_config,
            eval_config=eval_config,
            protocol=rpc_layer,
        )
    except tf.errors.UnknownError as e:
        if "Could not start gRPC server" in e.message:
            raise unittest.SkipTest("Cannot start std servers.")
        else:
            raise
    return cluster
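

# A minimal end-to-end sketch of how these utilities compose in a test
# (illustrative only, not part of the original module; the strategy choice
# and hyperparameters are assumptions for demonstration). It builds an
# in-process parameter-server cluster and trains the test model on the
# synthetic dataset:
#
#   cluster_resolver = make_parameter_server_cluster(num_workers=2, num_ps=1)
#   strategy = tf.distribute.experimental.ParameterServerStrategy(
#       cluster_resolver
#   )
#   with strategy.scope():
#       model = get_mnist_model(input_shape=(28, 28, 1))
#   train_ds, _ = mnist_synthetic_dataset(batch_size=64, steps_per_epoch=2)
#   model.fit(train_ds, epochs=1, steps_per_epoch=2)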