# 3RNN/Lib/site-packages/tensorflow/python/checkpoint/async_checkpoint_helper.py

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for saving/loading Trackable objects asynchronously."""
import atexit
import copy
import queue
import threading
import time
import weakref
from absl import logging
from tensorflow.python.checkpoint import checkpoint_context
from tensorflow.python.checkpoint import trackable_view
from tensorflow.python.distribute import device_util
from tensorflow.python.eager import context
from tensorflow.python.eager import def_function
from tensorflow.python.eager import executor
from tensorflow.python.framework import ops
from tensorflow.python.ops import variables
from tensorflow.python.saved_model.pywrap_saved_model import metrics
from tensorflow.python.trackable import base
from tensorflow.python.util import object_identity
# Captures the timestamp of the first Checkpoint instantiation or end of a write
# operation. Can be accessed by multiple Checkpoint instances.
_END_TIME_OF_LAST_ASYNC_WRITE = None
_END_TIME_OF_LAST_ASYNC_WRITE_LOCK = threading.Lock()
# API label for cell names used in async checkpoint metrics.
_ASYNC_CHECKPOINT = "async_checkpoint"
# Name of TPUEmbedding attribute. This is a temporary workaround
# to identify TPUEmbedding while avoiding import cycles.
_TPU_EMBEDDING_ATTR = "_create_copy_for_async_checkpoint"
def _get_duration_microseconds(start_time_seconds, end_time_seconds):
"""Calculate the duration between start and end time.
Args:
start_time_seconds: The start time in seconds.
end_time_seconds: The end time in seconds.
  Returns:
    The duration between the start and the end time, in microseconds. Returns
    0 if end_time_seconds < start_time_seconds.
"""
if end_time_seconds < start_time_seconds:
# Avoid returning negative value in case of clock skew.
return 0
return round((end_time_seconds - start_time_seconds) * 1000000)
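# A quick illustration of the clamping behavior above (a hedged example; the
# timestamps are hypothetical):
#   _get_duration_microseconds(1.0, 1.5)  # -> 500000
#   _get_duration_microseconds(2.0, 1.0)  # -> 0 (clamped on clock skew)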
def _get_all_trackables(root, exclude_set):
"""Return the list of checkpointable trackables dependent on `root`.
Args:
root: The root trackable from where we get all its dependent trackables.
exclude_set: An ObjectIdentitySet of Trackables to exclude before returning.
Each element in `exclude_set` is a specific instance of a `Trackable`
and appears precisely once in `TrackableView(root).descendants()`.
Returns:
saveable_trackables: All trackables that are saveable in `all_trackables`
(see definition of "saveable" in `_trackable_needs_to_be_saved()`). A
subset of `all_trackables`.
all_trackables: All trackables returned by `TrackableView`'s `descendants()`
after excluding `exclude_set`. A superset of `saveable_trackables`.
"""
all_trackables = trackable_view.TrackableView(root=root).descendants()
  # Kick out the trackables we want to exclude.
  # The loop is written this way so that we scan the list only once and stop
  # scanning as early as possible (unlike filtering with a list comprehension).
  trackable_index = 0
  while trackable_index < len(all_trackables) and exclude_set:
    # Keep going while there are still items to exclude and trackables left.
    if all_trackables[trackable_index] in exclude_set:
      # Exclude this trackable: pop it and do not advance the index.
      exclude_set.discard(all_trackables[trackable_index])
      all_trackables.pop(trackable_index)
    else:
      # Otherwise advance the index.
      trackable_index += 1
  # Kick out trackables that do not need to be saved (e.g. ListWrapper).
  # We treat any trackable that implements neither `_serialize_to_tensors` nor
  # `_gather_saveables_for_checkpoint` as "does not need to be saved". A
  # trackable that defines one or both of those methods should also define
  # `_copy_trackable_to_cpu`; if it does not, we raise a warning in
  # `_copy_to_cpu()`. As a special case, a trackable that defines only
  # `_copy_trackable_to_cpu` (and neither of the other two methods) is still
  # treated as "needs to be saved".
def _trackable_needs_to_be_saved(obj):
"""Returns whether a trackable needs to be saved.
Returns a bool to indicate whether obj's class has `_serialize_to_tensors`,
`gather_saveables_for_checkpoint`, or `_copy_trackable_to_cpu` defined.
Args:
obj: A Trackable object.
"""
if hasattr(obj, "__dict__"):
# Data structure proxy wrappers don't have __dict__.
if ("_serialize_to_tensors" in obj.__dict__
or "_gather_saveables_for_checkpoint" in obj.__dict__
or "_copy_trackable_to_cpu" in obj.__dict__):
return True
    # Use the MRO so that if a parent class has one of the three methods, we
    # still consider `obj` as needing to be saved.
    for t in type(obj).mro():
      if t is base.Trackable:
        # The base Trackable class defines these methods, but calling them
        # would raise NotImplementedError.
        continue
elif ("_serialize_to_tensors" in t.__dict__
or "_gather_saveables_for_checkpoint" in t.__dict__
or "_copy_trackable_to_cpu" in t.__dict__):
return True
return False
saveable_trackables = [x for x in all_trackables if
_trackable_needs_to_be_saved(x)]
return saveable_trackables, all_trackables
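# Illustrative call pattern for `_get_all_trackables`, mirroring how
# `_ensure_initialized` below uses it (a hedged sketch; `ckpt` is a
# hypothetical Checkpoint instance):
#   exclude = object_identity.ObjectIdentitySet()
#   exclude.add(ckpt)               # drop the Checkpoint instance itself
#   exclude.add(ckpt.save_counter)  # ... and its save_counter
#   saveable, everything = _get_all_trackables(root=ckpt, exclude_set=exclude)
#   # `everything` is descendants() minus `exclude`; `saveable` is the subset
#   # defining one of the three serialize/copy hooks checked above.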
class AsyncCheckpointHelper:
"""Helper class for async checkpoint."""
def __init__(self, checkpointer_impl, root=None, **kwargs):
"""Initialize AsyncCheckpoint.
Args:
checkpointer_impl: The Checkpoint class to power the AsyncCheckpoint.
root: The root object to checkpoint. `root` may be a trackable object or
`WeakRef` of a trackable object.
**kwargs: The keyword arguments representing the checkpointed variables.
Raises:
AttributeError: when checkpointer_impl is None.
"""
# TODO(chienchunh): Make sure the processing for the root object is
# consistent when integrating with the public API, e.g., adding all kwarg
# items as the child of the root object.
if root:
trackable_root = root() if isinstance(root, weakref.ref) else root
kwargs["root"] = trackable_root
trackable_root._maybe_initialize_trackable()
# The underlying Checkpoint instance and its items.
if checkpointer_impl is None:
raise AttributeError(
"checkpointer_impl cannot be None for AsyncCheckpointHelper."
)
self._checkpointer_impl = checkpointer_impl
self._checkpoint_items = kwargs
self._checkpoint = None
self.checkpointer()
self._checkpoint_options = None
# Indicate whether async checkpoint has finished traversing the variable
# list and created the object map between the original and copied variables.
self._initialized = False
# The list of all nodes from the original checkpoint items.
# TODO(chienchunh): Consider changing this to local variable.
self._original_nodes = None
# The mapping between the original and the copied resource variables.
# The copied variables are used for the underlying checkpointing.
self._object_map = None
# A list of TPUEmbedding objects included in the checkpoint items.
self._tpu_embedding_objects = None
# A list of highest level `Trackable`s we will copy; does not contain
# TPUEmbedding objects
self._saveable_trackables = None
self._default_device = device_util.current() or "CPU:0"
self._default_device = device_util.canonicalize(self._default_device)
self._save_file_prefix = None
self._use_checkpoint_save = False
self._async_save_thread = None
# Concurrent queue that coordinates the events for writing/reading the
# cpu-copied variables. A 'True' in the queue triggers the async thread to
# perform saving; a 'False' breaks the while loop so that the async thread
# exits; no other values will be added to the queue.
# Maxsize is set to 1 only to ensure the exit procedure. We could have used
# queue.join() in _join_async_save_thread(), but queue.join() does not have
# a timeout argument. Hence we use queue.put(timeout=300), in case the last
# checkpoint takes forever. To achieve that, maxsize needs to be 1.
self._queue = queue.Queue(maxsize=1)
# Register to join the async save thread upon exit.
atexit.register(self._join_async_save_thread)
self._async_error = None
global _END_TIME_OF_LAST_ASYNC_WRITE
with _END_TIME_OF_LAST_ASYNC_WRITE_LOCK:
if _END_TIME_OF_LAST_ASYNC_WRITE is None:
_END_TIME_OF_LAST_ASYNC_WRITE = time.time()
@def_function.function
def _copy_to_cpu(self):
"""Copy the checkpointed variables from the accelerator to the host CPU.
    TODO(chienchunh): Get the concrete function before it is first called to
    avoid hanging the accelerators idle during function tracing.
"""
for t in self._saveable_trackables:
try:
t._copy_trackable_to_cpu(object_map=self._object_map) # pylint: disable=protected-access
except NotImplementedError as e:
logging.warning("Trackable %s skipped due to: %s", t, e)
for tpu_embedding in self._tpu_embedding_objects:
tpu_embedding._retrieve_variables() # pylint: disable=protected-access
def checkpointer(self):
"""Gets or creates the underlying Checkpoint instance."""
if self._checkpoint is None:
self._checkpoint = self._checkpointer_impl(**self._checkpoint_items)
return self._checkpoint
def _ensure_initialized(self):
"""Initialize the async checkpoint internal state."""
# This map will be used to store the CPU copy of all checkpointable objects
self._object_map = object_identity.ObjectIdentityDictionary()
self._tpu_embedding_objects = []
    # Populate the trackable lists, but exclude the checkpoint instance itself
    # and its save_counter, as they will be returned by `descendants()`.
exclude_set = object_identity.ObjectIdentitySet()
exclude_set.add(self.checkpointer())
exclude_set.add(self.checkpointer().save_counter)
self._saveable_trackables, all_trackables = _get_all_trackables(
root=self.checkpointer(), exclude_set=exclude_set)
# Handle special cases: TPU Embedding, and slot variables.
# 1. TPUEmbedding: Different from other trackables, TPUEmbedding needs to
# call `_retrieve_variables` to checkpoint, while populating a dummy copy to
# the object map.
# 2. Slot variables: they need to be handled differently as they cannot be
# retrieved from `TrackableView.descendants()`.
# Note: dir() is used rather than hasattr() here to avoid triggering
# custom __getattr__ code, see b/152031870 for context.
for t in all_trackables:
      # Special case 1: Handle TPUEmbedding by adding a dummy copy to the
      # object map. Also add the TPUEmbedding to a separate list for special
      # handling of the values copy.
if hasattr(type(t), _TPU_EMBEDDING_ATTR):
self._handle_tpu_embedding(t)
# Special case 2: handle slot variables. The object_map is populated later
# when the variable values are being copied to host CPU for the first
# time.
if "get_slot_names" in dir(t):
slot_names = t.get_slot_names()
for slot_name in slot_names:
for original_variable in all_trackables:
if not isinstance(original_variable, variables.Variable):
continue
try:
# Usage of hasattr may result in KeyError
original_slot_variable = t.get_slot(original_variable, slot_name)
except (AttributeError, KeyError):
continue
if isinstance(original_slot_variable, base.Trackable):
self._saveable_trackables.append(original_slot_variable)
    # Initialize the underlying Checkpoint instance's save_counter.
save_counter = self.checkpointer().save_counter.numpy()
logging.info("Initializing async checkpoint's save_counter: %d",
save_counter)
# Pass the object map of the copied variables to the underlying Checkpoint.
self.checkpointer()._saver._object_map = self._object_map # pylint: disable=protected-access
    # We perform a `_copy_to_cpu()` here to populate `self._object_map`,
    # initializing the copies. We do not call `self._copy_to_cpu()` directly
    # because it is a tf.function, which would lead to an out-of-scope access
    # error.
    # TODO(charlieruan) Figure out a better workaround for the out-of-scope
    # access error.
for t in self._saveable_trackables:
try:
t._copy_trackable_to_cpu(object_map=self._object_map) # pylint: disable=protected-access
except NotImplementedError as e:
logging.warning("Trackable %s skipped due to: %s", t, e)
for tpu_embedding in self._tpu_embedding_objects:
tpu_embedding._retrieve_variables() # pylint: disable=protected-access
    # Start the async thread for checkpoint saving.
self._async_save_thread = threading.Thread(
target=self._async_save, daemon=True)
self._async_save_thread.start()
self._initialized = True
def _check_async_thread_error(self):
"""Expose the most recent error from the async saving thread to the caller.
"""
if self._async_error:
e = self._async_error
self._async_error = None
logging.error("Propagating the most recent error from the async thread "
"before joining: %s", str(e))
raise e
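  # Note: because saving happens on the background thread, an exception raised
  # while writing checkpoint N typically surfaces on the *next* write()/save()
  # call, or at exit via _join_async_save_thread(), rather than at the call
  # that scheduled checkpoint N.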
def _join_async_save_thread(self):
"""Join the async save thread.
The steps for terminating the async save thread:
1). Put will succeed when the last async save event is done. Putting a false
triggers the async save thread's while loop to end. We use put instead
of sync because sync does not have a timeout argument.
2). Join the async save thread. (The thread may finish before joining.)
"""
try:
self._queue.put(False, timeout=300) # Step-1.
logging.info("Joining the async save thread.")
if self._async_save_thread is not None:
self._async_save_thread.join() # Step-2.
except queue.Full:
logging.error("Timeout waiting for the async save thread; terminating the"
" thread instead. The last checkpoint may be incomeplete.")
finally:
self._check_async_thread_error()
def _async_save(self):
"""The thread function for the async checkpoint save."""
with context.executor_scope(
executor.new_executor(
enable_async=False, enable_streaming_enqueue=False)):
      # The main thread puts True on the queue when the user calls save,
      # triggering an async save, and False when we exit the Checkpoint
      # instance.
while self._queue.get():
logging.info("Starting async checkpoint save on the device: %s",
self._default_device)
async_save_start_time = time.time()
# Specify the ops placement on the worker if running with
# coordinator-worker mode. This is required as launching a new thread
# would clear the placement policy and make localhost the default
# placement, while the main thread's default placement would be the
# master worker's CPU:0.
try:
with ops.device(self._default_device):
with checkpoint_context.async_metrics_context():
if self._use_checkpoint_save:
self.checkpointer().save(
self._save_file_prefix, self._checkpoint_options
)
else:
self.checkpointer()._write( # pylint: disable=protected-access
self._save_file_prefix,
options=self._checkpoint_options,
)
        except Exception as e:  # pylint: disable=broad-except
self._async_error = e
finally:
self._queue.task_done()
async_save_end_time = time.time()
metrics.AddAsyncCheckpointWriteDuration(
api_label=_ASYNC_CHECKPOINT,
microseconds=_get_duration_microseconds(async_save_start_time,
async_save_end_time))
# Measure the elapsed time since the last checkpoint.
# Due to the nature of async checkpoint, here it actually captures the
# duration between the start_time of the previous checkpoint and the
# start time of this checkpoint. As a result, the duration of the final
# async checkpoint is excluded, which is fine since it does not take
# much time.
global _END_TIME_OF_LAST_ASYNC_WRITE
with _END_TIME_OF_LAST_ASYNC_WRITE_LOCK:
metrics.AddTrainingTimeSaved(
api_label=_ASYNC_CHECKPOINT,
microseconds=_get_duration_microseconds(
_END_TIME_OF_LAST_ASYNC_WRITE, async_save_start_time))
_END_TIME_OF_LAST_ASYNC_WRITE = async_save_start_time
logging.info("Async save thread reached the end of the execution.")
def _handle_tpu_embedding(self, tpu_embedding):
"""Handle TPUEmbedding.
This is the only place where we populate object map in the class of
`AsyncCheckpointHelper`. For all other checkpointable trackables, we
populate object map using the trackable's own `_copy_trackable_to_cpu()`.
Args:
tpu_embedding: TPUEmbedding object to be handled.
Raises:
AttributeError: if the input trackable is not TPUEmbedding type.
"""
if not hasattr(type(tpu_embedding), _TPU_EMBEDDING_ATTR) or not callable(
tpu_embedding._create_copy_for_async_checkpoint # pylint: disable=protected-access
):
raise AttributeError(
"Expecting TPUEmbedding type; got %s" % type(tpu_embedding)
)
    # Create a dummy TPUEmbedding object and add it to the object_map. This
    # prevents the TPUEmbedding's save_callback from being triggered, because
    # the embedding values have already been retrieved by AsyncCheckpoint.
# pylint: disable=protected-access
    new_embedding = tpu_embedding._create_copy_for_async_checkpoint(
        feature_config=tpu_embedding._feature_config,
        optimizer=tpu_embedding._table_config[0].optimizer
        if tpu_embedding._table_config
        else None,
        pipeline_execution_with_tensor_core=tpu_embedding._pipeline_execution_with_tensor_core,
    )
self._object_map[tpu_embedding] = new_embedding
# pylint: enable=protected-access
if tpu_embedding not in self._tpu_embedding_objects:
self._tpu_embedding_objects.append(tpu_embedding)
@property
def save_counter(self):
"""An integer variable numbering the checkpoint events.
This is maintained by the underlying tf.train.Checkpoing object employed by
AsyncCheckpoint class. The number starts at 0 and gets incremented for each
checkpoint event.
Returns:
The save counter variable.
"""
return self.checkpointer().save_counter
def write(self, save_path, options=None):
"""Save the checkpointed variables.
Args:
save_path: The file prefix of the checkpoint file.
      options: Optional CheckpointOptions instance.
Returns:
The full path of the checkpoint file.
"""
return self._write(save_path, options)
def _write(self, save_path, options=None):
"""Save the checkpointed variables.
This method has exactly the same logic as save(), except it does not
increment the underlying save_counter, which is done by the caller, e.g.,
CheckpointManager.
Args:
save_path: The file prefix of the checkpoint file.
      options: Optional CheckpointOptions instance.
Returns:
The full path of the checkpoint file.
"""
write_start_time = time.time()
if not self._initialized:
self._ensure_initialized()
    else:
      # First wait for the async thread to finish the previous save, then copy
      # the variable values to the host CPU.
      self._queue.join()
      self._copy_to_cpu()
    # Surface the error from the async thread, if any. This step must come
    # after the `queue.join()` above, which ensures that the previous async
    # save has finished storing any error before we read it.
    self._check_async_thread_error()
    # Wait until the weight copying finishes, then trigger the async thread to
    # checkpoint the cpu-copied variables.
    context.async_wait()
self._save_file_prefix = save_path
self._use_checkpoint_save = False
# Ensure that we do not request async checkpointing to the underlying
# checkpointer as this could lead to an infinite loop.
self._checkpoint_options = copy.copy(options) if options else None
if self._checkpoint_options:
self._checkpoint_options.experimental_enable_async_checkpoint = False
self._queue.put(True) # Trigger save in async thread
write_end_time = time.time()
metrics.AddCheckpointWriteDuration(
api_label=_ASYNC_CHECKPOINT,
microseconds=_get_duration_microseconds(write_start_time,
write_end_time))
return save_path
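  # Note: unlike `save()`, `_write()`/`write()` leave `save_counter` to the
  # caller (e.g. CheckpointManager) and return the prefix as passed in, not a
  # counter-suffixed path.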
def save(self, save_path, options=None):
"""Save the checkpointed variables.
Args:
save_path: The file prefix of the checkpoint file.
      options: Optional CheckpointOptions instance.
Returns:
The full path of the checkpoint file.
"""
save_start_time = time.time()
# If this is the first time that AsyncCheckpoint.save() is called,
# initialize the internal states like `self._saveable_trackables`. We also
# populate `self._object_map` (i.e. initializing the cpu-copied variables
# and copy over the value for the first time) by essentially performing a
# `self._copy_to_cpu()`, hence the if-else logic here.
#
# This is not performed in the initializer because some variables, e.g.,
# slot variables of the optimizer, were not created until actually running
# the train function, so we could only get the complete list of the
# variables after some train steps were run.
if not self._initialized:
self._ensure_initialized()
    else:
      # First wait for the async thread to finish the previous save, then copy
      # the variable values to the host CPU.
      self._queue.join()
      self._copy_to_cpu()
    # Surface the error from the async thread, if any. This step must come
    # after the `queue.join()` above, which ensures that the previous async
    # save has finished storing any error before we read it.
    self._check_async_thread_error()
# Retrieve the save counter from the underlying checkpoint object to
# re-construct the full path of the checkpoint file.
# This step has to happen before triggering the underlying checkpoint;
# otherwise, the save_counter value may or may not have been updated.
save_counter = self.checkpointer().save_counter.numpy() + 1
full_path = "{}-{}".format(save_path, save_counter)
    # Wait until the weight copying finishes, then trigger the async thread to
    # checkpoint the cpu-copied variables.
    context.async_wait()
self._save_file_prefix = save_path
self._use_checkpoint_save = True
# Ensure that we do not request async checkpointing to the underlying
# checkpointer as this could lead to an infinite loop.
self._checkpoint_options = copy.copy(options) if options else None
if self._checkpoint_options:
self._checkpoint_options.experimental_enable_async_checkpoint = False
self._queue.put(True) # Trigger save in async thread
save_end_time = time.time()
metrics.AddCheckpointWriteDuration(
api_label=_ASYNC_CHECKPOINT,
microseconds=_get_duration_microseconds(save_start_time, save_end_time))
return full_path
def read(self, save_path, options=None):
"""Restore the checkpointed variables.
    This method has exactly the same logic as restore(); it is implemented
    only to fulfill the duty of subclassing tf.train.Checkpoint.
Args:
save_path: The full name of the checkpoint file to be restored.
      options: Optional CheckpointOptions instance.
Returns:
A load status object, which can be used to make assertions about the
status of a checkpoint restoration. See tf.train.Checkpoint.restore()
for more details.
"""
return self.restore(save_path, options)
def restore(self, save_path, options=None):
"""Restore the checkpointed variables.
Args:
save_path: The full name of the checkpoint file to be restored.
      options: Optional CheckpointOptions instance.
Returns:
A load status object, which can be used to make assertions about the
status of a checkpoint restoration. See tf.train.Checkpoint.restore()
for more details.
"""
# Ensure that we do not request async checkpointing to the underlying
# checkpointer as this could lead to an infinite loop.
self._checkpoint_options = (
copy.copy(options) if options else self._checkpoint_options)
if self._checkpoint_options:
self._checkpoint_options.experimental_enable_async_checkpoint = False
# Wait for any ongoing checkpoint event to finish.
self._queue.join()
# Restore values of the cpu-copied variables directly back to accelerators
status = self.checkpointer().restore(save_path, self._checkpoint_options)
return status
def sync(self):
"""Sync on any ongoing save or restore events."""
self._queue.join()
logging.info("Sync on ongoing save/restore.")