# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A wrapper around DebugDataReader used for retrieving tfdbg v2 data."""


import threading

from tensorboard import errors

# Dummy run name for the debugger.
# Currently, the `DebuggerV2ExperimentMultiplexer` class is tied to a single
# logdir, which holds at most one DebugEvent file set in the tfdbg v2 (tfdbg2
# for short) format.
# TODO(cais): When tfdbg2 allows there to be multiple DebugEvent file sets in
# the same logdir, replace this magic string with actual run names.
DEFAULT_DEBUGGER_RUN_NAME = "__default_debugger_run__"

# Default number of alerts per monitor type.
# Limiting the number of alerts is based on the consideration that usually
# only the first few alerting events are the most critical and the subsequent
# ones are either repetitions of the earlier ones or caused by the earlier ones.
DEFAULT_PER_TYPE_ALERT_LIMIT = 1000

# Default interval between successive calls to `DebugDataReader.update()``.
DEFAULT_RELOAD_INTERVAL_SEC = 30


def run_repeatedly_in_background(target, interval_sec):
    """Run a target task repeatedly in the background.

    In the context of this module, `target` is the `update()` method of the
    underlying reader for tfdbg2-format data.
    This method is mocked by unit tests for deterministic behaviors during
    testing.

    Args:
      target: The target task to run in the background, a callable with no args.
      interval_sec: Time interval between repeats, in seconds.

    Returns:
      - A `threading.Event` object that can be used to interrupt an ongoing
          waiting interval between successive runs of `target`. To interrupt the
          interval, call the `set()` method of the object.
      - The `threading.Thread` object on which `target` is run repeatedly.
    """
    event = threading.Event()

    def _run_repeatedly():
        while True:
            target()
            event.wait(interval_sec)
            event.clear()

    # Use `daemon=True` to make sure the thread doesn't block program exit.
    thread = threading.Thread(target=_run_repeatedly, daemon=True)
    thread.start()
    return event, thread


def _alert_to_json(alert):
    # TODO(cais): Replace this with Alert.to_json() when supported by the
    # backend.
    from tensorflow.python.debug.lib import debug_events_monitors

    if isinstance(alert, debug_events_monitors.InfNanAlert):
        return {
            "alert_type": "InfNanAlert",
            "op_type": alert.op_type,
            "output_slot": alert.output_slot,
            # TODO(cais): Once supported by backend, add 'op_name' key
            # for intra-graph execution events.
            "size": alert.size,
            "num_neg_inf": alert.num_neg_inf,
            "num_pos_inf": alert.num_pos_inf,
            "num_nan": alert.num_nan,
            "execution_index": alert.execution_index,
            "graph_execution_trace_index": alert.graph_execution_trace_index,
        }
    else:
        raise TypeError("Unrecognized alert subtype: %s" % type(alert))


def parse_tensor_name(tensor_name):
    """Helper function that extracts op name and slot from tensor name."""
    output_slot = 0
    if ":" in tensor_name:
        op_name, output_slot = tensor_name.split(":")
        output_slot = int(output_slot)
    else:
        op_name = tensor_name
    return op_name, output_slot


class DebuggerV2EventMultiplexer:
    """A class used for accessing tfdbg v2 DebugEvent data on local filesystem.

    This class is a short-term hack, mirroring the EventMultiplexer for the main
    TensorBoard plugins (e.g., scalar, histogram and graphs.) As such, it only
    implements the methods relevant to the Debugger V2 pluggin.

    TODO(cais): Integrate it with EventMultiplexer and use the integrated class
    from MultiplexerDataProvider for a single path of accessing debugger and
    non-debugger data.
    """

    def __init__(self, logdir):
        """Constructor for the `DebugEventMultiplexer`.

        Args:
          logdir: Path to the directory to load the tfdbg v2 data from.
        """
        self._logdir = logdir
        self._reader = None
        self._reader_lock = threading.Lock()
        self._reload_needed_event = None
        # Create the reader for the tfdbg2 data in the lodir as soon as
        # the backend of the debugger-v2 plugin is created, so it doesn't need
        # to wait for the first request from the FE to start loading data.
        self._tryCreateReader()

    def _tryCreateReader(self):
        """Try creating reader for tfdbg2 data in the logdir.

        If the reader has already been created, a new one will not be created and
        this function is a no-op.

        If a reader has not been created, create it and start periodic calls to
        `update()` on a separate thread.
        """
        if self._reader:
            return
        with self._reader_lock:
            if not self._reader:
                try:
                    # TODO(cais): Avoid conditional imports and instead use
                    # plugin loader to gate the loading of this entire plugin.
                    from tensorflow.python.debug.lib import debug_events_reader
                    from tensorflow.python.debug.lib import (
                        debug_events_monitors,
                    )
                except ImportError:
                    # This ensures graceful behavior when tensorflow install is
                    # unavailable or when the installed tensorflow version does not
                    # contain the required modules.
                    return

                try:
                    self._reader = debug_events_reader.DebugDataReader(
                        self._logdir
                    )
                except AttributeError:
                    # Gracefully fail for users without the required API changes to
                    # debug_events_reader.DebugDataReader introduced in
                    # TF 2.1.0.dev20200103. This should be safe to remove when
                    # TF 2.2 is released.
                    return
                except ValueError:
                    # When no DebugEvent file set is found in the logdir, a
                    # `ValueError` is thrown.
                    return

                self._monitors = [
                    debug_events_monitors.InfNanMonitor(
                        self._reader, limit=DEFAULT_PER_TYPE_ALERT_LIMIT
                    )
                ]
                self._reload_needed_event, _ = run_repeatedly_in_background(
                    self._reader.update, DEFAULT_RELOAD_INTERVAL_SEC
                )

    def _reloadReader(self):
        """If a reader exists and has started period updating, unblock the update.

        The updates are performed periodically with a sleep interval between
        successive calls to the reader's update() method. Calling this method
        interrupts the sleep immediately if one is ongoing.
        """
        if self._reload_needed_event:
            self._reload_needed_event.set()

    def FirstEventTimestamp(self, run):
        """Return the timestamp of the first DebugEvent of the given run.

        This may perform I/O if no events have been loaded yet for the run.

        Args:
          run: A string name of the run for which the timestamp is retrieved.
            This currently must be hardcoded as `DEFAULT_DEBUGGER_RUN_NAME`,
            as each logdir contains at most one DebugEvent file set (i.e., a
            run of a tfdbg2-instrumented TensorFlow program.)

        Returns:
            The wall_time of the first event of the run, which will be in seconds
            since the epoch as a `float`.
        """
        if self._reader is None:
            raise ValueError("No tfdbg2 runs exists.")
        if run != DEFAULT_DEBUGGER_RUN_NAME:
            raise ValueError(
                "Expected run name to be %s, but got %s"
                % (DEFAULT_DEBUGGER_RUN_NAME, run)
            )
        return self._reader.starting_wall_time()

    def PluginRunToTagToContent(self, plugin_name):
        raise NotImplementedError(
            "DebugDataMultiplexer.PluginRunToTagToContent() has not been "
            "implemented yet."
        )

    def Runs(self):
        """Return all the tfdbg2 run names in the logdir watched by this instance.

        The `Run()` method of this class is specialized for the tfdbg2-format
        DebugEvent files.

        As a side effect, this method unblocks the underlying reader's period
        reloading if a reader exists. This lets the reader update at a higher
        frequency than the default one with 30-second sleeping period between
        reloading when data is being queried actively from this instance.
        Note that this `Runs()` method is used by all other public data-access
        methods of this class (e.g., `ExecutionData()`, `GraphExecutionData()`).
        Hence calls to those methods will lead to accelerated data reloading of
        the reader.

        Returns:
          If tfdbg2-format data exists in the `logdir` of this object, returns:
              ```
              {runName: { "debugger-v2": [tag1, tag2, tag3] } }
              ```
              where `runName` is the hard-coded string `DEFAULT_DEBUGGER_RUN_NAME`
              string. This is related to the fact that tfdbg2 currently contains
              at most one DebugEvent file set per directory.
          If no tfdbg2-format data exists in the `logdir`, an empty `dict`.
        """
        # Call `_tryCreateReader()` here to cover the possibility of tfdbg2
        # data start being written to the logdir after the tensorboard backend
        # starts.
        self._tryCreateReader()
        if self._reader:
            # If a _reader exists, unblock its reloading (on a separate thread)
            # immediately.
            self._reloadReader()
            return {
                DEFAULT_DEBUGGER_RUN_NAME: {
                    # TODO(cais): Add the semantically meaningful tag names such as
                    # 'execution_digests_book', 'alerts_book'
                    "debugger-v2": []
                }
            }
        else:
            return {}

    def _checkBeginEndIndices(self, begin, end, total_count):
        if begin < 0:
            raise errors.InvalidArgumentError(
                "Invalid begin index (%d)" % begin
            )
        if end > total_count:
            raise errors.InvalidArgumentError(
                "end index (%d) out of bounds (%d)" % (end, total_count)
            )
        if end >= 0 and end < begin:
            raise errors.InvalidArgumentError(
                "end index (%d) is unexpectedly less than begin index (%d)"
                % (end, begin)
            )
        if end < 0:  # This means all digests.
            end = total_count
        return end

    def Alerts(self, run, begin, end, alert_type_filter=None):
        """Get alerts from the debugged TensorFlow program.

        Args:
          run: The tfdbg2 run to get Alerts from.
          begin: Beginning alert index.
          end: Ending alert index.
          alert_type_filter: Optional filter string for alert type, used to
            restrict retrieved alerts data to a single type. If used,
            `begin` and `end` refer to the beginning and ending indices within
            the filtered alert type.
        """
        from tensorflow.python.debug.lib import debug_events_monitors

        runs = self.Runs()
        if run not in runs:
            # TODO(cais): This should generate a 400 response instead.
            return None
        alerts = []
        alerts_breakdown = dict()
        alerts_by_type = dict()
        for monitor in self._monitors:
            monitor_alerts = monitor.alerts()
            if not monitor_alerts:
                continue
            alerts.extend(monitor_alerts)
            # TODO(cais): Replace this with Alert.to_json() when
            # monitor.alert_type() is available.
            if isinstance(monitor, debug_events_monitors.InfNanMonitor):
                alert_type = "InfNanAlert"
            else:
                alert_type = "__MiscellaneousAlert__"
            alerts_breakdown[alert_type] = len(monitor_alerts)
            alerts_by_type[alert_type] = monitor_alerts
        num_alerts = len(alerts)
        if alert_type_filter is not None:
            if alert_type_filter not in alerts_breakdown:
                raise errors.InvalidArgumentError(
                    "Filtering of alerts failed: alert type %s does not exist"
                    % alert_type_filter
                )
            alerts = alerts_by_type[alert_type_filter]
        end = self._checkBeginEndIndices(begin, end, len(alerts))
        return {
            "begin": begin,
            "end": end,
            "alert_type": alert_type_filter,
            "num_alerts": num_alerts,
            "alerts_breakdown": alerts_breakdown,
            "per_type_alert_limit": DEFAULT_PER_TYPE_ALERT_LIMIT,
            "alerts": [_alert_to_json(alert) for alert in alerts[begin:end]],
        }

    def ExecutionDigests(self, run, begin, end):
        """Get ExecutionDigests.

        Args:
          run: The tfdbg2 run to get `ExecutionDigest`s from.
          begin: Beginning execution index.
          end: Ending execution index.

        Returns:
          A JSON-serializable object containing the `ExecutionDigest`s and
          related meta-information
        """
        runs = self.Runs()
        if run not in runs:
            return None
        # TODO(cais): For scalability, use begin and end kwargs when available in
        # `DebugDataReader.execution()`.`
        execution_digests = self._reader.executions(digest=True)
        end = self._checkBeginEndIndices(begin, end, len(execution_digests))
        return {
            "begin": begin,
            "end": end,
            "num_digests": len(execution_digests),
            "execution_digests": [
                digest.to_json() for digest in execution_digests[begin:end]
            ],
        }

    def ExecutionData(self, run, begin, end):
        """Get Execution data objects (Detailed, non-digest form).

        Args:
          run: The tfdbg2 run to get `ExecutionDigest`s from.
          begin: Beginning execution index.
          end: Ending execution index.

        Returns:
          A JSON-serializable object containing the `ExecutionDigest`s and
          related meta-information
        """
        runs = self.Runs()
        if run not in runs:
            return None
        execution_digests = self._reader.executions(digest=True)
        end = self._checkBeginEndIndices(begin, end, len(execution_digests))
        execution_digests = execution_digests[begin:end]
        executions = self._reader.executions(digest=False, begin=begin, end=end)
        return {
            "begin": begin,
            "end": end,
            "executions": [execution.to_json() for execution in executions],
        }

    def GraphExecutionDigests(self, run, begin, end, trace_id=None):
        """Get `GraphExecutionTraceDigest`s.

        Args:
          run: The tfdbg2 run to get `GraphExecutionTraceDigest`s from.
          begin: Beginning graph-execution index.
          end: Ending graph-execution index.

        Returns:
          A JSON-serializable object containing the `ExecutionDigest`s and
          related meta-information
        """
        runs = self.Runs()
        if run not in runs:
            return None
        # TODO(cais): Implement support for trace_id once the joining of eager
        # execution and intra-graph execution is supported by DebugDataReader.
        if trace_id is not None:
            raise NotImplementedError(
                "trace_id support for GraphExecutionTraceDigest is "
                "not implemented yet."
            )
        graph_exec_digests = self._reader.graph_execution_traces(digest=True)
        end = self._checkBeginEndIndices(begin, end, len(graph_exec_digests))
        return {
            "begin": begin,
            "end": end,
            "num_digests": len(graph_exec_digests),
            "graph_execution_digests": [
                digest.to_json() for digest in graph_exec_digests[begin:end]
            ],
        }

    def GraphExecutionData(self, run, begin, end, trace_id=None):
        """Get `GraphExecutionTrace`s.

        Args:
          run: The tfdbg2 run to get `GraphExecutionTrace`s from.
          begin: Beginning graph-execution index.
          end: Ending graph-execution index.

        Returns:
          A JSON-serializable object containing the `ExecutionDigest`s and
          related meta-information
        """
        runs = self.Runs()
        if run not in runs:
            return None
        # TODO(cais): Implement support for trace_id once the joining of eager
        # execution and intra-graph execution is supported by DebugDataReader.
        if trace_id is not None:
            raise NotImplementedError(
                "trace_id support for GraphExecutionTraceData is "
                "not implemented yet."
            )
        digests = self._reader.graph_execution_traces(digest=True)
        end = self._checkBeginEndIndices(begin, end, len(digests))
        graph_executions = self._reader.graph_execution_traces(
            digest=False, begin=begin, end=end
        )
        return {
            "begin": begin,
            "end": end,
            "graph_executions": [
                graph_exec.to_json() for graph_exec in graph_executions
            ],
        }

    def GraphInfo(self, run, graph_id):
        """Get the information regarding a TensorFlow graph.

        Args:
          run: Name of the run.
          graph_id: Debugger-generated ID of the graph in question.
            This information is available in the return values
            of `GraphOpInfo`, `GraphExecution`, etc.

        Returns:
          A JSON-serializable object containing the information regarding
            the TensorFlow graph.

        Raises:
          NotFoundError if the graph_id is not known to the debugger.
        """
        runs = self.Runs()
        if run not in runs:
            return None
        try:
            graph = self._reader.graph_by_id(graph_id)
        except KeyError:
            raise errors.NotFoundError(
                'There is no graph with ID "%s"' % graph_id
            )
        return graph.to_json()

    def GraphOpInfo(self, run, graph_id, op_name):
        """Get the information regarding a graph op's creation.

        Args:
          run: Name of the run.
          graph_id: Debugger-generated ID of the graph that contains
            the op in question. This ID is available from other methods
            of this class, e.g., the return value of `GraphExecutionDigests()`.
          op_name: Name of the op.

        Returns:
          A JSON-serializable object containing the information regarding
            the op's creation and its immediate inputs and consumers.

        Raises:
          NotFoundError if the graph_id or op_name does not exist.
        """
        runs = self.Runs()
        if run not in runs:
            return None
        try:
            graph = self._reader.graph_by_id(graph_id)
        except KeyError:
            raise errors.NotFoundError(
                'There is no graph with ID "%s"' % graph_id
            )
        try:
            op_creation_digest = graph.get_op_creation_digest(op_name)
        except KeyError:
            raise errors.NotFoundError(
                'There is no op named "%s" in graph with ID "%s"'
                % (op_name, graph_id)
            )
        data_object = self._opCreationDigestToDataObject(
            op_creation_digest, graph
        )
        # Populate data about immediate inputs.
        for input_spec in data_object["inputs"]:
            try:
                input_op_digest = graph.get_op_creation_digest(
                    input_spec["op_name"]
                )
            except KeyError:
                input_op_digest = None
            if input_op_digest:
                input_spec["data"] = self._opCreationDigestToDataObject(
                    input_op_digest, graph
                )
        # Populate data about immediate consuming ops.
        for slot_consumer_specs in data_object["consumers"]:
            for consumer_spec in slot_consumer_specs:
                try:
                    digest = graph.get_op_creation_digest(
                        consumer_spec["op_name"]
                    )
                except KeyError:
                    digest = None
                if digest:
                    consumer_spec["data"] = self._opCreationDigestToDataObject(
                        digest, graph
                    )
        return data_object

    def _opCreationDigestToDataObject(self, op_creation_digest, graph):
        if op_creation_digest is None:
            return None
        json_object = op_creation_digest.to_json()
        del json_object["graph_id"]
        json_object["graph_ids"] = self._getGraphStackIds(
            op_creation_digest.graph_id
        )
        # TODO(cais): "num_outputs" should be populated in to_json() instead.
        json_object["num_outputs"] = op_creation_digest.num_outputs
        del json_object["input_names"]

        json_object["inputs"] = []
        for input_tensor_name in op_creation_digest.input_names or []:
            input_op_name, output_slot = parse_tensor_name(input_tensor_name)
            json_object["inputs"].append(
                {"op_name": input_op_name, "output_slot": output_slot}
            )
        json_object["consumers"] = []
        for _ in range(json_object["num_outputs"]):
            json_object["consumers"].append([])
        for src_slot, consumer_op_name, dst_slot in graph.get_op_consumers(
            json_object["op_name"]
        ):
            json_object["consumers"][src_slot].append(
                {"op_name": consumer_op_name, "input_slot": dst_slot}
            )
        return json_object

    def _getGraphStackIds(self, graph_id):
        """Retrieve the IDs of all outer graphs of a graph.

        Args:
          graph_id: Id of the graph being queried with respect to its outer
            graphs context.

        Returns:
          A list of graph_ids, ordered from outermost to innermost, including
            the input `graph_id` argument as the last item.
        """
        graph_ids = [graph_id]
        graph = self._reader.graph_by_id(graph_id)
        while graph.outer_graph_id:
            graph_ids.insert(0, graph.outer_graph_id)
            graph = self._reader.graph_by_id(graph.outer_graph_id)
        return graph_ids

    def SourceFileList(self, run):
        runs = self.Runs()
        if run not in runs:
            return None
        return self._reader.source_file_list()

    def SourceLines(self, run, index):
        runs = self.Runs()
        if run not in runs:
            return None
        try:
            host_name, file_path = self._reader.source_file_list()[index]
        except IndexError:
            raise errors.NotFoundError(
                "There is no source-code file at index %d" % index
            )
        return {
            "host_name": host_name,
            "file_path": file_path,
            "lines": self._reader.source_lines(host_name, file_path),
        }

    def StackFrames(self, run, stack_frame_ids):
        runs = self.Runs()
        if run not in runs:
            return None
        stack_frames = []
        for stack_frame_id in stack_frame_ids:
            if stack_frame_id not in self._reader._stack_frame_by_id:
                raise errors.NotFoundError(
                    "Cannot find stack frame with ID %s" % stack_frame_id
                )
            # TODO(cais): Use public method (`stack_frame_by_id()`) when
            # available.
            # pylint: disable=protected-access
            stack_frames.append(self._reader._stack_frame_by_id[stack_frame_id])
            # pylint: enable=protected-access
        return {"stack_frames": stack_frames}