# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A wrapper around DebugDataReader used for retrieving tfdbg v2 data."""
import threading
from tensorboard import errors
# Dummy run name for the debugger.
# Currently, the `DebuggerV2ExperimentMultiplexer` class is tied to a single
# logdir, which holds at most one DebugEvent file set in the tfdbg v2 (tfdbg2
# for short) format.
# TODO(cais): When tfdbg2 allows there to be multiple DebugEvent file sets in
# the same logdir, replace this magic string with actual run names.
DEFAULT_DEBUGGER_RUN_NAME = "__default_debugger_run__"
# Default number of alerts per monitor type.
# Limiting the number of alerts is based on the consideration that usually
# only the first few alerting events are the most critical and the subsequent
# ones are either repetitions of the earlier ones or caused by the earlier ones.
DEFAULT_PER_TYPE_ALERT_LIMIT = 1000
# Default interval between successive calls to `DebugDataReader.update()`.
DEFAULT_RELOAD_INTERVAL_SEC = 30
def run_repeatedly_in_background(target, interval_sec):
"""Run a target task repeatedly in the background.
In the context of this module, `target` is the `update()` method of the
underlying reader for tfdbg2-format data.
    This method is mocked by unit tests for deterministic behavior during
    testing.
Args:
target: The target task to run in the background, a callable with no args.
interval_sec: Time interval between repeats, in seconds.
Returns:
- A `threading.Event` object that can be used to interrupt an ongoing
waiting interval between successive runs of `target`. To interrupt the
interval, call the `set()` method of the object.
- The `threading.Thread` object on which `target` is run repeatedly.
"""
event = threading.Event()
def _run_repeatedly():
while True:
target()
event.wait(interval_sec)
event.clear()
# Use `daemon=True` to make sure the thread doesn't block program exit.
thread = threading.Thread(target=_run_repeatedly, daemon=True)
thread.start()
return event, thread
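# A minimal usage sketch (illustrative only; `my_update` is a hypothetical
# stand-in for the reader's `update` method): the returned `threading.Event`
# can be `set()` to cut short the sleep between runs, which is how `Runs()`
# below accelerates reloading while the frontend is actively polling.
#
#   event, thread = run_repeatedly_in_background(
#       my_update, DEFAULT_RELOAD_INTERVAL_SEC
#   )
#   # ... later, when fresh data is needed immediately:
#   event.set()  # Wakes the waiting loop so `my_update()` runs right away.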
def _alert_to_json(alert):
# TODO(cais): Replace this with Alert.to_json() when supported by the
# backend.
from tensorflow.python.debug.lib import debug_events_monitors
if isinstance(alert, debug_events_monitors.InfNanAlert):
return {
"alert_type": "InfNanAlert",
"op_type": alert.op_type,
"output_slot": alert.output_slot,
# TODO(cais): Once supported by backend, add 'op_name' key
# for intra-graph execution events.
"size": alert.size,
"num_neg_inf": alert.num_neg_inf,
"num_pos_inf": alert.num_pos_inf,
"num_nan": alert.num_nan,
"execution_index": alert.execution_index,
"graph_execution_trace_index": alert.graph_execution_trace_index,
}
else:
raise TypeError("Unrecognized alert subtype: %s" % type(alert))
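# For illustration, the JSON object produced above for an `InfNanAlert` has a
# shape like the following (all field values here are made up):
#
#   {
#       "alert_type": "InfNanAlert",
#       "op_type": "RealDiv",
#       "output_slot": 0,
#       "size": 1024,
#       "num_neg_inf": 0,
#       "num_pos_inf": 2,
#       "num_nan": 1,
#       "execution_index": 5,
#       "graph_execution_trace_index": 37,
#   }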
def parse_tensor_name(tensor_name):
"""Helper function that extracts op name and slot from tensor name."""
output_slot = 0
if ":" in tensor_name:
op_name, output_slot = tensor_name.split(":")
output_slot = int(output_slot)
else:
op_name = tensor_name
return op_name, output_slot
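# For example (illustrative): parse_tensor_name("dense/MatMul:1") returns
# ("dense/MatMul", 1), while parse_tensor_name("Const") returns ("Const", 0),
# because a tensor name without an explicit slot suffix refers to output slot 0.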
class DebuggerV2EventMultiplexer:
"""A class used for accessing tfdbg v2 DebugEvent data on local filesystem.
This class is a short-term hack, mirroring the EventMultiplexer for the main
TensorBoard plugins (e.g., scalar, histogram and graphs.) As such, it only
implements the methods relevant to the Debugger V2 pluggin.
TODO(cais): Integrate it with EventMultiplexer and use the integrated class
from MultiplexerDataProvider for a single path of accessing debugger and
non-debugger data.
"""
def __init__(self, logdir):
"""Constructor for the `DebugEventMultiplexer`.
Args:
logdir: Path to the directory to load the tfdbg v2 data from.
"""
self._logdir = logdir
self._reader = None
self._reader_lock = threading.Lock()
self._reload_needed_event = None
        # Create the reader for the tfdbg2 data in the logdir as soon as
# the backend of the debugger-v2 plugin is created, so it doesn't need
# to wait for the first request from the FE to start loading data.
self._tryCreateReader()
def _tryCreateReader(self):
"""Try creating reader for tfdbg2 data in the logdir.
If the reader has already been created, a new one will not be created and
this function is a no-op.
If a reader has not been created, create it and start periodic calls to
`update()` on a separate thread.
"""
if self._reader:
return
with self._reader_lock:
if not self._reader:
try:
# TODO(cais): Avoid conditional imports and instead use
# plugin loader to gate the loading of this entire plugin.
from tensorflow.python.debug.lib import debug_events_reader
from tensorflow.python.debug.lib import (
debug_events_monitors,
)
except ImportError:
                    # This ensures graceful behavior when the TensorFlow
                    # installation is unavailable or when the installed
                    # TensorFlow version does not contain the required modules.
return
try:
self._reader = debug_events_reader.DebugDataReader(
self._logdir
)
except AttributeError:
# Gracefully fail for users without the required API changes to
# debug_events_reader.DebugDataReader introduced in
# TF 2.1.0.dev20200103. This should be safe to remove when
# TF 2.2 is released.
return
except ValueError:
                    # When no DebugEvent file set is found in the logdir, a
                    # `ValueError` is raised.
return
self._monitors = [
debug_events_monitors.InfNanMonitor(
self._reader, limit=DEFAULT_PER_TYPE_ALERT_LIMIT
)
]
self._reload_needed_event, _ = run_repeatedly_in_background(
self._reader.update, DEFAULT_RELOAD_INTERVAL_SEC
)
def _reloadReader(self):
"""If a reader exists and has started period updating, unblock the update.
The updates are performed periodically with a sleep interval between
successive calls to the reader's update() method. Calling this method
interrupts the sleep immediately if one is ongoing.
"""
if self._reload_needed_event:
self._reload_needed_event.set()
def FirstEventTimestamp(self, run):
"""Return the timestamp of the first DebugEvent of the given run.
This may perform I/O if no events have been loaded yet for the run.
Args:
run: A string name of the run for which the timestamp is retrieved.
This currently must be hardcoded as `DEFAULT_DEBUGGER_RUN_NAME`,
as each logdir contains at most one DebugEvent file set (i.e., a
                run of a tfdbg2-instrumented TensorFlow program).
Returns:
The wall_time of the first event of the run, which will be in seconds
since the epoch as a `float`.
"""
if self._reader is None:
raise ValueError("No tfdbg2 runs exists.")
if run != DEFAULT_DEBUGGER_RUN_NAME:
raise ValueError(
"Expected run name to be %s, but got %s"
% (DEFAULT_DEBUGGER_RUN_NAME, run)
)
return self._reader.starting_wall_time()
def PluginRunToTagToContent(self, plugin_name):
raise NotImplementedError(
"DebugDataMultiplexer.PluginRunToTagToContent() has not been "
"implemented yet."
)
def Runs(self):
"""Return all the tfdbg2 run names in the logdir watched by this instance.
        The `Runs()` method of this class is specialized for the tfdbg2-format
        DebugEvent files.
        As a side effect, this method unblocks the underlying reader's periodic
        reloading if a reader exists. This lets the reader update at a higher
        frequency than the default 30-second sleep interval between reloads
        when data is being actively queried from this instance.
        Note that this `Runs()` method is used by all other public data-access
        methods of this class (e.g., `ExecutionData()`, `GraphExecutionData()`).
        Hence calls to those methods will lead to accelerated data reloading by
        the reader.
Returns:
If tfdbg2-format data exists in the `logdir` of this object, returns:
```
{runName: { "debugger-v2": [tag1, tag2, tag3] } }
```
            where `runName` is the hard-coded string `DEFAULT_DEBUGGER_RUN_NAME`.
            This is related to the fact that a tfdbg2 logdir currently contains
            at most one DebugEvent file set.
If no tfdbg2-format data exists in the `logdir`, an empty `dict`.
"""
        # Call `_tryCreateReader()` here to cover the possibility of tfdbg2
        # data starting to be written to the logdir after the TensorBoard
        # backend starts.
self._tryCreateReader()
if self._reader:
# If a _reader exists, unblock its reloading (on a separate thread)
# immediately.
self._reloadReader()
return {
DEFAULT_DEBUGGER_RUN_NAME: {
# TODO(cais): Add the semantically meaningful tag names such as
# 'execution_digests_book', 'alerts_book'
"debugger-v2": []
}
}
else:
return {}
def _checkBeginEndIndices(self, begin, end, total_count):
if begin < 0:
raise errors.InvalidArgumentError(
"Invalid begin index (%d)" % begin
)
if end > total_count:
raise errors.InvalidArgumentError(
"end index (%d) out of bounds (%d)" % (end, total_count)
)
if end >= 0 and end < begin:
raise errors.InvalidArgumentError(
"end index (%d) is unexpectedly less than begin index (%d)"
% (end, begin)
)
if end < 0: # This means all digests.
end = total_count
return end
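    # An informal sketch of the index semantics checked above, assuming
    # total_count == 10: begin=0, end=-1 selects all items (end becomes 10);
    # begin=2, end=5 selects items 2, 3 and 4; begin=0, end=11 raises
    # InvalidArgumentError because end exceeds total_count.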
def Alerts(self, run, begin, end, alert_type_filter=None):
"""Get alerts from the debugged TensorFlow program.
Args:
run: The tfdbg2 run to get Alerts from.
begin: Beginning alert index.
end: Ending alert index.
alert_type_filter: Optional filter string for alert type, used to
restrict retrieved alerts data to a single type. If used,
`begin` and `end` refer to the beginning and ending indices within
the filtered alert type.
"""
from tensorflow.python.debug.lib import debug_events_monitors
runs = self.Runs()
if run not in runs:
# TODO(cais): This should generate a 400 response instead.
return None
alerts = []
alerts_breakdown = dict()
alerts_by_type = dict()
for monitor in self._monitors:
monitor_alerts = monitor.alerts()
if not monitor_alerts:
continue
alerts.extend(monitor_alerts)
# TODO(cais): Replace this with Alert.to_json() when
# monitor.alert_type() is available.
if isinstance(monitor, debug_events_monitors.InfNanMonitor):
alert_type = "InfNanAlert"
else:
alert_type = "__MiscellaneousAlert__"
alerts_breakdown[alert_type] = len(monitor_alerts)
alerts_by_type[alert_type] = monitor_alerts
num_alerts = len(alerts)
if alert_type_filter is not None:
if alert_type_filter not in alerts_breakdown:
raise errors.InvalidArgumentError(
"Filtering of alerts failed: alert type %s does not exist"
% alert_type_filter
)
alerts = alerts_by_type[alert_type_filter]
end = self._checkBeginEndIndices(begin, end, len(alerts))
return {
"begin": begin,
"end": end,
"alert_type": alert_type_filter,
"num_alerts": num_alerts,
"alerts_breakdown": alerts_breakdown,
"per_type_alert_limit": DEFAULT_PER_TYPE_ALERT_LIMIT,
"alerts": [_alert_to_json(alert) for alert in alerts[begin:end]],
}
def ExecutionDigests(self, run, begin, end):
"""Get ExecutionDigests.
Args:
run: The tfdbg2 run to get `ExecutionDigest`s from.
begin: Beginning execution index.
end: Ending execution index.
Returns:
A JSON-serializable object containing the `ExecutionDigest`s and
            related meta-information.
"""
runs = self.Runs()
if run not in runs:
return None
        # TODO(cais): For scalability, use begin and end kwargs when available
        # in `DebugDataReader.executions()`.
execution_digests = self._reader.executions(digest=True)
end = self._checkBeginEndIndices(begin, end, len(execution_digests))
return {
"begin": begin,
"end": end,
"num_digests": len(execution_digests),
"execution_digests": [
digest.to_json() for digest in execution_digests[begin:end]
],
}
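    # A hypothetical paging sketch (illustrative only; the logdir path is made
    # up): fetching the first 100 execution digests of the single default run.
    #
    #   multiplexer = DebuggerV2EventMultiplexer("/tmp/tfdbg2_logdir")
    #   page = multiplexer.ExecutionDigests(
    #       DEFAULT_DEBUGGER_RUN_NAME, begin=0, end=100
    #   )
    #   # page["num_digests"] is the total count; the next page would use
    #   # begin=100, end=200, and so on.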
def ExecutionData(self, run, begin, end):
"""Get Execution data objects (Detailed, non-digest form).
Args:
run: The tfdbg2 run to get `ExecutionDigest`s from.
begin: Beginning execution index.
end: Ending execution index.
Returns:
A JSON-serializable object containing the `ExecutionDigest`s and
related meta-information
"""
runs = self.Runs()
if run not in runs:
return None
execution_digests = self._reader.executions(digest=True)
end = self._checkBeginEndIndices(begin, end, len(execution_digests))
execution_digests = execution_digests[begin:end]
executions = self._reader.executions(digest=False, begin=begin, end=end)
return {
"begin": begin,
"end": end,
"executions": [execution.to_json() for execution in executions],
}
def GraphExecutionDigests(self, run, begin, end, trace_id=None):
"""Get `GraphExecutionTraceDigest`s.
Args:
run: The tfdbg2 run to get `GraphExecutionTraceDigest`s from.
begin: Beginning graph-execution index.
end: Ending graph-execution index.
Returns:
            A JSON-serializable object containing the `GraphExecutionTraceDigest`s
            and related meta-information.
"""
runs = self.Runs()
if run not in runs:
return None
# TODO(cais): Implement support for trace_id once the joining of eager
# execution and intra-graph execution is supported by DebugDataReader.
if trace_id is not None:
raise NotImplementedError(
"trace_id support for GraphExecutionTraceDigest is "
"not implemented yet."
)
graph_exec_digests = self._reader.graph_execution_traces(digest=True)
end = self._checkBeginEndIndices(begin, end, len(graph_exec_digests))
return {
"begin": begin,
"end": end,
"num_digests": len(graph_exec_digests),
"graph_execution_digests": [
digest.to_json() for digest in graph_exec_digests[begin:end]
],
}
def GraphExecutionData(self, run, begin, end, trace_id=None):
"""Get `GraphExecutionTrace`s.
Args:
run: The tfdbg2 run to get `GraphExecutionTrace`s from.
begin: Beginning graph-execution index.
end: Ending graph-execution index.
Returns:
            A JSON-serializable object containing the `GraphExecutionTrace`s and
            related meta-information.
"""
runs = self.Runs()
if run not in runs:
return None
# TODO(cais): Implement support for trace_id once the joining of eager
# execution and intra-graph execution is supported by DebugDataReader.
if trace_id is not None:
raise NotImplementedError(
"trace_id support for GraphExecutionTraceData is "
"not implemented yet."
)
digests = self._reader.graph_execution_traces(digest=True)
end = self._checkBeginEndIndices(begin, end, len(digests))
graph_executions = self._reader.graph_execution_traces(
digest=False, begin=begin, end=end
)
return {
"begin": begin,
"end": end,
"graph_executions": [
graph_exec.to_json() for graph_exec in graph_executions
],
}
def GraphInfo(self, run, graph_id):
"""Get the information regarding a TensorFlow graph.
Args:
run: Name of the run.
graph_id: Debugger-generated ID of the graph in question.
This information is available in the return values
of `GraphOpInfo`, `GraphExecution`, etc.
Returns:
A JSON-serializable object containing the information regarding
the TensorFlow graph.
Raises:
NotFoundError if the graph_id is not known to the debugger.
"""
runs = self.Runs()
if run not in runs:
return None
try:
graph = self._reader.graph_by_id(graph_id)
except KeyError:
raise errors.NotFoundError(
'There is no graph with ID "%s"' % graph_id
)
return graph.to_json()
def GraphOpInfo(self, run, graph_id, op_name):
"""Get the information regarding a graph op's creation.
Args:
run: Name of the run.
graph_id: Debugger-generated ID of the graph that contains
the op in question. This ID is available from other methods
of this class, e.g., the return value of `GraphExecutionDigests()`.
op_name: Name of the op.
Returns:
A JSON-serializable object containing the information regarding
the op's creation and its immediate inputs and consumers.
Raises:
NotFoundError if the graph_id or op_name does not exist.
"""
runs = self.Runs()
if run not in runs:
return None
try:
graph = self._reader.graph_by_id(graph_id)
except KeyError:
raise errors.NotFoundError(
'There is no graph with ID "%s"' % graph_id
)
try:
op_creation_digest = graph.get_op_creation_digest(op_name)
except KeyError:
raise errors.NotFoundError(
'There is no op named "%s" in graph with ID "%s"'
% (op_name, graph_id)
)
data_object = self._opCreationDigestToDataObject(
op_creation_digest, graph
)
# Populate data about immediate inputs.
for input_spec in data_object["inputs"]:
try:
input_op_digest = graph.get_op_creation_digest(
input_spec["op_name"]
)
except KeyError:
input_op_digest = None
if input_op_digest:
input_spec["data"] = self._opCreationDigestToDataObject(
input_op_digest, graph
)
# Populate data about immediate consuming ops.
for slot_consumer_specs in data_object["consumers"]:
for consumer_spec in slot_consumer_specs:
try:
digest = graph.get_op_creation_digest(
consumer_spec["op_name"]
)
except KeyError:
digest = None
if digest:
consumer_spec["data"] = self._opCreationDigestToDataObject(
digest, graph
)
return data_object
def _opCreationDigestToDataObject(self, op_creation_digest, graph):
if op_creation_digest is None:
return None
json_object = op_creation_digest.to_json()
del json_object["graph_id"]
json_object["graph_ids"] = self._getGraphStackIds(
op_creation_digest.graph_id
)
# TODO(cais): "num_outputs" should be populated in to_json() instead.
json_object["num_outputs"] = op_creation_digest.num_outputs
del json_object["input_names"]
json_object["inputs"] = []
for input_tensor_name in op_creation_digest.input_names or []:
input_op_name, output_slot = parse_tensor_name(input_tensor_name)
json_object["inputs"].append(
{"op_name": input_op_name, "output_slot": output_slot}
)
json_object["consumers"] = []
for _ in range(json_object["num_outputs"]):
json_object["consumers"].append([])
for src_slot, consumer_op_name, dst_slot in graph.get_op_consumers(
json_object["op_name"]
):
json_object["consumers"][src_slot].append(
{"op_name": consumer_op_name, "input_slot": dst_slot}
)
return json_object
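    # For illustration, the object returned above has (alongside other fields
    # copied from the digest's to_json()) this shape:
    #
    #   {
    #       "op_name": ...,
    #       "graph_ids": [outermost_graph_id, ..., this_ops_graph_id],
    #       "num_outputs": N,
    #       "inputs": [{"op_name": ..., "output_slot": ...}, ...],
    #       # One list per output slot, each holding that slot's consumers:
    #       "consumers": [[{"op_name": ..., "input_slot": ...}, ...], ...],
    #   }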
def _getGraphStackIds(self, graph_id):
"""Retrieve the IDs of all outer graphs of a graph.
Args:
            graph_id: ID of the graph being queried with respect to its
                outer-graph context.
Returns:
A list of graph_ids, ordered from outermost to innermost, including
the input `graph_id` argument as the last item.
"""
graph_ids = [graph_id]
graph = self._reader.graph_by_id(graph_id)
while graph.outer_graph_id:
graph_ids.insert(0, graph.outer_graph_id)
graph = self._reader.graph_by_id(graph.outer_graph_id)
return graph_ids
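    # For example (illustrative): if graph "inner_id" is nested inside
    # "middle_id", which is nested inside the outermost graph "outer_id", then
    # _getGraphStackIds("inner_id") returns ["outer_id", "middle_id", "inner_id"].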
def SourceFileList(self, run):
runs = self.Runs()
if run not in runs:
return None
return self._reader.source_file_list()
def SourceLines(self, run, index):
runs = self.Runs()
if run not in runs:
return None
try:
host_name, file_path = self._reader.source_file_list()[index]
except IndexError:
raise errors.NotFoundError(
"There is no source-code file at index %d" % index
)
return {
"host_name": host_name,
"file_path": file_path,
"lines": self._reader.source_lines(host_name, file_path),
}
def StackFrames(self, run, stack_frame_ids):
runs = self.Runs()
if run not in runs:
return None
stack_frames = []
for stack_frame_id in stack_frame_ids:
if stack_frame_id not in self._reader._stack_frame_by_id:
raise errors.NotFoundError(
"Cannot find stack frame with ID %s" % stack_frame_id
)
# TODO(cais): Use public method (`stack_frame_by_id()`) when
# available.
# pylint: disable=protected-access
stack_frames.append(self._reader._stack_frame_by_id[stack_frame_id])
# pylint: enable=protected-access
return {"stack_frames": stack_frames}