# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Gradients for operators defined in nn_ops.py."""

import functools
import itertools
import operator

from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import array_ops_stack
from tensorflow.python.ops import gen_nn_ops
from tensorflow.python.ops import math_ops


@ops.RegisterGradient("Conv2DBackpropInput")
def _Conv2DBackpropInputGrad(op: ops.Operation, grad):
  """The derivatives for deconvolution.

  Args:
    op: the Deconvolution op.
    grad: the tensor representing the gradient w.r.t. the output

  Returns:
    the gradients w.r.t. the input and the filter
  """
  # We call the gen_nn_ops backprop functions instead of nn_ops backprop
  # functions for performance reasons in Eager mode. See _Conv2DGrad.
  return [
      None,
      gen_nn_ops.conv2d_backprop_filter(
          grad,
          array_ops.shape(op.inputs[1]),
          op.inputs[2],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          explicit_paddings=op.get_attr("explicit_paddings"),
          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
          data_format=op.get_attr("data_format").decode()),
      gen_nn_ops.conv2d(
          grad,
          op.inputs[1],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          explicit_paddings=op.get_attr("explicit_paddings"),
          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
          data_format=op.get_attr("data_format").decode())
  ]


@ops.RegisterGradient("Conv2DBackpropFilter")
def _Conv2DBackpropFilterGrad(op: ops.Operation, grad):
  # We call the gen_nn_ops backprop functions instead of nn_ops backprop
  # functions for performance reasons in Eager mode. See _Conv2DGrad.
  return [
      gen_nn_ops.conv2d_backprop_input(
          array_ops.shape(op.inputs[0]),
          grad,
          op.inputs[2],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          explicit_paddings=op.get_attr("explicit_paddings"),
          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
          data_format=op.get_attr("data_format").decode()),
      None,
      gen_nn_ops.conv2d(
          op.inputs[0],
          grad,
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          explicit_paddings=op.get_attr("explicit_paddings"),
          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
          data_format=op.get_attr("data_format").decode())
  ]


@ops.RegisterGradient("DepthwiseConv2dNativeBackpropInput")
def _DepthwiseConv2dNativeBackpropInputGrad(op: ops.Operation, grad):
  """The derivatives for deconvolution.

  Args:
    op: the Deconvolution op.
    grad: the tensor representing the gradient w.r.t. the output

  Returns:
    the gradients w.r.t.
    the input and the filter
  """
  return [
      None,
      gen_nn_ops.depthwise_conv2d_native_backprop_filter(
          grad,
          array_ops.shape(op.inputs[1]),
          op.inputs[2],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          explicit_paddings=op.get_attr("explicit_paddings"),
          data_format=op.get_attr("data_format")),
      gen_nn_ops.depthwise_conv2d_native(
          grad,
          op.inputs[1],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          explicit_paddings=op.get_attr("explicit_paddings"),
          data_format=op.get_attr("data_format"))
  ]


@ops.RegisterGradient("DepthwiseConv2dNativeBackpropFilter")
def _DepthwiseConv2dNativeBackpropFilterGrad(op: ops.Operation, grad):
  return [
      gen_nn_ops.depthwise_conv2d_native_backprop_input(
          array_ops.shape(op.inputs[0]),
          grad,
          op.inputs[2],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          explicit_paddings=op.get_attr("explicit_paddings"),
          data_format=op.get_attr("data_format")),
      None,
      gen_nn_ops.depthwise_conv2d_native(
          op.inputs[0],
          grad,
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          explicit_paddings=op.get_attr("explicit_paddings"),
          data_format=op.get_attr("data_format"))
  ]


@ops.RegisterGradient("Conv3D")
def _Conv3DGrad(op: ops.Operation, grad):
  data_format = op.get_attr("data_format").decode()
  shape_0, shape_1 = array_ops.shape_n([op.inputs[0], op.inputs[1]])
  return [
      gen_nn_ops.conv3d_backprop_input_v2(
          shape_0,
          op.inputs[1],
          grad,
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          data_format=data_format),
      gen_nn_ops.conv3d_backprop_filter_v2(
          op.inputs[0],
          shape_1,
          grad,
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          data_format=data_format,
      ),
  ]


@ops.RegisterGradient("Conv3DBackpropInputV2")
def _Conv3DBackpropInputGrad(op: ops.Operation, grad):
  data_format = op.get_attr("data_format").decode()
  return [
      None,
      gen_nn_ops.conv3d_backprop_filter_v2(
          grad,
          array_ops.shape(op.inputs[1]),
          op.inputs[2],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          data_format=data_format),
      gen_nn_ops.conv3d(
          grad,
          op.inputs[1],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          data_format=data_format)
  ]


@ops.RegisterGradient("Conv3DBackpropFilterV2")
def _Conv3DBackpropFilterGrad(op: ops.Operation, grad):
  data_format = op.get_attr("data_format").decode()
  return [
      gen_nn_ops.conv3d_backprop_input_v2(
          array_ops.shape(op.inputs[0]),
          grad,
          op.inputs[2],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          data_format=data_format),
      None,
      gen_nn_ops.conv3d(
          op.inputs[0],
          grad,
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          data_format=data_format)
  ]


@ops.RegisterGradient("AvgPool3D")
def _AvgPool3DGrad(op: ops.Operation, grad):
  return gen_nn_ops.avg_pool3d_grad(
      array_ops.shape(op.inputs[0]),
      grad,
      ksize=op.get_attr("ksize"),
      strides=op.get_attr("strides"),
      padding=op.get_attr("padding"),
      data_format=op.get_attr("data_format").decode())


@ops.RegisterGradient("AvgPool3DGrad")
def _AvgPool3DGradGrad(op: ops.Operation, grad):
  return (array_ops.stop_gradient(op.inputs[0]),
          gen_nn_ops.avg_pool3d(
              grad,
              op.get_attr("ksize"),
              op.get_attr("strides"),
              op.get_attr("padding"),
              data_format=op.get_attr("data_format").decode()))
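
# Note on _AvgPool3DGradGrad above: average pooling is linear in its input, so
# the gradient of AvgPool3DGrad with respect to its incoming out_backprop is
# simply the forward avg_pool3d applied to the new upstream gradient, which is
# what the second element of the returned tuple computes. The first element
# corresponds to the integer orig_input_shape input and carries no useful
# gradient.
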
@ops.RegisterGradient("MaxPool3D")
def _MaxPool3DGrad(op: ops.Operation, grad):
  return gen_nn_ops.max_pool3d_grad(
      op.inputs[0],
      op.outputs[0],
      grad,
      ksize=op.get_attr("ksize"),
      strides=op.get_attr("strides"),
      padding=op.get_attr("padding"),
      data_format=op.get_attr("data_format").decode())


@ops.RegisterGradient("MaxPool3DGrad")
def _MaxPool3DGradGrad(op: ops.Operation, grad):
  return (array_ops.zeros_like(op.inputs[0]),
          array_ops.zeros_like(op.inputs[1]),
          gen_nn_ops.max_pool3d_grad_grad(
              op.inputs[0],
              op.inputs[1],
              grad,
              op.get_attr("ksize"),
              op.get_attr("strides"),
              padding=op.get_attr("padding"),
              data_format=op.get_attr("data_format").decode()))


@ops.RegisterGradient("MaxPool3DGradGrad")
def _MaxPool3DGradGradGrad(op: ops.Operation, grad):
  return (array_ops.zeros_like(op.inputs[0]),
          array_ops.zeros_like(op.inputs[1]),
          gen_nn_ops.max_pool3d_grad(
              op.inputs[0],
              op.inputs[1],
              grad,
              op.get_attr("ksize"),
              op.get_attr("strides"),
              padding=op.get_attr("padding"),
              data_format=op.get_attr("data_format").decode()))


@ops.RegisterGradient("Softmax")
def _SoftmaxGrad(op: ops.Operation, grad_softmax):
  """The derivative of the softmax nonlinearity.

  We assume that probs is of shape [batch_size * dim].
  The formula for dsoftmax / dx = (diag(softmax) - softmax * softmax').
  This matrix is diagonal minus a rank one matrix, so it is easy to
  implement as follows:

    grad_x = grad_softmax * softmax - sum(grad_softmax * softmax) * softmax

  Args:
    op: the Softmax op.
    grad_softmax: the tensor representing the gradient w.r.t. the softmax
      output.

  Returns:
    gradient w.r.t the input to the softmax
  """
  softmax = op.outputs[0]
  sum_channels = math_ops.reduce_sum(grad_softmax * softmax, -1, keepdims=True)
  return (grad_softmax - sum_channels) * softmax


@ops.RegisterGradient("LogSoftmax")
def _LogSoftmaxGrad(op: ops.Operation, grad):
  """The gradient for log_softmax.

    log_softmax = input - log(sum(exp(input)))
    dlog_softmax/dinput = diag - softmax(input)

  Args:
    op: The log softmax op.
    grad: The tensor representing the gradient w.r.t. the output.

  Returns:
    The gradients w.r.t. the input.
  """
  softmax = math_ops.exp(op.outputs[0])
  return grad - math_ops.reduce_sum(grad, -1, keepdims=True) * softmax


@ops.RegisterGradient("BiasAdd")
def _BiasAddGrad(op: ops.Operation, received_grad):
  """Return the gradients for the 2 inputs of bias_op.

  The first input of unused_bias_op is the tensor t, and its gradient is
  just the gradient the unused_bias_op received.

  The second input of unused_bias_op is the bias vector which has one fewer
  dimension than "received_grad" (the batch dimension.) Its gradient is the
  received gradient summed on the batch dimension, which is the first
  dimension.

  Args:
    op: The BiasOp for which we need to generate gradients.
    received_grad: Tensor. The gradients passed to the BiasOp.

  Returns:
    Two tensors, the first one for the "tensor" input of the BiasOp,
    the second one for the "bias" input of the BiasOp.
  """
  try:
    data_format = op.get_attr("data_format")
  except ValueError:
    data_format = None
  return (received_grad,
          gen_nn_ops.bias_add_grad(
              out_backprop=received_grad, data_format=data_format))


@ops.RegisterGradient("BiasAddGrad")
def _BiasAddGradGrad(op: ops.Operation, received_grad):
  """Gradient for the BiasAddGrad op.

  Args:
    op: BiasAddGrad op for which we are calculating gradients.
    received_grad: The gradients passed to the BiasAddGrad op.
  Returns:
    A single gradient Tensor for the input to BiasAddGrad (which
    is the gradient of the bias term in BiasAdd)
  """
  try:
    data_format = op.get_attr("data_format")
  except ValueError:
    data_format = None
  shape = array_ops.shape(op.inputs[0])
  bias_shape = array_ops.shape(received_grad)
  if data_format == b"NCHW":
    expanded_shape = array_ops.concat([
        array_ops.ones_like(shape[:1]), bias_shape,
        array_ops.ones_like(shape[2:])
    ], 0)
    tile_mults = array_ops.concat([shape[:1], [1], shape[2:]], 0)
  else:
    expanded_shape = array_ops.concat(
        [array_ops.ones_like(shape[:-1]), bias_shape], 0)
    tile_mults = array_ops.concat([shape[:-1], [1]], 0)
  expanded_grad = array_ops.reshape(received_grad, expanded_shape)
  return array_ops.tile(expanded_grad, tile_mults)


@ops.RegisterGradient("BiasAddV1")
def _BiasAddGradV1(unused_bias_op: ops.Operation, received_grad):
  """Return the gradients for the 2 inputs of bias_op.

  The first input of unused_bias_op is the tensor t, and its gradient is
  just the gradient the unused_bias_op received.

  The second input of unused_bias_op is the bias vector which has one fewer
  dimension than "received_grad" (the batch dimension.) Its gradient is the
  received gradient summed on the batch dimension, which is the first
  dimension.

  Args:
    unused_bias_op: The BiasOp for which we need to generate gradients.
    received_grad: Tensor. The gradients passed to the BiasOp.

  Returns:
    Two tensors, the first one for the "tensor" input of the BiasOp,
    the second one for the "bias" input of the BiasOp.
  """
  reduction_dim_tensor = math_ops.range(array_ops.rank(received_grad) - 1)
  return (received_grad,
          math_ops.reduce_sum(received_grad, reduction_dim_tensor))


@ops.RegisterGradient("Relu")
def _ReluGrad(op: ops.Operation, grad):
  return gen_nn_ops.relu_grad(grad, op.outputs[0])


@ops.RegisterGradient("EluGrad")
def _EluGradGrad(op: ops.Operation, grad):
  elu_x = op.inputs[1]
  return (gen_nn_ops.elu_grad(grad, elu_x),
          array_ops.where(
              elu_x < 0, grad * op.inputs[0], array_ops.zeros_like(elu_x)))


@ops.RegisterGradient("SeluGrad")
def _SeluGradGrad(op: ops.Operation, grad):
  selu_x = op.inputs[1]
  return (gen_nn_ops.selu_grad(grad, selu_x),
          array_ops.where(
              selu_x < 0., grad * op.inputs[0], array_ops.zeros_like(selu_x)))


@ops.RegisterGradient("Relu6")
def _Relu6Grad(op: ops.Operation, grad):
  return gen_nn_ops.relu6_grad(grad, op.outputs[0])


@ops.RegisterGradient("Relu6Grad")
def _Relu6GradGrad(op: ops.Operation, grad):
  x = op.inputs[1]
  return (gen_nn_ops.relu6_grad(grad, x), array_ops.zeros_like(x))


@ops.RegisterGradient("LeakyRelu")
def _LeakyReluGrad(op: ops.Operation, grad):
  x = op.inputs[0]
  alpha = op.get_attr("alpha")
  return gen_nn_ops.leaky_relu_grad(grad, x, alpha=alpha)


@ops.RegisterGradient("LeakyReluGrad")
def _LeakyReluGradGrad(op: ops.Operation, grad):
  x = op.inputs[1]
  alpha = op.get_attr("alpha")
  return (gen_nn_ops.leaky_relu_grad(grad, x, alpha=alpha),
          array_ops.zeros_like(x))


@ops.RegisterGradient("Elu")
def _EluGrad(op: ops.Operation, grad):
  return gen_nn_ops.elu_grad(grad, op.outputs[0])


@ops.RegisterGradient("Selu")
def _SeluGrad(op: ops.Operation, grad):
  return gen_nn_ops.selu_grad(grad, op.outputs[0])


@ops.RegisterGradient("Softplus")
def _SoftplusGrad(op: ops.Operation, grad):
  return grad * math_ops.sigmoid(op.inputs[0])


@ops.RegisterGradient("SoftplusGrad")
def _SoftplusGradGrad(op: ops.Operation, grad):
  # Let:
  #   y = tf.nn.softplus(x)
  #   dx = gen_nn_ops.softplus_grad(dy, x) = dy / (1 + exp(-x))
  # This op computes (ddy, d2x) from op.inputs == [dy, x] and grad == ddx.
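  # Derivation note (for reference): with s = sigmoid(x) = 1 / (1 + exp(-x)),
  #   ddy = ddx * d(dx)/d(dy) = ddx * s
  #   d2x = ddx * d(dx)/dx = ddx * dy * s * (1 - s)
  #       = ddx * dy / (exp(-x) + 2 + exp(x)),
  # which matches the two expressions computed below.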
  dy, x = op.inputs
  with ops.control_dependencies([grad]):
    ddy = gen_nn_ops.softplus_grad(grad, x)
    d2x = grad * dy / (math_ops.exp(-x) + 2.0 + math_ops.exp(x))
    return (ddy, d2x)


@ops.RegisterGradient("Softsign")
def _SoftsignGrad(op: ops.Operation, grad):
  return gen_nn_ops.softsign_grad(grad, op.inputs[0])


@ops.RegisterGradient("ReluGrad")
def _ReluGradGrad(op: ops.Operation, grad):
  x = op.inputs[1]
  return (gen_nn_ops.relu_grad(grad, x), array_ops.zeros_like(x))


def _BroadcastMul(vec, mat):
  """Multiply after broadcasting vec to match dimensions of mat.

  Args:
    vec: A 1-D tensor of dimension [D0]
    mat: A 2-D tensor of dimension [D0, D1]

  Returns:
    A tensor of dimension [D0, D1], the result of vec * mat
  """
  # Reshape vec to [D0, 1]
  vec = array_ops.expand_dims(vec, -1)
  return vec * mat


@ops.RegisterGradient("SoftmaxCrossEntropyWithLogits")
def _SoftmaxCrossEntropyWithLogitsGrad(op: ops.Operation, grad_loss,
                                       grad_grad):
  """Gradient function for SoftmaxCrossEntropyWithLogits."""
  # grad_loss is the backprop for cost, and we multiply it with the gradients
  # (which is output[1])
  # grad_grad is the backprop for softmax gradient.
  #
  # Second derivative is just softmax derivative w.r.t. logits.
  softmax_grad = op.outputs[1]
  grad = _BroadcastMul(grad_loss, softmax_grad)

  logits = op.inputs[0]
  if (grad_grad is not None and
      not getattr(grad_grad, "_is_zeros_tensor", False)):
    softmax = gen_nn_ops.softmax(logits)

    grad += ((grad_grad - array_ops.squeeze(
        math_ops.matmul(
            array_ops.expand_dims(grad_grad, 1),
            array_ops.expand_dims(softmax, 2)),
        axis=1)) * softmax)

  return grad, _BroadcastMul(grad_loss, -gen_nn_ops.log_softmax(logits))  # pylint: disable=invalid-unary-operand-type


@ops.RegisterGradient("SparseSoftmaxCrossEntropyWithLogits")
def _SparseSoftmaxCrossEntropyWithLogitsGrad(op: ops.Operation, grad_loss,
                                             grad_grad):
  """Gradient function for SparseSoftmaxCrossEntropyWithLogits."""
  # grad_loss is the backprop for cost, and we multiply it with the gradients
  # (which is output[1])
  # grad_grad is the backprop for softmax gradient.
  # There is no gradient for the labels.
  #
  # Second derivative is just softmax derivative w.r.t. logits.
  softmax_grad = op.outputs[1]
  grad = _BroadcastMul(grad_loss, softmax_grad)

  logits = op.inputs[0]
  if (grad_grad is not None and
      not getattr(grad_grad, "_is_zeros_tensor", False)):
    softmax = gen_nn_ops.softmax(logits)

    grad += ((grad_grad - array_ops.squeeze(
        math_ops.matmul(
            array_ops.expand_dims(grad_grad, 1),
            array_ops.expand_dims(softmax, 2)),
        axis=1)) * softmax)

  return grad, None


@ops.RegisterGradient("Conv2D")
def _Conv2DGrad(op: ops.Operation, grad):
  """Gradient function for Conv2D."""
  dilations = op.get_attr("dilations")
  strides = op.get_attr("strides")
  padding = op.get_attr("padding")
  explicit_paddings = op.get_attr("explicit_paddings")
  use_cudnn_on_gpu = op.get_attr("use_cudnn_on_gpu")
  data_format = op.get_attr("data_format")
  shape_0, shape_1 = array_ops.shape_n([op.inputs[0], op.inputs[1]])

  # We call the gen_nn_ops backprop functions instead of nn_ops backprop
  # functions for performance reasons in Eager mode. gen_nn_ops functions take
  # an `explicit_paddings` parameter, but nn_ops functions do not. So if we
  # were to use the nn_ops functions, we would have to convert `padding` and
  # `explicit_paddings` into a single `padding` parameter, increasing overhead
  # in Eager mode.
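  # Illustrative sketch (not executed here; `x` and `w` are hypothetical
  # tensors): the two backprop calls below are what a tf.GradientTape over
  # tf.nn.conv2d ultimately dispatches to, e.g.
  #   with tf.GradientTape() as tape:
  #     tape.watch([x, w])
  #     y = tf.nn.conv2d(x, w, strides=1, padding="SAME")
  #   dx, dw = tape.gradient(tf.reduce_sum(y), [x, w])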
  return [
      gen_nn_ops.conv2d_backprop_input(
          shape_0,
          op.inputs[1],
          grad,
          dilations=dilations,
          strides=strides,
          padding=padding,
          explicit_paddings=explicit_paddings,
          use_cudnn_on_gpu=use_cudnn_on_gpu,
          data_format=data_format),
      gen_nn_ops.conv2d_backprop_filter(
          op.inputs[0],
          shape_1,
          grad,
          dilations=dilations,
          strides=strides,
          padding=padding,
          explicit_paddings=explicit_paddings,
          use_cudnn_on_gpu=use_cudnn_on_gpu,
          data_format=data_format)
  ]


@ops.RegisterGradient("DepthwiseConv2dNative")
def _DepthwiseConv2dNativeGrad(op: ops.Operation, grad):
  return [
      gen_nn_ops.depthwise_conv2d_native_backprop_input(
          array_ops.shape(op.inputs[0]),
          op.inputs[1],
          grad,
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          explicit_paddings=op.get_attr("explicit_paddings"),
          data_format=op.get_attr("data_format")),
      gen_nn_ops.depthwise_conv2d_native_backprop_filter(
          op.inputs[0],
          array_ops.shape(op.inputs[1]),
          grad,
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          explicit_paddings=op.get_attr("explicit_paddings"),
          data_format=op.get_attr("data_format"))
  ]


@ops.RegisterGradient("Dilation2D")
def _Dilation2DGrad(op: ops.Operation, grad):
  return [
      gen_nn_ops.dilation2d_backprop_input(op.inputs[0], op.inputs[1], grad,
                                           op.get_attr("strides"),
                                           op.get_attr("rates"),
                                           op.get_attr("padding")),
      gen_nn_ops.dilation2d_backprop_filter(op.inputs[0], op.inputs[1], grad,
                                            op.get_attr("strides"),
                                            op.get_attr("rates"),
                                            op.get_attr("padding"))
  ]


@ops.RegisterGradient("LRN")
def _LRNGrad(op: ops.Operation, grad):
  depth_radius = op.get_attr("depth_radius")
  bias = op.get_attr("bias")
  alpha = op.get_attr("alpha")
  beta = op.get_attr("beta")
  return [
      gen_nn_ops.lrn_grad(grad, op.inputs[0], op.outputs[0], depth_radius,
                          bias, alpha, beta)
  ]


@ops.RegisterGradient("AvgPool")
def _AvgPoolGrad(op: ops.Operation, grad):
  return gen_nn_ops.avg_pool_grad(
      array_ops.shape(op.inputs[0]),
      grad,
      op.get_attr("ksize"),
      op.get_attr("strides"),
      op.get_attr("padding"),
      data_format=op.get_attr("data_format"))


@ops.RegisterGradient("AvgPoolGrad")
def _AvgPoolGradGrad(op: ops.Operation, grad):
  return (array_ops.stop_gradient(op.inputs[0]),
          gen_nn_ops.avg_pool(
              grad,
              op.get_attr("ksize"),
              op.get_attr("strides"),
              op.get_attr("padding"),
              data_format=op.get_attr("data_format")))


@ops.RegisterGradient("MaxPool")
def _MaxPoolGrad(op: ops.Operation, grad):
  return gen_nn_ops.max_pool_grad(
      op.inputs[0],
      op.outputs[0],
      grad,
      op.get_attr("ksize"),
      op.get_attr("strides"),
      padding=op.get_attr("padding"),
      explicit_paddings=op.get_attr("explicit_paddings"),
      data_format=op.get_attr("data_format"))


@ops.RegisterGradient("MaxPoolV2")
def _MaxPoolGradV2(op: ops.Operation, grad):
  ksize = op.inputs[1]
  strides = op.inputs[2]
  return gen_nn_ops.max_pool_grad_v2(
      op.inputs[0],
      op.outputs[0],
      grad,
      ksize,
      strides,
      padding=op.get_attr("padding"),
      data_format=op.get_attr("data_format")), None, None


@ops.RegisterGradient("MaxPoolWithArgmax")
def _MaxPoolGradWithArgmax(op: ops.Operation, grad, unused_argmax_grad):
  del unused_argmax_grad
  return gen_nn_ops.max_pool_grad_with_argmax(
      op.inputs[0],
      grad,
      op.outputs[1],
      op.get_attr("ksize"),
      op.get_attr("strides"),
      padding=op.get_attr("padding"),
      include_batch_in_index=op.get_attr("include_batch_in_index"),
  )


@ops.RegisterGradient("MaxPoolGrad")
def _MaxPoolGradGrad(op, grad):
  return (
      array_ops.zeros_like(op.inputs[0]),
      array_ops.zeros_like(op.inputs[1]),
      gen_nn_ops.max_pool_grad_grad(
          op.inputs[0],
          op.inputs[1],
          grad,
          op.get_attr("ksize"),
          op.get_attr("strides"),
          padding=op.get_attr("padding"),
          data_format=op.get_attr("data_format"),
      ),
  )


@ops.RegisterGradient("MaxPoolGradV2")
def _MaxPoolGradGradV2(op: ops.Operation, grad):
  ksize = op.inputs[3]
  strides = op.inputs[4]
  return (
      array_ops.zeros_like(op.inputs[0]),
      array_ops.zeros_like(op.inputs[1]),
      gen_nn_ops.max_pool_grad_grad_v2(
          op.inputs[0],
          op.inputs[1],
          grad,
          ksize,
          strides,
          padding=op.get_attr("padding"),
          data_format=op.get_attr("data_format"),
      ),
      None,
      None,
  )


@ops.RegisterGradient("MaxPoolGradGrad")
def _MaxPoolGradGradGrad(op: ops.Operation, grad):
  return (
      array_ops.zeros_like(op.inputs[0]),
      array_ops.zeros_like(op.inputs[1]),
      gen_nn_ops.max_pool_grad(
          op.inputs[0],
          op.inputs[1],
          grad,
          op.get_attr("ksize"),
          op.get_attr("strides"),
          padding=op.get_attr("padding"),
          data_format=op.get_attr("data_format"),
      ),
  )


@ops.RegisterGradient("FractionalMaxPool")
def _FractionalMaxPoolGrad(
    op: ops.Operation, grad_0, unused_grad_1, unused_grad_2
):
  """Returns gradient for FractionalMaxPool.

  Since FractionalMaxPool has three outputs, there are three gradients passed
  in for each of the outputs. Only the first one is useful, the other two
  gradients are empty.

  Args:
    op: The FractionalMaxPoolOp.
    grad_0: Gradient with respect to op.outputs[0]
    unused_grad_1: Gradient with respect to op.outputs[1]/row_seq. It is empty.
    unused_grad_2: Gradient with respect to op.outputs[2]/col_seq. It is empty.

  Returns:
    Input backprop for FractionalMaxPool op.
  """
  return gen_nn_ops.fractional_max_pool_grad(
      op.inputs[0],
      op.outputs[0],
      grad_0,
      op.outputs[1],
      op.outputs[2],
      op.get_attr("overlapping"),
  )


@ops.RegisterGradient("FractionalAvgPool")
def _FractionalAvgPoolGrad(
    op: ops.Operation, grad_0, unused_grad_1, unused_grad_2
):
  """Returns gradient for FractionalAvgPool.

  Since FractionalAvgPool has three outputs, there are three gradients passed
  in for each of the outputs. Only the first one is useful, the other two
  gradients are empty.

  Args:
    op: The FractionalAvgPoolOp.
    grad_0: Gradient with respect to op.outputs[0]
    unused_grad_1: Gradient with respect to op.outputs[1]/row_seq. It is empty.
    unused_grad_2: Gradient with respect to op.outputs[2]/col_seq. It is empty.

  Returns:
    Input backprop for FractionalAvgPool op.
  """
  return gen_nn_ops.fractional_avg_pool_grad(op.inputs[0].get_shape(), grad_0,
                                             op.outputs[1], op.outputs[2],
                                             op.get_attr("overlapping"))


@ops.RegisterGradient("BatchNormWithGlobalNormalization")
def _BatchNormWithGlobalNormalizationGrad(op: ops.Operation, grad):
  """Return the gradients for the 5 inputs of BatchNormWithGlobalNormalization.

  We do not backprop anything for the mean and var intentionally as they are
  not being trained with backprop in the operation.

  Args:
    op: The BatchNormOp for which we need to generate gradients.
    grad: Tensor. The gradients passed to the BatchNormOp.

  Returns:
    dx: Backprop for input, which is (grad * (g * rsqrt(v + epsilon)))
    dm: Backprop for mean, which is
        sum_over_rest(grad * g) * (-1 / rsqrt(v + epsilon))
    dv: Backprop for variance, which is
        sum_over_rest(grad * g * (x - m)) * (-1/2) * (v + epsilon) ^ (-3/2)
    db: Backprop for beta, which is grad reduced in all except the last
        dimension.
    dg: Backprop for gamma, which is (grad * ((x - m) * rsqrt(v + epsilon)))
  """
  dx, dm, dv, db, dg = gen_nn_ops.batch_norm_with_global_normalization_grad(
      op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[4], grad,
      op.get_attr("variance_epsilon"),
      op.get_attr("scale_after_normalization"))
  return dx, dm, dv, db, dg


def _BaseFusedBatchNormGrad(op: ops.Operation, version, *grad):
  """Return the gradients for the 3 inputs of BatchNorm.

  Args:
    op: The BatchNormOp for which we need to compute gradients.
    version: Integer indicating which version to use of the fused batch
      norm gradient.
    *grad: An argument list for tensors of gradients wrt the outputs
      with grad[0] as grad_y.

  Returns:
    grad_x: gradient for x, which is scale * rsqrt(variance + epsilon) *
            [grad_y - mean(grad_y) - (x - mean(x)) *
            mean(grad_y * (x - mean(x))) / (variance + epsilon)]
            in training mode; grad_y * scale * rsqrt(pop_variance + epsilon)
            in freeze mode.

    grad_scale: gradient for scale, which is sum(grad_y * (x - mean(x)) *
                rsqrt(variance + epsilon)) in training mode;
                sum(grad_y * (x - pop_mean) * rsqrt(pop_variance + epsilon))
                in freeze mode.

    grad_offset: gradient for offset, which is sum(grad_y) in training mode;
                 sum(grad_y) in freeze mode.
  """
  x = op.inputs[0]
  grad_y = grad[0]
  scale = op.inputs[1]
  epsilon = op.get_attr("epsilon")
  data_format = op.get_attr("data_format")
  is_training = op.get_attr("is_training")
  if version == 2:
    grad_fun = gen_nn_ops.fused_batch_norm_grad_v3
  elif version == 1:
    grad_fun = gen_nn_ops.fused_batch_norm_grad_v2
  else:
    grad_fun = gen_nn_ops.fused_batch_norm_grad
  if is_training:
    args = {
        "y_backprop": grad_y,
        "x": x,
        "scale": scale,
        "reserve_space_1": op.outputs[3],
        "reserve_space_2": op.outputs[4],
        "epsilon": epsilon,
        "data_format": data_format,
        "is_training": is_training
    }
    if version == 2:
      args["reserve_space_3"] = op.outputs[5]
    dx, dscale, doffset, _, _ = grad_fun(**args)
  else:
    pop_mean = op.inputs[3]
    pop_var = op.inputs[4]
    if data_format == b"NCHW":
      x = array_ops.transpose(x, [0, 2, 3, 1])
      grad_y = array_ops.transpose(grad_y, [0, 2, 3, 1])
    elif data_format == b"NCDHW":
      x = array_ops.transpose(x, [0, 2, 3, 4, 1])
      grad_y = array_ops.transpose(grad_y, [0, 2, 3, 4, 1])
    target_data_format = ("NHWC" if data_format in (b"NCHW",
                                                    b"NHWC") else "NDHWC")
    args = {
        "y_backprop": grad_y,
        "x": x,
        "scale": scale,
        "reserve_space_1": pop_mean,
        "reserve_space_2": pop_var,
        "epsilon": epsilon,
        "data_format": target_data_format,
        "is_training": is_training
    }
    if version == 2:
      args["reserve_space_3"] = op.outputs[5]
    dx, dscale, doffset, _, _ = grad_fun(**args)
    if data_format == b"NCHW":
      dx = array_ops.transpose(dx, [0, 3, 1, 2])
    elif data_format == b"NCDHW":
      dx = array_ops.transpose(dx, [0, 4, 1, 2, 3])
  return dx, dscale, doffset, None, None


@ops.RegisterGradient("FusedBatchNorm")
def _FusedBatchNormGrad(op: ops.Operation, *grad):
  return _BaseFusedBatchNormGrad(op, 0, *grad)


@ops.RegisterGradient("FusedBatchNormV2")
def _FusedBatchNormV2Grad(op: ops.Operation, *grad):
  return _BaseFusedBatchNormGrad(op, 1, *grad)


@ops.RegisterGradient("FusedBatchNormV3")
def _FusedBatchNormV3Grad(op: ops.Operation, *grad):
  return _BaseFusedBatchNormGrad(op, 2, *grad)


@ops.RegisterGradient("L2Loss")
def _L2LossGrad(op: ops.Operation, grad):
  """Return the gradients for L2Loss.

  Args:
    op: The L2LossOp for which we need to generate gradients.
    grad: Tensor containing a single number.

  Returns:
    The gradient, which is (x * grad).
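    (L2Loss computes sum(x ** 2) / 2, so its derivative with respect to x is
    simply x, scaled by the incoming grad.)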
""" return op.inputs[0] * grad @ops.RegisterGradient("TopK") @ops.RegisterGradient("TopKV2") def _TopKGrad(op: ops.Operation, grad, _): """Return the gradients for TopK. Args: op: The TopKOp for which we need to generate gradients. grad: Tensor. The gradients passed to the TopKOp. Returns: A list of two tensors, the first being the gradient w.r.t to the input and TopK, and the second being the gradient w.r.t. to the indices (all zero). """ in_shape = array_ops.shape(op.inputs[0]) ind_shape = array_ops.shape(op.outputs[1]) # int32 is not supported on GPU hence up-casting ind_lastdim = array_ops.gather( math_ops.cast(ind_shape, dtypes.int64), array_ops.size(ind_shape) - 1) # Flatten indices to 2D. ind_2d = array_ops.reshape( op.outputs[1], array_ops_stack.stack([-1, ind_lastdim])) in_lastdim = array_ops.gather( math_ops.cast(in_shape, dtypes.int64), array_ops.size(in_shape) - 1) outerdim = array_ops.shape(ind_2d)[0] # Compute linear indices (flattened to 1D). ind = array_ops.reshape( ind_2d + math_ops.cast( array_ops.expand_dims( math_ops.range(0, math_ops.cast(outerdim, dtypes.int64) * in_lastdim, in_lastdim), -1), dtypes.int32), [-1]) # Substitute grad to appropriate locations and fill the rest with zeros, # finally reshaping it to the original input shape. return [ array_ops.reshape( array_ops.scatter_nd( array_ops.expand_dims(ind, -1), array_ops.reshape(grad, [-1]), [math_ops.reduce_prod(in_shape)]), in_shape), array_ops.zeros([], dtype=dtypes.int32) ] @ops.RegisterGradient("ApproxTopK") def _ApproxTopKGradient(op: ops.Operation, grad, _): """Return the gradients for ApproxTopK. Args: op: The ApproxTopK for which we need to generate gradients. grad: The gradients for backprop. Returns: Scattered gradient based on the top-k indices. """ # The code below is to generate the correct index and value mapping for # scatter_nd to work properly. # # We use static evaluations as much as possible to reduce the runtime cost. # That's said, use operation.shape instead of array_ops.shape; # and use functools.reduce(operator.mul, ...) instead of math_ops.reduce_prod idx_shape = op.outputs[1].shape lifted_idx_shape = idx_shape + [1] flat_shape_len = functools.reduce(operator.mul, idx_shape) rank = idx_shape.rank reduction_dim = op.get_attr("reduction_dimension") if reduction_dim < 0: reduction_dim = rank + reduction_dim def GetLiftedIdx(d): if d == reduction_dim: return array_ops.reshape(op.outputs[1], lifted_idx_shape) iota_len = idx_shape[d] iota_shape = list(itertools.repeat(1, rank + 1)) iota_shape[d] = iota_len iota = array_ops.reshape(math_ops.range(iota_len), iota_shape) return array_ops.broadcast_to(iota, lifted_idx_shape) lifted_idx = array_ops.concat( list(GetLiftedIdx(d) for d in range(rank)), axis=rank) flat_idx = array_ops.reshape(lifted_idx, [flat_shape_len, rank]) flat_grad = array_ops.reshape(grad, [flat_shape_len]) return array_ops.scatter_nd(flat_idx, flat_grad, op.inputs[0].shape) @ops.RegisterGradient("NthElement") def _NthElementGrad(op: ops.Operation, grad): """Return the gradients for NthElement. Args: op: The NthElementOp for which we need to generate gradients. grad: Tensor. The gradients passed to the NthElementOp Returns: A list of two tensors, the first being the gradient w.r.t. the input, the second being the gradient w.r.t. the N (None). """ input = op.inputs[0] # pylint: disable=redefined-builtin output = op.outputs[0] # Compute the number of elements which equal to output in each reduction # dimension. 
  # If there are multiple elements then the gradient will be divided between
  # them.
  indicators = math_ops.cast(
      math_ops.equal(array_ops.expand_dims(output, -1), input), grad.dtype)

  grad = array_ops.expand_dims(grad, -1)
  num_selected = array_ops.expand_dims(math_ops.reduce_sum(indicators, -1), -1)

  return [math_ops.divide(indicators, num_selected) * grad, None]


def _MeanAggregator(inputs, segments):
  """Replaces each segment with its mean along the last axis.

  Specifically, each value in the `inputs` tensor gets replaced by the mean
  value computed from the values that belong to the same segment.

  Args:
    inputs: A 2-tensor. Aggregation is done over dimension 1.
    segments: A 2-tensor, same shape as `inputs`.

  Returns:
    The result, same shape and type as `inputs`.
  """
  result = []
  for inputs_i, segments_i in zip(
      array_ops.split(inputs, inputs.shape[0]),
      array_ops.split(segments, segments.shape[0])):
    # Note that we do not use tf.math.segment_mean, as it has no TPU support.
    means_i = math_ops.unsorted_segment_mean(
        inputs_i, segments_i, num_segments=math_ops.reduce_max(segments_i) + 1)
    result.append(
        array_ops.reshape(array_ops.gather(means_i, segments_i), [-1]))
  return array_ops_stack.stack(result, axis=0)


# We have to register the gradients for these ops so that tensorflow will know
# how to differentiate them.
@ops.RegisterGradient("IsotonicRegression")
def _IsotonicRegressionGrad(op: ops.Operation, grad_output, grad_segments):
  """Gradient for the isotonic regression function.

  Args:
    op: The IsotonicRegression tensorflow op.
    grad_output: Tensor of incoming gradients with respect to the output.
    grad_segments: Tensor of incoming gradients with respect to the segments.

  Returns:
    A tensor, same size as `grad_output` with the gradient with respect to
    the input.
  """
  del grad_segments  # Discrete, non-differentiable.
  segments = op.outputs[1]
  return _MeanAggregator(grad_output, segments)
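
# A quick way to sanity-check any of the gradients registered in this module
# is to compare analytic and numeric Jacobians, e.g. (illustrative sketch
# assuming a TF2 environment with `tf` imported; not executed here):
#
#   theoretical, numerical = tf.test.compute_gradient(
#       tf.nn.softmax, [tf.random.normal([3, 5])])
#   # The two Jacobians should agree to within numerical tolerance.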