429 lines
17 KiB
Python
429 lines
17 KiB
Python
|
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||
|
#
|
||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
# you may not use this file except in compliance with the License.
|
||
|
# You may obtain a copy of the License at
|
||
|
#
|
||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
#
|
||
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
# See the License for the specific language governing permissions and
|
||
|
# limitations under the License.
|
||
|
# ==============================================================================
|
||
|
"""Grouping dataset transformations."""
|
||
|
from tensorflow.python.data.ops import dataset_ops
|
||
|
from tensorflow.python.data.ops import structured_function
|
||
|
from tensorflow.python.data.util import nest
|
||
|
from tensorflow.python.data.util import structure
|
||
|
from tensorflow.python.framework import dtypes
|
||
|
from tensorflow.python.framework import ops
|
||
|
from tensorflow.python.framework import tensor_spec
|
||
|
from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
|
||
|
from tensorflow.python.util import deprecation
|
||
|
from tensorflow.python.util.tf_export import tf_export
|
||
|
|
||
|
|
||
|
@tf_export("data.experimental.group_by_reducer")
|
||
|
def group_by_reducer(key_func, reducer):
|
||
|
"""A transformation that groups elements and performs a reduction.
|
||
|
|
||
|
This transformation maps element of a dataset to a key using `key_func` and
|
||
|
groups the elements by key. The `reducer` is used to process each group; its
|
||
|
`init_func` is used to initialize state for each group when it is created, the
|
||
|
`reduce_func` is used to update the state every time an element is mapped to
|
||
|
the matching group, and the `finalize_func` is used to map the final state to
|
||
|
an output value.
|
||
|
|
||
|
Args:
|
||
|
key_func: A function mapping a nested structure of tensors
|
||
|
(having shapes and types defined by `self.output_shapes` and
|
||
|
`self.output_types`) to a scalar `tf.int64` tensor.
|
||
|
reducer: An instance of `Reducer`, which captures the reduction logic using
|
||
|
the `init_func`, `reduce_func`, and `finalize_func` functions.
|
||
|
|
||
|
Returns:
|
||
|
A `Dataset` transformation function, which can be passed to
|
||
|
`tf.data.Dataset.apply`.
|
||
|
"""
|
||
|
|
||
|
def _apply_fn(dataset):
|
||
|
"""Function from `Dataset` to `Dataset` that applies the transformation."""
|
||
|
return _GroupByReducerDataset(dataset, key_func, reducer)
|
||
|
|
||
|
return _apply_fn
|
||
|
|
||
|
|
||
|
@deprecation.deprecated(None, "Use `tf.data.Dataset.group_by_window(...)`.")
|
||
|
@tf_export("data.experimental.group_by_window")
|
||
|
def group_by_window(key_func,
|
||
|
reduce_func,
|
||
|
window_size=None,
|
||
|
window_size_func=None):
|
||
|
"""A transformation that groups windows of elements by key and reduces them.
|
||
|
|
||
|
This transformation maps each consecutive element in a dataset to a key
|
||
|
using `key_func` and groups the elements by key. It then applies
|
||
|
`reduce_func` to at most `window_size_func(key)` elements matching the same
|
||
|
key. All except the final window for each key will contain
|
||
|
`window_size_func(key)` elements; the final window may be smaller.
|
||
|
|
||
|
You may provide either a constant `window_size` or a window size determined by
|
||
|
the key through `window_size_func`.
|
||
|
|
||
|
Args:
|
||
|
key_func: A function mapping a nested structure of tensors
|
||
|
(having shapes and types defined by `self.output_shapes` and
|
||
|
`self.output_types`) to a scalar `tf.int64` tensor.
|
||
|
reduce_func: A function mapping a key and a dataset of up to `window_size`
|
||
|
consecutive elements matching that key to another dataset.
|
||
|
window_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
|
||
|
consecutive elements matching the same key to combine in a single
|
||
|
batch, which will be passed to `reduce_func`. Mutually exclusive with
|
||
|
`window_size_func`.
|
||
|
window_size_func: A function mapping a key to a `tf.int64` scalar
|
||
|
`tf.Tensor`, representing the number of consecutive elements matching
|
||
|
the same key to combine in a single batch, which will be passed to
|
||
|
`reduce_func`. Mutually exclusive with `window_size`.
|
||
|
|
||
|
Returns:
|
||
|
A `Dataset` transformation function, which can be passed to
|
||
|
`tf.data.Dataset.apply`.
|
||
|
|
||
|
Raises:
|
||
|
ValueError: if neither or both of {`window_size`, `window_size_func`} are
|
||
|
passed.
|
||
|
"""
|
||
|
|
||
|
def _apply_fn(dataset):
|
||
|
"""Function from `Dataset` to `Dataset` that applies the transformation."""
|
||
|
return dataset.group_by_window(
|
||
|
key_func=key_func,
|
||
|
reduce_func=reduce_func,
|
||
|
window_size=window_size,
|
||
|
window_size_func=window_size_func)
|
||
|
|
||
|
return _apply_fn
|
||
|
|
||
|
|
||
|
@deprecation.deprecated(None,
|
||
|
"Use `tf.data.Dataset.bucket_by_sequence_length(...)`.")
|
||
|
@tf_export("data.experimental.bucket_by_sequence_length")
|
||
|
def bucket_by_sequence_length(element_length_func,
|
||
|
bucket_boundaries,
|
||
|
bucket_batch_sizes,
|
||
|
padded_shapes=None,
|
||
|
padding_values=None,
|
||
|
pad_to_bucket_boundary=False,
|
||
|
no_padding=False,
|
||
|
drop_remainder=False):
|
||
|
"""A transformation that buckets elements in a `Dataset` by length.
|
||
|
|
||
|
Elements of the `Dataset` are grouped together by length and then are padded
|
||
|
and batched.
|
||
|
|
||
|
This is useful for sequence tasks in which the elements have variable length.
|
||
|
Grouping together elements that have similar lengths reduces the total
|
||
|
fraction of padding in a batch which increases training step efficiency.
|
||
|
|
||
|
Below is an example to bucketize the input data to the 3 buckets
|
||
|
"[0, 3), [3, 5), [5, inf)" based on sequence length, with batch size 2.
|
||
|
|
||
|
>>> elements = [
|
||
|
... [0], [1, 2, 3, 4], [5, 6, 7],
|
||
|
... [7, 8, 9, 10, 11], [13, 14, 15, 16, 19, 20], [21, 22]]
|
||
|
|
||
|
>>> dataset = tf.data.Dataset.from_generator(
|
||
|
... lambda: elements, tf.int64, output_shapes=[None])
|
||
|
|
||
|
>>> dataset = dataset.apply(
|
||
|
... tf.data.experimental.bucket_by_sequence_length(
|
||
|
... element_length_func=lambda elem: tf.shape(elem)[0],
|
||
|
... bucket_boundaries=[3, 5],
|
||
|
... bucket_batch_sizes=[2, 2, 2]))
|
||
|
|
||
|
>>> for elem in dataset.as_numpy_iterator():
|
||
|
... print(elem)
|
||
|
[[1 2 3 4]
|
||
|
[5 6 7 0]]
|
||
|
[[ 7 8 9 10 11 0]
|
||
|
[13 14 15 16 19 20]]
|
||
|
[[ 0 0]
|
||
|
[21 22]]
|
||
|
|
||
|
There is also a possibility to pad the dataset till the bucket boundary.
|
||
|
You can also provide which value to be used while padding the data.
|
||
|
Below example uses `-1` as padding and it also shows the input data
|
||
|
being bucketizied to two buckets "[0,3], [4,6]".
|
||
|
|
||
|
>>> elements = [
|
||
|
... [0], [1, 2, 3, 4], [5, 6, 7],
|
||
|
... [7, 8, 9, 10, 11], [13, 14, 15, 16, 19, 20], [21, 22]]
|
||
|
|
||
|
>>> dataset = tf.data.Dataset.from_generator(
|
||
|
... lambda: elements, tf.int32, output_shapes=[None])
|
||
|
|
||
|
>>> dataset = dataset.apply(
|
||
|
... tf.data.experimental.bucket_by_sequence_length(
|
||
|
... element_length_func=lambda elem: tf.shape(elem)[0],
|
||
|
... bucket_boundaries=[4, 7],
|
||
|
... bucket_batch_sizes=[2, 2, 2],
|
||
|
... pad_to_bucket_boundary=True,
|
||
|
... padding_values=-1))
|
||
|
|
||
|
>>> for elem in dataset.as_numpy_iterator():
|
||
|
... print(elem)
|
||
|
[[ 0 -1 -1]
|
||
|
[ 5 6 7]]
|
||
|
[[ 1 2 3 4 -1 -1]
|
||
|
[ 7 8 9 10 11 -1]]
|
||
|
[[21 22 -1]]
|
||
|
[[13 14 15 16 19 20]]
|
||
|
|
||
|
When using `pad_to_bucket_boundary` option, it can be seen that it is
|
||
|
not always possible to maintain the bucket batch size.
|
||
|
You can drop the batches that do not maintain the bucket batch size by
|
||
|
using the option `drop_remainder`. Using the same input data as in the
|
||
|
above example you get the following result.
|
||
|
|
||
|
>>> elements = [
|
||
|
... [0], [1, 2, 3, 4], [5, 6, 7],
|
||
|
... [7, 8, 9, 10, 11], [13, 14, 15, 16, 19, 20], [21, 22]]
|
||
|
|
||
|
>>> dataset = tf.data.Dataset.from_generator(
|
||
|
... lambda: elements, tf.int32, output_shapes=[None])
|
||
|
|
||
|
>>> dataset = dataset.apply(
|
||
|
... tf.data.experimental.bucket_by_sequence_length(
|
||
|
... element_length_func=lambda elem: tf.shape(elem)[0],
|
||
|
... bucket_boundaries=[4, 7],
|
||
|
... bucket_batch_sizes=[2, 2, 2],
|
||
|
... pad_to_bucket_boundary=True,
|
||
|
... padding_values=-1,
|
||
|
... drop_remainder=True))
|
||
|
|
||
|
>>> for elem in dataset.as_numpy_iterator():
|
||
|
... print(elem)
|
||
|
[[ 0 -1 -1]
|
||
|
[ 5 6 7]]
|
||
|
[[ 1 2 3 4 -1 -1]
|
||
|
[ 7 8 9 10 11 -1]]
|
||
|
|
||
|
Args:
|
||
|
element_length_func: function from element in `Dataset` to `tf.int32`,
|
||
|
determines the length of the element, which will determine the bucket it
|
||
|
goes into.
|
||
|
bucket_boundaries: `list<int>`, upper length boundaries of the buckets.
|
||
|
bucket_batch_sizes: `list<int>`, batch size per bucket. Length should be
|
||
|
`len(bucket_boundaries) + 1`.
|
||
|
padded_shapes: Nested structure of `tf.TensorShape` to pass to
|
||
|
`tf.data.Dataset.padded_batch`. If not provided, will use
|
||
|
`dataset.output_shapes`, which will result in variable length dimensions
|
||
|
being padded out to the maximum length in each batch.
|
||
|
padding_values: Values to pad with, passed to
|
||
|
`tf.data.Dataset.padded_batch`. Defaults to padding with 0.
|
||
|
pad_to_bucket_boundary: bool, if `False`, will pad dimensions with unknown
|
||
|
size to maximum length in batch. If `True`, will pad dimensions with
|
||
|
unknown size to bucket boundary minus 1 (i.e., the maximum length in each
|
||
|
bucket), and caller must ensure that the source `Dataset` does not contain
|
||
|
any elements with length longer than `max(bucket_boundaries)`.
|
||
|
no_padding: `bool`, indicates whether to pad the batch features (features
|
||
|
need to be either of type `tf.sparse.SparseTensor` or of same shape).
|
||
|
drop_remainder: (Optional.) A `tf.bool` scalar `tf.Tensor`, representing
|
||
|
whether the last batch should be dropped in the case it has fewer than
|
||
|
`batch_size` elements; the default behavior is not to drop the smaller
|
||
|
batch.
|
||
|
|
||
|
Returns:
|
||
|
A `Dataset` transformation function, which can be passed to
|
||
|
`tf.data.Dataset.apply`.
|
||
|
|
||
|
Raises:
|
||
|
ValueError: if `len(bucket_batch_sizes) != len(bucket_boundaries) + 1`.
|
||
|
"""
|
||
|
|
||
|
def _apply_fn(dataset):
|
||
|
return dataset.bucket_by_sequence_length(
|
||
|
element_length_func=element_length_func,
|
||
|
bucket_boundaries=bucket_boundaries,
|
||
|
bucket_batch_sizes=bucket_batch_sizes,
|
||
|
padded_shapes=padded_shapes,
|
||
|
padding_values=padding_values,
|
||
|
pad_to_bucket_boundary=pad_to_bucket_boundary,
|
||
|
no_padding=no_padding,
|
||
|
drop_remainder=drop_remainder)
|
||
|
|
||
|
return _apply_fn
|
||
|
|
||
|
|
||
|
class _GroupByReducerDataset(dataset_ops.UnaryDataset):
|
||
|
"""A `Dataset` that groups its input and performs a reduction."""
|
||
|
|
||
|
def __init__(self, input_dataset, key_func, reducer):
|
||
|
"""See `group_by_reducer()` for details."""
|
||
|
self._input_dataset = input_dataset
|
||
|
self._make_key_func(key_func, input_dataset)
|
||
|
self._make_init_func(reducer.init_func)
|
||
|
self._make_reduce_func(reducer.reduce_func, input_dataset)
|
||
|
self._make_finalize_func(reducer.finalize_func)
|
||
|
variant_tensor = ged_ops.experimental_group_by_reducer_dataset(
|
||
|
self._input_dataset._variant_tensor, # pylint: disable=protected-access
|
||
|
self._key_func.function.captured_inputs,
|
||
|
self._init_func.function.captured_inputs,
|
||
|
self._reduce_func.function.captured_inputs,
|
||
|
self._finalize_func.function.captured_inputs,
|
||
|
key_func=self._key_func.function,
|
||
|
init_func=self._init_func.function,
|
||
|
reduce_func=self._reduce_func.function,
|
||
|
finalize_func=self._finalize_func.function,
|
||
|
**self._flat_structure)
|
||
|
super(_GroupByReducerDataset, self).__init__(input_dataset, variant_tensor)
|
||
|
|
||
|
def _make_key_func(self, key_func, input_dataset):
|
||
|
"""Make wrapping defun for key_func."""
|
||
|
self._key_func = structured_function.StructuredFunctionWrapper(
|
||
|
key_func, self._transformation_name(), dataset=input_dataset)
|
||
|
if not self._key_func.output_structure.is_compatible_with(
|
||
|
tensor_spec.TensorSpec([], dtypes.int64)):
|
||
|
raise ValueError(
|
||
|
f"Invalid `key_func`. Expected `key_func` to return a scalar "
|
||
|
f"tf.int64 tensor, but instead `key_func` has output "
|
||
|
f"types={self._key_func.output_types} "
|
||
|
f"and shapes={self._key_func.output_shapes}."
|
||
|
)
|
||
|
|
||
|
def _make_init_func(self, init_func):
|
||
|
"""Make wrapping defun for init_func."""
|
||
|
self._init_func = structured_function.StructuredFunctionWrapper(
|
||
|
init_func,
|
||
|
self._transformation_name(),
|
||
|
input_structure=tensor_spec.TensorSpec([], dtypes.int64))
|
||
|
|
||
|
def _make_reduce_func(self, reduce_func, input_dataset):
|
||
|
"""Make wrapping defun for reduce_func."""
|
||
|
|
||
|
# Iteratively rerun the reduce function until reaching a fixed point on
|
||
|
# `self._state_structure`.
|
||
|
self._state_structure = self._init_func.output_structure
|
||
|
state_types = self._init_func.output_types
|
||
|
state_shapes = self._init_func.output_shapes
|
||
|
state_classes = self._init_func.output_classes
|
||
|
need_to_rerun = True
|
||
|
while need_to_rerun:
|
||
|
|
||
|
wrapped_func = structured_function.StructuredFunctionWrapper(
|
||
|
reduce_func,
|
||
|
self._transformation_name(),
|
||
|
input_structure=(self._state_structure, input_dataset.element_spec),
|
||
|
add_to_graph=False)
|
||
|
|
||
|
# Extract and validate class information from the returned values.
|
||
|
for new_state_class, state_class in zip(
|
||
|
nest.flatten(wrapped_func.output_classes),
|
||
|
nest.flatten(state_classes)):
|
||
|
if not issubclass(new_state_class, state_class):
|
||
|
raise TypeError(
|
||
|
f"Invalid `reducer`. The output class of the "
|
||
|
f"`reducer.reduce_func` {wrapped_func.output_classes}, "
|
||
|
f"does not match the class of the reduce state "
|
||
|
f"{self._state_classes}.")
|
||
|
|
||
|
# Extract and validate type information from the returned values.
|
||
|
for new_state_type, state_type in zip(
|
||
|
nest.flatten(wrapped_func.output_types), nest.flatten(state_types)):
|
||
|
if new_state_type != state_type:
|
||
|
raise TypeError(
|
||
|
f"Invalid `reducer`. The element types for the new state "
|
||
|
f"{wrapped_func.output_types} do not match the element types "
|
||
|
f"of the old state {self._init_func.output_types}."
|
||
|
)
|
||
|
|
||
|
# Extract shape information from the returned values.
|
||
|
flat_state_shapes = nest.flatten(state_shapes)
|
||
|
flat_new_state_shapes = nest.flatten(wrapped_func.output_shapes)
|
||
|
weakened_state_shapes = [
|
||
|
original.most_specific_compatible_shape(new)
|
||
|
for original, new in zip(flat_state_shapes, flat_new_state_shapes)
|
||
|
]
|
||
|
|
||
|
need_to_rerun = False
|
||
|
for original_shape, weakened_shape in zip(flat_state_shapes,
|
||
|
weakened_state_shapes):
|
||
|
if original_shape.ndims is not None and (
|
||
|
weakened_shape.ndims is None or
|
||
|
original_shape.as_list() != weakened_shape.as_list()):
|
||
|
need_to_rerun = True
|
||
|
break
|
||
|
|
||
|
if need_to_rerun:
|
||
|
state_shapes = nest.pack_sequence_as(
|
||
|
self._init_func.output_shapes, weakened_state_shapes)
|
||
|
self._state_structure = structure.convert_legacy_structure(
|
||
|
state_types, state_shapes, state_classes)
|
||
|
|
||
|
self._reduce_func = wrapped_func
|
||
|
self._reduce_func.function.add_to_graph(ops.get_default_graph())
|
||
|
|
||
|
def _make_finalize_func(self, finalize_func):
|
||
|
"""Make wrapping defun for finalize_func."""
|
||
|
self._finalize_func = structured_function.StructuredFunctionWrapper(
|
||
|
finalize_func,
|
||
|
self._transformation_name(),
|
||
|
input_structure=self._state_structure)
|
||
|
|
||
|
@property
|
||
|
def element_spec(self):
|
||
|
return self._finalize_func.output_structure
|
||
|
|
||
|
def _functions(self):
|
||
|
return [
|
||
|
self._key_func, self._init_func, self._reduce_func, self._finalize_func
|
||
|
]
|
||
|
|
||
|
def _transformation_name(self):
|
||
|
return "tf.data.experimental.group_by_reducer()"
|
||
|
|
||
|
|
||
|
@tf_export("data.experimental.Reducer")
|
||
|
class Reducer:
|
||
|
"""A reducer is used for reducing a set of elements.
|
||
|
|
||
|
A reducer is represented as a tuple of the three functions:
|
||
|
- init_func - to define initial value: key => initial state
|
||
|
- reducer_func - operation to perform on values with same key: (old state, input) => new state
|
||
|
- finalize_func - value to return in the end: state => result
|
||
|
|
||
|
For example,
|
||
|
|
||
|
```
|
||
|
def init_func(_):
|
||
|
return (0.0, 0.0)
|
||
|
|
||
|
def reduce_func(state, value):
|
||
|
return (state[0] + value['features'], state[1] + 1)
|
||
|
|
||
|
def finalize_func(s, n):
|
||
|
return s / n
|
||
|
|
||
|
reducer = tf.data.experimental.Reducer(init_func, reduce_func, finalize_func)
|
||
|
```
|
||
|
"""
|
||
|
|
||
|
def __init__(self, init_func, reduce_func, finalize_func):
|
||
|
self._init_func = init_func
|
||
|
self._reduce_func = reduce_func
|
||
|
self._finalize_func = finalize_func
|
||
|
|
||
|
@property
|
||
|
def init_func(self):
|
||
|
return self._init_func
|
||
|
|
||
|
@property
|
||
|
def reduce_func(self):
|
||
|
return self._reduce_func
|
||
|
|
||
|
@property
|
||
|
def finalize_func(self):
|
||
|
return self._finalize_func
|