# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Parsing Ops."""
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_assert
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import gen_parsing_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import parsing_config
# Ensure parsing_ops gradients are registered
from tensorflow.python.ops import parsing_grad  # pylint: disable=unused-import
# go/tf-wildcard-import
# pylint: disable=wildcard-import,undefined-variable
from tensorflow.python.ops.gen_parsing_ops import *
# pylint: enable=wildcard-import,undefined-variable
from tensorflow.python.util import deprecation
from tensorflow.python.util import dispatch
from tensorflow.python.util.tf_export import tf_export

VarLenFeature = parsing_config.VarLenFeature
RaggedFeature = parsing_config.RaggedFeature
SparseFeature = parsing_config.SparseFeature
FixedLenFeature = parsing_config.FixedLenFeature
FixedLenSequenceFeature = parsing_config.FixedLenSequenceFeature
# pylint: disable=protected-access
_ParseOpParams = parsing_config._ParseOpParams
_construct_tensors_for_composite_features = (
    parsing_config._construct_tensors_for_composite_features)
# pylint: enable=protected-access


# TODO(b/122887740) Switch files that use this private symbol to use new name.
_construct_sparse_tensors_for_sparse_features = \
    _construct_tensors_for_composite_features


def _prepend_none_dimension(features):
  """Returns a copy of features with adjusted FixedLenSequenceFeature shapes."""
  if features:
    modified_features = dict(features)  # Create a copy to modify
    for key, feature in features.items():
      if isinstance(feature, FixedLenSequenceFeature):
        if not feature.allow_missing:
          raise ValueError("Unsupported: FixedLenSequenceFeature requires "
                           "allow_missing to be True.")
        modified_features[key] = FixedLenSequenceFeature(
            [None] + list(feature.shape),
            feature.dtype,
            feature.allow_missing,
            feature.default_value)
    return modified_features
  else:
    return features
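# For illustration, with a hypothetical feature map (not executed here): given
#   {"tokens": FixedLenSequenceFeature([5], tf.int64, allow_missing=True)}
# _prepend_none_dimension returns
#   {"tokens": FixedLenSequenceFeature([None, 5], tf.int64, True, None)},
# i.e. each dense shape gains a leading variable-length (sequence) dimension.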


@tf_export("io.parse_example", v1=[])
@dispatch.add_dispatch_support
def parse_example_v2(serialized, features, example_names=None, name=None):
  # pylint: disable=line-too-long
  """Parses `Example` protos into a `dict` of tensors.

  Parses a number of serialized [`Example`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
  protos given in `serialized`. We refer to `serialized` as a batch with
  `batch_size` many entries of individual `Example` protos.

  `example_names` may contain descriptive names for the corresponding serialized
  protos. These may be useful for debugging purposes, but they have no effect on
  the output. If not `None`, `example_names` must be the same length as
  `serialized`.

  This op parses serialized examples into a dictionary mapping keys to `Tensor`,
  `SparseTensor`, and `RaggedTensor` objects. `features` is a mapping from keys
  to `VarLenFeature`, `SparseFeature`, `RaggedFeature`, and `FixedLenFeature`
  objects. Each `VarLenFeature` and `SparseFeature` is mapped to a
  `SparseTensor`; each `FixedLenFeature` is mapped to a `Tensor`; and each
  `RaggedFeature` is mapped to a `RaggedTensor`.

  Each `VarLenFeature` maps to a `SparseTensor` of the specified type
  representing a ragged matrix. Its indices are `[batch, index]` where `batch`
  identifies the example in `serialized`, and `index` is the value's index in
  the list of values associated with that feature and example.

  Each `SparseFeature` maps to a `SparseTensor` of the specified type
  representing a Tensor of `dense_shape` `[batch_size] + SparseFeature.size`.
  Its `values` come from the feature in the examples with key `value_key`.
  A `values[i]` comes from a position `k` in the feature of an example at batch
  entry `batch`. This positional information is recorded in `indices[i]` as
  `[batch, index_0, index_1, ...]` where `index_j` is the `k`-th value of
  the feature with key `SparseFeature.index_key[j]` in that same example.
  In other words, we split the indices (except the first index indicating the
  batch entry) of a `SparseTensor` by dimension into different features of the
  `Example`. Due to the complexity of `SparseFeature`, a `VarLenFeature` should
  be preferred whenever possible.

  Each `FixedLenFeature` `df` maps to a `Tensor` of the specified type (or
  `tf.float32` if not specified) and shape `(serialized.size(),) + df.shape`.

  `FixedLenFeature` entries with a `default_value` are optional. With no default
  value, we will fail if that `Feature` is missing from any example in
  `serialized`.

  Each `FixedLenSequenceFeature` `df` maps to a `Tensor` of the specified type
  (or `tf.float32` if not specified) and shape
  `(serialized.size(), None) + df.shape`.
  All examples in `serialized` will be padded with `default_value` along the
  second dimension.

  Each `RaggedFeature` maps to a `RaggedTensor` of the specified type. It
  is formed by stacking the `RaggedTensor` for each example, where the
  `RaggedTensor` for each individual example is constructed using the tensors
  specified by `RaggedFeature.value_key` and `RaggedFeature.partitions`. See
  the `tf.io.RaggedFeature` documentation for details and examples.

  Examples:

  For example, if one expects a `tf.float32` `VarLenFeature` `ft` and three
  serialized `Example`s are provided:

  ```
  serialized = [
    features
      { feature { key: "ft" value { float_list { value: [1.0, 2.0] } } } },
    features
      { feature []},
    features
      { feature { key: "ft" value { float_list { value: [3.0] } } } }
  ]
  ```

  then the output will look like:

  ```python
  {"ft": SparseTensor(indices=[[0, 0], [0, 1], [2, 0]],
                      values=[1.0, 2.0, 3.0],
                      dense_shape=(3, 2)) }
  ```

  If instead a `FixedLenSequenceFeature` with `default_value = -1.0` and
  `shape=[]` is used then the output will look like:

  ```python
  {"ft": [[1.0, 2.0], [-1.0, -1.0], [3.0, -1.0]]}
  ```

  Given two `Example` input protos in `serialized`:

  ```
  [
    features {
      feature { key: "kw" value { bytes_list { value: [ "knit", "big" ] } } }
      feature { key: "gps" value { float_list { value: [] } } }
    },
    features {
      feature { key: "kw" value { bytes_list { value: [ "emmy" ] } } }
      feature { key: "dank" value { int64_list { value: [ 42 ] } } }
      feature { key: "gps" value { } }
    }
  ]
  ```

  And arguments

  ```
  example_names: ["input0", "input1"],
  features: {
      "kw": VarLenFeature(tf.string),
      "dank": VarLenFeature(tf.int64),
      "gps": VarLenFeature(tf.float32),
  }
  ```

  Then the output is a dictionary:

  ```python
  {
    "kw": SparseTensor(
        indices=[[0, 0], [0, 1], [1, 0]],
        values=["knit", "big", "emmy"],
        dense_shape=[2, 2]),
    "dank": SparseTensor(
        indices=[[1, 0]],
        values=[42],
        dense_shape=[2, 1]),
    "gps": SparseTensor(
        indices=[],
        values=[],
        dense_shape=[2, 0]),
  }
  ```

  For dense results in two serialized `Example`s:

  ```
  [
    features {
      feature { key: "age" value { int64_list { value: [ 0 ] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
    },
    features {
      feature { key: "age" value { int64_list { value: [] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
    }
  ]
  ```

  We can use arguments:

  ```
  example_names: ["input0", "input1"],
  features: {
      "age": FixedLenFeature([], dtype=tf.int64, default_value=-1),
      "gender": FixedLenFeature([], dtype=tf.string),
  }
  ```

  And the expected output is:

  ```python
  {
    "age": [0, -1],
    "gender": ["f", "f"],
  }
  ```

  An alternative to `VarLenFeature` to obtain a `SparseTensor` is
  `SparseFeature`. For example, given two `Example` input protos in
  `serialized`:

  ```
  [
    features {
      feature { key: "val" value { float_list { value: [ 0.5, -1.0 ] } } }
      feature { key: "ix" value { int64_list { value: [ 3, 20 ] } } }
    },
    features {
      feature { key: "val" value { float_list { value: [ 0.0 ] } } }
      feature { key: "ix" value { int64_list { value: [ 42 ] } } }
    }
  ]
  ```

  And arguments

  ```
  example_names: ["input0", "input1"],
  features: {
      "sparse": SparseFeature(
          index_key="ix", value_key="val", dtype=tf.float32, size=100),
  }
  ```

  Then the output is a dictionary:

  ```python
  {
    "sparse": SparseTensor(
        indices=[[0, 3], [0, 20], [1, 42]],
        values=[0.5, -1.0, 0.0],
        dense_shape=[2, 100]),
  }
  ```

  See the `tf.io.RaggedFeature` documentation for examples showing how
  `RaggedFeature` can be used to obtain `RaggedTensor`s.
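  As a minimal end-to-end sketch (assuming eager execution; the `"ft"` key is
  the same illustrative feature as above), a serialized proto can be built with
  `tf.train.Example` and parsed back directly:

  ```python
  example = tf.train.Example(features=tf.train.Features(feature={
      "ft": tf.train.Feature(float_list=tf.train.FloatList(value=[1.0, 2.0]))}))
  parsed = tf.io.parse_example(
      serialized=[example.SerializeToString()],
      features={"ft": tf.io.VarLenFeature(tf.float32)})
  # parsed["ft"] is a SparseTensor with values [1.0, 2.0], dense_shape [1, 2].
  ```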

  Args:
    serialized: A vector (1-D Tensor) of strings, a batch of binary
      serialized `Example` protos.
    features: A mapping of feature keys to `FixedLenFeature`,
      `VarLenFeature`, `SparseFeature`, and `RaggedFeature` values.
    example_names: A vector (1-D Tensor) of strings (optional), the names of
      the serialized protos in the batch.
    name: A name for this operation (optional).

  Returns:
    A `dict` mapping feature keys to `Tensor`, `SparseTensor`, and
    `RaggedTensor` values.

  Raises:
    ValueError: if any feature is invalid.
  """
  if not features:
    raise ValueError(
        "Argument `features` cannot be None or falsy. Got %s" % features)
  features = _prepend_none_dimension(features)
  params = _ParseOpParams.from_features(features, [
      VarLenFeature, SparseFeature, FixedLenFeature, FixedLenSequenceFeature,
      RaggedFeature
  ])

  outputs = _parse_example_raw(serialized, example_names, params, name=name)
  return _construct_tensors_for_composite_features(features, outputs)


@tf_export(v1=["io.parse_example", "parse_example"])
@dispatch.add_dispatch_support
def parse_example(serialized, features, name=None, example_names=None):
  return parse_example_v2(serialized, features, example_names, name)


parse_example.__doc__ = parse_example_v2.__doc__


def _parse_example_raw(serialized, names, params, name):
  """Parses `Example` protos.

  Args:
    serialized: A vector (1-D Tensor) of strings, a batch of binary
      serialized `Example` protos.
    names: A vector (1-D Tensor) of strings (optional), the names of
      the serialized protos.
    params: A `ParseOpParams` containing the parameters for the parse op.
    name: A name for this operation (optional).

  Returns:
    A `dict` mapping keys to `Tensor`s, `SparseTensor`s, and `RaggedTensor`s.
  """
  if params.num_features == 0:
    raise ValueError("Must provide at least one feature key.")
  with ops.name_scope(name, "ParseExample", [serialized, names]):
    names = [] if names is None else names
    serialized = ops.convert_to_tensor(serialized, name="serialized")
    if params.ragged_keys and serialized.shape.ndims is None:
      raise ValueError("serialized must have statically-known rank to "
                       "parse ragged features.")
    outputs = gen_parsing_ops.parse_example_v2(
        serialized=serialized,
        names=names,
        sparse_keys=params.sparse_keys,
        dense_keys=params.dense_keys,
        ragged_keys=params.ragged_keys,
        dense_defaults=params.dense_defaults_vec,
        num_sparse=len(params.sparse_keys),
        sparse_types=params.sparse_types,
        ragged_value_types=params.ragged_value_types,
        ragged_split_types=params.ragged_split_types,
        dense_shapes=params.dense_shapes_as_proto,
        name=name)
    (sparse_indices, sparse_values, sparse_shapes, dense_values,
     ragged_values, ragged_row_splits) = outputs
    # pylint: disable=protected-access
    ragged_tensors = parsing_config._build_ragged_tensors(
        serialized.shape, ragged_values, ragged_row_splits)

    sparse_tensors = [
        sparse_tensor.SparseTensor(ix, val, shape) for (ix, val, shape)
        in zip(sparse_indices, sparse_values, sparse_shapes)]

    return dict(
        zip(params.sparse_keys + params.dense_keys + params.ragged_keys,
            sparse_tensors + dense_values + ragged_tensors))


@tf_export(v1=["io.parse_single_example", "parse_single_example"])
@dispatch.add_dispatch_support
def parse_single_example(serialized, features, name=None, example_names=None):
  """Parses a single `Example` proto.

  Similar to `parse_example`, except:

  For dense tensors, the returned `Tensor` is identical to the output of
  `parse_example`, except there is no batch dimension: the output shape is the
  same as the shape given in `dense_shape`.

  For `SparseTensor`s, the first (batch) column of the indices matrix is removed
  (the indices matrix is a column vector), the values vector is unchanged, and
  the first (`batch_size`) entry of the shape vector is removed (it is now a
  single element vector).

  One might see performance advantages by batching `Example` protos with
  `parse_example` instead of using this function directly.

  Args:
    serialized: A scalar string Tensor, a single serialized Example.
    features: A mapping of feature keys to `FixedLenFeature` or
      `VarLenFeature` values.
    name: A name for this operation (optional).
    example_names: (Optional) A scalar string Tensor, the associated name.

  Returns:
    A `dict` mapping feature keys to `Tensor` and `SparseTensor` values.

  Raises:
    ValueError: if any feature is invalid.
  """
  return parse_single_example_v2(serialized, features, example_names, name)


@tf_export("io.parse_single_example", v1=[])
@dispatch.add_dispatch_support
def parse_single_example_v2(
    serialized, features, example_names=None, name=None
):
  """Parses a single `Example` proto.

  Similar to `parse_example`, except:

  For dense tensors, the returned `Tensor` is identical to the output of
  `parse_example`, except there is no batch dimension: the output shape is the
  same as the shape given in `dense_shape`.

  For `SparseTensor`s, the first (batch) column of the indices matrix is removed
  (the indices matrix is a column vector), the values vector is unchanged, and
  the first (`batch_size`) entry of the shape vector is removed (it is now a
  single element vector).

  One might see performance advantages by batching `Example` protos with
  `parse_example` instead of using this function directly.
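  For instance, a minimal sketch (assuming eager execution and a hypothetical
  `example_proto` message containing a float feature `"ft"`):

  ```python
  parsed = tf.io.parse_single_example(
      serialized=example_proto.SerializeToString(),
      features={"ft": tf.io.VarLenFeature(tf.float32)})
  # parsed["ft"] is a SparseTensor with no batch dimension.
  ```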

  Args:
    serialized: A scalar string Tensor, a single serialized Example.
    features: A mapping of feature keys to `FixedLenFeature` or
      `VarLenFeature` values.
    example_names: (Optional) A scalar string Tensor, the associated name.
    name: A name for this operation (optional).

  Returns:
    A `dict` mapping feature keys to `Tensor` and `SparseTensor` values.

  Raises:
    ValueError: if any feature is invalid.
  """
  if not features:
    raise ValueError("Invalid argument: features cannot be None.")
  with ops.name_scope(name, "ParseSingleExample", [serialized, example_names]):
    serialized = ops.convert_to_tensor(serialized, name="serialized")
    serialized = _assert_scalar(serialized, "serialized")
    return parse_example_v2(serialized, features, example_names, name)


@tf_export("io.parse_sequence_example")
@dispatch.add_dispatch_support
def parse_sequence_example(serialized,
                           context_features=None,
                           sequence_features=None,
                           example_names=None,
                           name=None):
  # pylint: disable=line-too-long
  """Parses a batch of `SequenceExample` protos.

  Parses a vector of serialized
  [`SequenceExample`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
  protos given in `serialized`.

  This op parses serialized sequence examples into a tuple of dictionaries,
  each mapping keys to `Tensor` and `SparseTensor` objects.
  The first dictionary contains mappings for keys appearing in
  `context_features`, and the second dictionary contains mappings for keys
  appearing in `sequence_features`.

  At least one of `context_features` and `sequence_features` must be provided
  and non-empty.

  The `context_features` keys are associated with a `SequenceExample` as a
  whole, independent of time / frame. In contrast, the `sequence_features` keys
  provide a way to access variable-length data within the `FeatureList` section
  of the `SequenceExample` proto. While the shapes of `context_features` values
  are fixed with respect to frame, the frame dimension (the first dimension)
  of `sequence_features` values may vary between `SequenceExample` protos,
  and even between `feature_list` keys within the same `SequenceExample`.

  `context_features` contains `VarLenFeature`, `RaggedFeature`, and
  `FixedLenFeature` objects. Each `VarLenFeature` is mapped to a
  `SparseTensor`; each `RaggedFeature` is mapped to a `RaggedTensor`; and each
  `FixedLenFeature` is mapped to a `Tensor`, of the specified type, shape, and
  default value.

  `sequence_features` contains `VarLenFeature`, `RaggedFeature`, and
  `FixedLenSequenceFeature` objects. Each `VarLenFeature` is mapped to a
  `SparseTensor`; each `RaggedFeature` is mapped to a `RaggedTensor`; and
  each `FixedLenSequenceFeature` is mapped to a `Tensor`, each of the specified
  type. The shape will be `(B, T) + df.dense_shape` for
  `FixedLenSequenceFeature` `df`, where `B` is the batch size, and `T` is the
  length of the associated `FeatureList` in the `SequenceExample`. For instance,
  `FixedLenSequenceFeature([])` yields a 2-D `Tensor` of static shape
  `[None, None]` and dynamic shape `[B, T]`, while
  `FixedLenSequenceFeature([k])` (for `int k >= 1`) yields a 3-D `Tensor`
  of static shape `[None, None, k]` and dynamic shape `[B, T, k]`.

  Like the input, the resulting output tensors have a batch dimension. This
  means that the original per-example shapes of `VarLenFeature`s and
  `FixedLenSequenceFeature`s can be lost. To handle that situation, this op also
  provides dicts of shape tensors as part of the output. There is one dict for
  the context features, and one for the feature_list features. Context features
  of type `FixedLenFeature` will not be present, since their shapes are already
  known by the caller. In situations where the input `FixedLenSequenceFeature`s
  are of different sequence lengths across examples, the shorter examples will
  be padded with default datatype values: 0 for numeric types, and the empty
  string for string types.

  Each `SparseTensor` corresponding to `sequence_features` represents a ragged
  vector. Its indices are `[time, index]`, where `time` is the `FeatureList`
  entry and `index` is the value's index in the list of values associated with
  that time.

  `FixedLenFeature` entries with a `default_value` and `FixedLenSequenceFeature`
  entries with `allow_missing=True` are optional; otherwise, we will fail if
  that `Feature` or `FeatureList` is missing from any example in `serialized`.

  `example_names` may contain descriptive names for the corresponding serialized
  protos. These may be useful for debugging purposes, but they have no effect on
  the output. If not `None`, `example_names` must be the same length as
  `serialized`.
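  A minimal sketch (the `"id"` and `"tokens"` keys are hypothetical, and
  `serialized` is assumed to hold serialized `SequenceExample` protos):

  ```python
  context, sequences, lengths = tf.io.parse_sequence_example(
      serialized,
      context_features={"id": tf.io.FixedLenFeature([], tf.int64)},
      sequence_features={"tokens": tf.io.FixedLenSequenceFeature(
          [], tf.string, allow_missing=True)})
  # context["id"] has shape [B]; sequences["tokens"] has shape [B, T];
  # lengths["tokens"] holds each example's `FeatureList` length.
  ```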

  Args:
    serialized: A vector (1-D Tensor) of type string containing binary
      serialized `SequenceExample` protos.
    context_features: A mapping of feature keys to `FixedLenFeature` or
      `VarLenFeature` or `RaggedFeature` values. These features are associated
      with a `SequenceExample` as a whole.
    sequence_features: A mapping of feature keys to
      `FixedLenSequenceFeature` or `VarLenFeature` or `RaggedFeature` values.
      These features are associated with data within the `FeatureList` section
      of the `SequenceExample` proto.
    example_names: A vector (1-D Tensor) of strings (optional), the names of
      the serialized protos.
    name: A name for this operation (optional).

  Returns:
    A tuple of three `dict`s, each mapping keys to `Tensor`s,
    `SparseTensor`s, and `RaggedTensor`s. The first dict contains the context
    key/values, the second dict contains the feature_list key/values, and the
    final dict contains the lengths of any dense feature_list features.

  Raises:
    ValueError: if any feature is invalid.
  """
  if not (context_features or sequence_features):
    raise ValueError("Both `context_features` and `sequence_features` "
                     "arguments are None, but at least one should have "
                     "values.")
  context_params = _ParseOpParams.from_features(
      context_features, [VarLenFeature, FixedLenFeature, RaggedFeature])
  feature_list_params = _ParseOpParams.from_features(
      sequence_features,
      [VarLenFeature, FixedLenSequenceFeature, RaggedFeature])

  with ops.name_scope(name, "ParseSequenceExample",
                      [serialized, example_names]):
    outputs = _parse_sequence_example_raw(serialized, example_names,
                                          context_params, feature_list_params,
                                          name)
    context_output, feature_list_output, feature_list_lengths = outputs

    if context_params.ragged_keys:
      context_output = _construct_tensors_for_composite_features(
          context_features, context_output)
    if feature_list_params.ragged_keys:
      feature_list_output = _construct_tensors_for_composite_features(
          sequence_features, feature_list_output)

    return context_output, feature_list_output, feature_list_lengths


def _parse_sequence_example_raw(serialized,
                                debug_name,
                                context,
                                feature_list,
                                name=None):
  """Parses a vector of `SequenceExample` protos.

  Args:
    serialized: A vector (1-D Tensor) of type string, containing binary
      serialized `SequenceExample` protos.
    debug_name: A vector (1-D Tensor) of strings (optional), the names of the
      serialized protos.
    context: A `ParseOpParams` containing the parameters for the parse
      op for the context features.
    feature_list: A `ParseOpParams` containing the parameters for the
      parse op for the feature_list features.
    name: A name for this operation (optional).

  Returns:
    A tuple of three `dict`s, each mapping keys to `Tensor`s, `SparseTensor`s,
    and `RaggedTensor`s. The first dict contains the context key/values, the
    second dict contains the feature_list key/values, and the final dict
    contains the lengths of any dense feature_list features.

  Raises:
    TypeError: if feature_list.dense_defaults is not either None or a dict.
  """
  if context.num_features + feature_list.num_features == 0:
    raise ValueError("Must provide at least one feature key.")
  with ops.name_scope(name, "ParseSequenceExample", [serialized]):
    debug_name = [] if debug_name is None else debug_name

    # Internal
    feature_list_dense_missing_assumed_empty = []
    for k, v in feature_list.dense_defaults.items():
      if v is not None:
        raise ValueError("Value feature_list.dense_defaults[%s] must be None" %
                         k)
      feature_list_dense_missing_assumed_empty.append(k)

    has_ragged = context.ragged_keys or feature_list.ragged_keys
    serialized = ops.convert_to_tensor(serialized, name="serialized")
    if has_ragged and serialized.shape.ndims is None:
      raise ValueError("serialized must have statically-known rank to "
                       "parse ragged features.")
    feature_list_dense_missing_assumed_empty_vector = [
        key in feature_list_dense_missing_assumed_empty
        for key in feature_list.dense_keys
    ]
    outputs = gen_parsing_ops.parse_sequence_example_v2(
        # Inputs
        serialized=serialized,
        debug_name=debug_name,
        context_sparse_keys=context.sparse_keys,
        context_dense_keys=context.dense_keys,
        context_ragged_keys=context.ragged_keys,
        feature_list_sparse_keys=feature_list.sparse_keys,
        feature_list_dense_keys=feature_list.dense_keys,
        feature_list_ragged_keys=feature_list.ragged_keys,
        feature_list_dense_missing_assumed_empty=(
            feature_list_dense_missing_assumed_empty_vector),
        context_dense_defaults=context.dense_defaults_vec,
        # Attrs
        Ncontext_sparse=len(context.sparse_keys),
        Nfeature_list_sparse=len(feature_list.sparse_keys),
        Nfeature_list_dense=len(feature_list.dense_keys),
        context_sparse_types=context.sparse_types,
        context_ragged_value_types=context.ragged_value_types,
        context_ragged_split_types=context.ragged_split_types,
        feature_list_dense_types=feature_list.dense_types,
        feature_list_sparse_types=feature_list.sparse_types,
        feature_list_ragged_value_types=feature_list.ragged_value_types,
        feature_list_ragged_split_types=feature_list.ragged_split_types,
        context_dense_shapes=context.dense_shapes_as_proto,
        feature_list_dense_shapes=feature_list.dense_shapes,
        name=name)
    (context_sparse_indices, context_sparse_values, context_sparse_shapes,
     context_dense_values, context_ragged_values, context_ragged_row_splits,
     feature_list_sparse_indices, feature_list_sparse_values,
     feature_list_sparse_shapes, feature_list_dense_values,
     feature_list_dense_lengths, feature_list_ragged_values,
     feature_list_ragged_outer_splits,
     feature_list_ragged_inner_splits) = outputs
    # pylint: disable=protected-access
    context_ragged_tensors = parsing_config._build_ragged_tensors(
        serialized.shape, context_ragged_values, context_ragged_row_splits)
    feature_list_ragged_tensors = parsing_config._build_ragged_tensors(
        serialized.shape, feature_list_ragged_values,
        feature_list_ragged_outer_splits, feature_list_ragged_inner_splits)

    # pylint: disable=g-complex-comprehension
    context_sparse_tensors = [
        sparse_tensor.SparseTensor(ix, val, shape)
        for (ix, val,
             shape) in zip(context_sparse_indices, context_sparse_values,
                           context_sparse_shapes)
    ]

    feature_list_sparse_tensors = [
        sparse_tensor.SparseTensor(ix, val, shape)
        for (ix, val, shape
            ) in zip(feature_list_sparse_indices, feature_list_sparse_values,
                     feature_list_sparse_shapes)
    ]
    # pylint: enable=g-complex-comprehension

    context_output = dict(
        zip(
            context.sparse_keys + context.dense_keys + context.ragged_keys,
            context_sparse_tensors + context_dense_values +
            context_ragged_tensors))
    feature_list_output = dict(
        zip(
            feature_list.sparse_keys + feature_list.dense_keys +
            feature_list.ragged_keys, feature_list_sparse_tensors +
            feature_list_dense_values + feature_list_ragged_tensors))
    feature_list_lengths = dict(
        zip(feature_list.dense_keys, feature_list_dense_lengths))

    return (context_output, feature_list_output, feature_list_lengths)


@tf_export("io.parse_single_sequence_example",
           v1=["io.parse_single_sequence_example",
               "parse_single_sequence_example"])
@dispatch.add_dispatch_support
def parse_single_sequence_example(
    serialized, context_features=None, sequence_features=None,
    example_name=None, name=None):
  # pylint: disable=line-too-long
  """Parses a single `SequenceExample` proto.

  Parses a single serialized [`SequenceExample`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
  proto given in `serialized`.

  This op parses a serialized sequence example into a tuple of dictionaries,
  each mapping keys to `Tensor` and `SparseTensor` objects.
  The first dictionary contains mappings for keys appearing in
  `context_features`, and the second dictionary contains mappings for keys
  appearing in `sequence_features`.

  At least one of `context_features` and `sequence_features` must be provided
  and non-empty.

  The `context_features` keys are associated with a `SequenceExample` as a
  whole, independent of time / frame. In contrast, the `sequence_features` keys
  provide a way to access variable-length data within the `FeatureList` section
  of the `SequenceExample` proto. While the shapes of `context_features` values
  are fixed with respect to frame, the frame dimension (the first dimension)
  of `sequence_features` values may vary between `SequenceExample` protos,
  and even between `feature_list` keys within the same `SequenceExample`.

  `context_features` contains `VarLenFeature`, `RaggedFeature`, and
  `FixedLenFeature` objects. Each `VarLenFeature` is mapped to a `SparseTensor`;
  each `RaggedFeature` is mapped to a `RaggedTensor`; and each `FixedLenFeature`
  is mapped to a `Tensor`, of the specified type, shape, and default value.

  `sequence_features` contains `VarLenFeature`, `RaggedFeature`, and
  `FixedLenSequenceFeature` objects. Each `VarLenFeature` is mapped to a
  `SparseTensor`; each `RaggedFeature` is mapped to a `RaggedTensor`; and each
  `FixedLenSequenceFeature` is mapped to a `Tensor`, each of the specified type.
  The shape will be `(T,) + df.dense_shape` for `FixedLenSequenceFeature` `df`,
  where `T` is the length of the associated `FeatureList` in the
  `SequenceExample`. For instance, `FixedLenSequenceFeature([])` yields a
  1-D `Tensor` of static shape `[None]` and dynamic shape `[T]`, while
  `FixedLenSequenceFeature([k])` (for `int k >= 1`) yields a 2-D `Tensor`
  of static shape `[None, k]` and dynamic shape `[T, k]`.

  Each `SparseTensor` corresponding to `sequence_features` represents a ragged
  vector. Its indices are `[time, index]`, where `time` is the `FeatureList`
  entry and `index` is the value's index in the list of values associated with
  that time.

  `FixedLenFeature` entries with a `default_value` and `FixedLenSequenceFeature`
  entries with `allow_missing=True` are optional; otherwise, we will fail if
  that `Feature` or `FeatureList` is missing from any example in `serialized`.

  `example_name` may contain a descriptive name for the corresponding serialized
  proto. This may be useful for debugging purposes, but it has no effect on the
  output. If not `None`, `example_name` must be a scalar.

  Note that the batch version of this function, `tf.io.parse_sequence_example`,
  is written for better memory efficiency and will be faster on large
  `SequenceExample`s.
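  A minimal sketch mirroring the batch version above (hypothetical keys;
  `serialized` is assumed to be one serialized `SequenceExample` proto):

  ```python
  context, sequences = tf.io.parse_single_sequence_example(
      serialized,
      context_features={"id": tf.io.FixedLenFeature([], tf.int64)},
      sequence_features={"tokens": tf.io.FixedLenSequenceFeature(
          [], tf.string, allow_missing=True)})
  # context["id"] is a scalar; sequences["tokens"] has shape [T].
  ```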

  Args:
    serialized: A scalar (0-D Tensor) of type string, a single binary
      serialized `SequenceExample` proto.
    context_features: A mapping of feature keys to `FixedLenFeature` or
      `VarLenFeature` or `RaggedFeature` values. These features are associated
      with a `SequenceExample` as a whole.
    sequence_features: A mapping of feature keys to
      `FixedLenSequenceFeature` or `VarLenFeature` or `RaggedFeature` values.
      These features are associated with data within the `FeatureList` section
      of the `SequenceExample` proto.
    example_name: A scalar (0-D Tensor) of strings (optional), the name of
      the serialized proto.
    name: A name for this operation (optional).

  Returns:
    A tuple of two `dict`s, each mapping keys to `Tensor`s and `SparseTensor`s
    and `RaggedTensor`s.

    * The first dict contains the context key/values.
    * The second dict contains the feature_list key/values.

  Raises:
    ValueError: if any feature is invalid.
  """
  # pylint: enable=line-too-long
  if not (context_features or sequence_features):
    raise ValueError("Both context_features and sequence_features are None, but"
                     " at least one should have values.")
  context_params = _ParseOpParams.from_features(
      context_features, [VarLenFeature, FixedLenFeature, RaggedFeature])
  feature_list_params = _ParseOpParams.from_features(
      sequence_features,
      [VarLenFeature, FixedLenSequenceFeature, RaggedFeature])

  with ops.name_scope(name, "ParseSingleSequenceExample",
                      [serialized, example_name]):
    context_output, feature_list_output = (
        _parse_single_sequence_example_raw(serialized, context_params,
                                           feature_list_params, example_name,
                                           name))

    if context_params.ragged_keys:
      context_output = _construct_tensors_for_composite_features(
          context_features, context_output)
    if feature_list_params.ragged_keys:
      feature_list_output = _construct_tensors_for_composite_features(
          sequence_features, feature_list_output)

    return context_output, feature_list_output


def _parse_single_sequence_example_raw(serialized,
                                       context,
                                       feature_list,
                                       debug_name,
                                       name=None):
  """Parses a single `SequenceExample` proto.

  Args:
    serialized: A scalar (0-D Tensor) of type string, a single binary
      serialized `SequenceExample` proto.
    context: A `ParseOpParams` containing the parameters for the parse op for
      the context features.
    feature_list: A `ParseOpParams` containing the parameters for the parse op
      for the feature_list features.
    debug_name: A scalar (0-D Tensor) of strings (optional), the name of the
      serialized proto.
    name: A name for this operation (optional).

  Returns:
    A tuple of two `dict`s, each mapping keys to `Tensor`s and `SparseTensor`s.
    The first dict contains the context key/values.
    The second dict contains the feature_list key/values.

  Raises:
    TypeError: if feature_list.dense_defaults is not either None or a dict.
  """
  with ops.name_scope(name, "ParseSingleExample", [serialized, debug_name]):
    serialized = ops.convert_to_tensor(serialized, name="serialized")
    serialized = _assert_scalar(serialized, "serialized")
    return _parse_sequence_example_raw(serialized, debug_name, context,
                                       feature_list, name)[:2]


@tf_export("io.decode_raw", v1=[])
@dispatch.add_dispatch_support
def decode_raw(input_bytes,
               out_type,
               little_endian=True,
               fixed_length=None,
               name=None):
  r"""Convert raw bytes from the input tensor into numeric tensors.

  Every component of the input tensor is interpreted as a sequence of bytes.
  These bytes are then decoded as numbers in the format specified by `out_type`.

  >>> tf.io.decode_raw(tf.constant("1"), tf.uint8)
  <tf.Tensor: shape=(1,), dtype=uint8, numpy=array([49], dtype=uint8)>
  >>> tf.io.decode_raw(tf.constant("1,2"), tf.uint8)
  <tf.Tensor: shape=(3,), dtype=uint8, numpy=array([49, 44, 50], dtype=uint8)>

  Note that the rank of the output tensor is always one more than the input one:

  >>> tf.io.decode_raw(tf.constant(["1","2"]), tf.uint8).shape
  TensorShape([2, 1])
  >>> tf.io.decode_raw(tf.constant([["1"],["2"]]), tf.uint8).shape
  TensorShape([2, 1, 1])

  This is because each byte in the input is converted to a new value on the
  output (if output type is `uint8` or `int8`, otherwise chunks of the input
  get converted to a new value):

  >>> tf.io.decode_raw(tf.constant("123"), tf.uint8)
  <tf.Tensor: shape=(3,), dtype=uint8, numpy=array([49, 50, 51], dtype=uint8)>
  >>> tf.io.decode_raw(tf.constant("1234"), tf.uint8)
  <tf.Tensor: shape=(4,), dtype=uint8, numpy=array([49, 50, 51, 52], ...
  >>> # chunked output
  >>> tf.io.decode_raw(tf.constant("12"), tf.uint16)
  <tf.Tensor: shape=(1,), dtype=uint16, numpy=array([12849], dtype=uint16)>
  >>> tf.io.decode_raw(tf.constant("1234"), tf.uint16)
  <tf.Tensor: shape=(2,), dtype=uint16, numpy=array([12849, 13363], ...
  >>> # int64 output
  >>> tf.io.decode_raw(tf.constant("12345678"), tf.int64)
  <tf.Tensor: ... numpy=array([4050765991979987505])>
  >>> tf.io.decode_raw(tf.constant("1234567887654321"), tf.int64)
  <tf.Tensor: ... numpy=array([4050765991979987505, 3544952156018063160])>

  The operation allows specifying endianness via the `little_endian` parameter.

  >>> tf.io.decode_raw(tf.constant("\x0a\x0b"), tf.int16)
  <tf.Tensor: shape=(1,), dtype=int16, numpy=array([2826], dtype=int16)>
  >>> hex(2826)
  '0xb0a'
  >>> tf.io.decode_raw(tf.constant("\x0a\x0b"), tf.int16, little_endian=False)
  <tf.Tensor: shape=(1,), dtype=int16, numpy=array([2571], dtype=int16)>
  >>> hex(2571)
  '0xa0b'

  If the elements of `input_bytes` are of different length, you must specify
  `fixed_length`:

  >>> tf.io.decode_raw(tf.constant([["1"],["23"]]), tf.uint8, fixed_length=4)
  <tf.Tensor: shape=(2, 1, 4), dtype=uint8, numpy=
  array([[[49, 0, 0, 0]],
         [[50, 51, 0, 0]]], dtype=uint8)>

  If the `fixed_length` value is larger than the length of the `out_type` dtype,
  multiple values are generated:

  >>> tf.io.decode_raw(tf.constant(["1212"]), tf.uint16, fixed_length=4)
  <tf.Tensor: shape=(1, 2), dtype=uint16, numpy=array([[12849, 12849]], ...

  If the input value is larger than `fixed_length`, it is truncated:

  >>> x=''.join([chr(1), chr(2), chr(3), chr(4)])
  >>> tf.io.decode_raw(x, tf.uint16, fixed_length=2)
  <tf.Tensor: shape=(1,), dtype=uint16, numpy=array([513], dtype=uint16)>
  >>> hex(513)
  '0x201'

  If `little_endian` and `fixed_length` are specified, truncation to the fixed
  length occurs before endianness conversion:

  >>> x=''.join([chr(1), chr(2), chr(3), chr(4)])
  >>> tf.io.decode_raw(x, tf.uint16, fixed_length=2, little_endian=False)
  <tf.Tensor: shape=(1,), dtype=uint16, numpy=array([258], dtype=uint16)>
  >>> hex(258)
  '0x102'

  If input values all have the same length, then specifying `fixed_length`
  equal to the size of the strings should not change output:

  >>> x = ["12345678", "87654321"]
  >>> tf.io.decode_raw(x, tf.int16)
  <tf.Tensor: shape=(2, 4), dtype=int16, numpy=
  array([[12849, 13363, 13877, 14391],
         [14136, 13622, 13108, 12594]], dtype=int16)>
  >>> tf.io.decode_raw(x, tf.int16, fixed_length=len(x[0]))
  <tf.Tensor: shape=(2, 4), dtype=int16, numpy=
  array([[12849, 13363, 13877, 14391],
         [14136, 13622, 13108, 12594]], dtype=int16)>

  Args:
    input_bytes:
      Each element of the input Tensor is converted to an array of bytes.

      Currently, this must be a tensor of strings (bytes), although semantically
      the operation should support any input.
    out_type:
      `DType` of the output. Acceptable types are `half`, `float`, `double`,
      `int32`, `uint16`, `uint8`, `int16`, `int8`, `int64`.
    little_endian:
      Whether the `input_bytes` data is in little-endian format. Data will be
      converted into host byte order if necessary.
    fixed_length:
      If set, the first `fixed_length` bytes of each element will be converted.
      Data will be zero-padded or truncated to the specified length.

      `fixed_length` must be a multiple of the size of `out_type`.

      `fixed_length` must be specified if the elements of `input_bytes` are of
      variable length.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` object storing the decoded bytes.
  """
  if fixed_length is not None:
    return gen_parsing_ops.decode_padded_raw(
        input_bytes,
        fixed_length=fixed_length,
        out_type=out_type,
        little_endian=little_endian,
        name=name)
  else:
    return gen_parsing_ops.decode_raw(
        input_bytes, out_type, little_endian=little_endian, name=name)


@tf_export(v1=["decode_raw", "io.decode_raw"])
@dispatch.add_dispatch_support
@deprecation.deprecated_args(None,
                             "bytes is deprecated, use input_bytes instead",
                             "bytes")
def decode_raw_v1(
    input_bytes=None,
    out_type=None,
    little_endian=True,
    name=None,
    bytes=None  # pylint: disable=redefined-builtin
):
  """Convert raw byte strings into tensors.

  Args:
    input_bytes:
      Each element of the input Tensor is converted to an array of bytes.
    out_type:
      `DType` of the output. Acceptable types are `half`, `float`, `double`,
      `int32`, `uint16`, `uint8`, `int16`, `int8`, `int64`.
    little_endian:
      Whether the `input_bytes` data is in little-endian format. Data will be
      converted into host byte order if necessary.
    name: A name for the operation (optional).
    bytes: Deprecated parameter. Use `input_bytes` instead.

  Returns:
    A `Tensor` object storing the decoded bytes.
  """
  input_bytes = deprecation.deprecated_argument_lookup("input_bytes",
                                                       input_bytes, "bytes",
                                                       bytes)

  # out_type is a required positional argument in the original API, and had to
  # be changed to a keyword argument in order to facilitate the transition from
  # the reserved name `bytes` to `input_bytes`. Ensure it's still set.
  if out_type is None:
    raise ValueError(
        "decode_raw_v1() missing 1 positional argument: 'out_type'")

  return gen_parsing_ops.decode_raw(
      input_bytes, out_type, little_endian=little_endian, name=name)


# Swap `name` and `na_value` for backward compatibility.
@tf_export(v1=["io.decode_csv", "decode_csv"])
@dispatch.add_dispatch_support
@deprecation.deprecated_endpoints("decode_csv")
def decode_csv(records,
               record_defaults,
               field_delim=",",
               use_quote_delim=True,
               name=None,
               na_value="",
               select_cols=None):
  """Convert CSV records to tensors. Each column maps to one tensor.

  RFC 4180 format is expected for the CSV records.
  (https://tools.ietf.org/html/rfc4180)
  Note that we allow leading and trailing spaces in int or float fields.

  Args:
    records: A `Tensor` of type `string`.
      Each string is a record/row in the csv and all records should have
      the same format.
    record_defaults: A list of `Tensor` objects with specific types.
      Acceptable types are `float32`, `float64`, `int32`, `int64`, `string`.
      One tensor per column of the input record, with either a
      scalar default value for that column or an empty vector if the column is
      required.
    field_delim: An optional `string`. Defaults to `","`.
      Char delimiter used to separate fields in a record.
    use_quote_delim: An optional `bool`. Defaults to `True`.
      If false, treats double quotation marks as regular
      characters inside of the string fields (ignoring RFC 4180, Section 2,
      Bullet 5).
    name: A name for the operation (optional).
    na_value: Additional string to recognize as NA/NaN.
    select_cols: Optional sorted list of column indices to select. If specified,
      only this subset of columns will be parsed and returned.

  Returns:
    A list of `Tensor` objects. Has the same type as `record_defaults`.
    Each tensor will have the same shape as `records`.

  Raises:
    ValueError: If any of the arguments is malformed.
  """
  return decode_csv_v2(
      records, record_defaults,
      field_delim, use_quote_delim,
      na_value, select_cols, name
  )


@tf_export("io.decode_csv", v1=[])
@dispatch.add_dispatch_support
def decode_csv_v2(records,
                  record_defaults,
                  field_delim=",",
                  use_quote_delim=True,
                  na_value="",
                  select_cols=None,
                  name=None):
  """Convert CSV records to tensors. Each column maps to one tensor.

  RFC 4180 format is expected for the CSV records.
  (https://tools.ietf.org/html/rfc4180)
  Note that we allow leading and trailing spaces in int or float fields.
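  For example, a sketch with two hypothetical records of an int, a float, and
  a string column (the empty second field of the second record falls back to
  its default):

  ```python
  records = ["1,2.5,hello", "4,,world"]
  defaults = [[0], [0.0], [""]]
  col_a, col_b, col_c = tf.io.decode_csv(records, record_defaults=defaults)
  # col_a == [1, 4]; col_b == [2.5, 0.0]; col_c == [b"hello", b"world"]
  ```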

  Args:
    records: A `Tensor` of type `string`.
      Each string is a record/row in the csv and all records should have
      the same format.
    record_defaults: A list of `Tensor` objects with specific types.
      Acceptable types are `float32`, `float64`, `int32`, `int64`, `string`.
      One tensor per column of the input record, with either a
      scalar default value for that column or an empty vector if the column is
      required.
    field_delim: An optional `string`. Defaults to `","`.
      Char delimiter used to separate fields in a record.
    use_quote_delim: An optional `bool`. Defaults to `True`.
      If false, treats double quotation marks as regular
      characters inside of the string fields (ignoring RFC 4180, Section 2,
      Bullet 5).
    na_value: Additional string to recognize as NA/NaN.
    select_cols: Optional sorted list of column indices to select. If specified,
      only this subset of columns will be parsed and returned.
    name: A name for the operation (optional).

  Returns:
    A list of `Tensor` objects. Has the same type as `record_defaults`.
    Each tensor will have the same shape as `records`.

  Raises:
    ValueError: If any of the arguments is malformed.
  """
  if select_cols is not None and any(select_cols[i] >= select_cols[i + 1]
                                     for i in range(len(select_cols) - 1)):
    raise ValueError("select_cols is not strictly increasing.")
  if select_cols is not None and select_cols[0] < 0:
    raise ValueError("select_cols contains negative values.")
  if select_cols is not None and len(select_cols) != len(record_defaults):
    raise ValueError("Length of select_cols and record_defaults do not match.")
  return gen_parsing_ops.decode_csv(
      records=records,
      record_defaults=record_defaults,
      field_delim=field_delim,
      use_quote_delim=use_quote_delim,
      na_value=na_value,
      name=name,
      select_cols=select_cols,
  )


def _assert_scalar(value, name):
  """Asserts that `value` is scalar, and returns `value`."""
  value_rank = value.shape.rank
  if value_rank is None:
    check = control_flow_assert.Assert(
        math_ops.equal(array_ops.rank(value), 0),
        ["Input %s must be a scalar" % name],
        name="%sIsScalar" % name.capitalize())
    result = control_flow_ops.with_dependencies([check],
                                                value,
                                                name="%sDependencies" % name)
    result.set_shape([])
    return result
  elif value_rank == 0:
    return value
  else:
    raise ValueError("Input %s must be a scalar" % name)


@tf_export("io.decode_json_example",
           v1=["decode_json_example", "io.decode_json_example"])
def decode_json_example(json_examples, name=None):
  r"""Convert JSON-encoded Example records to binary protocol buffer strings.

  Note: This is **not** a general purpose JSON parsing op.

  This op converts JSON-serialized `tf.train.Example` (maybe created with
  `json_format.MessageToJson`, following the
  [standard JSON mapping](
  https://developers.google.com/protocol-buffers/docs/proto3#json))
  to a binary-serialized `tf.train.Example` (equivalent to
  `Example.SerializeToString()`) suitable for conversion to tensors with
  `tf.io.parse_example`.

  Here is a `tf.train.Example` proto:

  >>> example = tf.train.Example(
  ...   features=tf.train.Features(
  ...       feature={
  ...           "a": tf.train.Feature(
  ...               int64_list=tf.train.Int64List(
  ...                   value=[1, 1, 3]))}))

  Here it is converted to JSON:

  >>> from google.protobuf import json_format
  >>> example_json = json_format.MessageToJson(example)
  >>> print(example_json)
  {
    "features": {
      "feature": {
        "a": {
          "int64List": {
            "value": [
              "1",
              "1",
              "3"
            ]
          }
        }
      }
    }
  }

  This op converts the above json string to a binary proto:

  >>> example_binary = tf.io.decode_json_example(example_json)
  >>> example_binary.numpy()
  b'\n\x0f\n\r\n\x01a\x12\x08\x1a\x06\x08\x01\x08\x01\x08\x03'

  The op works on string tensors of any shape:

  >>> tf.io.decode_json_example([
  ...     [example_json, example_json],
  ...     [example_json, example_json]]).shape.as_list()
  [2, 2]

  The resulting binary string is equivalent to `Example.SerializeToString()`,
  and can be converted to Tensors using `tf.io.parse_example` and related
  functions:

  >>> tf.io.parse_example(
  ...   serialized=[example_binary.numpy(),
  ...              example.SerializeToString()],
  ...   features = {'a': tf.io.FixedLenFeature(shape=[3], dtype=tf.int64)})
  {'a': <tf.Tensor: shape=(2, 3), dtype=int64, numpy=
  array([[1, 1, 3],
         [1, 1, 3]])>}

  Args:
    json_examples: A string tensor containing json-serialized `tf.Example`
      protos.
    name: A name for the op.

  Returns:
    A string Tensor containing the binary-serialized `tf.Example` protos.

  Raises:
    `tf.errors.InvalidArgumentError`: If the JSON could not be converted to a
      `tf.Example`.
  """
  return gen_parsing_ops.decode_json_example(json_examples, name=name)


# Register elementwise ops that don't have Python wrappers.
dispatch.register_unary_elementwise_api(gen_parsing_ops.decode_compressed)