773 lines
28 KiB
Python
773 lines
28 KiB
Python
|
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
|
||
|
#
|
||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
# you may not use this file except in compliance with the License.
|
||
|
# You may obtain a copy of the License at
|
||
|
#
|
||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
#
|
||
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
# See the License for the specific language governing permissions and
|
||
|
# limitations under the License.
|
||
|
# ==============================================================================
|
||
|
"""FeatureSpace structured data preprocessing & encoding utility."""
|
||
|
|
||
|
import tensorflow.compat.v2 as tf
|
||
|
|
||
|
from keras import backend
|
||
|
from keras.engine import base_layer
|
||
|
from keras.saving import saving_lib
|
||
|
from keras.saving import serialization_lib
|
||
|
from keras.utils.generic_utils import LazyLoader
|
||
|
|
||
|
# isort: off
|
||
|
from tensorflow.python.util.tf_export import keras_export
|
||
|
|
||
|
layers = LazyLoader("layers", globals(), "keras.layers")
|
||
|
|
||
|
|
||
|
class Cross:
|
||
|
def __init__(self, feature_names, crossing_dim, output_mode="one_hot"):
|
||
|
if output_mode not in {"int", "one_hot"}:
|
||
|
raise ValueError(
|
||
|
"Invalid value for argument `output_mode`. "
|
||
|
"Expected one of {'int', 'one_hot'}. "
|
||
|
f"Received: output_mode={output_mode}"
|
||
|
)
|
||
|
self.feature_names = tuple(feature_names)
|
||
|
self.crossing_dim = crossing_dim
|
||
|
self.output_mode = output_mode
|
||
|
|
||
|
@property
|
||
|
def name(self):
|
||
|
return "_X_".join(self.feature_names)
|
||
|
|
||
|
def get_config(self):
|
||
|
return {
|
||
|
"feature_names": self.feature_names,
|
||
|
"crossing_dim": self.crossing_dim,
|
||
|
"output_mode": self.output_mode,
|
||
|
}
|
||
|
|
||
|
@classmethod
|
||
|
def from_config(cls, config):
|
||
|
return cls(**config)
|
||
|
|
||
|
|
||
|
class Feature:
|
||
|
def __init__(self, dtype, preprocessor, output_mode):
|
||
|
if output_mode not in {"int", "one_hot", "float"}:
|
||
|
raise ValueError(
|
||
|
"Invalid value for argument `output_mode`. "
|
||
|
"Expected one of {'int', 'one_hot', 'float'}. "
|
||
|
f"Received: output_mode={output_mode}"
|
||
|
)
|
||
|
self.dtype = dtype
|
||
|
if isinstance(preprocessor, dict):
|
||
|
preprocessor = serialization_lib.deserialize_keras_object(
|
||
|
preprocessor
|
||
|
)
|
||
|
self.preprocessor = preprocessor
|
||
|
self.output_mode = output_mode
|
||
|
|
||
|
def get_config(self):
|
||
|
return {
|
||
|
"dtype": self.dtype,
|
||
|
"preprocessor": serialization_lib.serialize_keras_object(
|
||
|
self.preprocessor
|
||
|
),
|
||
|
"output_mode": self.output_mode,
|
||
|
}
|
||
|
|
||
|
@classmethod
|
||
|
def from_config(cls, config):
|
||
|
return cls(**config)
|
||
|
|
||
|
|
||
|
@keras_export("keras.utils.FeatureSpace", v1=[])
|
||
|
class FeatureSpace(base_layer.Layer):
|
||
|
"""One-stop utility for preprocessing and encoding structured data.
|
||
|
|
||
|
Arguments:
|
||
|
feature_names: Dict mapping the names of your features to their
|
||
|
type specification, e.g. `{"my_feature": "integer_categorical"}`
|
||
|
or `{"my_feature": FeatureSpace.integer_categorical()}`.
|
||
|
For a complete list of all supported types, see
|
||
|
"Available feature types" paragraph below.
|
||
|
output_mode: One of `"concat"` or `"dict"`. In concat mode, all
|
||
|
features get concatenated together into a single vector.
|
||
|
In dict mode, the FeatureSpace returns a dict of individually
|
||
|
encoded features (with the same keys as the input dict keys).
|
||
|
crosses: List of features to be crossed together, e.g.
|
||
|
`crosses=[("feature_1", "feature_2")]`. The features will be
|
||
|
"crossed" by hashing their combined value into
|
||
|
a fixed-length vector.
|
||
|
crossing_dim: Default vector size for hashing crossed features.
|
||
|
Defaults to 32.
|
||
|
hashing_dim: Default vector size for hashing features of type
|
||
|
`"integer_hashed"` and `"string_hashed"`. Defaults to 32.
|
||
|
num_discretization_bins: Default number of bins to be used for
|
||
|
discretizing features of type `"float_discretized"`.
|
||
|
Defaults to 32.
|
||
|
|
||
|
**Available feature types:**
|
||
|
|
||
|
Note that all features can be referred to by their string name,
|
||
|
e.g. `"integer_categorical"`. When using the string name, the default
|
||
|
argument values are used.
|
||
|
|
||
|
```python
|
||
|
# Plain float values.
|
||
|
FeatureSpace.float(name=None)
|
||
|
|
||
|
# Float values to be preprocessed via featurewise standardization
|
||
|
# (i.e. via a `keras.layers.Normalization` layer).
|
||
|
FeatureSpace.float_normalized(name=None)
|
||
|
|
||
|
# Float values to be preprocessed via linear rescaling
|
||
|
# (i.e. via a `keras.layers.Rescaling` layer).
|
||
|
FeatureSpace.float_rescaled(scale=1., offset=0., name=None)
|
||
|
|
||
|
# Float values to be discretized. By default, the discrete
|
||
|
# representation will then be one-hot encoded.
|
||
|
FeatureSpace.float_discretized(
|
||
|
num_bins, bin_boundaries=None, output_mode="one_hot", name=None)
|
||
|
|
||
|
# Integer values to be indexed. By default, the discrete
|
||
|
# representation will then be one-hot encoded.
|
||
|
FeatureSpace.integer_categorical(
|
||
|
max_tokens=None, num_oov_indices=1, output_mode="one_hot", name=None)
|
||
|
|
||
|
# String values to be indexed. By default, the discrete
|
||
|
# representation will then be one-hot encoded.
|
||
|
FeatureSpace.string_categorical(
|
||
|
max_tokens=None, num_oov_indices=1, output_mode="one_hot", name=None)
|
||
|
|
||
|
# Integer values to be hashed into a fixed number of bins.
|
||
|
# By default, the discrete representation will then be one-hot encoded.
|
||
|
FeatureSpace.integer_hashed(num_bins, output_mode="one_hot", name=None)
|
||
|
|
||
|
# String values to be hashed into a fixed number of bins.
|
||
|
# By default, the discrete representation will then be one-hot encoded.
|
||
|
FeatureSpace.string_hashed(num_bins, output_mode="one_hot", name=None)
|
||
|
```
|
||
|
|
||
|
Examples:
|
||
|
|
||
|
**Basic usage with a dict of input data:**
|
||
|
|
||
|
```python
|
||
|
raw_data = {
|
||
|
"float_values": [0.0, 0.1, 0.2, 0.3],
|
||
|
"string_values": ["zero", "one", "two", "three"],
|
||
|
"int_values": [0, 1, 2, 3],
|
||
|
}
|
||
|
dataset = tf.data.Dataset.from_tensor_slices(raw_data)
|
||
|
|
||
|
feature_space = FeatureSpace(
|
||
|
features={
|
||
|
"float_values": "float_normalized",
|
||
|
"string_values": "string_categorical",
|
||
|
"int_values": "integer_categorical",
|
||
|
},
|
||
|
crosses=[("string_values", "int_values")],
|
||
|
output_mode="concat",
|
||
|
)
|
||
|
# Before you start using the FeatureSpace,
|
||
|
# you must `adapt()` it on some data.
|
||
|
feature_space.adapt(dataset)
|
||
|
|
||
|
# You can call the FeatureSpace on a dict of data (batched or unbatched).
|
||
|
output_vector = feature_space(raw_data)
|
||
|
```
|
||
|
|
||
|
**Basic usage with `tf.data`:**
|
||
|
|
||
|
```python
|
||
|
# Unlabeled data
|
||
|
preprocessed_ds = unlabeled_dataset.map(feature_space)
|
||
|
|
||
|
# Labeled data
|
||
|
preprocessed_ds = labeled_dataset.map(lambda x, y: (feature_space(x), y))
|
||
|
```
|
||
|
|
||
|
**Basic usage with the Keras Functional API:**
|
||
|
|
||
|
```python
|
||
|
# Retrieve a dict Keras Input objects
|
||
|
inputs = feature_space.get_inputs()
|
||
|
# Retrieve the corresponding encoded Keras tensors
|
||
|
encoded_features = feature_space.get_encoded_features()
|
||
|
# Build a Functional model
|
||
|
outputs = keras.layers.Dense(1, activation="sigmoid")(encoded_features)
|
||
|
model = keras.Model(inputs, outputs)
|
||
|
```
|
||
|
|
||
|
**Customizing each feature or feature cross:**
|
||
|
|
||
|
```python
|
||
|
feature_space = FeatureSpace(
|
||
|
features={
|
||
|
"float_values": FeatureSpace.float_normalized(),
|
||
|
"string_values": FeatureSpace.string_categorical(max_tokens=10),
|
||
|
"int_values": FeatureSpace.integer_categorical(max_tokens=10),
|
||
|
},
|
||
|
crosses=[
|
||
|
FeatureSpace.cross(("string_values", "int_values"), crossing_dim=32)
|
||
|
],
|
||
|
output_mode="concat",
|
||
|
)
|
||
|
```
|
||
|
|
||
|
**Returning a dict of integer-encoded features:**
|
||
|
|
||
|
```python
|
||
|
feature_space = FeatureSpace(
|
||
|
features={
|
||
|
"string_values": FeatureSpace.string_categorical(output_mode="int"),
|
||
|
"int_values": FeatureSpace.integer_categorical(output_mode="int"),
|
||
|
},
|
||
|
crosses=[
|
||
|
FeatureSpace.cross(
|
||
|
feature_names=("string_values", "int_values"),
|
||
|
crossing_dim=32,
|
||
|
output_mode="int",
|
||
|
)
|
||
|
],
|
||
|
output_mode="dict",
|
||
|
)
|
||
|
```
|
||
|
|
||
|
**Specifying your own Keras preprocessing layer:**
|
||
|
|
||
|
```python
|
||
|
# Let's say that one of the features is a short text paragraph that
|
||
|
# we want to encode as a vector (one vector per paragraph) via TF-IDF.
|
||
|
data = {
|
||
|
"text": ["1st string", "2nd string", "3rd string"],
|
||
|
}
|
||
|
|
||
|
# There's a Keras layer for this: TextVectorization.
|
||
|
custom_layer = layers.TextVectorization(output_mode="tf_idf")
|
||
|
|
||
|
# We can use FeatureSpace.feature to create a custom feature
|
||
|
# that will use our preprocessing layer.
|
||
|
feature_space = FeatureSpace(
|
||
|
features={
|
||
|
"text": FeatureSpace.feature(
|
||
|
preprocessor=custom_layer, dtype="string", output_mode="float"
|
||
|
),
|
||
|
},
|
||
|
output_mode="concat",
|
||
|
)
|
||
|
feature_space.adapt(tf.data.Dataset.from_tensor_slices(data))
|
||
|
output_vector = feature_space(data)
|
||
|
```
|
||
|
|
||
|
**Retrieving the underlying Keras preprocessing layers:**
|
||
|
|
||
|
```python
|
||
|
# The preprocessing layer of each feature is available in `.preprocessors`.
|
||
|
preprocessing_layer = feature_space.preprocessors["feature1"]
|
||
|
|
||
|
# The crossing layer of each feature cross is available in `.crossers`.
|
||
|
# It's an instance of keras.layers.HashedCrossing.
|
||
|
crossing_layer = feature_space.crossers["feature1_X_feature2"]
|
||
|
```
|
||
|
|
||
|
**Saving and reloading a FeatureSpace:**
|
||
|
|
||
|
```python
|
||
|
feature_space.save("myfeaturespace.keras")
|
||
|
reloaded_feature_space = keras.models.load_model("myfeaturespace.keras")
|
||
|
```
|
||
|
"""
|
||
|
|
||
|
@classmethod
|
||
|
def cross(cls, feature_names, crossing_dim, output_mode="one_hot"):
|
||
|
return Cross(feature_names, crossing_dim, output_mode=output_mode)
|
||
|
|
||
|
@classmethod
|
||
|
def feature(cls, dtype, preprocessor, output_mode):
|
||
|
return Feature(dtype, preprocessor, output_mode)
|
||
|
|
||
|
@classmethod
|
||
|
def float(cls, name=None):
|
||
|
from keras.layers.core import identity
|
||
|
|
||
|
name = name or backend.unique_object_name("float")
|
||
|
preprocessor = identity.Identity(
|
||
|
dtype="float32", name=f"{name}_preprocessor"
|
||
|
)
|
||
|
return Feature(
|
||
|
dtype="float32", preprocessor=preprocessor, output_mode="float"
|
||
|
)
|
||
|
|
||
|
@classmethod
|
||
|
def float_rescaled(cls, scale=1.0, offset=0.0, name=None):
|
||
|
name = name or backend.unique_object_name("float_rescaled")
|
||
|
preprocessor = layers.Rescaling(
|
||
|
scale=scale, offset=offset, name=f"{name}_preprocessor"
|
||
|
)
|
||
|
return Feature(
|
||
|
dtype="float32", preprocessor=preprocessor, output_mode="float"
|
||
|
)
|
||
|
|
||
|
@classmethod
|
||
|
def float_normalized(cls, name=None):
|
||
|
name = name or backend.unique_object_name("float_normalized")
|
||
|
preprocessor = layers.Normalization(
|
||
|
axis=-1, name=f"{name}_preprocessor"
|
||
|
)
|
||
|
return Feature(
|
||
|
dtype="float32", preprocessor=preprocessor, output_mode="float"
|
||
|
)
|
||
|
|
||
|
@classmethod
|
||
|
def float_discretized(
|
||
|
cls, num_bins, bin_boundaries=None, output_mode="one_hot", name=None
|
||
|
):
|
||
|
name = name or backend.unique_object_name("float_discretized")
|
||
|
preprocessor = layers.Discretization(
|
||
|
num_bins=num_bins,
|
||
|
bin_boundaries=bin_boundaries,
|
||
|
name=f"{name}_preprocessor",
|
||
|
)
|
||
|
return Feature(
|
||
|
dtype="float32", preprocessor=preprocessor, output_mode=output_mode
|
||
|
)
|
||
|
|
||
|
@classmethod
|
||
|
def integer_categorical(
|
||
|
cls,
|
||
|
max_tokens=None,
|
||
|
num_oov_indices=1,
|
||
|
output_mode="one_hot",
|
||
|
name=None,
|
||
|
):
|
||
|
name = name or backend.unique_object_name("integer_categorical")
|
||
|
preprocessor = layers.IntegerLookup(
|
||
|
name=f"{name}_preprocessor",
|
||
|
max_tokens=max_tokens,
|
||
|
num_oov_indices=num_oov_indices,
|
||
|
)
|
||
|
return Feature(
|
||
|
dtype="int64", preprocessor=preprocessor, output_mode=output_mode
|
||
|
)
|
||
|
|
||
|
@classmethod
|
||
|
def string_categorical(
|
||
|
cls,
|
||
|
max_tokens=None,
|
||
|
num_oov_indices=1,
|
||
|
output_mode="one_hot",
|
||
|
name=None,
|
||
|
):
|
||
|
name = name or backend.unique_object_name("string_categorical")
|
||
|
preprocessor = layers.StringLookup(
|
||
|
name=f"{name}_preprocessor",
|
||
|
max_tokens=max_tokens,
|
||
|
num_oov_indices=num_oov_indices,
|
||
|
)
|
||
|
return Feature(
|
||
|
dtype="string", preprocessor=preprocessor, output_mode=output_mode
|
||
|
)
|
||
|
|
||
|
@classmethod
|
||
|
def string_hashed(cls, num_bins, output_mode="one_hot", name=None):
|
||
|
name = name or backend.unique_object_name("string_hashed")
|
||
|
preprocessor = layers.Hashing(
|
||
|
name=f"{name}_preprocessor", num_bins=num_bins
|
||
|
)
|
||
|
return Feature(
|
||
|
dtype="string", preprocessor=preprocessor, output_mode=output_mode
|
||
|
)
|
||
|
|
||
|
@classmethod
|
||
|
def integer_hashed(cls, num_bins, output_mode="one_hot", name=None):
|
||
|
name = name or backend.unique_object_name("integer_hashed")
|
||
|
preprocessor = layers.Hashing(
|
||
|
name=f"{name}_preprocessor", num_bins=num_bins
|
||
|
)
|
||
|
return Feature(
|
||
|
dtype="int64", preprocessor=preprocessor, output_mode=output_mode
|
||
|
)
|
||
|
|
||
|
def __init__(
|
||
|
self,
|
||
|
features,
|
||
|
output_mode="concat",
|
||
|
crosses=None,
|
||
|
crossing_dim=32,
|
||
|
hashing_dim=32,
|
||
|
num_discretization_bins=32,
|
||
|
):
|
||
|
if not features:
|
||
|
raise ValueError("The `features` argument cannot be None or empty.")
|
||
|
self.crossing_dim = crossing_dim
|
||
|
self.hashing_dim = hashing_dim
|
||
|
self.num_discretization_bins = num_discretization_bins
|
||
|
self.features = {
|
||
|
name: self._standardize_feature(name, value)
|
||
|
for name, value in features.items()
|
||
|
}
|
||
|
self.crosses = []
|
||
|
if crosses:
|
||
|
feature_set = set(features.keys())
|
||
|
for cross in crosses:
|
||
|
if isinstance(cross, dict):
|
||
|
cross = serialization_lib.deserialize_keras_object(cross)
|
||
|
if isinstance(cross, Cross):
|
||
|
self.crosses.append(cross)
|
||
|
else:
|
||
|
if not crossing_dim:
|
||
|
raise ValueError(
|
||
|
"When specifying `crosses`, the argument "
|
||
|
"`crossing_dim` "
|
||
|
"(dimensionality of the crossing space) "
|
||
|
"should be specified as well."
|
||
|
)
|
||
|
for key in cross:
|
||
|
if key not in feature_set:
|
||
|
raise ValueError(
|
||
|
"All features referenced "
|
||
|
"in the `crosses` argument "
|
||
|
"should be present in the `features` dict. "
|
||
|
f"Received unknown features: {cross}"
|
||
|
)
|
||
|
self.crosses.append(Cross(cross, crossing_dim=crossing_dim))
|
||
|
self.crosses_by_name = {cross.name: cross for cross in self.crosses}
|
||
|
|
||
|
if output_mode not in {"dict", "concat"}:
|
||
|
raise ValueError(
|
||
|
"Invalid value for argument `output_mode`. "
|
||
|
"Expected one of {'dict', 'concat'}. "
|
||
|
f"Received: output_mode={output_mode}"
|
||
|
)
|
||
|
self.output_mode = output_mode
|
||
|
|
||
|
self.inputs = {
|
||
|
name: self._feature_to_input(name, value)
|
||
|
for name, value in self.features.items()
|
||
|
}
|
||
|
self.preprocessors = {
|
||
|
name: value.preprocessor for name, value in self.features.items()
|
||
|
}
|
||
|
self.encoded_features = None
|
||
|
self.crossers = {
|
||
|
cross.name: self._cross_to_crosser(cross) for cross in self.crosses
|
||
|
}
|
||
|
self.one_hot_encoders = {}
|
||
|
self.built = False
|
||
|
self._is_adapted = False
|
||
|
self.concat = None
|
||
|
self._preprocessed_features_names = None
|
||
|
self._crossed_features_names = None
|
||
|
|
||
|
def _feature_to_input(self, name, feature):
|
||
|
return layers.Input(shape=(1,), dtype=feature.dtype, name=name)
|
||
|
|
||
|
def _standardize_feature(self, name, feature):
|
||
|
if isinstance(feature, Feature):
|
||
|
return feature
|
||
|
|
||
|
if isinstance(feature, dict):
|
||
|
return serialization_lib.deserialize_keras_object(feature)
|
||
|
|
||
|
if feature == "float":
|
||
|
return self.float(name=name)
|
||
|
elif feature == "float_normalized":
|
||
|
return self.float_normalized(name=name)
|
||
|
elif feature == "float_rescaled":
|
||
|
return self.float_rescaled(name=name)
|
||
|
elif feature == "float_discretized":
|
||
|
return self.float_discretized(
|
||
|
name=name, num_bins=self.num_discretization_bins
|
||
|
)
|
||
|
elif feature == "integer_categorical":
|
||
|
return self.integer_categorical(name=name)
|
||
|
elif feature == "string_categorical":
|
||
|
return self.string_categorical(name=name)
|
||
|
elif feature == "integer_hashed":
|
||
|
return self.integer_hashed(self.hashing_dim, name=name)
|
||
|
elif feature == "string_hashed":
|
||
|
return self.string_hashed(self.hashing_dim, name=name)
|
||
|
else:
|
||
|
raise ValueError(f"Invalid feature type: {feature}")
|
||
|
|
||
|
def _cross_to_crosser(self, cross):
|
||
|
return layers.HashedCrossing(cross.crossing_dim, name=cross.name)
|
||
|
|
||
|
def _list_adaptable_preprocessors(self):
|
||
|
adaptable_preprocessors = []
|
||
|
for name in self.features.keys():
|
||
|
preprocessor = self.preprocessors[name]
|
||
|
# Special case: a Normalization layer with preset mean/variance.
|
||
|
# Not adaptable.
|
||
|
if isinstance(preprocessor, layers.Normalization):
|
||
|
if preprocessor.input_mean is not None:
|
||
|
continue
|
||
|
if hasattr(preprocessor, "adapt"):
|
||
|
adaptable_preprocessors.append(name)
|
||
|
return adaptable_preprocessors
|
||
|
|
||
|
def adapt(self, dataset):
|
||
|
if not isinstance(dataset, tf.data.Dataset):
|
||
|
raise ValueError(
|
||
|
"`adapt()` can only be called on a tf.data.Dataset. "
|
||
|
f"Received instead: {dataset} (of type {type(dataset)})"
|
||
|
)
|
||
|
|
||
|
for name in self._list_adaptable_preprocessors():
|
||
|
# Call adapt() on each individual adaptable layer.
|
||
|
|
||
|
# TODO: consider rewriting this to instead iterate on the
|
||
|
# dataset once, split each batch into individual features,
|
||
|
# and call the layer's `_adapt_function` on each batch
|
||
|
# to simulate the behavior of adapt() in a more performant fashion.
|
||
|
|
||
|
feature_dataset = dataset.map(lambda x: x[name])
|
||
|
preprocessor = self.preprocessors[name]
|
||
|
# TODO: consider adding an adapt progress bar.
|
||
|
# Sample 1 element to check the rank
|
||
|
for x in feature_dataset.take(1):
|
||
|
pass
|
||
|
if x.shape.rank == 0:
|
||
|
# The dataset yields unbatched scalars; batch it.
|
||
|
feature_dataset = feature_dataset.batch(32)
|
||
|
if x.shape.rank in {0, 1}:
|
||
|
# If the rank is 1, add a dimension
|
||
|
# so we can reduce on axis=-1.
|
||
|
# Note: if rank was previously 0, it is now 1.
|
||
|
feature_dataset = feature_dataset.map(
|
||
|
lambda x: tf.expand_dims(x, -1)
|
||
|
)
|
||
|
preprocessor.adapt(feature_dataset)
|
||
|
self._is_adapted = True
|
||
|
self.get_encoded_features() # Finish building the layer
|
||
|
self.built = True
|
||
|
|
||
|
def get_inputs(self):
|
||
|
self._check_if_built()
|
||
|
return self.inputs
|
||
|
|
||
|
def get_encoded_features(self):
|
||
|
self._check_if_adapted()
|
||
|
|
||
|
if self.encoded_features is None:
|
||
|
preprocessed_features = self._preprocess_features(self.inputs)
|
||
|
crossed_features = self._cross_features(preprocessed_features)
|
||
|
merged_features = self._merge_features(
|
||
|
preprocessed_features, crossed_features
|
||
|
)
|
||
|
self.encoded_features = merged_features
|
||
|
return self.encoded_features
|
||
|
|
||
|
def _preprocess_features(self, features):
|
||
|
return {
|
||
|
name: self.preprocessors[name](features[name])
|
||
|
for name in features.keys()
|
||
|
}
|
||
|
|
||
|
def _cross_features(self, features):
|
||
|
all_outputs = {}
|
||
|
for cross in self.crosses:
|
||
|
inputs = [features[name] for name in cross.feature_names]
|
||
|
outputs = self.crossers[cross.name](inputs)
|
||
|
all_outputs[cross.name] = outputs
|
||
|
return all_outputs
|
||
|
|
||
|
def _merge_features(self, preprocessed_features, crossed_features):
|
||
|
if not self._preprocessed_features_names:
|
||
|
self._preprocessed_features_names = sorted(
|
||
|
preprocessed_features.keys()
|
||
|
)
|
||
|
self._crossed_features_names = sorted(crossed_features.keys())
|
||
|
|
||
|
all_names = (
|
||
|
self._preprocessed_features_names + self._crossed_features_names
|
||
|
)
|
||
|
all_features = [
|
||
|
preprocessed_features[name]
|
||
|
for name in self._preprocessed_features_names
|
||
|
] + [crossed_features[name] for name in self._crossed_features_names]
|
||
|
|
||
|
if self.output_mode == "dict":
|
||
|
output_dict = {}
|
||
|
else:
|
||
|
features_to_concat = []
|
||
|
|
||
|
if self.built:
|
||
|
# Fast mode.
|
||
|
for name, feature in zip(all_names, all_features):
|
||
|
encoder = self.one_hot_encoders.get(name, None)
|
||
|
if encoder:
|
||
|
feature = encoder(feature)
|
||
|
if self.output_mode == "dict":
|
||
|
output_dict[name] = feature
|
||
|
else:
|
||
|
features_to_concat.append(feature)
|
||
|
if self.output_mode == "dict":
|
||
|
return output_dict
|
||
|
else:
|
||
|
return self.concat(features_to_concat)
|
||
|
|
||
|
# If the object isn't built,
|
||
|
# we create the encoder and concat layers below
|
||
|
all_specs = [
|
||
|
self.features[name] for name in self._preprocessed_features_names
|
||
|
] + [
|
||
|
self.crosses_by_name[name] for name in self._crossed_features_names
|
||
|
]
|
||
|
for name, feature, spec in zip(all_names, all_features, all_specs):
|
||
|
dtype = feature.dtype.name
|
||
|
|
||
|
if spec.output_mode == "one_hot":
|
||
|
preprocessor = self.preprocessors.get(
|
||
|
name
|
||
|
) or self.crossers.get(name)
|
||
|
cardinality = None
|
||
|
if not feature.dtype.name.startswith("int"):
|
||
|
raise ValueError(
|
||
|
f"Feature '{name}' has `output_mode='one_hot'`. "
|
||
|
"Thus its preprocessor should return an int64 dtype. "
|
||
|
f"Instead it returns a {dtype} dtype."
|
||
|
)
|
||
|
|
||
|
if isinstance(
|
||
|
preprocessor, (layers.IntegerLookup, layers.StringLookup)
|
||
|
):
|
||
|
cardinality = preprocessor.vocabulary_size()
|
||
|
elif isinstance(preprocessor, layers.CategoryEncoding):
|
||
|
cardinality = preprocessor.num_tokens
|
||
|
elif isinstance(preprocessor, layers.Discretization):
|
||
|
cardinality = preprocessor.num_bins
|
||
|
elif isinstance(
|
||
|
preprocessor, (layers.HashedCrossing, layers.Hashing)
|
||
|
):
|
||
|
cardinality = preprocessor.num_bins
|
||
|
else:
|
||
|
raise ValueError(
|
||
|
f"Feature '{name}' has `output_mode='one_hot'`. "
|
||
|
"However it isn't a standard feature and the "
|
||
|
"dimensionality of its output space is not known, "
|
||
|
"thus it cannot be one-hot encoded. "
|
||
|
"Try using `output_mode='int'`."
|
||
|
)
|
||
|
if cardinality is not None:
|
||
|
encoder = layers.CategoryEncoding(
|
||
|
num_tokens=cardinality, output_mode="multi_hot"
|
||
|
)
|
||
|
self.one_hot_encoders[name] = encoder
|
||
|
feature = encoder(feature)
|
||
|
|
||
|
if self.output_mode == "concat":
|
||
|
dtype = feature.dtype.name
|
||
|
if dtype.startswith("int") or dtype == "string":
|
||
|
raise ValueError(
|
||
|
f"Cannot concatenate features because feature '{name}' "
|
||
|
f"has not been encoded (it has dtype {dtype}). "
|
||
|
"Consider using `output_mode='dict'`."
|
||
|
)
|
||
|
features_to_concat.append(feature)
|
||
|
else:
|
||
|
output_dict[name] = feature
|
||
|
|
||
|
if self.output_mode == "concat":
|
||
|
self.concat = layers.Concatenate(axis=-1)
|
||
|
return self.concat(features_to_concat)
|
||
|
else:
|
||
|
return output_dict
|
||
|
|
||
|
def _check_if_adapted(self):
|
||
|
if not self._is_adapted:
|
||
|
if not self._list_adaptable_preprocessors():
|
||
|
self._is_adapted = True
|
||
|
else:
|
||
|
raise ValueError(
|
||
|
"You need to call `.adapt(dataset)` on the FeatureSpace "
|
||
|
"before you can start using it."
|
||
|
)
|
||
|
|
||
|
def _check_if_built(self):
|
||
|
if not self.built:
|
||
|
self._check_if_adapted()
|
||
|
# Finishes building
|
||
|
self.get_encoded_features()
|
||
|
self.built = True
|
||
|
|
||
|
def __call__(self, data):
|
||
|
self._check_if_built()
|
||
|
if not isinstance(data, dict):
|
||
|
raise ValueError(
|
||
|
"A FeatureSpace can only be called with a dict. "
|
||
|
f"Received: data={data} (of type {type(data)}"
|
||
|
)
|
||
|
|
||
|
data = {key: tf.convert_to_tensor(value) for key, value in data.items()}
|
||
|
rebatched = False
|
||
|
for name, x in data.items():
|
||
|
if x.shape.rank == 0:
|
||
|
data[name] = tf.reshape(x, [1, 1])
|
||
|
rebatched = True
|
||
|
elif x.shape.rank == 1:
|
||
|
data[name] = tf.expand_dims(x, -1)
|
||
|
|
||
|
preprocessed_data = self._preprocess_features(data)
|
||
|
crossed_data = self._cross_features(preprocessed_data)
|
||
|
merged_data = self._merge_features(preprocessed_data, crossed_data)
|
||
|
if rebatched:
|
||
|
if self.output_mode == "concat":
|
||
|
assert merged_data.shape[0] == 1
|
||
|
return tf.squeeze(merged_data, axis=0)
|
||
|
else:
|
||
|
for name, x in merged_data.items():
|
||
|
if x.shape.rank == 2 and x.shape[0] == 1:
|
||
|
merged_data[name] = tf.squeeze(x, axis=0)
|
||
|
return merged_data
|
||
|
|
||
|
def get_config(self):
|
||
|
return {
|
||
|
"features": serialization_lib.serialize_keras_object(self.features),
|
||
|
"output_mode": self.output_mode,
|
||
|
"crosses": serialization_lib.serialize_keras_object(self.crosses),
|
||
|
"crossing_dim": self.crossing_dim,
|
||
|
"hashing_dim": self.hashing_dim,
|
||
|
"num_discretization_bins": self.num_discretization_bins,
|
||
|
}
|
||
|
|
||
|
@classmethod
|
||
|
def from_config(cls, config):
|
||
|
return cls(**config)
|
||
|
|
||
|
def get_build_config(self):
|
||
|
return {
|
||
|
name: feature.preprocessor.get_build_config()
|
||
|
for name, feature in self.features.items()
|
||
|
}
|
||
|
|
||
|
def build_from_config(self, config):
|
||
|
for name in config.keys():
|
||
|
self.features[name].preprocessor.build_from_config(config[name])
|
||
|
self._is_adapted = True
|
||
|
|
||
|
def save(self, filepath):
|
||
|
"""Save the `FeatureSpace` instance to a `.keras` file.
|
||
|
|
||
|
You can reload it via `keras.models.load_model()`:
|
||
|
|
||
|
```python
|
||
|
feature_space.save("myfeaturespace.keras")
|
||
|
reloaded_feature_space = keras.models.load_model("myfeaturespace.keras")
|
||
|
```
|
||
|
"""
|
||
|
saving_lib.save_model(self, filepath)
|
||
|
|
||
|
def _save_own_variables(self, store):
|
||
|
return
|
||
|
|
||
|
def _load_own_variables(self, store):
|
||
|
return
|