# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""FeatureSpace structured data preprocessing & encoding utility."""

import tensorflow.compat.v2 as tf

from keras import backend
from keras.engine import base_layer
from keras.saving import saving_lib
from keras.saving import serialization_lib
from keras.utils.generic_utils import LazyLoader

# isort: off
from tensorflow.python.util.tf_export import keras_export

layers = LazyLoader("layers", globals(), "keras.layers")


class Cross:
    def __init__(self, feature_names, crossing_dim, output_mode="one_hot"):
        if output_mode not in {"int", "one_hot"}:
            raise ValueError(
                "Invalid value for argument `output_mode`. "
                "Expected one of {'int', 'one_hot'}. "
                f"Received: output_mode={output_mode}"
            )
        self.feature_names = tuple(feature_names)
        self.crossing_dim = crossing_dim
        self.output_mode = output_mode

    @property
    def name(self):
        return "_X_".join(self.feature_names)

    def get_config(self):
        return {
            "feature_names": self.feature_names,
            "crossing_dim": self.crossing_dim,
            "output_mode": self.output_mode,
        }

    @classmethod
    def from_config(cls, config):
        return cls(**config)


class Feature:
    def __init__(self, dtype, preprocessor, output_mode):
        if output_mode not in {"int", "one_hot", "float"}:
            raise ValueError(
                "Invalid value for argument `output_mode`. "
                "Expected one of {'int', 'one_hot', 'float'}. "
                f"Received: output_mode={output_mode}"
            )
        self.dtype = dtype
        if isinstance(preprocessor, dict):
            preprocessor = serialization_lib.deserialize_keras_object(
                preprocessor
            )
        self.preprocessor = preprocessor
        self.output_mode = output_mode

    def get_config(self):
        return {
            "dtype": self.dtype,
            "preprocessor": serialization_lib.serialize_keras_object(
                self.preprocessor
            ),
            "output_mode": self.output_mode,
        }

    @classmethod
    def from_config(cls, config):
        return cls(**config)


@keras_export("keras.utils.FeatureSpace", v1=[])
class FeatureSpace(base_layer.Layer):
    """One-stop utility for preprocessing and encoding structured data.

    Arguments:
        feature_names: Dict mapping the names of your features to their
            type specification, e.g. `{"my_feature": "integer_categorical"}`
            or `{"my_feature": FeatureSpace.integer_categorical()}`.
            For a complete list of all supported types, see
            "Available feature types" paragraph below.
        output_mode: One of `"concat"` or `"dict"`. In concat mode, all
            features get concatenated together into a single vector.
            In dict mode, the FeatureSpace returns a dict of individually
            encoded features (with the same keys as the input dict keys).
        crosses: List of features to be crossed together, e.g.
            `crosses=[("feature_1", "feature_2")]`. The features will be
            "crossed" by hashing their combined value into
            a fixed-length vector.
        crossing_dim: Default vector size for hashing crossed features.
            Defaults to 32.
        hashing_dim: Default vector size for hashing features of type
            `"integer_hashed"` and `"string_hashed"`. Defaults to 32.
        num_discretization_bins: Default number of bins to be used for
            discretizing features of type `"float_discretized"`.
            Defaults to 32.

    **Available feature types:**

    Note that all features can be referred to by their string name,
    e.g. `"integer_categorical"`. When using the string name, the default
    argument values are used.

    ```python
    # Plain float values.
    FeatureSpace.float(name=None)

    # Float values to be preprocessed via featurewise standardization
    # (i.e. via a `keras.layers.Normalization` layer).
    FeatureSpace.float_normalized(name=None)

    # Float values to be preprocessed via linear rescaling
    # (i.e. via a `keras.layers.Rescaling` layer).
    FeatureSpace.float_rescaled(scale=1., offset=0., name=None)

    # Float values to be discretized. By default, the discrete
    # representation will then be one-hot encoded.
    FeatureSpace.float_discretized(
        num_bins, bin_boundaries=None, output_mode="one_hot", name=None)

    # Integer values to be indexed. By default, the discrete
    # representation will then be one-hot encoded.
    FeatureSpace.integer_categorical(
        max_tokens=None, num_oov_indices=1, output_mode="one_hot", name=None)

    # String values to be indexed. By default, the discrete
    # representation will then be one-hot encoded.
    FeatureSpace.string_categorical(
        max_tokens=None, num_oov_indices=1, output_mode="one_hot", name=None)

    # Integer values to be hashed into a fixed number of bins.
    # By default, the discrete representation will then be one-hot encoded.
    FeatureSpace.integer_hashed(num_bins, output_mode="one_hot", name=None)

    # String values to be hashed into a fixed number of bins.
    # By default, the discrete representation will then be one-hot encoded.
    FeatureSpace.string_hashed(num_bins, output_mode="one_hot", name=None)
    ```

    Examples:

    **Basic usage with a dict of input data:**

    ```python
    raw_data = {
        "float_values": [0.0, 0.1, 0.2, 0.3],
        "string_values": ["zero", "one", "two", "three"],
        "int_values": [0, 1, 2, 3],
    }
    dataset = tf.data.Dataset.from_tensor_slices(raw_data)

    feature_space = FeatureSpace(
        features={
            "float_values": "float_normalized",
            "string_values": "string_categorical",
            "int_values": "integer_categorical",
        },
        crosses=[("string_values", "int_values")],
        output_mode="concat",
    )
    # Before you start using the FeatureSpace,
    # you must `adapt()` it on some data.
    feature_space.adapt(dataset)

    # You can call the FeatureSpace on a dict of data (batched or unbatched).
    output_vector = feature_space(raw_data)
    ```

    **Basic usage with `tf.data`:**

    ```python
    # Unlabeled data
    preprocessed_ds = unlabeled_dataset.map(feature_space)

    # Labeled data
    preprocessed_ds = labeled_dataset.map(lambda x, y: (feature_space(x), y))
    ```

    **Basic usage with the Keras Functional API:**

    ```python
    # Retrieve a dict Keras Input objects
    inputs = feature_space.get_inputs()
    # Retrieve the corresponding encoded Keras tensors
    encoded_features = feature_space.get_encoded_features()
    # Build a Functional model
    outputs = keras.layers.Dense(1, activation="sigmoid")(encoded_features)
    model = keras.Model(inputs, outputs)
    ```

    **Customizing each feature or feature cross:**

    ```python
    feature_space = FeatureSpace(
        features={
            "float_values": FeatureSpace.float_normalized(),
            "string_values": FeatureSpace.string_categorical(max_tokens=10),
            "int_values": FeatureSpace.integer_categorical(max_tokens=10),
        },
        crosses=[
            FeatureSpace.cross(("string_values", "int_values"), crossing_dim=32)
        ],
        output_mode="concat",
    )
    ```

    **Returning a dict of integer-encoded features:**

    ```python
    feature_space = FeatureSpace(
        features={
            "string_values": FeatureSpace.string_categorical(output_mode="int"),
            "int_values": FeatureSpace.integer_categorical(output_mode="int"),
        },
        crosses=[
            FeatureSpace.cross(
                feature_names=("string_values", "int_values"),
                crossing_dim=32,
                output_mode="int",
            )
        ],
        output_mode="dict",
    )
    ```

    **Specifying your own Keras preprocessing layer:**

    ```python
    # Let's say that one of the features is a short text paragraph that
    # we want to encode as a vector (one vector per paragraph) via TF-IDF.
    data = {
        "text": ["1st string", "2nd string", "3rd string"],
    }

    # There's a Keras layer for this: TextVectorization.
    custom_layer = layers.TextVectorization(output_mode="tf_idf")

    # We can use FeatureSpace.feature to create a custom feature
    # that will use our preprocessing layer.
    feature_space = FeatureSpace(
        features={
            "text": FeatureSpace.feature(
                preprocessor=custom_layer, dtype="string", output_mode="float"
            ),
        },
        output_mode="concat",
    )
    feature_space.adapt(tf.data.Dataset.from_tensor_slices(data))
    output_vector = feature_space(data)
    ```

    **Retrieving the underlying Keras preprocessing layers:**

    ```python
    # The preprocessing layer of each feature is available in `.preprocessors`.
    preprocessing_layer = feature_space.preprocessors["feature1"]

    # The crossing layer of each feature cross is available in `.crossers`.
    # It's an instance of keras.layers.HashedCrossing.
    crossing_layer = feature_space.crossers["feature1_X_feature2"]
    ```

    **Saving and reloading a FeatureSpace:**

    ```python
    feature_space.save("myfeaturespace.keras")
    reloaded_feature_space = keras.models.load_model("myfeaturespace.keras")
    ```
    """

    @classmethod
    def cross(cls, feature_names, crossing_dim, output_mode="one_hot"):
        return Cross(feature_names, crossing_dim, output_mode=output_mode)

    @classmethod
    def feature(cls, dtype, preprocessor, output_mode):
        return Feature(dtype, preprocessor, output_mode)

    @classmethod
    def float(cls, name=None):
        from keras.layers.core import identity

        name = name or backend.unique_object_name("float")
        preprocessor = identity.Identity(
            dtype="float32", name=f"{name}_preprocessor"
        )
        return Feature(
            dtype="float32", preprocessor=preprocessor, output_mode="float"
        )

    @classmethod
    def float_rescaled(cls, scale=1.0, offset=0.0, name=None):
        name = name or backend.unique_object_name("float_rescaled")
        preprocessor = layers.Rescaling(
            scale=scale, offset=offset, name=f"{name}_preprocessor"
        )
        return Feature(
            dtype="float32", preprocessor=preprocessor, output_mode="float"
        )

    @classmethod
    def float_normalized(cls, name=None):
        name = name or backend.unique_object_name("float_normalized")
        preprocessor = layers.Normalization(
            axis=-1, name=f"{name}_preprocessor"
        )
        return Feature(
            dtype="float32", preprocessor=preprocessor, output_mode="float"
        )

    @classmethod
    def float_discretized(
        cls, num_bins, bin_boundaries=None, output_mode="one_hot", name=None
    ):
        name = name or backend.unique_object_name("float_discretized")
        preprocessor = layers.Discretization(
            num_bins=num_bins,
            bin_boundaries=bin_boundaries,
            name=f"{name}_preprocessor",
        )
        return Feature(
            dtype="float32", preprocessor=preprocessor, output_mode=output_mode
        )

    @classmethod
    def integer_categorical(
        cls,
        max_tokens=None,
        num_oov_indices=1,
        output_mode="one_hot",
        name=None,
    ):
        name = name or backend.unique_object_name("integer_categorical")
        preprocessor = layers.IntegerLookup(
            name=f"{name}_preprocessor",
            max_tokens=max_tokens,
            num_oov_indices=num_oov_indices,
        )
        return Feature(
            dtype="int64", preprocessor=preprocessor, output_mode=output_mode
        )

    @classmethod
    def string_categorical(
        cls,
        max_tokens=None,
        num_oov_indices=1,
        output_mode="one_hot",
        name=None,
    ):
        name = name or backend.unique_object_name("string_categorical")
        preprocessor = layers.StringLookup(
            name=f"{name}_preprocessor",
            max_tokens=max_tokens,
            num_oov_indices=num_oov_indices,
        )
        return Feature(
            dtype="string", preprocessor=preprocessor, output_mode=output_mode
        )

    @classmethod
    def string_hashed(cls, num_bins, output_mode="one_hot", name=None):
        name = name or backend.unique_object_name("string_hashed")
        preprocessor = layers.Hashing(
            name=f"{name}_preprocessor", num_bins=num_bins
        )
        return Feature(
            dtype="string", preprocessor=preprocessor, output_mode=output_mode
        )

    @classmethod
    def integer_hashed(cls, num_bins, output_mode="one_hot", name=None):
        name = name or backend.unique_object_name("integer_hashed")
        preprocessor = layers.Hashing(
            name=f"{name}_preprocessor", num_bins=num_bins
        )
        return Feature(
            dtype="int64", preprocessor=preprocessor, output_mode=output_mode
        )

    def __init__(
        self,
        features,
        output_mode="concat",
        crosses=None,
        crossing_dim=32,
        hashing_dim=32,
        num_discretization_bins=32,
    ):
        if not features:
            raise ValueError("The `features` argument cannot be None or empty.")
        self.crossing_dim = crossing_dim
        self.hashing_dim = hashing_dim
        self.num_discretization_bins = num_discretization_bins
        self.features = {
            name: self._standardize_feature(name, value)
            for name, value in features.items()
        }
        self.crosses = []
        if crosses:
            feature_set = set(features.keys())
            for cross in crosses:
                if isinstance(cross, dict):
                    cross = serialization_lib.deserialize_keras_object(cross)
                if isinstance(cross, Cross):
                    self.crosses.append(cross)
                else:
                    if not crossing_dim:
                        raise ValueError(
                            "When specifying `crosses`, the argument "
                            "`crossing_dim` "
                            "(dimensionality of the crossing space) "
                            "should be specified as well."
                        )
                    for key in cross:
                        if key not in feature_set:
                            raise ValueError(
                                "All features referenced "
                                "in the `crosses` argument "
                                "should be present in the `features` dict. "
                                f"Received unknown features: {cross}"
                            )
                    self.crosses.append(Cross(cross, crossing_dim=crossing_dim))
        self.crosses_by_name = {cross.name: cross for cross in self.crosses}

        if output_mode not in {"dict", "concat"}:
            raise ValueError(
                "Invalid value for argument `output_mode`. "
                "Expected one of {'dict', 'concat'}. "
                f"Received: output_mode={output_mode}"
            )
        self.output_mode = output_mode

        self.inputs = {
            name: self._feature_to_input(name, value)
            for name, value in self.features.items()
        }
        self.preprocessors = {
            name: value.preprocessor for name, value in self.features.items()
        }
        self.encoded_features = None
        self.crossers = {
            cross.name: self._cross_to_crosser(cross) for cross in self.crosses
        }
        self.one_hot_encoders = {}
        self.built = False
        self._is_adapted = False
        self.concat = None
        self._preprocessed_features_names = None
        self._crossed_features_names = None

    def _feature_to_input(self, name, feature):
        return layers.Input(shape=(1,), dtype=feature.dtype, name=name)

    def _standardize_feature(self, name, feature):
        if isinstance(feature, Feature):
            return feature

        if isinstance(feature, dict):
            return serialization_lib.deserialize_keras_object(feature)

        if feature == "float":
            return self.float(name=name)
        elif feature == "float_normalized":
            return self.float_normalized(name=name)
        elif feature == "float_rescaled":
            return self.float_rescaled(name=name)
        elif feature == "float_discretized":
            return self.float_discretized(
                name=name, num_bins=self.num_discretization_bins
            )
        elif feature == "integer_categorical":
            return self.integer_categorical(name=name)
        elif feature == "string_categorical":
            return self.string_categorical(name=name)
        elif feature == "integer_hashed":
            return self.integer_hashed(self.hashing_dim, name=name)
        elif feature == "string_hashed":
            return self.string_hashed(self.hashing_dim, name=name)
        else:
            raise ValueError(f"Invalid feature type: {feature}")

    def _cross_to_crosser(self, cross):
        return layers.HashedCrossing(cross.crossing_dim, name=cross.name)

    def _list_adaptable_preprocessors(self):
        adaptable_preprocessors = []
        for name in self.features.keys():
            preprocessor = self.preprocessors[name]
            # Special case: a Normalization layer with preset mean/variance.
            # Not adaptable.
            if isinstance(preprocessor, layers.Normalization):
                if preprocessor.input_mean is not None:
                    continue
            if hasattr(preprocessor, "adapt"):
                adaptable_preprocessors.append(name)
        return adaptable_preprocessors

    def adapt(self, dataset):
        if not isinstance(dataset, tf.data.Dataset):
            raise ValueError(
                "`adapt()` can only be called on a tf.data.Dataset. "
                f"Received instead: {dataset} (of type {type(dataset)})"
            )

        for name in self._list_adaptable_preprocessors():
            # Call adapt() on each individual adaptable layer.

            # TODO: consider rewriting this to instead iterate on the
            # dataset once, split each batch into individual features,
            # and call the layer's `_adapt_function` on each batch
            # to simulate the behavior of adapt() in a more performant fashion.

            feature_dataset = dataset.map(lambda x: x[name])
            preprocessor = self.preprocessors[name]
            # TODO: consider adding an adapt progress bar.
            # Sample 1 element to check the rank
            for x in feature_dataset.take(1):
                pass
            if x.shape.rank == 0:
                # The dataset yields unbatched scalars; batch it.
                feature_dataset = feature_dataset.batch(32)
            if x.shape.rank in {0, 1}:
                # If the rank is 1, add a dimension
                # so we can reduce on axis=-1.
                # Note: if rank was previously 0, it is now 1.
                feature_dataset = feature_dataset.map(
                    lambda x: tf.expand_dims(x, -1)
                )
            preprocessor.adapt(feature_dataset)
        self._is_adapted = True
        self.get_encoded_features()  # Finish building the layer
        self.built = True

    def get_inputs(self):
        self._check_if_built()
        return self.inputs

    def get_encoded_features(self):
        self._check_if_adapted()

        if self.encoded_features is None:
            preprocessed_features = self._preprocess_features(self.inputs)
            crossed_features = self._cross_features(preprocessed_features)
            merged_features = self._merge_features(
                preprocessed_features, crossed_features
            )
            self.encoded_features = merged_features
        return self.encoded_features

    def _preprocess_features(self, features):
        return {
            name: self.preprocessors[name](features[name])
            for name in features.keys()
        }

    def _cross_features(self, features):
        all_outputs = {}
        for cross in self.crosses:
            inputs = [features[name] for name in cross.feature_names]
            outputs = self.crossers[cross.name](inputs)
            all_outputs[cross.name] = outputs
        return all_outputs

    def _merge_features(self, preprocessed_features, crossed_features):
        if not self._preprocessed_features_names:
            self._preprocessed_features_names = sorted(
                preprocessed_features.keys()
            )
            self._crossed_features_names = sorted(crossed_features.keys())

        all_names = (
            self._preprocessed_features_names + self._crossed_features_names
        )
        all_features = [
            preprocessed_features[name]
            for name in self._preprocessed_features_names
        ] + [crossed_features[name] for name in self._crossed_features_names]

        if self.output_mode == "dict":
            output_dict = {}
        else:
            features_to_concat = []

        if self.built:
            # Fast mode.
            for name, feature in zip(all_names, all_features):
                encoder = self.one_hot_encoders.get(name, None)
                if encoder:
                    feature = encoder(feature)
                if self.output_mode == "dict":
                    output_dict[name] = feature
                else:
                    features_to_concat.append(feature)
            if self.output_mode == "dict":
                return output_dict
            else:
                return self.concat(features_to_concat)

        # If the object isn't built,
        # we create the encoder and concat layers below
        all_specs = [
            self.features[name] for name in self._preprocessed_features_names
        ] + [
            self.crosses_by_name[name] for name in self._crossed_features_names
        ]
        for name, feature, spec in zip(all_names, all_features, all_specs):
            dtype = feature.dtype.name

            if spec.output_mode == "one_hot":
                preprocessor = self.preprocessors.get(
                    name
                ) or self.crossers.get(name)
                cardinality = None
                if not feature.dtype.name.startswith("int"):
                    raise ValueError(
                        f"Feature '{name}' has `output_mode='one_hot'`. "
                        "Thus its preprocessor should return an int64 dtype. "
                        f"Instead it returns a {dtype} dtype."
                    )

                if isinstance(
                    preprocessor, (layers.IntegerLookup, layers.StringLookup)
                ):
                    cardinality = preprocessor.vocabulary_size()
                elif isinstance(preprocessor, layers.CategoryEncoding):
                    cardinality = preprocessor.num_tokens
                elif isinstance(preprocessor, layers.Discretization):
                    cardinality = preprocessor.num_bins
                elif isinstance(
                    preprocessor, (layers.HashedCrossing, layers.Hashing)
                ):
                    cardinality = preprocessor.num_bins
                else:
                    raise ValueError(
                        f"Feature '{name}' has `output_mode='one_hot'`. "
                        "However it isn't a standard feature and the "
                        "dimensionality of its output space is not known, "
                        "thus it cannot be one-hot encoded. "
                        "Try using `output_mode='int'`."
                    )
                if cardinality is not None:
                    encoder = layers.CategoryEncoding(
                        num_tokens=cardinality, output_mode="multi_hot"
                    )
                    self.one_hot_encoders[name] = encoder
                    feature = encoder(feature)

            if self.output_mode == "concat":
                dtype = feature.dtype.name
                if dtype.startswith("int") or dtype == "string":
                    raise ValueError(
                        f"Cannot concatenate features because feature '{name}' "
                        f"has not been encoded (it has dtype {dtype}). "
                        "Consider using `output_mode='dict'`."
                    )
                features_to_concat.append(feature)
            else:
                output_dict[name] = feature

        if self.output_mode == "concat":
            self.concat = layers.Concatenate(axis=-1)
            return self.concat(features_to_concat)
        else:
            return output_dict

    def _check_if_adapted(self):
        if not self._is_adapted:
            if not self._list_adaptable_preprocessors():
                self._is_adapted = True
            else:
                raise ValueError(
                    "You need to call `.adapt(dataset)` on the FeatureSpace "
                    "before you can start using it."
                )

    def _check_if_built(self):
        if not self.built:
            self._check_if_adapted()
            # Finishes building
            self.get_encoded_features()
            self.built = True

    def __call__(self, data):
        self._check_if_built()
        if not isinstance(data, dict):
            raise ValueError(
                "A FeatureSpace can only be called with a dict. "
                f"Received: data={data} (of type {type(data)}"
            )

        data = {key: tf.convert_to_tensor(value) for key, value in data.items()}
        rebatched = False
        for name, x in data.items():
            if x.shape.rank == 0:
                data[name] = tf.reshape(x, [1, 1])
                rebatched = True
            elif x.shape.rank == 1:
                data[name] = tf.expand_dims(x, -1)

        preprocessed_data = self._preprocess_features(data)
        crossed_data = self._cross_features(preprocessed_data)
        merged_data = self._merge_features(preprocessed_data, crossed_data)
        if rebatched:
            if self.output_mode == "concat":
                assert merged_data.shape[0] == 1
                return tf.squeeze(merged_data, axis=0)
            else:
                for name, x in merged_data.items():
                    if x.shape.rank == 2 and x.shape[0] == 1:
                        merged_data[name] = tf.squeeze(x, axis=0)
        return merged_data

    def get_config(self):
        return {
            "features": serialization_lib.serialize_keras_object(self.features),
            "output_mode": self.output_mode,
            "crosses": serialization_lib.serialize_keras_object(self.crosses),
            "crossing_dim": self.crossing_dim,
            "hashing_dim": self.hashing_dim,
            "num_discretization_bins": self.num_discretization_bins,
        }

    @classmethod
    def from_config(cls, config):
        return cls(**config)

    def get_build_config(self):
        return {
            name: feature.preprocessor.get_build_config()
            for name, feature in self.features.items()
        }

    def build_from_config(self, config):
        for name in config.keys():
            self.features[name].preprocessor.build_from_config(config[name])
        self._is_adapted = True

    def save(self, filepath):
        """Save the `FeatureSpace` instance to a `.keras` file.

        You can reload it via `keras.models.load_model()`:

        ```python
        feature_space.save("myfeaturespace.keras")
        reloaded_feature_space = keras.models.load_model("myfeaturespace.keras")
        ```
        """
        saving_lib.save_model(self, filepath)

    def _save_own_variables(self, store):
        return

    def _load_own_variables(self, store):
        return