Intelegentny_Pszczelarz/.venv/Lib/site-packages/keras/layers/preprocessing/category_encoding.py

231 lines
9.0 KiB
Python
Raw Normal View History

2023-06-19 00:49:18 +02:00
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras CategoryEncoding preprocessing layer."""
import tensorflow.compat.v2 as tf
from keras import backend
from keras.engine import base_layer
from keras.engine import base_preprocessing_layer
from keras.layers.preprocessing import preprocessing_utils as utils
from keras.utils import layer_utils
# isort: off
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util.tf_export import keras_export
INT = utils.INT
ONE_HOT = utils.ONE_HOT
MULTI_HOT = utils.MULTI_HOT
COUNT = utils.COUNT
@keras_export(
"keras.layers.CategoryEncoding",
"keras.layers.experimental.preprocessing.CategoryEncoding",
)
class CategoryEncoding(base_layer.Layer):
"""A preprocessing layer which encodes integer features.
This layer provides options for condensing data into a categorical encoding
when the total number of tokens are known in advance. It accepts integer
values as inputs, and it outputs a dense or sparse representation of those
inputs. For integer inputs where the total number of tokens is not known,
use `tf.keras.layers.IntegerLookup` instead.
For an overview and full list of preprocessing layers, see the preprocessing
[guide](https://www.tensorflow.org/guide/keras/preprocessing_layers).
Examples:
**One-hot encoding data**
>>> layer = tf.keras.layers.CategoryEncoding(
... num_tokens=4, output_mode="one_hot")
>>> layer([3, 2, 0, 1])
<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[0., 0., 0., 1.],
[0., 0., 1., 0.],
[1., 0., 0., 0.],
[0., 1., 0., 0.]], dtype=float32)>
**Multi-hot encoding data**
>>> layer = tf.keras.layers.CategoryEncoding(
... num_tokens=4, output_mode="multi_hot")
>>> layer([[0, 1], [0, 0], [1, 2], [3, 1]])
<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[1., 1., 0., 0.],
[1., 0., 0., 0.],
[0., 1., 1., 0.],
[0., 1., 0., 1.]], dtype=float32)>
**Using weighted inputs in `"count"` mode**
>>> layer = tf.keras.layers.CategoryEncoding(
... num_tokens=4, output_mode="count")
>>> count_weights = np.array([[.1, .2], [.1, .1], [.2, .3], [.4, .2]])
>>> layer([[0, 1], [0, 0], [1, 2], [3, 1]], count_weights=count_weights)
<tf.Tensor: shape=(4, 4), dtype=float64, numpy=
array([[0.1, 0.2, 0. , 0. ],
[0.2, 0. , 0. , 0. ],
[0. , 0.2, 0.3, 0. ],
[0. , 0.2, 0. , 0.4]], dtype=float32)>
Args:
num_tokens: The total number of tokens the layer should support. All
inputs to the layer must integers in the range `0 <= value <
num_tokens`, or an error will be thrown.
output_mode: Specification for the output of the layer.
Defaults to `"multi_hot"`. Values can be `"one_hot"`, `"multi_hot"` or
`"count"`, configuring the layer as follows:
- `"one_hot"`: Encodes each individual element in the input into an
array of `num_tokens` size, containing a 1 at the element index. If
the last dimension is size 1, will encode on that dimension. If the
last dimension is not size 1, will append a new dimension for the
encoded output.
- `"multi_hot"`: Encodes each sample in the input into a single array
of `num_tokens` size, containing a 1 for each vocabulary term
present in the sample. Treats the last dimension as the sample
dimension, if input shape is `(..., sample_length)`, output shape
will be `(..., num_tokens)`.
- `"count"`: Like `"multi_hot"`, but the int array contains a count of
the number of times the token at that index appeared in the sample.
For all output modes, currently only output up to rank 2 is supported.
sparse: Boolean. If true, returns a `SparseTensor` instead of a dense
`Tensor`. Defaults to `False`.
Call arguments:
inputs: A 1D or 2D tensor of integer inputs.
count_weights: A tensor in the same shape as `inputs` indicating the
weight for each sample value when summing up in `count` mode. Not used
in `"multi_hot"` or `"one_hot"` modes.
"""
def __init__(
self, num_tokens=None, output_mode="multi_hot", sparse=False, **kwargs
):
# max_tokens is an old name for the num_tokens arg we continue to
# support because of usage.
if "max_tokens" in kwargs:
logging.warning(
"max_tokens is deprecated, please use num_tokens instead."
)
num_tokens = kwargs["max_tokens"]
del kwargs["max_tokens"]
# By default, output floats. This is already default for TF2, but in TF1
# dtype is inferred from inputs, and would default to int.
if "dtype" not in kwargs:
kwargs["dtype"] = backend.floatx()
super().__init__(**kwargs)
base_preprocessing_layer.keras_kpl_gauge.get_cell(
"CategoryEncoding"
).set(True)
# Support deprecated names for output_modes.
if output_mode == "binary":
output_mode = MULTI_HOT
# 'output_mode' must be one of (COUNT, ONE_HOT, MULTI_HOT)
layer_utils.validate_string_arg(
output_mode,
allowable_strings=(COUNT, ONE_HOT, MULTI_HOT),
layer_name="CategoryEncoding",
arg_name="output_mode",
)
if num_tokens is None:
raise ValueError(
"num_tokens must be set to use this layer. If the "
"number of tokens is not known beforehand, use the "
"IntegerLookup layer instead."
)
if num_tokens < 1:
raise ValueError(
f"`num_tokens` must be >= 1. Received: num_tokens={num_tokens}."
)
self.num_tokens = num_tokens
self.output_mode = output_mode
self.sparse = sparse
def compute_output_shape(self, input_shape):
input_shape = list(input_shape)
if not input_shape:
return tf.TensorShape([self.num_tokens])
if self.output_mode == ONE_HOT and input_shape[-1] != 1:
return tf.TensorShape(input_shape + [self.num_tokens])
else:
return tf.TensorShape(input_shape[:-1] + [self.num_tokens])
def compute_output_signature(self, input_spec):
output_shape = self.compute_output_shape(input_spec.shape.as_list())
if self.sparse:
return tf.SparseTensorSpec(shape=output_shape, dtype=tf.int64)
else:
return tf.TensorSpec(shape=output_shape, dtype=tf.int64)
def get_config(self):
config = {
"num_tokens": self.num_tokens,
"output_mode": self.output_mode,
"sparse": self.sparse,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs, count_weights=None):
inputs = utils.ensure_tensor(inputs)
if count_weights is not None:
if self.output_mode != COUNT:
raise ValueError(
"`count_weights` is not used when `output_mode` is not "
"`'count'`. Received `count_weights={count_weights}`."
)
count_weights = utils.ensure_tensor(
count_weights, self.compute_dtype
)
depth = self.num_tokens
if isinstance(inputs, tf.SparseTensor):
max_value = tf.reduce_max(inputs.values)
min_value = tf.reduce_min(inputs.values)
else:
max_value = tf.reduce_max(inputs)
min_value = tf.reduce_min(inputs)
condition = tf.logical_and(
tf.greater(tf.cast(depth, max_value.dtype), max_value),
tf.greater_equal(min_value, tf.cast(0, min_value.dtype)),
)
assertion = tf.Assert(
condition,
[
"Input values must be in the range 0 <= values < num_tokens"
" with num_tokens={}".format(depth)
],
)
with tf.control_dependencies([assertion]):
return utils.encode_categorical_inputs(
inputs,
output_mode=self.output_mode,
depth=depth,
dtype=self.compute_dtype,
sparse=self.sparse,
count_weights=count_weights,
)