114 lines
4.6 KiB
Python
114 lines
4.6 KiB
Python
![]() |
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
|
||
|
#
|
||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
# you may not use this file except in compliance with the License.
|
||
|
# You may obtain a copy of the License at
|
||
|
#
|
||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
#
|
||
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
# See the License for the specific language governing permissions and
|
||
|
# limitations under the License.
|
||
|
# ==============================================================================
|
||
|
"""Common utilities for our Keras preprocessing integration tests."""
|
||
|
|
||
|
import os
|
||
|
|
||
|
import tensorflow.compat.v2 as tf
|
||
|
|
||
|
# Short alias for the Keras layers namespace; the preprocessing layers used in
# this module are referenced through it (e.g. `preprocessing.Normalization`).
preprocessing = tf.keras.layers
|
||
|
|
||
|
# Number of examples per batch when the dataset is batched.
BATCH_SIZE = 64
# Total number of examples produced by `make_dataset` (16 full batches).
DS_SIZE = BATCH_SIZE * 16
# Number of batches in one pass over the dataset. Floor division keeps this an
# `int` (16) rather than a `float` (16.0), since it is a count of steps.
STEPS = DS_SIZE // BATCH_SIZE
# Upper bound on the number of distinct integer IDs / string terms generated.
VOCAB_SIZE = 100
|
||
|
|
||
|
|
||
|
def make_dataset():
    """Build a small in-memory structured dataset with a random binary label.

    Each element is a `(features, label)` pair, where `features` is a dict
    holding three columns:
      - float_col: an unnormalized numeric column.
      - int_col: a column of integer IDs.
      - string_col: a column of fixed vocabulary terms.

    Note that this sets the global TensorFlow random seed before drawing any
    values.

    Returns:
      An unbatched `tf.data.Dataset` built from `DS_SIZE` rows.
    """
    tf.random.set_seed(197011)
    column_shape = (DS_SIZE, 1)

    float_values = tf.random.uniform(column_shape, maxval=10, dtype="float32")

    # Draw at most VOCAB_SIZE distinct integers, then scale them so they are
    # spread over a wide range -- a common use case for IntegerLookup.
    int_ids = (
        tf.random.uniform(column_shape, maxval=VOCAB_SIZE, dtype="int64") * 1000
    )

    # String terms are the decimal forms of 0..VOCAB_SIZE-1, i.e. a fixed
    # vocabulary that can also be loaded from a file.
    string_terms = tf.strings.as_string(
        tf.random.uniform(column_shape, maxval=VOCAB_SIZE, dtype="int64")
    )

    # Random binary label.
    binary_labels = tf.random.uniform(column_shape, maxval=2, dtype="int64")

    feature_columns = {
        "float_col": float_values,
        "int_col": int_ids,
        "string_col": string_terms,
    }
    return tf.data.Dataset.from_tensor_slices((feature_columns, binary_labels))
|
||
|
|
||
|
|
||
|
def make_preprocessing_model(file_dir):
    """Make a standalone preprocessing model.

    The float column is normalized with statistics adapted from the dataset,
    the integer column is looked up in a vocabulary adapted from the dataset,
    and the string column is looked up in a vocabulary loaded from a file that
    this function writes.

    Args:
      file_dir: Directory in which `vocab_file.txt` is written. It must
        already exist and be writable; it is not created here.

    Returns:
      A `tf.keras.Model` whose inputs are `(float_col, int_col, string_col)`
      and whose outputs are the corresponding preprocessed tensors, in the
      same order.
    """
    # The name of our keras.Input should match the column name in the dataset.
    float_in = tf.keras.Input(shape=(1,), dtype="float32", name="float_col")
    int_in = tf.keras.Input(shape=(1,), dtype="int64", name="int_col")
    string_in = tf.keras.Input(shape=(1,), dtype="string", name="string_col")

    # We need to batch a dataset before adapting.
    ds = make_dataset().batch(BATCH_SIZE)

    # Normalize floats by adapting the mean and variance of the input.
    normalization = preprocessing.Normalization()
    normalization.adapt(ds.map(lambda features, labels: features["float_col"]))
    float_out = normalization(float_in)

    # Lookup ints by adapting a vocab of integer IDs.
    int_lookup = preprocessing.IntegerLookup()
    int_lookup.adapt(ds.map(lambda features, labels: features["int_col"]))
    int_out = int_lookup(int_in)

    # Lookup strings from a fixed file based vocabulary: one term per line,
    # "0" through str(VOCAB_SIZE - 1).
    vocab_file = os.path.join(file_dir, "vocab_file.txt")
    # Write with an explicit encoding so the bytes on disk do not depend on the
    # platform's locale default.
    with open(vocab_file, "w", encoding="utf-8") as f:
        f.write("\n".join(str(i) for i in range(VOCAB_SIZE)))
    string_lookup = preprocessing.StringLookup(vocabulary=vocab_file)
    string_out = string_lookup(string_in)

    return tf.keras.Model(
        inputs=(float_in, int_in, string_in),
        outputs=(float_out, int_out, string_out),
    )
|
||
|
|
||
|
|
||
|
def make_training_model():
    """Make a trainable model for the preprocessed inputs.

    The integer and string columns (both integer indices after preprocessing)
    are each embedded and flattened, concatenated with the float column, and
    fed through a small MLP.

    Returns:
      A `tf.keras.Model` whose inputs are `(float_col, int_col, string_col)`
      and whose output is a single sigmoid unit per example, suitable for the
      binary label.
    """
    float_in = tf.keras.Input(shape=(1,), dtype="float32", name="float_col")
    # After preprocessing, both the string and int column are integer ready for
    # embedding.
    int_in = tf.keras.Input(shape=(1,), dtype="int64", name="int_col")
    string_in = tf.keras.Input(shape=(1,), dtype="int64", name="string_col")

    # Feed the lookup layers into an embedding. The table has VOCAB_SIZE + 1
    # rows -- presumably one extra for the lookup layers' out-of-vocabulary
    # index; confirm against the lookup layer configuration.
    int_embedding = tf.keras.layers.Embedding(VOCAB_SIZE + 1, 8, input_length=1)
    int_out = int_embedding(int_in)
    int_out = tf.keras.layers.Flatten()(int_out)
    string_embedding = tf.keras.layers.Embedding(
        VOCAB_SIZE + 1, 8, input_length=1
    )
    string_out = string_embedding(string_in)
    string_out = tf.keras.layers.Flatten()(string_out)

    # Concatenate outputs.
    concatenate = tf.keras.layers.Concatenate()
    # Feed our preprocessed inputs into a simple MLP.
    x = concatenate((float_in, int_out, string_out))
    x = tf.keras.layers.Dense(32, activation="relu")(x)
    x = tf.keras.layers.Dense(32, activation="relu")(x)
    # Use sigmoid for the single output unit: softmax normalizes across the
    # units of the layer, so with one unit it would always emit exactly 1.0
    # and the model could never learn the binary label.
    outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
    return tf.keras.Model(inputs=(float_in, int_in, string_in), outputs=outputs)
|