# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Common utilities for our Keras preprocessing integration tests."""

import os

import tensorflow.compat.v2 as tf

preprocessing = tf.keras.layers

BATCH_SIZE = 64
DS_SIZE = BATCH_SIZE * 16
# Integer division: DS_SIZE is an exact multiple of BATCH_SIZE, and a step
# count must be an int (plain `/` would yield a float).
STEPS = DS_SIZE // BATCH_SIZE
VOCAB_SIZE = 100


def make_dataset():
  """Make a simple structured dataset.

  The dataset contains three feature columns.
    - float_col: an unnormalized numeric column.
    - int_col: a column of integer IDs.
    - string_col: a column of fixed vocabulary terms.

  Returns:
    The dataset.
  """
  # Fixed seed so every test run sees the same synthetic data.
  tf.random.set_seed(197011)

  floats = tf.random.uniform((DS_SIZE, 1), maxval=10, dtype="float32")
  # Generate 100 unique integer values, but over a wide range to showcase a
  # common use case for IntegerLookup.
  ints = tf.random.uniform((DS_SIZE, 1), maxval=VOCAB_SIZE, dtype="int64")
  ints = ints * 1000
  # Use a fixed vocabulary of strings from 0 to 99, to showcase loading a
  # vocabulary from a file.
  strings = tf.random.uniform((DS_SIZE, 1), maxval=VOCAB_SIZE, dtype="int64")
  strings = tf.strings.as_string(strings)
  features = {"float_col": floats, "int_col": ints, "string_col": strings}
  # Random binary label.
  labels = tf.random.uniform((DS_SIZE, 1), maxval=2, dtype="int64")
  ds = tf.data.Dataset.from_tensor_slices((features, labels))
  return ds


def make_preprocessing_model(file_dir):
  """Make a standalone preprocessing model.

  Args:
    file_dir: Directory in which to write the string vocabulary file.

  Returns:
    A `tf.keras.Model` mapping the raw feature columns to preprocessed
    (normalized / integerized) outputs.
  """
  # The name of our keras.Input should match the column name in the dataset.
  float_in = tf.keras.Input(shape=(1,), dtype="float32", name="float_col")
  int_in = tf.keras.Input(shape=(1,), dtype="int64", name="int_col")
  string_in = tf.keras.Input(shape=(1,), dtype="string", name="string_col")

  # We need to batch a dataset before adapting.
  ds = make_dataset().batch(BATCH_SIZE)

  # Normalize floats by adapting the mean and variance of the input.
  normalization = preprocessing.Normalization()
  normalization.adapt(ds.map(lambda features, labels: features["float_col"]))
  float_out = normalization(float_in)

  # Lookup ints by adapting a vocab of integer IDs.
  int_lookup = preprocessing.IntegerLookup()
  int_lookup.adapt(ds.map(lambda features, labels: features["int_col"]))
  int_out = int_lookup(int_in)

  # Lookup strings from a fixed file based vocabulary.
  string_vocab = list(str(i) for i in range(VOCAB_SIZE))
  vocab_file = os.path.join(file_dir, "vocab_file.txt")
  with open(vocab_file, "w") as f:
    f.write("\n".join(string_vocab))
  string_lookup = preprocessing.StringLookup(vocabulary=vocab_file)
  string_out = string_lookup(string_in)

  return tf.keras.Model(
      inputs=(float_in, int_in, string_in),
      outputs=(float_out, int_out, string_out),
  )


def make_training_model():
  """Make a trainable model for the preprocessed inputs.

  Returns:
    A `tf.keras.Model` from the already-preprocessed features to a single
    binary-classification probability.
  """
  float_in = tf.keras.Input(shape=(1,), dtype="float32", name="float_col")
  # After preprocessing, both the string and int column are integer ready for
  # embedding.
  int_in = tf.keras.Input(shape=(1,), dtype="int64", name="int_col")
  string_in = tf.keras.Input(shape=(1,), dtype="int64", name="string_col")

  # Feed the lookup layers into an embedding. The +1 leaves room for the
  # lookup layers' out-of-vocabulary index.
  int_embedding = tf.keras.layers.Embedding(VOCAB_SIZE + 1, 8, input_length=1)
  int_out = int_embedding(int_in)
  int_out = tf.keras.layers.Flatten()(int_out)
  string_embedding = tf.keras.layers.Embedding(
      VOCAB_SIZE + 1, 8, input_length=1
  )
  string_out = string_embedding(string_in)
  string_out = tf.keras.layers.Flatten()(string_out)

  # Concatenate outputs.
  concatenate = tf.keras.layers.Concatenate()
  # Feed our preprocessed inputs into a simple MLP.
  x = concatenate((float_in, int_out, string_out))
  x = tf.keras.layers.Dense(32, activation="relu")(x)
  x = tf.keras.layers.Dense(32, activation="relu")(x)
  # BUG FIX: softmax over a single unit is constant 1.0 and untrainable;
  # sigmoid is the correct activation for a single-unit binary output.
  outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
  return tf.keras.Model(inputs=(float_in, int_in, string_in), outputs=outputs)