github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Schema and transform definition for the Criteo dataset."""
    19  from __future__ import absolute_import
    20  from __future__ import division
    21  from __future__ import print_function
    22  
    23  import tensorflow as tf
    24  import tensorflow_transform as tft
    25  
    26  
    27  def _get_raw_categorical_column_name(column_idx):
    28    return 'categorical-feature-{}'.format(column_idx)
    29  
    30  
    31  def get_transformed_categorical_column_name(column_name_or_id):
    32    if isinstance(column_name_or_id, bytes):
    33      # assume the input is column name
    34      column_name = column_name_or_id
    35    else:
    36      # assume the input is column id
    37      column_name = _get_raw_categorical_column_name(column_name_or_id)
    38    return column_name + '_id'
    39  
    40  
    41  _INTEGER_COLUMN_NAMES = [
    42      'int-feature-{}'.format(column_idx) for column_idx in range(1, 14)
    43  ]
    44  _CATEGORICAL_COLUMN_NAMES = [
    45      _get_raw_categorical_column_name(column_idx)
    46      for column_idx in range(14, 40)
    47  ]
    48  DEFAULT_DELIMITER = '\t'
    49  # Number of buckets for integer columns.
    50  _NUM_BUCKETS = 10
    51  
    52  # Schema annotations aren't supported in this build.
    53  tft.common.IS_ANNOTATIONS_PB_AVAILABLE = False
    54  
    55  
    56  def make_ordered_column_names(include_label=True):
    57    """Returns the column names in the dataset in the order as they appear.
    58  
    59    Args:
    60      include_label: Indicates whether the label feature should be included.
    61    Returns:
    62      A list of column names in the dataset.
    63    """
    64    result = ['clicked'] if include_label else []
    65    for name in _INTEGER_COLUMN_NAMES:
    66      result.append(name)
    67    for name in _CATEGORICAL_COLUMN_NAMES:
    68      result.append(name)
    69    return result
    70  
    71  
    72  def make_legacy_input_feature_spec(include_label=True):
    73    """Input schema definition.
    74  
    75    Args:
    76      include_label: Indicates whether the label feature should be included.
    77    Returns:
    78      A `Schema` object.
    79    """
    80    result = {}
    81    if include_label:
    82      result['clicked'] = tf.io.FixedLenFeature(shape=[], dtype=tf.int64)
    83    for name in _INTEGER_COLUMN_NAMES:
    84      result[name] = tf.io.FixedLenFeature(
    85          shape=[], dtype=tf.int64, default_value=-1)
    86    for name in _CATEGORICAL_COLUMN_NAMES:
    87      result[name] = tf.io.FixedLenFeature(
    88          shape=[], dtype=tf.string, default_value='')
    89    return result
    90  
    91  
    92  def make_input_feature_spec(include_label=True):
    93    """Input schema definition.
    94  
    95    Args:
    96      include_label: Indicates whether the label feature should be included.
    97  
    98    Returns:
    99      A `Schema` object.
   100    """
   101    result = {}
   102    if include_label:
   103      result['clicked'] = tf.io.FixedLenFeature(shape=[], dtype=tf.int64)
   104    for name in _INTEGER_COLUMN_NAMES:
   105      result[name] = tf.io.VarLenFeature(dtype=tf.int64)
   106  
   107    for name in _CATEGORICAL_COLUMN_NAMES:
   108      result[name] = tf.io.VarLenFeature(dtype=tf.string)
   109  
   110    return result
   111  
   112  
   113  def make_preprocessing_fn(frequency_threshold):
   114    """Creates a preprocessing function for criteo.
   115  
   116    Args:
   117      frequency_threshold: The frequency_threshold used when generating
   118        vocabularies for the categorical features.
   119  
   120    Returns:
   121      A preprocessing function.
   122    """
   123    def preprocessing_fn(inputs):
   124      """User defined preprocessing function for criteo columns.
   125  
   126      Args:
   127        inputs: dictionary of input `tensorflow_transform.Column`.
   128      Returns:
   129        A dictionary of `tensorflow_transform.Column` representing the transformed
   130            columns.
   131      """
   132      result = {'clicked': inputs['clicked']}
   133      for name in _INTEGER_COLUMN_NAMES:
   134        feature = inputs[name]
   135        # TODO(https://github.com/apache/beam/issues/24902):
   136        #  Replace this boilerplate with a helper function.
   137        # This is a SparseTensor because it is optional. Here we fill in a
   138        # default value when it is missing.
   139        feature = tft.sparse_tensor_to_dense_with_shape(
   140            feature, [None, 1], default_value=-1)
   141        # Reshaping from a batch of vectors of size 1 to a batch of scalars and
   142        # adding a bucketized version.
   143        feature = tf.squeeze(feature, axis=1)
   144        result[name] = feature
   145        result[name + '_bucketized'] = tft.bucketize(feature, _NUM_BUCKETS)
   146      for name in _CATEGORICAL_COLUMN_NAMES:
   147        feature = inputs[name]
   148        # Similar to for integer columns, but use '' as default.
   149        feature = tft.sparse_tensor_to_dense_with_shape(
   150            feature, [None, 1], default_value='')
   151        feature = tf.squeeze(feature, axis=1)
   152        result[get_transformed_categorical_column_name(
   153            name)] = tft.compute_and_apply_vocabulary(
   154                feature, frequency_threshold=frequency_threshold)
   155  
   156      return result
   157  
   158    return preprocessing_fn