# Source: github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Schema and transform definition for the Criteo dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import tensorflow_transform as tft


def _get_raw_categorical_column_name(column_idx):
  """Returns the raw column name for the categorical column `column_idx`."""
  return 'categorical-feature-{}'.format(column_idx)


def get_transformed_categorical_column_name(column_name_or_id):
  """Returns the name of the transformed (vocabulary id) categorical column.

  Args:
    column_name_or_id: Either the raw column name (`str` or `bytes`) or the
      column id, which is formatted into a raw column name first.

  Returns:
    The raw column name with an `_id` suffix, as a `str`.
  """
  if isinstance(column_name_or_id, bytes):
    # A bytes name cannot be concatenated with the str suffix below, so
    # decode it first.
    column_name = column_name_or_id.decode('utf-8')
  elif isinstance(column_name_or_id, str):
    # The input is already a column name. Checking only for `bytes` here
    # would send every Python 3 string down the "column id" branch and
    # prepend the 'categorical-feature-' prefix a second time.
    column_name = column_name_or_id
  else:
    # Anything else is assumed to be a column id.
    column_name = _get_raw_categorical_column_name(column_name_or_id)
  return column_name + '_id'


# Raw names of the integer columns: int-feature-1 .. int-feature-13.
_INTEGER_COLUMN_NAMES = [
    'int-feature-{}'.format(column_idx) for column_idx in range(1, 14)
]
# Raw names of the categorical columns: categorical-feature-14 ..
# categorical-feature-39.
_CATEGORICAL_COLUMN_NAMES = [
    _get_raw_categorical_column_name(column_idx)
    for column_idx in range(14, 40)
]
DEFAULT_DELIMITER = '\t'
# Number of buckets for integer columns.
_NUM_BUCKETS = 10

# Schema annotations aren't supported in this build.
tft.common.IS_ANNOTATIONS_PB_AVAILABLE = False


def make_ordered_column_names(include_label=True):
  """Returns the column names in the dataset in the order as they appear.

  Args:
    include_label: Indicates whether the label feature should be included.

  Returns:
    A new list of column names: the optional 'clicked' label first, then the
    integer columns, then the categorical columns.
  """
  label_columns = ['clicked'] if include_label else []
  # Concatenation builds a fresh list, so callers may mutate the result
  # without touching the module-level name lists.
  return label_columns + _INTEGER_COLUMN_NAMES + _CATEGORICAL_COLUMN_NAMES


def make_legacy_input_feature_spec(include_label=True):
  """Input schema definition using fixed-length features with defaults.

  Args:
    include_label: Indicates whether the label feature should be included.

  Returns:
    A dict mapping each column name to a `tf.io.FixedLenFeature`. Integer
    columns default to -1 and categorical columns default to ''.
  """
  result = {}
  if include_label:
    result['clicked'] = tf.io.FixedLenFeature(shape=[], dtype=tf.int64)
  for name in _INTEGER_COLUMN_NAMES:
    result[name] = tf.io.FixedLenFeature(
        shape=[], dtype=tf.int64, default_value=-1)
  for name in _CATEGORICAL_COLUMN_NAMES:
    result[name] = tf.io.FixedLenFeature(
        shape=[], dtype=tf.string, default_value='')
  return result


def make_input_feature_spec(include_label=True):
  """Input schema definition using variable-length features.

  Args:
    include_label: Indicates whether the label feature should be included.

  Returns:
    A dict mapping each column name to a feature spec: the 'clicked' label is
    a `tf.io.FixedLenFeature`; integer and categorical columns are
    `tf.io.VarLenFeature`s.
  """
  result = {}
  if include_label:
    result['clicked'] = tf.io.FixedLenFeature(shape=[], dtype=tf.int64)
  for name in _INTEGER_COLUMN_NAMES:
    result[name] = tf.io.VarLenFeature(dtype=tf.int64)

  for name in _CATEGORICAL_COLUMN_NAMES:
    result[name] = tf.io.VarLenFeature(dtype=tf.string)

  return result


def _fill_missing_and_squeeze(feature, default_value):
  """Converts an optional sparse feature into a dense batch of scalars.

  Args:
    feature: The input feature. It is a SparseTensor because it is optional.
    default_value: The value filled in where the feature is missing.

  Returns:
    The densified feature with its size-1 second axis squeezed away.
  """
  # TODO(https://github.com/apache/beam/issues/24902):
  # Replace this boilerplate with a helper function.
  dense = tft.sparse_tensor_to_dense_with_shape(
      feature, [None, 1], default_value=default_value)
  # Reshaping from a batch of vectors of size 1 to a batch of scalars.
  return tf.squeeze(dense, axis=1)


def make_preprocessing_fn(frequency_threshold):
  """Creates a preprocessing function for criteo.

  Args:
    frequency_threshold: The frequency_threshold used when generating
      vocabularies for the categorical features.

  Returns:
    A preprocessing function.
  """
  def preprocessing_fn(inputs):
    """User defined preprocessing function for criteo columns.

    Args:
      inputs: dictionary of input `tensorflow_transform.Column`.

    Returns:
      A dictionary of `tensorflow_transform.Column` representing the
      transformed columns: the 'clicked' label unchanged, each integer column
      plus a '<name>_bucketized' version, and each categorical column mapped
      through a vocabulary under its transformed column name.
    """
    result = {'clicked': inputs['clicked']}
    for name in _INTEGER_COLUMN_NAMES:
      # Missing integer values become -1; a bucketized version is added
      # alongside the dense column.
      feature = _fill_missing_and_squeeze(inputs[name], default_value=-1)
      result[name] = feature
      result[name + '_bucketized'] = tft.bucketize(feature, _NUM_BUCKETS)
    for name in _CATEGORICAL_COLUMN_NAMES:
      # Similar to for integer columns, but use '' as default.
      feature = _fill_missing_and_squeeze(inputs[name], default_value='')
      transformed_name = get_transformed_categorical_column_name(name)
      result[transformed_name] = tft.compute_and_apply_vocabulary(
          feature, frequency_threshold=frequency_threshold)

    return result

  return preprocessing_fn