#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# The Chicago Taxi example demonstrates the end-to-end workflow and steps
# of how to analyze, validate and transform data, train a model, analyze
# and serve it.
#
# Example usage:
#   ./run_chicago.sh gs://my-gcs-bucket DataflowRunner \
#                    --sdk_location=\"apache-beam.tar.gz\" --region=\"us-central1\"
#
# Positional arguments:
#   $1    GCS bucket (e.g. gs://my-gcs-bucket); prefix of every output path.
#   $2    Runner name passed to each pipeline via --runner.
#   $3..  Optional extra pipeline options, appended to every pipeline command.
#
set -eo pipefail

# Usage errors are diagnostics, so they go to stderr.
if [[ -z "${1:-}" ]]; then
  echo "GCS bucket name required" >&2
  exit 1
fi

if [[ -z "${2:-}" ]]; then
  echo "Runner required" >&2
  exit 1
fi

GCS_BUCKET=$1
RUNNER=$2

# Everything after the runner is an extra pipeline option. The options are
# kept as a single space-joined string because the pipeline commands expand
# ${PIPELINE_OPTIONS} unquoted and rely on word splitting.
shift 2
PIPELINE_OPTIONS="$*"

# Always define the suffix (keeping any value inherited from the environment)
# so later expansions never hit an unset variable. PortableRunner runs publish
# to tables carrying the '_flink' suffix.
METRICS_TABLE_SUFFIX=${METRICS_TABLE_SUFFIX:-}
if [[ "${RUNNER}" == "PortableRunner" ]]; then
  METRICS_TABLE_SUFFIX='_flink'
fi

JOB_ID="chicago-taxi-tfdv-$(date +%Y%m%d-%H%M%S)"
JOB_OUTPUT_PATH="${GCS_BUCKET}/${JOB_ID}/chicago_taxi_output"
TEMP_PATH="${GCS_BUCKET}/${JOB_ID}/tmp/"
MAX_ROWS=100000
TFT_OUTPUT_PATH="${JOB_OUTPUT_PATH}/tft_output"
# NOTE(review): nothing in this script reads EVAL_RESULT_DIR; it is kept in
# case external tooling depends on it.
# shellcheck disable=SC2034
EVAL_RESULT_DIR="${TFT_OUTPUT_PATH}/eval_result_dir"

# The project comes from the active gcloud configuration. gcloud's stderr is
# discarded, so report failures explicitly instead of letting `set -e` end the
# script without any message.
if ! GCP_PROJECT=$(gcloud config list --format 'value(core.project)' 2>/dev/null); then
  echo "Unable to read the active project from gcloud" >&2
  exit 1
fi
# An empty project would make every later `--project` flag consume the option
# that follows it, so stop here with a clear message.
if [[ -z "${GCP_PROJECT}" ]]; then
  echo "No GCP project configured (gcloud core.project is empty)" >&2
  exit 1
fi

# Variables needed for subsequent stages.
TFDV_OUTPUT_PATH="${JOB_OUTPUT_PATH}/tfdv_output"
SCHEMA_PATH="${TFDV_OUTPUT_PATH}/schema.pbtxt"

echo "Using GCP project: ${GCP_PROJECT}"
echo "Job output path: ${JOB_OUTPUT_PATH}"
echo "TFDV output path: ${TFDV_OUTPUT_PATH}"


# Analyze and validate
# Compute stats and generate a schema based on the stats.
echo "Starting distributed TFDV stats computation and schema generation..."

# First TFDV job: writes train_stats.tfrecord and, via --infer_schema, the
# schema file at ${SCHEMA_PATH}.
#
# Every expansion is quoted so that an empty or space-containing value can
# never shift the flags around it. The one exception is PIPELINE_OPTIONS: it
# holds the caller's extra flags as a single space-separated string and must
# be word-split to become separate arguments. It stays last, after all
# script-supplied flags.
# shellcheck disable=SC2086
python tfdv_analyze_and_validate.py \
  --input bigquery-public-data.chicago_taxi_trips.taxi_trips \
  --infer_schema \
  --stats_path "${TFDV_OUTPUT_PATH}/train_stats.tfrecord" \
  --schema_path "${SCHEMA_PATH}" \
  --project "${GCP_PROJECT}" \
  --region us-central1 \
  --temp_location "${TEMP_PATH}" \
  --job_name "${JOB_ID}" \
  --save_main_session \
  --runner "${RUNNER}" \
  --max_rows="${MAX_ROWS}" \
  --publish_to_big_query=true \
  --metrics_dataset='beam_performance' \
  --metrics_table="tfdv_analyze${METRICS_TABLE_SUFFIX:-}" \
  --metric_reporting_project "${GCP_PROJECT}" \
  --setup_file ./setup.py \
  ${PIPELINE_OPTIONS:-}

# The eval job gets its own name, derived from the first job's name.
EVAL_JOB_ID="${JOB_ID}-eval"

# Compute stats for eval data and validate stats against the schema.
# Writes eval_stats.tfrecord and anomalies.pbtxt next to the training stats.
# shellcheck disable=SC2086
python tfdv_analyze_and_validate.py \
  --input bigquery-public-data.chicago_taxi_trips.taxi_trips \
  --for_eval \
  --schema_path "${SCHEMA_PATH}" \
  --validate_stats \
  --stats_path "${TFDV_OUTPUT_PATH}/eval_stats.tfrecord" \
  --anomalies_path "${TFDV_OUTPUT_PATH}/anomalies.pbtxt" \
  --project "${GCP_PROJECT}" \
  --region us-central1 \
  --temp_location "${TEMP_PATH}" \
  --job_name "${EVAL_JOB_ID}" \
  --save_main_session \
  --runner "${RUNNER}" \
  --max_rows="${MAX_ROWS}" \
  --publish_to_big_query=true \
  --metrics_dataset='beam_performance' \
  --metrics_table="chicago_taxi_tfdv_validate${METRICS_TABLE_SUFFIX:-}" \
  --metric_reporting_project "${GCP_PROJECT}" \
  --setup_file ./setup.py \
  ${PIPELINE_OPTIONS:-}

# End analyze and validate
echo "Preprocessing train data..."

# Preprocess job: reads the schema at ${SCHEMA_PATH} and writes files prefixed
# "train_transformed" under ${TFT_OUTPUT_PATH}.
#
# All expansions are quoted except PIPELINE_OPTIONS, which carries the
# caller's extra flags as one space-separated string and must be word-split.
# shellcheck disable=SC2086
python preprocess.py \
  --output_dir "${TFT_OUTPUT_PATH}" \
  --outfile_prefix train_transformed \
  --input bigquery-public-data.chicago_taxi_trips.taxi_trips \
  --schema_file "${SCHEMA_PATH}" \
  --project "${GCP_PROJECT}" \
  --region us-central1 \
  --temp_location "${TEMP_PATH}" \
  --job_name "${JOB_ID}" \
  --runner "${RUNNER}" \
  --max_rows "${MAX_ROWS}" \
  --publish_to_big_query=true \
  --metrics_dataset='beam_performance' \
  --metrics_table="chicago_taxi_preprocess${METRICS_TABLE_SUFFIX:-}" \
  --metric_reporting_project "${GCP_PROJECT}" \
  --setup_file ./setup.py \
  ${PIPELINE_OPTIONS:-}

# Train ML engine
TRAINER_JOB_ID="chicago_taxi_trainer_$(date +%Y%m%d_%H%M%S)"
# ${...:?} aborts if the job output path is somehow empty, so the recursive
# delete below can never be pointed at a bare "/trainer_output".
TRAIN_OUTPUT_PATH="${JOB_OUTPUT_PATH:?}/trainer_output"
WORKING_DIR="${TRAIN_OUTPUT_PATH}/working_dir"

MODEL_DIR="${TRAIN_OUTPUT_PATH}/model_dir"
# Inputs
# Quoted so the '*' is stored as a literal pattern and never expanded by the
# local shell.
TRAIN_FILE="${TFT_OUTPUT_PATH}/train_transformed-*"
TF_VERSION=1.14
# Workaround for boto in virtualenv, required for the gsutil commands to work:
export BOTO_CONFIG=/dev/null
# Start clean, but don't fail if the path does not exist yet.
# -r is needed because the path is a directory-style prefix: without it
# gsutil only matches an object with exactly this name and removes nothing
# underneath it.
gsutil rm -r "${TRAIN_OUTPUT_PATH}" || true
# Options
TRAIN_STEPS=10000
EVAL_STEPS=1000
# Force a small eval so that the Estimator.train_and_eval() can be used to
# save the model with its standard paths.
# Evaluation reads the same "train_transformed-*" pattern that training uses.
# Quoted so the '*' stays a literal pattern.
EVAL_FILE="${TFT_OUTPUT_PATH}/train_transformed-*"
echo "Training the model"
# Arguments after the bare "--" are passed to the trainer module, not to
# gcloud. The file patterns are quoted so they arrive with their wildcard
# intact instead of being subject to local pathname expansion.
gcloud ml-engine jobs submit training "${TRAINER_JOB_ID}" \
  --stream-logs \
  --job-dir "${MODEL_DIR}" \
  --runtime-version "${TF_VERSION}" \
  --module-name trainer.task \
  --package-path trainer/ \
  --region us-central1 \
  -- \
  --train-files "${TRAIN_FILE}" \
  --train-steps "${TRAIN_STEPS}" \
  --eval-files "${EVAL_FILE}" \
  --eval-steps "${EVAL_STEPS}" \
  --output-dir "${WORKING_DIR}" \
  --schema-file "${SCHEMA_PATH}" \
  --tf-transform-dir "${TFT_OUTPUT_PATH}"

# We evaluate with the last eval model written (hence tail -n1)
EVAL_MODEL_DIR="${TRAIN_OUTPUT_PATH}/working_dir/eval_model_dir"
LAST_EVAL_MODEL_DIR=$(gsutil ls "${EVAL_MODEL_DIR}" | tail -n1)
# An empty listing would leave --eval_model_dir without a usable value, so
# fail with a clear message instead of launching the job.
if [[ -z "${LAST_EVAL_MODEL_DIR}" ]]; then
  echo "No eval model found under ${EVAL_MODEL_DIR}" >&2
  exit 1
fi

echo "Eval model dir: ${EVAL_MODEL_DIR}"

# TFMA job over the selected eval model. PIPELINE_OPTIONS is the only
# expansion left unquoted: it holds the caller's extra flags as one
# space-separated string and must be word-split.
# shellcheck disable=SC2086
python process_tfma.py \
  --big_query_table bigquery-public-data.chicago_taxi_trips.taxi_trips \
  --schema_file "${SCHEMA_PATH}" \
  --eval_model_dir "${LAST_EVAL_MODEL_DIR}" \
  --project "${GCP_PROJECT}" \
  --region us-central1 \
  --temp_location "${GCS_BUCKET}/${JOB_ID}/tmp/" \
  --job_name "${JOB_ID}" \
  --save_main_session \
  --runner "${RUNNER}" \
  --max_eval_rows="${MAX_ROWS}" \
  --publish_to_big_query=true \
  --metrics_dataset='beam_performance' \
  --metrics_table="chicago_taxi_process_tfma${METRICS_TABLE_SUFFIX:-}" \
  --metric_reporting_project "${GCP_PROJECT}" \
  --setup_file ./setup.py \
  ${PIPELINE_OPTIONS:-}