github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/benchmarks/chicago_taxi/run_chicago.sh (about)

     1  #!/usr/bin/env bash
     2  #
     3  #    Licensed to the Apache Software Foundation (ASF) under one or more
     4  #    contributor license agreements.  See the NOTICE file distributed with
     5  #    this work for additional information regarding copyright ownership.
     6  #    The ASF licenses this file to You under the Apache License, Version 2.0
     7  #    (the "License"); you may not use this file except in compliance with
     8  #    the License.  You may obtain a copy of the License at
     9  #
    10  #       http://www.apache.org/licenses/LICENSE-2.0
    11  #
    12  #    Unless required by applicable law or agreed to in writing, software
    13  #    distributed under the License is distributed on an "AS IS" BASIS,
    14  #    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    15  #    See the License for the specific language governing permissions and
    16  #    limitations under the License.
    17  #
    18  #
    19  #    The Chicago Taxi example demonstrates the end-to-end workflow and steps
    20  #    of how to analyze, validate and transform data, train a model, analyze
    21  #    and serve it.
    22  #
    23  #    Example usage:
    24  #    ./run_chicago.sh gs://my-gcs-bucket DataflowRunner \
    25  #                     --sdk_location=\"apache-beam.tar.gz\" --region=\"us-central1\"
    26  #
    # Abort on the first failing command; a pipeline fails if any stage fails.
    set -eo pipefail

    #######################################
    # Prints an error plus a short usage line to stderr and exits with status 1.
    # Arguments: $1 - the error message
    #######################################
    die_usage() {
      echo "$1" >&2
      echo "Usage: ${0##*/} <gcs-bucket> <runner> [pipeline-option ...]" >&2
      exit 1
    }

    # Both positional arguments are mandatory. Diagnostics go to stderr so they
    # never end up in captured stdout.
    [[ -n "${1:-}" ]] || die_usage "GCS bucket name required"
    [[ -n "${2:-}" ]] || die_usage "Runner required"
    38  
    #######################################
    # Captures the command-line arguments into the globals used by the rest of
    # the script.
    # Globals:   GCS_BUCKET, RUNNER, PIPELINE_OPTIONS, METRICS_TABLE_SUFFIX
    #            (all written)
    # Arguments: $1  - GCS bucket
    #            $2  - runner name
    #            $3+ - extra pipeline options (optional)
    #######################################
    parse_args() {
      # Join with a single space regardless of the caller's IFS.
      local IFS=' '

      GCS_BUCKET=${1:-}
      RUNNER=${2:-}

      # Everything after the runner is an extra pipeline option. The options are
      # kept as one space-separated string because the commands further down
      # expand ${PIPELINE_OPTIONS} unquoted and rely on word splitting.
      PIPELINE_OPTIONS="${*:3}"

      # Always assign the suffix so that a value inherited from the environment
      # cannot leak into the metrics table names of non-Portable runs.
      METRICS_TABLE_SUFFIX=''
      if [[ "${RUNNER}" == "PortableRunner" ]]; then
        METRICS_TABLE_SUFFIX='_flink'
      fi
    }

    parse_args "$@"
    54  
    # Every artifact of this run lives under a timestamped prefix of the bucket.
    JOB_ID="chicago-taxi-tfdv-$(date +%Y%m%d-%H%M%S)"
    JOB_OUTPUT_PATH="${GCS_BUCKET}/${JOB_ID}/chicago_taxi_output"
    TEMP_PATH="${GCS_BUCKET}/${JOB_ID}/tmp/"

    # Project of the active gcloud configuration; gcloud's stderr is discarded.
    GCP_PROJECT=$(gcloud config list --format 'value(core.project)' 2>/dev/null)
    if [[ -z "${GCP_PROJECT}" ]]; then
      # The value is passed as --project to every stage below, so point out the
      # likely cause of a later failure right away.
      echo "WARNING: gcloud reports no default project (core.project)" >&2
    fi

    # Row limit handed to the Beam jobs below.
    MAX_ROWS=100000

    TFT_OUTPUT_PATH="${JOB_OUTPUT_PATH}/tft_output"
    # Not referenced anywhere else in the visible script.
    EVAL_RESULT_DIR="${TFT_OUTPUT_PATH}/eval_result_dir"

    # Variables needed for subsequent stages.
    TFDV_OUTPUT_PATH="${JOB_OUTPUT_PATH}/tfdv_output"
    SCHEMA_PATH="${TFDV_OUTPUT_PATH}/schema.pbtxt"

    echo "Using GCP project: ${GCP_PROJECT}"
    echo "Job output path: ${JOB_OUTPUT_PATH}"
    echo "TFDV output path: ${TFDV_OUTPUT_PATH}"
    72  
    73  
    # Analyze and validate
    # Compute stats over the input table and infer a schema from those stats.
    # Stats are written to train_stats.tfrecord, the schema to ${SCHEMA_PATH}.
    echo "Starting distributed TFDV stats computation and schema generation..."

    # All expansions are quoted so that paths are passed as single arguments.
    # ${GCP_PROJECT:?} aborts with a clear message when no project is known.
    # PIPELINE_OPTIONS is deliberately left unquoted: it is a space-separated
    # list of options that must be split into individual arguments.
    # shellcheck disable=SC2086
    python tfdv_analyze_and_validate.py \
      --input bigquery-public-data.chicago_taxi_trips.taxi_trips \
      --infer_schema \
      --stats_path "${TFDV_OUTPUT_PATH}/train_stats.tfrecord" \
      --schema_path "${SCHEMA_PATH}" \
      --project "${GCP_PROJECT:?no GCP project configured}" \
      --region us-central1 \
      --temp_location "${TEMP_PATH}" \
      --job_name "${JOB_ID}" \
      --save_main_session \
      --runner "${RUNNER}" \
      --max_rows="${MAX_ROWS}" \
      --publish_to_big_query=true \
      --metrics_dataset='beam_performance' \
      --metrics_table="tfdv_analyze${METRICS_TABLE_SUFFIX:-}" \
      --metric_reporting_project "${GCP_PROJECT}" \
      --setup_file ./setup.py \
      ${PIPELINE_OPTIONS}
    96  
    # The eval job gets its own name, derived from the main job id.
    EVAL_JOB_ID="${JOB_ID}-eval"

    # Compute stats for eval data and validate stats against the schema.
    # Stats go to eval_stats.tfrecord, detected anomalies to anomalies.pbtxt.
    #
    # All expansions are quoted so that paths are passed as single arguments.
    # ${GCP_PROJECT:?} aborts with a clear message when no project is known.
    # PIPELINE_OPTIONS is deliberately left unquoted: it is a space-separated
    # list of options that must be split into individual arguments.
    # shellcheck disable=SC2086
    python tfdv_analyze_and_validate.py \
      --input bigquery-public-data.chicago_taxi_trips.taxi_trips \
      --for_eval \
      --schema_path "${SCHEMA_PATH}" \
      --validate_stats \
      --stats_path "${TFDV_OUTPUT_PATH}/eval_stats.tfrecord" \
      --anomalies_path "${TFDV_OUTPUT_PATH}/anomalies.pbtxt" \
      --project "${GCP_PROJECT:?no GCP project configured}" \
      --region us-central1 \
      --temp_location "${TEMP_PATH}" \
      --job_name "${EVAL_JOB_ID}" \
      --save_main_session \
      --runner "${RUNNER}" \
      --max_rows="${MAX_ROWS}" \
      --publish_to_big_query=true \
      --metrics_dataset='beam_performance' \
      --metrics_table="chicago_taxi_tfdv_validate${METRICS_TABLE_SUFFIX:-}" \
      --metric_reporting_project "${GCP_PROJECT}" \
      --setup_file ./setup.py \
      ${PIPELINE_OPTIONS}

    # End analyze and validate
   echo "Preprocessing train data..."

   # Transform the input table using the schema produced by the TFDV stage.
   # Output files are written to ${TFT_OUTPUT_PATH} with the prefix
   # train_transformed.
   #
   # All expansions are quoted so that paths are passed as single arguments.
   # ${GCP_PROJECT:?} aborts with a clear message when no project is known.
   # PIPELINE_OPTIONS is deliberately left unquoted: it is a space-separated
   # list of options that must be split into individual arguments.
   # shellcheck disable=SC2086
   python preprocess.py \
     --output_dir "${TFT_OUTPUT_PATH}" \
     --outfile_prefix train_transformed \
     --input bigquery-public-data.chicago_taxi_trips.taxi_trips \
     --schema_file "${SCHEMA_PATH}" \
     --project "${GCP_PROJECT:?no GCP project configured}" \
     --region us-central1 \
     --temp_location "${TEMP_PATH}" \
     --job_name "${JOB_ID}" \
     --runner "${RUNNER}" \
     --max_rows "${MAX_ROWS}" \
     --publish_to_big_query=true \
     --metrics_dataset='beam_performance' \
     --metrics_table="chicago_taxi_preprocess${METRICS_TABLE_SUFFIX:-}" \
     --metric_reporting_project "${GCP_PROJECT}" \
     --setup_file ./setup.py \
     ${PIPELINE_OPTIONS}
   141  
   # Train the model on ML Engine.
   TRAINER_JOB_ID="chicago_taxi_trainer_$(date +%Y%m%d_%H%M%S)"
   TRAIN_OUTPUT_PATH="${JOB_OUTPUT_PATH}/trainer_output"
   WORKING_DIR="${TRAIN_OUTPUT_PATH}/working_dir"
   MODEL_DIR="${TRAIN_OUTPUT_PATH}/model_dir"

   # Inputs
   # The trailing * is stored literally and, because the variable is quoted
   # where it is used, handed to the trainer unexpanded.
   # NOTE(review): presumably trainer.task expands the pattern itself - confirm.
   TRAIN_FILE="${TFT_OUTPUT_PATH}/train_transformed-*"
   TF_VERSION=1.14

   # Workaround for boto in virtualenv, required for the gsutil commands to work.
   export BOTO_CONFIG=/dev/null

   # Start clean, but don't fail if the path does not exist yet. -r is needed
   # because the path is a prefix ("directory"), not a single object.
   gsutil rm -r "${TRAIN_OUTPUT_PATH}" || true

   # Options
   TRAIN_STEPS=10000
   EVAL_STEPS=1000

   # Force a small eval so that the Estimator.train_and_eval() can be used to
   # save the model with its standard paths. The eval input is the same file
   # pattern as the training input.
   EVAL_FILE="${TFT_OUTPUT_PATH}/train_transformed-*"

   echo "Training the model"
   # Arguments after the bare "--" are forwarded to the trainer module.
   # NOTE(review): newer gcloud releases expose this command group as
   # "ai-platform" - confirm the installed SDK still accepts "ml-engine".
   gcloud ml-engine jobs submit training "${TRAINER_JOB_ID}" \
     --stream-logs \
     --job-dir "${MODEL_DIR}" \
     --runtime-version "${TF_VERSION}" \
     --module-name trainer.task \
     --package-path trainer/ \
     --region us-central1 \
     -- \
     --train-files "${TRAIN_FILE}" \
     --train-steps "${TRAIN_STEPS}" \
     --eval-files "${EVAL_FILE}" \
     --eval-steps "${EVAL_STEPS}" \
     --output-dir "${WORKING_DIR}" \
     --schema-file "${SCHEMA_PATH}" \
     --tf-transform-dir "${TFT_OUTPUT_PATH}"
   177  
   # We evaluate with the last eval model written (hence tail -n1)
   EVAL_MODEL_DIR="${WORKING_DIR}/eval_model_dir"
   LAST_EVAL_MODEL_DIR=$(gsutil ls "${EVAL_MODEL_DIR}" | tail -n1)

   echo "Eval model dir: ${EVAL_MODEL_DIR}"

   # An empty listing would otherwise be passed on as an empty
   # --eval_model_dir; stop here with an explicit message instead.
   if [[ -z "${LAST_EVAL_MODEL_DIR}" ]]; then
     echo "No eval model found under ${EVAL_MODEL_DIR}" >&2
     exit 1
   fi
   echo "Last eval model dir: ${LAST_EVAL_MODEL_DIR}"

   # Run the model analysis with the selected eval model.
   #
   # All expansions are quoted so that paths are passed as single arguments.
   # ${GCP_PROJECT:?} aborts with a clear message when no project is known.
   # PIPELINE_OPTIONS is deliberately left unquoted: it is a space-separated
   # list of options that must be split into individual arguments.
   # shellcheck disable=SC2086
   python process_tfma.py \
     --big_query_table bigquery-public-data.chicago_taxi_trips.taxi_trips \
     --schema_file "${SCHEMA_PATH}" \
     --eval_model_dir "${LAST_EVAL_MODEL_DIR}" \
     --project "${GCP_PROJECT:?no GCP project configured}" \
     --region us-central1 \
     --temp_location "${TEMP_PATH}" \
     --job_name "${JOB_ID}" \
     --save_main_session \
     --runner "${RUNNER}" \
     --max_eval_rows="${MAX_ROWS}" \
     --publish_to_big_query=true \
     --metrics_dataset='beam_performance' \
     --metrics_table="chicago_taxi_process_tfma${METRICS_TABLE_SUFFIX:-}" \
     --metric_reporting_project "${GCP_PROJECT}" \
     --setup_file ./setup.py \
     ${PIPELINE_OPTIONS}