github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/ml-orchestration/kfp/pipeline.py

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

import kfp
from kfp import components as comp
from kfp.v2 import dsl
from kfp.v2.compiler import Compiler


def parse_args():
  """Parse command-line arguments."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      "--gcp-project-id",
      type=str,
      help="ID of the Google Cloud project to deploy the pipeline to.",
      required=True)
  parser.add_argument(
      "--region",
      type=str,
      help="Region in which to deploy the pipeline.",
      required=True)
  parser.add_argument(
      "--pipeline-root",
      type=str,
      help="Path to the artifact repository where Kubeflow Pipelines stores "
      "a pipeline's artifacts.",
      required=True)
  parser.add_argument(
      "--component-artifact-root",
      type=str,
      help="Path to the artifact repository where Kubeflow Pipelines "
      "components can store artifacts.",
      required=True)
  parser.add_argument(
      "--dataflow-staging-root",
      type=str,
      help="Path to the staging directory for Dataflow.",
      required=True)
  parser.add_argument(
      "--beam-runner",
      type=str,
      help="Beam runner: DataflowRunner or DirectRunner.",
      default="DirectRunner")
  return parser.parse_args()


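# Example invocation of this script (illustrative only; the project ID and
# GCS paths below are placeholders, not values from this repository):
#
#   python pipeline.py \
#     --gcp-project-id=my-project \
#     --region=us-central1 \
#     --pipeline-root=gs://my-bucket/pipeline-root \
#     --component-artifact-root=gs://my-bucket/artifacts \
#     --dataflow-staging-root=gs://my-bucket/staging \
#     --beam-runner=DataflowRunner
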
# Arguments are parsed into a global variable so that they can be used in
# the @dsl.pipeline decorator below, which runs at module import time.
ARGS = parse_args()
PIPELINE_ROOT = ARGS.pipeline_root

# [START load_kfp_components]
# Load the KFP components from their component.yaml specifications.
DataIngestOp = comp.load_component('components/ingestion/component.yaml')
DataPreprocessingOp = comp.load_component(
    'components/preprocessing/component.yaml')
TrainModelOp = comp.load_component('components/train/component.yaml')
# [END load_kfp_components]


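# For reference, each component.yaml follows the KFP component specification:
# a name, typed inputs and outputs, and a container implementation. A minimal
# sketch of the ingestion component (the image, command, and flag names here
# are illustrative, not the actual file contents):
#
#   name: ingestion
#   inputs:
#   - {name: base_artifact_path, type: String}
#   outputs:
#   - {name: ingested_dataset_path, type: String}
#   implementation:
#     container:
#       image: <ingestion container image>
#       command: [python3, ingest.py]
#       args: [
#         --base-artifact-path, {inputValue: base_artifact_path},
#         --ingested-dataset-path, {outputPath: ingested_dataset_path},
#       ]
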
# [START define_kfp_pipeline]
@dsl.pipeline(
    pipeline_root=PIPELINE_ROOT,
    name="beam-preprocessing-kfp-example",
    description="Pipeline to show an Apache Beam preprocessing example in KFP")
def pipeline(
    gcp_project_id: str,
    region: str,
    component_artifact_root: str,
    dataflow_staging_root: str,
    beam_runner: str):
  """KFP pipeline definition.

  Args:
      gcp_project_id (str): ID of the Google Cloud project to deploy the
        pipeline to.
      region (str): Region in which to deploy the pipeline.
      component_artifact_root (str): Path to the artifact repository where
        Kubeflow Pipelines components can store artifacts.
      dataflow_staging_root (str): Path to the staging directory for the
        Dataflow runner.
      beam_runner (str): Beam runner, either DataflowRunner or DirectRunner.
  """

  ingest_data_task = DataIngestOp(base_artifact_path=component_artifact_root)

  data_preprocessing_task = DataPreprocessingOp(
      ingested_dataset_path=ingest_data_task.outputs["ingested_dataset_path"],
      base_artifact_path=component_artifact_root,
      gcp_project_id=gcp_project_id,
      region=region,
      dataflow_staging_root=dataflow_staging_root,
      beam_runner=beam_runner)

  train_model_task = TrainModelOp(  # pylint: disable=unused-variable
      preprocessed_dataset_path=data_preprocessing_task.outputs[
          "preprocessed_dataset_path"],
      base_artifact_path=component_artifact_root)


# [END define_kfp_pipeline]
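
# Note: KFP infers the task graph from the data dependencies above; passing
# ingest_data_task.outputs[...] into DataPreprocessingOp makes preprocessing
# run after ingestion, and likewise for training. No explicit .after() calls
# are needed.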

if __name__ == "__main__":
  # [START compile_kfp_pipeline]
  Compiler().compile(pipeline_func=pipeline, package_path="pipeline.json")
  # [END compile_kfp_pipeline]
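
  # The compiler writes the pipeline as a KFP v2 pipeline spec to
  # pipeline.json; that file can also be uploaded through the KFP UI instead
  # of being submitted programmatically as done below.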

  run_arguments = vars(ARGS)
  # pipeline_root was already baked into the pipeline at compile time via the
  # @dsl.pipeline decorator, so it is not a runtime parameter.
  del run_arguments['pipeline_root']

  # [START execute_kfp_pipeline]
  client = kfp.Client()
  experiment = client.create_experiment("KFP orchestration example")
  run_result = client.run_pipeline(
      experiment_id=experiment.id,
      job_name="KFP orchestration job",
      pipeline_package_path="pipeline.json",
      params=run_arguments)
  # [END execute_kfp_pipeline]
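
  # Note: kfp.Client() without arguments assumes an in-cluster or locally
  # port-forwarded Kubeflow Pipelines instance. To target a remote deployment,
  # pass host="https://<your-kfp-endpoint>" (the endpoint is a placeholder).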