github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/dataflow/dataflow_job_service.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import logging
import sys

from apache_beam.runners.dataflow import dataflow_runner
from apache_beam.runners.portability import local_job_service
from apache_beam.runners.portability import local_job_service_main
from apache_beam.runners.portability import portable_runner


class DataflowBeamJob(local_job_service.BeamJob):
  """A representation of a single Beam job to be run on the Dataflow runner.
  """
  def _invoke_runner(self):
    """Actually calls Dataflow and waits for completion.
    """
    runner = dataflow_runner.DataflowRunner()
    self.result = runner.run_pipeline(
        None, self.pipeline_options(), self._pipeline_proto)
    # Prefer this to result.wait_until_finish() to get state updates
    # and avoid creating an extra thread (which also messes with logging).
    dataflow_runner.DataflowRunner.poll_for_job_completion(
        runner,
        self.result,
        None,
        lambda dataflow_state: self.set_state(
            portable_runner.PipelineResult.pipeline_state_to_runner_api_state(
                self.result.api_jobstate_to_pipeline_state(dataflow_state))))
    return self.result

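  # Illustrative sketch (the concrete state names are assumptions drawn from
  # the public Beam and Dataflow APIs, not part of the original module): each
  # polled Dataflow state passes through two translations before reaching
  # set_state, roughly
  #
  #   'JOB_STATE_RUNNING'                     # Dataflow API job state
  #     -> PipelineState.RUNNING              # api_jobstate_to_pipeline_state
  #     -> beam_job_api_pb2.JobState.RUNNING  # pipeline_state_to_runner_api_state
  #
  # so Job API clients observe standard portable JobState enum values.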
  def cancel(self):
    if not self.is_terminal_state(self.state):
      self.result.cancel()


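# A minimal sketch (an assumption, not part of the original module): run()
# below accepts any BeamJob subclass via beam_job_type, so the job class can
# be customized, e.g. to log each submission:
#
#   class LoggingDataflowBeamJob(DataflowBeamJob):
#     def _invoke_runner(self):
#       logging.info('Submitting pipeline to Dataflow...')
#       return super()._invoke_runner()
#
#   run(sys.argv, beam_job_type=LoggingDataflowBeamJob)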
def run(argv, beam_job_type=DataflowBeamJob):
  if argv[0] == __file__:
    argv = argv[1:]
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '-p',
      '--port',
      '--job_port',
      type=int,
      default=0,
      help='port on which to serve the job api')
  parser.add_argument('--staging_dir')
  options = parser.parse_args(argv)

  job_servicer = local_job_service.LocalJobServicer(
      options.staging_dir, beam_job_type=beam_job_type)
  port = job_servicer.start_grpc_server(options.port)
  try:
    local_job_service_main.serve(
        "Listening for beam jobs on port %d." % port, job_servicer)
  finally:
    job_servicer.stop()


if __name__ == '__main__':
  logging.basicConfig()
  logging.getLogger().setLevel(logging.INFO)
  run(sys.argv)
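
# Example usage (a sketch; the port, project, bucket, and pipeline below are
# assumptions, not part of the original module). Start the service:
#
#   python -m apache_beam.runners.dataflow.dataflow_job_service --port 8099
#
# then submit a pipeline to it through the portable Job API:
#
#   import apache_beam as beam
#   from apache_beam.options.pipeline_options import PipelineOptions
#
#   options = PipelineOptions([
#       '--runner=PortableRunner',
#       '--job_endpoint=localhost:8099',
#       '--project=my-gcp-project',
#       '--region=us-central1',
#       '--temp_location=gs://my-bucket/tmp',
#   ])
#   with beam.Pipeline(options=options) as p:
#     _ = p | beam.Create([1, 2, 3]) | beam.Map(print)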