github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/portability/flink_runner.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """A runner for executing portable pipelines on Flink."""
    19  
    20  # pytype: skip-file
    21  
    22  import logging
    23  import os
    24  import re
    25  import urllib
    26  
    27  from apache_beam.options import pipeline_options
    28  from apache_beam.runners.portability import flink_uber_jar_job_server
    29  from apache_beam.runners.portability import job_server
    30  from apache_beam.runners.portability import portable_runner
    31  
# Special `flink_master` values that are not real addresses. Within this module
# they are never prefixed with an http scheme, are never used as the target of
# an uber jar submission, and make the runner default the environment type to
# LOOPBACK when the user did not choose one.
MAGIC_HOST_NAMES = ['[local]', '[auto]']

# Module-level logger, named after this module.
_LOGGER = logging.getLogger(__name__)
    35  
    36  
    37  class FlinkRunner(portable_runner.PortableRunner):
    38    def run_pipeline(self, pipeline, options):
    39      portable_options = options.view_as(pipeline_options.PortableOptions)
    40      flink_options = options.view_as(pipeline_options.FlinkRunnerOptions)
    41      if (flink_options.flink_master in MAGIC_HOST_NAMES and
    42          not portable_options.environment_type and
    43          not portable_options.output_executable_path):
    44        portable_options.environment_type = 'LOOPBACK'
    45      return super().run_pipeline(pipeline, options)
    46  
    47    def default_job_server(self, options):
    48      flink_options = options.view_as(pipeline_options.FlinkRunnerOptions)
    49      flink_master = self.add_http_scheme(flink_options.flink_master)
    50      flink_options.flink_master = flink_master
    51      if (flink_options.flink_submit_uber_jar and
    52          flink_master not in MAGIC_HOST_NAMES):
    53        # This has to be changed [auto], otherwise we will attempt to submit a
    54        # the pipeline remotely on the Flink JobMaster which will _fail_.
    55        # DO NOT CHANGE the following line, unless you have tested this.
    56        flink_options.flink_master = '[auto]'
    57        return flink_uber_jar_job_server.FlinkUberJarJobServer(
    58            flink_master, options)
    59      else:
    60        return job_server.StopOnExitJobServer(FlinkJarJobServer(options))
    61  
    62    def create_job_service_handle(self, job_service, options):
    63      return portable_runner.JobServiceHandle(
    64          job_service,
    65          options,
    66          retain_unknown_options=options.view_as(
    67              pipeline_options.FlinkRunnerOptions).flink_submit_uber_jar)
    68  
    69    @staticmethod
    70    def add_http_scheme(flink_master):
    71      """Adds a http protocol scheme if none provided."""
    72      flink_master = flink_master.strip()
    73      if not flink_master in MAGIC_HOST_NAMES and \
    74            not re.search('^http[s]?://', flink_master):
    75        _LOGGER.info(
    76            'Adding HTTP protocol scheme to flink_master parameter: '
    77            'http://%s',
    78            flink_master)
    79        flink_master = 'http://' + flink_master
    80      return flink_master
    81  
    82  
    83  class FlinkJarJobServer(job_server.JavaJarJobServer):
    84    def __init__(self, options):
    85      super().__init__(options)
    86      options = options.view_as(pipeline_options.FlinkRunnerOptions)
    87      self._jar = options.flink_job_server_jar
    88      self._master_url = options.flink_master
    89      self._flink_version = options.flink_version
    90  
    91    def path_to_jar(self):
    92      if self._jar:
    93        if not os.path.exists(self._jar):
    94          url = urllib.parse.urlparse(self._jar)
    95          if not url.scheme:
    96            raise ValueError(
    97                'Unable to parse jar URL "%s". If using a full URL, make sure '
    98                'the scheme is specified. If using a local file path, make sure '
    99                'the file exists; you may have to first build the job server '
   100                'using `./gradlew runners:flink:%s:job-server:shadowJar`.' %
   101                (self._jar, self._flink_version))
   102        return self._jar
   103      else:
   104        return self.path_to_beam_jar(
   105            ':runners:flink:%s:job-server:shadowJar' % self._flink_version)
   106  
   107    def java_arguments(
   108        self, job_port, artifact_port, expansion_port, artifacts_dir):
   109      return [
   110          '--flink-master',
   111          self._master_url,
   112          '--artifacts-dir',
   113          artifacts_dir,
   114          '--job-port',
   115          job_port,
   116          '--artifact-port',
   117          artifact_port,
   118          '--expansion-port',
   119          expansion_port
   120      ]