github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/portability/spark_runner.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A runner for executing portable pipelines on Spark."""

# pytype: skip-file

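# A minimal usage sketch (my_pipeline is a hypothetical module; the flags
# are existing Beam pipeline options consumed by this runner):
#
#   python -m my_pipeline \
#       --runner=SparkRunner \
#       --spark_master_url=local[4]
#
# With a local master and no explicit environment, LOOPBACK is applied
# by run_pipeline below.
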
import os
import re
import urllib.parse

from apache_beam.options import pipeline_options
from apache_beam.runners.portability import job_server
from apache_beam.runners.portability import portable_runner
from apache_beam.runners.portability import spark_uber_jar_job_server

# https://spark.apache.org/docs/latest/submitting-applications.html#master-urls
LOCAL_MASTER_PATTERN = r'^local(\[.+\])?$'
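# Matches e.g. 'local', 'local[4]', and 'local[*]'; cluster masters such as
# 'spark://host:7077' do not match and keep their configured environment.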

# Since Java job servers are heavyweight external processes, cache them.
# This applies only to SparkJarJobServer, not SparkUberJarJobServer.
JOB_SERVER_CACHE = {}


class SparkRunner(portable_runner.PortableRunner):
  def run_pipeline(self, pipeline, options):
    spark_options = options.view_as(pipeline_options.SparkRunnerOptions)
    portable_options = options.view_as(pipeline_options.PortableOptions)
    if (re.match(LOCAL_MASTER_PATTERN, spark_options.spark_master_url) and
        not portable_options.environment_type and
        not portable_options.output_executable_path):
      # LOOPBACK runs user code in the submitting process, which is only
      # appropriate when the Spark master itself is local.
      portable_options.environment_type = 'LOOPBACK'
    return super().run_pipeline(pipeline, options)

  def default_job_server(self, options):
    spark_options = options.view_as(pipeline_options.SparkRunnerOptions)
    if spark_options.spark_submit_uber_jar:
      if not spark_options.spark_rest_url:
        raise ValueError('Option spark_rest_url must be set.')
      return spark_uber_jar_job_server.SparkUberJarJobServer(
          spark_options.spark_rest_url, options)
    # Use the Java job server by default.
    # Only SparkRunnerOptions and JobServerOptions affect job server
    # configuration, so concatenate those as the cache key.
    job_server_options = options.view_as(pipeline_options.JobServerOptions)
    options_str = str(spark_options) + str(job_server_options)
    if options_str not in JOB_SERVER_CACHE:
      JOB_SERVER_CACHE[options_str] = job_server.StopOnExitJobServer(
          SparkJarJobServer(options))
    return JOB_SERVER_CACHE[options_str]
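
  # A sketch of the options that select the uber jar path above (the host
  # is hypothetical; 6066 is Spark's default REST submission port):
  #
  #   --spark_submit_uber_jar \
  #   --spark_rest_url=http://spark-master:6066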

  def create_job_service_handle(self, job_service, options):
    return portable_runner.JobServiceHandle(
        job_service,
        options,
        retain_unknown_options=options.view_as(
            pipeline_options.SparkRunnerOptions).spark_submit_uber_jar)


class SparkJarJobServer(job_server.JavaJarJobServer):
  def __init__(self, options):
    super().__init__(options)
    options = options.view_as(pipeline_options.SparkRunnerOptions)
    self._jar = options.spark_job_server_jar
    self._master_url = options.spark_master_url
    self._spark_version = options.spark_version

  def path_to_jar(self):
    if self._jar:
      if not os.path.exists(self._jar):
        url = urllib.parse.urlparse(self._jar)
        if not url.scheme:
          raise ValueError(
              'Unable to parse jar URL "%s". If using a full URL, make sure '
              'the scheme is specified. If using a local file path, make sure '
              'the file exists; you may have to first build the job server '
              'using `./gradlew runners:spark:3:job-server:shadowJar`.' %
              self._jar)
      return self._jar
    else:
      if self._spark_version == '2':
        raise ValueError('Support for Spark 2 was dropped.')
      return self.path_to_beam_jar(':runners:spark:3:job-server:shadowJar')
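
  # A sketch of pointing at a prebuilt jar instead of the Gradle target
  # (both the local path and the URL below are hypothetical):
  #
  #   --spark_job_server_jar=/path/to/beam-runners-spark-3-job-server.jar
  #   --spark_job_server_jar=https://example.com/job-server.jar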

  def java_arguments(
      self, job_port, artifact_port, expansion_port, artifacts_dir):
    return [
        '--spark-master-url',
        self._master_url,
        '--artifacts-dir',
        artifacts_dir,
        '--job-port',
        job_port,
        '--artifact-port',
        artifact_port,
        '--expansion-port',
        expansion_port
    ]
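
  # For example (a sketch; the ports, master URL, and artifacts dir are
  # illustrative), with job_port=8099, artifact_port=8098,
  # expansion_port=8097:
  #
  #   ['--spark-master-url', 'local[4]', '--artifacts-dir', '/tmp/artifacts',
  #    '--job-port', 8099, '--artifact-port', 8098, '--expansion-port', 8097]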