# Source: github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/portability/spark_runner.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A runner for executing portable pipelines on Spark."""

# pytype: skip-file

import os
import re
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.parse` is loaded. This still binds the name `urllib`.
import urllib.parse

from apache_beam.options import pipeline_options
from apache_beam.runners.portability import job_server
from apache_beam.runners.portability import portable_runner
from apache_beam.runners.portability import spark_uber_jar_job_server

# Matches the literal `local`, optionally followed by a bracketed argument
# such as `local[4]` or `local[*]`. See
# https://spark.apache.org/docs/latest/submitting-applications.html#master-urls
LOCAL_MASTER_PATTERN = r'^local(\[.+\])?$'

# Since Java job servers are heavyweight external processes, cache them.
# This applies only to SparkJarJobServer, not SparkUberJarJobServer.
# Job servers created by SparkRunner.default_job_server, keyed by the
# concatenated string forms of SparkRunnerOptions and JobServerOptions.
JOB_SERVER_CACHE = {}


class SparkRunner(portable_runner.PortableRunner):
  """Portable runner that executes pipelines through a Spark job server."""
  def run_pipeline(self, pipeline, options):
    """Run `pipeline`, defaulting to the LOOPBACK environment for local Spark.

    When the Spark master URL matches LOCAL_MASTER_PATTERN and the user has
    set neither `environment_type` nor `output_executable_path`, the
    environment type is set to 'LOOPBACK' before delegating to the parent
    runner.

    Args:
      pipeline: the pipeline to run.
      options: the pipeline options; may be mutated as described above.

    Returns:
      Whatever the parent class's `run_pipeline` returns.
    """
    spark_options = options.view_as(pipeline_options.SparkRunnerOptions)
    portable_options = options.view_as(pipeline_options.PortableOptions)
    if (re.match(LOCAL_MASTER_PATTERN, spark_options.spark_master_url) and
        not portable_options.environment_type and
        not portable_options.output_executable_path):
      portable_options.environment_type = 'LOOPBACK'
    return super().run_pipeline(pipeline, options)

  def default_job_server(self, options):
    """Return the job server to use for `options`.

    With `spark_submit_uber_jar` set, a new SparkUberJarJobServer pointed at
    `spark_rest_url` is returned. Otherwise a SparkJarJobServer wrapped in
    StopOnExitJobServer is returned, reusing the cached instance for
    identical options when one exists.

    Raises:
      ValueError: if `spark_submit_uber_jar` is set without `spark_rest_url`.
    """
    spark_options = options.view_as(pipeline_options.SparkRunnerOptions)
    if spark_options.spark_submit_uber_jar:
      if not spark_options.spark_rest_url:
        raise ValueError('Option spark_rest_url must be set.')
      return spark_uber_jar_job_server.SparkUberJarJobServer(
          spark_options.spark_rest_url, options)
    # Use Java job server by default.
    # Only SparkRunnerOptions and JobServerOptions affect job server
    # configuration, so concat those as the cache key.
    job_server_options = options.view_as(pipeline_options.JobServerOptions)
    options_str = str(spark_options) + str(job_server_options)
    if options_str not in JOB_SERVER_CACHE:
      JOB_SERVER_CACHE[options_str] = job_server.StopOnExitJobServer(
          SparkJarJobServer(options))
    return JOB_SERVER_CACHE[options_str]

  def create_job_service_handle(self, job_service, options):
    """Build a JobServiceHandle for `job_service`.

    Unknown options are retained exactly when `spark_submit_uber_jar` is set.
    """
    return portable_runner.JobServiceHandle(
        job_service,
        options,
        retain_unknown_options=options.view_as(
            pipeline_options.SparkRunnerOptions).spark_submit_uber_jar)


class SparkJarJobServer(job_server.JavaJarJobServer):
  """Java jar job server configured from SparkRunnerOptions."""
  def __init__(self, options):
    """Capture the jar location, master URL and Spark version from `options`."""
    super().__init__(options)
    options = options.view_as(pipeline_options.SparkRunnerOptions)
    self._jar = options.spark_job_server_jar
    self._master_url = options.spark_master_url
    self._spark_version = options.spark_version

  def path_to_jar(self):
    """Return the job server jar to launch.

    If `spark_job_server_jar` was given it is returned as-is, provided it is
    either an existing local path or a string with a URL scheme. Otherwise
    the Spark 3 job server jar is resolved through `path_to_beam_jar`.

    Raises:
      ValueError: if the configured jar neither exists locally nor has a URL
        scheme, or if no jar is configured and the Spark version is '2'.
    """
    # Imported here so the lookup does not depend on some other module having
    # already loaded the `urllib.parse` submodule.
    from urllib.parse import urlparse

    if not self._jar:
      if self._spark_version == '2':
        raise ValueError('Support for Spark 2 was dropped.')
      return self.path_to_beam_jar(':runners:spark:3:job-server:shadowJar')

    if not os.path.exists(self._jar):
      # Not a local file, so it must at least look like a URL.
      url = urlparse(self._jar)
      if not url.scheme:
        raise ValueError(
            'Unable to parse jar URL "%s". If using a full URL, make sure '
            'the scheme is specified. If using a local file path, make sure '
            'the file exists; you may have to first build the job server '
            'using `./gradlew runners:spark:3:job-server:shadowJar`.' %
            self._jar)
    return self._jar

  def java_arguments(
      self, job_port, artifact_port, expansion_port, artifacts_dir):
    """Return the command-line arguments passed to the job server jar."""
    return [
        '--spark-master-url',
        self._master_url,
        '--artifacts-dir',
        artifacts_dir,
        '--job-port',
        job_port,
        '--artifact-port',
        artifact_port,
        '--expansion-port',
        expansion_port
    ]