github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/portability/abstract_job_service.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# pytype: skip-file

import copy
import itertools
import json
import logging
import shutil
import tempfile
import uuid
import zipfile
from concurrent import futures
from typing import TYPE_CHECKING
from typing import Dict
from typing import Iterator
from typing import Optional
from typing import Tuple
from typing import Union

import grpc
from google.protobuf import json_format
from google.protobuf import timestamp_pb2

from apache_beam.portability.api import beam_artifact_api_pb2_grpc
from apache_beam.portability.api import beam_job_api_pb2
from apache_beam.portability.api import beam_job_api_pb2_grpc
from apache_beam.portability.api import endpoints_pb2
from apache_beam.runners.portability import artifact_service
from apache_beam.utils.timestamp import Timestamp

if TYPE_CHECKING:
  # pylint: disable=ungrouped-imports
  from typing import BinaryIO
  from google.protobuf import struct_pb2
  from apache_beam.portability.api import beam_runner_api_pb2

_LOGGER = logging.getLogger(__name__)

StateEvent = Tuple[int, Union[timestamp_pb2.Timestamp, Timestamp]]


def make_state_event(state, timestamp):
  if isinstance(timestamp, Timestamp):
    proto_timestamp = timestamp.to_proto()
  elif isinstance(timestamp, timestamp_pb2.Timestamp):
    proto_timestamp = timestamp
  else:
    raise ValueError(
        "Expected apache_beam.utils.timestamp.Timestamp, "
        "or google.protobuf.timestamp_pb2.Timestamp. "
        "Got %s" % type(timestamp))

  return beam_job_api_pb2.JobStateEvent(state=state, timestamp=proto_timestamp)


class AbstractJobServiceServicer(beam_job_api_pb2_grpc.JobServiceServicer):
  """Manages one or more pipelines, possibly concurrently.
  Experimental: No backward compatibility guaranteed.
  Servicer for the Beam Job API.
  """
  def __init__(self):
    self._jobs = {}  # type: Dict[str, AbstractBeamJob]

  def create_beam_job(self,
                      preparation_id,  # type: str
                      job_name,  # type: str
                      pipeline,  # type: beam_runner_api_pb2.Pipeline
                      options  # type: struct_pb2.Struct
                     ):
    # type: (...) -> AbstractBeamJob

    """Returns an instance of AbstractBeamJob specific to this servicer."""
    raise NotImplementedError(type(self))
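  # A minimal sketch of a concrete servicer, assuming a hypothetical
  # LocalBeamJob subclass of AbstractBeamJob; real runners override
  # create_beam_job in just this way:
  #
  #   class LocalJobServicer(AbstractJobServiceServicer):
  #     def create_beam_job(self, preparation_id, job_name, pipeline, options):
  #       return LocalBeamJob(preparation_id, job_name, pipeline, options)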
  def Prepare(self,
              request,  # type: beam_job_api_pb2.PrepareJobRequest
              context=None,
              timeout=None
             ):
    # type: (...) -> beam_job_api_pb2.PrepareJobResponse
    _LOGGER.debug('Got Prepare request.')
    preparation_id = '%s-%s' % (request.job_name, uuid.uuid4())
    self._jobs[preparation_id] = self.create_beam_job(
        preparation_id,
        request.job_name,
        request.pipeline,
        request.pipeline_options)
    self._jobs[preparation_id].prepare()
    _LOGGER.debug("Prepared job '%s' as '%s'", request.job_name, preparation_id)
    return beam_job_api_pb2.PrepareJobResponse(
        preparation_id=preparation_id,
        artifact_staging_endpoint=self._jobs[preparation_id].
        artifact_staging_endpoint(),
        staging_session_token=preparation_id)

  def Run(self,
          request,  # type: beam_job_api_pb2.RunJobRequest
          context=None,
          timeout=None
         ):
    # type: (...) -> beam_job_api_pb2.RunJobResponse
    # For now, just use the preparation id as the job id.
    job_id = request.preparation_id
    _LOGGER.info("Running job '%s'", job_id)
    self._jobs[job_id].run()
    return beam_job_api_pb2.RunJobResponse(job_id=job_id)

  def GetJobs(self,
              request,  # type: beam_job_api_pb2.GetJobsRequest
              context=None,
              timeout=None
             ):
    # type: (...) -> beam_job_api_pb2.GetJobsResponse
    return beam_job_api_pb2.GetJobsResponse(
        job_info=[job.to_runner_api() for job in self._jobs.values()])

  def GetState(
      self,
      request,  # type: beam_job_api_pb2.GetJobStateRequest
      context=None):
    # type: (...) -> beam_job_api_pb2.JobStateEvent
    return make_state_event(*self._jobs[request.job_id].get_state())

  def GetPipeline(self,
                  request,  # type: beam_job_api_pb2.GetJobPipelineRequest
                  context=None,
                  timeout=None
                 ):
    # type: (...) -> beam_job_api_pb2.GetJobPipelineResponse
    return beam_job_api_pb2.GetJobPipelineResponse(
        pipeline=self._jobs[request.job_id].get_pipeline())

  def Cancel(self,
             request,  # type: beam_job_api_pb2.CancelJobRequest
             context=None,
             timeout=None
            ):
    # type: (...) -> beam_job_api_pb2.CancelJobResponse
    self._jobs[request.job_id].cancel()
    return beam_job_api_pb2.CancelJobResponse(
        state=self._jobs[request.job_id].get_state()[0])

  def GetStateStream(self, request, context=None, timeout=None):
    # type: (...) -> Iterator[beam_job_api_pb2.JobStateEvent]

    """Yields state transitions since the stream started."""
    if request.job_id not in self._jobs:
      raise LookupError("Job {} does not exist".format(request.job_id))

    job = self._jobs[request.job_id]
    for state, timestamp in job.get_state_stream():
      yield make_state_event(state, timestamp)

  def GetMessageStream(self, request, context=None, timeout=None):
    # type: (...) -> Iterator[beam_job_api_pb2.JobMessagesResponse]

    """Yields messages since the stream started."""
    if request.job_id not in self._jobs:
      raise LookupError("Job {} does not exist".format(request.job_id))

    job = self._jobs[request.job_id]
    for msg in job.get_message_stream():
      if isinstance(msg, tuple):
        resp = beam_job_api_pb2.JobMessagesResponse(
            state_response=make_state_event(*msg))
      else:
        resp = beam_job_api_pb2.JobMessagesResponse(message_response=msg)
      yield resp
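  # Usage sketch for the RPCs above, assuming this servicer is registered on
  # a gRPC server at a hypothetical localhost:8099 (pipeline_proto and
  # options_struct stand in for a real pipeline proto and its options struct):
  #
  #   channel = grpc.insecure_channel('localhost:8099')
  #   stub = beam_job_api_pb2_grpc.JobServiceStub(channel)
  #   prepared = stub.Prepare(
  #       beam_job_api_pb2.PrepareJobRequest(
  #           job_name='example',
  #           pipeline=pipeline_proto,
  #           pipeline_options=options_struct))
  #   stub.Run(
  #       beam_job_api_pb2.RunJobRequest(
  #           preparation_id=prepared.preparation_id))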
  def DescribePipelineOptions(self, request, context=None, timeout=None):
    # type: (...) -> beam_job_api_pb2.DescribePipelineOptionsResponse
    return beam_job_api_pb2.DescribePipelineOptionsResponse()


class AbstractBeamJob(object):
  """Abstract base class for managing a single Beam job."""
  def __init__(self,
               job_id,  # type: str
               job_name,  # type: str
               pipeline,  # type: beam_runner_api_pb2.Pipeline
               options  # type: struct_pb2.Struct
              ):
    self._job_id = job_id
    self._job_name = job_name
    self._pipeline_proto = pipeline
    self._pipeline_options = options
    self._state_history = [(beam_job_api_pb2.JobState.STOPPED, Timestamp.now())]

  def prepare(self):
    # type: () -> None

    """Called immediately after this class is instantiated."""
    raise NotImplementedError(self)

  def run(self):
    # type: () -> None
    raise NotImplementedError(self)

  def cancel(self):
    # type: () -> Optional[beam_job_api_pb2.JobState.Enum]
    raise NotImplementedError(self)

  def artifact_staging_endpoint(self):
    # type: () -> Optional[endpoints_pb2.ApiServiceDescriptor]
    raise NotImplementedError(self)

  def get_state_stream(self):
    # type: () -> Iterator[StateEvent]
    raise NotImplementedError(self)

  def get_message_stream(self):
    # type: () -> Iterator[Union[StateEvent, Optional[beam_job_api_pb2.JobMessage]]]
    raise NotImplementedError(self)

  @property
  def state(self):
    """Get the latest state enum."""
    return self.get_state()[0]

  def get_state(self):
    """Get a tuple of the latest state and its timestamp."""
    # This is safe: the initial state is set in __init__.
    return self._state_history[-1]

  def set_state(self, new_state):
    """Set the latest state as an int enum and update the state history.

    :param new_state: int
      latest state enum
    :return: Timestamp or None
      the new timestamp if the state has changed, else None
    """
    if new_state != self._state_history[-1][0]:
      timestamp = Timestamp.now()
      self._state_history.append((new_state, timestamp))
      return timestamp
    else:
      return None

  def with_state_history(self, state_stream):
    """Utility to prepend the recorded state history to an active state
    stream."""
    return itertools.chain(self._state_history[:], state_stream)

  def get_pipeline(self):
    # type: () -> beam_runner_api_pb2.Pipeline
    return self._pipeline_proto

  @staticmethod
  def is_terminal_state(state):
    from apache_beam.runners.portability import portable_runner
    return state in portable_runner.TERMINAL_STATES

  def to_runner_api(self):
    # type: () -> beam_job_api_pb2.JobInfo
    return beam_job_api_pb2.JobInfo(
        job_id=self._job_id,
        job_name=self._job_name,
        pipeline_options=self._pipeline_options,
        state=self.state)


class JarArtifactManager(object):
  def __init__(self, jar_path, root):
    self._root = root
    self._zipfile_handle = zipfile.ZipFile(jar_path, 'a')

  def close(self):
    self._zipfile_handle.close()

  def file_writer(self, path):
    # type: (str) -> Tuple[BinaryIO, str]

    """Given a relative path, returns an open handle that can be written to
    and a reference that can later be used to read this file."""
    full_path = '%s/%s' % (self._root, path)
    return self._zipfile_handle.open(
        full_path, 'w', force_zip64=True), 'classpath://%s' % full_path

  def zipfile_handle(self):
    return self._zipfile_handle
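# Sketch of staging one artifact through JarArtifactManager; the jar path and
# artifact name below are illustrative:
#
#   manager = JarArtifactManager(
#       '/tmp/job.jar', 'BEAM-PIPELINE/pipeline/artifacts')
#   handle, ref = manager.file_writer('dep.whl')
#   with handle:
#     handle.write(b'<wheel bytes>')
#   # ref is 'classpath://BEAM-PIPELINE/pipeline/artifacts/dep.whl'
#   manager.close()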

class UberJarBeamJob(AbstractBeamJob):
  """Abstract base class for creating a Beam job. The resulting job will be
  packaged and run in an executable uber jar."""

  # These must agree with those defined in PortablePipelineJarUtils.java.
  PIPELINE_FOLDER = 'BEAM-PIPELINE'
  PIPELINE_MANIFEST = PIPELINE_FOLDER + '/pipeline-manifest.json'

  # We only stage a single pipeline in the jar.
  PIPELINE_NAME = 'pipeline'
  PIPELINE_PATH = '/'.join([PIPELINE_FOLDER, PIPELINE_NAME, 'pipeline.json'])
  PIPELINE_OPTIONS_PATH = '/'.join(
      [PIPELINE_FOLDER, PIPELINE_NAME, 'pipeline-options.json'])
  ARTIFACT_FOLDER = '/'.join([PIPELINE_FOLDER, PIPELINE_NAME, 'artifacts'])

  def __init__(
      self,
      executable_jar,
      job_id,
      job_name,
      pipeline,
      options,
      artifact_port=0):
    super().__init__(job_id, job_name, pipeline, options)
    self._executable_jar = executable_jar
    self._jar_uploaded = False
    self._artifact_port = artifact_port

  def prepare(self):
    # Copy the executable jar, injecting the pipeline and options as resources.
    with tempfile.NamedTemporaryFile(suffix='.jar') as tout:
      self._jar = tout.name
    shutil.copy(self._executable_jar, self._jar)
    self._start_artifact_service(self._jar, self._artifact_port)

  def _start_artifact_service(self, jar, requested_port):
    self._artifact_manager = JarArtifactManager(self._jar, self.ARTIFACT_FOLDER)
    self._artifact_staging_service = artifact_service.ArtifactStagingService(
        self._artifact_manager.file_writer)
    self._artifact_staging_service.register_job(
        self._job_id,
        {
            env_id: env.dependencies
            for (env_id,
                 env) in self._pipeline_proto.components.environments.items()
        })
    options = [("grpc.http2.max_pings_without_data", 0),
               ("grpc.http2.max_ping_strikes", 0)]
    self._artifact_staging_server = grpc.server(
        futures.ThreadPoolExecutor(), options=options)
    port = self._artifact_staging_server.add_insecure_port(
        '[::]:%s' % requested_port)
    beam_artifact_api_pb2_grpc.add_ArtifactStagingServiceServicer_to_server(
        self._artifact_staging_service, self._artifact_staging_server)
    self._artifact_staging_endpoint = endpoints_pb2.ApiServiceDescriptor(
        url='localhost:%d' % port)
    self._artifact_staging_server.start()
    _LOGGER.info('Artifact server started on port %s', port)
    return port

  def _stop_artifact_service(self):
    self._artifact_staging_server.stop(1)

    # Update dependencies to point to staged files.
    pipeline = copy.copy(self._pipeline_proto)
    if any(env.dependencies
           for env in pipeline.components.environments.values()):
      for env_id, deps in self._artifact_staging_service.resolved_deps(
          self._job_id).items():
        # Slice assignment not supported for repeated fields.
        env = self._pipeline_proto.components.environments[env_id]
        del env.dependencies[:]
        env.dependencies.extend(deps)

    # Copy the pipeline definition and metadata into the jar.
    z = self._artifact_manager.zipfile_handle()
    with z.open(self.PIPELINE_PATH, 'w') as fout:
      fout.write(
          json_format.MessageToJson(self._pipeline_proto).encode('utf-8'))
    with z.open(self.PIPELINE_OPTIONS_PATH, 'w') as fout:
      fout.write(
          json_format.MessageToJson(self._pipeline_options).encode('utf-8'))
    with z.open(self.PIPELINE_MANIFEST, 'w') as fout:
      fout.write(
          json.dumps({
              'defaultJobName': self.PIPELINE_NAME
          }).encode('utf-8'))

    # Closes the jar file.
    self._artifact_manager.close()
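  # For reference, the manifest written above is just
  # {"defaultJobName": "pipeline"}, and the jar entries produced (derived
  # from the class constants) are:
  #
  #   BEAM-PIPELINE/pipeline-manifest.json
  #   BEAM-PIPELINE/pipeline/pipeline.json
  #   BEAM-PIPELINE/pipeline/pipeline-options.json
  #   BEAM-PIPELINE/pipeline/artifacts/...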
  def artifact_staging_endpoint(self):
    return self._artifact_staging_endpoint
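# Lifecycle sketch for a concrete UberJarBeamJob subclass; SubmitToCluster is
# a hypothetical helper, since the actual submission step is runner-specific:
#
#   class ClusterUberJarBeamJob(UberJarBeamJob):
#     def run(self):
#       # Seal the jar: write pipeline.json, pipeline-options.json, and the
#       # manifest, then close the zip handle ...
#       self._stop_artifact_service()
#       # ... and hand the self-contained jar to the cluster.
#       SubmitToCluster(self._jar)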