github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/portability/abstract_job_service.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# pytype: skip-file

import copy
import itertools
import json
import logging
import shutil
import tempfile
import uuid
import zipfile
from concurrent import futures
from typing import TYPE_CHECKING
from typing import Dict
from typing import Iterator
from typing import Optional
from typing import Tuple
from typing import Union

import grpc
from google.protobuf import json_format
from google.protobuf import timestamp_pb2

from apache_beam.portability.api import beam_artifact_api_pb2_grpc
from apache_beam.portability.api import beam_job_api_pb2
from apache_beam.portability.api import beam_job_api_pb2_grpc
from apache_beam.portability.api import endpoints_pb2
from apache_beam.runners.portability import artifact_service
from apache_beam.utils.timestamp import Timestamp

if TYPE_CHECKING:
  # pylint: disable=ungrouped-imports
  from typing import BinaryIO
  from google.protobuf import struct_pb2
  from apache_beam.portability.api import beam_runner_api_pb2

_LOGGER = logging.getLogger(__name__)

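# A state transition event: (JobState enum value, time the transition
# occurred). The timestamp may be either a Beam or a protobuf timestamp;
# make_state_event() below normalizes it to the proto form.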
StateEvent = Tuple[int, Union[timestamp_pb2.Timestamp, Timestamp]]


def make_state_event(state, timestamp):
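  """Builds a JobStateEvent, accepting a Beam or protobuf timestamp."""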
  if isinstance(timestamp, Timestamp):
    proto_timestamp = timestamp.to_proto()
  elif isinstance(timestamp, timestamp_pb2.Timestamp):
    proto_timestamp = timestamp
  else:
    raise ValueError(
        "Expected apache_beam.utils.timestamp.Timestamp, "
        "or google.protobuf.timestamp_pb2.Timestamp. "
        "Got %s" % type(timestamp))

  return beam_job_api_pb2.JobStateEvent(state=state, timestamp=proto_timestamp)


class AbstractJobServiceServicer(beam_job_api_pb2_grpc.JobServiceServicer):
  """Servicer for the Beam Job API.

  Manages one or more pipelines, possibly concurrently.
  Experimental: no backward compatibility guaranteed.
  """
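  # Subclasses implement create_beam_job(); the Job API methods below
  # (Prepare, Run, GetState, ...) are expressed generically in terms of it.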
  def __init__(self):
    self._jobs = {}  # type: Dict[str, AbstractBeamJob]

  def create_beam_job(self,
                      preparation_id,  # type: str
                      job_name,  # type: str
                      pipeline,  # type: beam_runner_api_pb2.Pipeline
                      options  # type: struct_pb2.Struct
                     ):
    # type: (...) -> AbstractBeamJob

    """Returns an instance of AbstractBeamJob specific to this servicer."""
    raise NotImplementedError(type(self))
  def Prepare(self,
              request,  # type: beam_job_api_pb2.PrepareJobRequest
              context=None,
              timeout=None
             ):
    # type: (...) -> beam_job_api_pb2.PrepareJobResponse
    _LOGGER.debug('Got Prepare request.')
    preparation_id = '%s-%s' % (request.job_name, uuid.uuid4())
    self._jobs[preparation_id] = self.create_beam_job(
        preparation_id,
        request.job_name,
        request.pipeline,
        request.pipeline_options)
    self._jobs[preparation_id].prepare()
    _LOGGER.debug("Prepared job '%s' as '%s'", request.job_name, preparation_id)
    return beam_job_api_pb2.PrepareJobResponse(
        preparation_id=preparation_id,
        artifact_staging_endpoint=self._jobs[preparation_id].
        artifact_staging_endpoint(),
        staging_session_token=preparation_id)

  def Run(self,
          request,  # type: beam_job_api_pb2.RunJobRequest
          context=None,
          timeout=None
         ):
    # type: (...) -> beam_job_api_pb2.RunJobResponse
    # For now, just use the preparation id as the job id.
    job_id = request.preparation_id
    _LOGGER.info("Running job '%s'", job_id)
    self._jobs[job_id].run()
    return beam_job_api_pb2.RunJobResponse(job_id=job_id)

  def GetJobs(self,
              request,  # type: beam_job_api_pb2.GetJobsRequest
              context=None,
              timeout=None
             ):
    # type: (...) -> beam_job_api_pb2.GetJobsResponse
    return beam_job_api_pb2.GetJobsResponse(
        job_info=[job.to_runner_api() for job in self._jobs.values()])

  def GetState(
      self,
      request,  # type: beam_job_api_pb2.GetJobStateRequest
      context=None):
    # type: (...) -> beam_job_api_pb2.JobStateEvent
    return make_state_event(*self._jobs[request.job_id].get_state())

  def GetPipeline(self,
                  request,  # type: beam_job_api_pb2.GetJobPipelineRequest
                  context=None,
                  timeout=None
                 ):
    # type: (...) -> beam_job_api_pb2.GetJobPipelineResponse
    return beam_job_api_pb2.GetJobPipelineResponse(
        pipeline=self._jobs[request.job_id].get_pipeline())

  def Cancel(self,
             request,  # type: beam_job_api_pb2.CancelJobRequest
             context=None,
             timeout=None
            ):
    # type: (...) -> beam_job_api_pb2.CancelJobResponse
    self._jobs[request.job_id].cancel()
    return beam_job_api_pb2.CancelJobResponse(
        state=self._jobs[request.job_id].get_state()[0])
  def GetStateStream(self, request, context=None, timeout=None):
    # type: (...) -> Iterator[beam_job_api_pb2.JobStateEvent]

    """Yields state transitions since the stream started."""
    if request.job_id not in self._jobs:
      raise LookupError("Job {} does not exist".format(request.job_id))

    job = self._jobs[request.job_id]
    for state, timestamp in job.get_state_stream():
      yield make_state_event(state, timestamp)

  def GetMessageStream(self, request, context=None, timeout=None):
    # type: (...) -> Iterator[beam_job_api_pb2.JobMessagesResponse]

    """Yields messages since the stream started."""
    if request.job_id not in self._jobs:
      raise LookupError("Job {} does not exist".format(request.job_id))

    job = self._jobs[request.job_id]
    for msg in job.get_message_stream():
      if isinstance(msg, tuple):
        resp = beam_job_api_pb2.JobMessagesResponse(
            state_response=make_state_event(*msg))
      else:
        resp = beam_job_api_pb2.JobMessagesResponse(message_response=msg)
      yield resp

  def DescribePipelineOptions(self, request, context=None, timeout=None):
    # type: (...) -> beam_job_api_pb2.DescribePipelineOptionsResponse
    return beam_job_api_pb2.DescribePipelineOptionsResponse()

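# A minimal sketch of how a client drives this servicer over gRPC; the
# `channel` and `pipeline_proto` names are assumptions for illustration,
# not part of this module:
#
#   stub = beam_job_api_pb2_grpc.JobServiceStub(channel)
#   prep = stub.Prepare(beam_job_api_pb2.PrepareJobRequest(
#       job_name='example', pipeline=pipeline_proto, pipeline_options=options))
#   stub.Run(beam_job_api_pb2.RunJobRequest(preparation_id=prep.preparation_id))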

class AbstractBeamJob(object):
  """Abstract base class for managing a single Beam job."""

  def __init__(self,
               job_id,  # type: str
               job_name,  # type: str
               pipeline,  # type: beam_runner_api_pb2.Pipeline
               options  # type: struct_pb2.Struct
              ):
    self._job_id = job_id
    self._job_name = job_name
    self._pipeline_proto = pipeline
    self._pipeline_options = options
    self._state_history = [(beam_job_api_pb2.JobState.STOPPED, Timestamp.now())]

  def prepare(self):
    # type: () -> None

    """Called immediately after this class is instantiated."""
    raise NotImplementedError(self)

  def run(self):
    # type: () -> None
    raise NotImplementedError(self)

  def cancel(self):
    # type: () -> Optional[beam_job_api_pb2.JobState.Enum]
    raise NotImplementedError(self)

  def artifact_staging_endpoint(self):
    # type: () -> Optional[endpoints_pb2.ApiServiceDescriptor]
    raise NotImplementedError(self)

  def get_state_stream(self):
    # type: () -> Iterator[StateEvent]
    raise NotImplementedError(self)

  def get_message_stream(self):
    # type: () -> Iterator[Union[StateEvent, Optional[beam_job_api_pb2.JobMessage]]]
    raise NotImplementedError(self)

  @property
  def state(self):
    """Get the latest state enum."""
    return self.get_state()[0]

  def get_state(self):
    """Get a tuple of the latest state and its timestamp."""
    # This is safe: the initial state is set in __init__.
    return self._state_history[-1]
  def set_state(self, new_state):
    """Set the latest state as an int enum and update the state history.

    :param new_state: int
      latest state enum
    :return: Timestamp or None
      the new timestamp if the state changed, else None
    """
    if new_state != self._state_history[-1][0]:
      timestamp = Timestamp.now()
      self._state_history.append((new_state, timestamp))
      return timestamp
    else:
      return None

  def with_state_history(self, state_stream):
    """Utility to prepend the recorded state history to an active state stream."""
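    # Copy the recorded history so that transitions appended to
    # self._state_history later do not mutate the chained stream.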
    return itertools.chain(self._state_history[:], state_stream)

  def get_pipeline(self):
    # type: () -> beam_runner_api_pb2.Pipeline
    return self._pipeline_proto

  @staticmethod
  def is_terminal_state(state):
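    # Imported locally, presumably to avoid a circular import between this
    # module and portable_runner.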
    from apache_beam.runners.portability import portable_runner
    return state in portable_runner.TERMINAL_STATES

  def to_runner_api(self):
    # type: () -> beam_job_api_pb2.JobInfo
    return beam_job_api_pb2.JobInfo(
        job_id=self._job_id,
        job_name=self._job_name,
        pipeline_options=self._pipeline_options,
        state=self.state)


class JarArtifactManager(object):
  def __init__(self, jar_path, root):
    self._root = root
    self._zipfile_handle = zipfile.ZipFile(jar_path, 'a')

  def close(self):
    self._zipfile_handle.close()

  def file_writer(self, path):
    # type: (str) -> Tuple[BinaryIO, str]

    """Given a relative path, returns an open handle that can be written to
    and a reference that can later be used to read this file."""
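    # The returned classpath:// reference is resolved from the jar's own
    # resources at runtime; the layout must agree with
    # PortablePipelineJarUtils.java (see UberJarBeamJob below).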
    full_path = '%s/%s' % (self._root, path)
    return self._zipfile_handle.open(
        full_path, 'w', force_zip64=True), 'classpath://%s' % full_path

  def zipfile_handle(self):
    return self._zipfile_handle


class UberJarBeamJob(AbstractBeamJob):
  """Abstract base class for creating a Beam job. The resulting job will be
  packaged and run in an executable uber jar."""

  # These must agree with those defined in PortablePipelineJarUtils.java.
  PIPELINE_FOLDER = 'BEAM-PIPELINE'
  PIPELINE_MANIFEST = PIPELINE_FOLDER + '/pipeline-manifest.json'

  # We only stage a single pipeline in the jar.
  PIPELINE_NAME = 'pipeline'
  PIPELINE_PATH = '/'.join([PIPELINE_FOLDER, PIPELINE_NAME, 'pipeline.json'])
  PIPELINE_OPTIONS_PATH = '/'.join(
      [PIPELINE_FOLDER, PIPELINE_NAME, 'pipeline-options.json'])
  ARTIFACT_FOLDER = '/'.join([PIPELINE_FOLDER, PIPELINE_NAME, 'artifacts'])
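  # Resulting layout inside the jar, given the constants above:
  #   BEAM-PIPELINE/pipeline-manifest.json
  #   BEAM-PIPELINE/pipeline/pipeline.json
  #   BEAM-PIPELINE/pipeline/pipeline-options.json
  #   BEAM-PIPELINE/pipeline/artifacts/<staged files>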

  def __init__(
      self,
      executable_jar,
      job_id,
      job_name,
      pipeline,
      options,
      artifact_port=0):
    super().__init__(job_id, job_name, pipeline, options)
    self._executable_jar = executable_jar
    self._jar_uploaded = False
    self._artifact_port = artifact_port

  def prepare(self):
    # Copy the executable jar, injecting the pipeline and options as resources.
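    # NamedTemporaryFile is used only to reserve a unique path: the empty file
    # is deleted when the context exits, and shutil.copy then recreates it.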
    with tempfile.NamedTemporaryFile(suffix='.jar') as tout:
      self._jar = tout.name
    shutil.copy(self._executable_jar, self._jar)
    self._start_artifact_service(self._jar, self._artifact_port)

  def _start_artifact_service(self, jar, requested_port):
    self._artifact_manager = JarArtifactManager(self._jar, self.ARTIFACT_FOLDER)
    self._artifact_staging_service = artifact_service.ArtifactStagingService(
        self._artifact_manager.file_writer)
    self._artifact_staging_service.register_job(
        self._job_id,
        {
            env_id: env.dependencies
            for (env_id,
                 env) in self._pipeline_proto.components.environments.items()
        })
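    # Keepalive settings: place no limit on pings without data, so that
    # long-running artifact uploads over an otherwise idle connection are
    # not dropped.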
    options = [("grpc.http2.max_pings_without_data", 0),
               ("grpc.http2.max_ping_strikes", 0)]
    self._artifact_staging_server = grpc.server(
        futures.ThreadPoolExecutor(), options=options)
    port = self._artifact_staging_server.add_insecure_port(
        '[::]:%s' % requested_port)
    beam_artifact_api_pb2_grpc.add_ArtifactStagingServiceServicer_to_server(
        self._artifact_staging_service, self._artifact_staging_server)
    self._artifact_staging_endpoint = endpoints_pb2.ApiServiceDescriptor(
        url='localhost:%d' % port)
    self._artifact_staging_server.start()
    _LOGGER.info('Artifact server started on port %s', port)
    return port

  def _stop_artifact_service(self):
    self._artifact_staging_server.stop(1)

    # Update dependencies to point to staged files.
    pipeline = copy.copy(self._pipeline_proto)
    if any(env.dependencies
           for env in pipeline.components.environments.values()):
      for env_id, deps in self._artifact_staging_service.resolved_deps(
          self._job_id).items():
        # Slice assignment is not supported for repeated proto fields.
        env = self._pipeline_proto.components.environments[env_id]
        del env.dependencies[:]
        env.dependencies.extend(deps)

    # Copy the pipeline definition and metadata into the jar.
    z = self._artifact_manager.zipfile_handle()
    with z.open(self.PIPELINE_PATH, 'w') as fout:
      fout.write(
          json_format.MessageToJson(self._pipeline_proto).encode('utf-8'))
    with z.open(self.PIPELINE_OPTIONS_PATH, 'w') as fout:
      fout.write(
          json_format.MessageToJson(self._pipeline_options).encode('utf-8'))
    with z.open(self.PIPELINE_MANIFEST, 'w') as fout:
      fout.write(
          json.dumps({
              'defaultJobName': self.PIPELINE_NAME
          }).encode('utf-8'))

    # Close the jar file.
    self._artifact_manager.close()

  def artifact_staging_endpoint(self):
    return self._artifact_staging_endpoint