github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/portability/local_job_service.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# pytype: skip-file

import concurrent.futures
import itertools
import logging
import os
import queue
import shutil
import subprocess
import tempfile
import threading
import time
import traceback
from typing import TYPE_CHECKING
from typing import List
from typing import Optional

import grpc
from google.protobuf import json_format
from google.protobuf import text_format  # type: ignore # not in typeshed

from apache_beam import pipeline
from apache_beam.metrics import monitoring_infos
from apache_beam.options import pipeline_options
from apache_beam.portability.api import beam_artifact_api_pb2_grpc
from apache_beam.portability.api import beam_fn_api_pb2_grpc
from apache_beam.portability.api import beam_job_api_pb2
from apache_beam.portability.api import beam_job_api_pb2_grpc
from apache_beam.portability.api import beam_provision_api_pb2
from apache_beam.portability.api import endpoints_pb2
from apache_beam.runners.job import utils as job_utils
from apache_beam.runners.portability import abstract_job_service
from apache_beam.runners.portability import artifact_service
from apache_beam.runners.portability import portable_runner
from apache_beam.runners.portability.fn_api_runner import fn_runner
from apache_beam.runners.portability.fn_api_runner import worker_handlers
from apache_beam.runners.worker.log_handler import LOGENTRY_TO_LOG_LEVEL_MAP
from apache_beam.utils import thread_pool_executor

if TYPE_CHECKING:
  from google.protobuf import struct_pb2  # pylint: disable=ungrouped-imports
  from apache_beam.portability.api import beam_runner_api_pb2

_LOGGER = logging.getLogger(__name__)


def _iter_queue(q):
  while True:
    yield q.get(block=True)


class LocalJobServicer(abstract_job_service.AbstractJobServiceServicer):
  """Manages one or more pipelines, possibly concurrently.
    Experimental: No backward compatibility guaranteed.
    Servicer for the Beam Job API.

    This JobService uses a basic local runner implementation to run the job.
    This JobService is not capable of managing jobs on remote clusters.

    By default, this JobService executes the job in process but still uses GRPC
    to communicate pipeline and worker state.  It can also be configured to use
    inline calls rather than GRPC (for speed) or launch completely separate
    subprocesses for the runner and worker(s).
    """
  def __init__(self, staging_dir=None, beam_job_type=None):
    super().__init__()
    self._cleanup_staging_dir = staging_dir is None
    self._staging_dir = staging_dir or tempfile.mkdtemp()
    self._artifact_service = artifact_service.ArtifactStagingService(
        artifact_service.BeamFilesystemHandler(self._staging_dir).file_writer)
    self._artifact_staging_endpoint = None  # type: Optional[endpoints_pb2.ApiServiceDescriptor]
    self._beam_job_type = beam_job_type or BeamJob

  def create_beam_job(self,
                      preparation_id,  # type: str
                      job_name,  # type: str
                      pipeline,  # type: beam_runner_api_pb2.Pipeline
                      options  # type: struct_pb2.Struct
                     ):
    # type: (...) -> BeamJob
    self._artifact_service.register_job(
        staging_token=preparation_id,
        dependency_sets={
            id: env.dependencies
            for (id, env) in pipeline.components.environments.items()
        })
    provision_info = fn_runner.ExtendedProvisionInfo(
        beam_provision_api_pb2.ProvisionInfo(pipeline_options=options),
        self._staging_dir,
        job_name=job_name)
    return self._beam_job_type(
        preparation_id,
        pipeline,
        options,
        provision_info,
        self._artifact_staging_endpoint,
        self._artifact_service)

  def get_bind_address(self):
    """Return the address used to open the port on the gRPC server.

    This is often, but not always, the same as the service address.  For
    example, to make the service accessible to external machines, override this
    to return '[::]' and override `get_service_address()` to return a publicly
    accessible host name.
    """
    return self.get_service_address()

  def get_service_address(self):
    """Return the host name at which this server will be accessible.

    In particular, this is provided to the client upon connection as the
    artifact staging endpoint.
    """
    return 'localhost'

  def start_grpc_server(self, port=0):
    options = [("grpc.max_receive_message_length", -1),
               ("grpc.max_send_message_length", -1),
               ("grpc.http2.max_pings_without_data", 0),
               ("grpc.http2.max_ping_strikes", 0)]
    self._server = grpc.server(
        thread_pool_executor.shared_unbounded_instance(), options=options)
    port = self._server.add_insecure_port(
        '%s:%d' % (self.get_bind_address(), port))
    beam_job_api_pb2_grpc.add_JobServiceServicer_to_server(self, self._server)
    beam_artifact_api_pb2_grpc.add_ArtifactStagingServiceServicer_to_server(
        self._artifact_service, self._server)
    hostname = self.get_service_address()
    self._artifact_staging_endpoint = endpoints_pb2.ApiServiceDescriptor(
        url='%s:%d' % (hostname, port))
    self._server.start()
    _LOGGER.info('Grpc server started at %s on port %d', hostname, port)
    return port

  def stop(self, timeout=1):
    self._server.stop(timeout)
    if os.path.exists(self._staging_dir) and self._cleanup_staging_dir:
      shutil.rmtree(self._staging_dir, ignore_errors=True)

  def GetJobMetrics(self, request, context=None):
    if request.job_id not in self._jobs:
      raise LookupError("Job {} does not exist".format(request.job_id))

    result = self._jobs[request.job_id].result
    if result is None:
      monitoring_info_list = []
    else:
      monitoring_info_list = result.monitoring_infos()

    # Filter out system metrics
    user_monitoring_info_list = [
        x for x in monitoring_info_list
        if monitoring_infos.is_user_monitoring_info(x)
    ]

    return beam_job_api_pb2.GetJobMetricsResponse(
        metrics=beam_job_api_pb2.MetricResults(
            committed=user_monitoring_info_list))

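
# The two definitions below are illustrative sketches added for this writeup;
# they are not part of the original module. The subclass shows the override
# described in get_bind_address's docstring, and the helper shows how the
# servicer might be started in process; the host name used is an assumption.
class _ExampleExternallyVisibleServicer(LocalJobServicer):
  """Hypothetical servicer that listens on all interfaces."""
  def get_bind_address(self):
    return '[::]'  # Bind on every interface, not just localhost.

  def get_service_address(self):
    return 'job-service.example.com'  # Assumed publicly resolvable host name.


def _example_start_local_job_service():
  # Start the in-process job service; a portable pipeline could then be
  # submitted with --runner=PortableRunner --job_endpoint=localhost:<port>.
  servicer = LocalJobServicer()
  port = servicer.start_grpc_server()
  return servicer, port
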

class SubprocessSdkWorker(object):
  """Manages an SDK worker implemented as a subprocess communicating over gRPC.
  """
  def __init__(
      self,
      worker_command_line,  # type: bytes
      control_address,
      provision_info,
      worker_id=None):
    # worker_command_line is received from gRPC as bytes; it was encoded earlier
    # in apache_beam.transforms.environments.SubprocessSDKEnvironment. Decode it
    # back to str, since subprocess.Popen does not support bytes args on win32.
    self._worker_command_line = worker_command_line.decode('utf-8')
    self._control_address = control_address
    self._provision_info = provision_info
    self._worker_id = worker_id

  def run(self):
    options = [("grpc.http2.max_pings_without_data", 0),
               ("grpc.http2.max_ping_strikes", 0)]
    logging_server = grpc.server(
        thread_pool_executor.shared_unbounded_instance(), options=options)
    logging_port = logging_server.add_insecure_port('[::]:0')
    logging_server.start()
    logging_servicer = BeamFnLoggingServicer()
    beam_fn_api_pb2_grpc.add_BeamFnLoggingServicer_to_server(
        logging_servicer, logging_server)
    logging_descriptor = text_format.MessageToString(
        endpoints_pb2.ApiServiceDescriptor(url='localhost:%s' % logging_port))

    control_descriptor = text_format.MessageToString(
        endpoints_pb2.ApiServiceDescriptor(url=self._control_address))
    pipeline_options = json_format.MessageToJson(
        self._provision_info.provision_info.pipeline_options)

    env_dict = dict(
        os.environ,
        CONTROL_API_SERVICE_DESCRIPTOR=control_descriptor,
        LOGGING_API_SERVICE_DESCRIPTOR=logging_descriptor,
        PIPELINE_OPTIONS=pipeline_options)
    # Only add worker_id when it is set.
    if self._worker_id:
      env_dict['WORKER_ID'] = self._worker_id

    with worker_handlers.SUBPROCESS_LOCK:
      p = subprocess.Popen(self._worker_command_line, shell=True, env=env_dict)
    try:
      p.wait()
      if p.returncode:
        raise RuntimeError(
            'Worker subprocess exited with return code %s' % p.returncode)
    finally:
      if p.poll() is None:
        p.kill()
      logging_server.stop(0)

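
# Hypothetical usage sketch (not part of the original module): launching an SDK
# harness subprocess against a known control endpoint. The command line below
# is illustrative only; real command lines come from a SubprocessSDKEnvironment.
def _example_run_subprocess_worker(control_address):
  provision_info = fn_runner.ExtendedProvisionInfo(
      beam_provision_api_pb2.ProvisionInfo())
  worker = SubprocessSdkWorker(
      b'python -m apache_beam.runners.worker.sdk_worker_main',
      control_address,
      provision_info)
  worker.run()  # Blocks until the worker subprocess exits.
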

class BeamJob(abstract_job_service.AbstractBeamJob):
  """This class handles running and managing a single pipeline.

    The current state of the pipeline is available as self.state.
    """

  def __init__(self,
               job_id,   # type: str
               pipeline,
               options,
               provision_info,  # type: fn_runner.ExtendedProvisionInfo
               artifact_staging_endpoint,  # type: Optional[endpoints_pb2.ApiServiceDescriptor]
               artifact_service,  # type: artifact_service.ArtifactStagingService
              ):
    super().__init__(job_id, provision_info.job_name, pipeline, options)
    self._provision_info = provision_info
    self._artifact_staging_endpoint = artifact_staging_endpoint
    self._artifact_service = artifact_service
    self._state_queues = []  # type: List[queue.Queue]
    self._log_queues = JobLogQueues()
    self.daemon = True
    self.result = None

  def pipeline_options(self):
    def from_urn(key):
      assert key.startswith('beam:option:')
      assert key.endswith(':v1')
      return key[12:-3]

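    # For example, the URN 'beam:option:streaming:v1' maps back to the
    # 'streaming' pipeline option before being passed to PipelineOptions.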
    return pipeline_options.PipelineOptions(
        **{
            from_urn(key): value
            for (key, value
                 ) in job_utils.struct_to_dict(self._pipeline_options).items()
        })

  def set_state(self, new_state):
    """Set the latest state as an int enum and notify consumers"""
    timestamp = super().set_state(new_state)
    if timestamp is not None:
      # Inform consumers of the new state.
      for queue in self._state_queues:
        queue.put((new_state, timestamp))

  def prepare(self):
    pass

  def artifact_staging_endpoint(self):
    return self._artifact_staging_endpoint

  def run(self):
    self.set_state(beam_job_api_pb2.JobState.STARTING)
    self._run_thread = threading.Thread(target=self._run_job)
    self._run_thread.start()

  def _run_job(self):
    with JobLogHandler(self._log_queues) as log_handler:
      self._update_dependencies()
      pipeline.Pipeline.merge_compatible_environments(self._pipeline_proto)
      try:
        start = time.time()
        self.result = self._invoke_runner()
        self.result.wait_until_finish()
        _LOGGER.info(
            'Completed job in %s seconds with state %s.',
            time.time() - start,
            self.result.state)
        self.set_state(
            portable_runner.PipelineResult.pipeline_state_to_runner_api_state(
                self.result.state))
      except:  # pylint: disable=bare-except
        self._log_queues.put(
            beam_job_api_pb2.JobMessage(
                message_id=log_handler._next_id(),
                time=time.strftime('%Y-%m-%d %H:%M:%S.'),
                importance=beam_job_api_pb2.JobMessage.JOB_MESSAGE_ERROR,
                message_text=traceback.format_exc()))
        _LOGGER.exception('Error running pipeline.')
        self.set_state(beam_job_api_pb2.JobState.FAILED)
        raise

  def _invoke_runner(self):
    self.set_state(beam_job_api_pb2.JobState.RUNNING)
    return fn_runner.FnApiRunner(
        provision_info=self._provision_info).run_via_runner_api(
            self._pipeline_proto, self.pipeline_options())

  def _update_dependencies(self):
    try:
      for env_id, deps in self._artifact_service.resolved_deps(
          self._job_id, timeout=0).items():
        # Slice assignment not supported for repeated fields.
        env = self._pipeline_proto.components.environments[env_id]
        del env.dependencies[:]
        env.dependencies.extend(deps)
      self._provision_info.provision_info.ClearField('retrieval_token')
    except concurrent.futures.TimeoutError:
      # TODO(https://github.com/apache/beam/issues/20267): Require this once
      # all SDKs support it.
      pass

  def cancel(self):
    if not self.is_terminal_state(self.state):
      self.set_state(beam_job_api_pb2.JobState.CANCELLING)
      # TODO(robertwb): Actually cancel...
      self.set_state(beam_job_api_pb2.JobState.CANCELLED)

  def get_state_stream(self):
    # Register for any new state changes.
    state_queue = queue.Queue()
    self._state_queues.append(state_queue)

    for state, timestamp in self.with_state_history(_iter_queue(state_queue)):
      yield state, timestamp
      if self.is_terminal_state(state):
        break

  def get_message_stream(self):
    # Register for any new messages.
    log_queue = queue.Queue()
    self._log_queues.append(log_queue)
    self._state_queues.append(log_queue)

    for msg in itertools.chain(self._log_queues.cache(),
                               self.with_state_history(_iter_queue(log_queue))):
      if isinstance(msg, tuple):
        assert len(msg) == 2 and isinstance(msg[0], int)
        current_state = msg[0]
        yield msg
        if self.is_terminal_state(current_state):
          break
      else:
        yield msg

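
# Hypothetical sketch (not part of the original module) of how a caller drives
# a BeamJob created by LocalJobServicer.create_beam_job: the pipeline runs on a
# background thread while states are observed via get_state_stream().
def _example_drive_beam_job(job):
  job.run()
  for state, _timestamp in job.get_state_stream():
    if job.is_terminal_state(state):
      return state
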

class BeamFnLoggingServicer(beam_fn_api_pb2_grpc.BeamFnLoggingServicer):
  def Logging(self, log_bundles, context=None):
    for log_bundle in log_bundles:
      for log_entry in log_bundle.log_entries:
        _LOGGER.log(
            LOGENTRY_TO_LOG_LEVEL_MAP[log_entry.severity],
            'Worker: %s',
            str(log_entry).replace('\n', ' '))
    return iter([])


class JobLogQueues(object):
  def __init__(self):
    self._queues = []  # type: List[queue.Queue]
    self._cache = []
    self._cache_size = 10
    self._lock = threading.Lock()

  def cache(self):
    with self._lock:
      return list(self._cache)

  def append(self, queue):
    with self._lock:
      self._queues.append(queue)

  def put(self, msg):
    with self._lock:
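      # Keep a bounded cache of recent messages: once the cache is full, a new
      # message only enters if it is at least as important as the least
      # important cached message, evicting the oldest such entry.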
      if len(self._cache) < self._cache_size:
        self._cache.append(msg)
      else:
        min_level = min(m.importance for m in self._cache)
        if msg.importance >= min_level:
          self._cache.append(msg)
          for ix, m in enumerate(self._cache):
            if m.importance == min_level:
              del self._cache[ix]
              break

      for queue in self._queues:
        queue.put(msg)


class JobLogHandler(logging.Handler):
  """Captures logs to be returned via the Beam Job API.

    Enabled via the with statement."""

  # Mapping from Python logging levels to JobMessage importance levels.
  LOG_LEVEL_MAP = {
      logging.FATAL: beam_job_api_pb2.JobMessage.JOB_MESSAGE_ERROR,
      logging.CRITICAL: beam_job_api_pb2.JobMessage.JOB_MESSAGE_ERROR,
      logging.ERROR: beam_job_api_pb2.JobMessage.JOB_MESSAGE_ERROR,
      logging.WARNING: beam_job_api_pb2.JobMessage.JOB_MESSAGE_WARNING,
      logging.INFO: beam_job_api_pb2.JobMessage.JOB_MESSAGE_BASIC,
      logging.DEBUG: beam_job_api_pb2.JobMessage.JOB_MESSAGE_DEBUG,
  }

  def __init__(self, log_queues):
    super().__init__()
    self._last_id = 0
    self._logged_thread = None
    self._log_queues = log_queues

  def __enter__(self):
    # Remember the current thread to demultiplex the logs of concurrently
    # running pipelines (as Python log handlers are global).
    self._logged_thread = threading.current_thread()
    logging.getLogger().addHandler(self)
    return self

  def __exit__(self, *args):
    self._logged_thread = None
    self.close()

  def _next_id(self):
    self._last_id += 1
    return str(self._last_id)

  def emit(self, record):
    if self._logged_thread is threading.current_thread():
      msg = beam_job_api_pb2.JobMessage(
          message_id=self._next_id(),
          time=time.strftime(
              '%Y-%m-%d %H:%M:%S.', time.localtime(record.created)),
          importance=self.LOG_LEVEL_MAP[record.levelno],
          message_text=self.format(record))

      # Inform all message consumers.
      self._log_queues.put(msg)
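

# A minimal usage sketch of JobLogHandler (not part of the original module):
# while the handler is installed, log records emitted on the registering thread
# are converted to JobMessage protos and fanned out to the registered queues.
def _example_capture_logs():
  queues = JobLogQueues()
  consumer = queue.Queue()
  queues.append(consumer)
  with JobLogHandler(queues):
    logging.getLogger(__name__).warning('surfaced through the Job API')
  return consumer.get(block=False)  # A beam_job_api_pb2.JobMessage.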