github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/portability/portable_runner.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# pytype: skip-file
# mypy: check-untyped-defs

import atexit
import functools
import itertools
import logging
import threading
import time
from typing import TYPE_CHECKING
from typing import Any
from typing import Dict
from typing import Iterator
from typing import Optional
from typing import Tuple

import grpc

from apache_beam.metrics import metric
from apache_beam.metrics.execution import MetricResult
from apache_beam.options.pipeline_options import DebugOptions
from apache_beam.options.pipeline_options import PortableOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import TypeOptions
from apache_beam.options.value_provider import ValueProvider
from apache_beam.portability import common_urns
from apache_beam.portability.api import beam_artifact_api_pb2_grpc
from apache_beam.portability.api import beam_job_api_pb2
from apache_beam.runners import runner
from apache_beam.runners.common import group_by_key_input_visitor
from apache_beam.runners.job import utils as job_utils
from apache_beam.runners.portability import artifact_service
from apache_beam.runners.portability import job_server
from apache_beam.runners.portability import portable_metrics
from apache_beam.runners.portability.fn_api_runner import translations
from apache_beam.runners.worker import sdk_worker_main
from apache_beam.runners.worker import worker_pool_main
from apache_beam.transforms import environments

if TYPE_CHECKING:
  from google.protobuf import struct_pb2  # pylint: disable=ungrouped-imports
  from apache_beam.options.pipeline_options import PipelineOptions
  from apache_beam.pipeline import Pipeline
  from apache_beam.portability.api import beam_runner_api_pb2

__all__ = ['PortableRunner']

MESSAGE_LOG_LEVELS = {
    beam_job_api_pb2.JobMessage.MESSAGE_IMPORTANCE_UNSPECIFIED: logging.INFO,
    beam_job_api_pb2.JobMessage.JOB_MESSAGE_DEBUG: logging.DEBUG,
    beam_job_api_pb2.JobMessage.JOB_MESSAGE_DETAILED: logging.DEBUG,
    beam_job_api_pb2.JobMessage.JOB_MESSAGE_BASIC: logging.INFO,
    beam_job_api_pb2.JobMessage.JOB_MESSAGE_WARNING: logging.WARNING,
    beam_job_api_pb2.JobMessage.JOB_MESSAGE_ERROR: logging.ERROR,
}

TERMINAL_STATES = [
    beam_job_api_pb2.JobState.DONE,
    beam_job_api_pb2.JobState.DRAINED,
    beam_job_api_pb2.JobState.FAILED,
    beam_job_api_pb2.JobState.CANCELLED,
]

ENV_TYPE_ALIASES = {'LOOPBACK': 'EXTERNAL'}

_LOGGER = logging.getLogger(__name__)


class JobServiceHandle(object):
  """
  Encapsulates the interactions necessary to submit a pipeline to a job
  service.

  The base set of interactions consists of three steps (sketched below):
  - prepare
  - stage
  - run
  """
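
  # A minimal usage sketch, assuming `job_service_stub` is a connected
  # JobService gRPC stub and `pipeline_proto` is a beam_runner_api_pb2.Pipeline
  # (both hypothetical names, for illustration only):
  #
  #   handle = JobServiceHandle(job_service_stub, pipeline_options)
  #   job_id, message_stream, state_stream = handle.submit(pipeline_proto)
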
  def __init__(self, job_service, options, retain_unknown_options=False):
    self.job_service = job_service
    self.options = options
    self.timeout = options.view_as(PortableOptions).job_server_timeout
    self.artifact_endpoint = options.view_as(PortableOptions).artifact_endpoint
    self._retain_unknown_options = retain_unknown_options

  def submit(self, proto_pipeline):
    # type: (beam_runner_api_pb2.Pipeline) -> Tuple[str, Iterator[beam_job_api_pb2.JobMessagesResponse], Iterator[beam_job_api_pb2.JobStateEvent]]

    """
    Submit and run the pipeline defined by `proto_pipeline`.
    """
    prepare_response = self.prepare(proto_pipeline)
    artifact_endpoint = (
        self.artifact_endpoint or
        prepare_response.artifact_staging_endpoint.url)
    self.stage(
        proto_pipeline,
        artifact_endpoint,
        prepare_response.staging_session_token)
    return self.run(prepare_response.preparation_id)

  def get_pipeline_options(self):
    # type: () -> struct_pb2.Struct

    """
    Get `self.options` as a protobuf Struct.
    """

    # Fetch runner options from the job service, retrying in case the
    # channel is not yet ready.
    def send_options_request(max_retries=5):
      num_retries = 0
      while True:
        try:
          # The channel may report READY even though connections can still
          # fail; this seems to be an issue only on Mac with port forwarding.
          return self.job_service.DescribePipelineOptions(
              beam_job_api_pb2.DescribePipelineOptionsRequest(),
              timeout=self.timeout)
        except grpc.FutureTimeoutError:
          # Do not retry timeout errors.
          raise
        except grpc.RpcError as e:
          num_retries += 1
          if num_retries > max_retries:
            raise e
          time.sleep(1)

    options_response = send_options_request()

    def add_runner_options(parser):
      for option in options_response.options:
        try:
          # No default values here - we don't want runner options added
          # unless they were specified by the user.
          add_arg_args = {'action': 'store', 'help': option.description}
          if option.type == beam_job_api_pb2.PipelineOptionType.BOOLEAN:
            add_arg_args['action'] = 'store_true' \
              if option.default_value != 'true' else 'store_false'
          elif option.type == beam_job_api_pb2.PipelineOptionType.INTEGER:
            add_arg_args['type'] = int
          elif option.type == beam_job_api_pb2.PipelineOptionType.ARRAY:
            add_arg_args['action'] = 'append'
          parser.add_argument("--%s" % option.name, **add_arg_args)
        except Exception as e:
          # Ignore runner options that are already present; this is the only
          # case in which a duplicate is not treated as an error.
          if 'conflicting option string' not in str(e):
            raise
          _LOGGER.debug("Runner option '%s' was already added", option.name)

    all_options = self.options.get_all_options(
        add_extra_args_fn=add_runner_options,
        retain_unknown_options=self._retain_unknown_options)

    return self.encode_pipeline_options(all_options)
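
  # For example, a runner-reported BOOLEAN option named 'fictional_flag'
  # (hypothetical) whose default_value is 'false' is registered roughly as
  #
  #   parser.add_argument('--fictional_flag', action='store_true', help=...)
  #
  # so that it only appears in the resulting options when the user passes it.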
  @staticmethod
  def encode_pipeline_options(
      all_options: Dict[str, Any]) -> 'struct_pb2.Struct':
    def convert_pipeline_option_value(v):
      # Convert int values to strings: BEAM-5509. The exact type check
      # (rather than isinstance) deliberately leaves bools untouched.
      if type(v) == int:
        return str(v)
      elif isinstance(v, ValueProvider):
        return convert_pipeline_option_value(
            v.get()) if v.is_accessible() else None
      return v

    # TODO: Define URNs for options.
    p_options = {
        'beam:option:' + k + ':v1': convert_pipeline_option_value(v)
        for k, v in all_options.items() if v is not None
    }
    return job_utils.dict_to_struct(p_options)
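
  # For illustration, given all_options of
  #   {'job_name': 'wordcount', 'num_workers': 2, 'streaming': None}
  # (hypothetical values), the resulting Struct maps
  #   'beam:option:job_name:v1' -> 'wordcount'
  #   'beam:option:num_workers:v1' -> '2'
  # and drops the None-valued 'streaming' option entirely.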
  def prepare(self, proto_pipeline):
    # type: (beam_runner_api_pb2.Pipeline) -> beam_job_api_pb2.PrepareJobResponse

    """Prepare the job on the job service"""
    return self.job_service.Prepare(
        beam_job_api_pb2.PrepareJobRequest(
            job_name='job',
            pipeline=proto_pipeline,
            pipeline_options=self.get_pipeline_options()),
        timeout=self.timeout)

  def stage(self,
            proto_pipeline,  # type: beam_runner_api_pb2.Pipeline
            artifact_staging_endpoint,
            staging_session_token
           ):
    # type: (...) -> None

    """Stage artifacts"""
    if artifact_staging_endpoint:
      artifact_service.offer_artifacts(
          beam_artifact_api_pb2_grpc.ArtifactStagingServiceStub(
              channel=grpc.insecure_channel(artifact_staging_endpoint)),
          artifact_service.ArtifactRetrievalService(
              artifact_service.BeamFilesystemHandler(None).file_reader),
          staging_session_token)

  def run(self, preparation_id):
    # type: (str) -> Tuple[str, Iterator[beam_job_api_pb2.JobMessagesResponse], Iterator[beam_job_api_pb2.JobStateEvent]]

    """Run the job"""
    try:
      state_stream = self.job_service.GetStateStream(
          beam_job_api_pb2.GetJobStateRequest(job_id=preparation_id),
          timeout=self.timeout)
      # If there's an error, we don't always get it until we try to read.
      # Fortunately, there's always an immediate current state published.
      state_stream = itertools.chain([next(state_stream)], state_stream)
      message_stream = self.job_service.GetMessageStream(
          beam_job_api_pb2.JobMessagesRequest(job_id=preparation_id),
          timeout=self.timeout)
    except Exception:
      # TODO(https://github.com/apache/beam/issues/19284): Unify preparation_id
      # and job_id for all runners.
      state_stream = message_stream = None

    # Run the job and wait for a result. We don't set a timeout here because
    # it may take a long time for a job to complete, and streaming jobs
    # currently never return a response.
    run_response = self.job_service.Run(
        beam_job_api_pb2.RunJobRequest(preparation_id=preparation_id))

    if state_stream is None:
      state_stream = self.job_service.GetStateStream(
          beam_job_api_pb2.GetJobStateRequest(job_id=run_response.job_id))
      message_stream = self.job_service.GetMessageStream(
          beam_job_api_pb2.JobMessagesRequest(job_id=run_response.job_id))

    return run_response.job_id, message_stream, state_stream
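
  # For reference, `submit` above is roughly equivalent to this manual
  # sequence (a sketch; error handling omitted):
  #
  #   prepare_response = handle.prepare(pipeline_proto)
  #   handle.stage(
  #       pipeline_proto,
  #       handle.artifact_endpoint
  #       or prepare_response.artifact_staging_endpoint.url,
  #       prepare_response.staging_session_token)
  #   job_id, messages, states = handle.run(prepare_response.preparation_id)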


class PortableRunner(runner.PipelineRunner):
  """
  Experimental: No backward compatibility guaranteed.
  A BeamRunner that executes Python pipelines via the Beam Job API.

  This runner is a stub and does not run the actual job. Instead, it
  schedules the job on a job service; the responsibility for running and
  managing the job lies with the job service used.
  """
  def __init__(self):
    self._dockerized_job_server = None  # type: Optional[job_server.JobServer]

  @staticmethod
  def _create_environment(options):
    # type: (PipelineOptions) -> environments.Environment
    portable_options = options.view_as(PortableOptions)
    # Do not set a Runner. Otherwise this can cause problems in Java's
    # PipelineOptions, i.e. ClassNotFoundException, if the corresponding Runner
    # does not exist in the Java SDK. In portability, the entry point is
    # clearly defined via the JobService.
    portable_options.view_as(StandardOptions).runner = None
    environment_type = portable_options.environment_type
    if not environment_type:
      environment_urn = common_urns.environments.DOCKER.urn
    elif environment_type.startswith('beam:env:'):
      environment_urn = environment_type
    else:
      # Handle aliases, e.g. LOOPBACK -> EXTERNAL.
      environment_type = ENV_TYPE_ALIASES.get(
          environment_type, environment_type)
      try:
        environment_urn = getattr(
            common_urns.environments, environment_type).urn
      except AttributeError:
        raise ValueError('Unknown environment type: %s' % environment_type)

    env_class = environments.Environment.get_env_cls_from_urn(environment_urn)
    return env_class.from_options(portable_options)
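
  # Resolution examples (for illustration): an unset environment_type defaults
  # to the Docker environment URN ('beam:env:docker:v1'); 'LOOPBACK' is
  # aliased to 'EXTERNAL' via ENV_TYPE_ALIASES; and a value already in URN
  # form ('beam:env:...') is passed through unchanged.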
  def default_job_server(self, options):
    raise NotImplementedError(
        'You must specify a --job_endpoint when using --runner=PortableRunner. '
        'Alternatively, you may specify which portable runner you intend to '
        'use, such as --runner=FlinkRunner or --runner=SparkRunner.')

  def create_job_service_handle(self, job_service, options):
    # type: (...) -> JobServiceHandle
    return JobServiceHandle(job_service, options)

  def create_job_service(self, options):
    # type: (PipelineOptions) -> JobServiceHandle

    """
    Start the job service and return a `JobServiceHandle`.
    """
    job_endpoint = options.view_as(PortableOptions).job_endpoint
    if job_endpoint:
      if job_endpoint == 'embed':
        server = job_server.EmbeddedJobServer()  # type: job_server.JobServer
      else:
        job_server_timeout = options.view_as(PortableOptions).job_server_timeout
        server = job_server.ExternalJobServer(job_endpoint, job_server_timeout)
    else:
      server = self.default_job_server(options)
    return self.create_job_service_handle(server.start(), options)
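
  # For example (hypothetical endpoint): passing --job_endpoint=localhost:8099
  # connects to an already-running external job server, while
  # --job_endpoint=embed starts an in-process EmbeddedJobServer.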
  @staticmethod
  def get_proto_pipeline(pipeline, options):
    # type: (Pipeline, PipelineOptions) -> beam_runner_api_pb2.Pipeline
    portable_options = options.view_as(PortableOptions)

    proto_pipeline = pipeline.to_runner_api(
        default_environment=PortableRunner._create_environment(
            portable_options))

    # TODO: https://github.com/apache/beam/issues/19493
    # Eventually remove the 'pre_optimize' option altogether and only perform
    # the equivalent of the 'default' case below (minus the 'lift_combiners'
    # part).
    pre_optimize = options.view_as(DebugOptions).lookup_experiment(
        'pre_optimize', 'default').lower()
    if (not options.view_as(StandardOptions).streaming and
        pre_optimize != 'none'):
      if pre_optimize == 'default':
        phases = [
            # TODO: https://github.com/apache/beam/issues/18584
            #       https://github.com/apache/beam/issues/18586
            # Eventually remove the 'lift_combiners' phase from 'default'.
            translations.pack_combiners,
            translations.lift_combiners,
            translations.sort_stages
        ]
        partial = True
      elif pre_optimize == 'all':
        phases = [
            translations.annotate_downstream_side_inputs,
            translations.annotate_stateful_dofns_as_roots,
            translations.fix_side_input_pcoll_coders,
            translations.pack_combiners,
            translations.lift_combiners,
            translations.expand_sdf,
            translations.fix_flatten_coders,
            # translations.sink_flattens,
            translations.greedily_fuse,
            translations.read_to_impulse,
            translations.extract_impulse_stages,
            translations.remove_data_plane_ops,
            translations.sort_stages
        ]
        partial = False
      elif pre_optimize == 'all_except_fusion':
        # TODO(https://github.com/apache/beam/issues/19422): Delete this branch
        # after PortableRunner supports beam:runner:executable_stage:v1.
        phases = [
            translations.annotate_downstream_side_inputs,
            translations.annotate_stateful_dofns_as_roots,
            translations.fix_side_input_pcoll_coders,
            translations.pack_combiners,
            translations.lift_combiners,
            translations.expand_sdf,
            translations.fix_flatten_coders,
            # translations.sink_flattens,
            # translations.greedily_fuse,
            translations.read_to_impulse,
            translations.extract_impulse_stages,
            translations.remove_data_plane_ops,
            translations.sort_stages
        ]
        partial = True
      else:
        phases = []
        for phase_name in pre_optimize.split(','):
          # For now, these are the only phases we allow.
          if phase_name in ('pack_combiners', 'lift_combiners'):
            phases.append(getattr(translations, phase_name))
          else:
            raise ValueError(
                'Unknown or inapplicable phase for pre_optimize: %s' %
                phase_name)
        phases.append(translations.sort_stages)
        partial = True

      # All (known) portable runners (i.e. Flink and Spark) support these URNs.
      known_urns = frozenset([
          common_urns.composites.RESHUFFLE.urn,
          common_urns.primitives.IMPULSE.urn,
          common_urns.primitives.FLATTEN.urn,
          common_urns.primitives.GROUP_BY_KEY.urn
      ])
      proto_pipeline = translations.optimize_pipeline(
          proto_pipeline,
          phases=phases,
          known_runner_urns=known_urns,
          partial=partial)

    return proto_pipeline
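
  # The experiment can be supplied on the command line, e.g. (illustrative):
  #
  #   --experiments=pre_optimize=pack_combiners,lift_combiners
  #
  # where 'default', 'all', 'all_except_fusion' and 'none' select the fixed
  # phase lists above, and a comma-separated list selects individual phases.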
  def run_pipeline(self, pipeline, options):
    # type: (Pipeline, PipelineOptions) -> PipelineResult
    portable_options = options.view_as(PortableOptions)

    # TODO: https://github.com/apache/beam/issues/19168
    # Portable-runner-specific default.
    if options.view_as(SetupOptions).sdk_location == 'default':
      options.view_as(SetupOptions).sdk_location = 'container'

    experiments = options.view_as(DebugOptions).experiments or []

    # Start a worker server if one is requested but none is provided.
    if portable_options.environment_type == 'LOOPBACK':
      use_loopback_process_worker = options.view_as(
          DebugOptions).lookup_experiment('use_loopback_process_worker', False)
      portable_options.environment_config, server = (
          worker_pool_main.BeamFnExternalWorkerPoolServicer.start(
              state_cache_size=
              sdk_worker_main._get_state_cache_size(experiments),
              data_buffer_time_limit_ms=
              sdk_worker_main._get_data_buffer_time_limit_ms(experiments),
              use_process=use_loopback_process_worker))
      cleanup_callbacks = [functools.partial(server.stop, 1)]
    else:
      cleanup_callbacks = []

    pipeline.visit(
        group_by_key_input_visitor(
            not options.view_as(TypeOptions).allow_non_deterministic_key_coders)
    )

    proto_pipeline = self.get_proto_pipeline(pipeline, options)
    job_service_handle = self.create_job_service(options)
    job_id, message_stream, state_stream = (
        job_service_handle.submit(proto_pipeline))

    result = PipelineResult(
        job_service_handle.job_service,
        job_id,
        message_stream,
        state_stream,
        cleanup_callbacks)
    if cleanup_callbacks:
      # Register an exit handler to ensure cleanup on exit.
      atexit.register(functools.partial(result._cleanup, on_exit=True))
      _LOGGER.info(
          'Environment "%s" has started a component necessary for the '
          'execution. Be sure to run the pipeline using\n'
          '  with Pipeline() as p:\n'
          '    p.apply(..)\n'
          'This ensures that the pipeline finishes before this program exits.',
          portable_options.environment_type)
    return result
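
  # Typical use goes through Pipeline rather than calling run_pipeline
  # directly. A minimal sketch, assuming a job service at the hypothetical
  # endpoint localhost:8099:
  #
  #   import apache_beam as beam
  #   from apache_beam.options.pipeline_options import PipelineOptions
  #
  #   options = PipelineOptions([
  #       '--runner=PortableRunner',
  #       '--job_endpoint=localhost:8099',
  #       '--environment_type=LOOPBACK',
  #   ])
  #   with beam.Pipeline(options=options) as p:
  #     p | beam.Impulse()
  #
  # Exiting the `with` block calls wait_until_finish, so the pipeline
  # completes before the program exits.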


class PortableMetrics(metric.MetricResults):
  def __init__(self, job_metrics_response):
    metrics = job_metrics_response.metrics
    # Both attempted and committed are (counters, distributions, gauges)
    # tuples of dicts keyed by metric key.
    self.attempted = portable_metrics.from_monitoring_infos(metrics.attempted)
    self.committed = portable_metrics.from_monitoring_infos(metrics.committed)

  @staticmethod
  def _combine(committed, attempted, filter):
    all_keys = set(committed.keys()) | set(attempted.keys())
    return [
        MetricResult(key, committed.get(key), attempted.get(key))
        for key in all_keys if metric.MetricResults.matches(filter, key)
    ]

  def query(self, filter=None):
    counters, distributions, gauges = [
        self._combine(x, y, filter)
        for x, y in zip(self.committed, self.attempted)
    ]

    return {
        self.COUNTERS: counters,
        self.DISTRIBUTIONS: distributions,
        self.GAUGES: gauges
    }
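
  # Example query (illustrative):
  #
  #   from apache_beam.metrics.metric import MetricsFilter
  #   results = pipeline_result.metrics().query(
  #       MetricsFilter().with_name('my_counter'))
  #   for counter_result in results[PortableMetrics.COUNTERS]:
  #     print(counter_result.key, counter_result.committed)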


class PipelineResult(runner.PipelineResult):
  def __init__(
      self,
      job_service,
      job_id,
      message_stream,
      state_stream,
      cleanup_callbacks=()):
    super().__init__(beam_job_api_pb2.JobState.UNSPECIFIED)
    self._job_service = job_service
    self._job_id = job_id
    self._messages = []
    self._message_stream = message_stream
    self._state_stream = state_stream
    self._cleanup_callbacks = cleanup_callbacks
    self._metrics = None
    self._runtime_exception = None

  def cancel(self):
    # type: () -> None
    try:
      self._job_service.Cancel(
          beam_job_api_pb2.CancelJobRequest(job_id=self._job_id))
    finally:
      self._cleanup()

  @property
  def state(self):
    runner_api_state = self._job_service.GetState(
        beam_job_api_pb2.GetJobStateRequest(job_id=self._job_id)).state
    self._state = self.runner_api_state_to_pipeline_state(runner_api_state)
    return self._state

  @staticmethod
  def runner_api_state_to_pipeline_state(runner_api_state):
    return getattr(
        runner.PipelineState,
        beam_job_api_pb2.JobState.Enum.Name(runner_api_state))

  @staticmethod
  def pipeline_state_to_runner_api_state(pipeline_state):
    if pipeline_state == runner.PipelineState.PENDING:
      return beam_job_api_pb2.JobState.STARTING
    else:
      try:
        return beam_job_api_pb2.JobState.Enum.Value(pipeline_state)
      except ValueError:
        return beam_job_api_pb2.JobState.UNSPECIFIED

  def metrics(self):
    if not self._metrics:
      job_metrics_response = self._job_service.GetJobMetrics(
          beam_job_api_pb2.GetJobMetricsRequest(job_id=self._job_id))
      self._metrics = PortableMetrics(job_metrics_response)
    return self._metrics

  def _last_error_message(self):
    # type: () -> str
    # Keep only the messages that have a 'message_response' field, then
    # filter those down to error messages.
    messages = [
        m.message_response for m in self._messages
        if m.HasField('message_response')
    ]
    error_messages = [
        m for m in messages
        if m.importance == beam_job_api_pb2.JobMessage.JOB_MESSAGE_ERROR
    ]
    if error_messages:
      return error_messages[-1].message_text
    else:
      return 'unknown error'

  def wait_until_finish(self, duration=None):
    """
    :param duration: The maximum time in milliseconds to wait for the result of
    the execution. If None or zero, will wait until the pipeline finishes.
    :return: The result of the pipeline, i.e. PipelineResult.
    """
    def read_messages():
      # type: () -> None
      previous_state = -1
      for message in self._message_stream:
        if message.HasField('message_response'):
          logging.log(
              MESSAGE_LOG_LEVELS[message.message_response.importance],
              "%s",
              message.message_response.message_text)
        else:
          current_state = message.state_response.state
          if current_state != previous_state:
            _LOGGER.info(
                "Job state changed to %s",
                self.runner_api_state_to_pipeline_state(current_state))
            previous_state = current_state
        self._messages.append(message)

    message_thread = threading.Thread(
        target=read_messages, name='wait_until_finish_read')
    message_thread.daemon = True
    message_thread.start()

    if duration:
      state_thread = threading.Thread(
          target=functools.partial(self._observe_state, message_thread),
          name='wait_until_finish_state_observer')
      state_thread.daemon = True
      state_thread.start()
      start_time = time.time()
      duration_secs = duration / 1000
      while (time.time() - start_time < duration_secs and
             state_thread.is_alive()):
        time.sleep(1)
    else:
      self._observe_state(message_thread)

    if self._runtime_exception:
      raise self._runtime_exception

    return self._state
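
  # For example (illustrative): wait at most 60 seconds, then inspect the
  # possibly still non-terminal state:
  #
  #   state = pipeline_result.wait_until_finish(duration=60_000)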
  def _observe_state(self, message_thread):
    try:
      for state_response in self._state_stream:
        self._state = self.runner_api_state_to_pipeline_state(
            state_response.state)
        if state_response.state in TERMINAL_STATES:
          # Wait for any last messages.
          message_thread.join(10)
          break
      if self._state != runner.PipelineState.DONE:
        self._runtime_exception = RuntimeError(
            'Pipeline %s failed in state %s: %s' %
            (self._job_id, self._state, self._last_error_message()))
    except Exception as e:
      self._runtime_exception = e
    finally:
      self._cleanup()

  def _cleanup(self, on_exit=False):
    # type: (bool) -> None
    if on_exit and self._cleanup_callbacks:
      _LOGGER.info(
          'Running cleanup on exit. If your pipeline should continue running, '
          'be sure to use the following syntax:\n'
          '  with Pipeline() as p:\n'
          '    p.apply(..)\n'
          'This ensures that the pipeline finishes before this program exits.')
    callback_exceptions = []
    for callback in self._cleanup_callbacks:
      try:
        callback()
      except Exception as e:
        callback_exceptions.append(e)

    self._cleanup_callbacks = ()
    if callback_exceptions:
      formatted_exceptions = ''.join(
          f"\n\t{repr(e)}" for e in callback_exceptions)
      raise RuntimeError(f'Errors: {formatted_exceptions}')