github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/interactive/background_caching_job.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Module to build and run background source recording jobs.
    19  
    20  For internal use only; no backwards-compatibility guarantees.
    21  
    22  A background source recording job is a job that records events for all
    23  recordable sources of a given pipeline. With Interactive Beam, one such job is
    24  started when a pipeline run happens (which produces a main job in contrast to
    25  the background source recording job) and meets the following conditions:
    26  
    27    #. The pipeline contains recordable sources, configured through
    28       interactive_beam.options.recordable_sources.
    29    #. No such background job is running.
    30    #. No such background job has completed successfully and the cached events are
    31       still valid (invalidated when recordable sources change in the pipeline).
    32  
    33  Once started, the background source recording job runs asynchronously until it
    34  hits some recording limit configured in interactive_beam.options. Meanwhile,
    35  the main job and future main jobs from the pipeline will run using the
    36  deterministic replayable recorded events until they are invalidated.
    37  """
    38  
    39  # pytype: skip-file
    40  
    41  import logging
    42  import threading
    43  import time
    44  
    45  import apache_beam as beam
    46  from apache_beam.runners.interactive import interactive_environment as ie
    47  from apache_beam.runners.interactive import utils
    48  from apache_beam.runners.interactive.caching import streaming_cache
    49  from apache_beam.runners.runner import PipelineState
    50  
    51  _LOGGER = logging.getLogger(__name__)
    52  
    53  
    54  class BackgroundCachingJob(object):
    55    """A simple abstraction that controls necessary components of a timed and
    56    space limited background source recording job.
    57  
    58    A background source recording job successfully completes source data
    59    recording in 2 conditions:
    60  
    61      #. The job is finite and runs into DONE state;
    62      #. The job is infinite but hits an interactive_beam.options configured limit
    63         and gets cancelled into CANCELLED/CANCELLING state.
    64  
    65    In both situations, the background source recording job should be treated as
    66    done successfully.
    67    """
    68    def __init__(self, pipeline_result, limiters):
    69      self._pipeline_result = pipeline_result
    70      self._result_lock = threading.RLock()
    71      self._condition_checker = threading.Thread(
    72          target=self._background_caching_job_condition_checker, daemon=True)
    73  
    74      # Limiters are checks s.t. if any are triggered then the background caching
    75      # job gets cancelled.
    76      self._limiters = limiters
    77      self._condition_checker.start()
    78  
    79    def _background_caching_job_condition_checker(self):
    80      while True:
    81        with self._result_lock:
    82          if PipelineState.is_terminal(self._pipeline_result.state):
    83            break
    84  
    85        if self._should_end_condition_checker():
    86          self.cancel()
    87          break
    88        time.sleep(0.5)
    89  
    90    def _should_end_condition_checker(self):
    91      return any(l.is_triggered() for l in self._limiters)
    92  
    93    def is_done(self):
    94      with self._result_lock:
    95        is_terminated = self._pipeline_result.state in (
    96            PipelineState.DONE, PipelineState.CANCELLED)
    97        is_triggered = self._should_end_condition_checker()
    98        is_cancelling = self._pipeline_result.state is PipelineState.CANCELLING
    99      return is_terminated or (is_triggered and is_cancelling)
   100  
   101    def is_running(self):
   102      with self._result_lock:
   103        return self._pipeline_result.state is PipelineState.RUNNING
   104  
   105    def cancel(self):
   106      """Cancels this background source recording job.
   107      """
   108      with self._result_lock:
   109        if not PipelineState.is_terminal(self._pipeline_result.state):
   110          try:
   111            self._pipeline_result.cancel()
   112          except NotImplementedError:
   113            # Ignore the cancel invocation if it is never implemented by the
   114            # runner.
   115            pass
   116  
   117    @property
   118    def state(self):
   119      with self._result_lock:
   120        return self._pipeline_result.state
   121  
   122  
   123  def attempt_to_run_background_caching_job(
   124      runner, user_pipeline, options=None, limiters=None):
   125    """Attempts to run a background source recording job for a user-defined
   126    pipeline.
   127  
   128    Returns True if a job was started, False otherwise.
   129  
   130    The pipeline result is automatically tracked by Interactive Beam in case
   131    future cancellation/cleanup is needed.
   132    """
   133    if is_background_caching_job_needed(user_pipeline):
   134      # Cancel non-terminal jobs if there is any before starting a new one.
   135      attempt_to_cancel_background_caching_job(user_pipeline)
   136      # Cancel the gRPC server serving the test stream if there is one.
   137      attempt_to_stop_test_stream_service(user_pipeline)
   138      # TODO(BEAM-8335): refactor background source recording job logic from
   139      # pipeline_instrument module to this module and aggregate tests.
   140      from apache_beam.runners.interactive import pipeline_instrument as instr
   141      runner_pipeline = beam.pipeline.Pipeline.from_runner_api(
   142          user_pipeline.to_runner_api(), runner, options)
   143      ie.current_env().add_derived_pipeline(user_pipeline, runner_pipeline)
   144      background_caching_job_result = beam.pipeline.Pipeline.from_runner_api(
   145          instr.build_pipeline_instrument(
   146              runner_pipeline).background_caching_pipeline_proto(),
   147          runner,
   148          options).run()
   149  
   150      recording_limiters = (
   151          limiters
   152          if limiters else ie.current_env().options.capture_control.limiters())
   153      ie.current_env().set_background_caching_job(
   154          user_pipeline,
   155          BackgroundCachingJob(
   156              background_caching_job_result, limiters=recording_limiters))
   157      return True
   158    return False
   159  
   160  
   161  def is_background_caching_job_needed(user_pipeline):
   162    """Determines if a background source recording job needs to be started.
   163  
   164    It does several state checks and recording state changes throughout the
   165    process. It is not idempotent to simplify the usage.
   166    """
   167    job = ie.current_env().get_background_caching_job(user_pipeline)
   168    # Checks if the pipeline contains any source that needs to be cached.
   169    need_cache = has_source_to_cache(user_pipeline)
   170    # If this is True, we can invalidate a previous done/running job if there is
   171    # one.
   172    cache_changed = is_source_to_cache_changed(user_pipeline)
   173    # When recording replay is disabled, cache is always needed for recordable
   174    # sources (if any).
   175    if need_cache and not ie.current_env().options.enable_recording_replay:
   176      from apache_beam.runners.interactive.options import capture_control
   177      capture_control.evict_captured_data()
   178      return True
   179    return (
   180        need_cache and
   181        # Checks if it's the first time running a job from the pipeline.
   182        (
   183            not job or
   184            # Or checks if there is no previous job.
   185            # DONE means a previous job has completed successfully and the
   186            # cached events might still be valid.
   187            not (
   188                job.is_done() or
   189                # RUNNING means a previous job has been started and is still
   190                # running.
   191                job.is_running()) or
   192            # Or checks if we can invalidate the previous job.
   193            cache_changed))
   194  
   195  
   196  def is_cache_complete(pipeline_id):
   197    # type: (str) -> bool
   198  
   199    """Returns True if the backgrond cache for the given pipeline is done.
   200    """
   201    user_pipeline = ie.current_env().pipeline_id_to_pipeline(pipeline_id)
   202    job = ie.current_env().get_background_caching_job(user_pipeline)
   203    is_done = job and job.is_done()
   204    cache_changed = is_source_to_cache_changed(
   205        user_pipeline, update_cached_source_signature=False)
   206  
   207    # Stop reading from the cache if the background job is done or the underlying
   208    # cache signature changed that requires a new background source recording job.
   209    return is_done or cache_changed
   210  
   211  
   212  def has_source_to_cache(user_pipeline):
   213    """Determines if a user-defined pipeline contains any source that need to be
   214    cached. If so, also immediately wrap current cache manager held by current
   215    interactive environment into a streaming cache if this has not been done.
   216    The wrapping doesn't invalidate existing cache in any way.
   217  
   218    This can help determining if a background source recording job is needed to
   219    write cache for sources and if a test stream service is needed to serve the
   220    cache.
   221  
   222    Throughout the check, if source-to-cache has changed from the last check, it
   223    also cleans up the invalidated cache early on.
   224    """
   225    # TODO(BEAM-8335): we temporarily only cache replaceable unbounded sources.
   226    # Add logic for other cacheable sources here when they are available.
   227    has_cache = utils.has_unbounded_sources(user_pipeline)
   228    if has_cache:
   229      if not isinstance(ie.current_env().get_cache_manager(user_pipeline,
   230                                                           create_if_absent=True),
   231                        streaming_cache.StreamingCache):
   232  
   233        file_based_cm = ie.current_env().get_cache_manager(user_pipeline)
   234        cache_dir = file_based_cm._cache_dir
   235        cache_root = ie.current_env().options.cache_root
   236        if cache_root:
   237          if cache_root.startswith('gs://'):
   238            raise ValueError(
   239                'GCS cache paths are not currently supported for '
   240                'streaming pipelines.')
   241          cache_dir = cache_root
   242        ie.current_env().set_cache_manager(
   243            streaming_cache.StreamingCache(
   244                cache_dir,
   245                is_cache_complete=is_cache_complete,
   246                sample_resolution_sec=1.0,
   247                saved_pcoders=file_based_cm._saved_pcoders),
   248            user_pipeline)
   249    return has_cache
   250  
   251  
   252  def attempt_to_cancel_background_caching_job(user_pipeline):
   253    """Attempts to cancel background source recording job for a user-defined
   254    pipeline.
   255  
   256    If no background source recording job needs to be cancelled, NOOP. Otherwise,
   257    cancel such job.
   258    """
   259    job = ie.current_env().get_background_caching_job(user_pipeline)
   260    if job:
   261      job.cancel()
   262  
   263  
   264  def attempt_to_stop_test_stream_service(user_pipeline):
   265    """Attempts to stop the gRPC server/service serving the test stream.
   266  
   267    If there is no such server started, NOOP. Otherwise, stop it.
   268    """
   269    if is_a_test_stream_service_running(user_pipeline):
   270      ie.current_env().evict_test_stream_service_controller(user_pipeline).stop()
   271  
   272  
   273  def is_a_test_stream_service_running(user_pipeline):
   274    """Checks to see if there is a gPRC server/service running that serves the
   275    test stream to any job started from the given user_pipeline.
   276    """
   277    return ie.current_env().get_test_stream_service_controller(
   278        user_pipeline) is not None
   279  
   280  
   281  def is_source_to_cache_changed(
   282      user_pipeline, update_cached_source_signature=True):
   283    """Determines if there is any change in the sources that need to be cached
   284    used by the user-defined pipeline.
   285  
   286    Due to the expensiveness of computations and for the simplicity of usage, this
   287    function is not idempotent because Interactive Beam automatically discards
   288    previously tracked signature of transforms and tracks the current signature of
   289    transforms for the user-defined pipeline if there is any change.
   290  
   291    When it's True, there is addition/deletion/mutation of source transforms that
   292    requires a new background source recording job.
   293    """
   294    # By default gets empty set if the user_pipeline is first time seen because
   295    # we can treat it as adding transforms.
   296    recorded_signature = ie.current_env().get_cached_source_signature(
   297        user_pipeline)
   298    current_signature = extract_source_to_cache_signature(user_pipeline)
   299    is_changed = not current_signature.issubset(recorded_signature)
   300    # The computation of extract_unbounded_source_signature is expensive, track on
   301    # change by default.
   302    if is_changed and update_cached_source_signature:
   303      options = ie.current_env().options
   304      # No info needed when recording replay is disabled.
   305      if options.enable_recording_replay:
   306        if not recorded_signature:
   307  
   308          def sizeof_fmt(num, suffix='B'):
   309            for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
   310              if abs(num) < 1000.0:
   311                return "%3.1f%s%s" % (num, unit, suffix)
   312              num /= 1000.0
   313            return "%.1f%s%s" % (num, 'Yi', suffix)
   314  
   315          _LOGGER.info(
   316              'Interactive Beam has detected unbounded sources in your pipeline. '
   317              'In order to have a deterministic replay, a segment of data will '
   318              'be recorded from all sources for %s seconds or until a total of '
   319              '%s have been written to disk.',
   320              options.recording_duration.total_seconds(),
   321              sizeof_fmt(options.recording_size_limit))
   322        else:
   323          _LOGGER.info(
   324              'Interactive Beam has detected a new streaming source was '
   325              'added to the pipeline. In order for the cached streaming '
   326              'data to start at the same time, all recorded data has been '
   327              'cleared and a new segment of data will be recorded.')
   328  
   329      ie.current_env().cleanup(user_pipeline)
   330      ie.current_env().set_cached_source_signature(
   331          user_pipeline, current_signature)
   332      ie.current_env().add_user_pipeline(user_pipeline)
   333    return is_changed
   334  
   335  
   336  def extract_source_to_cache_signature(user_pipeline):
   337    """Extracts a set of signature for sources that need to be cached in the
   338    user-defined pipeline.
   339  
   340    A signature is a str representation of urn and payload of a source.
   341    """
   342    # TODO(BEAM-8335): we temporarily only cache replaceable unbounded sources.
   343    # Add logic for other cacheable sources here when they are available.
   344    unbounded_sources_as_applied_transforms = utils.unbounded_sources(
   345        user_pipeline)
   346    unbounded_sources_as_ptransforms = set(
   347        map(lambda x: x.transform, unbounded_sources_as_applied_transforms))
   348    _, context = user_pipeline.to_runner_api(return_context=True)
   349    signature = set(
   350        map(
   351            lambda transform: str(transform.to_runner_api(context)),
   352            unbounded_sources_as_ptransforms))
   353    return signature