github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/sdf_utils.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  # pytype: skip-file
    19  
"""Common utility classes to help the SDK harness execute an SDF."""
    21  
    22  import logging
    23  import threading
    24  from typing import TYPE_CHECKING
    25  from typing import Any
    26  from typing import NamedTuple
    27  from typing import Optional
    28  from typing import Tuple
    29  from typing import Union
    30  
    31  from apache_beam.transforms.core import WatermarkEstimatorProvider
    32  from apache_beam.utils.timestamp import Duration
    33  from apache_beam.utils.timestamp import Timestamp
    34  from apache_beam.utils.windowed_value import WindowedValue
    35  
    36  if TYPE_CHECKING:
    37    from apache_beam.io.iobase import RestrictionProgress
    38    from apache_beam.io.iobase import RestrictionTracker
    39    from apache_beam.io.iobase import WatermarkEstimator
    40  
    41  _LOGGER = logging.getLogger(__name__)
    42  
    43  SplitResultPrimary = NamedTuple(
    44      'SplitResultPrimary', [('primary_value', WindowedValue)])
    45  
    46  SplitResultResidual = NamedTuple(
    47      'SplitResultResidual',
    48      [('residual_value', WindowedValue), ('current_watermark', Timestamp),
    49       ('deferred_timestamp', Optional[Duration])])
    50  
    51  
    52  class ThreadsafeRestrictionTracker(object):
    53    """A thread-safe wrapper which wraps a `RestrictionTracker`.
    54  
    55    This wrapper guarantees synchronization of modifying restrictions across
    56    multi-thread.
    57    """
    58    def __init__(self, restriction_tracker):
    59      # type: (RestrictionTracker) -> None
    60      from apache_beam.io.iobase import RestrictionTracker
    61      if not isinstance(restriction_tracker, RestrictionTracker):
    62        raise ValueError(
    63            'Initialize ThreadsafeRestrictionTracker requires'
    64            'RestrictionTracker.')
    65      self._restriction_tracker = restriction_tracker
    66      # Records an absolute timestamp when defer_remainder is called.
    67      self._timestamp = None
    68      self._lock = threading.RLock()
    69      self._deferred_residual = None
    70      self._deferred_timestamp = None  # type: Optional[Union[Timestamp, Duration]]
    71  
    72    def current_restriction(self):
    73      with self._lock:
    74        return self._restriction_tracker.current_restriction()
    75  
    76    def try_claim(self, position):
    77      with self._lock:
    78        return self._restriction_tracker.try_claim(position)
    79  
    80    def defer_remainder(self, deferred_time=None):
    81      """Performs self-checkpoint on current processing restriction with an
    82      expected resuming time.
    83  
    84      Self-checkpoint could happen during processing elements. When executing an
    85      DoFn.process(), you may want to stop processing an element and resuming
    86      later if current element has been processed quit a long time or you also
    87      want to have some outputs from other elements. ``defer_remainder()`` can be
    88      called on per element if needed.
    89  
    90      Args:
    91        deferred_time: A relative ``Duration`` that indicates the ideal time gap
    92          between now and resuming, or an absolute ``Timestamp`` for resuming
    93          execution time. If the time_delay is None, the deferred work will be
    94          executed as soon as possible.
    95      """
    96  
    97      # Record current time for calculating deferred_time later.
    98      with self._lock:
    99        self._timestamp = Timestamp.now()
   100        if deferred_time and not isinstance(deferred_time, (Duration, Timestamp)):
   101          raise ValueError(
   102              'The timestamp of deter_remainder() should be a '
   103              'Duration or a Timestamp, or None.')
   104        self._deferred_timestamp = deferred_time
   105        checkpoint = self.try_split(0)
   106        if checkpoint:
   107          _, self._deferred_residual = checkpoint
   108  
   109    def check_done(self):
   110      with self._lock:
   111        return self._restriction_tracker.check_done()
   112  
   113    def current_progress(self):
   114      # type: () -> RestrictionProgress
   115      with self._lock:
   116        return self._restriction_tracker.current_progress()
   117  
   118    def try_split(self, fraction_of_remainder):
   119      with self._lock:
   120        return self._restriction_tracker.try_split(fraction_of_remainder)
   121  
   122    def deferred_status(self):
   123      # type: () -> Optional[Tuple[Any, Duration]]
   124  
   125      """Returns deferred work which is produced by ``defer_remainder()``.
   126  
   127      When there is a self-checkpoint performed, the system needs to fulfill the
   128      DelayedBundleApplication with deferred_work for a  ProcessBundleResponse.
   129      The system calls this API to get deferred_residual with watermark together
   130      to help the runner to schedule a future work.
   131  
   132      Returns: (deferred_residual, time_delay) if having any residual, else None.
   133      """
   134      if self._deferred_residual:
   135        # If _deferred_timestamp is None, create Duration(0).
   136        if not self._deferred_timestamp:
   137          self._deferred_timestamp = Duration()
   138        # If an absolute timestamp is provided, calculate the delta between
   139        # the absoluted time and the time deferred_status() is called.
   140        elif isinstance(self._deferred_timestamp, Timestamp):
   141          self._deferred_timestamp = (self._deferred_timestamp - Timestamp.now())
   142        # If a Duration is provided, the deferred time should be:
   143        # provided duration - the spent time since the defer_remainder() is
   144        # called.
   145        elif isinstance(self._deferred_timestamp, Duration):
   146          self._deferred_timestamp -= (Timestamp.now() - self._timestamp)
   147        return self._deferred_residual, self._deferred_timestamp
   148      return None
   149  
   150    def is_bounded(self):
   151      return self._restriction_tracker.is_bounded()
   152  
   153  
   154  class RestrictionTrackerView(object):
   155    """A DoFn view of thread-safe RestrictionTracker.
   156  
   157    The RestrictionTrackerView wraps a ThreadsafeRestrictionTracker and only
   158    exposes APIs that will be called by a ``DoFn.process()``. During execution
   159    time, the RestrictionTrackerView will be fed into the ``DoFn.process`` as a
   160    restriction_tracker.
   161    """
   162    def __init__(self, threadsafe_restriction_tracker):
   163      # type: (ThreadsafeRestrictionTracker) -> None
   164      if not isinstance(threadsafe_restriction_tracker,
   165                        ThreadsafeRestrictionTracker):
   166        raise ValueError(
   167            'Initialize RestrictionTrackerView requires '
   168            'ThreadsafeRestrictionTracker.')
   169      self._threadsafe_restriction_tracker = threadsafe_restriction_tracker
   170  
   171    def current_restriction(self):
   172      return self._threadsafe_restriction_tracker.current_restriction()
   173  
   174    def try_claim(self, position):
   175      return self._threadsafe_restriction_tracker.try_claim(position)
   176  
   177    def defer_remainder(self, deferred_time=None):
   178      self._threadsafe_restriction_tracker.defer_remainder(deferred_time)
   179  
   180    def is_bounded(self):
   181      self._threadsafe_restriction_tracker.is_bounded()
   182  
   183  
   184  class ThreadsafeWatermarkEstimator(object):
   185    """A threadsafe wrapper which wraps a WatermarkEstimator with locking
   186    mechanism to guarantee multi-thread safety.
   187    """
   188    def __init__(self, watermark_estimator):
   189      # type: (WatermarkEstimator) -> None
   190      from apache_beam.io.iobase import WatermarkEstimator
   191      if not isinstance(watermark_estimator, WatermarkEstimator):
   192        raise ValueError('Initializing Threadsafe requires a WatermarkEstimator')
   193      self._watermark_estimator = watermark_estimator
   194      self._lock = threading.Lock()
   195  
   196    def __getattr__(self, attr):
   197      if hasattr(self._watermark_estimator, attr):
   198  
   199        def method_wrapper(*args, **kw):
   200          with self._lock:
   201            return getattr(self._watermark_estimator, attr)(*args, **kw)
   202  
   203        return method_wrapper
   204      raise AttributeError(attr)
   205  
   206    def get_estimator_state(self):
   207      with self._lock:
   208        return self._watermark_estimator.get_estimator_state()
   209  
   210    def current_watermark(self):
   211      # type: () -> Timestamp
   212      with self._lock:
   213        return self._watermark_estimator.current_watermark()
   214  
   215    def observe_timestamp(self, timestamp):
   216      # type: (Timestamp) -> None
   217      if not isinstance(timestamp, Timestamp):
   218        raise ValueError(
   219            'Input of observe_timestamp should be a Timestamp '
   220            'object')
   221      with self._lock:
   222        self._watermark_estimator.observe_timestamp(timestamp)
   223  
   224  
   225  class NoOpWatermarkEstimatorProvider(WatermarkEstimatorProvider):
   226    """A WatermarkEstimatorProvider which creates NoOpWatermarkEstimator for the
   227    framework.
   228    """
   229    def initial_estimator_state(self, element, restriction):
   230      return None
   231  
   232    def create_watermark_estimator(self, estimator_state):
   233      from apache_beam.io.iobase import WatermarkEstimator
   234  
   235      class _NoOpWatermarkEstimator(WatermarkEstimator):
   236        """A No-op WatermarkEstimator which is provided for the framework if there
   237        is no custom one.
   238        """
   239        def observe_timestamp(self, timestamp):
   240          pass
   241  
   242        def current_watermark(self):
   243          return None
   244  
   245        def get_estimator_state(self):
   246          return None
   247  
   248      return _NoOpWatermarkEstimator()