github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/utils/retry.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Retry decorators for calls raising exceptions.
    19  
    20  For internal use only; no backwards-compatibility guarantees.
    21  
    22  This module is used mostly to decorate all integration points where the code
    23  makes calls to remote services. Searching through the code base for @retry
    24  should find all such places. For this reason even places where retry is not
    25  needed right now use a @retry.no_retries decorator.
    26  """
    27  
    28  # pytype: skip-file
    29  
    30  import functools
    31  import logging
    32  import random
    33  import sys
    34  import time
    35  import traceback
    36  
    37  from apache_beam.io.filesystem import BeamIOError
    38  
    39  # Protect against environments where apitools library is not available.
    40  # pylint: disable=wrong-import-order, wrong-import-position
    41  # TODO(sourabhbajaj): Remove the GCP specific error code to a submodule
    42  try:
    43    from apitools.base.py.exceptions import HttpError
    44    from google.api_core.exceptions import GoogleAPICallError
    45  except ImportError as e:
    46    HttpError = None
    47    GoogleAPICallError = None  # type: ignore
    48  
    49  # Protect against environments where aws tools are not available.
    50  # pylint: disable=wrong-import-order, wrong-import-position, ungrouped-imports
    51  try:
    52    from apache_beam.io.aws.clients.s3 import messages as _s3messages
    53  except ImportError:
    54    S3ClientError = None
    55  else:
    56    S3ClientError = _s3messages.S3ClientError
    57  # pylint: enable=wrong-import-order, wrong-import-position
    58  
    59  _LOGGER = logging.getLogger(__name__)
    60  
    61  
    62  class PermanentException(Exception):
    63    """Base class for exceptions that should not be retried."""
    64    pass
    65  
    66  
    67  class FuzzedExponentialIntervals(object):
    68    """Iterable for intervals that are exponentially spaced, with fuzzing.
    69  
    70    On iteration, yields retry interval lengths, in seconds. Every iteration over
    71    this iterable will yield differently fuzzed interval lengths, as long as fuzz
    72    is nonzero.
    73  
    74    Args:
    75      initial_delay_secs: The delay before the first retry, in seconds.
    76      num_retries: The total number of times to retry.
    77      factor: The exponential factor to use on subsequent retries.
    78        Default is 2 (doubling).
    79      fuzz: A value between 0 and 1, indicating the fraction of fuzz. For a
    80        given delay d, the fuzzed delay is randomly chosen between
    81        [(1 - fuzz) * d, d].
    82      max_delay_secs: Maximum delay (in seconds). After this limit is reached,
    83        further tries use max_delay_sec instead of exponentially increasing
    84        the time. Defaults to 1 hour.
    85      stop_after_secs: Places a limit on the sum of intervals returned (in
    86        seconds), such that the sum is <= stop_after_secs. Defaults to disabled
    87        (None). You may need to increase num_retries to effectively use this
    88        feature.
    89    """
    90    def __init__(
    91        self,
    92        initial_delay_secs,
    93        num_retries,
    94        factor=2,
    95        fuzz=0.5,
    96        max_delay_secs=60 * 60 * 1,
    97        stop_after_secs=None):
    98      self._initial_delay_secs = initial_delay_secs
    99      if num_retries > 10000:
   100        raise ValueError('num_retries parameter cannot exceed 10000.')
   101      self._num_retries = num_retries
   102      self._factor = factor
   103      if not 0 <= fuzz <= 1:
   104        raise ValueError('fuzz parameter expected to be in [0, 1] range.')
   105      self._fuzz = fuzz
   106      self._max_delay_secs = max_delay_secs
   107      self._stop_after_secs = stop_after_secs
   108  
   109    def __iter__(self):
   110      current_delay_secs = min(self._max_delay_secs, self._initial_delay_secs)
   111      total_delay_secs = 0
   112      for _ in range(self._num_retries):
   113        fuzz_multiplier = 1 - self._fuzz + random.random() * self._fuzz
   114        delay_secs = current_delay_secs * fuzz_multiplier
   115        total_delay_secs += delay_secs
   116        if (self._stop_after_secs is not None and
   117            total_delay_secs > self._stop_after_secs):
   118          break
   119        yield delay_secs
   120        current_delay_secs = min(
   121            self._max_delay_secs, current_delay_secs * self._factor)
   122  
   123  
   124  def retry_on_server_errors_filter(exception):
   125    """Filter allowing retries on server errors and non-HttpErrors."""
   126    if (HttpError is not None) and isinstance(exception, HttpError):
   127      return exception.status_code >= 500
   128    if GoogleAPICallError is not None and isinstance(exception,
   129                                                     GoogleAPICallError):
   130      if exception.code >= 500:  # 500 are internal server errors
   131        return True
   132      else:
   133        # If we have a GoogleAPICallError with a code that doesn't
   134        # indicate a server error, we do not need to retry.
   135        return False
   136    if (S3ClientError is not None) and isinstance(exception, S3ClientError):
   137      return exception.code is None or exception.code >= 500
   138    return not isinstance(exception, PermanentException)
   139  
   140  
   141  # TODO(https://github.com/apache/beam/issues/19350): Dataflow returns 404 for
   142  # job ids that actually exist. Retry on those errors.
   143  def retry_on_server_errors_and_notfound_filter(exception):
   144    if HttpError is not None and isinstance(exception, HttpError):
   145      if exception.status_code == 404:  # 404 Not Found
   146        return True
   147    if GoogleAPICallError is not None and isinstance(exception,
   148                                                     GoogleAPICallError):
   149      if exception.code == 404:  # 404 Not found
   150        return True
   151    return retry_on_server_errors_filter(exception)
   152  
   153  
   154  def retry_on_server_errors_and_timeout_filter(exception):
   155    if HttpError is not None and isinstance(exception, HttpError):
   156      if exception.status_code == 408:  # 408 Request Timeout
   157        return True
   158    if GoogleAPICallError is not None and isinstance(exception,
   159                                                     GoogleAPICallError):
   160      if exception.code == 408:  # 408 Request Timeout
   161        return True
   162    if S3ClientError is not None and isinstance(exception, S3ClientError):
   163      if exception.code == 408:  # 408 Request Timeout
   164        return True
   165    return retry_on_server_errors_filter(exception)
   166  
   167  
   168  def retry_on_server_errors_timeout_or_quota_issues_filter(exception):
   169    """Retry on server, timeout and 403 errors.
   170  
   171    403 errors can be accessDenied, billingNotEnabled, and also quotaExceeded,
   172    rateLimitExceeded."""
   173    if HttpError is not None and isinstance(exception, HttpError):
   174      if exception.status_code == 403:
   175        return True
   176    if GoogleAPICallError is not None and isinstance(exception,
   177                                                     GoogleAPICallError):
   178      if exception.code == 403:
   179        return True
   180    if S3ClientError is not None and isinstance(exception, S3ClientError):
   181      if exception.code == 403:
   182        return True
   183    return retry_on_server_errors_and_timeout_filter(exception)
   184  
   185  
   186  def retry_on_beam_io_error_filter(exception):
   187    """Filter allowing retries on Beam IO errors."""
   188    return isinstance(exception, BeamIOError)
   189  
   190  
   191  def retry_if_valid_input_but_server_error_and_timeout_filter(exception):
   192    if isinstance(exception, ValueError):
   193      return False
   194    return retry_on_server_errors_and_timeout_filter(exception)
   195  
   196  
# Status codes classified as a server error or a timeout (408 is Request
# Timeout). Not referenced by the code visible in this module; presumably
# consumed by importers -- verify before changing its type or contents.
SERVER_ERROR_OR_TIMEOUT_CODES = [408, 500, 502, 503, 504, 598, 599]
   198  
   199  
   200  class Clock(object):
   201    """A simple clock implementing sleep()."""
   202    def sleep(self, value):
   203      time.sleep(value)
   204  
   205  
   206  def no_retries(fun):
   207    """A retry decorator for places where we do not want retries."""
   208    return with_exponential_backoff(retry_filter=lambda _: False, clock=None)(fun)
   209  
   210  
   211  def with_exponential_backoff(
   212      num_retries=7,
   213      initial_delay_secs=5.0,
   214      logger=_LOGGER.warning,
   215      retry_filter=retry_on_server_errors_filter,
   216      clock=Clock(),
   217      fuzz=True,
   218      factor=2,
   219      max_delay_secs=60 * 60,
   220      stop_after_secs=None):
   221    """Decorator with arguments that control the retry logic.
   222  
   223    Args:
   224      num_retries: The total number of times to retry.
   225      initial_delay_secs: The delay before the first retry, in seconds.
   226      logger: A callable used to report an exception. Must have the same signature
   227        as functions in the standard logging module. The default is
   228        _LOGGER.warning.
   229      retry_filter: A callable getting the exception raised and returning True
   230        if the retry should happen. For instance we do not want to retry on
   231        404 Http errors most of the time. The default value will return true
   232        for server errors (HTTP status code >= 500) and non Http errors.
   233      clock: A clock object implementing a sleep method. The default clock will
   234        use time.sleep().
   235      fuzz: True if the delay should be fuzzed (default). During testing False
   236        can be used so that the delays are not randomized.
   237      factor: The exponential factor to use on subsequent retries.
   238        Default is 2 (doubling).
   239      max_delay_secs: Maximum delay (in seconds). After this limit is reached,
   240        further tries use max_delay_sec instead of exponentially increasing
   241        the time. Defaults to 1 hour.
   242      stop_after_secs: Places a limit on the sum of delays between retries, such
   243        that the sum is <= stop_after_secs. Retries will stop after the limit is
   244        reached. Defaults to disabled (None). You may need to increase num_retries
   245        to effectively use this feature.
   246  
   247    Returns:
   248      As per Python decorators with arguments pattern returns a decorator
   249      for the function which in turn will return the wrapped (decorated) function.
   250  
   251    The decorator is intended to be used on callables that make HTTP or RPC
   252    requests that can temporarily timeout or have transient errors. For instance
   253    the make_http_request() call below will be retried 16 times with exponential
   254    backoff and fuzzing of the delay interval (default settings).
   255  
   256    from apache_beam.utils import retry
   257    # ...
   258    @retry.with_exponential_backoff()
   259    make_http_request(args)
   260    """
   261    def real_decorator(fun):
   262      """The real decorator whose purpose is to return the wrapped function."""
   263      @functools.wraps(fun)
   264      def wrapper(*args, **kwargs):
   265        retry_intervals = iter(
   266            FuzzedExponentialIntervals(
   267                initial_delay_secs,
   268                num_retries,
   269                factor,
   270                fuzz=0.5 if fuzz else 0,
   271                max_delay_secs=max_delay_secs,
   272                stop_after_secs=stop_after_secs))
   273        while True:
   274          try:
   275            return fun(*args, **kwargs)
   276          except Exception as exn:  # pylint: disable=broad-except
   277            if not retry_filter(exn):
   278              raise
   279            # Get the traceback object for the current exception. The
   280            # sys.exc_info() function returns a tuple with three elements:
   281            # exception type, exception value, and exception traceback.
   282            exn_traceback = sys.exc_info()[2]
   283            try:
   284              try:
   285                sleep_interval = next(retry_intervals)
   286              except StopIteration:
   287                # Re-raise the original exception since we finished the retries.
   288                raise exn.with_traceback(exn_traceback)
   289  
   290              logger(
   291                  'Retry with exponential backoff: waiting for %s seconds before '
   292                  'retrying %s because we caught exception: %s '
   293                  'Traceback for above exception (most recent call last):\n%s',
   294                  sleep_interval,
   295                  getattr(fun, '__name__', str(fun)),
   296                  ''.join(traceback.format_exception_only(exn.__class__, exn)),
   297                  ''.join(traceback.format_tb(exn_traceback)))
   298              clock.sleep(sleep_interval)
   299            finally:
   300              # Traceback objects in locals can cause reference cycles that will
   301              # prevent garbage collection. Clear it now since we do not need
   302              # it anymore.
   303              exn_traceback = None
   304  
   305      return wrapper
   306  
   307    return real_decorator