github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/utils/retry.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Retry decorators for calls raising exceptions.

For internal use only; no backwards-compatibility guarantees.

This module is used mostly to decorate all integration points where the code
makes calls to remote services. Searching through the code base for @retry
should find all such places. For this reason even places where retry is not
needed right now use a @retry.no_retries decorator.
"""

# pytype: skip-file

import functools
import logging
import random
import sys
import time
import traceback

from apache_beam.io.filesystem import BeamIOError

# Protect against environments where the apitools library is not available.
# pylint: disable=wrong-import-order, wrong-import-position
# TODO(sourabhbajaj): Move the GCP-specific error codes to a submodule.
try:
  from apitools.base.py.exceptions import HttpError
  from google.api_core.exceptions import GoogleAPICallError
except ImportError:
  HttpError = None
  GoogleAPICallError = None  # type: ignore

# Protect against environments where aws tools are not available.
# pylint: disable=wrong-import-order, wrong-import-position, ungrouped-imports
try:
  from apache_beam.io.aws.clients.s3 import messages as _s3messages
except ImportError:
  S3ClientError = None
else:
  S3ClientError = _s3messages.S3ClientError
# pylint: enable=wrong-import-order, wrong-import-position

_LOGGER = logging.getLogger(__name__)


class PermanentException(Exception):
  """Base class for exceptions that should not be retried."""
  pass


class FuzzedExponentialIntervals(object):
  """Iterable for intervals that are exponentially spaced, with fuzzing.

  On iteration, yields retry interval lengths, in seconds. Every iteration over
  this iterable will yield differently fuzzed interval lengths, as long as fuzz
  is nonzero.

  Args:
    initial_delay_secs: The delay before the first retry, in seconds.
    num_retries: The total number of times to retry.
    factor: The exponential factor to use on subsequent retries.
      Default is 2 (doubling).
    fuzz: A value between 0 and 1, indicating the fraction of fuzz. For a
      given delay d, the fuzzed delay is randomly chosen between
      [(1 - fuzz) * d, d].
    max_delay_secs: Maximum delay (in seconds). After this limit is reached,
      further tries use max_delay_secs instead of exponentially increasing
      the time. Defaults to 1 hour.
    stop_after_secs: Places a limit on the sum of intervals returned (in
      seconds), such that the sum is <= stop_after_secs. Defaults to disabled
      (None). You may need to increase num_retries to effectively use this
      feature.
  """
  def __init__(
      self,
      initial_delay_secs,
      num_retries,
      factor=2,
      fuzz=0.5,
      max_delay_secs=60 * 60 * 1,
      stop_after_secs=None):
    self._initial_delay_secs = initial_delay_secs
    if num_retries > 10000:
      raise ValueError('num_retries parameter cannot exceed 10000.')
    self._num_retries = num_retries
    self._factor = factor
    if not 0 <= fuzz <= 1:
      raise ValueError('fuzz parameter expected to be in [0, 1] range.')
    self._fuzz = fuzz
    self._max_delay_secs = max_delay_secs
    self._stop_after_secs = stop_after_secs

  def __iter__(self):
    current_delay_secs = min(self._max_delay_secs, self._initial_delay_secs)
    total_delay_secs = 0
    for _ in range(self._num_retries):
      fuzz_multiplier = 1 - self._fuzz + random.random() * self._fuzz
      delay_secs = current_delay_secs * fuzz_multiplier
      total_delay_secs += delay_secs
      if (self._stop_after_secs is not None and
          total_delay_secs > self._stop_after_secs):
        break
      yield delay_secs
      current_delay_secs = min(
          self._max_delay_secs, current_delay_secs * self._factor)
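

# A minimal usage sketch (illustrative only, not part of this module):
# iterating over FuzzedExponentialIntervals yields the fuzzed, capped
# delays one retry at a time, so a caller can simply sleep on each value.
#
#   for delay_secs in FuzzedExponentialIntervals(
#       initial_delay_secs=1.0, num_retries=5, fuzz=0.5):
#     time.sleep(delay_secs)  # ~1s, ~2s, ~4s, ~8s, ~16s before fuzzing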


def retry_on_server_errors_filter(exception):
  """Filter allowing retries on server errors and non-HttpErrors."""
  if (HttpError is not None) and isinstance(exception, HttpError):
    return exception.status_code >= 500
  if GoogleAPICallError is not None and isinstance(exception,
                                                   GoogleAPICallError):
    if exception.code >= 500:  # 500s are internal server errors
      return True
    else:
      # If we have a GoogleAPICallError with a code that doesn't
      # indicate a server error, we do not need to retry.
      return False
  if (S3ClientError is not None) and isinstance(exception, S3ClientError):
    return exception.code is None or exception.code >= 500
  return not isinstance(exception, PermanentException)


# TODO(https://github.com/apache/beam/issues/19350): Dataflow returns 404 for
# job ids that actually exist. Retry on those errors.
def retry_on_server_errors_and_notfound_filter(exception):
  """Filter allowing retries on server errors and 404 Not Found errors."""
  if HttpError is not None and isinstance(exception, HttpError):
    if exception.status_code == 404:  # 404 Not Found
      return True
  if GoogleAPICallError is not None and isinstance(exception,
                                                   GoogleAPICallError):
    if exception.code == 404:  # 404 Not Found
      return True
  return retry_on_server_errors_filter(exception)


def retry_on_server_errors_and_timeout_filter(exception):
  """Filter allowing retries on server errors and 408 Request Timeouts."""
  if HttpError is not None and isinstance(exception, HttpError):
    if exception.status_code == 408:  # 408 Request Timeout
      return True
  if GoogleAPICallError is not None and isinstance(exception,
                                                   GoogleAPICallError):
    if exception.code == 408:  # 408 Request Timeout
      return True
  if S3ClientError is not None and isinstance(exception, S3ClientError):
    if exception.code == 408:  # 408 Request Timeout
      return True
  return retry_on_server_errors_filter(exception)
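

# Illustrative sketch (hypothetical filter, not part of this module): a
# retry filter is a plain callable from an exception instance to bool, so
# call-site-specific policies can wrap the filters above, e.g. to also
# treat HTTP 429 (Too Many Requests) as retryable:
#
#   def retry_on_server_errors_and_rate_limit_filter(exception):
#     if HttpError is not None and isinstance(exception, HttpError):
#       if exception.status_code == 429:  # 429 Too Many Requests
#         return True
#     return retry_on_server_errors_filter(exception)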


def retry_on_server_errors_timeout_or_quota_issues_filter(exception):
  """Retry on server, timeout and 403 errors.

  403 errors can be accessDenied, billingNotEnabled, and also quotaExceeded,
  rateLimitExceeded."""
  if HttpError is not None and isinstance(exception, HttpError):
    if exception.status_code == 403:
      return True
  if GoogleAPICallError is not None and isinstance(exception,
                                                   GoogleAPICallError):
    if exception.code == 403:
      return True
  if S3ClientError is not None and isinstance(exception, S3ClientError):
    if exception.code == 403:
      return True
  return retry_on_server_errors_and_timeout_filter(exception)


def retry_on_beam_io_error_filter(exception):
  """Filter allowing retries on Beam IO errors."""
  return isinstance(exception, BeamIOError)


def retry_if_valid_input_but_server_error_and_timeout_filter(exception):
  """Filter allowing retries on server/timeout errors, but never on the
  ValueErrors raised for invalid input."""
  if isinstance(exception, ValueError):
    return False
  return retry_on_server_errors_and_timeout_filter(exception)


SERVER_ERROR_OR_TIMEOUT_CODES = [408, 500, 502, 503, 504, 598, 599]


class Clock(object):
  """A simple clock implementing sleep()."""
  def sleep(self, value):
    time.sleep(value)


def no_retries(fun):
  """A retry decorator for places where we do not want retries."""
  return with_exponential_backoff(retry_filter=lambda _: False, clock=None)(fun)
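

# Illustrative sketch (hypothetical call site, not part of this module):
# no_retries is applied as a plain decorator to mark an integration point
# that deliberately opts out of retrying while staying greppable via @retry.
#
#   @retry.no_retries  # this call should never be retried
#   def delete_job(client, job_id):
#     client.jobs.Delete(job_id)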


def with_exponential_backoff(
    num_retries=7,
    initial_delay_secs=5.0,
    logger=_LOGGER.warning,
    retry_filter=retry_on_server_errors_filter,
    clock=Clock(),
    fuzz=True,
    factor=2,
    max_delay_secs=60 * 60,
    stop_after_secs=None):
  """Decorator with arguments that control the retry logic.

  Args:
    num_retries: The total number of times to retry.
    initial_delay_secs: The delay before the first retry, in seconds.
    logger: A callable used to report an exception. Must have the same
      signature as functions in the standard logging module. The default is
      _LOGGER.warning.
    retry_filter: A callable getting the exception raised and returning True
      if the retry should happen. For instance we do not want to retry on
      404 HTTP errors most of the time. The default value will return True
      for server errors (HTTP status code >= 500) and non-HTTP errors.
    clock: A clock object implementing a sleep method. The default clock will
      use time.sleep().
    fuzz: True if the delay should be fuzzed (default). During testing False
      can be used so that the delays are not randomized.
    factor: The exponential factor to use on subsequent retries.
      Default is 2 (doubling).
    max_delay_secs: Maximum delay (in seconds). After this limit is reached,
      further tries use max_delay_secs instead of exponentially increasing
      the time. Defaults to 1 hour.
    stop_after_secs: Places a limit on the sum of delays between retries, such
      that the sum is <= stop_after_secs. Retries will stop after the limit is
      reached. Defaults to disabled (None). You may need to increase
      num_retries to effectively use this feature.

  Returns:
    A decorator for the target function which in turn returns the wrapped
    (decorated) function, following the usual pattern for decorators with
    arguments.

  The decorator is intended to be used on callables that make HTTP or RPC
  requests that can temporarily timeout or have transient errors. For instance
  the make_http_request() call below will be retried up to 7 times with
  exponential backoff and fuzzing of the delay interval (default settings).

    from apache_beam.utils import retry
    # ...
    @retry.with_exponential_backoff()
    def make_http_request(args):
      ...
  """
  def real_decorator(fun):
    """The real decorator whose purpose is to return the wrapped function."""
    @functools.wraps(fun)
    def wrapper(*args, **kwargs):
      retry_intervals = iter(
          FuzzedExponentialIntervals(
              initial_delay_secs,
              num_retries,
              factor,
              fuzz=0.5 if fuzz else 0,
              max_delay_secs=max_delay_secs,
              stop_after_secs=stop_after_secs))
      while True:
        try:
          return fun(*args, **kwargs)
        except Exception as exn:  # pylint: disable=broad-except
          if not retry_filter(exn):
            raise
          # Get the traceback object for the current exception. The
          # sys.exc_info() function returns a tuple with three elements:
          # exception type, exception value, and exception traceback.
          exn_traceback = sys.exc_info()[2]
          try:
            try:
              sleep_interval = next(retry_intervals)
            except StopIteration:
              # Re-raise the original exception since we finished the retries.
              raise exn.with_traceback(exn_traceback)

            logger(
                'Retry with exponential backoff: waiting for %s seconds '
                'before retrying %s because we caught exception: %s '
                'Traceback for above exception (most recent call last):\n%s',
                sleep_interval,
                getattr(fun, '__name__', str(fun)),
                ''.join(traceback.format_exception_only(exn.__class__, exn)),
                ''.join(traceback.format_tb(exn_traceback)))
            clock.sleep(sleep_interval)
          finally:
            # Traceback objects in locals can cause reference cycles that will
            # prevent garbage collection. Clear it now since we do not need
            # it anymore.
            exn_traceback = None

    return wrapper

  return real_decorator
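

# Illustrative end-to-end sketch (client and job_id are hypothetical names,
# not part of this module): wrap a flaky RPC so that server errors trigger
# up to five fuzzed exponential-backoff retries before the final exception
# propagates to the caller.
#
#   from apache_beam.utils import retry
#
#   @retry.with_exponential_backoff(
#       num_retries=5, retry_filter=retry.retry_on_server_errors_filter)
#   def get_job_status(client, job_id):
#     return client.jobs.Get(job_id).currentState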