github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/aws/s3io.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""AWS S3 client
"""

# pytype: skip-file

import errno
import io
import logging
import re
import time
import traceback

from apache_beam.io.aws.clients.s3 import messages
from apache_beam.io.filesystemio import Downloader
from apache_beam.io.filesystemio import DownloaderStream
from apache_beam.io.filesystemio import Uploader
from apache_beam.io.filesystemio import UploaderStream
from apache_beam.utils import retry
from apache_beam.utils.annotations import deprecated

try:
  # pylint: disable=wrong-import-order, wrong-import-position
  # pylint: disable=ungrouped-imports
  from apache_beam.io.aws.clients.s3 import boto3_client
  BOTO3_INSTALLED = True
except ImportError:
  BOTO3_INSTALLED = False

MAX_BATCH_OPERATION_SIZE = 100


def parse_s3_path(s3_path, object_optional=False):
  """Return the bucket and object names of the given s3:// path."""
  match = re.match('^s3://([^/]+)/(.*)$', s3_path)
  if match is None or (match.group(2) == '' and not object_optional):
    raise ValueError('S3 path must be in the form s3://<bucket>/<object>.')
  return match.group(1), match.group(2)


class S3IO(object):
  """S3 I/O client."""
  def __init__(self, client=None, options=None):
    if client is None and options is None:
      raise ValueError('Must provide one of client or options')
    if client is not None:
      self.client = client
    elif BOTO3_INSTALLED:
      self.client = boto3_client.Client(options=options)
    else:
      message = 'AWS dependencies are not installed, and no alternative ' \
                'client was provided to S3IO.'
      raise RuntimeError(message)

  def open(
      self,
      filename,
      mode='r',
      read_buffer_size=16 * 1024 * 1024,
      mime_type='application/octet-stream'):
    """Open an S3 file path for reading or writing.

    Args:
      filename (str): S3 file path in the form ``s3://<bucket>/<object>``.
      mode (str): ``'r'`` for reading or ``'w'`` for writing.
      read_buffer_size (int): Buffer size to use during read operations.
      mime_type (str): Mime type to set for write operations.

    Returns:
      S3 file object.

    Raises:
      ValueError: Invalid open file mode.
    """
    if mode == 'r' or mode == 'rb':
      downloader = S3Downloader(
          self.client, filename, buffer_size=read_buffer_size)
      return io.BufferedReader(
          DownloaderStream(downloader, mode=mode),
          buffer_size=read_buffer_size)
    elif mode == 'w' or mode == 'wb':
      uploader = S3Uploader(self.client, filename, mime_type)
      return io.BufferedWriter(
          UploaderStream(uploader, mode=mode), buffer_size=128 * 1024)
    else:
      raise ValueError('Invalid file open mode: %s.' % mode)
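
  # Illustrative usage sketch (editorial addition, not part of the original
  # module): how open() is typically used once an S3IO instance exists. The
  # client object and paths below are hypothetical placeholders.
  #
  #   s3 = S3IO(client=my_s3_client)  # or S3IO(options=...) if boto3 is installed
  #   with s3.open('s3://my-bucket/my-object', 'w') as f:
  #     f.write(b'hello world')
  #   with s3.open('s3://my-bucket/my-object', 'r') as f:
  #     data = f.read()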

  @deprecated(since='2.45.0', current='list_files')
  def list_prefix(self, path, with_metadata=False):
    """Lists files matching the prefix.

    ``list_prefix`` has been deprecated. Use ``list_files`` instead, which
    returns a generator of file information instead of a dict.

    Args:
      path: S3 file path pattern in the form s3://<bucket>/[name].
      with_metadata: Experimental. Specify whether to return file metadata.

    Returns:
      If ``with_metadata`` is False: dict of file name -> size; if
      ``with_metadata`` is True: dict of file name -> tuple(size, timestamp).
    """
    file_info = {}
    for file_metadata in self.list_files(path, with_metadata):
      file_info[file_metadata[0]] = file_metadata[1]

    return file_info

  def list_files(self, path, with_metadata=False):
    """Lists files matching the prefix.

    Args:
      path: S3 file path pattern in the form s3://<bucket>/[name].
      with_metadata: Experimental. Specify whether to return file metadata.

    Returns:
      If ``with_metadata`` is False: generator of tuple(file name, size); if
      ``with_metadata`` is True: generator of
      tuple(file name, tuple(size, timestamp)).
    """
    bucket, prefix = parse_s3_path(path, object_optional=True)
    request = messages.ListRequest(bucket=bucket, prefix=prefix)

    file_info = set()
    counter = 0
    start_time = time.time()

    if with_metadata:
      logging.debug("Starting to collect file information for the input")
    else:
      logging.debug("Starting the size estimation of the input")

    while True:
      # The list operation will raise an exception when trying to list a
      # nonexistent S3 path. That is not an issue here, so ignore the
      # exception rather than letting it break the procedure.
      try:
        response = retry.with_exponential_backoff(
            retry_filter=retry.retry_on_server_errors_and_timeout_filter)(
                self.client.list)(request)
      except messages.S3ClientError as e:
        if e.code == 404:
          break
        else:
          raise e

      for item in response.items:
        file_name = 's3://%s/%s' % (bucket, item.key)
        if file_name not in file_info:
          file_info.add(file_name)
          counter += 1
          if counter % 10000 == 0:
            if with_metadata:
              logging.info(
                  "Finished computing file information of: %s files",
                  len(file_info))
            else:
              logging.info(
                  "Finished computing size of: %s files", len(file_info))
          if with_metadata:
            yield file_name, (
                item.size, self._updated_to_seconds(item.last_modified))
          else:
            yield file_name, item.size

      if response.next_token:
        request.continuation_token = response.next_token
      else:
        break

    logging.log(
        # do not spam logs when list_prefix is likely used to check empty folder
        logging.INFO if counter > 0 else logging.DEBUG,
        "Finished listing %s files in %s seconds.",
        counter,
        time.time() - start_time)

    return file_info
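
  # Illustrative sketch (editorial addition, not part of the original module):
  # consuming the generator returned by list_files(). The prefix below is a
  # hypothetical placeholder; with with_metadata=True each yielded value is
  # (file name, (size, timestamp)) instead of (file name, size).
  #
  #   for file_name, size in s3.list_files('s3://my-bucket/logs/'):
  #     print(file_name, size)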

  def checksum(self, path):
    """Looks up the checksum of an S3 object.

    Args:
      path: S3 file path pattern in the form s3://<bucket>/<name>.
    """
    return self._s3_object(path).etag

  @retry.with_exponential_backoff(
      retry_filter=retry.retry_on_server_errors_and_timeout_filter)
  def copy(self, src, dest):
    """Copies a single S3 file object from src to dest.

    Args:
      src: S3 file path pattern in the form s3://<bucket>/<name>.
      dest: S3 file path pattern in the form s3://<bucket>/<name>.

    Raises:
      TimeoutError: on timeout.
    """
    src_bucket, src_key = parse_s3_path(src)
    dest_bucket, dest_key = parse_s3_path(dest)
    request = messages.CopyRequest(src_bucket, src_key, dest_bucket, dest_key)
    self.client.copy(request)

  # We intentionally do not decorate this method with a retry, since the
  # underlying copy and delete operations are already idempotent operations
  # protected by retry decorators.
  def copy_paths(self, src_dest_pairs):
    """Copies the given S3 objects from src to dest. This can handle directory
    or file paths.

    Args:
      src_dest_pairs: list of (src, dest) tuples of s3://<bucket>/<name> file
        paths to copy from src to dest
    Returns: List of tuples of (src, dest, exception) in the same order as the
             src_dest_pairs argument, where exception is None if the operation
             succeeded or the relevant exception if the operation failed.
    """
    if not src_dest_pairs: return []

    results = []

    for src_path, dest_path in src_dest_pairs:

      # Copy a directory with self.copy_tree
      if src_path.endswith('/') and dest_path.endswith('/'):
        try:
          results += self.copy_tree(src_path, dest_path)
        except messages.S3ClientError as err:
          results.append((src_path, dest_path, err))

      # Copy individual files with self.copy
      elif not src_path.endswith('/') and not dest_path.endswith('/'):
        src_bucket, src_key = parse_s3_path(src_path)
        dest_bucket, dest_key = parse_s3_path(dest_path)
        request = messages.CopyRequest(
            src_bucket, src_key, dest_bucket, dest_key)

        try:
          self.client.copy(request)
          results.append((src_path, dest_path, None))
        except messages.S3ClientError as err:
          results.append((src_path, dest_path, err))

      # Mismatched paths (one directory, one non-directory) get an error result
      else:
        e = messages.S3ClientError(
            "Can't copy mismatched paths (one directory, one non-directory):" +
            ' %s, %s' % (src_path, dest_path),
            400)
        results.append((src_path, dest_path, e))

    return results
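
  # Illustrative sketch (editorial addition, not part of the original module):
  # copy_paths() accepts a mix of file pairs and directory pairs (both ending
  # in '/'); mismatched pairs are reported as errors in the result tuples.
  # The paths below are hypothetical placeholders.
  #
  #   results = s3.copy_paths([
  #       ('s3://src-bucket/a.txt', 's3://dst-bucket/a.txt'),  # file -> file
  #       ('s3://src-bucket/dir/', 's3://dst-bucket/dir/'),    # tree copy
  #   ])
  #   failed = [(src, dst, err) for src, dst, err in results if err is not None]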

  # We intentionally do not decorate this method with a retry, since the
  # underlying copy and delete operations are already idempotent operations
  # protected by retry decorators.
  def copy_tree(self, src, dest):
    """Copies the given S3 directory and its contents recursively
    from src to dest.

    Args:
      src: S3 file path pattern in the form s3://<bucket>/<name>/.
      dest: S3 file path pattern in the form s3://<bucket>/<name>/.

    Returns:
      List of tuples of (src, dest, exception) where exception is None if the
      operation succeeded or the relevant exception if the operation failed.
    """
    assert src.endswith('/')
    assert dest.endswith('/')

    results = []
    for entry in self.list_prefix(src):
      rel_path = entry[len(src):]
      try:
        self.copy(entry, dest + rel_path)
        results.append((entry, dest + rel_path, None))
      except messages.S3ClientError as e:
        results.append((entry, dest + rel_path, e))

    return results

  @retry.with_exponential_backoff(
      retry_filter=retry.retry_on_server_errors_and_timeout_filter)
  def delete(self, path):
    """Deletes a single S3 file object.

    Args:
      path: S3 file path in the form s3://<bucket>/<name>.
    """
    bucket, object_path = parse_s3_path(path)
    request = messages.DeleteRequest(bucket, object_path)

    try:
      self.client.delete(request)
    except messages.S3ClientError as e:
      if e.code == 404:
        return  # Same behavior as GCS - don't surface a 404 error
      else:
        logging.error('HTTP error while deleting file %s: %s', path, e)
        raise e

  # We intentionally do not decorate this method with a retry, since the
  # underlying copy and delete operations are already idempotent operations
  # protected by retry decorators.
  def delete_paths(self, paths):
    """Deletes the given S3 objects. This can handle directory or file paths.

    Args:
      paths: list of S3 file or directory paths in the form
        s3://<bucket>/<name>, where directory paths end with a '/'

    Returns:
      Dict of path -> exception, with one entry per object deleted (for a
      directory input, one entry per object found under it), where exception
      is None if the operation succeeded or the relevant exception if the
      operation failed.
    """
    directories, not_directories = [], []
    for path in paths:
      if path.endswith('/'): directories.append(path)
      else: not_directories.append(path)

    results = {}

    for directory in directories:
      dir_result = dict(self.delete_tree(directory))
      results.update(dir_result)

    not_directory_results = dict(self.delete_files(not_directories))
    results.update(not_directory_results)

    return results
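
  # Illustrative sketch (editorial addition, not part of the original module):
  # delete_paths() splits its input into directory paths (trailing '/') and
  # individual objects, delegating to delete_tree() and delete_files(). The
  # paths below are hypothetical placeholders.
  #
  #   outcome = s3.delete_paths([
  #       's3://my-bucket/tmp/',       # every object under tmp/ is deleted
  #       's3://my-bucket/stale.csv',  # a single object is deleted
  #   ])
  #   errors = {path: err for path, err in outcome.items() if err is not None}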

  # We intentionally do not decorate this method with a retry, since the
  # underlying copy and delete operations are already idempotent operations
  # protected by retry decorators.
  def delete_files(self, paths, max_batch_size=1000):
    """Deletes the given S3 file objects.

    Args:
      paths: List of S3 file paths in the form s3://<bucket>/<name>
      max_batch_size: Largest number of keys to send to the client to be
        deleted simultaneously

    Returns: List of tuples of (path, exception) in the same order as the paths
             argument, where exception is None if the operation succeeded or
             the relevant exception if the operation failed.
    """
    if not paths: return []

    # Group keys by bucket: {bucket: [keys]}
    buckets, keys = zip(*[parse_s3_path(path) for path in paths])
    grouped_keys = {bucket: [] for bucket in buckets}
    for bucket, key in zip(buckets, keys):
      grouped_keys[bucket].append(key)

    # For each bucket, delete minibatches of keys
    results = {}
    for bucket, keys in grouped_keys.items():
      for i in range(0, len(keys), max_batch_size):
        minibatch_keys = keys[i:i + max_batch_size]
        results.update(self._delete_minibatch(bucket, minibatch_keys))

    # Organize final results
    final_results = [(path, results[parse_s3_path(path)]) for path in paths]

    return final_results

  @retry.with_exponential_backoff(
      retry_filter=retry.retry_on_server_errors_and_timeout_filter)
  def _delete_minibatch(self, bucket, keys):
    """A helper method. Boto3 allows batch deletions
    for files within the same bucket.

    Args:
      bucket: String bucket name
      keys: List of keys to be deleted in the bucket

    Returns: dict of the form {(bucket, key): error}, where error is None if
             the operation succeeded
    """
    request = messages.DeleteBatchRequest(bucket, keys)
    results = {}
    try:
      response = self.client.delete_batch(request)

      for key in response.deleted:
        results[(bucket, key)] = None

      for key, error in zip(response.failed, response.errors):
        results[(bucket, key)] = error

    except messages.S3ClientError as e:
      for key in keys:
        results[(bucket, key)] = e

    return results

  # We intentionally do not decorate this method with a retry, since the
  # underlying copy and delete operations are already idempotent operations
  # protected by retry decorators.
  def delete_tree(self, root):
    """Deletes all objects under the given S3 directory.

    Args:
      root: S3 root path in the form s3://<bucket>/<name>/ (ending with a "/")

    Returns: List of tuples of (path, exception), where each path is an object
             under the given root. exception is None if the operation succeeded
             or the relevant exception if the operation failed.
    """
    assert root.endswith('/')

    paths = self.list_prefix(root)
    return self.delete_files(paths)

  def size(self, path):
    """Returns the size of a single S3 object.

    This method does not perform glob expansion. Hence the given path must be
    for a single S3 object.

    Returns: size of the S3 object in bytes.
    """
    return self._s3_object(path).size

  # We intentionally do not decorate this method with a retry, since the
  # underlying copy and delete operations are already idempotent operations
  # protected by retry decorators.
  def rename(self, src, dest):
    """Renames the given S3 object from src to dest.

    Args:
      src: S3 file path pattern in the form s3://<bucket>/<name>.
      dest: S3 file path pattern in the form s3://<bucket>/<name>.
    """
    self.copy(src, dest)
    self.delete(src)

  def last_updated(self, path):
    """Returns the last updated epoch time of a single S3 object.

    This method does not perform glob expansion. Hence the given path must be
    for a single S3 object.

    Returns: last updated time of the S3 object in seconds.
    """
    return self._updated_to_seconds(self._s3_object(path).last_modified)
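
  # Illustrative sketch (editorial addition, not part of the original module):
  # the single-object metadata helpers all take one s3://<bucket>/<name> path
  # (no glob expansion). The path below is a hypothetical placeholder.
  #
  #   if s3.exists('s3://my-bucket/data.bin'):
  #     num_bytes = s3.size('s3://my-bucket/data.bin')
  #     etag = s3.checksum('s3://my-bucket/data.bin')
  #     mtime = s3.last_updated('s3://my-bucket/data.bin')  # epoch seconds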

  def exists(self, path):
    """Returns whether the given S3 object exists.

    Args:
      path: S3 file path pattern in the form s3://<bucket>/<name>.
    """
    try:
      self._s3_object(path)
      return True
    except messages.S3ClientError as e:
      if e.code == 404:
        # HTTP 404 indicates that the file did not exist
        return False
      else:
        # We re-raise all other exceptions
        raise

  def _status(self, path):
    """For internal use only; no backwards-compatibility guarantees.

    Returns supported fields (checksum, last_updated, size) of a single object
    as a dict at once.

    This method does not perform glob expansion. Hence the given path must be
    for a single S3 object.

    Returns: dict of fields of the S3 object.
    """
    s3_object = self._s3_object(path)
    file_status = {}
    if hasattr(s3_object, 'etag'):
      file_status['checksum'] = s3_object.etag
    if hasattr(s3_object, 'last_modified'):
      file_status['last_updated'] = self._updated_to_seconds(
          s3_object.last_modified)
    if hasattr(s3_object, 'size'):
      file_status['size'] = s3_object.size
    return file_status

  @retry.with_exponential_backoff(
      retry_filter=retry.retry_on_server_errors_and_timeout_filter)
  def _s3_object(self, path):
    """Returns S3 object metadata for the given path.

    This method does not perform glob expansion. Hence the given path must be
    for a single S3 object.

    Returns: S3 object metadata.
    """
    bucket, object = parse_s3_path(path)
    request = messages.GetRequest(bucket, object)
    return self.client.get_object_metadata(request)

  @staticmethod
  def _updated_to_seconds(updated):
    """Helper function to transform the updated field of a response to
    seconds."""
    return (
        time.mktime(updated.timetuple()) - time.timezone +
        updated.microsecond / 1000000.0)

  def rename_files(self, src_dest_pairs):
    """Renames the given S3 objects from src to dest.

    Args:
      src_dest_pairs: list of (src, dest) tuples of s3://<bucket>/<name> file
        paths to rename from src to dest
    Returns: List of tuples of (src, dest, exception) in the same order as the
             src_dest_pairs argument, where exception is None if the operation
             succeeded or the relevant exception if the operation failed.
    """
    if not src_dest_pairs: return []

    # TODO: Throw value error if path has directory
    for src, dest in src_dest_pairs:
      if src.endswith('/') or dest.endswith('/'):
        raise ValueError('Cannot rename a directory')

    copy_results = self.copy_paths(src_dest_pairs)
    paths_to_delete = [src for (src, _, err) in copy_results if err is None]
    delete_results = self.delete_files(paths_to_delete)

    delete_results_dict = {src: err for (src, err) in delete_results}
    rename_results = []
    for src, dest, err in copy_results:
      if err is not None: rename_results.append((src, dest, err))
      elif delete_results_dict[src] is not None:
        rename_results.append((src, dest, delete_results_dict[src]))
      else:
        rename_results.append((src, dest, None))

    return rename_results
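
  # Illustrative sketch (editorial addition, not part of the original module):
  # rename_files() copies each object and then deletes the source, surfacing a
  # failure in either step per pair; directory paths are rejected. The paths
  # below are hypothetical placeholders.
  #
  #   for src, dest, err in s3.rename_files(
  #       [('s3://my-bucket/old-name.txt', 's3://my-bucket/new-name.txt')]):
  #     if err is not None:
  #       logging.warning('Rename %s -> %s failed: %s', src, dest, err)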


class S3Downloader(Downloader):
  def __init__(self, client, path, buffer_size):
    self._client = client
    self._path = path
    self._bucket, self._name = parse_s3_path(path)
    self._buffer_size = buffer_size

    # Get object state.
    self._get_request = (
        messages.GetRequest(bucket=self._bucket, object=self._name))

    try:
      metadata = self._get_object_metadata(self._get_request)

    except messages.S3ClientError as e:
      if e.code == 404:
        raise IOError(errno.ENOENT, 'Not found: %s' % self._path)
      else:
        logging.error(
            'HTTP error while requesting file %s: %s', self._path, e)
        raise

    self._size = metadata.size

  @retry.with_exponential_backoff(
      retry_filter=retry.retry_on_server_errors_and_timeout_filter)
  def _get_object_metadata(self, get_request):
    return self._client.get_object_metadata(get_request)

  @property
  def size(self):
    return self._size

  def get_range(self, start, end):
    return self._client.get_range(self._get_request, start, end)


class S3Uploader(Uploader):
  def __init__(self, client, path, mime_type='application/octet-stream'):
    self._client = client
    self._path = path
    self._bucket, self._name = parse_s3_path(path)
    self._mime_type = mime_type

    self.part_number = 1
    self.buffer = b''

    self.last_error = None

    self.upload_id = None

    self.parts = []

    self._start_upload()

  # There is retry logic in the underlying transfer library but we should make
  # it more explicit so we can control the retry parameters.
  @retry.no_retries  # Using no_retries marks this as an integration point.
  def _start_upload(self):
    # The uploader by default transfers data in chunks of 1024 * 1024 bytes at
    # a time, buffering writes until that size is reached.
    try:
      request = messages.UploadRequest(
          self._bucket, self._name, self._mime_type)
      response = self._client.create_multipart_upload(request)
      self.upload_id = response.upload_id
    except Exception as e:  # pylint: disable=broad-except
      logging.error(
          'Error in _start_upload while inserting file %s: %s',
          self._path,
          traceback.format_exc())
      self.last_error = e
      raise e

  def put(self, data):

    MIN_WRITE_SIZE = 5 * 1024 * 1024
    MAX_WRITE_SIZE = 5 * 1024 * 1024 * 1024

    # TODO: Byte strings might not be the most performant way to handle this
    self.buffer += data.tobytes()

    while len(self.buffer) >= MIN_WRITE_SIZE:
      # Take the first chunk off the buffer and write it to S3
      chunk = self.buffer[:MAX_WRITE_SIZE]
      self._write_to_s3(chunk)
      # Remove the written chunk from the buffer
      self.buffer = self.buffer[MAX_WRITE_SIZE:]

  def _write_to_s3(self, data):

    try:
      request = messages.UploadPartRequest(
          self._bucket, self._name, self.upload_id, self.part_number, data)
      response = self._client.upload_part(request)
      self.parts.append({
          'ETag': response.etag, 'PartNumber': response.part_number
      })
      self.part_number = self.part_number + 1
    except messages.S3ClientError as e:
      self.last_error = e
      if e.code == 404:
        raise IOError(errno.ENOENT, 'Not found: %s' % self._path)
      else:
        logging.error(
            'HTTP error while requesting file %s: %s', self._path, e)
        raise
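
  # Editorial note (not part of the original module): put() accumulates bytes
  # in memory and only uploads once the buffer holds at least MIN_WRITE_SIZE
  # (5 MiB), in slices of at most MAX_WRITE_SIZE (5 GiB), mirroring the S3
  # multipart constraint that every part except the last must be at least
  # 5 MiB. A rough trace, assuming three put() calls of 3 MiB each:
  #
  #   put(3 MiB)  ->  buffer holds 3 MiB, nothing uploaded yet
  #   put(3 MiB)  ->  buffer holds 6 MiB >= 5 MiB, uploaded as one 6 MiB part
  #   put(3 MiB)  ->  buffer holds 3 MiB, uploaded by finish() as the last part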

  def finish(self):
    if len(self.buffer) > 0:
      # Only flush when there is buffered data: zero-length writes, and files
      # whose size lands exactly on a flush boundary between
      # MIN_WRITE_SIZE = 5 * 1024 * 1024 and
      # MAX_WRITE_SIZE = 5 * 1024 * 1024 * 1024,
      # reach finish() with len(self.buffer) == 0, and uploading an empty
      # part would fail.
      self._write_to_s3(self.buffer)

    if self.last_error is not None:
      raise self.last_error  # pylint: disable=raising-bad-type

    request = messages.CompleteMultipartUploadRequest(
        self._bucket, self._name, self.upload_id, self.parts)
    self._client.complete_multipart_upload(request)