github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/azure/blobstorageio.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Azure Blob Storage client.
"""

# pytype: skip-file

import errno
import io
import logging
import os
import re
import tempfile
import time

from apache_beam.internal.azure import auth
from apache_beam.io.filesystemio import Downloader
from apache_beam.io.filesystemio import DownloaderStream
from apache_beam.io.filesystemio import Uploader
from apache_beam.io.filesystemio import UploaderStream
from apache_beam.options.pipeline_options import AzureOptions
from apache_beam.utils import retry
from apache_beam.utils.annotations import deprecated

_LOGGER = logging.getLogger(__name__)

try:
  # pylint: disable=wrong-import-order, wrong-import-position
  # pylint: disable=ungrouped-imports
  from azure.core.exceptions import ResourceNotFoundError
  from azure.storage.blob import (
      BlobServiceClient,
      ContentSettings,
  )
  AZURE_DEPS_INSTALLED = True
except ImportError:
  AZURE_DEPS_INSTALLED = False

DEFAULT_READ_BUFFER_SIZE = 16 * 1024 * 1024

MAX_BATCH_OPERATION_SIZE = 100


def parse_azfs_path(azfs_path, blob_optional=False, get_account=False):
  """Return the storage account, the container and
  blob names of the given azfs:// path.
  """
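  # A note on the pattern below (a reading of the regex, which matches
  # Azure's documented naming rules): storage account names are 3-24
  # lowercase letters and digits; container names are 3-63 characters of
  # lowercase letters, digits, and hyphens that start and end with a letter
  # or digit and contain no consecutive hyphens.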
  match = re.match(
      '^azfs://([a-z0-9]{3,24})/([a-z0-9](?![a-z0-9-]*--[a-z0-9-]*)'
      '[a-z0-9-]{1,61}[a-z0-9])/(.*)$',
      azfs_path)
  if match is None or (match.group(3) == '' and not blob_optional):
    raise ValueError(
        'Azure Blob Storage path must be in the form '
        'azfs://<storage-account>/<container>/<path>.')
  result = None
  if get_account:
    result = match.group(1), match.group(2), match.group(3)
  else:
    result = match.group(2), match.group(3)
  return result


def get_azfs_url(storage_account, container, blob=''):
  """Returns the url in the form of
  https://account.blob.core.windows.net/container/blob-name
  """
  return 'https://' + storage_account + '.blob.core.windows.net/' + \
      container + '/' + blob


class Blob():
  """A Blob in Azure Blob Storage."""
  def __init__(self, etag, name, last_updated, size, mime_type):
    self.etag = etag
    self.name = name
    self.last_updated = last_updated
    self.size = size
    self.mime_type = mime_type


class BlobStorageIOError(IOError, retry.PermanentException):
  """Blob Storage IO error that should not be retried."""
  pass


class BlobStorageError(Exception):
  """Blob Storage client error."""
  def __init__(self, message=None, code=None):
    self.message = message
    self.code = code


class BlobStorageIO(object):
  """Azure Blob Storage I/O client."""
  def __init__(self, client=None, pipeline_options=None):
    # Check the dependencies up front; without them the BlobServiceClient
    # name used below is not even defined.
    if not AZURE_DEPS_INSTALLED:
      raise RuntimeError('Azure dependencies are not installed. Unable to run.')
    if client is None:
      azure_options = pipeline_options.view_as(AzureOptions)
      connect_str = azure_options.azure_connection_string or \
          os.getenv('AZURE_STORAGE_CONNECTION_STRING')
      if connect_str:
        self.client = BlobServiceClient.from_connection_string(
            conn_str=connect_str)
      else:
        credential = auth.get_service_credentials(pipeline_options)
        self.client = BlobServiceClient(
            account_url=azure_options.blob_service_endpoint,
            credential=credential)
    else:
      self.client = client

  def open(
      self,
      filename,
      mode='r',
      read_buffer_size=DEFAULT_READ_BUFFER_SIZE,
      mime_type='application/octet-stream'):
    """Open an Azure Blob Storage file path for reading or writing.

    Args:
      filename (str): Azure Blob Storage file path in the form
        ``azfs://<storage-account>/<container>/<path>``.
      mode (str): ``'r'`` for reading or ``'w'`` for writing.
      read_buffer_size (int): Buffer size to use during read operations.
      mime_type (str): Mime type to set for write operations.

    Returns:
      Azure Blob Storage file object.

    Raises:
      ValueError: Invalid open file mode.
    """
    if mode == 'r' or mode == 'rb':
      downloader = BlobStorageDownloader(
          self.client, filename, buffer_size=read_buffer_size)
      return io.BufferedReader(
          DownloaderStream(
              downloader, read_buffer_size=read_buffer_size, mode=mode),
          buffer_size=read_buffer_size)
    elif mode == 'w' or mode == 'wb':
      uploader = BlobStorageUploader(self.client, filename, mime_type)
      return io.BufferedWriter(
          UploaderStream(uploader, mode=mode), buffer_size=128 * 1024)
    else:
      raise ValueError('Invalid file open mode: %s.' % mode)
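
  # A minimal usage sketch for ``open`` (illustrative only; the client,
  # account, container, and blob names below are hypothetical):
  #
  #   blobstorageio = BlobStorageIO(client=my_blob_service_client)
  #   with blobstorageio.open('azfs://myaccount/mycontainer/data.txt') as f:
  #     contents = f.read()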

  @retry.with_exponential_backoff(
      retry_filter=retry.retry_on_beam_io_error_filter)
  def copy(self, src, dest):
    """Copies a single Azure Blob Storage blob from src to dest.

    Args:
      src: Blob Storage file path pattern in the form
        azfs://<storage-account>/<container>/[name].
      dest: Blob Storage file path pattern in the form
        azfs://<storage-account>/<container>/[name].

    Raises:
      TimeoutError: on timeout.
    """
    src_storage_account, src_container, src_blob = parse_azfs_path(
        src, get_account=True)
    dest_container, dest_blob = parse_azfs_path(dest)

    source_blob = get_azfs_url(src_storage_account, src_container, src_blob)
    copied_blob = self.client.get_blob_client(dest_container, dest_blob)

    try:
      copied_blob.start_copy_from_url(source_blob)
    except ResourceNotFoundError as e:
      message = e.reason
      code = e.status_code
      raise BlobStorageError(message, code)

  # We intentionally do not decorate this method with a retry, since the
  # underlying copy operation is already an idempotent operation protected
  # by retry decorators.
  def copy_tree(self, src, dest):
    """Copies the given Azure Blob Storage directory and its contents
    recursively from src to dest.

    Args:
      src: Blob Storage file path pattern in the form
        azfs://<storage-account>/<container>/[name].
      dest: Blob Storage file path pattern in the form
        azfs://<storage-account>/<container>/[name].

    Returns:
      List of tuples of (src, dest, exception) where exception is None if the
      operation succeeded or the relevant exception if the operation failed.
    """
    assert src.endswith('/')
    assert dest.endswith('/')

    results = []
    for entry in self.list_prefix(src):
      rel_path = entry[len(src):]
      try:
        self.copy(entry, dest + rel_path)
        results.append((entry, dest + rel_path, None))
      except BlobStorageError as e:
        results.append((entry, dest + rel_path, e))

    return results

  # We intentionally do not decorate this method with a retry, since the
  # underlying copy operation is already an idempotent operation protected
  # by retry decorators.
  def copy_paths(self, src_dest_pairs):
    """Copies the given Azure Blob Storage blobs from src to dest. This can
    handle directory or file paths.

    Args:
      src_dest_pairs: List of (src, dest) tuples of
        azfs://<storage-account>/<container>/[name] file paths
        to copy from src to dest.

    Returns:
      List of tuples of (src, dest, exception) in the same order as the
      src_dest_pairs argument, where exception is None if the operation
      succeeded or the relevant exception if the operation failed.
    """
    if not src_dest_pairs:
      return []

    results = []

    for src_path, dest_path in src_dest_pairs:
      # Case 1. Both paths are directories.
      if src_path.endswith('/') and dest_path.endswith('/'):
        try:
          results += self.copy_tree(src_path, dest_path)
        except BlobStorageError as e:
          results.append((src_path, dest_path, e))

      # Case 2. Both paths are individual blobs.
      elif not src_path.endswith('/') and not dest_path.endswith('/'):
        try:
          self.copy(src_path, dest_path)
          results.append((src_path, dest_path, None))
        except BlobStorageError as e:
          results.append((src_path, dest_path, e))

      # Mismatched paths (one directory, one non-directory) get an error.
      else:
        e = BlobStorageError(
            'Unable to copy mismatched paths '
            '(directory, non-directory): %s, %s' % (src_path, dest_path),
            400)
        results.append((src_path, dest_path, e))

    return results
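
  # Illustrative sketch of consuming copy_paths results (all paths below are
  # hypothetical):
  #
  #   pairs = [('azfs://acct/cont/a.txt', 'azfs://acct/cont/b.txt'),
  #            ('azfs://acct/cont/dir1/', 'azfs://acct/cont/dir2/')]
  #   for src, dest, error in blobstorageio.copy_paths(pairs):
  #     if error is not None:
  #       _LOGGER.warning('Copy from %s to %s failed: %s', src, dest, error)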

  # We intentionally do not decorate this method with a retry, since the
  # underlying copy and delete operations are already idempotent operations
  # protected by retry decorators.
  def rename(self, src, dest):
    """Renames the given Azure Blob Storage blob from src to dest.

    Args:
      src: Blob Storage file path pattern in the form
        azfs://<storage-account>/<container>/[name].
      dest: Blob Storage file path pattern in the form
        azfs://<storage-account>/<container>/[name].
    """
    self.copy(src, dest)
    self.delete(src)

  # We intentionally do not decorate this method with a retry, since the
  # underlying copy and delete operations are already idempotent operations
  # protected by retry decorators.
  def rename_files(self, src_dest_pairs):
    """Renames the given Azure Blob Storage blobs from src to dest.

    Args:
      src_dest_pairs: List of (src, dest) tuples of
        azfs://<storage-account>/<container>/[name]
        file paths to rename from src to dest.

    Returns:
      List of tuples of (src, dest, exception) in the same order as the
      src_dest_pairs argument, where exception is None if the operation
      succeeded or the relevant exception if the operation failed.
    """
    if not src_dest_pairs:
      return []

    for src, dest in src_dest_pairs:
      if src.endswith('/') or dest.endswith('/'):
        raise ValueError('Unable to rename a directory.')

    # Results from the copy operation.
    copy_results = self.copy_paths(src_dest_pairs)
    paths_to_delete = \
        [src for (src, _, error) in copy_results if error is None]
    # Results from the delete operation.
    delete_results = self.delete_files(paths_to_delete)

    # Get rename file results (list of tuples).
    results = []

    # Using a dictionary makes the error lookup faster.
    delete_results_dict = {src: error for (src, error) in delete_results}

    for src, dest, error in copy_results:
      # If there was an error in the copy operation.
      if error is not None:
        results.append((src, dest, error))
      # If there was an error in the delete operation.
      elif delete_results_dict[src] is not None:
        results.append((src, dest, delete_results_dict[src]))
      # If there was no error in either operation.
      else:
        results.append((src, dest, None))

    return results
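
  # An observation on the semantics above rather than extra API surface:
  # rename and rename_files are implemented as copy-then-delete, so they are
  # not atomic. A failure between the two steps can leave the source blob in
  # place alongside the already-copied destination blob.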

  def exists(self, path):
    """Returns whether the given Azure Blob Storage blob exists.

    Args:
      path: Azure Blob Storage file path pattern in the form
        azfs://<storage-account>/<container>/[name].
    """
    try:
      self._blob_properties(path)
      return True
    except BlobStorageError as e:
      # _blob_properties converts ResourceNotFoundError into
      # BlobStorageError, so a missing blob arrives here as a 404 code.
      if e.code == 404:
        # HTTP 404 indicates that the file did not exist.
        return False
      else:
        # We re-raise all other exceptions.
        raise

  def size(self, path):
    """Returns the size of a single Blob Storage blob.

    This method does not perform glob expansion. Hence the
    given path must be for a single Blob Storage blob.

    Returns: size of the Blob Storage blob in bytes.
    """
    return self._blob_properties(path).size

  def last_updated(self, path):
    """Returns the last updated epoch time of a single
    Azure Blob Storage blob.

    This method does not perform glob expansion. Hence the
    given path must be for a single Azure Blob Storage blob.

    Returns: last updated time of the Azure Blob Storage blob
    in seconds.
    """
    return self._updated_to_seconds(self._blob_properties(path).last_modified)

  def checksum(self, path):
    """Looks up the checksum of an Azure Blob Storage blob.

    Args:
      path: Azure Blob Storage file path pattern in the form
        azfs://<storage-account>/<container>/[name].
    """
    return self._blob_properties(path).etag

  def _status(self, path):
    """For internal use only; no backwards-compatibility guarantees.

    Returns supported fields (checksum, last_updated, size) of a single object
    as a dict at once.

    This method does not perform glob expansion. Hence the given path must be
    for a single blob.

    Returns: dict of fields of the blob properties.
    """
    properties = self._blob_properties(path)
    file_status = {}
    if hasattr(properties, 'etag'):
      file_status['checksum'] = properties.etag
    if hasattr(properties, 'last_modified'):
      file_status['last_updated'] = self._updated_to_seconds(
          properties.last_modified)
    if hasattr(properties, 'size'):
      file_status['size'] = properties.size
    return file_status

  @retry.with_exponential_backoff(
      retry_filter=retry.retry_on_beam_io_error_filter)
  def _blob_properties(self, path):
    """Returns a blob properties object for the given path.

    This method does not perform glob expansion. Hence the given path must be
    for a single blob.

    Returns: blob properties.
    """
    container, blob = parse_azfs_path(path)
    blob_to_check = self.client.get_blob_client(container, blob)
    try:
      properties = blob_to_check.get_blob_properties()
    except ResourceNotFoundError as e:
      message = e.reason
      code = e.status_code
      raise BlobStorageError(message, code)

    return properties

  @staticmethod
  def _updated_to_seconds(updated):
    """Helper function to convert the last_modified field of a response to
    epoch seconds."""
    return (
        time.mktime(updated.timetuple()) - time.timezone +
        updated.microsecond / 1000000.0)

  @retry.with_exponential_backoff(
      retry_filter=retry.retry_on_beam_io_error_filter)
  def delete(self, path):
    """Deletes a single blob at the given Azure Blob Storage path.

    Args:
      path: Azure Blob Storage file path pattern in the form
        azfs://<storage-account>/<container>/[name].
    """
    container, blob = parse_azfs_path(path)
    blob_to_delete = self.client.get_blob_client(container, blob)
    try:
      blob_to_delete.delete_blob()
    except ResourceNotFoundError as e:
      if e.status_code == 404:
        # Return success when the file doesn't exist anymore, for idempotency.
        return
      else:
        logging.error('HTTP error while deleting file %s', path)
        raise e
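
  # Illustrative sketch of bulk deletion (all paths below are hypothetical).
  # A path ending in '/' deletes every blob under the prefix; a plain path
  # deletes a single blob:
  #
  #   results = blobstorageio.delete_paths([
  #       'azfs://acct/cont/dir/',
  #       'azfs://acct/cont/file.txt',
  #   ])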

  # We intentionally do not decorate this method with a retry, since the
  # underlying delete operations are already idempotent operations protected
  # by retry decorators.
  def delete_paths(self, paths):
    """Deletes the given Azure Blob Storage paths. This can handle
    directory or file paths.

    Args:
      paths: list of Azure Blob Storage paths in the form
        azfs://<storage-account>/<container>/[name] that give the
        blobs (or directories of blobs) to be deleted.

    Returns:
      Dictionary of the form {path: error}, where error is None if the
      operation succeeded or the relevant error code if it failed.
    """
    directories, blobs = [], []

    # Separate directory paths from single-blob paths.
    for path in paths:
      if path.endswith('/'):
        directories.append(path)
      else:
        blobs.append(path)

    results = {}

    for directory in directories:
      directory_result = dict(self.delete_tree(directory))
      results.update(directory_result)

    blobs_results = dict(self.delete_files(blobs))
    results.update(blobs_results)

    return results

  # We intentionally do not decorate this method with a retry, since the
  # underlying delete operations are already idempotent operations protected
  # by retry decorators.
  def delete_tree(self, root):
    """Deletes all blobs under the given Azure Blob Storage virtual
    directory.

    Args:
      root: Azure Blob Storage file path pattern in the form
        azfs://<storage-account>/<container>/[name]
        (ending with a "/").

    Returns:
      List of tuples of (path, exception), where each path is a blob
      under the given root. exception is None if the operation succeeded
      or the relevant exception if the operation failed.
    """
    assert root.endswith('/')

    # Get the blobs under the root directory.
    paths_to_delete = self.list_prefix(root)

    return self.delete_files(paths_to_delete)

  # We intentionally do not decorate this method with a retry, since the
  # underlying delete operations are already idempotent operations protected
  # by retry decorators.
  def delete_files(self, paths):
    """Deletes the given Azure Blob Storage blobs.

    Args:
      paths: list of Azure Blob Storage paths in the form
        azfs://<storage-account>/<container>/[name] that give the
        blobs to be deleted.

    Returns:
      List of tuples of (path, exception) in the same order as the paths
      argument, where exception is None if the operation succeeded or the
      relevant exception if the operation failed.
    """
    if not paths:
      return []

    # Group blobs by container.
    containers, blobs = zip(*[parse_azfs_path(path, get_account=False) \
        for path in paths])

    grouped_blobs = {container: [] for container in containers}

    # Fill the dictionary.
    for container, blob in zip(containers, blobs):
      grouped_blobs[container].append(blob)

    results = {}

    # Delete minibatches of blobs for each container.
    for container, blobs in grouped_blobs.items():
      for i in range(0, len(blobs), MAX_BATCH_OPERATION_SIZE):
        blobs_to_delete = blobs[i:i + MAX_BATCH_OPERATION_SIZE]
        results.update(self._delete_batch(container, blobs_to_delete))

    final_results = \
        [(path, results[parse_azfs_path(path, get_account=False)]) \
        for path in paths]

    return final_results
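
  # Deleting in minibatches of MAX_BATCH_OPERATION_SIZE bounds the amount of
  # work behind each retry-decorated _delete_batch call, so a transient
  # failure retries at most one minibatch instead of the whole request.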

  @retry.with_exponential_backoff(
      retry_filter=retry.retry_on_beam_io_error_filter)
  def _delete_batch(self, container, blobs):
    """A helper method that deletes a minibatch of blobs, all within the same
    container, issuing one request per blob.

    Args:
      container: container name.
      blobs: list of blobs to be deleted.

    Returns:
      Dictionary of the form {(container, blob): error}, where error is
      None if the operation succeeded.
    """
    container_client = self.client.get_container_client(container)
    results = {}

    for blob in blobs:
      try:
        # delete_blob returns None, so a successful deletion stores None.
        response = container_client.delete_blob(blob)
        results[(container, blob)] = response
      except ResourceNotFoundError as e:
        results[(container, blob)] = e.status_code

    return results

  @deprecated(since='2.45.0', current='list_files')
  def list_prefix(self, path, with_metadata=False):
    """Lists files matching the prefix.

    Args:
      path: Azure Blob Storage file path pattern in the form
        azfs://<storage-account>/<container>/[name].
      with_metadata: Experimental. Specify whether to return file metadata.

    Returns:
      If ``with_metadata`` is False: dict of file name -> size; if
      ``with_metadata`` is True: dict of file name -> tuple(size, timestamp).
    """
    file_info = {}
    for file_metadata in self.list_files(path, with_metadata):
      file_info[file_metadata[0]] = file_metadata[1]

    return file_info

  def list_files(self, path, with_metadata=False):
    """Lists files matching the prefix.

    Args:
      path: Azure Blob Storage file path pattern in the form
        azfs://<storage-account>/<container>/[name].
      with_metadata: Experimental. Specify whether to return file metadata.

    Returns:
      If ``with_metadata`` is False: generator of tuple(file name, size); if
      ``with_metadata`` is True: generator of
      tuple(file name, tuple(size, timestamp)).
    """
    storage_account, container, blob = parse_azfs_path(
        path, blob_optional=True, get_account=True)
    file_info = set()
    counter = 0
    start_time = time.time()

    if with_metadata:
      logging.debug("Starting to gather the file information of the input")
    else:
      logging.debug("Starting the size estimation of the input")
    container_client = self.client.get_container_client(container)

    response = retry.with_exponential_backoff(
        retry_filter=retry.retry_on_beam_io_error_filter)(
            container_client.list_blobs)(
                name_starts_with=blob)

    for item in response:
      file_name = "azfs://%s/%s/%s" % (storage_account, container, item.name)
      if file_name not in file_info:
        file_info.add(file_name)
        counter += 1
        if counter % 10000 == 0:
          if with_metadata:
            logging.info(
                "Finished computing file information of: %s files",
                len(file_info))
          else:
            logging.info("Finished computing size of: %s files", len(file_info))
        if with_metadata:
          yield file_name, (
              item.size, self._updated_to_seconds(item.last_modified))
        else:
          yield file_name, item.size

    logging.log(
        # Do not spam logs when list_prefix is likely used to check an
        # empty folder.
        logging.INFO if counter > 0 else logging.DEBUG,
        "Finished listing %s files in %s seconds.",
        counter,
        time.time() - start_time)


class BlobStorageDownloader(Downloader):
  def __init__(self, client, path, buffer_size):
    self._client = client
    self._path = path
    self._container, self._blob = parse_azfs_path(path)
    self._buffer_size = buffer_size

    self._blob_to_download = self._client.get_blob_client(
        self._container, self._blob)

    try:
      properties = self._get_object_properties()
    except ResourceNotFoundError as http_error:
      if http_error.status_code == 404:
        raise IOError(errno.ENOENT, 'Not found: %s' % self._path)
      else:
        _LOGGER.error(
            'HTTP error while requesting file %s: %s', self._path, http_error)
        raise

    self._size = properties.size
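
  # Note: DownloaderStream (from filesystemio) serves reads by calling
  # get_range for successive windows, so the blob is fetched lazily in
  # ranged requests rather than downloaded in one call.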

  @retry.with_exponential_backoff(
      retry_filter=retry.retry_on_beam_io_error_filter)
  def _get_object_properties(self):
    return self._blob_to_download.get_blob_properties()

  @property
  def size(self):
    return self._size

  def get_range(self, start, end):
    # download_blob's first parameter is the offset and its second is the
    # length, so the half-open range [start, end) maps to offset=start,
    # length=end - start.
    blob_data = self._blob_to_download.download_blob(start, end - start)
    # Returns the content as bytes.
    return blob_data.readall()


class BlobStorageUploader(Uploader):
  def __init__(self, client, path, mime_type='application/octet-stream'):
    self._client = client
    self._path = path
    self._container, self._blob = parse_azfs_path(path)
    self._content_settings = ContentSettings(mime_type)

    self._blob_to_upload = self._client.get_blob_client(
        self._container, self._blob)

    # Writes are spooled to a local temporary file; finish() uploads its
    # contents in a single call.
    self._temporary_file = tempfile.NamedTemporaryFile()

  def put(self, data):
    self._temporary_file.write(data.tobytes())

  def finish(self):
    self._temporary_file.seek(0)
    # The temporary file is deleted immediately after the operation.
    with open(self._temporary_file.name, "rb") as f:
      self._blob_to_upload.upload_blob(
          f.read(), overwrite=True, content_settings=self._content_settings)
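

# A minimal write-then-read round trip using the classes above (illustrative
# sketch only; the connection string variable and all names are hypothetical):
#
#   client = BlobServiceClient.from_connection_string(
#       conn_str=os.getenv('AZURE_STORAGE_CONNECTION_STRING'))
#   blobstorageio = BlobStorageIO(client=client)
#   with blobstorageio.open('azfs://myaccount/mycontainer/out.bin', 'wb') as f:
#     f.write(b'payload')
#   with blobstorageio.open('azfs://myaccount/mycontainer/out.bin', 'rb') as f:
#     assert f.read() == b'payload'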