github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/azure/blobstoragefilesystem.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

    18  """Azure Blob Storage Implementation for accesing files on
    19  Azure Blob Storage.
    20  """

from apache_beam.io.azure import blobstorageio
from apache_beam.io.filesystem import BeamIOError
from apache_beam.io.filesystem import CompressedFile
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.filesystem import FileMetadata
from apache_beam.io.filesystem import FileSystem

__all__ = ['BlobStorageFileSystem']

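# Typical use is indirect, through the ``FileSystems`` facade, which
# dispatches to this class for ``azfs://`` paths. A minimal sketch
# (assumes ``apache-beam[azure]`` is installed and the blob exists):
#
#   from apache_beam.io.filesystems import FileSystems
#
#   with FileSystems.open('azfs://account/container/blob.txt') as f:
#     data = f.read()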

class BlobStorageFileSystem(FileSystem):
  """An Azure Blob Storage ``FileSystem`` implementation for accessing
  files on Azure Blob Storage.
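
  Paths are expected to take the form
  ``azfs://<storage-account>/<container>/<path>``.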
  """

  CHUNK_SIZE = blobstorageio.MAX_BATCH_OPERATION_SIZE
  AZURE_FILE_SYSTEM_PREFIX = 'azfs://'

  def __init__(self, pipeline_options):
    super().__init__(pipeline_options)
    self._pipeline_options = pipeline_options

  @classmethod
  def scheme(cls):
    """URI scheme for the FileSystem
    """
    return 'azfs'

  def join(self, basepath, *paths):
    """Join two or more pathname components for the filesystem

    Args:
      basepath: string path of the first component of the path
      paths: path components to be added

    Returns: full path after combining all the passed components
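
    For example (a sketch; account and container names are illustrative)::

      fs.join('azfs://account/container', 'path', 'to/file')
      # -> 'azfs://account/container/path/to/file'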
    """
    if not basepath.startswith(BlobStorageFileSystem.AZURE_FILE_SYSTEM_PREFIX):
      raise ValueError(
          'Basepath %r must be an Azure Blob Storage path.' % basepath)

    path = basepath
    for p in paths:
      path = path.rstrip('/') + '/' + p.lstrip('/')
    return path

  def split(self, path):
    """Splits the given path into two parts.

    Splits the path into a pair (head, tail) such that tail contains the last
    component of the path and head contains everything up to that.
    For file-systems other than the local file-system, head should include the
    prefix.

    Args:
      path: path as a string

    Returns:
      a pair of path components as strings.
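
    For example (illustrative paths)::

      fs.split('azfs://account/container/path/to/file')
      # -> ('azfs://account/container/path/to', 'file')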
    """
    path = path.strip()
    if not path.startswith(BlobStorageFileSystem.AZURE_FILE_SYSTEM_PREFIX):
      raise ValueError('Path %r must be an Azure Blob Storage path.' % path)

    prefix_len = len(BlobStorageFileSystem.AZURE_FILE_SYSTEM_PREFIX)
    last_sep = path[prefix_len:].rfind('/')
    if last_sep >= 0:
      last_sep += prefix_len

    if last_sep > 0:
      return (path[:last_sep], path[last_sep + 1:])
    elif last_sep < 0:
      return (path, '')
    else:
      raise ValueError('Invalid path: %s' % path)

  def mkdirs(self, path):
    """Recursively create directories for the provided path.

    Args:
      path: string path of the directory structure that should be created

    Raises:
      IOError: if leaf directory already exists.
    """
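    # Blob Storage has no real directories (``has_dirs`` is ``False``),
    # so there is nothing to create; this is intentionally a no-op.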
    pass

  def has_dirs(self):
    """Whether this FileSystem supports directories."""
    return False

  def _list(self, dir_or_prefix):
    """List files in a location.

    Listing is non-recursive (for filesystems that support directories).

    Args:
      dir_or_prefix: (string) A directory or location prefix (for filesystems
        that don't have directories).

    Returns:
      Generator of ``FileMetadata`` objects.

    Raises:
      ``BeamIOError``: if listing fails, but not if no files were found.
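
    For example (a sketch with illustrative paths)::

      for metadata in fs._list('azfs://account/container/logs/'):
        print(metadata.path, metadata.size_in_bytes)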
    """
    try:
      for path, (size, updated) in self._blobstorageIO().list_files(
          dir_or_prefix, with_metadata=True):
        yield FileMetadata(path, size, updated)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("List operation failed", {dir_or_prefix: e})

  def _blobstorageIO(self):
    return blobstorageio.BlobStorageIO(pipeline_options=self._pipeline_options)

  def _path_open(
      self,
      path,
      mode,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    """Helper function to open a file in the provided mode.
    """
    compression_type = FileSystem._get_compression_type(path, compression_type)
    mime_type = CompressionTypes.mime_type(compression_type, mime_type)
    raw_file = self._blobstorageIO().open(path, mode, mime_type=mime_type)
    if compression_type == CompressionTypes.UNCOMPRESSED:
      return raw_file
    return CompressedFile(raw_file, compression_type=compression_type)

  def create(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    # type: (...) -> BinaryIO # noqa: F821

    """Returns a write channel for the given file path.

    Args:
      path: string path of the file object to be written to the system
      mime_type: MIME type to specify the type of content in the file object
      compression_type: Type of compression to be used for this object

    Returns: file handle with a close function for the user to use
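
    For example (illustrative path; ``CompressionTypes.AUTO`` infers gzip
    from the ``.gz`` suffix)::

      with fs.create('azfs://account/container/out.txt.gz') as f:
        f.write(b'hello world')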
    """
    return self._path_open(path, 'wb', mime_type, compression_type)

  def open(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    # type: (...) -> BinaryIO # noqa: F821

    """Returns a read channel for the given file path.

    Args:
      path: string path of the file object to be read
      mime_type: MIME type to specify the type of content in the file object
      compression_type: Type of compression to be used for this object

    Returns: file handle with a close function for the user to use
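
    For example (illustrative path)::

      with fs.open('azfs://account/container/out.txt.gz') as f:
        data = f.read()  # decompressed bytes, since compression is inferred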
    """
    return self._path_open(path, 'rb', mime_type, compression_type)

  def copy(self, source_file_names, destination_file_names):
    """Recursively copy the file tree from the source to the destination

    Args:
      source_file_names: list of source file objects that need to be copied
      destination_file_names: list of destinations for the new objects

    Raises:
      ``BeamIOError``: if any of the copy operations fail
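
    For example (illustrative paths; both lists must be the same length)::

      fs.copy(['azfs://account/container/src.txt'],
              ['azfs://account/container/dst.txt'])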
    """
    if len(source_file_names) != len(destination_file_names):
      message = 'Unable to copy unequal number of sources and destinations.'
      raise BeamIOError(message)
    src_dest_pairs = list(zip(source_file_names, destination_file_names))
    return self._blobstorageIO().copy_paths(src_dest_pairs)

  def rename(self, source_file_names, destination_file_names):
    """Rename the files at the source list to the destination list.
    Source and destination lists should be of the same size.

    Args:
      source_file_names: List of file paths that need to be moved
      destination_file_names: List of destination paths for the files

    Raises:
      ``BeamIOError``: if any of the rename operations fail
    """
    if len(source_file_names) != len(destination_file_names):
      message = 'Unable to rename unequal number of sources and destinations.'
      raise BeamIOError(message)
    src_dest_pairs = list(zip(source_file_names, destination_file_names))
    results = self._blobstorageIO().rename_files(src_dest_pairs)
    # Retrieve exceptions.
    exceptions = {(src, dest): error
                  for (src, dest, error) in results if error is not None}
    if exceptions:
      raise BeamIOError("Rename operation failed.", exceptions)

  def exists(self, path):
    """Check if the provided path exists on the FileSystem.

    Args:
      path: string path that needs to be checked.

    Returns: boolean flag indicating if path exists
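
    For example (illustrative path)::

      if fs.exists('azfs://account/container/blob.txt'):
        n_bytes = fs.size('azfs://account/container/blob.txt')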
    """
    try:
      return self._blobstorageIO().exists(path)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("Exists operation failed", {path: e})

  def size(self, path):
    """Get size in bytes of a file on the FileSystem.

    Args:
      path: string filepath of file.

    Returns: int size of file according to the FileSystem.

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    try:
      return self._blobstorageIO().size(path)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("Size operation failed", {path: e})

  def last_updated(self, path):
    """Get the last-updated time of a file on the FileSystem, as UNIX
    Epoch time in seconds.

    Args:
      path: string path of file.

    Returns: float UNIX Epoch time

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    try:
      return self._blobstorageIO().last_updated(path)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("Last updated operation failed", {path: e})

  def checksum(self, path):
    """Fetch checksum metadata of a file on the
    :class:`~apache_beam.io.filesystem.FileSystem`.

    Args:
      path: string path of a file.

    Returns: string containing checksum

    Raises:
      ``BeamIOError``: if path isn't a file or doesn't exist.
    """
    try:
      return self._blobstorageIO().checksum(path)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("Checksum operation failed", {path: e})
   284  
   285    def metadata(self, path):
   286      """Fetch metadata fields of a file on the FileSystem.
   287  
   288      Args:
   289        path: string path of a file.
   290  
   291      Returns:
   292        :class:`~apache_beam.io.filesystem.FileMetadata`.
   293  
   294      Raises:
   295        ``BeamIOError``: if path isn't a file or doesn't exist.
   296      """
   297      try:
   298        file_metadata = self._blobstorageIO()._status(path)
   299        return FileMetadata(
   300            path, file_metadata['size'], file_metadata['last_updated'])
   301      except Exception as e:  # pylint: disable=broad-except
   302        raise BeamIOError("Metadata operation failed", {path: e})
   303  
   304    def delete(self, paths):
   305      """Deletes files or directories at the provided paths.
   306      Directories will be deleted recursively.
   307  
   308      Args:
   309        paths: list of paths that give the file objects to be deleted
   310  
   311      Raises:
   312        ``BeamIOError``: if any of the delete operations fail
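
    For example (a sketch; ``BeamIOError.exception_details`` maps each
    failed path to its error)::

      try:
        fs.delete(['azfs://account/container/stale.txt'])
      except BeamIOError as e:
        print(e.exception_details)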
    """
    results = self._blobstorageIO().delete_paths(paths)
    # Retrieve exceptions.
    exceptions = {
        path: error
        for (path, error) in results.items() if error is not None
    }

    if exceptions:
      raise BeamIOError("Delete operation failed", exceptions)