github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/azure/blobstoragefilesystem.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Azure Blob Storage implementation for accessing files on
Azure Blob Storage.
"""

from apache_beam.io.azure import blobstorageio
from apache_beam.io.filesystem import BeamIOError
from apache_beam.io.filesystem import CompressedFile
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.filesystem import FileMetadata
from apache_beam.io.filesystem import FileSystem

__all__ = ['BlobStorageFileSystem']


class BlobStorageFileSystem(FileSystem):
  """An Azure Blob Storage ``FileSystem`` implementation for accessing files
  on Azure Blob Storage.
  """

  CHUNK_SIZE = blobstorageio.MAX_BATCH_OPERATION_SIZE
  AZURE_FILE_SYSTEM_PREFIX = 'azfs://'

  def __init__(self, pipeline_options):
    super().__init__(pipeline_options)
    self._pipeline_options = pipeline_options

  @classmethod
  def scheme(cls):
    """URI scheme for the FileSystem."""
    return 'azfs'

  def join(self, basepath, *paths):
    """Join two or more pathname components for the filesystem.

    Args:
      basepath: string path of the first component of the path
      paths: path components to be added

    Returns: full path after combining all the passed components
    """
    if not basepath.startswith(BlobStorageFileSystem.AZURE_FILE_SYSTEM_PREFIX):
      raise ValueError(
          'Basepath %r must be an Azure Blob Storage path.' % basepath)

    path = basepath
    for p in paths:
      path = path.rstrip('/') + '/' + p.lstrip('/')
    return path

  def split(self, path):
    """Splits the given path into two parts.

    Splits the path into a pair (head, tail) such that tail contains the last
    component of the path and head contains everything up to that.
    For file-systems other than the local file-system, head should include the
    prefix.

    Args:
      path: path as a string

    Returns:
      a pair of path components as strings.
    """
    path = path.strip()
    if not path.startswith(BlobStorageFileSystem.AZURE_FILE_SYSTEM_PREFIX):
      raise ValueError('Path %r must be an Azure Blob Storage path.' % path)

    prefix_len = len(BlobStorageFileSystem.AZURE_FILE_SYSTEM_PREFIX)
    last_sep = path[prefix_len:].rfind('/')
    if last_sep >= 0:
      last_sep += prefix_len

    if last_sep > 0:
      return (path[:last_sep], path[last_sep + 1:])
    elif last_sep < 0:
      return (path, '')
    else:
      raise ValueError('Invalid path: %s' % path)
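
  # Illustrative sketch: ``join`` and ``split`` operate purely on the
  # azfs:// string, so they can be exercised without Azure credentials.
  # The account and container names below are placeholders.
  #
  #   from apache_beam.options.pipeline_options import PipelineOptions
  #
  #   fs = BlobStorageFileSystem(pipeline_options=PipelineOptions())
  #   fs.join('azfs://account/container', 'dir', 'file.txt')
  #   # -> 'azfs://account/container/dir/file.txt'
  #   fs.split('azfs://account/container/dir/file.txt')
  #   # -> ('azfs://account/container/dir', 'file.txt')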

  def mkdirs(self, path):
    """Recursively create directories for the provided path.

    Args:
      path: string path of the directory structure that should be created

    Raises:
      IOError: if leaf directory already exists.
    """
    # Blob Storage has no real directories, so there is nothing to create.
    pass

  def has_dirs(self):
    """Whether this FileSystem supports directories."""
    return False

  def _list(self, dir_or_prefix):
    """List files in a location.

    Listing is non-recursive (for filesystems that support directories).

    Args:
      dir_or_prefix: (string) A directory or location prefix (for filesystems
        that don't have directories).

    Returns:
      Generator of ``FileMetadata`` objects.

    Raises:
      ``BeamIOError``: if listing fails, but not if no files were found.
    """
    try:
      for path, (size, updated) in self._blobstorageIO().list_files(
          dir_or_prefix, with_metadata=True):
        yield FileMetadata(path, size, updated)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("List operation failed", {dir_or_prefix: e})

  def _blobstorageIO(self):
    return blobstorageio.BlobStorageIO(pipeline_options=self._pipeline_options)

  def _path_open(
      self,
      path,
      mode,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    """Helper function to open a file in the provided mode."""
    compression_type = FileSystem._get_compression_type(path, compression_type)
    mime_type = CompressionTypes.mime_type(compression_type, mime_type)
    raw_file = self._blobstorageIO().open(path, mode, mime_type=mime_type)
    if compression_type == CompressionTypes.UNCOMPRESSED:
      return raw_file
    return CompressedFile(raw_file, compression_type=compression_type)

  def create(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    # type: (...) -> BinaryIO # noqa: F821

    """Returns a write channel for the given file path.

    Args:
      path: string path of the file object to be written to the system
      mime_type: MIME type to specify the type of content in the file object
      compression_type: Type of compression to be used for this object

    Returns: file handle with a close function for the user to use
    """
    return self._path_open(path, 'wb', mime_type, compression_type)

  def open(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    # type: (...) -> BinaryIO # noqa: F821

    """Returns a read channel for the given file path.

    Args:
      path: string path of the file object to be read
      mime_type: MIME type to specify the type of content in the file object
      compression_type: Type of compression to be used for this object

    Returns: file handle with a close function for the user to use
    """
    return self._path_open(path, 'rb', mime_type, compression_type)
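
  # Illustrative sketch: with ``CompressionTypes.AUTO`` (the default),
  # ``create`` and ``open`` infer the codec from the file extension, so a
  # round trip through a ``.gz`` path is transparently gzip-compressed.
  # Assumes Azure credentials are configured; paths are placeholders.
  #
  #   fs = BlobStorageFileSystem(pipeline_options=PipelineOptions())
  #   with fs.create('azfs://account/container/logs.txt.gz') as f:
  #     f.write(b'hello')
  #   with fs.open('azfs://account/container/logs.txt.gz') as f:
  #     assert f.read() == b'hello'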

  def copy(self, source_file_names, destination_file_names):
    """Recursively copy the file tree from the source to the destination.

    Args:
      source_file_names: list of source file objects that need to be copied
      destination_file_names: list of destination paths for the new objects

    Raises:
      ``BeamIOError``: if any of the copy operations fail
    """
    if len(source_file_names) != len(destination_file_names):
      message = 'Unable to copy unequal number of sources and destinations.'
      raise BeamIOError(message)
    src_dest_pairs = list(zip(source_file_names, destination_file_names))
    return self._blobstorageIO().copy_paths(src_dest_pairs)

  def rename(self, source_file_names, destination_file_names):
    """Rename the files at the source list to the destination list.

    Source and destination lists should be of the same size.

    Args:
      source_file_names: List of file paths that need to be moved
      destination_file_names: List of destination paths for the files

    Raises:
      ``BeamIOError``: if any of the rename operations fail
    """
    if len(source_file_names) != len(destination_file_names):
      message = 'Unable to rename unequal number of sources and destinations.'
      raise BeamIOError(message)
    src_dest_pairs = list(zip(source_file_names, destination_file_names))
    results = self._blobstorageIO().rename_files(src_dest_pairs)
    # Retrieve exceptions.
    exceptions = {(src, dest): error
                  for (src, dest, error) in results if error is not None}
    if exceptions:
      raise BeamIOError("Rename operation failed.", exceptions)

  def exists(self, path):
    """Check if the provided path exists on the FileSystem.

    Args:
      path: string path that needs to be checked.

    Returns: boolean flag indicating if path exists
    """
    try:
      return self._blobstorageIO().exists(path)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("Exists operation failed", {path: e})

  def size(self, path):
    """Get size in bytes of a file on the FileSystem.

    Args:
      path: string filepath of file.

    Returns: int size of file according to the FileSystem.

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    try:
      return self._blobstorageIO().size(path)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("Size operation failed", {path: e})

  def last_updated(self, path):
    """Get UNIX Epoch time in seconds of the file's last update.

    Args:
      path: string path of file.

    Returns: float UNIX Epoch time

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    try:
      return self._blobstorageIO().last_updated(path)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("Last updated operation failed", {path: e})

  def checksum(self, path):
    """Fetch checksum metadata of a file on the
    :class:`~apache_beam.io.filesystem.FileSystem`.

    Args:
      path: string path of a file.

    Returns: string containing checksum

    Raises:
      ``BeamIOError``: if path isn't a file or doesn't exist.
    """
    try:
      return self._blobstorageIO().checksum(path)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("Checksum operation failed", {path: e})

  def metadata(self, path):
    """Fetch metadata fields of a file on the FileSystem.

    Args:
      path: string path of a file.

    Returns:
      :class:`~apache_beam.io.filesystem.FileMetadata`.

    Raises:
      ``BeamIOError``: if path isn't a file or doesn't exist.
    """
    try:
      file_metadata = self._blobstorageIO()._status(path)
      return FileMetadata(
          path, file_metadata['size'], file_metadata['last_updated'])
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("Metadata operation failed", {path: e})
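
  # Illustrative sketch: the batch operations above report per-path failures
  # through ``BeamIOError.exception_details`` instead of failing fast.
  # Assumes Azure credentials are configured; paths are placeholders.
  #
  #   fs = BlobStorageFileSystem(pipeline_options=PipelineOptions())
  #   try:
  #     fs.rename(['azfs://account/container/a'],
  #               ['azfs://account/container/b'])
  #   except BeamIOError as e:
  #     for (src, dest), error in e.exception_details.items():
  #       print('Failed to rename %s -> %s: %s' % (src, dest, error))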

  def delete(self, paths):
    """Deletes files or directories at the provided paths.

    Directories will be deleted recursively.

    Args:
      paths: list of paths that give the file objects to be deleted

    Raises:
      ``BeamIOError``: if any of the delete operations fail
    """
    results = self._blobstorageIO().delete_paths(paths)
    # Retrieve exceptions.
    exceptions = {
        path: error
        for (path, error) in results.items() if error is not None
    }

    if exceptions:
      raise BeamIOError("Delete operation failed", exceptions)
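

# Usage sketch: callers typically go through the scheme-dispatching
# ``FileSystems`` facade rather than instantiating this class directly; the
# 'azfs' scheme returned by ``scheme()`` above routes these calls here.
# Assumes Azure credentials are configured via pipeline options; the
# account, container, and blob names are placeholders.
#
#   from apache_beam.io.filesystems import FileSystems
#
#   if FileSystems.exists('azfs://account/container/input.txt'):
#     with FileSystems.open('azfs://account/container/input.txt') as f:
#       print(f.read())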