#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""FileSystems interface class for accessing the correct filesystem"""

# pytype: skip-file

import re
from typing import BinaryIO  # pylint: disable=unused-import

from apache_beam.io.filesystem import BeamIOError
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.filesystem import FileSystem
from apache_beam.options.value_provider import RuntimeValueProvider

# All filesystem implementations should be added here as
# best effort imports. We don't want to force loading
# a module if the user doesn't supply the correct
# packages that these filesystems rely on.
#
# pylint: disable=wrong-import-position, unused-import
try:
  from apache_beam.io.hadoopfilesystem import HadoopFileSystem
except ImportError:
  pass

try:
  from apache_beam.io.localfilesystem import LocalFileSystem
except ImportError:
  pass

try:
  from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
except ImportError:
  pass

try:
  from apache_beam.io.aws.s3filesystem import S3FileSystem
except ImportError:
  pass

try:
  from apache_beam.io.azure.blobstoragefilesystem import BlobStorageFileSystem
except ImportError:
  pass

# pylint: enable=wrong-import-position, unused-import

__all__ = ['FileSystems']


class FileSystems(object):
  """A class that defines the functions that can be performed on a filesystem.

  All methods are static (or class methods) and dispatch to the registered
  ``FileSystem`` subclass whose ``scheme()`` matches the scheme of the path
  they are given.
  """
  # Matches ``<scheme>://...`` at the start of a path and captures the scheme.
  URI_SCHEMA_PATTERN = re.compile('(?P<scheme>[a-zA-Z][-a-zA-Z0-9+.]*)://.*')

  # Options handed to every filesystem instance created by ``get_filesystem``;
  # when unset, ``RuntimeValueProvider.runtime_options`` is used instead.
  _pipeline_options = None

  @classmethod
  def set_options(cls, pipeline_options):
    """Set filesystem options.

    Args:
      pipeline_options: Instance of ``PipelineOptions``.
    """
    cls._pipeline_options = pipeline_options

  @staticmethod
  def get_scheme(path):
    """Extract the URI scheme from a path.

    Surrounding whitespace is stripped from ``path`` before matching.

    Args:
      path: string path, e.g. ``scheme://some/location``.

    Returns: the scheme (the text before ``://``) as a string, or ``None`` if
      the path does not start with a ``scheme://`` prefix.
    """
    match_result = FileSystems.URI_SCHEMA_PATTERN.match(path.strip())
    if match_result is None:
      return None
    return match_result.groupdict()['scheme']

  @staticmethod
  def get_filesystem(path):
    # type: (str) -> FileSystem

    """Get the correct filesystem for the specified path.

    The scheme of ``path`` is compared against ``scheme()`` of every subclass
    returned by ``FileSystem.get_all_subclasses()``; the single matching class
    is instantiated with the current pipeline options.

    Args:
      path: string path whose scheme selects the filesystem.

    Returns: an instance of the matching ``FileSystem`` subclass.

    Raises:
      ValueError: if no filesystem, or more than one filesystem, is
        registered for the scheme of ``path``.
      ``BeamIOError``: if any other error occurs while resolving the
        filesystem; the original error is attached as the cause.
    """
    try:
      path_scheme = FileSystems.get_scheme(path)
      systems = [
          fs for fs in FileSystem.get_all_subclasses()
          if fs.scheme() == path_scheme
      ]
      if len(systems) == 0:
        raise ValueError(
            'Unable to get filesystem from specified path, please use the '
            'correct path or ensure the required dependency is installed, '
            'e.g., pip install apache-beam[gcp]. Path specified: %s' % path)
      elif len(systems) == 1:
        # Pipeline options could come either from the Pipeline itself (using
        # direct runner), or via RuntimeValueProvider (other runners).
        options = (
            FileSystems._pipeline_options or
            RuntimeValueProvider.runtime_options)
        return systems[0](pipeline_options=options)
      else:
        raise ValueError('Found more than one filesystem for path %s' % path)
    except ValueError:
      # ValueErrors are part of this method's contract; let them through as-is.
      raise
    except Exception as e:
      # Chain explicitly so the underlying failure is reported as the cause.
      raise BeamIOError('Unable to get the Filesystem', {path: e}) from e

  @staticmethod
  def join(basepath, *paths):
    # type: (str, *str) -> str

    """Join two or more pathname components for the filesystem

    Args:
      basepath: string path of the first component of the path; its scheme
        selects the filesystem used for joining
      paths: path components to be added

    Returns: full path after combining all the passed components
    """
    filesystem = FileSystems.get_filesystem(basepath)
    return filesystem.join(basepath, *paths)

  @staticmethod
  def split(path):
    """Splits the given path into two parts.

    Splits the path into a pair (head, tail) such that tail contains the last
    component of the path and head contains everything up to that.

    For file-systems other than the local file-system, head should include the
    prefix.

    Args:
      path: path as a string
    Returns:
      a pair of path components as strings.
    """
    filesystem = FileSystems.get_filesystem(path)
    return filesystem.split(path)

  @staticmethod
  def mkdirs(path):
    """Recursively create directories for the provided path.

    Args:
      path: string path of the directory structure that should be created

    Raises:
      IOError: if leaf directory already exists.
    """
    filesystem = FileSystems.get_filesystem(path)
    return filesystem.mkdirs(path)

  @staticmethod
  def match(patterns, limits=None):
    """Find all matching paths to the patterns provided.

    Pattern matching is done using each filesystem's ``match`` method (e.g.
    :meth:`.filesystem.FileSystem.match`). The filesystem is selected from the
    first pattern only, and all patterns are passed to that filesystem.

    .. note::
      - Depending on the :class:`.FileSystem` implementation, file listings
        (the ``.FileSystem._list`` method) may not be recursive.
      - If the file listing is not recursive, a pattern like
        ``scheme://path/*/foo`` will not be able to match any files.

    See Also:
      :meth:`.filesystem.FileSystem.match`

    Pattern syntax:
      The pattern syntax is based on the fnmatch_ syntax, with the following
      differences:

      -   ``*`` Is equivalent to ``[^/\\]*`` rather than ``.*``.
      -   ``**`` Is equivalent to ``.*``.

    .. _`fnmatch`: https://docs.python.org/3/library/fnmatch.html

    Args:
      patterns: list of string for the file path pattern to match against
      limits: list of maximum number of responses that need to be fetched

    Returns: list of ``MatchResult`` objects (an empty list if ``patterns``
      is empty).

    Raises:
      ``BeamIOError``: if any of the pattern match operations fail
    """
    if len(patterns) == 0:
      return []
    filesystem = FileSystems.get_filesystem(patterns[0])
    return filesystem.match(patterns, limits)

  @staticmethod
  def create(
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    # type: (...) -> BinaryIO

    """Returns a write channel for the given file path.

    Args:
      path: string path of the file object to be written to the system
      mime_type: MIME type to specify the type of content in the file object
      compression_type: Type of compression to be used for this object. See
        ``CompressionTypes`` for possible values.

    Returns: file handle with a ``close`` function for the user to use.
    """
    filesystem = FileSystems.get_filesystem(path)
    return filesystem.create(path, mime_type, compression_type)

  @staticmethod
  def open(
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    # type: (...) -> BinaryIO

    """Returns a read channel for the given file path.

    Args:
      path: string path of the file object to be read from the system
      mime_type: MIME type to specify the type of content in the file object
      compression_type: Type of compression to be used for this object. See
        ``CompressionTypes`` for possible values.

    Returns: file handle with a ``close`` function for the user to use.
    """
    filesystem = FileSystems.get_filesystem(path)
    return filesystem.open(path, mime_type, compression_type)

  @staticmethod
  def copy(source_file_names, destination_file_names):
    """Recursively copy the file list from the source to the destination

    The filesystem is selected from the first source path only. Nothing is
    done if ``source_file_names`` is empty.

    Args:
      source_file_names: list of source file objects that needs to be copied
      destination_file_names: list of destination of the new object

    Raises:
      ``BeamIOError``: if any of the copy operations fail
    """
    if len(source_file_names) == 0:
      return
    filesystem = FileSystems.get_filesystem(source_file_names[0])
    return filesystem.copy(source_file_names, destination_file_names)

  @staticmethod
  def rename(source_file_names, destination_file_names):
    """Rename the files at the source list to the destination list.
    Source and destination lists should be of the same size.

    The filesystem is selected from the first source path only. Nothing is
    done if ``source_file_names`` is empty.

    Args:
      source_file_names: List of file paths that need to be moved
      destination_file_names: List of destination_file_names for the files

    Raises:
      ``BeamIOError``: if any of the rename operations fail
    """
    if len(source_file_names) == 0:
      return
    filesystem = FileSystems.get_filesystem(source_file_names[0])
    return filesystem.rename(source_file_names, destination_file_names)

  @staticmethod
  def exists(path):
    """Check if the provided path exists on the FileSystem.

    Args:
      path: string path that needs to be checked.

    Returns: boolean flag indicating if path exists
    """
    filesystem = FileSystems.get_filesystem(path)
    return filesystem.exists(path)

  @staticmethod
  def last_updated(path):
    """Get UNIX Epoch time in seconds on the FileSystem.

    Args:
      path: string path of file.

    Returns: float UNIX Epoch time

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    filesystem = FileSystems.get_filesystem(path)
    return filesystem.last_updated(path)

  @staticmethod
  def checksum(path):
    """Fetch checksum metadata of a file on the
    :class:`~apache_beam.io.filesystem.FileSystem`.

    This operation returns checksum metadata as stored in the underlying
    FileSystem. It should not read any file data. Checksum type and format are
    FileSystem dependent and are not compatible between FileSystems.

    Args:
      path: string path of a file.

    Returns: string containing checksum

    Raises:
      ``BeamIOError``: if path isn't a file or doesn't exist.
    """
    filesystem = FileSystems.get_filesystem(path)
    return filesystem.checksum(path)

  @staticmethod
  def delete(paths):
    """Deletes files or directories at the provided paths.
    Directories will be deleted recursively.

    The filesystem is selected from the first path only. Nothing is done if
    ``paths`` is empty.

    Args:
      paths: list of paths that give the file objects to be deleted

    Raises:
      ``BeamIOError``: if a single string is passed instead of a list, or if
        any of the delete operations fail
    """
    # A bare string is also a sequence, so without this guard its first
    # character would be used to select the filesystem.
    if isinstance(paths, str):
      raise BeamIOError(
          'Delete passed string argument instead of list: %s' % paths)
    if len(paths) == 0:
      return
    filesystem = FileSystems.get_filesystem(paths[0])
    return filesystem.delete(paths)

  @staticmethod
  def get_chunk_size(path):
    """Get the correct chunk size for the FileSystem.

    Args:
      path: string path that needs to be checked.

    Returns: integer size for parallelization in the FS operations (the
      ``CHUNK_SIZE`` attribute of the filesystem selected for ``path``).
    """
    filesystem = FileSystems.get_filesystem(path)
    return filesystem.CHUNK_SIZE