github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/gcsfilesystem.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""GCS file system implementation for accessing files on GCS.

**Updates to the I/O connector code**

For any significant updates to this I/O connector, please consider involving
corresponding code reviewers mentioned in
https://github.com/apache/beam/blob/master/sdks/python/OWNERS
"""

# pytype: skip-file

from typing import BinaryIO  # pylint: disable=unused-import

from apache_beam.io.filesystem import BeamIOError
from apache_beam.io.filesystem import CompressedFile
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.filesystem import FileMetadata
from apache_beam.io.filesystem import FileSystem
from apache_beam.io.gcp import gcsio

__all__ = ['GCSFileSystem']


class GCSFileSystem(FileSystem):
  """A GCS ``FileSystem`` implementation for accessing files on GCS."""

  CHUNK_SIZE = gcsio.MAX_BATCH_OPERATION_SIZE  # Chunk size in batch operations
  GCS_PREFIX = 'gs://'

  def __init__(self, pipeline_options):
    super().__init__(pipeline_options)
    self._pipeline_options = pipeline_options

  @classmethod
  def scheme(cls):
    """URI scheme for the FileSystem."""
    return 'gs'

  def join(self, basepath, *paths):
    """Join two or more pathname components for the filesystem.

    Args:
      basepath: string path of the first component of the path
      paths: path components to be added

    Returns: full path after combining all the passed components
    """
    if not basepath.startswith(GCSFileSystem.GCS_PREFIX):
      raise ValueError('Basepath %r must be a GCS path.' % basepath)
    path = basepath
    for p in paths:
      path = path.rstrip('/') + '/' + p.lstrip('/')
    return path

  def split(self, path):
    """Splits the given path into two parts.

    Splits the path into a pair (head, tail) such that tail contains the last
    component of the path and head contains everything up to that.

    Head will include the GCS prefix ('gs://').

    Args:
      path: path as a string
    Returns:
      a pair of path components as strings.
    """
    path = path.strip()
    if not path.startswith(GCSFileSystem.GCS_PREFIX):
      raise ValueError('Path %r must be a GCS path.' % path)

    prefix_len = len(GCSFileSystem.GCS_PREFIX)
    last_sep = path[prefix_len:].rfind('/')
    if last_sep >= 0:
      last_sep += prefix_len

    if last_sep > 0:
      return (path[:last_sep], path[last_sep + 1:])
    elif last_sep < 0:
      return (path, '')
    else:
      raise ValueError('Invalid path: %s' % path)
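  # A minimal usage sketch for the path helpers above (bucket and object
  # names are hypothetical; join and split perform no I/O):
  #
  #   from apache_beam.options.pipeline_options import PipelineOptions
  #   fs = GCSFileSystem(PipelineOptions())
  #   full = fs.join('gs://example-bucket/data', 'logs', 'part-0.txt')
  #   # full == 'gs://example-bucket/data/logs/part-0.txt'
  #   head, tail = fs.split(full)
  #   # head == 'gs://example-bucket/data/logs', tail == 'part-0.txt'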
  def mkdirs(self, path):
    """Recursively create directories for the provided path.

    Args:
      path: string path of the directory structure that should be created

    Raises:
      IOError: if leaf directory already exists.
    """
    # GCS has no real directories; object prefixes exist implicitly, so there
    # is nothing to create.
    pass

  def has_dirs(self):
    """Whether this FileSystem supports directories."""
    return False

  def _list(self, dir_or_prefix):
    """List files in a location.

    Listing is non-recursive, for filesystems that support directories.

    Args:
      dir_or_prefix: (string) A directory or location prefix (for filesystems
        that don't have directories).

    Returns:
      Generator of ``FileMetadata`` objects.

    Raises:
      ``BeamIOError``: if listing fails, but not if no files were found.
    """
    try:
      for path, (size, updated) in self._gcsIO().list_files(dir_or_prefix,
                                                            with_metadata=True):
        yield FileMetadata(path, size, updated)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("List operation failed", {dir_or_prefix: e})

  def _gcsIO(self):
    return gcsio.GcsIO(pipeline_options=self._pipeline_options)

  def _path_open(
      self,
      path,
      mode,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    """Helper function to open a file in the provided mode."""
    compression_type = FileSystem._get_compression_type(path, compression_type)
    mime_type = CompressionTypes.mime_type(compression_type, mime_type)
    raw_file = self._gcsIO().open(path, mode, mime_type=mime_type)
    if compression_type == CompressionTypes.UNCOMPRESSED:
      return raw_file
    return CompressedFile(raw_file, compression_type=compression_type)

  def create(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    # type: (...) -> BinaryIO

    """Returns a write channel for the given file path.

    Args:
      path: string path of the file object to be written to the system
      mime_type: MIME type to specify the type of content in the file object
      compression_type: Type of compression to be used for this object

    Returns: file handle with a close function for the user to use
    """
    return self._path_open(path, 'wb', mime_type, compression_type)
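  # A minimal sketch of writing through ``create`` (hypothetical bucket name;
  # CompressionTypes.AUTO infers gzip from the '.gz' suffix, so the bytes are
  # compressed transparently by CompressedFile on the way out):
  #
  #   fs = GCSFileSystem(PipelineOptions())
  #   with fs.create('gs://example-bucket/out/data.txt.gz') as f:
  #     f.write(b'hello world\n')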
  def open(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    # type: (...) -> BinaryIO

    """Returns a read channel for the given file path.

    Args:
      path: string path of the file object to be read from the system
      mime_type: MIME type to specify the type of content in the file object
      compression_type: Type of compression to be used for this object

    Returns: file handle with a close function for the user to use
    """
    return self._path_open(path, 'rb', mime_type, compression_type)

  def copy(self, source_file_names, destination_file_names):
    """Recursively copy the file tree from the source to the destination.

    Args:
      source_file_names: list of source file objects that need to be copied
      destination_file_names: list of destination paths for the new objects

    Raises:
      ``BeamIOError``: if any of the copy operations fail
    """
    err_msg = (
        "source_file_names and destination_file_names should "
        "be equal in length")
    assert len(source_file_names) == len(destination_file_names), err_msg

    def _copy_path(source, destination):
      """Recursively copy the file tree from the source to the destination."""
      if not destination.startswith(GCSFileSystem.GCS_PREFIX):
        raise ValueError('Destination %r must be a GCS path.' % destination)
      # Use copytree if the path ends with / as it is a directory
      if source.endswith('/'):
        self._gcsIO().copytree(source, destination)
      else:
        self._gcsIO().copy(source, destination)

    exceptions = {}
    for source, destination in zip(source_file_names, destination_file_names):
      try:
        _copy_path(source, destination)
      except Exception as e:  # pylint: disable=broad-except
        exceptions[(source, destination)] = e

    if exceptions:
      raise BeamIOError("Copy operation failed", exceptions)

  def rename(self, source_file_names, destination_file_names):
    """Rename the files at the source list to the destination list.

    Source and destination lists should be of the same size.

    Args:
      source_file_names: List of file paths that need to be moved
      destination_file_names: List of destination_file_names for the files

    Raises:
      ``BeamIOError``: if any of the rename operations fail
    """
    err_msg = (
        "source_file_names and destination_file_names should "
        "be equal in length")
    assert len(source_file_names) == len(destination_file_names), err_msg

    gcs_batches = []
    gcs_current_batch = []
    for src, dest in zip(source_file_names, destination_file_names):
      gcs_current_batch.append((src, dest))
      if len(gcs_current_batch) == self.CHUNK_SIZE:
        gcs_batches.append(gcs_current_batch)
        gcs_current_batch = []
    if gcs_current_batch:
      gcs_batches.append(gcs_current_batch)

    # Execute GCS renames if any and return exceptions.
    exceptions = {}
    for batch in gcs_batches:
      copy_statuses = self._gcsIO().copy_batch(batch)
      copy_succeeded = []
      for src, dest, exception in copy_statuses:
        if exception:
          exceptions[(src, dest)] = exception
        else:
          copy_succeeded.append((src, dest))
      delete_batch = [src for src, dest in copy_succeeded]
      delete_statuses = self._gcsIO().delete_batch(delete_batch)
      for i, (src, exception) in enumerate(delete_statuses):
        dest = copy_succeeded[i][1]
        if exception:
          exceptions[(src, dest)] = exception

    if exceptions:
      raise BeamIOError("Rename operation failed", exceptions)
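  # A sketch of handling partial failures (paths hypothetical): batch renames
  # aggregate per-pair errors into ``BeamIOError.exception_details``, so a
  # caller can inspect or retry only the pairs that failed:
  #
  #   try:
  #     fs.rename(['gs://example-bucket/a'], ['gs://example-bucket/b'])
  #   except BeamIOError as err:
  #     for (src, dest), exc in err.exception_details.items():
  #       print('rename %s -> %s failed: %s' % (src, dest, exc))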
  def exists(self, path):
    """Check if the provided path exists on the FileSystem.

    Args:
      path: string path that needs to be checked.

    Returns: boolean flag indicating if path exists
    """
    return self._gcsIO().exists(path)

  def size(self, path):
    """Get size of path on the FileSystem.

    Args:
      path: string path in question.

    Returns: int size of path according to the FileSystem.

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    return self._gcsIO().size(path)

  def last_updated(self, path):
    """Get the last-updated time of the path as UNIX Epoch time in seconds.

    Args:
      path: string path of file.

    Returns: float UNIX Epoch time

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    return self._gcsIO().last_updated(path)

  def checksum(self, path):
    """Fetch checksum metadata of a file on the
    :class:`~apache_beam.io.filesystem.FileSystem`.

    Args:
      path: string path of a file.

    Returns: string containing checksum

    Raises:
      ``BeamIOError``: if path isn't a file or doesn't exist.
    """
    try:
      return self._gcsIO().checksum(path)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("Checksum operation failed", {path: e})

  def metadata(self, path):
    """Fetch metadata fields of a file on the FileSystem.

    Args:
      path: string path of a file.

    Returns:
      :class:`~apache_beam.io.filesystem.FileMetadata`.

    Raises:
      ``BeamIOError``: if path isn't a file or doesn't exist.
    """
    try:
      file_metadata = self._gcsIO()._status(path)
      return FileMetadata(
          path, file_metadata['size'], file_metadata['last_updated'])
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("Metadata operation failed", {path: e})

  def delete(self, paths):
    """Deletes files or directories at the provided paths.

    Directories will be deleted recursively.

    Args:
      paths: list of paths that give the file objects to be deleted
    """
    def _delete_path(path):
      """Recursively delete the file or directory at the provided path."""
      if path.endswith('/'):
        path_to_use = path + '*'
      else:
        path_to_use = path
      match_result = self.match([path_to_use])[0]
      statuses = self._gcsIO().delete_batch(
          [m.path for m in match_result.metadata_list])
      # pylint: disable=used-before-assignment
      failures = [e for (_, e) in statuses if e is not None]
      if failures:
        raise failures[0]

    exceptions = {}
    for path in paths:
      try:
        _delete_path(path)
      except Exception as e:  # pylint: disable=broad-except
        exceptions[path] = e

    if exceptions:
      raise BeamIOError("Delete operation failed", exceptions)
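# A minimal end-to-end sketch (hypothetical bucket; assumes ambient GCP
# credentials): check existence, read with transparent decompression, then
# delete.
#
#   from apache_beam.options.pipeline_options import PipelineOptions
#   fs = GCSFileSystem(PipelineOptions())
#   gcs_path = 'gs://example-bucket/out/data.txt.gz'
#   if fs.exists(gcs_path):
#     with fs.open(gcs_path) as f:
#       print(f.read())
#     fs.delete([gcs_path])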