#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""S3 file system implementation for accessing files on AWS S3."""

# pytype: skip-file

from apache_beam.io.aws import s3io
from apache_beam.io.filesystem import BeamIOError
from apache_beam.io.filesystem import CompressedFile
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.filesystem import FileMetadata
from apache_beam.io.filesystem import FileSystem

__all__ = ['S3FileSystem']


class S3FileSystem(FileSystem):
  """An S3 `FileSystem` implementation for accessing files on AWS S3.

  Every operation builds a fresh ``s3io.S3IO`` client from the pipeline
  options handed to the constructor and delegates the actual work to it.
  """

  # Taken directly from s3io's batch-operation limit.
  CHUNK_SIZE = s3io.MAX_BATCH_OPERATION_SIZE
  S3_PREFIX = 's3://'

  def __init__(self, pipeline_options):
    """Initializes a connection to S3.

    Connection configuration is done by passing pipeline options.
    See :class:`~apache_beam.options.pipeline_options.S3Options`.

    Args:
      pipeline_options: options forwarded to the base class and kept to
        configure every ``s3io.S3IO`` client this object creates.
    """
    super().__init__(pipeline_options)
    self._options = pipeline_options

  @classmethod
  def scheme(cls):
    """URI scheme for the FileSystem.

    Returns: the string ``'s3'``.
    """
    return 's3'

  def join(self, basepath, *paths):
    """Join two or more pathname components for the filesystem.

    Args:
      basepath: string path of the first component of the path
      paths: path components to be added

    Returns: full path after combining all of the passed components

    Raises:
      ValueError: if ``basepath`` does not start with ``'s3://'``.
    """
    if not basepath.startswith(S3FileSystem.S3_PREFIX):
      raise ValueError('Basepath %r must be S3 path.' % basepath)

    path = basepath
    for p in paths:
      # Normalise the joint so exactly one '/' separates the two pieces.
      path = path.rstrip('/') + '/' + p.lstrip('/')
    return path

  def split(self, path):
    """Splits the given path into two parts.

    Splits the path into a pair (head, tail) such that tail contains the last
    component of the path and head contains everything up to that.

    Head will include the S3 prefix ('s3://').

    Args:
      path: path as a string
    Returns:
      a pair of path components as strings. When nothing after the prefix
      contains a '/', the pair is ``(path, '')``.

    Raises:
      ValueError: if the (whitespace-stripped) path does not start with
        ``'s3://'``.
    """
    path = path.strip()
    if not path.startswith(S3FileSystem.S3_PREFIX):
      raise ValueError('Path %r must be S3 path.' % path)

    # Only look for a separator after the scheme prefix, so the slashes of
    # 's3://' itself are never treated as a split point.
    last_sep = path.rfind('/', len(S3FileSystem.S3_PREFIX))
    if last_sep < 0:
      return (path, '')
    return (path[:last_sep], path[last_sep + 1:])

  def mkdirs(self, path):
    """Does nothing for this filesystem.

    ``has_dirs`` reports that this FileSystem does not support directories,
    so there is nothing to create and nothing is raised.

    Args:
      path: string path of the directory structure that should be created
    """

  def has_dirs(self):
    """Whether this FileSystem supports directories."""
    return False

  def _list(self, dir_or_prefix):
    """List files in a location.

    Listing is non-recursive, for filesystems that support directories.

    Args:
      dir_or_prefix: (string) A directory or location prefix (for filesystems
        that don't have directories).

    Returns:
      Generator of ``FileMetadata`` objects.

    Raises:
      ``BeamIOError``: if listing fails, but not if no files were found.
    """
    try:
      listing = s3io.S3IO(options=self._options).list_files(
          dir_or_prefix, with_metadata=True)
      for path, (size, updated) in listing:
        yield FileMetadata(path, size, updated)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("List operation failed", {dir_or_prefix: e}) from e

  def _path_open(
      self,
      path,
      mode,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    """Helper function to open a file in the provided mode.

    Args:
      path: string path of the file object to open
      mode: mode string handed to ``s3io.S3IO.open`` (this class passes
        ``'rb'`` or ``'wb'``)
      mime_type: MIME type to specify the type of content in the file object
      compression_type: Type of compression to be used for this object

    Returns: the raw S3 file object when the resolved compression type is
      ``CompressionTypes.UNCOMPRESSED``, otherwise that file wrapped in a
      ``CompressedFile``.
    """
    compression_type = FileSystem._get_compression_type(path, compression_type)
    mime_type = CompressionTypes.mime_type(compression_type, mime_type)
    raw_file = s3io.S3IO(options=self._options).open(
        path, mode, mime_type=mime_type)
    if compression_type == CompressionTypes.UNCOMPRESSED:
      return raw_file
    return CompressedFile(raw_file, compression_type=compression_type)

  def create(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    """Returns a write channel for the given file path.

    Args:
      path: string path of the file object to be written to the system
      mime_type: MIME type to specify the type of content in the file object
      compression_type: Type of compression to be used for this object

    Returns: file handle with a close function for the user to use
    """
    return self._path_open(path, 'wb', mime_type, compression_type)

  def open(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    """Returns a read channel for the given file path.

    Args:
      path: string path of the file object to be read from the system
      mime_type: MIME type to specify the type of content in the file object
      compression_type: Type of compression to be used for this object

    Returns: file handle with a close function for the user to use
    """
    return self._path_open(path, 'rb', mime_type, compression_type)

  def copy(self, source_file_names, destination_file_names):
    """Recursively copy the file tree from the source to the destination

    Args:
      source_file_names: list of source file objects that needs to be copied
      destination_file_names: list of destination of the new object

    Returns: the per-pair results reported by ``s3io.S3IO.copy_paths`` when
      every copy succeeded.

    Raises:
      ``BeamIOError``: if the two lists differ in length, or if any of the
        copy operations fail
    """
    if len(source_file_names) != len(destination_file_names):
      message = 'Unable to copy unequal number of sources and destinations'
      raise BeamIOError(message)
    src_dest_pairs = list(zip(source_file_names, destination_file_names))
    results = s3io.S3IO(options=self._options).copy_paths(src_dest_pairs)
    # NOTE(review): assumes copy_paths reports one (src, dest, error) tuple
    # per copied object, the same shape rename() consumes from rename_files
    # -- confirm against s3io. Without this check failed copies were handed
    # back to the caller unnoticed instead of raising as documented.
    exceptions = {(src, dest): error
                  for (src, dest, error) in results if error is not None}
    if exceptions:
      raise BeamIOError("Copy operation failed", exceptions)
    return results

  def rename(self, source_file_names, destination_file_names):
    """Rename the files at the source list to the destination list.
    Source and destination lists should be of the same size.

    Args:
      source_file_names: List of file paths that need to be moved
      destination_file_names: List of destination_file_names for the files

    Raises:
      ``BeamIOError``: if the two lists differ in length, or if any of the
        rename operations fail
    """
    if len(source_file_names) != len(destination_file_names):
      message = 'Unable to rename unequal number of sources and destinations'
      raise BeamIOError(message)
    src_dest_pairs = list(zip(source_file_names, destination_file_names))
    results = s3io.S3IO(options=self._options).rename_files(src_dest_pairs)
    # Keep only the pairs whose rename reported an error.
    exceptions = {(src, dest): error
                  for (src, dest, error) in results if error is not None}
    if exceptions:
      raise BeamIOError("Rename operation failed", exceptions)

  def exists(self, path):
    """Check if the provided path exists on the FileSystem.

    Args:
      path: string path that needs to be checked.

    Returns: boolean flag indicating if path exists

    Raises:
      ``BeamIOError``: if the underlying check raises.
    """
    try:
      return s3io.S3IO(options=self._options).exists(path)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("exists() operation failed", {path: e}) from e

  def size(self, path):
    """Get size of path on the FileSystem.

    Args:
      path: string path in question.

    Returns: int size of path according to the FileSystem.

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    try:
      return s3io.S3IO(options=self._options).size(path)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("size() operation failed", {path: e}) from e

  def last_updated(self, path):
    """Get UNIX Epoch time in seconds on the FileSystem.

    Args:
      path: string path of file.

    Returns: float UNIX Epoch time

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    try:
      return s3io.S3IO(options=self._options).last_updated(path)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("last_updated operation failed", {path: e}) from e

  def checksum(self, path):
    """Fetch checksum metadata of a file on the
    :class:`~apache_beam.io.filesystem.FileSystem`.

    Args:
      path: string path of a file.

    Returns: string containing checksum

    Raises:
      ``BeamIOError``: if path isn't a file or doesn't exist.
    """
    try:
      return s3io.S3IO(options=self._options).checksum(path)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("Checksum operation failed", {path: e}) from e

  def metadata(self, path):
    """Fetch metadata fields of a file on the FileSystem.

    Args:
      path: string path of a file.

    Returns:
      :class:`~apache_beam.io.filesystem.FileMetadata`.

    Raises:
      ``BeamIOError``: if path isn't a file or doesn't exist.
    """
    try:
      # The status mapping is read for its 'size' and 'last_updated' entries.
      file_metadata = s3io.S3IO(options=self._options)._status(path)
      return FileMetadata(
          path, file_metadata['size'], file_metadata['last_updated'])
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("Metadata operation failed", {path: e}) from e

  def delete(self, paths):
    """Deletes files or directories at the provided paths.
    Directories will be deleted recursively.

    Args:
      paths: list of paths that give the file objects to be deleted

    Raises:
      ``BeamIOError``: if any of the delete operations fail
    """
    results = s3io.S3IO(options=self._options).delete_paths(paths)
    # delete_paths reports a mapping of path -> error (None on success).
    exceptions = {
        path: error
        for (path, error) in results.items() if error is not None
    }
    if exceptions:
      raise BeamIOError("Delete operation failed", exceptions)