#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Local File system implementation for accessing files on disk."""

# pytype: skip-file

import io
import os
import shutil
from typing import BinaryIO  # pylint: disable=unused-import

from apache_beam.io.filesystem import BeamIOError
from apache_beam.io.filesystem import CompressedFile
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.filesystem import FileMetadata
from apache_beam.io.filesystem import FileSystem

__all__ = ['LocalFileSystem']


class LocalFileSystem(FileSystem):
  """A Local ``FileSystem`` implementation for accessing files on disk.

  All path handling is delegated to ``os``, ``os.path`` and ``shutil``, so
  paths follow the conventions of the host operating system.
  """
  @classmethod
  def scheme(cls):
    """URI scheme for the FileSystem.

    Returns: ``None``; this implementation declares no scheme.
    """
    return None

  def join(self, basepath, *paths):
    """Join two or more pathname components for the filesystem

    Args:
      basepath: string path of the first component of the path
      paths: path components to be added

    Returns: full path after combining all the passed components
    """
    return os.path.join(basepath, *paths)

  def split(self, path):
    """Splits the given path into two parts.

    Splits the path into a pair (head, tail) such that tail contains the last
    component of the path and head contains everything up to that. The path is
    converted to an absolute path before it is split, so ``head`` is always
    absolute.

    Args:
      path: path as a string
    Returns:
      a pair of path components as strings.
    """
    return os.path.split(os.path.abspath(path))

  def mkdirs(self, path):
    """Recursively create directories for the provided path.

    Args:
      path: string path of the directory structure that should be created

    Raises:
      IOError: if leaf directory already exists.
    """
    try:
      os.makedirs(path)
    except OSError as err:
      raise IOError(err) from err

  def has_dirs(self):
    """Whether this FileSystem supports directories."""
    return True

  def _url_dirname(self, url_or_path):
    """Pass through to os.path.dirname.

    This version uses os.path instead of posixpath to be compatible with the
    host OS.

    Args:
      url_or_path: A string in the form of /some/path.

    Returns: the directory portion of ``url_or_path``.
    """
    return os.path.dirname(url_or_path)

  def _list(self, dir_or_prefix):
    """List files in a location.

    The location is traversed with ``os.walk``, so files in nested
    sub-directories are yielded as well. Directories themselves are not
    yielded. Nothing is yielded when ``dir_or_prefix`` does not exist.

    Args:
      dir_or_prefix: (string) A directory or location prefix (for filesystems
        that don't have directories).

    Returns:
      Generator of ``FileMetadata`` objects.

    Raises:
      ``BeamIOError``: if listing fails, but not if no files were found.
    """
    if not self.exists(dir_or_prefix):
      return

    def list_files(root):
      for dirpath, _, files in os.walk(root):
        for filename in files:
          yield self.join(dirpath, filename)

    try:
      for f in list_files(dir_or_prefix):
        try:
          yield FileMetadata(f, os.path.getsize(f), os.path.getmtime(f))
        except OSError:
          # Files may disappear, such as when listing /tmp.
          pass
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("List operation failed", {dir_or_prefix: e})

  def _path_open(
      self,
      path,
      mode,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    """Helper function to open a file in the provided mode.

    Args:
      path: string path of the file to open
      mode: mode string passed straight to ``io.open`` (e.g. ``'rb'``)
      mime_type: accepted for signature compatibility; not used here
      compression_type: requested compression; the effective type is resolved
        by ``FileSystem._get_compression_type``

    Returns: the raw file object when the resolved compression type is
      ``UNCOMPRESSED``, otherwise a ``CompressedFile`` wrapping it.
    """
    compression_type = FileSystem._get_compression_type(path, compression_type)
    raw_file = io.open(path, mode)
    if compression_type == CompressionTypes.UNCOMPRESSED:
      return raw_file
    else:
      return CompressedFile(raw_file, compression_type=compression_type)

  def create(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    # type: (...) -> BinaryIO

    """Returns a write channel for the given file path.

    Missing parent directories of ``path`` are created first.

    Args:
      path: string path of the file object to be written to the system
      mime_type: MIME type to specify the type of content in the file object
      compression_type: Type of compression to be used for this object

    Returns: file handle with a close function for the user to use
    """
    parent_dir = os.path.dirname(path)
    # A bare file name has an empty dirname and os.makedirs('') raises
    # FileNotFoundError; there is nothing to create in that case because the
    # file lands in the current working directory.
    if parent_dir:
      os.makedirs(parent_dir, exist_ok=True)
    return self._path_open(path, 'wb', mime_type, compression_type)

  def open(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    # type: (...) -> BinaryIO

    """Returns a read channel for the given file path.

    Args:
      path: string path of the file object to be read from the system
      mime_type: MIME type to specify the type of content in the file object
      compression_type: Type of compression to be used for this object

    Returns: file handle with a close function for the user to use
    """
    return self._path_open(path, 'rb', mime_type, compression_type)

  def copy(self, source_file_names, destination_file_names):
    """Recursively copy the file tree from the source to the destination

    An existing destination (file or directory) is removed and replaced. Every
    pair is attempted even if an earlier pair failed; failures are collected
    and reported together.

    Args:
      source_file_names: list of source file objects that needs to be copied
      destination_file_names: list of destination of the new object

    Raises:
      ``BeamIOError``: if any of the copy operations fail
    """
    err_msg = (
        "source_file_names and destination_file_names should "
        "be equal in length")
    assert len(source_file_names) == len(destination_file_names), err_msg

    def _copy_path(source, destination):
      """Recursively copy the file tree from the source to the destination
      """
      # Validate the source before touching the destination: otherwise a
      # missing source would delete existing destination data and then fail.
      if not os.path.exists(source):
        raise IOError('Source path does not exist: %s' % source)
      try:
        if os.path.exists(destination):
          if os.path.isdir(destination):
            shutil.rmtree(destination)
          else:
            os.remove(destination)
        if os.path.isdir(source):
          shutil.copytree(source, destination)
        else:
          shutil.copy2(source, destination)
      except OSError as err:
        raise IOError(err) from err

    exceptions = {}
    for source, destination in zip(source_file_names, destination_file_names):
      try:
        _copy_path(source, destination)
      except Exception as e:  # pylint: disable=broad-except
        exceptions[(source, destination)] = e

    if exceptions:
      raise BeamIOError("Copy operation failed", exceptions)

  def rename(self, source_file_names, destination_file_names):
    """Rename the files at the source list to the destination list.
    Source and destination lists should be of the same size.

    Every pair is attempted even if an earlier pair failed; failures are
    collected and reported together.

    Args:
      source_file_names: List of file paths that need to be moved
      destination_file_names: List of destination_file_names for the files

    Raises:
      ``BeamIOError``: if any of the rename operations fail
    """
    err_msg = (
        "source_file_names and destination_file_names should "
        "be equal in length")
    assert len(source_file_names) == len(destination_file_names), err_msg

    def _rename_file(source, destination):
      """Rename a single file object"""
      try:
        os.rename(source, destination)
      except OSError as err:
        raise IOError(err) from err

    exceptions = {}
    for source, destination in zip(source_file_names, destination_file_names):
      try:
        _rename_file(source, destination)
      except Exception as e:  # pylint: disable=broad-except
        exceptions[(source, destination)] = e

    if exceptions:
      raise BeamIOError("Rename operation failed", exceptions)

  def exists(self, path):
    """Check if the provided path exists on the FileSystem.

    Args:
      path: string path that needs to be checked.

    Returns: boolean flag indicating if path exists
    """
    return os.path.exists(path)

  def size(self, path):
    """Get size of path on the FileSystem.

    Args:
      path: string path in question.

    Returns: int size of path according to the FileSystem.

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    try:
      return os.path.getsize(path)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("Size operation failed", {path: e})

  def last_updated(self, path):
    """Get UNIX Epoch time in seconds on the FileSystem.

    Args:
      path: string path of file.

    Returns: float UNIX Epoch time

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    if not self.exists(path):
      raise BeamIOError('Path does not exist: %s' % path)
    return os.path.getmtime(path)

  def checksum(self, path):
    """Fetch checksum metadata of a file on the
    :class:`~apache_beam.io.filesystem.FileSystem`.

    The local filesystem keeps no content hash, so the file size (as a
    string) is used as the checksum value.

    Args:
      path: string path of a file.

    Returns: string containing file size.

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    if not self.exists(path):
      raise BeamIOError('Path does not exist: %s' % path)
    return str(os.path.getsize(path))

  def metadata(self, path):
    """Fetch metadata fields of a file on the FileSystem.

    Args:
      path: string path of a file.

    Returns:
      :class:`~apache_beam.io.filesystem.FileMetadata` built from the path,
      its size and its modification time.

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    if not self.exists(path):
      raise BeamIOError('Path does not exist: %s' % path)
    return FileMetadata(path, os.path.getsize(path), os.path.getmtime(path))

  def delete(self, paths):
    """Deletes files or directories at the provided paths.
    Directories will be deleted recursively.

    Each entry in ``paths`` is resolved through ``self.match``; a pattern
    that matches nothing is reported as a failure. All matches are attempted
    even if an earlier deletion failed.

    Args:
      paths: list of paths that give the file objects to be deleted

    Raises:
      ``BeamIOError``: if any of the delete operations fail
    """
    def _delete_path(path):
      """Recursively delete the file or directory at the provided path.
      """
      try:
        if os.path.isdir(path):
          shutil.rmtree(path)
        else:
          os.remove(path)
      except OSError as err:
        raise IOError(err) from err

    exceptions = {}

    def try_delete(path):
      try:
        _delete_path(path)
      except Exception as e:  # pylint: disable=broad-except
        exceptions[path] = e

    for match_result in self.match(paths):
      metadata_list = match_result.metadata_list

      if not metadata_list:
        exceptions[match_result.pattern] = IOError(
            'No files found to delete under: %s' % match_result.pattern)

      for metadata in metadata_list:
        try_delete(metadata.path)

    if exceptions:
      raise BeamIOError("Delete operation failed", exceptions)