github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/filesystem.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""File system abstraction for file-based sources and sinks.

Note to implementors:
  "path" arguments will be URLs in the form scheme://foo/bar. The exception is
  LocalFileSystem, which gets unix-style paths in the form /foo/bar.
"""

# pytype: skip-file

import abc
import bz2
import io
import logging
import lzma
import os
import posixpath
import re
import time
import zlib
from typing import BinaryIO  # pylint: disable=unused-import
from typing import Iterator
from typing import List
from typing import Optional
from typing import Tuple

import zstandard

from apache_beam.utils.plugin import BeamPlugin

logger = logging.getLogger(__name__)

DEFAULT_READ_BUFFER_SIZE = 16 * 1024 * 1024

__all__ = [
    'CompressionTypes',
    'CompressedFile',
    'FileMetadata',
    'FileSystem',
    'MatchResult'
]


class CompressionTypes(object):
  """Enum-like class representing known compression types."""

  # Detect compression based on filename extension.
  #
  # The following extensions are currently recognized by auto-detection:
  #   .bz2 (implies BZIP2 as described below).
  #   .gz (implies GZIP as described below)
  #   .deflate (implies DEFLATE as described below)
  #   .zst (implies ZSTD as described below)
  #   .zstd (implies ZSTD as described below)
  #   .xz (implies LZMA as described below)
  #   .lzma (implies LZMA as described below)
  # Any non-recognized extension implies UNCOMPRESSED as described below.
  AUTO = 'auto'

  # BZIP2 compression.
  BZIP2 = 'bzip2'

  # DEFLATE compression
  DEFLATE = 'deflate'

  # ZSTD compression
  ZSTD = 'zstd'

  # GZIP compression (deflate with GZIP headers).
  GZIP = 'gzip'

  # LZMA compression
  LZMA = 'lzma'

  # Uncompressed (i.e., may be split).
  UNCOMPRESSED = 'uncompressed'

  @classmethod
  def is_valid_compression_type(cls, compression_type):
    """Returns True for valid compression types, False otherwise."""
    types = set([
        CompressionTypes.AUTO,
        CompressionTypes.BZIP2,
        CompressionTypes.DEFLATE,
        CompressionTypes.GZIP,
        CompressionTypes.ZSTD,
        CompressionTypes.LZMA,
        CompressionTypes.UNCOMPRESSED
    ])
    return compression_type in types

  @classmethod
  def mime_type(cls, compression_type, default='application/octet-stream'):
    mime_types_by_compression_type = {
        cls.BZIP2: 'application/x-bz2',
        cls.DEFLATE: 'application/x-deflate',
        cls.GZIP: 'application/x-gzip',
        cls.ZSTD: 'application/zstd',
        cls.LZMA: 'application/lzma'
    }
    return mime_types_by_compression_type.get(compression_type, default)

  @classmethod
  def detect_compression_type(cls, file_path):
    """Returns the compression type of a file (based on its suffix)."""
    compression_types_by_suffix = {
        '.bz2': cls.BZIP2,
        '.deflate': cls.DEFLATE,
        '.gz': cls.GZIP,
        '.zst': cls.ZSTD,
        '.zstd': cls.ZSTD,
        '.xz': cls.LZMA,
        '.lzma': cls.LZMA
    }
    lowercased_path = file_path.lower()
    for suffix, compression_type in compression_types_by_suffix.items():
      if lowercased_path.endswith(suffix):
        return compression_type
    return cls.UNCOMPRESSED


class CompressedFile(object):
  """File wrapper for easier handling of compressed files."""
  # XXX: This class is not thread safe in the read path.

  # The bit mask to use for the wbits parameters of the zlib compressor and
  # decompressor objects.
  _gzip_mask = zlib.MAX_WBITS | 16  # Mask when using GZIP headers.

  def __init__(
      self,
      fileobj,  # type: BinaryIO
      compression_type=CompressionTypes.GZIP,
      read_size=DEFAULT_READ_BUFFER_SIZE):
    if not fileobj:
      raise ValueError('File object must not be None')

    if not CompressionTypes.is_valid_compression_type(compression_type):
      raise TypeError(
          'compression_type must be CompressionType object but '
          'was %s' % type(compression_type))
    if compression_type in (CompressionTypes.AUTO,
                            CompressionTypes.UNCOMPRESSED):
      raise ValueError(
          'Cannot create object with unspecified or no compression')

    self._file = fileobj
    self._compression_type = compression_type

    if self._file.tell() != 0:
      raise ValueError(
          'File object must be at position 0 but was %d' % self._file.tell())
    self._uncompressed_position = 0
    self._uncompressed_size = None  # type: Optional[int]

    if self.readable():
      self._read_size = read_size
      self._read_buffer = io.BytesIO()
      self._read_position = 0
      self._read_eof = False

      self._initialize_decompressor()
    else:
      self._decompressor = None

    if self.writeable():
      self._initialize_compressor()
    else:
      self._compressor = None

  def _initialize_decompressor(self):
    if self._compression_type == CompressionTypes.BZIP2:
      self._decompressor = bz2.BZ2Decompressor()
    elif self._compression_type == CompressionTypes.DEFLATE:
      self._decompressor = zlib.decompressobj()
    elif self._compression_type == CompressionTypes.ZSTD:
      # max_window_size is hard-coded to avoid out-of-memory errors when
      # reading big files; please refer to the following issue for further
      # explanation:
      # https://github.com/indygreg/python-zstandard/issues/157
      self._decompressor = zstandard.ZstdDecompressor(
          max_window_size=2147483648).decompressobj()
    elif self._compression_type == CompressionTypes.LZMA:
      self._decompressor = lzma.LZMADecompressor()
    else:
      assert self._compression_type == CompressionTypes.GZIP
      self._decompressor = zlib.decompressobj(self._gzip_mask)

  def _initialize_compressor(self):
    if self._compression_type == CompressionTypes.BZIP2:
      self._compressor = bz2.BZ2Compressor()
    elif self._compression_type == CompressionTypes.DEFLATE:
      self._compressor = zlib.compressobj(
          zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED)
    elif self._compression_type == CompressionTypes.ZSTD:
      self._compressor = zstandard.ZstdCompressor().compressobj()
    elif self._compression_type == CompressionTypes.LZMA:
      self._compressor = lzma.LZMACompressor()
    else:
      assert self._compression_type == CompressionTypes.GZIP
      self._compressor = zlib.compressobj(
          zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, self._gzip_mask)

  def readable(self):
    # type: () -> bool
    mode = self._file.mode
    return 'r' in mode or 'a' in mode

  def writeable(self):
    # type: () -> bool
    mode = self._file.mode
    return 'w' in mode or 'a' in mode

  def write(self, data):
    # type: (bytes) -> None

    """Write data to file."""
    if not self._compressor:
      raise ValueError('compressor not initialized')
    self._uncompressed_position += len(data)
    compressed = self._compressor.compress(data)
    if compressed:
      self._file.write(compressed)

  def _fetch_to_internal_buffer(self, num_bytes: int) -> None:
    """Fetch up to num_bytes into the internal buffer."""
    if (not self._read_eof and self._read_position > 0 and
        (self._read_buffer.tell() - self._read_position) < num_bytes):
      # There aren't enough bytes to accommodate the read, so we prepare for
      # a possibly large read by clearing the internal buffer without
      # dropping any previously held data.
      self._read_buffer.seek(self._read_position)
      data = self._read_buffer.read()
      self._clear_read_buffer()
      self._read_buffer.write(data)

    assert self._decompressor
    while not self._read_eof and (self._read_buffer.tell() -
                                  self._read_position) < num_bytes:
      # Continue reading from the underlying file object until enough bytes
      # are available, or EOF is reached.
      if not self._decompressor.unused_data:
        buf = self._file.read(self._read_size)
      else:
        # Any uncompressed data at the end of the stream of a gzip or bzip2
        # file that is not corrupted points to a concatenated compressed
        # file. We read concatenated files by recursively creating
        # decompressor objects for the unused compressed data.
        buf = self._decompressor.unused_data
        self._initialize_decompressor()
      if buf:
        decompressed = self._decompressor.decompress(buf)
        del buf  # Free up some possibly large and no-longer-needed memory.
        self._read_buffer.write(decompressed)
      else:
        # EOF of current stream reached.
        if (self._compression_type == CompressionTypes.BZIP2 or
            self._compression_type == CompressionTypes.DEFLATE or
            self._compression_type == CompressionTypes.ZSTD or
            self._compression_type == CompressionTypes.GZIP or
            self._compression_type == CompressionTypes.LZMA):
          pass
        else:
          # Deflate, Gzip and bzip2 formats do not require flushing
          # remaining data in the decompressor into the read buffer when
          # fully decompressing files.
          self._read_buffer.write(self._decompressor.flush())

        # Record that we have hit the end of file, so we won't unnecessarily
        # repeat the completeness verification step above.
        self._read_eof = True

  def _read_from_internal_buffer(self, read_fn):
    """Read from the internal buffer by using the supplied read_fn."""
    self._read_buffer.seek(self._read_position)
    result = read_fn()
    self._read_position += len(result)
    self._uncompressed_position += len(result)
    self._read_buffer.seek(0, os.SEEK_END)  # Allow future writes.
    return result

  def read(self, num_bytes: Optional[int] = None) -> bytes:
    if not self._decompressor:
      raise ValueError('decompressor not initialized')

    self._fetch_to_internal_buffer(num_bytes)
    return self._read_from_internal_buffer(
        lambda: self._read_buffer.read(num_bytes))

  def readline(self):
    # type: () -> bytes

    """Equivalent to standard file.readline(). Same return conventions apply."""
    if not self._decompressor:
      raise ValueError('decompressor not initialized')

    bytes_io = io.BytesIO()
    while True:
      # Ensure that the internal buffer has at least half the read_size. Going
      # with half the _read_size (as opposed to a full _read_size) to ensure
      # that actual fetches are more evenly spread out, as opposed to having 2
      # consecutive reads at the beginning of a read.
      self._fetch_to_internal_buffer(self._read_size // 2)
      line = self._read_from_internal_buffer(
          lambda: self._read_buffer.readline())
      bytes_io.write(line)
      if line.endswith(b'\n') or not line:
        break  # Newline or EOF reached.

    return bytes_io.getvalue()

  def closed(self) -> bool:
    return not self._file or self._file.closed

  def close(self) -> None:
    if self.readable():
      self._read_buffer.close()

    if self.writeable():
      assert self._compressor
      self._file.write(self._compressor.flush())

    self._file.close()

  def flush(self) -> None:
    if self.writeable():
      assert self._compressor
      self._file.write(self._compressor.flush())
      self._file.flush()

  @property
  def seekable(self):
    # type: () -> bool
    return 'r' in self._file.mode

  def _clear_read_buffer(self):
    # type: () -> None

    """Clears the read buffer by removing all the contents and
    resetting _read_position to 0"""
    self._read_position = 0
    self._read_buffer.seek(0)
    self._read_buffer.truncate(0)

  def _rewind_file(self):
    # type: () -> None

    """Seeks to the beginning of the input file. Input file's EOF marker
    is cleared and _uncompressed_position is reset to zero"""
    self._file.seek(0, os.SEEK_SET)
    self._read_eof = False
    self._uncompressed_position = 0

  def _rewind(self):
    # type: () -> None

    """Seeks to the beginning of the input file and resets the internal read
    buffer. The decompressor object is re-initialized to ensure that no data
    is left in its buffer."""
    self._clear_read_buffer()
    self._rewind_file()

    # Re-initialize decompressor to clear any data buffered prior to rewind
    self._initialize_decompressor()

  def seek(self, offset, whence=os.SEEK_SET):
    # type: (int, int) -> None

    """Set the file's current offset.

    Seeking behavior:

    * when seeking from the end (:data:`os.SEEK_END`), the whole file is
      decompressed once to determine its size.
      Therefore it is preferred to use
      :data:`os.SEEK_SET` or :data:`os.SEEK_CUR` to avoid the processing
      overhead
    * seeking backwards from the current position rewinds the file to ``0``
      and decompresses the chunks to the requested offset
    * seeking is only supported in files opened for reading
    * if the new offset is out of bounds, it is adjusted to either ``0`` or
      ``EOF``.

    Args:
      offset (int): seek offset in the uncompressed content, in bytes
      whence (int): seek mode. Supported modes are :data:`os.SEEK_SET`
        (absolute seek), :data:`os.SEEK_CUR` (seek relative to the current
        position), and :data:`os.SEEK_END` (seek relative to the end, offset
        should be negative).

    Raises:
      IOError: When this buffer is closed.
      ValueError: When whence is invalid or the file is not seekable
    """
    if whence == os.SEEK_SET:
      absolute_offset = offset
    elif whence == os.SEEK_CUR:
      absolute_offset = self._uncompressed_position + offset
    elif whence == os.SEEK_END:
      # Determine and cache the uncompressed size of the file
      if not self._uncompressed_size:
        logger.warning(
            "Seeking relative from end of file is requested. "
            "Need to decompress the whole file once to determine "
            "its size. This might take a while...")
        uncompress_start_time = time.time()
        while self.read(self._read_size):
          pass
        uncompress_end_time = time.time()
        logger.warning(
            "Full file decompression for seek "
            "from end took %.2f secs",
            (uncompress_end_time - uncompress_start_time))
        self._uncompressed_size = self._uncompressed_position
      absolute_offset = self._uncompressed_size + offset
    else:
      raise ValueError("Whence mode %r is invalid." % whence)

    # Determine how many bytes need to be read before we reach
    # the requested offset. Rewind if we already passed the position.
    if absolute_offset < self._uncompressed_position:
      self._rewind()
    bytes_to_skip = absolute_offset - self._uncompressed_position

    # Read until the desired position is reached or EOF occurs.
    while bytes_to_skip:
      data = self.read(min(self._read_size, bytes_to_skip))
      if not data:
        break
      bytes_to_skip -= len(data)

  def tell(self):
    # type: () -> int

    """Returns current position in uncompressed file."""
    return self._uncompressed_position

  def __enter__(self):
    return self

  def __exit__(self, exception_type, exception_value, traceback):
    self.close()


class FileMetadata(object):
  """Metadata about a file path that is the output of FileSystem.match.

  Fields:
    path: [Required] file path.
    size_in_bytes: [Required] file size in bytes.
    last_updated_in_seconds: [Optional] last modified timestamp of the file,
      or 0.0 if not specified.
  """
  def __init__(
      self,
      path: str,
      size_in_bytes: int,
      last_updated_in_seconds: float = 0.0):
    assert isinstance(path, str) and path, "Path should be a string"
    assert isinstance(size_in_bytes, int) and size_in_bytes >= 0, \
        "Invalid value for size_in_bytes: %s (of type %s)" % (
            size_in_bytes, type(size_in_bytes))
    self.path = path
    self.size_in_bytes = size_in_bytes
    self.last_updated_in_seconds = last_updated_in_seconds

  def __eq__(self, other):
    """Note: This is only used in tests where we verify that mock objects match.
    """
    return (
        isinstance(other, FileMetadata) and self.path == other.path and
        self.size_in_bytes == other.size_in_bytes and
        self.last_updated_in_seconds == other.last_updated_in_seconds)

  def __hash__(self):
    return hash((self.path, self.size_in_bytes, self.last_updated_in_seconds))

  def __repr__(self):
    if self.last_updated_in_seconds == 0.0:
      return 'FileMetadata(%s, %s)' % (self.path, self.size_in_bytes)
    else:
      return 'FileMetadata(%s, %s, %s)' % (
          self.path, self.size_in_bytes, self.last_updated_in_seconds)


class MatchResult(object):
  """Result from the ``FileSystem`` match operation which contains the list
  of matched ``FileMetadata``.
  """
  def __init__(self, pattern, metadata_list):
    # type: (str, List[FileMetadata]) -> None
    self.metadata_list = metadata_list
    self.pattern = pattern


class BeamIOError(IOError):
  def __init__(self, msg, exception_details=None):
    """Class representing the errors thrown in the batch file operations.
    Args:
      msg: Message string for the exception thrown
      exception_details: Optional map of individual input to exception for
        failed operations in batch. This parameter is optional so if specified
        the user can assume that all errors in the filesystem operation
        have been reported. When the details are missing then the operation
        may have failed anywhere so the user should use match to determine
        the current state of the system.
    """
    message = "%s with exceptions %s" % (msg, exception_details)
    super().__init__(message)
    self.exception_details = exception_details


class FileSystem(BeamPlugin, metaclass=abc.ABCMeta):
  """A class that defines the functions that can be performed on a filesystem.

  All methods are abstract and they are for file system providers to
  implement. Clients should use the FileSystems class to interact with
  the correct file system based on the provided file pattern scheme.
  """
  CHUNK_SIZE = 1  # Chunk size in the batch operations

  def __init__(self, pipeline_options):
    """
    Args:
      pipeline_options: Instance of ``PipelineOptions`` or dict of options and
        values (like ``RuntimeValueProvider.runtime_options``).
    """

  @staticmethod
  def _get_compression_type(path, compression_type):
    if compression_type == CompressionTypes.AUTO:
      compression_type = CompressionTypes.detect_compression_type(path)
    elif not CompressionTypes.is_valid_compression_type(compression_type):
      raise TypeError(
          'compression_type must be CompressionType object but '
          'was %s' % type(compression_type))
    return compression_type

  @classmethod
  def scheme(cls):
    """URI scheme for the FileSystem
    """
    raise NotImplementedError

  @abc.abstractmethod
  def join(self, basepath, *paths):
    # type: (str, *str) -> str

    """Join two or more pathname components for the filesystem

    Args:
      basepath: string path of the first component of the path
      paths: path components to be added

    Returns: full path after combining all the passed components
    """
    raise NotImplementedError

  @abc.abstractmethod
  def split(self, path):
    # type: (str) -> Tuple[str, str]

    """Splits the given path into two parts.

    Splits the path into a pair (head, tail) such that tail contains the last
    component of the path and head contains everything up to that.

    For file-systems other than the local file-system, head should include the
    prefix.

    Args:
      path: path as a string
    Returns:
      a pair of path components as strings.
    """
    raise NotImplementedError

  @abc.abstractmethod
  def mkdirs(self, path):
    """Recursively create directories for the provided path.

    Args:
      path: string path of the directory structure that should be created

    Raises:
      IOError: if leaf directory already exists.
    """
    raise NotImplementedError

  @abc.abstractmethod
  def has_dirs(self):
    """Whether this FileSystem supports directories."""
    raise NotImplementedError

  @abc.abstractmethod
  def _list(self, dir_or_prefix):
    """List files in a location.

    Listing is non-recursive (for filesystems that support directories).

    Args:
      dir_or_prefix: (string) A directory or location prefix (for filesystems
        that don't have directories).

    Returns:
      Generator of ``FileMetadata`` objects.

    Raises:
      ``BeamIOError``: if listing fails, but not if no files were found.
    """
    raise NotImplementedError

  @staticmethod
  def _split_scheme(url_or_path):
    match = re.match(r'(^[a-z]+)://(.*)', url_or_path)
    if match is not None:
      return match.groups()
    return None, url_or_path

  @staticmethod
  def _combine_scheme(scheme, path):
    if scheme is None:
      return path
    return '{}://{}'.format(scheme, path)

  def _url_dirname(self, url_or_path):
    """Like posixpath.dirname, but preserves scheme:// prefix.

    Args:
      url_or_path: A string in the form of scheme://some/path OR /some/path.
    """
    scheme, path = self._split_scheme(url_or_path)
    return self._combine_scheme(scheme, posixpath.dirname(path))

  def match_files(self, file_metas, pattern):
    # type: (List[FileMetadata], str) -> Iterator[FileMetadata]

    """Filter :class:`FileMetadata` objects by *pattern*

    Args:
      file_metas (list of :class:`FileMetadata`):
        Files to consider when matching
      pattern (str): File pattern

    See Also:
      :meth:`translate_pattern`

    Returns:
      Generator of matching :class:`FileMetadata`
    """
    re_pattern = re.compile(self.translate_pattern(pattern))
    match = re_pattern.match
    for file_metadata in file_metas:
      if match(file_metadata.path):
        yield file_metadata

  @staticmethod
  def translate_pattern(pattern):
    # type: (str) -> str

    """
    Translate a *pattern* to a regular expression.
    There is no way to quote meta-characters.

    Pattern syntax:
      The pattern syntax is based on the fnmatch_ syntax, with the following
      differences:

      - ``*`` Is equivalent to ``[^/\\]*`` rather than ``.*``.
      - ``**`` Is equivalent to ``.*``.

    See also:
      :meth:`match` uses this method

    This method is based on `Python 2.7's fnmatch.translate`_.
    The code in this method is licensed under
    PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2.

    .. _`fnmatch`: https://docs.python.org/2/library/fnmatch.html

    .. _`Python 2.7's fnmatch.translate`: https://github.com/python/cpython\
/blob/170ea8ccd4235d28538ab713041502d07ad1cacd/Lib/fnmatch.py#L85-L120
    """
    i, n = 0, len(pattern)
    res = ''
    while i < n:
      c = pattern[i]
      i = i + 1
      if c == '*':
        # One char lookahead for "**"
        if i < n and pattern[i] == "*":
          res = res + '.*'
          i = i + 1
        else:
          res = res + r'[^/\\]*'
      elif c == '?':
        res = res + '.'
      elif c == '[':
        j = i
        if j < n and pattern[j] == '!':
          j = j + 1
        if j < n and pattern[j] == ']':
          j = j + 1
        while j < n and pattern[j] != ']':
          j = j + 1
        if j >= n:
          res = res + r'\['
        else:
          stuff = pattern[i:j].replace('\\', '\\\\')
          i = j + 1
          if stuff[0] == '!':
            stuff = '^' + stuff[1:]
          elif stuff[0] == '^':
            stuff = '\\' + stuff
          res = '%s[%s]' % (res, stuff)
      else:
        res = res + re.escape(c)

    logger.debug('translate_pattern: %r -> %r', pattern, res)
    return r'(?ms)' + res + r'\Z'

  def match(self, patterns, limits=None):
    """Find all matching paths to the patterns provided.

    See Also:
      :meth:`translate_pattern`

    Patterns ending with '/' or '\\' will be appended with '*'.

    Args:
      patterns: list of string for the file path pattern to match against
      limits: list of maximum number of responses that need to be fetched

    Returns: list of ``MatchResult`` objects.

    Raises:
      ``BeamIOError``: if any of the pattern match operations fail
    """
    if limits is None:
      limits = [None] * len(patterns)
    else:
      err_msg = "Patterns and limits should be equal in length"
      assert len(patterns) == len(limits), err_msg

    def _match(pattern, limit):
      """Find all matching paths to the pattern provided."""
      if pattern.endswith('/') or pattern.endswith('\\'):
        pattern += '*'
      # Get the part of the pattern before the first globbing character.
      # For example scheme://path/foo* will become scheme://path/foo for
      # filesystems like GCS, or converted to scheme://path for filesystems
      # with directories.
      prefix_or_dir = re.match('^[^[*?]*', pattern).group(0)

      file_metadatas = []
      if prefix_or_dir == pattern:
        # Short-circuit calling self.list() if there's no glob pattern to match.
        if self.exists(pattern):
          file_metadatas = [self.metadata(pattern)]
      else:
        if self.has_dirs():
          prefix_dirname = self._url_dirname(prefix_or_dir)
          if not prefix_dirname == prefix_or_dir:
            logger.debug(
                "Changed prefix_or_dir %r -> %r", prefix_or_dir, prefix_dirname)
            prefix_or_dir = prefix_dirname

        logger.debug("Listing files in %r", prefix_or_dir)
        file_metadatas = self._list(prefix_or_dir)

      metadata_list = []
      for file_metadata in self.match_files(file_metadatas, pattern):
        if limit is not None and len(metadata_list) >= limit:
          break
        metadata_list.append(file_metadata)

      return MatchResult(pattern, metadata_list)

    exceptions = {}
    result = []
    for pattern, limit in zip(patterns, limits):
      try:
        result.append(_match(pattern, limit))
      except Exception as e:  # pylint: disable=broad-except
        exceptions[pattern] = e

    if exceptions:
      raise BeamIOError("Match operation failed", exceptions)
    return result

  @abc.abstractmethod
  def create(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    # type: (...) -> BinaryIO

    """Returns a write channel for the given file path.

    Args:
      path: string path of the file object to be written to the system
      mime_type: MIME type to specify the type of content in the file object
      compression_type: Type of compression to be used for this object

    Returns: file handle with a close function for the user to use
    """
    raise NotImplementedError

  @abc.abstractmethod
  def open(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    # type: (...) -> BinaryIO

    """Returns a read channel for the given file path.

    Args:
      path: string path of the file object to be read
      mime_type: MIME type to specify the type of content in the file object
      compression_type: Type of compression to be used for this object

    Returns: file handle with a close function for the user to use
    """
    raise NotImplementedError

  @abc.abstractmethod
  def copy(self, source_file_names, destination_file_names):
    """Recursively copy the file tree from the source to the destination

    Args:
      source_file_names: list of source file objects that need to be copied
      destination_file_names: list of destinations for the new objects

    Raises:
      ``BeamIOError``: if any of the copy operations fail
    """
    raise NotImplementedError

  @abc.abstractmethod
  def rename(self, source_file_names, destination_file_names):
    """Rename the files at the source list to the destination list.
    Source and destination lists should be of the same size.

    Args:
      source_file_names: List of file paths that need to be moved
      destination_file_names: List of destination_file_names for the files

    Raises:
      ``BeamIOError``: if any of the rename operations fail
    """
    raise NotImplementedError

  @abc.abstractmethod
  def exists(self, path):
    # type: (str) -> bool

    """Check if the provided path exists on the FileSystem.

    Args:
      path: string path that needs to be checked.

    Returns: boolean flag indicating if path exists
    """
    raise NotImplementedError

  @abc.abstractmethod
  def size(self, path):
    # type: (str) -> int

    """Get size in bytes of a file on the FileSystem.

    Args:
      path: string filepath of file.

    Returns: int size of file according to the FileSystem.

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    raise NotImplementedError

  @abc.abstractmethod
  def last_updated(self, path):
    """Get UNIX Epoch time in seconds on the FileSystem.

    Args:
      path: string path of file.

    Returns: float UNIX Epoch time

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    raise NotImplementedError

  def checksum(self, path):
    """Fetch checksum metadata of a file on the
    :class:`~apache_beam.io.filesystem.FileSystem`.

    This operation returns checksum metadata as stored in the underlying
    FileSystem. It should not need to read file data to obtain this value.
    Checksum type and format are FileSystem dependent and are not compatible
    between FileSystems.
    FileSystem implementations may return file size if a checksum isn't
    available.

    Args:
      path: string path of a file.

    Returns: string containing checksum

    Raises:
      ``BeamIOError``: if path isn't a file or doesn't exist.
    """
    raise NotImplementedError

  @abc.abstractmethod
  def metadata(self, path):
    """Fetch metadata of a file on the
    :class:`~apache_beam.io.filesystem.FileSystem`.

    This operation returns metadata as stored in the underlying
    FileSystem. It should not need to read file data to obtain this value.
    For web based file systems, this method should also incur as few
    requests as possible.

    Args:
      path: string path of a file.

    Returns:
      :class:`~apache_beam.io.filesystem.FileMetadata`.

    Raises:
      ``BeamIOError``: if path isn't a file or doesn't exist.
    """
    raise NotImplementedError

  @abc.abstractmethod
  def delete(self, paths):
    """Deletes files or directories at the provided paths.
    Directories will be deleted recursively.

    Args:
      paths: list of paths that give the file objects to be deleted

    Raises:
      ``BeamIOError``: if any of the delete operations fail
    """
    raise NotImplementedError
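

# ---------------------------------------------------------------------------
# Editorial usage sketch (not part of the original module): how
# CompressionTypes.detect_compression_type and CompressedFile are typically
# combined for a write-then-read round trip. The function name and the
# temporary file path are invented for illustration; tempfile is imported
# locally to avoid touching the module's import list.
def _example_compressed_round_trip():
  import tempfile

  path = os.path.join(tempfile.mkdtemp(), 'example.gz')

  # The '.gz' suffix is mapped to GZIP by suffix-based auto-detection.
  compression_type = CompressionTypes.detect_compression_type(path)
  assert compression_type == CompressionTypes.GZIP

  # Write: CompressedFile compresses bytes before they reach the raw file.
  with open(path, 'wb') as raw:
    writer = CompressedFile(raw, compression_type=compression_type)
    writer.write(b'hello\nworld\n')
    writer.close()  # Flushes the compressor and closes the raw file.

  # Read: readline()/read() return uncompressed bytes, like a plain file.
  with open(path, 'rb') as raw:
    reader = CompressedFile(reader_raw := raw, compression_type=compression_type)
    assert reader.readline() == b'hello\n'
    assert reader.read(100) == b'world\n'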
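

# Another editorial sketch: CompressedFile.seek() works on *uncompressed*
# offsets. Seeking backwards rewinds to offset 0 and re-decompresses up to the
# target, and os.SEEK_END first decompresses the whole file once to learn its
# size, so absolute/forward seeks are the cheap ones. Names and data below are
# hypothetical.
def _example_seek_in_compressed_file():
  import tempfile

  path = os.path.join(tempfile.mkdtemp(), 'numbers.gz')
  with open(path, 'wb') as raw:
    writer = CompressedFile(raw, compression_type=CompressionTypes.GZIP)
    writer.write(b'0123456789')
    writer.close()

  with open(path, 'rb') as raw:
    reader = CompressedFile(raw, compression_type=CompressionTypes.GZIP)
    reader.seek(4)                # Absolute seek, in uncompressed bytes.
    assert reader.tell() == 4
    assert reader.read(3) == b'456'
    reader.seek(-2, os.SEEK_CUR)  # Backwards seek triggers a rewind.
    assert reader.read(2) == b'56'
    reader.seek(-1, os.SEEK_END)  # Decompresses fully once to find the size.
    assert reader.read(1) == b'9'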
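

# Editorial sketch of the glob semantics implemented by translate_pattern():
# '*' does not cross path separators, while '**' matches anything.
# translate_pattern() is a staticmethod, so it can be exercised without a
# concrete FileSystem implementation; the sample paths are made up.
def _example_pattern_matching():
  flat = re.compile(FileSystem.translate_pattern('gs://bucket/data/*.csv'))
  assert flat.match('gs://bucket/data/part-1.csv')
  # '*' stops at '/', so files in sub-directories do not match ...
  assert not flat.match('gs://bucket/data/2023/part-1.csv')
  # ... but '**' crosses directory boundaries.
  deep = re.compile(FileSystem.translate_pattern('gs://bucket/data/**.csv'))
  assert deep.match('gs://bucket/data/2023/part-1.csv')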
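

# Editorial sketch of a minimal, dict-backed FileSystem implementation. It is
# only meant to show which abstract methods a provider must supply and how
# match() combines _list() with match_files(); the 'mem' scheme, the class
# name and the helper below are invented. Real providers (LocalFileSystem,
# GCSFileSystem, ...) live in their own modules and are registered there.
class _InMemoryFileSystem(FileSystem):
  """Toy filesystem backed by a dict of path -> bytes (illustration only)."""
  def __init__(self, pipeline_options=None):
    super().__init__(pipeline_options)
    self._files = {}  # path -> bytes

  @classmethod
  def scheme(cls):
    return 'mem'

  def join(self, basepath, *paths):
    return posixpath.join(basepath, *paths)

  def split(self, path):
    return posixpath.split(path)

  def mkdirs(self, path):
    pass  # This toy filesystem has no real directories.

  def has_dirs(self):
    return False

  def _list(self, dir_or_prefix):
    for path, data in self._files.items():
      if path.startswith(dir_or_prefix):
        yield FileMetadata(path, len(data))

  def create(self, path, mime_type='application/octet-stream',
             compression_type=CompressionTypes.AUTO):
    raise NotImplementedError  # Omitted to keep the sketch short.

  def open(self, path, mime_type='application/octet-stream',
           compression_type=CompressionTypes.AUTO):
    return io.BytesIO(self._files[path])

  def copy(self, source_file_names, destination_file_names):
    for src, dst in zip(source_file_names, destination_file_names):
      self._files[dst] = self._files[src]

  def rename(self, source_file_names, destination_file_names):
    self.copy(source_file_names, destination_file_names)
    self.delete(source_file_names)

  def exists(self, path):
    return path in self._files

  def size(self, path):
    return len(self._files[path])

  def last_updated(self, path):
    return 0.0

  def checksum(self, path):
    return str(len(self._files[path]))  # Falls back to the file size.

  def metadata(self, path):
    return FileMetadata(path, len(self._files[path]))

  def delete(self, paths):
    for path in paths:
      self._files.pop(path, None)


def _example_in_memory_match():
  # Populating the dict directly (bypassing create()) to keep the demo short.
  fs = _InMemoryFileSystem()
  fs._files['mem://bucket/logs/a.txt'] = b'alpha'
  (result, ) = fs.match(['mem://bucket/logs/*.txt'])
  assert [m.path for m in result.metadata_list] == ['mem://bucket/logs/a.txt']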