github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/filesystem.py

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """File system abstraction for file-based sources and sinks.
    19  
    20  Note to implementors:
    21    "path" arguments will be URLs in the form scheme://foo/bar. The exception is
    22    LocalFileSystem, which gets unix-style paths in the form /foo/bar.
    23  """
    24  
    25  # pytype: skip-file
    26  
    27  import abc
    28  import bz2
    29  import io
    30  import logging
    31  import lzma
    32  import os
    33  import posixpath
    34  import re
    35  import time
    36  import zlib
    37  from typing import BinaryIO  # pylint: disable=unused-import
    38  from typing import Iterator
    39  from typing import List
    40  from typing import Optional
    41  from typing import Tuple
    42  
    43  import zstandard
    44  
    45  from apache_beam.utils.plugin import BeamPlugin
    46  
    47  logger = logging.getLogger(__name__)
    48  
    49  DEFAULT_READ_BUFFER_SIZE = 16 * 1024 * 1024
    50  
    51  __all__ = [
    52      'CompressionTypes',
    53      'CompressedFile',
    54      'FileMetadata',
    55      'FileSystem',
    56      'MatchResult'
    57  ]
    58  
    59  
    60  class CompressionTypes(object):
    61    """Enum-like class representing known compression types."""
    62  
    63    # Detect compression based on filename extension.
    64    #
    65    # The following extensions are currently recognized by auto-detection:
    66    #   .bz2 (implies BZIP2 as described below).
    67    #   .gz  (implies GZIP as described below)
    68    #   .deflate (implies DEFLATE as described below)
    69    #   .zst (implies ZSTD as described below)
    70    #   .zstd (implies ZSTD as described below)
    71    #   .xz (implies LZMA as described below)
    72    #   .lzma (implies LZMA as described below)
    73    # Any non-recognized extension implies UNCOMPRESSED as described below.
    74    AUTO = 'auto'
    75  
    76    # BZIP2 compression.
    77    BZIP2 = 'bzip2'
    78  
    79    # DEFLATE compression
    80    DEFLATE = 'deflate'
    81  
    82    # ZSTD compression
    83    ZSTD = 'zstd'
    84  
    85    # GZIP compression (deflate with GZIP headers).
    86    GZIP = 'gzip'
    87  
    88    # LZMA compression
    89    LZMA = 'lzma'
    90  
    91    # Uncompressed (i.e., may be split).
    92    UNCOMPRESSED = 'uncompressed'
    93  
    94    @classmethod
    95    def is_valid_compression_type(cls, compression_type):
    96      """Returns True for valid compression types, False otherwise."""
    97      types = set([
    98          CompressionTypes.AUTO,
    99          CompressionTypes.BZIP2,
   100          CompressionTypes.DEFLATE,
   101          CompressionTypes.GZIP,
   102          CompressionTypes.ZSTD,
   103          CompressionTypes.LZMA,
   104          CompressionTypes.UNCOMPRESSED
   105      ])
   106      return compression_type in types
   107  
   108    @classmethod
   109    def mime_type(cls, compression_type, default='application/octet-stream'):
   110      mime_types_by_compression_type = {
   111          cls.BZIP2: 'application/x-bz2',
   112          cls.DEFLATE: 'application/x-deflate',
   113          cls.GZIP: 'application/x-gzip',
   114          cls.ZSTD: 'application/zstd',
   115          cls.LZMA: 'application/lzma'
   116      }
   117      return mime_types_by_compression_type.get(compression_type, default)
   118  
   119    @classmethod
   120    def detect_compression_type(cls, file_path):
   121      """Returns the compression type of a file (based on its suffix)."""
   122      compression_types_by_suffix = {
   123          '.bz2': cls.BZIP2,
   124          '.deflate': cls.DEFLATE,
   125          '.gz': cls.GZIP,
   126          '.zst': cls.ZSTD,
   127          '.zstd': cls.ZSTD,
   128          '.xz': cls.LZMA,
   129          '.lzma': cls.LZMA
   130      }
   131      lowercased_path = file_path.lower()
   132      for suffix, compression_type in compression_types_by_suffix.items():
   133        if lowercased_path.endswith(suffix):
   134          return compression_type
   135      return cls.UNCOMPRESSED
   136  
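# A brief illustrative sketch (the file names below are made-up assumptions)
# of the suffix-based detection and MIME type lookup defined above:
def _example_compression_detection():
  assert CompressionTypes.detect_compression_type(
      'data.csv.gz') == CompressionTypes.GZIP
  assert CompressionTypes.detect_compression_type(
      'data.csv') == CompressionTypes.UNCOMPRESSED
  assert CompressionTypes.mime_type(
      CompressionTypes.GZIP) == 'application/x-gzip'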
   137  
   138  class CompressedFile(object):
   139    """File wrapper for easier handling of compressed files."""
   140    # XXX: This class is not thread safe in the read path.
   141  
   142    # The bit mask to use for the wbits parameters of the zlib compressor and
   143    # decompressor objects.
   144    _gzip_mask = zlib.MAX_WBITS | 16  # Mask when using GZIP headers.
   145  
   146    def __init__(
   147        self,
   148        fileobj,  # type: BinaryIO
   149        compression_type=CompressionTypes.GZIP,
   150        read_size=DEFAULT_READ_BUFFER_SIZE):
   151      if not fileobj:
   152        raise ValueError('File object must not be None')
   153  
   154      if not CompressionTypes.is_valid_compression_type(compression_type):
   155        raise TypeError(
   156          'compression_type must be a CompressionType object but '
   157            'was %s' % type(compression_type))
   158      if compression_type in (CompressionTypes.AUTO,
   159                              CompressionTypes.UNCOMPRESSED):
   160        raise ValueError(
   161            'Cannot create object with unspecified or no compression')
   162  
   163      self._file = fileobj
   164      self._compression_type = compression_type
   165  
   166      if self._file.tell() != 0:
   167        raise ValueError(
   168            'File object must be at position 0 but was %d' % self._file.tell())
   169      self._uncompressed_position = 0
   170      self._uncompressed_size = None  # type: Optional[int]
   171  
   172      if self.readable():
   173        self._read_size = read_size
   174        self._read_buffer = io.BytesIO()
   175        self._read_position = 0
   176        self._read_eof = False
   177  
   178        self._initialize_decompressor()
   179      else:
   180        self._decompressor = None
   181  
   182      if self.writeable():
   183        self._initialize_compressor()
   184      else:
   185        self._compressor = None
   186  
   187    def _initialize_decompressor(self):
   188      if self._compression_type == CompressionTypes.BZIP2:
   189        self._decompressor = bz2.BZ2Decompressor()
   190      elif self._compression_type == CompressionTypes.DEFLATE:
   191        self._decompressor = zlib.decompressobj()
   192      elif self._compression_type == CompressionTypes.ZSTD:
   193        # max_window_size is hardcoded to avoid out-of-memory errors
   194        # when reading big files; see the following issue for further
   195        # explanation:
   196        # https://github.com/indygreg/python-zstandard/issues/157
   197        self._decompressor = zstandard.ZstdDecompressor(
   198            max_window_size=2147483648).decompressobj()
   199      elif self._compression_type == CompressionTypes.LZMA:
   200        self._decompressor = lzma.LZMADecompressor()
   201      else:
   202        assert self._compression_type == CompressionTypes.GZIP
   203        self._decompressor = zlib.decompressobj(self._gzip_mask)
   204  
   205    def _initialize_compressor(self):
   206      if self._compression_type == CompressionTypes.BZIP2:
   207        self._compressor = bz2.BZ2Compressor()
   208      elif self._compression_type == CompressionTypes.DEFLATE:
   209        self._compressor = zlib.compressobj(
   210            zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED)
   211      elif self._compression_type == CompressionTypes.ZSTD:
   212        self._compressor = zstandard.ZstdCompressor().compressobj()
   213      elif self._compression_type == CompressionTypes.LZMA:
   214        self._compressor = lzma.LZMACompressor()
   215      else:
   216        assert self._compression_type == CompressionTypes.GZIP
   217        self._compressor = zlib.compressobj(
   218            zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, self._gzip_mask)
   219  
   220    def readable(self):
   221      # type: () -> bool
   222      mode = self._file.mode
   223      return 'r' in mode or 'a' in mode
   224  
   225    def writeable(self):
   226      # type: () -> bool
   227      mode = self._file.mode
   228      return 'w' in mode or 'a' in mode
   229  
   230    def write(self, data):
   231      # type: (bytes) -> None
   232  
   233      """Write data to file."""
   234      if not self._compressor:
   235        raise ValueError('compressor not initialized')
   236      self._uncompressed_position += len(data)
   237      compressed = self._compressor.compress(data)
   238      if compressed:
   239        self._file.write(compressed)
   240  
   241    def _fetch_to_internal_buffer(self, num_bytes: int) -> None:
   242      """Fetch up to num_bytes into the internal buffer."""
   243      if (not self._read_eof and self._read_position > 0 and
   244          (self._read_buffer.tell() - self._read_position) < num_bytes):
   245        # There aren't enough bytes in the buffer to accommodate the read,
   246        # so we prepare for a possibly large read by clearing all internal
   247        # buffers without dropping any previously held data.
   248        self._read_buffer.seek(self._read_position)
   249        data = self._read_buffer.read()
   250        self._clear_read_buffer()
   251        self._read_buffer.write(data)
   252  
   253      assert self._decompressor
   254      while not self._read_eof and (self._read_buffer.tell() -
   255                                    self._read_position) < num_bytes:
   256        # Continue reading from the underlying file object until enough bytes are
   257        # available, or EOF is reached.
   258        if not self._decompressor.unused_data:
   259          buf = self._file.read(self._read_size)
   260        else:
   261          # Any unused data at the end of the stream of a non-corrupted
   262          # gzip or bzip2 file indicates a concatenated compressed file.
   263          # We read concatenated files by recursively creating decompressor
   264          # objects for the unused compressed data.
   265          buf = self._decompressor.unused_data
   266          self._initialize_decompressor()
   267        if buf:
   268          decompressed = self._decompressor.decompress(buf)
   269          del buf  # Free up some possibly large and no-longer-needed memory.
   270          self._read_buffer.write(decompressed)
   271        else:
   272          # EOF of current stream reached.
   273          if (self._compression_type == CompressionTypes.BZIP2 or
   274              self._compression_type == CompressionTypes.DEFLATE or
   275              self._compression_type == CompressionTypes.ZSTD or
   276              self._compression_type == CompressionTypes.GZIP or
   277              self._compression_type == CompressionTypes.LZMA):
   278            pass
   279          else:
   280            # The formats above do not require flushing remaining data in the
   281            # decompressor into the read buffer when fully decompressing files;
   282            # any other format is flushed here.
   283            self._read_buffer.write(self._decompressor.flush())
   284  
   285          # Record that we have hit the end of file, so we won't unnecessarily
   286          # repeat the completeness verification step above.
   287          self._read_eof = True
   288  
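  # The unused_data branch above is what makes concatenated streams readable.
  # A doctest-style sketch of the underlying zlib behaviour (illustrative,
  # not part of the original file):
  #
  #   >>> import gzip, zlib
  #   >>> blob = gzip.compress(b'first') + gzip.compress(b'second')
  #   >>> d = zlib.decompressobj(zlib.MAX_WBITS | 16)
  #   >>> d.decompress(blob)
  #   b'first'
  #   >>> d.unused_data != b''  # bytes of the second member are left over
  #   True
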
   289    def _read_from_internal_buffer(self, read_fn):
   290      """Read from the internal buffer by using the supplied read_fn."""
   291      self._read_buffer.seek(self._read_position)
   292      result = read_fn()
   293      self._read_position += len(result)
   294      self._uncompressed_position += len(result)
   295      self._read_buffer.seek(0, os.SEEK_END)  # Allow future writes.
   296      return result
   297  
   298    def read(self, num_bytes: Optional[int] = None) -> bytes:
   299      if not self._decompressor:
   300        raise ValueError('decompressor not initialized')
   301  
   302      self._fetch_to_internal_buffer(num_bytes)
   303      return self._read_from_internal_buffer(
   304          lambda: self._read_buffer.read(num_bytes))
   305  
   306    def readline(self):
   307      # type: () -> bytes
   308  
   309      """Equivalent to standard file.readline(). Same return conventions apply."""
   310      if not self._decompressor:
   311        raise ValueError('decompressor not initialized')
   312  
   313      bytes_io = io.BytesIO()
   314      while True:
   315        # Ensure that the internal buffer has at least half the read_size. Going
   316        # with half the _read_size (as opposed to a full _read_size) to ensure
   317        # that actual fetches are more evenly spread out, as opposed to having 2
   318        # consecutive reads at the beginning of a read.
   319        self._fetch_to_internal_buffer(self._read_size // 2)
   320        line = self._read_from_internal_buffer(
   321            lambda: self._read_buffer.readline())
   322        bytes_io.write(line)
   323        if line.endswith(b'\n') or not line:
   324          break  # Newline or EOF reached.
   325  
   326      return bytes_io.getvalue()
   327  
   328    def closed(self) -> bool:
   329      return not self._file or self._file.closed
   330  
   331    def close(self) -> None:
   332      if self.readable():
   333        self._read_buffer.close()
   334  
   335      if self.writeable():
   336        assert self._compressor
   337        self._file.write(self._compressor.flush())
   338  
   339      self._file.close()
   340  
   341    def flush(self) -> None:
   342      if self.writeable():
   343        assert self._compressor
   344        self._file.write(self._compressor.flush())
   345      self._file.flush()
   346  
   347    @property
   348    def seekable(self):
   349      # type: () -> bool
   350      return 'r' in self._file.mode
   351  
   352    def _clear_read_buffer(self):
   353      # type: () -> None
   354  
   355      """Clears the read buffer by removing all the contents and
   356      resetting _read_position to 0"""
   357      self._read_position = 0
   358      self._read_buffer.seek(0)
   359      self._read_buffer.truncate(0)
   360  
   361    def _rewind_file(self):
   362      # type: () -> None
   363  
   364      """Seeks to the beginning of the input file. Input file's EOF marker
   365      is cleared and _uncompressed_position is reset to zero"""
   366      self._file.seek(0, os.SEEK_SET)
   367      self._read_eof = False
   368      self._uncompressed_position = 0
   369  
   370    def _rewind(self):
   371      # type: () -> None
   372  
   373      """Seeks to the beginning of the input file and resets the internal read
   374      buffer. The decompressor object is re-initialized to ensure that no data
   375      is left in its buffer."""
   376      self._clear_read_buffer()
   377      self._rewind_file()
   378  
   379      # Re-initialize decompressor to clear any data buffered prior to rewind
   380      self._initialize_decompressor()
   381  
   382    def seek(self, offset, whence=os.SEEK_SET):
   383      # type: (int, int) -> None
   384  
   385      """Set the file's current offset.
   386  
   387      Seeking behavior:
   388  
   389        * seeking from the end :data:`os.SEEK_END` the whole file is decompressed
   390          once to determine its size. Therefore it is preferred to use
   391          :data:`os.SEEK_SET` or :data:`os.SEEK_CUR` to avoid the processing
   392          overhead
   393        * seeking backwards from the current position rewinds the file to ``0``
   394          and decompresses the chunks to the requested offset
   395        * seeking is only supported in files opened for reading
   396      * if the new offset is out of bounds, it is adjusted to either ``0`` or
   397          ``EOF``.
   398  
   399      Args:
   400      offset (int): seek offset in the uncompressed content, expressed as a
   401        number of bytes
   402        whence (int): seek mode. Supported modes are :data:`os.SEEK_SET`
   403          (absolute seek), :data:`os.SEEK_CUR` (seek relative to the current
   404          position), and :data:`os.SEEK_END` (seek relative to the end, offset
   405          should be negative).
   406  
   407      Raises:
   408        IOError: When this buffer is closed.
   409        ValueError: When whence is invalid or the file is not seekable
   410      """
   411      if whence == os.SEEK_SET:
   412        absolute_offset = offset
   413      elif whence == os.SEEK_CUR:
   414        absolute_offset = self._uncompressed_position + offset
   415      elif whence == os.SEEK_END:
   416        # Determine and cache the uncompressed size of the file
   417        if not self._uncompressed_size:
   418          logger.warning(
   419              "Seeking relative from end of file is requested. "
   420              "Need to decompress the whole file once to determine "
   421              "its size. This might take a while...")
   422          uncompress_start_time = time.time()
   423          while self.read(self._read_size):
   424            pass
   425          uncompress_end_time = time.time()
   426          logger.warning(
   427              "Full file decompression for seek "
   428              "from end took %.2f secs",
   429              (uncompress_end_time - uncompress_start_time))
   430          self._uncompressed_size = self._uncompressed_position
   431        absolute_offset = self._uncompressed_size + offset
   432      else:
   433        raise ValueError("Whence mode %r is invalid." % whence)
   434  
   435      # Determine how many bytes need to be read before we reach
   436      # the requested offset. Rewind if we already passed the position.
   437      if absolute_offset < self._uncompressed_position:
   438        self._rewind()
   439      bytes_to_skip = absolute_offset - self._uncompressed_position
   440  
   441      # Read until the desired position is reached or EOF occurs.
   442      while bytes_to_skip:
   443        data = self.read(min(self._read_size, bytes_to_skip))
   444        if not data:
   445          break
   446        bytes_to_skip -= len(data)
   447  
   448    def tell(self):
   449      # type: () -> int
   450  
   451      """Returns current position in uncompressed file."""
   452      return self._uncompressed_position
   453  
   454    def __enter__(self):
   455      return self
   456  
   457    def __exit__(self, exception_type, exception_value, traceback):
   458      self.close()
   459  
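# A minimal, self-contained usage sketch for CompressedFile (the temporary
# file and helper name are illustrative assumptions, not part of this module):
def _example_compressed_file_roundtrip():
  import gzip
  import tempfile

  with tempfile.NamedTemporaryFile(suffix='.gz', delete=False) as tmp:
    tmp.write(gzip.compress(b'hello\nworld\n'))
    name = tmp.name

  with open(name, 'rb') as raw:
    compressed = CompressedFile(raw, CompressionTypes.GZIP)
    assert compressed.readline() == b'hello\n'
    compressed.seek(0)  # Seeking backwards rewinds and re-decompresses.
    assert compressed.read(1024) == b'hello\nworld\n'
    assert compressed.tell() == 12
  os.remove(name)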
   460  
   461  class FileMetadata(object):
   462    """Metadata about a file path that is the output of FileSystem.match.
   463  
   464    Fields:
   465      path: [Required] file path.
   466      size_in_bytes: [Required] file size in bytes.
   467      last_updated_in_seconds: [Optional] last modified timestamp of the file,
   468        or 0.0 if not specified.
   469    """
   470    def __init__(
   471        self,
   472        path: str,
   473        size_in_bytes: int,
   474        last_updated_in_seconds: float = 0.0):
   475      assert isinstance(path, str) and path, "Path should be a string"
   476      assert isinstance(size_in_bytes, int) and size_in_bytes >= 0, \
   477          "Invalid value for size_in_bytes: %s (of type %s)" % (
   478              size_in_bytes, type(size_in_bytes))
   479      self.path = path
   480      self.size_in_bytes = size_in_bytes
   481      self.last_updated_in_seconds = last_updated_in_seconds
   482  
   483    def __eq__(self, other):
   484      """Note: This is only used in tests where we verify that mock objects match.
   485      """
   486      return (
   487          isinstance(other, FileMetadata) and self.path == other.path and
   488          self.size_in_bytes == other.size_in_bytes and
   489          self.last_updated_in_seconds == other.last_updated_in_seconds)
   490  
   491    def __hash__(self):
   492      return hash((self.path, self.size_in_bytes, self.last_updated_in_seconds))
   493  
   494    def __repr__(self):
   495      if self.last_updated_in_seconds == 0.0:
   496        return 'FileMetadata(%s, %s)' % (self.path, self.size_in_bytes)
   497      else:
   498        return 'FileMetadata(%s, %s, %s)' % (
   499            self.path, self.size_in_bytes, self.last_updated_in_seconds)
   500  
   501  
   502  class MatchResult(object):
   503    """Result from the ``FileSystem`` match operation which contains the list
   504     of matched ``FileMetadata``.
   505    """
   506    def __init__(self, pattern, metadata_list):
   507      # type: (str, List[FileMetadata]) -> None
   508      self.metadata_list = metadata_list
   509      self.pattern = pattern
   510  
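# An illustrative sketch (paths are made up) of the value types produced by
# FileSystem.match: each MatchResult pairs a pattern with its FileMetadata.
def _example_match_result():
  metadata = FileMetadata('gs://bucket/data.txt', 1024)
  result = MatchResult('gs://bucket/*.txt', [metadata])
  assert result.metadata_list[0].size_in_bytes == 1024
  assert repr(metadata) == 'FileMetadata(gs://bucket/data.txt, 1024)'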
   511  
   512  class BeamIOError(IOError):
   513    def __init__(self, msg, exception_details=None):
   514      """Class representing the errors thrown in the batch file operations.
   515      Args:
   516        msg: Message string for the exception thrown
   517        exception_details: Optional map of individual input to exception for
   518          failed operations in batch. This parameter is optional so if specified
   519          the user can assume that all errors in the filesystem operation
   520          have been reported. When the details are missing then the operation
   521          may have failed anywhere so the user should use match to determine
   522          the current state of the system.
   523      """
   524      message = "%s with exceptions %s" % (msg, exception_details)
   525      super().__init__(message)
   526      self.exception_details = exception_details
   527  
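# A hedged sketch of how batch operations surface partial failures: callers
# can inspect exception_details to see which inputs failed (the path below
# is a made-up example):
def _example_beam_io_error_details():
  failures = {'gs://bucket/missing.txt': IOError('not found')}
  try:
    raise BeamIOError('Delete operation failed', failures)
  except BeamIOError as err:
    for path, cause in err.exception_details.items():
      print(path, cause)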
   528  
   529  class FileSystem(BeamPlugin, metaclass=abc.ABCMeta):
   530    """A class that defines the functions that can be performed on a filesystem.
   531  
   532    All methods are abstract and they are for file system providers to
   533    implement. Clients should use the FileSystems class to interact with
   534    the correct file system based on the provided file pattern scheme.
   535    """
   536    CHUNK_SIZE = 1  # Chunk size in the batch operations
   537  
   538    def __init__(self, pipeline_options):
   539      """
   540      Args:
   541        pipeline_options: Instance of ``PipelineOptions`` or dict of options and
   542          values (like ``RuntimeValueProvider.runtime_options``).
   543      """
   544  
   545    @staticmethod
   546    def _get_compression_type(path, compression_type):
   547      if compression_type == CompressionTypes.AUTO:
   548        compression_type = CompressionTypes.detect_compression_type(path)
   549      elif not CompressionTypes.is_valid_compression_type(compression_type):
   550        raise TypeError(
   551          'compression_type must be a CompressionType object but '
   552            'was %s' % type(compression_type))
   553      return compression_type
   554  
   555    @classmethod
   556    def scheme(cls):
   557      """URI scheme for the FileSystem
   558      """
   559      raise NotImplementedError
   560  
   561    @abc.abstractmethod
   562    def join(self, basepath, *paths):
   563      # type: (str, *str) -> str
   564  
   565      """Join two or more pathname components for the filesystem
   566  
   567      Args:
   568        basepath: string path of the first component of the path
   569        paths: path components to be added
   570  
   571      Returns: full path after combining all the passed components
   572      """
   573      raise NotImplementedError
   574  
   575    @abc.abstractmethod
   576    def split(self, path):
   577      # type: (str) -> Tuple[str, str]
   578  
   579      """Splits the given path into two parts.
   580  
   581      Splits the path into a pair (head, tail) such that tail contains the last
   582      component of the path and head contains everything up to that.
   583  
   584      For file-systems other than the local file-system, head should include the
   585      prefix.
   586  
   587      Args:
   588        path: path as a string
   589      Returns:
   590        a pair of path components as strings.
   591      """
   592      raise NotImplementedError
   593  
   594    @abc.abstractmethod
   595    def mkdirs(self, path):
   596      """Recursively create directories for the provided path.
   597  
   598      Args:
   599        path: string path of the directory structure that should be created
   600  
   601      Raises:
   602        IOError: if leaf directory already exists.
   603      """
   604      raise NotImplementedError
   605  
   606    @abc.abstractmethod
   607    def has_dirs(self):
   608      """Whether this FileSystem supports directories."""
   609      raise NotImplementedError
   610  
   611    @abc.abstractmethod
   612    def _list(self, dir_or_prefix):
   613      """List files in a location.
   614  
   615      Listing is non-recursive (for filesystems that support directories).
   616  
   617      Args:
   618        dir_or_prefix: (string) A directory or location prefix (for filesystems
   619          that don't have directories).
   620  
   621      Returns:
   622        Generator of ``FileMetadata`` objects.
   623  
   624      Raises:
   625        ``BeamIOError``: if listing fails, but not if no files were found.
   626      """
   627      raise NotImplementedError
   628  
   629    @staticmethod
   630    def _split_scheme(url_or_path):
   631      match = re.match(r'(^[a-z]+)://(.*)', url_or_path)
   632      if match is not None:
   633        return match.groups()
   634      return None, url_or_path
   635  
   636    @staticmethod
   637    def _combine_scheme(scheme, path):
   638      if scheme is None:
   639        return path
   640      return '{}://{}'.format(scheme, path)
   641  
   642    def _url_dirname(self, url_or_path):
   643      """Like posixpath.dirname, but preserves scheme:// prefix.
   644  
   645      Args:
   646        url_or_path: A string in the form of scheme://some/path OR /some/path.
   647      """
   648      scheme, path = self._split_scheme(url_or_path)
   649      return self._combine_scheme(scheme, posixpath.dirname(path))
   650  
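  # Doctest-style sketch of the scheme helpers above (illustrative, not part
  # of the original file):
  #
  #   >>> FileSystem._split_scheme('gs://bucket/path/to/file')
  #   ('gs', 'bucket/path/to/file')
  #   >>> FileSystem._split_scheme('/local/path')
  #   (None, '/local/path')
  #   >>> FileSystem._combine_scheme('gs', 'bucket/path')
  #   'gs://bucket/path'
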
   651    def match_files(self, file_metas, pattern):
   652      # type: (List[FileMetadata], str) -> Iterator[FileMetadata]
   653  
   654      """Filter :class:`FileMetadata` objects by *pattern*
   655  
   656      Args:
   657        file_metas (list of :class:`FileMetadata`):
   658          Files to consider when matching
   659        pattern (str): File pattern
   660  
   661      See Also:
   662        :meth:`translate_pattern`
   663  
   664      Returns:
   665        Generator of matching :class:`FileMetadata`
   666      """
   667      re_pattern = re.compile(self.translate_pattern(pattern))
   668      match = re_pattern.match
   669      for file_metadata in file_metas:
   670        if match(file_metadata.path):
   671          yield file_metadata
   672  
   673    @staticmethod
   674    def translate_pattern(pattern):
   675      # type: (str) -> str
   676  
   677      """
   678      Translate a *pattern* to a regular expression.
   679      There is no way to quote meta-characters.
   680  
   681      Pattern syntax:
   682        The pattern syntax is based on the fnmatch_ syntax, with the following
   683        differences:
   684  
   685        -   ``*`` Is equivalent to ``[^/\\]*`` rather than ``.*``.
   686        -   ``**`` Is equivalent to ``.*``.
   687  
   688      See also:
   689        :meth:`match` uses this method
   690  
   691      This method is based on `Python 2.7's fnmatch.translate`_.
   692      The code in this method is licensed under
   693      PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2.
   694  
   695      .. _`fnmatch`: https://docs.python.org/2/library/fnmatch.html
   696  
   697      .. _`Python 2.7's fnmatch.translate`: https://github.com/python/cpython\
   698  /blob/170ea8ccd4235d28538ab713041502d07ad1cacd/Lib/fnmatch.py#L85-L120
   699      """
   700      i, n = 0, len(pattern)
   701      res = ''
   702      while i < n:
   703        c = pattern[i]
   704        i = i + 1
   705        if c == '*':
   706          # One char lookahead for "**"
   707          if i < n and pattern[i] == "*":
   708            res = res + '.*'
   709            i = i + 1
   710          else:
   711            res = res + r'[^/\\]*'
   712        elif c == '?':
   713          res = res + '.'
   714        elif c == '[':
   715          j = i
   716          if j < n and pattern[j] == '!':
   717            j = j + 1
   718          if j < n and pattern[j] == ']':
   719            j = j + 1
   720          while j < n and pattern[j] != ']':
   721            j = j + 1
   722          if j >= n:
   723            res = res + r'\['
   724          else:
   725            stuff = pattern[i:j].replace('\\', '\\\\')
   726            i = j + 1
   727            if stuff[0] == '!':
   728              stuff = '^' + stuff[1:]
   729            elif stuff[0] == '^':
   730              stuff = '\\' + stuff
   731            res = '%s[%s]' % (res, stuff)
   732        else:
   733          res = res + re.escape(c)
   734  
   735      logger.debug('translate_pattern: %r -> %r', pattern, res)
   736      return r'(?ms)' + res + r'\Z'
   737  
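  # Doctest-style sketch of translate_pattern (illustrative, not part of the
  # original file): '*' stops at path separators while '**' crosses them.
  #
  #   >>> regex = FileSystem.translate_pattern('gs://bucket/*.txt')
  #   >>> bool(re.match(regex, 'gs://bucket/a.txt'))
  #   True
  #   >>> bool(re.match(regex, 'gs://bucket/dir/a.txt'))
  #   False
  #   >>> regex2 = FileSystem.translate_pattern('gs://bucket/**.txt')
  #   >>> bool(re.match(regex2, 'gs://bucket/dir/a.txt'))
  #   True
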
   738    def match(self, patterns, limits=None):
   739      """Find all matching paths to the patterns provided.
   740  
   741      See Also:
   742        :meth:`translate_pattern`
   743  
   744      Patterns ending with '/' or '\\' will have '*' appended.
   745  
   746      Args:
   747        patterns: list of string for the file path pattern to match against
   748        limits: list of maximum number of matches to fetch, one per pattern
   749  
   750      Returns: list of ``MatchResult`` objects.
   751  
   752      Raises:
   753        ``BeamIOError``: if any of the pattern match operations fail
   754      """
   755      if limits is None:
   756        limits = [None] * len(patterns)
   757      else:
   758        err_msg = "Patterns and limits should be equal in length"
   759        assert len(patterns) == len(limits), err_msg
   760  
   761      def _match(pattern, limit):
   762        """Find all matching paths to the pattern provided."""
   763        if pattern.endswith('/') or pattern.endswith('\\'):
   764          pattern += '*'
   765        # Get the part of the pattern before the first globbing character.
   766        # For example scheme://path/foo* will become scheme://path/foo for
   767        # filesystems like GCS, or converted to scheme://path for filesystems with
   768        # directories.
   769        prefix_or_dir = re.match('^[^[*?]*', pattern).group(0)
   770  
   771        file_metadatas = []
   772        if prefix_or_dir == pattern:
   773          # Short-circuit calling self.list() if there's no glob pattern to match.
   774          if self.exists(pattern):
   775            file_metadatas = [self.metadata(pattern)]
   776        else:
   777          if self.has_dirs():
   778            prefix_dirname = self._url_dirname(prefix_or_dir)
   779            if not prefix_dirname == prefix_or_dir:
   780              logger.debug(
   781                  "Changed prefix_or_dir %r -> %r", prefix_or_dir, prefix_dirname)
   782              prefix_or_dir = prefix_dirname
   783  
   784          logger.debug("Listing files in %r", prefix_or_dir)
   785          file_metadatas = self._list(prefix_or_dir)
   786  
   787        metadata_list = []
   788        for file_metadata in self.match_files(file_metadatas, pattern):
   789          if limit is not None and len(metadata_list) >= limit:
   790            break
   791          metadata_list.append(file_metadata)
   792  
   793        return MatchResult(pattern, metadata_list)
   794  
   795      exceptions = {}
   796      result = []
   797      for pattern, limit in zip(patterns, limits):
   798        try:
   799          result.append(_match(pattern, limit))
   800        except Exception as e:  # pylint: disable=broad-except
   801          exceptions[pattern] = e
   802  
   803      if exceptions:
   804        raise BeamIOError("Match operation failed", exceptions)
   805      return result
   806  
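  # Illustrative sketch of the prefix extraction used by _match above: the
  # regex keeps everything before the first globbing character.
  #
  #   >>> re.match('^[^[*?]*', 'gs://path/foo*').group(0)
  #   'gs://path/foo'
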
   807    @abc.abstractmethod
   808    def create(
   809        self,
   810        path,
   811        mime_type='application/octet-stream',
   812        compression_type=CompressionTypes.AUTO):
   813      # type: (...) -> BinaryIO
   814  
   815      """Returns a write channel for the given file path.
   816  
   817      Args:
   818        path: string path of the file object to be written to the system
   819        mime_type: MIME type to specify the type of content in the file object
   820        compression_type: Type of compression to be used for this object
   821  
   822      Returns: file handle with a close function for the user to use
   823      """
   824      raise NotImplementedError
   825  
   826    @abc.abstractmethod
   827    def open(
   828        self,
   829        path,
   830        mime_type='application/octet-stream',
   831        compression_type=CompressionTypes.AUTO):
   832      # type: (...) -> BinaryIO
   833  
   834      """Returns a read channel for the given file path.
   835  
   836      Args:
   837        path: string path of the file object to be read
   838        mime_type: MIME type to specify the type of content in the file object
   839        compression_type: Type of compression to be used for this object
   840  
   841      Returns: file handle with a close function for the user to use
   842      """
   843      raise NotImplementedError
   844  
   845    @abc.abstractmethod
   846    def copy(self, source_file_names, destination_file_names):
   847      """Recursively copy the file tree from the source to the destination
   848  
   849      Args:
   850        source_file_names: list of source file paths that need to be copied
   851        destination_file_names: list of destination paths for the new objects
   852  
   853      Raises:
   854        ``BeamIOError``: if any of the copy operations fail
   855      """
   856      raise NotImplementedError
   857  
   858    @abc.abstractmethod
   859    def rename(self, source_file_names, destination_file_names):
   860      """Rename the files at the source list to the destination list.
   861      Source and destination lists should be of the same size.
   862  
   863      Args:
   864        source_file_names: List of file paths that need to be moved
   865        destination_file_names: List of destination_file_names for the files
   866  
   867      Raises:
   868        ``BeamIOError``: if any of the rename operations fail
   869      """
   870      raise NotImplementedError
   871  
   872    @abc.abstractmethod
   873    def exists(self, path):
   874      # type: (str) -> bool
   875  
   876      """Check if the provided path exists on the FileSystem.
   877  
   878      Args:
   879        path: string path that needs to be checked.
   880  
   881      Returns: boolean flag indicating if path exists
   882      """
   883      raise NotImplementedError
   884  
   885    @abc.abstractmethod
   886    def size(self, path):
   887      # type: (str) -> int
   888  
   889      """Get size in bytes of a file on the FileSystem.
   890  
   891      Args:
   892        path: string filepath of file.
   893  
   894      Returns: int size of file according to the FileSystem.
   895  
   896      Raises:
   897        ``BeamIOError``: if path doesn't exist.
   898      """
   899      raise NotImplementedError
   900  
   901    @abc.abstractmethod
   902    def last_updated(self, path):
   903      """Get a file's last-updated time as UNIX Epoch seconds on the FileSystem.
   904  
   905      Args:
   906        path: string path of file.
   907  
   908      Returns: float UNIX Epoch time
   909  
   910      Raises:
   911        ``BeamIOError``: if path doesn't exist.
   912      """
   913      raise NotImplementedError
   914  
   915    def checksum(self, path):
   916      """Fetch checksum metadata of a file on the
   917      :class:`~apache_beam.io.filesystem.FileSystem`.
   918  
   919      This operation returns checksum metadata as stored in the underlying
   920      FileSystem. It should not need to read file data to obtain this value.
   921      Checksum type and format are FileSystem dependent and are not compatible
   922      between FileSystems.
   923      FileSystem implementations may return file size if a checksum isn't
   924      available.
   925  
   926      Args:
   927        path: string path of a file.
   928  
   929      Returns: string containing checksum
   930  
   931      Raises:
   932        ``BeamIOError``: if path isn't a file or doesn't exist.
   933      """
   934      raise NotImplementedError
   935  
   936    @abc.abstractmethod
   937    def metadata(self, path):
   938      """Fetch metadata of a file on the
   939      :class:`~apache_beam.io.filesystem.FileSystem`.
   940  
   941      This operation returns metadata as stored in the underlying
   942      FileSystem. It should not need to read file data to obtain this value.
   943      For web based file systems, this method should also incur as few
   944      requests as possible.
   945  
   946      Args:
   947        path: string path of a file.
   948  
   949      Returns:
   950        :class:`~apache_beam.io.filesystem.FileMetadata`.
   951  
   952      Raises:
   953        ``BeamIOError``: if path isn't a file or doesn't exist.
   954      """
   955      raise NotImplementedError
   956  
   957    @abc.abstractmethod
   958    def delete(self, paths):
   959      """Deletes files or directories at the provided paths.
   960      Directories will be deleted recursively.
   961  
   962      Args:
   963        paths: list of paths that give the file objects to be deleted
   964  
   965      Raises:
   966        ``BeamIOError``: if any of the delete operations fail
   967      """
   968      raise NotImplementedError
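

# To make the abstract contract above concrete, here is a deliberately tiny
# in-memory FileSystem sketch (every name below is an illustrative assumption,
# not part of Beam); error handling, directories, and compression are omitted:
class _InMemoryFileSystem(FileSystem):
  """Toy filesystem backed by a dict mapping path -> bytes."""
  _store = {}  # Shared key/value store standing in for a real backend.

  @classmethod
  def scheme(cls):
    return 'mem'

  def join(self, basepath, *paths):
    return posixpath.join(basepath, *paths)

  def split(self, path):
    return posixpath.split(path)

  def mkdirs(self, path):
    pass  # No real directories in a flat key space.

  def has_dirs(self):
    return False

  def _list(self, dir_or_prefix):
    for path, data in self._store.items():
      if path.startswith(dir_or_prefix):
        yield FileMetadata(path, len(data))

  def create(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    # A real implementation returns a write channel that persists on close.
    self._store[path] = b''
    return io.BytesIO()

  def open(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    return io.BytesIO(self._store[path])

  def copy(self, source_file_names, destination_file_names):
    for src, dst in zip(source_file_names, destination_file_names):
      self._store[dst] = self._store[src]

  def rename(self, source_file_names, destination_file_names):
    self.copy(source_file_names, destination_file_names)
    self.delete(source_file_names)

  def exists(self, path):
    return path in self._store

  def size(self, path):
    return len(self._store[path])

  def last_updated(self, path):
    return 0.0

  def metadata(self, path):
    return FileMetadata(path, len(self._store[path]))

  def delete(self, paths):
    for path in paths:
      self._store.pop(path, None)


# Example use of the toy filesystem (illustrative):
#   fs = _InMemoryFileSystem(pipeline_options=None)
#   _InMemoryFileSystem._store['mem://bucket/a.txt'] = b'data'
#   [result] = fs.match(['mem://bucket/*.txt'])
#   [m.path for m in result.metadata_list]  # ['mem://bucket/a.txt']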