github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/filesystems.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """FileSystems interface class for accessing the correct filesystem"""
    19  
    20  # pytype: skip-file
    21  
    22  import re
    23  from typing import BinaryIO  # pylint: disable=unused-import
    24  
    25  from apache_beam.io.filesystem import BeamIOError
    26  from apache_beam.io.filesystem import CompressionTypes
    27  from apache_beam.io.filesystem import FileSystem
    28  from apache_beam.options.value_provider import RuntimeValueProvider
    29  
# All filesystem implementations should be added here as
# best-effort imports. We don't want to force loading
# a module if the user doesn't supply the correct
# packages that these filesystems rely on.
    34  #
    35  # pylint: disable=wrong-import-position, unused-import
    36  try:
    37    from apache_beam.io.hadoopfilesystem import HadoopFileSystem
    38  except ImportError:
    39    pass
    40  
    41  try:
    42    from apache_beam.io.localfilesystem import LocalFileSystem
    43  except ImportError:
    44    pass
    45  
    46  try:
    47    from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
    48  except ImportError:
    49    pass
    50  
    51  try:
    52    from apache_beam.io.aws.s3filesystem import S3FileSystem
    53  except ImportError:
    54    pass
    55  
    56  try:
    57    from apache_beam.io.azure.blobstoragefilesystem import BlobStorageFileSystem
    58  except ImportError:
    59    pass
    60  
    61  # pylint: enable=wrong-import-position, unused-import
    62  
    63  __all__ = ['FileSystems']
    64  
    65  
    66  class FileSystems(object):
    67    """A class that defines the functions that can be performed on a filesystem.
    68    All methods are static and access the underlying registered filesystems.
    69    """
  # Matches paths of the form ``scheme://...``. The named group ``scheme``
  # captures the text before ``://``; it must start with a letter and may
  # continue with letters, digits, ``-``, ``+`` and ``.``.
  URI_SCHEMA_PATTERN = re.compile('(?P<scheme>[a-zA-Z][-a-zA-Z0-9+.]*)://.*')

  # Pipeline options stored by ``set_options``; ``None`` until it is called.
  _pipeline_options = None
    73  
  @classmethod
  def set_options(cls, pipeline_options):
    """Set filesystem options.

    The value is stored in the class attribute ``_pipeline_options`` of
    ``cls``, replacing whatever was stored there before.

    Args:
      pipeline_options: Instance of ``PipelineOptions``.
    """
    cls._pipeline_options = pipeline_options
    82  
    83    @staticmethod
    84    def get_scheme(path):
    85      match_result = FileSystems.URI_SCHEMA_PATTERN.match(path.strip())
    86      if match_result is None:
    87        return None
    88      return match_result.groupdict()['scheme']
    89  
    90    @staticmethod
    91    def get_filesystem(path):
    92      # type: (str) -> FileSystems
    93  
    94      """Get the correct filesystem for the specified path
    95      """
    96      try:
    97        path_scheme = FileSystems.get_scheme(path)
    98        systems = [
    99            fs for fs in FileSystem.get_all_subclasses()
   100            if fs.scheme() == path_scheme
   101        ]
   102        if len(systems) == 0:
   103          raise ValueError(
   104              'Unable to get filesystem from specified path, please use the '
   105              'correct path or ensure the required dependency is installed, '
   106              'e.g., pip install apache-beam[gcp]. Path specified: %s' % path)
   107        elif len(systems) == 1:
   108          # Pipeline options could come either from the Pipeline itself (using
   109          # direct runner), or via RuntimeValueProvider (other runners).
   110          options = (
   111              FileSystems._pipeline_options or
   112              RuntimeValueProvider.runtime_options)
   113          return systems[0](pipeline_options=options)
   114        else:
   115          raise ValueError('Found more than one filesystem for path %s' % path)
   116      except ValueError:
   117        raise
   118      except Exception as e:
   119        raise BeamIOError('Unable to get the Filesystem', {path: e})
   120  
   121    @staticmethod
   122    def join(basepath, *paths):
   123      # type: (str, *str) -> str
   124  
   125      """Join two or more pathname components for the filesystem
   126  
   127      Args:
   128        basepath: string path of the first component of the path
   129        paths: path components to be added
   130  
   131      Returns: full path after combining all the passed components
   132      """
   133      filesystem = FileSystems.get_filesystem(basepath)
   134      return filesystem.join(basepath, *paths)
   135  
   136    @staticmethod
   137    def split(path):
   138      """Splits the given path into two parts.
   139  
   140      Splits the path into a pair (head, tail) such that tail contains the last
   141      component of the path and head contains everything up to that.
   142  
   143      For file-systems other than the local file-system, head should include the
   144      prefix.
   145  
   146      Args:
   147        path: path as a string
   148      Returns:
   149        a pair of path components as strings.
   150      """
   151      filesystem = FileSystems.get_filesystem(path)
   152      return filesystem.split(path)
   153  
   154    @staticmethod
   155    def mkdirs(path):
   156      """Recursively create directories for the provided path.
   157  
   158      Args:
   159        path: string path of the directory structure that should be created
   160  
   161      Raises:
   162        IOError: if leaf directory already exists.
   163      """
   164      filesystem = FileSystems.get_filesystem(path)
   165      return filesystem.mkdirs(path)
   166  
   167    @staticmethod
   168    def match(patterns, limits=None):
   169      """Find all matching paths to the patterns provided.
   170  
   171      Pattern matching is done using each filesystem's ``match`` method (e.g.
   172      :meth:`.filesystem.FileSystem.match`).
   173  
   174      .. note::
   175        - Depending on the :class:`.FileSystem` implementation, file listings
   176          (the ``.FileSystem._list`` method) may not be recursive.
   177        - If the file listing is not recursive, a pattern like
   178          ``scheme://path/*/foo`` will not be able to mach any files.
   179  
   180      See Also:
   181        :meth:`.filesystem.FileSystem.match`
   182  
   183      Pattern syntax:
   184        The pattern syntax is based on the fnmatch_ syntax, with the following
   185        differences:
   186  
   187        -   ``*`` Is equivalent to ``[^/\\]*`` rather than ``.*``.
   188        -   ``**`` Is equivalent to ``.*``.
   189  
   190      .. _`fnmatch`: https://docs.python.org/2/library/fnmatch.html
   191  
   192      Args:
   193        patterns: list of string for the file path pattern to match against
   194        limits: list of maximum number of responses that need to be fetched
   195  
   196      Returns: list of ``MatchResult`` objects.
   197  
   198      Raises:
   199        ``BeamIOError``: if any of the pattern match operations fail
   200      """
   201      if len(patterns) == 0:
   202        return []
   203      filesystem = FileSystems.get_filesystem(patterns[0])
   204      return filesystem.match(patterns, limits)
   205  
   206    @staticmethod
   207    def create(
   208        path,
   209        mime_type='application/octet-stream',
   210        compression_type=CompressionTypes.AUTO):
   211      # type: (...) -> BinaryIO
   212  
   213      """Returns a write channel for the given file path.
   214  
   215      Args:
   216        path: string path of the file object to be written to the system
   217        mime_type: MIME type to specify the type of content in the file object
   218        compression_type: Type of compression to be used for this object. See
   219          ``CompressionTypes`` for possible values.
   220  
   221      Returns: file handle with a ``close`` function for the user to use.
   222      """
   223      filesystem = FileSystems.get_filesystem(path)
   224      return filesystem.create(path, mime_type, compression_type)
   225  
   226    @staticmethod
   227    def open(
   228        path,
   229        mime_type='application/octet-stream',
   230        compression_type=CompressionTypes.AUTO):
   231      # type: (...) -> BinaryIO
   232  
   233      """Returns a read channel for the given file path.
   234  
   235      Args:
   236        path: string path of the file object to be written to the system
   237        mime_type: MIME type to specify the type of content in the file object
   238        compression_type: Type of compression to be used for this object. See
   239          ``CompressionTypes`` for possible values.
   240  
   241      Returns: file handle with a ``close`` function for the user to use.
   242      """
   243      filesystem = FileSystems.get_filesystem(path)
   244      return filesystem.open(path, mime_type, compression_type)
   245  
   246    @staticmethod
   247    def copy(source_file_names, destination_file_names):
   248      """Recursively copy the file list from the source to the destination
   249  
   250      Args:
   251        source_file_names: list of source file objects that needs to be copied
   252        destination_file_names: list of destination of the new object
   253  
   254      Raises:
   255        ``BeamIOError``: if any of the copy operations fail
   256      """
   257      if len(source_file_names) == 0:
   258        return
   259      filesystem = FileSystems.get_filesystem(source_file_names[0])
   260      return filesystem.copy(source_file_names, destination_file_names)
   261  
   262    @staticmethod
   263    def rename(source_file_names, destination_file_names):
   264      """Rename the files at the source list to the destination list.
   265      Source and destination lists should be of the same size.
   266  
   267      Args:
   268        source_file_names: List of file paths that need to be moved
   269        destination_file_names: List of destination_file_names for the files
   270  
   271      Raises:
   272        ``BeamIOError``: if any of the rename operations fail
   273      """
   274      if len(source_file_names) == 0:
   275        return
   276      filesystem = FileSystems.get_filesystem(source_file_names[0])
   277      return filesystem.rename(source_file_names, destination_file_names)
   278  
   279    @staticmethod
   280    def exists(path):
   281      """Check if the provided path exists on the FileSystem.
   282  
   283      Args:
   284        path: string path that needs to be checked.
   285  
   286      Returns: boolean flag indicating if path exists
   287      """
   288      filesystem = FileSystems.get_filesystem(path)
   289      return filesystem.exists(path)
   290  
   291    @staticmethod
   292    def last_updated(path):
   293      """Get UNIX Epoch time in seconds on the FileSystem.
   294  
   295      Args:
   296        path: string path of file.
   297  
   298      Returns: float UNIX Epoch time
   299  
   300      Raises:
   301        ``BeamIOError``: if path doesn't exist.
   302      """
   303      filesystem = FileSystems.get_filesystem(path)
   304      return filesystem.last_updated(path)
   305  
   306    @staticmethod
   307    def checksum(path):
   308      """Fetch checksum metadata of a file on the
   309      :class:`~apache_beam.io.filesystem.FileSystem`.
   310  
   311      This operation returns checksum metadata as stored in the underlying
   312      FileSystem. It should not read any file data. Checksum type and format are
   313      FileSystem dependent and are not compatible between FileSystems.
   314  
   315      Args:
   316        path: string path of a file.
   317  
   318      Returns: string containing checksum
   319  
   320      Raises:
   321        ``BeamIOError``: if path isn't a file or doesn't exist.
   322      """
   323      filesystem = FileSystems.get_filesystem(path)
   324      return filesystem.checksum(path)
   325  
   326    @staticmethod
   327    def delete(paths):
   328      """Deletes files or directories at the provided paths.
   329      Directories will be deleted recursively.
   330  
   331      Args:
   332        paths: list of paths that give the file objects to be deleted
   333  
   334      Raises:
   335        ``BeamIOError``: if any of the delete operations fail
   336      """
   337      if isinstance(paths, str):
   338        raise BeamIOError(
   339            'Delete passed string argument instead of list: %s' % paths)
   340      if len(paths) == 0:
   341        return
   342      filesystem = FileSystems.get_filesystem(paths[0])
   343      return filesystem.delete(paths)
   344  
   345    @staticmethod
   346    def get_chunk_size(path):
   347      """Get the correct chunk size for the FileSystem.
   348  
   349      Args:
   350        path: string path that needs to be checked.
   351  
   352      Returns: integer size for parallelization in the FS operations.
   353      """
   354      filesystem = FileSystems.get_filesystem(path)
   355      return filesystem.CHUNK_SIZE