github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/aws/s3filesystem.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""S3 file system implementation for accessing files on AWS S3."""

# pytype: skip-file

from apache_beam.io.aws import s3io
from apache_beam.io.filesystem import BeamIOError
from apache_beam.io.filesystem import CompressedFile
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.filesystem import FileMetadata
from apache_beam.io.filesystem import FileSystem

__all__ = ['S3FileSystem']

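# A minimal usage sketch, for orientation only. Callers normally go through
# the FileSystems facade, which dispatches to this class for 's3://' paths.
# The bucket and key below are made-up, and working AWS credentials for the
# underlying boto3 client are assumed:
#
#   from apache_beam.io.filesystems import FileSystems
#   if FileSystems.exists('s3://my-bucket/key.txt'):
#     with FileSystems.open('s3://my-bucket/key.txt') as f:
#       data = f.read()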
class S3FileSystem(FileSystem):
  """An S3 ``FileSystem`` implementation for accessing files on AWS S3."""

  CHUNK_SIZE = s3io.MAX_BATCH_OPERATION_SIZE
  S3_PREFIX = 's3://'

  def __init__(self, pipeline_options):
    """Initializes a connection to S3.

    Connection configuration is done by passing pipeline options.
    See :class:`~apache_beam.options.pipeline_options.S3Options`.
    """
    super().__init__(pipeline_options)
    self._options = pipeline_options

  @classmethod
  def scheme(cls):
    """URI scheme for the FileSystem."""
    return 's3'

  def join(self, basepath, *paths):
    """Join two or more pathname components for the filesystem.

    Args:
      basepath: string path of the first component of the path
      paths: path components to be added

    Returns: full path after combining all of the passed components
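
    Example (illustrative; ``fs`` denotes an ``S3FileSystem`` instance)::

      fs.join('s3://bucket/path', 'to', 'file')
      # -> 's3://bucket/path/to/file'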
    """
    if not basepath.startswith(S3FileSystem.S3_PREFIX):
      raise ValueError('Basepath %r must be an S3 path.' % basepath)

    path = basepath
    for p in paths:
      path = path.rstrip('/') + '/' + p.lstrip('/')
    return path

  def split(self, path):
    """Splits the given path into two parts.

    Splits the path into a pair (head, tail) such that tail contains the last
    component of the path and head contains everything up to that.

    Head will include the S3 prefix ('s3://').

    Args:
      path: path as a string
    Returns:
      a pair of path components as strings.
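
    Example (illustrative)::

      fs.split('s3://bucket/path/to/file')
      # -> ('s3://bucket/path/to', 'file')
      fs.split('s3://bucket')
      # -> ('s3://bucket', '')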
    """
    path = path.strip()
    if not path.startswith(S3FileSystem.S3_PREFIX):
      raise ValueError('Path %r must be an S3 path.' % path)

    prefix_len = len(S3FileSystem.S3_PREFIX)
    last_sep = path[prefix_len:].rfind('/')
    if last_sep >= 0:
      last_sep += prefix_len

    if last_sep > 0:
      return (path[:last_sep], path[last_sep + 1:])
    elif last_sep < 0:
      return (path, '')
    else:
      raise ValueError('Invalid path: %s' % path)

  def mkdirs(self, path):
    """Recursively create directories for the provided path.

    S3 has no notion of directories, so this is a no-op: objects can be
    written under any key prefix without creating the prefix first.

    Args:
      path: string path of the directory structure that should be created
    """
    pass

  def has_dirs(self):
    """Whether this FileSystem supports directories."""
    return False

  def _list(self, dir_or_prefix):
    """List files in a location.

    Listing is non-recursive, for filesystems that support directories.

    Args:
      dir_or_prefix: (string) A directory or location prefix (for filesystems
        that don't have directories).

    Returns:
      Generator of ``FileMetadata`` objects.

    Raises:
      ``BeamIOError``: if listing fails, but not if no files were found.
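
    A hedged sketch of direct use (callers normally go through ``match()``;
    the bucket and prefix are made-up)::

      for file_metadata in fs._list('s3://my-bucket/logs/'):
        print(file_metadata.path, file_metadata.size_in_bytes)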
    """
    try:
      for path, (size, updated) in s3io.S3IO(options=self._options).list_files(
          dir_or_prefix, with_metadata=True):
        yield FileMetadata(path, size, updated)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("List operation failed", {dir_or_prefix: e})

  def _path_open(
      self,
      path,
      mode,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    """Helper function to open a file in the provided mode."""
    compression_type = FileSystem._get_compression_type(path, compression_type)
    mime_type = CompressionTypes.mime_type(compression_type, mime_type)
    raw_file = s3io.S3IO(options=self._options).open(
        path, mode, mime_type=mime_type)
    if compression_type == CompressionTypes.UNCOMPRESSED:
      return raw_file
    return CompressedFile(raw_file, compression_type=compression_type)

  def create(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    """Returns a write channel for the given file path.

    Args:
      path: string path of the file object to be written to the system
      mime_type: MIME type to specify the type of content in the file object
      compression_type: Type of compression to be used for this object

    Returns: file handle with a close function for the user to use
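
    Illustrative write sketch (the path is made-up)::

      with fs.create('s3://my-bucket/out.txt') as f:
        f.write(b'hello')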
    """
    return self._path_open(path, 'wb', mime_type, compression_type)

  def open(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    """Returns a read channel for the given file path.

    Args:
      path: string path of the file object to be read from the system
      mime_type: MIME type to specify the type of content in the file object
      compression_type: Type of compression to be used for this object

    Returns: file handle with a close function for the user to use
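
    Illustrative read sketch (the path is made-up; with the default
    ``CompressionTypes.AUTO``, a recognized suffix such as ``.gz`` is
    decompressed transparently)::

      with fs.open('s3://my-bucket/logs.gz') as f:
        text = f.read()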
    """
    return self._path_open(path, 'rb', mime_type, compression_type)

  def copy(self, source_file_names, destination_file_names):
    """Recursively copy the file tree from the source to the destination.

    Args:
      source_file_names: list of source file objects that need to be copied
      destination_file_names: list of destination paths for the new objects

    Raises:
      ``BeamIOError``: if any of the copy operations fail
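
    Illustrative sketch (paths are made-up; the two lists are matched up
    pairwise)::

      fs.copy(['s3://bucket/src'], ['s3://bucket/dst'])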
    """
    if len(source_file_names) != len(destination_file_names):
      message = 'Unable to copy unequal number of sources and destinations'
      raise BeamIOError(message)
    src_dest_pairs = list(zip(source_file_names, destination_file_names))
    results = s3io.S3IO(options=self._options).copy_paths(src_dest_pairs)
    exceptions = {(src, dest): error
                  for (src, dest, error) in results if error is not None}
    if exceptions:
      raise BeamIOError("Copy operation failed", exceptions)

  def rename(self, source_file_names, destination_file_names):
    """Rename the files at the source list to the destination list.
    Source and destination lists should be of the same size.

    Args:
      source_file_names: List of file paths that need to be moved
      destination_file_names: List of destination paths for the files

    Raises:
      ``BeamIOError``: if any of the rename operations fail
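
    Illustrative sketch (paths are made-up; on partial failure the raised
    ``BeamIOError`` carries per-pair details in ``exception_details``)::

      try:
        fs.rename(['s3://bucket/a'], ['s3://bucket/b'])
      except BeamIOError as e:
        print(e.exception_details)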
    """
    if len(source_file_names) != len(destination_file_names):
      message = 'Unable to rename unequal number of sources and destinations'
      raise BeamIOError(message)
    src_dest_pairs = list(zip(source_file_names, destination_file_names))
    results = s3io.S3IO(options=self._options).rename_files(src_dest_pairs)
    exceptions = {(src, dest): error
                  for (src, dest, error) in results if error is not None}
    if exceptions:
      raise BeamIOError("Rename operation failed", exceptions)

  def exists(self, path):
    """Check if the provided path exists on the FileSystem.

    Args:
      path: string path that needs to be checked.

    Returns: boolean flag indicating if path exists
    """
    try:
      return s3io.S3IO(options=self._options).exists(path)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("exists() operation failed", {path: e})

  def size(self, path):
    """Get size of path on the FileSystem.

    Args:
      path: string path in question.

    Returns: int size of path according to the FileSystem.

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    try:
      return s3io.S3IO(options=self._options).size(path)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("size() operation failed", {path: e})

  def last_updated(self, path):
    """Get UNIX Epoch time in seconds on the FileSystem.

    Args:
      path: string path of file.

    Returns: float UNIX Epoch time

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    try:
      return s3io.S3IO(options=self._options).last_updated(path)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("last_updated operation failed", {path: e})

  def checksum(self, path):
    """Fetch checksum metadata of a file on the
    :class:`~apache_beam.io.filesystem.FileSystem`.

    Args:
      path: string path of a file.

    Returns: string containing checksum

    Raises:
      ``BeamIOError``: if path isn't a file or doesn't exist.
    """
    try:
      return s3io.S3IO(options=self._options).checksum(path)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("Checksum operation failed", {path: e})

  def metadata(self, path):
    """Fetch metadata fields of a file on the FileSystem.

    Args:
      path: string path of a file.

    Returns:
      :class:`~apache_beam.io.filesystem.FileMetadata`.

    Raises:
      ``BeamIOError``: if path isn't a file or doesn't exist.
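
    Illustrative sketch (the path is made-up; attribute names below are the
    assumed ``FileMetadata`` fields)::

      m = fs.metadata('s3://my-bucket/key.txt')
      print(m.path, m.size_in_bytes, m.last_updated_in_seconds)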
    """
    try:
      file_metadata = s3io.S3IO(options=self._options)._status(path)
      return FileMetadata(
          path, file_metadata['size'], file_metadata['last_updated'])
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("Metadata operation failed", {path: e})

  def delete(self, paths):
    """Deletes files or directories at the provided paths.
    Directories will be deleted recursively.

    Args:
      paths: list of paths that give the file objects to be deleted

    Raises:
      ``BeamIOError``: if any of the delete operations fail
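
    Illustrative sketch (paths are made-up)::

      fs.delete(['s3://bucket/a', 's3://bucket/b'])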
    """
    results = s3io.S3IO(options=self._options).delete_paths(paths)
    exceptions = {
        path: error
        for (path, error) in results.items() if error is not None
    }
    if exceptions:
      raise BeamIOError("Delete operation failed", exceptions)