github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/gcsfilesystem.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""GCS file system implementation for accessing files on GCS.

**Updates to the I/O connector code**

For any significant updates to this I/O connector, please consider involving
corresponding code reviewers mentioned in
https://github.com/apache/beam/blob/master/sdks/python/OWNERS
"""

# pytype: skip-file

from typing import BinaryIO  # pylint: disable=unused-import

from apache_beam.io.filesystem import BeamIOError
from apache_beam.io.filesystem import CompressedFile
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.filesystem import FileMetadata
from apache_beam.io.filesystem import FileSystem
from apache_beam.io.gcp import gcsio

__all__ = ['GCSFileSystem']


class GCSFileSystem(FileSystem):
  """A GCS ``FileSystem`` implementation for accessing files on GCS.
  """

  CHUNK_SIZE = gcsio.MAX_BATCH_OPERATION_SIZE  # Chunk size in batch operations
  GCS_PREFIX = 'gs://'

  def __init__(self, pipeline_options):
    super().__init__(pipeline_options)
    self._pipeline_options = pipeline_options

  @classmethod
  def scheme(cls):
    """URI scheme for the FileSystem
    """
    return 'gs'

  def join(self, basepath, *paths):
    """Join two or more pathname components for the filesystem

    Args:
      basepath: string path of the first component of the path
      paths: path components to be added

    Returns: full path after combining all the passed components
    """
    if not basepath.startswith(GCSFileSystem.GCS_PREFIX):
      raise ValueError('Basepath %r must be GCS path.' % basepath)
    path = basepath
    for p in paths:
      path = path.rstrip('/') + '/' + p.lstrip('/')
    return path

  def split(self, path):
    """Splits the given path into two parts.

    Splits the path into a pair (head, tail) such that tail contains the last
    component of the path and head contains everything up to that.

    Head will include the GCS prefix ('gs://').

    Args:
      path: path as a string
    Returns:
      a pair of path components as strings.
    """
    path = path.strip()
    if not path.startswith(GCSFileSystem.GCS_PREFIX):
      raise ValueError('Path %r must be GCS path.' % path)

    prefix_len = len(GCSFileSystem.GCS_PREFIX)
    last_sep = path[prefix_len:].rfind('/')
    if last_sep >= 0:
      last_sep += prefix_len

    if last_sep > 0:
      return (path[:last_sep], path[last_sep + 1:])
    elif last_sep < 0:
      return (path, '')
    else:
      raise ValueError('Invalid path: %s' % path)

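  # A minimal usage sketch for the path helpers above, assuming a hypothetical
  # bucket 'my-bucket' and a configured ``pipeline_options`` object:
  #
  #   fs = GCSFileSystem(pipeline_options=pipeline_options)
  #   fs.join('gs://my-bucket/dir', 'file.txt')
  #   # -> 'gs://my-bucket/dir/file.txt'
  #   fs.split('gs://my-bucket/dir/file.txt')
  #   # -> ('gs://my-bucket/dir', 'file.txt')
  #   fs.split('gs://my-bucket')
  #   # -> ('gs://my-bucket', '')
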
  def mkdirs(self, path):
    """Recursively create directories for the provided path.

    Args:
      path: string path of the directory structure that should be created

    Raises:
      IOError: if leaf directory already exists.
    """
    pass

  def has_dirs(self):
    """Whether this FileSystem supports directories."""
    return False

  def _list(self, dir_or_prefix):
    """List files in a location.

    Listing is non-recursive, for filesystems that support directories.

    Args:
      dir_or_prefix: (string) A directory or location prefix (for filesystems
        that don't have directories).

    Returns:
      Generator of ``FileMetadata`` objects.

    Raises:
      ``BeamIOError``: if listing fails, but not if no files were found.
    """
    try:
      for path, (size, updated) in self._gcsIO().list_files(dir_or_prefix,
                                                            with_metadata=True):
        yield FileMetadata(path, size, updated)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("List operation failed", {dir_or_prefix: e})

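  # Illustrative listing sketch through the public ``match`` API (defined on
  # the ``FileSystem`` base class), which ultimately drives ``_list``; the
  # bucket and prefix below are hypothetical:
  #
  #   for metadata in fs.match(['gs://my-bucket/staging/*'])[0].metadata_list:
  #     print(metadata.path, metadata.size_in_bytes)
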
  def _gcsIO(self):
    return gcsio.GcsIO(pipeline_options=self._pipeline_options)

  def _path_open(
      self,
      path,
      mode,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    """Helper function to open a file in the provided mode.
    """
    compression_type = FileSystem._get_compression_type(path, compression_type)
    mime_type = CompressionTypes.mime_type(compression_type, mime_type)
    raw_file = self._gcsIO().open(path, mode, mime_type=mime_type)
    if compression_type == CompressionTypes.UNCOMPRESSED:
      return raw_file
    return CompressedFile(raw_file, compression_type=compression_type)

  def create(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    # type: (...) -> BinaryIO

    """Returns a write channel for the given file path.

    Args:
      path: string path of the file object to be written to the system
      mime_type: MIME type to specify the type of content in the file object
      compression_type: Type of compression to be used for this object

    Returns: file handle with a close function for the user to use
    """
    return self._path_open(path, 'wb', mime_type, compression_type)

  def open(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    # type: (...) -> BinaryIO

    """Returns a read channel for the given file path.

    Args:
      path: string path of the file object to be read from the system
      mime_type: MIME type to specify the type of content in the file object
      compression_type: Type of compression to be used for this object

    Returns: file handle with a close function for the user to use
    """
    return self._path_open(path, 'rb', mime_type, compression_type)

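  # A short write/read sketch with a hypothetical object path; under
  # CompressionTypes.AUTO the '.gz' suffix selects gzip transparently:
  #
  #   f = fs.create('gs://my-bucket/out/data.txt.gz')
  #   f.write(b'hello world\n')
  #   f.close()
  #
  #   f = fs.open('gs://my-bucket/out/data.txt.gz')
  #   data = f.read()  # decompressed bytes
  #   f.close()
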
  def copy(self, source_file_names, destination_file_names):
    """Recursively copy the file tree from the source to the destination

    Args:
      source_file_names: list of source file objects that need to be copied
      destination_file_names: list of destination paths for the new objects

    Raises:
      ``BeamIOError``: if any of the copy operations fail
    """
    err_msg = (
        "source_file_names and destination_file_names should "
        "be equal in length")
    assert len(source_file_names) == len(destination_file_names), err_msg

    def _copy_path(source, destination):
      """Recursively copy the file tree from the source to the destination
      """
      if not destination.startswith(GCSFileSystem.GCS_PREFIX):
        raise ValueError('Destination %r must be GCS path.' % destination)
      # Use copy_tree if the path ends with / as it is a directory
      if source.endswith('/'):
        self._gcsIO().copytree(source, destination)
      else:
        self._gcsIO().copy(source, destination)

    exceptions = {}
    for source, destination in zip(source_file_names, destination_file_names):
      try:
        _copy_path(source, destination)
      except Exception as e:  # pylint: disable=broad-except
        exceptions[(source, destination)] = e

    if exceptions:
      raise BeamIOError("Copy operation failed", exceptions)

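  # Copy sketch with hypothetical paths: a trailing '/' marks the source as a
  # "directory", so the whole tree is copied via ``copytree``:
  #
  #   fs.copy(['gs://my-bucket/a.txt'], ['gs://my-bucket/backup/a.txt'])
  #   fs.copy(['gs://my-bucket/staging/'], ['gs://my-bucket/archive/'])
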
  def rename(self, source_file_names, destination_file_names):
    """Rename the files at the source list to the destination list.
    Source and destination lists should be of the same size.

    Args:
      source_file_names: List of file paths that need to be moved
      destination_file_names: List of destination paths for the files

    Raises:
      ``BeamIOError``: if any of the rename operations fail
    """
    err_msg = (
        "source_file_names and destination_file_names should "
        "be equal in length")
    assert len(source_file_names) == len(destination_file_names), err_msg

    gcs_batches = []
    gcs_current_batch = []
    for src, dest in zip(source_file_names, destination_file_names):
      gcs_current_batch.append((src, dest))
      if len(gcs_current_batch) == self.CHUNK_SIZE:
        gcs_batches.append(gcs_current_batch)
        gcs_current_batch = []
    if gcs_current_batch:
      gcs_batches.append(gcs_current_batch)

    # Execute GCS renames if any and return exceptions.
    exceptions = {}
    for batch in gcs_batches:
      copy_statuses = self._gcsIO().copy_batch(batch)
      copy_succeeded = []
      for src, dest, exception in copy_statuses:
        if exception:
          exceptions[(src, dest)] = exception
        else:
          copy_succeeded.append((src, dest))
      delete_batch = [src for src, dest in copy_succeeded]
      delete_statuses = self._gcsIO().delete_batch(delete_batch)
      for i, (src, exception) in enumerate(delete_statuses):
        dest = copy_succeeded[i][1]
        if exception:
          exceptions[(src, dest)] = exception

    if exceptions:
      raise BeamIOError("Rename operation failed", exceptions)

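  # ``rename`` above is implemented as a batched copy-then-delete: source/
  # destination pairs are grouped into batches of at most CHUNK_SIZE, each
  # batch is copied with ``copy_batch``, and only successfully copied sources
  # are removed with ``delete_batch``. Sketch with hypothetical paths:
  #
  #   fs.rename(['gs://my-bucket/tmp/part-0'], ['gs://my-bucket/final/part-0'])
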
  def exists(self, path):
    """Check if the provided path exists on the FileSystem.

    Args:
      path: string path that needs to be checked.

    Returns: boolean flag indicating if path exists
    """
    return self._gcsIO().exists(path)

  def size(self, path):
    """Get size of path on the FileSystem.

    Args:
      path: string path in question.

    Returns: int size of path according to the FileSystem.

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    return self._gcsIO().size(path)

  def last_updated(self, path):
    """Get UNIX Epoch time in seconds on the FileSystem.

    Args:
      path: string path of file.

    Returns: float UNIX Epoch time

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    return self._gcsIO().last_updated(path)

  def checksum(self, path):
    """Fetch checksum metadata of a file on the
    :class:`~apache_beam.io.filesystem.FileSystem`.

    Args:
      path: string path of a file.

    Returns: string containing checksum

    Raises:
      ``BeamIOError``: if path isn't a file or doesn't exist.
    """
    try:
      return self._gcsIO().checksum(path)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("Checksum operation failed", {path: e})

  def metadata(self, path):
    """Fetch metadata fields of a file on the FileSystem.

    Args:
      path: string path of a file.

    Returns:
      :class:`~apache_beam.io.filesystem.FileMetadata`.

    Raises:
      ``BeamIOError``: if path isn't a file or doesn't exist.
    """
    try:
      file_metadata = self._gcsIO()._status(path)
      return FileMetadata(
          path, file_metadata['size'], file_metadata['last_updated'])
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("Metadata operation failed", {path: e})

  def delete(self, paths):
    """Deletes files or directories at the provided paths.
    Directories will be deleted recursively.

    Args:
      paths: list of paths that give the file objects to be deleted
    """
    def _delete_path(path):
      """Recursively delete the file or directory at the provided path.
      """
      if path.endswith('/'):
        path_to_use = path + '*'
      else:
        path_to_use = path
      match_result = self.match([path_to_use])[0]
      statuses = self._gcsIO().delete_batch(
          [m.path for m in match_result.metadata_list])
      # pylint: disable=used-before-assignment
      failures = [e for (_, e) in statuses if e is not None]
      if failures:
        raise failures[0]

    exceptions = {}
    for path in paths:
      try:
        _delete_path(path)
      except Exception as e:  # pylint: disable=broad-except
        exceptions[path] = e

    if exceptions:
      raise BeamIOError("Delete operation failed", exceptions)
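
# A minimal end-to-end sketch, assuming application-default GCP credentials and
# a hypothetical bucket 'my-bucket':
#
#   from apache_beam.options.pipeline_options import PipelineOptions
#   from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
#
#   fs = GCSFileSystem(pipeline_options=PipelineOptions())
#   if fs.exists('gs://my-bucket/data.csv'):
#     print(fs.size('gs://my-bucket/data.csv'))
#     print(fs.checksum('gs://my-bucket/data.csv'))
#   fs.delete(['gs://my-bucket/old-output/'])  # trailing '/' deletes the tree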