github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/localfilesystem.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Local File system implementation for accessing files on disk."""
    19  
    20  # pytype: skip-file
    21  
    22  import io
    23  import os
    24  import shutil
    25  from typing import BinaryIO  # pylint: disable=unused-import
    26  
    27  from apache_beam.io.filesystem import BeamIOError
    28  from apache_beam.io.filesystem import CompressedFile
    29  from apache_beam.io.filesystem import CompressionTypes
    30  from apache_beam.io.filesystem import FileMetadata
    31  from apache_beam.io.filesystem import FileSystem
    32  
    33  __all__ = ['LocalFileSystem']
    34  
    35  
    36  class LocalFileSystem(FileSystem):
    37    """A Local ``FileSystem`` implementation for accessing files on disk.
    38    """
    39    @classmethod
    40    def scheme(cls):
    41      """URI scheme for the FileSystem
    42      """
    43      return None
    44  
    45    def join(self, basepath, *paths):
    46      """Join two or more pathname components for the filesystem
    47  
    48      Args:
    49        basepath: string path of the first component of the path
    50        paths: path components to be added
    51  
    52      Returns: full path after combining all the passed components
    53      """
    54      return os.path.join(basepath, *paths)
    55  
    56    def split(self, path):
    57      """Splits the given path into two parts.
    58  
    59      Splits the path into a pair (head, tail) such that tail contains the last
    60      component of the path and head contains everything up to that.
    61  
    62      Args:
    63        path: path as a string
    64      Returns:
    65        a pair of path components as strings.
    66      """
    67      return os.path.split(os.path.abspath(path))
    68  
    69    def mkdirs(self, path):
    70      """Recursively create directories for the provided path.
    71  
    72      Args:
    73        path: string path of the directory structure that should be created
    74  
    75      Raises:
    76        IOError: if leaf directory already exists.
    77      """
    78      try:
    79        os.makedirs(path)
    80      except OSError as err:
    81        raise IOError(err)
    82  
    83    def has_dirs(self):
    84      """Whether this FileSystem supports directories."""
    85      return True
    86  
    87    def _url_dirname(self, url_or_path):
    88      """Pass through to os.path.dirname.
    89  
    90      This version uses os.path instead of posixpath to be compatible with the
    91      host OS.
    92  
    93      Args:
    94        url_or_path: A string in the form of /some/path.
    95      """
    96      return os.path.dirname(url_or_path)
    97  
    98    def _list(self, dir_or_prefix):
    99      """List files in a location.
   100  
   101      Listing is non-recursive, for filesystems that support directories.
   102  
   103      Args:
   104        dir_or_prefix: (string) A directory or location prefix (for filesystems
   105          that don't have directories).
   106  
   107      Returns:
   108        Generator of ``FileMetadata`` objects.
   109  
   110      Raises:
   111        ``BeamIOError``: if listing fails, but not if no files were found.
   112      """
   113      if not self.exists(dir_or_prefix):
   114        return
   115  
   116      def list_files(root):
   117        for dirpath, _, files in os.walk(root):
   118          for filename in files:
   119            yield self.join(dirpath, filename)
   120  
   121      try:
   122        for f in list_files(dir_or_prefix):
   123          try:
   124            yield FileMetadata(f, os.path.getsize(f), os.path.getmtime(f))
   125          except OSError:
   126            # Files may disappear, such as when listing /tmp.
   127            pass
   128      except Exception as e:  # pylint: disable=broad-except
   129        raise BeamIOError("List operation failed", {dir_or_prefix: e})
   130  
   131    def _path_open(
   132        self,
   133        path,
   134        mode,
   135        mime_type='application/octet-stream',
   136        compression_type=CompressionTypes.AUTO):
   137      """Helper functions to open a file in the provided mode.
   138      """
   139      compression_type = FileSystem._get_compression_type(path, compression_type)
   140      raw_file = io.open(path, mode)
   141      if compression_type == CompressionTypes.UNCOMPRESSED:
   142        return raw_file
   143      else:
   144        return CompressedFile(raw_file, compression_type=compression_type)
   145  
   146    def create(
   147        self,
   148        path,
   149        mime_type='application/octet-stream',
   150        compression_type=CompressionTypes.AUTO):
   151      # type: (...) -> BinaryIO
   152  
   153      """Returns a write channel for the given file path.
   154  
   155      Args:
   156        path: string path of the file object to be written to the system
   157        mime_type: MIME type to specify the type of content in the file object
   158        compression_type: Type of compression to be used for this object
   159  
   160      Returns: file handle with a close function for the user to use
   161      """
   162      os.makedirs(os.path.dirname(path), exist_ok=True)
   163      return self._path_open(path, 'wb', mime_type, compression_type)
   164  
   165    def open(
   166        self,
   167        path,
   168        mime_type='application/octet-stream',
   169        compression_type=CompressionTypes.AUTO):
   170      # type: (...) -> BinaryIO
   171  
   172      """Returns a read channel for the given file path.
   173  
   174      Args:
   175        path: string path of the file object to be written to the system
   176        mime_type: MIME type to specify the type of content in the file object
   177        compression_type: Type of compression to be used for this object
   178  
   179      Returns: file handle with a close function for the user to use
   180      """
   181      return self._path_open(path, 'rb', mime_type, compression_type)
   182  
   183    def copy(self, source_file_names, destination_file_names):
   184      """Recursively copy the file tree from the source to the destination
   185  
   186      Args:
   187        source_file_names: list of source file objects that needs to be copied
   188        destination_file_names: list of destination of the new object
   189  
   190      Raises:
   191        ``BeamIOError``: if any of the copy operations fail
   192      """
   193      err_msg = (
   194          "source_file_names and destination_file_names should "
   195          "be equal in length")
   196      assert len(source_file_names) == len(destination_file_names), err_msg
   197  
   198      def _copy_path(source, destination):
   199        """Recursively copy the file tree from the source to the destination
   200        """
   201        try:
   202          if os.path.exists(destination):
   203            if os.path.isdir(destination):
   204              shutil.rmtree(destination)
   205            else:
   206              os.remove(destination)
   207          if os.path.isdir(source):
   208            shutil.copytree(source, destination)
   209          else:
   210            shutil.copy2(source, destination)
   211        except OSError as err:
   212          raise IOError(err)
   213  
   214      exceptions = {}
   215      for source, destination in zip(source_file_names, destination_file_names):
   216        try:
   217          _copy_path(source, destination)
   218        except Exception as e:  # pylint: disable=broad-except
   219          exceptions[(source, destination)] = e
   220  
   221      if exceptions:
   222        raise BeamIOError("Copy operation failed", exceptions)
   223  
   224    def rename(self, source_file_names, destination_file_names):
   225      """Rename the files at the source list to the destination list.
   226      Source and destination lists should be of the same size.
   227  
   228      Args:
   229        source_file_names: List of file paths that need to be moved
   230        destination_file_names: List of destination_file_names for the files
   231  
   232      Raises:
   233        ``BeamIOError``: if any of the rename operations fail
   234      """
   235      err_msg = (
   236          "source_file_names and destination_file_names should "
   237          "be equal in length")
   238      assert len(source_file_names) == len(destination_file_names), err_msg
   239  
   240      def _rename_file(source, destination):
   241        """Rename a single file object"""
   242        try:
   243          os.rename(source, destination)
   244        except OSError as err:
   245          raise IOError(err)
   246  
   247      exceptions = {}
   248      for source, destination in zip(source_file_names, destination_file_names):
   249        try:
   250          _rename_file(source, destination)
   251        except Exception as e:  # pylint: disable=broad-except
   252          exceptions[(source, destination)] = e
   253  
   254      if exceptions:
   255        raise BeamIOError("Rename operation failed", exceptions)
   256  
   257    def exists(self, path):
   258      """Check if the provided path exists on the FileSystem.
   259  
   260      Args:
   261        path: string path that needs to be checked.
   262  
   263      Returns: boolean flag indicating if path exists
   264      """
   265      return os.path.exists(path)
   266  
   267    def size(self, path):
   268      """Get size of path on the FileSystem.
   269  
   270      Args:
   271        path: string path in question.
   272  
   273      Returns: int size of path according to the FileSystem.
   274  
   275      Raises:
   276        ``BeamIOError``: if path doesn't exist.
   277      """
   278      try:
   279        return os.path.getsize(path)
   280      except Exception as e:  # pylint: disable=broad-except
   281        raise BeamIOError("Size operation failed", {path: e})
   282  
   283    def last_updated(self, path):
   284      """Get UNIX Epoch time in seconds on the FileSystem.
   285  
   286      Args:
   287        path: string path of file.
   288  
   289      Returns: float UNIX Epoch time
   290  
   291      Raises:
   292        ``BeamIOError``: if path doesn't exist.
   293      """
   294      if not self.exists(path):
   295        raise BeamIOError('Path does not exist: %s' % path)
   296      return os.path.getmtime(path)
   297  
   298    def checksum(self, path):
   299      """Fetch checksum metadata of a file on the
   300      :class:`~apache_beam.io.filesystem.FileSystem`.
   301  
   302      Args:
   303        path: string path of a file.
   304  
   305      Returns: string containing file size.
   306  
   307      Raises:
   308        ``BeamIOError``: if path isn't a file or doesn't exist.
   309      """
   310      if not self.exists(path):
   311        raise BeamIOError('Path does not exist: %s' % path)
   312      return str(os.path.getsize(path))
   313  
   314    def metadata(self, path):
   315      """Fetch metadata fields of a file on the FileSystem.
   316  
   317      Args:
   318        path: string path of a file.
   319  
   320      Returns:
   321        :class:`~apache_beam.io.filesystem.FileMetadata`.
   322  
   323      Raises:
   324        ``BeamIOError``: if path isn't a file or doesn't exist.
   325      """
   326      if not self.exists(path):
   327        raise BeamIOError('Path does not exist: %s' % path)
   328      return FileMetadata(path, os.path.getsize(path), os.path.getmtime(path))
   329  
   330    def delete(self, paths):
   331      """Deletes files or directories at the provided paths.
   332      Directories will be deleted recursively.
   333  
   334      Args:
   335        paths: list of paths that give the file objects to be deleted
   336  
   337      Raises:
   338        ``BeamIOError``: if any of the delete operations fail
   339      """
   340      def _delete_path(path):
   341        """Recursively delete the file or directory at the provided path.
   342        """
   343        try:
   344          if os.path.isdir(path):
   345            shutil.rmtree(path)
   346          else:
   347            os.remove(path)
   348        except OSError as err:
   349          raise IOError(err)
   350  
   351      exceptions = {}
   352  
   353      def try_delete(path):
   354        try:
   355          _delete_path(path)
   356        except Exception as e:  # pylint: disable=broad-except
   357          exceptions[path] = e
   358  
   359      for match_result in self.match(paths):
   360        metadata_list = match_result.metadata_list
   361  
   362        if not metadata_list:
   363          exceptions[match_result.pattern] = \
   364            IOError('No files found to delete under: %s' % match_result.pattern)
   365  
   366        for metadata in match_result.metadata_list:
   367          try_delete(metadata.path)
   368  
   369      if exceptions:
   370        raise BeamIOError("Delete operation failed", exceptions)