github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/hadoopfilesystem.py

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """:class:`~apache_beam.io.filesystem.FileSystem` implementation for accessing
    19  Hadoop Distributed File System files."""
    20  
    21  # pytype: skip-file
    22  
    23  import io
    24  import logging
    25  import posixpath
    26  import re
    27  from typing import BinaryIO  # pylint: disable=unused-import
    28  
    29  import hdfs
    30  
    31  from apache_beam.io import filesystemio
    32  from apache_beam.io.filesystem import BeamIOError
    33  from apache_beam.io.filesystem import CompressedFile
    34  from apache_beam.io.filesystem import CompressionTypes
    35  from apache_beam.io.filesystem import FileMetadata
    36  from apache_beam.io.filesystem import FileSystem
    37  from apache_beam.options.pipeline_options import HadoopFileSystemOptions
    38  from apache_beam.options.pipeline_options import PipelineOptions
    39  
    40  __all__ = ['HadoopFileSystem']
    41  
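        # URL handling: with hdfs_full_urls=False (the default), _URL_RE matches
        # 'hdfs://path/...' and treats everything after the prefix as the path;
        # with hdfs_full_urls=True, _FULL_URL_RE matches 'hdfs://server/path/...'
        # and splits out the server component. See _parse_url() below.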
    42  _HDFS_PREFIX = 'hdfs:/'
    43  _URL_RE = re.compile(r'^' + _HDFS_PREFIX + r'(/.*)')
    44  _FULL_URL_RE = re.compile(r'^' + _HDFS_PREFIX + r'/([^/]+)(/.*)*')
    45  _COPY_BUFFER_SIZE = 2**16
    46  _DEFAULT_BUFFER_SIZE = 20 * 1024 * 1024
    47  
    48  # WebHDFS FileChecksum property constants.
    49  _FILE_CHECKSUM_ALGORITHM = 'algorithm'
    50  _FILE_CHECKSUM_BYTES = 'bytes'
    51  _FILE_CHECKSUM_LENGTH = 'length'
    52  # WebHDFS FileStatus property constants.
    53  _FILE_STATUS_LENGTH = 'length'
    54  _FILE_STATUS_UPDATED = 'modificationTime'
    55  _FILE_STATUS_PATH_SUFFIX = 'pathSuffix'
    56  _FILE_STATUS_TYPE = 'type'
    57  _FILE_STATUS_TYPE_DIRECTORY = 'DIRECTORY'
    58  _FILE_STATUS_TYPE_FILE = 'FILE'
    59  
    60  _LOGGER = logging.getLogger(__name__)
    61  
    62  
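        # Adapts the hdfs client's read(offset=..., length=...) call to the
        # filesystemio.Downloader interface used by DownloaderStream in _open().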
    63  class HdfsDownloader(filesystemio.Downloader):
    64    def __init__(self, hdfs_client, path):
    65      self._hdfs_client = hdfs_client
    66      self._path = path
    67      self._size = self._hdfs_client.status(path)[_FILE_STATUS_LENGTH]
    68  
    69    @property
    70    def size(self):
    71      return self._size
    72  
    73    def get_range(self, start, end):
    74      with self._hdfs_client.read(self._path, offset=start,
    75                                  length=end - start) as reader:
    76        return reader.read()
    77  
    78  
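        # Adapts the hdfs client's streaming write() to the filesystemio.Uploader
        # interface used by UploaderStream in _create(). The write context is
        # entered on construction and closed in finish(); constructing an
        # uploader for an existing path raises BeamIOError.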
    79  class HdfsUploader(filesystemio.Uploader):
    80    def __init__(self, hdfs_client, path):
    81      self._hdfs_client = hdfs_client
    82      if self._hdfs_client.status(path, strict=False) is not None:
    83        raise BeamIOError('Path already exists: %s' % path)
    84  
    85      self._handle_context = self._hdfs_client.write(path)
    86      self._handle = self._handle_context.__enter__()
    87  
    88    def put(self, data):
    89      # hdfs uses an async writer which first adds data to a queue. To avoid
    90      # the upstream buffer being reused before it is written, copy it here.
    91      self._handle.write(bytes(data))
    92  
    93    def finish(self):
    94      self._handle.__exit__(None, None, None)
    95      self._handle = None
    96      self._handle_context = None
    97  
    98  
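        # Minimal usage sketch (host, port and user values below are illustrative,
        # not defaults):
        #
        #   from apache_beam.options.pipeline_options import PipelineOptions
        #   fs = HadoopFileSystem(PipelineOptions(
        #       ['--hdfs_host=namenode', '--hdfs_port=9870', '--hdfs_user=beam']))
        #   with fs.open('hdfs://tmp/example.txt') as f:
        #     data = f.read()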
    99  class HadoopFileSystem(FileSystem):
   100    """``FileSystem`` implementation that supports HDFS.
   101  
   102    URL arguments to methods expect strings starting with ``hdfs://``.
   103    """
   104    def __init__(self, pipeline_options):
   105      """Initializes a connection to HDFS.
   106  
   107      Connection configuration is done by passing pipeline options.
   108      See :class:`~apache_beam.options.pipeline_options.HadoopFileSystemOptions`.
   109      """
   110      super().__init__(pipeline_options)
   111      logging.getLogger('hdfs.client').setLevel(logging.WARN)
   112      if pipeline_options is None:
   113        raise ValueError('pipeline_options is not set')
   114      if isinstance(pipeline_options, PipelineOptions):
   115        hdfs_options = pipeline_options.view_as(HadoopFileSystemOptions)
   116        hdfs_host = hdfs_options.hdfs_host
   117        hdfs_port = hdfs_options.hdfs_port
   118        hdfs_user = hdfs_options.hdfs_user
   119        self._full_urls = hdfs_options.hdfs_full_urls
   120      else:
   121        hdfs_host = pipeline_options.get('hdfs_host')
   122        hdfs_port = pipeline_options.get('hdfs_port')
   123        hdfs_user = pipeline_options.get('hdfs_user')
   124        self._full_urls = pipeline_options.get('hdfs_full_urls', False)
   125  
   126      if hdfs_host is None:
   127        raise ValueError('hdfs_host is not set')
   128      if hdfs_port is None:
   129        raise ValueError('hdfs_port is not set')
   130      if hdfs_user is None:
   131        raise ValueError('hdfs_user is not set')
   132      if not isinstance(self._full_urls, bool):
   133        raise ValueError(
   134            'hdfs_full_urls should be bool, got: %s' % self._full_urls)
   135      self._hdfs_client = hdfs.InsecureClient(
   136          'http://%s:%s' % (hdfs_host, str(hdfs_port)), user=hdfs_user)
   137  
   138    @classmethod
   139    def scheme(cls):
   140      return 'hdfs'
   141  
   142    def _parse_url(self, url):
   143      """Verifies that url begins with the hdfs:// prefix, strips it, and
   144      adds a leading /.
   145  
   146      Parsing behavior is determined by HadoopFileSystemOptions.hdfs_full_urls.
   147  
   148      Args:
   149        url: (str) A URL in the form hdfs://path/...
   150          or in the form hdfs://server/path/...
   151  
   152      Raises:
   153        ValueError if the URL doesn't match the expected format.
   154  
   155      Returns:
   156        (str, str) If using hdfs_full_urls, for an input of
   157        'hdfs://server/path/...' will return (server, '/path/...').
   158        Otherwise, for an input of 'hdfs://path/...', will return
   159        ('', '/path/...').
   160      """
   161      if not self._full_urls:
   162        m = _URL_RE.match(url)
   163        if m is None:
   164          raise ValueError('Could not parse url: %s' % url)
   165        return '', m.group(1)
   166      else:
   167        m = _FULL_URL_RE.match(url)
   168        if m is None:
   169          raise ValueError('Could not parse url: %s' % url)
   170        return m.group(1), m.group(2) or '/'
   171  
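          # With hdfs_full_urls=False, join('hdfs://foo/bar', 'baz') returns
          # 'hdfs://foo/bar/baz'; with hdfs_full_urls=True the server component
          # is preserved, e.g. join('hdfs://server/bar', 'baz') returns
          # 'hdfs://server/bar/baz'.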
   172    def join(self, base_url, *paths):
   173      """Join two or more pathname components.
   174  
   175      Args:
   176        base_url: string URL of the first component of the path.
   177          Must start with hdfs://.
   178        paths: path components to be added.
   179  
   180      Returns:
   181        Full url after combining all the passed components.
   182      """
   183      server, basepath = self._parse_url(base_url)
   184      return _HDFS_PREFIX + self._join(server, basepath, *paths)
   185  
   186    def _join(self, server, basepath, *paths):
   187      res = posixpath.join(basepath, *paths)
   188      if server:
   189        server = '/' + server
   190      return server + res
   191  
   192    def split(self, url):
   193      server, rel_path = self._parse_url(url)
   194      if server:
   195        server = '/' + server
   196      head, tail = posixpath.split(rel_path)
   197      return _HDFS_PREFIX + server + head, tail
   198  
   199    def mkdirs(self, url):
   200      _, path = self._parse_url(url)
   201      if self._exists(path):
   202        raise BeamIOError('Path already exists: %s' % path)
   203      return self._mkdirs(path)
   204  
   205    def _mkdirs(self, path):
   206      self._hdfs_client.makedirs(path)
   207  
   208    def has_dirs(self):
   209      return True
   210  
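          # Lists a directory non-recursively, converting each WebHDFS FileStatus
          # entry into a FileMetadata (size in bytes, modification time converted
          # from milliseconds to seconds).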
   211    def _list(self, url):
   212      try:
   213        server, path = self._parse_url(url)
   214        for res in self._hdfs_client.list(path, status=True):
   215          yield FileMetadata(
   216              _HDFS_PREFIX + self._join(server, path, res[0]),
   217              res[1][_FILE_STATUS_LENGTH],
   218              res[1][_FILE_STATUS_UPDATED] / 1000.0)
   219      except Exception as e:  # pylint: disable=broad-except
   220        raise BeamIOError('List operation failed', {url: e})
   221  
   222    @staticmethod
   223    def _add_compression(stream, path, mime_type, compression_type):
   224      if mime_type != 'application/octet-stream':
   225        _LOGGER.warning(
   226            'Mime types are not supported. Got non-default mime_type:'
   227            ' %s',
   228            mime_type)
   229      if compression_type == CompressionTypes.AUTO:
   230        compression_type = CompressionTypes.detect_compression_type(path)
   231      if compression_type != CompressionTypes.UNCOMPRESSED:
   232        return CompressedFile(stream)
   233  
   234      return stream
   235  
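          # create() and open() wrap HdfsUploader/HdfsDownloader in buffered
          # streams of _DEFAULT_BUFFER_SIZE and, unless the resolved compression
          # type is UNCOMPRESSED, in a CompressedFile (see _add_compression()).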
   236    def create(
   237        self,
   238        url,
   239        mime_type='application/octet-stream',
   240        compression_type=CompressionTypes.AUTO):
   241      # type: (...) -> BinaryIO
   242  
   243      """
   244      Returns:
   245        A Python File-like object.
   246      """
   247      _, path = self._parse_url(url)
   248      return self._create(path, mime_type, compression_type)
   249  
   250    def _create(
   251        self,
   252        path,
   253        mime_type='application/octet-stream',
   254        compression_type=CompressionTypes.AUTO):
   255      stream = io.BufferedWriter(
   256          filesystemio.UploaderStream(HdfsUploader(self._hdfs_client, path)),
   257          buffer_size=_DEFAULT_BUFFER_SIZE)
   258      return self._add_compression(stream, path, mime_type, compression_type)
   259  
   260    def open(
   261        self,
   262        url,
   263        mime_type='application/octet-stream',
   264        compression_type=CompressionTypes.AUTO):
   265      # type: (...) -> BinaryIO
   266  
   267      """
   268      Returns:
   269        A Python File-like object.
   270      """
   271      _, path = self._parse_url(url)
   272      return self._open(path, mime_type, compression_type)
   273  
   274    def _open(
   275        self,
   276        path,
   277        mime_type='application/octet-stream',
   278        compression_type=CompressionTypes.AUTO):
   279      stream = io.BufferedReader(
   280          filesystemio.DownloaderStream(HdfsDownloader(self._hdfs_client, path)),
   281          buffer_size=_DEFAULT_BUFFER_SIZE)
   282      return self._add_compression(stream, path, mime_type, compression_type)
   283  
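          # copy() also handles directories: _copy_path() walks the source tree
          # and copies each file in _COPY_BUFFER_SIZE chunks to the corresponding
          # path under the destination.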
   284    def copy(self, source_file_names, destination_file_names):
   285      """
   286      It is an error if any file to copy already exists at the destination.
   287  
   288      Raises ``BeamIOError`` if any error occurs.
   289  
   290      Args:
   291        source_file_names: iterable of URLs.
   292        destination_file_names: iterable of URLs.
   293      """
   294      if len(source_file_names) != len(destination_file_names):
   295        raise BeamIOError(
   296            'source_file_names and destination_file_names should '
   297            'be equal in length: %d != %d' %
   298            (len(source_file_names), len(destination_file_names)))
   299  
   300      def _copy_file(source, destination):
   301        with self._open(source) as f1:
   302          with self._create(destination) as f2:
   303            while True:
   304              buf = f1.read(_COPY_BUFFER_SIZE)
   305              if not buf:
   306                break
   307              f2.write(buf)
   308  
   309      def _copy_path(source, destination):
   310        """Recursively copy the file tree from the source to the destination."""
   311        if self._hdfs_client.status(
   312            source)[_FILE_STATUS_TYPE] != _FILE_STATUS_TYPE_DIRECTORY:
   313          _copy_file(source, destination)
   314          return
   315  
   316        for path, dirs, files in self._hdfs_client.walk(source):
   317          for dir in dirs:
   318            new_dir = self._join('', destination, dir)
   319            if not self._exists(new_dir):
   320              self._mkdirs(new_dir)
   321  
   322          rel_path = posixpath.relpath(path, source)
   323          if rel_path == '.':
   324            rel_path = ''
   325          for file in files:
   326            _copy_file(
   327                self._join('', path, file),
   328                self._join('', destination, rel_path, file))
   329  
   330      exceptions = {}
   331      for source, destination in zip(source_file_names, destination_file_names):
   332        try:
   333          _, rel_source = self._parse_url(source)
   334          _, rel_destination = self._parse_url(destination)
   335          _copy_path(rel_source, rel_destination)
   336        except Exception as e:  # pylint: disable=broad-except
   337          exceptions[(source, destination)] = e
   338  
   339      if exceptions:
   340        raise BeamIOError('Copy operation failed', exceptions)
   341  
   342    def rename(self, source_file_names, destination_file_names):
   343      exceptions = {}
   344      for source, destination in zip(source_file_names, destination_file_names):
   345        try:
   346          _, rel_source = self._parse_url(source)
   347          _, rel_destination = self._parse_url(destination)
   348          try:
   349            self._hdfs_client.rename(rel_source, rel_destination)
   350          except hdfs.HdfsError as e:
   351            raise BeamIOError(
   352                'WebHDFS error in renaming %s to %s' % (source, destination), e)
   353        except Exception as e:  # pylint: disable=broad-except
   354          exceptions[(source, destination)] = e
   355  
   356      if exceptions:
   357        raise BeamIOError('Rename operation failed', exceptions)
   358  
   359    def exists(self, url):
   360      # type: (str) -> bool
   361  
   362      """Checks existence of url in HDFS.
   363  
   364      Args:
   365        url: String in the form hdfs://...
   366  
   367      Returns:
   368        True if url exists as a file or directory in HDFS.
   369      """
   370      _, path = self._parse_url(url)
   371      return self._exists(path)
   372  
   373    def _exists(self, path):
   374      """Returns True if path exists as a file or directory in HDFS.
   375  
   376      Args:
   377        path: String in the form /...
   378      """
   379      return self._hdfs_client.status(path, strict=False) is not None
   380  
   381    def size(self, url):
   382      """Fetches file size for a URL.
   383  
   384      Returns:
   385        int size of path according to the FileSystem.
   386  
   387      Raises:
   388        ``BeamIOError``: if url doesn't exist.
   389      """
   390      return self.metadata(url).size_in_bytes
   391  
   392    def last_updated(self, url):
   393      """Fetches last updated time for a URL.
   394  
   395      Args:
   396        url: string url of file.
   397  
   398      Returns: float UNIX Epoch time in seconds.
   399  
   400      Raises:
   401        ``BeamIOError``: if path doesn't exist.
   402      """
   403      return self.metadata(url).last_updated_in_seconds
   404  
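          # Formats the WebHDFS FileChecksum response as
          # '<algorithm>-<length>-<bytes>'.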
   405    def checksum(self, url):
   406      """Fetches a checksum description for a URL.
   407  
   408      Returns:
   409        String describing the checksum.
   410  
   411      Raises:
   412        ``BeamIOError``: if url doesn't exist.
   413      """
   414      _, path = self._parse_url(url)
   415      file_checksum = self._hdfs_client.checksum(path)
   416      return '%s-%d-%s' % (
   417          file_checksum[_FILE_CHECKSUM_ALGORITHM],
   418          file_checksum[_FILE_CHECKSUM_LENGTH],
   419          file_checksum[_FILE_CHECKSUM_BYTES],
   420      )
   421  
   422    def metadata(self, url):
   423      """Fetch metadata fields of a file on the FileSystem.
   424  
   425      Args:
   426        url: string url of a file.
   427  
   428      Returns:
   429        :class:`~apache_beam.io.filesystem.FileMetadata`.
   430  
   431      Raises:
   432        ``BeamIOError``: if url doesn't exist.
   433      """
   434      _, path = self._parse_url(url)
   435      status = self._hdfs_client.status(path, strict=False)
   436      if status is None:
   437        raise BeamIOError('File not found: %s' % url)
   438      return FileMetadata(
   439          url, status[_FILE_STATUS_LENGTH], status[_FILE_STATUS_UPDATED] / 1000.0)
   440  
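          # delete() removes each URL with recursive=True, so directories are
          # deleted together with their contents; per-URL failures are collected
          # into a single BeamIOError.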
   441    def delete(self, urls):
   442      exceptions = {}
   443      for url in urls:
   444        try:
   445          _, path = self._parse_url(url)
   446          self._hdfs_client.delete(path, recursive=True)
   447        except Exception as e:  # pylint: disable=broad-except
   448          exceptions[url] = e
   449  
   450      if exceptions:
   451        raise BeamIOError('Delete operation failed', exceptions)