github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/hadoopfilesystem.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

""":class:`~apache_beam.io.filesystem.FileSystem` implementation for accessing
Hadoop Distributed File System files."""

# pytype: skip-file

import io
import logging
import posixpath
import re
from typing import BinaryIO  # pylint: disable=unused-import

import hdfs

from apache_beam.io import filesystemio
from apache_beam.io.filesystem import BeamIOError
from apache_beam.io.filesystem import CompressedFile
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.filesystem import FileMetadata
from apache_beam.io.filesystem import FileSystem
from apache_beam.options.pipeline_options import HadoopFileSystemOptions
from apache_beam.options.pipeline_options import PipelineOptions

__all__ = ['HadoopFileSystem']

_HDFS_PREFIX = 'hdfs:/'
_URL_RE = re.compile(r'^' + _HDFS_PREFIX + r'(/.*)')
_FULL_URL_RE = re.compile(r'^' + _HDFS_PREFIX + r'/([^/]+)(/.*)*')
_COPY_BUFFER_SIZE = 2**16
_DEFAULT_BUFFER_SIZE = 20 * 1024 * 1024

# WebHDFS FileChecksum property constants.
_FILE_CHECKSUM_ALGORITHM = 'algorithm'
_FILE_CHECKSUM_BYTES = 'bytes'
_FILE_CHECKSUM_LENGTH = 'length'
# WebHDFS FileStatus property constants.
_FILE_STATUS_LENGTH = 'length'
_FILE_STATUS_UPDATED = 'modificationTime'
_FILE_STATUS_PATH_SUFFIX = 'pathSuffix'
_FILE_STATUS_TYPE = 'type'
_FILE_STATUS_TYPE_DIRECTORY = 'DIRECTORY'
_FILE_STATUS_TYPE_FILE = 'FILE'

_LOGGER = logging.getLogger(__name__)


class HdfsDownloader(filesystemio.Downloader):
  def __init__(self, hdfs_client, path):
    self._hdfs_client = hdfs_client
    self._path = path
    self._size = self._hdfs_client.status(path)[_FILE_STATUS_LENGTH]

  @property
  def size(self):
    return self._size

  def get_range(self, start, end):
    with self._hdfs_client.read(self._path, offset=start,
                                length=end - start) as reader:
      return reader.read()


class HdfsUploader(filesystemio.Uploader):
  def __init__(self, hdfs_client, path):
    self._hdfs_client = hdfs_client
    if self._hdfs_client.status(path, strict=False) is not None:
      raise BeamIOError('Path already exists: %s' % path)

    self._handle_context = self._hdfs_client.write(path)
    self._handle = self._handle_context.__enter__()

  def put(self, data):
    # hdfs uses an async writer which first adds data to a queue. To avoid the
    # buffer being reused upstream, a copy of the data is made here.
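    # Note: bytes() copies the contents when `data` is a bytearray or
    # memoryview, so the async writer never observes later mutations of the
    # caller's buffer.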
    self._handle.write(bytes(data))

  def finish(self):
    self._handle.__exit__(None, None, None)
    self._handle = None
    self._handle_context = None


class HadoopFileSystem(FileSystem):
  """``FileSystem`` implementation that supports HDFS.

  URL arguments to methods expect strings starting with ``hdfs://``.
  """
  def __init__(self, pipeline_options):
    """Initializes a connection to HDFS.

    Connection configuration is done by passing pipeline options.
    See :class:`~apache_beam.options.pipeline_options.HadoopFileSystemOptions`.
    """
    super().__init__(pipeline_options)
    logging.getLogger('hdfs.client').setLevel(logging.WARN)
    if pipeline_options is None:
      raise ValueError('pipeline_options is not set')
    if isinstance(pipeline_options, PipelineOptions):
      hdfs_options = pipeline_options.view_as(HadoopFileSystemOptions)
      hdfs_host = hdfs_options.hdfs_host
      hdfs_port = hdfs_options.hdfs_port
      hdfs_user = hdfs_options.hdfs_user
      self._full_urls = hdfs_options.hdfs_full_urls
    else:
      hdfs_host = pipeline_options.get('hdfs_host')
      hdfs_port = pipeline_options.get('hdfs_port')
      hdfs_user = pipeline_options.get('hdfs_user')
      self._full_urls = pipeline_options.get('hdfs_full_urls', False)

    if hdfs_host is None:
      raise ValueError('hdfs_host is not set')
    if hdfs_port is None:
      raise ValueError('hdfs_port is not set')
    if hdfs_user is None:
      raise ValueError('hdfs_user is not set')
    if not isinstance(self._full_urls, bool):
      raise ValueError(
          'hdfs_full_urls should be bool, got: %s' % self._full_urls)
    self._hdfs_client = hdfs.InsecureClient(
        'http://%s:%s' % (hdfs_host, str(hdfs_port)), user=hdfs_user)

  @classmethod
  def scheme(cls):
    return 'hdfs'

  def _parse_url(self, url):
    """Verifies that url begins with hdfs:// prefix, strips it and adds a
    leading /.

    Parsing behavior is determined by HadoopFileSystemOptions.hdfs_full_urls.

    Args:
      url: (str) A URL in the form hdfs://path/...
        or in the form hdfs://server/path/...

    Raises:
      ValueError if the URL doesn't match the expected format.

    Returns:
      (str, str) If using hdfs_full_urls, for an input of
      'hdfs://server/path/...' will return (server, '/path/...').
      Otherwise, for an input of 'hdfs://path/...', will return
      ('', '/path/...').
    """
    if not self._full_urls:
      m = _URL_RE.match(url)
      if m is None:
        raise ValueError('Could not parse url: %s' % url)
      return '', m.group(1)
    else:
      m = _FULL_URL_RE.match(url)
      if m is None:
        raise ValueError('Could not parse url: %s' % url)
      return m.group(1), m.group(2) or '/'

  def join(self, base_url, *paths):
    """Join two or more pathname components.

    Args:
      base_url: string path of the first component of the path.
        Must start with hdfs://.
      paths: path components to be added.

    Returns:
      Full url after combining all the passed components.
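
    Illustrative example (with the default ``hdfs_full_urls=False``; ``fs`` is
    assumed to be an initialized ``HadoopFileSystem``)::

      fs.join('hdfs://tmp/staging', 'job', 'out.txt')
      # -> 'hdfs://tmp/staging/job/out.txt'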
    """
    server, basepath = self._parse_url(base_url)
    return _HDFS_PREFIX + self._join(server, basepath, *paths)

  def _join(self, server, basepath, *paths):
    res = posixpath.join(basepath, *paths)
    if server:
      server = '/' + server
    return server + res

  def split(self, url):
    server, rel_path = self._parse_url(url)
    if server:
      server = '/' + server
    head, tail = posixpath.split(rel_path)
    return _HDFS_PREFIX + server + head, tail

  def mkdirs(self, url):
    _, path = self._parse_url(url)
    if self._exists(path):
      raise BeamIOError('Path already exists: %s' % path)
    return self._mkdirs(path)

  def _mkdirs(self, path):
    self._hdfs_client.makedirs(path)

  def has_dirs(self):
    return True

  def _list(self, url):
    try:
      server, path = self._parse_url(url)
      for res in self._hdfs_client.list(path, status=True):
        yield FileMetadata(
            _HDFS_PREFIX + self._join(server, path, res[0]),
            res[1][_FILE_STATUS_LENGTH],
            res[1][_FILE_STATUS_UPDATED] / 1000.0)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError('List operation failed', {url: e})

  @staticmethod
  def _add_compression(stream, path, mime_type, compression_type):
    if mime_type != 'application/octet-stream':
      _LOGGER.warning(
          'Mime types are not supported. Got non-default mime_type:'
          ' %s',
          mime_type)
    if compression_type == CompressionTypes.AUTO:
      compression_type = CompressionTypes.detect_compression_type(path)
    if compression_type != CompressionTypes.UNCOMPRESSED:
      return CompressedFile(stream)

    return stream

  def create(
      self,
      url,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    # type: (...) -> BinaryIO

    """
    Returns:
      A Python File-like object.
    """
    _, path = self._parse_url(url)
    return self._create(path, mime_type, compression_type)

  def _create(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    stream = io.BufferedWriter(
        filesystemio.UploaderStream(HdfsUploader(self._hdfs_client, path)),
        buffer_size=_DEFAULT_BUFFER_SIZE)
    return self._add_compression(stream, path, mime_type, compression_type)

  def open(
      self,
      url,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    # type: (...) -> BinaryIO

    """
    Returns:
      A Python File-like object.
    """
    _, path = self._parse_url(url)
    return self._open(path, mime_type, compression_type)

  def _open(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    stream = io.BufferedReader(
        filesystemio.DownloaderStream(HdfsDownloader(self._hdfs_client, path)),
        buffer_size=_DEFAULT_BUFFER_SIZE)
    return self._add_compression(stream, path, mime_type, compression_type)

  def copy(self, source_file_names, destination_file_names):
    """
    It is an error if any file to copy already exists at the destination.

    Raises ``BeamIOError`` if any error occurred.

    Args:
      source_file_names: iterable of URLs.
      destination_file_names: iterable of URLs.
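
    Illustrative example (``fs`` is assumed to be an initialized
    ``HadoopFileSystem``; both argument lists must have equal length)::

      fs.copy(['hdfs://tmp/a.txt'], ['hdfs://backup/a.txt'])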
    """
    if len(source_file_names) != len(destination_file_names):
      raise BeamIOError(
          'source_file_names and destination_file_names should '
          'be equal in length: %d != %d' %
          (len(source_file_names), len(destination_file_names)))

    def _copy_file(source, destination):
      with self._open(source) as f1:
        with self._create(destination) as f2:
          while True:
            buf = f1.read(_COPY_BUFFER_SIZE)
            if not buf:
              break
            f2.write(buf)

    def _copy_path(source, destination):
      """Recursively copy the file tree from the source to the destination."""
      if self._hdfs_client.status(
          source)[_FILE_STATUS_TYPE] != _FILE_STATUS_TYPE_DIRECTORY:
        _copy_file(source, destination)
        return

      for path, dirs, files in self._hdfs_client.walk(source):
        for dir in dirs:
          new_dir = self._join('', destination, dir)
          if not self._exists(new_dir):
            self._mkdirs(new_dir)

        rel_path = posixpath.relpath(path, source)
        if rel_path == '.':
          rel_path = ''
        for file in files:
          _copy_file(
              self._join('', path, file),
              self._join('', destination, rel_path, file))

    exceptions = {}
    for source, destination in zip(source_file_names, destination_file_names):
      try:
        _, rel_source = self._parse_url(source)
        _, rel_destination = self._parse_url(destination)
        _copy_path(rel_source, rel_destination)
      except Exception as e:  # pylint: disable=broad-except
        exceptions[(source, destination)] = e

    if exceptions:
      raise BeamIOError('Copy operation failed', exceptions)

  def rename(self, source_file_names, destination_file_names):
    exceptions = {}
    for source, destination in zip(source_file_names, destination_file_names):
      try:
        _, rel_source = self._parse_url(source)
        _, rel_destination = self._parse_url(destination)
        try:
          self._hdfs_client.rename(rel_source, rel_destination)
        except hdfs.HdfsError as e:
          raise BeamIOError(
              'libhdfs error in renaming %s to %s' % (source, destination), e)
      except Exception as e:  # pylint: disable=broad-except
        exceptions[(source, destination)] = e

    if exceptions:
      raise BeamIOError('Rename operation failed', exceptions)

  def exists(self, url):
    # type: (str) -> bool

    """Checks existence of url in HDFS.

    Args:
      url: String in the form hdfs://...

    Returns:
      True if url exists as a file or directory in HDFS.
    """
    _, path = self._parse_url(url)
    return self._exists(path)

  def _exists(self, path):
    """Returns True if path exists as a file or directory in HDFS.

    Args:
      path: String in the form /...
    """
    return self._hdfs_client.status(path, strict=False) is not None

  def size(self, url):
    """Fetches file size for a URL.

    Returns:
      int size of path according to the FileSystem.

    Raises:
      ``BeamIOError``: if url doesn't exist.
    """
    return self.metadata(url).size_in_bytes

  def last_updated(self, url):
    """Fetches last updated time for a URL.

    Args:
      url: string url of file.

    Returns:
      float UNIX Epoch time.

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    return self.metadata(url).last_updated_in_seconds

  def checksum(self, url):
    """Fetches a checksum description for a URL.

    Returns:
      String describing the checksum.

    Raises:
      ``BeamIOError``: if url doesn't exist.
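
    The returned string has the form ``<algorithm>-<length>-<bytes>``, built
    from the WebHDFS FileChecksum fields; an illustrative value might look
    like ``MD5-of-0MD5-of-512CRC32C-28-<hex digest>``.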
    """
    _, path = self._parse_url(url)
    file_checksum = self._hdfs_client.checksum(path)
    return '%s-%d-%s' % (
        file_checksum[_FILE_CHECKSUM_ALGORITHM],
        file_checksum[_FILE_CHECKSUM_LENGTH],
        file_checksum[_FILE_CHECKSUM_BYTES],
    )

  def metadata(self, url):
    """Fetch metadata fields of a file on the FileSystem.

    Args:
      url: string url of a file.

    Returns:
      :class:`~apache_beam.io.filesystem.FileMetadata`.

    Raises:
      ``BeamIOError``: if url doesn't exist.
    """
    _, path = self._parse_url(url)
    status = self._hdfs_client.status(path, strict=False)
    if status is None:
      raise BeamIOError('File not found: %s' % url)
    return FileMetadata(
        url, status[_FILE_STATUS_LENGTH], status[_FILE_STATUS_UPDATED] / 1000.0)

  def delete(self, urls):
    exceptions = {}
    for url in urls:
      try:
        _, path = self._parse_url(url)
        self._hdfs_client.delete(path, recursive=True)
      except Exception as e:  # pylint: disable=broad-except
        exceptions[url] = e

    if exceptions:
      raise BeamIOError("Delete operation failed", exceptions)
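

# Illustrative usage sketch, kept as a comment so importing this module has no
# side effects. The host, port, and user values below are assumptions, not
# defaults:
#
#   from apache_beam.options.pipeline_options import PipelineOptions
#
#   options = PipelineOptions(
#       hdfs_host='namenode.example.com', hdfs_port=9870, hdfs_user='hadoop')
#   fs = HadoopFileSystem(options)
#   with fs.create('hdfs://tmp/example.txt') as writer:
#     writer.write(b'hello')
#   with fs.open('hdfs://tmp/example.txt') as reader:
#     print(reader.read())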