#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A framework for developing sources for new file types.

To create a source for a new file type, a sub-class of :class:`FileBasedSource`
should be created. Sub-classes of :class:`FileBasedSource` must implement the
method :meth:`FileBasedSource.read_records()`. Please read the documentation of
that method for more details.

For an example implementation of :class:`FileBasedSource`, see
:class:`~apache_beam.io._AvroSource`.
"""
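
# For illustration only: a minimal sketch of a concrete sub-class, assuming a
# simple newline-delimited record format. The class name and record format are
# hypothetical and not part of this module; for brevity the sketch does not
# re-synchronize to a record boundary when a range starts mid-file (see
# apache_beam.io.textio for a complete treatment).
#
#   class _NewlineDelimitedSource(FileBasedSource):
#     def read_records(self, file_name, offset_range_tracker):
#       with self.open_file(file_name) as file_handle:
#         file_handle.seek(offset_range_tracker.start_position())
#         # Claim the start offset of each record; stop once the tracker
#         # refuses a claim (i.e. the byte range has been exhausted).
#         while offset_range_tracker.try_claim(file_handle.tell()):
#           record = file_handle.readline()
#           if not record:
#             break
#           yield record.rstrip(b'\n')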

# pytype: skip-file

from typing import Callable
from typing import Iterable
from typing import Tuple
from typing import Union

from apache_beam.internal import pickler
from apache_beam.io import concat_source
from apache_beam.io import iobase
from apache_beam.io import range_trackers
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.filesystem import FileMetadata
from apache_beam.io.filesystems import FileSystems
from apache_beam.io.restriction_trackers import OffsetRange
from apache_beam.options.value_provider import StaticValueProvider
from apache_beam.options.value_provider import ValueProvider
from apache_beam.options.value_provider import check_accessible
from apache_beam.transforms.core import DoFn
from apache_beam.transforms.core import ParDo
from apache_beam.transforms.core import PTransform
from apache_beam.transforms.display import DisplayDataItem
from apache_beam.transforms.util import Reshuffle

MAX_NUM_THREADS_FOR_SIZE_ESTIMATION = 25

__all__ = ['FileBasedSource']


class FileBasedSource(iobase.BoundedSource):
  """A :class:`~apache_beam.io.iobase.BoundedSource` for reading a file glob of
  a given type."""

  MIN_NUMBER_OF_FILES_TO_STAT = 100
  MIN_FRACTION_OF_FILES_TO_STAT = 0.01

  def __init__(
      self,
      file_pattern,
      min_bundle_size=0,
      compression_type=CompressionTypes.AUTO,
      splittable=True,
      validate=True):
    """Initializes :class:`FileBasedSource`.

    Args:
      file_pattern (str): the file glob to read, a string or a
        :class:`~apache_beam.options.value_provider.ValueProvider`
        (placeholder to inject a runtime value).
      min_bundle_size (int): minimum size of bundles that should be generated
        when performing initial splitting on this source.
      compression_type (str): Used to handle compressed input files.
        Typical value is :attr:`CompressionTypes.AUTO
        <apache_beam.io.filesystem.CompressionTypes.AUTO>`,
        in which case the final file path's extension will be used to detect
        the compression.
      splittable (bool): whether :class:`FileBasedSource` should try to
        logically split a single file into data ranges so that different parts
        of the same file can be read in parallel. If set to :data:`False`,
        :class:`FileBasedSource` will prevent both initial and dynamic splitting
        of sources for single files. File patterns that represent multiple files
        may still get split into sources for individual files. Even if set to
        :data:`True` by the user, :class:`FileBasedSource` may choose to not
        split the file, for example, for compressed files where currently it is
        not possible to efficiently read a data range without decompressing the
        whole file.
      validate (bool): Boolean flag to verify that the files exist during
        pipeline creation time.

    Raises:
      TypeError: when **compression_type** is not valid or if
        **file_pattern** is not a :class:`str` or a
        :class:`~apache_beam.options.value_provider.ValueProvider`.
      ValueError: when both compression and splittable files are specified.
      IOError: when the file pattern specified yields an empty
        result.
    """

    if not isinstance(file_pattern, (str, ValueProvider)):
      raise TypeError(
          '%s: file_pattern must be of type string'
          ' or ValueProvider; got %r instead' %
          (self.__class__.__name__, file_pattern))

    if isinstance(file_pattern, str):
      file_pattern = StaticValueProvider(str, file_pattern)
    self._pattern = file_pattern

    self._concat_source = None
    self._min_bundle_size = min_bundle_size
    if not CompressionTypes.is_valid_compression_type(compression_type):
      raise TypeError(
          'compression_type must be CompressionType object but '
          'was %s' % type(compression_type))
    self._compression_type = compression_type
    self._splittable = splittable
    if validate and file_pattern.is_accessible():
      self._validate()

  def display_data(self):
    return {
        'file_pattern': DisplayDataItem(
            str(self._pattern), label="File Pattern"),
        'compression': DisplayDataItem(
            str(self._compression_type), label='Compression Type')
    }

  @check_accessible(['_pattern'])
  def _get_concat_source(self):
    # type: () -> concat_source.ConcatSource
    if self._concat_source is None:
      pattern = self._pattern.get()

      single_file_sources = []
      match_result = FileSystems.match([pattern])[0]
      files_metadata = match_result.metadata_list

      # We create a reference for FileBasedSource that will be serialized along
      # with each _SingleFileSource. To prevent this FileBasedSource from having
      # a reference to ConcatSource (resulting in quadratic space complexity)
      # we clone it here.
      file_based_source_ref = pickler.loads(pickler.dumps(self))

      for file_metadata in files_metadata:
        file_name = file_metadata.path
        file_size = file_metadata.size_in_bytes
        if file_size == 0:
          continue  # Ignoring empty file.

        # We determine splittability of this specific file.
        splittable = (
            self.splittable and _determine_splittability_from_compression_type(
                file_name, self._compression_type))

        single_file_source = _SingleFileSource(
            file_based_source_ref,
            file_name,
            0,
            file_size,
            min_bundle_size=self._min_bundle_size,
            splittable=splittable)
        single_file_sources.append(single_file_source)
      self._concat_source = concat_source.ConcatSource(single_file_sources)
    return self._concat_source

  def open_file(self, file_name):
    return FileSystems.open(
        file_name,
        'application/octet-stream',
        compression_type=self._compression_type)

  @check_accessible(['_pattern'])
  def _validate(self):
    """Validate that there are actual files in the specified glob pattern.
    """
    pattern = self._pattern.get()

    # Limit the responses as we only want to check if something exists
    match_result = FileSystems.match([pattern], limits=[1])[0]
    if len(match_result.metadata_list) <= 0:
      raise IOError('No files found based on the file pattern %s' % pattern)

  def split(
      self, desired_bundle_size=None, start_position=None, stop_position=None):
    return self._get_concat_source().split(
        desired_bundle_size=desired_bundle_size,
        start_position=start_position,
        stop_position=stop_position)

  def estimate_size(self):
    return self._get_concat_source().estimate_size()

  def read(self, range_tracker):
    return self._get_concat_source().read(range_tracker)

  def get_range_tracker(self, start_position, stop_position):
    return self._get_concat_source().get_range_tracker(
        start_position, stop_position)

  def read_records(self, file_name, offset_range_tracker):
    """Returns a generator of records created by reading file 'file_name'.

    Args:
      file_name: a ``string`` that gives the name of the file to be read.
        Method ``FileBasedSource.open_file()`` must be used to open the file
        and create a seekable file object.
      offset_range_tracker: an object of type ``OffsetRangeTracker``. This
        defines the byte range of the file that should be read. See
        documentation in ``iobase.BoundedSource.read()`` for more information
        on reading records while complying with the range defined by a given
        ``RangeTracker``.

    Returns:
      an iterator that gives the records read from the given file.
    """
    raise NotImplementedError

  @property
  def splittable(self):
    return self._splittable
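

# Illustrative usage sketch (not part of this module): pipeline authors wrap a
# concrete sub-class of FileBasedSource, such as the hypothetical
# _NewlineDelimitedSource sketched near the module docstring, in
# ``beam.io.Read``:
#
#   import apache_beam as beam
#
#   with beam.Pipeline() as p:
#     records = p | 'ReadMyFiles' >> beam.io.Read(
#         _NewlineDelimitedSource('gs://my-bucket/logs/*.txt'))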


def _determine_splittability_from_compression_type(file_path, compression_type):
  if compression_type == CompressionTypes.AUTO:
    compression_type = CompressionTypes.detect_compression_type(file_path)

  return compression_type == CompressionTypes.UNCOMPRESSED


class _SingleFileSource(iobase.BoundedSource):
  """Denotes a source for a specific file type."""
  def __init__(
      self,
      file_based_source,
      file_name,
      start_offset,
      stop_offset,
      min_bundle_size=0,
      splittable=True):
    if not isinstance(start_offset, int):
      raise TypeError(
          'start_offset must be a number. Received: %r' % start_offset)
    if stop_offset != range_trackers.OffsetRangeTracker.OFFSET_INFINITY:
      if not isinstance(stop_offset, int):
        raise TypeError(
            'stop_offset must be a number. Received: %r' % stop_offset)
      if start_offset >= stop_offset:
        raise ValueError(
            'start_offset must be smaller than stop_offset. Received %d and %d '
            'for start and stop offsets respectively' %
            (start_offset, stop_offset))

    self._file_name = file_name
    self._is_gcs_file = file_name.startswith('gs://') if file_name else False
    self._start_offset = start_offset
    self._stop_offset = stop_offset
    self._min_bundle_size = min_bundle_size
    self._file_based_source = file_based_source
    self._splittable = splittable

  def split(self, desired_bundle_size, start_offset=None, stop_offset=None):
    if start_offset is None:
      start_offset = self._start_offset
    if stop_offset is None:
      stop_offset = self._stop_offset

    if self._splittable:
      splits = OffsetRange(start_offset, stop_offset).split(
          desired_bundle_size, self._min_bundle_size)
      for split in splits:
        yield iobase.SourceBundle(
            split.stop - split.start,
            _SingleFileSource(
                # Copying this so that each sub-source gets a fresh instance.
                pickler.loads(pickler.dumps(self._file_based_source)),
                self._file_name,
                split.start,
                split.stop,
                min_bundle_size=self._min_bundle_size,
                splittable=self._splittable),
            split.start,
            split.stop)
    else:
      # Returning a single sub-source with end offset set to OFFSET_INFINITY
      # (so that all data of the source gets read) since this source is
      # unsplittable. Choosing the size of the file as end offset will be wrong
      # for certain unsplittable sources, e.g., compressed sources.
      yield iobase.SourceBundle(
          stop_offset - start_offset,
          _SingleFileSource(
              self._file_based_source,
              self._file_name,
              start_offset,
              range_trackers.OffsetRangeTracker.OFFSET_INFINITY,
              min_bundle_size=self._min_bundle_size,
              splittable=self._splittable),
          start_offset,
          range_trackers.OffsetRangeTracker.OFFSET_INFINITY)

  def estimate_size(self):
    return self._stop_offset - self._start_offset

  def get_range_tracker(self, start_position, stop_position):
    if start_position is None:
      start_position = self._start_offset
    if stop_position is None:
      # If the file is unsplittable we choose OFFSET_INFINITY as the default
      # end offset so that all data of the source gets read. Choosing the size
      # of the file as end offset will be wrong for certain unsplittable
      # sources, e.g., compressed sources.
      stop_position = (
          self._stop_offset if self._splittable else
          range_trackers.OffsetRangeTracker.OFFSET_INFINITY)

    range_tracker = range_trackers.OffsetRangeTracker(
        start_position, stop_position)
    if not self._splittable:
      range_tracker = range_trackers.UnsplittableRangeTracker(range_tracker)

    return range_tracker

  def read(self, range_tracker):
    return self._file_based_source.read_records(self._file_name, range_tracker)

  def default_output_coder(self):
    return self._file_based_source.default_output_coder()


class _ExpandIntoRanges(DoFn):
  def __init__(
      self, splittable, compression_type, desired_bundle_size, min_bundle_size):
    self._desired_bundle_size = desired_bundle_size
    self._min_bundle_size = min_bundle_size
    self._splittable = splittable
    self._compression_type = compression_type

  def process(self, element: Union[str, FileMetadata], *args,
              **kwargs) -> Iterable[Tuple[FileMetadata, OffsetRange]]:
    if isinstance(element, FileMetadata):
      metadata_list = [element]
    else:
      match_results = FileSystems.match([element])
      metadata_list = match_results[0].metadata_list
    for metadata in metadata_list:
      splittable = (
          self._splittable and _determine_splittability_from_compression_type(
              metadata.path, self._compression_type))

      if splittable:
        for split in OffsetRange(0, metadata.size_in_bytes).split(
            self._desired_bundle_size, self._min_bundle_size):
          yield (metadata, split)
      else:
        yield (
            metadata,
            OffsetRange(0, range_trackers.OffsetRangeTracker.OFFSET_INFINITY))
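

# For intuition (hypothetical numbers): given an uncompressed 3 MB file and a
# desired_bundle_size of 1 MB, _ExpandIntoRanges emits pairs roughly like
#
#   (metadata, OffsetRange(0, 1048576))
#   (metadata, OffsetRange(1048576, 2097152))
#   (metadata, OffsetRange(2097152, 3145728))
#
# whereas a gzip-compressed file yields a single
# (metadata, OffsetRange(0, OffsetRangeTracker.OFFSET_INFINITY)) pair, since it
# cannot be read in parallel byte ranges.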


class _ReadRange(DoFn):
  def __init__(
      self,
      source_from_file,  # type: Union[str, iobase.BoundedSource]
      with_filename=False  # type: bool
  ) -> None:
    self._source_from_file = source_from_file
    self._with_filename = with_filename

  def process(self, element, *args, **kwargs):
    metadata, range = element
    source = self._source_from_file(metadata.path)
    # The following split() operation has to be performed to create a proper
    # _SingleFileSource. Otherwise what we have is a ConcatSource that contains
    # a single _SingleFileSource. ConcatSource.read() expects a RangeTracker for
    # a sub-source range and reads full sub-sources (not byte ranges).
    source_list = list(source.split(float('inf')))
    # Handle the case of an empty source.
    if not source_list:
      return
    source = source_list[0].source

    for record in source.read(range.new_tracker()):
      if self._with_filename:
        yield (metadata.path, record)
      else:
        yield record


class ReadAllFiles(PTransform):
  """A Read transform that reads a PCollection of files.

  Pipeline authors should not use this directly. This is to be used by Read
  PTransform authors who wish to implement file-based Read transforms that
  read a PCollection of files.
  """
  def __init__(self,
               splittable,  # type: bool
               compression_type,
               desired_bundle_size,  # type: int
               min_bundle_size,  # type: int
               source_from_file,  # type: Callable[[str], iobase.BoundedSource]
               with_filename=False  # type: bool
              ):
    """
    Args:
      splittable: If False, files won't be split into sub-ranges. If True,
        files may or may not be split into data ranges.
      compression_type: A ``CompressionType`` object that specifies the
        compression type of the files that will be processed. If
        ``CompressionType.AUTO``, the system will try to automatically
        determine the compression type based on the extension of files.
      desired_bundle_size: the desired size of data ranges that should be
        generated when splitting a file into data ranges.
      min_bundle_size: minimum size of data ranges that should be generated
        when splitting a file into data ranges.
      source_from_file: a function that produces a ``BoundedSource`` given a
        file name. The system will use this function to generate
        ``BoundedSource`` objects for file paths. Note that file paths passed
        to this will be for individual files, not for file patterns even if
        the ``PCollection`` of files processed by the transform consists of
        file patterns.
      with_filename: If True, returns a key-value pair with the key being the
        file name and the value being the actual data. If False, it only
        returns the data.
    """
    self._splittable = splittable
    self._compression_type = compression_type
    self._desired_bundle_size = desired_bundle_size
    self._min_bundle_size = min_bundle_size
    self._source_from_file = source_from_file
    self._with_filename = with_filename
    # TODO(BEAM-14497) always reshuffle once gbk always trigger works.
    self._is_reshuffle = True

  def _disable_reshuffle(self):
    # TODO(BEAM-14497) Remove this private method once gbk always trigger works.
    #
    # Currently Reshuffle() holds elements until the stage is completed. When
    # ReadRange is needed instantly after match (like read continuously), the
    # reshard is temporarily disabled. However, the read then does not scale and
    # is deemed experimental.
    self._is_reshuffle = False
    return self

  def expand(self, pvalue):
    pvalue = (
        pvalue
        | 'ExpandIntoRanges' >> ParDo(
            _ExpandIntoRanges(
                self._splittable,
                self._compression_type,
                self._desired_bundle_size,
                self._min_bundle_size)))
    if self._is_reshuffle:
      pvalue = pvalue | 'Reshard' >> Reshuffle()
    return (
        pvalue
        | 'ReadRange' >> ParDo(
            _ReadRange(
                self._source_from_file, with_filename=self._with_filename)))
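

# Illustrative composition sketch (not part of this module): a Read PTransform
# author might wrap ReadAllFiles roughly as follows. The transform name, the
# _NewlineDelimitedSource factory and the 64 MB bundle size are hypothetical.
#
#   class ReadAllMyFiles(PTransform):
#     def expand(self, pcoll):
#       # pcoll is a PCollection of file patterns (or FileMetadata objects).
#       return pcoll | ReadAllFiles(
#           splittable=True,
#           compression_type=CompressionTypes.AUTO,
#           desired_bundle_size=64 * 1024 * 1024,
#           min_bundle_size=0,
#           source_from_file=lambda path: _NewlineDelimitedSource(path),
#           with_filename=False)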