github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/filebasedsource.py

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """A framework for developing sources for new file types.
    19  
    20  To create a source for a new file type, a sub-class of :class:`FileBasedSource`
    21  should be created. Sub-classes of :class:`FileBasedSource` must implement the
    22  method :meth:`FileBasedSource.read_records()`. Please read the documentation of
    23  that method for more details.
    24  
    25  For an example implementation of :class:`FileBasedSource` see
    26  :class:`~apache_beam.io._AvroSource`.
    27  """
    28  
    29  # pytype: skip-file
    30  
    31  from typing import Callable
    32  from typing import Iterable
    33  from typing import Tuple
    34  from typing import Union
    35  
    36  from apache_beam.internal import pickler
    37  from apache_beam.io import concat_source
    38  from apache_beam.io import iobase
    39  from apache_beam.io import range_trackers
    40  from apache_beam.io.filesystem import CompressionTypes
    41  from apache_beam.io.filesystem import FileMetadata
    42  from apache_beam.io.filesystems import FileSystems
    43  from apache_beam.io.restriction_trackers import OffsetRange
    44  from apache_beam.options.value_provider import StaticValueProvider
    45  from apache_beam.options.value_provider import ValueProvider
    46  from apache_beam.options.value_provider import check_accessible
    47  from apache_beam.transforms.core import DoFn
    48  from apache_beam.transforms.core import ParDo
    49  from apache_beam.transforms.core import PTransform
    50  from apache_beam.transforms.display import DisplayDataItem
    51  from apache_beam.transforms.util import Reshuffle
    52  
    53  MAX_NUM_THREADS_FOR_SIZE_ESTIMATION = 25
    54  
    55  __all__ = ['FileBasedSource']
    56  
    57  
    58  class FileBasedSource(iobase.BoundedSource):
    59    """A :class:`~apache_beam.io.iobase.BoundedSource` for reading a file glob of
    60    a given type."""
    61  
    62    MIN_NUMBER_OF_FILES_TO_STAT = 100
    63    MIN_FRACTION_OF_FILES_TO_STAT = 0.01
    64  
    65    def __init__(
    66        self,
    67        file_pattern,
    68        min_bundle_size=0,
    69        compression_type=CompressionTypes.AUTO,
    70        splittable=True,
    71        validate=True):
    72      """Initializes :class:`FileBasedSource`.
    73  
    74      Args:
    75        file_pattern (str): the file glob to read; can be a string or a
    76          :class:`~apache_beam.options.value_provider.ValueProvider`
    77          (placeholder to inject a runtime value).
    78        min_bundle_size (int): minimum size of bundles that should be generated
    79          when performing initial splitting on this source.
    80        compression_type (str): Used to handle compressed input files.
    81          Typical value is :attr:`CompressionTypes.AUTO
    82          <apache_beam.io.filesystem.CompressionTypes.AUTO>`,
    83          in which case the final file path's extension will be used to detect
    84          the compression.
    85        splittable (bool): whether :class:`FileBasedSource` should try to
    86          logically split a single file into data ranges so that different parts
    87          of the same file can be read in parallel. If set to :data:`False`,
    88          :class:`FileBasedSource` will prevent both initial and dynamic splitting
    89          of sources for single files. File patterns that represent multiple files
    90          may still get split into sources for individual files. Even if set to
    91          :data:`True` by the user, :class:`FileBasedSource` may choose to not
    92          split the file, for example, for compressed files where currently it is
    93          not possible to efficiently read a data range without decompressing the
    94          whole file.
    95        validate (bool): Boolean flag to verify that the files exist at
    96          pipeline creation time.
    97  
    98      Raises:
    99        TypeError: when **compression_type** is not valid or if
   100          **file_pattern** is not a :class:`str` or a
   101          :class:`~apache_beam.options.value_provider.ValueProvider`.
   102        ValueError: when compression is specified together with splittable
   103          files.
   104        IOError: when the file pattern specified yields an empty
   105          result.
   106      """
   107  
   108      if not isinstance(file_pattern, (str, ValueProvider)):
   109        raise TypeError(
   110            '%s: file_pattern must be of type string'
   111            ' or ValueProvider; got %r instead' %
   112            (self.__class__.__name__, file_pattern))
   113  
   114      if isinstance(file_pattern, str):
   115        file_pattern = StaticValueProvider(str, file_pattern)
   116      self._pattern = file_pattern
   117  
   118      self._concat_source = None
   119      self._min_bundle_size = min_bundle_size
   120      if not CompressionTypes.is_valid_compression_type(compression_type):
   121        raise TypeError(
   122            'compression_type must be CompressionType object but '
   123            'was %s' % type(compression_type))
   124      self._compression_type = compression_type
   125      self._splittable = splittable
   126      if validate and file_pattern.is_accessible():
   127        self._validate()
   128  
   129    def display_data(self):
   130      return {
   131          'file_pattern': DisplayDataItem(
   132              str(self._pattern), label="File Pattern"),
   133          'compression': DisplayDataItem(
   134              str(self._compression_type), label='Compression Type')
   135      }
   136  
   137    @check_accessible(['_pattern'])
   138    def _get_concat_source(self):
   139      # type: () -> concat_source.ConcatSource
   140      if self._concat_source is None:
   141        pattern = self._pattern.get()
   142  
   143        single_file_sources = []
   144        match_result = FileSystems.match([pattern])[0]
   145        files_metadata = match_result.metadata_list
   146  
   147        # We create a reference for FileBasedSource that will be serialized along
   148        # with each _SingleFileSource. To prevent this FileBasedSource from having
   149        # a reference to ConcatSource (resulting in quadratic space complexity)
   150        # we clone it here.
   151        file_based_source_ref = pickler.loads(pickler.dumps(self))
   152  
   153        for file_metadata in files_metadata:
   154          file_name = file_metadata.path
   155          file_size = file_metadata.size_in_bytes
   156          if file_size == 0:
   157            continue  # Ignoring empty file.
   158  
   159          # We determine splittability of this specific file.
   160          splittable = (
   161              self.splittable and _determine_splittability_from_compression_type(
   162                  file_name, self._compression_type))
   163  
   164          single_file_source = _SingleFileSource(
   165              file_based_source_ref,
   166              file_name,
   167              0,
   168              file_size,
   169              min_bundle_size=self._min_bundle_size,
   170              splittable=splittable)
   171          single_file_sources.append(single_file_source)
   172        self._concat_source = concat_source.ConcatSource(single_file_sources)
   173      return self._concat_source
   174  
   175    def open_file(self, file_name):
   176      return FileSystems.open(
   177          file_name,
   178          'application/octet-stream',
   179          compression_type=self._compression_type)
   180  
   181    @check_accessible(['_pattern'])
   182    def _validate(self):
   183      """Validates that there are actual files matching the specified glob pattern.
   184      """
   185      pattern = self._pattern.get()
   186  
   187      # Limit the responses as we only want to check if something exists
   188      match_result = FileSystems.match([pattern], limits=[1])[0]
   189      if len(match_result.metadata_list) <= 0:
   190        raise IOError('No files found based on the file pattern %s' % pattern)
   191  
   192    def split(
   193        self, desired_bundle_size=None, start_position=None, stop_position=None):
   194      return self._get_concat_source().split(
   195          desired_bundle_size=desired_bundle_size,
   196          start_position=start_position,
   197          stop_position=stop_position)
   198  
   199    def estimate_size(self):
   200      return self._get_concat_source().estimate_size()
   201  
   202    def read(self, range_tracker):
   203      return self._get_concat_source().read(range_tracker)
   204  
   205    def get_range_tracker(self, start_position, stop_position):
   206      return self._get_concat_source().get_range_tracker(
   207          start_position, stop_position)
   208  
   209    def read_records(self, file_name, offset_range_tracker):
   210      """Returns a generator of records created by reading file 'file_name'.
   211  
   212      Args:
   213        file_name: a ``string`` that gives the name of the file to be read. Method
   214                   ``FileBasedSource.open_file()`` must be used to open the file
   215                   and create a seekable file object.
   216        offset_range_tracker: an object of type ``OffsetRangeTracker``. This
   217                              defines the byte range of the file that should be
   218                              read. See documentation in
   219                              ``iobase.BoundedSource.read()`` for more information
   220                              on reading records while complying with the range
   221                              defined by a given ``RangeTracker``.
   222  
   223      Returns:
   224        an iterator that gives the records read from the given file.
   225      """
   226      raise NotImplementedError
   227  
   228    @property
   229    def splittable(self):
   230      return self._splittable
   231  
   232  
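        # The sketch below is illustrative only and not part of this module: a
        # minimal FileBasedSource sub-class that emits newline-delimited records,
        # in the way the module docstring and FileBasedSource.read_records()
        # describe. The name _ExampleLineSource is hypothetical, and the sketch
        # glosses over delimiter edge cases that the production text source
        # handles carefully.
        class _ExampleLineSource(FileBasedSource):
          def read_records(self, file_name, offset_range_tracker):
            with self.open_file(file_name) as file_handle:
              start = offset_range_tracker.start_position()
              if start > 0:
                # The record straddling the range start belongs to the source
                # responsible for the preceding byte range, so skip past it.
                file_handle.seek(start - 1)
                file_handle.readline()
              # Claim the start offset of every record. Once a record's start is
              # claimed, the whole record is emitted even if it extends past the
              # end of the tracked range.
              while offset_range_tracker.try_claim(file_handle.tell()):
                line = file_handle.readline()
                if not line:
                  break
                yield line.rstrip(b'\r\n')
        # In a pipeline such a source would typically be read with
        # beam.io.Read(_ExampleLineSource('gs://some-bucket/*.txt')).
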
   233  def _determine_splittability_from_compression_type(file_path, compression_type):
   234    if compression_type == CompressionTypes.AUTO:
   235      compression_type = CompressionTypes.detect_compression_type(file_path)
   236  
   237    return compression_type == CompressionTypes.UNCOMPRESSED
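
        # For example (illustrative file names): with CompressionTypes.AUTO a path
        # ending in '.gz' is detected as GZIP and therefore treated as
        # unsplittable, while a plain '.txt' path stays UNCOMPRESSED and remains
        # splittable.
        #
        #   _determine_splittability_from_compression_type(
        #       'data.txt', CompressionTypes.AUTO)     # True
        #   _determine_splittability_from_compression_type(
        #       'data.txt.gz', CompressionTypes.AUTO)  # False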
   238  
   239  
   240  class _SingleFileSource(iobase.BoundedSource):
   241    """Denotes a source for a single file (or a byte range within it)."""
   242    def __init__(
   243        self,
   244        file_based_source,
   245        file_name,
   246        start_offset,
   247        stop_offset,
   248        min_bundle_size=0,
   249        splittable=True):
   250      if not isinstance(start_offset, int):
   251        raise TypeError(
   252            'start_offset must be a number. Received: %r' % start_offset)
   253      if stop_offset != range_trackers.OffsetRangeTracker.OFFSET_INFINITY:
   254        if not isinstance(stop_offset, int):
   255          raise TypeError(
   256              'stop_offset must be a number. Received: %r' % stop_offset)
   257        if start_offset >= stop_offset:
   258          raise ValueError(
   259              'start_offset must be smaller than stop_offset. Received %d and %d '
   260              'for start and stop offsets respectively' %
   261              (start_offset, stop_offset))
   262  
   263      self._file_name = file_name
   264      self._is_gcs_file = file_name.startswith('gs://') if file_name else False
   265      self._start_offset = start_offset
   266      self._stop_offset = stop_offset
   267      self._min_bundle_size = min_bundle_size
   268      self._file_based_source = file_based_source
   269      self._splittable = splittable
   270  
   271    def split(self, desired_bundle_size, start_offset=None, stop_offset=None):
   272      if start_offset is None:
   273        start_offset = self._start_offset
   274      if stop_offset is None:
   275        stop_offset = self._stop_offset
   276  
   277      if self._splittable:
   278        splits = OffsetRange(start_offset, stop_offset).split(
   279            desired_bundle_size, self._min_bundle_size)
   280        for split in splits:
   281          yield iobase.SourceBundle(
   282              split.stop - split.start,
   283              _SingleFileSource(
   284                  # Copying this so that each sub-source gets a fresh instance.
   285                  pickler.loads(pickler.dumps(self._file_based_source)),
   286                  self._file_name,
   287                  split.start,
   288                  split.stop,
   289                  min_bundle_size=self._min_bundle_size,
   290                  splittable=self._splittable),
   291              split.start,
   292              split.stop)
   293      else:
   294        # Returning a single sub-source with end offset set to OFFSET_INFINITY (so
   295        # that all data of the source gets read) since this source is
   296        # unsplittable. Choosing size of the file as end offset will be wrong for
   297        # certain unsplittable sources, e.g., compressed sources.
   298        yield iobase.SourceBundle(
   299            stop_offset - start_offset,
   300            _SingleFileSource(
   301                self._file_based_source,
   302                self._file_name,
   303                start_offset,
   304                range_trackers.OffsetRangeTracker.OFFSET_INFINITY,
   305                min_bundle_size=self._min_bundle_size,
   306                splittable=self._splittable),
   307            start_offset,
   308            range_trackers.OffsetRangeTracker.OFFSET_INFINITY)
   309  
   310    def estimate_size(self):
   311      return self._stop_offset - self._start_offset
   312  
   313    def get_range_tracker(self, start_position, stop_position):
   314      if start_position is None:
   315        start_position = self._start_offset
   316      if stop_position is None:
   317        # If the file is unsplittable we choose OFFSET_INFINITY as the default
   318        # end offset so that all data of the source gets read. Choosing the size
   319        # of the file as the end offset will be wrong for certain unsplittable
   320        # sources, e.g., compressed sources.
   321        stop_position = (
   322            self._stop_offset if self._splittable else
   323            range_trackers.OffsetRangeTracker.OFFSET_INFINITY)
   324  
   325      range_tracker = range_trackers.OffsetRangeTracker(
   326          start_position, stop_position)
   327      if not self._splittable:
   328        range_tracker = range_trackers.UnsplittableRangeTracker(range_tracker)
   329  
   330      return range_tracker
   331  
   332    def read(self, range_tracker):
   333      return self._file_based_source.read_records(self._file_name, range_tracker)
   334  
   335    def default_output_coder(self):
   336      return self._file_based_source.default_output_coder()
   337  
   338  
   339  class _ExpandIntoRanges(DoFn):
   340    def __init__(
   341        self, splittable, compression_type, desired_bundle_size, min_bundle_size):
   342      self._desired_bundle_size = desired_bundle_size
   343      self._min_bundle_size = min_bundle_size
   344      self._splittable = splittable
   345      self._compression_type = compression_type
   346  
   347    def process(self, element: Union[str, FileMetadata], *args,
   348                **kwargs) -> Iterable[Tuple[FileMetadata, OffsetRange]]:
   349      if isinstance(element, FileMetadata):
   350        metadata_list = [element]
   351      else:
   352        match_results = FileSystems.match([element])
   353        metadata_list = match_results[0].metadata_list
   354      for metadata in metadata_list:
   355        splittable = (
   356            self._splittable and _determine_splittability_from_compression_type(
   357                metadata.path, self._compression_type))
   358  
   359        if splittable:
   360          for split in OffsetRange(0, metadata.size_in_bytes).split(
   361              self._desired_bundle_size, self._min_bundle_size):
   362            yield (metadata, split)
   363        else:
   364          yield (
   365              metadata,
   366              OffsetRange(0, range_trackers.OffsetRangeTracker.OFFSET_INFINITY))
   367  
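        # Illustrative shape of the _ExpandIntoRanges output (hypothetical values):
        # a matched 150 MB uncompressed file with a 64 MB desired bundle size
        # expands to roughly
        #
        #   (FileMetadata('gs://bucket/part-0.txt', ...), OffsetRange(0, 67108864))
        #   (FileMetadata('gs://bucket/part-0.txt', ...), OffsetRange(67108864, 134217728))
        #   (FileMetadata('gs://bucket/part-0.txt', ...), OffsetRange(134217728, 157286400))
        #
        # while an unsplittable (e.g. gzipped) file expands to a single
        # (FileMetadata, OffsetRange(0, OFFSET_INFINITY)) pair.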
   368  
   369  class _ReadRange(DoFn):
   370    def __init__(
   371        self,
   372        source_from_file,  # type: Union[str, iobase.BoundedSource]
   373        with_filename=False  # type: bool
   374      ) -> None:
   375      self._source_from_file = source_from_file
   376      self._with_filename = with_filename
   377  
   378    def process(self, element, *args, **kwargs):
   379      metadata, range = element
   380      source = self._source_from_file(metadata.path)
   381      # Following split() operation has to be performed to create a proper
   382      # _SingleFileSource. Otherwise what we have is a ConcatSource that contains
   383      # a single _SingleFileSource. ConcatSource.read() expects a RangeTracker for
   384      # sub-source range and reads full sub-sources (not byte ranges).
   385      source_list = list(source.split(float('inf')))
   386      # Handle the case of an empty source.
   387      if not source_list:
   388        return
   389      source = source_list[0].source
   390  
   391      for record in source.read(range.new_tracker()):
   392        if self._with_filename:
   393          yield (metadata.path, record)
   394        else:
   395          yield record
   396  
   397  
   398  class ReadAllFiles(PTransform):
   399    """A Read transform that reads a PCollection of files.
   400  
   401    Pipeline authors should not use this directly. This is to be used by Read
   402    PTransform authors who wish to implement file-based Read transforms that
   403    read a PCollection of files.
   404    """
   405    def __init__(self,
   406                 splittable,  # type: bool
   407                 compression_type,
   408                 desired_bundle_size,  # type: int
   409                 min_bundle_size,  # type: int
   410                 source_from_file,  # type: Callable[[str], iobase.BoundedSource]
   411                 with_filename=False  # type: bool
   412                ):
   413      """
   414      Args:
   415        splittable: If False, files won't be split into sub-ranges. If True,
   416                    files may or may not be split into data ranges.
   417        compression_type: A ``CompressionTypes`` value that specifies the
   418                    compression type of the files that will be processed. If
   419                    ``CompressionTypes.AUTO``, the system will try to automatically
   420                    determine the compression type based on the extension of
   421                    files.
   422        desired_bundle_size: the desired size of data ranges that should be
   423                             generated when splitting a file into data ranges.
   424        min_bundle_size: minimum size of data ranges that should be generated when
   425                             splitting a file into data ranges.
   426        source_from_file: a function that produces a ``BoundedSource`` given a
   427                          file name. The system will use this function to generate
   428                          ``BoundedSource`` objects for file paths. Note that file
   429                          paths passed to this will be for individual files, not
   430                          for file patterns even if the ``PCollection`` of files
   431                          processed by the transform consists of file patterns.
   432        with_filename: If True, returns a key-value pair with the key being the
   433          file name and the value being the actual data. If False, it only
   434          returns the data.
   435      """
   436      self._splittable = splittable
   437      self._compression_type = compression_type
   438      self._desired_bundle_size = desired_bundle_size
   439      self._min_bundle_size = min_bundle_size
   440      self._source_from_file = source_from_file
   441      self._with_filename = with_filename
   442      # TODO(BEAM-14497) always reshuffle once gbk always trigger works.
   443      self._is_reshuffle = True
   444  
   445    def _disable_reshuffle(self):
   446      # TODO(BEAM-14497) Remove this private method once gbk always trigger works.
   447      #
   448      # Currently Reshuffle() holds elements until the stage is completed. When
   449      # ReadRange must run immediately after the match step (e.g., when reading
   450      # continuously), the reshuffle is temporarily disabled. However, the read
   451      # then does not scale and is deemed experimental.
   452      self._is_reshuffle = False
   453      return self
   454  
   455    def expand(self, pvalue):
   456      pvalue = (
   457          pvalue
   458          | 'ExpandIntoRanges' >> ParDo(
   459              _ExpandIntoRanges(
   460                  self._splittable,
   461                  self._compression_type,
   462                  self._desired_bundle_size,
   463                  self._min_bundle_size)))
   464      if self._is_reshuffle:
   465        pvalue = pvalue | 'Reshard' >> Reshuffle()
   466      return (
   467          pvalue
   468          | 'ReadRange' >> ParDo(
   469              _ReadRange(
   470                  self._source_from_file, with_filename=self._with_filename)))
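

        # The sketch below is illustrative only and not part of this module: how a
        # Read PTransform author might build a "read all" transform on top of
        # ReadAllFiles, reusing the hypothetical _ExampleLineSource sketch that
        # follows the FileBasedSource class above. The class name and the 64 MB
        # bundle size are assumptions chosen for the example.
        class _ExampleReadAllLines(PTransform):
          def __init__(self, with_filename=False):
            self._read_all_files = ReadAllFiles(
                splittable=True,
                compression_type=CompressionTypes.AUTO,
                desired_bundle_size=64 * 1024 * 1024,
                min_bundle_size=0,
                source_from_file=lambda path: _ExampleLineSource(
                    path, validate=False),
                with_filename=with_filename)

          def expand(self, pvalue):
            # pvalue is a PCollection of file patterns (or FileMetadata records).
            return pvalue | 'ReadAllViaFileBasedSource' >> self._read_all_files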