github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/fileio.py

github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/fileio.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """``PTransforms`` for manipulating files in Apache Beam.
    19  
    20  Provides reading ``PTransform``\\s, ``MatchFiles``,
    21  ``MatchAll``, that produces a ``PCollection`` of records representing a file
    22  and its metadata; and ``ReadMatches``, which takes in a ``PCollection`` of file
    23  metadata records, and produces a ``PCollection`` of ``ReadableFile`` objects.
    24  These transforms currently do not support splitting by themselves.
    25  
    26  Writing to Files
    27  ================
    28  
    29  The transforms in this file include ``WriteToFiles``, which allows you to write
    30  a ``beam.PCollection`` to files, and gives you many options to customize how to
    31  do this.
    32  
    33  The ``WriteToFiles`` transform supports bounded and unbounded PCollections
    34  (i.e. it can be used both batch and streaming pipelines). For streaming
    35  pipelines, it currently does not have support for multiple trigger firings
    36  on the same window.
    37  
    38  File Naming
    39  -----------
    40  One of the parameters received by ``WriteToFiles`` is a function specifying how
    41  to name the files that are written. This is a function that takes in the
    42  following parameters:
    43  
    44  - window
    45  - pane
    46  - shard_index
    47  - total_shards
    48  - compression
    49  - destination
    50  
    51  It should return a file name that is unique for a combination of these
    52  parameters.
    53  
    54  The default naming strategy is to name files
    55  in the format
    56  `$prefix-$start-$end-$pane-$shard-of-$numShards$suffix$compressionSuffix`,
    57  where:
    58  
    59  - `$prefix` is, by default, `"output"`.
    60  - `$start` and `$end` are the boundaries of the window for the data being
    61    written. These are omitted if we're using the Global window.
    62  - `$pane` is the index for the number of firing for a window.
    63  - `$shard` and `$numShards` are the current shard number, and the total number
    64    of shards for this window firing.
    65  - `$suffix` is, by default, an empty string, but it can be set by the user via
    66    ``default_file_naming``.
    67  
    68  Dynamic Destinations
    69  --------------------
    70  If the elements in the input ``beam.PCollection`` can be partitioned into groups
    71  that should be treated differently (e.g. some events are to be stored as CSV,
    72  while some others are to be stored as Avro files), it is possible to do this
    73  by passing a `destination` parameter to ``WriteToFiles``. Something like the
    74  following::
    75  
    76      my_pcollection | beam.io.fileio.WriteToFiles(
    77            path='/my/file/path',
    78            destination=lambda record: 'avro' if record['type'] == 'A' else 'csv',
    79            sink=lambda dest: AvroSink() if dest == 'avro' else CsvSink(),
    80            file_naming=beam.io.fileio.destination_prefix_naming())
    81  
    82  In this transform, depending on the type of a record, it will be written down to
    83  a destination named `'avro'`, or `'csv'`. The value returned by the
    84  `destination` call is then passed to the `sink` call, to determine what sort of
    85  sink will be used for each destination. The return type of the `destination`
    86  parameter can be anything, as long as elements can be grouped by it.
    87  """
    88  
    89  # pytype: skip-file
    90  
    91  import collections
    92  import logging
    93  import os
    94  import random
    95  import uuid
    96  from collections import namedtuple
    97  from functools import partial
    98  from typing import TYPE_CHECKING
    99  from typing import Any
   100  from typing import BinaryIO  # pylint: disable=unused-import
   101  from typing import Callable
   102  from typing import DefaultDict
   103  from typing import Dict
   104  from typing import Iterable
   105  from typing import List
   106  from typing import Tuple
   107  from typing import Union
   108  
   109  import apache_beam as beam
   110  from apache_beam.io import filesystem
   111  from apache_beam.io import filesystems
   112  from apache_beam.io.filesystem import BeamIOError
   113  from apache_beam.io.filesystem import CompressionTypes
   114  from apache_beam.options.pipeline_options import GoogleCloudOptions
   115  from apache_beam.options.value_provider import StaticValueProvider
   116  from apache_beam.options.value_provider import ValueProvider
   117  from apache_beam.transforms.periodicsequence import PeriodicImpulse
   118  from apache_beam.transforms.userstate import CombiningValueStateSpec
   119  from apache_beam.transforms.window import FixedWindows
   120  from apache_beam.transforms.window import GlobalWindow
   121  from apache_beam.transforms.window import IntervalWindow
   122  from apache_beam.utils.timestamp import MAX_TIMESTAMP
   123  from apache_beam.utils.timestamp import Timestamp
   124  
   125  if TYPE_CHECKING:
   126    from apache_beam.transforms.window import BoundedWindow
   127  
   128  __all__ = [
   129      'EmptyMatchTreatment',
   130      'MatchFiles',
   131      'MatchAll',
   132      'MatchContinuously',
   133      'ReadableFile',
   134      'ReadMatches',
   135      'WriteToFiles'
   136  ]
   137  
   138  _LOGGER = logging.getLogger(__name__)
   139  
   140  FileMetadata = namedtuple("FileMetadata", "mime_type compression_type")
   141  
   142  CreateFileMetadataFn = Callable[[str, str], FileMetadata]
   143  
   144  
   145  class EmptyMatchTreatment(object):
   146    """How to treat empty matches in ``MatchAll`` and ``MatchFiles`` transforms.
   147  
   148    If empty matches are disallowed, an error will be thrown if a pattern does not
   149    match any files."""
   150  
   151    ALLOW = 'ALLOW'
   152    DISALLOW = 'DISALLOW'
   153    ALLOW_IF_WILDCARD = 'ALLOW_IF_WILDCARD'
   154  
   155    @staticmethod
   156    def allow_empty_match(pattern, setting):
   157      if setting == EmptyMatchTreatment.ALLOW:
   158        return True
   159      elif setting == EmptyMatchTreatment.ALLOW_IF_WILDCARD and '*' in pattern:
   160        return True
   161      elif setting == EmptyMatchTreatment.DISALLOW:
   162        return False
   163      else:
   164        raise ValueError(setting)
   165  
   166  
   167  class _MatchAllFn(beam.DoFn):
   168    def __init__(self, empty_match_treatment):
   169      self._empty_match_treatment = empty_match_treatment
   170  
   171    def process(self, file_pattern: str) -> List[filesystem.FileMetadata]:
   172      # TODO: Should we batch the lookups?
   173      match_results = filesystems.FileSystems.match([file_pattern])
   174      match_result = match_results[0]
   175  
   176      if (not match_result.metadata_list and
   177          not EmptyMatchTreatment.allow_empty_match(file_pattern,
   178                                                    self._empty_match_treatment)):
   179        raise BeamIOError(
   180            'Empty match for pattern %s. Disallowed.' % file_pattern)
   181  
   182      return match_result.metadata_list
   183  
   184  
   185  class MatchFiles(beam.PTransform):
   186    """Matches a file pattern using ``FileSystems.match``.
   187  
   188    This ``PTransform`` returns a ``PCollection`` of matching files in the form
   189    of ``FileMetadata`` objects."""
   190    def __init__(
   191        self,
   192        file_pattern: str,
   193        empty_match_treatment=EmptyMatchTreatment.ALLOW_IF_WILDCARD):
   194      self._file_pattern = file_pattern
   195      self._empty_match_treatment = empty_match_treatment
   196  
   197    def expand(self, pcoll) -> beam.PCollection[filesystem.FileMetadata]:
   198      return pcoll.pipeline | beam.Create([self._file_pattern]) | MatchAll()
   199  
   200  
   201  class MatchAll(beam.PTransform):
   202    """Matches file patterns from the input PCollection via ``FileSystems.match``.
   203  
   204    This ``PTransform`` returns a ``PCollection`` of matching files in the form
   205    of ``FileMetadata`` objects."""
   206    def __init__(self, empty_match_treatment=EmptyMatchTreatment.ALLOW):
   207      self._empty_match_treatment = empty_match_treatment
   208  
   209    def expand(
   210        self,
   211        pcoll: beam.PCollection,
   212    ) -> beam.PCollection[filesystem.FileMetadata]:
   213      return pcoll | beam.ParDo(_MatchAllFn(self._empty_match_treatment))
   214  
   215  
   216  class ReadableFile(object):
   217    """A utility class for accessing files."""
   218    def __init__(self, metadata, compression=None):
   219      self.metadata = metadata
   220      self._compression = compression
   221  
   222    def open(self, mime_type='text/plain', compression_type=None):
   223      compression = (
   224          compression_type or self._compression or
   225          filesystems.CompressionTypes.AUTO)
   226      return filesystems.FileSystems.open(
   227          self.metadata.path, mime_type=mime_type, compression_type=compression)
   228  
   229    def read(self, mime_type='application/octet-stream'):
   230      return self.open(mime_type).read()
   231  
   232    def read_utf8(self):
   233      return self.open().read().decode('utf-8')
   234  
   235  
   236  class _ReadMatchesFn(beam.DoFn):
   237    def __init__(self, compression, skip_directories):
   238      self._compression = compression
   239      self._skip_directories = skip_directories
   240  
   241    def process(
   242        self,
   243        file_metadata: Union[str, filesystem.FileMetadata],
   244    ) -> Iterable[ReadableFile]:
   245      metadata = (
   246          filesystem.FileMetadata(file_metadata, 0) if isinstance(
   247              file_metadata, str) else file_metadata)
   248  
   249      if ((metadata.path.endswith('/') or metadata.path.endswith('\\')) and
   250          self._skip_directories):
   251        return
   252      elif metadata.path.endswith('/') or metadata.path.endswith('\\'):
   253        raise BeamIOError(
   254            'Directories are not allowed in ReadMatches transform.'
   255            'Found %s.' % metadata.path)
   256  
   257      # TODO: Mime type? Other arguments? Maybe arguments passed in to transform?
   258      yield ReadableFile(metadata, self._compression)
   259  
   260  
   261  class MatchContinuously(beam.PTransform):
   262    """Checks for new files for a given pattern every interval.
   263  
   264    This ``PTransform`` returns a ``PCollection`` of matching files in the form
   265    of ``FileMetadata`` objects.
   266  
   267    MatchContinuously is experimental.  No backwards-compatibility
   268    guarantees.
   269    """
   270    def __init__(
   271        self,
   272        file_pattern,
   273        interval=360.0,
   274        has_deduplication=True,
   275        start_timestamp=Timestamp.now(),
   276        stop_timestamp=MAX_TIMESTAMP,
   277        match_updated_files=False,
   278        apply_windowing=False,
   279        empty_match_treatment=EmptyMatchTreatment.ALLOW):
   280      """Initializes a MatchContinuously transform.
   281  
   282      Args:
   283        file_pattern: The file path to read from.
   284        interval: Interval at which to check for files in seconds.
   285        has_deduplication: Whether files already read are discarded or not.
   286        start_timestamp: Timestamp for start file checking.
   287        stop_timestamp: Timestamp after which no more files will be checked.
   288        match_updated_files: (When has_deduplication is set to True) whether match
   289          file with timestamp changes.
   290        apply_windowing: Whether each element should be assigned to
   291          individual window. If false, all elements will reside in global window.
   292      """
   293  
   294      self.file_pattern = file_pattern
   295      self.interval = interval
   296      self.has_deduplication = has_deduplication
   297      self.start_ts = start_timestamp
   298      self.stop_ts = stop_timestamp
   299      self.match_upd = match_updated_files
   300      self.apply_windowing = apply_windowing
   301      self.empty_match_treatment = empty_match_treatment
   302  
   303    def expand(self, pbegin) -> beam.PCollection[filesystem.FileMetadata]:
   304      # invoke periodic impulse
   305      impulse = pbegin | PeriodicImpulse(
   306          start_timestamp=self.start_ts,
   307          stop_timestamp=self.stop_ts,
   308          fire_interval=self.interval)
   309  
   310      # match file pattern periodically
   311      match_files = (
   312          impulse
   313          | 'GetFilePattern' >> beam.Map(lambda x: self.file_pattern)
   314          | MatchAll(self.empty_match_treatment))
   315  
   316      # apply deduplication strategy if required
   317      if self.has_deduplication:
   318        # Making a Key Value so each file has its own state.
   319        match_files = match_files | 'ToKV' >> beam.Map(lambda x: (x.path, x))
   320        if self.match_upd:
   321          match_files = match_files | 'RemoveOldAlreadyRead' >> beam.ParDo(
   322              _RemoveOldDuplicates())
   323        else:
   324          match_files = match_files | 'RemoveAlreadyRead' >> beam.ParDo(
   325              _RemoveDuplicates())
   326  
   327      # apply windowing if required. Apply at last because deduplication relies on
   328      # the global window.
   329      if self.apply_windowing:
   330        match_files = match_files | beam.WindowInto(FixedWindows(self.interval))
   331  
   332      return match_files
   333  
   334  
   335  class ReadMatches(beam.PTransform):
   336    """Converts each result of MatchFiles() or MatchAll() to a ReadableFile.
   337  
   338     This helps read in a file's contents or obtain a file descriptor."""
   339    def __init__(self, compression=None, skip_directories=True):
   340      self._compression = compression
   341      self._skip_directories = skip_directories
   342  
   343    def expand(
   344        self,
   345        pcoll: beam.PCollection[Union[str, filesystem.FileMetadata]],
   346    ) -> beam.PCollection[ReadableFile]:
   347      return pcoll | beam.ParDo(
   348          _ReadMatchesFn(self._compression, self._skip_directories))
   349  
   350  
   351  class FileSink(object):
   352    """Specifies how to write elements to individual files in ``WriteToFiles``.
   353  
   354    A Sink class must implement the following:
   355  
   356     - The ``open`` method, which initializes writing to a file handler (it is not
   357       responsible for opening the file handler itself).
   358     - The ``write`` method, which writes an element to the file that was passed
   359       in ``open``.
   360     - The ``flush`` method, which flushes any buffered state. This is most often
   361       called before closing a file (but not exclusively called in that
   362       situation). The sink is not responsible for closing the file handler.
   363    A Sink class can override the following:
   364     - The ``create_metadata`` method, which creates all metadata passed to
   365       Filesystems.create.
   366     """
   367    def create_metadata(
   368        self, destination: str, full_file_name: str) -> FileMetadata:
   369      return FileMetadata(
   370          mime_type="application/octet-stream",
   371          compression_type=CompressionTypes.AUTO)
   372  
   373    def open(self, fh):
   374      # type: (BinaryIO) -> None
   375      raise NotImplementedError
   376  
   377    def write(self, record):
   378      raise NotImplementedError
   379  
   380    def flush(self):
   381      raise NotImplementedError
   382  
   383  
   384  @beam.typehints.with_input_types(str)
   385  class TextSink(FileSink):
   386    """A sink that encodes utf8 elements, and writes to file handlers.
   387  
   388    This sink simply calls file_handler.write(record.encode('utf8') + '\n') on all
   389    records that come into it.
   390    """
   391    def open(self, fh):
   392      self._fh = fh
   393  
   394    def write(self, record):
   395      self._fh.write(record.encode('utf8'))
   396      self._fh.write(b'\n')
   397  
   398    def flush(self):
   399      self._fh.flush()
   400  
   401  
   402  def prefix_naming(prefix):
   403    return default_file_naming(prefix)
   404  
   405  
   406  _DEFAULT_FILE_NAME_TEMPLATE = (
   407      '{prefix}-{start}-{end}-{pane}-'
   408      '{shard:05d}-of-{total_shards:05d}'
   409      '{suffix}{compression}')
   410  
   411  
   412  def _format_shard(
   413      window, pane, shard_index, total_shards, compression, prefix, suffix):
   414    kwargs = {
   415        'prefix': prefix,
   416        'start': '',
   417        'end': '',
   418        'pane': '',
   419        'shard': 0,
   420        'total_shards': 0,
   421        'suffix': '',
   422        'compression': ''
   423    }
   424  
   425    if total_shards is not None and shard_index is not None:
   426      kwargs['shard'] = int(shard_index)
   427      kwargs['total_shards'] = int(total_shards)
   428  
   429    if window != GlobalWindow():
   430      kwargs['start'] = window.start.to_utc_datetime().isoformat()
   431      kwargs['end'] = window.end.to_utc_datetime().isoformat()
   432  
   433    # TODO(https://github.com/apache/beam/issues/18721): Add support for PaneInfo
   434    # If the PANE is the ONLY firing in the window, we don't add it.
   435    #if pane and not (pane.is_first and pane.is_last):
   436    #  kwargs['pane'] = pane.index
   437  
   438    if suffix:
   439      kwargs['suffix'] = suffix
   440  
   441    if compression:
   442      kwargs['compression'] = '.%s' % compression
   443  
   444    # Remove separators for unused template parts.
   445    format = _DEFAULT_FILE_NAME_TEMPLATE
   446    if shard_index is None:
   447      format = format.replace('-{shard:05d}', '')
   448    if total_shards is None:
   449      format = format.replace('-of-{total_shards:05d}', '')
   450    for name, value in kwargs.items():
   451      if value in (None, ''):
   452        format = format.replace('-{%s}' % name, '')
   453  
   454    return format.format(**kwargs)
   455  
   456  
   457  FileNaming = Callable[[Any, Any, int, int, Any, str, str], str]
   458  
   459  
   460  def destination_prefix_naming(suffix=None) -> FileNaming:
   461    def _inner(window, pane, shard_index, total_shards, compression, destination):
   462      prefix = str(destination)
   463      return _format_shard(
   464          window, pane, shard_index, total_shards, compression, prefix, suffix)
   465  
   466    return _inner
   467  
   468  
   469  def default_file_naming(prefix, suffix=None) -> FileNaming:
   470    def _inner(window, pane, shard_index, total_shards, compression, destination):
   471      return _format_shard(
   472          window, pane, shard_index, total_shards, compression, prefix, suffix)
   473  
   474    return _inner
   475  
   476  
   477  def single_file_naming(prefix, suffix=None) -> FileNaming:
   478    def _inner(window, pane, shard_index, total_shards, compression, destination):
   479      assert shard_index in (0, None), shard_index
   480      assert total_shards in (1, None), total_shards
   481      return _format_shard(window, pane, None, None, compression, prefix, suffix)
   482  
   483    return _inner
   484  
   485  
   486  _FileResult = collections.namedtuple(
   487      'FileResult', [
   488          'file_name',
   489          'shard_index',
   490          'total_shards',
   491          'window',
   492          'pane',
   493          'destination'
   494      ])
   495  
   496  
   497  # Adding a class to contain PyDoc.
   498  class FileResult(_FileResult):
   499    """A descriptor of a file that has been written."""
   500    pass
   501  
   502  
   503  class WriteToFiles(beam.PTransform):
   504    r"""Write the incoming PCollection to a set of output files.
   505  
   506    The incoming ``PCollection`` may be bounded or unbounded.
   507  
   508    **Note:** For unbounded ``PCollection``\s, this transform does not support
   509    multiple firings per Window (due to the fact that files are named only by
   510    their destination, and window, at the moment).
   511  
   512    WriteToFiles is experimental.  No backwards-compatibility guarantees.
   513    """
   514  
   515    # We allow up to 20 different destinations to be written in a single bundle.
   516    # Too many files will add memory pressure to the worker, so we let it be 20.
   517    MAX_NUM_WRITERS_PER_BUNDLE = 20
   518  
   519    DEFAULT_SHARDING = 5
   520  
   521    def __init__(
   522        self,
   523        path,
   524        file_naming=None,
   525        destination=None,
   526        temp_directory=None,
   527        sink=None,
   528        shards=None,
   529        output_fn=None,
   530        max_writers_per_bundle=MAX_NUM_WRITERS_PER_BUNDLE):
   531      """Initializes a WriteToFiles transform.
   532  
   533      Args:
   534        path (str, ValueProvider): The directory to write files into.
   535        file_naming (callable): A callable that takes in a window, pane,
   536          shard_index, total_shards and compression; and returns a file name.
   537        destination (callable): If this argument is provided, the sink parameter
   538          must also be a callable.
   539        temp_directory (str, ValueProvider): To ensure atomicity in the transform,
   540          the output is written into temporary files, which are written to a
   541          directory that is meant to be temporary as well. Once the whole output
   542          has been written, the files are moved into their final destination, and
   543          given their final names. By default, the temporary directory will be
   544          within the temp_location of your pipeline.
   545        sink (callable, ~apache_beam.io.fileio.FileSink): The sink to use to write
   546          into a file. It should implement the methods of a ``FileSink``. Pass a
   547          class signature or an instance of FileSink to this parameter. If none is
   548          provided, a ``TextSink`` is used.
   549        shards (int): The number of shards per destination and trigger firing.
   550        max_writers_per_bundle (int): The number of writers that can be open
   551          concurrently in a single worker that's processing one bundle.
   552      """
   553      self.path = (
   554          path if isinstance(path, ValueProvider) else StaticValueProvider(
   555              str, path))
   556      self.file_naming_fn = file_naming or default_file_naming('output')
   557      self.destination_fn = self._get_destination_fn(destination)
   558      self._temp_directory = temp_directory
   559      self.sink_fn = self._get_sink_fn(sink)
   560      self.shards = shards or WriteToFiles.DEFAULT_SHARDING
   561      self.output_fn = output_fn or (lambda x: x)
   562  
   563      self._max_num_writers_per_bundle = max_writers_per_bundle
   564  
   565    @staticmethod
   566    def _get_sink_fn(input_sink):
   567      # type: (...) -> Callable[[Any], FileSink]
   568      if isinstance(input_sink, type) and issubclass(input_sink, FileSink):
   569        return lambda x: input_sink()
   570      elif isinstance(input_sink, FileSink):
   571        kls = input_sink.__class__
   572        return lambda x: kls()
   573      elif callable(input_sink):
   574        return input_sink
   575      else:
   576        return lambda x: TextSink()
   577  
   578    @staticmethod
   579    def _get_destination_fn(destination):
   580      # type: (...) -> Callable[[Any], str]
   581      if isinstance(destination, ValueProvider):
   582        return lambda elm: destination.get()
   583      elif callable(destination):
   584        return destination
   585      else:
   586        return lambda elm: destination
   587  
   588    def expand(self, pcoll):
   589      p = pcoll.pipeline
   590  
   591      if not self._temp_directory:
   592        temp_location = (
   593            p.options.view_as(GoogleCloudOptions).temp_location or
   594            self.path.get())
   595        dir_uid = str(uuid.uuid4())
   596        self._temp_directory = StaticValueProvider(
   597            str, filesystems.FileSystems.join(temp_location, '.temp%s' % dir_uid))
   598        _LOGGER.info('Added temporary directory %s', self._temp_directory.get())
   599  
   600      output = (
   601          pcoll
   602          | beam.ParDo(
   603              _WriteUnshardedRecordsFn(
   604                  base_path=self._temp_directory,
   605                  destination_fn=self.destination_fn,
   606                  sink_fn=self.sink_fn,
   607                  max_writers_per_bundle=self._max_num_writers_per_bundle)).
   608          with_outputs(
   609              _WriteUnshardedRecordsFn.SPILLED_RECORDS,
   610              _WriteUnshardedRecordsFn.WRITTEN_FILES))
   611  
   612      written_files_pc = output[_WriteUnshardedRecordsFn.WRITTEN_FILES]
   613      spilled_records_pc = output[_WriteUnshardedRecordsFn.SPILLED_RECORDS]
   614  
   615      more_written_files_pc = (
   616          spilled_records_pc
   617          | beam.ParDo(
   618              _AppendShardedDestination(self.destination_fn, self.shards))
   619          | "GroupRecordsByDestinationAndShard" >> beam.GroupByKey()
   620          | beam.ParDo(
   621              _WriteShardedRecordsFn(
   622                  self._temp_directory, self.sink_fn, self.shards)))
   623  
   624      files_by_destination_pc = (
   625          (written_files_pc, more_written_files_pc)
   626          | beam.Flatten()
   627          | beam.Map(lambda file_result: (file_result.destination, file_result))
   628          | "GroupTempFilesByDestination" >> beam.GroupByKey())
   629  
   630      # Now we should take the temporary files, and write them to the final
   631      # destination, with their proper names.
   632  
   633      file_results = (
   634          files_by_destination_pc
   635          | beam.ParDo(
   636              _MoveTempFilesIntoFinalDestinationFn(
   637                  self.path, self.file_naming_fn, self._temp_directory)))
   638  
   639      return file_results
   640  
   641  
   642  def _create_writer(
   643      base_path,
   644      writer_key: Tuple[str, IntervalWindow],
   645      create_metadata_fn: CreateFileMetadataFn,
   646  ):
   647    try:
   648      filesystems.FileSystems.mkdirs(base_path)
   649    except IOError:
   650      # Directory already exists.
   651      pass
   652  
   653    destination = writer_key[0]
   654  
   655    # The file name has a prefix determined by destination+window, along with
   656    # a random string. This allows us to retrieve orphaned files later on.
   657    file_name = '%s_%s' % (abs(hash(writer_key)), uuid.uuid4())
   658    full_file_name = filesystems.FileSystems.join(base_path, file_name)
   659    metadata = create_metadata_fn(destination, full_file_name)
   660    return full_file_name, filesystems.FileSystems.create(
   661        full_file_name,
   662        **metadata._asdict())
   663  
   664  
   665  class _MoveTempFilesIntoFinalDestinationFn(beam.DoFn):
   666    def __init__(self, path, file_naming_fn, temp_dir):
   667      self.path = path
   668      self.file_naming_fn = file_naming_fn
   669      self.temporary_directory = temp_dir
   670  
   671    def process(self, element, w=beam.DoFn.WindowParam):
   672      destination = element[0]
   673      # list of FileResult objects for temp files
   674      temp_file_results = list(element[1])
   675      # list of FileResult objects for final files
   676      final_file_results = []
   677  
   678      for i, r in enumerate(temp_file_results):
   679        # TODO(pabloem): Handle compression for files.
   680        final_file_name = self.file_naming_fn(
   681            r.window, r.pane, i, len(temp_file_results), '', destination)
   682  
   683        final_file_results.append(
   684            FileResult(
   685                final_file_name,
   686                i,
   687                len(temp_file_results),
   688                r.window,
   689                r.pane,
   690                destination))
   691  
   692      move_from = [f.file_name for f in temp_file_results]
   693      move_to = [f.file_name for f in final_file_results]
   694      _LOGGER.info(
   695          'Moving temporary files %s to dir: %s as %s',
   696          map(os.path.basename, move_from),
   697          self.path.get(),
   698          move_to)
   699  
   700      try:
   701        filesystems.FileSystems.rename(
   702            move_from,
   703            [filesystems.FileSystems.join(self.path.get(), f) for f in move_to])
   704      except BeamIOError:
   705        # This error is not serious, because it may happen on a retry of the
   706        # bundle. We simply log it.
   707        _LOGGER.debug(
   708            'Exception occurred during moving files: %s. This may be due to a'
   709            ' bundle being retried.',
   710            move_from)
   711  
   712      yield from final_file_results
   713  
   714      _LOGGER.debug(
   715          'Checking orphaned temporary files for destination %s and window %s',
   716          destination,
   717          w)
   718      writer_key = (destination, w)
   719      self._check_orphaned_files(writer_key)
   720  
   721    def _check_orphaned_files(self, writer_key):
   722      try:
   723        prefix = filesystems.FileSystems.join(
   724            self.temporary_directory.get(), str(abs(hash(writer_key))))
   725        match_result = filesystems.FileSystems.match(['%s*' % prefix])
   726        orphaned_files = [m.path for m in match_result[0].metadata_list]
   727  
   728        if len(orphaned_files) > 0:
   729          _LOGGER.info(
   730              'Some files may be left orphaned in the temporary folder: %s',
   731              orphaned_files)
   732      except BeamIOError as e:
   733        _LOGGER.info('Exceptions when checking orphaned files: %s', e)
   734  
   735  
   736  class _WriteShardedRecordsFn(beam.DoFn):
   737  
   738    def __init__(self,
   739                 base_path,
   740                 sink_fn,  # type: Callable[[Any], FileSink]
   741                 shards  # type: int
   742                ):
   743      self.base_path = base_path
   744      self.sink_fn = sink_fn
   745      self.shards = shards
   746  
   747    def process(
   748        self, element, w=beam.DoFn.WindowParam, pane=beam.DoFn.PaneInfoParam):
   749      destination_and_shard = element[0]
   750      destination = destination_and_shard[0]
   751      shard = destination_and_shard[1]
   752      records = element[1]
   753  
   754      sink = self.sink_fn(destination)
   755  
   756      full_file_name, writer = _create_writer(
   757          base_path=self.base_path.get(),
   758          writer_key=(destination, w),
   759          create_metadata_fn=sink.create_metadata)
   760  
   761      sink.open(writer)
   762  
   763      for r in records:
   764        sink.write(r)
   765  
   766      sink.flush()
   767      writer.close()
   768  
   769      _LOGGER.info(
   770          'Writing file %s for destination %s and shard %s',
   771          full_file_name,
   772          destination,
   773          repr(shard))
   774  
   775      yield FileResult(
   776          full_file_name,
   777          shard_index=shard,
   778          total_shards=self.shards,
   779          window=w,
   780          pane=pane,
   781          destination=destination)
   782  
   783  
   784  class _AppendShardedDestination(beam.DoFn):
   785    def __init__(
   786        self,
   787        destination,  # type: Callable[[Any], str]
   788        shards  # type: int
   789    ):
   790      self.destination_fn = destination
   791      self.shards = shards
   792  
   793      # We start the shards for a single destination at an arbitrary point.
   794      self._shard_counter = collections.defaultdict(
   795          lambda: random.randrange(self.shards))  # type: DefaultDict[str, int]
   796  
   797    def _next_shard_for_destination(self, destination):
   798      self._shard_counter[destination] = ((self._shard_counter[destination] + 1) %
   799                                          self.shards)
   800  
   801      return self._shard_counter[destination]
   802  
   803    def process(self, record):
   804      destination = self.destination_fn(record)
   805      shard = self._next_shard_for_destination(destination)
   806  
   807      yield ((destination, shard), record)
   808  
   809  
   810  class _WriteUnshardedRecordsFn(beam.DoFn):
   811  
   812    SPILLED_RECORDS = 'spilled_records'
   813    WRITTEN_FILES = 'written_files'
   814  
   815    _writers_and_sinks = None  # type: Dict[Tuple[str, BoundedWindow], Tuple[BinaryIO, FileSink]]
   816    _file_names = None  # type: Dict[Tuple[str, BoundedWindow], str]
   817  
   818    def __init__(
   819        self,
   820        base_path,
   821        destination_fn,
   822        sink_fn,
   823        max_writers_per_bundle=WriteToFiles.MAX_NUM_WRITERS_PER_BUNDLE):
   824      self.base_path = base_path
   825      self.destination_fn = destination_fn
   826      self.sink_fn = sink_fn
   827      self.max_num_writers_per_bundle = max_writers_per_bundle
   828  
   829    def start_bundle(self):
   830      self._writers_and_sinks = {}
   831      self._file_names = {}
   832  
   833    def process(
   834        self, record, w=beam.DoFn.WindowParam, pane=beam.DoFn.PaneInfoParam):
   835      destination = self.destination_fn(record)
   836  
   837      writer, sink = self._get_or_create_writer_and_sink(destination, w)
   838  
   839      if not writer:
   840        return [beam.pvalue.TaggedOutput(self.SPILLED_RECORDS, record)]
   841      else:
   842        sink.write(record)
   843  
   844    def _get_or_create_writer_and_sink(self, destination, window):
   845      """Returns a tuple of writer, sink."""
   846      writer_key = (destination, window)
   847      if writer_key in self._writers_and_sinks:
   848        return self._writers_and_sinks.get(writer_key)
   849      elif len(self._writers_and_sinks) >= self.max_num_writers_per_bundle:
   850        # The writer does not exist, and we have too many writers already.
   851        return None, None
   852      else:
   853        # The writer does not exist, but we can still create a new one.
   854        sink = self.sink_fn(destination)
   855  
   856        full_file_name, writer = _create_writer(
   857            base_path=self.base_path.get(),
   858            writer_key=writer_key,
   859            create_metadata_fn=sink.create_metadata)
   860  
   861        sink.open(writer)
   862        self._writers_and_sinks[writer_key] = (writer, sink)
   863        self._file_names[writer_key] = full_file_name
   864        return self._writers_and_sinks[writer_key]
   865  
   866    def finish_bundle(self):
   867      for key, (writer, sink) in self._writers_and_sinks.items():
   868  
   869        sink.flush()
   870        writer.close()
   871  
   872        file_result = FileResult(self._file_names[key],
   873                                 shard_index=-1,
   874                                 total_shards=0,
   875                                 window=key[1],
   876                                 pane=None,  # TODO(pabloem): get the pane info
   877                                 destination=key[0])
   878  
   879        yield beam.pvalue.TaggedOutput(
   880            self.WRITTEN_FILES,
   881            beam.transforms.window.WindowedValue(
   882                file_result,
   883                timestamp=key[1].start,
   884                windows=[key[1]]  # TODO(pabloem) HOW DO WE GET THE PANE
   885            ))
   886  
   887  
   888  class _RemoveDuplicates(beam.DoFn):
   889    """Internal DoFn that filters out filenames already seen (even though the file
   890    has updated)."""
   891    COUNT_STATE = CombiningValueStateSpec('count', combine_fn=sum)
   892  
   893    def process(
   894        self,
   895        element: Tuple[str, filesystem.FileMetadata],
   896        count_state=beam.DoFn.StateParam(COUNT_STATE)
   897    ) -> Iterable[filesystem.FileMetadata]:
   898  
   899      path = element[0]
   900      file_metadata = element[1]
   901      counter = count_state.read()
   902  
   903      if counter == 0:
   904        count_state.add(1)
   905        _LOGGER.debug('Generated entry for file %s', path)
   906        yield file_metadata
   907      else:
   908        _LOGGER.debug('File %s was already read, seen %d times', path, counter)
   909  
   910  
   911  class _RemoveOldDuplicates(beam.DoFn):
   912    """Internal DoFn that filters out filenames already seen and timestamp
   913    unchanged."""
   914    TIME_STATE = CombiningValueStateSpec(
   915        'count', combine_fn=partial(max, default=0.0))
   916  
   917    def process(
   918        self,
   919        element: Tuple[str, filesystem.FileMetadata],
   920        time_state=beam.DoFn.StateParam(TIME_STATE)
   921    ) -> Iterable[filesystem.FileMetadata]:
   922      path = element[0]
   923      file_metadata = element[1]
   924      new_ts = file_metadata.last_updated_in_seconds
   925      old_ts = time_state.read()
   926  
   927      if old_ts < new_ts:
   928        time_state.add(new_ts)
   929        _LOGGER.debug('Generated entry for file %s', path)
   930        yield file_metadata
   931      else:
   932        _LOGGER.debug('File %s was already read', path)