github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/direct/bundle_factory.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """A factory that creates UncommittedBundles."""
    19  
    20  # pytype: skip-file
    21  
    22  from typing import Iterable
    23  from typing import Iterator
    24  from typing import List
    25  from typing import Union
    26  from typing import cast
    27  
    28  from apache_beam import pvalue
    29  from apache_beam.runners import common
    30  from apache_beam.utils.windowed_value import WindowedValue
    31  
    32  
    33  class BundleFactory(object):
    34    """For internal use only; no backwards-compatibility guarantees.
    35  
    36    BundleFactory creates output bundles to be used by transform evaluators.
    37  
    38    Args:
    39      stacked: whether or not to stack the WindowedValues within the bundle
    40        in case consecutive ones share the same timestamp and windows.
    41        DirectRunnerOptions.direct_runner_use_stacked_bundle controls this option.
    42    """
    43    def __init__(self, stacked):
    44      # type: (bool) -> None
    45      self._stacked = stacked
    46  
    47    def create_bundle(self, output_pcollection):
    48      # type: (Union[pvalue.PBegin, pvalue.PCollection]) -> _Bundle
    49      return _Bundle(output_pcollection, self._stacked)
    50  
    51    def create_empty_committed_bundle(self, output_pcollection):
    52      # type: (Union[pvalue.PBegin, pvalue.PCollection]) -> _Bundle
    53      bundle = self.create_bundle(output_pcollection)
    54      bundle.commit(None)
    55      return bundle
    56  
    57  
    58  # a bundle represents a unit of work that will be processed by a transform.
    59  class _Bundle(common.Receiver):
    60    """Part of a PCollection with output elements.
    61  
    62    Part of a PCollection. Elements are output to a bundle, which will cause them
    63    to be executed by PTransform that consume the PCollection this bundle is a
    64    part of at a later point. It starts as an uncommitted bundle and can have
    65    elements added to it. It needs to be committed to make it immutable before
    66    passing it to a downstream ptransform.
    67  
    68    The stored elements are WindowedValues, which contains timestamp and windows
    69    information.
    70  
    71    Bundle internally optimizes storage by stacking elements with the same
    72    timestamp and windows into StackedWindowedValues, and then returns an iterable
    73    to restore WindowedValues upon get_elements() call.
    74  
    75    When this optimization is not desired, it can be avoided by an option when
    76    creating bundles, like:::
    77  
    78      b = Bundle(stacked=False)
    79    """
    80    class _StackedWindowedValues(object):
    81      """A stack of WindowedValues with the same timestamp and windows.
    82  
    83      It must be initialized from a single WindowedValue.
    84  
    85      Example:::
    86  
    87        s = StackedWindowedValues(windowed_value)
    88        if (another_windowed_value.timestamp == s.timestamp and
    89            another_windowed_value.windows == s.windows):
    90          s.add_value(another_windowed_value.value)
    91        windowed_values = [wv for wv in s.windowed_values()]
    92        # now windowed_values equals to [windowed_value, another_windowed_value]
    93      """
    94      def __init__(self, initial_windowed_value):
    95        self._initial_windowed_value = initial_windowed_value
    96        self._appended_values = []
    97  
    98      @property
    99      def timestamp(self):
   100        return self._initial_windowed_value.timestamp
   101  
   102      @property
   103      def windows(self):
   104        return self._initial_windowed_value.windows
   105  
   106      @property
   107      def pane_info(self):
   108        return self._initial_windowed_value.pane_info
   109  
   110      def add_value(self, value):
   111        self._appended_values.append(value)
   112  
   113      def windowed_values(self):
   114        # type: () -> Iterator[WindowedValue]
   115        # yield first windowed_value as is, then iterate through
   116        # _appended_values to yield WindowedValue on the fly.
   117        yield self._initial_windowed_value
   118        for v in self._appended_values:
   119          yield self._initial_windowed_value.with_value(v)
   120  
   121    def __init__(self, pcollection, stacked=True):
   122      # type: (Union[pvalue.PBegin, pvalue.PCollection], bool) -> None
   123      assert isinstance(pcollection, (pvalue.PBegin, pvalue.PCollection))
   124      self._pcollection = pcollection
   125      self._elements = [
   126      ]  # type: List[Union[WindowedValue, _Bundle._StackedWindowedValues]]
   127      self._stacked = stacked
   128      self._committed = False
   129      self._tag = None  # optional tag information for this bundle
   130  
   131    def get_elements_iterable(self, make_copy=False):
   132      # type: (bool) -> Iterable[WindowedValue]
   133  
   134      """Returns iterable elements.
   135  
   136      Args:
   137        make_copy: whether to force returning copy or yielded iterable.
   138  
   139      Returns:
   140        unstacked elements,
   141        in the form of iterable if committed and make_copy is not True,
   142        or as a list of copied WindowedValues.
   143      """
   144      if not self._stacked:
   145        # we can safely assume self._elements contains only WindowedValues
   146        elements = cast('List[WindowedValue]', self._elements)
   147        if self._committed and not make_copy:
   148          return elements
   149        return list(elements)
   150  
   151      def iterable_stacked_or_elements(elements):
   152        for e in elements:
   153          if isinstance(e, _Bundle._StackedWindowedValues):
   154            for w in e.windowed_values():
   155              yield w
   156          else:
   157            yield e
   158  
   159      if self._committed and not make_copy:
   160        return iterable_stacked_or_elements(self._elements)
   161      # returns a copy.
   162      return [e for e in iterable_stacked_or_elements(self._elements)]
   163  
   164    def has_elements(self):
   165      return len(self._elements) > 0
   166  
   167    @property
   168    def tag(self):
   169      return self._tag
   170  
   171    @tag.setter
   172    def tag(self, value):
   173      assert not self._tag
   174      self._tag = value
   175  
   176    @property
   177    def pcollection(self):
   178      """PCollection that the elements of this UncommittedBundle belong to."""
   179      return self._pcollection
   180  
   181    def add(self, element):
   182      """Outputs an element to this bundle.
   183  
   184      Args:
   185        element: WindowedValue
   186      """
   187      assert not self._committed
   188      if not self._stacked:
   189        self._elements.append(element)
   190        return
   191      if (self._elements and
   192          (isinstance(self._elements[-1],
   193                      (WindowedValue, _Bundle._StackedWindowedValues))) and
   194          self._elements[-1].timestamp == element.timestamp and
   195          self._elements[-1].windows == element.windows and
   196          self._elements[-1].pane_info == element.pane_info):
   197        if isinstance(self._elements[-1], WindowedValue):
   198          self._elements[-1] = _Bundle._StackedWindowedValues(self._elements[-1])
   199        self._elements[-1].add_value(element.value)
   200      else:
   201        self._elements.append(element)
   202  
   203    def output(self, element):
   204      self.add(element)
   205  
   206    def receive(self, element):
   207      # type: (WindowedValue) -> None
   208      self.add(element)
   209  
   210    def commit(self, synchronized_processing_time):
   211      """Commits this bundle.
   212  
   213      Uncommitted bundle will become committed (immutable) after this call.
   214  
   215      Args:
   216        synchronized_processing_time: the synchronized processing time at which
   217        this bundle was committed
   218      """
   219      assert not self._committed
   220      self._committed = True
   221      self._elements = tuple(self._elements)
   222      self._synchronized_processing_time = synchronized_processing_time