#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A factory that creates UncommittedBundles."""

# pytype: skip-file

from typing import Iterable
from typing import Iterator
from typing import List
from typing import Union
from typing import cast

from apache_beam import pvalue
from apache_beam.runners import common
from apache_beam.utils.windowed_value import WindowedValue


class BundleFactory(object):
  """For internal use only; no backwards-compatibility guarantees.

  BundleFactory creates output bundles to be used by transform evaluators.

  Args:
    stacked: whether or not to stack the WindowedValues within the bundle
      in case consecutive ones share the same timestamp and windows.
      DirectRunnerOptions.direct_runner_use_stacked_bundle controls this option.
  """
  def __init__(self, stacked):
    # type: (bool) -> None
    self._stacked = stacked

  def create_bundle(self, output_pcollection):
    # type: (Union[pvalue.PBegin, pvalue.PCollection]) -> _Bundle

    """Returns a new, uncommitted bundle for output_pcollection.

    The bundle inherits the stacking mode this factory was created with.
    """
    return _Bundle(output_pcollection, self._stacked)

  def create_empty_committed_bundle(self, output_pcollection):
    # type: (Union[pvalue.PBegin, pvalue.PCollection]) -> _Bundle

    """Returns a bundle for output_pcollection that is committed while empty.

    The bundle is committed with a synchronized processing time of None.
    """
    bundle = self.create_bundle(output_pcollection)
    bundle.commit(None)
    return bundle


class _Bundle(common.Receiver):
  """Part of a PCollection with output elements.

  A bundle represents a unit of work that will be processed by a transform.

  Part of a PCollection. Elements are output to a bundle, which will cause them
  to be executed by PTransform that consume the PCollection this bundle is a
  part of at a later point. It starts as an uncommitted bundle and can have
  elements added to it. It needs to be committed to make it immutable before
  passing it to a downstream ptransform.

  The stored elements are WindowedValues, which contains timestamp and windows
  information.

  Bundle internally optimizes storage by stacking consecutive elements with the
  same timestamp, windows and pane info into _StackedWindowedValues, and then
  returns an iterable to restore WindowedValues upon get_elements_iterable()
  call.

  When this optimization is not desired, it can be avoided by an option when
  creating bundles, like::

    b = _Bundle(pcollection, stacked=False)
  """
  class _StackedWindowedValues(object):
    """A stack of WindowedValues with the same timestamp and windows.

    It must be initialized from a single WindowedValue. Only the bare values
    of later additions are kept; their WindowedValues are rebuilt from the
    initial one when iterating.

    Example::

      s = _Bundle._StackedWindowedValues(windowed_value)
      if (another_windowed_value.timestamp == s.timestamp and
          another_windowed_value.windows == s.windows and
          another_windowed_value.pane_info == s.pane_info):
        s.add_value(another_windowed_value.value)
      windowed_values = [wv for wv in s.windowed_values()]
      # now windowed_values equals to [windowed_value, another_windowed_value]
    """
    def __init__(self, initial_windowed_value):
      self._initial_windowed_value = initial_windowed_value
      self._appended_values = []

    @property
    def timestamp(self):
      """Timestamp of the initial WindowedValue."""
      return self._initial_windowed_value.timestamp

    @property
    def windows(self):
      """Windows of the initial WindowedValue."""
      return self._initial_windowed_value.windows

    @property
    def pane_info(self):
      """Pane info of the initial WindowedValue."""
      return self._initial_windowed_value.pane_info

    def add_value(self, value):
      """Appends a bare value (not a WindowedValue) to this stack."""
      self._appended_values.append(value)

    def windowed_values(self):
      # type: () -> Iterator[WindowedValue]

      """Yields the stacked elements as WindowedValues, in insertion order."""
      # yield first windowed_value as is, then iterate through
      # _appended_values to yield WindowedValue on the fly.
      yield self._initial_windowed_value
      for v in self._appended_values:
        yield self._initial_windowed_value.with_value(v)

  def __init__(self, pcollection, stacked=True):
    # type: (Union[pvalue.PBegin, pvalue.PCollection], bool) -> None
    assert isinstance(pcollection, (pvalue.PBegin, pvalue.PCollection))
    self._pcollection = pcollection
    self._elements = [
    ]  # type: List[Union[WindowedValue, _Bundle._StackedWindowedValues]]
    self._stacked = stacked
    self._committed = False
    self._tag = None  # optional tag information for this bundle
    # Overwritten by commit(); defined here so that the attribute also exists
    # on a bundle that has not been committed yet.
    self._synchronized_processing_time = None

  def get_elements_iterable(self, make_copy=False):
    # type: (bool) -> Iterable[WindowedValue]

    """Returns iterable elements.

    Args:
      make_copy: whether to force returning copy or yielded iterable.

    Returns:
      unstacked elements,
      in the form of iterable if committed and make_copy is not True,
      or as a list of copied WindowedValues.
    """
    if not self._stacked:
      # we can safely assume self._elements contains only WindowedValues
      elements = cast('List[WindowedValue]', self._elements)
      if self._committed and not make_copy:
        return elements
      return list(elements)

    def iterable_stacked_or_elements(elements):
      for e in elements:
        if isinstance(e, _Bundle._StackedWindowedValues):
          for w in e.windowed_values():
            yield w
        else:
          yield e

    if self._committed and not make_copy:
      return iterable_stacked_or_elements(self._elements)
    # returns a copy.
    return list(iterable_stacked_or_elements(self._elements))

  def has_elements(self):
    """Returns True if at least one element has been added to this bundle."""
    return bool(self._elements)

  @property
  def tag(self):
    """Optional tag information for this bundle; None until it is set."""
    return self._tag

  @tag.setter
  def tag(self, value):
    # The assert rejects replacing a tag that is already set (truthy).
    assert not self._tag
    self._tag = value

  @property
  def pcollection(self):
    """PCollection that the elements of this UncommittedBundle belong to."""
    return self._pcollection

  def add(self, element):
    """Outputs an element to this bundle.

    When stacking is enabled and the most recently stored entry has the same
    timestamp, windows and pane info as element, only element.value is kept,
    on top of that entry. Otherwise element is stored as is.

    Args:
      element: WindowedValue
    """
    assert not self._committed
    if not self._stacked:
      self._elements.append(element)
      return
    last = self._elements[-1] if self._elements else None
    if (isinstance(last, (WindowedValue, _Bundle._StackedWindowedValues)) and
        last.timestamp == element.timestamp and
        last.windows == element.windows and
        last.pane_info == element.pane_info):
      if isinstance(last, WindowedValue):
        # First match for this entry: promote the plain WindowedValue to a
        # stack so that the new value can be appended to it.
        self._elements[-1] = _Bundle._StackedWindowedValues(last)
      self._elements[-1].add_value(element.value)
    else:
      self._elements.append(element)

  def output(self, element):
    """Adds element to this bundle; delegates to add()."""
    self.add(element)

  def receive(self, element):
    # type: (WindowedValue) -> None

    """Adds element to this bundle; delegates to add()."""
    self.add(element)

  def commit(self, synchronized_processing_time):
    """Commits this bundle.

    Uncommitted bundle will become committed (immutable) after this call.

    Args:
      synchronized_processing_time: the synchronized processing time at which
        this bundle was committed
    """
    assert not self._committed
    self._committed = True
    # Freeze the element sequence so that it can no longer be appended to.
    self._elements = tuple(self._elements)
    self._synchronized_processing_time = synchronized_processing_time