github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/worker/operation_specs.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Worker utilities for representing MapTasks.

Each MapTask represents a sequence of ParallelInstruction(s): read from a
source, write to a sink, parallel do, etc.
"""

# pytype: skip-file

import collections

from apache_beam import coders


def build_worker_instruction(*args):
  """Create an object representing a ParallelInstruction protobuf.

  This will be a collections.namedtuple with a custom __str__ method.

  Alas, this wrapper is not known to pylint, which thinks it creates
  constants. You may have to put a disable=invalid-name pylint
  annotation on any use of this, depending on your names.

  Args:
    *args: first argument is the name of the type to create. Should
      start with "Worker". Second argument is a list of the
      attributes of this object.
  Returns:
    A new class, a subclass of tuple, that represents the protobuf.
  """
  tuple_class = collections.namedtuple(*args)
  tuple_class.__str__ = worker_object_to_string
  tuple_class.__repr__ = worker_object_to_string
  return tuple_class


def worker_printable_fields(workerproto):
  """Returns the interesting fields of a Worker* object."""
  return [
      '%s=%s' % (name, value)
      # _asdict is the only access we have; we cannot subclass this
      # generated class.
      # pylint: disable=protected-access
      for name, value in workerproto._asdict().items()
      # We want to output a value of 0, but not None nor [].
      if (value or value == 0) and name not in (
          'coder',
          'coders',
          'output_coders',
          'elements',
          'combine_fn',
          'serialized_fn',
          'window_fn',
          'append_trailing_newlines',
          'strip_trailing_newlines',
          'compression_type',
          'context',
          'start_shuffle_position',
          'end_shuffle_position',
          'shuffle_reader_config',
          'shuffle_writer_config')
  ]


def worker_object_to_string(worker_object):
  """Returns a string compactly representing a Worker* object."""
  return '%s(%s)' % (
      worker_object.__class__.__name__,
      ', '.join(worker_printable_fields(worker_object)))
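
# Illustrative sketch (not part of the original module): how the helpers
# above fit together. 'WorkerExample' and its field values are hypothetical.
#
#   WorkerExample = build_worker_instruction(
#       'WorkerExample', ['source', 'output_coders'])
#   ex = WorkerExample(source='gs://some/path', output_coders=None)
#   str(ex)  # -> 'WorkerExample(source=gs://some/path)'
#
# 'output_coders' is omitted from the string: its name is on the excluded
# list in worker_printable_fields, and its value (None) is falsy anyway.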

# All the following Worker* definitions will have these lint problems:
# pylint: disable=invalid-name
# pylint: disable=pointless-string-statement

WorkerRead = build_worker_instruction('WorkerRead', ['source', 'output_coders'])
"""Worker details needed to read from a source.

Attributes:
  source: a source object.
  output_coders: 1-tuple of the coder for the output.
"""

WorkerSideInputSource = build_worker_instruction(
    'WorkerSideInputSource', ['source', 'tag'])
"""Worker details needed to read from a side input source.

Attributes:
  source: a source object.
  tag: string tag for this side input.
"""

WorkerGroupingShuffleRead = build_worker_instruction(
    'WorkerGroupingShuffleRead',
    [
        'start_shuffle_position',
        'end_shuffle_position',
        'shuffle_reader_config',
        'coder',
        'output_coders'
    ])
"""Worker details needed to read from a grouping shuffle source.

Attributes:
  start_shuffle_position: An opaque string to be passed to the shuffle
    source to indicate where to start reading.
  end_shuffle_position: An opaque string to be passed to the shuffle
    source to indicate where to stop reading.
  shuffle_reader_config: An opaque string used to initialize the shuffle
    reader. Contains things like connection endpoints for the shuffle
    server appliance and various options.
  coder: The KV coder used to decode shuffle entries.
  output_coders: 1-tuple of the coder for the output.
"""

WorkerUngroupedShuffleRead = build_worker_instruction(
    'WorkerUngroupedShuffleRead',
    [
        'start_shuffle_position',
        'end_shuffle_position',
        'shuffle_reader_config',
        'coder',
        'output_coders'
    ])
"""Worker details needed to read from an ungrouped shuffle source.

Attributes:
  start_shuffle_position: An opaque string to be passed to the shuffle
    source to indicate where to start reading.
  end_shuffle_position: An opaque string to be passed to the shuffle
    source to indicate where to stop reading.
  shuffle_reader_config: An opaque string used to initialize the shuffle
    reader. Contains things like connection endpoints for the shuffle
    server appliance and various options.
  coder: The value coder used to decode shuffle entries.
  output_coders: 1-tuple of the coder for the output.
"""

WorkerWrite = build_worker_instruction(
    'WorkerWrite', ['sink', 'input', 'output_coders'])
"""Worker details needed to write to a sink.

Attributes:
  sink: a sink object.
  input: A (producer index, output index) tuple representing the
    ParallelInstruction operation whose output feeds into this operation.
    The output index is 0 except for multi-output operations (like ParDo).
  output_coders: 1-tuple, coder to use to estimate bytes written.
"""
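
# Illustrative note (not part of the original module): the 'input' attribute
# used above and below is a (producer index, output index) pair into the
# MapTask's instruction list. For a hypothetical MapTask [read, do, write]:
#
#   # The ParDo consumes output 0 of instruction 0 (the read):
#   #   input=(0, 0)
#   # The write consumes output 0 of instruction 1 (the ParDo):
#   #   input=(1, 0)
#
# An output index other than 0 only appears when the producer is a
# multi-output operation such as a ParDo with tagged outputs.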

WorkerInMemoryWrite = build_worker_instruction(
    'WorkerInMemoryWrite',
    ['output_buffer', 'write_windowed_values', 'input', 'output_coders'])
"""Worker details needed to write to an in-memory sink.

Used only for unit testing. It makes worker tests less cluttered with code
like "write to a file and then check file contents".

Attributes:
  output_buffer: list to which output elements will be appended.
  write_windowed_values: whether to record the entire WindowedValue outputs,
    or just the raw (unwindowed) values.
  input: A (producer index, output index) tuple representing the
    ParallelInstruction operation whose output feeds into this operation.
    The output index is 0 except for multi-output operations (like ParDo).
  output_coders: 1-tuple, coder to use to estimate bytes written.
"""

WorkerShuffleWrite = build_worker_instruction(
    'WorkerShuffleWrite',
    ['shuffle_kind', 'shuffle_writer_config', 'input', 'output_coders'])
"""Worker details needed to write to a shuffle sink.

Attributes:
  shuffle_kind: A string describing the shuffle kind. This can control the
    way the worker interacts with the shuffle sink. The possible values are:
    'ungrouped', 'group_keys', and 'group_keys_and_sort_values'.
  shuffle_writer_config: An opaque string used to initialize the shuffle
    writer. Contains things like connection endpoints for the shuffle
    server appliance and various options.
  input: A (producer index, output index) tuple representing the
    ParallelInstruction operation whose output feeds into this operation.
    The output index is 0 except for multi-output operations (like ParDo).
  output_coders: 1-tuple of the coder for input elements. If the
    shuffle_kind is grouping, this is expected to be a KV coder.
"""

WorkerDoFn = build_worker_instruction(
    'WorkerDoFn',
    ['serialized_fn', 'output_tags', 'input', 'side_inputs', 'output_coders'])
"""Worker details needed to run a DoFn.

Attributes:
  serialized_fn: A serialized DoFn object to be run for each input element.
  output_tags: The string tags used to identify the outputs of a ParDo
    operation. The tag is present even if the ParDo has just one output
    (e.g., ['out']).
  output_coders: array of coders, one for each output.
  input: A (producer index, output index) tuple representing the
    ParallelInstruction operation whose output feeds into this operation.
    The output index is 0 except for multi-output operations (like ParDo).
  side_inputs: A list of Worker...Read instances describing sources to be
    used for getting values. The types supported right now are
    WorkerInMemoryRead and WorkerTextRead.
"""
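
# Illustrative sketch (not part of the original module): a hypothetical
# WorkerDoFn with one side input. 'pickled_dofn', 'some_source', and
# 'element_coder' are placeholders, not real objects from this module.
#
#   do = WorkerDoFn(
#       serialized_fn=pickled_dofn,   # serialized user DoFn
#       output_tags=['out'],          # a single output is still tagged
#       input=(0, 0),                 # output 0 of instruction 0
#       side_inputs=[WorkerSideInputSource(source=some_source, tag='side0')],
#       output_coders=(element_coder,))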

WorkerReifyTimestampAndWindows = build_worker_instruction(
    'WorkerReifyTimestampAndWindows', ['output_tags', 'input', 'output_coders'])
"""Worker details needed to run a WindowInto.

Attributes:
  output_tags: The string tags used to identify the outputs of a ParDo
    operation. The tag is present even if the ParDo has just one output
    (e.g., ['out']).
  output_coders: array of coders, one for each output.
  input: A (producer index, output index) tuple representing the
    ParallelInstruction operation whose output feeds into this operation.
    The output index is 0 except for multi-output operations (like ParDo).
"""

WorkerMergeWindows = build_worker_instruction(
    'WorkerMergeWindows',
    [
        'window_fn',
        'combine_fn',
        'phase',
        'output_tags',
        'input',
        'coders',
        'context',
        'output_coders'
    ])
"""Worker details needed to run a MergeWindows (a.k.a. GroupAlsoByWindows).

Attributes:
  window_fn: A serialized Windowing object representing the windowing
    strategy.
  combine_fn: A serialized CombineFn object to be used after executing the
    GroupAlsoByWindows operation. May be None if not a combining operation.
  phase: Possible values are 'all', 'add', 'merge', and 'extract'.
    A runner optimizer may split the user combiner into three separate
    phases (ADD, MERGE, and EXTRACT), on separate VMs, as it sees fit.
    The phase attribute dictates which DoFn is actually running in
    the worker. May be None if not a combining operation.
  output_tags: The string tags used to identify the outputs of a ParDo
    operation. The tag is present even if the ParDo has just one output
    (e.g., ['out']).
  output_coders: array of coders, one for each output.
  input: A (producer index, output index) tuple representing the
    ParallelInstruction operation whose output feeds into this operation.
    The output index is 0 except for multi-output operations (like ParDo).
  coders: A 2-tuple of coders (key, value) to encode shuffle entries.
  context: The ExecutionContext object for the current work item.
"""

WorkerCombineFn = build_worker_instruction(
    'WorkerCombineFn', ['serialized_fn', 'phase', 'input', 'output_coders'])
"""Worker details needed to run a CombineFn.

Attributes:
  serialized_fn: A serialized CombineFn object to be used.
  phase: Possible values are 'all', 'add', 'merge', and 'extract'.
    A runner optimizer may split the user combiner into three separate
    phases (ADD, MERGE, and EXTRACT), on separate VMs, as it sees fit.
    The phase attribute dictates which DoFn is actually running in
    the worker.
  input: A (producer index, output index) tuple representing the
    ParallelInstruction operation whose output feeds into this operation.
    The output index is 0 except for multi-output operations (like ParDo).
  output_coders: 1-tuple of the coder for the output.
"""
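
# Illustrative note (not part of the original module): the combiner phases
# above map roughly onto Beam's CombineFn methods, under the assumption of a
# lifted combine spread across workers.
#
#   'add'     -- create_accumulator() / add_input(), run near the raw data
#   'merge'   -- merge_accumulators(), run after a shuffle
#   'extract' -- extract_output(), producing the final values
#   'all'     -- all of the above executed in a single worker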

WorkerPartialGroupByKey = build_worker_instruction(
    'WorkerPartialGroupByKey', ['combine_fn', 'input', 'output_coders'])
"""Worker details needed to run a partial group-by-key.

Attributes:
  combine_fn: A serialized CombineFn object to be used.
  input: A (producer index, output index) tuple representing the
    ParallelInstruction operation whose output feeds into this operation.
    The output index is 0 except for multi-output operations (like ParDo).
  output_coders: 1-tuple of the coder for the output.
"""

WorkerFlatten = build_worker_instruction(
    'WorkerFlatten', ['inputs', 'output_coders'])
"""Worker details needed to run a Flatten.

Attributes:
  inputs: A list of tuples, each (producer index, output index), representing
    the ParallelInstruction operations whose output feeds into this operation.
    The output index is 0 unless the input is from a multi-output
    operation (such as ParDo).
  output_coders: 1-tuple of the coder for the output.
"""


def get_coder_from_spec(coder_spec):
  """Return a coder instance from a coder spec.

  Args:
    coder_spec: A dict where the value of the '@type' key is a pickled
      Coder instance.

  Returns:
    A coder instance (has encode/decode methods).
  """
  assert coder_spec is not None

  # Ignore the wrappers in these encodings.
  ignored_wrappers = (
      'com.google.cloud.dataflow.sdk.util.TimerOrElement$TimerOrElementCoder')
  if coder_spec['@type'] in ignored_wrappers:
    assert len(coder_spec['component_encodings']) == 1
    coder_spec = coder_spec['component_encodings'][0]
    return get_coder_from_spec(coder_spec)

  # Handle a few well-known types of coders.
  if coder_spec['@type'] == 'kind:pair':
    assert len(coder_spec['component_encodings']) == 2
    component_coders = [
        get_coder_from_spec(c) for c in coder_spec['component_encodings']
    ]
    return coders.TupleCoder(component_coders)
  elif coder_spec['@type'] == 'kind:stream':
    assert len(coder_spec['component_encodings']) == 1
    return coders.IterableCoder(
        get_coder_from_spec(coder_spec['component_encodings'][0]))
  elif coder_spec['@type'] == 'kind:windowed_value':
    assert len(coder_spec['component_encodings']) == 2
    value_coder, window_coder = [
        get_coder_from_spec(c) for c in coder_spec['component_encodings']
    ]
    return coders.coders.WindowedValueCoder(
        value_coder, window_coder=window_coder)
  elif coder_spec['@type'] == 'kind:interval_window':
    assert (
        'component_encodings' not in coder_spec or
        not coder_spec['component_encodings'])
    return coders.coders.IntervalWindowCoder()
  elif coder_spec['@type'] == 'kind:global_window':
    assert (
        'component_encodings' not in coder_spec or
        not coder_spec['component_encodings'])
    return coders.coders.GlobalWindowCoder()
  elif coder_spec['@type'] == 'kind:varint':
    assert (
        'component_encodings' not in coder_spec or
        len(coder_spec['component_encodings']) == 0)
    return coders.coders.VarIntCoder()
  elif coder_spec['@type'] == 'kind:length_prefix':
    assert len(coder_spec['component_encodings']) == 1
    return coders.coders.LengthPrefixCoder(
        get_coder_from_spec(coder_spec['component_encodings'][0]))
  elif coder_spec['@type'] == 'kind:bytes':
    assert (
        'component_encodings' not in coder_spec or
        len(coder_spec['component_encodings']) == 0)
    return coders.BytesCoder()

  # We pass coders in the form "<coder_name>$<pickled_data>" to make the job
  # description JSON more readable.
  return coders.coders.deserialize_coder(coder_spec['@type'].encode('ascii'))
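
# Illustrative sketch (not part of the original module): a nested spec of the
# kind get_coder_from_spec() consumes. The dict below is hypothetical.
#
#   spec = {
#       '@type': 'kind:windowed_value',
#       'component_encodings': [
#           {
#               '@type': 'kind:pair',
#               'component_encodings': [{'@type': 'kind:bytes'},
#                                       {'@type': 'kind:varint'}],
#           },
#           {'@type': 'kind:global_window'},
#       ],
#   }
#   coder = get_coder_from_spec(spec)
#   # -> a WindowedValueCoder wrapping TupleCoder([BytesCoder(),
#   #    VarIntCoder()]) for the value and GlobalWindowCoder for the window.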