github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/dataflow/dataflow_runner.py (about) 1 # 2 # Licensed to the Apache Software Foundation (ASF) under one or more 3 # contributor license agreements. See the NOTICE file distributed with 4 # this work for additional information regarding copyright ownership. 5 # The ASF licenses this file to You under the Apache License, Version 2.0 6 # (the "License"); you may not use this file except in compliance with 7 # the License. You may obtain a copy of the License at 8 # 9 # http://www.apache.org/licenses/LICENSE-2.0 10 # 11 # Unless required by applicable law or agreed to in writing, software 12 # distributed under the License is distributed on an "AS IS" BASIS, 13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 # See the License for the specific language governing permissions and 15 # limitations under the License. 16 # 17 18 """A runner implementation that submits a job for remote execution. 19 20 The runner will create a JSON description of the job graph and then submit it 21 to the Dataflow Service for remote execution by a worker. 22 """ 23 # pytype: skip-file 24 25 import base64 26 import logging 27 import os 28 import threading 29 import time 30 import traceback 31 import warnings 32 from collections import defaultdict 33 from subprocess import DEVNULL 34 from typing import TYPE_CHECKING 35 from typing import List 36 from urllib.parse import quote 37 from urllib.parse import quote_from_bytes 38 from urllib.parse import unquote_to_bytes 39 40 import apache_beam as beam 41 from apache_beam import coders 42 from apache_beam import error 43 from apache_beam.internal import pickler 44 from apache_beam.internal.gcp import json_value 45 from apache_beam.options.pipeline_options import DebugOptions 46 from apache_beam.options.pipeline_options import GoogleCloudOptions 47 from apache_beam.options.pipeline_options import SetupOptions 48 from apache_beam.options.pipeline_options import StandardOptions 49 from apache_beam.options.pipeline_options import TestOptions 50 from apache_beam.options.pipeline_options import TypeOptions 51 from apache_beam.options.pipeline_options import WorkerOptions 52 from apache_beam.portability import common_urns 53 from apache_beam.portability.api import beam_runner_api_pb2 54 from apache_beam.pvalue import AsSideInput 55 from apache_beam.runners.common import DoFnSignature 56 from apache_beam.runners.common import group_by_key_input_visitor 57 from apache_beam.runners.dataflow.internal import names 58 from apache_beam.runners.dataflow.internal.clients import dataflow as dataflow_api 59 from apache_beam.runners.dataflow.internal.names import PropertyNames 60 from apache_beam.runners.dataflow.internal.names import TransformNames 61 from apache_beam.runners.runner import PipelineResult 62 from apache_beam.runners.runner import PipelineRunner 63 from apache_beam.runners.runner import PipelineState 64 from apache_beam.runners.runner import PValueCache 65 from apache_beam.transforms import window 66 from apache_beam.transforms.display import DisplayData 67 from apache_beam.transforms.sideinputs import SIDE_INPUT_PREFIX 68 from apache_beam.typehints import typehints 69 from apache_beam.utils import processes 70 from apache_beam.utils import proto_utils 71 from apache_beam.utils.interactive_utils import is_in_notebook 72 from apache_beam.utils.plugin import BeamPlugin 73 74 if TYPE_CHECKING: 75 from apache_beam.pipeline import PTransformOverride 76 77 __all__ = ['DataflowRunner'] 78 79 _LOGGER = logging.getLogger(__name__) 80 81 BQ_SOURCE_UW_ERROR = ( 82 'The Read(BigQuerySource(...)) transform is not supported with newer stack ' 83 'features (Fn API, Dataflow Runner V2, etc). Please use the transform ' 84 'apache_beam.io.gcp.bigquery.ReadFromBigQuery instead.') 85 86 87 class DataflowRunner(PipelineRunner): 88 """A runner that creates job graphs and submits them for remote execution. 89 90 Every execution of the run() method will submit an independent job for 91 remote execution that consists of the nodes reachable from the passed in 92 node argument or entire graph if node is None. The run() method returns 93 after the service created the job and will not wait for the job to finish 94 if blocking is set to False. 95 """ 96 97 # A list of PTransformOverride objects to be applied before running a pipeline 98 # using DataflowRunner. 99 # Currently this only works for overrides where the input and output types do 100 # not change. 101 # For internal SDK use only. This should not be updated by Beam pipeline 102 # authors. 103 104 # Imported here to avoid circular dependencies. 105 # TODO: Remove the apache_beam.pipeline dependency in CreatePTransformOverride 106 from apache_beam.runners.dataflow.ptransform_overrides import CombineValuesPTransformOverride 107 from apache_beam.runners.dataflow.ptransform_overrides import CreatePTransformOverride 108 from apache_beam.runners.dataflow.ptransform_overrides import ReadPTransformOverride 109 from apache_beam.runners.dataflow.ptransform_overrides import NativeReadPTransformOverride 110 111 # These overrides should be applied before the proto representation of the 112 # graph is created. 113 _PTRANSFORM_OVERRIDES = [ 114 NativeReadPTransformOverride(), 115 ] # type: List[PTransformOverride] 116 117 # These overrides should be applied after the proto representation of the 118 # graph is created. 119 _NON_PORTABLE_PTRANSFORM_OVERRIDES = [ 120 CombineValuesPTransformOverride(), 121 CreatePTransformOverride(), 122 ReadPTransformOverride(), 123 ] # type: List[PTransformOverride] 124 125 def __init__(self, cache=None): 126 # Cache of CloudWorkflowStep protos generated while the runner 127 # "executes" a pipeline. 128 self._cache = cache if cache is not None else PValueCache() 129 self._unique_step_id = 0 130 self._default_environment = None 131 132 def is_fnapi_compatible(self): 133 return False 134 135 def apply(self, transform, input, options): 136 _check_and_add_missing_options(options) 137 return super().apply(transform, input, options) 138 139 def _get_unique_step_name(self): 140 self._unique_step_id += 1 141 return 's%s' % self._unique_step_id 142 143 @staticmethod 144 def poll_for_job_completion( 145 runner, result, duration, state_update_callback=None): 146 """Polls for the specified job to finish running (successfully or not). 147 148 Updates the result with the new job information before returning. 149 150 Args: 151 runner: DataflowRunner instance to use for polling job state. 152 result: DataflowPipelineResult instance used for job information. 153 duration (int): The time to wait (in milliseconds) for job to finish. 154 If it is set to :data:`None`, it will wait indefinitely until the job 155 is finished. 156 """ 157 if result.state == PipelineState.DONE: 158 return 159 160 last_message_time = None 161 current_seen_messages = set() 162 163 last_error_rank = float('-inf') 164 last_error_msg = None 165 last_job_state = None 166 # How long to wait after pipeline failure for the error 167 # message to show up giving the reason for the failure. 168 # It typically takes about 30 seconds. 169 final_countdown_timer_secs = 50.0 170 sleep_secs = 5.0 171 172 # Try to prioritize the user-level traceback, if any. 173 def rank_error(msg): 174 if 'work item was attempted' in msg: 175 return -1 176 elif 'Traceback' in msg: 177 return 1 178 return 0 179 180 if duration: 181 start_secs = time.time() 182 duration_secs = duration // 1000 183 184 job_id = result.job_id() 185 while True: 186 response = runner.dataflow_client.get_job(job_id) 187 # If get() is called very soon after Create() the response may not contain 188 # an initialized 'currentState' field. 189 if response.currentState is not None: 190 if response.currentState != last_job_state: 191 if state_update_callback: 192 state_update_callback(response.currentState) 193 _LOGGER.info('Job %s is in state %s', job_id, response.currentState) 194 last_job_state = response.currentState 195 if str(response.currentState) != 'JOB_STATE_RUNNING': 196 # Stop checking for new messages on timeout, explanatory 197 # message received, success, or a terminal job state caused 198 # by the user that therefore doesn't require explanation. 199 if (final_countdown_timer_secs <= 0.0 or last_error_msg is not None or 200 str(response.currentState) == 'JOB_STATE_DONE' or 201 str(response.currentState) == 'JOB_STATE_CANCELLED' or 202 str(response.currentState) == 'JOB_STATE_UPDATED' or 203 str(response.currentState) == 'JOB_STATE_DRAINED'): 204 break 205 206 # Check that job is in a post-preparation state before starting the 207 # final countdown. 208 if (str(response.currentState) not in ('JOB_STATE_PENDING', 209 'JOB_STATE_QUEUED')): 210 # The job has failed; ensure we see any final error messages. 211 sleep_secs = 1.0 # poll faster during the final countdown 212 final_countdown_timer_secs -= sleep_secs 213 214 time.sleep(sleep_secs) 215 216 # Get all messages since beginning of the job run or since last message. 217 page_token = None 218 while True: 219 messages, page_token = runner.dataflow_client.list_messages( 220 job_id, page_token=page_token, start_time=last_message_time) 221 for m in messages: 222 message = '%s: %s: %s' % (m.time, m.messageImportance, m.messageText) 223 224 if not last_message_time or m.time > last_message_time: 225 last_message_time = m.time 226 current_seen_messages = set() 227 228 if message in current_seen_messages: 229 # Skip the message if it has already been seen at the current 230 # time. This could be the case since the list_messages API is 231 # queried starting at last_message_time. 232 continue 233 else: 234 current_seen_messages.add(message) 235 # Skip empty messages. 236 if m.messageImportance is None: 237 continue 238 _LOGGER.info(message) 239 if str(m.messageImportance) == 'JOB_MESSAGE_ERROR': 240 if rank_error(m.messageText) >= last_error_rank: 241 last_error_rank = rank_error(m.messageText) 242 last_error_msg = m.messageText 243 if not page_token: 244 break 245 246 if duration: 247 passed_secs = time.time() - start_secs 248 if passed_secs > duration_secs: 249 _LOGGER.warning( 250 'Timing out on waiting for job %s after %d seconds', 251 job_id, 252 passed_secs) 253 break 254 255 result._job = response 256 runner.last_error_msg = last_error_msg 257 258 @staticmethod 259 def _only_element(iterable): 260 # type: (Iterable[T]) -> T # noqa: F821 261 element, = iterable 262 return element 263 264 @staticmethod 265 def side_input_visitor(is_runner_v2=False, deterministic_key_coders=True): 266 # Imported here to avoid circular dependencies. 267 # pylint: disable=wrong-import-order, wrong-import-position 268 from apache_beam.pipeline import PipelineVisitor 269 from apache_beam.transforms.core import ParDo 270 271 class SideInputVisitor(PipelineVisitor): 272 """Ensures input `PCollection` used as a side inputs has a `KV` type. 273 274 TODO(BEAM-115): Once Python SDK is compatible with the new Runner API, 275 we could directly replace the coder instead of mutating the element type. 276 """ 277 def visit_transform(self, transform_node): 278 if isinstance(transform_node.transform, ParDo): 279 new_side_inputs = [] 280 for side_input in transform_node.side_inputs: 281 access_pattern = side_input._side_input_data().access_pattern 282 if access_pattern == common_urns.side_inputs.ITERABLE.urn: 283 # TODO(https://github.com/apache/beam/issues/20043): Stop 284 # patching up the access pattern to appease Dataflow when 285 # using the UW and hardcode the output type to be Any since 286 # the Dataflow JSON and pipeline proto can differ in coders 287 # which leads to encoding/decoding issues within the runner. 288 side_input.pvalue.element_type = typehints.Any 289 new_side_input = _DataflowIterableSideInput(side_input) 290 elif access_pattern == common_urns.side_inputs.MULTIMAP.urn: 291 # Ensure the input coder is a KV coder and patch up the 292 # access pattern to appease Dataflow. 293 side_input.pvalue.element_type = typehints.coerce_to_kv_type( 294 side_input.pvalue.element_type, transform_node.full_label) 295 side_input.pvalue.requires_deterministic_key_coder = ( 296 deterministic_key_coders and transform_node.full_label) 297 new_side_input = _DataflowMultimapSideInput(side_input) 298 else: 299 raise ValueError( 300 'Unsupported access pattern for %r: %r' % 301 (transform_node.full_label, access_pattern)) 302 new_side_inputs.append(new_side_input) 303 if is_runner_v2: 304 transform_node.side_inputs = new_side_inputs 305 transform_node.transform.side_inputs = new_side_inputs 306 307 return SideInputVisitor() 308 309 @staticmethod 310 def flatten_input_visitor(): 311 # Imported here to avoid circular dependencies. 312 from apache_beam.pipeline import PipelineVisitor 313 314 class FlattenInputVisitor(PipelineVisitor): 315 """A visitor that replaces the element type for input ``PCollections``s of 316 a ``Flatten`` transform with that of the output ``PCollection``. 317 """ 318 def visit_transform(self, transform_node): 319 # Imported here to avoid circular dependencies. 320 # pylint: disable=wrong-import-order, wrong-import-position 321 from apache_beam import Flatten 322 if isinstance(transform_node.transform, Flatten): 323 output_pcoll = DataflowRunner._only_element( 324 transform_node.outputs.values()) 325 for input_pcoll in transform_node.inputs: 326 input_pcoll.element_type = output_pcoll.element_type 327 328 return FlattenInputVisitor() 329 330 @staticmethod 331 def combinefn_visitor(): 332 # Imported here to avoid circular dependencies. 333 from apache_beam.pipeline import PipelineVisitor 334 from apache_beam import core 335 336 class CombineFnVisitor(PipelineVisitor): 337 """Checks if `CombineFn` has non-default setup or teardown methods. 338 If yes, raises `ValueError`. 339 """ 340 def visit_transform(self, applied_transform): 341 transform = applied_transform.transform 342 if isinstance(transform, core.ParDo) and isinstance( 343 transform.fn, core.CombineValuesDoFn): 344 if self._overrides_setup_or_teardown(transform.fn.combinefn): 345 raise ValueError( 346 'CombineFn.setup and CombineFn.teardown are ' 347 'not supported with non-portable Dataflow ' 348 'runner. Please use Dataflow Runner V2 instead.') 349 350 @staticmethod 351 def _overrides_setup_or_teardown(combinefn): 352 # TODO(https://github.com/apache/beam/issues/18716): provide an 353 # implementation for this method 354 return False 355 356 return CombineFnVisitor() 357 358 def _adjust_pipeline_for_dataflow_v2(self, pipeline): 359 # Dataflow runner requires a KV type for GBK inputs, hence we enforce that 360 # here. 361 pipeline.visit( 362 group_by_key_input_visitor( 363 not pipeline._options.view_as( 364 TypeOptions).allow_non_deterministic_key_coders)) 365 366 def _check_for_unsupported_features_on_non_portable_worker(self, pipeline): 367 pipeline.visit(self.combinefn_visitor()) 368 369 def run_pipeline(self, pipeline, options, pipeline_proto=None): 370 """Remotely executes entire pipeline or parts reachable from node.""" 371 if _is_runner_v2_disabled(options): 372 debug_options = options.view_as(DebugOptions) 373 if not debug_options.lookup_experiment('disable_runner_v2_until_v2.50'): 374 raise ValueError( 375 'disable_runner_v2 is deprecated in Beam Python ' + 376 beam.version.__version__ + 377 ' and this execution mode will be removed in a future Beam SDK. ' 378 'If needed, please use: ' 379 '"--experiments=disable_runner_v2_until_v2.50".') 380 381 # Label goog-dataflow-notebook if job is started from notebook. 382 if is_in_notebook(): 383 notebook_version = ( 384 'goog-dataflow-notebook=' + 385 beam.version.__version__.replace('.', '_')) 386 if options.view_as(GoogleCloudOptions).labels: 387 options.view_as(GoogleCloudOptions).labels.append(notebook_version) 388 else: 389 options.view_as(GoogleCloudOptions).labels = [notebook_version] 390 391 # Import here to avoid adding the dependency for local running scenarios. 392 try: 393 # pylint: disable=wrong-import-order, wrong-import-position 394 from apache_beam.runners.dataflow.internal import apiclient 395 except ImportError: 396 raise ImportError( 397 'Google Cloud Dataflow runner not available, ' 398 'please install apache_beam[gcp]') 399 400 if pipeline_proto or pipeline.contains_external_transforms: 401 if _is_runner_v2_disabled(options): 402 raise ValueError( 403 'This pipeline contains cross language transforms, ' 404 'which requires Runner V2.') 405 if not _is_runner_v2(options): 406 _LOGGER.info( 407 'Automatically enabling Dataflow Runner V2 since the ' 408 'pipeline used cross-language transforms.') 409 _add_runner_v2_missing_options(options) 410 411 is_runner_v2 = _is_runner_v2(options) 412 if not is_runner_v2: 413 self._check_for_unsupported_features_on_non_portable_worker(pipeline) 414 415 # Convert all side inputs into a form acceptable to Dataflow. 416 if pipeline: 417 pipeline.visit( 418 self.side_input_visitor( 419 _is_runner_v2(options), 420 deterministic_key_coders=not options.view_as( 421 TypeOptions).allow_non_deterministic_key_coders)) 422 423 # Performing configured PTransform overrides. Note that this is currently 424 # done before Runner API serialization, since the new proto needs to 425 # contain any added PTransforms. 426 pipeline.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES) 427 428 if options.view_as(DebugOptions).lookup_experiment('use_legacy_bq_sink'): 429 warnings.warn( 430 "Native sinks no longer implemented; " 431 "ignoring use_legacy_bq_sink.") 432 433 from apache_beam.runners.dataflow.ptransform_overrides import GroupIntoBatchesWithShardedKeyPTransformOverride 434 pipeline.replace_all( 435 [GroupIntoBatchesWithShardedKeyPTransformOverride(self, options)]) 436 437 if pipeline_proto: 438 self.proto_pipeline = pipeline_proto 439 440 else: 441 from apache_beam.transforms import environments 442 if options.view_as(SetupOptions).prebuild_sdk_container_engine: 443 # if prebuild_sdk_container_engine is specified we will build a new sdk 444 # container image with dependencies pre-installed and use that image, 445 # instead of using the inferred default container image. 446 self._default_environment = ( 447 environments.DockerEnvironment.from_options(options)) 448 options.view_as(WorkerOptions).sdk_container_image = ( 449 self._default_environment.container_image) 450 else: 451 artifacts = environments.python_sdk_dependencies(options) 452 if artifacts and _is_runner_v2(options): 453 _LOGGER.info( 454 "Pipeline has additional dependencies to be installed " 455 "in SDK worker container, consider using the SDK " 456 "container image pre-building workflow to avoid " 457 "repetitive installations. Learn more on " 458 "https://cloud.google.com/dataflow/docs/guides/" 459 "using-custom-containers#prebuild") 460 self._default_environment = ( 461 environments.DockerEnvironment.from_container_image( 462 apiclient.get_container_image_from_options(options), 463 artifacts=artifacts, 464 resource_hints=environments.resource_hints_from_options( 465 options))) 466 467 # This has to be performed before pipeline proto is constructed to make 468 # sure that the changes are reflected in the portable job submission path. 469 self._adjust_pipeline_for_dataflow_v2(pipeline) 470 471 # Snapshot the pipeline in a portable proto. 472 self.proto_pipeline, self.proto_context = pipeline.to_runner_api( 473 return_context=True, default_environment=self._default_environment) 474 475 # Optimize the pipeline if it not streaming and the pre_optimize 476 # experiment is set. 477 if not options.view_as(StandardOptions).streaming: 478 pre_optimize = options.view_as(DebugOptions).lookup_experiment( 479 'pre_optimize', 'default').lower() 480 from apache_beam.runners.portability.fn_api_runner import translations 481 if pre_optimize == 'none': 482 phases = [] 483 elif pre_optimize == 'default' or pre_optimize == 'all': 484 phases = [translations.pack_combiners, translations.sort_stages] 485 else: 486 phases = [] 487 for phase_name in pre_optimize.split(','): 488 # For now, these are all we allow. 489 if phase_name in ('pack_combiners', ): 490 phases.append(getattr(translations, phase_name)) 491 else: 492 raise ValueError( 493 'Unknown or inapplicable phase for pre_optimize: %s' % 494 phase_name) 495 phases.append(translations.sort_stages) 496 497 if phases: 498 self.proto_pipeline = translations.optimize_pipeline( 499 self.proto_pipeline, 500 phases=phases, 501 known_runner_urns=frozenset(), 502 partial=True) 503 504 if not is_runner_v2: 505 # Performing configured PTransform overrides which should not be reflected 506 # in the proto representation of the graph. 507 pipeline.replace_all(DataflowRunner._NON_PORTABLE_PTRANSFORM_OVERRIDES) 508 509 # Add setup_options for all the BeamPlugin imports 510 setup_options = options.view_as(SetupOptions) 511 plugins = BeamPlugin.get_all_plugin_paths() 512 if setup_options.beam_plugins is not None: 513 plugins = list(set(plugins + setup_options.beam_plugins)) 514 setup_options.beam_plugins = plugins 515 516 # Elevate "min_cpu_platform" to pipeline option, but using the existing 517 # experiment. 518 debug_options = options.view_as(DebugOptions) 519 worker_options = options.view_as(WorkerOptions) 520 if worker_options.min_cpu_platform: 521 debug_options.add_experiment( 522 'min_cpu_platform=' + worker_options.min_cpu_platform) 523 524 self.job = apiclient.Job(options, self.proto_pipeline) 525 526 # TODO: Consider skipping these for all use_portable_job_submission jobs. 527 if pipeline: 528 # Dataflow Runner v1 requires output type of the Flatten to be the same as 529 # the inputs, hence we enforce that here. Dataflow Runner v2 does not 530 # require this. 531 pipeline.visit(self.flatten_input_visitor()) 532 533 # Trigger a traversal of all reachable nodes. 534 self.visit_transforms(pipeline, options) 535 536 test_options = options.view_as(TestOptions) 537 # If it is a dry run, return without submitting the job. 538 if test_options.dry_run: 539 result = PipelineResult(PipelineState.DONE) 540 result.wait_until_finish = lambda duration=None: None 541 return result 542 543 # Get a Dataflow API client and set its options 544 self.dataflow_client = apiclient.DataflowApplicationClient( 545 options, self.job.root_staging_location) 546 547 # Create the job description and send a request to the service. The result 548 # can be None if there is no need to send a request to the service (e.g. 549 # template creation). If a request was sent and failed then the call will 550 # raise an exception. 551 result = DataflowPipelineResult( 552 self.dataflow_client.create_job(self.job), self) 553 554 # TODO(BEAM-4274): Circular import runners-metrics. Requires refactoring. 555 from apache_beam.runners.dataflow.dataflow_metrics import DataflowMetrics 556 self._metrics = DataflowMetrics(self.dataflow_client, result, self.job) 557 result.metric_results = self._metrics 558 return result 559 560 def _get_typehint_based_encoding(self, typehint, window_coder): 561 """Returns an encoding based on a typehint object.""" 562 return self._get_cloud_encoding( 563 self._get_coder(typehint, window_coder=window_coder)) 564 565 @staticmethod 566 def _get_coder(typehint, window_coder): 567 """Returns a coder based on a typehint object.""" 568 if window_coder: 569 return coders.WindowedValueCoder( 570 coders.registry.get_coder(typehint), window_coder=window_coder) 571 return coders.registry.get_coder(typehint) 572 573 def _get_cloud_encoding(self, coder, unused=None): 574 """Returns an encoding based on a coder object.""" 575 if not isinstance(coder, coders.Coder): 576 raise TypeError( 577 'Coder object must inherit from coders.Coder: %s.' % str(coder)) 578 return coder.as_cloud_object(self.proto_context.coders) 579 580 def _get_side_input_encoding(self, input_encoding): 581 """Returns an encoding for the output of a view transform. 582 583 Args: 584 input_encoding: encoding of current transform's input. Side inputs need 585 this because the service will check that input and output types match. 586 587 Returns: 588 An encoding that matches the output and input encoding. This is essential 589 for the View transforms introduced to produce side inputs to a ParDo. 590 """ 591 return { 592 '@type': 'kind:stream', 593 'component_encodings': [input_encoding], 594 'is_stream_like': { 595 'value': True 596 }, 597 } 598 599 def _get_encoded_output_coder( 600 self, transform_node, window_value=True, output_tag=None): 601 """Returns the cloud encoding of the coder for the output of a transform.""" 602 603 if output_tag in transform_node.outputs: 604 element_type = transform_node.outputs[output_tag].element_type 605 elif len(transform_node.outputs) == 1: 606 output_tag = DataflowRunner._only_element(transform_node.outputs.keys()) 607 # TODO(robertwb): Handle type hints for multi-output transforms. 608 element_type = transform_node.outputs[output_tag].element_type 609 610 else: 611 # TODO(silviuc): Remove this branch (and assert) when typehints are 612 # propagated everywhere. Returning an 'Any' as type hint will trigger 613 # usage of the fallback coder (i.e., cPickler). 614 element_type = typehints.Any 615 if window_value: 616 # All outputs have the same windowing. So getting the coder from an 617 # arbitrary window is fine. 618 output_tag = next(iter(transform_node.outputs.keys())) 619 window_coder = ( 620 transform_node.outputs[output_tag].windowing.windowfn. 621 get_window_coder()) 622 else: 623 window_coder = None 624 return self._get_typehint_based_encoding(element_type, window_coder) 625 626 def get_pcoll_with_auto_sharding(self): 627 if not hasattr(self, '_pcoll_with_auto_sharding'): 628 return set() 629 return self._pcoll_with_auto_sharding 630 631 def add_pcoll_with_auto_sharding(self, applied_ptransform): 632 if not hasattr(self, '_pcoll_with_auto_sharding'): 633 self.__setattr__('_pcoll_with_auto_sharding', set()) 634 output = DataflowRunner._only_element(applied_ptransform.outputs.keys()) 635 self._pcoll_with_auto_sharding.add( 636 applied_ptransform.outputs[output]._unique_name()) 637 638 def _add_step(self, step_kind, step_label, transform_node, side_tags=()): 639 """Creates a Step object and adds it to the cache.""" 640 # Import here to avoid adding the dependency for local running scenarios. 641 # pylint: disable=wrong-import-order, wrong-import-position 642 from apache_beam.runners.dataflow.internal import apiclient 643 step = apiclient.Step(step_kind, self._get_unique_step_name()) 644 self.job.proto.steps.append(step.proto) 645 step.add_property(PropertyNames.USER_NAME, step_label) 646 # Cache the node/step association for the main output of the transform node. 647 648 # External transforms may not use 'None' as an output tag. 649 output_tags = ([None] + 650 list(side_tags) if None in transform_node.outputs.keys() else 651 list(transform_node.outputs.keys())) 652 653 # We have to cache output for all tags since some transforms may produce 654 # multiple outputs. 655 for output_tag in output_tags: 656 self._cache.cache_output(transform_node, output_tag, step) 657 658 # Finally, we add the display data items to the pipeline step. 659 # If the transform contains no display data then an empty list is added. 660 step.add_property( 661 PropertyNames.DISPLAY_DATA, 662 [ 663 item.get_dict() 664 for item in DisplayData.create_from(transform_node.transform).items 665 ]) 666 667 if transform_node.resource_hints: 668 step.add_property( 669 PropertyNames.RESOURCE_HINTS, 670 { 671 hint: quote_from_bytes(value) 672 for (hint, value) in transform_node.resource_hints.items() 673 }) 674 675 return step 676 677 def _add_singleton_step( 678 self, 679 label, 680 full_label, 681 tag, 682 input_step, 683 windowing_strategy, 684 access_pattern): 685 """Creates a CollectionToSingleton step used to handle ParDo side inputs.""" 686 # Import here to avoid adding the dependency for local running scenarios. 687 from apache_beam.runners.dataflow.internal import apiclient 688 step = apiclient.Step(TransformNames.COLLECTION_TO_SINGLETON, label) 689 self.job.proto.steps.append(step.proto) 690 step.add_property(PropertyNames.USER_NAME, full_label) 691 step.add_property( 692 PropertyNames.PARALLEL_INPUT, 693 { 694 '@type': 'OutputReference', 695 PropertyNames.STEP_NAME: input_step.proto.name, 696 PropertyNames.OUTPUT_NAME: input_step.get_output(tag) 697 }) 698 step.encoding = self._get_side_input_encoding(input_step.encoding) 699 700 output_info = { 701 PropertyNames.USER_NAME: '%s.%s' % (full_label, PropertyNames.OUTPUT), 702 PropertyNames.ENCODING: step.encoding, 703 PropertyNames.OUTPUT_NAME: PropertyNames.OUT 704 } 705 if common_urns.side_inputs.MULTIMAP.urn == access_pattern: 706 output_info[PropertyNames.USE_INDEXED_FORMAT] = True 707 step.add_property(PropertyNames.OUTPUT_INFO, [output_info]) 708 709 step.add_property( 710 PropertyNames.WINDOWING_STRATEGY, 711 self.serialize_windowing_strategy( 712 windowing_strategy, self._default_environment)) 713 return step 714 715 def run_Impulse(self, transform_node, options): 716 step = self._add_step( 717 TransformNames.READ, transform_node.full_label, transform_node) 718 step.add_property(PropertyNames.FORMAT, 'impulse') 719 encoded_impulse_element = coders.WindowedValueCoder( 720 coders.BytesCoder(), 721 coders.coders.GlobalWindowCoder()).get_impl().encode_nested( 722 window.GlobalWindows.windowed_value(b'')) 723 if _is_runner_v2(options): 724 encoded_impulse_as_str = self.byte_array_to_json_string( 725 encoded_impulse_element) 726 else: 727 encoded_impulse_as_str = base64.b64encode(encoded_impulse_element).decode( 728 'ascii') 729 730 step.add_property(PropertyNames.IMPULSE_ELEMENT, encoded_impulse_as_str) 731 732 step.encoding = self._get_encoded_output_coder(transform_node) 733 step.add_property( 734 PropertyNames.OUTPUT_INFO, 735 [{ 736 PropertyNames.USER_NAME: ( 737 '%s.%s' % (transform_node.full_label, PropertyNames.OUT)), 738 PropertyNames.ENCODING: step.encoding, 739 PropertyNames.OUTPUT_NAME: PropertyNames.OUT 740 }]) 741 742 def run_Flatten(self, transform_node, options): 743 step = self._add_step( 744 TransformNames.FLATTEN, transform_node.full_label, transform_node) 745 inputs = [] 746 for one_input in transform_node.inputs: 747 input_step = self._cache.get_pvalue(one_input) 748 inputs.append({ 749 '@type': 'OutputReference', 750 PropertyNames.STEP_NAME: input_step.proto.name, 751 PropertyNames.OUTPUT_NAME: input_step.get_output(one_input.tag) 752 }) 753 step.add_property(PropertyNames.INPUTS, inputs) 754 step.encoding = self._get_encoded_output_coder(transform_node) 755 step.add_property( 756 PropertyNames.OUTPUT_INFO, 757 [{ 758 PropertyNames.USER_NAME: ( 759 '%s.%s' % (transform_node.full_label, PropertyNames.OUT)), 760 PropertyNames.ENCODING: step.encoding, 761 PropertyNames.OUTPUT_NAME: PropertyNames.OUT 762 }]) 763 764 # TODO(srohde): Remove this after internal usages have been removed. 765 def apply_GroupByKey(self, transform, pcoll, options): 766 return transform.expand(pcoll) 767 768 def _verify_gbk_coders(self, transform, pcoll): 769 # Infer coder of parent. 770 # 771 # TODO(ccy): make Coder inference and checking less specialized and more 772 # comprehensive. 773 774 parent = pcoll.producer 775 if parent: 776 coder = parent.transform._infer_output_coder() # pylint: disable=protected-access 777 if not coder: 778 coder = self._get_coder(pcoll.element_type or typehints.Any, None) 779 if not coder.is_kv_coder(): 780 raise ValueError(( 781 'Coder for the GroupByKey operation "%s" is not a ' 782 'key-value coder: %s.') % (transform.label, coder)) 783 # TODO(robertwb): Update the coder itself if it changed. 784 coders.registry.verify_deterministic( 785 coder.key_coder(), 'GroupByKey operation "%s"' % transform.label) 786 787 def run_GroupByKey(self, transform_node, options): 788 input_tag = transform_node.inputs[0].tag 789 input_step = self._cache.get_pvalue(transform_node.inputs[0]) 790 791 # Verify that the GBK's parent has a KV coder. 792 self._verify_gbk_coders(transform_node.transform, transform_node.inputs[0]) 793 794 step = self._add_step( 795 TransformNames.GROUP, transform_node.full_label, transform_node) 796 step.add_property( 797 PropertyNames.PARALLEL_INPUT, 798 { 799 '@type': 'OutputReference', 800 PropertyNames.STEP_NAME: input_step.proto.name, 801 PropertyNames.OUTPUT_NAME: input_step.get_output(input_tag) 802 }) 803 step.encoding = self._get_encoded_output_coder(transform_node) 804 step.add_property( 805 PropertyNames.OUTPUT_INFO, 806 [{ 807 PropertyNames.USER_NAME: ( 808 '%s.%s' % (transform_node.full_label, PropertyNames.OUT)), 809 PropertyNames.ENCODING: step.encoding, 810 PropertyNames.OUTPUT_NAME: PropertyNames.OUT 811 }]) 812 windowing = transform_node.transform.get_windowing(transform_node.inputs) 813 step.add_property( 814 PropertyNames.SERIALIZED_FN, 815 self.serialize_windowing_strategy(windowing, self._default_environment)) 816 817 def run_ExternalTransform(self, transform_node, options): 818 # Adds a dummy step to the Dataflow job description so that inputs and 819 # outputs are mapped correctly in the presence of external transforms. 820 # 821 # Note that Dataflow Python multi-language pipelines use Portable Job 822 # Submission by default, hence this step and rest of the Dataflow step 823 # definitions defined here are not used at Dataflow service but we have to 824 # maintain the mapping correctly till we can fully drop the Dataflow step 825 # definitions from the SDK. 826 827 # AppliedTransform node outputs have to be updated to correctly map the 828 # outputs for external transforms. 829 transform_node.outputs = ({ 830 output.tag: output 831 for output in transform_node.outputs.values() 832 }) 833 834 self.run_Impulse(transform_node, options) 835 836 def run_ParDo(self, transform_node, options): 837 transform = transform_node.transform 838 input_tag = transform_node.inputs[0].tag 839 input_step = self._cache.get_pvalue(transform_node.inputs[0]) 840 841 # Attach side inputs. 842 si_dict = {} 843 si_labels = {} 844 full_label_counts = defaultdict(int) 845 lookup_label = lambda side_pval: si_labels[side_pval] 846 named_inputs = transform_node.named_inputs() 847 label_renames = {} 848 for ix, side_pval in enumerate(transform_node.side_inputs): 849 assert isinstance(side_pval, AsSideInput) 850 step_name = 'SideInput-' + self._get_unique_step_name() 851 si_label = ((SIDE_INPUT_PREFIX + '%d-%s') % 852 (ix, transform_node.full_label)) 853 old_label = (SIDE_INPUT_PREFIX + '%d') % ix 854 855 label_renames[old_label] = si_label 856 857 assert old_label in named_inputs 858 pcollection_label = '%s.%s' % ( 859 side_pval.pvalue.producer.full_label.split('/')[-1], 860 side_pval.pvalue.tag if side_pval.pvalue.tag else 'out') 861 si_full_label = '%s/%s(%s.%s)' % ( 862 transform_node.full_label, 863 side_pval.__class__.__name__, 864 pcollection_label, 865 full_label_counts[pcollection_label]) 866 867 # Count the number of times the same PCollection is a side input 868 # to the same ParDo. 869 full_label_counts[pcollection_label] += 1 870 871 self._add_singleton_step( 872 step_name, 873 si_full_label, 874 side_pval.pvalue.tag, 875 self._cache.get_pvalue(side_pval.pvalue), 876 side_pval.pvalue.windowing, 877 side_pval._side_input_data().access_pattern) 878 si_dict[si_label] = { 879 '@type': 'OutputReference', 880 PropertyNames.STEP_NAME: step_name, 881 PropertyNames.OUTPUT_NAME: PropertyNames.OUT 882 } 883 si_labels[side_pval] = si_label 884 885 # Now create the step for the ParDo transform being handled. 886 transform_name = transform_node.full_label.rsplit('/', 1)[-1] 887 step = self._add_step( 888 TransformNames.DO, 889 transform_node.full_label + 890 ('/{}'.format(transform_name) if transform_node.side_inputs else ''), 891 transform_node, 892 transform_node.transform.output_tags) 893 transform_proto = self.proto_context.transforms.get_proto(transform_node) 894 transform_id = self.proto_context.transforms.get_id(transform_node) 895 is_runner_v2 = _is_runner_v2(options) 896 # Patch side input ids to be unique across a given pipeline. 897 if (label_renames and 898 transform_proto.spec.urn == common_urns.primitives.PAR_DO.urn): 899 # Patch PTransform proto. 900 for old, new in label_renames.items(): 901 transform_proto.inputs[new] = transform_proto.inputs[old] 902 del transform_proto.inputs[old] 903 904 # Patch ParDo proto. 905 proto_type, _ = beam.PTransform._known_urns[transform_proto.spec.urn] 906 proto = proto_utils.parse_Bytes(transform_proto.spec.payload, proto_type) 907 for old, new in label_renames.items(): 908 proto.side_inputs[new].CopyFrom(proto.side_inputs[old]) 909 del proto.side_inputs[old] 910 transform_proto.spec.payload = proto.SerializeToString() 911 # We need to update the pipeline proto. 912 del self.proto_pipeline.components.transforms[transform_id] 913 ( 914 self.proto_pipeline.components.transforms[transform_id].CopyFrom( 915 transform_proto)) 916 # The data transmitted in SERIALIZED_FN is different depending on whether 917 # this is a runner v2 pipeline or not. 918 if is_runner_v2: 919 serialized_data = transform_id 920 else: 921 serialized_data = pickler.dumps( 922 self._pardo_fn_data(transform_node, lookup_label)) 923 step.add_property(PropertyNames.SERIALIZED_FN, serialized_data) 924 # TODO(BEAM-8882): Enable once dataflow service doesn't reject this. 925 # step.add_property(PropertyNames.PIPELINE_PROTO_TRANSFORM_ID, transform_id) 926 step.add_property( 927 PropertyNames.PARALLEL_INPUT, 928 { 929 '@type': 'OutputReference', 930 PropertyNames.STEP_NAME: input_step.proto.name, 931 PropertyNames.OUTPUT_NAME: input_step.get_output(input_tag) 932 }) 933 # Add side inputs if any. 934 step.add_property(PropertyNames.NON_PARALLEL_INPUTS, si_dict) 935 936 # Generate description for the outputs. The output names 937 # will be 'None' for main output and '<tag>' for a tagged output. 938 outputs = [] 939 940 all_output_tags = list(transform_proto.outputs.keys()) 941 942 # Some external transforms require output tags to not be modified. 943 # So we randomly select one of the output tags as the main output and 944 # leave others as side outputs. Transform execution should not change 945 # dependending on which output tag we choose as the main output here. 946 # Also, some SDKs do not work correctly if output tags are modified. So for 947 # external transforms, we leave tags unmodified. 948 # 949 # Python SDK uses 'None' as the tag of the main output. 950 main_output_tag = 'None' 951 952 step.encoding = self._get_encoded_output_coder( 953 transform_node, output_tag=main_output_tag) 954 955 side_output_tags = set(all_output_tags).difference({main_output_tag}) 956 957 # Add the main output to the description. 958 outputs.append({ 959 PropertyNames.USER_NAME: ( 960 '%s.%s' % (transform_node.full_label, PropertyNames.OUT)), 961 PropertyNames.ENCODING: step.encoding, 962 PropertyNames.OUTPUT_NAME: main_output_tag 963 }) 964 for side_tag in side_output_tags: 965 # The assumption here is that all outputs will have the same typehint 966 # and coder as the main output. This is certainly the case right now 967 # but conceivably it could change in the future. 968 encoding = self._get_encoded_output_coder( 969 transform_node, output_tag=side_tag) 970 outputs.append({ 971 PropertyNames.USER_NAME: ( 972 '%s.%s' % (transform_node.full_label, side_tag)), 973 PropertyNames.ENCODING: encoding, 974 PropertyNames.OUTPUT_NAME: side_tag 975 }) 976 977 step.add_property(PropertyNames.OUTPUT_INFO, outputs) 978 979 # Add the restriction encoding if we are a splittable DoFn 980 restriction_coder = transform.get_restriction_coder() 981 if restriction_coder: 982 step.add_property( 983 PropertyNames.RESTRICTION_ENCODING, 984 self._get_cloud_encoding(restriction_coder)) 985 986 if options.view_as(StandardOptions).streaming: 987 is_stateful_dofn = (DoFnSignature(transform.dofn).is_stateful_dofn()) 988 if is_stateful_dofn: 989 step.add_property(PropertyNames.USES_KEYED_STATE, 'true') 990 991 # Also checks whether the step allows shardable keyed states. 992 # TODO(BEAM-11360): remove this when migrated to portable job 993 # submission since we only consider supporting the property in runner 994 # v2. 995 for pcoll in transform_node.outputs.values(): 996 if pcoll._unique_name() in self.get_pcoll_with_auto_sharding(): 997 step.add_property(PropertyNames.ALLOWS_SHARDABLE_STATE, 'true') 998 # Currently we only allow auto-sharding to be enabled through the 999 # GroupIntoBatches transform. So we also add the following property 1000 # which GroupIntoBatchesDoFn has, to allow the backend to perform 1001 # graph optimization. 1002 step.add_property(PropertyNames.PRESERVES_KEYS, 'true') 1003 break 1004 1005 @staticmethod 1006 def _pardo_fn_data(transform_node, get_label): 1007 transform = transform_node.transform 1008 si_tags_and_types = [ # pylint: disable=protected-access 1009 (get_label(side_pval), side_pval.__class__, side_pval._view_options()) 1010 for side_pval in transform_node.side_inputs] 1011 return ( 1012 transform.fn, 1013 transform.args, 1014 transform.kwargs, 1015 si_tags_and_types, 1016 transform_node.inputs[0].windowing) 1017 1018 def run_CombineValuesReplacement(self, transform_node, options): 1019 transform = transform_node.transform.transform 1020 input_tag = transform_node.inputs[0].tag 1021 input_step = self._cache.get_pvalue(transform_node.inputs[0]) 1022 step = self._add_step( 1023 TransformNames.COMBINE, transform_node.full_label, transform_node) 1024 transform_id = self.proto_context.transforms.get_id(transform_node.parent) 1025 1026 # The data transmitted in SERIALIZED_FN is different depending on whether 1027 # this is a runner v2 pipeline or not. 1028 if _is_runner_v2(options): 1029 # Fnapi pipelines send the transform ID of the CombineValues transform's 1030 # parent composite because Dataflow expects the ID of a CombinePerKey 1031 # transform. 1032 serialized_data = transform_id 1033 else: 1034 # Combiner functions do not take deferred side-inputs (i.e. PValues) and 1035 # therefore the code to handle extra args/kwargs is simpler than for the 1036 # DoFn's of the ParDo transform. In the last, empty argument is where 1037 # side inputs information would go. 1038 serialized_data = pickler.dumps( 1039 (transform.fn, transform.args, transform.kwargs, ())) 1040 step.add_property(PropertyNames.SERIALIZED_FN, serialized_data) 1041 # TODO(BEAM-8882): Enable once dataflow service doesn't reject this. 1042 # step.add_property(PropertyNames.PIPELINE_PROTO_TRANSFORM_ID, transform_id) 1043 step.add_property( 1044 PropertyNames.PARALLEL_INPUT, 1045 { 1046 '@type': 'OutputReference', 1047 PropertyNames.STEP_NAME: input_step.proto.name, 1048 PropertyNames.OUTPUT_NAME: input_step.get_output(input_tag) 1049 }) 1050 # Note that the accumulator must not have a WindowedValue encoding, while 1051 # the output of this step does in fact have a WindowedValue encoding. 1052 accumulator_encoding = self._get_cloud_encoding( 1053 transform.fn.get_accumulator_coder()) 1054 output_encoding = self._get_encoded_output_coder(transform_node) 1055 1056 step.encoding = output_encoding 1057 step.add_property(PropertyNames.ENCODING, accumulator_encoding) 1058 # Generate description for main output 'out.' 1059 outputs = [] 1060 # Add the main output to the description. 1061 outputs.append({ 1062 PropertyNames.USER_NAME: ( 1063 '%s.%s' % (transform_node.full_label, PropertyNames.OUT)), 1064 PropertyNames.ENCODING: step.encoding, 1065 PropertyNames.OUTPUT_NAME: PropertyNames.OUT 1066 }) 1067 step.add_property(PropertyNames.OUTPUT_INFO, outputs) 1068 1069 def run_Read(self, transform_node, options): 1070 transform = transform_node.transform 1071 step = self._add_step( 1072 TransformNames.READ, transform_node.full_label, transform_node) 1073 # TODO(mairbek): refactor if-else tree to use registerable functions. 1074 # Initialize the source specific properties. 1075 1076 standard_options = options.view_as(StandardOptions) 1077 if not hasattr(transform.source, 'format'): 1078 # If a format is not set, we assume the source to be a custom source. 1079 source_dict = {} 1080 1081 source_dict['spec'] = { 1082 '@type': names.SOURCE_TYPE, 1083 names.SERIALIZED_SOURCE_KEY: pickler.dumps(transform.source) 1084 } 1085 1086 try: 1087 source_dict['metadata'] = { 1088 'estimated_size_bytes': json_value.get_typed_value_descriptor( 1089 transform.source.estimate_size()) 1090 } 1091 except error.RuntimeValueProviderError: 1092 # Size estimation is best effort, and this error is by value provider. 1093 _LOGGER.info( 1094 'Could not estimate size of source %r due to ' + \ 1095 'RuntimeValueProviderError', transform.source) 1096 except Exception: # pylint: disable=broad-except 1097 # Size estimation is best effort. So we log the error and continue. 1098 _LOGGER.info( 1099 'Could not estimate size of source %r due to an exception: %s', 1100 transform.source, 1101 traceback.format_exc()) 1102 1103 step.add_property(PropertyNames.SOURCE_STEP_INPUT, source_dict) 1104 elif transform.source.format == 'pubsub': 1105 if not standard_options.streaming: 1106 raise ValueError( 1107 'Cloud Pub/Sub is currently available for use ' 1108 'only in streaming pipelines.') 1109 # Only one of topic or subscription should be set. 1110 if transform.source.full_subscription: 1111 step.add_property( 1112 PropertyNames.PUBSUB_SUBSCRIPTION, 1113 transform.source.full_subscription) 1114 elif transform.source.full_topic: 1115 step.add_property( 1116 PropertyNames.PUBSUB_TOPIC, transform.source.full_topic) 1117 if transform.source.id_label: 1118 step.add_property( 1119 PropertyNames.PUBSUB_ID_LABEL, transform.source.id_label) 1120 if transform.source.with_attributes: 1121 # Setting this property signals Dataflow runner to return full 1122 # PubsubMessages instead of just the data part of the payload. 1123 step.add_property(PropertyNames.PUBSUB_SERIALIZED_ATTRIBUTES_FN, '') 1124 1125 if transform.source.timestamp_attribute is not None: 1126 step.add_property( 1127 PropertyNames.PUBSUB_TIMESTAMP_ATTRIBUTE, 1128 transform.source.timestamp_attribute) 1129 else: 1130 raise ValueError( 1131 'Source %r has unexpected format %s.' % 1132 (transform.source, transform.source.format)) 1133 1134 if not hasattr(transform.source, 'format'): 1135 step.add_property(PropertyNames.FORMAT, names.SOURCE_FORMAT) 1136 else: 1137 step.add_property(PropertyNames.FORMAT, transform.source.format) 1138 1139 # Wrap coder in WindowedValueCoder: this is necessary as the encoding of a 1140 # step should be the type of value outputted by each step. Read steps 1141 # automatically wrap output values in a WindowedValue wrapper, if necessary. 1142 # This is also necessary for proper encoding for size estimation. 1143 # Using a GlobalWindowCoder as a place holder instead of the default 1144 # PickleCoder because GlobalWindowCoder is known coder. 1145 # TODO(robertwb): Query the collection for the windowfn to extract the 1146 # correct coder. 1147 coder = coders.WindowedValueCoder( 1148 coders.registry.get_coder(transform_node.outputs[None].element_type), 1149 coders.coders.GlobalWindowCoder()) 1150 1151 step.encoding = self._get_cloud_encoding(coder) 1152 step.add_property( 1153 PropertyNames.OUTPUT_INFO, 1154 [{ 1155 PropertyNames.USER_NAME: ( 1156 '%s.%s' % (transform_node.full_label, PropertyNames.OUT)), 1157 PropertyNames.ENCODING: step.encoding, 1158 PropertyNames.OUTPUT_NAME: PropertyNames.OUT 1159 }]) 1160 1161 def run__NativeWrite(self, transform_node, options): 1162 transform = transform_node.transform 1163 input_tag = transform_node.inputs[0].tag 1164 input_step = self._cache.get_pvalue(transform_node.inputs[0]) 1165 step = self._add_step( 1166 TransformNames.WRITE, transform_node.full_label, transform_node) 1167 # TODO(mairbek): refactor if-else tree to use registerable functions. 1168 # Initialize the sink specific properties. 1169 if transform.sink.format == 'pubsub': 1170 standard_options = options.view_as(StandardOptions) 1171 if not standard_options.streaming: 1172 raise ValueError( 1173 'Cloud Pub/Sub is currently available for use ' 1174 'only in streaming pipelines.') 1175 step.add_property(PropertyNames.PUBSUB_TOPIC, transform.sink.full_topic) 1176 if transform.sink.id_label: 1177 step.add_property( 1178 PropertyNames.PUBSUB_ID_LABEL, transform.sink.id_label) 1179 # Setting this property signals Dataflow runner that the PCollection 1180 # contains PubsubMessage objects instead of just raw data. 1181 step.add_property(PropertyNames.PUBSUB_SERIALIZED_ATTRIBUTES_FN, '') 1182 if transform.sink.timestamp_attribute is not None: 1183 step.add_property( 1184 PropertyNames.PUBSUB_TIMESTAMP_ATTRIBUTE, 1185 transform.sink.timestamp_attribute) 1186 else: 1187 raise ValueError( 1188 'Sink %r has unexpected format %s.' % 1189 (transform.sink, transform.sink.format)) 1190 step.add_property(PropertyNames.FORMAT, transform.sink.format) 1191 1192 # Wrap coder in WindowedValueCoder: this is necessary for proper encoding 1193 # for size estimation. Using a GlobalWindowCoder as a place holder instead 1194 # of the default PickleCoder because GlobalWindowCoder is known coder. 1195 # TODO(robertwb): Query the collection for the windowfn to extract the 1196 # correct coder. 1197 coder = coders.WindowedValueCoder( 1198 transform.sink.coder, coders.coders.GlobalWindowCoder()) 1199 step.encoding = self._get_cloud_encoding(coder) 1200 step.add_property(PropertyNames.ENCODING, step.encoding) 1201 step.add_property( 1202 PropertyNames.PARALLEL_INPUT, 1203 { 1204 '@type': 'OutputReference', 1205 PropertyNames.STEP_NAME: input_step.proto.name, 1206 PropertyNames.OUTPUT_NAME: input_step.get_output(input_tag) 1207 }) 1208 1209 def run_TestStream(self, transform_node, options): 1210 from apache_beam.testing.test_stream import ElementEvent 1211 from apache_beam.testing.test_stream import ProcessingTimeEvent 1212 from apache_beam.testing.test_stream import WatermarkEvent 1213 standard_options = options.view_as(StandardOptions) 1214 if not standard_options.streaming: 1215 raise ValueError( 1216 'TestStream is currently available for use ' 1217 'only in streaming pipelines.') 1218 1219 transform = transform_node.transform 1220 step = self._add_step( 1221 TransformNames.READ, transform_node.full_label, transform_node) 1222 step.add_property( 1223 PropertyNames.SERIALIZED_FN, 1224 self.proto_context.transforms.get_id(transform_node)) 1225 step.add_property(PropertyNames.FORMAT, 'test_stream') 1226 test_stream_payload = beam_runner_api_pb2.TestStreamPayload() 1227 # TestStream source doesn't do any decoding of elements, 1228 # so we won't set test_stream_payload.coder_id. 1229 output_coder = transform._infer_output_coder() # pylint: disable=protected-access 1230 for event in transform._events: 1231 new_event = test_stream_payload.events.add() 1232 if isinstance(event, ElementEvent): 1233 for tv in event.timestamped_values: 1234 element = new_event.element_event.elements.add() 1235 element.encoded_element = output_coder.encode(tv.value) 1236 element.timestamp = tv.timestamp.micros 1237 elif isinstance(event, ProcessingTimeEvent): 1238 new_event.processing_time_event.advance_duration = ( 1239 event.advance_by.micros) 1240 elif isinstance(event, WatermarkEvent): 1241 new_event.watermark_event.new_watermark = event.new_watermark.micros 1242 serialized_payload = self.byte_array_to_json_string( 1243 test_stream_payload.SerializeToString()) 1244 step.add_property(PropertyNames.SERIALIZED_TEST_STREAM, serialized_payload) 1245 1246 step.encoding = self._get_encoded_output_coder(transform_node) 1247 step.add_property( 1248 PropertyNames.OUTPUT_INFO, 1249 [{ 1250 PropertyNames.USER_NAME: ( 1251 '%s.%s' % (transform_node.full_label, PropertyNames.OUT)), 1252 PropertyNames.ENCODING: step.encoding, 1253 PropertyNames.OUTPUT_NAME: PropertyNames.OUT 1254 }]) 1255 1256 # We must mark this method as not a test or else its name is a matcher for 1257 # nosetest tests. 1258 run_TestStream.__test__ = False # type: ignore[attr-defined] 1259 1260 @classmethod 1261 def serialize_windowing_strategy(cls, windowing, default_environment): 1262 from apache_beam.runners import pipeline_context 1263 context = pipeline_context.PipelineContext( 1264 default_environment=default_environment) 1265 windowing_proto = windowing.to_runner_api(context) 1266 return cls.byte_array_to_json_string( 1267 beam_runner_api_pb2.MessageWithComponents( 1268 components=context.to_runner_api(), 1269 windowing_strategy=windowing_proto).SerializeToString()) 1270 1271 @classmethod 1272 def deserialize_windowing_strategy(cls, serialized_data): 1273 # Imported here to avoid circular dependencies. 1274 # pylint: disable=wrong-import-order, wrong-import-position 1275 from apache_beam.runners import pipeline_context 1276 from apache_beam.transforms.core import Windowing 1277 proto = beam_runner_api_pb2.MessageWithComponents() 1278 proto.ParseFromString(cls.json_string_to_byte_array(serialized_data)) 1279 return Windowing.from_runner_api( 1280 proto.windowing_strategy, 1281 pipeline_context.PipelineContext(proto.components)) 1282 1283 @staticmethod 1284 def byte_array_to_json_string(raw_bytes): 1285 """Implements org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString.""" 1286 return quote(raw_bytes) 1287 1288 @staticmethod 1289 def json_string_to_byte_array(encoded_string): 1290 """Implements org.apache.beam.sdk.util.StringUtils.jsonStringToByteArray.""" 1291 return unquote_to_bytes(encoded_string) 1292 1293 def get_default_gcp_region(self): 1294 """Get a default value for Google Cloud region according to 1295 https://cloud.google.com/compute/docs/gcloud-compute/#default-properties. 1296 If no default can be found, returns None. 1297 """ 1298 environment_region = os.environ.get('CLOUDSDK_COMPUTE_REGION') 1299 if environment_region: 1300 _LOGGER.info( 1301 'Using default GCP region %s from $CLOUDSDK_COMPUTE_REGION', 1302 environment_region) 1303 return environment_region 1304 try: 1305 cmd = ['gcloud', 'config', 'get-value', 'compute/region'] 1306 raw_output = processes.check_output(cmd, stderr=DEVNULL) 1307 formatted_output = raw_output.decode('utf-8').strip() 1308 if formatted_output: 1309 _LOGGER.info( 1310 'Using default GCP region %s from `%s`', 1311 formatted_output, 1312 ' '.join(cmd)) 1313 return formatted_output 1314 except RuntimeError: 1315 pass 1316 return None 1317 1318 1319 class _DataflowSideInput(beam.pvalue.AsSideInput): 1320 """Wraps a side input as a dataflow-compatible side input.""" 1321 def _view_options(self): 1322 return { 1323 'data': self._data, 1324 } 1325 1326 def _side_input_data(self): 1327 return self._data 1328 1329 1330 def _add_runner_v2_missing_options(options): 1331 debug_options = options.view_as(DebugOptions) 1332 debug_options.add_experiment('beam_fn_api') 1333 debug_options.add_experiment('use_unified_worker') 1334 debug_options.add_experiment('use_runner_v2') 1335 debug_options.add_experiment('use_portable_job_submission') 1336 1337 1338 def _check_and_add_missing_options(options): 1339 # Type: (PipelineOptions) -> None 1340 1341 """Validates and adds missing pipeline options depending on options set. 1342 1343 :param options: PipelineOptions for this pipeline. 1344 """ 1345 debug_options = options.view_as(DebugOptions) 1346 dataflow_service_options = options.view_as( 1347 GoogleCloudOptions).dataflow_service_options or [] 1348 options.view_as( 1349 GoogleCloudOptions).dataflow_service_options = dataflow_service_options 1350 1351 # Ensure that prime is specified as an experiment if specified as a dataflow 1352 # service option 1353 if 'enable_prime' in dataflow_service_options: 1354 debug_options.add_experiment('enable_prime') 1355 elif debug_options.lookup_experiment('enable_prime'): 1356 dataflow_service_options.append('enable_prime') 1357 1358 # Streaming only supports using runner v2 (aka unified worker). 1359 # Runner v2 only supports using streaming engine (aka windmill service) 1360 if options.view_as(StandardOptions).streaming: 1361 google_cloud_options = options.view_as(GoogleCloudOptions) 1362 if _is_runner_v2_disabled(options): 1363 raise ValueError( 1364 'Disabling Runner V2 no longer supported for streaming pipeline ' 1365 'using Beam Python %s.' % beam.version.__version__) 1366 1367 if (not google_cloud_options.enable_streaming_engine and 1368 (debug_options.lookup_experiment("enable_windmill_service") or 1369 debug_options.lookup_experiment("enable_streaming_engine"))): 1370 raise ValueError( 1371 """Streaming engine both disabled and enabled: 1372 --enable_streaming_engine flag is not set, but 1373 enable_windmill_service and/or enable_streaming_engine experiments 1374 are present. It is recommended you only set the 1375 --enable_streaming_engine flag.""") 1376 1377 # Ensure that if we detected a streaming pipeline that streaming specific 1378 # options and experiments. 1379 options.view_as(StandardOptions).streaming = True 1380 google_cloud_options.enable_streaming_engine = True 1381 debug_options.add_experiment("enable_streaming_engine") 1382 debug_options.add_experiment("enable_windmill_service") 1383 _add_runner_v2_missing_options(debug_options) 1384 elif (debug_options.lookup_experiment('enable_prime') or 1385 debug_options.lookup_experiment('beam_fn_api') or 1386 debug_options.lookup_experiment('use_unified_worker') or 1387 debug_options.lookup_experiment('use_runner_v2') or 1388 debug_options.lookup_experiment('use_portable_job_submission')): 1389 if _is_runner_v2_disabled(options): 1390 raise ValueError( 1391 """Runner V2 both disabled and enabled: at least one of 1392 ['enable_prime', 'beam_fn_api', 'use_unified_worker', 'use_runner_v2', 1393 'use_portable_job_submission'] is set and also one of 1394 ['disable_runner_v2', 'disable_runner_v2_until_2023', 1395 'disable_prime_runner_v2'] is set.""") 1396 _add_runner_v2_missing_options(debug_options) 1397 1398 1399 def _is_runner_v2(options): 1400 # Type: (PipelineOptions) -> bool 1401 1402 """Returns true if runner v2 is enabled.""" 1403 _check_and_add_missing_options(options) 1404 return options.view_as(DebugOptions).lookup_experiment( 1405 'use_runner_v2', default=False) 1406 1407 1408 def _is_runner_v2_disabled(options): 1409 # Type: (PipelineOptions) -> bool 1410 1411 """Returns true if runner v2 is disabled.""" 1412 debug_options = options.view_as(DebugOptions) 1413 return ( 1414 debug_options.lookup_experiment('disable_runner_v2') or 1415 debug_options.lookup_experiment('disable_runner_v2_until_2023') or 1416 debug_options.lookup_experiment('disable_prime_runner_v2')) 1417 1418 1419 class _DataflowIterableSideInput(_DataflowSideInput): 1420 """Wraps an iterable side input as dataflow-compatible side input.""" 1421 def __init__(self, side_input): 1422 # pylint: disable=protected-access 1423 self.pvalue = side_input.pvalue 1424 side_input_data = side_input._side_input_data() 1425 assert ( 1426 side_input_data.access_pattern == common_urns.side_inputs.ITERABLE.urn) 1427 self._data = beam.pvalue.SideInputData( 1428 common_urns.side_inputs.ITERABLE.urn, 1429 side_input_data.window_mapping_fn, 1430 side_input_data.view_fn) 1431 1432 1433 class _DataflowMultimapSideInput(_DataflowSideInput): 1434 """Wraps a multimap side input as dataflow-compatible side input.""" 1435 def __init__(self, side_input): 1436 # pylint: disable=protected-access 1437 self.pvalue = side_input.pvalue 1438 side_input_data = side_input._side_input_data() 1439 assert ( 1440 side_input_data.access_pattern == common_urns.side_inputs.MULTIMAP.urn) 1441 self._data = beam.pvalue.SideInputData( 1442 common_urns.side_inputs.MULTIMAP.urn, 1443 side_input_data.window_mapping_fn, 1444 side_input_data.view_fn) 1445 1446 1447 class DataflowPipelineResult(PipelineResult): 1448 """Represents the state of a pipeline run on the Dataflow service.""" 1449 def __init__(self, job, runner): 1450 """Initialize a new DataflowPipelineResult instance. 1451 1452 Args: 1453 job: Job message from the Dataflow API. Could be :data:`None` if a job 1454 request was not sent to Dataflow service (e.g. template jobs). 1455 runner: DataflowRunner instance. 1456 """ 1457 self._job = job 1458 self._runner = runner 1459 self.metric_results = None 1460 1461 def _update_job(self): 1462 # We need the job id to be able to update job information. There is no need 1463 # to update the job if we are in a known terminal state. 1464 if self.has_job and not self.is_in_terminal_state(): 1465 self._job = self._runner.dataflow_client.get_job(self.job_id()) 1466 1467 def job_id(self): 1468 return self._job.id 1469 1470 def metrics(self): 1471 return self.metric_results 1472 1473 def monitoring_infos(self): 1474 logging.warning('Monitoring infos not yet supported for Dataflow runner.') 1475 return [] 1476 1477 @property 1478 def has_job(self): 1479 return self._job is not None 1480 1481 @staticmethod 1482 def api_jobstate_to_pipeline_state(api_jobstate): 1483 values_enum = dataflow_api.Job.CurrentStateValueValuesEnum 1484 1485 # Ordered by the enum values. Values that may be introduced in 1486 # future versions of Dataflow API are considered UNRECOGNIZED by this SDK. 1487 api_jobstate_map = defaultdict( 1488 lambda: PipelineState.UNRECOGNIZED, 1489 { 1490 values_enum.JOB_STATE_UNKNOWN: PipelineState.UNKNOWN, 1491 values_enum.JOB_STATE_STOPPED: PipelineState.STOPPED, 1492 values_enum.JOB_STATE_RUNNING: PipelineState.RUNNING, 1493 values_enum.JOB_STATE_DONE: PipelineState.DONE, 1494 values_enum.JOB_STATE_FAILED: PipelineState.FAILED, 1495 values_enum.JOB_STATE_CANCELLED: PipelineState.CANCELLED, 1496 values_enum.JOB_STATE_UPDATED: PipelineState.UPDATED, 1497 values_enum.JOB_STATE_DRAINING: PipelineState.DRAINING, 1498 values_enum.JOB_STATE_DRAINED: PipelineState.DRAINED, 1499 values_enum.JOB_STATE_PENDING: PipelineState.PENDING, 1500 values_enum.JOB_STATE_CANCELLING: PipelineState.CANCELLING, 1501 values_enum.JOB_STATE_RESOURCE_CLEANING_UP: PipelineState. 1502 RESOURCE_CLEANING_UP, 1503 }) 1504 1505 return ( 1506 api_jobstate_map[api_jobstate] 1507 if api_jobstate else PipelineState.UNKNOWN) 1508 1509 def _get_job_state(self): 1510 return self.api_jobstate_to_pipeline_state(self._job.currentState) 1511 1512 @property 1513 def state(self): 1514 """Return the current state of the remote job. 1515 1516 Returns: 1517 A PipelineState object. 1518 """ 1519 if not self.has_job: 1520 return PipelineState.UNKNOWN 1521 1522 self._update_job() 1523 1524 return self._get_job_state() 1525 1526 def is_in_terminal_state(self): 1527 if not self.has_job: 1528 return True 1529 1530 return PipelineState.is_terminal(self._get_job_state()) 1531 1532 def wait_until_finish(self, duration=None): 1533 if not self.is_in_terminal_state(): 1534 if not self.has_job: 1535 raise IOError('Failed to get the Dataflow job id.') 1536 consoleUrl = ( 1537 "Console URL: https://console.cloud.google.com/" 1538 f"dataflow/jobs/<RegionId>/{self.job_id()}" 1539 "?project=<ProjectId>") 1540 thread = threading.Thread( 1541 target=DataflowRunner.poll_for_job_completion, 1542 args=(self._runner, self, duration)) 1543 1544 # Mark the thread as a daemon thread so a keyboard interrupt on the main 1545 # thread will terminate everything. This is also the reason we will not 1546 # use thread.join() to wait for the polling thread. 1547 thread.daemon = True 1548 thread.start() 1549 while thread.is_alive(): 1550 time.sleep(5.0) 1551 1552 # TODO: Merge the termination code in poll_for_job_completion and 1553 # is_in_terminal_state. 1554 terminated = self.is_in_terminal_state() 1555 assert duration or terminated, ( 1556 'Job did not reach to a terminal state after waiting indefinitely. ' 1557 '{}'.format(consoleUrl)) 1558 1559 if terminated and self.state != PipelineState.DONE: 1560 # TODO(BEAM-1290): Consider converting this to an error log based on 1561 # theresolution of the issue. 1562 _LOGGER.error(consoleUrl) 1563 raise DataflowRuntimeException( 1564 'Dataflow pipeline failed. State: %s, Error:\n%s' % 1565 (self.state, getattr(self._runner, 'last_error_msg', None)), 1566 self) 1567 elif PipelineState.is_terminal( 1568 self.state) and self.state == PipelineState.FAILED and self._runner: 1569 raise DataflowRuntimeException( 1570 'Dataflow pipeline failed. State: %s, Error:\n%s' % 1571 (self.state, getattr(self._runner, 'last_error_msg', None)), 1572 self) 1573 1574 return self.state 1575 1576 def cancel(self): 1577 if not self.has_job: 1578 raise IOError('Failed to get the Dataflow job id.') 1579 1580 self._update_job() 1581 1582 if self.is_in_terminal_state(): 1583 _LOGGER.warning( 1584 'Cancel failed because job %s is already terminated in state %s.', 1585 self.job_id(), 1586 self.state) 1587 else: 1588 if not self._runner.dataflow_client.modify_job_state( 1589 self.job_id(), 'JOB_STATE_CANCELLED'): 1590 cancel_failed_message = ( 1591 'Failed to cancel job %s, please go to the Developers Console to ' 1592 'cancel it manually.') % self.job_id() 1593 _LOGGER.error(cancel_failed_message) 1594 raise DataflowRuntimeException(cancel_failed_message, self) 1595 1596 return self.state 1597 1598 def __str__(self): 1599 return '<%s %s %s>' % (self.__class__.__name__, self.job_id(), self.state) 1600 1601 def __repr__(self): 1602 return '<%s %s at %s>' % (self.__class__.__name__, self._job, hex(id(self))) 1603 1604 1605 class DataflowRuntimeException(Exception): 1606 """Indicates an error has occurred in running this pipeline.""" 1607 def __init__(self, msg, result): 1608 super().__init__(msg) 1609 self.result = result