github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/interactive/interactive_runner.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A runner that allows running of Beam pipelines interactively.

This module is experimental. No backwards-compatibility guarantees.
"""

# pytype: skip-file

import logging
from typing import Optional

import apache_beam as beam
from apache_beam import runners
from apache_beam.options.pipeline_options import FlinkRunnerOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import WorkerOptions
from apache_beam.pipeline import PipelineVisitor
from apache_beam.runners.direct import direct_runner
from apache_beam.runners.interactive import interactive_environment as ie
from apache_beam.runners.interactive import pipeline_instrument as inst
from apache_beam.runners.interactive import background_caching_job
from apache_beam.runners.interactive.dataproc.types import ClusterMetadata
from apache_beam.runners.interactive.display import pipeline_graph
from apache_beam.runners.interactive.options import capture_control
from apache_beam.runners.interactive.utils import to_element_list
from apache_beam.runners.interactive.utils import watch_sources
from apache_beam.testing.test_stream_service import TestStreamServiceController

# Size of PCollection samples cached.
SAMPLE_SIZE = 8

_LOGGER = logging.getLogger(__name__)


class InteractiveRunner(runners.PipelineRunner):
  """An interactive runner for Beam Python pipelines.

  Allows interactively building and running Beam Python pipelines.
  """
  def __init__(
      self,
      underlying_runner=None,
      render_option=None,
      skip_display=True,
      force_compute=True,
      blocking=True):
    """Constructor of InteractiveRunner.

    Args:
      underlying_runner: (runner.PipelineRunner)
      render_option: (str) this parameter decides how the pipeline graph is
          rendered. See display.pipeline_graph_renderer for available options.
      skip_display: (bool) whether to skip display operations when running
          the pipeline. Useful when running large pipelines where display is
          not needed.
      force_compute: (bool) whether to force recomputation instead of reusing
          data of PCollections cached by previous runs, including show API
          invocations from the interactive_beam module. If True, always run
          the whole pipeline and recompute data for all PCollections. If
          False, reuse available cached data and run only the minimum
          pipeline fragment needed to compute the data that is not yet
          available.
      blocking: (bool) whether the pipeline run should be blocking or not.
    """
    self._underlying_runner = (
        underlying_runner or direct_runner.DirectRunner())
    self._render_option = render_option
    self._in_session = False
    self._skip_display = skip_display
    self._force_compute = force_compute
    self._blocking = blocking

  def is_fnapi_compatible(self):
    # TODO(https://github.com/apache/beam/issues/19937):
    # return self._underlying_runner.is_fnapi_compatible()
    return False

  def set_render_option(self, render_option):
    """Sets the rendering option.

    Args:
      render_option: (str) this parameter decides how the pipeline graph is
          rendered. See display.pipeline_graph_renderer for available options.
    """
    self._render_option = render_option

  def start_session(self):
    """Starts the session that keeps back-end managers and workers alive.
    """
    if self._in_session:
      return

    enter = getattr(self._underlying_runner, '__enter__', None)
    if enter is not None:
      _LOGGER.info('Starting session.')
      self._in_session = True
      enter()
    else:
      _LOGGER.error('Keep alive not supported.')

  def end_session(self):
    """Ends the session that keeps back-end managers and workers alive.
    """
    if not self._in_session:
      return

    exit = getattr(self._underlying_runner, '__exit__', None)
    if exit is not None:
      self._in_session = False
      _LOGGER.info('Ending session.')
      exit(None, None, None)

  def apply(self, transform, pvalueish, options):
    # TODO(qinyeli, BEAM-646): Remove runner interception of apply.
    return self._underlying_runner.apply(transform, pvalueish, options)

  def run_pipeline(self, pipeline, options):
    if not ie.current_env().options.enable_recording_replay:
      capture_control.evict_captured_data()
    if self._force_compute:
      ie.current_env().evict_computed_pcollections()

    # Make sure that sources without a user reference are still cached.
    watch_sources(pipeline)

    user_pipeline = ie.current_env().user_pipeline(pipeline)

    from apache_beam.runners.portability.flink_runner import FlinkRunner
    if isinstance(self._underlying_runner, FlinkRunner):
      self.configure_for_flink(user_pipeline, options)

    pipeline_instrument = inst.build_pipeline_instrument(pipeline, options)

    # The analyzed user_pipeline might be None if the given pipeline has
    # nothing to be cached and tracing back to the user-defined pipeline is
    # impossible. When it's None, there is nothing to cache (including a
    # background caching job) and no result to track since no background
    # caching job is started at all.
    if user_pipeline:
      # Should use the underlying runner and run asynchronously.
      background_caching_job.attempt_to_run_background_caching_job(
          self._underlying_runner, user_pipeline, options)
      if (background_caching_job.has_source_to_cache(user_pipeline) and
          not background_caching_job.is_a_test_stream_service_running(
              user_pipeline)):
        streaming_cache_manager = ie.current_env().get_cache_manager(
            user_pipeline)

        # Only make the server if it doesn't exist already.
        if (streaming_cache_manager and
            not ie.current_env().get_test_stream_service_controller(
                user_pipeline)):

          def exception_handler(e):
            _LOGGER.error(str(e))
            return True

          test_stream_service = TestStreamServiceController(
              streaming_cache_manager, exception_handler=exception_handler)
          test_stream_service.start()
          ie.current_env().set_test_stream_service_controller(
              user_pipeline, test_stream_service)

    pipeline_to_execute = beam.pipeline.Pipeline.from_runner_api(
        pipeline_instrument.instrumented_pipeline_proto(),
        self._underlying_runner,
        options)

    if ie.current_env().get_test_stream_service_controller(user_pipeline):
      endpoint = ie.current_env().get_test_stream_service_controller(
          user_pipeline).endpoint

      # TODO: construct the StreamingCacheManager and
      # TestStreamServiceController when the InteractiveEnvironment is
      # imported.
      class TestStreamVisitor(PipelineVisitor):
        def visit_transform(self, transform_node):
          from apache_beam.testing.test_stream import TestStream
          if (isinstance(transform_node.transform, TestStream) and
              not transform_node.transform._events):
            transform_node.transform._endpoint = endpoint

      pipeline_to_execute.visit(TestStreamVisitor())

    if not self._skip_display:
      a_pipeline_graph = pipeline_graph.PipelineGraph(
          pipeline_instrument.original_pipeline_proto,
          render_option=self._render_option)
      a_pipeline_graph.display_graph()

    main_job_result = PipelineResult(
        pipeline_to_execute.run(), pipeline_instrument)
    # In addition to this pipeline result setting, redundant result setting
    # from outer scopes is also recommended since the user_pipeline might not
    # be available from within this scope.
    if user_pipeline:
      ie.current_env().set_pipeline_result(user_pipeline, main_job_result)

    if self._blocking:
      main_job_result.wait_until_finish()

    if main_job_result.state is beam.runners.runner.PipelineState.DONE:
      # pylint: disable=bad-option-value
      ie.current_env().mark_pcollection_computed(
          pipeline_instrument.cached_pcolls)

    return main_job_result

  def configure_for_flink(
      self, user_pipeline: beam.Pipeline, options: PipelineOptions) -> None:
    """Configures the pipeline options for running a job with Flink.

    When running with a FlinkRunner, a job server started from an uber jar
    (locally built or remotely downloaded) hosting the beam_job_api will
    communicate with the Flink cluster located at the given flink_master in
    the pipeline options.
    """
    clusters = ie.current_env().clusters
    if clusters.pipelines.get(user_pipeline, None):
      # Noop for a known pipeline using a known Dataproc cluster.
      return
    flink_master = self._strip_protocol_if_any(
        options.view_as(FlinkRunnerOptions).flink_master)
    cluster_metadata = clusters.default_cluster_metadata
    if flink_master == '[auto]':
      # Try to create/reuse a cluster when no flink_master is given.
      project_id = options.view_as(GoogleCloudOptions).project
      region = options.view_as(GoogleCloudOptions).region or 'us-central1'
      if project_id:
        if clusters.default_cluster_metadata:
          # Reuse the cluster name from the default in case of a known
          # cluster.
          cluster_metadata = ClusterMetadata(
              project_id=project_id,
              region=region,
              cluster_name=clusters.default_cluster_metadata.cluster_name)
        else:
          # Generate the metadata with a new unique cluster name.
          cluster_metadata = ClusterMetadata(
              project_id=project_id, region=region)
        # Add additional configurations.
        self._worker_options_to_cluster_metadata(options, cluster_metadata)
      # else use the default cluster metadata.
    elif flink_master in clusters.master_urls:
      cluster_metadata = clusters.cluster_metadata(flink_master)
    else:  # Noop if a self-hosted Flink is in use.
      return
    if not cluster_metadata:
      return  # Not even a default cluster to create/reuse, run Flink locally.
    dcm = clusters.create(cluster_metadata)
    # Side effects associated with the user_pipeline.
    clusters.pipelines[user_pipeline] = dcm
    dcm.pipelines.add(user_pipeline)
    self._configure_flink_options(
        options,
        clusters.DATAPROC_FLINK_VERSION,
        dcm.cluster_metadata.master_url)

  def _strip_protocol_if_any(self, flink_master: Optional[str]):
    if flink_master:
      parts = flink_master.split('://')
      if len(parts) > 1:
        return parts[1]
    return flink_master

  def _worker_options_to_cluster_metadata(
      self, options: PipelineOptions, cluster_metadata: ClusterMetadata):
    worker_options = options.view_as(WorkerOptions)
    if worker_options.subnetwork:
      cluster_metadata.subnetwork = worker_options.subnetwork
    if worker_options.num_workers:
      cluster_metadata.num_workers = worker_options.num_workers
    if worker_options.machine_type:
      cluster_metadata.machine_type = worker_options.machine_type

  def _configure_flink_options(
      self, options: PipelineOptions, flink_version: str, master_url: str):
    flink_options = options.view_as(FlinkRunnerOptions)
    flink_options.flink_version = flink_version
    # flink_options.flink_job_server_jar will be populated by
    # apache_beam.utils.subprocess_server.JavaJarServer.path_to_beam_jar;
    # do not populate it explicitly.
    flink_options.flink_master = master_url


class PipelineResult(beam.runners.runner.PipelineResult):
  """Provides access to information about a pipeline."""
  def __init__(self, underlying_result, pipeline_instrument):
    """Constructor of PipelineResult.

    Args:
      underlying_result: (PipelineResult) the result returned by the
          underlying runner running the pipeline.
      pipeline_instrument: (PipelineInstrument) pipeline instrument
          describing the pipeline being executed with interactivity applied
          and related metadata, including where the interactivity-backing
          cache lies.
    """
    super().__init__(underlying_result.state)
    self._underlying_result = underlying_result
    self._pipeline_instrument = pipeline_instrument

  @property
  def state(self):
    return self._underlying_result.state

  def wait_until_finish(self):
    self._underlying_result.wait_until_finish()

  def get(self, pcoll, include_window_info=False):
    """Materializes the PCollection into a list.

    If include_window_info is True, returns the elements as WindowedValues.
    Otherwise, returns each element as itself.
    """
    return list(self.read(pcoll, include_window_info))

  def read(self, pcoll, include_window_info=False):
    """Reads the PCollection one element at a time from cache.

    If include_window_info is True, returns the elements as WindowedValues.
    Otherwise, returns each element as itself.
336 """ 337 key = self._pipeline_instrument.cache_key(pcoll) 338 cache_manager = ie.current_env().get_cache_manager( 339 self._pipeline_instrument.user_pipeline) 340 if key and cache_manager.exists('full', key): 341 coder = cache_manager.load_pcoder('full', key) 342 reader, _ = cache_manager.read('full', key) 343 return to_element_list(reader, coder, include_window_info) 344 else: 345 raise ValueError('PCollection not available, please run the pipeline.') 346 347 def cancel(self): 348 self._underlying_result.cancel()