github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/render.py (about) 1 # 2 # Licensed to the Apache Software Foundation (ASF) under one or more 3 # contributor license agreements. See the NOTICE file distributed with 4 # this work for additional information regarding copyright ownership. 5 # The ASF licenses this file to You under the Apache License, Version 2.0 6 # (the "License"); you may not use this file except in compliance with 7 # the License. You may obtain a copy of the License at 8 # 9 # http://www.apache.org/licenses/LICENSE-2.0 10 # 11 # Unless required by applicable law or agreed to in writing, software 12 # distributed under the License is distributed on an "AS IS" BASIS, 13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 # See the License for the specific language governing permissions and 15 # limitations under the License. 16 # 17 18 """A portable "runner" that renders a beam graph. 19 20 This runner can either render the graph to a (set of) output path(s), as 21 designated by (possibly repeated) --render_output, or serve the pipeline as 22 an interactive graph, if --render_port is set. 23 24 In Python, this runner can be passed directly at pipeline construction, e.g.:: 25 26 with beam.Pipeline(runner=beam.runners.render.RenderRunner(), options=...) 27 28 For other languages, start this service a by running:: 29 30 python -m apache_beam.runners.render --job_port=PORT ... 31 32 and then run your pipline with the PortableRunner setting the job endpoint 33 to `localhost:PORT`. 34 35 If any `--render_output=path.ext` flags are passed, each submitted job will 36 get written to the given output (overwriting any previously existing file). 37 38 If `--render_port` is set to a non-negative value, a local http server will 39 be started which allows for interactive exploration of the pipeline graph. 40 41 As an alternative to starting a job server, a single pipeline can be rendered 42 by passing a pipeline proto file to `--pipeline_proto`. For example 43 44 python -m apache_beam.runners.render \\ 45 --pipeline_proto gs://<staging_location>/pipeline.pb \\ 46 --render_output=/tmp/pipeline.svg 47 48 Requires the graphviz dot executable to be available in the path. 49 """ 50 51 import argparse 52 import base64 53 import collections 54 import http.server 55 import json 56 import logging 57 import os 58 import re 59 import subprocess 60 import sys 61 import tempfile 62 import threading 63 import time 64 import urllib.parse 65 66 from google.protobuf import json_format 67 from google.protobuf import text_format # type: ignore 68 69 from apache_beam.options import pipeline_options 70 from apache_beam.portability.api import beam_runner_api_pb2 71 from apache_beam.runners import runner 72 from apache_beam.runners.portability import local_job_service 73 from apache_beam.runners.portability import local_job_service_main 74 from apache_beam.runners.portability.fn_api_runner import translations 75 76 try: 77 from apache_beam.io.gcp import gcsio 78 except ImportError: 79 gcsio = None # type: ignore 80 81 # From the Beam site, circa November 2022. 82 DEFAULT_EDGE_STYLE = 'color="#ff570b"' 83 DEFAULT_TRANSFORM_STYLE = ( 84 'shape=rect style="rounded, filled" color="#ff570b" fillcolor="#fff6dd"') 85 DEFAULT_HIGHLIGHT_STYLE = ( 86 'shape=rect style="rounded, filled" color="#ff570b" fillcolor="#ffdb97"') 87 88 89 class RenderOptions(pipeline_options.PipelineOptions): 90 """Rendering options.""" 91 @classmethod 92 def _add_argparse_args(cls, parser): 93 parser.add_argument( 94 '--render_port', 95 type=int, 96 default=-1, 97 help='The port at which to serve the graph. ' 98 'If 0, an unused port will be chosen. ' 99 'If -1, the server will not be started.') 100 parser.add_argument( 101 '--render_output', 102 action='append', 103 help='A path or paths to which to write rendered output. ' 104 'The output type will be deduced from the file extension.') 105 parser.add_argument( 106 '--render_leaf_composite_nodes', 107 action='append', 108 help='A set of regular expressions for transform names that should ' 109 'not be expanded. For example, one could pass "\bRead.*" to indicate ' 110 'the inner structure of read nodes should not be expanded. ' 111 'If not given, defaults to the top-level nodes if interactively ' 112 'serving the graph and expanding all nodes otherwise.') 113 parser.add_argument( 114 '--render_edge_attributes', 115 default='', 116 help='Graphviz attributes to add to all edges.') 117 parser.add_argument( 118 '--render_node_attributes', 119 default='', 120 help='Graphviz attributes to add to all nodes.') 121 parser.add_argument( 122 '--render_highlight_attributes', 123 default='', 124 help='Graphviz attributes to add to all highlighted nodes.') 125 parser.add_argument( 126 '--log_proto', 127 default=False, 128 action='store_true', 129 help='Set to also log input pipeline proto to stdout.') 130 return parser 131 132 133 class PipelineRenderer: 134 def __init__(self, pipeline, options): 135 self.pipeline = pipeline 136 self.options = options 137 138 # Drill down into any uninteresting, top-level transforms that contain 139 # the whole pipeline (often added by the SDK). 140 roots = self.pipeline.root_transform_ids 141 while len(roots) == 1: 142 root = self.pipeline.components.transforms[roots[0]] 143 if not root.subtransforms: 144 break 145 roots = root.subtransforms 146 self.roots = roots 147 148 # Figure out at what point to stop rendering composite internals. 149 if options.render_leaf_composite_nodes: 150 is_leaf = lambda name: any( 151 re.match(pattern, name) 152 for patterns in options.render_leaf_composite_nodes 153 for pattern in patterns.split(',')) 154 self.leaf_composites = set() 155 156 def mark_leaves(transform_ids): 157 for transform_id in transform_ids: 158 if is_leaf(transform_id): 159 self.leaf_composites.add(transform_id) 160 else: 161 mark_leaves( 162 self.pipeline.components.transforms[transform_id].subtransforms) 163 164 mark_leaves(self.roots) 165 166 elif options.render_port >= 0: 167 # Start interactive with no unfolding. 168 self.leaf_composites = set(self.roots) 169 else: 170 # For non-interactive, expand fully. 171 self.leaf_composites = set() 172 173 # Useful for attempting graph layout consistency. 174 self.latest_positions = {} 175 self.highlighted = [] 176 177 def update(self, toggle=None): 178 if toggle: 179 transform_id = toggle[0] 180 self.highlighted = [transform_id] 181 if transform_id in self.leaf_composites: 182 transform = self.pipeline.components.transforms[transform_id] 183 if transform.subtransforms: 184 self.leaf_composites.remove(transform_id) 185 for subtransform in transform.subtransforms: 186 self.leaf_composites.add(subtransform) 187 if transform_id in self.latest_positions: 188 self.latest_positions[subtransform] = self.latest_positions[ 189 transform_id] 190 else: 191 self.leaf_composites.add(transform_id) 192 193 def style(self, transform_id): 194 base = ' '.join( 195 [DEFAULT_TRANSFORM_STYLE, self.options.render_node_attributes]) 196 if transform_id in self.highlighted: 197 return ' '.join([ 198 base, 199 DEFAULT_HIGHLIGHT_STYLE, 200 self.options.render_highlight_attributes 201 ]) 202 else: 203 return base 204 205 def to_dot(self): 206 return '\n'.join(self.to_dot_iter()) 207 208 def to_dot_iter(self): 209 yield 'digraph G {' 210 # Defer drawing any edges until the end lest we declare nodes too early. 211 edges_out = [] 212 for transform_id in self.roots: 213 yield from self.transform_to_dot( 214 transform_id, self.pcoll_leaf_consumers(), edges_out) 215 yield from edges_out 216 yield '}' 217 218 def transform_to_dot(self, transform_id, pcoll_leaf_consumers, edges_out): 219 transform = self.pipeline.components.transforms[transform_id] 220 if self.is_leaf(transform_id): 221 yield self.transform_node(transform_id) 222 transform_inputs = set(transform.inputs.values()) 223 for name, output in transform.outputs.items(): 224 # For outputs that are also inputs, it's ambiguous whether they are 225 # consumed as the outputs of this transform, or of the upstream 226 # transform. Render the latter. 227 if output in transform_inputs: 228 continue 229 output_label = name if len(transform.outputs) > 1 else '' 230 for consumer, is_side_input in pcoll_leaf_consumers[output]: 231 # Can't yield this here as the consumer might not be in this cluster. 232 edge_style = 'dashed' if is_side_input else 'solid' 233 edge_attributes = ' '.join([ 234 f'label="{output_label}" style={edge_style}', 235 DEFAULT_EDGE_STYLE, 236 self.options.render_edge_attributes 237 ]) 238 edges_out.append( 239 f'"{transform_id}" -> "{consumer}" [{edge_attributes}]') 240 else: 241 yield f'subgraph "cluster_{transform_id}" {{' 242 yield self.transform_attributes(transform_id) 243 for subtransform in transform.subtransforms: 244 yield from self.transform_to_dot( 245 subtransform, pcoll_leaf_consumers, edges_out) 246 yield '}' 247 248 def transform_node(self, transform_id): 249 return f'"{transform_id}" [{self.transform_attributes(transform_id)}]' 250 251 def transform_attributes(self, transform_id): 252 transform = self.pipeline.components.transforms[transform_id] 253 local_name = transform.unique_name.split('/')[-1] 254 if transform_id in self.latest_positions: 255 pos_str = f'pos="{self.latest_positions[transform_id]}"' 256 else: 257 pos_str = '' 258 return ( 259 f'label="{local_name}" {self.style(transform_id)} ' 260 f'URL="javascript:click(\'{transform_id}\')" {pos_str}') 261 262 def pcoll_leaf_consumers_iter(self, transform_id): 263 transform = self.pipeline.components.transforms[transform_id] 264 transform_inputs = set(transform.inputs.values()) 265 side_inputs = set(translations.side_inputs(transform).values()) 266 if self.is_leaf(transform_id): 267 for pcoll in transform.inputs.values(): 268 yield pcoll, (transform_id, pcoll in side_inputs) 269 for subtransform in transform.subtransforms: 270 for pcoll, (consumer, 271 annotation) in self.pcoll_leaf_consumers_iter(subtransform): 272 if self.is_leaf(transform_id): 273 if pcoll not in transform_inputs: 274 yield pcoll, (transform_id, annotation) 275 else: 276 yield pcoll, (consumer, annotation) 277 278 def pcoll_leaf_consumers(self): 279 result = collections.defaultdict(list) 280 for transform_id in self.roots: 281 for pcoll, consumer_info in self.pcoll_leaf_consumers_iter(transform_id): 282 result[pcoll].append(consumer_info) 283 return result 284 285 def is_leaf(self, transform_id): 286 return ( 287 transform_id in self.leaf_composites or 288 not self.pipeline.components.transforms[transform_id].subtransforms) 289 290 def info(self): 291 if len(self.highlighted) != 1: 292 return '' 293 transform_id = self.highlighted[0] 294 return f'<pre>{self.pipeline.components.transforms[transform_id]}</pre>' 295 296 def layout_dot(self): 297 layout = subprocess.run(['dot', '-Tdot'], 298 input=self.to_dot().encode('utf-8'), 299 capture_output=True, 300 check=True).stdout 301 302 # Try to capture the positions for layout consistency. 303 json_out = json.loads( 304 subprocess.run(['dot', '-n2', '-Kneato', '-Tjson'], 305 input=layout, 306 capture_output=True, 307 check=True).stdout) 308 for box in json_out['objects']: 309 name = box.get('name', None) 310 if name in self.pipeline.components.transforms: 311 if 'pos' in box: 312 self.latest_positions[name] = box['pos'] 313 elif 'bb' in box: 314 x0, y0, x1, y1 = [float(r) for r in box['bb'].split(',')] 315 self.latest_positions[name] = f'{(x0+x1)/2},{(y0+y1)/2}' 316 317 return layout 318 319 def page_callback_data(self, layout): 320 svg = subprocess.run(['dot', '-Kneato', '-n2', '-Tsvg'], 321 input=layout, 322 capture_output=True, 323 check=True).stdout 324 cmapx = subprocess.run(['dot', '-Kneato', '-n2', '-Tcmapx'], 325 input=layout, 326 capture_output=True, 327 check=True).stdout 328 329 return { 330 'src': 'data:image/svg+xml;base64,' + 331 base64.b64encode(svg).decode('utf-8'), 332 'cmapx': cmapx.decode('utf-8'), 333 'info': self.info(), 334 } 335 336 def render_data(self): 337 logging.info("Re-rendering pipeline...") 338 layout = self.layout_dot() 339 if self.options.render_output: 340 for path in self.options.render_output: 341 format = os.path.splitext(path)[-1][1:] 342 result = subprocess.run( 343 ['dot', '-Kneato', '-n2', '-T' + format, '-o', path], 344 input=layout, 345 check=False) 346 if result.returncode: 347 logging.error( 348 "Failed render pipeline as %r: exit %s", path, result.returncode) 349 else: 350 logging.info("Rendered pipeline as %r", path) 351 return self.page_callback_data(layout) 352 353 def render_json(self): 354 return json.dumps(self.render_data()) 355 356 def page(self): 357 data = self.render_data() 358 src = data['src'] 359 cmapx = data['cmapx'] 360 return """ 361 <html> 362 <head> 363 <script> 364 function click(transform_id) { 365 var xhttp = new XMLHttpRequest(); 366 xhttp.onreadystatechange = function() { 367 render_data = JSON.parse(this.responseText); 368 document.getElementById('image_map_holder').innerHTML = 369 render_data.cmapx; 370 document.getElementById('image_tag').src = render_data.src 371 document.getElementById('info').innerHTML = render_data.info 372 }; 373 xhttp.open("GET", "render?toggle=" + transform_id, true); 374 xhttp.send(); 375 } 376 377 </script> 378 </head> 379 """ + f""" 380 <body> 381 Click on a composite transform to expand. 382 <br> 383 <img id='image_tag' src='{src}' usemap='#G'> 384 <hr> 385 <div id='info'></div> 386 <div id='image_map_holder'> 387 {cmapx} 388 </div> 389 </body> 390 </html> 391 """ 392 393 394 class RenderRunner(runner.PipelineRunner): 395 # TODO(robertwb): Consider making this a runner wrapper, where live status 396 # (such as counters, stage completion status, or possibly even PCollection 397 # samples) queryable and/or displayed. This could evolve into a full Beam 398 # UI. 399 def run_pipeline(self, pipeline_object, options, pipeline_proto=None): 400 if not pipeline_proto: 401 pipeline_proto = pipeline_object.to_runner_api() 402 render_options = options.view_as(RenderOptions) 403 if render_options.log_proto: 404 logging.info(pipeline_proto) 405 renderer = PipelineRenderer(pipeline_proto, render_options) 406 renderer.page() 407 408 if render_options.render_port >= 0: 409 # TODO: If this gets more complex, we could consider taking on a 410 # framework like Flask as a dependency. 411 class RequestHandler(http.server.BaseHTTPRequestHandler): 412 def do_GET(self): 413 parts = urllib.parse.urlparse(self.path) 414 args = urllib.parse.parse_qs(parts.query) 415 renderer.update(**args) 416 417 if parts.path == '/': 418 response = renderer.page() 419 elif parts.path == '/render': 420 response = renderer.render_json() 421 else: 422 self.send_response(400) 423 return 424 425 self.send_response(200) 426 self.send_header("Content-type", "text/html") 427 self.end_headers() 428 self.wfile.write(response.encode('utf-8')) 429 430 server = http.server.HTTPServer(('localhost', render_options.render_port), 431 RequestHandler) 432 server_thread = threading.Thread(target=server.serve_forever, daemon=True) 433 server_thread.start() 434 print('Serving at http://%s:%s' % server.server_address) 435 return RenderPipelineResult(server) 436 437 else: 438 return RenderPipelineResult(None) 439 440 441 class RenderPipelineResult(runner.PipelineResult): 442 def __init__(self, server): 443 super().__init__(runner.PipelineState.RUNNING) 444 self.server = server 445 446 def wait_until_finish(self, duration=None): 447 if self.server: 448 time.sleep(duration or 1e8) 449 self.server.shutdown() 450 self._state = runner.PipelineState.DONE 451 452 def monitoring_infos(self): 453 return [] 454 455 456 def run(argv): 457 if argv[0] == __file__: 458 argv = argv[1:] 459 parser = argparse.ArgumentParser( 460 description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) 461 parser.add_argument( 462 '--job_port', 463 type=int, 464 default=0, 465 help='port on which to serve the job api') 466 parser.add_argument( 467 '--pipeline_proto', help='file containing the beam pipeline definition') 468 RenderOptions._add_argparse_args(parser) 469 options = parser.parse_args(argv) 470 471 if options.pipeline_proto: 472 if not options.render_output and options.render_port < 0: 473 options.render_port = 0 474 475 render_one(options) 476 477 if options.render_output: 478 return 479 480 run_server(options) 481 482 483 def render_one(options): 484 if options.pipeline_proto == '-': 485 content = sys.stdin.buffer.read() 486 if content[0] == b'{': 487 ext = '.json' 488 else: 489 try: 490 content.decode('utf-8') 491 ext = '.textproto' 492 except UnicodeDecodeError: 493 ext = '.pb' 494 else: 495 if options.pipeline_proto.startswith('gs://'): 496 if gcsio is None: 497 raise ImportError('GCS not available; please install apache_beam[gcp]') 498 open_fn = gcsio.GcsIO().open 499 else: 500 open_fn = open 501 502 with open_fn(options.pipeline_proto, 'rb') as fin: 503 content = fin.read() 504 ext = os.path.splitext(options.pipeline_proto)[-1] 505 506 if ext == '.textproto': 507 pipeline_proto = text_format.Parse(content, beam_runner_api_pb2.Pipeline()) 508 elif ext == '.json': 509 pipeline_proto = json_format.Parse(content, beam_runner_api_pb2.Pipeline()) 510 else: 511 pipeline_proto = beam_runner_api_pb2.Pipeline() 512 pipeline_proto.ParseFromString(content) 513 514 RenderRunner().run_pipeline( 515 None, pipeline_options.PipelineOptions(**vars(options)), pipeline_proto) 516 517 518 def run_server(options): 519 class RenderBeamJob(local_job_service.BeamJob): 520 def _invoke_runner(self): 521 return RenderRunner().run_pipeline( 522 None, 523 pipeline_options.PipelineOptions(**vars(options)), 524 self._pipeline_proto) 525 526 with tempfile.TemporaryDirectory() as staging_dir: 527 job_servicer = local_job_service.LocalJobServicer( 528 staging_dir, beam_job_type=RenderBeamJob) 529 port = job_servicer.start_grpc_server(options.job_port) 530 try: 531 local_job_service_main.serve( 532 "Listening for beam jobs on port %d." % port, job_servicer) 533 finally: 534 job_servicer.stop() 535 536 537 if __name__ == '__main__': 538 logging.basicConfig() 539 logging.getLogger().setLevel(logging.INFO) 540 run(sys.argv)