# Source: github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/interactive/display/display_manager.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Manages displaying pipeline graph and execution status on the frontend.

This module is experimental. No backwards-compatibility guarantees.
"""

# pytype: skip-file

import collections
import threading
import time
from typing import TYPE_CHECKING

from apache_beam.runners.interactive.display import interactive_pipeline_graph

try:
    import IPython  # pylint: disable=import-error
    from IPython import get_ipython  # pylint: disable=import-error
    from IPython.display import display as ip_display  # pylint: disable=import-error
    # _display_progress defines how outputs are printed on the frontend.
    _display_progress = ip_display

    if not TYPE_CHECKING:

        def _formatter(string, pp, cycle):  # pylint: disable=unused-argument
            # Emit the string verbatim instead of its repr().
            pp.text(string)

        if get_ipython():
            # Register the verbatim formatter for `str` on the running shell's
            # text/plain formatter.
            plain = get_ipython().display_formatter.formatters['text/plain']  # pylint: disable=undefined-variable
            plain.for_type(str, _formatter)

except ImportError:
    # Without IPython, fall back to plain stdout printing; `IPython = None`
    # is what update_display() checks before rendering HTML.
    IPython = None
    _display_progress = print


class DisplayManager(object):
    """Manages displaying pipeline graph and execution status on the frontend."""

    def __init__(
            self,
            pipeline_proto,
            pipeline_analyzer,
            cache_manager,
            pipeline_graph_renderer):
        """Constructor of DisplayManager.

        Args:
          pipeline_proto: (Pipeline proto)
          pipeline_analyzer: (PipelineAnalyzer) the pipeline analyzer that
            corresponds to this round of execution. This will provide more
            detailed information about the pipeline.
          cache_manager: (interactive_runner.CacheManager) DisplayManager
            fetches the latest status of pipeline execution by querying
            cache_manager.
          pipeline_graph_renderer:
            (pipeline_graph_renderer.PipelineGraphRenderer) decides how a
            pipeline graph is rendered.
        """
        # Every parameter except cache_manager is expected to remain constant.
        self._analyzer = pipeline_analyzer
        self._cache_manager = cache_manager
        self._pipeline_graph = (
            interactive_pipeline_graph.InteractivePipelineGraph(
                pipeline_proto,
                required_transforms=self._analyzer.tl_required_trans_ids(),
                referenced_pcollections=(
                    self._analyzer.tl_referenced_pcoll_ids()),
                cached_pcollections=self._analyzer.caches_used()))
        self._renderer = pipeline_graph_renderer

        # _text_to_print keeps track of information to be displayed. It is
        # ordered so the summary is always printed first.
        self._text_to_print = collections.OrderedDict()
        self._text_to_print['summary'] = (
            'Using %s cached PCollections\nExecuting %s of %s '
            'transforms.') % (
                len(self._analyzer.caches_used()),
                (
                    len(self._analyzer.tl_required_trans_ids()) -
                    len(self._analyzer.read_cache_ids()) -
                    len(self._analyzer.write_cache_ids())),
                len(
                    pipeline_proto.components.transforms[
                        pipeline_proto.root_transform_ids[0]].subtransforms))
        # Placeholders: an empty string means "nothing to print yet".
        self._text_to_print.update({
            pcoll_id: ""
            for pcoll_id in self._analyzer.tl_referenced_pcoll_ids()
        })

        # _pcollection_stats maps pcoll_id to
        # { 'cache_label': cache_label, 'version': version,
        #   'sample': pcoll_in_list }
        self._pcollection_stats = {}
        for pcoll_id in self._analyzer.tl_referenced_pcoll_ids():
            self._pcollection_stats[pcoll_id] = {
                'cache_label':
                    self._analyzer.pipeline_info().cache_label(pcoll_id),
                # -1 is the initial version before anything has been read.
                'version': -1,
                'sample': []
            }

        # Maps pcoll_id to the unique_name of the transform reported as its
        # producer. The first transform seen wins unless a later one has no
        # '/' in its unique_name, in which case the later one replaces it.
        self._producers = {}
        for _, transform in pipeline_proto.components.transforms.items():
            for pcoll_id in transform.outputs.values():
                if (pcoll_id not in self._producers or
                        '/' not in transform.unique_name):
                    self._producers[pcoll_id] = transform.unique_name

        # For periodic update.
        self._lock = threading.Lock()
        self._periodic_update = False

    def update_display(self, force=False):
        """Updates display on the frontend.

        Retrieves the latest execution status by querying CacheManager and
        updates display on the frontend. The assumption is that there is only
        one pipeline in a cell, because it clears up everything in the cell
        output every update cycle.

        Args:
          force: (bool) whether to force updating when no stats change
            happens.
        """
        with self._lock:
            stats_updated = False

            for pcoll_id, stats in self._pcollection_stats.items():
                cache_label = stats['cache_label']
                version = stats['version']

                if force or not self._cache_manager.is_latest_version(
                        version, 'sample', cache_label):
                    pcoll_list, version = self._cache_manager.read(
                        'sample', cache_label)
                    # Materialize exactly once and reuse the list below.
                    # NOTE(review): if read() returns a one-shot iterable,
                    # handing `pcoll_list` to format_sample after list() has
                    # consumed it would format an empty sample.
                    sample = list(pcoll_list)
                    stats['sample'] = sample
                    stats['version'] = version
                    stats_updated = True

                    if pcoll_id in self._analyzer.tl_referenced_pcoll_ids():
                        self._text_to_print[pcoll_id] = (
                            '%s produced %s' % (
                                self._producers[pcoll_id],
                                interactive_pipeline_graph.format_sample(
                                    sample, 5)))

            if force or stats_updated:
                self._pipeline_graph.update_pcollection_stats(
                    self._pcollection_stats)

                if IPython:
                    from IPython import display
                    # wait=True: defer clearing until new output is ready.
                    display.clear_output(True)
                    rendered_graph = self._renderer.render_pipeline_graph(
                        self._pipeline_graph)
                    display.display(display.HTML(rendered_graph))

                _display_progress('Running...')
                for text in self._text_to_print.values():
                    # Skip placeholders that have no sample text yet.
                    if text:
                        _display_progress(text)

    def start_periodic_update(self):
        """Start a thread that periodically updates the display."""
        self.update_display(True)
        self._periodic_update = True

        def _updater():
            # Polls until stop_periodic_update() clears the flag.
            while self._periodic_update:
                self.update_display()
                time.sleep(.02)

        t = threading.Thread(target=_updater)
        # Daemon thread: it must not keep the interpreter alive on exit.
        t.daemon = True
        t.start()

    def stop_periodic_update(self):
        """Stop periodically updating the display.

        Performs one final forced refresh, then signals the updater thread to
        exit. The flag is cleared even if the final refresh raises, so the
        background thread cannot be left polling forever.
        """
        try:
            self.update_display(True)
        finally:
            self._periodic_update = False