github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/interactive/display/display_manager.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Manages displaying pipeline graph and execution status on the frontend.
    19  
    20  This module is experimental. No backwards-compatibility guarantees.
    21  """
    22  
    23  # pytype: skip-file
    24  
    25  import collections
    26  import threading
    27  import time
    28  from typing import TYPE_CHECKING
    29  
    30  from apache_beam.runners.interactive.display import interactive_pipeline_graph
    31  
# IPython is treated as an optional dependency. When it can be imported,
# progress output goes through IPython's `display`; otherwise the except
# branch below falls back to the built-in `print` and sets `IPython` to None
# so that later code can test `if IPython:`.
try:
  import IPython  # pylint: disable=import-error
  from IPython import get_ipython  # pylint: disable=import-error
  from IPython.display import display as ip_display  # pylint: disable=import-error
  # _display_progress defines how outputs are printed on the frontend.
  _display_progress = ip_display

  # Only executed at runtime (TYPE_CHECKING is False outside of static type
  # checkers), so type checkers never see the formatter registration below.
  if not TYPE_CHECKING:

    # Pretty-printer callback with the (obj, printer, cycle) signature used by
    # `for_type` below: writes `string` through `pp.text` as-is.
    # NOTE(review): presumably so str values render without repr quoting --
    # confirm against IPython's text/plain formatter behaviour.
    def _formatter(string, pp, cycle):  # pylint: disable=unused-argument
      pp.text(string)

    # Register the callback only when get_ipython() returns a truthy shell
    # object, i.e. an interactive shell is actually available.
    if get_ipython():
      plain = get_ipython().display_formatter.formatters['text/plain']  # pylint: disable=undefined-variable
      # Use _formatter for every `str` value shown via the text/plain formatter.
      plain.for_type(str, _formatter)

except ImportError:
  # IPython could not be imported: mark it as absent and print progress with
  # the built-in print function instead.
  IPython = None
  _display_progress = print
    51  
    52  
    53  class DisplayManager(object):
    54    """Manages displaying pipeline graph and execution status on the frontend."""
    55    def __init__(
    56        self,
    57        pipeline_proto,
    58        pipeline_analyzer,
    59        cache_manager,
    60        pipeline_graph_renderer):
    61      """Constructor of DisplayManager.
    62  
    63      Args:
    64        pipeline_proto: (Pipeline proto)
    65        pipeline_analyzer: (PipelineAnalyzer) the pipeline analyzer that
    66            corresponds to this round of execution. This will provide more
    67            detailed informations about the pipeline
    68        cache_manager: (interactive_runner.CacheManager) DisplayManager fetches
    69            the latest status of pipeline execution by querying cache_manager.
    70        pipeline_graph_renderer: (pipeline_graph_renderer.PipelineGraphRenderer)
    71            decides how a pipeline graph is rendered.
    72      """
    73      # Every parameter except cache_manager is expected to remain constant.
    74      self._analyzer = pipeline_analyzer
    75      self._cache_manager = cache_manager
    76      self._pipeline_graph = interactive_pipeline_graph.InteractivePipelineGraph(
    77          pipeline_proto,
    78          required_transforms=self._analyzer.tl_required_trans_ids(),
    79          referenced_pcollections=self._analyzer.tl_referenced_pcoll_ids(),
    80          cached_pcollections=self._analyzer.caches_used())
    81      self._renderer = pipeline_graph_renderer
    82  
    83      # _text_to_print keeps track of information to be displayed.
    84      self._text_to_print = collections.OrderedDict()
    85      self._text_to_print['summary'] = (
    86          'Using %s cached PCollections\nExecuting %s of %s '
    87          'transforms.') % (
    88              len(self._analyzer.caches_used()),
    89              (
    90                  len(self._analyzer.tl_required_trans_ids()) -
    91                  len(self._analyzer.read_cache_ids()) -
    92                  len(self._analyzer.write_cache_ids())),
    93              len(
    94                  pipeline_proto.components.transforms[
    95                      pipeline_proto.root_transform_ids[0]].subtransforms))
    96      self._text_to_print.update(
    97          {pcoll_id: ""
    98           for pcoll_id in self._analyzer.tl_referenced_pcoll_ids()})
    99  
   100      # _pcollection_stats maps pcoll_id to
   101      # { 'cache_label': cache_label, version': version, 'sample': pcoll_in_list }
   102      self._pcollection_stats = {}
   103      for pcoll_id in self._analyzer.tl_referenced_pcoll_ids():
   104        self._pcollection_stats[pcoll_id] = {
   105            'cache_label': self._analyzer.pipeline_info().cache_label(pcoll_id),
   106            'version': -1,
   107            'sample': []
   108        }
   109  
   110      self._producers = {}
   111      for _, transform in pipeline_proto.components.transforms.items():
   112        for pcoll_id in transform.outputs.values():
   113          if pcoll_id not in self._producers or '/' not in transform.unique_name:
   114            self._producers[pcoll_id] = transform.unique_name
   115  
   116      # For periodic update.
   117      self._lock = threading.Lock()
   118      self._periodic_update = False
   119  
   120    def update_display(self, force=False):
   121      """Updates display on the frontend.
   122  
   123      Retrieves the latest execution status by querying CacheManager and updates
   124      display on the fronend. The assumption is that there is only one pipeline in
   125      a cell, because it clears up everything in the cell output every update
   126      cycle.
   127  
   128      Args:
   129        force: (bool) whether to force updating when no stats change happens.
   130      """
   131      with self._lock:
   132        stats_updated = False
   133  
   134        for pcoll_id, stats in self._pcollection_stats.items():
   135          cache_label = stats['cache_label']
   136          version = stats['version']
   137  
   138          if force or not self._cache_manager.is_latest_version(
   139              version, 'sample', cache_label):
   140            pcoll_list, version = self._cache_manager.read('sample', cache_label)
   141            stats['sample'] = list(pcoll_list)
   142            stats['version'] = version
   143            stats_updated = True
   144  
   145            if pcoll_id in self._analyzer.tl_referenced_pcoll_ids():
   146              self._text_to_print[pcoll_id] = (
   147                  str(
   148                      '%s produced %s' % (
   149                          self._producers[pcoll_id],
   150                          interactive_pipeline_graph.format_sample(pcoll_list,
   151                                                                   5))))
   152  
   153        if force or stats_updated:
   154          self._pipeline_graph.update_pcollection_stats(self._pcollection_stats)
   155  
   156          if IPython:
   157            from IPython import display
   158            display.clear_output(True)
   159            rendered_graph = self._renderer.render_pipeline_graph(
   160                self._pipeline_graph)
   161            display.display(display.HTML(rendered_graph))
   162  
   163          _display_progress('Running...')
   164          for text in self._text_to_print.values():
   165            if text != "":
   166              _display_progress(text)
   167  
   168    def start_periodic_update(self):
   169      """Start a thread that periodically updates the display."""
   170      self.update_display(True)
   171      self._periodic_update = True
   172  
   173      def _updater():
   174        while self._periodic_update:
   175          self.update_display()
   176          time.sleep(.02)
   177  
   178      t = threading.Thread(target=_updater)
   179      t.daemon = True
   180      t.start()
   181  
   182    def stop_periodic_update(self):
   183      """Stop periodically updating the display."""
   184      self.update_display(True)
   185      self._periodic_update = False