github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/interactive/interactive_environment.py

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Module of the current Interactive Beam environment.
    19  
    20  For internal use only; no backwards-compatibility guarantees.
     21  Provides interfaces to interact with the existing Interactive Beam
     22  environment. External Interactive Beam users should use the interactive_beam
     23  module in application code or notebooks instead.
    24  """
    25  # pytype: skip-file
    26  
    27  import atexit
    28  import importlib
    29  import logging
    30  import os
    31  import tempfile
    32  import warnings
    33  from collections.abc import Iterable
    34  from pathlib import PurePath
    35  
    36  import apache_beam as beam
    37  from apache_beam.runners import DataflowRunner
    38  from apache_beam.runners import runner
    39  from apache_beam.runners.direct import direct_runner
    40  from apache_beam.runners.interactive import cache_manager as cache
    41  from apache_beam.runners.interactive.messaging.interactive_environment_inspector import InteractiveEnvironmentInspector
    42  from apache_beam.runners.interactive.recording_manager import RecordingManager
    43  from apache_beam.runners.interactive.sql.sql_chain import SqlChain
    44  from apache_beam.runners.interactive.user_pipeline_tracker import UserPipelineTracker
    45  from apache_beam.runners.interactive.utils import assert_bucket_exists
    46  from apache_beam.runners.interactive.utils import detect_pipeline_runner
    47  from apache_beam.runners.interactive.utils import register_ipython_log_handler
    48  from apache_beam.utils.interactive_utils import is_in_ipython
    49  from apache_beam.utils.interactive_utils import is_in_notebook
    50  
    51  # Interactive Beam user flow is data-centric rather than pipeline-centric, so
    52  # there is only one global interactive environment instance that manages
     53  # the implementation that enables interactivity.
    54  _interactive_beam_env = None
    55  
    56  _LOGGER = logging.getLogger(__name__)
    57  
     58  # By `format(customized_script=xxx)`, the given `customized_script` is
     59  # guaranteed to execute with access to a jQuery instance that has the
     60  # DataTables plugin configured, which makes any `customized_script` resilient
     61  # to browser refresh. Inside `customized_script`, use `$` as jQuery.
    62  _JQUERY_WITH_DATATABLE_TEMPLATE = """
    63          if (typeof window.interactive_beam_jquery == 'undefined') {{
    64            var jqueryScript = document.createElement('script');
    65            jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';
    66            jqueryScript.type = 'text/javascript';
    67            jqueryScript.onload = function() {{
    68              var datatableScript = document.createElement('script');
    69              datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';
    70              datatableScript.type = 'text/javascript';
    71              datatableScript.onload = function() {{
    72                window.interactive_beam_jquery = jQuery.noConflict(true);
    73                window.interactive_beam_jquery(document).ready(function($){{
    74                  {customized_script}
    75                }});
    76              }}
    77              document.head.appendChild(datatableScript);
    78            }};
    79            document.head.appendChild(jqueryScript);
    80          }} else {{
    81            window.interactive_beam_jquery(document).ready(function($){{
    82              {customized_script}
    83            }});
    84          }}"""
    85  
     86  # By `format(hrefs=xxx)`, the given `hrefs` are loaded as HTML imports.
     87  # Because HTML imports might not be supported by the browser, we check for
     88  # support first: if available, import the HTMLs directly; otherwise, set up
     89  # webcomponents and chain the HTML imports to the end of the script's onload.
    90  _HTML_IMPORT_TEMPLATE = """
    91          var import_html = () => {{
    92            {hrefs}.forEach(href => {{
    93              var link = document.createElement('link');
    94              link.rel = 'import'
    95              link.href = href;
    96              document.head.appendChild(link);
    97            }});
    98          }}
    99          if ('import' in document.createElement('link')) {{
   100            import_html();
   101          }} else {{
   102            var webcomponentScript = document.createElement('script');
   103            webcomponentScript.src = 'https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js';
   104            webcomponentScript.type = 'text/javascript';
   105            webcomponentScript.onload = function(){{
   106              import_html();
   107            }};
   108            document.head.appendChild(webcomponentScript);
   109          }}"""
   110  
   111  
   112  def current_env():
   113    """Gets current Interactive Beam environment."""
   114    global _interactive_beam_env
   115    if not _interactive_beam_env:
   116      _interactive_beam_env = InteractiveEnvironment()
   117    return _interactive_beam_env
   118  
   119  
   120  def new_env():
   121    """Creates a new Interactive Beam environment to replace current one."""
   122    global _interactive_beam_env
   123    if _interactive_beam_env:
   124      _interactive_beam_env.cleanup()
   125    _interactive_beam_env = None
   126    return current_env()
   127  
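         # Illustrative usage of the two helpers above (a sketch only, not executed
         # by this module; `ie` is an assumed import alias):
         #
         #   from apache_beam.runners.interactive import interactive_environment as ie
         #
         #   env = ie.current_env()      # lazily creates the singleton on first use
         #   assert env is ie.current_env()
         #   fresh_env = ie.new_env()    # cleans up and replaces the singleton
         #   assert fresh_env is not env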
   128  
   129  class InteractiveEnvironment(object):
   130    """An interactive environment with cache and pipeline variable metadata.
   131  
    132    Interactive Beam uses the watched variable information to determine whether
    133    a PCollection is assigned to a variable in the user pipeline definition.
    134    When executing the pipeline, interactivity is applied with an implicit cache
    135    mechanism for those PCollections if the pipeline is interactive. Users can
   136    also visualize and introspect those PCollections in user code since they have
   137    handles to the variables.
   138    """
   139    def __init__(self):
   140      # Registers a cleanup routine when system exits.
   141      atexit.register(self.cleanup)
   142      # Holds cache managers that manage source recording and intermediate
   143      # PCollection cache for each pipeline. Each key is a stringified user
   144      # defined pipeline instance's id.
   145      self._cache_managers = {}
   146      # Holds RecordingManagers keyed by pipeline instance id.
   147      self._recording_managers = {}
    148      # Holds class instances, module objects, or strings of module names.
   149      self._watching_set = set()
    150      # Holds items() views of watched variable dicts (e.g. from locals()).
   151      self._watching_dict_list = []
   152      # Holds results of main jobs as Dict[str, PipelineResult].
    153      # Each key is the stringified id of a pipeline instance defined by the
    154      # end user. The InteractiveRunner is responsible for populating this
    155      # dictionary implicitly.
   156      self._main_pipeline_results = {}
   157      # Holds background caching jobs as Dict[str, BackgroundCachingJob].
    158      # Each key is the stringified id of a pipeline instance defined by the end
    159      # user. The InteractiveRunner or its enclosing scope populates this
    160      # dictionary implicitly when a background caching job is started.
   161      self._background_caching_jobs = {}
    162      # Holds TestStreamServiceControllers that control gRPC servers serving
    163      # events as a test stream of TestStreamPayload.Event.
    164      # Dict[str, TestStreamServiceController]. Each key is the stringified id
    165      # of a pipeline instance defined by the end user. The InteractiveRunner or
    166      # its enclosing scope populates this dictionary implicitly when a new
    167      # controller is created to start a new gRPC server. The server stays alive
    168      # until a new background caching job is started, thus invalidating
    169      # everything the gRPC server serves.
   170      self._test_stream_service_controllers = {}
   171      self._cached_source_signature = {}
   172      self._tracked_user_pipelines = UserPipelineTracker()
   173      from apache_beam.runners.interactive.interactive_beam import clusters
   174      self.clusters = clusters
   175  
   176      # Tracks the computation completeness of PCollections. PCollections tracked
   177      # here don't need to be re-computed when data introspection is needed.
   178      self._computed_pcolls = set()
   179      # Always watch __main__ module.
   180      self.watch('__main__')
   181      # Check if [interactive] dependencies are installed.
   182      try:
   183        import IPython  # pylint: disable=unused-import
   184        import timeloop  # pylint: disable=unused-import
   185        from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator  # pylint: disable=unused-import
   186        from google.cloud import dataproc_v1  # pylint: disable=unused-import
   187        self._is_interactive_ready = True
   188      except ImportError:
   189        self._is_interactive_ready = False
   190        _LOGGER.warning(
   191            'Dependencies required for Interactive Beam PCollection '
   192            'visualization are not available, please use: `pip '
   193            'install apache-beam[interactive]` to install necessary '
   194            'dependencies to enable all data visualization features.')
   195  
   196      self._is_in_ipython = is_in_ipython()
   197      self._is_in_notebook = is_in_notebook()
   198      if not self._is_in_ipython:
   199        _LOGGER.warning(
   200            'You cannot use Interactive Beam features when you are '
   201            'not in an interactive environment such as a Jupyter '
   202            'notebook or ipython terminal.')
   203      if self._is_in_ipython and not self._is_in_notebook:
   204        _LOGGER.warning(
   205            'You have limited Interactive Beam features since your '
   206            'ipython kernel is not connected to any notebook frontend.')
   207      if self._is_in_notebook:
   208        self.load_jquery_with_datatable()
   209        register_ipython_log_handler()
   210  
    211      # A singleton inspector instance that communicates information about the
    212      # current environment to other applications.
   213      self._inspector = InteractiveEnvironmentInspector()
   214      # A similar singleton inspector except it includes synthetic variables
   215      # generated by Interactive Beam.
   216      self._inspector_with_synthetic = InteractiveEnvironmentInspector(
   217          ignore_synthetic=False)
   218  
   219      self.sql_chain = {}
   220  
   221    @property
   222    def options(self):
   223      """A reference to the global interactive options.
   224  
    225      Provided to avoid an import loop or excessive dynamic imports. All internal
   226      Interactive Beam modules should access interactive_beam.options through
   227      this property.
   228      """
   229      from apache_beam.runners.interactive.interactive_beam import options
   230      return options
   231  
   232    @property
   233    def is_interactive_ready(self):
   234      """If the [interactive] dependencies are installed."""
   235      return self._is_interactive_ready
   236  
   237    @property
   238    def is_in_ipython(self):
   239      """If the runtime is within an IPython kernel."""
   240      return self._is_in_ipython
   241  
   242    @property
   243    def is_in_notebook(self):
   244      """If the kernel is connected to a notebook frontend.
   245  
    246      If not, the user might be using the kernel in a terminal or running a unit
    247      test.
   248      """
   249      return self._is_in_notebook
   250  
   251    @property
   252    def inspector(self):
   253      """Gets the singleton InteractiveEnvironmentInspector to retrieve
   254      information consumable by other applications such as a notebook
   255      extension."""
   256      return self._inspector
   257  
   258    @property
   259    def inspector_with_synthetic(self):
   260      """Gets the singleton InteractiveEnvironmentInspector with additional
   261      synthetic variables generated by Interactive Beam. Internally used."""
   262      return self._inspector_with_synthetic
   263  
   264    def cleanup_pipeline(self, pipeline):
   265      from apache_beam.runners.interactive import background_caching_job as bcj
   266      bcj.attempt_to_cancel_background_caching_job(pipeline)
   267      bcj.attempt_to_stop_test_stream_service(pipeline)
   268      cache_manager = self.get_cache_manager(pipeline)
   269      # Recording manager performs cache manager cleanup during eviction, so we
   270      # don't need to clean it up here.
   271      if cache_manager and self.get_recording_manager(pipeline) is None:
   272        cache_manager.cleanup()
   273      self.clusters.cleanup(pipeline)
   274  
   275    def cleanup_environment(self):
   276      for _, job in self._background_caching_jobs.items():
   277        if job:
   278          job.cancel()
   279      for _, controller in self._test_stream_service_controllers.items():
   280        if controller:
   281          controller.stop()
   282      for pipeline_id, cache_manager in self._cache_managers.items():
   283        # Recording manager performs cache manager cleanup during eviction, so
   284        # we don't need to clean it up here.
   285        if cache_manager and pipeline_id not in self._recording_managers:
   286          cache_manager.cleanup()
   287      self.clusters.cleanup(force=True)
   288  
   289    def cleanup(self, pipeline=None):
   290      """Cleans up cached states for the given pipeline. Noop if the given
   291      pipeline is absent from the environment. Cleans up for all pipelines
   292      if no pipeline is specified."""
   293      if pipeline:
   294        self.cleanup_pipeline(pipeline)
   295      else:
   296        self.cleanup_environment()
   297  
   298      self.evict_recording_manager(pipeline)
   299      self.evict_background_caching_job(pipeline)
   300      self.evict_test_stream_service_controller(pipeline)
   301      self.evict_computed_pcollections(pipeline)
   302      self.evict_cached_source_signature(pipeline)
   303      self.evict_pipeline_result(pipeline)
   304      self.evict_tracked_pipelines(pipeline)
   305  
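           # Illustrative calls (a sketch; `p` is a hypothetical user pipeline):
           #
           #   env = current_env()
           #   env.cleanup(p)   # evicts cached state for `p` only
           #   env.cleanup()    # evicts cached state for every tracked pipeline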
   306    def _track_user_pipelines(self, watchable):
   307      """Tracks user pipelines from the given watchable."""
   308  
   309      pipelines = set()
   310      if isinstance(watchable, beam.Pipeline):
   311        pipelines.add(watchable)
   312      elif isinstance(watchable, dict):
   313        for v in watchable.values():
   314          if isinstance(v, beam.Pipeline):
   315            pipelines.add(v)
   316      elif isinstance(watchable, Iterable):
   317        for v in watchable:
   318          if isinstance(v, beam.Pipeline):
   319            pipelines.add(v)
   320      for p in pipelines:
   321        self._tracked_user_pipelines.add_user_pipeline(p)
   322        _ = self.get_cache_manager(p, create_if_absent=True)
   323        _ = self.get_recording_manager(p, create_if_absent=True)
   324  
   325    def watch(self, watchable):
   326      """Watches a watchable.
   327  
    328      A watchable can be a dictionary of variable metadata such as locals(), the
    329      str name of a module, a module object, or an instance of a class. The
    330      variable can come from any scope, even a local one. Duplicated variable
    331      names don't matter since the values are different instances, and watching
    332      the same variable more than once is allowed.
   333      """
   334      if isinstance(watchable, dict):
   335        self._watching_dict_list.append(watchable.items())
   336      else:
   337        self._watching_set.add(watchable)
   338      self._track_user_pipelines(watchable)
   339  
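           # Example watchables (a sketch; `p`, `my_module` and `some_instance` are
           # hypothetical names):
           #
           #   env = current_env()
           #   p = beam.Pipeline()
           #   env.watch(locals())       # a dict of variable metadata
           #   env.watch('my_module')    # a module referenced by its str name
           #   env.watch(some_instance)  # a class instance holding variables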
   340    def watching(self):
    341      """Analyzes watched scopes and returns a list of lists of (variable name,
    342      variable value) pairs.
   343  
    344      Each entry in the list represents the variables defined within a watched
    345      watchable. Currently, each entry holds a list of pairs. The format might
    346      change in the future to hold more metadata. Duplicated pairs are allowed,
    347      and multiple pairs can have the same variable name as the "first" while
    348      having different variable values as the "second", since variables in
    349      different scopes can have the same name.
   350      """
   351      watching = list(self._watching_dict_list)
   352      for watchable in self._watching_set:
   353        if isinstance(watchable, str):
   354          module = importlib.import_module(watchable)
   355          watching.append(vars(module).items())
   356        else:
   357          watching.append(vars(watchable).items())
   358      return watching
   359  
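           # The returned value is shaped roughly like the sketch below, with one
           # entry per watched watchable (names are illustrative):
           #
           #   [
           #     dict_items([('p', <Pipeline>), ('pcoll', <PCollection>), ...]),
           #     dict_items([('__name__', '__main__'), ('__doc__', None), ...]),
           #   ]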
   360    def set_cache_manager(self, cache_manager, pipeline):
    361      """Sets the cache manager held by the current Interactive Environment for
    362      the given pipeline."""
   363      if self.get_cache_manager(pipeline) is cache_manager:
   364        # NOOP if setting to the same cache_manager.
   365        return
   366      if self.get_cache_manager(pipeline):
   367        # Invoke cleanup routine when a new cache_manager is forcefully set and
   368        # current cache_manager is not None.
   369        self.cleanup(pipeline)
   370      self._cache_managers[str(id(pipeline))] = cache_manager
   371  
   372    def get_cache_manager(self, pipeline, create_if_absent=False):
    373      """Gets the cache manager held by the current Interactive Environment for
    374      the given pipeline. If the pipeline is absent from the environment and
    375      create_if_absent is True, creates and returns a new file-based cache
    376      manager for the pipeline."""
   377      warnings.filterwarnings(
   378          'ignore',
   379          'options is deprecated since First stable release. References to '
   380          '<pipeline>.options will not be supported',
   381          category=DeprecationWarning)
   382  
   383      cache_manager = self._cache_managers.get(str(id(pipeline)), None)
   384      pipeline_runner = detect_pipeline_runner(pipeline)
   385      if not cache_manager and create_if_absent:
   386        cache_root = self.options.cache_root
   387        if cache_root:
   388          if cache_root.startswith('gs://'):
   389            cache_dir = self._get_gcs_cache_dir(pipeline, cache_root)
   390          else:
   391            cache_dir = tempfile.mkdtemp(dir=cache_root)
   392            if not isinstance(pipeline_runner, direct_runner.DirectRunner):
   393              _LOGGER.warning(
   394                  'A local cache directory has been specified while '
   395                  'not using DirectRunner. It is recommended to cache into a '
   396                  'GCS bucket instead.')
   397        else:
   398          staging_location = pipeline.options.get_all_options(
   399          )['staging_location']
   400          if isinstance(pipeline_runner, DataflowRunner) and staging_location:
   401            cache_dir = self._get_gcs_cache_dir(pipeline, staging_location)
   402            _LOGGER.info(
   403                'No cache_root detected. '
   404                'Defaulting to staging_location %s for cache location.',
   405                staging_location)
   406          else:
   407            cache_dir = tempfile.mkdtemp(
   408                suffix=str(id(pipeline)),
   409                prefix='it-',
   410                dir=os.environ.get('TEST_TMPDIR', None))
   411        cache_manager = cache.FileBasedCacheManager(cache_dir)
   412        self._cache_managers[str(id(pipeline))] = cache_manager
   413      return cache_manager
   414  
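           # Cache location selection illustrated (a sketch; bucket name and `p`
           # are hypothetical):
           #
           #   from apache_beam.runners.interactive import interactive_beam as ib
           #
           #   ib.options.cache_root = 'gs://my-bucket/cache'
           #   cm = current_env().get_cache_manager(p, create_if_absent=True)
           #   # cache files now live under gs://my-bucket/cache/<id(p)>
           #
           # Without a cache_root, a DataflowRunner pipeline with a staging_location
           # caches under that location; any other pipeline uses a local temp dir.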
   415    def evict_cache_manager(self, pipeline=None):
    416      """Evicts the cache manager held by the current Interactive Environment
    417      for the given pipeline. Noop if the pipeline is absent from the
    418      environment. If no pipeline is specified, evicts for all pipelines."""
   419      self.cleanup(pipeline)
   420      if pipeline:
   421        return self._cache_managers.pop(str(id(pipeline)), None)
   422      self._cache_managers.clear()
   423  
   424    def set_recording_manager(self, recording_manager, pipeline):
   425      """Sets the recording manager for the given pipeline."""
   426      if self.get_recording_manager(pipeline) is recording_manager:
   427        # NOOP if setting to the same recording_manager.
   428        return
   429      self._recording_managers[str(id(pipeline))] = recording_manager
   430  
   431    def get_recording_manager(self, pipeline, create_if_absent=False):
   432      """Gets the recording manager for the given pipeline."""
   433      recording_manager = self._recording_managers.get(str(id(pipeline)), None)
   434      if not recording_manager and create_if_absent:
   435        # Get the pipeline variable name for the user. This is useful if the user
   436        # has multiple pipelines.
   437        pipeline_var = ''
   438        for w in self.watching():
   439          for var, val in w:
   440            if val is pipeline:
   441              pipeline_var = var
   442              break
   443        recording_manager = RecordingManager(pipeline, pipeline_var)
   444        self._recording_managers[str(id(pipeline))] = recording_manager
   445      return recording_manager
   446  
   447    def evict_recording_manager(self, pipeline):
   448      """Evicts the recording manager for the given pipeline.
   449  
   450      This stops the background caching job and clears the cache.
   451      Noop if the pipeline is absent from the environment. If no
   452      pipeline is specified, evicts for all pipelines.
   453      """
   454      if not pipeline:
   455        for rm in self._recording_managers.values():
   456          rm.cancel()
   457          rm.clear()
   458        self._recording_managers = {}
   459        return
   460  
   461      recording_manager = self.get_recording_manager(pipeline)
   462      if recording_manager:
   463        recording_manager.cancel()
   464        recording_manager.clear()
   465        del self._recording_managers[str(id(pipeline))]
   466  
   467    def describe_all_recordings(self):
    468      """Returns a description of the recordings for all watched pipelines."""
   469      return {
   470          self.pipeline_id_to_pipeline(pid): rm.describe()
   471          for pid,
   472          rm in self._recording_managers.items()
   473      }
   474  
   475    def set_pipeline_result(self, pipeline, result):
   476      """Sets the pipeline run result. Adds one if absent. Otherwise, replace."""
   477      assert issubclass(type(pipeline), beam.Pipeline), (
   478          'pipeline must be an instance of apache_beam.Pipeline or its subclass')
   479      assert issubclass(type(result), runner.PipelineResult), (
   480          'result must be an instance of '
   481          'apache_beam.runners.runner.PipelineResult or its subclass')
   482      self._main_pipeline_results[str(id(pipeline))] = result
   483  
   484    def evict_pipeline_result(self, pipeline=None):
   485      """Evicts the last run result of the given pipeline. Noop if the pipeline
   486      is absent from the environment. If no pipeline is specified, evicts for all
   487      pipelines."""
   488      if pipeline:
   489        return self._main_pipeline_results.pop(str(id(pipeline)), None)
   490      self._main_pipeline_results.clear()
   491  
   492    def pipeline_result(self, pipeline):
   493      """Gets the pipeline run result. None if absent."""
   494      return self._main_pipeline_results.get(str(id(pipeline)), None)
   495  
   496    def set_background_caching_job(self, pipeline, background_caching_job):
   497      """Sets the background caching job started from the given pipeline."""
   498      assert issubclass(type(pipeline), beam.Pipeline), (
   499          'pipeline must be an instance of apache_beam.Pipeline or its subclass')
   500      from apache_beam.runners.interactive.background_caching_job import BackgroundCachingJob
   501      assert isinstance(background_caching_job, BackgroundCachingJob), (
   502          'background_caching job must be an instance of BackgroundCachingJob')
   503      self._background_caching_jobs[str(id(pipeline))] = background_caching_job
   504  
   505    def get_background_caching_job(self, pipeline):
   506      """Gets the background caching job started from the given pipeline."""
   507      return self._background_caching_jobs.get(str(id(pipeline)), None)
   508  
   509    def evict_background_caching_job(self, pipeline=None):
   510      """Evicts the background caching job started from the given pipeline. Noop
   511      if the given pipeline is absent from the environment. If no pipeline is
   512      specified, evicts for all pipelines."""
   513      if pipeline:
   514        return self._background_caching_jobs.pop(str(id(pipeline)), None)
   515      self._background_caching_jobs.clear()
   516  
   517    def set_test_stream_service_controller(self, pipeline, controller):
   518      """Sets the test stream service controller that has started a gRPC server
   519      serving the test stream for any job started from the given user defined
   520      pipeline.
   521      """
   522      self._test_stream_service_controllers[str(id(pipeline))] = controller
   523  
   524    def get_test_stream_service_controller(self, pipeline):
   525      """Gets the test stream service controller that has started a gRPC server
   526      serving the test stream for any job started from the given user defined
   527      pipeline.
   528      """
   529      return self._test_stream_service_controllers.get(str(id(pipeline)), None)
   530  
   531    def evict_test_stream_service_controller(self, pipeline):
   532      """Evicts and pops the test stream service controller that has started a
   533      gRPC server serving the test stream for any job started from the given
   534      user defined pipeline. Noop if the given pipeline is absent from the
   535      environment. If no pipeline is specified, evicts for all pipelines.
   536      """
   537      if pipeline:
   538        return self._test_stream_service_controllers.pop(str(id(pipeline)), None)
   539      self._test_stream_service_controllers.clear()
   540  
   541    def is_terminated(self, pipeline):
    542      """Queries whether the most recent job executing the given pipeline is in
    543      a terminal state. Returns True if no such job result exists."""
   544      result = self.pipeline_result(pipeline)
   545      if result:
   546        return runner.PipelineState.is_terminal(result.state)
   547      return True
   548  
   549    def set_cached_source_signature(self, pipeline, signature):
   550      self._cached_source_signature[str(id(pipeline))] = signature
   551  
   552    def get_cached_source_signature(self, pipeline):
   553      return self._cached_source_signature.get(str(id(pipeline)), set())
   554  
   555    def evict_cached_source_signature(self, pipeline=None):
   556      """Evicts the signature generated for each recorded source of the given
   557      pipeline. Noop if the given pipeline is absent from the environment. If no
   558      pipeline is specified, evicts for all pipelines."""
   559      if pipeline:
   560        return self._cached_source_signature.pop(str(id(pipeline)), None)
   561      self._cached_source_signature.clear()
   562  
   563    def track_user_pipelines(self):
    564      """Records references to all user defined pipeline instances watched in
    565      the current environment.
   566  
    567      The static global singleton interactive environment holds references to a
    568      set of pipeline instances defined by the user in the watched scope.
    569      Interactive Beam features can use these references to determine whether a
    570      given pipeline is defined by the user or implicitly created by the Beam
    571      SDK or runners, and handle them differently.
   572  
    573      This is invoked every time a PTransform is about to be applied when the
    574      current code execution is under IPython, because any user defined pipeline
    575      can be re-evaluated through notebook cell re-execution at any time.
   576  
    577      Each time this is invoked, it checks whether a cache manager has already
    578      been created for each user defined pipeline. If not, one is created.
   579  
   580      If a pipeline is no longer watched due to re-execution while its
   581      PCollections are still in watched scope, the pipeline becomes anonymous but
   582      still accessible indirectly through references to its PCollections. This
    583      function also cleans up internal state for those anonymous pipelines once
   584      all their PCollections are anonymous.
   585      """
   586      for watching in self.watching():
   587        for _, val in watching:
   588          if isinstance(val, beam.pipeline.Pipeline):
   589            self._tracked_user_pipelines.add_user_pipeline(val)
   590            _ = self.get_cache_manager(val, create_if_absent=True)
   591            _ = self.get_recording_manager(val, create_if_absent=True)
   592      all_tracked_pipeline_ids = set(self._background_caching_jobs.keys()).union(
   593          set(self._test_stream_service_controllers.keys()),
   594          set(self._cache_managers.keys()),
   595          {str(id(pcoll.pipeline))
   596           for pcoll in self._computed_pcolls},
   597          set(self._cached_source_signature.keys()),
   598          set(self._main_pipeline_results.keys()))
   599      inspectable_pipelines = self._inspector.inspectable_pipelines
   600      for pipeline in all_tracked_pipeline_ids:
   601        if pipeline not in inspectable_pipelines:
   602          self.cleanup(pipeline)
   603  
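           # A sketch of the reconciliation above: a pipeline id that still owns
           # cached state (a cache manager, background caching job, computed
           # PCollections, etc.) but is no longer inspectable from the watched scope
           # is treated as anonymous and cleaned up, e.g. after a notebook cell
           # re-executes `p = beam.Pipeline(...)` and drops the old instance.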
   604    @property
   605    def tracked_user_pipelines(self):
   606      """Returns the user pipelines in this environment."""
   607      for p in self._tracked_user_pipelines:
   608        yield p
   609  
   610    def user_pipeline(self, derived_pipeline):
   611      """Returns the user pipeline for the given derived pipeline."""
   612      return self._tracked_user_pipelines.get_user_pipeline(derived_pipeline)
   613  
   614    def add_user_pipeline(self, user_pipeline):
   615      self._tracked_user_pipelines.add_user_pipeline(user_pipeline)
   616  
   617    def add_derived_pipeline(self, user_pipeline, derived_pipeline):
   618      """Adds the derived pipeline to the parent user pipeline."""
   619      self._tracked_user_pipelines.add_derived_pipeline(
   620          user_pipeline, derived_pipeline)
   621  
   622    def evict_tracked_pipelines(self, user_pipeline):
   623      """Evicts the user pipeline and its derived pipelines."""
   624      if user_pipeline:
   625        self._tracked_user_pipelines.evict(user_pipeline)
   626      else:
   627        self._tracked_user_pipelines.clear()
   628  
   629    def pipeline_id_to_pipeline(self, pid):
   630      """Converts a pipeline id to a user pipeline.
   631      """
   632  
   633      return self._tracked_user_pipelines.get_pipeline(pid)
   634  
   635    def mark_pcollection_computed(self, pcolls):
   636      """Marks computation completeness for the given pcolls.
   637  
   638      Interactive Beam can use this information to determine if a computation is
   639      needed to introspect the data of any given PCollection.
   640      """
   641      self._computed_pcolls.update(pcoll for pcoll in pcolls)
   642  
   643    def evict_computed_pcollections(self, pipeline=None):
   644      """Evicts all computed PCollections for the given pipeline. If no pipeline
   645      is specified, evicts for all pipelines.
   646      """
   647      if pipeline:
   648        discarded = set()
   649        for pcoll in self._computed_pcolls:
   650          if pcoll.pipeline is pipeline:
   651            discarded.add(pcoll)
   652        self._computed_pcolls -= discarded
   653      else:
   654        self._computed_pcolls = set()
   655  
   656    @property
   657    def computed_pcollections(self):
   658      return self._computed_pcolls
   659  
   660    def load_jquery_with_datatable(self):
    661      """Loads common resources to enable jQuery with the DataTables plugin
    662      configured for notebook frontends if necessary. NOOP if already loaded.
   663  
    664      A window.interactive_beam_jquery with the DataTables plugin configured can
    665      be used in the following notebook cells once this is invoked.
   666  
   667      #. There should only be one jQuery imported.
   668      #. Datatable needs to be imported after jQuery is loaded.
    669      #. Imported jQuery is attached to the window, named as jquery[version].
    670      #. The window attachment needs to happen at the end of the import chain,
    671         once all jQuery plugins are set.
   672      """
   673      try:
   674        from IPython.display import Javascript
   675        from IPython.display import display_javascript
   676        display_javascript(
   677            Javascript(
   678                _JQUERY_WITH_DATATABLE_TEMPLATE.format(customized_script='')))
   679      except ImportError:
   680        pass  # NOOP if dependencies are not available.
   681  
   682    def import_html_to_head(self, html_hrefs):
   683      """Imports given external HTMLs (supported through webcomponents) into
   684      the head of the document.
   685  
   686      On load of webcomponentsjs, import given HTMLs. If HTML import is already
   687      supported, skip loading webcomponentsjs.
   688  
   689      No matter how many times an HTML import occurs in the document, only the
   690      first occurrence really embeds the external HTML. In a notebook environment,
   691      the body of the document is always changing due to cell [re-]execution,
    692      deletion and re-ordering. Thus, HTML imports shouldn't be put in the body,
   693      especially the output areas of notebook cells.
   694      """
   695      try:
   696        from IPython.display import Javascript
   697        from IPython.display import display_javascript
   698        display_javascript(
   699            Javascript(_HTML_IMPORT_TEMPLATE.format(hrefs=html_hrefs)))
   700      except ImportError:
   701        pass  # NOOP if dependencies are not available.
   702  
   703    def get_sql_chain(self, pipeline, set_user_pipeline=False):
   704      if pipeline not in self.sql_chain:
   705        self.sql_chain[pipeline] = SqlChain()
   706      chain = self.sql_chain[pipeline]
   707      if set_user_pipeline:
   708        if chain.user_pipeline and chain.user_pipeline is not pipeline:
    709          raise ValueError(
    710              'The beam_sql magic tries to query PCollections from multiple '
    711              'pipelines: %s and %s' % (
    712                  chain.user_pipeline,
    713                  pipeline))
   714        chain.user_pipeline = pipeline
   715      return chain
   716  
   717    def _get_gcs_cache_dir(self, pipeline, cache_dir):
   718      cache_dir_path = PurePath(cache_dir)
   719      if len(cache_dir_path.parts) < 2:
   720        _LOGGER.error(
   721            'GCS bucket cache path "%s" is too short to be valid. See '
   722            'https://cloud.google.com/storage/docs/naming-buckets for '
   723            'the expected format.',
   724            cache_dir)
   725        raise ValueError('cache_root GCS bucket path is invalid.')
   726      bucket_name = cache_dir_path.parts[1]
   727      assert_bucket_exists(bucket_name)
   728      return 'gs://{}/{}'.format('/'.join(cache_dir_path.parts[1:]), id(pipeline))
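
             # For example (a sketch; bucket name and pipeline are hypothetical):
             # with cache_dir='gs://my-bucket/cache', cache_dir_path.parts is
             # ('gs:', 'my-bucket', 'cache'), so the returned value is
             # 'gs://my-bucket/cache/<id(pipeline)>'.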