github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/interactive/interactive_beam.py

github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/interactive/interactive_beam.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Module of Interactive Beam features that can be used in notebook.
    19  
    20  The purpose of the module is to reduce the learning curve of Interactive Beam
    21  users, provide a single place for importing and add sugar syntax for all
    22  Interactive Beam components. It gives users capability to interact with existing
    23  environment/session/context for Interactive Beam and visualize PCollections as
    24  bounded dataset. In the meantime, it hides the interactivity implementation
    25  from users so that users can focus on developing Beam pipeline without worrying
    26  about how hidden states in the interactive session are managed.
    27  
    28  A convention to import this module:
    29    from apache_beam.runners.interactive import interactive_beam as ib
    30  
    31  Note: If you want backward-compatibility, only invoke interfaces provided by
    32  this module in your notebook or application code.
    33  """
    34  
    35  # pytype: skip-file
    36  
    37  import logging
    38  import warnings
    39  from datetime import timedelta
    40  from typing import Dict
    41  from typing import List
    42  from typing import Optional
    43  from typing import Union
    44  
    45  import pandas as pd
    46  
    47  import apache_beam as beam
    48  from apache_beam.dataframe.frame_base import DeferredBase
    49  from apache_beam.options.pipeline_options import FlinkRunnerOptions
    50  from apache_beam.runners.interactive import interactive_environment as ie
    51  from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import DataprocClusterManager
    52  from apache_beam.runners.interactive.dataproc.types import ClusterIdentifier
    53  from apache_beam.runners.interactive.dataproc.types import ClusterMetadata
    54  from apache_beam.runners.interactive.display import pipeline_graph
    55  from apache_beam.runners.interactive.display.pcoll_visualization import visualize
    56  from apache_beam.runners.interactive.display.pcoll_visualization import visualize_computed_pcoll
    57  from apache_beam.runners.interactive.options import interactive_options
    58  from apache_beam.runners.interactive.utils import deferred_df_to_pcollection
    59  from apache_beam.runners.interactive.utils import elements_to_df
    60  from apache_beam.runners.interactive.utils import find_pcoll_name
    61  from apache_beam.runners.interactive.utils import progress_indicated
    62  from apache_beam.runners.runner import PipelineState
    63  
    64  _LOGGER = logging.getLogger(__name__)
    65  
    66  
    67  class Options(interactive_options.InteractiveOptions):
    68    """Options that guide how Interactive Beam works."""
    69    @property
    70    def enable_recording_replay(self):
    71      """Whether replayable source data recorded should be replayed for multiple
    72      PCollection evaluations and pipeline runs as long as the data recorded is
    73      still valid."""
    74      return self.capture_control._enable_capture_replay
    75  
    76    @enable_recording_replay.setter
    77    def enable_recording_replay(self, value):
    78      """Sets whether source data recorded should be replayed. True - Enables
    79      recording of replayable source data so that following PCollection
    80      evaluations and pipeline runs always use the same data recorded;
    81      False - Disables recording of replayable source data so that following
    82      PCollection evaluation and pipeline runs always use new data from sources.
    83      """
    84      # This makes sure the log handler is configured correctly in case the
    85      # options are configured in an early stage.
    86      _ = ie.current_env()
    87      if value:
    88        _LOGGER.info(
    89            'Record replay is enabled. When a PCollection is evaluated or the '
    90            'pipeline is executed, existing data recorded from previous '
    91            'computations will be replayed for consistent results. If no '
    92            'recorded data is available, new data from recordable sources will '
    93            'be recorded.')
    94      else:
    95        _LOGGER.info(
    96            'Record replay is disabled. The next time a PCollection is '
    97            'evaluated or the pipeline is executed, new data will always be '
    98            'consumed from sources in the pipeline. You will not have '
    99            'replayability until re-enabling this option.')
   100      self.capture_control._enable_capture_replay = value
   101  
   102    @property
   103    def recordable_sources(self):
   104      """Interactive Beam automatically records data from sources in this set.
   105      """
   106      return self.capture_control._capturable_sources
   107  
   108    @property
   109    def recording_duration(self):
   110      """The data recording of sources ends as soon as the background source
   111      recording job has run for this long."""
   112      return self.capture_control._capture_duration
   113  
   114    @recording_duration.setter
   115    def recording_duration(self, value):
   116      """Sets the recording duration as a timedelta. The input can be a
   117      datetime.timedelta, a possitive integer as seconds or a string
   118      representation that is parsable by pandas.to_timedelta.
   119  
   120      Example::
   121  
   122        # Sets the recording duration limit to 10 seconds.
   123        ib.options.recording_duration = timedelta(seconds=10)
   124        ib.options.recording_duration = 10
   125        ib.options.recording_duration = '10s'
   126        # Explicitly control the recordings.
   127        ib.recordings.stop(p)
   128        ib.recordings.clear(p)
   129        ib.recordings.record(p)
   130        # The next PCollection evaluation uses fresh data from sources,
   131        # and the data recorded will be replayed until another clear.
   132        ib.collect(some_pcoll)
   133      """
   134      duration = None
   135      if isinstance(value, int):
   136        assert value > 0, 'Duration must be a positive value.'
   137        duration = timedelta(seconds=value)
   138      elif isinstance(value, str):
   139        duration = pd.to_timedelta(value)
   140      else:
   141        assert isinstance(value, timedelta), ('The input can only abe a '
   142          'datetime.timedelta, a possitive integer as seconds, or a string '
   143          'representation that is parsable by pandas.to_timedelta.')
   144        duration = value
   145      if self.capture_control._capture_duration.total_seconds(
   146      ) != duration.total_seconds():
   147        _ = ie.current_env()
   148        _LOGGER.info(
   149            'You have changed recording duration from %s seconds to %s seconds. '
   150            'To allow new data to be recorded for the updated duration the '
   151            'next time a PCollection is evaluated or the pipeline is executed, '
   152            'please invoke ib.recordings.stop, ib.recordings.clear and '
   153            'ib.recordings.record.',
   154            self.capture_control._capture_duration.total_seconds(),
   155            duration.total_seconds())
   156        self.capture_control._capture_duration = duration
   157  
   158    @property
   159    def recording_size_limit(self):
   160      """The data recording of sources ends as soon as the size (in bytes) of data
   161      recorded from recordable sources reaches the limit."""
   162      return self.capture_control._capture_size_limit
   163  
   164    @recording_size_limit.setter
   165    def recording_size_limit(self, value):
   166      """Sets the recording size in bytes.
   167  
   168      Example::
   169  
   170        # Sets the recording size limit to 1GB.
   171        interactive_beam.options.recording_size_limit = 1e9
   172      """
   173      if self.capture_control._capture_size_limit != value:
   174        _ = ie.current_env()
   175        _LOGGER.info(
   176            'You have changed recording size limit from %s bytes to %s bytes. To '
   177            'allow new data to be recorded under the updated size limit the '
   178            'next time a PCollection is recorded or the pipeline is executed, '
   179            'please invoke ib.recordings.stop, ib.recordings.clear and '
   180            'ib.recordings.record.',
   181            self.capture_control._capture_size_limit,
   182            value)
   183        self.capture_control._capture_size_limit = value
   184  
   185    @property
   186    def display_timestamp_format(self):
   187      """The format in which timestamps are displayed.
   188  
   189      Default is '%Y-%m-%d %H:%M:%S.%f%z', e.g. 2020-02-01 15:05:06.000015-08:00.
   190      """
   191      return self._display_timestamp_format
   192  
   193    @display_timestamp_format.setter
   194    def display_timestamp_format(self, value):
   195      """Sets the format in which timestamps are displayed.
   196  
   197      Default is '%Y-%m-%d %H:%M:%S.%f%z', e.g. 2020-02-01 15:05:06.000015-08:00.
   198  
   199      Example::
   200  
   201        # Sets the format to not display the timezone or microseconds.
   202        interactive_beam.options.display_timestamp_format = %Y-%m-%d %H:%M:%S'
   203      """
   204      self._display_timestamp_format = value
   205  
   206    @property
   207    def display_timezone(self):
   208      """The timezone in which timestamps are displayed.
   209  
   210      Defaults to local timezone.
   211      """
   212      return self._display_timezone
   213  
   214    @display_timezone.setter
   215    def display_timezone(self, value):
   216      """Sets the timezone (datetime.tzinfo) in which timestamps are displayed.
   217  
   218      Defaults to local timezone.
   219  
   220      Example::
   221  
   222        # Imports the timezone library.
   223        from pytz import timezone
   224  
   225        # Will display all timestamps in the US/Eastern time zone.
   226        tz = timezone('US/Eastern')
   227  
   228        # You can also use dateutil.tz to get a timezone.
   229        tz = dateutil.tz.gettz('US/Eastern')
   230  
   231        interactive_beam.options.display_timezone = tz
   232      """
   233      self._display_timezone = value
   234  
   235    @property
   236    def cache_root(self):
   237      """The cache directory specified by the user.
   238  
   239      Defaults to None.
   240      """
   241      return self._cache_root
   242  
   243    @cache_root.setter
   244    def cache_root(self, value):
   245      """Sets the cache directory.
   246  
   247      Defaults to None.
   248  
   249      Example of local directory usage::
   250        interactive_beam.options.cache_root = '/Users/username/my/cache/dir'
   251  
   252      Example of GCS directory usage::
   253        interactive_beam.options.cache_root = 'gs://my-gcs-bucket/cache/dir'
   254      """
   255      _LOGGER.warning(
   256          'Interactive Beam has detected a set value for the cache_root '
   257          'option. Please note: existing cache managers will not have '
   258          'their current cache directory changed. The option must be '
   259          'set in Interactive Beam prior to the initialization of new '
   260          'pipelines to take effect. To apply changes to new pipelines, '
   261          'the kernel must be restarted or the pipeline creation codes '
   262          'must be re-executed. ')
   263      self._cache_root = value
   264  
   265  
   266  class Recordings():
   267    """An introspection interface for recordings for pipelines.
   268  
   269    When a user materializes a PCollection onto disk (eg. ib.show) for a streaming
   270    pipeline, a background source recording job is started. This job pulls data
   271    from all defined unbounded sources for that PCollection's pipeline. The
   272    following methods allow for introspection into that background recording job.
   273    """
   274    def describe(self, pipeline=None):
   275      # type: (Optional[beam.Pipeline]) -> dict[str, Any] # noqa: F821
   276  
   277      """Returns a description of all the recordings for the given pipeline.
   278  
   279      If no pipeline is given then this returns a dictionary of descriptions for
   280      all pipelines.
   281      """
   282  
   283      # Create the RecordingManager if it doesn't already exist.
   284      if pipeline:
   285        ie.current_env().get_recording_manager(pipeline, create_if_absent=True)
   286  
   287      description = ie.current_env().describe_all_recordings()
   288  
   289      if pipeline:
   290        return description[pipeline]
   291      return description
   292  
   293    def clear(self, pipeline):
   294      # type: (beam.Pipeline) -> bool
   295  
   296      """Clears all recordings of the given pipeline. Returns True if cleared."""
   297  
   298      description = self.describe(pipeline)
   299      if (not PipelineState.is_terminal(description['state']) and
   300          description['state'] != PipelineState.STOPPED):
   301        _LOGGER.warning(
   302            'Trying to clear a recording with a running pipeline. Did '
   303            'you forget to call ib.recordings.stop?')
   304        return False
   305  
   306      ie.current_env().cleanup(pipeline)
   307      return True
   308  
   309    def stop(self, pipeline):
   310      # type: (beam.Pipeline) -> None
   311  
   312      """Stops the background source recording of the given pipeline."""
   313  
   314      recording_manager = ie.current_env().get_recording_manager(
   315          pipeline, create_if_absent=True)
   316      recording_manager.cancel()
   317  
   318    def record(self, pipeline):
   319      # type: (beam.Pipeline) -> bool
   320  
   321      """Starts a background source recording job for the given pipeline. Returns
   322      True if the recording job was started.
   323      """
   324  
   325      description = self.describe(pipeline)
   326      if (not PipelineState.is_terminal(description['state']) and
   327          description['state'] != PipelineState.STOPPED):
   328        _LOGGER.warning(
   329            'Trying to start a recording with a running pipeline. Did '
   330            'you forget to call ib.recordings.stop?')
   331        return False
   332  
   333      if description['size'] > 0:
   334        _LOGGER.warning(
   335            'A recording already exists for this pipeline. To start a '
   336            'recording, make sure to call ib.recordings.clear first.')
   337        return False
   338  
   339      recording_manager = ie.current_env().get_recording_manager(
   340          pipeline, create_if_absent=True)
   341      return recording_manager.record_pipeline()
   342  
   343  
   344  class Clusters:
   345    """An interface to control clusters implicitly created and managed by
   346    the current interactive environment. This class is not needed and
   347    should not be used otherwise.
   348  
   349    Do not use it for clusters a user explicitly manages: e.g., if you have
   350    a Flink cluster running somewhere and provides the flink master when
   351    running a pipeline with the FlinkRunner, the cluster will not be tracked
   352    or managed by Beam.
   353    To reuse the same cluster for your pipelines, use the same pipeline
   354    options: e.g., a pipeline option with the same flink master if you are
   355    using FlinkRunner.
   356  
   357    This module is experimental. No backwards-compatibility guarantees.
   358  
   359    Interactive Beam automatically creates/reuses existing worker clusters to
   360    execute pipelines when it detects the need from configurations.
   361    Currently, the only supported cluster implementation is Flink running on
   362    Cloud Dataproc.
   363  
   364    To configure a pipeline to run on Cloud Dataproc with Flink, set the
   365    underlying runner of the InteractiveRunner to FlinkRunner and the pipeline
   366    options to indicate where on Cloud the FlinkRunner should be deployed to.
   367  
   368      An example to enable automatic Dataproc cluster creation/reuse::
   369  
   370        options = PipelineOptions([
   371            '--project=my-project',
   372            '--region=my-region',
   373            '--environment_type=DOCKER'])
   374        pipeline = beam.Pipeline(InteractiveRunner(
   375            underlying_runner=FlinkRunner()), options=options)
   376  
   377    Reuse a pipeline options in another pipeline would configure Interactive Beam
   378    to reuse the same Dataproc cluster implicitly managed by the current
   379    interactive environment.
   380    If a flink_master is identified as a known cluster, the corresponding cluster
   381    is also resued.
   382    Furthermore, if a cluster is explicitly created by using a pipeline as an
   383    identifier to a known cluster, the cluster is reused.
   384  
   385      An example::
   386  
   387        # If pipeline runs on a known cluster, below code reuses the cluster
   388        # manager without creating a new one.
   389        dcm = ib.clusters.create(pipeline)
   390  
   391    To provision the cluster, use WorkerOptions. Supported configurations are::
   392  
   393      1. subnetwork
   394      2. num_workers
   395      3. machine_type
   396  
   397    To configure a pipeline to run on an existing FlinkRunner deployed elsewhere,
   398    set the flink_master explicitly so no cluster will be created/reused.
   399  
   400      An example pipeline options to skip automatic Dataproc cluster usage::
   401  
   402        options = PipelineOptions([
   403            '--flink_master=some.self.hosted.flink:port',
   404            '--environment_type=DOCKER'])
   405  
   406    To configure a pipeline to run on a local FlinkRunner, explicitly set the
   407    default cluster metadata to None: ib.clusters.set_default_cluster(None).
   408    """
   409    # Explicitly set the Flink version here to ensure compatibility with 2.0
   410    # Dataproc images:
   411    # https://cloud.google.com/dataproc/docs/concepts/versioning/dataproc-release-2.0
   412    DATAPROC_FLINK_VERSION = '1.12'
   413  
   414    # The minimum worker number to create a Dataproc cluster.
   415    DATAPROC_MINIMUM_WORKER_NUM = 2
   416  
   417    # TODO(https://github.com/apache/beam/issues/21527): Fix the Dataproc image
   418    # version after a released image contains all missing dependencies for Flink
   419    # to run.
   420    # DATAPROC_IMAGE_VERSION = '2.0.XX-debian10'
   421  
   422    def __init__(self) -> None:
   423      self.dataproc_cluster_managers: Dict[ClusterMetadata,
   424                                           DataprocClusterManager] = {}
   425      self.master_urls: Dict[str, ClusterMetadata] = {}
   426      self.pipelines: Dict[beam.Pipeline, DataprocClusterManager] = {}
   427      self.default_cluster_metadata: Optional[ClusterMetadata] = None
   428  
   429    def create(
   430        self, cluster_identifier: ClusterIdentifier) -> DataprocClusterManager:
   431      """Creates a Dataproc cluster manager provisioned for the cluster
   432      identified. If the cluster is known, returns an existing cluster manager.
   433      """
   434      # Try to get some not-None cluster metadata.
   435      cluster_metadata = self.cluster_metadata(cluster_identifier)
   436      if not cluster_metadata:
   437        raise ValueError(
   438            'Unknown cluster identifier: %s. Cannot create or reuse'
   439            'a Dataproc cluster.')
   440      if not cluster_metadata.region:
   441        _LOGGER.info(
   442            'No region information was detected, defaulting Dataproc cluster '
   443            'region to: us-central1.')
   444        cluster_metadata.region = 'us-central1'
   445      elif cluster_metadata.region == 'global':
   446        # The global region is unsupported as it will be eventually deprecated.
   447        raise ValueError('Clusters in the global region are not supported.')
   448      # else use the provided region.
   449      if (cluster_metadata.num_workers and
   450          cluster_metadata.num_workers < self.DATAPROC_MINIMUM_WORKER_NUM):
   451        _LOGGER.info(
   452            'At least %s workers are required for a cluster, defaulting to %s.',
   453            self.DATAPROC_MINIMUM_WORKER_NUM,
   454            self.DATAPROC_MINIMUM_WORKER_NUM)
   455        cluster_metadata.num_workers = self.DATAPROC_MINIMUM_WORKER_NUM
   456      known_dcm = self.dataproc_cluster_managers.get(cluster_metadata, None)
   457      if known_dcm:
   458        return known_dcm
   459      dcm = DataprocClusterManager(cluster_metadata)
   460      dcm.create_flink_cluster()
   461      # ClusterMetadata with derivative fields populated by the dcm.
   462      derived_meta = dcm.cluster_metadata
   463      self.dataproc_cluster_managers[derived_meta] = dcm
   464      self.master_urls[derived_meta.master_url] = derived_meta
   465      # Update the default cluster metadata to the one just created.
   466      self.set_default_cluster(derived_meta)
   467      return dcm
   468  
   469    def cleanup(
   470        self,
   471        cluster_identifier: Optional[ClusterIdentifier] = None,
   472        force: bool = False) -> None:
   473      """Cleans up the cluster associated with the given cluster_identifier.
   474  
   475      When None cluster_identifier is provided: if force is True, cleans up for
   476      all clusters; otherwise, do a dry run and NOOP.
   477      If a beam.Pipeline is given as the ClusterIdentifier while multiple
   478      pipelines share the same cluster, it only cleans up the association between
   479      the pipeline and the cluster identified.
   480      If the cluster_identifier is unknown, NOOP.
   481      """
   482      if not cluster_identifier:
   483        dcm_to_cleanup = set(self.dataproc_cluster_managers.values())
   484        if force:
   485          for dcm in dcm_to_cleanup:
   486            self._cleanup(dcm)
   487          self.default_cluster_metadata = None
   488        else:
   489          _LOGGER.warning(
   490              'No cluster_identifier provided. If you intend to '
   491              'clean up all clusters, invoke ib.clusters.cleanup(force=True). '
   492              'Current clusters are %s.',
   493              self.describe())
   494      elif isinstance(cluster_identifier, beam.Pipeline):
   495        p = cluster_identifier
   496        dcm = self.pipelines.pop(p, None)
   497        if dcm:
   498          dcm.pipelines.remove(p)
   499          warnings.filterwarnings(
   500              'ignore',
   501              'options is deprecated since First stable release. References to '
   502              '<pipeline>.options will not be supported',
   503              category=DeprecationWarning)
   504          p_flink_options = p.options.view_as(FlinkRunnerOptions)
   505          p_flink_options.flink_master = '[auto]'
   506          p_flink_options.flink_version = None
   507          # Only cleans up when there is no pipeline using the cluster.
   508          if not dcm.pipelines:
   509            self._cleanup(dcm)
   510      else:
   511        if isinstance(cluster_identifier, str):
   512          meta = self.master_urls.get(cluster_identifier, None)
   513        else:
   514          meta = cluster_identifier
   515        dcm = self.dataproc_cluster_managers.get(meta, None)
   516        if dcm:
   517          self._cleanup(dcm)
   518  
   519    def describe(
   520        self,
   521        cluster_identifier: Optional[ClusterIdentifier] = None
   522    ) -> Union[ClusterMetadata, List[ClusterMetadata]]:
   523      """Describes the ClusterMetadata by a ClusterIdentifier.
   524  
   525      If no cluster_identifier is given or if the cluster_identifier is unknown,
   526      it returns descriptions for all known clusters.
   527  
   528      Example usage:
   529      # Describe the cluster executing work for a pipeline.
   530      ib.clusters.describe(pipeline)
   531      # Describe the cluster with the flink master url.
   532      ib.clusters.describe(master_url)
   533      # Describe all existing clusters.
   534      ib.clusters.describe()
   535      """
   536      if cluster_identifier:
   537        meta = self._cluster_metadata(cluster_identifier)
   538        if meta in self.dataproc_cluster_managers:
   539          return meta
   540      return list(self.dataproc_cluster_managers.keys())
   541  
   542    def set_default_cluster(
   543        self, cluster_identifier: Optional[ClusterIdentifier] = None) -> None:
   544      """Temporarily sets the default metadata for creating or reusing a
   545      DataprocClusterManager. It is always updated to the most recently created
   546      cluster.
   547  
   548      If no known ClusterMetadata can be identified by the ClusterIdentifer, NOOP.
   549      If None is set, next time when Flink is in use, if no cluster is explicitly
   550      configured by a pipeline, the job runs locally.
   551      """
   552      if cluster_identifier:
   553        self.default_cluster_metadata = self.cluster_metadata(cluster_identifier)
   554      else:
   555        self.default_cluster_metadata = None
   556  
   557    def cluster_metadata(
   558        self,
   559        cluster_identifier: Optional[ClusterIdentifier] = None
   560    ) -> Optional[ClusterMetadata]:
   561      """Fetches the ClusterMetadata by a ClusterIdentifier that could be a
   562      URL in string, a Beam pipeline, or an equivalent to a known ClusterMetadata;
   563  
   564      If the given cluster_identifier is an URL or a pipeline that is unknown to
   565      the current environment, the default cluster metadata (could be None) is
   566      returned.
   567      If the given cluster_identifier is a ClusterMetadata but unknown to the
   568      current environment, passes it through (NOOP).
   569      """
   570      meta = self._cluster_metadata(cluster_identifier)
   571      return meta if meta else self.default_cluster_metadata
   572  
   573    def _cluster_metadata(
   574        self,
   575        cluster_identifier: Optional[ClusterIdentifier] = None
   576    ) -> Optional[ClusterMetadata]:
   577      meta = None
   578      if cluster_identifier:
   579        if isinstance(cluster_identifier, str):
   580          meta = self.master_urls.get(cluster_identifier, None)
   581        elif isinstance(cluster_identifier, beam.Pipeline):
   582          dcm = self.pipelines.get(cluster_identifier, None)
   583          if dcm:
   584            meta = dcm.cluster_metadata
   585        elif isinstance(cluster_identifier, ClusterMetadata):
   586          meta = cluster_identifier
   587          if meta in self.dataproc_cluster_managers:
   588            meta = self.dataproc_cluster_managers[meta].cluster_metadata
   589          elif (meta and self.default_cluster_metadata and
   590                meta.cluster_name == self.default_cluster_metadata.cluster_name):
   591            _LOGGER.warning(
   592                'Cannot change the configuration of the running cluster %s. '
   593                'Existing is %s, desired is %s.',
   594                self.default_cluster_metadata.cluster_name,
   595                self.default_cluster_metadata,
   596                meta)
   597            meta.reset_name()
   598            _LOGGER.warning(
   599                'To avoid conflict, issuing a new cluster name %s '
   600                'for a new cluster.',
   601                meta.cluster_name)
   602        else:
   603          raise TypeError(
   604              'A cluster_identifier should be Optional[Union[str, '
   605              'beam.Pipeline, ClusterMetadata], instead %s was given.',
   606              type(cluster_identifier))
   607      return meta
   608  
   609    def _cleanup(self, dcm: DataprocClusterManager) -> None:
   610      dcm.cleanup()
   611      self.dataproc_cluster_managers.pop(dcm.cluster_metadata, None)
   612      self.master_urls.pop(dcm.cluster_metadata.master_url, None)
   613      for p in dcm.pipelines:
   614        self.pipelines.pop(p, None)
   615      if dcm.cluster_metadata == self.default_cluster_metadata:
   616        self.default_cluster_metadata = None
   617  
   618  
   619  # Users can set options to guide how Interactive Beam works.
   620  # Examples:
   621  # ib.options.enable_recording_replay = False/True
   622  # ib.options.recording_duration = '1m'
   623  # ib.options.recordable_sources.add(SourceClass)
   624  # Check the docstrings for detailed usages.
   625  options = Options()
   626  
   627  # Users can introspect into recordings by using the recordings class.
   628  # Examples:
   629  # p = beam.Pipeline(InteractiveRunner())
   630  # elems = p | beam.Create([1, 2, 3])
   631  # ib.show(elems)
   632  # ib.recordings.describe(p)
   633  recordings = Recordings()
   634  
   635  # Users can interact with the clusters used by their environment.
   636  # Examples:
   637  # ib.clusters.describe(p)
   638  # Check the docstrings for detailed usages.
   639  clusters = Clusters()
   640  
   641  
   642  def watch(watchable):
   643    """Monitors a watchable.
   644  
   645    This allows Interactive Beam to implicitly pass on the information about the
   646    location of your pipeline definition.
   647  
   648    Current implementation mainly watches for PCollection variables defined in
   649    user code. A watchable can be a dictionary of variable metadata such as
   650    locals(), a str name of a module, a module object or an instance of a class.
   651    The variable can come from any scope even local variables in a method of a
   652    class defined in a module.
   653  
   654      Below are all valid::
   655  
   656        watch(__main__)  # if import __main__ is already invoked
   657        watch('__main__')  # does not require invoking import __main__ beforehand
   658        watch(self)  # inside a class
   659        watch(SomeInstance())  # an instance of a class
   660        watch(locals())  # inside a function, watching local variables within
   661  
   662    If you write a Beam pipeline in the __main__ module directly, since the
   663    __main__ module is always watched, you don't have to instruct Interactive
   664    Beam. If your Beam pipeline is defined in some module other than __main__,
   665    such as inside a class function or a unit test, you can watch() the scope.
   666  
   667      For example::
   668  
   669        class Foo(object)
   670          def run_pipeline(self):
   671            with beam.Pipeline() as p:
   672              init_pcoll = p |  'Init Create' >> beam.Create(range(10))
   673              watch(locals())
   674            return init_pcoll
   675        init_pcoll = Foo().run_pipeline()
   676  
   677      Interactive Beam caches init_pcoll for the first run.
   678  
   679      Then you can use::
   680  
   681        show(init_pcoll)
   682  
   683      To visualize data from init_pcoll once the pipeline is executed.
   684    """
   685    ie.current_env().watch(watchable)
   686  
   687  
   688  @progress_indicated
   689  def show(
   690      *pcolls,
   691      include_window_info=False,
   692      visualize_data=False,
   693      n='inf',
   694      duration='inf'):
   695    # type: (*Union[Dict[Any, PCollection], Iterable[PCollection], PCollection], bool, bool, Union[int, str], Union[int, str]) -> None # noqa: F821
   696  
   697    """Shows given PCollections in an interactive exploratory way if used within
   698    a notebook, or prints a heading sampled data if used within an ipython shell.
   699    Noop if used in a non-interactive environment.
   700  
   701    Args:
   702      include_window_info: (optional) if True, windowing information of the
   703          data will be visualized too. Default is false.
   704      visualize_data: (optional) by default, the visualization contains data
   705          tables rendering data from given pcolls separately as if they are
   706          converted into dataframes. If visualize_data is True, there will be a
   707          more dive-in widget and statistically overview widget of the data.
   708          Otherwise, those 2 data visualization widgets will not be displayed.
   709      n: (optional) max number of elements to visualize. Default 'inf'.
   710      duration: (optional) max duration of elements to read in integer seconds or
   711          a string duration. Default 'inf'.
   712  
   713    The given pcolls can be dictionary of PCollections (as values), or iterable
   714    of PCollections or plain PCollection values.
   715  
   716    The user can specify either the max number of elements with `n` to read
   717    or the maximum duration of elements to read with `duration`. When a limiter is
   718    not supplied, it is assumed to be infinite.
   719  
   720    By default, the visualization contains data tables rendering data from given
   721    pcolls separately as if they are converted into dataframes. If visualize_data
   722    is True, there will be a more dive-in widget and statistically overview widget
   723    of the data. Otherwise, those 2 data visualization widgets will not be
   724    displayed.
   725  
   726    Ad hoc builds a pipeline fragment including only transforms that are
   727    necessary to produce data for given PCollections pcolls, runs the pipeline
   728    fragment to compute data for those pcolls and then visualizes the data.
   729  
   730    The function is always blocking. If used within a notebook, the data
   731    visualized might be dynamically updated before the function returns as more
   732    and more data could getting processed and emitted when the pipeline fragment
   733    is being executed. If used within an ipython shell, there will be no dynamic
   734    plotting but a static plotting in the end of pipeline fragment execution.
   735  
   736    The PCollections given must belong to the same pipeline.
   737  
   738      For example::
   739  
   740        p = beam.Pipeline(InteractiveRunner())
   741        init = p | 'Init' >> beam.Create(range(1000))
   742        square = init | 'Square' >> beam.Map(lambda x: x * x)
   743        cube = init | 'Cube' >> beam.Map(lambda x: x ** 3)
   744  
   745        # Below builds a pipeline fragment from the defined pipeline `p` that
   746        # contains only applied transforms of `Init` and `Square`. Then the
   747        # interactive runner runs the pipeline fragment implicitly to compute data
   748        # represented by PCollection `square` and visualizes it.
   749        show(square)
   750  
   751        # This is equivalent to `show(square)` because `square` depends on `init`
   752        # and `init` is included in the pipeline fragment and computed anyway.
   753        show(init, square)
   754  
   755        # Below is similar to running `p.run()`. It computes data for both
   756        # PCollection `square` and PCollection `cube`, then visualizes them.
   757        show(square, cube)
   758    """
   759    flatten_pcolls = []
   760    for pcoll_container in pcolls:
   761      if isinstance(pcoll_container, dict):
   762        flatten_pcolls.extend(pcoll_container.values())
   763      elif isinstance(pcoll_container, (beam.pvalue.PCollection, DeferredBase)):
   764        flatten_pcolls.append(pcoll_container)
   765      else:
   766        try:
   767          flatten_pcolls.extend(iter(pcoll_container))
   768        except TypeError:
   769          raise ValueError(
   770              'The given pcoll %s is not a dict, an iterable or a PCollection.' %
   771              pcoll_container)
   772  
   773    # Iterate through the given PCollections and convert any deferred DataFrames
   774    # or Series into PCollections.
   775    pcolls = set()
   776  
   777    # The element type is used to help visualize the given PCollection. For the
   778    # deferred DataFrame/Series case it is the proxy of the frame.
   779    element_types = {}
   780    for pcoll in flatten_pcolls:
   781      if isinstance(pcoll, DeferredBase):
   782        pcoll, element_type = deferred_df_to_pcollection(pcoll)
   783        watch({'anonymous_pcollection_{}'.format(id(pcoll)): pcoll})
   784      else:
   785        element_type = pcoll.element_type
   786  
   787      element_types[pcoll] = element_type
   788  
   789      pcolls.add(pcoll)
   790      assert isinstance(pcoll, beam.pvalue.PCollection), (
   791          '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))
   792  
   793    assert len(pcolls) > 0, (
   794        'Need at least 1 PCollection to show data visualization.')
   795  
   796    pcoll_pipeline = next(iter(pcolls)).pipeline
   797    user_pipeline = ie.current_env().user_pipeline(pcoll_pipeline)
   798    # Possibly showing a PCollection defined in a local scope that is not
   799    # explicitly watched. Ad hoc watch it though it's a little late.
   800    if not user_pipeline:
   801      watch({'anonymous_pipeline_{}'.format(id(pcoll_pipeline)): pcoll_pipeline})
   802      user_pipeline = pcoll_pipeline
   803  
   804    if isinstance(n, str):
   805      assert n == 'inf', (
   806          'Currently only the string \'inf\' is supported. This denotes reading '
   807          'elements until the recording is stopped via a kernel interrupt.')
   808    elif isinstance(n, int):
   809      assert n > 0, 'n needs to be positive or the string \'inf\''
   810  
   811    if isinstance(duration, int):
   812      assert duration > 0, ('duration needs to be positive, a duration string, '
   813                            'or the string \'inf\'')
   814  
   815    if n == 'inf':
   816      n = float('inf')
   817  
   818    if duration == 'inf':
   819      duration = float('inf')
   820  
   821    previously_computed_pcolls = {
   822        pcoll
   823        for pcoll in pcolls if pcoll in ie.current_env().computed_pcollections
   824    }
   825    for pcoll in previously_computed_pcolls:
   826      visualize_computed_pcoll(
   827          find_pcoll_name(pcoll),
   828          pcoll,
   829          n,
   830          duration,
   831          include_window_info=include_window_info,
   832          display_facets=visualize_data)
   833    pcolls = pcolls - previously_computed_pcolls
   834  
   835    recording_manager = ie.current_env().get_recording_manager(
   836        user_pipeline, create_if_absent=True)
   837    recording = recording_manager.record(pcolls, max_n=n, max_duration=duration)
   838  
   839    # Catch a KeyboardInterrupt to gracefully cancel the recording and
   840    # visualizations.
   841    try:
   842      # If in notebook, static plotting computed pcolls as computation is done.
   843      if ie.current_env().is_in_notebook:
   844        for stream in recording.computed().values():
   845          visualize(
   846              stream,
   847              include_window_info=include_window_info,
   848              display_facets=visualize_data,
   849              element_type=element_types[stream.pcoll])
   850      elif ie.current_env().is_in_ipython:
   851        for stream in recording.computed().values():
   852          visualize(
   853              stream,
   854              include_window_info=include_window_info,
   855              element_type=element_types[stream.pcoll])
   856      if recording.is_computed():
   857        return
   858  
   859      # If in notebook, dynamic plotting as computation goes.
   860      if ie.current_env().is_in_notebook:
   861        for stream in recording.uncomputed().values():
   862          visualize(
   863              stream,
   864              dynamic_plotting_interval=1,
   865              include_window_info=include_window_info,
   866              display_facets=visualize_data,
   867              element_type=element_types[stream.pcoll])
   868  
   869      # Invoke wait_until_finish to ensure the blocking nature of this API without
   870      # relying on the run to be blocking.
   871      recording.wait_until_finish()
   872  
   873      # If just in ipython shell, plotting once when the computation is completed.
   874      if ie.current_env().is_in_ipython and not ie.current_env().is_in_notebook:
   875        for stream in recording.computed().values():
   876          visualize(stream, include_window_info=include_window_info)
   877  
   878    except KeyboardInterrupt:
   879      if recording:
   880        recording.cancel()
   881  
   882  
   883  @progress_indicated
   884  def collect(pcoll, n='inf', duration='inf', include_window_info=False):
   885    """Materializes the elements from a PCollection into a Dataframe.
   886  
   887    This reads each element from file and reads only the amount that it needs
   888    into memory. The user can specify either the max number of elements to read
   889    or the maximum duration of elements to read. When a limiter is not supplied,
   890    it is assumed to be infinite.
   891  
   892    Args:
   893      n: (optional) max number of elements to visualize. Default 'inf'.
   894      duration: (optional) max duration of elements to read in integer seconds or
   895          a string duration. Default 'inf'.
   896      include_window_info: (optional) if True, appends the windowing information
   897          to each row. Default False.
   898  
   899    For example::
   900  
   901      p = beam.Pipeline(InteractiveRunner())
   902      init = p | 'Init' >> beam.Create(range(10))
   903      square = init | 'Square' >> beam.Map(lambda x: x * x)
   904  
   905      # Run the pipeline and bring the PCollection into memory as a Dataframe.
   906      in_memory_square = head(square, n=5)
   907    """
   908    # Remember the element type so we can make an informed decision on how to
   909    # collect the result in elements_to_df.
   910    if isinstance(pcoll, DeferredBase):
   911      # Get the proxy so we can get the output shape of the DataFrame.
   912      pcoll, element_type = deferred_df_to_pcollection(pcoll)
   913      watch({'anonymous_pcollection_{}'.format(id(pcoll)): pcoll})
   914    else:
   915      element_type = pcoll.element_type
   916  
   917    assert isinstance(pcoll, beam.pvalue.PCollection), (
   918        '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))
   919  
   920    if isinstance(n, str):
   921      assert n == 'inf', (
   922          'Currently only the string \'inf\' is supported. This denotes reading '
   923          'elements until the recording is stopped via a kernel interrupt.')
   924    elif isinstance(n, int):
   925      assert n > 0, 'n needs to be positive or the string \'inf\''
   926  
   927    if isinstance(duration, int):
   928      assert duration > 0, ('duration needs to be positive, a duration string, '
   929                            'or the string \'inf\'')
   930  
   931    if n == 'inf':
   932      n = float('inf')
   933  
   934    if duration == 'inf':
   935      duration = float('inf')
   936  
   937    user_pipeline = ie.current_env().user_pipeline(pcoll.pipeline)
   938    # Possibly collecting a PCollection defined in a local scope that is not
   939    # explicitly watched. Ad hoc watch it though it's a little late.
   940    if not user_pipeline:
   941      watch({'anonymous_pipeline_{}'.format(id(pcoll.pipeline)): pcoll.pipeline})
   942      user_pipeline = pcoll.pipeline
   943    recording_manager = ie.current_env().get_recording_manager(
   944        user_pipeline, create_if_absent=True)
   945  
   946    # If already computed, directly read the stream and return.
   947    if pcoll in ie.current_env().computed_pcollections:
   948      pcoll_name = find_pcoll_name(pcoll)
   949      elements = list(
   950          recording_manager.read(pcoll_name, pcoll, n, duration).read())
   951      return elements_to_df(
   952          elements,
   953          include_window_info=include_window_info,
   954          element_type=element_type)
   955  
   956    recording = recording_manager.record([pcoll], max_n=n, max_duration=duration)
   957  
   958    try:
   959      elements = list(recording.stream(pcoll).read())
   960    except KeyboardInterrupt:
   961      recording.cancel()
   962      return pd.DataFrame()
   963  
   964    if n == float('inf'):
   965      n = None
   966  
   967    # Collecting DataFrames may have a length > n, so slice again to be sure. Note
   968    # that array[:None] returns everything.
   969    return elements_to_df(
   970        elements,
   971        include_window_info=include_window_info,
   972        element_type=element_type)[:n]
   973  
   974  
   975  @progress_indicated
   976  def show_graph(pipeline):
   977    """Shows the current pipeline shape of a given Beam pipeline as a DAG.
   978    """
   979    pipeline_graph.PipelineGraph(pipeline).display_graph()
   980  
   981  
   982  def evict_recorded_data(pipeline=None):
   983    """Forcefully evicts all recorded replayable data for the given pipeline. If
   984    no pipeline is specified, evicts for all user defined pipelines.
   985  
   986    Once invoked, Interactive Beam will record new data based on the guidance of
   987    options the next time it evaluates/visualizes PCollections or runs pipelines.
   988    """
   989    from apache_beam.runners.interactive.options import capture_control
   990    capture_control.evict_captured_data(pipeline)