github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/interactive/interactive_beam.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Module of Interactive Beam features that can be used in notebook.

The purpose of the module is to reduce the learning curve of Interactive Beam
users, provide a single place for importing, and add syntactic sugar for all
Interactive Beam components. It gives users the capability to interact with the
existing environment/session/context for Interactive Beam and to visualize
PCollections as bounded datasets. In the meantime, it hides the interactivity
implementation from users so that they can focus on developing Beam pipelines
without worrying about how hidden states in the interactive session are
managed.

A convention to import this module:
  from apache_beam.runners.interactive import interactive_beam as ib

Note: If you want backward-compatibility, only invoke interfaces provided by
this module in your notebook or application code.
"""
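
# A minimal usage sketch (assumes a notebook/IPython session; the element
# values are illustrative):
#
#   import apache_beam as beam
#   from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
#   from apache_beam.runners.interactive import interactive_beam as ib
#
#   p = beam.Pipeline(InteractiveRunner())
#   counts = (
#       p
#       | beam.Create(['to', 'be', 'or', 'not', 'to', 'be'])
#       | beam.combiners.Count.PerElement())
#   ib.show(counts)          # visualize the PCollection
#   df = ib.collect(counts)  # or materialize it into a pandas DataFrame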
33 """ 34 35 # pytype: skip-file 36 37 import logging 38 import warnings 39 from datetime import timedelta 40 from typing import Dict 41 from typing import List 42 from typing import Optional 43 from typing import Union 44 45 import pandas as pd 46 47 import apache_beam as beam 48 from apache_beam.dataframe.frame_base import DeferredBase 49 from apache_beam.options.pipeline_options import FlinkRunnerOptions 50 from apache_beam.runners.interactive import interactive_environment as ie 51 from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import DataprocClusterManager 52 from apache_beam.runners.interactive.dataproc.types import ClusterIdentifier 53 from apache_beam.runners.interactive.dataproc.types import ClusterMetadata 54 from apache_beam.runners.interactive.display import pipeline_graph 55 from apache_beam.runners.interactive.display.pcoll_visualization import visualize 56 from apache_beam.runners.interactive.display.pcoll_visualization import visualize_computed_pcoll 57 from apache_beam.runners.interactive.options import interactive_options 58 from apache_beam.runners.interactive.utils import deferred_df_to_pcollection 59 from apache_beam.runners.interactive.utils import elements_to_df 60 from apache_beam.runners.interactive.utils import find_pcoll_name 61 from apache_beam.runners.interactive.utils import progress_indicated 62 from apache_beam.runners.runner import PipelineState 63 64 _LOGGER = logging.getLogger(__name__) 65 66 67 class Options(interactive_options.InteractiveOptions): 68 """Options that guide how Interactive Beam works.""" 69 @property 70 def enable_recording_replay(self): 71 """Whether replayable source data recorded should be replayed for multiple 72 PCollection evaluations and pipeline runs as long as the data recorded is 73 still valid.""" 74 return self.capture_control._enable_capture_replay 75 76 @enable_recording_replay.setter 77 def enable_recording_replay(self, value): 78 """Sets whether source data recorded should be replayed. True - Enables 79 recording of replayable source data so that following PCollection 80 evaluations and pipeline runs always use the same data recorded; 81 False - Disables recording of replayable source data so that following 82 PCollection evaluation and pipeline runs always use new data from sources. 83 """ 84 # This makes sure the log handler is configured correctly in case the 85 # options are configured in an early stage. 86 _ = ie.current_env() 87 if value: 88 _LOGGER.info( 89 'Record replay is enabled. When a PCollection is evaluated or the ' 90 'pipeline is executed, existing data recorded from previous ' 91 'computations will be replayed for consistent results. If no ' 92 'recorded data is available, new data from recordable sources will ' 93 'be recorded.') 94 else: 95 _LOGGER.info( 96 'Record replay is disabled. The next time a PCollection is ' 97 'evaluated or the pipeline is executed, new data will always be ' 98 'consumed from sources in the pipeline. You will not have ' 99 'replayability until re-enabling this option.') 100 self.capture_control._enable_capture_replay = value 101 102 @property 103 def recordable_sources(self): 104 """Interactive Beam automatically records data from sources in this set. 
105 """ 106 return self.capture_control._capturable_sources 107 108 @property 109 def recording_duration(self): 110 """The data recording of sources ends as soon as the background source 111 recording job has run for this long.""" 112 return self.capture_control._capture_duration 113 114 @recording_duration.setter 115 def recording_duration(self, value): 116 """Sets the recording duration as a timedelta. The input can be a 117 datetime.timedelta, a possitive integer as seconds or a string 118 representation that is parsable by pandas.to_timedelta. 119 120 Example:: 121 122 # Sets the recording duration limit to 10 seconds. 123 ib.options.recording_duration = timedelta(seconds=10) 124 ib.options.recording_duration = 10 125 ib.options.recording_duration = '10s' 126 # Explicitly control the recordings. 127 ib.recordings.stop(p) 128 ib.recordings.clear(p) 129 ib.recordings.record(p) 130 # The next PCollection evaluation uses fresh data from sources, 131 # and the data recorded will be replayed until another clear. 132 ib.collect(some_pcoll) 133 """ 134 duration = None 135 if isinstance(value, int): 136 assert value > 0, 'Duration must be a positive value.' 137 duration = timedelta(seconds=value) 138 elif isinstance(value, str): 139 duration = pd.to_timedelta(value) 140 else: 141 assert isinstance(value, timedelta), ('The input can only abe a ' 142 'datetime.timedelta, a possitive integer as seconds, or a string ' 143 'representation that is parsable by pandas.to_timedelta.') 144 duration = value 145 if self.capture_control._capture_duration.total_seconds( 146 ) != duration.total_seconds(): 147 _ = ie.current_env() 148 _LOGGER.info( 149 'You have changed recording duration from %s seconds to %s seconds. ' 150 'To allow new data to be recorded for the updated duration the ' 151 'next time a PCollection is evaluated or the pipeline is executed, ' 152 'please invoke ib.recordings.stop, ib.recordings.clear and ' 153 'ib.recordings.record.', 154 self.capture_control._capture_duration.total_seconds(), 155 duration.total_seconds()) 156 self.capture_control._capture_duration = duration 157 158 @property 159 def recording_size_limit(self): 160 """The data recording of sources ends as soon as the size (in bytes) of data 161 recorded from recordable sources reaches the limit.""" 162 return self.capture_control._capture_size_limit 163 164 @recording_size_limit.setter 165 def recording_size_limit(self, value): 166 """Sets the recording size in bytes. 167 168 Example:: 169 170 # Sets the recording size limit to 1GB. 171 interactive_beam.options.recording_size_limit = 1e9 172 """ 173 if self.capture_control._capture_size_limit != value: 174 _ = ie.current_env() 175 _LOGGER.info( 176 'You have changed recording size limit from %s bytes to %s bytes. To ' 177 'allow new data to be recorded under the updated size limit the ' 178 'next time a PCollection is recorded or the pipeline is executed, ' 179 'please invoke ib.recordings.stop, ib.recordings.clear and ' 180 'ib.recordings.record.', 181 self.capture_control._capture_size_limit, 182 value) 183 self.capture_control._capture_size_limit = value 184 185 @property 186 def display_timestamp_format(self): 187 """The format in which timestamps are displayed. 188 189 Default is '%Y-%m-%d %H:%M:%S.%f%z', e.g. 2020-02-01 15:05:06.000015-08:00. 190 """ 191 return self._display_timestamp_format 192 193 @display_timestamp_format.setter 194 def display_timestamp_format(self, value): 195 """Sets the format in which timestamps are displayed. 

  @property
  def display_timestamp_format(self):
    """The format in which timestamps are displayed.

    Default is '%Y-%m-%d %H:%M:%S.%f%z', e.g. 2020-02-01 15:05:06.000015-08:00.
    """
    return self._display_timestamp_format

  @display_timestamp_format.setter
  def display_timestamp_format(self, value):
    """Sets the format in which timestamps are displayed.

    Default is '%Y-%m-%d %H:%M:%S.%f%z', e.g. 2020-02-01 15:05:06.000015-08:00.

    Example::

      # Sets the format to not display the timezone or microseconds.
      interactive_beam.options.display_timestamp_format = '%Y-%m-%d %H:%M:%S'
    """
    self._display_timestamp_format = value

  @property
  def display_timezone(self):
    """The timezone in which timestamps are displayed.

    Defaults to local timezone.
    """
    return self._display_timezone

  @display_timezone.setter
  def display_timezone(self, value):
    """Sets the timezone (datetime.tzinfo) in which timestamps are displayed.

    Defaults to local timezone.

    Example::

      # Imports the timezone library.
      from pytz import timezone

      # Will display all timestamps in the US/Eastern time zone.
      tz = timezone('US/Eastern')

      # You can also use dateutil.tz to get a timezone.
      tz = dateutil.tz.gettz('US/Eastern')

      interactive_beam.options.display_timezone = tz
    """
    self._display_timezone = value

  @property
  def cache_root(self):
    """The cache directory specified by the user.

    Defaults to None.
    """
    return self._cache_root

  @cache_root.setter
  def cache_root(self, value):
    """Sets the cache directory.

    Defaults to None.

    Example of local directory usage::
      interactive_beam.options.cache_root = '/Users/username/my/cache/dir'

    Example of GCS directory usage::
      interactive_beam.options.cache_root = 'gs://my-gcs-bucket/cache/dir'
    """
    _LOGGER.warning(
        'Interactive Beam has detected a set value for the cache_root '
        'option. Please note: existing cache managers will not have '
        'their current cache directory changed. The option must be '
        'set in Interactive Beam prior to the initialization of new '
        'pipelines to take effect. To apply changes to new pipelines, '
        'the kernel must be restarted or the pipeline creation code '
        'must be re-executed.')
    self._cache_root = value


class Recordings():
  """An introspection interface for recordings for pipelines.

  When a user materializes a PCollection onto disk (e.g. ib.show) for a
  streaming pipeline, a background source recording job is started. This job
  pulls data from all defined unbounded sources for that PCollection's
  pipeline. The following methods allow for introspection into that background
  recording job.
  """
  def describe(self, pipeline=None):
    # type: (Optional[beam.Pipeline]) -> dict[str, Any]  # noqa: F821

    """Returns a description of all the recordings for the given pipeline.

    If no pipeline is given then this returns a dictionary of descriptions for
    all pipelines.
    """

    # Create the RecordingManager if it doesn't already exist.
    if pipeline:
      ie.current_env().get_recording_manager(pipeline, create_if_absent=True)

    description = ie.current_env().describe_all_recordings()

    if pipeline:
      return description[pipeline]
    return description

  def clear(self, pipeline):
    # type: (beam.Pipeline) -> bool

    """Clears all recordings of the given pipeline. Returns True if cleared."""

    description = self.describe(pipeline)
    if (not PipelineState.is_terminal(description['state']) and
        description['state'] != PipelineState.STOPPED):
      _LOGGER.warning(
          'Trying to clear a recording with a running pipeline. Did '
          'you forget to call ib.recordings.stop?')
      return False

    ie.current_env().cleanup(pipeline)
    return True
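
  # Example sketch of a typical re-recording cycle for a streaming pipeline
  # `p` (assumes a background recording was started by an earlier ib.show):
  #
  #   ib.recordings.describe(p)  # inspect the current recording state
  #   ib.recordings.stop(p)      # stop the background recording job
  #   ib.recordings.clear(p)     # drop the recorded data
  #   ib.recordings.record(p)    # start recording fresh data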

  def stop(self, pipeline):
    # type: (beam.Pipeline) -> None

    """Stops the background source recording of the given pipeline."""

    recording_manager = ie.current_env().get_recording_manager(
        pipeline, create_if_absent=True)
    recording_manager.cancel()

  def record(self, pipeline):
    # type: (beam.Pipeline) -> bool

    """Starts a background source recording job for the given pipeline. Returns
    True if the recording job was started.
    """

    description = self.describe(pipeline)
    if (not PipelineState.is_terminal(description['state']) and
        description['state'] != PipelineState.STOPPED):
      _LOGGER.warning(
          'Trying to start a recording with a running pipeline. Did '
          'you forget to call ib.recordings.stop?')
      return False

    if description['size'] > 0:
      _LOGGER.warning(
          'A recording already exists for this pipeline. To start a '
          'recording, make sure to call ib.recordings.clear first.')
      return False

    recording_manager = ie.current_env().get_recording_manager(
        pipeline, create_if_absent=True)
    return recording_manager.record_pipeline()


class Clusters:
  """An interface to control clusters implicitly created and managed by
  the current interactive environment. This class is not needed and
  should not be used otherwise.

  Do not use it for clusters a user explicitly manages: e.g., if you have
  a Flink cluster running somewhere and provide the flink master when
  running a pipeline with the FlinkRunner, the cluster will not be tracked
  or managed by Beam.
  To reuse the same cluster for your pipelines, use the same pipeline
  options: e.g., a pipeline option with the same flink master if you are
  using FlinkRunner.

  This module is experimental. No backwards-compatibility guarantees.

  Interactive Beam automatically creates/reuses existing worker clusters to
  execute pipelines when it detects the need from configurations.
  Currently, the only supported cluster implementation is Flink running on
  Cloud Dataproc.

  To configure a pipeline to run on Cloud Dataproc with Flink, set the
  underlying runner of the InteractiveRunner to FlinkRunner and the pipeline
  options to indicate where on the Cloud the FlinkRunner should be deployed.

  An example to enable automatic Dataproc cluster creation/reuse::

    options = PipelineOptions([
        '--project=my-project',
        '--region=my-region',
        '--environment_type=DOCKER'])
    pipeline = beam.Pipeline(InteractiveRunner(
        underlying_runner=FlinkRunner()), options=options)

  Reusing the same pipeline options in another pipeline configures Interactive
  Beam to reuse the same Dataproc cluster implicitly managed by the current
  interactive environment.
  If a flink_master is identified as a known cluster, the corresponding
  cluster is also reused.
  Furthermore, if a cluster is explicitly created by using a pipeline as an
  identifier to a known cluster, the cluster is reused.

  An example::

    # If the pipeline runs on a known cluster, the code below reuses the
    # cluster manager without creating a new one.
    dcm = ib.clusters.create(pipeline)

  To provision the cluster, use WorkerOptions. Supported configurations are
  (an example sketch follows the list)::

    1. subnetwork
    2. num_workers
    3. machine_type
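
  An example sketch of setting these through WorkerOptions (the values are
  illustrative)::

    from apache_beam.options.pipeline_options import WorkerOptions

    worker_options = options.view_as(WorkerOptions)
    worker_options.num_workers = 3
    worker_options.machine_type = 'n1-standard-2'
    worker_options.subnetwork = 'my-subnetwork'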

  To configure a pipeline to run on an existing FlinkRunner deployed elsewhere,
  set the flink_master explicitly so no cluster will be created/reused.

  An example pipeline options to skip automatic Dataproc cluster usage::

    options = PipelineOptions([
        '--flink_master=some.self.hosted.flink:port',
        '--environment_type=DOCKER'])

  To configure a pipeline to run on a local FlinkRunner, explicitly set the
  default cluster metadata to None: ib.clusters.set_default_cluster(None).
  """
  # Explicitly set the Flink version here to ensure compatibility with 2.0
  # Dataproc images:
  # https://cloud.google.com/dataproc/docs/concepts/versioning/dataproc-release-2.0
  DATAPROC_FLINK_VERSION = '1.12'

  # The minimum worker number to create a Dataproc cluster.
  DATAPROC_MINIMUM_WORKER_NUM = 2

  # TODO(https://github.com/apache/beam/issues/21527): Fix the Dataproc image
  # version after a released image contains all missing dependencies for Flink
  # to run.
  # DATAPROC_IMAGE_VERSION = '2.0.XX-debian10'

  def __init__(self) -> None:
    self.dataproc_cluster_managers: Dict[ClusterMetadata,
                                         DataprocClusterManager] = {}
    self.master_urls: Dict[str, ClusterMetadata] = {}
    self.pipelines: Dict[beam.Pipeline, DataprocClusterManager] = {}
    self.default_cluster_metadata: Optional[ClusterMetadata] = None

  def create(
      self, cluster_identifier: ClusterIdentifier) -> DataprocClusterManager:
    """Creates a Dataproc cluster manager provisioned for the cluster
    identified. If the cluster is known, returns an existing cluster manager.
    """
    # Try to get some not-None cluster metadata.
    cluster_metadata = self.cluster_metadata(cluster_identifier)
    if not cluster_metadata:
      raise ValueError(
          'Unknown cluster identifier: %s. Cannot create or reuse '
          'a Dataproc cluster.' % cluster_identifier)
    if not cluster_metadata.region:
      _LOGGER.info(
          'No region information was detected, defaulting Dataproc cluster '
          'region to: us-central1.')
      cluster_metadata.region = 'us-central1'
    elif cluster_metadata.region == 'global':
      # The global region is unsupported as it will be eventually deprecated.
      raise ValueError('Clusters in the global region are not supported.')
    # else use the provided region.
    if (cluster_metadata.num_workers and
        cluster_metadata.num_workers < self.DATAPROC_MINIMUM_WORKER_NUM):
      _LOGGER.info(
          'At least %s workers are required for a cluster, defaulting to %s.',
          self.DATAPROC_MINIMUM_WORKER_NUM,
          self.DATAPROC_MINIMUM_WORKER_NUM)
      cluster_metadata.num_workers = self.DATAPROC_MINIMUM_WORKER_NUM
    known_dcm = self.dataproc_cluster_managers.get(cluster_metadata, None)
    if known_dcm:
      return known_dcm
    dcm = DataprocClusterManager(cluster_metadata)
    dcm.create_flink_cluster()
    # ClusterMetadata with derivative fields populated by the dcm.
    derived_meta = dcm.cluster_metadata
    self.dataproc_cluster_managers[derived_meta] = dcm
    self.master_urls[derived_meta.master_url] = derived_meta
    # Update the default cluster metadata to the one just created.
    self.set_default_cluster(derived_meta)
    return dcm
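
  # Example sketch: two pipelines constructed with the same pipeline options
  # are served by the same implicitly managed cluster (assumes `options`
  # configures a Dataproc-backed FlinkRunner as described above):
  #
  #   p1 = beam.Pipeline(InteractiveRunner(FlinkRunner()), options=options)
  #   p2 = beam.Pipeline(InteractiveRunner(FlinkRunner()), options=options)
  #   dcm1 = ib.clusters.create(p1)
  #   dcm2 = ib.clusters.create(p2)  # expected to reuse the same cluster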

  def cleanup(
      self,
      cluster_identifier: Optional[ClusterIdentifier] = None,
      force: bool = False) -> None:
    """Cleans up the cluster associated with the given cluster_identifier.

    When no cluster_identifier is provided: if force is True, cleans up all
    clusters; otherwise, does a dry run and NOOPs.
    If a beam.Pipeline is given as the ClusterIdentifier while multiple
    pipelines share the same cluster, it only cleans up the association between
    the pipeline and the cluster identified.
    If the cluster_identifier is unknown, NOOP.
    """
    if not cluster_identifier:
      dcm_to_cleanup = set(self.dataproc_cluster_managers.values())
      if force:
        for dcm in dcm_to_cleanup:
          self._cleanup(dcm)
        self.default_cluster_metadata = None
      else:
        _LOGGER.warning(
            'No cluster_identifier provided. If you intend to '
            'clean up all clusters, invoke ib.clusters.cleanup(force=True). '
            'Current clusters are %s.',
            self.describe())
    elif isinstance(cluster_identifier, beam.Pipeline):
      p = cluster_identifier
      dcm = self.pipelines.pop(p, None)
      if dcm:
        dcm.pipelines.remove(p)
        warnings.filterwarnings(
            'ignore',
            'options is deprecated since First stable release. References to '
            '<pipeline>.options will not be supported',
            category=DeprecationWarning)
        p_flink_options = p.options.view_as(FlinkRunnerOptions)
        p_flink_options.flink_master = '[auto]'
        p_flink_options.flink_version = None
        # Only cleans up when there is no pipeline using the cluster.
        if not dcm.pipelines:
          self._cleanup(dcm)
    else:
      if isinstance(cluster_identifier, str):
        meta = self.master_urls.get(cluster_identifier, None)
      else:
        meta = cluster_identifier
      dcm = self.dataproc_cluster_managers.get(meta, None)
      if dcm:
        self._cleanup(dcm)

  def describe(
      self,
      cluster_identifier: Optional[ClusterIdentifier] = None
  ) -> Union[ClusterMetadata, List[ClusterMetadata]]:
    """Describes the ClusterMetadata by a ClusterIdentifier.

    If no cluster_identifier is given or if the cluster_identifier is unknown,
    it returns descriptions for all known clusters.

    Example usage::

      # Describe the cluster executing work for a pipeline.
      ib.clusters.describe(pipeline)
      # Describe the cluster with the flink master url.
      ib.clusters.describe(master_url)
      # Describe all existing clusters.
      ib.clusters.describe()
    """
    if cluster_identifier:
      meta = self._cluster_metadata(cluster_identifier)
      if meta in self.dataproc_cluster_managers:
        return meta
    return list(self.dataproc_cluster_managers.keys())

  def set_default_cluster(
      self, cluster_identifier: Optional[ClusterIdentifier] = None) -> None:
    """Temporarily sets the default metadata for creating or reusing a
    DataprocClusterManager. It is always updated to the most recently created
    cluster.

    If no known ClusterMetadata can be identified by the ClusterIdentifier,
    NOOP.
    If None is set, the next time Flink is in use and no cluster is explicitly
    configured by a pipeline, the job runs locally.
    """
    if cluster_identifier:
      self.default_cluster_metadata = self.cluster_metadata(cluster_identifier)
    else:
      self.default_cluster_metadata = None
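
  # Example sketch: pinning or resetting the default cluster (assumes
  # `master_url` refers to a cluster already known to the environment):
  #
  #   ib.clusters.set_default_cluster(master_url)  # reuse a known cluster
  #   ib.clusters.set_default_cluster(None)        # next Flink job without an
  #                                                # explicit cluster runs
  #                                                # locally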
551 """ 552 if cluster_identifier: 553 self.default_cluster_metadata = self.cluster_metadata(cluster_identifier) 554 else: 555 self.default_cluster_metadata = None 556 557 def cluster_metadata( 558 self, 559 cluster_identifier: Optional[ClusterIdentifier] = None 560 ) -> Optional[ClusterMetadata]: 561 """Fetches the ClusterMetadata by a ClusterIdentifier that could be a 562 URL in string, a Beam pipeline, or an equivalent to a known ClusterMetadata; 563 564 If the given cluster_identifier is an URL or a pipeline that is unknown to 565 the current environment, the default cluster metadata (could be None) is 566 returned. 567 If the given cluster_identifier is a ClusterMetadata but unknown to the 568 current environment, passes it through (NOOP). 569 """ 570 meta = self._cluster_metadata(cluster_identifier) 571 return meta if meta else self.default_cluster_metadata 572 573 def _cluster_metadata( 574 self, 575 cluster_identifier: Optional[ClusterIdentifier] = None 576 ) -> Optional[ClusterMetadata]: 577 meta = None 578 if cluster_identifier: 579 if isinstance(cluster_identifier, str): 580 meta = self.master_urls.get(cluster_identifier, None) 581 elif isinstance(cluster_identifier, beam.Pipeline): 582 dcm = self.pipelines.get(cluster_identifier, None) 583 if dcm: 584 meta = dcm.cluster_metadata 585 elif isinstance(cluster_identifier, ClusterMetadata): 586 meta = cluster_identifier 587 if meta in self.dataproc_cluster_managers: 588 meta = self.dataproc_cluster_managers[meta].cluster_metadata 589 elif (meta and self.default_cluster_metadata and 590 meta.cluster_name == self.default_cluster_metadata.cluster_name): 591 _LOGGER.warning( 592 'Cannot change the configuration of the running cluster %s. ' 593 'Existing is %s, desired is %s.', 594 self.default_cluster_metadata.cluster_name, 595 self.default_cluster_metadata, 596 meta) 597 meta.reset_name() 598 _LOGGER.warning( 599 'To avoid conflict, issuing a new cluster name %s ' 600 'for a new cluster.', 601 meta.cluster_name) 602 else: 603 raise TypeError( 604 'A cluster_identifier should be Optional[Union[str, ' 605 'beam.Pipeline, ClusterMetadata], instead %s was given.', 606 type(cluster_identifier)) 607 return meta 608 609 def _cleanup(self, dcm: DataprocClusterManager) -> None: 610 dcm.cleanup() 611 self.dataproc_cluster_managers.pop(dcm.cluster_metadata, None) 612 self.master_urls.pop(dcm.cluster_metadata.master_url, None) 613 for p in dcm.pipelines: 614 self.pipelines.pop(p, None) 615 if dcm.cluster_metadata == self.default_cluster_metadata: 616 self.default_cluster_metadata = None 617 618 619 # Users can set options to guide how Interactive Beam works. 620 # Examples: 621 # ib.options.enable_recording_replay = False/True 622 # ib.options.recording_duration = '1m' 623 # ib.options.recordable_sources.add(SourceClass) 624 # Check the docstrings for detailed usages. 625 options = Options() 626 627 # Users can introspect into recordings by using the recordings class. 628 # Examples: 629 # p = beam.Pipeline(InteractiveRunner()) 630 # elems = p | beam.Create([1, 2, 3]) 631 # ib.show(elems) 632 # ib.recordings.describe(p) 633 recordings = Recordings() 634 635 # Users can interact with the clusters used by their environment. 636 # Examples: 637 # ib.clusters.describe(p) 638 # Check the docstrings for detailed usages. 639 clusters = Clusters() 640 641 642 def watch(watchable): 643 """Monitors a watchable. 644 645 This allows Interactive Beam to implicitly pass on the information about the 646 location of your pipeline definition. 

  The current implementation mainly watches for PCollection variables defined
  in user code. A watchable can be a dictionary of variable metadata such as
  locals(), a str name of a module, a module object or an instance of a class.
  The variable can come from any scope, even local variables in a method of a
  class defined in a module.

  Below are all valid::

    watch(__main__)  # if import __main__ is already invoked
    watch('__main__')  # does not require invoking import __main__ beforehand
    watch(self)  # inside a class
    watch(SomeInstance())  # an instance of a class
    watch(locals())  # inside a function, watching local variables within

  If you write a Beam pipeline in the __main__ module directly, since the
  __main__ module is always watched, you don't have to instruct Interactive
  Beam. If your Beam pipeline is defined in some module other than __main__,
  such as inside a class function or a unit test, you can watch() the scope.

  For example::

    class Foo(object):
      def run_pipeline(self):
        with beam.Pipeline() as p:
          init_pcoll = p | 'Init Create' >> beam.Create(range(10))
          watch(locals())
        return init_pcoll
    init_pcoll = Foo().run_pipeline()

  Interactive Beam caches init_pcoll for the first run.

  Then you can use::

    show(init_pcoll)

  to visualize data from init_pcoll once the pipeline is executed.
  """
  ie.current_env().watch(watchable)


@progress_indicated
def show(
    *pcolls,
    include_window_info=False,
    visualize_data=False,
    n='inf',
    duration='inf'):
  # type: (*Union[Dict[Any, PCollection], Iterable[PCollection], PCollection], bool, bool, Union[int, str], Union[int, str]) -> None  # noqa: F821

  """Shows given PCollections in an interactive exploratory way if used within
  a notebook, or prints a heading and sampled data if used within an ipython
  shell. Noop if used in a non-interactive environment.

  Args:
    include_window_info: (optional) if True, windowing information of the
      data will be visualized too. Default is false.
    visualize_data: (optional) by default, the visualization contains data
      tables rendering data from given pcolls separately as if they are
      converted into dataframes. If visualize_data is True, there will be an
      additional drill-down widget and a statistical overview widget of the
      data. Otherwise, those 2 data visualization widgets will not be
      displayed.
    n: (optional) max number of elements to visualize. Default 'inf'.
    duration: (optional) max duration of elements to read in integer seconds or
      a string duration. Default 'inf'.

  The given pcolls can be a dictionary of PCollections (as values), an iterable
  of PCollections, or plain PCollection values.

  The user can specify either the max number of elements with `n` to read
  or the maximum duration of elements to read with `duration`. When a limiter
  is not supplied, it is assumed to be infinite.

  By default, the visualization contains data tables rendering data from given
  pcolls separately as if they are converted into dataframes. If visualize_data
  is True, there will be an additional drill-down widget and a statistical
  overview widget of the data. Otherwise, those 2 data visualization widgets
  will not be displayed.
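
  For example, multiple PCollections and read limits can be passed at once
  (a sketch; `square` and `cube` are defined as in the example further below)::

    show({'squares': square, 'cubes': cube}, n=10)
    show([square, cube], duration='30s', include_window_info=True)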

  show() builds an ad hoc pipeline fragment that includes only the transforms
  necessary to produce data for the given PCollections pcolls, runs the
  pipeline fragment to compute data for those pcolls, and then visualizes the
  data.

  The function is always blocking. If used within a notebook, the data
  visualized might be dynamically updated before the function returns as more
  and more data could get processed and emitted when the pipeline fragment is
  being executed. If used within an ipython shell, there will be no dynamic
  plotting but a static plotting at the end of the pipeline fragment execution.

  The PCollections given must belong to the same pipeline.

  For example::

    p = beam.Pipeline(InteractiveRunner())
    init = p | 'Init' >> beam.Create(range(1000))
    square = init | 'Square' >> beam.Map(lambda x: x * x)
    cube = init | 'Cube' >> beam.Map(lambda x: x ** 3)

    # Below builds a pipeline fragment from the defined pipeline `p` that
    # contains only applied transforms of `Init` and `Square`. Then the
    # interactive runner runs the pipeline fragment implicitly to compute data
    # represented by PCollection `square` and visualizes it.
    show(square)

    # This is equivalent to `show(square)` because `square` depends on `init`
    # and `init` is included in the pipeline fragment and computed anyway.
    show(init, square)

    # Below is similar to running `p.run()`. It computes data for both
    # PCollection `square` and PCollection `cube`, then visualizes them.
    show(square, cube)
  """
  flatten_pcolls = []
  for pcoll_container in pcolls:
    if isinstance(pcoll_container, dict):
      flatten_pcolls.extend(pcoll_container.values())
    elif isinstance(pcoll_container, (beam.pvalue.PCollection, DeferredBase)):
      flatten_pcolls.append(pcoll_container)
    else:
      try:
        flatten_pcolls.extend(iter(pcoll_container))
      except TypeError:
        raise ValueError(
            'The given pcoll %s is not a dict, an iterable or a PCollection.' %
            pcoll_container)

  # Iterate through the given PCollections and convert any deferred DataFrames
  # or Series into PCollections.
  pcolls = set()

  # The element type is used to help visualize the given PCollection. For the
  # deferred DataFrame/Series case it is the proxy of the frame.
  element_types = {}
  for pcoll in flatten_pcolls:
    if isinstance(pcoll, DeferredBase):
      pcoll, element_type = deferred_df_to_pcollection(pcoll)
      watch({'anonymous_pcollection_{}'.format(id(pcoll)): pcoll})
    else:
      element_type = pcoll.element_type

    element_types[pcoll] = element_type

    pcolls.add(pcoll)
    assert isinstance(pcoll, beam.pvalue.PCollection), (
        '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))

  assert len(pcolls) > 0, (
      'Need at least 1 PCollection to show data visualization.')

  pcoll_pipeline = next(iter(pcolls)).pipeline
  user_pipeline = ie.current_env().user_pipeline(pcoll_pipeline)
  # Possibly showing a PCollection defined in a local scope that is not
  # explicitly watched. Ad hoc watch it though it's a little late.
  if not user_pipeline:
    watch({'anonymous_pipeline_{}'.format(id(pcoll_pipeline)): pcoll_pipeline})
    user_pipeline = pcoll_pipeline
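
  # Normalize and validate the read limits: `n` and `duration` accept either a
  # positive integer or the string 'inf' (read until the recording stops).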
  if isinstance(n, str):
    assert n == 'inf', (
        'Currently only the string \'inf\' is supported. This denotes reading '
        'elements until the recording is stopped via a kernel interrupt.')
  elif isinstance(n, int):
    assert n > 0, 'n needs to be positive or the string \'inf\''

  if isinstance(duration, int):
    assert duration > 0, ('duration needs to be positive, a duration string, '
                          'or the string \'inf\'')

  if n == 'inf':
    n = float('inf')

  if duration == 'inf':
    duration = float('inf')

  previously_computed_pcolls = {
      pcoll
      for pcoll in pcolls if pcoll in ie.current_env().computed_pcollections
  }
  for pcoll in previously_computed_pcolls:
    visualize_computed_pcoll(
        find_pcoll_name(pcoll),
        pcoll,
        n,
        duration,
        include_window_info=include_window_info,
        display_facets=visualize_data)
  pcolls = pcolls - previously_computed_pcolls

  recording_manager = ie.current_env().get_recording_manager(
      user_pipeline, create_if_absent=True)
  recording = recording_manager.record(pcolls, max_n=n, max_duration=duration)

  # Catch a KeyboardInterrupt to gracefully cancel the recording and
  # visualizations.
  try:
    # If in notebook, static plotting computed pcolls as computation is done.
    if ie.current_env().is_in_notebook:
      for stream in recording.computed().values():
        visualize(
            stream,
            include_window_info=include_window_info,
            display_facets=visualize_data,
            element_type=element_types[stream.pcoll])
    elif ie.current_env().is_in_ipython:
      for stream in recording.computed().values():
        visualize(
            stream,
            include_window_info=include_window_info,
            element_type=element_types[stream.pcoll])
    if recording.is_computed():
      return

    # If in notebook, dynamic plotting as computation goes.
    if ie.current_env().is_in_notebook:
      for stream in recording.uncomputed().values():
        visualize(
            stream,
            dynamic_plotting_interval=1,
            include_window_info=include_window_info,
            display_facets=visualize_data,
            element_type=element_types[stream.pcoll])

    # Invoke wait_until_finish to ensure the blocking nature of this API
    # without relying on the run to be blocking.
    recording.wait_until_finish()

    # If just in ipython shell, plotting once when the computation is
    # completed.
    if ie.current_env().is_in_ipython and not ie.current_env().is_in_notebook:
      for stream in recording.computed().values():
        visualize(stream, include_window_info=include_window_info)

  except KeyboardInterrupt:
    if recording:
      recording.cancel()


@progress_indicated
def collect(pcoll, n='inf', duration='inf', include_window_info=False):
  """Materializes the elements from a PCollection into a DataFrame.

  This reads each element from the cache files and reads only the amount that
  it needs into memory. The user can specify either the max number of elements
  to read or the maximum duration of elements to read. When a limiter is not
  supplied, it is assumed to be infinite.

  Args:
    n: (optional) max number of elements to visualize. Default 'inf'.
    duration: (optional) max duration of elements to read in integer seconds or
      a string duration. Default 'inf'.
    include_window_info: (optional) if True, appends the windowing information
      to each row. Default False.
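
  The given pcoll can also be a deferred DataFrame/Series created with the Beam
  DataFrame API; it is converted back to a PCollection before being collected.
  A sketch (`rows` is an assumed schema-aware PCollection)::

    from apache_beam.dataframe.convert import to_dataframe

    df = to_dataframe(rows)
    collected = collect(df, n=100)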

  For example::

    p = beam.Pipeline(InteractiveRunner())
    init = p | 'Init' >> beam.Create(range(10))
    square = init | 'Square' >> beam.Map(lambda x: x * x)

    # Run the pipeline and bring the PCollection into memory as a DataFrame.
    in_memory_square = collect(square, n=5)
  """
  # Remember the element type so we can make an informed decision on how to
  # collect the result in elements_to_df.
  if isinstance(pcoll, DeferredBase):
    # Get the proxy so we can get the output shape of the DataFrame.
    pcoll, element_type = deferred_df_to_pcollection(pcoll)
    watch({'anonymous_pcollection_{}'.format(id(pcoll)): pcoll})
  else:
    element_type = pcoll.element_type

  assert isinstance(pcoll, beam.pvalue.PCollection), (
      '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))

  if isinstance(n, str):
    assert n == 'inf', (
        'Currently only the string \'inf\' is supported. This denotes reading '
        'elements until the recording is stopped via a kernel interrupt.')
  elif isinstance(n, int):
    assert n > 0, 'n needs to be positive or the string \'inf\''

  if isinstance(duration, int):
    assert duration > 0, ('duration needs to be positive, a duration string, '
                          'or the string \'inf\'')

  if n == 'inf':
    n = float('inf')

  if duration == 'inf':
    duration = float('inf')

  user_pipeline = ie.current_env().user_pipeline(pcoll.pipeline)
  # Possibly collecting a PCollection defined in a local scope that is not
  # explicitly watched. Ad hoc watch it though it's a little late.
  if not user_pipeline:
    watch({'anonymous_pipeline_{}'.format(id(pcoll.pipeline)): pcoll.pipeline})
    user_pipeline = pcoll.pipeline
  recording_manager = ie.current_env().get_recording_manager(
      user_pipeline, create_if_absent=True)

  # If already computed, directly read the stream and return.
  if pcoll in ie.current_env().computed_pcollections:
    pcoll_name = find_pcoll_name(pcoll)
    elements = list(
        recording_manager.read(pcoll_name, pcoll, n, duration).read())
    return elements_to_df(
        elements,
        include_window_info=include_window_info,
        element_type=element_type)

  recording = recording_manager.record([pcoll], max_n=n, max_duration=duration)

  try:
    elements = list(recording.stream(pcoll).read())
  except KeyboardInterrupt:
    recording.cancel()
    return pd.DataFrame()

  if n == float('inf'):
    n = None

  # Collected DataFrames may have a length > n, so slice again to be sure.
  # Note that array[:None] returns everything.
  return elements_to_df(
      elements,
      include_window_info=include_window_info,
      element_type=element_type)[:n]


@progress_indicated
def show_graph(pipeline):
  """Shows the current pipeline shape of a given Beam pipeline as a DAG."""
  pipeline_graph.PipelineGraph(pipeline).display_graph()


def evict_recorded_data(pipeline=None):
  """Forcefully evicts all recorded replayable data for the given pipeline. If
  no pipeline is specified, evicts for all user-defined pipelines.

  Once invoked, Interactive Beam will record new data based on the guidance of
  options the next time it evaluates/visualizes PCollections or runs pipelines.
  """
  from apache_beam.runners.interactive.options import capture_control
  capture_control.evict_captured_data(pipeline)
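
# Example sketch: rendering the pipeline DAG and evicting recorded data so the
# next evaluation reads fresh data from sources (assumes `p` is an interactive
# pipeline as in the examples above).
#
#   ib.show_graph(p)
#   ib.evict_recorded_data(p)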