#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Module of the current Interactive Beam environment.

For internal use only; no backwards-compatibility guarantees.
Provides interfaces to interact with the existing Interactive Beam
environment. External Interactive Beam users, please use the interactive_beam
module in application code or notebooks.
"""
# pytype: skip-file

import atexit
import importlib
import logging
import os
import tempfile
import warnings
from collections.abc import Iterable
from pathlib import PurePath

import apache_beam as beam
from apache_beam.runners import DataflowRunner
from apache_beam.runners import runner
from apache_beam.runners.direct import direct_runner
from apache_beam.runners.interactive import cache_manager as cache
from apache_beam.runners.interactive.messaging.interactive_environment_inspector import InteractiveEnvironmentInspector
from apache_beam.runners.interactive.recording_manager import RecordingManager
from apache_beam.runners.interactive.sql.sql_chain import SqlChain
from apache_beam.runners.interactive.user_pipeline_tracker import UserPipelineTracker
from apache_beam.runners.interactive.utils import assert_bucket_exists
from apache_beam.runners.interactive.utils import detect_pipeline_runner
from apache_beam.runners.interactive.utils import register_ipython_log_handler
from apache_beam.utils.interactive_utils import is_in_ipython
from apache_beam.utils.interactive_utils import is_in_notebook

# The Interactive Beam user flow is data-centric rather than pipeline-centric,
# so there is only one global interactive environment instance that manages
# the implementation that enables interactivity.
_interactive_beam_env = None

_LOGGER = logging.getLogger(__name__)
# By `format(customized_script=xxx)`, the given `customized_script` is
# guaranteed to be executed with access to a jQuery with the datatable plugin
# configured, which is useful so that any `customized_script` is resilient to
# browser refresh. Inside `customized_script`, use `$` as jQuery.
_JQUERY_WITH_DATATABLE_TEMPLATE = """
    if (typeof window.interactive_beam_jquery == 'undefined') {{
      var jqueryScript = document.createElement('script');
      jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';
      jqueryScript.type = 'text/javascript';
      jqueryScript.onload = function() {{
        var datatableScript = document.createElement('script');
        datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';
        datatableScript.type = 'text/javascript';
        datatableScript.onload = function() {{
          window.interactive_beam_jquery = jQuery.noConflict(true);
          window.interactive_beam_jquery(document).ready(function($){{
            {customized_script}
          }});
        }}
        document.head.appendChild(datatableScript);
      }};
      document.head.appendChild(jqueryScript);
    }} else {{
      window.interactive_beam_jquery(document).ready(function($){{
        {customized_script}
      }});
    }}"""

# By `format(hrefs=xxx)`, the given `hrefs` will be imported as HTML imports.
# Since HTML import might not be supported by the browser, we check whether it
# is supported; if so, import the HTMLs, else set up webcomponents and chain
# the HTML import to the end of onload.
_HTML_IMPORT_TEMPLATE = """
    var import_html = () => {{
      {hrefs}.forEach(href => {{
        var link = document.createElement('link');
        link.rel = 'import'
        link.href = href;
        document.head.appendChild(link);
      }});
    }}
    if ('import' in document.createElement('link')) {{
      import_html();
    }} else {{
      var webcomponentScript = document.createElement('script');
      webcomponentScript.src = 'https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js';
      webcomponentScript.type = 'text/javascript';
      webcomponentScript.onload = function(){{
        import_html();
      }};
      document.head.appendChild(webcomponentScript);
    }}"""


def current_env():
  """Gets the current Interactive Beam environment."""
  global _interactive_beam_env
  if not _interactive_beam_env:
    _interactive_beam_env = InteractiveEnvironment()
  return _interactive_beam_env


def new_env():
  """Creates a new Interactive Beam environment to replace the current one."""
  global _interactive_beam_env
  if _interactive_beam_env:
    _interactive_beam_env.cleanup()
  _interactive_beam_env = None
  return current_env()
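
# A minimal usage sketch (illustrative comment only, never executed): the
# environment is a process-wide singleton, so repeated current_env() calls
# return the same instance until new_env() cleans it up and replaces it.
#
#   from apache_beam.runners.interactive import interactive_environment as ie
#
#   env = ie.current_env()
#   assert env is ie.current_env()  # same singleton instance
#   fresh = ie.new_env()            # cleans up the old env, builds a new one
#   assert fresh is not env
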

class InteractiveEnvironment(object):
  """An interactive environment with cache and pipeline variable metadata.

  Interactive Beam will use the watched variable information to determine if a
  PCollection is assigned to a variable in the user pipeline definition. When
  executing the pipeline, interactivity is applied with an implicit cache
  mechanism for those PCollections if the pipeline is interactive. Users can
  also visualize and introspect those PCollections in user code since they
  have handles to the variables.
  """
  def __init__(self):
    # Registers a cleanup routine when the system exits.
    atexit.register(self.cleanup)
    # Holds cache managers that manage source recording and intermediate
    # PCollection cache for each pipeline. Each key is a stringified
    # user-defined pipeline instance's id.
    self._cache_managers = {}
    # Holds RecordingManagers keyed by pipeline instance id.
    self._recording_managers = {}
    # Holds class instances, module objects, and strings of module names.
    self._watching_set = set()
    # Holds a list of variable dicts (Dict[str, object]).
    self._watching_dict_list = []
    # Holds results of main jobs as Dict[str, PipelineResult].
    # Each key is a pipeline instance defined by the end user. The
    # InteractiveRunner is responsible for populating this dictionary
    # implicitly.
    self._main_pipeline_results = {}
    # Holds background caching jobs as Dict[str, BackgroundCachingJob].
    # Each key is a pipeline instance defined by the end user. The
    # InteractiveRunner or its enclosing scope is responsible for populating
    # this dictionary implicitly when a background caching job is started.
    self._background_caching_jobs = {}
    # Holds TestStreamServiceControllers that control gRPC servers serving
    # events as a test stream of TestStreamPayload.Event.
    # Dict[str, TestStreamServiceController]. Each key is a pipeline instance
    # defined by the end user. The InteractiveRunner or its enclosing scope is
    # responsible for populating this dictionary implicitly when a new
    # controller is created to start a new gRPC server. The server stays
    # alive until a new background caching job is started, invalidating
    # everything the gRPC server serves.
    self._test_stream_service_controllers = {}
    self._cached_source_signature = {}
    self._tracked_user_pipelines = UserPipelineTracker()
    from apache_beam.runners.interactive.interactive_beam import clusters
    self.clusters = clusters

    # Tracks the computation completeness of PCollections. PCollections
    # tracked here don't need to be re-computed when data introspection is
    # needed.
    self._computed_pcolls = set()
    # Always watch the __main__ module.
    self.watch('__main__')
    # Check if [interactive] dependencies are installed.
    try:
      import IPython  # pylint: disable=unused-import
      import timeloop  # pylint: disable=unused-import
      from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator  # pylint: disable=unused-import
      from google.cloud import dataproc_v1  # pylint: disable=unused-import
      self._is_interactive_ready = True
    except ImportError:
      self._is_interactive_ready = False
      _LOGGER.warning(
          'Dependencies required for Interactive Beam PCollection '
          'visualization are not available, please use: `pip '
          'install apache-beam[interactive]` to install necessary '
          'dependencies to enable all data visualization features.')

    self._is_in_ipython = is_in_ipython()
    self._is_in_notebook = is_in_notebook()
    if not self._is_in_ipython:
      _LOGGER.warning(
          'You cannot use Interactive Beam features when you are '
          'not in an interactive environment such as a Jupyter '
          'notebook or ipython terminal.')
    if self._is_in_ipython and not self._is_in_notebook:
      _LOGGER.warning(
          'You have limited Interactive Beam features since your '
          'ipython kernel is not connected to any notebook frontend.')
    if self._is_in_notebook:
      self.load_jquery_with_datatable()
      register_ipython_log_handler()

    # A singleton inspector instance to message information of the current
    # environment to other applications.
    self._inspector = InteractiveEnvironmentInspector()
    # A similar singleton inspector, except it includes synthetic variables
    # generated by Interactive Beam.
    self._inspector_with_synthetic = InteractiveEnvironmentInspector(
        ignore_synthetic=False)

    self.sql_chain = {}
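
  # Illustrative note (a sketch, not part of the original module): all the
  # per-pipeline state initialized above is keyed by str(id(pipeline)), so
  # lookups are identity-based and only valid for the lifetime of the
  # pipeline object:
  #
  #   p = beam.Pipeline()
  #   key = str(id(p))
  #   current_env()._cache_managers.get(key)  # None until created on demand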

  @property
  def options(self):
    """A reference to the global interactive options.

    Provided to avoid an import loop or excessive dynamic imports. All
    internal Interactive Beam modules should access interactive_beam.options
    through this property.
    """
    from apache_beam.runners.interactive.interactive_beam import options
    return options

  @property
  def is_interactive_ready(self):
    """Whether the [interactive] dependencies are installed."""
    return self._is_interactive_ready

  @property
  def is_in_ipython(self):
    """Whether the runtime is within an IPython kernel."""
    return self._is_in_ipython

  @property
  def is_in_notebook(self):
    """Whether the kernel is connected to a notebook frontend.

    If not, it could be that the user is using the kernel in a terminal or a
    unit test.
    """
    return self._is_in_notebook

  @property
  def inspector(self):
    """Gets the singleton InteractiveEnvironmentInspector to retrieve
    information consumable by other applications, such as a notebook
    extension."""
    return self._inspector

  @property
  def inspector_with_synthetic(self):
    """Gets the singleton InteractiveEnvironmentInspector with additional
    synthetic variables generated by Interactive Beam. Internally used."""
    return self._inspector_with_synthetic
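
  # A hedged usage sketch: `options` proxies the module-level
  # interactive_beam.options object, letting internal modules read user
  # settings without importing interactive_beam at module load time.
  # The bucket name below is hypothetical:
  #
  #   env = current_env()
  #   env.options.cache_root  # same object as interactive_beam.options
  #   env.options.cache_root = 'gs://my-bucket/cache'  # hypothetical bucket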

  def cleanup_pipeline(self, pipeline):
    from apache_beam.runners.interactive import background_caching_job as bcj
    bcj.attempt_to_cancel_background_caching_job(pipeline)
    bcj.attempt_to_stop_test_stream_service(pipeline)
    cache_manager = self.get_cache_manager(pipeline)
    # The recording manager performs cache manager cleanup during eviction,
    # so we don't need to clean it up here.
    if cache_manager and self.get_recording_manager(pipeline) is None:
      cache_manager.cleanup()
    self.clusters.cleanup(pipeline)

  def cleanup_environment(self):
    for _, job in self._background_caching_jobs.items():
      if job:
        job.cancel()
    for _, controller in self._test_stream_service_controllers.items():
      if controller:
        controller.stop()
    for pipeline_id, cache_manager in self._cache_managers.items():
      # The recording manager performs cache manager cleanup during eviction,
      # so we don't need to clean it up here.
      if cache_manager and pipeline_id not in self._recording_managers:
        cache_manager.cleanup()
    self.clusters.cleanup(force=True)

  def cleanup(self, pipeline=None):
    """Cleans up cached states for the given pipeline. Noop if the given
    pipeline is absent from the environment. Cleans up for all pipelines
    if no pipeline is specified."""
    if pipeline:
      self.cleanup_pipeline(pipeline)
    else:
      self.cleanup_environment()

    self.evict_recording_manager(pipeline)
    self.evict_background_caching_job(pipeline)
    self.evict_test_stream_service_controller(pipeline)
    self.evict_computed_pcollections(pipeline)
    self.evict_cached_source_signature(pipeline)
    self.evict_pipeline_result(pipeline)
    self.evict_tracked_pipelines(pipeline)
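
  # Illustrative cleanup semantics (sketch only, never executed):
  #
  #   env.cleanup(p)  # cancels jobs and evicts every cached state for p only
  #   env.cleanup()   # tears down jobs, caches and clusters for all pipelines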

  def _track_user_pipelines(self, watchable):
    """Tracks user pipelines from the given watchable."""

    pipelines = set()
    if isinstance(watchable, beam.Pipeline):
      pipelines.add(watchable)
    elif isinstance(watchable, dict):
      for v in watchable.values():
        if isinstance(v, beam.Pipeline):
          pipelines.add(v)
    elif isinstance(watchable, Iterable):
      for v in watchable:
        if isinstance(v, beam.Pipeline):
          pipelines.add(v)
    for p in pipelines:
      self._tracked_user_pipelines.add_user_pipeline(p)
      _ = self.get_cache_manager(p, create_if_absent=True)
      _ = self.get_recording_manager(p, create_if_absent=True)

  def watch(self, watchable):
    """Watches a watchable.

    A watchable can be a dictionary of variable metadata such as locals(), a
    str name of a module, a module object, or an instance of a class. The
    variable can come from any scope, even a local one. Duplicated variable
    naming doesn't matter since they are different instances. Duplicated
    variables are also allowed when watching.
    """
    if isinstance(watchable, dict):
      self._watching_dict_list.append(watchable.items())
    else:
      self._watching_set.add(watchable)
    self._track_user_pipelines(watchable)

  def watching(self):
    """Analyzes and returns a list of pair lists referring to variable names
    and values from watched scopes.

    Each entry in the list represents the variables defined within a watched
    watchable. Currently, each entry holds a list of pairs. The format might
    change in the future to hold more metadata. Duplicated pairs are allowed.
    And multiple pairs can have the same variable name as the "first" while
    having different variable values as the "second" since variables in
    different scopes can have the same name.
    """
    watching = list(self._watching_dict_list)
    for watchable in self._watching_set:
      if isinstance(watchable, str):
        module = importlib.import_module(watchable)
        watching.append(vars(module).items())
      else:
        watching.append(vars(watchable).items())
    return watching
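
  # A small sketch of the watch/watching contract (illustrative only):
  #
  #   p = beam.Pipeline()
  #   env.watch({'p': p})  # or env.watch(locals()) / env.watch('__main__')
  #   # watching() yields lists of (name, value) pairs; 'p' appears in one:
  #   assert any(name == 'p' and val is p
  #              for pairs in env.watching() for name, val in pairs)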

  def set_cache_manager(self, cache_manager, pipeline):
    """Sets the cache manager held by the current Interactive Environment for
    the given pipeline."""
    if self.get_cache_manager(pipeline) is cache_manager:
      # NOOP if setting to the same cache_manager.
      return
    if self.get_cache_manager(pipeline):
      # Invoke the cleanup routine when a new cache_manager is forcefully set
      # and the current cache_manager is not None.
      self.cleanup(pipeline)
    self._cache_managers[str(id(pipeline))] = cache_manager

  def get_cache_manager(self, pipeline, create_if_absent=False):
    """Gets the cache manager held by the current Interactive Environment for
    the given pipeline. If the pipeline is absent from the environment while
    create_if_absent is True, creates and returns a new file-based cache
    manager for the pipeline."""
    warnings.filterwarnings(
        'ignore',
        'options is deprecated since First stable release. References to '
        '<pipeline>.options will not be supported',
        category=DeprecationWarning)

    cache_manager = self._cache_managers.get(str(id(pipeline)), None)
    pipeline_runner = detect_pipeline_runner(pipeline)
    if not cache_manager and create_if_absent:
      cache_root = self.options.cache_root
      if cache_root:
        if cache_root.startswith('gs://'):
          cache_dir = self._get_gcs_cache_dir(pipeline, cache_root)
        else:
          cache_dir = tempfile.mkdtemp(dir=cache_root)
          if not isinstance(pipeline_runner, direct_runner.DirectRunner):
            _LOGGER.warning(
                'A local cache directory has been specified while '
                'not using DirectRunner. It is recommended to cache into a '
                'GCS bucket instead.')
      else:
        staging_location = pipeline.options.get_all_options(
        )['staging_location']
        if isinstance(pipeline_runner, DataflowRunner) and staging_location:
          cache_dir = self._get_gcs_cache_dir(pipeline, staging_location)
          _LOGGER.info(
              'No cache_root detected. '
              'Defaulting to staging_location %s for cache location.',
              staging_location)
        else:
          cache_dir = tempfile.mkdtemp(
              suffix=str(id(pipeline)),
              prefix='it-',
              dir=os.environ.get('TEST_TMPDIR', None))
      cache_manager = cache.FileBasedCacheManager(cache_dir)
      self._cache_managers[str(id(pipeline))] = cache_manager
    return cache_manager

  def evict_cache_manager(self, pipeline=None):
    """Evicts the cache manager held by the current Interactive Environment
    for the given pipeline. Noop if the pipeline is absent from the
    environment. If no pipeline is specified, evicts for all pipelines."""
    self.cleanup(pipeline)
    if pipeline:
      return self._cache_managers.pop(str(id(pipeline)), None)
    self._cache_managers.clear()
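
  # Cache location resolution, sketched (the bucket name is hypothetical):
  #
  #   from apache_beam.runners.interactive import interactive_beam as ib
  #   ib.options.cache_root = 'gs://my-bucket/cache'  # GCS-backed cache
  #   cm = current_env().get_cache_manager(p, create_if_absent=True)
  #
  # With no cache_root set, a DataflowRunner pipeline falls back to its
  # staging_location; anything else caches under a local temp directory.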

  def set_recording_manager(self, recording_manager, pipeline):
    """Sets the recording manager for the given pipeline."""
    if self.get_recording_manager(pipeline) is recording_manager:
      # NOOP if setting to the same recording_manager.
      return
    self._recording_managers[str(id(pipeline))] = recording_manager

  def get_recording_manager(self, pipeline, create_if_absent=False):
    """Gets the recording manager for the given pipeline."""
    recording_manager = self._recording_managers.get(str(id(pipeline)), None)
    if not recording_manager and create_if_absent:
      # Get the pipeline variable name for the user. This is useful if the
      # user has multiple pipelines.
      pipeline_var = ''
      for w in self.watching():
        for var, val in w:
          if val is pipeline:
            pipeline_var = var
            break
      recording_manager = RecordingManager(pipeline, pipeline_var)
      self._recording_managers[str(id(pipeline))] = recording_manager
    return recording_manager

  def evict_recording_manager(self, pipeline):
    """Evicts the recording manager for the given pipeline.

    This stops the background caching job and clears the cache.
    Noop if the pipeline is absent from the environment. If no
    pipeline is specified, evicts for all pipelines.
    """
    if not pipeline:
      for rm in self._recording_managers.values():
        rm.cancel()
        rm.clear()
      self._recording_managers = {}
      return

    recording_manager = self.get_recording_manager(pipeline)
    if recording_manager:
      recording_manager.cancel()
      recording_manager.clear()
      del self._recording_managers[str(id(pipeline))]

  def describe_all_recordings(self):
    """Returns a description of the recordings for all watched pipelines."""
    return {
        self.pipeline_id_to_pipeline(pid): rm.describe()
        for pid, rm in self._recording_managers.items()
    }
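
  # Illustrative flow for recordings (sketch only):
  #
  #   rm = env.get_recording_manager(p, create_if_absent=True)
  #   env.describe_all_recordings()   # {user_pipeline: rm.describe(), ...}
  #   env.evict_recording_manager(p)  # cancels, clears cache, drops the entry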

  def set_pipeline_result(self, pipeline, result):
    """Sets the pipeline run result. Adds one if absent. Otherwise, replaces
    the existing one."""
    assert issubclass(type(pipeline), beam.Pipeline), (
        'pipeline must be an instance of apache_beam.Pipeline or its subclass')
    assert issubclass(type(result), runner.PipelineResult), (
        'result must be an instance of '
        'apache_beam.runners.runner.PipelineResult or its subclass')
    self._main_pipeline_results[str(id(pipeline))] = result

  def evict_pipeline_result(self, pipeline=None):
    """Evicts the last run result of the given pipeline. Noop if the pipeline
    is absent from the environment. If no pipeline is specified, evicts for
    all pipelines."""
    if pipeline:
      return self._main_pipeline_results.pop(str(id(pipeline)), None)
    self._main_pipeline_results.clear()

  def pipeline_result(self, pipeline):
    """Gets the pipeline run result. None if absent."""
    return self._main_pipeline_results.get(str(id(pipeline)), None)

  def set_background_caching_job(self, pipeline, background_caching_job):
    """Sets the background caching job started from the given pipeline."""
    assert issubclass(type(pipeline), beam.Pipeline), (
        'pipeline must be an instance of apache_beam.Pipeline or its subclass')
    from apache_beam.runners.interactive.background_caching_job import BackgroundCachingJob
    assert isinstance(background_caching_job, BackgroundCachingJob), (
        'background_caching job must be an instance of BackgroundCachingJob')
    self._background_caching_jobs[str(id(pipeline))] = background_caching_job

  def get_background_caching_job(self, pipeline):
    """Gets the background caching job started from the given pipeline."""
    return self._background_caching_jobs.get(str(id(pipeline)), None)

  def evict_background_caching_job(self, pipeline=None):
    """Evicts the background caching job started from the given pipeline.
    Noop if the given pipeline is absent from the environment. If no pipeline
    is specified, evicts for all pipelines."""
    if pipeline:
      return self._background_caching_jobs.pop(str(id(pipeline)), None)
    self._background_caching_jobs.clear()

  def set_test_stream_service_controller(self, pipeline, controller):
    """Sets the test stream service controller that has started a gRPC server
    serving the test stream for any job started from the given user-defined
    pipeline.
    """
    self._test_stream_service_controllers[str(id(pipeline))] = controller

  def get_test_stream_service_controller(self, pipeline):
    """Gets the test stream service controller that has started a gRPC server
    serving the test stream for any job started from the given user-defined
    pipeline.
    """
    return self._test_stream_service_controllers.get(str(id(pipeline)), None)

  def evict_test_stream_service_controller(self, pipeline):
    """Evicts and pops the test stream service controller that has started a
    gRPC server serving the test stream for any job started from the given
    user-defined pipeline. Noop if the given pipeline is absent from the
    environment. If no pipeline is specified, evicts for all pipelines.
    """
    if pipeline:
      return self._test_stream_service_controllers.pop(str(id(pipeline)), None)
    self._test_stream_service_controllers.clear()

  def is_terminated(self, pipeline):
    """Queries whether the most recent job (from executing the given pipeline)
    is in a terminal state. True if absent."""
    result = self.pipeline_result(pipeline)
    if result:
      return runner.PipelineState.is_terminal(result.state)
    return True

  def set_cached_source_signature(self, pipeline, signature):
    self._cached_source_signature[str(id(pipeline))] = signature

  def get_cached_source_signature(self, pipeline):
    return self._cached_source_signature.get(str(id(pipeline)), set())

  def evict_cached_source_signature(self, pipeline=None):
    """Evicts the signature generated for each recorded source of the given
    pipeline. Noop if the given pipeline is absent from the environment. If
    no pipeline is specified, evicts for all pipelines."""
    if pipeline:
      return self._cached_source_signature.pop(str(id(pipeline)), None)
    self._cached_source_signature.clear()
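
  # Result bookkeeping, sketched: results are registered by the runner, and
  # is_terminated() treats a missing result as terminal:
  #
  #   env.pipeline_result(p)  # None if p has never run in this environment
  #   env.is_terminated(p)    # True when absent or in a terminal state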

  def track_user_pipelines(self):
    """Records references to all user-defined pipeline instances watched in
    the current environment.

    The current static global singleton interactive environment holds
    references to a set of pipeline instances defined by the user in the
    watched scope. Interactive Beam features can use the references to
    determine if a given pipeline is defined by the user or implicitly created
    by the Beam SDK or runners, then handle them differently.

    This is invoked every time a PTransform is to be applied if the current
    code execution is under ipython, due to the possibility that any user
    defined pipeline can be re-evaluated through notebook cell re-execution
    at any time.

    Each time this is invoked, it checks if there is a cache manager already
    created for each user-defined pipeline. If not, it creates one.

    If a pipeline is no longer watched due to re-execution while its
    PCollections are still in the watched scope, the pipeline becomes
    anonymous but is still accessible indirectly through references to its
    PCollections. This function also clears up internal states for those
    anonymous pipelines once all their PCollections are anonymous.
    """
    for watching in self.watching():
      for _, val in watching:
        if isinstance(val, beam.pipeline.Pipeline):
          self._tracked_user_pipelines.add_user_pipeline(val)
          _ = self.get_cache_manager(val, create_if_absent=True)
          _ = self.get_recording_manager(val, create_if_absent=True)
    all_tracked_pipeline_ids = set(self._background_caching_jobs.keys()).union(
        set(self._test_stream_service_controllers.keys()),
        set(self._cache_managers.keys()),
        {str(id(pcoll.pipeline))
         for pcoll in self._computed_pcolls},
        set(self._cached_source_signature.keys()),
        set(self._main_pipeline_results.keys()))
    inspectable_pipelines = self._inspector.inspectable_pipelines
    for pipeline in all_tracked_pipeline_ids:
      if pipeline not in inspectable_pipelines:
        self.cleanup(pipeline)

  @property
  def tracked_user_pipelines(self):
    """Returns the user pipelines in this environment."""
    for p in self._tracked_user_pipelines:
      yield p

  def user_pipeline(self, derived_pipeline):
    """Returns the user pipeline for the given derived pipeline."""
    return self._tracked_user_pipelines.get_user_pipeline(derived_pipeline)

  def add_user_pipeline(self, user_pipeline):
    self._tracked_user_pipelines.add_user_pipeline(user_pipeline)

  def add_derived_pipeline(self, user_pipeline, derived_pipeline):
    """Adds the derived pipeline to the parent user pipeline."""
    self._tracked_user_pipelines.add_derived_pipeline(
        user_pipeline, derived_pipeline)

  def evict_tracked_pipelines(self, user_pipeline):
    """Evicts the user pipeline and its derived pipelines."""
    if user_pipeline:
      self._tracked_user_pipelines.evict(user_pipeline)
    else:
      self._tracked_user_pipelines.clear()

  def pipeline_id_to_pipeline(self, pid):
    """Converts a pipeline id to a user pipeline."""

    return self._tracked_user_pipelines.get_pipeline(pid)

  def mark_pcollection_computed(self, pcolls):
    """Marks computation completeness for the given pcolls.

    Interactive Beam can use this information to determine if a computation is
    needed to introspect the data of any given PCollection.
    """
    self._computed_pcolls.update(pcolls)

  def evict_computed_pcollections(self, pipeline=None):
    """Evicts all computed PCollections for the given pipeline. If no pipeline
    is specified, evicts for all pipelines.
    """
    if pipeline:
      discarded = set()
      for pcoll in self._computed_pcolls:
        if pcoll.pipeline is pipeline:
          discarded.add(pcoll)
      self._computed_pcolls -= discarded
    else:
      self._computed_pcolls = set()

  @property
  def computed_pcollections(self):
    return self._computed_pcolls

  def load_jquery_with_datatable(self):
    """Loads common resources to enable jQuery with the datatable plugin
    configured for notebook frontends if necessary. If the resources have been
    loaded, NOOP.

    A window.interactive_beam_jquery with the datatable plugin configured can
    be used in following notebook cells once this is invoked.

    #. There should only be one jQuery imported.
    #. Datatable needs to be imported after jQuery is loaded.
    #. Imported jQuery is attached to window named as jquery[version].
    #. The window attachment needs to happen at the end of the import chain
       until all jQuery plugins are set.
    """
    try:
      from IPython.display import Javascript
      from IPython.display import display_javascript
      display_javascript(
          Javascript(
              _JQUERY_WITH_DATATABLE_TEMPLATE.format(customized_script='')))
    except ImportError:
      pass  # NOOP if dependencies are not available.

  def import_html_to_head(self, html_hrefs):
    """Imports given external HTMLs (supported through webcomponents) into
    the head of the document.

    On load of webcomponentsjs, import the given HTMLs. If HTML import is
    already supported, skip loading webcomponentsjs.

    No matter how many times an HTML import occurs in the document, only the
    first occurrence really embeds the external HTML. In a notebook
    environment, the body of the document is always changing due to cell
    [re-]execution, deletion and re-ordering. Thus, HTML imports shouldn't be
    put in the body, especially the output areas of notebook cells.
    """
    try:
      from IPython.display import Javascript
      from IPython.display import display_javascript
      display_javascript(
          Javascript(_HTML_IMPORT_TEMPLATE.format(hrefs=html_hrefs)))
    except ImportError:
      pass  # NOOP if dependencies are not available.

  def get_sql_chain(self, pipeline, set_user_pipeline=False):
    if pipeline not in self.sql_chain:
      self.sql_chain[pipeline] = SqlChain()
    chain = self.sql_chain[pipeline]
    if set_user_pipeline:
      if chain.user_pipeline and chain.user_pipeline is not pipeline:
        raise ValueError(
            'The beam_sql magic tries to query PCollections from multiple '
            'pipelines: %s and %s' % (chain.user_pipeline, pipeline))
      chain.user_pipeline = pipeline
    return chain

  def _get_gcs_cache_dir(self, pipeline, cache_dir):
    cache_dir_path = PurePath(cache_dir)
    if len(cache_dir_path.parts) < 2:
      _LOGGER.error(
          'GCS bucket cache path "%s" is too short to be valid. See '
          'https://cloud.google.com/storage/docs/naming-buckets for '
          'the expected format.',
          cache_dir)
      raise ValueError('cache_root GCS bucket path is invalid.')
    bucket_name = cache_dir_path.parts[1]
    assert_bucket_exists(bucket_name)
    return 'gs://{}/{}'.format('/'.join(cache_dir_path.parts[1:]), id(pipeline))
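
  # Path arithmetic used above, sketched (POSIX semantics assumed, bucket
  # name hypothetical):
  #
  #   PurePath('gs://my-bucket/dir').parts == ('gs:', 'my-bucket', 'dir')
  #
  # so parts[1] is the bucket name and the returned cache directory is
  # 'gs://my-bucket/dir/<id(pipeline)>'.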