# Origin: github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/interactive/interactive_beam_test.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Tests for apache_beam.runners.interactive.interactive_beam."""
# pytype: skip-file

import dataclasses
import importlib
import sys
import time
import unittest
from typing import NamedTuple
from unittest.mock import patch

import apache_beam as beam
from apache_beam import dataframe as frames
from apache_beam.options.pipeline_options import FlinkRunnerOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.runners.interactive import interactive_beam as ib
from apache_beam.runners.interactive import interactive_environment as ie
from apache_beam.runners.interactive import interactive_runner as ir
from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import DataprocClusterManager
from apache_beam.runners.interactive.dataproc.types import ClusterMetadata
from apache_beam.runners.interactive.options.capture_limiters import Limiter
from apache_beam.runners.interactive.testing.mock_env import isolated_env
from apache_beam.runners.runner import PipelineState
from apache_beam.testing.test_stream import TestStream


@dataclasses.dataclass
class MockClusterMetadata:
  """Minimal stand-in for a cluster metadata object with a fixed master URL."""
  # Annotated so that the dataclass machinery registers it as a real field
  # (an un-annotated assignment is only a plain class attribute).
  master_url: str = 'mock_url'


class Record(NamedTuple):
  """Simple schema'd row used to build a deferred dataframe in tests."""
  order_id: int
  product_id: int
  quantity: int


# The module name is also a variable in module.
_module_name = 'apache_beam.runners.interactive.interactive_beam_test'


def _get_watched_pcollections_with_variable_names():
  """Returns a dict mapping each watched PCollection to its variable name.

  Iterates every watched scope of the current interactive environment and
  keeps only the (name, value) pairs whose value is a PCollection.
  """
  watched_pcollections = {}
  for watching in ie.current_env().watching():
    for key, val in watching:
      if hasattr(val, '__class__') and isinstance(val, beam.pvalue.PCollection):
        watched_pcollections[val] = key
  return watched_pcollections


@isolated_env
class InteractiveBeamTest(unittest.TestCase):
  """Tests for the watch, show and recordings APIs of interactive_beam."""
  def setUp(self):
    self._var_in_class_instance = 'a var in class instance, not directly used'

  def tearDown(self):
    # Drop any limiter a test installed so it cannot leak into the next test.
    ib.options.capture_control.set_limiters_for_test([])

  def test_watch_main_by_default(self):
    test_env = ie.InteractiveEnvironment()
    # Current Interactive Beam env fetched and the test env are 2 instances.
    self.assertIsNot(ie.current_env(), test_env)
    self.assertEqual(ie.current_env().watching(), test_env.watching())

  def test_watch_a_module_by_name(self):
    test_env = ie.InteractiveEnvironment()
    ib.watch(_module_name)
    test_env.watch(_module_name)
    self.assertEqual(ie.current_env().watching(), test_env.watching())

  def test_watch_a_module_by_module_object(self):
    test_env = ie.InteractiveEnvironment()
    module = importlib.import_module(_module_name)
    ib.watch(module)
    test_env.watch(module)
    self.assertEqual(ie.current_env().watching(), test_env.watching())

  def test_watch_locals(self):
    # test_env serves as local var too.
    test_env = ie.InteractiveEnvironment()
    ib.watch(locals())
    test_env.watch(locals())
    self.assertEqual(ie.current_env().watching(), test_env.watching())

  def test_watch_class_instance(self):
    test_env = ie.InteractiveEnvironment()
    ib.watch(self)
    test_env.watch(self)
    self.assertEqual(ie.current_env().watching(), test_env.watching())

  @unittest.skipIf(sys.platform == "win32", "[BEAM-10627]")
  def test_show_always_watch_given_pcolls(self):
    p = beam.Pipeline(ir.InteractiveRunner())
    # pylint: disable=bad-option-value
    pcoll = p | 'Create' >> beam.Create(range(10))
    # The pcoll is not watched since watch(locals()) is not explicitly called.
    self.assertNotIn(pcoll, _get_watched_pcollections_with_variable_names())
    # The call of show watches pcoll.
    ib.watch({'p': p})
    ie.current_env().track_user_pipelines()
    ib.show(pcoll)
    self.assertIn(pcoll, _get_watched_pcollections_with_variable_names())

  @unittest.skipIf(sys.platform == "win32", "[BEAM-10627]")
  def test_show_mark_pcolls_computed_when_done(self):
    p = beam.Pipeline(ir.InteractiveRunner())
    # pylint: disable=bad-option-value
    pcoll = p | 'Create' >> beam.Create(range(10))
    self.assertNotIn(pcoll, ie.current_env().computed_pcollections)
    # The call of show marks pcoll computed.
    ib.watch(locals())
    ie.current_env().track_user_pipelines()
    ib.show(pcoll)
    self.assertIn(pcoll, ie.current_env().computed_pcollections)

  @patch((
      'apache_beam.runners.interactive.interactive_beam.'
      'visualize_computed_pcoll'))
  def test_show_handles_dict_of_pcolls(self, mocked_visualize):
    p = beam.Pipeline(ir.InteractiveRunner())
    # pylint: disable=bad-option-value
    pcoll = p | 'Create' >> beam.Create(range(10))
    ib.watch(locals())
    ie.current_env().track_user_pipelines()
    ie.current_env().mark_pcollection_computed([pcoll])
    ie.current_env()._is_in_ipython = True
    ie.current_env()._is_in_notebook = True
    ib.show({'pcoll': pcoll})
    mocked_visualize.assert_called_once()

  @patch((
      'apache_beam.runners.interactive.interactive_beam.'
      'visualize_computed_pcoll'))
  def test_show_handles_iterable_of_pcolls(self, mocked_visualize):
    p = beam.Pipeline(ir.InteractiveRunner())
    # pylint: disable=bad-option-value
    pcoll = p | 'Create' >> beam.Create(range(10))
    ib.watch(locals())
    ie.current_env().track_user_pipelines()
    ie.current_env().mark_pcollection_computed([pcoll])
    ie.current_env()._is_in_ipython = True
    ie.current_env()._is_in_notebook = True
    ib.show([pcoll])
    mocked_visualize.assert_called_once()

  @patch('apache_beam.runners.interactive.interactive_beam.visualize')
  def test_show_handles_deferred_dataframes(self, mocked_visualize):
    p = beam.Pipeline(ir.InteractiveRunner())

    deferred = frames.convert.to_dataframe(p | beam.Create([Record(0, 0, 0)]))

    ib.watch(locals())
    ie.current_env().track_user_pipelines()
    ie.current_env()._is_in_ipython = True
    ie.current_env()._is_in_notebook = True
    ib.show(deferred)
    mocked_visualize.assert_called_once()

  @patch((
      'apache_beam.runners.interactive.interactive_beam.'
      'visualize_computed_pcoll'))
  def test_show_noop_when_pcoll_container_is_invalid(self, mocked_visualize):
    class SomeRandomClass:
      def __init__(self, pcoll):
        self._pcoll = pcoll

    p = beam.Pipeline(ir.InteractiveRunner())
    # pylint: disable=bad-option-value
    pcoll = p | 'Create' >> beam.Create(range(10))
    ie.current_env().mark_pcollection_computed([pcoll])
    ie.current_env()._is_in_ipython = True
    ie.current_env()._is_in_notebook = True
    self.assertRaises(ValueError, ib.show, SomeRandomClass(pcoll))
    mocked_visualize.assert_not_called()

  def test_recordings_describe(self):
    """Tests that getting the description works."""

    # Create the pipelines to test.
    p1 = beam.Pipeline(ir.InteractiveRunner())
    p2 = beam.Pipeline(ir.InteractiveRunner())

    ib.watch(locals())

    # Get the descriptions. This test is simple as there isn't much logic in the
    # method.
    self.assertEqual(ib.recordings.describe(p1)['size'], 0)
    self.assertEqual(ib.recordings.describe(p2)['size'], 0)

    all_descriptions = ib.recordings.describe()
    self.assertEqual(all_descriptions[p1]['size'], 0)
    self.assertEqual(all_descriptions[p2]['size'], 0)

    # Ensure that the variable name for the pipeline is set correctly.
    self.assertEqual(all_descriptions[p1]['pipeline_var'], 'p1')
    self.assertEqual(all_descriptions[p2]['pipeline_var'], 'p2')

  def test_recordings_clear(self):
    """Tests that clearing the pipeline is correctly forwarded."""

    # Create a basic pipeline to store something in the cache.
    p = beam.Pipeline(ir.InteractiveRunner())
    elem = p | beam.Create([1])
    ib.watch(locals())
    ie.current_env().track_user_pipelines()

    # This records the pipeline so that the cache size is > 0.
    ib.collect(elem)
    self.assertGreater(ib.recordings.describe(p)['size'], 0)

    # After clearing, the cache should be empty.
    ib.recordings.clear(p)
    self.assertEqual(ib.recordings.describe(p)['size'], 0)

  def test_recordings_record(self):
    """Tests that recording pipeline succeeds."""

    # Add the TestStream so that it can be cached.
    ib.options.recordable_sources.add(TestStream)

    # Create a pipeline with an arbitrary amount of elements.
    p = beam.Pipeline(
        ir.InteractiveRunner(), options=PipelineOptions(streaming=True))
    # pylint: disable=unused-variable
    _ = (p
         | TestStream()
             .advance_watermark_to(0)
             .advance_processing_time(1)
             .add_elements(list(range(10)))
             .advance_processing_time(1))  # yapf: disable
    ib.watch(locals())
    ie.current_env().track_user_pipelines()

    # Assert that the pipeline starts in a good state.
    self.assertEqual(ib.recordings.describe(p)['state'], PipelineState.STOPPED)
    self.assertEqual(ib.recordings.describe(p)['size'], 0)

    # Create a limiter that stops the background caching job when something is
    # written to cache. This is used to ensure that the pipeline is
    # functioning properly and that there are no data races with the test.
    class SizeLimiter(Limiter):
      def __init__(self, pipeline):
        self.pipeline = pipeline
        # Kept False until the test has finished its "RUNNING" assertions so
        # the limiter cannot fire before them.
        self.should_trigger = False

      def is_triggered(self):
        return (
            ib.recordings.describe(self.pipeline)['size'] > 0 and
            self.should_trigger)

    limiter = SizeLimiter(p)
    ib.options.capture_control.set_limiters_for_test([limiter])

    # Assert that a recording can be started only once.
    self.assertTrue(ib.recordings.record(p))
    self.assertFalse(ib.recordings.record(p))
    self.assertEqual(ib.recordings.describe(p)['state'], PipelineState.RUNNING)

    # Wait for the pipeline to start and write something to cache.
    limiter.should_trigger = True
    for _ in range(60):
      if limiter.is_triggered():
        break
      time.sleep(1)
    self.assertTrue(
        limiter.is_triggered(),
        'Test timed out waiting for limiter to be triggered. This indicates '
        'that the BackgroundCachingJob did not cache anything.')

    # Assert that a recording can be stopped and can't be started again until
    # after the cache is cleared.
    ib.recordings.stop(p)
    self.assertEqual(ib.recordings.describe(p)['state'], PipelineState.STOPPED)
    self.assertFalse(ib.recordings.record(p))
    ib.recordings.clear(p)
    self.assertTrue(ib.recordings.record(p))
    ib.recordings.stop(p)


@unittest.skipIf(
    not ie.current_env().is_interactive_ready,
    '[interactive] dependency is not installed.')
@isolated_env
class InteractiveBeamClustersTest(unittest.TestCase):
  """Tests for the clusters API: metadata lookup, create, cleanup, describe.

  NOTE(review): `self.current_env` and `self.m_delete_cluster` are not defined
  in this file; presumably they are provided by the `isolated_env` decorator.
  Confirm against `mock_env` before relying on them in new tests.
  """
  def setUp(self):
    self.current_env.options.cache_root = 'gs://fake'
    self.clusters = self.current_env.clusters

  def tearDown(self):
    self.current_env.options.cache_root = None

  def test_cluster_metadata_pass_through_metadata(self):
    cid = ClusterMetadata(project_id='test-project')
    meta = self.clusters.cluster_metadata(cid)
    self.assertIs(meta, cid)

  def test_cluster_metadata_identifies_pipeline(self):
    cid = beam.Pipeline()
    known_meta = ClusterMetadata(project_id='test-project')
    dcm = DataprocClusterManager(known_meta)
    self.clusters.pipelines[cid] = dcm

    meta = self.clusters.cluster_metadata(cid)
    self.assertIs(meta, known_meta)

  def test_cluster_metadata_identifies_master_url(self):
    cid = 'test-url'
    known_meta = ClusterMetadata(project_id='test-project')
    _ = DataprocClusterManager(known_meta)
    self.clusters.master_urls[cid] = known_meta

    meta = self.clusters.cluster_metadata(cid)
    self.assertIs(meta, known_meta)

  def test_cluster_metadata_default_value(self):
    cid_none = None
    cid_unknown_p = beam.Pipeline()
    cid_unknown_master_url = 'test-url'
    default_meta = ClusterMetadata(project_id='test-project')
    self.clusters.set_default_cluster(default_meta)

    self.assertIs(default_meta, self.clusters.cluster_metadata(cid_none))
    self.assertIs(default_meta, self.clusters.cluster_metadata(cid_unknown_p))
    self.assertIs(
        default_meta, self.clusters.cluster_metadata(cid_unknown_master_url))

  def test_create_a_new_cluster(self):
    meta = ClusterMetadata(project_id='test-project')
    _ = self.clusters.create(meta)

    # Derived fields are populated.
    self.assertTrue(meta.master_url.startswith('test-url'))
    self.assertEqual(meta.dashboard, 'test-dashboard')
    # The cluster is known.
    self.assertIn(meta, self.clusters.dataproc_cluster_managers)
    self.assertIn(meta.master_url, self.clusters.master_urls)
    # The default cluster is updated to the created cluster.
    self.assertIs(meta, self.clusters.default_cluster_metadata)

  def test_create_but_reuse_a_known_cluster(self):
    known_meta = ClusterMetadata(
        project_id='test-project', region='test-region')
    known_dcm = DataprocClusterManager(known_meta)
    known_meta.master_url = 'test-url'
    self.clusters.set_default_cluster(known_meta)
    self.clusters.dataproc_cluster_managers[known_meta] = known_dcm
    self.clusters.master_urls[known_meta.master_url] = known_meta

    # Use an equivalent meta as the identifier to create a cluster.
    cid_meta = ClusterMetadata(
        project_id=known_meta.project_id,
        region=known_meta.region,
        cluster_name=known_meta.cluster_name)
    dcm = self.clusters.create(cid_meta)
    # The known cluster manager is returned.
    self.assertIs(dcm, known_dcm)

    # Then use an equivalent master_url as the identifier.
    cid_master_url = known_meta.master_url
    dcm = self.clusters.create(cid_master_url)
    self.assertIs(dcm, known_dcm)

  def test_cleanup_by_a_pipeline(self):
    meta = ClusterMetadata(project_id='test-project')
    dcm = self.clusters.create(meta)

    # Set up the association between a pipeline and a cluster.
    # In real code, it's set by the runner the 1st time a pipeline is executed.
    options = PipelineOptions()
    options.view_as(FlinkRunnerOptions).flink_master = meta.master_url
    p = beam.Pipeline(options=options)
    self.clusters.pipelines[p] = dcm
    dcm.pipelines.add(p)

    self.clusters.cleanup(p)
    # Delete the cluster.
    self.m_delete_cluster.assert_called_once()
    # Pipeline association is cleaned up.
    self.assertNotIn(p, self.clusters.pipelines)
    self.assertNotIn(p, dcm.pipelines)
    self.assertEqual(options.view_as(FlinkRunnerOptions).flink_master, '[auto]')
    # The cluster is unknown now.
    self.assertNotIn(meta, self.clusters.dataproc_cluster_managers)
    self.assertNotIn(meta.master_url, self.clusters.master_urls)
    # The cleaned up cluster is also the default cluster. Clean the default.
    self.assertIsNone(self.clusters.default_cluster_metadata)

  def test_not_cleanup_if_multiple_pipelines_share_a_manager(self):
    meta = ClusterMetadata(project_id='test-project')
    dcm = self.clusters.create(meta)

    options = PipelineOptions()
    options.view_as(FlinkRunnerOptions).flink_master = meta.master_url
    options2 = PipelineOptions()
    options2.view_as(FlinkRunnerOptions).flink_master = meta.master_url
    p = beam.Pipeline(options=options)
    p2 = beam.Pipeline(options=options2)
    self.clusters.pipelines[p] = dcm
    self.clusters.pipelines[p2] = dcm
    dcm.pipelines.add(p)
    dcm.pipelines.add(p2)

    self.clusters.cleanup(p)
    # No cluster deleted.
    self.m_delete_cluster.assert_not_called()
    # Pipeline association of p is cleaned up.
    self.assertNotIn(p, self.clusters.pipelines)
    self.assertNotIn(p, dcm.pipelines)
    self.assertEqual(options.view_as(FlinkRunnerOptions).flink_master, '[auto]')
    # Pipeline association of p2 is still present.
    self.assertIn(p2, self.clusters.pipelines)
    self.assertIn(p2, dcm.pipelines)
    self.assertEqual(
        options2.view_as(FlinkRunnerOptions).flink_master, meta.master_url)
    # The cluster is still known.
    self.assertIn(meta, self.clusters.dataproc_cluster_managers)
    self.assertIn(meta.master_url, self.clusters.master_urls)
    # The default cluster is still present.
    self.assertIs(meta, self.clusters.default_cluster_metadata)

  def test_cleanup_by_a_master_url(self):
    meta = ClusterMetadata(project_id='test-project')
    _ = self.clusters.create(meta)

    self.clusters.cleanup(meta.master_url)
    self.m_delete_cluster.assert_called_once()
    self.assertNotIn(meta, self.clusters.dataproc_cluster_managers)
    self.assertNotIn(meta.master_url, self.clusters.master_urls)
    self.assertIsNone(self.clusters.default_cluster_metadata)

  def test_cleanup_by_meta(self):
    known_meta = ClusterMetadata(
        project_id='test-project', region='test-region')
    _ = self.clusters.create(known_meta)

    meta = ClusterMetadata(
        project_id=known_meta.project_id,
        region=known_meta.region,
        cluster_name=known_meta.cluster_name)
    self.clusters.cleanup(meta)
    self.m_delete_cluster.assert_called_once()
    self.assertNotIn(known_meta, self.clusters.dataproc_cluster_managers)
    self.assertNotIn(known_meta.master_url, self.clusters.master_urls)
    self.assertIsNone(self.clusters.default_cluster_metadata)

  def test_force_cleanup_everything(self):
    meta = ClusterMetadata(project_id='test-project')
    meta2 = ClusterMetadata(project_id='test-project-2')
    _ = self.clusters.create(meta)
    _ = self.clusters.create(meta2)

    self.clusters.cleanup(force=True)
    self.assertEqual(self.m_delete_cluster.call_count, 2)
    self.assertNotIn(meta, self.clusters.dataproc_cluster_managers)
    self.assertNotIn(meta2, self.clusters.dataproc_cluster_managers)
    self.assertIsNone(self.clusters.default_cluster_metadata)

  def test_cleanup_noop_for_no_cluster_identifier(self):
    meta = ClusterMetadata(project_id='test-project')
    _ = self.clusters.create(meta)

    self.clusters.cleanup()
    self.m_delete_cluster.assert_not_called()

  def test_cleanup_noop_unknown_cluster(self):
    meta = ClusterMetadata(project_id='test-project')
    dcm = self.clusters.create(meta)
    p = beam.Pipeline()
    self.clusters.pipelines[p] = dcm
    dcm.pipelines.add(p)

    cid_pipeline = beam.Pipeline()
    self.clusters.cleanup(cid_pipeline)
    self.m_delete_cluster.assert_not_called()

    cid_master_url = 'some-random-url'
    self.clusters.cleanup(cid_master_url)
    self.m_delete_cluster.assert_not_called()

    cid_meta = ClusterMetadata(project_id='random-project')
    self.clusters.cleanup(cid_meta)
    self.m_delete_cluster.assert_not_called()

    self.assertIn(meta, self.clusters.dataproc_cluster_managers)
    self.assertIn(meta.master_url, self.clusters.master_urls)
    self.assertIs(meta, self.clusters.default_cluster_metadata)
    self.assertIn(p, self.clusters.pipelines)
    self.assertIn(p, dcm.pipelines)

  def test_describe_everything(self):
    meta = ClusterMetadata(project_id='test-project')
    meta2 = ClusterMetadata(
        project_id='test-project', region='some-other-region')
    _ = self.clusters.create(meta)
    _ = self.clusters.create(meta2)

    meta_list = self.clusters.describe()
    self.assertEqual([meta, meta2], meta_list)

  def test_describe_by_cluster_identifier(self):
    known_meta = ClusterMetadata(project_id='test-project')
    known_meta2 = ClusterMetadata(
        project_id='test-project', region='some-other-region')
    dcm = self.clusters.create(known_meta)
    dcm2 = self.clusters.create(known_meta2)
    p = beam.Pipeline()
    p2 = beam.Pipeline()
    self.clusters.pipelines[p] = dcm
    dcm.pipelines.add(p)
    # p2 is registered with dcm2 on both sides of the association.
    self.clusters.pipelines[p2] = dcm2
    dcm2.pipelines.add(p2)

    cid_pipeline = p
    meta = self.clusters.describe(cid_pipeline)
    self.assertIs(meta, known_meta)

    cid_master_url = known_meta.master_url
    meta = self.clusters.describe(cid_master_url)
    self.assertIs(meta, known_meta)

    cid_meta = ClusterMetadata(
        project_id=known_meta.project_id,
        region=known_meta.region,
        cluster_name=known_meta.cluster_name)
    meta = self.clusters.describe(cid_meta)
    self.assertIs(meta, known_meta)

  def test_describe_everything_when_cluster_identifer_unknown(self):
    known_meta = ClusterMetadata(project_id='test-project')
    known_meta2 = ClusterMetadata(
        project_id='test-project', region='some-other-region')
    dcm = self.clusters.create(known_meta)
    dcm2 = self.clusters.create(known_meta2)
    p = beam.Pipeline()
    p2 = beam.Pipeline()
    self.clusters.pipelines[p] = dcm
    dcm.pipelines.add(p)
    # p2 is registered with dcm2 on both sides of the association.
    self.clusters.pipelines[p2] = dcm2
    dcm2.pipelines.add(p2)

    cid_pipeline = beam.Pipeline()
    meta_list = self.clusters.describe(cid_pipeline)
    self.assertEqual([known_meta, known_meta2], meta_list)

    cid_master_url = 'some-random-url'
    meta_list = self.clusters.describe(cid_master_url)
    self.assertEqual([known_meta, known_meta2], meta_list)

    cid_meta = ClusterMetadata(project_id='some-random-project')
    meta_list = self.clusters.describe(cid_meta)
    self.assertEqual([known_meta, known_meta2], meta_list)

  def test_default_value_for_invalid_worker_number(self):
    meta = ClusterMetadata(project_id='test-project', num_workers=1)
    self.clusters.create(meta)

    self.assertEqual(meta.num_workers, 2)


if __name__ == '__main__':
  unittest.main()