#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Tests for apache_beam.runners.interactive.background_caching_job."""
# pytype: skip-file

import unittest
from unittest.mock import patch

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.pipeline import PipelineVisitor
from apache_beam.runners import runner
from apache_beam.runners.interactive import background_caching_job as bcj
from apache_beam.runners.interactive import interactive_beam as ib
from apache_beam.runners.interactive import interactive_environment as ie
from apache_beam.runners.interactive import interactive_runner
from apache_beam.runners.interactive.caching.streaming_cache import StreamingCache
from apache_beam.runners.interactive.testing.mock_ipython import mock_get_ipython
from apache_beam.runners.interactive.testing.test_cache_manager import FileRecordsBuilder
from apache_beam.testing.test_stream import TestStream
from apache_beam.testing.test_stream_service import TestStreamServiceController
from apache_beam.transforms.window import TimestampedValue

# Pub/Sub subscription paths used as two distinct unbounded sources in tests.
_FOO_PUBSUB_SUB = 'projects/test-project/subscriptions/foo'
_BAR_PUBSUB_SUB = 'projects/test-project/subscriptions/bar'
# Key (and record tag) under which the test streaming cache is written.
_TEST_CACHE_KEY = 'test'


def _build_a_test_stream_pipeline():
  """Returns an InteractiveRunner pipeline that reads from a TestStream.

  The TestStream advances the watermark to 0, adds the single element 'a'
  with timestamp 1, advances processing time by 5 and finally advances the
  watermark to infinity. All locals of this function (including the pipeline
  and the resulting PCollection) are passed to ib.watch().
  """
  test_stream = (
      TestStream().advance_watermark_to(0).add_elements([
          TimestampedValue('a', 1)
      ]).advance_processing_time(5).advance_watermark_to_infinity())
  p = beam.Pipeline(runner=interactive_runner.InteractiveRunner())
  events = p | test_stream  # pylint: disable=possibly-unused-variable
  # NOTE: watch() receives locals(), so the local variable names above are
  # part of what gets watched; do not rename them casually.
  ib.watch(locals())
  return p


def _build_an_empty_stream_pipeline():
  """Returns a pipeline with no transforms applied, built with streaming=True.

  The pipeline uses an InteractiveRunner and is watched under the name
  'pipeline'.
  """
  pipeline_options = PipelineOptions(streaming=True)
  p = beam.Pipeline(
      interactive_runner.InteractiveRunner(), options=pipeline_options)
  ib.watch({'pipeline': p})
  return p


def _setup_test_streaming_cache(pipeline):
  """Installs a StreamingCache for `pipeline` and writes test records to it.

  The records are built with FileRecordsBuilder (tag _TEST_CACHE_KEY) and
  written to the cache under _TEST_CACHE_KEY.

  Args:
    pipeline: the pipeline whose cache manager is set in the current
      interactive environment.
  """
  cache_manager = StreamingCache(cache_dir=None)
  ie.current_env().set_cache_manager(cache_manager, pipeline)
  builder = FileRecordsBuilder(tag=_TEST_CACHE_KEY)
  (builder
      .advance_watermark(watermark_secs=0)
      .advance_processing_time(5)
      .add_element(element='a', event_time_secs=1)
      .advance_watermark(watermark_secs=100)
      .advance_processing_time(10))  # yapf: disable
  cache_manager.write(builder.build(), _TEST_CACHE_KEY)


@unittest.skipIf(
    not ie.current_env().is_interactive_ready,
    '[interactive] dependency is not installed.')
class BackgroundCachingJobTest(unittest.TestCase):
  """Tests for the helpers in background_caching_job (imported as bcj).

  The whole class is skipped when the current interactive environment reports
  that it is not interactive-ready.
  """
  def tearDown(self):
    # Calls ie.new_env() after every test; presumably this resets interactive
    # state shared between tests -- confirm against interactive_environment.
    ie.new_env()

  # TODO(BEAM-8335): remove the patches when there are appropriate test sources
  # that meet the boundedness checks.
  @patch(
      'apache_beam.runners.interactive.background_caching_job'
      '.has_source_to_cache',
      lambda x: True)
  # Disable the clean up so that we can keep the test streaming cache.
  @patch(
      'apache_beam.runners.interactive.interactive_environment'
      '.InteractiveEnvironment.cleanup',
      lambda x,
      y: None)
  def test_background_caching_job_starts_when_none_such_job_exists(self):

    # Create a fake PipelineResult and PipelineRunner. This is because we want
    # to test whether the BackgroundCachingJob can be started without having to
    # rely on a real pipeline run.
    class FakePipelineResult(beam.runners.runner.PipelineResult):
      def wait_until_finish(self):
        return

    class FakePipelineRunner(beam.runners.PipelineRunner):
      def run_pipeline(self, pipeline, options):
        return FakePipelineResult(beam.runners.runner.PipelineState.RUNNING)

    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(FakePipelineRunner()),
        options=PipelineOptions(streaming=True))

    # pylint: disable=possibly-unused-variable
    elems = p | 'Read' >> beam.io.ReadFromPubSub(subscription=_FOO_PUBSUB_SUB)

    ib.watch(locals())

    _setup_test_streaming_cache(p)
    p.run()
    self.assertIsNotNone(ie.current_env().get_background_caching_job(p))
    expected_cached_source_signature = bcj.extract_source_to_cache_signature(p)
    # This is to check whether the cached source signature is set correctly
    # when the background caching job is started.
    self.assertEqual(
        expected_cached_source_signature,
        ie.current_env().get_cached_source_signature(p))

  @patch(
      'apache_beam.runners.interactive.background_caching_job'
      '.has_source_to_cache',
      lambda x: False)
  def test_background_caching_job_not_start_for_batch_pipeline(self):
    # has_source_to_cache is patched to return False, so after running the
    # pipeline no background caching job should be registered for it.
    p = beam.Pipeline()

    # pylint: disable=expression-not-assigned
    p | beam.Create([])
    p.run()
    self.assertIsNone(ie.current_env().get_background_caching_job(p))

  @patch(
      'apache_beam.runners.interactive.background_caching_job'
      '.has_source_to_cache',
      lambda x: True)
  # Disable the clean up so that we can keep the test streaming cache.
  @patch(
      'apache_beam.runners.interactive.interactive_environment'
      '.InteractiveEnvironment.cleanup',
      lambda x,
      y: None)
  def test_background_caching_job_not_start_when_such_job_exists(self):
    p = _build_a_test_stream_pipeline()
    _setup_test_streaming_cache(p)
    # Pre-register a job whose pipeline result is in state RUNNING.
    a_running_background_caching_job = bcj.BackgroundCachingJob(
        runner.PipelineResult(runner.PipelineState.RUNNING), limiters=[])
    ie.current_env().set_background_caching_job(
        p, a_running_background_caching_job)
    main_job_result = p.run()
    # No background caching job is started so result is still the running one.
    self.assertIs(
        a_running_background_caching_job,
        ie.current_env().get_background_caching_job(p))
    # A new main job is started so result of the main job is set.
    self.assertIs(main_job_result, ie.current_env().pipeline_result(p))

  @patch(
      'apache_beam.runners.interactive.background_caching_job'
      '.has_source_to_cache',
      lambda x: True)
  # Disable the clean up so that we can keep the test streaming cache.
  @patch(
      'apache_beam.runners.interactive.interactive_environment'
      '.InteractiveEnvironment.cleanup',
      lambda x,
      y: None)
  def test_background_caching_job_not_start_when_such_job_is_done(self):
    p = _build_a_test_stream_pipeline()
    _setup_test_streaming_cache(p)
    # Pre-register a job whose pipeline result is in state DONE.
    a_done_background_caching_job = bcj.BackgroundCachingJob(
        runner.PipelineResult(runner.PipelineState.DONE), limiters=[])
    ie.current_env().set_background_caching_job(
        p, a_done_background_caching_job)
    main_job_result = p.run()
    # No background caching job is started so result is still the done one.
    self.assertIs(
        a_done_background_caching_job,
        ie.current_env().get_background_caching_job(p))
    # A new main job is started so result of the main job is set.
    self.assertIs(main_job_result, ie.current_env().pipeline_result(p))

  @patch('IPython.get_ipython', new_callable=mock_get_ipython)
  def test_source_to_cache_changed_when_pipeline_is_first_time_seen(self, cell):
    # No cached source signature is set for the pipeline before the check.
    with cell:  # Cell 1
      pipeline = _build_an_empty_stream_pipeline()

    with cell:  # Cell 2
      read_foo = pipeline | 'Read' >> beam.io.ReadFromPubSub(
          subscription=_FOO_PUBSUB_SUB)
      ib.watch({'read_foo': read_foo})

    self.assertTrue(bcj.is_source_to_cache_changed(pipeline))

  @patch('IPython.get_ipython', new_callable=mock_get_ipython)
  def test_source_to_cache_changed_when_new_source_is_added(self, cell):
    with cell:  # Cell 1
      pipeline = _build_an_empty_stream_pipeline()
      read_foo = pipeline | 'Read' >> beam.io.ReadFromPubSub(
          subscription=_FOO_PUBSUB_SUB)
      ib.watch({'read_foo': read_foo})

      # Sets the signature for current pipeline state.
      ie.current_env().set_cached_source_signature(
          pipeline, bcj.extract_source_to_cache_signature(pipeline))

    # The cache is looked up by the stringified id() of the pipeline.
    self.assertFalse(bcj.is_cache_complete(str(id(pipeline))))

    with cell:  # Cell 2
      read_bar = pipeline | 'Read' >> beam.io.ReadFromPubSub(
          subscription=_BAR_PUBSUB_SUB)
      ib.watch({'read_bar': read_bar})

    self.assertTrue(bcj.is_cache_complete(str(id(pipeline))))
    self.assertTrue(bcj.is_source_to_cache_changed(pipeline))

  @patch('IPython.get_ipython', new_callable=mock_get_ipython)
  def test_source_to_cache_changed_when_source_is_altered(self, cell):
    with cell:  # Cell 1
      pipeline = _build_an_empty_stream_pipeline()
      transform = beam.io.ReadFromPubSub(subscription=_FOO_PUBSUB_SUB)
      read_foo = pipeline | 'Read' >> transform
      ib.watch({'read_foo': read_foo})

      # Sets the signature for current pipeline state.
      ie.current_env().set_cached_source_signature(
          pipeline, bcj.extract_source_to_cache_signature(pipeline))

    with cell:  # Cell 2
      from apache_beam.io.gcp.pubsub import _PubSubSource
      # Alter the transform.
      transform._source = _PubSubSource(subscription=_BAR_PUBSUB_SUB)

    self.assertTrue(bcj.is_source_to_cache_changed(pipeline))

  @patch('IPython.get_ipython', new_callable=mock_get_ipython)
  def test_source_to_cache_not_changed_for_same_source(self, cell):
    with cell:  # Cell 1
      pipeline = _build_an_empty_stream_pipeline()
      transform = beam.io.ReadFromPubSub(subscription=_FOO_PUBSUB_SUB)

    with cell:  # Cell 2
      read_foo_1 = pipeline | 'Read' >> transform
      ib.watch({'read_foo_1': read_foo_1})

      # Sets the signature for current pipeline state.
      ie.current_env().set_cached_source_signature(
          pipeline, bcj.extract_source_to_cache_signature(pipeline))

    with cell:  # Cell 3
      # Apply exactly the same transform and the same instance.
      read_foo_2 = pipeline | 'Read' >> transform
      ib.watch({'read_foo_2': read_foo_2})

    self.assertFalse(bcj.is_source_to_cache_changed(pipeline))

    with cell:  # Cell 4
      # Apply the same transform but represented in a different instance.
      # The signature representing the urn and payload is still the same, so it
      # is not treated as a new unbounded source.
      read_foo_3 = pipeline | 'Read' >> beam.io.ReadFromPubSub(
          subscription=_FOO_PUBSUB_SUB)
      ib.watch({'read_foo_3': read_foo_3})

    self.assertFalse(bcj.is_source_to_cache_changed(pipeline))

  @patch('IPython.get_ipython', new_callable=mock_get_ipython)
  def test_source_to_cache_not_changed_when_source_is_removed(self, cell):
    with cell:  # Cell 1
      pipeline = _build_an_empty_stream_pipeline()
      foo_transform = beam.io.ReadFromPubSub(subscription=_FOO_PUBSUB_SUB)
      bar_transform = beam.io.ReadFromPubSub(subscription=_BAR_PUBSUB_SUB)

    with cell:  # Cell 2
      read_foo = pipeline | 'Read' >> foo_transform
      ib.watch({'read_foo': read_foo})

    signature_with_only_foo = bcj.extract_source_to_cache_signature(pipeline)

    with cell:  # Cell 3
      read_bar = pipeline | 'Read' >> bar_transform
      ib.watch({'read_bar': read_bar})

    self.assertTrue(bcj.is_source_to_cache_changed(pipeline))
    signature_with_foo_bar = ie.current_env().get_cached_source_signature(
        pipeline)
    self.assertNotEqual(signature_with_only_foo, signature_with_foo_bar)

    # Visitor that removes the bar read from the pipeline's transform tree.
    class BarPruneVisitor(PipelineVisitor):
      def enter_composite_transform(self, transform_node):
        # Remove from a copy so the tuple being iterated is not mutated.
        pruned_parts = list(transform_node.parts)
        for part in transform_node.parts:
          if part.transform is bar_transform:
            pruned_parts.remove(part)
        transform_node.parts = tuple(pruned_parts)
        self.visit_transform(transform_node)

      def visit_transform(self, transform_node):
        # Detach the bar transform node from its parent.
        if transform_node.transform is bar_transform:
          transform_node.parent = None

    v = BarPruneVisitor()
    pipeline.visit(v)

    signature_after_pruning_bar = bcj.extract_source_to_cache_signature(
        pipeline)
    self.assertEqual(signature_with_only_foo, signature_after_pruning_bar)
    self.assertFalse(bcj.is_source_to_cache_changed(pipeline))

  def test_determine_a_test_stream_service_running(self):
    pipeline = _build_an_empty_stream_pipeline()
    test_stream_service = TestStreamServiceController(reader=None)
    test_stream_service.start()
    ie.current_env().set_test_stream_service_controller(
        pipeline, test_stream_service)
    self.assertTrue(bcj.is_a_test_stream_service_running(pipeline))
    # the test_stream_service will be cleaned up on teardown.

  def test_stop_a_running_test_stream_service(self):
    pipeline = _build_an_empty_stream_pipeline()
    test_stream_service = TestStreamServiceController(reader=None)
    test_stream_service.start()
    ie.current_env().set_test_stream_service_controller(
        pipeline, test_stream_service)
    bcj.attempt_to_stop_test_stream_service(pipeline)
    self.assertFalse(bcj.is_a_test_stream_service_running(pipeline))

  @patch(
      'apache_beam.testing.test_stream_service.TestStreamServiceController'
      '.stop')
  def test_noop_when_no_test_stream_service_running(self, _mocked_stop):
    # No controller is registered for the pipeline, so the (mocked) stop()
    # must never be invoked.
    pipeline = _build_an_empty_stream_pipeline()
    self.assertFalse(bcj.is_a_test_stream_service_running(pipeline))
    bcj.attempt_to_stop_test_stream_service(pipeline)
    _mocked_stop.assert_not_called()


if __name__ == '__main__':
  unittest.main()