github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/interactive/background_caching_job_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Tests for apache_beam.runners.interactive.background_caching_job."""
    19  # pytype: skip-file
    20  
    21  import unittest
    22  from unittest.mock import patch
    23  
    24  import apache_beam as beam
    25  from apache_beam.options.pipeline_options import PipelineOptions
    26  from apache_beam.pipeline import PipelineVisitor
    27  from apache_beam.runners import runner
    28  from apache_beam.runners.interactive import background_caching_job as bcj
    29  from apache_beam.runners.interactive import interactive_beam as ib
    30  from apache_beam.runners.interactive import interactive_environment as ie
    31  from apache_beam.runners.interactive import interactive_runner
    32  from apache_beam.runners.interactive.caching.streaming_cache import StreamingCache
    33  from apache_beam.runners.interactive.testing.mock_ipython import mock_get_ipython
    34  from apache_beam.runners.interactive.testing.test_cache_manager import FileRecordsBuilder
    35  from apache_beam.testing.test_stream import TestStream
    36  from apache_beam.testing.test_stream_service import TestStreamServiceController
    37  from apache_beam.transforms.window import TimestampedValue
    38  
# Two distinct Pub/Sub subscription paths. The tests read from one or the other
# to build pipelines whose sources differ.
_FOO_PUBSUB_SUB = 'projects/test-project/subscriptions/foo'
_BAR_PUBSUB_SUB = 'projects/test-project/subscriptions/bar'
# Used both as the tag of the test records and as the key they are written
# under in the streaming cache (see _setup_test_streaming_cache).
_TEST_CACHE_KEY = 'test'
    42  
    43  
def _build_a_test_stream_pipeline():
  """Builds an InteractiveRunner pipeline that reads from a TestStream.

  The TestStream advances the watermark to 0, adds the element 'a' with
  timestamp 1, advances the processing time by 5 and finally advances the
  watermark to infinity.

  Returns:
    The pipeline with the TestStream applied. The locals of this function are
    registered with `ib.watch` before returning.
  """
  test_stream = (
      TestStream().advance_watermark_to(0).add_elements([
          TimestampedValue('a', 1)
      ]).advance_processing_time(5).advance_watermark_to_infinity())
  p = beam.Pipeline(runner=interactive_runner.InteractiveRunner())
  events = p | test_stream  # pylint: disable=possibly-unused-variable
  # locals() is keyed by the local variable names (test_stream, p, events), so
  # renaming any of them changes what gets watched.
  ib.watch(locals())
  return p
    53  
    54  
    55  def _build_an_empty_stream_pipeline():
    56    pipeline_options = PipelineOptions(streaming=True)
    57    p = beam.Pipeline(
    58        interactive_runner.InteractiveRunner(), options=pipeline_options)
    59    ib.watch({'pipeline': p})
    60    return p
    61  
    62  
    63  def _setup_test_streaming_cache(pipeline):
    64    cache_manager = StreamingCache(cache_dir=None)
    65    ie.current_env().set_cache_manager(cache_manager, pipeline)
    66    builder = FileRecordsBuilder(tag=_TEST_CACHE_KEY)
    67    (builder
    68        .advance_watermark(watermark_secs=0)
    69        .advance_processing_time(5)
    70        .add_element(element='a', event_time_secs=1)
    71        .advance_watermark(watermark_secs=100)
    72        .advance_processing_time(10)) # yapf: disable
    73    cache_manager.write(builder.build(), _TEST_CACHE_KEY)
    74  
    75  
@unittest.skipIf(
    not ie.current_env().is_interactive_ready,
    '[interactive] dependency is not installed.')
class BackgroundCachingJobTest(unittest.TestCase):
  """Tests for the helpers in the background_caching_job module.

  Covers three areas: whether running a pipeline registers a background
  caching job, whether a change of the sources to cache is detected, and the
  handling of the test stream service controller. The whole class is skipped
  when the current interactive environment is not interactive-ready.
  """
  def tearDown(self):
    """Calls `ie.new_env()` after every test."""
    # NOTE(review): presumably this replaces the global interactive
    # environment so state does not leak between tests -- confirm against
    # interactive_environment.new_env.
    ie.new_env()

  # TODO(BEAM-8335): remove the patches when there are appropriate test sources
  # that meet the boundedness checks.
  @patch(
      'apache_beam.runners.interactive.background_caching_job'
      '.has_source_to_cache',
      lambda x: True)
  # Disable the clean up so that we can keep the test streaming cache.
  @patch(
      'apache_beam.runners.interactive.interactive_environment'
      '.InteractiveEnvironment.cleanup',
      lambda x,
      y: None)
  def test_background_caching_job_starts_when_none_such_job_exists(self):
    """Running a pipeline without a prior job registers a new caching job.

    Also checks that the cached source signature stored in the environment
    equals the signature extracted from the pipeline.
    """

    # Create a fake PipelineResult and PipelineRunner. This is because we want
    # to test whether the BackgroundCachingJob can be started without having to
    # rely on a real pipeline run.
    class FakePipelineResult(beam.runners.runner.PipelineResult):
      def wait_until_finish(self):
        # Return immediately instead of waiting on a real job.
        return

    class FakePipelineRunner(beam.runners.PipelineRunner):
      def run_pipeline(self, pipeline, options):
        # Always report a result in the RUNNING state.
        return FakePipelineResult(beam.runners.runner.PipelineState.RUNNING)

    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(FakePipelineRunner()),
        options=PipelineOptions(streaming=True))

    # pylint: disable=possibly-unused-variable
    elems = p | 'Read' >> beam.io.ReadFromPubSub(subscription=_FOO_PUBSUB_SUB)

    # locals() is keyed by local variable names, so `p` and `elems` (among the
    # other locals) are watched under exactly those names.
    ib.watch(locals())

    _setup_test_streaming_cache(p)
    p.run()
    self.assertIsNotNone(ie.current_env().get_background_caching_job(p))
    expected_cached_source_signature = bcj.extract_source_to_cache_signature(p)
    # This is to check whether the cached source signature is set correctly
    # when the background caching job is started.
    self.assertEqual(
        expected_cached_source_signature,
        ie.current_env().get_cached_source_signature(p))

  @patch(
      'apache_beam.runners.interactive.background_caching_job'
      '.has_source_to_cache',
      lambda x: False)
  def test_background_caching_job_not_start_for_batch_pipeline(self):
    """No caching job is registered when there is no source to cache.

    `has_source_to_cache` is patched to always return False.
    """
    p = beam.Pipeline()

    # pylint: disable=expression-not-assigned
    p | beam.Create([])
    p.run()
    self.assertIsNone(ie.current_env().get_background_caching_job(p))

  @patch(
      'apache_beam.runners.interactive.background_caching_job'
      '.has_source_to_cache',
      lambda x: True)
  # Disable the clean up so that we can keep the test streaming cache.
  @patch(
      'apache_beam.runners.interactive.interactive_environment'
      '.InteractiveEnvironment.cleanup',
      lambda x,
      y: None)
  def test_background_caching_job_not_start_when_such_job_exists(self):
    """An already registered RUNNING job is kept when the pipeline runs."""
    p = _build_a_test_stream_pipeline()
    _setup_test_streaming_cache(p)
    # Register a job in the RUNNING state before running the pipeline.
    a_running_background_caching_job = bcj.BackgroundCachingJob(
        runner.PipelineResult(runner.PipelineState.RUNNING), limiters=[])
    ie.current_env().set_background_caching_job(
        p, a_running_background_caching_job)
    main_job_result = p.run()
    # No background caching job is started so result is still the running one.
    self.assertIs(
        a_running_background_caching_job,
        ie.current_env().get_background_caching_job(p))
    # A new main job is started so result of the main job is set.
    self.assertIs(main_job_result, ie.current_env().pipeline_result(p))

  @patch(
      'apache_beam.runners.interactive.background_caching_job'
      '.has_source_to_cache',
      lambda x: True)
  # Disable the clean up so that we can keep the test streaming cache.
  @patch(
      'apache_beam.runners.interactive.interactive_environment'
      '.InteractiveEnvironment.cleanup',
      lambda x,
      y: None)
  def test_background_caching_job_not_start_when_such_job_is_done(self):
    """An already registered DONE job is kept when the pipeline runs."""
    p = _build_a_test_stream_pipeline()
    _setup_test_streaming_cache(p)
    # Register a job in the DONE state before running the pipeline.
    a_done_background_caching_job = bcj.BackgroundCachingJob(
        runner.PipelineResult(runner.PipelineState.DONE), limiters=[])
    ie.current_env().set_background_caching_job(
        p, a_done_background_caching_job)
    main_job_result = p.run()
    # No background caching job is started so the registered job is still the
    # done one.
    self.assertIs(
        a_done_background_caching_job,
        ie.current_env().get_background_caching_job(p))
    # A new main job is started so result of the main job is set.
    self.assertIs(main_job_result, ie.current_env().pipeline_result(p))

  @patch('IPython.get_ipython', new_callable=mock_get_ipython)
  def test_source_to_cache_changed_when_pipeline_is_first_time_seen(self, cell):
    """A pipeline with no signature set in this test is reported as changed.

    Args:
      cell: the object created by `mock_get_ipython` for the patched
        `IPython.get_ipython`; used as a context manager around the code of
        each simulated notebook cell.
    """
    with cell:  # Cell 1
      pipeline = _build_an_empty_stream_pipeline()

    with cell:  # Cell 2
      read_foo = pipeline | 'Read' >> beam.io.ReadFromPubSub(
          subscription=_FOO_PUBSUB_SUB)
      ib.watch({'read_foo': read_foo})

    self.assertTrue(bcj.is_source_to_cache_changed(pipeline))

  @patch('IPython.get_ipython', new_callable=mock_get_ipython)
  def test_source_to_cache_changed_when_new_source_is_added(self, cell):
    """Adding a read from another subscription is reported as a change.

    Args:
      cell: the object created by `mock_get_ipython` for the patched
        `IPython.get_ipython`; used as a context manager around the code of
        each simulated notebook cell.
    """
    with cell:  # Cell 1
      pipeline = _build_an_empty_stream_pipeline()
      read_foo = pipeline | 'Read' >> beam.io.ReadFromPubSub(
          subscription=_FOO_PUBSUB_SUB)
      ib.watch({'read_foo': read_foo})

    # Sets the signature for current pipeline state.
    ie.current_env().set_cached_source_signature(
        pipeline, bcj.extract_source_to_cache_signature(pipeline))

    # The cache is looked up by the stringified id of the pipeline object.
    self.assertFalse(bcj.is_cache_complete(str(id(pipeline))))

    with cell:  # Cell 2
      read_bar = pipeline | 'Read' >> beam.io.ReadFromPubSub(
          subscription=_BAR_PUBSUB_SUB)
      ib.watch({'read_bar': read_bar})

    # NOTE(review): the cache is expected to be reported complete once the
    # sources changed; the reason lives in bcj.is_cache_complete -- confirm.
    self.assertTrue(bcj.is_cache_complete(str(id(pipeline))))
    self.assertTrue(bcj.is_source_to_cache_changed(pipeline))

  @patch('IPython.get_ipython', new_callable=mock_get_ipython)
  def test_source_to_cache_changed_when_source_is_altered(self, cell):
    """Mutating the source of an applied transform is reported as a change.

    Args:
      cell: the object created by `mock_get_ipython` for the patched
        `IPython.get_ipython`; used as a context manager around the code of
        each simulated notebook cell.
    """
    with cell:  # Cell 1
      pipeline = _build_an_empty_stream_pipeline()
      transform = beam.io.ReadFromPubSub(subscription=_FOO_PUBSUB_SUB)
      read_foo = pipeline | 'Read' >> transform
      ib.watch({'read_foo': read_foo})

    # Sets the signature for current pipeline state.
    ie.current_env().set_cached_source_signature(
        pipeline, bcj.extract_source_to_cache_signature(pipeline))

    with cell:  # Cell 2
      from apache_beam.io.gcp.pubsub import _PubSubSource
      # Alter the transform.
      transform._source = _PubSubSource(subscription=_BAR_PUBSUB_SUB)

    self.assertTrue(bcj.is_source_to_cache_changed(pipeline))

  @patch('IPython.get_ipython', new_callable=mock_get_ipython)
  def test_source_to_cache_not_changed_for_same_source(self, cell):
    """Re-applying a read of the same subscription is not a change.

    Checked both for the same transform instance and for a new instance built
    with the same subscription.

    Args:
      cell: the object created by `mock_get_ipython` for the patched
        `IPython.get_ipython`; used as a context manager around the code of
        each simulated notebook cell.
    """
    with cell:  # Cell 1
      pipeline = _build_an_empty_stream_pipeline()
      transform = beam.io.ReadFromPubSub(subscription=_FOO_PUBSUB_SUB)

    with cell:  # Cell 2
      read_foo_1 = pipeline | 'Read' >> transform
      ib.watch({'read_foo_1': read_foo_1})

    # Sets the signature for current pipeline state.
    ie.current_env().set_cached_source_signature(
        pipeline, bcj.extract_source_to_cache_signature(pipeline))

    with cell:  # Cell 3
      # Apply exactly the same transform and the same instance.
      read_foo_2 = pipeline | 'Read' >> transform
      ib.watch({'read_foo_2': read_foo_2})

    self.assertFalse(bcj.is_source_to_cache_changed(pipeline))

    with cell:  # Cell 4
      # Apply the same transform but represented in a different instance.
      # The signature representing the urn and payload is still the same, so it
      # is not treated as a new unbounded source.
      read_foo_3 = pipeline | 'Read' >> beam.io.ReadFromPubSub(
          subscription=_FOO_PUBSUB_SUB)
      ib.watch({'read_foo_3': read_foo_3})

    self.assertFalse(bcj.is_source_to_cache_changed(pipeline))

  @patch('IPython.get_ipython', new_callable=mock_get_ipython)
  def test_source_to_cache_not_changed_when_source_is_removed(self, cell):
    """Pruning a source from the pipeline is not reported as a change.

    Args:
      cell: the object created by `mock_get_ipython` for the patched
        `IPython.get_ipython`; used as a context manager around the code of
        each simulated notebook cell.
    """
    with cell:  # Cell 1
      pipeline = _build_an_empty_stream_pipeline()
      foo_transform = beam.io.ReadFromPubSub(subscription=_FOO_PUBSUB_SUB)
      bar_transform = beam.io.ReadFromPubSub(subscription=_BAR_PUBSUB_SUB)

    with cell:  # Cell 2
      read_foo = pipeline | 'Read' >> foo_transform
      ib.watch({'read_foo': read_foo})

    # Signature of the pipeline while it only reads from foo; compared against
    # the signatures taken after adding and after pruning bar.
    signature_with_only_foo = bcj.extract_source_to_cache_signature(pipeline)

    with cell:  # Cell 3
      read_bar = pipeline | 'Read' >> bar_transform
      ib.watch({'read_bar': read_bar})

    self.assertTrue(bcj.is_source_to_cache_changed(pipeline))
    signature_with_foo_bar = ie.current_env().get_cached_source_signature(
        pipeline)
    self.assertNotEqual(signature_with_only_foo, signature_with_foo_bar)

    class BarPruneVisitor(PipelineVisitor):
      """Removes the applied `bar_transform` from the visited pipeline."""
      def enter_composite_transform(self, transform_node):
        # Iterate the original parts while removing from a copy, so the
        # sequence being iterated is never mutated.
        pruned_parts = list(transform_node.parts)
        for part in transform_node.parts:
          if part.transform is bar_transform:
            pruned_parts.remove(part)
        transform_node.parts = tuple(pruned_parts)
        self.visit_transform(transform_node)

      def visit_transform(self, transform_node):
        # Detach the node applying bar_transform from its parent.
        if transform_node.transform is bar_transform:
          transform_node.parent = None

    v = BarPruneVisitor()
    pipeline.visit(v)

    signature_after_pruning_bar = bcj.extract_source_to_cache_signature(
        pipeline)
    self.assertEqual(signature_with_only_foo, signature_after_pruning_bar)
    self.assertFalse(bcj.is_source_to_cache_changed(pipeline))

  def test_determine_a_test_stream_service_running(self):
    """A started and registered test stream service is reported as running."""
    pipeline = _build_an_empty_stream_pipeline()
    test_stream_service = TestStreamServiceController(reader=None)
    test_stream_service.start()
    ie.current_env().set_test_stream_service_controller(
        pipeline, test_stream_service)
    self.assertTrue(bcj.is_a_test_stream_service_running(pipeline))
    # the test_stream_service will be cleaned up on teardown.

  def test_stop_a_running_test_stream_service(self):
    """Stopping a registered service makes it no longer reported as running."""
    pipeline = _build_an_empty_stream_pipeline()
    test_stream_service = TestStreamServiceController(reader=None)
    test_stream_service.start()
    ie.current_env().set_test_stream_service_controller(
        pipeline, test_stream_service)
    bcj.attempt_to_stop_test_stream_service(pipeline)
    self.assertFalse(bcj.is_a_test_stream_service_running(pipeline))

  @patch(
      'apache_beam.testing.test_stream_service.TestStreamServiceController'
      '.stop')
  def test_noop_when_no_test_stream_service_running(self, _mocked_stop):
    """Attempting to stop without a registered service never calls `stop`.

    Args:
      _mocked_stop: the mock that replaces
        `TestStreamServiceController.stop` for the duration of the test.
    """
    pipeline = _build_an_empty_stream_pipeline()
    self.assertFalse(bcj.is_a_test_stream_service_running(pipeline))
    bcj.attempt_to_stop_test_stream_service(pipeline)
    _mocked_stop.assert_not_called()
   342  
   343  
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
  unittest.main()