github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/test_stream_it_test.py

github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/test_stream_it_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Integration tests for the test_stream module."""
    19  
    20  # pytype: skip-file
    21  
    22  import unittest
    23  from functools import wraps
    24  
    25  import pytest
    26  
    27  import apache_beam as beam
    28  from apache_beam.options.pipeline_options import StandardOptions
    29  from apache_beam.testing.test_pipeline import TestPipeline
    30  from apache_beam.testing.test_stream import TestStream
    31  from apache_beam.testing.util import assert_that
    32  from apache_beam.testing.util import equal_to
    33  from apache_beam.testing.util import equal_to_per_window
    34  from apache_beam.transforms import trigger
    35  from apache_beam.transforms import window
    36  from apache_beam.transforms.window import FixedWindows
    37  from apache_beam.transforms.window import TimestampedValue
    38  from apache_beam.utils import timestamp
    39  from apache_beam.utils.timestamp import Timestamp
    40  
    41  
    42  def supported(runners):
    43    if not isinstance(runners, list):
    44      runners = [runners]
    45  
    46    def inner(fn):
    47      @wraps(fn)
    48      def wrapped(self):
    49        if self.runner_name not in runners:
    50          self.skipTest(
    51              'The "{}", does not support the TestStream transform. '
    52              'Supported runners: {}'.format(self.runner_name, runners))
    53        else:
    54          return fn(self)
    55  
    56      return wrapped
    57  
    58    return inner
    59  
    60  
    61  class TestStreamIntegrationTests(unittest.TestCase):
    62    @classmethod
    63    def setUpClass(cls):
    64      cls.test_pipeline = TestPipeline(is_integration_test=True)
    65      cls.args = cls.test_pipeline.get_full_options_as_args()
    66      cls.runner_name = type(cls.test_pipeline.runner).__name__
    67      cls.project = cls.test_pipeline.get_option('project')
    68  
    69    @supported(['DirectRunner', 'SwitchingDirectRunner'])
    70    @pytest.mark.it_postcommit
    71    def test_basic_execution(self):
    72      test_stream = (
    73          TestStream().advance_watermark_to(10).add_elements([
    74              'a', 'b', 'c'
    75          ]).advance_watermark_to(20).add_elements(['d']).add_elements([
    76              'e'
    77          ]).advance_processing_time(10).advance_watermark_to(300).add_elements([
    78              TimestampedValue('late', 12)
    79          ]).add_elements([TimestampedValue('last', 310)
    80                           ]).advance_watermark_to_infinity())
    81  
    82      class RecordFn(beam.DoFn):
    83        def process(
    84            self,
    85            element=beam.DoFn.ElementParam,
    86            timestamp=beam.DoFn.TimestampParam):
    87          yield (element, timestamp)
    88  
    89      with beam.Pipeline(argv=self.args) as p:
    90        my_record_fn = RecordFn()
    91        records = p | test_stream | beam.ParDo(my_record_fn)
    92  
    93        assert_that(
    94            records,
    95            equal_to([
    96                ('a', timestamp.Timestamp(10)),
    97                ('b', timestamp.Timestamp(10)),
    98                ('c', timestamp.Timestamp(10)),
    99                ('d', timestamp.Timestamp(20)),
   100                ('e', timestamp.Timestamp(20)),
   101                ('late', timestamp.Timestamp(12)),
   102                ('last', timestamp.Timestamp(310)),
   103            ]))
   104  
   105    @supported(['DirectRunner', 'SwitchingDirectRunner'])
   106    @pytest.mark.it_postcommit
   107    def test_multiple_outputs(self):
   108      """Tests that the TestStream supports emitting to multiple PCollections."""
   109      letters_elements = [
   110          TimestampedValue('a', 6),
   111          TimestampedValue('b', 7),
   112          TimestampedValue('c', 8),
   113      ]
   114      numbers_elements = [
   115          TimestampedValue('1', 11),
   116          TimestampedValue('2', 12),
   117          TimestampedValue('3', 13),
   118      ]
   119      test_stream = (
   120          TestStream().advance_watermark_to(5, tag='letters').add_elements(
   121              letters_elements,
   122              tag='letters').advance_watermark_to(10, tag='numbers').add_elements(
   123                  numbers_elements, tag='numbers'))
   124  
   125      class RecordFn(beam.DoFn):
   126        def process(
   127            self,
   128            element=beam.DoFn.ElementParam,
   129            timestamp=beam.DoFn.TimestampParam):
   130          yield (element, timestamp)
   131  
   132      options = StandardOptions(streaming=True)
   133      p = TestPipeline(is_integration_test=True, options=options)
   134  
   135      main = p | test_stream
   136      letters = main['letters'] | 'record letters' >> beam.ParDo(RecordFn())
   137      numbers = main['numbers'] | 'record numbers' >> beam.ParDo(RecordFn())
   138  
   139      assert_that(
   140          letters,
   141          equal_to([('a', Timestamp(6)), ('b', Timestamp(7)),
   142                    ('c', Timestamp(8))]),
   143          label='assert letters')
   144  
   145      assert_that(
   146          numbers,
   147          equal_to([('1', Timestamp(11)), ('2', Timestamp(12)),
   148                    ('3', Timestamp(13))]),
   149          label='assert numbers')
   150  
   151      p.run()
   152  
   153    @supported(['DirectRunner', 'SwitchingDirectRunner'])
   154    @pytest.mark.it_postcommit
   155    def test_multiple_outputs_with_watermark_advancement(self):
   156      """Tests that the TestStream can independently control output watermarks."""
   157  
   158      # Purposely set the watermark of numbers to 20 then letters to 5 to test
   159      # that the watermark advancement is per PCollection.
   160      #
   161      # This creates two PCollections, (a, b, c) and (1, 2, 3). These will be
   162      # emitted at different times so that they will have different windows. The
   163      # watermark advancement is checked by checking their windows. If the
   164      # watermark does not advance, then the windows will be [-inf, -inf). If the
   165      # windows do not advance separately, then the PCollections will both
   166      # windowed in [15, 30).
   167      letters_elements = [
   168          TimestampedValue('a', 6),
   169          TimestampedValue('b', 7),
   170          TimestampedValue('c', 8),
   171      ]
   172      numbers_elements = [
   173          TimestampedValue('1', 21),
   174          TimestampedValue('2', 22),
   175          TimestampedValue('3', 23),
   176      ]
   177      test_stream = (
   178          TestStream().advance_watermark_to(
   179              0, tag='letters').advance_watermark_to(
   180                  0, tag='numbers').advance_watermark_to(
   181                      20, tag='numbers').advance_watermark_to(
   182                          5, tag='letters').add_elements(
   183                              letters_elements,
   184                              tag='letters').advance_watermark_to(
   185                                  10, tag='letters').add_elements(
   186                                      numbers_elements,
   187                                      tag='numbers').advance_watermark_to(
   188                                          30, tag='numbers'))
   189  
   190      options = StandardOptions(streaming=True)
   191      p = TestPipeline(is_integration_test=True, options=options)
   192  
   193      main = p | test_stream
   194  
   195      # Use an AfterWatermark trigger with an early firing to test that the
   196      # watermark is advancing properly and that the element is being emitted in
   197      # the correct window.
   198      letters = (
   199          main['letters']
   200          | 'letter windows' >> beam.WindowInto(
   201              FixedWindows(15),
   202              trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
   203              accumulation_mode=trigger.AccumulationMode.DISCARDING)
   204          | 'letter with key' >> beam.Map(lambda x: ('k', x))
   205          | 'letter gbk' >> beam.GroupByKey())
   206  
   207      numbers = (
   208          main['numbers']
   209          | 'number windows' >> beam.WindowInto(
   210              FixedWindows(15),
   211              trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
   212              accumulation_mode=trigger.AccumulationMode.DISCARDING)
   213          | 'number with key' >> beam.Map(lambda x: ('k', x))
   214          | 'number gbk' >> beam.GroupByKey())
   215  
   216      # The letters were emitted when the watermark was at 5, thus we expect to
   217      # see the elements in the [0, 15) window. We used an early trigger to make
   218      # sure that the ON_TIME empty pane was also emitted with a TestStream.
   219      # This pane has no data because of the early trigger causes the elements to
   220      # fire before the end of the window and because the accumulation mode
   221      # discards any data after the trigger fired.
   222      expected_letters = {
   223          window.IntervalWindow(0, 15): [
   224              ('k', ['a', 'b', 'c']),
   225              ('k', []),
   226          ],
   227      }
   228  
   229      # Same here, except the numbers were emitted at watermark = 20, thus they
   230      # are in the [15, 30) window.
   231      expected_numbers = {
   232          window.IntervalWindow(15, 30): [
   233              ('k', ['1', '2', '3']),
   234              ('k', []),
   235          ],
   236      }
   237      assert_that(
   238          letters,
   239          equal_to_per_window(expected_letters),
   240          label='letters assert per window')
   241      assert_that(
   242          numbers,
   243          equal_to_per_window(expected_numbers),
   244          label='numbers assert per window')
   245  
   246      p.run()
   247  
   248  
   249  if __name__ == '__main__':
   250    unittest.main()