#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for the test_stream module."""

# pytype: skip-file

import unittest
from functools import wraps

import pytest

import apache_beam as beam
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.test_stream import TestStream
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from apache_beam.testing.util import equal_to_per_window
from apache_beam.transforms import trigger
from apache_beam.transforms import window
from apache_beam.transforms.window import FixedWindows
from apache_beam.transforms.window import TimestampedValue
from apache_beam.utils import timestamp
from apache_beam.utils.timestamp import Timestamp


def supported(runners):
  """Returns a decorator that skips a test on runners that are not listed.

  Args:
    runners: a single runner class name (``str``) or an iterable of runner
      class names on which the decorated test is allowed to run.

  Returns:
    A decorator for test methods. The decorated method compares
    ``self.runner_name`` with ``runners`` and calls ``self.skipTest`` when the
    current runner is not listed; otherwise it runs the original test and
    returns its result.
  """
  # Only a bare string is a single runner name. Any other iterable (list,
  # tuple, set, ...) is copied element-wise; wrapping it whole would make the
  # membership test below never match and silently skip every test.
  if isinstance(runners, str):
    runners = [runners]
  else:
    runners = list(runners)

  def inner(fn):
    @wraps(fn)
    def wrapped(self, *args, **kwargs):
      if self.runner_name not in runners:
        self.skipTest(
            'The "{}", does not support the TestStream transform. '
            'Supported runners: {}'.format(self.runner_name, runners))
      else:
        return fn(self, *args, **kwargs)

    return wrapped

  return inner


class _RecordFn(beam.DoFn):
  """Emits every input element paired with its timestamp."""
  def process(
      self,
      element=beam.DoFn.ElementParam,
      timestamp=beam.DoFn.TimestampParam):
    yield (element, timestamp)


class TestStreamIntegrationTests(unittest.TestCase):
  """Integration tests that run TestStream pipelines on the configured runner.

  Every test is decorated with ``supported`` and is therefore skipped unless
  the runner is ``DirectRunner`` or ``SwitchingDirectRunner``.
  """
  @classmethod
  def setUpClass(cls):
    """Reads the pipeline options once and caches what the tests need."""
    cls.test_pipeline = TestPipeline(is_integration_test=True)
    cls.args = cls.test_pipeline.get_full_options_as_args()
    # The runner's class name is what the ``supported`` decorator checks.
    cls.runner_name = type(cls.test_pipeline.runner).__name__
    # Not read by any test in this module.
    cls.project = cls.test_pipeline.get_option('project')

  @supported(['DirectRunner', 'SwitchingDirectRunner'])
  @pytest.mark.it_postcommit
  def test_basic_execution(self):
    """Tests that TestStream emits its elements with the expected timestamps."""
    test_stream = (
        TestStream()
        .advance_watermark_to(10)
        .add_elements(['a', 'b', 'c'])
        .advance_watermark_to(20)
        .add_elements(['d'])
        .add_elements(['e'])
        .advance_processing_time(10)
        .advance_watermark_to(300)
        .add_elements([TimestampedValue('late', 12)])
        .add_elements([TimestampedValue('last', 310)])
        .advance_watermark_to_infinity())

    with beam.Pipeline(argv=self.args) as p:
      records = p | test_stream | beam.ParDo(_RecordFn())

      # Elements added without an explicit timestamp are expected to carry the
      # watermark value that was current when they were added (10, then 20).
      assert_that(
          records,
          equal_to([
              ('a', timestamp.Timestamp(10)),
              ('b', timestamp.Timestamp(10)),
              ('c', timestamp.Timestamp(10)),
              ('d', timestamp.Timestamp(20)),
              ('e', timestamp.Timestamp(20)),
              ('late', timestamp.Timestamp(12)),
              ('last', timestamp.Timestamp(310)),
          ]))

  @supported(['DirectRunner', 'SwitchingDirectRunner'])
  @pytest.mark.it_postcommit
  def test_multiple_outputs(self):
    """Tests that the TestStream supports emitting to multiple PCollections."""
    letters_elements = [
        TimestampedValue('a', 6),
        TimestampedValue('b', 7),
        TimestampedValue('c', 8),
    ]
    numbers_elements = [
        TimestampedValue('1', 11),
        TimestampedValue('2', 12),
        TimestampedValue('3', 13),
    ]
    test_stream = (
        TestStream()
        .advance_watermark_to(5, tag='letters')
        .add_elements(letters_elements, tag='letters')
        .advance_watermark_to(10, tag='numbers')
        .add_elements(numbers_elements, tag='numbers'))

    options = StandardOptions(streaming=True)
    p = TestPipeline(is_integration_test=True, options=options)

    # The TestStream output is indexed by tag to get one PCollection per tag.
    main = p | test_stream
    letters = main['letters'] | 'record letters' >> beam.ParDo(_RecordFn())
    numbers = main['numbers'] | 'record numbers' >> beam.ParDo(_RecordFn())

    assert_that(
        letters,
        equal_to([('a', Timestamp(6)), ('b', Timestamp(7)),
                  ('c', Timestamp(8))]),
        label='assert letters')

    assert_that(
        numbers,
        equal_to([('1', Timestamp(11)), ('2', Timestamp(12)),
                  ('3', Timestamp(13))]),
        label='assert numbers')

    p.run()

  @supported(['DirectRunner', 'SwitchingDirectRunner'])
  @pytest.mark.it_postcommit
  def test_multiple_outputs_with_watermark_advancement(self):
    """Tests that the TestStream can independently control output watermarks."""

    # Purposely set the watermark of numbers to 20 then letters to 5 to test
    # that the watermark advancement is per PCollection.
    #
    # This creates two PCollections, (a, b, c) and (1, 2, 3). These will be
    # emitted at different times so that they will have different windows. The
    # watermark advancement is checked by checking their windows. If the
    # watermark does not advance, then the windows will be [-inf, -inf). If the
    # windows do not advance separately, then the PCollections will both be
    # windowed in [15, 30).
    letters_elements = [
        TimestampedValue('a', 6),
        TimestampedValue('b', 7),
        TimestampedValue('c', 8),
    ]
    numbers_elements = [
        TimestampedValue('1', 21),
        TimestampedValue('2', 22),
        TimestampedValue('3', 23),
    ]
    test_stream = (
        TestStream()
        .advance_watermark_to(0, tag='letters')
        .advance_watermark_to(0, tag='numbers')
        .advance_watermark_to(20, tag='numbers')
        .advance_watermark_to(5, tag='letters')
        .add_elements(letters_elements, tag='letters')
        .advance_watermark_to(10, tag='letters')
        .add_elements(numbers_elements, tag='numbers')
        .advance_watermark_to(30, tag='numbers'))

    options = StandardOptions(streaming=True)
    p = TestPipeline(is_integration_test=True, options=options)

    main = p | test_stream

    # Use an AfterWatermark trigger with an early firing to test that the
    # watermark is advancing properly and that the element is being emitted in
    # the correct window.
    letters = (
        main['letters']
        | 'letter windows' >> beam.WindowInto(
            FixedWindows(15),
            trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        | 'letter with key' >> beam.Map(lambda x: ('k', x))
        | 'letter gbk' >> beam.GroupByKey())

    numbers = (
        main['numbers']
        | 'number windows' >> beam.WindowInto(
            FixedWindows(15),
            trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        | 'number with key' >> beam.Map(lambda x: ('k', x))
        | 'number gbk' >> beam.GroupByKey())

    # The letters were emitted when the watermark was at 5, thus we expect to
    # see the elements in the [0, 15) window. We used an early trigger to make
    # sure that the ON_TIME empty pane was also emitted with a TestStream.
    # This pane has no data because the early trigger causes the elements to
    # fire before the end of the window and because the accumulation mode
    # discards any data after the trigger fired.
    expected_letters = {
        window.IntervalWindow(0, 15): [
            ('k', ['a', 'b', 'c']),
            ('k', []),
        ],
    }

    # Same here, except the numbers were emitted at watermark = 20, thus they
    # are in the [15, 30) window.
    expected_numbers = {
        window.IntervalWindow(15, 30): [
            ('k', ['1', '2', '3']),
            ('k', []),
        ],
    }
    assert_that(
        letters,
        equal_to_per_window(expected_letters),
        label='letters assert per window')
    assert_that(
        numbers,
        equal_to_per_window(expected_numbers),
        label='numbers assert per window')

    p.run()


if __name__ == '__main__':
  unittest.main()