github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/transforms/deduplicate_test.py (about) 1 # 2 # Licensed to the Apache Software Foundation (ASF) under one or more 3 # contributor license agreements. See the NOTICE file distributed with 4 # this work for additional information regarding copyright ownership. 5 # The ASF licenses this file to You under the Apache License, Version 2.0 6 # (the "License"); you may not use this file except in compliance with 7 # the License. You may obtain a copy of the License at 8 # 9 # http://www.apache.org/licenses/LICENSE-2.0 10 # 11 # Unless required by applicable law or agreed to in writing, software 12 # distributed under the License is distributed on an "AS IS" BASIS, 13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 # See the License for the specific language governing permissions and 15 # limitations under the License. 16 # 17 18 # pytype: skip-file 19 20 """Unit tests for deduplicate transform by using TestStream.""" 21 22 import unittest 23 24 import pytest 25 26 import apache_beam as beam 27 from apache_beam.coders import coders 28 from apache_beam.testing.test_pipeline import TestPipeline 29 from apache_beam.testing.test_stream import TestStream 30 from apache_beam.testing.util import assert_that 31 from apache_beam.testing.util import equal_to 32 from apache_beam.testing.util import equal_to_per_window 33 from apache_beam.transforms import deduplicate 34 from apache_beam.transforms import window 35 from apache_beam.utils.timestamp import Duration 36 from apache_beam.utils.timestamp import Timestamp 37 38 39 # TestStream is only supported in streaming pipeline. The Deduplicate transform 40 # also requires Timer support. Sickbaying this testsuite until dataflow runner 41 # supports both TestStream and user timer. 
@pytest.mark.no_sickbay_batch
@pytest.mark.no_sickbay_streaming
@pytest.mark.it_validatesrunner
class DeduplicateTest(unittest.TestCase):
  """Tests for ``deduplicate.Deduplicate`` driven by ``TestStream`` input.

  ``set_runner`` and ``set_options`` let a caller inject a runner and/or
  pipeline options; ``create_pipeline`` passes whichever of them is truthy on
  to ``TestPipeline``.
  """
  def __init__(self, *args, **kwargs):
    # Both overrides start unset; create_pipeline() then builds a
    # TestPipeline without any arguments.
    self.runner = self.options = None
    super().__init__(*args, **kwargs)

  def set_runner(self, runner):
    # Remember the runner that create_pipeline() should hand to TestPipeline.
    self.runner = runner

  def set_options(self, options):
    # Remember the options that create_pipeline() should hand to TestPipeline.
    self.options = options

  def create_pipeline(self):
    # Forward only the overrides that are set (truthy); anything else is
    # simply not passed to TestPipeline.
    overrides = {}
    if self.runner:
      overrides['runner'] = self.runner
    if self.options:
      overrides['options'] = self.options
    return TestPipeline(**overrides)

  def test_deduplication_in_different_windows(self):
    # k1..k3 are sent twice (timestamps 0-20 and 30-50). The expected output
    # lists them once under IntervalWindow(0, 30) and once more under
    # IntervalWindow(30, 60).
    with self.create_pipeline() as pipeline:
      keyed_timestamps = [
          ('k1', 0),
          ('k2', 10),
          ('k3', 20),
          ('k1', 30),
          ('k2', 40),
          ('k3', 50),
          ('k4', 60),
          ('k5', 70),
          ('k6', 80),
      ]
      elements = [
          window.TimestampedValue(key, ts) for key, ts in keyed_timestamps
      ]

      stream = TestStream(coder=coders.StrUtf8Coder())
      stream = stream.advance_watermark_to(0)
      stream = stream.add_elements(elements)
      stream = stream.advance_watermark_to_infinity()

      windowed = pipeline | stream | beam.WindowInto(window.FixedWindows(30))
      deduped = windowed | deduplicate.Deduplicate(
          processing_time_duration=10 * 60)
      stamped = deduped | beam.Map(
          lambda e, ts=beam.DoFn.TimestampParam: (e, ts))

      # Deduplication should happen per window.
      expected_by_window = {
          window.IntervalWindow(0, 30): [
              ('k1', Timestamp(0)),
              ('k2', Timestamp(10)),
              ('k3', Timestamp(20)),
          ],
          window.IntervalWindow(30, 60): [
              ('k1', Timestamp(30)),
              ('k2', Timestamp(40)),
              ('k3', Timestamp(50)),
          ],
          window.IntervalWindow(60, 90): [
              ('k4', Timestamp(60)),
              ('k5', Timestamp(70)),
              ('k6', Timestamp(80)),
          ],
      }
      assert_that(
          stamped,
          equal_to_per_window(expected_by_window),
          use_global_window=False,
          label='assert per window')

  @unittest.skip('TestStream not yet supported')
  def test_deduplication_with_event_time(self):
    # The second batch repeats k1..k3 and is absent from the expected output.
    # The k1 added after the watermark reaches the deduplication duration is
    # expected to be emitted again.
    dedup_duration = 60
    with self.create_pipeline() as pipeline:
      first_batch = [
          window.TimestampedValue(key, ts)
          for key, ts in (('k1', 0), ('k2', 20), ('k3', 30))
      ]
      repeated_batch = [
          window.TimestampedValue(key, ts)
          for key, ts in (('k1', 40), ('k2', 50), ('k3', 60))
      ]
      late_batch = [window.TimestampedValue('k1', 70)]

      stream = TestStream(coder=coders.StrUtf8Coder())
      stream = stream.with_output_types(str)
      stream = stream.advance_watermark_to(0)
      stream = stream.add_elements(first_batch)
      stream = stream.advance_watermark_to(30)
      stream = stream.add_elements(repeated_batch)
      stream = stream.advance_watermark_to(dedup_duration)
      stream = stream.add_elements(late_batch)
      stream = stream.advance_watermark_to_infinity()

      deduped = pipeline | stream | deduplicate.Deduplicate(
          event_time_duration=Duration(dedup_duration))
      stamped = deduped | beam.Map(
          lambda e, ts=beam.DoFn.TimestampParam: (e, ts))

      expected = [
          ('k1', Timestamp(0)),
          ('k2', Timestamp(20)),
          ('k3', Timestamp(30)),
          ('k1', Timestamp(70)),
      ]
      assert_that(stamped, equal_to(expected))

  @unittest.skip('TestStream not yet supported')
  def test_deduplication_with_processing_time(self):
    # Same scenario as the event-time test, except that the stream advances
    # processing time (by 30, then by the deduplication duration) between
    # batches instead of moving the watermark.
    dedup_duration = 60
    with self.create_pipeline() as pipeline:
      first_batch = [
          window.TimestampedValue(key, ts)
          for key, ts in (('k1', 0), ('k2', 20), ('k3', 30))
      ]
      repeated_batch = [
          window.TimestampedValue(key, ts)
          for key, ts in (('k1', 40), ('k2', 50), ('k3', 60))
      ]
      late_batch = [window.TimestampedValue('k1', 70)]

      stream = TestStream(coder=coders.StrUtf8Coder())
      stream = stream.with_output_types(str)
      stream = stream.advance_watermark_to(0)
      stream = stream.add_elements(first_batch)
      stream = stream.advance_processing_time(30)
      stream = stream.add_elements(repeated_batch)
      stream = stream.advance_processing_time(dedup_duration)
      stream = stream.add_elements(late_batch)
      stream = stream.advance_watermark_to_infinity()

      deduped = pipeline | stream | deduplicate.Deduplicate(
          processing_time_duration=Duration(dedup_duration))
      stamped = deduped | beam.Map(
          lambda e, ts=beam.DoFn.TimestampParam: (e, ts))

      expected = [
          ('k1', Timestamp(0)),
          ('k2', Timestamp(20)),
          ('k3', Timestamp(30)),
          ('k1', Timestamp(70)),
      ]
      assert_that(stamped, equal_to(expected))


if __name__ == '__main__':
  unittest.main()