github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/transforms/deduplicate_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  # pytype: skip-file
    19  
    20  """Unit tests for deduplicate transform by using TestStream."""
    21  
    22  import unittest
    23  
    24  import pytest
    25  
    26  import apache_beam as beam
    27  from apache_beam.coders import coders
    28  from apache_beam.testing.test_pipeline import TestPipeline
    29  from apache_beam.testing.test_stream import TestStream
    30  from apache_beam.testing.util import assert_that
    31  from apache_beam.testing.util import equal_to
    32  from apache_beam.testing.util import equal_to_per_window
    33  from apache_beam.transforms import deduplicate
    34  from apache_beam.transforms import window
    35  from apache_beam.utils.timestamp import Duration
    36  from apache_beam.utils.timestamp import Timestamp
    37  
    38  
    39  # TestStream is only supported in streaming pipeline. The Deduplicate transform
    40  # also requires Timer support. Sickbaying this testsuite until dataflow runner
    41  # supports both TestStream and user timer.
    42  @pytest.mark.no_sickbay_batch
    43  @pytest.mark.no_sickbay_streaming
    44  @pytest.mark.it_validatesrunner
    45  class DeduplicateTest(unittest.TestCase):
    46    def __init__(self, *args, **kwargs):
    47      self.runner = None
    48      self.options = None
    49      super().__init__(*args, **kwargs)
    50  
    51    def set_runner(self, runner):
    52      self.runner = runner
    53  
    54    def set_options(self, options):
    55      self.options = options
    56  
    57    def create_pipeline(self):
    58      if self.runner and self.options:
    59        return TestPipeline(runner=self.runner, options=self.options)
    60      elif self.runner:
    61        return TestPipeline(runner=self.runner)
    62      elif self.options:
    63        return TestPipeline(options=self.options)
    64      else:
    65        return TestPipeline()
    66  
    67    def test_deduplication_in_different_windows(self):
    68      with self.create_pipeline() as p:
    69        test_stream = (
    70            TestStream(
    71                coder=coders.StrUtf8Coder()).advance_watermark_to(0).add_elements(
    72                    [
    73                        window.TimestampedValue('k1', 0),
    74                        window.TimestampedValue('k2', 10),
    75                        window.TimestampedValue('k3', 20),
    76                        window.TimestampedValue('k1', 30),
    77                        window.TimestampedValue('k2', 40),
    78                        window.TimestampedValue('k3', 50),
    79                        window.TimestampedValue('k4', 60),
    80                        window.TimestampedValue('k5', 70),
    81                        window.TimestampedValue('k6', 80)
    82                    ]).advance_watermark_to_infinity())
    83  
    84        res = (
    85            p
    86            | test_stream
    87            | beam.WindowInto(window.FixedWindows(30))
    88            | deduplicate.Deduplicate(processing_time_duration=10 * 60)
    89            | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts)))
    90        # Deduplication should happen per window.
    91        expect_unique_keys_per_window = {
    92            window.IntervalWindow(0, 30): [('k1', Timestamp(0)),
    93                                           ('k2', Timestamp(10)),
    94                                           ('k3', Timestamp(20))],
    95            window.IntervalWindow(30, 60): [('k1', Timestamp(30)),
    96                                            ('k2', Timestamp(40)),
    97                                            ('k3', Timestamp(50))],
    98            window.IntervalWindow(60, 90): [('k4', Timestamp(60)),
    99                                            ('k5', Timestamp(70)),
   100                                            ('k6', Timestamp(80))],
   101        }
   102        assert_that(
   103            res,
   104            equal_to_per_window(expect_unique_keys_per_window),
   105            use_global_window=False,
   106            label='assert per window')
   107  
   108    @unittest.skip('TestStream not yet supported')
   109    def test_deduplication_with_event_time(self):
   110      deduplicate_duration = 60
   111      with self.create_pipeline() as p:
   112        test_stream = (
   113            TestStream(coder=coders.StrUtf8Coder()).with_output_types(
   114                str).advance_watermark_to(0).add_elements([
   115                    window.TimestampedValue('k1', 0),
   116                    window.TimestampedValue('k2', 20),
   117                    window.TimestampedValue('k3', 30)
   118                ]).advance_watermark_to(30).add_elements([
   119                    window.TimestampedValue('k1', 40),
   120                    window.TimestampedValue('k2', 50),
   121                    window.TimestampedValue('k3', 60)
   122                ]).advance_watermark_to(deduplicate_duration).add_elements([
   123                    window.TimestampedValue('k1', 70)
   124                ]).advance_watermark_to_infinity())
   125        res = (
   126            p
   127            | test_stream
   128            | deduplicate.Deduplicate(
   129                event_time_duration=Duration(deduplicate_duration))
   130            | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts)))
   131  
   132        assert_that(
   133            res,
   134            equal_to([('k1', Timestamp(0)), ('k2', Timestamp(20)),
   135                      ('k3', Timestamp(30)), ('k1', Timestamp(70))]))
   136  
   137    @unittest.skip('TestStream not yet supported')
   138    def test_deduplication_with_processing_time(self):
   139      deduplicate_duration = 60
   140      with self.create_pipeline() as p:
   141        test_stream = (
   142            TestStream(coder=coders.StrUtf8Coder()).with_output_types(
   143                str).advance_watermark_to(0).add_elements([
   144                    window.TimestampedValue('k1', 0),
   145                    window.TimestampedValue('k2', 20),
   146                    window.TimestampedValue('k3', 30)
   147                ]).advance_processing_time(30).add_elements([
   148                    window.TimestampedValue('k1', 40),
   149                    window.TimestampedValue('k2', 50),
   150                    window.TimestampedValue('k3', 60)
   151                ]).advance_processing_time(deduplicate_duration).add_elements([
   152                    window.TimestampedValue('k1', 70)
   153                ]).advance_watermark_to_infinity())
   154        res = (
   155            p
   156            | test_stream
   157            | deduplicate.Deduplicate(
   158                processing_time_duration=Duration(deduplicate_duration))
   159            | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts)))
   160        assert_that(
   161            res,
   162            equal_to([('k1', Timestamp(0)), ('k2', Timestamp(20)),
   163                      ('k3', Timestamp(30)), ('k1', Timestamp(70))]))
   164  
   165  
# Run this module's tests through unittest when it is executed as a script.
if __name__ == '__main__':
  unittest.main()