github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/transforms/window_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Unit tests for the windowing classes."""
    19  # pytype: skip-file
    20  
    21  import unittest
    22  
    23  import apache_beam as beam
    24  from apache_beam.coders import coders
    25  from apache_beam.runners import pipeline_context
    26  from apache_beam.testing.test_pipeline import TestPipeline
    27  from apache_beam.testing.util import assert_that
    28  from apache_beam.testing.util import equal_to
    29  from apache_beam.transforms import CombinePerKey
    30  from apache_beam.transforms import Create
    31  from apache_beam.transforms import FlatMapTuple
    32  from apache_beam.transforms import GroupByKey
    33  from apache_beam.transforms import Map
    34  from apache_beam.transforms import MapTuple
    35  from apache_beam.transforms import WindowInto
    36  from apache_beam.transforms import combiners
    37  from apache_beam.transforms import core
    38  from apache_beam.transforms.core import Windowing
    39  from apache_beam.transforms.trigger import AccumulationMode
    40  from apache_beam.transforms.trigger import AfterCount
    41  from apache_beam.transforms.window import FixedWindows
    42  from apache_beam.transforms.window import GlobalWindow
    43  from apache_beam.transforms.window import GlobalWindows
    44  from apache_beam.transforms.window import IntervalWindow
    45  from apache_beam.transforms.window import NonMergingWindowFn
    46  from apache_beam.transforms.window import Sessions
    47  from apache_beam.transforms.window import SlidingWindows
    48  from apache_beam.transforms.window import TimestampCombiner
    49  from apache_beam.transforms.window import TimestampedValue
    50  from apache_beam.transforms.window import WindowedValue
    51  from apache_beam.transforms.window import WindowFn
    52  from apache_beam.utils.timestamp import MAX_TIMESTAMP
    53  from apache_beam.utils.timestamp import MIN_TIMESTAMP
    54  
    55  
    56  def context(element, timestamp):
    57    return WindowFn.AssignContext(timestamp, element)
    58  
    59  
    60  class ReifyWindowsFn(core.DoFn):
    61    def process(self, element, window=core.DoFn.WindowParam):
    62      key, values = element
    63      yield "%s @ %s" % (key, window), values
    64  
    65  
    66  class TestCustomWindows(NonMergingWindowFn):
    67    """A custom non merging window fn which assigns elements into interval windows
    68    [0, 3), [3, 5) and [5, element timestamp) based on the element timestamps.
    69    """
    70    def assign(self, context):
    71      timestamp = context.timestamp
    72      if timestamp < 3:
    73        return [IntervalWindow(0, 3)]
    74      elif timestamp < 5:
    75        return [IntervalWindow(3, 5)]
    76      else:
    77        return [IntervalWindow(5, timestamp)]
    78  
    79    def get_window_coder(self):
    80      return coders.IntervalWindowCoder()
    81  
    82  
    83  class WindowTest(unittest.TestCase):
    84    def test_timestamped_value_cmp(self):
    85      self.assertEqual(TimestampedValue('a', 2), TimestampedValue('a', 2))
    86      self.assertEqual(TimestampedValue('a', 2), TimestampedValue('a', 2.0))
    87      self.assertNotEqual(TimestampedValue('a', 2), TimestampedValue('a', 2.1))
    88      self.assertNotEqual(TimestampedValue('a', 2), TimestampedValue('b', 2))
    89  
    90    def test_global_window(self):
    91      self.assertEqual(GlobalWindow(), GlobalWindow())
    92      self.assertNotEqual(
    93          GlobalWindow(), IntervalWindow(MIN_TIMESTAMP, MAX_TIMESTAMP))
    94      self.assertNotEqual(
    95          IntervalWindow(MIN_TIMESTAMP, MAX_TIMESTAMP), GlobalWindow())
    96      self.assertTrue(GlobalWindow().max_timestamp() < MAX_TIMESTAMP)
    97  
    98    def test_fixed_windows(self):
    99      # Test windows with offset: 2, 7, 12, 17, ...
   100      windowfn = FixedWindows(size=5, offset=2)
   101      self.assertEqual([IntervalWindow(7, 12)], windowfn.assign(context('v', 7)))
   102      self.assertEqual([IntervalWindow(7, 12)], windowfn.assign(context('v', 11)))
   103      self.assertEqual([IntervalWindow(12, 17)],
   104                       windowfn.assign(context('v', 12)))
   105  
   106      # Test windows without offset: 0, 5, 10, 15, ...
   107      windowfn = FixedWindows(size=5)
   108      self.assertEqual([IntervalWindow(5, 10)], windowfn.assign(context('v', 5)))
   109      self.assertEqual([IntervalWindow(5, 10)], windowfn.assign(context('v', 9)))
   110      self.assertEqual([IntervalWindow(10, 15)],
   111                       windowfn.assign(context('v', 10)))
   112  
   113      # Test windows with offset out of range.
   114      windowfn = FixedWindows(size=5, offset=12)
   115      self.assertEqual([IntervalWindow(7, 12)], windowfn.assign(context('v', 11)))
   116  
   117    def test_sliding_windows_assignment(self):
   118      windowfn = SlidingWindows(size=15, period=5, offset=2)
   119      expected = [
   120          IntervalWindow(7, 22), IntervalWindow(2, 17), IntervalWindow(-3, 12)
   121      ]
   122      self.assertEqual(expected, windowfn.assign(context('v', 7)))
   123      self.assertEqual(expected, windowfn.assign(context('v', 8)))
   124      self.assertEqual(expected, windowfn.assign(context('v', 11)))
   125  
   126    def test_sliding_windows_assignment_fraction(self):
   127      windowfn = SlidingWindows(size=3.5, period=2.5, offset=1.5)
   128      self.assertEqual([IntervalWindow(1.5, 5.0), IntervalWindow(-1.0, 2.5)],
   129                       windowfn.assign(context('v', 1.7)))
   130      self.assertEqual([IntervalWindow(1.5, 5.0)],
   131                       windowfn.assign(context('v', 3)))
   132  
   133    def test_sliding_windows_assignment_fraction_large_offset(self):
   134      windowfn = SlidingWindows(size=3.5, period=2.5, offset=4.0)
   135      self.assertEqual([IntervalWindow(1.5, 5.0), IntervalWindow(-1.0, 2.5)],
   136                       windowfn.assign(context('v', 1.7)))
   137      self.assertEqual([IntervalWindow(4.0, 7.5), IntervalWindow(1.5, 5.0)],
   138                       windowfn.assign(context('v', 4.5)))
   139  
   140    def test_sessions_merging(self):
   141      windowfn = Sessions(10)
   142  
   143      def merge(*timestamps):
   144        windows = [windowfn.assign(context(None, t)) for t in timestamps]
   145        running = set()
   146  
   147        class TestMergeContext(WindowFn.MergeContext):
   148          def __init__(self):
   149            super().__init__(running)
   150  
   151          def merge(self, to_be_merged, merge_result):
   152            for w in to_be_merged:
   153              if w in running:
   154                running.remove(w)
   155            running.add(merge_result)
   156  
   157        for ws in windows:
   158          running.update(ws)
   159          windowfn.merge(TestMergeContext())
   160        windowfn.merge(TestMergeContext())
   161        return sorted(running)
   162  
   163      self.assertEqual([IntervalWindow(2, 12)], merge(2))
   164      self.assertEqual([IntervalWindow(2, 12), IntervalWindow(19, 29)],
   165                       merge(2, 19))
   166  
   167      self.assertEqual([IntervalWindow(2, 19)], merge(2, 9))
   168      self.assertEqual([IntervalWindow(2, 19)], merge(9, 2))
   169  
   170      self.assertEqual([IntervalWindow(2, 19), IntervalWindow(19, 29)],
   171                       merge(2, 9, 19))
   172      self.assertEqual([IntervalWindow(2, 19), IntervalWindow(19, 29)],
   173                       merge(19, 9, 2))
   174  
   175      self.assertEqual([IntervalWindow(2, 25)], merge(2, 15, 10))
   176  
   177    def timestamped_key_values(self, pipeline, key, *timestamps):
   178      return (
   179          pipeline | 'start' >> Create(timestamps)
   180          | Map(lambda x: WindowedValue((key, x), x, [GlobalWindow()])))
   181  
   182    def test_sliding_windows(self):
   183      with TestPipeline() as p:
   184        pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3)
   185        result = (
   186            pcoll
   187            | 'w' >> WindowInto(SlidingWindows(period=2, size=4))
   188            | GroupByKey()
   189            | beam.MapTuple(lambda k, vs: (k, sorted(vs)))
   190            | beam.ParDo(ReifyWindowsFn()))
   191        expected = [('key @ [-2.0, 2.0)', [1]), ('key @ [0.0, 4.0)', [1, 2, 3]),
   192                    ('key @ [2.0, 6.0)', [2, 3])]
   193        assert_that(result, equal_to(expected))
   194  
   195    def test_sessions(self):
   196      with TestPipeline() as p:
   197        pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3, 20, 35, 27)
   198        sort_values = Map(lambda k_vs: (k_vs[0], sorted(k_vs[1])))
   199        result = (
   200            pcoll
   201            | 'w' >> WindowInto(Sessions(10))
   202            | GroupByKey()
   203            | sort_values
   204            | beam.ParDo(ReifyWindowsFn()))
   205        expected = [('key @ [1.0, 13.0)', [1, 2, 3]),
   206                    ('key @ [20.0, 45.0)', [20, 27, 35])]
   207        assert_that(result, equal_to(expected))
   208  
   209    def test_timestamped_value(self):
   210      with TestPipeline() as p:
   211        result = (
   212            p
   213            | 'start' >> Create([(k, k) for k in range(10)])
   214            | Map(lambda x_t: TimestampedValue(x_t[0], x_t[1]))
   215            | 'w' >> WindowInto(FixedWindows(5))
   216            | Map(lambda v: ('key', v))
   217            | GroupByKey()
   218            | beam.MapTuple(lambda k, vs: (k, sorted(vs))))
   219        assert_that(
   220            result,
   221            equal_to([('key', [0, 1, 2, 3, 4]), ('key', [5, 6, 7, 8, 9])]))
   222  
   223    def test_rewindow(self):
   224      with TestPipeline() as p:
   225        result = (
   226            p
   227            | Create([(k, k) for k in range(10)])
   228            | Map(lambda x_t1: TimestampedValue(x_t1[0], x_t1[1]))
   229            | 'window' >> WindowInto(SlidingWindows(period=2, size=6))
   230            # Per the model, each element is now duplicated across
   231            # three windows. Rewindowing must preserve this duplication.
   232            | 'rewindow' >> WindowInto(FixedWindows(5))
   233            | 'rewindow2' >> WindowInto(FixedWindows(5))
   234            | Map(lambda v: ('key', v))
   235            | GroupByKey()
   236            | beam.MapTuple(lambda k, vs: (k, sorted(vs))))
   237        assert_that(
   238            result,
   239            equal_to([('key', sorted([0, 1, 2, 3, 4] * 3)),
   240                      ('key', sorted([5, 6, 7, 8, 9] * 3))]))
   241  
   242    def test_rewindow_regroup(self):
   243      with TestPipeline() as p:
   244        grouped = (
   245            p
   246            | Create(range(5))
   247            | Map(lambda t: TimestampedValue(('key', t), t))
   248            | 'window' >> WindowInto(FixedWindows(5, offset=3))
   249            | GroupByKey()
   250            | MapTuple(lambda k, vs: (k, sorted(vs))))
   251        # Both of these group-and-ungroup sequences should be idempotent.
   252        regrouped1 = (
   253            grouped
   254            | 'w1' >> WindowInto(FixedWindows(5, offset=3))
   255            | 'g1' >> GroupByKey()
   256            | FlatMapTuple(lambda k, vs: [(k, v) for v in vs]))
   257        regrouped2 = (
   258            grouped
   259            | FlatMapTuple(lambda k, vs: [(k, v) for v in vs])
   260            | 'w2' >> WindowInto(FixedWindows(5, offset=3))
   261            | 'g2' >> GroupByKey()
   262            | MapTuple(lambda k, vs: (k, sorted(vs))))
   263        with_windows = Map(lambda e, w=beam.DoFn.WindowParam: (e, w))
   264        expected = [(('key', [0, 1, 2]), IntervalWindow(-2, 3)),
   265                    (('key', [3, 4]), IntervalWindow(3, 8))]
   266  
   267        assert_that(grouped | 'ww' >> with_windows, equal_to(expected))
   268        assert_that(
   269            regrouped1 | 'ww1' >> with_windows, equal_to(expected), label='r1')
   270        assert_that(
   271            regrouped2 | 'ww2' >> with_windows, equal_to(expected), label='r2')
   272  
   273    def test_timestamped_with_combiners(self):
   274      with TestPipeline() as p:
   275        result = (
   276            p
   277            # Create some initial test values.
   278            | 'start' >> Create([(k, k) for k in range(10)])
   279            # The purpose of the WindowInto transform is to establish a
   280            # FixedWindows windowing function for the PCollection.
   281            # It does not bucket elements into windows since the timestamps
   282            # from Create are not spaced 5 ms apart and very likely they all
   283            # fall into the same window.
   284            | 'w' >> WindowInto(FixedWindows(5))
   285            # Generate timestamped values using the values as timestamps.
   286            # Now there are values 5 ms apart and since Map propagates the
   287            # windowing function from input to output the output PCollection
   288            # will have elements falling into different 5ms windows.
   289            | Map(lambda x_t2: TimestampedValue(x_t2[0], x_t2[1]))
   290            # We add a 'key' to each value representing the index of the
   291            # window. This is important since there is no guarantee of
   292            # order for the elements of a PCollection.
   293            | Map(lambda v: (v // 5, v)))
   294        # Sum all elements associated with a key and window. Although it
   295        # is called CombinePerKey it is really CombinePerKeyAndWindow the
   296        # same way GroupByKey is really GroupByKeyAndWindow.
   297        sum_per_window = result | CombinePerKey(sum)
   298        # Compute mean per key and window.
   299        mean_per_window = result | combiners.Mean.PerKey()
   300        assert_that(
   301            sum_per_window, equal_to([(0, 10), (1, 35)]), label='assert:sum')
   302        assert_that(
   303            mean_per_window, equal_to([(0, 2.0), (1, 7.0)]), label='assert:mean')
   304  
   305    def test_custom_windows(self):
   306      with TestPipeline() as p:
   307        pcoll = self.timestamped_key_values(p, 'key', 0, 1, 2, 3, 4, 5, 6)
   308        # pylint: disable=abstract-class-instantiated
   309        result = (
   310            pcoll
   311            | 'custom window' >> WindowInto(TestCustomWindows())
   312            | GroupByKey()
   313            | 'sort values' >> MapTuple(lambda k, vs: (k, sorted(vs))))
   314        assert_that(
   315            result,
   316            equal_to([('key', [0, 1, 2]), ('key', [3, 4]), ('key', [5]),
   317                      ('key', [6])]))
   318  
   319    def test_window_assignment_idempotency(self):
   320      with TestPipeline() as p:
   321        pcoll = self.timestamped_key_values(p, 'key', 0, 2, 4)
   322        result = (
   323            pcoll
   324            | 'window' >> WindowInto(FixedWindows(2))
   325            | 'same window' >> WindowInto(FixedWindows(2))
   326            | 'same window again' >> WindowInto(FixedWindows(2))
   327            | GroupByKey())
   328  
   329        assert_that(result, equal_to([('key', [0]), ('key', [2]), ('key', [4])]))
   330  
   331    def test_window_assignment_through_multiple_gbk_idempotency(self):
   332      with TestPipeline() as p:
   333        pcoll = self.timestamped_key_values(p, 'key', 0, 2, 4)
   334        result = (
   335            pcoll
   336            | 'window' >> WindowInto(FixedWindows(2))
   337            | 'gbk' >> GroupByKey()
   338            | 'same window' >> WindowInto(FixedWindows(2))
   339            | 'another gbk' >> GroupByKey()
   340            | 'same window again' >> WindowInto(FixedWindows(2))
   341            | 'gbk again' >> GroupByKey())
   342  
   343        assert_that(
   344            result,
   345            equal_to([('key', [[[0]]]), ('key', [[[2]]]), ('key', [[[4]]])]))
   346  
   347  
   348  class RunnerApiTest(unittest.TestCase):
   349    def test_windowfn_encoding(self):
   350      for window_fn in (GlobalWindows(),
   351                        FixedWindows(37),
   352                        SlidingWindows(2, 389),
   353                        Sessions(5077)):
   354        context = pipeline_context.PipelineContext()
   355        self.assertEqual(
   356            window_fn,
   357            WindowFn.from_runner_api(window_fn.to_runner_api(context), context))
   358  
   359    def test_windowing_encoding(self):
   360      for windowing in (Windowing(GlobalWindows()),
   361                        Windowing(
   362                            FixedWindows(1, 3),
   363                            AfterCount(6),
   364                            accumulation_mode=AccumulationMode.ACCUMULATING),
   365                        Windowing(
   366                            SlidingWindows(10, 15, 21),
   367                            AfterCount(28),
   368                            timestamp_combiner=TimestampCombiner.OUTPUT_AT_LATEST,
   369                            accumulation_mode=AccumulationMode.DISCARDING)):
   370        context = pipeline_context.PipelineContext()
   371        self.assertEqual(
   372            windowing,
   373            Windowing.from_runner_api(windowing.to_runner_api(context), context))
   374  
   375  
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
  unittest.main()