#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Unit tests for the windowing classes."""
# pytype: skip-file

import unittest

import apache_beam as beam
from apache_beam.coders import coders
from apache_beam.runners import pipeline_context
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from apache_beam.transforms import CombinePerKey
from apache_beam.transforms import Create
from apache_beam.transforms import FlatMapTuple
from apache_beam.transforms import GroupByKey
from apache_beam.transforms import Map
from apache_beam.transforms import MapTuple
from apache_beam.transforms import WindowInto
from apache_beam.transforms import combiners
from apache_beam.transforms import core
from apache_beam.transforms.core import Windowing
from apache_beam.transforms.trigger import AccumulationMode
from apache_beam.transforms.trigger import AfterCount
from apache_beam.transforms.window import FixedWindows
from apache_beam.transforms.window import GlobalWindow
from apache_beam.transforms.window import GlobalWindows
from apache_beam.transforms.window import IntervalWindow
from apache_beam.transforms.window import NonMergingWindowFn
from apache_beam.transforms.window import Sessions
from apache_beam.transforms.window import SlidingWindows
from apache_beam.transforms.window import TimestampCombiner
from apache_beam.transforms.window import TimestampedValue
from apache_beam.transforms.window import WindowedValue
from apache_beam.transforms.window import WindowFn
from apache_beam.utils.timestamp import MAX_TIMESTAMP
from apache_beam.utils.timestamp import MIN_TIMESTAMP


def context(element, timestamp):
  """Returns a WindowFn.AssignContext for `element` at `timestamp`.

  Note the argument order is swapped relative to the AssignContext
  constructor, which takes the timestamp first.
  """
  return WindowFn.AssignContext(timestamp, element)


class ReifyWindowsFn(core.DoFn):
  """DoFn that folds the element's window into its key.

  Takes `(key, values)` pairs and emits `("<key> @ <window>", values)`.
  """
  def process(self, element, window=core.DoFn.WindowParam):
    key, values = element
    yield "%s @ %s" % (key, window), values


class TestCustomWindows(NonMergingWindowFn):
  """A custom non merging window fn which assigns elements into interval windows
  [0, 3), [3, 5) and [5, element timestamp) based on the element timestamps.
  """
  def assign(self, context):
    timestamp = context.timestamp
    if timestamp < 3:
      return [IntervalWindow(0, 3)]
    elif timestamp < 5:
      return [IntervalWindow(3, 5)]
    else:
      return [IntervalWindow(5, timestamp)]

  def get_window_coder(self):
    return coders.IntervalWindowCoder()


class WindowTest(unittest.TestCase):
  """Tests for window assignment, merging and windowed pipelines."""
  def test_timestamped_value_cmp(self):
    """TimestampedValue equality depends on both value and timestamp."""
    self.assertEqual(TimestampedValue('a', 2), TimestampedValue('a', 2))
    self.assertEqual(TimestampedValue('a', 2), TimestampedValue('a', 2.0))
    self.assertNotEqual(TimestampedValue('a', 2), TimestampedValue('a', 2.1))
    self.assertNotEqual(TimestampedValue('a', 2), TimestampedValue('b', 2))

  def test_global_window(self):
    """GlobalWindow equals itself but not an all-time IntervalWindow."""
    self.assertEqual(GlobalWindow(), GlobalWindow())
    self.assertNotEqual(
        GlobalWindow(), IntervalWindow(MIN_TIMESTAMP, MAX_TIMESTAMP))
    self.assertNotEqual(
        IntervalWindow(MIN_TIMESTAMP, MAX_TIMESTAMP), GlobalWindow())
    # assertLess reports both operands on failure, unlike assertTrue(a < b).
    self.assertLess(GlobalWindow().max_timestamp(), MAX_TIMESTAMP)

  def test_fixed_windows(self):
    """FixedWindows.assign with, without, and with out-of-range offsets."""
    # Test windows with offset: 2, 7, 12, 17, ...
    windowfn = FixedWindows(size=5, offset=2)
    self.assertEqual([IntervalWindow(7, 12)], windowfn.assign(context('v', 7)))
    self.assertEqual([IntervalWindow(7, 12)], windowfn.assign(context('v', 11)))
    self.assertEqual([IntervalWindow(12, 17)],
                     windowfn.assign(context('v', 12)))

    # Test windows without offset: 0, 5, 10, 15, ...
    windowfn = FixedWindows(size=5)
    self.assertEqual([IntervalWindow(5, 10)], windowfn.assign(context('v', 5)))
    self.assertEqual([IntervalWindow(5, 10)], windowfn.assign(context('v', 9)))
    self.assertEqual([IntervalWindow(10, 15)],
                     windowfn.assign(context('v', 10)))

    # Test windows with offset out of range.
    windowfn = FixedWindows(size=5, offset=12)
    self.assertEqual([IntervalWindow(7, 12)], windowfn.assign(context('v', 11)))

  def test_sliding_windows_assignment(self):
    """Timestamps 7, 8 and 11 land in the same three sliding windows."""
    windowfn = SlidingWindows(size=15, period=5, offset=2)
    expected = [
        IntervalWindow(7, 22), IntervalWindow(2, 17), IntervalWindow(-3, 12)
    ]
    self.assertEqual(expected, windowfn.assign(context('v', 7)))
    self.assertEqual(expected, windowfn.assign(context('v', 8)))
    self.assertEqual(expected, windowfn.assign(context('v', 11)))

  def test_sliding_windows_assignment_fraction(self):
    """SlidingWindows.assign with fractional size, period and offset."""
    windowfn = SlidingWindows(size=3.5, period=2.5, offset=1.5)
    self.assertEqual([IntervalWindow(1.5, 5.0), IntervalWindow(-1.0, 2.5)],
                     windowfn.assign(context('v', 1.7)))
    self.assertEqual([IntervalWindow(1.5, 5.0)],
                     windowfn.assign(context('v', 3)))

  def test_sliding_windows_assignment_fraction_large_offset(self):
    """SlidingWindows.assign when the offset is larger than the period."""
    windowfn = SlidingWindows(size=3.5, period=2.5, offset=4.0)
    self.assertEqual([IntervalWindow(1.5, 5.0), IntervalWindow(-1.0, 2.5)],
                     windowfn.assign(context('v', 1.7)))
    self.assertEqual([IntervalWindow(4.0, 7.5), IntervalWindow(1.5, 5.0)],
                     windowfn.assign(context('v', 4.5)))

  def test_sessions_merging(self):
    """Sessions(10) merges overlapping windows; adjacent ones stay apart."""
    windowfn = Sessions(10)

    def merge(*timestamps):
      """Assigns each timestamp, merges, and returns the sorted windows."""
      windows = [windowfn.assign(context(None, t)) for t in timestamps]
      running = set()

      class TestMergeContext(WindowFn.MergeContext):
        """Merge context that applies merge results to `running`."""
        def __init__(self):
          super().__init__(running)

        def merge(self, to_be_merged, merge_result):
          for w in to_be_merged:
            if w in running:
              running.remove(w)
          running.add(merge_result)

      for ws in windows:
        running.update(ws)
        windowfn.merge(TestMergeContext())
      # One extra merge pass after every window has been added.
      windowfn.merge(TestMergeContext())
      return sorted(running)

    self.assertEqual([IntervalWindow(2, 12)], merge(2))
    self.assertEqual([IntervalWindow(2, 12), IntervalWindow(19, 29)],
                     merge(2, 19))

    self.assertEqual([IntervalWindow(2, 19)], merge(2, 9))
    self.assertEqual([IntervalWindow(2, 19)], merge(9, 2))

    self.assertEqual([IntervalWindow(2, 19), IntervalWindow(19, 29)],
                     merge(2, 9, 19))
    self.assertEqual([IntervalWindow(2, 19), IntervalWindow(19, 29)],
                     merge(19, 9, 2))

    self.assertEqual([IntervalWindow(2, 25)], merge(2, 15, 10))

  def timestamped_key_values(self, pipeline, key, *timestamps):
    """Creates `(key, t)` elements, each timestamped at its own `t`.

    Every element is emitted as a WindowedValue in the GlobalWindow.
    """
    return (
        pipeline | 'start' >> Create(timestamps)
        | Map(lambda x: WindowedValue((key, x), x, [GlobalWindow()])))

  def test_sliding_windows(self):
    """Groups values per key across overlapping sliding windows."""
    with TestPipeline() as p:
      pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3)
      result = (
          pcoll
          | 'w' >> WindowInto(SlidingWindows(period=2, size=4))
          | GroupByKey()
          | beam.MapTuple(lambda k, vs: (k, sorted(vs)))
          | beam.ParDo(ReifyWindowsFn()))
      expected = [('key @ [-2.0, 2.0)', [1]), ('key @ [0.0, 4.0)', [1, 2, 3]),
                  ('key @ [2.0, 6.0)', [2, 3])]
      assert_that(result, equal_to(expected))

  def test_sessions(self):
    """Groups values per key into merged session windows."""
    with TestPipeline() as p:
      pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3, 20, 35, 27)
      sort_values = Map(lambda k_vs: (k_vs[0], sorted(k_vs[1])))
      result = (
          pcoll
          | 'w' >> WindowInto(Sessions(10))
          | GroupByKey()
          | sort_values
          | beam.ParDo(ReifyWindowsFn()))
      expected = [('key @ [1.0, 13.0)', [1, 2, 3]),
                  ('key @ [20.0, 45.0)', [20, 27, 35])]
      assert_that(result, equal_to(expected))

  def test_timestamped_value(self):
    """TimestampedValue timestamps drive fixed-window bucketing."""
    with TestPipeline() as p:
      result = (
          p
          | 'start' >> Create([(k, k) for k in range(10)])
          | Map(lambda x_t: TimestampedValue(x_t[0], x_t[1]))
          | 'w' >> WindowInto(FixedWindows(5))
          | Map(lambda v: ('key', v))
          | GroupByKey()
          | beam.MapTuple(lambda k, vs: (k, sorted(vs))))
      assert_that(
          result,
          equal_to([('key', [0, 1, 2, 3, 4]), ('key', [5, 6, 7, 8, 9])]))

  def test_rewindow(self):
    """Rewindowing keeps the duplicates produced by sliding windows."""
    with TestPipeline() as p:
      result = (
          p
          | Create([(k, k) for k in range(10)])
          | Map(lambda x_t1: TimestampedValue(x_t1[0], x_t1[1]))
          | 'window' >> WindowInto(SlidingWindows(period=2, size=6))
          # Per the model, each element is now duplicated across
          # three windows. Rewindowing must preserve this duplication.
          | 'rewindow' >> WindowInto(FixedWindows(5))
          | 'rewindow2' >> WindowInto(FixedWindows(5))
          | Map(lambda v: ('key', v))
          | GroupByKey()
          | beam.MapTuple(lambda k, vs: (k, sorted(vs))))
      assert_that(
          result,
          equal_to([('key', sorted([0, 1, 2, 3, 4] * 3)),
                    ('key', sorted([5, 6, 7, 8, 9] * 3))]))

  def test_rewindow_regroup(self):
    """Re-applying the same windowing and grouping leaves results as is."""
    with TestPipeline() as p:
      grouped = (
          p
          | Create(range(5))
          | Map(lambda t: TimestampedValue(('key', t), t))
          | 'window' >> WindowInto(FixedWindows(5, offset=3))
          | GroupByKey()
          | MapTuple(lambda k, vs: (k, sorted(vs))))
      # Both of these group-and-ungroup sequences should be idempotent.
      regrouped1 = (
          grouped
          | 'w1' >> WindowInto(FixedWindows(5, offset=3))
          | 'g1' >> GroupByKey()
          | FlatMapTuple(lambda k, vs: [(k, v) for v in vs]))
      regrouped2 = (
          grouped
          | FlatMapTuple(lambda k, vs: [(k, v) for v in vs])
          | 'w2' >> WindowInto(FixedWindows(5, offset=3))
          | 'g2' >> GroupByKey()
          | MapTuple(lambda k, vs: (k, sorted(vs))))
      with_windows = Map(lambda e, w=beam.DoFn.WindowParam: (e, w))
      expected = [(('key', [0, 1, 2]), IntervalWindow(-2, 3)),
                  (('key', [3, 4]), IntervalWindow(3, 8))]

      assert_that(grouped | 'ww' >> with_windows, equal_to(expected))
      assert_that(
          regrouped1 | 'ww1' >> with_windows, equal_to(expected), label='r1')
      assert_that(
          regrouped2 | 'ww2' >> with_windows, equal_to(expected), label='r2')

  def test_timestamped_with_combiners(self):
    """CombinePerKey and Mean.PerKey combine per key and per window."""
    with TestPipeline() as p:
      result = (
          p
          # Create some initial test values.
          | 'start' >> Create([(k, k) for k in range(10)])
          # The purpose of the WindowInto transform is to establish a
          # FixedWindows windowing function for the PCollection.
          # It does not bucket elements into windows since the timestamps
          # from Create are not spaced 5 seconds apart and very likely they
          # all fall into the same window.
          | 'w' >> WindowInto(FixedWindows(5))
          # Generate timestamped values using the values as timestamps.
          # Now there are values 5 seconds apart and since Map propagates
          # the windowing function from input to output the output
          # PCollection will have elements falling into different 5-second
          # windows.
          | Map(lambda x_t2: TimestampedValue(x_t2[0], x_t2[1]))
          # We add a 'key' to each value representing the index of the
          # window. This is important since there is no guarantee of
          # order for the elements of a PCollection.
          | Map(lambda v: (v // 5, v)))
      # Sum all elements associated with a key and window. Although it
      # is called CombinePerKey it is really CombinePerKeyAndWindow the
      # same way GroupByKey is really GroupByKeyAndWindow.
      sum_per_window = result | CombinePerKey(sum)
      # Compute mean per key and window.
      mean_per_window = result | combiners.Mean.PerKey()
      assert_that(
          sum_per_window, equal_to([(0, 10), (1, 35)]), label='assert:sum')
      assert_that(
          mean_per_window, equal_to([(0, 2.0), (1, 7.0)]), label='assert:mean')

  def test_custom_windows(self):
    """A user-defined NonMergingWindowFn can be used with WindowInto."""
    with TestPipeline() as p:
      pcoll = self.timestamped_key_values(p, 'key', 0, 1, 2, 3, 4, 5, 6)
      # pylint: disable=abstract-class-instantiated
      result = (
          pcoll
          | 'custom window' >> WindowInto(TestCustomWindows())
          | GroupByKey()
          | 'sort values' >> MapTuple(lambda k, vs: (k, sorted(vs))))
      assert_that(
          result,
          equal_to([('key', [0, 1, 2]), ('key', [3, 4]), ('key', [5]),
                    ('key', [6])]))

  def test_window_assignment_idempotency(self):
    """Repeating the same WindowInto does not change the grouping."""
    with TestPipeline() as p:
      pcoll = self.timestamped_key_values(p, 'key', 0, 2, 4)
      result = (
          pcoll
          | 'window' >> WindowInto(FixedWindows(2))
          | 'same window' >> WindowInto(FixedWindows(2))
          | 'same window again' >> WindowInto(FixedWindows(2))
          | GroupByKey())

      assert_that(result, equal_to([('key', [0]), ('key', [2]), ('key', [4])]))

  def test_window_assignment_through_multiple_gbk_idempotency(self):
    """Repeating WindowInto between GroupByKeys keeps elements in place."""
    with TestPipeline() as p:
      pcoll = self.timestamped_key_values(p, 'key', 0, 2, 4)
      result = (
          pcoll
          | 'window' >> WindowInto(FixedWindows(2))
          | 'gbk' >> GroupByKey()
          | 'same window' >> WindowInto(FixedWindows(2))
          | 'another gbk' >> GroupByKey()
          | 'same window again' >> WindowInto(FixedWindows(2))
          | 'gbk again' >> GroupByKey())

      assert_that(
          result,
          equal_to([('key', [[[0]]]), ('key', [[[2]]]), ('key', [[[4]]])]))


class RunnerApiTest(unittest.TestCase):
  """Round-trip tests through the runner API representation."""
  def test_windowfn_encoding(self):
    """Each WindowFn survives a to/from runner API round trip."""
    for window_fn in (GlobalWindows(),
                      FixedWindows(37),
                      SlidingWindows(2, 389),
                      Sessions(5077)):
      # subTest makes a failure name the offending window_fn; it is a
      # no-op under runners whose result object lacks subtest support.
      with self.subTest(window_fn=window_fn):
        # Named pipeline_ctx so the module-level context() helper is not
        # shadowed.
        pipeline_ctx = pipeline_context.PipelineContext()
        self.assertEqual(
            window_fn,
            WindowFn.from_runner_api(
                window_fn.to_runner_api(pipeline_ctx), pipeline_ctx))

  def test_windowing_encoding(self):
    """Each Windowing survives a to/from runner API round trip."""
    for windowing in (Windowing(GlobalWindows()),
                      Windowing(
                          FixedWindows(1, 3),
                          AfterCount(6),
                          accumulation_mode=AccumulationMode.ACCUMULATING),
                      Windowing(
                          SlidingWindows(10, 15, 21),
                          AfterCount(28),
                          timestamp_combiner=TimestampCombiner.OUTPUT_AT_LATEST,
                          accumulation_mode=AccumulationMode.DISCARDING)):
      with self.subTest(windowing=windowing):
        pipeline_ctx = pipeline_context.PipelineContext()
        self.assertEqual(
            windowing,
            Windowing.from_runner_api(
                windowing.to_runner_api(pipeline_ctx), pipeline_ctx))


if __name__ == '__main__':
  unittest.main()