github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/transforms_test.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import typing
import unittest
import warnings

import pandas as pd

import apache_beam as beam
from apache_beam import coders
from apache_beam import metrics
from apache_beam.dataframe import convert
from apache_beam.dataframe import expressions
from apache_beam.dataframe import frame_base
from apache_beam.dataframe import transforms
from apache_beam.runners.portability.fn_api_runner import fn_runner
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to


def check_correct(expected, actual):
  if actual is None:
    raise AssertionError('Empty frame but expected: \n\n%s' % (expected))
  if isinstance(expected, pd.core.generic.NDFrame):
    expected = expected.sort_index()
    actual = actual.sort_index()

    if isinstance(expected, pd.Series):
      pd.testing.assert_series_equal(expected, actual)
    elif isinstance(expected, pd.DataFrame):
      pd.testing.assert_frame_equal(expected, actual)
    else:
      raise ValueError(
          f"Expected value is a {type(expected)}, "
          "not a Series or DataFrame.")
  else:
    if actual != expected:
      raise AssertionError('Scalars not equal: %s != %s' % (actual, expected))


def concat(parts):
  if len(parts) > 1:
    return pd.concat(parts)
  elif len(parts) == 1:
    return parts[0]
  else:
    return None


def df_equal_to(expected):
  return lambda actual: check_correct(expected, concat(actual))


AnimalSpeed = typing.NamedTuple(
    'AnimalSpeed', [('Animal', str), ('Speed', int)])
coders.registry.register_coder(AnimalSpeed, coders.RowCoder)
Nested = typing.NamedTuple(
    'Nested', [('id', int), ('animal_speed', AnimalSpeed)])
coders.registry.register_coder(Nested, coders.RowCoder)


class TransformTest(unittest.TestCase):
  def run_scenario(self, input, func):
    expected = func(input)

    empty = input.iloc[0:0]
    input_placeholder = expressions.PlaceholderExpression(empty)
    input_deferred = frame_base.DeferredFrame.wrap(input_placeholder)
    actual_deferred = func(input_deferred)._expr.evaluate_at(
        expressions.Session({input_placeholder: input}))

    check_correct(expected, actual_deferred)

    with beam.Pipeline() as p:
      input_pcoll = p | beam.Create([input.iloc[::2], input.iloc[1::2]])
      input_df = convert.to_dataframe(input_pcoll, proxy=empty)
      output_df = func(input_df)

      output_proxy = output_df._expr.proxy()
      if isinstance(output_proxy, pd.core.generic.NDFrame):
        self.assertTrue(
            output_proxy.iloc[:0].equals(expected.iloc[:0]),
            (
                'Output proxy is incorrect:\n'
                f'Expected:\n{expected.iloc[:0]}\n\n'
                f'Actual:\n{output_proxy.iloc[:0]}'))
      else:
        self.assertEqual(type(output_proxy), type(expected))

      output_pcoll = convert.to_pcollection(output_df, yield_elements='pandas')

      assert_that(
          output_pcoll, lambda actual: check_correct(expected, concat(actual)))
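
  # run_scenario checks each operation three ways: directly against pandas,
  # as a deferred expression evaluated in an expressions.Session, and
  # through a real Beam pipeline in which the input frame is split into two
  # interleaved partitions (input.iloc[::2] and input.iloc[1::2]). A minimal
  # sketch of that last round trip, where df is any concrete DataFrame and
  # func a pandas-style callable:
  #
  #   with beam.Pipeline() as p:
  #     pcoll = p | beam.Create([df.iloc[::2], df.iloc[1::2]])
  #     deferred = convert.to_dataframe(pcoll, proxy=df.iloc[:0])
  #     result = convert.to_pcollection(func(deferred), yield_elements='pandas')
  #
  # The proxy is an empty frame carrying the schema; the deferred result is
  # only computed when the pipeline runs.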

  def test_identity(self):
    df = pd.DataFrame({
        'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'],
        'Speed': [380., 370., 24., 26.]
    })
    self.run_scenario(df, lambda x: x)

  def test_groupby_sum_mean(self):
    df = pd.DataFrame({
        'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'],
        'Speed': [380., 370., 24., 26.]
    })
    self.run_scenario(df, lambda df: df.groupby('Animal').sum())
    with expressions.allow_non_parallel_operations():
      self.run_scenario(df, lambda df: df.groupby('Animal').mean())
    self.run_scenario(
        df, lambda df: df.loc[df.Speed > 25].groupby('Animal').sum())

  def test_groupby_apply(self):
    df = pd.DataFrame({
        'group': ['a' if i % 5 == 0 or i % 3 == 0 else 'b' for i in range(100)],
        'foo': [None if i % 11 == 0 else i for i in range(100)],
        'bar': [None if i % 7 == 0 else 99 - i for i in range(100)],
        'baz': [None if i % 13 == 0 else i * 2 for i in range(100)],
    })

    def median_sum_fn(x):
      with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message="Mean of empty slice")
        return (x.foo + x.bar).median()

    describe = lambda df: df.describe()

    self.run_scenario(df, lambda df: df.groupby('group').foo.apply(describe))
    self.run_scenario(
        df, lambda df: df.groupby('group')[['foo', 'bar']].apply(describe))
    self.run_scenario(df, lambda df: df.groupby('group').apply(median_sum_fn))
    self.run_scenario(
        df,
        lambda df: df.set_index('group').foo.groupby(level=0).apply(describe))
    self.run_scenario(df, lambda df: df.groupby(level=0).apply(median_sum_fn))
    self.run_scenario(
        df, lambda df: df.groupby(lambda x: x % 3).apply(describe))

  def test_filter(self):
    df = pd.DataFrame({
        'Animal': ['Aardvark', 'Ant', 'Elephant', 'Zebra'],
        'Speed': [5, 2, 35, 40]
    })
    self.run_scenario(df, lambda df: df.filter(items=['Animal']))
    self.run_scenario(df, lambda df: df.filter(regex='Anim.*'))
    self.run_scenario(
        df, lambda df: df.set_index('Animal').filter(regex='F.*', axis='index'))

    with expressions.allow_non_parallel_operations():
      a = pd.DataFrame({'col': [1, 2, 3]})
      self.run_scenario(a, lambda a: a.agg(sum))
      self.run_scenario(a, lambda a: a.agg(['mean', 'min', 'max']))

  def test_scalar(self):
    with expressions.allow_non_parallel_operations():
      a = pd.Series([1, 2, 6])
      self.run_scenario(a, lambda a: a.agg(sum))
      self.run_scenario(a, lambda a: a / a.agg(sum))
      self.run_scenario(a, lambda a: a / (a.max() - a.min()))
      self.run_scenario(a, lambda a: a / (a.sum() - 1))

      # Tests scalar being used as an input to a downstream stage.
      df = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [1, 2, 6]})
      self.run_scenario(
          df, lambda df: df.groupby('key').sum().val / df.val.agg(sum))
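
  # The allow_non_parallel_operations() blocks above are load-bearing:
  # aggregations whose result depends on seeing the whole dataset at once
  # (a global agg(sum), a scalar a.max(), or, above, a groupby mean) cannot
  # be evaluated partition-by-partition, so the deferred dataframe API
  # refuses them unless they are explicitly allowed. Outside such a block,
  # something like
  #
  #   deferred_series.agg(sum)
  #
  # is expected to raise rather than silently force a global collect.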

  def test_getitem_projection(self):
    df = pd.DataFrame({
        'Animal': ['Aardvark', 'Ant', 'Elephant', 'Zebra'],
        'Speed': [5, 2, 35, 40],
        'Size': ['Small', 'Extra Small', 'Large', 'Medium']
    })
    self.run_scenario(df, lambda df: df[['Speed', 'Size']])

  def test_offset_elementwise(self):
    s = pd.Series(range(10)).astype(float)
    df = pd.DataFrame({'value': s, 'square': s * s, 'cube': s * s * s})
    # Only those values that are both squares and cubes will intersect.
    self.run_scenario(
        df,
        lambda df: df.set_index('square').value + df.set_index('cube').value)

  def test_batching_named_tuple_input(self):
    with beam.Pipeline() as p:
      result = (
          p | beam.Create([
              AnimalSpeed('Aardvark', 5),
              AnimalSpeed('Ant', 2),
              AnimalSpeed('Elephant', 35),
              AnimalSpeed('Zebra', 40)
          ]).with_output_types(AnimalSpeed)
          | transforms.DataframeTransform(lambda df: df.filter(regex='Anim.*')))

      assert_that(
          result,
          equal_to([('Aardvark', ), ('Ant', ), ('Elephant', ), ('Zebra', )]))

  def test_batching_beam_row_input(self):
    with beam.Pipeline() as p:
      result = (
          p
          | beam.Create([(u'Falcon', 380.), (u'Falcon', 370.), (u'Parrot', 24.),
                         (u'Parrot', 26.)])
          | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1]))
          | transforms.DataframeTransform(
              lambda df: df.groupby('Animal').mean(), include_indexes=True))

      assert_that(result, equal_to([('Falcon', 375.), ('Parrot', 25.)]))

  def test_batching_beam_row_to_dataframe(self):
    with beam.Pipeline() as p:
      df = convert.to_dataframe(
          p
          | beam.Create([(u'Falcon', 380.), (u'Falcon', 370.), (
              u'Parrot', 24.), (u'Parrot', 26.)])
          | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1])))

      result = convert.to_pcollection(
          df.groupby('Animal').mean(), include_indexes=True)

      assert_that(result, equal_to([('Falcon', 375.), ('Parrot', 25.)]))

  def test_batching_passthrough_nested_schema(self):
    with beam.Pipeline() as p:
      nested_schema_pc = (
          p | beam.Create([Nested(1, AnimalSpeed('Aardvark', 5))
                           ]).with_output_types(Nested))
      result = nested_schema_pc | transforms.DataframeTransform(
          lambda df: df.filter(items=['animal_speed']))

      assert_that(result, equal_to([(('Aardvark', 5), )]))

  def test_batching_passthrough_nested_array(self):
    Array = typing.NamedTuple(
        'Array', [('id', int), ('business_numbers', typing.Sequence[int])])
    coders.registry.register_coder(Array, coders.RowCoder)

    with beam.Pipeline() as p:
      array_schema_pc = (p | beam.Create([Array(1, [7, 8, 9])]))
      result = array_schema_pc | transforms.DataframeTransform(
          lambda df: df.filter(items=['business_numbers']))

      assert_that(result, equal_to([([7, 8, 9], )]))

  def test_unbatching_series(self):
    with beam.Pipeline() as p:
      result = (
          p
          | beam.Create([(u'Falcon', 380.), (u'Falcon', 370.), (u'Parrot', 24.),
                         (u'Parrot', 26.)])
          | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1]))
          | transforms.DataframeTransform(lambda df: df.Animal))

      assert_that(result, equal_to(['Falcon', 'Falcon', 'Parrot', 'Parrot']))
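
  # DataframeTransform is the one-shot form of the to_dataframe /
  # to_pcollection round trip exercised above: any PCollection with a schema
  # (NamedTuples registered with a RowCoder, or beam.Row elements) is batched
  # into DataFrames, the callable runs against the deferred frame, and the
  # result is unbatched back into elements. With include_indexes=True the
  # grouped index ('Animal' above) is emitted as the leading field of each
  # output row. A minimal sketch, with rows_pcoll standing in for any
  # schema'd PCollection:
  #
  #   rows_pcoll | transforms.DataframeTransform(
  #       lambda df: df.groupby('Animal').mean(), include_indexes=True)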

  def test_input_output_polymorphism(self):
    one_series = pd.Series([1])
    two_series = pd.Series([2])
    three_series = pd.Series([3])
    proxy = one_series[:0]

    def equal_to_series(expected):
      def check(actual):
        actual = pd.concat(actual)
        if not expected.equals(actual):
          raise AssertionError(
              'Series not equal: \n%s\n%s\n' % (expected, actual))

      return check

    with beam.Pipeline() as p:
      one = p | 'One' >> beam.Create([one_series])
      two = p | 'Two' >> beam.Create([two_series])

      assert_that(
          one | 'PcollInPcollOut' >> transforms.DataframeTransform(
              lambda x: 3 * x, proxy=proxy, yield_elements='pandas'),
          equal_to_series(three_series),
          label='CheckPcollInPcollOut')

      assert_that(
          (one, two)
          | 'TupleIn' >> transforms.DataframeTransform(
              lambda x, y: (x + y), (proxy, proxy), yield_elements='pandas'),
          equal_to_series(three_series),
          label='CheckTupleIn')

      assert_that(
          dict(x=one, y=two)
          | 'DictIn' >> transforms.DataframeTransform(
              lambda x, y: (x + y),
              proxy=dict(x=proxy, y=proxy),
              yield_elements='pandas'),
          equal_to_series(three_series),
          label='CheckDictIn')

      double, triple = one | 'TupleOut' >> transforms.DataframeTransform(
          lambda x: (2 * x, 3 * x), proxy, yield_elements='pandas')
      assert_that(double, equal_to_series(two_series), 'CheckTupleOut0')
      assert_that(triple, equal_to_series(three_series), 'CheckTupleOut1')

      res = one | 'DictOut' >> transforms.DataframeTransform(
          lambda x: {'res': 3 * x}, proxy, yield_elements='pandas')
      assert_that(res['res'], equal_to_series(three_series), 'CheckDictOut')

  def test_cat(self):
    # Verify that cat works with a List[Series], since this case is missing
    # from the doctests.
    df = pd.DataFrame({
        'one': ['A', 'B', 'C'],
        'two': ['BB', 'CC', 'A'],
        'three': ['CCC', 'AA', 'B'],
    })
    self.run_scenario(df, lambda df: df.two.str.cat([df.three], join='outer'))
    self.run_scenario(
        df, lambda df: df.one.str.cat([df.two, df.three], join='outer'))

  def test_repeat(self):
    # Verify that repeat works with a Series of counts, since this case is
    # missing from the doctests.
    df = pd.DataFrame({
        'strings': ['A', 'B', 'C', 'D', 'E'],
        'repeats': [3, 1, 4, 5, 2],
    })
    self.run_scenario(df, lambda df: df.strings.str.repeat(df.repeats))

  def test_rename(self):
    df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    self.run_scenario(
        df, lambda df: df.rename(columns={'B': 'C'}, index={0: 2, 2: 0}))

    with expressions.allow_non_parallel_operations():
      self.run_scenario(
          df,
          lambda df: df.rename(
              columns={'B': 'C'}, index={0: 2, 2: 0}, errors='raise'))

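
# FusionTest below relies on the FnApiRunner's NUM_FUSED_STAGES_COUNTER
# metric: after a pipeline runs, the counter reports how many fused stages
# the runner actually executed. Elementwise dataframe operations introduce
# no shuffle, so a chain like df[df.Speed > 10] should collapse into a
# single stage (reshuffle=False on the Create keeps a Reshuffle from adding
# one). Querying the counter follows the same pattern as fused_stages below:
#
#   query_result = p.result.monitoring_metrics().query(
#       metrics.MetricsFilter().with_name(
#           fn_runner.FnApiRunner.NUM_FUSED_STAGES_COUNTER))
#   num_stages = query_result['counters'][0].result
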
class FusionTest(unittest.TestCase):
  @staticmethod
  def fused_stages(p):
    return p.result.monitoring_metrics().query(
        metrics.MetricsFilter().with_name(
            fn_runner.FnApiRunner.NUM_FUSED_STAGES_COUNTER)
    )['counters'][0].result

  @staticmethod
  def create_animal_speed_input(p):
    return p | beam.Create([
        AnimalSpeed('Aardvark', 5),
        AnimalSpeed('Ant', 2),
        AnimalSpeed('Elephant', 35),
        AnimalSpeed('Zebra', 40)
    ], reshuffle=False)

  def test_loc_filter(self):
    with beam.Pipeline() as p:
      _ = (
          self.create_animal_speed_input(p)
          | transforms.DataframeTransform(lambda df: df[df.Speed > 10]))
    self.assertEqual(self.fused_stages(p), 1)

  def test_column_manipulation(self):
    def set_column(df, name, s):
      df[name] = s
      return df

    with beam.Pipeline() as p:
      _ = (
          self.create_animal_speed_input(p)
          | transforms.DataframeTransform(
              lambda df: set_column(df, 'x', df.Speed + df.Animal.str.len())))
    self.assertEqual(self.fused_stages(p), 1)


class TransformPartsTest(unittest.TestCase):
  def test_rebatch(self):
    with beam.Pipeline() as p:
      sA = pd.Series(range(1000))
      sB = sA * sA
      pcA = p | 'CreatePCollA' >> beam.Create([('k0', sA[::3]),
                                               ('k1', sA[1::3]),
                                               ('k2', sA[2::3])])
      pcB = p | 'CreatePCollB' >> beam.Create([('k0', sB[::3]),
                                               ('k1', sB[1::3]),
                                               ('k2', sB[2::3])])
      input = {'A': pcA, 'B': pcB} | beam.CoGroupByKey()
      output = input | beam.ParDo(
          transforms._ReBatch(target_size=sA.memory_usage()))

      # There should be exactly two elements, as the target size will be
      # hit when 2/3 of pcA and 2/3 of pcB is seen, but not before.
      assert_that(output | beam.combiners.Count.Globally(), equal_to([2]))

      # Sanity check that we got all the right values.
      assert_that(
          output | beam.Map(lambda x: x['A'].sum())
          | 'SumA' >> beam.CombineGlobally(sum),
          equal_to([sA.sum()]),
          label='CheckValuesA')
      assert_that(
          output | beam.Map(lambda x: x['B'].sum())
          | 'SumB' >> beam.CombineGlobally(sum),
          equal_to([sB.sum()]),
          label='CheckValuesB')


if __name__ == '__main__':
  unittest.main()