github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/frames_test.py (about) 1 # 2 # Licensed to the Apache Software Foundation (ASF) under one or more 3 # contributor license agreements. See the NOTICE file distributed with 4 # this work for additional information regarding copyright ownership. 5 # The ASF licenses this file to You under the Apache License, Version 2.0 6 # (the "License"); you may not use this file except in compliance with 7 # the License. You may obtain a copy of the License at 8 # 9 # http://www.apache.org/licenses/LICENSE-2.0 10 # 11 # Unless required by applicable law or agreed to in writing, software 12 # distributed under the License is distributed on an "AS IS" BASIS, 13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 # See the License for the specific language governing permissions and 15 # limitations under the License. 16 17 import re 18 import unittest 19 import warnings 20 21 import numpy as np 22 import pandas as pd 23 from parameterized import parameterized 24 25 import apache_beam as beam 26 from apache_beam.dataframe import expressions 27 from apache_beam.dataframe import frame_base 28 from apache_beam.dataframe import frames 29 from apache_beam.dataframe.convert import to_dataframe 30 from apache_beam.runners.interactive import interactive_beam as ib 31 from apache_beam.runners.interactive import interactive_environment as ie 32 from apache_beam.runners.interactive.interactive_runner import InteractiveRunner 33 from apache_beam.runners.interactive.testing.mock_env import isolated_env 34 35 # Get major, minor version 36 PD_VERSION = tuple(map(int, pd.__version__.split('.')[0:2])) 37 38 GROUPBY_DF = pd.DataFrame({ 39 'group': ['a' if i % 5 == 0 or i % 3 == 0 else 'b' for i in range(100)], 40 'foo': [None if i % 11 == 0 else i for i in range(100)], 41 'bar': [None if i % 7 == 0 else 99 - i for i in range(100)], 42 'baz': [None if i % 13 == 0 else i * 2 for i in range(100)], 43 'bool': [i % 17 == 0 for i in range(100)], 44 'str': [str(i) for i in range(100)], 45 }) 46 47 48 def _get_deferred_args(*args): 49 return [ 50 frame_base.DeferredFrame.wrap( 51 expressions.ConstantExpression(arg, arg[0:0])) for arg in args 52 ] 53 54 55 class _AbstractFrameTest(unittest.TestCase): 56 """Test sub-class with utilities for verifying DataFrame operations.""" 57 def _run_error_test( 58 self, func, *args, construction_time=True, distributed=True): 59 """Verify that func(*args) raises the same exception in pandas and in Beam. 60 61 Note that by default this only checks for exceptions that the Beam DataFrame 62 API raises during expression generation (i.e. construction time). 63 Exceptions raised while the pipeline is executing are less helpful, but 64 are sometimes unavoidable (e.g. data validation exceptions), to check for 65 these exceptions use construction_time=False.""" 66 deferred_args = _get_deferred_args(*args) 67 68 # Get expected error 69 try: 70 expected = func(*args) 71 except Exception as e: 72 expected_error = e 73 else: 74 raise AssertionError( 75 "Expected an error, but executing with pandas successfully " 76 f"returned:\n{expected}") 77 78 # Get actual error 79 if construction_time: 80 try: 81 _ = func(*deferred_args)._expr 82 except Exception as e: 83 actual = e 84 else: 85 raise AssertionError( 86 f"Expected an error:\n{expected_error}\nbut Beam successfully " 87 f"generated an expression.") 88 else: # not construction_time 89 # Check for an error raised during pipeline execution 90 expr = func(*deferred_args)._expr 91 session_type = ( 92 expressions.PartitioningSession 93 if distributed else expressions.Session) 94 try: 95 result = session_type({}).evaluate(expr) 96 except Exception as e: 97 actual = e 98 else: 99 raise AssertionError( 100 f"Expected an error:\n{expected_error}\nbut Beam successfully " 101 f"Computed the result:\n{result}.") 102 103 # Verify 104 if (not isinstance(actual, type(expected_error)) or 105 str(expected_error) not in str(actual)): 106 raise AssertionError( 107 f'Expected {expected_error!r} to be raised, but got {actual!r}' 108 ) from actual 109 110 def _run_inplace_test(self, func, arg, **kwargs): 111 """Verify an inplace operation performed by func. 112 113 Checks that func performs the same inplace operation on arg, in pandas and 114 in Beam.""" 115 def wrapper(df): 116 df = df.copy() 117 func(df) 118 return df 119 120 self._run_test(wrapper, arg, **kwargs) 121 122 def _run_test( 123 self, 124 func, 125 *args, 126 distributed=True, 127 nonparallel=False, 128 check_proxy=True, 129 lenient_dtype_check=False): 130 """Verify that func(*args) produces the same result in pandas and in Beam. 131 132 Args: 133 distributed (bool): Whether or not to use PartitioningSession to 134 simulate parallel execution. 135 nonparallel (bool): Whether or not this function contains a 136 non-parallelizable operation. If True, the expression will be 137 generated twice, once outside of an allow_non_parallel_operations 138 block (to verify NonParallelOperation is raised), and again inside 139 of an allow_non_parallel_operations block to actually generate an 140 expression to verify. 141 check_proxy (bool): Whether or not to check that the proxy of the 142 generated expression matches the actual result, defaults to True. 143 This option should NOT be set to False in tests added for new 144 operations if at all possible. Instead make sure the new operation 145 produces the correct proxy. This flag only exists as an escape hatch 146 until existing failures can be addressed 147 (https://github.com/apache/beam/issues/20926). 148 lenient_dtype_check (bool): Whether or not to check that numeric columns 149 are still numeric between actual and proxy. i.e. verify that they 150 are at least int64 or float64, and not necessarily have the exact 151 same dtype. This may need to be set to True for some non-deferred 152 operations, where the dtype of the values in the proxy are not known 153 ahead of time, causing int64 to float64 coercion issues. 154 """ 155 # Compute expected value 156 expected = func(*args) 157 158 # Compute actual value 159 deferred_args = _get_deferred_args(*args) 160 if nonparallel: 161 # First run outside a nonparallel block to confirm this raises as expected 162 with self.assertRaises(expressions.NonParallelOperation) as raised: 163 func(*deferred_args) 164 165 if raised.exception.msg.startswith( 166 "Encountered non-parallelizable form of"): 167 raise AssertionError( 168 "Default NonParallelOperation raised, please specify a reason in " 169 "the Singleton() partitioning requirement for this operation." 170 ) from raised.exception 171 172 # Re-run in an allow non parallel block to get an expression to verify 173 with beam.dataframe.allow_non_parallel_operations(): 174 expr = func(*deferred_args)._expr 175 else: 176 expr = func(*deferred_args)._expr 177 178 # Compute the result of the generated expression 179 session_type = ( 180 expressions.PartitioningSession if distributed else expressions.Session) 181 182 actual = session_type({}).evaluate(expr) 183 184 # Verify 185 if isinstance(expected, pd.core.generic.NDFrame): 186 if distributed: 187 if expected.index.is_unique: 188 expected = expected.sort_index() 189 actual = actual.sort_index() 190 else: 191 expected = expected.sort_values(list(expected.columns)) 192 actual = actual.sort_values(list(actual.columns)) 193 if isinstance(expected, pd.Series): 194 if lenient_dtype_check: 195 pd.testing.assert_series_equal( 196 expected.astype('Float64'), actual.astype('Float64')) 197 else: 198 pd.testing.assert_series_equal(expected, actual) 199 elif isinstance(expected, pd.DataFrame): 200 if lenient_dtype_check: 201 pd.testing.assert_frame_equal( 202 expected.astype('Float64'), actual.astype('Float64')) 203 else: 204 pd.testing.assert_frame_equal(expected, actual) 205 else: 206 raise ValueError( 207 f"Expected value is a {type(expected)}," 208 "not a Series or DataFrame.") 209 210 else: 211 # Expectation is not a pandas object 212 if isinstance(expected, float): 213 if np.isnan(expected): 214 cmp = np.isnan 215 else: 216 cmp = lambda x: np.isclose(expected, x) 217 else: 218 cmp = lambda x: x == expected 219 self.assertTrue( 220 cmp(actual), 'Expected:\n\n%r\n\nActual:\n\n%r' % (expected, actual)) 221 222 if check_proxy: 223 # Verify that the actual result agrees with the proxy 224 proxy = expr.proxy() 225 226 if type(actual) in (np.float32, np.float64): 227 self.assertTrue(type(actual) == type(proxy) or np.isnan(proxy)) 228 else: 229 self.assertEqual(type(actual), type(proxy)) 230 231 if isinstance(expected, pd.core.generic.NDFrame): 232 if isinstance(expected, pd.Series): 233 if lenient_dtype_check: 234 self.assertEqual( 235 actual.astype('Float64').dtype, proxy.astype('Float64').dtype) 236 else: 237 self.assertEqual(actual.dtype, proxy.dtype) 238 self.assertEqual(actual.name, proxy.name) 239 elif isinstance(expected, pd.DataFrame): 240 if lenient_dtype_check: 241 pd.testing.assert_series_equal( 242 actual.astype('Float64').dtypes, proxy.astype('Float64').dtypes) 243 else: 244 pd.testing.assert_series_equal(actual.dtypes, proxy.dtypes) 245 246 else: 247 raise ValueError( 248 f"Expected value is a {type(expected)}," 249 "not a Series or DataFrame.") 250 251 self.assertEqual(actual.index.names, proxy.index.names) 252 253 for i in range(actual.index.nlevels): 254 if lenient_dtype_check: 255 self.assertEqual( 256 actual.astype('Float64').index.get_level_values(i).dtype, 257 proxy.astype('Float64').index.get_level_values(i).dtype) 258 else: 259 self.assertEqual( 260 actual.index.get_level_values(i).dtype, 261 proxy.index.get_level_values(i).dtype) 262 263 264 class DeferredFrameTest(_AbstractFrameTest): 265 """Miscellaneous tessts for DataFrame operations.""" 266 def test_series_arithmetic(self): 267 a = pd.Series([1, 2, 3]) 268 b = pd.Series([100, 200, 300]) 269 270 self._run_test(lambda a, b: a - 2 * b, a, b) 271 self._run_test(lambda a, b: a.subtract(2).multiply(b).divide(a), a, b) 272 273 def test_dataframe_arithmetic(self): 274 df = pd.DataFrame({'a': [1, 2, 3], 'b': [100, 200, 300]}) 275 df2 = pd.DataFrame({'a': [3000, 1000, 2000], 'b': [7, 11, 13]}) 276 277 self._run_test(lambda df, df2: df - 2 * df2, df, df2) 278 self._run_test( 279 lambda df, df2: df.subtract(2).multiply(df2).divide(df), df, df2) 280 281 @unittest.skipIf(PD_VERSION < (1, 3), "dropna=False is new in pandas 1.3") 282 def test_value_counts_dropna_false(self): 283 df = pd.DataFrame({ 284 'first_name': ['John', 'Anne', 'John', 'Beth'], 285 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise'] 286 }) 287 # TODO(https://github.com/apache/beam/issues/21014): Remove the 288 # assertRaises this when the underlying bug in 289 # https://github.com/pandas-dev/pandas/issues/36470 is fixed. 290 with self.assertRaises(NotImplementedError): 291 self._run_test(lambda df: df.value_counts(dropna=False), df) 292 293 def test_get_column(self): 294 df = pd.DataFrame({ 295 'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'], 296 'Speed': [380., 370., 24., 26.] 297 }) 298 self._run_test(lambda df: df['Animal'], df) 299 self._run_test(lambda df: df.Speed, df) 300 self._run_test(lambda df: df.get('Animal'), df) 301 self._run_test(lambda df: df.get('FOO', df.Animal), df) 302 303 def test_series_xs(self): 304 # pandas doctests only verify DataFrame.xs, here we verify Series.xs as well 305 d = { 306 'num_legs': [4, 4, 2, 2], 307 'num_wings': [0, 0, 2, 2], 308 'class': ['mammal', 'mammal', 'mammal', 'bird'], 309 'animal': ['cat', 'dog', 'bat', 'penguin'], 310 'locomotion': ['walks', 'walks', 'flies', 'walks'] 311 } 312 df = pd.DataFrame(data=d) 313 df = df.set_index(['class', 'animal', 'locomotion']) 314 315 self._run_test(lambda df: df.num_legs.xs('mammal'), df) 316 self._run_test(lambda df: df.num_legs.xs(('mammal', 'dog')), df) 317 self._run_test(lambda df: df.num_legs.xs('cat', level=1), df) 318 self._run_test( 319 lambda df: df.num_legs.xs(('bird', 'walks'), level=[0, 'locomotion']), 320 df) 321 322 def test_dataframe_xs(self): 323 # Test cases reported in BEAM-13421 324 df = pd.DataFrame( 325 np.array([ 326 ['state', 'day1', 12], 327 ['state', 'day1', 1], 328 ['state', 'day2', 14], 329 ['county', 'day1', 9], 330 ]), 331 columns=['provider', 'time', 'value']) 332 333 self._run_test(lambda df: df.xs('state'), df.set_index(['provider'])) 334 self._run_test( 335 lambda df: df.xs('state'), df.set_index(['provider', 'time'])) 336 337 def test_set_column(self): 338 def new_column(df): 339 df['NewCol'] = df['Speed'] 340 341 df = pd.DataFrame({ 342 'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'], 343 'Speed': [380., 370., 24., 26.] 344 }) 345 self._run_inplace_test(new_column, df) 346 347 def test_set_column_from_index(self): 348 def new_column(df): 349 df['NewCol'] = df.index 350 351 df = pd.DataFrame({ 352 'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'], 353 'Speed': [380., 370., 24., 26.] 354 }) 355 self._run_inplace_test(new_column, df) 356 357 def test_tz_localize_ambiguous_series(self): 358 # This replicates a tz_localize doctest: 359 # s.tz_localize('CET', ambiguous=np.array([True, True, False])) 360 # But using a DeferredSeries instead of a np array 361 362 s = pd.Series( 363 range(3), 364 index=pd.DatetimeIndex([ 365 '2018-10-28 01:20:00', '2018-10-28 02:36:00', '2018-10-28 03:46:00' 366 ])) 367 ambiguous = pd.Series([True, True, False], index=s.index) 368 369 self._run_test( 370 lambda s, 371 ambiguous: s.tz_localize('CET', ambiguous=ambiguous), 372 s, 373 ambiguous) 374 375 def test_tz_convert(self): 376 # This replicates a tz_localize doctest: 377 # s.tz_localize('CET', ambiguous=np.array([True, True, False])) 378 # But using a DeferredSeries instead of a np array 379 380 s = pd.Series( 381 range(3), 382 index=pd.DatetimeIndex([ 383 '2018-10-27 01:20:00', '2018-10-27 02:36:00', '2018-10-27 03:46:00' 384 ], 385 tz='Europe/Berlin')) 386 387 self._run_test(lambda s: s.tz_convert('America/Los_Angeles'), s) 388 389 def test_sort_index_columns(self): 390 df = pd.DataFrame({ 391 'c': range(10), 392 'a': range(10), 393 'b': range(10), 394 np.nan: range(10), 395 }) 396 397 self._run_test(lambda df: df.sort_index(axis=1), df) 398 self._run_test(lambda df: df.sort_index(axis=1, ascending=False), df) 399 self._run_test(lambda df: df.sort_index(axis=1, na_position='first'), df) 400 401 def test_where_callable_args(self): 402 df = pd.DataFrame( 403 np.arange(10, dtype=np.int64).reshape(-1, 2), columns=['A', 'B']) 404 405 self._run_test( 406 lambda df: df.where(lambda df: df % 2 == 0, lambda df: df * 10), df) 407 408 def test_where_concrete_args(self): 409 df = pd.DataFrame( 410 np.arange(10, dtype=np.int64).reshape(-1, 2), columns=['A', 'B']) 411 412 self._run_test( 413 lambda df: df.where( 414 df % 2 == 0, pd.Series({ 415 'A': 123, 'B': 456 416 }), axis=1), 417 df) 418 419 def test_combine_dataframe(self): 420 df = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) 421 df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) 422 take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 423 self._run_test( 424 lambda df, 425 df2: df.combine(df2, take_smaller), 426 df, 427 df2, 428 nonparallel=True) 429 430 def test_combine_dataframe_fill(self): 431 df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) 432 df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) 433 take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 434 self._run_test( 435 lambda df1, 436 df2: df1.combine(df2, take_smaller, fill_value=-5), 437 df1, 438 df2, 439 nonparallel=True) 440 441 def test_combine_Series(self): 442 s1 = pd.Series({'falcon': 330.0, 'eagle': 160.0}) 443 s2 = pd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0}) 444 self._run_test( 445 lambda s1, 446 s2: s1.combine(s2, max), 447 s1, 448 s2, 449 nonparallel=True, 450 check_proxy=False) 451 452 def test_combine_first_dataframe(self): 453 df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]}) 454 df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) 455 456 self._run_test(lambda df1, df2: df1.combine_first(df2), df1, df2) 457 458 def test_combine_first_series(self): 459 s1 = pd.Series([1, np.nan]) 460 s2 = pd.Series([3, 4]) 461 462 self._run_test(lambda s1, s2: s1.combine_first(s2), s1, s2) 463 464 def test_add_prefix(self): 465 df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) 466 s = pd.Series([1, 2, 3, 4]) 467 468 self._run_test(lambda df: df.add_prefix('col_'), df) 469 self._run_test(lambda s: s.add_prefix('col_'), s) 470 471 def test_add_suffix(self): 472 df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) 473 s = pd.Series([1, 2, 3, 4]) 474 475 self._run_test(lambda df: df.add_suffix('_col'), df) 476 self._run_test(lambda s: s.add_prefix('_col'), s) 477 478 def test_set_index(self): 479 df = pd.DataFrame({ 480 # [19, 18, ..] 481 'index1': reversed(range(20)), # [15, 16, .., 0, 1, .., 13, 14] 482 'index2': np.roll(range(20), 5), # ['', 'a', 'bb', ...] 483 'values': [chr(ord('a') + i) * i for i in range(20)], 484 }) 485 486 self._run_test(lambda df: df.set_index(['index1', 'index2']), df) 487 self._run_test(lambda df: df.set_index(['index1', 'index2'], drop=True), df) 488 self._run_test(lambda df: df.set_index('values'), df) 489 490 self._run_error_test(lambda df: df.set_index('bad'), df) 491 self._run_error_test( 492 lambda df: df.set_index(['index2', 'bad', 'really_bad']), df) 493 494 def test_set_axis(self): 495 df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=['X', 'Y', 'Z']) 496 497 self._run_test(lambda df: df.set_axis(['I', 'II'], axis='columns'), df) 498 self._run_test(lambda df: df.set_axis([0, 1], axis=1), df) 499 self._run_inplace_test( 500 lambda df: df.set_axis(['i', 'ii'], axis='columns'), df) 501 with self.assertRaises(NotImplementedError): 502 self._run_test(lambda df: df.set_axis(['a', 'b', 'c'], axis='index'), df) 503 self._run_test(lambda df: df.set_axis([0, 1, 2], axis=0), df) 504 505 def test_series_set_axis(self): 506 s = pd.Series(list(range(3)), index=['X', 'Y', 'Z']) 507 with self.assertRaises(NotImplementedError): 508 self._run_test(lambda s: s.set_axis(['a', 'b', 'c']), s) 509 self._run_test(lambda s: s.set_axis([1, 2, 3]), s) 510 511 def test_series_drop_ignore_errors(self): 512 midx = pd.MultiIndex( 513 levels=[['lama', 'cow', 'falcon'], ['speed', 'weight', 'length']], 514 codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]]) 515 s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx) 516 517 # drop() requires singleton partitioning unless errors are ignored 518 # Add some additional tests here to make sure the implementation works in 519 # non-singleton partitioning. 520 self._run_test(lambda s: s.drop('lama', level=0, errors='ignore'), s) 521 self._run_test(lambda s: s.drop(('cow', 'speed'), errors='ignore'), s) 522 self._run_test(lambda s: s.drop('falcon', level=0, errors='ignore'), s) 523 524 def test_dataframe_drop_ignore_errors(self): 525 midx = pd.MultiIndex( 526 levels=[['lama', 'cow', 'falcon'], ['speed', 'weight', 'length']], 527 codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]]) 528 df = pd.DataFrame( 529 index=midx, 530 columns=['big', 'small'], 531 data=[[45, 30], [200, 100], [1.5, 1], [30, 20], [250, 150], [1.5, 0.8], 532 [320, 250], [1, 0.8], [0.3, 0.2]]) 533 534 # drop() requires singleton partitioning unless errors are ignored 535 # Add some additional tests here to make sure the implementation works in 536 # non-singleton partitioning. 537 self._run_test( 538 lambda df: df.drop(index='lama', level=0, errors='ignore'), df) 539 self._run_test( 540 lambda df: df.drop(index=('cow', 'speed'), errors='ignore'), df) 541 self._run_test( 542 lambda df: df.drop(index='falcon', level=0, errors='ignore'), df) 543 self._run_test( 544 lambda df: df.drop(index='cow', columns='small', errors='ignore'), df) 545 546 def test_merge(self): 547 # This is from the pandas doctests, but fails due to re-indexing being 548 # order-sensitive. 549 df1 = pd.DataFrame({ 550 'lkey': ['foo', 'bar', 'baz', 'foo'], 'value': [1, 2, 3, 5] 551 }) 552 df2 = pd.DataFrame({ 553 'rkey': ['foo', 'bar', 'baz', 'foo'], 'value': [5, 6, 7, 8] 554 }) 555 self._run_test( 556 lambda df1, 557 df2: df1.merge(df2, left_on='lkey', right_on='rkey').rename( 558 index=lambda x: '*'), 559 df1, 560 df2, 561 nonparallel=True, 562 check_proxy=False) 563 self._run_test( 564 lambda df1, 565 df2: df1.merge( 566 df2, left_on='lkey', right_on='rkey', suffixes=('_left', '_right')). 567 rename(index=lambda x: '*'), 568 df1, 569 df2, 570 nonparallel=True, 571 check_proxy=False) 572 573 def test_merge_left_join(self): 574 # This is from the pandas doctests, but fails due to re-indexing being 575 # order-sensitive. 576 df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) 577 df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) 578 579 self._run_test( 580 lambda df1, 581 df2: df1.merge(df2, how='left', on='a').rename(index=lambda x: '*'), 582 df1, 583 df2, 584 nonparallel=True, 585 check_proxy=False) 586 587 def test_merge_on_index(self): 588 # This is from the pandas doctests, but fails due to re-indexing being 589 # order-sensitive. 590 df1 = pd.DataFrame({ 591 'lkey': ['foo', 'bar', 'baz', 'foo'], 'value': [1, 2, 3, 5] 592 }).set_index('lkey') 593 df2 = pd.DataFrame({ 594 'rkey': ['foo', 'bar', 'baz', 'foo'], 'value': [5, 6, 7, 8] 595 }).set_index('rkey') 596 597 self._run_test( 598 lambda df1, 599 df2: df1.merge(df2, left_index=True, right_index=True), 600 df1, 601 df2, 602 check_proxy=False) 603 604 def test_merge_same_key(self): 605 df1 = pd.DataFrame({ 606 'key': ['foo', 'bar', 'baz', 'foo'], 'value': [1, 2, 3, 5] 607 }) 608 df2 = pd.DataFrame({ 609 'key': ['foo', 'bar', 'baz', 'foo'], 'value': [5, 6, 7, 8] 610 }) 611 self._run_test( 612 lambda df1, 613 df2: df1.merge(df2, on='key').rename(index=lambda x: '*'), 614 df1, 615 df2, 616 nonparallel=True, 617 check_proxy=False) 618 self._run_test( 619 lambda df1, 620 df2: df1.merge(df2, on='key', suffixes=('_left', '_right')).rename( 621 index=lambda x: '*'), 622 df1, 623 df2, 624 nonparallel=True, 625 check_proxy=False) 626 627 def test_merge_same_key_doctest(self): 628 df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) 629 df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) 630 631 self._run_test( 632 lambda df1, 633 df2: df1.merge(df2, how='left', on='a').rename(index=lambda x: '*'), 634 df1, 635 df2, 636 nonparallel=True, 637 check_proxy=False) 638 # Test without specifying 'on' 639 self._run_test( 640 lambda df1, 641 df2: df1.merge(df2, how='left').rename(index=lambda x: '*'), 642 df1, 643 df2, 644 nonparallel=True, 645 check_proxy=False) 646 647 def test_merge_same_key_suffix_collision(self): 648 df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2], 'a_lsuffix': [5, 6]}) 649 df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4], 'a_rsuffix': [7, 8]}) 650 651 self._run_test( 652 lambda df1, 653 df2: df1.merge( 654 df2, how='left', on='a', suffixes=('_lsuffix', '_rsuffix')).rename( 655 index=lambda x: '*'), 656 df1, 657 df2, 658 nonparallel=True, 659 check_proxy=False) 660 # Test without specifying 'on' 661 self._run_test( 662 lambda df1, 663 df2: df1.merge(df2, how='left', suffixes=('_lsuffix', '_rsuffix')). 664 rename(index=lambda x: '*'), 665 df1, 666 df2, 667 nonparallel=True, 668 check_proxy=False) 669 670 def test_swaplevel(self): 671 df = pd.DataFrame( 672 {"Grade": ["A", "B", "A", "C"]}, 673 index=[ 674 ["Final exam", "Final exam", "Coursework", "Coursework"], 675 ["History", "Geography", "History", "Geography"], 676 ["January", "February", "March", "April"], 677 ]) 678 self._run_test(lambda df: df.swaplevel(), df) 679 680 def test_value_counts_with_nans(self): 681 # similar to doctests that verify value_counts, but include nan values to 682 # make sure we handle them correctly. 683 df = pd.DataFrame({ 684 'num_legs': [2, 4, 4, 6, np.nan, np.nan], 685 'num_wings': [2, 0, 0, 0, np.nan, 2] 686 }, 687 index=['falcon', 'dog', 'cat', 'ant', 'car', 'plane']) 688 689 self._run_test(lambda df: df.value_counts(), df) 690 self._run_test(lambda df: df.value_counts(normalize=True), df) 691 692 if PD_VERSION >= (1, 3): 693 # dropna=False is new in pandas 1.3 694 # TODO(https://github.com/apache/beam/issues/21014): Remove the 695 # assertRaises this when the underlying bug in 696 # https://github.com/pandas-dev/pandas/issues/36470 is fixed. 697 with self.assertRaises(NotImplementedError): 698 self._run_test(lambda df: df.value_counts(dropna=False), df) 699 700 # Test the defaults. 701 self._run_test(lambda df: df.num_wings.value_counts(), df) 702 self._run_test(lambda df: df.num_wings.value_counts(normalize=True), df) 703 self._run_test(lambda df: df.num_wings.value_counts(dropna=False), df) 704 705 # Test the combination interactions. 706 for normalize in (True, False): 707 for dropna in (True, False): 708 self._run_test( 709 lambda df, 710 dropna=dropna, 711 normalize=normalize: df.num_wings.value_counts( 712 dropna=dropna, normalize=normalize), 713 df) 714 715 def test_value_counts_does_not_support_sort(self): 716 df = pd.DataFrame({ 717 'num_legs': [2, 4, 4, 6, np.nan, np.nan], 718 'num_wings': [2, 0, 0, 0, np.nan, 2] 719 }, 720 index=['falcon', 'dog', 'cat', 'ant', 'car', 'plane']) 721 722 with self.assertRaisesRegex(frame_base.WontImplementError, 723 r"value_counts\(sort\=True\)"): 724 self._run_test(lambda df: df.value_counts(sort=True), df) 725 726 with self.assertRaisesRegex(frame_base.WontImplementError, 727 r"value_counts\(sort\=True\)"): 728 self._run_test(lambda df: df.num_wings.value_counts(sort=True), df) 729 730 def test_series_getitem(self): 731 s = pd.Series([x**2 for x in range(10)]) 732 self._run_test(lambda s: s[...], s) 733 self._run_test(lambda s: s[:], s) 734 self._run_test(lambda s: s[s < 10], s) 735 self._run_test(lambda s: s[lambda s: s < 10], s) 736 737 s.index = s.index.map(float) 738 self._run_test(lambda s: s[1.5:6], s) 739 740 def test_series_truncate(self): 741 s = pd.Series(['a', 'b', 'c', 'd', 'e', 'f']) 742 self._run_test(lambda s: s.truncate(before=1, after=3), s) 743 744 def test_dataframe_truncate(self): 745 df = pd.DataFrame({ 746 'C': list('abcde'), 'B': list('fghij'), 'A': list('klmno') 747 }, 748 index=[1, 2, 3, 4, 5]) 749 self._run_test(lambda df: df.truncate(before=1, after=3), df) 750 self._run_test(lambda df: df.truncate(before='A', after='B', axis=1), df) 751 self._run_test(lambda df: df['A'].truncate(before=2, after=4), df) 752 753 @parameterized.expand([ 754 (pd.Series(range(10)), ), # unique 755 (pd.Series(list(range(100)) + [0]), ), # non-unique int 756 (pd.Series(list(range(100)) + [0]) / 100, ), # non-unique flt 757 (pd.Series(['a', 'b', 'c', 'd']), ), # unique str 758 (pd.Series(['a', 'b', 'a', 'c', 'd']), ), # non-unique str 759 ]) 760 def test_series_is_unique(self, series): 761 self._run_test(lambda s: s.is_unique, series) 762 763 @parameterized.expand([ 764 (pd.Series(range(10)), ), # False 765 (pd.Series([1, 2, np.nan, 3, np.nan]), ), # True 766 (pd.Series(['a', 'b', 'c', 'd', 'e']), ), # False 767 (pd.Series(['a', 'b', None, 'c', None]), ), # True 768 ]) 769 def test_series_hasnans(self, series): 770 self._run_test(lambda s: s.hasnans, series) 771 772 def test_dataframe_getitem(self): 773 df = pd.DataFrame({'A': [x**2 for x in range(6)], 'B': list('abcdef')}) 774 self._run_test(lambda df: df['A'], df) 775 self._run_test(lambda df: df[['A', 'B']], df) 776 777 self._run_test(lambda df: df[:], df) 778 self._run_test(lambda df: df[df.A < 10], df) 779 780 df.index = df.index.map(float) 781 self._run_test(lambda df: df[1.5:4], df) 782 783 def test_loc(self): 784 dates = pd.date_range('1/1/2000', periods=8) 785 # TODO(https://github.com/apache/beam/issues/20765): 786 # We do not preserve the freq attribute on a DateTime index 787 dates.freq = None 788 df = pd.DataFrame( 789 np.arange(32).reshape((8, 4)), 790 index=dates, 791 columns=['A', 'B', 'C', 'D']) 792 self._run_test(lambda df: df.loc[:], df) 793 self._run_test(lambda df: df.loc[:, 'A'], df) 794 self._run_test(lambda df: df.loc[:dates[3]], df) 795 self._run_test(lambda df: df.loc[df.A > 10], df) 796 self._run_test(lambda df: df.loc[lambda df: df.A > 10], df) 797 self._run_test(lambda df: df.C.loc[df.A > 10], df) 798 self._run_test(lambda df, s: df.loc[s.loc[1:3]], df, pd.Series(dates)) 799 800 def test_append_sort(self): 801 # yapf: disable 802 df1 = pd.DataFrame({'int': [1, 2, 3], 'str': ['a', 'b', 'c']}, 803 columns=['int', 'str'], 804 index=[1, 3, 5]) 805 df2 = pd.DataFrame({'int': [4, 5, 6], 'str': ['d', 'e', 'f']}, 806 columns=['str', 'int'], 807 index=[2, 4, 6]) 808 # yapf: enable 809 810 self._run_test(lambda df1, df2: df1.append(df2, sort=True), df1, df2) 811 self._run_test(lambda df1, df2: df1.append(df2, sort=False), df1, df2) 812 self._run_test(lambda df1, df2: df2.append(df1, sort=True), df1, df2) 813 self._run_test(lambda df1, df2: df2.append(df1, sort=False), df1, df2) 814 815 def test_smallest_largest(self): 816 df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [2, 3, 5, 7]}) 817 self._run_test(lambda df: df.nlargest(1, 'A', keep='all'), df) 818 self._run_test(lambda df: df.nsmallest(3, 'A', keep='all'), df) 819 self._run_test(lambda df: df.nlargest(3, ['A', 'B'], keep='all'), df) 820 821 def test_series_cov_corr(self): 822 for s in [pd.Series([1, 2, 3]), 823 pd.Series(range(100)), 824 pd.Series([x**3 for x in range(-50, 50)])]: 825 self._run_test(lambda s: s.std(), s) 826 self._run_test(lambda s: s.var(), s) 827 self._run_test(lambda s: s.corr(s), s) 828 self._run_test(lambda s: s.corr(s + 1), s) 829 self._run_test(lambda s: s.corr(s * s), s) 830 self._run_test(lambda s: s.cov(s * s), s) 831 self._run_test(lambda s: s.skew(), s) 832 self._run_test(lambda s: s.kurtosis(), s) 833 self._run_test(lambda s: s.kurt(), s) 834 835 def test_dataframe_cov_corr(self): 836 df = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c']) 837 df.loc[df.index[:5], 'a'] = np.nan 838 df.loc[df.index[5:10], 'b'] = np.nan 839 self._run_test(lambda df: df.corr(), df) 840 self._run_test(lambda df: df.cov(), df) 841 self._run_test(lambda df: df.corr(min_periods=12), df) 842 self._run_test(lambda df: df.cov(min_periods=12), df) 843 self._run_test(lambda df: df.corrwith(df.a), df) 844 self._run_test(lambda df: df[['a', 'b']].corrwith(df[['b', 'c']]), df) 845 846 df2 = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c']) 847 self._run_test( 848 lambda df, df2: df.corrwith(df2, axis=1), df, df2, check_proxy=False) 849 850 def test_corrwith_bad_axis(self): 851 df = pd.DataFrame({'a': range(3), 'b': range(3, 6), 'c': range(6, 9)}) 852 self._run_error_test(lambda df: df.corrwith(df.a, axis=2), df) 853 self._run_error_test(lambda df: df.corrwith(df, axis=5), df) 854 855 @unittest.skipIf(PD_VERSION < (1, 2), "na_action added in pandas 1.2.0") 856 def test_applymap_na_action(self): 857 # Replicates a doctest for na_action which is incompatible with 858 # doctest framework 859 df = pd.DataFrame([[pd.NA, 2.12], [3.356, 4.567]]) 860 self._run_test( 861 lambda df: df.applymap(lambda x: len(str(x)), na_action='ignore'), 862 df, 863 # TODO: generate proxy using naive type inference on fn 864 check_proxy=False) 865 866 def test_dataframe_eval_query(self): 867 df = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c']) 868 self._run_test(lambda df: df.eval('foo = a + b - c'), df) 869 self._run_test(lambda df: df.query('a > b + c'), df) 870 871 self._run_inplace_test(lambda df: df.eval('foo = a + b - c'), df) 872 873 # Verify that attempting to access locals raises a useful error 874 deferred_df = frame_base.DeferredFrame.wrap( 875 expressions.ConstantExpression(df, df[0:0])) 876 self.assertRaises( 877 NotImplementedError, lambda: deferred_df.eval('foo = a + @b - c')) 878 self.assertRaises( 879 NotImplementedError, lambda: deferred_df.query('a > @b + c')) 880 881 def test_index_name_assignment(self): 882 df = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) 883 df = df.set_index(['a', 'b'], drop=False) 884 885 def change_index_names(df): 886 df.index.names = ['A', None] 887 888 self._run_inplace_test(change_index_names, df) 889 890 def test_quantile(self): 891 df = pd.DataFrame( 892 np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), columns=['a', 'b']) 893 894 self._run_test( 895 lambda df: df.quantile(0.1, axis='columns'), df, check_proxy=False) 896 897 self._run_test( 898 lambda df: df.quantile(0.1, axis='columns'), df, check_proxy=False) 899 with self.assertRaisesRegex(frame_base.WontImplementError, 900 r"df\.quantile\(q=0\.1, axis='columns'\)"): 901 self._run_test(lambda df: df.quantile([0.1, 0.5], axis='columns'), df) 902 903 def test_dataframe_melt(self): 904 905 df = pd.DataFrame({ 906 'A': { 907 0: 'a', 1: 'b', 2: 'c' 908 }, 909 'B': { 910 0: 1, 1: 3, 2: 5 911 }, 912 'C': { 913 0: 2, 1: 4, 2: 6 914 } 915 }) 916 917 self._run_test( 918 lambda df: df.melt(id_vars=['A'], value_vars=['B'], ignore_index=False), 919 df) 920 self._run_test( 921 lambda df: df.melt( 922 id_vars=['A'], value_vars=['B', 'C'], ignore_index=False), 923 df) 924 self._run_test( 925 lambda df: df.melt( 926 id_vars=['A'], 927 value_vars=['B'], 928 var_name='myVarname', 929 value_name='myValname', 930 ignore_index=False), 931 df) 932 self._run_test( 933 lambda df: df.melt( 934 id_vars=['A'], value_vars=['B', 'C'], ignore_index=False), 935 df) 936 937 df.columns = [list('ABC'), list('DEF')] 938 self._run_test( 939 lambda df: df.melt( 940 col_level=0, id_vars=['A'], value_vars=['B'], ignore_index=False), 941 df) 942 self._run_test( 943 lambda df: df.melt( 944 id_vars=[('A', 'D')], value_vars=[('B', 'E')], ignore_index=False), 945 df) 946 947 def test_fillna_columns(self): 948 df = pd.DataFrame( 949 [[np.nan, 2, np.nan, 0], [3, 4, np.nan, 1], [np.nan, np.nan, np.nan, 5], 950 [np.nan, 3, np.nan, 4], [3, np.nan, np.nan, 4]], 951 columns=list('ABCD')) 952 953 self._run_test(lambda df: df.fillna(method='ffill', axis='columns'), df) 954 self._run_test( 955 lambda df: df.fillna(method='ffill', axis='columns', limit=1), df) 956 self._run_test( 957 lambda df: df.fillna(method='bfill', axis='columns', limit=1), df) 958 959 # Intended behavior is unclear here. See 960 # https://github.com/pandas-dev/pandas/issues/40989 961 # self._run_test(lambda df: df.fillna(axis='columns', value=100, 962 # limit=2), df) 963 964 def test_dataframe_fillna_dataframe_as_value(self): 965 df = pd.DataFrame([[np.nan, 2, np.nan, 0], [3, 4, np.nan, 1], 966 [np.nan, np.nan, np.nan, 5], [np.nan, 3, np.nan, 4]], 967 columns=list("ABCD")) 968 df2 = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCE")) 969 970 self._run_test(lambda df, df2: df.fillna(df2), df, df2) 971 972 def test_dataframe_fillna_series_as_value(self): 973 df = pd.DataFrame([[np.nan, 2, np.nan, 0], [3, 4, np.nan, 1], 974 [np.nan, np.nan, np.nan, 5], [np.nan, 3, np.nan, 4]], 975 columns=list("ABCD")) 976 s = pd.Series(range(4), index=list("ABCE")) 977 978 self._run_test(lambda df, s: df.fillna(s), df, s) 979 980 def test_series_fillna_series_as_value(self): 981 df = pd.DataFrame([[np.nan, 2, np.nan, 0], [3, 4, np.nan, 1], 982 [np.nan, np.nan, np.nan, 5], [np.nan, 3, np.nan, 4]], 983 columns=list("ABCD")) 984 df2 = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCE")) 985 986 self._run_test(lambda df, df2: df.A.fillna(df2.A), df, df2) 987 988 def test_append_verify_integrity(self): 989 df1 = pd.DataFrame({'A': range(10), 'B': range(10)}, index=range(10)) 990 df2 = pd.DataFrame({'A': range(10), 'B': range(10)}, index=range(9, 19)) 991 992 self._run_error_test( 993 lambda s1, 994 s2: s1.append(s2, verify_integrity=True), 995 df1['A'], 996 df2['A'], 997 construction_time=False) 998 self._run_error_test( 999 lambda df1, 1000 df2: df1.append(df2, verify_integrity=True), 1001 df1, 1002 df2, 1003 construction_time=False) 1004 1005 def test_categorical_groupby(self): 1006 df = pd.DataFrame({'A': np.arange(6), 'B': list('aabbca')}) 1007 df['B'] = df['B'].astype(pd.CategoricalDtype(list('cab'))) 1008 df = df.set_index('B') 1009 # TODO(BEAM-11190): These aggregations can be done in index partitions, but 1010 # it will require a little more complex logic 1011 self._run_test(lambda df: df.groupby(level=0).sum(), df, nonparallel=True) 1012 self._run_test(lambda df: df.groupby(level=0).mean(), df, nonparallel=True) 1013 1014 def test_astype_categorical(self): 1015 df = pd.DataFrame({'A': np.arange(6), 'B': list('aabbca')}) 1016 categorical_dtype = pd.CategoricalDtype(df.B.unique()) 1017 1018 self._run_test(lambda df: df.B.astype(categorical_dtype), df) 1019 1020 @unittest.skipIf( 1021 PD_VERSION < (1, 2), "DataFrame.unstack not supported in pandas <1.2.x") 1022 def test_astype_categorical_with_unstack(self): 1023 df = pd.DataFrame({ 1024 'index1': ['one', 'one', 'two', 'two'], 1025 'index2': ['a', 'b', 'a', 'b'], 1026 'data': np.arange(1.0, 5.0), 1027 }) 1028 1029 def with_categorical_index(df): 1030 df.index1 = df.index1.astype(pd.CategoricalDtype(['one', 'two'])) 1031 df.index2 = df.index2.astype(pd.CategoricalDtype(['a', 'b'])) 1032 df.set_index(['index1', 'index2'], drop=True) 1033 return df 1034 1035 self._run_test( 1036 lambda df: with_categorical_index(df).unstack(level=-1), 1037 df, 1038 check_proxy=False) 1039 1040 def test_dataframe_sum_nonnumeric_raises(self): 1041 # Attempting a numeric aggregation with the str column present should 1042 # raise, and suggest the numeric_only argument 1043 with self.assertRaisesRegex(frame_base.WontImplementError, 'numeric_only'): 1044 self._run_test(lambda df: df.sum(), GROUPBY_DF) 1045 1046 # numeric_only=True should work 1047 self._run_test(lambda df: df.sum(numeric_only=True), GROUPBY_DF) 1048 # projecting only numeric columns should too 1049 self._run_test(lambda df: df[['foo', 'bar']].sum(), GROUPBY_DF) 1050 1051 def test_insert(self): 1052 df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) 1053 1054 self._run_inplace_test(lambda df: df.insert(1, 'C', df.A * 2), df) 1055 self._run_inplace_test( 1056 lambda df: df.insert(0, 'foo', pd.Series([8], index=[1])), 1057 df, 1058 check_proxy=False) 1059 self._run_inplace_test(lambda df: df.insert(2, 'bar', value='q'), df) 1060 1061 def test_insert_does_not_support_list_value(self): 1062 df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) 1063 1064 with self.assertRaisesRegex(frame_base.WontImplementError, 1065 r"insert\(value=list\)"): 1066 self._run_inplace_test(lambda df: df.insert(1, 'C', [7, 8, 9]), df) 1067 1068 def test_drop_duplicates(self): 1069 df = pd.DataFrame({ 1070 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], 1071 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], 1072 'rating': [4, 4, 3.5, 15, 5] 1073 }) 1074 1075 self._run_test(lambda df: df.drop_duplicates(keep=False), df) 1076 self._run_test( 1077 lambda df: df.drop_duplicates(subset=['brand'], keep=False), df) 1078 self._run_test( 1079 lambda df: df.drop_duplicates(subset=['brand', 'style'], keep=False), 1080 df) 1081 1082 @parameterized.expand([ 1083 ( 1084 lambda base: base.from_dict({ 1085 'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd'] 1086 }), ), 1087 ( 1088 lambda base: base.from_dict({ 1089 'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd'] 1090 }, 1091 orient='index'), ), 1092 ( 1093 lambda base: base.from_records( 1094 np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')], 1095 dtype=[('col_1', 'i4'), ('col_2', 'U1')])), ), 1096 ]) 1097 def test_create_methods(self, func): 1098 expected = func(pd.DataFrame) 1099 1100 deferred_df = func(frames.DeferredDataFrame) 1101 actual = expressions.Session({}).evaluate(deferred_df._expr) 1102 1103 pd.testing.assert_frame_equal(actual, expected) 1104 1105 def test_replace(self): 1106 # verify a replace() doctest case that doesn't quite work in Beam as it uses 1107 # the default method='pad' 1108 df = pd.DataFrame({'A': ['bat', 'foo', 'bait'], 'B': ['abc', 'bar', 'xyz']}) 1109 1110 self._run_test( 1111 lambda df: df.replace( 1112 regex={ 1113 r'^ba.$': 'new', 'foo': 'xyz' 1114 }, method=None), 1115 df) 1116 1117 def test_sample_columns(self): 1118 df = pd.DataFrame({ 1119 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], 1120 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], 1121 'rating': [4, 4, 3.5, 15, 5] 1122 }) 1123 1124 self._run_test(lambda df: df.sample(axis=1, n=2, random_state=1), df) 1125 self._run_error_test(lambda df: df.sample(axis=1, n=10, random_state=2), df) 1126 self._run_test( 1127 lambda df: df.sample(axis=1, n=10, random_state=3, replace=True), df) 1128 1129 def test_cat(self): 1130 # Replicate the doctests from CategorigcalAccessor 1131 # These tests don't translate into pandas_doctests_test.py because it 1132 # tries to use astype("category") in Beam, which makes a non-deferred 1133 # column type. 1134 s = pd.Series(list("abbccc")).astype("category") 1135 1136 self._run_test(lambda s: s.cat.rename_categories(list("cba")), s) 1137 self._run_test(lambda s: s.cat.reorder_categories(list("cba")), s) 1138 self._run_test(lambda s: s.cat.add_categories(["d", "e"]), s) 1139 self._run_test(lambda s: s.cat.remove_categories(["a", "c"]), s) 1140 self._run_test(lambda s: s.cat.set_categories(list("abcde")), s) 1141 self._run_test(lambda s: s.cat.as_ordered(), s) 1142 self._run_test(lambda s: s.cat.as_unordered(), s) 1143 self._run_test(lambda s: s.cat.codes, s) 1144 1145 @parameterized.expand(frames.ELEMENTWISE_DATETIME_PROPERTIES) 1146 def test_dt_property(self, prop_name): 1147 # Generate a series with a lot of unique timestamps 1148 s = pd.Series( 1149 pd.date_range('1/1/2000', periods=100, freq='m') + 1150 pd.timedelta_range(start='0 days', end='70 days', periods=100)) 1151 self._run_test(lambda s: getattr(s.dt, prop_name), s) 1152 1153 @parameterized.expand([ 1154 ('month_name', {}), 1155 ('day_name', {}), 1156 ('normalize', {}), 1157 ( 1158 'strftime', 1159 { 1160 'date_format': '%B %d, %Y, %r' 1161 }, 1162 ), 1163 ('tz_convert', { 1164 'tz': 'Europe/Berlin' 1165 }), 1166 ]) 1167 def test_dt_method(self, op, kwargs): 1168 # Generate a series with a lot of unique timestamps 1169 s = pd.Series( 1170 pd.date_range( 1171 '1/1/2000', periods=100, freq='m', tz='America/Los_Angeles') + 1172 pd.timedelta_range(start='0 days', end='70 days', periods=100)) 1173 1174 self._run_test(lambda s: getattr(s.dt, op)(**kwargs), s) 1175 1176 def test_dt_tz_localize_ambiguous_series(self): 1177 # This replicates a dt.tz_localize doctest: 1178 # s.tz_localize('CET', ambiguous=np.array([True, True, False])) 1179 # But using a DeferredSeries instead of a np array 1180 1181 s = pd.to_datetime( 1182 pd.Series([ 1183 '2018-10-28 01:20:00', '2018-10-28 02:36:00', '2018-10-28 03:46:00' 1184 ])) 1185 ambiguous = pd.Series([True, True, False], index=s.index) 1186 1187 self._run_test( 1188 lambda s, 1189 ambiguous: s.dt.tz_localize('CET', ambiguous=ambiguous), 1190 s, 1191 ambiguous) 1192 1193 def test_dt_tz_localize_nonexistent(self): 1194 # This replicates dt.tz_localize doctests that exercise `nonexistent`. 1195 # However they specify ambiguous='NaT' because the default, 1196 # ambiguous='infer', is not supported. 1197 s = pd.to_datetime( 1198 pd.Series(['2015-03-29 02:30:00', '2015-03-29 03:30:00'])) 1199 1200 self._run_test( 1201 lambda s: s.dt.tz_localize( 1202 'Europe/Warsaw', ambiguous='NaT', nonexistent='shift_forward'), 1203 s) 1204 self._run_test( 1205 lambda s: s.dt.tz_localize( 1206 'Europe/Warsaw', ambiguous='NaT', nonexistent='shift_backward'), 1207 s) 1208 self._run_test( 1209 lambda s: s.dt.tz_localize( 1210 'Europe/Warsaw', ambiguous='NaT', nonexistent=pd.Timedelta('1H')), 1211 s) 1212 1213 def test_compare_series(self): 1214 s1 = pd.Series(["a", "b", "c", "d", "e"]) 1215 s2 = pd.Series(["a", "a", "c", "b", "e"]) 1216 1217 self._run_test(lambda s1, s2: s1.compare(s2), s1, s2) 1218 self._run_test(lambda s1, s2: s1.compare(s2, align_axis=0), s1, s2) 1219 self._run_test(lambda s1, s2: s1.compare(s2, keep_shape=True), s1, s2) 1220 self._run_test( 1221 lambda s1, s2: s1.compare(s2, keep_shape=True, keep_equal=True), s1, s2) 1222 1223 def test_compare_dataframe(self): 1224 df1 = pd.DataFrame( 1225 { 1226 "col1": ["a", "a", "b", "b", "a"], 1227 "col2": [1.0, 2.0, 3.0, np.nan, 5.0], 1228 "col3": [1.0, 2.0, 3.0, 4.0, 5.0] 1229 }, 1230 columns=["col1", "col2", "col3"], 1231 ) 1232 df2 = df1.copy() 1233 df2.loc[0, 'col1'] = 'c' 1234 df2.loc[2, 'col3'] = 4.0 1235 1236 # Skipped because keep_shape=False won't be implemented 1237 with self.assertRaisesRegex( 1238 frame_base.WontImplementError, 1239 r"compare\(align_axis\=1, keep_shape\=False\) is not allowed"): 1240 self._run_test(lambda df1, df2: df1.compare(df2), df1, df2) 1241 1242 self._run_test( 1243 lambda df1, 1244 df2: df1.compare(df2, align_axis=0), 1245 df1, 1246 df2, 1247 check_proxy=False) 1248 self._run_test(lambda df1, df2: df1.compare(df2, keep_shape=True), df1, df2) 1249 self._run_test( 1250 lambda df1, 1251 df2: df1.compare(df2, align_axis=0, keep_shape=True), 1252 df1, 1253 df2) 1254 self._run_test( 1255 lambda df1, 1256 df2: df1.compare(df2, keep_shape=True, keep_equal=True), 1257 df1, 1258 df2) 1259 self._run_test( 1260 lambda df1, 1261 df2: df1.compare(df2, align_axis=0, keep_shape=True, keep_equal=True), 1262 df1, 1263 df2) 1264 1265 def test_idxmin(self): 1266 df = pd.DataFrame({ 1267 'consumption': [10.51, 103.11, 55.48], 1268 'co2_emissions': [37.2, 19.66, 1712] 1269 }, 1270 index=['Pork', 'Wheat Products', 'Beef']) 1271 1272 df2 = df.copy() 1273 df2.loc['Pork', 'co2_emissions'] = None 1274 df2.loc['Wheat Products', 'consumption'] = None 1275 df2.loc['Beef', 'co2_emissions'] = None 1276 1277 df3 = pd.DataFrame({ 1278 'consumption': [1.1, 2.2, 3.3], 'co2_emissions': [3.3, 2.2, 1.1] 1279 }, 1280 index=[0, 1, 2]) 1281 1282 s = pd.Series(data=[4, 3, None, 1], index=['A', 'B', 'C', 'D']) 1283 s2 = pd.Series(data=[1, 2, 3], index=[1, 2, 3]) 1284 1285 self._run_test(lambda df: df.idxmin(), df) 1286 self._run_test(lambda df: df.idxmin(skipna=False), df) 1287 self._run_test(lambda df: df.idxmin(axis=1), df) 1288 self._run_test(lambda df: df.idxmin(axis=1, skipna=False), df) 1289 self._run_test(lambda df2: df2.idxmin(), df2) 1290 self._run_test(lambda df2: df2.idxmin(axis=1), df2) 1291 self._run_test(lambda df2: df2.idxmin(skipna=False), df2, check_proxy=False) 1292 self._run_test( 1293 lambda df2: df2.idxmin(axis=1, skipna=False), df2, check_proxy=False) 1294 self._run_test(lambda df3: df3.idxmin(), df3) 1295 self._run_test(lambda df3: df3.idxmin(axis=1), df3) 1296 self._run_test(lambda df3: df3.idxmin(skipna=False), df3) 1297 self._run_test(lambda df3: df3.idxmin(axis=1, skipna=False), df3) 1298 1299 self._run_test(lambda s: s.idxmin(), s) 1300 self._run_test(lambda s: s.idxmin(skipna=False), s, check_proxy=False) 1301 self._run_test(lambda s2: s2.idxmin(), s2) 1302 self._run_test(lambda s2: s2.idxmin(skipna=False), s2) 1303 1304 def test_idxmax(self): 1305 df = pd.DataFrame({ 1306 'consumption': [10.51, 103.11, 55.48], 1307 'co2_emissions': [37.2, 19.66, 1712] 1308 }, 1309 index=['Pork', 'Wheat Products', 'Beef']) 1310 1311 df2 = df.copy() 1312 df2.loc['Pork', 'co2_emissions'] = None 1313 df2.loc['Wheat Products', 'consumption'] = None 1314 df2.loc['Beef', 'co2_emissions'] = None 1315 1316 df3 = pd.DataFrame({ 1317 'consumption': [1.1, 2.2, 3.3], 'co2_emissions': [3.3, 2.2, 1.1] 1318 }, 1319 index=[0, 1, 2]) 1320 1321 s = pd.Series(data=[1, None, 4, 1], index=['A', 'B', 'C', 'D']) 1322 s2 = pd.Series(data=[1, 2, 3], index=[1, 2, 3]) 1323 1324 self._run_test(lambda df: df.idxmax(), df) 1325 self._run_test(lambda df: df.idxmax(skipna=False), df) 1326 self._run_test(lambda df: df.idxmax(axis=1), df) 1327 self._run_test(lambda df: df.idxmax(axis=1, skipna=False), df) 1328 self._run_test(lambda df2: df2.idxmax(), df2) 1329 self._run_test(lambda df2: df2.idxmax(axis=1), df2) 1330 self._run_test( 1331 lambda df2: df2.idxmax(axis=1, skipna=False), df2, check_proxy=False) 1332 self._run_test(lambda df2: df2.idxmax(skipna=False), df2, check_proxy=False) 1333 self._run_test(lambda df3: df3.idxmax(), df3) 1334 self._run_test(lambda df3: df3.idxmax(axis=1), df3) 1335 self._run_test(lambda df3: df3.idxmax(skipna=False), df3) 1336 self._run_test(lambda df3: df3.idxmax(axis=1, skipna=False), df3) 1337 1338 self._run_test(lambda s: s.idxmax(), s) 1339 self._run_test(lambda s: s.idxmax(skipna=False), s, check_proxy=False) 1340 self._run_test(lambda s2: s2.idxmax(), s2) 1341 self._run_test(lambda s2: s2.idxmax(skipna=False), s2) 1342 1343 def test_pipe(self): 1344 def df_times(df, column, times): 1345 df[column] = df[column] * times 1346 return df 1347 1348 def df_times_shuffled(column, times, df): 1349 return df_times(df, column, times) 1350 1351 def s_times(s, times): 1352 return s * times 1353 1354 def s_times_shuffled(times, s): 1355 return s_times(s, times) 1356 1357 df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=[0, 1, 2]) 1358 s = pd.Series([1, 2, 3, 4, 5], index=[0, 1, 2, 3, 4]) 1359 1360 self._run_inplace_test(lambda df: df.pipe(df_times, 'A', 2), df) 1361 self._run_inplace_test( 1362 lambda df: df.pipe((df_times_shuffled, 'df'), 'A', 2), df) 1363 1364 self._run_test(lambda s: s.pipe(s_times, 2), s) 1365 self._run_test(lambda s: s.pipe((s_times_shuffled, 's'), 2), s) 1366 1367 def test_unstack_pandas_series_not_multiindex(self): 1368 # Pandas should throw a ValueError if performing unstack 1369 # on a Series without MultiIndex 1370 s = pd.Series([1, 2, 3, 4], index=['one', 'two', 'three', 'four']) 1371 with self.assertRaises((AttributeError, ValueError)): 1372 self._run_test(lambda s: s.unstack(), s) 1373 1374 def test_unstack_non_categorical_index(self): 1375 index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), ('two', 'a'), 1376 ('two', 'b')]) 1377 index = index.set_levels( 1378 index.levels[0].astype(pd.CategoricalDtype(['one', 'two'])), level=0) 1379 s = pd.Series(np.arange(1.0, 5.0), index=index) 1380 with self.assertRaisesRegex( 1381 frame_base.WontImplementError, 1382 r"unstack\(\) is only supported on DataFrames if"): 1383 self._run_test(lambda s: s.unstack(level=-1), s) 1384 1385 def _unstack_get_categorical_index(self): 1386 index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), ('two', 'a'), 1387 ('two', 'b')]) 1388 index = index.set_levels( 1389 index.levels[0].astype(pd.CategoricalDtype(['one', 'two'])), level=0) 1390 index = index.set_levels( 1391 index.levels[1].astype(pd.CategoricalDtype(['a', 'b'])), level=1) 1392 return index 1393 1394 def test_unstack_pandas_example1(self): 1395 index = self._unstack_get_categorical_index() 1396 s = pd.Series(np.arange(1.0, 5.0), index=index) 1397 self._run_test(lambda s: s.unstack(level=-1), s) 1398 1399 def test_unstack_pandas_example2(self): 1400 index = self._unstack_get_categorical_index() 1401 s = pd.Series(np.arange(1.0, 5.0), index=index) 1402 self._run_test(lambda s: s.unstack(level=0), s) 1403 1404 def test_unstack_pandas_example3(self): 1405 index = self._unstack_get_categorical_index() 1406 s = pd.Series(np.arange(1.0, 5.0), index=index) 1407 df = s.unstack(level=0) 1408 if PD_VERSION < (1, 2): 1409 with self.assertRaisesRegex( 1410 frame_base.WontImplementError, 1411 r"unstack\(\) is not supported when using pandas < 1.2.0"): 1412 self._run_test(lambda df: df.unstack(), df) 1413 else: 1414 self._run_test(lambda df: df.unstack(), df) 1415 1416 @unittest.skipIf( 1417 PD_VERSION < (1, 4), 1418 "Cannot set dtype of index to boolean for pandas<1.4") 1419 def test_unstack_bool(self): 1420 index = pd.MultiIndex.from_tuples([(True, 'a'), (True, 'b'), (False, 'a'), 1421 (False, 'b')]) 1422 index = index.set_levels(index.levels[0].astype('boolean'), level=0) 1423 index = index.set_levels( 1424 index.levels[1].astype(pd.CategoricalDtype(['a', 'b'])), level=1) 1425 s = pd.Series(np.arange(1.0, 5.0), index=index) 1426 self._run_test(lambda s: s.unstack(level=0), s) 1427 1428 def test_unstack_series_multiple_index_levels(self): 1429 tuples = list( 1430 zip( 1431 *[ 1432 ["bar", "bar", "bar", "bar", "baz", "baz", "baz", "baz"], 1433 ["one", "one", "two", "two", "one", "one", "two", "two"], 1434 ["A", "B", "A", "B", "A", "B", "A", "B"], 1435 ])) 1436 index = pd.MultiIndex.from_tuples( 1437 tuples, names=["first", "second", "third"]) 1438 index = index.set_levels( 1439 index.levels[0].astype(pd.CategoricalDtype(['bar', 'baz'])), level=0) 1440 index = index.set_levels( 1441 index.levels[1].astype(pd.CategoricalDtype(['one', 'two'])), level=1) 1442 index = index.set_levels( 1443 index.levels[2].astype(pd.CategoricalDtype(['A', 'B'])), level=2) 1444 df = pd.Series(np.random.randn(8), index=index) 1445 self._run_test(lambda df: df.unstack(level=['first', 'third']), df) 1446 1447 def test_unstack_series_multiple_index_and_column_levels(self): 1448 columns = pd.MultiIndex.from_tuples( 1449 [ 1450 ("A", "cat", "long"), 1451 ("B", "cat", "long"), 1452 ("A", "dog", "short"), 1453 ("B", "dog", "short"), 1454 ], 1455 names=["exp", "animal", "hair_length"], 1456 ) 1457 index = pd.MultiIndex.from_product( 1458 [['one', 'two'], ['a', 'b'], ['bar', 'baz']], 1459 names=["first", "second", "third"]) 1460 index = index.set_levels( 1461 index.levels[0].astype(pd.CategoricalDtype(['one', 'two'])), level=0) 1462 index = index.set_levels( 1463 index.levels[1].astype(pd.CategoricalDtype(['a', 'b'])), level=1) 1464 index = index.set_levels( 1465 index.levels[2].astype(pd.CategoricalDtype(['bar', 'baz'])), level=2) 1466 df = pd.DataFrame(np.random.randn(8, 4), index=index, columns=columns) 1467 df = df.stack(level=["animal", "hair_length"]) 1468 self._run_test(lambda df: df.unstack(level=['second', 'third']), df) 1469 self._run_test(lambda df: df.unstack(level=['second']), df) 1470 1471 def test_pivot_non_categorical(self): 1472 df = pd.DataFrame({ 1473 'foo': ['one', 'one', 'one', 'two', 'two', 'two'], 1474 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], 1475 'baz': [1, 2, 3, 4, 5, 6], 1476 'zoo': ['x', 'y', 'z', 'q', 'w', 't'] 1477 }) 1478 with self.assertRaisesRegex( 1479 frame_base.WontImplementError, 1480 r"pivot\(\) of non-categorical type is not supported"): 1481 self._run_test( 1482 lambda df: df.pivot(index='foo', columns='bar', values='baz'), df) 1483 1484 def test_pivot_pandas_example1(self): 1485 # Simple test 1 1486 df = pd.DataFrame({ 1487 'foo': ['one', 'one', 'one', 'two', 'two', 'two'], 1488 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], 1489 'baz': [1, 2, 3, 4, 5, 6], 1490 'zoo': ['x', 'y', 'z', 'q', 'w', 't'] 1491 }) 1492 df['bar'] = df['bar'].astype( 1493 pd.CategoricalDtype(categories=['A', 'B', 'C'])) 1494 self._run_test( 1495 lambda df: df.pivot(index='foo', columns='bar', values='baz'), df) 1496 self._run_test( 1497 lambda df: df.pivot(index=['foo'], columns='bar', values='baz'), df) 1498 1499 def test_pivot_pandas_example3(self): 1500 # Multiple values 1501 df = pd.DataFrame({ 1502 'foo': ['one', 'one', 'one', 'two', 'two', 'two'], 1503 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], 1504 'baz': [1, 2, 3, 4, 5, 6], 1505 'zoo': ['x', 'y', 'z', 'q', 'w', 't'] 1506 }) 1507 df['bar'] = df['bar'].astype( 1508 pd.CategoricalDtype(categories=['A', 'B', 'C'])) 1509 self._run_test( 1510 lambda df: df.pivot(index='foo', columns='bar', values=['baz', 'zoo']), 1511 df) 1512 self._run_test( 1513 lambda df: df.pivot( 1514 index='foo', columns=['bar'], values=['baz', 'zoo']), 1515 df) 1516 1517 def test_pivot_pandas_example4(self): 1518 # Multiple columns 1519 df = pd.DataFrame({ 1520 "lev1": [1, 1, 1, 2, 2, 2], 1521 "lev2": [1, 1, 2, 1, 1, 2], 1522 "lev3": [1, 2, 1, 2, 1, 2], 1523 "lev4": [1, 2, 3, 4, 5, 6], 1524 "values": [0, 1, 2, 3, 4, 5] 1525 }) 1526 df['lev2'] = df['lev2'].astype(pd.CategoricalDtype(categories=[1, 2])) 1527 df['lev3'] = df['lev3'].astype(pd.CategoricalDtype(categories=[1, 2])) 1528 df['values'] = df['values'].astype('Int64') 1529 self._run_test( 1530 lambda df: df.pivot( 1531 index="lev1", columns=["lev2", "lev3"], values="values"), 1532 df) 1533 1534 def test_pivot_pandas_example5(self): 1535 # Multiple index 1536 df = pd.DataFrame({ 1537 "lev1": [1, 1, 1, 2, 2, 2], 1538 "lev2": [1, 1, 2, 1, 1, 2], 1539 "lev3": [1, 2, 1, 2, 1, 2], 1540 "lev4": [1, 2, 3, 4, 5, 6], 1541 "values": [0, 1, 2, 3, 4, 5] 1542 }) 1543 df['lev3'] = df['lev3'].astype(pd.CategoricalDtype(categories=[1, 2])) 1544 # Cast to nullable Int64 because Beam doesn't do the correct conversion to 1545 # float64 1546 df['values'] = df['values'].astype('Int64') 1547 if PD_VERSION < (1, 4): 1548 with self.assertRaisesRegex( 1549 frame_base.WontImplementError, 1550 r"pivot\(\) is not supported when pandas<1.4 and index is a Multi"): 1551 self._run_test( 1552 lambda df: df.pivot( 1553 index=["lev1", "lev2"], columns=["lev3"], values="values"), 1554 df) 1555 else: 1556 self._run_test( 1557 lambda df: df.pivot( 1558 index=["lev1", "lev2"], columns=["lev3"], values="values"), 1559 df) 1560 1561 def test_pivot_pandas_example6(self): 1562 # Value error when there are duplicates 1563 df = pd.DataFrame({ 1564 "foo": ['one', 'one', 'two', 'two'], 1565 "bar": ['A', 'A', 'B', 'C'], 1566 "baz": [1, 2, 3, 4] 1567 }) 1568 df['bar'] = df['bar'].astype( 1569 pd.CategoricalDtype(categories=['A', 'B', 'C'])) 1570 self._run_error_test( 1571 lambda df: df.pivot(index='foo', columns='bar', values='baz'), 1572 df, 1573 construction_time=False) 1574 1575 def test_pivot_no_index_provided_on_single_level_index(self): 1576 # Multiple columns, no index value provided 1577 df = pd.DataFrame({ 1578 "lev1": [1, 1, 1, 2, 2, 2], 1579 "lev2": [1, 1, 2, 1, 1, 2], 1580 "lev3": [1, 2, 1, 2, 1, 2], 1581 "lev4": [1, 2, 3, 4, 5, 6], 1582 "values": [0, 1, 2, 3, 4, 5] 1583 }) 1584 df['lev2'] = df['lev2'].astype(pd.CategoricalDtype(categories=[1, 2])) 1585 df['lev3'] = df['lev3'].astype(pd.CategoricalDtype(categories=[1, 2])) 1586 df['values'] = df['values'].astype('Int64') 1587 self._run_test( 1588 lambda df: df.pivot(columns=["lev2", "lev3"], values="values"), df) 1589 1590 def test_pivot_no_index_provided_on_multiindex(self): 1591 # Multiple columns, no index value provided 1592 tuples = list( 1593 zip( 1594 *[ 1595 ["bar", "bar", "bar", "baz", "baz", "baz"], 1596 [ 1597 "one", 1598 "two", 1599 "three", 1600 "one", 1601 "two", 1602 "three", 1603 ], 1604 ])) 1605 index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"]) 1606 df = pd.DataFrame({ 1607 "lev1": [1, 1, 1, 2, 2, 2], 1608 "lev2": [1, 1, 2, 1, 1, 2], 1609 "lev3": [1, 2, 1, 2, 1, 2], 1610 "lev4": [1, 2, 3, 4, 5, 6], 1611 "values": [0, 1, 2, 3, 4, 5] 1612 }, 1613 index=index) 1614 df['lev2'] = df['lev2'].astype(pd.CategoricalDtype(categories=[1, 2])) 1615 df['lev3'] = df['lev3'].astype(pd.CategoricalDtype(categories=[1, 2])) 1616 df['values'] = df['values'].astype('float64') 1617 df['lev1'] = df['lev1'].astype('int64') 1618 df['lev4'] = df['lev4'].astype('int64') 1619 if PD_VERSION < (1, 4): 1620 with self.assertRaisesRegex( 1621 frame_base.WontImplementError, 1622 r"pivot\(\) is not supported when pandas<1.4 and index is a Multi"): 1623 self._run_test(lambda df: df.pivot(columns=["lev2", "lev3"]), df) 1624 else: 1625 self._run_test( 1626 lambda df: df.pivot(columns=["lev2", "lev3"]), 1627 df, 1628 lenient_dtype_check=True) 1629 1630 1631 # pandas doesn't support kurtosis on GroupBys: 1632 # https://github.com/pandas-dev/pandas/issues/40139 1633 ALL_GROUPING_AGGREGATIONS = sorted( 1634 set(frames.ALL_AGGREGATIONS) - set(('kurt', 'kurtosis'))) 1635 1636 1637 class GroupByTest(_AbstractFrameTest): 1638 """Tests for DataFrame/Series GroupBy operations.""" 1639 @staticmethod 1640 def median_sum_fn(x): 1641 with warnings.catch_warnings(): 1642 warnings.filterwarnings("ignore", message="Mean of empty slice") 1643 return (x.foo + x.bar).median() 1644 1645 @parameterized.expand(ALL_GROUPING_AGGREGATIONS) 1646 def test_groupby_agg(self, agg_type): 1647 if agg_type == 'describe' and PD_VERSION < (1, 2): 1648 self.skipTest( 1649 "https://github.com/apache/beam/issues/20967: proxy generation of " 1650 "DataFrameGroupBy.describe fails in pandas < 1.2") 1651 self._run_test( 1652 lambda df: df.groupby('group').agg(agg_type), 1653 GROUPBY_DF, 1654 check_proxy=False) 1655 1656 @parameterized.expand(ALL_GROUPING_AGGREGATIONS) 1657 def test_groupby_with_filter(self, agg_type): 1658 if agg_type == 'describe' and PD_VERSION < (1, 2): 1659 self.skipTest( 1660 "https://github.com/apache/beam/issues/20967: proxy generation of " 1661 "DataFrameGroupBy.describe fails in pandas < 1.2") 1662 self._run_test( 1663 lambda df: getattr(df[df.foo > 30].groupby('group'), agg_type)(), 1664 GROUPBY_DF, 1665 check_proxy=False) 1666 1667 @parameterized.expand(ALL_GROUPING_AGGREGATIONS) 1668 def test_groupby(self, agg_type): 1669 if agg_type == 'describe' and PD_VERSION < (1, 2): 1670 self.skipTest( 1671 "https://github.com/apache/beam/issues/20967: proxy generation of " 1672 "DataFrameGroupBy.describe fails in pandas < 1.2") 1673 1674 self._run_test( 1675 lambda df: getattr(df.groupby('group'), agg_type)(), 1676 GROUPBY_DF, 1677 check_proxy=False) 1678 1679 @parameterized.expand(ALL_GROUPING_AGGREGATIONS) 1680 def test_groupby_series(self, agg_type): 1681 if agg_type == 'describe' and PD_VERSION < (1, 2): 1682 self.skipTest( 1683 "https://github.com/apache/beam/issues/20967: proxy generation of " 1684 "DataFrameGroupBy.describe fails in pandas < 1.2") 1685 1686 self._run_test( 1687 lambda df: getattr(df[df.foo > 40].groupby(df.group), agg_type)(), 1688 GROUPBY_DF, 1689 check_proxy=False) 1690 1691 def test_groupby_user_guide(self): 1692 # Example from https://pandas.pydata.org/docs/user_guide/groupby.html 1693 arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], 1694 ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] 1695 1696 index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second']) 1697 1698 df = pd.DataFrame({ 1699 'A': [1, 1, 1, 1, 2, 2, 3, 3], 'B': np.arange(8) 1700 }, 1701 index=index) 1702 1703 self._run_test(lambda df: df.groupby(['second', 'A']).sum(), df) 1704 1705 @parameterized.expand(ALL_GROUPING_AGGREGATIONS) 1706 def test_groupby_project_series(self, agg_type): 1707 df = GROUPBY_DF 1708 1709 if agg_type == 'describe': 1710 self.skipTest( 1711 "https://github.com/apache/beam/issues/20967: proxy generation of " 1712 "SeriesGroupBy.describe fails") 1713 if agg_type in ('corr', 'cov'): 1714 self.skipTest( 1715 "https://github.com/apache/beam/issues/20895: " 1716 "SeriesGroupBy.{corr, cov} do not raise the expected error.") 1717 1718 self._run_test(lambda df: getattr(df.groupby('group').foo, agg_type)(), df) 1719 self._run_test(lambda df: getattr(df.groupby('group').bar, agg_type)(), df) 1720 self._run_test( 1721 lambda df: getattr(df.groupby('group')['foo'], agg_type)(), df) 1722 self._run_test( 1723 lambda df: getattr(df.groupby('group')['bar'], agg_type)(), df) 1724 1725 @parameterized.expand(ALL_GROUPING_AGGREGATIONS) 1726 def test_groupby_project_dataframe(self, agg_type): 1727 if agg_type == 'describe' and PD_VERSION < (1, 2): 1728 self.skipTest( 1729 "https://github.com/apache/beam/issues/20967: proxy generation of " 1730 "DataFrameGroupBy.describe fails in pandas < 1.2") 1731 self._run_test( 1732 lambda df: getattr(df.groupby('group')[['bar', 'baz']], agg_type)(), 1733 GROUPBY_DF, 1734 check_proxy=False) 1735 1736 def test_groupby_errors_bad_projection(self): 1737 df = GROUPBY_DF 1738 1739 # non-existent projection column 1740 self._run_error_test( 1741 lambda df: df.groupby('group')[['bar', 'baz']].bar.median(), df) 1742 self._run_error_test(lambda df: df.groupby('group')[['bad']].median(), df) 1743 1744 self._run_error_test(lambda df: df.groupby('group').bad.median(), df) 1745 1746 self._run_error_test( 1747 lambda df: df.groupby('group')[['bar', 'baz']].bar.sum(), df) 1748 self._run_error_test(lambda df: df.groupby('group')[['bat']].sum(), df) 1749 self._run_error_test(lambda df: df.groupby('group').bat.sum(), df) 1750 1751 def test_groupby_errors_non_existent_label(self): 1752 df = GROUPBY_DF 1753 1754 # non-existent grouping label 1755 self._run_error_test( 1756 lambda df: df.groupby(['really_bad', 'foo', 'bad']).foo.sum(), df) 1757 self._run_error_test(lambda df: df.groupby('bad').foo.sum(), df) 1758 1759 def test_groupby_callable(self): 1760 df = GROUPBY_DF 1761 1762 self._run_test(lambda df: df.groupby(lambda x: x % 2).foo.sum(), df) 1763 self._run_test(lambda df: df.groupby(lambda x: x % 5).median(), df) 1764 1765 def test_groupby_apply(self): 1766 df = GROUPBY_DF 1767 # Note this is the same as DataFrameGroupBy.describe. Using it here is 1768 # just a convenient way to test apply() with a user fn that returns a Series 1769 describe = lambda df: df.describe() 1770 1771 self._run_test(lambda df: df.groupby('group').foo.apply(describe), df) 1772 self._run_test( 1773 lambda df: df.groupby('group')[['foo', 'bar']].apply(describe), df) 1774 self._run_test(lambda df: df.groupby('group').apply(self.median_sum_fn), df) 1775 self._run_test( 1776 lambda df: df.set_index('group').foo.groupby(level=0).apply(describe), 1777 df) 1778 self._run_test(lambda df: df.groupby(level=0).apply(self.median_sum_fn), df) 1779 self._run_test(lambda df: df.groupby(lambda x: x % 3).apply(describe), df) 1780 self._run_test( 1781 lambda df: df.bar.groupby(lambda x: x % 3).apply(describe), df) 1782 self._run_test( 1783 lambda df: df.set_index(['str', 'group', 'bool']).groupby( 1784 level='group').apply(self.median_sum_fn), 1785 df) 1786 1787 def test_groupby_apply_preserves_column_order(self): 1788 df = GROUPBY_DF 1789 1790 self._run_test( 1791 lambda df: df[['foo', 'group', 'bar']].groupby('group').apply( 1792 lambda x: x), 1793 df) 1794 1795 def test_groupby_transform(self): 1796 df = pd.DataFrame({ 1797 "Date": [ 1798 "2015-05-08", 1799 "2015-05-07", 1800 "2015-05-06", 1801 "2015-05-05", 1802 "2015-05-08", 1803 "2015-05-07", 1804 "2015-05-06", 1805 "2015-05-05" 1806 ], 1807 "Data": [5, 8, 6, 1, 50, 100, 60, 120], 1808 }) 1809 1810 self._run_test(lambda df: df.groupby('Date')['Data'].transform(np.sum), df) 1811 self._run_test( 1812 lambda df: df.groupby('Date')['Data'].transform( 1813 lambda x: (x - x.mean()) / x.std()), 1814 df) 1815 1816 def test_groupby_pipe(self): 1817 df = GROUPBY_DF 1818 1819 self._run_test(lambda df: df.groupby('group').pipe(lambda x: x.sum()), df) 1820 self._run_test( 1821 lambda df: df.groupby('group')['bool'].pipe(lambda x: x.any()), df) 1822 self._run_test( 1823 lambda df: df.groupby(['group', 'foo']).pipe( 1824 (lambda a, x: x.sum(numeric_only=a), 'x'), False), 1825 df, 1826 check_proxy=False) 1827 1828 def test_groupby_apply_modified_index(self): 1829 df = GROUPBY_DF 1830 1831 # If apply fn modifies the index then the output will include the grouped 1832 # index 1833 self._run_test( 1834 lambda df: df.groupby('group').apply( 1835 lambda x: x[x.foo > x.foo.median()]), 1836 df) 1837 1838 @unittest.skip('https://github.com/apache/beam/issues/20762') 1839 def test_groupby_aggregate_grouped_column(self): 1840 df = pd.DataFrame({ 1841 'group': ['a' if i % 5 == 0 or i % 3 == 0 else 'b' for i in range(100)], 1842 'foo': [None if i % 11 == 0 else i for i in range(100)], 1843 'bar': [None if i % 7 == 0 else 99 - i for i in range(100)], 1844 'baz': [None if i % 13 == 0 else i * 2 for i in range(100)], 1845 }) 1846 1847 self._run_test(lambda df: df.groupby('group').group.count(), df) 1848 self._run_test(lambda df: df.groupby('group')[['group', 'bar']].count(), df) 1849 self._run_test( 1850 lambda df: df.groupby('group')[['group', 'bar']].apply( 1851 lambda x: x.describe()), 1852 df) 1853 1854 @parameterized.expand((x, ) for x in [ 1855 0, 1856 [1], 1857 3, 1858 [0, 3], 1859 [2, 1], 1860 ['foo', 0], 1861 [1, 'str'], 1862 [3, 0, 2, 1], 1863 ]) 1864 def test_groupby_level_agg(self, level): 1865 df = GROUPBY_DF.set_index(['group', 'foo', 'bar', 'str'], drop=False) 1866 self._run_test(lambda df: df.groupby(level=level).bar.max(), df) 1867 self._run_test( 1868 lambda df: df.groupby(level=level).sum(numeric_only=True), df) 1869 self._run_test( 1870 lambda df: df.groupby(level=level).apply(self.median_sum_fn), df) 1871 1872 @unittest.skipIf(PD_VERSION < (1, 1), "drop_na added in pandas 1.1.0") 1873 def test_groupby_count_na(self): 1874 # Verify we can do a groupby.count() that doesn't drop NaN values 1875 self._run_test( 1876 lambda df: df.groupby('foo', dropna=True).bar.count(), GROUPBY_DF) 1877 self._run_test( 1878 lambda df: df.groupby('foo', dropna=False).bar.count(), GROUPBY_DF) 1879 1880 def test_groupby_sum_min_count(self): 1881 df = pd.DataFrame({ 1882 'good': [1, 2, 3, np.nan], 1883 'bad': [np.nan, np.nan, np.nan, 4], 1884 'group': ['a', 'b', 'a', 'b'] 1885 }) 1886 1887 self._run_test(lambda df: df.groupby('group').sum(min_count=2), df) 1888 1889 def test_groupby_dtypes(self): 1890 self._run_test( 1891 lambda df: df.groupby('group').dtypes, GROUPBY_DF, check_proxy=False) 1892 self._run_test( 1893 lambda df: df.groupby(level=0).dtypes, GROUPBY_DF, check_proxy=False) 1894 1895 @parameterized.expand(ALL_GROUPING_AGGREGATIONS) 1896 def test_dataframe_groupby_series(self, agg_type): 1897 if agg_type == 'describe' and PD_VERSION < (1, 2): 1898 self.skipTest( 1899 "https://github.com/apache/beam/issues/20967: proxy generation of " 1900 "DataFrameGroupBy.describe fails in pandas < 1.2") 1901 self._run_test( 1902 lambda df: df[df.foo > 40].groupby(df.group).agg(agg_type), 1903 GROUPBY_DF, 1904 check_proxy=False) 1905 self._run_test( 1906 lambda df: df[df.foo > 40].groupby(df.foo % 3).agg(agg_type), 1907 GROUPBY_DF, 1908 check_proxy=False) 1909 1910 @parameterized.expand(ALL_GROUPING_AGGREGATIONS) 1911 def test_series_groupby_series(self, agg_type): 1912 if agg_type == 'describe': 1913 self.skipTest( 1914 "https://github.com/apache/beam/issues/20967: proxy generation of " 1915 "SeriesGroupBy.describe fails") 1916 if agg_type in ('corr', 'cov'): 1917 self.skipTest( 1918 "https://github.com/apache/beam/issues/20895: " 1919 "SeriesGroupBy.{corr, cov} do not raise the expected error.") 1920 self._run_test( 1921 lambda df: df[df.foo < 40].bar.groupby(df.group).agg(agg_type), 1922 GROUPBY_DF) 1923 self._run_test( 1924 lambda df: df[df.foo < 40].bar.groupby(df.foo % 3).agg(agg_type), 1925 GROUPBY_DF) 1926 1927 def test_groupby_series_apply(self): 1928 df = GROUPBY_DF 1929 1930 # Note this is the same as DataFrameGroupBy.describe. Using it here is 1931 # just a convenient way to test apply() with a user fn that returns a Series 1932 describe = lambda df: df.describe() 1933 1934 self._run_test(lambda df: df.groupby(df.group).foo.apply(describe), df) 1935 self._run_test( 1936 lambda df: df.groupby(df.group)[['foo', 'bar']].apply(describe), df) 1937 self._run_test( 1938 lambda df: df.groupby(df.group).apply(self.median_sum_fn), df) 1939 1940 def test_groupby_multiindex_keep_nans(self): 1941 # Due to https://github.com/pandas-dev/pandas/issues/36470 1942 # groupby(dropna=False) doesn't work with multiple columns 1943 with self.assertRaisesRegex(NotImplementedError, 1944 "https://github.com/apache/beam/issues/21014"): 1945 self._run_test( 1946 lambda df: df.groupby(['foo', 'bar'], dropna=False).sum(), GROUPBY_DF) 1947 1948 1949 class AggregationTest(_AbstractFrameTest): 1950 """Tests for global aggregation methods on DataFrame/Series.""" 1951 1952 # corr, cov on Series require an other argument 1953 @parameterized.expand( 1954 sorted(set(frames.ALL_AGGREGATIONS) - set(['corr', 'cov']))) 1955 def test_series_agg(self, agg_method): 1956 s = pd.Series(list(range(16))) 1957 1958 nonparallel = agg_method in ('quantile', 'describe', 'median', 'sem', 'mad') 1959 1960 # TODO(https://github.com/apache/beam/issues/20926): max and min produce 1961 # the wrong proxy 1962 check_proxy = agg_method not in ('max', 'min') 1963 1964 self._run_test( 1965 lambda s: s.agg(agg_method), 1966 s, 1967 nonparallel=nonparallel, 1968 check_proxy=check_proxy) 1969 1970 # corr, cov on Series require an other argument 1971 # Series.size is a property 1972 @parameterized.expand( 1973 sorted(set(frames.ALL_AGGREGATIONS) - set(['corr', 'cov', 'size']))) 1974 def test_series_agg_method(self, agg_method): 1975 s = pd.Series(list(range(16))) 1976 1977 nonparallel = agg_method in ('quantile', 'describe', 'median', 'sem', 'mad') 1978 1979 # TODO(https://github.com/apache/beam/issues/20926): max and min produce 1980 # the wrong proxy 1981 check_proxy = agg_method not in ('max', 'min') 1982 1983 self._run_test( 1984 lambda s: getattr(s, agg_method)(), 1985 s, 1986 nonparallel=nonparallel, 1987 check_proxy=check_proxy) 1988 1989 @parameterized.expand(frames.ALL_AGGREGATIONS) 1990 def test_dataframe_agg(self, agg_method): 1991 df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, 3, 5, 7]}) 1992 1993 nonparallel = agg_method in ('quantile', 'describe', 'median', 'sem', 'mad') 1994 1995 # TODO(https://github.com/apache/beam/issues/20926): max and min produce 1996 # the wrong proxy 1997 check_proxy = agg_method not in ('max', 'min') 1998 1999 self._run_test( 2000 lambda df: df.agg(agg_method), 2001 df, 2002 nonparallel=nonparallel, 2003 check_proxy=check_proxy) 2004 2005 # DataFrame.size is a property 2006 @parameterized.expand(sorted(set(frames.ALL_AGGREGATIONS) - set(['size']))) 2007 def test_dataframe_agg_method(self, agg_method): 2008 df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, 3, 5, 7]}) 2009 2010 nonparallel = agg_method in ('quantile', 'describe', 'median', 'sem', 'mad') 2011 2012 # TODO(https://github.com/apache/beam/issues/20926): max and min produce 2013 # the wrong proxy 2014 check_proxy = agg_method not in ('max', 'min') 2015 2016 self._run_test( 2017 lambda df: getattr(df, agg_method)(), 2018 df, 2019 nonparallel=nonparallel, 2020 check_proxy=check_proxy) 2021 2022 def test_series_agg_modes(self): 2023 s = pd.Series(list(range(16))) 2024 self._run_test(lambda s: s.agg('sum'), s) 2025 self._run_test(lambda s: s.agg(['sum']), s) 2026 self._run_test(lambda s: s.agg(['sum', 'mean']), s) 2027 self._run_test(lambda s: s.agg(['mean']), s) 2028 self._run_test(lambda s: s.agg('mean'), s) 2029 2030 def test_dataframe_agg_modes(self): 2031 df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, 3, 5, 7]}) 2032 self._run_test(lambda df: df.agg('sum'), df) 2033 self._run_test(lambda df: df.agg(['sum', 'mean']), df) 2034 self._run_test(lambda df: df.agg({'A': 'sum', 'B': 'sum'}), df) 2035 self._run_test(lambda df: df.agg({'A': 'sum', 'B': 'mean'}), df) 2036 self._run_test(lambda df: df.agg({'A': ['sum', 'mean']}), df) 2037 self._run_test(lambda df: df.agg({'A': ['sum', 'mean'], 'B': 'min'}), df) 2038 2039 def test_series_agg_level(self): 2040 self._run_test( 2041 lambda df: df.set_index(['group', 'foo']).bar.count(level=0), 2042 GROUPBY_DF) 2043 self._run_test( 2044 lambda df: df.set_index(['group', 'foo']).bar.max(level=0), GROUPBY_DF) 2045 2046 self._run_test( 2047 lambda df: df.set_index(['group', 'foo']).bar.median(level=0), 2048 GROUPBY_DF) 2049 2050 self._run_test( 2051 lambda df: df.set_index(['foo', 'group']).bar.count(level=1), 2052 GROUPBY_DF) 2053 self._run_test( 2054 lambda df: df.set_index(['group', 'foo']).bar.max(level=1), GROUPBY_DF) 2055 self._run_test( 2056 lambda df: df.set_index(['group', 'foo']).bar.max(level='foo'), 2057 GROUPBY_DF) 2058 self._run_test( 2059 lambda df: df.set_index(['group', 'foo']).bar.median(level=1), 2060 GROUPBY_DF) 2061 2062 def test_dataframe_agg_level(self): 2063 self._run_test( 2064 lambda df: df.set_index(['group', 'foo']).count(level=0), GROUPBY_DF) 2065 self._run_test( 2066 lambda df: df.set_index(['group', 'foo']).max( 2067 level=0, numeric_only=False), 2068 GROUPBY_DF, 2069 check_proxy=False) 2070 # pandas implementation doesn't respect numeric_only argument here 2071 # (https://github.com/pandas-dev/pandas/issues/40788), it 2072 # always acts as if numeric_only=True. Our implmentation respects it so we 2073 # need to make it explicit. 2074 self._run_test( 2075 lambda df: df.set_index(['group', 'foo']).sum( 2076 level=0, numeric_only=True), 2077 GROUPBY_DF) 2078 2079 self._run_test( 2080 lambda df: df.set_index(['group', 'foo'])[['bar']].count(level=1), 2081 GROUPBY_DF) 2082 self._run_test( 2083 lambda df: df.set_index(['group', 'foo']).count(level=1), GROUPBY_DF) 2084 self._run_test( 2085 lambda df: df.set_index(['group', 'foo']).max( 2086 level=1, numeric_only=False), 2087 GROUPBY_DF, 2088 check_proxy=False) 2089 # sum with str columns is order-sensitive 2090 self._run_test( 2091 lambda df: df.set_index(['group', 'foo']).sum( 2092 level=1, numeric_only=True), 2093 GROUPBY_DF) 2094 2095 self._run_test( 2096 lambda df: df.set_index(['group', 'foo']).median( 2097 level=0, numeric_only=True), 2098 GROUPBY_DF) 2099 self._run_test( 2100 lambda df: df.drop('str', axis=1).set_index(['foo', 'group']).median( 2101 level=1, numeric_only=True), 2102 GROUPBY_DF) 2103 2104 def test_series_agg_multifunc_level(self): 2105 # level= is ignored for multiple agg fns 2106 self._run_test( 2107 lambda df: df.set_index(['group', 'foo']).bar.agg(['min', 'max'], 2108 level=0), 2109 GROUPBY_DF) 2110 2111 def test_series_mean_skipna(self): 2112 df = pd.DataFrame({ 2113 'one': [i if i % 8 == 0 else np.nan for i in range(8)], 2114 'two': [i if i % 4 == 0 else np.nan for i in range(8)], 2115 'three': [i if i % 2 == 0 else np.nan for i in range(8)], 2116 }) 2117 2118 self._run_test(lambda df: df.one.mean(skipna=False), df) 2119 self._run_test(lambda df: df.two.mean(skipna=False), df) 2120 self._run_test(lambda df: df.three.mean(skipna=False), df) 2121 2122 self._run_test(lambda df: df.one.mean(skipna=True), df) 2123 self._run_test(lambda df: df.two.mean(skipna=True), df) 2124 self._run_test(lambda df: df.three.mean(skipna=True), df) 2125 2126 def test_dataframe_agg_multifunc_level(self): 2127 # level= is ignored for multiple agg fns 2128 self._run_test( 2129 lambda df: df.set_index(['group', 'foo']).agg(['min', 'max'], level=0), 2130 GROUPBY_DF, 2131 check_proxy=False) 2132 2133 @parameterized.expand([(True, ), (False, )]) 2134 @unittest.skipIf( 2135 PD_VERSION < (1, 2), 2136 "pandas 1.1.0 produces different dtypes for these examples") 2137 def test_dataframe_agg_numeric_only(self, numeric_only): 2138 # Note other aggregation functions can fail on this input with 2139 # numeric_only={False,None}. These are the only ones that actually work for 2140 # the string inputs. 2141 self._run_test( 2142 lambda df: df.max(numeric_only=numeric_only), 2143 GROUPBY_DF, 2144 check_proxy=False) 2145 self._run_test( 2146 lambda df: df.min(numeric_only=numeric_only), 2147 GROUPBY_DF, 2148 check_proxy=False) 2149 2150 @unittest.skip( 2151 "pandas implementation doesn't respect numeric_only= with " 2152 "level= (https://github.com/pandas-dev/pandas/issues/40788)") 2153 def test_dataframe_agg_level_numeric_only(self): 2154 self._run_test( 2155 lambda df: df.set_index('foo').sum(level=0, numeric_only=True), 2156 GROUPBY_DF) 2157 self._run_test( 2158 lambda df: df.set_index('foo').max(level=0, numeric_only=True), 2159 GROUPBY_DF) 2160 self._run_test( 2161 lambda df: df.set_index('foo').mean(level=0, numeric_only=True), 2162 GROUPBY_DF) 2163 self._run_test( 2164 lambda df: df.set_index('foo').median(level=0, numeric_only=True), 2165 GROUPBY_DF) 2166 2167 def test_dataframe_agg_bool_only(self): 2168 df = pd.DataFrame({ 2169 'all': [True for i in range(10)], 2170 'any': [i % 3 == 0 for i in range(10)], 2171 'int': range(10) 2172 }) 2173 2174 self._run_test(lambda df: df.all(), df) 2175 self._run_test(lambda df: df.any(), df) 2176 self._run_test(lambda df: df.all(bool_only=True), df) 2177 self._run_test(lambda df: df.any(bool_only=True), df) 2178 2179 @unittest.skip( 2180 "pandas doesn't implement bool_only= with level= " 2181 "(https://github.com/pandas-dev/pandas/blob/" 2182 "v1.2.3/pandas/core/generic.py#L10573)") 2183 def test_dataframe_agg_level_bool_only(self): 2184 df = pd.DataFrame({ 2185 'all': [True for i in range(10)], 2186 'any': [i % 3 == 0 for i in range(10)], 2187 'int': range(10) 2188 }) 2189 2190 self._run_test(lambda df: df.set_index('int', drop=False).all(level=0), df) 2191 self._run_test(lambda df: df.set_index('int', drop=False).any(level=0), df) 2192 self._run_test( 2193 lambda df: df.set_index('int', drop=False).all(level=0, bool_only=True), 2194 df) 2195 self._run_test( 2196 lambda df: df.set_index('int', drop=False).any(level=0, bool_only=True), 2197 df) 2198 2199 def test_series_agg_np_size(self): 2200 self._run_test( 2201 lambda df: df.set_index(['group', 'foo']).agg(np.size), 2202 GROUPBY_DF, 2203 check_proxy=False) 2204 2205 def test_df_agg_invalid_kwarg_raises(self): 2206 self._run_error_test(lambda df: df.agg('mean', bool_only=True), GROUPBY_DF) 2207 self._run_error_test( 2208 lambda df: df.agg('any', numeric_only=True), GROUPBY_DF) 2209 self._run_error_test( 2210 lambda df: df.agg('median', min_count=3, numeric_only=True), GROUPBY_DF) 2211 2212 def test_series_agg_method_invalid_kwarg_raises(self): 2213 self._run_error_test(lambda df: df.foo.median(min_count=3), GROUPBY_DF) 2214 self._run_error_test( 2215 lambda df: df.foo.agg('median', min_count=3), GROUPBY_DF) 2216 2217 @unittest.skipIf( 2218 PD_VERSION < (1, 3), 2219 ( 2220 "DataFrame.agg raises a different exception from the " 2221 "aggregation methods. Fixed in " 2222 "https://github.com/pandas-dev/pandas/pull/40543.")) 2223 def test_df_agg_method_invalid_kwarg_raises(self): 2224 self._run_error_test(lambda df: df.mean(bool_only=True), GROUPBY_DF) 2225 self._run_error_test(lambda df: df.any(numeric_only=True), GROUPBY_DF) 2226 self._run_error_test( 2227 lambda df: df.median(min_count=3, numeric_only=True), GROUPBY_DF) 2228 2229 def test_agg_min_count(self): 2230 df = pd.DataFrame({ 2231 'good': [1, 2, 3, np.nan], 2232 'bad': [np.nan, np.nan, np.nan, 4], 2233 }, 2234 index=['a', 'b', 'a', 'b']) 2235 2236 self._run_test(lambda df: df.sum(level=0, min_count=2), df) 2237 2238 self._run_test(lambda df: df.sum(min_count=3), df, nonparallel=True) 2239 self._run_test(lambda df: df.sum(min_count=1), df, nonparallel=True) 2240 self._run_test(lambda df: df.good.sum(min_count=2), df, nonparallel=True) 2241 self._run_test(lambda df: df.bad.sum(min_count=2), df, nonparallel=True) 2242 2243 def test_series_agg_std(self): 2244 s = pd.Series(range(10)) 2245 2246 self._run_test(lambda s: s.agg('std'), s) 2247 self._run_test(lambda s: s.agg('var'), s) 2248 self._run_test(lambda s: s.agg(['std', 'sum']), s) 2249 self._run_test(lambda s: s.agg(['var']), s) 2250 2251 def test_std_all_na(self): 2252 s = pd.Series([np.nan] * 10) 2253 2254 self._run_test(lambda s: s.agg('std'), s) 2255 self._run_test(lambda s: s.std(), s) 2256 2257 def test_std_mostly_na_with_ddof(self): 2258 df = pd.DataFrame({ 2259 'one': [i if i % 8 == 0 else np.nan for i in range(8)], 2260 'two': [i if i % 4 == 0 else np.nan for i in range(8)], 2261 'three': [i if i % 2 == 0 else np.nan for i in range(8)], 2262 }, 2263 index=pd.MultiIndex.from_arrays( 2264 [list(range(8)), list(reversed(range(8)))], 2265 names=['forward', None])) 2266 2267 self._run_test(lambda df: df.std(), df) # ddof=1 2268 self._run_test(lambda df: df.std(ddof=0), df) 2269 self._run_test(lambda df: df.std(ddof=2), df) 2270 self._run_test(lambda df: df.std(ddof=3), df) 2271 self._run_test(lambda df: df.std(ddof=4), df) 2272 2273 def test_dataframe_std(self): 2274 self._run_test(lambda df: df.std(numeric_only=True), GROUPBY_DF) 2275 self._run_test(lambda df: df.var(numeric_only=True), GROUPBY_DF) 2276 2277 def test_dataframe_mode(self): 2278 self._run_test( 2279 lambda df: df.mode(), GROUPBY_DF, nonparallel=True, check_proxy=False) 2280 self._run_test( 2281 lambda df: df.mode(numeric_only=True), 2282 GROUPBY_DF, 2283 nonparallel=True, 2284 check_proxy=False) 2285 self._run_test( 2286 lambda df: df.mode(dropna=True, numeric_only=True), 2287 GROUPBY_DF, 2288 nonparallel=True, 2289 check_proxy=False) 2290 2291 def test_series_mode(self): 2292 self._run_test(lambda df: df.foo.mode(), GROUPBY_DF, nonparallel=True) 2293 self._run_test( 2294 lambda df: df.baz.mode(dropna=True), GROUPBY_DF, nonparallel=True) 2295 2296 2297 class BeamSpecificTest(unittest.TestCase): 2298 """Tests for functionality that's specific to the Beam DataFrame API. 2299 2300 These features don't exist in pandas so we must verify them independently.""" 2301 def assert_frame_data_equivalent( 2302 self, actual, expected, check_column_subset=False, extra_col_value=0): 2303 """Verify that actual is the same as expected, ignoring the index and order 2304 of the data. 2305 2306 Note: In order to perform non-deferred column operations in Beam, we have 2307 to enumerate all possible categories of data, even if they are ultimately 2308 unobserved. The default Pandas implementation on the other hand does not 2309 produce unobserved columns. This means when conducting tests, we need to 2310 account for the fact that the Beam result may be a superset of that of the 2311 Pandas result. 2312 2313 If ``check_column_subset`` is `True`, we verify that all of the columns in 2314 the Dataframe returned from the Pandas implementation is contained in the 2315 Dataframe created from the Beam implementation. 2316 2317 We also check if all columns that exist in the Beam implementation but 2318 not in the Pandas implementation are all equal to the ``extra_col_value`` 2319 to ensure that they were not erroneously populated. 2320 """ 2321 if check_column_subset: 2322 if isinstance(expected, pd.DataFrame): 2323 expected_cols = set(expected.columns) 2324 actual_cols = set(actual.columns) 2325 # Verifying that expected columns is a subset of the actual columns 2326 if not set(expected_cols).issubset(set(actual_cols)): 2327 raise AssertionError( 2328 f"Expected columns:\n{expected.columns}\n is not a" 2329 f"subset of {actual.columns}.") 2330 2331 # Verifying that columns that don't exist in expected 2332 # but do in actual, are all equal to `extra_col_value` (default of 0) 2333 extra_columns = actual_cols - expected_cols 2334 if extra_columns: 2335 actual_extra_only = actual[list(extra_columns)] 2336 2337 if np.isnan(extra_col_value): 2338 extra_cols_all_match = actual_extra_only.isna().all().all() 2339 else: 2340 extra_cols_all_match = actual_extra_only.eq( 2341 extra_col_value).all().all() 2342 if not extra_cols_all_match: 2343 raise AssertionError( 2344 f"Extra columns:{extra_columns}\n should all " 2345 f"be {extra_col_value}, but got \n{actual_extra_only}.") 2346 2347 # Filtering actual to contain only columns in expected 2348 actual = actual[expected.columns] 2349 2350 def sort_and_drop_index(df): 2351 if isinstance(df, pd.Series): 2352 df = df.sort_values() 2353 elif isinstance(df, pd.DataFrame): 2354 df = df.sort_values(by=list(df.columns)) 2355 2356 return df.reset_index(drop=True) 2357 2358 actual = sort_and_drop_index(actual) 2359 expected = sort_and_drop_index(expected) 2360 2361 if isinstance(expected, pd.Series): 2362 pd.testing.assert_series_equal(actual, expected) 2363 elif isinstance(expected, pd.DataFrame): 2364 pd.testing.assert_frame_equal(actual, expected) 2365 2366 def _evaluate(self, func, *args, distributed=True): 2367 deferred_args = [ 2368 frame_base.DeferredFrame.wrap( 2369 expressions.ConstantExpression(arg, arg[0:0])) for arg in args 2370 ] 2371 2372 session_type = ( 2373 expressions.PartitioningSession if distributed else expressions.Session) 2374 2375 return session_type({}).evaluate(func(*deferred_args)._expr) 2376 2377 def test_drop_duplicates_keep_any(self): 2378 df = pd.DataFrame({ 2379 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], 2380 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], 2381 'rating': [4, 4, 3.5, 15, 5] 2382 }) 2383 2384 result = self._evaluate(lambda df: df.drop_duplicates(keep='any'), df) 2385 2386 # Verify that the result is the same as conventional drop_duplicates 2387 self.assert_frame_data_equivalent(result, df.drop_duplicates()) 2388 2389 def test_drop_duplicates_keep_any_subset(self): 2390 df = pd.DataFrame({ 2391 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], 2392 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], 2393 'rating': [4, 4, 3.5, 15, 5] 2394 }) 2395 2396 result = self._evaluate( 2397 lambda df: df.drop_duplicates(keep='any', subset=['brand']), df) 2398 2399 self.assertTrue(result.brand.unique) 2400 self.assert_frame_data_equivalent( 2401 result.brand, df.drop_duplicates(subset=['brand']).brand) 2402 2403 def test_series_drop_duplicates_keep_any(self): 2404 df = pd.DataFrame({ 2405 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], 2406 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], 2407 'rating': [4, 4, 3.5, 15, 5] 2408 }) 2409 2410 result = self._evaluate(lambda df: df.brand.drop_duplicates(keep='any'), df) 2411 2412 self.assert_frame_data_equivalent(result, df.brand.drop_duplicates()) 2413 2414 def test_duplicated_keep_any(self): 2415 df = pd.DataFrame({ 2416 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], 2417 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], 2418 'rating': [4, 4, 3.5, 15, 5] 2419 }) 2420 2421 result = self._evaluate(lambda df: df.duplicated(keep='any'), df) 2422 2423 # Verify that the result is the same as conventional duplicated 2424 self.assert_frame_data_equivalent(result, df.duplicated()) 2425 2426 def test_get_dummies_not_categoricaldtype(self): 2427 # Should not work because series is not a CategoricalDtype type 2428 with self.assertRaisesRegex( 2429 frame_base.WontImplementError, 2430 r"get_dummies\(\) of non-categorical type is not supported"): 2431 s = pd.Series(['a ,b', 'a', 'a, d']) 2432 self._evaluate(lambda s: s.str.get_dummies(','), s) 2433 2434 # bool series do not work because they are not a CategoricalDtype type 2435 with self.assertRaisesRegex( 2436 frame_base.WontImplementError, 2437 r"get_dummies\(\) of non-categorical type is not supported"): 2438 s = pd.Series([True, False, False, True]) 2439 self._evaluate(lambda s: s.str.get_dummies(), s) 2440 2441 def test_get_dummies_comma_separator(self): 2442 s = pd.Series(['a ,b', 'a', 'a, d', 'c']) 2443 s = s.astype(pd.CategoricalDtype(categories=['a ,b', 'c', 'b', 'a,d'])) 2444 result = self._evaluate(lambda s: s.str.get_dummies(','), s) 2445 self.assert_frame_data_equivalent( 2446 result, s.str.get_dummies(','), check_column_subset=True) 2447 2448 def test_get_dummies_pandas_doc_example1(self): 2449 s = pd.Series(['a|b', 'a', 'a|c']) 2450 s = s.astype(pd.CategoricalDtype(categories=['a|b', 'a', 'a|c'])) 2451 result = self._evaluate(lambda s: s.str.get_dummies(), s) 2452 self.assert_frame_data_equivalent( 2453 result, s.str.get_dummies(), check_column_subset=True) 2454 2455 def test_get_dummies_pandas_doc_example2(self): 2456 # Shouldn't still work even though np.nan is not considered a category 2457 # because we automatically create a nan column 2458 s = pd.Series(['a|b', np.nan, 'a|c']) 2459 s = s.astype(pd.CategoricalDtype(categories=['a|b', 'a|c'])) 2460 result = self._evaluate(lambda s: s.str.get_dummies(), s) 2461 self.assert_frame_data_equivalent( 2462 result, s.str.get_dummies(), check_column_subset=True) 2463 2464 def test_get_dummies_pass_nan_as_category(self): 2465 # Explicitly pass 'nan' as a category 2466 s = pd.Series(['a|b', 'b|c', 'a|c', 'c', 'd']) 2467 s = s.astype(pd.CategoricalDtype(categories=['a', 'b', 'c', 'nan'])) 2468 result = self._evaluate(lambda s: s.str.get_dummies(), s) 2469 self.assert_frame_data_equivalent( 2470 result, s.str.get_dummies(), check_column_subset=True) 2471 2472 def test_get_dummies_bools_casted_to_string(self): 2473 s = pd.Series([True, False, False, True]).astype('str') 2474 s = s.astype(pd.CategoricalDtype(categories=['True', 'False'])) 2475 result = self._evaluate(lambda s: s.str.get_dummies(), s) 2476 self.assert_frame_data_equivalent( 2477 result, s.str.get_dummies(), check_column_subset=True) 2478 2479 def test_nsmallest_any(self): 2480 df = pd.DataFrame({ 2481 'population': [ 2482 59000000, 2483 65000000, 2484 434000, 2485 434000, 2486 434000, 2487 337000, 2488 337000, 2489 11300, 2490 11300 2491 ], 2492 'GDP': [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311], 2493 'alpha-2': ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"] 2494 }, 2495 index=[ 2496 "Italy", 2497 "France", 2498 "Malta", 2499 "Maldives", 2500 "Brunei", 2501 "Iceland", 2502 "Nauru", 2503 "Tuvalu", 2504 "Anguilla" 2505 ]) 2506 2507 result = self._evaluate( 2508 lambda df: df.population.nsmallest(3, keep='any'), df) 2509 2510 # keep='any' should produce the same result as keep='first', 2511 # but not necessarily with the same index 2512 self.assert_frame_data_equivalent(result, df.population.nsmallest(3)) 2513 2514 def test_nlargest_any(self): 2515 df = pd.DataFrame({ 2516 'population': [ 2517 59000000, 2518 65000000, 2519 434000, 2520 434000, 2521 434000, 2522 337000, 2523 337000, 2524 11300, 2525 11300 2526 ], 2527 'GDP': [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311], 2528 'alpha-2': ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"] 2529 }, 2530 index=[ 2531 "Italy", 2532 "France", 2533 "Malta", 2534 "Maldives", 2535 "Brunei", 2536 "Iceland", 2537 "Nauru", 2538 "Tuvalu", 2539 "Anguilla" 2540 ]) 2541 2542 result = self._evaluate( 2543 lambda df: df.population.nlargest(3, keep='any'), df) 2544 2545 # keep='any' should produce the same result as keep='first', 2546 # but not necessarily with the same index 2547 self.assert_frame_data_equivalent(result, df.population.nlargest(3)) 2548 2549 def test_pivot_pandas_example2(self): 2550 # Simple test 2 2551 df = pd.DataFrame({ 2552 'foo': ['one', 'one', 'one', 'two', 'two', 'two'], 2553 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], 2554 'baz': [1, 2, 3, 4, 5, 6], 2555 'zoo': ['x', 'y', 'z', 'q', 'w', 't'] 2556 }) 2557 df['bar'] = df['bar'].astype( 2558 pd.CategoricalDtype(categories=['A', 'B', 'C'])) 2559 result = self._evaluate(lambda df: df.pivot(index='foo', columns='bar'), df) 2560 # When there are multiple values, dtypes default to object. 2561 # Thus, need to convert to numeric with pd.to_numeric 2562 self.assert_frame_data_equivalent( 2563 result['baz'].apply(pd.to_numeric), 2564 df.pivot(index='foo', columns='bar')['baz']) 2565 2566 def test_sample(self): 2567 df = pd.DataFrame({ 2568 'population': [ 2569 59000000, 2570 65000000, 2571 434000, 2572 434000, 2573 434000, 2574 337000, 2575 337000, 2576 11300, 2577 11300 2578 ], 2579 'GDP': [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311], 2580 'alpha-2': ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"] 2581 }, 2582 index=[ 2583 "Italy", 2584 "France", 2585 "Malta", 2586 "Maldives", 2587 "Brunei", 2588 "Iceland", 2589 "Nauru", 2590 "Tuvalu", 2591 "Anguilla" 2592 ]) 2593 2594 result = self._evaluate(lambda df: df.sample(n=3), df) 2595 2596 self.assertEqual(len(result), 3) 2597 2598 series_result = self._evaluate(lambda df: df.GDP.sample(n=3), df) 2599 self.assertEqual(len(series_result), 3) 2600 self.assertEqual(series_result.name, "GDP") 2601 2602 def test_sample_with_weights(self): 2603 df = pd.DataFrame({ 2604 'population': [ 2605 59000000, 2606 65000000, 2607 434000, 2608 434000, 2609 434000, 2610 337000, 2611 337000, 2612 11300, 2613 11300 2614 ], 2615 'GDP': [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311], 2616 'alpha-2': ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"] 2617 }, 2618 index=[ 2619 "Italy", 2620 "France", 2621 "Malta", 2622 "Maldives", 2623 "Brunei", 2624 "Iceland", 2625 "Nauru", 2626 "Tuvalu", 2627 "Anguilla" 2628 ]) 2629 2630 weights = pd.Series([0, 0, 0, 0, 0, 0, 0, 1, 1], index=df.index) 2631 2632 result = self._evaluate( 2633 lambda df, weights: df.sample(n=2, weights=weights), df, weights) 2634 2635 self.assertEqual(len(result), 2) 2636 self.assertEqual(set(result.index), set(["Tuvalu", "Anguilla"])) 2637 2638 series_result = self._evaluate( 2639 lambda df, weights: df.GDP.sample(n=2, weights=weights), df, weights) 2640 self.assertEqual(len(series_result), 2) 2641 self.assertEqual(series_result.name, "GDP") 2642 self.assertEqual(set(series_result.index), set(["Tuvalu", "Anguilla"])) 2643 2644 def test_sample_with_missing_weights(self): 2645 df = pd.DataFrame({ 2646 'population': [ 2647 59000000, 2648 65000000, 2649 434000, 2650 434000, 2651 434000, 2652 337000, 2653 337000, 2654 11300, 2655 11300 2656 ], 2657 'GDP': [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311], 2658 'alpha-2': ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"] 2659 }, 2660 index=[ 2661 "Italy", 2662 "France", 2663 "Malta", 2664 "Maldives", 2665 "Brunei", 2666 "Iceland", 2667 "Nauru", 2668 "Tuvalu", 2669 "Anguilla" 2670 ]) 2671 2672 # Missing weights are treated as 0 2673 weights = pd.Series([.1, .01, np.nan, 0], 2674 index=["Nauru", "Iceland", "Anguilla", "Italy"]) 2675 2676 result = self._evaluate( 2677 lambda df, weights: df.sample(n=2, weights=weights), df, weights) 2678 2679 self.assertEqual(len(result), 2) 2680 self.assertEqual(set(result.index), set(["Nauru", "Iceland"])) 2681 2682 series_result = self._evaluate( 2683 lambda df, weights: df.GDP.sample(n=2, weights=weights), df, weights) 2684 2685 self.assertEqual(len(series_result), 2) 2686 self.assertEqual(series_result.name, "GDP") 2687 self.assertEqual(set(series_result.index), set(["Nauru", "Iceland"])) 2688 2689 def test_sample_with_weights_distribution(self): 2690 target_prob = 0.25 2691 num_samples = 100 2692 num_targets = 200 2693 num_other_elements = 10000 2694 2695 target_weight = target_prob / num_targets 2696 other_weight = (1 - target_prob) / num_other_elements 2697 self.assertTrue(target_weight > other_weight * 10, "weights too close") 2698 2699 result = self._evaluate( 2700 lambda s, 2701 weights: s.sample(n=num_samples, weights=weights).sum(), 2702 # The first elements are 1, the rest are all 0. This means that when 2703 # we sum all the sampled elements (above), the result should be the 2704 # number of times the first elements (aka targets) were sampled. 2705 pd.Series([1] * num_targets + [0] * num_other_elements), 2706 pd.Series([target_weight] * num_targets + 2707 [other_weight] * num_other_elements)) 2708 2709 # With the above constants, the probability of violating this invariant 2710 # (as computed using the Bernoulli distribution) is about 0.0012%. 2711 expected = num_samples * target_prob 2712 self.assertTrue(expected / 3 < result < expected * 2, (expected, result)) 2713 2714 def test_split_pandas_examples_no_expand(self): 2715 # if expand=False (default), then no need to cast dtype to be 2716 # CategoricalDtype. 2717 s = pd.Series([ 2718 "this is a regular sentence", 2719 "https://docs.python.org/3/tutorial/index.html", 2720 np.nan 2721 ]) 2722 result = self._evaluate(lambda s: s.str.split(), s) 2723 self.assert_frame_data_equivalent(result, s.str.split()) 2724 2725 result = self._evaluate(lambda s: s.str.rsplit(), s) 2726 self.assert_frame_data_equivalent(result, s.str.rsplit()) 2727 2728 result = self._evaluate(lambda s: s.str.split(n=2), s) 2729 self.assert_frame_data_equivalent(result, s.str.split(n=2)) 2730 2731 result = self._evaluate(lambda s: s.str.rsplit(n=2), s) 2732 self.assert_frame_data_equivalent(result, s.str.rsplit(n=2)) 2733 2734 result = self._evaluate(lambda s: s.str.split(pat="/"), s) 2735 self.assert_frame_data_equivalent(result, s.str.split(pat="/")) 2736 2737 def test_split_pandas_examples_expand_not_categorical(self): 2738 # When expand=True, there is exception because series is not categorical 2739 s = pd.Series([ 2740 "this is a regular sentence", 2741 "https://docs.python.org/3/tutorial/index.html", 2742 np.nan 2743 ]) 2744 with self.assertRaisesRegex( 2745 frame_base.WontImplementError, 2746 r"split\(\) of non-categorical type is not supported"): 2747 self._evaluate(lambda s: s.str.split(expand=True), s) 2748 2749 with self.assertRaisesRegex( 2750 frame_base.WontImplementError, 2751 r"rsplit\(\) of non-categorical type is not supported"): 2752 self._evaluate(lambda s: s.str.rsplit(expand=True), s) 2753 2754 def test_split_pandas_examples_expand_pat_is_string_literal1(self): 2755 # When expand=True and pattern is treated as a string literal 2756 s = pd.Series([ 2757 "this is a regular sentence", 2758 "https://docs.python.org/3/tutorial/index.html", 2759 np.nan 2760 ]) 2761 s = s.astype( 2762 pd.CategoricalDtype( 2763 categories=[ 2764 'this is a regular sentence', 2765 'https://docs.python.org/3/tutorial/index.html' 2766 ])) 2767 result = self._evaluate(lambda s: s.str.split(expand=True), s) 2768 self.assert_frame_data_equivalent(result, s.str.split(expand=True)) 2769 2770 result = self._evaluate(lambda s: s.str.rsplit("/", n=1, expand=True), s) 2771 self.assert_frame_data_equivalent( 2772 result, s.str.rsplit("/", n=1, expand=True)) 2773 2774 @unittest.skipIf(PD_VERSION < (1, 4), "regex arg is new in pandas 1.4") 2775 def test_split_pandas_examples_expand_pat_is_string_literal2(self): 2776 # when regex is None (default) regex pat is string literal if len(pat) == 1 2777 s = pd.Series(['foojpgbar.jpg']).astype('category') 2778 s = s.astype(pd.CategoricalDtype(categories=["foojpgbar.jpg"])) 2779 result = self._evaluate(lambda s: s.str.split(r".", expand=True), s) 2780 self.assert_frame_data_equivalent(result, s.str.split(r".", expand=True)) 2781 2782 # When regex=False, pat is interpreted as the string itself 2783 result = self._evaluate( 2784 lambda s: s.str.split(r"\.jpg", regex=False, expand=True), s) 2785 self.assert_frame_data_equivalent( 2786 result, s.str.split(r"\.jpg", regex=False, expand=True)) 2787 2788 @unittest.skipIf(PD_VERSION < (1, 4), "regex arg is new in pandas 1.4") 2789 def test_split_pandas_examples_expand_pat_is_regex(self): 2790 # when regex is None (default) regex pat is compiled if len(pat) != 1 2791 s = pd.Series(["foo and bar plus baz"]) 2792 s = s.astype(pd.CategoricalDtype(categories=["foo and bar plus baz"])) 2793 result = self._evaluate(lambda s: s.str.split(r"and|plus", expand=True), s) 2794 self.assert_frame_data_equivalent( 2795 result, s.str.split(r"and|plus", expand=True)) 2796 2797 s = pd.Series(['foojpgbar.jpg']).astype('category') 2798 s = s.astype(pd.CategoricalDtype(categories=["foojpgbar.jpg"])) 2799 result = self._evaluate(lambda s: s.str.split(r"\.jpg", expand=True), s) 2800 self.assert_frame_data_equivalent( 2801 result, s.str.split(r"\.jpg", expand=True)) 2802 2803 # When regex=True, pat is interpreted as a regex 2804 result = self._evaluate( 2805 lambda s: s.str.split(r"\.jpg", regex=True, expand=True), s) 2806 self.assert_frame_data_equivalent( 2807 result, s.str.split(r"\.jpg", regex=True, expand=True)) 2808 2809 # A compiled regex can be passed as pat 2810 result = self._evaluate( 2811 lambda s: s.str.split(re.compile(r"\.jpg"), expand=True), s) 2812 self.assert_frame_data_equivalent( 2813 result, s.str.split(re.compile(r"\.jpg"), expand=True)) 2814 2815 @unittest.skipIf(PD_VERSION < (1, 4), "regex arg is new in pandas 1.4") 2816 def test_split_pat_is_regex(self): 2817 # regex=True, but expand=False 2818 s = pd.Series(['foojpgbar.jpg']).astype('category') 2819 s = s.astype(pd.CategoricalDtype(categories=["foojpgbar.jpg"])) 2820 result = self._evaluate( 2821 lambda s: s.str.split(r"\.jpg", regex=True, expand=False), s) 2822 self.assert_frame_data_equivalent( 2823 result, s.str.split(r"\.jpg", regex=True, expand=False)) 2824 2825 def test_astype_categorical_rejected(self): 2826 df = pd.DataFrame({'A': np.arange(6), 'B': list('aabbca')}) 2827 2828 with self.assertRaisesRegex(frame_base.WontImplementError, 2829 r"astype\(dtype='category'\)"): 2830 self._evaluate(lambda df: df.B.astype('category'), df) 2831 2832 2833 class AllowNonParallelTest(unittest.TestCase): 2834 def _use_non_parallel_operation(self): 2835 _ = frame_base.DeferredFrame.wrap( 2836 expressions.PlaceholderExpression(pd.Series([1, 2, 3]))).replace( 2837 'a', 'b', limit=1) 2838 2839 def test_disallow_non_parallel(self): 2840 with self.assertRaises(expressions.NonParallelOperation): 2841 self._use_non_parallel_operation() 2842 2843 def test_allow_non_parallel_in_context(self): 2844 with beam.dataframe.allow_non_parallel_operations(): 2845 self._use_non_parallel_operation() 2846 2847 def test_allow_non_parallel_nesting(self): 2848 # disallowed 2849 with beam.dataframe.allow_non_parallel_operations(): 2850 # allowed 2851 self._use_non_parallel_operation() 2852 with beam.dataframe.allow_non_parallel_operations(False): 2853 # disallowed again 2854 with self.assertRaises(expressions.NonParallelOperation): 2855 self._use_non_parallel_operation() 2856 # allowed 2857 self._use_non_parallel_operation() 2858 # disallowed 2859 with self.assertRaises(expressions.NonParallelOperation): 2860 self._use_non_parallel_operation() 2861 2862 2863 class ConstructionTimeTest(unittest.TestCase): 2864 """Tests for operations that can be executed eagerly.""" 2865 DF = pd.DataFrame({ 2866 'str_col': ['foo', 'bar'] * 3, 2867 'int_col': [1, 2] * 3, 2868 'flt_col': [1.1, 2.2] * 3, 2869 'cat_col': pd.Series(list('aabbca'), dtype="category"), 2870 'datetime_col': pd.Series( 2871 pd.date_range( 2872 '1/1/2000', periods=6, freq='m', tz='America/Los_Angeles')) 2873 }) 2874 DEFERRED_DF = frame_base.DeferredFrame.wrap( 2875 expressions.PlaceholderExpression(DF.iloc[:0])) 2876 2877 def _run_test(self, fn): 2878 expected = fn(self.DF) 2879 actual = fn(self.DEFERRED_DF) 2880 2881 if isinstance(expected, pd.Index): 2882 pd.testing.assert_index_equal(expected, actual) 2883 elif isinstance(expected, pd.Series): 2884 pd.testing.assert_series_equal(expected, actual) 2885 elif isinstance(expected, pd.DataFrame): 2886 pd.testing.assert_frame_equal(expected, actual) 2887 else: 2888 self.assertEqual(expected, actual) 2889 2890 @parameterized.expand(DF.columns) 2891 def test_series_name(self, col_name): 2892 self._run_test(lambda df: df[col_name].name) 2893 2894 @parameterized.expand(DF.columns) 2895 def test_series_dtype(self, col_name): 2896 self._run_test(lambda df: df[col_name].dtype) 2897 self._run_test(lambda df: df[col_name].dtypes) 2898 2899 def test_dataframe_columns(self): 2900 self._run_test(lambda df: list(df.columns)) 2901 2902 def test_dataframe_dtypes(self): 2903 self._run_test(lambda df: list(df.dtypes)) 2904 2905 def test_categories(self): 2906 self._run_test(lambda df: df.cat_col.cat.categories) 2907 2908 def test_categorical_ordered(self): 2909 self._run_test(lambda df: df.cat_col.cat.ordered) 2910 2911 def test_groupby_ndim(self): 2912 self._run_test(lambda df: df.groupby('int_col').ndim) 2913 2914 def test_groupby_project_ndim(self): 2915 self._run_test(lambda df: df.groupby('int_col').flt_col.ndim) 2916 self._run_test( 2917 lambda df: df.groupby('int_col')[['flt_col', 'str_col']].ndim) 2918 2919 def test_get_column_default_None(self): 2920 # .get just returns default_value=None at construction time if the column 2921 # doesn't exist 2922 self._run_test(lambda df: df.get('FOO')) 2923 2924 def test_datetime_tz(self): 2925 self._run_test(lambda df: df.datetime_col.dt.tz) 2926 2927 2928 class DocstringTest(unittest.TestCase): 2929 @parameterized.expand([ 2930 (frames.DeferredDataFrame, pd.DataFrame), 2931 (frames.DeferredSeries, pd.Series), 2932 #(frames._DeferredIndex, pd.Index), 2933 (frames._DeferredStringMethods, pd.core.strings.StringMethods), 2934 ( 2935 frames._DeferredCategoricalMethods, 2936 pd.core.arrays.categorical.CategoricalAccessor), 2937 (frames.DeferredGroupBy, pd.core.groupby.generic.DataFrameGroupBy), 2938 (frames._DeferredGroupByCols, pd.core.groupby.generic.DataFrameGroupBy), 2939 ( 2940 frames._DeferredDatetimeMethods, 2941 pd.core.indexes.accessors.DatetimeProperties), 2942 ]) 2943 def test_docs_defined(self, beam_type, pd_type): 2944 beam_attrs = set(dir(beam_type)) 2945 pd_attrs = set(dir(pd_type)) 2946 2947 docstring_required = sorted([ 2948 attr for attr in beam_attrs.intersection(pd_attrs) 2949 if getattr(pd_type, attr).__doc__ and not attr.startswith('_') 2950 ]) 2951 2952 docstring_missing = [ 2953 attr for attr in docstring_required 2954 if not getattr(beam_type, attr).__doc__ 2955 ] 2956 2957 self.assertTrue( 2958 len(docstring_missing) == 0, 2959 f'{beam_type.__name__} is missing a docstring for ' 2960 f'{len(docstring_missing)}/{len(docstring_required)} ' 2961 f'({len(docstring_missing)/len(docstring_required):%}) ' 2962 f'operations:\n{docstring_missing}') 2963 2964 2965 class ReprTest(unittest.TestCase): 2966 def test_basic_dataframe(self): 2967 df = frame_base.DeferredFrame.wrap( 2968 expressions.ConstantExpression(GROUPBY_DF)) 2969 self.assertEqual( 2970 repr(df), 2971 ( 2972 "DeferredDataFrame(columns=['group', 'foo', 'bar', 'baz', 'bool', " 2973 "'str'], index=<unnamed>)")) 2974 2975 def test_dataframe_with_named_index(self): 2976 df = frame_base.DeferredFrame.wrap( 2977 expressions.ConstantExpression(GROUPBY_DF.set_index('group'))) 2978 self.assertEqual( 2979 repr(df), 2980 ( 2981 "DeferredDataFrame(columns=['foo', 'bar', 'baz', 'bool', 'str'], " 2982 "index='group')")) 2983 2984 def test_dataframe_with_partial_named_index(self): 2985 df = frame_base.DeferredFrame.wrap( 2986 expressions.ConstantExpression( 2987 GROUPBY_DF.set_index([GROUPBY_DF.index, 'group']))) 2988 self.assertEqual( 2989 repr(df), 2990 ( 2991 "DeferredDataFrame(columns=['foo', 'bar', 'baz', 'bool', 'str'], " 2992 "indexes=[<unnamed>, 'group'])")) 2993 2994 def test_dataframe_with_named_multi_index(self): 2995 df = frame_base.DeferredFrame.wrap( 2996 expressions.ConstantExpression(GROUPBY_DF.set_index(['str', 'group']))) 2997 self.assertEqual( 2998 repr(df), 2999 ( 3000 "DeferredDataFrame(columns=['foo', 'bar', 'baz', 'bool'], " 3001 "indexes=['str', 'group'])")) 3002 3003 def test_dataframe_with_multiple_column_levels(self): 3004 df = pd.DataFrame({ 3005 'foofoofoo': ['one', 'one', 'one', 'two', 'two', 'two'], 3006 'barbar': ['A', 'B', 'C', 'A', 'B', 'C'], 3007 'bazzy': [1, 2, 3, 4, 5, 6], 3008 'zoop': ['x', 'y', 'z', 'q', 'w', 't'] 3009 }) 3010 3011 df = df.pivot(index='foofoofoo', columns='barbar') 3012 df = frame_base.DeferredFrame.wrap(expressions.ConstantExpression(df)) 3013 self.assertEqual( 3014 repr(df), 3015 ( 3016 "DeferredDataFrame(columns=[('bazzy', 'A'), ('bazzy', 'B'), " 3017 "('bazzy', 'C'), ('zoop', 'A'), ('zoop', 'B'), ('zoop', 'C')], " 3018 "index='foofoofoo')")) 3019 3020 def test_dataframe_with_multiple_column_and_multiple_index_levels(self): 3021 df = pd.DataFrame({ 3022 'foofoofoo': ['one', 'one', 'one', 'two', 'two', 'two'], 3023 'barbar': ['A', 'B', 'C', 'A', 'B', 'C'], 3024 'bazzy': [1, 2, 3, 4, 5, 6], 3025 'zoop': ['x', 'y', 'z', 'q', 'w', 't'] 3026 }) 3027 3028 df = df.pivot(index='foofoofoo', columns='barbar') 3029 df.index = [['a', 'b'], df.index] 3030 3031 # pandas repr displays this: 3032 # bazzy zoop 3033 # barbar A B C A B C 3034 # foofoofoo 3035 # a one 1 2 3 x y z 3036 # b two 4 5 6 q w t 3037 df = frame_base.DeferredFrame.wrap(expressions.ConstantExpression(df)) 3038 self.assertEqual( 3039 repr(df), 3040 ( 3041 "DeferredDataFrame(columns=[('bazzy', 'A'), ('bazzy', 'B'), " 3042 "('bazzy', 'C'), ('zoop', 'A'), ('zoop', 'B'), ('zoop', 'C')], " 3043 "indexes=[<unnamed>, 'foofoofoo'])")) 3044 3045 def test_basic_series(self): 3046 df = frame_base.DeferredFrame.wrap( 3047 expressions.ConstantExpression(GROUPBY_DF['bool'])) 3048 self.assertEqual( 3049 repr(df), "DeferredSeries(name='bool', dtype=bool, index=<unnamed>)") 3050 3051 def test_series_with_named_index(self): 3052 df = frame_base.DeferredFrame.wrap( 3053 expressions.ConstantExpression(GROUPBY_DF.set_index('group')['str'])) 3054 self.assertEqual( 3055 repr(df), "DeferredSeries(name='str', dtype=object, index='group')") 3056 3057 def test_series_with_partial_named_index(self): 3058 df = frame_base.DeferredFrame.wrap( 3059 expressions.ConstantExpression( 3060 GROUPBY_DF.set_index([GROUPBY_DF.index, 'group'])['bar'])) 3061 self.assertEqual( 3062 repr(df), 3063 ( 3064 "DeferredSeries(name='bar', dtype=float64, " 3065 "indexes=[<unnamed>, 'group'])")) 3066 3067 def test_series_with_named_multi_index(self): 3068 df = frame_base.DeferredFrame.wrap( 3069 expressions.ConstantExpression( 3070 GROUPBY_DF.set_index(['str', 'group'])['baz'])) 3071 self.assertEqual( 3072 repr(df), 3073 "DeferredSeries(name='baz', dtype=float64, indexes=['str', 'group'])") 3074 3075 3076 @unittest.skipIf( 3077 not ie.current_env().is_interactive_ready, 3078 '[interactive] dependency is not installed.') 3079 @isolated_env 3080 class InteractiveDataFrameTest(unittest.TestCase): 3081 def test_collect_merged_dataframes(self): 3082 p = beam.Pipeline(InteractiveRunner()) 3083 pcoll_1 = ( 3084 p 3085 | 'Create data 1' >> beam.Create([(1, 'a'), (2, 'b'), (3, 'c'), 3086 (4, 'd')]) 3087 | 3088 'To rows 1' >> beam.Select(col_1=lambda x: x[0], col_2=lambda x: x[1])) 3089 df_1 = to_dataframe(pcoll_1) 3090 pcoll_2 = ( 3091 p 3092 | 'Create data 2' >> beam.Create([(5, 'e'), (6, 'f'), (7, 'g'), 3093 (8, 'h')]) 3094 | 3095 'To rows 2' >> beam.Select(col_3=lambda x: x[0], col_4=lambda x: x[1])) 3096 df_2 = to_dataframe(pcoll_2) 3097 3098 df_merged = df_1.merge(df_2, left_index=True, right_index=True) 3099 pd_df = ib.collect(df_merged).sort_values(by='col_1') 3100 self.assertEqual(pd_df.shape, (4, 4)) 3101 self.assertEqual(list(pd_df['col_1']), [1, 2, 3, 4]) 3102 self.assertEqual(list(pd_df['col_2']), ['a', 'b', 'c', 'd']) 3103 self.assertEqual(list(pd_df['col_3']), [5, 6, 7, 8]) 3104 self.assertEqual(list(pd_df['col_4']), ['e', 'f', 'g', 'h']) 3105 3106 3107 if __name__ == '__main__': 3108 unittest.main()