#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import unittest

import pandas as pd

from apache_beam.dataframe import doctests
from apache_beam.dataframe.frames import PD_VERSION
from apache_beam.dataframe.pandas_top_level_functions import _is_top_level_function


@unittest.skipIf(
    sys.platform == 'win32', '[https://github.com/apache/beam/issues/20361]')
class DoctestTest(unittest.TestCase):
  """Feeds the examples found in pandas docstrings to the Beam doctest runner.

  Every test hands a pandas module (or a mapping of docstrings) to
  ``doctests.testmod`` / ``doctests.teststrings`` with ``use_beam=False`` and
  then asserts that the returned result reports zero failures.

  Each test builds up to three tables that are forwarded as the
  ``wont_implement_ok``, ``not_implemented_ok`` and ``skip`` keyword
  arguments. Keys are dotted docstring names; values are lists of example
  source strings, with ``'*'`` used as a wildcard entry.

  NOTE(review): the example strings are presumably compared verbatim with the
  text of the pandas docstring examples, so the whitespace inside multi-line
  examples (``'...\\n'`` followed by a continuation string) and in front of
  inline ``# raises`` markers matters -- TODO confirm against the pandas
  docstrings before touching any of those literals.
  """
  def test_ndframe_tests(self):
    # Examples from pandas.core.generic (NDFrame).

    # IO methods are tested in io_test.py, so every NDFrame.to_* docstring is
    # skipped wholesale.
    ndframe_writers = [
        name for name in dir(pd.core.generic.NDFrame) if name.startswith('to_')
    ]
    write_skips = {
        f'pandas.core.generic.NDFrame.{name}': ['*']
        for name in ndframe_writers
    }

    wont_implement = {
        'pandas.core.generic.NDFrame.head': ['*'],
        'pandas.core.generic.NDFrame.shift': [
            'df.shift(periods=3)',
            'df.shift(periods=3, fill_value=0)',
        ],
        'pandas.core.generic.NDFrame.tail': ['*'],
        'pandas.core.generic.NDFrame.take': ['*'],
        'pandas.core.generic.NDFrame.values': ['*'],
        'pandas.core.generic.NDFrame.tz_localize': [
            "s.tz_localize('CET', ambiguous='infer')",
            # np.array is not a deferred object. The same use-case works
            # with a deferred Series, which is covered in frames_test.py.
            "s.tz_localize('CET', ambiguous=np.array([True, True, False]))",
        ],
        'pandas.core.generic.NDFrame.truncate': [
            # These inputs are verified through tail, which is wont
            # implement (order sensitive).
            "df.tail()",
            "df.truncate(before=pd.Timestamp('2016-01-05'),\n"
            " after=pd.Timestamp('2016-01-10')).tail()",
            "df.truncate('2016-01-05', '2016-01-10').tail()",
            "df.loc['2016-01-05':'2016-01-10', :].tail()"
        ],
        'pandas.core.generic.NDFrame.replace': [
            "s.replace([1, 2], method='bfill')",
            # Relies on method='pad'
            "s.replace('a')",
            # Relies on method='pad'
            # value=None is not valid for pandas < 1.4
            "s.replace('a', None)",
            # Implicitly uses method='pad', but the output does not depend
            # on that behavior. Verified independently in
            # frames_test.py::DeferredFrameTest::test_replace
            "df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'})"
        ],
        'pandas.core.generic.NDFrame.fillna': [
            'df.fillna(method=\'ffill\')',
            'df.fillna(method="ffill")',
            'df.fillna(value=values, limit=1)',
        ],
        'pandas.core.generic.NDFrame.sort_values': ['*'],
        'pandas.core.generic.NDFrame.mask': [
            'df.where(m, -df) == np.where(m, df, -df)'
        ],
        'pandas.core.generic.NDFrame.where': [
            'df.where(m, -df) == np.where(m, df, -df)'
        ],
        'pandas.core.generic.NDFrame.interpolate': ['*'],
        'pandas.core.generic.NDFrame.resample': ['*'],
        'pandas.core.generic.NDFrame.rolling': ['*'],
        # argsort wont implement
        'pandas.core.generic.NDFrame.abs': [
            'df.loc[(df.c - 43).abs().argsort()]',
        ],
        'pandas.core.generic.NDFrame.reindex': ['*'],
        'pandas.core.generic.NDFrame.pct_change': ['*'],
        'pandas.core.generic.NDFrame.asof': ['*'],
        'pandas.core.generic.NDFrame.infer_objects': ['*'],
        'pandas.core.generic.NDFrame.ewm': ['*'],
        'pandas.core.generic.NDFrame.expanding': ['*'],
        'pandas.core.generic.NDFrame.get': ['*'],
    }

    not_implemented = {
        'pandas.core.generic.NDFrame.asof': ['*'],
        'pandas.core.generic.NDFrame.at_time': ['*'],
        'pandas.core.generic.NDFrame.between_time': ['*'],
        'pandas.core.generic.NDFrame.ewm': ['*'],
        'pandas.core.generic.NDFrame.expanding': ['*'],
        'pandas.core.generic.NDFrame.flags': ['*'],
        'pandas.core.generic.NDFrame.rank': ['*'],
        'pandas.core.generic.NDFrame.reindex_like': ['*'],
        'pandas.core.generic.NDFrame.replace': ['*'],
        'pandas.core.generic.NDFrame.sample': ['*'],
        'pandas.core.generic.NDFrame.set_flags': ['*'],
        'pandas.core.generic.NDFrame.squeeze': ['*'],
        'pandas.core.generic.NDFrame.truncate': ['*'],
    }

    skipped = {
        # Internal test
        'pandas.core.generic.NDFrame._set_axis_name': ['*'],
        # Fails to construct test series. asfreq is not implemented anyway.
        'pandas.core.generic.NDFrame.asfreq': ['*'],
        'pandas.core.generic.NDFrame.astype': ['*'],
        'pandas.core.generic.NDFrame.convert_dtypes': ['*'],
        'pandas.core.generic.NDFrame.copy': ['*'],
        'pandas.core.generic.NDFrame.droplevel': ['*'],
        'pandas.core.generic.NDFrame.get': ['*'],
        'pandas.core.generic.NDFrame.rank': ['*'],
        'pandas.core.generic.NDFrame.rename': [
            # Looks like an upstream bug. The actual error carries a
            # different message:
            #   TypeError: Index(...) must be called with a collection of
            #   some kind, 2 was passed
            # pandas doctests only verify the type of exception
            'df.rename(2)'
        ],
        # For pandas >= 1.4, rename is changed to _rename
        'pandas.core.generic.NDFrame._rename': [
            # Looks like an upstream bug. The actual error carries a
            # different message:
            #   TypeError: Index(...) must be called with a collection of
            #   some kind, 2 was passed
            # pandas doctests only verify the type of exception
            'df.rename(2)'
        ],
        # Tests rely on setting index
        'pandas.core.generic.NDFrame.rename_axis': ['*'],
        # Raises right exception, but testing framework has matching issues.
        'pandas.core.generic.NDFrame.replace': [
            "df.replace({'a string': 'new value', True: False}) # raises"
        ],
        'pandas.core.generic.NDFrame.squeeze': ['*'],

        # NameError
        'pandas.core.generic.NDFrame.resample': ['df'],

        # Skipped so we don't need to install natsort
        'pandas.core.generic.NDFrame.sort_values': [
            'from natsort import index_natsorted',
            'df.sort_values(\n'
            ' by="time",\n'
            ' key=lambda x: np.argsort(index_natsorted(df["time"]))\n'
            ')'
        ],
    }
    skipped.update(write_skips)

    outcome = doctests.testmod(
        pd.core.generic,
        use_beam=False,
        report=True,
        wont_implement_ok=wont_implement,
        not_implemented_ok=not_implemented,
        skip=skipped)
    self.assertEqual(outcome.failed, 0)

  def test_dataframe_tests(self):
    # Examples from pandas.core.frame (DataFrame).
    wont_implement = {
        'pandas.core.frame.DataFrame.T': ['*'],
        'pandas.core.frame.DataFrame.cummax': ['*'],
        'pandas.core.frame.DataFrame.cummin': ['*'],
        'pandas.core.frame.DataFrame.cumsum': ['*'],
        'pandas.core.frame.DataFrame.cumprod': ['*'],
        'pandas.core.frame.DataFrame.diff': ['*'],
        'pandas.core.frame.DataFrame.fillna': [
            'df.fillna(method=\'ffill\')',
            'df.fillna(method="ffill")',
            'df.fillna(value=values, limit=1)',
        ],
        'pandas.core.frame.DataFrame.items': ['*'],
        'pandas.core.frame.DataFrame.itertuples': ['*'],
        'pandas.core.frame.DataFrame.iterrows': ['*'],
        'pandas.core.frame.DataFrame.iteritems': ['*'],
        # default keep is 'first'
        'pandas.core.frame.DataFrame.nlargest': [
            "df.nlargest(3, 'population')",
            "df.nlargest(3, ['population', 'GDP'])",
            "df.nlargest(3, 'population', keep='last')"
        ],
        'pandas.core.frame.DataFrame.nsmallest': [
            "df.nsmallest(3, 'population')",
            "df.nsmallest(3, ['population', 'GDP'])",
            "df.nsmallest(3, 'population', keep='last')",
        ],
        'pandas.core.frame.DataFrame.replace': [
            "s.replace([1, 2], method='bfill')",
            # Relies on method='pad'
            "s.replace('a')",
            # Relies on method='pad'
            # value=None is not valid for pandas < 1.4
            "s.replace('a', None)",
            # Implicitly uses method='pad', but the output does not depend
            # on that behavior. Verified independently in
            # frames_test.py::DeferredFrameTest::test_replace
            "df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'})"
        ],
        'pandas.core.frame.DataFrame.to_records': ['*'],
        'pandas.core.frame.DataFrame.to_dict': ['*'],
        'pandas.core.frame.DataFrame.to_numpy': ['*'],
        'pandas.core.frame.DataFrame.to_string': ['*'],
        'pandas.core.frame.DataFrame.transpose': ['*'],
        'pandas.core.frame.DataFrame.shape': ['*'],
        'pandas.core.frame.DataFrame.shift': [
            'df.shift(periods=3)',
            'df.shift(periods=3, fill_value=0)',
        ],
        'pandas.core.frame.DataFrame.unstack': ['*'],
        'pandas.core.frame.DataFrame.memory_usage': ['*'],
        'pandas.core.frame.DataFrame.info': ['*'],
        # Not equal to df.agg('mode', axis='columns', numeric_only=True)
        # because there can be multiple columns if a row has more than one
        # mode
        'pandas.core.frame.DataFrame.mode': [
            "df.mode(axis='columns', numeric_only=True)"
        ],
        'pandas.core.frame.DataFrame.append': [
            'df.append(df2, ignore_index=True)',
            "for i in range(5):\n" +
            " df = df.append({'A': i}, ignore_index=True)",
        ],
        'pandas.core.frame.DataFrame.sort_index': ['*'],
        'pandas.core.frame.DataFrame.sort_values': ['*'],
        'pandas.core.frame.DataFrame.melt': [
            "df.melt(id_vars=['A'], value_vars=['B'])",
            "df.melt(id_vars=['A'], value_vars=['B', 'C'])",
            "df.melt(col_level=0, id_vars=['A'], value_vars=['B'])",
            "df.melt(id_vars=[('A', 'D')], value_vars=[('B', 'E')])",
            "df.melt(id_vars=['A'], value_vars=['B'],\n" +
            " var_name='myVarname', value_name='myValname')"
        ],
        # Most keep= options are order-sensitive
        'pandas.core.frame.DataFrame.drop_duplicates': ['*'],
        'pandas.core.frame.DataFrame.duplicated': [
            'df.duplicated()',
            "df.duplicated(keep='last')",
            "df.duplicated(subset=['brand'])",
        ],
        'pandas.core.frame.DataFrame.reindex': ['*'],
        'pandas.core.frame.DataFrame.dot': [
            # reindex not supported
            's2 = s.reindex([1, 0, 2, 3])',
        ],
        'pandas.core.frame.DataFrame.resample': ['*'],
        'pandas.core.frame.DataFrame.values': ['*'],
    }

    not_implemented = {
        'pandas.core.frame.DataFrame.transform': [
            # str arg not supported. Tested with np.sum in
            # frames_test.py::DeferredFrameTest::test_groupby_transform_sum
            "df.groupby('Date')['Data'].transform('sum')",
        ],
        'pandas.core.frame.DataFrame.melt': ['*'],
        'pandas.core.frame.DataFrame.reindex_axis': ['*'],
        'pandas.core.frame.DataFrame.round': [
            'df.round(decimals)',
        ],

        # Trivially elementwise for axis=columns. Relies on global indexing
        # for axis=rows.
        # Difficult to determine proxy, need to inspect function
        'pandas.core.frame.DataFrame.apply': ['*'],

        # Cross-join not implemented
        'pandas.core.frame.DataFrame.merge': ["df1.merge(df2, how='cross')"],

        # TODO(https://github.com/apache/beam/issues/20759)
        'pandas.core.frame.DataFrame.set_index': [
            "df.set_index([s, s**2])",
        ],
        'pandas.core.frame.DataFrame.set_axis': [
            "df.set_axis(range(0,2), axis='index')",
        ],

        # TODO(https://github.com/apache/beam/issues/21014)
        'pandas.core.frame.DataFrame.value_counts': [
            'df.value_counts(dropna=False)'
        ],
    }

    skipped = {
        # DataFrame construction from a dictionary and
        # Series requires using the len() function, which
        # is a non-deferred operation that we do not allow
        'pandas.core.frame.DataFrame': [
            'pd.DataFrame(data=d, index=[0, 1, 2, 3])',
        ],
        # s2 created with reindex
        'pandas.core.frame.DataFrame.dot': [
            'df.dot(s2)',
        ],
        'pandas.core.frame.DataFrame.resample': ['df'],
        'pandas.core.frame.DataFrame.asfreq': ['*'],
        # Throws NotImplementedError when modifying df
        'pandas.core.frame.DataFrame.axes': [
            # Returns deferred index.
            'df.axes',
        ],
        # Skipped because the test relies on loc to set cells in df2
        'pandas.core.frame.DataFrame.compare': ['*'],
        'pandas.core.frame.DataFrame.cov': [
            # Relies on setting entries ahead of time.
            "df.loc[df.index[:5], 'a'] = np.nan",
            "df.loc[df.index[5:10], 'b'] = np.nan",
            'df.cov(min_periods=12)',
        ],
        'pandas.core.frame.DataFrame.rename': [
            # Returns deferred index.
            'df.index',
            'df.rename(index=str).index',
        ],
        'pandas.core.frame.DataFrame.set_index': [
            # TODO(https://github.com/apache/beam/issues/20759): This could
            # pass in the index as a DeferredIndex, and we should fail it
            # as order-sensitive.
            "df.set_index([pd.Index([1, 2, 3, 4]), 'year'])",
        ],
        'pandas.core.frame.DataFrame.set_axis': [
            # This should pass as set_axis(axis='columns')
            # and fail with set_axis(axis='index')
            "df.set_axis(['a', 'b', 'c'], axis='index')"
        ],
        'pandas.core.frame.DataFrame.to_markdown': ['*'],
        'pandas.core.frame.DataFrame.to_parquet': ['*'],

        # Raises right exception, but testing framework has matching issues.
        # Tested in `frames_test.py`.
        'pandas.core.frame.DataFrame.insert': [
            'df',
            'df.insert(1, "newcol", [99, 99])',
            'df.insert(0, "col1", [100, 100], allow_duplicates=True)'
        ],
        'pandas.core.frame.DataFrame.to_records': [
            'df.index = df.index.rename("I")',
            'index_dtypes = f"<S{df.index.str.len().max()}"',  # 1.x
            'index_dtypes = "<S{}".format(df.index.str.len().max())',  # 0.x
            'df.to_records(index_dtypes=index_dtypes)',
        ],
        # These tests use the static method pd.pivot_table, which doesn't
        # actually raise NotImplementedError
        'pandas.core.frame.DataFrame.pivot_table': ['*'],
        # Expected to raise a ValueError, but we raise NotImplementedError
        'pandas.core.frame.DataFrame.pivot': [
            "df.pivot(index='foo', columns='bar', values='baz')",
            "df.pivot(index='foo', columns='bar')['baz']",
            "df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])",
            # pylint: disable=line-too-long
            'df.pivot(index="lev1", columns=["lev2", "lev3"],values="values")',
            # pylint: disable=line-too-long
            'df.pivot(index=["lev1", "lev2"], columns=["lev3"],values="values")'
        ],
        'pandas.core.frame.DataFrame.append': [
            'df',
            # pylint: disable=line-too-long
            "pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],\n"
            " ignore_index=True)"
        ],
        'pandas.core.frame.DataFrame.eval': ['df'],
        'pandas.core.frame.DataFrame.melt': [
            "df.columns = [list('ABC'), list('DEF')]", "df"
        ],
        'pandas.core.frame.DataFrame.merge': [
            # Order-sensitive index, checked in frames_test.py.
            "df1.merge(df2, left_on='lkey', right_on='rkey')",
            "df1.merge(df2, left_on='lkey', right_on='rkey',\n"
            " suffixes=('_left', '_right'))",
            "df1.merge(df2, how='left', on='a')",
        ],
        # Raises right exception, but testing framework has matching issues.
        'pandas.core.frame.DataFrame.replace': [
            "df.replace({'a string': 'new value', True: False}) # raises"
        ],
        'pandas.core.frame.DataFrame.to_sparse': ['type(df)'],

        # Skipped because "seen_wont_implement" is reset before getting to
        # these calls, so the NameError they raise is not ignored.
        'pandas.core.frame.DataFrame.T': [
            'df1_transposed.dtypes', 'df2_transposed.dtypes'
        ],
        'pandas.core.frame.DataFrame.transpose': [
            'df1_transposed.dtypes', 'df2_transposed.dtypes'
        ],
        # Skipped because the test relies on iloc to set a cell to NA. Test
        # is replicated in frames_test::DeferredFrameTest::test_applymap.
        'pandas.core.frame.DataFrame.applymap': [
            'df_copy.iloc[0, 0] = pd.NA',
            "df_copy.applymap(lambda x: len(str(x)), na_action='ignore')",
        ],
        # Skipped so we don't need to install natsort
        'pandas.core.frame.DataFrame.sort_values': [
            'from natsort import index_natsorted',
            'df.sort_values(\n'
            ' by="time",\n'
            ' key=lambda x: np.argsort(index_natsorted(df["time"]))\n'
            ')'
        ],
        # Mode that we don't yet support, documentation added in pandas
        # 1.2.0 (https://github.com/pandas-dev/pandas/issues/35912)
        'pandas.core.frame.DataFrame.aggregate': [
            "df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean))"
        ],
    }

    outcome = doctests.testmod(
        pd.core.frame,
        use_beam=False,
        report=True,
        wont_implement_ok=wont_implement,
        not_implemented_ok=not_implemented,
        skip=skipped)
    self.assertEqual(outcome.failed, 0)

  def test_series_tests(self):
    # Examples from pandas.core.series (Series).
    wont_implement = {
        'pandas.core.series.Series.__array__': ['*'],
        'pandas.core.series.Series.array': ['*'],
        'pandas.core.series.Series.cummax': ['*'],
        'pandas.core.series.Series.cummin': ['*'],
        'pandas.core.series.Series.cumsum': ['*'],
        'pandas.core.series.Series.cumprod': ['*'],
        'pandas.core.series.Series.diff': ['*'],
        'pandas.core.series.Series.dot': [
            's.dot(arr)',  # non-deferred result
        ],
        'pandas.core.series.Series.fillna': [
            'df.fillna(method=\'ffill\')',
            'df.fillna(method="ffill")',
            'df.fillna(value=values, limit=1)',
        ],
        'pandas.core.series.Series.info': ['*'],
        'pandas.core.series.Series.items': ['*'],
        'pandas.core.series.Series.iteritems': ['*'],
        # default keep is 'first'
        'pandas.core.series.Series.nlargest': [
            "s.nlargest()",
            "s.nlargest(3)",
            "s.nlargest(3, keep='last')",
        ],
        'pandas.core.series.Series.memory_usage': ['*'],
        'pandas.core.series.Series.nsmallest': [
            "s.nsmallest()",
            "s.nsmallest(3)",
            "s.nsmallest(3, keep='last')",
        ],
        'pandas.core.series.Series.pop': ['*'],
        'pandas.core.series.Series.searchsorted': ['*'],
        'pandas.core.series.Series.shift': [
            'df.shift(periods=3)',
            'df.shift(periods=3, fill_value=0)',
        ],
        'pandas.core.series.Series.take': ['*'],
        'pandas.core.series.Series.to_dict': ['*'],
        'pandas.core.series.Series.unique': ['*'],
        'pandas.core.series.Series.unstack': ['*'],
        'pandas.core.series.Series.values': ['*'],
        'pandas.core.series.Series.view': ['*'],
        'pandas.core.series.Series.append': [
            's1.append(s2, ignore_index=True)',
        ],
        'pandas.core.series.Series.replace': [
            "s.replace([1, 2], method='bfill')",
            # Relies on method='pad'
            "s.replace('a')",
            # Relies on method='pad'
            # value=None is not valid for pandas < 1.4
            "s.replace('a', None)",
            # Implicitly uses method='pad', but the output does not depend
            # on that behavior. Verified independently in
            # frames_test.py::DeferredFrameTest::test_replace
            "df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'})"
        ],
        'pandas.core.series.Series.sort_index': ['*'],
        'pandas.core.series.Series.sort_values': ['*'],
        'pandas.core.series.Series.argmax': ['*'],
        'pandas.core.series.Series.argmin': ['*'],
        'pandas.core.series.Series.drop_duplicates': [
            's.drop_duplicates()',
            "s.drop_duplicates(keep='last')",
        ],
        'pandas.core.series.Series.reindex': ['*'],
        'pandas.core.series.Series.autocorr': ['*'],
        'pandas.core.series.Series.repeat': ['s.repeat([1, 2, 3])'],
        'pandas.core.series.Series.resample': ['*'],
        'pandas.core.series.Series': ['ser.iloc[0] = 999'],
    }

    not_implemented = {
        'pandas.core.series.Series.transform': [
            # str arg not supported. Tested with np.sum in
            # frames_test.py::DeferredFrameTest::test_groupby_transform_sum
            "df.groupby('Date')['Data'].transform('sum')",
        ],
        'pandas.core.series.Series.groupby': [
            'ser.groupby(["a", "b", "a", "b"]).mean()',
            'ser.groupby(["a", "b", "a", np.nan]).mean()',
            'ser.groupby(["a", "b", "a", np.nan], dropna=False).mean()',
        ],
    }

    skipped = {
        # Relies on setting values with iloc
        'pandas.core.series.Series': ['ser', 'r'],
        'pandas.core.series.Series.groupby': [
            # TODO(https://github.com/apache/beam/issues/20643): This
            # example requires aligning two series with non-unique indexes.
            # It only works in pandas because pandas can recognize the
            # indexes are identical and elide the alignment.
            'ser.groupby(ser > 100).mean()',
        ],
        'pandas.core.series.Series.asfreq': ['*'],
        # error formatting
        'pandas.core.series.Series.append': [
            's1.append(s2, verify_integrity=True)',
        ],
        'pandas.core.series.Series.cov': [
            # Differs in LSB on jenkins.
            "s1.cov(s2)",
        ],
        # Skipped idxmax/idxmin due to an issue with the test framework
        'pandas.core.series.Series.idxmin': ['s.idxmin()'],
        'pandas.core.series.Series.idxmax': ['s.idxmax()'],
        'pandas.core.series.Series.duplicated': ['*'],
        'pandas.core.series.Series.set_axis': ['*'],
        'pandas.core.series.Series.nonzero': ['*'],
        'pandas.core.series.Series.pop': ['ser'],  # testing side effect
        # Raises right exception, but testing framework has matching issues.
        'pandas.core.series.Series.replace': [
            "df.replace({'a string': 'new value', True: False}) # raises"
        ],
        'pandas.core.series.Series.searchsorted': [
            # This doctest seems to be incorrectly parsed.
            "x = pd.Categorical(['apple', 'bread', 'bread',"
        ],
        'pandas.core.series.Series.to_csv': ['*'],
        'pandas.core.series.Series.to_markdown': ['*'],
        'pandas.core.series.Series.update': ['*'],
        'pandas.core.series.Series.view': [
            # Inspection after modification.
            's'
        ],
        'pandas.core.series.Series.resample': ['df'],
    }

    outcome = doctests.testmod(
        pd.core.series,
        use_beam=False,
        report=True,
        wont_implement_ok=wont_implement,
        not_implemented_ok=not_implemented,
        skip=skipped)
    self.assertEqual(outcome.failed, 0)

  def test_string_tests(self):
    # Examples from the pandas string-accessor module. The definitions were
    # moved to pd.core.strings.accessor in pandas 1.2.0.
    strings_module = (
        pd.core.strings if PD_VERSION < (1, 2) else pd.core.strings.accessor)
    prefix = strings_module.__name__

    wont_implement = {
        # These methods can accept deferred series objects, but not lists
        f'{prefix}.StringMethods.cat': [
            "s.str.cat(['A', 'B', 'C', 'D'], sep=',')",
            "s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')",
            "s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')"
        ],
        f'{prefix}.StringMethods.repeat': ['s.str.repeat(repeats=[1, 2, 3])'],
        f'{prefix}.str_repeat': ['s.str.repeat(repeats=[1, 2, 3])'],
        # get_dummies pandas examples are not casted to CategoricalDtype
        # Must be CategoricalDtype to work in Beam
        f'{prefix}.StringMethods.get_dummies': ['*'],
        f'{prefix}.str_get_dummies': ['*'],
        f'{prefix}.StringMethods': ['s.str.split("_")'],
    }

    skipped = {
        # count() on Series with a NaN produces mismatched type if we
        # have a NaN-only partition.
        f'{prefix}.StringMethods.count': ["s.str.count('a')"],
        f'{prefix}.str_count': ["s.str.count('a')"],

        # Bad test strings in pandas 1.1.x
        f'{prefix}.str_replace': [
            "pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr)"
        ],
        f'{prefix}.StringMethods.replace': [
            "pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr)"
        ],

        # output has incorrect formatting in 1.2.x
        f'{prefix}.StringMethods.extractall': ['*'],

        # For split and rsplit, if expand=True, then the series
        # must be of CategoricalDtype, which pandas doesn't convert to
        f'{prefix}.StringMethods.rsplit': [
            's.str.split(r"\\+|=", expand=True)',  # for pandas<1.4
            's.str.split(expand=True)',
            's.str.rsplit("/", n=1, expand=True)',
            's.str.split(r"and|plus", expand=True)',
            's.str.split(r".", expand=True)',
            's.str.split(r"\\.jpg", expand=True)',
            's.str.split(r"\\.jpg", regex=True, expand=True)',
            's.str.split(re.compile(r"\\.jpg"), expand=True)',
            's.str.split(r"\\.jpg", regex=False, expand=True)'
        ],
        f'{prefix}.StringMethods.split': [
            's.str.split(r"\\+|=", expand=True)',  # for pandas<1.4
            's.str.split(expand=True)',
            's.str.rsplit("/", n=1, expand=True)',
            's.str.split(r"and|plus", expand=True)',
            's.str.split(r".", expand=True)',
            's.str.split(r"\\.jpg", expand=True)',
            's.str.split(r"\\.jpg", regex=True, expand=True)',
            's.str.split(re.compile(r"\\.jpg"), expand=True)',
            's.str.split(r"\\.jpg", regex=False, expand=True)'
        ],
    }

    outcome = doctests.testmod(
        strings_module,
        use_beam=False,
        wont_implement_ok=wont_implement,
        skip=skipped)
    self.assertEqual(outcome.failed, 0)

  def test_datetime_tests(self):
    # Examples from the datetime accessor and datetime array modules. All
    # three modules are run first; the failure counts are asserted afterwards.
    # TODO(BEAM-10721)
    accessor_skips = {
        'pandas.core.indexes.accessors.TimedeltaProperties': [
            # Seems like an upstream bug. The property is 'second'
            'seconds_series.dt.seconds'
        ],

        # TODO(https://github.com/apache/beam/issues/21013): Test data
        # creation fails for these
        # s = pd.Series(pd.to_timedelta(np.arange(5), unit="d"))
        # pylint: disable=line-too-long
        'pandas.core.indexes.accessors.DatetimeProperties.to_pydatetime': [
            '*'
        ],
        'pandas.core.indexes.accessors.TimedeltaProperties.components': [
            '*'
        ],
        'pandas.core.indexes.accessors.TimedeltaProperties.to_pytimedelta': [
            '*'
        ],
        # pylint: enable=line-too-long
    }
    indexes_accessors_outcome = doctests.testmod(
        pd.core.indexes.accessors, use_beam=False, skip=accessor_skips)

    datetimelike_outcome = doctests.testmod(
        pd.core.arrays.datetimelike, use_beam=False)

    datetimes_wont_implement = {
        'pandas.core.arrays.datetimes.DatetimeArray.to_period': ['*'],
        # All tz_localize tests use unsupported values for ambiguous=
        # Verified separately in
        # frames_test.py::DeferredFrameTest::test_dt_tz_localize_*
        'pandas.core.arrays.datetimes.DatetimeArray.tz_localize': ['*'],
    }
    datetimes_not_implemented = {
        # Verifies index version of this method
        'pandas.core.arrays.datetimes.DatetimeArray.to_period': [
            'df.index.to_period("M")'
        ],
    }
    datetimes_outcome = doctests.testmod(
        pd.core.arrays.datetimes,
        use_beam=False,
        wont_implement_ok=datetimes_wont_implement,
        not_implemented_ok=datetimes_not_implemented)

    self.assertEqual(indexes_accessors_outcome.failed, 0)
    self.assertEqual(datetimelike_outcome.failed, 0)
    self.assertEqual(datetimes_outcome.failed, 0)

  def test_indexing_tests(self):
    # Examples from pandas.core.indexing; every listed docstring is skipped.
    skipped = {
        'pandas.core.indexing._IndexSlice': ['*'],
        'pandas.core.indexing.IndexingMixin.at': ['*'],
        'pandas.core.indexing.IndexingMixin.iat': ['*'],
        'pandas.core.indexing.IndexingMixin.iloc': ['*'],
        'pandas.core.indexing.IndexingMixin.loc': ['*'],
        'pandas.core.indexing._AtIndexer': ['*'],
        'pandas.core.indexing._LocIndexer': ['*'],
        'pandas.core.indexing._iAtIndexer': ['*'],
        'pandas.core.indexing._iLocIndexer': ['*'],
    }
    outcome = doctests.testmod(pd.core.indexing, use_beam=False, skip=skipped)
    self.assertEqual(outcome.failed, 0)

  def test_groupby_tests(self):
    # Examples from pandas.core.groupby.groupby, then (only once the first
    # assertion has passed) from pandas.core.groupby.generic.
    groupby_wont_implement = {
        'pandas.core.groupby.groupby.GroupBy.first': ['*'],
        'pandas.core.groupby.groupby.GroupBy.head': ['*'],
        'pandas.core.groupby.groupby.GroupBy.last': ['*'],
        'pandas.core.groupby.groupby.GroupBy.tail': ['*'],
        'pandas.core.groupby.groupby.GroupBy.nth': ['*'],
        'pandas.core.groupby.groupby.GroupBy.cumcount': ['*'],
        'pandas.core.groupby.groupby.GroupBy.resample': ['*'],
    }
    groupby_not_implemented = {
        'pandas.core.groupby.groupby.GroupBy.first': ['*'],
        'pandas.core.groupby.groupby.GroupBy.last': ['*'],
        'pandas.core.groupby.groupby.GroupBy.ngroup': ['*'],
        'pandas.core.groupby.groupby.GroupBy.sample': ['*'],
        'pandas.core.groupby.groupby.GroupBy.rank': ['*'],
        'pandas.core.groupby.groupby.GroupBy.nth': [
            "df.groupby('A', as_index=False).nth(1)",
        ],
    }
    groupby_skipped = {
        # Uses iloc to mutate a DataFrame
        'pandas.core.groupby.groupby.GroupBy.resample': [
            'df.iloc[2, 0] = 5',
            'df',
        ],
        # df is reassigned
        'pandas.core.groupby.groupby.GroupBy.rank': ['df'],
        # TODO: Raise wont implement for list passed as a grouping column
        # Currently raises unhashable type: list
        'pandas.core.groupby.groupby.GroupBy.ngroup': [
            'df.groupby(["A", [1,1,2,3,2,1]]).ngroup()'
        ],
    }
    outcome = doctests.testmod(
        pd.core.groupby.groupby,
        use_beam=False,
        verbose=True,
        wont_implement_ok=groupby_wont_implement,
        not_implemented_ok=groupby_not_implemented,
        skip=groupby_skipped)
    self.assertEqual(outcome.failed, 0)

    generic_wont_implement = {
        # Returns an array by default, not a Series. WontImplement
        # (non-deferred)
        'pandas.core.groupby.generic.SeriesGroupBy.unique': ['*'],
        # TODO: Is take actually deprecated?
        'pandas.core.groupby.generic.DataFrameGroupBy.take': ['*'],
        'pandas.core.groupby.generic.SeriesGroupBy.take': ['*'],
        'pandas.core.groupby.generic.SeriesGroupBy.nsmallest': [
            "s.nsmallest(3, keep='last')",
            "s.nsmallest(3)",
            "s.nsmallest()",
        ],
        'pandas.core.groupby.generic.SeriesGroupBy.nlargest': [
            "s.nlargest(3, keep='last')",
            "s.nlargest(3)",
            "s.nlargest()",
        ],
        'pandas.core.groupby.generic.DataFrameGroupBy.diff': ['*'],
        'pandas.core.groupby.generic.SeriesGroupBy.diff': ['*'],
        'pandas.core.groupby.generic.DataFrameGroupBy.hist': ['*'],
        'pandas.core.groupby.generic.DataFrameGroupBy.fillna': [
            'df.fillna(method=\'ffill\')',
            'df.fillna(method="ffill")',
            'df.fillna(value=values, limit=1)',
        ],
        'pandas.core.groupby.generic.SeriesGroupBy.fillna': [
            'df.fillna(method=\'ffill\')',
            'df.fillna(method="ffill")',
            'df.fillna(value=values, limit=1)',
        ],
    }
    generic_not_implemented = {
        'pandas.core.groupby.generic.DataFrameGroupBy.idxmax': ['*'],
        'pandas.core.groupby.generic.DataFrameGroupBy.idxmin': ['*'],
        'pandas.core.groupby.generic.SeriesGroupBy.transform': ['*'],
        'pandas.core.groupby.generic.SeriesGroupBy.idxmax': ['*'],
        'pandas.core.groupby.generic.SeriesGroupBy.idxmin': ['*'],
        'pandas.core.groupby.generic.SeriesGroupBy.apply': ['*'],
    }
    generic_skipped = {
        'pandas.core.groupby.generic.SeriesGroupBy.cov': [
            # Floating point comparison fails
            's1.cov(s2)',
        ],
        'pandas.core.groupby.generic.DataFrameGroupBy.cov': [
            # Mutates input DataFrame with loc
            # TODO: Replicate in frames_test.py
            "df.loc[df.index[:5], 'a'] = np.nan",
            "df.loc[df.index[5:10], 'b'] = np.nan",
            "df.cov(min_periods=12)",
        ],
        # These examples rely on grouping by a list
        'pandas.core.groupby.generic.SeriesGroupBy.aggregate': ['*'],
        'pandas.core.groupby.generic.DataFrameGroupBy.aggregate': ['*'],
        'pandas.core.groupby.generic.SeriesGroupBy.transform': [
            # Dropping invalid columns during a transform is unsupported.
            'grouped.transform(lambda x: (x - x.mean()) / x.std())'
        ],
        'pandas.core.groupby.generic.DataFrameGroupBy.transform': [
            # Dropping invalid columns during a transform is unsupported.
            'grouped.transform(lambda x: (x - x.mean()) / x.std())'
        ],
        # Skipped idxmax/idxmin due to an issue with the test framework
        'pandas.core.groupby.generic.SeriesGroupBy.idxmin': ['s.idxmin()'],
        'pandas.core.groupby.generic.SeriesGroupBy.idxmax': ['s.idxmax()'],
        # Uses as_index, which is currently not_implemented
        'pandas.core.groupby.generic.DataFrameGroupBy.value_counts': [
            "df.groupby('gender', as_index=False).value_counts()",
            # pylint: disable=line-too-long
            "df.groupby('gender', as_index=False).value_counts(normalize=True)",
        ],
    }
    outcome = doctests.testmod(
        pd.core.groupby.generic,
        use_beam=False,
        wont_implement_ok=generic_wont_implement,
        not_implemented_ok=generic_not_implemented,
        skip=generic_skipped)
    self.assertEqual(outcome.failed, 0)

  def test_top_level(self):
    # Docstrings of the functions in the pandas top-level namespace that
    # _is_top_level_function accepts and that have a non-empty __doc__.
    docstrings_by_name = {
        name: func.__doc__
        for name, func in pd.__dict__.items()
        if _is_top_level_function(func) and getattr(func, '__doc__', None)
    }

    # IO methods are tested in io_test.py
    read_skips = {
        name: ['*']
        for name in dir(pd) if name.startswith('read_')
    }

    not_implemented = {
        'concat': ['pd.concat([s1, s2], ignore_index=True)'],
        'crosstab': ['*'],
        'cut': ['*'],
        'eval': ['*'],
        'from_dummies': ['*'],
        'get_dummies': ['*'],
        'infer_freq': ['*'],
        'lreshape': ['*'],
        'melt': ['*'],
        'merge': ["df1.merge(df2, how='cross')"],
        'merge_asof': ['*'],
        'pivot_table': ['*'],
        'qcut': ['*'],
        'reset_option': ['*'],
        'set_eng_float_format': ['*'],
        'set_option': ['*'],
        'to_numeric': ['*'],
        'to_timedelta': ['*'],
        'unique': ['*'],
        'wide_to_long': ['*'],
    }

    wont_implement = {
        'factorize': ['*'],
        'pivot': ['*'],
        'to_datetime': ['s.head()'],
        'to_pickle': ['*'],
        'melt': [
            "pd.melt(df, id_vars=['A'], value_vars=['B'])",
            "pd.melt(df, id_vars=['A'], value_vars=['B', 'C'])",
            "pd.melt(df, col_level=0, id_vars=['A'], value_vars=['B'])",
            "pd.melt(df, id_vars=[('A', 'D')], value_vars=[('B', 'E')])",
            "pd.melt(df, id_vars=['A'], value_vars=['B'],\n" +
            " var_name='myVarname', value_name='myValname')"
        ],
    }

    skipped = {
        # error formatting
        'concat': [
            'pd.concat([df5, df6], verify_integrity=True)',
            'pd.concat([df7, new_row.to_frame().T], ignore_index=True)'
        ],
        # doctest DeprecationWarning
        'melt': ['df'],
        # Order-sensitive re-indexing.
        'merge': [
            "df1.merge(df2, left_on='lkey', right_on='rkey')",
            "df1.merge(df2, left_on='lkey', right_on='rkey',\n"
            " suffixes=('_left', '_right'))",
            "df1.merge(df2, how='left', on='a')",
        ],
        # Not an actual test.
        'option_context': ['*'],
        'factorize': ['codes', 'uniques'],
        # Bad top-level use of un-imported function.
        'merge_ordered': [
            'merge_ordered(df1, df2, fill_method="ffill", left_by="group")'
        ],
        # Expected error.
        'pivot': [
            "df.pivot(index='foo', columns='bar', values='baz')",
            "df.pivot(index='foo', columns='bar')['baz']",
            "df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])",
            # pylint: disable=line-too-long
            'df.pivot(index="lev1", columns=["lev2", "lev3"],values="values")',
            # pylint: disable=line-too-long
            'df.pivot(index=["lev1", "lev2"], columns=["lev3"],values="values")'
        ],
        # Never written.
        'to_pickle': ['os.remove("./dummy.pkl")'],
    }
    skipped.update(read_skips)

    outcome = doctests.teststrings(
        docstrings_by_name,
        use_beam=False,
        report=True,
        not_implemented_ok=not_implemented,
        wont_implement_ok=wont_implement,
        skip=skipped)
    self.assertEqual(outcome.failed, 0)


if __name__ == '__main__':
  unittest.main()