github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/pandas_doctests_test.py

github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/pandas_doctests_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  import sys
    18  import unittest
    19  
    20  import pandas as pd
    21  
    22  from apache_beam.dataframe import doctests
    23  from apache_beam.dataframe.frames import PD_VERSION
    24  from apache_beam.dataframe.pandas_top_level_functions import _is_top_level_function
    25  
    26  
    27  @unittest.skipIf(
    28      sys.platform == 'win32', '[https://github.com/apache/beam/issues/20361]')
    29  class DoctestTest(unittest.TestCase):
    30    def test_ndframe_tests(self):
    31      # IO methods are tested in io_test.py
    32      skip_writes = {
    33          f'pandas.core.generic.NDFrame.{name}': ['*']
    34          for name in dir(pd.core.generic.NDFrame) if name.startswith('to_')
    35      }
    36  
    37      result = doctests.testmod(
    38          pd.core.generic,
    39          use_beam=False,
    40          report=True,
    41          wont_implement_ok={
    42              'pandas.core.generic.NDFrame.head': ['*'],
    43              'pandas.core.generic.NDFrame.shift': [
    44                  'df.shift(periods=3)',
    45                  'df.shift(periods=3, fill_value=0)',
    46              ],
    47              'pandas.core.generic.NDFrame.tail': ['*'],
    48              'pandas.core.generic.NDFrame.take': ['*'],
    49              'pandas.core.generic.NDFrame.values': ['*'],
    50              'pandas.core.generic.NDFrame.tz_localize': [
    51                  "s.tz_localize('CET', ambiguous='infer')",
    52                  # np.array is not a deferred object. This use-case is possible
    53                  # with a deferred Series though, which is tested in
    54                  # frames_test.py
    55                  "s.tz_localize('CET', ambiguous=np.array([True, True, False]))",
    56              ],
    57              'pandas.core.generic.NDFrame.truncate': [
    58                  # These inputs rely on tail (wont implement, order
    59                  # sensitive) for verification
    60                  "df.tail()",
    61                  "df.truncate(before=pd.Timestamp('2016-01-05'),\n"
    62                  "            after=pd.Timestamp('2016-01-10')).tail()",
    63                  "df.truncate('2016-01-05', '2016-01-10').tail()",
    64                  "df.loc['2016-01-05':'2016-01-10', :].tail()"
    65              ],
    66              'pandas.core.generic.NDFrame.replace': [
    67                  "s.replace([1, 2], method='bfill')",
    68                  # Relies on method='pad'
    69                  "s.replace('a')",
    70                  # Relies on method='pad'
    71                  # value=None is not valid for pandas < 1.4
    72                  "s.replace('a', None)",
    73                  # Implicitly uses method='pad', but output doesn't rely on that
    74                  # behavior. Verified indepently in
    75                  # frames_test.py::DeferredFrameTest::test_replace
    76                  "df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'})"
    77              ],
    78              'pandas.core.generic.NDFrame.fillna': [
    79                  'df.fillna(method=\'ffill\')',
    80                  'df.fillna(method="ffill")',
    81                  'df.fillna(value=values, limit=1)',
    82              ],
    83              'pandas.core.generic.NDFrame.sort_values': ['*'],
    84              'pandas.core.generic.NDFrame.mask': [
    85                  'df.where(m, -df) == np.where(m, df, -df)'
    86              ],
    87              'pandas.core.generic.NDFrame.where': [
    88                  'df.where(m, -df) == np.where(m, df, -df)'
    89              ],
    90              'pandas.core.generic.NDFrame.interpolate': ['*'],
    91              'pandas.core.generic.NDFrame.resample': ['*'],
    92              'pandas.core.generic.NDFrame.rolling': ['*'],
    93              # argsort wont implement
    94              'pandas.core.generic.NDFrame.abs': [
    95                  'df.loc[(df.c - 43).abs().argsort()]',
    96              ],
    97              'pandas.core.generic.NDFrame.reindex': ['*'],
    98              'pandas.core.generic.NDFrame.pct_change': ['*'],
    99              'pandas.core.generic.NDFrame.asof': ['*'],
   100              'pandas.core.generic.NDFrame.infer_objects': ['*'],
   101              'pandas.core.generic.NDFrame.ewm': ['*'],
   102              'pandas.core.generic.NDFrame.expanding': ['*'],
   103              'pandas.core.generic.NDFrame.get': ['*'],
   104          },
   105          not_implemented_ok={
   106              'pandas.core.generic.NDFrame.asof': ['*'],
   107              'pandas.core.generic.NDFrame.at_time': ['*'],
   108              'pandas.core.generic.NDFrame.between_time': ['*'],
   109              'pandas.core.generic.NDFrame.ewm': ['*'],
   110              'pandas.core.generic.NDFrame.expanding': ['*'],
   111              'pandas.core.generic.NDFrame.flags': ['*'],
   112              'pandas.core.generic.NDFrame.rank': ['*'],
   113              'pandas.core.generic.NDFrame.reindex_like': ['*'],
   114              'pandas.core.generic.NDFrame.replace': ['*'],
   115              'pandas.core.generic.NDFrame.sample': ['*'],
   116              'pandas.core.generic.NDFrame.set_flags': ['*'],
   117              'pandas.core.generic.NDFrame.squeeze': ['*'],
   118              'pandas.core.generic.NDFrame.truncate': ['*'],
   119          },
   120          skip={
   121              # Internal test
   122              'pandas.core.generic.NDFrame._set_axis_name': ['*'],
   123              # Fails to construct test series. asfreq is not implemented anyway.
   124              'pandas.core.generic.NDFrame.asfreq': ['*'],
   125              'pandas.core.generic.NDFrame.astype': ['*'],
   126              'pandas.core.generic.NDFrame.convert_dtypes': ['*'],
   127              'pandas.core.generic.NDFrame.copy': ['*'],
   128              'pandas.core.generic.NDFrame.droplevel': ['*'],
   129              'pandas.core.generic.NDFrame.get': ['*'],
   130              'pandas.core.generic.NDFrame.rank': ['*'],
   131              'pandas.core.generic.NDFrame.rename': [
   132                  # Seems to be an upstream bug. The actual error has a different
   133                  # message:
   134                  #   TypeError: Index(...) must be called with a collection of
   135                  #   some kind, 2 was passed
   136                  # pandas doctests only verify the type of exception
   137                  'df.rename(2)'
   138              ],
   139              # For pandas >= 1.4, rename is changed to _rename
   140              'pandas.core.generic.NDFrame._rename': [
   141                  # Seems to be an upstream bug. The actual error has a different
   142                  # message:
   143                  #   TypeError: Index(...) must be called with a collection of
   144                  #   some kind, 2 was passed
   145                  # pandas doctests only verify the type of exception
   146                  'df.rename(2)'
   147              ],
   148              # Tests rely on setting index
   149              'pandas.core.generic.NDFrame.rename_axis': ['*'],
   150              # Raises right exception, but testing framework has matching issues.
   151              'pandas.core.generic.NDFrame.replace': [
   152                  "df.replace({'a string': 'new value', True: False})  # raises"
   153              ],
   154              'pandas.core.generic.NDFrame.squeeze': ['*'],
   155  
   156              # NameError
   157              'pandas.core.generic.NDFrame.resample': ['df'],
   158  
   159              # Skipped so we don't need to install natsort
   160              'pandas.core.generic.NDFrame.sort_values': [
   161                  'from natsort import index_natsorted',
   162                  'df.sort_values(\n'
   163                  '   by="time",\n'
   164                  '   key=lambda x: np.argsort(index_natsorted(df["time"]))\n'
   165                  ')'
   166              ],
   167              **skip_writes
   168          })
   169      self.assertEqual(result.failed, 0)
   170  
   171    def test_dataframe_tests(self):
   172      result = doctests.testmod(
   173          pd.core.frame,
   174          use_beam=False,
   175          report=True,
   176          wont_implement_ok={
   177              'pandas.core.frame.DataFrame.T': ['*'],
   178              'pandas.core.frame.DataFrame.cummax': ['*'],
   179              'pandas.core.frame.DataFrame.cummin': ['*'],
   180              'pandas.core.frame.DataFrame.cumsum': ['*'],
   181              'pandas.core.frame.DataFrame.cumprod': ['*'],
   182              'pandas.core.frame.DataFrame.diff': ['*'],
   183              'pandas.core.frame.DataFrame.fillna': [
   184                  'df.fillna(method=\'ffill\')',
   185                  'df.fillna(method="ffill")',
   186                  'df.fillna(value=values, limit=1)',
   187              ],
   188              'pandas.core.frame.DataFrame.items': ['*'],
   189              'pandas.core.frame.DataFrame.itertuples': ['*'],
   190              'pandas.core.frame.DataFrame.iterrows': ['*'],
   191              'pandas.core.frame.DataFrame.iteritems': ['*'],
   192              # default keep is 'first'
   193              'pandas.core.frame.DataFrame.nlargest': [
   194                  "df.nlargest(3, 'population')",
   195                  "df.nlargest(3, ['population', 'GDP'])",
   196                  "df.nlargest(3, 'population', keep='last')"
   197              ],
   198              'pandas.core.frame.DataFrame.nsmallest': [
   199                  "df.nsmallest(3, 'population')",
   200                  "df.nsmallest(3, ['population', 'GDP'])",
   201                  "df.nsmallest(3, 'population', keep='last')",
   202              ],
   203              'pandas.core.frame.DataFrame.replace': [
   204                  "s.replace([1, 2], method='bfill')",
   205                  # Relies on method='pad'
   206                  "s.replace('a')",
   207                  # Relies on method='pad'
   208                  # value=None is not valid for pandas < 1.4
   209                  "s.replace('a', None)",
   210                  # Implicitly uses method='pad', but output doesn't rely on that
   211                  # behavior. Verified indepently in
   212                  # frames_test.py::DeferredFrameTest::test_replace
   213                  "df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'})"
   214              ],
   215              'pandas.core.frame.DataFrame.to_records': ['*'],
   216              'pandas.core.frame.DataFrame.to_dict': ['*'],
   217              'pandas.core.frame.DataFrame.to_numpy': ['*'],
   218              'pandas.core.frame.DataFrame.to_string': ['*'],
   219              'pandas.core.frame.DataFrame.transpose': ['*'],
   220              'pandas.core.frame.DataFrame.shape': ['*'],
   221              'pandas.core.frame.DataFrame.shift': [
   222                  'df.shift(periods=3)',
   223                  'df.shift(periods=3, fill_value=0)',
   224              ],
   225              'pandas.core.frame.DataFrame.unstack': ['*'],
   226              'pandas.core.frame.DataFrame.memory_usage': ['*'],
   227              'pandas.core.frame.DataFrame.info': ['*'],
   228              # Not equal to df.agg('mode', axis='columns', numeric_only=True)
   229              # because there can be multiple columns if a row has more than one
   230              # mode
   231              'pandas.core.frame.DataFrame.mode': [
   232                  "df.mode(axis='columns', numeric_only=True)"
   233              ],
   234              'pandas.core.frame.DataFrame.append': [
   235                  'df.append(df2, ignore_index=True)',
   236                  "for i in range(5):\n" +
   237                  "    df = df.append({'A': i}, ignore_index=True)",
   238              ],
   239              'pandas.core.frame.DataFrame.sort_index': ['*'],
   240              'pandas.core.frame.DataFrame.sort_values': ['*'],
   241              'pandas.core.frame.DataFrame.melt': [
   242                  "df.melt(id_vars=['A'], value_vars=['B'])",
   243                  "df.melt(id_vars=['A'], value_vars=['B', 'C'])",
   244                  "df.melt(col_level=0, id_vars=['A'], value_vars=['B'])",
   245                  "df.melt(id_vars=[('A', 'D')], value_vars=[('B', 'E')])",
   246                  "df.melt(id_vars=['A'], value_vars=['B'],\n" +
   247                  "        var_name='myVarname', value_name='myValname')"
   248              ],
   249              # Most keep= options are order-sensitive
   250              'pandas.core.frame.DataFrame.drop_duplicates': ['*'],
   251              'pandas.core.frame.DataFrame.duplicated': [
   252                  'df.duplicated()',
   253                  "df.duplicated(keep='last')",
   254                  "df.duplicated(subset=['brand'])",
   255              ],
   256              'pandas.core.frame.DataFrame.reindex': ['*'],
   257              'pandas.core.frame.DataFrame.dot': [
   258                  # reindex not supported
   259                  's2 = s.reindex([1, 0, 2, 3])',
   260              ],
   261              'pandas.core.frame.DataFrame.resample': ['*'],
   262              'pandas.core.frame.DataFrame.values': ['*'],
   263          },
   264          not_implemented_ok={
   265              'pandas.core.frame.DataFrame.transform': [
   266                  # str arg not supported. Tested with np.sum in
   267                  # frames_test.py::DeferredFrameTest::test_groupby_transform_sum
   268                  "df.groupby('Date')['Data'].transform('sum')",
   269              ],
   270              'pandas.core.frame.DataFrame.melt': ['*'],
   271              'pandas.core.frame.DataFrame.reindex_axis': ['*'],
   272              'pandas.core.frame.DataFrame.round': [
   273                  'df.round(decimals)',
   274              ],
   275  
   276              # Trivially elementwise for axis=columns. Relies on global indexing
   277              # for axis=rows.
   278              # Difficult to determine proxy, need to inspect function
   279              'pandas.core.frame.DataFrame.apply': ['*'],
   280  
   281              # Cross-join not implemented
   282              'pandas.core.frame.DataFrame.merge': [
   283                  "df1.merge(df2, how='cross')"
   284              ],
   285  
   286              # TODO(https://github.com/apache/beam/issues/20759)
   287              'pandas.core.frame.DataFrame.set_index': [
   288                  "df.set_index([s, s**2])",
   289              ],
   290  
   291              'pandas.core.frame.DataFrame.set_axis': [
   292                  "df.set_axis(range(0,2), axis='index')",
   293              ],
   294  
   295              # TODO(https://github.com/apache/beam/issues/21014)
   296              'pandas.core.frame.DataFrame.value_counts': [
   297                'df.value_counts(dropna=False)'
   298              ],
   299          },
   300          skip={
   301              # DataFrame construction from a dictionary and
   302              # Series requires using the len() function, which
   303              # is a non-deferred operation that we do not allow
   304              'pandas.core.frame.DataFrame': [
   305                  'pd.DataFrame(data=d, index=[0, 1, 2, 3])',
   306              ],
   307              # s2 created with reindex
   308              'pandas.core.frame.DataFrame.dot': [
   309                  'df.dot(s2)',
   310              ],
   311  
   312              'pandas.core.frame.DataFrame.resample': ['df'],
   313              'pandas.core.frame.DataFrame.asfreq': ['*'],
   314              # Throws NotImplementedError when modifying df
   315              'pandas.core.frame.DataFrame.axes': [
   316                  # Returns deferred index.
   317                  'df.axes',
   318              ],
   319              # Skipped because the relies on loc to set cells in df2
   320              'pandas.core.frame.DataFrame.compare': ['*'],
   321              'pandas.core.frame.DataFrame.cov': [
   322                  # Relies on setting entries ahead of time.
   323                  "df.loc[df.index[:5], 'a'] = np.nan",
   324                  "df.loc[df.index[5:10], 'b'] = np.nan",
   325                  'df.cov(min_periods=12)',
   326              ],
   327              'pandas.core.frame.DataFrame.rename': [
   328                  # Returns deferred index.
   329                  'df.index',
   330                  'df.rename(index=str).index',
   331              ],
   332              'pandas.core.frame.DataFrame.set_index': [
   333                  # TODO(https://github.com/apache/beam/issues/20759): This could
   334                  # pass in the index as a DeferredIndex, and we should fail it
   335                  # as order-sensitive.
   336                  "df.set_index([pd.Index([1, 2, 3, 4]), 'year'])",
   337              ],
   338              'pandas.core.frame.DataFrame.set_axis': [
   339                  # This should pass as set_axis(axis='columns')
   340                  # and fail with set_axis(axis='index')
   341                  "df.set_axis(['a', 'b', 'c'], axis='index')"
   342              ],
   343              'pandas.core.frame.DataFrame.to_markdown': ['*'],
   344              'pandas.core.frame.DataFrame.to_parquet': ['*'],
   345  
   346              # Raises right exception, but testing framework has matching issues.
   347              # Tested in `frames_test.py`.
   348              'pandas.core.frame.DataFrame.insert': [
   349                  'df',
   350                  'df.insert(1, "newcol", [99, 99])',
   351                  'df.insert(0, "col1", [100, 100], allow_duplicates=True)'
   352              ],
   353  
   354              'pandas.core.frame.DataFrame.to_records': [
   355                  'df.index = df.index.rename("I")',
   356                  'index_dtypes = f"<S{df.index.str.len().max()}"', # 1.x
   357                  'index_dtypes = "<S{}".format(df.index.str.len().max())', #0.x
   358                  'df.to_records(index_dtypes=index_dtypes)',
   359              ],
   360              # These tests use the static method pd.pivot_table, which doesn't
   361              # actually raise NotImplementedError
   362              'pandas.core.frame.DataFrame.pivot_table': ['*'],
   363              # Expected to raise a ValueError, but we raise NotImplementedError
   364              'pandas.core.frame.DataFrame.pivot': [
   365                  "df.pivot(index='foo', columns='bar', values='baz')",
   366                  "df.pivot(index='foo', columns='bar')['baz']",
   367                  "df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])",
   368                  # pylint: disable=line-too-long
   369                  'df.pivot(index="lev1", columns=["lev2", "lev3"],values="values")',
   370                  # pylint: disable=line-too-long
   371                  'df.pivot(index=["lev1", "lev2"], columns=["lev3"],values="values")'
   372              ],
   373              'pandas.core.frame.DataFrame.append': [
   374                  'df',
   375                  # pylint: disable=line-too-long
   376                  "pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],\n"
   377                  "          ignore_index=True)"
   378              ],
   379              'pandas.core.frame.DataFrame.eval': ['df'],
   380              'pandas.core.frame.DataFrame.melt': [
   381                  "df.columns = [list('ABC'), list('DEF')]", "df"
   382              ],
   383              'pandas.core.frame.DataFrame.merge': [
   384                  # Order-sensitive index, checked in frames_test.py.
   385                  "df1.merge(df2, left_on='lkey', right_on='rkey')",
   386                  "df1.merge(df2, left_on='lkey', right_on='rkey',\n"
   387                  "          suffixes=('_left', '_right'))",
   388                  "df1.merge(df2, how='left', on='a')",
   389              ],
   390              # Raises right exception, but testing framework has matching issues.
   391              'pandas.core.frame.DataFrame.replace': [
   392                  "df.replace({'a string': 'new value', True: False})  # raises"
   393              ],
   394              'pandas.core.frame.DataFrame.to_sparse': ['type(df)'],
   395  
   396              # Skipped because "seen_wont_implement" is reset before getting to
   397              # these calls, so the NameError they raise is not ignored.
   398              'pandas.core.frame.DataFrame.T': [
   399                  'df1_transposed.dtypes', 'df2_transposed.dtypes'
   400              ],
   401              'pandas.core.frame.DataFrame.transpose': [
   402                  'df1_transposed.dtypes', 'df2_transposed.dtypes'
   403              ],
   404              # Skipped because the relies on iloc to set a cell to NA. Test is
   405              # replicated in frames_test::DeferredFrameTest::test_applymap.
   406              'pandas.core.frame.DataFrame.applymap': [
   407                  'df_copy.iloc[0, 0] = pd.NA',
   408                  "df_copy.applymap(lambda x: len(str(x)), na_action='ignore')",
   409              ],
   410              # Skipped so we don't need to install natsort
   411              'pandas.core.frame.DataFrame.sort_values': [
   412                  'from natsort import index_natsorted',
   413                  'df.sort_values(\n'
   414                  '   by="time",\n'
   415                  '   key=lambda x: np.argsort(index_natsorted(df["time"]))\n'
   416                  ')'
   417              ],
   418              # Mode that we don't yet support, documentation added in pandas
   419              # 1.2.0 (https://github.com/pandas-dev/pandas/issues/35912)
   420              'pandas.core.frame.DataFrame.aggregate': [
   421                  "df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean))"
   422              ],
   423          })
   424      self.assertEqual(result.failed, 0)
   425  
   426    def test_series_tests(self):
   427      result = doctests.testmod(
   428          pd.core.series,
   429          use_beam=False,
   430          report=True,
   431          wont_implement_ok={
   432              'pandas.core.series.Series.__array__': ['*'],
   433              'pandas.core.series.Series.array': ['*'],
   434              'pandas.core.series.Series.cummax': ['*'],
   435              'pandas.core.series.Series.cummin': ['*'],
   436              'pandas.core.series.Series.cumsum': ['*'],
   437              'pandas.core.series.Series.cumprod': ['*'],
   438              'pandas.core.series.Series.diff': ['*'],
   439              'pandas.core.series.Series.dot': [
   440                  's.dot(arr)',  # non-deferred result
   441              ],
   442              'pandas.core.series.Series.fillna': [
   443                  'df.fillna(method=\'ffill\')',
   444                  'df.fillna(method="ffill")',
   445                  'df.fillna(value=values, limit=1)',
   446              ],
   447              'pandas.core.series.Series.info': ['*'],
   448              'pandas.core.series.Series.items': ['*'],
   449              'pandas.core.series.Series.iteritems': ['*'],
   450              # default keep is 'first'
   451              'pandas.core.series.Series.nlargest': [
   452                  "s.nlargest()",
   453                  "s.nlargest(3)",
   454                  "s.nlargest(3, keep='last')",
   455              ],
   456              'pandas.core.series.Series.memory_usage': ['*'],
   457              'pandas.core.series.Series.nsmallest': [
   458                  "s.nsmallest()",
   459                  "s.nsmallest(3)",
   460                  "s.nsmallest(3, keep='last')",
   461              ],
   462              'pandas.core.series.Series.pop': ['*'],
   463              'pandas.core.series.Series.searchsorted': ['*'],
   464              'pandas.core.series.Series.shift': [
   465                  'df.shift(periods=3)',
   466                  'df.shift(periods=3, fill_value=0)',
   467              ],
   468              'pandas.core.series.Series.take': ['*'],
   469              'pandas.core.series.Series.to_dict': ['*'],
   470              'pandas.core.series.Series.unique': ['*'],
   471              'pandas.core.series.Series.unstack': ['*'],
   472              'pandas.core.series.Series.values': ['*'],
   473              'pandas.core.series.Series.view': ['*'],
   474              'pandas.core.series.Series.append': [
   475                  's1.append(s2, ignore_index=True)',
   476              ],
   477              'pandas.core.series.Series.replace': [
   478                  "s.replace([1, 2], method='bfill')",
   479                  # Relies on method='pad'
   480                  "s.replace('a')",
   481                  # Relies on method='pad'
   482                  # value=None is not valid for pandas < 1.4
   483                  "s.replace('a', None)",
   484                  # Implicitly uses method='pad', but output doesn't rely on that
   485                  # behavior. Verified indepently in
   486                  # frames_test.py::DeferredFrameTest::test_replace
   487                  "df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'})"
   488              ],
   489              'pandas.core.series.Series.sort_index': ['*'],
   490              'pandas.core.series.Series.sort_values': ['*'],
   491              'pandas.core.series.Series.argmax': ['*'],
   492              'pandas.core.series.Series.argmin': ['*'],
   493              'pandas.core.series.Series.drop_duplicates': [
   494                  's.drop_duplicates()',
   495                  "s.drop_duplicates(keep='last')",
   496              ],
   497              'pandas.core.series.Series.reindex': ['*'],
   498              'pandas.core.series.Series.autocorr': ['*'],
   499              'pandas.core.series.Series.repeat': ['s.repeat([1, 2, 3])'],
   500              'pandas.core.series.Series.resample': ['*'],
   501              'pandas.core.series.Series': ['ser.iloc[0] = 999'],
   502          },
   503          not_implemented_ok={
   504              'pandas.core.series.Series.transform': [
   505                  # str arg not supported. Tested with np.sum in
   506                  # frames_test.py::DeferredFrameTest::test_groupby_transform_sum
   507                  "df.groupby('Date')['Data'].transform('sum')",
   508              ],
   509              'pandas.core.series.Series.groupby': [
   510                  'ser.groupby(["a", "b", "a", "b"]).mean()',
   511                  'ser.groupby(["a", "b", "a", np.nan]).mean()',
   512                  'ser.groupby(["a", "b", "a", np.nan], dropna=False).mean()',
   513              ],
   514          },
   515          skip={
   516              # Relies on setting values with iloc
   517              'pandas.core.series.Series': ['ser', 'r'],
   518              'pandas.core.series.Series.groupby': [
   519                  # TODO(https://github.com/apache/beam/issues/20643): This
   520                  # example requires aligning two series with non-unique indexes.
   521                  # It only works in pandas because pandas can recognize the
   522                  # indexes are identical and elide the alignment.
   523                  'ser.groupby(ser > 100).mean()',
   524              ],
   525              'pandas.core.series.Series.asfreq': ['*'],
   526              # error formatting
   527              'pandas.core.series.Series.append': [
   528                  's1.append(s2, verify_integrity=True)',
   529              ],
   530              'pandas.core.series.Series.cov': [
   531                  # Differs in LSB on jenkins.
   532                  "s1.cov(s2)",
   533              ],
   534              # Skipped idxmax/idxmin due an issue with the test framework
   535              'pandas.core.series.Series.idxmin': ['s.idxmin()'],
   536              'pandas.core.series.Series.idxmax': ['s.idxmax()'],
   537              'pandas.core.series.Series.duplicated': ['*'],
   538              'pandas.core.series.Series.set_axis': ['*'],
   539              'pandas.core.series.Series.nonzero': ['*'],
   540              'pandas.core.series.Series.pop': ['ser'],  # testing side effect
   541              # Raises right exception, but testing framework has matching issues.
   542              'pandas.core.series.Series.replace': [
   543                  "df.replace({'a string': 'new value', True: False})  # raises"
   544              ],
   545              'pandas.core.series.Series.searchsorted': [
   546                  # This doctest seems to be incorrectly parsed.
   547                  "x = pd.Categorical(['apple', 'bread', 'bread',"
   548              ],
   549              'pandas.core.series.Series.to_csv': ['*'],
   550              'pandas.core.series.Series.to_markdown': ['*'],
   551              'pandas.core.series.Series.update': ['*'],
   552              'pandas.core.series.Series.view': [
   553                  # Inspection after modification.
   554                  's'
   555              ],
   556              'pandas.core.series.Series.resample': ['df'],
   557          })
   558      self.assertEqual(result.failed, 0)
   559  
   560    def test_string_tests(self):
   561      if PD_VERSION < (1, 2):
   562        module = pd.core.strings
   563      else:
   564        # Definitions were moved to accessor in pandas 1.2.0
   565        module = pd.core.strings.accessor
   566  
   567      module_name = module.__name__
   568  
   569      result = doctests.testmod(
   570          module,
   571          use_beam=False,
   572          wont_implement_ok={
   573              # These methods can accept deferred series objects, but not lists
   574              f'{module_name}.StringMethods.cat': [
   575                  "s.str.cat(['A', 'B', 'C', 'D'], sep=',')",
   576                  "s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')",
   577                  "s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')"
   578              ],
   579              f'{module_name}.StringMethods.repeat': [
   580                  's.str.repeat(repeats=[1, 2, 3])'
   581              ],
   582              f'{module_name}.str_repeat': ['s.str.repeat(repeats=[1, 2, 3])'],
   583              # get_dummies pandas examples are not casted to CategoricalDtype
   584              # Must be CategoricalDtype to work in Beam
   585              f'{module_name}.StringMethods.get_dummies': ['*'],
   586              f'{module_name}.str_get_dummies': ['*'],
   587              f'{module_name}.StringMethods': ['s.str.split("_")'],
   588          },
   589          skip={
   590              # count() on Series with a NaN produces mismatched type if we
   591              # have a NaN-only partition.
   592              f'{module_name}.StringMethods.count': ["s.str.count('a')"],
   593              f'{module_name}.str_count': ["s.str.count('a')"],
   594  
   595              # Bad test strings in pandas 1.1.x
   596              f'{module_name}.str_replace': [
   597                  "pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr)"
   598              ],
   599              f'{module_name}.StringMethods.replace': [
   600                  "pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr)"
   601              ],
   602  
   603              # output has incorrect formatting in 1.2.x
   604              f'{module_name}.StringMethods.extractall': ['*'],
   605  
   606              # For split and rsplit, if expand=True, then the series
   607              # must be of CategoricalDtype, which pandas doesn't convert to
   608              f'{module_name}.StringMethods.rsplit': [
   609                  's.str.split(r"\\+|=", expand=True)', # for pandas<1.4
   610                  's.str.split(expand=True)',
   611                  's.str.rsplit("/", n=1, expand=True)',
   612                  's.str.split(r"and|plus", expand=True)',
   613                  's.str.split(r".", expand=True)',
   614                  's.str.split(r"\\.jpg", expand=True)',
   615                  's.str.split(r"\\.jpg", regex=True, expand=True)',
   616                  's.str.split(re.compile(r"\\.jpg"), expand=True)',
   617                  's.str.split(r"\\.jpg", regex=False, expand=True)'
   618              ],
   619              f'{module_name}.StringMethods.split': [
   620                  's.str.split(r"\\+|=", expand=True)', # for pandas<1.4
   621                  's.str.split(expand=True)',
   622                  's.str.rsplit("/", n=1, expand=True)',
   623                  's.str.split(r"and|plus", expand=True)',
   624                  's.str.split(r".", expand=True)',
   625                  's.str.split(r"\\.jpg", expand=True)',
   626                  's.str.split(r"\\.jpg", regex=True, expand=True)',
   627                  's.str.split(re.compile(r"\\.jpg"), expand=True)',
   628                  's.str.split(r"\\.jpg", regex=False, expand=True)'
   629              ]
   630          })
   631      self.assertEqual(result.failed, 0)
   632  
   633    def test_datetime_tests(self):
   634      # TODO(BEAM-10721)
   635      indexes_accessors_result = doctests.testmod(
   636          pd.core.indexes.accessors,
   637          use_beam=False,
   638          skip={
   639              'pandas.core.indexes.accessors.TimedeltaProperties': [
   640                  # Seems like an upstream bug. The property is 'second'
   641                  'seconds_series.dt.seconds'
   642              ],
   643  
   644              # TODO(https://github.com/apache/beam/issues/21013): Test data
   645              # creation fails for these
   646              #   s = pd.Series(pd.to_timedelta(np.arange(5), unit="d"))
   647              # pylint: disable=line-too-long
   648              'pandas.core.indexes.accessors.DatetimeProperties.to_pydatetime': [
   649                  '*'
   650              ],
   651              'pandas.core.indexes.accessors.TimedeltaProperties.components': [
   652                  '*'
   653              ],
   654              'pandas.core.indexes.accessors.TimedeltaProperties.to_pytimedelta': [
   655                  '*'
   656              ],
   657              # pylint: enable=line-too-long
   658          })
   659      datetimelike_result = doctests.testmod(
   660          pd.core.arrays.datetimelike, use_beam=False)
   661  
   662      datetime_result = doctests.testmod(
   663          pd.core.arrays.datetimes,
   664          use_beam=False,
   665          wont_implement_ok={
   666              'pandas.core.arrays.datetimes.DatetimeArray.to_period': ['*'],
   667              # All tz_localize tests use unsupported values for ambiguous=
   668              # Verified seperately in
   669              # frames_test.py::DeferredFrameTest::test_dt_tz_localize_*
   670              'pandas.core.arrays.datetimes.DatetimeArray.tz_localize': ['*'],
   671          },
   672          not_implemented_ok={
   673              # Verifies index version of this method
   674              'pandas.core.arrays.datetimes.DatetimeArray.to_period': [
   675                  'df.index.to_period("M")'
   676              ],
   677          })
   678  
   679      self.assertEqual(indexes_accessors_result.failed, 0)
   680      self.assertEqual(datetimelike_result.failed, 0)
   681      self.assertEqual(datetime_result.failed, 0)
   682  
   683    def test_indexing_tests(self):
   684      result = doctests.testmod(
   685          pd.core.indexing,
   686          use_beam=False,
   687          skip={
   688              'pandas.core.indexing._IndexSlice': ['*'],
   689              'pandas.core.indexing.IndexingMixin.at': ['*'],
   690              'pandas.core.indexing.IndexingMixin.iat': ['*'],
   691              'pandas.core.indexing.IndexingMixin.iloc': ['*'],
   692              'pandas.core.indexing.IndexingMixin.loc': ['*'],
   693              'pandas.core.indexing._AtIndexer': ['*'],
   694              'pandas.core.indexing._LocIndexer': ['*'],
   695              'pandas.core.indexing._iAtIndexer': ['*'],
   696              'pandas.core.indexing._iLocIndexer': ['*'],
   697          })
   698      self.assertEqual(result.failed, 0)
   699  
   700    def test_groupby_tests(self):
   701      result = doctests.testmod(
   702          pd.core.groupby.groupby,
   703          use_beam=False,
   704          verbose=True,
   705          wont_implement_ok={
   706              'pandas.core.groupby.groupby.GroupBy.first': ['*'],
   707              'pandas.core.groupby.groupby.GroupBy.head': ['*'],
   708              'pandas.core.groupby.groupby.GroupBy.last': ['*'],
   709              'pandas.core.groupby.groupby.GroupBy.tail': ['*'],
   710              'pandas.core.groupby.groupby.GroupBy.nth': ['*'],
   711              'pandas.core.groupby.groupby.GroupBy.cumcount': ['*'],
   712              'pandas.core.groupby.groupby.GroupBy.resample': ['*'],
   713          },
   714          not_implemented_ok={
   715              'pandas.core.groupby.groupby.GroupBy.first': ['*'],
   716              'pandas.core.groupby.groupby.GroupBy.last': ['*'],
   717              'pandas.core.groupby.groupby.GroupBy.ngroup': ['*'],
   718              'pandas.core.groupby.groupby.GroupBy.sample': ['*'],
   719              'pandas.core.groupby.groupby.GroupBy.rank': ['*'],
   720              'pandas.core.groupby.groupby.GroupBy.nth': [
   721                  "df.groupby('A', as_index=False).nth(1)",
   722              ],
   723          },
   724          skip={
   725              # Uses iloc to mutate a DataFrame
   726              'pandas.core.groupby.groupby.GroupBy.resample': [
   727                  'df.iloc[2, 0] = 5',
   728                  'df',
   729              ],
   730              # df is reassigned
   731              'pandas.core.groupby.groupby.GroupBy.rank': ['df'],
   732              # TODO: Raise wont implement for list passed as a grouping column
   733              # Currently raises unhashable type: list
   734              'pandas.core.groupby.groupby.GroupBy.ngroup': [
   735                  'df.groupby(["A", [1,1,2,3,2,1]]).ngroup()'
   736              ],
   737          })
   738      self.assertEqual(result.failed, 0)
   739  
   740      result = doctests.testmod(
   741          pd.core.groupby.generic,
   742          use_beam=False,
   743          wont_implement_ok={
   744              # Returns an array by default, not a Series. WontImplement
   745              # (non-deferred)
   746              'pandas.core.groupby.generic.SeriesGroupBy.unique': ['*'],
   747              # TODO: Is take actually deprecated?
   748              'pandas.core.groupby.generic.DataFrameGroupBy.take': ['*'],
   749              'pandas.core.groupby.generic.SeriesGroupBy.take': ['*'],
   750              'pandas.core.groupby.generic.SeriesGroupBy.nsmallest': [
   751                  "s.nsmallest(3, keep='last')",
   752                  "s.nsmallest(3)",
   753                  "s.nsmallest()",
   754              ],
   755              'pandas.core.groupby.generic.SeriesGroupBy.nlargest': [
   756                  "s.nlargest(3, keep='last')",
   757                  "s.nlargest(3)",
   758                  "s.nlargest()",
   759              ],
   760              'pandas.core.groupby.generic.DataFrameGroupBy.diff': ['*'],
   761              'pandas.core.groupby.generic.SeriesGroupBy.diff': ['*'],
   762              'pandas.core.groupby.generic.DataFrameGroupBy.hist': ['*'],
   763              'pandas.core.groupby.generic.DataFrameGroupBy.fillna': [
   764                  'df.fillna(method=\'ffill\')',
   765                  'df.fillna(method="ffill")',
   766                  'df.fillna(value=values, limit=1)',
   767              ],
   768              'pandas.core.groupby.generic.SeriesGroupBy.fillna': [
   769                  'df.fillna(method=\'ffill\')',
   770                  'df.fillna(method="ffill")',
   771                  'df.fillna(value=values, limit=1)',
   772              ],
   773          },
   774          not_implemented_ok={
   775              'pandas.core.groupby.generic.DataFrameGroupBy.idxmax': ['*'],
   776              'pandas.core.groupby.generic.DataFrameGroupBy.idxmin': ['*'],
   777              'pandas.core.groupby.generic.SeriesGroupBy.transform': ['*'],
   778              'pandas.core.groupby.generic.SeriesGroupBy.idxmax': ['*'],
   779              'pandas.core.groupby.generic.SeriesGroupBy.idxmin': ['*'],
   780              'pandas.core.groupby.generic.SeriesGroupBy.apply': ['*'],
   781          },
   782          skip={
   783              'pandas.core.groupby.generic.SeriesGroupBy.cov': [
   784                  # Floating point comparison fails
   785                  's1.cov(s2)',
   786              ],
   787              'pandas.core.groupby.generic.DataFrameGroupBy.cov': [
   788                  # Mutates input DataFrame with loc
   789                  # TODO: Replicate in frames_test.py
   790                  "df.loc[df.index[:5], 'a'] = np.nan",
   791                  "df.loc[df.index[5:10], 'b'] = np.nan",
   792                  "df.cov(min_periods=12)",
   793              ],
   794              # These examples rely on grouping by a list
   795              'pandas.core.groupby.generic.SeriesGroupBy.aggregate': ['*'],
   796              'pandas.core.groupby.generic.DataFrameGroupBy.aggregate': ['*'],
   797              'pandas.core.groupby.generic.SeriesGroupBy.transform': [
   798                  # Dropping invalid columns during a transform is unsupported.
   799                  'grouped.transform(lambda x: (x - x.mean()) / x.std())'
   800              ],
   801              'pandas.core.groupby.generic.DataFrameGroupBy.transform': [
   802                  # Dropping invalid columns during a transform is unsupported.
   803                  'grouped.transform(lambda x: (x - x.mean()) / x.std())'
   804              ],
   805              # Skipped idxmax/idxmin due an issue with the test framework
   806              'pandas.core.groupby.generic.SeriesGroupBy.idxmin': ['s.idxmin()'],
   807              'pandas.core.groupby.generic.SeriesGroupBy.idxmax': ['s.idxmax()'],
   808              # Uses as_index, which is currently not_implemented
   809              'pandas.core.groupby.generic.DataFrameGroupBy.value_counts': [
   810                  "df.groupby('gender', as_index=False).value_counts()",
   811                  # pylint: disable=line-too-long
   812                  "df.groupby('gender', as_index=False).value_counts(normalize=True)",
   813              ],
   814          })
   815      self.assertEqual(result.failed, 0)
   816  
   817    def test_top_level(self):
   818      tests = {
   819          name: func.__doc__
   820          for (name, func) in pd.__dict__.items()
   821          if _is_top_level_function(func) and getattr(func, '__doc__', None)
   822      }
   823  
   824      # IO methods are tested in io_test.py
   825      skip_reads = {name: ['*'] for name in dir(pd) if name.startswith('read_')}
   826  
   827      result = doctests.teststrings(
   828          tests,
   829          use_beam=False,
   830          report=True,
   831          not_implemented_ok={
   832              'concat': ['pd.concat([s1, s2], ignore_index=True)'],
   833              'crosstab': ['*'],
   834              'cut': ['*'],
   835              'eval': ['*'],
   836              'from_dummies': ['*'],
   837              'get_dummies': ['*'],
   838              'infer_freq': ['*'],
   839              'lreshape': ['*'],
   840              'melt': ['*'],
   841              'merge': ["df1.merge(df2, how='cross')"],
   842              'merge_asof': ['*'],
   843              'pivot_table': ['*'],
   844              'qcut': ['*'],
   845              'reset_option': ['*'],
   846              'set_eng_float_format': ['*'],
   847              'set_option': ['*'],
   848              'to_numeric': ['*'],
   849              'to_timedelta': ['*'],
   850              'unique': ['*'],
   851              'wide_to_long': ['*'],
   852          },
   853          wont_implement_ok={
   854              'factorize': ['*'],
   855              'pivot': ['*'],
   856              'to_datetime': ['s.head()'],
   857              'to_pickle': ['*'],
   858              'melt': [
   859                  "pd.melt(df, id_vars=['A'], value_vars=['B'])",
   860                  "pd.melt(df, id_vars=['A'], value_vars=['B', 'C'])",
   861                  "pd.melt(df, col_level=0, id_vars=['A'], value_vars=['B'])",
   862                  "pd.melt(df, id_vars=[('A', 'D')], value_vars=[('B', 'E')])",
   863                  "pd.melt(df, id_vars=['A'], value_vars=['B'],\n" +
   864                  "        var_name='myVarname', value_name='myValname')"
   865              ],
   866          },
   867          skip={
   868              # error formatting
   869              'concat': [
   870                  'pd.concat([df5, df6], verify_integrity=True)',
   871                  'pd.concat([df7, new_row.to_frame().T], ignore_index=True)'
   872              ],
   873              # doctest DeprecationWarning
   874              'melt': ['df'],
   875              # Order-sensitive re-indexing.
   876              'merge': [
   877                  "df1.merge(df2, left_on='lkey', right_on='rkey')",
   878                  "df1.merge(df2, left_on='lkey', right_on='rkey',\n"
   879                  "          suffixes=('_left', '_right'))",
   880                  "df1.merge(df2, how='left', on='a')",
   881              ],
   882              # Not an actual test.
   883              'option_context': ['*'],
   884              'factorize': ['codes', 'uniques'],
   885              # Bad top-level use of un-imported function.
   886              'merge_ordered': [
   887                  'merge_ordered(df1, df2, fill_method="ffill", left_by="group")'
   888              ],
   889              # Expected error.
   890              'pivot': [
   891                  "df.pivot(index='foo', columns='bar', values='baz')",
   892                  "df.pivot(index='foo', columns='bar')['baz']",
   893                  "df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])",
   894                  # pylint: disable=line-too-long
   895                  'df.pivot(index="lev1", columns=["lev2", "lev3"],values="values")',
   896                  # pylint: disable=line-too-long
   897                  'df.pivot(index=["lev1", "lev2"], columns=["lev3"],values="values")'
   898              ],
   899              # Never written.
   900              'to_pickle': ['os.remove("./dummy.pkl")'],
   901              **skip_reads
   902          })
   903      self.assertEqual(result.failed, 0)
   904  
   905  
# Run the unittest runner when this file is executed directly as a script.
if __name__ == '__main__':
  unittest.main()