github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/frames_test.py

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  import re
    18  import unittest
    19  import warnings
    20  
    21  import numpy as np
    22  import pandas as pd
    23  from parameterized import parameterized
    24  
    25  import apache_beam as beam
    26  from apache_beam.dataframe import expressions
    27  from apache_beam.dataframe import frame_base
    28  from apache_beam.dataframe import frames
    29  from apache_beam.dataframe.convert import to_dataframe
    30  from apache_beam.runners.interactive import interactive_beam as ib
    31  from apache_beam.runners.interactive import interactive_environment as ie
    32  from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
    33  from apache_beam.runners.interactive.testing.mock_env import isolated_env
    34  
    35  # Get major, minor version
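         # e.g. pd.__version__ == '1.4.2' yields PD_VERSION == (1, 4).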
    36  PD_VERSION = tuple(map(int, pd.__version__.split('.')[0:2]))
    37  
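         # Shared fixture for the groupby/aggregation tests below: a grouping key
         # plus numeric columns with scattered NaNs, a boolean column, and a
         # string column.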
    38  GROUPBY_DF = pd.DataFrame({
    39      'group': ['a' if i % 5 == 0 or i % 3 == 0 else 'b' for i in range(100)],
    40      'foo': [None if i % 11 == 0 else i for i in range(100)],
    41      'bar': [None if i % 7 == 0 else 99 - i for i in range(100)],
    42      'baz': [None if i % 13 == 0 else i * 2 for i in range(100)],
    43      'bool': [i % 17 == 0 for i in range(100)],
    44      'str': [str(i) for i in range(100)],
    45  })
    46  
    47  
    48  def _get_deferred_args(*args):
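           # Wrap each pandas object in a DeferredFrame backed by a constant
           # expression. The proxy, arg[0:0], is an empty slice that preserves
           # the argument's schema (columns, dtypes, index) without any data.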
    49    return [
    50        frame_base.DeferredFrame.wrap(
    51            expressions.ConstantExpression(arg, arg[0:0])) for arg in args
    52    ]
    53  
    54  
    55  class _AbstractFrameTest(unittest.TestCase):
    56    """Test sub-class with utilities for verifying DataFrame operations."""
    57    def _run_error_test(
    58        self, func, *args, construction_time=True, distributed=True):
    59      """Verify that func(*args) raises the same exception in pandas and in Beam.
    60  
    61      Note that by default this only checks for exceptions that the Beam DataFrame
    62      API raises during expression generation (i.e. construction time).
    63      Exceptions raised while the pipeline is executing are less helpful, but
     64      are sometimes unavoidable (e.g. data validation exceptions); to check
     65      for these, use construction_time=False."""
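             # For example (mirroring test_set_index below), looking up a missing
             # column should fail identically in pandas and in Beam:
             #   self._run_error_test(lambda df: df.set_index('bad'), df)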
    66      deferred_args = _get_deferred_args(*args)
    67  
    68      # Get expected error
    69      try:
    70        expected = func(*args)
    71      except Exception as e:
    72        expected_error = e
    73      else:
    74        raise AssertionError(
    75            "Expected an error, but executing with pandas successfully "
    76            f"returned:\n{expected}")
    77  
    78      # Get actual error
    79      if construction_time:
    80        try:
    81          _ = func(*deferred_args)._expr
    82        except Exception as e:
    83          actual = e
    84        else:
    85          raise AssertionError(
    86              f"Expected an error:\n{expected_error}\nbut Beam successfully "
    87              f"generated an expression.")
    88      else:  # not construction_time
    89        # Check for an error raised during pipeline execution
    90        expr = func(*deferred_args)._expr
    91        session_type = (
    92            expressions.PartitioningSession
    93            if distributed else expressions.Session)
    94        try:
    95          result = session_type({}).evaluate(expr)
    96        except Exception as e:
    97          actual = e
    98        else:
    99          raise AssertionError(
   100              f"Expected an error:\n{expected_error}\nbut Beam successfully "
   101              f"Computed the result:\n{result}.")
   102  
   103      # Verify
   104      if (not isinstance(actual, type(expected_error)) or
   105          str(expected_error) not in str(actual)):
   106        raise AssertionError(
   107            f'Expected {expected_error!r} to be raised, but got {actual!r}'
   108        ) from actual
   109  
   110    def _run_inplace_test(self, func, arg, **kwargs):
   111      """Verify an inplace operation performed by func.
   112  
   113      Checks that func performs the same inplace operation on arg, in pandas and
   114      in Beam."""
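             # func mutates its argument in place rather than returning a result,
             # so wrap it in a function that copies the input, applies the
             # mutation, and returns the copy for _run_test to compare.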
   115      def wrapper(df):
   116        df = df.copy()
   117        func(df)
   118        return df
   119  
   120      self._run_test(wrapper, arg, **kwargs)
   121  
   122    def _run_test(
   123        self,
   124        func,
   125        *args,
   126        distributed=True,
   127        nonparallel=False,
   128        check_proxy=True,
   129        lenient_dtype_check=False):
   130      """Verify that func(*args) produces the same result in pandas and in Beam.
   131  
   132      Args:
   133          distributed (bool): Whether or not to use PartitioningSession to
   134              simulate parallel execution.
   135          nonparallel (bool): Whether or not this function contains a
   136              non-parallelizable operation. If True, the expression will be
   137              generated twice, once outside of an allow_non_parallel_operations
   138              block (to verify NonParallelOperation is raised), and again inside
   139              of an allow_non_parallel_operations block to actually generate an
   140              expression to verify.
   141          check_proxy (bool): Whether or not to check that the proxy of the
   142              generated expression matches the actual result, defaults to True.
   143              This option should NOT be set to False in tests added for new
   144              operations if at all possible. Instead make sure the new operation
   145              produces the correct proxy. This flag only exists as an escape hatch
   146              until existing failures can be addressed
   147              (https://github.com/apache/beam/issues/20926).
    148          lenient_dtype_check (bool): Whether to only verify that numeric
    149              columns in the actual result and the proxy are both numeric,
    150              i.e. at least int64 or float64, rather than requiring exactly
    151              the same dtype. This may need to be set to True for some
    152              non-deferred operations where the proxy's dtypes are not known
    153              ahead of time, causing int64 to float64 coercion issues.
   154      """
   155      # Compute expected value
   156      expected = func(*args)
   157  
   158      # Compute actual value
   159      deferred_args = _get_deferred_args(*args)
   160      if nonparallel:
   161        # First run outside a nonparallel block to confirm this raises as expected
   162        with self.assertRaises(expressions.NonParallelOperation) as raised:
   163          func(*deferred_args)
   164  
   165        if raised.exception.msg.startswith(
   166            "Encountered non-parallelizable form of"):
   167          raise AssertionError(
   168              "Default NonParallelOperation raised, please specify a reason in "
   169              "the Singleton() partitioning requirement for this operation."
   170          ) from raised.exception
   171  
   172        # Re-run in an allow non parallel block to get an expression to verify
   173        with beam.dataframe.allow_non_parallel_operations():
   174          expr = func(*deferred_args)._expr
   175      else:
   176        expr = func(*deferred_args)._expr
   177  
   178      # Compute the result of the generated expression
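             # A PartitioningSession evaluates the expression over randomly
             # chosen partitionings of the input, simulating how a distributed
             # runner might split the data.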
   179      session_type = (
   180          expressions.PartitioningSession if distributed else expressions.Session)
   181  
   182      actual = session_type({}).evaluate(expr)
   183  
   184      # Verify
   185      if isinstance(expected, pd.core.generic.NDFrame):
   186        if distributed:
   187          if expected.index.is_unique:
   188            expected = expected.sort_index()
   189            actual = actual.sort_index()
   190          else:
   191            expected = expected.sort_values(list(expected.columns))
   192            actual = actual.sort_values(list(actual.columns))
   193        if isinstance(expected, pd.Series):
   194          if lenient_dtype_check:
   195            pd.testing.assert_series_equal(
   196                expected.astype('Float64'), actual.astype('Float64'))
   197          else:
   198            pd.testing.assert_series_equal(expected, actual)
   199        elif isinstance(expected, pd.DataFrame):
   200          if lenient_dtype_check:
   201            pd.testing.assert_frame_equal(
   202                expected.astype('Float64'), actual.astype('Float64'))
   203          else:
   204            pd.testing.assert_frame_equal(expected, actual)
   205        else:
   206          raise ValueError(
   207              f"Expected value is a {type(expected)},"
   208              "not a Series or DataFrame.")
   209  
   210      else:
   211        # Expectation is not a pandas object
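               # Floats are compared with np.isnan/np.isclose, since NaN != NaN
               # and distributed evaluation may reorder floating-point
               # reductions; everything else is compared with ==.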
   212        if isinstance(expected, float):
   213          if np.isnan(expected):
   214            cmp = np.isnan
   215          else:
   216            cmp = lambda x: np.isclose(expected, x)
   217        else:
   218          cmp = lambda x: x == expected
   219        self.assertTrue(
   220            cmp(actual), 'Expected:\n\n%r\n\nActual:\n\n%r' % (expected, actual))
   221  
   222      if check_proxy:
   223        # Verify that the actual result agrees with the proxy
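               # The proxy is the empty pandas object Beam uses at construction
               # time to track the expected type, dtypes, and index of the
               # result.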
   224        proxy = expr.proxy()
   225  
   226        if type(actual) in (np.float32, np.float64):
   227          self.assertTrue(type(actual) == type(proxy) or np.isnan(proxy))
   228        else:
   229          self.assertEqual(type(actual), type(proxy))
   230  
   231        if isinstance(expected, pd.core.generic.NDFrame):
   232          if isinstance(expected, pd.Series):
   233            if lenient_dtype_check:
   234              self.assertEqual(
   235                  actual.astype('Float64').dtype, proxy.astype('Float64').dtype)
   236            else:
   237              self.assertEqual(actual.dtype, proxy.dtype)
   238            self.assertEqual(actual.name, proxy.name)
   239          elif isinstance(expected, pd.DataFrame):
   240            if lenient_dtype_check:
   241              pd.testing.assert_series_equal(
   242                  actual.astype('Float64').dtypes, proxy.astype('Float64').dtypes)
   243            else:
   244              pd.testing.assert_series_equal(actual.dtypes, proxy.dtypes)
   245  
   246          else:
   247            raise ValueError(
   248                f"Expected value is a {type(expected)},"
   249                "not a Series or DataFrame.")
   250  
   251          self.assertEqual(actual.index.names, proxy.index.names)
   252  
   253          for i in range(actual.index.nlevels):
   254            if lenient_dtype_check:
   255              self.assertEqual(
   256                  actual.astype('Float64').index.get_level_values(i).dtype,
   257                  proxy.astype('Float64').index.get_level_values(i).dtype)
   258            else:
   259              self.assertEqual(
   260                  actual.index.get_level_values(i).dtype,
   261                  proxy.index.get_level_values(i).dtype)
   262  
   263  
   264  class DeferredFrameTest(_AbstractFrameTest):
   265    """Miscellaneous tessts for DataFrame operations."""
   266    def test_series_arithmetic(self):
   267      a = pd.Series([1, 2, 3])
   268      b = pd.Series([100, 200, 300])
   269  
   270      self._run_test(lambda a, b: a - 2 * b, a, b)
   271      self._run_test(lambda a, b: a.subtract(2).multiply(b).divide(a), a, b)
   272  
   273    def test_dataframe_arithmetic(self):
   274      df = pd.DataFrame({'a': [1, 2, 3], 'b': [100, 200, 300]})
   275      df2 = pd.DataFrame({'a': [3000, 1000, 2000], 'b': [7, 11, 13]})
   276  
   277      self._run_test(lambda df, df2: df - 2 * df2, df, df2)
   278      self._run_test(
   279          lambda df, df2: df.subtract(2).multiply(df2).divide(df), df, df2)
   280  
   281    @unittest.skipIf(PD_VERSION < (1, 3), "dropna=False is new in pandas 1.3")
   282    def test_value_counts_dropna_false(self):
   283      df = pd.DataFrame({
   284          'first_name': ['John', 'Anne', 'John', 'Beth'],
   285          'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']
   286      })
    287      # TODO(https://github.com/apache/beam/issues/21014): Remove this
    288      # assertRaises when the underlying bug in
   289      # https://github.com/pandas-dev/pandas/issues/36470 is fixed.
   290      with self.assertRaises(NotImplementedError):
   291        self._run_test(lambda df: df.value_counts(dropna=False), df)
   292  
   293    def test_get_column(self):
   294      df = pd.DataFrame({
   295          'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'],
   296          'Speed': [380., 370., 24., 26.]
   297      })
   298      self._run_test(lambda df: df['Animal'], df)
   299      self._run_test(lambda df: df.Speed, df)
   300      self._run_test(lambda df: df.get('Animal'), df)
   301      self._run_test(lambda df: df.get('FOO', df.Animal), df)
   302  
   303    def test_series_xs(self):
    304      # pandas doctests only verify DataFrame.xs; here we also verify Series.xs
   305      d = {
   306          'num_legs': [4, 4, 2, 2],
   307          'num_wings': [0, 0, 2, 2],
   308          'class': ['mammal', 'mammal', 'mammal', 'bird'],
   309          'animal': ['cat', 'dog', 'bat', 'penguin'],
   310          'locomotion': ['walks', 'walks', 'flies', 'walks']
   311      }
   312      df = pd.DataFrame(data=d)
   313      df = df.set_index(['class', 'animal', 'locomotion'])
   314  
   315      self._run_test(lambda df: df.num_legs.xs('mammal'), df)
   316      self._run_test(lambda df: df.num_legs.xs(('mammal', 'dog')), df)
   317      self._run_test(lambda df: df.num_legs.xs('cat', level=1), df)
   318      self._run_test(
   319          lambda df: df.num_legs.xs(('bird', 'walks'), level=[0, 'locomotion']),
   320          df)
   321  
   322    def test_dataframe_xs(self):
   323      # Test cases reported in BEAM-13421
   324      df = pd.DataFrame(
   325          np.array([
   326              ['state', 'day1', 12],
   327              ['state', 'day1', 1],
   328              ['state', 'day2', 14],
   329              ['county', 'day1', 9],
   330          ]),
   331          columns=['provider', 'time', 'value'])
   332  
   333      self._run_test(lambda df: df.xs('state'), df.set_index(['provider']))
   334      self._run_test(
   335          lambda df: df.xs('state'), df.set_index(['provider', 'time']))
   336  
   337    def test_set_column(self):
   338      def new_column(df):
   339        df['NewCol'] = df['Speed']
   340  
   341      df = pd.DataFrame({
   342          'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'],
   343          'Speed': [380., 370., 24., 26.]
   344      })
   345      self._run_inplace_test(new_column, df)
   346  
   347    def test_set_column_from_index(self):
   348      def new_column(df):
   349        df['NewCol'] = df.index
   350  
   351      df = pd.DataFrame({
   352          'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'],
   353          'Speed': [380., 370., 24., 26.]
   354      })
   355      self._run_inplace_test(new_column, df)
   356  
   357    def test_tz_localize_ambiguous_series(self):
   358      # This replicates a tz_localize doctest:
   359      #   s.tz_localize('CET', ambiguous=np.array([True, True, False]))
   360      # But using a DeferredSeries instead of a np array
   361  
   362      s = pd.Series(
   363          range(3),
   364          index=pd.DatetimeIndex([
   365              '2018-10-28 01:20:00', '2018-10-28 02:36:00', '2018-10-28 03:46:00'
   366          ]))
   367      ambiguous = pd.Series([True, True, False], index=s.index)
   368  
   369      self._run_test(
   370          lambda s,
   371          ambiguous: s.tz_localize('CET', ambiguous=ambiguous),
   372          s,
   373          ambiguous)
   374  
   375    def test_tz_convert(self):
    376      # This replicates a tz_convert doctest:
    377      #   s.tz_convert('America/Los_Angeles')
    378      # But applied to a DeferredSeries with a tz-aware index.
   379  
   380      s = pd.Series(
   381          range(3),
   382          index=pd.DatetimeIndex([
   383              '2018-10-27 01:20:00', '2018-10-27 02:36:00', '2018-10-27 03:46:00'
   384          ],
   385                                 tz='Europe/Berlin'))
   386  
   387      self._run_test(lambda s: s.tz_convert('America/Los_Angeles'), s)
   388  
   389    def test_sort_index_columns(self):
   390      df = pd.DataFrame({
   391          'c': range(10),
   392          'a': range(10),
   393          'b': range(10),
   394          np.nan: range(10),
   395      })
   396  
   397      self._run_test(lambda df: df.sort_index(axis=1), df)
   398      self._run_test(lambda df: df.sort_index(axis=1, ascending=False), df)
   399      self._run_test(lambda df: df.sort_index(axis=1, na_position='first'), df)
   400  
   401    def test_where_callable_args(self):
   402      df = pd.DataFrame(
   403          np.arange(10, dtype=np.int64).reshape(-1, 2), columns=['A', 'B'])
   404  
   405      self._run_test(
   406          lambda df: df.where(lambda df: df % 2 == 0, lambda df: df * 10), df)
   407  
   408    def test_where_concrete_args(self):
   409      df = pd.DataFrame(
   410          np.arange(10, dtype=np.int64).reshape(-1, 2), columns=['A', 'B'])
   411  
   412      self._run_test(
   413          lambda df: df.where(
   414              df % 2 == 0, pd.Series({
   415                  'A': 123, 'B': 456
   416              }), axis=1),
   417          df)
   418  
   419    def test_combine_dataframe(self):
   420      df = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
   421      df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
   422      take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
   423      self._run_test(
   424          lambda df,
   425          df2: df.combine(df2, take_smaller),
   426          df,
   427          df2,
   428          nonparallel=True)
   429  
   430    def test_combine_dataframe_fill(self):
   431      df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
   432      df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
   433      take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
   434      self._run_test(
   435          lambda df1,
   436          df2: df1.combine(df2, take_smaller, fill_value=-5),
   437          df1,
   438          df2,
   439          nonparallel=True)
   440  
   441    def test_combine_Series(self):
   442      s1 = pd.Series({'falcon': 330.0, 'eagle': 160.0})
   443      s2 = pd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0})
   444      self._run_test(
   445          lambda s1,
   446          s2: s1.combine(s2, max),
   447          s1,
   448          s2,
   449          nonparallel=True,
   450          check_proxy=False)
   451  
   452    def test_combine_first_dataframe(self):
   453      df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
   454      df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
   455  
   456      self._run_test(lambda df1, df2: df1.combine_first(df2), df1, df2)
   457  
   458    def test_combine_first_series(self):
   459      s1 = pd.Series([1, np.nan])
   460      s2 = pd.Series([3, 4])
   461  
   462      self._run_test(lambda s1, s2: s1.combine_first(s2), s1, s2)
   463  
   464    def test_add_prefix(self):
   465      df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
   466      s = pd.Series([1, 2, 3, 4])
   467  
   468      self._run_test(lambda df: df.add_prefix('col_'), df)
   469      self._run_test(lambda s: s.add_prefix('col_'), s)
   470  
   471    def test_add_suffix(self):
   472      df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
   473      s = pd.Series([1, 2, 3, 4])
   474  
   475      self._run_test(lambda df: df.add_suffix('_col'), df)
    476      self._run_test(lambda s: s.add_suffix('_col'), s)
   477  
   478    def test_set_index(self):
   479      df = pd.DataFrame({
    480          'index1': reversed(range(20)),  # [19, 18, ..]
    481          'index2': np.roll(range(20), 5),  # [15, 16, .., 0, 1, .., 13, 14]
    482          # ['', 'b', 'cc', ...]
    483          'values': [chr(ord('a') + i) * i for i in range(20)],
   484      })
   485  
   486      self._run_test(lambda df: df.set_index(['index1', 'index2']), df)
   487      self._run_test(lambda df: df.set_index(['index1', 'index2'], drop=True), df)
   488      self._run_test(lambda df: df.set_index('values'), df)
   489  
   490      self._run_error_test(lambda df: df.set_index('bad'), df)
   491      self._run_error_test(
   492          lambda df: df.set_index(['index2', 'bad', 'really_bad']), df)
   493  
   494    def test_set_axis(self):
   495      df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=['X', 'Y', 'Z'])
   496  
   497      self._run_test(lambda df: df.set_axis(['I', 'II'], axis='columns'), df)
   498      self._run_test(lambda df: df.set_axis([0, 1], axis=1), df)
   499      self._run_inplace_test(
   500          lambda df: df.set_axis(['i', 'ii'], axis='columns'), df)
   501      with self.assertRaises(NotImplementedError):
   502        self._run_test(lambda df: df.set_axis(['a', 'b', 'c'], axis='index'), df)
   503        self._run_test(lambda df: df.set_axis([0, 1, 2], axis=0), df)
   504  
   505    def test_series_set_axis(self):
   506      s = pd.Series(list(range(3)), index=['X', 'Y', 'Z'])
   507      with self.assertRaises(NotImplementedError):
   508        self._run_test(lambda s: s.set_axis(['a', 'b', 'c']), s)
   509        self._run_test(lambda s: s.set_axis([1, 2, 3]), s)
   510  
   511    def test_series_drop_ignore_errors(self):
   512      midx = pd.MultiIndex(
   513          levels=[['lama', 'cow', 'falcon'], ['speed', 'weight', 'length']],
   514          codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]])
   515      s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
   516  
    517      # drop() requires singleton partitioning unless errors are ignored.
   518      # Add some additional tests here to make sure the implementation works in
   519      # non-singleton partitioning.
   520      self._run_test(lambda s: s.drop('lama', level=0, errors='ignore'), s)
   521      self._run_test(lambda s: s.drop(('cow', 'speed'), errors='ignore'), s)
   522      self._run_test(lambda s: s.drop('falcon', level=0, errors='ignore'), s)
   523  
   524    def test_dataframe_drop_ignore_errors(self):
   525      midx = pd.MultiIndex(
   526          levels=[['lama', 'cow', 'falcon'], ['speed', 'weight', 'length']],
   527          codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]])
   528      df = pd.DataFrame(
   529          index=midx,
   530          columns=['big', 'small'],
   531          data=[[45, 30], [200, 100], [1.5, 1], [30, 20], [250, 150], [1.5, 0.8],
   532                [320, 250], [1, 0.8], [0.3, 0.2]])
   533  
    534      # drop() requires singleton partitioning unless errors are ignored.
   535      # Add some additional tests here to make sure the implementation works in
   536      # non-singleton partitioning.
   537      self._run_test(
   538          lambda df: df.drop(index='lama', level=0, errors='ignore'), df)
   539      self._run_test(
   540          lambda df: df.drop(index=('cow', 'speed'), errors='ignore'), df)
   541      self._run_test(
   542          lambda df: df.drop(index='falcon', level=0, errors='ignore'), df)
   543      self._run_test(
   544          lambda df: df.drop(index='cow', columns='small', errors='ignore'), df)
   545  
   546    def test_merge(self):
   547      # This is from the pandas doctests, but fails due to re-indexing being
   548      # order-sensitive.
   549      df1 = pd.DataFrame({
   550          'lkey': ['foo', 'bar', 'baz', 'foo'], 'value': [1, 2, 3, 5]
   551      })
   552      df2 = pd.DataFrame({
   553          'rkey': ['foo', 'bar', 'baz', 'foo'], 'value': [5, 6, 7, 8]
   554      })
   555      self._run_test(
   556          lambda df1,
   557          df2: df1.merge(df2, left_on='lkey', right_on='rkey').rename(
   558              index=lambda x: '*'),
   559          df1,
   560          df2,
   561          nonparallel=True,
   562          check_proxy=False)
   563      self._run_test(
   564          lambda df1,
   565          df2: df1.merge(
   566              df2, left_on='lkey', right_on='rkey', suffixes=('_left', '_right')).
   567          rename(index=lambda x: '*'),
   568          df1,
   569          df2,
   570          nonparallel=True,
   571          check_proxy=False)
   572  
   573    def test_merge_left_join(self):
   574      # This is from the pandas doctests, but fails due to re-indexing being
   575      # order-sensitive.
   576      df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
   577      df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
   578  
   579      self._run_test(
   580          lambda df1,
   581          df2: df1.merge(df2, how='left', on='a').rename(index=lambda x: '*'),
   582          df1,
   583          df2,
   584          nonparallel=True,
   585          check_proxy=False)
   586  
   587    def test_merge_on_index(self):
   588      # This is from the pandas doctests, but fails due to re-indexing being
   589      # order-sensitive.
   590      df1 = pd.DataFrame({
   591          'lkey': ['foo', 'bar', 'baz', 'foo'], 'value': [1, 2, 3, 5]
   592      }).set_index('lkey')
   593      df2 = pd.DataFrame({
   594          'rkey': ['foo', 'bar', 'baz', 'foo'], 'value': [5, 6, 7, 8]
   595      }).set_index('rkey')
   596  
   597      self._run_test(
   598          lambda df1,
   599          df2: df1.merge(df2, left_index=True, right_index=True),
   600          df1,
   601          df2,
   602          check_proxy=False)
   603  
   604    def test_merge_same_key(self):
   605      df1 = pd.DataFrame({
   606          'key': ['foo', 'bar', 'baz', 'foo'], 'value': [1, 2, 3, 5]
   607      })
   608      df2 = pd.DataFrame({
   609          'key': ['foo', 'bar', 'baz', 'foo'], 'value': [5, 6, 7, 8]
   610      })
   611      self._run_test(
   612          lambda df1,
   613          df2: df1.merge(df2, on='key').rename(index=lambda x: '*'),
   614          df1,
   615          df2,
   616          nonparallel=True,
   617          check_proxy=False)
   618      self._run_test(
   619          lambda df1,
   620          df2: df1.merge(df2, on='key', suffixes=('_left', '_right')).rename(
   621              index=lambda x: '*'),
   622          df1,
   623          df2,
   624          nonparallel=True,
   625          check_proxy=False)
   626  
   627    def test_merge_same_key_doctest(self):
   628      df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
   629      df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
   630  
   631      self._run_test(
   632          lambda df1,
   633          df2: df1.merge(df2, how='left', on='a').rename(index=lambda x: '*'),
   634          df1,
   635          df2,
   636          nonparallel=True,
   637          check_proxy=False)
   638      # Test without specifying 'on'
   639      self._run_test(
   640          lambda df1,
   641          df2: df1.merge(df2, how='left').rename(index=lambda x: '*'),
   642          df1,
   643          df2,
   644          nonparallel=True,
   645          check_proxy=False)
   646  
   647    def test_merge_same_key_suffix_collision(self):
   648      df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2], 'a_lsuffix': [5, 6]})
   649      df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4], 'a_rsuffix': [7, 8]})
   650  
   651      self._run_test(
   652          lambda df1,
   653          df2: df1.merge(
   654              df2, how='left', on='a', suffixes=('_lsuffix', '_rsuffix')).rename(
   655                  index=lambda x: '*'),
   656          df1,
   657          df2,
   658          nonparallel=True,
   659          check_proxy=False)
   660      # Test without specifying 'on'
   661      self._run_test(
   662          lambda df1,
   663          df2: df1.merge(df2, how='left', suffixes=('_lsuffix', '_rsuffix')).
   664          rename(index=lambda x: '*'),
   665          df1,
   666          df2,
   667          nonparallel=True,
   668          check_proxy=False)
   669  
   670    def test_swaplevel(self):
   671      df = pd.DataFrame(
   672          {"Grade": ["A", "B", "A", "C"]},
   673          index=[
   674              ["Final exam", "Final exam", "Coursework", "Coursework"],
   675              ["History", "Geography", "History", "Geography"],
   676              ["January", "February", "March", "April"],
   677          ])
   678      self._run_test(lambda df: df.swaplevel(), df)
   679  
   680    def test_value_counts_with_nans(self):
   681      # similar to doctests that verify value_counts, but include nan values to
   682      # make sure we handle them correctly.
   683      df = pd.DataFrame({
   684          'num_legs': [2, 4, 4, 6, np.nan, np.nan],
   685          'num_wings': [2, 0, 0, 0, np.nan, 2]
   686      },
   687                        index=['falcon', 'dog', 'cat', 'ant', 'car', 'plane'])
   688  
   689      self._run_test(lambda df: df.value_counts(), df)
   690      self._run_test(lambda df: df.value_counts(normalize=True), df)
   691  
   692      if PD_VERSION >= (1, 3):
   693        # dropna=False is new in pandas 1.3
    694        # TODO(https://github.com/apache/beam/issues/21014): Remove this
    695        # assertRaises when the underlying bug in
   696        # https://github.com/pandas-dev/pandas/issues/36470 is fixed.
   697        with self.assertRaises(NotImplementedError):
   698          self._run_test(lambda df: df.value_counts(dropna=False), df)
   699  
   700      # Test the defaults.
   701      self._run_test(lambda df: df.num_wings.value_counts(), df)
   702      self._run_test(lambda df: df.num_wings.value_counts(normalize=True), df)
   703      self._run_test(lambda df: df.num_wings.value_counts(dropna=False), df)
   704  
   705      # Test the combination interactions.
   706      for normalize in (True, False):
   707        for dropna in (True, False):
   708          self._run_test(
   709              lambda df,
   710              dropna=dropna,
   711              normalize=normalize: df.num_wings.value_counts(
   712                  dropna=dropna, normalize=normalize),
   713              df)
   714  
   715    def test_value_counts_does_not_support_sort(self):
   716      df = pd.DataFrame({
   717          'num_legs': [2, 4, 4, 6, np.nan, np.nan],
   718          'num_wings': [2, 0, 0, 0, np.nan, 2]
   719      },
   720                        index=['falcon', 'dog', 'cat', 'ant', 'car', 'plane'])
   721  
   722      with self.assertRaisesRegex(frame_base.WontImplementError,
   723                                  r"value_counts\(sort\=True\)"):
   724        self._run_test(lambda df: df.value_counts(sort=True), df)
   725  
   726      with self.assertRaisesRegex(frame_base.WontImplementError,
   727                                  r"value_counts\(sort\=True\)"):
   728        self._run_test(lambda df: df.num_wings.value_counts(sort=True), df)
   729  
   730    def test_series_getitem(self):
   731      s = pd.Series([x**2 for x in range(10)])
   732      self._run_test(lambda s: s[...], s)
   733      self._run_test(lambda s: s[:], s)
   734      self._run_test(lambda s: s[s < 10], s)
   735      self._run_test(lambda s: s[lambda s: s < 10], s)
   736  
   737      s.index = s.index.map(float)
   738      self._run_test(lambda s: s[1.5:6], s)
   739  
   740    def test_series_truncate(self):
   741      s = pd.Series(['a', 'b', 'c', 'd', 'e', 'f'])
   742      self._run_test(lambda s: s.truncate(before=1, after=3), s)
   743  
   744    def test_dataframe_truncate(self):
   745      df = pd.DataFrame({
   746          'C': list('abcde'), 'B': list('fghij'), 'A': list('klmno')
   747      },
   748                        index=[1, 2, 3, 4, 5])
   749      self._run_test(lambda df: df.truncate(before=1, after=3), df)
   750      self._run_test(lambda df: df.truncate(before='A', after='B', axis=1), df)
   751      self._run_test(lambda df: df['A'].truncate(before=2, after=4), df)
   752  
   753    @parameterized.expand([
   754        (pd.Series(range(10)), ),  # unique
   755        (pd.Series(list(range(100)) + [0]), ),  # non-unique int
   756        (pd.Series(list(range(100)) + [0]) / 100, ),  # non-unique flt
   757        (pd.Series(['a', 'b', 'c', 'd']), ),  # unique str
   758        (pd.Series(['a', 'b', 'a', 'c', 'd']), ),  # non-unique str
   759    ])
   760    def test_series_is_unique(self, series):
   761      self._run_test(lambda s: s.is_unique, series)
   762  
   763    @parameterized.expand([
   764        (pd.Series(range(10)), ),  # False
   765        (pd.Series([1, 2, np.nan, 3, np.nan]), ),  # True
   766        (pd.Series(['a', 'b', 'c', 'd', 'e']), ),  # False
   767        (pd.Series(['a', 'b', None, 'c', None]), ),  # True
   768    ])
   769    def test_series_hasnans(self, series):
   770      self._run_test(lambda s: s.hasnans, series)
   771  
   772    def test_dataframe_getitem(self):
   773      df = pd.DataFrame({'A': [x**2 for x in range(6)], 'B': list('abcdef')})
   774      self._run_test(lambda df: df['A'], df)
   775      self._run_test(lambda df: df[['A', 'B']], df)
   776  
   777      self._run_test(lambda df: df[:], df)
   778      self._run_test(lambda df: df[df.A < 10], df)
   779  
   780      df.index = df.index.map(float)
   781      self._run_test(lambda df: df[1.5:4], df)
   782  
   783    def test_loc(self):
   784      dates = pd.date_range('1/1/2000', periods=8)
   785      # TODO(https://github.com/apache/beam/issues/20765):
   786      # We do not preserve the freq attribute on a DateTime index
   787      dates.freq = None
   788      df = pd.DataFrame(
   789          np.arange(32).reshape((8, 4)),
   790          index=dates,
   791          columns=['A', 'B', 'C', 'D'])
   792      self._run_test(lambda df: df.loc[:], df)
   793      self._run_test(lambda df: df.loc[:, 'A'], df)
   794      self._run_test(lambda df: df.loc[:dates[3]], df)
   795      self._run_test(lambda df: df.loc[df.A > 10], df)
   796      self._run_test(lambda df: df.loc[lambda df: df.A > 10], df)
   797      self._run_test(lambda df: df.C.loc[df.A > 10], df)
   798      self._run_test(lambda df, s: df.loc[s.loc[1:3]], df, pd.Series(dates))
   799  
   800    def test_append_sort(self):
   801      # yapf: disable
   802      df1 = pd.DataFrame({'int': [1, 2, 3], 'str': ['a', 'b', 'c']},
   803                         columns=['int', 'str'],
   804                         index=[1, 3, 5])
   805      df2 = pd.DataFrame({'int': [4, 5, 6], 'str': ['d', 'e', 'f']},
   806                         columns=['str', 'int'],
   807                         index=[2, 4, 6])
   808      # yapf: enable
   809  
   810      self._run_test(lambda df1, df2: df1.append(df2, sort=True), df1, df2)
   811      self._run_test(lambda df1, df2: df1.append(df2, sort=False), df1, df2)
   812      self._run_test(lambda df1, df2: df2.append(df1, sort=True), df1, df2)
   813      self._run_test(lambda df1, df2: df2.append(df1, sort=False), df1, df2)
   814  
   815    def test_smallest_largest(self):
   816      df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [2, 3, 5, 7]})
   817      self._run_test(lambda df: df.nlargest(1, 'A', keep='all'), df)
   818      self._run_test(lambda df: df.nsmallest(3, 'A', keep='all'), df)
   819      self._run_test(lambda df: df.nlargest(3, ['A', 'B'], keep='all'), df)
   820  
   821    def test_series_cov_corr(self):
   822      for s in [pd.Series([1, 2, 3]),
   823                pd.Series(range(100)),
   824                pd.Series([x**3 for x in range(-50, 50)])]:
   825        self._run_test(lambda s: s.std(), s)
   826        self._run_test(lambda s: s.var(), s)
   827        self._run_test(lambda s: s.corr(s), s)
   828        self._run_test(lambda s: s.corr(s + 1), s)
   829        self._run_test(lambda s: s.corr(s * s), s)
   830        self._run_test(lambda s: s.cov(s * s), s)
   831        self._run_test(lambda s: s.skew(), s)
   832        self._run_test(lambda s: s.kurtosis(), s)
   833        self._run_test(lambda s: s.kurt(), s)
   834  
   835    def test_dataframe_cov_corr(self):
   836      df = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c'])
   837      df.loc[df.index[:5], 'a'] = np.nan
   838      df.loc[df.index[5:10], 'b'] = np.nan
   839      self._run_test(lambda df: df.corr(), df)
   840      self._run_test(lambda df: df.cov(), df)
   841      self._run_test(lambda df: df.corr(min_periods=12), df)
   842      self._run_test(lambda df: df.cov(min_periods=12), df)
   843      self._run_test(lambda df: df.corrwith(df.a), df)
   844      self._run_test(lambda df: df[['a', 'b']].corrwith(df[['b', 'c']]), df)
   845  
   846      df2 = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c'])
   847      self._run_test(
   848          lambda df, df2: df.corrwith(df2, axis=1), df, df2, check_proxy=False)
   849  
   850    def test_corrwith_bad_axis(self):
   851      df = pd.DataFrame({'a': range(3), 'b': range(3, 6), 'c': range(6, 9)})
   852      self._run_error_test(lambda df: df.corrwith(df.a, axis=2), df)
   853      self._run_error_test(lambda df: df.corrwith(df, axis=5), df)
   854  
   855    @unittest.skipIf(PD_VERSION < (1, 2), "na_action added in pandas 1.2.0")
   856    def test_applymap_na_action(self):
    857      # Replicates a doctest for na_action which is incompatible with the
    858      # doctest framework.
   859      df = pd.DataFrame([[pd.NA, 2.12], [3.356, 4.567]])
   860      self._run_test(
   861          lambda df: df.applymap(lambda x: len(str(x)), na_action='ignore'),
   862          df,
   863          # TODO: generate proxy using naive type inference on fn
   864          check_proxy=False)
   865  
   866    def test_dataframe_eval_query(self):
   867      df = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c'])
   868      self._run_test(lambda df: df.eval('foo = a + b - c'), df)
   869      self._run_test(lambda df: df.query('a > b + c'), df)
   870  
   871      self._run_inplace_test(lambda df: df.eval('foo = a + b - c'), df)
   872  
   873      # Verify that attempting to access locals raises a useful error
   874      deferred_df = frame_base.DeferredFrame.wrap(
   875          expressions.ConstantExpression(df, df[0:0]))
   876      self.assertRaises(
   877          NotImplementedError, lambda: deferred_df.eval('foo = a + @b - c'))
   878      self.assertRaises(
   879          NotImplementedError, lambda: deferred_df.query('a > @b + c'))
   880  
   881    def test_index_name_assignment(self):
   882      df = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
   883      df = df.set_index(['a', 'b'], drop=False)
   884  
   885      def change_index_names(df):
   886        df.index.names = ['A', None]
   887  
   888      self._run_inplace_test(change_index_names, df)
   889  
   890    def test_quantile(self):
   891      df = pd.DataFrame(
   892          np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), columns=['a', 'b'])
   893  
   894      self._run_test(
   895          lambda df: df.quantile(0.1, axis='columns'), df, check_proxy=False)
   896  
   899      with self.assertRaisesRegex(frame_base.WontImplementError,
   900                                  r"df\.quantile\(q=0\.1, axis='columns'\)"):
   901        self._run_test(lambda df: df.quantile([0.1, 0.5], axis='columns'), df)
   902  
   903    def test_dataframe_melt(self):
   904  
   905      df = pd.DataFrame({
   906          'A': {
   907              0: 'a', 1: 'b', 2: 'c'
   908          },
   909          'B': {
   910              0: 1, 1: 3, 2: 5
   911          },
   912          'C': {
   913              0: 2, 1: 4, 2: 6
   914          }
   915      })
   916  
   917      self._run_test(
   918          lambda df: df.melt(id_vars=['A'], value_vars=['B'], ignore_index=False),
   919          df)
   920      self._run_test(
   921          lambda df: df.melt(
   922              id_vars=['A'], value_vars=['B', 'C'], ignore_index=False),
   923          df)
   924      self._run_test(
   925          lambda df: df.melt(
   926              id_vars=['A'],
   927              value_vars=['B'],
   928              var_name='myVarname',
   929              value_name='myValname',
   930              ignore_index=False),
   931          df)
   936  
   937      df.columns = [list('ABC'), list('DEF')]
   938      self._run_test(
   939          lambda df: df.melt(
   940              col_level=0, id_vars=['A'], value_vars=['B'], ignore_index=False),
   941          df)
   942      self._run_test(
   943          lambda df: df.melt(
   944              id_vars=[('A', 'D')], value_vars=[('B', 'E')], ignore_index=False),
   945          df)
   946  
   947    def test_fillna_columns(self):
   948      df = pd.DataFrame(
   949          [[np.nan, 2, np.nan, 0], [3, 4, np.nan, 1], [np.nan, np.nan, np.nan, 5],
   950           [np.nan, 3, np.nan, 4], [3, np.nan, np.nan, 4]],
   951          columns=list('ABCD'))
   952  
   953      self._run_test(lambda df: df.fillna(method='ffill', axis='columns'), df)
   954      self._run_test(
   955          lambda df: df.fillna(method='ffill', axis='columns', limit=1), df)
   956      self._run_test(
   957          lambda df: df.fillna(method='bfill', axis='columns', limit=1), df)
   958  
   959      # Intended behavior is unclear here. See
   960      # https://github.com/pandas-dev/pandas/issues/40989
   961      # self._run_test(lambda df: df.fillna(axis='columns', value=100,
   962      #                                     limit=2), df)
   963  
   964    def test_dataframe_fillna_dataframe_as_value(self):
   965      df = pd.DataFrame([[np.nan, 2, np.nan, 0], [3, 4, np.nan, 1],
   966                         [np.nan, np.nan, np.nan, 5], [np.nan, 3, np.nan, 4]],
   967                        columns=list("ABCD"))
   968      df2 = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCE"))
   969  
   970      self._run_test(lambda df, df2: df.fillna(df2), df, df2)
   971  
   972    def test_dataframe_fillna_series_as_value(self):
   973      df = pd.DataFrame([[np.nan, 2, np.nan, 0], [3, 4, np.nan, 1],
   974                         [np.nan, np.nan, np.nan, 5], [np.nan, 3, np.nan, 4]],
   975                        columns=list("ABCD"))
   976      s = pd.Series(range(4), index=list("ABCE"))
   977  
   978      self._run_test(lambda df, s: df.fillna(s), df, s)
   979  
   980    def test_series_fillna_series_as_value(self):
   981      df = pd.DataFrame([[np.nan, 2, np.nan, 0], [3, 4, np.nan, 1],
   982                         [np.nan, np.nan, np.nan, 5], [np.nan, 3, np.nan, 4]],
   983                        columns=list("ABCD"))
   984      df2 = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCE"))
   985  
   986      self._run_test(lambda df, df2: df.A.fillna(df2.A), df, df2)
   987  
   988    def test_append_verify_integrity(self):
   989      df1 = pd.DataFrame({'A': range(10), 'B': range(10)}, index=range(10))
   990      df2 = pd.DataFrame({'A': range(10), 'B': range(10)}, index=range(9, 19))
   991  
   992      self._run_error_test(
   993          lambda s1,
   994          s2: s1.append(s2, verify_integrity=True),
   995          df1['A'],
   996          df2['A'],
   997          construction_time=False)
   998      self._run_error_test(
   999          lambda df1,
  1000          df2: df1.append(df2, verify_integrity=True),
  1001          df1,
  1002          df2,
  1003          construction_time=False)
  1004  
  1005    def test_categorical_groupby(self):
  1006      df = pd.DataFrame({'A': np.arange(6), 'B': list('aabbca')})
  1007      df['B'] = df['B'].astype(pd.CategoricalDtype(list('cab')))
  1008      df = df.set_index('B')
  1009      # TODO(BEAM-11190): These aggregations can be done in index partitions, but
  1010      # it will require a little more complex logic
  1011      self._run_test(lambda df: df.groupby(level=0).sum(), df, nonparallel=True)
  1012      self._run_test(lambda df: df.groupby(level=0).mean(), df, nonparallel=True)
  1013  
  1014    def test_astype_categorical(self):
  1015      df = pd.DataFrame({'A': np.arange(6), 'B': list('aabbca')})
  1016      categorical_dtype = pd.CategoricalDtype(df.B.unique())
  1017  
  1018      self._run_test(lambda df: df.B.astype(categorical_dtype), df)
  1019  
  1020    @unittest.skipIf(
  1021        PD_VERSION < (1, 2), "DataFrame.unstack not supported in pandas <1.2.x")
  1022    def test_astype_categorical_with_unstack(self):
  1023      df = pd.DataFrame({
  1024          'index1': ['one', 'one', 'two', 'two'],
  1025          'index2': ['a', 'b', 'a', 'b'],
  1026          'data': np.arange(1.0, 5.0),
  1027      })
  1028  
  1029      def with_categorical_index(df):
  1030        df.index1 = df.index1.astype(pd.CategoricalDtype(['one', 'two']))
  1031        df.index2 = df.index2.astype(pd.CategoricalDtype(['a', 'b']))
  1032        df.set_index(['index1', 'index2'], drop=True)
  1033        return df
  1034  
  1035      self._run_test(
  1036          lambda df: with_categorical_index(df).unstack(level=-1),
  1037          df,
  1038          check_proxy=False)
  1039  
  1040    def test_dataframe_sum_nonnumeric_raises(self):
  1041      # Attempting a numeric aggregation with the str column present should
  1042      # raise, and suggest the numeric_only argument
  1043      with self.assertRaisesRegex(frame_base.WontImplementError, 'numeric_only'):
  1044        self._run_test(lambda df: df.sum(), GROUPBY_DF)
  1045  
  1046      # numeric_only=True should work
  1047      self._run_test(lambda df: df.sum(numeric_only=True), GROUPBY_DF)
  1048      # projecting only numeric columns should too
  1049      self._run_test(lambda df: df[['foo', 'bar']].sum(), GROUPBY_DF)
  1050  
  1051    def test_insert(self):
  1052      df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
  1053  
  1054      self._run_inplace_test(lambda df: df.insert(1, 'C', df.A * 2), df)
  1055      self._run_inplace_test(
  1056          lambda df: df.insert(0, 'foo', pd.Series([8], index=[1])),
  1057          df,
  1058          check_proxy=False)
  1059      self._run_inplace_test(lambda df: df.insert(2, 'bar', value='q'), df)
  1060  
  1061    def test_insert_does_not_support_list_value(self):
  1062      df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
  1063  
  1064      with self.assertRaisesRegex(frame_base.WontImplementError,
  1065                                  r"insert\(value=list\)"):
  1066        self._run_inplace_test(lambda df: df.insert(1, 'C', [7, 8, 9]), df)
  1067  
  1068    def test_drop_duplicates(self):
  1069      df = pd.DataFrame({
  1070          'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
  1071          'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
  1072          'rating': [4, 4, 3.5, 15, 5]
  1073      })
  1074  
  1075      self._run_test(lambda df: df.drop_duplicates(keep=False), df)
  1076      self._run_test(
  1077          lambda df: df.drop_duplicates(subset=['brand'], keep=False), df)
  1078      self._run_test(
  1079          lambda df: df.drop_duplicates(subset=['brand', 'style'], keep=False),
  1080          df)
  1081  
  1082    @parameterized.expand([
  1083        (
  1084            lambda base: base.from_dict({
  1085                'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']
  1086            }), ),
  1087        (
  1088            lambda base: base.from_dict({
  1089                'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']
  1090            },
  1091                                        orient='index'), ),
  1092        (
  1093            lambda base: base.from_records(
  1094                np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')],
  1095                         dtype=[('col_1', 'i4'), ('col_2', 'U1')])), ),
  1096    ])
  1097    def test_create_methods(self, func):
  1098      expected = func(pd.DataFrame)
  1099  
  1100      deferred_df = func(frames.DeferredDataFrame)
  1101      actual = expressions.Session({}).evaluate(deferred_df._expr)
  1102  
  1103      pd.testing.assert_frame_equal(actual, expected)
  1104  
  1105    def test_replace(self):
  1106      # verify a replace() doctest case that doesn't quite work in Beam as it uses
  1107      # the default method='pad'
  1108      df = pd.DataFrame({'A': ['bat', 'foo', 'bait'], 'B': ['abc', 'bar', 'xyz']})
  1109  
  1110      self._run_test(
  1111          lambda df: df.replace(
  1112              regex={
  1113                  r'^ba.$': 'new', 'foo': 'xyz'
  1114              }, method=None),
  1115          df)
  1116  
  1117    def test_sample_columns(self):
  1118      df = pd.DataFrame({
  1119          'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
  1120          'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
  1121          'rating': [4, 4, 3.5, 15, 5]
  1122      })
  1123  
  1124      self._run_test(lambda df: df.sample(axis=1, n=2, random_state=1), df)
  1125      self._run_error_test(lambda df: df.sample(axis=1, n=10, random_state=2), df)
  1126      self._run_test(
  1127          lambda df: df.sample(axis=1, n=10, random_state=3, replace=True), df)
  1128  
  1129    def test_cat(self):
   1130      # Replicate the doctests from CategoricalAccessor.
  1131      # These tests don't translate into pandas_doctests_test.py because it
  1132      # tries to use astype("category") in Beam, which makes a non-deferred
  1133      # column type.
  1134      s = pd.Series(list("abbccc")).astype("category")
  1135  
  1136      self._run_test(lambda s: s.cat.rename_categories(list("cba")), s)
  1137      self._run_test(lambda s: s.cat.reorder_categories(list("cba")), s)
  1138      self._run_test(lambda s: s.cat.add_categories(["d", "e"]), s)
  1139      self._run_test(lambda s: s.cat.remove_categories(["a", "c"]), s)
  1140      self._run_test(lambda s: s.cat.set_categories(list("abcde")), s)
  1141      self._run_test(lambda s: s.cat.as_ordered(), s)
  1142      self._run_test(lambda s: s.cat.as_unordered(), s)
  1143      self._run_test(lambda s: s.cat.codes, s)
  1144  
  1145    @parameterized.expand(frames.ELEMENTWISE_DATETIME_PROPERTIES)
  1146    def test_dt_property(self, prop_name):
  1147      # Generate a series with a lot of unique timestamps
  1148      s = pd.Series(
  1149          pd.date_range('1/1/2000', periods=100, freq='m') +
  1150          pd.timedelta_range(start='0 days', end='70 days', periods=100))
  1151      self._run_test(lambda s: getattr(s.dt, prop_name), s)
  1152  
  1153    @parameterized.expand([
  1154        ('month_name', {}),
  1155        ('day_name', {}),
  1156        ('normalize', {}),
  1157        (
  1158            'strftime',
  1159            {
  1160                'date_format': '%B %d, %Y, %r'
  1161            },
  1162        ),
  1163        ('tz_convert', {
  1164            'tz': 'Europe/Berlin'
  1165        }),
  1166    ])
  1167    def test_dt_method(self, op, kwargs):
  1168      # Generate a series with a lot of unique timestamps
  1169      s = pd.Series(
  1170          pd.date_range(
  1171              '1/1/2000', periods=100, freq='m', tz='America/Los_Angeles') +
  1172          pd.timedelta_range(start='0 days', end='70 days', periods=100))
  1173  
  1174      self._run_test(lambda s: getattr(s.dt, op)(**kwargs), s)
  1175  
  1176    def test_dt_tz_localize_ambiguous_series(self):
  1177      # This replicates a dt.tz_localize doctest:
  1178      #   s.tz_localize('CET', ambiguous=np.array([True, True, False]))
  1179      # But using a DeferredSeries instead of a np array
  1180  
  1181      s = pd.to_datetime(
  1182          pd.Series([
  1183              '2018-10-28 01:20:00', '2018-10-28 02:36:00', '2018-10-28 03:46:00'
  1184          ]))
  1185      ambiguous = pd.Series([True, True, False], index=s.index)
  1186  
  1187      self._run_test(
  1188          lambda s,
  1189          ambiguous: s.dt.tz_localize('CET', ambiguous=ambiguous),
  1190          s,
  1191          ambiguous)
  1192  
  1193    def test_dt_tz_localize_nonexistent(self):
  1194      # This replicates dt.tz_localize doctests that exercise `nonexistent`.
  1195      # However they specify ambiguous='NaT' because the default,
  1196      # ambiguous='infer', is not supported.
  1197      s = pd.to_datetime(
  1198          pd.Series(['2015-03-29 02:30:00', '2015-03-29 03:30:00']))
  1199  
  1200      self._run_test(
  1201          lambda s: s.dt.tz_localize(
  1202              'Europe/Warsaw', ambiguous='NaT', nonexistent='shift_forward'),
  1203          s)
  1204      self._run_test(
  1205          lambda s: s.dt.tz_localize(
  1206              'Europe/Warsaw', ambiguous='NaT', nonexistent='shift_backward'),
  1207          s)
  1208      self._run_test(
  1209          lambda s: s.dt.tz_localize(
  1210              'Europe/Warsaw', ambiguous='NaT', nonexistent=pd.Timedelta('1H')),
  1211          s)
  1212  
  1213    def test_compare_series(self):
  1214      s1 = pd.Series(["a", "b", "c", "d", "e"])
  1215      s2 = pd.Series(["a", "a", "c", "b", "e"])
  1216  
  1217      self._run_test(lambda s1, s2: s1.compare(s2), s1, s2)
  1218      self._run_test(lambda s1, s2: s1.compare(s2, align_axis=0), s1, s2)
  1219      self._run_test(lambda s1, s2: s1.compare(s2, keep_shape=True), s1, s2)
  1220      self._run_test(
  1221          lambda s1, s2: s1.compare(s2, keep_shape=True, keep_equal=True), s1, s2)
  1222  
  1223    def test_compare_dataframe(self):
  1224      df1 = pd.DataFrame(
  1225          {
  1226              "col1": ["a", "a", "b", "b", "a"],
  1227              "col2": [1.0, 2.0, 3.0, np.nan, 5.0],
  1228              "col3": [1.0, 2.0, 3.0, 4.0, 5.0]
  1229          },
  1230          columns=["col1", "col2", "col3"],
  1231      )
  1232      df2 = df1.copy()
  1233      df2.loc[0, 'col1'] = 'c'
  1234      df2.loc[2, 'col3'] = 4.0
  1235  
  1236      # Skipped because keep_shape=False won't be implemented
  1237      with self.assertRaisesRegex(
  1238          frame_base.WontImplementError,
  1239          r"compare\(align_axis\=1, keep_shape\=False\) is not allowed"):
  1240        self._run_test(lambda df1, df2: df1.compare(df2), df1, df2)
  1241  
  1242      self._run_test(
  1243          lambda df1,
  1244          df2: df1.compare(df2, align_axis=0),
  1245          df1,
  1246          df2,
  1247          check_proxy=False)
  1248      self._run_test(lambda df1, df2: df1.compare(df2, keep_shape=True), df1, df2)
  1249      self._run_test(
  1250          lambda df1,
  1251          df2: df1.compare(df2, align_axis=0, keep_shape=True),
  1252          df1,
  1253          df2)
  1254      self._run_test(
  1255          lambda df1,
  1256          df2: df1.compare(df2, keep_shape=True, keep_equal=True),
  1257          df1,
  1258          df2)
  1259      self._run_test(
  1260          lambda df1,
  1261          df2: df1.compare(df2, align_axis=0, keep_shape=True, keep_equal=True),
  1262          df1,
  1263          df2)
  1264  
  1265    def test_idxmin(self):
  1266      df = pd.DataFrame({
  1267          'consumption': [10.51, 103.11, 55.48],
  1268          'co2_emissions': [37.2, 19.66, 1712]
  1269      },
  1270                        index=['Pork', 'Wheat Products', 'Beef'])
  1271  
  1272      df2 = df.copy()
  1273      df2.loc['Pork', 'co2_emissions'] = None
  1274      df2.loc['Wheat Products', 'consumption'] = None
  1275      df2.loc['Beef', 'co2_emissions'] = None
  1276  
  1277      df3 = pd.DataFrame({
  1278          'consumption': [1.1, 2.2, 3.3], 'co2_emissions': [3.3, 2.2, 1.1]
  1279      },
  1280                         index=[0, 1, 2])
  1281  
  1282      s = pd.Series(data=[4, 3, None, 1], index=['A', 'B', 'C', 'D'])
  1283      s2 = pd.Series(data=[1, 2, 3], index=[1, 2, 3])
  1284  
  1285      self._run_test(lambda df: df.idxmin(), df)
  1286      self._run_test(lambda df: df.idxmin(skipna=False), df)
  1287      self._run_test(lambda df: df.idxmin(axis=1), df)
  1288      self._run_test(lambda df: df.idxmin(axis=1, skipna=False), df)
  1289      self._run_test(lambda df2: df2.idxmin(), df2)
  1290      self._run_test(lambda df2: df2.idxmin(axis=1), df2)
  1291      self._run_test(lambda df2: df2.idxmin(skipna=False), df2, check_proxy=False)
  1292      self._run_test(
  1293          lambda df2: df2.idxmin(axis=1, skipna=False), df2, check_proxy=False)
  1294      self._run_test(lambda df3: df3.idxmin(), df3)
  1295      self._run_test(lambda df3: df3.idxmin(axis=1), df3)
  1296      self._run_test(lambda df3: df3.idxmin(skipna=False), df3)
  1297      self._run_test(lambda df3: df3.idxmin(axis=1, skipna=False), df3)
  1298  
  1299      self._run_test(lambda s: s.idxmin(), s)
  1300      self._run_test(lambda s: s.idxmin(skipna=False), s, check_proxy=False)
  1301      self._run_test(lambda s2: s2.idxmin(), s2)
  1302      self._run_test(lambda s2: s2.idxmin(skipna=False), s2)
  1303  
  1304    def test_idxmax(self):
  1305      df = pd.DataFrame({
  1306          'consumption': [10.51, 103.11, 55.48],
  1307          'co2_emissions': [37.2, 19.66, 1712]
  1308      },
  1309                        index=['Pork', 'Wheat Products', 'Beef'])
  1310  
  1311      df2 = df.copy()
  1312      df2.loc['Pork', 'co2_emissions'] = None
  1313      df2.loc['Wheat Products', 'consumption'] = None
  1314      df2.loc['Beef', 'co2_emissions'] = None
  1315  
  1316      df3 = pd.DataFrame({
  1317          'consumption': [1.1, 2.2, 3.3], 'co2_emissions': [3.3, 2.2, 1.1]
  1318      },
  1319                         index=[0, 1, 2])
  1320  
  1321      s = pd.Series(data=[1, None, 4, 1], index=['A', 'B', 'C', 'D'])
  1322      s2 = pd.Series(data=[1, 2, 3], index=[1, 2, 3])
  1323  
  1324      self._run_test(lambda df: df.idxmax(), df)
  1325      self._run_test(lambda df: df.idxmax(skipna=False), df)
  1326      self._run_test(lambda df: df.idxmax(axis=1), df)
  1327      self._run_test(lambda df: df.idxmax(axis=1, skipna=False), df)
  1328      self._run_test(lambda df2: df2.idxmax(), df2)
  1329      self._run_test(lambda df2: df2.idxmax(axis=1), df2)
  1330      self._run_test(
  1331          lambda df2: df2.idxmax(axis=1, skipna=False), df2, check_proxy=False)
  1332      self._run_test(lambda df2: df2.idxmax(skipna=False), df2, check_proxy=False)
  1333      self._run_test(lambda df3: df3.idxmax(), df3)
  1334      self._run_test(lambda df3: df3.idxmax(axis=1), df3)
  1335      self._run_test(lambda df3: df3.idxmax(skipna=False), df3)
  1336      self._run_test(lambda df3: df3.idxmax(axis=1, skipna=False), df3)
  1337  
  1338      self._run_test(lambda s: s.idxmax(), s)
  1339      self._run_test(lambda s: s.idxmax(skipna=False), s, check_proxy=False)
  1340      self._run_test(lambda s2: s2.idxmax(), s2)
  1341      self._run_test(lambda s2: s2.idxmax(skipna=False), s2)
  1342  
  1343    def test_pipe(self):
  1344      def df_times(df, column, times):
  1345        df[column] = df[column] * times
  1346        return df
  1347  
  1348      def df_times_shuffled(column, times, df):
  1349        return df_times(df, column, times)
  1350  
  1351      def s_times(s, times):
  1352        return s * times
  1353  
  1354      def s_times_shuffled(times, s):
  1355        return s_times(s, times)
  1356  
  1357      df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=[0, 1, 2])
  1358      s = pd.Series([1, 2, 3, 4, 5], index=[0, 1, 2, 3, 4])
  1359  
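            # Note: (callable, data_keyword) is the standard pandas pipe()
            # tuple form; e.g. df.pipe((df_times_shuffled, 'df'), 'A', 2)
            # calls df_times_shuffled('A', 2, df=df).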
  1360      self._run_inplace_test(lambda df: df.pipe(df_times, 'A', 2), df)
  1361      self._run_inplace_test(
  1362          lambda df: df.pipe((df_times_shuffled, 'df'), 'A', 2), df)
  1363  
  1364      self._run_test(lambda s: s.pipe(s_times, 2), s)
  1365      self._run_test(lambda s: s.pipe((s_times_shuffled, 's'), 2), s)
  1366  
  1367    def test_unstack_pandas_series_not_multiindex(self):
  1368      # pandas should raise a ValueError when unstack() is called
  1369      # on a Series without a MultiIndex
  1370      s = pd.Series([1, 2, 3, 4], index=['one', 'two', 'three', 'four'])
  1371      with self.assertRaises((AttributeError, ValueError)):
  1372        self._run_test(lambda s: s.unstack(), s)
  1373  
  1374    def test_unstack_non_categorical_index(self):
  1375      index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), ('two', 'a'),
  1376                                         ('two', 'b')])
  1377      index = index.set_levels(
  1378          index.levels[0].astype(pd.CategoricalDtype(['one', 'two'])), level=0)
  1379      s = pd.Series(np.arange(1.0, 5.0), index=index)
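            # Only level 0 of the index is categorical here; the level being
            # unstacked is not, so the output columns would depend on the
            # data.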
  1380      with self.assertRaisesRegex(
  1381          frame_base.WontImplementError,
  1382          r"unstack\(\) is only supported on DataFrames if"):
  1383        self._run_test(lambda s: s.unstack(level=-1), s)
  1384  
  1385    def _unstack_get_categorical_index(self):
  1386      index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), ('two', 'a'),
  1387                                         ('two', 'b')])
  1388      index = index.set_levels(
  1389          index.levels[0].astype(pd.CategoricalDtype(['one', 'two'])), level=0)
  1390      index = index.set_levels(
  1391          index.levels[1].astype(pd.CategoricalDtype(['a', 'b'])), level=1)
  1392      return index
  1393  
  1394    def test_unstack_pandas_example1(self):
  1395      index = self._unstack_get_categorical_index()
  1396      s = pd.Series(np.arange(1.0, 5.0), index=index)
  1397      self._run_test(lambda s: s.unstack(level=-1), s)
  1398  
  1399    def test_unstack_pandas_example2(self):
  1400      index = self._unstack_get_categorical_index()
  1401      s = pd.Series(np.arange(1.0, 5.0), index=index)
  1402      self._run_test(lambda s: s.unstack(level=0), s)
  1403  
  1404    def test_unstack_pandas_example3(self):
  1405      index = self._unstack_get_categorical_index()
  1406      s = pd.Series(np.arange(1.0, 5.0), index=index)
  1407      df = s.unstack(level=0)
  1408      if PD_VERSION < (1, 2):
  1409        with self.assertRaisesRegex(
  1410            frame_base.WontImplementError,
  1411            r"unstack\(\) is not supported when using pandas < 1.2.0"):
  1412          self._run_test(lambda df: df.unstack(), df)
  1413      else:
  1414        self._run_test(lambda df: df.unstack(), df)
  1415  
  1416    @unittest.skipIf(
  1417        PD_VERSION < (1, 4),
  1418        "Cannot set dtype of index to boolean for pandas<1.4")
  1419    def test_unstack_bool(self):
  1420      index = pd.MultiIndex.from_tuples([(True, 'a'), (True, 'b'), (False, 'a'),
  1421                                         (False, 'b')])
  1422      index = index.set_levels(index.levels[0].astype('boolean'), level=0)
  1423      index = index.set_levels(
  1424          index.levels[1].astype(pd.CategoricalDtype(['a', 'b'])), level=1)
  1425      s = pd.Series(np.arange(1.0, 5.0), index=index)
  1426      self._run_test(lambda s: s.unstack(level=0), s)
  1427  
  1428    def test_unstack_series_multiple_index_levels(self):
  1429      tuples = list(
  1430          zip(
  1431              *[
  1432                  ["bar", "bar", "bar", "bar", "baz", "baz", "baz", "baz"],
  1433                  ["one", "one", "two", "two", "one", "one", "two", "two"],
  1434                  ["A", "B", "A", "B", "A", "B", "A", "B"],
  1435              ]))
  1436      index = pd.MultiIndex.from_tuples(
  1437          tuples, names=["first", "second", "third"])
  1438      index = index.set_levels(
  1439          index.levels[0].astype(pd.CategoricalDtype(['bar', 'baz'])), level=0)
  1440      index = index.set_levels(
  1441          index.levels[1].astype(pd.CategoricalDtype(['one', 'two'])), level=1)
  1442      index = index.set_levels(
  1443          index.levels[2].astype(pd.CategoricalDtype(['A', 'B'])), level=2)
  1444      df = pd.Series(np.random.randn(8), index=index)
  1445      self._run_test(lambda df: df.unstack(level=['first', 'third']), df)
  1446  
  1447    def test_unstack_series_multiple_index_and_column_levels(self):
  1448      columns = pd.MultiIndex.from_tuples(
  1449          [
  1450              ("A", "cat", "long"),
  1451              ("B", "cat", "long"),
  1452              ("A", "dog", "short"),
  1453              ("B", "dog", "short"),
  1454          ],
  1455          names=["exp", "animal", "hair_length"],
  1456      )
  1457      index = pd.MultiIndex.from_product(
  1458          [['one', 'two'], ['a', 'b'], ['bar', 'baz']],
  1459          names=["first", "second", "third"])
  1460      index = index.set_levels(
  1461          index.levels[0].astype(pd.CategoricalDtype(['one', 'two'])), level=0)
  1462      index = index.set_levels(
  1463          index.levels[1].astype(pd.CategoricalDtype(['a', 'b'])), level=1)
  1464      index = index.set_levels(
  1465          index.levels[2].astype(pd.CategoricalDtype(['bar', 'baz'])), level=2)
  1466      df = pd.DataFrame(np.random.randn(8, 4), index=index, columns=columns)
  1467      df = df.stack(level=["animal", "hair_length"])
  1468      self._run_test(lambda df: df.unstack(level=['second', 'third']), df)
  1469      self._run_test(lambda df: df.unstack(level=['second']), df)
  1470  
  1471    def test_pivot_non_categorical(self):
  1472      df = pd.DataFrame({
  1473          'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
  1474          'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
  1475          'baz': [1, 2, 3, 4, 5, 6],
  1476          'zoo': ['x', 'y', 'z', 'q', 'w', 't']
  1477      })
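            # 'bar' is a plain object column here, so the set of output
            # columns would depend on the data. Beam needs a categorical
            # dtype so the columns are known at construction time.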
  1478      with self.assertRaisesRegex(
  1479          frame_base.WontImplementError,
  1480          r"pivot\(\) of non-categorical type is not supported"):
  1481        self._run_test(
  1482            lambda df: df.pivot(index='foo', columns='bar', values='baz'), df)
  1483  
  1484    def test_pivot_pandas_example1(self):
  1485      # Simple test 1
  1486      df = pd.DataFrame({
  1487          'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
  1488          'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
  1489          'baz': [1, 2, 3, 4, 5, 6],
  1490          'zoo': ['x', 'y', 'z', 'q', 'w', 't']
  1491      })
  1492      df['bar'] = df['bar'].astype(
  1493          pd.CategoricalDtype(categories=['A', 'B', 'C']))
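            # For reference, pandas produces (per the pivot() docs):
            #   bar  A  B  C
            #   foo
            #   one  1  2  3
            #   two  4  5  6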
  1494      self._run_test(
  1495          lambda df: df.pivot(index='foo', columns='bar', values='baz'), df)
  1496      self._run_test(
  1497          lambda df: df.pivot(index=['foo'], columns='bar', values='baz'), df)
  1498  
  1499    def test_pivot_pandas_example3(self):
  1500      # Multiple values
  1501      df = pd.DataFrame({
  1502          'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
  1503          'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
  1504          'baz': [1, 2, 3, 4, 5, 6],
  1505          'zoo': ['x', 'y', 'z', 'q', 'w', 't']
  1506      })
  1507      df['bar'] = df['bar'].astype(
  1508          pd.CategoricalDtype(categories=['A', 'B', 'C']))
  1509      self._run_test(
  1510          lambda df: df.pivot(index='foo', columns='bar', values=['baz', 'zoo']),
  1511          df)
  1512      self._run_test(
  1513          lambda df: df.pivot(
  1514              index='foo', columns=['bar'], values=['baz', 'zoo']),
  1515          df)
  1516  
  1517    def test_pivot_pandas_example4(self):
  1518      # Multiple columns
  1519      df = pd.DataFrame({
  1520          "lev1": [1, 1, 1, 2, 2, 2],
  1521          "lev2": [1, 1, 2, 1, 1, 2],
  1522          "lev3": [1, 2, 1, 2, 1, 2],
  1523          "lev4": [1, 2, 3, 4, 5, 6],
  1524          "values": [0, 1, 2, 3, 4, 5]
  1525      })
  1526      df['lev2'] = df['lev2'].astype(pd.CategoricalDtype(categories=[1, 2]))
  1527      df['lev3'] = df['lev3'].astype(pd.CategoricalDtype(categories=[1, 2]))
  1528      df['values'] = df['values'].astype('Int64')
  1529      self._run_test(
  1530          lambda df: df.pivot(
  1531              index="lev1", columns=["lev2", "lev3"], values="values"),
  1532          df)
  1533  
  1534    def test_pivot_pandas_example5(self):
  1535      # Multiple index
  1536      df = pd.DataFrame({
  1537          "lev1": [1, 1, 1, 2, 2, 2],
  1538          "lev2": [1, 1, 2, 1, 1, 2],
  1539          "lev3": [1, 2, 1, 2, 1, 2],
  1540          "lev4": [1, 2, 3, 4, 5, 6],
  1541          "values": [0, 1, 2, 3, 4, 5]
  1542      })
  1543      df['lev3'] = df['lev3'].astype(pd.CategoricalDtype(categories=[1, 2]))
  1544      # Cast to nullable Int64 because Beam doesn't do the correct conversion to
  1545      # float64
  1546      df['values'] = df['values'].astype('Int64')
  1547      if PD_VERSION < (1, 4):
  1548        with self.assertRaisesRegex(
  1549            frame_base.WontImplementError,
  1550            r"pivot\(\) is not supported when pandas<1.4 and index is a Multi"):
  1551          self._run_test(
  1552              lambda df: df.pivot(
  1553                  index=["lev1", "lev2"], columns=["lev3"], values="values"),
  1554              df)
  1555      else:
  1556        self._run_test(
  1557            lambda df: df.pivot(
  1558                index=["lev1", "lev2"], columns=["lev3"], values="values"),
  1559            df)
  1560  
  1561    def test_pivot_pandas_example6(self):
  1562      # pandas raises a ValueError when there are duplicate index/column pairs
  1563      df = pd.DataFrame({
  1564          "foo": ['one', 'one', 'two', 'two'],
  1565          "bar": ['A', 'A', 'B', 'C'],
  1566          "baz": [1, 2, 3, 4]
  1567      })
  1568      df['bar'] = df['bar'].astype(
  1569          pd.CategoricalDtype(categories=['A', 'B', 'C']))
  1570      self._run_error_test(
  1571          lambda df: df.pivot(index='foo', columns='bar', values='baz'),
  1572          df,
  1573          construction_time=False)
  1574  
  1575    def test_pivot_no_index_provided_on_single_level_index(self):
  1576      # Multiple columns, no index value provided
  1577      df = pd.DataFrame({
  1578          "lev1": [1, 1, 1, 2, 2, 2],
  1579          "lev2": [1, 1, 2, 1, 1, 2],
  1580          "lev3": [1, 2, 1, 2, 1, 2],
  1581          "lev4": [1, 2, 3, 4, 5, 6],
  1582          "values": [0, 1, 2, 3, 4, 5]
  1583      })
  1584      df['lev2'] = df['lev2'].astype(pd.CategoricalDtype(categories=[1, 2]))
  1585      df['lev3'] = df['lev3'].astype(pd.CategoricalDtype(categories=[1, 2]))
  1586      df['values'] = df['values'].astype('Int64')
  1587      self._run_test(
  1588          lambda df: df.pivot(columns=["lev2", "lev3"], values="values"), df)
  1589  
  1590    def test_pivot_no_index_provided_on_multiindex(self):
  1591      # Multiple columns, no index value provided
  1592      tuples = list(
  1593          zip(
  1594              *[
  1595                  ["bar", "bar", "bar", "baz", "baz", "baz"],
  1596                  [
  1597                      "one",
  1598                      "two",
  1599                      "three",
  1600                      "one",
  1601                      "two",
  1602                      "three",
  1603                  ],
  1604              ]))
  1605      index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
  1606      df = pd.DataFrame({
  1607          "lev1": [1, 1, 1, 2, 2, 2],
  1608          "lev2": [1, 1, 2, 1, 1, 2],
  1609          "lev3": [1, 2, 1, 2, 1, 2],
  1610          "lev4": [1, 2, 3, 4, 5, 6],
  1611          "values": [0, 1, 2, 3, 4, 5]
  1612      },
  1613                        index=index)
  1614      df['lev2'] = df['lev2'].astype(pd.CategoricalDtype(categories=[1, 2]))
  1615      df['lev3'] = df['lev3'].astype(pd.CategoricalDtype(categories=[1, 2]))
  1616      df['values'] = df['values'].astype('float64')
  1617      df['lev1'] = df['lev1'].astype('int64')
  1618      df['lev4'] = df['lev4'].astype('int64')
  1619      if PD_VERSION < (1, 4):
  1620        with self.assertRaisesRegex(
  1621            frame_base.WontImplementError,
  1622            r"pivot\(\) is not supported when pandas<1.4 and index is a Multi"):
  1623          self._run_test(lambda df: df.pivot(columns=["lev2", "lev3"]), df)
  1624      else:
  1625        self._run_test(
  1626            lambda df: df.pivot(columns=["lev2", "lev3"]),
  1627            df,
  1628            lenient_dtype_check=True)
  1629  
  1630  
  1631  # pandas doesn't support kurtosis on GroupBys:
  1632  # https://github.com/pandas-dev/pandas/issues/40139
  1633  ALL_GROUPING_AGGREGATIONS = sorted(
  1634      set(frames.ALL_AGGREGATIONS) - set(('kurt', 'kurtosis')))
  1635  
  1636  
  1637  class GroupByTest(_AbstractFrameTest):
  1638    """Tests for DataFrame/Series GroupBy operations."""
  1639    @staticmethod
  1640    def median_sum_fn(x):
  1641      with warnings.catch_warnings():
  1642        warnings.filterwarnings("ignore", message="Mean of empty slice")
  1643        return (x.foo + x.bar).median()
  1644  
  1645    @parameterized.expand(ALL_GROUPING_AGGREGATIONS)
  1646    def test_groupby_agg(self, agg_type):
  1647      if agg_type == 'describe' and PD_VERSION < (1, 2):
  1648        self.skipTest(
  1649            "https://github.com/apache/beam/issues/20967: proxy generation of "
  1650            "DataFrameGroupBy.describe fails in pandas < 1.2")
  1651      self._run_test(
  1652          lambda df: df.groupby('group').agg(agg_type),
  1653          GROUPBY_DF,
  1654          check_proxy=False)
  1655  
  1656    @parameterized.expand(ALL_GROUPING_AGGREGATIONS)
  1657    def test_groupby_with_filter(self, agg_type):
  1658      if agg_type == 'describe' and PD_VERSION < (1, 2):
  1659        self.skipTest(
  1660            "https://github.com/apache/beam/issues/20967: proxy generation of "
  1661            "DataFrameGroupBy.describe fails in pandas < 1.2")
  1662      self._run_test(
  1663          lambda df: getattr(df[df.foo > 30].groupby('group'), agg_type)(),
  1664          GROUPBY_DF,
  1665          check_proxy=False)
  1666  
  1667    @parameterized.expand(ALL_GROUPING_AGGREGATIONS)
  1668    def test_groupby(self, agg_type):
  1669      if agg_type == 'describe' and PD_VERSION < (1, 2):
  1670        self.skipTest(
  1671            "https://github.com/apache/beam/issues/20967: proxy generation of "
  1672            "DataFrameGroupBy.describe fails in pandas < 1.2")
  1673  
  1674      self._run_test(
  1675          lambda df: getattr(df.groupby('group'), agg_type)(),
  1676          GROUPBY_DF,
  1677          check_proxy=False)
  1678  
  1679    @parameterized.expand(ALL_GROUPING_AGGREGATIONS)
  1680    def test_groupby_series(self, agg_type):
  1681      if agg_type == 'describe' and PD_VERSION < (1, 2):
  1682        self.skipTest(
  1683            "https://github.com/apache/beam/issues/20967: proxy generation of "
  1684            "DataFrameGroupBy.describe fails in pandas < 1.2")
  1685  
  1686      self._run_test(
  1687          lambda df: getattr(df[df.foo > 40].groupby(df.group), agg_type)(),
  1688          GROUPBY_DF,
  1689          check_proxy=False)
  1690  
  1691    def test_groupby_user_guide(self):
  1692      # Example from https://pandas.pydata.org/docs/user_guide/groupby.html
  1693      arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
  1694                ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
  1695  
  1696      index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])
  1697  
  1698      df = pd.DataFrame({
  1699          'A': [1, 1, 1, 1, 2, 2, 3, 3], 'B': np.arange(8)
  1700      },
  1701                        index=index)
  1702  
  1703      self._run_test(lambda df: df.groupby(['second', 'A']).sum(), df)
  1704  
  1705    @parameterized.expand(ALL_GROUPING_AGGREGATIONS)
  1706    def test_groupby_project_series(self, agg_type):
  1707      df = GROUPBY_DF
  1708  
  1709      if agg_type == 'describe':
  1710        self.skipTest(
  1711            "https://github.com/apache/beam/issues/20967: proxy generation of "
  1712            "SeriesGroupBy.describe fails")
  1713      if agg_type in ('corr', 'cov'):
  1714        self.skipTest(
  1715            "https://github.com/apache/beam/issues/20895: "
  1716            "SeriesGroupBy.{corr, cov} do not raise the expected error.")
  1717  
  1718      self._run_test(lambda df: getattr(df.groupby('group').foo, agg_type)(), df)
  1719      self._run_test(lambda df: getattr(df.groupby('group').bar, agg_type)(), df)
  1720      self._run_test(
  1721          lambda df: getattr(df.groupby('group')['foo'], agg_type)(), df)
  1722      self._run_test(
  1723          lambda df: getattr(df.groupby('group')['bar'], agg_type)(), df)
  1724  
  1725    @parameterized.expand(ALL_GROUPING_AGGREGATIONS)
  1726    def test_groupby_project_dataframe(self, agg_type):
  1727      if agg_type == 'describe' and PD_VERSION < (1, 2):
  1728        self.skipTest(
  1729            "https://github.com/apache/beam/issues/20967: proxy generation of "
  1730            "DataFrameGroupBy.describe fails in pandas < 1.2")
  1731      self._run_test(
  1732          lambda df: getattr(df.groupby('group')[['bar', 'baz']], agg_type)(),
  1733          GROUPBY_DF,
  1734          check_proxy=False)
  1735  
  1736    def test_groupby_errors_bad_projection(self):
  1737      df = GROUPBY_DF
  1738  
  1739      # non-existent projection column
  1740      self._run_error_test(
  1741          lambda df: df.groupby('group')[['bar', 'baz']].bar.median(), df)
  1742      self._run_error_test(lambda df: df.groupby('group')[['bad']].median(), df)
  1743  
  1744      self._run_error_test(lambda df: df.groupby('group').bad.median(), df)
  1745  
  1746      self._run_error_test(
  1747          lambda df: df.groupby('group')[['bar', 'baz']].bar.sum(), df)
  1748      self._run_error_test(lambda df: df.groupby('group')[['bat']].sum(), df)
  1749      self._run_error_test(lambda df: df.groupby('group').bat.sum(), df)
  1750  
  1751    def test_groupby_errors_non_existent_label(self):
  1752      df = GROUPBY_DF
  1753  
  1754      # non-existent grouping label
  1755      self._run_error_test(
  1756          lambda df: df.groupby(['really_bad', 'foo', 'bad']).foo.sum(), df)
  1757      self._run_error_test(lambda df: df.groupby('bad').foo.sum(), df)
  1758  
  1759    def test_groupby_callable(self):
  1760      df = GROUPBY_DF
  1761  
  1762      self._run_test(lambda df: df.groupby(lambda x: x % 2).foo.sum(), df)
  1763      self._run_test(lambda df: df.groupby(lambda x: x % 5).median(), df)
  1764  
  1765    def test_groupby_apply(self):
  1766      df = GROUPBY_DF
  1767      # Note this is the same as DataFrameGroupBy.describe. Using it here is
  1768      # just a convenient way to test apply() with a user fn that returns a Series
  1769      describe = lambda df: df.describe()
  1770  
  1771      self._run_test(lambda df: df.groupby('group').foo.apply(describe), df)
  1772      self._run_test(
  1773          lambda df: df.groupby('group')[['foo', 'bar']].apply(describe), df)
  1774      self._run_test(lambda df: df.groupby('group').apply(self.median_sum_fn), df)
  1775      self._run_test(
  1776          lambda df: df.set_index('group').foo.groupby(level=0).apply(describe),
  1777          df)
  1778      self._run_test(lambda df: df.groupby(level=0).apply(self.median_sum_fn), df)
  1779      self._run_test(lambda df: df.groupby(lambda x: x % 3).apply(describe), df)
  1780      self._run_test(
  1781          lambda df: df.bar.groupby(lambda x: x % 3).apply(describe), df)
  1782      self._run_test(
  1783          lambda df: df.set_index(['str', 'group', 'bool']).groupby(
  1784              level='group').apply(self.median_sum_fn),
  1785          df)
  1786  
  1787    def test_groupby_apply_preserves_column_order(self):
  1788      df = GROUPBY_DF
  1789  
  1790      self._run_test(
  1791          lambda df: df[['foo', 'group', 'bar']].groupby('group').apply(
  1792              lambda x: x),
  1793          df)
  1794  
  1795    def test_groupby_transform(self):
  1796      df = pd.DataFrame({
  1797          "Date": [
  1798              "2015-05-08",
  1799              "2015-05-07",
  1800              "2015-05-06",
  1801              "2015-05-05",
  1802              "2015-05-08",
  1803              "2015-05-07",
  1804              "2015-05-06",
  1805              "2015-05-05"
  1806          ],
  1807          "Data": [5, 8, 6, 1, 50, 100, 60, 120],
  1808      })
  1809  
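            # transform() broadcasts each group's aggregate back to the
            # original shape, e.g. both '2015-05-08' rows (Data 5 and 50) map
            # to their group sum, 55.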
  1810      self._run_test(lambda df: df.groupby('Date')['Data'].transform(np.sum), df)
  1811      self._run_test(
  1812          lambda df: df.groupby('Date')['Data'].transform(
  1813              lambda x: (x - x.mean()) / x.std()),
  1814          df)
  1815  
  1816    def test_groupby_pipe(self):
  1817      df = GROUPBY_DF
  1818  
  1819      self._run_test(lambda df: df.groupby('group').pipe(lambda x: x.sum()), df)
  1820      self._run_test(
  1821          lambda df: df.groupby('group')['bool'].pipe(lambda x: x.any()), df)
  1822      self._run_test(
  1823          lambda df: df.groupby(['group', 'foo']).pipe(
  1824              (lambda a, x: x.sum(numeric_only=a), 'x'), False),
  1825          df,
  1826          check_proxy=False)
  1827  
  1828    def test_groupby_apply_modified_index(self):
  1829      df = GROUPBY_DF
  1830  
  1831      # If the apply fn modifies the index, then the output will include the
  1832      # grouped index.
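            # e.g. for the filtering fn below pandas prepends the group key,
            # yielding a ('group', <original index>) MultiIndex on the result.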
  1833      self._run_test(
  1834          lambda df: df.groupby('group').apply(
  1835              lambda x: x[x.foo > x.foo.median()]),
  1836          df)
  1837  
  1838    @unittest.skip('https://github.com/apache/beam/issues/20762')
  1839    def test_groupby_aggregate_grouped_column(self):
  1840      df = pd.DataFrame({
  1841          'group': ['a' if i % 5 == 0 or i % 3 == 0 else 'b' for i in range(100)],
  1842          'foo': [None if i % 11 == 0 else i for i in range(100)],
  1843          'bar': [None if i % 7 == 0 else 99 - i for i in range(100)],
  1844          'baz': [None if i % 13 == 0 else i * 2 for i in range(100)],
  1845      })
  1846  
  1847      self._run_test(lambda df: df.groupby('group').group.count(), df)
  1848      self._run_test(lambda df: df.groupby('group')[['group', 'bar']].count(), df)
  1849      self._run_test(
  1850          lambda df: df.groupby('group')[['group', 'bar']].apply(
  1851              lambda x: x.describe()),
  1852          df)
  1853  
  1854    @parameterized.expand((x, ) for x in [
  1855        0,
  1856        [1],
  1857        3,
  1858        [0, 3],
  1859        [2, 1],
  1860        ['foo', 0],
  1861        [1, 'str'],
  1862        [3, 0, 2, 1],
  1863    ])
  1864    def test_groupby_level_agg(self, level):
  1865      df = GROUPBY_DF.set_index(['group', 'foo', 'bar', 'str'], drop=False)
  1866      self._run_test(lambda df: df.groupby(level=level).bar.max(), df)
  1867      self._run_test(
  1868          lambda df: df.groupby(level=level).sum(numeric_only=True), df)
  1869      self._run_test(
  1870          lambda df: df.groupby(level=level).apply(self.median_sum_fn), df)
  1871  
  1872    @unittest.skipIf(PD_VERSION < (1, 1), "dropna= added in pandas 1.1.0")
  1873    def test_groupby_count_na(self):
  1874      # Verify we can do a groupby.count() that doesn't drop NaN values
  1875      self._run_test(
  1876          lambda df: df.groupby('foo', dropna=True).bar.count(), GROUPBY_DF)
  1877      self._run_test(
  1878          lambda df: df.groupby('foo', dropna=False).bar.count(), GROUPBY_DF)
  1879  
  1880    def test_groupby_sum_min_count(self):
  1881      df = pd.DataFrame({
  1882          'good': [1, 2, 3, np.nan],
  1883          'bad': [np.nan, np.nan, np.nan, 4],
  1884          'group': ['a', 'b', 'a', 'b']
  1885      })
  1886  
  1887      self._run_test(lambda df: df.groupby('group').sum(min_count=2), df)
  1888  
  1889    def test_groupby_dtypes(self):
  1890      self._run_test(
  1891          lambda df: df.groupby('group').dtypes, GROUPBY_DF, check_proxy=False)
  1892      self._run_test(
  1893          lambda df: df.groupby(level=0).dtypes, GROUPBY_DF, check_proxy=False)
  1894  
  1895    @parameterized.expand(ALL_GROUPING_AGGREGATIONS)
  1896    def test_dataframe_groupby_series(self, agg_type):
  1897      if agg_type == 'describe' and PD_VERSION < (1, 2):
  1898        self.skipTest(
  1899            "https://github.com/apache/beam/issues/20967: proxy generation of "
  1900            "DataFrameGroupBy.describe fails in pandas < 1.2")
  1901      self._run_test(
  1902          lambda df: df[df.foo > 40].groupby(df.group).agg(agg_type),
  1903          GROUPBY_DF,
  1904          check_proxy=False)
  1905      self._run_test(
  1906          lambda df: df[df.foo > 40].groupby(df.foo % 3).agg(agg_type),
  1907          GROUPBY_DF,
  1908          check_proxy=False)
  1909  
  1910    @parameterized.expand(ALL_GROUPING_AGGREGATIONS)
  1911    def test_series_groupby_series(self, agg_type):
  1912      if agg_type == 'describe':
  1913        self.skipTest(
  1914            "https://github.com/apache/beam/issues/20967: proxy generation of "
  1915            "SeriesGroupBy.describe fails")
  1916      if agg_type in ('corr', 'cov'):
  1917        self.skipTest(
  1918            "https://github.com/apache/beam/issues/20895: "
  1919            "SeriesGroupBy.{corr, cov} do not raise the expected error.")
  1920      self._run_test(
  1921          lambda df: df[df.foo < 40].bar.groupby(df.group).agg(agg_type),
  1922          GROUPBY_DF)
  1923      self._run_test(
  1924          lambda df: df[df.foo < 40].bar.groupby(df.foo % 3).agg(agg_type),
  1925          GROUPBY_DF)
  1926  
  1927    def test_groupby_series_apply(self):
  1928      df = GROUPBY_DF
  1929  
  1930      # Note this is the same as DataFrameGroupBy.describe. Using it here is
  1931      # just a convenient way to test apply() with a user fn that returns a Series
  1932      describe = lambda df: df.describe()
  1933  
  1934      self._run_test(lambda df: df.groupby(df.group).foo.apply(describe), df)
  1935      self._run_test(
  1936          lambda df: df.groupby(df.group)[['foo', 'bar']].apply(describe), df)
  1937      self._run_test(
  1938          lambda df: df.groupby(df.group).apply(self.median_sum_fn), df)
  1939  
  1940    def test_groupby_multiindex_keep_nans(self):
  1941      # Due to https://github.com/pandas-dev/pandas/issues/36470
  1942      # groupby(dropna=False) doesn't work with multiple columns
  1943      with self.assertRaisesRegex(NotImplementedError,
  1944                                  "https://github.com/apache/beam/issues/21014"):
  1945        self._run_test(
  1946            lambda df: df.groupby(['foo', 'bar'], dropna=False).sum(), GROUPBY_DF)
  1947  
  1948  
  1949  class AggregationTest(_AbstractFrameTest):
  1950    """Tests for global aggregation methods on DataFrame/Series."""
  1951  
  1952    # corr, cov on Series require an `other` argument
  1953    @parameterized.expand(
  1954        sorted(set(frames.ALL_AGGREGATIONS) - set(['corr', 'cov'])))
  1955    def test_series_agg(self, agg_method):
  1956      s = pd.Series(list(range(16)))
  1957  
  1958      nonparallel = agg_method in ('quantile', 'describe', 'median', 'sem', 'mad')
  1959  
  1960      # TODO(https://github.com/apache/beam/issues/20926): max and min produce
  1961      # the wrong proxy
  1962      check_proxy = agg_method not in ('max', 'min')
  1963  
  1964      self._run_test(
  1965          lambda s: s.agg(agg_method),
  1966          s,
  1967          nonparallel=nonparallel,
  1968          check_proxy=check_proxy)
  1969  
  1970    # corr, cov on Series require an `other` argument
  1971    # Series.size is a property
  1972    @parameterized.expand(
  1973        sorted(set(frames.ALL_AGGREGATIONS) - set(['corr', 'cov', 'size'])))
  1974    def test_series_agg_method(self, agg_method):
  1975      s = pd.Series(list(range(16)))
  1976  
  1977      nonparallel = agg_method in ('quantile', 'describe', 'median', 'sem', 'mad')
  1978  
  1979      # TODO(https://github.com/apache/beam/issues/20926): max and min produce
  1980      # the wrong proxy
  1981      check_proxy = agg_method not in ('max', 'min')
  1982  
  1983      self._run_test(
  1984          lambda s: getattr(s, agg_method)(),
  1985          s,
  1986          nonparallel=nonparallel,
  1987          check_proxy=check_proxy)
  1988  
  1989    @parameterized.expand(frames.ALL_AGGREGATIONS)
  1990    def test_dataframe_agg(self, agg_method):
  1991      df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, 3, 5, 7]})
  1992  
  1993      nonparallel = agg_method in ('quantile', 'describe', 'median', 'sem', 'mad')
  1994  
  1995      # TODO(https://github.com/apache/beam/issues/20926): max and min produce
  1996      # the wrong proxy
  1997      check_proxy = agg_method not in ('max', 'min')
  1998  
  1999      self._run_test(
  2000          lambda df: df.agg(agg_method),
  2001          df,
  2002          nonparallel=nonparallel,
  2003          check_proxy=check_proxy)
  2004  
  2005    # DataFrame.size is a property
  2006    @parameterized.expand(sorted(set(frames.ALL_AGGREGATIONS) - set(['size'])))
  2007    def test_dataframe_agg_method(self, agg_method):
  2008      df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, 3, 5, 7]})
  2009  
  2010      nonparallel = agg_method in ('quantile', 'describe', 'median', 'sem', 'mad')
  2011  
  2012      # TODO(https://github.com/apache/beam/issues/20926): max and min produce
  2013      # the wrong proxy
  2014      check_proxy = agg_method not in ('max', 'min')
  2015  
  2016      self._run_test(
  2017          lambda df: getattr(df, agg_method)(),
  2018          df,
  2019          nonparallel=nonparallel,
  2020          check_proxy=check_proxy)
  2021  
  2022    def test_series_agg_modes(self):
  2023      s = pd.Series(list(range(16)))
  2024      self._run_test(lambda s: s.agg('sum'), s)
  2025      self._run_test(lambda s: s.agg(['sum']), s)
  2026      self._run_test(lambda s: s.agg(['sum', 'mean']), s)
  2027      self._run_test(lambda s: s.agg(['mean']), s)
  2028      self._run_test(lambda s: s.agg('mean'), s)
  2029  
  2030    def test_dataframe_agg_modes(self):
  2031      df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, 3, 5, 7]})
  2032      self._run_test(lambda df: df.agg('sum'), df)
  2033      self._run_test(lambda df: df.agg(['sum', 'mean']), df)
  2034      self._run_test(lambda df: df.agg({'A': 'sum', 'B': 'sum'}), df)
  2035      self._run_test(lambda df: df.agg({'A': 'sum', 'B': 'mean'}), df)
  2036      self._run_test(lambda df: df.agg({'A': ['sum', 'mean']}), df)
  2037      self._run_test(lambda df: df.agg({'A': ['sum', 'mean'], 'B': 'min'}), df)
  2038  
  2039    def test_series_agg_level(self):
  2040      self._run_test(
  2041          lambda df: df.set_index(['group', 'foo']).bar.count(level=0),
  2042          GROUPBY_DF)
  2043      self._run_test(
  2044          lambda df: df.set_index(['group', 'foo']).bar.max(level=0), GROUPBY_DF)
  2045  
  2046      self._run_test(
  2047          lambda df: df.set_index(['group', 'foo']).bar.median(level=0),
  2048          GROUPBY_DF)
  2049  
  2050      self._run_test(
  2051          lambda df: df.set_index(['foo', 'group']).bar.count(level=1),
  2052          GROUPBY_DF)
  2053      self._run_test(
  2054          lambda df: df.set_index(['group', 'foo']).bar.max(level=1), GROUPBY_DF)
  2055      self._run_test(
  2056          lambda df: df.set_index(['group', 'foo']).bar.max(level='foo'),
  2057          GROUPBY_DF)
  2058      self._run_test(
  2059          lambda df: df.set_index(['group', 'foo']).bar.median(level=1),
  2060          GROUPBY_DF)
  2061  
  2062    def test_dataframe_agg_level(self):
  2063      self._run_test(
  2064          lambda df: df.set_index(['group', 'foo']).count(level=0), GROUPBY_DF)
  2065      self._run_test(
  2066          lambda df: df.set_index(['group', 'foo']).max(
  2067              level=0, numeric_only=False),
  2068          GROUPBY_DF,
  2069          check_proxy=False)
  2070      # The pandas implementation doesn't respect the numeric_only argument
  2071      # here (https://github.com/pandas-dev/pandas/issues/40788); it
  2072      # always acts as if numeric_only=True. Our implementation respects it,
  2073      # so we need to make it explicit.
  2074      self._run_test(
  2075          lambda df: df.set_index(['group', 'foo']).sum(
  2076              level=0, numeric_only=True),
  2077          GROUPBY_DF)
  2078  
  2079      self._run_test(
  2080          lambda df: df.set_index(['group', 'foo'])[['bar']].count(level=1),
  2081          GROUPBY_DF)
  2082      self._run_test(
  2083          lambda df: df.set_index(['group', 'foo']).count(level=1), GROUPBY_DF)
  2084      self._run_test(
  2085          lambda df: df.set_index(['group', 'foo']).max(
  2086              level=1, numeric_only=False),
  2087          GROUPBY_DF,
  2088          check_proxy=False)
  2089      # sum with str columns is order-sensitive
  2090      self._run_test(
  2091          lambda df: df.set_index(['group', 'foo']).sum(
  2092              level=1, numeric_only=True),
  2093          GROUPBY_DF)
  2094  
  2095      self._run_test(
  2096          lambda df: df.set_index(['group', 'foo']).median(
  2097              level=0, numeric_only=True),
  2098          GROUPBY_DF)
  2099      self._run_test(
  2100          lambda df: df.drop('str', axis=1).set_index(['foo', 'group']).median(
  2101              level=1, numeric_only=True),
  2102          GROUPBY_DF)
  2103  
  2104    def test_series_agg_multifunc_level(self):
  2105      # level= is ignored for multiple agg fns
  2106      self._run_test(
  2107          lambda df: df.set_index(['group', 'foo']).bar.agg(['min', 'max'],
  2108                                                            level=0),
  2109          GROUPBY_DF)
  2110  
  2111    def test_series_mean_skipna(self):
  2112      df = pd.DataFrame({
  2113          'one': [i if i % 8 == 0 else np.nan for i in range(8)],
  2114          'two': [i if i % 4 == 0 else np.nan for i in range(8)],
  2115          'three': [i if i % 2 == 0 else np.nan for i in range(8)],
  2116      })
  2117  
  2118      self._run_test(lambda df: df.one.mean(skipna=False), df)
  2119      self._run_test(lambda df: df.two.mean(skipna=False), df)
  2120      self._run_test(lambda df: df.three.mean(skipna=False), df)
  2121  
  2122      self._run_test(lambda df: df.one.mean(skipna=True), df)
  2123      self._run_test(lambda df: df.two.mean(skipna=True), df)
  2124      self._run_test(lambda df: df.three.mean(skipna=True), df)
  2125  
  2126    def test_dataframe_agg_multifunc_level(self):
  2127      # level= is ignored for multiple agg fns
  2128      self._run_test(
  2129          lambda df: df.set_index(['group', 'foo']).agg(['min', 'max'], level=0),
  2130          GROUPBY_DF,
  2131          check_proxy=False)
  2132  
  2133    @parameterized.expand([(True, ), (False, )])
  2134    @unittest.skipIf(
  2135        PD_VERSION < (1, 2),
  2136        "pandas 1.1.0 produces different dtypes for these examples")
  2137    def test_dataframe_agg_numeric_only(self, numeric_only):
  2138      # Note that other aggregation functions can fail on this input with
  2139      # numeric_only={False,None}. max and min are the only ones that
  2140      # actually work for the string inputs.
  2141      self._run_test(
  2142          lambda df: df.max(numeric_only=numeric_only),
  2143          GROUPBY_DF,
  2144          check_proxy=False)
  2145      self._run_test(
  2146          lambda df: df.min(numeric_only=numeric_only),
  2147          GROUPBY_DF,
  2148          check_proxy=False)
  2149  
  2150    @unittest.skip(
  2151        "pandas implementation doesn't respect numeric_only= with "
  2152        "level= (https://github.com/pandas-dev/pandas/issues/40788)")
  2153    def test_dataframe_agg_level_numeric_only(self):
  2154      self._run_test(
  2155          lambda df: df.set_index('foo').sum(level=0, numeric_only=True),
  2156          GROUPBY_DF)
  2157      self._run_test(
  2158          lambda df: df.set_index('foo').max(level=0, numeric_only=True),
  2159          GROUPBY_DF)
  2160      self._run_test(
  2161          lambda df: df.set_index('foo').mean(level=0, numeric_only=True),
  2162          GROUPBY_DF)
  2163      self._run_test(
  2164          lambda df: df.set_index('foo').median(level=0, numeric_only=True),
  2165          GROUPBY_DF)
  2166  
  2167    def test_dataframe_agg_bool_only(self):
  2168      df = pd.DataFrame({
  2169          'all': [True for i in range(10)],
  2170          'any': [i % 3 == 0 for i in range(10)],
  2171          'int': range(10)
  2172      })
  2173  
  2174      self._run_test(lambda df: df.all(), df)
  2175      self._run_test(lambda df: df.any(), df)
  2176      self._run_test(lambda df: df.all(bool_only=True), df)
  2177      self._run_test(lambda df: df.any(bool_only=True), df)
  2178  
  2179    @unittest.skip(
  2180        "pandas doesn't implement bool_only= with level= "
  2181        "(https://github.com/pandas-dev/pandas/blob/"
  2182        "v1.2.3/pandas/core/generic.py#L10573)")
  2183    def test_dataframe_agg_level_bool_only(self):
  2184      df = pd.DataFrame({
  2185          'all': [True for i in range(10)],
  2186          'any': [i % 3 == 0 for i in range(10)],
  2187          'int': range(10)
  2188      })
  2189  
  2190      self._run_test(lambda df: df.set_index('int', drop=False).all(level=0), df)
  2191      self._run_test(lambda df: df.set_index('int', drop=False).any(level=0), df)
  2192      self._run_test(
  2193          lambda df: df.set_index('int', drop=False).all(level=0, bool_only=True),
  2194          df)
  2195      self._run_test(
  2196          lambda df: df.set_index('int', drop=False).any(level=0, bool_only=True),
  2197          df)
  2198  
  2199    def test_series_agg_np_size(self):
  2200      self._run_test(
  2201          lambda df: df.set_index(['group', 'foo']).agg(np.size),
  2202          GROUPBY_DF,
  2203          check_proxy=False)
  2204  
  2205    def test_df_agg_invalid_kwarg_raises(self):
  2206      self._run_error_test(lambda df: df.agg('mean', bool_only=True), GROUPBY_DF)
  2207      self._run_error_test(
  2208          lambda df: df.agg('any', numeric_only=True), GROUPBY_DF)
  2209      self._run_error_test(
  2210          lambda df: df.agg('median', min_count=3, numeric_only=True), GROUPBY_DF)
  2211  
  2212    def test_series_agg_method_invalid_kwarg_raises(self):
  2213      self._run_error_test(lambda df: df.foo.median(min_count=3), GROUPBY_DF)
  2214      self._run_error_test(
  2215          lambda df: df.foo.agg('median', min_count=3), GROUPBY_DF)
  2216  
  2217    @unittest.skipIf(
  2218        PD_VERSION < (1, 3),
  2219        (
  2220            "DataFrame.agg raises a different exception from the "
  2221            "aggregation methods. Fixed in "
  2222            "https://github.com/pandas-dev/pandas/pull/40543."))
  2223    def test_df_agg_method_invalid_kwarg_raises(self):
  2224      self._run_error_test(lambda df: df.mean(bool_only=True), GROUPBY_DF)
  2225      self._run_error_test(lambda df: df.any(numeric_only=True), GROUPBY_DF)
  2226      self._run_error_test(
  2227          lambda df: df.median(min_count=3, numeric_only=True), GROUPBY_DF)
  2228  
  2229    def test_agg_min_count(self):
  2230      df = pd.DataFrame({
  2231          'good': [1, 2, 3, np.nan],
  2232          'bad': [np.nan, np.nan, np.nan, 4],
  2233      },
  2234                        index=['a', 'b', 'a', 'b'])
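            # min_count is a pandas parameter: sum() yields NaN unless at
            # least min_count non-NA values are present. Here 'good' has 3
            # non-NA values and 'bad' has 1, so with min_count=2 'good' sums
            # to 6.0 while 'bad' becomes NaN.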
  2235  
  2236      self._run_test(lambda df: df.sum(level=0, min_count=2), df)
  2237  
  2238      self._run_test(lambda df: df.sum(min_count=3), df, nonparallel=True)
  2239      self._run_test(lambda df: df.sum(min_count=1), df, nonparallel=True)
  2240      self._run_test(lambda df: df.good.sum(min_count=2), df, nonparallel=True)
  2241      self._run_test(lambda df: df.bad.sum(min_count=2), df, nonparallel=True)
  2242  
  2243    def test_series_agg_std(self):
  2244      s = pd.Series(range(10))
  2245  
  2246      self._run_test(lambda s: s.agg('std'), s)
  2247      self._run_test(lambda s: s.agg('var'), s)
  2248      self._run_test(lambda s: s.agg(['std', 'sum']), s)
  2249      self._run_test(lambda s: s.agg(['var']), s)
  2250  
  2251    def test_std_all_na(self):
  2252      s = pd.Series([np.nan] * 10)
  2253  
  2254      self._run_test(lambda s: s.agg('std'), s)
  2255      self._run_test(lambda s: s.std(), s)
  2256  
  2257    def test_std_mostly_na_with_ddof(self):
  2258      df = pd.DataFrame({
  2259          'one': [i if i % 8 == 0 else np.nan for i in range(8)],
  2260          'two': [i if i % 4 == 0 else np.nan for i in range(8)],
  2261          'three': [i if i % 2 == 0 else np.nan for i in range(8)],
  2262      },
  2263                        index=pd.MultiIndex.from_arrays(
  2264                            [list(range(8)), list(reversed(range(8)))],
  2265                            names=['forward', None]))
  2266  
  2267      self._run_test(lambda df: df.std(), df)  # ddof=1
  2268      self._run_test(lambda df: df.std(ddof=0), df)
  2269      self._run_test(lambda df: df.std(ddof=2), df)
  2270      self._run_test(lambda df: df.std(ddof=3), df)
  2271      self._run_test(lambda df: df.std(ddof=4), df)
  2272  
  2273    def test_dataframe_std(self):
  2274      self._run_test(lambda df: df.std(numeric_only=True), GROUPBY_DF)
  2275      self._run_test(lambda df: df.var(numeric_only=True), GROUPBY_DF)
  2276  
  2277    def test_dataframe_mode(self):
  2278      self._run_test(
  2279          lambda df: df.mode(), GROUPBY_DF, nonparallel=True, check_proxy=False)
  2280      self._run_test(
  2281          lambda df: df.mode(numeric_only=True),
  2282          GROUPBY_DF,
  2283          nonparallel=True,
  2284          check_proxy=False)
  2285      self._run_test(
  2286          lambda df: df.mode(dropna=True, numeric_only=True),
  2287          GROUPBY_DF,
  2288          nonparallel=True,
  2289          check_proxy=False)
  2290  
  2291    def test_series_mode(self):
  2292      self._run_test(lambda df: df.foo.mode(), GROUPBY_DF, nonparallel=True)
  2293      self._run_test(
  2294          lambda df: df.baz.mode(dropna=True), GROUPBY_DF, nonparallel=True)
  2295  
  2296  
  2297  class BeamSpecificTest(unittest.TestCase):
  2298    """Tests for functionality that's specific to the Beam DataFrame API.
  2299  
  2300    These features don't exist in pandas so we must verify them independently."""
  2301    def assert_frame_data_equivalent(
  2302        self, actual, expected, check_column_subset=False, extra_col_value=0):
  2303      """Verify that actual is the same as expected, ignoring the index and
  2304      order of the data.
  2305  
  2306      Note: In order to perform non-deferred column operations in Beam, we
  2307      have to enumerate all possible categories of data, even if they are
  2308      ultimately unobserved. The default pandas implementation, on the other
  2309      hand, does not produce unobserved columns. This means that when
  2310      conducting tests, we need to account for the fact that the Beam result
  2311      may be a superset of the pandas result.
  2312  
  2313      If ``check_column_subset`` is `True`, we verify that all of the columns
  2314      in the DataFrame returned from the pandas implementation are contained
  2315      in the DataFrame created from the Beam implementation.
  2316  
  2317      We also check that all columns that exist in the Beam implementation
  2318      but not in the pandas implementation are equal to ``extra_col_value``,
  2319      to ensure that they were not erroneously populated.
  2320      """
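            # A hedged illustration of the superset behavior: with categories
            # ['a', 'b', 'c'] but only 'a' and 'b' observed, Beam's
            # get_dummies() still emits a 'c' column (all zeros) that pandas
            # omits; check_column_subset=True with extra_col_value=0 accepts
            # exactly that.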
  2321      if check_column_subset:
  2322        if isinstance(expected, pd.DataFrame):
  2323          expected_cols = set(expected.columns)
  2324          actual_cols = set(actual.columns)
  2325          # Verifying that expected columns is a subset of the actual columns
  2326          if not expected_cols.issubset(actual_cols):
  2327            raise AssertionError(
  2328                f"Expected columns:\n{expected.columns}\nare not a "
  2329                f"subset of {actual.columns}.")
  2330  
  2331          # Verifying that columns that don't exist in expected
  2332          # but do in actual, are all equal to `extra_col_value` (default of 0)
  2333          extra_columns = actual_cols - expected_cols
  2334          if extra_columns:
  2335            actual_extra_only = actual[list(extra_columns)]
  2336  
  2337            if np.isnan(extra_col_value):
  2338              extra_cols_all_match = actual_extra_only.isna().all().all()
  2339            else:
  2340              extra_cols_all_match = actual_extra_only.eq(
  2341                  extra_col_value).all().all()
  2342            if not extra_cols_all_match:
  2343              raise AssertionError(
  2344                  f"Extra columns:{extra_columns}\n should all "
  2345                  f"be {extra_col_value}, but got \n{actual_extra_only}.")
  2346  
  2347          # Filtering actual to contain only columns in expected
  2348          actual = actual[expected.columns]
  2349  
  2350      def sort_and_drop_index(df):
  2351        if isinstance(df, pd.Series):
  2352          df = df.sort_values()
  2353        elif isinstance(df, pd.DataFrame):
  2354          df = df.sort_values(by=list(df.columns))
  2355  
  2356        return df.reset_index(drop=True)
  2357  
  2358      actual = sort_and_drop_index(actual)
  2359      expected = sort_and_drop_index(expected)
  2360  
  2361      if isinstance(expected, pd.Series):
  2362        pd.testing.assert_series_equal(actual, expected)
  2363      elif isinstance(expected, pd.DataFrame):
  2364        pd.testing.assert_frame_equal(actual, expected)
  2365  
  2366    def _evaluate(self, func, *args, distributed=True):
  2367      deferred_args = [
  2368          frame_base.DeferredFrame.wrap(
  2369              expressions.ConstantExpression(arg, arg[0:0])) for arg in args
  2370      ]
  2371  
  2372      session_type = (
  2373          expressions.PartitioningSession if distributed else expressions.Session)
  2374  
  2375      return session_type({}).evaluate(func(*deferred_args)._expr)
  2376  
  2377    def test_drop_duplicates_keep_any(self):
  2378      df = pd.DataFrame({
  2379          'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
  2380          'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
  2381          'rating': [4, 4, 3.5, 15, 5]
  2382      })
  2383  
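            # keep='any' is Beam-specific: a distributed pipeline has no
            # stable row order, so the order-sensitive pandas 'first'/'last'
            # options don't translate; 'any' keeps an arbitrary one of each
            # set of duplicates.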
  2384      result = self._evaluate(lambda df: df.drop_duplicates(keep='any'), df)
  2385  
  2386      # Verify that the result is the same as conventional drop_duplicates
  2387      self.assert_frame_data_equivalent(result, df.drop_duplicates())
  2388  
  2389    def test_drop_duplicates_keep_any_subset(self):
  2390      df = pd.DataFrame({
  2391          'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
  2392          'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
  2393          'rating': [4, 4, 3.5, 15, 5]
  2394      })
  2395  
  2396      result = self._evaluate(
  2397          lambda df: df.drop_duplicates(keep='any', subset=['brand']), df)
  2398  
  2399      self.assertTrue(result.brand.is_unique)
  2400      self.assert_frame_data_equivalent(
  2401          result.brand, df.drop_duplicates(subset=['brand']).brand)
  2402  
  2403    def test_series_drop_duplicates_keep_any(self):
  2404      df = pd.DataFrame({
  2405          'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
  2406          'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
  2407          'rating': [4, 4, 3.5, 15, 5]
  2408      })
  2409  
  2410      result = self._evaluate(lambda df: df.brand.drop_duplicates(keep='any'), df)
  2411  
  2412      self.assert_frame_data_equivalent(result, df.brand.drop_duplicates())
  2413  
  2414    def test_duplicated_keep_any(self):
  2415      df = pd.DataFrame({
  2416          'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
  2417          'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
  2418          'rating': [4, 4, 3.5, 15, 5]
  2419      })
  2420  
  2421      result = self._evaluate(lambda df: df.duplicated(keep='any'), df)
  2422  
  2423      # Verify that the result is the same as conventional duplicated
  2424      self.assert_frame_data_equivalent(result, df.duplicated())
  2425  
  2426    def test_get_dummies_not_categoricaldtype(self):
  2427      # Should not work because the series is not of CategoricalDtype
  2428      with self.assertRaisesRegex(
  2429          frame_base.WontImplementError,
  2430          r"get_dummies\(\) of non-categorical type is not supported"):
  2431        s = pd.Series(['a ,b', 'a', 'a, d'])
  2432        self._evaluate(lambda s: s.str.get_dummies(','), s)
  2433  
  2434      # bool series do not work because they are not of CategoricalDtype
  2435      with self.assertRaisesRegex(
  2436          frame_base.WontImplementError,
  2437          r"get_dummies\(\) of non-categorical type is not supported"):
  2438        s = pd.Series([True, False, False, True])
  2439        self._evaluate(lambda s: s.str.get_dummies(), s)
  2440  
  2441    def test_get_dummies_comma_separator(self):
  2442      s = pd.Series(['a ,b', 'a', 'a, d', 'c'])
  2443      s = s.astype(pd.CategoricalDtype(categories=['a ,b', 'c', 'b', 'a,d']))
  2444      result = self._evaluate(lambda s: s.str.get_dummies(','), s)
  2445      self.assert_frame_data_equivalent(
  2446          result, s.str.get_dummies(','), check_column_subset=True)
  2447  
  2448    def test_get_dummies_pandas_doc_example1(self):
  2449      s = pd.Series(['a|b', 'a', 'a|c'])
  2450      s = s.astype(pd.CategoricalDtype(categories=['a|b', 'a', 'a|c']))
  2451      result = self._evaluate(lambda s: s.str.get_dummies(), s)
  2452      self.assert_frame_data_equivalent(
  2453          result, s.str.get_dummies(), check_column_subset=True)
  2454  
  2455    def test_get_dummies_pandas_doc_example2(self):
  2456      # Should still work even though np.nan is not considered a category,
  2457      # because we automatically create a nan column
  2458      s = pd.Series(['a|b', np.nan, 'a|c'])
  2459      s = s.astype(pd.CategoricalDtype(categories=['a|b', 'a|c']))
  2460      result = self._evaluate(lambda s: s.str.get_dummies(), s)
  2461      self.assert_frame_data_equivalent(
  2462          result, s.str.get_dummies(), check_column_subset=True)
  2463  
  2464    def test_get_dummies_pass_nan_as_category(self):
  2465      # Explicitly pass 'nan' as a category
  2466      s = pd.Series(['a|b', 'b|c', 'a|c', 'c', 'd'])
  2467      s = s.astype(pd.CategoricalDtype(categories=['a', 'b', 'c', 'nan']))
  2468      result = self._evaluate(lambda s: s.str.get_dummies(), s)
  2469      self.assert_frame_data_equivalent(
  2470          result, s.str.get_dummies(), check_column_subset=True)
  2471  
  2472    def test_get_dummies_bools_casted_to_string(self):
  2473      s = pd.Series([True, False, False, True]).astype('str')
  2474      s = s.astype(pd.CategoricalDtype(categories=['True', 'False']))
  2475      result = self._evaluate(lambda s: s.str.get_dummies(), s)
  2476      self.assert_frame_data_equivalent(
  2477          result, s.str.get_dummies(), check_column_subset=True)
  2478  
  2479    def test_nsmallest_any(self):
  2480      df = pd.DataFrame({
  2481          'population': [
  2482              59000000,
  2483              65000000,
  2484              434000,
  2485              434000,
  2486              434000,
  2487              337000,
  2488              337000,
  2489              11300,
  2490              11300
  2491          ],
  2492          'GDP': [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311],
  2493          'alpha-2': ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"]
  2494      },
  2495                        index=[
  2496                            "Italy",
  2497                            "France",
  2498                            "Malta",
  2499                            "Maldives",
  2500                            "Brunei",
  2501                            "Iceland",
  2502                            "Nauru",
  2503                            "Tuvalu",
  2504                            "Anguilla"
  2505                        ])
  2506  
  2507      result = self._evaluate(
  2508          lambda df: df.population.nsmallest(3, keep='any'), df)
  2509  
  2510      # keep='any' should produce the same result as keep='first',
  2511      # but not necessarily with the same index
  2512      self.assert_frame_data_equivalent(result, df.population.nsmallest(3))
  2513  
  2514    def test_nlargest_any(self):
  2515      df = pd.DataFrame({
  2516          'population': [
  2517              59000000,
  2518              65000000,
  2519              434000,
  2520              434000,
  2521              434000,
  2522              337000,
  2523              337000,
  2524              11300,
  2525              11300
  2526          ],
  2527          'GDP': [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311],
  2528          'alpha-2': ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"]
  2529      },
  2530                        index=[
  2531                            "Italy",
  2532                            "France",
  2533                            "Malta",
  2534                            "Maldives",
  2535                            "Brunei",
  2536                            "Iceland",
  2537                            "Nauru",
  2538                            "Tuvalu",
  2539                            "Anguilla"
  2540                        ])
  2541  
  2542      result = self._evaluate(
  2543          lambda df: df.population.nlargest(3, keep='any'), df)
  2544  
  2545      # keep='any' should produce the same result as keep='first',
  2546      # but not necessarily with the same index
  2547      self.assert_frame_data_equivalent(result, df.population.nlargest(3))
  2548  
  2549    def test_pivot_pandas_example2(self):
  2550      # pandas pivot() doc example 2: pivot with multiple value columns
  2551      df = pd.DataFrame({
  2552          'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
  2553          'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
  2554          'baz': [1, 2, 3, 4, 5, 6],
  2555          'zoo': ['x', 'y', 'z', 'q', 'w', 't']
  2556      })
  2557      df['bar'] = df['bar'].astype(
  2558          pd.CategoricalDtype(categories=['A', 'B', 'C']))
  2559      result = self._evaluate(lambda df: df.pivot(index='foo', columns='bar'), df)
  2560      # When there are multiple value columns, the result dtypes default to
  2561      # object, so convert back to numeric with pd.to_numeric before comparing.
  2562      self.assert_frame_data_equivalent(
  2563          result['baz'].apply(pd.to_numeric),
  2564          df.pivot(index='foo', columns='bar')['baz'])
  2565  
  2566    def test_sample(self):
  2567      df = pd.DataFrame({
  2568          'population': [
  2569              59000000,
  2570              65000000,
  2571              434000,
  2572              434000,
  2573              434000,
  2574              337000,
  2575              337000,
  2576              11300,
  2577              11300
  2578          ],
  2579          'GDP': [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311],
  2580          'alpha-2': ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"]
  2581      },
  2582                        index=[
  2583                            "Italy",
  2584                            "France",
  2585                            "Malta",
  2586                            "Maldives",
  2587                            "Brunei",
  2588                            "Iceland",
  2589                            "Nauru",
  2590                            "Tuvalu",
  2591                            "Anguilla"
  2592                        ])
  2593  
  2594      result = self._evaluate(lambda df: df.sample(n=3), df)
  2595  
  2596      self.assertEqual(len(result), 3)
  2597  
  2598      series_result = self._evaluate(lambda df: df.GDP.sample(n=3), df)
  2599      self.assertEqual(len(series_result), 3)
  2600      self.assertEqual(series_result.name, "GDP")
  2601  
  2602    def test_sample_with_weights(self):
  2603      df = pd.DataFrame({
  2604          'population': [
  2605              59000000,
  2606              65000000,
  2607              434000,
  2608              434000,
  2609              434000,
  2610              337000,
  2611              337000,
  2612              11300,
  2613              11300
  2614          ],
  2615          'GDP': [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311],
  2616          'alpha-2': ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"]
  2617      },
  2618                        index=[
  2619                            "Italy",
  2620                            "France",
  2621                            "Malta",
  2622                            "Maldives",
  2623                            "Brunei",
  2624                            "Iceland",
  2625                            "Nauru",
  2626                            "Tuvalu",
  2627                            "Anguilla"
  2628                        ])
  2629  
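            # Only the last two rows get nonzero weight, so sample(n=2) must
            # select exactly Tuvalu and Anguilla.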
  2630      weights = pd.Series([0, 0, 0, 0, 0, 0, 0, 1, 1], index=df.index)
  2631  
  2632      result = self._evaluate(
  2633          lambda df, weights: df.sample(n=2, weights=weights), df, weights)
  2634  
  2635      self.assertEqual(len(result), 2)
  2636      self.assertEqual(set(result.index), set(["Tuvalu", "Anguilla"]))
  2637  
  2638      series_result = self._evaluate(
  2639          lambda df, weights: df.GDP.sample(n=2, weights=weights), df, weights)
  2640      self.assertEqual(len(series_result), 2)
  2641      self.assertEqual(series_result.name, "GDP")
  2642      self.assertEqual(set(series_result.index), set(["Tuvalu", "Anguilla"]))
  2643  
  2644    def test_sample_with_missing_weights(self):
  2645      df = pd.DataFrame({
  2646          'population': [
  2647              59000000,
  2648              65000000,
  2649              434000,
  2650              434000,
  2651              434000,
  2652              337000,
  2653              337000,
  2654              11300,
  2655              11300
  2656          ],
  2657          'GDP': [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311],
  2658          'alpha-2': ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"]
  2659      },
  2660                        index=[
  2661                            "Italy",
  2662                            "France",
  2663                            "Malta",
  2664                            "Maldives",
  2665                            "Brunei",
  2666                            "Iceland",
  2667                            "Nauru",
  2668                            "Tuvalu",
  2669                            "Anguilla"
  2670                        ])
  2671  
  2672      # Missing weights are treated as 0
  2673      weights = pd.Series([.1, .01, np.nan, 0],
  2674                          index=["Nauru", "Iceland", "Anguilla", "Italy"])
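            # Rows absent from the weights index are also weighted 0, so only
            # Nauru and Iceland can be selected.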
  2675  
  2676      result = self._evaluate(
  2677          lambda df, weights: df.sample(n=2, weights=weights), df, weights)
  2678  
  2679      self.assertEqual(len(result), 2)
  2680      self.assertEqual(set(result.index), set(["Nauru", "Iceland"]))
  2681  
  2682      series_result = self._evaluate(
  2683          lambda df, weights: df.GDP.sample(n=2, weights=weights), df, weights)
  2684  
  2685      self.assertEqual(len(series_result), 2)
  2686      self.assertEqual(series_result.name, "GDP")
  2687      self.assertEqual(set(series_result.index), set(["Nauru", "Iceland"]))
  2688  
  2689    def test_sample_with_weights_distribution(self):
  2690      target_prob = 0.25
  2691      num_samples = 100
  2692      num_targets = 200
  2693      num_other_elements = 10000
  2694  
  2695      target_weight = target_prob / num_targets
  2696      other_weight = (1 - target_prob) / num_other_elements
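            # target_weight = 0.25 / 200 = 1.25e-3 and
            # other_weight = 0.75 / 10000 = 7.5e-5, a ratio of roughly 16.7x.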
  2697      self.assertTrue(target_weight > other_weight * 10, "weights too close")
  2698  
  2699      result = self._evaluate(
  2700          lambda s, weights: s.sample(n=num_samples, weights=weights).sum(),
  2702          # The first elements are 1, the rest are all 0.  This means that when
  2703          # we sum all the sampled elements (above), the result should be the
  2704          # number of times the first elements (aka targets) were sampled.
  2705          pd.Series([1] * num_targets + [0] * num_other_elements),
  2706          pd.Series([target_weight] * num_targets +
  2707                    [other_weight] * num_other_elements))
  2708  
  2709      # With the above constants, the probability of violating this invariant
  2710      # (as computed using the Bernoulli distribution) is about 0.0012%.
  2711      expected = num_samples * target_prob
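            # expected = 100 * 0.25 = 25, so the accepted range is (8.33, 50).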
  2712      self.assertTrue(expected / 3 < result < expected * 2, (expected, result))
  2713  
  2714    def test_split_pandas_examples_no_expand(self):
  2715      # When expand=False (the default), there is no need to cast the dtype to
  2716      # CategoricalDtype.
  2717      s = pd.Series([
  2718          "this is a regular sentence",
  2719          "https://docs.python.org/3/tutorial/index.html",
  2720          np.nan
  2721      ])
  2722      result = self._evaluate(lambda s: s.str.split(), s)
  2723      self.assert_frame_data_equivalent(result, s.str.split())
  2724  
  2725      result = self._evaluate(lambda s: s.str.rsplit(), s)
  2726      self.assert_frame_data_equivalent(result, s.str.rsplit())
  2727  
  2728      result = self._evaluate(lambda s: s.str.split(n=2), s)
  2729      self.assert_frame_data_equivalent(result, s.str.split(n=2))
  2730  
  2731      result = self._evaluate(lambda s: s.str.rsplit(n=2), s)
  2732      self.assert_frame_data_equivalent(result, s.str.rsplit(n=2))
  2733  
  2734      result = self._evaluate(lambda s: s.str.split(pat="/"), s)
  2735      self.assert_frame_data_equivalent(result, s.str.split(pat="/"))
  2736  
  2737    def test_split_pandas_examples_expand_not_categorical(self):
  2738      # When expand=True, this raises because the series is not categorical
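            # With expand=True the number of output columns depends on the
            # data, which Beam cannot determine at construction time without a
            # CategoricalDtype.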
  2739      s = pd.Series([
  2740          "this is a regular sentence",
  2741          "https://docs.python.org/3/tutorial/index.html",
  2742          np.nan
  2743      ])
  2744      with self.assertRaisesRegex(
  2745          frame_base.WontImplementError,
  2746          r"split\(\) of non-categorical type is not supported"):
  2747        self._evaluate(lambda s: s.str.split(expand=True), s)
  2748  
  2749      with self.assertRaisesRegex(
  2750          frame_base.WontImplementError,
  2751          r"rsplit\(\) of non-categorical type is not supported"):
  2752        self._evaluate(lambda s: s.str.rsplit(expand=True), s)
  2753  
  2754    def test_split_pandas_examples_expand_pat_is_string_literal1(self):
  2755      # When expand=True, the pattern is treated as a string literal
  2756      s = pd.Series([
  2757          "this is a regular sentence",
  2758          "https://docs.python.org/3/tutorial/index.html",
  2759          np.nan
  2760      ])
  2761      s = s.astype(
  2762          pd.CategoricalDtype(
  2763              categories=[
  2764                  'this is a regular sentence',
  2765                  'https://docs.python.org/3/tutorial/index.html'
  2766              ]))
  2767      result = self._evaluate(lambda s: s.str.split(expand=True), s)
  2768      self.assert_frame_data_equivalent(result, s.str.split(expand=True))
  2769  
  2770      result = self._evaluate(lambda s: s.str.rsplit("/", n=1, expand=True), s)
  2771      self.assert_frame_data_equivalent(
  2772          result, s.str.rsplit("/", n=1, expand=True))
  2773  
  2774    @unittest.skipIf(PD_VERSION < (1, 4), "regex arg is new in pandas 1.4")
  2775    def test_split_pandas_examples_expand_pat_is_string_literal2(self):
  2776      # When regex=None (the default), pat is a string literal if len(pat) == 1
  2777      s = pd.Series(['foojpgbar.jpg']).astype('category')
  2778      s = s.astype(pd.CategoricalDtype(categories=["foojpgbar.jpg"]))
  2779      result = self._evaluate(lambda s: s.str.split(r".", expand=True), s)
  2780      self.assert_frame_data_equivalent(result, s.str.split(r".", expand=True))
  2781  
  2782      # When regex=False, pat is interpreted as the string itself
  2783      result = self._evaluate(
  2784          lambda s: s.str.split(r"\.jpg", regex=False, expand=True), s)
  2785      self.assert_frame_data_equivalent(
  2786          result, s.str.split(r"\.jpg", regex=False, expand=True))
  2787  
  2788    @unittest.skipIf(PD_VERSION < (1, 4), "regex arg is new in pandas 1.4")
  2789    def test_split_pandas_examples_expand_pat_is_regex(self):
  2790      # When regex=None (the default), pat is compiled as a regex if len(pat) != 1
  2791      s = pd.Series(["foo and bar plus baz"])
  2792      s = s.astype(pd.CategoricalDtype(categories=["foo and bar plus baz"]))
  2793      result = self._evaluate(lambda s: s.str.split(r"and|plus", expand=True), s)
  2794      self.assert_frame_data_equivalent(
  2795          result, s.str.split(r"and|plus", expand=True))
  2796  
  2797      s = pd.Series(['foojpgbar.jpg']).astype('category')
  2798      s = s.astype(pd.CategoricalDtype(categories=["foojpgbar.jpg"]))
  2799      result = self._evaluate(lambda s: s.str.split(r"\.jpg", expand=True), s)
  2800      self.assert_frame_data_equivalent(
  2801          result, s.str.split(r"\.jpg", expand=True))
  2802  
  2803      # When regex=True, pat is interpreted as a regex
  2804      result = self._evaluate(
  2805          lambda s: s.str.split(r"\.jpg", regex=True, expand=True), s)
  2806      self.assert_frame_data_equivalent(
  2807          result, s.str.split(r"\.jpg", regex=True, expand=True))
  2808  
  2809      # A compiled regex can be passed as pat
  2810      result = self._evaluate(
  2811          lambda s: s.str.split(re.compile(r"\.jpg"), expand=True), s)
  2812      self.assert_frame_data_equivalent(
  2813          result, s.str.split(re.compile(r"\.jpg"), expand=True))
  2814  
  2815    @unittest.skipIf(PD_VERSION < (1, 4), "regex arg is new in pandas 1.4")
  2816    def test_split_pat_is_regex(self):
  2817      # regex=True, but expand=False
  2818      s = pd.Series(['foojpgbar.jpg']).astype('category')
  2819      s = s.astype(pd.CategoricalDtype(categories=["foojpgbar.jpg"]))
  2820      result = self._evaluate(
  2821          lambda s: s.str.split(r"\.jpg", regex=True, expand=False), s)
  2822      self.assert_frame_data_equivalent(
  2823          result, s.str.split(r"\.jpg", regex=True, expand=False))
  2824  
  2825    def test_astype_categorical_rejected(self):
  2826      df = pd.DataFrame({'A': np.arange(6), 'B': list('aabbca')})
  2827  
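            # A bare 'category' dtype would make the categories, and thus the
            # output type, data-dependent; an explicit CategoricalDtype is
            # expected instead.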
  2828      with self.assertRaisesRegex(frame_base.WontImplementError,
  2829                                  r"astype\(dtype='category'\)"):
  2830        self._evaluate(lambda df: df.B.astype('category'), df)
  2831  
  2832  
  2833  class AllowNonParallelTest(unittest.TestCase):
  2834    def _use_non_parallel_operation(self):
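            # replace() with a limit is order-sensitive, so the Beam DataFrame
            # API treats it as a non-parallel operation.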
  2835      _ = frame_base.DeferredFrame.wrap(
  2836          expressions.PlaceholderExpression(pd.Series([1, 2, 3]))).replace(
  2837              'a', 'b', limit=1)
  2838  
  2839    def test_disallow_non_parallel(self):
  2840      with self.assertRaises(expressions.NonParallelOperation):
  2841        self._use_non_parallel_operation()
  2842  
  2843    def test_allow_non_parallel_in_context(self):
  2844      with beam.dataframe.allow_non_parallel_operations():
  2845        self._use_non_parallel_operation()
  2846  
  2847    def test_allow_non_parallel_nesting(self):
  2848      # disallowed
  2849      with beam.dataframe.allow_non_parallel_operations():
  2850        # allowed
  2851        self._use_non_parallel_operation()
  2852        with beam.dataframe.allow_non_parallel_operations(False):
  2853          # disallowed again
  2854          with self.assertRaises(expressions.NonParallelOperation):
  2855            self._use_non_parallel_operation()
  2856        # allowed
  2857        self._use_non_parallel_operation()
  2858      # disallowed
  2859      with self.assertRaises(expressions.NonParallelOperation):
  2860        self._use_non_parallel_operation()
  2861  
  2862  
  2863  class ConstructionTimeTest(unittest.TestCase):
  2864    """Tests for operations that can be executed eagerly."""
  2865    DF = pd.DataFrame({
  2866        'str_col': ['foo', 'bar'] * 3,
  2867        'int_col': [1, 2] * 3,
  2868        'flt_col': [1.1, 2.2] * 3,
  2869        'cat_col': pd.Series(list('aabbca'), dtype="category"),
  2870        'datetime_col': pd.Series(
  2871            pd.date_range(
  2872                '1/1/2000', periods=6, freq='m', tz='America/Los_Angeles'))
  2873    })
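          # The deferred frame wraps an empty proxy (DF.iloc[:0]): every
          # operation tested here must succeed using schema information alone.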
  2874    DEFERRED_DF = frame_base.DeferredFrame.wrap(
  2875        expressions.PlaceholderExpression(DF.iloc[:0]))
  2876  
  2877    def _run_test(self, fn):
  2878      expected = fn(self.DF)
  2879      actual = fn(self.DEFERRED_DF)
  2880  
  2881      if isinstance(expected, pd.Index):
  2882        pd.testing.assert_index_equal(expected, actual)
  2883      elif isinstance(expected, pd.Series):
  2884        pd.testing.assert_series_equal(expected, actual)
  2885      elif isinstance(expected, pd.DataFrame):
  2886        pd.testing.assert_frame_equal(expected, actual)
  2887      else:
  2888        self.assertEqual(expected, actual)
  2889  
  2890    @parameterized.expand(DF.columns)
  2891    def test_series_name(self, col_name):
  2892      self._run_test(lambda df: df[col_name].name)
  2893  
  2894    @parameterized.expand(DF.columns)
  2895    def test_series_dtype(self, col_name):
  2896      self._run_test(lambda df: df[col_name].dtype)
  2897      self._run_test(lambda df: df[col_name].dtypes)
  2898  
  2899    def test_dataframe_columns(self):
  2900      self._run_test(lambda df: list(df.columns))
  2901  
  2902    def test_dataframe_dtypes(self):
  2903      self._run_test(lambda df: list(df.dtypes))
  2904  
  2905    def test_categories(self):
  2906      self._run_test(lambda df: df.cat_col.cat.categories)
  2907  
  2908    def test_categorical_ordered(self):
  2909      self._run_test(lambda df: df.cat_col.cat.ordered)
  2910  
  2911    def test_groupby_ndim(self):
  2912      self._run_test(lambda df: df.groupby('int_col').ndim)
  2913  
  2914    def test_groupby_project_ndim(self):
  2915      self._run_test(lambda df: df.groupby('int_col').flt_col.ndim)
  2916      self._run_test(
  2917          lambda df: df.groupby('int_col')[['flt_col', 'str_col']].ndim)
  2918  
  2919    def test_get_column_default_None(self):
  2920      # .get just returns default_value=None at construction time if the column
  2921      # doesn't exist
  2922      self._run_test(lambda df: df.get('FOO'))
  2923  
  2924    def test_datetime_tz(self):
  2925      self._run_test(lambda df: df.datetime_col.dt.tz)
  2926  
  2927  
  2928  class DocstringTest(unittest.TestCase):
  2929    @parameterized.expand([
  2930        (frames.DeferredDataFrame, pd.DataFrame),
  2931        (frames.DeferredSeries, pd.Series),
  2932        #(frames._DeferredIndex, pd.Index),
  2933        (frames._DeferredStringMethods, pd.core.strings.StringMethods),
  2934        (
  2935            frames._DeferredCategoricalMethods,
  2936            pd.core.arrays.categorical.CategoricalAccessor),
  2937        (frames.DeferredGroupBy, pd.core.groupby.generic.DataFrameGroupBy),
  2938        (frames._DeferredGroupByCols, pd.core.groupby.generic.DataFrameGroupBy),
  2939        (
  2940            frames._DeferredDatetimeMethods,
  2941            pd.core.indexes.accessors.DatetimeProperties),
  2942    ])
  2943    def test_docs_defined(self, beam_type, pd_type):
  2944      beam_attrs = set(dir(beam_type))
  2945      pd_attrs = set(dir(pd_type))
  2946  
  2947      docstring_required = sorted([
  2948          attr for attr in beam_attrs.intersection(pd_attrs)
  2949          if getattr(pd_type, attr).__doc__ and not attr.startswith('_')
  2950      ])
  2951  
  2952      docstring_missing = [
  2953          attr for attr in docstring_required
  2954          if not getattr(beam_type, attr).__doc__
  2955      ]
  2956  
  2957      self.assertTrue(
  2958          len(docstring_missing) == 0,
  2959          f'{beam_type.__name__} is missing a docstring for '
  2960          f'{len(docstring_missing)}/{len(docstring_required)} '
  2961          f'({len(docstring_missing)/len(docstring_required):%}) '
  2962          f'operations:\n{docstring_missing}')
  2963  
  2964  
  2965  class ReprTest(unittest.TestCase):
  2966    def test_basic_dataframe(self):
  2967      df = frame_base.DeferredFrame.wrap(
  2968          expressions.ConstantExpression(GROUPBY_DF))
  2969      self.assertEqual(
  2970          repr(df),
  2971          (
  2972              "DeferredDataFrame(columns=['group', 'foo', 'bar', 'baz', 'bool', "
  2973              "'str'], index=<unnamed>)"))
  2974  
  2975    def test_dataframe_with_named_index(self):
  2976      df = frame_base.DeferredFrame.wrap(
  2977          expressions.ConstantExpression(GROUPBY_DF.set_index('group')))
  2978      self.assertEqual(
  2979          repr(df),
  2980          (
  2981              "DeferredDataFrame(columns=['foo', 'bar', 'baz', 'bool', 'str'], "
  2982              "index='group')"))
  2983  
  2984    def test_dataframe_with_partial_named_index(self):
  2985      df = frame_base.DeferredFrame.wrap(
  2986          expressions.ConstantExpression(
  2987              GROUPBY_DF.set_index([GROUPBY_DF.index, 'group'])))
  2988      self.assertEqual(
  2989          repr(df),
  2990          (
  2991              "DeferredDataFrame(columns=['foo', 'bar', 'baz', 'bool', 'str'], "
  2992              "indexes=[<unnamed>, 'group'])"))
  2993  
  2994    def test_dataframe_with_named_multi_index(self):
  2995      df = frame_base.DeferredFrame.wrap(
  2996          expressions.ConstantExpression(GROUPBY_DF.set_index(['str', 'group'])))
  2997      self.assertEqual(
  2998          repr(df),
  2999          (
  3000              "DeferredDataFrame(columns=['foo', 'bar', 'baz', 'bool'], "
  3001              "indexes=['str', 'group'])"))
  3002  
  3003    def test_dataframe_with_multiple_column_levels(self):
  3004      df = pd.DataFrame({
  3005          'foofoofoo': ['one', 'one', 'one', 'two', 'two', 'two'],
  3006          'barbar': ['A', 'B', 'C', 'A', 'B', 'C'],
  3007          'bazzy': [1, 2, 3, 4, 5, 6],
  3008          'zoop': ['x', 'y', 'z', 'q', 'w', 't']
  3009      })
  3010  
  3011      df = df.pivot(index='foofoofoo', columns='barbar')
  3012      df = frame_base.DeferredFrame.wrap(expressions.ConstantExpression(df))
  3013      self.assertEqual(
  3014          repr(df),
  3015          (
  3016              "DeferredDataFrame(columns=[('bazzy', 'A'), ('bazzy', 'B'), "
  3017              "('bazzy', 'C'), ('zoop', 'A'), ('zoop', 'B'), ('zoop', 'C')], "
  3018              "index='foofoofoo')"))
  3019  
  3020    def test_dataframe_with_multiple_column_and_multiple_index_levels(self):
  3021      df = pd.DataFrame({
  3022          'foofoofoo': ['one', 'one', 'one', 'two', 'two', 'two'],
  3023          'barbar': ['A', 'B', 'C', 'A', 'B', 'C'],
  3024          'bazzy': [1, 2, 3, 4, 5, 6],
  3025          'zoop': ['x', 'y', 'z', 'q', 'w', 't']
  3026      })
  3027  
  3028      df = df.pivot(index='foofoofoo', columns='barbar')
  3029      df.index = [['a', 'b'], df.index]
  3030  
  3031      # pandas repr displays this:
  3032      #             bazzy       zoop
  3033      # barbar          A  B  C    A  B  C
  3034      #   foofoofoo
  3035      # a one           1  2  3    x  y  z
  3036      # b two           4  5  6    q  w  t
  3037      df = frame_base.DeferredFrame.wrap(expressions.ConstantExpression(df))
  3038      self.assertEqual(
  3039          repr(df),
  3040          (
  3041              "DeferredDataFrame(columns=[('bazzy', 'A'), ('bazzy', 'B'), "
  3042              "('bazzy', 'C'), ('zoop', 'A'), ('zoop', 'B'), ('zoop', 'C')], "
  3043              "indexes=[<unnamed>, 'foofoofoo'])"))
  3044  
  3045    def test_basic_series(self):
  3046      df = frame_base.DeferredFrame.wrap(
  3047          expressions.ConstantExpression(GROUPBY_DF['bool']))
  3048      self.assertEqual(
  3049          repr(df), "DeferredSeries(name='bool', dtype=bool, index=<unnamed>)")
  3050  
  3051    def test_series_with_named_index(self):
  3052      df = frame_base.DeferredFrame.wrap(
  3053          expressions.ConstantExpression(GROUPBY_DF.set_index('group')['str']))
  3054      self.assertEqual(
  3055          repr(df), "DeferredSeries(name='str', dtype=object, index='group')")
  3056  
  3057    def test_series_with_partial_named_index(self):
  3058      df = frame_base.DeferredFrame.wrap(
  3059          expressions.ConstantExpression(
  3060              GROUPBY_DF.set_index([GROUPBY_DF.index, 'group'])['bar']))
  3061      self.assertEqual(
  3062          repr(df),
  3063          (
  3064              "DeferredSeries(name='bar', dtype=float64, "
  3065              "indexes=[<unnamed>, 'group'])"))
  3066  
  3067    def test_series_with_named_multi_index(self):
  3068      df = frame_base.DeferredFrame.wrap(
  3069          expressions.ConstantExpression(
  3070              GROUPBY_DF.set_index(['str', 'group'])['baz']))
  3071      self.assertEqual(
  3072          repr(df),
  3073          "DeferredSeries(name='baz', dtype=float64, indexes=['str', 'group'])")
  3074  
  3075  
  3076  @unittest.skipIf(
  3077      not ie.current_env().is_interactive_ready,
  3078      '[interactive] dependency is not installed.')
  3079  @isolated_env
  3080  class InteractiveDataFrameTest(unittest.TestCase):
  3081    def test_collect_merged_dataframes(self):
  3082      p = beam.Pipeline(InteractiveRunner())
  3083      pcoll_1 = (
  3084          p
  3085          | 'Create data 1' >> beam.Create([(1, 'a'), (2, 'b'), (3, 'c'),
  3086                                            (4, 'd')])
  3087          |
  3088          'To rows 1' >> beam.Select(col_1=lambda x: x[0], col_2=lambda x: x[1]))
  3089      df_1 = to_dataframe(pcoll_1)
  3090      pcoll_2 = (
  3091          p
  3092          | 'Create data 2' >> beam.Create([(5, 'e'), (6, 'f'), (7, 'g'),
  3093                                            (8, 'h')])
  3094          |
  3095          'To rows 2' >> beam.Select(col_3=lambda x: x[0], col_4=lambda x: x[1]))
  3096      df_2 = to_dataframe(pcoll_2)
  3097  
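            # Merge on the index; ib.collect materializes the deferred result
            # as a regular pandas DataFrame.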
  3098      df_merged = df_1.merge(df_2, left_index=True, right_index=True)
  3099      pd_df = ib.collect(df_merged).sort_values(by='col_1')
  3100      self.assertEqual(pd_df.shape, (4, 4))
  3101      self.assertEqual(list(pd_df['col_1']), [1, 2, 3, 4])
  3102      self.assertEqual(list(pd_df['col_2']), ['a', 'b', 'c', 'd'])
  3103      self.assertEqual(list(pd_df['col_3']), [5, 6, 7, 8])
  3104      self.assertEqual(list(pd_df['col_4']), ['e', 'f', 'g', 'h'])
  3105  
  3106  
  3107  if __name__ == '__main__':
  3108    unittest.main()