github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/transforms_test.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import typing
import unittest
import warnings

import pandas as pd

import apache_beam as beam
from apache_beam import coders
from apache_beam import metrics
from apache_beam.dataframe import convert
from apache_beam.dataframe import expressions
from apache_beam.dataframe import frame_base
from apache_beam.dataframe import transforms
from apache_beam.runners.portability.fn_api_runner import fn_runner
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to


def check_correct(expected, actual):
  if actual is None:
    raise AssertionError('Empty frame but expected: \n\n%s' % (expected))
  if isinstance(expected, pd.core.generic.NDFrame):
    expected = expected.sort_index()
    actual = actual.sort_index()

    if isinstance(expected, pd.Series):
      pd.testing.assert_series_equal(expected, actual)
    elif isinstance(expected, pd.DataFrame):
      pd.testing.assert_frame_equal(expected, actual)
    else:
      raise ValueError(
          f"Expected value is a {type(expected)}, "
          "not a Series or DataFrame.")
  else:
    if actual != expected:
      raise AssertionError('Scalars not equal: %s != %s' % (actual, expected))


def concat(parts):
  if len(parts) > 1:
    return pd.concat(parts)
  elif len(parts) == 1:
    return parts[0]
  else:
    return None


def df_equal_to(expected):
  return lambda actual: check_correct(expected, concat(actual))


AnimalSpeed = typing.NamedTuple(
    'AnimalSpeed', [('Animal', str), ('Speed', int)])
coders.registry.register_coder(AnimalSpeed, coders.RowCoder)
Nested = typing.NamedTuple(
    'Nested', [('id', int), ('animal_speed', AnimalSpeed)])
coders.registry.register_coder(Nested, coders.RowCoder)


class TransformTest(unittest.TestCase):
  def run_scenario(self, input, func):
    expected = func(input)

    empty = input.iloc[0:0]
    input_placeholder = expressions.PlaceholderExpression(empty)
    input_deferred = frame_base.DeferredFrame.wrap(input_placeholder)
    actual_deferred = func(input_deferred)._expr.evaluate_at(
        expressions.Session({input_placeholder: input}))

    check_correct(expected, actual_deferred)

    with beam.Pipeline() as p:
      input_pcoll = p | beam.Create([input.iloc[::2], input.iloc[1::2]])
      input_df = convert.to_dataframe(input_pcoll, proxy=empty)
      output_df = func(input_df)

      output_proxy = output_df._expr.proxy()
      if isinstance(output_proxy, pd.core.generic.NDFrame):
        self.assertTrue(
            output_proxy.iloc[:0].equals(expected.iloc[:0]),
            (
                'Output proxy is incorrect:\n'
                f'Expected:\n{expected.iloc[:0]}\n\n'
                f'Actual:\n{output_proxy.iloc[:0]}'))
      else:
        self.assertEqual(type(output_proxy), type(expected))

      output_pcoll = convert.to_pcollection(output_df, yield_elements='pandas')

      assert_that(
          output_pcoll, lambda actual: check_correct(expected, concat(actual)))
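
  # run_scenario checks each operation three ways: directly against pandas,
  # as a deferred expression evaluated in an expressions.Session, and
  # through a real Beam pipeline in which the input frame is split into two
  # interleaved partitions (input.iloc[::2] and input.iloc[1::2]). A minimal
  # sketch of that last round trip, where df is any concrete DataFrame and
  # func a pandas-style callable:
  #
  #   with beam.Pipeline() as p:
  #     pcoll = p | beam.Create([df.iloc[::2], df.iloc[1::2]])
  #     deferred = convert.to_dataframe(pcoll, proxy=df.iloc[:0])
  #     result = convert.to_pcollection(func(deferred), yield_elements='pandas')
  #
  # The proxy is an empty frame carrying the schema; the deferred result is
  # only computed when the pipeline runs.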

  def test_identity(self):
    df = pd.DataFrame({
        'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'],
        'Speed': [380., 370., 24., 26.]
    })
    self.run_scenario(df, lambda x: x)

  def test_groupby_sum_mean(self):
    df = pd.DataFrame({
        'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'],
        'Speed': [380., 370., 24., 26.]
    })
    self.run_scenario(df, lambda df: df.groupby('Animal').sum())
    with expressions.allow_non_parallel_operations():
      self.run_scenario(df, lambda df: df.groupby('Animal').mean())
    self.run_scenario(
        df, lambda df: df.loc[df.Speed > 25].groupby('Animal').sum())

  def test_groupby_apply(self):
    df = pd.DataFrame({
        'group': ['a' if i % 5 == 0 or i % 3 == 0 else 'b' for i in range(100)],
        'foo': [None if i % 11 == 0 else i for i in range(100)],
        'bar': [None if i % 7 == 0 else 99 - i for i in range(100)],
        'baz': [None if i % 13 == 0 else i * 2 for i in range(100)],
    })

    def median_sum_fn(x):
      with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message="Mean of empty slice")
        return (x.foo + x.bar).median()

    describe = lambda df: df.describe()

    self.run_scenario(df, lambda df: df.groupby('group').foo.apply(describe))
    self.run_scenario(
        df, lambda df: df.groupby('group')[['foo', 'bar']].apply(describe))
    self.run_scenario(df, lambda df: df.groupby('group').apply(median_sum_fn))
    self.run_scenario(
        df,
        lambda df: df.set_index('group').foo.groupby(level=0).apply(describe))
    self.run_scenario(df, lambda df: df.groupby(level=0).apply(median_sum_fn))
    self.run_scenario(
        df, lambda df: df.groupby(lambda x: x % 3).apply(describe))

  def test_filter(self):
    df = pd.DataFrame({
        'Animal': ['Aardvark', 'Ant', 'Elephant', 'Zebra'],
        'Speed': [5, 2, 35, 40]
    })
    self.run_scenario(df, lambda df: df.filter(items=['Animal']))
    self.run_scenario(df, lambda df: df.filter(regex='Anim.*'))
    self.run_scenario(
        df, lambda df: df.set_index('Animal').filter(regex='F.*', axis='index'))

    with expressions.allow_non_parallel_operations():
      a = pd.DataFrame({'col': [1, 2, 3]})
      self.run_scenario(a, lambda a: a.agg(sum))
      self.run_scenario(a, lambda a: a.agg(['mean', 'min', 'max']))

  def test_scalar(self):
    with expressions.allow_non_parallel_operations():
      a = pd.Series([1, 2, 6])
      self.run_scenario(a, lambda a: a.agg(sum))
      self.run_scenario(a, lambda a: a / a.agg(sum))
      self.run_scenario(a, lambda a: a / (a.max() - a.min()))
      self.run_scenario(a, lambda a: a / (a.sum() - 1))

      # Tests scalar being used as an input to a downstream stage.
      df = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [1, 2, 6]})
      self.run_scenario(
          df, lambda df: df.groupby('key').sum().val / df.val.agg(sum))
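
  # The allow_non_parallel_operations() blocks above are load-bearing:
  # aggregations whose result depends on seeing the whole dataset at once
  # (a global agg(sum), a scalar a.max(), or, above, a groupby mean) cannot
  # be evaluated partition-by-partition, so the deferred dataframe API
  # refuses them unless they are explicitly allowed. Outside such a block,
  # something like
  #
  #   deferred_series.agg(sum)
  #
  # is expected to raise rather than silently force a global collect.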

  def test_getitem_projection(self):
    df = pd.DataFrame({
        'Animal': ['Aardvark', 'Ant', 'Elephant', 'Zebra'],
        'Speed': [5, 2, 35, 40],
        'Size': ['Small', 'Extra Small', 'Large', 'Medium']
    })
    self.run_scenario(df, lambda df: df[['Speed', 'Size']])

  def test_offset_elementwise(self):
    s = pd.Series(range(10)).astype(float)
    df = pd.DataFrame({'value': s, 'square': s * s, 'cube': s * s * s})
    # Only those values that are both squares and cubes will intersect.
    self.run_scenario(
        df,
        lambda df: df.set_index('square').value + df.set_index('cube').value)

  def test_batching_named_tuple_input(self):
    with beam.Pipeline() as p:
      result = (
          p | beam.Create([
              AnimalSpeed('Aardvark', 5),
              AnimalSpeed('Ant', 2),
              AnimalSpeed('Elephant', 35),
              AnimalSpeed('Zebra', 40)
          ]).with_output_types(AnimalSpeed)
          | transforms.DataframeTransform(lambda df: df.filter(regex='Anim.*')))

      assert_that(
          result,
          equal_to([('Aardvark', ), ('Ant', ), ('Elephant', ), ('Zebra', )]))

  def test_batching_beam_row_input(self):
    with beam.Pipeline() as p:
      result = (
          p
          | beam.Create([(u'Falcon', 380.), (u'Falcon', 370.), (u'Parrot', 24.),
                         (u'Parrot', 26.)])
          | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1]))
          | transforms.DataframeTransform(
              lambda df: df.groupby('Animal').mean(), include_indexes=True))

      assert_that(result, equal_to([('Falcon', 375.), ('Parrot', 25.)]))

  def test_batching_beam_row_to_dataframe(self):
    with beam.Pipeline() as p:
      df = convert.to_dataframe(
          p
          | beam.Create([(u'Falcon', 380.), (u'Falcon', 370.), (
              u'Parrot', 24.), (u'Parrot', 26.)])
          | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1])))

      result = convert.to_pcollection(
          df.groupby('Animal').mean(), include_indexes=True)

      assert_that(result, equal_to([('Falcon', 375.), ('Parrot', 25.)]))

  def test_batching_passthrough_nested_schema(self):
    with beam.Pipeline() as p:
      nested_schema_pc = (
          p | beam.Create([Nested(1, AnimalSpeed('Aardvark', 5))
                           ]).with_output_types(Nested))
      result = nested_schema_pc | transforms.DataframeTransform(
          lambda df: df.filter(items=['animal_speed']))

      assert_that(result, equal_to([(('Aardvark', 5), )]))

  def test_batching_passthrough_nested_array(self):
    Array = typing.NamedTuple(
        'Array', [('id', int), ('business_numbers', typing.Sequence[int])])
    coders.registry.register_coder(Array, coders.RowCoder)

    with beam.Pipeline() as p:
      array_schema_pc = (p | beam.Create([Array(1, [7, 8, 9])]))
      result = array_schema_pc | transforms.DataframeTransform(
          lambda df: df.filter(items=['business_numbers']))

      assert_that(result, equal_to([([7, 8, 9], )]))

  def test_unbatching_series(self):
    with beam.Pipeline() as p:
      result = (
          p
          | beam.Create([(u'Falcon', 380.), (u'Falcon', 370.), (u'Parrot', 24.),
                         (u'Parrot', 26.)])
          | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1]))
          | transforms.DataframeTransform(lambda df: df.Animal))

      assert_that(result, equal_to(['Falcon', 'Falcon', 'Parrot', 'Parrot']))
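
  # DataframeTransform is the one-shot form of the to_dataframe /
  # to_pcollection round trip exercised above: any PCollection with a schema
  # (NamedTuples registered with a RowCoder, or beam.Row elements) is batched
  # into DataFrames, the callable runs against the deferred frame, and the
  # result is unbatched back into elements. With include_indexes=True the
  # grouped index ('Animal' above) is emitted as the leading field of each
  # output row. A minimal sketch, with rows_pcoll standing in for any
  # schema'd PCollection:
  #
  #   rows_pcoll | transforms.DataframeTransform(
  #       lambda df: df.groupby('Animal').mean(), include_indexes=True)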

  def test_input_output_polymorphism(self):
    one_series = pd.Series([1])
    two_series = pd.Series([2])
    three_series = pd.Series([3])
    proxy = one_series[:0]

    def equal_to_series(expected):
      def check(actual):
        actual = pd.concat(actual)
        if not expected.equals(actual):
          raise AssertionError(
              'Series not equal: \n%s\n%s\n' % (expected, actual))

      return check

    with beam.Pipeline() as p:
      one = p | 'One' >> beam.Create([one_series])
      two = p | 'Two' >> beam.Create([two_series])

      assert_that(
          one | 'PcollInPcollOut' >> transforms.DataframeTransform(
              lambda x: 3 * x, proxy=proxy, yield_elements='pandas'),
          equal_to_series(three_series),
          label='CheckPcollInPcollOut')

      assert_that(
          (one, two)
          | 'TupleIn' >> transforms.DataframeTransform(
              lambda x, y: (x + y), (proxy, proxy), yield_elements='pandas'),
          equal_to_series(three_series),
          label='CheckTupleIn')

      assert_that(
          dict(x=one, y=two)
          | 'DictIn' >> transforms.DataframeTransform(
              lambda x, y: (x + y),
              proxy=dict(x=proxy, y=proxy),
              yield_elements='pandas'),
          equal_to_series(three_series),
          label='CheckDictIn')

      double, triple = one | 'TupleOut' >> transforms.DataframeTransform(
          lambda x: (2 * x, 3 * x), proxy, yield_elements='pandas')
      assert_that(double, equal_to_series(two_series), 'CheckTupleOut0')
      assert_that(triple, equal_to_series(three_series), 'CheckTupleOut1')

      res = one | 'DictOut' >> transforms.DataframeTransform(
          lambda x: {'res': 3 * x}, proxy, yield_elements='pandas')
      assert_that(res['res'], equal_to_series(three_series), 'CheckDictOut')

  def test_cat(self):
    # Verify that cat works with a List[Series], since this case is missing
    # from the doctests.
    df = pd.DataFrame({
        'one': ['A', 'B', 'C'],
        'two': ['BB', 'CC', 'A'],
        'three': ['CCC', 'AA', 'B'],
    })
    self.run_scenario(df, lambda df: df.two.str.cat([df.three], join='outer'))
    self.run_scenario(
        df, lambda df: df.one.str.cat([df.two, df.three], join='outer'))

  def test_repeat(self):
    # Verify that repeat works with a Series of counts, since this case is
    # missing from the doctests.
    df = pd.DataFrame({
        'strings': ['A', 'B', 'C', 'D', 'E'],
        'repeats': [3, 1, 4, 5, 2],
    })
    self.run_scenario(df, lambda df: df.strings.str.repeat(df.repeats))

  def test_rename(self):
    df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    self.run_scenario(
        df, lambda df: df.rename(columns={'B': 'C'}, index={0: 2, 2: 0}))

    with expressions.allow_non_parallel_operations():
      self.run_scenario(
          df,
          lambda df: df.rename(
              columns={'B': 'C'}, index={0: 2, 2: 0}, errors='raise'))

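
# FusionTest below relies on the FnApiRunner's NUM_FUSED_STAGES_COUNTER
# metric: after a pipeline runs, the counter reports how many fused stages
# the runner actually executed. Elementwise dataframe operations introduce
# no shuffle, so a chain like df[df.Speed > 10] should collapse into a
# single stage (reshuffle=False on the Create keeps a Reshuffle from adding
# one). Querying the counter follows the same pattern as fused_stages below:
#
#   query_result = p.result.monitoring_metrics().query(
#       metrics.MetricsFilter().with_name(
#           fn_runner.FnApiRunner.NUM_FUSED_STAGES_COUNTER))
#   num_stages = query_result['counters'][0].result
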
class FusionTest(unittest.TestCase):
  @staticmethod
  def fused_stages(p):
    return p.result.monitoring_metrics().query(
        metrics.MetricsFilter().with_name(
            fn_runner.FnApiRunner.NUM_FUSED_STAGES_COUNTER)
    )['counters'][0].result

  @staticmethod
  def create_animal_speed_input(p):
    return p | beam.Create([
        AnimalSpeed('Aardvark', 5),
        AnimalSpeed('Ant', 2),
        AnimalSpeed('Elephant', 35),
        AnimalSpeed('Zebra', 40)
    ], reshuffle=False)

  def test_loc_filter(self):
    with beam.Pipeline() as p:
      _ = (
          self.create_animal_speed_input(p)
          | transforms.DataframeTransform(lambda df: df[df.Speed > 10]))
    self.assertEqual(self.fused_stages(p), 1)

  def test_column_manipulation(self):
    def set_column(df, name, s):
      df[name] = s
      return df

    with beam.Pipeline() as p:
      _ = (
          self.create_animal_speed_input(p)
          | transforms.DataframeTransform(
              lambda df: set_column(df, 'x', df.Speed + df.Animal.str.len())))
    self.assertEqual(self.fused_stages(p), 1)


class TransformPartsTest(unittest.TestCase):
  def test_rebatch(self):
    with beam.Pipeline() as p:
      sA = pd.Series(range(1000))
      sB = sA * sA
      pcA = p | 'CreatePCollA' >> beam.Create([('k0', sA[::3]),
                                               ('k1', sA[1::3]),
                                               ('k2', sA[2::3])])
      pcB = p | 'CreatePCollB' >> beam.Create([('k0', sB[::3]),
                                               ('k1', sB[1::3]),
                                               ('k2', sB[2::3])])
      input = {'A': pcA, 'B': pcB} | beam.CoGroupByKey()
      output = input | beam.ParDo(
          transforms._ReBatch(target_size=sA.memory_usage()))

      # There should be exactly two elements, as the target size will be
      # hit when 2/3 of pcA and 2/3 of pcB is seen, but not before.
      assert_that(output | beam.combiners.Count.Globally(), equal_to([2]))

      # Sanity check that we got all the right values.
      assert_that(
          output | beam.Map(lambda x: x['A'].sum())
          | 'SumA' >> beam.CombineGlobally(sum),
          equal_to([sA.sum()]),
          label='CheckValuesA')
      assert_that(
          output | beam.Map(lambda x: x['B'].sum())
          | 'SumB' >> beam.CombineGlobally(sum),
          equal_to([sB.sum()]),
          label='CheckValuesB')


if __name__ == '__main__':
  unittest.main()