github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/convert_test.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import pandas as pd

import apache_beam as beam
from apache_beam.dataframe import convert
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to


def equal_to_unordered_series(expected):
  # PCollections have no ordering guarantees, and the pandas-yielding tests
  # emit one pandas Series per input chunk, so concatenate the chunks and
  # compare contents ignoring order.
  def check(actual):
    actual = pd.concat(actual)
    if sorted(expected) != sorted(actual):
      raise AssertionError('Series not equal: \n%s\n%s\n' % (expected, actual))

  return check


class ConvertTest(unittest.TestCase):
  def test_convert_yield_pandas(self):
    with beam.Pipeline() as p:
      a = pd.Series([1, 2, 3])
      b = pd.Series([100, 200, 300])

      pc_a = p | 'A' >> beam.Create([a])
      pc_b = p | 'B' >> beam.Create([b])

      df_a = convert.to_dataframe(pc_a, proxy=a[:0])
      df_b = convert.to_dataframe(pc_b, proxy=b[:0])

      df_2a = 2 * df_a
      df_3a = 3 * df_a
      df_ab = df_a * df_b

      # Converting multiple results at a time can be more efficient.
      pc_2a, pc_ab = convert.to_pcollection(df_2a, df_ab,
                                            yield_elements='pandas')
      # But separate conversions can be done as well.
      pc_3a = convert.to_pcollection(df_3a, yield_elements='pandas')

      assert_that(pc_2a, equal_to_unordered_series(2 * a), label='Check2a')
      assert_that(pc_3a, equal_to_unordered_series(3 * a), label='Check3a')
      assert_that(pc_ab, equal_to_unordered_series(a * b), label='Checkab')

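  # With the default yield_elements='schemas', to_pcollection yields one
  # element per row rather than whole pandas objects, so a plain equal_to
  # on the expected values suffices below.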
  def test_convert(self):
    with beam.Pipeline() as p:
      a = pd.Series([1, 2, 3])
      b = pd.Series([100, 200, 300])

      pc_a = p | 'A' >> beam.Create(a)
      pc_b = p | 'B' >> beam.Create(b)

      df_a = convert.to_dataframe(pc_a)
      df_b = convert.to_dataframe(pc_b)

      df_2a = 2 * df_a
      df_3a = 3 * df_a
      df_ab = df_a * df_b

      # Converting multiple results at a time can be more efficient.
      pc_2a, pc_ab = convert.to_pcollection(df_2a, df_ab)
      # But separate conversions can be done as well.
      pc_3a = convert.to_pcollection(df_3a)

      assert_that(pc_2a, equal_to(list(2 * a)), label='Check2a')
      assert_that(pc_3a, equal_to(list(3 * a)), label='Check3a')
      assert_that(pc_ab, equal_to(list(a * b)), label='Checkab')

  def test_convert_with_none(self):
    # Ensure the logical Any type allows (nullable) None, see BEAM-12587.
    df = pd.DataFrame({'A': ['str', 10, None], 'B': [None, 'str', 20]})
    with beam.Pipeline() as p:
      res = convert.to_pcollection(df, pipeline=p) | beam.Map(tuple)
      assert_that(res, equal_to([(row.A, row.B) for _, row in df.iterrows()]))

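  # A deferred scalar, such as the result of Series.sum(), converts to a
  # PCollection containing just that single value.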
  def test_convert_scalar(self):
    with beam.Pipeline() as p:
      pc = p | 'A' >> beam.Create([1, 2, 3])
      s = convert.to_dataframe(pc)
      pc_sum = convert.to_pcollection(s.sum())
      assert_that(pc_sum, equal_to([6]))

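  # to_pcollection also accepts plain (non-deferred) pandas objects; the
  # pipeline argument says which pipeline to embed them in, since there is
  # no deferred expression to infer it from.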
  def test_convert_non_deferred(self):
    with beam.Pipeline() as p:
      s1 = pd.Series([1, 2, 3])
      s2 = convert.to_dataframe(p | beam.Create([100, 200, 300]))

      pc1, pc2 = convert.to_pcollection(s1, s2, pipeline=p)
      assert_that(pc1, equal_to([1, 2, 3]), label='CheckNonDeferred')
      assert_that(pc2, equal_to([100, 200, 300]), label='CheckDeferred')

  def test_convert_memoization(self):
    with beam.Pipeline() as p:
      a = pd.Series([1, 2, 3])
      b = pd.Series([100, 200, 300])

      pc_a = p | 'A' >> beam.Create([a])
      pc_b = p | 'B' >> beam.Create([b])

      df_a = convert.to_dataframe(pc_a, proxy=a[:0])
      df_b = convert.to_dataframe(pc_b, proxy=b[:0])

      df_2a = 2 * df_a
      df_3a = 3 * df_a
      df_ab = df_a * df_b

      # Two calls to to_pcollection with the same DataFrame should produce
      # the same PCollection(s).
      pc_2a_, pc_ab_ = convert.to_pcollection(df_2a, df_ab)
      pc_3a, pc_2a, pc_ab = convert.to_pcollection(df_3a, df_2a, df_ab)

      self.assertIs(pc_2a, pc_2a_)
      self.assertIs(pc_ab, pc_ab_)
      self.assertIsNot(pc_3a, pc_2a)
      self.assertIsNot(pc_3a, pc_ab)

      # The same conversions without the unbatching transform should also
      # cache PCollections...
      pc_2a_pandas_, pc_ab_pandas_ = convert.to_pcollection(
          df_2a, df_ab, yield_elements='pandas')
      pc_3a_pandas, pc_2a_pandas, pc_ab_pandas = convert.to_pcollection(
          df_3a, df_2a, df_ab, yield_elements='pandas')

      self.assertIs(pc_2a_pandas, pc_2a_pandas_)
      self.assertIs(pc_ab_pandas, pc_ab_pandas_)
      self.assertIsNot(pc_3a_pandas, pc_2a_pandas)
      self.assertIsNot(pc_3a_pandas, pc_ab_pandas)

      # ...but the pandas-yielding PCollections should be cached separately
      # from the unbatched ones.
      self.assertIsNot(pc_2a_pandas, pc_2a)
      self.assertIsNot(pc_ab_pandas, pc_ab)
      self.assertIsNot(pc_3a_pandas, pc_3a)

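  # The assertions below presume that TO_PCOLLECTION_CACHE holds its entries
  # weakly (an assumption about the cache's implementation), so they vanish
  # once the owning pipeline is garbage-collected.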
  def test_convert_memoization_clears_cache(self):
    # This test re-runs the other memoization test, and makes sure that the
    # cache is cleaned up with the pipeline. Otherwise there would be
    # concerns of it growing without bound.

    import gc

    # Make sure the cache is clear.
    gc.collect()
    self.assertEqual(len(convert.TO_PCOLLECTION_CACHE), 0)

    # Disable GC so it doesn't run preemptively, confounding assertions
    # about cache size.
    gc.disable()

    # Also disable logging, as some implementations may artificially extend
    # the life of objects.
    import logging
    logging.disable(logging.INFO)

    try:
      self.test_convert_memoization()
      self.assertEqual(len(convert.TO_PCOLLECTION_CACHE), 3)

      gc.collect()

      # PCollections should be removed from the cache after pipelines go
      # out of scope and are GC'd.
      self.assertEqual(len(convert.TO_PCOLLECTION_CACHE), 0)
    finally:
      # Always re-enable GC and logging.
      gc.enable()
      logging.disable(logging.NOTSET)

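  # Applying a PTransform directly to a deferred DataFrame should implicitly
  # convert it to a PCollection and back, yielding another deferred frame.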
  def test_auto_convert(self):
    class MySchemaTransform(beam.PTransform):
      def expand(self, pcoll):
        # Maps each (m, n) row to the Pythagorean triple
        # (n^2 - m^2, 2mn, n^2 + m^2).
        return pcoll | beam.Map(
            lambda x: beam.Row(
                a=x.n**2 - x.m**2, b=2 * x.m * x.n, c=x.n**2 + x.m**2))

    with beam.Pipeline() as p:
      pc_mn = p | beam.Create([
          (1, 2), (2, 3), (3, 10)
      ]) | beam.MapTuple(lambda m, n: beam.Row(m=m, n=n))

      df_mn = convert.to_dataframe(pc_mn)

      # Apply a transform directly to a dataframe to get another dataframe.
      df_abc = df_mn | MySchemaTransform()

      pc_abc = convert.to_pcollection(df_abc) | beam.Map(tuple)
      assert_that(pc_abc, equal_to([(3, 4, 5), (5, 12, 13), (91, 60, 109)]))


if __name__ == '__main__':
  unittest.main()