#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import pandas as pd

import apache_beam as beam
from apache_beam.dataframe import convert
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to


def equal_to_unordered_series(expected):
  def check(actual):
    actual = pd.concat(actual)
    if sorted(expected) != sorted(actual):
      raise AssertionError('Series not equal: \n%s\n%s\n' % (expected, actual))

  return check
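
# A minimal usage sketch of the matcher above (hypothetical values, not one
# of the upstream tests): with yield_elements='pandas', to_pcollection emits
# whole pandas objects, potentially one per partition, so the matcher
# receives an iterable of Series and concatenates them before comparing,
# ignoring element order.
def _sketch_equal_to_unordered_series():
  check = equal_to_unordered_series(pd.Series([3, 1, 2]))
  # Two partitions that together hold the same values in a different order
  # satisfy the matcher; a missing or extra value would raise AssertionError.
  check([pd.Series([1, 2]), pd.Series([3])])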

class ConvertTest(unittest.TestCase):
  def test_convert_yield_pandas(self):
    with beam.Pipeline() as p:
      a = pd.Series([1, 2, 3])
      b = pd.Series([100, 200, 300])

      pc_a = p | 'A' >> beam.Create([a])
      pc_b = p | 'B' >> beam.Create([b])

      df_a = convert.to_dataframe(pc_a, proxy=a[:0])
      df_b = convert.to_dataframe(pc_b, proxy=b[:0])

      df_2a = 2 * df_a
      df_3a = 3 * df_a
      df_ab = df_a * df_b

      # Converting multiple results at a time can be more efficient.
      pc_2a, pc_ab = convert.to_pcollection(
          df_2a, df_ab, yield_elements='pandas')
      # But separate conversions can be done as well.
      pc_3a = convert.to_pcollection(df_3a, yield_elements='pandas')

      assert_that(pc_2a, equal_to_unordered_series(2 * a), label='Check2a')
      assert_that(pc_3a, equal_to_unordered_series(3 * a), label='Check3a')
      assert_that(pc_ab, equal_to_unordered_series(a * b), label='Checkab')

  def test_convert(self):
    with beam.Pipeline() as p:
      a = pd.Series([1, 2, 3])
      b = pd.Series([100, 200, 300])

      pc_a = p | 'A' >> beam.Create(a)
      pc_b = p | 'B' >> beam.Create(b)

      df_a = convert.to_dataframe(pc_a)
      df_b = convert.to_dataframe(pc_b)

      df_2a = 2 * df_a
      df_3a = 3 * df_a
      df_ab = df_a * df_b

      # Converting multiple results at a time can be more efficient.
      pc_2a, pc_ab = convert.to_pcollection(df_2a, df_ab)
      # But separate conversions can be done as well.
      pc_3a = convert.to_pcollection(df_3a)

      assert_that(pc_2a, equal_to(list(2 * a)), label='Check2a')
      assert_that(pc_3a, equal_to(list(3 * a)), label='Check3a')
      assert_that(pc_ab, equal_to(list(a * b)), label='Checkab')

  def test_convert_with_none(self):
    # Ensure the logical Any type allows (nullable) None, see BEAM-12587.
    df = pd.DataFrame({'A': ['str', 10, None], 'B': [None, 'str', 20]})
    with beam.Pipeline() as p:
      res = convert.to_pcollection(df, pipeline=p) | beam.Map(tuple)
      assert_that(res, equal_to([(row.A, row.B) for _, row in df.iterrows()]))

  def test_convert_scalar(self):
    with beam.Pipeline() as p:
      pc = p | 'A' >> beam.Create([1, 2, 3])
      s = convert.to_dataframe(pc)
      pc_sum = convert.to_pcollection(s.sum())
      assert_that(pc_sum, equal_to([6]))

  def test_convert_non_deferred(self):
    with beam.Pipeline() as p:
      s1 = pd.Series([1, 2, 3])
      s2 = convert.to_dataframe(p | beam.Create([100, 200, 300]))

      pc1, pc2 = convert.to_pcollection(s1, s2, pipeline=p)
      assert_that(pc1, equal_to([1, 2, 3]), label='CheckNonDeferred')
      assert_that(pc2, equal_to([100, 200, 300]), label='CheckDeferred')

  def test_convert_memoization(self):
    with beam.Pipeline() as p:
      a = pd.Series([1, 2, 3])
      b = pd.Series([100, 200, 300])

      pc_a = p | 'A' >> beam.Create([a])
      pc_b = p | 'B' >> beam.Create([b])

      df_a = convert.to_dataframe(pc_a, proxy=a[:0])
      df_b = convert.to_dataframe(pc_b, proxy=b[:0])

      df_2a = 2 * df_a
      df_3a = 3 * df_a
      df_ab = df_a * df_b

      # Two calls to to_pcollection with the same DataFrame should produce
      # the same PCollection(s).
      pc_2a_, pc_ab_ = convert.to_pcollection(df_2a, df_ab)
      pc_3a, pc_2a, pc_ab = convert.to_pcollection(df_3a, df_2a, df_ab)

      self.assertIs(pc_2a, pc_2a_)
      self.assertIs(pc_ab, pc_ab_)
      self.assertIsNot(pc_3a, pc_2a)
      self.assertIsNot(pc_3a, pc_ab)

      # The same conversions without the unbatching transform should also
      # cache PCollections.
      pc_2a_pandas_, pc_ab_pandas_ = convert.to_pcollection(
          df_2a, df_ab, yield_elements='pandas')
      pc_3a_pandas, pc_2a_pandas, pc_ab_pandas = convert.to_pcollection(
          df_3a, df_2a, df_ab, yield_elements='pandas')

      self.assertIs(pc_2a_pandas, pc_2a_pandas_)
      self.assertIs(pc_ab_pandas, pc_ab_pandas_)
      self.assertIsNot(pc_3a_pandas, pc_2a_pandas)
      self.assertIsNot(pc_3a_pandas, pc_ab_pandas)

      # ... but the cached PCollections should be different.
      self.assertIsNot(pc_2a_pandas, pc_2a)
      self.assertIsNot(pc_ab_pandas, pc_ab)
      self.assertIsNot(pc_3a_pandas, pc_3a)

  def test_convert_memoization_clears_cache(self):
    # This test re-runs the other memoization test and makes sure that the
    # cache is cleaned up with the pipeline. Otherwise there would be
    # concerns of it growing without bound.

    import gc

    # Make sure the cache is clear.
    gc.collect()
    self.assertEqual(len(convert.TO_PCOLLECTION_CACHE), 0)

    # Disable GC so it doesn't run pre-emptively, confounding assertions
    # about cache size.
    gc.disable()

    # Also disable logging, as some implementations may artificially extend
    # the life of objects.
    import logging
    logging.disable(logging.INFO)

    try:
      self.test_convert_memoization()
      self.assertEqual(len(convert.TO_PCOLLECTION_CACHE), 3)

      gc.collect()

      # PCollections should be removed from the cache after pipelines go
      # out of scope and are GC'd.
      self.assertEqual(len(convert.TO_PCOLLECTION_CACHE), 0)
    finally:
      # Always re-enable GC and logging.
      gc.enable()
      logging.disable(logging.NOTSET)

  def test_auto_convert(self):
    class MySchemaTransform(beam.PTransform):
      def expand(self, pcoll):
        return pcoll | beam.Map(
            lambda x: beam.Row(
                a=x.n**2 - x.m**2, b=2 * x.m * x.n, c=x.n**2 + x.m**2))

    with beam.Pipeline() as p:
      pc_mn = p | beam.Create([
          (1, 2), (2, 3), (3, 10)
      ]) | beam.MapTuple(lambda m, n: beam.Row(m=m, n=n))

      df_mn = convert.to_dataframe(pc_mn)

      # Apply a transform directly to a dataframe to get another dataframe.
      df_abc = df_mn | MySchemaTransform()

      pc_abc = convert.to_pcollection(df_abc) | beam.Map(tuple)
      assert_that(pc_abc, equal_to([(3, 4, 5), (5, 12, 13), (91, 60, 109)]))
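
# A minimal, self-contained sketch (an assumption about the mechanism, not
# Beam's actual implementation; the real cache lives in
# apache_beam.dataframe.convert and its keying may differ) of the property
# the memoization tests above rely on: a cache that holds its values only
# through weak references cannot keep them alive, so entries vanish once the
# owning pipeline is garbage collected and the cache cannot grow without
# bound.
def _sketch_weak_value_cache():
  import gc
  import weakref

  class Result:  # stand-in for a cached PCollection
    pass

  cache = weakref.WeakValueDictionary()
  result = Result()
  cache['df -> pcoll'] = result
  assert len(cache) == 1

  # Drop the last strong reference, as when a pipeline goes out of scope.
  del result
  gc.collect()
  assert len(cache) == 0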

if __name__ == '__main__':
  unittest.main()