#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Tests for schemas."""

# pytype: skip-file

import re
import typing
import unittest

import numpy as np
import pandas as pd
from parameterized import parameterized

import apache_beam as beam
from apache_beam.coders import RowCoder
from apache_beam.coders.typecoders import registry as coders_registry
from apache_beam.dataframe import schemas
from apache_beam.dataframe import transforms
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from apache_beam.typehints import row_type
from apache_beam.typehints import typehints
from apache_beam.typehints.native_type_compatibility import match_is_named_tuple

Simple = typing.NamedTuple(
    'Simple', [('name', str), ('id', int), ('height', float)])
coders_registry.register_coder(Simple, RowCoder)
Animal = typing.NamedTuple(
    'Animal', [('animal', str), ('max_speed', typing.Optional[float])])
coders_registry.register_coder(Animal, RowCoder)


def matches_df(expected):
  """Returns a matcher usable with ``assert_that`` for DataFrame batches.

  The returned callable concatenates the actual batches, sorts both the actual
  and the expected frame by all of their columns, resets both indexes, and
  asserts that the resulting frames are equal.
  """
  def check_df_pcoll_equal(actual):
    actual = pd.concat(actual)
    sorted_actual = actual.sort_values(by=list(actual.columns)).reset_index(
        drop=True)
    sorted_expected = expected.sort_values(
        by=list(expected.columns)).reset_index(drop=True)
    pd.testing.assert_frame_equal(sorted_actual, sorted_expected)

  return check_df_pcoll_equal


# Test data for all supported types that can be easily tested.
# Excludes bytes because it's difficult to create a series or dataframe with a
# bytes dtype. For example:
#   pd.Series([b'abc'], dtype=bytes).dtype != 'S'
#   pd.Series([b'abc'], dtype=bytes).astype(bytes).dtype == 'S'
# (test data, pandas_type, column_name, beam_type)
COLUMNS = [
    ([375, 24, 0, 10, 16], np.int32, 'i32', np.int32),
    ([375, 24, 0, 10, 16], np.int64, 'i64', np.int64),
    ([375, 24, None, 10, 16],
     pd.Int32Dtype(),
     'i32_nullable',
     typing.Optional[np.int32]),
    ([375, 24, None, 10, 16],
     pd.Int64Dtype(),
     'i64_nullable',
     typing.Optional[np.int64]),
    ([375., 24., None, 10., 16.],
     np.float64,
     'f64',
     typing.Optional[np.float64]),
    ([375., 24., None, 10., 16.],
     np.float32,
     'f32',
     typing.Optional[np.float32]),
    ([True, False, True, True, False], bool, 'bool', bool),
    (['Falcon', 'Ostrich', None, 3.14, 0], object, 'any', typing.Any),
    ([True, False, True, None, False],
     pd.BooleanDtype(),
     'bool_nullable',
     typing.Optional[bool]),
    (['Falcon', 'Ostrich', None, 'Aardvark', 'Elephant'],
     pd.StringDtype(),
     'strdtype',
     typing.Optional[str]),
]  # type: typing.List[typing.Tuple[typing.List[typing.Any], typing.Any, str, typing.Any]]

# One DataFrame holding every column described in COLUMNS.
NICE_TYPES_DF = pd.DataFrame(columns=[name for _, _, name, _ in COLUMNS])
for arr, dtype, name, _ in COLUMNS:
  NICE_TYPES_DF[name] = pd.Series(arr, dtype=dtype, name=name).astype(dtype)

# Empty slice of NICE_TYPES_DF (same columns and dtypes, no rows).
NICE_TYPES_PROXY = NICE_TYPES_DF[:0]

# (series, expected elements, expected beam type), one entry per column.
SERIES_TESTS = [(pd.Series(arr, dtype=dtype, name=name), arr, beam_type)
                for (arr, dtype, name, beam_type) in COLUMNS]

_TEST_ARRAYS = [
    arr for (arr, _, _, _) in COLUMNS
]  # type: typing.List[typing.List[typing.Any]]
# Row-wise view of the column-wise test arrays.
DF_RESULT = list(zip(*_TEST_ARRAYS))
BEAM_SCHEMA = typing.NamedTuple(  # type: ignore
    'BEAM_SCHEMA', [(name, beam_type) for _, _, name, beam_type in COLUMNS])
# One case per prefix of COLUMNS: the first i columns are moved to the index.
INDEX_DF_TESTS = [(
    NICE_TYPES_DF.set_index([name for _, _, name, _ in COLUMNS[:i]]),
    DF_RESULT,
    BEAM_SCHEMA) for i in range(1, len(COLUMNS) + 1)]

NOINDEX_DF_TESTS = [(NICE_TYPES_DF, DF_RESULT, BEAM_SCHEMA)]


def _parse_pd_version(version):
  """Returns up to three leading numeric components of a version string.

  Only the leading digits of each dot-separated component are used, so
  suffixed versions such as '2.0.0rc1' or '1.3.0+local' parse to (2, 0, 0) and
  (1, 3, 0) instead of raising ValueError at import time. Parsing stops at the
  first component that does not start with a digit.
  """
  parsed = []
  for component in version.split('.')[0:3]:
    leading_digits = re.match(r'\d+', component)
    if leading_digits is None:
      break
    parsed.append(int(leading_digits.group()))
  return tuple(parsed)


# Get major, minor, bugfix version
PD_VERSION = _parse_pd_version(pd.__version__)


def test_name_func(testcase_func, param_num, params):
  """Builds a readable test name for ``parameterized.expand`` cases.

  The name is derived from the dtype(s) of the Series or DataFrame that is the
  first positional parameter of the test case.

  Raises:
    ValueError: if the first parameter is neither a Series nor a DataFrame.
  """
  df_or_series, _, _ = params.args
  if isinstance(df_or_series, pd.Series):
    return f"{testcase_func.__name__}_Series[{df_or_series.dtype}]"
  elif isinstance(df_or_series, pd.DataFrame):
    return (
        f"{testcase_func.__name__}_DataFrame"
        f"[{','.join(str(dtype) for dtype in df_or_series.dtypes)}]")
  else:
    raise ValueError(
        f"Encountered unsupported param in {testcase_func.__name__}. "
        "Expected Series or DataFrame, got:\n" + str(df_or_series))


class SchemasTest(unittest.TestCase):
  def test_simple_df(self):
    # Batches NamedTuple rows into DataFrames and compares with the expected
    # frame.
    expected = pd.DataFrame({
        'name': [str(i) for i in range(5)],
        'id': list(range(5)),
        'height': [float(i) for i in range(5)]
    },
                            columns=['name', 'id', 'height'])

    expected.name = expected.name.astype(pd.StringDtype())

    with TestPipeline() as p:
      res = (
          p
          | beam.Create(
              [Simple(name=str(i), id=i, height=float(i)) for i in range(5)])
          | schemas.BatchRowsAsDataFrame(min_batch_size=10, max_batch_size=10))
      assert_that(res, matches_df(expected))

  def test_simple_df_with_beam_row(self):
    # Same as test_simple_df, but the rows are produced by beam.Select instead
    # of a registered NamedTuple.
    expected = pd.DataFrame({
        'name': [str(i) for i in range(5)],
        'id': list(range(5)),
        'height': [float(i) for i in range(5)]
    },
                            columns=['name', 'id', 'height'])
    expected.name = expected.name.astype(pd.StringDtype())

    with TestPipeline() as p:
      res = (
          p
          | beam.Create([(str(i), i, float(i)) for i in range(5)])
          | beam.Select(
              name=lambda r: str(r[0]),
              id=lambda r: int(r[1]),
              height=lambda r: float(r[2]))
          | schemas.BatchRowsAsDataFrame(min_batch_size=10, max_batch_size=10))
      assert_that(res, matches_df(expected))

  def test_generate_proxy(self):
    expected = pd.DataFrame({
        'animal': pd.Series(dtype=pd.StringDtype()),
        'max_speed': pd.Series(dtype=np.float64)
    })

    pd.testing.assert_frame_equal(schemas.generate_proxy(Animal), expected)

  def test_generate_proxy_beam_typehint(self):
    expected = pd.Series(dtype=pd.Int32Dtype())

    actual = schemas.generate_proxy(typehints.Optional[np.int32])

    pd.testing.assert_series_equal(actual, expected)

  def test_nice_types_proxy_roundtrip(self):
    # proxy -> element type -> proxy should yield an equal proxy.
    roundtripped = schemas.generate_proxy(
        schemas.element_type_from_dataframe(NICE_TYPES_PROXY))
    self.assertTrue(roundtripped.equals(NICE_TYPES_PROXY))

  @unittest.skipIf(
      PD_VERSION == (1, 2, 1),
      "Can't roundtrip bytes in pandas 1.2.1: "
      "https://github.com/pandas-dev/pandas/issues/39474")
  def test_bytes_proxy_roundtrip(self):
    proxy = pd.DataFrame({'bytes': []})
    proxy.bytes = proxy.bytes.astype(bytes)

    roundtripped = schemas.generate_proxy(
        schemas.element_type_from_dataframe(proxy))

    self.assertEqual(roundtripped.bytes.dtype.kind, 'S')

  def test_batch_with_df_transform(self):
    # The same input rows feed both pipelines below.
    animals = [
        Animal('Falcon', 380.0),
        Animal('Falcon', 370.0),
        Animal('Parrot', 24.0),
        Animal('Parrot', 26.0)
    ]

    with TestPipeline() as p:
      res = (
          p
          | beam.Create(animals)
          | schemas.BatchRowsAsDataFrame()
          | transforms.DataframeTransform(
              lambda df: df.groupby('animal').mean(),
              # TODO: Generate proxy in this case as well
              proxy=schemas.generate_proxy(Animal),
              include_indexes=True))
      assert_that(res, equal_to([('Falcon', 375.), ('Parrot', 25.)]))

    # Do the same thing, but use reset_index() to make sure 'animal' is included
    with TestPipeline() as p:
      with beam.dataframe.allow_non_parallel_operations():
        res = (
            p
            | beam.Create(animals)
            | schemas.BatchRowsAsDataFrame()
            | transforms.DataframeTransform(
                lambda df: df.groupby('animal').mean().reset_index(),
                # TODO: Generate proxy in this case as well
                proxy=schemas.generate_proxy(Animal)))
        assert_that(res, equal_to([('Falcon', 375.), ('Parrot', 25.)]))

  def assert_typehints_equal(self, left, right):
    """Asserts that two typehints are equal after normalization.

    RowTypeConstraints are replaced by their ``user_type`` before comparing.
    NamedTuple types are compared by their ``__annotations__``; everything
    else is compared with ``assertEqual``.
    """
    def maybe_drop_rowtypeconstraint(typehint):
      if isinstance(typehint, row_type.RowTypeConstraint):
        return typehint.user_type
      else:
        return typehint

    left = maybe_drop_rowtypeconstraint(typehints.normalize(left))
    right = maybe_drop_rowtypeconstraint(typehints.normalize(right))

    if match_is_named_tuple(left):
      self.assertTrue(match_is_named_tuple(right))
      self.assertEqual(left.__annotations__, right.__annotations__)
    else:
      self.assertEqual(left, right)

  @parameterized.expand(
      SERIES_TESTS + NOINDEX_DF_TESTS, name_func=test_name_func)
  def test_unbatch_no_index(self, df_or_series, rows, beam_type):
    proxy = df_or_series[:0]

    with TestPipeline() as p:
      # Feed the data as two interleaved batches (even rows, then odd rows).
      res = (
          p | beam.Create([df_or_series[::2], df_or_series[1::2]])
          | schemas.UnbatchPandas(proxy))

      # Verify that the unbatched PCollection has the expected typehint
      # TODO(https://github.com/apache/beam/issues/19923): typehints should
      # support NamedTuple so we can use typehints.is_consistent_with here
      # instead
      self.assert_typehints_equal(res.element_type, beam_type)

      assert_that(res, equal_to(rows))

  @parameterized.expand(SERIES_TESTS + INDEX_DF_TESTS, name_func=test_name_func)
  def test_unbatch_with_index(self, df_or_series, rows, _):
    proxy = df_or_series[:0]

    if (PD_VERSION < (1, 2) and
        {'i32_nullable', 'i64_nullable'}.intersection(proxy.index.names)):
      self.skipTest(
          "pandas<1.2 incorrectly changes Int64Dtype to int64 when "
          "moved to index.")

    with TestPipeline() as p:
      res = (
          p | beam.Create([df_or_series[::2], df_or_series[1::2]])
          | schemas.UnbatchPandas(proxy, include_indexes=True))

      assert_that(res, equal_to(rows))

  @parameterized.expand(SERIES_TESTS, name_func=test_name_func)
  def test_unbatch_series_with_index_warns(
      self, series, unused_rows, unused_type):
    proxy = series[:0]

    with TestPipeline() as p:
      input_pc = p | beam.Create([series[::2], series[1::2]])
      with self.assertWarns(UserWarning):
        _ = input_pc | schemas.UnbatchPandas(proxy, include_indexes=True)

  def test_unbatch_include_index_unnamed_index_raises(self):
    df = pd.DataFrame({'foo': [1, 2, 3, 4]})
    proxy = df[:0]

    with TestPipeline() as p:
      pc = p | beam.Create([df[::2], df[1::2]])

      with self.assertRaisesRegex(ValueError, 'unnamed'):
        _ = pc | schemas.UnbatchPandas(proxy, include_indexes=True)

  def test_unbatch_include_index_nonunique_index_raises(self):
    df = pd.DataFrame({'foo': [1, 2, 3, 4]})
    # Both index levels share the name 'bar'.
    df.index = pd.MultiIndex.from_arrays([[1, 2, 3, 4], [4, 3, 2, 1]],
                                         names=['bar', 'bar'])
    proxy = df[:0]

    with TestPipeline() as p:
      pc = p | beam.Create([df[::2], df[1::2]])

      with self.assertRaisesRegex(ValueError, 'bar'):
        _ = pc | schemas.UnbatchPandas(proxy, include_indexes=True)

  def test_unbatch_include_index_column_conflict_raises(self):
    df = pd.DataFrame({'foo': [1, 2, 3, 4]})
    # The index name collides with the existing column name 'foo'.
    df.index = pd.Index([4, 3, 2, 1], name='foo')
    proxy = df[:0]

    with TestPipeline() as p:
      pc = p | beam.Create([df[::2], df[1::2]])

      with self.assertRaisesRegex(ValueError, 'foo'):
        _ = pc | schemas.UnbatchPandas(proxy, include_indexes=True)

  def test_unbatch_datetime(self):

    s = pd.Series(
        pd.date_range(
            '1/1/2000', periods=100, freq='m', tz='America/Los_Angeles'))
    proxy = s[:0]

    with TestPipeline() as p:
      res = (
          p | beam.Create([s[::2], s[1::2]])
          | schemas.UnbatchPandas(proxy, include_indexes=True))

      assert_that(res, equal_to(list(s)))


if __name__ == '__main__':
  unittest.main()