github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/schemas_test.py

github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/schemas_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Tests for schemas."""
    19  
    20  # pytype: skip-file
    21  
    22  import typing
    23  import unittest
    24  
    25  import numpy as np
    26  import pandas as pd
    27  from parameterized import parameterized
    28  
    29  import apache_beam as beam
    30  from apache_beam.coders import RowCoder
    31  from apache_beam.coders.typecoders import registry as coders_registry
    32  from apache_beam.dataframe import schemas
    33  from apache_beam.dataframe import transforms
    34  from apache_beam.testing.test_pipeline import TestPipeline
    35  from apache_beam.testing.util import assert_that
    36  from apache_beam.testing.util import equal_to
    37  from apache_beam.typehints import row_type
    38  from apache_beam.typehints import typehints
    39  from apache_beam.typehints.native_type_compatibility import match_is_named_tuple
    40  
    41  Simple = typing.NamedTuple(
    42      'Simple', [('name', str), ('id', int), ('height', float)])
    43  coders_registry.register_coder(Simple, RowCoder)
    44  Animal = typing.NamedTuple(
    45      'Animal', [('animal', str), ('max_speed', typing.Optional[float])])
    46  coders_registry.register_coder(Animal, RowCoder)
    47  
    48  
    49  def matches_df(expected):
    50    def check_df_pcoll_equal(actual):
    51      actual = pd.concat(actual)
    52      sorted_actual = actual.sort_values(by=list(actual.columns)).reset_index(
    53          drop=True)
    54      sorted_expected = expected.sort_values(
    55          by=list(expected.columns)).reset_index(drop=True)
    56      pd.testing.assert_frame_equal(sorted_actual, sorted_expected)
    57  
    58    return check_df_pcoll_equal
    59  
    60  
# Test data for all supported types that can be easily tested.
# Excludes bytes because it's difficult to create a series and dataframe bytes
# dtype. For example:
#   pd.Series([b'abc'], dtype=bytes).dtype != 'S'
#   pd.Series([b'abc'], dtype=bytes).astype(bytes).dtype == 'S'
#
# Each entry is a 4-tuple:
#   (test data, pandas_type, column_name, beam_type)
# where `test data` is a list of five values, `pandas_type` is the dtype used
# to build the pandas Series, `column_name` is the column label, and
# `beam_type` is the element type expected on the Beam side. Every entry whose
# test data contains None (other than the 'any' column) is paired with an
# Optional beam_type.
COLUMNS = [
    # Plain numpy integers: no missing values, non-Optional beam types.
    ([375, 24, 0, 10, 16], np.int32, 'i32', np.int32),
    ([375, 24, 0, 10, 16], np.int64, 'i64', np.int64),
    # pandas nullable integer extension dtypes.
    ([375, 24, None, 10, 16],
     pd.Int32Dtype(),
     'i32_nullable',
     typing.Optional[np.int32]),
    ([375, 24, None, 10, 16],
     pd.Int64Dtype(),
     'i64_nullable',
     typing.Optional[np.int64]),
    # numpy floats: the data includes a missing value and the beam type is
    # Optional.
    ([375., 24., None, 10., 16.],
     np.float64,
     'f64',
     typing.Optional[np.float64]),
    ([375., 24., None, 10., 16.],
     np.float32,
     'f32',
     typing.Optional[np.float32]),
    ([True, False, True, True, False], bool, 'bool', bool),
    # Mixed-type object column, expected to surface as typing.Any.
    (['Falcon', 'Ostrich', None, 3.14, 0], object, 'any', typing.Any),
    # pandas nullable boolean and string extension dtypes.
    ([True, False, True, None, False],
     pd.BooleanDtype(),
     'bool_nullable',
     typing.Optional[bool]),
    (['Falcon', 'Ostrich', None, 'Aardvark', 'Elephant'],
     pd.StringDtype(),
     'strdtype',
     typing.Optional[str]),
]  # type: typing.List[typing.Tuple[typing.List[typing.Any], typing.Any, str, typing.Any]]
    97  
    98  NICE_TYPES_DF = pd.DataFrame(columns=[name for _, _, name, _ in COLUMNS])
    99  for arr, dtype, name, _ in COLUMNS:
   100    NICE_TYPES_DF[name] = pd.Series(arr, dtype=dtype, name=name).astype(dtype)
   101  
   102  NICE_TYPES_PROXY = NICE_TYPES_DF[:0]
   103  
   104  SERIES_TESTS = [(pd.Series(arr, dtype=dtype, name=name), arr, beam_type)
   105                  for (arr, dtype, name, beam_type) in COLUMNS]
   106  
   107  _TEST_ARRAYS = [
   108      arr for (arr, _, _, _) in COLUMNS
   109  ]  # type: typing.List[typing.List[typing.Any]]
   110  DF_RESULT = list(zip(*_TEST_ARRAYS))
   111  BEAM_SCHEMA = typing.NamedTuple(  # type: ignore
   112      'BEAM_SCHEMA', [(name, beam_type) for _, _, name, beam_type in COLUMNS])
   113  INDEX_DF_TESTS = [(
   114      NICE_TYPES_DF.set_index([name for _, _, name, _ in COLUMNS[:i]]),
   115      DF_RESULT,
   116      BEAM_SCHEMA) for i in range(1, len(COLUMNS) + 1)]
   117  
   118  NOINDEX_DF_TESTS = [(NICE_TYPES_DF, DF_RESULT, BEAM_SCHEMA)]
   119  
   120  # Get major, minor, bugfix version
   121  PD_VERSION = tuple(map(int, pd.__version__.split('.')[0:3]))
   122  
   123  
   124  def test_name_func(testcase_func, param_num, params):
   125    df_or_series, _, _ = params.args
   126    if isinstance(df_or_series, pd.Series):
   127      return f"{testcase_func.__name__}_Series[{df_or_series.dtype}]"
   128    elif isinstance(df_or_series, pd.DataFrame):
   129      return (
   130          f"{testcase_func.__name__}_DataFrame"
   131          f"[{','.join(str(dtype) for dtype in df_or_series.dtypes)}]")
   132    else:
   133      raise ValueError(
   134          f"Encountered unsupported param in {testcase_func.__name__}. "
   135          "Expected Series or DataFrame, got:\n" + str(df_or_series))
   136  
   137  
   138  class SchemasTest(unittest.TestCase):
   139    def test_simple_df(self):
   140      expected = pd.DataFrame({
   141          'name': list(str(i) for i in range(5)),
   142          'id': list(range(5)),
   143          'height': list(float(i) for i in range(5))
   144      },
   145                              columns=['name', 'id', 'height'])
   146  
   147      expected.name = expected.name.astype(pd.StringDtype())
   148  
   149      with TestPipeline() as p:
   150        res = (
   151            p
   152            | beam.Create(
   153                [Simple(name=str(i), id=i, height=float(i)) for i in range(5)])
   154            | schemas.BatchRowsAsDataFrame(min_batch_size=10, max_batch_size=10))
   155        assert_that(res, matches_df(expected))
   156  
   157    def test_simple_df_with_beam_row(self):
   158      expected = pd.DataFrame({
   159          'name': list(str(i) for i in range(5)),
   160          'id': list(range(5)),
   161          'height': list(float(i) for i in range(5))
   162      },
   163                              columns=['name', 'id', 'height'])
   164      expected.name = expected.name.astype(pd.StringDtype())
   165  
   166      with TestPipeline() as p:
   167        res = (
   168            p
   169            | beam.Create([(str(i), i, float(i)) for i in range(5)])
   170            | beam.Select(
   171                name=lambda r: str(r[0]),
   172                id=lambda r: int(r[1]),
   173                height=lambda r: float(r[2]))
   174            | schemas.BatchRowsAsDataFrame(min_batch_size=10, max_batch_size=10))
   175        assert_that(res, matches_df(expected))
   176  
   177    def test_generate_proxy(self):
   178      expected = pd.DataFrame({
   179          'animal': pd.Series(dtype=pd.StringDtype()),
   180          'max_speed': pd.Series(dtype=np.float64)
   181      })
   182  
   183      pd.testing.assert_frame_equal(schemas.generate_proxy(Animal), expected)
   184  
   185    def test_generate_proxy_beam_typehint(self):
   186      expected = pd.Series(dtype=pd.Int32Dtype())
   187  
   188      actual = schemas.generate_proxy(typehints.Optional[np.int32])
   189  
   190      pd.testing.assert_series_equal(actual, expected)
   191  
   192    def test_nice_types_proxy_roundtrip(self):
   193      roundtripped = schemas.generate_proxy(
   194          schemas.element_type_from_dataframe(NICE_TYPES_PROXY))
   195      self.assertTrue(roundtripped.equals(NICE_TYPES_PROXY))
   196  
   197    @unittest.skipIf(
   198        PD_VERSION == (1, 2, 1),
   199        "Can't roundtrip bytes in pandas 1.2.1"
   200        "https://github.com/pandas-dev/pandas/issues/39474")
   201    def test_bytes_proxy_roundtrip(self):
   202      proxy = pd.DataFrame({'bytes': []})
   203      proxy.bytes = proxy.bytes.astype(bytes)
   204  
   205      roundtripped = schemas.generate_proxy(
   206          schemas.element_type_from_dataframe(proxy))
   207  
   208      self.assertEqual(roundtripped.bytes.dtype.kind, 'S')
   209  
   210    def test_batch_with_df_transform(self):
   211      with TestPipeline() as p:
   212        res = (
   213            p
   214            | beam.Create([
   215                Animal('Falcon', 380.0),
   216                Animal('Falcon', 370.0),
   217                Animal('Parrot', 24.0),
   218                Animal('Parrot', 26.0)
   219            ])
   220            | schemas.BatchRowsAsDataFrame()
   221            | transforms.DataframeTransform(
   222                lambda df: df.groupby('animal').mean(),
   223                # TODO: Generate proxy in this case as well
   224                proxy=schemas.generate_proxy(Animal),
   225                include_indexes=True))
   226        assert_that(res, equal_to([('Falcon', 375.), ('Parrot', 25.)]))
   227  
   228      # Do the same thing, but use reset_index() to make sure 'animal' is included
   229      with TestPipeline() as p:
   230        with beam.dataframe.allow_non_parallel_operations():
   231          res = (
   232              p
   233              | beam.Create([
   234                  Animal('Falcon', 380.0),
   235                  Animal('Falcon', 370.0),
   236                  Animal('Parrot', 24.0),
   237                  Animal('Parrot', 26.0)
   238              ])
   239              | schemas.BatchRowsAsDataFrame()
   240              | transforms.DataframeTransform(
   241                  lambda df: df.groupby('animal').mean().reset_index(),
   242                  # TODO: Generate proxy in this case as well
   243                  proxy=schemas.generate_proxy(Animal)))
   244          assert_that(res, equal_to([('Falcon', 375.), ('Parrot', 25.)]))
   245  
   246    def assert_typehints_equal(self, left, right):
   247      def maybe_drop_rowtypeconstraint(typehint):
   248        if isinstance(typehint, row_type.RowTypeConstraint):
   249          return typehint.user_type
   250        else:
   251          return typehint
   252  
   253      left = maybe_drop_rowtypeconstraint(typehints.normalize(left))
   254      right = maybe_drop_rowtypeconstraint(typehints.normalize(right))
   255  
   256      if match_is_named_tuple(left):
   257        self.assertTrue(match_is_named_tuple(right))
   258        self.assertEqual(left.__annotations__, right.__annotations__)
   259      else:
   260        self.assertEqual(left, right)
   261  
   262    @parameterized.expand(
   263        SERIES_TESTS + NOINDEX_DF_TESTS, name_func=test_name_func)
   264    def test_unbatch_no_index(self, df_or_series, rows, beam_type):
   265      proxy = df_or_series[:0]
   266  
   267      with TestPipeline() as p:
   268        res = (
   269            p | beam.Create([df_or_series[::2], df_or_series[1::2]])
   270            | schemas.UnbatchPandas(proxy))
   271  
   272        # Verify that the unbatched PCollection has the expected typehint
   273        # TODO(https://github.com/apache/beam/issues/19923): typehints should
   274        # support NamedTuple so we can use typehints.is_consistent_with here
   275        # instead
   276        self.assert_typehints_equal(res.element_type, beam_type)
   277  
   278        assert_that(res, equal_to(rows))
   279  
   280    @parameterized.expand(SERIES_TESTS + INDEX_DF_TESTS, name_func=test_name_func)
   281    def test_unbatch_with_index(self, df_or_series, rows, _):
   282      proxy = df_or_series[:0]
   283  
   284      if (PD_VERSION < (1, 2) and
   285          set(['i32_nullable', 'i64_nullable']).intersection(proxy.index.names)):
   286        self.skipTest(
   287            "pandas<1.2 incorrectly changes Int64Dtype to int64 when "
   288            "moved to index.")
   289  
   290      with TestPipeline() as p:
   291        res = (
   292            p | beam.Create([df_or_series[::2], df_or_series[1::2]])
   293            | schemas.UnbatchPandas(proxy, include_indexes=True))
   294  
   295        assert_that(res, equal_to(rows))
   296  
   297    @parameterized.expand(SERIES_TESTS, name_func=test_name_func)
   298    def test_unbatch_series_with_index_warns(
   299        self, series, unused_rows, unused_type):
   300      proxy = series[:0]
   301  
   302      with TestPipeline() as p:
   303        input_pc = p | beam.Create([series[::2], series[1::2]])
   304        with self.assertWarns(UserWarning):
   305          _ = input_pc | schemas.UnbatchPandas(proxy, include_indexes=True)
   306  
   307    def test_unbatch_include_index_unnamed_index_raises(self):
   308      df = pd.DataFrame({'foo': [1, 2, 3, 4]})
   309      proxy = df[:0]
   310  
   311      with TestPipeline() as p:
   312        pc = p | beam.Create([df[::2], df[1::2]])
   313  
   314        with self.assertRaisesRegex(ValueError, 'unnamed'):
   315          _ = pc | schemas.UnbatchPandas(proxy, include_indexes=True)
   316  
   317    def test_unbatch_include_index_nonunique_index_raises(self):
   318      df = pd.DataFrame({'foo': [1, 2, 3, 4]})
   319      df.index = pd.MultiIndex.from_arrays([[1, 2, 3, 4], [4, 3, 2, 1]],
   320                                           names=['bar', 'bar'])
   321      proxy = df[:0]
   322  
   323      with TestPipeline() as p:
   324        pc = p | beam.Create([df[::2], df[1::2]])
   325  
   326        with self.assertRaisesRegex(ValueError, 'bar'):
   327          _ = pc | schemas.UnbatchPandas(proxy, include_indexes=True)
   328  
   329    def test_unbatch_include_index_column_conflict_raises(self):
   330      df = pd.DataFrame({'foo': [1, 2, 3, 4]})
   331      df.index = pd.Index([4, 3, 2, 1], name='foo')
   332      proxy = df[:0]
   333  
   334      with TestPipeline() as p:
   335        pc = p | beam.Create([df[::2], df[1::2]])
   336  
   337        with self.assertRaisesRegex(ValueError, 'foo'):
   338          _ = pc | schemas.UnbatchPandas(proxy, include_indexes=True)
   339  
   340    def test_unbatch_datetime(self):
   341  
   342      s = pd.Series(
   343          pd.date_range(
   344              '1/1/2000', periods=100, freq='m', tz='America/Los_Angeles'))
   345      proxy = s[:0]
   346  
   347      with TestPipeline() as p:
   348        res = (
   349            p | beam.Create([s[::2], s[1::2]])
   350            | schemas.UnbatchPandas(proxy, include_indexes=True))
   351  
   352        assert_that(res, equal_to(list(s)))
   353  
   354  
# Run the unittest runner when this file is executed directly as a script.
if __name__ == '__main__':
  unittest.main()