github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/coders/coders_property_based_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Property tests for coders in the Python SDK.
    19  
    20  The tests in this file utilize the hypothesis library to generate random test
    21  cases and run them against Beam's coder implementations.
    22  
    23  These tests are similar to fuzzing, except they test invariant properties
    24  of code.
    25  """
    26  
    27  import keyword
    28  import math
    29  import typing
    30  import unittest
    31  # TODO(pabloem): Include other categories of characters
    32  from datetime import datetime
    33  from string import ascii_letters
    34  from string import digits
    35  
    36  import numpy as np
    37  from hypothesis import strategies as st
    38  from hypothesis import assume
    39  from hypothesis import given
    40  from hypothesis import settings
    41  from pytz import utc
    42  
    43  from apache_beam.coders import FloatCoder
    44  from apache_beam.coders import RowCoder
    45  from apache_beam.coders import StrUtf8Coder
    46  from apache_beam.coders.typecoders import registry as coders_registry
    47  from apache_beam.typehints.schemas import PRIMITIVE_TO_ATOMIC_TYPE
    48  from apache_beam.typehints.schemas import typing_to_runner_api
    49  from apache_beam.utils.timestamp import Timestamp
    50  
    51  SCHEMA_TYPES_TO_STRATEGY = {
    52      str: st.text(),
    53      bytes: st.binary(),
    54      typing.ByteString: st.binary(),
    55      # Maximum datetime on year 3000 to conform to Windows OS limits.
    56      Timestamp: st.datetimes(
    57          min_value=datetime(1970, 1, 1, 1, 1),
    58          max_value=datetime(
    59              3000, 1, 1, 0,
    60              0)).map(lambda dt: Timestamp.from_utc_datetime(dt.astimezone(utc))),
    61      int: st.integers(min_value=-(1 << 63 - 1), max_value=1 << 63 - 1),
    62      # INT8/BYTE not yet supported by RowCoder.
    63      # np.int8: st.binary(min_size=1, max_size=1),
    64      # INT16 not yet supported by RowCoder.
    65      # np.int16: st.integers(min_value=-(1 << 15 - 1), max_value=1 << 15 - 1),
    66      np.int32: st.integers(min_value=-(1 << 31 - 1), max_value=1 << 31 - 1),
    67      np.int64: st.integers(min_value=-(1 << 63 - 1), max_value=1 << 63 - 1),
    68      np.uint32: st.integers(min_value=0, max_value=1 << 32 - 1),
    69      np.uint64: st.integers(min_value=0, max_value=1 << 64 - 1),
    70      np.float32: st.floats(width=32, allow_nan=False),
    71      np.float64: st.floats(width=64, allow_nan=False),
    72      float: st.floats(width=64, allow_nan=False),
    73      bool: st.booleans()
    74  }
    75  
    76  # TODO(https://github.com/apache/beam/issues/23003): Support logical types.
    77  SCHEMA_TYPES = list(SCHEMA_TYPES_TO_STRATEGY.keys())
    78  
    79  # A hypothesis strategy that generates schemas.
    80  # A schema is a list containing tuples of strings (field names), types (field
    81  # types) and boolean (nullable or not).
    82  # This strategy currently generates rows with simple types (i.e. non-list, and
    83  # non-map fields).
    84  SCHEMA_GENERATOR_STRATEGY = st.lists(
    85      st.tuples(
    86          st.text(ascii_letters + digits + '_', min_size=1),
    87          st.sampled_from(SCHEMA_TYPES),
    88          st.booleans()))
    89  
    90  TYPES_UNSUPPORTED_BY_ROW_CODER = {np.int8, np.int16}
    91  
    92  
    93  class TypesAreAllTested(unittest.TestCase):
    94    def test_all_types_are_tested(self):
    95      # Verify that all types among Beam's defined types are being tested
    96      self.assertEqual(
    97          set(SCHEMA_TYPES).intersection(PRIMITIVE_TO_ATOMIC_TYPE.keys()),
    98          set(PRIMITIVE_TO_ATOMIC_TYPE.keys()).difference(
    99              TYPES_UNSUPPORTED_BY_ROW_CODER))
   100  
   101  
   102  class ProperyTestingCoders(unittest.TestCase):
   103    @given(st.text())
   104    def test_string_coder(self, txt: str):
   105      coder = StrUtf8Coder()
   106      self.assertEqual(coder.decode(coder.encode(txt)), txt)
   107  
   108    @given(st.floats())
   109    def test_float_coder(self, num: float):
   110      coder = FloatCoder()
   111      test_num = coder.decode(coder.encode(num))
   112      if math.isnan(num):
   113        # nan != nan.
   114        self.assertTrue(math.isnan(test_num))
   115      else:
   116        self.assertEqual(coder.decode(coder.encode(num)), num)
   117  
   118    @settings(deadline=None, print_blob=True)
   119    @given(st.data())
   120    def test_row_coder(self, data: st.DataObject):
   121      """Generate rows and schemas, and test their encoding/decoding.
   122  
   123      The schemas are generated based on the SCHEMA_GENERATOR_STRATEGY.
   124      """
   125      schema = data.draw(SCHEMA_GENERATOR_STRATEGY)
   126      # Assume that the cardinality of the set of names is the same
   127      # as the length of the schema. This means there's no duplicate
   128      # names for fields.
   129      # If this condition does not hold, then we must not continue the
   130      # test.
   131      assume(len({name for name, _, _ in schema}) == len(schema))
   132      assume(all(not keyword.iskeyword(name) for name, _, _ in schema))
   133      assume(
   134          len({n[0]
   135               for n, _, _ in schema}.intersection(set(digits + '_'))) == 0)
   136      RowType = typing.NamedTuple(  # type: ignore
   137          'RandomRowType',
   138          [(name, type_ if not nullable else typing.Optional[type_]) for name,
   139           type_,
   140           nullable in schema])
   141      coders_registry.register_coder(RowType, RowCoder)
   142  
   143      # TODO(https://github.com/apache/beam/issues/23002): Apply nulls for these
   144      row = RowType(  # type: ignore
   145          **{
   146              name: data.draw(SCHEMA_TYPES_TO_STRATEGY[type_])
   147              for name,
   148              type_,
   149              nullable in schema
   150          })
   151  
   152      coder = RowCoder(typing_to_runner_api(RowType).row_type.schema)
   153      self.assertEqual(coder.decode(coder.encode(row)), row)
   154  
   155  
   156  if __name__ == "__main__":
   157    unittest.main()