github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/partitionings_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  import unittest
    18  
    19  import pandas as pd
    20  
    21  from apache_beam.dataframe.partitionings import Arbitrary
    22  from apache_beam.dataframe.partitionings import Index
    23  from apache_beam.dataframe.partitionings import JoinIndex
    24  from apache_beam.dataframe.partitionings import Singleton
    25  
    26  
    27  class PartitioningsTest(unittest.TestCase):
    28    # pylint: disable=bad-option-value
    29  
    30    multi_index_df = pd.DataFrame({
    31        'shape': ['dodecahedron', 'icosahedron'] * 12,
    32        'color': ['red', 'yellow', 'blue'] * 8,
    33        'size': range(24),
    34        'value': range(24)
    35    }).set_index(['shape', 'color', 'size'])
    36  
    37    def test_index_is_subpartition(self):
    38      ordered_list = [
    39          Singleton(),
    40          Index([3]),
    41          Index([1, 3]),
    42          Index(),
    43          JoinIndex('ref'),
    44          JoinIndex(),
    45          Arbitrary()
    46      ]
    47      for loose, strict in zip(ordered_list[:-1], ordered_list[1:]):
    48        self.assertTrue(strict.is_subpartitioning_of(loose), (strict, loose))
    49        self.assertFalse(loose.is_subpartitioning_of(strict), (loose, strict))
    50      # Incomparable.
    51      self.assertFalse(Index([1, 2]).is_subpartitioning_of(Index([1, 3])))
    52      self.assertFalse(Index([1, 3]).is_subpartitioning_of(Index([1, 2])))
    53      self.assertFalse(JoinIndex('a').is_subpartitioning_of(JoinIndex('b')))
    54      self.assertFalse(JoinIndex('b').is_subpartitioning_of(JoinIndex('a')))
    55  
    56    def _check_partition(self, partitioning, min_non_empty, max_non_empty=None):
    57      num_partitions = 1000
    58      if max_non_empty is None:
    59        max_non_empty = min_non_empty
    60      parts = list(partitioning.partition_fn(self.multi_index_df, num_partitions))
    61      self.assertEqual(num_partitions, len(parts))
    62      self.assertGreaterEqual(len([p for _, p in parts if len(p)]), min_non_empty)
    63      self.assertLessEqual(len([p for _, p in parts if len(p)]), max_non_empty)
    64      self.assertEqual(
    65          sorted(self.multi_index_df.value),
    66          sorted(sum((list(p.value) for _, p in parts), [])))
    67  
    68    def test_index_partition(self):
    69      self._check_partition(Index([0]), 2)
    70      self._check_partition(Index([0, 1]), 6)
    71      self._check_partition(Index([1]), 3)
    72      self._check_partition(Index([2]), 7, 24)
    73      self._check_partition(Index([0, 2]), 7, 24)
    74      self._check_partition(Index(), 7, 24)
    75  
    76    def test_nothing_subpartition(self):
    77      for p in [Index([1]), Index([1, 2]), Index(), Singleton()]:
    78        self.assertTrue(Arbitrary().is_subpartitioning_of(p), p)
    79  
    80    def test_singleton_subpartition(self):
    81      self.assertTrue(Singleton().is_subpartitioning_of(Singleton()))
    82      for p in [Arbitrary(), Index([1]), Index([1, 2]), Index()]:
    83        self.assertFalse(Singleton().is_subpartitioning_of(p), p)
    84  
    85    def test_singleton_partition(self):
    86      parts = list(Singleton().partition_fn(pd.Series(range(10)), 1000))
    87      self.assertEqual(1, len(parts))
    88  
    89  
# Run this module's tests when the file is executed directly as a script.
if __name__ == '__main__':
  unittest.main()