github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/partitionings.py

github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/partitionings.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  import random
    18  from typing import Any
    19  from typing import Iterable
    20  from typing import Tuple
    21  from typing import TypeVar
    22  
    23  import numpy as np
    24  import pandas as pd
    25  
    26  Frame = TypeVar('Frame', bound=pd.core.generic.NDFrame)
    27  
    28  
    29  class Partitioning(object):
    30    """A class representing a (consistent) partitioning of dataframe objects.
    31    """
    32    def __repr__(self):
    33      return self.__class__.__name__
    34  
    35    def is_subpartitioning_of(self, other):
    36      # type: (Partitioning) -> bool
    37  
    38      """Returns whether self is a sub-partition of other.
    39  
    40      Specifically, returns whether something partitioned by self is necissarily
    41      also partitioned by other.
    42      """
    43      raise NotImplementedError
    44  
    45    def __lt__(self, other):
    46      return self != other and self <= other
    47  
    48    def __le__(self, other):
    49      return not self.is_subpartitioning_of(other)
    50  
    51    def partition_fn(self, df, num_partitions):
    52      # type: (Frame, int) -> Iterable[Tuple[Any, Frame]]
    53  
    54      """A callable that actually performs the partitioning of a Frame df.
    55  
    56      This will be invoked via a FlatMap in conjunction with a GroupKey to
    57      achieve the desired partitioning.
    58      """
    59      raise NotImplementedError
    60  
    61    def test_partition_fn(self, df):
    62      return self.partition_fn(df, 5)
    63  
    64  
    65  class Index(Partitioning):
    66    """A partitioning by index (either fully or partially).
    67  
    68    If the set of "levels" of the index to consider is not specified, the entire
    69    index is used.
    70  
    71    These form a partial order, given by
    72  
    73        Singleton() < Index([i]) < Index([i, j]) < ... < Index() < Arbitrary()
    74  
    75    The ordering is implemented via the is_subpartitioning_of method, where the
    76    examples on the right are subpartitionings of the examples on the left above.
    77    """
    78    def __init__(self, levels=None):
    79      self._levels = levels
    80  
    81    def __repr__(self):
    82      if self._levels:
    83        return 'Index%s' % self._levels
    84      else:
    85        return 'Index'
    86  
    87    def __eq__(self, other):
    88      return type(self) == type(other) and self._levels == other._levels
    89  
    90    def __hash__(self):
    91      if self._levels:
    92        return hash(tuple(sorted(self._levels)))
    93      else:
    94        return hash(type(self))
    95  
    96    def is_subpartitioning_of(self, other):
    97      if isinstance(other, Singleton):
    98        return True
    99      elif isinstance(other, Index):
   100        if self._levels is None:
   101          return True
   102        elif other._levels is None:
   103          return False
   104        else:
   105          return all(level in self._levels for level in other._levels)
   106      elif isinstance(other, (Arbitrary, JoinIndex)):
   107        return False
   108      else:
   109        raise ValueError(f"Encountered unknown type {other!r}")
   110  
   111    def _hash_index(self, df):
   112      if self._levels is None:
   113        levels = list(range(df.index.nlevels))
   114      else:
   115        levels = self._levels
   116      return sum(
   117          pd.util.hash_array(np.asarray(df.index.get_level_values(level)))
   118          for level in levels)
   119  
   120    def partition_fn(self, df, num_partitions):
   121      hashes = self._hash_index(df)
   122      for key in range(num_partitions):
   123        yield key, df[hashes % num_partitions == key]
   124  
   125    def check(self, dfs):
   126      # Drop empty DataFrames
   127      dfs = [df for df in dfs if len(df)]
   128  
   129      if not len(dfs):
   130        return True
   131  
   132      def apply_consistent_order(dfs):
   133        # Apply consistent order between dataframes by using sum of the index's
   134        # hash.
   135        # Apply consistent order within dataframe with sort_index()
   136        # Also drops any empty dataframes.
   137        return sorted((df.sort_index() for df in dfs if len(df)),
   138                      key=lambda df: sum(self._hash_index(df)))
   139  
   140      dfs = apply_consistent_order(dfs)
   141      repartitioned_dfs = apply_consistent_order(
   142          df for _, df in self.test_partition_fn(pd.concat(dfs)))
   143  
   144      # Assert that each index is identical
   145      for df, repartitioned_df in zip(dfs, repartitioned_dfs):
   146        if not df.index.equals(repartitioned_df.index):
   147          return False
   148  
   149      return True
   150  
   151  
   152  class Singleton(Partitioning):
   153    """A partitioning of all the data into a single partition.
   154    """
   155    def __init__(self, reason=None):
   156      self._reason = reason
   157  
   158    @property
   159    def reason(self):
   160      return self._reason
   161  
   162    def __eq__(self, other):
   163      return type(self) == type(other)
   164  
   165    def __hash__(self):
   166      return hash(type(self))
   167  
   168    def is_subpartitioning_of(self, other):
   169      return isinstance(other, Singleton)
   170  
   171    def partition_fn(self, df, num_partitions):
   172      yield None, df
   173  
   174    def check(self, dfs):
   175      return len(dfs) <= 1
   176  
   177  
   178  class JoinIndex(Partitioning):
   179    """A partitioning that lets two frames be joined.
   180    This can either be a hash partitioning on the full index, or a common
   181    ancestor with no intervening re-indexing/re-partitioning.
   182  
   183    It fits into the partial ordering as
   184  
   185        Index() < JoinIndex(x) < JoinIndex() < Arbitrary()
   186  
   187    with
   188  
   189        JoinIndex(x) and JoinIndex(y)
   190  
   191    being incomparable for nontrivial x != y.
   192  
   193    Expressions desiring to make use of this index should simply declare a
   194    requirement of JoinIndex().
   195    """
   196    def __init__(self, ancestor=None):
   197      self._ancestor = ancestor
   198  
   199    def __repr__(self):
   200      if self._ancestor:
   201        return 'JoinIndex[%s]' % self._ancestor
   202      else:
   203        return 'JoinIndex'
   204  
   205    def __eq__(self, other):
   206      if type(self) != type(other):
   207        return False
   208      elif self._ancestor is None:
   209        return other._ancestor is None
   210      elif other._ancestor is None:
   211        return False
   212      else:
   213        return self._ancestor == other._ancestor
   214  
   215    def __hash__(self):
   216      return hash((type(self), self._ancestor))
   217  
   218    def is_subpartitioning_of(self, other):
   219      if isinstance(other, Arbitrary):
   220        return False
   221      elif isinstance(other, JoinIndex):
   222        return self._ancestor is None or self == other
   223      else:
   224        return True
   225  
   226    def test_partition_fn(self, df):
   227      return Index().test_partition_fn(df)
   228  
   229    def check(self, dfs):
   230      return True
   231  
   232  
   233  class Arbitrary(Partitioning):
   234    """A partitioning imposing no constraints on the actual partitioning.
   235    """
   236    def __eq__(self, other):
   237      return type(self) == type(other)
   238  
   239    def __hash__(self):
   240      return hash(type(self))
   241  
   242    def is_subpartitioning_of(self, other):
   243      return True
   244  
   245    def test_partition_fn(self, df):
   246      num_partitions = 10
   247  
   248      def shuffled(seq):
   249        seq = list(seq)
   250        random.shuffle(seq)
   251        return seq
   252  
   253      part = pd.Series(shuffled(range(len(df))), index=df.index) % num_partitions
   254      for k in range(num_partitions):
   255        yield k, df[part == k]
   256  
   257    def check(self, dfs):
   258      return True