github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/partitionings.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random
from typing import Any
from typing import Iterable
from typing import Tuple
from typing import TypeVar

import numpy as np
import pandas as pd

Frame = TypeVar('Frame', bound=pd.core.generic.NDFrame)


class Partitioning(object):
  """A class representing a (consistent) partitioning of dataframe objects.
  """
  def __repr__(self):
    return self.__class__.__name__

  def is_subpartitioning_of(self, other):
    # type: (Partitioning) -> bool

    """Returns whether self is a sub-partition of other.

    Specifically, returns whether something partitioned by self is necessarily
    also partitioned by other.
    """
    raise NotImplementedError

  def __lt__(self, other):
    return self != other and self <= other

  def __le__(self, other):
    return not self.is_subpartitioning_of(other)

  def partition_fn(self, df, num_partitions):
    # type: (Frame, int) -> Iterable[Tuple[Any, Frame]]

    """A callable that actually performs the partitioning of a Frame df.

    This will be invoked via a FlatMap in conjunction with a GroupByKey to
    achieve the desired partitioning.
    """
    raise NotImplementedError

  def test_partition_fn(self, df):
    return self.partition_fn(df, 5)


class Index(Partitioning):
  """A partitioning by index (either fully or partially).

  If the set of "levels" of the index to consider is not specified, the entire
  index is used.

  These form a partial order, given by

      Singleton() < Index([i]) < Index([i, j]) < ... < Index() < Arbitrary()

  The ordering is implemented via the is_subpartitioning_of method, where each
  example on the right is a subpartitioning of the examples to its left.
  """
  def __init__(self, levels=None):
    self._levels = levels

  def __repr__(self):
    if self._levels:
      return 'Index%s' % self._levels
    else:
      return 'Index'

  def __eq__(self, other):
    return type(self) == type(other) and self._levels == other._levels

  def __hash__(self):
    if self._levels:
      return hash(tuple(sorted(self._levels)))
    else:
      return hash(type(self))

  def is_subpartitioning_of(self, other):
    if isinstance(other, Singleton):
      return True
    elif isinstance(other, Index):
      if self._levels is None:
        return True
      elif other._levels is None:
        return False
      else:
        return all(level in self._levels for level in other._levels)
    elif isinstance(other, (Arbitrary, JoinIndex)):
      return False
    else:
      raise ValueError(f"Encountered unknown type {other!r}")

  def _hash_index(self, df):
    if self._levels is None:
      levels = list(range(df.index.nlevels))
    else:
      levels = self._levels
    return sum(
        pd.util.hash_array(np.asarray(df.index.get_level_values(level)))
        for level in levels)

  def partition_fn(self, df, num_partitions):
    hashes = self._hash_index(df)
    for key in range(num_partitions):
      yield key, df[hashes % num_partitions == key]

  def check(self, dfs):
    # Drop empty DataFrames
    dfs = [df for df in dfs if len(df)]

    if not len(dfs):
      return True

    def apply_consistent_order(dfs):
      # Apply a consistent order between dataframes by using the sum of each
      # index's hash, and a consistent order within each dataframe with
      # sort_index(). Also drops any empty dataframes.
      return sorted((df.sort_index() for df in dfs if len(df)),
                    key=lambda df: sum(self._hash_index(df)))

    dfs = apply_consistent_order(dfs)
    repartitioned_dfs = apply_consistent_order(
        df for _, df in self.test_partition_fn(pd.concat(dfs)))

    # Assert that each index is identical
    for df, repartitioned_df in zip(dfs, repartitioned_dfs):
      if not df.index.equals(repartitioned_df.index):
        return False

    return True
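

# Illustrative sketch (not part of the Beam source): demonstrates how
# Index.partition_fn buckets rows by the hash of their index values, so all
# rows sharing an index value land in the same partition. The frame, its
# labels, and the partition count are arbitrary examples.
def _example_index_partition():
  df = pd.DataFrame({'value': range(6)}, index=['a', 'a', 'b', 'b', 'c', 'c'])
  parts = dict(Index().partition_fn(df, num_partitions=3))
  # Each label hashes to exactly one key, so no label is ever split across
  # two partitions (though distinct labels may still share a partition).
  labels_per_part = [set(part.index) for part in parts.values()]
  assert all(
      not (s & t)
      for i, s in enumerate(labels_per_part) for t in labels_per_part[i + 1:])
  return parts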
77 """ 78 def __init__(self, levels=None): 79 self._levels = levels 80 81 def __repr__(self): 82 if self._levels: 83 return 'Index%s' % self._levels 84 else: 85 return 'Index' 86 87 def __eq__(self, other): 88 return type(self) == type(other) and self._levels == other._levels 89 90 def __hash__(self): 91 if self._levels: 92 return hash(tuple(sorted(self._levels))) 93 else: 94 return hash(type(self)) 95 96 def is_subpartitioning_of(self, other): 97 if isinstance(other, Singleton): 98 return True 99 elif isinstance(other, Index): 100 if self._levels is None: 101 return True 102 elif other._levels is None: 103 return False 104 else: 105 return all(level in self._levels for level in other._levels) 106 elif isinstance(other, (Arbitrary, JoinIndex)): 107 return False 108 else: 109 raise ValueError(f"Encountered unknown type {other!r}") 110 111 def _hash_index(self, df): 112 if self._levels is None: 113 levels = list(range(df.index.nlevels)) 114 else: 115 levels = self._levels 116 return sum( 117 pd.util.hash_array(np.asarray(df.index.get_level_values(level))) 118 for level in levels) 119 120 def partition_fn(self, df, num_partitions): 121 hashes = self._hash_index(df) 122 for key in range(num_partitions): 123 yield key, df[hashes % num_partitions == key] 124 125 def check(self, dfs): 126 # Drop empty DataFrames 127 dfs = [df for df in dfs if len(df)] 128 129 if not len(dfs): 130 return True 131 132 def apply_consistent_order(dfs): 133 # Apply consistent order between dataframes by using sum of the index's 134 # hash. 135 # Apply consistent order within dataframe with sort_index() 136 # Also drops any empty dataframes. 137 return sorted((df.sort_index() for df in dfs if len(df)), 138 key=lambda df: sum(self._hash_index(df))) 139 140 dfs = apply_consistent_order(dfs) 141 repartitioned_dfs = apply_consistent_order( 142 df for _, df in self.test_partition_fn(pd.concat(dfs))) 143 144 # Assert that each index is identical 145 for df, repartitioned_df in zip(dfs, repartitioned_dfs): 146 if not df.index.equals(repartitioned_df.index): 147 return False 148 149 return True 150 151 152 class Singleton(Partitioning): 153 """A partitioning of all the data into a single partition. 154 """ 155 def __init__(self, reason=None): 156 self._reason = reason 157 158 @property 159 def reason(self): 160 return self._reason 161 162 def __eq__(self, other): 163 return type(self) == type(other) 164 165 def __hash__(self): 166 return hash(type(self)) 167 168 def is_subpartitioning_of(self, other): 169 return isinstance(other, Singleton) 170 171 def partition_fn(self, df, num_partitions): 172 yield None, df 173 174 def check(self, dfs): 175 return len(dfs) <= 1 176 177 178 class JoinIndex(Partitioning): 179 """A partitioning that lets two frames be joined. 180 This can either be a hash partitioning on the full index, or a common 181 ancestor with no intervening re-indexing/re-partitioning. 182 183 It fits into the partial ordering as 184 185 Index() < JoinIndex(x) < JoinIndex() < Arbitrary() 186 187 with 188 189 JoinIndex(x) and JoinIndex(y) 190 191 being incomparable for nontrivial x != y. 192 193 Expressions desiring to make use of this index should simply declare a 194 requirement of JoinIndex(). 
195 """ 196 def __init__(self, ancestor=None): 197 self._ancestor = ancestor 198 199 def __repr__(self): 200 if self._ancestor: 201 return 'JoinIndex[%s]' % self._ancestor 202 else: 203 return 'JoinIndex' 204 205 def __eq__(self, other): 206 if type(self) != type(other): 207 return False 208 elif self._ancestor is None: 209 return other._ancestor is None 210 elif other._ancestor is None: 211 return False 212 else: 213 return self._ancestor == other._ancestor 214 215 def __hash__(self): 216 return hash((type(self), self._ancestor)) 217 218 def is_subpartitioning_of(self, other): 219 if isinstance(other, Arbitrary): 220 return False 221 elif isinstance(other, JoinIndex): 222 return self._ancestor is None or self == other 223 else: 224 return True 225 226 def test_partition_fn(self, df): 227 return Index().test_partition_fn(df) 228 229 def check(self, dfs): 230 return True 231 232 233 class Arbitrary(Partitioning): 234 """A partitioning imposing no constraints on the actual partitioning. 235 """ 236 def __eq__(self, other): 237 return type(self) == type(other) 238 239 def __hash__(self): 240 return hash(type(self)) 241 242 def is_subpartitioning_of(self, other): 243 return True 244 245 def test_partition_fn(self, df): 246 num_partitions = 10 247 248 def shuffled(seq): 249 seq = list(seq) 250 random.shuffle(seq) 251 return seq 252 253 part = pd.Series(shuffled(range(len(df))), index=df.index) % num_partitions 254 for k in range(num_partitions): 255 yield k, df[part == k] 256 257 def check(self, dfs): 258 return True