#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import contextlib
import random
import threading
from typing import Any
from typing import Callable
from typing import Generic
from typing import Iterable
from typing import Optional
from typing import TypeVar

from apache_beam.dataframe import partitionings


class Session(object):
  """A session represents a mapping of expressions to concrete values.

  The bindings typically include required placeholders, but may be any
  intermediate expression as well.
  """
  def __init__(self, bindings=None):
    # Copy so that later evaluations never mutate the caller's mapping.
    self._bindings = dict(bindings or {})

  def evaluate(self, expr: 'Expression') -> Any:
    """Returns the value of `expr`, computing and caching it if unbound."""
    if expr not in self._bindings:
      self._bindings[expr] = expr.evaluate_at(self)
    return self._bindings[expr]

  def lookup(self, expr: 'Expression') -> Any:
    """Returns the value already bound to `expr`.

    Raises:
      KeyError: if `expr` has no binding in this session.
    """
    return self._bindings[expr]


class PartitioningSession(Session):
  """An extension of Session that enforces actual partitioning of inputs.

  Each expression is evaluated multiple times for various supported
  partitionings determined by its `requires_partition_by` specification. For
  each tested partitioning, the input is partitioned and the expression is
  evaluated on each partition separately, as if this were actually executed in
  a parallel manner.

  For each input partitioning, the results are verified to be partitioned
  appropriately according to the expression's `preserves_partition_by`
  specification.

  For testing only.
  """
  def evaluate(self, expr):
    """Evaluates `expr` once per supported input partitioning.

    Scalar expressions and expressions without args are evaluated directly.
    Otherwise the non-scalar args are split with each candidate partitioning's
    `test_partition_fn`, `expr` is evaluated per partition, and the per
    partition results are checked against the expected output partitioning.

    Raises:
      AssertionError: if the results for some input partitioning fail the
        expected output partitioning's `check`.
    """
    # Imported lazily, as in the original, so that importing this module does
    # not itself import pandas.
    import collections

    import pandas as pd

    def is_scalar(expr):
      return not isinstance(expr.proxy(), pd.core.generic.NDFrame)

    if expr not in self._bindings:
      if is_scalar(expr) or not expr.args():
        result = super().evaluate(expr)
      else:
        scalar_args = [arg for arg in expr.args() if is_scalar(arg)]

        def evaluate_with(input_partitioning):
          # One Session per partition key; each starts out with the (shared)
          # values of all scalar args already bound.
          parts = collections.defaultdict(
              lambda: Session({arg: self.evaluate(arg)
                               for arg in scalar_args}))
          for arg in expr.args():
            if not is_scalar(arg):
              arg_value = self.evaluate(arg)
              for key, part in input_partitioning.test_partition_fn(
                  arg_value):
                parts[key]._bindings[arg] = part
          if not parts:
            parts[None]  # Create at least one entry.

          results = []
          for session in parts.values():
            # Skip partitions in which every non-scalar input is empty.
            if any(len(session.lookup(arg)) for arg in expr.args()
                   if not is_scalar(arg)):
              results.append(session.evaluate(expr))

          expected_output_partitioning = output_partitioning(
              expr, input_partitioning)

          if not expected_output_partitioning.check(results):
            raise AssertionError(
                f"""Expression does not preserve partitioning!
                Expression: {expr}
                Requires: {expr.requires_partition_by()}
                Preserves: {expr.preserves_partition_by()}
                Input partitioning: {input_partitioning}
                Expected output partitioning: {expected_output_partitioning}
                """)

          if results:
            return pd.concat(results)
          else:
            # Choose any single session.
            return next(iter(parts.values())).evaluate(expr)

        # Store random state so it can be re-used for each execution, in case
        # the expression is part of a test that relies on the random seed.
        random_state = random.getstate()

        result = None
        # Run with all supported partitionings s.t. the smallest subpartitioning
        # is used last. This way the final result is computed with the most
        # challenging partitioning. Avoids heisenbugs where sometimes the result
        # is computed trivially with Singleton partitioning and passes.
        for input_partitioning in sorted({expr.requires_partition_by(),
                                          partitionings.Arbitrary(),
                                          partitionings.JoinIndex(),
                                          partitionings.Index(),
                                          partitionings.Singleton()}):
          if not expr.requires_partition_by().is_subpartitioning_of(
              input_partitioning):
            continue

          random.setstate(random_state)

          result = evaluate_with(input_partitioning)

        assert result is not None
      self._bindings[expr] = result
    return self._bindings[expr]


# The return type of an Expression
T = TypeVar('T')


def output_partitioning(expr, input_partitioning):
  """ Return the expected output partitioning for `expr` when its input is
  partitioned by `input_partitioning`.

  For internal use only; No backward compatibility guarantees """
  assert expr.requires_partition_by().is_subpartitioning_of(input_partitioning)

  if expr.preserves_partition_by().is_subpartitioning_of(input_partitioning):
    return min(input_partitioning, expr.preserves_partition_by())
  else:
    return partitionings.Arbitrary()


class Expression(Generic[T]):
  """An expression is an operation bound to a set of arguments.

  An expression represents a deferred tree of operations, which can be
  evaluated at a specific bindings of root expressions to values.

  requires_partition_by indicates the upper bound of a set of partitionings that
  are acceptable inputs to this expression. The expression should be able to
  produce the correct result when given input(s) partitioned by its
  requires_partition_by attribute, or by any partitioning that is _not_
  a subpartitioning of it.

  preserves_partition_by indicates the upper bound of a set of partitionings
  that can be preserved by this expression. When the input(s) to this expression
  are partitioned by preserves_partition_by, or by any partitioning that is
  _not_ a subpartitioning of it, this expression should produce output(s)
  partitioned by the same partitioning.

  However, if the partitioning of an expression's input is a subpartitioning of
  the partitioning that it preserves, the output is presumed to have no
  particular partitioning (i.e. Arbitrary()).

  For example, let's look at an "element-wise operation", that has no
  partitioning requirement, and preserves any partitioning given to it::

    requires_partition_by = Arbitrary() -----------------------------+
                                                                     |
             +-----------+-------------+---------- ... ----+---------|
             |           |             |                   |         |
        Singleton() < Index([i]) < Index([i, j]) < ... < Index() < Arbitrary()
             |           |             |                   |         |
             +-----------+-------------+---------- ... ----+---------|
                                                                     |
    preserves_partition_by = Arbitrary() ----------------------------+

  As a more interesting example, consider this expression, which requires Index
  partitioning, and preserves just Singleton partitioning::

    requires_partition_by = Index() -----------------------+
                                                           |
             +-----------+-------------+---------- ... ----|
             |           |             |                   |
        Singleton() < Index([i]) < Index([i, j]) < ... < Index() < Arbitrary()
             |
             |
    preserves_partition_by = Singleton()

  Note that any non-Arbitrary partitioning is an acceptable input for this
  expression. However, unless the inputs are Singleton-partitioned, the
  expression makes no guarantees about the partitioning of the output.
  """
  def __init__(self, name: str, proxy: T, _id: Optional[str] = None):
    self._name = name
    self._proxy = proxy
    # Store for preservation through pickling.
    self._id = _id or '%s_%s_%s' % (name, type(proxy).__name__, id(self))

  def proxy(self) -> T:
    """Returns the proxy object this expression was constructed with."""
    return self._proxy

  def __hash__(self):
    return hash(self._id)

  def __eq__(self, other):
    # Expressions are used as dict keys, so equality may be probed against
    # arbitrary objects; defer to Python's default handling for those instead
    # of failing on a missing `_id` attribute.
    if not isinstance(other, Expression):
      return NotImplemented
    return self._id == other._id

  def __repr__(self):
    return '%s[%s]' % (self.__class__.__name__, self._id)

  def placeholders(self):
    """Returns all the placeholders that self depends on."""
    raise NotImplementedError(type(self))

  def evaluate_at(self, session: Session) -> T:
    """Returns the result of self with the bindings given in session."""
    raise NotImplementedError(type(self))

  def requires_partition_by(self) -> partitionings.Partitioning:
    """Returns the partitioning, if any, required to evaluate this expression.

    Returns partitionings.Arbitrary() if no partitioning is required.
    """
    raise NotImplementedError(type(self))

  def preserves_partition_by(self) -> partitionings.Partitioning:
    """Returns the partitioning, if any, preserved by this expression.

    This gives an upper bound on the partitioning of its output. The actual
    partitioning of the output may be less strict (e.g. if the input was
    less partitioned).
    """
    raise NotImplementedError(type(self))


class PlaceholderExpression(Expression):
  """An expression whose value must be explicitly bound in the session."""
  def __init__(self, proxy: T, reference: Any = None):
    """Initialize a placeholder expression.

    Args:
      proxy: A proxy object with the type expected to be bound to this
        expression. Used for type checking at pipeline construction time.
      reference: (Optional) an arbitrary object kept on the expression as
        `_reference`. It is only stored here; nothing in this module reads it.
    """
    super().__init__('placeholder', proxy)
    self._reference = reference

  def placeholders(self):
    return frozenset([self])

  def args(self):
    return ()

  def evaluate_at(self, session):
    # A placeholder is never computed; it must already be bound.
    return session.lookup(self)

  def requires_partition_by(self):
    return partitionings.Arbitrary()

  def preserves_partition_by(self):
    return partitionings.Index()


class ConstantExpression(Expression):
  """An expression whose value is known at pipeline construction time."""
  def __init__(self, value: T, proxy: Optional[T] = None):
    """Initialize a constant expression.

    Args:
      value: The constant value to be produced by this expression.
      proxy: (Optional) a proxy object with same type as `value` to use for
        rapid type checking at pipeline construction time. If not provided,
        `value` will be used directly.
    """
    if proxy is None:
      proxy = value
    super().__init__('constant', proxy)
    self._value = value

  def placeholders(self):
    return frozenset()

  def args(self):
    return ()

  def evaluate_at(self, session):
    return self._value

  def requires_partition_by(self):
    return partitionings.Arbitrary()

  def preserves_partition_by(self):
    return partitionings.Arbitrary()


class ComputedExpression(Expression):
  """An expression whose value must be computed at pipeline execution time."""
  def __init__(
      self,
      name: str,
      func: Callable[..., T],
      args: Iterable[Expression],
      proxy: Optional[T] = None,
      _id: Optional[str] = None,
      requires_partition_by: partitionings.Partitioning = (
          partitionings.Index()),
      preserves_partition_by: partitionings.Partitioning = (
          partitionings.Singleton()),
  ):
    """Initialize a computed expression.

    Args:
      name: The name of this expression.
      func: The function that will be used to compute the value of this
        expression. Should accept arguments of the types returned when
        evaluating the `args` expressions.
      args: The list of expressions that will be used to produce inputs to
        `func`.
      proxy: (Optional) a proxy object with same type as the objects that this
        ComputedExpression will produce at execution time. If not provided, a
        proxy will be generated using `func` and the proxies of `args`.
      _id: (Optional) a string to uniquely identify this expression.
      requires_partition_by: The required (common) partitioning of the args.
      preserves_partition_by: The level of partitioning preserved.

    Raises:
      NonParallelOperation: if `requires_partition_by` is a Singleton
        partitioning and non-parallel operations have not been allowed (see
        `allow_non_parallel_operations`).
    """
    if (not _get_allow_non_parallel() and
        isinstance(requires_partition_by, partitionings.Singleton)):
      reason = requires_partition_by.reason or (
          f"Encountered non-parallelizable form of {name!r}.")

      raise NonParallelOperation(
          f"{reason}\n"
          "Consider using an allow_non_parallel_operations block if you're "
          "sure you want to do this. See "
          "https://s.apache.org/dataframe-non-parallel-operations for more "
          "information.")
    # Materialize once: `args` may be a one-shot iterable and is used both to
    # build the proxy below and later at evaluation time.
    args = tuple(args)
    if proxy is None:
      proxy = func(*(arg.proxy() for arg in args))
    super().__init__(name, proxy, _id)
    self._func = func
    self._args = args
    self._requires_partition_by = requires_partition_by
    self._preserves_partition_by = preserves_partition_by

  def placeholders(self):
    return frozenset.union(
        frozenset(), *[arg.placeholders() for arg in self.args()])

  def args(self):
    return self._args

  def evaluate_at(self, session):
    return self._func(*(session.evaluate(arg) for arg in self._args))

  def requires_partition_by(self):
    return self._requires_partition_by

  def preserves_partition_by(self):
    return self._preserves_partition_by


def elementwise_expression(name, func, args):
  """Returns a ComputedExpression that requires and preserves Arbitrary()."""
  return ComputedExpression(
      name,
      func,
      args,
      requires_partition_by=partitionings.Arbitrary(),
      preserves_partition_by=partitionings.Arbitrary())


_ALLOW_NON_PARALLEL = threading.local()
_ALLOW_NON_PARALLEL.value = False


def _get_allow_non_parallel():
  """Returns the current thread's allow-non-parallel flag.

  The module-level assignment above only initializes the attribute for the
  thread that imports this module; every other thread starts without it, so
  fall back to False rather than raising AttributeError.
  """
  return getattr(_ALLOW_NON_PARALLEL, 'value', False)


@contextlib.contextmanager
def allow_non_parallel_operations(allow=True):
  """Context manager that sets the allow-non-parallel flag for this thread.

  Args:
    allow: The value to give the flag inside the block. If None, the flag is
      left untouched.

  The previous value is restored on exit, including when the block raises.
  """
  if allow is None:
    yield
  else:
    old_value = _get_allow_non_parallel()
    _ALLOW_NON_PARALLEL.value = allow
    try:
      yield
    finally:
      _ALLOW_NON_PARALLEL.value = old_value


class NonParallelOperation(Exception):
  """Raised when a ComputedExpression requires Singleton partitioning while
  non-parallel operations are not allowed."""
  def __init__(self, msg):
    # Pass only the message: including `self` in args garbles str(exc).
    super().__init__(msg)
    self.msg = msg