github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/expressions.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import contextlib
import random
import threading
from typing import Any
from typing import Callable
from typing import Generic
from typing import Iterable
from typing import Optional
from typing import TypeVar

from apache_beam.dataframe import partitionings


class Session(object):
  """A session represents a mapping of expressions to concrete values.

  The bindings typically include required placeholders, but may be any
  intermediate expression as well.
  """
  def __init__(self, bindings=None):
    self._bindings = dict(bindings or {})

  def evaluate(self, expr):  # type: (Expression) -> Any
    if expr not in self._bindings:
      self._bindings[expr] = expr.evaluate_at(self)
    return self._bindings[expr]

  def lookup(self, expr):  # type: (Expression) -> Any
    return self._bindings[expr]
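
# A minimal sketch of how a Session is typically used (editor's illustration,
# not part of this module): bind a concrete value to a placeholder and
# evaluate a derived expression against that binding. The names `x` and
# `doubled` are hypothetical; PlaceholderExpression and ComputedExpression are
# defined later in this file.
#
#   import pandas as pd
#   x = PlaceholderExpression(pd.Series([0], dtype=int))
#   doubled = ComputedExpression('doubled', lambda s: s * 2, [x])
#   Session({x: pd.Series([1, 2, 3])}).evaluate(doubled)  # -> 2, 4, 6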


class PartitioningSession(Session):
  """An extension of Session that enforces actual partitioning of inputs.

  Each expression is evaluated multiple times for various supported
  partitionings determined by its `requires_partition_by` specification. For
  each tested partitioning, the input is partitioned and the expression is
  evaluated on each partition separately, as if this were actually executed in
  a parallel manner.

  For each input partitioning, the results are verified to be partitioned
  appropriately according to the expression's `preserves_partition_by`
  specification.

  For testing only.
  """
  def evaluate(self, expr):
    import pandas as pd
    import collections

    def is_scalar(expr):
      return not isinstance(expr.proxy(), pd.core.generic.NDFrame)

    if expr not in self._bindings:
      if is_scalar(expr) or not expr.args():
        result = super().evaluate(expr)
      else:
        scalar_args = [arg for arg in expr.args() if is_scalar(arg)]

        def evaluate_with(input_partitioning):
          parts = collections.defaultdict(
              lambda: Session({arg: self.evaluate(arg)
                               for arg in scalar_args}))
          for arg in expr.args():
            if not is_scalar(arg):
              input = self.evaluate(arg)
              for key, part in input_partitioning.test_partition_fn(input):
                parts[key]._bindings[arg] = part
          if not parts:
            parts[None]  # Create at least one entry.

          results = []
          for session in parts.values():
            if any(len(session.lookup(arg)) for arg in expr.args()
                   if not is_scalar(arg)):
              results.append(session.evaluate(expr))

          expected_output_partitioning = output_partitioning(
              expr, input_partitioning)

          if not expected_output_partitioning.check(results):
            raise AssertionError(
                f"""Expression does not preserve partitioning!
                Expression: {expr}
                Requires: {expr.requires_partition_by()}
                Preserves: {expr.preserves_partition_by()}
                Input partitioning: {input_partitioning}
                Expected output partitioning: {expected_output_partitioning}
                """)

          if results:
            return pd.concat(results)
          else:
            # Choose any single session.
            return next(iter(parts.values())).evaluate(expr)

        # Store random state so it can be re-used for each execution, in case
        # the expression is part of a test that relies on the random seed.
        random_state = random.getstate()

        result = None
        # Run with all supported partitionings s.t. the smallest subpartitioning
        # is used last. This way the final result is computed with the most
        # challenging partitioning. Avoids heisenbugs where sometimes the result
        # is computed trivially with Singleton partitioning and passes.
        for input_partitioning in sorted(set([expr.requires_partition_by(),
                                              partitionings.Arbitrary(),
                                              partitionings.JoinIndex(),
                                              partitionings.Index(),
                                              partitionings.Singleton()])):
          if not expr.requires_partition_by().is_subpartitioning_of(
              input_partitioning):
            continue

          random.setstate(random_state)

          result = evaluate_with(input_partitioning)

        assert result is not None
        self._bindings[expr] = result
    return self._bindings[expr]
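
# A sketch of PartitioningSession in a test (editor's illustration, not part
# of this module): the expression is re-evaluated under several input
# partitionings and the per-partition results are concatenated, so the values
# match a single-machine evaluation although row order may differ. The names
# `df`, `x` and `squared` are hypothetical.
#
#   import pandas as pd
#   df = pd.DataFrame({'a': [1, 2, 3]})
#   x = PlaceholderExpression(df[:0])
#   squared = ComputedExpression(
#       'squared', lambda d: d * d, [x],
#       requires_partition_by=partitionings.Arbitrary(),
#       preserves_partition_by=partitionings.Arbitrary())
#   PartitioningSession({x: df}).evaluate(squared)  # values of df * df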


# The return type of an Expression
T = TypeVar('T')


def output_partitioning(expr, input_partitioning):
  """Return the expected output partitioning for `expr` when its input is
  partitioned by `input_partitioning`.

  For internal use only; no backward compatibility guarantees."""
  assert expr.requires_partition_by().is_subpartitioning_of(input_partitioning)

  if expr.preserves_partition_by().is_subpartitioning_of(input_partitioning):
    return min(input_partitioning, expr.preserves_partition_by())
  else:
    return partitionings.Arbitrary()
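
# Worked examples of the rule above (editor's notes, following the Expression
# docstring below). An expression that requires and preserves Arbitrary()
# keeps whatever partitioning its input has, while one that preserves only
# Singleton() offers no guarantee for Index-partitioned input. Here
# `elementwise_expr` and `singleton_preserving_expr` are hypothetical
# expressions with those requires/preserves attributes.
#
#   output_partitioning(elementwise_expr, partitionings.Index())
#   # -> Index()
#   output_partitioning(singleton_preserving_expr, partitionings.Index())
#   # -> Arbitrary()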


class Expression(Generic[T]):
  """An expression is an operation bound to a set of arguments.

  An expression represents a deferred tree of operations, which can be
  evaluated given specific bindings of root expressions to values.

  requires_partition_by indicates the upper bound of a set of partitionings that
  are acceptable inputs to this expression. The expression should be able to
  produce the correct result when given input(s) partitioned by its
  requires_partition_by attribute, or by any partitioning that is _not_
  a subpartitioning of it.

  preserves_partition_by indicates the upper bound of a set of partitionings
  that can be preserved by this expression. When the input(s) to this expression
  are partitioned by preserves_partition_by, or by any partitioning that is
  _not_ a subpartitioning of it, this expression should produce output(s)
  partitioned by the same partitioning.

  However, if the partitioning of an expression's input is a subpartitioning of
  the partitioning that it preserves, the output is presumed to have no
  particular partitioning (i.e. Arbitrary()).

  For example, let's look at an "element-wise operation", which has no
  partitioning requirement and preserves any partitioning given to it::

    requires_partition_by = Arbitrary() -----------------------------+
                                                                     |
             +-----------+-------------+---------- ... ----+---------|
             |           |             |                   |         |
        Singleton() < Index([i]) < Index([i, j]) < ... < Index() < Arbitrary()
             |           |             |                   |         |
             +-----------+-------------+---------- ... ----+---------|
                                                                     |
    preserves_partition_by = Arbitrary() ----------------------------+

  As a more interesting example, consider this expression, which requires Index
  partitioning and preserves just Singleton partitioning::

    requires_partition_by = Index() -----------------------+
                                                           |
             +-----------+-------------+---------- ... ----|
             |           |             |                   |
        Singleton() < Index([i]) < Index([i, j]) < ... < Index() < Arbitrary()
             |
             |
    preserves_partition_by = Singleton()

  Note that any non-Arbitrary partitioning is an acceptable input for this
  expression. However, unless the inputs are Singleton-partitioned, the
  expression makes no guarantees about the partitioning of the output.
  """
  def __init__(self, name: str, proxy: T, _id: Optional[str] = None):
    self._name = name
    self._proxy = proxy
    # Store for preservation through pickling.
    self._id = _id or '%s_%s_%s' % (name, type(proxy).__name__, id(self))

  def proxy(self) -> T:
    return self._proxy

  def __hash__(self):
    return hash(self._id)

  def __eq__(self, other):
    return self._id == other._id

  def __repr__(self):
    return '%s[%s]' % (self.__class__.__name__, self._id)

  def placeholders(self):
    """Returns all the placeholders that self depends on."""
    raise NotImplementedError(type(self))

  def evaluate_at(self, session: Session) -> T:
    """Returns the result of self with the bindings given in session."""
    raise NotImplementedError(type(self))

  def requires_partition_by(self) -> partitionings.Partitioning:
    """Returns the partitioning, if any, required to evaluate this expression.

    Returns partitionings.Arbitrary() if no particular partitioning is
    required.
    """
    raise NotImplementedError(type(self))

  def preserves_partition_by(self) -> partitionings.Partitioning:
    """Returns the partitioning, if any, preserved by this expression.

    This gives an upper bound on the partitioning of its output.  The actual
    partitioning of the output may be less strict (e.g. if the input was
    less partitioned).
    """
    raise NotImplementedError(type(self))


class PlaceholderExpression(Expression):
  """An expression whose value must be explicitly bound in the session."""
  def __init__(
      self,  # type: PlaceholderExpression
      proxy,  # type: T
      reference=None,  # type: Any
  ):
    """Initialize a placeholder expression.

    Args:
      proxy: A proxy object with the type expected to be bound to this
        expression. Used for type checking at pipeline construction time.
    """
    super().__init__('placeholder', proxy)
    self._reference = reference

  def placeholders(self):
    return frozenset([self])

  def args(self):
    return ()

  def evaluate_at(self, session):
    return session.lookup(self)

  def requires_partition_by(self):
    return partitionings.Arbitrary()

  def preserves_partition_by(self):
    return partitionings.Index()
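
# Editor's note: a placeholder is the leaf that pipeline inputs are bound to;
# evaluate_at simply looks the placeholder up in the session, so evaluating an
# unbound placeholder raises KeyError. A minimal sketch (hypothetical name `p`):
#
#   import pandas as pd
#   p = PlaceholderExpression(pd.Series([0], dtype=int))
#   Session({p: pd.Series([10, 20])}).evaluate(p)  # -> the bound Series
#   Session().evaluate(p)                          # raises KeyError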


class ConstantExpression(Expression):
  """An expression whose value is known at pipeline construction time."""
  def __init__(
      self,  # type: ConstantExpression
      value,  # type: T
      proxy=None  # type: Optional[T]
  ):
    """Initialize a constant expression.

    Args:
      value: The constant value to be produced by this expression.
      proxy: (Optional) a proxy object with the same type as `value` to use for
        rapid type checking at pipeline construction time. If not provided,
        `value` will be used directly.
    """
    if proxy is None:
      proxy = value
    super().__init__('constant', proxy)
    self._value = value

  def placeholders(self):
    return frozenset()

  def args(self):
    return ()

  def evaluate_at(self, session):
    return self._value

  def requires_partition_by(self):
    return partitionings.Arbitrary()

  def preserves_partition_by(self):
    return partitionings.Arbitrary()
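
# Editor's note: a constant carries its value from construction time, so
# evaluation ignores the session entirely. A minimal sketch:
#
#   c = ConstantExpression(42)
#   Session().evaluate(c)  # -> 42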


class ComputedExpression(Expression):
  """An expression whose value must be computed at pipeline execution time."""
  def __init__(
      self,  # type: ComputedExpression
      name,  # type: str
      func,  # type: Callable[...,T]
      args,  # type: Iterable[Expression]
      proxy=None,  # type: Optional[T]
      _id=None,  # type: Optional[str]
      requires_partition_by=partitionings.Index(),  # type: partitionings.Partitioning
      preserves_partition_by=partitionings.Singleton(),  # type: partitionings.Partitioning
  ):
    """Initialize a computed expression.

    Args:
      name: The name of this expression.
      func: The function that will be used to compute the value of this
        expression. Should accept arguments of the types returned when
        evaluating the `args` expressions.
      args: The list of expressions that will be used to produce inputs to
        `func`.
      proxy: (Optional) a proxy object with the same type as the objects that
        this ComputedExpression will produce at execution time. If not
        provided, a proxy will be generated using `func` and the proxies of
        `args`.
      _id: (Optional) a string to uniquely identify this expression.
      requires_partition_by: The required (common) partitioning of the args.
      preserves_partition_by: The level of partitioning preserved.
    """
    if (not _get_allow_non_parallel() and
        isinstance(requires_partition_by, partitionings.Singleton)):
      reason = requires_partition_by.reason or (
          f"Encountered non-parallelizable form of {name!r}.")

      raise NonParallelOperation(
          f"{reason}\n"
          "Consider using an allow_non_parallel_operations block if you're "
          "sure you want to do this. See "
          "https://s.apache.org/dataframe-non-parallel-operations for more "
          "information.")
    args = tuple(args)
    if proxy is None:
      proxy = func(*(arg.proxy() for arg in args))
    super().__init__(name, proxy, _id)
    self._func = func
    self._args = args
    self._requires_partition_by = requires_partition_by
    self._preserves_partition_by = preserves_partition_by

  def placeholders(self):
    return frozenset.union(
        frozenset(), *[arg.placeholders() for arg in self.args()])

  def args(self):
    return self._args

  def evaluate_at(self, session):
    return self._func(*(session.evaluate(arg) for arg in self._args))

  def requires_partition_by(self):
    return self._requires_partition_by

  def preserves_partition_by(self):
    return self._preserves_partition_by
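
# Editor's note: constructing a ComputedExpression that requires Singleton
# partitioning raises NonParallelOperation unless it happens inside an
# allow_non_parallel_operations block (defined below). A sketch with
# hypothetical names:
#
#   import pandas as pd
#   x = PlaceholderExpression(pd.Series([0], dtype=int))
#   with allow_non_parallel_operations():
#     total = ComputedExpression(
#         'sum', lambda s: s.sum(), [x],
#         requires_partition_by=partitionings.Singleton(),
#         preserves_partition_by=partitionings.Singleton())
#   Session({x: pd.Series([1, 2, 3])}).evaluate(total)  # -> 6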


def elementwise_expression(name, func, args):
  return ComputedExpression(
      name,
      func,
      args,
      requires_partition_by=partitionings.Arbitrary(),
      preserves_partition_by=partitionings.Arbitrary())
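
# Editor's note: elementwise_expression is shorthand for a ComputedExpression
# that neither requires nor disturbs any particular partitioning, e.g.
#
#   summed = elementwise_expression('add', lambda a, b: a + b, [x, y])
#
# where `x` and `y` are hypothetical expressions producing alignable values.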


_ALLOW_NON_PARALLEL = threading.local()
_ALLOW_NON_PARALLEL.value = False


def _get_allow_non_parallel():
  return _ALLOW_NON_PARALLEL.value


@contextlib.contextmanager
def allow_non_parallel_operations(allow=True):
  if allow is None:
    yield
  else:
    old_value, _ALLOW_NON_PARALLEL.value = _ALLOW_NON_PARALLEL.value, allow
    yield
    _ALLOW_NON_PARALLEL.value = old_value
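
# Editor's note: the flag above is thread-local and restored on exit, so
# nested blocks behave as expected; passing allow=None leaves the current
# setting untouched, letting callers forward an "unspecified" value without
# opening a new scope. Typical use:
#
#   with allow_non_parallel_operations():
#     ...  # Singleton-requiring ComputedExpressions may be built here.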


class NonParallelOperation(Exception):
  def __init__(self, msg):
    super().__init__(msg)
    self.msg = msg