github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/frames.py

github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/frames.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  """Analogs for :class:`pandas.DataFrame` and :class:`pandas.Series`:
    18  :class:`DeferredDataFrame` and :class:`DeferredSeries`.
    19  
    20  These classes are effectively wrappers around a `schema-aware`_
    21  :class:`~apache_beam.pvalue.PCollection` that provide a set of operations
    22  compatible with the `pandas`_ API.
    23  
    24  Note that we aim for the Beam DataFrame API to be completely compatible with
    25  the pandas API, but there are some features that are currently unimplemented
    26  for various reasons. Pay particular attention to the **'Differences from
    27  pandas'** section for each operation to understand where we diverge.
    28  
    29  .. _schema-aware:
    30    https://beam.apache.org/documentation/programming-guide/#what-is-a-schema
    31  .. _pandas:
    32    https://pandas.pydata.org/
    33  """
    34  
    35  import collections
    36  import inspect
    37  import itertools
    38  import math
    39  import re
    40  import warnings
    41  from typing import List
    42  from typing import Optional
    43  
    44  import numpy as np
    45  import pandas as pd
    46  from pandas._libs import lib
    47  from pandas.api.types import is_float_dtype
    48  from pandas.api.types import is_int64_dtype
    49  from pandas.api.types import is_list_like
    50  from pandas.core.groupby.generic import DataFrameGroupBy
    51  
    52  from apache_beam.dataframe import convert
    53  from apache_beam.dataframe import expressions
    54  from apache_beam.dataframe import frame_base
    55  from apache_beam.dataframe import io
    56  from apache_beam.dataframe import partitionings
    57  from apache_beam.transforms import PTransform
    58  
    59  __all__ = [
    60      'DeferredSeries',
    61      'DeferredDataFrame',
    62  ]
    63  
    64  # Get major, minor version
    65  PD_VERSION = tuple(map(int, pd.__version__.split('.')[0:2]))
    66  
    67  
    68  def populate_not_implemented(pd_type):
    69    def wrapper(deferred_type):
    70      for attr in dir(pd_type):
    71        # Don't auto-define hidden methods or dunders
    72        if attr.startswith('_'):
    73          continue
    74        if not hasattr(deferred_type, attr):
    75          pd_value = getattr(pd_type, attr)
    76          if isinstance(pd_value, property) or inspect.isclass(pd_value):
    77            # Some of the properties on pandas types (cat, dt, sparse), are
    78            # actually attributes with class values, not properties
    79            setattr(
    80                deferred_type,
    81                attr,
    82                property(
    83                    frame_base.not_implemented_method(attr, base_type=pd_type)))
    84          elif callable(pd_value):
    85            setattr(
    86                deferred_type,
    87                attr,
    88                frame_base.not_implemented_method(attr, base_type=pd_type))
    89      return deferred_type
    90  
    91    return wrapper
    92  
    93  
    94  def _fillna_alias(method):
    95    def wrapper(self, *args, **kwargs):
    96      return self.fillna(*args, method=method, **kwargs)
    97  
    98    wrapper.__name__ = method
    99    wrapper.__doc__ = (
   100        f'{method} is only supported for axis="columns". '
   101        'axis="index" is order-sensitive.')
   102  
   103    return frame_base.with_docs_from(pd.DataFrame)(
   104        frame_base.args_to_kwargs(pd.DataFrame)(
   105            frame_base.populate_defaults(pd.DataFrame)(wrapper)))
   106  
   107  
   108  # These aggregations are commutative and associative, they can be trivially
   109  # "lifted" (i.e. we can pre-aggregate on partitions, group, then post-aggregate)
   110  LIFTABLE_AGGREGATIONS = ['all', 'any', 'max', 'min', 'prod', 'sum']
   111  # These aggregations can be lifted if post-aggregated with "sum"
   112  LIFTABLE_WITH_SUM_AGGREGATIONS = ['size', 'count']
   113  UNLIFTABLE_AGGREGATIONS = [
   114      'mean',
   115      'median',
   116      'quantile',
   117      'describe',
   118      'sem',
   119      'mad',
   120      'skew',
   121      'kurt',
   122      'kurtosis',
   123      'std',
   124      'var',
   125      'corr',
   126      'cov',
   127      'nunique',
   128  ]
   129  ALL_AGGREGATIONS = (
   130      LIFTABLE_AGGREGATIONS + LIFTABLE_WITH_SUM_AGGREGATIONS +
   131      UNLIFTABLE_AGGREGATIONS)
   132  
   133  # These aggregations have specialized distributed implementations on
   134  # DeferredSeries, which are re-used in DeferredFrame. Note they are *not* used
   135  # for grouped aggregations, since they generally require tracking multiple
   136  # intermediate series, which is difficult to lift in groupby.
   137  HAND_IMPLEMENTED_GLOBAL_AGGREGATIONS = {
   138      'quantile',
   139      'std',
   140      'var',
   141      'mean',
   142      'nunique',
   143      'corr',
   144      'cov',
   145      'skew',
   146      'kurt',
   147      'kurtosis'
   148  }
   149  UNLIFTABLE_GLOBAL_AGGREGATIONS = (
   150      set(UNLIFTABLE_AGGREGATIONS) - set(HAND_IMPLEMENTED_GLOBAL_AGGREGATIONS))
   151  
   152  
   153  def _agg_method(base, func):
   154    def wrapper(self, *args, **kwargs):
   155      return self.agg(func, *args, **kwargs)
   156  
   157    if func in UNLIFTABLE_GLOBAL_AGGREGATIONS:
   158      wrapper.__doc__ = (
   159          f"``{func}`` cannot currently be parallelized. It will "
   160          "require collecting all data on a single node.")
   161    wrapper.__name__ = func
   162  
   163    return frame_base.with_docs_from(base)(wrapper)
   164  
   165  
   166  # Docstring to use for head and tail (commonly used to peek at datasets)
   167  _PEEK_METHOD_EXPLANATION = (
   168      "because it is `order-sensitive "
   169      "<https://s.apache.org/dataframe-order-sensitive-operations>`_.\n\n"
   170      "If you want to peek at a large dataset consider using interactive Beam's "
   171      ":func:`ib.collect "
   172      "<apache_beam.runners.interactive.interactive_beam.collect>` "
   173      "with ``n`` specified, or :meth:`sample`. If you want to find the "
   174      "N largest elements, consider using :meth:`DeferredDataFrame.nlargest`.")
   175  
   176  
   177  class DeferredDataFrameOrSeries(frame_base.DeferredFrame):
   178    def _render_indexes(self):
   179      if self.index.nlevels == 1:
   180        return 'index=' + (
   181            '<unnamed>' if self.index.name is None else repr(self.index.name))
   182      else:
   183        return 'indexes=[' + ', '.join(
   184            '<unnamed>' if ix is None else repr(ix)
   185            for ix in self.index.names) + ']'
   186  
   187    __array__ = frame_base.wont_implement_method(
   188        pd.Series, '__array__', reason="non-deferred-result")
   189  
   190    @frame_base.with_docs_from(pd.DataFrame)
   191    @frame_base.args_to_kwargs(pd.DataFrame)
   192    @frame_base.populate_defaults(pd.DataFrame)
   193    @frame_base.maybe_inplace
   194    def drop(self, labels, axis, index, columns, errors, **kwargs):
   195      """drop is not parallelizable when dropping from the index and
   196      ``errors="raise"`` is specified. It requires collecting all data on a single
   197      node in order to detect if one of the index values is missing."""
   198      if labels is not None:
   199        if index is not None or columns is not None:
   200          raise ValueError("Cannot specify both 'labels' and 'index'/'columns'")
   201        if axis in (0, 'index'):
   202          index = labels
   203          columns = None
   204        elif axis in (1, 'columns'):
   205          index = None
   206          columns = labels
   207        else:
   208          raise ValueError(
   209              "axis must be one of (0, 1, 'index', 'columns'), "
   210              "got '%s'" % axis)
   211  
   212      if columns is not None:
   213        # Compute the proxy based on just the columns that are dropped.
   214        proxy = self._expr.proxy().drop(columns=columns, errors=errors)
   215      else:
   216        proxy = self._expr.proxy()
   217  
   218      if index is not None and errors == 'raise':
   219        # In order to raise an error about missing index values, we'll
   220        # need to collect the entire dataframe.
   221        # TODO: This could be parallelized by putting index values in a
   222        # ConstantExpression and partitioning by index.
   223        requires = partitionings.Singleton(
   224            reason=(
   225                "drop(errors='raise', axis='index') is not currently "
   226                "parallelizable. This requires collecting all data on a single "
   227                f"node in order to detect if one of {index!r} is missing."))
   228      else:
   229        requires = partitionings.Arbitrary()
   230  
   231      return frame_base.DeferredFrame.wrap(
   232          expressions.ComputedExpression(
   233              'drop',
   234              lambda df: df.drop(
   235                  axis=axis,
   236                  index=index,
   237                  columns=columns,
   238                  errors=errors,
   239                  **kwargs), [self._expr],
   240              proxy=proxy,
   241              requires_partition_by=requires))
   242  
   243    @frame_base.with_docs_from(pd.DataFrame)
   244    @frame_base.args_to_kwargs(pd.DataFrame)
   245    @frame_base.populate_defaults(pd.DataFrame)
   246    def droplevel(self, level, axis):
   247      return frame_base.DeferredFrame.wrap(
   248          expressions.ComputedExpression(
   249              'droplevel',
   250              lambda df: df.droplevel(level, axis=axis), [self._expr],
   251              requires_partition_by=partitionings.Arbitrary(),
   252              preserves_partition_by=partitionings.Arbitrary()
   253              if axis in (1, 'column') else partitionings.Singleton()))
   254  
   255    @frame_base.with_docs_from(pd.DataFrame)
   256    @frame_base.args_to_kwargs(pd.DataFrame)
   257    def swaplevel(self, **kwargs):
   258      return frame_base.DeferredFrame.wrap(
   259          expressions.ComputedExpression(
   260              'swaplevel',
   261              lambda df: df.swaplevel(**kwargs), [self._expr],
   262              requires_partition_by=partitionings.Arbitrary(),
   263              preserves_partition_by=partitionings.Arbitrary()))
   264  
   265    @frame_base.with_docs_from(pd.DataFrame)
   266    @frame_base.args_to_kwargs(pd.DataFrame)
   267    @frame_base.populate_defaults(pd.DataFrame)
   268    @frame_base.maybe_inplace
   269    def fillna(self, value, method, axis, limit, **kwargs):
   270      """When ``axis="index"``, both ``method`` and ``limit`` must be ``None``.
   271      otherwise this operation is order-sensitive."""
   272      # Default value is None, but is overriden with index.
   273      axis = axis or 'index'
   274  
   275      if axis in (0, 'index'):
   276        if method is not None:
   277          raise frame_base.WontImplementError(
   278              f"fillna(method={method!r}, axis={axis!r}) is not supported "
   279              "because it is order-sensitive. Only fillna(method=None) is "
   280              f"supported with axis={axis!r}.",
   281              reason="order-sensitive")
   282        if limit is not None:
   283          raise frame_base.WontImplementError(
   284              f"fillna(limit={method!r}, axis={axis!r}) is not supported because "
   285              "it is order-sensitive. Only fillna(limit=None) is supported with "
   286              f"axis={axis!r}.",
   287              reason="order-sensitive")
   288  
   289      if isinstance(self, DeferredDataFrame) and isinstance(value,
   290                                                            DeferredSeries):
   291        # If self is a DataFrame and value is a Series we want to broadcast value
   292        # to all partitions of self.
   293        # This is OK, as its index must be the same size as the columns set of
   294        # self, so cannot be too large.
   295        class AsScalar(object):
   296          def __init__(self, value):
   297            self.value = value
   298  
   299        with expressions.allow_non_parallel_operations():
   300          value_expr = expressions.ComputedExpression(
   301              'as_scalar',
   302              lambda df: AsScalar(df), [value._expr],
   303              requires_partition_by=partitionings.Singleton())
   304  
   305        get_value = lambda x: x.value
   306        requires = partitionings.Arbitrary()
   307      elif isinstance(value, frame_base.DeferredBase):
   308        # For other DeferredBase combinations, use Index partitioning to
   309        # co-locate on the Index
   310        value_expr = value._expr
   311        get_value = lambda x: x
   312        requires = partitionings.Index()
   313      else:
   314        # Default case, pass value through as a constant, no particular
   315        # partitioning requirement
   316        value_expr = expressions.ConstantExpression(value)
   317        get_value = lambda x: x
   318        requires = partitionings.Arbitrary()
   319  
   320      return frame_base.DeferredFrame.wrap(
   321          # yapf: disable
   322          expressions.ComputedExpression(
   323              'fillna',
   324              lambda df,
   325              value: df.fillna(
   326                  get_value(value),
   327                  method=method,
   328                  axis=axis,
   329                  limit=limit,
   330                  **kwargs), [self._expr, value_expr],
   331              preserves_partition_by=partitionings.Arbitrary(),
   332              requires_partition_by=requires))
   333  
   334    if hasattr(pd.DataFrame, 'ffill'):
   335      ffill = _fillna_alias('ffill')
   336    if hasattr(pd.DataFrame, 'bfill'):
   337      bfill = _fillna_alias('bfill')
   338    if hasattr(pd.DataFrame, 'backfill'):
   339      backfill = _fillna_alias('backfill')
   340    if hasattr(pd.DataFrame, 'pad'):
   341      pad = _fillna_alias('pad')
   342  
   343    @frame_base.with_docs_from(pd.DataFrame)
   344    def first(self, offset):
   345      per_partition = expressions.ComputedExpression(
   346          'first-per-partition',
   347          lambda df: df.sort_index().first(offset=offset), [self._expr],
   348          preserves_partition_by=partitionings.Arbitrary(),
   349          requires_partition_by=partitionings.Arbitrary())
   350      with expressions.allow_non_parallel_operations(True):
   351        return frame_base.DeferredFrame.wrap(
   352            expressions.ComputedExpression(
   353                'first',
   354                lambda df: df.sort_index().first(offset=offset), [per_partition],
   355                preserves_partition_by=partitionings.Arbitrary(),
   356                requires_partition_by=partitionings.Singleton()))
   357  
   358    @frame_base.with_docs_from(pd.DataFrame)
   359    def last(self, offset):
   360      per_partition = expressions.ComputedExpression(
   361          'last-per-partition',
   362          lambda df: df.sort_index().last(offset=offset), [self._expr],
   363          preserves_partition_by=partitionings.Arbitrary(),
   364          requires_partition_by=partitionings.Arbitrary())
   365      with expressions.allow_non_parallel_operations(True):
   366        return frame_base.DeferredFrame.wrap(
   367            expressions.ComputedExpression(
   368                'last',
   369                lambda df: df.sort_index().last(offset=offset), [per_partition],
   370                preserves_partition_by=partitionings.Arbitrary(),
   371                requires_partition_by=partitionings.Singleton()))
   372  
   373    @frame_base.with_docs_from(pd.DataFrame)
   374    @frame_base.args_to_kwargs(pd.DataFrame)
   375    @frame_base.populate_defaults(pd.DataFrame)
   376    def groupby(self, by, level, axis, as_index, group_keys, **kwargs):
   377      """``as_index`` must be ``True``.
   378  
   379      Aggregations grouping by a categorical column with ``observed=False`` set
   380      are not currently parallelizable
   381      (`Issue 21827 <https://github.com/apache/beam/issues/21827>`_).
   382      """
   383      if not as_index:
   384        raise NotImplementedError('groupby(as_index=False)')
   385  
   386      if axis in (1, 'columns'):
   387        return _DeferredGroupByCols(
   388            expressions.ComputedExpression(
   389                'groupbycols',
   390                lambda df: df.groupby(
   391                    by, axis=axis, group_keys=group_keys, **kwargs), [self._expr],
   392                requires_partition_by=partitionings.Arbitrary(),
   393                preserves_partition_by=partitionings.Arbitrary()),
   394            group_keys=group_keys)
   395  
   396      if level is None and by is None:
   397        raise TypeError("You have to supply one of 'by' and 'level'")
   398  
   399      elif level is not None:
   400        if isinstance(level, (list, tuple)):
   401          grouping_indexes = level
   402        else:
   403          grouping_indexes = [level]
   404  
   405        grouping_columns = []
   406  
   407        index = self._expr.proxy().index
   408  
   409        # Translate to level numbers only
   410        grouping_indexes = [
   411            l if isinstance(l, int) else index.names.index(l)
   412            for l in grouping_indexes
   413        ]
   414  
   415        if index.nlevels == 1:
   416          to_group_with_index = self._expr
   417          to_group = self._expr
   418        else:
   419          levels_to_drop = [
   420              i for i in range(index.nlevels) if i not in grouping_indexes
   421          ]
   422  
   423          # Reorder so the grouped indexes are first
   424          to_group_with_index = self.reorder_levels(
   425              grouping_indexes + levels_to_drop)
   426  
   427          grouping_indexes = list(range(len(grouping_indexes)))
   428          levels_to_drop = list(range(len(grouping_indexes), index.nlevels))
   429          if levels_to_drop:
   430            to_group = to_group_with_index.droplevel(levels_to_drop)._expr
   431          else:
   432            to_group = to_group_with_index._expr
   433          to_group_with_index = to_group_with_index._expr
   434  
   435      elif callable(by):
   436  
   437        def map_index(df):
   438          df = df.copy()
   439          df.index = df.index.map(by)
   440          return df
   441  
   442        to_group = expressions.ComputedExpression(
   443            'map_index',
   444            map_index, [self._expr],
   445            requires_partition_by=partitionings.Arbitrary(),
   446            preserves_partition_by=partitionings.Singleton())
   447  
   448        orig_nlevels = self._expr.proxy().index.nlevels
   449  
   450        def prepend_mapped_index(df):
   451          df = df.copy()
   452  
   453          index = df.index.to_frame()
   454          index.insert(0, None, df.index.map(by))
   455  
   456          df.index = pd.MultiIndex.from_frame(
   457              index, names=[None] + list(df.index.names))
   458          return df
   459  
   460        to_group_with_index = expressions.ComputedExpression(
   461            'map_index_keep_orig',
   462            prepend_mapped_index,
   463            [self._expr],
   464            requires_partition_by=partitionings.Arbitrary(),
   465            # Partitioning by the original indexes is preserved
   466            preserves_partition_by=partitionings.Index(
   467                list(range(1, orig_nlevels + 1))))
   468  
   469        grouping_columns = []
   470        # The index we need to group by is the last one
   471        grouping_indexes = [0]
   472  
   473      elif isinstance(by, DeferredSeries):
   474        if isinstance(self, DeferredSeries):
   475  
   476          def set_index(s, by):
   477            df = pd.DataFrame(s)
   478            df, by = df.align(by, axis=0, join='inner')
   479            return df.set_index(by).iloc[:, 0]
   480  
   481          def prepend_index(s, by):
   482            df = pd.DataFrame(s)
   483            df, by = df.align(by, axis=0, join='inner')
   484            return df.set_index([by, df.index]).iloc[:, 0]
   485  
   486        else:
   487  
   488          def set_index(df, by):  # type: ignore
   489            df, by = df.align(by, axis=0, join='inner')
   490            return df.set_index(by)
   491  
   492          def prepend_index(df, by):  # type: ignore
   493            df, by = df.align(by, axis=0, join='inner')
   494            return df.set_index([by, df.index])
   495  
   496        to_group = expressions.ComputedExpression(
   497            'set_index',
   498            set_index, [self._expr, by._expr],
   499            requires_partition_by=partitionings.Index(),
   500            preserves_partition_by=partitionings.Singleton())
   501  
   502        orig_nlevels = self._expr.proxy().index.nlevels
   503        to_group_with_index = expressions.ComputedExpression(
   504            'prependindex',
   505            prepend_index, [self._expr, by._expr],
   506            requires_partition_by=partitionings.Index(),
   507            preserves_partition_by=partitionings.Index(
   508                list(range(1, orig_nlevels + 1))))
   509  
   510        grouping_columns = []
   511        grouping_indexes = [0]
   512  
   513      elif isinstance(by, np.ndarray):
   514        raise frame_base.WontImplementError(
   515            "Grouping by a concrete ndarray is order sensitive.",
   516            reason="order-sensitive")
   517  
   518      elif isinstance(self, DeferredDataFrame):
   519        if not isinstance(by, list):
   520          by = [by]
   521        # Find the columns that we need to move into the index so we can group by
   522        # them
   523        column_names = self._expr.proxy().columns
   524        grouping_columns = list(set(by).intersection(column_names))
   525        index_names = self._expr.proxy().index.names
   526        for label in by:
   527          if label not in index_names and label not in self._expr.proxy().columns:
   528            raise KeyError(label)
   529        grouping_indexes = list(set(by).intersection(index_names))
   530  
   531        if grouping_indexes:
   532          if set(by) == set(index_names):
   533            to_group = self._expr
   534          elif set(by).issubset(index_names):
   535            to_group = self.droplevel(index_names.difference(by))._expr
   536          else:
   537            to_group = self.reset_index(grouping_indexes).set_index(by)._expr
   538        else:
   539          to_group = self.set_index(by)._expr
   540  
   541        if grouping_columns:
   542          # TODO(https://github.com/apache/beam/issues/20759):
   543          # It should be possible to do this without creating
   544          # an expression manually, by using DeferredDataFrame.set_index, i.e.:
   545          #   to_group_with_index = self.set_index([self.index] +
   546          #                                        grouping_columns)._expr
   547          to_group_with_index = expressions.ComputedExpression(
   548              'move_grouped_columns_to_index',
   549              lambda df: df.set_index([df.index] + grouping_columns, drop=False),
   550              [self._expr],
   551              requires_partition_by=partitionings.Arbitrary(),
   552              preserves_partition_by=partitionings.Index(
   553                  list(range(self._expr.proxy().index.nlevels))))
   554        else:
   555          to_group_with_index = self._expr
   556  
   557      else:
   558        raise NotImplementedError(by)
   559  
   560      return DeferredGroupBy(
   561          expressions.ComputedExpression(
   562              'groupbyindex',
   563              lambda df: df.groupby(
   564                  level=list(range(df.index.nlevels)),
   565                  group_keys=group_keys,
   566                  **kwargs), [to_group],
   567              requires_partition_by=partitionings.Index(),
   568              preserves_partition_by=partitionings.Arbitrary()),
   569          kwargs,
   570          to_group,
   571          to_group_with_index,
   572          grouping_columns=grouping_columns,
   573          grouping_indexes=grouping_indexes,
   574          group_keys=group_keys)
   575  
   576    @property  # type: ignore
   577    @frame_base.with_docs_from(pd.DataFrame)
   578    def loc(self):
   579      return _DeferredLoc(self)
   580  
   581    @property  # type: ignore
   582    @frame_base.with_docs_from(pd.DataFrame)
   583    def iloc(self):
   584      """Position-based indexing with `iloc` is order-sensitive in almost every
   585      case. Beam DataFrame users should prefer label-based indexing with `loc`.
   586      """
   587      return _DeferredILoc(self)
   588  
   589    @frame_base.with_docs_from(pd.DataFrame)
   590    @frame_base.args_to_kwargs(pd.DataFrame)
   591    @frame_base.populate_defaults(pd.DataFrame)
   592    @frame_base.maybe_inplace
   593    def reset_index(self, level=None, **kwargs):
   594      """Dropping the entire index (e.g. with ``reset_index(level=None)``) is
   595      not parallelizable. It is also only guaranteed that the newly generated
   596      index values will be unique. The Beam DataFrame API makes no guarantee
   597      that the same index values as the equivalent pandas operation will be
   598      generated, because that implementation is order-sensitive."""
   599      if level is not None and not isinstance(level, (tuple, list)):
   600        level = [level]
   601      if level is None or len(level) == self._expr.proxy().index.nlevels:
   602        # TODO(https://github.com/apache/beam/issues/20859):
   603        # Could do distributed re-index with offsets.
   604        requires_partition_by = partitionings.Singleton(
   605            reason=(
   606                f"reset_index(level={level!r}) drops the entire index and "
   607                "creates a new one, so it cannot currently be parallelized "
   608                "(https://github.com/apache/beam/issues/20859)."))
   609      else:
   610        requires_partition_by = partitionings.Arbitrary()
   611      return frame_base.DeferredFrame.wrap(
   612          expressions.ComputedExpression(
   613              'reset_index',
   614              lambda df: df.reset_index(level=level, **kwargs), [self._expr],
   615              preserves_partition_by=partitionings.Singleton(),
   616              requires_partition_by=requires_partition_by))
   617  
   618    abs = frame_base._elementwise_method('abs', base=pd.core.generic.NDFrame)
   619  
   620    @frame_base.with_docs_from(pd.core.generic.NDFrame)
   621    @frame_base.args_to_kwargs(pd.core.generic.NDFrame)
   622    @frame_base.populate_defaults(pd.core.generic.NDFrame)
   623    def astype(self, dtype, copy, errors):
   624      """astype is not parallelizable when ``errors="ignore"`` is specified.
   625  
   626      ``copy=False`` is not supported because it relies on memory-sharing
   627      semantics.
   628  
   629      ``dtype="category`` is not supported because the type of the output column
   630      depends on the data. Please use ``pd.CategoricalDtype`` with explicit
   631      categories instead.
   632      """
   633      requires = partitionings.Arbitrary()
   634  
   635      if errors == "ignore":
   636        # We need all data in order to ignore errors and propagate the original
   637        # data.
   638        requires = partitionings.Singleton(
   639            reason=(
   640                f"astype(errors={errors!r}) is currently not parallelizable, "
   641                "because all data must be collected on one node to determine if "
   642                "the original data should be propagated instead."))
   643  
   644      if not copy:
   645        raise frame_base.WontImplementError(
   646            f"astype(copy={copy!r}) is not supported because it relies on "
   647            "memory-sharing semantics that are not compatible with the Beam "
   648            "model.")
   649  
   650      # An instance of CategoricalDtype is actualy considered equal to the string
   651      # 'category', so we have to explicitly check if dtype is an instance of
   652      # CategoricalDtype, and allow it.
   653      # See https://github.com/apache/beam/issues/23276
   654      if dtype == 'category' and not isinstance(dtype, pd.CategoricalDtype):
   655        raise frame_base.WontImplementError(
   656            "astype(dtype='category') is not supported because the type of the "
   657            "output column depends on the data. Please use pd.CategoricalDtype "
   658            "with explicit categories instead.",
   659            reason="non-deferred-columns")
   660  
   661      return frame_base.DeferredFrame.wrap(
   662          expressions.ComputedExpression(
   663              'astype',
   664              lambda df: df.astype(dtype=dtype, copy=copy, errors=errors),
   665              [self._expr],
   666              requires_partition_by=requires,
   667              preserves_partition_by=partitionings.Arbitrary()))
   668  
   669    at_time = frame_base._elementwise_method(
   670        'at_time', base=pd.core.generic.NDFrame)
   671    between_time = frame_base._elementwise_method(
   672        'between_time', base=pd.core.generic.NDFrame)
   673    copy = frame_base._elementwise_method('copy', base=pd.core.generic.NDFrame)
   674  
   675    @frame_base.with_docs_from(pd.DataFrame)
   676    @frame_base.args_to_kwargs(pd.DataFrame)
   677    @frame_base.populate_defaults(pd.DataFrame)
   678    @frame_base.maybe_inplace
   679    def replace(self, to_replace, value, limit, method, **kwargs):
   680      """``method`` is not supported in the Beam DataFrame API because it is
   681      order-sensitive. It cannot be specified.
   682  
   683      If ``limit`` is specified this operation is not parallelizable."""
   684      # pylint: disable-next=c-extension-no-member
   685      value_compare = None if PD_VERSION < (1, 4) else lib.no_default
   686      if method is not None and not isinstance(to_replace,
   687                                               dict) and value is value_compare:
   688        # pandas only relies on method if to_replace is not a dictionary, and
   689        # value is the <no_default> value. This is different than
   690        # if ``None`` is explicitly passed for ``value``. In this case, it will be
   691        # respected
   692        raise frame_base.WontImplementError(
   693            f"replace(method={method!r}) is not supported because it is "
   694            "order sensitive. Only replace(method=None) is supported.",
   695            reason="order-sensitive")
   696  
   697      if limit is None:
   698        requires_partition_by = partitionings.Arbitrary()
   699      else:
   700        requires_partition_by = partitionings.Singleton(
   701            reason=(
   702                f"replace(limit={limit!r}) cannot currently be parallelized. It "
   703                "requires collecting all data on a single node."))
   704      return frame_base.DeferredFrame.wrap(
   705          expressions.ComputedExpression(
   706              'replace',
   707              lambda df: df.replace(
   708                  to_replace=to_replace,
   709                  value=value,
   710                  limit=limit,
   711                  method=method,
   712                  **kwargs), [self._expr],
   713              preserves_partition_by=partitionings.Arbitrary(),
   714              requires_partition_by=requires_partition_by))
   715  
   716    @frame_base.with_docs_from(pd.DataFrame)
   717    @frame_base.args_to_kwargs(pd.DataFrame)
   718    @frame_base.populate_defaults(pd.DataFrame)
   719    def tz_localize(self, ambiguous, **kwargs):
   720      """``ambiguous`` cannot be set to ``"infer"`` as its semantics are
   721      order-sensitive. Similarly, specifying ``ambiguous`` as an
   722      :class:`~numpy.ndarray` is order-sensitive, but you can achieve similar
   723      functionality by specifying ``ambiguous`` as a Series."""
   724      if isinstance(ambiguous, np.ndarray):
   725        raise frame_base.WontImplementError(
   726            "tz_localize(ambiguous=ndarray) is not supported because it makes "
   727            "this operation sensitive to the order of the data. Please use a "
   728            "DeferredSeries instead.",
   729            reason="order-sensitive")
   730      elif isinstance(ambiguous, frame_base.DeferredFrame):
   731        return frame_base.DeferredFrame.wrap(
   732            expressions.ComputedExpression(
   733                'tz_localize',
   734                lambda df,
   735                ambiguous: df.tz_localize(ambiguous=ambiguous, **kwargs),
   736                [self._expr, ambiguous._expr],
   737                requires_partition_by=partitionings.Index(),
   738                preserves_partition_by=partitionings.Singleton()))
   739      elif ambiguous == 'infer':
   740        # infer attempts to infer based on the order of the timestamps
   741        raise frame_base.WontImplementError(
   742            f"tz_localize(ambiguous={ambiguous!r}) is not allowed because it "
   743            "makes this operation sensitive to the order of the data.",
   744            reason="order-sensitive")
   745  
   746      return frame_base.DeferredFrame.wrap(
   747          expressions.ComputedExpression(
   748              'tz_localize',
   749              lambda df: df.tz_localize(ambiguous=ambiguous, **kwargs),
   750              [self._expr],
   751              requires_partition_by=partitionings.Arbitrary(),
   752              preserves_partition_by=partitionings.Singleton()))
   753  
   754    @property  # type: ignore
   755    @frame_base.with_docs_from(pd.DataFrame)
   756    def size(self):
   757      sizes = expressions.ComputedExpression(
   758          'get_sizes',
   759          # Wrap scalar results in a Series for easier concatenation later
   760          lambda df: pd.Series(df.size),
   761          [self._expr],
   762          requires_partition_by=partitionings.Arbitrary(),
   763          preserves_partition_by=partitionings.Singleton())
   764  
   765      with expressions.allow_non_parallel_operations(True):
   766        return frame_base.DeferredFrame.wrap(
   767            expressions.ComputedExpression(
   768                'sum_sizes',
   769                lambda sizes: sizes.sum(), [sizes],
   770                requires_partition_by=partitionings.Singleton(),
   771                preserves_partition_by=partitionings.Singleton()))
   772  
   773    def length(self):
   774      """Alternative to ``len(df)`` which returns a deferred result that can be
   775      used in arithmetic with :class:`DeferredSeries` or
   776      :class:`DeferredDataFrame` instances."""
   777      lengths = expressions.ComputedExpression(
   778          'get_lengths',
   779          # Wrap scalar results in a Series for easier concatenation later
   780          lambda df: pd.Series(len(df)),
   781          [self._expr],
   782          requires_partition_by=partitionings.Arbitrary(),
   783          preserves_partition_by=partitionings.Singleton())
   784  
   785      with expressions.allow_non_parallel_operations(True):
   786        return frame_base.DeferredFrame.wrap(
   787            expressions.ComputedExpression(
   788                'sum_lengths',
   789                lambda lengths: lengths.sum(), [lengths],
   790                requires_partition_by=partitionings.Singleton(),
   791                preserves_partition_by=partitionings.Singleton()))
   792  
   793    def __len__(self):
   794      raise frame_base.WontImplementError(
   795          "len(df) is not currently supported because it produces a non-deferred "
   796          "result. Consider using df.length() instead.",
   797          reason="non-deferred-result")
   798  
   799    @property  # type: ignore
   800    @frame_base.with_docs_from(pd.DataFrame)
   801    def empty(self):
   802      empties = expressions.ComputedExpression(
   803          'get_empties',
   804          # Wrap scalar results in a Series for easier concatenation later
   805          lambda df: pd.Series(df.empty),
   806          [self._expr],
   807          requires_partition_by=partitionings.Arbitrary(),
   808          preserves_partition_by=partitionings.Singleton())
   809  
   810      with expressions.allow_non_parallel_operations(True):
   811        return frame_base.DeferredFrame.wrap(
   812            expressions.ComputedExpression(
   813                'check_all_empty',
   814                lambda empties: empties.all(), [empties],
   815                requires_partition_by=partitionings.Singleton(),
   816                preserves_partition_by=partitionings.Singleton()))
   817  
   818    @frame_base.with_docs_from(pd.DataFrame)
   819    def bool(self):
   820      # TODO: Documentation about DeferredScalar
   821      # Will throw if any partition has >1 element
   822      bools = expressions.ComputedExpression(
   823          'get_bools',
   824          # Wrap scalar results in a Series for easier concatenation later
   825          lambda df: pd.Series([], dtype=bool)
   826          if df.empty else pd.Series([df.bool()]),
   827          [self._expr],
   828          requires_partition_by=partitionings.Arbitrary(),
   829          preserves_partition_by=partitionings.Singleton())
   830  
   831      with expressions.allow_non_parallel_operations(True):
   832        # Will throw if overall dataset has != 1 element
   833        return frame_base.DeferredFrame.wrap(
   834            expressions.ComputedExpression(
   835                'combine_all_bools',
   836                lambda bools: bools.bool(), [bools],
   837                proxy=bool(),
   838                requires_partition_by=partitionings.Singleton(),
   839                preserves_partition_by=partitionings.Singleton()))
   840  
   841    @frame_base.with_docs_from(pd.DataFrame)
   842    def equals(self, other):
   843      intermediate = expressions.ComputedExpression(
   844          'equals_partitioned',
   845          # Wrap scalar results in a Series for easier concatenation later
   846          lambda df,
   847          other: pd.Series(df.equals(other)),
   848          [self._expr, other._expr],
   849          requires_partition_by=partitionings.Index(),
   850          preserves_partition_by=partitionings.Singleton())
   851  
   852      with expressions.allow_non_parallel_operations(True):
   853        return frame_base.DeferredFrame.wrap(
   854            expressions.ComputedExpression(
   855                'aggregate_equals',
   856                lambda df: df.all(), [intermediate],
   857                requires_partition_by=partitionings.Singleton(),
   858                preserves_partition_by=partitionings.Singleton()))
   859  
   860    @frame_base.args_to_kwargs(pd.DataFrame)
   861    @frame_base.populate_defaults(pd.DataFrame)
   862    def sort_values(self, axis, **kwargs):
   863      """``sort_values`` is not implemented.
   864  
   865      It is not implemented for ``axis=index`` because it imposes an ordering on
   866      the dataset, and it likely will not be maintained (see
   867      https://s.apache.org/dataframe-order-sensitive-operations).
   868  
   869      It is not implemented for ``axis=columns`` because it makes the order of
   870      the columns depend on the data (see
   871      https://s.apache.org/dataframe-non-deferred-columns)."""
   872      if axis in (0, 'index'):
   873        # axis=index imposes an ordering on the DataFrame rows which we do not
   874        # support
   875        raise frame_base.WontImplementError(
   876            "sort_values(axis=index) is not supported because it imposes an "
   877            "ordering on the dataset which likely will not be preserved.",
   878            reason="order-sensitive")
   879      else:
   880        # axis=columns will reorder the columns based on the data
   881        raise frame_base.WontImplementError(
   882            "sort_values(axis=columns) is not supported because the order of the "
   883            "columns in the result depends on the data.",
   884            reason="non-deferred-columns")
   885  
   886    @frame_base.with_docs_from(pd.DataFrame)
   887    @frame_base.args_to_kwargs(pd.DataFrame)
   888    @frame_base.populate_defaults(pd.DataFrame)
   889    @frame_base.maybe_inplace
   890    def sort_index(self, axis, **kwargs):
   891      """``axis=index`` is not allowed because it imposes an ordering on the
   892      dataset, and we cannot guarantee it will be maintained (see
   893      https://s.apache.org/dataframe-order-sensitive-operations). Only
   894      ``axis=columns`` is allowed."""
   895      if axis in (0, 'index'):
   896        # axis=rows imposes an ordering on the DataFrame which we do not support
   897        raise frame_base.WontImplementError(
   898            "sort_index(axis=index) is not supported because it imposes an "
   899            "ordering on the dataset which we cannot guarantee will be "
   900            "preserved.",
   901            reason="order-sensitive")
   902  
   903      # axis=columns reorders the columns by name
   904      return frame_base.DeferredFrame.wrap(
   905          expressions.ComputedExpression(
   906              'sort_index',
   907              lambda df: df.sort_index(axis, **kwargs),
   908              [self._expr],
   909              requires_partition_by=partitionings.Arbitrary(),
   910              preserves_partition_by=partitionings.Arbitrary(),
   911          ))
   912  
   913    @frame_base.with_docs_from(pd.DataFrame)
   914    @frame_base.args_to_kwargs(pd.DataFrame)
   915    @frame_base.populate_defaults(pd.DataFrame)
   916    @frame_base.maybe_inplace
   917    def where(self, cond, other, errors, **kwargs):
   918      """where is not parallelizable when ``errors="ignore"`` is specified."""
   919      requires = partitionings.Arbitrary()
   920      deferred_args = {}
   921      actual_args = {}
   922  
   923      # TODO(bhulette): This is very similar to the logic in
   924      # frame_base.elementwise_method, can we unify it?
   925      if isinstance(cond, frame_base.DeferredFrame):
   926        deferred_args['cond'] = cond
   927        requires = partitionings.Index()
   928      else:
   929        actual_args['cond'] = cond
   930  
   931      if isinstance(other, frame_base.DeferredFrame):
   932        deferred_args['other'] = other
   933        requires = partitionings.Index()
   934      else:
   935        actual_args['other'] = other
   936  
   937      if errors == "ignore":
   938        # We need all data in order to ignore errors and propagate the original
   939        # data.
   940        requires = partitionings.Singleton(
   941            reason=(
   942                f"where(errors={errors!r}) is currently not parallelizable, "
   943                "because all data must be collected on one node to determine if "
   944                "the original data should be propagated instead."))
   945  
   946      actual_args['errors'] = errors
   947  
   948      def where_execution(df, *args):
   949        runtime_values = {
   950            name: value
   951            for (name, value) in zip(deferred_args.keys(), args)
   952        }
   953        return df.where(**runtime_values, **actual_args, **kwargs)
   954  
   955      return frame_base.DeferredFrame.wrap(
   956          expressions.ComputedExpression(
   957              "where",
   958              where_execution,
   959              [self._expr] + [df._expr for df in deferred_args.values()],
   960              requires_partition_by=requires,
   961              preserves_partition_by=partitionings.Index(),
   962          ))
   963  
   964    @frame_base.with_docs_from(pd.DataFrame)
   965    @frame_base.args_to_kwargs(pd.DataFrame)
   966    @frame_base.populate_defaults(pd.DataFrame)
   967    @frame_base.maybe_inplace
   968    def mask(self, cond, **kwargs):
   969      """mask is not parallelizable when ``errors="ignore"`` is specified."""
   970      return self.where(~cond, **kwargs)
   971  
   972    @frame_base.with_docs_from(pd.DataFrame)
   973    @frame_base.args_to_kwargs(pd.DataFrame)
   974    @frame_base.populate_defaults(pd.DataFrame)
   975    def truncate(self, before, after, axis):
   976  
   977      if axis in (None, 0, 'index'):
   978  
   979        def truncate(df):
   980          return df.sort_index().truncate(before=before, after=after, axis=axis)
   981      else:
   982  
   983        def truncate(df):
   984          return df.truncate(before=before, after=after, axis=axis)
   985  
   986      return frame_base.DeferredFrame.wrap(
   987          expressions.ComputedExpression(
   988              'truncate',
   989              truncate, [self._expr],
   990              requires_partition_by=partitionings.Arbitrary(),
   991              preserves_partition_by=partitionings.Arbitrary()))
   992  
   993    @frame_base.with_docs_from(pd.DataFrame)
   994    @frame_base.args_to_kwargs(pd.DataFrame)
   995    @frame_base.populate_defaults(pd.DataFrame)
   996    def unstack(self, **kwargs):
   997      level = kwargs.get('level', -1)
   998  
   999      if self._expr.proxy().index.nlevels == 1:
  1000        if PD_VERSION < (1, 2):
  1001          raise frame_base.WontImplementError(
  1002              "unstack() is not supported when using pandas < 1.2.0\n"
  1003              "Please upgrade to pandas 1.2.0 or higher to use this operation.")
  1004        return frame_base.DeferredFrame.wrap(
  1005            expressions.ComputedExpression(
  1006                'unstack',
  1007                lambda s: s.unstack(**kwargs), [self._expr],
  1008                requires_partition_by=partitionings.Index()))
  1009      else:
  1010        # Unstacking MultiIndex objects
  1011        idx = self._expr.proxy().index
  1012  
  1013        # Converting level (int, str, or combination) to a list of number levels
  1014        level_list = level if isinstance(level, list) else [level]
  1015        level_number_list = [idx._get_level_number(l) for l in level_list]
  1016  
  1017        # Checking if levels provided are of CategoricalDtype
  1018        if not all(isinstance(idx.levels[l].dtype, (pd.CategoricalDtype,
  1019                                                    pd.BooleanDtype))
  1020                   for l in level_number_list):
  1021          raise frame_base.WontImplementError(
  1022              "unstack() is only supported on DataFrames if unstacked level "
  1023              "is a categorical or boolean column",
  1024              reason="non-deferred-columns")
  1025        else:
  1026          tmp = self._expr.proxy().unstack(**kwargs)
  1027          if isinstance(tmp.columns, pd.MultiIndex):
  1028            levels = []
  1029            for i in range(tmp.columns.nlevels):
  1030              level = tmp.columns.levels[i]
  1031              levels.append(level)
  1032            col_idx = pd.MultiIndex.from_product(levels)
  1033          else:
  1034            if tmp.columns.dtype == 'boolean':
  1035              col_idx = pd.Index([False, True], dtype='boolean')
  1036            else:
  1037              col_idx = pd.CategoricalIndex(tmp.columns.categories)
  1038  
  1039          if isinstance(self._expr.proxy(), pd.Series):
  1040            proxy_dtype = self._expr.proxy().dtypes
  1041          else:
  1042            # Set dtype to object if more than one value
  1043            dtypes = [d for d in self._expr.proxy().dtypes]
  1044            proxy_dtype = object
  1045            if np.int64 in dtypes:
  1046              proxy_dtype = np.int64
  1047            if np.float64 in dtypes:
  1048              proxy_dtype = np.float64
  1049            if object in dtypes:
  1050              proxy_dtype = object
  1051  
  1052          proxy = pd.DataFrame(
  1053              columns=col_idx, dtype=proxy_dtype, index=tmp.index)
  1054  
  1055          with expressions.allow_non_parallel_operations(True):
  1056            return frame_base.DeferredFrame.wrap(
  1057                expressions.ComputedExpression(
  1058                    'unstack',
  1059                    lambda s: pd.concat([proxy, s.unstack(**kwargs)]),
  1060                    [self._expr],
  1061                    proxy=proxy,
  1062                    requires_partition_by=partitionings.Singleton()))
  1063  
  1064    @frame_base.with_docs_from(pd.DataFrame)
  1065    @frame_base.args_to_kwargs(pd.DataFrame)
  1066    @frame_base.populate_defaults(pd.DataFrame)
  1067    def xs(self, key, axis, level, **kwargs):
  1068      """Note that ``xs(axis='index')`` will raise a ``KeyError`` at execution
  1069      time if the key does not exist in the index."""
  1070  
  1071      if axis in ('columns', 1):
  1072        # Special case for axis=columns. This is a simple project that raises a
  1073        # KeyError at construction time for missing columns.
  1074        return frame_base.DeferredFrame.wrap(
  1075            expressions.ComputedExpression(
  1076                'xs',
  1077                lambda df: df.xs(key, axis=axis, **kwargs), [self._expr],
  1078                requires_partition_by=partitionings.Arbitrary(),
  1079                preserves_partition_by=partitionings.Arbitrary()))
  1080      elif axis not in ('index', 0):
  1081        # Make sure that user's axis is valid
  1082        raise ValueError(
  1083            "axis must be one of ('index', 0, 'columns', 1). "
  1084            f"got {axis!r}.")
  1085  
  1086      if not isinstance(key, tuple):
  1087        key_size = 1
  1088        key_series = pd.Series([key], index=[key])
  1089      else:
  1090        key_size = len(key)
  1091        key_series = pd.Series([key], pd.MultiIndex.from_tuples([key]))
  1092  
  1093      key_expr = expressions.ConstantExpression(
  1094          key_series, proxy=key_series.iloc[:0])
  1095  
  1096      if level is None:
  1097        reindexed = self
  1098      else:
  1099        if not isinstance(level, list):
  1100          level = [level]
  1101  
  1102        # If user specifed levels, reindex so those levels are at the beginning.
  1103        # Keep the others and preserve their order.
  1104        level = [
  1105            l if isinstance(l, int) else list(self.index.names).index(l)
  1106            for l in level
  1107        ]
  1108  
  1109        reindexed = self.reorder_levels(
  1110            level + [i for i in range(self.index.nlevels) if i not in level])
  1111  
  1112      def xs_partitioned(frame, key):
  1113        if not len(key):
  1114          # key is not in this partition, return empty dataframe
  1115          result = frame.iloc[:0]
  1116          if key_size < frame.index.nlevels:
  1117            return result.droplevel(list(range(key_size)))
  1118          else:
  1119            return result
  1120  
  1121        # key should be in this partition, call xs. Will raise KeyError if not
  1122        # present.
  1123        return frame.xs(key.item())
  1124  
  1125      return frame_base.DeferredFrame.wrap(
  1126          expressions.ComputedExpression(
  1127              'xs',
  1128              xs_partitioned,
  1129              [reindexed._expr, key_expr],
  1130              requires_partition_by=partitionings.Index(list(range(key_size))),
  1131              # Drops index levels, so partitioning is not preserved
  1132              preserves_partition_by=partitionings.Singleton()))
  1133  
  1134    @property
  1135    def dtype(self):
  1136      return self._expr.proxy().dtype
  1137  
  1138    isin = frame_base._elementwise_method('isin', base=pd.DataFrame)
  1139    combine_first = frame_base._elementwise_method(
  1140        'combine_first', base=pd.DataFrame)
  1141  
  1142    combine = frame_base._proxy_method(
  1143        'combine',
  1144        base=pd.DataFrame,
  1145        requires_partition_by=expressions.partitionings.Singleton(
  1146            reason="combine() is not parallelizable because func might operate "
  1147            "on the full dataset."),
  1148        preserves_partition_by=expressions.partitionings.Singleton())
  1149  
  1150    @property  # type: ignore
  1151    @frame_base.with_docs_from(pd.DataFrame)
  1152    def ndim(self):
  1153      return self._expr.proxy().ndim
  1154  
  1155    @property  # type: ignore
  1156    @frame_base.with_docs_from(pd.DataFrame)
  1157    def index(self):
  1158      return _DeferredIndex(self)
  1159  
  1160    @index.setter
  1161    def _set_index(self, value):
  1162      # TODO: assigning the index is generally order-sensitive, but we could
  1163      # support it in some rare cases, e.g. when assigning the index from one
  1164      # of a DataFrame's columns
  1165      raise NotImplementedError(
  1166          "Assigning an index is not yet supported. "
  1167          "Consider using set_index() instead.")
  1168  
  1169    reindex = frame_base.wont_implement_method(
  1170        pd.DataFrame, 'reindex', reason="order-sensitive")
  1171  
  1172    hist = frame_base.wont_implement_method(
  1173        pd.DataFrame, 'hist', reason="plotting-tools")
  1174  
  1175    attrs = property(
  1176        frame_base.wont_implement_method(
  1177            pd.DataFrame, 'attrs', reason='experimental'))
  1178  
  1179    reorder_levels = frame_base._proxy_method(
  1180        'reorder_levels',
  1181        base=pd.DataFrame,
  1182        requires_partition_by=partitionings.Arbitrary(),
  1183        preserves_partition_by=partitionings.Singleton())
  1184  
  1185    resample = frame_base.wont_implement_method(
  1186        pd.DataFrame, 'resample', reason='event-time-semantics')
  1187  
  1188    rolling = frame_base.wont_implement_method(
  1189        pd.DataFrame, 'rolling', reason='event-time-semantics')
  1190  
  1191    to_xarray = frame_base.wont_implement_method(
  1192        pd.DataFrame, 'to_xarray', reason='non-deferred-result')
  1193    to_clipboard = frame_base.wont_implement_method(
  1194        pd.DataFrame, 'to_clipboard', reason="non-deferred-result")
  1195  
  1196    swapaxes = frame_base.wont_implement_method(
  1197        pd.Series, 'swapaxes', reason="non-deferred-columns")
  1198    infer_object = frame_base.wont_implement_method(
  1199        pd.Series, 'infer_objects', reason="non-deferred-columns")
  1200  
  1201    ewm = frame_base.wont_implement_method(
  1202        pd.Series, 'ewm', reason="event-time-semantics")
  1203    expanding = frame_base.wont_implement_method(
  1204        pd.Series, 'expanding', reason="event-time-semantics")
  1205  
  1206    sparse = property(
  1207        frame_base.not_implemented_method(
  1208            'sparse', '20902', base_type=pd.DataFrame))
  1209  
  1210    transform = frame_base._elementwise_method('transform', base=pd.DataFrame)
  1211  
  1212    tz_convert = frame_base._proxy_method(
  1213        'tz_convert',
  1214        base=pd.DataFrame,
  1215        requires_partition_by=partitionings.Arbitrary(),
  1216        # Manipulates index, partitioning is not preserved
  1217        preserves_partition_by=partitionings.Singleton())
  1218  
  1219    @frame_base.with_docs_from(pd.DataFrame)
  1220    def pipe(self, func, *args, **kwargs):
  1221      if isinstance(func, tuple):
  1222        func, data = func
  1223        kwargs[data] = self
  1224        return func(*args, **kwargs)
  1225  
  1226      return func(self, *args, **kwargs)
  1227  
  1228  
  1229  @populate_not_implemented(pd.Series)
  1230  @frame_base.DeferredFrame._register_for(pd.Series)
  1231  class DeferredSeries(DeferredDataFrameOrSeries):
  1232    def __repr__(self):
  1233      return (
  1234          f'DeferredSeries(name={self.name!r}, dtype={self.dtype}, '
  1235          f'{self._render_indexes()})')
  1236  
  1237    @property  # type: ignore
  1238    @frame_base.with_docs_from(pd.Series)
  1239    def name(self):
  1240      return self._expr.proxy().name
  1241  
  1242    @name.setter
  1243    def name(self, value):
  1244      def fn(s):
  1245        s = s.copy()
  1246        s.name = value
  1247        return s
  1248  
  1249      self._expr = expressions.ComputedExpression(
  1250          'series_set_name',
  1251          fn, [self._expr],
  1252          requires_partition_by=partitionings.Arbitrary(),
  1253          preserves_partition_by=partitionings.Arbitrary())
  1254  
  1255    @property  # type: ignore
  1256    @frame_base.with_docs_from(pd.Series)
  1257    def hasnans(self):
  1258      has_nans = expressions.ComputedExpression(
  1259          'hasnans',
  1260          lambda s: pd.Series(s.hasnans), [self._expr],
  1261          requires_partition_by=partitionings.Arbitrary(),
  1262          preserves_partition_by=partitionings.Singleton())
  1263  
  1264      with expressions.allow_non_parallel_operations():
  1265        return frame_base.DeferredFrame.wrap(
  1266            expressions.ComputedExpression(
  1267                'combine_hasnans',
  1268                lambda s: s.any(), [has_nans],
  1269                requires_partition_by=partitionings.Singleton(),
  1270                preserves_partition_by=partitionings.Singleton()))
  1271  
  1272    @property  # type: ignore
  1273    @frame_base.with_docs_from(pd.Series)
  1274    def dtype(self):
  1275      return self._expr.proxy().dtype
  1276  
  1277    dtypes = dtype
  1278  
  1279    def __getitem__(self, key):
  1280      if _is_null_slice(key) or key is Ellipsis:
  1281        return self
  1282  
  1283      elif (isinstance(key, int) or _is_integer_slice(key)
  1284            ) and self._expr.proxy().index._should_fallback_to_positional():
  1285        raise frame_base.WontImplementError(
  1286            "Accessing an item by an integer key is order sensitive for this "
  1287            "Series.",
  1288            reason="order-sensitive")
  1289  
  1290      elif isinstance(key, slice) or callable(key):
  1291        return frame_base.DeferredFrame.wrap(
  1292            expressions.ComputedExpression(
  1293                # yapf: disable
  1294                'getitem',
  1295                lambda df: df[key],
  1296                [self._expr],
  1297                requires_partition_by=partitionings.Arbitrary(),
  1298                preserves_partition_by=partitionings.Arbitrary()))
  1299  
  1300      elif isinstance(key, DeferredSeries) and key._expr.proxy().dtype == bool:
  1301        return frame_base.DeferredFrame.wrap(
  1302            expressions.ComputedExpression(
  1303                # yapf: disable
  1304                'getitem',
  1305                lambda df,
  1306                indexer: df[indexer],
  1307                [self._expr, key._expr],
  1308                requires_partition_by=partitionings.Index(),
  1309                preserves_partition_by=partitionings.Arbitrary()))
  1310  
  1311      elif pd.core.series.is_iterator(key) or pd.core.common.is_bool_indexer(key):
  1312        raise frame_base.WontImplementError(
  1313            "Accessing a DeferredSeries with an iterator is sensitive to the "
  1314            "order of the data.",
  1315            reason="order-sensitive")
  1316  
  1317      else:
  1318        # We could consider returning a deferred scalar, but that might
  1319        # be more surprising than a clear error.
  1320        raise frame_base.WontImplementError(
  1321            f"Indexing a series with key of type {type(key)} is not supported "
  1322            "because it produces a non-deferred result.",
  1323            reason="non-deferred-result")
  1324  
  1325    @frame_base.with_docs_from(pd.Series)
  1326    def keys(self):
  1327      return self.index
  1328  
  1329    # Series.T == transpose. Both are a no-op
  1330    T = frame_base._elementwise_method('T', base=pd.Series)
  1331    transpose = frame_base._elementwise_method('transpose', base=pd.Series)
  1332    shape = property(
  1333        frame_base.wont_implement_method(
  1334            pd.Series, 'shape', reason="non-deferred-result"))
  1335  
  1336    @frame_base.with_docs_from(pd.Series)
  1337    @frame_base.args_to_kwargs(pd.Series)
  1338    @frame_base.populate_defaults(pd.Series)
  1339    def append(self, to_append, ignore_index, verify_integrity, **kwargs):
  1340      """``ignore_index=True`` is not supported, because it requires generating an
  1341      order-sensitive index."""
  1342      if not isinstance(to_append, DeferredSeries):
  1343        raise frame_base.WontImplementError(
  1344            "append() only accepts DeferredSeries instances, received " +
  1345            str(type(to_append)))
  1346      if ignore_index:
  1347        raise frame_base.WontImplementError(
  1348            "append(ignore_index=True) is order sensitive because it requires "
  1349            "generating a new index based on the order of the data.",
  1350            reason="order-sensitive")
  1351  
  1352      if verify_integrity:
  1353        # We can verify the index is non-unique within index partitioned data.
  1354        requires = partitionings.Index()
  1355      else:
  1356        requires = partitionings.Arbitrary()
  1357  
  1358      return frame_base.DeferredFrame.wrap(
  1359          expressions.ComputedExpression(
  1360              'append',
  1361              lambda s,
  1362              to_append: s.append(
  1363                  to_append, verify_integrity=verify_integrity, **kwargs),
  1364              [self._expr, to_append._expr],
  1365              requires_partition_by=requires,
  1366              preserves_partition_by=partitionings.Arbitrary()))
  1367  
  1368    @frame_base.with_docs_from(pd.Series)
  1369    @frame_base.args_to_kwargs(pd.Series)
  1370    @frame_base.populate_defaults(pd.Series)
  1371    def align(self, other, join, axis, level, method, **kwargs):
  1372      """Aligning per-level is not yet supported. Only the default,
  1373      ``level=None``, is allowed.
  1374  
  1375      Filling NaN values via ``method`` is not supported, because it is
  1376      `order-sensitive
  1377      <https://s.apache.org/dataframe-order-sensitive-operations>`_.
  1378      Only the default, ``method=None``, is allowed."""
  1379      if level is not None:
  1380        raise NotImplementedError('per-level align')
  1381      if method is not None:
  1382        raise frame_base.WontImplementError(
  1383            f"align(method={method!r}) is not supported because it is "
  1384            "order sensitive. Only align(method=None) is supported.",
  1385            reason="order-sensitive")
  1386      # We're using pd.concat here as expressions don't yet support
  1387      # multiple return values.
  1388      aligned = frame_base.DeferredFrame.wrap(
  1389          expressions.ComputedExpression(
  1390              'align',
  1391              lambda x,
  1392              y: pd.concat([x, y], axis=1, join='inner'),
  1393              [self._expr, other._expr],
  1394              requires_partition_by=partitionings.Index(),
  1395              preserves_partition_by=partitionings.Arbitrary()))
  1396      return aligned.iloc[:, 0], aligned.iloc[:, 1]
  1397  
  1398    argsort = frame_base.wont_implement_method(
  1399        pd.Series, 'argsort', reason="order-sensitive")
  1400  
  1401    array = property(
  1402        frame_base.wont_implement_method(
  1403            pd.Series, 'array', reason="non-deferred-result"))
  1404  
  1405    # We can't reliably predict the output type, it depends on whether `key` is:
  1406    # - not in the index (default_value)
  1407    # - in the index once (constant)
  1408    # - in the index multiple times (Series)
  1409    get = frame_base.wont_implement_method(
  1410        pd.Series, 'get', reason="non-deferred-columns")
  1411  
  1412    ravel = frame_base.wont_implement_method(
  1413        pd.Series, 'ravel', reason="non-deferred-result")
  1414  
  1415    slice_shift = frame_base.wont_implement_method(
  1416        pd.Series, 'slice_shift', reason="deprecated")
  1417    tshift = frame_base.wont_implement_method(
  1418        pd.Series, 'tshift', reason="deprecated")
  1419  
  1420    rename = frame_base._proxy_method(
  1421        'rename',
  1422        base=pd.Series,
  1423        requires_partition_by=partitionings.Arbitrary(),
  1424        preserves_partition_by=partitionings.Singleton())
  1425  
  1426    between = frame_base._elementwise_method('between', base=pd.Series)
  1427  
  1428    add_suffix = frame_base._proxy_method(
  1429        'add_suffix',
  1430        base=pd.DataFrame,
  1431        requires_partition_by=partitionings.Arbitrary(),
  1432        preserves_partition_by=partitionings.Singleton())
  1433    add_prefix = frame_base._proxy_method(
  1434        'add_prefix',
  1435        base=pd.DataFrame,
  1436        requires_partition_by=partitionings.Arbitrary(),
  1437        preserves_partition_by=partitionings.Singleton())
  1438  
  1439    info = frame_base.wont_implement_method(
  1440        pd.Series, 'info', reason="non-deferred-result")
  1441  
  1442    def _idxmaxmin_helper(self, op, **kwargs):
  1443      if op == 'idxmax':
  1444        func = pd.Series.idxmax
  1445      elif op == 'idxmin':
  1446        func = pd.Series.idxmin
  1447      else:
  1448        raise ValueError(
  1449            "op must be one of ('idxmax', 'idxmin'). "
  1450            f"got {op!r}.")
  1451  
  1452      def compute_idx(s):
  1453        index = func(s, **kwargs)
  1454        if pd.isna(index):
  1455          return s
  1456        else:
  1457          return s.loc[[index]]
  1458  
  1459      # Avoids empty Series error when evaluating proxy
  1460      index_dtype = self._expr.proxy().index.dtype
  1461      index = pd.Index([], dtype=index_dtype)
  1462      proxy = self._expr.proxy().copy()
  1463      proxy.index = index
  1464      proxy = proxy.append(
  1465          pd.Series([1], index=np.asarray(['0']).astype(proxy.index.dtype)))
  1466  
  1467      idx_func = expressions.ComputedExpression(
  1468          'idx_func',
  1469          compute_idx, [self._expr],
  1470          proxy=proxy,
  1471          requires_partition_by=partitionings.Arbitrary(),
  1472          preserves_partition_by=partitionings.Arbitrary())
  1473  
  1474      with expressions.allow_non_parallel_operations(True):
  1475        return frame_base.DeferredFrame.wrap(
  1476            expressions.ComputedExpression(
  1477                'idx_combine',
  1478                lambda s: func(s, **kwargs), [idx_func],
  1479                requires_partition_by=partitionings.Singleton(),
  1480                preserves_partition_by=partitionings.Singleton()))
  1481  
  1482    @frame_base.with_docs_from(pd.Series)
  1483    @frame_base.args_to_kwargs(pd.Series)
  1484    @frame_base.populate_defaults(pd.Series)
  1485    def idxmin(self, **kwargs):
  1486      return self._idxmaxmin_helper('idxmin', **kwargs)
  1487  
  1488    @frame_base.with_docs_from(pd.Series)
  1489    @frame_base.args_to_kwargs(pd.Series)
  1490    @frame_base.populate_defaults(pd.Series)
  1491    def idxmax(self, **kwargs):
  1492      return self._idxmaxmin_helper('idxmax', **kwargs)
  1493  
  1494    @frame_base.with_docs_from(pd.Series)
  1495    @frame_base.args_to_kwargs(pd.Series)
  1496    @frame_base.populate_defaults(pd.Series)
  1497    def explode(self, ignore_index):
  1498      # ignoring the index will not preserve it
  1499      preserves = (
  1500          partitionings.Singleton() if ignore_index else partitionings.Index())
  1501      return frame_base.DeferredFrame.wrap(
  1502          expressions.ComputedExpression(
  1503              'explode',
  1504              lambda s: s.explode(ignore_index), [self._expr],
  1505              preserves_partition_by=preserves,
  1506              requires_partition_by=partitionings.Arbitrary()))
  1507  
  1508    @frame_base.with_docs_from(pd.DataFrame)
  1509    def dot(self, other):
  1510      """``other`` must be a :class:`DeferredDataFrame` or :class:`DeferredSeries`
  1511      instance. Computing the dot product with an array-like is not supported
  1512      because it is order-sensitive."""
  1513      left = self._expr
  1514      if isinstance(other, DeferredSeries):
  1515        right = expressions.ComputedExpression(
  1516            'to_dataframe',
  1517            pd.DataFrame, [other._expr],
  1518            requires_partition_by=partitionings.Arbitrary(),
  1519            preserves_partition_by=partitionings.Arbitrary())
  1520        right_is_series = True
  1521      elif isinstance(other, DeferredDataFrame):
  1522        right = other._expr
  1523        right_is_series = False
  1524      else:
  1525        raise frame_base.WontImplementError(
  1526            "other must be a DeferredDataFrame or DeferredSeries instance. "
  1527            "Passing a concrete list or numpy array is not supported. Those "
  1528            "types have no index and must be joined based on the order of the "
  1529            "data.",
  1530            reason="order-sensitive")
  1531  
  1532      dots = expressions.ComputedExpression(
  1533          'dot',
  1534          # Transpose so we can sum across rows.
  1535          (lambda left, right: pd.DataFrame(left @ right).T),
  1536          [left, right],
  1537          requires_partition_by=partitionings.Index())
  1538      with expressions.allow_non_parallel_operations(True):
  1539        sums = expressions.ComputedExpression(
  1540            'sum',
  1541            lambda dots: dots.sum(),  #
  1542            [dots],
  1543            requires_partition_by=partitionings.Singleton())
  1544  
  1545        if right_is_series:
  1546          result = expressions.ComputedExpression(
  1547              'extract',
  1548              lambda df: df[0], [sums],
  1549              requires_partition_by=partitionings.Singleton())
  1550        else:
  1551          result = sums
  1552        return frame_base.DeferredFrame.wrap(result)
  1553  
  1554    __matmul__ = dot
  1555  
  1556    @frame_base.with_docs_from(pd.Series)
  1557    @frame_base.args_to_kwargs(pd.Series)
  1558    @frame_base.populate_defaults(pd.Series)
  1559    def nunique(self, **kwargs):
  1560      return self.drop_duplicates(keep="any").size
  1561  
  1562    @frame_base.with_docs_from(pd.Series)
  1563    @frame_base.args_to_kwargs(pd.Series)
  1564    @frame_base.populate_defaults(pd.Series)
  1565    def quantile(self, q, **kwargs):
  1566      """quantile is not parallelizable. See
  1567      `Issue 20933 <https://github.com/apache/beam/issues/20933>`_ tracking
  1568      the possible addition of an approximate, parallelizable implementation of
  1569      quantile."""
  1570      # TODO(https://github.com/apache/beam/issues/20933): Provide an option for
  1571      #  approximate distributed quantiles
  1572      requires = partitionings.Singleton(
  1573          reason=(
  1574              "Computing quantiles across index cannot currently be "
  1575              "parallelized. See https://github.com/apache/beam/issues/20933 "
  1576              "tracking the possible addition of an approximate, parallelizable "
  1577              "implementation of quantile."))
  1578  
  1579      return frame_base.DeferredFrame.wrap(
  1580          expressions.ComputedExpression(
  1581              'quantile',
  1582              lambda df: df.quantile(q=q, **kwargs), [self._expr],
  1583              requires_partition_by=requires,
  1584              preserves_partition_by=partitionings.Singleton()))
  1585  
  1586    @frame_base.with_docs_from(pd.Series)
  1587    def std(self, *args, **kwargs):
  1588      # Compute variance (deferred scalar) with same args, then sqrt it
  1589      return self.var(*args, **kwargs).apply(lambda var: math.sqrt(var))
  1590  
  1591    @frame_base.with_docs_from(pd.Series)
  1592    @frame_base.args_to_kwargs(pd.Series)
  1593    @frame_base.populate_defaults(pd.Series)
  1594    def mean(self, skipna, **kwargs):
  1595      if skipna:
  1596        size = self.count()
  1597      else:
  1598        size = self.length()
  1599  
  1600      return self.sum(skipna=skipna, **kwargs) / size
  1601  
  1602    @frame_base.with_docs_from(pd.Series)
  1603    @frame_base.args_to_kwargs(pd.Series)
  1604    @frame_base.populate_defaults(pd.Series)
  1605    def var(self, axis, skipna, level, ddof, **kwargs):
  1606      """Per-level aggregation is not yet supported
  1607      (https://github.com/apache/beam/issues/21829). Only the default,
  1608      ``level=None``, is allowed."""
  1609      if level is not None:
  1610        raise NotImplementedError("per-level aggregation")
  1611      if skipna is None or skipna:
  1612        self = self.dropna()  # pylint: disable=self-cls-assignment
  1613  
  1614      # See the online, numerically stable formulae at
  1615      # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
  1616      # and
  1617      # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
  1618      def compute_moments(x):
  1619        n = len(x)
  1620        m = x.std(ddof=0)**2 * n
  1621        s = x.sum()
  1622        return pd.DataFrame(dict(m=[m], s=[s], n=[n]))
  1623  
  1624      def combine_moments(data):
  1625        m = s = n = 0.0
  1626        for datum in data.itertuples():
  1627          if datum.n == 0:
  1628            continue
  1629          elif n == 0:
  1630            m, s, n = datum.m, datum.s, datum.n
  1631          else:
  1632            delta = s / n - datum.s / datum.n
  1633            m += datum.m + delta**2 * n * datum.n / (n + datum.n)
  1634            s += datum.s
  1635            n += datum.n
  1636        if n <= ddof:
  1637          return float('nan')
  1638        else:
  1639          return m / (n - ddof)
  1640  
  1641      moments = expressions.ComputedExpression(
  1642          'compute_moments',
  1643          compute_moments, [self._expr],
  1644          requires_partition_by=partitionings.Arbitrary())
  1645      with expressions.allow_non_parallel_operations(True):
  1646        return frame_base.DeferredFrame.wrap(
  1647            expressions.ComputedExpression(
  1648                'combine_moments',
  1649                combine_moments, [moments],
  1650                requires_partition_by=partitionings.Singleton()))
  1651  
  1652    @frame_base.with_docs_from(pd.Series)
  1653    @frame_base.args_to_kwargs(pd.Series)
  1654    @frame_base.populate_defaults(pd.Series)
  1655    def corr(self, other, method, min_periods):
  1656      """Only ``method='pearson'`` is currently parallelizable."""
  1657      if method == 'pearson':  # Note that this is the default.
  1658        x, y = self.dropna().align(other.dropna(), 'inner')
  1659        return x._corr_aligned(y, min_periods)
  1660  
  1661      else:
  1662        reason = (
  1663            f"Encountered corr(method={method!r}) which cannot be "
  1664            "parallelized. Only corr(method='pearson') is currently "
  1665            "parallelizable.")
  1666        # The rank-based correlations are not obviously parallelizable, though
  1667        # perhaps an approximation could be done with a knowledge of quantiles
  1668        # and custom partitioning.
  1669        return frame_base.DeferredFrame.wrap(
  1670            expressions.ComputedExpression(
  1671                'corr',
  1672                lambda df,
  1673                other: df.corr(other, method=method, min_periods=min_periods),
  1674                [self._expr, other._expr],
  1675                requires_partition_by=partitionings.Singleton(reason=reason)))
  1676  
  1677    @frame_base.with_docs_from(pd.Series)
  1678    @frame_base.args_to_kwargs(pd.Series)
  1679    @frame_base.populate_defaults(pd.Series)
  1680    def skew(self, axis, skipna, level, numeric_only, **kwargs):
  1681      if level is not None:
  1682        raise NotImplementedError("per-level aggregation")
  1683      if skipna is None or skipna:
  1684        self = self.dropna()  # pylint: disable=self-cls-assignment
  1685      # See the online, numerically stable formulae at
  1686      # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Higher-order_statistics
  1687      # Note that we are calculating the unbias (sample) version of skew here.
  1688      # See https://en.wikipedia.org/wiki/Skewness#Sample_skewness
  1689      # for more details.
  1690      def compute_moments(x):
  1691        n = len(x)
  1692        if n == 0:
  1693          m2, sum, m3 = 0, 0, 0
  1694        else:
  1695          m2 = x.std(ddof=0)**2 * n
  1696          sum = x.sum()
  1697          m3 = (((x - x.mean())**3).sum())
  1698        return pd.DataFrame(dict(m2=[m2], sum=[sum], n=[n], m3=[m3]))
  1699  
  1700      def combine_moments(data):
  1701        m2 = sum = n = m3 = 0.0
  1702        for datum in data.itertuples():
  1703          if datum.n == 0:
  1704            continue
  1705          elif n == 0:
  1706            m2, sum, n, m3 = datum.m2, datum.sum, datum.n, datum.m3
  1707          else:
  1708            n_a, n_b = datum.n, n
  1709            sum_a, sum_b = datum.sum, sum
  1710            m2_a, m2_b = datum.m2, m2
  1711            mean_a, mean_b = sum_a / n_a, sum_b / n_b
  1712            delta = mean_b - mean_a
  1713            combined_n = n_a + n_b
  1714            m3 += datum.m3 + (
  1715                (delta**3 * ((n_a * n_b) * (n_a - n_b)) / ((combined_n)**2)) +
  1716                ((3 * delta) * ((n_a * m2_b) - (n_b * m2_a)) / (combined_n)))
  1717            m2 += datum.m2 + delta**2 * n_b * n_a / combined_n
  1718            sum += datum.sum
  1719            n += datum.n
  1720  
  1721        if n < 3:
  1722          return float('nan')
  1723        elif m2 == 0:
  1724          return float(0)
  1725        else:
  1726          return combined_n * math.sqrt(combined_n - 1) / (combined_n -
  1727                                                           2) * m3 / (
  1728                                                               m2**(3 / 2))
  1729  
  1730      moments = expressions.ComputedExpression(
  1731          'compute_moments',
  1732          compute_moments, [self._expr],
  1733          requires_partition_by=partitionings.Arbitrary())
  1734      with expressions.allow_non_parallel_operations(True):
  1735        return frame_base.DeferredFrame.wrap(
  1736            expressions.ComputedExpression(
  1737                'combine_moments',
  1738                combine_moments, [moments],
  1739                requires_partition_by=partitionings.Singleton()))
  1740  
  1741    @frame_base.with_docs_from(pd.Series)
  1742    @frame_base.args_to_kwargs(pd.Series)
  1743    @frame_base.populate_defaults(pd.Series)
  1744    def kurtosis(self, axis, skipna, level, numeric_only, **kwargs):
  1745      if level is not None:
  1746        raise NotImplementedError("per-level aggregation")
  1747      if skipna is None or skipna:
  1748        self = self.dropna()  # pylint: disable=self-cls-assignment
  1749  
  1750      # See the online, numerically stable formulae at
  1751      # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Higher-order_statistics
  1752      # kurtosis here calculated as sample kurtosis
  1753      # https://en.wikipedia.org/wiki/Kurtosis#Sample_kurtosis
  1754      def compute_moments(x):
  1755        n = len(x)
  1756        if n == 0:
  1757          m2, sum, m3, m4 = 0, 0, 0, 0
  1758        else:
  1759          m2 = x.std(ddof=0)**2 * n
  1760          sum = x.sum()
  1761          m3 = (((x - x.mean())**3).sum())
  1762          m4 = (((x - x.mean())**4).sum())
  1763        return pd.DataFrame(dict(m2=[m2], sum=[sum], n=[n], m3=[m3], m4=[m4]))
  1764  
  1765      def combine_moments(data):
  1766        m2 = sum = n = m3 = m4 = 0.0
  1767        for datum in data.itertuples():
  1768          if datum.n == 0:
  1769            continue
  1770          elif n == 0:
  1771            m2, sum, n, m3, m4 = datum.m2, datum.sum, datum.n, datum.m3, datum.m4
  1772          else:
  1773            n_a, n_b = datum.n, n
  1774            m2_a, m2_b = datum.m2, m2
  1775            m3_a, m3_b = datum.m3, m3
  1776            sum_a, sum_b = datum.sum, sum
  1777            mean_a, mean_b = sum_a / n_a, sum_b / n_b
  1778            delta = mean_b - mean_a
  1779            combined_n = n_a + n_b
  1780            m4 += datum.m4 + ((delta**4) * (n_a * n_b) * (
  1781                (n_a**2) - (n_a * n_b) +
  1782                (n_b**2)) / combined_n**3) + ((6 * delta**2) * ((n_a**2 * m2_b) +
  1783                                                                (n_b**2 * m2_a)) /
  1784                                              (combined_n**2)) + ((4 * delta) *
  1785                                                                  ((n_a * m3_b) -
  1786                                                                   (n_b * m3_a)) /
  1787                                                                  (combined_n))
  1788            m3 += datum.m3 + (
  1789                (delta**3 * ((n_a * n_b) * (n_a - n_b)) / ((combined_n)**2)) +
  1790                ((3 * delta) * ((n_a * m2_b) - (n_b * m2_a)) / (combined_n)))
  1791            m2 += datum.m2 + delta**2 * n_b * n_a / combined_n
  1792            sum += datum.sum
  1793            n += datum.n
  1794  
  1795        if n < 4:
  1796          return float('nan')
  1797        elif m2 == 0:
  1798          return float(0)
  1799        else:
  1800          return (((combined_n + 1) * (combined_n) * (combined_n - 1)) /
  1801                  ((combined_n - 2) *
  1802                   (combined_n - 3))) * (m4 /
  1803                                         (m2)**2) - ((3 * (combined_n - 1)**2) /
  1804                                                     ((combined_n - 2) *
  1805                                                      (combined_n - 3)))
  1806  
  1807      moments = expressions.ComputedExpression(
  1808          'compute_moments',
  1809          compute_moments, [self._expr],
  1810          requires_partition_by=partitionings.Arbitrary())
  1811      with expressions.allow_non_parallel_operations(True):
  1812        return frame_base.DeferredFrame.wrap(
  1813            expressions.ComputedExpression(
  1814                'combine_moments',
  1815                combine_moments, [moments],
  1816                requires_partition_by=partitionings.Singleton()))
  1817  
  1818    @frame_base.with_docs_from(pd.Series)
  1819    def kurt(self, *args, **kwargs):
  1820      # Compute Kurtosis as kurt is an alias for kurtosis.
  1821      return self.kurtosis(*args, **kwargs)
  1822  
  1823    def _corr_aligned(self, other, min_periods):
  1824      std_x = self.std()
  1825      std_y = other.std()
  1826      cov = self._cov_aligned(other, min_periods)
  1827      return cov.apply(
  1828          lambda cov, std_x, std_y: cov / (std_x * std_y), args=[std_x, std_y])
  1829  
  1830    @frame_base.with_docs_from(pd.Series)
  1831    @frame_base.args_to_kwargs(pd.Series)
  1832    @frame_base.populate_defaults(pd.Series)
  1833    def cov(self, other, min_periods, ddof):
  1834      x, y = self.dropna().align(other.dropna(), 'inner')
  1835      return x._cov_aligned(y, min_periods, ddof)
  1836  
  1837    def _cov_aligned(self, other, min_periods, ddof=1):
  1838      # Use the formulae from
  1839      # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Covariance
  1840      def compute_co_moments(x, y):
  1841        n = len(x)
  1842        if n <= 1:
  1843          c = 0
  1844        else:
  1845          c = x.cov(y) * (n - 1)
  1846        sx = x.sum()
  1847        sy = y.sum()
  1848        return pd.DataFrame(dict(c=[c], sx=[sx], sy=[sy], n=[n]))
  1849  
  1850      def combine_co_moments(data):
  1851        c = sx = sy = n = 0.0
  1852        for datum in data.itertuples():
  1853          if datum.n == 0:
  1854            continue
  1855          elif n == 0:
  1856            c, sx, sy, n = datum.c, datum.sx, datum.sy, datum.n
  1857          else:
  1858            c += (
  1859                datum.c + (sx / n - datum.sx / datum.n) *
  1860                (sy / n - datum.sy / datum.n) * n * datum.n / (n + datum.n))
  1861            sx += datum.sx
  1862            sy += datum.sy
  1863            n += datum.n
  1864        if n < max(2, ddof, min_periods or 0):
  1865          return float('nan')
  1866        else:
  1867          return c / (n - ddof)
  1868  
  1869      moments = expressions.ComputedExpression(
  1870          'compute_co_moments',
  1871          compute_co_moments, [self._expr, other._expr],
  1872          requires_partition_by=partitionings.Index())
  1873  
  1874      with expressions.allow_non_parallel_operations(True):
  1875        return frame_base.DeferredFrame.wrap(
  1876            expressions.ComputedExpression(
  1877                'combine_co_moments',
  1878                combine_co_moments, [moments],
  1879                requires_partition_by=partitionings.Singleton()))
  1880  
  1881    @frame_base.with_docs_from(pd.Series)
  1882    @frame_base.args_to_kwargs(pd.Series)
  1883    @frame_base.populate_defaults(pd.Series)
  1884    @frame_base.maybe_inplace
  1885    def dropna(self, **kwargs):
  1886      return frame_base.DeferredFrame.wrap(
  1887          expressions.ComputedExpression(
  1888              'dropna',
  1889              lambda df: df.dropna(**kwargs), [self._expr],
  1890              preserves_partition_by=partitionings.Arbitrary(),
  1891              requires_partition_by=partitionings.Arbitrary()))
  1892  
  1893    @frame_base.with_docs_from(pd.Series)
  1894    @frame_base.args_to_kwargs(pd.Series)
  1895    @frame_base.populate_defaults(pd.Series)
  1896    @frame_base.maybe_inplace
  1897    def set_axis(self, labels, **kwargs):
  1898      # TODO: assigning the index is generally order-sensitive, but we could
  1899      # support it in some rare cases, e.g. when assigning the index from one
  1900      # of a DataFrame's columns
  1901      raise NotImplementedError(
  1902          "Assigning an index is not yet supported. "
  1903          "Consider using set_index() instead.")
  1904  
  1905    isnull = isna = frame_base._elementwise_method('isna', base=pd.Series)
  1906    notnull = notna = frame_base._elementwise_method('notna', base=pd.Series)
  1907  
  1908    items = frame_base.wont_implement_method(
  1909        pd.Series, 'items', reason="non-deferred-result")
  1910    iteritems = frame_base.wont_implement_method(
  1911        pd.Series, 'iteritems', reason="non-deferred-result")
  1912    tolist = frame_base.wont_implement_method(
  1913        pd.Series, 'tolist', reason="non-deferred-result")
  1914    to_numpy = frame_base.wont_implement_method(
  1915        pd.Series, 'to_numpy', reason="non-deferred-result")
  1916    to_string = frame_base.wont_implement_method(
  1917        pd.Series, 'to_string', reason="non-deferred-result")
  1918  
  1919    def _wrap_in_df(self):
  1920      return frame_base.DeferredFrame.wrap(
  1921          expressions.ComputedExpression(
  1922              'wrap_in_df',
  1923              lambda s: pd.DataFrame(s),
  1924              [self._expr],
  1925              requires_partition_by=partitionings.Arbitrary(),
  1926              preserves_partition_by=partitionings.Arbitrary(),
  1927          ))
  1928  
  1929    @frame_base.with_docs_from(pd.Series)
  1930    @frame_base.args_to_kwargs(pd.Series)
  1931    @frame_base.populate_defaults(pd.Series)
  1932    @frame_base.maybe_inplace
  1933    def duplicated(self, keep):
  1934      """Only ``keep=False`` and ``keep="any"`` are supported. Other values of
  1935      ``keep`` make this an order-sensitive operation. Note ``keep="any"`` is
  1936      a Beam-specific option that guarantees only one duplicate will be kept, but
  1937      unlike ``"first"`` and ``"last"`` it makes no guarantees about _which_
  1938      duplicate element is kept."""
  1939      # Re-use the DataFrame based duplcated, extract the series back out
  1940      df = self._wrap_in_df()
  1941  
  1942      return df.duplicated(keep=keep)[df.columns[0]]
  1943  
  1944    @frame_base.with_docs_from(pd.Series)
  1945    @frame_base.args_to_kwargs(pd.Series)
  1946    @frame_base.populate_defaults(pd.Series)
  1947    @frame_base.maybe_inplace
  1948    def drop_duplicates(self, keep):
  1949      """Only ``keep=False`` and ``keep="any"`` are supported. Other values of
  1950      ``keep`` make this an order-sensitive operation. Note ``keep="any"`` is
  1951      a Beam-specific option that guarantees only one duplicate will be kept, but
  1952      unlike ``"first"`` and ``"last"`` it makes no guarantees about _which_
  1953      duplicate element is kept."""
  1954      # Re-use the DataFrame based drop_duplicates, extract the series back out
  1955      df = self._wrap_in_df()
  1956  
  1957      return df.drop_duplicates(keep=keep)[df.columns[0]]
  1958  
  1959    @frame_base.with_docs_from(pd.Series)
  1960    @frame_base.args_to_kwargs(pd.Series)
  1961    @frame_base.populate_defaults(pd.Series)
  1962    @frame_base.maybe_inplace
  1963    def sample(self, **kwargs):
  1964      """Only ``n`` and/or ``weights`` may be specified.  ``frac``,
  1965      ``random_state``, and ``replace=True`` are not yet supported.
  1966      See `Issue 21010 <https://github.com/apache/beam/issues/21010>`_.
  1967  
  1968      Note that pandas will raise an error if ``n`` is larger than the length
  1969      of the dataset, while the Beam DataFrame API will simply return the full
  1970      dataset in that case."""
  1971  
  1972      # Re-use the DataFrame based sample, extract the series back out
  1973      df = self._wrap_in_df()
  1974  
  1975      return df.sample(**kwargs)[df.columns[0]]
  1976  
  1977    @frame_base.with_docs_from(pd.Series)
  1978    @frame_base.args_to_kwargs(pd.Series)
  1979    @frame_base.populate_defaults(pd.Series)
  1980    def aggregate(self, func, axis, *args, **kwargs):
  1981      """Some aggregation methods cannot be parallelized, and computing
  1982      them will require collecting all data on a single machine."""
  1983      if kwargs.get('skipna', False):
  1984        # Eagerly generate a proxy to make sure skipna is a valid argument
  1985        # for this aggregation method
  1986        _ = self._expr.proxy().aggregate(func, axis, *args, **kwargs)
  1987        kwargs.pop('skipna')
  1988        return self.dropna().aggregate(func, axis, *args, **kwargs)
  1989      if isinstance(func, list) and len(func) > 1:
  1990        # level arg is ignored for multiple aggregations
  1991        _ = kwargs.pop('level', None)
  1992  
  1993        # Aggregate with each method separately, then stick them all together.
  1994        rows = [self.agg([f], *args, **kwargs) for f in func]
  1995        return frame_base.DeferredFrame.wrap(
  1996            expressions.ComputedExpression(
  1997                'join_aggregate',
  1998                lambda *rows: pd.concat(rows), [row._expr for row in rows]))
  1999      else:
  2000        # We're only handling a single column. It could be 'func' or ['func'],
  2001        # which produce different results. 'func' produces a scalar, ['func']
  2002        # produces a single element Series.
  2003        base_func = func[0] if isinstance(func, list) else func
  2004  
  2005        if (_is_numeric(base_func) and
  2006            not pd.core.dtypes.common.is_numeric_dtype(self.dtype)):
  2007          warnings.warn(
  2008              f"Performing a numeric aggregation, {base_func!r}, on "
  2009              f"Series {self._expr.proxy().name!r} with non-numeric type "
  2010              f"{self.dtype!r}. This can result in runtime errors or surprising "
  2011              "results.")
  2012  
  2013        if 'level' in kwargs:
  2014          # Defer to groupby.agg for level= mode
  2015          return self.groupby(
  2016              level=kwargs.pop('level'), axis=axis).agg(func, *args, **kwargs)
  2017  
  2018        singleton_reason = None
  2019        if 'min_count' in kwargs:
  2020          # Eagerly generate a proxy to make sure min_count is a valid argument
  2021          # for this aggregation method
  2022          _ = self._expr.proxy().agg(func, axis, *args, **kwargs)
  2023  
  2024          singleton_reason = (
  2025              "Aggregation with min_count= requires collecting all data on a "
  2026              "single node.")
  2027  
  2028        # We have specialized distributed implementations for these
  2029        if base_func in HAND_IMPLEMENTED_GLOBAL_AGGREGATIONS:
  2030          result = getattr(self, base_func)(*args, **kwargs)
  2031          if isinstance(func, list):
  2032            with expressions.allow_non_parallel_operations(True):
  2033              return frame_base.DeferredFrame.wrap(
  2034                  expressions.ComputedExpression(
  2035                      f'wrap_aggregate_{base_func}',
  2036                      lambda x: pd.Series(x, index=[base_func]), [result._expr],
  2037                      requires_partition_by=partitionings.Singleton(),
  2038                      preserves_partition_by=partitionings.Singleton()))
  2039          else:
  2040            return result
  2041  
  2042        agg_kwargs = kwargs.copy()
  2043        if ((_is_associative(base_func) or _is_liftable_with_sum(base_func)) and
  2044            singleton_reason is None):
  2045          intermediate = expressions.ComputedExpression(
  2046              f'pre_aggregate_{base_func}',
  2047              # Coerce to a Series, if the result is scalar we still want a Series
  2048              # so we can combine and do the final aggregation next.
  2049              lambda s: pd.Series(s.agg(func, *args, **kwargs)),
  2050              [self._expr],
  2051              requires_partition_by=partitionings.Arbitrary(),
  2052              preserves_partition_by=partitionings.Singleton())
  2053          allow_nonparallel_final = True
  2054          if _is_associative(base_func):
  2055            agg_func = func
  2056          else:
  2057            agg_func = ['sum'] if isinstance(func, list) else 'sum'
  2058        else:
  2059          intermediate = self._expr
  2060          allow_nonparallel_final = None  # i.e. don't change the value
  2061          agg_func = func
  2062          singleton_reason = (
  2063              f"Aggregation function {func!r} cannot currently be "
  2064              "parallelized. It requires collecting all data for "
  2065              "this Series on a single node.")
  2066        with expressions.allow_non_parallel_operations(allow_nonparallel_final):
  2067          return frame_base.DeferredFrame.wrap(
  2068              expressions.ComputedExpression(
  2069                  f'post_aggregate_{base_func}',
  2070                  lambda s: s.agg(agg_func, *args, **agg_kwargs), [intermediate],
  2071                  preserves_partition_by=partitionings.Singleton(),
  2072                  requires_partition_by=partitionings.Singleton(
  2073                      reason=singleton_reason)))
  2074  
  2075    agg = aggregate
  2076  
  2077    @property  # type: ignore
  2078    @frame_base.with_docs_from(pd.Series)
  2079    def axes(self):
  2080      return [self.index]
  2081  
  2082    clip = frame_base._elementwise_method('clip', base=pd.Series)
  2083  
  2084    all = _agg_method(pd.Series, 'all')
  2085    any = _agg_method(pd.Series, 'any')
  2086    # TODO(BEAM-12074): Document that Series.count(level=) will drop NaN's
  2087    count = _agg_method(pd.Series, 'count')
  2088    describe = _agg_method(pd.Series, 'describe')
  2089    min = _agg_method(pd.Series, 'min')
  2090    max = _agg_method(pd.Series, 'max')
  2091    prod = product = _agg_method(pd.Series, 'prod')
  2092    sum = _agg_method(pd.Series, 'sum')
  2093    median = _agg_method(pd.Series, 'median')
  2094    sem = _agg_method(pd.Series, 'sem')
  2095    mad = _agg_method(pd.Series, 'mad')
  2096  
  2097    argmax = frame_base.wont_implement_method(
  2098        pd.Series, 'argmax', reason='order-sensitive')
  2099    argmin = frame_base.wont_implement_method(
  2100        pd.Series, 'argmin', reason='order-sensitive')
  2101    cummax = frame_base.wont_implement_method(
  2102        pd.Series, 'cummax', reason='order-sensitive')
  2103    cummin = frame_base.wont_implement_method(
  2104        pd.Series, 'cummin', reason='order-sensitive')
  2105    cumprod = frame_base.wont_implement_method(
  2106        pd.Series, 'cumprod', reason='order-sensitive')
  2107    cumsum = frame_base.wont_implement_method(
  2108        pd.Series, 'cumsum', reason='order-sensitive')
  2109    diff = frame_base.wont_implement_method(
  2110        pd.Series, 'diff', reason='order-sensitive')
  2111    interpolate = frame_base.wont_implement_method(
  2112        pd.Series, 'interpolate', reason='order-sensitive')
  2113    searchsorted = frame_base.wont_implement_method(
  2114        pd.Series, 'searchsorted', reason='order-sensitive')
  2115    shift = frame_base.wont_implement_method(
  2116        pd.Series, 'shift', reason='order-sensitive')
  2117    pct_change = frame_base.wont_implement_method(
  2118        pd.Series, 'pct_change', reason='order-sensitive')
  2119    is_monotonic = frame_base.wont_implement_method(
  2120        pd.Series, 'is_monotonic', reason='order-sensitive')
  2121    is_monotonic_increasing = frame_base.wont_implement_method(
  2122        pd.Series, 'is_monotonic_increasing', reason='order-sensitive')
  2123    is_monotonic_decreasing = frame_base.wont_implement_method(
  2124        pd.Series, 'is_monotonic_decreasing', reason='order-sensitive')
  2125    asof = frame_base.wont_implement_method(
  2126        pd.Series, 'asof', reason='order-sensitive')
  2127    first_valid_index = frame_base.wont_implement_method(
  2128        pd.Series, 'first_valid_index', reason='order-sensitive')
  2129    last_valid_index = frame_base.wont_implement_method(
  2130        pd.Series, 'last_valid_index', reason='order-sensitive')
  2131    autocorr = frame_base.wont_implement_method(
  2132        pd.Series, 'autocorr', reason='order-sensitive')
  2133    iat = property(
  2134        frame_base.wont_implement_method(
  2135            pd.Series, 'iat', reason='order-sensitive'))
  2136  
  2137    head = frame_base.wont_implement_method(
  2138        pd.Series, 'head', explanation=_PEEK_METHOD_EXPLANATION)
  2139    tail = frame_base.wont_implement_method(
  2140        pd.Series, 'tail', explanation=_PEEK_METHOD_EXPLANATION)
  2141  
  2142    filter = frame_base._elementwise_method('filter', base=pd.Series)
  2143  
  2144    memory_usage = frame_base.wont_implement_method(
  2145        pd.Series, 'memory_usage', reason="non-deferred-result")
  2146    nbytes = frame_base.wont_implement_method(
  2147        pd.Series, 'nbytes', reason="non-deferred-result")
  2148    to_list = frame_base.wont_implement_method(
  2149        pd.Series, 'to_list', reason="non-deferred-result")
  2150  
  2151    factorize = frame_base.wont_implement_method(
  2152        pd.Series, 'factorize', reason="non-deferred-columns")
  2153  
  2154    # In Series __contains__ checks the index
  2155    __contains__ = frame_base.wont_implement_method(
  2156        pd.Series, '__contains__', reason="non-deferred-result")
  2157  
  2158    @frame_base.with_docs_from(pd.Series)
  2159    @frame_base.args_to_kwargs(pd.Series)
  2160    @frame_base.populate_defaults(pd.Series)
  2161    def nlargest(self, keep, **kwargs):
  2162      """Only ``keep=False`` and ``keep="any"`` are supported. Other values of
  2163      ``keep`` make this an order-sensitive operation. Note ``keep="any"`` is
  2164      a Beam-specific option that guarantees only one duplicate will be kept, but
  2165      unlike ``"first"`` and ``"last"`` it makes no guarantees about _which_
  2166      duplicate element is kept."""
  2167      # TODO(robertwb): Document 'any' option.
  2168      # TODO(robertwb): Consider (conditionally) defaulting to 'any' if no
  2169      # explicit keep parameter is requested.
  2170      if keep == 'any':
  2171        keep = 'first'
  2172      elif keep != 'all':
  2173        raise frame_base.WontImplementError(
  2174            f"nlargest(keep={keep!r}) is not supported because it is "
  2175            "order sensitive. Only keep=\"all\" is supported.",
  2176            reason="order-sensitive")
  2177      kwargs['keep'] = keep
  2178      per_partition = expressions.ComputedExpression(
  2179          'nlargest-per-partition',
  2180          lambda df: df.nlargest(**kwargs), [self._expr],
  2181          preserves_partition_by=partitionings.Arbitrary(),
  2182          requires_partition_by=partitionings.Arbitrary())
  2183      with expressions.allow_non_parallel_operations(True):
  2184        return frame_base.DeferredFrame.wrap(
  2185            expressions.ComputedExpression(
  2186                'nlargest',
  2187                lambda df: df.nlargest(**kwargs), [per_partition],
  2188                preserves_partition_by=partitionings.Arbitrary(),
  2189                requires_partition_by=partitionings.Singleton()))
  2190  
  2191    @frame_base.with_docs_from(pd.Series)
  2192    @frame_base.args_to_kwargs(pd.Series)
  2193    @frame_base.populate_defaults(pd.Series)
  2194    def nsmallest(self, keep, **kwargs):
  2195      """Only ``keep=False`` and ``keep="any"`` are supported. Other values of
  2196      ``keep`` make this an order-sensitive operation. Note ``keep="any"`` is
  2197      a Beam-specific option that guarantees only one duplicate will be kept, but
  2198      unlike ``"first"`` and ``"last"`` it makes no guarantees about _which_
  2199      duplicate element is kept."""
  2200      if keep == 'any':
  2201        keep = 'first'
  2202      elif keep != 'all':
  2203        raise frame_base.WontImplementError(
  2204            f"nsmallest(keep={keep!r}) is not supported because it is "
  2205            "order sensitive. Only keep=\"all\" is supported.",
  2206            reason="order-sensitive")
  2207      kwargs['keep'] = keep
  2208      per_partition = expressions.ComputedExpression(
  2209          'nsmallest-per-partition',
  2210          lambda df: df.nsmallest(**kwargs), [self._expr],
  2211          preserves_partition_by=partitionings.Arbitrary(),
  2212          requires_partition_by=partitionings.Arbitrary())
  2213      with expressions.allow_non_parallel_operations(True):
  2214        return frame_base.DeferredFrame.wrap(
  2215            expressions.ComputedExpression(
  2216                'nsmallest',
  2217                lambda df: df.nsmallest(**kwargs), [per_partition],
  2218                preserves_partition_by=partitionings.Arbitrary(),
  2219                requires_partition_by=partitionings.Singleton()))
  2220  
  2221    @property  # type: ignore
  2222    @frame_base.with_docs_from(pd.Series)
  2223    def is_unique(self):
  2224      def set_index(s):
  2225        s = s[:]
  2226        s.index = s
  2227        return s
  2228  
  2229      self_index = expressions.ComputedExpression(
  2230          'set_index',
  2231          set_index, [self._expr],
  2232          requires_partition_by=partitionings.Arbitrary(),
  2233          preserves_partition_by=partitionings.Singleton())
  2234  
  2235      is_unique_distributed = expressions.ComputedExpression(
  2236          'is_unique_distributed',
  2237          lambda s: pd.Series(s.is_unique), [self_index],
  2238          requires_partition_by=partitionings.Index(),
  2239          preserves_partition_by=partitionings.Singleton())
  2240  
  2241      with expressions.allow_non_parallel_operations():
  2242        return frame_base.DeferredFrame.wrap(
  2243            expressions.ComputedExpression(
  2244                'combine',
  2245                lambda s: s.all(), [is_unique_distributed],
  2246                requires_partition_by=partitionings.Singleton(),
  2247                preserves_partition_by=partitionings.Singleton()))
  2248  
  2249    plot = frame_base.wont_implement_method(
  2250        pd.Series, 'plot', reason="plotting-tools")
  2251    pop = frame_base.wont_implement_method(
  2252        pd.Series, 'pop', reason="non-deferred-result")
  2253  
  2254    rename_axis = frame_base._elementwise_method('rename_axis', base=pd.Series)
  2255  
  2256    round = frame_base._elementwise_method('round', base=pd.Series)
  2257  
  2258    take = frame_base.wont_implement_method(
  2259        pd.Series, 'take', reason='deprecated')
  2260  
  2261    to_dict = frame_base.wont_implement_method(
  2262        pd.Series, 'to_dict', reason="non-deferred-result")
  2263  
  2264    to_frame = frame_base._elementwise_method('to_frame', base=pd.Series)
  2265  
  2266    @frame_base.with_docs_from(pd.Series)
  2267    def unique(self, as_series=False):
  2268      """unique is not supported by default because it produces a
  2269      non-deferred result: an :class:`~numpy.ndarray`. You can use the
  2270      Beam-specific argument ``unique(as_series=True)`` to get the result as
  2271      a :class:`DeferredSeries`"""
  2272  
  2273      if not as_series:
  2274        raise frame_base.WontImplementError(
  2275            "unique() is not supported by default because it produces a "
  2276            "non-deferred result: a numpy array. You can use the Beam-specific "
  2277            "argument unique(as_series=True) to get the result as a "
  2278            "DeferredSeries",
  2279            reason="non-deferred-result")
  2280      return frame_base.DeferredFrame.wrap(
  2281          expressions.ComputedExpression(
  2282              'unique',
  2283              lambda df: pd.Series(df.unique()), [self._expr],
  2284              preserves_partition_by=partitionings.Singleton(),
  2285              requires_partition_by=partitionings.Singleton(
  2286                  reason="unique() cannot currently be parallelized.")))
  2287  
  2288    @frame_base.with_docs_from(pd.Series)
  2289    def update(self, other):
  2290      self._expr = expressions.ComputedExpression(
  2291          'update',
  2292          lambda df,
  2293          other: df.update(other) or df, [self._expr, other._expr],
  2294          preserves_partition_by=partitionings.Arbitrary(),
  2295          requires_partition_by=partitionings.Index())
  2296  
  2297    @frame_base.with_docs_from(pd.Series)
  2298    def value_counts(
  2299        self,
  2300        sort=False,
  2301        normalize=False,
  2302        ascending=False,
  2303        bins=None,
  2304        dropna=True):
  2305      """``sort`` is ``False`` by default, and ``sort=True`` is not supported
  2306      because it imposes an ordering on the dataset which likely will not be
  2307      preserved.
  2308  
  2309      When ``bin`` is specified this operation is not parallelizable. See
  2310      [Issue 20903](https://github.com/apache/beam/issues/20903) tracking the
  2311      possible addition of a distributed implementation."""
  2312  
  2313      if sort:
  2314        raise frame_base.WontImplementError(
  2315            "value_counts(sort=True) is not supported because it imposes an "
  2316            "ordering on the dataset which likely will not be preserved.",
  2317            reason="order-sensitive")
  2318  
  2319      if bins is not None:
  2320        return frame_base.DeferredFrame.wrap(
  2321            expressions.ComputedExpression(
  2322                'value_counts',
  2323                lambda s: s.value_counts(
  2324                    normalize=normalize, bins=bins, dropna=dropna)[self._expr],
  2325                requires_partition_by=partitionings.Singleton(
  2326                    reason=(
  2327                        "value_counts with bin specified requires collecting "
  2328                        "the entire dataset to identify the range.")),
  2329                preserves_partition_by=partitionings.Singleton(),
  2330            ))
  2331  
  2332      if dropna:
  2333        column = self.dropna()
  2334      else:
  2335        column = self
  2336  
  2337      result = column.groupby(column, dropna=dropna).size()
  2338  
  2339      # groupby.size() names the index, which we don't need
  2340      result.index.name = None
  2341  
  2342      if normalize:
  2343        return result / column.length()
  2344      else:
  2345        return result
  2346  
  2347    values = property(
  2348        frame_base.wont_implement_method(
  2349            pd.Series, 'values', reason="non-deferred-result"))
  2350  
  2351    view = frame_base.wont_implement_method(
  2352        pd.Series,
  2353        'view',
  2354        explanation=(
  2355            "because it relies on memory-sharing semantics that are "
  2356            "not compatible with the Beam model."))
  2357  
  2358    @property  # type: ignore
  2359    @frame_base.with_docs_from(pd.Series)
  2360    def str(self):
  2361      return _DeferredStringMethods(self._expr)
  2362  
  2363    @property  # type: ignore
  2364    @frame_base.with_docs_from(pd.Series)
  2365    def cat(self):
  2366      return _DeferredCategoricalMethods(self._expr)
  2367  
  2368    @property  # type: ignore
  2369    @frame_base.with_docs_from(pd.Series)
  2370    def dt(self):
  2371      return _DeferredDatetimeMethods(self._expr)
  2372  
  2373    @frame_base.with_docs_from(pd.Series)
  2374    def mode(self, *args, **kwargs):
  2375      """mode is not currently parallelizable. An approximate,
  2376      parallelizable implementation of mode may be added in the future
  2377      (`Issue 20946 <https://github.com/apache/beam/issues/20946>`_)."""
  2378      return frame_base.DeferredFrame.wrap(
  2379          expressions.ComputedExpression(
  2380              'mode',
  2381              lambda df: df.mode(*args, **kwargs),
  2382              [self._expr],
  2383              #TODO(https://github.com/apache/beam/issues/20946):
  2384              # Can we add an approximate implementation?
  2385              requires_partition_by=partitionings.Singleton(
  2386                  reason=(
  2387                      "mode cannot currently be parallelized. See "
  2388                      "https://github.com/apache/beam/issues/20946 tracking the "
  2389                      "possble addition of an approximate, parallelizable "
  2390                      "implementation of mode.")),
  2391              preserves_partition_by=partitionings.Singleton()))
  2392  
  2393    apply = frame_base._elementwise_method('apply', base=pd.Series)
  2394    map = frame_base._elementwise_method('map', base=pd.Series)
  2395    # TODO(https://github.com/apache/beam/issues/20764): Implement transform
  2396    # using type inference to determine the proxy
  2397    #transform = frame_base._elementwise_method('transform', base=pd.Series)
  2398  
  2399    @frame_base.with_docs_from(pd.Series)
  2400    @frame_base.args_to_kwargs(pd.Series)
  2401    @frame_base.populate_defaults(pd.Series)
  2402    def repeat(self, repeats, axis):
  2403      """``repeats`` must be an ``int`` or a :class:`DeferredSeries`. Lists are
  2404      not supported because they make this operation order-sensitive."""
  2405      if isinstance(repeats, int):
  2406        return frame_base.DeferredFrame.wrap(
  2407            expressions.ComputedExpression(
  2408                'repeat',
  2409                lambda series: series.repeat(repeats), [self._expr],
  2410                requires_partition_by=partitionings.Arbitrary(),
  2411                preserves_partition_by=partitionings.Arbitrary()))
  2412      elif isinstance(repeats, frame_base.DeferredBase):
  2413        return frame_base.DeferredFrame.wrap(
  2414            expressions.ComputedExpression(
  2415                'repeat',
  2416                lambda series,
  2417                repeats_series: series.repeat(repeats_series),
  2418                [self._expr, repeats._expr],
  2419                requires_partition_by=partitionings.Index(),
  2420                preserves_partition_by=partitionings.Arbitrary()))
  2421      elif isinstance(repeats, list):
  2422        raise frame_base.WontImplementError(
  2423            "repeat(repeats=) repeats must be an int or a DeferredSeries. "
  2424            "Lists are not supported because they make this operation sensitive "
  2425            "to the order of the data.",
  2426            reason="order-sensitive")
  2427      else:
  2428        raise TypeError(
  2429            "repeat(repeats=) value must be an int or a "
  2430            f"DeferredSeries (encountered {type(repeats)}).")
  2431  
  2432    if hasattr(pd.Series, 'compare'):
  2433  
  2434      @frame_base.with_docs_from(pd.Series)
  2435      @frame_base.args_to_kwargs(pd.Series)
  2436      @frame_base.populate_defaults(pd.Series)
  2437      def compare(self, other, align_axis, **kwargs):
  2438  
  2439        if align_axis in ('index', 0):
  2440          preserves_partition = partitionings.Singleton()
  2441        elif align_axis in ('columns', 1):
  2442          preserves_partition = partitionings.Arbitrary()
  2443        else:
  2444          raise ValueError(
  2445              "align_axis must be one of ('index', 0, 'columns', 1). "
  2446              f"got {align_axis!r}.")
  2447  
  2448        return frame_base.DeferredFrame.wrap(
  2449            expressions.ComputedExpression(
  2450                'compare',
  2451                lambda s,
  2452                other: s.compare(other, align_axis, **kwargs),
  2453                [self._expr, other._expr],
  2454                requires_partition_by=partitionings.Index(),
  2455                preserves_partition_by=preserves_partition))
  2456  
  2457  
  2458  @populate_not_implemented(pd.DataFrame)
  2459  @frame_base.DeferredFrame._register_for(pd.DataFrame)
  2460  class DeferredDataFrame(DeferredDataFrameOrSeries):
  2461    def __repr__(self):
  2462      return (
  2463          f'DeferredDataFrame(columns={list(self.columns)}, '
  2464          f'{self._render_indexes()})')
  2465  
  2466    @property  # type: ignore
  2467    @frame_base.with_docs_from(pd.DataFrame)
  2468    def columns(self):
  2469      return self._expr.proxy().columns
  2470  
  2471    @columns.setter
  2472    def columns(self, columns):
  2473      def set_columns(df):
  2474        df = df.copy()
  2475        df.columns = columns
  2476        return df
  2477  
  2478      return frame_base.DeferredFrame.wrap(
  2479          expressions.ComputedExpression(
  2480              'set_columns',
  2481              set_columns, [self._expr],
  2482              requires_partition_by=partitionings.Arbitrary(),
  2483              preserves_partition_by=partitionings.Arbitrary()))
  2484  
  2485    @frame_base.with_docs_from(pd.DataFrame)
  2486    def keys(self):
  2487      return self.columns
  2488  
  2489    def __getattr__(self, name):
  2490      # Column attribute access.
  2491      if name in self._expr.proxy().columns:
  2492        return self[name]
  2493      else:
  2494        return object.__getattribute__(self, name)
  2495  
  2496    def __getitem__(self, key):
  2497      # TODO: Replicate pd.DataFrame.__getitem__ logic
  2498      if isinstance(key, DeferredSeries) and key._expr.proxy().dtype == bool:
  2499        return self.loc[key]
  2500  
  2501      elif isinstance(key, frame_base.DeferredBase):
  2502        # Fail early if key is a DeferredBase as it interacts surprisingly with
  2503        # key in self._expr.proxy().columns
  2504        raise NotImplementedError(
  2505            "Indexing with a non-bool deferred frame is not yet supported. "
  2506            "Consider using df.loc[...]")
  2507  
  2508      elif isinstance(key, slice):
  2509        if _is_null_slice(key):
  2510          return self
  2511        elif _is_integer_slice(key):
  2512          # This depends on the contents of the index.
  2513          raise frame_base.WontImplementError(
  2514              "Integer slices are not supported as they are ambiguous. Please "
  2515              "use iloc or loc with integer slices.")
  2516        else:
  2517          return self.loc[key]
  2518  
  2519      elif (
  2520          (isinstance(key, list) and all(key_column in self._expr.proxy().columns
  2521                                         for key_column in key)) or
  2522          key in self._expr.proxy().columns):
  2523        return self._elementwise(lambda df: df[key], 'get_column')
  2524  
  2525      else:
  2526        raise NotImplementedError(key)
  2527  
  2528    def __contains__(self, key):
  2529      # Checks if proxy has the given column
  2530      return self._expr.proxy().__contains__(key)
  2531  
  2532    def __setitem__(self, key, value):
  2533      if isinstance(
  2534          key, str) or (isinstance(key, list) and
  2535                        all(isinstance(c, str)
  2536                            for c in key)) or (isinstance(key, DeferredSeries) and
  2537                                               key._expr.proxy().dtype == bool):
  2538        # yapf: disable
  2539        return self._elementwise(
  2540            lambda df, key, value: df.__setitem__(key, value),
  2541            'set_column',
  2542            (key, value),
  2543            inplace=True)
  2544      else:
  2545        raise NotImplementedError(key)
  2546  
  2547    @frame_base.with_docs_from(pd.DataFrame)
  2548    @frame_base.args_to_kwargs(pd.DataFrame)
  2549    @frame_base.populate_defaults(pd.DataFrame)
  2550    def align(self, other, join, axis, copy, level, method, **kwargs):
  2551      """Aligning per level is not yet supported. Only the default,
  2552      ``level=None``, is allowed.
  2553  
  2554      Filling NaN values via ``method`` is not supported, because it is
  2555      `order-sensitive
  2556      <https://s.apache.org/dataframe-order-sensitive-operations>`_. Only the
  2557      default, ``method=None``, is allowed.
  2558  
  2559      ``copy=False`` is not supported because its behavior (whether or not it is
  2560      an inplace operation) depends on the data."""
  2561      if not copy:
  2562        raise frame_base.WontImplementError(
  2563            "align(copy=False) is not supported because it might be an inplace "
  2564            "operation depending on the data. Please prefer the default "
  2565            "align(copy=True).")
  2566      if method is not None:
  2567        raise frame_base.WontImplementError(
  2568            f"align(method={method!r}) is not supported because it is "
  2569            "order sensitive. Only align(method=None) is supported.",
  2570            reason="order-sensitive")
  2571      if kwargs:
  2572        raise NotImplementedError('align(%s)' % ', '.join(kwargs.keys()))
  2573  
  2574      if level is not None:
  2575        # Could probably get by partitioning on the used levels.
  2576        requires_partition_by = partitionings.Singleton(reason=(
  2577            f"align(level={level}) is not currently parallelizable. Only "
  2578            "align(level=None) can be parallelized."))
  2579      elif axis in ('columns', 1):
  2580        requires_partition_by = partitionings.Arbitrary()
  2581      else:
  2582        requires_partition_by = partitionings.Index()
  2583      return frame_base.DeferredFrame.wrap(
  2584          expressions.ComputedExpression(
  2585              'align',
  2586              lambda df, other: df.align(other, join=join, axis=axis),
  2587              [self._expr, other._expr],
  2588              requires_partition_by=requires_partition_by,
  2589              preserves_partition_by=partitionings.Arbitrary()))
  2590  
  2591    @frame_base.with_docs_from(pd.DataFrame)
  2592    @frame_base.args_to_kwargs(pd.DataFrame)
  2593    @frame_base.populate_defaults(pd.DataFrame)
  2594    def append(self, other, ignore_index, verify_integrity, sort, **kwargs):
  2595      """``ignore_index=True`` is not supported, because it requires generating an
  2596      order-sensitive index."""
  2597      if not isinstance(other, DeferredDataFrame):
  2598        raise frame_base.WontImplementError(
  2599            "append() only accepts DeferredDataFrame instances, received " +
  2600            str(type(other)))
  2601      if ignore_index:
  2602        raise frame_base.WontImplementError(
  2603            "append(ignore_index=True) is order sensitive because it requires "
  2604            "generating a new index based on the order of the data.",
  2605            reason="order-sensitive")
  2606  
  2607      if verify_integrity:
  2608        # We can verify the index is non-unique within index partitioned data.
  2609        requires = partitionings.Index()
  2610      else:
  2611        requires = partitionings.Arbitrary()
  2612  
  2613      return frame_base.DeferredFrame.wrap(
  2614          expressions.ComputedExpression(
  2615              'append',
  2616              lambda s, other: s.append(other, sort=sort,
  2617                                        verify_integrity=verify_integrity,
  2618                                        **kwargs),
  2619              [self._expr, other._expr],
  2620              requires_partition_by=requires,
  2621              preserves_partition_by=partitionings.Arbitrary()
  2622          )
  2623      )
  2624  
  2625    # If column name exists this is a simple project, otherwise it is a constant
  2626    # (default_value)
  2627    @frame_base.with_docs_from(pd.DataFrame)
  2628    def get(self, key, default_value=None):
  2629      if key in self.columns:
  2630        return self[key]
  2631      else:
  2632        return default_value
  2633  
  2634    @frame_base.with_docs_from(pd.DataFrame)
  2635    @frame_base.args_to_kwargs(pd.DataFrame)
  2636    @frame_base.populate_defaults(pd.DataFrame)
  2637    @frame_base.maybe_inplace
  2638    def set_index(self, keys, **kwargs):
  2639      """``keys`` must be a ``str`` or ``List[str]``. Passing an Index or Series
  2640      is not yet supported (`Issue 20759
  2641      <https://github.com/apache/beam/issues/20759>`_)."""
  2642      if isinstance(keys, str):
  2643        keys = [keys]
  2644  
  2645      if any(isinstance(k, (_DeferredIndex, frame_base.DeferredFrame))
  2646             for k in keys):
  2647        raise NotImplementedError("set_index with Index or Series instances is "
  2648                                  "not yet supported "
  2649                                  "(https://github.com/apache/beam/issues/20759)"
  2650                                  ".")
  2651  
  2652      return frame_base.DeferredFrame.wrap(
  2653        expressions.ComputedExpression(
  2654            'set_index',
  2655            lambda df: df.set_index(keys, **kwargs),
  2656            [self._expr],
  2657            requires_partition_by=partitionings.Arbitrary(),
  2658            preserves_partition_by=partitionings.Singleton()))
  2659  
  2660  
  2661    @frame_base.with_docs_from(pd.DataFrame)
  2662    @frame_base.args_to_kwargs(pd.DataFrame)
  2663    @frame_base.populate_defaults(pd.DataFrame)
  2664    @frame_base.maybe_inplace
  2665    def set_axis(self, labels, axis, **kwargs):
  2666      if axis in ('index', 0):
  2667        # TODO: assigning the index is generally order-sensitive, but we could
  2668        # support it in some rare cases, e.g. when assigning the index from one
  2669        # of a DataFrame's columns
  2670        raise NotImplementedError(
  2671            "Assigning an index is not yet supported. "
  2672            "Consider using set_index() instead.")
  2673      else:
  2674        return frame_base.DeferredFrame.wrap(
  2675            expressions.ComputedExpression(
  2676                'set_axis',
  2677                lambda df: df.set_axis(labels, axis, **kwargs),
  2678                [self._expr],
  2679                requires_partition_by=partitionings.Arbitrary(),
  2680                preserves_partition_by=partitionings.Arbitrary()))
  2681  
  2682  
  2683    @property  # type: ignore
  2684    @frame_base.with_docs_from(pd.DataFrame)
  2685    def axes(self):
  2686      return (self.index, self.columns)
  2687  
  2688    @property  # type: ignore
  2689    @frame_base.with_docs_from(pd.DataFrame)
  2690    def dtypes(self):
  2691      return self._expr.proxy().dtypes
  2692  
  2693    @frame_base.with_docs_from(pd.DataFrame)
  2694    def assign(self, **kwargs):
  2695      """``value`` must be a ``callable`` or :class:`DeferredSeries`. Other types
  2696      make this operation order-sensitive."""
  2697      for name, value in kwargs.items():
  2698        if not callable(value) and not isinstance(value, DeferredSeries):
  2699          raise frame_base.WontImplementError(
  2700              f"Unsupported value for new column '{name}': '{value}'. Only "
  2701              "callables and DeferredSeries instances are supported. Other types "
  2702              "make this operation sensitive to the order of the data",
  2703              reason="order-sensitive")
  2704      return self._elementwise(
  2705          lambda df, *args, **kwargs: df.assign(*args, **kwargs),
  2706          'assign',
  2707          other_kwargs=kwargs)
  2708  
  2709    @frame_base.with_docs_from(pd.DataFrame)
  2710    @frame_base.args_to_kwargs(pd.DataFrame)
  2711    @frame_base.populate_defaults(pd.DataFrame)
  2712    def explode(self, column, ignore_index):
  2713      # ignoring the index will not preserve it
  2714      preserves = (partitionings.Singleton() if ignore_index
  2715                   else partitionings.Index())
  2716      return frame_base.DeferredFrame.wrap(
  2717          expressions.ComputedExpression(
  2718              'explode',
  2719              lambda df: df.explode(column, ignore_index),
  2720              [self._expr],
  2721              preserves_partition_by=preserves,
  2722              requires_partition_by=partitionings.Arbitrary()))
  2723  
  2724    @frame_base.with_docs_from(pd.DataFrame)
  2725    @frame_base.args_to_kwargs(pd.DataFrame)
  2726    @frame_base.populate_defaults(pd.DataFrame)
  2727    def insert(self, value, **kwargs):
  2728      """``value`` cannot be a ``List`` because aligning it with this
  2729      DeferredDataFrame is order-sensitive."""
  2730      if isinstance(value, list):
  2731        raise frame_base.WontImplementError(
  2732            "insert(value=list) is not supported because it joins the input "
  2733            "list to the deferred DataFrame based on the order of the data.",
  2734            reason="order-sensitive")
  2735  
  2736      if isinstance(value, pd.core.generic.NDFrame):
  2737        value = frame_base.DeferredFrame.wrap(
  2738            expressions.ConstantExpression(value))
  2739  
  2740      if isinstance(value, frame_base.DeferredFrame):
  2741        def func_zip(df, value):
  2742          df = df.copy()
  2743          df.insert(value=value, **kwargs)
  2744          return df
  2745  
  2746        inserted = frame_base.DeferredFrame.wrap(
  2747            expressions.ComputedExpression(
  2748                'insert',
  2749                func_zip,
  2750                [self._expr, value._expr],
  2751                requires_partition_by=partitionings.Index(),
  2752                preserves_partition_by=partitionings.Arbitrary()))
  2753      else:
  2754        def func_elementwise(df):
  2755          df = df.copy()
  2756          df.insert(value=value, **kwargs)
  2757          return df
  2758        inserted = frame_base.DeferredFrame.wrap(
  2759            expressions.ComputedExpression(
  2760                'insert',
  2761                func_elementwise,
  2762                [self._expr],
  2763                requires_partition_by=partitionings.Arbitrary(),
  2764                preserves_partition_by=partitionings.Arbitrary()))
  2765  
  2766      self._expr = inserted._expr
  2767  
  2768    @staticmethod
  2769    @frame_base.with_docs_from(pd.DataFrame)
  2770    def from_dict(*args, **kwargs):
  2771      return frame_base.DeferredFrame.wrap(
  2772          expressions.ConstantExpression(pd.DataFrame.from_dict(*args, **kwargs)))
  2773  
  2774    @staticmethod
  2775    @frame_base.with_docs_from(pd.DataFrame)
  2776    def from_records(*args, **kwargs):
  2777      return frame_base.DeferredFrame.wrap(
  2778          expressions.ConstantExpression(pd.DataFrame.from_records(*args,
  2779                                                                   **kwargs)))
  2780  
  2781    @frame_base.with_docs_from(pd.DataFrame)
  2782    @frame_base.args_to_kwargs(pd.DataFrame)
  2783    @frame_base.populate_defaults(pd.DataFrame)
  2784    @frame_base.maybe_inplace
  2785    def duplicated(self, keep, subset):
  2786      """Only ``keep=False`` and ``keep="any"`` are supported. Other values of
  2787      ``keep`` make this an order-sensitive operation. Note ``keep="any"`` is
  2788      a Beam-specific option that guarantees only one duplicate will be kept, but
  2789      unlike ``"first"`` and ``"last"`` it makes no guarantees about _which_
  2790      duplicate element is kept."""
  2791      # TODO(BEAM-12074): Document keep="any"
  2792      if keep == 'any':
  2793        keep = 'first'
  2794      elif keep is not False:
  2795        raise frame_base.WontImplementError(
  2796            f"duplicated(keep={keep!r}) is not supported because it is "
  2797            "sensitive to the order of the data. Only keep=False and "
  2798            "keep=\"any\" are supported.",
  2799            reason="order-sensitive")
  2800  
  2801      by = subset or list(self.columns)
  2802  
  2803      return self.groupby(by).apply(
  2804          lambda df: pd.DataFrame(df.duplicated(keep=keep, subset=subset),
  2805                                  columns=[None]))[None].droplevel(by)
  2806  
  2807    @frame_base.with_docs_from(pd.DataFrame)
  2808    @frame_base.args_to_kwargs(pd.DataFrame)
  2809    @frame_base.populate_defaults(pd.DataFrame)
  2810    @frame_base.maybe_inplace
  2811    def drop_duplicates(self, keep, subset, ignore_index):
  2812      """Only ``keep=False`` and ``keep="any"`` are supported. Other values of
  2813      ``keep`` make this an order-sensitive operation. Note ``keep="any"`` is
  2814      a Beam-specific option that guarantees only one duplicate will be kept, but
  2815      unlike ``"first"`` and ``"last"`` it makes no guarantees about _which_
  2816      duplicate element is kept."""
  2817      # TODO(BEAM-12074): Document keep="any"
  2818      if keep == 'any':
  2819        keep = 'first'
  2820      elif keep is not False:
  2821        raise frame_base.WontImplementError(
  2822            f"drop_duplicates(keep={keep!r}) is not supported because it is "
  2823            "sensitive to the order of the data. Only keep=False and "
  2824            "keep=\"any\" are supported.",
  2825            reason="order-sensitive")
  2826  
  2827      if ignore_index is not False:
  2828        raise frame_base.WontImplementError(
  2829            "drop_duplicates(ignore_index=False) is not supported because it "
  2830            "requires generating a new index that is sensitive to the order of "
  2831            "the data.",
  2832            reason="order-sensitive")
  2833  
  2834      by = subset or list(self.columns)
  2835  
  2836      return self.groupby(by).apply(
  2837          lambda df: df.drop_duplicates(keep=keep, subset=subset)).droplevel(by)
  2838  
  2839    @frame_base.with_docs_from(pd.DataFrame)
  2840    @frame_base.args_to_kwargs(pd.DataFrame)
  2841    @frame_base.populate_defaults(pd.DataFrame)
  2842    def aggregate(self, func, axis, *args, **kwargs):
  2843      # We have specialized implementations for these.
  2844      if func in ('quantile',):
  2845        return getattr(self, func)(*args, axis=axis, **kwargs)
  2846  
  2847      # In pandas<1.3.0, maps to a property, args are ignored
  2848      if func in ('size',) and PD_VERSION < (1, 3):
  2849        return getattr(self, func)
  2850  
  2851      # We also have specialized distributed implementations for these. They only
  2852      # support axis=0 (implicitly) though. axis=1 should fall through
  2853      if func in ('corr', 'cov') and axis in (0, 'index'):
  2854        return getattr(self, func)(*args, **kwargs)
  2855  
  2856      if axis is None:
  2857        # Aggregate across all elements by first aggregating across columns,
  2858        # then across rows.
  2859        return self.agg(func, *args, **dict(kwargs, axis=1)).agg(
  2860            func, *args, **dict(kwargs, axis=0))
  2861      elif axis in (1, 'columns'):
  2862        # This is an easy elementwise aggregation.
  2863        return frame_base.DeferredFrame.wrap(
  2864            expressions.ComputedExpression(
  2865                'aggregate',
  2866                lambda df: df.agg(func, axis=1, *args, **kwargs),
  2867                [self._expr],
  2868                requires_partition_by=partitionings.Arbitrary()))
  2869      elif len(self._expr.proxy().columns) == 0:
  2870        # For this corner case, just colocate everything.
  2871        return frame_base.DeferredFrame.wrap(
  2872          expressions.ComputedExpression(
  2873              'aggregate',
  2874              lambda df: df.agg(func, *args, **kwargs),
  2875              [self._expr],
  2876              requires_partition_by=partitionings.Singleton()))
  2877      else:
  2878        # In the general case, we will compute the aggregation of each column
  2879        # separately, then recombine.
  2880  
  2881        # First, handle any kwargs that cause a projection, by eagerly generating
  2882        # the proxy, and only including the columns that are in the output.
  2883        PROJECT_KWARGS = ('numeric_only', 'bool_only', 'include', 'exclude')
  2884        proxy = self._expr.proxy().agg(func, axis, *args, **kwargs)
  2885  
  2886        if isinstance(proxy, pd.DataFrame):
  2887          projected = self[list(proxy.columns)]
  2888        elif isinstance(proxy, pd.Series):
  2889          projected = self[list(proxy.index)]
  2890        else:
  2891          projected = self
  2892  
  2893        nonnumeric_columns = [name for (name, dtype) in projected.dtypes.items()
  2894                              if not
  2895                              pd.core.dtypes.common.is_numeric_dtype(dtype)]
  2896  
  2897        if _is_numeric(func) and nonnumeric_columns:
  2898          if 'numeric_only' in kwargs and kwargs['numeric_only'] is False:
  2899            # User has opted in to execution with non-numeric columns, they
  2900            # will accept runtime errors
  2901            pass
  2902          else:
  2903            raise frame_base.WontImplementError(
  2904                f"Numeric aggregation ({func!r}) on a DataFrame containing "
  2905                f"non-numeric columns ({*nonnumeric_columns,!r} is not "
  2906                "supported, unless `numeric_only=` is specified.\n"
  2907                "Use `numeric_only=True` to only aggregate over numeric "
  2908                "columns.\nUse `numeric_only=False` to aggregate over all "
  2909                "columns. Note this is not recommended, as it could result in "
  2910                "execution time errors.")
  2911  
  2912        for key in PROJECT_KWARGS:
  2913          if key in kwargs:
  2914            kwargs.pop(key)
  2915  
  2916        if not isinstance(func, dict):
  2917          col_names = list(projected._expr.proxy().columns)
  2918          func_by_col = {col: func for col in col_names}
  2919        else:
  2920          func_by_col = func
  2921          col_names = list(func.keys())
  2922        aggregated_cols = []
  2923        has_lists = any(isinstance(f, list) for f in func_by_col.values())
  2924        for col in col_names:
  2925          funcs = func_by_col[col]
  2926          if has_lists and not isinstance(funcs, list):
  2927            # If any of the columns do multiple aggregations, they all must use
  2928            # "list" style output
  2929            funcs = [funcs]
  2930          aggregated_cols.append(projected[col].agg(funcs, *args, **kwargs))
  2931        # The final shape is different depending on whether any of the columns
  2932        # were aggregated by a list of aggregators.
  2933        with expressions.allow_non_parallel_operations():
  2934          if isinstance(proxy, pd.Series):
  2935            return frame_base.DeferredFrame.wrap(
  2936              expressions.ComputedExpression(
  2937                  'join_aggregate',
  2938                    lambda *cols: pd.Series(
  2939                        {col: value for col, value in zip(col_names, cols)}),
  2940                  [col._expr for col in aggregated_cols],
  2941                  requires_partition_by=partitionings.Singleton()))
  2942          elif isinstance(proxy, pd.DataFrame):
  2943            return frame_base.DeferredFrame.wrap(
  2944                expressions.ComputedExpression(
  2945                    'join_aggregate',
  2946                    lambda *cols: pd.DataFrame(
  2947                        {col: value for col, value in zip(col_names, cols)}),
  2948                    [col._expr for col in aggregated_cols],
  2949                    requires_partition_by=partitionings.Singleton()))
  2950          else:
  2951            raise AssertionError("Unexpected proxy type for "
  2952                                 f"DataFrame.aggregate!: proxy={proxy!r}, "
  2953                                 f"type(proxy)={type(proxy)!r}")
  2954  
  2955    agg = aggregate
  2956  
  2957    applymap = frame_base._elementwise_method('applymap', base=pd.DataFrame)
  2958    add_prefix = frame_base._elementwise_method('add_prefix', base=pd.DataFrame)
  2959    add_suffix = frame_base._elementwise_method('add_suffix', base=pd.DataFrame)
  2960  
  2961    memory_usage = frame_base.wont_implement_method(
  2962        pd.DataFrame, 'memory_usage', reason="non-deferred-result")
  2963    info = frame_base.wont_implement_method(
  2964        pd.DataFrame, 'info', reason="non-deferred-result")
  2965  
  2966  
  2967    @frame_base.with_docs_from(pd.DataFrame)
  2968    @frame_base.args_to_kwargs(pd.DataFrame)
  2969    @frame_base.populate_defaults(pd.DataFrame)
  2970    @frame_base.maybe_inplace
  2971    def clip(self, axis, **kwargs):
  2972      """``lower`` and ``upper`` must be :class:`DeferredSeries` instances, or
  2973      constants.  Array-like arguments are not supported because they are
  2974      order-sensitive."""
  2975  
  2976      if any(isinstance(kwargs.get(arg, None), frame_base.DeferredFrame)
  2977             for arg in ('upper', 'lower')) and axis not in (0, 'index'):
  2978        raise frame_base.WontImplementError(
  2979            "axis must be 'index' when upper and/or lower are a DeferredFrame",
  2980            reason='order-sensitive')
  2981  
  2982      return frame_base._elementwise_method('clip', base=pd.DataFrame)(self,
  2983                                                                       axis=axis,
  2984                                                                       **kwargs)
  2985  
  2986    @frame_base.with_docs_from(pd.DataFrame)
  2987    @frame_base.args_to_kwargs(pd.DataFrame)
  2988    @frame_base.populate_defaults(pd.DataFrame)
  2989    def corr(self, method, min_periods):
  2990      """Only ``method="pearson"`` can be parallelized. Other methods require
  2991      collecting all data on a single worker (see
  2992      https://s.apache.org/dataframe-non-parallel-operations for details).
  2993      """
  2994      if method == 'pearson':
  2995        proxy = self._expr.proxy().corr()
  2996        columns = list(proxy.columns)
  2997        args = []
  2998        arg_indices = []
  2999        for col1, col2 in itertools.combinations(columns, 2):
  3000          arg_indices.append((col1, col2))
  3001          args.append(self[col1].corr(self[col2], method=method,
  3002                                      min_periods=min_periods))
  3003        def fill_matrix(*args):
  3004          data = collections.defaultdict(dict)
  3005          for col in columns:
  3006            data[col][col] = 1.0
  3007          for ix, (col1, col2) in enumerate(arg_indices):
  3008            data[col1][col2] = data[col2][col1] = args[ix]
  3009          return pd.DataFrame(data, columns=columns, index=columns)
  3010        with expressions.allow_non_parallel_operations(True):
  3011          return frame_base.DeferredFrame.wrap(
  3012              expressions.ComputedExpression(
  3013                  'fill_matrix',
  3014                  fill_matrix,
  3015                  [arg._expr for arg in args],
  3016                  requires_partition_by=partitionings.Singleton(),
  3017                  proxy=proxy))
  3018  
  3019      else:
  3020        reason = (f"Encountered corr(method={method!r}) which cannot be "
  3021                  "parallelized. Only corr(method='pearson') is currently "
  3022                  "parallelizable.")
  3023        return frame_base.DeferredFrame.wrap(
  3024            expressions.ComputedExpression(
  3025                'corr',
  3026                lambda df: df.corr(method=method, min_periods=min_periods),
  3027                [self._expr],
  3028                requires_partition_by=partitionings.Singleton(reason=reason)))
  3029  
  3030    @frame_base.with_docs_from(pd.DataFrame)
  3031    @frame_base.args_to_kwargs(pd.DataFrame)
  3032    @frame_base.populate_defaults(pd.DataFrame)
  3033    def cov(self, min_periods, ddof):
  3034      proxy = self._expr.proxy().corr()
  3035      columns = list(proxy.columns)
  3036      args = []
  3037      arg_indices = []
  3038      for col in columns:
  3039        arg_indices.append((col, col))
  3040        std = self[col].std(ddof)
  3041        args.append(std.apply(lambda x: x*x, 'square'))
  3042      for ix, col1 in enumerate(columns):
  3043        for col2 in columns[ix+1:]:
  3044          arg_indices.append((col1, col2))
  3045          # Note that this set may be different for each pair.
  3046          no_na = self.loc[self[col1].notna() & self[col2].notna()]
  3047          args.append(no_na[col1]._cov_aligned(no_na[col2], min_periods, ddof))
  3048      def fill_matrix(*args):
  3049        data = collections.defaultdict(dict)
  3050        for ix, (col1, col2) in enumerate(arg_indices):
  3051          data[col1][col2] = data[col2][col1] = args[ix]
  3052        return pd.DataFrame(data, columns=columns, index=columns)
  3053      with expressions.allow_non_parallel_operations(True):
  3054        return frame_base.DeferredFrame.wrap(
  3055            expressions.ComputedExpression(
  3056                'fill_matrix',
  3057                fill_matrix,
  3058                [arg._expr for arg in args],
  3059                requires_partition_by=partitionings.Singleton(),
  3060                proxy=proxy))
  3061  
  3062    @frame_base.with_docs_from(pd.DataFrame)
  3063    @frame_base.args_to_kwargs(pd.DataFrame)
  3064    @frame_base.populate_defaults(pd.DataFrame)
  3065    def corrwith(self, other, axis, drop, method):
  3066      if axis in (1, 'columns'):
  3067        return self._elementwise(
  3068            lambda df, other: df.corrwith(other, axis=axis, drop=drop,
  3069                                          method=method),
  3070            'corrwith',
  3071            other_args=(other,))
  3072  
  3073  
  3074      if not isinstance(other, frame_base.DeferredFrame):
  3075        other = frame_base.DeferredFrame.wrap(
  3076            expressions.ConstantExpression(other))
  3077  
  3078      if isinstance(other, DeferredSeries):
  3079        proxy = self._expr.proxy().corrwith(other._expr.proxy(), axis=axis,
  3080                                            drop=drop, method=method)
  3081        self, other = self.align(other, axis=0, join='inner')
  3082        col_names = proxy.index
  3083        other_cols = [other] * len(col_names)
  3084      elif isinstance(other, DeferredDataFrame):
  3085        proxy = self._expr.proxy().corrwith(
  3086            other._expr.proxy(), axis=axis, method=method, drop=drop)
  3087        self, other = self.align(other, axis=0, join='inner')
  3088        col_names = list(
  3089            set(self.columns)
  3090            .intersection(other.columns)
  3091            .intersection(proxy.index))
  3092        other_cols = [other[col_name] for col_name in col_names]
  3093      else:
  3094        # Raise the right error.
  3095        self._expr.proxy().corrwith(other._expr.proxy(), axis=axis, drop=drop,
  3096                                    method=method)
  3097  
  3098        # Just in case something else becomes valid.
  3099        raise NotImplementedError('corrwith(%s)' % type(other._expr.proxy))
  3100  
  3101      # Generate expressions to compute the actual correlations.
  3102      corrs = [
  3103          self[col_name].corr(other_col, method)
  3104          for col_name, other_col in zip(col_names, other_cols)]
  3105  
  3106      # Combine the results
  3107      def fill_dataframe(*args):
  3108        result = proxy.copy(deep=True)
  3109        for col, value in zip(proxy.index, args):
  3110          result[col] = value
  3111        return result
  3112      with expressions.allow_non_parallel_operations(True):
  3113        return frame_base.DeferredFrame.wrap(
  3114          expressions.ComputedExpression(
  3115            'fill_dataframe',
  3116            fill_dataframe,
  3117            [corr._expr for corr in corrs],
  3118            requires_partition_by=partitionings.Singleton(),
  3119            proxy=proxy))
  3120  
  3121    cummax = frame_base.wont_implement_method(pd.DataFrame, 'cummax',
  3122                                              reason='order-sensitive')
  3123    cummin = frame_base.wont_implement_method(pd.DataFrame, 'cummin',
  3124                                              reason='order-sensitive')
  3125    cumprod = frame_base.wont_implement_method(pd.DataFrame, 'cumprod',
  3126                                               reason='order-sensitive')
  3127    cumsum = frame_base.wont_implement_method(pd.DataFrame, 'cumsum',
  3128                                              reason='order-sensitive')
  3129    # TODO(BEAM-12071): Consider adding an order-insensitive implementation for
  3130    # diff that relies on the index
  3131    diff = frame_base.wont_implement_method(pd.DataFrame, 'diff',
  3132                                            reason='order-sensitive')
  3133    interpolate = frame_base.wont_implement_method(pd.DataFrame, 'interpolate',
  3134                                                   reason='order-sensitive')
  3135  
  3136    pct_change = frame_base.wont_implement_method(
  3137        pd.DataFrame, 'pct_change', reason='order-sensitive')
  3138    asof = frame_base.wont_implement_method(
  3139        pd.DataFrame, 'asof', reason='order-sensitive')
  3140    first_valid_index = frame_base.wont_implement_method(
  3141        pd.DataFrame, 'first_valid_index', reason='order-sensitive')
  3142    last_valid_index = frame_base.wont_implement_method(
  3143        pd.DataFrame, 'last_valid_index', reason='order-sensitive')
  3144    iat = property(frame_base.wont_implement_method(
  3145        pd.DataFrame, 'iat', reason='order-sensitive'))
  3146  
  3147    lookup = frame_base.wont_implement_method(
  3148        pd.DataFrame, 'lookup', reason='deprecated')
  3149  
  3150    head = frame_base.wont_implement_method(pd.DataFrame, 'head',
  3151        explanation=_PEEK_METHOD_EXPLANATION)
  3152    tail = frame_base.wont_implement_method(pd.DataFrame, 'tail',
  3153        explanation=_PEEK_METHOD_EXPLANATION)
  3154  
  3155    @frame_base.with_docs_from(pd.DataFrame)
  3156    @frame_base.args_to_kwargs(pd.DataFrame)
  3157    @frame_base.populate_defaults(pd.DataFrame)
  3158    def sample(self, n, frac, replace, weights, random_state, axis):
  3159      """When ``axis='index'``, only ``n`` and/or ``weights`` may be specified.
  3160      ``frac``, ``random_state``, and ``replace=True`` are not yet supported.
  3161      See `Issue 21010 <https://github.com/apache/beam/issues/21010>`_.
  3162  
  3163      Note that pandas will raise an error if ``n`` is larger than the length
  3164      of the dataset, while the Beam DataFrame API will simply return the full
  3165      dataset in that case.
  3166  
  3167      sample is fully supported for axis='columns'."""
  3168      if axis in (1, 'columns'):
  3169        # Sampling on axis=columns just means projecting random columns
  3170        # Eagerly generate proxy to determine the set of columns at construction
  3171        # time
  3172        proxy = self._expr.proxy().sample(n=n, frac=frac, replace=replace,
  3173                                          weights=weights,
  3174                                          random_state=random_state, axis=axis)
  3175        # Then do the projection
  3176        return self[list(proxy.columns)]
  3177  
  3178      # axis='index'
  3179      if frac is not None or random_state is not None or replace:
  3180        raise NotImplementedError(
  3181            f"When axis={axis!r}, only n and/or weights may be specified. "
  3182            "frac, random_state, and replace=True are not yet supported "
  3183            f"(got frac={frac!r}, random_state={random_state!r}, "
  3184            f"replace={replace!r}). See "
  3185            "https://github.com/apache/beam/issues/21010.")
  3186  
  3187      if n is None:
  3188        n = 1
  3189  
  3190      if isinstance(weights, str):
  3191        weights = self[weights]
  3192  
  3193      tmp_weight_column_name = "___Beam_DataFrame_weights___"
  3194  
  3195      if weights is None:
  3196        self_with_randomized_weights = frame_base.DeferredFrame.wrap(
  3197            expressions.ComputedExpression(
  3198            'randomized_weights',
  3199            lambda df: df.assign(**{tmp_weight_column_name:
  3200                                    np.random.rand(len(df))}),
  3201            [self._expr],
  3202            requires_partition_by=partitionings.Arbitrary(),
  3203            preserves_partition_by=partitionings.Arbitrary()))
  3204      else:
  3205        # See "Fast Parallel Weighted Random Sampling" by Efraimidis and Spirakis
  3206        # https://www.cti.gr/images_gr/reports/99-06-02.ps
  3207        def assign_randomized_weights(df, weights):
  3208          non_zero_weights = (weights > 0) | pd.Series(dtype=bool, index=df.index)
  3209          df = df.loc[non_zero_weights]
  3210          weights = weights.loc[non_zero_weights]
  3211          random_weights = np.log(np.random.rand(len(weights))) / weights
  3212          return df.assign(**{tmp_weight_column_name: random_weights})
  3213        self_with_randomized_weights = frame_base.DeferredFrame.wrap(
  3214            expressions.ComputedExpression(
  3215            'randomized_weights',
  3216            assign_randomized_weights,
  3217            [self._expr, weights._expr],
  3218            requires_partition_by=partitionings.Index(),
  3219            preserves_partition_by=partitionings.Arbitrary()))
  3220  
  3221      return self_with_randomized_weights.nlargest(
  3222          n=n, columns=tmp_weight_column_name, keep='any').drop(
  3223              tmp_weight_column_name, axis=1)
  3224  
  3225    @frame_base.with_docs_from(pd.DataFrame)
  3226    def dot(self, other):
  3227      # We want to broadcast the right hand side to all partitions of the left.
  3228      # This is OK, as its index must be the same size as the columns set of self,
  3229      # so cannot be too large.
  3230      class AsScalar(object):
  3231        def __init__(self, value):
  3232          self.value = value
  3233  
  3234      if isinstance(other, frame_base.DeferredFrame):
  3235        proxy = other._expr.proxy()
  3236        with expressions.allow_non_parallel_operations():
  3237          side = expressions.ComputedExpression(
  3238              'as_scalar',
  3239              lambda df: AsScalar(df),
  3240              [other._expr],
  3241              requires_partition_by=partitionings.Singleton())
  3242      else:
  3243        proxy = pd.DataFrame(columns=range(len(other[0])))
  3244        side = expressions.ConstantExpression(AsScalar(other))
  3245  
  3246      return frame_base.DeferredFrame.wrap(
  3247          expressions.ComputedExpression(
  3248              'dot',
  3249              lambda left, right: left @ right.value,
  3250              [self._expr, side],
  3251              requires_partition_by=partitionings.Arbitrary(),
  3252              preserves_partition_by=partitionings.Arbitrary(),
  3253              proxy=proxy))
  3254  
  3255    __matmul__ = dot
  3256  
  3257    @frame_base.with_docs_from(pd.DataFrame)
  3258    def mode(self, axis=0, *args, **kwargs):
  3259      """mode with axis="columns" is not implemented because it produces
  3260      non-deferred columns.
  3261  
  3262      mode with axis="index" is not currently parallelizable. An approximate,
  3263      parallelizable implementation of mode may be added in the future
  3264      (`Issue 20946 <https://github.com/apache/beam/issues/20946>`_)."""
  3265  
  3266      if axis == 1 or axis == 'columns':
  3267        # Number of columns is max(number mode values for each row), so we can't
  3268        # determine how many there will be before looking at the data.
  3269        raise frame_base.WontImplementError(
  3270            "mode(axis=columns) is not supported because it produces a variable "
  3271            "number of columns depending on the data.",
  3272            reason="non-deferred-columns")
  3273      return frame_base.DeferredFrame.wrap(
  3274          expressions.ComputedExpression(
  3275              'mode',
  3276              lambda df: df.mode(*args, **kwargs),
  3277              [self._expr],
  3278              #TODO(https://github.com/apache/beam/issues/20946):
  3279              # Can we add an approximate implementation?
  3280              requires_partition_by=partitionings.Singleton(reason=(
  3281                  "mode(axis='index') cannot currently be parallelized. See "
  3282                  "https://github.com/apache/beam/issues/20946 tracking the "
  3283                  "possble addition of an approximate, parallelizable "
  3284                  "implementation of mode."
  3285              )),
  3286              preserves_partition_by=partitionings.Singleton()))
  3287  
  3288    @frame_base.with_docs_from(pd.DataFrame)
  3289    @frame_base.args_to_kwargs(pd.DataFrame)
  3290    @frame_base.populate_defaults(pd.DataFrame)
  3291    @frame_base.maybe_inplace
  3292    def dropna(self, axis, **kwargs):
  3293      """dropna with axis="columns" specified cannot be parallelized."""
  3294      # TODO(robertwb): This is a common pattern. Generalize?
  3295      if axis in (1, 'columns'):
  3296        requires_partition_by = partitionings.Singleton(reason=(
  3297            "dropna(axis=1) cannot currently be parallelized. It requires "
  3298            "checking all values in each column for NaN values, to determine "
  3299            "if that column should be dropped."
  3300        ))
  3301      else:
  3302        requires_partition_by = partitionings.Arbitrary()
  3303      return frame_base.DeferredFrame.wrap(
  3304          expressions.ComputedExpression(
  3305              'dropna',
  3306              lambda df: df.dropna(axis=axis, **kwargs),
  3307              [self._expr],
  3308              preserves_partition_by=partitionings.Arbitrary(),
  3309              requires_partition_by=requires_partition_by))
  3310  
  3311    def _eval_or_query(self, name, expr, inplace, **kwargs):
  3312      for key in ('local_dict', 'global_dict', 'level', 'target', 'resolvers'):
  3313        if key in kwargs:
  3314          raise NotImplementedError(f"Setting '{key}' is not yet supported")
  3315  
  3316      # look for '@<py identifier>'
  3317      if re.search(r'\@[^\d\W]\w*', expr, re.UNICODE):
  3318        raise NotImplementedError("Accessing locals with @ is not yet supported "
  3319                                  "(https://github.com/apache/beam/issues/20626)"
  3320                                  )
  3321  
  3322      result_expr = expressions.ComputedExpression(
  3323          name,
  3324          lambda df: getattr(df, name)(expr, **kwargs),
  3325          [self._expr],
  3326          requires_partition_by=partitionings.Arbitrary(),
  3327          preserves_partition_by=partitionings.Arbitrary())
  3328  
  3329      if inplace:
  3330        self._expr = result_expr
  3331      else:
  3332        return frame_base.DeferredFrame.wrap(result_expr)
  3333  
  3334  
  3335    @frame_base.with_docs_from(pd.DataFrame)
  3336    @frame_base.args_to_kwargs(pd.DataFrame)
  3337    @frame_base.populate_defaults(pd.DataFrame)
  3338    def eval(self, expr, inplace, **kwargs):
  3339      """Accessing local variables with ``@<varname>`` is not yet supported
  3340      (`Issue 20626 <https://github.com/apache/beam/issues/20626>`_).
  3341  
  3342      Arguments ``local_dict``, ``global_dict``, ``level``, ``target``, and
  3343      ``resolvers`` are not yet supported."""
  3344      return self._eval_or_query('eval', expr, inplace, **kwargs)
  3345  
  3346    @frame_base.with_docs_from(pd.DataFrame)
  3347    @frame_base.args_to_kwargs(pd.DataFrame)
  3348    @frame_base.populate_defaults(pd.DataFrame)
  3349    def query(self, expr, inplace, **kwargs):
  3350      """Accessing local variables with ``@<varname>`` is not yet supported
  3351      (`Issue 20626 <https://github.com/apache/beam/issues/20626>`_).
  3352  
  3353      Arguments ``local_dict``, ``global_dict``, ``level``, ``target``, and
  3354      ``resolvers`` are not yet supported."""
  3355      return self._eval_or_query('query', expr, inplace, **kwargs)
  3356  
  3357    isnull = isna = frame_base._elementwise_method('isna', base=pd.DataFrame)
  3358    notnull = notna = frame_base._elementwise_method('notna', base=pd.DataFrame)
  3359  
  3360    items = frame_base.wont_implement_method(pd.DataFrame, 'items',
  3361                                             reason="non-deferred-result")
  3362    itertuples = frame_base.wont_implement_method(pd.DataFrame, 'itertuples',
  3363                                                  reason="non-deferred-result")
  3364    iterrows = frame_base.wont_implement_method(pd.DataFrame, 'iterrows',
  3365                                                reason="non-deferred-result")
  3366    iteritems = frame_base.wont_implement_method(pd.DataFrame, 'iteritems',
  3367                                                 reason="non-deferred-result")
  3368  
  3369    def _cols_as_temporary_index(self, cols, suffix=''):
  3370      original_index_names = list(self._expr.proxy().index.names)
  3371      new_index_names = [
  3372          '__apache_beam_temp_%d_%s' % (ix, suffix)
  3373          for (ix, _) in enumerate(original_index_names)]
  3374      def reindex(df):
  3375        return frame_base.DeferredFrame.wrap(
  3376            expressions.ComputedExpression(
  3377                'reindex',
  3378                lambda df:
  3379                    df.rename_axis(index=new_index_names, copy=False)
  3380                    .reset_index().set_index(cols),
  3381                [df._expr],
  3382                preserves_partition_by=partitionings.Singleton(),
  3383                requires_partition_by=partitionings.Arbitrary()))
  3384      def revert(df):
  3385        return frame_base.DeferredFrame.wrap(
  3386            expressions.ComputedExpression(
  3387                'join_restoreindex',
  3388                lambda df:
  3389                    df.reset_index().set_index(new_index_names)
  3390                    .rename_axis(index=original_index_names, copy=False),
  3391                [df._expr],
  3392                preserves_partition_by=partitionings.Singleton(),
  3393                requires_partition_by=partitionings.Arbitrary()))
  3394      return reindex, revert
  3395  
  3396    @frame_base.with_docs_from(pd.DataFrame)
  3397    @frame_base.args_to_kwargs(pd.DataFrame)
  3398    @frame_base.populate_defaults(pd.DataFrame)
  3399    def join(self, other, on, **kwargs):
  3400      if on is not None:
  3401        reindex, revert = self._cols_as_temporary_index(on)
  3402        return revert(reindex(self).join(other, **kwargs))
  3403      if isinstance(other, list):
  3404        other_is_list = True
  3405      else:
  3406        other = [other]
  3407        other_is_list = False
  3408      placeholder = object()
  3409      other_exprs = [
  3410          df._expr for df in other if isinstance(df, frame_base.DeferredFrame)]
  3411      const_others = [
  3412          placeholder if isinstance(df, frame_base.DeferredFrame) else df
  3413          for df in other]
  3414      def fill_placeholders(values):
  3415        values = iter(values)
  3416        filled = [
  3417            next(values) if df is placeholder else df for df in const_others]
  3418        if other_is_list:
  3419          return filled
  3420        else:
  3421          return filled[0]
  3422      return frame_base.DeferredFrame.wrap(
  3423          expressions.ComputedExpression(
  3424              'join',
  3425              lambda df, *deferred_others: df.join(
  3426                  fill_placeholders(deferred_others), **kwargs),
  3427              [self._expr] + other_exprs,
  3428              preserves_partition_by=partitionings.Arbitrary(),
  3429              requires_partition_by=partitionings.Index()))
  3430  
  3431    @frame_base.with_docs_from(pd.DataFrame)
  3432    @frame_base.args_to_kwargs(pd.DataFrame)
  3433    @frame_base.populate_defaults(pd.DataFrame)
  3434    def merge(
  3435        self,
  3436        right,
  3437        on,
  3438        left_on,
  3439        right_on,
  3440        left_index,
  3441        right_index,
  3442        suffixes,
  3443        **kwargs):
  3444      """merge is not parallelizable unless ``left_index`` or ``right_index`` is
  3445      ``True`, because it requires generating an entirely new unique index.
  3446      See notes on :meth:`DeferredDataFrame.reset_index`. It is recommended to
  3447      move the join key for one of your columns to the index to avoid this issue.
  3448      For an example see the enrich pipeline in
  3449      :mod:`apache_beam.examples.dataframe.taxiride`.
  3450  
  3451      ``how="cross"`` is not yet supported.
  3452      """
  3453      self_proxy = self._expr.proxy()
  3454      right_proxy = right._expr.proxy()
  3455      # Validate with a pandas call.
  3456      _ = self_proxy.merge(
  3457          right_proxy,
  3458          on=on,
  3459          left_on=left_on,
  3460          right_on=right_on,
  3461          left_index=left_index,
  3462          right_index=right_index,
  3463          **kwargs)
  3464      if kwargs.get('how', None) == 'cross':
  3465        raise NotImplementedError(
  3466          "cross join is not yet implemented "
  3467          "(https://github.com/apache/beam/issues/20318)")
  3468      if not any([on, left_on, right_on, left_index, right_index]):
  3469        on = [col for col in self_proxy.columns if col in right_proxy.columns]
  3470      if not left_on:
  3471        left_on = on
  3472      if left_on and not isinstance(left_on, list):
  3473        left_on = [left_on]
  3474      if not right_on:
  3475        right_on = on
  3476      if right_on and not isinstance(right_on, list):
  3477        right_on = [right_on]
  3478  
  3479      if left_index:
  3480        indexed_left = self
  3481      else:
  3482        indexed_left = self.set_index(left_on, drop=False)
  3483  
  3484      if right_index:
  3485        indexed_right = right
  3486      else:
  3487        indexed_right = right.set_index(right_on, drop=False)
  3488  
  3489      if left_on and right_on:
  3490        common_cols = set(left_on).intersection(right_on)
  3491        if len(common_cols):
  3492          # When merging on the same column name from both dfs, we need to make
  3493          # sure only one df has the column. Otherwise we end up with
  3494          # two duplicate columns, one with lsuffix and one with rsuffix.
  3495          # It's safe to drop from either because the data has already been duped
  3496          # to the index.
  3497          indexed_right = indexed_right.drop(columns=common_cols)
  3498  
  3499  
  3500      merged = frame_base.DeferredFrame.wrap(
  3501          expressions.ComputedExpression(
  3502              'merge',
  3503              lambda left, right: left.merge(right,
  3504                                             left_index=True,
  3505                                             right_index=True,
  3506                                             suffixes=suffixes,
  3507                                             **kwargs),
  3508              [indexed_left._expr, indexed_right._expr],
  3509              preserves_partition_by=partitionings.Arbitrary(),
  3510              requires_partition_by=partitionings.Index()))
  3511  
  3512      if left_index or right_index:
  3513        return merged
  3514      else:
  3515        return merged.reset_index(drop=True)
  3516  
  3517    @frame_base.with_docs_from(pd.DataFrame)
  3518    @frame_base.args_to_kwargs(pd.DataFrame)
  3519    @frame_base.populate_defaults(pd.DataFrame)
  3520    def nlargest(self, keep, **kwargs):
  3521      """Only ``keep=False`` and ``keep="any"`` are supported. Other values of
  3522      ``keep`` make this an order-sensitive operation. Note ``keep="any"`` is
  3523      a Beam-specific option that guarantees only one duplicate will be kept, but
  3524      unlike ``"first"`` and ``"last"`` it makes no guarantees about _which_
  3525      duplicate element is kept."""
  3526      if keep == 'any':
  3527        keep = 'first'
  3528      elif keep != 'all':
  3529        raise frame_base.WontImplementError(
  3530            f"nlargest(keep={keep!r}) is not supported because it is "
  3531            "order sensitive. Only keep=\"all\" is supported.",
  3532            reason="order-sensitive")
  3533      kwargs['keep'] = keep
  3534      per_partition = expressions.ComputedExpression(
  3535              'nlargest-per-partition',
  3536              lambda df: df.nlargest(**kwargs),
  3537              [self._expr],
  3538              preserves_partition_by=partitionings.Arbitrary(),
  3539              requires_partition_by=partitionings.Arbitrary())
  3540      with expressions.allow_non_parallel_operations(True):
  3541        return frame_base.DeferredFrame.wrap(
  3542            expressions.ComputedExpression(
  3543                'nlargest',
  3544                lambda df: df.nlargest(**kwargs),
  3545                [per_partition],
  3546                preserves_partition_by=partitionings.Singleton(),
  3547                requires_partition_by=partitionings.Singleton()))
  3548  
  3549    @frame_base.with_docs_from(pd.DataFrame)
  3550    @frame_base.args_to_kwargs(pd.DataFrame)
  3551    @frame_base.populate_defaults(pd.DataFrame)
  3552    def nsmallest(self, keep, **kwargs):
  3553      """Only ``keep=False`` and ``keep="any"`` are supported. Other values of
  3554      ``keep`` make this an order-sensitive operation. Note ``keep="any"`` is
  3555      a Beam-specific option that guarantees only one duplicate will be kept, but
  3556      unlike ``"first"`` and ``"last"`` it makes no guarantees about _which_
  3557      duplicate element is kept."""
  3558      if keep == 'any':
  3559        keep = 'first'
  3560      elif keep != 'all':
  3561        raise frame_base.WontImplementError(
  3562            f"nsmallest(keep={keep!r}) is not supported because it is "
  3563            "order sensitive. Only keep=\"all\" is supported.",
  3564            reason="order-sensitive")
  3565      kwargs['keep'] = keep
  3566      per_partition = expressions.ComputedExpression(
  3567              'nsmallest-per-partition',
  3568              lambda df: df.nsmallest(**kwargs),
  3569              [self._expr],
  3570              preserves_partition_by=partitionings.Arbitrary(),
  3571              requires_partition_by=partitionings.Arbitrary())
  3572      with expressions.allow_non_parallel_operations(True):
  3573        return frame_base.DeferredFrame.wrap(
  3574            expressions.ComputedExpression(
  3575                'nsmallest',
  3576                lambda df: df.nsmallest(**kwargs),
  3577                [per_partition],
  3578                preserves_partition_by=partitionings.Singleton(),
  3579                requires_partition_by=partitionings.Singleton()))
  3580  
  3581    plot = frame_base.wont_implement_method(pd.DataFrame, 'plot',
  3582                                                        reason="plotting-tools")
  3583  
  3584    @frame_base.with_docs_from(pd.DataFrame)
  3585    def pop(self, item):
  3586      result = self[item]
  3587  
  3588      self._expr = expressions.ComputedExpression(
  3589              'popped',
  3590              lambda df: df.drop(columns=[item]),
  3591              [self._expr],
  3592              preserves_partition_by=partitionings.Arbitrary(),
  3593              requires_partition_by=partitionings.Arbitrary())
  3594      return result
  3595  
  3596    @frame_base.with_docs_from(pd.DataFrame)
  3597    @frame_base.args_to_kwargs(pd.DataFrame)
  3598    @frame_base.populate_defaults(pd.DataFrame)
  3599    def quantile(self, q, axis, **kwargs):
  3600      """``quantile(axis="index")`` is not parallelizable. See
  3601      `Issue 20933 <https://github.com/apache/beam/issues/20933>`_ tracking
  3602      the possible addition of an approximate, parallelizable implementation of
  3603      quantile.
  3604  
  3605      When using quantile with ``axis="columns"`` only a single ``q`` value can be
  3606      specified."""
  3607      if axis in (1, 'columns'):
  3608        if isinstance(q, list):
  3609          raise frame_base.WontImplementError(
  3610              "quantile(axis=columns) with multiple q values is not supported "
  3611              "because it transposes the input DataFrame. Note computing "
  3612              "an individual quantile across columns (e.g. "
  3613              f"df.quantile(q={q[0]!r}, axis={axis!r}) is supported.",
  3614              reason="non-deferred-columns")
  3615        else:
  3616          requires = partitionings.Arbitrary()
  3617      else: # axis='index'
  3618        # TODO(https://github.com/apache/beam/issues/20933): Provide an option
  3619        # for approximate distributed quantiles
  3620        requires = partitionings.Singleton(reason=(
  3621            "Computing quantiles across index cannot currently be parallelized. "
  3622            "See https://github.com/apache/beam/issues/20933 tracking the "
  3623            "possible addition of an approximate, parallelizable implementation "
  3624            "of quantile."
  3625        ))
  3626  
  3627      return frame_base.DeferredFrame.wrap(
  3628          expressions.ComputedExpression(
  3629              'quantile',
  3630              lambda df: df.quantile(q=q, axis=axis, **kwargs),
  3631              [self._expr],
  3632              requires_partition_by=requires,
  3633              preserves_partition_by=partitionings.Singleton()))
  3634  
  3635    @frame_base.with_docs_from(pd.DataFrame)
  3636    @frame_base.args_to_kwargs(pd.DataFrame)
  3637    @frame_base.maybe_inplace
  3638    def rename(self, **kwargs):
  3639      """rename is not parallelizable when ``axis="index"`` and
  3640      ``errors="raise"``. It requires collecting all data on a single
  3641      node in order to detect if one of the index values is missing."""
  3642      rename_index = (
  3643          'index' in kwargs
  3644          or kwargs.get('axis', None) in (0, 'index')
  3645          or ('columns' not in kwargs and 'axis' not in kwargs))
  3646      rename_columns = (
  3647          'columns' in kwargs
  3648          or kwargs.get('axis', None) in (1, 'columns'))
  3649  
  3650      if rename_index:
  3651        # Technically, it's still partitioned by index, but it's no longer
  3652        # partitioned by the hash of the index.
  3653        preserves_partition_by = partitionings.Singleton()
  3654      else:
  3655        preserves_partition_by = partitionings.Index()
  3656  
  3657      if kwargs.get('errors', None) == 'raise' and rename_index:
  3658        # TODO: We could do this in parallel by creating a ConstantExpression
  3659        # with a series created from the mapper dict. Then Index() partitioning
  3660        # would co-locate the necessary index values and we could raise
  3661        # individually within each partition. Execution time errors are
  3662        # discouraged anyway so probably not worth the effort.
  3663        requires_partition_by = partitionings.Singleton(reason=(
  3664            "rename(errors='raise', axis='index') requires collecting all "
  3665            "data on a single node in order to detect missing index values."
  3666        ))
  3667      else:
  3668        requires_partition_by = partitionings.Arbitrary()
  3669  
  3670      proxy = None
  3671      if rename_index:
  3672        # The proxy can't be computed by executing rename, it will error
  3673        # renaming the index.
  3674        if rename_columns:
  3675          # Note if both are being renamed, index and columns must be specified
  3676          # (not axis)
  3677          proxy = self._expr.proxy().rename(**{k: v for (k, v) in kwargs.items()
  3678                                               if not k == 'index'})
  3679        else:
  3680          # No change in columns, reuse proxy
  3681          proxy = self._expr.proxy()
  3682  
  3683      return frame_base.DeferredFrame.wrap(
  3684          expressions.ComputedExpression(
  3685              'rename',
  3686              lambda df: df.rename(**kwargs),
  3687              [self._expr],
  3688              proxy=proxy,
  3689              preserves_partition_by=preserves_partition_by,
  3690              requires_partition_by=requires_partition_by))
  3691  
  3692    rename_axis = frame_base._elementwise_method('rename_axis', base=pd.DataFrame)
  3693  
  3694    @frame_base.with_docs_from(pd.DataFrame)
  3695    @frame_base.args_to_kwargs(pd.DataFrame)
  3696    @frame_base.populate_defaults(pd.DataFrame)
  3697    def round(self, decimals, *args, **kwargs):
  3698  
  3699      if isinstance(decimals, frame_base.DeferredFrame):
  3700        # Disallow passing a deferred Series in, our current partitioning model
  3701        # prevents us from using it correctly.
  3702        raise NotImplementedError("Passing a deferred series to round() is not "
  3703                                  "supported, please use a concrete pd.Series "
  3704                                  "instance or a dictionary")
  3705  
  3706      return frame_base.DeferredFrame.wrap(
  3707          expressions.ComputedExpression(
  3708              'round',
  3709              lambda df: df.round(decimals, *args, **kwargs),
  3710              [self._expr],
  3711              requires_partition_by=partitionings.Arbitrary(),
  3712              preserves_partition_by=partitionings.Index()
  3713          )
  3714      )
  3715  
  3716    select_dtypes = frame_base._elementwise_method('select_dtypes',
  3717                                                   base=pd.DataFrame)
  3718  
  3719    @frame_base.with_docs_from(pd.DataFrame)
  3720    @frame_base.args_to_kwargs(pd.DataFrame)
  3721    @frame_base.populate_defaults(pd.DataFrame)
  3722    def shift(self, axis, freq, **kwargs):
  3723      """shift with ``axis="index" is only supported with ``freq`` specified and
  3724      ``fill_value`` undefined. Other configurations make this operation
  3725      order-sensitive."""
  3726      if axis in (1, 'columns'):
  3727        preserves = partitionings.Arbitrary()
  3728        proxy = None
  3729      else:
  3730        if freq is None or 'fill_value' in kwargs:
  3731          fill_value = kwargs.get('fill_value', 'NOT SET')
  3732          raise frame_base.WontImplementError(
  3733              f"shift(axis={axis!r}) is only supported with freq defined, and "
  3734              f"fill_value undefined (got freq={freq!r},"
  3735              f"fill_value={fill_value!r}). Other configurations are sensitive "
  3736              "to the order of the data because they require populating shifted "
  3737              "rows with `fill_value`.",
  3738              reason="order-sensitive")
  3739        # proxy generation fails in pandas <1.2
  3740        # Seems due to https://github.com/pandas-dev/pandas/issues/14811,
  3741        # bug with shift on empty indexes.
  3742        # Fortunately the proxy should be identical to the input.
  3743        proxy = self._expr.proxy().copy()
  3744  
  3745  
  3746        # index is modified, so no partitioning is preserved.
  3747        preserves = partitionings.Singleton()
  3748  
  3749      return frame_base.DeferredFrame.wrap(
  3750          expressions.ComputedExpression(
  3751              'shift',
  3752              lambda df: df.shift(axis=axis, freq=freq, **kwargs),
  3753              [self._expr],
  3754              proxy=proxy,
  3755              preserves_partition_by=preserves,
  3756              requires_partition_by=partitionings.Arbitrary()))
  3757  
  3758  
  3759    shape = property(frame_base.wont_implement_method(
  3760        pd.DataFrame, 'shape', reason="non-deferred-result"))
  3761  
  3762    stack = frame_base._proxy_method(
  3763        'stack',
  3764        base=pd.DataFrame,
  3765        requires_partition_by=partitionings.Arbitrary(),
  3766        preserves_partition_by=partitionings.Singleton())
  3767  
  3768    all = _agg_method(pd.DataFrame, 'all')
  3769    any = _agg_method(pd.DataFrame, 'any')
  3770    count = _agg_method(pd.DataFrame, 'count')
  3771    describe = _agg_method(pd.DataFrame, 'describe')
  3772    max = _agg_method(pd.DataFrame, 'max')
  3773    min = _agg_method(pd.DataFrame, 'min')
  3774  
  3775    @frame_base.with_docs_from(pd.DataFrame)
  3776    @frame_base.args_to_kwargs(pd.DataFrame)
  3777    @frame_base.populate_defaults(pd.DataFrame)
  3778    def pivot(self, index=None, columns=None, values=None, **kwargs):
  3779      """Because pivot is a non-deferred method, any columns specified in
  3780      ``columns`` must be CategoricalDType so we can determine the output column
  3781      names."""
  3782  
  3783      def verify_all_categorical(all_cols_are_categorical):
  3784        if not all_cols_are_categorical:
  3785          message = "pivot() of non-categorical type is not supported because " \
  3786              "the type of the output column depends on the data. Please use " \
  3787              "pd.CategoricalDtype with explicit categories."
  3788          raise frame_base.WontImplementError(
  3789            message, reason="non-deferred-columns")
  3790  
  3791      # If values not provided, take all remaining columns of dataframe
  3792      if not values:
  3793        tmp = self._expr.proxy()
  3794        if index:
  3795          tmp = tmp.drop(index, axis=1)
  3796        if columns:
  3797          tmp = tmp.drop(columns, axis=1)
  3798        values = tmp.columns.values
  3799  
  3800      # Construct column index
  3801      if is_list_like(columns) and len(columns) <= 1:
  3802        columns = columns[0]
  3803      selected_cols = self._expr.proxy()[columns]
  3804      if isinstance(selected_cols, pd.Series):
  3805        all_cols_are_categorical = isinstance(
  3806          selected_cols.dtype, pd.CategoricalDtype
  3807        )
  3808        verify_all_categorical(all_cols_are_categorical)
  3809  
  3810        if is_list_like(values) and len(values) > 1:
  3811          # If more than one value provided, don't create a None level
  3812          values_in_col_index = values
  3813          names = [None, columns]
  3814          col_index = pd.MultiIndex.from_product(
  3815            [values_in_col_index,
  3816            selected_cols.dtypes.categories.astype('category')],
  3817            names=names
  3818          )
  3819        else:
  3820          col_index = pd.CategoricalIndex(
  3821            selected_cols.dtype.categories,
  3822            name=columns
  3823          )
  3824      else:
  3825        all_cols_are_categorical = all(
  3826          isinstance(c, pd.CategoricalDtype) for c in selected_cols.dtypes
  3827        )
  3828        verify_all_categorical(all_cols_are_categorical)
  3829  
  3830        if is_list_like(values) and len(values) > 1:
  3831          # If more than one value provided, don't create a None level
  3832          values_in_col_index = values
  3833          names = [None, *columns]
  3834          categories = [
  3835            c.categories.astype('category') for c in selected_cols.dtypes
  3836          ]
  3837          col_index = pd.MultiIndex.from_product(
  3838            [values_in_col_index, *categories],
  3839            names=names
  3840          )
  3841        else:
  3842          # If one value provided, don't create a None level
  3843          names = columns
  3844          categories = [
  3845            c.categories.astype('category') for c in selected_cols.dtypes
  3846          ]
  3847          col_index = pd.MultiIndex.from_product(
  3848            categories,
  3849            names=names
  3850          )
  3851  
  3852      # Construct row index
  3853      if index:
  3854        if PD_VERSION < (1, 4) and is_list_like(index) and len(index) > 1:
  3855          raise frame_base.WontImplementError(
  3856            "pivot() is not supported when pandas<1.4 and index is a MultiIndex")
  3857        per_partition = expressions.ComputedExpression(
  3858            'pivot-per-partition',
  3859            lambda df: df.set_index(keys=index), [self._expr],
  3860            preserves_partition_by=partitionings.Singleton(),
  3861            requires_partition_by=partitionings.Arbitrary()
  3862        )
  3863        tmp = per_partition.proxy().pivot(
  3864          columns=columns, values=values, **kwargs)
  3865        row_index = tmp.index
  3866      else:
  3867        per_partition = self._expr
  3868        row_index = self._expr.proxy().index
  3869      if PD_VERSION < (1, 4) and isinstance(row_index, pd.MultiIndex):
  3870        raise frame_base.WontImplementError(
  3871          "pivot() is not supported when pandas<1.4 and index is a MultiIndex")
  3872  
  3873      selected_values = self._expr.proxy()[values]
  3874      if isinstance(selected_values, pd.Series):
  3875        value_dtype = selected_values.dtype
  3876      else:
  3877        # Set dtype to object if more than one value
  3878        dtypes = [d for d in selected_values.dtypes]
  3879        value_dtype = object
  3880        if any((is_int64_dtype(x) for x in dtypes)):
  3881          value_dtype = np.int64
  3882        if any((is_float_dtype(x) for x in dtypes)):
  3883          value_dtype = np.float64
  3884        if object in dtypes:
  3885          value_dtype = object
  3886  
  3887      # Construct proxy
  3888      proxy = pd.DataFrame(
  3889        columns=col_index, dtype=value_dtype, index=row_index
  3890      )
  3891  
  3892      def pivot_helper(df):
  3893        result = pd.concat(
  3894          [proxy, df.pivot(columns=columns, values=values, **kwargs)]
  3895        )
  3896        result.columns = col_index
  3897        result = result.astype(value_dtype)
  3898        return result
  3899  
  3900      return frame_base.DeferredFrame.wrap(
  3901          expressions.ComputedExpression(
  3902              'pivot',
  3903              pivot_helper,
  3904              [per_partition],
  3905              proxy=proxy,
  3906              preserves_partition_by=partitionings.Index(),
  3907              requires_partition_by=partitionings.Index()))
  3908  
  3909    prod = product = _agg_method(pd.DataFrame, 'prod')
  3910    sum = _agg_method(pd.DataFrame, 'sum')
  3911    mean = _agg_method(pd.DataFrame, 'mean')
  3912    median = _agg_method(pd.DataFrame, 'median')
  3913    nunique = _agg_method(pd.DataFrame, 'nunique')
  3914    std = _agg_method(pd.DataFrame, 'std')
  3915    var = _agg_method(pd.DataFrame, 'var')
  3916    sem = _agg_method(pd.DataFrame, 'sem')
  3917    mad = _agg_method(pd.DataFrame, 'mad')
  3918    skew = _agg_method(pd.DataFrame, 'skew')
  3919    kurt = _agg_method(pd.DataFrame, 'kurt')
  3920    kurtosis = _agg_method(pd.DataFrame, 'kurtosis')
  3921  
  3922    take = frame_base.wont_implement_method(pd.DataFrame, 'take',
  3923                                            reason='deprecated')
  3924  
  3925    to_records = frame_base.wont_implement_method(pd.DataFrame, 'to_records',
  3926                                                  reason="non-deferred-result")
  3927    to_dict = frame_base.wont_implement_method(pd.DataFrame, 'to_dict',
  3928                                               reason="non-deferred-result")
  3929    to_numpy = frame_base.wont_implement_method(pd.DataFrame, 'to_numpy',
  3930                                                reason="non-deferred-result")
  3931    to_string = frame_base.wont_implement_method(pd.DataFrame, 'to_string',
  3932                                                 reason="non-deferred-result")
  3933  
  3934    to_sparse = frame_base.wont_implement_method(pd.DataFrame, 'to_sparse',
  3935                                                 reason="non-deferred-result")
  3936  
  3937    transpose = frame_base.wont_implement_method(
  3938        pd.DataFrame, 'transpose', reason='non-deferred-columns')
  3939    T = property(frame_base.wont_implement_method(
  3940        pd.DataFrame, 'T', reason='non-deferred-columns'))
  3941  
  3942    update = frame_base._proxy_method(
  3943        'update',
  3944        inplace=True,
  3945        base=pd.DataFrame,
  3946        requires_partition_by=partitionings.Index(),
  3947        preserves_partition_by=partitionings.Arbitrary())
  3948  
  3949    values = property(frame_base.wont_implement_method(
  3950        pd.DataFrame, 'values', reason="non-deferred-result"))
  3951  
  3952    style = property(frame_base.wont_implement_method(
  3953        pd.DataFrame, 'style', reason="non-deferred-result"))
  3954  
  3955    @frame_base.with_docs_from(pd.DataFrame)
  3956    @frame_base.args_to_kwargs(pd.DataFrame)
  3957    @frame_base.populate_defaults(pd.DataFrame)
  3958    def melt(self, ignore_index, **kwargs):
  3959      """``ignore_index=True`` is not supported, because it requires generating an
  3960      order-sensitive index."""
  3961      if ignore_index:
  3962        raise frame_base.WontImplementError(
  3963            "melt(ignore_index=True) is order sensitive because it requires "
  3964            "generating a new index based on the order of the data.",
  3965            reason="order-sensitive")
  3966  
  3967      return frame_base.DeferredFrame.wrap(
  3968          expressions.ComputedExpression(
  3969              'melt',
  3970              lambda df: df.melt(ignore_index=False, **kwargs), [self._expr],
  3971              requires_partition_by=partitionings.Arbitrary(),
  3972              preserves_partition_by=partitionings.Singleton()))
  3973  
  3974    if hasattr(pd.DataFrame, 'value_counts'):
  3975      @frame_base.with_docs_from(pd.DataFrame)
  3976      def value_counts(self, subset=None, sort=False, normalize=False,
  3977                       ascending=False, dropna=True):
  3978        """``sort`` is ``False`` by default, and ``sort=True`` is not supported
  3979        because it imposes an ordering on the dataset which likely will not be
  3980        preserved."""
  3981  
  3982        if sort:
  3983          raise frame_base.WontImplementError(
  3984              "value_counts(sort=True) is not supported because it imposes an "
  3985              "ordering on the dataset which likely will not be preserved.",
  3986              reason="order-sensitive")
  3987        columns = subset or list(self.columns)
  3988  
  3989        if dropna:
  3990          dropped = self.dropna()
  3991        else:
  3992          dropped = self
  3993  
  3994        result = dropped.groupby(columns, dropna=dropna).size()
  3995  
  3996        if normalize:
  3997          return result/dropped.length()
  3998        else:
  3999          return result
  4000  
  4001    if hasattr(pd.DataFrame, 'compare'):
  4002  
  4003      @frame_base.with_docs_from(pd.DataFrame)
  4004      @frame_base.args_to_kwargs(pd.DataFrame)
  4005      @frame_base.populate_defaults(pd.DataFrame)
  4006      def compare(self, other, align_axis, keep_shape, **kwargs):
  4007        """The default values ``align_axis=1 and ``keep_shape=False``
  4008         are not supported, because the output columns depend on the data.
  4009         To use ``align_axis=1``, please specify ``keep_shape=True``."""
  4010  
  4011        preserve_partition = None
  4012  
  4013        if align_axis in (1, 'columns') and not keep_shape:
  4014          raise frame_base.WontImplementError(
  4015            f"compare(align_axis={align_axis!r}, keep_shape={keep_shape!r}) "
  4016            "is not allowed because the output columns depend on the data, "
  4017            "please specify keep_shape=True.",
  4018          reason='non-deferred-columns'
  4019          )
  4020  
  4021        if align_axis in (1, 'columns'):
  4022          preserve_partition = partitionings.Arbitrary()
  4023        elif align_axis in (0, 'index'):
  4024          preserve_partition = partitionings.Singleton()
  4025        else:
  4026          raise ValueError(
  4027            "align_axis must be one of ('index', 0, 'columns', 1). "
  4028            f"got {align_axis!r}.")
  4029  
  4030  
  4031        return frame_base.DeferredFrame.wrap(
  4032          expressions.ComputedExpression(
  4033            'compare',
  4034            lambda df, other: df.compare(other, align_axis, keep_shape, **kwargs),
  4035            [self._expr, other._expr],
  4036            requires_partition_by=partitionings.Index(),
  4037            preserves_partition_by=preserve_partition
  4038          )
  4039        )
  4040  
  4041    def _idxmaxmin_helper(self, op, **kwargs):
  4042      if op == 'idxmax':
  4043        func = pd.DataFrame.idxmax
  4044      elif op == 'idxmin':
  4045        func = pd.DataFrame.idxmin
  4046      else:
  4047        raise ValueError("op must be one of ('idxmax', 'idxmin'). "
  4048                         f"got {op!r}.")
  4049  
  4050      axis = kwargs.get('axis', 0)
  4051  
  4052      index_dtype = self._expr.proxy().index.dtype
  4053      columns_dtype = self._expr.proxy().columns.dtype
  4054  
  4055      def compute_idx(df):
  4056        indexes = func(df, **kwargs).unique()
  4057        if pd.isna(indexes).any():
  4058          return df
  4059        else:
  4060          return df.loc[indexes]
  4061  
  4062      if axis in ('index', 0):
  4063        requires_partition = partitionings.Singleton()
  4064  
  4065        proxy_index = pd.Index([], dtype=columns_dtype)
  4066        proxy = pd.Series([], index=proxy_index, dtype=index_dtype)
  4067        partition_proxy = self._expr.proxy().copy()
  4068  
  4069        idx_per_partition = expressions.ComputedExpression(
  4070          'idx-per-partition',
  4071          compute_idx, [self._expr],
  4072          proxy=partition_proxy,
  4073          requires_partition_by=partitionings.Arbitrary(),
  4074          preserves_partition_by=partitionings.Arbitrary()
  4075        )
  4076  
  4077      elif axis in ('columns', 1):
  4078        requires_partition = partitionings.Index()
  4079  
  4080        proxy_index = pd.Index([], dtype=index_dtype)
  4081        proxy = pd.Series([], index=proxy_index, dtype=columns_dtype)
  4082  
  4083        idx_per_partition = self._expr
  4084  
  4085      else:
  4086        raise ValueError("axis must be one of ('index', 0, 'columns', 1). "
  4087                         f"got {axis!r}.")
  4088  
  4089      with expressions.allow_non_parallel_operations(True):
  4090        return frame_base.DeferredFrame.wrap(
  4091          expressions.ComputedExpression(
  4092            'idx',
  4093            lambda df: func(df, **kwargs), [idx_per_partition],
  4094            proxy=proxy,
  4095            requires_partition_by=requires_partition,
  4096            preserves_partition_by=partitionings.Singleton()
  4097          )
  4098        )
  4099  
  4100  
  4101    @frame_base.with_docs_from(pd.DataFrame)
  4102    @frame_base.args_to_kwargs(pd.DataFrame)
  4103    @frame_base.populate_defaults(pd.DataFrame)
  4104    def idxmin(self, **kwargs):
  4105      return self._idxmaxmin_helper('idxmin', **kwargs)
  4106  
  4107    @frame_base.with_docs_from(pd.DataFrame)
  4108    @frame_base.args_to_kwargs(pd.DataFrame)
  4109    @frame_base.populate_defaults(pd.DataFrame)
  4110    def idxmax(self, **kwargs):
  4111      return self._idxmaxmin_helper('idxmax', **kwargs)
  4112  
  4113  
  4114  for io_func in dir(io):
  4115    if io_func.startswith('to_'):
  4116      setattr(DeferredDataFrame, io_func, getattr(io, io_func))
  4117      setattr(DeferredSeries, io_func, getattr(io, io_func))
  4118  
  4119  
  4120  for meth in ('filter', ):
  4121    setattr(DeferredDataFrame, meth,
  4122            frame_base._elementwise_method(meth, base=pd.DataFrame))
  4123  
  4124  
  4125  @populate_not_implemented(DataFrameGroupBy)
  4126  class DeferredGroupBy(frame_base.DeferredFrame):
  4127    def __init__(self, expr, kwargs,
  4128                 ungrouped: expressions.Expression[pd.core.generic.NDFrame],
  4129                 ungrouped_with_index: expressions.Expression[pd.core.generic.NDFrame], # pylint: disable=line-too-long
  4130                 grouping_columns,
  4131                 grouping_indexes,
  4132                 group_keys,
  4133                 projection=None):
  4134      """This object represents the result of::
  4135  
  4136          ungrouped.groupby(level=[grouping_indexes + grouping_columns],
  4137                            **kwargs)[projection]
  4138  
  4139      :param expr: An expression to compute a pandas GroupBy object. Convenient
  4140          for unliftable aggregations.
  4141      :param ungrouped: An expression to compute the DataFrame pre-grouping, the
  4142          (Multi)Index contains only the grouping columns/indexes.
  4143      :param ungrouped_with_index: Same as ungrouped, except the index includes
  4144          all of the original indexes as well as any grouping columns. This is
  4145          important for operations that expose the original index, e.g. .apply(),
  4146          but we only use it when necessary to avoid unnessary data transfer and
  4147          GBKs.
  4148      :param grouping_columns: list of column labels that were in the original
  4149          groupby(..) ``by`` parameter. Only relevant for grouped DataFrames.
  4150      :param grouping_indexes: list of index names (or index level numbers) to be
  4151          grouped.
  4152      :param kwargs: Keywords args passed to the original groupby(..) call."""
  4153      super().__init__(expr)
  4154      self._ungrouped = ungrouped
  4155      self._ungrouped_with_index = ungrouped_with_index
  4156      self._projection = projection
  4157      self._grouping_columns = grouping_columns
  4158      self._grouping_indexes = grouping_indexes
  4159      self._group_keys = group_keys
  4160      self._kwargs = kwargs
  4161  
  4162      if (self._kwargs.get('dropna', True) is False and
  4163          self._ungrouped.proxy().index.nlevels > 1):
  4164        raise NotImplementedError(
  4165            "dropna=False does not work as intended in the Beam DataFrame API "
  4166            "when grouping on multiple columns or indexes (See "
  4167            "https://github.com/apache/beam/issues/21014).")
  4168  
  4169    def __getattr__(self, name):
  4170      return DeferredGroupBy(
  4171          expressions.ComputedExpression(
  4172              'groupby_project',
  4173              lambda gb: getattr(gb, name), [self._expr],
  4174              requires_partition_by=partitionings.Arbitrary(),
  4175              preserves_partition_by=partitionings.Arbitrary()),
  4176          self._kwargs,
  4177          self._ungrouped,
  4178          self._ungrouped_with_index,
  4179          self._grouping_columns,
  4180          self._grouping_indexes,
  4181          self._group_keys,
  4182          projection=name)
  4183  
  4184    def __getitem__(self, name):
  4185      return DeferredGroupBy(
  4186          expressions.ComputedExpression(
  4187              'groupby_project',
  4188              lambda gb: gb[name], [self._expr],
  4189              requires_partition_by=partitionings.Arbitrary(),
  4190              preserves_partition_by=partitionings.Arbitrary()),
  4191          self._kwargs,
  4192          self._ungrouped,
  4193          self._ungrouped_with_index,
  4194          self._grouping_columns,
  4195          self._grouping_indexes,
  4196          self._group_keys,
  4197          projection=name)
  4198  
  4199    @frame_base.with_docs_from(DataFrameGroupBy)
  4200    def agg(self, fn, *args, **kwargs):
  4201      if _is_associative(fn):
  4202        return _liftable_agg(fn)(self, *args, **kwargs)
  4203      elif _is_liftable_with_sum(fn):
  4204        return _liftable_agg(fn, postagg_meth='sum')(self, *args, **kwargs)
  4205      elif _is_unliftable(fn):
  4206        return _unliftable_agg(fn)(self, *args, **kwargs)
  4207      elif callable(fn):
  4208        return DeferredDataFrame(
  4209            expressions.ComputedExpression(
  4210                'agg',
  4211                lambda gb: gb.agg(fn, *args, **kwargs), [self._expr],
  4212                requires_partition_by=partitionings.Index(),
  4213                preserves_partition_by=partitionings.Singleton()))
  4214      else:
  4215        raise NotImplementedError(f"GroupBy.agg(func={fn!r})")
  4216  
  4217    @property
  4218    def ndim(self):
  4219      return self._expr.proxy().ndim
  4220  
  4221    @frame_base.with_docs_from(DataFrameGroupBy)
  4222    def apply(self, func, *args, **kwargs):
  4223      """Note that ``func`` will be called once during pipeline construction time
  4224      with an empty pandas object, so take care if ``func`` has a side effect.
  4225  
  4226      When called with an empty pandas object, ``func`` is expected to return an
  4227      object of the same type as what will be returned when the pipeline is
  4228      processing actual data. If the result is a pandas object it should have the
  4229      same type and name (for a Series) or column types and names (for
  4230      a DataFrame) as the actual results.
  4231  
  4232      Note that in pandas, ``apply`` attempts to detect if the index is unmodified
  4233      in ``func`` (indicating ``func`` is a transform) and drops the duplicate
  4234      index in the output. To determine this, pandas tests the indexes for
  4235      equality. However, Beam cannot do this since it is sensitive to the input
  4236      data; instead this implementation tests if the indexes are equivalent
  4237      with ``is``. See the `pandas 1.4.0 release notes
  4238      <https://pandas.pydata.org/docs/dev/whatsnew/v1.4.0.html#groupby-apply-consistent-transform-detection>`_
  4239      for a good explanation of the distinction between these approaches. In
  4240      practice, this just means that in some cases the Beam result will have
  4241      a duplicate index, whereas pandas would have dropped it."""
  4242  
  4243      project = _maybe_project_func(self._projection)
  4244      grouping_indexes = self._grouping_indexes
  4245      grouping_columns = self._grouping_columns
  4246      group_keys = self._group_keys
  4247  
  4248      # Unfortunately pandas does not execute func to determine the right proxy.
  4249      # We run user func on a proxy here to detect the return type and generate
  4250      # the proxy.
  4251      fn_input = project(self._ungrouped_with_index.proxy().reset_index(
  4252          grouping_columns, drop=True))
  4253      result = func(fn_input)
  4254      def index_to_arrays(index):
  4255        return [index.get_level_values(level)
  4256                for level in range(index.nlevels)]
  4257  
  4258  
  4259      # By default do_apply will just call pandas apply()
  4260      # We override it below if necessary
  4261      do_apply = lambda gb: gb.apply(func, *args, **kwargs)
  4262  
  4263      if (isinstance(result, pd.core.generic.NDFrame) and
  4264          result.index is fn_input.index):
  4265        # Special case where apply fn is a transform
  4266        # Note we trust that if the user fn produces a proxy with the identical
  4267        # index, it will produce results with identical indexes at execution
  4268        # time too
  4269        proxy = result
  4270      elif isinstance(result, pd.DataFrame):
  4271        # apply fn is not a transform, we need to make sure the original index
  4272        # values are prepended to the result's index
  4273        proxy = result[:0]
  4274  
  4275        # First adjust proxy
  4276        proxy.index = pd.MultiIndex.from_arrays(
  4277            index_to_arrays(self._ungrouped.proxy().index) +
  4278            index_to_arrays(proxy.index),
  4279            names=self._ungrouped.proxy().index.names + proxy.index.names)
  4280  
  4281        # Then override do_apply function
  4282        new_index_names = self._ungrouped.proxy().index.names
  4283        if len(new_index_names) > 1:
  4284          def add_key_index(key, df):
  4285            # df is a dataframe or Series representing the result of func for
  4286            # a single key
  4287            # key is a tuple with the MultiIndex values for this key
  4288            df.index = pd.MultiIndex.from_arrays(
  4289                [[key[i]] * len(df) for i in range(len(new_index_names))] +
  4290                index_to_arrays(df.index),
  4291                names=new_index_names + df.index.names)
  4292            return df
  4293        else:
  4294          def add_key_index(key, df):
  4295            # df is a dataframe or Series representing the result of func for
  4296            # a single key
  4297            df.index = pd.MultiIndex.from_arrays(
  4298                [[key] * len(df)] + index_to_arrays(df.index),
  4299                names=new_index_names + df.index.names)
  4300            return df
  4301  
  4302  
  4303        do_apply = lambda gb: pd.concat([
  4304            add_key_index(k, func(gb.get_group(k), *args, **kwargs))
  4305            for k in gb.groups.keys()])
  4306      elif isinstance(result, pd.Series):
  4307        if isinstance(fn_input, pd.DataFrame):
  4308          # DataFrameGroupBy
  4309          # In this case pandas transposes the Series result, s.t. the Series
  4310          # index values are the columns, and the grouping keys are the new index
  4311          # values.
  4312          dtype = pd.Series([result]).dtype
  4313          proxy = pd.DataFrame(columns=result.index,
  4314                               dtype=result.dtype,
  4315                               index=self._ungrouped.proxy().index)
  4316        elif isinstance(fn_input, pd.Series):
  4317          # SeriesGroupBy
  4318          # In this case the output is still a Series, but with an additional
  4319          # index with the grouping keys.
  4320          proxy = pd.Series(dtype=result.dtype,
  4321                            name=result.name,
  4322                            index=index_to_arrays(self._ungrouped.proxy().index) +
  4323                                  index_to_arrays(result[:0].index))
  4324      else:
  4325        # The user fn returns some non-pandas type. The expected result is a
  4326        # Series where each element is the result of one user fn call.
  4327        dtype = pd.Series([result]).dtype
  4328        proxy = pd.Series([], dtype=dtype, index=self._ungrouped.proxy().index)
  4329  
  4330      def do_partition_apply(df):
  4331        # Remove columns from index, we only needed them there for partitioning
  4332        df = df.reset_index(grouping_columns, drop=True)
  4333  
  4334        gb = df.groupby(level=grouping_indexes or None,
  4335                        by=grouping_columns or None,
  4336                        group_keys=group_keys)
  4337  
  4338        gb = project(gb)
  4339  
  4340        return do_apply(gb)
  4341  
  4342      return DeferredDataFrame(
  4343          expressions.ComputedExpression(
  4344              'apply',
  4345              do_partition_apply,
  4346              [self._ungrouped_with_index],
  4347              proxy=proxy,
  4348              requires_partition_by=partitionings.Index(grouping_indexes +
  4349                                                        grouping_columns),
  4350              preserves_partition_by=partitionings.Index(grouping_indexes)))
  4351  
  4352  
  4353    @frame_base.with_docs_from(DataFrameGroupBy)
  4354    def transform(self, fn, *args, **kwargs):
  4355      """Note that ``func`` will be called once during pipeline construction time
  4356      with an empty pandas object, so take care if ``func`` has a side effect.
  4357  
  4358      When called with an empty pandas object, ``func`` is expected to return an
  4359      object of the same type as what will be returned when the pipeline is
  4360      processing actual data. The result should have the same type and name (for
  4361      a Series) or column types and names (for a DataFrame) as the actual
  4362      results."""
  4363      if not callable(fn):
  4364        raise NotImplementedError(
  4365            "String functions are not yet supported in transform.")
  4366  
  4367      if self._grouping_columns and not self._projection:
  4368        grouping_columns = self._grouping_columns
  4369        def fn_wrapper(x, *args, **kwargs):
  4370          x = x.droplevel(grouping_columns)
  4371          return fn(x, *args, **kwargs)
  4372      else:
  4373        fn_wrapper = fn
  4374  
  4375      project = _maybe_project_func(self._projection)
  4376      group_keys = self._group_keys
  4377  
  4378      # pandas cannot execute fn to determine the right proxy.
  4379      # We run user fn on a proxy here to detect the return type and generate the
  4380      # proxy.
  4381      result = fn_wrapper(project(self._ungrouped_with_index.proxy()))
  4382      parent_frame = self._ungrouped.args()[0].proxy()
  4383      if isinstance(result, pd.core.generic.NDFrame):
  4384        proxy = result[:0]
  4385  
  4386      else:
  4387        # The user fn returns some non-pandas type. The expected result is a
  4388        # Series where each element is the result of one user fn call.
  4389        dtype = pd.Series([result]).dtype
  4390        proxy = pd.Series([], dtype=dtype, name=project(parent_frame).name)
  4391  
  4392        if not isinstance(self._projection, list):
  4393          proxy.name = self._projection
  4394  
  4395      # The final result will have the original indexes
  4396      proxy.index = parent_frame.index
  4397  
  4398      levels = self._grouping_indexes + self._grouping_columns
  4399  
  4400      return DeferredDataFrame(
  4401          expressions.ComputedExpression(
  4402              'transform',
  4403              lambda df: project(
  4404                df.groupby(level=levels, group_keys=group_keys)
  4405              ).transform(
  4406                fn_wrapper,
  4407                *args,
  4408                **kwargs).droplevel(self._grouping_columns),
  4409              [self._ungrouped_with_index],
  4410              proxy=proxy,
  4411              requires_partition_by=partitionings.Index(levels),
  4412              preserves_partition_by=partitionings.Index(self._grouping_indexes)))
  4413  
  4414    @frame_base.with_docs_from(DataFrameGroupBy)
  4415    def pipe(self, func, *args, **kwargs):
  4416      if isinstance(func, tuple):
  4417        func, data = func
  4418        kwargs[data] = self
  4419        return func(*args, **kwargs)
  4420  
  4421      return func(self, *args, **kwargs)
  4422  
  4423    @frame_base.with_docs_from(DataFrameGroupBy)
  4424    def filter(self, func=None, dropna=True):
  4425      if func is None or not callable(func):
  4426        raise TypeError("func must be specified and it must be callable")
  4427  
  4428      def apply_fn(df):
  4429        if func(df):
  4430          return df
  4431        elif not dropna:
  4432          result = df.copy()
  4433          result.iloc[:, :] = np.nan
  4434          return result
  4435        else:
  4436          return df.iloc[:0]
  4437  
  4438      return self.apply(apply_fn).droplevel(self._grouping_columns)
  4439  
  4440    @property  # type: ignore
  4441    @frame_base.with_docs_from(DataFrameGroupBy)
  4442    def dtypes(self):
  4443      return frame_base.DeferredFrame.wrap(
  4444          expressions.ComputedExpression(
  4445              'dtypes',
  4446              lambda gb: gb.dtypes,
  4447              [self._expr],
  4448              requires_partition_by=partitionings.Arbitrary(),
  4449              preserves_partition_by=partitionings.Arbitrary()
  4450          )
  4451      )
  4452  
  4453    if hasattr(DataFrameGroupBy, 'value_counts'):
  4454      @frame_base.with_docs_from(DataFrameGroupBy)
  4455      def value_counts(self, **kwargs):
  4456        """
  4457        DataFrameGroupBy.value_counts() is the same as DataFrame.value_counts()
  4458        """
  4459        return frame_base.DeferredFrame.wrap(
  4460            expressions.ComputedExpression(
  4461                'value_counts',
  4462                lambda df: df.value_counts(**kwargs), [self._expr],
  4463                preserves_partition_by=partitionings.Arbitrary(),
  4464                requires_partition_by=partitionings.Arbitrary())
  4465        )
  4466  
  4467    fillna = frame_base.wont_implement_method(
  4468        DataFrameGroupBy, 'fillna', explanation=(
  4469            "df.fillna() should be used instead. Only method=None is supported "
  4470            "because other methods are order-sensitive. df.groupby(..).fillna() "
  4471            "without a method is equivalent to df.fillna()."))
  4472  
  4473    ffill = frame_base.wont_implement_method(DataFrameGroupBy, 'ffill',
  4474                                             reason="order-sensitive")
  4475    bfill = frame_base.wont_implement_method(DataFrameGroupBy, 'bfill',
  4476                                             reason="order-sensitive")
  4477    pad = frame_base.wont_implement_method(DataFrameGroupBy, 'pad',
  4478                                           reason="order-sensitive")
  4479    backfill = frame_base.wont_implement_method(DataFrameGroupBy, 'backfill',
  4480                                                reason="order-sensitive")
  4481  
  4482    aggregate = agg
  4483  
  4484    hist = frame_base.wont_implement_method(DataFrameGroupBy, 'hist',
  4485                                            reason="plotting-tools")
  4486    plot = frame_base.wont_implement_method(DataFrameGroupBy, 'plot',
  4487                                            reason="plotting-tools")
  4488    boxplot = frame_base.wont_implement_method(DataFrameGroupBy, 'boxplot',
  4489                                               reason="plotting-tools")
  4490  
  4491    head = frame_base.wont_implement_method(
  4492        DataFrameGroupBy, 'head', explanation=_PEEK_METHOD_EXPLANATION)
  4493    tail = frame_base.wont_implement_method(
  4494        DataFrameGroupBy, 'tail', explanation=_PEEK_METHOD_EXPLANATION)
  4495  
  4496    first = frame_base.not_implemented_method('first', base_type=DataFrameGroupBy)
  4497    last = frame_base.not_implemented_method('last', base_type=DataFrameGroupBy)
  4498    nth = property(frame_base.wont_implement_method(
  4499        DataFrameGroupBy, 'nth', reason='order-sensitive'))
  4500    cumcount = frame_base.wont_implement_method(
  4501        DataFrameGroupBy, 'cumcount', reason='order-sensitive')
  4502    cummax = frame_base.wont_implement_method(
  4503        DataFrameGroupBy, 'cummax', reason='order-sensitive')
  4504    cummin = frame_base.wont_implement_method(
  4505        DataFrameGroupBy, 'cummin', reason='order-sensitive')
  4506    cumsum = frame_base.wont_implement_method(
  4507        DataFrameGroupBy, 'cumsum', reason='order-sensitive')
  4508    cumprod = frame_base.wont_implement_method(
  4509        DataFrameGroupBy, 'cumprod', reason='order-sensitive')
  4510    diff = frame_base.wont_implement_method(DataFrameGroupBy, 'diff',
  4511                                            reason='order-sensitive')
  4512    shift = frame_base.wont_implement_method(DataFrameGroupBy, 'shift',
  4513                                             reason='order-sensitive')
  4514  
  4515    pct_change = frame_base.wont_implement_method(DataFrameGroupBy, 'pct_change',
  4516                                                  reason='order-sensitive')
  4517    ohlc = frame_base.wont_implement_method(DataFrameGroupBy, 'ohlc',
  4518                                            reason='order-sensitive')
  4519  
  4520    # TODO(https://github.com/apache/beam/issues/20958): Consider allowing this
  4521    # for categorical keys.
  4522    __len__ = frame_base.wont_implement_method(
  4523        DataFrameGroupBy, '__len__', reason="non-deferred-result")
  4524    groups = property(frame_base.wont_implement_method(
  4525        DataFrameGroupBy, 'groups', reason="non-deferred-result"))
  4526    indices = property(frame_base.wont_implement_method(
  4527        DataFrameGroupBy, 'indices', reason="non-deferred-result"))
  4528  
  4529    resample = frame_base.wont_implement_method(
  4530        DataFrameGroupBy, 'resample', reason='event-time-semantics')
  4531    rolling = frame_base.wont_implement_method(
  4532        DataFrameGroupBy, 'rolling', reason='event-time-semantics')
  4533    ewm = frame_base.wont_implement_method(
  4534        DataFrameGroupBy, 'ewm', reason="event-time-semantics")
  4535    expanding = frame_base.wont_implement_method(
  4536        DataFrameGroupBy, 'expanding', reason="event-time-semantics")
  4537  
  4538    tshift = frame_base.wont_implement_method(
  4539        DataFrameGroupBy, 'tshift', reason="deprecated")
  4540  
  4541  def _maybe_project_func(projection: Optional[List[str]]):
  4542    """ Returns identity func if projection is empty or None, else returns
  4543    a function that projects the specified columns. """
  4544    if projection:
  4545      return lambda df: df[projection]
  4546    else:
  4547      return lambda x: x
  4548  
  4549  
  4550  def _liftable_agg(meth, postagg_meth=None):
  4551    agg_name, _ = frame_base.name_and_func(meth)
  4552  
  4553    if postagg_meth is None:
  4554      post_agg_name = agg_name
  4555    else:
  4556      post_agg_name, _ = frame_base.name_and_func(postagg_meth)
  4557  
  4558    @frame_base.with_docs_from(DataFrameGroupBy, name=agg_name)
  4559    def wrapper(self, *args, **kwargs):
  4560      assert isinstance(self, DeferredGroupBy)
  4561  
  4562      if 'min_count' in kwargs:
  4563        return _unliftable_agg(meth)(self, *args, **kwargs)
  4564  
  4565      to_group = self._ungrouped.proxy().index
  4566      is_categorical_grouping = any(to_group.get_level_values(i).is_categorical()
  4567                                    for i in self._grouping_indexes)
  4568      groupby_kwargs = self._kwargs
  4569      group_keys = self._group_keys
  4570  
  4571      # Don't include un-observed categorical values in the preagg
  4572      preagg_groupby_kwargs = groupby_kwargs.copy()
  4573      preagg_groupby_kwargs['observed'] = True
  4574  
  4575      project = _maybe_project_func(self._projection)
  4576      pre_agg = expressions.ComputedExpression(
  4577          'pre_combine_' + agg_name,
  4578          lambda df: getattr(
  4579              project(
  4580                  df.groupby(level=list(range(df.index.nlevels)),
  4581                             group_keys=group_keys,
  4582                             **preagg_groupby_kwargs)
  4583              ),
  4584              agg_name)(**kwargs),
  4585          [self._ungrouped],
  4586          requires_partition_by=partitionings.Arbitrary(),
  4587          preserves_partition_by=partitionings.Arbitrary())
  4588  
  4589  
  4590      post_agg = expressions.ComputedExpression(
  4591          'post_combine_' + post_agg_name,
  4592          lambda df: getattr(
  4593              df.groupby(level=list(range(df.index.nlevels)),
  4594                         group_keys=group_keys,
  4595                         **groupby_kwargs),
  4596              post_agg_name)(**kwargs),
  4597          [pre_agg],
  4598          requires_partition_by=(partitionings.Singleton(reason=(
  4599              "Aggregations grouped by a categorical column are not currently "
  4600              "parallelizable (https://github.com/apache/beam/issues/21827)."
  4601          ))
  4602                                 if is_categorical_grouping
  4603                                 else partitionings.Index()),
  4604          preserves_partition_by=partitionings.Arbitrary())
  4605      return frame_base.DeferredFrame.wrap(post_agg)
  4606  
  4607    return wrapper
  4608  
  4609  
  4610  def _unliftable_agg(meth):
  4611    agg_name, _ = frame_base.name_and_func(meth)
  4612  
  4613    @frame_base.with_docs_from(DataFrameGroupBy, name=agg_name)
  4614    def wrapper(self, *args, **kwargs):
  4615      assert isinstance(self, DeferredGroupBy)
  4616  
  4617      to_group = self._ungrouped.proxy().index
  4618      group_keys = self._group_keys
  4619      is_categorical_grouping = any(to_group.get_level_values(i).is_categorical()
  4620                                    for i in self._grouping_indexes)
  4621  
  4622      groupby_kwargs = self._kwargs
  4623      project = _maybe_project_func(self._projection)
  4624      post_agg = expressions.ComputedExpression(
  4625          agg_name,
  4626          lambda df: getattr(project(
  4627              df.groupby(level=list(range(df.index.nlevels)),
  4628                         group_keys=group_keys,
  4629                         **groupby_kwargs),
  4630          ), agg_name)(**kwargs),
  4631          [self._ungrouped],
  4632          requires_partition_by=(partitionings.Singleton(reason=(
  4633              "Aggregations grouped by a categorical column are not currently "
  4634              "parallelizable (https://github.com/apache/beam/issues/21827)."
  4635          ))
  4636                                 if is_categorical_grouping
  4637                                 else partitionings.Index()),
  4638          # Some aggregation methods (e.g. corr/cov) add additional index levels.
  4639          # We only preserve the ones that existed _before_ the groupby.
  4640          preserves_partition_by=partitionings.Index(
  4641              list(range(self._ungrouped.proxy().index.nlevels))))
  4642      return frame_base.DeferredFrame.wrap(post_agg)
  4643  
  4644    return wrapper
  4645  
  4646  for meth in LIFTABLE_AGGREGATIONS:
  4647    setattr(DeferredGroupBy, meth, _liftable_agg(meth))
  4648  for meth in LIFTABLE_WITH_SUM_AGGREGATIONS:
  4649    setattr(DeferredGroupBy, meth, _liftable_agg(meth, postagg_meth='sum'))
  4650  for meth in UNLIFTABLE_AGGREGATIONS:
  4651    if meth in ('kurt', 'kurtosis'):
  4652      # pandas doesn't currently allow kurtosis on GroupBy:
  4653      # https://github.com/pandas-dev/pandas/issues/40139
  4654      continue
  4655    setattr(DeferredGroupBy, meth, _unliftable_agg(meth))
  4656  
  4657  def _check_str_or_np_builtin(agg_func, func_list):
  4658    return agg_func in func_list or (
  4659        getattr(agg_func, '__name__', None) in func_list
  4660        and agg_func.__module__ in ('numpy', 'builtins'))
  4661  
  4662  
  4663  def _is_associative(agg_func):
  4664    return _check_str_or_np_builtin(agg_func, LIFTABLE_AGGREGATIONS)
  4665  
  4666  def _is_liftable_with_sum(agg_func):
  4667    return _check_str_or_np_builtin(agg_func, LIFTABLE_WITH_SUM_AGGREGATIONS)
  4668  
  4669  def _is_unliftable(agg_func):
  4670    return _check_str_or_np_builtin(agg_func, UNLIFTABLE_AGGREGATIONS)
  4671  
  4672  NUMERIC_AGGREGATIONS = ['max', 'min', 'prod', 'sum', 'mean', 'median', 'std',
  4673                          'var', 'sem', 'mad', 'skew', 'kurt', 'kurtosis']
  4674  
  4675  def _is_numeric(agg_func):
  4676    return _check_str_or_np_builtin(agg_func, NUMERIC_AGGREGATIONS)
  4677  
  4678  
  4679  @populate_not_implemented(DataFrameGroupBy)
  4680  class _DeferredGroupByCols(frame_base.DeferredFrame):
  4681    # It's not clear that all of these make sense in Pandas either...
  4682    agg = aggregate = frame_base._elementwise_method('agg', base=DataFrameGroupBy)
  4683    any = frame_base._elementwise_method('any', base=DataFrameGroupBy)
  4684    all = frame_base._elementwise_method('all', base=DataFrameGroupBy)
  4685    boxplot = frame_base.wont_implement_method(
  4686        DataFrameGroupBy, 'boxplot', reason="plotting-tools")
  4687    describe = frame_base.not_implemented_method('describe',
  4688                                                 base_type=DataFrameGroupBy)
  4689    diff = frame_base._elementwise_method('diff', base=DataFrameGroupBy)
  4690    fillna = frame_base._elementwise_method('fillna', base=DataFrameGroupBy)
  4691    filter = frame_base._elementwise_method('filter', base=DataFrameGroupBy)
  4692    first = frame_base._elementwise_method('first', base=DataFrameGroupBy)
  4693    get_group = frame_base._elementwise_method('get_group', base=DataFrameGroupBy)
  4694    head = frame_base.wont_implement_method(
  4695        DataFrameGroupBy, 'head', explanation=_PEEK_METHOD_EXPLANATION)
  4696    hist = frame_base.wont_implement_method(
  4697        DataFrameGroupBy, 'hist', reason="plotting-tools")
  4698    idxmax = frame_base._elementwise_method('idxmax', base=DataFrameGroupBy)
  4699    idxmin = frame_base._elementwise_method('idxmin', base=DataFrameGroupBy)
  4700    last = frame_base._elementwise_method('last', base=DataFrameGroupBy)
  4701    mad = frame_base._elementwise_method('mad', base=DataFrameGroupBy)
  4702    max = frame_base._elementwise_method('max', base=DataFrameGroupBy)
  4703    mean = frame_base._elementwise_method('mean', base=DataFrameGroupBy)
  4704    median = frame_base._elementwise_method('median', base=DataFrameGroupBy)
  4705    min = frame_base._elementwise_method('min', base=DataFrameGroupBy)
  4706    nunique = frame_base._elementwise_method('nunique', base=DataFrameGroupBy)
  4707    plot = frame_base.wont_implement_method(
  4708        DataFrameGroupBy, 'plot', reason="plotting-tools")
  4709    prod = frame_base._elementwise_method('prod', base=DataFrameGroupBy)
  4710    quantile = frame_base._elementwise_method('quantile', base=DataFrameGroupBy)
  4711    shift = frame_base._elementwise_method('shift', base=DataFrameGroupBy)
  4712    size = frame_base._elementwise_method('size', base=DataFrameGroupBy)
  4713    skew = frame_base._elementwise_method('skew', base=DataFrameGroupBy)
  4714    std = frame_base._elementwise_method('std', base=DataFrameGroupBy)
  4715    sum = frame_base._elementwise_method('sum', base=DataFrameGroupBy)
  4716    tail = frame_base.wont_implement_method(
  4717        DataFrameGroupBy, 'tail', explanation=_PEEK_METHOD_EXPLANATION)
  4718    take = frame_base.wont_implement_method(
  4719        DataFrameGroupBy, 'take', reason='deprecated')
  4720    tshift = frame_base._elementwise_method('tshift', base=DataFrameGroupBy)
  4721    var = frame_base._elementwise_method('var', base=DataFrameGroupBy)
  4722  
  4723    @property # type: ignore
  4724    @frame_base.with_docs_from(DataFrameGroupBy)
  4725    def groups(self):
  4726      return self._expr.proxy().groups
  4727  
  4728    @property # type: ignore
  4729    @frame_base.with_docs_from(DataFrameGroupBy)
  4730    def indices(self):
  4731      return self._expr.proxy().indices
  4732  
  4733    @property # type: ignore
  4734    @frame_base.with_docs_from(DataFrameGroupBy)
  4735    def ndim(self):
  4736      return self._expr.proxy().ndim
  4737  
  4738    @property # type: ignore
  4739    @frame_base.with_docs_from(DataFrameGroupBy)
  4740    def ngroups(self):
  4741      return self._expr.proxy().ngroups
  4742  
  4743  
  4744  @populate_not_implemented(pd.core.indexes.base.Index)
  4745  class _DeferredIndex(object):
  4746    def __init__(self, frame):
  4747      self._frame = frame
  4748  
  4749    @property
  4750    def names(self):
  4751      return self._frame._expr.proxy().index.names
  4752  
  4753    @names.setter
  4754    def names(self, value):
  4755      def set_index_names(df):
  4756        df = df.copy()
  4757        df.index.names = value
  4758        return df
  4759  
  4760      self._frame._expr = expressions.ComputedExpression(
  4761        'set_index_names',
  4762        set_index_names,
  4763        [self._frame._expr],
  4764        requires_partition_by=partitionings.Arbitrary(),
  4765        preserves_partition_by=partitionings.Arbitrary())
  4766  
  4767    @property
  4768    def name(self):
  4769      return self._frame._expr.proxy().index.name
  4770  
  4771    @name.setter
  4772    def name(self, value):
  4773      self.names = [value]
  4774  
  4775    @property
  4776    def ndim(self):
  4777      return self._frame._expr.proxy().index.ndim
  4778  
  4779    @property
  4780    def dtype(self):
  4781      return self._frame._expr.proxy().index.dtype
  4782  
  4783    @property
  4784    def nlevels(self):
  4785      return self._frame._expr.proxy().index.nlevels
  4786  
  4787    def __getattr__(self, name):
  4788      raise NotImplementedError('index.%s' % name)
  4789  
  4790  
  4791  @populate_not_implemented(pd.core.indexing._LocIndexer)
  4792  class _DeferredLoc(object):
  4793    def __init__(self, frame):
  4794      self._frame = frame
  4795  
  4796    def __getitem__(self, key):
  4797      if isinstance(key, tuple):
  4798        rows, cols = key
  4799        return self[rows][cols]
  4800      elif isinstance(key, list) and key and isinstance(key[0], bool):
  4801        # Aligned by numerical key.
  4802        raise NotImplementedError(type(key))
  4803      elif isinstance(key, list):
  4804        # Select rows, but behaves poorly on missing values.
  4805        raise NotImplementedError(type(key))
  4806      elif isinstance(key, slice):
  4807        args = [self._frame._expr]
  4808        func = lambda df: df.loc[key]
  4809      elif isinstance(key, frame_base.DeferredFrame):
  4810        func = lambda df, key: df.loc[key]
  4811        if pd.core.dtypes.common.is_bool_dtype(key._expr.proxy()):
  4812          # Boolean indexer, just pass it in as-is
  4813          args = [self._frame._expr, key._expr]
  4814        else:
  4815          # Likely a DeferredSeries of labels, overwrite the key's index with it's
  4816          # values so we can colocate them with the labels they're selecting
  4817          def data_to_index(s):
  4818            s = s.copy()
  4819            s.index = s
  4820            return s
  4821  
  4822          reindexed_expr = expressions.ComputedExpression(
  4823              'data_to_index',
  4824              data_to_index,
  4825              [key._expr],
  4826              requires_partition_by=partitionings.Arbitrary(),
  4827              preserves_partition_by=partitionings.Singleton(),
  4828          )
  4829          args = [self._frame._expr, reindexed_expr]
  4830      elif callable(key):
  4831  
  4832        def checked_callable_key(df):
  4833          computed_index = key(df)
  4834          if isinstance(computed_index, tuple):
  4835            row_index, _ = computed_index
  4836          else:
  4837            row_index = computed_index
  4838          if isinstance(row_index, list) and row_index and isinstance(
  4839              row_index[0], bool):
  4840            raise NotImplementedError(type(row_index))
  4841          elif not isinstance(row_index, (slice, pd.Series)):
  4842            raise NotImplementedError(type(row_index))
  4843          return computed_index
  4844  
  4845        args = [self._frame._expr]
  4846        func = lambda df: df.loc[checked_callable_key]
  4847      else:
  4848        raise NotImplementedError(type(key))
  4849  
  4850      return frame_base.DeferredFrame.wrap(
  4851          expressions.ComputedExpression(
  4852              'loc',
  4853              func,
  4854              args,
  4855              requires_partition_by=(
  4856                  partitionings.JoinIndex()
  4857                  if len(args) > 1
  4858                  else partitionings.Arbitrary()),
  4859              preserves_partition_by=partitionings.Arbitrary()))
  4860  
  4861    __setitem__ = frame_base.not_implemented_method(
  4862        'loc.setitem', base_type=pd.core.indexing._LocIndexer)
  4863  
  4864  @populate_not_implemented(pd.core.indexing._iLocIndexer)
  4865  class _DeferredILoc(object):
  4866    def __init__(self, frame):
  4867      self._frame = frame
  4868  
  4869    def __getitem__(self, index):
  4870      if isinstance(index, tuple):
  4871        rows, _ = index
  4872        if rows != slice(None, None, None):
  4873          raise frame_base.WontImplementError(
  4874              "Using iloc to select rows is not supported because it's "
  4875              "position-based indexing is sensitive to the order of the data.",
  4876              reason="order-sensitive")
  4877        return frame_base.DeferredFrame.wrap(
  4878            expressions.ComputedExpression(
  4879                'iloc',
  4880                lambda df: df.iloc[index],
  4881                [self._frame._expr],
  4882                requires_partition_by=partitionings.Arbitrary(),
  4883                preserves_partition_by=partitionings.Arbitrary()))
  4884      else:
  4885        raise frame_base.WontImplementError(
  4886            "Using iloc to select rows is not supported because it's "
  4887            "position-based indexing is sensitive to the order of the data.",
  4888            reason="order-sensitive")
  4889  
  4890    def __setitem__(self, index, value):
  4891      raise frame_base.WontImplementError(
  4892          "Using iloc to mutate a frame is not supported because it's "
  4893          "position-based indexing is sensitive to the order of the data.",
  4894          reason="order-sensitive")
  4895  
  4896  
  4897  class _DeferredStringMethods(frame_base.DeferredBase):
  4898    @frame_base.with_docs_from(pd.core.strings.StringMethods)
  4899    @frame_base.args_to_kwargs(pd.core.strings.StringMethods)
  4900    @frame_base.populate_defaults(pd.core.strings.StringMethods)
  4901    def cat(self, others, join, **kwargs):
  4902      """If defined, ``others`` must be a :class:`DeferredSeries` or a ``list`` of
  4903      ``DeferredSeries``."""
  4904      if others is None:
  4905        # Concatenate series into a single String
  4906        requires = partitionings.Singleton(reason=(
  4907            "cat(others=None) concatenates all data in a Series into a single "
  4908            "string, so it requires collecting all data on a single node."
  4909        ))
  4910        func = lambda df: df.str.cat(join=join, **kwargs)
  4911        args = [self._expr]
  4912  
  4913      elif (isinstance(others, frame_base.DeferredBase) or
  4914           (isinstance(others, list) and
  4915            all(isinstance(other, frame_base.DeferredBase) for other in others))):
  4916  
  4917        if isinstance(others, frame_base.DeferredBase):
  4918          others = [others]
  4919  
  4920        requires = partitionings.Index()
  4921        def func(*args):
  4922          return args[0].str.cat(others=args[1:], join=join, **kwargs)
  4923        args = [self._expr] + [other._expr for other in others]
  4924  
  4925      else:
  4926        raise frame_base.WontImplementError(
  4927            "others must be None, DeferredSeries, or List[DeferredSeries] "
  4928            f"(encountered {type(others)}). Other types are not supported "
  4929            "because they make this operation sensitive to the order of the "
  4930            "data.", reason="order-sensitive")
  4931  
  4932      return frame_base.DeferredFrame.wrap(
  4933          expressions.ComputedExpression(
  4934              'cat',
  4935              func,
  4936              args,
  4937              requires_partition_by=requires,
  4938              preserves_partition_by=partitionings.Arbitrary()))
  4939  
  4940    @frame_base.with_docs_from(pd.core.strings.StringMethods)
  4941    @frame_base.args_to_kwargs(pd.core.strings.StringMethods)
  4942    def repeat(self, repeats):
  4943      """``repeats`` must be an ``int`` or a :class:`DeferredSeries`. Lists are
  4944      not supported because they make this operation order-sensitive."""
  4945      if isinstance(repeats, int):
  4946        return frame_base.DeferredFrame.wrap(
  4947            expressions.ComputedExpression(
  4948                'repeat',
  4949                lambda series: series.str.repeat(repeats),
  4950                [self._expr],
  4951                # TODO(https://github.com/apache/beam/issues/20573): Defer to
  4952                # pandas to compute this proxy. Currently it incorrectly infers
  4953                # dtype bool, may require upstream fix.
  4954                proxy=self._expr.proxy(),
  4955                requires_partition_by=partitionings.Arbitrary(),
  4956                preserves_partition_by=partitionings.Arbitrary()))
  4957      elif isinstance(repeats, frame_base.DeferredBase):
  4958        return frame_base.DeferredFrame.wrap(
  4959            expressions.ComputedExpression(
  4960                'repeat',
  4961                lambda series, repeats_series: series.str.repeat(repeats_series),
  4962                [self._expr, repeats._expr],
  4963                # TODO(https://github.com/apache/beam/issues/20573): Defer to
  4964                # pandas to compute this proxy. Currently it incorrectly infers
  4965                # dtype bool, may require upstream fix.
  4966                proxy=self._expr.proxy(),
  4967                requires_partition_by=partitionings.Index(),
  4968                preserves_partition_by=partitionings.Arbitrary()))
  4969      elif isinstance(repeats, list):
  4970        raise frame_base.WontImplementError(
  4971            "str.repeat(repeats=) repeats must be an int or a DeferredSeries. "
  4972            "Lists are not supported because they make this operation sensitive "
  4973            "to the order of the data.", reason="order-sensitive")
  4974      else:
  4975        raise TypeError("str.repeat(repeats=) value must be an int or a "
  4976                        f"DeferredSeries (encountered {type(repeats)}).")
  4977  
  4978    @frame_base.with_docs_from(pd.core.strings.StringMethods)
  4979    @frame_base.args_to_kwargs(pd.core.strings.StringMethods)
  4980    def get_dummies(self, **kwargs):
  4981      """
  4982      Series must be categorical dtype. Please cast to ``CategoricalDtype``
  4983      to ensure correct categories.
  4984      """
  4985      dtype = self._expr.proxy().dtype
  4986      if not isinstance(dtype, pd.CategoricalDtype):
  4987        raise frame_base.WontImplementError(
  4988            "get_dummies() of non-categorical type is not supported because "
  4989            "the type of the output column depends on the data. Please use "
  4990            "pd.CategoricalDtype with explicit categories.",
  4991            reason="non-deferred-columns")
  4992  
  4993      split_cats = [
  4994        cat.split(sep=kwargs.get('sep', '|')) for cat in dtype.categories
  4995      ]
  4996  
  4997      # Adding the nan category because there could be the case that
  4998      # the data includes NaNs, which is not valid to be casted as a Category,
  4999      # but nevertheless would be broadcasted as a column in get_dummies()
  5000      columns = sorted(set().union(*split_cats))
  5001      columns = columns + ['nan'] if 'nan' not in columns else columns
  5002  
  5003      proxy = pd.DataFrame(columns=columns).astype(int)
  5004  
  5005      return frame_base.DeferredFrame.wrap(
  5006          expressions.ComputedExpression(
  5007              'get_dummies',
  5008              lambda series: pd.concat(
  5009                [proxy, series.str.get_dummies(**kwargs)]
  5010                ).fillna(value=0, method=None).astype('int64'),
  5011              [self._expr],
  5012              proxy=proxy,
  5013              requires_partition_by=partitionings.Arbitrary(),
  5014              preserves_partition_by=partitionings.Arbitrary()))
  5015  
  5016    def _split_helper(self, rsplit=False, **kwargs):
  5017      expand = kwargs.get('expand', False)
  5018  
  5019      if not expand:
  5020        # Not creating separate columns
  5021        proxy = self._expr.proxy()
  5022        if not rsplit:
  5023          func = lambda s: pd.concat([proxy, s.str.split(**kwargs)])
  5024        else:
  5025          func = lambda s: pd.concat([proxy, s.str.rsplit(**kwargs)])
  5026      else:
  5027        # Creating separate columns, so must be more strict on dtype
  5028        dtype = self._expr.proxy().dtype
  5029        if not isinstance(dtype, pd.CategoricalDtype):
  5030          method_name = 'rsplit' if rsplit else 'split'
  5031          raise frame_base.WontImplementError(
  5032              f"{method_name}() of non-categorical type is not supported because "
  5033              "the type of the output column depends on the data. Please use "
  5034              "pd.CategoricalDtype with explicit categories.",
  5035              reason="non-deferred-columns")
  5036  
  5037        # Split the categories
  5038        split_cats = dtype.categories.str.split(**kwargs)
  5039  
  5040        # Count the number of new columns to create for proxy
  5041        max_splits = len(max(split_cats, key=len))
  5042        proxy = pd.DataFrame(columns=range(max_splits))
  5043  
  5044        def func(s):
  5045          if not rsplit:
  5046            result = s.str.split(**kwargs)
  5047          else:
  5048            result = s.str.rsplit(**kwargs)
  5049          result[~result.isna()].replace(np.nan, value=None)
  5050          return result
  5051  
  5052      return frame_base.DeferredFrame.wrap(
  5053          expressions.ComputedExpression(
  5054              'split',
  5055              func,
  5056              [self._expr],
  5057              proxy=proxy,
  5058              requires_partition_by=partitionings.Arbitrary(),
  5059              preserves_partition_by=partitionings.Arbitrary()))
  5060  
  5061    @frame_base.with_docs_from(pd.core.strings.StringMethods)
  5062    @frame_base.args_to_kwargs(pd.core.strings.StringMethods)
  5063    @frame_base.populate_defaults(pd.core.strings.StringMethods)
  5064    def split(self, **kwargs):
  5065      """
  5066      Like other non-deferred methods, dtype must be CategoricalDtype.
  5067      One exception is when ``expand`` is ``False``. Because we are not
  5068      creating new columns at construction time, dtype can be `str`.
  5069      """
  5070      return self._split_helper(rsplit=False, **kwargs)
  5071  
  5072    @frame_base.with_docs_from(pd.core.strings.StringMethods)
  5073    @frame_base.args_to_kwargs(pd.core.strings.StringMethods)
  5074    @frame_base.populate_defaults(pd.core.strings.StringMethods)
  5075    def rsplit(self, **kwargs):
  5076      """
  5077      Like other non-deferred methods, dtype must be CategoricalDtype.
  5078      One exception is when ``expand`` is ``False``. Because we are not
  5079      creating new columns at construction time, dtype can be `str`.
  5080      """
  5081      return self._split_helper(rsplit=True, **kwargs)
  5082  
  5083  
  5084  ELEMENTWISE_STRING_METHODS = [
  5085              'capitalize',
  5086              'casefold',
  5087              'contains',
  5088              'count',
  5089              'endswith',
  5090              'extract',
  5091              'findall',
  5092              'fullmatch',
  5093              'get',
  5094              'isalnum',
  5095              'isalpha',
  5096              'isdecimal',
  5097              'isdigit',
  5098              'islower',
  5099              'isnumeric',
  5100              'isspace',
  5101              'istitle',
  5102              'isupper',
  5103              'join',
  5104              'len',
  5105              'lower',
  5106              'lstrip',
  5107              'match',
  5108              'pad',
  5109              'partition',
  5110              'removeprefix',
  5111              'removesuffix',
  5112              'replace',
  5113              'rpartition',
  5114              'rstrip',
  5115              'slice',
  5116              'slice_replace',
  5117              'startswith',
  5118              'strip',
  5119              'swapcase',
  5120              'title',
  5121              'upper',
  5122              'wrap',
  5123              'zfill',
  5124              '__getitem__',
  5125  ]
  5126  
  5127  NON_ELEMENTWISE_STRING_METHODS = [
  5128              'extractall',
  5129  ]
  5130  
  5131  def make_str_func(method):
  5132    def func(df, *args, **kwargs):
  5133      try:
  5134        df_str = df.str
  5135      except AttributeError:
  5136        # If there's a non-string value in a Series passed to .str method, pandas
  5137        # will generally just replace it with NaN in the result. However if
  5138        # there are _only_ non-string values, pandas will raise:
  5139        #
  5140        #   AttributeError: Can only use .str accessor with string values!
  5141        #
  5142        # This can happen to us at execution time if we split a partition that is
  5143        # only non-strings. This branch just replaces all those values with NaN
  5144        # in that case.
  5145        return df.map(lambda _: np.nan)
  5146      else:
  5147        return getattr(df_str, method)(*args, **kwargs)
  5148  
  5149    return func
  5150  
  5151  for method in ELEMENTWISE_STRING_METHODS:
  5152    if not hasattr(pd.core.strings.StringMethods, method):
  5153      # older versions (1.0.x) don't support some of these methods
  5154      continue
  5155    setattr(_DeferredStringMethods,
  5156            method,
  5157            frame_base._elementwise_method(make_str_func(method),
  5158                                           name=method,
  5159                                           base=pd.core.strings.StringMethods))
  5160  
  5161  for method in NON_ELEMENTWISE_STRING_METHODS:
  5162    if not hasattr(pd.core.strings.StringMethods, method):
  5163      # older versions (1.0.x) don't support some of these methods
  5164      continue
  5165    setattr(_DeferredStringMethods,
  5166            method,
  5167            frame_base._proxy_method(
  5168                make_str_func(method),
  5169                name=method,
  5170                base=pd.core.strings.StringMethods,
  5171                requires_partition_by=partitionings.Arbitrary(),
  5172                preserves_partition_by=partitionings.Singleton()))
  5173  
  5174  
  5175  def make_cat_func(method):
  5176    def func(df, *args, **kwargs):
  5177      return getattr(df.cat, method)(*args, **kwargs)
  5178  
  5179    return func
  5180  
  5181  
  5182  class _DeferredCategoricalMethods(frame_base.DeferredBase):
  5183    @property  # type: ignore
  5184    @frame_base.with_docs_from(pd.core.arrays.categorical.CategoricalAccessor)
  5185    def categories(self):
  5186      return self._expr.proxy().cat.categories
  5187  
  5188    @property  # type: ignore
  5189    @frame_base.with_docs_from(pd.core.arrays.categorical.CategoricalAccessor)
  5190    def ordered(self):
  5191      return self._expr.proxy().cat.ordered
  5192  
  5193    @property  # type: ignore
  5194    @frame_base.with_docs_from(pd.core.arrays.categorical.CategoricalAccessor)
  5195    def codes(self):
  5196      return frame_base.DeferredFrame.wrap(
  5197          expressions.ComputedExpression(
  5198              'codes',
  5199              lambda s: s.cat.codes,
  5200              [self._expr],
  5201              requires_partition_by=partitionings.Arbitrary(),
  5202              preserves_partition_by=partitionings.Arbitrary(),
  5203          )
  5204      )
  5205  
  5206    remove_unused_categories = frame_base.wont_implement_method(
  5207        pd.core.arrays.categorical.CategoricalAccessor,
  5208        'remove_unused_categories', reason="non-deferred-columns")
  5209  
  5210  ELEMENTWISE_CATEGORICAL_METHODS = [
  5211      'add_categories',
  5212      'as_ordered',
  5213      'as_unordered',
  5214      'remove_categories',
  5215      'rename_categories',
  5216      'reorder_categories',
  5217      'set_categories',
  5218  ]
  5219  
  5220  for method in ELEMENTWISE_CATEGORICAL_METHODS:
  5221    setattr(_DeferredCategoricalMethods,
  5222            method,
  5223            frame_base._elementwise_method(
  5224                make_cat_func(method), name=method,
  5225                base=pd.core.arrays.categorical.CategoricalAccessor))
  5226  
  5227  class _DeferredDatetimeMethods(frame_base.DeferredBase):
  5228    @property  # type: ignore
  5229    @frame_base.with_docs_from(pd.core.indexes.accessors.DatetimeProperties)
  5230    def tz(self):
  5231      return self._expr.proxy().dt.tz
  5232  
  5233    @property  # type: ignore
  5234    @frame_base.with_docs_from(pd.core.indexes.accessors.DatetimeProperties)
  5235    def freq(self):
  5236      return self._expr.proxy().dt.freq
  5237  
  5238    @frame_base.with_docs_from(pd.core.indexes.accessors.DatetimeProperties)
  5239    def tz_localize(self, *args, ambiguous='infer', **kwargs):
  5240      """``ambiguous`` cannot be set to ``"infer"`` as its semantics are
  5241      order-sensitive. Similarly, specifying ``ambiguous`` as an
  5242      :class:`~numpy.ndarray` is order-sensitive, but you can achieve similar
  5243      functionality by specifying ``ambiguous`` as a Series."""
  5244      if isinstance(ambiguous, np.ndarray):
  5245        raise frame_base.WontImplementError(
  5246            "tz_localize(ambiguous=ndarray) is not supported because it makes "
  5247            "this operation sensitive to the order of the data. Please use a "
  5248            "DeferredSeries instead.",
  5249            reason="order-sensitive")
  5250      elif isinstance(ambiguous, frame_base.DeferredFrame):
  5251        return frame_base.DeferredFrame.wrap(
  5252            expressions.ComputedExpression(
  5253                'tz_localize',
  5254                lambda s,
  5255                ambiguous: s.dt.tz_localize(*args, ambiguous=ambiguous, **kwargs),
  5256                [self._expr, ambiguous._expr],
  5257                requires_partition_by=partitionings.Index(),
  5258                preserves_partition_by=partitionings.Arbitrary()))
  5259      elif ambiguous == 'infer':
  5260        # infer attempts to infer based on the order of the timestamps
  5261        raise frame_base.WontImplementError(
  5262            f"tz_localize(ambiguous={ambiguous!r}) is not allowed because it "
  5263            "makes this operation sensitive to the order of the data.",
  5264            reason="order-sensitive")
  5265  
  5266      return frame_base.DeferredFrame.wrap(
  5267          expressions.ComputedExpression(
  5268              'tz_localize',
  5269              lambda s: s.dt.tz_localize(*args, ambiguous=ambiguous, **kwargs),
  5270              [self._expr],
  5271              requires_partition_by=partitionings.Arbitrary(),
  5272              preserves_partition_by=partitionings.Arbitrary()))
  5273  
  5274  
  5275    to_period = frame_base.wont_implement_method(
  5276        pd.core.indexes.accessors.DatetimeProperties, 'to_period',
  5277        reason="event-time-semantics")
  5278    to_pydatetime = frame_base.wont_implement_method(
  5279        pd.core.indexes.accessors.DatetimeProperties, 'to_pydatetime',
  5280        reason="non-deferred-result")
  5281    to_pytimedelta = frame_base.wont_implement_method(
  5282        pd.core.indexes.accessors.DatetimeProperties, 'to_pytimedelta',
  5283        reason="non-deferred-result")
  5284  
  5285  def make_dt_property(method):
  5286    def func(df):
  5287      return getattr(df.dt, method)
  5288  
  5289    return func
  5290  
  5291  def make_dt_func(method):
  5292    def func(df, *args, **kwargs):
  5293      return getattr(df.dt, method)(*args, **kwargs)
  5294  
  5295    return func
  5296  
  5297  
  5298  ELEMENTWISE_DATETIME_METHODS = [
  5299    'ceil',
  5300    'day_name',
  5301    'month_name',
  5302    'floor',
  5303    'isocalendar',
  5304    'round',
  5305    'normalize',
  5306    'strftime',
  5307    'tz_convert',
  5308  ]
  5309  
  5310  for method in ELEMENTWISE_DATETIME_METHODS:
  5311    if not hasattr(pd.core.indexes.accessors.DatetimeProperties, method):
  5312      # older versions (1.0.x) don't support some of these methods
  5313      continue
  5314    setattr(_DeferredDatetimeMethods,
  5315            method,
  5316            frame_base._elementwise_method(
  5317                make_dt_func(method),
  5318                name=method,
  5319                base=pd.core.indexes.accessors.DatetimeProperties))
  5320  
  5321  ELEMENTWISE_DATETIME_PROPERTIES = [
  5322    'date',
  5323    'day',
  5324    'dayofweek',
  5325    'dayofyear',
  5326    'days_in_month',
  5327    'daysinmonth',
  5328    'hour',
  5329    'is_leap_year',
  5330    'is_month_end',
  5331    'is_month_start',
  5332    'is_quarter_end',
  5333    'is_quarter_start',
  5334    'is_year_end',
  5335    'is_year_start',
  5336    'microsecond',
  5337    'minute',
  5338    'month',
  5339    'nanosecond',
  5340    'quarter',
  5341    'second',
  5342    'time',
  5343    'timetz',
  5344    'week',
  5345    'weekday',
  5346    'weekofyear',
  5347    'year',
  5348  ]
  5349  
  5350  for method in ELEMENTWISE_DATETIME_PROPERTIES:
  5351    setattr(_DeferredDatetimeMethods,
  5352            method,
  5353            property(frame_base._elementwise_method(
  5354                make_dt_property(method),
  5355                name=method,
  5356                base=pd.core.indexes.accessors.DatetimeProperties)))
  5357  
  5358  
  5359  for base in ['add',
  5360               'sub',
  5361               'mul',
  5362               'div',
  5363               'truediv',
  5364               'floordiv',
  5365               'mod',
  5366               'divmod',
  5367               'pow',
  5368               'and',
  5369               'or']:
  5370    for p in ['%s', 'r%s', '__%s__', '__r%s__']:
  5371      # TODO: non-trivial level?
  5372      name = p % base
  5373      if hasattr(pd.Series, name):
  5374        setattr(
  5375            DeferredSeries,
  5376            name,
  5377            frame_base._elementwise_method(name, restrictions={'level': None},
  5378                                           base=pd.Series))
  5379      if hasattr(pd.DataFrame, name):
  5380        setattr(
  5381            DeferredDataFrame,
  5382            name,
  5383            frame_base._elementwise_method(name, restrictions={'level': None},
  5384                                           base=pd.DataFrame))
  5385    inplace_name = '__i%s__' % base
  5386    if hasattr(pd.Series, inplace_name):
  5387      setattr(
  5388          DeferredSeries,
  5389          inplace_name,
  5390          frame_base._elementwise_method(inplace_name, inplace=True,
  5391                                         base=pd.Series))
  5392    if hasattr(pd.DataFrame, inplace_name):
  5393      setattr(
  5394          DeferredDataFrame,
  5395          inplace_name,
  5396          frame_base._elementwise_method(inplace_name, inplace=True,
  5397                                         base=pd.DataFrame))
  5398  
  5399  # Allow dataframe | SchemaTransform
  5400  def _create_maybe_elementwise_or(base):
  5401    elementwise = frame_base._elementwise_method(
  5402        '__or__', restrictions={'level': None}, base=base)
  5403  
  5404    def _maybe_elementwise_or(self, right):
  5405      if isinstance(right, PTransform):
  5406        return convert.to_dataframe(convert.to_pcollection(self) | right)
  5407      else:
  5408        return elementwise(self, right)
  5409  
  5410    return _maybe_elementwise_or
  5411  
  5412  
  5413  DeferredSeries.__or__ = _create_maybe_elementwise_or(pd.Series)  # type: ignore
  5414  DeferredDataFrame.__or__ = _create_maybe_elementwise_or(pd.DataFrame)  # type: ignore
  5415  
  5416  
  5417  for name in ['lt', 'le', 'gt', 'ge', 'eq', 'ne']:
  5418    for p in '%s', '__%s__':
  5419      # Note that non-underscore name is used for both as the __xxx__ methods are
  5420      # order-sensitive.
  5421      setattr(DeferredSeries, p % name,
  5422              frame_base._elementwise_method(name, base=pd.Series))
  5423      setattr(DeferredDataFrame, p % name,
  5424              frame_base._elementwise_method(name, base=pd.DataFrame))
  5425  
  5426  for name in ['__neg__', '__pos__', '__invert__']:
  5427    setattr(DeferredSeries, name,
  5428            frame_base._elementwise_method(name, base=pd.Series))
  5429    setattr(DeferredDataFrame, name,
  5430            frame_base._elementwise_method(name, base=pd.DataFrame))
  5431  
  5432  DeferredSeries.multiply = DeferredSeries.mul  # type: ignore
  5433  DeferredDataFrame.multiply = DeferredDataFrame.mul  # type: ignore
  5434  DeferredSeries.subtract = DeferredSeries.sub  # type: ignore
  5435  DeferredDataFrame.subtract = DeferredDataFrame.sub  # type: ignore
  5436  DeferredSeries.divide = DeferredSeries.div  # type: ignore
  5437  DeferredDataFrame.divide = DeferredDataFrame.div  # type: ignore
  5438  
  5439  
  5440  def _slice_parts(s):
  5441    yield s.start
  5442    yield s.stop
  5443    yield s.step
  5444  
  5445  def _is_null_slice(s):
  5446    return isinstance(s, slice) and all(x is None for x in _slice_parts(s))
  5447  
  5448  def _is_integer_slice(s):
  5449    return isinstance(s, slice) and all(
  5450        x is None or isinstance(x, int)
  5451        for x in _slice_parts(s)) and not _is_null_slice(s)