github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/frames.py (about) 1 # 2 # Licensed to the Apache Software Foundation (ASF) under one or more 3 # contributor license agreements. See the NOTICE file distributed with 4 # this work for additional information regarding copyright ownership. 5 # The ASF licenses this file to You under the Apache License, Version 2.0 6 # (the "License"); you may not use this file except in compliance with 7 # the License. You may obtain a copy of the License at 8 # 9 # http://www.apache.org/licenses/LICENSE-2.0 10 # 11 # Unless required by applicable law or agreed to in writing, software 12 # distributed under the License is distributed on an "AS IS" BASIS, 13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 # See the License for the specific language governing permissions and 15 # limitations under the License. 16 17 """Analogs for :class:`pandas.DataFrame` and :class:`pandas.Series`: 18 :class:`DeferredDataFrame` and :class:`DeferredSeries`. 19 20 These classes are effectively wrappers around a `schema-aware`_ 21 :class:`~apache_beam.pvalue.PCollection` that provide a set of operations 22 compatible with the `pandas`_ API. 23 24 Note that we aim for the Beam DataFrame API to be completely compatible with 25 the pandas API, but there are some features that are currently unimplemented 26 for various reasons. Pay particular attention to the **'Differences from 27 pandas'** section for each operation to understand where we diverge. 28 29 .. _schema-aware: 30 https://beam.apache.org/documentation/programming-guide/#what-is-a-schema 31 .. _pandas: 32 https://pandas.pydata.org/ 33 """ 34 35 import collections 36 import inspect 37 import itertools 38 import math 39 import re 40 import warnings 41 from typing import List 42 from typing import Optional 43 44 import numpy as np 45 import pandas as pd 46 from pandas._libs import lib 47 from pandas.api.types import is_float_dtype 48 from pandas.api.types import is_int64_dtype 49 from pandas.api.types import is_list_like 50 from pandas.core.groupby.generic import DataFrameGroupBy 51 52 from apache_beam.dataframe import convert 53 from apache_beam.dataframe import expressions 54 from apache_beam.dataframe import frame_base 55 from apache_beam.dataframe import io 56 from apache_beam.dataframe import partitionings 57 from apache_beam.transforms import PTransform 58 59 __all__ = [ 60 'DeferredSeries', 61 'DeferredDataFrame', 62 ] 63 64 # Get major, minor version 65 PD_VERSION = tuple(map(int, pd.__version__.split('.')[0:2])) 66 67 68 def populate_not_implemented(pd_type): 69 def wrapper(deferred_type): 70 for attr in dir(pd_type): 71 # Don't auto-define hidden methods or dunders 72 if attr.startswith('_'): 73 continue 74 if not hasattr(deferred_type, attr): 75 pd_value = getattr(pd_type, attr) 76 if isinstance(pd_value, property) or inspect.isclass(pd_value): 77 # Some of the properties on pandas types (cat, dt, sparse), are 78 # actually attributes with class values, not properties 79 setattr( 80 deferred_type, 81 attr, 82 property( 83 frame_base.not_implemented_method(attr, base_type=pd_type))) 84 elif callable(pd_value): 85 setattr( 86 deferred_type, 87 attr, 88 frame_base.not_implemented_method(attr, base_type=pd_type)) 89 return deferred_type 90 91 return wrapper 92 93 94 def _fillna_alias(method): 95 def wrapper(self, *args, **kwargs): 96 return self.fillna(*args, method=method, **kwargs) 97 98 wrapper.__name__ = method 99 wrapper.__doc__ = ( 100 f'{method} is only supported for axis="columns". ' 101 'axis="index" is order-sensitive.') 102 103 return frame_base.with_docs_from(pd.DataFrame)( 104 frame_base.args_to_kwargs(pd.DataFrame)( 105 frame_base.populate_defaults(pd.DataFrame)(wrapper))) 106 107 108 # These aggregations are commutative and associative, they can be trivially 109 # "lifted" (i.e. we can pre-aggregate on partitions, group, then post-aggregate) 110 LIFTABLE_AGGREGATIONS = ['all', 'any', 'max', 'min', 'prod', 'sum'] 111 # These aggregations can be lifted if post-aggregated with "sum" 112 LIFTABLE_WITH_SUM_AGGREGATIONS = ['size', 'count'] 113 UNLIFTABLE_AGGREGATIONS = [ 114 'mean', 115 'median', 116 'quantile', 117 'describe', 118 'sem', 119 'mad', 120 'skew', 121 'kurt', 122 'kurtosis', 123 'std', 124 'var', 125 'corr', 126 'cov', 127 'nunique', 128 ] 129 ALL_AGGREGATIONS = ( 130 LIFTABLE_AGGREGATIONS + LIFTABLE_WITH_SUM_AGGREGATIONS + 131 UNLIFTABLE_AGGREGATIONS) 132 133 # These aggregations have specialized distributed implementations on 134 # DeferredSeries, which are re-used in DeferredFrame. Note they are *not* used 135 # for grouped aggregations, since they generally require tracking multiple 136 # intermediate series, which is difficult to lift in groupby. 137 HAND_IMPLEMENTED_GLOBAL_AGGREGATIONS = { 138 'quantile', 139 'std', 140 'var', 141 'mean', 142 'nunique', 143 'corr', 144 'cov', 145 'skew', 146 'kurt', 147 'kurtosis' 148 } 149 UNLIFTABLE_GLOBAL_AGGREGATIONS = ( 150 set(UNLIFTABLE_AGGREGATIONS) - set(HAND_IMPLEMENTED_GLOBAL_AGGREGATIONS)) 151 152 153 def _agg_method(base, func): 154 def wrapper(self, *args, **kwargs): 155 return self.agg(func, *args, **kwargs) 156 157 if func in UNLIFTABLE_GLOBAL_AGGREGATIONS: 158 wrapper.__doc__ = ( 159 f"``{func}`` cannot currently be parallelized. It will " 160 "require collecting all data on a single node.") 161 wrapper.__name__ = func 162 163 return frame_base.with_docs_from(base)(wrapper) 164 165 166 # Docstring to use for head and tail (commonly used to peek at datasets) 167 _PEEK_METHOD_EXPLANATION = ( 168 "because it is `order-sensitive " 169 "<https://s.apache.org/dataframe-order-sensitive-operations>`_.\n\n" 170 "If you want to peek at a large dataset consider using interactive Beam's " 171 ":func:`ib.collect " 172 "<apache_beam.runners.interactive.interactive_beam.collect>` " 173 "with ``n`` specified, or :meth:`sample`. If you want to find the " 174 "N largest elements, consider using :meth:`DeferredDataFrame.nlargest`.") 175 176 177 class DeferredDataFrameOrSeries(frame_base.DeferredFrame): 178 def _render_indexes(self): 179 if self.index.nlevels == 1: 180 return 'index=' + ( 181 '<unnamed>' if self.index.name is None else repr(self.index.name)) 182 else: 183 return 'indexes=[' + ', '.join( 184 '<unnamed>' if ix is None else repr(ix) 185 for ix in self.index.names) + ']' 186 187 __array__ = frame_base.wont_implement_method( 188 pd.Series, '__array__', reason="non-deferred-result") 189 190 @frame_base.with_docs_from(pd.DataFrame) 191 @frame_base.args_to_kwargs(pd.DataFrame) 192 @frame_base.populate_defaults(pd.DataFrame) 193 @frame_base.maybe_inplace 194 def drop(self, labels, axis, index, columns, errors, **kwargs): 195 """drop is not parallelizable when dropping from the index and 196 ``errors="raise"`` is specified. It requires collecting all data on a single 197 node in order to detect if one of the index values is missing.""" 198 if labels is not None: 199 if index is not None or columns is not None: 200 raise ValueError("Cannot specify both 'labels' and 'index'/'columns'") 201 if axis in (0, 'index'): 202 index = labels 203 columns = None 204 elif axis in (1, 'columns'): 205 index = None 206 columns = labels 207 else: 208 raise ValueError( 209 "axis must be one of (0, 1, 'index', 'columns'), " 210 "got '%s'" % axis) 211 212 if columns is not None: 213 # Compute the proxy based on just the columns that are dropped. 214 proxy = self._expr.proxy().drop(columns=columns, errors=errors) 215 else: 216 proxy = self._expr.proxy() 217 218 if index is not None and errors == 'raise': 219 # In order to raise an error about missing index values, we'll 220 # need to collect the entire dataframe. 221 # TODO: This could be parallelized by putting index values in a 222 # ConstantExpression and partitioning by index. 223 requires = partitionings.Singleton( 224 reason=( 225 "drop(errors='raise', axis='index') is not currently " 226 "parallelizable. This requires collecting all data on a single " 227 f"node in order to detect if one of {index!r} is missing.")) 228 else: 229 requires = partitionings.Arbitrary() 230 231 return frame_base.DeferredFrame.wrap( 232 expressions.ComputedExpression( 233 'drop', 234 lambda df: df.drop( 235 axis=axis, 236 index=index, 237 columns=columns, 238 errors=errors, 239 **kwargs), [self._expr], 240 proxy=proxy, 241 requires_partition_by=requires)) 242 243 @frame_base.with_docs_from(pd.DataFrame) 244 @frame_base.args_to_kwargs(pd.DataFrame) 245 @frame_base.populate_defaults(pd.DataFrame) 246 def droplevel(self, level, axis): 247 return frame_base.DeferredFrame.wrap( 248 expressions.ComputedExpression( 249 'droplevel', 250 lambda df: df.droplevel(level, axis=axis), [self._expr], 251 requires_partition_by=partitionings.Arbitrary(), 252 preserves_partition_by=partitionings.Arbitrary() 253 if axis in (1, 'column') else partitionings.Singleton())) 254 255 @frame_base.with_docs_from(pd.DataFrame) 256 @frame_base.args_to_kwargs(pd.DataFrame) 257 def swaplevel(self, **kwargs): 258 return frame_base.DeferredFrame.wrap( 259 expressions.ComputedExpression( 260 'swaplevel', 261 lambda df: df.swaplevel(**kwargs), [self._expr], 262 requires_partition_by=partitionings.Arbitrary(), 263 preserves_partition_by=partitionings.Arbitrary())) 264 265 @frame_base.with_docs_from(pd.DataFrame) 266 @frame_base.args_to_kwargs(pd.DataFrame) 267 @frame_base.populate_defaults(pd.DataFrame) 268 @frame_base.maybe_inplace 269 def fillna(self, value, method, axis, limit, **kwargs): 270 """When ``axis="index"``, both ``method`` and ``limit`` must be ``None``. 271 otherwise this operation is order-sensitive.""" 272 # Default value is None, but is overriden with index. 273 axis = axis or 'index' 274 275 if axis in (0, 'index'): 276 if method is not None: 277 raise frame_base.WontImplementError( 278 f"fillna(method={method!r}, axis={axis!r}) is not supported " 279 "because it is order-sensitive. Only fillna(method=None) is " 280 f"supported with axis={axis!r}.", 281 reason="order-sensitive") 282 if limit is not None: 283 raise frame_base.WontImplementError( 284 f"fillna(limit={method!r}, axis={axis!r}) is not supported because " 285 "it is order-sensitive. Only fillna(limit=None) is supported with " 286 f"axis={axis!r}.", 287 reason="order-sensitive") 288 289 if isinstance(self, DeferredDataFrame) and isinstance(value, 290 DeferredSeries): 291 # If self is a DataFrame and value is a Series we want to broadcast value 292 # to all partitions of self. 293 # This is OK, as its index must be the same size as the columns set of 294 # self, so cannot be too large. 295 class AsScalar(object): 296 def __init__(self, value): 297 self.value = value 298 299 with expressions.allow_non_parallel_operations(): 300 value_expr = expressions.ComputedExpression( 301 'as_scalar', 302 lambda df: AsScalar(df), [value._expr], 303 requires_partition_by=partitionings.Singleton()) 304 305 get_value = lambda x: x.value 306 requires = partitionings.Arbitrary() 307 elif isinstance(value, frame_base.DeferredBase): 308 # For other DeferredBase combinations, use Index partitioning to 309 # co-locate on the Index 310 value_expr = value._expr 311 get_value = lambda x: x 312 requires = partitionings.Index() 313 else: 314 # Default case, pass value through as a constant, no particular 315 # partitioning requirement 316 value_expr = expressions.ConstantExpression(value) 317 get_value = lambda x: x 318 requires = partitionings.Arbitrary() 319 320 return frame_base.DeferredFrame.wrap( 321 # yapf: disable 322 expressions.ComputedExpression( 323 'fillna', 324 lambda df, 325 value: df.fillna( 326 get_value(value), 327 method=method, 328 axis=axis, 329 limit=limit, 330 **kwargs), [self._expr, value_expr], 331 preserves_partition_by=partitionings.Arbitrary(), 332 requires_partition_by=requires)) 333 334 if hasattr(pd.DataFrame, 'ffill'): 335 ffill = _fillna_alias('ffill') 336 if hasattr(pd.DataFrame, 'bfill'): 337 bfill = _fillna_alias('bfill') 338 if hasattr(pd.DataFrame, 'backfill'): 339 backfill = _fillna_alias('backfill') 340 if hasattr(pd.DataFrame, 'pad'): 341 pad = _fillna_alias('pad') 342 343 @frame_base.with_docs_from(pd.DataFrame) 344 def first(self, offset): 345 per_partition = expressions.ComputedExpression( 346 'first-per-partition', 347 lambda df: df.sort_index().first(offset=offset), [self._expr], 348 preserves_partition_by=partitionings.Arbitrary(), 349 requires_partition_by=partitionings.Arbitrary()) 350 with expressions.allow_non_parallel_operations(True): 351 return frame_base.DeferredFrame.wrap( 352 expressions.ComputedExpression( 353 'first', 354 lambda df: df.sort_index().first(offset=offset), [per_partition], 355 preserves_partition_by=partitionings.Arbitrary(), 356 requires_partition_by=partitionings.Singleton())) 357 358 @frame_base.with_docs_from(pd.DataFrame) 359 def last(self, offset): 360 per_partition = expressions.ComputedExpression( 361 'last-per-partition', 362 lambda df: df.sort_index().last(offset=offset), [self._expr], 363 preserves_partition_by=partitionings.Arbitrary(), 364 requires_partition_by=partitionings.Arbitrary()) 365 with expressions.allow_non_parallel_operations(True): 366 return frame_base.DeferredFrame.wrap( 367 expressions.ComputedExpression( 368 'last', 369 lambda df: df.sort_index().last(offset=offset), [per_partition], 370 preserves_partition_by=partitionings.Arbitrary(), 371 requires_partition_by=partitionings.Singleton())) 372 373 @frame_base.with_docs_from(pd.DataFrame) 374 @frame_base.args_to_kwargs(pd.DataFrame) 375 @frame_base.populate_defaults(pd.DataFrame) 376 def groupby(self, by, level, axis, as_index, group_keys, **kwargs): 377 """``as_index`` must be ``True``. 378 379 Aggregations grouping by a categorical column with ``observed=False`` set 380 are not currently parallelizable 381 (`Issue 21827 <https://github.com/apache/beam/issues/21827>`_). 382 """ 383 if not as_index: 384 raise NotImplementedError('groupby(as_index=False)') 385 386 if axis in (1, 'columns'): 387 return _DeferredGroupByCols( 388 expressions.ComputedExpression( 389 'groupbycols', 390 lambda df: df.groupby( 391 by, axis=axis, group_keys=group_keys, **kwargs), [self._expr], 392 requires_partition_by=partitionings.Arbitrary(), 393 preserves_partition_by=partitionings.Arbitrary()), 394 group_keys=group_keys) 395 396 if level is None and by is None: 397 raise TypeError("You have to supply one of 'by' and 'level'") 398 399 elif level is not None: 400 if isinstance(level, (list, tuple)): 401 grouping_indexes = level 402 else: 403 grouping_indexes = [level] 404 405 grouping_columns = [] 406 407 index = self._expr.proxy().index 408 409 # Translate to level numbers only 410 grouping_indexes = [ 411 l if isinstance(l, int) else index.names.index(l) 412 for l in grouping_indexes 413 ] 414 415 if index.nlevels == 1: 416 to_group_with_index = self._expr 417 to_group = self._expr 418 else: 419 levels_to_drop = [ 420 i for i in range(index.nlevels) if i not in grouping_indexes 421 ] 422 423 # Reorder so the grouped indexes are first 424 to_group_with_index = self.reorder_levels( 425 grouping_indexes + levels_to_drop) 426 427 grouping_indexes = list(range(len(grouping_indexes))) 428 levels_to_drop = list(range(len(grouping_indexes), index.nlevels)) 429 if levels_to_drop: 430 to_group = to_group_with_index.droplevel(levels_to_drop)._expr 431 else: 432 to_group = to_group_with_index._expr 433 to_group_with_index = to_group_with_index._expr 434 435 elif callable(by): 436 437 def map_index(df): 438 df = df.copy() 439 df.index = df.index.map(by) 440 return df 441 442 to_group = expressions.ComputedExpression( 443 'map_index', 444 map_index, [self._expr], 445 requires_partition_by=partitionings.Arbitrary(), 446 preserves_partition_by=partitionings.Singleton()) 447 448 orig_nlevels = self._expr.proxy().index.nlevels 449 450 def prepend_mapped_index(df): 451 df = df.copy() 452 453 index = df.index.to_frame() 454 index.insert(0, None, df.index.map(by)) 455 456 df.index = pd.MultiIndex.from_frame( 457 index, names=[None] + list(df.index.names)) 458 return df 459 460 to_group_with_index = expressions.ComputedExpression( 461 'map_index_keep_orig', 462 prepend_mapped_index, 463 [self._expr], 464 requires_partition_by=partitionings.Arbitrary(), 465 # Partitioning by the original indexes is preserved 466 preserves_partition_by=partitionings.Index( 467 list(range(1, orig_nlevels + 1)))) 468 469 grouping_columns = [] 470 # The index we need to group by is the last one 471 grouping_indexes = [0] 472 473 elif isinstance(by, DeferredSeries): 474 if isinstance(self, DeferredSeries): 475 476 def set_index(s, by): 477 df = pd.DataFrame(s) 478 df, by = df.align(by, axis=0, join='inner') 479 return df.set_index(by).iloc[:, 0] 480 481 def prepend_index(s, by): 482 df = pd.DataFrame(s) 483 df, by = df.align(by, axis=0, join='inner') 484 return df.set_index([by, df.index]).iloc[:, 0] 485 486 else: 487 488 def set_index(df, by): # type: ignore 489 df, by = df.align(by, axis=0, join='inner') 490 return df.set_index(by) 491 492 def prepend_index(df, by): # type: ignore 493 df, by = df.align(by, axis=0, join='inner') 494 return df.set_index([by, df.index]) 495 496 to_group = expressions.ComputedExpression( 497 'set_index', 498 set_index, [self._expr, by._expr], 499 requires_partition_by=partitionings.Index(), 500 preserves_partition_by=partitionings.Singleton()) 501 502 orig_nlevels = self._expr.proxy().index.nlevels 503 to_group_with_index = expressions.ComputedExpression( 504 'prependindex', 505 prepend_index, [self._expr, by._expr], 506 requires_partition_by=partitionings.Index(), 507 preserves_partition_by=partitionings.Index( 508 list(range(1, orig_nlevels + 1)))) 509 510 grouping_columns = [] 511 grouping_indexes = [0] 512 513 elif isinstance(by, np.ndarray): 514 raise frame_base.WontImplementError( 515 "Grouping by a concrete ndarray is order sensitive.", 516 reason="order-sensitive") 517 518 elif isinstance(self, DeferredDataFrame): 519 if not isinstance(by, list): 520 by = [by] 521 # Find the columns that we need to move into the index so we can group by 522 # them 523 column_names = self._expr.proxy().columns 524 grouping_columns = list(set(by).intersection(column_names)) 525 index_names = self._expr.proxy().index.names 526 for label in by: 527 if label not in index_names and label not in self._expr.proxy().columns: 528 raise KeyError(label) 529 grouping_indexes = list(set(by).intersection(index_names)) 530 531 if grouping_indexes: 532 if set(by) == set(index_names): 533 to_group = self._expr 534 elif set(by).issubset(index_names): 535 to_group = self.droplevel(index_names.difference(by))._expr 536 else: 537 to_group = self.reset_index(grouping_indexes).set_index(by)._expr 538 else: 539 to_group = self.set_index(by)._expr 540 541 if grouping_columns: 542 # TODO(https://github.com/apache/beam/issues/20759): 543 # It should be possible to do this without creating 544 # an expression manually, by using DeferredDataFrame.set_index, i.e.: 545 # to_group_with_index = self.set_index([self.index] + 546 # grouping_columns)._expr 547 to_group_with_index = expressions.ComputedExpression( 548 'move_grouped_columns_to_index', 549 lambda df: df.set_index([df.index] + grouping_columns, drop=False), 550 [self._expr], 551 requires_partition_by=partitionings.Arbitrary(), 552 preserves_partition_by=partitionings.Index( 553 list(range(self._expr.proxy().index.nlevels)))) 554 else: 555 to_group_with_index = self._expr 556 557 else: 558 raise NotImplementedError(by) 559 560 return DeferredGroupBy( 561 expressions.ComputedExpression( 562 'groupbyindex', 563 lambda df: df.groupby( 564 level=list(range(df.index.nlevels)), 565 group_keys=group_keys, 566 **kwargs), [to_group], 567 requires_partition_by=partitionings.Index(), 568 preserves_partition_by=partitionings.Arbitrary()), 569 kwargs, 570 to_group, 571 to_group_with_index, 572 grouping_columns=grouping_columns, 573 grouping_indexes=grouping_indexes, 574 group_keys=group_keys) 575 576 @property # type: ignore 577 @frame_base.with_docs_from(pd.DataFrame) 578 def loc(self): 579 return _DeferredLoc(self) 580 581 @property # type: ignore 582 @frame_base.with_docs_from(pd.DataFrame) 583 def iloc(self): 584 """Position-based indexing with `iloc` is order-sensitive in almost every 585 case. Beam DataFrame users should prefer label-based indexing with `loc`. 586 """ 587 return _DeferredILoc(self) 588 589 @frame_base.with_docs_from(pd.DataFrame) 590 @frame_base.args_to_kwargs(pd.DataFrame) 591 @frame_base.populate_defaults(pd.DataFrame) 592 @frame_base.maybe_inplace 593 def reset_index(self, level=None, **kwargs): 594 """Dropping the entire index (e.g. with ``reset_index(level=None)``) is 595 not parallelizable. It is also only guaranteed that the newly generated 596 index values will be unique. The Beam DataFrame API makes no guarantee 597 that the same index values as the equivalent pandas operation will be 598 generated, because that implementation is order-sensitive.""" 599 if level is not None and not isinstance(level, (tuple, list)): 600 level = [level] 601 if level is None or len(level) == self._expr.proxy().index.nlevels: 602 # TODO(https://github.com/apache/beam/issues/20859): 603 # Could do distributed re-index with offsets. 604 requires_partition_by = partitionings.Singleton( 605 reason=( 606 f"reset_index(level={level!r}) drops the entire index and " 607 "creates a new one, so it cannot currently be parallelized " 608 "(https://github.com/apache/beam/issues/20859).")) 609 else: 610 requires_partition_by = partitionings.Arbitrary() 611 return frame_base.DeferredFrame.wrap( 612 expressions.ComputedExpression( 613 'reset_index', 614 lambda df: df.reset_index(level=level, **kwargs), [self._expr], 615 preserves_partition_by=partitionings.Singleton(), 616 requires_partition_by=requires_partition_by)) 617 618 abs = frame_base._elementwise_method('abs', base=pd.core.generic.NDFrame) 619 620 @frame_base.with_docs_from(pd.core.generic.NDFrame) 621 @frame_base.args_to_kwargs(pd.core.generic.NDFrame) 622 @frame_base.populate_defaults(pd.core.generic.NDFrame) 623 def astype(self, dtype, copy, errors): 624 """astype is not parallelizable when ``errors="ignore"`` is specified. 625 626 ``copy=False`` is not supported because it relies on memory-sharing 627 semantics. 628 629 ``dtype="category`` is not supported because the type of the output column 630 depends on the data. Please use ``pd.CategoricalDtype`` with explicit 631 categories instead. 632 """ 633 requires = partitionings.Arbitrary() 634 635 if errors == "ignore": 636 # We need all data in order to ignore errors and propagate the original 637 # data. 638 requires = partitionings.Singleton( 639 reason=( 640 f"astype(errors={errors!r}) is currently not parallelizable, " 641 "because all data must be collected on one node to determine if " 642 "the original data should be propagated instead.")) 643 644 if not copy: 645 raise frame_base.WontImplementError( 646 f"astype(copy={copy!r}) is not supported because it relies on " 647 "memory-sharing semantics that are not compatible with the Beam " 648 "model.") 649 650 # An instance of CategoricalDtype is actualy considered equal to the string 651 # 'category', so we have to explicitly check if dtype is an instance of 652 # CategoricalDtype, and allow it. 653 # See https://github.com/apache/beam/issues/23276 654 if dtype == 'category' and not isinstance(dtype, pd.CategoricalDtype): 655 raise frame_base.WontImplementError( 656 "astype(dtype='category') is not supported because the type of the " 657 "output column depends on the data. Please use pd.CategoricalDtype " 658 "with explicit categories instead.", 659 reason="non-deferred-columns") 660 661 return frame_base.DeferredFrame.wrap( 662 expressions.ComputedExpression( 663 'astype', 664 lambda df: df.astype(dtype=dtype, copy=copy, errors=errors), 665 [self._expr], 666 requires_partition_by=requires, 667 preserves_partition_by=partitionings.Arbitrary())) 668 669 at_time = frame_base._elementwise_method( 670 'at_time', base=pd.core.generic.NDFrame) 671 between_time = frame_base._elementwise_method( 672 'between_time', base=pd.core.generic.NDFrame) 673 copy = frame_base._elementwise_method('copy', base=pd.core.generic.NDFrame) 674 675 @frame_base.with_docs_from(pd.DataFrame) 676 @frame_base.args_to_kwargs(pd.DataFrame) 677 @frame_base.populate_defaults(pd.DataFrame) 678 @frame_base.maybe_inplace 679 def replace(self, to_replace, value, limit, method, **kwargs): 680 """``method`` is not supported in the Beam DataFrame API because it is 681 order-sensitive. It cannot be specified. 682 683 If ``limit`` is specified this operation is not parallelizable.""" 684 # pylint: disable-next=c-extension-no-member 685 value_compare = None if PD_VERSION < (1, 4) else lib.no_default 686 if method is not None and not isinstance(to_replace, 687 dict) and value is value_compare: 688 # pandas only relies on method if to_replace is not a dictionary, and 689 # value is the <no_default> value. This is different than 690 # if ``None`` is explicitly passed for ``value``. In this case, it will be 691 # respected 692 raise frame_base.WontImplementError( 693 f"replace(method={method!r}) is not supported because it is " 694 "order sensitive. Only replace(method=None) is supported.", 695 reason="order-sensitive") 696 697 if limit is None: 698 requires_partition_by = partitionings.Arbitrary() 699 else: 700 requires_partition_by = partitionings.Singleton( 701 reason=( 702 f"replace(limit={limit!r}) cannot currently be parallelized. It " 703 "requires collecting all data on a single node.")) 704 return frame_base.DeferredFrame.wrap( 705 expressions.ComputedExpression( 706 'replace', 707 lambda df: df.replace( 708 to_replace=to_replace, 709 value=value, 710 limit=limit, 711 method=method, 712 **kwargs), [self._expr], 713 preserves_partition_by=partitionings.Arbitrary(), 714 requires_partition_by=requires_partition_by)) 715 716 @frame_base.with_docs_from(pd.DataFrame) 717 @frame_base.args_to_kwargs(pd.DataFrame) 718 @frame_base.populate_defaults(pd.DataFrame) 719 def tz_localize(self, ambiguous, **kwargs): 720 """``ambiguous`` cannot be set to ``"infer"`` as its semantics are 721 order-sensitive. Similarly, specifying ``ambiguous`` as an 722 :class:`~numpy.ndarray` is order-sensitive, but you can achieve similar 723 functionality by specifying ``ambiguous`` as a Series.""" 724 if isinstance(ambiguous, np.ndarray): 725 raise frame_base.WontImplementError( 726 "tz_localize(ambiguous=ndarray) is not supported because it makes " 727 "this operation sensitive to the order of the data. Please use a " 728 "DeferredSeries instead.", 729 reason="order-sensitive") 730 elif isinstance(ambiguous, frame_base.DeferredFrame): 731 return frame_base.DeferredFrame.wrap( 732 expressions.ComputedExpression( 733 'tz_localize', 734 lambda df, 735 ambiguous: df.tz_localize(ambiguous=ambiguous, **kwargs), 736 [self._expr, ambiguous._expr], 737 requires_partition_by=partitionings.Index(), 738 preserves_partition_by=partitionings.Singleton())) 739 elif ambiguous == 'infer': 740 # infer attempts to infer based on the order of the timestamps 741 raise frame_base.WontImplementError( 742 f"tz_localize(ambiguous={ambiguous!r}) is not allowed because it " 743 "makes this operation sensitive to the order of the data.", 744 reason="order-sensitive") 745 746 return frame_base.DeferredFrame.wrap( 747 expressions.ComputedExpression( 748 'tz_localize', 749 lambda df: df.tz_localize(ambiguous=ambiguous, **kwargs), 750 [self._expr], 751 requires_partition_by=partitionings.Arbitrary(), 752 preserves_partition_by=partitionings.Singleton())) 753 754 @property # type: ignore 755 @frame_base.with_docs_from(pd.DataFrame) 756 def size(self): 757 sizes = expressions.ComputedExpression( 758 'get_sizes', 759 # Wrap scalar results in a Series for easier concatenation later 760 lambda df: pd.Series(df.size), 761 [self._expr], 762 requires_partition_by=partitionings.Arbitrary(), 763 preserves_partition_by=partitionings.Singleton()) 764 765 with expressions.allow_non_parallel_operations(True): 766 return frame_base.DeferredFrame.wrap( 767 expressions.ComputedExpression( 768 'sum_sizes', 769 lambda sizes: sizes.sum(), [sizes], 770 requires_partition_by=partitionings.Singleton(), 771 preserves_partition_by=partitionings.Singleton())) 772 773 def length(self): 774 """Alternative to ``len(df)`` which returns a deferred result that can be 775 used in arithmetic with :class:`DeferredSeries` or 776 :class:`DeferredDataFrame` instances.""" 777 lengths = expressions.ComputedExpression( 778 'get_lengths', 779 # Wrap scalar results in a Series for easier concatenation later 780 lambda df: pd.Series(len(df)), 781 [self._expr], 782 requires_partition_by=partitionings.Arbitrary(), 783 preserves_partition_by=partitionings.Singleton()) 784 785 with expressions.allow_non_parallel_operations(True): 786 return frame_base.DeferredFrame.wrap( 787 expressions.ComputedExpression( 788 'sum_lengths', 789 lambda lengths: lengths.sum(), [lengths], 790 requires_partition_by=partitionings.Singleton(), 791 preserves_partition_by=partitionings.Singleton())) 792 793 def __len__(self): 794 raise frame_base.WontImplementError( 795 "len(df) is not currently supported because it produces a non-deferred " 796 "result. Consider using df.length() instead.", 797 reason="non-deferred-result") 798 799 @property # type: ignore 800 @frame_base.with_docs_from(pd.DataFrame) 801 def empty(self): 802 empties = expressions.ComputedExpression( 803 'get_empties', 804 # Wrap scalar results in a Series for easier concatenation later 805 lambda df: pd.Series(df.empty), 806 [self._expr], 807 requires_partition_by=partitionings.Arbitrary(), 808 preserves_partition_by=partitionings.Singleton()) 809 810 with expressions.allow_non_parallel_operations(True): 811 return frame_base.DeferredFrame.wrap( 812 expressions.ComputedExpression( 813 'check_all_empty', 814 lambda empties: empties.all(), [empties], 815 requires_partition_by=partitionings.Singleton(), 816 preserves_partition_by=partitionings.Singleton())) 817 818 @frame_base.with_docs_from(pd.DataFrame) 819 def bool(self): 820 # TODO: Documentation about DeferredScalar 821 # Will throw if any partition has >1 element 822 bools = expressions.ComputedExpression( 823 'get_bools', 824 # Wrap scalar results in a Series for easier concatenation later 825 lambda df: pd.Series([], dtype=bool) 826 if df.empty else pd.Series([df.bool()]), 827 [self._expr], 828 requires_partition_by=partitionings.Arbitrary(), 829 preserves_partition_by=partitionings.Singleton()) 830 831 with expressions.allow_non_parallel_operations(True): 832 # Will throw if overall dataset has != 1 element 833 return frame_base.DeferredFrame.wrap( 834 expressions.ComputedExpression( 835 'combine_all_bools', 836 lambda bools: bools.bool(), [bools], 837 proxy=bool(), 838 requires_partition_by=partitionings.Singleton(), 839 preserves_partition_by=partitionings.Singleton())) 840 841 @frame_base.with_docs_from(pd.DataFrame) 842 def equals(self, other): 843 intermediate = expressions.ComputedExpression( 844 'equals_partitioned', 845 # Wrap scalar results in a Series for easier concatenation later 846 lambda df, 847 other: pd.Series(df.equals(other)), 848 [self._expr, other._expr], 849 requires_partition_by=partitionings.Index(), 850 preserves_partition_by=partitionings.Singleton()) 851 852 with expressions.allow_non_parallel_operations(True): 853 return frame_base.DeferredFrame.wrap( 854 expressions.ComputedExpression( 855 'aggregate_equals', 856 lambda df: df.all(), [intermediate], 857 requires_partition_by=partitionings.Singleton(), 858 preserves_partition_by=partitionings.Singleton())) 859 860 @frame_base.args_to_kwargs(pd.DataFrame) 861 @frame_base.populate_defaults(pd.DataFrame) 862 def sort_values(self, axis, **kwargs): 863 """``sort_values`` is not implemented. 864 865 It is not implemented for ``axis=index`` because it imposes an ordering on 866 the dataset, and it likely will not be maintained (see 867 https://s.apache.org/dataframe-order-sensitive-operations). 868 869 It is not implemented for ``axis=columns`` because it makes the order of 870 the columns depend on the data (see 871 https://s.apache.org/dataframe-non-deferred-columns).""" 872 if axis in (0, 'index'): 873 # axis=index imposes an ordering on the DataFrame rows which we do not 874 # support 875 raise frame_base.WontImplementError( 876 "sort_values(axis=index) is not supported because it imposes an " 877 "ordering on the dataset which likely will not be preserved.", 878 reason="order-sensitive") 879 else: 880 # axis=columns will reorder the columns based on the data 881 raise frame_base.WontImplementError( 882 "sort_values(axis=columns) is not supported because the order of the " 883 "columns in the result depends on the data.", 884 reason="non-deferred-columns") 885 886 @frame_base.with_docs_from(pd.DataFrame) 887 @frame_base.args_to_kwargs(pd.DataFrame) 888 @frame_base.populate_defaults(pd.DataFrame) 889 @frame_base.maybe_inplace 890 def sort_index(self, axis, **kwargs): 891 """``axis=index`` is not allowed because it imposes an ordering on the 892 dataset, and we cannot guarantee it will be maintained (see 893 https://s.apache.org/dataframe-order-sensitive-operations). Only 894 ``axis=columns`` is allowed.""" 895 if axis in (0, 'index'): 896 # axis=rows imposes an ordering on the DataFrame which we do not support 897 raise frame_base.WontImplementError( 898 "sort_index(axis=index) is not supported because it imposes an " 899 "ordering on the dataset which we cannot guarantee will be " 900 "preserved.", 901 reason="order-sensitive") 902 903 # axis=columns reorders the columns by name 904 return frame_base.DeferredFrame.wrap( 905 expressions.ComputedExpression( 906 'sort_index', 907 lambda df: df.sort_index(axis, **kwargs), 908 [self._expr], 909 requires_partition_by=partitionings.Arbitrary(), 910 preserves_partition_by=partitionings.Arbitrary(), 911 )) 912 913 @frame_base.with_docs_from(pd.DataFrame) 914 @frame_base.args_to_kwargs(pd.DataFrame) 915 @frame_base.populate_defaults(pd.DataFrame) 916 @frame_base.maybe_inplace 917 def where(self, cond, other, errors, **kwargs): 918 """where is not parallelizable when ``errors="ignore"`` is specified.""" 919 requires = partitionings.Arbitrary() 920 deferred_args = {} 921 actual_args = {} 922 923 # TODO(bhulette): This is very similar to the logic in 924 # frame_base.elementwise_method, can we unify it? 925 if isinstance(cond, frame_base.DeferredFrame): 926 deferred_args['cond'] = cond 927 requires = partitionings.Index() 928 else: 929 actual_args['cond'] = cond 930 931 if isinstance(other, frame_base.DeferredFrame): 932 deferred_args['other'] = other 933 requires = partitionings.Index() 934 else: 935 actual_args['other'] = other 936 937 if errors == "ignore": 938 # We need all data in order to ignore errors and propagate the original 939 # data. 940 requires = partitionings.Singleton( 941 reason=( 942 f"where(errors={errors!r}) is currently not parallelizable, " 943 "because all data must be collected on one node to determine if " 944 "the original data should be propagated instead.")) 945 946 actual_args['errors'] = errors 947 948 def where_execution(df, *args): 949 runtime_values = { 950 name: value 951 for (name, value) in zip(deferred_args.keys(), args) 952 } 953 return df.where(**runtime_values, **actual_args, **kwargs) 954 955 return frame_base.DeferredFrame.wrap( 956 expressions.ComputedExpression( 957 "where", 958 where_execution, 959 [self._expr] + [df._expr for df in deferred_args.values()], 960 requires_partition_by=requires, 961 preserves_partition_by=partitionings.Index(), 962 )) 963 964 @frame_base.with_docs_from(pd.DataFrame) 965 @frame_base.args_to_kwargs(pd.DataFrame) 966 @frame_base.populate_defaults(pd.DataFrame) 967 @frame_base.maybe_inplace 968 def mask(self, cond, **kwargs): 969 """mask is not parallelizable when ``errors="ignore"`` is specified.""" 970 return self.where(~cond, **kwargs) 971 972 @frame_base.with_docs_from(pd.DataFrame) 973 @frame_base.args_to_kwargs(pd.DataFrame) 974 @frame_base.populate_defaults(pd.DataFrame) 975 def truncate(self, before, after, axis): 976 977 if axis in (None, 0, 'index'): 978 979 def truncate(df): 980 return df.sort_index().truncate(before=before, after=after, axis=axis) 981 else: 982 983 def truncate(df): 984 return df.truncate(before=before, after=after, axis=axis) 985 986 return frame_base.DeferredFrame.wrap( 987 expressions.ComputedExpression( 988 'truncate', 989 truncate, [self._expr], 990 requires_partition_by=partitionings.Arbitrary(), 991 preserves_partition_by=partitionings.Arbitrary())) 992 993 @frame_base.with_docs_from(pd.DataFrame) 994 @frame_base.args_to_kwargs(pd.DataFrame) 995 @frame_base.populate_defaults(pd.DataFrame) 996 def unstack(self, **kwargs): 997 level = kwargs.get('level', -1) 998 999 if self._expr.proxy().index.nlevels == 1: 1000 if PD_VERSION < (1, 2): 1001 raise frame_base.WontImplementError( 1002 "unstack() is not supported when using pandas < 1.2.0\n" 1003 "Please upgrade to pandas 1.2.0 or higher to use this operation.") 1004 return frame_base.DeferredFrame.wrap( 1005 expressions.ComputedExpression( 1006 'unstack', 1007 lambda s: s.unstack(**kwargs), [self._expr], 1008 requires_partition_by=partitionings.Index())) 1009 else: 1010 # Unstacking MultiIndex objects 1011 idx = self._expr.proxy().index 1012 1013 # Converting level (int, str, or combination) to a list of number levels 1014 level_list = level if isinstance(level, list) else [level] 1015 level_number_list = [idx._get_level_number(l) for l in level_list] 1016 1017 # Checking if levels provided are of CategoricalDtype 1018 if not all(isinstance(idx.levels[l].dtype, (pd.CategoricalDtype, 1019 pd.BooleanDtype)) 1020 for l in level_number_list): 1021 raise frame_base.WontImplementError( 1022 "unstack() is only supported on DataFrames if unstacked level " 1023 "is a categorical or boolean column", 1024 reason="non-deferred-columns") 1025 else: 1026 tmp = self._expr.proxy().unstack(**kwargs) 1027 if isinstance(tmp.columns, pd.MultiIndex): 1028 levels = [] 1029 for i in range(tmp.columns.nlevels): 1030 level = tmp.columns.levels[i] 1031 levels.append(level) 1032 col_idx = pd.MultiIndex.from_product(levels) 1033 else: 1034 if tmp.columns.dtype == 'boolean': 1035 col_idx = pd.Index([False, True], dtype='boolean') 1036 else: 1037 col_idx = pd.CategoricalIndex(tmp.columns.categories) 1038 1039 if isinstance(self._expr.proxy(), pd.Series): 1040 proxy_dtype = self._expr.proxy().dtypes 1041 else: 1042 # Set dtype to object if more than one value 1043 dtypes = [d for d in self._expr.proxy().dtypes] 1044 proxy_dtype = object 1045 if np.int64 in dtypes: 1046 proxy_dtype = np.int64 1047 if np.float64 in dtypes: 1048 proxy_dtype = np.float64 1049 if object in dtypes: 1050 proxy_dtype = object 1051 1052 proxy = pd.DataFrame( 1053 columns=col_idx, dtype=proxy_dtype, index=tmp.index) 1054 1055 with expressions.allow_non_parallel_operations(True): 1056 return frame_base.DeferredFrame.wrap( 1057 expressions.ComputedExpression( 1058 'unstack', 1059 lambda s: pd.concat([proxy, s.unstack(**kwargs)]), 1060 [self._expr], 1061 proxy=proxy, 1062 requires_partition_by=partitionings.Singleton())) 1063 1064 @frame_base.with_docs_from(pd.DataFrame) 1065 @frame_base.args_to_kwargs(pd.DataFrame) 1066 @frame_base.populate_defaults(pd.DataFrame) 1067 def xs(self, key, axis, level, **kwargs): 1068 """Note that ``xs(axis='index')`` will raise a ``KeyError`` at execution 1069 time if the key does not exist in the index.""" 1070 1071 if axis in ('columns', 1): 1072 # Special case for axis=columns. This is a simple project that raises a 1073 # KeyError at construction time for missing columns. 1074 return frame_base.DeferredFrame.wrap( 1075 expressions.ComputedExpression( 1076 'xs', 1077 lambda df: df.xs(key, axis=axis, **kwargs), [self._expr], 1078 requires_partition_by=partitionings.Arbitrary(), 1079 preserves_partition_by=partitionings.Arbitrary())) 1080 elif axis not in ('index', 0): 1081 # Make sure that user's axis is valid 1082 raise ValueError( 1083 "axis must be one of ('index', 0, 'columns', 1). " 1084 f"got {axis!r}.") 1085 1086 if not isinstance(key, tuple): 1087 key_size = 1 1088 key_series = pd.Series([key], index=[key]) 1089 else: 1090 key_size = len(key) 1091 key_series = pd.Series([key], pd.MultiIndex.from_tuples([key])) 1092 1093 key_expr = expressions.ConstantExpression( 1094 key_series, proxy=key_series.iloc[:0]) 1095 1096 if level is None: 1097 reindexed = self 1098 else: 1099 if not isinstance(level, list): 1100 level = [level] 1101 1102 # If user specifed levels, reindex so those levels are at the beginning. 1103 # Keep the others and preserve their order. 1104 level = [ 1105 l if isinstance(l, int) else list(self.index.names).index(l) 1106 for l in level 1107 ] 1108 1109 reindexed = self.reorder_levels( 1110 level + [i for i in range(self.index.nlevels) if i not in level]) 1111 1112 def xs_partitioned(frame, key): 1113 if not len(key): 1114 # key is not in this partition, return empty dataframe 1115 result = frame.iloc[:0] 1116 if key_size < frame.index.nlevels: 1117 return result.droplevel(list(range(key_size))) 1118 else: 1119 return result 1120 1121 # key should be in this partition, call xs. Will raise KeyError if not 1122 # present. 1123 return frame.xs(key.item()) 1124 1125 return frame_base.DeferredFrame.wrap( 1126 expressions.ComputedExpression( 1127 'xs', 1128 xs_partitioned, 1129 [reindexed._expr, key_expr], 1130 requires_partition_by=partitionings.Index(list(range(key_size))), 1131 # Drops index levels, so partitioning is not preserved 1132 preserves_partition_by=partitionings.Singleton())) 1133 1134 @property 1135 def dtype(self): 1136 return self._expr.proxy().dtype 1137 1138 isin = frame_base._elementwise_method('isin', base=pd.DataFrame) 1139 combine_first = frame_base._elementwise_method( 1140 'combine_first', base=pd.DataFrame) 1141 1142 combine = frame_base._proxy_method( 1143 'combine', 1144 base=pd.DataFrame, 1145 requires_partition_by=expressions.partitionings.Singleton( 1146 reason="combine() is not parallelizable because func might operate " 1147 "on the full dataset."), 1148 preserves_partition_by=expressions.partitionings.Singleton()) 1149 1150 @property # type: ignore 1151 @frame_base.with_docs_from(pd.DataFrame) 1152 def ndim(self): 1153 return self._expr.proxy().ndim 1154 1155 @property # type: ignore 1156 @frame_base.with_docs_from(pd.DataFrame) 1157 def index(self): 1158 return _DeferredIndex(self) 1159 1160 @index.setter 1161 def _set_index(self, value): 1162 # TODO: assigning the index is generally order-sensitive, but we could 1163 # support it in some rare cases, e.g. when assigning the index from one 1164 # of a DataFrame's columns 1165 raise NotImplementedError( 1166 "Assigning an index is not yet supported. " 1167 "Consider using set_index() instead.") 1168 1169 reindex = frame_base.wont_implement_method( 1170 pd.DataFrame, 'reindex', reason="order-sensitive") 1171 1172 hist = frame_base.wont_implement_method( 1173 pd.DataFrame, 'hist', reason="plotting-tools") 1174 1175 attrs = property( 1176 frame_base.wont_implement_method( 1177 pd.DataFrame, 'attrs', reason='experimental')) 1178 1179 reorder_levels = frame_base._proxy_method( 1180 'reorder_levels', 1181 base=pd.DataFrame, 1182 requires_partition_by=partitionings.Arbitrary(), 1183 preserves_partition_by=partitionings.Singleton()) 1184 1185 resample = frame_base.wont_implement_method( 1186 pd.DataFrame, 'resample', reason='event-time-semantics') 1187 1188 rolling = frame_base.wont_implement_method( 1189 pd.DataFrame, 'rolling', reason='event-time-semantics') 1190 1191 to_xarray = frame_base.wont_implement_method( 1192 pd.DataFrame, 'to_xarray', reason='non-deferred-result') 1193 to_clipboard = frame_base.wont_implement_method( 1194 pd.DataFrame, 'to_clipboard', reason="non-deferred-result") 1195 1196 swapaxes = frame_base.wont_implement_method( 1197 pd.Series, 'swapaxes', reason="non-deferred-columns") 1198 infer_object = frame_base.wont_implement_method( 1199 pd.Series, 'infer_objects', reason="non-deferred-columns") 1200 1201 ewm = frame_base.wont_implement_method( 1202 pd.Series, 'ewm', reason="event-time-semantics") 1203 expanding = frame_base.wont_implement_method( 1204 pd.Series, 'expanding', reason="event-time-semantics") 1205 1206 sparse = property( 1207 frame_base.not_implemented_method( 1208 'sparse', '20902', base_type=pd.DataFrame)) 1209 1210 transform = frame_base._elementwise_method('transform', base=pd.DataFrame) 1211 1212 tz_convert = frame_base._proxy_method( 1213 'tz_convert', 1214 base=pd.DataFrame, 1215 requires_partition_by=partitionings.Arbitrary(), 1216 # Manipulates index, partitioning is not preserved 1217 preserves_partition_by=partitionings.Singleton()) 1218 1219 @frame_base.with_docs_from(pd.DataFrame) 1220 def pipe(self, func, *args, **kwargs): 1221 if isinstance(func, tuple): 1222 func, data = func 1223 kwargs[data] = self 1224 return func(*args, **kwargs) 1225 1226 return func(self, *args, **kwargs) 1227 1228 1229 @populate_not_implemented(pd.Series) 1230 @frame_base.DeferredFrame._register_for(pd.Series) 1231 class DeferredSeries(DeferredDataFrameOrSeries): 1232 def __repr__(self): 1233 return ( 1234 f'DeferredSeries(name={self.name!r}, dtype={self.dtype}, ' 1235 f'{self._render_indexes()})') 1236 1237 @property # type: ignore 1238 @frame_base.with_docs_from(pd.Series) 1239 def name(self): 1240 return self._expr.proxy().name 1241 1242 @name.setter 1243 def name(self, value): 1244 def fn(s): 1245 s = s.copy() 1246 s.name = value 1247 return s 1248 1249 self._expr = expressions.ComputedExpression( 1250 'series_set_name', 1251 fn, [self._expr], 1252 requires_partition_by=partitionings.Arbitrary(), 1253 preserves_partition_by=partitionings.Arbitrary()) 1254 1255 @property # type: ignore 1256 @frame_base.with_docs_from(pd.Series) 1257 def hasnans(self): 1258 has_nans = expressions.ComputedExpression( 1259 'hasnans', 1260 lambda s: pd.Series(s.hasnans), [self._expr], 1261 requires_partition_by=partitionings.Arbitrary(), 1262 preserves_partition_by=partitionings.Singleton()) 1263 1264 with expressions.allow_non_parallel_operations(): 1265 return frame_base.DeferredFrame.wrap( 1266 expressions.ComputedExpression( 1267 'combine_hasnans', 1268 lambda s: s.any(), [has_nans], 1269 requires_partition_by=partitionings.Singleton(), 1270 preserves_partition_by=partitionings.Singleton())) 1271 1272 @property # type: ignore 1273 @frame_base.with_docs_from(pd.Series) 1274 def dtype(self): 1275 return self._expr.proxy().dtype 1276 1277 dtypes = dtype 1278 1279 def __getitem__(self, key): 1280 if _is_null_slice(key) or key is Ellipsis: 1281 return self 1282 1283 elif (isinstance(key, int) or _is_integer_slice(key) 1284 ) and self._expr.proxy().index._should_fallback_to_positional(): 1285 raise frame_base.WontImplementError( 1286 "Accessing an item by an integer key is order sensitive for this " 1287 "Series.", 1288 reason="order-sensitive") 1289 1290 elif isinstance(key, slice) or callable(key): 1291 return frame_base.DeferredFrame.wrap( 1292 expressions.ComputedExpression( 1293 # yapf: disable 1294 'getitem', 1295 lambda df: df[key], 1296 [self._expr], 1297 requires_partition_by=partitionings.Arbitrary(), 1298 preserves_partition_by=partitionings.Arbitrary())) 1299 1300 elif isinstance(key, DeferredSeries) and key._expr.proxy().dtype == bool: 1301 return frame_base.DeferredFrame.wrap( 1302 expressions.ComputedExpression( 1303 # yapf: disable 1304 'getitem', 1305 lambda df, 1306 indexer: df[indexer], 1307 [self._expr, key._expr], 1308 requires_partition_by=partitionings.Index(), 1309 preserves_partition_by=partitionings.Arbitrary())) 1310 1311 elif pd.core.series.is_iterator(key) or pd.core.common.is_bool_indexer(key): 1312 raise frame_base.WontImplementError( 1313 "Accessing a DeferredSeries with an iterator is sensitive to the " 1314 "order of the data.", 1315 reason="order-sensitive") 1316 1317 else: 1318 # We could consider returning a deferred scalar, but that might 1319 # be more surprising than a clear error. 1320 raise frame_base.WontImplementError( 1321 f"Indexing a series with key of type {type(key)} is not supported " 1322 "because it produces a non-deferred result.", 1323 reason="non-deferred-result") 1324 1325 @frame_base.with_docs_from(pd.Series) 1326 def keys(self): 1327 return self.index 1328 1329 # Series.T == transpose. Both are a no-op 1330 T = frame_base._elementwise_method('T', base=pd.Series) 1331 transpose = frame_base._elementwise_method('transpose', base=pd.Series) 1332 shape = property( 1333 frame_base.wont_implement_method( 1334 pd.Series, 'shape', reason="non-deferred-result")) 1335 1336 @frame_base.with_docs_from(pd.Series) 1337 @frame_base.args_to_kwargs(pd.Series) 1338 @frame_base.populate_defaults(pd.Series) 1339 def append(self, to_append, ignore_index, verify_integrity, **kwargs): 1340 """``ignore_index=True`` is not supported, because it requires generating an 1341 order-sensitive index.""" 1342 if not isinstance(to_append, DeferredSeries): 1343 raise frame_base.WontImplementError( 1344 "append() only accepts DeferredSeries instances, received " + 1345 str(type(to_append))) 1346 if ignore_index: 1347 raise frame_base.WontImplementError( 1348 "append(ignore_index=True) is order sensitive because it requires " 1349 "generating a new index based on the order of the data.", 1350 reason="order-sensitive") 1351 1352 if verify_integrity: 1353 # We can verify the index is non-unique within index partitioned data. 1354 requires = partitionings.Index() 1355 else: 1356 requires = partitionings.Arbitrary() 1357 1358 return frame_base.DeferredFrame.wrap( 1359 expressions.ComputedExpression( 1360 'append', 1361 lambda s, 1362 to_append: s.append( 1363 to_append, verify_integrity=verify_integrity, **kwargs), 1364 [self._expr, to_append._expr], 1365 requires_partition_by=requires, 1366 preserves_partition_by=partitionings.Arbitrary())) 1367 1368 @frame_base.with_docs_from(pd.Series) 1369 @frame_base.args_to_kwargs(pd.Series) 1370 @frame_base.populate_defaults(pd.Series) 1371 def align(self, other, join, axis, level, method, **kwargs): 1372 """Aligning per-level is not yet supported. Only the default, 1373 ``level=None``, is allowed. 1374 1375 Filling NaN values via ``method`` is not supported, because it is 1376 `order-sensitive 1377 <https://s.apache.org/dataframe-order-sensitive-operations>`_. 1378 Only the default, ``method=None``, is allowed.""" 1379 if level is not None: 1380 raise NotImplementedError('per-level align') 1381 if method is not None: 1382 raise frame_base.WontImplementError( 1383 f"align(method={method!r}) is not supported because it is " 1384 "order sensitive. Only align(method=None) is supported.", 1385 reason="order-sensitive") 1386 # We're using pd.concat here as expressions don't yet support 1387 # multiple return values. 1388 aligned = frame_base.DeferredFrame.wrap( 1389 expressions.ComputedExpression( 1390 'align', 1391 lambda x, 1392 y: pd.concat([x, y], axis=1, join='inner'), 1393 [self._expr, other._expr], 1394 requires_partition_by=partitionings.Index(), 1395 preserves_partition_by=partitionings.Arbitrary())) 1396 return aligned.iloc[:, 0], aligned.iloc[:, 1] 1397 1398 argsort = frame_base.wont_implement_method( 1399 pd.Series, 'argsort', reason="order-sensitive") 1400 1401 array = property( 1402 frame_base.wont_implement_method( 1403 pd.Series, 'array', reason="non-deferred-result")) 1404 1405 # We can't reliably predict the output type, it depends on whether `key` is: 1406 # - not in the index (default_value) 1407 # - in the index once (constant) 1408 # - in the index multiple times (Series) 1409 get = frame_base.wont_implement_method( 1410 pd.Series, 'get', reason="non-deferred-columns") 1411 1412 ravel = frame_base.wont_implement_method( 1413 pd.Series, 'ravel', reason="non-deferred-result") 1414 1415 slice_shift = frame_base.wont_implement_method( 1416 pd.Series, 'slice_shift', reason="deprecated") 1417 tshift = frame_base.wont_implement_method( 1418 pd.Series, 'tshift', reason="deprecated") 1419 1420 rename = frame_base._proxy_method( 1421 'rename', 1422 base=pd.Series, 1423 requires_partition_by=partitionings.Arbitrary(), 1424 preserves_partition_by=partitionings.Singleton()) 1425 1426 between = frame_base._elementwise_method('between', base=pd.Series) 1427 1428 add_suffix = frame_base._proxy_method( 1429 'add_suffix', 1430 base=pd.DataFrame, 1431 requires_partition_by=partitionings.Arbitrary(), 1432 preserves_partition_by=partitionings.Singleton()) 1433 add_prefix = frame_base._proxy_method( 1434 'add_prefix', 1435 base=pd.DataFrame, 1436 requires_partition_by=partitionings.Arbitrary(), 1437 preserves_partition_by=partitionings.Singleton()) 1438 1439 info = frame_base.wont_implement_method( 1440 pd.Series, 'info', reason="non-deferred-result") 1441 1442 def _idxmaxmin_helper(self, op, **kwargs): 1443 if op == 'idxmax': 1444 func = pd.Series.idxmax 1445 elif op == 'idxmin': 1446 func = pd.Series.idxmin 1447 else: 1448 raise ValueError( 1449 "op must be one of ('idxmax', 'idxmin'). " 1450 f"got {op!r}.") 1451 1452 def compute_idx(s): 1453 index = func(s, **kwargs) 1454 if pd.isna(index): 1455 return s 1456 else: 1457 return s.loc[[index]] 1458 1459 # Avoids empty Series error when evaluating proxy 1460 index_dtype = self._expr.proxy().index.dtype 1461 index = pd.Index([], dtype=index_dtype) 1462 proxy = self._expr.proxy().copy() 1463 proxy.index = index 1464 proxy = proxy.append( 1465 pd.Series([1], index=np.asarray(['0']).astype(proxy.index.dtype))) 1466 1467 idx_func = expressions.ComputedExpression( 1468 'idx_func', 1469 compute_idx, [self._expr], 1470 proxy=proxy, 1471 requires_partition_by=partitionings.Arbitrary(), 1472 preserves_partition_by=partitionings.Arbitrary()) 1473 1474 with expressions.allow_non_parallel_operations(True): 1475 return frame_base.DeferredFrame.wrap( 1476 expressions.ComputedExpression( 1477 'idx_combine', 1478 lambda s: func(s, **kwargs), [idx_func], 1479 requires_partition_by=partitionings.Singleton(), 1480 preserves_partition_by=partitionings.Singleton())) 1481 1482 @frame_base.with_docs_from(pd.Series) 1483 @frame_base.args_to_kwargs(pd.Series) 1484 @frame_base.populate_defaults(pd.Series) 1485 def idxmin(self, **kwargs): 1486 return self._idxmaxmin_helper('idxmin', **kwargs) 1487 1488 @frame_base.with_docs_from(pd.Series) 1489 @frame_base.args_to_kwargs(pd.Series) 1490 @frame_base.populate_defaults(pd.Series) 1491 def idxmax(self, **kwargs): 1492 return self._idxmaxmin_helper('idxmax', **kwargs) 1493 1494 @frame_base.with_docs_from(pd.Series) 1495 @frame_base.args_to_kwargs(pd.Series) 1496 @frame_base.populate_defaults(pd.Series) 1497 def explode(self, ignore_index): 1498 # ignoring the index will not preserve it 1499 preserves = ( 1500 partitionings.Singleton() if ignore_index else partitionings.Index()) 1501 return frame_base.DeferredFrame.wrap( 1502 expressions.ComputedExpression( 1503 'explode', 1504 lambda s: s.explode(ignore_index), [self._expr], 1505 preserves_partition_by=preserves, 1506 requires_partition_by=partitionings.Arbitrary())) 1507 1508 @frame_base.with_docs_from(pd.DataFrame) 1509 def dot(self, other): 1510 """``other`` must be a :class:`DeferredDataFrame` or :class:`DeferredSeries` 1511 instance. Computing the dot product with an array-like is not supported 1512 because it is order-sensitive.""" 1513 left = self._expr 1514 if isinstance(other, DeferredSeries): 1515 right = expressions.ComputedExpression( 1516 'to_dataframe', 1517 pd.DataFrame, [other._expr], 1518 requires_partition_by=partitionings.Arbitrary(), 1519 preserves_partition_by=partitionings.Arbitrary()) 1520 right_is_series = True 1521 elif isinstance(other, DeferredDataFrame): 1522 right = other._expr 1523 right_is_series = False 1524 else: 1525 raise frame_base.WontImplementError( 1526 "other must be a DeferredDataFrame or DeferredSeries instance. " 1527 "Passing a concrete list or numpy array is not supported. Those " 1528 "types have no index and must be joined based on the order of the " 1529 "data.", 1530 reason="order-sensitive") 1531 1532 dots = expressions.ComputedExpression( 1533 'dot', 1534 # Transpose so we can sum across rows. 1535 (lambda left, right: pd.DataFrame(left @ right).T), 1536 [left, right], 1537 requires_partition_by=partitionings.Index()) 1538 with expressions.allow_non_parallel_operations(True): 1539 sums = expressions.ComputedExpression( 1540 'sum', 1541 lambda dots: dots.sum(), # 1542 [dots], 1543 requires_partition_by=partitionings.Singleton()) 1544 1545 if right_is_series: 1546 result = expressions.ComputedExpression( 1547 'extract', 1548 lambda df: df[0], [sums], 1549 requires_partition_by=partitionings.Singleton()) 1550 else: 1551 result = sums 1552 return frame_base.DeferredFrame.wrap(result) 1553 1554 __matmul__ = dot 1555 1556 @frame_base.with_docs_from(pd.Series) 1557 @frame_base.args_to_kwargs(pd.Series) 1558 @frame_base.populate_defaults(pd.Series) 1559 def nunique(self, **kwargs): 1560 return self.drop_duplicates(keep="any").size 1561 1562 @frame_base.with_docs_from(pd.Series) 1563 @frame_base.args_to_kwargs(pd.Series) 1564 @frame_base.populate_defaults(pd.Series) 1565 def quantile(self, q, **kwargs): 1566 """quantile is not parallelizable. See 1567 `Issue 20933 <https://github.com/apache/beam/issues/20933>`_ tracking 1568 the possible addition of an approximate, parallelizable implementation of 1569 quantile.""" 1570 # TODO(https://github.com/apache/beam/issues/20933): Provide an option for 1571 # approximate distributed quantiles 1572 requires = partitionings.Singleton( 1573 reason=( 1574 "Computing quantiles across index cannot currently be " 1575 "parallelized. See https://github.com/apache/beam/issues/20933 " 1576 "tracking the possible addition of an approximate, parallelizable " 1577 "implementation of quantile.")) 1578 1579 return frame_base.DeferredFrame.wrap( 1580 expressions.ComputedExpression( 1581 'quantile', 1582 lambda df: df.quantile(q=q, **kwargs), [self._expr], 1583 requires_partition_by=requires, 1584 preserves_partition_by=partitionings.Singleton())) 1585 1586 @frame_base.with_docs_from(pd.Series) 1587 def std(self, *args, **kwargs): 1588 # Compute variance (deferred scalar) with same args, then sqrt it 1589 return self.var(*args, **kwargs).apply(lambda var: math.sqrt(var)) 1590 1591 @frame_base.with_docs_from(pd.Series) 1592 @frame_base.args_to_kwargs(pd.Series) 1593 @frame_base.populate_defaults(pd.Series) 1594 def mean(self, skipna, **kwargs): 1595 if skipna: 1596 size = self.count() 1597 else: 1598 size = self.length() 1599 1600 return self.sum(skipna=skipna, **kwargs) / size 1601 1602 @frame_base.with_docs_from(pd.Series) 1603 @frame_base.args_to_kwargs(pd.Series) 1604 @frame_base.populate_defaults(pd.Series) 1605 def var(self, axis, skipna, level, ddof, **kwargs): 1606 """Per-level aggregation is not yet supported 1607 (https://github.com/apache/beam/issues/21829). Only the default, 1608 ``level=None``, is allowed.""" 1609 if level is not None: 1610 raise NotImplementedError("per-level aggregation") 1611 if skipna is None or skipna: 1612 self = self.dropna() # pylint: disable=self-cls-assignment 1613 1614 # See the online, numerically stable formulae at 1615 # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 1616 # and 1617 # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm 1618 def compute_moments(x): 1619 n = len(x) 1620 m = x.std(ddof=0)**2 * n 1621 s = x.sum() 1622 return pd.DataFrame(dict(m=[m], s=[s], n=[n])) 1623 1624 def combine_moments(data): 1625 m = s = n = 0.0 1626 for datum in data.itertuples(): 1627 if datum.n == 0: 1628 continue 1629 elif n == 0: 1630 m, s, n = datum.m, datum.s, datum.n 1631 else: 1632 delta = s / n - datum.s / datum.n 1633 m += datum.m + delta**2 * n * datum.n / (n + datum.n) 1634 s += datum.s 1635 n += datum.n 1636 if n <= ddof: 1637 return float('nan') 1638 else: 1639 return m / (n - ddof) 1640 1641 moments = expressions.ComputedExpression( 1642 'compute_moments', 1643 compute_moments, [self._expr], 1644 requires_partition_by=partitionings.Arbitrary()) 1645 with expressions.allow_non_parallel_operations(True): 1646 return frame_base.DeferredFrame.wrap( 1647 expressions.ComputedExpression( 1648 'combine_moments', 1649 combine_moments, [moments], 1650 requires_partition_by=partitionings.Singleton())) 1651 1652 @frame_base.with_docs_from(pd.Series) 1653 @frame_base.args_to_kwargs(pd.Series) 1654 @frame_base.populate_defaults(pd.Series) 1655 def corr(self, other, method, min_periods): 1656 """Only ``method='pearson'`` is currently parallelizable.""" 1657 if method == 'pearson': # Note that this is the default. 1658 x, y = self.dropna().align(other.dropna(), 'inner') 1659 return x._corr_aligned(y, min_periods) 1660 1661 else: 1662 reason = ( 1663 f"Encountered corr(method={method!r}) which cannot be " 1664 "parallelized. Only corr(method='pearson') is currently " 1665 "parallelizable.") 1666 # The rank-based correlations are not obviously parallelizable, though 1667 # perhaps an approximation could be done with a knowledge of quantiles 1668 # and custom partitioning. 1669 return frame_base.DeferredFrame.wrap( 1670 expressions.ComputedExpression( 1671 'corr', 1672 lambda df, 1673 other: df.corr(other, method=method, min_periods=min_periods), 1674 [self._expr, other._expr], 1675 requires_partition_by=partitionings.Singleton(reason=reason))) 1676 1677 @frame_base.with_docs_from(pd.Series) 1678 @frame_base.args_to_kwargs(pd.Series) 1679 @frame_base.populate_defaults(pd.Series) 1680 def skew(self, axis, skipna, level, numeric_only, **kwargs): 1681 if level is not None: 1682 raise NotImplementedError("per-level aggregation") 1683 if skipna is None or skipna: 1684 self = self.dropna() # pylint: disable=self-cls-assignment 1685 # See the online, numerically stable formulae at 1686 # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Higher-order_statistics 1687 # Note that we are calculating the unbias (sample) version of skew here. 1688 # See https://en.wikipedia.org/wiki/Skewness#Sample_skewness 1689 # for more details. 1690 def compute_moments(x): 1691 n = len(x) 1692 if n == 0: 1693 m2, sum, m3 = 0, 0, 0 1694 else: 1695 m2 = x.std(ddof=0)**2 * n 1696 sum = x.sum() 1697 m3 = (((x - x.mean())**3).sum()) 1698 return pd.DataFrame(dict(m2=[m2], sum=[sum], n=[n], m3=[m3])) 1699 1700 def combine_moments(data): 1701 m2 = sum = n = m3 = 0.0 1702 for datum in data.itertuples(): 1703 if datum.n == 0: 1704 continue 1705 elif n == 0: 1706 m2, sum, n, m3 = datum.m2, datum.sum, datum.n, datum.m3 1707 else: 1708 n_a, n_b = datum.n, n 1709 sum_a, sum_b = datum.sum, sum 1710 m2_a, m2_b = datum.m2, m2 1711 mean_a, mean_b = sum_a / n_a, sum_b / n_b 1712 delta = mean_b - mean_a 1713 combined_n = n_a + n_b 1714 m3 += datum.m3 + ( 1715 (delta**3 * ((n_a * n_b) * (n_a - n_b)) / ((combined_n)**2)) + 1716 ((3 * delta) * ((n_a * m2_b) - (n_b * m2_a)) / (combined_n))) 1717 m2 += datum.m2 + delta**2 * n_b * n_a / combined_n 1718 sum += datum.sum 1719 n += datum.n 1720 1721 if n < 3: 1722 return float('nan') 1723 elif m2 == 0: 1724 return float(0) 1725 else: 1726 return combined_n * math.sqrt(combined_n - 1) / (combined_n - 1727 2) * m3 / ( 1728 m2**(3 / 2)) 1729 1730 moments = expressions.ComputedExpression( 1731 'compute_moments', 1732 compute_moments, [self._expr], 1733 requires_partition_by=partitionings.Arbitrary()) 1734 with expressions.allow_non_parallel_operations(True): 1735 return frame_base.DeferredFrame.wrap( 1736 expressions.ComputedExpression( 1737 'combine_moments', 1738 combine_moments, [moments], 1739 requires_partition_by=partitionings.Singleton())) 1740 1741 @frame_base.with_docs_from(pd.Series) 1742 @frame_base.args_to_kwargs(pd.Series) 1743 @frame_base.populate_defaults(pd.Series) 1744 def kurtosis(self, axis, skipna, level, numeric_only, **kwargs): 1745 if level is not None: 1746 raise NotImplementedError("per-level aggregation") 1747 if skipna is None or skipna: 1748 self = self.dropna() # pylint: disable=self-cls-assignment 1749 1750 # See the online, numerically stable formulae at 1751 # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Higher-order_statistics 1752 # kurtosis here calculated as sample kurtosis 1753 # https://en.wikipedia.org/wiki/Kurtosis#Sample_kurtosis 1754 def compute_moments(x): 1755 n = len(x) 1756 if n == 0: 1757 m2, sum, m3, m4 = 0, 0, 0, 0 1758 else: 1759 m2 = x.std(ddof=0)**2 * n 1760 sum = x.sum() 1761 m3 = (((x - x.mean())**3).sum()) 1762 m4 = (((x - x.mean())**4).sum()) 1763 return pd.DataFrame(dict(m2=[m2], sum=[sum], n=[n], m3=[m3], m4=[m4])) 1764 1765 def combine_moments(data): 1766 m2 = sum = n = m3 = m4 = 0.0 1767 for datum in data.itertuples(): 1768 if datum.n == 0: 1769 continue 1770 elif n == 0: 1771 m2, sum, n, m3, m4 = datum.m2, datum.sum, datum.n, datum.m3, datum.m4 1772 else: 1773 n_a, n_b = datum.n, n 1774 m2_a, m2_b = datum.m2, m2 1775 m3_a, m3_b = datum.m3, m3 1776 sum_a, sum_b = datum.sum, sum 1777 mean_a, mean_b = sum_a / n_a, sum_b / n_b 1778 delta = mean_b - mean_a 1779 combined_n = n_a + n_b 1780 m4 += datum.m4 + ((delta**4) * (n_a * n_b) * ( 1781 (n_a**2) - (n_a * n_b) + 1782 (n_b**2)) / combined_n**3) + ((6 * delta**2) * ((n_a**2 * m2_b) + 1783 (n_b**2 * m2_a)) / 1784 (combined_n**2)) + ((4 * delta) * 1785 ((n_a * m3_b) - 1786 (n_b * m3_a)) / 1787 (combined_n)) 1788 m3 += datum.m3 + ( 1789 (delta**3 * ((n_a * n_b) * (n_a - n_b)) / ((combined_n)**2)) + 1790 ((3 * delta) * ((n_a * m2_b) - (n_b * m2_a)) / (combined_n))) 1791 m2 += datum.m2 + delta**2 * n_b * n_a / combined_n 1792 sum += datum.sum 1793 n += datum.n 1794 1795 if n < 4: 1796 return float('nan') 1797 elif m2 == 0: 1798 return float(0) 1799 else: 1800 return (((combined_n + 1) * (combined_n) * (combined_n - 1)) / 1801 ((combined_n - 2) * 1802 (combined_n - 3))) * (m4 / 1803 (m2)**2) - ((3 * (combined_n - 1)**2) / 1804 ((combined_n - 2) * 1805 (combined_n - 3))) 1806 1807 moments = expressions.ComputedExpression( 1808 'compute_moments', 1809 compute_moments, [self._expr], 1810 requires_partition_by=partitionings.Arbitrary()) 1811 with expressions.allow_non_parallel_operations(True): 1812 return frame_base.DeferredFrame.wrap( 1813 expressions.ComputedExpression( 1814 'combine_moments', 1815 combine_moments, [moments], 1816 requires_partition_by=partitionings.Singleton())) 1817 1818 @frame_base.with_docs_from(pd.Series) 1819 def kurt(self, *args, **kwargs): 1820 # Compute Kurtosis as kurt is an alias for kurtosis. 1821 return self.kurtosis(*args, **kwargs) 1822 1823 def _corr_aligned(self, other, min_periods): 1824 std_x = self.std() 1825 std_y = other.std() 1826 cov = self._cov_aligned(other, min_periods) 1827 return cov.apply( 1828 lambda cov, std_x, std_y: cov / (std_x * std_y), args=[std_x, std_y]) 1829 1830 @frame_base.with_docs_from(pd.Series) 1831 @frame_base.args_to_kwargs(pd.Series) 1832 @frame_base.populate_defaults(pd.Series) 1833 def cov(self, other, min_periods, ddof): 1834 x, y = self.dropna().align(other.dropna(), 'inner') 1835 return x._cov_aligned(y, min_periods, ddof) 1836 1837 def _cov_aligned(self, other, min_periods, ddof=1): 1838 # Use the formulae from 1839 # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Covariance 1840 def compute_co_moments(x, y): 1841 n = len(x) 1842 if n <= 1: 1843 c = 0 1844 else: 1845 c = x.cov(y) * (n - 1) 1846 sx = x.sum() 1847 sy = y.sum() 1848 return pd.DataFrame(dict(c=[c], sx=[sx], sy=[sy], n=[n])) 1849 1850 def combine_co_moments(data): 1851 c = sx = sy = n = 0.0 1852 for datum in data.itertuples(): 1853 if datum.n == 0: 1854 continue 1855 elif n == 0: 1856 c, sx, sy, n = datum.c, datum.sx, datum.sy, datum.n 1857 else: 1858 c += ( 1859 datum.c + (sx / n - datum.sx / datum.n) * 1860 (sy / n - datum.sy / datum.n) * n * datum.n / (n + datum.n)) 1861 sx += datum.sx 1862 sy += datum.sy 1863 n += datum.n 1864 if n < max(2, ddof, min_periods or 0): 1865 return float('nan') 1866 else: 1867 return c / (n - ddof) 1868 1869 moments = expressions.ComputedExpression( 1870 'compute_co_moments', 1871 compute_co_moments, [self._expr, other._expr], 1872 requires_partition_by=partitionings.Index()) 1873 1874 with expressions.allow_non_parallel_operations(True): 1875 return frame_base.DeferredFrame.wrap( 1876 expressions.ComputedExpression( 1877 'combine_co_moments', 1878 combine_co_moments, [moments], 1879 requires_partition_by=partitionings.Singleton())) 1880 1881 @frame_base.with_docs_from(pd.Series) 1882 @frame_base.args_to_kwargs(pd.Series) 1883 @frame_base.populate_defaults(pd.Series) 1884 @frame_base.maybe_inplace 1885 def dropna(self, **kwargs): 1886 return frame_base.DeferredFrame.wrap( 1887 expressions.ComputedExpression( 1888 'dropna', 1889 lambda df: df.dropna(**kwargs), [self._expr], 1890 preserves_partition_by=partitionings.Arbitrary(), 1891 requires_partition_by=partitionings.Arbitrary())) 1892 1893 @frame_base.with_docs_from(pd.Series) 1894 @frame_base.args_to_kwargs(pd.Series) 1895 @frame_base.populate_defaults(pd.Series) 1896 @frame_base.maybe_inplace 1897 def set_axis(self, labels, **kwargs): 1898 # TODO: assigning the index is generally order-sensitive, but we could 1899 # support it in some rare cases, e.g. when assigning the index from one 1900 # of a DataFrame's columns 1901 raise NotImplementedError( 1902 "Assigning an index is not yet supported. " 1903 "Consider using set_index() instead.") 1904 1905 isnull = isna = frame_base._elementwise_method('isna', base=pd.Series) 1906 notnull = notna = frame_base._elementwise_method('notna', base=pd.Series) 1907 1908 items = frame_base.wont_implement_method( 1909 pd.Series, 'items', reason="non-deferred-result") 1910 iteritems = frame_base.wont_implement_method( 1911 pd.Series, 'iteritems', reason="non-deferred-result") 1912 tolist = frame_base.wont_implement_method( 1913 pd.Series, 'tolist', reason="non-deferred-result") 1914 to_numpy = frame_base.wont_implement_method( 1915 pd.Series, 'to_numpy', reason="non-deferred-result") 1916 to_string = frame_base.wont_implement_method( 1917 pd.Series, 'to_string', reason="non-deferred-result") 1918 1919 def _wrap_in_df(self): 1920 return frame_base.DeferredFrame.wrap( 1921 expressions.ComputedExpression( 1922 'wrap_in_df', 1923 lambda s: pd.DataFrame(s), 1924 [self._expr], 1925 requires_partition_by=partitionings.Arbitrary(), 1926 preserves_partition_by=partitionings.Arbitrary(), 1927 )) 1928 1929 @frame_base.with_docs_from(pd.Series) 1930 @frame_base.args_to_kwargs(pd.Series) 1931 @frame_base.populate_defaults(pd.Series) 1932 @frame_base.maybe_inplace 1933 def duplicated(self, keep): 1934 """Only ``keep=False`` and ``keep="any"`` are supported. Other values of 1935 ``keep`` make this an order-sensitive operation. Note ``keep="any"`` is 1936 a Beam-specific option that guarantees only one duplicate will be kept, but 1937 unlike ``"first"`` and ``"last"`` it makes no guarantees about _which_ 1938 duplicate element is kept.""" 1939 # Re-use the DataFrame based duplcated, extract the series back out 1940 df = self._wrap_in_df() 1941 1942 return df.duplicated(keep=keep)[df.columns[0]] 1943 1944 @frame_base.with_docs_from(pd.Series) 1945 @frame_base.args_to_kwargs(pd.Series) 1946 @frame_base.populate_defaults(pd.Series) 1947 @frame_base.maybe_inplace 1948 def drop_duplicates(self, keep): 1949 """Only ``keep=False`` and ``keep="any"`` are supported. Other values of 1950 ``keep`` make this an order-sensitive operation. Note ``keep="any"`` is 1951 a Beam-specific option that guarantees only one duplicate will be kept, but 1952 unlike ``"first"`` and ``"last"`` it makes no guarantees about _which_ 1953 duplicate element is kept.""" 1954 # Re-use the DataFrame based drop_duplicates, extract the series back out 1955 df = self._wrap_in_df() 1956 1957 return df.drop_duplicates(keep=keep)[df.columns[0]] 1958 1959 @frame_base.with_docs_from(pd.Series) 1960 @frame_base.args_to_kwargs(pd.Series) 1961 @frame_base.populate_defaults(pd.Series) 1962 @frame_base.maybe_inplace 1963 def sample(self, **kwargs): 1964 """Only ``n`` and/or ``weights`` may be specified. ``frac``, 1965 ``random_state``, and ``replace=True`` are not yet supported. 1966 See `Issue 21010 <https://github.com/apache/beam/issues/21010>`_. 1967 1968 Note that pandas will raise an error if ``n`` is larger than the length 1969 of the dataset, while the Beam DataFrame API will simply return the full 1970 dataset in that case.""" 1971 1972 # Re-use the DataFrame based sample, extract the series back out 1973 df = self._wrap_in_df() 1974 1975 return df.sample(**kwargs)[df.columns[0]] 1976 1977 @frame_base.with_docs_from(pd.Series) 1978 @frame_base.args_to_kwargs(pd.Series) 1979 @frame_base.populate_defaults(pd.Series) 1980 def aggregate(self, func, axis, *args, **kwargs): 1981 """Some aggregation methods cannot be parallelized, and computing 1982 them will require collecting all data on a single machine.""" 1983 if kwargs.get('skipna', False): 1984 # Eagerly generate a proxy to make sure skipna is a valid argument 1985 # for this aggregation method 1986 _ = self._expr.proxy().aggregate(func, axis, *args, **kwargs) 1987 kwargs.pop('skipna') 1988 return self.dropna().aggregate(func, axis, *args, **kwargs) 1989 if isinstance(func, list) and len(func) > 1: 1990 # level arg is ignored for multiple aggregations 1991 _ = kwargs.pop('level', None) 1992 1993 # Aggregate with each method separately, then stick them all together. 1994 rows = [self.agg([f], *args, **kwargs) for f in func] 1995 return frame_base.DeferredFrame.wrap( 1996 expressions.ComputedExpression( 1997 'join_aggregate', 1998 lambda *rows: pd.concat(rows), [row._expr for row in rows])) 1999 else: 2000 # We're only handling a single column. It could be 'func' or ['func'], 2001 # which produce different results. 'func' produces a scalar, ['func'] 2002 # produces a single element Series. 2003 base_func = func[0] if isinstance(func, list) else func 2004 2005 if (_is_numeric(base_func) and 2006 not pd.core.dtypes.common.is_numeric_dtype(self.dtype)): 2007 warnings.warn( 2008 f"Performing a numeric aggregation, {base_func!r}, on " 2009 f"Series {self._expr.proxy().name!r} with non-numeric type " 2010 f"{self.dtype!r}. This can result in runtime errors or surprising " 2011 "results.") 2012 2013 if 'level' in kwargs: 2014 # Defer to groupby.agg for level= mode 2015 return self.groupby( 2016 level=kwargs.pop('level'), axis=axis).agg(func, *args, **kwargs) 2017 2018 singleton_reason = None 2019 if 'min_count' in kwargs: 2020 # Eagerly generate a proxy to make sure min_count is a valid argument 2021 # for this aggregation method 2022 _ = self._expr.proxy().agg(func, axis, *args, **kwargs) 2023 2024 singleton_reason = ( 2025 "Aggregation with min_count= requires collecting all data on a " 2026 "single node.") 2027 2028 # We have specialized distributed implementations for these 2029 if base_func in HAND_IMPLEMENTED_GLOBAL_AGGREGATIONS: 2030 result = getattr(self, base_func)(*args, **kwargs) 2031 if isinstance(func, list): 2032 with expressions.allow_non_parallel_operations(True): 2033 return frame_base.DeferredFrame.wrap( 2034 expressions.ComputedExpression( 2035 f'wrap_aggregate_{base_func}', 2036 lambda x: pd.Series(x, index=[base_func]), [result._expr], 2037 requires_partition_by=partitionings.Singleton(), 2038 preserves_partition_by=partitionings.Singleton())) 2039 else: 2040 return result 2041 2042 agg_kwargs = kwargs.copy() 2043 if ((_is_associative(base_func) or _is_liftable_with_sum(base_func)) and 2044 singleton_reason is None): 2045 intermediate = expressions.ComputedExpression( 2046 f'pre_aggregate_{base_func}', 2047 # Coerce to a Series, if the result is scalar we still want a Series 2048 # so we can combine and do the final aggregation next. 2049 lambda s: pd.Series(s.agg(func, *args, **kwargs)), 2050 [self._expr], 2051 requires_partition_by=partitionings.Arbitrary(), 2052 preserves_partition_by=partitionings.Singleton()) 2053 allow_nonparallel_final = True 2054 if _is_associative(base_func): 2055 agg_func = func 2056 else: 2057 agg_func = ['sum'] if isinstance(func, list) else 'sum' 2058 else: 2059 intermediate = self._expr 2060 allow_nonparallel_final = None # i.e. don't change the value 2061 agg_func = func 2062 singleton_reason = ( 2063 f"Aggregation function {func!r} cannot currently be " 2064 "parallelized. It requires collecting all data for " 2065 "this Series on a single node.") 2066 with expressions.allow_non_parallel_operations(allow_nonparallel_final): 2067 return frame_base.DeferredFrame.wrap( 2068 expressions.ComputedExpression( 2069 f'post_aggregate_{base_func}', 2070 lambda s: s.agg(agg_func, *args, **agg_kwargs), [intermediate], 2071 preserves_partition_by=partitionings.Singleton(), 2072 requires_partition_by=partitionings.Singleton( 2073 reason=singleton_reason))) 2074 2075 agg = aggregate 2076 2077 @property # type: ignore 2078 @frame_base.with_docs_from(pd.Series) 2079 def axes(self): 2080 return [self.index] 2081 2082 clip = frame_base._elementwise_method('clip', base=pd.Series) 2083 2084 all = _agg_method(pd.Series, 'all') 2085 any = _agg_method(pd.Series, 'any') 2086 # TODO(BEAM-12074): Document that Series.count(level=) will drop NaN's 2087 count = _agg_method(pd.Series, 'count') 2088 describe = _agg_method(pd.Series, 'describe') 2089 min = _agg_method(pd.Series, 'min') 2090 max = _agg_method(pd.Series, 'max') 2091 prod = product = _agg_method(pd.Series, 'prod') 2092 sum = _agg_method(pd.Series, 'sum') 2093 median = _agg_method(pd.Series, 'median') 2094 sem = _agg_method(pd.Series, 'sem') 2095 mad = _agg_method(pd.Series, 'mad') 2096 2097 argmax = frame_base.wont_implement_method( 2098 pd.Series, 'argmax', reason='order-sensitive') 2099 argmin = frame_base.wont_implement_method( 2100 pd.Series, 'argmin', reason='order-sensitive') 2101 cummax = frame_base.wont_implement_method( 2102 pd.Series, 'cummax', reason='order-sensitive') 2103 cummin = frame_base.wont_implement_method( 2104 pd.Series, 'cummin', reason='order-sensitive') 2105 cumprod = frame_base.wont_implement_method( 2106 pd.Series, 'cumprod', reason='order-sensitive') 2107 cumsum = frame_base.wont_implement_method( 2108 pd.Series, 'cumsum', reason='order-sensitive') 2109 diff = frame_base.wont_implement_method( 2110 pd.Series, 'diff', reason='order-sensitive') 2111 interpolate = frame_base.wont_implement_method( 2112 pd.Series, 'interpolate', reason='order-sensitive') 2113 searchsorted = frame_base.wont_implement_method( 2114 pd.Series, 'searchsorted', reason='order-sensitive') 2115 shift = frame_base.wont_implement_method( 2116 pd.Series, 'shift', reason='order-sensitive') 2117 pct_change = frame_base.wont_implement_method( 2118 pd.Series, 'pct_change', reason='order-sensitive') 2119 is_monotonic = frame_base.wont_implement_method( 2120 pd.Series, 'is_monotonic', reason='order-sensitive') 2121 is_monotonic_increasing = frame_base.wont_implement_method( 2122 pd.Series, 'is_monotonic_increasing', reason='order-sensitive') 2123 is_monotonic_decreasing = frame_base.wont_implement_method( 2124 pd.Series, 'is_monotonic_decreasing', reason='order-sensitive') 2125 asof = frame_base.wont_implement_method( 2126 pd.Series, 'asof', reason='order-sensitive') 2127 first_valid_index = frame_base.wont_implement_method( 2128 pd.Series, 'first_valid_index', reason='order-sensitive') 2129 last_valid_index = frame_base.wont_implement_method( 2130 pd.Series, 'last_valid_index', reason='order-sensitive') 2131 autocorr = frame_base.wont_implement_method( 2132 pd.Series, 'autocorr', reason='order-sensitive') 2133 iat = property( 2134 frame_base.wont_implement_method( 2135 pd.Series, 'iat', reason='order-sensitive')) 2136 2137 head = frame_base.wont_implement_method( 2138 pd.Series, 'head', explanation=_PEEK_METHOD_EXPLANATION) 2139 tail = frame_base.wont_implement_method( 2140 pd.Series, 'tail', explanation=_PEEK_METHOD_EXPLANATION) 2141 2142 filter = frame_base._elementwise_method('filter', base=pd.Series) 2143 2144 memory_usage = frame_base.wont_implement_method( 2145 pd.Series, 'memory_usage', reason="non-deferred-result") 2146 nbytes = frame_base.wont_implement_method( 2147 pd.Series, 'nbytes', reason="non-deferred-result") 2148 to_list = frame_base.wont_implement_method( 2149 pd.Series, 'to_list', reason="non-deferred-result") 2150 2151 factorize = frame_base.wont_implement_method( 2152 pd.Series, 'factorize', reason="non-deferred-columns") 2153 2154 # In Series __contains__ checks the index 2155 __contains__ = frame_base.wont_implement_method( 2156 pd.Series, '__contains__', reason="non-deferred-result") 2157 2158 @frame_base.with_docs_from(pd.Series) 2159 @frame_base.args_to_kwargs(pd.Series) 2160 @frame_base.populate_defaults(pd.Series) 2161 def nlargest(self, keep, **kwargs): 2162 """Only ``keep=False`` and ``keep="any"`` are supported. Other values of 2163 ``keep`` make this an order-sensitive operation. Note ``keep="any"`` is 2164 a Beam-specific option that guarantees only one duplicate will be kept, but 2165 unlike ``"first"`` and ``"last"`` it makes no guarantees about _which_ 2166 duplicate element is kept.""" 2167 # TODO(robertwb): Document 'any' option. 2168 # TODO(robertwb): Consider (conditionally) defaulting to 'any' if no 2169 # explicit keep parameter is requested. 2170 if keep == 'any': 2171 keep = 'first' 2172 elif keep != 'all': 2173 raise frame_base.WontImplementError( 2174 f"nlargest(keep={keep!r}) is not supported because it is " 2175 "order sensitive. Only keep=\"all\" is supported.", 2176 reason="order-sensitive") 2177 kwargs['keep'] = keep 2178 per_partition = expressions.ComputedExpression( 2179 'nlargest-per-partition', 2180 lambda df: df.nlargest(**kwargs), [self._expr], 2181 preserves_partition_by=partitionings.Arbitrary(), 2182 requires_partition_by=partitionings.Arbitrary()) 2183 with expressions.allow_non_parallel_operations(True): 2184 return frame_base.DeferredFrame.wrap( 2185 expressions.ComputedExpression( 2186 'nlargest', 2187 lambda df: df.nlargest(**kwargs), [per_partition], 2188 preserves_partition_by=partitionings.Arbitrary(), 2189 requires_partition_by=partitionings.Singleton())) 2190 2191 @frame_base.with_docs_from(pd.Series) 2192 @frame_base.args_to_kwargs(pd.Series) 2193 @frame_base.populate_defaults(pd.Series) 2194 def nsmallest(self, keep, **kwargs): 2195 """Only ``keep=False`` and ``keep="any"`` are supported. Other values of 2196 ``keep`` make this an order-sensitive operation. Note ``keep="any"`` is 2197 a Beam-specific option that guarantees only one duplicate will be kept, but 2198 unlike ``"first"`` and ``"last"`` it makes no guarantees about _which_ 2199 duplicate element is kept.""" 2200 if keep == 'any': 2201 keep = 'first' 2202 elif keep != 'all': 2203 raise frame_base.WontImplementError( 2204 f"nsmallest(keep={keep!r}) is not supported because it is " 2205 "order sensitive. Only keep=\"all\" is supported.", 2206 reason="order-sensitive") 2207 kwargs['keep'] = keep 2208 per_partition = expressions.ComputedExpression( 2209 'nsmallest-per-partition', 2210 lambda df: df.nsmallest(**kwargs), [self._expr], 2211 preserves_partition_by=partitionings.Arbitrary(), 2212 requires_partition_by=partitionings.Arbitrary()) 2213 with expressions.allow_non_parallel_operations(True): 2214 return frame_base.DeferredFrame.wrap( 2215 expressions.ComputedExpression( 2216 'nsmallest', 2217 lambda df: df.nsmallest(**kwargs), [per_partition], 2218 preserves_partition_by=partitionings.Arbitrary(), 2219 requires_partition_by=partitionings.Singleton())) 2220 2221 @property # type: ignore 2222 @frame_base.with_docs_from(pd.Series) 2223 def is_unique(self): 2224 def set_index(s): 2225 s = s[:] 2226 s.index = s 2227 return s 2228 2229 self_index = expressions.ComputedExpression( 2230 'set_index', 2231 set_index, [self._expr], 2232 requires_partition_by=partitionings.Arbitrary(), 2233 preserves_partition_by=partitionings.Singleton()) 2234 2235 is_unique_distributed = expressions.ComputedExpression( 2236 'is_unique_distributed', 2237 lambda s: pd.Series(s.is_unique), [self_index], 2238 requires_partition_by=partitionings.Index(), 2239 preserves_partition_by=partitionings.Singleton()) 2240 2241 with expressions.allow_non_parallel_operations(): 2242 return frame_base.DeferredFrame.wrap( 2243 expressions.ComputedExpression( 2244 'combine', 2245 lambda s: s.all(), [is_unique_distributed], 2246 requires_partition_by=partitionings.Singleton(), 2247 preserves_partition_by=partitionings.Singleton())) 2248 2249 plot = frame_base.wont_implement_method( 2250 pd.Series, 'plot', reason="plotting-tools") 2251 pop = frame_base.wont_implement_method( 2252 pd.Series, 'pop', reason="non-deferred-result") 2253 2254 rename_axis = frame_base._elementwise_method('rename_axis', base=pd.Series) 2255 2256 round = frame_base._elementwise_method('round', base=pd.Series) 2257 2258 take = frame_base.wont_implement_method( 2259 pd.Series, 'take', reason='deprecated') 2260 2261 to_dict = frame_base.wont_implement_method( 2262 pd.Series, 'to_dict', reason="non-deferred-result") 2263 2264 to_frame = frame_base._elementwise_method('to_frame', base=pd.Series) 2265 2266 @frame_base.with_docs_from(pd.Series) 2267 def unique(self, as_series=False): 2268 """unique is not supported by default because it produces a 2269 non-deferred result: an :class:`~numpy.ndarray`. You can use the 2270 Beam-specific argument ``unique(as_series=True)`` to get the result as 2271 a :class:`DeferredSeries`""" 2272 2273 if not as_series: 2274 raise frame_base.WontImplementError( 2275 "unique() is not supported by default because it produces a " 2276 "non-deferred result: a numpy array. You can use the Beam-specific " 2277 "argument unique(as_series=True) to get the result as a " 2278 "DeferredSeries", 2279 reason="non-deferred-result") 2280 return frame_base.DeferredFrame.wrap( 2281 expressions.ComputedExpression( 2282 'unique', 2283 lambda df: pd.Series(df.unique()), [self._expr], 2284 preserves_partition_by=partitionings.Singleton(), 2285 requires_partition_by=partitionings.Singleton( 2286 reason="unique() cannot currently be parallelized."))) 2287 2288 @frame_base.with_docs_from(pd.Series) 2289 def update(self, other): 2290 self._expr = expressions.ComputedExpression( 2291 'update', 2292 lambda df, 2293 other: df.update(other) or df, [self._expr, other._expr], 2294 preserves_partition_by=partitionings.Arbitrary(), 2295 requires_partition_by=partitionings.Index()) 2296 2297 @frame_base.with_docs_from(pd.Series) 2298 def value_counts( 2299 self, 2300 sort=False, 2301 normalize=False, 2302 ascending=False, 2303 bins=None, 2304 dropna=True): 2305 """``sort`` is ``False`` by default, and ``sort=True`` is not supported 2306 because it imposes an ordering on the dataset which likely will not be 2307 preserved. 2308 2309 When ``bin`` is specified this operation is not parallelizable. See 2310 [Issue 20903](https://github.com/apache/beam/issues/20903) tracking the 2311 possible addition of a distributed implementation.""" 2312 2313 if sort: 2314 raise frame_base.WontImplementError( 2315 "value_counts(sort=True) is not supported because it imposes an " 2316 "ordering on the dataset which likely will not be preserved.", 2317 reason="order-sensitive") 2318 2319 if bins is not None: 2320 return frame_base.DeferredFrame.wrap( 2321 expressions.ComputedExpression( 2322 'value_counts', 2323 lambda s: s.value_counts( 2324 normalize=normalize, bins=bins, dropna=dropna)[self._expr], 2325 requires_partition_by=partitionings.Singleton( 2326 reason=( 2327 "value_counts with bin specified requires collecting " 2328 "the entire dataset to identify the range.")), 2329 preserves_partition_by=partitionings.Singleton(), 2330 )) 2331 2332 if dropna: 2333 column = self.dropna() 2334 else: 2335 column = self 2336 2337 result = column.groupby(column, dropna=dropna).size() 2338 2339 # groupby.size() names the index, which we don't need 2340 result.index.name = None 2341 2342 if normalize: 2343 return result / column.length() 2344 else: 2345 return result 2346 2347 values = property( 2348 frame_base.wont_implement_method( 2349 pd.Series, 'values', reason="non-deferred-result")) 2350 2351 view = frame_base.wont_implement_method( 2352 pd.Series, 2353 'view', 2354 explanation=( 2355 "because it relies on memory-sharing semantics that are " 2356 "not compatible with the Beam model.")) 2357 2358 @property # type: ignore 2359 @frame_base.with_docs_from(pd.Series) 2360 def str(self): 2361 return _DeferredStringMethods(self._expr) 2362 2363 @property # type: ignore 2364 @frame_base.with_docs_from(pd.Series) 2365 def cat(self): 2366 return _DeferredCategoricalMethods(self._expr) 2367 2368 @property # type: ignore 2369 @frame_base.with_docs_from(pd.Series) 2370 def dt(self): 2371 return _DeferredDatetimeMethods(self._expr) 2372 2373 @frame_base.with_docs_from(pd.Series) 2374 def mode(self, *args, **kwargs): 2375 """mode is not currently parallelizable. An approximate, 2376 parallelizable implementation of mode may be added in the future 2377 (`Issue 20946 <https://github.com/apache/beam/issues/20946>`_).""" 2378 return frame_base.DeferredFrame.wrap( 2379 expressions.ComputedExpression( 2380 'mode', 2381 lambda df: df.mode(*args, **kwargs), 2382 [self._expr], 2383 #TODO(https://github.com/apache/beam/issues/20946): 2384 # Can we add an approximate implementation? 2385 requires_partition_by=partitionings.Singleton( 2386 reason=( 2387 "mode cannot currently be parallelized. See " 2388 "https://github.com/apache/beam/issues/20946 tracking the " 2389 "possble addition of an approximate, parallelizable " 2390 "implementation of mode.")), 2391 preserves_partition_by=partitionings.Singleton())) 2392 2393 apply = frame_base._elementwise_method('apply', base=pd.Series) 2394 map = frame_base._elementwise_method('map', base=pd.Series) 2395 # TODO(https://github.com/apache/beam/issues/20764): Implement transform 2396 # using type inference to determine the proxy 2397 #transform = frame_base._elementwise_method('transform', base=pd.Series) 2398 2399 @frame_base.with_docs_from(pd.Series) 2400 @frame_base.args_to_kwargs(pd.Series) 2401 @frame_base.populate_defaults(pd.Series) 2402 def repeat(self, repeats, axis): 2403 """``repeats`` must be an ``int`` or a :class:`DeferredSeries`. Lists are 2404 not supported because they make this operation order-sensitive.""" 2405 if isinstance(repeats, int): 2406 return frame_base.DeferredFrame.wrap( 2407 expressions.ComputedExpression( 2408 'repeat', 2409 lambda series: series.repeat(repeats), [self._expr], 2410 requires_partition_by=partitionings.Arbitrary(), 2411 preserves_partition_by=partitionings.Arbitrary())) 2412 elif isinstance(repeats, frame_base.DeferredBase): 2413 return frame_base.DeferredFrame.wrap( 2414 expressions.ComputedExpression( 2415 'repeat', 2416 lambda series, 2417 repeats_series: series.repeat(repeats_series), 2418 [self._expr, repeats._expr], 2419 requires_partition_by=partitionings.Index(), 2420 preserves_partition_by=partitionings.Arbitrary())) 2421 elif isinstance(repeats, list): 2422 raise frame_base.WontImplementError( 2423 "repeat(repeats=) repeats must be an int or a DeferredSeries. " 2424 "Lists are not supported because they make this operation sensitive " 2425 "to the order of the data.", 2426 reason="order-sensitive") 2427 else: 2428 raise TypeError( 2429 "repeat(repeats=) value must be an int or a " 2430 f"DeferredSeries (encountered {type(repeats)}).") 2431 2432 if hasattr(pd.Series, 'compare'): 2433 2434 @frame_base.with_docs_from(pd.Series) 2435 @frame_base.args_to_kwargs(pd.Series) 2436 @frame_base.populate_defaults(pd.Series) 2437 def compare(self, other, align_axis, **kwargs): 2438 2439 if align_axis in ('index', 0): 2440 preserves_partition = partitionings.Singleton() 2441 elif align_axis in ('columns', 1): 2442 preserves_partition = partitionings.Arbitrary() 2443 else: 2444 raise ValueError( 2445 "align_axis must be one of ('index', 0, 'columns', 1). " 2446 f"got {align_axis!r}.") 2447 2448 return frame_base.DeferredFrame.wrap( 2449 expressions.ComputedExpression( 2450 'compare', 2451 lambda s, 2452 other: s.compare(other, align_axis, **kwargs), 2453 [self._expr, other._expr], 2454 requires_partition_by=partitionings.Index(), 2455 preserves_partition_by=preserves_partition)) 2456 2457 2458 @populate_not_implemented(pd.DataFrame) 2459 @frame_base.DeferredFrame._register_for(pd.DataFrame) 2460 class DeferredDataFrame(DeferredDataFrameOrSeries): 2461 def __repr__(self): 2462 return ( 2463 f'DeferredDataFrame(columns={list(self.columns)}, ' 2464 f'{self._render_indexes()})') 2465 2466 @property # type: ignore 2467 @frame_base.with_docs_from(pd.DataFrame) 2468 def columns(self): 2469 return self._expr.proxy().columns 2470 2471 @columns.setter 2472 def columns(self, columns): 2473 def set_columns(df): 2474 df = df.copy() 2475 df.columns = columns 2476 return df 2477 2478 return frame_base.DeferredFrame.wrap( 2479 expressions.ComputedExpression( 2480 'set_columns', 2481 set_columns, [self._expr], 2482 requires_partition_by=partitionings.Arbitrary(), 2483 preserves_partition_by=partitionings.Arbitrary())) 2484 2485 @frame_base.with_docs_from(pd.DataFrame) 2486 def keys(self): 2487 return self.columns 2488 2489 def __getattr__(self, name): 2490 # Column attribute access. 2491 if name in self._expr.proxy().columns: 2492 return self[name] 2493 else: 2494 return object.__getattribute__(self, name) 2495 2496 def __getitem__(self, key): 2497 # TODO: Replicate pd.DataFrame.__getitem__ logic 2498 if isinstance(key, DeferredSeries) and key._expr.proxy().dtype == bool: 2499 return self.loc[key] 2500 2501 elif isinstance(key, frame_base.DeferredBase): 2502 # Fail early if key is a DeferredBase as it interacts surprisingly with 2503 # key in self._expr.proxy().columns 2504 raise NotImplementedError( 2505 "Indexing with a non-bool deferred frame is not yet supported. " 2506 "Consider using df.loc[...]") 2507 2508 elif isinstance(key, slice): 2509 if _is_null_slice(key): 2510 return self 2511 elif _is_integer_slice(key): 2512 # This depends on the contents of the index. 2513 raise frame_base.WontImplementError( 2514 "Integer slices are not supported as they are ambiguous. Please " 2515 "use iloc or loc with integer slices.") 2516 else: 2517 return self.loc[key] 2518 2519 elif ( 2520 (isinstance(key, list) and all(key_column in self._expr.proxy().columns 2521 for key_column in key)) or 2522 key in self._expr.proxy().columns): 2523 return self._elementwise(lambda df: df[key], 'get_column') 2524 2525 else: 2526 raise NotImplementedError(key) 2527 2528 def __contains__(self, key): 2529 # Checks if proxy has the given column 2530 return self._expr.proxy().__contains__(key) 2531 2532 def __setitem__(self, key, value): 2533 if isinstance( 2534 key, str) or (isinstance(key, list) and 2535 all(isinstance(c, str) 2536 for c in key)) or (isinstance(key, DeferredSeries) and 2537 key._expr.proxy().dtype == bool): 2538 # yapf: disable 2539 return self._elementwise( 2540 lambda df, key, value: df.__setitem__(key, value), 2541 'set_column', 2542 (key, value), 2543 inplace=True) 2544 else: 2545 raise NotImplementedError(key) 2546 2547 @frame_base.with_docs_from(pd.DataFrame) 2548 @frame_base.args_to_kwargs(pd.DataFrame) 2549 @frame_base.populate_defaults(pd.DataFrame) 2550 def align(self, other, join, axis, copy, level, method, **kwargs): 2551 """Aligning per level is not yet supported. Only the default, 2552 ``level=None``, is allowed. 2553 2554 Filling NaN values via ``method`` is not supported, because it is 2555 `order-sensitive 2556 <https://s.apache.org/dataframe-order-sensitive-operations>`_. Only the 2557 default, ``method=None``, is allowed. 2558 2559 ``copy=False`` is not supported because its behavior (whether or not it is 2560 an inplace operation) depends on the data.""" 2561 if not copy: 2562 raise frame_base.WontImplementError( 2563 "align(copy=False) is not supported because it might be an inplace " 2564 "operation depending on the data. Please prefer the default " 2565 "align(copy=True).") 2566 if method is not None: 2567 raise frame_base.WontImplementError( 2568 f"align(method={method!r}) is not supported because it is " 2569 "order sensitive. Only align(method=None) is supported.", 2570 reason="order-sensitive") 2571 if kwargs: 2572 raise NotImplementedError('align(%s)' % ', '.join(kwargs.keys())) 2573 2574 if level is not None: 2575 # Could probably get by partitioning on the used levels. 2576 requires_partition_by = partitionings.Singleton(reason=( 2577 f"align(level={level}) is not currently parallelizable. Only " 2578 "align(level=None) can be parallelized.")) 2579 elif axis in ('columns', 1): 2580 requires_partition_by = partitionings.Arbitrary() 2581 else: 2582 requires_partition_by = partitionings.Index() 2583 return frame_base.DeferredFrame.wrap( 2584 expressions.ComputedExpression( 2585 'align', 2586 lambda df, other: df.align(other, join=join, axis=axis), 2587 [self._expr, other._expr], 2588 requires_partition_by=requires_partition_by, 2589 preserves_partition_by=partitionings.Arbitrary())) 2590 2591 @frame_base.with_docs_from(pd.DataFrame) 2592 @frame_base.args_to_kwargs(pd.DataFrame) 2593 @frame_base.populate_defaults(pd.DataFrame) 2594 def append(self, other, ignore_index, verify_integrity, sort, **kwargs): 2595 """``ignore_index=True`` is not supported, because it requires generating an 2596 order-sensitive index.""" 2597 if not isinstance(other, DeferredDataFrame): 2598 raise frame_base.WontImplementError( 2599 "append() only accepts DeferredDataFrame instances, received " + 2600 str(type(other))) 2601 if ignore_index: 2602 raise frame_base.WontImplementError( 2603 "append(ignore_index=True) is order sensitive because it requires " 2604 "generating a new index based on the order of the data.", 2605 reason="order-sensitive") 2606 2607 if verify_integrity: 2608 # We can verify the index is non-unique within index partitioned data. 2609 requires = partitionings.Index() 2610 else: 2611 requires = partitionings.Arbitrary() 2612 2613 return frame_base.DeferredFrame.wrap( 2614 expressions.ComputedExpression( 2615 'append', 2616 lambda s, other: s.append(other, sort=sort, 2617 verify_integrity=verify_integrity, 2618 **kwargs), 2619 [self._expr, other._expr], 2620 requires_partition_by=requires, 2621 preserves_partition_by=partitionings.Arbitrary() 2622 ) 2623 ) 2624 2625 # If column name exists this is a simple project, otherwise it is a constant 2626 # (default_value) 2627 @frame_base.with_docs_from(pd.DataFrame) 2628 def get(self, key, default_value=None): 2629 if key in self.columns: 2630 return self[key] 2631 else: 2632 return default_value 2633 2634 @frame_base.with_docs_from(pd.DataFrame) 2635 @frame_base.args_to_kwargs(pd.DataFrame) 2636 @frame_base.populate_defaults(pd.DataFrame) 2637 @frame_base.maybe_inplace 2638 def set_index(self, keys, **kwargs): 2639 """``keys`` must be a ``str`` or ``List[str]``. Passing an Index or Series 2640 is not yet supported (`Issue 20759 2641 <https://github.com/apache/beam/issues/20759>`_).""" 2642 if isinstance(keys, str): 2643 keys = [keys] 2644 2645 if any(isinstance(k, (_DeferredIndex, frame_base.DeferredFrame)) 2646 for k in keys): 2647 raise NotImplementedError("set_index with Index or Series instances is " 2648 "not yet supported " 2649 "(https://github.com/apache/beam/issues/20759)" 2650 ".") 2651 2652 return frame_base.DeferredFrame.wrap( 2653 expressions.ComputedExpression( 2654 'set_index', 2655 lambda df: df.set_index(keys, **kwargs), 2656 [self._expr], 2657 requires_partition_by=partitionings.Arbitrary(), 2658 preserves_partition_by=partitionings.Singleton())) 2659 2660 2661 @frame_base.with_docs_from(pd.DataFrame) 2662 @frame_base.args_to_kwargs(pd.DataFrame) 2663 @frame_base.populate_defaults(pd.DataFrame) 2664 @frame_base.maybe_inplace 2665 def set_axis(self, labels, axis, **kwargs): 2666 if axis in ('index', 0): 2667 # TODO: assigning the index is generally order-sensitive, but we could 2668 # support it in some rare cases, e.g. when assigning the index from one 2669 # of a DataFrame's columns 2670 raise NotImplementedError( 2671 "Assigning an index is not yet supported. " 2672 "Consider using set_index() instead.") 2673 else: 2674 return frame_base.DeferredFrame.wrap( 2675 expressions.ComputedExpression( 2676 'set_axis', 2677 lambda df: df.set_axis(labels, axis, **kwargs), 2678 [self._expr], 2679 requires_partition_by=partitionings.Arbitrary(), 2680 preserves_partition_by=partitionings.Arbitrary())) 2681 2682 2683 @property # type: ignore 2684 @frame_base.with_docs_from(pd.DataFrame) 2685 def axes(self): 2686 return (self.index, self.columns) 2687 2688 @property # type: ignore 2689 @frame_base.with_docs_from(pd.DataFrame) 2690 def dtypes(self): 2691 return self._expr.proxy().dtypes 2692 2693 @frame_base.with_docs_from(pd.DataFrame) 2694 def assign(self, **kwargs): 2695 """``value`` must be a ``callable`` or :class:`DeferredSeries`. Other types 2696 make this operation order-sensitive.""" 2697 for name, value in kwargs.items(): 2698 if not callable(value) and not isinstance(value, DeferredSeries): 2699 raise frame_base.WontImplementError( 2700 f"Unsupported value for new column '{name}': '{value}'. Only " 2701 "callables and DeferredSeries instances are supported. Other types " 2702 "make this operation sensitive to the order of the data", 2703 reason="order-sensitive") 2704 return self._elementwise( 2705 lambda df, *args, **kwargs: df.assign(*args, **kwargs), 2706 'assign', 2707 other_kwargs=kwargs) 2708 2709 @frame_base.with_docs_from(pd.DataFrame) 2710 @frame_base.args_to_kwargs(pd.DataFrame) 2711 @frame_base.populate_defaults(pd.DataFrame) 2712 def explode(self, column, ignore_index): 2713 # ignoring the index will not preserve it 2714 preserves = (partitionings.Singleton() if ignore_index 2715 else partitionings.Index()) 2716 return frame_base.DeferredFrame.wrap( 2717 expressions.ComputedExpression( 2718 'explode', 2719 lambda df: df.explode(column, ignore_index), 2720 [self._expr], 2721 preserves_partition_by=preserves, 2722 requires_partition_by=partitionings.Arbitrary())) 2723 2724 @frame_base.with_docs_from(pd.DataFrame) 2725 @frame_base.args_to_kwargs(pd.DataFrame) 2726 @frame_base.populate_defaults(pd.DataFrame) 2727 def insert(self, value, **kwargs): 2728 """``value`` cannot be a ``List`` because aligning it with this 2729 DeferredDataFrame is order-sensitive.""" 2730 if isinstance(value, list): 2731 raise frame_base.WontImplementError( 2732 "insert(value=list) is not supported because it joins the input " 2733 "list to the deferred DataFrame based on the order of the data.", 2734 reason="order-sensitive") 2735 2736 if isinstance(value, pd.core.generic.NDFrame): 2737 value = frame_base.DeferredFrame.wrap( 2738 expressions.ConstantExpression(value)) 2739 2740 if isinstance(value, frame_base.DeferredFrame): 2741 def func_zip(df, value): 2742 df = df.copy() 2743 df.insert(value=value, **kwargs) 2744 return df 2745 2746 inserted = frame_base.DeferredFrame.wrap( 2747 expressions.ComputedExpression( 2748 'insert', 2749 func_zip, 2750 [self._expr, value._expr], 2751 requires_partition_by=partitionings.Index(), 2752 preserves_partition_by=partitionings.Arbitrary())) 2753 else: 2754 def func_elementwise(df): 2755 df = df.copy() 2756 df.insert(value=value, **kwargs) 2757 return df 2758 inserted = frame_base.DeferredFrame.wrap( 2759 expressions.ComputedExpression( 2760 'insert', 2761 func_elementwise, 2762 [self._expr], 2763 requires_partition_by=partitionings.Arbitrary(), 2764 preserves_partition_by=partitionings.Arbitrary())) 2765 2766 self._expr = inserted._expr 2767 2768 @staticmethod 2769 @frame_base.with_docs_from(pd.DataFrame) 2770 def from_dict(*args, **kwargs): 2771 return frame_base.DeferredFrame.wrap( 2772 expressions.ConstantExpression(pd.DataFrame.from_dict(*args, **kwargs))) 2773 2774 @staticmethod 2775 @frame_base.with_docs_from(pd.DataFrame) 2776 def from_records(*args, **kwargs): 2777 return frame_base.DeferredFrame.wrap( 2778 expressions.ConstantExpression(pd.DataFrame.from_records(*args, 2779 **kwargs))) 2780 2781 @frame_base.with_docs_from(pd.DataFrame) 2782 @frame_base.args_to_kwargs(pd.DataFrame) 2783 @frame_base.populate_defaults(pd.DataFrame) 2784 @frame_base.maybe_inplace 2785 def duplicated(self, keep, subset): 2786 """Only ``keep=False`` and ``keep="any"`` are supported. Other values of 2787 ``keep`` make this an order-sensitive operation. Note ``keep="any"`` is 2788 a Beam-specific option that guarantees only one duplicate will be kept, but 2789 unlike ``"first"`` and ``"last"`` it makes no guarantees about _which_ 2790 duplicate element is kept.""" 2791 # TODO(BEAM-12074): Document keep="any" 2792 if keep == 'any': 2793 keep = 'first' 2794 elif keep is not False: 2795 raise frame_base.WontImplementError( 2796 f"duplicated(keep={keep!r}) is not supported because it is " 2797 "sensitive to the order of the data. Only keep=False and " 2798 "keep=\"any\" are supported.", 2799 reason="order-sensitive") 2800 2801 by = subset or list(self.columns) 2802 2803 return self.groupby(by).apply( 2804 lambda df: pd.DataFrame(df.duplicated(keep=keep, subset=subset), 2805 columns=[None]))[None].droplevel(by) 2806 2807 @frame_base.with_docs_from(pd.DataFrame) 2808 @frame_base.args_to_kwargs(pd.DataFrame) 2809 @frame_base.populate_defaults(pd.DataFrame) 2810 @frame_base.maybe_inplace 2811 def drop_duplicates(self, keep, subset, ignore_index): 2812 """Only ``keep=False`` and ``keep="any"`` are supported. Other values of 2813 ``keep`` make this an order-sensitive operation. Note ``keep="any"`` is 2814 a Beam-specific option that guarantees only one duplicate will be kept, but 2815 unlike ``"first"`` and ``"last"`` it makes no guarantees about _which_ 2816 duplicate element is kept.""" 2817 # TODO(BEAM-12074): Document keep="any" 2818 if keep == 'any': 2819 keep = 'first' 2820 elif keep is not False: 2821 raise frame_base.WontImplementError( 2822 f"drop_duplicates(keep={keep!r}) is not supported because it is " 2823 "sensitive to the order of the data. Only keep=False and " 2824 "keep=\"any\" are supported.", 2825 reason="order-sensitive") 2826 2827 if ignore_index is not False: 2828 raise frame_base.WontImplementError( 2829 "drop_duplicates(ignore_index=False) is not supported because it " 2830 "requires generating a new index that is sensitive to the order of " 2831 "the data.", 2832 reason="order-sensitive") 2833 2834 by = subset or list(self.columns) 2835 2836 return self.groupby(by).apply( 2837 lambda df: df.drop_duplicates(keep=keep, subset=subset)).droplevel(by) 2838 2839 @frame_base.with_docs_from(pd.DataFrame) 2840 @frame_base.args_to_kwargs(pd.DataFrame) 2841 @frame_base.populate_defaults(pd.DataFrame) 2842 def aggregate(self, func, axis, *args, **kwargs): 2843 # We have specialized implementations for these. 2844 if func in ('quantile',): 2845 return getattr(self, func)(*args, axis=axis, **kwargs) 2846 2847 # In pandas<1.3.0, maps to a property, args are ignored 2848 if func in ('size',) and PD_VERSION < (1, 3): 2849 return getattr(self, func) 2850 2851 # We also have specialized distributed implementations for these. They only 2852 # support axis=0 (implicitly) though. axis=1 should fall through 2853 if func in ('corr', 'cov') and axis in (0, 'index'): 2854 return getattr(self, func)(*args, **kwargs) 2855 2856 if axis is None: 2857 # Aggregate across all elements by first aggregating across columns, 2858 # then across rows. 2859 return self.agg(func, *args, **dict(kwargs, axis=1)).agg( 2860 func, *args, **dict(kwargs, axis=0)) 2861 elif axis in (1, 'columns'): 2862 # This is an easy elementwise aggregation. 2863 return frame_base.DeferredFrame.wrap( 2864 expressions.ComputedExpression( 2865 'aggregate', 2866 lambda df: df.agg(func, axis=1, *args, **kwargs), 2867 [self._expr], 2868 requires_partition_by=partitionings.Arbitrary())) 2869 elif len(self._expr.proxy().columns) == 0: 2870 # For this corner case, just colocate everything. 2871 return frame_base.DeferredFrame.wrap( 2872 expressions.ComputedExpression( 2873 'aggregate', 2874 lambda df: df.agg(func, *args, **kwargs), 2875 [self._expr], 2876 requires_partition_by=partitionings.Singleton())) 2877 else: 2878 # In the general case, we will compute the aggregation of each column 2879 # separately, then recombine. 2880 2881 # First, handle any kwargs that cause a projection, by eagerly generating 2882 # the proxy, and only including the columns that are in the output. 2883 PROJECT_KWARGS = ('numeric_only', 'bool_only', 'include', 'exclude') 2884 proxy = self._expr.proxy().agg(func, axis, *args, **kwargs) 2885 2886 if isinstance(proxy, pd.DataFrame): 2887 projected = self[list(proxy.columns)] 2888 elif isinstance(proxy, pd.Series): 2889 projected = self[list(proxy.index)] 2890 else: 2891 projected = self 2892 2893 nonnumeric_columns = [name for (name, dtype) in projected.dtypes.items() 2894 if not 2895 pd.core.dtypes.common.is_numeric_dtype(dtype)] 2896 2897 if _is_numeric(func) and nonnumeric_columns: 2898 if 'numeric_only' in kwargs and kwargs['numeric_only'] is False: 2899 # User has opted in to execution with non-numeric columns, they 2900 # will accept runtime errors 2901 pass 2902 else: 2903 raise frame_base.WontImplementError( 2904 f"Numeric aggregation ({func!r}) on a DataFrame containing " 2905 f"non-numeric columns ({*nonnumeric_columns,!r} is not " 2906 "supported, unless `numeric_only=` is specified.\n" 2907 "Use `numeric_only=True` to only aggregate over numeric " 2908 "columns.\nUse `numeric_only=False` to aggregate over all " 2909 "columns. Note this is not recommended, as it could result in " 2910 "execution time errors.") 2911 2912 for key in PROJECT_KWARGS: 2913 if key in kwargs: 2914 kwargs.pop(key) 2915 2916 if not isinstance(func, dict): 2917 col_names = list(projected._expr.proxy().columns) 2918 func_by_col = {col: func for col in col_names} 2919 else: 2920 func_by_col = func 2921 col_names = list(func.keys()) 2922 aggregated_cols = [] 2923 has_lists = any(isinstance(f, list) for f in func_by_col.values()) 2924 for col in col_names: 2925 funcs = func_by_col[col] 2926 if has_lists and not isinstance(funcs, list): 2927 # If any of the columns do multiple aggregations, they all must use 2928 # "list" style output 2929 funcs = [funcs] 2930 aggregated_cols.append(projected[col].agg(funcs, *args, **kwargs)) 2931 # The final shape is different depending on whether any of the columns 2932 # were aggregated by a list of aggregators. 2933 with expressions.allow_non_parallel_operations(): 2934 if isinstance(proxy, pd.Series): 2935 return frame_base.DeferredFrame.wrap( 2936 expressions.ComputedExpression( 2937 'join_aggregate', 2938 lambda *cols: pd.Series( 2939 {col: value for col, value in zip(col_names, cols)}), 2940 [col._expr for col in aggregated_cols], 2941 requires_partition_by=partitionings.Singleton())) 2942 elif isinstance(proxy, pd.DataFrame): 2943 return frame_base.DeferredFrame.wrap( 2944 expressions.ComputedExpression( 2945 'join_aggregate', 2946 lambda *cols: pd.DataFrame( 2947 {col: value for col, value in zip(col_names, cols)}), 2948 [col._expr for col in aggregated_cols], 2949 requires_partition_by=partitionings.Singleton())) 2950 else: 2951 raise AssertionError("Unexpected proxy type for " 2952 f"DataFrame.aggregate!: proxy={proxy!r}, " 2953 f"type(proxy)={type(proxy)!r}") 2954 2955 agg = aggregate 2956 2957 applymap = frame_base._elementwise_method('applymap', base=pd.DataFrame) 2958 add_prefix = frame_base._elementwise_method('add_prefix', base=pd.DataFrame) 2959 add_suffix = frame_base._elementwise_method('add_suffix', base=pd.DataFrame) 2960 2961 memory_usage = frame_base.wont_implement_method( 2962 pd.DataFrame, 'memory_usage', reason="non-deferred-result") 2963 info = frame_base.wont_implement_method( 2964 pd.DataFrame, 'info', reason="non-deferred-result") 2965 2966 2967 @frame_base.with_docs_from(pd.DataFrame) 2968 @frame_base.args_to_kwargs(pd.DataFrame) 2969 @frame_base.populate_defaults(pd.DataFrame) 2970 @frame_base.maybe_inplace 2971 def clip(self, axis, **kwargs): 2972 """``lower`` and ``upper`` must be :class:`DeferredSeries` instances, or 2973 constants. Array-like arguments are not supported because they are 2974 order-sensitive.""" 2975 2976 if any(isinstance(kwargs.get(arg, None), frame_base.DeferredFrame) 2977 for arg in ('upper', 'lower')) and axis not in (0, 'index'): 2978 raise frame_base.WontImplementError( 2979 "axis must be 'index' when upper and/or lower are a DeferredFrame", 2980 reason='order-sensitive') 2981 2982 return frame_base._elementwise_method('clip', base=pd.DataFrame)(self, 2983 axis=axis, 2984 **kwargs) 2985 2986 @frame_base.with_docs_from(pd.DataFrame) 2987 @frame_base.args_to_kwargs(pd.DataFrame) 2988 @frame_base.populate_defaults(pd.DataFrame) 2989 def corr(self, method, min_periods): 2990 """Only ``method="pearson"`` can be parallelized. Other methods require 2991 collecting all data on a single worker (see 2992 https://s.apache.org/dataframe-non-parallel-operations for details). 2993 """ 2994 if method == 'pearson': 2995 proxy = self._expr.proxy().corr() 2996 columns = list(proxy.columns) 2997 args = [] 2998 arg_indices = [] 2999 for col1, col2 in itertools.combinations(columns, 2): 3000 arg_indices.append((col1, col2)) 3001 args.append(self[col1].corr(self[col2], method=method, 3002 min_periods=min_periods)) 3003 def fill_matrix(*args): 3004 data = collections.defaultdict(dict) 3005 for col in columns: 3006 data[col][col] = 1.0 3007 for ix, (col1, col2) in enumerate(arg_indices): 3008 data[col1][col2] = data[col2][col1] = args[ix] 3009 return pd.DataFrame(data, columns=columns, index=columns) 3010 with expressions.allow_non_parallel_operations(True): 3011 return frame_base.DeferredFrame.wrap( 3012 expressions.ComputedExpression( 3013 'fill_matrix', 3014 fill_matrix, 3015 [arg._expr for arg in args], 3016 requires_partition_by=partitionings.Singleton(), 3017 proxy=proxy)) 3018 3019 else: 3020 reason = (f"Encountered corr(method={method!r}) which cannot be " 3021 "parallelized. Only corr(method='pearson') is currently " 3022 "parallelizable.") 3023 return frame_base.DeferredFrame.wrap( 3024 expressions.ComputedExpression( 3025 'corr', 3026 lambda df: df.corr(method=method, min_periods=min_periods), 3027 [self._expr], 3028 requires_partition_by=partitionings.Singleton(reason=reason))) 3029 3030 @frame_base.with_docs_from(pd.DataFrame) 3031 @frame_base.args_to_kwargs(pd.DataFrame) 3032 @frame_base.populate_defaults(pd.DataFrame) 3033 def cov(self, min_periods, ddof): 3034 proxy = self._expr.proxy().corr() 3035 columns = list(proxy.columns) 3036 args = [] 3037 arg_indices = [] 3038 for col in columns: 3039 arg_indices.append((col, col)) 3040 std = self[col].std(ddof) 3041 args.append(std.apply(lambda x: x*x, 'square')) 3042 for ix, col1 in enumerate(columns): 3043 for col2 in columns[ix+1:]: 3044 arg_indices.append((col1, col2)) 3045 # Note that this set may be different for each pair. 3046 no_na = self.loc[self[col1].notna() & self[col2].notna()] 3047 args.append(no_na[col1]._cov_aligned(no_na[col2], min_periods, ddof)) 3048 def fill_matrix(*args): 3049 data = collections.defaultdict(dict) 3050 for ix, (col1, col2) in enumerate(arg_indices): 3051 data[col1][col2] = data[col2][col1] = args[ix] 3052 return pd.DataFrame(data, columns=columns, index=columns) 3053 with expressions.allow_non_parallel_operations(True): 3054 return frame_base.DeferredFrame.wrap( 3055 expressions.ComputedExpression( 3056 'fill_matrix', 3057 fill_matrix, 3058 [arg._expr for arg in args], 3059 requires_partition_by=partitionings.Singleton(), 3060 proxy=proxy)) 3061 3062 @frame_base.with_docs_from(pd.DataFrame) 3063 @frame_base.args_to_kwargs(pd.DataFrame) 3064 @frame_base.populate_defaults(pd.DataFrame) 3065 def corrwith(self, other, axis, drop, method): 3066 if axis in (1, 'columns'): 3067 return self._elementwise( 3068 lambda df, other: df.corrwith(other, axis=axis, drop=drop, 3069 method=method), 3070 'corrwith', 3071 other_args=(other,)) 3072 3073 3074 if not isinstance(other, frame_base.DeferredFrame): 3075 other = frame_base.DeferredFrame.wrap( 3076 expressions.ConstantExpression(other)) 3077 3078 if isinstance(other, DeferredSeries): 3079 proxy = self._expr.proxy().corrwith(other._expr.proxy(), axis=axis, 3080 drop=drop, method=method) 3081 self, other = self.align(other, axis=0, join='inner') 3082 col_names = proxy.index 3083 other_cols = [other] * len(col_names) 3084 elif isinstance(other, DeferredDataFrame): 3085 proxy = self._expr.proxy().corrwith( 3086 other._expr.proxy(), axis=axis, method=method, drop=drop) 3087 self, other = self.align(other, axis=0, join='inner') 3088 col_names = list( 3089 set(self.columns) 3090 .intersection(other.columns) 3091 .intersection(proxy.index)) 3092 other_cols = [other[col_name] for col_name in col_names] 3093 else: 3094 # Raise the right error. 3095 self._expr.proxy().corrwith(other._expr.proxy(), axis=axis, drop=drop, 3096 method=method) 3097 3098 # Just in case something else becomes valid. 3099 raise NotImplementedError('corrwith(%s)' % type(other._expr.proxy)) 3100 3101 # Generate expressions to compute the actual correlations. 3102 corrs = [ 3103 self[col_name].corr(other_col, method) 3104 for col_name, other_col in zip(col_names, other_cols)] 3105 3106 # Combine the results 3107 def fill_dataframe(*args): 3108 result = proxy.copy(deep=True) 3109 for col, value in zip(proxy.index, args): 3110 result[col] = value 3111 return result 3112 with expressions.allow_non_parallel_operations(True): 3113 return frame_base.DeferredFrame.wrap( 3114 expressions.ComputedExpression( 3115 'fill_dataframe', 3116 fill_dataframe, 3117 [corr._expr for corr in corrs], 3118 requires_partition_by=partitionings.Singleton(), 3119 proxy=proxy)) 3120 3121 cummax = frame_base.wont_implement_method(pd.DataFrame, 'cummax', 3122 reason='order-sensitive') 3123 cummin = frame_base.wont_implement_method(pd.DataFrame, 'cummin', 3124 reason='order-sensitive') 3125 cumprod = frame_base.wont_implement_method(pd.DataFrame, 'cumprod', 3126 reason='order-sensitive') 3127 cumsum = frame_base.wont_implement_method(pd.DataFrame, 'cumsum', 3128 reason='order-sensitive') 3129 # TODO(BEAM-12071): Consider adding an order-insensitive implementation for 3130 # diff that relies on the index 3131 diff = frame_base.wont_implement_method(pd.DataFrame, 'diff', 3132 reason='order-sensitive') 3133 interpolate = frame_base.wont_implement_method(pd.DataFrame, 'interpolate', 3134 reason='order-sensitive') 3135 3136 pct_change = frame_base.wont_implement_method( 3137 pd.DataFrame, 'pct_change', reason='order-sensitive') 3138 asof = frame_base.wont_implement_method( 3139 pd.DataFrame, 'asof', reason='order-sensitive') 3140 first_valid_index = frame_base.wont_implement_method( 3141 pd.DataFrame, 'first_valid_index', reason='order-sensitive') 3142 last_valid_index = frame_base.wont_implement_method( 3143 pd.DataFrame, 'last_valid_index', reason='order-sensitive') 3144 iat = property(frame_base.wont_implement_method( 3145 pd.DataFrame, 'iat', reason='order-sensitive')) 3146 3147 lookup = frame_base.wont_implement_method( 3148 pd.DataFrame, 'lookup', reason='deprecated') 3149 3150 head = frame_base.wont_implement_method(pd.DataFrame, 'head', 3151 explanation=_PEEK_METHOD_EXPLANATION) 3152 tail = frame_base.wont_implement_method(pd.DataFrame, 'tail', 3153 explanation=_PEEK_METHOD_EXPLANATION) 3154 3155 @frame_base.with_docs_from(pd.DataFrame) 3156 @frame_base.args_to_kwargs(pd.DataFrame) 3157 @frame_base.populate_defaults(pd.DataFrame) 3158 def sample(self, n, frac, replace, weights, random_state, axis): 3159 """When ``axis='index'``, only ``n`` and/or ``weights`` may be specified. 3160 ``frac``, ``random_state``, and ``replace=True`` are not yet supported. 3161 See `Issue 21010 <https://github.com/apache/beam/issues/21010>`_. 3162 3163 Note that pandas will raise an error if ``n`` is larger than the length 3164 of the dataset, while the Beam DataFrame API will simply return the full 3165 dataset in that case. 3166 3167 sample is fully supported for axis='columns'.""" 3168 if axis in (1, 'columns'): 3169 # Sampling on axis=columns just means projecting random columns 3170 # Eagerly generate proxy to determine the set of columns at construction 3171 # time 3172 proxy = self._expr.proxy().sample(n=n, frac=frac, replace=replace, 3173 weights=weights, 3174 random_state=random_state, axis=axis) 3175 # Then do the projection 3176 return self[list(proxy.columns)] 3177 3178 # axis='index' 3179 if frac is not None or random_state is not None or replace: 3180 raise NotImplementedError( 3181 f"When axis={axis!r}, only n and/or weights may be specified. " 3182 "frac, random_state, and replace=True are not yet supported " 3183 f"(got frac={frac!r}, random_state={random_state!r}, " 3184 f"replace={replace!r}). See " 3185 "https://github.com/apache/beam/issues/21010.") 3186 3187 if n is None: 3188 n = 1 3189 3190 if isinstance(weights, str): 3191 weights = self[weights] 3192 3193 tmp_weight_column_name = "___Beam_DataFrame_weights___" 3194 3195 if weights is None: 3196 self_with_randomized_weights = frame_base.DeferredFrame.wrap( 3197 expressions.ComputedExpression( 3198 'randomized_weights', 3199 lambda df: df.assign(**{tmp_weight_column_name: 3200 np.random.rand(len(df))}), 3201 [self._expr], 3202 requires_partition_by=partitionings.Arbitrary(), 3203 preserves_partition_by=partitionings.Arbitrary())) 3204 else: 3205 # See "Fast Parallel Weighted Random Sampling" by Efraimidis and Spirakis 3206 # https://www.cti.gr/images_gr/reports/99-06-02.ps 3207 def assign_randomized_weights(df, weights): 3208 non_zero_weights = (weights > 0) | pd.Series(dtype=bool, index=df.index) 3209 df = df.loc[non_zero_weights] 3210 weights = weights.loc[non_zero_weights] 3211 random_weights = np.log(np.random.rand(len(weights))) / weights 3212 return df.assign(**{tmp_weight_column_name: random_weights}) 3213 self_with_randomized_weights = frame_base.DeferredFrame.wrap( 3214 expressions.ComputedExpression( 3215 'randomized_weights', 3216 assign_randomized_weights, 3217 [self._expr, weights._expr], 3218 requires_partition_by=partitionings.Index(), 3219 preserves_partition_by=partitionings.Arbitrary())) 3220 3221 return self_with_randomized_weights.nlargest( 3222 n=n, columns=tmp_weight_column_name, keep='any').drop( 3223 tmp_weight_column_name, axis=1) 3224 3225 @frame_base.with_docs_from(pd.DataFrame) 3226 def dot(self, other): 3227 # We want to broadcast the right hand side to all partitions of the left. 3228 # This is OK, as its index must be the same size as the columns set of self, 3229 # so cannot be too large. 3230 class AsScalar(object): 3231 def __init__(self, value): 3232 self.value = value 3233 3234 if isinstance(other, frame_base.DeferredFrame): 3235 proxy = other._expr.proxy() 3236 with expressions.allow_non_parallel_operations(): 3237 side = expressions.ComputedExpression( 3238 'as_scalar', 3239 lambda df: AsScalar(df), 3240 [other._expr], 3241 requires_partition_by=partitionings.Singleton()) 3242 else: 3243 proxy = pd.DataFrame(columns=range(len(other[0]))) 3244 side = expressions.ConstantExpression(AsScalar(other)) 3245 3246 return frame_base.DeferredFrame.wrap( 3247 expressions.ComputedExpression( 3248 'dot', 3249 lambda left, right: left @ right.value, 3250 [self._expr, side], 3251 requires_partition_by=partitionings.Arbitrary(), 3252 preserves_partition_by=partitionings.Arbitrary(), 3253 proxy=proxy)) 3254 3255 __matmul__ = dot 3256 3257 @frame_base.with_docs_from(pd.DataFrame) 3258 def mode(self, axis=0, *args, **kwargs): 3259 """mode with axis="columns" is not implemented because it produces 3260 non-deferred columns. 3261 3262 mode with axis="index" is not currently parallelizable. An approximate, 3263 parallelizable implementation of mode may be added in the future 3264 (`Issue 20946 <https://github.com/apache/beam/issues/20946>`_).""" 3265 3266 if axis == 1 or axis == 'columns': 3267 # Number of columns is max(number mode values for each row), so we can't 3268 # determine how many there will be before looking at the data. 3269 raise frame_base.WontImplementError( 3270 "mode(axis=columns) is not supported because it produces a variable " 3271 "number of columns depending on the data.", 3272 reason="non-deferred-columns") 3273 return frame_base.DeferredFrame.wrap( 3274 expressions.ComputedExpression( 3275 'mode', 3276 lambda df: df.mode(*args, **kwargs), 3277 [self._expr], 3278 #TODO(https://github.com/apache/beam/issues/20946): 3279 # Can we add an approximate implementation? 3280 requires_partition_by=partitionings.Singleton(reason=( 3281 "mode(axis='index') cannot currently be parallelized. See " 3282 "https://github.com/apache/beam/issues/20946 tracking the " 3283 "possble addition of an approximate, parallelizable " 3284 "implementation of mode." 3285 )), 3286 preserves_partition_by=partitionings.Singleton())) 3287 3288 @frame_base.with_docs_from(pd.DataFrame) 3289 @frame_base.args_to_kwargs(pd.DataFrame) 3290 @frame_base.populate_defaults(pd.DataFrame) 3291 @frame_base.maybe_inplace 3292 def dropna(self, axis, **kwargs): 3293 """dropna with axis="columns" specified cannot be parallelized.""" 3294 # TODO(robertwb): This is a common pattern. Generalize? 3295 if axis in (1, 'columns'): 3296 requires_partition_by = partitionings.Singleton(reason=( 3297 "dropna(axis=1) cannot currently be parallelized. It requires " 3298 "checking all values in each column for NaN values, to determine " 3299 "if that column should be dropped." 3300 )) 3301 else: 3302 requires_partition_by = partitionings.Arbitrary() 3303 return frame_base.DeferredFrame.wrap( 3304 expressions.ComputedExpression( 3305 'dropna', 3306 lambda df: df.dropna(axis=axis, **kwargs), 3307 [self._expr], 3308 preserves_partition_by=partitionings.Arbitrary(), 3309 requires_partition_by=requires_partition_by)) 3310 3311 def _eval_or_query(self, name, expr, inplace, **kwargs): 3312 for key in ('local_dict', 'global_dict', 'level', 'target', 'resolvers'): 3313 if key in kwargs: 3314 raise NotImplementedError(f"Setting '{key}' is not yet supported") 3315 3316 # look for '@<py identifier>' 3317 if re.search(r'\@[^\d\W]\w*', expr, re.UNICODE): 3318 raise NotImplementedError("Accessing locals with @ is not yet supported " 3319 "(https://github.com/apache/beam/issues/20626)" 3320 ) 3321 3322 result_expr = expressions.ComputedExpression( 3323 name, 3324 lambda df: getattr(df, name)(expr, **kwargs), 3325 [self._expr], 3326 requires_partition_by=partitionings.Arbitrary(), 3327 preserves_partition_by=partitionings.Arbitrary()) 3328 3329 if inplace: 3330 self._expr = result_expr 3331 else: 3332 return frame_base.DeferredFrame.wrap(result_expr) 3333 3334 3335 @frame_base.with_docs_from(pd.DataFrame) 3336 @frame_base.args_to_kwargs(pd.DataFrame) 3337 @frame_base.populate_defaults(pd.DataFrame) 3338 def eval(self, expr, inplace, **kwargs): 3339 """Accessing local variables with ``@<varname>`` is not yet supported 3340 (`Issue 20626 <https://github.com/apache/beam/issues/20626>`_). 3341 3342 Arguments ``local_dict``, ``global_dict``, ``level``, ``target``, and 3343 ``resolvers`` are not yet supported.""" 3344 return self._eval_or_query('eval', expr, inplace, **kwargs) 3345 3346 @frame_base.with_docs_from(pd.DataFrame) 3347 @frame_base.args_to_kwargs(pd.DataFrame) 3348 @frame_base.populate_defaults(pd.DataFrame) 3349 def query(self, expr, inplace, **kwargs): 3350 """Accessing local variables with ``@<varname>`` is not yet supported 3351 (`Issue 20626 <https://github.com/apache/beam/issues/20626>`_). 3352 3353 Arguments ``local_dict``, ``global_dict``, ``level``, ``target``, and 3354 ``resolvers`` are not yet supported.""" 3355 return self._eval_or_query('query', expr, inplace, **kwargs) 3356 3357 isnull = isna = frame_base._elementwise_method('isna', base=pd.DataFrame) 3358 notnull = notna = frame_base._elementwise_method('notna', base=pd.DataFrame) 3359 3360 items = frame_base.wont_implement_method(pd.DataFrame, 'items', 3361 reason="non-deferred-result") 3362 itertuples = frame_base.wont_implement_method(pd.DataFrame, 'itertuples', 3363 reason="non-deferred-result") 3364 iterrows = frame_base.wont_implement_method(pd.DataFrame, 'iterrows', 3365 reason="non-deferred-result") 3366 iteritems = frame_base.wont_implement_method(pd.DataFrame, 'iteritems', 3367 reason="non-deferred-result") 3368 3369 def _cols_as_temporary_index(self, cols, suffix=''): 3370 original_index_names = list(self._expr.proxy().index.names) 3371 new_index_names = [ 3372 '__apache_beam_temp_%d_%s' % (ix, suffix) 3373 for (ix, _) in enumerate(original_index_names)] 3374 def reindex(df): 3375 return frame_base.DeferredFrame.wrap( 3376 expressions.ComputedExpression( 3377 'reindex', 3378 lambda df: 3379 df.rename_axis(index=new_index_names, copy=False) 3380 .reset_index().set_index(cols), 3381 [df._expr], 3382 preserves_partition_by=partitionings.Singleton(), 3383 requires_partition_by=partitionings.Arbitrary())) 3384 def revert(df): 3385 return frame_base.DeferredFrame.wrap( 3386 expressions.ComputedExpression( 3387 'join_restoreindex', 3388 lambda df: 3389 df.reset_index().set_index(new_index_names) 3390 .rename_axis(index=original_index_names, copy=False), 3391 [df._expr], 3392 preserves_partition_by=partitionings.Singleton(), 3393 requires_partition_by=partitionings.Arbitrary())) 3394 return reindex, revert 3395 3396 @frame_base.with_docs_from(pd.DataFrame) 3397 @frame_base.args_to_kwargs(pd.DataFrame) 3398 @frame_base.populate_defaults(pd.DataFrame) 3399 def join(self, other, on, **kwargs): 3400 if on is not None: 3401 reindex, revert = self._cols_as_temporary_index(on) 3402 return revert(reindex(self).join(other, **kwargs)) 3403 if isinstance(other, list): 3404 other_is_list = True 3405 else: 3406 other = [other] 3407 other_is_list = False 3408 placeholder = object() 3409 other_exprs = [ 3410 df._expr for df in other if isinstance(df, frame_base.DeferredFrame)] 3411 const_others = [ 3412 placeholder if isinstance(df, frame_base.DeferredFrame) else df 3413 for df in other] 3414 def fill_placeholders(values): 3415 values = iter(values) 3416 filled = [ 3417 next(values) if df is placeholder else df for df in const_others] 3418 if other_is_list: 3419 return filled 3420 else: 3421 return filled[0] 3422 return frame_base.DeferredFrame.wrap( 3423 expressions.ComputedExpression( 3424 'join', 3425 lambda df, *deferred_others: df.join( 3426 fill_placeholders(deferred_others), **kwargs), 3427 [self._expr] + other_exprs, 3428 preserves_partition_by=partitionings.Arbitrary(), 3429 requires_partition_by=partitionings.Index())) 3430 3431 @frame_base.with_docs_from(pd.DataFrame) 3432 @frame_base.args_to_kwargs(pd.DataFrame) 3433 @frame_base.populate_defaults(pd.DataFrame) 3434 def merge( 3435 self, 3436 right, 3437 on, 3438 left_on, 3439 right_on, 3440 left_index, 3441 right_index, 3442 suffixes, 3443 **kwargs): 3444 """merge is not parallelizable unless ``left_index`` or ``right_index`` is 3445 ``True`, because it requires generating an entirely new unique index. 3446 See notes on :meth:`DeferredDataFrame.reset_index`. It is recommended to 3447 move the join key for one of your columns to the index to avoid this issue. 3448 For an example see the enrich pipeline in 3449 :mod:`apache_beam.examples.dataframe.taxiride`. 3450 3451 ``how="cross"`` is not yet supported. 3452 """ 3453 self_proxy = self._expr.proxy() 3454 right_proxy = right._expr.proxy() 3455 # Validate with a pandas call. 3456 _ = self_proxy.merge( 3457 right_proxy, 3458 on=on, 3459 left_on=left_on, 3460 right_on=right_on, 3461 left_index=left_index, 3462 right_index=right_index, 3463 **kwargs) 3464 if kwargs.get('how', None) == 'cross': 3465 raise NotImplementedError( 3466 "cross join is not yet implemented " 3467 "(https://github.com/apache/beam/issues/20318)") 3468 if not any([on, left_on, right_on, left_index, right_index]): 3469 on = [col for col in self_proxy.columns if col in right_proxy.columns] 3470 if not left_on: 3471 left_on = on 3472 if left_on and not isinstance(left_on, list): 3473 left_on = [left_on] 3474 if not right_on: 3475 right_on = on 3476 if right_on and not isinstance(right_on, list): 3477 right_on = [right_on] 3478 3479 if left_index: 3480 indexed_left = self 3481 else: 3482 indexed_left = self.set_index(left_on, drop=False) 3483 3484 if right_index: 3485 indexed_right = right 3486 else: 3487 indexed_right = right.set_index(right_on, drop=False) 3488 3489 if left_on and right_on: 3490 common_cols = set(left_on).intersection(right_on) 3491 if len(common_cols): 3492 # When merging on the same column name from both dfs, we need to make 3493 # sure only one df has the column. Otherwise we end up with 3494 # two duplicate columns, one with lsuffix and one with rsuffix. 3495 # It's safe to drop from either because the data has already been duped 3496 # to the index. 3497 indexed_right = indexed_right.drop(columns=common_cols) 3498 3499 3500 merged = frame_base.DeferredFrame.wrap( 3501 expressions.ComputedExpression( 3502 'merge', 3503 lambda left, right: left.merge(right, 3504 left_index=True, 3505 right_index=True, 3506 suffixes=suffixes, 3507 **kwargs), 3508 [indexed_left._expr, indexed_right._expr], 3509 preserves_partition_by=partitionings.Arbitrary(), 3510 requires_partition_by=partitionings.Index())) 3511 3512 if left_index or right_index: 3513 return merged 3514 else: 3515 return merged.reset_index(drop=True) 3516 3517 @frame_base.with_docs_from(pd.DataFrame) 3518 @frame_base.args_to_kwargs(pd.DataFrame) 3519 @frame_base.populate_defaults(pd.DataFrame) 3520 def nlargest(self, keep, **kwargs): 3521 """Only ``keep=False`` and ``keep="any"`` are supported. Other values of 3522 ``keep`` make this an order-sensitive operation. Note ``keep="any"`` is 3523 a Beam-specific option that guarantees only one duplicate will be kept, but 3524 unlike ``"first"`` and ``"last"`` it makes no guarantees about _which_ 3525 duplicate element is kept.""" 3526 if keep == 'any': 3527 keep = 'first' 3528 elif keep != 'all': 3529 raise frame_base.WontImplementError( 3530 f"nlargest(keep={keep!r}) is not supported because it is " 3531 "order sensitive. Only keep=\"all\" is supported.", 3532 reason="order-sensitive") 3533 kwargs['keep'] = keep 3534 per_partition = expressions.ComputedExpression( 3535 'nlargest-per-partition', 3536 lambda df: df.nlargest(**kwargs), 3537 [self._expr], 3538 preserves_partition_by=partitionings.Arbitrary(), 3539 requires_partition_by=partitionings.Arbitrary()) 3540 with expressions.allow_non_parallel_operations(True): 3541 return frame_base.DeferredFrame.wrap( 3542 expressions.ComputedExpression( 3543 'nlargest', 3544 lambda df: df.nlargest(**kwargs), 3545 [per_partition], 3546 preserves_partition_by=partitionings.Singleton(), 3547 requires_partition_by=partitionings.Singleton())) 3548 3549 @frame_base.with_docs_from(pd.DataFrame) 3550 @frame_base.args_to_kwargs(pd.DataFrame) 3551 @frame_base.populate_defaults(pd.DataFrame) 3552 def nsmallest(self, keep, **kwargs): 3553 """Only ``keep=False`` and ``keep="any"`` are supported. Other values of 3554 ``keep`` make this an order-sensitive operation. Note ``keep="any"`` is 3555 a Beam-specific option that guarantees only one duplicate will be kept, but 3556 unlike ``"first"`` and ``"last"`` it makes no guarantees about _which_ 3557 duplicate element is kept.""" 3558 if keep == 'any': 3559 keep = 'first' 3560 elif keep != 'all': 3561 raise frame_base.WontImplementError( 3562 f"nsmallest(keep={keep!r}) is not supported because it is " 3563 "order sensitive. Only keep=\"all\" is supported.", 3564 reason="order-sensitive") 3565 kwargs['keep'] = keep 3566 per_partition = expressions.ComputedExpression( 3567 'nsmallest-per-partition', 3568 lambda df: df.nsmallest(**kwargs), 3569 [self._expr], 3570 preserves_partition_by=partitionings.Arbitrary(), 3571 requires_partition_by=partitionings.Arbitrary()) 3572 with expressions.allow_non_parallel_operations(True): 3573 return frame_base.DeferredFrame.wrap( 3574 expressions.ComputedExpression( 3575 'nsmallest', 3576 lambda df: df.nsmallest(**kwargs), 3577 [per_partition], 3578 preserves_partition_by=partitionings.Singleton(), 3579 requires_partition_by=partitionings.Singleton())) 3580 3581 plot = frame_base.wont_implement_method(pd.DataFrame, 'plot', 3582 reason="plotting-tools") 3583 3584 @frame_base.with_docs_from(pd.DataFrame) 3585 def pop(self, item): 3586 result = self[item] 3587 3588 self._expr = expressions.ComputedExpression( 3589 'popped', 3590 lambda df: df.drop(columns=[item]), 3591 [self._expr], 3592 preserves_partition_by=partitionings.Arbitrary(), 3593 requires_partition_by=partitionings.Arbitrary()) 3594 return result 3595 3596 @frame_base.with_docs_from(pd.DataFrame) 3597 @frame_base.args_to_kwargs(pd.DataFrame) 3598 @frame_base.populate_defaults(pd.DataFrame) 3599 def quantile(self, q, axis, **kwargs): 3600 """``quantile(axis="index")`` is not parallelizable. See 3601 `Issue 20933 <https://github.com/apache/beam/issues/20933>`_ tracking 3602 the possible addition of an approximate, parallelizable implementation of 3603 quantile. 3604 3605 When using quantile with ``axis="columns"`` only a single ``q`` value can be 3606 specified.""" 3607 if axis in (1, 'columns'): 3608 if isinstance(q, list): 3609 raise frame_base.WontImplementError( 3610 "quantile(axis=columns) with multiple q values is not supported " 3611 "because it transposes the input DataFrame. Note computing " 3612 "an individual quantile across columns (e.g. " 3613 f"df.quantile(q={q[0]!r}, axis={axis!r}) is supported.", 3614 reason="non-deferred-columns") 3615 else: 3616 requires = partitionings.Arbitrary() 3617 else: # axis='index' 3618 # TODO(https://github.com/apache/beam/issues/20933): Provide an option 3619 # for approximate distributed quantiles 3620 requires = partitionings.Singleton(reason=( 3621 "Computing quantiles across index cannot currently be parallelized. " 3622 "See https://github.com/apache/beam/issues/20933 tracking the " 3623 "possible addition of an approximate, parallelizable implementation " 3624 "of quantile." 3625 )) 3626 3627 return frame_base.DeferredFrame.wrap( 3628 expressions.ComputedExpression( 3629 'quantile', 3630 lambda df: df.quantile(q=q, axis=axis, **kwargs), 3631 [self._expr], 3632 requires_partition_by=requires, 3633 preserves_partition_by=partitionings.Singleton())) 3634 3635 @frame_base.with_docs_from(pd.DataFrame) 3636 @frame_base.args_to_kwargs(pd.DataFrame) 3637 @frame_base.maybe_inplace 3638 def rename(self, **kwargs): 3639 """rename is not parallelizable when ``axis="index"`` and 3640 ``errors="raise"``. It requires collecting all data on a single 3641 node in order to detect if one of the index values is missing.""" 3642 rename_index = ( 3643 'index' in kwargs 3644 or kwargs.get('axis', None) in (0, 'index') 3645 or ('columns' not in kwargs and 'axis' not in kwargs)) 3646 rename_columns = ( 3647 'columns' in kwargs 3648 or kwargs.get('axis', None) in (1, 'columns')) 3649 3650 if rename_index: 3651 # Technically, it's still partitioned by index, but it's no longer 3652 # partitioned by the hash of the index. 3653 preserves_partition_by = partitionings.Singleton() 3654 else: 3655 preserves_partition_by = partitionings.Index() 3656 3657 if kwargs.get('errors', None) == 'raise' and rename_index: 3658 # TODO: We could do this in parallel by creating a ConstantExpression 3659 # with a series created from the mapper dict. Then Index() partitioning 3660 # would co-locate the necessary index values and we could raise 3661 # individually within each partition. Execution time errors are 3662 # discouraged anyway so probably not worth the effort. 3663 requires_partition_by = partitionings.Singleton(reason=( 3664 "rename(errors='raise', axis='index') requires collecting all " 3665 "data on a single node in order to detect missing index values." 3666 )) 3667 else: 3668 requires_partition_by = partitionings.Arbitrary() 3669 3670 proxy = None 3671 if rename_index: 3672 # The proxy can't be computed by executing rename, it will error 3673 # renaming the index. 3674 if rename_columns: 3675 # Note if both are being renamed, index and columns must be specified 3676 # (not axis) 3677 proxy = self._expr.proxy().rename(**{k: v for (k, v) in kwargs.items() 3678 if not k == 'index'}) 3679 else: 3680 # No change in columns, reuse proxy 3681 proxy = self._expr.proxy() 3682 3683 return frame_base.DeferredFrame.wrap( 3684 expressions.ComputedExpression( 3685 'rename', 3686 lambda df: df.rename(**kwargs), 3687 [self._expr], 3688 proxy=proxy, 3689 preserves_partition_by=preserves_partition_by, 3690 requires_partition_by=requires_partition_by)) 3691 3692 rename_axis = frame_base._elementwise_method('rename_axis', base=pd.DataFrame) 3693 3694 @frame_base.with_docs_from(pd.DataFrame) 3695 @frame_base.args_to_kwargs(pd.DataFrame) 3696 @frame_base.populate_defaults(pd.DataFrame) 3697 def round(self, decimals, *args, **kwargs): 3698 3699 if isinstance(decimals, frame_base.DeferredFrame): 3700 # Disallow passing a deferred Series in, our current partitioning model 3701 # prevents us from using it correctly. 3702 raise NotImplementedError("Passing a deferred series to round() is not " 3703 "supported, please use a concrete pd.Series " 3704 "instance or a dictionary") 3705 3706 return frame_base.DeferredFrame.wrap( 3707 expressions.ComputedExpression( 3708 'round', 3709 lambda df: df.round(decimals, *args, **kwargs), 3710 [self._expr], 3711 requires_partition_by=partitionings.Arbitrary(), 3712 preserves_partition_by=partitionings.Index() 3713 ) 3714 ) 3715 3716 select_dtypes = frame_base._elementwise_method('select_dtypes', 3717 base=pd.DataFrame) 3718 3719 @frame_base.with_docs_from(pd.DataFrame) 3720 @frame_base.args_to_kwargs(pd.DataFrame) 3721 @frame_base.populate_defaults(pd.DataFrame) 3722 def shift(self, axis, freq, **kwargs): 3723 """shift with ``axis="index" is only supported with ``freq`` specified and 3724 ``fill_value`` undefined. Other configurations make this operation 3725 order-sensitive.""" 3726 if axis in (1, 'columns'): 3727 preserves = partitionings.Arbitrary() 3728 proxy = None 3729 else: 3730 if freq is None or 'fill_value' in kwargs: 3731 fill_value = kwargs.get('fill_value', 'NOT SET') 3732 raise frame_base.WontImplementError( 3733 f"shift(axis={axis!r}) is only supported with freq defined, and " 3734 f"fill_value undefined (got freq={freq!r}," 3735 f"fill_value={fill_value!r}). Other configurations are sensitive " 3736 "to the order of the data because they require populating shifted " 3737 "rows with `fill_value`.", 3738 reason="order-sensitive") 3739 # proxy generation fails in pandas <1.2 3740 # Seems due to https://github.com/pandas-dev/pandas/issues/14811, 3741 # bug with shift on empty indexes. 3742 # Fortunately the proxy should be identical to the input. 3743 proxy = self._expr.proxy().copy() 3744 3745 3746 # index is modified, so no partitioning is preserved. 3747 preserves = partitionings.Singleton() 3748 3749 return frame_base.DeferredFrame.wrap( 3750 expressions.ComputedExpression( 3751 'shift', 3752 lambda df: df.shift(axis=axis, freq=freq, **kwargs), 3753 [self._expr], 3754 proxy=proxy, 3755 preserves_partition_by=preserves, 3756 requires_partition_by=partitionings.Arbitrary())) 3757 3758 3759 shape = property(frame_base.wont_implement_method( 3760 pd.DataFrame, 'shape', reason="non-deferred-result")) 3761 3762 stack = frame_base._proxy_method( 3763 'stack', 3764 base=pd.DataFrame, 3765 requires_partition_by=partitionings.Arbitrary(), 3766 preserves_partition_by=partitionings.Singleton()) 3767 3768 all = _agg_method(pd.DataFrame, 'all') 3769 any = _agg_method(pd.DataFrame, 'any') 3770 count = _agg_method(pd.DataFrame, 'count') 3771 describe = _agg_method(pd.DataFrame, 'describe') 3772 max = _agg_method(pd.DataFrame, 'max') 3773 min = _agg_method(pd.DataFrame, 'min') 3774 3775 @frame_base.with_docs_from(pd.DataFrame) 3776 @frame_base.args_to_kwargs(pd.DataFrame) 3777 @frame_base.populate_defaults(pd.DataFrame) 3778 def pivot(self, index=None, columns=None, values=None, **kwargs): 3779 """Because pivot is a non-deferred method, any columns specified in 3780 ``columns`` must be CategoricalDType so we can determine the output column 3781 names.""" 3782 3783 def verify_all_categorical(all_cols_are_categorical): 3784 if not all_cols_are_categorical: 3785 message = "pivot() of non-categorical type is not supported because " \ 3786 "the type of the output column depends on the data. Please use " \ 3787 "pd.CategoricalDtype with explicit categories." 3788 raise frame_base.WontImplementError( 3789 message, reason="non-deferred-columns") 3790 3791 # If values not provided, take all remaining columns of dataframe 3792 if not values: 3793 tmp = self._expr.proxy() 3794 if index: 3795 tmp = tmp.drop(index, axis=1) 3796 if columns: 3797 tmp = tmp.drop(columns, axis=1) 3798 values = tmp.columns.values 3799 3800 # Construct column index 3801 if is_list_like(columns) and len(columns) <= 1: 3802 columns = columns[0] 3803 selected_cols = self._expr.proxy()[columns] 3804 if isinstance(selected_cols, pd.Series): 3805 all_cols_are_categorical = isinstance( 3806 selected_cols.dtype, pd.CategoricalDtype 3807 ) 3808 verify_all_categorical(all_cols_are_categorical) 3809 3810 if is_list_like(values) and len(values) > 1: 3811 # If more than one value provided, don't create a None level 3812 values_in_col_index = values 3813 names = [None, columns] 3814 col_index = pd.MultiIndex.from_product( 3815 [values_in_col_index, 3816 selected_cols.dtypes.categories.astype('category')], 3817 names=names 3818 ) 3819 else: 3820 col_index = pd.CategoricalIndex( 3821 selected_cols.dtype.categories, 3822 name=columns 3823 ) 3824 else: 3825 all_cols_are_categorical = all( 3826 isinstance(c, pd.CategoricalDtype) for c in selected_cols.dtypes 3827 ) 3828 verify_all_categorical(all_cols_are_categorical) 3829 3830 if is_list_like(values) and len(values) > 1: 3831 # If more than one value provided, don't create a None level 3832 values_in_col_index = values 3833 names = [None, *columns] 3834 categories = [ 3835 c.categories.astype('category') for c in selected_cols.dtypes 3836 ] 3837 col_index = pd.MultiIndex.from_product( 3838 [values_in_col_index, *categories], 3839 names=names 3840 ) 3841 else: 3842 # If one value provided, don't create a None level 3843 names = columns 3844 categories = [ 3845 c.categories.astype('category') for c in selected_cols.dtypes 3846 ] 3847 col_index = pd.MultiIndex.from_product( 3848 categories, 3849 names=names 3850 ) 3851 3852 # Construct row index 3853 if index: 3854 if PD_VERSION < (1, 4) and is_list_like(index) and len(index) > 1: 3855 raise frame_base.WontImplementError( 3856 "pivot() is not supported when pandas<1.4 and index is a MultiIndex") 3857 per_partition = expressions.ComputedExpression( 3858 'pivot-per-partition', 3859 lambda df: df.set_index(keys=index), [self._expr], 3860 preserves_partition_by=partitionings.Singleton(), 3861 requires_partition_by=partitionings.Arbitrary() 3862 ) 3863 tmp = per_partition.proxy().pivot( 3864 columns=columns, values=values, **kwargs) 3865 row_index = tmp.index 3866 else: 3867 per_partition = self._expr 3868 row_index = self._expr.proxy().index 3869 if PD_VERSION < (1, 4) and isinstance(row_index, pd.MultiIndex): 3870 raise frame_base.WontImplementError( 3871 "pivot() is not supported when pandas<1.4 and index is a MultiIndex") 3872 3873 selected_values = self._expr.proxy()[values] 3874 if isinstance(selected_values, pd.Series): 3875 value_dtype = selected_values.dtype 3876 else: 3877 # Set dtype to object if more than one value 3878 dtypes = [d for d in selected_values.dtypes] 3879 value_dtype = object 3880 if any((is_int64_dtype(x) for x in dtypes)): 3881 value_dtype = np.int64 3882 if any((is_float_dtype(x) for x in dtypes)): 3883 value_dtype = np.float64 3884 if object in dtypes: 3885 value_dtype = object 3886 3887 # Construct proxy 3888 proxy = pd.DataFrame( 3889 columns=col_index, dtype=value_dtype, index=row_index 3890 ) 3891 3892 def pivot_helper(df): 3893 result = pd.concat( 3894 [proxy, df.pivot(columns=columns, values=values, **kwargs)] 3895 ) 3896 result.columns = col_index 3897 result = result.astype(value_dtype) 3898 return result 3899 3900 return frame_base.DeferredFrame.wrap( 3901 expressions.ComputedExpression( 3902 'pivot', 3903 pivot_helper, 3904 [per_partition], 3905 proxy=proxy, 3906 preserves_partition_by=partitionings.Index(), 3907 requires_partition_by=partitionings.Index())) 3908 3909 prod = product = _agg_method(pd.DataFrame, 'prod') 3910 sum = _agg_method(pd.DataFrame, 'sum') 3911 mean = _agg_method(pd.DataFrame, 'mean') 3912 median = _agg_method(pd.DataFrame, 'median') 3913 nunique = _agg_method(pd.DataFrame, 'nunique') 3914 std = _agg_method(pd.DataFrame, 'std') 3915 var = _agg_method(pd.DataFrame, 'var') 3916 sem = _agg_method(pd.DataFrame, 'sem') 3917 mad = _agg_method(pd.DataFrame, 'mad') 3918 skew = _agg_method(pd.DataFrame, 'skew') 3919 kurt = _agg_method(pd.DataFrame, 'kurt') 3920 kurtosis = _agg_method(pd.DataFrame, 'kurtosis') 3921 3922 take = frame_base.wont_implement_method(pd.DataFrame, 'take', 3923 reason='deprecated') 3924 3925 to_records = frame_base.wont_implement_method(pd.DataFrame, 'to_records', 3926 reason="non-deferred-result") 3927 to_dict = frame_base.wont_implement_method(pd.DataFrame, 'to_dict', 3928 reason="non-deferred-result") 3929 to_numpy = frame_base.wont_implement_method(pd.DataFrame, 'to_numpy', 3930 reason="non-deferred-result") 3931 to_string = frame_base.wont_implement_method(pd.DataFrame, 'to_string', 3932 reason="non-deferred-result") 3933 3934 to_sparse = frame_base.wont_implement_method(pd.DataFrame, 'to_sparse', 3935 reason="non-deferred-result") 3936 3937 transpose = frame_base.wont_implement_method( 3938 pd.DataFrame, 'transpose', reason='non-deferred-columns') 3939 T = property(frame_base.wont_implement_method( 3940 pd.DataFrame, 'T', reason='non-deferred-columns')) 3941 3942 update = frame_base._proxy_method( 3943 'update', 3944 inplace=True, 3945 base=pd.DataFrame, 3946 requires_partition_by=partitionings.Index(), 3947 preserves_partition_by=partitionings.Arbitrary()) 3948 3949 values = property(frame_base.wont_implement_method( 3950 pd.DataFrame, 'values', reason="non-deferred-result")) 3951 3952 style = property(frame_base.wont_implement_method( 3953 pd.DataFrame, 'style', reason="non-deferred-result")) 3954 3955 @frame_base.with_docs_from(pd.DataFrame) 3956 @frame_base.args_to_kwargs(pd.DataFrame) 3957 @frame_base.populate_defaults(pd.DataFrame) 3958 def melt(self, ignore_index, **kwargs): 3959 """``ignore_index=True`` is not supported, because it requires generating an 3960 order-sensitive index.""" 3961 if ignore_index: 3962 raise frame_base.WontImplementError( 3963 "melt(ignore_index=True) is order sensitive because it requires " 3964 "generating a new index based on the order of the data.", 3965 reason="order-sensitive") 3966 3967 return frame_base.DeferredFrame.wrap( 3968 expressions.ComputedExpression( 3969 'melt', 3970 lambda df: df.melt(ignore_index=False, **kwargs), [self._expr], 3971 requires_partition_by=partitionings.Arbitrary(), 3972 preserves_partition_by=partitionings.Singleton())) 3973 3974 if hasattr(pd.DataFrame, 'value_counts'): 3975 @frame_base.with_docs_from(pd.DataFrame) 3976 def value_counts(self, subset=None, sort=False, normalize=False, 3977 ascending=False, dropna=True): 3978 """``sort`` is ``False`` by default, and ``sort=True`` is not supported 3979 because it imposes an ordering on the dataset which likely will not be 3980 preserved.""" 3981 3982 if sort: 3983 raise frame_base.WontImplementError( 3984 "value_counts(sort=True) is not supported because it imposes an " 3985 "ordering on the dataset which likely will not be preserved.", 3986 reason="order-sensitive") 3987 columns = subset or list(self.columns) 3988 3989 if dropna: 3990 dropped = self.dropna() 3991 else: 3992 dropped = self 3993 3994 result = dropped.groupby(columns, dropna=dropna).size() 3995 3996 if normalize: 3997 return result/dropped.length() 3998 else: 3999 return result 4000 4001 if hasattr(pd.DataFrame, 'compare'): 4002 4003 @frame_base.with_docs_from(pd.DataFrame) 4004 @frame_base.args_to_kwargs(pd.DataFrame) 4005 @frame_base.populate_defaults(pd.DataFrame) 4006 def compare(self, other, align_axis, keep_shape, **kwargs): 4007 """The default values ``align_axis=1 and ``keep_shape=False`` 4008 are not supported, because the output columns depend on the data. 4009 To use ``align_axis=1``, please specify ``keep_shape=True``.""" 4010 4011 preserve_partition = None 4012 4013 if align_axis in (1, 'columns') and not keep_shape: 4014 raise frame_base.WontImplementError( 4015 f"compare(align_axis={align_axis!r}, keep_shape={keep_shape!r}) " 4016 "is not allowed because the output columns depend on the data, " 4017 "please specify keep_shape=True.", 4018 reason='non-deferred-columns' 4019 ) 4020 4021 if align_axis in (1, 'columns'): 4022 preserve_partition = partitionings.Arbitrary() 4023 elif align_axis in (0, 'index'): 4024 preserve_partition = partitionings.Singleton() 4025 else: 4026 raise ValueError( 4027 "align_axis must be one of ('index', 0, 'columns', 1). " 4028 f"got {align_axis!r}.") 4029 4030 4031 return frame_base.DeferredFrame.wrap( 4032 expressions.ComputedExpression( 4033 'compare', 4034 lambda df, other: df.compare(other, align_axis, keep_shape, **kwargs), 4035 [self._expr, other._expr], 4036 requires_partition_by=partitionings.Index(), 4037 preserves_partition_by=preserve_partition 4038 ) 4039 ) 4040 4041 def _idxmaxmin_helper(self, op, **kwargs): 4042 if op == 'idxmax': 4043 func = pd.DataFrame.idxmax 4044 elif op == 'idxmin': 4045 func = pd.DataFrame.idxmin 4046 else: 4047 raise ValueError("op must be one of ('idxmax', 'idxmin'). " 4048 f"got {op!r}.") 4049 4050 axis = kwargs.get('axis', 0) 4051 4052 index_dtype = self._expr.proxy().index.dtype 4053 columns_dtype = self._expr.proxy().columns.dtype 4054 4055 def compute_idx(df): 4056 indexes = func(df, **kwargs).unique() 4057 if pd.isna(indexes).any(): 4058 return df 4059 else: 4060 return df.loc[indexes] 4061 4062 if axis in ('index', 0): 4063 requires_partition = partitionings.Singleton() 4064 4065 proxy_index = pd.Index([], dtype=columns_dtype) 4066 proxy = pd.Series([], index=proxy_index, dtype=index_dtype) 4067 partition_proxy = self._expr.proxy().copy() 4068 4069 idx_per_partition = expressions.ComputedExpression( 4070 'idx-per-partition', 4071 compute_idx, [self._expr], 4072 proxy=partition_proxy, 4073 requires_partition_by=partitionings.Arbitrary(), 4074 preserves_partition_by=partitionings.Arbitrary() 4075 ) 4076 4077 elif axis in ('columns', 1): 4078 requires_partition = partitionings.Index() 4079 4080 proxy_index = pd.Index([], dtype=index_dtype) 4081 proxy = pd.Series([], index=proxy_index, dtype=columns_dtype) 4082 4083 idx_per_partition = self._expr 4084 4085 else: 4086 raise ValueError("axis must be one of ('index', 0, 'columns', 1). " 4087 f"got {axis!r}.") 4088 4089 with expressions.allow_non_parallel_operations(True): 4090 return frame_base.DeferredFrame.wrap( 4091 expressions.ComputedExpression( 4092 'idx', 4093 lambda df: func(df, **kwargs), [idx_per_partition], 4094 proxy=proxy, 4095 requires_partition_by=requires_partition, 4096 preserves_partition_by=partitionings.Singleton() 4097 ) 4098 ) 4099 4100 4101 @frame_base.with_docs_from(pd.DataFrame) 4102 @frame_base.args_to_kwargs(pd.DataFrame) 4103 @frame_base.populate_defaults(pd.DataFrame) 4104 def idxmin(self, **kwargs): 4105 return self._idxmaxmin_helper('idxmin', **kwargs) 4106 4107 @frame_base.with_docs_from(pd.DataFrame) 4108 @frame_base.args_to_kwargs(pd.DataFrame) 4109 @frame_base.populate_defaults(pd.DataFrame) 4110 def idxmax(self, **kwargs): 4111 return self._idxmaxmin_helper('idxmax', **kwargs) 4112 4113 4114 for io_func in dir(io): 4115 if io_func.startswith('to_'): 4116 setattr(DeferredDataFrame, io_func, getattr(io, io_func)) 4117 setattr(DeferredSeries, io_func, getattr(io, io_func)) 4118 4119 4120 for meth in ('filter', ): 4121 setattr(DeferredDataFrame, meth, 4122 frame_base._elementwise_method(meth, base=pd.DataFrame)) 4123 4124 4125 @populate_not_implemented(DataFrameGroupBy) 4126 class DeferredGroupBy(frame_base.DeferredFrame): 4127 def __init__(self, expr, kwargs, 4128 ungrouped: expressions.Expression[pd.core.generic.NDFrame], 4129 ungrouped_with_index: expressions.Expression[pd.core.generic.NDFrame], # pylint: disable=line-too-long 4130 grouping_columns, 4131 grouping_indexes, 4132 group_keys, 4133 projection=None): 4134 """This object represents the result of:: 4135 4136 ungrouped.groupby(level=[grouping_indexes + grouping_columns], 4137 **kwargs)[projection] 4138 4139 :param expr: An expression to compute a pandas GroupBy object. Convenient 4140 for unliftable aggregations. 4141 :param ungrouped: An expression to compute the DataFrame pre-grouping, the 4142 (Multi)Index contains only the grouping columns/indexes. 4143 :param ungrouped_with_index: Same as ungrouped, except the index includes 4144 all of the original indexes as well as any grouping columns. This is 4145 important for operations that expose the original index, e.g. .apply(), 4146 but we only use it when necessary to avoid unnessary data transfer and 4147 GBKs. 4148 :param grouping_columns: list of column labels that were in the original 4149 groupby(..) ``by`` parameter. Only relevant for grouped DataFrames. 4150 :param grouping_indexes: list of index names (or index level numbers) to be 4151 grouped. 4152 :param kwargs: Keywords args passed to the original groupby(..) call.""" 4153 super().__init__(expr) 4154 self._ungrouped = ungrouped 4155 self._ungrouped_with_index = ungrouped_with_index 4156 self._projection = projection 4157 self._grouping_columns = grouping_columns 4158 self._grouping_indexes = grouping_indexes 4159 self._group_keys = group_keys 4160 self._kwargs = kwargs 4161 4162 if (self._kwargs.get('dropna', True) is False and 4163 self._ungrouped.proxy().index.nlevels > 1): 4164 raise NotImplementedError( 4165 "dropna=False does not work as intended in the Beam DataFrame API " 4166 "when grouping on multiple columns or indexes (See " 4167 "https://github.com/apache/beam/issues/21014).") 4168 4169 def __getattr__(self, name): 4170 return DeferredGroupBy( 4171 expressions.ComputedExpression( 4172 'groupby_project', 4173 lambda gb: getattr(gb, name), [self._expr], 4174 requires_partition_by=partitionings.Arbitrary(), 4175 preserves_partition_by=partitionings.Arbitrary()), 4176 self._kwargs, 4177 self._ungrouped, 4178 self._ungrouped_with_index, 4179 self._grouping_columns, 4180 self._grouping_indexes, 4181 self._group_keys, 4182 projection=name) 4183 4184 def __getitem__(self, name): 4185 return DeferredGroupBy( 4186 expressions.ComputedExpression( 4187 'groupby_project', 4188 lambda gb: gb[name], [self._expr], 4189 requires_partition_by=partitionings.Arbitrary(), 4190 preserves_partition_by=partitionings.Arbitrary()), 4191 self._kwargs, 4192 self._ungrouped, 4193 self._ungrouped_with_index, 4194 self._grouping_columns, 4195 self._grouping_indexes, 4196 self._group_keys, 4197 projection=name) 4198 4199 @frame_base.with_docs_from(DataFrameGroupBy) 4200 def agg(self, fn, *args, **kwargs): 4201 if _is_associative(fn): 4202 return _liftable_agg(fn)(self, *args, **kwargs) 4203 elif _is_liftable_with_sum(fn): 4204 return _liftable_agg(fn, postagg_meth='sum')(self, *args, **kwargs) 4205 elif _is_unliftable(fn): 4206 return _unliftable_agg(fn)(self, *args, **kwargs) 4207 elif callable(fn): 4208 return DeferredDataFrame( 4209 expressions.ComputedExpression( 4210 'agg', 4211 lambda gb: gb.agg(fn, *args, **kwargs), [self._expr], 4212 requires_partition_by=partitionings.Index(), 4213 preserves_partition_by=partitionings.Singleton())) 4214 else: 4215 raise NotImplementedError(f"GroupBy.agg(func={fn!r})") 4216 4217 @property 4218 def ndim(self): 4219 return self._expr.proxy().ndim 4220 4221 @frame_base.with_docs_from(DataFrameGroupBy) 4222 def apply(self, func, *args, **kwargs): 4223 """Note that ``func`` will be called once during pipeline construction time 4224 with an empty pandas object, so take care if ``func`` has a side effect. 4225 4226 When called with an empty pandas object, ``func`` is expected to return an 4227 object of the same type as what will be returned when the pipeline is 4228 processing actual data. If the result is a pandas object it should have the 4229 same type and name (for a Series) or column types and names (for 4230 a DataFrame) as the actual results. 4231 4232 Note that in pandas, ``apply`` attempts to detect if the index is unmodified 4233 in ``func`` (indicating ``func`` is a transform) and drops the duplicate 4234 index in the output. To determine this, pandas tests the indexes for 4235 equality. However, Beam cannot do this since it is sensitive to the input 4236 data; instead this implementation tests if the indexes are equivalent 4237 with ``is``. See the `pandas 1.4.0 release notes 4238 <https://pandas.pydata.org/docs/dev/whatsnew/v1.4.0.html#groupby-apply-consistent-transform-detection>`_ 4239 for a good explanation of the distinction between these approaches. In 4240 practice, this just means that in some cases the Beam result will have 4241 a duplicate index, whereas pandas would have dropped it.""" 4242 4243 project = _maybe_project_func(self._projection) 4244 grouping_indexes = self._grouping_indexes 4245 grouping_columns = self._grouping_columns 4246 group_keys = self._group_keys 4247 4248 # Unfortunately pandas does not execute func to determine the right proxy. 4249 # We run user func on a proxy here to detect the return type and generate 4250 # the proxy. 4251 fn_input = project(self._ungrouped_with_index.proxy().reset_index( 4252 grouping_columns, drop=True)) 4253 result = func(fn_input) 4254 def index_to_arrays(index): 4255 return [index.get_level_values(level) 4256 for level in range(index.nlevels)] 4257 4258 4259 # By default do_apply will just call pandas apply() 4260 # We override it below if necessary 4261 do_apply = lambda gb: gb.apply(func, *args, **kwargs) 4262 4263 if (isinstance(result, pd.core.generic.NDFrame) and 4264 result.index is fn_input.index): 4265 # Special case where apply fn is a transform 4266 # Note we trust that if the user fn produces a proxy with the identical 4267 # index, it will produce results with identical indexes at execution 4268 # time too 4269 proxy = result 4270 elif isinstance(result, pd.DataFrame): 4271 # apply fn is not a transform, we need to make sure the original index 4272 # values are prepended to the result's index 4273 proxy = result[:0] 4274 4275 # First adjust proxy 4276 proxy.index = pd.MultiIndex.from_arrays( 4277 index_to_arrays(self._ungrouped.proxy().index) + 4278 index_to_arrays(proxy.index), 4279 names=self._ungrouped.proxy().index.names + proxy.index.names) 4280 4281 # Then override do_apply function 4282 new_index_names = self._ungrouped.proxy().index.names 4283 if len(new_index_names) > 1: 4284 def add_key_index(key, df): 4285 # df is a dataframe or Series representing the result of func for 4286 # a single key 4287 # key is a tuple with the MultiIndex values for this key 4288 df.index = pd.MultiIndex.from_arrays( 4289 [[key[i]] * len(df) for i in range(len(new_index_names))] + 4290 index_to_arrays(df.index), 4291 names=new_index_names + df.index.names) 4292 return df 4293 else: 4294 def add_key_index(key, df): 4295 # df is a dataframe or Series representing the result of func for 4296 # a single key 4297 df.index = pd.MultiIndex.from_arrays( 4298 [[key] * len(df)] + index_to_arrays(df.index), 4299 names=new_index_names + df.index.names) 4300 return df 4301 4302 4303 do_apply = lambda gb: pd.concat([ 4304 add_key_index(k, func(gb.get_group(k), *args, **kwargs)) 4305 for k in gb.groups.keys()]) 4306 elif isinstance(result, pd.Series): 4307 if isinstance(fn_input, pd.DataFrame): 4308 # DataFrameGroupBy 4309 # In this case pandas transposes the Series result, s.t. the Series 4310 # index values are the columns, and the grouping keys are the new index 4311 # values. 4312 dtype = pd.Series([result]).dtype 4313 proxy = pd.DataFrame(columns=result.index, 4314 dtype=result.dtype, 4315 index=self._ungrouped.proxy().index) 4316 elif isinstance(fn_input, pd.Series): 4317 # SeriesGroupBy 4318 # In this case the output is still a Series, but with an additional 4319 # index with the grouping keys. 4320 proxy = pd.Series(dtype=result.dtype, 4321 name=result.name, 4322 index=index_to_arrays(self._ungrouped.proxy().index) + 4323 index_to_arrays(result[:0].index)) 4324 else: 4325 # The user fn returns some non-pandas type. The expected result is a 4326 # Series where each element is the result of one user fn call. 4327 dtype = pd.Series([result]).dtype 4328 proxy = pd.Series([], dtype=dtype, index=self._ungrouped.proxy().index) 4329 4330 def do_partition_apply(df): 4331 # Remove columns from index, we only needed them there for partitioning 4332 df = df.reset_index(grouping_columns, drop=True) 4333 4334 gb = df.groupby(level=grouping_indexes or None, 4335 by=grouping_columns or None, 4336 group_keys=group_keys) 4337 4338 gb = project(gb) 4339 4340 return do_apply(gb) 4341 4342 return DeferredDataFrame( 4343 expressions.ComputedExpression( 4344 'apply', 4345 do_partition_apply, 4346 [self._ungrouped_with_index], 4347 proxy=proxy, 4348 requires_partition_by=partitionings.Index(grouping_indexes + 4349 grouping_columns), 4350 preserves_partition_by=partitionings.Index(grouping_indexes))) 4351 4352 4353 @frame_base.with_docs_from(DataFrameGroupBy) 4354 def transform(self, fn, *args, **kwargs): 4355 """Note that ``func`` will be called once during pipeline construction time 4356 with an empty pandas object, so take care if ``func`` has a side effect. 4357 4358 When called with an empty pandas object, ``func`` is expected to return an 4359 object of the same type as what will be returned when the pipeline is 4360 processing actual data. The result should have the same type and name (for 4361 a Series) or column types and names (for a DataFrame) as the actual 4362 results.""" 4363 if not callable(fn): 4364 raise NotImplementedError( 4365 "String functions are not yet supported in transform.") 4366 4367 if self._grouping_columns and not self._projection: 4368 grouping_columns = self._grouping_columns 4369 def fn_wrapper(x, *args, **kwargs): 4370 x = x.droplevel(grouping_columns) 4371 return fn(x, *args, **kwargs) 4372 else: 4373 fn_wrapper = fn 4374 4375 project = _maybe_project_func(self._projection) 4376 group_keys = self._group_keys 4377 4378 # pandas cannot execute fn to determine the right proxy. 4379 # We run user fn on a proxy here to detect the return type and generate the 4380 # proxy. 4381 result = fn_wrapper(project(self._ungrouped_with_index.proxy())) 4382 parent_frame = self._ungrouped.args()[0].proxy() 4383 if isinstance(result, pd.core.generic.NDFrame): 4384 proxy = result[:0] 4385 4386 else: 4387 # The user fn returns some non-pandas type. The expected result is a 4388 # Series where each element is the result of one user fn call. 4389 dtype = pd.Series([result]).dtype 4390 proxy = pd.Series([], dtype=dtype, name=project(parent_frame).name) 4391 4392 if not isinstance(self._projection, list): 4393 proxy.name = self._projection 4394 4395 # The final result will have the original indexes 4396 proxy.index = parent_frame.index 4397 4398 levels = self._grouping_indexes + self._grouping_columns 4399 4400 return DeferredDataFrame( 4401 expressions.ComputedExpression( 4402 'transform', 4403 lambda df: project( 4404 df.groupby(level=levels, group_keys=group_keys) 4405 ).transform( 4406 fn_wrapper, 4407 *args, 4408 **kwargs).droplevel(self._grouping_columns), 4409 [self._ungrouped_with_index], 4410 proxy=proxy, 4411 requires_partition_by=partitionings.Index(levels), 4412 preserves_partition_by=partitionings.Index(self._grouping_indexes))) 4413 4414 @frame_base.with_docs_from(DataFrameGroupBy) 4415 def pipe(self, func, *args, **kwargs): 4416 if isinstance(func, tuple): 4417 func, data = func 4418 kwargs[data] = self 4419 return func(*args, **kwargs) 4420 4421 return func(self, *args, **kwargs) 4422 4423 @frame_base.with_docs_from(DataFrameGroupBy) 4424 def filter(self, func=None, dropna=True): 4425 if func is None or not callable(func): 4426 raise TypeError("func must be specified and it must be callable") 4427 4428 def apply_fn(df): 4429 if func(df): 4430 return df 4431 elif not dropna: 4432 result = df.copy() 4433 result.iloc[:, :] = np.nan 4434 return result 4435 else: 4436 return df.iloc[:0] 4437 4438 return self.apply(apply_fn).droplevel(self._grouping_columns) 4439 4440 @property # type: ignore 4441 @frame_base.with_docs_from(DataFrameGroupBy) 4442 def dtypes(self): 4443 return frame_base.DeferredFrame.wrap( 4444 expressions.ComputedExpression( 4445 'dtypes', 4446 lambda gb: gb.dtypes, 4447 [self._expr], 4448 requires_partition_by=partitionings.Arbitrary(), 4449 preserves_partition_by=partitionings.Arbitrary() 4450 ) 4451 ) 4452 4453 if hasattr(DataFrameGroupBy, 'value_counts'): 4454 @frame_base.with_docs_from(DataFrameGroupBy) 4455 def value_counts(self, **kwargs): 4456 """ 4457 DataFrameGroupBy.value_counts() is the same as DataFrame.value_counts() 4458 """ 4459 return frame_base.DeferredFrame.wrap( 4460 expressions.ComputedExpression( 4461 'value_counts', 4462 lambda df: df.value_counts(**kwargs), [self._expr], 4463 preserves_partition_by=partitionings.Arbitrary(), 4464 requires_partition_by=partitionings.Arbitrary()) 4465 ) 4466 4467 fillna = frame_base.wont_implement_method( 4468 DataFrameGroupBy, 'fillna', explanation=( 4469 "df.fillna() should be used instead. Only method=None is supported " 4470 "because other methods are order-sensitive. df.groupby(..).fillna() " 4471 "without a method is equivalent to df.fillna().")) 4472 4473 ffill = frame_base.wont_implement_method(DataFrameGroupBy, 'ffill', 4474 reason="order-sensitive") 4475 bfill = frame_base.wont_implement_method(DataFrameGroupBy, 'bfill', 4476 reason="order-sensitive") 4477 pad = frame_base.wont_implement_method(DataFrameGroupBy, 'pad', 4478 reason="order-sensitive") 4479 backfill = frame_base.wont_implement_method(DataFrameGroupBy, 'backfill', 4480 reason="order-sensitive") 4481 4482 aggregate = agg 4483 4484 hist = frame_base.wont_implement_method(DataFrameGroupBy, 'hist', 4485 reason="plotting-tools") 4486 plot = frame_base.wont_implement_method(DataFrameGroupBy, 'plot', 4487 reason="plotting-tools") 4488 boxplot = frame_base.wont_implement_method(DataFrameGroupBy, 'boxplot', 4489 reason="plotting-tools") 4490 4491 head = frame_base.wont_implement_method( 4492 DataFrameGroupBy, 'head', explanation=_PEEK_METHOD_EXPLANATION) 4493 tail = frame_base.wont_implement_method( 4494 DataFrameGroupBy, 'tail', explanation=_PEEK_METHOD_EXPLANATION) 4495 4496 first = frame_base.not_implemented_method('first', base_type=DataFrameGroupBy) 4497 last = frame_base.not_implemented_method('last', base_type=DataFrameGroupBy) 4498 nth = property(frame_base.wont_implement_method( 4499 DataFrameGroupBy, 'nth', reason='order-sensitive')) 4500 cumcount = frame_base.wont_implement_method( 4501 DataFrameGroupBy, 'cumcount', reason='order-sensitive') 4502 cummax = frame_base.wont_implement_method( 4503 DataFrameGroupBy, 'cummax', reason='order-sensitive') 4504 cummin = frame_base.wont_implement_method( 4505 DataFrameGroupBy, 'cummin', reason='order-sensitive') 4506 cumsum = frame_base.wont_implement_method( 4507 DataFrameGroupBy, 'cumsum', reason='order-sensitive') 4508 cumprod = frame_base.wont_implement_method( 4509 DataFrameGroupBy, 'cumprod', reason='order-sensitive') 4510 diff = frame_base.wont_implement_method(DataFrameGroupBy, 'diff', 4511 reason='order-sensitive') 4512 shift = frame_base.wont_implement_method(DataFrameGroupBy, 'shift', 4513 reason='order-sensitive') 4514 4515 pct_change = frame_base.wont_implement_method(DataFrameGroupBy, 'pct_change', 4516 reason='order-sensitive') 4517 ohlc = frame_base.wont_implement_method(DataFrameGroupBy, 'ohlc', 4518 reason='order-sensitive') 4519 4520 # TODO(https://github.com/apache/beam/issues/20958): Consider allowing this 4521 # for categorical keys. 4522 __len__ = frame_base.wont_implement_method( 4523 DataFrameGroupBy, '__len__', reason="non-deferred-result") 4524 groups = property(frame_base.wont_implement_method( 4525 DataFrameGroupBy, 'groups', reason="non-deferred-result")) 4526 indices = property(frame_base.wont_implement_method( 4527 DataFrameGroupBy, 'indices', reason="non-deferred-result")) 4528 4529 resample = frame_base.wont_implement_method( 4530 DataFrameGroupBy, 'resample', reason='event-time-semantics') 4531 rolling = frame_base.wont_implement_method( 4532 DataFrameGroupBy, 'rolling', reason='event-time-semantics') 4533 ewm = frame_base.wont_implement_method( 4534 DataFrameGroupBy, 'ewm', reason="event-time-semantics") 4535 expanding = frame_base.wont_implement_method( 4536 DataFrameGroupBy, 'expanding', reason="event-time-semantics") 4537 4538 tshift = frame_base.wont_implement_method( 4539 DataFrameGroupBy, 'tshift', reason="deprecated") 4540 4541 def _maybe_project_func(projection: Optional[List[str]]): 4542 """ Returns identity func if projection is empty or None, else returns 4543 a function that projects the specified columns. """ 4544 if projection: 4545 return lambda df: df[projection] 4546 else: 4547 return lambda x: x 4548 4549 4550 def _liftable_agg(meth, postagg_meth=None): 4551 agg_name, _ = frame_base.name_and_func(meth) 4552 4553 if postagg_meth is None: 4554 post_agg_name = agg_name 4555 else: 4556 post_agg_name, _ = frame_base.name_and_func(postagg_meth) 4557 4558 @frame_base.with_docs_from(DataFrameGroupBy, name=agg_name) 4559 def wrapper(self, *args, **kwargs): 4560 assert isinstance(self, DeferredGroupBy) 4561 4562 if 'min_count' in kwargs: 4563 return _unliftable_agg(meth)(self, *args, **kwargs) 4564 4565 to_group = self._ungrouped.proxy().index 4566 is_categorical_grouping = any(to_group.get_level_values(i).is_categorical() 4567 for i in self._grouping_indexes) 4568 groupby_kwargs = self._kwargs 4569 group_keys = self._group_keys 4570 4571 # Don't include un-observed categorical values in the preagg 4572 preagg_groupby_kwargs = groupby_kwargs.copy() 4573 preagg_groupby_kwargs['observed'] = True 4574 4575 project = _maybe_project_func(self._projection) 4576 pre_agg = expressions.ComputedExpression( 4577 'pre_combine_' + agg_name, 4578 lambda df: getattr( 4579 project( 4580 df.groupby(level=list(range(df.index.nlevels)), 4581 group_keys=group_keys, 4582 **preagg_groupby_kwargs) 4583 ), 4584 agg_name)(**kwargs), 4585 [self._ungrouped], 4586 requires_partition_by=partitionings.Arbitrary(), 4587 preserves_partition_by=partitionings.Arbitrary()) 4588 4589 4590 post_agg = expressions.ComputedExpression( 4591 'post_combine_' + post_agg_name, 4592 lambda df: getattr( 4593 df.groupby(level=list(range(df.index.nlevels)), 4594 group_keys=group_keys, 4595 **groupby_kwargs), 4596 post_agg_name)(**kwargs), 4597 [pre_agg], 4598 requires_partition_by=(partitionings.Singleton(reason=( 4599 "Aggregations grouped by a categorical column are not currently " 4600 "parallelizable (https://github.com/apache/beam/issues/21827)." 4601 )) 4602 if is_categorical_grouping 4603 else partitionings.Index()), 4604 preserves_partition_by=partitionings.Arbitrary()) 4605 return frame_base.DeferredFrame.wrap(post_agg) 4606 4607 return wrapper 4608 4609 4610 def _unliftable_agg(meth): 4611 agg_name, _ = frame_base.name_and_func(meth) 4612 4613 @frame_base.with_docs_from(DataFrameGroupBy, name=agg_name) 4614 def wrapper(self, *args, **kwargs): 4615 assert isinstance(self, DeferredGroupBy) 4616 4617 to_group = self._ungrouped.proxy().index 4618 group_keys = self._group_keys 4619 is_categorical_grouping = any(to_group.get_level_values(i).is_categorical() 4620 for i in self._grouping_indexes) 4621 4622 groupby_kwargs = self._kwargs 4623 project = _maybe_project_func(self._projection) 4624 post_agg = expressions.ComputedExpression( 4625 agg_name, 4626 lambda df: getattr(project( 4627 df.groupby(level=list(range(df.index.nlevels)), 4628 group_keys=group_keys, 4629 **groupby_kwargs), 4630 ), agg_name)(**kwargs), 4631 [self._ungrouped], 4632 requires_partition_by=(partitionings.Singleton(reason=( 4633 "Aggregations grouped by a categorical column are not currently " 4634 "parallelizable (https://github.com/apache/beam/issues/21827)." 4635 )) 4636 if is_categorical_grouping 4637 else partitionings.Index()), 4638 # Some aggregation methods (e.g. corr/cov) add additional index levels. 4639 # We only preserve the ones that existed _before_ the groupby. 4640 preserves_partition_by=partitionings.Index( 4641 list(range(self._ungrouped.proxy().index.nlevels)))) 4642 return frame_base.DeferredFrame.wrap(post_agg) 4643 4644 return wrapper 4645 4646 for meth in LIFTABLE_AGGREGATIONS: 4647 setattr(DeferredGroupBy, meth, _liftable_agg(meth)) 4648 for meth in LIFTABLE_WITH_SUM_AGGREGATIONS: 4649 setattr(DeferredGroupBy, meth, _liftable_agg(meth, postagg_meth='sum')) 4650 for meth in UNLIFTABLE_AGGREGATIONS: 4651 if meth in ('kurt', 'kurtosis'): 4652 # pandas doesn't currently allow kurtosis on GroupBy: 4653 # https://github.com/pandas-dev/pandas/issues/40139 4654 continue 4655 setattr(DeferredGroupBy, meth, _unliftable_agg(meth)) 4656 4657 def _check_str_or_np_builtin(agg_func, func_list): 4658 return agg_func in func_list or ( 4659 getattr(agg_func, '__name__', None) in func_list 4660 and agg_func.__module__ in ('numpy', 'builtins')) 4661 4662 4663 def _is_associative(agg_func): 4664 return _check_str_or_np_builtin(agg_func, LIFTABLE_AGGREGATIONS) 4665 4666 def _is_liftable_with_sum(agg_func): 4667 return _check_str_or_np_builtin(agg_func, LIFTABLE_WITH_SUM_AGGREGATIONS) 4668 4669 def _is_unliftable(agg_func): 4670 return _check_str_or_np_builtin(agg_func, UNLIFTABLE_AGGREGATIONS) 4671 4672 NUMERIC_AGGREGATIONS = ['max', 'min', 'prod', 'sum', 'mean', 'median', 'std', 4673 'var', 'sem', 'mad', 'skew', 'kurt', 'kurtosis'] 4674 4675 def _is_numeric(agg_func): 4676 return _check_str_or_np_builtin(agg_func, NUMERIC_AGGREGATIONS) 4677 4678 4679 @populate_not_implemented(DataFrameGroupBy) 4680 class _DeferredGroupByCols(frame_base.DeferredFrame): 4681 # It's not clear that all of these make sense in Pandas either... 4682 agg = aggregate = frame_base._elementwise_method('agg', base=DataFrameGroupBy) 4683 any = frame_base._elementwise_method('any', base=DataFrameGroupBy) 4684 all = frame_base._elementwise_method('all', base=DataFrameGroupBy) 4685 boxplot = frame_base.wont_implement_method( 4686 DataFrameGroupBy, 'boxplot', reason="plotting-tools") 4687 describe = frame_base.not_implemented_method('describe', 4688 base_type=DataFrameGroupBy) 4689 diff = frame_base._elementwise_method('diff', base=DataFrameGroupBy) 4690 fillna = frame_base._elementwise_method('fillna', base=DataFrameGroupBy) 4691 filter = frame_base._elementwise_method('filter', base=DataFrameGroupBy) 4692 first = frame_base._elementwise_method('first', base=DataFrameGroupBy) 4693 get_group = frame_base._elementwise_method('get_group', base=DataFrameGroupBy) 4694 head = frame_base.wont_implement_method( 4695 DataFrameGroupBy, 'head', explanation=_PEEK_METHOD_EXPLANATION) 4696 hist = frame_base.wont_implement_method( 4697 DataFrameGroupBy, 'hist', reason="plotting-tools") 4698 idxmax = frame_base._elementwise_method('idxmax', base=DataFrameGroupBy) 4699 idxmin = frame_base._elementwise_method('idxmin', base=DataFrameGroupBy) 4700 last = frame_base._elementwise_method('last', base=DataFrameGroupBy) 4701 mad = frame_base._elementwise_method('mad', base=DataFrameGroupBy) 4702 max = frame_base._elementwise_method('max', base=DataFrameGroupBy) 4703 mean = frame_base._elementwise_method('mean', base=DataFrameGroupBy) 4704 median = frame_base._elementwise_method('median', base=DataFrameGroupBy) 4705 min = frame_base._elementwise_method('min', base=DataFrameGroupBy) 4706 nunique = frame_base._elementwise_method('nunique', base=DataFrameGroupBy) 4707 plot = frame_base.wont_implement_method( 4708 DataFrameGroupBy, 'plot', reason="plotting-tools") 4709 prod = frame_base._elementwise_method('prod', base=DataFrameGroupBy) 4710 quantile = frame_base._elementwise_method('quantile', base=DataFrameGroupBy) 4711 shift = frame_base._elementwise_method('shift', base=DataFrameGroupBy) 4712 size = frame_base._elementwise_method('size', base=DataFrameGroupBy) 4713 skew = frame_base._elementwise_method('skew', base=DataFrameGroupBy) 4714 std = frame_base._elementwise_method('std', base=DataFrameGroupBy) 4715 sum = frame_base._elementwise_method('sum', base=DataFrameGroupBy) 4716 tail = frame_base.wont_implement_method( 4717 DataFrameGroupBy, 'tail', explanation=_PEEK_METHOD_EXPLANATION) 4718 take = frame_base.wont_implement_method( 4719 DataFrameGroupBy, 'take', reason='deprecated') 4720 tshift = frame_base._elementwise_method('tshift', base=DataFrameGroupBy) 4721 var = frame_base._elementwise_method('var', base=DataFrameGroupBy) 4722 4723 @property # type: ignore 4724 @frame_base.with_docs_from(DataFrameGroupBy) 4725 def groups(self): 4726 return self._expr.proxy().groups 4727 4728 @property # type: ignore 4729 @frame_base.with_docs_from(DataFrameGroupBy) 4730 def indices(self): 4731 return self._expr.proxy().indices 4732 4733 @property # type: ignore 4734 @frame_base.with_docs_from(DataFrameGroupBy) 4735 def ndim(self): 4736 return self._expr.proxy().ndim 4737 4738 @property # type: ignore 4739 @frame_base.with_docs_from(DataFrameGroupBy) 4740 def ngroups(self): 4741 return self._expr.proxy().ngroups 4742 4743 4744 @populate_not_implemented(pd.core.indexes.base.Index) 4745 class _DeferredIndex(object): 4746 def __init__(self, frame): 4747 self._frame = frame 4748 4749 @property 4750 def names(self): 4751 return self._frame._expr.proxy().index.names 4752 4753 @names.setter 4754 def names(self, value): 4755 def set_index_names(df): 4756 df = df.copy() 4757 df.index.names = value 4758 return df 4759 4760 self._frame._expr = expressions.ComputedExpression( 4761 'set_index_names', 4762 set_index_names, 4763 [self._frame._expr], 4764 requires_partition_by=partitionings.Arbitrary(), 4765 preserves_partition_by=partitionings.Arbitrary()) 4766 4767 @property 4768 def name(self): 4769 return self._frame._expr.proxy().index.name 4770 4771 @name.setter 4772 def name(self, value): 4773 self.names = [value] 4774 4775 @property 4776 def ndim(self): 4777 return self._frame._expr.proxy().index.ndim 4778 4779 @property 4780 def dtype(self): 4781 return self._frame._expr.proxy().index.dtype 4782 4783 @property 4784 def nlevels(self): 4785 return self._frame._expr.proxy().index.nlevels 4786 4787 def __getattr__(self, name): 4788 raise NotImplementedError('index.%s' % name) 4789 4790 4791 @populate_not_implemented(pd.core.indexing._LocIndexer) 4792 class _DeferredLoc(object): 4793 def __init__(self, frame): 4794 self._frame = frame 4795 4796 def __getitem__(self, key): 4797 if isinstance(key, tuple): 4798 rows, cols = key 4799 return self[rows][cols] 4800 elif isinstance(key, list) and key and isinstance(key[0], bool): 4801 # Aligned by numerical key. 4802 raise NotImplementedError(type(key)) 4803 elif isinstance(key, list): 4804 # Select rows, but behaves poorly on missing values. 4805 raise NotImplementedError(type(key)) 4806 elif isinstance(key, slice): 4807 args = [self._frame._expr] 4808 func = lambda df: df.loc[key] 4809 elif isinstance(key, frame_base.DeferredFrame): 4810 func = lambda df, key: df.loc[key] 4811 if pd.core.dtypes.common.is_bool_dtype(key._expr.proxy()): 4812 # Boolean indexer, just pass it in as-is 4813 args = [self._frame._expr, key._expr] 4814 else: 4815 # Likely a DeferredSeries of labels, overwrite the key's index with it's 4816 # values so we can colocate them with the labels they're selecting 4817 def data_to_index(s): 4818 s = s.copy() 4819 s.index = s 4820 return s 4821 4822 reindexed_expr = expressions.ComputedExpression( 4823 'data_to_index', 4824 data_to_index, 4825 [key._expr], 4826 requires_partition_by=partitionings.Arbitrary(), 4827 preserves_partition_by=partitionings.Singleton(), 4828 ) 4829 args = [self._frame._expr, reindexed_expr] 4830 elif callable(key): 4831 4832 def checked_callable_key(df): 4833 computed_index = key(df) 4834 if isinstance(computed_index, tuple): 4835 row_index, _ = computed_index 4836 else: 4837 row_index = computed_index 4838 if isinstance(row_index, list) and row_index and isinstance( 4839 row_index[0], bool): 4840 raise NotImplementedError(type(row_index)) 4841 elif not isinstance(row_index, (slice, pd.Series)): 4842 raise NotImplementedError(type(row_index)) 4843 return computed_index 4844 4845 args = [self._frame._expr] 4846 func = lambda df: df.loc[checked_callable_key] 4847 else: 4848 raise NotImplementedError(type(key)) 4849 4850 return frame_base.DeferredFrame.wrap( 4851 expressions.ComputedExpression( 4852 'loc', 4853 func, 4854 args, 4855 requires_partition_by=( 4856 partitionings.JoinIndex() 4857 if len(args) > 1 4858 else partitionings.Arbitrary()), 4859 preserves_partition_by=partitionings.Arbitrary())) 4860 4861 __setitem__ = frame_base.not_implemented_method( 4862 'loc.setitem', base_type=pd.core.indexing._LocIndexer) 4863 4864 @populate_not_implemented(pd.core.indexing._iLocIndexer) 4865 class _DeferredILoc(object): 4866 def __init__(self, frame): 4867 self._frame = frame 4868 4869 def __getitem__(self, index): 4870 if isinstance(index, tuple): 4871 rows, _ = index 4872 if rows != slice(None, None, None): 4873 raise frame_base.WontImplementError( 4874 "Using iloc to select rows is not supported because it's " 4875 "position-based indexing is sensitive to the order of the data.", 4876 reason="order-sensitive") 4877 return frame_base.DeferredFrame.wrap( 4878 expressions.ComputedExpression( 4879 'iloc', 4880 lambda df: df.iloc[index], 4881 [self._frame._expr], 4882 requires_partition_by=partitionings.Arbitrary(), 4883 preserves_partition_by=partitionings.Arbitrary())) 4884 else: 4885 raise frame_base.WontImplementError( 4886 "Using iloc to select rows is not supported because it's " 4887 "position-based indexing is sensitive to the order of the data.", 4888 reason="order-sensitive") 4889 4890 def __setitem__(self, index, value): 4891 raise frame_base.WontImplementError( 4892 "Using iloc to mutate a frame is not supported because it's " 4893 "position-based indexing is sensitive to the order of the data.", 4894 reason="order-sensitive") 4895 4896 4897 class _DeferredStringMethods(frame_base.DeferredBase): 4898 @frame_base.with_docs_from(pd.core.strings.StringMethods) 4899 @frame_base.args_to_kwargs(pd.core.strings.StringMethods) 4900 @frame_base.populate_defaults(pd.core.strings.StringMethods) 4901 def cat(self, others, join, **kwargs): 4902 """If defined, ``others`` must be a :class:`DeferredSeries` or a ``list`` of 4903 ``DeferredSeries``.""" 4904 if others is None: 4905 # Concatenate series into a single String 4906 requires = partitionings.Singleton(reason=( 4907 "cat(others=None) concatenates all data in a Series into a single " 4908 "string, so it requires collecting all data on a single node." 4909 )) 4910 func = lambda df: df.str.cat(join=join, **kwargs) 4911 args = [self._expr] 4912 4913 elif (isinstance(others, frame_base.DeferredBase) or 4914 (isinstance(others, list) and 4915 all(isinstance(other, frame_base.DeferredBase) for other in others))): 4916 4917 if isinstance(others, frame_base.DeferredBase): 4918 others = [others] 4919 4920 requires = partitionings.Index() 4921 def func(*args): 4922 return args[0].str.cat(others=args[1:], join=join, **kwargs) 4923 args = [self._expr] + [other._expr for other in others] 4924 4925 else: 4926 raise frame_base.WontImplementError( 4927 "others must be None, DeferredSeries, or List[DeferredSeries] " 4928 f"(encountered {type(others)}). Other types are not supported " 4929 "because they make this operation sensitive to the order of the " 4930 "data.", reason="order-sensitive") 4931 4932 return frame_base.DeferredFrame.wrap( 4933 expressions.ComputedExpression( 4934 'cat', 4935 func, 4936 args, 4937 requires_partition_by=requires, 4938 preserves_partition_by=partitionings.Arbitrary())) 4939 4940 @frame_base.with_docs_from(pd.core.strings.StringMethods) 4941 @frame_base.args_to_kwargs(pd.core.strings.StringMethods) 4942 def repeat(self, repeats): 4943 """``repeats`` must be an ``int`` or a :class:`DeferredSeries`. Lists are 4944 not supported because they make this operation order-sensitive.""" 4945 if isinstance(repeats, int): 4946 return frame_base.DeferredFrame.wrap( 4947 expressions.ComputedExpression( 4948 'repeat', 4949 lambda series: series.str.repeat(repeats), 4950 [self._expr], 4951 # TODO(https://github.com/apache/beam/issues/20573): Defer to 4952 # pandas to compute this proxy. Currently it incorrectly infers 4953 # dtype bool, may require upstream fix. 4954 proxy=self._expr.proxy(), 4955 requires_partition_by=partitionings.Arbitrary(), 4956 preserves_partition_by=partitionings.Arbitrary())) 4957 elif isinstance(repeats, frame_base.DeferredBase): 4958 return frame_base.DeferredFrame.wrap( 4959 expressions.ComputedExpression( 4960 'repeat', 4961 lambda series, repeats_series: series.str.repeat(repeats_series), 4962 [self._expr, repeats._expr], 4963 # TODO(https://github.com/apache/beam/issues/20573): Defer to 4964 # pandas to compute this proxy. Currently it incorrectly infers 4965 # dtype bool, may require upstream fix. 4966 proxy=self._expr.proxy(), 4967 requires_partition_by=partitionings.Index(), 4968 preserves_partition_by=partitionings.Arbitrary())) 4969 elif isinstance(repeats, list): 4970 raise frame_base.WontImplementError( 4971 "str.repeat(repeats=) repeats must be an int or a DeferredSeries. " 4972 "Lists are not supported because they make this operation sensitive " 4973 "to the order of the data.", reason="order-sensitive") 4974 else: 4975 raise TypeError("str.repeat(repeats=) value must be an int or a " 4976 f"DeferredSeries (encountered {type(repeats)}).") 4977 4978 @frame_base.with_docs_from(pd.core.strings.StringMethods) 4979 @frame_base.args_to_kwargs(pd.core.strings.StringMethods) 4980 def get_dummies(self, **kwargs): 4981 """ 4982 Series must be categorical dtype. Please cast to ``CategoricalDtype`` 4983 to ensure correct categories. 4984 """ 4985 dtype = self._expr.proxy().dtype 4986 if not isinstance(dtype, pd.CategoricalDtype): 4987 raise frame_base.WontImplementError( 4988 "get_dummies() of non-categorical type is not supported because " 4989 "the type of the output column depends on the data. Please use " 4990 "pd.CategoricalDtype with explicit categories.", 4991 reason="non-deferred-columns") 4992 4993 split_cats = [ 4994 cat.split(sep=kwargs.get('sep', '|')) for cat in dtype.categories 4995 ] 4996 4997 # Adding the nan category because there could be the case that 4998 # the data includes NaNs, which is not valid to be casted as a Category, 4999 # but nevertheless would be broadcasted as a column in get_dummies() 5000 columns = sorted(set().union(*split_cats)) 5001 columns = columns + ['nan'] if 'nan' not in columns else columns 5002 5003 proxy = pd.DataFrame(columns=columns).astype(int) 5004 5005 return frame_base.DeferredFrame.wrap( 5006 expressions.ComputedExpression( 5007 'get_dummies', 5008 lambda series: pd.concat( 5009 [proxy, series.str.get_dummies(**kwargs)] 5010 ).fillna(value=0, method=None).astype('int64'), 5011 [self._expr], 5012 proxy=proxy, 5013 requires_partition_by=partitionings.Arbitrary(), 5014 preserves_partition_by=partitionings.Arbitrary())) 5015 5016 def _split_helper(self, rsplit=False, **kwargs): 5017 expand = kwargs.get('expand', False) 5018 5019 if not expand: 5020 # Not creating separate columns 5021 proxy = self._expr.proxy() 5022 if not rsplit: 5023 func = lambda s: pd.concat([proxy, s.str.split(**kwargs)]) 5024 else: 5025 func = lambda s: pd.concat([proxy, s.str.rsplit(**kwargs)]) 5026 else: 5027 # Creating separate columns, so must be more strict on dtype 5028 dtype = self._expr.proxy().dtype 5029 if not isinstance(dtype, pd.CategoricalDtype): 5030 method_name = 'rsplit' if rsplit else 'split' 5031 raise frame_base.WontImplementError( 5032 f"{method_name}() of non-categorical type is not supported because " 5033 "the type of the output column depends on the data. Please use " 5034 "pd.CategoricalDtype with explicit categories.", 5035 reason="non-deferred-columns") 5036 5037 # Split the categories 5038 split_cats = dtype.categories.str.split(**kwargs) 5039 5040 # Count the number of new columns to create for proxy 5041 max_splits = len(max(split_cats, key=len)) 5042 proxy = pd.DataFrame(columns=range(max_splits)) 5043 5044 def func(s): 5045 if not rsplit: 5046 result = s.str.split(**kwargs) 5047 else: 5048 result = s.str.rsplit(**kwargs) 5049 result[~result.isna()].replace(np.nan, value=None) 5050 return result 5051 5052 return frame_base.DeferredFrame.wrap( 5053 expressions.ComputedExpression( 5054 'split', 5055 func, 5056 [self._expr], 5057 proxy=proxy, 5058 requires_partition_by=partitionings.Arbitrary(), 5059 preserves_partition_by=partitionings.Arbitrary())) 5060 5061 @frame_base.with_docs_from(pd.core.strings.StringMethods) 5062 @frame_base.args_to_kwargs(pd.core.strings.StringMethods) 5063 @frame_base.populate_defaults(pd.core.strings.StringMethods) 5064 def split(self, **kwargs): 5065 """ 5066 Like other non-deferred methods, dtype must be CategoricalDtype. 5067 One exception is when ``expand`` is ``False``. Because we are not 5068 creating new columns at construction time, dtype can be `str`. 5069 """ 5070 return self._split_helper(rsplit=False, **kwargs) 5071 5072 @frame_base.with_docs_from(pd.core.strings.StringMethods) 5073 @frame_base.args_to_kwargs(pd.core.strings.StringMethods) 5074 @frame_base.populate_defaults(pd.core.strings.StringMethods) 5075 def rsplit(self, **kwargs): 5076 """ 5077 Like other non-deferred methods, dtype must be CategoricalDtype. 5078 One exception is when ``expand`` is ``False``. Because we are not 5079 creating new columns at construction time, dtype can be `str`. 5080 """ 5081 return self._split_helper(rsplit=True, **kwargs) 5082 5083 5084 ELEMENTWISE_STRING_METHODS = [ 5085 'capitalize', 5086 'casefold', 5087 'contains', 5088 'count', 5089 'endswith', 5090 'extract', 5091 'findall', 5092 'fullmatch', 5093 'get', 5094 'isalnum', 5095 'isalpha', 5096 'isdecimal', 5097 'isdigit', 5098 'islower', 5099 'isnumeric', 5100 'isspace', 5101 'istitle', 5102 'isupper', 5103 'join', 5104 'len', 5105 'lower', 5106 'lstrip', 5107 'match', 5108 'pad', 5109 'partition', 5110 'removeprefix', 5111 'removesuffix', 5112 'replace', 5113 'rpartition', 5114 'rstrip', 5115 'slice', 5116 'slice_replace', 5117 'startswith', 5118 'strip', 5119 'swapcase', 5120 'title', 5121 'upper', 5122 'wrap', 5123 'zfill', 5124 '__getitem__', 5125 ] 5126 5127 NON_ELEMENTWISE_STRING_METHODS = [ 5128 'extractall', 5129 ] 5130 5131 def make_str_func(method): 5132 def func(df, *args, **kwargs): 5133 try: 5134 df_str = df.str 5135 except AttributeError: 5136 # If there's a non-string value in a Series passed to .str method, pandas 5137 # will generally just replace it with NaN in the result. However if 5138 # there are _only_ non-string values, pandas will raise: 5139 # 5140 # AttributeError: Can only use .str accessor with string values! 5141 # 5142 # This can happen to us at execution time if we split a partition that is 5143 # only non-strings. This branch just replaces all those values with NaN 5144 # in that case. 5145 return df.map(lambda _: np.nan) 5146 else: 5147 return getattr(df_str, method)(*args, **kwargs) 5148 5149 return func 5150 5151 for method in ELEMENTWISE_STRING_METHODS: 5152 if not hasattr(pd.core.strings.StringMethods, method): 5153 # older versions (1.0.x) don't support some of these methods 5154 continue 5155 setattr(_DeferredStringMethods, 5156 method, 5157 frame_base._elementwise_method(make_str_func(method), 5158 name=method, 5159 base=pd.core.strings.StringMethods)) 5160 5161 for method in NON_ELEMENTWISE_STRING_METHODS: 5162 if not hasattr(pd.core.strings.StringMethods, method): 5163 # older versions (1.0.x) don't support some of these methods 5164 continue 5165 setattr(_DeferredStringMethods, 5166 method, 5167 frame_base._proxy_method( 5168 make_str_func(method), 5169 name=method, 5170 base=pd.core.strings.StringMethods, 5171 requires_partition_by=partitionings.Arbitrary(), 5172 preserves_partition_by=partitionings.Singleton())) 5173 5174 5175 def make_cat_func(method): 5176 def func(df, *args, **kwargs): 5177 return getattr(df.cat, method)(*args, **kwargs) 5178 5179 return func 5180 5181 5182 class _DeferredCategoricalMethods(frame_base.DeferredBase): 5183 @property # type: ignore 5184 @frame_base.with_docs_from(pd.core.arrays.categorical.CategoricalAccessor) 5185 def categories(self): 5186 return self._expr.proxy().cat.categories 5187 5188 @property # type: ignore 5189 @frame_base.with_docs_from(pd.core.arrays.categorical.CategoricalAccessor) 5190 def ordered(self): 5191 return self._expr.proxy().cat.ordered 5192 5193 @property # type: ignore 5194 @frame_base.with_docs_from(pd.core.arrays.categorical.CategoricalAccessor) 5195 def codes(self): 5196 return frame_base.DeferredFrame.wrap( 5197 expressions.ComputedExpression( 5198 'codes', 5199 lambda s: s.cat.codes, 5200 [self._expr], 5201 requires_partition_by=partitionings.Arbitrary(), 5202 preserves_partition_by=partitionings.Arbitrary(), 5203 ) 5204 ) 5205 5206 remove_unused_categories = frame_base.wont_implement_method( 5207 pd.core.arrays.categorical.CategoricalAccessor, 5208 'remove_unused_categories', reason="non-deferred-columns") 5209 5210 ELEMENTWISE_CATEGORICAL_METHODS = [ 5211 'add_categories', 5212 'as_ordered', 5213 'as_unordered', 5214 'remove_categories', 5215 'rename_categories', 5216 'reorder_categories', 5217 'set_categories', 5218 ] 5219 5220 for method in ELEMENTWISE_CATEGORICAL_METHODS: 5221 setattr(_DeferredCategoricalMethods, 5222 method, 5223 frame_base._elementwise_method( 5224 make_cat_func(method), name=method, 5225 base=pd.core.arrays.categorical.CategoricalAccessor)) 5226 5227 class _DeferredDatetimeMethods(frame_base.DeferredBase): 5228 @property # type: ignore 5229 @frame_base.with_docs_from(pd.core.indexes.accessors.DatetimeProperties) 5230 def tz(self): 5231 return self._expr.proxy().dt.tz 5232 5233 @property # type: ignore 5234 @frame_base.with_docs_from(pd.core.indexes.accessors.DatetimeProperties) 5235 def freq(self): 5236 return self._expr.proxy().dt.freq 5237 5238 @frame_base.with_docs_from(pd.core.indexes.accessors.DatetimeProperties) 5239 def tz_localize(self, *args, ambiguous='infer', **kwargs): 5240 """``ambiguous`` cannot be set to ``"infer"`` as its semantics are 5241 order-sensitive. Similarly, specifying ``ambiguous`` as an 5242 :class:`~numpy.ndarray` is order-sensitive, but you can achieve similar 5243 functionality by specifying ``ambiguous`` as a Series.""" 5244 if isinstance(ambiguous, np.ndarray): 5245 raise frame_base.WontImplementError( 5246 "tz_localize(ambiguous=ndarray) is not supported because it makes " 5247 "this operation sensitive to the order of the data. Please use a " 5248 "DeferredSeries instead.", 5249 reason="order-sensitive") 5250 elif isinstance(ambiguous, frame_base.DeferredFrame): 5251 return frame_base.DeferredFrame.wrap( 5252 expressions.ComputedExpression( 5253 'tz_localize', 5254 lambda s, 5255 ambiguous: s.dt.tz_localize(*args, ambiguous=ambiguous, **kwargs), 5256 [self._expr, ambiguous._expr], 5257 requires_partition_by=partitionings.Index(), 5258 preserves_partition_by=partitionings.Arbitrary())) 5259 elif ambiguous == 'infer': 5260 # infer attempts to infer based on the order of the timestamps 5261 raise frame_base.WontImplementError( 5262 f"tz_localize(ambiguous={ambiguous!r}) is not allowed because it " 5263 "makes this operation sensitive to the order of the data.", 5264 reason="order-sensitive") 5265 5266 return frame_base.DeferredFrame.wrap( 5267 expressions.ComputedExpression( 5268 'tz_localize', 5269 lambda s: s.dt.tz_localize(*args, ambiguous=ambiguous, **kwargs), 5270 [self._expr], 5271 requires_partition_by=partitionings.Arbitrary(), 5272 preserves_partition_by=partitionings.Arbitrary())) 5273 5274 5275 to_period = frame_base.wont_implement_method( 5276 pd.core.indexes.accessors.DatetimeProperties, 'to_period', 5277 reason="event-time-semantics") 5278 to_pydatetime = frame_base.wont_implement_method( 5279 pd.core.indexes.accessors.DatetimeProperties, 'to_pydatetime', 5280 reason="non-deferred-result") 5281 to_pytimedelta = frame_base.wont_implement_method( 5282 pd.core.indexes.accessors.DatetimeProperties, 'to_pytimedelta', 5283 reason="non-deferred-result") 5284 5285 def make_dt_property(method): 5286 def func(df): 5287 return getattr(df.dt, method) 5288 5289 return func 5290 5291 def make_dt_func(method): 5292 def func(df, *args, **kwargs): 5293 return getattr(df.dt, method)(*args, **kwargs) 5294 5295 return func 5296 5297 5298 ELEMENTWISE_DATETIME_METHODS = [ 5299 'ceil', 5300 'day_name', 5301 'month_name', 5302 'floor', 5303 'isocalendar', 5304 'round', 5305 'normalize', 5306 'strftime', 5307 'tz_convert', 5308 ] 5309 5310 for method in ELEMENTWISE_DATETIME_METHODS: 5311 if not hasattr(pd.core.indexes.accessors.DatetimeProperties, method): 5312 # older versions (1.0.x) don't support some of these methods 5313 continue 5314 setattr(_DeferredDatetimeMethods, 5315 method, 5316 frame_base._elementwise_method( 5317 make_dt_func(method), 5318 name=method, 5319 base=pd.core.indexes.accessors.DatetimeProperties)) 5320 5321 ELEMENTWISE_DATETIME_PROPERTIES = [ 5322 'date', 5323 'day', 5324 'dayofweek', 5325 'dayofyear', 5326 'days_in_month', 5327 'daysinmonth', 5328 'hour', 5329 'is_leap_year', 5330 'is_month_end', 5331 'is_month_start', 5332 'is_quarter_end', 5333 'is_quarter_start', 5334 'is_year_end', 5335 'is_year_start', 5336 'microsecond', 5337 'minute', 5338 'month', 5339 'nanosecond', 5340 'quarter', 5341 'second', 5342 'time', 5343 'timetz', 5344 'week', 5345 'weekday', 5346 'weekofyear', 5347 'year', 5348 ] 5349 5350 for method in ELEMENTWISE_DATETIME_PROPERTIES: 5351 setattr(_DeferredDatetimeMethods, 5352 method, 5353 property(frame_base._elementwise_method( 5354 make_dt_property(method), 5355 name=method, 5356 base=pd.core.indexes.accessors.DatetimeProperties))) 5357 5358 5359 for base in ['add', 5360 'sub', 5361 'mul', 5362 'div', 5363 'truediv', 5364 'floordiv', 5365 'mod', 5366 'divmod', 5367 'pow', 5368 'and', 5369 'or']: 5370 for p in ['%s', 'r%s', '__%s__', '__r%s__']: 5371 # TODO: non-trivial level? 5372 name = p % base 5373 if hasattr(pd.Series, name): 5374 setattr( 5375 DeferredSeries, 5376 name, 5377 frame_base._elementwise_method(name, restrictions={'level': None}, 5378 base=pd.Series)) 5379 if hasattr(pd.DataFrame, name): 5380 setattr( 5381 DeferredDataFrame, 5382 name, 5383 frame_base._elementwise_method(name, restrictions={'level': None}, 5384 base=pd.DataFrame)) 5385 inplace_name = '__i%s__' % base 5386 if hasattr(pd.Series, inplace_name): 5387 setattr( 5388 DeferredSeries, 5389 inplace_name, 5390 frame_base._elementwise_method(inplace_name, inplace=True, 5391 base=pd.Series)) 5392 if hasattr(pd.DataFrame, inplace_name): 5393 setattr( 5394 DeferredDataFrame, 5395 inplace_name, 5396 frame_base._elementwise_method(inplace_name, inplace=True, 5397 base=pd.DataFrame)) 5398 5399 # Allow dataframe | SchemaTransform 5400 def _create_maybe_elementwise_or(base): 5401 elementwise = frame_base._elementwise_method( 5402 '__or__', restrictions={'level': None}, base=base) 5403 5404 def _maybe_elementwise_or(self, right): 5405 if isinstance(right, PTransform): 5406 return convert.to_dataframe(convert.to_pcollection(self) | right) 5407 else: 5408 return elementwise(self, right) 5409 5410 return _maybe_elementwise_or 5411 5412 5413 DeferredSeries.__or__ = _create_maybe_elementwise_or(pd.Series) # type: ignore 5414 DeferredDataFrame.__or__ = _create_maybe_elementwise_or(pd.DataFrame) # type: ignore 5415 5416 5417 for name in ['lt', 'le', 'gt', 'ge', 'eq', 'ne']: 5418 for p in '%s', '__%s__': 5419 # Note that non-underscore name is used for both as the __xxx__ methods are 5420 # order-sensitive. 5421 setattr(DeferredSeries, p % name, 5422 frame_base._elementwise_method(name, base=pd.Series)) 5423 setattr(DeferredDataFrame, p % name, 5424 frame_base._elementwise_method(name, base=pd.DataFrame)) 5425 5426 for name in ['__neg__', '__pos__', '__invert__']: 5427 setattr(DeferredSeries, name, 5428 frame_base._elementwise_method(name, base=pd.Series)) 5429 setattr(DeferredDataFrame, name, 5430 frame_base._elementwise_method(name, base=pd.DataFrame)) 5431 5432 DeferredSeries.multiply = DeferredSeries.mul # type: ignore 5433 DeferredDataFrame.multiply = DeferredDataFrame.mul # type: ignore 5434 DeferredSeries.subtract = DeferredSeries.sub # type: ignore 5435 DeferredDataFrame.subtract = DeferredDataFrame.sub # type: ignore 5436 DeferredSeries.divide = DeferredSeries.div # type: ignore 5437 DeferredDataFrame.divide = DeferredDataFrame.div # type: ignore 5438 5439 5440 def _slice_parts(s): 5441 yield s.start 5442 yield s.stop 5443 yield s.step 5444 5445 def _is_null_slice(s): 5446 return isinstance(s, slice) and all(x is None for x in _slice_parts(s)) 5447 5448 def _is_integer_slice(s): 5449 return isinstance(s, slice) and all( 5450 x is None or isinstance(x, int) 5451 for x in _slice_parts(s)) and not _is_null_slice(s)