#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
import operator
import re
from inspect import cleandoc
from inspect import getfullargspec
from inspect import isclass
from inspect import ismodule
from inspect import unwrap
from typing import Any
from typing import Callable
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union

import pandas as pd

from apache_beam.dataframe import expressions
from apache_beam.dataframe import partitionings


class DeferredBase(object):
  """Base class for deferred objects that wrap an expression.

  Subclasses register themselves for a proxy type through ``_register_for``;
  ``wrap`` then picks the registered wrapper based on the type of the
  expression's proxy object.
  """

  # Maps the type of an expression's proxy to the deferred wrapper class.
  _pandas_type_map = {}  # type: Dict[Union[type, None], type]

  def __init__(self, expr):
    self._expr = expr

  @classmethod
  def _register_for(cls, pandas_type):
    """Return a class decorator registering a wrapper for ``pandas_type``."""
    def wrapper(deferred_type):
      cls._pandas_type_map[pandas_type] = deferred_type
      return deferred_type

    return wrapper

  @classmethod
  def wrap(cls, expr, split_tuples=True):
    """Wrap ``expr`` in the deferred type registered for its proxy type.

    Args:
      expr: The expression to wrap.
      split_tuples: If true and the proxy of ``expr`` is a tuple, wrap each
        element of the tuple separately and return a tuple of wrappers.

    Returns:
      An instance of the registered wrapper type, a ``_DeferredScalar`` when
      no wrapper is registered for the proxy type, or a tuple of such objects.

    Raises:
      ValueError: If no wrapper is registered for the proxy type and ``expr``
        does not require Singleton partitioning.
    """
    proxy_type = type(expr.proxy())
    if proxy_type is tuple and split_tuples:

      def get(ix):
        return expressions.ComputedExpression(
            # yapf: disable
            'get_%d' % ix,
            lambda t: t[ix],
            [expr],
            requires_partition_by=partitionings.Arbitrary(),
            preserves_partition_by=partitionings.Singleton())

      return tuple(cls.wrap(get(ix)) for ix in range(len(expr.proxy())))
    elif proxy_type in cls._pandas_type_map:
      wrapper_type = cls._pandas_type_map[proxy_type]
    else:
      if expr.requires_partition_by() != partitionings.Singleton():
        raise ValueError(
            'Scalar expression %s of type %s partitoned by non-singleton %s' %
            (expr, proxy_type, expr.requires_partition_by()))
      wrapper_type = _DeferredScalar
    return wrapper_type(expr)

  def _elementwise(
      self, func, name=None, other_args=(), other_kwargs=None, inplace=False):
    """Apply ``func`` to this object through ``_elementwise_function``.

    ``self`` is passed as the first argument, followed by ``other_args`` and
    ``other_kwargs``.
    """
    other_kwargs = other_kwargs or {}
    return _elementwise_function(
        func, name, inplace=inplace)(self, *other_args, **other_kwargs)

  def __reduce__(self):
    # Pickle as a placeholder built from str(self) rather than pickling the
    # wrapped expression.
    return UnusableUnpickledDeferredBase, (str(self), )


class UnusableUnpickledDeferredBase(object):
  """Placeholder object used to break the transitive pickling chain in case a
  DeferredBase accidentally gets pickled (e.g. as part of globals).

  Trying to use this object after unpickling is a bug and will result in an
  error.
  """
  def __init__(self, name):
    self._name = name

  def __repr__(self):
    # The attribute is stored as ``_name``; reading ``self.name`` raised
    # AttributeError on every repr() call.
    return 'UnusablePickledDeferredBase(%r)' % self._name


class DeferredFrame(DeferredBase):
  pass


class _DeferredScalar(DeferredBase):
  """Deferred wrapper used for expressions with no registered wrapper type."""
  def apply(self, func, name=None, args=()):
    """Apply ``func`` to this scalar and the scalars in ``args``.

    Args:
      func: Callable receiving the value of this scalar followed by the values
        of ``args``.
      name: Name for the resulting expression; defaults to ``func.__name__``.
      args: Other deferred objects whose expressions are passed to ``func``.

    Returns:
      The result of ``DeferredFrame.wrap`` on the new expression.
    """
    if name is None:
      name = func.__name__
    # Passes True when every argument is a deferred scalar, otherwise None.
    with expressions.allow_non_parallel_operations(
        all(isinstance(arg, _DeferredScalar) for arg in args) or None):
      return DeferredFrame.wrap(
          expressions.ComputedExpression(
              name,
              func, [self._expr] + [arg._expr for arg in args],
              requires_partition_by=partitionings.Singleton()))

  def __neg__(self):
    return self.apply(operator.neg)

  def __pos__(self):
    return self.apply(operator.pos)

  def __invert__(self):
    return self.apply(operator.invert)

  def __repr__(self):
    return f"DeferredScalar[type={type(self._expr.proxy())}]"

  def __bool__(self):
    # TODO(BEAM-11951): Link to documentation
    raise TypeError(
        "Testing the truth value of a deferred scalar is not "
        "allowed. It's not possible to branch on the result of "
        "deferred operations.")


def _scalar_binop(op):
  """Build the ``_DeferredScalar`` implementation of binary operator ``op``.

  The returned method handles non-deferred operands and other deferred
  scalars; for any other ``DeferredBase`` it returns ``NotImplemented``.
  """
  def binop(self, other):
    if not isinstance(other, DeferredBase):
      return self.apply(lambda left: getattr(left, op)(other), name=op)
    elif isinstance(other, _DeferredScalar):
      return self.apply(
          lambda left, right: getattr(left, op)(right), name=op, args=[other])
    else:
      return NotImplemented

  return binop


# Install the binary operators on _DeferredScalar.
for op in ['__add__',
           '__sub__',
           '__mul__',
           '__div__',
           '__truediv__',
           '__floordiv__',
           '__mod__',
           '__divmod__',
           '__pow__',
           '__and__',
           '__or__']:
  setattr(_DeferredScalar, op, _scalar_binop(op))

DeferredBase._pandas_type_map[None] = _DeferredScalar


def name_and_func(method: Union[str, Callable]) -> Tuple[str, Callable]:
  """For the given method name or method, return the method name and the method
  itself.

  When ``method`` is a string, the returned callable looks the attribute up on
  its first argument and calls it with the remaining arguments.

  For internal use only. No backwards compatibility guarantees."""
  if isinstance(method, str):
    method_str = method
    func = lambda df, *args, **kwargs: getattr(df, method_str)(*args, **kwargs)
    return method, func
  else:
    return method.__name__, method


def _elementwise_method(
    func, name=None, restrictions=None, inplace=False, base=None):
  """``_proxy_method`` with Arbitrary required and preserved partitioning."""
  return _proxy_method(
      func,
      name,
      restrictions,
      inplace,
      base,
      requires_partition_by=partitionings.Arbitrary(),
      preserves_partition_by=partitionings.Arbitrary())


def _proxy_method(
    func,
    name=None,
    restrictions=None,
    inplace=False,
    base=None,
    *,
    requires_partition_by,  # type: partitionings.Partitioning
    preserves_partition_by,  # type: partitionings.Partitioning
):
  """Like ``_proxy_function``, but ``base`` is mandatory.

  When ``name`` is None, ``func`` may be a method name or a callable and is
  resolved with ``name_and_func``.

  Raises:
    ValueError: If ``base`` is None.
  """
  if name is None:
    name, func = name_and_func(func)
  if base is None:
    raise ValueError("base is required for _proxy_method")
  return _proxy_function(
      func,
      name,
      restrictions,
      inplace,
      base,
      requires_partition_by=requires_partition_by,
      preserves_partition_by=preserves_partition_by)


def _elementwise_function(
    func, name=None, restrictions=None, inplace=False, base=None):
  """``_proxy_function`` with Arbitrary required and preserved partitioning."""
  return _proxy_function(
      func,
      name,
      restrictions,
      inplace,
      base,
      requires_partition_by=partitionings.Arbitrary(),
      preserves_partition_by=partitionings.Arbitrary())


def _proxy_function(
    func,  # type: Union[Callable, str]
    name=None,  # type: Optional[str]
    restrictions=None,  # type: Optional[Dict[str, Union[Any, List[Any]]]]
    inplace=False,  # type: bool
    base=None,  # type: Optional[type]
    *,
    requires_partition_by,  # type: partitionings.Partitioning
    preserves_partition_by,  # type: partitionings.Partitioning
):
  """Build a function that defers ``func`` as a ``ComputedExpression``.

  Args:
    func: The callable to defer, or the name of a method to call on the first
      argument.
    name: Name of the returned function and of the expressions it creates.
      Defaults to ``func`` itself when it is a string, else ``func.__name__``.
    restrictions: Maps an argument name to the value(s) accepted for it: a
      predicate, a list of allowed values, or a single allowed value.
    inplace: If true, the result expression replaces the expression of the
      first positional argument and nothing is returned.
    base: If given, the returned function is passed through
      ``with_docs_from(base)``.
    requires_partition_by: Partitioning required by the created expression.
    preserves_partition_by: Partitioning preserved by the created expression.

  Returns:
    A wrapper accepting the same arguments as ``func``.
  """
  if isinstance(func, str):
    # Resolve a method name into a callable; ``apply`` below calls ``func``
    # directly, which cannot work on a string.
    method_name, func = name_and_func(func)
    if name is None:
      name = method_name
  elif name is None:
    name = func.__name__
  if restrictions is None:
    restrictions = {}

  def wrapper(*args, **kwargs):
    # Reject argument values that violate a restriction.
    for key, values in restrictions.items():
      if key in kwargs:
        value = kwargs[key]
      else:
        try:
          ix = getfullargspec(func).args.index(key)
        except ValueError:
          # TODO: fix for delegation?
          continue
        if len(args) <= ix:
          continue
        value = args[ix]
      if callable(values):
        supported = values(value)
      elif isinstance(values, list):
        supported = value in values
      else:
        # Compare against the single allowed value. The previous check
        # compared the argument with itself and so could never fail.
        supported = value == values

      if not supported:
        raise NotImplementedError(
            '%s=%s not supported for %s' % (key, value, name))

    # Split positional arguments into deferred expressions and constants.
    deferred_arg_indices = []
    deferred_arg_exprs = []
    constant_args = [None] * len(args)
    # Imported here rather than at module level.
    # NOTE(review): presumably to avoid a circular import with frames; confirm.
    from apache_beam.dataframe.frames import _DeferredIndex
    for ix, arg in enumerate(args):
      if isinstance(arg, DeferredBase):
        deferred_arg_indices.append(ix)
        deferred_arg_exprs.append(arg._expr)
      elif isinstance(arg, _DeferredIndex):
        # TODO(robertwb): Consider letting indices pass through as indices.
        # This would require updating the partitioning code, as indices don't
        # have indices.
        deferred_arg_indices.append(ix)
        deferred_arg_exprs.append(
            expressions.ComputedExpression(
                'index_as_series',
                lambda ix: ix.index.to_series(),  # yapf break
                [arg._frame._expr],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=partitionings.Arbitrary()))
      elif isinstance(arg, pd.core.generic.NDFrame):
        # Concrete pandas objects become constants; an empty slice of the
        # object is passed as the second argument.
        deferred_arg_indices.append(ix)
        deferred_arg_exprs.append(expressions.ConstantExpression(arg, arg[0:0]))
      else:
        constant_args[ix] = arg

    # Same split for keyword arguments.
    deferred_kwarg_keys = []
    deferred_kwarg_exprs = []
    constant_kwargs = {key: None for key in kwargs}
    for key, arg in kwargs.items():
      if isinstance(arg, DeferredBase):
        deferred_kwarg_keys.append(key)
        deferred_kwarg_exprs.append(arg._expr)
      elif isinstance(arg, pd.core.generic.NDFrame):
        deferred_kwarg_keys.append(key)
        deferred_kwarg_exprs.append(
            expressions.ConstantExpression(arg, arg[0:0]))
      else:
        constant_kwargs[key] = arg

    deferred_exprs = deferred_arg_exprs + deferred_kwarg_exprs

    if inplace:
      actual_func = _copy_and_mutate(func)
    else:
      actual_func = func

    def apply(*actual_args):
      # The values arrive in the order of ``deferred_exprs``: positional
      # arguments first, then keyword arguments.
      actual_args, actual_kwargs = (actual_args[:len(deferred_arg_exprs)],
                                    actual_args[len(deferred_arg_exprs):])

      full_args = list(constant_args)
      for ix, arg in zip(deferred_arg_indices, actual_args):
        full_args[ix] = arg

      full_kwargs = dict(constant_kwargs)
      for key, arg in zip(deferred_kwarg_keys, actual_kwargs):
        full_kwargs[key] = arg

      return actual_func(*full_args, **full_kwargs)

    if (requires_partition_by.is_subpartitioning_of(partitionings.Index()) and
        sum(isinstance(arg.proxy(), pd.core.generic.NDFrame)
            for arg in deferred_exprs) > 1):
      # Implicit join on index if there is more than one indexed input.
      actual_requires_partition_by = partitionings.JoinIndex()
    else:
      actual_requires_partition_by = requires_partition_by

    result_expr = expressions.ComputedExpression(
        name,
        apply,
        deferred_exprs,
        requires_partition_by=actual_requires_partition_by,
        preserves_partition_by=preserves_partition_by)
    if inplace:
      args[0]._expr = result_expr

    else:
      return DeferredFrame.wrap(result_expr)

  wrapper.__name__ = name
  if restrictions:
    wrapper.__doc__ = "\n".join(
        f"Only {kw}={value!r} is supported"
        for (kw, value) in restrictions.items())

  if base is not None:
    return with_docs_from(base)(wrapper)
  else:
    return wrapper


def _prettify_pandas_type(pandas_type):
  """Return a dotted display name for a pandas class or module.

  Raises:
    TypeError: If ``pandas_type`` is neither a class nor a module.
  """
  if pandas_type in (pd.DataFrame, pd.Series):
    return f'pandas.{pandas_type.__name__}'
  elif isclass(pandas_type):
    return f'{pandas_type.__module__}.{pandas_type.__name__}'
  elif ismodule(pandas_type):
    return pandas_type.__name__
  else:
    raise TypeError(pandas_type)


def wont_implement_method(base_type, name, reason=None, explanation=None):
  """Generate a stub method that raises WontImplementError.

  Note either reason or explanation must be specified. If both are specified,
  explanation is ignored.

  Args:
    base_type: The pandas type of the method that this is trying to replicate.
    name: The name of the method that this is aiming to replicate.
    reason: If specified, use data from the corresponding entry in
      ``_WONT_IMPLEMENT_REASONS`` to generate a helpful exception message
      and docstring for the method.
    explanation: If specified, use this string as an explanation for why
      this operation is not supported when generating an exception message
      and docstring.

  Raises:
    AssertionError: If ``reason`` is not a key of ``_WONT_IMPLEMENT_REASONS``.
    ValueError: If neither ``reason`` nor ``explanation`` is specified.
  """
  if reason is not None:
    if reason not in _WONT_IMPLEMENT_REASONS:
      raise AssertionError(
          f"reason must be one of {list(_WONT_IMPLEMENT_REASONS.keys())}, "
          f"got {reason!r}")
    reason_data = _WONT_IMPLEMENT_REASONS[reason]
  elif explanation is not None:
    reason_data = {'explanation': explanation}
  else:
    raise ValueError("One of (reason, explanation) must be specified")

  def wrapper(*args, **kwargs):
    raise WontImplementError(
        f"'{name}' is not yet supported {reason_data['explanation']}",
        reason=reason)

  wrapper.__name__ = name
  wrapper.__doc__ = (
      f":meth:`{_prettify_pandas_type(base_type)}.{name}` is not yet supported "
      f"in the Beam DataFrame API {reason_data['explanation']}")

  if 'url' in reason_data:
    wrapper.__doc__ += f"\n\n For more information see {reason_data['url']}."

  return wrapper


def not_implemented_method(op, issue='20318', base_type=None):
  """Generate a stub method for ``op`` that simply raises a NotImplementedError.

  ``issue`` is either a Jira key starting with ``BEAM-`` or a GitHub issue
  number; it selects the tracker URL mentioned in the error message and the
  docstring.

  For internal use only. No backwards compatibility guarantees."""
  assert base_type is not None, "base_type must be specified"
  # Jira issues live under /browse/. The URL must not end in punctuation
  # because it is also embedded in an RST hyperlink below.
  if issue.startswith("BEAM-"):
    issue_url = f"https://issues.apache.org/jira/browse/{issue}"
  else:
    issue_url = f"https://github.com/apache/beam/issues/{issue}"

  def wrapper(*args, **kwargs):
    raise NotImplementedError(
        f"{op!r} is not implemented yet. "
        f"If support for {op!r} is important to you, please let the Beam "
        "community know by writing to user@beam.apache.org "
        "(see https://beam.apache.org/community/contact-us/) or commenting on "
        f"{issue_url}")

  wrapper.__name__ = op
  wrapper.__doc__ = (
      f":meth:`{_prettify_pandas_type(base_type)}.{op}` is not implemented yet "
      "in the Beam DataFrame API.\n\n"
      f"If support for {op!r} is important to you, please let the Beam "
      "community know by `writing to user@beam.apache.org "
      "<https://beam.apache.org/community/contact-us/>`_ or commenting on "
      f"`{issue} <{issue_url}>`_.")

  return wrapper


def _copy_and_mutate(func):
  """Wrap a mutating ``func`` so it runs on a copy and returns that copy."""
  def wrapper(self, *args, **kwargs):
    copy = self.copy()
    func(copy, *args, **kwargs)
    return copy

  return wrapper


def maybe_inplace(func):
  """Handles the inplace= kwarg available in many pandas operations.

  This decorator produces a new function that handles the inplace kwarg. When
  `inplace=False`, the new function simply yields the result of `func`
  directly.

  When `inplace=True`, the output of `func` is used to replace this instance's
  expression. The result is that any operations applied to this instance after
  the inplace operation will reference the updated expression.

  For internal use only. No backwards compatibility guarantees."""
  @functools.wraps(func)
  def wrapper(self, inplace=False, **kwargs):
    result = func(self, **kwargs)
    if inplace:
      self._expr = result._expr
    else:
      return result

  return wrapper


def args_to_kwargs(base_type):
  """Convert all args to kwargs before calling the decorated function.

  When applied to a function, this decorator creates a new function
  that always calls the wrapped function with *only* keyword arguments. It
  inspects the argspec for the identically-named method on `base_type` to
  determine the name to use for arguments that are converted to keyword
  arguments.

  For internal use only. No backwards compatibility guarantees."""
  def wrap(func):
    arg_names = getfullargspec(unwrap(getattr(base_type, func.__name__))).args

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
      for name, value in zip(arg_names, args):
        if name in kwargs:
          raise TypeError(
              "%s() got multiple values for argument '%s'" %
              (func.__name__, name))
        kwargs[name] = value
      return func(**kwargs)

    return wrapper

  return wrap


BEAM_SPECIFIC = "Differences from pandas"

# Order in which sections are emitted by with_docs_from. Sections of the
# original docstring that are not listed here are left out.
SECTION_ORDER = [
    'Parameters',
    'Returns',
    'Raises',
    BEAM_SPECIFIC,
    'See Also',
    'Notes',
    'Examples'
]

EXAMPLES_DISCLAIMER = (
    "**NOTE:** These examples are pulled directly from the pandas "
    "documentation for convenience. Usage of the Beam DataFrame API will look "
    "different because it is a deferred API.")
EXAMPLES_DIFFERENCES = EXAMPLES_DISCLAIMER + (
    " In addition, some arguments shown here may not be supported, see "
    f"**{BEAM_SPECIFIC!r}** for details.")


def with_docs_from(base_type, name=None):
  """Decorator that updates the documentation from the wrapped function to
  duplicate the documentation from the identically-named method in `base_type`.

  Any docstring on the original function will be included in the new function
  under a "Differences from pandas" heading.

  Args:
    base_type: The object whose attribute supplies the documentation.
    name: Name of the attribute to read the documentation from. Defaults to
      the name of the decorated function.
  """
  def wrap(func):
    fn_name = name or func.__name__
    orig_doc = getattr(base_type, fn_name).__doc__
    if orig_doc is None:
      return func

    orig_doc = cleandoc(orig_doc)

    # Split on "<header>\n-----\n" section headings. The result alternates
    # between body text and captured headers, starting with the intro.
    section_splits = re.split(r'^(.*)$\n^-+$\n', orig_doc, flags=re.MULTILINE)
    intro = section_splits[0].strip()
    sections = dict(zip(section_splits[1::2], section_splits[2::2]))

    beam_has_differences = bool(func.__doc__)

    for header, content in sections.items():
      content = content.strip()

      # Replace references to version numbers so it's clear they reference
      # *pandas* versions
      content = re.sub(r'([Vv]ersion\s+[\d\.]+)', r'pandas \1', content)

      if header == "Examples":
        # NOTE(review): whitespace runs were collapsed in the mangled source;
        # the four-space indents in the two literals below are restored on
        # that assumption. Confirm against upstream.
        content = '\n\n'.join([
            (
                EXAMPLES_DIFFERENCES
                if beam_has_differences else EXAMPLES_DISCLAIMER),
            # Indent the examples under a doctest heading,
            # add skipif option. This makes sure our doctest
            # framework doesn't run these pandas tests.
            (".. doctest::\n"
             "    :skipif: True"),
            re.sub(r"^", "    ", content, flags=re.MULTILINE),
        ])
      else:
        content = content.replace('DataFrame', 'DeferredDataFrame').replace(
            'Series', 'DeferredSeries')
      sections[header] = content

    if beam_has_differences:
      sections[BEAM_SPECIFIC] = cleandoc(func.__doc__)
    else:
      sections[BEAM_SPECIFIC] = (
          "This operation has no known divergences from the "
          "pandas API.")

    def format_section(header):
      # Header, an underline of the same length, then the section body.
      return '\n'.join([header, '-' * len(header), sections[header]])

    func.__doc__ = '\n\n'.join([intro] + [
        format_section(header) for header in SECTION_ORDER if header in sections
    ])

    return func

  return wrap


def populate_defaults(base_type):
  """Populate default values for keyword arguments in decorated function.

  When applied to a function, this decorator creates a new function
  with default values for all keyword arguments, based on the default values
  for the identically-named method on `base_type`.

  For internal use only. No backwards compatibility guarantees."""
  def wrap(func):
    base_argspec = getfullargspec(unwrap(getattr(base_type, func.__name__)))
    if not base_argspec.defaults:
      return func

    # Defaults apply to the trailing arguments of the base method.
    arg_to_default = dict(
        zip(
            base_argspec.args[-len(base_argspec.defaults):],
            base_argspec.defaults))

    unwrapped_func = unwrap(func)
    # args that do not have defaults in func, but do have defaults in base
    func_argspec = getfullargspec(unwrapped_func)
    num_non_defaults = len(func_argspec.args) - len(func_argspec.defaults or ())
    defaults_to_populate = set(
        func_argspec.args[:num_non_defaults]).intersection(
            arg_to_default.keys())

    @functools.wraps(func)
    def wrapper(**kwargs):
      for name in defaults_to_populate:
        if name not in kwargs:
          kwargs[name] = arg_to_default[name]
      return func(**kwargs)

    return wrapper

  return wrap


# Maps a reason key to the explanation text, and optionally a URL, used by
# wont_implement_method and WontImplementError.
_WONT_IMPLEMENT_REASONS = {
    'order-sensitive': {
        'explanation': "because it is sensitive to the order of the data.",
        'url': 'https://s.apache.org/dataframe-order-sensitive-operations',
    },
    'non-deferred-columns': {
        'explanation': (
            "because the columns in the output DataFrame depend "
            "on the data."),
        'url': 'https://s.apache.org/dataframe-non-deferred-columns',
    },
    'non-deferred-result': {
        'explanation': (
            "because it produces an output type that is not "
            "deferred."),
        'url': 'https://s.apache.org/dataframe-non-deferred-result',
    },
    'plotting-tools': {
        'explanation': "because it is a plotting tool.",
        'url': 'https://s.apache.org/dataframe-plotting-tools',
    },
    'event-time-semantics': {
        'explanation': (
            "because implementing it would require integrating with Beam "
            "event-time semantics"),
        'url': 'https://s.apache.org/dataframe-event-time-semantics',
    },
    'deprecated': {
        'explanation': "because it is deprecated in pandas.",
    },
    'experimental': {
        'explanation': "because it is experimental in pandas.",
    },
}


class WontImplementError(NotImplementedError):
  """A subclass of NotImplementedError to raise indicating that implementing
  the given method is not planned.

  Raising this error will also prevent doctests from being validated
  when run with the beam dataframe validation doctest runner.
  """
  def __init__(self, msg, reason=None):
    if reason is not None:
      if reason not in _WONT_IMPLEMENT_REASONS:
        raise AssertionError(
            f"reason must be one of {list(_WONT_IMPLEMENT_REASONS.keys())}, "
            f"got {reason!r}")

      reason_data = _WONT_IMPLEMENT_REASONS[reason]
      if 'url' in reason_data:
        msg = f"{msg}\nFor more information see {reason_data['url']}."

    super().__init__(msg)