github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/pandas_top_level_functions.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  """A module providing various functionality from the top-level pandas namespace.
    18  """
    19  
    20  import re
    21  from typing import Mapping
    22  
    23  import pandas as pd
    24  
    25  from apache_beam.dataframe import expressions
    26  from apache_beam.dataframe import frame_base
    27  from apache_beam.dataframe import partitionings
    28  
    29  
    30  def _call_on_first_arg(name):
    31    def wrapper(target, *args, **kwargs):
    32      if isinstance(target, frame_base.DeferredBase):
    33        return getattr(target, name)(*args, **kwargs)
    34      else:
    35        return getattr(pd, name)(target, *args, **kwargs)
    36  
    37    return staticmethod(wrapper)
    38  
    39  
    40  def _maybe_wrap_constant_expr(res):
    41    if type(res) in frame_base.DeferredBase._pandas_type_map:
    42      return frame_base.DeferredBase.wrap(
    43          expressions.ConstantExpression(res, res[0:0]))
    44    else:
    45      return res
    46  
    47  
    48  def _defer_to_pandas(name):
    49    func = getattr(pd, name)
    50  
    51    def wrapper(*args, **kwargs):
    52      res = func(*args, **kwargs)
    53      return _maybe_wrap_constant_expr(res)
    54  
    55    return staticmethod(wrapper)
    56  
    57  
    58  def _defer_to_pandas_maybe_elementwise(name):
    59    """ Same as _defer_to_pandas, except it handles DeferredBase args, assuming
    60    the function can be processed elementwise. """
    61    func = getattr(pd, name)
    62  
    63    def wrapper(*args, **kwargs):
    64      if any(isinstance(arg, frame_base.DeferredBase)
    65             for arg in args + tuple(kwargs.values())):
    66        return frame_base._elementwise_function(func, name)(*args, **kwargs)
    67  
    68      res = func(*args, **kwargs)
    69      return _maybe_wrap_constant_expr(res)
    70  
    71    return staticmethod(wrapper)
    72  
    73  
    74  def _is_top_level_function(o):
    75    return (
    76        callable(o) and not isinstance(o, type) and hasattr(o, '__name__') and
    77        re.match('[a-z].*', o.__name__))
    78  
    79  
    80  class DeferredPandasModule(object):
    81    array = _defer_to_pandas('array')
    82    bdate_range = _defer_to_pandas('bdate_range')
    83  
    84    @staticmethod
    85    @frame_base.args_to_kwargs(pd)
    86    @frame_base.populate_defaults(pd)
    87    def concat(
    88        objs,
    89        axis,
    90        join,
    91        ignore_index,
    92        keys,
    93        levels,
    94        names,
    95        verify_integrity,
    96        sort,
    97        copy):
    98  
    99      if ignore_index:
   100        raise NotImplementedError('concat(ignore_index)')
   101      if levels:
   102        raise NotImplementedError('concat(levels)')
   103  
   104      if isinstance(objs, Mapping):
   105        if keys is None:
   106          keys = list(objs.keys())
   107        objs = [objs[k] for k in keys]
   108      else:
   109        objs = list(objs)
   110  
   111      if keys is None:
   112        preserves_partitioning = partitionings.Arbitrary()
   113      else:
   114        # Index 0 will be a new index for keys, only partitioning by the original
   115        # indexes (1 to N) will be preserved.
   116        nlevels = min(o._expr.proxy().index.nlevels for o in objs)
   117        preserves_partitioning = partitionings.Index(
   118            [i for i in range(1, nlevels + 1)])
   119  
   120      deferred_none = expressions.ConstantExpression(None)
   121      exprs = [deferred_none if o is None else o._expr for o in objs]
   122  
   123      if axis in (1, 'columns'):
   124        required_partitioning = partitionings.Index()
   125      elif verify_integrity:
   126        required_partitioning = partitionings.Index()
   127      else:
   128        required_partitioning = partitionings.Arbitrary()
   129  
   130      return frame_base.DeferredBase.wrap(
   131          expressions.ComputedExpression(
   132              'concat',
   133              lambda *objs: pd.concat(
   134                  objs,
   135                  axis=axis,
   136                  join=join,
   137                  ignore_index=ignore_index,
   138                  keys=keys,
   139                  levels=levels,
   140                  names=names,
   141                  verify_integrity=verify_integrity),  # yapf break
   142              exprs,
   143              requires_partition_by=required_partitioning,
   144              preserves_partition_by=preserves_partitioning))
   145  
   146    date_range = _defer_to_pandas('date_range')
   147    describe_option = _defer_to_pandas('describe_option')
   148    factorize = _call_on_first_arg('factorize')
   149    get_option = _defer_to_pandas('get_option')
   150    interval_range = _defer_to_pandas('interval_range')
   151    isna = _call_on_first_arg('isna')
   152    isnull = _call_on_first_arg('isnull')
   153    json_normalize = _defer_to_pandas('json_normalize')
   154    melt = _call_on_first_arg('melt')
   155    merge = _call_on_first_arg('merge')
   156    melt = _call_on_first_arg('melt')
   157    merge_ordered = frame_base.wont_implement_method(
   158        pd, 'merge_ordered', reason='order-sensitive')
   159    notna = _call_on_first_arg('notna')
   160    notnull = _call_on_first_arg('notnull')
   161    option_context = _defer_to_pandas('option_context')
   162    period_range = _defer_to_pandas('period_range')
   163    pivot = _call_on_first_arg('pivot')
   164    pivot_table = _call_on_first_arg('pivot_table')
   165    show_versions = _defer_to_pandas('show_versions')
   166    test = frame_base.wont_implement_method(
   167        pd,
   168        'test',
   169        explanation="because it is an internal pandas testing utility.")
   170    timedelta_range = _defer_to_pandas('timedelta_range')
   171    to_pickle = frame_base.wont_implement_method(
   172        pd, 'to_pickle', reason='order-sensitive')
   173    to_datetime = _defer_to_pandas_maybe_elementwise('to_datetime')
   174    notna = _call_on_first_arg('notna')
   175  
   176    def __getattr__(self, name):
   177      if name.startswith('read_'):
   178  
   179        def func(*args, **kwargs):
   180          raise frame_base.WontImplementError(
   181              'Use p | apache_beam.dataframe.io.%s' % name)
   182  
   183        return func
   184      res = getattr(pd, name)
   185      if _is_top_level_function(res):
   186        return frame_base.not_implemented_method(name, base_type=pd)
   187      else:
   188        return res
   189  
   190  
        # Module-level DeferredPandasModule instance.
   191  pd_wrapper = DeferredPandasModule()