#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""A module providing various functionality from the top-level pandas namespace.
"""

import re
from typing import Mapping

import pandas as pd

from apache_beam.dataframe import expressions
from apache_beam.dataframe import frame_base
from apache_beam.dataframe import partitionings


def _call_on_first_arg(name):
  """Build a static method that dispatches ``name`` on its first argument.

  If the first argument is a ``frame_base.DeferredBase``, the attribute
  ``name`` is looked up on that object and called with the remaining
  arguments. Otherwise ``pd.<name>`` is called with all arguments, the first
  argument included.
  """
  def wrapper(target, *args, **kwargs):
    if isinstance(target, frame_base.DeferredBase):
      return getattr(target, name)(*args, **kwargs)
    else:
      return getattr(pd, name)(target, *args, **kwargs)

  return staticmethod(wrapper)


def _maybe_wrap_constant_expr(res):
  """Wrap ``res`` as a deferred constant if its type has a deferred equivalent.

  Values whose exact type is a key of
  ``frame_base.DeferredBase._pandas_type_map`` are wrapped in a
  ``ConstantExpression``; any other value is returned unchanged.
  """
  if type(res) in frame_base.DeferredBase._pandas_type_map:
    # res[0:0] is an empty slice of the result, passed as the second argument
    # of ConstantExpression (presumably its proxy -- confirm in expressions).
    return frame_base.DeferredBase.wrap(
        expressions.ConstantExpression(res, res[0:0]))
  else:
    return res


def _defer_to_pandas(name):
  """Build a static method that calls ``pd.<name>`` and wraps its result.

  ``pd.<name>`` is resolved once, when this factory is called. The result of
  each call is passed through ``_maybe_wrap_constant_expr``.
  """
  func = getattr(pd, name)

  def wrapper(*args, **kwargs):
    res = func(*args, **kwargs)
    return _maybe_wrap_constant_expr(res)

  return staticmethod(wrapper)


def _defer_to_pandas_maybe_elementwise(name):
  """ Same as _defer_to_pandas, except it handles DeferredBase args, assuming
  the function can be processed elementwise. """
  func = getattr(pd, name)

  def wrapper(*args, **kwargs):
    # Any deferred positional or keyword argument routes the call through
    # frame_base._elementwise_function instead of calling pandas directly.
    if any(isinstance(arg, frame_base.DeferredBase)
           for arg in args + tuple(kwargs.values())):
      return frame_base._elementwise_function(func, name)(*args, **kwargs)

    res = func(*args, **kwargs)
    return _maybe_wrap_constant_expr(res)

  return staticmethod(wrapper)


def _is_top_level_function(o):
  """Return True if ``o`` looks like a public pandas top-level function.

  That is: ``o`` is callable, is not a class, has a ``__name__``, and that
  name starts with a lowercase ASCII letter.
  """
  return bool(
      callable(o) and not isinstance(o, type) and hasattr(o, '__name__') and
      re.match('[a-z].*', o.__name__))


class DeferredPandasModule(object):
  """Stand-in for the top-level ``pandas`` namespace.

  Explicitly listed functions are either delegated to pandas, dispatched to
  the matching method of a deferred first argument, or marked as
  won't-implement. Any other attribute is resolved lazily by ``__getattr__``.
  """
  array = _defer_to_pandas('array')
  bdate_range = _defer_to_pandas('bdate_range')

  @staticmethod
  @frame_base.args_to_kwargs(pd)
  @frame_base.populate_defaults(pd)
  def concat(
      objs,
      axis,
      join,
      ignore_index,
      keys,
      levels,
      names,
      verify_integrity,
      sort,
      copy):
    """Deferred counterpart of ``pd.concat``.

    Parameters mirror ``pd.concat``. ``objs`` may be an iterable or a mapping
    of deferred objects; with a mapping and ``keys=None`` the mapping's keys
    are used as ``keys``. ``None`` entries in ``objs`` are passed to pandas as
    constant ``None`` expressions. ``copy`` is accepted for signature
    compatibility and is not forwarded to pandas.

    Raises:
      NotImplementedError: if ``ignore_index`` or ``levels`` is truthy.
    """
    if ignore_index:
      raise NotImplementedError('concat(ignore_index)')
    if levels:
      raise NotImplementedError('concat(levels)')

    if isinstance(objs, Mapping):
      if keys is None:
        keys = list(objs.keys())
      objs = [objs[k] for k in keys]
    else:
      objs = list(objs)

    if keys is None:
      preserves_partitioning = partitionings.Arbitrary()
    else:
      # Index 0 will be a new index for keys, only partitioning by the original
      # indexes (1 to N) will be preserved.
      # None entries carry no index, so they are skipped here just as they are
      # tolerated when building the input expressions below.
      nlevels = min(
          o._expr.proxy().index.nlevels for o in objs if o is not None)
      preserves_partitioning = partitionings.Index(list(range(1, nlevels + 1)))

    deferred_none = expressions.ConstantExpression(None)
    exprs = [deferred_none if o is None else o._expr for o in objs]

    # Column-wise concatenation and integrity verification both require the
    # inputs to be partitioned by index; otherwise any partitioning will do.
    if axis in (1, 'columns') or verify_integrity:
      required_partitioning = partitionings.Index()
    else:
      required_partitioning = partitionings.Arbitrary()

    return frame_base.DeferredBase.wrap(
        expressions.ComputedExpression(
            'concat',
            lambda *frames: pd.concat(
                frames,
                axis=axis,
                join=join,
                ignore_index=ignore_index,
                keys=keys,
                levels=levels,
                names=names,
                verify_integrity=verify_integrity,
                sort=sort),  # yapf break
            exprs,
            requires_partition_by=required_partitioning,
            preserves_partition_by=preserves_partitioning))

  date_range = _defer_to_pandas('date_range')
  describe_option = _defer_to_pandas('describe_option')
  factorize = _call_on_first_arg('factorize')
  get_option = _defer_to_pandas('get_option')
  interval_range = _defer_to_pandas('interval_range')
  isna = _call_on_first_arg('isna')
  isnull = _call_on_first_arg('isnull')
  json_normalize = _defer_to_pandas('json_normalize')
  melt = _call_on_first_arg('melt')
  merge = _call_on_first_arg('merge')
  merge_ordered = frame_base.wont_implement_method(
      pd, 'merge_ordered', reason='order-sensitive')
  notna = _call_on_first_arg('notna')
  notnull = _call_on_first_arg('notnull')
  option_context = _defer_to_pandas('option_context')
  period_range = _defer_to_pandas('period_range')
  pivot = _call_on_first_arg('pivot')
  pivot_table = _call_on_first_arg('pivot_table')
  show_versions = _defer_to_pandas('show_versions')
  test = frame_base.wont_implement_method(
      pd,
      'test',
      explanation="because it is an internal pandas testing utility.")
  timedelta_range = _defer_to_pandas('timedelta_range')
  to_pickle = frame_base.wont_implement_method(
      pd, 'to_pickle', reason='order-sensitive')
  to_datetime = _defer_to_pandas_maybe_elementwise('to_datetime')

  def __getattr__(self, name):
    """Resolve attributes that are not defined explicitly on the class.

    ``read_*`` names yield a function that raises
    ``frame_base.WontImplementError`` pointing at
    ``apache_beam.dataframe.io``. Other names are looked up on ``pd``:
    top-level functions are replaced by
    ``frame_base.not_implemented_method`` and everything else is returned
    as-is. A name that pandas does not define propagates the
    ``AttributeError`` raised by ``getattr(pd, name)``.
    """
    if name.startswith('read_'):

      def func(*args, **kwargs):
        raise frame_base.WontImplementError(
            'Use p | apache_beam.dataframe.io.%s' % name)

      return func
    res = getattr(pd, name)
    if _is_top_level_function(res):
      return frame_base.not_implemented_method(name, base_type=pd)
    else:
      return res


pd_wrapper = DeferredPandasModule()