github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/schemas.py (about) 1 # 2 # Licensed to the Apache Software Foundation (ASF) under one or more 3 # contributor license agreements. See the NOTICE file distributed with 4 # this work for additional information regarding copyright ownership. 5 # The ASF licenses this file to You under the Apache License, Version 2.0 6 # (the "License"); you may not use this file except in compliance with 7 # the License. You may obtain a copy of the License at 8 # 9 # http://www.apache.org/licenses/LICENSE-2.0 10 # 11 # Unless required by applicable law or agreed to in writing, software 12 # distributed under the License is distributed on an "AS IS" BASIS, 13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 # See the License for the specific language governing permissions and 15 # limitations under the License. 16 # 17 18 """Utilities for relating schema-aware PCollections and DataFrame transforms. 19 20 The utilities here enforce the type mapping defined in 21 :mod:`apache_beam.typehints.pandas_type_compatibility`. 22 """ 23 24 # pytype: skip-file 25 26 import warnings 27 from typing import Any 28 from typing import Dict 29 from typing import NamedTuple 30 from typing import Optional 31 from typing import Sequence 32 from typing import Tuple 33 from typing import TypeVar 34 from typing import Union 35 36 import pandas as pd 37 38 import apache_beam as beam 39 from apache_beam import typehints 40 from apache_beam.transforms.util import BatchElements 41 from apache_beam.typehints.pandas_type_compatibility import INDEX_OPTION_NAME 42 from apache_beam.typehints.pandas_type_compatibility import create_pandas_batch_converter 43 from apache_beam.typehints.pandas_type_compatibility import dtype_from_typehint 44 from apache_beam.typehints.pandas_type_compatibility import dtype_to_fieldtype 45 from apache_beam.typehints.row_type import RowTypeConstraint 46 from apache_beam.typehints.schemas import named_fields_from_element_type 47 from apache_beam.typehints.typehints import normalize 48 49 __all__ = ( 50 'BatchRowsAsDataFrame', 51 'generate_proxy', 52 'UnbatchPandas', 53 'element_type_from_dataframe') 54 55 T = TypeVar('T', bound=NamedTuple) 56 57 58 @typehints.with_input_types(T) 59 @typehints.with_output_types(pd.DataFrame) 60 class BatchRowsAsDataFrame(beam.PTransform): 61 """A transform that batches schema-aware PCollection elements into DataFrames 62 63 Batching parameters are inherited from 64 :class:`~apache_beam.transforms.util.BatchElements`. 65 """ 66 def __init__(self, *args, proxy=None, **kwargs): 67 self._batch_elements_transform = BatchElements(*args, **kwargs) 68 self._proxy = proxy 69 70 def expand(self, pcoll): 71 if self._proxy is not None: 72 # Generate typehint 73 proxy = self._proxy 74 element_typehint = _element_typehint_from_proxy(proxy) 75 else: 76 # Generate proxy 77 proxy = generate_proxy(pcoll.element_type) 78 element_typehint = pcoll.element_type 79 80 converter = create_pandas_batch_converter( 81 element_type=element_typehint, batch_type=type(proxy)) 82 83 return ( 84 pcoll | self._batch_elements_transform 85 | beam.Map(converter.produce_batch)) 86 87 88 def generate_proxy(element_type): 89 # type: (type) -> pd.DataFrame 90 91 """Generate a proxy pandas object for the given PCollection element_type. 92 93 Currently only supports generating a DataFrame proxy from a schema-aware 94 PCollection or a Series proxy from a primitively typed PCollection. 95 """ 96 dtype = dtype_from_typehint(element_type) 97 if dtype is not object: 98 return pd.Series(dtype=dtype) 99 else: 100 fields = named_fields_from_element_type(element_type) 101 proxy = pd.DataFrame(columns=[name for name, _ in fields]) 102 for name, typehint in fields: 103 dtype = dtype_from_typehint(typehint) 104 proxy[name] = proxy[name].astype(dtype) 105 106 return proxy 107 108 109 def element_type_from_dataframe(proxy, include_indexes=False): 110 # type: (pd.DataFrame, bool) -> type 111 112 """Generate an element_type for an element-wise PCollection from a proxy 113 pandas object. Currently only supports converting the element_type for 114 a schema-aware PCollection to a proxy DataFrame. 115 116 Currently only supports generating a DataFrame proxy from a schema-aware 117 PCollection. 118 """ 119 return element_typehint_from_dataframe_proxy(proxy, include_indexes).user_type 120 121 122 def _element_typehint_from_proxy( 123 proxy: pd.core.generic.NDFrame, include_indexes: bool = False): 124 if isinstance(proxy, pd.DataFrame): 125 return element_typehint_from_dataframe_proxy( 126 proxy, include_indexes=include_indexes) 127 elif isinstance(proxy, pd.Series): 128 if include_indexes: 129 warnings.warn( 130 "include_indexes=True for a Series input. Note that this " 131 "parameter is _not_ respected for DeferredSeries " 132 "conversion.") 133 return dtype_to_fieldtype(proxy.dtype) 134 else: 135 raise TypeError(f"Proxy '{proxy}' has unsupported type '{type(proxy)}'") 136 137 138 def element_typehint_from_dataframe_proxy( 139 proxy: pd.DataFrame, include_indexes: bool = False) -> RowTypeConstraint: 140 141 output_columns = [] 142 if include_indexes: 143 remaining_index_names = list(proxy.index.names) 144 i = 0 145 while len(remaining_index_names): 146 index_name = remaining_index_names.pop(0) 147 if index_name is None: 148 raise ValueError( 149 "Encountered an unnamed index. Cannot convert to a " 150 "schema-aware PCollection with include_indexes=True. " 151 "Please name all indexes or consider not including " 152 "indexes.") 153 elif index_name in remaining_index_names: 154 raise ValueError( 155 "Encountered multiple indexes with the name '%s'. " 156 "Cannot convert to a schema-aware PCollection with " 157 "include_indexes=True. Please ensure all indexes have " 158 "unique names or consider not including indexes." % index_name) 159 elif index_name in proxy.columns: 160 raise ValueError( 161 "Encountered an index that has the same name as one " 162 "of the columns, '%s'. Cannot convert to a " 163 "schema-aware PCollection with include_indexes=True. " 164 "Please ensure all indexes have unique names or " 165 "consider not including indexes." % index_name) 166 else: 167 # its ok! 168 output_columns.append( 169 (index_name, proxy.index.get_level_values(i).dtype)) 170 i += 1 171 172 output_columns.extend(zip(proxy.columns, proxy.dtypes)) 173 174 fields = [(column, dtype_to_fieldtype(dtype)) 175 for (column, dtype) in output_columns] 176 field_options: Optional[Dict[str, Sequence[Tuple[str, Any]]]] 177 if include_indexes: 178 field_options = { 179 index_name: [(INDEX_OPTION_NAME, None)] 180 for index_name in proxy.index.names 181 } 182 else: 183 field_options = None 184 185 return RowTypeConstraint.from_fields(fields, field_options=field_options) 186 187 188 def _unbatch_transform(proxy, include_indexes): 189 element_typehint = normalize( 190 _element_typehint_from_proxy(proxy, include_indexes=include_indexes)) 191 192 converter = create_pandas_batch_converter( 193 element_type=element_typehint, batch_type=type(proxy)) 194 195 return beam.FlatMap( 196 converter.explode_batch).with_output_types(element_typehint) 197 198 199 @typehints.with_input_types(Union[pd.DataFrame, pd.Series]) 200 class UnbatchPandas(beam.PTransform): 201 """A transform that explodes a PCollection of DataFrame or Series. DataFrame 202 is converterd to a schema-aware PCollection, while Series is converted to its 203 underlying type. 204 205 Args: 206 include_indexes: (optional, default: False) When unbatching a DataFrame 207 if include_indexes=True, attempt to include index columns in the output 208 schema for expanded DataFrames. Raises an error if any of the index 209 levels are unnamed (name=None), or if any of the names are not unique 210 among all column and index names. 211 """ 212 def __init__(self, proxy, include_indexes=False): 213 self._proxy = proxy 214 self._include_indexes = include_indexes 215 216 def expand(self, pcoll): 217 return pcoll | _unbatch_transform(self._proxy, self._include_indexes)