github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/schemas.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

    18  """Utilities for relating schema-aware PCollections and DataFrame transforms.
    19  
    20  The utilities here enforce the type mapping defined in
    21  :mod:`apache_beam.typehints.pandas_type_compatibility`.
    22  """

# pytype: skip-file

import warnings
from typing import Any
from typing import Dict
from typing import NamedTuple
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import TypeVar
from typing import Union

import pandas as pd

import apache_beam as beam
from apache_beam import typehints
from apache_beam.transforms.util import BatchElements
from apache_beam.typehints.pandas_type_compatibility import INDEX_OPTION_NAME
from apache_beam.typehints.pandas_type_compatibility import create_pandas_batch_converter
from apache_beam.typehints.pandas_type_compatibility import dtype_from_typehint
from apache_beam.typehints.pandas_type_compatibility import dtype_to_fieldtype
from apache_beam.typehints.row_type import RowTypeConstraint
from apache_beam.typehints.schemas import named_fields_from_element_type
from apache_beam.typehints.typehints import normalize

__all__ = (
    'BatchRowsAsDataFrame',
    'generate_proxy',
    'UnbatchPandas',
    'element_type_from_dataframe')

T = TypeVar('T', bound=NamedTuple)


@typehints.with_input_types(T)
@typehints.with_output_types(pd.DataFrame)
class BatchRowsAsDataFrame(beam.PTransform):
    61    """A transform that batches schema-aware PCollection elements into DataFrames
    62  
    63    Batching parameters are inherited from
    64    :class:`~apache_beam.transforms.util.BatchElements`.
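
  Example (an illustrative sketch; assumes an existing pipeline ``p``)::

    rows = p | beam.Create(
        [beam.Row(word='pie', count=3), beam.Row(word='cake', count=5)])
    dfs = rows | BatchRowsAsDataFrame(min_batch_size=2)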
    65    """
    66    def __init__(self, *args, proxy=None, **kwargs):
    67      self._batch_elements_transform = BatchElements(*args, **kwargs)
    68      self._proxy = proxy
    69  
  def expand(self, pcoll):
    if self._proxy is not None:
      # A proxy was provided; derive the element typehint from it.
      proxy = self._proxy
      element_typehint = _element_typehint_from_proxy(proxy)
    else:
      # No proxy was provided; generate one from the element type.
      proxy = generate_proxy(pcoll.element_type)
      element_typehint = pcoll.element_type

    converter = create_pandas_batch_converter(
        element_type=element_typehint, batch_type=type(proxy))

    return (
        pcoll | self._batch_elements_transform
        | beam.Map(converter.produce_batch))


def generate_proxy(element_type):
  # type: (type) -> Union[pd.DataFrame, pd.Series]

  """Generate a proxy pandas object for the given PCollection element_type.

  Currently only supports generating a DataFrame proxy from a schema-aware
  PCollection or a Series proxy from a primitively typed PCollection.
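
  For example (an illustrative sketch)::

    proxy = generate_proxy(
        RowTypeConstraint.from_fields([('word', str), ('count', int)]))
    # proxy is an empty DataFrame whose column dtypes match the schema.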
    95    """
    96    dtype = dtype_from_typehint(element_type)
    97    if dtype is not object:
    98      return pd.Series(dtype=dtype)
    99    else:
   100      fields = named_fields_from_element_type(element_type)
   101      proxy = pd.DataFrame(columns=[name for name, _ in fields])
   102      for name, typehint in fields:
   103        dtype = dtype_from_typehint(typehint)
   104        proxy[name] = proxy[name].astype(dtype)
   105  
   106      return proxy
   107  
   108  
def element_type_from_dataframe(proxy, include_indexes=False):
  # type: (pd.DataFrame, bool) -> type

   112    """Generate an element_type for an element-wise PCollection from a proxy
   113    pandas object. Currently only supports converting the element_type for
   114    a schema-aware PCollection to a proxy DataFrame.
   115  
   116    Currently only supports generating a DataFrame proxy from a schema-aware
   117    PCollection.
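
  For example (an illustrative sketch)::

    df = pd.DataFrame({'word': ['pie', 'cake'], 'count': [3, 5]})
    row_type = element_type_from_dataframe(df)
    # row_type is a generated NamedTuple-like type with one field per
    # column, derived from the column dtypes.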
   118    """
   119    return element_typehint_from_dataframe_proxy(proxy, include_indexes).user_type
   120  
   121  
def _element_typehint_from_proxy(
    proxy: pd.core.generic.NDFrame, include_indexes: bool = False):
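  """Derive an element typehint from a proxy DataFrame or Series.

  Note that include_indexes is only respected for DataFrame proxies.
  """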
  if isinstance(proxy, pd.DataFrame):
    return element_typehint_from_dataframe_proxy(
        proxy, include_indexes=include_indexes)
  elif isinstance(proxy, pd.Series):
    if include_indexes:
      warnings.warn(
          "include_indexes=True for a Series input. Note that this "
          "parameter is _not_ respected for DeferredSeries "
          "conversion.")
    return dtype_to_fieldtype(proxy.dtype)
  else:
    raise TypeError(f"Proxy '{proxy}' has unsupported type '{type(proxy)}'")


def element_typehint_from_dataframe_proxy(
    proxy: pd.DataFrame, include_indexes: bool = False) -> RowTypeConstraint:
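  """Generate a RowTypeConstraint for the given proxy DataFrame.

  If include_indexes is True, named index levels are included as schema
  fields annotated with the index field option.
  """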

  output_columns = []
  if include_indexes:
    remaining_index_names = list(proxy.index.names)
    i = 0
    while remaining_index_names:
      index_name = remaining_index_names.pop(0)
      if index_name is None:
        raise ValueError(
            "Encountered an unnamed index. Cannot convert to a "
            "schema-aware PCollection with include_indexes=True. "
            "Please name all indexes or consider not including "
            "indexes.")
      elif index_name in remaining_index_names:
        raise ValueError(
            "Encountered multiple indexes with the name '%s'. "
            "Cannot convert to a schema-aware PCollection with "
            "include_indexes=True. Please ensure all indexes have "
            "unique names or consider not including indexes." % index_name)
      elif index_name in proxy.columns:
        raise ValueError(
            "Encountered an index that has the same name as one "
            "of the columns, '%s'. Cannot convert to a "
            "schema-aware PCollection with include_indexes=True. "
            "Please ensure all indexes have unique names or "
            "consider not including indexes." % index_name)
      else:
        # The index name is valid; include the level in the output schema.
        output_columns.append(
            (index_name, proxy.index.get_level_values(i).dtype))
        i += 1

  output_columns.extend(zip(proxy.columns, proxy.dtypes))

  fields = [(column, dtype_to_fieldtype(dtype))
            for (column, dtype) in output_columns]
  field_options: Optional[Dict[str, Sequence[Tuple[str, Any]]]]
  if include_indexes:
    field_options = {
        index_name: [(INDEX_OPTION_NAME, None)]
        for index_name in proxy.index.names
    }
  else:
    field_options = None

  return RowTypeConstraint.from_fields(fields, field_options=field_options)


def _unbatch_transform(proxy, include_indexes):
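  """Return a FlatMap that explodes batches matching proxy into elements."""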
  element_typehint = normalize(
      _element_typehint_from_proxy(proxy, include_indexes=include_indexes))

  converter = create_pandas_batch_converter(
      element_type=element_typehint, batch_type=type(proxy))

  return beam.FlatMap(
      converter.explode_batch).with_output_types(element_typehint)


@typehints.with_input_types(Union[pd.DataFrame, pd.Series])
class UnbatchPandas(beam.PTransform):
   201    """A transform that explodes a PCollection of DataFrame or Series. DataFrame
   202    is converterd to a schema-aware PCollection, while Series is converted to its
   203    underlying type.

  Args:
    include_indexes: (optional, default: False) When unbatching a DataFrame,
        attempt to include the index columns in the output schema. Raises an
        error if any of the index levels are unnamed (name=None), or if any
        of the names are not unique among all column and index names.
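
  Example (an illustrative sketch; assumes ``dfs`` is a PCollection of
  DataFrames matching ``proxy``)::

    rows = dfs | UnbatchPandas(proxy)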
   211    """
   212    def __init__(self, proxy, include_indexes=False):
   213      self._proxy = proxy
   214      self._include_indexes = include_indexes
   215  
   216    def expand(self, pcoll):
   217      return pcoll | _unbatch_transform(self._proxy, self._include_indexes)