github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/transforms/display.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """
    19  :class:`DisplayData`, its classes, interfaces and methods.
    20  
    21  The classes in this module allow users and transform developers to define
    22  static display data to be displayed when a pipeline runs.
    23  :class:`~apache_beam.transforms.ptransform.PTransform` s,
    24  :class:`~apache_beam.transforms.core.DoFn` s
    25  and other pipeline components are subclasses of the :class:`HasDisplayData`
    26  mixin. To add static display data to a component, you can override the
    27  :meth:`HasDisplayData.display_data()` method.
    28  
    29  Available classes:
    30  
    31  * :class:`HasDisplayData` - Components that inherit from this class can have
    32    static display data shown in the UI.
    33  * :class:`DisplayDataItem` - This class represents static display data
    34    elements.
    35  * :class:`DisplayData` - Internal class that is used to create display data
    36    and communicate it to the API.
    37  """
    38  
    39  # pytype: skip-file
    40  
    41  import calendar
    42  import inspect
    43  import json
    44  from datetime import datetime
    45  from datetime import timedelta
    46  from typing import TYPE_CHECKING
    47  from typing import List
    48  
    49  from apache_beam.portability import common_urns
    50  from apache_beam.portability.api import beam_runner_api_pb2
    51  
    52  if TYPE_CHECKING:
    53    from apache_beam.options.pipeline_options import PipelineOptions
    54  
    55  __all__ = ['HasDisplayData', 'DisplayDataItem', 'DisplayData']
    56  
    57  
    58  class HasDisplayData(object):
    59    """ Basic mixin for elements that contain display data.
    60  
    61    It implements only the display_data method and a
    62    _get_display_data_namespace method.
    63    """
    64    def display_data(self):
    65      # type: () -> dict
    66  
    67      """ Returns the display data associated to a pipeline component.
    68  
    69      It should be reimplemented in pipeline components that wish to have
    70      static display data.
    71  
    72      Returns:
    73        Dict[str, Any]: A dictionary containing ``key:value`` pairs.
    74        The value might be an integer, float or string value; a
    75        :class:`DisplayDataItem` for values that have more data
    76        (e.g. short value, label, url); or a :class:`HasDisplayData` instance
    77        that has more display data that should be picked up. For example::
    78  
    79          {
    80            'key1': 'string_value',
    81            'key2': 1234,
    82            'key3': 3.14159265,
    83            'key4': DisplayDataItem('apache.org', url='http://apache.org'),
    84            'key5': subComponent
    85          }
    86      """
    87      return {}
    88  
    89    def _get_display_data_namespace(self):
    90      # type: () -> str
    91      return '{}.{}'.format(self.__module__, self.__class__.__name__)
    92  
    93  
    94  class DisplayData(object):
    95    """ Static display data associated with a pipeline component.
    96    """
    97    def __init__(
    98        self,
    99        namespace,  # type: str
   100        display_data_dict  # type: dict
   101    ):
   102      # type: (...) -> None
   103      self.namespace = namespace
   104      self.items = []  # type: List[DisplayDataItem]
   105      self._populate_items(display_data_dict)
   106  
   107    def _populate_items(self, display_data_dict):
   108      """ Populates the list of display data items.
   109      """
   110      for key, element in display_data_dict.items():
   111        if isinstance(element, HasDisplayData):
   112          subcomponent_display_data = DisplayData(
   113              element._get_display_data_namespace(), element.display_data())
   114          self.items += subcomponent_display_data.items
   115          continue
   116  
   117        if isinstance(element, DisplayDataItem):
   118          if element.should_drop():
   119            continue
   120          element.key = key
   121          element.namespace = self.namespace
   122          self.items.append(element)
   123          continue
   124  
   125        # If it's not a HasDisplayData element,
   126        # nor a dictionary, then it's a simple value
   127        self.items.append(
   128            DisplayDataItem(element, namespace=self.namespace, key=key))
   129  
   130    def to_proto(self):
   131      # type: (...) -> List[beam_runner_api_pb2.DisplayData]
   132  
   133      """Returns a List of Beam proto representation of Display data."""
   134      def create_payload(dd):
   135        display_data_dict = None
   136        try:
   137          display_data_dict = dd.get_dict()
   138        except ValueError:
   139          # Skip if the display data is invalid.
   140          return None
   141  
   142        # We use 'label' or 'key' properties to populate the 'label' attribute of
   143        # 'LabelledPayload'. 'label' is a better choice since it's expected to be
   144        # more human readable but some transforms, sources, etc. may not set a
   145        # 'label' property when configuring DisplayData.
   146        label = (
   147            display_data_dict['label']
   148            if 'label' in display_data_dict else display_data_dict['key'])
   149  
   150        value = display_data_dict['value']
   151        if isinstance(value, str):
   152          return beam_runner_api_pb2.LabelledPayload(
   153              label=label,
   154              string_value=value,
   155              key=display_data_dict['key'],
   156              namespace=display_data_dict.get('namespace', ''))
   157        elif isinstance(value, bool):
   158          return beam_runner_api_pb2.LabelledPayload(
   159              label=label,
   160              bool_value=value,
   161              key=display_data_dict['key'],
   162              namespace=display_data_dict.get('namespace', ''))
   163        elif isinstance(value, int):
   164          return beam_runner_api_pb2.LabelledPayload(
   165              label=label,
   166              int_value=value,
   167              key=display_data_dict['key'],
   168              namespace=display_data_dict.get('namespace', ''))
   169        elif isinstance(value, (float, complex)):
   170          return beam_runner_api_pb2.LabelledPayload(
   171              label=label,
   172              double_value=value,
   173              key=display_data_dict['key'],
   174              namespace=display_data_dict.get('namespace', ''))
   175        else:
   176          raise ValueError(
   177              'Unsupported type %s for value of display data %s' %
   178              (type(value), label))
   179  
   180      dd_protos = []
   181      for dd in self.items:
   182        dd_proto = create_payload(dd)
   183        if dd_proto:
   184          dd_protos.append(
   185              beam_runner_api_pb2.DisplayData(
   186                  urn=common_urns.StandardDisplayData.DisplayData.LABELLED.urn,
   187                  payload=create_payload(dd).SerializeToString()))
   188      return dd_protos
   189  
   190    @classmethod
   191    def create_from_options(cls, pipeline_options):
   192      """ Creates :class:`~apache_beam.transforms.display.DisplayData` from a
   193      :class:`~apache_beam.options.pipeline_options.PipelineOptions` instance.
   194  
   195      When creating :class:`~apache_beam.transforms.display.DisplayData`, this
   196      method will convert the value of any item of a non-supported type to its
   197      string representation.
   198      The normal :meth:`.create_from()` method rejects those items.
   199  
   200      Returns:
   201        ~apache_beam.transforms.display.DisplayData:
   202          A :class:`~apache_beam.transforms.display.DisplayData` instance with
   203          populated items.
   204  
   205      Raises:
   206        ValueError: If the **has_display_data** argument is
   207          not an instance of :class:`HasDisplayData`.
   208      """
   209      from apache_beam.options.pipeline_options import PipelineOptions
   210      if not isinstance(pipeline_options, PipelineOptions):
   211        raise ValueError(
   212            'Element of class {}.{} does not subclass PipelineOptions'.format(
   213                pipeline_options.__module__, pipeline_options.__class__.__name__))
   214  
   215      items = {
   216          k: (v if DisplayDataItem._get_value_type(v) is not None else str(v))
   217          for k,
   218          v in pipeline_options.display_data().items()
   219      }
   220      return cls(pipeline_options._get_display_data_namespace(), items)
   221  
   222    @classmethod
   223    def create_from(cls, has_display_data):
   224      """ Creates :class:`~apache_beam.transforms.display.DisplayData` from a
   225      :class:`HasDisplayData` instance.
   226  
   227      Returns:
   228        ~apache_beam.transforms.display.DisplayData:
   229          A :class:`~apache_beam.transforms.display.DisplayData` instance with
   230          populated items.
   231  
   232      Raises:
   233        ValueError: If the **has_display_data** argument is
   234          not an instance of :class:`HasDisplayData`.
   235      """
   236      if not isinstance(has_display_data, HasDisplayData):
   237        raise ValueError(
   238            'Element of class {}.{} does not subclass HasDisplayData'.format(
   239                has_display_data.__module__, has_display_data.__class__.__name__))
   240      return cls(
   241          has_display_data._get_display_data_namespace(),
   242          has_display_data.display_data())
   243  
   244  
   245  class DisplayDataItem(object):
   246    """ A DisplayDataItem represents a unit of static display data.
   247  
   248    Each item is identified by a key and the namespace of the component the
   249    display item belongs to.
   250    """
   251    typeDict = {
   252        str: 'STRING',
   253        int: 'INTEGER',
   254        float: 'FLOAT',
   255        bool: 'BOOLEAN',
   256        timedelta: 'DURATION',
   257        datetime: 'TIMESTAMP'
   258    }
   259  
   260    def __init__(
   261        self,
   262        value,
   263        url=None,
   264        label=None,
   265        namespace=None,
   266        key=None,
   267        shortValue=None):
   268      self.namespace = namespace
   269      self.key = key
   270      self.type = self._get_value_type(value)
   271      self.shortValue = (
   272          shortValue if shortValue is not None else self._get_short_value(
   273              value, self.type))
   274      self.value = value
   275      self.url = url
   276      self.label = label
   277      self._drop_if_none = False
   278      self._drop_if_default = False
   279  
   280    def drop_if_none(self):
   281      # type: () -> DisplayDataItem
   282  
   283      """ The item should be dropped if its value is None.
   284  
   285      Returns:
   286        Returns self.
   287      """
   288      self._drop_if_none = True
   289      return self
   290  
   291    def drop_if_default(self, default):
   292      # type: (...) -> DisplayDataItem
   293  
   294      """ The item should be dropped if its value is equal to its default.
   295  
   296      Returns:
   297        Returns self.
   298      """
   299      self._default = default
   300      self._drop_if_default = True
   301      return self
   302  
   303    def should_drop(self):
   304      # type: () -> bool
   305  
   306      """ Return True if the item should be dropped, or False if it should not
   307      be dropped. This depends on the drop_if_none, and drop_if_default calls.
   308  
   309      Returns:
   310        True or False; depending on whether the item should be dropped or kept.
   311      """
   312      if self._drop_if_none and self.value is None:
   313        return True
   314      if self._drop_if_default and self.value == self._default:
   315        return True
   316      return False
   317  
   318    def is_valid(self):
   319      # type: () -> None
   320  
   321      """ Checks that all the necessary fields of the :class:`DisplayDataItem`
   322      are filled in. It checks that neither key, namespace, value or type are
   323      :data:`None`.
   324  
   325      Raises:
   326        ValueError: If the item does not have a key, namespace,
   327          value or type.
   328      """
   329      if self.key is None:
   330        raise ValueError(
   331            'Invalid DisplayDataItem %s. Key must not be None.' % self)
   332      if self.namespace is None:
   333        raise ValueError(
   334            'Invalid DisplayDataItem %s. Namespace must not be None' % self)
   335      if self.value is None:
   336        raise ValueError(
   337            'Invalid DisplayDataItem %s. Value must not be None' % self)
   338      if self.type is None:
   339        raise ValueError(
   340            'Invalid DisplayDataItem. Value {} is of an unsupported type.'.format(
   341                self.value))
   342  
   343    def _get_dict(self):
   344      res = {
   345          'key': self.key,
   346          'namespace': self.namespace,
   347          'type': self.type if self.type != 'CLASS' else 'STRING'
   348      }
   349      # TODO: Python Class types should not be special-cased once
   350      # the Fn API is in.
   351      if self.url is not None:
   352        res['url'] = self.url
   353      if self.shortValue is not None:
   354        res['shortValue'] = self.shortValue
   355      if self.label is not None:
   356        res['label'] = self.label
   357      res['value'] = self._format_value(self.value, self.type)
   358      return res
   359  
   360    def get_dict(self):
   361      # type: () -> dict
   362  
   363      """ Returns the internal-API dictionary representing the
   364      :class:`DisplayDataItem`.
   365  
   366      Returns:
   367        Dict[str, Any]: A dictionary. The internal-API dictionary representing
   368        the :class:`DisplayDataItem`.
   369  
   370      Raises:
   371        ValueError: if the item is not valid.
   372      """
   373      self.is_valid()
   374      return self._get_dict()
   375  
   376    def __repr__(self):
   377      return 'DisplayDataItem({})'.format(json.dumps(self._get_dict()))
   378  
   379    def __eq__(self, other):
   380      if isinstance(other, self.__class__):
   381        return self._get_dict() == other._get_dict()
   382      return False
   383  
   384    def __hash__(self):
   385      return hash(tuple(sorted(self._get_dict().items())))
   386  
   387    @classmethod
   388    def _format_value(cls, value, type_):
   389      """ Returns the API representation of a value given its type.
   390  
   391      Args:
   392        value: The value of the item that needs to be shortened.
   393        type_(string): The type of the value.
   394  
   395      Returns:
   396        A formatted value in the form of a float, int, or string.
   397      """
   398      res = value
   399      if type_ == 'CLASS':
   400        res = '{}.{}'.format(value.__module__, value.__name__)
   401      elif type_ == 'DURATION':
   402        res = value.total_seconds() * 1000
   403      elif type_ == 'TIMESTAMP':
   404        res = calendar.timegm(
   405            value.timetuple()) * 1000 + value.microsecond // 1000
   406      return res
   407  
   408    @classmethod
   409    def _get_short_value(cls, value, type_):
   410      """ Calculates the short value for an item.
   411  
   412      Args:
   413        value: The value of the item that needs to be shortened.
   414        type_(string): The type of the value.
   415  
   416      Returns:
   417        The unqualified name of a class if type_ is 'CLASS'. None otherwise.
   418      """
   419      if type_ == 'CLASS':
   420        return value.__name__
   421      return None
   422  
   423    @classmethod
   424    def _get_value_type(cls, value):
   425      """ Infers the type of a given value.
   426  
   427      Args:
   428        value: The value whose type needs to be inferred. For 'DURATION' and
   429          'TIMESTAMP', the corresponding Python type is datetime.timedelta and
   430          datetime.datetime respectively. For Python classes, the API type is
   431          just 'STRING' at the moment.
   432  
   433      Returns:
   434        One of 'STRING', 'INTEGER', 'FLOAT', 'CLASS', 'DURATION', or
   435        'TIMESTAMP', depending on the type of the value.
   436      """
   437      #TODO: Fix Args: documentation once the Python classes handling has changed
   438      type_ = cls.typeDict.get(type(value))
   439      if type_ is None:
   440        type_ = 'CLASS' if inspect.isclass(value) else None
   441      if type_ is None and value is None:
   442        type_ = 'STRING'
   443      return type_