github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/dataflow/dataflow_metrics.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
DataflowRunner implementation of MetricResults. It is in charge of
responding to queries of current metrics by going to the dataflow
service.
"""

# pytype: skip-file

import argparse
import logging
import numbers
import sys
from collections import defaultdict

from apache_beam.metrics.cells import DistributionData
from apache_beam.metrics.cells import DistributionResult
from apache_beam.metrics.execution import MetricKey
from apache_beam.metrics.execution import MetricResult
from apache_beam.metrics.metric import MetricResults
from apache_beam.metrics.metricbase import MetricName
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions

_LOGGER = logging.getLogger(__name__)


def _get_match(proto, filter_fn):
  """Finds and returns the first element that matches a query.

  If no element matches the query, it throws ValueError.
  If more than one element matches the query, it returns only the first.
  """
  query = [elm for elm in proto if filter_fn(elm)]
  if len(query) == 0:
    raise ValueError('Could not find element')
  elif len(query) > 1:
    raise ValueError('Too many matches')

  return query[0]


# V1b3 MetricStructuredName keys to accept and copy to the MetricKey labels.
STEP_LABEL = 'step'
STRUCTURED_NAME_LABELS = set(
    ['execution_step', 'original_name', 'output_user_name'])


class DataflowMetrics(MetricResults):
  """Implementation of MetricResults class for the Dataflow runner."""
  def __init__(self, dataflow_client=None, job_result=None, job_graph=None):
    """Initialize the Dataflow metrics object.

    Args:
      dataflow_client: apiclient.DataflowApplicationClient to interact with the
        dataflow service.
      job_result: DataflowPipelineResult with the state and id information of
        the job.
      job_graph: apiclient.Job instance to be able to translate between
        internal step names (e.g. "s2") and user step names (e.g. "split").
    """
    super().__init__()
    self._dataflow_client = dataflow_client
    self.job_result = job_result
    self._queried_after_termination = False
    self._cached_metrics = None
    self._job_graph = job_graph

  @staticmethod
  def _is_counter(metric_result):
    return isinstance(metric_result.attempted, numbers.Number)

  @staticmethod
  def _is_distribution(metric_result):
    return isinstance(metric_result.attempted, DistributionResult)

  def _translate_step_name(self, internal_name):
    """Translate internal step names (e.g. "s1") into user step names."""
    if not self._job_graph:
      raise ValueError(
          'Could not translate the internal step name %r since job graph is '
          'not available.' % internal_name)
    user_step_name = None
    if (self._job_graph and internal_name in
        self._job_graph.proto_pipeline.components.transforms.keys()):
      # Dataflow Runner v2 with portable job submission uses proto transform
      # map IDs for step names, and PTransform.unique_name maps to user step
      # names. Hence we look up user step names based on the proto.
      user_step_name = self._job_graph.proto_pipeline.components.transforms[
          internal_name].unique_name
    else:
      try:
        step = _get_match(
            self._job_graph.proto.steps, lambda x: x.name == internal_name)
        user_step_name = _get_match(
            step.properties.additionalProperties,
            lambda x: x.key == 'user_name').value.string_value
      except ValueError:
        pass  # Exception is handled below.
    if not user_step_name:
      raise ValueError(
          'Could not translate the internal step name %r.' % internal_name)
    return user_step_name

  def _get_metric_key(self, metric):
    """Populate the MetricKey object for a queried metric result."""
    step = ""
    name = metric.name.name  # Always extract a name.
    labels = {}
    try:  # Try to extract the user step name.
      # If ValueError is thrown within this try-block, it is because of
      # one of the following:
      # 1. Unable to translate the step name. This only happens with an
      #    improperly formatted job graph (unlikely), or when the step name is
      #    not the internal step name (only for unstructured-named metrics).
      # 2. Unable to unpack [step] or [namespace], which should only happen
      #    for unstructured names.
      step = _get_match(
          metric.name.context.additionalProperties,
          lambda x: x.key == STEP_LABEL).value
      step = self._translate_step_name(step)
    except ValueError:
      pass

    namespace = "dataflow/v1b3"  # Try to extract namespace or add a default.
    try:
      namespace = _get_match(
          metric.name.context.additionalProperties,
          lambda x: x.key == 'namespace').value
    except ValueError:
      pass

    for kv in metric.name.context.additionalProperties:
      if kv.key in STRUCTURED_NAME_LABELS:
        labels[kv.key] = kv.value
    # Package everything besides namespace and name into the labels as well,
    # including the unmodified step names, so that the exact values coming
    # from Dataflow are preserved for integrations.
    return MetricKey(step, MetricName(namespace, name), labels=labels)

  def _populate_metrics(self, response, result, user_metrics=False):
    """Move metrics from response to results as MetricResults."""
    if user_metrics:
      metrics = [
          metric for metric in response.metrics if metric.name.origin == 'user'
      ]
    else:
      metrics = [
          metric for metric in response.metrics
          if metric.name.origin == 'dataflow/v1b3'
      ]

    # Get the tentative/committed versions of every metric together.
    metrics_by_name = defaultdict(lambda: {})
    for metric in metrics:
      if (metric.name.name.endswith('_MIN') or
          metric.name.name.endswith('_MAX') or
          metric.name.name.endswith('_MEAN') or
          metric.name.name.endswith('_COUNT')):
        # The Dataflow Service presents distribution metrics in two ways:
        # One way is as a single distribution object with all its fields, and
        # another way is as four different scalar metrics labeled as _MIN,
        # _MAX, _MEAN, _COUNT.
        # TODO(pabloem) remove these when distributions are not being broken up
        #  in the service.
        # The second way is only useful for the UI, and should be ignored.
        continue
      is_tentative = [
          prop for prop in metric.name.context.additionalProperties
          if prop.key == 'tentative' and prop.value == 'true'
      ]
      tentative_or_committed = 'tentative' if is_tentative else 'committed'

      metric_key = self._get_metric_key(metric)
      if metric_key is None:
        continue
      metrics_by_name[metric_key][tentative_or_committed] = metric

    # Now we create the MetricResult elements.
    for metric_key, metric in metrics_by_name.items():
      attempted = self._get_metric_value(metric['tentative'])
      committed = self._get_metric_value(metric['committed'])
      result.append(
          MetricResult(metric_key, attempted=attempted, committed=committed))

  def _get_metric_value(self, metric):
    """Get a metric result object from a MetricUpdate from Dataflow API."""
    if metric is None:
      return None

    if metric.scalar is not None:
      return metric.scalar.integer_value
    elif metric.distribution is not None:
      dist_count = _get_match(
          metric.distribution.object_value.properties,
          lambda x: x.key == 'count').value.integer_value
      dist_min = _get_match(
          metric.distribution.object_value.properties,
          lambda x: x.key == 'min').value.integer_value
      dist_max = _get_match(
          metric.distribution.object_value.properties,
          lambda x: x.key == 'max').value.integer_value
      dist_sum = _get_match(
          metric.distribution.object_value.properties,
          lambda x: x.key == 'sum').value.integer_value
      if dist_sum is None:
        # Distribution metrics are not meant to be used on large values, but
        # if they are, the sum can overflow the integer_value range and be
        # reported as a double_value instead, in which case the correctness of
        # the value is not guaranteed.
        _LOGGER.info(
            "Distribution metric sum value seems to have "
            "overflowed integer_value range, the correctness of sum or mean "
            "value may not be guaranteed: %s" % metric.distribution)
        dist_sum = int(
            _get_match(
                metric.distribution.object_value.properties,
                lambda x: x.key == 'sum').value.double_value)
      return DistributionResult(
          DistributionData(dist_sum, dist_count, dist_min, dist_max))
    else:
      return None

  def _get_metrics_from_dataflow(self, job_id=None):
    """Return cached metrics or query the dataflow service."""
    if not job_id:
      try:
        job_id = self.job_result.job_id()
      except AttributeError:
        job_id = None
    if not job_id:
      raise ValueError('Can not query metrics. Job id is unknown.')

    if self._cached_metrics:
      return self._cached_metrics

    job_metrics = self._dataflow_client.get_job_metrics(job_id)
    # If the job has terminated, the metrics will no longer change,
    # so we can cache them.
    if self.job_result and self.job_result.is_in_terminal_state():
      self._cached_metrics = job_metrics
    return job_metrics

  def all_metrics(self, job_id=None):
    """Return all user and system metrics from the dataflow service."""
    metric_results = []
    response = self._get_metrics_from_dataflow(job_id=job_id)
    self._populate_metrics(response, metric_results, user_metrics=True)
    self._populate_metrics(response, metric_results, user_metrics=False)
    return metric_results

  def query(self, filter=None):
    metric_results = []
    response = self._get_metrics_from_dataflow()
    self._populate_metrics(response, metric_results, user_metrics=True)
    return {
        self.COUNTERS: [
            elm for elm in metric_results if self.matches(filter, elm.key) and
            DataflowMetrics._is_counter(elm)
        ],
        self.DISTRIBUTIONS: [
            elm for elm in metric_results if self.matches(filter, elm.key) and
            DataflowMetrics._is_distribution(elm)
        ],
        self.GAUGES: []
    }  # TODO(pabloem): Add Gauge support for dataflow.


def main(argv):
  """Print the metric results for the dataflow --job_id and --project.

  Instead of running an entire pipeline, which takes several minutes, use this
  main method to display MetricResults for a specific --job_id and --project,
  which takes only a few seconds.
  """
  # TODO(https://github.com/apache/beam/issues/19452): The MetricResults do not
  # show translated step names as the job_graph is not provided to
  # DataflowMetrics. Import here to avoid adding the dependency for local
  # running scenarios.
  try:
    # pylint: disable=wrong-import-order, wrong-import-position
    from apache_beam.runners.dataflow.internal import apiclient
  except ImportError:
    raise ImportError(
        'Google Cloud Dataflow runner not available, '
        'please install apache_beam[gcp]')
  if argv[0] == __file__:
    argv = argv[1:]
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '-j', '--job_id', type=str, help='The job id to query metrics for.')
  parser.add_argument(
      '-p',
      '--project',
      type=str,
      help='The project name to query metrics for.')
  flags = parser.parse_args(argv)

  # Get a Dataflow API client and set its project and job_id in the options.
  options = PipelineOptions()
  gcloud_options = options.view_as(GoogleCloudOptions)
  gcloud_options.project = flags.project
  dataflow_client = apiclient.DataflowApplicationClient(options)
  df_metrics = DataflowMetrics(dataflow_client)
  all_metrics = df_metrics.all_metrics(job_id=flags.job_id)
  _LOGGER.info(
      'Printing all MetricResults for %s in %s', flags.job_id, flags.project)
  for metric_result in all_metrics:
    _LOGGER.info(metric_result)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  main(sys.argv)
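

# ---------------------------------------------------------------------------
# Usage sketch (kept as comments; not part of the module above): the usual
# way DataflowMetrics is reached is indirectly, through the MetricResults of
# a pipeline that ran on the DataflowRunner. The counter name 'my_counter',
# the namespace 'my_namespace', and the placeholder project/region/bucket
# values below are illustrative assumptions, not values defined in this file.
#
#   import apache_beam as beam
#   from apache_beam.metrics import Metrics
#   from apache_beam.metrics.metric import MetricsFilter
#   from apache_beam.options.pipeline_options import PipelineOptions
#
#   class CountingFn(beam.DoFn):
#     def __init__(self):
#       self.counter = Metrics.counter('my_namespace', 'my_counter')
#
#     def process(self, element):
#       self.counter.inc()
#       yield element
#
#   options = PipelineOptions(
#       runner='DataflowRunner',
#       project='<PROJECT>',
#       region='<REGION>',
#       temp_location='gs://<BUCKET>/tmp')
#   with beam.Pipeline(options=options) as p:
#     _ = p | beam.Create([1, 2, 3]) | beam.ParDo(CountingFn())
#
#   # After the `with` block the job has been waited on; p.result is the
#   # DataflowPipelineResult, and p.result.metrics() is a DataflowMetrics
#   # instance whose query() returns dicts keyed by 'counters',
#   # 'distributions', and 'gauges'.
#   query = p.result.metrics().query(MetricsFilter().with_name('my_counter'))
#   for counter in query['counters']:
#     print(counter.key, counter.committed, counter.attempted)
#
# For an already-submitted job, the module's main() can instead be invoked
# directly with a job id and project, as described in its docstring:
#
#   python -m apache_beam.runners.dataflow.dataflow_metrics \
#       --job_id <JOB_ID> --project <PROJECT>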