github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/dataflow/dataflow_exercise_metrics_pipeline.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
"""A pipeline that exercises user and system metrics, plus matchers for them."""
    19  
    20  # pytype: skip-file
    21  
    22  import time
    23  
    24  from hamcrest.library.number.ordering_comparison import greater_than
    25  
    26  import apache_beam as beam
    27  from apache_beam.metrics import Metrics
    28  from apache_beam.testing.metric_result_matchers import DistributionMatcher
    29  from apache_beam.testing.metric_result_matchers import MetricResultMatcher
    30  
    31  SLEEP_TIME_SECS = 1
    32  INPUT = [0, 0, 0, 100]
    33  METRIC_NAMESPACE = (
    34      'apache_beam.runners.dataflow.'
    35      'dataflow_exercise_metrics_pipeline.UserMetricsDoFn')
    36  
    37  
    38  def metric_matchers():
    39    """MetricResult matchers common to all tests."""
    40    # TODO(ajamato): Matcher for the 'metrics' step's ElementCount.
    41    # TODO(ajamato): Matcher for the 'metrics' step's MeanByteCount.
    42    # TODO(ajamato): Matcher for the start and finish exec times.
    43    # TODO(ajamato): Matcher for a gauge metric once implemented in dataflow.
    44    matchers = [
    45        # User Counter Metrics.
    46        MetricResultMatcher(
    47            name='total_values',
    48            namespace=METRIC_NAMESPACE,
    49            step='metrics',
    50            attempted=sum(INPUT),
    51            committed=sum(INPUT)),
    52        MetricResultMatcher(
    53            name='ExecutionTime_StartBundle',
    54            step='metrics',
    55            attempted=greater_than(0),
    56            committed=greater_than(0)),
    57        MetricResultMatcher(
    58            name='ExecutionTime_ProcessElement',
    59            step='metrics',
    60            attempted=greater_than(0),
    61            committed=greater_than(0)),
    62        MetricResultMatcher(
    63            name='ExecutionTime_FinishBundle',
    64            step='metrics',
    65            attempted=greater_than(0),
    66            committed=greater_than(0)),
    67        MetricResultMatcher(
    68            name='distribution_values',
    69            namespace=METRIC_NAMESPACE,
    70            step='metrics',
    71            attempted=DistributionMatcher(
    72                sum_value=sum(INPUT),
    73                count_value=len(INPUT),
    74                min_value=min(INPUT),
    75                max_value=max(INPUT)),
    76            committed=DistributionMatcher(
    77                sum_value=sum(INPUT),
    78                count_value=len(INPUT),
    79                min_value=min(INPUT),
    80                max_value=max(INPUT)),
    81        ),
    82        # Element count and MeanByteCount for a User ParDo.
    83        MetricResultMatcher(
    84            name='ElementCount',
    85            labels={
    86                'output_user_name': 'metrics-out0',
    87                'original_name': 'metrics-out0-ElementCount'
    88            },
    89            attempted=greater_than(0),
    90            committed=greater_than(0)),
    91        MetricResultMatcher(
    92            name='MeanByteCount',
    93            labels={
    94                'output_user_name': 'metrics-out0',
    95                'original_name': 'metrics-out0-MeanByteCount'
    96            },
    97            attempted=greater_than(0),
    98            committed=greater_than(0))
    99    ]
   100  
   101    pcoll_names = [
   102        'GroupByKey/Reify-out0',
   103        'GroupByKey/Read-out0',
   104        'map_to_common_key-out0',
   105        'GroupByKey/GroupByWindow-out0',
   106        'GroupByKey/Read-out0',
   107        'GroupByKey/Reify-out0'
   108    ]
   109    for name in pcoll_names:
   110      matchers.extend([
   111          MetricResultMatcher(
   112              name='ElementCount',
   113              labels={
   114                  'output_user_name': name,
   115                  'original_name': '%s-ElementCount' % name
   116              },
   117              attempted=greater_than(0),
   118              committed=greater_than(0)),
   119          MetricResultMatcher(
   120              name='MeanByteCount',
   121              labels={
   122                  'output_user_name': name,
   123                  'original_name': '%s-MeanByteCount' % name
   124              },
   125              attempted=greater_than(0),
   126              committed=greater_than(0)),
   127      ])
   128    return matchers
   129  
   130  
   131  class UserMetricsDoFn(beam.DoFn):
   132    """Parse each line of input text into words."""
   133    def __init__(self):
   134      self.total_metric = Metrics.counter(self.__class__, 'total_values')
   135      self.dist_metric = Metrics.distribution(
   136          self.__class__, 'distribution_values')
   137      # TODO(ajamato): Add a verifier for gauge once it is supported by the SDKs
   138      # and runners.
   139      self.latest_metric = Metrics.gauge(self.__class__, 'latest_value')
   140  
   141    def start_bundle(self):
   142      time.sleep(SLEEP_TIME_SECS)
   143  
   144    def process(self, element):
   145      """Returns the processed element and increments the metrics."""
   146      elem_int = int(element)
   147      self.total_metric.inc(elem_int)
   148      self.dist_metric.update(elem_int)
   149      self.latest_metric.set(elem_int)
   150      time.sleep(SLEEP_TIME_SECS)
   151      return [elem_int]
   152  
   153    def finish_bundle(self):
   154      time.sleep(SLEEP_TIME_SECS)
   155  
   156  
   157  def apply_and_run(pipeline):
   158    """Given an initialized Pipeline applies transforms and runs it."""
   159    _ = (
   160        pipeline
   161        | beam.Create(INPUT)
   162        | 'metrics' >> (beam.ParDo(UserMetricsDoFn()))
   163        | 'map_to_common_key' >> beam.Map(lambda x: ('key', x))
   164        | beam.GroupByKey()
   165        | 'm_out' >> beam.FlatMap(
   166            lambda x: [
   167                1,
   168                2,
   169                3,
   170                4,
   171                5,
   172                beam.pvalue.TaggedOutput('once', x),
   173                beam.pvalue.TaggedOutput('twice', x),
   174                beam.pvalue.TaggedOutput('twice', x)
   175            ]))
   176    result = pipeline.run()
   177    result.wait_until_finish()
   178    return result