github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/dataflow/dataflow_exercise_metrics_pipeline.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A pipeline that exercises user and system metrics for testing."""

# pytype: skip-file

import time

from hamcrest.library.number.ordering_comparison import greater_than

import apache_beam as beam
from apache_beam.metrics import Metrics
from apache_beam.testing.metric_result_matchers import DistributionMatcher
from apache_beam.testing.metric_result_matchers import MetricResultMatcher

SLEEP_TIME_SECS = 1
INPUT = [0, 0, 0, 100]
METRIC_NAMESPACE = (
    'apache_beam.runners.dataflow.'
    'dataflow_exercise_metrics_pipeline.UserMetricsDoFn')


def metric_matchers():
  """MetricResult matchers common to all tests."""
  # TODO(ajamato): Matcher for the 'metrics' step's ElementCount.
  # TODO(ajamato): Matcher for the 'metrics' step's MeanByteCount.
  # TODO(ajamato): Matcher for the start and finish exec times.
  # TODO(ajamato): Matcher for a gauge metric once implemented in dataflow.
  matchers = [
      # User Counter Metrics.
      MetricResultMatcher(
          name='total_values',
          namespace=METRIC_NAMESPACE,
          step='metrics',
          attempted=sum(INPUT),
          committed=sum(INPUT)),
      MetricResultMatcher(
          name='ExecutionTime_StartBundle',
          step='metrics',
          attempted=greater_than(0),
          committed=greater_than(0)),
      MetricResultMatcher(
          name='ExecutionTime_ProcessElement',
          step='metrics',
          attempted=greater_than(0),
          committed=greater_than(0)),
      MetricResultMatcher(
          name='ExecutionTime_FinishBundle',
          step='metrics',
          attempted=greater_than(0),
          committed=greater_than(0)),
      MetricResultMatcher(
          name='distribution_values',
          namespace=METRIC_NAMESPACE,
          step='metrics',
          attempted=DistributionMatcher(
              sum_value=sum(INPUT),
              count_value=len(INPUT),
              min_value=min(INPUT),
              max_value=max(INPUT)),
          committed=DistributionMatcher(
              sum_value=sum(INPUT),
              count_value=len(INPUT),
              min_value=min(INPUT),
              max_value=max(INPUT)),
      ),
      # Element count and MeanByteCount for a User ParDo.
      MetricResultMatcher(
          name='ElementCount',
          labels={
              'output_user_name': 'metrics-out0',
              'original_name': 'metrics-out0-ElementCount'
          },
          attempted=greater_than(0),
          committed=greater_than(0)),
      MetricResultMatcher(
          name='MeanByteCount',
          labels={
              'output_user_name': 'metrics-out0',
              'original_name': 'metrics-out0-MeanByteCount'
          },
          attempted=greater_than(0),
          committed=greater_than(0))
  ]

  pcoll_names = [
      'GroupByKey/Reify-out0',
      'GroupByKey/Read-out0',
      'map_to_common_key-out0',
      'GroupByKey/GroupByWindow-out0',
      'GroupByKey/Read-out0',
      'GroupByKey/Reify-out0'
  ]
  for name in pcoll_names:
    matchers.extend([
        MetricResultMatcher(
            name='ElementCount',
            labels={
                'output_user_name': name,
                'original_name': '%s-ElementCount' % name
            },
            attempted=greater_than(0),
            committed=greater_than(0)),
        MetricResultMatcher(
            name='MeanByteCount',
            labels={
                'output_user_name': name,
                'original_name': '%s-MeanByteCount' % name
            },
            attempted=greater_than(0),
            committed=greater_than(0)),
    ])
  return matchers


class UserMetricsDoFn(beam.DoFn):
  """A DoFn that updates user counter, distribution and gauge metrics."""
  def __init__(self):
    self.total_metric = Metrics.counter(self.__class__, 'total_values')
    self.dist_metric = Metrics.distribution(
        self.__class__, 'distribution_values')
    # TODO(ajamato): Add a verifier for gauge once it is supported by the SDKs
    # and runners.
    self.latest_metric = Metrics.gauge(self.__class__, 'latest_value')

  def start_bundle(self):
    time.sleep(SLEEP_TIME_SECS)

  def process(self, element):
    """Returns the processed element and increments the metrics."""
    elem_int = int(element)
    self.total_metric.inc(elem_int)
    self.dist_metric.update(elem_int)
    self.latest_metric.set(elem_int)
    time.sleep(SLEEP_TIME_SECS)
    return [elem_int]

  def finish_bundle(self):
    time.sleep(SLEEP_TIME_SECS)


def apply_and_run(pipeline):
  """Given an initialized Pipeline, applies transforms and runs it."""
  _ = (
      pipeline
      | beam.Create(INPUT)
      | 'metrics' >> beam.ParDo(UserMetricsDoFn())
      | 'map_to_common_key' >> beam.Map(lambda x: ('key', x))
      | beam.GroupByKey()
      | 'm_out' >> beam.FlatMap(
          lambda x: [
              1,
              2,
              3,
              4,
              5,
              beam.pvalue.TaggedOutput('once', x),
              beam.pvalue.TaggedOutput('twice', x),
              beam.pvalue.TaggedOutput('twice', x)
          ]))
  result = pipeline.run()
  result.wait_until_finish()
  return result
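

# ---------------------------------------------------------------------------
# Illustrative usage (not part of the original module): a minimal sketch of
# how apply_and_run() and metric_matchers() might be wired together in an
# integration test. The helper name `run_and_verify_metrics` is hypothetical;
# the sketch assumes the runner's PipelineResult exposes metric results with
# an all_metrics() method (as the Dataflow runner's metric results do) and
# that verify_all() from apache_beam.testing.metric_result_matchers is
# available to compare results against hamcrest matchers.
# ---------------------------------------------------------------------------


def run_and_verify_metrics():
  """Runs the metrics pipeline and checks results against the matchers."""
  from apache_beam.testing import metric_result_matchers
  from apache_beam.testing.test_pipeline import TestPipeline

  # Build the pipeline from the options passed via --test-pipeline-options
  # and run it through apply_and_run() defined above.
  pipeline = TestPipeline(is_integration_test=True)
  result = apply_and_run(pipeline)

  # Compare every reported metric against the matchers defined above.
  # verify_all() returns a description of any mismatches, empty on success.
  errors = metric_result_matchers.verify_all(
      result.metrics().all_metrics(), metric_matchers())
  assert not errors, str(errors)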