github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/dataflow/dataflow_metrics.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
DataflowRunner implementation of MetricResults. It is in charge of
responding to queries of current metrics by going to the dataflow
service.
"""

# pytype: skip-file

import argparse
import logging
import numbers
import sys
from collections import defaultdict

from apache_beam.metrics.cells import DistributionData
from apache_beam.metrics.cells import DistributionResult
from apache_beam.metrics.execution import MetricKey
from apache_beam.metrics.execution import MetricResult
from apache_beam.metrics.metric import MetricResults
from apache_beam.metrics.metricbase import MetricName
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions

_LOGGER = logging.getLogger(__name__)


def _get_match(proto, filter_fn):
  """Finds and returns the first element that matches a query.

  If no element matches the query, it throws ValueError.
  If more than one element matches the query, it returns only the first.
  """
  query = [elm for elm in proto if filter_fn(elm)]
  if len(query) == 0:
    raise ValueError('Could not find element')
  elif len(query) > 1:
    raise ValueError('Too many matches')

  return query[0]


# V1b3 MetricStructuredName keys to accept and copy to the MetricKey labels.
STEP_LABEL = 'step'
STRUCTURED_NAME_LABELS = set(
    ['execution_step', 'original_name', 'output_user_name'])


class DataflowMetrics(MetricResults):
  """Implementation of MetricResults class for the Dataflow runner."""
  def __init__(self, dataflow_client=None, job_result=None, job_graph=None):
    """Initialize the Dataflow metrics object.

    Args:
      dataflow_client: apiclient.DataflowApplicationClient to interact with the
        dataflow service.
      job_result: DataflowPipelineResult with the state and id information of
        the job.
      job_graph: apiclient.Job instance to be able to translate between
        internal step names (e.g. "s2") and user step names (e.g. "split").
    """
    super().__init__()
    self._dataflow_client = dataflow_client
    self.job_result = job_result
    self._queried_after_termination = False
    self._cached_metrics = None
    self._job_graph = job_graph

  @staticmethod
  def _is_counter(metric_result):
    return isinstance(metric_result.attempted, numbers.Number)

  @staticmethod
  def _is_distribution(metric_result):
    return isinstance(metric_result.attempted, DistributionResult)

  def _translate_step_name(self, internal_name):
    """Translate internal step names (e.g. "s1") into user step names."""
    if not self._job_graph:
      raise ValueError(
          'Could not translate the internal step name %r since job graph is '
          'not available.' % internal_name)
    user_step_name = None
    if (self._job_graph and internal_name in
        self._job_graph.proto_pipeline.components.transforms.keys()):
      # Dataflow Runner v2 with portable job submission uses proto transform
      # map IDs for step names, and PTransform.unique_name maps to user step
      # names. Hence we look up user step names based on the proto.
      user_step_name = self._job_graph.proto_pipeline.components.transforms[
          internal_name].unique_name
    else:
      try:
        step = _get_match(
            self._job_graph.proto.steps, lambda x: x.name == internal_name)
        user_step_name = _get_match(
            step.properties.additionalProperties,
            lambda x: x.key == 'user_name').value.string_value
      except ValueError:
        pass  # Exception is handled below.
    if not user_step_name:
      raise ValueError(
          'Could not translate the internal step name %r.' % internal_name)
    return user_step_name

  def _get_metric_key(self, metric):
    """Populate the MetricKey object for a queried metric result."""
    step = ""
    name = metric.name.name  # Always extract a name.
    labels = {}
    try:  # Try to extract the user step name.
      # If ValueError is thrown within this try-block, it is because of
      # one of the following:
      # 1. Unable to translate the step name. This only happens with an
      #    improperly formatted job graph (unlikely), or when the step name is
      #    not the internal step name (only for unstructured-named metrics).
      # 2. Unable to unpack [step] or [namespace], which should only happen
      #    for unstructured names.
      step = _get_match(
          metric.name.context.additionalProperties,
          lambda x: x.key == STEP_LABEL).value
      step = self._translate_step_name(step)
    except ValueError:
      pass

    namespace = "dataflow/v1b3"  # Try to extract namespace or add a default.
    try:
      namespace = _get_match(
          metric.name.context.additionalProperties,
          lambda x: x.key == 'namespace').value
    except ValueError:
      pass

    for kv in metric.name.context.additionalProperties:
      if kv.key in STRUCTURED_NAME_LABELS:
        labels[kv.key] = kv.value
    # Package everything besides namespace and name into the labels as well,
    # including the unmodified step names, so that the exact values coming
    # from Dataflow are preserved for integrations.
    return MetricKey(step, MetricName(namespace, name), labels=labels)

  def _populate_metrics(self, response, result, user_metrics=False):
    """Move metrics from response to results as MetricResults."""
    if user_metrics:
      metrics = [
          metric for metric in response.metrics if metric.name.origin == 'user'
      ]
    else:
      metrics = [
          metric for metric in response.metrics
          if metric.name.origin == 'dataflow/v1b3'
      ]

    # Get the tentative/committed versions of every metric together.
    metrics_by_name = defaultdict(lambda: {})
    for metric in metrics:
      if (metric.name.name.endswith('_MIN') or
          metric.name.name.endswith('_MAX') or
          metric.name.name.endswith('_MEAN') or
          metric.name.name.endswith('_COUNT')):
        # The Dataflow Service presents distribution metrics in two ways:
        # One way is as a single distribution object with all its fields, and
        # another way is as four different scalar metrics labeled as _MIN,
        # _MAX, _MEAN, _COUNT.
        # TODO(pabloem) remove these when distributions are not being broken up
        #  in the service.
        # The second way is only useful for the UI, and should be ignored.
        continue
      is_tentative = [
          prop for prop in metric.name.context.additionalProperties
          if prop.key == 'tentative' and prop.value == 'true'
      ]
      tentative_or_committed = 'tentative' if is_tentative else 'committed'

      metric_key = self._get_metric_key(metric)
      if metric_key is None:
        continue
      metrics_by_name[metric_key][tentative_or_committed] = metric

    # Now we create the MetricResult elements.
    for metric_key, metric in metrics_by_name.items():
      attempted = self._get_metric_value(metric['tentative'])
      committed = self._get_metric_value(metric['committed'])
      result.append(
          MetricResult(metric_key, attempted=attempted, committed=committed))

  def _get_metric_value(self, metric):
    """Get a metric result object from a MetricUpdate from Dataflow API."""
    if metric is None:
      return None

    if metric.scalar is not None:
      return metric.scalar.integer_value
    elif metric.distribution is not None:
      dist_count = _get_match(
          metric.distribution.object_value.properties,
          lambda x: x.key == 'count').value.integer_value
      dist_min = _get_match(
          metric.distribution.object_value.properties,
          lambda x: x.key == 'min').value.integer_value
      dist_max = _get_match(
          metric.distribution.object_value.properties,
          lambda x: x.key == 'max').value.integer_value
      dist_sum = _get_match(
          metric.distribution.object_value.properties,
          lambda x: x.key == 'sum').value.integer_value
      if dist_sum is None:
        # Distribution metrics are not meant to be used on large values, but
        # if they are, the sum can overflow the integer_value range and be
        # reported as a double_value instead, in which case the correctness of
        # the value is not guaranteed.
        _LOGGER.info(
            "Distribution metric sum value seems to have "
            "overflowed integer_value range, the correctness of sum or mean "
            "value may not be guaranteed: %s" % metric.distribution)
        dist_sum = int(
            _get_match(
                metric.distribution.object_value.properties,
                lambda x: x.key == 'sum').value.double_value)
      return DistributionResult(
          DistributionData(dist_sum, dist_count, dist_min, dist_max))
    else:
      return None

  def _get_metrics_from_dataflow(self, job_id=None):
    """Return cached metrics or query the dataflow service."""
    if not job_id:
      try:
        job_id = self.job_result.job_id()
      except AttributeError:
        job_id = None
    if not job_id:
      raise ValueError('Can not query metrics. Job id is unknown.')

    if self._cached_metrics:
      return self._cached_metrics

    job_metrics = self._dataflow_client.get_job_metrics(job_id)
    # If the job has terminated, the metrics will no longer change,
    # so we can cache them.
    if self.job_result and self.job_result.is_in_terminal_state():
      self._cached_metrics = job_metrics
    return job_metrics

  def all_metrics(self, job_id=None):
    """Return all user and system metrics from the dataflow service."""
    metric_results = []
    response = self._get_metrics_from_dataflow(job_id=job_id)
    self._populate_metrics(response, metric_results, user_metrics=True)
    self._populate_metrics(response, metric_results, user_metrics=False)
    return metric_results

  def query(self, filter=None):
    metric_results = []
    response = self._get_metrics_from_dataflow()
    self._populate_metrics(response, metric_results, user_metrics=True)
    return {
        self.COUNTERS: [
            elm for elm in metric_results if self.matches(filter, elm.key) and
            DataflowMetrics._is_counter(elm)
        ],
        self.DISTRIBUTIONS: [
            elm for elm in metric_results if self.matches(filter, elm.key) and
            DataflowMetrics._is_distribution(elm)
        ],
        self.GAUGES: []
    }  # TODO(pabloem): Add Gauge support for dataflow.


def main(argv):
  """Print the metric results for the dataflow --job_id and --project.

  Instead of running an entire pipeline, which takes several minutes, use this
  main method to display MetricResults for a specific --job_id and --project,
  which takes only a few seconds.
  """
  # TODO(https://github.com/apache/beam/issues/19452): The MetricResults do not
  # show translated step names as the job_graph is not provided to
  # DataflowMetrics. Import here to avoid adding the dependency for local
  # running scenarios.
  try:
    # pylint: disable=wrong-import-order, wrong-import-position
    from apache_beam.runners.dataflow.internal import apiclient
  except ImportError:
    raise ImportError(
        'Google Cloud Dataflow runner not available, '
        'please install apache_beam[gcp]')
  if argv[0] == __file__:
    argv = argv[1:]
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '-j', '--job_id', type=str, help='The job id to query metrics for.')
  parser.add_argument(
      '-p',
      '--project',
      type=str,
      help='The project name to query metrics for.')
  flags = parser.parse_args(argv)

  # Get a Dataflow API client and set its project and job_id in the options.
  options = PipelineOptions()
  gcloud_options = options.view_as(GoogleCloudOptions)
  gcloud_options.project = flags.project
  dataflow_client = apiclient.DataflowApplicationClient(options)
  df_metrics = DataflowMetrics(dataflow_client)
  all_metrics = df_metrics.all_metrics(job_id=flags.job_id)
  _LOGGER.info(
      'Printing all MetricResults for %s in %s', flags.job_id, flags.project)
  for metric_result in all_metrics:
    _LOGGER.info(metric_result)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  main(sys.argv)
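

# ---------------------------------------------------------------------------
# Usage sketch (kept as comments; not part of the module above): the usual
# way DataflowMetrics is reached is indirectly, through the MetricResults of
# a pipeline that ran on the DataflowRunner. The counter name 'my_counter',
# the namespace 'my_namespace', and the placeholder project/region/bucket
# values below are illustrative assumptions, not values defined in this file.
#
#   import apache_beam as beam
#   from apache_beam.metrics import Metrics
#   from apache_beam.metrics.metric import MetricsFilter
#   from apache_beam.options.pipeline_options import PipelineOptions
#
#   class CountingFn(beam.DoFn):
#     def __init__(self):
#       self.counter = Metrics.counter('my_namespace', 'my_counter')
#
#     def process(self, element):
#       self.counter.inc()
#       yield element
#
#   options = PipelineOptions(
#       runner='DataflowRunner',
#       project='<PROJECT>',
#       region='<REGION>',
#       temp_location='gs://<BUCKET>/tmp')
#   with beam.Pipeline(options=options) as p:
#     _ = p | beam.Create([1, 2, 3]) | beam.ParDo(CountingFn())
#
#   # After the `with` block the job has been waited on; p.result is the
#   # DataflowPipelineResult, and p.result.metrics() is a DataflowMetrics
#   # instance whose query() returns dicts keyed by 'counters',
#   # 'distributions', and 'gauges'.
#   query = p.result.metrics().query(MetricsFilter().with_name('my_counter'))
#   for counter in query['counters']:
#     print(counter.key, counter.committed, counter.attempted)
#
# For an already-submitted job, the module's main() can instead be invoked
# directly with a job id and project, as described in its docstring:
#
#   python -m apache_beam.runners.dataflow.dataflow_metrics \
#       --job_id <JOB_ID> --project <PROJECT>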