#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Tests corresponding to the DataflowRunner implementation of MetricsResult,
the DataflowMetrics class.
"""

# pytype: skip-file

import types
import unittest

import mock

from apache_beam import DoFn
from apache_beam import ParDo
from apache_beam.metrics.cells import DistributionData
from apache_beam.metrics.cells import DistributionResult
from apache_beam.metrics.execution import MetricKey
from apache_beam.metrics.execution import MetricResult
from apache_beam.metrics.metricbase import MetricName
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.pipeline import Pipeline
from apache_beam.runners.dataflow import dataflow_metrics
from apache_beam.testing import metric_result_matchers
from apache_beam.testing.metric_result_matchers import MetricResultMatcher
from apache_beam.transforms import Create
from apache_beam.transforms.environments import DockerEnvironment

# Protect against environments where apitools library is not available.
# pylint: disable=wrong-import-order, wrong-import-position
try:
  from apache_beam.runners.dataflow.internal import apiclient
except ImportError:
  apiclient = None  # type: ignore
# pylint: enable=wrong-import-order, wrong-import-position


class DictToObject(object):
  """Translate from a dict(list()) structure to an object structure.

  Every key of ``data`` becomes an attribute of the instance. Nested dicts are
  converted recursively, and the elements of tuples, lists, sets and
  frozensets are converted while keeping the container type.
  """
  def __init__(self, data):
    for name, value in data.items():
      setattr(self, name, self._wrap(value))

  def _wrap(self, value):
    """Return ``value`` with every nested dict replaced by a DictToObject."""
    if isinstance(value, (tuple, list, set, frozenset)):
      # Rebuild the container with the same type so callers can still
      # iterate/index it the way they would the original.
      return type(value)([self._wrap(v) for v in value])
    return DictToObject(value) if isinstance(value, dict) else value


class TestDataflowMetrics(unittest.TestCase):
  """Tests for DataflowMetrics driven by canned job-metrics responses.

  The ``*_LIST`` class attributes are fake responses; they are converted with
  DictToObject and returned by a mocked client's ``get_job_metrics``.
  """

  # TODO(https://github.com/apache/beam/issues/19258): Write a dump tool to
  # generate this fake data, or somehow make this easier to maintain.
  ONLY_COUNTERS_LIST = {
      "metrics": [
          {
              "name": {
                  "context": {
                      "additionalProperties": [
                          {
                              "key": "namespace",
                              "value": "__main__.WordExtractingDoFn"
                          },
                          {"key": "step", "value": "s2"},
                          {"key": "tentative", "value": "true"},
                      ]
                  },
                  "name": "words",
                  "origin": "user"
              },
              "scalar": {
                  "integer_value": 26185
              },
              "distribution": None,
              "updateTime": "2017-03-22T18:47:06.402Z"
          },
          {
              "name": {
                  "context": {
                      "additionalProperties": [
                          {
                              "key": "namespace",
                              "value": "__main__.WordExtractingDoFn"
                          },
                          {"key": "step", "value": "s2"},
                      ]
                  },
                  "name": "words",
                  "origin": "user"
              },
              "scalar": {
                  "integer_value": 26181
              },
              "distribution": None,
              "updateTime": "2017-03-22T18:47:06.402Z"
          },
          {
              "name": {
                  "context": {
                      "additionalProperties": [
                          {
                              "key": "namespace",
                              "value": "__main__.WordExtractingDoFn"
                          },
                          {"key": "step", "value": "s2"},
                          {"key": "tentative", "value": "true"},
                      ]
                  },
                  "name": "empty_lines",
                  "origin": "user"
              },
              "scalar": {
                  "integer_value": 1080
              },
              "distribution": None,
              "updateTime": "2017-03-22T18:47:06.402Z"
          },
          {
              "name": {
                  "context": {
                      "additionalProperties": [
                          {
                              "key": "namespace",
                              "value": "__main__.WordExtractingDoFn"
                          },
                          {"key": "step", "value": "s2"},
                      ]
                  },
                  "name": "empty_lines",
                  "origin": "user"
              },
              "scalar": {
                  "integer_value": 1080
              },
              "distribution": None,
              "updateTime": "2017-03-22T18:47:06.402Z"
          },
      ]
  }
  STRUCTURED_COUNTER_LIST = {
      "metrics": [
          {
              "name": {
                  "context": {
                      "additionalProperties": [
                          {
                              "key": "namespace",
                              "value": "__main__.WordExtractingDoFn"
                          },
                          {"key": "step", "value": "s2"},
                          {"key": "tentative", "value": "true"},
                      ]
                  },
                  "name": "word_lengths",
                  "origin": "user"
              },
              "scalar": {
                  "integer_value": 109475
              },
              "distribution": None,
              "updateTime": "2017-03-22T18:47:06.402Z"
          },
          {
              "name": {
                  "context": {
                      "additionalProperties": [
                          {
                              "key": "namespace",
                              "value": "__main__.WordExtractingDoFn"
                          },
                          {"key": "step", "value": "s2"},
                      ]
                  },
                  "name": "word_lengths",
                  "origin": "user"
              },
              "scalar": {
                  "integer_value": 109475
              },
              "distribution": None,
              "updateTime": "2017-03-22T18:47:06.402Z"
          },
          {
              "name": {
                  "context": {
                      "additionalProperties": [
                          {
                              "key": "namespace",
                              "value": "__main__.WordExtractingDoFn"
                          },
                          {"key": "step", "value": "s2"},
                          {"key": "tentative", "value": "true"},
                      ]
                  },
                  "name": "word_length_dist",
                  "origin": "user"
              },
              "scalar": None,
              "distribution": {
                  "object_value": {
                      "properties": [
                          {"key": "min", "value": {"integer_value": 2}},
                          {"key": "max", "value": {"integer_value": 16}},
                          {"key": "count", "value": {"integer_value": 2}},
                          {"key": "mean", "value": {"integer_value": 9}},
                          {"key": "sum", "value": {"integer_value": 18}},
                      ]
                  }
              },
              "updateTime": "2017-03-22T18:47:06.402Z"
          },
          {
              "name": {
                  "context": {
                      "additionalProperties": [
                          {
                              "key": "namespace",
                              "value": "__main__.WordExtractingDoFn"
                          },
                          {"key": "step", "value": "s2"},
                      ]
                  },
                  "name": "word_length_dist",
                  "origin": "user"
              },
              "scalar": None,
              "distribution": {
                  "object_value": {
                      "properties": [
                          {"key": "min", "value": {"integer_value": 2}},
                          {"key": "max", "value": {"integer_value": 16}},
                          {"key": "count", "value": {"integer_value": 2}},
                          {"key": "mean", "value": {"integer_value": 9}},
                          {"key": "sum", "value": {"integer_value": 18}},
                      ]
                  }
              },
              "updateTime": "2017-03-22T18:47:06.402Z"
          },
      ]
  }
  SYSTEM_COUNTERS_LIST = {
      "metrics": [
          # ElementCount
          {
              "name": {
                  "context": {
                      "additionalProperties": [
                          {
                              "key": "original_name",
                              "value":
                                  "ToIsmRecordForMultimap-out0-ElementCount"
                          },  # yapf: disable
                          {
                              "key": "output_user_name",
                              "value": "ToIsmRecordForMultimap-out0"
                          },
                      ]
                  },
                  "name": "ElementCount",
                  "origin": "dataflow/v1b3"
              },
              "scalar": {
                  "integer_value": 42
              },
              "distribution": None,
              "updateTime": "2017-03-22T18:47:06.402Z"
          },
          {
              "name": {
                  "context": {
                      "additionalProperties": [
                          {
                              "key": "original_name",
                              "value":
                                  "ToIsmRecordForMultimap-out0-ElementCount"
                          },  # yapf: disable
                          {
                              "key": "output_user_name",
                              "value": "ToIsmRecordForMultimap-out0"
                          },
                          {"key": "tentative", "value": "true"},
                      ]
                  },
                  "name": "ElementCount",
                  "origin": "dataflow/v1b3"
              },
              "scalar": {
                  "integer_value": 42
              },
              "distribution": None,
              "updateTime": "2017-03-22T18:47:06.402Z"
          },
          # MeanByteCount
          {
              "name": {
                  "context": {
                      "additionalProperties": [
                          {
                              "key": "original_name",
                              "value": "Read-out0-MeanByteCount"
                          },
                          {
                              "key": "output_user_name",
                              "value": "GroupByKey/Read-out0"
                          },
                      ]
                  },
                  "name": "MeanByteCount",
                  "origin": "dataflow/v1b3"
              },
              "scalar": {
                  "integer_value": 31
              },
              "distribution": None,
              "updateTime": "2017-03-22T18:47:06.402Z"
          },
          {
              "name": {
                  "context": {
                      "additionalProperties": [
                          {
                              "key": "original_name",
                              "value": "Read-out0-MeanByteCount"
                          },
                          {
                              "key": "output_user_name",
                              "value": "GroupByKey/Read-out0"
                          },
                          {"key": "tentative", "value": "true"},
                      ]
                  },
                  "name": "MeanByteCount",
                  "origin": "dataflow/v1b3"
              },
              "scalar": {
                  "integer_value": 31
              },
              "distribution": None,
              "updateTime": "2017-03-22T18:47:06.402Z"
          },
          # ExecutionTime
          {
              "name": {
                  "context": {
                      "additionalProperties": [
                          {"key": "step", "value": "write/Write/Write"},
                      ]
                  },
                  "name": "ExecutionTime_ProcessElement",
                  "origin": "dataflow/v1b3"
              },
              "scalar": {
                  "integer_value": 1000
              },
              "distribution": None,
              "updateTime": "2017-03-22T18:47:06.402Z"
          },
          {
              "name": {
                  "context": {
                      "additionalProperties": [
                          {"key": "step", "value": "write/Write/Write"},
                          {"key": "tentative", "value": "true"},
                      ]
                  },
                  "name": "ExecutionTime_ProcessElement",
                  "origin": "dataflow/v1b3"
              },
              "scalar": {
                  "integer_value": 1000
              },
              "distribution": None,
              "updateTime": "2017-03-22T18:47:06.402Z"
          },
      ]
  }

  def setup_mock_client_result(self, counter_list=None):
    """Build the mocked client and job result used by the tests.

    Args:
      counter_list: dict shaped like the ``*_LIST`` fixtures on this class.
        When omitted, a response with an empty ``metrics`` list is used.

    Returns:
      A ``(mock_client, mock_job_result)`` tuple. ``get_job_metrics`` on the
      client returns the converted ``counter_list``; the job result reports
      job id 1 and a non-terminal state.
    """
    if counter_list is None:
      # DictToObject calls .items() on its argument, so the None default has
      # to be replaced by a real (empty) response before conversion.
      counter_list = {"metrics": []}
    mock_client = mock.Mock()
    mock_query_result = DictToObject(counter_list)
    mock_client.get_job_metrics.return_value = mock_query_result
    mock_job_result = mock.Mock()
    mock_job_result.job_id.return_value = 1
    mock_job_result.is_in_terminal_state.return_value = False
    return mock_client, mock_job_result

  def test_cache_functions(self):
    """Metrics are cached only once the job reports a terminal state."""
    mock_client, mock_job_result = self.setup_mock_client_result(
        self.STRUCTURED_COUNTER_LIST)
    dm = dataflow_metrics.DataflowMetrics(mock_client, mock_job_result)

    # At first creation, we should always query dataflow.
    self.assertIsNone(dm._cached_metrics)

    # Right after querying, we still query again.
    dm.query()
    self.assertIsNone(dm._cached_metrics)

    # The job has ended. The query should not run again after this.
    mock_job_result.is_in_terminal_state.return_value = True
    dm.query()
    self.assertTrue(dm._cached_metrics)

  def test_query_structured_metrics(self):
    """query() returns the expected counter and distribution results."""
    mock_client, mock_job_result = self.setup_mock_client_result(
        self.STRUCTURED_COUNTER_LIST)
    dm = dataflow_metrics.DataflowMetrics(mock_client, mock_job_result)
    # Bypass real step-name translation: every step is reported as 'split'.
    dm._translate_step_name = types.MethodType(lambda self, x: 'split', dm)
    query_result = dm.query()
    expected_counters = [
        MetricResult(
            MetricKey(
                'split',
                MetricName('__main__.WordExtractingDoFn', 'word_lengths'),
            ),
            109475,
            109475),
    ]
    self.assertEqual(query_result['counters'], expected_counters)

    expected_distributions = [
        MetricResult(
            MetricKey(
                'split',
                MetricName('__main__.WordExtractingDoFn', 'word_length_dist'),
            ),
            DistributionResult(DistributionData(18, 2, 2, 16)),
            DistributionResult(DistributionData(18, 2, 2, 16))),
    ]
    self.assertEqual(query_result['distributions'], expected_distributions)

  @unittest.skipIf(apiclient is None, 'GCP dependencies are not installed')
  def test_translate_portable_job_step_name(self):
    """A portable-job transform id is translated to the user step name."""
    mock_client, mock_job_result = self.setup_mock_client_result(
        self.ONLY_COUNTERS_LIST)

    pipeline_options = PipelineOptions([
        '--experiments=use_runner_v2',
        '--experiments=use_portable_job_submission',
        '--temp_location=gs://any-location/temp',
        '--project=dummy_project',
    ])

    pipeline = Pipeline(options=pipeline_options)
    pipeline | Create([1, 2, 3]) | 'MyTestParDo' >> ParDo(DoFn())  # pylint:disable=expression-not-assigned

    test_environment = DockerEnvironment(container_image='test_default_image')
    proto_pipeline, _ = pipeline.to_runner_api(
        return_context=True, default_environment=test_environment)

    job = apiclient.Job(pipeline_options, proto_pipeline)
    dm = dataflow_metrics.DataflowMetrics(mock_client, mock_job_result, job)
    self.assertEqual(
        'MyTestParDo',
        dm._translate_step_name('ref_AppliedPTransform_MyTestParDo_14'))

  def test_query_counters(self):
    """query() returns one result per counter, compared order-insensitively."""
    mock_client, mock_job_result = self.setup_mock_client_result(
        self.ONLY_COUNTERS_LIST)
    dm = dataflow_metrics.DataflowMetrics(mock_client, mock_job_result)
    # Bypass real step-name translation: every step is reported as 'split'.
    dm._translate_step_name = types.MethodType(lambda self, x: 'split', dm)
    query_result = dm.query()
    expected_counters = [
        MetricResult(
            MetricKey(
                'split',
                MetricName('__main__.WordExtractingDoFn', 'empty_lines')),
            1080,
            1080),
        MetricResult(
            MetricKey(
                'split', MetricName('__main__.WordExtractingDoFn', 'words')),
            26181,
            26185),
    ]
    # Sort both sides by metric name so the comparison does not depend on the
    # order in which query() lists the counters.
    self.assertEqual(
        sorted(query_result['counters'], key=lambda x: x.key.metric.name),
        sorted(expected_counters, key=lambda x: x.key.metric.name))

  def test_system_counters_set_labels_and_step_name(self):
    """all_metrics() exposes system counters with their labels and step."""
    mock_client, mock_job_result = self.setup_mock_client_result(
        self.SYSTEM_COUNTERS_LIST)
    test_object = dataflow_metrics.DataflowMetrics(mock_client, mock_job_result)
    all_metrics = test_object.all_metrics()

    matchers = [
        MetricResultMatcher(
            name='ElementCount',
            labels={
                'original_name': 'ToIsmRecordForMultimap-out0-ElementCount',
                'output_user_name': 'ToIsmRecordForMultimap-out0'
            },
            attempted=42,
            committed=42),
        MetricResultMatcher(
            name='MeanByteCount',
            labels={
                'original_name': 'Read-out0-MeanByteCount',
                'output_user_name': 'GroupByKey/Read-out0'
            },
            attempted=31,
            committed=31),
        MetricResultMatcher(
            name='ExecutionTime_ProcessElement',
            step='write/Write/Write',
            attempted=1000,
            committed=1000)
    ]
    errors = metric_result_matchers.verify_all(all_metrics, matchers)
    self.assertFalse(errors, errors)


if __name__ == '__main__':
  unittest.main()