k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/prometheus/manifests/dashboards/master_panels.py

#!/usr/bin/env python3

# Copyright 2022 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from grafanalib import core as g
import defaults as d


def api_call_latency_panel(expression):
    def api_call_latency(title, verb, scope, threshold):
        return d.Graph(
            title=title,
            targets=[
                d.Target(expr=str(threshold), legendFormat="threshold"),
                d.Target(
                    expr=d.one_line(expression % {"verb": verb, "scope": scope}),
                    # TODO(github.com/grafana/grafana/issues/19410): uncomment once fixed
                    # legendFormat="{{verb}} {{scope}}/{{resource}}",
                ),
            ],
            yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
        )

    return [
        api_call_latency(
            title="GET resource latency (percentile=99, scope=resource, threshold=1s)",
            verb="GET",
            scope="resource",
            threshold=1,
        ),
        api_call_latency(
            title="LIST namespace latency (percentile=99, scope=namespace, threshold=5s)",
            verb="LIST",
            scope="namespace",
            threshold=5,
        ),
        api_call_latency(
            title="LIST cluster latency (percentile=99, scope=cluster, threshold=30s)",
            verb="LIST",
            scope="cluster",
            threshold=30,
        ),
        api_call_latency(
            title="Mutating API call latency (threshold=1s)",
            verb=d.any_of("CREATE", "DELETE", "PATCH", "POST", "PUT"),
            scope=d.any_of("namespace", "cluster", "resource"),
            threshold=1,
        ),
    ]

API_CALL_LATENCY_PANELS = api_call_latency_panel("""
apiserver:apiserver_request_latency_1m:histogram_quantile{
  quantile="0.99",
  verb=~"%(verb)s",
  scope=~"%(scope)s",
  resource=~"${resource:regex}s*",
  subresource!~"exec|proxy",
}""")

QUANTILE_API_CALL_LATENCY_PANELS = api_call_latency_panel("""
quantile_over_time(0.99,
  apiserver:apiserver_request_latency_1m:histogram_quantile{
    quantile="0.99",
    verb=~"%(verb)s",
    scope=~"%(scope)s",
    resource=~"${resource:regex}s*",
    subresource!~"exec|proxy",
  }[5d])""")
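
# Note: the expressions above are ordinary %-format templates. For every panel,
# api_call_latency_panel substitutes %(verb)s and %(scope)s and d.one_line then
# collapses the whitespace, so the "GET resource" panel queries roughly:
#
#   apiserver:apiserver_request_latency_1m:histogram_quantile{quantile="0.99",
#     verb=~"GET", scope=~"resource", resource=~"${resource:regex}s*",
#     subresource!~"exec|proxy"}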

APF_PANELS = [
    d.simple_graph(
        "Requests waiting time",
        "histogram_quantile(0.99, sum(rate(apiserver_flowcontrol_request_wait_duration_seconds_bucket{instance=~\".*(${instance:pipe})\"}[1m])) by (le, instance, priority_level))",
        legend="{{instance}} {{priority_level}}",
        yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
    ),
    d.simple_graph(
        "Execution time",
        "histogram_quantile(0.99, sum(rate(apiserver_flowcontrol_request_execution_seconds_bucket{instance=~\".*(${instance:pipe})\"}[1m])) by (le, instance, priority_level))",
        legend="{{instance}} {{priority_level}}",
        yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
    ),
    d.simple_graph(
        "Total execution time per second",
        "sum(irate(apiserver_flowcontrol_request_execution_seconds_sum{instance=~\".*(${instance:pipe})\"}[1m])) by (instance, priority_level)",
        legend="{{instance}} {{priority_level}}",
        yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
    ),
    d.simple_graph(
        "Requests rate by priority level",
        "sum(irate(apiserver_flowcontrol_dispatched_requests_total{instance=~\".*(${instance:pipe})\"}[1m])) by (instance, priority_level)",
        legend="{{instance}} {{priority_level}}",
        yAxes=g.single_y_axis(format=g.OPS_FORMAT),
    ),
    d.simple_graph(
        "Concurrency in use",
        "sum(apiserver_flowcontrol_request_concurrency_in_use{instance=~\".*(${instance:pipe})\"}) by (instance, priority_level)",
        legend="{{instance}} {{priority_level}}",
        yAxes=g.single_y_axis(format=g.OPS_FORMAT),
    ),
    d.simple_graph(
        "Current executing requests",
        "sum(apiserver_flowcontrol_current_executing_requests{instance=~\".*(${instance:pipe})\"}) by (instance, priority_level)",
        legend="{{instance}} {{priority_level}}",
        yAxes=g.single_y_axis(format=g.OPS_FORMAT),
    ),
    d.simple_graph(
        "Inqueue requests",
        "sum(apiserver_flowcontrol_current_inqueue_requests{instance=~\".*(${instance:pipe})\"}) by (instance, priority_level)",
        legend="{{instance}} {{priority_level}}",
        yAxes=g.single_y_axis(format=g.OPS_FORMAT),
    ),
    d.simple_graph(
        "Nominal number of execution seats",
        "avg(apiserver_flowcontrol_nominal_limit_seats{instance=~\".*(${instance:pipe})\"}) by (priority_level)",
        legend="{{priority_level}}",
    ),
    d.simple_graph(
        "Lower bound on number of execution seats",
        "avg(apiserver_flowcontrol_lower_limit_seats{instance=~\".*(${instance:pipe})\"}) by (priority_level)",
        legend="{{priority_level}}",
    ),
    d.simple_graph(
        "Upper bound on number of execution seats",
        "avg(apiserver_flowcontrol_upper_limit_seats{instance=~\".*(${instance:pipe})\"}) by (priority_level)",
        legend="{{priority_level}}",
    ),
    d.simple_graph(
        "Number of seats Priority Level could use divided by nominal seats (50th percentile)",
        "histogram_quantile(0.5, rate(apiserver_flowcontrol_demand_seats_bucket{instance=~\".*(${instance:pipe})\"}[10s]))",
        legend="{{instance}} {{priority_level}}",
    ),
    d.simple_graph(
        "High watermark of demand seats over last adjustment period",
        "apiserver_flowcontrol_demand_seats_high_watermark{instance=~\".*(${instance:pipe})\"}",
        legend="{{instance}} {{priority_level}}",
    ),
    d.simple_graph(
        "Smoothed seat demands",
        "apiserver_flowcontrol_demand_seats_smoothed{instance=~\".*(${instance:pipe})\"}",
        legend="{{instance}} {{priority_level}}",
    ),
    d.simple_graph(
        "Current seat limit for each Priority Level",
        "apiserver_flowcontrol_current_limit_seats{instance=~\".*(${instance:pipe})\"}",
        legend="{{instance}} {{priority_level}}",
    ),
]

HEALTH_PANELS = [
    d.simple_graph(
        "Unhealthy nodes",
        "sum(node_collector_unhealthy_nodes_in_zone) by (zone)",
        legend="{{zone}}",
    ),
    d.simple_graph(
        "Pod creations",
        'sum(irate(apiserver_request_total{verb="POST", resource="pods", subresource=""}[1m]))',
        yAxes=g.single_y_axis(format=g.OPS_FORMAT),
    ),
    d.simple_graph(
        "Pod bindings",
        'sum(irate(apiserver_request_total{verb="POST", resource="pods", subresource="binding"}[1m]))',
        yAxes=g.single_y_axis(format=g.OPS_FORMAT),
    ),
    # It's not clear which "Component restarts" shows more accurate results.
    d.simple_graph(
        "Component restarts",
        "sum(rate(process_start_time_seconds[1m]) > bool 0) by (job, endpoint)",
    ),
    d.simple_graph(
        "Component restarts 2",
        'sum(min_over_time(container_start_time_seconds{container!="",container!="POD"}[2m])) by (container)',
    ),
    d.simple_graph(
        "Active component", "sum(leader_election_master_status) by (name, instance)"
    ),
]

ETCD_PANELS = [
    d.simple_graph("etcd leader", "etcd_server_is_leader", legend="{{instance}}"),
    d.simple_graph(
        "etcd bytes sent",
        "rate(etcd_network_client_grpc_sent_bytes_total[1m])",
        yAxes=g.single_y_axis(format=g.BYTES_PER_SEC_FORMAT),
        legend="{{instance}}",
    ),
    d.simple_graph(
        "etcd operations rate",
        d.one_line(
            """
            sum(
              rate(
                etcd_request_duration_seconds_count{
                  operation=~"${etcd_operation:regex}",
                  type=~".*(${etcd_type:pipe})"
                }[1m]
              )
            ) by (operation, type)
            """
        ),
        yAxes=g.single_y_axis(format=g.OPS_FORMAT),
        legend="{{operation}} {{type}}",
    ),
    d.simple_graph(
        "etcd get latency by type (99th percentile)",
        d.one_line(
            """
            histogram_quantile(
              0.99,
              sum(
                rate(
                  etcd_request_duration_seconds_bucket{
                    operation=~"${etcd_operation:regex}",
                    type=~".*(${etcd_type:pipe})"
                  }[1m]
                )
              ) by (le, operation, type, instance)
            )
            """
        ),
        yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
        legend="{{operation}} {{type}} on {{instance}}",
    ),
    d.simple_graph(
        "etcd get latency by type (50th percentile)",
        d.one_line(
            """
            histogram_quantile(
              0.50,
              sum(
                rate(
                  etcd_request_duration_seconds_bucket{
                    operation=~"${etcd_operation:regex}",
                    type=~".*(${etcd_type:pipe})"
                  }[1m]
                )
              ) by (le, operation, type, instance)
            )
            """
        ),
        yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
    ),
    d.simple_graph("etcd instance id", "sum(etcd_server_id) by (instance, server_id)"),
    d.simple_graph(
        "etcd network latency (99th percentile)",
        "histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (le, instance, To))",
        yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
    ),
    d.simple_graph(
        "etcd compaction keys",
        "delta(etcd_debugging_mvcc_db_compaction_keys_total[1m])",
    ),
    d.simple_graph(
        "etcd compaction pause sum duration",
        "delta(etcd_debugging_mvcc_db_compaction_pause_duration_milliseconds_sum[1m])",
        yAxes=g.single_y_axis(format=g.MILLISECONDS_FORMAT),
    ),
    d.simple_graph(
        "etcd compaction pause num chunks",
        "delta(etcd_debugging_mvcc_db_compaction_pause_duration_milliseconds_count[1m])",
    ),
    d.simple_graph(
        "etcd_disk_backend_commit_duration_seconds",
        "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) by (le, instance))",
        yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
    ),
    d.simple_graph(
        "etcd wal fsync duration",
        "histogram_quantile(1.0, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (le, endpoint))",
        yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
    ),
    d.Graph(
        title="etcd compaction max pause",
        points=True,
        lines=False,
        targets=[
            d.Target(
                expr="histogram_quantile(1.0, sum(rate(etcd_debugging_mvcc_db_compaction_pause_duration_milliseconds_bucket[1m])) by (le, instance))"
            )
        ],
        yAxes=g.single_y_axis(format=g.MILLISECONDS_FORMAT),
    ),
"etcd objects", 299 "sum(etcd_object_counts) by (resource, instance)", 300 legend="{{instance}}: {{resource}}", 301 ), 302 d.simple_graph( 303 "etcd db size", 304 [ 305 "etcd_mvcc_db_total_size_in_bytes", 306 "etcd_mvcc_db_total_size_in_use_in_bytes", 307 "etcd_server_quota_backend_bytes", 308 ], 309 yAxes=g.single_y_axis(format=g.BYTES_FORMAT), 310 ), 311 ] 312 313 APISERVER_PANELS = [ 314 d.simple_graph( 315 "goroutines", 316 'go_goroutines{job="master", endpoint="apiserver"}', 317 legend="{{instance}}", 318 ), 319 d.simple_graph( 320 "gc rate", 321 'rate(go_gc_duration_seconds_count{job="master", endpoint="apiserver"}[1m])', 322 legend="{{instance}}", 323 ), 324 d.simple_graph( 325 "alloc rate", 326 'rate(go_memstats_alloc_bytes_total{job="master", endpoint="apiserver"}[1m])', 327 yAxes=g.single_y_axis(format=g.BYTES_PER_SEC_FORMAT), 328 legend="{{instance}}", 329 ), 330 d.simple_graph( 331 "Number of active watches", 332 'sum(apiserver_registered_watchers{kind=~"(?i:(${resource:regex}))s*"}) by (instance, group, version, kind)', 333 legend="{{instance}}: {{version}}.{{group}}.{{kind}}", 334 ), 335 d.simple_graph( 336 "Watch events rate", 337 d.one_line( 338 """ 339 sum( 340 irate( 341 apiserver_watch_events_total{ 342 kind=~"(?i:(${resource:regex}))s*" 343 }[1m] 344 ) 345 ) by (instance, group, version, kind)""" 346 ), 347 legend="{{instance}}: {{version}}.{{group}}.{{kind}}", 348 ), 349 d.simple_graph( 350 "Watch events traffic", 351 d.one_line( 352 """ 353 sum( 354 irate( 355 apiserver_watch_events_sizes_sum{ 356 kind=~"(?i:(${resource:regex}))s*" 357 }[1m] 358 ) 359 ) by (instance, group, version, kind)""" 360 ), 361 yAxes=g.single_y_axis(format=g.BYTES_PER_SEC_FORMAT), 362 legend="{{instance}}: {{version}}.{{group}}.{{kind}}", 363 ), 364 d.simple_graph( 365 "Watch event avg size", 366 d.one_line( 367 """ 368 sum( 369 rate( 370 apiserver_watch_events_sizes_sum{ 371 kind=~"(?i:(${resource:regex}))s*" 372 }[1m] 373 ) 374 / 375 rate( 376 apiserver_watch_events_sizes_count{ 377 kind=~"(?i:(${resource:regex}))s*" 378 }[1m] 379 ) 380 ) by (instance, group, version, kind)""" 381 ), 382 legend="{{instance}}: {{version}}.{{group}}.{{kind}}", 383 ), 384 d.simple_graph( 385 "Watch terminated total", 386 "sum(rate(apiserver_terminated_watchers_total{}[1m])) by (resource, instance)", 387 legend="{{instance}}: {{resource}}", 388 ), 389 d.simple_graph( 390 "Inflight requests", 391 "sum(apiserver_current_inflight_requests) by (requestKind, instance)", 392 legend="{{instance}}: {{requestKind}}", 393 ), 394 d.simple_graph( 395 "Request rate", 396 d.one_line( 397 """ 398 sum( 399 rate( 400 apiserver_request_total{ 401 verb=~"${verb:regex}", 402 resource=~"${resource:regex}s*" 403 }[1m] 404 ) 405 ) by (verb, resource, subresource, instance)""" 406 ), 407 # TODO(github.com/grafana/grafana/issues/19410): uncomment once fixed 408 # legend="{{instance}}: {{verb}} {{resource}}", 409 ), 410 d.simple_graph( 411 "Request rate by code", 412 "sum(rate(apiserver_request_total[1m])) by (code, instance)", 413 legend="{{instance}}: {{code}}", 414 ), 415 d.simple_graph( 416 "Request latency (50th percentile) (excl. 
WATCH)", 417 d.one_line( 418 """ 419 apiserver:apiserver_request_latency:histogram_quantile{ 420 quantile="0.50", 421 verb!="WATCH", 422 verb=~"${verb:regex}", 423 resource=~"${resource:regex}s*" 424 }""" 425 ), 426 # TODO(github.com/grafana/grafana/issues/19410): uncomment once fixed 427 # legend="{{verb}} {{scope}}/{{resource}}", 428 yAxes=g.single_y_axis(format=g.SECONDS_FORMAT), 429 ), 430 d.simple_graph( 431 "Request latency (99th percentile) (excl. WATCH)", 432 d.one_line( 433 """ 434 apiserver:apiserver_request_latency:histogram_quantile{ 435 quantile="0.99", 436 verb!="WATCH", 437 verb=~"${verb:regex}", 438 resource=~"${resource:regex}s*" 439 }""" 440 ), 441 # TODO(github.com/grafana/grafana/issues/19410): uncomment once fixed 442 # legend="{{verb}} {{scope}}/{{resource}}", 443 yAxes=g.single_y_axis(format=g.SECONDS_FORMAT), 444 ), 445 d.simple_graph( 446 "Traffic (excl. WATCH)", 447 d.one_line( 448 """ 449 sum( 450 rate( 451 apiserver_response_sizes_sum{ 452 verb!="WATCH", 453 verb=~"${verb:regex}", 454 resource=~"${resource:regex}s*" 455 }[1m] 456 ) 457 ) by (verb, version, resource, subresource, scope, instance)""" 458 ), 459 yAxes=g.single_y_axis(format=g.BYTES_PER_SEC_FORMAT), 460 ), 461 d.simple_graph( 462 "Webhook admission duration (99th percentile)", 463 "histogram_quantile(0.99, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket[1m])) by (le, type, name))", 464 legend="{{type}}: {{name}}", 465 yAxes=g.single_y_axis(format=g.SECONDS_FORMAT), 466 ), 467 d.simple_graph( 468 "Request filter latency for each filter type (99th percentile)", 469 "histogram_quantile(0.99, sum(rate(apiserver_request_filter_duration_seconds_bucket[1m])) by (le, filter))", 470 legend="{{filter}}", 471 yAxes=g.single_y_axis(format=g.SECONDS_FORMAT), 472 ), 473 d.simple_graph( 474 "Failed external requests", 475 'sum(rate(rest_client_requests_total{endpoint="apiserver", code!="200", host!="[::1]:443"}[1m])) by (code, instance, method)', 476 legend="{{instance}}: {{code}} {{method}}", 477 ), 478 d.simple_graph( 479 "Extrernal requests latency (99th percentile)", 480 'histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{endpoint="apiserver", host!="[::1]:443"}[1m])) by (verb, host, instance, le))', 481 legend="{{instance}}: {{verb}} {{host}}", 482 yAxes=g.single_y_axis(format=g.SECONDS_FORMAT), 483 ) 484 ] 485 486 CONTROLLER_MANAGER_PANELS = [ 487 d.simple_graph( 488 "Workqueue depths", 489 'workqueue_depth{endpoint="kube-controller-manager"}', 490 legend="{{name}}", 491 ) 492 ] 493 494 VM_PANELS = [ 495 d.simple_graph( 496 "fs bytes reads by container", 497 "sum(rate(container_fs_reads_bytes_total[1m])) by (container, instance)", 498 legend="{{instance}}: {{container}}", 499 yAxes=g.single_y_axis(format=g.BYTES_FORMAT), 500 ), 501 d.simple_graph( 502 "fs reads by container", 503 "sum(rate(container_fs_reads_total[1m])) by (container, instance)", 504 legend="{{instance}}: {{container}}", 505 ), 506 d.simple_graph( 507 "fs bytes writes by container", 508 "sum(rate(container_fs_writes_bytes_total[1m])) by (container, instance)", 509 legend="{{instance}}: {{container}}", 510 yAxes=g.single_y_axis(format=g.BYTES_FORMAT), 511 ), 512 d.simple_graph( 513 "fs writes by container", 514 "sum(rate(container_fs_writes_total[1m])) by (container, instance)", 515 legend="{{instance}}: {{container}}", 516 ), 517 d.Graph( 518 title="CPU usage by container", 519 targets=[ 520 d.TargetWithInterval( 521 expr='sum(rate(container_cpu_usage_seconds_total{container!=""}[1m])) by 
                legendFormat="{{instance}}: {{container}}",
            ),
            d.TargetWithInterval(expr="machine_cpu_cores", legendFormat="limit"),
        ],
    ),
    d.Graph(
        title="memory usage by container",
        targets=[
            d.TargetWithInterval(
                expr='sum(container_memory_usage_bytes{container!=""}) by (container, instance)',
                legendFormat="{{instance}}: {{container}}",
            ),
            d.TargetWithInterval(expr="machine_memory_bytes", legendFormat="limit"),
        ],
        yAxes=g.single_y_axis(format=g.BYTES_FORMAT),
    ),
    d.Graph(
        title="memory working set by container",
        targets=[
            d.TargetWithInterval(
                expr='sum(container_memory_working_set_bytes{container!=""}) by (container, instance)',
                legendFormat="{{instance}}: {{container}}",
            ),
            d.TargetWithInterval(expr="machine_memory_bytes", legendFormat="limit"),
        ],
        yAxes=g.single_y_axis(format=g.BYTES_FORMAT),
    ),
    d.Graph(
        title="Network usage (bytes)",
        targets=[
            d.Target(
                expr='rate(container_network_transmit_bytes_total{id="/"}[1m])',
                legendFormat="{{instance}} transmit",
            ),
            d.Target(
                expr='rate(container_network_receive_bytes_total{id="/"}[1m])',
                legendFormat="{{instance}} receive",
            ),
        ],
        yAxes=g.single_y_axis(format=g.BYTES_PER_SEC_FORMAT),
    ),
    d.Graph(
        title="Network usage (packets)",
        targets=[
            d.Target(
                expr='rate(container_network_transmit_packets_total{id="/"}[1m])',
                legendFormat="{{instance}} transmit",
            ),
            d.Target(
                expr='rate(container_network_receive_packets_total{id="/"}[1m])',
                legendFormat="{{instance}} receive",
            ),
        ],
    ),
    d.Graph(
        title="Network usage (avg packet size)",
        targets=[
            d.Target(
                expr='rate(container_network_transmit_bytes_total{id="/"}[1m]) / rate(container_network_transmit_packets_total{id="/"}[1m])',
                legendFormat="{{instance}} transmit",
            ),
            d.Target(
                expr='rate(container_network_receive_bytes_total{id="/"}[1m]) / rate(container_network_receive_packets_total{id="/"}[1m])',
                legendFormat="{{instance}} receive",
            ),
        ],
        yAxes=g.single_y_axis(format=g.BYTES_FORMAT),
    ),
    d.Graph(
        title="Network tcp segments",
        targets=[
            d.Target(
                expr="sum(rate(node_netstat_Tcp_InSegs[1m])) by (instance)",
                legendFormat="InSegs {{instance}}",
            ),
            d.Target(
                expr="sum(rate(node_netstat_Tcp_OutSegs[1m])) by (instance)",
                legendFormat="OutSegs {{instance}}",
            ),
            d.Target(
                expr="sum(rate(node_netstat_Tcp_RetransSegs[1m])) by (instance)",
                legendFormat="RetransSegs {{instance}}",
            ),
        ],
        yAxes=g.single_y_axis(format=g.SHORT_FORMAT, logBase=10),
    ),
]
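
# The lists above are plain lists of grafanalib panel objects. A minimal sketch of
# how a dashboard definition might consume them (illustrative assumption only; the
# actual *.dashboard.py files that import this module may group panels differently):
def _example_master_dashboard():
    return g.Dashboard(
        title="Master dashboard (example)",
        rows=[
            g.Row(title="API call latency", panels=API_CALL_LATENCY_PANELS),
            g.Row(title="API Priority and Fairness", panels=APF_PANELS),
            g.Row(title="Health", panels=HEALTH_PANELS),
            g.Row(title="etcd", panels=ETCD_PANELS),
            g.Row(title="kube-apiserver", panels=APISERVER_PANELS),
            g.Row(title="kube-controller-manager", panels=CONTROLLER_MANAGER_PANELS),
            g.Row(title="Master VM", panels=VM_PANELS),
        ],
    ).auto_panel_ids()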