github.com/thanos-io/thanos@v0.32.5/mixin/dashboards/rule.libsonnet

local g = import '../lib/thanos-grafana-builder/builder.libsonnet';
local utils = import '../lib/utils.libsonnet';

{
  local thanos = self,
  // Per-dashboard config; selector and title must be supplied by the enclosing mixin config.
  rule+:: {
    selector: error 'must provide selector for Thanos Rule dashboard',
    title: error 'must provide title for Thanos Rule dashboard',
    // Label selectors and aggregation dimensions used by the queries below.
    dashboard:: {
      selector: std.join(', ', thanos.dashboard.selector + ['job=~"$job"']),
      dimensions: std.join(', ', thanos.dashboard.dimensions + ['job']),
      ruleGroupDimensions: std.join(', ', thanos.dashboard.dimensions + ['job', 'rule_group', 'strategy']),
    },
  },
  // The full Thanos Rule dashboard, registered as 'rule.json' when rule config is present.
  grafanaDashboards+:: {
    [if thanos.rule != null then 'rule.json']:
      local grpcUnarySelector = utils.joinLabels([thanos.rule.dashboard.selector, 'grpc_type="unary"']);
      local grpcServerStreamSelector = utils.joinLabels([thanos.rule.dashboard.selector, 'grpc_type="server_stream"']);

      g.dashboard(thanos.rule.title)
      .addRow(
        g.row('Rule Group Evaluations')
        .addPanel(
          g.panel('Rule Group Evaluations') +
          g.queryPanel(
            'sum by (%(ruleGroupDimensions)s) (rate(prometheus_rule_evaluations_total{%(selector)s}[$__rate_interval]))' % thanos.rule.dashboard,
            '{{ rule_group }} {{ strategy }}',
          )
        )
        .addPanel(
          g.panel('Rule Group Evaluations Failed') +
          g.queryPanel(
            'sum by (%(ruleGroupDimensions)s) (rate(prometheus_rule_evaluation_failures_total{%(selector)s}[$__rate_interval]))' % thanos.rule.dashboard,
            '{{ rule_group }} {{ strategy }}',
          )
        )
        .addPanel(
          g.panel('Rule Group Evaluations Missed') +
          g.queryPanel(
            'sum by (%(ruleGroupDimensions)s) (increase(prometheus_rule_group_iterations_missed_total{%(selector)s}[$__rate_interval]))' % thanos.rule.dashboard,
            '{{ rule_group }} {{ strategy }}',
          )
        )
        .addPanel(
          g.panel('Rule Group Evaluations Too Slow') +
          g.queryPanel(
            |||
              (
                sum by(%(dimensions)s, rule_group) (prometheus_rule_group_last_duration_seconds{%(selector)s})
                >
                sum by(%(dimensions)s, rule_group) (prometheus_rule_group_interval_seconds{%(selector)s})
              )
            ||| % thanos.rule.dashboard,
            '{{ rule_group }}',
          )
        )
      )
      .addRow(
        g.row('Alert Sent')
        .addPanel(
          g.panel('Dropped Rate', 'Shows rate of dropped alerts.') +
          g.queryPanel(
            'sum by (%(dimensions)s, alertmanager) (rate(thanos_alert_sender_alerts_dropped_total{%s}[$__rate_interval]))' % [thanos.rule.dashboard.dimensions, thanos.rule.dashboard.selector],
            '{{alertmanager}}'
          )
        )
        .addPanel(
          g.panel('Sent Rate', 'Shows rate of alerts successfully sent to Alertmanager.') +
          g.queryPanel(
            'sum by (%(dimensions)s, alertmanager) (rate(thanos_alert_sender_alerts_sent_total{%s}[$__rate_interval]))' % [thanos.rule.dashboard.dimensions, thanos.rule.dashboard.selector],
            '{{alertmanager}}'
          ) +
          g.stack
        )
        .addPanel(
          g.panel('Sent Errors', 'Shows ratio of errors compared to the total number of sent alerts.') +
          g.qpsErrTotalPanel(
            'thanos_alert_sender_errors_total{%s}' % thanos.rule.dashboard.selector,
            'thanos_alert_sender_alerts_sent_total{%s}' % thanos.rule.dashboard.selector,
            thanos.rule.dashboard.dimensions
          )
        )
        .addPanel(
          g.panel('Sent Duration', 'Shows how long it has taken to send alerts to Alertmanager.') +
          g.latencyPanel('thanos_alert_sender_latency_seconds', thanos.rule.dashboard.selector, thanos.rule.dashboard.dimensions),
        )
      )
      .addRow(
        g.row('Alert Queue')
        .addPanel(
          g.panel('Push Rate', 'Shows rate of queued alerts.') +
          g.queryPanel(
            'sum by (%s) (rate(thanos_alert_queue_alerts_dropped_total{%s}[$__rate_interval]))' % [thanos.rule.dashboard.dimensions, thanos.rule.dashboard.selector],
            '{{job}}'
          )
        )
        .addPanel(
          g.panel('Drop Ratio', 'Shows ratio of dropped alerts compared to the total number of queued alerts.') +
          g.qpsErrTotalPanel(
            'thanos_alert_queue_alerts_dropped_total{%s}' % thanos.rule.dashboard.selector,
            'thanos_alert_queue_alerts_pushed_total{%s}' % thanos.rule.dashboard.selector,
            thanos.rule.dashboard.dimensions
          )
        )
      )
      .addRow(
        g.row('gRPC (Unary)')
        .addPanel(
          g.panel('Rate', 'Shows rate of handled Unary gRPC requests.') +
          g.grpcRequestsPanel('grpc_server_handled_total', grpcUnarySelector, thanos.rule.dashboard.dimensions)
        )
        .addPanel(
          g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests.') +
          g.grpcErrorsPanel('grpc_server_handled_total', grpcUnarySelector, thanos.rule.dashboard.dimensions)
        )
        .addPanel(
          g.panel('Duration', 'Shows how long it has taken to handle requests, in quantiles.') +
          g.latencyPanel('grpc_server_handling_seconds', grpcUnarySelector, thanos.rule.dashboard.dimensions)
        )
      )
      .addRow(
        g.row('gRPC (Stream)')
        .addPanel(
          g.panel('Rate', 'Shows rate of handled Streamed gRPC requests.') +
          g.grpcRequestsPanel('grpc_server_handled_total', grpcServerStreamSelector, thanos.rule.dashboard.dimensions)
        )
        .addPanel(
          g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests.') +
          g.grpcErrorsPanel('grpc_server_handled_total', grpcServerStreamSelector, thanos.rule.dashboard.dimensions)
        )
        .addPanel(
          g.panel('Duration', 'Shows how long it has taken to handle requests, in quantiles.') +
          g.latencyPanel('grpc_server_handling_seconds', grpcServerStreamSelector, thanos.rule.dashboard.dimensions)
        )
      )
      .addRow(
        g.resourceUtilizationRow(thanos.rule.dashboard.selector, thanos.rule.dashboard.dimensions)
      ),

    // Rows merged into the Thanos overview dashboard.
    __overviewRows__+:: if thanos.rule == null then [] else [
      g.row('Rule')
      .addPanel(
        g.panel('Alert Sent Rate', 'Shows rate of alerts successfully sent to Alertmanager.') +
        g.queryPanel(
          'sum by (%s) (rate(thanos_alert_sender_alerts_sent_total{%s}[$__rate_interval]))' % [utils.joinLabels([thanos.dashboard.overview.dimensions, 'alertmanager']), thanos.dashboard.overview.selector],
          '{{alertmanager}}'
        ) +
        g.addDashboardLink(thanos.rule.title) +
        g.stack
      )
      .addPanel(
        g.panel('Alert Sent Errors', 'Shows ratio of errors compared to the total number of sent alerts.') +
        g.qpsErrTotalPanel(
          'thanos_alert_sender_errors_total{%s}' % thanos.dashboard.overview.selector,
          'thanos_alert_sender_alerts_sent_total{%s}' % thanos.dashboard.overview.selector,
          thanos.dashboard.overview.dimensions
        ) +
        g.addDashboardLink(thanos.rule.title)
      )
      .addPanel(
        g.sloLatency(
          'Alert Sent Duration',
          'Shows how long it has taken to send alerts to Alertmanager.',
          'thanos_alert_sender_latency_seconds_bucket{%s}' % thanos.dashboard.overview.selector,
          thanos.dashboard.overview.dimensions,
          0.99,
          0.5,
          1
        ) +
        g.addDashboardLink(thanos.rule.title)
      ) +
      g.collapse,
    ],
  },
}
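A minimal sketch of rendering this dashboard on its own, assuming the conventional mixin entry point (mixin/mixin.libsonnet) wires in the config that supplies thanos.dashboard.*; the file name, import path, and the selector/title values below are illustrative placeholders for the required fields declared at the top of this file, not part of the repository:

// render-rule-dashboard.jsonnet -- illustrative sketch, not part of the mixin.
// The rule+:: override satisfies the `error` defaults declared in rule.libsonnet;
// in the real repository these values come from the mixin's own config.
local mixin = (import 'mixin/mixin.libsonnet') {
  rule+:: {
    selector: 'job=~".*thanos-rule.*"',  // placeholder selector
    title: 'Thanos / Rule',              // placeholder dashboard title
  },
};

// 'rule.json' is the key this file registers under grafanaDashboards
// when thanos.rule is non-null.
mixin.grafanaDashboards['rule.json']

Evaluated with something like: jsonnet -J . render-rule-dashboard.jsonnet > rule.json (the library search path depends on how the bundled lib/ directories are laid out).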