// Source: github.com/thanos-io/thanos@v0.32.5/mixin/dashboards/receive.libsonnet
//
// Grafana dashboard definition for Thanos Receive, plus the rows this component
// contributes to the overview dashboard (`__overviewRows__`).
local g = import '../lib/thanos-grafana-builder/builder.libsonnet';
local utils = import '../lib/utils.libsonnet';


{
  local thanos = self,
  receive+:: {
    // Both fields default to an `error` expression, so they must be overridden
    // before the dashboard can be evaluated.
    selector: error 'must provide selector for Thanos Receive dashboard',
    title: error 'must provide title for Thanos Receive dashboard',
    dashboard:: {
      // Label matchers / grouping labels built from the shared `thanos.dashboard`
      // settings, extended with the `job` (and, for tenant variants, `tenant`) labels.
      selector: std.join(', ', thanos.dashboard.selector + ['job=~"$job"']),
      dimensions: std.join(', ', thanos.dashboard.dimensions + ['job']),
      tenantSelector: std.join(', ', thanos.dashboard.selector + ['job=~"$job"', 'tenant=~"$tenant"']),
      tenantDimensions: std.join(', ', thanos.dashboard.dimensions + ['job', 'tenant']),
    },
  },
  grafanaDashboards+:: {
    local grafana = import 'grafonnet/grafana.libsonnet',
    local template = grafana.template,
    // The dashboard is only emitted when the receive component is configured.
    [if thanos.receive != null then 'receive.json']:
      // Selectors for the HTTP receive handler and for the gRPC traffic split
      // into unary RemoteWrite, other unary calls, and server streams.
      local receiveHandlerSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'handler="receive"']);
      local grpcUnaryWriteSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="unary"', 'grpc_method="RemoteWrite"']);
      local grpcUnaryReadSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="unary"', 'grpc_method!="RemoteWrite"']);
      local grpcServerStreamSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="server_stream"']);

      // Tenant-scoped variants of the receive handler selector, optionally
      // restricted to 2XX or non-2XX HTTP status codes.
      local tenantReceiveHandlerSelector = utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'handler="receive"']);
      local tenantHttpCode2XXSelector = std.join(', ', [tenantReceiveHandlerSelector, 'code=~"2.."']);
      local tenantHttpCodeNot2XXSelector = std.join(', ', [tenantReceiveHandlerSelector, 'code!~"2.."']);

      local tenantWithHttpCodeDimensions = std.join(', ', ['tenant', 'code']);
      g.dashboard(thanos.receive.title) {
        templating+: {
          list+: [
            // `$tenant` template variable, populated from the non-empty `tenant`
            // label values of http_requests_total.
            template.new(
              'tenant',
              '$datasource',
              'label_values(http_requests_total{%s}, %s)' % [std.join(', ', [thanos.receive.dashboard.selector] + ['tenant!=""']), 'tenant'],
              label='tenant',
              refresh=1,
              sort=2,
              current='all',
              allValues=null,
              includeAll=true
            ),
          ],
        },
      }
      .addRow(
        g.row('WRITE - Incoming Request')
        .addPanel(
          g.panel('Rate', 'Shows rate of incoming requests.') +
          g.httpQpsPanel('http_requests_total', receiveHandlerSelector, thanos.receive.dashboard.dimensions)
        )
        .addPanel(
          g.panel('Errors', 'Shows ratio of errors compared to the total number of handled incoming requests.') +
          g.httpErrPanel('http_requests_total', receiveHandlerSelector, thanos.receive.dashboard.dimensions)
        )
        .addPanel(
          g.panel('Duration', 'Shows how long has it taken to handle incoming requests in quantiles.') +
          g.latencyPanel('http_request_duration_seconds', receiveHandlerSelector, thanos.receive.dashboard.dimensions)
        )
      )
      .addRow(
        g.row('WRITE - Incoming Request (tenant focus)')
        .addPanel(
          g.panel('Rate of write requests (by tenant and code)') +
          g.queryPanel(
            'sum by (%s) (rate(http_requests_total{%s}[$__rate_interval]))' % [tenantWithHttpCodeDimensions, tenantReceiveHandlerSelector],
            '{{code}} - {{tenant}}'
          )
        )
        .addPanel(
          g.panel('Number of errors (by tenant and code)') +
          g.queryPanel(
            'sum by (%s) (rate(http_requests_total{%s}[$__rate_interval]))' % [
              tenantWithHttpCodeDimensions,
              tenantHttpCodeNot2XXSelector,
            ],
            '{{code}} - {{tenant}}'
          )
        )
        .addPanel(
          g.panel('Average request duration (by tenant)') +
          g.queryPanel(
            // Average = rate(sum) / rate(count). Both sides must be rates over
            // the same window; dividing by the raw cumulative counter would
            // make the value shrink as the counter grows.
            'sum by (%s) (rate(http_request_duration_seconds_sum{%s}[$__rate_interval])) / sum by (%s) (rate(http_request_duration_seconds_count{%s}[$__rate_interval]))' % [
              thanos.receive.dashboard.tenantDimensions,
              tenantReceiveHandlerSelector,
              thanos.receive.dashboard.tenantDimensions,
              tenantReceiveHandlerSelector,
            ],
            '{{tenant}}'
          )
        )
      )
      .addRow(
        g.row('HTTP requests (tenant focus)')
        .addPanel(
          g.panel('Average successful HTTP request size (per tenant and code, only 2XX)') +
          g.queryPanel(
            'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$__rate_interval])) / sum by (%s) (rate(http_request_size_bytes_count{%s}[$__rate_interval]))' % [
              thanos.receive.dashboard.tenantDimensions,
              tenantHttpCode2XXSelector,
              thanos.receive.dashboard.tenantDimensions,
              tenantHttpCode2XXSelector,
            ],
            '{{tenant}}'
          )
        )
        .addPanel(
          g.panel('Average failed HTTP request size (per tenant and code, non 2XX)') +
          g.queryPanel(
            'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$__rate_interval])) / sum by (%s) (rate(http_request_size_bytes_count{%s}[$__rate_interval]))' % [
              thanos.receive.dashboard.tenantDimensions,
              tenantHttpCodeNot2XXSelector,
              thanos.receive.dashboard.tenantDimensions,
              tenantHttpCodeNot2XXSelector,
            ],
            '{{tenant}}'
          )
        )
        .addPanel(
          g.panel('Inflight requests (per tenant and method)') +
          g.queryPanel(
            'sum by (%s) (http_inflight_requests{%s})' % [
              std.join(', ', [thanos.receive.dashboard.tenantDimensions, 'method']),
              tenantReceiveHandlerSelector,
            ],
            '{{method}} - {{tenant}}'
          )
        )
      )
      .addRow(
        g.row('Series & Samples (tenant focus)')
        .addPanel(
          g.panel('Rate of series received (per tenant, only 2XX)') +
          g.queryPanel(
            'sum(rate(thanos_receive_write_timeseries_sum{%s}[$__rate_interval])) by (%s) ' % [
              utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'code=~"2.."']),
              thanos.receive.dashboard.tenantDimensions,
            ],
            '{{tenant}}'
          )
        )
        .addPanel(
          g.panel('Rate of series not written (per tenant and code, non 2XX)') +
          g.queryPanel(
            'sum(rate(thanos_receive_write_timeseries_sum{%s}[$__rate_interval])) by (%s) ' % [
              utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'code!~"2.."']),
              tenantWithHttpCodeDimensions,
            ],
            '{{code}} - {{tenant}}'
          )
        )
        .addPanel(
          g.panel('Rate of samples received (per tenant, only 2XX)') +
          g.queryPanel(
            'sum(rate(thanos_receive_write_samples_sum{%s}[$__rate_interval])) by (%s) ' % [
              utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'code=~"2.."']),
              thanos.receive.dashboard.tenantDimensions,
            ],
            '{{tenant}}'
          )
        )
        .addPanel(
          g.panel('Rate of samples not written (per tenant and code, non 2XX)') +
          g.queryPanel(
            'sum(rate(thanos_receive_write_samples_sum{%s}[$__rate_interval])) by (%s) ' % [
              utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'code!~"2.."']),
              tenantWithHttpCodeDimensions,
            ],
            '{{code}} - {{tenant}}'
          )
        )
      )
      .addRow(
        g.row('WRITE - Replication')
        .addPanel(
          g.panel('Rate', 'Shows rate of replications to other receive nodes.') +
          g.queryPanel(
            'sum by (%s) (rate(thanos_receive_replications_total{%s}[$__rate_interval]))' % [thanos.receive.dashboard.dimensions, thanos.receive.dashboard.selector],
            'all {{job}}',
          )
        )
        .addPanel(
          g.panel('Errors', 'Shows ratio of errors compared to the total number of replications to other receive nodes.') +
          g.qpsErrTotalPanel(
            'thanos_receive_replications_total{%s}' % utils.joinLabels([thanos.receive.dashboard.selector, 'result="error"']),
            'thanos_receive_replications_total{%s}' % thanos.receive.dashboard.selector,
            thanos.receive.dashboard.dimensions
          )
        )
      )
      .addRow(
        g.row('WRITE - Forward Request')
        .addPanel(
          g.panel('Rate', 'Shows rate of forwarded requests to other receive nodes.') +
          g.queryPanel(
            'sum by (%s) (rate(thanos_receive_forward_requests_total{%s}[$__rate_interval]))' % [thanos.receive.dashboard.dimensions, thanos.receive.dashboard.selector],
            'all {{job}}',
          )
        )
        .addPanel(
          g.panel('Errors', 'Shows ratio of errors compared to the total number of forwarded requests to other receive nodes.') +
          g.qpsErrTotalPanel(
            'thanos_receive_forward_requests_total{%s}' % utils.joinLabels([thanos.receive.dashboard.selector, 'result="error"']),
            'thanos_receive_forward_requests_total{%s}' % thanos.receive.dashboard.selector,
            thanos.receive.dashboard.dimensions
          )
        )
      )
      .addRow(
        // TODO(https://github.com/thanos-io/thanos/issues/3926)
        // NOTE(review): the descriptions in this row mention queriers although
        // the selector targets grpc_method="RemoteWrite" - confirm the wording.
        g.row('WRITE - gRPC (Unary)')
        .addPanel(
          g.panel('Rate', 'Shows rate of handled Unary gRPC requests from queriers.') +
          g.grpcRequestsPanel('grpc_server_handled_total', grpcUnaryWriteSelector, thanos.receive.dashboard.dimensions)
        )
        .addPanel(
          g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') +
          g.grpcErrorsPanel('grpc_server_handled_total', grpcUnaryWriteSelector, thanos.receive.dashboard.dimensions)
        )
        .addPanel(
          g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') +
          g.latencyPanel('grpc_server_handling_seconds', grpcUnaryWriteSelector, thanos.receive.dashboard.dimensions)
        )
      )
      .addRow(
        // TODO(https://github.com/thanos-io/thanos/issues/3926)
        g.row('READ - gRPC (Unary)')
        .addPanel(
          g.panel('Rate', 'Shows rate of handled Unary gRPC requests from queriers.') +
          g.grpcRequestsPanel('grpc_server_handled_total', grpcUnaryReadSelector, thanos.receive.dashboard.dimensions)
        )
        .addPanel(
          g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') +
          g.grpcErrorsPanel('grpc_server_handled_total', grpcUnaryReadSelector, thanos.receive.dashboard.dimensions)
        )
        .addPanel(
          g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') +
          g.latencyPanel('grpc_server_handling_seconds', grpcUnaryReadSelector, thanos.receive.dashboard.dimensions)
        )
      )
      .addRow(
        // TODO(https://github.com/thanos-io/thanos/issues/3926)
        g.row('READ - gRPC (Stream)')
        .addPanel(
          g.panel('Rate', 'Shows rate of handled Streamed gRPC requests from queriers.') +
          g.grpcRequestsPanel('grpc_server_handled_total', grpcServerStreamSelector, thanos.receive.dashboard.dimensions)
        )
        .addPanel(
          g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') +
          g.grpcErrorsPanel('grpc_server_handled_total', grpcServerStreamSelector, thanos.receive.dashboard.dimensions)
        )
        .addPanel(
          g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') +
          g.latencyPanel('grpc_server_handling_seconds', grpcServerStreamSelector, thanos.receive.dashboard.dimensions)
        )
      )
      .addRow(
        g.row('Last Updated')
        .addPanel(
          g.panel('Successful Upload', 'Shows the relative time of last successful upload to the object-store bucket.') +
          g.tablePanel(
            // Seconds elapsed since the most recent successful upload, per bucket.
            ['time() - max by (%s) (thanos_objstore_bucket_last_successful_upload_time{%s})' % [utils.joinLabels([thanos.receive.dashboard.dimensions, 'bucket']), thanos.receive.dashboard.selector]],
            {
              Value: {
                alias: 'Uploaded Ago',
                unit: 's',
                type: 'number',
              },
            },
          )
        )
      )
      .addRow(
        g.resourceUtilizationRow(thanos.receive.dashboard.selector, thanos.receive.dashboard.dimensions)
      ),

    // Rows contributed to the overview dashboard; empty when receive is disabled.
    // These use the overview selector/dimensions and link to the receive dashboard.
    __overviewRows__+:: if thanos.receive == null then [] else [
      g.row('Receive')
      .addPanel(
        g.panel('Incoming Requests Rate', 'Shows rate of incoming requests.') +
        g.httpQpsPanel('http_requests_total', utils.joinLabels([thanos.dashboard.overview.selector, 'handler="receive"']), thanos.dashboard.overview.dimensions) +
        g.addDashboardLink(thanos.receive.title)
      )
      .addPanel(
        g.panel('Incoming Requests Errors', 'Shows ratio of errors compared to the total number of handled incoming requests.') +
        g.httpErrPanel('http_requests_total', utils.joinLabels([thanos.dashboard.overview.selector, 'handler="receive"']), thanos.dashboard.overview.dimensions) +
        g.addDashboardLink(thanos.receive.title)
      )
      .addPanel(
        g.sloLatency(
          'Incoming Requests Latency 99th Percentile',
          'Shows how long has it taken to handle incoming requests.',
          'http_request_duration_seconds_bucket{%s}' % utils.joinLabels([thanos.dashboard.overview.selector, 'handler="receive"']),
          thanos.dashboard.overview.dimensions,
          0.99,
          0.5,
          1
        ) +
        g.addDashboardLink(thanos.receive.title)
      ),
    ],
  },
}