// Thanos Store dashboard mixin.
// Source: github.com/thanos-io/thanos@v0.32.5/mixin/dashboards/store.libsonnet
//
// Contributes three things to the object it is mixed into:
//   * `store`                    - hidden config (selector/title must be provided by the user).
//   * `grafanaDashboards`        - the 'store.json' dashboard and the Store overview row.
//   * `latencyByOperationPanel`  - hidden helper building a P99/mean/P50 panel split by `operation`.
local g = import '../lib/thanos-grafana-builder/builder.libsonnet';
local utils = import '../lib/utils.libsonnet';

{
  local thanos = self,
  store+:: {
    // Both fields raise an error when evaluated unless the consumer overrides them.
    selector: error 'must provide selector for Thanos Store dashboard',
    title: error 'must provide title for Thanos Store dashboard',
    dashboard:: {
      // Global dashboard selector/dimensions extended with the `job` template variable / label.
      selector: std.join(', ', thanos.dashboard.selector + ['job=~"$job"']),
      dimensions: std.join(', ', thanos.dashboard.dimensions + ['job']),
    },
  },
  grafanaDashboards+:: {
    // The dashboard is only emitted when `thanos.store` is not null.
    [if thanos.store != null then 'store.json']:
      local grpcUnarySelector = utils.joinLabels([thanos.store.dashboard.selector, 'grpc_type="unary"']);
      local grpcServerStreamSelector = utils.joinLabels([thanos.store.dashboard.selector, 'grpc_type="server_stream"']);
      local dataSizeDimensions = utils.joinLabels([thanos.store.dashboard.dimensions, 'data_type']);
      // Grouping used for the per-data_type quantile queries: the legend labels plus the
      // histogram bucket boundary label `le` required by histogram_quantile.
      local dataSizeBucketDimensions = utils.joinLabels([dataSizeDimensions, 'le']);

      g.dashboard(thanos.store.title)
      .addRow(
        g.row('gRPC (Unary)')
        .addPanel(
          g.panel('Rate', 'Shows rate of handled Unary gRPC requests from queriers.') +
          g.grpcRequestsPanel('grpc_server_handled_total', grpcUnarySelector, thanos.store.dashboard.dimensions)
        )
        .addPanel(
          g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') +
          g.grpcErrorsPanel('grpc_server_handled_total', grpcUnarySelector, thanos.store.dashboard.dimensions)
        )
        .addPanel(
          g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') +
          g.latencyPanel('grpc_server_handling_seconds', grpcUnarySelector, thanos.store.dashboard.dimensions)
        )
      )
      .addRow(
        g.row('gRPC (Stream)')
        .addPanel(
          g.panel('Rate', 'Shows rate of handled Streamed gRPC requests from queriers.') +
          g.grpcRequestsPanel('grpc_server_handled_total', grpcServerStreamSelector, thanos.store.dashboard.dimensions)
        )
        .addPanel(
          g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') +
          g.grpcErrorsPanel('grpc_server_handled_total', grpcServerStreamSelector, thanos.store.dashboard.dimensions)
        )
        .addPanel(
          g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') +
          g.latencyPanel('grpc_server_handling_seconds', grpcServerStreamSelector, thanos.store.dashboard.dimensions)
        )
      )
      .addRow(
        g.row('Bucket Operations')
        .addPanel(
          g.panel('Rate', 'Shows rate of execution for operations against the bucket.') +
          g.queryPanel(
            'sum by (%s) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'operation']), thanos.store.dashboard.selector],
            '{{job}} {{operation}}'
          ) +
          g.stack
        )
        .addPanel(
          g.panel('Errors', 'Shows ratio of errors compared to the total number of executed operations against the bucket.') +
          g.queryPanel(
            // The format arguments are the dashboard config with `dimensions` overridden to
            // additionally group by `operation`.
            'sum by (%(dimensions)s) (rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[$__rate_interval])) / sum by (%(dimensions)s) (rate(thanos_objstore_bucket_operations_total{%(selector)s}[$__rate_interval]))' % thanos.store.dashboard { dimensions: utils.joinLabels([thanos.store.dashboard.dimensions, 'operation']) },
            '{{job}} {{operation}}'
          ) +
          { yaxes: g.yaxes({ format: 'percentunit' }) } +
          g.stack,
        )
        .addPanel(
          g.panel('Duration', 'Shows how long has it taken to execute operations against the bucket, in quantiles.') +
          $.latencyByOperationPanel('thanos_objstore_bucket_operation_duration_seconds', thanos.store.dashboard.selector, thanos.store.dashboard.dimensions)
        )
      )
      .addRow(
        g.row('Block Operations')
        .addPanel(
          g.panel('Block Load Rate', 'Shows rate of block loads from the bucket.') +
          g.queryPanel(
            'sum by (%s) (rate(thanos_bucket_store_block_loads_total{%s}[$__rate_interval]))' % [thanos.store.dashboard.dimensions, thanos.store.dashboard.selector],
            'block loads'
          ) +
          g.stack
        )
        .addPanel(
          g.panel('Block Load Errors', 'Shows ratio of errors compared to the total number of block loads from the bucket.') +
          g.qpsErrTotalPanel(
            'thanos_bucket_store_block_load_failures_total{%s}' % thanos.store.dashboard.selector,
            'thanos_bucket_store_block_loads_total{%s}' % thanos.store.dashboard.selector,
            thanos.store.dashboard.dimensions
          )
        )
        .addPanel(
          g.panel('Block Drop Rate', 'Shows rate of block drops.') +
          g.queryPanel(
            'sum by (%s) (rate(thanos_bucket_store_block_drops_total{%s}[$__rate_interval]))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'operation']), thanos.store.dashboard.selector],
            'block drops {{job}}'
          ) +
          g.stack
        )
        .addPanel(
          g.panel('Block Drop Errors', 'Shows ratio of errors compared to the total number of block drops.') +
          g.qpsErrTotalPanel(
            'thanos_bucket_store_block_drop_failures_total{%s}' % thanos.store.dashboard.selector,
            'thanos_bucket_store_block_drops_total{%s}' % thanos.store.dashboard.selector,
            thanos.store.dashboard.dimensions
          )
        )
      )
      .addRow(
        g.row('Cache Operations')
        .addPanel(
          g.panel('Requests', 'Show rate of cache requests.') +
          g.queryPanel(
            'sum by (%s) (rate(thanos_store_index_cache_requests_total{%s}[$__rate_interval]))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'item_type']), thanos.store.dashboard.selector],
            '{{job}} {{item_type}}',
          ) +
          g.stack
        )
        .addPanel(
          // The query below is a plain rate of cache hits, not an error ratio; the
          // description states that.
          g.panel('Hits', 'Show rate of cache hits.') +
          g.queryPanel(
            'sum by (%s) (rate(thanos_store_index_cache_hits_total{%s}[$__rate_interval]))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'item_type']), thanos.store.dashboard.selector],
            '{{job}} {{item_type}}',
          ) +
          g.stack
        )
        .addPanel(
          g.panel('Added', 'Show rate of added items to cache.') +
          g.queryPanel(
            'sum by (%s) (rate(thanos_store_index_cache_items_added_total{%s}[$__rate_interval]))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'item_type']), thanos.store.dashboard.selector],
            '{{job}} {{item_type}}',
          ) +
          g.stack
        )
        .addPanel(
          g.panel('Evicted', 'Show rate of evicted items from cache.') +
          g.queryPanel(
            'sum by (%s) (rate(thanos_store_index_cache_items_evicted_total{%s}[$__rate_interval]))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'item_type']), thanos.store.dashboard.selector],
            '{{job}} {{item_type}}',
          ) +
          g.stack
        )
      )
      .addRow(
        g.row('Store Sent')
        .addPanel(
          g.panel('Chunk Size', 'Shows size of chunks that have sent to the bucket.') +
          g.queryPanel(
            [
              'histogram_quantile(0.99, sum by (%s) (rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{%s}[$__rate_interval])))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'le']), thanos.store.dashboard.selector],
              'sum by (%(dimensions)s) (rate(thanos_bucket_store_sent_chunk_size_bytes_sum{%(selector)s}[$__rate_interval])) / sum by (%(dimensions)s) (rate(thanos_bucket_store_sent_chunk_size_bytes_count{%(selector)s}[$__rate_interval]))' % thanos.store.dashboard,
              'histogram_quantile(0.50, sum by (%s) (rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{%s}[$__rate_interval])))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'le']), thanos.store.dashboard.selector],
            ],
            [
              'P99',
              'mean',
              'P50',
            ],
          ) +
          { yaxes: g.yaxes('bytes') }
        ),
      )
      .addRow(
        // All quantile queries in this row read the `_bucket` series: histogram_quantile
        // needs the `le` label, which the queries aggregate by.
        g.row('Series Operations')
        .addPanel(
          g.panel('Block queried') +
          g.queryPanel(
            [
              'histogram_quantile(0.99, sum by (le) (rate(thanos_bucket_store_series_blocks_queried_bucket{%s}[$__rate_interval])))' % thanos.store.dashboard.selector,
              'sum by (%(dimensions)s) (rate(thanos_bucket_store_series_blocks_queried_sum{%(selector)s}[$__rate_interval])) / sum by (%(dimensions)s) (rate(thanos_bucket_store_series_blocks_queried_count{%(selector)s}[$__rate_interval]))' % thanos.store.dashboard,
              'histogram_quantile(0.50, sum by (le) (rate(thanos_bucket_store_series_blocks_queried_bucket{%s}[$__rate_interval])))' % thanos.store.dashboard.selector,
            ], [
              'P99',
              'mean {{job}}',
              'P50',
            ],
          )
        )
        .addPanel(
          g.panel('Data Fetched', 'Show the size of data fetched') +
          g.queryPanel(
            [
              // Quantiles keep `data_type` and the dashboard dimensions so the
              // '{{data_type}} / {{job}}' legends below have labels to render.
              'histogram_quantile(0.99, sum by (%s) (rate(thanos_bucket_store_series_data_fetched_bucket{%s}[$__rate_interval])))' % [dataSizeBucketDimensions, thanos.store.dashboard.selector],
              'sum by (%s) (rate(thanos_bucket_store_series_data_fetched_sum{%s}[$__rate_interval])) / sum by (%s) (rate(thanos_bucket_store_series_data_fetched_count{%s}[$__rate_interval]))' % [dataSizeDimensions, thanos.store.dashboard.selector, dataSizeDimensions, thanos.store.dashboard.selector],
              'histogram_quantile(0.50, sum by (%s) (rate(thanos_bucket_store_series_data_fetched_bucket{%s}[$__rate_interval])))' % [dataSizeBucketDimensions, thanos.store.dashboard.selector],
            ], [
              'P99: {{data_type}} / {{job}}',
              'mean: {{data_type}} / {{job}}',
              'P50: {{data_type}} / {{job}}',
            ],
          ) +
          { yaxes: g.yaxes('bytes') }
        )
        .addPanel(
          g.panel('Data Touched', 'Show the size of data touched') +
          g.queryPanel(
            [
              // Same grouping rationale as the 'Data Fetched' panel.
              'histogram_quantile(0.99, sum by (%s) (rate(thanos_bucket_store_series_data_touched_bucket{%s}[$__rate_interval])))' % [dataSizeBucketDimensions, thanos.store.dashboard.selector],
              'sum by (%s) (rate(thanos_bucket_store_series_data_touched_sum{%s}[$__rate_interval])) / sum by (%s) (rate(thanos_bucket_store_series_data_touched_count{%s}[$__rate_interval]))' % [dataSizeDimensions, thanos.store.dashboard.selector, dataSizeDimensions, thanos.store.dashboard.selector],
              'histogram_quantile(0.50, sum by (%s) (rate(thanos_bucket_store_series_data_touched_bucket{%s}[$__rate_interval])))' % [dataSizeBucketDimensions, thanos.store.dashboard.selector],
            ], [
              'P99: {{data_type}} / {{job}}',
              'mean: {{data_type}} / {{job}}',
              'P50: {{data_type}} / {{job}}',
            ],
          ) +
          { yaxes: g.yaxes('bytes') }
        )
        .addPanel(
          g.panel('Result series') +
          g.queryPanel(
            [
              'histogram_quantile(0.99, sum by (le) (rate(thanos_bucket_store_series_result_series_bucket{%s}[$__rate_interval])))' % thanos.store.dashboard.selector,
              'sum by (%(dimensions)s) (rate(thanos_bucket_store_series_result_series_sum{%(selector)s}[$__rate_interval])) / sum by (%(dimensions)s) (rate(thanos_bucket_store_series_result_series_count{%(selector)s}[$__rate_interval]))' % thanos.store.dashboard,
              'histogram_quantile(0.50, sum by (le) (rate(thanos_bucket_store_series_result_series_bucket{%s}[$__rate_interval])))' % thanos.store.dashboard.selector,
            ], [
              'P99',
              'mean {{job}}',
              'P50',
            ],
          )
        )
      )
      .addRow(
        g.row('Series Operation Durations')
        .addPanel(
          g.panel('Get All', 'Shows how long has it taken to get all series.') +
          g.latencyPanel('thanos_bucket_store_series_get_all_duration_seconds', thanos.store.dashboard.selector, thanos.store.dashboard.dimensions)
        )
        .addPanel(
          g.panel('Merge', 'Shows how long has it taken to merge series.') +
          g.latencyPanel('thanos_bucket_store_series_merge_duration_seconds', thanos.store.dashboard.selector, thanos.store.dashboard.dimensions)
        )
        .addPanel(
          g.panel('Gate', 'Shows how long has it taken for a series to wait at the gate.') +
          g.latencyPanel('thanos_bucket_store_series_gate_duration_seconds', thanos.store.dashboard.selector, thanos.store.dashboard.dimensions)
        )
      )
      .addRow(
        g.resourceUtilizationRow(thanos.store.dashboard.selector, thanos.store.dashboard.dimensions)
      ),

    // Row appended to the overview rows; empty when `thanos.store` is null.
    // Uses the overview selector/dimensions rather than the store dashboard ones, and
    // every panel links to the Store dashboard by title.
    __overviewRows__+:: if thanos.store == null then [] else [
      g.row('Store')
      .addPanel(
        g.panel('gRPC (Unary) Rate', 'Shows rate of handled Unary gRPC requests from queriers.') +
        g.grpcRequestsPanel('grpc_server_handled_total', utils.joinLabels([thanos.dashboard.overview.selector, 'grpc_type="unary"']), thanos.dashboard.overview.dimensions) +
        g.addDashboardLink(thanos.store.title)
      )
      .addPanel(
        g.panel('gRPC (Unary) Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') +
        g.grpcErrorsPanel('grpc_server_handled_total', utils.joinLabels([thanos.dashboard.overview.selector, 'grpc_type="unary"']), thanos.dashboard.overview.dimensions) +
        g.addDashboardLink(thanos.store.title)
      )
      .addPanel(
        g.sloLatency(
          'gRPC Latency 99th Percentile',
          'Shows how long has it taken to handle requests from queriers.',
          'grpc_server_handling_seconds_bucket{%s}' % utils.joinLabels([thanos.dashboard.overview.selector, 'grpc_type="unary"']),
          thanos.dashboard.overview.dimensions,
          0.99,
          0.5,
          1
        ) +
        g.addDashboardLink(thanos.store.title)
      ),
    ],
  },

  // Builds the panel fields (targets, y-axes, null handling) for a latency histogram
  // broken down by the `operation` label.
  //
  // Parameters:
  //   metricName - histogram base name; `_bucket`, `_sum` and `_count` are appended.
  //   selector   - label matchers placed inside `{...}` of every query.
  //   dimensions - comma-separated labels to group by, in addition to `operation`.
  //   multiplier - string spliced into each expression as a multiplication factor
  //                (default '1').
  //
  // Produces three targets: P99 (refId A), mean (refId B) and P50 (refId C). Legends
  // include `{{operation}}` because every query groups by it; without it all
  // operations of a job would share one legend entry.
  latencyByOperationPanel(metricName, selector, dimensions, multiplier='1'):: {
    local params = { metricName: metricName, selector: selector, multiplier: multiplier, dimensions: dimensions },

    nullPointMode: 'null as zero',
    targets: [
      {
        expr: 'histogram_quantile(0.99, sum by (%(dimensions)s, operation, le) (rate(%(metricName)s_bucket{%(selector)s}[$__rate_interval]))) * %(multiplier)s' % params,
        format: 'time_series',
        intervalFactor: 2,
        legendFormat: 'P99 {{job}} {{operation}}',
        refId: 'A',
        step: 10,
      },
      {
        expr: 'sum by (%(dimensions)s, operation) (rate(%(metricName)s_sum{%(selector)s}[$__rate_interval])) * %(multiplier)s / sum by (%(dimensions)s, operation) (rate(%(metricName)s_count{%(selector)s}[$__rate_interval]))' % params,
        format: 'time_series',
        intervalFactor: 2,
        legendFormat: 'mean {{job}} {{operation}}',
        refId: 'B',
        step: 10,
      },
      {
        expr: 'histogram_quantile(0.50, sum by (%(dimensions)s, operation, le) (rate(%(metricName)s_bucket{%(selector)s}[$__rate_interval]))) * %(multiplier)s' % params,
        format: 'time_series',
        intervalFactor: 2,
        legendFormat: 'P50 {{job}} {{operation}}',
        refId: 'C',
        step: 10,
      },
    ],
    yaxes: g.yaxes('s'),
  },
}