github.com/thanos-io/thanos@v0.32.5/mixin/dashboards/store.libsonnet (about)

     1  local g = import '../lib/thanos-grafana-builder/builder.libsonnet';
     2  local utils = import '../lib/utils.libsonnet';
     3  
     4  {
     5    local thanos = self,
     6    store+:: {  // Hidden (::) and mergeable (+) settings object for the Thanos Store dashboard.
     7      selector: error 'must provide selector for Thanos Store dashboard',  // Placeholder: evaluating it raises this error until it is overridden.
     8      title: error 'must provide title for Thanos Store dashboard',  // Placeholder: evaluating it raises this error until it is overridden.
     9      dashboard:: {  // Hidden values derived from the shared thanos.dashboard settings.
    10        selector: std.join(', ', thanos.dashboard.selector + ['job=~"$job"']),  // Shared selector list plus a $job matcher, joined with ', '.
    11        dimensions: std.join(', ', thanos.dashboard.dimensions + ['job']),  // Shared dimension list plus 'job', joined with ', '.
    12      },
    13    },
    14    grafanaDashboards+:: {
    15      [if thanos.store != null then 'store.json']:
    16        local grpcUnarySelector = utils.joinLabels([thanos.store.dashboard.selector, 'grpc_type="unary"']);  // Store selector combined with a grpc_type="unary" matcher via utils.joinLabels.
    17        local grpcServerStreamSelector = utils.joinLabels([thanos.store.dashboard.selector, 'grpc_type="server_stream"']);  // Store selector combined with a grpc_type="server_stream" matcher.
    18        local dataSizeDimensions = utils.joinLabels([thanos.store.dashboard.dimensions, 'data_type']);  // Store dimensions combined with the data_type label; used by the data fetched/touched panels.
    19  
    20        g.dashboard(thanos.store.title)
    21        .addRow(
    22          g.row('gRPC (Unary)')
    23          .addPanel(
    24            g.panel('Rate', 'Shows rate of handled Unary gRPC requests from queriers.') +
    25            g.grpcRequestsPanel('grpc_server_handled_total', grpcUnarySelector, thanos.store.dashboard.dimensions)
    26          )
    27          .addPanel(
    28            g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') +
    29            g.grpcErrorsPanel('grpc_server_handled_total', grpcUnarySelector, thanos.store.dashboard.dimensions)
    30          )
    31          .addPanel(
    32            g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') +
    33            g.latencyPanel('grpc_server_handling_seconds', grpcUnarySelector, thanos.store.dashboard.dimensions)
    34          )
    35        )
    36        .addRow(
    37          g.row('gRPC (Stream)')
    38          .addPanel(
    39            g.panel('Rate', 'Shows rate of handled Streamed gRPC requests from queriers.') +
    40            g.grpcRequestsPanel('grpc_server_handled_total', grpcServerStreamSelector, thanos.store.dashboard.dimensions)
    41          )
    42          .addPanel(
    43            g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') +
    44            g.grpcErrorsPanel('grpc_server_handled_total', grpcServerStreamSelector, thanos.store.dashboard.dimensions)
    45          )
    46          .addPanel(
    47            g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') +
    48            g.latencyPanel('grpc_server_handling_seconds', grpcServerStreamSelector, thanos.store.dashboard.dimensions)
    49          )
    50        )
    51        .addRow(
    52          g.row('Bucket Operations')
    53          .addPanel(
    54            g.panel('Rate', 'Shows rate of execution for operations against the bucket.') +
    55            g.queryPanel(
    56              'sum by (%s) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'operation']), thanos.store.dashboard.selector],
    57              '{{job}} {{operation}}'
    58            ) +
    59            g.stack
    60          )
    61          .addPanel(
    62            g.panel('Errors', 'Shows ratio of errors compared to the total number of executed operations against the bucket.') +
    63            g.queryPanel(
    64              'sum by (%(dimensions)s) (rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[$__rate_interval])) / sum by (%(dimensions)s) (rate(thanos_objstore_bucket_operations_total{%(selector)s}[$__rate_interval]))' % thanos.store.dashboard { dimensions: utils.joinLabels([thanos.store.dashboard.dimensions, 'operation']) },
    65              '{{job}} {{operation}}'
    66            ) +
    67            { yaxes: g.yaxes({ format: 'percentunit' }) } +
    68            g.stack,
    69          )
    70          .addPanel(
    71            g.panel('Duration', 'Shows how long has it taken to execute operations against the bucket, in quantiles.') +
    72            $.latencyByOperationPanel('thanos_objstore_bucket_operation_duration_seconds', thanos.store.dashboard.selector, thanos.store.dashboard.dimensions)
    73          )
    74        )
    75        .addRow(
    76          g.row('Block Operations')
    77          .addPanel(
    78            g.panel('Block Load Rate', 'Shows rate of block loads from the bucket.') +
    79            g.queryPanel(
    80              'sum by (%s) (rate(thanos_bucket_store_block_loads_total{%s}[$__rate_interval]))' % [thanos.store.dashboard.dimensions, thanos.store.dashboard.selector],
    81              'block loads'
    82            ) +
    83            g.stack
    84          )
    85          .addPanel(
    86            g.panel('Block Load Errors', 'Shows ratio of errors compared to the total number of block loads from the bucket.') +
    87            g.qpsErrTotalPanel(
    88              'thanos_bucket_store_block_load_failures_total{%s}' % thanos.store.dashboard.selector,
    89              'thanos_bucket_store_block_loads_total{%s}' % thanos.store.dashboard.selector,
    90              thanos.store.dashboard.dimensions
    91            )
    92          )
    93          .addPanel(
    94            g.panel('Block Drop Rate', 'Shows rate of block drops.') +
    95            g.queryPanel(
    96              'sum by (%s) (rate(thanos_bucket_store_block_drops_total{%s}[$__rate_interval]))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'operation']), thanos.store.dashboard.selector],
    97              'block drops {{job}}'
    98            ) +
    99            g.stack
   100          )
   101          .addPanel(
   102            g.panel('Block Drop Errors', 'Shows ratio of errors compared to the total number of block drops.') +
   103            g.qpsErrTotalPanel(
   104              'thanos_bucket_store_block_drop_failures_total{%s}' % thanos.store.dashboard.selector,
   105              'thanos_bucket_store_block_drops_total{%s}' % thanos.store.dashboard.selector,
   106              thanos.store.dashboard.dimensions
   107            )
   108          )
   109        )
   110        .addRow(
   111          g.row('Cache Operations')
   112          .addPanel(
   113            g.panel('Requests', 'Show rate of cache requests.') +
   114            g.queryPanel(
   115              'sum by (%s) (rate(thanos_store_index_cache_requests_total{%s}[$__rate_interval]))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'item_type']), thanos.store.dashboard.selector],
   116              '{{job}} {{item_type}}',
   117            ) +
   118            g.stack
   119          )
   120          .addPanel(  // Index cache hit rate, grouped by the store dimensions and item_type.
   121            g.panel('Hits', 'Show rate of cache hits.') +
   122            g.queryPanel(
   123              'sum by (%s) (rate(thanos_store_index_cache_hits_total{%s}[$__rate_interval]))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'item_type']), thanos.store.dashboard.selector],
   124              '{{job}} {{item_type}}',
   125            ) +
   126            g.stack
   127          )
   128          .addPanel(
   129            g.panel('Added', 'Show rate of added items to cache.') +
   130            g.queryPanel(
   131              'sum by (%s) (rate(thanos_store_index_cache_items_added_total{%s}[$__rate_interval]))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'item_type']), thanos.store.dashboard.selector],
   132              '{{job}} {{item_type}}',
   133            ) +
   134            g.stack
   135          )
   136          .addPanel(
   137            g.panel('Evicted', 'Show rate of evicted items from cache.') +
   138            g.queryPanel(
   139              'sum by (%s) (rate(thanos_store_index_cache_items_evicted_total{%s}[$__rate_interval]))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'item_type']), thanos.store.dashboard.selector],
   140              '{{job}} {{item_type}}',
   141            ) +
   142            g.stack
   143          )
   144        )
   145        .addRow(
   146          g.row('Store Sent')
   147          .addPanel(
   148            g.panel('Chunk Size', 'Shows size of chunks that have sent to the bucket.') +
   149            g.queryPanel(
   150              [
   151                'histogram_quantile(0.99, sum by (%s) (rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{%s}[$__rate_interval])))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'le']), thanos.store.dashboard.selector],
   152                'sum by (%(dimensions)s) (rate(thanos_bucket_store_sent_chunk_size_bytes_sum{%(selector)s}[$__rate_interval])) / sum by (%(dimensions)s) (rate(thanos_bucket_store_sent_chunk_size_bytes_count{%(selector)s}[$__rate_interval]))' % thanos.store.dashboard,
   153                'histogram_quantile(0.50, sum by (%s) (rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{%s}[$__rate_interval])))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'le']), thanos.store.dashboard.selector],
   154              ],
   155              [
   156                'P99',
   157                'mean',
   158                'P50',
   159              ],
   160            ) +
   161            { yaxes: g.yaxes('bytes') }
   162          ),
   163        )
   164        .addRow(
   165          g.row('Series Operations')
   166          .addPanel(  // P99 / mean / P50 of blocks queried per Series request.
   167            g.panel('Block queried') +
   168            g.queryPanel(
   169              [  // Quantiles must read the histogram's _bucket series (the ones carrying the le label).
   170                'histogram_quantile(0.99, sum by (le) (rate(thanos_bucket_store_series_blocks_queried_bucket{%s}[$__rate_interval])))' % thanos.store.dashboard.selector,
   171                'sum by (%(dimensions)s) (rate(thanos_bucket_store_series_blocks_queried_sum{%(selector)s}[$__rate_interval])) / sum by (%(dimensions)s) (rate(thanos_bucket_store_series_blocks_queried_count{%(selector)s}[$__rate_interval]))' % thanos.store.dashboard,
   172                'histogram_quantile(0.50, sum by (le) (rate(thanos_bucket_store_series_blocks_queried_bucket{%s}[$__rate_interval])))' % thanos.store.dashboard.selector,
   173              ], [
   174                'P99',
   175                'mean {{job}}',
   176                'P50',
   177              ],
   178            )
   179          )
   180          .addPanel(  // P99 / mean / P50 of data fetched, split by data_type and the store dimensions.
   181            g.panel('Data Fetched', 'Show the size of data fetched') +
   182            g.queryPanel(
   183              [  // Quantiles read the _bucket series and keep data_type/job so the legend templates resolve.
   184                'histogram_quantile(0.99, sum by (%s) (rate(thanos_bucket_store_series_data_fetched_bucket{%s}[$__rate_interval])))' % [utils.joinLabels([dataSizeDimensions, 'le']), thanos.store.dashboard.selector],
   185                'sum by (%s) (rate(thanos_bucket_store_series_data_fetched_sum{%s}[$__rate_interval])) / sum by (%s) (rate(thanos_bucket_store_series_data_fetched_count{%s}[$__rate_interval]))' % [dataSizeDimensions, thanos.store.dashboard.selector, dataSizeDimensions, thanos.store.dashboard.selector],
   186                'histogram_quantile(0.50, sum by (%s) (rate(thanos_bucket_store_series_data_fetched_bucket{%s}[$__rate_interval])))' % [utils.joinLabels([dataSizeDimensions, 'le']), thanos.store.dashboard.selector],
   187              ], [
   188                'P99: {{data_type}} / {{job}}',
   189                'mean: {{data_type}} / {{job}}',
   190                'P50: {{data_type}} / {{job}}',
   191              ],
   192            ) +
   193            { yaxes: g.yaxes('bytes') }
   194          )
   195          .addPanel(  // P99 / mean / P50 of data touched, split by data_type and the store dimensions.
   196            g.panel('Data Touched', 'Show the size of data touched') +
   197            g.queryPanel(
   198              [  // Quantiles read the _bucket series and keep data_type/job so the legend templates resolve.
   199                'histogram_quantile(0.99, sum by (%s) (rate(thanos_bucket_store_series_data_touched_bucket{%s}[$__rate_interval])))' % [utils.joinLabels([dataSizeDimensions, 'le']), thanos.store.dashboard.selector],
   200                'sum by (%s) (rate(thanos_bucket_store_series_data_touched_sum{%s}[$__rate_interval])) / sum by (%s) (rate(thanos_bucket_store_series_data_touched_count{%s}[$__rate_interval]))' % [dataSizeDimensions, thanos.store.dashboard.selector, dataSizeDimensions, thanos.store.dashboard.selector],
   201                'histogram_quantile(0.50, sum by (%s) (rate(thanos_bucket_store_series_data_touched_bucket{%s}[$__rate_interval])))' % [utils.joinLabels([dataSizeDimensions, 'le']), thanos.store.dashboard.selector],
   202              ], [
   203                'P99: {{data_type}} / {{job}}',
   204                'mean: {{data_type}} / {{job}}',
   205                'P50: {{data_type}} / {{job}}',
   206              ],
   207            ) +
   208            { yaxes: g.yaxes('bytes') }
   209          )
   210          .addPanel(  // P99 / mean / P50 of result series per Series request.
   211            g.panel('Result series') +
   212            g.queryPanel(
   213              [  // Quantiles must read the histogram's _bucket series (the ones carrying the le label).
   214                'histogram_quantile(0.99, sum by (le) (rate(thanos_bucket_store_series_result_series_bucket{%s}[$__rate_interval])))' % thanos.store.dashboard.selector,
   215                'sum by (%(dimensions)s) (rate(thanos_bucket_store_series_result_series_sum{%(selector)s}[$__rate_interval])) / sum by (%(dimensions)s) (rate(thanos_bucket_store_series_result_series_count{%(selector)s}[$__rate_interval]))' % thanos.store.dashboard,
   216                'histogram_quantile(0.50, sum by (le) (rate(thanos_bucket_store_series_result_series_bucket{%s}[$__rate_interval])))' % thanos.store.dashboard.selector,
   217              ], [
   218                'P99',
   219                'mean {{job}}',
   220                'P50',
   221              ],
   222            )
   223          )
   224        )
   225        .addRow(
   226          g.row('Series Operation Durations')
   227          .addPanel(
   228            g.panel('Get All', 'Shows how long has it taken to get all series.') +
   229            g.latencyPanel('thanos_bucket_store_series_get_all_duration_seconds', thanos.store.dashboard.selector, thanos.store.dashboard.dimensions)
   230          )
   231          .addPanel(
   232            g.panel('Merge', 'Shows how long has it taken to merge series.') +
   233            g.latencyPanel('thanos_bucket_store_series_merge_duration_seconds', thanos.store.dashboard.selector, thanos.store.dashboard.dimensions)
   234          )
   235          .addPanel(
   236            g.panel('Gate', 'Shows how long has it taken for a series to wait at the gate.') +
   237            g.latencyPanel('thanos_bucket_store_series_gate_duration_seconds', thanos.store.dashboard.selector, thanos.store.dashboard.dimensions)
   238          )
   239        )
   240        .addRow(
   241          g.resourceUtilizationRow(thanos.store.dashboard.selector, thanos.store.dashboard.dimensions)
   242        ),
   243  
   244      __overviewRows__+:: if thanos.store == null then [] else [  // Hidden, mergeable list of overview rows; contributes nothing when thanos.store is null.
   245        g.row('Store')  // One 'Store' row holding three unary-gRPC panels built from the overview selector and dimensions.
   246        .addPanel(
   247          g.panel('gRPC (Unary) Rate', 'Shows rate of handled Unary gRPC requests from queriers.') +
   248          g.grpcRequestsPanel('grpc_server_handled_total', utils.joinLabels([thanos.dashboard.overview.selector, 'grpc_type="unary"']), thanos.dashboard.overview.dimensions) +
   249          g.addDashboardLink(thanos.store.title)  // Presumably links the panel to the dashboard with this title; confirm in the builder library.
   250        )
   251        .addPanel(
   252          g.panel('gRPC (Unary) Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') +
   253          g.grpcErrorsPanel('grpc_server_handled_total', utils.joinLabels([thanos.dashboard.overview.selector, 'grpc_type="unary"']), thanos.dashboard.overview.dimensions) +
   254          g.addDashboardLink(thanos.store.title)
   255        )
   256        .addPanel(
   257          g.sloLatency(  // NOTE(review): arguments are positional; their meanings are defined by g.sloLatency in the builder library.
   258            'gRPC Latency 99th Percentile',
   259            'Shows how long has it taken to handle requests from queriers.',
   260            'grpc_server_handling_seconds_bucket{%s}' % utils.joinLabels([thanos.dashboard.overview.selector, 'grpc_type="unary"']),
   261            thanos.dashboard.overview.dimensions,
   262            0.99,  // Presumably the quantile, matching the panel title; confirm against g.sloLatency.
   263            0.5,  // NOTE(review): looks like a threshold value; confirm against g.sloLatency.
   264            1  // NOTE(review): looks like a threshold value; confirm against g.sloLatency.
   265          ) +
   266          g.addDashboardLink(thanos.store.title)
   267        ),
   268      ],
   269    },
   270  
   271    latencyByOperationPanel(metricName, selector, dimensions, multiplier='1'):: {  // Hidden helper returning panel fields with P99, mean and P50 latency targets, each grouped by operation.
   272      local params = { metricName: metricName, selector: selector, multiplier: multiplier, dimensions: dimensions },  // Named substitutions: metricName is the base name (_bucket/_sum/_count are appended), selector goes inside {}, dimensions feeds sum by, multiplier scales each result.
   273  
   274      nullPointMode: 'null as zero',
   275      targets: [
   276        {
   277          expr: 'histogram_quantile(0.99, sum by (%(dimensions)s, operation, le) (rate(%(metricName)s_bucket{%(selector)s}[$__rate_interval]))) * %(multiplier)s' % params,
   278          format: 'time_series',
   279          intervalFactor: 2,
   280          legendFormat: 'P99 {{job}} {{operation}}',  // Includes operation because the query yields one series per operation.
   281          refId: 'A',
   282          step: 10,
   283        },
   284        {
   285          expr: 'sum by (%(dimensions)s, operation) (rate(%(metricName)s_sum{%(selector)s}[$__rate_interval])) * %(multiplier)s  / sum by (%(dimensions)s, operation) (rate(%(metricName)s_count{%(selector)s}[$__rate_interval]))' % params,
   286          format: 'time_series',
   287          intervalFactor: 2,
   288          legendFormat: 'mean {{job}} {{operation}}',  // Includes operation because the query yields one series per operation.
   289          refId: 'B',
   290          step: 10,
   291        },
   292        {
   293          expr: 'histogram_quantile(0.50, sum by (%(dimensions)s, operation, le) (rate(%(metricName)s_bucket{%(selector)s}[$__rate_interval]))) * %(multiplier)s' % params,
   294          format: 'time_series',
   295          intervalFactor: 2,
   296          legendFormat: 'P50 {{job}} {{operation}}',  // Includes operation because the query yields one series per operation.
   297          refId: 'C',
   298          step: 10,
   299        },
   300      ],
   301      yaxes: g.yaxes('s'),  // Y axes built by g.yaxes with the 's' format argument.
   302    },
   303  }