github.com/thanos-io/thanos@v0.32.5/mixin/dashboards/receive.libsonnet (about)

     1  local g = import '../lib/thanos-grafana-builder/builder.libsonnet';
     2  local utils = import '../lib/utils.libsonnet';
     3  
     4  
     5  {
     6    local thanos = self,
     7    receive+:: {
     8      selector: error 'must provide selector for Thanos Receive dashboard',
     9      title: error 'must provide title for Thanos Receive dashboard',
    10      dashboard:: {
    11        selector: std.join(', ', thanos.dashboard.selector + ['job=~"$job"']),
    12        dimensions: std.join(', ', thanos.dashboard.dimensions + ['job']),
    13        tenantSelector: std.join(', ', thanos.dashboard.selector + ['job=~"$job"', 'tenant=~"$tenant"']),
    14        tenantDimensions: std.join(', ', thanos.dashboard.dimensions + ['job', 'tenant']),
    15      },
    16    },
    17    grafanaDashboards+:: {
    18      local grafana = import 'grafonnet/grafana.libsonnet',
    19      local template = grafana.template,
    20      [if thanos.receive != null then 'receive.json']:
    21        local receiveHandlerSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'handler="receive"']);
    22        local grpcUnaryWriteSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="unary"', 'grpc_method="RemoteWrite"']);
    23        local grpcUnaryReadSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="unary"', 'grpc_method!="RemoteWrite"']);
    24        local grpcServerStreamSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="server_stream"']);
    25  
    26        local tenantReceiveHandlerSeclector = utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'handler="receive"']);
    27        local tenantHttpCode2XXSelector = std.join(', ', [tenantReceiveHandlerSeclector, 'code=~"2.."']);
    28        local tenantHttpCodeNot2XXSelector = std.join(', ', [tenantReceiveHandlerSeclector, 'code!~"2.."']);
    29  
    30        local tenantWithHttpCodeDimensions = std.join(', ', ['tenant', 'code']);
    31        g.dashboard(thanos.receive.title) {
    32          templating+: {
    33            list+: [
    34              template.new(
    35                'tenant',
    36                '$datasource',
    37                'label_values(http_requests_total{%s}, %s)' % [std.join(', ', [thanos.receive.dashboard.selector] + ['tenant!=""']), 'tenant'],
    38                label='tenant',
    39                refresh=1,
    40                sort=2,
    41                current='all',
    42                allValues=null,
    43                includeAll=true
    44              ),
    45            ],
    46          },
    47        }
    48        .addRow(
    49          g.row('WRITE - Incoming Request')
    50          .addPanel(
    51            g.panel('Rate', 'Shows rate of incoming requests.') +
    52            g.httpQpsPanel('http_requests_total', receiveHandlerSelector, thanos.receive.dashboard.dimensions)
    53          )
    54          .addPanel(
    55            g.panel('Errors', 'Shows ratio of errors compared to the total number of handled incoming requests.') +
    56            g.httpErrPanel('http_requests_total', receiveHandlerSelector, thanos.receive.dashboard.dimensions)
    57          )
    58          .addPanel(
    59            g.panel('Duration', 'Shows how long has it taken to handle incoming requests in quantiles.') +
    60            g.latencyPanel('http_request_duration_seconds', receiveHandlerSelector, thanos.receive.dashboard.dimensions)
    61          )
    62        )
    63        .addRow(
    64          g.row('WRITE - Incoming Request (tenant focus)')
    65          .addPanel(
    66            g.panel('Rate of write requests (by tenant and code)') +
    67            g.queryPanel(
    68              'sum by (%s) (rate(http_requests_total{%s}[$__rate_interval]))' % [tenantWithHttpCodeDimensions, tenantReceiveHandlerSeclector],
    69              '{{code}} - {{tenant}}'
    70            )
    71          )
    72          .addPanel(
    73            g.panel('Number of errors (by tenant and code)') +
    74            g.queryPanel(
    75              'sum by (%s) (rate(http_requests_total{%s}[$__rate_interval]))' % [
    76                tenantWithHttpCodeDimensions,
    77                tenantHttpCodeNot2XXSelector,
    78              ],
    79              '{{code}} - {{tenant}}'
    80            )
    81          )
    82          .addPanel(
    83            g.panel('Average request duration (by tenant)') +
    84            g.queryPanel(
    85              'sum by (%s) (rate(http_request_duration_seconds_sum{%s}[$__rate_interval])) / sum by (%s) (http_request_duration_seconds_count{%s})' % [
    86                thanos.receive.dashboard.tenantDimensions,
    87                tenantReceiveHandlerSeclector,
    88                thanos.receive.dashboard.tenantDimensions,
    89                tenantReceiveHandlerSeclector,
    90              ],
    91              '{{tenant}}'
    92            )
    93          )
    94        )
    95        .addRow(
    96          g.row('HTTP requests (tenant focus)')
    97          .addPanel(
    98            g.panel('Average successful HTTP request size (per tenant and code, only 2XX)') +
    99            g.queryPanel(
   100              'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$__rate_interval])) / sum by (%s) (rate(http_request_size_bytes_count{%s}[$__rate_interval]))' % [
   101                thanos.receive.dashboard.tenantDimensions,
   102                tenantHttpCode2XXSelector,
   103                thanos.receive.dashboard.tenantDimensions,
   104                tenantHttpCode2XXSelector,
   105              ],
   106              '{{tenant}}'
   107            )
   108          )
   109          .addPanel(
   110            g.panel('Average failed HTTP request size (per tenant and code, non 2XX)') +
   111            g.queryPanel(
   112              'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$__rate_interval])) / sum by (%s) (rate(http_request_size_bytes_count{%s}[$__rate_interval]))' % [
   113                thanos.receive.dashboard.tenantDimensions,
   114                tenantHttpCodeNot2XXSelector,
   115                thanos.receive.dashboard.tenantDimensions,
   116                tenantHttpCodeNot2XXSelector,
   117              ],
   118              '{{tenant}}'
   119            )
   120          )
   121          .addPanel(
   122            g.panel('Inflight requests (per tenant and method)') +
   123            g.queryPanel(
   124              'sum by (%s) (http_inflight_requests{%s})' % [
   125                std.join(', ', [thanos.receive.dashboard.tenantDimensions, 'method']),
   126                tenantReceiveHandlerSeclector,
   127              ],
   128              '{{method}} - {{tenant}}'
   129            )
   130          )
   131        )
   132        .addRow(
   133          g.row('Series & Samples (tenant focus)')
   134          .addPanel(
   135            g.panel('Rate of series received (per tenant, only 2XX)') +
   136            g.queryPanel(
   137              'sum(rate(thanos_receive_write_timeseries_sum{%s}[$__rate_interval])) by (%s) ' % [
   138                utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'code=~"2.."']),
   139                thanos.receive.dashboard.tenantDimensions,
   140              ],
   141              '{{tenant}}'
   142            )
   143          )
   144          .addPanel(
   145            g.panel('Rate of series not written (per tenant and code, non 2XX)') +
   146            g.queryPanel(
   147              'sum(rate(thanos_receive_write_timeseries_sum{%s}[$__rate_interval])) by (%s) ' % [
   148                utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'code!~"2.."']),
   149                tenantWithHttpCodeDimensions,
   150              ],
   151              '{{code}} - {{tenant}}'
   152            )
   153          )
   154          .addPanel(
   155            g.panel('Rate of samples received (per tenant, only 2XX)') +
   156            g.queryPanel(
   157              'sum(rate(thanos_receive_write_samples_sum{%s}[$__rate_interval])) by (%s) ' % [
   158                utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'code=~"2.."']),
   159                thanos.receive.dashboard.tenantDimensions,
   160              ],
   161              '{{tenant}}'
   162            )
   163          )
   164          .addPanel(
   165            g.panel('Rate of samples not written (per tenant and code, non 2XX)') +
   166            g.queryPanel(
   167              'sum(rate(thanos_receive_write_samples_sum{%s}[$__rate_interval])) by (%s) ' % [
   168                utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'code!~"2.."']),
   169                tenantWithHttpCodeDimensions,
   170              ],
   171              '{{code}} - {{tenant}}'
   172            )
   173          )
   174        )
   175        .addRow(
   176          g.row('WRITE - Replication')
   177          .addPanel(
   178            g.panel('Rate', 'Shows rate of replications to other receive nodes.') +
   179            g.queryPanel(
   180              'sum by (%s) (rate(thanos_receive_replications_total{%s}[$__rate_interval]))' % [thanos.receive.dashboard.dimensions, thanos.receive.dashboard.selector],
   181              'all {{job}}',
   182            )
   183          )
   184          .addPanel(
   185            g.panel('Errors', 'Shows ratio of errors compared to the total number of replications to other receive nodes.') +
   186            g.qpsErrTotalPanel(
   187              'thanos_receive_replications_total{%s}' % utils.joinLabels([thanos.receive.dashboard.selector, 'result="error"']),
   188              'thanos_receive_replications_total{%s}' % thanos.receive.dashboard.selector,
   189              thanos.receive.dashboard.dimensions
   190            )
   191          )
   192        )
   193        .addRow(
   194          g.row('WRITE - Forward Request')
   195          .addPanel(
   196            g.panel('Rate', 'Shows rate of forwarded requests to other receive nodes.') +
   197            g.queryPanel(
   198              'sum by (%s) (rate(thanos_receive_forward_requests_total{%s}[$__rate_interval]))' % [thanos.receive.dashboard.dimensions, thanos.receive.dashboard.selector],
   199              'all {{job}}',
   200            )
   201          )
   202          .addPanel(
   203            g.panel('Errors', 'Shows ratio of errors compared to the total number of forwareded requests to other receive nodes.') +
   204            g.qpsErrTotalPanel(
   205              'thanos_receive_forward_requests_total{%s}' % utils.joinLabels([thanos.receive.dashboard.selector, 'result="error"']),
   206              'thanos_receive_forward_requests_total{%s}' % thanos.receive.dashboard.selector,
   207              thanos.receive.dashboard.dimensions
   208            )
   209          )
   210        )
   211        .addRow(
   212          // TODO(https://github.com/thanos-io/thanos/issues/3926)
   213          g.row('WRITE - gRPC (Unary)')
   214          .addPanel(
   215            g.panel('Rate', 'Shows rate of handled Unary gRPC requests from queriers.') +
   216            g.grpcRequestsPanel('grpc_server_handled_total', grpcUnaryWriteSelector, thanos.receive.dashboard.dimensions)
   217          )
   218          .addPanel(
   219            g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') +
   220            g.grpcErrorsPanel('grpc_server_handled_total', grpcUnaryWriteSelector, thanos.receive.dashboard.dimensions)
   221          )
   222          .addPanel(
   223            g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') +
   224            g.latencyPanel('grpc_server_handling_seconds', grpcUnaryWriteSelector, thanos.receive.dashboard.dimensions)
   225          )
   226        )
   227        .addRow(
   228          // TODO(https://github.com/thanos-io/thanos/issues/3926)
   229          g.row('READ - gRPC (Unary)')
   230          .addPanel(
   231            g.panel('Rate', 'Shows rate of handled Unary gRPC requests from queriers.') +
   232            g.grpcRequestsPanel('grpc_server_handled_total', grpcUnaryReadSelector, thanos.receive.dashboard.dimensions)
   233          )
   234          .addPanel(
   235            g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') +
   236            g.grpcErrorsPanel('grpc_server_handled_total', grpcUnaryReadSelector, thanos.receive.dashboard.dimensions)
   237          )
   238          .addPanel(
   239            g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') +
   240            g.latencyPanel('grpc_server_handling_seconds', grpcUnaryReadSelector, thanos.receive.dashboard.dimensions)
   241          )
   242        )
   243        .addRow(
   244          // TODO(https://github.com/thanos-io/thanos/issues/3926)
   245          g.row('READ - gRPC (Stream)')
   246          .addPanel(
   247            g.panel('Rate', 'Shows rate of handled Streamed gRPC requests from queriers.') +
   248            g.grpcRequestsPanel('grpc_server_handled_total', grpcServerStreamSelector, thanos.receive.dashboard.dimensions)
   249          )
   250          .addPanel(
   251            g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') +
   252            g.grpcErrorsPanel('grpc_server_handled_total', grpcServerStreamSelector, thanos.receive.dashboard.dimensions)
   253          )
   254          .addPanel(
   255            g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') +
   256            g.latencyPanel('grpc_server_handling_seconds', grpcServerStreamSelector, thanos.receive.dashboard.dimensions)
   257          )
   258        )
   259        .addRow(
   260          g.row('Last Updated')
   261          .addPanel(
   262            g.panel('Successful Upload', 'Shows the relative time of last successful upload to the object-store bucket.') +
   263            g.tablePanel(
   264              ['time() - max by (%s) (thanos_objstore_bucket_last_successful_upload_time{%s})' % [utils.joinLabels([thanos.receive.dashboard.dimensions, 'bucket']), thanos.receive.dashboard.selector]],
   265              {
   266                Value: {
   267                  alias: 'Uploaded Ago',
   268                  unit: 's',
   269                  type: 'number',
   270                },
   271              },
   272            )
   273          )
   274        )
   275        .addRow(
   276          g.resourceUtilizationRow(thanos.receive.dashboard.selector, thanos.receive.dashboard.dimensions)
   277        ),
   278  
   279      __overviewRows__+:: if thanos.receive == null then [] else [
   280        g.row('Receive')
   281        .addPanel(
   282          g.panel('Incoming Requests Rate', 'Shows rate of incoming requests.') +
   283          g.httpQpsPanel('http_requests_total', utils.joinLabels([thanos.dashboard.overview.selector, 'handler="receive"']), thanos.dashboard.overview.dimensions) +
   284          g.addDashboardLink(thanos.receive.title)
   285        )
   286        .addPanel(
   287          g.panel('Incoming Requests Errors', 'Shows ratio of errors compared to the total number of handled incoming requests.') +
   288          g.httpErrPanel('http_requests_total', utils.joinLabels([thanos.dashboard.overview.selector, 'handler="receive"']), thanos.dashboard.overview.dimensions) +
   289          g.addDashboardLink(thanos.receive.title)
   290        )
   291        .addPanel(
   292          g.sloLatency(
   293            'Incoming Requests Latency 99th Percentile',
   294            'Shows how long has it taken to handle incoming requests.',
   295            'http_request_duration_seconds_bucket{%s}' % utils.joinLabels([thanos.dashboard.overview.selector, 'handler="receive"']),
   296            thanos.dashboard.overview.dimensions,
   297            0.99,
   298            0.5,
   299            1
   300          ) +
   301          g.addDashboardLink(thanos.receive.title)
   302        ),
   303      ],
   304    },
   305  }