k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/config/prow/cluster/monitoring/mixins/prometheus/ghproxy_alerts.libsonnet (about)

     1  {
     2    prometheusAlerts+:: {
     3      local monitoringLink = $._config.instance.monitoringLink,
     4      local dashboardID = $._config.grafanaDashboardIDs['ghproxy.json'],
     5      groups+: [
     6        {
     7          name: 'ghproxy',
     8          rules: [
     9            {
    10              alert: 'ghproxy-specific-status-code-5xx',
    11              expr: |||
    12                sum(rate(github_request_duration_count{status=~"5.."}[5m])) by (status,path) / ignoring(status) group_left sum(rate(github_request_duration_count[5m])) by (path) * 100 > 10
    13              |||,
    14              labels: {
    15                severity: 'warning',
    16              },
    17              annotations: {
    18                message: '{{ $value | humanize }}%% of all requests for {{ $labels.path }} through the GitHub proxy are erroring with code {{ $labels.status }}. Check %s.' % [monitoringLink('/d/%s/github-cache?orgId=1&refresh=1m&fullscreen&panelId=9' % [dashboardID], 'the ghproxy dashboard')],
    19              },
    20            },
    21            {
    22              alert: 'ghproxy-global-status-code-5xx',
    23              expr: |||
    24                sum(rate(github_request_duration_count{status=~"5.."}[5m])) by (status) / ignoring(status) group_left sum(rate(github_request_duration_count[5m])) * 100 > 3
    25              |||,
    26              labels: {
    27                severity: 'warning',
    28              },
    29              annotations: {
    30                message: '{{ $value | humanize }}%% of all API requests through the GitHub proxy are errorring with code {{ $labels.status }}. Check %s.' % [monitoringLink('/d/%s/github-cache?orgId=1&refresh=1m&fullscreen&panelId=8' % [dashboardID], 'the ghproxy dashboard')],
    31              },
    32            },
    33            {
    34              alert: 'ghproxy-specific-status-code-4xx',
    35              // Paths that contains error codes expected by prow(Grabbed from previous prow alerts):
    36              //  - "/repos/:owner/:repo/pulls/:pullId/requested_reviewers" 422 (https://github.com/kubernetes/test-infra/blob/e84a6897b7fae65ba295a4c370057e4a216345ef/prow/github/client.go#L2712)
    37              //  - "/search/issues" 403 (Permission denied, very likely not prow error)
    38              //  - "/repos/:owner/:repo/pulls/:pullId/merge" 405 (https://github.com/kubernetes/test-infra/blob/e84a6897b7fae65ba295a4c370057e4a216345ef/prow/github/client.go#L3472)
    39              //  - "/repos/:owner/:repo/statuses/:statusId" 422 (https://github.com/kubernetes/test-infra/blob/858e80618451fb86f6a9f10a7f9f5bbf1bc7be2a/prow/crier/reporters/github/reporter.go#L159)
    40              //  These paths + statuscode combinations are excluded from alerts to reduce noise.
    41              expr: |||
    42                 sum by(status, path) (rate(github_request_duration_count{status!="404",status!="410",status=~"4..",path!="/repos/:owner/:repo/pulls/:pullId/requested_reviewers",path!="/search/issues",path!="/repos/:owner/:repo/pulls/:pullId/merge",path!="/repos/:owner/:repo/statuses/:statusId"}[30m])) / ignoring(status) group_left() sum by(path) (rate(github_request_duration_count[30m])) * 100 > 10
    43              |||,
    44              labels: {
    45                severity: 'warning',
    46              },
    47              annotations: {
    48                message: '{{ $value | humanize }}%% of all requests for {{ $labels.path }} through the GitHub proxy are erroring with code {{ $labels.status }}. Check %s.' % [monitoringLink('/d/%s/github-cache?orgId=1&refresh=1m&fullscreen&panelId=9' % [dashboardID], 'the ghproxy dashboard')],
    49              },
    50            },
    51            {
    52              alert: 'ghproxy-specific-status-code-not-422',
    53              expr: |||
    54                 sum by(status, path) (rate(github_request_duration_count{status!="404",status!="410", status!="422", status=~"4..",path=~"/repos/:owner/:repo/pulls/:pullId/requested_reviewers|/repos/:owner/:repo/statuses/:statusId"}[30m])) / ignoring(status) group_left() sum by(path) (rate(github_request_duration_count[30m])) * 100 > 10
    55              |||,
    56              labels: {
    57                severity: 'warning',
    58              },
    59              annotations: {
    60                message: '{{ $value | humanize }}%% of all requests for {{ $labels.path }} through the GitHub proxy are erroring with code {{ $labels.status }}. Check %s.' % [monitoringLink('/d/%s/github-cache?orgId=1&refresh=1m&fullscreen&panelId=9' % [dashboardID], 'the ghproxy dashboard')],
    61              },
    62            },
    63            {
    64              alert: 'ghproxy-specific-status-code-not-403',
    65              expr: |||
    66                 sum by(status, path) (rate(github_request_duration_count{status!="404",status!="410", status!="403", status=~"4..",path="/search/issues"}[30m])) / ignoring(status) group_left() sum by(path) (rate(github_request_duration_count[30m])) * 100 > 10
    67              |||,
    68              labels: {
    69                severity: 'warning',
    70              },
    71              annotations: {
    72                message: '{{ $value | humanize }}%% of all requests for {{ $labels.path }} through the GitHub proxy are erroring with code {{ $labels.status }}. Check %s.' % [monitoringLink('/d/%s/github-cache?orgId=1&refresh=1m&fullscreen&panelId=9' % [dashboardID], 'the ghproxy dashboard')],
    73              },
    74            },
    75            {
    76              alert: 'ghproxy-specific-status-code-not-405',
    77              expr: |||
    78                 sum by(status, path) (rate(github_request_duration_count{status!="404",status!="410", status!="405", status=~"4..",path="/repos/:owner/:repo/pulls/:pullId/merge"}[30m])) / ignoring(status) group_left() sum by(path) (rate(github_request_duration_count[30m])) * 100 > 10
    79              |||,
    80              labels: {
    81                severity: 'warning',
    82              },
    83              annotations: {
    84                message: '{{ $value | humanize }}%% of all requests for {{ $labels.path }} through the GitHub proxy are erroring with code {{ $labels.status }}. Check %s.' % [monitoringLink('/d/%s/github-cache?orgId=1&refresh=1m&fullscreen&panelId=9' % [dashboardID], 'the ghproxy dashboard')],
    85              },
    86            },
    87            {
    88              alert: 'ghproxy-global-status-code-4xx',
    89              expr: |||
    90                sum(rate(github_request_duration_count{status=~"4..",status!="404",status!="410",status!="403",status!="405",status!="422"}[30m])) by (status) / ignoring(status) group_left sum(rate(github_request_duration_count[30m])) * 100 > 3
    91              |||,
    92              labels: {
    93                severity: 'warning',
    94              },
    95              annotations: {
    96                message: '{{ $value | humanize }}%% of all API requests through the GitHub proxy are errorring with code {{ $labels.status }}. Check %s.' % [monitoringLink('/d/%s/github-cache?orgId=1&refresh=1m&fullscreen&panelId=8' % [dashboardID], 'the ghproxy dashboard')],
    97              },
    98            },
    99            {
   100              alert: 'ghproxy-global-status-code-403-405-422',
   101              expr: |||
   102                sum(rate(github_request_duration_count{status=~"403|405|422"}[30m])) by (status) / ignoring(status) group_left sum(rate(github_request_duration_count[30m])) * 100 > 10
   103              |||,
   104              labels: {
   105                severity: 'warning',
   106              },
   107              annotations: {
   108                message: '{{ $value | humanize }}%% of all API requests through the GitHub proxy are errorring with code {{ $labels.status }}. Check %s.' % [monitoringLink('/d/%s/github-cache?orgId=1&refresh=1m&fullscreen&panelId=8' % [dashboardID], 'the ghproxy dashboard')],
   109              },
   110            },
   111            {
   112              alert: 'ghproxy-running-out-github-tokens-in-a-hour',
   113              // check 30% of the capacity (5000): 1500
   114              expr: |||
   115                github_token_usage{job="ghproxy"} <  1500
   116                and
   117                predict_linear(github_token_usage{job="ghproxy"}[30m], 1 * 3600) < 0
   118              |||,
   119              'for': '5m',
   120              labels: {
   121                severity: 'high',
   122              },
   123              annotations: {
   124                message: 'token {{ $labels.token_hash }} will run out of API quota before the next reset.',
   125              },
   126            }
   127          ],
   128        },
   129      ],
   130    },
   131  }