// Prometheus alerting rules for ghproxy (the GitHub proxy), contributed to the
// mixin's `prometheusAlerts` as a single rule group named 'ghproxy'.
//
// All rules except the token-quota one share the same shape: the request rate
// for an error status is divided by the total request rate (per path or
// overall), scaled to a percentage and compared against a threshold.
{
  prometheusAlerts+:: {
    local monitoringLink = $._config.instance.monitoringLink,
    local dashboardID = $._config.grafanaDashboardIDs['ghproxy.json'],

    // Links into the ghproxy dashboard: panel 9 is referenced by the per-path
    // alerts, panel 8 by the global ones.
    local perPathPanelLink = monitoringLink('/d/%s/github-cache?orgId=1&refresh=1m&fullscreen&panelId=9' % [dashboardID], 'the ghproxy dashboard'),
    local globalPanelLink = monitoringLink('/d/%s/github-cache?orgId=1&refresh=1m&fullscreen&panelId=8' % [dashboardID], 'the ghproxy dashboard'),

    // Annotation text shared by every per-path alert. `%%` becomes a literal
    // `%` once the string has been through `%` formatting.
    local perPathMessage = '{{ $value | humanize }}%% of all requests for {{ $labels.path }} through the GitHub proxy are erroring with code {{ $labels.status }}. Check %s.' % [perPathPanelLink],

    // Annotation text shared by every global alert.
    // NOTE(review): "errorring" is misspelled in the emitted text; it is kept
    // verbatim here so the rendered alert message does not change.
    local globalMessage = '{{ $value | humanize }}%% of all API requests through the GitHub proxy are errorring with code {{ $labels.status }}. Check %s.' % [globalPanelLink],

    // Builds one rule with severity 'warning' from its name, PromQL expression
    // and message annotation.
    local warningRule(alertName, query, text) = {
      alert: alertName,
      expr: query,
      labels: {
        severity: 'warning',
      },
      annotations: {
        message: text,
      },
    },

    groups+: [
      {
        name: 'ghproxy',
        rules: [
          // More than 10% of a path's requests returned one 5xx status (5m rate).
          warningRule(
            'ghproxy-specific-status-code-5xx',
            |||
              sum(rate(github_request_duration_count{status=~"5.."}[5m])) by (status,path) / ignoring(status) group_left sum(rate(github_request_duration_count[5m])) by (path) * 100 > 10
            |||,
            perPathMessage
          ),

          // More than 3% of all requests returned one 5xx status (5m rate).
          warningRule(
            'ghproxy-global-status-code-5xx',
            |||
              sum(rate(github_request_duration_count{status=~"5.."}[5m])) by (status) / ignoring(status) group_left sum(rate(github_request_duration_count[5m])) * 100 > 3
            |||,
            globalMessage
          ),

          // More than 10% of a path's requests returned one 4xx status (30m rate).
          // 404 and 410 are never counted. The following paths return error
          // codes that prow expects (taken from earlier prow alerts), so they
          // are left out of this rule to reduce noise:
          // - "/repos/:owner/:repo/pulls/:pullId/requested_reviewers" 422 (https://github.com/kubernetes/test-infra/blob/e84a6897b7fae65ba295a4c370057e4a216345ef/prow/github/client.go#L2712)
          // - "/search/issues" 403 (Permission denied, very likely not prow error)
          // - "/repos/:owner/:repo/pulls/:pullId/merge" 405 (https://github.com/kubernetes/test-infra/blob/e84a6897b7fae65ba295a4c370057e4a216345ef/prow/github/client.go#L3472)
          // - "/repos/:owner/:repo/statuses/:statusId" 422 (https://github.com/kubernetes/test-infra/blob/858e80618451fb86f6a9f10a7f9f5bbf1bc7be2a/prow/crier/reporters/github/reporter.go#L159)
          // The three `not-4xx` rules below watch those same paths for every
          // other 4xx status.
          warningRule(
            'ghproxy-specific-status-code-4xx',
            |||
              sum by(status, path) (rate(github_request_duration_count{status!="404",status!="410",status=~"4..",path!="/repos/:owner/:repo/pulls/:pullId/requested_reviewers",path!="/search/issues",path!="/repos/:owner/:repo/pulls/:pullId/merge",path!="/repos/:owner/:repo/statuses/:statusId"}[30m])) / ignoring(status) group_left() sum by(path) (rate(github_request_duration_count[30m])) * 100 > 10
            |||,
            perPathMessage
          ),

          // requested_reviewers and statuses paths: any 4xx other than 404, 410, 422.
          warningRule(
            'ghproxy-specific-status-code-not-422',
            |||
              sum by(status, path) (rate(github_request_duration_count{status!="404",status!="410", status!="422", status=~"4..",path=~"/repos/:owner/:repo/pulls/:pullId/requested_reviewers|/repos/:owner/:repo/statuses/:statusId"}[30m])) / ignoring(status) group_left() sum by(path) (rate(github_request_duration_count[30m])) * 100 > 10
            |||,
            perPathMessage
          ),

          // /search/issues: any 4xx other than 404, 410, 403.
          warningRule(
            'ghproxy-specific-status-code-not-403',
            |||
              sum by(status, path) (rate(github_request_duration_count{status!="404",status!="410", status!="403", status=~"4..",path="/search/issues"}[30m])) / ignoring(status) group_left() sum by(path) (rate(github_request_duration_count[30m])) * 100 > 10
            |||,
            perPathMessage
          ),

          // merge path: any 4xx other than 404, 410, 405.
          warningRule(
            'ghproxy-specific-status-code-not-405',
            |||
              sum by(status, path) (rate(github_request_duration_count{status!="404",status!="410", status!="405", status=~"4..",path="/repos/:owner/:repo/pulls/:pullId/merge"}[30m])) / ignoring(status) group_left() sum by(path) (rate(github_request_duration_count[30m])) * 100 > 10
            |||,
            perPathMessage
          ),

          // More than 3% of all requests returned one 4xx status, not counting
          // 404, 410, 403, 405 and 422 (30m rate).
          warningRule(
            'ghproxy-global-status-code-4xx',
            |||
              sum(rate(github_request_duration_count{status=~"4..",status!="404",status!="410",status!="403",status!="405",status!="422"}[30m])) by (status) / ignoring(status) group_left sum(rate(github_request_duration_count[30m])) * 100 > 3
            |||,
            globalMessage
          ),

          // 403, 405 and 422 get their own global rule with a 10% threshold.
          warningRule(
            'ghproxy-global-status-code-403-405-422',
            |||
              sum(rate(github_request_duration_count{status=~"403|405|422"}[30m])) by (status) / ignoring(status) group_left sum(rate(github_request_duration_count[30m])) * 100 > 10
            |||,
            globalMessage
          ),

          // Token quota: fires when github_token_usage is below 1500 (noted in
          // the original as 30% of the 5000 capacity) and a linear
          // extrapolation of the last 30m drops below zero within
          // 1 * 3600 seconds, with both conditions holding for 5m.
          {
            alert: 'ghproxy-running-out-github-tokens-in-a-hour',
            expr: |||
              github_token_usage{job="ghproxy"} < 1500
              and
              predict_linear(github_token_usage{job="ghproxy"}[30m], 1 * 3600) < 0
            |||,
            'for': '5m',
            labels: {
              severity: 'high',
            },
            annotations: {
              message: 'token {{ $labels.token_hash }} will run out of API quota before the next reset.',
            },
          },
        ],
      },
    ],
  },
}