go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/tq/internal/metrics/metrics.go (about) 1 // Copyright 2020 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package metrics 16 17 import ( 18 "math" 19 20 "go.chromium.org/luci/common/tsmon/distribution" 21 "go.chromium.org/luci/common/tsmon/field" 22 "go.chromium.org/luci/common/tsmon/metric" 23 "go.chromium.org/luci/common/tsmon/types" 24 ) 25 26 // MaxRetryFieldValue is the number to cap the value of retry count field at, 27 // for metrics that include it, thus indicating a value greater or equal to it. 28 // This makes the metric field have a reasonable number of distinct values. 29 const MaxRetryFieldValue = 10 30 31 var ( 32 // bucketer1msTo5min covers range of 1..300k. 33 bucketer1msTo5min = distribution.GeometricBucketer(math.Pow(10, 0.055), 100) 34 35 // TQ guts metrics, primary useful to debug TQ. 36 37 InprocSweepDurationMS = metric.NewCumulativeDistribution( 38 "tq/sweep/inproc/duration", 39 "Duration of a full inproc sweep cycle across all DBs (ms)", 40 &types.MetricMetadata{Units: types.Milliseconds}, 41 bucketer1msTo5min, 42 ) 43 44 SweepFetchMetaDurationsMS = metric.NewCumulativeDistribution( 45 "tq/sweep/fetch/meta/durations", 46 "Duration of FetchRemindersMeta operation (ms)", 47 &types.MetricMetadata{Units: types.Milliseconds}, 48 bucketer1msTo5min, 49 field.String("status"), // OK | limit | timeout | failures 50 field.Int("level"), // 0 means the primary shard task, 1+ are its children 51 field.String("db"), 52 ) 53 54 SweepFetchMetaReminders = metric.NewCounter( 55 "tq/sweep/fetch/meta/reminders", 56 "Count of Reminders fetched by FetchRemindersMeta", 57 nil, 58 field.String("status"), // OK | limit | timeout | failures 59 field.Int("level"), // 0 means the primary shard task, 1+ are its children 60 field.String("db"), 61 ) 62 63 ReminderStalenessMS = metric.NewCumulativeDistribution( 64 "tq/reminders/staleness", 65 ("Distribution of staleness of scanned Reminders during the sweep. " + 66 "May be incomplete if keyspace wasn't scanned completely"), 67 &types.MetricMetadata{Units: types.Milliseconds}, 68 distribution.DefaultBucketer, 69 field.Int("level"), 70 field.String("db"), 71 ) 72 73 RemindersCreated = metric.NewCounter( 74 "tq/reminders/created", 75 "Count of reminders created and if they are still fresh in the post-txn defer", 76 nil, 77 field.String("task_class"), // matches TaskClass.ID 78 field.String("staleness"), // fresh | stale 79 field.String("db"), 80 ) 81 82 RemindersDeleted = metric.NewCounter( 83 "tq/reminders/processed", 84 "Count of reminders processed (i.e. deleted)", 85 nil, 86 field.String("task_class"), // matches TaskClass.ID 87 field.String("txn_path"), // happy | sweep 88 field.String("db"), 89 ) 90 91 RemindersLatencyMS = metric.NewCumulativeDistribution( 92 "tq/reminders/latency", 93 "Time between AddTask call and the deletion of the reminder", 94 &types.MetricMetadata{Units: types.Milliseconds}, 95 bucketer1msTo5min, 96 field.String("task_class"), // matches TaskClass.ID 97 field.String("txn_path"), // happy | sweep 98 field.String("db"), 99 ) 100 101 // TQ metrics that might be useful for TQ clients as well. 102 103 SubmitCount = metric.NewCounter( 104 "tq/submit/count", 105 "Count of submitted tasks", 106 nil, 107 field.String("task_class"), // matches TaskClass.ID 108 field.String("txn_path"), // none | happy | sweep 109 field.String("grpc_code"), // gRPC canonical code 110 ) 111 112 SubmitDurationMS = metric.NewCumulativeDistribution( 113 "tq/submit/duration", 114 "Duration of submit calls", 115 &types.MetricMetadata{Units: types.Milliseconds}, 116 distribution.DefaultBucketer, 117 field.String("task_class"), // matches TaskClass.ID 118 field.String("txn_path"), // none | happy | sweep 119 field.String("grpc_code"), // gRPC canonical code 120 ) 121 122 ServerRejectedCount = metric.NewCounter( 123 "tq/server/rejected", 124 "Count of rejected (e.g. malformed) task pushes", 125 nil, 126 field.String("reason"), // auth | bad_request | unknown_class | no_handler | bad_payload 127 ) 128 129 ServerHandledCount = metric.NewCounter( 130 "tq/server/handled", 131 "Count of handled non-rejected tasks", 132 nil, 133 field.String("task_class"), // matches TaskClass.ID 134 field.String("result"), // OK | retry | transient | fatal 135 field.Int("retry"), // 0 for first try, incrementing until cap. 136 ) 137 138 ServerDurationMS = metric.NewCumulativeDistribution( 139 "tq/server/duration", 140 "Duration of handling of non-rejected tasks", 141 &types.MetricMetadata{Units: types.Milliseconds}, 142 distribution.DefaultBucketer, 143 field.String("task_class"), // matches TaskClass.ID 144 field.String("result"), // OK | retry | transient | fatal 145 ) 146 147 ServerTaskLatency = metric.NewCumulativeDistribution( 148 "tq/server/latency", 149 "Time between task's expected ETA and actual completion", 150 &types.MetricMetadata{Units: types.Milliseconds}, 151 distribution.DefaultBucketer, 152 field.String("task_class"), // matches TaskClass.ID 153 field.String("result"), // OK | retry | transient | fatal 154 field.Int("retry"), // 0 for first try, incrementing until cap. 155 ) 156 157 ServerRunning = metric.NewInt( 158 "tq/server/running", 159 "Number of task handlers currently running", 160 nil, 161 field.String("task_class"), // matches TaskClass.ID 162 ) 163 )