agones.dev/agones@v1.54.0/pkg/metrics/kubernetes_client.go (about) 1 // Copyright 2019 Google LLC All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package metrics 16 17 import ( 18 "context" 19 "net/url" 20 "time" 21 22 "agones.dev/agones/pkg/util/runtime" 23 "go.opencensus.io/stats" 24 "go.opencensus.io/stats/view" 25 "go.opencensus.io/tag" 26 "k8s.io/client-go/tools/cache" 27 "k8s.io/client-go/tools/metrics" 28 "k8s.io/client-go/util/workqueue" 29 ) 30 31 var ( 32 keyQueueName = MustTagKey("queue_name") 33 34 httpRequestTotalStats = stats.Int64("http/request_total", "The total of HTTP requests.", "1") 35 httpRequestLatencyStats = stats.Float64("http/latency", "The duration of HTTP requests.", "s") 36 37 cacheListTotalStats = stats.Float64("cache/list_total", "The total number of list operations.", "1") 38 cacheListLatencyStats = stats.Float64("cache/list_latency", "Duration of a Kubernetes API call in seconds", "s") 39 cacheListItemCountStats = stats.Float64("cache/list_items_count", "Count of items in a list from the Kubernetes API.", "1") 40 cacheWatchesTotalStats = stats.Float64("cache/watches_total", "Total number of watch operations.", "1") 41 cacheShortWatchesTotalStats = stats.Float64("cache/short_watches_total", "Total number of short watch operations.", "1") 42 cacheWatchesLatencyStats = stats.Float64("cache/watches_latency", "Duration of watches on the Kubernetes API.", "s") 43 cacheItemsInWatchesCountStats = stats.Float64("cache/watch_events", "Number of items in watches on the Kubernetes API.", "1") 44 cacheLastResourceVersionStats = stats.Float64("cache/last_resource_version", "Last resource version from the Kubernetes API.", "1") 45 46 workQueueDepthStats = stats.Float64("workqueue/depth", "Current depth of the work queue.", "1") 47 workQueueItemsTotalStats = stats.Float64("workqueue/items_total", "Total number of items added to the work queue.", "1") 48 workQueueLatencyStats = stats.Float64("workqueue/latency", "How long an item stays in the work queue.", "s") 49 workQueueWorkDurationStats = stats.Float64("workqueue/work_duration", "How long processing an item from the work queue takes.", "s") 50 workQueueRetriesTotalStats = stats.Float64("workqueue/retries_total", "Total number of items retried to the work queue.", "1") 51 workQueueLongestRunningProcessorStats = stats.Float64("workqueue/longest_running_processor", "How long the longest workqueue processors been running in microseconds.", "1") 52 workQueueUnfinishedWorkStats = stats.Float64("workqueue/unfinished_work", "How long has unfinished work been in the workqueue.", "1") 53 ) 54 55 func init() { 56 distributionSeconds := []float64{0, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2, 3} 57 distributionNumbers := []float64{0, 10, 50, 100, 150, 250, 300} 58 59 runtime.Must(view.Register(&view.View{ 60 Name: "k8s_client_http_request_total", 61 Measure: httpRequestTotalStats, 62 Description: "The total of HTTP requests to the Kubernetes API by status code", 63 Aggregation: view.Count(), 64 TagKeys: []tag.Key{keyVerb, keyStatusCode}, 65 })) 66 67 runtime.Must(view.Register(&view.View{ 68 Name: "k8s_client_http_request_duration_seconds", 69 Measure: httpRequestLatencyStats, 70 Description: "The distribution of HTTP requests latencies to the Kubernetes API by status code", 71 Aggregation: view.Distribution(distributionSeconds...), 72 TagKeys: []tag.Key{keyVerb, keyEndpoint}, 73 })) 74 75 runtime.Must(view.Register(&view.View{ 76 Name: "k8s_client_cache_list_total", 77 Measure: cacheListTotalStats, 78 Description: "The total number of list operations for client-go caches", 79 Aggregation: view.Count(), 80 })) 81 82 runtime.Must(view.Register(&view.View{ 83 Name: "k8s_client_cache_list_duration_seconds", 84 Measure: cacheListLatencyStats, 85 Description: "Duration of a Kubernetes list API call in seconds", 86 Aggregation: view.Distribution(distributionSeconds...), 87 })) 88 89 runtime.Must(view.Register(&view.View{ 90 Name: "k8s_client_cache_list_items", 91 Measure: cacheListItemCountStats, 92 Description: "Count of items in a list from the Kubernetes API.", 93 Aggregation: view.Distribution(distributionNumbers...), 94 })) 95 96 runtime.Must(view.Register(&view.View{ 97 Name: "k8s_client_cache_watches_total", 98 Measure: cacheWatchesTotalStats, 99 Description: "The total number of watch operations for client-go caches", 100 Aggregation: view.Count(), 101 })) 102 103 runtime.Must(view.Register(&view.View{ 104 Name: "k8s_client_cache_short_watches_total", 105 Measure: cacheShortWatchesTotalStats, 106 Description: "The total number of short watch operations for client-go caches", 107 Aggregation: view.Count(), 108 })) 109 110 runtime.Must(view.Register(&view.View{ 111 Name: "k8s_client_cache_watch_duration_seconds", 112 Measure: cacheWatchesLatencyStats, 113 Description: "Duration of watches on the Kubernetes API.", 114 Aggregation: view.Distribution(distributionSeconds...), 115 })) 116 117 runtime.Must(view.Register(&view.View{ 118 Name: "k8s_client_cache_watch_events", 119 Measure: cacheItemsInWatchesCountStats, 120 Description: "Number of items in watches on the Kubernetes API.", 121 Aggregation: view.Distribution(distributionNumbers...), 122 })) 123 124 runtime.Must(view.Register(&view.View{ 125 Name: "k8s_client_cache_last_resource_version", 126 Measure: cacheLastResourceVersionStats, 127 Description: "Last resource version from the Kubernetes API.", 128 Aggregation: view.LastValue(), 129 })) 130 131 runtime.Must(view.Register(&view.View{ 132 Name: "k8s_client_workqueue_depth", 133 Measure: workQueueDepthStats, 134 Description: "Current depth of the work queue.", 135 Aggregation: view.LastValue(), 136 TagKeys: []tag.Key{keyQueueName}, 137 })) 138 139 runtime.Must(view.Register(&view.View{ 140 Name: "k8s_client_workqueue_items_total", 141 Measure: workQueueItemsTotalStats, 142 Description: "Total number of items added to the work queue.", 143 Aggregation: view.Count(), 144 TagKeys: []tag.Key{keyQueueName}, 145 })) 146 147 runtime.Must(view.Register(&view.View{ 148 Name: "k8s_client_workqueue_latency_seconds", 149 Measure: workQueueLatencyStats, 150 Description: "How long an item stays in the work queue.", 151 Aggregation: view.Distribution(distributionSeconds...), 152 TagKeys: []tag.Key{keyQueueName}, 153 })) 154 155 runtime.Must(view.Register(&view.View{ 156 Name: "k8s_client_workqueue_work_duration_seconds", 157 Measure: workQueueWorkDurationStats, 158 Description: "How long processing an item from the work queue takes.", 159 Aggregation: view.Distribution(distributionSeconds...), 160 TagKeys: []tag.Key{keyQueueName}, 161 })) 162 163 runtime.Must(view.Register(&view.View{ 164 Name: "k8s_client_workqueue_retries_total", 165 Measure: workQueueRetriesTotalStats, 166 Description: "Total number of items retried to the work queue.", 167 Aggregation: view.Count(), 168 TagKeys: []tag.Key{keyQueueName}, 169 })) 170 171 runtime.Must(view.Register(&view.View{ 172 Name: "k8s_client_workqueue_longest_running_processor", 173 Measure: workQueueLongestRunningProcessorStats, 174 Description: "How long the longest running workqueue processor has been running in microseconds.", 175 Aggregation: view.LastValue(), 176 TagKeys: []tag.Key{keyQueueName}, 177 })) 178 179 runtime.Must(view.Register(&view.View{ 180 Name: "k8s_client_workqueue_unfinished_work_seconds", 181 Measure: workQueueUnfinishedWorkStats, 182 Description: "How long unfinished work has been sitting in the workqueue in seconds.", 183 Aggregation: view.LastValue(), 184 TagKeys: []tag.Key{keyQueueName}, 185 })) 186 187 clientGoRequest := &clientGoMetricAdapter{} 188 clientGoRequest.Register() 189 } 190 191 // Definition of client-go metrics adapter for HTTP requests, caches and workerqueues observations 192 type clientGoMetricAdapter struct{} 193 194 func (c *clientGoMetricAdapter) Register() { 195 metrics.Register(metrics.RegisterOpts{ 196 RequestLatency: c, 197 RequestResult: c, 198 }) 199 workqueue.SetProvider(c) 200 } 201 202 func (clientGoMetricAdapter) Increment(ctx context.Context, code string, method string, _ string) { 203 RecordWithTags(ctx, []tag.Mutator{tag.Insert(keyStatusCode, code), 204 tag.Insert(keyVerb, method)}, httpRequestTotalStats.M(int64(1))) 205 } 206 207 func (clientGoMetricAdapter) Observe(ctx context.Context, verb string, u url.URL, latency time.Duration) { 208 // url is without {namespace} and {name}, so cardinality of resulting metrics is low. 209 RecordWithTags(ctx, []tag.Mutator{tag.Insert(keyVerb, verb), 210 tag.Insert(keyEndpoint, u.Path)}, httpRequestLatencyStats.M(latency.Seconds())) 211 } 212 213 // ocMetric adapts OpenCensus measures to cache metrics 214 type ocMetric struct { 215 *stats.Float64Measure 216 ctx context.Context 217 } 218 219 func newOcMetric(m *stats.Float64Measure) *ocMetric { 220 return &ocMetric{ 221 Float64Measure: m, 222 ctx: context.Background(), 223 } 224 } 225 226 func (m *ocMetric) withTag(key tag.Key, value string) *ocMetric { 227 ctx, err := tag.New(m.ctx, tag.Upsert(key, value)) 228 if err != nil { 229 panic(err) 230 } 231 m.ctx = ctx 232 return m 233 } 234 235 func (m *ocMetric) Inc() { 236 stats.Record(m.ctx, m.Float64Measure.M(float64(1))) 237 } 238 239 func (m *ocMetric) Dec() { 240 stats.Record(m.ctx, m.Float64Measure.M(float64(-1))) 241 } 242 243 // observeFunc is an adapter that allows the use of functions as summary metric. 244 // useful for converting metrics unit before sending them to OC 245 type observeFunc func(float64) 246 247 func (o observeFunc) Observe(f float64) { 248 o(f) 249 } 250 251 func (m *ocMetric) Observe(f float64) { 252 stats.Record(m.ctx, m.Float64Measure.M(f)) 253 } 254 255 func (m *ocMetric) Set(f float64) { 256 stats.Record(m.ctx, m.Float64Measure.M(f)) 257 } 258 259 func (clientGoMetricAdapter) NewListsMetric(string) cache.CounterMetric { 260 return newOcMetric(cacheListTotalStats) 261 } 262 263 func (clientGoMetricAdapter) NewListDurationMetric(string) cache.SummaryMetric { 264 return newOcMetric(cacheListLatencyStats) 265 } 266 267 func (clientGoMetricAdapter) NewItemsInListMetric(string) cache.SummaryMetric { 268 return newOcMetric(cacheListItemCountStats) 269 } 270 271 func (clientGoMetricAdapter) NewWatchesMetric(string) cache.CounterMetric { 272 return newOcMetric(cacheWatchesTotalStats) 273 } 274 275 func (clientGoMetricAdapter) NewShortWatchesMetric(string) cache.CounterMetric { 276 return newOcMetric(cacheShortWatchesTotalStats) 277 } 278 279 func (clientGoMetricAdapter) NewWatchDurationMetric(string) cache.SummaryMetric { 280 return newOcMetric(cacheWatchesLatencyStats) 281 } 282 283 func (clientGoMetricAdapter) NewItemsInWatchMetric(string) cache.SummaryMetric { 284 return newOcMetric(cacheItemsInWatchesCountStats) 285 } 286 287 func (clientGoMetricAdapter) NewLastResourceVersionMetric(string) cache.GaugeMetric { 288 return newOcMetric(cacheLastResourceVersionStats) 289 } 290 291 func (clientGoMetricAdapter) NewDepthMetric(name string) workqueue.GaugeMetric { 292 return newOcMetric(workQueueDepthStats).withTag(keyQueueName, name) 293 } 294 295 func (clientGoMetricAdapter) NewAddsMetric(name string) workqueue.CounterMetric { 296 return newOcMetric(workQueueItemsTotalStats).withTag(keyQueueName, name) 297 } 298 299 func (clientGoMetricAdapter) NewLatencyMetric(name string) workqueue.HistogramMetric { 300 m := newOcMetric(workQueueLatencyStats).withTag(keyQueueName, name) 301 // Convert microseconds to seconds for consistency across metrics. 302 return observeFunc(func(f float64) { 303 m.Observe(f / 1e6) 304 }) 305 } 306 307 func (clientGoMetricAdapter) NewWorkDurationMetric(name string) workqueue.HistogramMetric { 308 m := newOcMetric(workQueueWorkDurationStats).withTag(keyQueueName, name) 309 // Convert microseconds to seconds for consistency across metrics. 310 return observeFunc(func(f float64) { 311 m.Observe(f / 1e6) 312 }) 313 } 314 315 func (clientGoMetricAdapter) NewRetriesMetric(name string) workqueue.CounterMetric { 316 return newOcMetric(workQueueRetriesTotalStats).withTag(keyQueueName, name) 317 } 318 319 func (clientGoMetricAdapter) NewLongestRunningProcessorSecondsMetric(string) workqueue.SettableGaugeMetric { 320 return newOcMetric(workQueueLongestRunningProcessorStats) 321 } 322 323 func (clientGoMetricAdapter) NewUnfinishedWorkSecondsMetric(string) workqueue.SettableGaugeMetric { 324 return newOcMetric(workQueueUnfinishedWorkStats) 325 } 326 327 func (clientGoMetricAdapter) NewDeprecatedDepthMetric(name string) workqueue.GaugeMetric { 328 return newOcMetric(workQueueDepthStats).withTag(keyQueueName, name) 329 } 330 331 func (clientGoMetricAdapter) NewDeprecatedAddsMetric(name string) workqueue.CounterMetric { 332 return newOcMetric(workQueueItemsTotalStats).withTag(keyQueueName, name) 333 } 334 335 func (clientGoMetricAdapter) NewDeprecatedLatencyMetric(name string) workqueue.SummaryMetric { 336 m := newOcMetric(workQueueLatencyStats).withTag(keyQueueName, name) 337 // Convert microseconds to seconds for consistency across metrics. 338 return observeFunc(func(f float64) { 339 m.Observe(f / 1e6) 340 }) 341 } 342 343 func (clientGoMetricAdapter) NewDeprecatedLongestRunningProcessorMicrosecondsMetric(string) workqueue.SettableGaugeMetric { 344 return newOcMetric(workQueueLongestRunningProcessorStats) 345 } 346 347 func (clientGoMetricAdapter) NewDeprecatedRetriesMetric(name string) workqueue.CounterMetric { 348 return newOcMetric(workQueueRetriesTotalStats).withTag(keyQueueName, name) 349 } 350 351 func (clientGoMetricAdapter) NewDeprecatedUnfinishedWorkSecondsMetric(string) workqueue.SettableGaugeMetric { 352 return newOcMetric(workQueueUnfinishedWorkStats) 353 } 354 355 func (clientGoMetricAdapter) NewDeprecatedWorkDurationMetric(name string) workqueue.SummaryMetric { 356 m := newOcMetric(workQueueWorkDurationStats).withTag(keyQueueName, name) 357 // Convert microseconds to seconds for consistency across metrics. 358 return observeFunc(func(f float64) { 359 m.Observe(f / 1e6) 360 }) 361 }