agones.dev/agones@v1.54.0/pkg/metrics/kubernetes_client.go (about)

     1  // Copyright 2019 Google LLC All Rights Reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package metrics
    16  
    17  import (
    18  	"context"
    19  	"net/url"
    20  	"time"
    21  
    22  	"agones.dev/agones/pkg/util/runtime"
    23  	"go.opencensus.io/stats"
    24  	"go.opencensus.io/stats/view"
    25  	"go.opencensus.io/tag"
    26  	"k8s.io/client-go/tools/cache"
    27  	"k8s.io/client-go/tools/metrics"
    28  	"k8s.io/client-go/util/workqueue"
    29  )
    30  
    31  var (
    32  	keyQueueName = MustTagKey("queue_name")
    33  
    34  	httpRequestTotalStats   = stats.Int64("http/request_total", "The total of HTTP requests.", "1")
    35  	httpRequestLatencyStats = stats.Float64("http/latency", "The duration of HTTP requests.", "s")
    36  
    37  	cacheListTotalStats           = stats.Float64("cache/list_total", "The total number of list operations.", "1")
    38  	cacheListLatencyStats         = stats.Float64("cache/list_latency", "Duration of a Kubernetes API call in seconds", "s")
    39  	cacheListItemCountStats       = stats.Float64("cache/list_items_count", "Count of items in a list from the Kubernetes API.", "1")
    40  	cacheWatchesTotalStats        = stats.Float64("cache/watches_total", "Total number of watch operations.", "1")
    41  	cacheShortWatchesTotalStats   = stats.Float64("cache/short_watches_total", "Total number of short watch operations.", "1")
    42  	cacheWatchesLatencyStats      = stats.Float64("cache/watches_latency", "Duration of watches on the Kubernetes API.", "s")
    43  	cacheItemsInWatchesCountStats = stats.Float64("cache/watch_events", "Number of items in watches on the Kubernetes API.", "1")
    44  	cacheLastResourceVersionStats = stats.Float64("cache/last_resource_version", "Last resource version from the Kubernetes API.", "1")
    45  
    46  	workQueueDepthStats                   = stats.Float64("workqueue/depth", "Current depth of the work queue.", "1")
    47  	workQueueItemsTotalStats              = stats.Float64("workqueue/items_total", "Total number of items added to the work queue.", "1")
    48  	workQueueLatencyStats                 = stats.Float64("workqueue/latency", "How long an item stays in the work queue.", "s")
    49  	workQueueWorkDurationStats            = stats.Float64("workqueue/work_duration", "How long processing an item from the work queue takes.", "s")
    50  	workQueueRetriesTotalStats            = stats.Float64("workqueue/retries_total", "Total number of items retried to the work queue.", "1")
    51  	workQueueLongestRunningProcessorStats = stats.Float64("workqueue/longest_running_processor", "How long the longest workqueue processors been running in microseconds.", "1")
    52  	workQueueUnfinishedWorkStats          = stats.Float64("workqueue/unfinished_work", "How long has unfinished work been in the workqueue.", "1")
    53  )
    54  
    55  func init() {
    56  	distributionSeconds := []float64{0, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2, 3}
    57  	distributionNumbers := []float64{0, 10, 50, 100, 150, 250, 300}
    58  
    59  	runtime.Must(view.Register(&view.View{
    60  		Name:        "k8s_client_http_request_total",
    61  		Measure:     httpRequestTotalStats,
    62  		Description: "The total of HTTP requests to the Kubernetes API by status code",
    63  		Aggregation: view.Count(),
    64  		TagKeys:     []tag.Key{keyVerb, keyStatusCode},
    65  	}))
    66  
    67  	runtime.Must(view.Register(&view.View{
    68  		Name:        "k8s_client_http_request_duration_seconds",
    69  		Measure:     httpRequestLatencyStats,
    70  		Description: "The distribution of HTTP requests latencies to the Kubernetes API by status code",
    71  		Aggregation: view.Distribution(distributionSeconds...),
    72  		TagKeys:     []tag.Key{keyVerb, keyEndpoint},
    73  	}))
    74  
    75  	runtime.Must(view.Register(&view.View{
    76  		Name:        "k8s_client_cache_list_total",
    77  		Measure:     cacheListTotalStats,
    78  		Description: "The total number of list operations for client-go caches",
    79  		Aggregation: view.Count(),
    80  	}))
    81  
    82  	runtime.Must(view.Register(&view.View{
    83  		Name:        "k8s_client_cache_list_duration_seconds",
    84  		Measure:     cacheListLatencyStats,
    85  		Description: "Duration of a Kubernetes list API call in seconds",
    86  		Aggregation: view.Distribution(distributionSeconds...),
    87  	}))
    88  
    89  	runtime.Must(view.Register(&view.View{
    90  		Name:        "k8s_client_cache_list_items",
    91  		Measure:     cacheListItemCountStats,
    92  		Description: "Count of items in a list from the Kubernetes API.",
    93  		Aggregation: view.Distribution(distributionNumbers...),
    94  	}))
    95  
    96  	runtime.Must(view.Register(&view.View{
    97  		Name:        "k8s_client_cache_watches_total",
    98  		Measure:     cacheWatchesTotalStats,
    99  		Description: "The total number of watch operations for client-go caches",
   100  		Aggregation: view.Count(),
   101  	}))
   102  
   103  	runtime.Must(view.Register(&view.View{
   104  		Name:        "k8s_client_cache_short_watches_total",
   105  		Measure:     cacheShortWatchesTotalStats,
   106  		Description: "The total number of short watch operations for client-go caches",
   107  		Aggregation: view.Count(),
   108  	}))
   109  
   110  	runtime.Must(view.Register(&view.View{
   111  		Name:        "k8s_client_cache_watch_duration_seconds",
   112  		Measure:     cacheWatchesLatencyStats,
   113  		Description: "Duration of watches on the Kubernetes API.",
   114  		Aggregation: view.Distribution(distributionSeconds...),
   115  	}))
   116  
   117  	runtime.Must(view.Register(&view.View{
   118  		Name:        "k8s_client_cache_watch_events",
   119  		Measure:     cacheItemsInWatchesCountStats,
   120  		Description: "Number of items in watches on the Kubernetes API.",
   121  		Aggregation: view.Distribution(distributionNumbers...),
   122  	}))
   123  
   124  	runtime.Must(view.Register(&view.View{
   125  		Name:        "k8s_client_cache_last_resource_version",
   126  		Measure:     cacheLastResourceVersionStats,
   127  		Description: "Last resource version from the Kubernetes API.",
   128  		Aggregation: view.LastValue(),
   129  	}))
   130  
   131  	runtime.Must(view.Register(&view.View{
   132  		Name:        "k8s_client_workqueue_depth",
   133  		Measure:     workQueueDepthStats,
   134  		Description: "Current depth of the work queue.",
   135  		Aggregation: view.LastValue(),
   136  		TagKeys:     []tag.Key{keyQueueName},
   137  	}))
   138  
   139  	runtime.Must(view.Register(&view.View{
   140  		Name:        "k8s_client_workqueue_items_total",
   141  		Measure:     workQueueItemsTotalStats,
   142  		Description: "Total number of items added to the work queue.",
   143  		Aggregation: view.Count(),
   144  		TagKeys:     []tag.Key{keyQueueName},
   145  	}))
   146  
   147  	runtime.Must(view.Register(&view.View{
   148  		Name:        "k8s_client_workqueue_latency_seconds",
   149  		Measure:     workQueueLatencyStats,
   150  		Description: "How long an item stays in the work queue.",
   151  		Aggregation: view.Distribution(distributionSeconds...),
   152  		TagKeys:     []tag.Key{keyQueueName},
   153  	}))
   154  
   155  	runtime.Must(view.Register(&view.View{
   156  		Name:        "k8s_client_workqueue_work_duration_seconds",
   157  		Measure:     workQueueWorkDurationStats,
   158  		Description: "How long processing an item from the work queue takes.",
   159  		Aggregation: view.Distribution(distributionSeconds...),
   160  		TagKeys:     []tag.Key{keyQueueName},
   161  	}))
   162  
   163  	runtime.Must(view.Register(&view.View{
   164  		Name:        "k8s_client_workqueue_retries_total",
   165  		Measure:     workQueueRetriesTotalStats,
   166  		Description: "Total number of items retried to the work queue.",
   167  		Aggregation: view.Count(),
   168  		TagKeys:     []tag.Key{keyQueueName},
   169  	}))
   170  
   171  	runtime.Must(view.Register(&view.View{
   172  		Name:        "k8s_client_workqueue_longest_running_processor",
   173  		Measure:     workQueueLongestRunningProcessorStats,
   174  		Description: "How long the longest running workqueue processor has been running in microseconds.",
   175  		Aggregation: view.LastValue(),
   176  		TagKeys:     []tag.Key{keyQueueName},
   177  	}))
   178  
   179  	runtime.Must(view.Register(&view.View{
   180  		Name:        "k8s_client_workqueue_unfinished_work_seconds",
   181  		Measure:     workQueueUnfinishedWorkStats,
   182  		Description: "How long unfinished work has been sitting in the workqueue in seconds.",
   183  		Aggregation: view.LastValue(),
   184  		TagKeys:     []tag.Key{keyQueueName},
   185  	}))
   186  
   187  	clientGoRequest := &clientGoMetricAdapter{}
   188  	clientGoRequest.Register()
   189  }
   190  
// clientGoMetricAdapter adapts client-go metrics for HTTP requests, caches
// and work queues to the OpenCensus measures declared in this file. It holds
// no state of its own.
type clientGoMetricAdapter struct{}
   193  
   194  func (c *clientGoMetricAdapter) Register() {
   195  	metrics.Register(metrics.RegisterOpts{
   196  		RequestLatency: c,
   197  		RequestResult:  c,
   198  	})
   199  	workqueue.SetProvider(c)
   200  }
   201  
   202  func (clientGoMetricAdapter) Increment(ctx context.Context, code string, method string, _ string) {
   203  	RecordWithTags(ctx, []tag.Mutator{tag.Insert(keyStatusCode, code),
   204  		tag.Insert(keyVerb, method)}, httpRequestTotalStats.M(int64(1)))
   205  }
   206  
   207  func (clientGoMetricAdapter) Observe(ctx context.Context, verb string, u url.URL, latency time.Duration) {
   208  	// url is without {namespace} and {name}, so cardinality of resulting metrics is low.
   209  	RecordWithTags(ctx, []tag.Mutator{tag.Insert(keyVerb, verb),
   210  		tag.Insert(keyEndpoint, u.Path)}, httpRequestLatencyStats.M(latency.Seconds()))
   211  }
   212  
// ocMetric adapts an OpenCensus Float64 measure to the counter, gauge and
// summary style metrics returned by clientGoMetricAdapter.
type ocMetric struct {
	*stats.Float64Measure
	// ctx is the context handed to stats.Record; withTag replaces it with a
	// context that carries the extra tag.
	ctx context.Context
}
   218  
   219  func newOcMetric(m *stats.Float64Measure) *ocMetric {
   220  	return &ocMetric{
   221  		Float64Measure: m,
   222  		ctx:            context.Background(),
   223  	}
   224  }
   225  
   226  func (m *ocMetric) withTag(key tag.Key, value string) *ocMetric {
   227  	ctx, err := tag.New(m.ctx, tag.Upsert(key, value))
   228  	if err != nil {
   229  		panic(err)
   230  	}
   231  	m.ctx = ctx
   232  	return m
   233  }
   234  
   235  func (m *ocMetric) Inc() {
   236  	stats.Record(m.ctx, m.Float64Measure.M(float64(1)))
   237  }
   238  
   239  func (m *ocMetric) Dec() {
   240  	stats.Record(m.ctx, m.Float64Measure.M(float64(-1)))
   241  }
   242  
   243  // observeFunc is an adapter that allows the use of functions as summary metric.
   244  // useful for converting metrics unit before sending them to OC
   245  type observeFunc func(float64)
   246  
   247  func (o observeFunc) Observe(f float64) {
   248  	o(f)
   249  }
   250  
   251  func (m *ocMetric) Observe(f float64) {
   252  	stats.Record(m.ctx, m.Float64Measure.M(f))
   253  }
   254  
   255  func (m *ocMetric) Set(f float64) {
   256  	stats.Record(m.ctx, m.Float64Measure.M(f))
   257  }
   258  
   259  func (clientGoMetricAdapter) NewListsMetric(string) cache.CounterMetric {
   260  	return newOcMetric(cacheListTotalStats)
   261  }
   262  
   263  func (clientGoMetricAdapter) NewListDurationMetric(string) cache.SummaryMetric {
   264  	return newOcMetric(cacheListLatencyStats)
   265  }
   266  
   267  func (clientGoMetricAdapter) NewItemsInListMetric(string) cache.SummaryMetric {
   268  	return newOcMetric(cacheListItemCountStats)
   269  }
   270  
   271  func (clientGoMetricAdapter) NewWatchesMetric(string) cache.CounterMetric {
   272  	return newOcMetric(cacheWatchesTotalStats)
   273  }
   274  
   275  func (clientGoMetricAdapter) NewShortWatchesMetric(string) cache.CounterMetric {
   276  	return newOcMetric(cacheShortWatchesTotalStats)
   277  }
   278  
   279  func (clientGoMetricAdapter) NewWatchDurationMetric(string) cache.SummaryMetric {
   280  	return newOcMetric(cacheWatchesLatencyStats)
   281  }
   282  
   283  func (clientGoMetricAdapter) NewItemsInWatchMetric(string) cache.SummaryMetric {
   284  	return newOcMetric(cacheItemsInWatchesCountStats)
   285  }
   286  
   287  func (clientGoMetricAdapter) NewLastResourceVersionMetric(string) cache.GaugeMetric {
   288  	return newOcMetric(cacheLastResourceVersionStats)
   289  }
   290  
   291  func (clientGoMetricAdapter) NewDepthMetric(name string) workqueue.GaugeMetric {
   292  	return newOcMetric(workQueueDepthStats).withTag(keyQueueName, name)
   293  }
   294  
   295  func (clientGoMetricAdapter) NewAddsMetric(name string) workqueue.CounterMetric {
   296  	return newOcMetric(workQueueItemsTotalStats).withTag(keyQueueName, name)
   297  }
   298  
   299  func (clientGoMetricAdapter) NewLatencyMetric(name string) workqueue.HistogramMetric {
   300  	m := newOcMetric(workQueueLatencyStats).withTag(keyQueueName, name)
   301  	// Convert microseconds to seconds for consistency across metrics.
   302  	return observeFunc(func(f float64) {
   303  		m.Observe(f / 1e6)
   304  	})
   305  }
   306  
   307  func (clientGoMetricAdapter) NewWorkDurationMetric(name string) workqueue.HistogramMetric {
   308  	m := newOcMetric(workQueueWorkDurationStats).withTag(keyQueueName, name)
   309  	// Convert microseconds to seconds for consistency across metrics.
   310  	return observeFunc(func(f float64) {
   311  		m.Observe(f / 1e6)
   312  	})
   313  }
   314  
   315  func (clientGoMetricAdapter) NewRetriesMetric(name string) workqueue.CounterMetric {
   316  	return newOcMetric(workQueueRetriesTotalStats).withTag(keyQueueName, name)
   317  }
   318  
   319  func (clientGoMetricAdapter) NewLongestRunningProcessorSecondsMetric(string) workqueue.SettableGaugeMetric {
   320  	return newOcMetric(workQueueLongestRunningProcessorStats)
   321  }
   322  
   323  func (clientGoMetricAdapter) NewUnfinishedWorkSecondsMetric(string) workqueue.SettableGaugeMetric {
   324  	return newOcMetric(workQueueUnfinishedWorkStats)
   325  }
   326  
   327  func (clientGoMetricAdapter) NewDeprecatedDepthMetric(name string) workqueue.GaugeMetric {
   328  	return newOcMetric(workQueueDepthStats).withTag(keyQueueName, name)
   329  }
   330  
   331  func (clientGoMetricAdapter) NewDeprecatedAddsMetric(name string) workqueue.CounterMetric {
   332  	return newOcMetric(workQueueItemsTotalStats).withTag(keyQueueName, name)
   333  }
   334  
   335  func (clientGoMetricAdapter) NewDeprecatedLatencyMetric(name string) workqueue.SummaryMetric {
   336  	m := newOcMetric(workQueueLatencyStats).withTag(keyQueueName, name)
   337  	// Convert microseconds to seconds for consistency across metrics.
   338  	return observeFunc(func(f float64) {
   339  		m.Observe(f / 1e6)
   340  	})
   341  }
   342  
   343  func (clientGoMetricAdapter) NewDeprecatedLongestRunningProcessorMicrosecondsMetric(string) workqueue.SettableGaugeMetric {
   344  	return newOcMetric(workQueueLongestRunningProcessorStats)
   345  }
   346  
   347  func (clientGoMetricAdapter) NewDeprecatedRetriesMetric(name string) workqueue.CounterMetric {
   348  	return newOcMetric(workQueueRetriesTotalStats).withTag(keyQueueName, name)
   349  }
   350  
   351  func (clientGoMetricAdapter) NewDeprecatedUnfinishedWorkSecondsMetric(string) workqueue.SettableGaugeMetric {
   352  	return newOcMetric(workQueueUnfinishedWorkStats)
   353  }
   354  
   355  func (clientGoMetricAdapter) NewDeprecatedWorkDurationMetric(name string) workqueue.SummaryMetric {
   356  	m := newOcMetric(workQueueWorkDurationStats).withTag(keyQueueName, name)
   357  	// Convert microseconds to seconds for consistency across metrics.
   358  	return observeFunc(func(f float64) {
   359  		m.Observe(f / 1e6)
   360  	})
   361  }