github.com/crossplane/upjet@v1.3.0/pkg/metrics/metrics.go

github.com/crossplane/upjet@v1.3.0/pkg/metrics/metrics.go (about)

     1  // SPDX-FileCopyrightText: 2023 The Crossplane Authors <https://crossplane.io>
     2  //
     3  // SPDX-License-Identifier: Apache-2.0
     4  
     5  package metrics
     6  
     7  import (
     8  	"context"
     9  	"sync"
    10  	"time"
    11  
    12  	"github.com/crossplane/crossplane-runtime/pkg/resource"
    13  	"github.com/pkg/errors"
    14  	"github.com/prometheus/client_golang/prometheus"
    15  	"k8s.io/apimachinery/pkg/runtime/schema"
    16  	"k8s.io/client-go/tools/cache"
    17  	"sigs.k8s.io/controller-runtime/pkg/cluster"
    18  	"sigs.k8s.io/controller-runtime/pkg/manager"
    19  	"sigs.k8s.io/controller-runtime/pkg/metrics"
    20  )
    21  
// Prometheus namespace and subsystem names used by the metrics declared
// in this file.
const (
	// promNSUpjet is the Namespace set on every metric in this file.
	promNSUpjet     = "upjet"
	// promSysTF is the Subsystem of the Terraform CLI/process metrics.
	promSysTF       = "terraform"
	// promSysResource is the Subsystem of the per-resource metrics.
	promSysResource = "resource"
)
    27  
// Metric collectors exported by this package. All of them are registered
// with the controller-runtime metrics registry in this file's init function.
var (
	// CLITime is a histogram of Terraform CLI invocation durations in
	// seconds, partitioned by the "subcommand" and "mode" labels.
	CLITime = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: promNSUpjet,
		Subsystem: promSysTF,
		Name:      "cli_duration",
		Help:      "Measures in seconds how long it takes a Terraform CLI invocation to complete",
		Buckets:   []float64{1.0, 3, 5, 10, 15, 30, 60, 120, 300},
	}, []string{"subcommand", "mode"})

	// ExternalAPITime is a histogram of Cloud SDK call durations in
	// seconds, partitioned by the "operation" label.
	ExternalAPITime = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: promNSUpjet,
		Subsystem: promSysResource,
		Name:      "ext_api_duration",
		Help:      "Measures in seconds how long it takes a Cloud SDK call to complete",
		Buckets:   []float64{1, 5, 10, 15, 30, 60, 120, 300, 600, 1800, 3600},
	}, []string{"operation"})

	// ExternalAPICalls is a counter metric of the number of external
	// API calls. The "service" and "operation" labels can be used to
	// classify calls into a two-level hierarchy, in which calls are
	// "operations" that belong to a "service". Users should beware of
	// the performance implications of the high cardinality that can
	// occur when there are many services and operations. See:
	// https://prometheus.io/docs/practices/naming/#labels
	ExternalAPICalls = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: promNSUpjet,
		Subsystem: promSysResource,
		Name:      "external_api_calls_total",
		Help:      "The number of external API calls.",
	}, []string{"service", "operation"})

	// DeletionTime is the histogram metric for collecting statistics on
	// the intervals between the deletion timestamp and the moment when
	// the resource is observed to be missing (actually deleted). It is
	// partitioned by the "group", "version" and "kind" labels.
	DeletionTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: promNSUpjet,
		Subsystem: promSysResource,
		Name:      "deletion_seconds",
		Help:      "Measures in seconds how long it takes for a resource to be deleted",
		Buckets:   []float64{1, 5, 10, 15, 30, 60, 120, 300, 600, 1800, 3600},
	}, []string{"group", "version", "kind"})

	// ReconcileDelay is the histogram metric for collecting statistics on
	// the delays between when the expected reconciles of an up-to-date
	// resource should happen and when the resource is actually
	// reconciled. Only delays from the expected reconcile times are
	// considered. It is partitioned by the "group", "version" and "kind"
	// labels.
	ReconcileDelay = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: promNSUpjet,
		Subsystem: promSysResource,
		Name:      "reconcile_delay_seconds",
		Help:      "Measures in seconds how long the reconciles for a resource have been delayed from the configured poll periods",
		Buckets:   []float64{1, 5, 10, 15, 30, 60, 120, 300, 600, 1800, 3600},
	}, []string{"group", "version", "kind"})

	// CLIExecutions is a gauge of the number of active (running)
	// Terraform CLI invocations, partitioned by the "subcommand" and
	// "mode" labels.
	CLIExecutions = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: promNSUpjet,
		Subsystem: promSysTF,
		Name:      "active_cli_invocations",
		Help:      "The number of active (running) Terraform CLI invocations",
	}, []string{"subcommand", "mode"})

	// TFProcesses is a gauge of the number of running Terraform CLI and
	// Terraform provider processes, partitioned by the "type" label.
	TFProcesses = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: promNSUpjet,
		Subsystem: promSysTF,
		Name:      "running_processes",
		Help:      "The number of running Terraform CLI and Terraform provider processes",
	}, []string{"type"})

	// TTRMeasurements is a histogram of the time-to-readiness (TTR)
	// measurements, in seconds, for the managed resources. It is
	// partitioned by the "group", "version" and "kind" labels.
	TTRMeasurements = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: promNSUpjet,
		Subsystem: promSysResource,
		Name:      "ttr",
		Help:      "Measures in seconds the time-to-readiness (TTR) for managed resources",
		Buckets:   []float64{1, 5, 10, 15, 30, 60, 120, 300, 600, 1800, 3600},
	}, []string{"group", "version", "kind"})
)
   111  
   112  var _ manager.Runnable = &MetricRecorder{}
   113  
// MetricRecorder keeps per-resource-name bookkeeping that is used to report
// reconcile delays through the ReconcileDelay histogram. Its Start method
// watches delete events of the configured kind and drops the bookkeeping of
// deleted resources.
type MetricRecorder struct {
	// observations maps a resource name to its *Observations.
	observations sync.Map
	// gvk is the kind for which Start requests an informer.
	gvk          schema.GroupVersionKind
	// cluster supplies the cache from which Start obtains the informer.
	cluster      cluster.Cluster

	// pollInterval is added to the current time by SetReconcileTime to
	// compute the expected time of the next reconcile.
	pollInterval time.Duration
}
   121  
// Observations is the reconcile-delay bookkeeping that MetricRecorder
// stores for a single resource name.
type Observations struct {
	// expectedReconcileTime is the time at which the next reconcile is
	// expected. It is written by MetricRecorder.SetReconcileTime.
	expectedReconcileTime *time.Time
	// observeReconcileDelay is set by MetricRecorder.SetReconcileTime and
	// cleared by MetricRecorder.ObserveReconcileDelay once a delay has been
	// observed for the current expected reconcile time.
	observeReconcileDelay bool
}
   126  
   127  func NewMetricRecorder(gvk schema.GroupVersionKind, c cluster.Cluster, pollInterval time.Duration) *MetricRecorder {
   128  	return &MetricRecorder{
   129  		gvk:          gvk,
   130  		cluster:      c,
   131  		pollInterval: pollInterval,
   132  	}
   133  }
   134  
   135  func (r *MetricRecorder) SetReconcileTime(name string) {
   136  	if r == nil {
   137  		return
   138  	}
   139  	o, ok := r.observations.Load(name)
   140  	if !ok {
   141  		o = &Observations{}
   142  		r.observations.Store(name, o)
   143  	}
   144  	t := time.Now().Add(r.pollInterval)
   145  	o.(*Observations).expectedReconcileTime = &t
   146  	o.(*Observations).observeReconcileDelay = true
   147  }
   148  
   149  func (r *MetricRecorder) ObserveReconcileDelay(gvk schema.GroupVersionKind, name string) {
   150  	if r == nil {
   151  		return
   152  	}
   153  	o, _ := r.observations.Load(name)
   154  	if o == nil || !o.(*Observations).observeReconcileDelay || o.(*Observations).expectedReconcileTime == nil {
   155  		return
   156  	}
   157  	d := time.Since(*o.(*Observations).expectedReconcileTime)
   158  	if d < 0 {
   159  		d = 0
   160  	}
   161  	ReconcileDelay.WithLabelValues(gvk.Group, gvk.Version, gvk.Kind).Observe(d.Seconds())
   162  	o.(*Observations).observeReconcileDelay = false
   163  }
   164  
   165  func (r *MetricRecorder) Start(ctx context.Context) error {
   166  	inf, err := r.cluster.GetCache().GetInformerForKind(ctx, r.gvk)
   167  	if err != nil {
   168  		return errors.Wrapf(err, "cannot get informer for metric recorder for resource %s", r.gvk)
   169  	}
   170  
   171  	registered, err := inf.AddEventHandler(cache.ResourceEventHandlerFuncs{
   172  		DeleteFunc: func(obj interface{}) {
   173  			if final, ok := obj.(cache.DeletedFinalStateUnknown); ok {
   174  				obj = final.Obj
   175  			}
   176  			managed := obj.(resource.Managed)
   177  			r.observations.Delete(managed.GetName())
   178  		},
   179  	})
   180  	if err != nil {
   181  		return errors.Wrap(err, "cannot add delete event handler to informer for metric recorder")
   182  	}
   183  	defer inf.RemoveEventHandler(registered) //nolint:errcheck // this happens on destruction. We cannot do anything anyway.
   184  
   185  	<-ctx.Done()
   186  
   187  	return nil
   188  }
   189  
   190  func init() {
   191  	metrics.Registry.MustRegister(CLITime, CLIExecutions, TFProcesses, TTRMeasurements, ExternalAPITime, ExternalAPICalls, DeletionTime, ReconcileDelay)
   192  }