github.com/crossplane/upjet@v1.3.0/pkg/metrics/metrics.go (about) 1 // SPDX-FileCopyrightText: 2023 The Crossplane Authors <https://crossplane.io> 2 // 3 // SPDX-License-Identifier: Apache-2.0 4 5 package metrics 6 7 import ( 8 "context" 9 "sync" 10 "time" 11 12 "github.com/crossplane/crossplane-runtime/pkg/resource" 13 "github.com/pkg/errors" 14 "github.com/prometheus/client_golang/prometheus" 15 "k8s.io/apimachinery/pkg/runtime/schema" 16 "k8s.io/client-go/tools/cache" 17 "sigs.k8s.io/controller-runtime/pkg/cluster" 18 "sigs.k8s.io/controller-runtime/pkg/manager" 19 "sigs.k8s.io/controller-runtime/pkg/metrics" 20 ) 21 22 const ( 23 promNSUpjet = "upjet" 24 promSysTF = "terraform" 25 promSysResource = "resource" 26 ) 27 28 var ( 29 // CLITime is the Terraform CLI execution times histogram. 30 CLITime = prometheus.NewHistogramVec(prometheus.HistogramOpts{ 31 Namespace: promNSUpjet, 32 Subsystem: promSysTF, 33 Name: "cli_duration", 34 Help: "Measures in seconds how long it takes a Terraform CLI invocation to complete", 35 Buckets: []float64{1.0, 3, 5, 10, 15, 30, 60, 120, 300}, 36 }, []string{"subcommand", "mode"}) 37 38 // ExternalAPITime is the SDK processing times histogram. 39 ExternalAPITime = prometheus.NewHistogramVec(prometheus.HistogramOpts{ 40 Namespace: promNSUpjet, 41 Subsystem: promSysResource, 42 Name: "ext_api_duration", 43 Help: "Measures in seconds how long it takes a Cloud SDK call to complete", 44 Buckets: []float64{1, 5, 10, 15, 30, 60, 120, 300, 600, 1800, 3600}, 45 }, []string{"operation"}) 46 47 // ExternalAPICalls is a counter metric of the number of external 48 // API calls. "service" and "operation" labels could be used to 49 // classify calls into a two-level hierarchy, in which calls are 50 // "operations" that belong to a "service". Users should beware of 51 // performance implications of high cardinality that could occur 52 // when there are many services and operations. See: 53 // https://prometheus.io/docs/practices/naming/#labels 54 ExternalAPICalls = prometheus.NewCounterVec(prometheus.CounterOpts{ 55 Namespace: promNSUpjet, 56 Subsystem: promSysResource, 57 Name: "external_api_calls_total", 58 Help: "The number of external API calls.", 59 }, []string{"service", "operation"}) 60 61 // DeletionTime is the histogram metric for collecting statistics on the 62 // intervals between the deletion timestamp and the moment when 63 // the resource is observed to be missing (actually deleted). 64 DeletionTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{ 65 Namespace: promNSUpjet, 66 Subsystem: promSysResource, 67 Name: "deletion_seconds", 68 Help: "Measures in seconds how long it takes for a resource to be deleted", 69 Buckets: []float64{1, 5, 10, 15, 30, 60, 120, 300, 600, 1800, 3600}, 70 }, []string{"group", "version", "kind"}) 71 72 // ReconcileDelay is the histogram metric for collecting statistics on the 73 // delays between when the expected reconciles of an up-to-date resource 74 // should happen and when the resource is actually reconciled. Only 75 // delays from the expected reconcile times are considered. 76 ReconcileDelay = prometheus.NewHistogramVec(prometheus.HistogramOpts{ 77 Namespace: promNSUpjet, 78 Subsystem: promSysResource, 79 Name: "reconcile_delay_seconds", 80 Help: "Measures in seconds how long the reconciles for a resource have been delayed from the configured poll periods", 81 Buckets: []float64{1, 5, 10, 15, 30, 60, 120, 300, 600, 1800, 3600}, 82 }, []string{"group", "version", "kind"}) 83 84 // CLIExecutions are the active number of terraform CLI invocations. 85 CLIExecutions = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 86 Namespace: promNSUpjet, 87 Subsystem: promSysTF, 88 Name: "active_cli_invocations", 89 Help: "The number of active (running) Terraform CLI invocations", 90 }, []string{"subcommand", "mode"}) 91 92 // TFProcesses are the active number of 93 // terraform CLI & Terraform provider processes running. 94 TFProcesses = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 95 Namespace: promNSUpjet, 96 Subsystem: promSysTF, 97 Name: "running_processes", 98 Help: "The number of running Terraform CLI and Terraform provider processes", 99 }, []string{"type"}) 100 101 // TTRMeasurements are the time-to-readiness measurements for 102 // the managed resources. 103 TTRMeasurements = prometheus.NewHistogramVec(prometheus.HistogramOpts{ 104 Namespace: promNSUpjet, 105 Subsystem: promSysResource, 106 Name: "ttr", 107 Help: "Measures in seconds the time-to-readiness (TTR) for managed resources", 108 Buckets: []float64{1, 5, 10, 15, 30, 60, 120, 300, 600, 1800, 3600}, 109 }, []string{"group", "version", "kind"}) 110 ) 111 112 var _ manager.Runnable = &MetricRecorder{} 113 114 type MetricRecorder struct { 115 observations sync.Map 116 gvk schema.GroupVersionKind 117 cluster cluster.Cluster 118 119 pollInterval time.Duration 120 } 121 122 type Observations struct { 123 expectedReconcileTime *time.Time 124 observeReconcileDelay bool 125 } 126 127 func NewMetricRecorder(gvk schema.GroupVersionKind, c cluster.Cluster, pollInterval time.Duration) *MetricRecorder { 128 return &MetricRecorder{ 129 gvk: gvk, 130 cluster: c, 131 pollInterval: pollInterval, 132 } 133 } 134 135 func (r *MetricRecorder) SetReconcileTime(name string) { 136 if r == nil { 137 return 138 } 139 o, ok := r.observations.Load(name) 140 if !ok { 141 o = &Observations{} 142 r.observations.Store(name, o) 143 } 144 t := time.Now().Add(r.pollInterval) 145 o.(*Observations).expectedReconcileTime = &t 146 o.(*Observations).observeReconcileDelay = true 147 } 148 149 func (r *MetricRecorder) ObserveReconcileDelay(gvk schema.GroupVersionKind, name string) { 150 if r == nil { 151 return 152 } 153 o, _ := r.observations.Load(name) 154 if o == nil || !o.(*Observations).observeReconcileDelay || o.(*Observations).expectedReconcileTime == nil { 155 return 156 } 157 d := time.Since(*o.(*Observations).expectedReconcileTime) 158 if d < 0 { 159 d = 0 160 } 161 ReconcileDelay.WithLabelValues(gvk.Group, gvk.Version, gvk.Kind).Observe(d.Seconds()) 162 o.(*Observations).observeReconcileDelay = false 163 } 164 165 func (r *MetricRecorder) Start(ctx context.Context) error { 166 inf, err := r.cluster.GetCache().GetInformerForKind(ctx, r.gvk) 167 if err != nil { 168 return errors.Wrapf(err, "cannot get informer for metric recorder for resource %s", r.gvk) 169 } 170 171 registered, err := inf.AddEventHandler(cache.ResourceEventHandlerFuncs{ 172 DeleteFunc: func(obj interface{}) { 173 if final, ok := obj.(cache.DeletedFinalStateUnknown); ok { 174 obj = final.Obj 175 } 176 managed := obj.(resource.Managed) 177 r.observations.Delete(managed.GetName()) 178 }, 179 }) 180 if err != nil { 181 return errors.Wrap(err, "cannot add delete event handler to informer for metric recorder") 182 } 183 defer inf.RemoveEventHandler(registered) //nolint:errcheck // this happens on destruction. We cannot do anything anyway. 184 185 <-ctx.Done() 186 187 return nil 188 } 189 190 func init() { 191 metrics.Registry.MustRegister(CLITime, CLIExecutions, TFProcesses, TTRMeasurements, ExternalAPITime, ExternalAPICalls, DeletionTime, ReconcileDelay) 192 }