github.com/verrazzano/verrazzano-monitoring-operator@v0.0.30/pkg/metricsexporter/metricsexporter.go (about)

     1  // Copyright (C) 2022, Oracle and/or its affiliates.
     2  // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
     3  
     4  package metricsexporter
     5  
     6  import (
     7  	"strconv"
     8  
     9  	"github.com/pkg/errors"
    10  	"github.com/prometheus/client_golang/prometheus"
    11  	"go.uber.org/zap"
    12  )
    13  
// metricName is the key type used to look up metrics in the maps of the data struct.
type metricName string
    15  
// Keys under which the individual metrics are stored in the metric maps of the data struct.
// The same key may appear in more than one map (e.g. NamesConfigMap is used for both a
// counter and a timestamp metric).
const (
	NamesReconcile               metricName = "reconcile"
	NamesDeployment              metricName = "deployment"
	NamesDeploymentUpdateError   metricName = "deploymentUpdateErrorCounter"
	NamesDeploymentDeleteCounter metricName = "deploymentDeleteCounter"
	NamesDeploymentDeleteError   metricName = "deploymentDeleteErrorCounter"
	NamesDeploymentUpdateCounter metricName = "deploymentUpdateCounter"
	NamesConfigMap               metricName = "configMap"
	NamesServicesCreated         metricName = "servicesCreated"
	NamesServices                metricName = "services"
	NamesRoleBindings            metricName = "roleBindings"
	NamesIngress                 metricName = "ingress"
	NamesIngressDeleted          metricName = "ingressDeleted"
	NamesVMOUpdate               metricName = "vmoupdate"
	NamesQueue                   metricName = "queue"
)
    32  
// metricsExporter bundles the registration delegate, the registration configuration and the metric maps.
type metricsExporter struct {
	// internalMetricsDelegate is the receiver type of the registration helper methods
	internalMetricsDelegate metricsDelegate
	// internalConfig holds the collectors to register and the registry they are registered with
	internalConfig configuration
	// internalData holds the metric maps, keyed by metricName
	internalData data
}
    38  
// configuration holds the collectors to be registered, the collectors still awaiting successful registration, and the registry used for registration.
type configuration struct {
	// This metric array will be automatically populated with all the metrics from each map. Metrics not included in a map can be added to this array for registration.
	allMetrics []prometheus.Collector
	// This metric map will be automatically populated with all metrics which were not registered correctly. Metrics in this map will be retried periodically.
	failedMetrics map[prometheus.Collector]int
	registry      prometheus.Registerer
}
    46  
// data holds one map per metric wrapper type; each map is keyed by metricName.
type data struct {
	functionMetricsMap     map[metricName]*FunctionMetrics
	simpleCounterMetricMap map[metricName]*CounterMetric
	simpleGaugeMetricMap   map[metricName]*GaugeMetric
	durationMetricMap      map[metricName]*DurationMetric
	timestampMetricMap     map[metricName]*TimestampMetric
	errorMetricMap         map[metricName]*ErrorMetric
}
    55  
// metricsDelegate is a stateless type that serves as the receiver for the metric registration helper methods.
type metricsDelegate struct {
}
    58  
// FunctionMetrics groups the four metrics captured for a tracked function:
// call duration, total number of calls, timestamp of the last call and number of errors.
type FunctionMetrics struct {
	durationMetric    DurationMetric
	callsTotal        CounterMetric
	lastCallTimestamp TimestampMetric
	errorTotal        ErrorMetric
	// The function to create the label values for the error and timestamp metrics. A default is provided as &DefaultLabelFunction
	labelFunction *func(int64) string
	// index is incremented by LogStart and passed to labelFunction to build the label
	index int64
}
    69  
    70  // Method to call at the start of the tracked function. Starts the duration timer and increments the total count
    71  func (f *FunctionMetrics) LogStart() {
    72  	f.callsTotal.metric.Inc()
    73  	f.index = f.index + 1
    74  	f.durationMetric.TimerStart()
    75  }
    76  
    77  // Method to defer to the end of the tracked function. Stops the duration timer, sets the lastCallTimestamp. Pass in an argument of true to set an error for the current function call.
    78  func (f *FunctionMetrics) LogEnd(errorObserved bool) {
    79  	label := (*f.labelFunction)(f.index)
    80  	f.durationMetric.TimerStop()
    81  	f.lastCallTimestamp.SetLastTimeWithLabel(label)
    82  	if errorObserved {
    83  		f.errorTotal.IncWithLabel(label)
    84  	}
    85  }
    86  
    87  func (f *FunctionMetrics) IncError() {
    88  	f.errorTotal.IncWithLabel(f.GetLabel())
    89  }
    90  
    91  // Invokes the supplied labelFunction to return the string which would be used as a label. The label can be dynamic and may change depending on the labelFunctions behavior (i.e. a timestamp string)
    92  func (f *FunctionMetrics) GetLabel() string {
    93  	return (*f.labelFunction)(f.index)
    94  }
    95  
// CounterMetric is the type to count events such as the number of function calls.
type CounterMetric struct {
	metric prometheus.Counter
	// index is a local tally updated by Inc and Add; GetLabel renders it as a string
	index int64
}
   101  
   102  // Inc increases the counterMetric by one
   103  func (c *CounterMetric) Inc() {
   104  	c.index = c.index + 1
   105  	c.metric.Inc()
   106  }
   107  
   108  // Add increases the counter metric by the argument value
   109  func (c *CounterMetric) Add(num float64) {
   110  	c.index = c.index + int64(num)
   111  	c.metric.Add(num)
   112  }
   113  
   114  // GetLabel returns the current value of the counter as a string
   115  func (c *CounterMetric) GetLabel() string {
   116  	return strconv.FormatInt(c.index, 10)
   117  }
   118  
// GaugeMetric wraps a Prometheus gauge.
type GaugeMetric struct {
	metric prometheus.Gauge
}
   122  
   123  // Set sets the value of the gauge metric to the given value
   124  func (g *GaugeMetric) Set(num float64) {
   125  	g.metric.Set(num)
   126  }
   127  
   128  // SetToCurrentTime sets the value of the gauge metric to the system timestamp
   129  func (g *GaugeMetric) SetToCurrentTime() {
   130  	g.metric.SetToCurrentTime()
   131  }
   132  
   133  // Add sets the value of the gauge metric to the current value plus the given value
   134  func (g *GaugeMetric) Add(num float64) {
   135  	g.metric.Add(num)
   136  }
   137  
// DurationMetric is the type to track the length of a function call. Methods to start and stop the duration timer are available.
type DurationMetric struct {
	metric prometheus.Summary
	// timer is set by TimerStart and observed by TimerStop; it is nil until TimerStart is first called
	timer *prometheus.Timer
}
   143  
   144  // Creates a new timer, and starts the timer
   145  func (d *DurationMetric) TimerStart() {
   146  	d.timer = prometheus.NewTimer(d.metric)
   147  }
   148  
   149  // stops the timer and record the duration since the last call to TimerStart
   150  func (d *DurationMetric) TimerStop() {
   151  	d.timer.ObserveDuration()
   152  }
   153  
// TimestampMetric is the type to track the last timestamp of a function call. Includes methods to set the last timestamp.
type TimestampMetric struct {
	metric *prometheus.GaugeVec
	// labelFunction produces the label value used by SetLastTime
	labelFunction *func() string
}
   159  
   160  // Adds a timestamp as the current time. The label must be supplied as an argument
   161  func (t *TimestampMetric) SetLastTime() {
   162  	t.SetLastTimeWithLabel((*t.labelFunction)())
   163  }
   164  
   165  // Adds a timestamp as the current time. The label must be supplied as an argument
   166  func (t *TimestampMetric) SetLastTimeWithLabel(indexString string) {
   167  	lastTimeMetric, err := t.metric.GetMetricWithLabelValues(indexString)
   168  	if err != nil {
   169  		zap.S().Errorf("Failed to log the last reconcile time metric label %s: %v", indexString, err)
   170  	} else {
   171  		lastTimeMetric.SetToCurrentTime()
   172  	}
   173  }
   174  
// ErrorMetric is the type to track the occurrence of an error. Includes methods to add an error count.
type ErrorMetric struct {
	metric *prometheus.CounterVec
	// labelFunction produces the label value used by Inc
	labelFunction *func() string
}
   180  
   181  func (e *ErrorMetric) Inc() {
   182  	e.IncWithLabel((*e.labelFunction)())
   183  }
   184  
   185  // Adds an error count. The label must be supplied as an argument
   186  func (e *ErrorMetric) IncWithLabel(label string) {
   187  	errorMetric, err := e.metric.GetMetricWithLabelValues(label)
   188  	if err != nil {
   189  		zap.S().Errorf("Failed to get metric label %s: %v", label, err)
   190  	} else {
   191  		errorMetric.Inc()
   192  	}
   193  }
   194  
   195  // initConfiguration returns an empty configuration struct
   196  func initConfiguration() configuration {
   197  	return configuration{
   198  		allMetrics:    []prometheus.Collector{},
   199  		failedMetrics: map[prometheus.Collector]int{},
   200  		registry:      prometheus.DefaultRegisterer,
   201  	}
   202  }
   203  
   204  // initFunctionMetricsMap returns a populated map of functionMetrics to be used in the data struct, add additional metrics here
   205  func initFunctionMetricsMap() map[metricName]*FunctionMetrics {
   206  	return map[metricName]*FunctionMetrics{
   207  		NamesReconcile: {
   208  			durationMetric: DurationMetric{
   209  				metric: prometheus.NewSummary(prometheus.SummaryOpts{Name: "vmo_reconcile_duration_seconds", Help: "Tracks the duration of the reconcile function in seconds"}),
   210  			},
   211  			callsTotal: CounterMetric{
   212  				metric: prometheus.NewCounter(prometheus.CounterOpts{Name: "vmo_reconcile_total", Help: "Tracks how many times the syncHandlerStandardMode function is called. thisMetric corresponds to the number of reconciles performed by the VMO"}),
   213  			},
   214  			lastCallTimestamp: TimestampMetric{
   215  				metric: prometheus.NewGaugeVec(prometheus.GaugeOpts{Name: "vmo_reconcile_last_timestamp_seconds", Help: "The timestamp of the last time the syncHandlerStandardMode function completed"}, []string{"reconcile_index"}),
   216  			},
   217  			errorTotal: ErrorMetric{
   218  				metric: prometheus.NewCounterVec(prometheus.CounterOpts{Name: "vmo_reconcile_error_total", Help: "Tracks how many times the syncHandlerStandardMode function encounters an error"}, []string{"reconcile_index"}),
   219  			},
   220  			index:         int64(0),
   221  			labelFunction: &DefaultLabelFunction,
   222  		},
   223  
   224  		NamesDeployment: {
   225  			durationMetric: DurationMetric{
   226  				metric: prometheus.NewSummary(prometheus.SummaryOpts{Name: "vmo_deployment_duration_seconds", Help: "The duration of the last call to the deployment function"}),
   227  			},
   228  			callsTotal: CounterMetric{
   229  				metric: prometheus.NewCounter(prometheus.CounterOpts{Name: "vmo_deployment_total", Help: "Tracks how many times the deployment function is called"}),
   230  			},
   231  			lastCallTimestamp: TimestampMetric{
   232  				metric: prometheus.NewGaugeVec(prometheus.GaugeOpts{Name: "vmo_deployment_last_timestamp_seconds", Help: "The timestamp of the last time the deployment function completed"}, []string{"deployment_index"}),
   233  			},
   234  			errorTotal: ErrorMetric{
   235  				metric: prometheus.NewCounterVec(prometheus.CounterOpts{Name: "vmo_deployment_error_total", Help: "Tracks how many times the deployment failed"}, []string{"deployment_index"}),
   236  			},
   237  			index:         int64(0),
   238  			labelFunction: &DefaultLabelFunction,
   239  		},
   240  
   241  		NamesIngress: {
   242  			durationMetric: DurationMetric{
   243  				metric: prometheus.NewSummary(prometheus.SummaryOpts{Name: "vmo_ingress_duration_seconds", Help: "Tracks the duration of the ingress function in seconds"}),
   244  			},
   245  			callsTotal: CounterMetric{
   246  				metric: prometheus.NewCounter(prometheus.CounterOpts{Name: "vmo_ingress_total", Help: "Tracks how many times the ingress function is called. This metric corresponds to the number of ingress requests performed by the VMO"}),
   247  			},
   248  			lastCallTimestamp: TimestampMetric{
   249  				metric: prometheus.NewGaugeVec(prometheus.GaugeOpts{Name: "vmo_ingress_last_timestamp_seconds", Help: "The timestamp of the last time the ingress function completed"}, []string{"ingress_index"}),
   250  			},
   251  			errorTotal: ErrorMetric{
   252  				metric: prometheus.NewCounterVec(prometheus.CounterOpts{Name: "vmo_ingress_error_total", Help: "Tracks how many times the syncHandlerStandardMode function encounters an error"}, []string{"ingress_index"}),
   253  			},
   254  			index:         int64(0),
   255  			labelFunction: &DefaultLabelFunction,
   256  		},
   257  	}
   258  }
   259  
   260  // initCounterMetricMap returns a populated map of counter metrics to be used in the data struct, add additional metrics here
   261  func initCounterMetricMap() map[metricName]*CounterMetric {
   262  	return map[metricName]*CounterMetric{
   263  		NamesDeploymentUpdateCounter: {
   264  			metric: prometheus.NewCounter(prometheus.CounterOpts{Name: "vmo_deployment_update_total", Help: "Tracks how many times a deployment update is attempted"}),
   265  		},
   266  		NamesDeploymentDeleteCounter: {
   267  			metric: prometheus.NewCounter(prometheus.CounterOpts{Name: "vmo_deployment_delete_total", Help: "Tracks how many times the delete functionality is invoked"}),
   268  		},
   269  		NamesIngressDeleted: {
   270  			metric: prometheus.NewCounter(prometheus.CounterOpts{Name: "vmo_ingress_delete_total", Help: "Tracks how many ingresses are deleted"}),
   271  		},
   272  		NamesConfigMap: {
   273  			metric: prometheus.NewCounter(prometheus.CounterOpts{Name: "vmo_configmap_total", Help: "Tracks how many times the configMap functionality is invoked"}),
   274  		},
   275  		NamesServices: {
   276  			metric: prometheus.NewCounter(prometheus.CounterOpts{Name: "vmo_services_total", Help: "Tracks how many times the services functionality is invoked"}),
   277  		},
   278  		NamesServicesCreated: {
   279  			metric: prometheus.NewCounter(prometheus.CounterOpts{Name: "vmo_services_created_total", Help: "Tracks how many services are created"}),
   280  		},
   281  		NamesRoleBindings: {
   282  			metric: prometheus.NewCounter(prometheus.CounterOpts{Name: "vmo_rolebindings_total", Help: "Tracks how many times the rolebindings functionality is invoked"}),
   283  		},
   284  		NamesVMOUpdate: {
   285  			metric: prometheus.NewCounter(prometheus.CounterOpts{Name: "vmo_updates_total", Help: "Tracks how many times the update functionality is invoked"}),
   286  		},
   287  	}
   288  }
   289  
   290  // initGaugeMetricMap returns a map of gauge metrics to be used in the data struct, add additional metrics here
   291  func initGaugeMetricMap() map[metricName]*GaugeMetric {
   292  	return map[metricName]*GaugeMetric{
   293  		NamesQueue: {
   294  			metric: prometheus.NewGauge(prometheus.GaugeOpts{Name: "vmo_work_queue_size", Help: "Tracks the size of the VMO work queue"}),
   295  		},
   296  	}
   297  }
   298  
   299  // initDurationMetricMap returns a map of duration metrics to be used in the data struct, add additional metrics here
   300  func initDurationMetricMap() map[metricName]*DurationMetric {
   301  	return map[metricName]*DurationMetric{}
   302  }
   303  
   304  // initTimestampMetricMap returns a map of timestamp metrics to be used in the data struct, add additional metrics here
   305  func initTimestampMetricMap() map[metricName]*TimestampMetric {
   306  	return map[metricName]*TimestampMetric{
   307  		NamesConfigMap: {
   308  			metric:        prometheus.NewGaugeVec(prometheus.GaugeOpts{Name: "vmo_configmap_last_successful_timestamp", Help: "The timestamp of the last time the configMap function completed successfully"}, []string{"configMap_index"}),
   309  			labelFunction: &configMapLabelFunction,
   310  		},
   311  		NamesServices: {
   312  			metric:        prometheus.NewGaugeVec(prometheus.GaugeOpts{Name: "vmo_services_last_successful_timestamp", Help: "The timestamp of the last time the createService function completed successfully"}, []string{"service_index"}),
   313  			labelFunction: &servicesLabelFunction,
   314  		},
   315  		NamesRoleBindings: {
   316  			metric:        prometheus.NewGaugeVec(prometheus.GaugeOpts{Name: "vmo_rolebindings_last_successful_timestamp", Help: "The timestamp of the last time the roleBindings function completed successfully"}, []string{"rolebindings_index"}),
   317  			labelFunction: &roleBindingLabelFunction,
   318  		},
   319  		NamesVMOUpdate: {
   320  			metric:        prometheus.NewGaugeVec(prometheus.GaugeOpts{Name: "vmo_update_last_successful_timestamp", Help: "The timestamp of the last time the vmo update completed successfully"}, []string{"update_index"}),
   321  			labelFunction: &VMOUpdateLabelFunction,
   322  		},
   323  	}
   324  }
   325  
   326  // initErrorMetricMap returns a map of error metrics to be used in the data struct, add additional metrics here
   327  func initErrorMetricMap() map[metricName]*ErrorMetric {
   328  	return map[metricName]*ErrorMetric{
   329  		NamesDeploymentUpdateError: {
   330  			metric:        prometheus.NewCounterVec(prometheus.CounterOpts{Name: "vmo_deployment_update_error_total", Help: "Tracks how many times a deployment update fails"}, []string{"deployment_index"}),
   331  			labelFunction: &deploymentLabelFunction,
   332  		},
   333  		NamesDeploymentDeleteError: {
   334  			metric:        prometheus.NewCounterVec(prometheus.CounterOpts{Name: "vmo_deployment_delete_error_counter", Help: "Tracks how many times the delete functionality failed"}, []string{"deployment_index"}),
   335  			labelFunction: &deploymentLabelFunction,
   336  		},
   337  	}
   338  }
   339  
// Package-level state of the metrics exporter.
// The label function variables are declared here without a value; the metric maps store
// pointers to them, so a function assigned later is picked up by the metrics.
// NOTE(review): presumably they are assigned elsewhere in the package before first use — confirm.
var (
	// MetricsExp is the exporter instance read and modified by the metricsDelegate methods
	MetricsExp = metricsExporter{}
	// DefaultLabelFunction builds the label for FunctionMetrics from the call index
	DefaultLabelFunction     func(index int64) string
	deploymentLabelFunction  func() string
	configMapLabelFunction   func() string
	servicesLabelFunction    func() string
	roleBindingLabelFunction func() string
	VMOUpdateLabelFunction   func() string
	// NOTE(review): TestDelegate looks like a delegate instance exposed for tests — confirm
	TestDelegate = metricsDelegate{}
)
   350  
   351  // initializeFailedMetricsArray simply adds metrics in the allMetrics array to the failed metrics map, call this before registering metrics
   352  func (md *metricsDelegate) initializeFailedMetricsArray() {
   353  	//the failed metrics array will initially contain all metrics so they may be registered
   354  	for i, metric := range MetricsExp.internalConfig.allMetrics {
   355  		MetricsExp.internalConfig.failedMetrics[metric] = i
   356  	}
   357  }
   358  
   359  // registerMetricsHandlersHelper loops through the failed metrics map and deletes metrics which have been registered successfully
   360  func (md *metricsDelegate) registerMetricsHandlersHelper() error {
   361  	var errorObserved error
   362  	for metric := range MetricsExp.internalConfig.failedMetrics {
   363  		err := MetricsExp.internalConfig.registry.Register(metric)
   364  		if err != nil {
   365  			if errorObserved != nil {
   366  				errorObserved = errors.Wrap(errorObserved, err.Error())
   367  			} else {
   368  				errorObserved = err
   369  			}
   370  		} else {
   371  			//if a metric is registered, delete it from the failed metrics map so that it is not retried
   372  			delete(MetricsExp.internalConfig.failedMetrics, metric)
   373  		}
   374  	}
   375  	return errorObserved
   376  }