github.com/verrazzano/verrazzano@v1.7.0/application-operator/metricsexporter/metricsexporter_utils.go (about)

     1  // Copyright (c) 2022, Oracle and/or its affiliates.
     2  // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
     3  
     4  package metricsexporter
     5  
     6  import (
     7  	"fmt"
     8  	"net/http"
     9  	"time"
    10  
    11  	"github.com/pkg/errors"
    12  	"github.com/prometheus/client_golang/prometheus"
    13  	"github.com/prometheus/client_golang/prometheus/promhttp"
    14  	vzlogInit "github.com/verrazzano/verrazzano/pkg/log"
    15  	"github.com/verrazzano/verrazzano/pkg/log/vzlog"
    16  	"go.uber.org/zap"
    17  	"k8s.io/apimachinery/pkg/util/wait"
    18  )
    19  
// metricName is the key type used to look up entries in the counter and
// duration metric maps held by the metrics exporter.
type metricName string

// Lookup keys for every metric defined in this package. Each subject has three
// keys: a success counter, an error counter and a duration. These strings are
// map keys only; the names exposed to Prometheus are set where the metrics are
// constructed.
const (
	// Keys for the reconcile metrics.
	AppconfigReconcileCounter              metricName = "appconfig reconcile counter"
	AppconfigReconcileError                metricName = "appconfig reconcile error"
	AppconfigReconcileDuration             metricName = "appconfig reconcile duration"
	CohworkloadReconcileCounter            metricName = "coherence reconcile counter"
	CohworkloadReconcileError              metricName = "coherence reconcile error"
	CohworkloadReconcileDuration           metricName = "coherence reconcile duration"
	HelidonReconcileCounter                metricName = "helidon reconcile counter"
	HelidonReconcileError                  metricName = "helidon reconcile error"
	HelidonReconcileDuration               metricName = "helidon reconcile duration"
	IngresstraitReconcileCounter           metricName = "ingress reconcile counter"
	IngresstraitReconcileError             metricName = "ingress reconcile error"
	IngresstraitReconcileDuration          metricName = "ingress reconcile duration"
	// Keys for the handle metrics.
	AppconfigHandleCounter                 metricName = "appconfig handle counter"
	AppconfigHandleError                   metricName = "appconfig handle error"
	AppconfigHandleDuration                metricName = "appconfig handle duration"
	IstioHandleCounter                     metricName = "istio handle counter"
	IstioHandleError                       metricName = "istio handle error"
	IstioHandleDuration                    metricName = "istio handle duration"
	LabelerPodHandleCounter                metricName = "LabelerPod handle counter"
	LabelerPodHandleError                  metricName = "LabelerPod handle error"
	LabelerPodHandleDuration               metricName = "LabelerPod handle duration"
	BindingUpdaterHandleCounter            metricName = "BindingUpdater handle counter"
	BindingUpdaterHandleError              metricName = "BindingUpdater handle error"
	BindingUpdaterHandleDuration           metricName = "BindingUpdater handle duration"
	MultiClusterAppconfigPodHandleCounter  metricName = "MultiClusterAppconfig handle counter"
	MultiClusterAppconfigPodHandleError    metricName = "MultiClusterAppconfig handle error"
	MultiClusterAppconfigPodHandleDuration metricName = "MultiClusterAppconfig handle duration"
	MultiClusterCompHandleCounter          metricName = "MultiClusterComp handle counter"
	MultiClusterCompHandleError            metricName = "MultiClusterComp handle error"
	MultiClusterCompHandleDuration         metricName = "MultiClusterComp handle duration"
	MultiClusterConfigmapHandleCounter     metricName = "MultiClusterConfigmap handle counter"
	MultiClusterConfigmapHandleError       metricName = "MultiClusterConfigmap handle error"
	MultiClusterConfigmapHandleDuration    metricName = "MultiClusterConfigmap handle duration"
	MultiClusterSecretHandleCounter        metricName = "MultiClusterSecret handle counter"
	MultiClusterSecretHandleError          metricName = "MultiClusterSecret handle error"
	MultiClusterSecretHandleDuration       metricName = "MultiClusterSecret handle duration"
	VzProjHandleCounter                    metricName = "VzProj handle counter"
	VzProjHandleError                      metricName = "VzProj handle error"
	VzProjHandleDuration                   metricName = "VzProj handle duration"
)
    63  
// init runs when the package is loaded: it first builds the package-level
// MetricsExp object and then kicks off registration of its metrics.
func init() {
	// Order matters: RegisterMetrics reads the maps that RequiredInitialization creates.
	RequiredInitialization()
	RegisterMetrics()
}
    68  
    69  // RequiredInitialization initializes the metrics object, but does not register the metrics
    70  func RequiredInitialization() {
    71  	MetricsExp = metricsExporter{
    72  		internalConfig: initConfiguration(),
    73  		internalData: data{
    74  			simpleCounterMetricMap: initCounterMetricMap(),
    75  			durationMetricMap:      initDurationMetricMap(),
    76  		},
    77  	}
    78  }
    79  
    80  // RegisterMetrics begins the process of registering metrics
    81  func RegisterMetrics() {
    82  	InitializeAllMetricsArray()
    83  	go registerMetricsHandlers(zap.S())
    84  }
    85  
    86  // InitializeAllMetricsArray initializes the allMetrics array
    87  func InitializeAllMetricsArray() {
    88  	// Loop through all metrics declarations in metric maps
    89  	for _, value := range MetricsExp.internalData.simpleCounterMetricMap {
    90  		MetricsExp.internalConfig.allMetrics = append(MetricsExp.internalConfig.allMetrics, value.metric)
    91  	}
    92  	for _, value := range MetricsExp.internalData.durationMetricMap {
    93  		MetricsExp.internalConfig.allMetrics = append(MetricsExp.internalConfig.allMetrics, value.metric)
    94  	}
    95  
    96  }
    97  
    98  // initCounterMetricMap initializes the simpleCounterMetricMap for the metricsExporter object
    99  func initCounterMetricMap() map[metricName]*SimpleCounterMetric {
   100  	return map[metricName]*SimpleCounterMetric{
   101  		AppconfigReconcileCounter: {
   102  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   103  				Name: "vz_application_operator_appconfig_successful_reconcile_total",
   104  				Help: "Tracks how many times the appconfig reconcile process has been successful"}),
   105  		},
   106  		AppconfigReconcileError: {
   107  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   108  				Name: "vz_application_operator_appconfig_error_reconcile_total",
   109  				Help: "Tracks how many times the appconfig reconcile process has failed"}),
   110  		},
   111  		CohworkloadReconcileCounter: {
   112  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   113  				Name: "vz_application_operator_cohworkload_successful_reconcile_total",
   114  				Help: "Tracks how many times the cohworkload reconcile process has been successful"}),
   115  		},
   116  		CohworkloadReconcileError: {
   117  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   118  				Name: "vz_application_operator_cohworkload_error_reconcile_total",
   119  				Help: "Tracks how many times the cohworkload reconcile process has failed"}),
   120  		},
   121  		HelidonReconcileCounter: {
   122  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   123  				Name: "vz_application_operator_helidonworkload_successful_reconcile_total",
   124  				Help: "Tracks how many times the helidonworkload reconcile process has been successful"}),
   125  		},
   126  		HelidonReconcileError: {
   127  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   128  				Name: "vz_application_operator_helidonworkload_error_reconcile_total",
   129  				Help: "Tracks how many times the helidonworkload reconcile process has failed"}),
   130  		},
   131  		IngresstraitReconcileCounter: {
   132  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   133  				Name: "vz_application_operator_ingresstrait_successful_reconcile_total",
   134  				Help: "Tracks how many times the ingresstrait reconcile process has been successful"}),
   135  		},
   136  		IngresstraitReconcileError: {
   137  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   138  				Name: "vz_application_operator_ingresstrait_error_reconcile_total",
   139  				Help: "Tracks how many times the ingresstrait reconcile process has failed"}),
   140  		},
   141  		AppconfigHandleCounter: {
   142  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   143  				Name: "vz_application_operator_appconfig_handle_total",
   144  				Help: "Tracks how many times appconfig handle process has been successful"}),
   145  		},
   146  		AppconfigHandleError: {
   147  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   148  				Name: "vz_application_operator_appconfig_error_handle_total",
   149  				Help: "Tracks how many times appconfig handle process has failed"}),
   150  		},
   151  		IstioHandleCounter: {
   152  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   153  				Name: "vz_application_operator_istio_handle_total",
   154  				Help: "Tracks how many times istio handle process has been successful"}),
   155  		},
   156  		IstioHandleError: {
   157  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   158  				Name: "vz_application_operator_istio_error_handle_total",
   159  				Help: "Tracks how many times istio handle process has failed"}),
   160  		},
   161  		LabelerPodHandleCounter: {
   162  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   163  				Name: "vz_application_operator_labelerPod_handle_total",
   164  				Help: "Tracks how many times the labeler pod handle process has been successful"}),
   165  		},
   166  		LabelerPodHandleError: {
   167  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   168  				Name: "vz_application_operator_labelerpod_error_handle_total",
   169  				Help: "Tracks how many times the labeler pod handle process has failed"}),
   170  		},
   171  		BindingUpdaterHandleCounter: {
   172  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   173  				Name: "vz_application_operator_bindingupdater_handle_total",
   174  				Help: "Tracks how many times the binding updater handle process has been successful"}),
   175  		},
   176  		BindingUpdaterHandleError: {
   177  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   178  				Name: "vz_application_operator_bindingupdater_error_handle_total",
   179  				Help: "Tracks how many times the binding updater handle process has failed"}),
   180  		},
   181  		MultiClusterAppconfigPodHandleCounter: {
   182  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   183  				Name: "vz_application_operator_multiclusterappconfig_handle_total",
   184  				Help: "Tracks how many times the multicluster appconfig pod handle process has been successful"}),
   185  		},
   186  		MultiClusterAppconfigPodHandleError: {
   187  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   188  				Name: "vz_application_operator_multiclusterappconfig_error_handle_total",
   189  				Help: "Tracks how many times the multicluster appconfig pod handle process has failed"}),
   190  		},
   191  		MultiClusterCompHandleCounter: {
   192  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   193  				Name: "vz_application_operator_multiclustercomp_handle_total",
   194  				Help: "Tracks how many times the multicluster component handle process has been successful"}),
   195  		},
   196  		MultiClusterCompHandleError: {
   197  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   198  				Name: "vz_application_operator_multiclustercomp_error_handle_total",
   199  				Help: "Tracks how many times the multicluster component handle process has failed"}),
   200  		},
   201  		MultiClusterConfigmapHandleCounter: {
   202  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   203  				Name: "vz_application_operator_multiclustercomp_handle_total",
   204  				Help: "Tracks how many times the multicluster configmap handle process has been successful"}),
   205  		},
   206  		MultiClusterConfigmapHandleError: {
   207  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   208  				Name: "vz_application_operator_multiclustercomp_error_handle_total",
   209  				Help: "Tracks how many times the multicluster configmap handle process has failed"}),
   210  		},
   211  		MultiClusterSecretHandleCounter: {
   212  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   213  				Name: "vz_application_operator_multiclustersecret_handle_total",
   214  				Help: "Tracks how many times the multicluster secret handle process has been successful"}),
   215  		},
   216  		MultiClusterSecretHandleError: {
   217  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   218  				Name: "vz_application_operator_multiclustersecret_error_handle_total",
   219  				Help: "Tracks how many times the multicluster secret handle process has failed"}),
   220  		},
   221  		VzProjHandleCounter: {
   222  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   223  				Name: "vz_application_operator_vzproj_handle_total",
   224  				Help: "Tracks how many times the vz project handle process has been successful"}),
   225  		},
   226  		VzProjHandleError: {
   227  			metric: prometheus.NewCounter(prometheus.CounterOpts{
   228  				Name: "vz_application_operator_vzproj_error_handle_total",
   229  				Help: "Tracks how many times the vz project handle process has failed"}),
   230  		},
   231  	}
   232  }
   233  
   234  // initDurationMetricMap initializes the DurationMetricMap for the metricsExporter object
   235  func initDurationMetricMap() map[metricName]*DurationMetrics {
   236  	return map[metricName]*DurationMetrics{
   237  		AppconfigReconcileDuration: {
   238  			metric: prometheus.NewSummary(prometheus.SummaryOpts{
   239  				Name: "vz_application_operator_appconfig_reconcile_duration",
   240  				Help: "The duration in seconds of vao appconfig reconcile process",
   241  			}),
   242  		},
   243  		CohworkloadReconcileDuration: {
   244  			metric: prometheus.NewSummary(prometheus.SummaryOpts{
   245  				Name: "vz_application_operator_cohworkload_reconcile_duration",
   246  				Help: "The duration in seconds of vao coherence workload reconcile process",
   247  			}),
   248  		},
   249  		HelidonReconcileDuration: {
   250  			metric: prometheus.NewSummary(prometheus.SummaryOpts{
   251  				Name: "vz_application_operator_helidon_reconcile_duration",
   252  				Help: "The duration in seconds of vao helidon reconcile process",
   253  			}),
   254  		},
   255  		IngresstraitReconcileDuration: {
   256  			metric: prometheus.NewSummary(prometheus.SummaryOpts{
   257  				Name: "vz_application_operator_ingresstrait_reconcile_duration",
   258  				Help: "The duration in seconds of vao ingresstrait reconcile process",
   259  			}),
   260  		},
   261  		AppconfigHandleDuration: {
   262  			metric: prometheus.NewSummary(prometheus.SummaryOpts{
   263  				Name: "vz_application_operator_appconfig_handle_duration",
   264  				Help: "The duration in seconds of vao appconfig handle process",
   265  			}),
   266  		},
   267  		IstioHandleDuration: {
   268  			metric: prometheus.NewSummary(prometheus.SummaryOpts{
   269  				Name: "vz_application_operator_istio_handle_duration",
   270  				Help: "The duration in seconds of vao istio handle process",
   271  			}),
   272  		},
   273  		LabelerPodHandleDuration: {
   274  			metric: prometheus.NewSummary(prometheus.SummaryOpts{
   275  				Name: "vz_application_operator_labelerpod_handle_duration",
   276  				Help: "The duration in seconds of vao labeler pod handle process",
   277  			}),
   278  		},
   279  		MultiClusterConfigmapHandleDuration: {
   280  			metric: prometheus.NewSummary(prometheus.SummaryOpts{
   281  				Name: "vz_application_operator_multiclusterconfigmap_handle_duration",
   282  				Help: "The duration in seconds of vao multicluster configmap handle process",
   283  			}),
   284  		},
   285  		MultiClusterAppconfigPodHandleDuration: {
   286  			metric: prometheus.NewSummary(prometheus.SummaryOpts{
   287  				Name: "vz_application_operator_multiclusterappconfig_handle_duration",
   288  				Help: "The duration in seconds of vao multicluster appconfig process",
   289  			}),
   290  		},
   291  		MultiClusterCompHandleDuration: {
   292  			metric: prometheus.NewSummary(prometheus.SummaryOpts{
   293  				Name: "vz_application_operator_multiclustercomp_handle_duration",
   294  				Help: "The duration in seconds of vao multicluster component handle process",
   295  			}),
   296  		},
   297  		MultiClusterSecretHandleDuration: {
   298  			metric: prometheus.NewSummary(prometheus.SummaryOpts{
   299  				Name: "vz_application_operator_multiclustersecret_handle_duration",
   300  				Help: "The duration in seconds of vao multicluster secret handle process",
   301  			}),
   302  		},
   303  		VzProjHandleDuration: {
   304  			metric: prometheus.NewSummary(prometheus.SummaryOpts{
   305  				Name: "vz_application_operator_vzproj_handle_duration",
   306  				Help: "The duration in seconds of vao vz project handle process",
   307  			}),
   308  		},
   309  		BindingUpdaterHandleDuration: {
   310  			metric: prometheus.NewSummary(prometheus.SummaryOpts{
   311  				Name: "vz_application_operator_bindingupdater_handle_duration",
   312  				Help: "The duration in seconds of vao binding updater handle process",
   313  			}),
   314  		},
   315  	}
   316  }
   317  
   318  // registerMetricsHandlersHelper is a helper function that assists in registering metrics
   319  func registerMetricsHandlersHelper() error {
   320  	var errorObserved error
   321  	for metric := range MetricsExp.internalConfig.failedMetrics {
   322  		err := MetricsExp.internalConfig.registry.Register(metric)
   323  		if err != nil {
   324  			if errorObserved != nil {
   325  				errorObserved = errors.Wrap(errorObserved, err.Error())
   326  			} else {
   327  				errorObserved = err
   328  			}
   329  		} else {
   330  			// If a metric is registered, delete it from the failed metrics map so that it is not retried
   331  			delete(MetricsExp.internalConfig.failedMetrics, metric)
   332  		}
   333  	}
   334  	return errorObserved
   335  }
   336  
   337  // registerMetricsHandlers registers the metrics and provides error handling
   338  func registerMetricsHandlers(log *zap.SugaredLogger) {
   339  	// Get list of metrics to register initially
   340  	initializeFailedMetricsArray()
   341  	// Loop until there is no error in registering
   342  	for err := registerMetricsHandlersHelper(); err != nil; err = registerMetricsHandlersHelper() {
   343  		log.Infof("Failed to register metrics for VMI %v", err)
   344  		time.Sleep(time.Second)
   345  	}
   346  }
   347  
   348  // initializeFailedMetricsArray initializes the failedMetrics array
   349  func initializeFailedMetricsArray() {
   350  	for i, metric := range MetricsExp.internalConfig.allMetrics {
   351  		MetricsExp.internalConfig.failedMetrics[metric] = i
   352  	}
   353  }
   354  
   355  // StartMetricsServer starts the metric server to begin emitting metrics to Prometheus
   356  func StartMetricsServer() error {
   357  	vlog, err := vzlog.EnsureResourceLogger(&vzlog.ResourceConfig{
   358  		Name:           "",
   359  		Namespace:      "",
   360  		ID:             "",
   361  		Generation:     0,
   362  		ControllerName: "metricsexporter",
   363  	})
   364  	if err != nil {
   365  		return err
   366  	}
   367  	go wait.Until(func() {
   368  		http.Handle("/metrics", promhttp.Handler())
   369  		server := &http.Server{
   370  			Addr:              ":9100",
   371  			ReadHeaderTimeout: 3 * time.Second,
   372  		}
   373  		err := server.ListenAndServe()
   374  		if err != nil {
   375  			vlog.Oncef("Failed to start metrics server for VMI: %v", err)
   376  		}
   377  	}, time.Second*3, wait.NeverStop)
   378  	return nil
   379  }
   380  
   381  // initConfiguration returns an empty struct of type configuration
   382  func initConfiguration() configuration {
   383  	return configuration{
   384  		allMetrics:    []prometheus.Collector{},
   385  		failedMetrics: map[prometheus.Collector]int{},
   386  		registry:      prometheus.DefaultRegisterer,
   387  	}
   388  }
   389  
   390  // GetSimpleCounterMetric returns a simpleCounterMetric from the simpleCounterMetricMap given a metricName
   391  func GetSimpleCounterMetric(name metricName) (*SimpleCounterMetric, error) {
   392  	counterMetric, ok := MetricsExp.internalData.simpleCounterMetricMap[name]
   393  	if !ok {
   394  		return nil, fmt.Errorf("%v not found in SimpleCounterMetricMap due to metricName being defined, but not being a key in the map", name)
   395  	}
   396  	return counterMetric, nil
   397  }
   398  
   399  // GetDurationMetric returns a durationMetric from the durationMetricMap given a metricName
   400  func GetDurationMetric(name metricName) (*DurationMetrics, error) {
   401  	durationMetric, ok := MetricsExp.internalData.durationMetricMap[name]
   402  	if !ok {
   403  		return nil, fmt.Errorf("%v not found in durationMetricMap due to metricName being defined, but not being a key in the map", name)
   404  	}
   405  	return durationMetric, nil
   406  }
   407  func ExposeControllerMetrics(controllerName string, successname metricName, errorname metricName, durationname metricName) (*SimpleCounterMetric, *SimpleCounterMetric, *DurationMetrics, *zap.SugaredLogger, error) {
   408  	zapLogForMetrics := zap.S().With(vzlogInit.FieldController, controllerName)
   409  	counterMetricObject, err := GetSimpleCounterMetric(successname)
   410  	if err != nil {
   411  		zapLogForMetrics.Error(err)
   412  		return nil, nil, nil, nil, err
   413  	}
   414  	errorCounterMetricObject, err := GetSimpleCounterMetric(errorname)
   415  	if err != nil {
   416  		zapLogForMetrics.Error(err)
   417  		return nil, nil, nil, nil, err
   418  	}
   419  
   420  	durationMetricObject, err := GetDurationMetric(durationname)
   421  	if err != nil {
   422  		zapLogForMetrics.Error(err)
   423  		return nil, nil, nil, nil, err
   424  	}
   425  	return counterMetricObject, errorCounterMetricObject, durationMetricObject, zapLogForMetrics, nil
   426  }