github.com/verrazzano/verrazzano@v1.7.0/platform-operator/metricsexporter/metricsexporter_utils.go (about)

     1  // Copyright (c) 2022, 2023, Oracle and/or its affiliates.
     2  // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
     3  
     4  package metricsexporter
     5  
     6  import (
     7  	"fmt"
     8  	"net/http"
     9  	"time"
    10  
    11  	"github.com/pkg/errors"
    12  	"github.com/prometheus/client_golang/prometheus"
    13  	"github.com/prometheus/client_golang/prometheus/promhttp"
    14  	"github.com/verrazzano/verrazzano/pkg/log/vzlog"
    15  	vzapi "github.com/verrazzano/verrazzano/platform-operator/apis/verrazzano/v1alpha1"
    16  	"github.com/verrazzano/verrazzano/platform-operator/constants"
    17  	"github.com/verrazzano/verrazzano/platform-operator/controllers/verrazzano/component/grafanadashboards"
    18  	"github.com/verrazzano/verrazzano/platform-operator/controllers/verrazzano/component/networkpolicies"
    19  	"github.com/verrazzano/verrazzano/platform-operator/controllers/verrazzano/component/registry"
    20  	"github.com/verrazzano/verrazzano/platform-operator/controllers/verrazzano/component/vmo"
    21  	"go.uber.org/zap"
    22  	"k8s.io/apimachinery/pkg/util/wait"
    23  )
    24  
    25  var MetricsExp MetricsExporter
    26  
    27  type metricName string
    28  
    29  const (
    30  	component                      = "component"
    31  	ReconcileCounter    metricName = "reconcile counter"
    32  	ReconcileError      metricName = "reconcile error"
    33  	ReconcileDuration   metricName = "reconcile duration"
    34  	AvailableComponents metricName = "available components"
    35  	EnabledComponents   metricName = "enabled components"
    36  )
    37  
    38  // Init cannot be called until the NGINX namespace is determined at startup
    39  func Init() {
    40  	RequiredInitialization()
    41  	RegisterMetrics(zap.S())
    42  }
    43  
    44  // This function initializes the metrics object, but does not register the metrics
    45  func RequiredInitialization() {
    46  	MetricsExp = MetricsExporter{
    47  		internalConfig: initConfiguration(),
    48  		internalData: data{
    49  			simpleCounterMetricMap:   initSimpleCounterMetricMap(),
    50  			simpleGaugeMetricMap:     initSimpleGaugeMetricMap(),
    51  			durationMetricMap:        initDurationMetricMap(),
    52  			componentHealth:          initComponentHealthMetrics(),
    53  			componentInstallDuration: initComponentInstallDurationMetrics(),
    54  			componentUpgradeDuration: initComponentUpgradeDurationMetrics(),
    55  		},
    56  	}
    57  	// initialize component availability metric to false
    58  	for _, component := range registry.GetComponents() {
    59  		if IsNonMetricComponent(component.Name()) {
    60  			continue
    61  		}
    62  		MetricsExp.internalData.componentHealth.SetComponentHealth(component.GetJSONName(), false, false)
    63  		SetComponentInstallDurationMetric(component.GetJSONName(), 0)
    64  		SetComponentUpgradeDurationMetric(component.GetJSONName(), 0)
    65  
    66  	}
    67  
    68  }
    69  
    70  // This function begins the process of registering metrics
    71  func RegisterMetrics(log *zap.SugaredLogger) {
    72  	InitializeAllMetricsArray()
    73  	go registerMetricsHandlers(log)
    74  }
    75  
    76  // This function initializes the simpleCounterMetricMap for the metricsExporter object
    77  func initSimpleCounterMetricMap() map[metricName]*SimpleCounterMetric {
    78  	return map[metricName]*SimpleCounterMetric{
    79  		ReconcileCounter: {
    80  			prometheus.NewCounter(prometheus.CounterOpts{
    81  				Name: "vz_platform_operator_reconcile_total",
    82  				Help: "The number of times the reconcile function has been called in the verrazzano-platform-operator",
    83  			}),
    84  		},
    85  		ReconcileError: {
    86  			prometheus.NewCounter(prometheus.CounterOpts{
    87  				Name: "vz_platform_operator_error_reconcile_total",
    88  				Help: "The number of times the reconcile function has returned an error in the verrazzano-platform-operator",
    89  			}),
    90  		},
    91  	}
    92  }
    93  
    94  func initComponentHealthMetrics() *ComponentHealth {
    95  	return &ComponentHealth{
    96  		available: prometheus.NewGaugeVec(prometheus.GaugeOpts{
    97  			Name: "vz_platform_operator_component_health",
    98  			Help: "Is component enabled and available",
    99  		}, []string{component}),
   100  	}
   101  }
   102  
   103  func initComponentInstallDurationMetrics() *ComponentInstallDuration {
   104  	return &ComponentInstallDuration{
   105  		installDuration: prometheus.NewGaugeVec(prometheus.GaugeOpts{
   106  			Name: "vz_platform_operator_component_install_duration_seconds",
   107  			Help: "The duration of the latest installation of each component in seconds",
   108  		}, []string{component}),
   109  	}
   110  }
   111  
   112  func initComponentUpgradeDurationMetrics() *ComponentUpgradeDuration {
   113  	return &ComponentUpgradeDuration{
   114  		upgradeDuration: prometheus.NewGaugeVec(prometheus.GaugeOpts{
   115  			Name: "vz_platform_operator_component_upgrade_duration_seconds",
   116  			Help: "The duration of the latest upgrade of each component in seconds",
   117  		}, []string{component}),
   118  	}
   119  }
   120  
   121  // This function initializes the simpleGaugeMetricMap for the metricsExporter object
   122  func initSimpleGaugeMetricMap() map[metricName]*SimpleGaugeMetric {
   123  	return map[metricName]*SimpleGaugeMetric{
   124  		AvailableComponents: {
   125  			metric: prometheus.NewGauge(prometheus.GaugeOpts{
   126  				Name: "vz_platform_operator_component_health_total",
   127  				Help: "The number of currently available Verrazzano components",
   128  			}),
   129  		},
   130  		EnabledComponents: {
   131  			metric: prometheus.NewGauge(prometheus.GaugeOpts{
   132  				Name: "vz_platform_operator_component_enabled_total",
   133  				Help: "The number of currently enabled Verrazzano components",
   134  			}),
   135  		},
   136  	}
   137  }
   138  
   139  // This function initializes the durationMetricMap for the metricsExporter object
   140  func initDurationMetricMap() map[metricName]*DurationMetric {
   141  	return map[metricName]*DurationMetric{
   142  		ReconcileDuration: {
   143  			metric: prometheus.NewSummary(prometheus.SummaryOpts{
   144  				Name: "vz_platform_operator_reconcile_duration",
   145  				Help: "The duration in seconds of vpo reconcile process",
   146  			}),
   147  		},
   148  	}
   149  }
   150  
   151  // This function is used to determine whether a durationTime for a component metric should be set and what the duration time is
   152  // If the start time is greater than the completion time, the metric will not be set
   153  // After this check, the function calculates the duration time and tries to set the metric of the component
   154  // If the component's name is not in the metric map, an error will be raised to prevent a seg fault
   155  func metricParserHelperFunction(log vzlog.VerrazzanoLogger, componentName string, startTime string, completionTime string, typeofOperation string) {
   156  	startInSeconds, err := time.Parse(time.RFC3339, startTime)
   157  	if err != nil {
   158  		log.Errorf("Error in parsing start time %s for operation %s for component %s", startTime, typeofOperation, componentName)
   159  		return
   160  	}
   161  	startInSecondsUnix := startInSeconds.Unix()
   162  	completionInSeconds, err := time.Parse(time.RFC3339, completionTime)
   163  	if err != nil {
   164  		log.Errorf("Error in parsing completion time %s for operation %s for component %s", completionTime, typeofOperation, componentName)
   165  		return
   166  	}
   167  	completionInSecondsUnix := completionInSeconds.Unix()
   168  	if startInSecondsUnix >= completionInSecondsUnix {
   169  		log.Debug("Component %s is not updated, as there is an ongoing operation in progress")
   170  		return
   171  	}
   172  	totalDuration := (completionInSecondsUnix - startInSecondsUnix)
   173  	if typeofOperation == constants.InstallOperation {
   174  		err := SetComponentInstallDurationMetric(componentName, totalDuration)
   175  		if err != nil {
   176  			log.Errorf(err.Error())
   177  			return
   178  		}
   179  	}
   180  	if typeofOperation == constants.UpgradeOperation {
   181  		err := SetComponentUpgradeDurationMetric(componentName, totalDuration)
   182  		if err != nil {
   183  			log.Errorf(err.Error())
   184  			return
   185  		}
   186  	}
   187  }
   188  
   189  func SetComponentInstallDurationMetric(JSONName string, totalDuration int64) error {
   190  	metric, err := MetricsExp.internalData.componentInstallDuration.installDuration.GetMetricWithLabelValues(JSONName)
   191  	if err != nil {
   192  		return err
   193  	}
   194  	metric.Set(float64(totalDuration))
   195  	return nil
   196  }
   197  
   198  func SetComponentUpgradeDurationMetric(JSONName string, totalDuration int64) error {
   199  	metric, err := MetricsExp.internalData.componentUpgradeDuration.upgradeDuration.GetMetricWithLabelValues(JSONName)
   200  	if err != nil {
   201  		return err
   202  	}
   203  	metric.Set(float64(totalDuration))
   204  	return nil
   205  }
   206  
   207  // This function is a helper function that assists in registering metrics
   208  func registerMetricsHandlersHelper() error {
   209  	var errorObserved error
   210  	for metric := range MetricsExp.internalConfig.failedMetrics {
   211  		err := MetricsExp.internalConfig.registry.Register(metric)
   212  		if err != nil {
   213  			if errorObserved != nil {
   214  				errorObserved = errors.Wrap(errorObserved, err.Error())
   215  			} else {
   216  				errorObserved = err
   217  			}
   218  		} else {
   219  			// if a metric is registered, delete it from the failed metrics map so that it is not retried
   220  			delete(MetricsExp.internalConfig.failedMetrics, metric)
   221  		}
   222  	}
   223  	return errorObserved
   224  }
   225  
   226  // This function registers the metrics and provides error handling
   227  func registerMetricsHandlers(log *zap.SugaredLogger) {
   228  	initializeFailedMetricsArray() // Get list of metrics to register initially
   229  	// loop until there is no error in registering
   230  	for err := registerMetricsHandlersHelper(); err != nil; err = registerMetricsHandlersHelper() {
   231  		log.Errorf("Failed to register metrics for VPO %v \n", err)
   232  		time.Sleep(time.Second)
   233  	}
   234  	// register component health metrics vector
   235  	MetricsExp.internalConfig.registry.MustRegister(MetricsExp.internalData.componentHealth.available)
   236  	MetricsExp.internalConfig.registry.MustRegister(MetricsExp.internalData.componentInstallDuration.installDuration)
   237  	MetricsExp.internalConfig.registry.MustRegister(MetricsExp.internalData.componentUpgradeDuration.upgradeDuration)
   238  }
   239  
   240  // This function initializes the failedMetrics array
   241  func initializeFailedMetricsArray() {
   242  	for i, metric := range MetricsExp.internalConfig.allMetrics {
   243  		MetricsExp.internalConfig.failedMetrics[metric] = i
   244  	}
   245  }
   246  
   247  // This function starts the metric server to begin emitting metrics to Prometheus
   248  func StartMetricsServer(log *zap.SugaredLogger) {
   249  	go wait.Until(func() {
   250  		http.Handle("/metrics", promhttp.Handler())
   251  		server := &http.Server{
   252  			Addr:              ":9100",
   253  			ReadHeaderTimeout: 3 * time.Second,
   254  		}
   255  		if err := server.ListenAndServe(); err != nil {
   256  			log.Errorf("Failed to start metrics server for verrazzano-platform-operator: %v", err)
   257  		}
   258  	}, time.Second*3, wait.NeverStop)
   259  }
   260  
   261  // This functionn parses the VZ CR and extracts the install and update data for each component
   262  func AnalyzeVerrazzanoResourceMetrics(log vzlog.VerrazzanoLogger, cr vzapi.Verrazzano) {
   263  	mapOfComponents := cr.Status.Components
   264  	for componentName, componentStatusDetails := range mapOfComponents {
   265  		// If component is not in the metricsMap, move on to the next component
   266  		if IsNonMetricComponent(componentName) {
   267  			continue
   268  		}
   269  		var installCompletionTime string
   270  		var upgradeCompletionTime string
   271  		var upgradeStartTime string
   272  		var installStartTime string
   273  		for _, status := range componentStatusDetails.Conditions {
   274  			if status.Type == vzapi.CondInstallStarted {
   275  				installStartTime = status.LastTransitionTime
   276  			}
   277  			if status.Type == vzapi.CondInstallComplete {
   278  				installCompletionTime = status.LastTransitionTime
   279  			}
   280  			if status.Type == vzapi.CondUpgradeStarted {
   281  				upgradeStartTime = status.LastTransitionTime
   282  			}
   283  			if status.Type == vzapi.CondUpgradeComplete {
   284  				upgradeCompletionTime = status.LastTransitionTime
   285  			}
   286  		}
   287  		found, component := registry.FindComponent(componentName)
   288  		if !found {
   289  			continue
   290  		}
   291  		componentJSONName := component.GetJSONName()
   292  		if installStartTime != "" && installCompletionTime != "" {
   293  			metricParserHelperFunction(log, componentJSONName, installStartTime, installCompletionTime, constants.InstallOperation)
   294  		}
   295  		if upgradeStartTime != "" && upgradeCompletionTime != "" {
   296  			metricParserHelperFunction(log, componentJSONName, upgradeStartTime, upgradeCompletionTime, constants.UpgradeOperation)
   297  		}
   298  	}
   299  }
   300  
   301  // This function initializes the allMetrics array
   302  func InitializeAllMetricsArray() {
   303  	// loop through all metrics declarations in metric maps
   304  	for _, value := range MetricsExp.internalData.simpleCounterMetricMap {
   305  		MetricsExp.internalConfig.allMetrics = append(MetricsExp.internalConfig.allMetrics, value.metric)
   306  	}
   307  	for _, value := range MetricsExp.internalData.durationMetricMap {
   308  		MetricsExp.internalConfig.allMetrics = append(MetricsExp.internalConfig.allMetrics, value.metric)
   309  	}
   310  	for _, value := range MetricsExp.internalData.simpleGaugeMetricMap {
   311  		MetricsExp.internalConfig.allMetrics = append(MetricsExp.internalConfig.allMetrics, value.metric)
   312  	}
   313  }
   314  
   315  // This function returns an empty struct of type configuration
   316  func initConfiguration() configuration {
   317  	return configuration{
   318  		allMetrics:    []prometheus.Collector{},
   319  		failedMetrics: map[prometheus.Collector]int{},
   320  		registry:      prometheus.DefaultRegisterer,
   321  	}
   322  }
   323  
   324  // This function returns a simpleCounterMetric from the simpleCounterMetricMap given a metricName
   325  func GetSimpleCounterMetric(name metricName) (*SimpleCounterMetric, error) {
   326  	counterMetric, ok := MetricsExp.internalData.simpleCounterMetricMap[name]
   327  	if !ok {
   328  		return nil, fmt.Errorf("%v not found in SimpleCounterMetricMap due to metricName being defined, but not being a key in the map", name)
   329  	}
   330  	return counterMetric, nil
   331  }
   332  
   333  // This function returns a durationMetric from the durationMetricMap given a metricName
   334  func GetDurationMetric(name metricName) (*DurationMetric, error) {
   335  	durationMetric, ok := MetricsExp.internalData.durationMetricMap[name]
   336  	if !ok {
   337  		return nil, fmt.Errorf("%v not found in durationMetricMap due to metricName being defined, but not being a key in the map", name)
   338  	}
   339  	return durationMetric, nil
   340  }
   341  
   342  // This function returns a simpleGaugeMetric from the simpleGaugeMetricMap given a metricName
   343  func GetSimpleGaugeMetric(name metricName) (*SimpleGaugeMetric, error) {
   344  	gaugeMetric, ok := MetricsExp.internalData.simpleGaugeMetricMap[name]
   345  	if !ok {
   346  		return nil, fmt.Errorf("%v not found in SimpleGaugeMetricMap due to metricName being defined, but not being a key in the map", name)
   347  	}
   348  	return gaugeMetric, nil
   349  }
   350  
   351  // SetComponentAvailabilityMetric updates the components availability status metric
   352  func SetComponentAvailabilityMetric(JSONname string, availability vzapi.ComponentAvailability, isEnabled bool) error {
   353  	_, err := MetricsExp.internalData.componentHealth.SetComponentHealth(JSONname, availability == vzapi.ComponentAvailable, isEnabled)
   354  	if err != nil {
   355  		return err
   356  	}
   357  	return nil
   358  }
   359  
   360  func IsNonMetricComponent(componentName string) bool {
   361  	var nonMetricComponents = map[string]bool{
   362  		vmo.ComponentName:               true,
   363  		networkpolicies.ComponentName:   true,
   364  		grafanadashboards.ComponentName: true,
   365  	}
   366  	return nonMetricComponents[componentName]
   367  }