github.com/verrazzano/verrazzano@v1.7.1/tests/e2e/metrics/syscomponents/metrics_test.go (about)

     1  // Copyright (c) 2021, 2023, Oracle and/or its affiliates.
     2  // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
     3  
     4  package syscomponents
     5  
     6  import (
     7  	"fmt"
     8  	"os"
     9  	"strings"
    10  	"time"
    11  
    12  	"github.com/verrazzano/verrazzano/pkg/log/vzlog"
    13  	"github.com/verrazzano/verrazzano/pkg/nginxutil"
    14  	"github.com/verrazzano/verrazzano/tools/vz/pkg/constants"
    15  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    16  
    17  	. "github.com/onsi/ginkgo/v2"
    18  	. "github.com/onsi/gomega"
    19  	"github.com/verrazzano/verrazzano/pkg/k8sutil"
    20  	"github.com/verrazzano/verrazzano/tests/e2e/pkg"
    21  	"github.com/verrazzano/verrazzano/tests/e2e/pkg/test/framework"
    22  )
    23  
    24  const (
    25  	metricsVersion = "1.4.0"
    26  
    27  	longPollingInterval = 10 * time.Second
    28  	longWaitTimeout     = 15 * time.Minute
    29  
    30  	// Constants for sample metrics of system components validated by the test
    31  	ingressControllerSuccess       = "nginx_ingress_controller_success"
    32  	containerStartTimeSeconds      = "container_start_time_seconds"
    33  	cpuSecondsTotal                = "node_cpu_seconds_total"
    34  	istioRequestsTotal             = "istio_requests_total"
    35  	sidecarInjectionRequests       = "sidecar_injection_requests_total"
    36  	prometheusTargetIntervalLength = "prometheus_target_interval_length_seconds"
    37  	envoyStatsRecentLookups        = "envoy_server_stats_recent_lookups"
    38  	vmoFunctionMetric              = "vz_monitoring_operator_reconcile_total"
    39  	vmoCounterMetric               = "vz_monitoring_operator_deployment_update_total"
    40  	vmoGaugeMetric                 = "vz_monitoring_operator_work_queue_size"
    41  	vmoTimestampMetric             = "vz_monitoring_operator_configmap_last_successful_timestamp"
    42  	vaoSuccessCountMetric          = "vz_application_operator_appconfig_successful_reconcile_total"
    43  	vaoFailCountMetric             = "vz_application_operator_appconfig_error_reconcile_total"
    44  	vaoDurationCountMetric         = "vz_application_operator_appconfig_reconcile_duration_count"
    45  	esClusterStatusMetric          = "opensearch_cluster_status"
    46  
    47  	// Namespaces used for validating envoy stats
    48  	verrazzanoSystemNamespace = "verrazzano-system"
    49  	istioSystemNamespace      = "istio-system"
    50  	keycloakNamespace         = "keycloak"
    51  
    52  	// Constants for various metric labels, used in the validation
    53  	nodeExporter        = "node-exporter"
    54  	istiod              = "istiod"
    55  	pilot               = "pilot"
    56  	prometheus          = "prometheus-operator-kube-p-prometheus"
    57  	oldPrometheus       = "prometheus"
    58  	controllerNamespace = "controller_namespace"
    59  	ingressController   = "ingress-controller"
    60  	appK8SIOInstance    = "app_kubernetes_io_instance"
    61  	job                 = "job"
    62  	app                 = "app"
    63  	namespace           = "namespace"
    64  	container           = "container"
    65  	esMaster            = "es-master"
    66  
    67  	failedVerifyVersionMsg = "Failed to verify the Verrazzano version was min 1.4.0: %v"
    68  )
    69  
    70  var clusterName = os.Getenv("CLUSTER_NAME")
    71  var kubeConfig = os.Getenv("KUBECONFIG")
    72  
    73  // will be initialized in BeforeSuiteFunc so that any log messages during init are available
    74  var clusterNameMetricsLabel = ""
    75  var isMinVersion110 bool
    76  
    77  var adminKubeConfig string
    78  var isManagedClusterProfile bool
    79  
    80  var ingressNGINXNamespace string
    81  
    82  // List of namespaces considered for validating the envoy-stats
    83  var envoyStatsNamespaces = []string{
    84  	ingressNGINXNamespace,
    85  	istioSystemNamespace,
    86  	verrazzanoSystemNamespace,
    87  }
    88  
    89  // List of pods to be excluded from verrazzano-system namespace for envoy-stats as they do not have envoy
    90  var excludePodsVS = []string{
    91  	"coherence-operator",
    92  	"oam-kubernetes-runtime",
    93  	"verrazzano-application-operator",
    94  	"verrazzano-monitoring-operator",
    95  	"verrazzano-cluster-operator",
    96  	"verrazzano-operator",
    97  	"weblogic-operator-webhook",
    98  }
    99  
   100  // List of pods to be excluded from istio-system namespace for envoy-stats as they do not have envoy
   101  var excludePodsIstio = []string{
   102  	"istiocoredns",
   103  	"istiod",
   104  }
   105  var metricsTest pkg.MetricsTest
   106  
   107  var t = framework.NewTestFramework("syscomponents")
   108  
   109  var beforeSuite = t.BeforeSuiteFunc(func() {
   110  	present := false
   111  	var err error
   112  	adminKubeConfig, present = os.LookupEnv("ADMIN_KUBECONFIG")
   113  	isManagedClusterProfile = pkg.IsManagedClusterProfile()
   114  	if isManagedClusterProfile {
   115  		if !present {
   116  			Fail(fmt.Sprintln("Environment variable ADMIN_KUBECONFIG is required to run the test"))
   117  		}
   118  	} else {
   119  		// Include the namespace keycloak for the validation for admin cluster and single cluster installation
   120  		envoyStatsNamespaces = append(envoyStatsNamespaces, keycloakNamespace)
   121  		adminKubeConfig, err = k8sutil.GetKubeConfigLocation()
   122  		if err != nil {
   123  			Fail(err.Error())
   124  		}
   125  	}
   126  
   127  	isMinVersion110, err = pkg.IsVerrazzanoMinVersion("1.1.0", adminKubeConfig)
   128  	if err != nil {
   129  		Fail(err.Error())
   130  	}
   131  
   132  	defaultLabels := map[string]string{}
   133  	if clusterLabelVal := getClusterNameForPromQuery(); clusterLabelVal != "" {
   134  		defaultLabels[getClusterNameMetricLabel()] = clusterLabelVal
   135  	}
   136  	metricsTest, err = pkg.NewMetricsTest(adminKubeConfig, defaultLabels)
   137  	if err != nil {
   138  		Fail(err.Error())
   139  	}
   140  
   141  	ingressNGINXNamespace, err = nginxutil.DetermineNamespaceForIngressNGINX(vzlog.DefaultLogger())
   142  	if err != nil {
   143  		Fail(err.Error())
   144  	}
   145  })
   146  
   147  var _ = BeforeSuite(beforeSuite)
   148  
   149  var afterSuite = t.AfterSuiteFunc(func() {})
   150  
   151  var _ = AfterSuite(afterSuite)
   152  
   153  var _ = t.AfterEach(func() {})
   154  
   155  // 'It' Wrapper to only run spec if the Verrazzano Monitoring Operator is installed in the cluster
   156  func WhenVMOInstalledAndMinVersionIt(description string, version string, kubeConfigPath string, f func()) {
   157  	_, err := pkg.GetDeployment(verrazzanoSystemNamespace, constants.VerrazzanoMonitoringOperator)
   158  	if err != nil {
   159  		if apierrors.IsNotFound(err) {
   160  			t.It(description, func() {
   161  				Skip("VMO is not installed, skipping")
   162  			})
   163  		} else {
   164  			Fail(err.Error())
   165  		}
   166  	} else {
   167  		t.ItMinimumVersion(description, version, kubeConfigPath, f)
   168  	}
   169  }
   170  
   171  var _ = t.Describe("Thanos Metrics", Label("f:observability.monitoring.prom"), func() {
   172  	// Query Prometheus for the sample metrics from the default scraping jobs
   173  	var _ = t.Describe("for the system components", func() {
   174  		t.It("Verify sample NGINX metrics can be queried from Thanos", func() {
   175  			eventuallyMetricsContainLabels(ingressControllerSuccess, map[string]string{
   176  				controllerNamespace: ingressNGINXNamespace,
   177  				appK8SIOInstance:    ingressController,
   178  			})
   179  		})
   180  
   181  		if !pkg.IsManagedClusterProfile() {
   182  			t.ItMinimumVersion("Verify sample OpenSearch metrics can be queried from Thanos", "1.5.0", kubeConfig, func() {
   183  				eventuallyMetricsContainLabels(esClusterStatusMetric, map[string]string{})
   184  			})
   185  		}
   186  
   187  		t.It("Verify sample Container Advisor metrics can be queried from Thanos", func() {
   188  			eventuallyMetricsContainLabels(containerStartTimeSeconds, map[string]string{})
   189  		})
   190  		t.ItMinimumVersion("Verify VPO summary counter metrics can be queried from Thanos", metricsVersion, kubeConfig, func() {
   191  			eventuallyMetricsContainLabels("vz_platform_operator_reconcile_duration_count", map[string]string{})
   192  		})
   193  		t.ItMinimumVersion("Verify VPO summary sum times can be queried from Thanos", metricsVersion, kubeConfig, func() {
   194  			eventuallyMetricsContainLabels("vz_platform_operator_reconcile_duration_sum", map[string]string{})
   195  		})
   196  		t.ItMinimumVersion("Verify VPO counter metrics can be queried from Thanos", metricsVersion, kubeConfig, func() {
   197  			eventuallyMetricsContainLabels("vz_platform_operator_reconcile_total", map[string]string{})
   198  		})
   199  		t.ItMinimumVersion("Verify VPO error counter metrics can be queried from Thanos", metricsVersion, kubeConfig, func() {
   200  			eventuallyMetricsContainLabels("vz_platform_operator_error_reconcile_total", map[string]string{})
   201  		})
   202  		t.ItMinimumVersion("Verify VPO install metrics can be queried from Thanos", metricsVersion, kubeConfig, func() {
   203  			eventuallyMetricsContainLabels("vz_platform_operator_component_install_duration_seconds", map[string]string{})
   204  		})
   205  		t.ItMinimumVersion("Verify VPO upgrade counter metrics can be queried from Thanos", metricsVersion, kubeConfig, func() {
   206  			eventuallyMetricsContainLabels("vz_platform_operator_component_upgrade_duration_seconds", map[string]string{})
   207  		})
   208  
   209  		WhenVMOInstalledAndMinVersionIt("Verify VMO function metrics can be queried from Thanos", metricsVersion, kubeConfig, func() {
   210  			eventuallyMetricsContainLabels(vmoFunctionMetric, map[string]string{})
   211  		})
   212  
   213  		WhenVMOInstalledAndMinVersionIt("Verify VMO counter metrics can be queried from Thanos", metricsVersion, kubeConfig, func() {
   214  			eventuallyMetricsContainLabels(vmoCounterMetric, map[string]string{})
   215  		})
   216  
   217  		WhenVMOInstalledAndMinVersionIt("Verify VMO gauge metrics can be queried from Thanos", metricsVersion, kubeConfig, func() {
   218  			eventuallyMetricsContainLabels(vmoGaugeMetric, map[string]string{})
   219  		})
   220  
   221  		WhenVMOInstalledAndMinVersionIt("Verify VMO timestamp metrics can be queried from Thanos", metricsVersion, kubeConfig, func() {
   222  			eventuallyMetricsContainLabels(vmoTimestampMetric, map[string]string{})
   223  		})
   224  
   225  		t.ItMinimumVersion("Verify VAO successful counter metrics can be queried from Thanos", metricsVersion, kubeConfig, func() {
   226  			eventuallyMetricsContainLabels(vaoSuccessCountMetric, map[string]string{})
   227  		})
   228  		t.ItMinimumVersion("Verify VAO failed counter metrics can be queried from Thanos", metricsVersion, kubeConfig, func() {
   229  			eventuallyMetricsContainLabels(vaoFailCountMetric, map[string]string{})
   230  		})
   231  		t.ItMinimumVersion("Verify VAO Duration summary metrics can be queried from Thanos", metricsVersion, kubeConfig, func() {
   232  			eventuallyMetricsContainLabels(vaoDurationCountMetric, map[string]string{})
   233  		})
   234  
   235  		t.It("Verify sample Node Exporter metrics can be queried from Thanos", func() {
   236  			Eventually(func() bool {
   237  				kv := map[string]string{
   238  					job: nodeExporter,
   239  				}
   240  				return metricsTest.MetricsExist(cpuSecondsTotal, kv)
   241  			}, longWaitTimeout, longPollingInterval).Should(BeTrue())
   242  		})
   243  
   244  		if istioInjection == "enabled" {
   245  			t.It("Verify sample mesh metrics can be queried from Thanos", func() {
   246  				Eventually(func() bool {
   247  					kv := map[string]string{
   248  						namespace: verrazzanoSystemNamespace,
   249  					}
   250  					return metricsTest.MetricsExist(istioRequestsTotal, kv)
   251  				}, longWaitTimeout, longPollingInterval).Should(BeTrue())
   252  			})
   253  
   254  			t.It("Verify sample istiod metrics can be queried from Thanos", func() {
   255  				Eventually(func() bool {
   256  					kv := map[string]string{
   257  						app: istiod,
   258  						job: pilot,
   259  					}
   260  
   261  					minVer14, err := pkg.IsVerrazzanoMinVersion("1.4.0", adminKubeConfig)
   262  					if err != nil {
   263  						pkg.Log(pkg.Error, fmt.Sprintf(failedVerifyVersionMsg, err))
   264  						return false
   265  					}
   266  					if minVer14 {
   267  						kv = map[string]string{
   268  							app: istiod,
   269  							job: istiod,
   270  						}
   271  					}
   272  					return metricsTest.MetricsExist(sidecarInjectionRequests, kv)
   273  				}, longWaitTimeout, longPollingInterval).Should(BeTrue())
   274  			})
   275  		}
   276  
   277  		t.It("Verify sample metrics can be queried from Thanos", func() {
   278  			Eventually(func() bool {
   279  				kv := map[string]string{
   280  					job: oldPrometheus,
   281  				}
   282  
   283  				minVer14, err := pkg.IsVerrazzanoMinVersion("1.4.0", adminKubeConfig)
   284  				if err != nil {
   285  					pkg.Log(pkg.Error, fmt.Sprintf(failedVerifyVersionMsg, err))
   286  					return false
   287  				}
   288  				if minVer14 {
   289  					kv = map[string]string{
   290  						job: prometheus,
   291  					}
   292  				}
   293  				return metricsTest.MetricsExist(prometheusTargetIntervalLength, kv)
   294  			}, longWaitTimeout, longPollingInterval).Should(BeTrue())
   295  		})
   296  		if istioInjection == "enabled" {
   297  			t.It("Verify envoy stats", func() {
   298  				Eventually(func() bool {
   299  					return verifyEnvoyStats(envoyStatsRecentLookups)
   300  				}, longWaitTimeout, longPollingInterval).Should(BeTrue())
   301  			})
   302  		}
   303  	})
   304  })
   305  
   306  // Validate the Istio envoy stats for the pods in the namespaces defined in envoyStatsNamespaces
   307  func verifyEnvoyStats(metricName string) bool {
   308  	clientset, err := pkg.GetKubernetesClientsetForCluster(kubeConfig)
   309  	if err != nil {
   310  		t.Logs.Errorf("Error getting clienset for %s, error: %v", kubeConfig, err)
   311  		return false
   312  	}
   313  	for _, ns := range envoyStatsNamespaces {
   314  		pods, err := pkg.ListPodsInCluster(ns, clientset)
   315  		if err != nil {
   316  			t.Logs.Errorf("Error listing pods in cluster for namespace: %s, error: %v", namespace, err)
   317  			return false
   318  		}
   319  		labels := map[string]string{}
   320  		for _, pod := range pods.Items {
   321  			if ns == istioSystemNamespace && excludePods(pod.Name, excludePodsIstio) ||
   322  				ns == verrazzanoSystemNamespace && excludePods(pod.Name, excludePodsVS) {
   323  				continue
   324  			}
   325  			labels[ns] = pod.Name
   326  		}
   327  		metricsTest.MetricsExist(metricName, labels)
   328  	}
   329  	return true
   330  }
   331  
   332  func getClusterNameMetricLabel() string {
   333  	if clusterNameMetricsLabel == "" {
   334  		// ignore error getting the metric label - we'll just use the default value returned
   335  		lbl, err := pkg.GetClusterNameMetricLabel(adminKubeConfig)
   336  		if err != nil {
   337  			t.Logs.Errorf("Error getting cluster name metric label: %s", err.Error())
   338  		}
   339  		clusterNameMetricsLabel = lbl
   340  	}
   341  	return clusterNameMetricsLabel
   342  }
   343  
   344  // Exclude the pods where envoy stats are not available
   345  func excludePods(pod string, excludeList []string) bool {
   346  	for _, excludes := range excludeList {
   347  		if strings.HasPrefix(pod, excludes) {
   348  			return true
   349  		}
   350  	}
   351  	return false
   352  }
   353  
   354  // Return the cluster name used for the Prometheus query
   355  func getClusterNameForPromQuery() string {
   356  	if isManagedClusterProfile {
   357  		return clusterName
   358  	}
   359  	if isMinVersion110 {
   360  		return "local"
   361  	}
   362  	return ""
   363  }
   364  
   365  // Queries Thanos for a given metric name and a map of labels for the metric
   366  func eventuallyMetricsContainLabels(metricName string, kv map[string]string) {
   367  	Eventually(func() bool {
   368  		return metricsTest.MetricsExist(metricName, kv)
   369  	}, longWaitTimeout, longPollingInterval).Should(BeTrue())
   370  }