k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/prometheus/prometheus.go

/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package prometheus

import (
	"context"
	"embed"
	"encoding/json"
	"errors"
	"fmt"
	"io/fs"
	"os"
	"sync"
	"time"

	"golang.org/x/sync/errgroup"
	corev1 "k8s.io/api/core/v1"
	rbacv1 "k8s.io/api/rbac/v1"
	apierrs "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
	"k8s.io/klog/v2"
	"k8s.io/perf-tests/clusterloader2/pkg/config"
	clerrors "k8s.io/perf-tests/clusterloader2/pkg/errors"
	"k8s.io/perf-tests/clusterloader2/pkg/flags"
	"k8s.io/perf-tests/clusterloader2/pkg/framework"
	"k8s.io/perf-tests/clusterloader2/pkg/framework/client"
	"k8s.io/perf-tests/clusterloader2/pkg/provider"
	"k8s.io/perf-tests/clusterloader2/pkg/util"
)

const (
	namespace                    = "monitoring"
	storageClass                 = "ssd"
	checkPrometheusReadyInterval = 30 * time.Second
	numK8sClients                = 1

	// All paths here are relative to the manifests dir.
	coreManifests                = "*.yaml"
	defaultServiceMonitors       = "default/*.yaml"
	kubeStateMetricsManifests    = "exporters/kube-state-metrics/*.yaml"
	masterIPServiceMonitors      = "master-ip/*.yaml"
	metricsServerManifests       = "exporters/metrics-server/*.yaml"
	nodeExporterPod              = "exporters/node_exporter/node-exporter.yaml"
	windowsNodeExporterManifests = "exporters/windows_node_exporter/*.yaml"
	pushgatewayManifests         = "pushgateway/*.yaml"
)

//go:embed manifests
var manifestsFSWithPrefix embed.FS
var manifestsFS fs.FS

func init() {
	var err error
	// Go's embed generates an embed.FS in which all files carry a 'manifests/' prefix.
	// To be consistent with --prometheus-manifest-path (whose paths are defined relative to the manifests dir) we need to strip this prefix.
	manifestsFS, err = fs.Sub(manifestsFSWithPrefix, "manifests")
	if err != nil {
		panic(fmt.Sprintf("failed to strip manifests prefix: %v", err))
	}
}
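
// A minimal sketch (illustrative, not part of this package) of what the
// fs.Sub call above buys us: after stripping the prefix, the same relative
// path works against both the embedded copy and a local manifests directory.
// The "dir" variable and the filepath import are hypothetical here; error
// handling is elided for brevity.
//
//	data, err := fs.ReadFile(manifestsFS, nodeExporterPod)       // embedded copy
//	data, err = os.ReadFile(filepath.Join(dir, nodeExporterPod)) // hypothetical local dir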

// InitFlags initializes prometheus flags.
func InitFlags(p *config.PrometheusConfig) {
	flags.BoolEnvVar(&p.EnableServer, "enable-prometheus-server", "ENABLE_PROMETHEUS_SERVER", false, "Whether to set-up the prometheus server in the cluster.")
	flags.BoolEnvVar(&p.TearDownServer, "tear-down-prometheus-server", "TEAR_DOWN_PROMETHEUS_SERVER", true, "Whether to tear-down the prometheus server after tests (if set-up).")
	flags.BoolEnvVar(&p.EnablePushgateway, "enable-pushgateway", "PROMETHEUS_ENABLE_PUSHGATEWAY", false, "Whether to set-up the Pushgateway. Only works when the Prometheus server is enabled.")
	flags.BoolEnvVar(&p.ScrapeEtcd, "prometheus-scrape-etcd", "PROMETHEUS_SCRAPE_ETCD", false, "Whether to scrape etcd metrics.")
	flags.BoolEnvVar(&p.ScrapeNodeExporter, "prometheus-scrape-node-exporter", "PROMETHEUS_SCRAPE_NODE_EXPORTER", false, "Whether to scrape node exporter metrics.")
	flags.BoolEnvVar(&p.ScrapeWindowsNodeExporter, "prometheus-scrape-windows-node-exporter", "PROMETHEUS_SCRAPE_WINDOWS_NODE_EXPORTER", false, "Whether to scrape Windows node exporter metrics.")
	flags.BoolEnvVar(&p.ScrapeKubelets, "prometheus-scrape-kubelets", "PROMETHEUS_SCRAPE_KUBELETS", false, "Whether to scrape kubelets (nodes + master). Experimental, may not work in larger clusters. Requires heapster node to be at least n1-standard-4, which needs to be provided manually.")
	flags.BoolEnvVar(&p.ScrapeMasterKubelets, "prometheus-scrape-master-kubelets", "PROMETHEUS_SCRAPE_MASTER_KUBELETS", false, "Whether to scrape kubelets running on master nodes.")
	flags.BoolEnvVar(&p.ScrapeKubeProxy, "prometheus-scrape-kube-proxy", "PROMETHEUS_SCRAPE_KUBE_PROXY", true, "Whether to scrape kube proxy.")
	flags.StringEnvVar(&p.KubeProxySelectorKey, "prometheus-kube-proxy-selector-key", "PROMETHEUS_KUBE_PROXY_SELECTOR_KEY", "component", "Label key used to scrape kube proxy.")
	flags.BoolEnvVar(&p.ScrapeKubeStateMetrics, "prometheus-scrape-kube-state-metrics", "PROMETHEUS_SCRAPE_KUBE_STATE_METRICS", false, "Whether to scrape kube-state-metrics. Only run occasionally.")
	flags.BoolEnvVar(&p.ScrapeMetricsServerMetrics, "prometheus-scrape-metrics-server", "PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS", false, "Whether to scrape metrics-server. Only run occasionally.")
	flags.BoolEnvVar(&p.ScrapeNodeLocalDNS, "prometheus-scrape-node-local-dns", "PROMETHEUS_SCRAPE_NODE_LOCAL_DNS", false, "Whether to scrape node-local-dns pods.")
	flags.BoolEnvVar(&p.ScrapeAnet, "prometheus-scrape-anet", "PROMETHEUS_SCRAPE_ANET", false, "Whether to scrape anet pods.")
	flags.BoolEnvVar(&p.ScrapeCiliumOperator, "prometheus-scrape-cilium-operator", "PROMETHEUS_SCRAPE_CILIUM_OPERATOR", false, "Whether to scrape cilium-operator pods.")
	flags.BoolEnvVar(&p.ScrapeMastersWithPublicIPs, "prometheus-scrape-masters-with-public-ips", "PROMETHEUS_SCRAPE_MASTERS_WITH_PUBLIC_IPS", false, "Whether to scrape master machines using public IPs instead of private ones.")
	flags.IntEnvVar(&p.APIServerScrapePort, "prometheus-apiserver-scrape-port", "PROMETHEUS_APISERVER_SCRAPE_PORT", 443, "Port for scraping kube-apiserver (default 443).")
	flags.StringEnvVar(&p.SnapshotProject, "experimental-snapshot-project", "PROJECT", "", "GCP project where disks and snapshots are located.")
	flags.StringEnvVar(&p.ManifestPath, "prometheus-manifest-path", "PROMETHEUS_MANIFEST_PATH", "", "Path to the prometheus manifest files.")
	flags.StringEnvVar(&p.StorageClassProvisioner, "prometheus-storage-class-provisioner", "PROMETHEUS_STORAGE_CLASS_PROVISIONER", "kubernetes.io/gce-pd", "Volume plugin used to provision PVs for Prometheus.")
	flags.StringEnvVar(&p.StorageClassVolumeType, "prometheus-storage-class-volume-type", "PROMETHEUS_STORAGE_CLASS_VOLUME_TYPE", "pd-ssd", "Volume type of the storage class. This will differ depending on the provisioner.")
	flags.StringEnvVar(&p.PVCStorageClass, "prometheus-pvc-storage-class", "PROMETHEUS_PVC_STORAGE_CLASS", "ssd", "Storage class used with the prometheus persistent volume claim.")
	flags.DurationEnvVar(&p.ReadyTimeout, "prometheus-ready-timeout", "PROMETHEUS_READY_TIMEOUT", 15*time.Minute, "Timeout for waiting for the Prometheus stack to become healthy.")
	flags.StringEnvVar(&p.PrometheusMemoryRequest, "prometheus-memory-request", "PROMETHEUS_MEMORY_REQUEST", "10Gi", "Memory request to be used by prometheus.")
	err := flags.MarkDeprecated("prometheus-manifest-path", "prometheus manifests are now taken from the embed FS prepared at build time. This flag is planned to be removed in Jan 2023. Do you really need this flag?")
	if err != nil {
		klog.Fatalf("unable to mark flag prometheus-manifest-path deprecated: %v", err)
	}
}

// ValidatePrometheusFlags validates prometheus flags.
func ValidatePrometheusFlags(p *config.PrometheusConfig) *clerrors.ErrorList {
	errList := clerrors.NewErrorList()
	if *shouldSnapshotPrometheusDisk && p.SnapshotProject == "" {
		errList.Append(fmt.Errorf("requesting snapshot, but snapshot project not configured. Use --experimental-snapshot-project flag"))
	}
	return errList
}
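
// A minimal usage sketch (hypothetical, not part of this package): flag
// registration followed by validation. Flag parsing happens in the main
// binary; IsEmpty and String are assumed ErrorList accessors.
//
//	var cfg config.ClusterLoaderConfig
//	InitFlags(&cfg.PrometheusConfig)
//	// ... parse flags ...
//	if errList := ValidatePrometheusFlags(&cfg.PrometheusConfig); !errList.IsEmpty() {
//		klog.Fatalf("invalid prometheus flags: %s", errList.String())
//	}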

// Controller is a util for managing (setting up / tearing down) the prometheus stack in
// the cluster.
type Controller struct {
	clusterLoaderConfig *config.ClusterLoaderConfig
	// provider is the cloud provider derived from the --provider flag.
	provider provider.Provider
	// framework associated with the cluster where the prometheus stack should be set up.
	// For kubemark it's the root cluster, otherwise it's the main (and only) cluster.
	framework *framework.Framework
	// templateMapping is a mapping defining placeholders used in manifest templates.
	templateMapping map[string]interface{}
	// diskMetadata stores the name and zone of the Prometheus persistent disk.
	diskMetadata prometheusDiskMetadata
	// snapshotLock makes sure that only a single Prometheus snapshot is taken at a time.
	snapshotLock sync.Mutex
	// snapshotted records whether the Prometheus snapshot has already been taken - protected by snapshotLock.
	snapshotted bool
	// snapshotError contains the error from the snapshot attempt - protected by snapshotLock.
	snapshotError error
	// ssh executor used to run commands on cluster nodes via SSH.
	ssh util.SSHExecutor
	// readyTimeout is the timeout for waiting for the Prometheus stack to become healthy.
	readyTimeout time.Duration
}

// NewController creates a new instance of Controller for the given config.
func NewController(clusterLoaderConfig *config.ClusterLoaderConfig) (pc *Controller, err error) {
	pc = &Controller{
		clusterLoaderConfig: clusterLoaderConfig,
		provider:            clusterLoaderConfig.ClusterConfig.Provider,
		readyTimeout:        clusterLoaderConfig.PrometheusConfig.ReadyTimeout,
	}

	if pc.framework, err = framework.NewRootFramework(&clusterLoaderConfig.ClusterConfig, numK8sClients); err != nil {
		return nil, err
	}

	mapping, errList := config.GetMapping(clusterLoaderConfig, nil)
	if errList != nil {
		return nil, errList
	}
	mapping["MasterIps"], err = getMasterIps(clusterLoaderConfig.ClusterConfig, clusterLoaderConfig.PrometheusConfig.ScrapeMastersWithPublicIPs)
	if err != nil {
		klog.Warningf("Couldn't get master ip, will ignore manifests requiring it: %v", err)
		delete(mapping, "MasterIps")
	}
	if _, exists := mapping["PROMETHEUS_SCRAPE_APISERVER_ONLY"]; !exists {
		mapping["PROMETHEUS_SCRAPE_APISERVER_ONLY"] = clusterLoaderConfig.ClusterConfig.Provider.Features().ShouldPrometheusScrapeApiserverOnly
	}
	// TODO: Change to pure assignments when overrides are not used.
	if _, exists := mapping["PROMETHEUS_SCRAPE_ETCD"]; !exists {
		mapping["PROMETHEUS_SCRAPE_ETCD"] = clusterLoaderConfig.PrometheusConfig.ScrapeEtcd
	} else {
		// Backward compatibility.
		clusterLoaderConfig.PrometheusConfig.ScrapeEtcd = mapping["PROMETHEUS_SCRAPE_ETCD"].(bool)
	}
	if _, exists := mapping["PROMETHEUS_SCRAPE_NODE_EXPORTER"]; !exists {
		mapping["PROMETHEUS_SCRAPE_NODE_EXPORTER"] = clusterLoaderConfig.PrometheusConfig.ScrapeNodeExporter
	} else {
		// Backward compatibility.
		clusterLoaderConfig.PrometheusConfig.ScrapeNodeExporter = mapping["PROMETHEUS_SCRAPE_NODE_EXPORTER"].(bool)
	}
	if _, exists := mapping["PROMETHEUS_SCRAPE_WINDOWS_NODE_EXPORTER"]; !exists {
		mapping["PROMETHEUS_SCRAPE_WINDOWS_NODE_EXPORTER"] = clusterLoaderConfig.PrometheusConfig.ScrapeWindowsNodeExporter
	} else {
		// Backward compatibility.
		clusterLoaderConfig.PrometheusConfig.ScrapeWindowsNodeExporter = mapping["PROMETHEUS_SCRAPE_WINDOWS_NODE_EXPORTER"].(bool)
	}
	if _, exists := mapping["PROMETHEUS_SCRAPE_KUBE_PROXY"]; !exists {
		clusterLoaderConfig.PrometheusConfig.ScrapeKubeProxy = clusterLoaderConfig.ClusterConfig.Provider.Features().ShouldScrapeKubeProxy
		mapping["PROMETHEUS_SCRAPE_KUBE_PROXY"] = clusterLoaderConfig.PrometheusConfig.ScrapeKubeProxy
	} else {
		// Backward compatibility.
		clusterLoaderConfig.PrometheusConfig.ScrapeKubeProxy = mapping["PROMETHEUS_SCRAPE_KUBE_PROXY"].(bool)
	}
	if _, exists := mapping["PROMETHEUS_SCRAPE_ANET"]; !exists {
		mapping["PROMETHEUS_SCRAPE_ANET"] = clusterLoaderConfig.PrometheusConfig.ScrapeAnet
	} else {
		clusterLoaderConfig.PrometheusConfig.ScrapeAnet = mapping["PROMETHEUS_SCRAPE_ANET"].(bool)
	}
	if _, exists := mapping["PROMETHEUS_SCRAPE_CILIUM_OPERATOR"]; !exists {
		mapping["PROMETHEUS_SCRAPE_CILIUM_OPERATOR"] = clusterLoaderConfig.PrometheusConfig.ScrapeCiliumOperator
	} else {
		clusterLoaderConfig.PrometheusConfig.ScrapeCiliumOperator = mapping["PROMETHEUS_SCRAPE_CILIUM_OPERATOR"].(bool)
	}
	mapping["PROMETHEUS_SCRAPE_NODE_LOCAL_DNS"] = clusterLoaderConfig.PrometheusConfig.ScrapeNodeLocalDNS
	mapping["PROMETHEUS_SCRAPE_KUBE_STATE_METRICS"] = clusterLoaderConfig.PrometheusConfig.ScrapeKubeStateMetrics
	mapping["PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS"] = clusterLoaderConfig.PrometheusConfig.ScrapeMetricsServerMetrics
	mapping["PROMETHEUS_SCRAPE_KUBELETS"] = clusterLoaderConfig.PrometheusConfig.ScrapeKubelets
	mapping["PROMETHEUS_SCRAPE_MASTER_KUBELETS"] = clusterLoaderConfig.PrometheusConfig.ScrapeKubelets || clusterLoaderConfig.PrometheusConfig.ScrapeMasterKubelets
	mapping["PROMETHEUS_APISERVER_SCRAPE_PORT"] = clusterLoaderConfig.PrometheusConfig.APIServerScrapePort
	mapping["PROMETHEUS_STORAGE_CLASS_PROVISIONER"] = clusterLoaderConfig.PrometheusConfig.StorageClassProvisioner
	mapping["PROMETHEUS_STORAGE_CLASS_VOLUME_TYPE"] = clusterLoaderConfig.PrometheusConfig.StorageClassVolumeType
	mapping["PROMETHEUS_KUBE_PROXY_SELECTOR_KEY"] = clusterLoaderConfig.PrometheusConfig.KubeProxySelectorKey
	mapping["PROMETHEUS_PVC_STORAGE_CLASS"] = clusterLoaderConfig.PrometheusConfig.PVCStorageClass
	mapping["PROMETHEUS_MEMORY_REQUEST"] = clusterLoaderConfig.PrometheusConfig.PrometheusMemoryRequest
	snapshotEnabled, _ := pc.isEnabled()
	mapping["RetainPD"] = snapshotEnabled

	pc.templateMapping = mapping

	pc.ssh = &util.GCloudSSHExecutor{}

	return pc, nil
}
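
// A minimal lifecycle sketch (hypothetical, not part of this package) of how
// the controller is typically driven by a test runner:
//
//	pc, err := NewController(clusterLoaderConfig)
//	if err != nil {
//		klog.Fatalf("prometheus controller creation failed: %v", err)
//	}
//	if err := pc.SetUpPrometheusStack(); err != nil {
//		klog.Fatalf("prometheus stack setup failed: %v", err)
//	}
//	defer func() {
//		// Snapshotting is best-effort; teardown releases the resources.
//		_ = pc.MakePrometheusSnapshotIfEnabled()
//		_ = pc.TearDownPrometheusStack()
//	}()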

// SetUpPrometheusStack sets up the prometheus stack in the cluster.
// This method is idempotent: if the prometheus stack is already set up, applying the manifests
// again is a no-op.
func (pc *Controller) SetUpPrometheusStack() error {
	k8sClient := pc.framework.GetClientSets().GetClient()

	klog.V(2).Info("Setting up prometheus stack")
	if err := client.CreateNamespace(k8sClient, namespace); err != nil {
		return err
	}
	// Remove the storage class first, as its reclaim policy cannot be changed in place.
	if err := client.DeleteStorageClass(k8sClient, storageClass); err != nil {
		return err
	}
	if err := pc.applyManifests(coreManifests); err != nil {
		return err
	}
	if pc.clusterLoaderConfig.PrometheusConfig.ScrapeNodeExporter {
		if err := pc.runNodeExporter(); err != nil {
			return err
		}
	}
	if pc.clusterLoaderConfig.PrometheusConfig.ScrapeWindowsNodeExporter {
		if err := pc.applyManifests(windowsNodeExporterManifests); err != nil {
			return err
		}
	} else {
		// Backward compatibility:
		// if scraping Windows nodes is enabled, we need to set up the Windows node and template mapping.
		if isWindowsNodeScrapingEnabled(pc.templateMapping, pc.clusterLoaderConfig) {
			if err := setUpWindowsNodeAndTemplate(k8sClient, pc.templateMapping); err != nil {
				return err
			}
		}
	}
	if !pc.isKubemark() {
		if err := pc.applyManifests(defaultServiceMonitors); err != nil {
			return err
		}
	}

	if pc.clusterLoaderConfig.PrometheusConfig.ScrapeKubeStateMetrics && pc.clusterLoaderConfig.ClusterConfig.Provider.Features().SupportKubeStateMetrics {
		klog.V(2).Infof("Applying kube-state-metrics in the cluster.")
		if err := pc.applyManifests(kubeStateMetricsManifests); err != nil {
			return err
		}
	}
	if pc.clusterLoaderConfig.PrometheusConfig.ScrapeMetricsServerMetrics && pc.clusterLoaderConfig.ClusterConfig.Provider.Features().SupportMetricsServerMetrics {
		klog.V(2).Infof("Applying metrics server in the cluster.")
		if err := pc.applyManifests(metricsServerManifests); err != nil {
			return err
		}
	}
	if _, ok := pc.templateMapping["MasterIps"]; ok {
		if err := pc.exposeAPIServerMetrics(); err != nil {
			return err
		}
		if err := pc.applyManifests(masterIPServiceMonitors); err != nil {
			return err
		}
	}
	if pc.clusterLoaderConfig.PrometheusConfig.EnablePushgateway {
		klog.V(2).Infof("Applying Pushgateway in the cluster.")
		if err := pc.applyManifests(pushgatewayManifests); err != nil {
			return err
		}
	}
	if err := pc.waitForPrometheusToBeHealthy(); err != nil {
		dumpAdditionalLogsOnPrometheusSetupFailure(k8sClient)
		return err
	}
	klog.V(2).Info("Prometheus stack set up successfully")
	if err := pc.cachePrometheusDiskMetadataIfEnabled(); err != nil {
		klog.Warningf("Error while caching prometheus disk metadata: %v", err)
	}
	return nil
}

// MakePrometheusSnapshotIfEnabled takes a snapshot of Prometheus data if snapshotting is enabled.
func (pc *Controller) MakePrometheusSnapshotIfEnabled() error {
	klog.V(2).Info("Getting snapshot from Prometheus")
	if err := pc.snapshotPrometheusIfEnabled(); err != nil {
		klog.Warningf("Error while getting prometheus snapshot: %v", err)
		return err
	}

	return nil
}

// TearDownPrometheusStack tears down the prometheus stack, releasing all prometheus resources.
func (pc *Controller) TearDownPrometheusStack() error {
	// Get disk metadata again to be sure.
	if err := pc.cachePrometheusDiskMetadataIfEnabled(); err != nil {
		klog.Warningf("Error while caching prometheus disk metadata: %v", err)
	}
	defer func() {
		klog.V(2).Info("Snapshotting prometheus disk")
		if err := pc.snapshotPrometheusDiskIfEnabledSynchronized(); err != nil {
			klog.Warningf("Error while snapshotting prometheus disk: %v", err)
		}
		if err := pc.deletePrometheusDiskIfEnabled(); err != nil {
			klog.Warningf("Error while deleting prometheus disk: %v", err)
		}
	}()

	klog.V(2).Info("Tearing down prometheus stack")
	k8sClient := pc.framework.GetClientSets().GetClient()
	if err := client.DeleteNamespace(k8sClient, namespace); err != nil {
		return err
	}
	if err := client.WaitForDeleteNamespace(k8sClient, namespace, client.DefaultNamespaceDeletionTimeout); err != nil {
		return err
	}
	return nil
}

// GetFramework returns prometheus framework.
func (pc *Controller) GetFramework() *framework.Framework {
	return pc.framework
}

func (pc *Controller) applyManifests(manifestGlob string) error {
	return pc.framework.ApplyTemplatedManifests(
		pc.manifestsFS(), manifestGlob, pc.templateMapping, client.Retry(apierrs.IsNotFound))
}

func (pc *Controller) manifestsFS() fs.FS {
	if pc.clusterLoaderConfig.PrometheusConfig.ManifestPath != "" {
		return os.DirFS(pc.clusterLoaderConfig.PrometheusConfig.ManifestPath)
	}

	return manifestsFS
}
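
// A minimal sketch (illustrative only) of how the globs defined in the consts
// above resolve against whichever fs.FS manifestsFS() returns, embedded or
// local, which is why the two sources stay interchangeable:
//
//	matches, err := fs.Glob(pc.manifestsFS(), coreManifests)
//	if err != nil {
//		return err
//	}
//	klog.V(2).Infof("core manifests to apply: %v", matches)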

// exposeAPIServerMetrics configures anonymous access to the apiserver metrics.
func (pc *Controller) exposeAPIServerMetrics() error {
	klog.V(2).Info("Exposing kube-apiserver metrics in the cluster")
	// We need a client to the cluster that the test is being executed on,
	// not the cluster that the prometheus is running in. Usually, there is only
	// one cluster, but in case of kubemark we have two and thus we need to
	// create a new client here.
	clientSet, err := framework.NewMultiClientSet(
		pc.clusterLoaderConfig.ClusterConfig.KubeConfigPath, numK8sClients)
	if err != nil {
		return err
	}
	createClusterRole := func() error {
		_, err := clientSet.GetClient().RbacV1().ClusterRoles().Create(context.TODO(), &rbacv1.ClusterRole{
			ObjectMeta: metav1.ObjectMeta{Name: "apiserver-metrics-viewer"},
			Rules: []rbacv1.PolicyRule{
				{Verbs: []string{"get"}, NonResourceURLs: []string{"/metrics"}},
			},
		}, metav1.CreateOptions{})
		return err
	}
	createClusterRoleBinding := func() error {
		_, err := clientSet.GetClient().RbacV1().ClusterRoleBindings().Create(context.TODO(), &rbacv1.ClusterRoleBinding{
			ObjectMeta: metav1.ObjectMeta{Name: "system:anonymous"},
			RoleRef:    rbacv1.RoleRef{Kind: "ClusterRole", Name: "apiserver-metrics-viewer"},
			Subjects: []rbacv1.Subject{
				{Kind: "User", Name: "system:anonymous"},
			},
		}, metav1.CreateOptions{})
		return err
	}
	if err := retryCreateFunction(createClusterRole); err != nil {
		return err
	}
	if err := retryCreateFunction(createClusterRoleBinding); err != nil {
		return err
	}
	return nil
}
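
// A minimal sketch (hypothetical) of the access the RBAC above enables: an
// unauthenticated GET of /metrics, which is what the master-ip ServiceMonitors
// rely on. masterIP and scrapePort are placeholder variables, the net/http and
// crypto/tls imports are elided, and certificate verification is skipped
// purely for illustration.
//
//	tr := &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}}
//	c := &http.Client{Transport: tr, Timeout: 10 * time.Second}
//	resp, err := c.Get(fmt.Sprintf("https://%s:%d/metrics", masterIP, scrapePort))
//	if err == nil {
//		defer resp.Body.Close() // expect 200 once the binding exists
//	}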

// runNodeExporter adds node-exporter as master's static manifest pod.
// TODO(mborsz): Consider migrating to something less ugly, e.g. daemonset-based approach,
// when master nodes have configured networking.
func (pc *Controller) runNodeExporter() error {
	klog.V(2).Infof("Starting node-exporter on master nodes.")
	kubemarkFramework, err := framework.NewFramework(&pc.clusterLoaderConfig.ClusterConfig, numK8sClients)
	if err != nil {
		return err
	}

	// Validate masters first.
	nodes, err := client.ListNodes(kubemarkFramework.GetClientSets().GetClient())
	if err != nil {
		return err
	}

	var g errgroup.Group
	numMasters := 0
	for _, node := range nodes {
		node := node
		if util.LegacyIsMasterNode(&node) || util.IsControlPlaneNode(&node) {
			numMasters++
			g.Go(func() error {
				f, err := pc.manifestsFS().Open(nodeExporterPod)
				if err != nil {
					return fmt.Errorf("unable to open manifest file: %v", err)
				}
				defer f.Close()
				return pc.ssh.Exec("sudo tee /etc/kubernetes/manifests/node-exporter.yaml > /dev/null", &node, f)
			})
		}
	}

	if numMasters == 0 {
		return fmt.Errorf("node-exporter requires master machines to be registered as nodes")
	}

	return g.Wait()
}

func (pc *Controller) waitForPrometheusToBeHealthy() error {
	klog.V(2).Info("Waiting for Prometheus stack to become healthy...")
	return wait.PollImmediate(
		checkPrometheusReadyInterval,
		pc.readyTimeout,
		pc.isPrometheusReady)
}

func (pc *Controller) isPrometheusReady() (bool, error) {
	// TODO(mm4tt): Re-enable kube-proxy monitoring and expect more targets.
	// This is a safeguard from a race condition where the prometheus server is started before
	// targets are registered. These 4 targets are always expected, in all possible configurations:
	// prometheus, prometheus-operator, grafana, apiserver
	expectedTargets := 4
	if pc.clusterLoaderConfig.PrometheusConfig.ScrapeEtcd {
		// If scraping etcd is enabled (or it's kubemark where we scrape etcd unconditionally) we need
		// a bit more complicated logic to assess whether all targets are ready. The etcd metric port
		// changed in https://github.com/kubernetes/kubernetes/pull/77561, so depending on the k8s version
		// etcd metrics may be available at port 2379 xor 2382. We solve that by setting up two etcd
		// serviceMonitors, one for 2379 and the other for 2382, and expect at least 1 of them to be healthy.
		ok, err := CheckAllTargetsReady( // All non-etcd targets should be ready.
			pc.framework.GetClientSets().GetClient(),
			func(t Target) bool { return !isEtcdEndpoint(t.Labels["endpoint"]) },
			expectedTargets)
		if err != nil || !ok {
			return ok, err
		}
		return CheckTargetsReady( // 1 out of 2 etcd targets should be ready.
			pc.framework.GetClientSets().GetClient(),
			func(t Target) bool { return isEtcdEndpoint(t.Labels["endpoint"]) },
			2, // expected targets: etcd-2379 and etcd-2382
			1) // one of them should be healthy
	}
	return CheckAllTargetsReady(
		pc.framework.GetClientSets().GetClient(),
		func(Target) bool { return true }, // All targets.
		expectedTargets)
}

func retryCreateFunction(f func() error) error {
	return client.RetryWithExponentialBackOff(
		client.RetryFunction(f, client.Allow(apierrs.IsAlreadyExists)))
}

func (pc *Controller) isKubemark() bool {
	return pc.provider.Features().IsKubemarkProvider
}

func dumpAdditionalLogsOnPrometheusSetupFailure(k8sClient kubernetes.Interface) {
	klog.V(2).Info("Dumping monitoring/prometheus-k8s events...")
	list, err := client.ListEvents(k8sClient, namespace, "prometheus-k8s")
	if err != nil {
		klog.Warningf("Error while listing monitoring/prometheus-k8s events: %v", err)
		return
	}
	s, err := json.MarshalIndent(list, "" /*=prefix*/, "  " /*=indent*/)
	if err != nil {
		klog.Warningf("Error while marshalling response %v: %v", list, err)
		return
	}
	klog.V(2).Info(string(s))
}

func getMasterIps(clusterConfig config.ClusterConfig, usePublicIPs bool) ([]string, error) {
	if usePublicIPs {
		if len(clusterConfig.MasterIPs) == 0 {
			return nil, fmt.Errorf("requested to use public IPs, however no public IPs are provided")
		}
		return clusterConfig.MasterIPs, nil
	}
	if len(clusterConfig.MasterInternalIPs) != 0 {
		klog.V(2).Infof("Using internal master ips (%s) to monitor master components", clusterConfig.MasterInternalIPs)
		return clusterConfig.MasterInternalIPs, nil
	}
	klog.V(1).Infof("Unable to determine master ips from flags or registered nodes. Will fall back to the default/kubernetes service, which can be inaccurate in HA environments.")
	ips, err := getMasterIpsFromKubernetesService(clusterConfig)
	if err != nil {
		klog.Warningf("Failed to translate default/kubernetes service to IP: %v", err)
		return nil, fmt.Errorf("no ips are set, fallback to default/kubernetes service failed due to: %v", err)
	}
	klog.V(2).Infof("default/kubernetes service translated to: %v", ips)
	return ips, nil
}

func getMasterIpsFromKubernetesService(clusterConfig config.ClusterConfig) ([]string, error) {
	// This has to be done in the kubemark cluster, thus we need to create a new client.
	clientSet, err := framework.NewMultiClientSet(clusterConfig.KubeConfigPath, numK8sClients)
	if err != nil {
		return nil, err
	}

	var endpoints *corev1.Endpoints
	f := func() error {
		var err error
		endpoints, err = clientSet.GetClient().CoreV1().Endpoints("default").Get(context.TODO(), "kubernetes", metav1.GetOptions{})
		return err
	}

	if err := client.RetryWithExponentialBackOff(client.RetryFunction(f)); err != nil {
		return nil, err
	}

	var ips []string
	for _, subnet := range endpoints.Subsets {
		for _, address := range subnet.Addresses {
			ips = append(ips, address.IP)
		}
	}

	if len(ips) == 0 {
		return nil, errors.New("no master ips available in default/kubernetes service")
	}

	return ips, nil
}

func isEtcdEndpoint(endpoint string) bool {
	return endpoint == "etcd-2379" || endpoint == "etcd-2382"
}