/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package prometheus

import (
	"context"
	"embed"
	"encoding/json"
	"errors"
	"fmt"
	"io/fs"
	"os"
	"sync"
	"time"

	"golang.org/x/sync/errgroup"
	corev1 "k8s.io/api/core/v1"
	rbacv1 "k8s.io/api/rbac/v1"
	apierrs "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
	"k8s.io/klog/v2"
	"k8s.io/perf-tests/clusterloader2/pkg/config"
	clerrors "k8s.io/perf-tests/clusterloader2/pkg/errors"
	"k8s.io/perf-tests/clusterloader2/pkg/flags"
	"k8s.io/perf-tests/clusterloader2/pkg/framework"
	"k8s.io/perf-tests/clusterloader2/pkg/framework/client"
	"k8s.io/perf-tests/clusterloader2/pkg/provider"
	"k8s.io/perf-tests/clusterloader2/pkg/util"
)

const (
	// namespace is where the whole prometheus stack is installed.
	namespace = "monitoring"
	// storageClass is deleted before setup because its Reclaim Policy cannot be changed in place.
	storageClass = "ssd"
	// checkPrometheusReadyInterval is the polling period while waiting for targets to be healthy.
	checkPrometheusReadyInterval = 30 * time.Second
	numK8sClients                = 1

	// All paths here are relative to manifests dir.
	coreManifests                = "*.yaml"
	defaultServiceMonitors       = "default/*.yaml"
	kubeStateMetricsManifests    = "exporters/kube-state-metrics/*.yaml"
	masterIPServiceMonitors      = "master-ip/*.yaml"
	metricsServerManifests       = "exporters/metrics-server/*.yaml"
	nodeExporterPod              = "exporters/node_exporter/node-exporter.yaml"
	windowsNodeExporterManifests = "exporters/windows_node_exporter/*.yaml"
	pushgatewayManifests         = "pushgateway/*.yaml"
)

//go:embed manifests
var manifestsFSWithPrefix embed.FS

// manifestsFS is manifestsFSWithPrefix with the leading "manifests/" path
// component stripped; populated in init below.
var manifestsFS fs.FS

func init() {
	var err error
	// go's embed generates embed.FS with all files with 'manifests/' prefix.
	// To be consistent with --prometheus-manifest-path (which is defined inside of manifests) we need to drop this prefix.
	manifestsFS, err = fs.Sub(manifestsFSWithPrefix, "manifests")
	if err != nil {
		// The embedded FS is fixed at build time, so a failure here is a
		// build/programmer error; panicking at startup is intentional.
		panic(fmt.Sprintf("failed to strip manifests prefix: %v", err))
	}
}

// InitFlags initializes prometheus flags.
// Each flag can also be set through the listed environment variable.
func InitFlags(p *config.PrometheusConfig) {
	flags.BoolEnvVar(&p.EnableServer, "enable-prometheus-server", "ENABLE_PROMETHEUS_SERVER", false, "Whether to set-up the prometheus server in the cluster.")
	flags.BoolEnvVar(&p.TearDownServer, "tear-down-prometheus-server", "TEAR_DOWN_PROMETHEUS_SERVER", true, "Whether to tear-down the prometheus server after tests (if set-up).")
	flags.BoolEnvVar(&p.EnablePushgateway, "enable-pushgateway", "PROMETHEUS_ENABLE_PUSHGATEWAY", false, "Whether to set-up the Pushgateway. Only work with enabled Prometheus server.")
	flags.BoolEnvVar(&p.ScrapeEtcd, "prometheus-scrape-etcd", "PROMETHEUS_SCRAPE_ETCD", false, "Whether to scrape etcd metrics.")
	flags.BoolEnvVar(&p.ScrapeNodeExporter, "prometheus-scrape-node-exporter", "PROMETHEUS_SCRAPE_NODE_EXPORTER", false, "Whether to scrape node exporter metrics.")
	flags.BoolEnvVar(&p.ScrapeWindowsNodeExporter, "prometheus-scrape-windows-node-exporter", "PROMETHEUS_SCRAPE_WINDOWS_NODE_EXPORTER", false, "Whether to scrape Windows node exporter metrics.")
	flags.BoolEnvVar(&p.ScrapeKubelets, "prometheus-scrape-kubelets", "PROMETHEUS_SCRAPE_KUBELETS", false, "Whether to scrape kubelets (nodes + master). Experimental, may not work in larger clusters. Requires heapster node to be at least n1-standard-4, which needs to be provided manually.")
	flags.BoolEnvVar(&p.ScrapeMasterKubelets, "prometheus-scrape-master-kubelets", "PROMETHEUS_SCRAPE_MASTER_KUBELETS", false, "Whether to scrape kubelets running on master nodes.")
	flags.BoolEnvVar(&p.ScrapeKubeProxy, "prometheus-scrape-kube-proxy", "PROMETHEUS_SCRAPE_KUBE_PROXY", true, "Whether to scrape kube proxy.")
	flags.StringEnvVar(&p.KubeProxySelectorKey, "prometheus-kube-proxy-selector-key", "PROMETHEUS_KUBE_PROXY_SELECTOR_KEY", "component", "Label key used to scrape kube proxy.")
	flags.BoolEnvVar(&p.ScrapeKubeStateMetrics, "prometheus-scrape-kube-state-metrics", "PROMETHEUS_SCRAPE_KUBE_STATE_METRICS", false, "Whether to scrape kube-state-metrics. Only run occasionally.")
	flags.BoolEnvVar(&p.ScrapeMetricsServerMetrics, "prometheus-scrape-metrics-server", "PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS", false, "Whether to scrape metrics-server. Only run occasionally.")
	flags.BoolEnvVar(&p.ScrapeNodeLocalDNS, "prometheus-scrape-node-local-dns", "PROMETHEUS_SCRAPE_NODE_LOCAL_DNS", false, "Whether to scrape node-local-dns pods.")
	flags.BoolEnvVar(&p.ScrapeAnet, "prometheus-scrape-anet", "PROMETHEUS_SCRAPE_ANET", false, "Whether to scrape anet pods.")
	flags.BoolEnvVar(&p.ScrapeCiliumOperator, "prometheus-scrape-cilium-operator", "PROMETHEUS_SCRAPE_CILIUM_OPERATOR", false, "Whether to scrape cilium-operator pods.")
	flags.BoolEnvVar(&p.ScrapeMastersWithPublicIPs, "prometheus-scrape-masters-with-public-ips", "PROMETHEUS_SCRAPE_MASTERS_WITH_PUBLIC_IPS", false, "Whether to scrape master machines using public ips, instead of private.")
	flags.IntEnvVar(&p.APIServerScrapePort, "prometheus-apiserver-scrape-port", "PROMETHEUS_APISERVER_SCRAPE_PORT", 443, "Port for scraping kube-apiserver (default 443).")
	flags.StringEnvVar(&p.SnapshotProject, "experimental-snapshot-project", "PROJECT", "", "GCP project used where disks and snapshots are located.")
	flags.StringEnvVar(&p.ManifestPath, "prometheus-manifest-path", "PROMETHEUS_MANIFEST_PATH", "", "Path to the prometheus manifest files.")
	flags.StringEnvVar(&p.StorageClassProvisioner, "prometheus-storage-class-provisioner", "PROMETHEUS_STORAGE_CLASS_PROVISIONER", "kubernetes.io/gce-pd", "Volumes plugin used to provision PVs for Prometheus.")
	flags.StringEnvVar(&p.StorageClassVolumeType, "prometheus-storage-class-volume-type", "PROMETHEUS_STORAGE_CLASS_VOLUME_TYPE", "pd-ssd", "Volume types of storage class, This will be different depending on the provisioner.")
	flags.StringEnvVar(&p.PVCStorageClass, "prometheus-pvc-storage-class", "PROMETHEUS_PVC_STORAGE_CLASS", "ssd", "Storage class used with prometheus persistent volume claim.")
	flags.DurationEnvVar(&p.ReadyTimeout, "prometheus-ready-timeout", "PROMETHEUS_READY_TIMEOUT", 15*time.Minute, "Timeout for waiting for Prometheus stack to become healthy.")
	flags.StringEnvVar(&p.PrometheusMemoryRequest, "prometheus-memory-request", "PROMETHEUS_MEMORY_REQUEST", "10Gi", "Memory request to be used by promehteus.")
	err := flags.MarkDeprecated("prometheus-manifest-path", "prometheus manifests are now taken from the embed FS prepared in the build time. This flag is planned to be removed in Jan 2023. Do you really need this flag?")
	if err != nil {
		klog.Fatalf("unable to mark flag prometheus-manifest-path deprecated %v", err)
	}
}

// ValidatePrometheusFlags validates prometheus flags.
// Currently it only checks that a snapshot project is configured when disk
// snapshotting was requested.
func ValidatePrometheusFlags(p *config.PrometheusConfig) *clerrors.ErrorList {
	errList := clerrors.NewErrorList()
	if *shouldSnapshotPrometheusDisk && p.SnapshotProject == "" {
		errList.Append(fmt.Errorf("requesting snapshot, but snapshot project not configured. Use --experimental-snapshot-project flag"))
	}
	return errList
}

// Controller is a util for managing (setting up / tearing down) the prometheus stack in
// the cluster.
type Controller struct {
	clusterLoaderConfig *config.ClusterLoaderConfig
	// provider is the cloud provider derived from the --provider flag.
	provider provider.Provider
	// framework associated with the cluster where the prometheus stack should be set up.
	// For kubemark it's the root cluster, otherwise it's the main (and only) cluster.
	framework *framework.Framework
	// templateMapping is a mapping defining placeholders used in manifest templates.
	templateMapping map[string]interface{}
	// diskMetadata stores name and zone of the Prometheus persistent disk.
	diskMetadata prometheusDiskMetadata
	// snapshotLock makes sure that only a single Prometheus snapshot is happening.
	snapshotLock sync.Mutex
	// snapshotted records whether the Prometheus snapshot is already done - protected by snapshotLock.
	snapshotted bool
	// snapshotError contains the error from the snapshot attempt - protected by snapshotLock.
	snapshotError error
	// ssh executor to run commands in cluster nodes via ssh.
	ssh util.SSHExecutor
	// readyTimeout bounds the wait for the Prometheus stack to become healthy.
	readyTimeout time.Duration
}
131 diskMetadata prometheusDiskMetadata 132 // snapshotLock makes sure that only single Prometheus snapshot is happening 133 snapshotLock sync.Mutex 134 // snapshotted is a check if the Prometheus snapshot is already done - protected by snapshotLock 135 snapshotted bool 136 // snapshotError contains error from snapshot attempt - protected by snapshotLock 137 snapshotError error 138 // ssh executor to run commands in cluster nodes via ssh 139 ssh util.SSHExecutor 140 // timeout for waiting for Prometheus stack to become healthy 141 readyTimeout time.Duration 142 } 143 144 // NewController creates a new instance of Controller for the given config. 145 func NewController(clusterLoaderConfig *config.ClusterLoaderConfig) (pc *Controller, err error) { 146 pc = &Controller{ 147 clusterLoaderConfig: clusterLoaderConfig, 148 provider: clusterLoaderConfig.ClusterConfig.Provider, 149 readyTimeout: clusterLoaderConfig.PrometheusConfig.ReadyTimeout, 150 } 151 152 if pc.framework, err = framework.NewRootFramework(&clusterLoaderConfig.ClusterConfig, numK8sClients); err != nil { 153 return nil, err 154 } 155 156 mapping, errList := config.GetMapping(clusterLoaderConfig, nil) 157 if errList != nil { 158 return nil, errList 159 } 160 mapping["MasterIps"], err = getMasterIps(clusterLoaderConfig.ClusterConfig, clusterLoaderConfig.PrometheusConfig.ScrapeMastersWithPublicIPs) 161 if err != nil { 162 klog.Warningf("Couldn't get master ip, will ignore manifests requiring it: %v", err) 163 delete(mapping, "MasterIps") 164 } 165 if _, exists := mapping["PROMETHEUS_SCRAPE_APISERVER_ONLY"]; !exists { 166 mapping["PROMETHEUS_SCRAPE_APISERVER_ONLY"] = clusterLoaderConfig.ClusterConfig.Provider.Features().ShouldPrometheusScrapeApiserverOnly 167 } 168 // TODO: Change to pure assignments when overrides are not used. 
169 if _, exists := mapping["PROMETHEUS_SCRAPE_ETCD"]; !exists { 170 mapping["PROMETHEUS_SCRAPE_ETCD"] = clusterLoaderConfig.PrometheusConfig.ScrapeEtcd 171 } else { 172 // Backward compatibility. 173 clusterLoaderConfig.PrometheusConfig.ScrapeEtcd = mapping["PROMETHEUS_SCRAPE_ETCD"].(bool) 174 } 175 if _, exists := mapping["PROMETHEUS_SCRAPE_NODE_EXPORTER"]; !exists { 176 mapping["PROMETHEUS_SCRAPE_NODE_EXPORTER"] = clusterLoaderConfig.PrometheusConfig.ScrapeNodeExporter 177 } else { 178 // Backward compatibility. 179 clusterLoaderConfig.PrometheusConfig.ScrapeNodeExporter = mapping["PROMETHEUS_SCRAPE_NODE_EXPORTER"].(bool) 180 } 181 if _, exists := mapping["PROMETHEUS_SCRAPE_WINDOWS_NODE_EXPORTER"]; !exists { 182 mapping["PROMETHEUS_SCRAPE_WINDOWS_NODE_EXPORTER"] = clusterLoaderConfig.PrometheusConfig.ScrapeWindowsNodeExporter 183 } else { 184 // Backward compatibility. 185 clusterLoaderConfig.PrometheusConfig.ScrapeWindowsNodeExporter = mapping["PROMETHEUS_SCRAPE_WINDOWS_NODE_EXPORTER"].(bool) 186 } 187 if _, exists := mapping["PROMETHEUS_SCRAPE_KUBE_PROXY"]; !exists { 188 clusterLoaderConfig.PrometheusConfig.ScrapeKubeProxy = clusterLoaderConfig.ClusterConfig.Provider.Features().ShouldScrapeKubeProxy 189 mapping["PROMETHEUS_SCRAPE_KUBE_PROXY"] = clusterLoaderConfig.PrometheusConfig.ScrapeKubeProxy 190 } else { 191 // Backward compatibility 192 clusterLoaderConfig.PrometheusConfig.ScrapeKubeProxy = mapping["PROMETHEUS_SCRAPE_KUBE_PROXY"].(bool) 193 } 194 if _, exists := mapping["PROMETHEUS_SCRAPE_ANET"]; !exists { 195 mapping["PROMETHEUS_SCRAPE_ANET"] = clusterLoaderConfig.PrometheusConfig.ScrapeAnet 196 } else { 197 clusterLoaderConfig.PrometheusConfig.ScrapeAnet = mapping["PROMETHEUS_SCRAPE_ANET"].(bool) 198 } 199 if _, exists := mapping["PROMETHEUS_SCRAPE_CILIUM_OPERATOR"]; !exists { 200 mapping["PROMETHEUS_SCRAPE_CILIUM_OPERATOR"] = clusterLoaderConfig.PrometheusConfig.ScrapeCiliumOperator 201 } else { 202 
clusterLoaderConfig.PrometheusConfig.ScrapeCiliumOperator = mapping["PROMETHEUS_SCRAPE_CILIUM_OPERATOR"].(bool) 203 } 204 mapping["PROMETHEUS_SCRAPE_NODE_LOCAL_DNS"] = clusterLoaderConfig.PrometheusConfig.ScrapeNodeLocalDNS 205 mapping["PROMETHEUS_SCRAPE_KUBE_STATE_METRICS"] = clusterLoaderConfig.PrometheusConfig.ScrapeKubeStateMetrics 206 mapping["PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS"] = clusterLoaderConfig.PrometheusConfig.ScrapeMetricsServerMetrics 207 mapping["PROMETHEUS_SCRAPE_KUBELETS"] = clusterLoaderConfig.PrometheusConfig.ScrapeKubelets 208 mapping["PROMETHEUS_SCRAPE_MASTER_KUBELETS"] = clusterLoaderConfig.PrometheusConfig.ScrapeKubelets || clusterLoaderConfig.PrometheusConfig.ScrapeMasterKubelets 209 mapping["PROMETHEUS_APISERVER_SCRAPE_PORT"] = clusterLoaderConfig.PrometheusConfig.APIServerScrapePort 210 mapping["PROMETHEUS_STORAGE_CLASS_PROVISIONER"] = clusterLoaderConfig.PrometheusConfig.StorageClassProvisioner 211 mapping["PROMETHEUS_STORAGE_CLASS_VOLUME_TYPE"] = clusterLoaderConfig.PrometheusConfig.StorageClassVolumeType 212 mapping["PROMETHEUS_KUBE_PROXY_SELECTOR_KEY"] = clusterLoaderConfig.PrometheusConfig.KubeProxySelectorKey 213 mapping["PROMETHEUS_PVC_STORAGE_CLASS"] = clusterLoaderConfig.PrometheusConfig.PVCStorageClass 214 mapping["PROMETHEUS_MEMORY_REQUEST"] = clusterLoaderConfig.PrometheusConfig.PrometheusMemoryRequest 215 snapshotEnabled, _ := pc.isEnabled() 216 mapping["RetainPD"] = snapshotEnabled 217 218 pc.templateMapping = mapping 219 220 pc.ssh = &util.GCloudSSHExecutor{} 221 222 return pc, nil 223 } 224 225 // SetUpPrometheusStack sets up prometheus stack in the cluster. 226 // This method is idempotent, if the prometheus stack is already set up applying the manifests 227 // again will be no-op. 
func (pc *Controller) SetUpPrometheusStack() error {
	k8sClient := pc.framework.GetClientSets().GetClient()

	klog.V(2).Info("Setting up prometheus stack")
	if err := client.CreateNamespace(k8sClient, namespace); err != nil {
		return err
	}
	// Removing Storage Class as Reclaim Policy cannot be changed
	if err := client.DeleteStorageClass(k8sClient, storageClass); err != nil {
		return err
	}
	if err := pc.applyManifests(coreManifests); err != nil {
		return err
	}
	if pc.clusterLoaderConfig.PrometheusConfig.ScrapeNodeExporter {
		if err := pc.runNodeExporter(); err != nil {
			return err
		}
	}
	if pc.clusterLoaderConfig.PrometheusConfig.ScrapeWindowsNodeExporter {
		if err := pc.applyManifests(windowsNodeExporterManifests); err != nil {
			return err
		}
	} else {
		// Backward compatibility
		// If enabled scraping windows node, need to setup windows node and template mapping
		if isWindowsNodeScrapingEnabled(pc.templateMapping, pc.clusterLoaderConfig) {
			if err := setUpWindowsNodeAndTemplate(k8sClient, pc.templateMapping); err != nil {
				return err
			}
		}
	}
	if !pc.isKubemark() {
		if err := pc.applyManifests(defaultServiceMonitors); err != nil {
			return err
		}
	}

	if pc.clusterLoaderConfig.PrometheusConfig.ScrapeKubeStateMetrics && pc.clusterLoaderConfig.ClusterConfig.Provider.Features().SupportKubeStateMetrics {
		klog.V(2).Infof("Applying kube-state-metrics in the cluster.")
		if err := pc.applyManifests(kubeStateMetricsManifests); err != nil {
			return err
		}
	}
	if pc.clusterLoaderConfig.PrometheusConfig.ScrapeMetricsServerMetrics && pc.clusterLoaderConfig.ClusterConfig.Provider.Features().SupportMetricsServerMetrics {
		klog.V(2).Infof("Applying metrics server in the cluster.")
		if err := pc.applyManifests(metricsServerManifests); err != nil {
			return err
		}
	}
	// "MasterIps" is only present in the mapping when the IPs could be resolved
	// in NewController; without them the master-ip serviceMonitors cannot be rendered.
	if _, ok := pc.templateMapping["MasterIps"]; ok {
		if err := pc.exposeAPIServerMetrics(); err != nil {
			return err
		}
		if err := pc.applyManifests(masterIPServiceMonitors); err != nil {
			return err
		}
	}
	if pc.clusterLoaderConfig.PrometheusConfig.EnablePushgateway {
		klog.V(2).Infof("Applying Pushgateway in the cluster.")
		if err := pc.applyManifests(pushgatewayManifests); err != nil {
			return err
		}
	}
	if err := pc.waitForPrometheusToBeHealthy(); err != nil {
		// Dump prometheus-k8s events to aid debugging before giving up.
		dumpAdditionalLogsOnPrometheusSetupFailure(k8sClient)
		return err
	}
	klog.V(2).Info("Prometheus stack set up successfully")
	// Cache disk metadata now so a later teardown can still snapshot/delete the
	// PD; failure here is non-fatal.
	if err := pc.cachePrometheusDiskMetadataIfEnabled(); err != nil {
		klog.Warningf("Error while caching prometheus disk metadata: %v", err)
	}
	return nil
}

// MakePrometheusSnapshotIfEnabled takes a snapshot from Prometheus if
// snapshotting is enabled; a failure is logged as a warning and also returned.
func (pc *Controller) MakePrometheusSnapshotIfEnabled() error {
	klog.V(2).Info("Get snapshot from Prometheus")
	if err := pc.snapshotPrometheusIfEnabled(); err != nil {
		klog.Warningf("Error while getting prometheus snapshot: %v", err)
		return err
	}

	return nil
}

// TearDownPrometheusStack tears down prometheus stack, releasing all prometheus resources.
func (pc *Controller) TearDownPrometheusStack() error {
	// Get disk metadata again to be sure
	if err := pc.cachePrometheusDiskMetadataIfEnabled(); err != nil {
		klog.Warningf("Error while caching prometheus disk metadata: %v", err)
	}
	// Deferred so the disk is snapshotted/deleted even if namespace deletion below fails.
	defer func() {
		klog.V(2).Info("Snapshotting prometheus disk")
		if err := pc.snapshotPrometheusDiskIfEnabledSynchronized(); err != nil {
			klog.Warningf("Error while snapshotting prometheus disk: %v", err)
		}
		if err := pc.deletePrometheusDiskIfEnabled(); err != nil {
			klog.Warningf("Error while deleting prometheus disk: %v", err)
		}
	}()

	klog.V(2).Info("Tearing down prometheus stack")
	k8sClient := pc.framework.GetClientSets().GetClient()
	if err := client.DeleteNamespace(k8sClient, namespace); err != nil {
		return err
	}
	if err := client.WaitForDeleteNamespace(k8sClient, namespace, client.DefaultNamespaceDeletionTimeout); err != nil {
		return err
	}
	return nil
}

// GetFramework returns prometheus framework.
func (pc *Controller) GetFramework() *framework.Framework {
	return pc.framework
}

// applyManifests templatizes and applies all manifest files matching the given
// glob, retrying on NotFound errors (e.g. a CRD that is not registered yet).
func (pc *Controller) applyManifests(manifestGlob string) error {
	return pc.framework.ApplyTemplatedManifests(
		pc.manifestsFS(), manifestGlob, pc.templateMapping, client.Retry(apierrs.IsNotFound))
}

// manifestsFS returns the filesystem to read manifests from: the directory
// given by --prometheus-manifest-path when set, otherwise the embedded manifests.
func (pc *Controller) manifestsFS() fs.FS {
	if pc.clusterLoaderConfig.PrometheusConfig.ManifestPath != "" {
		return os.DirFS(pc.clusterLoaderConfig.PrometheusConfig.ManifestPath)
	}

	return manifestsFS
}

// exposeAPIServerMetrics configures anonymous access to the apiserver metrics.
func (pc *Controller) exposeAPIServerMetrics() error {
	klog.V(2).Info("Exposing kube-apiserver metrics in the cluster")
	// We need to get a client to the cluster where the test is being executed on,
	// not the cluster that the prometheus is running in. Usually, there is only
	// one cluster, but in case of kubemark we have two and thus we need to
	// create a new client here.
	clientSet, err := framework.NewMultiClientSet(
		pc.clusterLoaderConfig.ClusterConfig.KubeConfigPath, numK8sClients)
	if err != nil {
		return err
	}
	createClusterRole := func() error {
		_, err := clientSet.GetClient().RbacV1().ClusterRoles().Create(context.TODO(), &rbacv1.ClusterRole{
			ObjectMeta: metav1.ObjectMeta{Name: "apiserver-metrics-viewer"},
			Rules: []rbacv1.PolicyRule{
				{Verbs: []string{"get"}, NonResourceURLs: []string{"/metrics"}},
			},
		}, metav1.CreateOptions{})
		return err
	}
	createClusterRoleBinding := func() error {
		_, err := clientSet.GetClient().RbacV1().ClusterRoleBindings().Create(context.TODO(), &rbacv1.ClusterRoleBinding{
			ObjectMeta: metav1.ObjectMeta{Name: "system:anonymous"},
			RoleRef:    rbacv1.RoleRef{Kind: "ClusterRole", Name: "apiserver-metrics-viewer"},
			Subjects: []rbacv1.Subject{
				{Kind: "User", Name: "system:anonymous"},
			},
		}, metav1.CreateOptions{})
		return err
	}
	// retryCreateFunction tolerates already-exists errors, making this idempotent.
	if err := retryCreateFunction(createClusterRole); err != nil {
		return err
	}
	if err := retryCreateFunction(createClusterRoleBinding); err != nil {
		return err
	}
	return nil
}

// runNodeExporter adds node-exporter as master's static manifest pod.
// TODO(mborsz): Consider migrating to something less ugly, e.g. daemonset-based approach,
// when master nodes have configured networking.
func (pc *Controller) runNodeExporter() error {
	klog.V(2).Infof("Starting node-exporter on master nodes.")
	kubemarkFramework, err := framework.NewFramework(&pc.clusterLoaderConfig.ClusterConfig, numK8sClients)
	if err != nil {
		return err
	}

	// Validate masters first
	nodes, err := client.ListNodes(kubemarkFramework.GetClientSets().GetClient())
	if err != nil {
		return err
	}

	var g errgroup.Group
	numMasters := 0
	for _, node := range nodes {
		node := node // capture a per-iteration copy for the goroutine below
		if util.LegacyIsMasterNode(&node) || util.IsControlPlaneNode(&node) {
			numMasters++
			g.Go(func() error {
				f, err := pc.manifestsFS().Open(nodeExporterPod)
				if err != nil {
					return fmt.Errorf("unable to open manifest file: %v", err)
				}
				defer f.Close()
				// Copy the pod manifest into the master's static-manifests dir over ssh;
				// kubelet then starts the pod automatically.
				return pc.ssh.Exec("sudo tee /etc/kubernetes/manifests/node-exporter.yaml > /dev/null", &node, f)
			})
		}
	}

	if numMasters == 0 {
		return fmt.Errorf("node-exporter requires master to be registered nodes")
	}

	return g.Wait()
}

// waitForPrometheusToBeHealthy polls isPrometheusReady every
// checkPrometheusReadyInterval until it succeeds or readyTimeout elapses.
func (pc *Controller) waitForPrometheusToBeHealthy() error {
	klog.V(2).Info("Waiting for Prometheus stack to become healthy...")
	return wait.PollImmediate(
		checkPrometheusReadyInterval,
		pc.readyTimeout,
		pc.isPrometheusReady)
}

// isPrometheusReady reports whether the expected Prometheus scrape targets are
// registered and healthy; used as the poll condition above.
func (pc *Controller) isPrometheusReady() (bool, error) {
	// TODO(mm4tt): Re-enable kube-proxy monitoring and expect more targets.
	// This is a safeguard from a race condition where the prometheus server is started before
	// targets are registered. These 4 targets are always expected, in all possible configurations:
	// prometheus, prometheus-operator, grafana, apiserver
	expectedTargets := 4
	if pc.clusterLoaderConfig.PrometheusConfig.ScrapeEtcd {
		// If scraping etcd is enabled (or it's kubemark where we scrape etcd unconditionally) we need
		// a bit more complicated logic to asses whether all targets are ready. Etcd metric port has
		// changed in https://github.com/kubernetes/kubernetes/pull/77561, depending on the k8s version
		// etcd metrics may be available at port 2379 xor 2382. We solve that by setting two etcd
		// serviceMonitors one for 2379 and other for 2382 and expect that at least 1 of them should be healthy.
		ok, err := CheckAllTargetsReady( // All non-etcd targets should be ready.
			pc.framework.GetClientSets().GetClient(),
			func(t Target) bool { return !isEtcdEndpoint(t.Labels["endpoint"]) },
			expectedTargets)
		if err != nil || !ok {
			return ok, err
		}
		return CheckTargetsReady( // 1 out of 2 etcd targets should be ready.
			pc.framework.GetClientSets().GetClient(),
			func(t Target) bool { return isEtcdEndpoint(t.Labels["endpoint"]) },
			2, // expected targets: etcd-2379 and etcd-2382
			1) // one of them should be healthy
	}
	return CheckAllTargetsReady(
		pc.framework.GetClientSets().GetClient(),
		func(Target) bool { return true }, // All targets.
		expectedTargets)
}
474 expectedTargets) 475 } 476 477 func retryCreateFunction(f func() error) error { 478 return client.RetryWithExponentialBackOff( 479 client.RetryFunction(f, client.Allow(apierrs.IsAlreadyExists))) 480 } 481 482 func (pc *Controller) isKubemark() bool { 483 return pc.provider.Features().IsKubemarkProvider 484 } 485 486 func dumpAdditionalLogsOnPrometheusSetupFailure(k8sClient kubernetes.Interface) { 487 klog.V(2).Info("Dumping monitoring/prometheus-k8s events...") 488 list, err := client.ListEvents(k8sClient, namespace, "prometheus-k8s") 489 if err != nil { 490 klog.Warningf("Error while listing monitoring/prometheus-k8s events: %v", err) 491 return 492 } 493 s, err := json.MarshalIndent(list, "" /*=prefix*/, " " /*=indent*/) 494 if err != nil { 495 klog.Warningf("Error while marshalling response %v: %v", list, err) 496 return 497 } 498 klog.V(2).Info(string(s)) 499 } 500 501 func getMasterIps(clusterConfig config.ClusterConfig, usePublicIPs bool) ([]string, error) { 502 if usePublicIPs { 503 if len(clusterConfig.MasterIPs) == 0 { 504 return nil, fmt.Errorf("requested to use public IPs, however no publics IPs are provided") 505 } 506 return clusterConfig.MasterIPs, nil 507 } 508 if len(clusterConfig.MasterInternalIPs) != 0 { 509 klog.V(2).Infof("Using internal master ips (%s) to monitor master's components", clusterConfig.MasterInternalIPs) 510 return clusterConfig.MasterInternalIPs, nil 511 } 512 klog.V(1).Infof("Unable to determine master ips from flags or registered nodes. 
Will fallback to default/kubernetes service, which can be inaccurate in HA environments.") 513 ips, err := getMasterIpsFromKubernetesService(clusterConfig) 514 if err != nil { 515 klog.Warningf("Failed to translate default/kubernetes service to IP: %v", err) 516 return nil, fmt.Errorf("no ips are set, fallback to default/kubernetes service failed due to: %v", err) 517 } 518 klog.V(2).Infof("default/kubernetes service translated to: %v", ips) 519 return ips, nil 520 } 521 522 func getMasterIpsFromKubernetesService(clusterConfig config.ClusterConfig) ([]string, error) { 523 // This has to be done in the kubemark cluster, thus we need to create a new client. 524 clientSet, err := framework.NewMultiClientSet(clusterConfig.KubeConfigPath, numK8sClients) 525 if err != nil { 526 return nil, err 527 } 528 529 var endpoints *corev1.Endpoints 530 f := func() error { 531 var err error 532 endpoints, err = clientSet.GetClient().CoreV1().Endpoints("default").Get(context.TODO(), "kubernetes", metav1.GetOptions{}) 533 return err 534 } 535 536 if err := client.RetryWithExponentialBackOff(client.RetryFunction(f)); err != nil { 537 return nil, err 538 } 539 540 var ips []string 541 for _, subnet := range endpoints.Subsets { 542 for _, address := range subnet.Addresses { 543 ips = append(ips, address.IP) 544 } 545 } 546 547 if len(ips) == 0 { 548 return nil, errors.New("no master ips available in default/kubernetes service") 549 } 550 551 return ips, nil 552 } 553 554 func isEtcdEndpoint(endpoint string) bool { 555 return endpoint == "etcd-2379" || endpoint == "etcd-2382" 556 }