github.com/zppinho/prow@v0.0.0-20240510014325-1738badeb017/pkg/flagutil/kubernetes_cluster_clients.go

/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package flagutil

import (
	"context"
	"errors"
	"flag"
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/sirupsen/logrus"
	"gopkg.in/fsnotify.v1"

	k8sauthorizationv1 "k8s.io/api/authorization/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	utilerrors "k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/client-go/kubernetes"
	authorizationv1 "k8s.io/client-go/kubernetes/typed/authorization/v1"
	corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/clientcmd"
	"sigs.k8s.io/controller-runtime/pkg/cache"
	ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/manager"
	"sigs.k8s.io/controller-runtime/pkg/metrics/server"

	prow "sigs.k8s.io/prow/pkg/client/clientset/versioned"
	prowv1 "sigs.k8s.io/prow/pkg/client/clientset/versioned/typed/prowjobs/v1"
	"sigs.k8s.io/prow/pkg/kube"
)

func init() {
	prometheus.MustRegister(clientCreationFailures)
}

// KubernetesOptions holds options for interacting with Kubernetes.
// These options are both useful for clients interacting with ProwJobs
// and other resources on the infrastructure cluster, as well as Pods
// on build clusters.
type KubernetesOptions struct {
	kubeconfig               string
	kubeconfigDir            string
	kubeconfigSuffix         string
	projectedTokenFile       string
	noInClusterConfig        bool
	NOInClusterConfigDefault bool

	// from the setter SetDisabledClusters
	disabledClusters sets.Set[string]

	// from resolution
	resolved                    bool
	dryRun                      bool
	prowJobClientset            prow.Interface
	clusterConfigs              map[string]rest.Config
	kubernetesClientsByContext  map[string]kubernetes.Interface
	infrastructureClusterConfig *rest.Config
	kubeconfigWach              *sync.Once
	kubeconfigWatchEvents       <-chan fsnotify.Event
}

var MissingPermissions = errors.New("missing permissions")
// AddKubeconfigChangeCallback adds a callback that gets called whenever the kubeconfig changes.
// The main use case for this is to exit components that cannot reload a kubeconfig at runtime,
// so that the kubelet restarts them.
func (o *KubernetesOptions) AddKubeconfigChangeCallback(callback func()) error {
	if err := o.resolve(o.dryRun); err != nil {
		return fmt.Errorf("resolving failed: %w", err)
	}

	var err error
	o.kubeconfigWach.Do(func() {
		var watcher *fsnotify.Watcher
		watcher, err = fsnotify.NewWatcher()
		if err != nil {
			err = fmt.Errorf("failed to create watcher: %w", err)
			return
		}
		if o.kubeconfig != "" {
			err = watcher.Add(o.kubeconfig)
			if err != nil {
				err = fmt.Errorf("failed to watch %s: %w", o.kubeconfig, err)
				return
			}
		}
		if o.kubeconfigDir != "" {
			err = watcher.Add(o.kubeconfigDir)
			if err != nil {
				err = fmt.Errorf("failed to watch %s: %w", o.kubeconfigDir, err)
				return
			}
		}
		if o.kubeconfig == "" && o.kubeconfigDir == "" {
			if envVal := os.Getenv(clientcmd.RecommendedConfigPathEnvVar); envVal != "" {
				for _, element := range sets.List(sets.New[string](filepath.SplitList(envVal)...)) {
					err = watcher.Add(element)
					if err != nil {
						err = fmt.Errorf("failed to watch %s: %w", element, err)
						return
					}
				}
			}
		}
		o.kubeconfigWatchEvents = watcher.Events

		go func() {
			for watchErr := range watcher.Errors {
				logrus.WithError(watchErr).Error("Kubeconfig watcher errored")
			}
			if err := watcher.Close(); err != nil {
				logrus.WithError(err).Error("Failed to close watcher")
			}
		}()
	})
	if err != nil {
		return fmt.Errorf("failed to set up watches: %w", err)
	}

	go func() {
		for e := range o.kubeconfigWatchEvents {
			if e.Op == fsnotify.Chmod {
				// For some reason we get frequent chmod events
				continue
			}
			logrus.WithField("event", e.String()).Info("Kubeconfig changed")
			callback()
		}
	}()

	return nil
}

// LoadClusterConfigs returns the resolved rest.Configs, and each callback function will be executed if
// the underlying kubeconfig files are modified. This function is for the case where the rest.Configs are
// needed without any interest in the clients.
func (o *KubernetesOptions) LoadClusterConfigs(callBacks ...func()) (map[string]rest.Config, error) {
	var errs []error
	if !o.resolved {
		if err := o.resolve(o.dryRun); err != nil {
			errs = append(errs, fmt.Errorf("failed to resolve the kubernetes options: %w", err))
		}
	}

	if o.kubeconfig == "" && o.kubeconfigDir == "" {
		if envVal := os.Getenv(clientcmd.RecommendedConfigPathEnvVar); envVal != "" {
			if kubeconfigsFromEnv := strings.Split(envVal, ":"); len(kubeconfigsFromEnv) > 0 &&
				len(kubeconfigsFromEnv) > len(o.clusterConfigs) {
				errs = append(errs, fmt.Errorf("%s env var with value %s had %d elements but only got %d kubeconfigs",
					clientcmd.RecommendedConfigPathEnvVar, envVal, len(kubeconfigsFromEnv), len(o.clusterConfigs)))
			}
		}
	}

	for i, callBack := range callBacks {
		if callBack != nil {
			if err := o.AddKubeconfigChangeCallback(callBack); err != nil {
				errs = append(errs, fmt.Errorf("failed to add the %d-th kubeconfig change callback: %w", i, err))
			}
		}
	}
	return o.clusterConfigs, utilerrors.NewAggregate(errs)
}
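// A minimal wiring sketch for the options and change callback above, from the
// perspective of a hypothetical component consuming this package (the callback
// body and flag values are illustrative; AddFlags, Validate, and
// AddKubeconfigChangeCallback are the methods defined in this file). On the
// command line this would correspond to e.g. --kubeconfig-dir=/etc/kubeconfigs:
//
//	var o flagutil.KubernetesOptions
//	o.AddFlags(flag.CommandLine)
//	flag.Parse()
//	if err := o.Validate(false); err != nil {
//		logrus.WithError(err).Fatal("Invalid kubernetes options")
//	}
//	// Exit when the kubeconfig changes so the kubelet restarts the component
//	// with fresh credentials.
//	if err := o.AddKubeconfigChangeCallback(func() { os.Exit(0) }); err != nil {
//		logrus.WithError(err).Fatal("Failed to watch kubeconfigs")
//	}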
// AddFlags injects Kubernetes options into the given FlagSet.
func (o *KubernetesOptions) AddFlags(fs *flag.FlagSet) {
	fs.StringVar(&o.kubeconfig, "kubeconfig", "", "Path to .kube/config file. If neither --kubeconfig nor --kubeconfig-dir is provided, use the in-cluster config. All contexts other than the default are used as build clusters.")
	fs.StringVar(&o.kubeconfigDir, "kubeconfig-dir", "", "Path to the directory containing kubeconfig files. If neither --kubeconfig nor --kubeconfig-dir is provided, use the in-cluster config. All contexts other than the default are used as build clusters.")
	fs.StringVar(&o.kubeconfigSuffix, "kubeconfig-suffix", "", "Files without this suffix are ignored when loading kubeconfig files from --kubeconfig-dir. It must be used together with --kubeconfig-dir.")
	fs.StringVar(&o.projectedTokenFile, "projected-token-file", "", "A projected serviceaccount token file. If set, this will be configured as the token file in the in-cluster config.")
	fs.BoolVar(&o.noInClusterConfig, "no-in-cluster-config", o.NOInClusterConfigDefault, "Do not resolve the in-cluster config if set.")
}

// Validate validates Kubernetes options.
func (o *KubernetesOptions) Validate(_ bool) error {
	if o.kubeconfig != "" {
		if _, err := os.Stat(o.kubeconfig); err != nil {
			return fmt.Errorf("error accessing --kubeconfig: %w", err)
		}
	}

	if o.kubeconfigDir != "" {
		if fileInfo, err := os.Stat(o.kubeconfigDir); err != nil {
			return fmt.Errorf("error accessing --kubeconfig-dir: %w", err)
		} else if !fileInfo.IsDir() {
			return fmt.Errorf("--kubeconfig-dir must be a directory")
		}
	}

	if o.kubeconfigSuffix != "" && o.kubeconfigDir == "" {
		return fmt.Errorf("--kubeconfig-dir must be set if --kubeconfig-suffix is set")
	}

	return nil
}

// resolve loads all of the clients we need and caches them for future calls.
func (o *KubernetesOptions) resolve(dryRun bool) error {
	if o.resolved {
		return nil
	}

	o.kubeconfigWach = &sync.Once{}

	clusterConfigs, err := kube.LoadClusterConfigs(kube.NewConfig(kube.ConfigFile(o.kubeconfig),
		kube.ConfigDir(o.kubeconfigDir), kube.ConfigProjectedTokenFile(o.projectedTokenFile),
		kube.NoInClusterConfig(o.noInClusterConfig), kube.ConfigSuffix(o.kubeconfigSuffix),
		kube.DisabledClusters(o.disabledClusters)))
	if err != nil {
		return fmt.Errorf("load --kubeconfig=%q configs: %w", o.kubeconfig, err)
	}
	o.clusterConfigs = clusterConfigs

	clients := map[string]kubernetes.Interface{}
	for context, config := range clusterConfigs {
		client, err := kubernetes.NewForConfig(&config)
		if err != nil {
			return fmt.Errorf("create %s kubernetes client: %w", context, err)
		}
		clients[context] = client
	}

	localCfg := clusterConfigs[kube.InClusterContext]
	o.infrastructureClusterConfig = &localCfg
	pjClient, err := prow.NewForConfig(&localCfg)
	if err != nil {
		return err
	}

	o.dryRun = dryRun
	if dryRun {
		return nil
	}

	o.prowJobClientset = pjClient
	o.kubernetesClientsByContext = clients
	o.resolved = true

	return nil
}

// ProwJobClientset returns a ProwJob clientset for use in informer factories.
func (o *KubernetesOptions) ProwJobClientset(dryRun bool) (prowJobClientset prow.Interface, err error) {
	if err := o.resolve(dryRun); err != nil {
		return nil, err
	}

	if o.dryRun {
		return nil, errors.New("no dry-run prowjob clientset is supported in dry-run mode")
	}

	return o.prowJobClientset, nil
}
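// A sketch of consuming the ProwJob clientset above; the "default" namespace
// is an assumption, and dryRun must be false because dry-run mode returns an
// error instead of a clientset:
//
//	clientset, err := o.ProwJobClientset(false)
//	if err != nil {
//		logrus.WithError(err).Fatal("Failed to create prowjob clientset")
//	}
//	pjs, err := clientset.ProwV1().ProwJobs("default").List(context.TODO(), metav1.ListOptions{})
//	if err != nil {
//		logrus.WithError(err).Fatal("Failed to list prowjobs")
//	}
//	logrus.Infof("Found %d prowjobs", len(pjs.Items))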
// ProwJobClient returns a ProwJob client.
func (o *KubernetesOptions) ProwJobClient(namespace string, dryRun bool) (prowJobClient prowv1.ProwJobInterface, err error) {
	if err := o.resolve(dryRun); err != nil {
		return nil, err
	}

	if o.dryRun {
		return nil, errors.New("no dry-run prowjob client is supported in dry-run mode")
	}
	return o.prowJobClientset.ProwV1().ProwJobs(namespace), nil
}

// InfrastructureClusterConfig returns the *rest.Config for the infrastructure cluster.
func (o *KubernetesOptions) InfrastructureClusterConfig(dryRun bool) (*rest.Config, error) {
	if err := o.resolve(dryRun); err != nil {
		return nil, err
	}

	return o.infrastructureClusterConfig, nil
}

// InfrastructureClusterClient returns a Kubernetes client for the infrastructure cluster.
func (o *KubernetesOptions) InfrastructureClusterClient(dryRun bool) (kubernetesClient kubernetes.Interface, err error) {
	return o.ClusterClientForContext(kube.InClusterContext, dryRun)
}

// ClusterClientForContext returns a Kubernetes client for the given context name.
func (o *KubernetesOptions) ClusterClientForContext(context string, dryRun bool) (kubernetesClient kubernetes.Interface, err error) {
	if err := o.resolve(dryRun); err != nil {
		return nil, err
	}

	if o.dryRun {
		return nil, errors.New("no dry-run kubernetes client is supported in dry-run mode")
	}

	client, exists := o.kubernetesClientsByContext[context]
	if !exists {
		return nil, fmt.Errorf("context %q does not exist in the provided config", context)
	}
	return client, nil
}

// BuildClusterClients returns Pod clients for build clusters.
func (o *KubernetesOptions) BuildClusterClients(namespace string, dryRun bool) (buildClusterClients map[string]corev1.PodInterface, err error) {
	if err := o.resolve(dryRun); err != nil {
		return nil, err
	}

	if o.dryRun {
		return nil, errors.New("no dry-run pod client is supported for build clusters in dry-run mode")
	}

	buildClients := map[string]corev1.PodInterface{}
	for context, client := range o.kubernetesClientsByContext {
		buildClients[context] = client.CoreV1().Pods(namespace)
	}
	return buildClients, nil
}

// BuildClusterCoreV1Clients returns core v1 clients for build clusters.
func (o *KubernetesOptions) BuildClusterCoreV1Clients(dryRun bool) (v1Clients map[string]corev1.CoreV1Interface, err error) {
	if err := o.resolve(dryRun); err != nil {
		return nil, err
	}

	if o.dryRun {
		return nil, errors.New("no dry-run pod client is supported for build clusters in dry-run mode")
	}

	clients := map[string]corev1.CoreV1Interface{}
	for context, client := range o.kubernetesClientsByContext {
		clients[context] = client.CoreV1()
	}
	return clients, nil
}
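// A sketch of using the per-context pod clients above to count pods in each
// build cluster; the "test-pods" namespace is an assumption:
//
//	podClients, err := o.BuildClusterClients("test-pods", false)
//	if err != nil {
//		logrus.WithError(err).Fatal("Failed to create build cluster pod clients")
//	}
//	for clusterContext, podClient := range podClients {
//		pods, err := podClient.List(context.TODO(), metav1.ListOptions{})
//		if err != nil {
//			logrus.WithError(err).WithField("cluster", clusterContext).Warn("Failed to list pods")
//			continue
//		}
//		logrus.Infof("Cluster %s has %d pods", clusterContext, len(pods.Items))
//	}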
var clientCreationFailures = prometheus.NewCounterVec(prometheus.CounterOpts{
	Name: "kubernetes_failed_client_creations",
	Help: "The number of clusters for which we failed to create a client.",
}, []string{"cluster"})

// BuildClusterManagers returns a manager per build cluster.
// By default, LeaderElection and the metrics listener are disabled, as we assume
// that there is another manager for ProwJobs that handles that.
func (o *KubernetesOptions) BuildClusterManagers(dryRun bool, requiredTestPodVerbs []string, callBack func(), namespace string, opts ...func(*manager.Options)) (map[string]manager.Manager, error) {
	if err := o.resolve(dryRun); err != nil {
		return nil, err
	}

	options := manager.Options{
		LeaderElection: false,
		Metrics: server.Options{
			BindAddress: "0",
		},
		Client: ctrlruntimeclient.Options{
			DryRun: &o.dryRun,
		},
		Cache: cache.Options{
			DefaultNamespaces: map[string]cache.Config{
				namespace: {},
			},
		},
	}
	for _, opt := range opts {
		opt(&options)
	}

	res := map[string]manager.Manager{}
	var errs []error
	var lock sync.Mutex
	var threads sync.WaitGroup
	threads.Add(len(o.clusterConfigs))
	for buildClusterName, buildClusterConfig := range o.clusterConfigs {
		go func(name string, config rest.Config) {
			defer threads.Done()
			// This fails if we are unable to connect to the cluster --- either
			// due to missing or expired kubeconfig secrets, or if some other
			// auth-related executable (e.g., gke-gcloud-auth-plugin) is missing
			// from the base image.
			mgr, err := manager.New(&config, options)
			if err != nil {
				clientCreationFailures.WithLabelValues(name).Add(1)
				lock.Lock()
				errs = append(errs, fmt.Errorf("failed to construct manager for cluster %s: %w", name, err))
				lock.Unlock()
				return
			}

			// Check to see if we are able to perform actions against pods in
			// the build cluster. The actions are given in requiredTestPodVerbs.
			authzClient, err := authorizationv1.NewForConfig(&config)
			if err != nil {
				lock.Lock()
				errs = append(errs, fmt.Errorf("failed to construct authz client for cluster %s: %w", name, err))
				lock.Unlock()
				return
			}
			if err := CheckAuthorizations(authzClient.SelfSubjectAccessReviews(), namespace, requiredTestPodVerbs); err != nil {
				lock.Lock()
				errs = append(errs, fmt.Errorf("failed pod resource authorization check for cluster %s: %w", name, err))
				lock.Unlock()
				return
			}

			lock.Lock()
			res[name] = mgr
			lock.Unlock()
		}(buildClusterName, buildClusterConfig)
	}
	threads.Wait()

	aggregatedErr := utilerrors.NewAggregate(errs)

	if aggregatedErr != nil {
		// Retry the build clusters that failed to be connected initially. If
		// suddenly we can connect to them successfully, execute the callback
		// function (e.g., to terminate this pod to force a restart). This is
		// useful when a build cluster is transiently unreachable, for example
		// when an API server upgrade causes connection problems.
		go func() {
			for {
				for buildClusterName, buildClusterConfig := range o.clusterConfigs {
					// Do not check already-successfully-checked build clusters.
					if _, ok := res[buildClusterName]; ok {
						continue
					}

					// If there are any errors with this (still troublesome)
					// build cluster, keep checking.
					if _, err := manager.New(&buildClusterConfig, options); err != nil {
						logrus.WithField("build-cluster", buildClusterName).Tracef("failed to construct build cluster manager: %s", err)
						continue
					}

					authzClient, err := authorizationv1.NewForConfig(&buildClusterConfig)
					if err != nil {
						logrus.WithField("build-cluster", buildClusterName).Tracef("failed to construct authz client: %s", err)
						continue
					}
					if err := CheckAuthorizations(authzClient.SelfSubjectAccessReviews(), namespace, requiredTestPodVerbs); err != nil {
						logrus.WithField("build-cluster", buildClusterName).Tracef("failed pod resource authorization check: %s", err)
						continue
					}

					logrus.WithField("build-cluster", buildClusterName).Info("Build cluster that initially failed to connect is now reachable.")
					callBack()
				}
				// Sleep an arbitrary amount of time before the next round of checks.
				time.Sleep(5 * time.Second)
			}
		}()
	} else {
		logrus.Debug("No errors constructing managers for build clusters; skipping build cluster polling.")
	}
	return res, aggregatedErr
}
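// A sketch of constructing and starting the per-cluster managers above; the
// verbs, namespace, metrics address, os.Exit callback, and the caller-provided
// ctx are assumptions, and the trailing opts function simply overrides the
// default manager.Options built inside BuildClusterManagers:
//
//	managers, err := o.BuildClusterManagers(false,
//		[]string{"get", "list", "watch", "create", "delete"},
//		func() { os.Exit(0) },
//		"test-pods",
//		func(opts *manager.Options) { opts.Metrics.BindAddress = ":9090" },
//	)
//	if err != nil {
//		logrus.WithError(err).Warn("Failed to construct some build cluster managers")
//	}
//	for name, mgr := range managers {
//		go func(name string, mgr manager.Manager) {
//			if err := mgr.Start(ctx); err != nil {
//				logrus.WithError(err).WithField("cluster", name).Error("Manager exited")
//			}
//		}(name, mgr)
//	}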
// CheckAuthorizations checks if we are able to perform the required actions
// against test pods for the provided pod verbs (requiredTestPodVerbs).
func CheckAuthorizations(client authorizationv1.SelfSubjectAccessReviewInterface, namespace string, requiredTestPodVerbs []string) error {
	var errs []error
	// Unfortunately we have to do multiple API requests because there is no way
	// to check for multiple verbs on a resource at once. The closest
	// alternative is the "*" wildcard verb, but that appears to be overbroad
	// and fails on the integration test cluster. The approach we take here is
	// essentially equivalent to the following kubectl command:
	//
	//	$ cat <<EOF | kubectl --context=kind-kind-prow-integration create -f - -v 8
	//	apiVersion: authorization.k8s.io/v1
	//	kind: SubjectAccessReview
	//	spec:
	//	  resourceAttributes:
	//	    resource: pods
	//	    verb: list # also test for get, create, etc
	//	    namespace: test-pods
	//	  user: system:serviceaccount:default:prow-controller-manager
	//	EOF
	//
	// The difference in our case is that (1) we are running the below check
	// *inside* the main service cluster itself, (2) we are running the check
	// against an entirely different build cluster, and (3) we are using
	// SelfSubjectAccessReview so that we don't have to provide a `user` field
	// (so that this code can work with whatever user is the default when we're
	// connecting to the build cluster).
	//
	// See
	// https://kubernetes.io/docs/reference/access-authn-authz/authorization/#checking-api-access
	// for more information.
	for _, verb := range requiredTestPodVerbs {
		ssar := k8sauthorizationv1.SelfSubjectAccessReview{
			Spec: k8sauthorizationv1.SelfSubjectAccessReviewSpec{
				ResourceAttributes: &k8sauthorizationv1.ResourceAttributes{
					Namespace: namespace,
					Verb:      verb,
					Resource:  "pods",
				},
			},
		}
		ssarExpanded, err := client.Create(context.TODO(), &ssar, metav1.CreateOptions{})
		if err != nil {
			errs = append(errs, err)
			continue
		}

		if !ssarExpanded.Status.Allowed {
			errs = append(errs, fmt.Errorf("%w: unable to %q pods", MissingPermissions, verb))
		}
	}

	return utilerrors.NewAggregate(errs)
}
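// For debugging a failed authorization check by hand, the per-verb
// SelfSubjectAccessReview issued above is roughly equivalent to running the
// following against the build cluster (the context, namespace, and verbs here
// are examples):
//
//	kubectl --context=my-build-cluster auth can-i list pods --namespace=test-pods
//	kubectl --context=my-build-cluster auth can-i create pods --namespace=test-pods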
// BuildClusterUncachedRuntimeClients returns ctrlruntimeclients for the build clusters in a non-caching implementation.
func (o *KubernetesOptions) BuildClusterUncachedRuntimeClients(dryRun bool) (map[string]ctrlruntimeclient.Client, error) {
	if err := o.resolve(dryRun); err != nil {
		return nil, err
	}

	var errs []error
	clients := map[string]ctrlruntimeclient.Client{}
	for name := range o.clusterConfigs {
		cfg := o.clusterConfigs[name]
		client, err := ctrlruntimeclient.New(&cfg, ctrlruntimeclient.Options{})
		if err != nil {
			clientCreationFailures.WithLabelValues(name).Add(1)
			errs = append(errs, fmt.Errorf("failed to construct client for cluster %q: %w", name, err))
			continue
		}
		if o.dryRun {
			client = ctrlruntimeclient.NewDryRunClient(client)
		}
		clients[name] = client
	}

	return clients, utilerrors.NewAggregate(errs)
}

// KnownClusters returns the resolved rest.Configs for all known clusters.
func (o *KubernetesOptions) KnownClusters(dryRun bool) (map[string]rest.Config, error) {
	if err := o.resolve(dryRun); err != nil {
		return nil, err
	}
	return o.clusterConfigs, nil
}

// SetDisabledClusters sets disabledClusters.
// It has no effect if the options have already been resolved.
func (o *KubernetesOptions) SetDisabledClusters(disabledClusters sets.Set[string]) {
	if o.resolved {
		logrus.WithField("disabledClusters", o.disabledClusters).Warn("SetDisabledClusters must be called before the options are resolved")
		return
	}
	o.disabledClusters = disabledClusters
}
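// A sketch of the ordering SetDisabledClusters requires: it must run before
// anything triggers resolution (the cluster name is illustrative):
//
//	o.SetDisabledClusters(sets.New[string]("flaky-build-cluster"))
//	// Only afterwards ask for configs or clients, which resolves the options.
//	configs, err := o.LoadClusterConfigs()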