/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package flagutil

import (
	"context"
	"errors"
	"flag"
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/sirupsen/logrus"
	"gopkg.in/fsnotify.v1"

	k8sauthorizationv1 "k8s.io/api/authorization/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	utilerrors "k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/client-go/kubernetes"
	authorizationv1 "k8s.io/client-go/kubernetes/typed/authorization/v1"
	corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/clientcmd"
	ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/manager"

	prow "sigs.k8s.io/prow/pkg/client/clientset/versioned"
	prowv1 "sigs.k8s.io/prow/pkg/client/clientset/versioned/typed/prowjobs/v1"
	"sigs.k8s.io/prow/pkg/kube"
)

// init registers the clientCreationFailures counter with the default
// Prometheus registry so that failed client creations are exported as a
// metric.
func init() {
	prometheus.MustRegister(clientCreationFailures)
}

// KubernetesOptions holds options for interacting with Kubernetes.
// These options are both useful for clients interacting with ProwJobs
// and other resources on the infrastructure cluster, as well as Pods
// on build clusters.
59 type KubernetesOptions struct { 60 kubeconfig string 61 kubeconfigDir string 62 kubeconfigSuffix string 63 projectedTokenFile string 64 noInClusterConfig bool 65 NOInClusterConfigDefault bool 66 67 // from the setter SetDisabledClusters 68 disabledClusters sets.Set[string] 69 70 // from resolution 71 resolved bool 72 dryRun bool 73 prowJobClientset prow.Interface 74 clusterConfigs map[string]rest.Config 75 kubernetesClientsByContext map[string]kubernetes.Interface 76 infrastructureClusterConfig *rest.Config 77 kubeconfigWach *sync.Once 78 kubeconfigWatchEvents <-chan fsnotify.Event 79 } 80 81 var MissingPermissions = errors.New("missing permissions") 82 83 // AddKubeconfigChangeCallback adds a callback that gets called whenever the kubeconfig changes. 84 // The main usecase for this is to exit components that can not reload a kubeconfig at runtime 85 // so the kubelet restarts them 86 func (o *KubernetesOptions) AddKubeconfigChangeCallback(callback func()) error { 87 if err := o.resolve(o.dryRun); err != nil { 88 return fmt.Errorf("resolving failed: %w", err) 89 } 90 91 var err error 92 o.kubeconfigWach.Do(func() { 93 var watcher *fsnotify.Watcher 94 watcher, err = fsnotify.NewWatcher() 95 if err != nil { 96 err = fmt.Errorf("failed to create watcher: %w", err) 97 return 98 } 99 if o.kubeconfig != "" { 100 err = watcher.Add(o.kubeconfig) 101 if err != nil { 102 err = fmt.Errorf("failed to watch %s: %w", o.kubeconfig, err) 103 return 104 } 105 } 106 if o.kubeconfigDir != "" { 107 err = watcher.Add(o.kubeconfigDir) 108 if err != nil { 109 err = fmt.Errorf("failed to watch %s: %w", o.kubeconfigDir, err) 110 return 111 } 112 } 113 if o.kubeconfig == "" && o.kubeconfigDir == "" { 114 if envVal := os.Getenv(clientcmd.RecommendedConfigPathEnvVar); envVal != "" { 115 for _, element := range sets.List(sets.New[string](filepath.SplitList(envVal)...)) { 116 err = watcher.Add(element) 117 if err != nil { 118 err = fmt.Errorf("failed to watch %s: %w", element, err) 119 
return 120 } 121 } 122 } 123 } 124 o.kubeconfigWatchEvents = watcher.Events 125 126 go func() { 127 for watchErr := range watcher.Errors { 128 logrus.WithError(watchErr).Error("Kubeconfig watcher errored") 129 } 130 if err := watcher.Close(); err != nil { 131 logrus.WithError(err).Error("Failed to close watcher") 132 } 133 }() 134 }) 135 if err != nil { 136 return fmt.Errorf("failed to set up watches: %w", err) 137 } 138 139 go func() { 140 for e := range o.kubeconfigWatchEvents { 141 if e.Op == fsnotify.Chmod { 142 // For some reason we get frequent chmod events 143 continue 144 } 145 logrus.WithField("event", e.String()).Info("Kubeconfig changed") 146 callback() 147 } 148 }() 149 150 return nil 151 } 152 153 // LoadClusterConfigs returns the resolved rest.Configs and each callback function will be executed if 154 // the underlying kubeconfig files are modified. This function is for the case where the rest.Configs are 155 // needed without interests of the clients. 156 func (o *KubernetesOptions) LoadClusterConfigs(callBacks ...func()) (map[string]rest.Config, error) { 157 var errs []error 158 if !o.resolved { 159 if err := o.resolve(o.dryRun); err != nil { 160 errs = append(errs, fmt.Errorf("failed to resolve the kubeneates options: %w", err)) 161 } 162 } 163 164 if o.kubeconfig == "" && o.kubeconfigDir == "" { 165 if envVal := os.Getenv(clientcmd.RecommendedConfigPathEnvVar); envVal != "" { 166 if kubeconfigsFromEnv := strings.Split(envVal, ":"); len(kubeconfigsFromEnv) > 0 && 167 len(kubeconfigsFromEnv) > len(o.clusterConfigs) { 168 errs = append(errs, fmt.Errorf("%s env var with value %s had %d elements but only got %d kubeconfigs", 169 clientcmd.RecommendedConfigPathEnvVar, envVal, len(kubeconfigsFromEnv), len(o.clusterConfigs))) 170 } 171 } 172 } 173 174 for i, callBack := range callBacks { 175 if callBack != nil { 176 if err := o.AddKubeconfigChangeCallback(callBack); err != nil { 177 errs = append(errs, fmt.Errorf("failed to add the %d-th kubeconfig change 
call back: %w", i, err)) 178 } 179 } 180 } 181 return o.clusterConfigs, utilerrors.NewAggregate(errs) 182 } 183 184 // AddFlags injects Kubernetes options into the given FlagSet. 185 func (o *KubernetesOptions) AddFlags(fs *flag.FlagSet) { 186 fs.StringVar(&o.kubeconfig, "kubeconfig", "", "Path to .kube/config file. If neither of --kubeconfig and --kubeconfig-dir is provided, use the in-cluster config. All contexts other than the default are used as build clusters.") 187 fs.StringVar(&o.kubeconfigDir, "kubeconfig-dir", "", "Path to the directory containing kubeconfig files. If neither of --kubeconfig and --kubeconfig-dir is provided, use the in-cluster config. All contexts other than the default are used as build clusters.") 188 fs.StringVar(&o.kubeconfigSuffix, "kubeconfig-suffix", "", "The files without the suffix will be ignored when loading kubeconfig files from --kubeconfig-dir. It must be used together with --kubeconfig-dir.") 189 fs.StringVar(&o.projectedTokenFile, "projected-token-file", "", "A projected serviceaccount token file. If set, this will be configured as token file in the in-cluster config.") 190 fs.BoolVar(&o.noInClusterConfig, "no-in-cluster-config", o.NOInClusterConfigDefault, "Not resolving InCluster Config if set.") 191 } 192 193 // Validate validates Kubernetes options. 
194 func (o *KubernetesOptions) Validate(_ bool) error { 195 if o.kubeconfig != "" { 196 if _, err := os.Stat(o.kubeconfig); err != nil { 197 return fmt.Errorf("error accessing --kubeconfig: %w", err) 198 } 199 } 200 201 if o.kubeconfigDir != "" { 202 if fileInfo, err := os.Stat(o.kubeconfigDir); err != nil { 203 return fmt.Errorf("error accessing --kubeconfig-dir: %w", err) 204 } else if !fileInfo.IsDir() { 205 return fmt.Errorf("--kubeconfig-dir must be a directory") 206 } 207 } 208 209 if o.kubeconfigSuffix != "" && o.kubeconfigDir == "" { 210 return fmt.Errorf("--kubeconfig-dir must be set if --kubeconfig-suffix is set") 211 } 212 213 return nil 214 } 215 216 // resolve loads all of the clients we need and caches them for future calls. 217 func (o *KubernetesOptions) resolve(dryRun bool) error { 218 if o.resolved { 219 return nil 220 } 221 222 o.kubeconfigWach = &sync.Once{} 223 224 clusterConfigs, err := kube.LoadClusterConfigs(kube.NewConfig(kube.ConfigFile(o.kubeconfig), 225 kube.ConfigDir(o.kubeconfigDir), kube.ConfigProjectedTokenFile(o.projectedTokenFile), 226 kube.NoInClusterConfig(o.noInClusterConfig), kube.ConfigSuffix(o.kubeconfigSuffix), 227 kube.DisabledClusters(o.disabledClusters))) 228 if err != nil { 229 return fmt.Errorf("load --kubeconfig=%q configs: %w", o.kubeconfig, err) 230 } 231 o.clusterConfigs = clusterConfigs 232 233 clients := map[string]kubernetes.Interface{} 234 for context, config := range clusterConfigs { 235 client, err := kubernetes.NewForConfig(&config) 236 if err != nil { 237 return fmt.Errorf("create %s kubernetes client: %w", context, err) 238 } 239 clients[context] = client 240 } 241 242 localCfg := clusterConfigs[kube.InClusterContext] 243 o.infrastructureClusterConfig = &localCfg 244 pjClient, err := prow.NewForConfig(&localCfg) 245 if err != nil { 246 return err 247 } 248 249 o.dryRun = dryRun 250 if dryRun { 251 return nil 252 } 253 254 o.prowJobClientset = pjClient 255 o.kubernetesClientsByContext = clients 256 
o.resolved = true 257 258 return nil 259 } 260 261 // ProwJobClientset returns a ProwJob clientset for use in informer factories. 262 func (o *KubernetesOptions) ProwJobClientset(dryRun bool) (prowJobClientset prow.Interface, err error) { 263 if err := o.resolve(dryRun); err != nil { 264 return nil, err 265 } 266 267 if o.dryRun { 268 return nil, errors.New("no dry-run prowjob clientset is supported in dry-run mode") 269 } 270 271 return o.prowJobClientset, nil 272 } 273 274 // ProwJobClient returns a ProwJob client. 275 func (o *KubernetesOptions) ProwJobClient(namespace string, dryRun bool) (prowJobClient prowv1.ProwJobInterface, err error) { 276 if err := o.resolve(dryRun); err != nil { 277 return nil, err 278 } 279 280 if o.dryRun { 281 return nil, errors.New("no dry-run prowjob client is supported in dry-run mode") 282 } 283 return o.prowJobClientset.ProwV1().ProwJobs(namespace), nil 284 } 285 286 // InfrastructureClusterConfig returns the *rest.Config for the infrastructure cluster 287 func (o *KubernetesOptions) InfrastructureClusterConfig(dryRun bool) (*rest.Config, error) { 288 if err := o.resolve(dryRun); err != nil { 289 return nil, err 290 } 291 292 return o.infrastructureClusterConfig, nil 293 } 294 295 // InfrastructureClusterClient returns a Kubernetes client for the infrastructure cluster. 296 func (o *KubernetesOptions) InfrastructureClusterClient(dryRun bool) (kubernetesClient kubernetes.Interface, err error) { 297 return o.ClusterClientForContext(kube.InClusterContext, dryRun) 298 } 299 300 // ClusterClientForContext returns a Kubernetes client for the given context name. 
301 func (o *KubernetesOptions) ClusterClientForContext(context string, dryRun bool) (kubernetesClient kubernetes.Interface, err error) { 302 if err := o.resolve(dryRun); err != nil { 303 return nil, err 304 } 305 306 if o.dryRun { 307 return nil, errors.New("no dry-run kubernetes client is supported in dry-run mode") 308 } 309 310 client, exists := o.kubernetesClientsByContext[context] 311 if !exists { 312 return nil, fmt.Errorf("context %q does not exist in the provided config", context) 313 } 314 return client, nil 315 } 316 317 // BuildClusterClients returns Pod clients for build clusters. 318 func (o *KubernetesOptions) BuildClusterClients(namespace string, dryRun bool) (buildClusterClients map[string]corev1.PodInterface, err error) { 319 if err := o.resolve(dryRun); err != nil { 320 return nil, err 321 } 322 323 if o.dryRun { 324 return nil, errors.New("no dry-run pod client is supported for build clusters in dry-run mode") 325 } 326 327 buildClients := map[string]corev1.PodInterface{} 328 for context, client := range o.kubernetesClientsByContext { 329 buildClients[context] = client.CoreV1().Pods(namespace) 330 } 331 return buildClients, nil 332 } 333 334 // BuildClusterCoreV1Clients returns core v1 clients for build clusters. 
335 func (o *KubernetesOptions) BuildClusterCoreV1Clients(dryRun bool) (v1Clients map[string]corev1.CoreV1Interface, err error) { 336 if err := o.resolve(dryRun); err != nil { 337 return nil, err 338 } 339 340 if o.dryRun { 341 return nil, errors.New("no dry-run pod client is supported for build clusters in dry-run mode") 342 } 343 344 clients := map[string]corev1.CoreV1Interface{} 345 for context, client := range o.kubernetesClientsByContext { 346 clients[context] = client.CoreV1() 347 } 348 return clients, nil 349 } 350 351 var clientCreationFailures = prometheus.NewCounterVec(prometheus.CounterOpts{ 352 Name: "kubernetes_failed_client_creations", 353 Help: "The number of clusters for which we failed to create a client.", 354 }, []string{"cluster"}) 355 356 // BuildClusterManagers returns a manager per buildCluster. 357 // Per default, LeaderElection and the metrics listener are disabled, as we assume 358 // that there is another manager for ProwJobs that handles that. 359 func (o *KubernetesOptions) BuildClusterManagers(dryRun bool, requiredTestPodVerbs []string, callBack func(), opts ...func(*manager.Options)) (map[string]manager.Manager, error) { 360 if err := o.resolve(dryRun); err != nil { 361 return nil, err 362 } 363 364 options := manager.Options{ 365 LeaderElection: false, 366 MetricsBindAddress: "0", 367 DryRunClient: o.dryRun, 368 } 369 for _, opt := range opts { 370 opt(&options) 371 } 372 373 res := map[string]manager.Manager{} 374 var errs []error 375 var lock sync.Mutex 376 var threads sync.WaitGroup 377 threads.Add(len(o.clusterConfigs)) 378 for buildClusterName, buildClusterConfig := range o.clusterConfigs { 379 go func(name string, config rest.Config) { 380 defer threads.Done() 381 // This fails if we are unable to connect to the cluster --- either 382 // due to missing or expired kubeconfig secrets, or if some other 383 // auth-related executable (e.g., gke-gcloud-auth-plugin) is missing 384 // from the base image. 
385 mgr, err := manager.New(&config, options) 386 if err != nil { 387 clientCreationFailures.WithLabelValues(name).Add(1) 388 lock.Lock() 389 errs = append(errs, fmt.Errorf("failed to construct manager for cluster %s: %w", name, err)) 390 lock.Unlock() 391 return 392 } 393 394 // Check to see if we are able to perform actions against pods in 395 // the build cluster. The actions are given in requiredTestPodVerbs. 396 authzClient, err := authorizationv1.NewForConfig(&config) 397 if err != nil { 398 lock.Lock() 399 errs = append(errs, fmt.Errorf("failed to construct authz client for cluster %s: %s", name, err)) 400 lock.Unlock() 401 return 402 } 403 if err := CheckAuthorizations(authzClient.SelfSubjectAccessReviews(), options.Namespace, requiredTestPodVerbs); err != nil { 404 lock.Lock() 405 errs = append(errs, fmt.Errorf("failed pod resource authorization check for cluster %s: %w", name, err)) 406 lock.Unlock() 407 return 408 } 409 410 lock.Lock() 411 res[name] = mgr 412 lock.Unlock() 413 }(buildClusterName, buildClusterConfig) 414 } 415 threads.Wait() 416 417 aggregatedErr := utilerrors.NewAggregate(errs) 418 419 if aggregatedErr != nil { 420 // Retry the build clusters that failed to be connected initially. If 421 // suddenly we can connect to them successfully, execute the callback 422 // function (e.g., to terminate this pod to force a restart). This is 423 // useful where a build cluster is not reachable transiently, such as 424 // when an API server upgrade causes connection problems. 425 go func() { 426 for { 427 for buildClusterName, buildClusterConfig := range o.clusterConfigs { 428 // Do not check already-successfully-checked build clusters. 429 if _, ok := res[buildClusterName]; ok { 430 continue 431 } 432 433 // If there are any errors with this (still troublesome) 434 // build cluster, keep checking. 
435 if _, err := manager.New(&buildClusterConfig, options); err != nil { 436 logrus.WithField("build-cluster", buildClusterName).Tracef("failed to construct build cluster manager: %s", err) 437 continue 438 } 439 440 authzClient, err := authorizationv1.NewForConfig(&buildClusterConfig) 441 if err != nil { 442 logrus.WithField("build-cluster", buildClusterName).Tracef("failed to construct authz client: %s", err) 443 continue 444 } 445 if err := CheckAuthorizations(authzClient.SelfSubjectAccessReviews(), options.Namespace, requiredTestPodVerbs); err != nil { 446 logrus.WithField("build-cluster", buildClusterName).Tracef("failed to construct build cluster manager: %s", err) 447 continue 448 } 449 450 logrus.WithField("build-cluster", buildClusterName).Info("Build cluster that failed to connect initially now worked.") 451 callBack() 452 } 453 // Sleep arbitrarily amount of time 454 time.Sleep(5 * time.Second) 455 } 456 }() 457 } else { 458 logrus.Debug("No error constructing managers for build clusters, skip polling build clusters.") 459 } 460 return res, aggregatedErr 461 } 462 463 // CheckAuthorizations checks if we are able to perform the required actions 464 // against test pods for the provided pod verbs (requiredTestPodVerbs). 465 func CheckAuthorizations(client authorizationv1.SelfSubjectAccessReviewInterface, namespace string, requiredTestPodVerbs []string) error { 466 467 var errs []error 468 // Unfortunately we have to do multiple API requests because there is no way 469 // to check for multiple verbs on a resource at once. The closest 470 // alternative is the "*" wildcard verb, but that appears to be overbroad 471 // and fails on the integration test cluster. 
The approach we take here is 472 // essentially equivalent to the following kubectl command: 473 // 474 // $ cat <<EOF | kubectl --context=kind-kind-prow-integration create -f - -v 8 475 // apiVersion: authorization.k8s.io/v1 476 // kind: SubjectAccessReview 477 // spec: 478 // resourceAttributes: 479 // resource: pods 480 // verb: list # also test for get, create, etc 481 // namespace: test-pods 482 // user: system:serviceaccount:default:prow-controller-manager 483 // EOF 484 // 485 // The difference in our case is that (1) we are running the below check 486 // *inside* the main service cluster itself, (2) we are running the check 487 // against an entirely different build cluster, and (3) we are using 488 // SelfSubjectAccessReview so that we don't have to provide a `user` field 489 // (so that this code can work with whatever user is the default when we're 490 // connecting to the build cluster). 491 // 492 // See 493 // https://kubernetes.io/docs/reference/access-authn-authz/authorization/#checking-api-access 494 // for more information. 495 for _, verb := range requiredTestPodVerbs { 496 ssar := k8sauthorizationv1.SelfSubjectAccessReview{ 497 Spec: k8sauthorizationv1.SelfSubjectAccessReviewSpec{ 498 ResourceAttributes: &k8sauthorizationv1.ResourceAttributes{ 499 Namespace: namespace, 500 Verb: verb, 501 Resource: "pods", 502 }, 503 }, 504 } 505 ssarExpanded, err := client.Create(context.TODO(), &ssar, metav1.CreateOptions{}) 506 if err != nil { 507 errs = append(errs, err) 508 continue 509 } 510 511 if !ssarExpanded.Status.Allowed { 512 errs = append(errs, fmt.Errorf("%w: unable to %q pods", MissingPermissions, verb)) 513 } 514 } 515 516 return utilerrors.NewAggregate(errs) 517 } 518 519 // BuildClusterUncachedRuntimeClients returns ctrlruntimeclients for the build cluster in a non-caching implementation. 
520 func (o *KubernetesOptions) BuildClusterUncachedRuntimeClients(dryRun bool) (map[string]ctrlruntimeclient.Client, error) { 521 if err := o.resolve(dryRun); err != nil { 522 return nil, err 523 } 524 525 var errs []error 526 clients := map[string]ctrlruntimeclient.Client{} 527 for name := range o.clusterConfigs { 528 cfg := o.clusterConfigs[name] 529 client, err := ctrlruntimeclient.New(&cfg, ctrlruntimeclient.Options{}) 530 if err != nil { 531 clientCreationFailures.WithLabelValues(name).Add(1) 532 errs = append(errs, fmt.Errorf("failed to construct client for cluster %q: %w", name, err)) 533 continue 534 } 535 if o.dryRun { 536 client = ctrlruntimeclient.NewDryRunClient(client) 537 } 538 clients[name] = client 539 } 540 541 return clients, utilerrors.NewAggregate(errs) 542 } 543 544 func (o *KubernetesOptions) KnownClusters(dryRun bool) (map[string]rest.Config, error) { 545 if err := o.resolve(dryRun); err != nil { 546 return nil, err 547 } 548 return o.clusterConfigs, nil 549 } 550 551 // SetDisabledClusters sets disabledClusters 552 // It has no effects if the options have been resolved. 553 func (o *KubernetesOptions) SetDisabledClusters(disabledClusters sets.Set[string]) { 554 if o.resolved { 555 logrus.WithField("disabledClusters", o.disabledClusters).Warn("SetDisabledClusters has to be called before it is resolved") 556 return 557 } 558 o.disabledClusters = disabledClusters 559 }