
     1  /*
     2  Copyright 2019 The Kubernetes Authors.
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    17  package flagutil
    19  import (
    20  	"context"
    21  	"errors"
    22  	"flag"
    23  	"fmt"
    24  	"os"
    25  	"path/filepath"
    26  	"strings"
    27  	"sync"
    28  	"time"
    30  	""
    31  	""
    32  	""
    34  	k8sauthorizationv1 ""
    35  	metav1 ""
    36  	utilerrors ""
    37  	""
    38  	""
    39  	authorizationv1 ""
    40  	corev1 ""
    41  	""
    42  	""
    43  	""
    44  	ctrlruntimeclient ""
    45  	""
    46  	""
    48  	prow ""
    49  	prowv1 ""
    50  	""
    51  )
// init registers the clientCreationFailures counter vector with Prometheus so
// that client construction failures recorded in this file are exported.
func init() {
	prometheus.MustRegister(clientCreationFailures)
}
// KubernetesOptions holds options for interacting with Kubernetes.
// These options are both useful for clients interacting with ProwJobs
// and other resources on the infrastructure cluster, as well as Pods
// on build clusters.
type KubernetesOptions struct {
	// Flag-backed settings, registered by AddFlags:
	//   kubeconfig         <- --kubeconfig
	//   kubeconfigDir      <- --kubeconfig-dir
	//   kubeconfigSuffix   <- --kubeconfig-suffix
	//   projectedTokenFile <- --projected-token-file
	//   noInClusterConfig  <- --no-in-cluster-config
	// NOInClusterConfigDefault is the default value AddFlags uses for
	// --no-in-cluster-config; it has to be set before AddFlags is called.
	kubeconfig               string
	kubeconfigDir            string
	kubeconfigSuffix         string
	projectedTokenFile       string
	noInClusterConfig        bool
	NOInClusterConfigDefault bool

	// from the setter SetDisabledClusters; passed on to the config loader by
	// resolve.
	disabledClusters sets.Set[string]

	// from resolution (all written by resolve unless noted otherwise):
	//   resolved                    true once a non-dry-run resolve succeeded
	//   dryRun                      dryRun value of the latest resolve that got past client creation
	//   prowJobClientset            clientset built from the infrastructure cluster config
	//   clusterConfigs              loaded rest.Configs, keyed by context
	//   kubernetesClientsByContext  one Kubernetes client per loaded config
	//   infrastructureClusterConfig config stored under kube.InClusterContext
	//   kubeconfigWach              (sic) guards the one-time watcher setup in
	//                               AddKubeconfigChangeCallback
	//   kubeconfigWatchEvents       event channel of that watcher, written by
	//                               AddKubeconfigChangeCallback
	resolved                    bool
	dryRun                      bool
	prowJobClientset            prow.Interface
	clusterConfigs              map[string]rest.Config
	kubernetesClientsByContext  map[string]kubernetes.Interface
	infrastructureClusterConfig *rest.Config
	kubeconfigWach              *sync.Once
	kubeconfigWatchEvents       <-chan fsnotify.Event
}
// MissingPermissions is the sentinel error that CheckAuthorizations wraps when
// an access review reports that a required pod verb is not allowed. Because it
// is wrapped with %w, callers can test for it with errors.Is.
var MissingPermissions = errors.New("missing permissions")
    85  // AddKubeconfigChangeCallback adds a callback that gets called whenever the kubeconfig changes.
    86  // The main usecase for this is to exit components that can not reload a kubeconfig at runtime
    87  // so the kubelet restarts them
    88  func (o *KubernetesOptions) AddKubeconfigChangeCallback(callback func()) error {
    89  	if err := o.resolve(o.dryRun); err != nil {
    90  		return fmt.Errorf("resolving failed: %w", err)
    91  	}
    93  	var err error
    94  	o.kubeconfigWach.Do(func() {
    95  		var watcher *fsnotify.Watcher
    96  		watcher, err = fsnotify.NewWatcher()
    97  		if err != nil {
    98  			err = fmt.Errorf("failed to create watcher: %w", err)
    99  			return
   100  		}
   101  		if o.kubeconfig != "" {
   102  			err = watcher.Add(o.kubeconfig)
   103  			if err != nil {
   104  				err = fmt.Errorf("failed to watch %s: %w", o.kubeconfig, err)
   105  				return
   106  			}
   107  		}
   108  		if o.kubeconfigDir != "" {
   109  			err = watcher.Add(o.kubeconfigDir)
   110  			if err != nil {
   111  				err = fmt.Errorf("failed to watch %s: %w", o.kubeconfigDir, err)
   112  				return
   113  			}
   114  		}
   115  		if o.kubeconfig == "" && o.kubeconfigDir == "" {
   116  			if envVal := os.Getenv(clientcmd.RecommendedConfigPathEnvVar); envVal != "" {
   117  				for _, element := range sets.List(sets.New[string](filepath.SplitList(envVal)...)) {
   118  					err = watcher.Add(element)
   119  					if err != nil {
   120  						err = fmt.Errorf("failed to watch %s: %w", element, err)
   121  						return
   122  					}
   123  				}
   124  			}
   125  		}
   126  		o.kubeconfigWatchEvents = watcher.Events
   128  		go func() {
   129  			for watchErr := range watcher.Errors {
   130  				logrus.WithError(watchErr).Error("Kubeconfig watcher errored")
   131  			}
   132  			if err := watcher.Close(); err != nil {
   133  				logrus.WithError(err).Error("Failed to close watcher")
   134  			}
   135  		}()
   136  	})
   137  	if err != nil {
   138  		return fmt.Errorf("failed to set up watches: %w", err)
   139  	}
   141  	go func() {
   142  		for e := range o.kubeconfigWatchEvents {
   143  			if e.Op == fsnotify.Chmod {
   144  				// For some reason we get frequent chmod events
   145  				continue
   146  			}
   147  			logrus.WithField("event", e.String()).Info("Kubeconfig changed")
   148  			callback()
   149  		}
   150  	}()
   152  	return nil
   153  }
   155  // LoadClusterConfigs returns the resolved rest.Configs and each callback function will be executed if
   156  // the underlying kubeconfig files are modified. This function is for the case where the rest.Configs are
   157  // needed without interests of the clients.
   158  func (o *KubernetesOptions) LoadClusterConfigs(callBacks ...func()) (map[string]rest.Config, error) {
   159  	var errs []error
   160  	if !o.resolved {
   161  		if err := o.resolve(o.dryRun); err != nil {
   162  			errs = append(errs, fmt.Errorf("failed to resolve the kubeneates options: %w", err))
   163  		}
   164  	}
   166  	if o.kubeconfig == "" && o.kubeconfigDir == "" {
   167  		if envVal := os.Getenv(clientcmd.RecommendedConfigPathEnvVar); envVal != "" {
   168  			if kubeconfigsFromEnv := strings.Split(envVal, ":"); len(kubeconfigsFromEnv) > 0 &&
   169  				len(kubeconfigsFromEnv) > len(o.clusterConfigs) {
   170  				errs = append(errs, fmt.Errorf("%s env var with value %s had %d elements but only got %d kubeconfigs",
   171  					clientcmd.RecommendedConfigPathEnvVar, envVal, len(kubeconfigsFromEnv), len(o.clusterConfigs)))
   172  			}
   173  		}
   174  	}
   176  	for i, callBack := range callBacks {
   177  		if callBack != nil {
   178  			if err := o.AddKubeconfigChangeCallback(callBack); err != nil {
   179  				errs = append(errs, fmt.Errorf("failed to add the %d-th kubeconfig change call back: %w", i, err))
   180  			}
   181  		}
   182  	}
   183  	return o.clusterConfigs, utilerrors.NewAggregate(errs)
   184  }
   186  // AddFlags injects Kubernetes options into the given FlagSet.
   187  func (o *KubernetesOptions) AddFlags(fs *flag.FlagSet) {
   188  	fs.StringVar(&o.kubeconfig, "kubeconfig", "", "Path to .kube/config file. If neither of --kubeconfig and --kubeconfig-dir is provided, use the in-cluster config. All contexts other than the default are used as build clusters.")
   189  	fs.StringVar(&o.kubeconfigDir, "kubeconfig-dir", "", "Path to the directory containing kubeconfig files. If neither of --kubeconfig and --kubeconfig-dir is provided, use the in-cluster config. All contexts other than the default are used as build clusters.")
   190  	fs.StringVar(&o.kubeconfigSuffix, "kubeconfig-suffix", "", "The files without the suffix will be ignored when loading kubeconfig files from --kubeconfig-dir. It must be used together with --kubeconfig-dir.")
   191  	fs.StringVar(&o.projectedTokenFile, "projected-token-file", "", "A projected serviceaccount token file. If set, this will be configured as token file in the in-cluster config.")
   192  	fs.BoolVar(&o.noInClusterConfig, "no-in-cluster-config", o.NOInClusterConfigDefault, "Not resolving InCluster Config if set.")
   193  }
   195  // Validate validates Kubernetes options.
   196  func (o *KubernetesOptions) Validate(_ bool) error {
   197  	if o.kubeconfig != "" {
   198  		if _, err := os.Stat(o.kubeconfig); err != nil {
   199  			return fmt.Errorf("error accessing --kubeconfig: %w", err)
   200  		}
   201  	}
   203  	if o.kubeconfigDir != "" {
   204  		if fileInfo, err := os.Stat(o.kubeconfigDir); err != nil {
   205  			return fmt.Errorf("error accessing --kubeconfig-dir: %w", err)
   206  		} else if !fileInfo.IsDir() {
   207  			return fmt.Errorf("--kubeconfig-dir must be a directory")
   208  		}
   209  	}
   211  	if o.kubeconfigSuffix != "" && o.kubeconfigDir == "" {
   212  		return fmt.Errorf("--kubeconfig-dir must be set if --kubeconfig-suffix is set")
   213  	}
   215  	return nil
   216  }
   218  // resolve loads all of the clients we need and caches them for future calls.
   219  func (o *KubernetesOptions) resolve(dryRun bool) error {
   220  	if o.resolved {
   221  		return nil
   222  	}
   224  	o.kubeconfigWach = &sync.Once{}
   226  	clusterConfigs, err := kube.LoadClusterConfigs(kube.NewConfig(kube.ConfigFile(o.kubeconfig),
   227  		kube.ConfigDir(o.kubeconfigDir), kube.ConfigProjectedTokenFile(o.projectedTokenFile),
   228  		kube.NoInClusterConfig(o.noInClusterConfig), kube.ConfigSuffix(o.kubeconfigSuffix),
   229  		kube.DisabledClusters(o.disabledClusters)))
   230  	if err != nil {
   231  		return fmt.Errorf("load --kubeconfig=%q configs: %w", o.kubeconfig, err)
   232  	}
   233  	o.clusterConfigs = clusterConfigs
   235  	clients := map[string]kubernetes.Interface{}
   236  	for context, config := range clusterConfigs {
   237  		client, err := kubernetes.NewForConfig(&config)
   238  		if err != nil {
   239  			return fmt.Errorf("create %s kubernetes client: %w", context, err)
   240  		}
   241  		clients[context] = client
   242  	}
   244  	localCfg := clusterConfigs[kube.InClusterContext]
   245  	o.infrastructureClusterConfig = &localCfg
   246  	pjClient, err := prow.NewForConfig(&localCfg)
   247  	if err != nil {
   248  		return err
   249  	}
   251  	o.dryRun = dryRun
   252  	if dryRun {
   253  		return nil
   254  	}
   256  	o.prowJobClientset = pjClient
   257  	o.kubernetesClientsByContext = clients
   258  	o.resolved = true
   260  	return nil
   261  }
   263  // ProwJobClientset returns a ProwJob clientset for use in informer factories.
   264  func (o *KubernetesOptions) ProwJobClientset(dryRun bool) (prowJobClientset prow.Interface, err error) {
   265  	if err := o.resolve(dryRun); err != nil {
   266  		return nil, err
   267  	}
   269  	if o.dryRun {
   270  		return nil, errors.New("no dry-run prowjob clientset is supported in dry-run mode")
   271  	}
   273  	return o.prowJobClientset, nil
   274  }
   276  // ProwJobClient returns a ProwJob client.
   277  func (o *KubernetesOptions) ProwJobClient(namespace string, dryRun bool) (prowJobClient prowv1.ProwJobInterface, err error) {
   278  	if err := o.resolve(dryRun); err != nil {
   279  		return nil, err
   280  	}
   282  	if o.dryRun {
   283  		return nil, errors.New("no dry-run prowjob client is supported in dry-run mode")
   284  	}
   285  	return o.prowJobClientset.ProwV1().ProwJobs(namespace), nil
   286  }
   288  // InfrastructureClusterConfig returns the *rest.Config for the infrastructure cluster
   289  func (o *KubernetesOptions) InfrastructureClusterConfig(dryRun bool) (*rest.Config, error) {
   290  	if err := o.resolve(dryRun); err != nil {
   291  		return nil, err
   292  	}
   294  	return o.infrastructureClusterConfig, nil
   295  }
   297  // InfrastructureClusterClient returns a Kubernetes client for the infrastructure cluster.
   298  func (o *KubernetesOptions) InfrastructureClusterClient(dryRun bool) (kubernetesClient kubernetes.Interface, err error) {
   299  	return o.ClusterClientForContext(kube.InClusterContext, dryRun)
   300  }
   302  // ClusterClientForContext returns a Kubernetes client for the given context name.
   303  func (o *KubernetesOptions) ClusterClientForContext(context string, dryRun bool) (kubernetesClient kubernetes.Interface, err error) {
   304  	if err := o.resolve(dryRun); err != nil {
   305  		return nil, err
   306  	}
   308  	if o.dryRun {
   309  		return nil, errors.New("no dry-run kubernetes client is supported in dry-run mode")
   310  	}
   312  	client, exists := o.kubernetesClientsByContext[context]
   313  	if !exists {
   314  		return nil, fmt.Errorf("context %q does not exist in the provided config", context)
   315  	}
   316  	return client, nil
   317  }
   319  // BuildClusterClients returns Pod clients for build clusters.
   320  func (o *KubernetesOptions) BuildClusterClients(namespace string, dryRun bool) (buildClusterClients map[string]corev1.PodInterface, err error) {
   321  	if err := o.resolve(dryRun); err != nil {
   322  		return nil, err
   323  	}
   325  	if o.dryRun {
   326  		return nil, errors.New("no dry-run pod client is supported for build clusters in dry-run mode")
   327  	}
   329  	buildClients := map[string]corev1.PodInterface{}
   330  	for context, client := range o.kubernetesClientsByContext {
   331  		buildClients[context] = client.CoreV1().Pods(namespace)
   332  	}
   333  	return buildClients, nil
   334  }
   336  // BuildClusterCoreV1Clients returns core v1 clients for build clusters.
   337  func (o *KubernetesOptions) BuildClusterCoreV1Clients(dryRun bool) (v1Clients map[string]corev1.CoreV1Interface, err error) {
   338  	if err := o.resolve(dryRun); err != nil {
   339  		return nil, err
   340  	}
   342  	if o.dryRun {
   343  		return nil, errors.New("no dry-run pod client is supported for build clusters in dry-run mode")
   344  	}
   346  	clients := map[string]corev1.CoreV1Interface{}
   347  	for context, client := range o.kubernetesClientsByContext {
   348  		clients[context] = client.CoreV1()
   349  	}
   350  	return clients, nil
   351  }
// clientCreationFailures counts failed client or manager constructions, with
// the affected cluster's name as the "cluster" label. It is incremented by
// BuildClusterManagers and BuildClusterUncachedRuntimeClients and registered
// with Prometheus in init.
var clientCreationFailures = prometheus.NewCounterVec(prometheus.CounterOpts{
	Name: "kubernetes_failed_client_creations",
	Help: "The number of clusters for which we failed to create a client.",
}, []string{"cluster"})
   358  // BuildClusterManagers returns a manager per buildCluster.
   359  // Per default, LeaderElection and the metrics listener are disabled, as we assume
   360  // that there is another manager for ProwJobs that handles that.
   361  func (o *KubernetesOptions) BuildClusterManagers(dryRun bool, requiredTestPodVerbs []string, callBack func(), namespace string, opts ...func(*manager.Options)) (map[string]manager.Manager, error) {
   362  	if err := o.resolve(dryRun); err != nil {
   363  		return nil, err
   364  	}
   366  	options := manager.Options{
   367  		LeaderElection: false,
   368  		Metrics: server.Options{
   369  			BindAddress: "0",
   370  		},
   371  		Client: ctrlruntimeclient.Options{
   372  			DryRun: &o.dryRun,
   373  		},
   374  		Cache: cache.Options{
   375  			DefaultNamespaces: map[string]cache.Config{
   376  				namespace: {},
   377  			},
   378  		},
   379  	}
   380  	for _, opt := range opts {
   381  		opt(&options)
   382  	}
   384  	res := map[string]manager.Manager{}
   385  	var errs []error
   386  	var lock sync.Mutex
   387  	var threads sync.WaitGroup
   388  	threads.Add(len(o.clusterConfigs))
   389  	for buildClusterName, buildClusterConfig := range o.clusterConfigs {
   390  		go func(name string, config rest.Config) {
   391  			defer threads.Done()
   392  			// This fails if we are unable to connect to the cluster --- either
   393  			// due to missing or expired kubeconfig secrets, or if some other
   394  			// auth-related executable (e.g., gke-gcloud-auth-plugin) is missing
   395  			// from the base image.
   396  			mgr, err := manager.New(&config, options)
   397  			if err != nil {
   398  				clientCreationFailures.WithLabelValues(name).Add(1)
   399  				lock.Lock()
   400  				errs = append(errs, fmt.Errorf("failed to construct manager for cluster %s: %w", name, err))
   401  				lock.Unlock()
   402  				return
   403  			}
   405  			// Check to see if we are able to perform actions against pods in
   406  			// the build cluster. The actions are given in requiredTestPodVerbs.
   407  			authzClient, err := authorizationv1.NewForConfig(&config)
   408  			if err != nil {
   409  				lock.Lock()
   410  				errs = append(errs, fmt.Errorf("failed to construct authz client for cluster %s: %s", name, err))
   411  				lock.Unlock()
   412  				return
   413  			}
   414  			if err := CheckAuthorizations(authzClient.SelfSubjectAccessReviews(), namespace, requiredTestPodVerbs); err != nil {
   415  				lock.Lock()
   416  				errs = append(errs, fmt.Errorf("failed pod resource authorization check for cluster %s: %w", name, err))
   417  				lock.Unlock()
   418  				return
   419  			}
   421  			lock.Lock()
   422  			res[name] = mgr
   423  			lock.Unlock()
   424  		}(buildClusterName, buildClusterConfig)
   425  	}
   426  	threads.Wait()
   428  	aggregatedErr := utilerrors.NewAggregate(errs)
   430  	if aggregatedErr != nil {
   431  		// Retry the build clusters that failed to be connected initially. If
   432  		// suddenly we can connect to them successfully, execute the callback
   433  		// function (e.g., to terminate this pod to force a restart). This is
   434  		// useful where a build cluster is not reachable transiently, such as
   435  		// when an API server upgrade causes connection problems.
   436  		go func() {
   437  			for {
   438  				for buildClusterName, buildClusterConfig := range o.clusterConfigs {
   439  					// Do not check already-successfully-checked build clusters.
   440  					if _, ok := res[buildClusterName]; ok {
   441  						continue
   442  					}
   444  					// If there are any errors with this (still troublesome)
   445  					// build cluster, keep checking.
   446  					if _, err := manager.New(&buildClusterConfig, options); err != nil {
   447  						logrus.WithField("build-cluster", buildClusterName).Tracef("failed to construct build cluster manager: %s", err)
   448  						continue
   449  					}
   451  					authzClient, err := authorizationv1.NewForConfig(&buildClusterConfig)
   452  					if err != nil {
   453  						logrus.WithField("build-cluster", buildClusterName).Tracef("failed to construct authz client: %s", err)
   454  						continue
   455  					}
   456  					if err := CheckAuthorizations(authzClient.SelfSubjectAccessReviews(), namespace, requiredTestPodVerbs); err != nil {
   457  						logrus.WithField("build-cluster", buildClusterName).Tracef("failed to construct build cluster manager: %s", err)
   458  						continue
   459  					}
   461  					logrus.WithField("build-cluster", buildClusterName).Info("Build cluster that failed to connect initially now worked.")
   462  					callBack()
   463  				}
   464  				// Sleep arbitrarily amount of time
   465  				time.Sleep(5 * time.Second)
   466  			}
   467  		}()
   468  	} else {
   469  		logrus.Debug("No error constructing managers for build clusters, skip polling build clusters.")
   470  	}
   471  	return res, aggregatedErr
   472  }
   474  // CheckAuthorizations checks if we are able to perform the required actions
   475  // against test pods for the provided pod verbs (requiredTestPodVerbs).
   476  func CheckAuthorizations(client authorizationv1.SelfSubjectAccessReviewInterface, namespace string, requiredTestPodVerbs []string) error {
   478  	var errs []error
   479  	// Unfortunately we have to do multiple API requests because there is no way
   480  	// to check for multiple verbs on a resource at once. The closest
   481  	// alternative is the "*" wildcard verb, but that appears to be overbroad
   482  	// and fails on the integration test cluster. The approach we take here is
   483  	// essentially equivalent to the following kubectl command:
   484  	//
   485  	// 	 $ cat <<EOF | kubectl --context=kind-kind-prow-integration create -f - -v 8
   486  	//   apiVersion:
   487  	//   kind: SubjectAccessReview
   488  	//   spec:
   489  	//     resourceAttributes:
   490  	//       resource: pods
   491  	//       verb: list # also test for get, create, etc
   492  	//       namespace: test-pods
   493  	//     user: system:serviceaccount:default:prow-controller-manager
   494  	//   EOF
   495  	//
   496  	//  The difference in our case is that (1) we are running the below check
   497  	//  *inside* the main service cluster itself, (2) we are running the check
   498  	//  against an entirely different build cluster, and (3) we are using
   499  	//  SelfSubjectAccessReview so that we don't have to provide a `user` field
   500  	//  (so that this code can work with whatever user is the default when we're
   501  	//  connecting to the build cluster).
   502  	//
   503  	// See
   504  	//
   505  	// for more information.
   506  	for _, verb := range requiredTestPodVerbs {
   507  		ssar := k8sauthorizationv1.SelfSubjectAccessReview{
   508  			Spec: k8sauthorizationv1.SelfSubjectAccessReviewSpec{
   509  				ResourceAttributes: &k8sauthorizationv1.ResourceAttributes{
   510  					Namespace: namespace,
   511  					Verb:      verb,
   512  					Resource:  "pods",
   513  				},
   514  			},
   515  		}
   516  		ssarExpanded, err := client.Create(context.TODO(), &ssar, metav1.CreateOptions{})
   517  		if err != nil {
   518  			errs = append(errs, err)
   519  			continue
   520  		}
   522  		if !ssarExpanded.Status.Allowed {
   523  			errs = append(errs, fmt.Errorf("%w: unable to %q pods", MissingPermissions, verb))
   524  		}
   525  	}
   527  	return utilerrors.NewAggregate(errs)
   528  }
   530  // BuildClusterUncachedRuntimeClients returns ctrlruntimeclients for the build cluster in a non-caching implementation.
   531  func (o *KubernetesOptions) BuildClusterUncachedRuntimeClients(dryRun bool) (map[string]ctrlruntimeclient.Client, error) {
   532  	if err := o.resolve(dryRun); err != nil {
   533  		return nil, err
   534  	}
   536  	var errs []error
   537  	clients := map[string]ctrlruntimeclient.Client{}
   538  	for name := range o.clusterConfigs {
   539  		cfg := o.clusterConfigs[name]
   540  		client, err := ctrlruntimeclient.New(&cfg, ctrlruntimeclient.Options{})
   541  		if err != nil {
   542  			clientCreationFailures.WithLabelValues(name).Add(1)
   543  			errs = append(errs, fmt.Errorf("failed to construct client for cluster %q: %w", name, err))
   544  			continue
   545  		}
   546  		if o.dryRun {
   547  			client = ctrlruntimeclient.NewDryRunClient(client)
   548  		}
   549  		clients[name] = client
   550  	}
   552  	return clients, utilerrors.NewAggregate(errs)
   553  }
   555  func (o *KubernetesOptions) KnownClusters(dryRun bool) (map[string]rest.Config, error) {
   556  	if err := o.resolve(dryRun); err != nil {
   557  		return nil, err
   558  	}
   559  	return o.clusterConfigs, nil
   560  }
   562  // SetDisabledClusters sets disabledClusters
   563  // It has no effects if the options have been resolved.
   564  func (o *KubernetesOptions) SetDisabledClusters(disabledClusters sets.Set[string]) {
   565  	if o.resolved {
   566  		logrus.WithField("disabledClusters", o.disabledClusters).Warn("SetDisabledClusters has to be called before it is resolved")
   567  		return
   568  	}
   569  	o.disabledClusters = disabledClusters
   570  }