sigs.k8s.io/kueue@v0.6.2/cmd/kueue/main.go (about)

     1  /*
     2  Copyright 2021 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package main
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"flag"
    23  	"net/http"
    24  	"os"
    25  
    26  	// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
    27  	// to ensure that exec-entrypoint and run can make use of them.
    28  	_ "k8s.io/client-go/plugin/pkg/client/auth"
    29  
    30  	zaplog "go.uber.org/zap"
    31  	"go.uber.org/zap/zapcore"
    32  	corev1 "k8s.io/api/core/v1"
    33  	schedulingv1 "k8s.io/api/scheduling/v1"
    34  	"k8s.io/apimachinery/pkg/runtime"
    35  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    36  	"k8s.io/apimachinery/pkg/util/validation/field"
    37  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    38  	autoscaling "k8s.io/autoscaler/cluster-autoscaler/apis/provisioningrequest/autoscaling.x-k8s.io/v1beta1"
    39  	"k8s.io/client-go/discovery"
    40  	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
    41  	"k8s.io/client-go/rest"
    42  	"k8s.io/utils/ptr"
    43  	ctrl "sigs.k8s.io/controller-runtime"
    44  	"sigs.k8s.io/controller-runtime/pkg/healthz"
    45  	"sigs.k8s.io/controller-runtime/pkg/log/zap"
    46  
    47  	configapi "sigs.k8s.io/kueue/apis/config/v1beta1"
    48  	kueuealpha "sigs.k8s.io/kueue/apis/kueue/v1alpha1"
    49  	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
    50  	"sigs.k8s.io/kueue/pkg/cache"
    51  	"sigs.k8s.io/kueue/pkg/config"
    52  	"sigs.k8s.io/kueue/pkg/constants"
    53  	"sigs.k8s.io/kueue/pkg/controller/admissionchecks/multikueue"
    54  	"sigs.k8s.io/kueue/pkg/controller/admissionchecks/provisioning"
    55  	"sigs.k8s.io/kueue/pkg/controller/core"
    56  	"sigs.k8s.io/kueue/pkg/controller/core/indexer"
    57  	"sigs.k8s.io/kueue/pkg/controller/jobframework"
    58  	"sigs.k8s.io/kueue/pkg/debugger"
    59  	"sigs.k8s.io/kueue/pkg/features"
    60  	"sigs.k8s.io/kueue/pkg/metrics"
    61  	"sigs.k8s.io/kueue/pkg/queue"
    62  	"sigs.k8s.io/kueue/pkg/scheduler"
    63  	"sigs.k8s.io/kueue/pkg/util/cert"
    64  	"sigs.k8s.io/kueue/pkg/util/kubeversion"
    65  	"sigs.k8s.io/kueue/pkg/util/useragent"
    66  	"sigs.k8s.io/kueue/pkg/version"
    67  	"sigs.k8s.io/kueue/pkg/visibility"
    68  	"sigs.k8s.io/kueue/pkg/webhooks"
    69  
    70  	// Ensure linking of the job controllers.
    71  	_ "sigs.k8s.io/kueue/pkg/controller/jobs"
    72  	// +kubebuilder:scaffold:imports
    73  )
    74  
var (
	// scheme collects the API types registered in init; apply hands it to
	// config.Load and config.Encode.
	scheme   = runtime.NewScheme()
	// setupLog is the "setup"-named logger used for all startup messages in
	// this file.
	setupLog = ctrl.Log.WithName("setup")
)
    79  
    80  func init() {
    81  	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
    82  	utilruntime.Must(schedulingv1.AddToScheme(scheme))
    83  
    84  	utilruntime.Must(kueue.AddToScheme(scheme))
    85  	utilruntime.Must(kueuealpha.AddToScheme(scheme))
    86  	utilruntime.Must(configapi.AddToScheme(scheme))
    87  	utilruntime.Must(autoscaling.AddToScheme(scheme))
    88  	// Add any additional framework integration types.
    89  	utilruntime.Must(
    90  		jobframework.ForEachIntegration(func(_ string, cb jobframework.IntegrationCallbacks) error {
    91  			if cb.AddToScheme != nil {
    92  				return cb.AddToScheme(scheme)
    93  			}
    94  			return nil
    95  		}),
    96  	)
    97  
    98  	// +kubebuilder:scaffold:scheme
    99  }
   100  
   101  func main() {
   102  	var configFile string
   103  	flag.StringVar(&configFile, "config", "",
   104  		"The controller will load its initial configuration from this file. "+
   105  			"Omit this flag to use the default configuration values. ")
   106  
   107  	var featureGates string
   108  	flag.StringVar(&featureGates, "feature-gates", "", "A set of key=value pairs that describe feature gates for alpha/experimental features.")
   109  
   110  	opts := zap.Options{
   111  		TimeEncoder: zapcore.RFC3339NanoTimeEncoder,
   112  		ZapOpts:     []zaplog.Option{zaplog.AddCaller()},
   113  	}
   114  	opts.BindFlags(flag.CommandLine)
   115  	flag.Parse()
   116  
   117  	if err := utilfeature.DefaultMutableFeatureGate.Set(featureGates); err != nil {
   118  		setupLog.Error(err, "Unable to set flag gates for known features")
   119  		os.Exit(1)
   120  	}
   121  
   122  	ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
   123  	setupLog.Info("Initializing", "gitVersion", version.GitVersion, "gitCommit", version.GitCommit)
   124  
   125  	options, cfg, err := apply(configFile)
   126  	if err != nil {
   127  		setupLog.Error(err, "Unable to load the configuration")
   128  		os.Exit(1)
   129  	}
   130  
   131  	metrics.Register()
   132  
   133  	kubeConfig := ctrl.GetConfigOrDie()
   134  	if kubeConfig.UserAgent == "" {
   135  		kubeConfig.UserAgent = useragent.Default()
   136  	}
   137  	kubeConfig.QPS = *cfg.ClientConnection.QPS
   138  	kubeConfig.Burst = int(*cfg.ClientConnection.Burst)
   139  	setupLog.V(2).Info("K8S Client", "qps", kubeConfig.QPS, "burst", kubeConfig.Burst)
   140  	mgr, err := ctrl.NewManager(kubeConfig, options)
   141  	if err != nil {
   142  		setupLog.Error(err, "Unable to start manager")
   143  		os.Exit(1)
   144  	}
   145  
   146  	certsReady := make(chan struct{})
   147  
   148  	if cfg.InternalCertManagement != nil && *cfg.InternalCertManagement.Enable {
   149  		if err = cert.ManageCerts(mgr, cfg, certsReady); err != nil {
   150  			setupLog.Error(err, "Unable to set up cert rotation")
   151  			os.Exit(1)
   152  		}
   153  	} else {
   154  		close(certsReady)
   155  	}
   156  
   157  	cCache := cache.New(mgr.GetClient(), cache.WithPodsReadyTracking(blockForPodsReady(&cfg)))
   158  	queues := queue.NewManager(mgr.GetClient(), cCache, queue.WithPodsReadyRequeuingTimestamp(podsReadyRequeuingTimestamp(&cfg)))
   159  
   160  	ctx := ctrl.SetupSignalHandler()
   161  	if err := setupIndexes(ctx, mgr, &cfg); err != nil {
   162  		setupLog.Error(err, "Unable to setup indexes")
   163  		os.Exit(1)
   164  	}
   165  	debugger.NewDumper(cCache, queues).ListenForSignal(ctx)
   166  
   167  	serverVersionFetcher := setupServerVersionFetcher(mgr, kubeConfig)
   168  
   169  	setupProbeEndpoints(mgr, certsReady)
   170  	// Cert won't be ready until manager starts, so start a goroutine here which
   171  	// will block until the cert is ready before setting up the controllers.
   172  	// Controllers who register after manager starts will start directly.
   173  	go setupControllers(mgr, cCache, queues, certsReady, &cfg, serverVersionFetcher)
   174  
   175  	go func() {
   176  		queues.CleanUpOnContext(ctx)
   177  	}()
   178  	go func() {
   179  		cCache.CleanUpOnContext(ctx)
   180  	}()
   181  
   182  	if features.Enabled(features.VisibilityOnDemand) {
   183  		go visibility.CreateAndStartVisibilityServer(queues, ctx)
   184  	}
   185  
   186  	setupScheduler(mgr, cCache, queues, &cfg)
   187  
   188  	setupLog.Info("Starting manager")
   189  	if err := mgr.Start(ctx); err != nil {
   190  		setupLog.Error(err, "Could not run manager")
   191  		os.Exit(1)
   192  	}
   193  }
   194  
   195  func setupIndexes(ctx context.Context, mgr ctrl.Manager, cfg *configapi.Configuration) error {
   196  	err := indexer.Setup(ctx, mgr.GetFieldIndexer())
   197  	if err != nil {
   198  		return err
   199  	}
   200  
   201  	// setup provision admission check controller indexes
   202  	if features.Enabled(features.ProvisioningACC) {
   203  		if !provisioning.ServerSupportsProvisioningRequest(mgr) {
   204  			setupLog.Error(nil, "Provisioning Requests are not supported, skipped admission check controller setup")
   205  		} else if err := provisioning.SetupIndexer(ctx, mgr.GetFieldIndexer()); err != nil {
   206  			setupLog.Error(err, "Could not setup provisioning indexer")
   207  			os.Exit(1)
   208  		}
   209  	}
   210  
   211  	if features.Enabled(features.MultiKueue) {
   212  		if err := multikueue.SetupIndexer(ctx, mgr.GetFieldIndexer(), *cfg.Namespace); err != nil {
   213  			setupLog.Error(err, "Could not setup multikueue indexer")
   214  			os.Exit(1)
   215  		}
   216  	}
   217  
   218  	opts := []jobframework.Option{
   219  		jobframework.WithEnabledFrameworks(cfg.Integrations),
   220  	}
   221  	return jobframework.SetupIndexes(ctx, mgr.GetFieldIndexer(), opts...)
   222  }
   223  
   224  func setupControllers(mgr ctrl.Manager, cCache *cache.Cache, queues *queue.Manager, certsReady chan struct{}, cfg *configapi.Configuration, serverVersionFetcher *kubeversion.ServerVersionFetcher) {
   225  	// The controllers won't work until the webhooks are operating, and the webhook won't work until the
   226  	// certs are all in place.
   227  	cert.WaitForCertsReady(setupLog, certsReady)
   228  
   229  	if failedCtrl, err := core.SetupControllers(mgr, queues, cCache, cfg); err != nil {
   230  		setupLog.Error(err, "Unable to create controller", "controller", failedCtrl)
   231  		os.Exit(1)
   232  	}
   233  
   234  	// setup provision admission check controller
   235  	if features.Enabled(features.ProvisioningACC) && provisioning.ServerSupportsProvisioningRequest(mgr) {
   236  		// A info message is added in setupIndexes if autoscaling is not supported by the cluster
   237  		ctrl, err := provisioning.NewController(mgr.GetClient(), mgr.GetEventRecorderFor("kueue-provisioning-request-controller"))
   238  		if err != nil {
   239  			setupLog.Error(err, "Could not create the provisioning controller")
   240  			os.Exit(1)
   241  		}
   242  
   243  		if err := ctrl.SetupWithManager(mgr); err != nil {
   244  			setupLog.Error(err, "Could not setup provisioning controller")
   245  			os.Exit(1)
   246  		}
   247  	}
   248  
   249  	if features.Enabled(features.MultiKueue) {
   250  		if err := multikueue.SetupControllers(mgr, *cfg.Namespace,
   251  			multikueue.WithGCInterval(cfg.MultiKueue.GCInterval.Duration),
   252  			multikueue.WithOrigin(ptr.Deref(cfg.MultiKueue.Origin, configapi.DefaultMultiKueueOrigin)),
   253  		); err != nil {
   254  			setupLog.Error(err, "Could not setup MultiKueue controller")
   255  			os.Exit(1)
   256  		}
   257  	}
   258  
   259  	if failedWebhook, err := webhooks.Setup(mgr); err != nil {
   260  		setupLog.Error(err, "Unable to create webhook", "webhook", failedWebhook)
   261  		os.Exit(1)
   262  	}
   263  
   264  	opts := []jobframework.Option{
   265  		jobframework.WithManageJobsWithoutQueueName(cfg.ManageJobsWithoutQueueName),
   266  		jobframework.WithWaitForPodsReady(cfg.WaitForPodsReady),
   267  		jobframework.WithKubeServerVersion(serverVersionFetcher),
   268  		jobframework.WithIntegrationOptions(corev1.SchemeGroupVersion.WithKind("Pod").String(), cfg.Integrations.PodOptions),
   269  		jobframework.WithEnabledFrameworks(cfg.Integrations),
   270  		jobframework.WithManagerName(constants.KueueName),
   271  	}
   272  	if err := jobframework.SetupControllers(mgr, setupLog, opts...); err != nil {
   273  		setupLog.Error(err, "Unable to create controller or webhook", "kubernetesVersion", serverVersionFetcher.GetServerVersion())
   274  		os.Exit(1)
   275  	}
   276  	// +kubebuilder:scaffold:builder
   277  }
   278  
   279  // setupProbeEndpoints registers the health endpoints
   280  func setupProbeEndpoints(mgr ctrl.Manager, certsReady <-chan struct{}) {
   281  	defer setupLog.Info("Probe endpoints are configured on healthz and readyz")
   282  
   283  	if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
   284  		setupLog.Error(err, "unable to set up health check")
   285  		os.Exit(1)
   286  	}
   287  
   288  	// Wait for the webhook server to be listening before advertising the
   289  	// Kueue replica as ready. This allows users to wait with sending the first
   290  	// requests, requiring webhooks, until the Kueue deployment is available, so
   291  	// that the early requests are not rejected during the Kueue's startup.
   292  	// We wrap the call to GetWebhookServer in a closure to delay calling
   293  	// the function, otherwise a not fully-initialized webhook server (without
   294  	// ready certs) fails the start of the manager.
   295  	if err := mgr.AddReadyzCheck("readyz", func(req *http.Request) error {
   296  		select {
   297  		case <-certsReady:
   298  			return mgr.GetWebhookServer().StartedChecker()(req)
   299  		default:
   300  			return errors.New("certificates are not ready")
   301  		}
   302  	}); err != nil {
   303  		setupLog.Error(err, "unable to set up ready check")
   304  		os.Exit(1)
   305  	}
   306  }
   307  
   308  func setupScheduler(mgr ctrl.Manager, cCache *cache.Cache, queues *queue.Manager, cfg *configapi.Configuration) {
   309  	sched := scheduler.New(
   310  		queues,
   311  		cCache,
   312  		mgr.GetClient(),
   313  		mgr.GetEventRecorderFor(constants.AdmissionName),
   314  		scheduler.WithPodsReadyRequeuingTimestamp(podsReadyRequeuingTimestamp(cfg)),
   315  	)
   316  	if err := mgr.Add(sched); err != nil {
   317  		setupLog.Error(err, "Unable to add scheduler to manager")
   318  		os.Exit(1)
   319  	}
   320  }
   321  
   322  func setupServerVersionFetcher(mgr ctrl.Manager, kubeConfig *rest.Config) *kubeversion.ServerVersionFetcher {
   323  	discoveryClient, err := discovery.NewDiscoveryClientForConfig(kubeConfig)
   324  	if err != nil {
   325  		setupLog.Error(err, "Unable to create the discovery client")
   326  		os.Exit(1)
   327  	}
   328  
   329  	serverVersionFetcher := kubeversion.NewServerVersionFetcher(discoveryClient)
   330  
   331  	if err := mgr.Add(serverVersionFetcher); err != nil {
   332  		setupLog.Error(err, "Unable to add server version fetcher to manager")
   333  		os.Exit(1)
   334  	}
   335  
   336  	if err := serverVersionFetcher.FetchServerVersion(); err != nil {
   337  		setupLog.Error(err, "failed to fetch kubernetes server version")
   338  		os.Exit(1)
   339  	}
   340  
   341  	return serverVersionFetcher
   342  }
   343  
   344  func blockForPodsReady(cfg *configapi.Configuration) bool {
   345  	return config.WaitForPodsReadyIsEnabled(cfg) && cfg.WaitForPodsReady.BlockAdmission != nil && *cfg.WaitForPodsReady.BlockAdmission
   346  }
   347  
   348  func podsReadyRequeuingTimestamp(cfg *configapi.Configuration) configapi.RequeuingTimestamp {
   349  	if cfg.WaitForPodsReady != nil && cfg.WaitForPodsReady.RequeuingStrategy != nil &&
   350  		cfg.WaitForPodsReady.RequeuingStrategy.Timestamp != nil {
   351  		return *cfg.WaitForPodsReady.RequeuingStrategy.Timestamp
   352  	}
   353  	return configapi.EvictionTimestamp
   354  }
   355  
   356  func apply(configFile string) (ctrl.Options, configapi.Configuration, error) {
   357  	options, cfg, err := config.Load(scheme, configFile)
   358  	if err != nil {
   359  		return options, cfg, err
   360  	}
   361  
   362  	if cfg.Integrations != nil {
   363  		var errorlist field.ErrorList
   364  		availableFrameworks := jobframework.GetIntegrationsList()
   365  		path := field.NewPath("integrations", "frameworks")
   366  		for _, framework := range cfg.Integrations.Frameworks {
   367  			if _, found := jobframework.GetIntegration(framework); !found {
   368  				errorlist = append(errorlist, field.NotSupported(path, framework, availableFrameworks))
   369  			}
   370  		}
   371  		if len(errorlist) > 0 {
   372  			err := errorlist.ToAggregate()
   373  			return options, cfg, err
   374  		}
   375  	}
   376  
   377  	cfgStr, err := config.Encode(scheme, &cfg)
   378  	if err != nil {
   379  		return options, cfg, err
   380  	}
   381  	setupLog.Info("Successfully loaded configuration", "config", cfgStr)
   382  
   383  	return options, cfg, nil
   384  }