sigs.k8s.io/kueue@v0.6.2/cmd/kueue/main.go (about) 1 /* 2 Copyright 2021 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package main 18 19 import ( 20 "context" 21 "errors" 22 "flag" 23 "net/http" 24 "os" 25 26 // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) 27 // to ensure that exec-entrypoint and run can make use of them. 28 _ "k8s.io/client-go/plugin/pkg/client/auth" 29 30 zaplog "go.uber.org/zap" 31 "go.uber.org/zap/zapcore" 32 corev1 "k8s.io/api/core/v1" 33 schedulingv1 "k8s.io/api/scheduling/v1" 34 "k8s.io/apimachinery/pkg/runtime" 35 utilruntime "k8s.io/apimachinery/pkg/util/runtime" 36 "k8s.io/apimachinery/pkg/util/validation/field" 37 utilfeature "k8s.io/apiserver/pkg/util/feature" 38 autoscaling "k8s.io/autoscaler/cluster-autoscaler/apis/provisioningrequest/autoscaling.x-k8s.io/v1beta1" 39 "k8s.io/client-go/discovery" 40 clientgoscheme "k8s.io/client-go/kubernetes/scheme" 41 "k8s.io/client-go/rest" 42 "k8s.io/utils/ptr" 43 ctrl "sigs.k8s.io/controller-runtime" 44 "sigs.k8s.io/controller-runtime/pkg/healthz" 45 "sigs.k8s.io/controller-runtime/pkg/log/zap" 46 47 configapi "sigs.k8s.io/kueue/apis/config/v1beta1" 48 kueuealpha "sigs.k8s.io/kueue/apis/kueue/v1alpha1" 49 kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1" 50 "sigs.k8s.io/kueue/pkg/cache" 51 "sigs.k8s.io/kueue/pkg/config" 52 "sigs.k8s.io/kueue/pkg/constants" 53 "sigs.k8s.io/kueue/pkg/controller/admissionchecks/multikueue" 54 "sigs.k8s.io/kueue/pkg/controller/admissionchecks/provisioning" 55 "sigs.k8s.io/kueue/pkg/controller/core" 56 "sigs.k8s.io/kueue/pkg/controller/core/indexer" 57 "sigs.k8s.io/kueue/pkg/controller/jobframework" 58 "sigs.k8s.io/kueue/pkg/debugger" 59 "sigs.k8s.io/kueue/pkg/features" 60 "sigs.k8s.io/kueue/pkg/metrics" 61 "sigs.k8s.io/kueue/pkg/queue" 62 "sigs.k8s.io/kueue/pkg/scheduler" 63 "sigs.k8s.io/kueue/pkg/util/cert" 64 "sigs.k8s.io/kueue/pkg/util/kubeversion" 65 "sigs.k8s.io/kueue/pkg/util/useragent" 66 "sigs.k8s.io/kueue/pkg/version" 67 "sigs.k8s.io/kueue/pkg/visibility" 68 "sigs.k8s.io/kueue/pkg/webhooks" 69 70 // Ensure linking of the job controllers. 71 _ "sigs.k8s.io/kueue/pkg/controller/jobs" 72 // +kubebuilder:scaffold:imports 73 ) 74 75 var ( 76 scheme = runtime.NewScheme() 77 setupLog = ctrl.Log.WithName("setup") 78 ) 79 80 func init() { 81 utilruntime.Must(clientgoscheme.AddToScheme(scheme)) 82 utilruntime.Must(schedulingv1.AddToScheme(scheme)) 83 84 utilruntime.Must(kueue.AddToScheme(scheme)) 85 utilruntime.Must(kueuealpha.AddToScheme(scheme)) 86 utilruntime.Must(configapi.AddToScheme(scheme)) 87 utilruntime.Must(autoscaling.AddToScheme(scheme)) 88 // Add any additional framework integration types. 89 utilruntime.Must( 90 jobframework.ForEachIntegration(func(_ string, cb jobframework.IntegrationCallbacks) error { 91 if cb.AddToScheme != nil { 92 return cb.AddToScheme(scheme) 93 } 94 return nil 95 }), 96 ) 97 98 // +kubebuilder:scaffold:scheme 99 } 100 101 func main() { 102 var configFile string 103 flag.StringVar(&configFile, "config", "", 104 "The controller will load its initial configuration from this file. "+ 105 "Omit this flag to use the default configuration values. ") 106 107 var featureGates string 108 flag.StringVar(&featureGates, "feature-gates", "", "A set of key=value pairs that describe feature gates for alpha/experimental features.") 109 110 opts := zap.Options{ 111 TimeEncoder: zapcore.RFC3339NanoTimeEncoder, 112 ZapOpts: []zaplog.Option{zaplog.AddCaller()}, 113 } 114 opts.BindFlags(flag.CommandLine) 115 flag.Parse() 116 117 if err := utilfeature.DefaultMutableFeatureGate.Set(featureGates); err != nil { 118 setupLog.Error(err, "Unable to set flag gates for known features") 119 os.Exit(1) 120 } 121 122 ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) 123 setupLog.Info("Initializing", "gitVersion", version.GitVersion, "gitCommit", version.GitCommit) 124 125 options, cfg, err := apply(configFile) 126 if err != nil { 127 setupLog.Error(err, "Unable to load the configuration") 128 os.Exit(1) 129 } 130 131 metrics.Register() 132 133 kubeConfig := ctrl.GetConfigOrDie() 134 if kubeConfig.UserAgent == "" { 135 kubeConfig.UserAgent = useragent.Default() 136 } 137 kubeConfig.QPS = *cfg.ClientConnection.QPS 138 kubeConfig.Burst = int(*cfg.ClientConnection.Burst) 139 setupLog.V(2).Info("K8S Client", "qps", kubeConfig.QPS, "burst", kubeConfig.Burst) 140 mgr, err := ctrl.NewManager(kubeConfig, options) 141 if err != nil { 142 setupLog.Error(err, "Unable to start manager") 143 os.Exit(1) 144 } 145 146 certsReady := make(chan struct{}) 147 148 if cfg.InternalCertManagement != nil && *cfg.InternalCertManagement.Enable { 149 if err = cert.ManageCerts(mgr, cfg, certsReady); err != nil { 150 setupLog.Error(err, "Unable to set up cert rotation") 151 os.Exit(1) 152 } 153 } else { 154 close(certsReady) 155 } 156 157 cCache := cache.New(mgr.GetClient(), cache.WithPodsReadyTracking(blockForPodsReady(&cfg))) 158 queues := queue.NewManager(mgr.GetClient(), cCache, queue.WithPodsReadyRequeuingTimestamp(podsReadyRequeuingTimestamp(&cfg))) 159 160 ctx := ctrl.SetupSignalHandler() 161 if err := setupIndexes(ctx, mgr, &cfg); err != nil { 162 setupLog.Error(err, "Unable to setup indexes") 163 os.Exit(1) 164 } 165 debugger.NewDumper(cCache, queues).ListenForSignal(ctx) 166 167 serverVersionFetcher := setupServerVersionFetcher(mgr, kubeConfig) 168 169 setupProbeEndpoints(mgr, certsReady) 170 // Cert won't be ready until manager starts, so start a goroutine here which 171 // will block until the cert is ready before setting up the controllers. 172 // Controllers who register after manager starts will start directly. 173 go setupControllers(mgr, cCache, queues, certsReady, &cfg, serverVersionFetcher) 174 175 go func() { 176 queues.CleanUpOnContext(ctx) 177 }() 178 go func() { 179 cCache.CleanUpOnContext(ctx) 180 }() 181 182 if features.Enabled(features.VisibilityOnDemand) { 183 go visibility.CreateAndStartVisibilityServer(queues, ctx) 184 } 185 186 setupScheduler(mgr, cCache, queues, &cfg) 187 188 setupLog.Info("Starting manager") 189 if err := mgr.Start(ctx); err != nil { 190 setupLog.Error(err, "Could not run manager") 191 os.Exit(1) 192 } 193 } 194 195 func setupIndexes(ctx context.Context, mgr ctrl.Manager, cfg *configapi.Configuration) error { 196 err := indexer.Setup(ctx, mgr.GetFieldIndexer()) 197 if err != nil { 198 return err 199 } 200 201 // setup provision admission check controller indexes 202 if features.Enabled(features.ProvisioningACC) { 203 if !provisioning.ServerSupportsProvisioningRequest(mgr) { 204 setupLog.Error(nil, "Provisioning Requests are not supported, skipped admission check controller setup") 205 } else if err := provisioning.SetupIndexer(ctx, mgr.GetFieldIndexer()); err != nil { 206 setupLog.Error(err, "Could not setup provisioning indexer") 207 os.Exit(1) 208 } 209 } 210 211 if features.Enabled(features.MultiKueue) { 212 if err := multikueue.SetupIndexer(ctx, mgr.GetFieldIndexer(), *cfg.Namespace); err != nil { 213 setupLog.Error(err, "Could not setup multikueue indexer") 214 os.Exit(1) 215 } 216 } 217 218 opts := []jobframework.Option{ 219 jobframework.WithEnabledFrameworks(cfg.Integrations), 220 } 221 return jobframework.SetupIndexes(ctx, mgr.GetFieldIndexer(), opts...) 222 } 223 224 func setupControllers(mgr ctrl.Manager, cCache *cache.Cache, queues *queue.Manager, certsReady chan struct{}, cfg *configapi.Configuration, serverVersionFetcher *kubeversion.ServerVersionFetcher) { 225 // The controllers won't work until the webhooks are operating, and the webhook won't work until the 226 // certs are all in place. 227 cert.WaitForCertsReady(setupLog, certsReady) 228 229 if failedCtrl, err := core.SetupControllers(mgr, queues, cCache, cfg); err != nil { 230 setupLog.Error(err, "Unable to create controller", "controller", failedCtrl) 231 os.Exit(1) 232 } 233 234 // setup provision admission check controller 235 if features.Enabled(features.ProvisioningACC) && provisioning.ServerSupportsProvisioningRequest(mgr) { 236 // A info message is added in setupIndexes if autoscaling is not supported by the cluster 237 ctrl, err := provisioning.NewController(mgr.GetClient(), mgr.GetEventRecorderFor("kueue-provisioning-request-controller")) 238 if err != nil { 239 setupLog.Error(err, "Could not create the provisioning controller") 240 os.Exit(1) 241 } 242 243 if err := ctrl.SetupWithManager(mgr); err != nil { 244 setupLog.Error(err, "Could not setup provisioning controller") 245 os.Exit(1) 246 } 247 } 248 249 if features.Enabled(features.MultiKueue) { 250 if err := multikueue.SetupControllers(mgr, *cfg.Namespace, 251 multikueue.WithGCInterval(cfg.MultiKueue.GCInterval.Duration), 252 multikueue.WithOrigin(ptr.Deref(cfg.MultiKueue.Origin, configapi.DefaultMultiKueueOrigin)), 253 ); err != nil { 254 setupLog.Error(err, "Could not setup MultiKueue controller") 255 os.Exit(1) 256 } 257 } 258 259 if failedWebhook, err := webhooks.Setup(mgr); err != nil { 260 setupLog.Error(err, "Unable to create webhook", "webhook", failedWebhook) 261 os.Exit(1) 262 } 263 264 opts := []jobframework.Option{ 265 jobframework.WithManageJobsWithoutQueueName(cfg.ManageJobsWithoutQueueName), 266 jobframework.WithWaitForPodsReady(cfg.WaitForPodsReady), 267 jobframework.WithKubeServerVersion(serverVersionFetcher), 268 jobframework.WithIntegrationOptions(corev1.SchemeGroupVersion.WithKind("Pod").String(), cfg.Integrations.PodOptions), 269 jobframework.WithEnabledFrameworks(cfg.Integrations), 270 jobframework.WithManagerName(constants.KueueName), 271 } 272 if err := jobframework.SetupControllers(mgr, setupLog, opts...); err != nil { 273 setupLog.Error(err, "Unable to create controller or webhook", "kubernetesVersion", serverVersionFetcher.GetServerVersion()) 274 os.Exit(1) 275 } 276 // +kubebuilder:scaffold:builder 277 } 278 279 // setupProbeEndpoints registers the health endpoints 280 func setupProbeEndpoints(mgr ctrl.Manager, certsReady <-chan struct{}) { 281 defer setupLog.Info("Probe endpoints are configured on healthz and readyz") 282 283 if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { 284 setupLog.Error(err, "unable to set up health check") 285 os.Exit(1) 286 } 287 288 // Wait for the webhook server to be listening before advertising the 289 // Kueue replica as ready. This allows users to wait with sending the first 290 // requests, requiring webhooks, until the Kueue deployment is available, so 291 // that the early requests are not rejected during the Kueue's startup. 292 // We wrap the call to GetWebhookServer in a closure to delay calling 293 // the function, otherwise a not fully-initialized webhook server (without 294 // ready certs) fails the start of the manager. 295 if err := mgr.AddReadyzCheck("readyz", func(req *http.Request) error { 296 select { 297 case <-certsReady: 298 return mgr.GetWebhookServer().StartedChecker()(req) 299 default: 300 return errors.New("certificates are not ready") 301 } 302 }); err != nil { 303 setupLog.Error(err, "unable to set up ready check") 304 os.Exit(1) 305 } 306 } 307 308 func setupScheduler(mgr ctrl.Manager, cCache *cache.Cache, queues *queue.Manager, cfg *configapi.Configuration) { 309 sched := scheduler.New( 310 queues, 311 cCache, 312 mgr.GetClient(), 313 mgr.GetEventRecorderFor(constants.AdmissionName), 314 scheduler.WithPodsReadyRequeuingTimestamp(podsReadyRequeuingTimestamp(cfg)), 315 ) 316 if err := mgr.Add(sched); err != nil { 317 setupLog.Error(err, "Unable to add scheduler to manager") 318 os.Exit(1) 319 } 320 } 321 322 func setupServerVersionFetcher(mgr ctrl.Manager, kubeConfig *rest.Config) *kubeversion.ServerVersionFetcher { 323 discoveryClient, err := discovery.NewDiscoveryClientForConfig(kubeConfig) 324 if err != nil { 325 setupLog.Error(err, "Unable to create the discovery client") 326 os.Exit(1) 327 } 328 329 serverVersionFetcher := kubeversion.NewServerVersionFetcher(discoveryClient) 330 331 if err := mgr.Add(serverVersionFetcher); err != nil { 332 setupLog.Error(err, "Unable to add server version fetcher to manager") 333 os.Exit(1) 334 } 335 336 if err := serverVersionFetcher.FetchServerVersion(); err != nil { 337 setupLog.Error(err, "failed to fetch kubernetes server version") 338 os.Exit(1) 339 } 340 341 return serverVersionFetcher 342 } 343 344 func blockForPodsReady(cfg *configapi.Configuration) bool { 345 return config.WaitForPodsReadyIsEnabled(cfg) && cfg.WaitForPodsReady.BlockAdmission != nil && *cfg.WaitForPodsReady.BlockAdmission 346 } 347 348 func podsReadyRequeuingTimestamp(cfg *configapi.Configuration) configapi.RequeuingTimestamp { 349 if cfg.WaitForPodsReady != nil && cfg.WaitForPodsReady.RequeuingStrategy != nil && 350 cfg.WaitForPodsReady.RequeuingStrategy.Timestamp != nil { 351 return *cfg.WaitForPodsReady.RequeuingStrategy.Timestamp 352 } 353 return configapi.EvictionTimestamp 354 } 355 356 func apply(configFile string) (ctrl.Options, configapi.Configuration, error) { 357 options, cfg, err := config.Load(scheme, configFile) 358 if err != nil { 359 return options, cfg, err 360 } 361 362 if cfg.Integrations != nil { 363 var errorlist field.ErrorList 364 availableFrameworks := jobframework.GetIntegrationsList() 365 path := field.NewPath("integrations", "frameworks") 366 for _, framework := range cfg.Integrations.Frameworks { 367 if _, found := jobframework.GetIntegration(framework); !found { 368 errorlist = append(errorlist, field.NotSupported(path, framework, availableFrameworks)) 369 } 370 } 371 if len(errorlist) > 0 { 372 err := errorlist.ToAggregate() 373 return options, cfg, err 374 } 375 } 376 377 cfgStr, err := config.Encode(scheme, &cfg) 378 if err != nil { 379 return options, cfg, err 380 } 381 setupLog.Info("Successfully loaded configuration", "config", cfgStr) 382 383 return options, cfg, nil 384 }