volcano.sh/volcano@v1.9.0/pkg/scheduler/cache/cache.go (about) 1 /* 2 Copyright 2021 The Volcano Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package cache 18 19 import ( 20 "context" 21 "fmt" 22 "os" 23 "strconv" 24 "strings" 25 "sync" 26 "time" 27 28 "golang.org/x/time/rate" 29 v1 "k8s.io/api/core/v1" 30 schedulingv1 "k8s.io/api/scheduling/v1" 31 apierrors "k8s.io/apimachinery/pkg/api/errors" 32 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 33 "k8s.io/apimachinery/pkg/runtime" 34 utilruntime "k8s.io/apimachinery/pkg/util/runtime" 35 "k8s.io/apimachinery/pkg/util/sets" 36 "k8s.io/apimachinery/pkg/util/wait" 37 utilfeature "k8s.io/apiserver/pkg/util/feature" 38 "k8s.io/client-go/informers" 39 infov1 "k8s.io/client-go/informers/core/v1" 40 schedv1 "k8s.io/client-go/informers/scheduling/v1" 41 storagev1 "k8s.io/client-go/informers/storage/v1" 42 storagev1beta1 "k8s.io/client-go/informers/storage/v1beta1" 43 "k8s.io/client-go/kubernetes" 44 corev1 "k8s.io/client-go/kubernetes/typed/core/v1" 45 "k8s.io/client-go/rest" 46 "k8s.io/client-go/tools/cache" 47 "k8s.io/client-go/tools/record" 48 "k8s.io/client-go/util/retry" 49 "k8s.io/client-go/util/workqueue" 50 "k8s.io/klog/v2" 51 podutil "k8s.io/kubernetes/pkg/api/v1/pod" 52 "k8s.io/kubernetes/pkg/scheduler/framework" 53 54 batch "volcano.sh/apis/pkg/apis/batch/v1alpha1" 55 "volcano.sh/apis/pkg/apis/scheduling" 56 schedulingscheme "volcano.sh/apis/pkg/apis/scheduling/scheme" 57 vcv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1" 58 vcclient "volcano.sh/apis/pkg/client/clientset/versioned" 59 "volcano.sh/apis/pkg/client/clientset/versioned/scheme" 60 vcinformer "volcano.sh/apis/pkg/client/informers/externalversions" 61 cpuinformerv1 "volcano.sh/apis/pkg/client/informers/externalversions/nodeinfo/v1alpha1" 62 vcinformerv1 "volcano.sh/apis/pkg/client/informers/externalversions/scheduling/v1beta1" 63 64 "volcano.sh/volcano/cmd/scheduler/app/options" 65 "volcano.sh/volcano/pkg/features" 66 schedulingapi "volcano.sh/volcano/pkg/scheduler/api" 67 volumescheduling "volcano.sh/volcano/pkg/scheduler/capabilities/volumebinding" 68 "volcano.sh/volcano/pkg/scheduler/metrics" 69 "volcano.sh/volcano/pkg/scheduler/metrics/source" 70 commonutil "volcano.sh/volcano/pkg/util" 71 ) 72 73 const ( 74 // default interval for sync data from metrics server, the value is 30s 75 defaultMetricsInternal = 30 * time.Second 76 ) 77 78 // defaultIgnoredProvisioners contains provisioners that will be ignored during pod pvc request computation and preemption. 79 var defaultIgnoredProvisioners = []string{"rancher.io/local-path", "hostpath.csi.k8s.io"} 80 81 func init() { 82 schemeBuilder := runtime.SchemeBuilder{ 83 v1.AddToScheme, 84 } 85 86 utilruntime.Must(schemeBuilder.AddToScheme(scheme.Scheme)) 87 } 88 89 // New returns a Cache implementation. 90 func New(config *rest.Config, schedulerNames []string, defaultQueue string, nodeSelectors []string, nodeWorkers uint32, ignoredProvisioners []string) Cache { 91 return newSchedulerCache(config, schedulerNames, defaultQueue, nodeSelectors, nodeWorkers, ignoredProvisioners) 92 } 93 94 // SchedulerCache cache for the kube batch 95 type SchedulerCache struct { 96 sync.Mutex 97 98 kubeClient kubernetes.Interface 99 restConfig *rest.Config 100 vcClient vcclient.Interface 101 defaultQueue string 102 // schedulerName is the name for volcano scheduler 103 schedulerNames []string 104 nodeSelectorLabels map[string]string 105 metricsConf map[string]string 106 107 podInformer infov1.PodInformer 108 nodeInformer infov1.NodeInformer 109 podGroupInformerV1beta1 vcinformerv1.PodGroupInformer 110 queueInformerV1beta1 vcinformerv1.QueueInformer 111 pvInformer infov1.PersistentVolumeInformer 112 pvcInformer infov1.PersistentVolumeClaimInformer 113 scInformer storagev1.StorageClassInformer 114 pcInformer schedv1.PriorityClassInformer 115 quotaInformer infov1.ResourceQuotaInformer 116 csiNodeInformer storagev1.CSINodeInformer 117 csiDriverInformer storagev1.CSIDriverInformer 118 csiStorageCapacityInformer storagev1beta1.CSIStorageCapacityInformer 119 cpuInformer cpuinformerv1.NumatopologyInformer 120 121 Binder Binder 122 Evictor Evictor 123 StatusUpdater StatusUpdater 124 PodGroupBinder BatchBinder 125 VolumeBinder VolumeBinder 126 127 Recorder record.EventRecorder 128 129 Jobs map[schedulingapi.JobID]*schedulingapi.JobInfo 130 Nodes map[string]*schedulingapi.NodeInfo 131 Queues map[schedulingapi.QueueID]*schedulingapi.QueueInfo 132 PriorityClasses map[string]*schedulingv1.PriorityClass 133 NodeList []string 134 defaultPriorityClass *schedulingv1.PriorityClass 135 defaultPriority int32 136 CSINodesStatus map[string]*schedulingapi.CSINodeStatusInfo 137 138 NamespaceCollection map[string]*schedulingapi.NamespaceCollection 139 140 errTasks workqueue.RateLimitingInterface 141 nodeQueue workqueue.RateLimitingInterface 142 DeletedJobs workqueue.RateLimitingInterface 143 144 informerFactory informers.SharedInformerFactory 145 vcInformerFactory vcinformer.SharedInformerFactory 146 147 BindFlowChannel chan *schedulingapi.TaskInfo 148 bindCache []*schedulingapi.TaskInfo 149 batchNum int 150 151 // A map from image name to its imageState. 152 imageStates map[string]*imageState 153 154 nodeWorkers uint32 155 156 // IgnoredCSIProvisioners contains a list of provisioners, and pod request pvc with these provisioners will 157 // not be counted in pod pvc resource request and node.Allocatable, because the spec.drivers of csinode resource 158 // is always null, these provisioners usually are host path csi controllers like rancher.io/local-path and hostpath.csi.k8s.io. 159 IgnoredCSIProvisioners sets.Set[string] 160 } 161 162 type imageState struct { 163 // Size of the image 164 size int64 165 // A set of node names for nodes having this image present 166 nodes sets.String 167 } 168 169 // DefaultBinder with kube client and event recorder 170 type DefaultBinder struct { 171 kubeclient kubernetes.Interface 172 recorder record.EventRecorder 173 } 174 175 // Bind will send bind request to api server 176 func (db *DefaultBinder) Bind(kubeClient kubernetes.Interface, tasks []*schedulingapi.TaskInfo) ([]*schedulingapi.TaskInfo, error) { 177 var errTasks []*schedulingapi.TaskInfo 178 for _, task := range tasks { 179 p := task.Pod 180 if err := db.kubeclient.CoreV1().Pods(p.Namespace).Bind(context.TODO(), 181 &v1.Binding{ 182 ObjectMeta: metav1.ObjectMeta{Namespace: p.Namespace, Name: p.Name, UID: p.UID, Annotations: p.Annotations}, 183 Target: v1.ObjectReference{ 184 Kind: "Node", 185 Name: task.NodeName, 186 }, 187 }, 188 metav1.CreateOptions{}); err != nil { 189 klog.Errorf("Failed to bind pod <%v/%v> to node %s : %#v", p.Namespace, p.Name, task.NodeName, err) 190 errTasks = append(errTasks, task) 191 } else { 192 db.recorder.Eventf(task.Pod, v1.EventTypeNormal, "Scheduled", "Successfully assigned %v/%v to %v", task.Namespace, task.Name, task.NodeName) 193 metrics.UpdateTaskScheduleDuration(metrics.Duration(p.CreationTimestamp.Time)) // update metrics as soon as pod is bind 194 } 195 } 196 197 if len(errTasks) > 0 { 198 return errTasks, fmt.Errorf("failed to bind pods") 199 } 200 201 return nil, nil 202 } 203 204 // NewDefaultBinder create binder with kube client and event recorder, support fake binder if passed fake client and fake event recorder 205 func NewDefaultBinder(kbclient kubernetes.Interface, record record.EventRecorder) *DefaultBinder { 206 return &DefaultBinder{ 207 kubeclient: kbclient, 208 recorder: record, 209 } 210 } 211 212 type defaultEvictor struct { 213 kubeclient kubernetes.Interface 214 recorder record.EventRecorder 215 } 216 217 // Evict will send delete pod request to api server 218 func (de *defaultEvictor) Evict(p *v1.Pod, reason string) error { 219 klog.V(3).Infof("Evicting pod %v/%v, because of %v", p.Namespace, p.Name, reason) 220 221 evictMsg := fmt.Sprintf("Pod is evicted, because of %v", reason) 222 annotations := map[string]string{} 223 // record that we are evicting the pod 224 de.recorder.AnnotatedEventf(p, annotations, v1.EventTypeWarning, "Evict", evictMsg) 225 226 pod := p.DeepCopy() 227 condition := &v1.PodCondition{ 228 Type: v1.PodReady, 229 Status: v1.ConditionFalse, 230 Reason: "Evict", 231 Message: evictMsg, 232 } 233 if !podutil.UpdatePodCondition(&pod.Status, condition) { 234 klog.V(1).Infof("UpdatePodCondition: existed condition, not update") 235 klog.V(1).Infof("%+v", pod.Status.Conditions) 236 return nil 237 } 238 if _, err := de.kubeclient.CoreV1().Pods(p.Namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{}); err != nil { 239 klog.Errorf("Failed to update pod <%v/%v> status: %v", pod.Namespace, pod.Name, err) 240 return err 241 } 242 if err := de.kubeclient.CoreV1().Pods(p.Namespace).Delete(context.TODO(), p.Name, metav1.DeleteOptions{}); err != nil { 243 klog.Errorf("Failed to evict pod <%v/%v>: %#v", p.Namespace, p.Name, err) 244 return err 245 } 246 247 return nil 248 } 249 250 // defaultStatusUpdater is the default implementation of the StatusUpdater interface 251 type defaultStatusUpdater struct { 252 kubeclient kubernetes.Interface 253 vcclient vcclient.Interface 254 } 255 256 // following the same logic as podutil.UpdatePodCondition 257 func podConditionHaveUpdate(status *v1.PodStatus, condition *v1.PodCondition) bool { 258 lastTransitionTime := metav1.Now() 259 // Try to find this pod condition. 260 _, oldCondition := podutil.GetPodCondition(status, condition.Type) 261 262 if oldCondition == nil { 263 // We are adding new pod condition. 264 return true 265 } 266 // We are updating an existing condition, so we need to check if it has changed. 267 if condition.Status == oldCondition.Status { 268 lastTransitionTime = oldCondition.LastTransitionTime 269 } 270 271 isEqual := condition.Status == oldCondition.Status && 272 condition.Reason == oldCondition.Reason && 273 condition.Message == oldCondition.Message && 274 condition.LastProbeTime.Equal(&oldCondition.LastProbeTime) && 275 lastTransitionTime.Equal(&oldCondition.LastTransitionTime) 276 277 // Return true if one of the fields have changed. 278 return !isEqual 279 } 280 281 // UpdatePodCondition will Update pod with podCondition 282 func (su *defaultStatusUpdater) UpdatePodCondition(pod *v1.Pod, condition *v1.PodCondition) (*v1.Pod, error) { 283 klog.V(3).Infof("Updating pod condition for %s/%s to (%s==%s)", pod.Namespace, pod.Name, condition.Type, condition.Status) 284 if podutil.UpdatePodCondition(&pod.Status, condition) { 285 return su.kubeclient.CoreV1().Pods(pod.Namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{}) 286 } 287 return pod, nil 288 } 289 290 // UpdatePodGroup will Update pod with podCondition 291 func (su *defaultStatusUpdater) UpdatePodGroup(pg *schedulingapi.PodGroup) (*schedulingapi.PodGroup, error) { 292 podgroup := &vcv1beta1.PodGroup{} 293 if err := schedulingscheme.Scheme.Convert(&pg.PodGroup, podgroup, nil); err != nil { 294 klog.Errorf("Error while converting PodGroup to v1alpha1.PodGroup with error: %v", err) 295 return nil, err 296 } 297 298 updated, err := su.vcclient.SchedulingV1beta1().PodGroups(podgroup.Namespace).Update(context.TODO(), podgroup, metav1.UpdateOptions{}) 299 if err != nil { 300 klog.Errorf("Error while updating PodGroup with error: %v", err) 301 return nil, err 302 } 303 304 podGroupInfo := &schedulingapi.PodGroup{Version: schedulingapi.PodGroupVersionV1Beta1} 305 if err := schedulingscheme.Scheme.Convert(updated, &podGroupInfo.PodGroup, nil); err != nil { 306 klog.Errorf("Error while converting v1alpha.PodGroup to api.PodGroup with error: %v", err) 307 return nil, err 308 } 309 310 return podGroupInfo, nil 311 } 312 313 // UpdateQueueStatus will update the status of queue 314 func (su *defaultStatusUpdater) UpdateQueueStatus(queue *schedulingapi.QueueInfo) error { 315 var newQueue = &vcv1beta1.Queue{} 316 if err := schedulingscheme.Scheme.Convert(queue.Queue, newQueue, nil); err != nil { 317 klog.Errorf("error occurred in converting scheduling.Queue to v1beta1.Queue: %s", err.Error()) 318 return err 319 } 320 321 _, err := su.vcclient.SchedulingV1beta1().Queues().UpdateStatus(context.TODO(), newQueue, metav1.UpdateOptions{}) 322 if err != nil { 323 klog.Errorf("error occurred in updating Queue <%s>: %s", newQueue.Name, err.Error()) 324 return err 325 } 326 return nil 327 } 328 329 type defaultVolumeBinder struct { 330 volumeBinder volumescheduling.SchedulerVolumeBinder 331 } 332 333 // AllocateVolumes allocates volume on the host to the task 334 func (dvb *defaultVolumeBinder) AllocateVolumes(task *schedulingapi.TaskInfo, hostname string, podVolumes *volumescheduling.PodVolumes) error { 335 logger := klog.FromContext(context.TODO()) 336 allBound, err := dvb.volumeBinder.AssumePodVolumes(logger, task.Pod, hostname, podVolumes) 337 task.VolumeReady = allBound 338 339 return err 340 } 341 342 // RevertVolumes clean cache generated by AllocateVolumes 343 func (dvb *defaultVolumeBinder) RevertVolumes(task *schedulingapi.TaskInfo, podVolumes *volumescheduling.PodVolumes) { 344 if podVolumes != nil { 345 klog.Infof("Revert assumed volumes for task %v/%v on node %s", task.Namespace, task.Name, task.NodeName) 346 dvb.volumeBinder.RevertAssumedPodVolumes(podVolumes) 347 task.VolumeReady = false 348 task.PodVolumes = nil 349 } 350 } 351 352 // GetPodVolumes get pod volume on the host 353 func (dvb *defaultVolumeBinder) GetPodVolumes(task *schedulingapi.TaskInfo, 354 node *v1.Node) (podVolumes *volumescheduling.PodVolumes, err error) { 355 logger := klog.FromContext(context.TODO()) 356 podVolumeClaims, err := dvb.volumeBinder.GetPodVolumeClaims(logger, task.Pod) 357 if err != nil { 358 return nil, err 359 } 360 // if len(unboundClaimsImmediate) > 0 { 361 // return nil, fmt.Errorf("pod has unbound immediate PersistentVolumeClaims") 362 // } 363 364 podVolumes, reasons, err := dvb.volumeBinder.FindPodVolumes(logger, task.Pod, podVolumeClaims, node) 365 if err != nil { 366 return nil, err 367 } else if len(reasons) > 0 { 368 var errors []string 369 for _, reason := range reasons { 370 errors = append(errors, string(reason)) 371 } 372 return nil, fmt.Errorf(strings.Join(errors, ",")) 373 } 374 375 return podVolumes, err 376 } 377 378 // BindVolumes binds volumes to the task 379 func (dvb *defaultVolumeBinder) BindVolumes(task *schedulingapi.TaskInfo, podVolumes *volumescheduling.PodVolumes) error { 380 // If task's volumes are ready, did not bind them again. 381 if task.VolumeReady { 382 return nil 383 } 384 385 return dvb.volumeBinder.BindPodVolumes(context.TODO(), task.Pod, podVolumes) 386 } 387 388 type podgroupBinder struct { 389 kubeclient kubernetes.Interface 390 vcclient vcclient.Interface 391 } 392 393 // Bind will add silo cluster annotaion on pod and podgroup 394 func (pgb *podgroupBinder) Bind(job *schedulingapi.JobInfo, cluster string) (*schedulingapi.JobInfo, error) { 395 if len(job.Tasks) == 0 { 396 klog.V(4).Infof("Job pods have not been created yet") 397 return job, nil 398 } 399 for _, task := range job.Tasks { 400 pod := task.Pod 401 pod.Annotations[batch.ForwardClusterKey] = cluster 402 pod.ResourceVersion = "" 403 _, err := pgb.kubeclient.CoreV1().Pods(pod.Namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{}) 404 if err != nil { 405 klog.Errorf("Error while update pod annotation with error: %v", err) 406 return nil, err 407 } 408 } 409 410 pg := job.PodGroup 411 pg.Annotations[batch.ForwardClusterKey] = cluster 412 podgroup := &vcv1beta1.PodGroup{} 413 if err := schedulingscheme.Scheme.Convert(&pg.PodGroup, podgroup, nil); err != nil { 414 klog.Errorf("Error while converting PodGroup to v1alpha1.PodGroup with error: %v", err) 415 return nil, err 416 } 417 newPg, err := pgb.vcclient.SchedulingV1beta1().PodGroups(pg.Namespace).Update(context.TODO(), podgroup, metav1.UpdateOptions{}) 418 if err != nil { 419 klog.Errorf("Error while update PodGroup annotation with error: %v", err) 420 return nil, err 421 } 422 job.PodGroup.ResourceVersion = newPg.ResourceVersion 423 klog.V(4).Infof("Bind PodGroup <%s> successfully", job.PodGroup.Name) 424 return job, nil 425 } 426 427 // updateNodeSelectors parse and update node selector key value pairs to schedule cache 428 func (sc *SchedulerCache) updateNodeSelectors(nodeSelectors []string) { 429 for _, nodeSelectorLabel := range nodeSelectors { 430 nodeSelectorLabelLen := len(nodeSelectorLabel) 431 if nodeSelectorLabelLen <= 0 { 432 continue 433 } 434 // check input 435 index := strings.Index(nodeSelectorLabel, ":") 436 if index < 0 || index >= (nodeSelectorLabelLen-1) { 437 continue 438 } 439 nodeSelectorLabelName := strings.TrimSpace(nodeSelectorLabel[:index]) 440 nodeSelectorLabelValue := strings.TrimSpace(nodeSelectorLabel[index+1:]) 441 key := nodeSelectorLabelName + ":" + nodeSelectorLabelValue 442 sc.nodeSelectorLabels[key] = "" 443 } 444 } 445 446 // setBatchBindParallel configure the parallel when binding tasks to apiserver 447 func (sc *SchedulerCache) setBatchBindParallel() { 448 sc.BindFlowChannel = make(chan *schedulingapi.TaskInfo, 5000) 449 var batchNum int 450 batchNum, err := strconv.Atoi(os.Getenv("BATCH_BIND_NUM")) 451 if err == nil && batchNum > 0 { 452 sc.batchNum = batchNum 453 } else { 454 sc.batchNum = 1 455 } 456 } 457 458 func (sc *SchedulerCache) setDefaultVolumeBinder() { 459 logger := klog.FromContext(context.TODO()) 460 var capacityCheck *volumescheduling.CapacityCheck 461 if options.ServerOpts != nil && options.ServerOpts.EnableCSIStorage && utilfeature.DefaultFeatureGate.Enabled(features.CSIStorage) { 462 capacityCheck = &volumescheduling.CapacityCheck{ 463 CSIDriverInformer: sc.csiDriverInformer, 464 CSIStorageCapacityInformer: sc.csiStorageCapacityInformer, 465 } 466 } 467 sc.VolumeBinder = &defaultVolumeBinder{ 468 volumeBinder: volumescheduling.NewVolumeBinder( 469 logger, 470 sc.kubeClient, 471 sc.podInformer, 472 sc.nodeInformer, 473 sc.csiNodeInformer, 474 sc.pvcInformer, 475 sc.pvInformer, 476 sc.scInformer, 477 capacityCheck, 478 30*time.Second, 479 ), 480 } 481 } 482 483 // newDefaultQueue init default queue 484 func newDefaultQueue(vcClient vcclient.Interface, defaultQueue string) { 485 reclaimable := true 486 defaultQue := vcv1beta1.Queue{ 487 ObjectMeta: metav1.ObjectMeta{ 488 Name: defaultQueue, 489 }, 490 Spec: vcv1beta1.QueueSpec{ 491 Reclaimable: &reclaimable, 492 Weight: 1, 493 }, 494 } 495 496 err := retry.OnError(wait.Backoff{ 497 Steps: 60, 498 Duration: time.Second, 499 Factor: 1, 500 Jitter: 0.1, 501 }, func(err error) bool { 502 return !apierrors.IsAlreadyExists(err) 503 }, func() error { 504 _, err := vcClient.SchedulingV1beta1().Queues().Create(context.TODO(), &defaultQue, metav1.CreateOptions{}) 505 return err 506 }) 507 if err != nil && !apierrors.IsAlreadyExists(err) { 508 panic(fmt.Errorf("failed init default queue, with err: %v", err)) 509 } 510 } 511 512 func newSchedulerCache(config *rest.Config, schedulerNames []string, defaultQueue string, nodeSelectors []string, nodeWorkers uint32, ignoredProvisioners []string) *SchedulerCache { 513 kubeClient, err := kubernetes.NewForConfig(config) 514 if err != nil { 515 panic(fmt.Sprintf("failed init kubeClient, with err: %v", err)) 516 } 517 vcClient, err := vcclient.NewForConfig(config) 518 if err != nil { 519 panic(fmt.Sprintf("failed init vcClient, with err: %v", err)) 520 } 521 eventClient, err := kubernetes.NewForConfig(config) 522 if err != nil { 523 panic(fmt.Sprintf("failed init eventClient, with err: %v", err)) 524 } 525 526 // create default queue 527 newDefaultQueue(vcClient, defaultQueue) 528 klog.Infof("Create init queue named default") 529 530 errTaskRateLimiter := workqueue.NewMaxOfRateLimiter( 531 workqueue.NewItemExponentialFailureRateLimiter(5*time.Millisecond, 1000*time.Second), 532 &workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(100), 1000)}, 533 ) 534 535 sc := &SchedulerCache{ 536 Jobs: make(map[schedulingapi.JobID]*schedulingapi.JobInfo), 537 Nodes: make(map[string]*schedulingapi.NodeInfo), 538 Queues: make(map[schedulingapi.QueueID]*schedulingapi.QueueInfo), 539 PriorityClasses: make(map[string]*schedulingv1.PriorityClass), 540 errTasks: workqueue.NewRateLimitingQueue(errTaskRateLimiter), 541 nodeQueue: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()), 542 DeletedJobs: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()), 543 kubeClient: kubeClient, 544 vcClient: vcClient, 545 restConfig: config, 546 defaultQueue: defaultQueue, 547 schedulerNames: schedulerNames, 548 nodeSelectorLabels: make(map[string]string), 549 NamespaceCollection: make(map[string]*schedulingapi.NamespaceCollection), 550 CSINodesStatus: make(map[string]*schedulingapi.CSINodeStatusInfo), 551 imageStates: make(map[string]*imageState), 552 553 NodeList: []string{}, 554 nodeWorkers: nodeWorkers, 555 } 556 557 ignoredProvisionersSet := sets.New[string]() 558 for _, provisioner := range append(ignoredProvisioners, defaultIgnoredProvisioners...) { 559 ignoredProvisionersSet.Insert(provisioner) 560 } 561 sc.IgnoredCSIProvisioners = ignoredProvisionersSet 562 563 if len(nodeSelectors) > 0 { 564 sc.updateNodeSelectors(nodeSelectors) 565 } 566 // Prepare event clients. 567 broadcaster := record.NewBroadcaster() 568 broadcaster.StartRecordingToSink(&corev1.EventSinkImpl{Interface: eventClient.CoreV1().Events("")}) 569 sc.Recorder = broadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: commonutil.GenerateComponentName(sc.schedulerNames)}) 570 571 // set concurrency configuration when binding 572 sc.setBatchBindParallel() 573 if bindMethodMap == nil { 574 klog.V(3).Info("no registered bind method, new a default one") 575 bindMethodMap = NewDefaultBinder(sc.kubeClient, sc.Recorder) 576 } 577 sc.Binder = GetBindMethod() 578 579 sc.Evictor = &defaultEvictor{ 580 kubeclient: sc.kubeClient, 581 recorder: sc.Recorder, 582 } 583 584 sc.StatusUpdater = &defaultStatusUpdater{ 585 kubeclient: sc.kubeClient, 586 vcclient: sc.vcClient, 587 } 588 589 sc.PodGroupBinder = &podgroupBinder{ 590 kubeclient: sc.kubeClient, 591 vcclient: sc.vcClient, 592 } 593 594 // add all events handlers 595 sc.addEventHandler() 596 // finally, init default volume binder which has dependencies on other informers 597 sc.setDefaultVolumeBinder() 598 return sc 599 } 600 601 func (sc *SchedulerCache) addEventHandler() { 602 informerFactory := informers.NewSharedInformerFactory(sc.kubeClient, 0) 603 sc.informerFactory = informerFactory 604 mySchedulerPodName, c := getMultiSchedulerInfo() 605 606 // explicitly register informers to the factory, otherwise resources listers cannot get anything 607 // even with no error returned. 608 // `Namespace` informer is used by `InterPodAffinity` plugin, 609 // `SelectorSpread` and `PodTopologySpread` plugins uses the following four so far. 610 informerFactory.Core().V1().Namespaces().Informer() 611 informerFactory.Core().V1().Services().Informer() 612 if utilfeature.DefaultFeatureGate.Enabled(features.WorkLoadSupport) { 613 informerFactory.Core().V1().ReplicationControllers().Informer() 614 informerFactory.Apps().V1().ReplicaSets().Informer() 615 informerFactory.Apps().V1().StatefulSets().Informer() 616 } 617 618 // `PodDisruptionBudgets` informer is used by `Pdb` plugin 619 if utilfeature.DefaultFeatureGate.Enabled(features.PodDisruptionBudgetsSupport) { 620 informerFactory.Policy().V1().PodDisruptionBudgets().Informer() 621 } 622 623 // create informer for node information 624 sc.nodeInformer = informerFactory.Core().V1().Nodes() 625 sc.nodeInformer.Informer().AddEventHandlerWithResyncPeriod( 626 cache.FilteringResourceEventHandler{ 627 FilterFunc: func(obj interface{}) bool { 628 var node *v1.Node 629 switch t := obj.(type) { 630 case *v1.Node: 631 node = t 632 case cache.DeletedFinalStateUnknown: 633 var ok bool 634 node, ok = t.Obj.(*v1.Node) 635 if !ok { 636 klog.Errorf("Cannot convert to *v1.Node: %v", t.Obj) 637 return false 638 } 639 default: 640 return false 641 } 642 643 if !responsibleForNode(node.Name, mySchedulerPodName, c) { 644 return false 645 } 646 if len(sc.nodeSelectorLabels) == 0 { 647 return true 648 } 649 for labelName, labelValue := range node.Labels { 650 key := labelName + ":" + labelValue 651 if _, ok := sc.nodeSelectorLabels[key]; ok { 652 return true 653 } 654 } 655 klog.Infof("node %s ignore add/update/delete into schedulerCache", node.Name) 656 return false 657 }, 658 Handler: cache.ResourceEventHandlerFuncs{ 659 AddFunc: sc.AddNode, 660 UpdateFunc: sc.UpdateNode, 661 DeleteFunc: sc.DeleteNode, 662 }, 663 }, 664 0, 665 ) 666 667 sc.podInformer = informerFactory.Core().V1().Pods() 668 sc.pvcInformer = informerFactory.Core().V1().PersistentVolumeClaims() 669 sc.pvInformer = informerFactory.Core().V1().PersistentVolumes() 670 sc.scInformer = informerFactory.Storage().V1().StorageClasses() 671 sc.csiNodeInformer = informerFactory.Storage().V1().CSINodes() 672 sc.csiNodeInformer.Informer().AddEventHandler( 673 cache.ResourceEventHandlerFuncs{ 674 AddFunc: sc.AddOrUpdateCSINode, 675 UpdateFunc: sc.UpdateCSINode, 676 DeleteFunc: sc.DeleteCSINode, 677 }, 678 ) 679 680 if options.ServerOpts != nil && options.ServerOpts.EnableCSIStorage && utilfeature.DefaultFeatureGate.Enabled(features.CSIStorage) { 681 sc.csiDriverInformer = informerFactory.Storage().V1().CSIDrivers() 682 sc.csiStorageCapacityInformer = informerFactory.Storage().V1beta1().CSIStorageCapacities() 683 } 684 685 // create informer for pod information 686 sc.podInformer.Informer().AddEventHandler( 687 cache.FilteringResourceEventHandler{ 688 FilterFunc: func(obj interface{}) bool { 689 switch v := obj.(type) { 690 case *v1.Pod: 691 if !responsibleForPod(v, sc.schedulerNames, mySchedulerPodName, c) { 692 if len(v.Spec.NodeName) == 0 { 693 return false 694 } 695 if !responsibleForNode(v.Spec.NodeName, mySchedulerPodName, c) { 696 return false 697 } 698 } 699 return true 700 case cache.DeletedFinalStateUnknown: 701 if _, ok := v.Obj.(*v1.Pod); ok { 702 // The carried object may be stale, always pass to clean up stale obj in event handlers. 703 return true 704 } 705 klog.Errorf("Cannot convert object %T to *v1.Pod", v.Obj) 706 return false 707 default: 708 return false 709 } 710 }, 711 Handler: cache.ResourceEventHandlerFuncs{ 712 AddFunc: sc.AddPod, 713 UpdateFunc: sc.UpdatePod, 714 DeleteFunc: sc.DeletePod, 715 }, 716 }) 717 718 if options.ServerOpts != nil && options.ServerOpts.EnablePriorityClass && utilfeature.DefaultFeatureGate.Enabled(features.PriorityClass) { 719 sc.pcInformer = informerFactory.Scheduling().V1().PriorityClasses() 720 sc.pcInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 721 AddFunc: sc.AddPriorityClass, 722 UpdateFunc: sc.UpdatePriorityClass, 723 DeleteFunc: sc.DeletePriorityClass, 724 }) 725 } 726 727 sc.quotaInformer = informerFactory.Core().V1().ResourceQuotas() 728 sc.quotaInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 729 AddFunc: sc.AddResourceQuota, 730 UpdateFunc: sc.UpdateResourceQuota, 731 DeleteFunc: sc.DeleteResourceQuota, 732 }) 733 734 vcinformers := vcinformer.NewSharedInformerFactory(sc.vcClient, 0) 735 sc.vcInformerFactory = vcinformers 736 737 // create informer for PodGroup(v1beta1) information 738 sc.podGroupInformerV1beta1 = vcinformers.Scheduling().V1beta1().PodGroups() 739 sc.podGroupInformerV1beta1.Informer().AddEventHandler( 740 cache.FilteringResourceEventHandler{ 741 FilterFunc: func(obj interface{}) bool { 742 var pg *vcv1beta1.PodGroup 743 switch v := obj.(type) { 744 case *vcv1beta1.PodGroup: 745 pg = v 746 case cache.DeletedFinalStateUnknown: 747 var ok bool 748 pg, ok = v.Obj.(*vcv1beta1.PodGroup) 749 if !ok { 750 klog.Errorf("Cannot convert to podgroup: %v", v.Obj) 751 return false 752 } 753 default: 754 return false 755 } 756 757 return responsibleForPodGroup(pg, mySchedulerPodName, c) 758 }, 759 Handler: cache.ResourceEventHandlerFuncs{ 760 AddFunc: sc.AddPodGroupV1beta1, 761 UpdateFunc: sc.UpdatePodGroupV1beta1, 762 DeleteFunc: sc.DeletePodGroupV1beta1, 763 }, 764 }) 765 766 // create informer(v1beta1) for Queue information 767 sc.queueInformerV1beta1 = vcinformers.Scheduling().V1beta1().Queues() 768 sc.queueInformerV1beta1.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 769 AddFunc: sc.AddQueueV1beta1, 770 UpdateFunc: sc.UpdateQueueV1beta1, 771 DeleteFunc: sc.DeleteQueueV1beta1, 772 }) 773 774 if utilfeature.DefaultFeatureGate.Enabled(features.ResourceTopology) { 775 sc.cpuInformer = vcinformers.Nodeinfo().V1alpha1().Numatopologies() 776 sc.cpuInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 777 AddFunc: sc.AddNumaInfoV1alpha1, 778 UpdateFunc: sc.UpdateNumaInfoV1alpha1, 779 DeleteFunc: sc.DeleteNumaInfoV1alpha1, 780 }) 781 } 782 } 783 784 // Run starts the schedulerCache 785 func (sc *SchedulerCache) Run(stopCh <-chan struct{}) { 786 sc.informerFactory.Start(stopCh) 787 sc.vcInformerFactory.Start(stopCh) 788 sc.WaitForCacheSync(stopCh) 789 for i := 0; i < int(sc.nodeWorkers); i++ { 790 go wait.Until(sc.runNodeWorker, 0, stopCh) 791 } 792 793 // Re-sync error tasks. 794 go wait.Until(sc.processResyncTask, 0, stopCh) 795 796 // Cleanup jobs. 797 go wait.Until(sc.processCleanupJob, 0, stopCh) 798 799 go wait.Until(sc.processBindTask, time.Millisecond*20, stopCh) 800 801 // Get metrics data 802 klog.V(3).Infof("Start metrics collection, metricsConf is %v", sc.metricsConf) 803 interval, err := time.ParseDuration(sc.metricsConf["interval"]) 804 if err != nil || interval <= 0 { 805 interval = defaultMetricsInternal 806 } 807 klog.V(3).Infof("The interval for querying metrics data is %v", interval) 808 go wait.Until(sc.GetMetricsData, interval, stopCh) 809 } 810 811 // WaitForCacheSync sync the cache with the api server 812 func (sc *SchedulerCache) WaitForCacheSync(stopCh <-chan struct{}) { 813 sc.informerFactory.WaitForCacheSync(stopCh) 814 sc.vcInformerFactory.WaitForCacheSync(stopCh) 815 } 816 817 // findJobAndTask returns job and the task info 818 func (sc *SchedulerCache) findJobAndTask(taskInfo *schedulingapi.TaskInfo) (*schedulingapi.JobInfo, *schedulingapi.TaskInfo, error) { 819 job, found := sc.Jobs[taskInfo.Job] 820 if !found { 821 return nil, nil, fmt.Errorf("failed to find Job %v for Task %v", 822 taskInfo.Job, taskInfo.UID) 823 } 824 825 task, found := job.Tasks[taskInfo.UID] 826 if !found { 827 return nil, nil, fmt.Errorf("failed to find task in status %v by id %v", 828 taskInfo.Status, taskInfo.UID) 829 } 830 831 return job, task, nil 832 } 833 834 // Evict will evict the pod. 835 // 836 // If error occurs both task and job are guaranteed to be in the original state. 837 func (sc *SchedulerCache) Evict(taskInfo *schedulingapi.TaskInfo, reason string) error { 838 sc.Mutex.Lock() 839 defer sc.Mutex.Unlock() 840 841 job, task, err := sc.findJobAndTask(taskInfo) 842 843 if err != nil { 844 return err 845 } 846 847 node, found := sc.Nodes[task.NodeName] 848 if !found { 849 return fmt.Errorf("failed to bind Task %v to host %v, host does not exist", 850 task.UID, task.NodeName) 851 } 852 853 originalStatus := task.Status 854 if err := job.UpdateTaskStatus(task, schedulingapi.Releasing); err != nil { 855 return err 856 } 857 858 // Add new task to node. 859 if err := node.UpdateTask(task); err != nil { 860 // After failing to update task to a node we need to revert task status from Releasing, 861 // otherwise task might be stuck in the Releasing state indefinitely. 862 if err := job.UpdateTaskStatus(task, originalStatus); err != nil { 863 klog.Errorf("Task <%s/%s> will be resynchronized after failing to revert status "+ 864 "from %s to %s after failing to update Task on Node <%s>: %v", 865 task.Namespace, task.Name, task.Status, originalStatus, node.Name, err) 866 sc.resyncTask(task) 867 } 868 return err 869 } 870 871 p := task.Pod 872 873 go func() { 874 err := sc.Evictor.Evict(p, reason) 875 if err != nil { 876 sc.resyncTask(task) 877 } 878 }() 879 880 podgroup := &vcv1beta1.PodGroup{} 881 if job.PodGroup != nil { 882 err = schedulingscheme.Scheme.Convert(&job.PodGroup.PodGroup, podgroup, nil) 883 } else { 884 err = fmt.Errorf("the PodGroup of Job <%s/%s> is nil", job.Namespace, job.Name) 885 } 886 887 if err != nil { 888 klog.Errorf("Error while converting PodGroup to v1alpha1.PodGroup with error: %v", err) 889 return err 890 } 891 sc.Recorder.Eventf(podgroup, v1.EventTypeNormal, "Evict", reason) 892 return nil 893 } 894 895 // Bind binds task to the target host. 896 func (sc *SchedulerCache) Bind(tasks []*schedulingapi.TaskInfo) { 897 tmp := time.Now() 898 errTasks, err := sc.Binder.Bind(sc.kubeClient, tasks) 899 if err == nil { 900 klog.V(3).Infof("bind ok, latency %v", time.Since(tmp)) 901 } else { 902 for _, task := range errTasks { 903 klog.V(2).Infof("resyncTask task %s", task.Name) 904 sc.VolumeBinder.RevertVolumes(task, task.PodVolumes) 905 sc.resyncTask(task) 906 } 907 } 908 } 909 910 // BindPodGroup binds job to silo cluster 911 func (sc *SchedulerCache) BindPodGroup(job *schedulingapi.JobInfo, cluster string) error { 912 if _, err := sc.PodGroupBinder.Bind(job, cluster); err != nil { 913 klog.Errorf("Bind job <%s> to cluster <%s> failed: %v", job.Name, cluster, err) 914 return err 915 } 916 return nil 917 } 918 919 // GetPodVolumes get pod volume on the host 920 func (sc *SchedulerCache) GetPodVolumes(task *schedulingapi.TaskInfo, node *v1.Node) (*volumescheduling.PodVolumes, error) { 921 return sc.VolumeBinder.GetPodVolumes(task, node) 922 } 923 924 // AllocateVolumes allocates volume on the host to the task 925 func (sc *SchedulerCache) AllocateVolumes(task *schedulingapi.TaskInfo, hostname string, podVolumes *volumescheduling.PodVolumes) error { 926 return sc.VolumeBinder.AllocateVolumes(task, hostname, podVolumes) 927 } 928 929 // BindVolumes binds volumes to the task 930 func (sc *SchedulerCache) BindVolumes(task *schedulingapi.TaskInfo, podVolumes *volumescheduling.PodVolumes) error { 931 return sc.VolumeBinder.BindVolumes(task, podVolumes) 932 } 933 934 // RevertVolumes clean cache generated by AllocateVolumes 935 func (sc *SchedulerCache) RevertVolumes(task *schedulingapi.TaskInfo, podVolumes *volumescheduling.PodVolumes) { 936 sc.VolumeBinder.RevertVolumes(task, podVolumes) 937 } 938 939 // Client returns the kubernetes clientSet 940 func (sc *SchedulerCache) Client() kubernetes.Interface { 941 return sc.kubeClient 942 } 943 944 // ClientConfig returns the rest config 945 func (sc *SchedulerCache) ClientConfig() *rest.Config { 946 return sc.restConfig 947 } 948 949 // SharedInformerFactory returns the scheduler SharedInformerFactory 950 func (sc *SchedulerCache) SharedInformerFactory() informers.SharedInformerFactory { 951 return sc.informerFactory 952 } 953 954 // SetSharedInformerFactory sets the scheduler SharedInformerFactory for unit test 955 func (sc *SchedulerCache) SetSharedInformerFactory(factory informers.SharedInformerFactory) { 956 sc.informerFactory = factory 957 } 958 959 // UpdateSchedulerNumaInfo used to update scheduler node cache NumaSchedulerInfo 960 func (sc *SchedulerCache) UpdateSchedulerNumaInfo(AllocatedSets map[string]schedulingapi.ResNumaSets) error { 961 sc.Mutex.Lock() 962 defer sc.Mutex.Unlock() 963 964 for nodeName, sets := range AllocatedSets { 965 if _, found := sc.Nodes[nodeName]; !found { 966 continue 967 } 968 969 numaInfo := sc.Nodes[nodeName].NumaSchedulerInfo 970 if numaInfo == nil { 971 continue 972 } 973 974 numaInfo.Allocate(sets) 975 } 976 return nil 977 } 978 979 // EventRecorder returns the Event Recorder 980 func (sc *SchedulerCache) EventRecorder() record.EventRecorder { 981 return sc.Recorder 982 } 983 984 // taskUnschedulable updates pod status of pending task 985 func (sc *SchedulerCache) taskUnschedulable(task *schedulingapi.TaskInfo, reason, message string) error { 986 pod := task.Pod 987 988 condition := &v1.PodCondition{ 989 Type: v1.PodScheduled, 990 Status: v1.ConditionFalse, 991 Reason: reason, // Add more reasons in order to distinguish more specific scenario of pending tasks 992 Message: message, 993 } 994 995 if podConditionHaveUpdate(&pod.Status, condition) { 996 pod = pod.DeepCopy() 997 998 // The reason field in 'Events' should be "FailedScheduling", there is not constants defined for this in 999 // k8s core, so using the same string here. 1000 // The reason field in PodCondition can be "Unschedulable" 1001 sc.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", message) 1002 if _, err := sc.StatusUpdater.UpdatePodCondition(pod, condition); err != nil { 1003 return err 1004 } 1005 } else { 1006 klog.V(4).Infof("task unscheduleable %s/%s, message: %s, skip by no condition update", pod.Namespace, pod.Name, message) 1007 } 1008 1009 return nil 1010 } 1011 1012 func (sc *SchedulerCache) deleteJob(job *schedulingapi.JobInfo) { 1013 klog.V(3).Infof("Try to delete Job <%v:%v/%v>", job.UID, job.Namespace, job.Name) 1014 1015 sc.DeletedJobs.Add(job) 1016 } 1017 1018 func (sc *SchedulerCache) retryDeleteJob(job *schedulingapi.JobInfo) { 1019 klog.V(3).Infof("Retry to delete Job <%v:%v/%v>", job.UID, job.Namespace, job.Name) 1020 1021 sc.DeletedJobs.AddRateLimited(job) 1022 } 1023 1024 func (sc *SchedulerCache) processCleanupJob() { 1025 obj, shutdown := sc.DeletedJobs.Get() 1026 if shutdown { 1027 return 1028 } 1029 1030 defer sc.DeletedJobs.Done(obj) 1031 1032 job, found := obj.(*schedulingapi.JobInfo) 1033 if !found { 1034 klog.Errorf("Failed to convert <%v> to *JobInfo", obj) 1035 return 1036 } 1037 1038 sc.Mutex.Lock() 1039 defer sc.Mutex.Unlock() 1040 1041 if schedulingapi.JobTerminated(job) { 1042 oldJob, found := sc.Jobs[job.UID] 1043 if !found { 1044 klog.V(3).Infof("Failed to find Job <%v:%v/%v>, ignore it", job.UID, job.Namespace, job.Name) 1045 sc.DeletedJobs.Forget(obj) 1046 return 1047 } 1048 newPgVersion := oldJob.PgUID 1049 oldPgVersion := job.PgUID 1050 klog.V(5).Infof("Just add pguid:%v, try to delete pguid:%v", newPgVersion, oldPgVersion) 1051 if oldPgVersion == newPgVersion { 1052 delete(sc.Jobs, job.UID) 1053 metrics.DeleteJobMetrics(job.Name, string(job.Queue), job.Namespace) 1054 klog.V(3).Infof("Job <%v:%v/%v> was deleted.", job.UID, job.Namespace, job.Name) 1055 } 1056 sc.DeletedJobs.Forget(obj) 1057 } else { 1058 // Retry 1059 sc.retryDeleteJob(job) 1060 } 1061 } 1062 1063 func (sc *SchedulerCache) resyncTask(task *schedulingapi.TaskInfo) { 1064 key := sc.generateErrTaskKey(task) 1065 sc.errTasks.AddRateLimited(key) 1066 } 1067 1068 func (sc *SchedulerCache) generateErrTaskKey(task *schedulingapi.TaskInfo) string { 1069 // Job UID is namespace + / +name, for example: theNs/theJob 1070 // Task UID is derived from the Pod UID, for example: d336abea-4f14-42c7-8a6b-092959a31407 1071 // In the example above, the key ultimately becomes: theNs/theJob/d336abea-4f14-42c7-8a6b-092959a31407 1072 return fmt.Sprintf("%s/%s", task.Job, task.UID) 1073 } 1074 1075 func (sc *SchedulerCache) parseErrTaskKey(key string) (*schedulingapi.TaskInfo, error) { 1076 i := strings.LastIndex(key, "/") 1077 if i == -1 { 1078 return nil, fmt.Errorf("failed to split task key %s", key) 1079 } 1080 1081 jobUID := key[:i] 1082 taskUID := key[i+1:] 1083 1084 sc.Mutex.Lock() 1085 defer sc.Mutex.Unlock() 1086 1087 job, found := sc.Jobs[schedulingapi.JobID(jobUID)] 1088 if !found { 1089 return nil, fmt.Errorf("failed to find job %s", jobUID) 1090 } 1091 1092 task, found := job.Tasks[schedulingapi.TaskID(taskUID)] 1093 if !found { 1094 return nil, fmt.Errorf("failed to find task %s", taskUID) 1095 } 1096 1097 return task, nil 1098 } 1099 1100 func (sc *SchedulerCache) processResyncTask() { 1101 obj, shutdown := sc.errTasks.Get() 1102 if shutdown { 1103 return 1104 } 1105 1106 klog.V(5).Infof("the length of errTasks is %d", sc.errTasks.Len()) 1107 1108 defer sc.errTasks.Done(obj) 1109 1110 taskKey, ok := obj.(string) 1111 if !ok { 1112 klog.Errorf("Failed to convert %v to string.", obj) 1113 sc.errTasks.Forget(obj) 1114 return 1115 } 1116 1117 task, err := sc.parseErrTaskKey(taskKey) 1118 if err != nil { 1119 klog.ErrorS(err, "Failed to get task for sync task", "taskKey", taskKey) 1120 sc.errTasks.Forget(obj) 1121 return 1122 } 1123 1124 reSynced := false 1125 if err := sc.syncTask(task); err != nil { 1126 klog.ErrorS(err, "Failed to sync task, retry it", "namespace", task.Namespace, "name", task.Name) 1127 sc.resyncTask(task) 1128 reSynced = true 1129 } else { 1130 klog.V(4).Infof("Successfully synced task <%s/%s>", task.Namespace, task.Name) 1131 sc.errTasks.Forget(obj) 1132 } 1133 1134 // execute custom bind err handler call back func if exists. 1135 if task.CustomBindErrHandler != nil && !task.CustomBindErrHandlerSucceeded { 1136 err := task.CustomBindErrHandler() 1137 if err != nil { 1138 klog.ErrorS(err, "Failed to execute custom bind err handler, retry it.") 1139 } else { 1140 task.CustomBindErrHandlerSucceeded = true 1141 } 1142 if !task.CustomBindErrHandlerSucceeded && !reSynced { 1143 sc.resyncTask(task) 1144 } 1145 } 1146 } 1147 1148 func (sc *SchedulerCache) runNodeWorker() { 1149 for sc.processSyncNode() { 1150 } 1151 } 1152 1153 func (sc *SchedulerCache) processSyncNode() bool { 1154 obj, shutdown := sc.nodeQueue.Get() 1155 if shutdown { 1156 return false 1157 } 1158 defer sc.nodeQueue.Done(obj) 1159 1160 nodeName, ok := obj.(string) 1161 if !ok { 1162 klog.Errorf("failed to convert %v to string", obj) 1163 return true 1164 } 1165 1166 klog.V(5).Infof("started sync node %s", nodeName) 1167 err := sc.SyncNode(nodeName) 1168 if err == nil { 1169 sc.nodeQueue.Forget(nodeName) 1170 return true 1171 } 1172 1173 klog.Errorf("Failed to sync node <%s>, retry it.", nodeName) 1174 sc.nodeQueue.AddRateLimited(nodeName) 1175 return true 1176 } 1177 1178 // AddBindTask add task to be bind to a cache which consumes by go runtime 1179 func (sc *SchedulerCache) AddBindTask(taskInfo *schedulingapi.TaskInfo) error { 1180 klog.V(5).Infof("add bind task %v/%v", taskInfo.Namespace, taskInfo.Name) 1181 sc.Mutex.Lock() 1182 defer sc.Mutex.Unlock() 1183 job, task, err := sc.findJobAndTask(taskInfo) 1184 if err != nil { 1185 return err 1186 } 1187 1188 node, found := sc.Nodes[taskInfo.NodeName] 1189 if !found { 1190 return fmt.Errorf("failed to bind Task %v to host %v, host does not exist", 1191 task.UID, taskInfo.NodeName) 1192 } 1193 1194 originalStatus := task.Status 1195 if err := job.UpdateTaskStatus(task, schedulingapi.Binding); err != nil { 1196 return err 1197 } 1198 1199 err = taskInfo.SetPodResourceDecision() 1200 if err != nil { 1201 return fmt.Errorf("set task %v/%v resource decision failed, err %v", task.Namespace, task.Name, err) 1202 } 1203 task.NumaInfo = taskInfo.NumaInfo.Clone() 1204 1205 // Add task to the node. 1206 if err := node.AddTask(task); err != nil { 1207 // After failing to update task to a node we need to revert task status from Releasing, 1208 // otherwise task might be stuck in the Releasing state indefinitely. 1209 if err := job.UpdateTaskStatus(task, originalStatus); err != nil { 1210 klog.Errorf("Task <%s/%s> will be resynchronized after failing to revert status "+ 1211 "from %s to %s after failing to update Task on Node <%s>: %v", 1212 task.Namespace, task.Name, task.Status, originalStatus, node.Name, err) 1213 sc.resyncTask(task) 1214 } 1215 return err 1216 } 1217 1218 sc.BindFlowChannel <- taskInfo 1219 1220 return nil 1221 } 1222 1223 func (sc *SchedulerCache) processBindTask() { 1224 for { 1225 select { 1226 case taskInfo, ok := <-sc.BindFlowChannel: 1227 if !ok { 1228 return 1229 } 1230 1231 sc.bindCache = append(sc.bindCache, taskInfo) 1232 if len(sc.bindCache) == sc.batchNum { 1233 sc.BindTask() 1234 } 1235 default: 1236 } 1237 1238 if len(sc.BindFlowChannel) == 0 { 1239 break 1240 } 1241 } 1242 1243 if len(sc.bindCache) == 0 { 1244 return 1245 } 1246 sc.BindTask() 1247 } 1248 1249 // BindTask do k8s binding with a goroutine 1250 func (sc *SchedulerCache) BindTask() { 1251 klog.V(5).Infof("batch bind task count %d", len(sc.bindCache)) 1252 var tmpBindCache []*schedulingapi.TaskInfo = make([]*schedulingapi.TaskInfo, len(sc.bindCache)) 1253 copy(tmpBindCache, sc.bindCache) 1254 go func(tasks []*schedulingapi.TaskInfo) { 1255 successfulTasks := make([]*schedulingapi.TaskInfo, 0) 1256 for _, task := range tasks { 1257 if err := sc.VolumeBinder.BindVolumes(task, task.PodVolumes); err != nil { 1258 klog.Errorf("task %s/%s bind Volumes failed: %#v", task.Namespace, task.Name, err) 1259 sc.VolumeBinder.RevertVolumes(task, task.PodVolumes) 1260 sc.resyncTask(task) 1261 } else { 1262 successfulTasks = append(successfulTasks, task) 1263 klog.V(5).Infof("task %s/%s bind Volumes done", task.Namespace, task.Name) 1264 } 1265 } 1266 1267 bindTasks := make([]*schedulingapi.TaskInfo, len(successfulTasks)) 1268 copy(bindTasks, successfulTasks) 1269 sc.Bind(bindTasks) 1270 }(tmpBindCache) 1271 sc.bindCache = sc.bindCache[0:0] 1272 } 1273 1274 // Snapshot returns the complete snapshot of the cluster from cache 1275 func (sc *SchedulerCache) Snapshot() *schedulingapi.ClusterInfo { 1276 sc.Mutex.Lock() 1277 defer sc.Mutex.Unlock() 1278 1279 snapshot := &schedulingapi.ClusterInfo{ 1280 Nodes: make(map[string]*schedulingapi.NodeInfo), 1281 Jobs: make(map[schedulingapi.JobID]*schedulingapi.JobInfo), 1282 Queues: make(map[schedulingapi.QueueID]*schedulingapi.QueueInfo), 1283 NamespaceInfo: make(map[schedulingapi.NamespaceName]*schedulingapi.NamespaceInfo), 1284 RevocableNodes: make(map[string]*schedulingapi.NodeInfo), 1285 NodeList: make([]string, len(sc.NodeList)), 1286 CSINodesStatus: make(map[string]*schedulingapi.CSINodeStatusInfo), 1287 } 1288 1289 copy(snapshot.NodeList, sc.NodeList) 1290 for _, value := range sc.Nodes { 1291 value.RefreshNumaSchedulerInfoByCrd() 1292 } 1293 1294 for _, value := range sc.CSINodesStatus { 1295 snapshot.CSINodesStatus[value.CSINodeName] = value.Clone() 1296 } 1297 1298 for _, value := range sc.Nodes { 1299 if !value.Ready() { 1300 continue 1301 } 1302 1303 snapshot.Nodes[value.Name] = value.Clone() 1304 1305 if value.RevocableZone != "" { 1306 snapshot.RevocableNodes[value.Name] = snapshot.Nodes[value.Name] 1307 } 1308 } 1309 1310 for _, value := range sc.Queues { 1311 snapshot.Queues[value.UID] = value.Clone() 1312 } 1313 1314 var cloneJobLock sync.Mutex 1315 var wg sync.WaitGroup 1316 1317 cloneJob := func(value *schedulingapi.JobInfo) { 1318 defer wg.Done() 1319 if value.PodGroup != nil { 1320 value.Priority = sc.defaultPriority 1321 1322 priName := value.PodGroup.Spec.PriorityClassName 1323 if priorityClass, found := sc.PriorityClasses[priName]; found { 1324 value.Priority = priorityClass.Value 1325 } 1326 1327 klog.V(4).Infof("The priority of job <%s/%s> is <%s/%d>", 1328 value.Namespace, value.Name, priName, value.Priority) 1329 } 1330 1331 clonedJob := value.Clone() 1332 1333 cloneJobLock.Lock() 1334 snapshot.Jobs[value.UID] = clonedJob 1335 cloneJobLock.Unlock() 1336 } 1337 1338 for _, value := range sc.NamespaceCollection { 1339 info := value.Snapshot() 1340 snapshot.NamespaceInfo[info.Name] = info 1341 } 1342 1343 for _, value := range sc.Jobs { 1344 // If no scheduling spec, does not handle it. 1345 if value.PodGroup == nil { 1346 klog.V(4).Infof("The scheduling spec of Job <%v:%s/%s> is nil, ignore it.", 1347 value.UID, value.Namespace, value.Name) 1348 1349 continue 1350 } 1351 1352 if _, found := snapshot.Queues[value.Queue]; !found { 1353 klog.V(3).Infof("The Queue <%v> of Job <%v/%v> does not exist, ignore it.", 1354 value.Queue, value.Namespace, value.Name) 1355 continue 1356 } 1357 1358 wg.Add(1) 1359 go cloneJob(value) 1360 } 1361 wg.Wait() 1362 1363 klog.V(3).Infof("There are <%d> Jobs, <%d> Queues and <%d> Nodes in total for scheduling.", 1364 len(snapshot.Jobs), len(snapshot.Queues), len(snapshot.Nodes)) 1365 1366 return snapshot 1367 } 1368 1369 // String returns information about the cache in a string format 1370 func (sc *SchedulerCache) String() string { 1371 sc.Mutex.Lock() 1372 defer sc.Mutex.Unlock() 1373 1374 str := "Cache:\n" 1375 1376 if len(sc.Nodes) != 0 { 1377 str += "Nodes:\n" 1378 for _, n := range sc.Nodes { 1379 str += fmt.Sprintf("\t %s: idle(%v) used(%v) allocatable(%v) pods(%d)\n", 1380 n.Name, n.Idle, n.Used, n.Allocatable, len(n.Tasks)) 1381 1382 i := 0 1383 for _, p := range n.Tasks { 1384 str += fmt.Sprintf("\t\t %d: %v\n", i, p) 1385 i++ 1386 } 1387 } 1388 } 1389 1390 if len(sc.Jobs) != 0 { 1391 str += "Jobs:\n" 1392 for _, job := range sc.Jobs { 1393 str += fmt.Sprintf("\t %s\n", job) 1394 } 1395 } 1396 1397 if len(sc.NamespaceCollection) != 0 { 1398 str += "Namespaces:\n" 1399 for _, ns := range sc.NamespaceCollection { 1400 info := ns.Snapshot() 1401 str += fmt.Sprintf("\t Namespace(%s)\n", info.Name) 1402 } 1403 } 1404 1405 if len(sc.NodeList) != 0 { 1406 str += fmt.Sprintf("NodeList: %v\n", sc.NodeList) 1407 } 1408 1409 return str 1410 } 1411 1412 // RecordJobStatusEvent records related events according to job status. 1413 func (sc *SchedulerCache) RecordJobStatusEvent(job *schedulingapi.JobInfo, updatePG bool) { 1414 pgUnschedulable := job.PodGroup != nil && 1415 (job.PodGroup.Status.Phase == scheduling.PodGroupUnknown || 1416 job.PodGroup.Status.Phase == scheduling.PodGroupPending || 1417 job.PodGroup.Status.Phase == scheduling.PodGroupInqueue) 1418 1419 // If pending or unschedulable, record unschedulable event. 1420 if pgUnschedulable { 1421 msg := fmt.Sprintf("%v/%v tasks in gang unschedulable: %v", 1422 len(job.TaskStatusIndex[schedulingapi.Pending]), 1423 len(job.Tasks), 1424 job.FitError()) 1425 sc.recordPodGroupEvent(job.PodGroup, v1.EventTypeWarning, string(scheduling.PodGroupUnschedulableType), msg) 1426 } else if updatePG { 1427 sc.recordPodGroupEvent(job.PodGroup, v1.EventTypeNormal, string(scheduling.PodGroupScheduled), string(scheduling.PodGroupReady)) 1428 } 1429 1430 baseErrorMessage := job.JobFitErrors 1431 if baseErrorMessage == "" { 1432 baseErrorMessage = schedulingapi.AllNodeUnavailableMsg 1433 } 1434 // Update podCondition for tasks Allocated and Pending before job discarded 1435 for _, status := range []schedulingapi.TaskStatus{schedulingapi.Allocated, schedulingapi.Pending, schedulingapi.Pipelined} { 1436 for _, taskInfo := range job.TaskStatusIndex[status] { 1437 reason, msg := job.TaskSchedulingReason(taskInfo.UID) 1438 if len(msg) == 0 { 1439 msg = baseErrorMessage 1440 } 1441 if err := sc.taskUnschedulable(taskInfo, reason, msg); err != nil { 1442 klog.Errorf("Failed to update unschedulable task status <%s/%s>: %v", 1443 taskInfo.Namespace, taskInfo.Name, err) 1444 } 1445 } 1446 } 1447 } 1448 1449 // UpdateJobStatus update the status of job and its tasks. 1450 func (sc *SchedulerCache) UpdateJobStatus(job *schedulingapi.JobInfo, updatePG bool) (*schedulingapi.JobInfo, error) { 1451 if updatePG { 1452 pg, err := sc.StatusUpdater.UpdatePodGroup(job.PodGroup) 1453 if err != nil { 1454 return nil, err 1455 } 1456 job.PodGroup = pg 1457 } 1458 1459 sc.RecordJobStatusEvent(job, updatePG) 1460 1461 return job, nil 1462 } 1463 1464 // UpdateQueueStatus update the status of queue. 1465 func (sc *SchedulerCache) UpdateQueueStatus(queue *schedulingapi.QueueInfo) error { 1466 return sc.StatusUpdater.UpdateQueueStatus(queue) 1467 } 1468 1469 func (sc *SchedulerCache) recordPodGroupEvent(podGroup *schedulingapi.PodGroup, eventType, reason, msg string) { 1470 if podGroup == nil { 1471 return 1472 } 1473 1474 pg := &vcv1beta1.PodGroup{} 1475 if err := schedulingscheme.Scheme.Convert(&podGroup.PodGroup, pg, nil); err != nil { 1476 klog.Errorf("Error while converting PodGroup to v1alpha1.PodGroup with error: %v", err) 1477 return 1478 } 1479 sc.Recorder.Eventf(pg, eventType, reason, msg) 1480 } 1481 1482 func (sc *SchedulerCache) SetMetricsConf(conf map[string]string) { 1483 sc.metricsConf = conf 1484 } 1485 1486 func (sc *SchedulerCache) GetMetricsData() { 1487 metricsType := sc.metricsConf["type"] 1488 if len(metricsType) == 0 { 1489 klog.V(3).Infof("The metrics type is not set in the volcano scheduler configmap file. " + 1490 "As a result, the CPU and memory load information of the node is not collected.") 1491 return 1492 } 1493 1494 client, err := source.NewMetricsClient(sc.restConfig, sc.metricsConf) 1495 if err != nil { 1496 klog.Errorf("Error creating client: %v\n", err) 1497 return 1498 } 1499 ctx, cancel := context.WithTimeout(context.Background(), time.Second*60) 1500 defer cancel() 1501 nodeMetricsMap := make(map[string]*source.NodeMetrics, len(sc.NodeList)) 1502 sc.Mutex.Lock() 1503 1504 for _, nodeName := range sc.NodeList { 1505 nodeMetricsMap[nodeName] = &source.NodeMetrics{} 1506 } 1507 sc.Mutex.Unlock() 1508 1509 err = client.NodesMetricsAvg(ctx, nodeMetricsMap) 1510 if err != nil { 1511 klog.Errorf("Error getting node metrics: %v\n", err) 1512 return 1513 } 1514 1515 sc.setMetricsData(nodeMetricsMap) 1516 } 1517 1518 func (sc *SchedulerCache) setMetricsData(usageInfo map[string]*source.NodeMetrics) { 1519 sc.Mutex.Lock() 1520 defer sc.Mutex.Unlock() 1521 1522 for nodeName, nodeMetric := range usageInfo { 1523 nodeUsage := &schedulingapi.NodeUsage{ 1524 CPUUsageAvg: make(map[string]float64), 1525 MEMUsageAvg: make(map[string]float64), 1526 } 1527 nodeUsage.MetricsTime = nodeMetric.MetricsTime 1528 nodeUsage.CPUUsageAvg[source.NODE_METRICS_PERIOD] = nodeMetric.CPU 1529 nodeUsage.MEMUsageAvg[source.NODE_METRICS_PERIOD] = nodeMetric.Memory 1530 1531 nodeInfo, ok := sc.Nodes[nodeName] 1532 if !ok { 1533 klog.Errorf("The information about node %s cannot be found in the cache.", nodeName) 1534 continue 1535 } 1536 klog.V(5).Infof("node: %s, ResourceUsage: %+v => %+v", nodeName, *nodeInfo.ResourceUsage, nodeUsage) 1537 nodeInfo.ResourceUsage = nodeUsage 1538 } 1539 } 1540 1541 // createImageStateSummary returns a summarizing snapshot of the given image's state. 1542 func (sc *SchedulerCache) createImageStateSummary(state *imageState) *framework.ImageStateSummary { 1543 return &framework.ImageStateSummary{ 1544 Size: state.size, 1545 NumNodes: len(state.nodes), 1546 } 1547 }