volcano.sh/volcano@v1.9.0/pkg/scheduler/cache/cache.go

volcano.sh/volcano@v1.9.0/pkg/scheduler/cache/cache.go (about)

     1  /*
     2   Copyright 2021 The Volcano Authors.
     3  
     4   Licensed under the Apache License, Version 2.0 (the "License");
     5   you may not use this file except in compliance with the License.
     6   You may obtain a copy of the License at
     7  
     8       http://www.apache.org/licenses/LICENSE-2.0
     9  
    10   Unless required by applicable law or agreed to in writing, software
    11   distributed under the License is distributed on an "AS IS" BASIS,
    12   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   See the License for the specific language governing permissions and
    14   limitations under the License.
    15  */
    16  
    17  package cache
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"os"
    23  	"strconv"
    24  	"strings"
    25  	"sync"
    26  	"time"
    27  
    28  	"golang.org/x/time/rate"
    29  	v1 "k8s.io/api/core/v1"
    30  	schedulingv1 "k8s.io/api/scheduling/v1"
    31  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    32  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    33  	"k8s.io/apimachinery/pkg/runtime"
    34  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    35  	"k8s.io/apimachinery/pkg/util/sets"
    36  	"k8s.io/apimachinery/pkg/util/wait"
    37  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    38  	"k8s.io/client-go/informers"
    39  	infov1 "k8s.io/client-go/informers/core/v1"
    40  	schedv1 "k8s.io/client-go/informers/scheduling/v1"
    41  	storagev1 "k8s.io/client-go/informers/storage/v1"
    42  	storagev1beta1 "k8s.io/client-go/informers/storage/v1beta1"
    43  	"k8s.io/client-go/kubernetes"
    44  	corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
    45  	"k8s.io/client-go/rest"
    46  	"k8s.io/client-go/tools/cache"
    47  	"k8s.io/client-go/tools/record"
    48  	"k8s.io/client-go/util/retry"
    49  	"k8s.io/client-go/util/workqueue"
    50  	"k8s.io/klog/v2"
    51  	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
    52  	"k8s.io/kubernetes/pkg/scheduler/framework"
    53  
    54  	batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
    55  	"volcano.sh/apis/pkg/apis/scheduling"
    56  	schedulingscheme "volcano.sh/apis/pkg/apis/scheduling/scheme"
    57  	vcv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
    58  	vcclient "volcano.sh/apis/pkg/client/clientset/versioned"
    59  	"volcano.sh/apis/pkg/client/clientset/versioned/scheme"
    60  	vcinformer "volcano.sh/apis/pkg/client/informers/externalversions"
    61  	cpuinformerv1 "volcano.sh/apis/pkg/client/informers/externalversions/nodeinfo/v1alpha1"
    62  	vcinformerv1 "volcano.sh/apis/pkg/client/informers/externalversions/scheduling/v1beta1"
    63  
    64  	"volcano.sh/volcano/cmd/scheduler/app/options"
    65  	"volcano.sh/volcano/pkg/features"
    66  	schedulingapi "volcano.sh/volcano/pkg/scheduler/api"
    67  	volumescheduling "volcano.sh/volcano/pkg/scheduler/capabilities/volumebinding"
    68  	"volcano.sh/volcano/pkg/scheduler/metrics"
    69  	"volcano.sh/volcano/pkg/scheduler/metrics/source"
    70  	commonutil "volcano.sh/volcano/pkg/util"
    71  )
    72  
const (
	// defaultMetricsInternal is the default interval for syncing data from
	// the metrics server; its value is 30 seconds.
	defaultMetricsInternal = 30 * time.Second
)
    77  
// defaultIgnoredProvisioners contains provisioners that will be ignored during pod pvc request computation and preemption.
// newSchedulerCache always adds these to the user-supplied list when it builds SchedulerCache.IgnoredCSIProvisioners.
var defaultIgnoredProvisioners = []string{"rancher.io/local-path", "hostpath.csi.k8s.io"}
    80  
    81  func init() {
    82  	schemeBuilder := runtime.SchemeBuilder{
    83  		v1.AddToScheme,
    84  	}
    85  
    86  	utilruntime.Must(schemeBuilder.AddToScheme(scheme.Scheme))
    87  }
    88  
    89  // New returns a Cache implementation.
    90  func New(config *rest.Config, schedulerNames []string, defaultQueue string, nodeSelectors []string, nodeWorkers uint32, ignoredProvisioners []string) Cache {
    91  	return newSchedulerCache(config, schedulerNames, defaultQueue, nodeSelectors, nodeWorkers, ignoredProvisioners)
    92  }
    93  
// SchedulerCache is the scheduler's cache. It bundles the API clients, the
// informers, the pluggable binder/evictor/updater implementations and the
// in-memory maps of jobs, nodes, queues and related objects.
type SchedulerCache struct {
	sync.Mutex

	// API clients and the rest config they were built from.
	kubeClient   kubernetes.Interface
	restConfig   *rest.Config
	vcClient     vcclient.Interface
	defaultQueue string
	// schedulerNames holds the names of the volcano scheduler(s).
	schedulerNames []string
	// nodeSelectorLabels is keyed by "labelName:labelValue" (see
	// updateNodeSelectors); the values are always empty strings.
	nodeSelectorLabels map[string]string
	metricsConf        map[string]string

	// Informers registered by addEventHandler.
	podInformer                infov1.PodInformer
	nodeInformer               infov1.NodeInformer
	podGroupInformerV1beta1    vcinformerv1.PodGroupInformer
	queueInformerV1beta1       vcinformerv1.QueueInformer
	pvInformer                 infov1.PersistentVolumeInformer
	pvcInformer                infov1.PersistentVolumeClaimInformer
	scInformer                 storagev1.StorageClassInformer
	pcInformer                 schedv1.PriorityClassInformer
	quotaInformer              infov1.ResourceQuotaInformer
	csiNodeInformer            storagev1.CSINodeInformer
	csiDriverInformer          storagev1.CSIDriverInformer
	csiStorageCapacityInformer storagev1beta1.CSIStorageCapacityInformer
	cpuInformer                cpuinformerv1.NumatopologyInformer

	// Pluggable operations; newSchedulerCache installs the default
	// implementations defined in this file.
	Binder         Binder
	Evictor        Evictor
	StatusUpdater  StatusUpdater
	PodGroupBinder BatchBinder
	VolumeBinder   VolumeBinder

	// Recorder emits events under the component name derived from schedulerNames.
	Recorder record.EventRecorder

	// Cached objects, keyed by their identifier or name.
	Jobs                 map[schedulingapi.JobID]*schedulingapi.JobInfo
	Nodes                map[string]*schedulingapi.NodeInfo
	Queues               map[schedulingapi.QueueID]*schedulingapi.QueueInfo
	PriorityClasses      map[string]*schedulingv1.PriorityClass
	NodeList             []string
	defaultPriorityClass *schedulingv1.PriorityClass
	defaultPriority      int32
	CSINodesStatus       map[string]*schedulingapi.CSINodeStatusInfo

	NamespaceCollection map[string]*schedulingapi.NamespaceCollection

	// Rate-limited work queues created in newSchedulerCache.
	errTasks    workqueue.RateLimitingInterface
	nodeQueue   workqueue.RateLimitingInterface
	DeletedJobs workqueue.RateLimitingInterface

	informerFactory   informers.SharedInformerFactory
	vcInformerFactory vcinformer.SharedInformerFactory

	// BindFlowChannel is created with a buffer of 5000 tasks and batchNum is
	// taken from the BATCH_BIND_NUM environment variable (default 1); both
	// are set by setBatchBindParallel.
	BindFlowChannel chan *schedulingapi.TaskInfo
	bindCache       []*schedulingapi.TaskInfo
	batchNum        int

	// A map from image name to its imageState.
	imageStates map[string]*imageState

	// nodeWorkers is stored as passed to newSchedulerCache.
	nodeWorkers uint32

	// IgnoredCSIProvisioners contains a list of provisioners, and pod request pvc with these provisioners will
	// not be counted in pod pvc resource request and node.Allocatable, because the spec.drivers of csinode resource
	// is always null, these provisioners usually are host path csi controllers like rancher.io/local-path and hostpath.csi.k8s.io.
	IgnoredCSIProvisioners sets.Set[string]
}
   161  
// imageState is the per-image record stored in SchedulerCache.imageStates,
// keyed there by image name.
type imageState struct {
	// Size of the image
	size int64
	// A set of node names for nodes having this image present
	nodes sets.String
}
   168  
// DefaultBinder with kube client and event recorder.
// Bind sends binding requests through kubeclient and reports successful
// bindings through recorder.
type DefaultBinder struct {
	kubeclient kubernetes.Interface
	recorder   record.EventRecorder
}
   174  
   175  // Bind will send bind request to api server
   176  func (db *DefaultBinder) Bind(kubeClient kubernetes.Interface, tasks []*schedulingapi.TaskInfo) ([]*schedulingapi.TaskInfo, error) {
   177  	var errTasks []*schedulingapi.TaskInfo
   178  	for _, task := range tasks {
   179  		p := task.Pod
   180  		if err := db.kubeclient.CoreV1().Pods(p.Namespace).Bind(context.TODO(),
   181  			&v1.Binding{
   182  				ObjectMeta: metav1.ObjectMeta{Namespace: p.Namespace, Name: p.Name, UID: p.UID, Annotations: p.Annotations},
   183  				Target: v1.ObjectReference{
   184  					Kind: "Node",
   185  					Name: task.NodeName,
   186  				},
   187  			},
   188  			metav1.CreateOptions{}); err != nil {
   189  			klog.Errorf("Failed to bind pod <%v/%v> to node %s : %#v", p.Namespace, p.Name, task.NodeName, err)
   190  			errTasks = append(errTasks, task)
   191  		} else {
   192  			db.recorder.Eventf(task.Pod, v1.EventTypeNormal, "Scheduled", "Successfully assigned %v/%v to %v", task.Namespace, task.Name, task.NodeName)
   193  			metrics.UpdateTaskScheduleDuration(metrics.Duration(p.CreationTimestamp.Time)) // update metrics as soon as pod is bind
   194  		}
   195  	}
   196  
   197  	if len(errTasks) > 0 {
   198  		return errTasks, fmt.Errorf("failed to bind pods")
   199  	}
   200  
   201  	return nil, nil
   202  }
   203  
   204  // NewDefaultBinder create binder with kube client and event recorder, support fake binder if passed fake client and fake event recorder
   205  func NewDefaultBinder(kbclient kubernetes.Interface, record record.EventRecorder) *DefaultBinder {
   206  	return &DefaultBinder{
   207  		kubeclient: kbclient,
   208  		recorder:   record,
   209  	}
   210  }
   211  
// defaultEvictor holds the kube client and event recorder used by its Evict
// method. newSchedulerCache installs it as SchedulerCache.Evictor.
type defaultEvictor struct {
	kubeclient kubernetes.Interface
	recorder   record.EventRecorder
}
   216  
   217  // Evict will send delete pod request to api server
   218  func (de *defaultEvictor) Evict(p *v1.Pod, reason string) error {
   219  	klog.V(3).Infof("Evicting pod %v/%v, because of %v", p.Namespace, p.Name, reason)
   220  
   221  	evictMsg := fmt.Sprintf("Pod is evicted, because of %v", reason)
   222  	annotations := map[string]string{}
   223  	// record that we are evicting the pod
   224  	de.recorder.AnnotatedEventf(p, annotations, v1.EventTypeWarning, "Evict", evictMsg)
   225  
   226  	pod := p.DeepCopy()
   227  	condition := &v1.PodCondition{
   228  		Type:    v1.PodReady,
   229  		Status:  v1.ConditionFalse,
   230  		Reason:  "Evict",
   231  		Message: evictMsg,
   232  	}
   233  	if !podutil.UpdatePodCondition(&pod.Status, condition) {
   234  		klog.V(1).Infof("UpdatePodCondition: existed condition, not update")
   235  		klog.V(1).Infof("%+v", pod.Status.Conditions)
   236  		return nil
   237  	}
   238  	if _, err := de.kubeclient.CoreV1().Pods(p.Namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{}); err != nil {
   239  		klog.Errorf("Failed to update pod <%v/%v> status: %v", pod.Namespace, pod.Name, err)
   240  		return err
   241  	}
   242  	if err := de.kubeclient.CoreV1().Pods(p.Namespace).Delete(context.TODO(), p.Name, metav1.DeleteOptions{}); err != nil {
   243  		klog.Errorf("Failed to evict pod <%v/%v>: %#v", p.Namespace, p.Name, err)
   244  		return err
   245  	}
   246  
   247  	return nil
   248  }
   249  
// defaultStatusUpdater is the default implementation of the StatusUpdater interface.
// Pod status updates go through kubeclient; PodGroup and Queue updates go
// through vcclient.
type defaultStatusUpdater struct {
	kubeclient kubernetes.Interface
	vcclient   vcclient.Interface
}
   255  
   256  // following the same logic as podutil.UpdatePodCondition
   257  func podConditionHaveUpdate(status *v1.PodStatus, condition *v1.PodCondition) bool {
   258  	lastTransitionTime := metav1.Now()
   259  	// Try to find this pod condition.
   260  	_, oldCondition := podutil.GetPodCondition(status, condition.Type)
   261  
   262  	if oldCondition == nil {
   263  		// We are adding new pod condition.
   264  		return true
   265  	}
   266  	// We are updating an existing condition, so we need to check if it has changed.
   267  	if condition.Status == oldCondition.Status {
   268  		lastTransitionTime = oldCondition.LastTransitionTime
   269  	}
   270  
   271  	isEqual := condition.Status == oldCondition.Status &&
   272  		condition.Reason == oldCondition.Reason &&
   273  		condition.Message == oldCondition.Message &&
   274  		condition.LastProbeTime.Equal(&oldCondition.LastProbeTime) &&
   275  		lastTransitionTime.Equal(&oldCondition.LastTransitionTime)
   276  
   277  	// Return true if one of the fields have changed.
   278  	return !isEqual
   279  }
   280  
   281  // UpdatePodCondition will Update pod with podCondition
   282  func (su *defaultStatusUpdater) UpdatePodCondition(pod *v1.Pod, condition *v1.PodCondition) (*v1.Pod, error) {
   283  	klog.V(3).Infof("Updating pod condition for %s/%s to (%s==%s)", pod.Namespace, pod.Name, condition.Type, condition.Status)
   284  	if podutil.UpdatePodCondition(&pod.Status, condition) {
   285  		return su.kubeclient.CoreV1().Pods(pod.Namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{})
   286  	}
   287  	return pod, nil
   288  }
   289  
   290  // UpdatePodGroup will Update pod with podCondition
   291  func (su *defaultStatusUpdater) UpdatePodGroup(pg *schedulingapi.PodGroup) (*schedulingapi.PodGroup, error) {
   292  	podgroup := &vcv1beta1.PodGroup{}
   293  	if err := schedulingscheme.Scheme.Convert(&pg.PodGroup, podgroup, nil); err != nil {
   294  		klog.Errorf("Error while converting PodGroup to v1alpha1.PodGroup with error: %v", err)
   295  		return nil, err
   296  	}
   297  
   298  	updated, err := su.vcclient.SchedulingV1beta1().PodGroups(podgroup.Namespace).Update(context.TODO(), podgroup, metav1.UpdateOptions{})
   299  	if err != nil {
   300  		klog.Errorf("Error while updating PodGroup with error: %v", err)
   301  		return nil, err
   302  	}
   303  
   304  	podGroupInfo := &schedulingapi.PodGroup{Version: schedulingapi.PodGroupVersionV1Beta1}
   305  	if err := schedulingscheme.Scheme.Convert(updated, &podGroupInfo.PodGroup, nil); err != nil {
   306  		klog.Errorf("Error while converting v1alpha.PodGroup to api.PodGroup with error: %v", err)
   307  		return nil, err
   308  	}
   309  
   310  	return podGroupInfo, nil
   311  }
   312  
   313  // UpdateQueueStatus will update the status of queue
   314  func (su *defaultStatusUpdater) UpdateQueueStatus(queue *schedulingapi.QueueInfo) error {
   315  	var newQueue = &vcv1beta1.Queue{}
   316  	if err := schedulingscheme.Scheme.Convert(queue.Queue, newQueue, nil); err != nil {
   317  		klog.Errorf("error occurred in converting scheduling.Queue to v1beta1.Queue: %s", err.Error())
   318  		return err
   319  	}
   320  
   321  	_, err := su.vcclient.SchedulingV1beta1().Queues().UpdateStatus(context.TODO(), newQueue, metav1.UpdateOptions{})
   322  	if err != nil {
   323  		klog.Errorf("error occurred in updating Queue <%s>: %s", newQueue.Name, err.Error())
   324  		return err
   325  	}
   326  	return nil
   327  }
   328  
// defaultVolumeBinder wraps a volumescheduling.SchedulerVolumeBinder; its
// methods delegate to the wrapped binder. It is installed as
// SchedulerCache.VolumeBinder by setDefaultVolumeBinder.
type defaultVolumeBinder struct {
	volumeBinder volumescheduling.SchedulerVolumeBinder
}
   332  
   333  // AllocateVolumes allocates volume on the host to the task
   334  func (dvb *defaultVolumeBinder) AllocateVolumes(task *schedulingapi.TaskInfo, hostname string, podVolumes *volumescheduling.PodVolumes) error {
   335  	logger := klog.FromContext(context.TODO())
   336  	allBound, err := dvb.volumeBinder.AssumePodVolumes(logger, task.Pod, hostname, podVolumes)
   337  	task.VolumeReady = allBound
   338  
   339  	return err
   340  }
   341  
   342  // RevertVolumes clean cache generated by AllocateVolumes
   343  func (dvb *defaultVolumeBinder) RevertVolumes(task *schedulingapi.TaskInfo, podVolumes *volumescheduling.PodVolumes) {
   344  	if podVolumes != nil {
   345  		klog.Infof("Revert assumed volumes for task %v/%v on node %s", task.Namespace, task.Name, task.NodeName)
   346  		dvb.volumeBinder.RevertAssumedPodVolumes(podVolumes)
   347  		task.VolumeReady = false
   348  		task.PodVolumes = nil
   349  	}
   350  }
   351  
   352  // GetPodVolumes get pod volume on the host
   353  func (dvb *defaultVolumeBinder) GetPodVolumes(task *schedulingapi.TaskInfo,
   354  	node *v1.Node) (podVolumes *volumescheduling.PodVolumes, err error) {
   355  	logger := klog.FromContext(context.TODO())
   356  	podVolumeClaims, err := dvb.volumeBinder.GetPodVolumeClaims(logger, task.Pod)
   357  	if err != nil {
   358  		return nil, err
   359  	}
   360  	// if len(unboundClaimsImmediate) > 0 {
   361  	// 	return nil, fmt.Errorf("pod has unbound immediate PersistentVolumeClaims")
   362  	// }
   363  
   364  	podVolumes, reasons, err := dvb.volumeBinder.FindPodVolumes(logger, task.Pod, podVolumeClaims, node)
   365  	if err != nil {
   366  		return nil, err
   367  	} else if len(reasons) > 0 {
   368  		var errors []string
   369  		for _, reason := range reasons {
   370  			errors = append(errors, string(reason))
   371  		}
   372  		return nil, fmt.Errorf(strings.Join(errors, ","))
   373  	}
   374  
   375  	return podVolumes, err
   376  }
   377  
   378  // BindVolumes binds volumes to the task
   379  func (dvb *defaultVolumeBinder) BindVolumes(task *schedulingapi.TaskInfo, podVolumes *volumescheduling.PodVolumes) error {
   380  	// If task's volumes are ready, did not bind them again.
   381  	if task.VolumeReady {
   382  		return nil
   383  	}
   384  
   385  	return dvb.volumeBinder.BindPodVolumes(context.TODO(), task.Pod, podVolumes)
   386  }
   387  
// podgroupBinder carries the kube and volcano clients its Bind method uses
// to annotate pods and PodGroups. newSchedulerCache installs it as
// SchedulerCache.PodGroupBinder.
type podgroupBinder struct {
	kubeclient kubernetes.Interface
	vcclient   vcclient.Interface
}
   392  
   393  // Bind will add silo cluster annotaion on pod and podgroup
   394  func (pgb *podgroupBinder) Bind(job *schedulingapi.JobInfo, cluster string) (*schedulingapi.JobInfo, error) {
   395  	if len(job.Tasks) == 0 {
   396  		klog.V(4).Infof("Job pods have not been created yet")
   397  		return job, nil
   398  	}
   399  	for _, task := range job.Tasks {
   400  		pod := task.Pod
   401  		pod.Annotations[batch.ForwardClusterKey] = cluster
   402  		pod.ResourceVersion = ""
   403  		_, err := pgb.kubeclient.CoreV1().Pods(pod.Namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{})
   404  		if err != nil {
   405  			klog.Errorf("Error while update pod annotation with error: %v", err)
   406  			return nil, err
   407  		}
   408  	}
   409  
   410  	pg := job.PodGroup
   411  	pg.Annotations[batch.ForwardClusterKey] = cluster
   412  	podgroup := &vcv1beta1.PodGroup{}
   413  	if err := schedulingscheme.Scheme.Convert(&pg.PodGroup, podgroup, nil); err != nil {
   414  		klog.Errorf("Error while converting PodGroup to v1alpha1.PodGroup with error: %v", err)
   415  		return nil, err
   416  	}
   417  	newPg, err := pgb.vcclient.SchedulingV1beta1().PodGroups(pg.Namespace).Update(context.TODO(), podgroup, metav1.UpdateOptions{})
   418  	if err != nil {
   419  		klog.Errorf("Error while update PodGroup annotation with error: %v", err)
   420  		return nil, err
   421  	}
   422  	job.PodGroup.ResourceVersion = newPg.ResourceVersion
   423  	klog.V(4).Infof("Bind PodGroup <%s> successfully", job.PodGroup.Name)
   424  	return job, nil
   425  }
   426  
   427  // updateNodeSelectors parse and update node selector key value pairs to schedule cache
   428  func (sc *SchedulerCache) updateNodeSelectors(nodeSelectors []string) {
   429  	for _, nodeSelectorLabel := range nodeSelectors {
   430  		nodeSelectorLabelLen := len(nodeSelectorLabel)
   431  		if nodeSelectorLabelLen <= 0 {
   432  			continue
   433  		}
   434  		// check input
   435  		index := strings.Index(nodeSelectorLabel, ":")
   436  		if index < 0 || index >= (nodeSelectorLabelLen-1) {
   437  			continue
   438  		}
   439  		nodeSelectorLabelName := strings.TrimSpace(nodeSelectorLabel[:index])
   440  		nodeSelectorLabelValue := strings.TrimSpace(nodeSelectorLabel[index+1:])
   441  		key := nodeSelectorLabelName + ":" + nodeSelectorLabelValue
   442  		sc.nodeSelectorLabels[key] = ""
   443  	}
   444  }
   445  
   446  // setBatchBindParallel configure the parallel when binding tasks to apiserver
   447  func (sc *SchedulerCache) setBatchBindParallel() {
   448  	sc.BindFlowChannel = make(chan *schedulingapi.TaskInfo, 5000)
   449  	var batchNum int
   450  	batchNum, err := strconv.Atoi(os.Getenv("BATCH_BIND_NUM"))
   451  	if err == nil && batchNum > 0 {
   452  		sc.batchNum = batchNum
   453  	} else {
   454  		sc.batchNum = 1
   455  	}
   456  }
   457  
   458  func (sc *SchedulerCache) setDefaultVolumeBinder() {
   459  	logger := klog.FromContext(context.TODO())
   460  	var capacityCheck *volumescheduling.CapacityCheck
   461  	if options.ServerOpts != nil && options.ServerOpts.EnableCSIStorage && utilfeature.DefaultFeatureGate.Enabled(features.CSIStorage) {
   462  		capacityCheck = &volumescheduling.CapacityCheck{
   463  			CSIDriverInformer:          sc.csiDriverInformer,
   464  			CSIStorageCapacityInformer: sc.csiStorageCapacityInformer,
   465  		}
   466  	}
   467  	sc.VolumeBinder = &defaultVolumeBinder{
   468  		volumeBinder: volumescheduling.NewVolumeBinder(
   469  			logger,
   470  			sc.kubeClient,
   471  			sc.podInformer,
   472  			sc.nodeInformer,
   473  			sc.csiNodeInformer,
   474  			sc.pvcInformer,
   475  			sc.pvInformer,
   476  			sc.scInformer,
   477  			capacityCheck,
   478  			30*time.Second,
   479  		),
   480  	}
   481  }
   482  
   483  // newDefaultQueue init default queue
   484  func newDefaultQueue(vcClient vcclient.Interface, defaultQueue string) {
   485  	reclaimable := true
   486  	defaultQue := vcv1beta1.Queue{
   487  		ObjectMeta: metav1.ObjectMeta{
   488  			Name: defaultQueue,
   489  		},
   490  		Spec: vcv1beta1.QueueSpec{
   491  			Reclaimable: &reclaimable,
   492  			Weight:      1,
   493  		},
   494  	}
   495  
   496  	err := retry.OnError(wait.Backoff{
   497  		Steps:    60,
   498  		Duration: time.Second,
   499  		Factor:   1,
   500  		Jitter:   0.1,
   501  	}, func(err error) bool {
   502  		return !apierrors.IsAlreadyExists(err)
   503  	}, func() error {
   504  		_, err := vcClient.SchedulingV1beta1().Queues().Create(context.TODO(), &defaultQue, metav1.CreateOptions{})
   505  		return err
   506  	})
   507  	if err != nil && !apierrors.IsAlreadyExists(err) {
   508  		panic(fmt.Errorf("failed init default queue, with err: %v", err))
   509  	}
   510  }
   511  
   512  func newSchedulerCache(config *rest.Config, schedulerNames []string, defaultQueue string, nodeSelectors []string, nodeWorkers uint32, ignoredProvisioners []string) *SchedulerCache {
   513  	kubeClient, err := kubernetes.NewForConfig(config)
   514  	if err != nil {
   515  		panic(fmt.Sprintf("failed init kubeClient, with err: %v", err))
   516  	}
   517  	vcClient, err := vcclient.NewForConfig(config)
   518  	if err != nil {
   519  		panic(fmt.Sprintf("failed init vcClient, with err: %v", err))
   520  	}
   521  	eventClient, err := kubernetes.NewForConfig(config)
   522  	if err != nil {
   523  		panic(fmt.Sprintf("failed init eventClient, with err: %v", err))
   524  	}
   525  
   526  	// create default queue
   527  	newDefaultQueue(vcClient, defaultQueue)
   528  	klog.Infof("Create init queue named default")
   529  
   530  	errTaskRateLimiter := workqueue.NewMaxOfRateLimiter(
   531  		workqueue.NewItemExponentialFailureRateLimiter(5*time.Millisecond, 1000*time.Second),
   532  		&workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(100), 1000)},
   533  	)
   534  
   535  	sc := &SchedulerCache{
   536  		Jobs:                make(map[schedulingapi.JobID]*schedulingapi.JobInfo),
   537  		Nodes:               make(map[string]*schedulingapi.NodeInfo),
   538  		Queues:              make(map[schedulingapi.QueueID]*schedulingapi.QueueInfo),
   539  		PriorityClasses:     make(map[string]*schedulingv1.PriorityClass),
   540  		errTasks:            workqueue.NewRateLimitingQueue(errTaskRateLimiter),
   541  		nodeQueue:           workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()),
   542  		DeletedJobs:         workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()),
   543  		kubeClient:          kubeClient,
   544  		vcClient:            vcClient,
   545  		restConfig:          config,
   546  		defaultQueue:        defaultQueue,
   547  		schedulerNames:      schedulerNames,
   548  		nodeSelectorLabels:  make(map[string]string),
   549  		NamespaceCollection: make(map[string]*schedulingapi.NamespaceCollection),
   550  		CSINodesStatus:      make(map[string]*schedulingapi.CSINodeStatusInfo),
   551  		imageStates:         make(map[string]*imageState),
   552  
   553  		NodeList:    []string{},
   554  		nodeWorkers: nodeWorkers,
   555  	}
   556  
   557  	ignoredProvisionersSet := sets.New[string]()
   558  	for _, provisioner := range append(ignoredProvisioners, defaultIgnoredProvisioners...) {
   559  		ignoredProvisionersSet.Insert(provisioner)
   560  	}
   561  	sc.IgnoredCSIProvisioners = ignoredProvisionersSet
   562  
   563  	if len(nodeSelectors) > 0 {
   564  		sc.updateNodeSelectors(nodeSelectors)
   565  	}
   566  	// Prepare event clients.
   567  	broadcaster := record.NewBroadcaster()
   568  	broadcaster.StartRecordingToSink(&corev1.EventSinkImpl{Interface: eventClient.CoreV1().Events("")})
   569  	sc.Recorder = broadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: commonutil.GenerateComponentName(sc.schedulerNames)})
   570  
   571  	// set concurrency configuration when binding
   572  	sc.setBatchBindParallel()
   573  	if bindMethodMap == nil {
   574  		klog.V(3).Info("no registered bind method, new a default one")
   575  		bindMethodMap = NewDefaultBinder(sc.kubeClient, sc.Recorder)
   576  	}
   577  	sc.Binder = GetBindMethod()
   578  
   579  	sc.Evictor = &defaultEvictor{
   580  		kubeclient: sc.kubeClient,
   581  		recorder:   sc.Recorder,
   582  	}
   583  
   584  	sc.StatusUpdater = &defaultStatusUpdater{
   585  		kubeclient: sc.kubeClient,
   586  		vcclient:   sc.vcClient,
   587  	}
   588  
   589  	sc.PodGroupBinder = &podgroupBinder{
   590  		kubeclient: sc.kubeClient,
   591  		vcclient:   sc.vcClient,
   592  	}
   593  
   594  	// add all events handlers
   595  	sc.addEventHandler()
   596  	// finally, init default volume binder which has dependencies on other informers
   597  	sc.setDefaultVolumeBinder()
   598  	return sc
   599  }
   600  
   601  func (sc *SchedulerCache) addEventHandler() {
   602  	informerFactory := informers.NewSharedInformerFactory(sc.kubeClient, 0)
   603  	sc.informerFactory = informerFactory
   604  	mySchedulerPodName, c := getMultiSchedulerInfo()
   605  
   606  	// explicitly register informers to the factory, otherwise resources listers cannot get anything
   607  	// even with no error returned.
   608  	// `Namespace` informer is used by `InterPodAffinity` plugin,
   609  	// `SelectorSpread` and `PodTopologySpread` plugins uses the following four so far.
   610  	informerFactory.Core().V1().Namespaces().Informer()
   611  	informerFactory.Core().V1().Services().Informer()
   612  	if utilfeature.DefaultFeatureGate.Enabled(features.WorkLoadSupport) {
   613  		informerFactory.Core().V1().ReplicationControllers().Informer()
   614  		informerFactory.Apps().V1().ReplicaSets().Informer()
   615  		informerFactory.Apps().V1().StatefulSets().Informer()
   616  	}
   617  
   618  	// `PodDisruptionBudgets` informer is used by `Pdb` plugin
   619  	if utilfeature.DefaultFeatureGate.Enabled(features.PodDisruptionBudgetsSupport) {
   620  		informerFactory.Policy().V1().PodDisruptionBudgets().Informer()
   621  	}
   622  
   623  	// create informer for node information
   624  	sc.nodeInformer = informerFactory.Core().V1().Nodes()
   625  	sc.nodeInformer.Informer().AddEventHandlerWithResyncPeriod(
   626  		cache.FilteringResourceEventHandler{
   627  			FilterFunc: func(obj interface{}) bool {
   628  				var node *v1.Node
   629  				switch t := obj.(type) {
   630  				case *v1.Node:
   631  					node = t
   632  				case cache.DeletedFinalStateUnknown:
   633  					var ok bool
   634  					node, ok = t.Obj.(*v1.Node)
   635  					if !ok {
   636  						klog.Errorf("Cannot convert to *v1.Node: %v", t.Obj)
   637  						return false
   638  					}
   639  				default:
   640  					return false
   641  				}
   642  
   643  				if !responsibleForNode(node.Name, mySchedulerPodName, c) {
   644  					return false
   645  				}
   646  				if len(sc.nodeSelectorLabels) == 0 {
   647  					return true
   648  				}
   649  				for labelName, labelValue := range node.Labels {
   650  					key := labelName + ":" + labelValue
   651  					if _, ok := sc.nodeSelectorLabels[key]; ok {
   652  						return true
   653  					}
   654  				}
   655  				klog.Infof("node %s ignore add/update/delete into schedulerCache", node.Name)
   656  				return false
   657  			},
   658  			Handler: cache.ResourceEventHandlerFuncs{
   659  				AddFunc:    sc.AddNode,
   660  				UpdateFunc: sc.UpdateNode,
   661  				DeleteFunc: sc.DeleteNode,
   662  			},
   663  		},
   664  		0,
   665  	)
   666  
   667  	sc.podInformer = informerFactory.Core().V1().Pods()
   668  	sc.pvcInformer = informerFactory.Core().V1().PersistentVolumeClaims()
   669  	sc.pvInformer = informerFactory.Core().V1().PersistentVolumes()
   670  	sc.scInformer = informerFactory.Storage().V1().StorageClasses()
   671  	sc.csiNodeInformer = informerFactory.Storage().V1().CSINodes()
   672  	sc.csiNodeInformer.Informer().AddEventHandler(
   673  		cache.ResourceEventHandlerFuncs{
   674  			AddFunc:    sc.AddOrUpdateCSINode,
   675  			UpdateFunc: sc.UpdateCSINode,
   676  			DeleteFunc: sc.DeleteCSINode,
   677  		},
   678  	)
   679  
   680  	if options.ServerOpts != nil && options.ServerOpts.EnableCSIStorage && utilfeature.DefaultFeatureGate.Enabled(features.CSIStorage) {
   681  		sc.csiDriverInformer = informerFactory.Storage().V1().CSIDrivers()
   682  		sc.csiStorageCapacityInformer = informerFactory.Storage().V1beta1().CSIStorageCapacities()
   683  	}
   684  
   685  	// create informer for pod information
   686  	sc.podInformer.Informer().AddEventHandler(
   687  		cache.FilteringResourceEventHandler{
   688  			FilterFunc: func(obj interface{}) bool {
   689  				switch v := obj.(type) {
   690  				case *v1.Pod:
   691  					if !responsibleForPod(v, sc.schedulerNames, mySchedulerPodName, c) {
   692  						if len(v.Spec.NodeName) == 0 {
   693  							return false
   694  						}
   695  						if !responsibleForNode(v.Spec.NodeName, mySchedulerPodName, c) {
   696  							return false
   697  						}
   698  					}
   699  					return true
   700  				case cache.DeletedFinalStateUnknown:
   701  					if _, ok := v.Obj.(*v1.Pod); ok {
   702  						// The carried object may be stale, always pass to clean up stale obj in event handlers.
   703  						return true
   704  					}
   705  					klog.Errorf("Cannot convert object %T to *v1.Pod", v.Obj)
   706  					return false
   707  				default:
   708  					return false
   709  				}
   710  			},
   711  			Handler: cache.ResourceEventHandlerFuncs{
   712  				AddFunc:    sc.AddPod,
   713  				UpdateFunc: sc.UpdatePod,
   714  				DeleteFunc: sc.DeletePod,
   715  			},
   716  		})
   717  
   718  	if options.ServerOpts != nil && options.ServerOpts.EnablePriorityClass && utilfeature.DefaultFeatureGate.Enabled(features.PriorityClass) {
   719  		sc.pcInformer = informerFactory.Scheduling().V1().PriorityClasses()
   720  		sc.pcInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   721  			AddFunc:    sc.AddPriorityClass,
   722  			UpdateFunc: sc.UpdatePriorityClass,
   723  			DeleteFunc: sc.DeletePriorityClass,
   724  		})
   725  	}
   726  
   727  	sc.quotaInformer = informerFactory.Core().V1().ResourceQuotas()
   728  	sc.quotaInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   729  		AddFunc:    sc.AddResourceQuota,
   730  		UpdateFunc: sc.UpdateResourceQuota,
   731  		DeleteFunc: sc.DeleteResourceQuota,
   732  	})
   733  
   734  	vcinformers := vcinformer.NewSharedInformerFactory(sc.vcClient, 0)
   735  	sc.vcInformerFactory = vcinformers
   736  
   737  	// create informer for PodGroup(v1beta1) information
   738  	sc.podGroupInformerV1beta1 = vcinformers.Scheduling().V1beta1().PodGroups()
   739  	sc.podGroupInformerV1beta1.Informer().AddEventHandler(
   740  		cache.FilteringResourceEventHandler{
   741  			FilterFunc: func(obj interface{}) bool {
   742  				var pg *vcv1beta1.PodGroup
   743  				switch v := obj.(type) {
   744  				case *vcv1beta1.PodGroup:
   745  					pg = v
   746  				case cache.DeletedFinalStateUnknown:
   747  					var ok bool
   748  					pg, ok = v.Obj.(*vcv1beta1.PodGroup)
   749  					if !ok {
   750  						klog.Errorf("Cannot convert to podgroup: %v", v.Obj)
   751  						return false
   752  					}
   753  				default:
   754  					return false
   755  				}
   756  
   757  				return responsibleForPodGroup(pg, mySchedulerPodName, c)
   758  			},
   759  			Handler: cache.ResourceEventHandlerFuncs{
   760  				AddFunc:    sc.AddPodGroupV1beta1,
   761  				UpdateFunc: sc.UpdatePodGroupV1beta1,
   762  				DeleteFunc: sc.DeletePodGroupV1beta1,
   763  			},
   764  		})
   765  
   766  	// create informer(v1beta1) for Queue information
   767  	sc.queueInformerV1beta1 = vcinformers.Scheduling().V1beta1().Queues()
   768  	sc.queueInformerV1beta1.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   769  		AddFunc:    sc.AddQueueV1beta1,
   770  		UpdateFunc: sc.UpdateQueueV1beta1,
   771  		DeleteFunc: sc.DeleteQueueV1beta1,
   772  	})
   773  
   774  	if utilfeature.DefaultFeatureGate.Enabled(features.ResourceTopology) {
   775  		sc.cpuInformer = vcinformers.Nodeinfo().V1alpha1().Numatopologies()
   776  		sc.cpuInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   777  			AddFunc:    sc.AddNumaInfoV1alpha1,
   778  			UpdateFunc: sc.UpdateNumaInfoV1alpha1,
   779  			DeleteFunc: sc.DeleteNumaInfoV1alpha1,
   780  		})
   781  	}
   782  }
   783  
   784  // Run  starts the schedulerCache
   785  func (sc *SchedulerCache) Run(stopCh <-chan struct{}) {
   786  	sc.informerFactory.Start(stopCh)
   787  	sc.vcInformerFactory.Start(stopCh)
   788  	sc.WaitForCacheSync(stopCh)
   789  	for i := 0; i < int(sc.nodeWorkers); i++ {
   790  		go wait.Until(sc.runNodeWorker, 0, stopCh)
   791  	}
   792  
   793  	// Re-sync error tasks.
   794  	go wait.Until(sc.processResyncTask, 0, stopCh)
   795  
   796  	// Cleanup jobs.
   797  	go wait.Until(sc.processCleanupJob, 0, stopCh)
   798  
   799  	go wait.Until(sc.processBindTask, time.Millisecond*20, stopCh)
   800  
   801  	// Get metrics data
   802  	klog.V(3).Infof("Start metrics collection, metricsConf is %v", sc.metricsConf)
   803  	interval, err := time.ParseDuration(sc.metricsConf["interval"])
   804  	if err != nil || interval <= 0 {
   805  		interval = defaultMetricsInternal
   806  	}
   807  	klog.V(3).Infof("The interval for querying metrics data is %v", interval)
   808  	go wait.Until(sc.GetMetricsData, interval, stopCh)
   809  }
   810  
   811  // WaitForCacheSync sync the cache with the api server
   812  func (sc *SchedulerCache) WaitForCacheSync(stopCh <-chan struct{}) {
   813  	sc.informerFactory.WaitForCacheSync(stopCh)
   814  	sc.vcInformerFactory.WaitForCacheSync(stopCh)
   815  }
   816  
   817  // findJobAndTask returns job and the task info
   818  func (sc *SchedulerCache) findJobAndTask(taskInfo *schedulingapi.TaskInfo) (*schedulingapi.JobInfo, *schedulingapi.TaskInfo, error) {
   819  	job, found := sc.Jobs[taskInfo.Job]
   820  	if !found {
   821  		return nil, nil, fmt.Errorf("failed to find Job %v for Task %v",
   822  			taskInfo.Job, taskInfo.UID)
   823  	}
   824  
   825  	task, found := job.Tasks[taskInfo.UID]
   826  	if !found {
   827  		return nil, nil, fmt.Errorf("failed to find task in status %v by id %v",
   828  			taskInfo.Status, taskInfo.UID)
   829  	}
   830  
   831  	return job, task, nil
   832  }
   833  
   834  // Evict will evict the pod.
   835  //
   836  // If error occurs both task and job are guaranteed to be in the original state.
   837  func (sc *SchedulerCache) Evict(taskInfo *schedulingapi.TaskInfo, reason string) error {
   838  	sc.Mutex.Lock()
   839  	defer sc.Mutex.Unlock()
   840  
   841  	job, task, err := sc.findJobAndTask(taskInfo)
   842  
   843  	if err != nil {
   844  		return err
   845  	}
   846  
   847  	node, found := sc.Nodes[task.NodeName]
   848  	if !found {
   849  		return fmt.Errorf("failed to bind Task %v to host %v, host does not exist",
   850  			task.UID, task.NodeName)
   851  	}
   852  
   853  	originalStatus := task.Status
   854  	if err := job.UpdateTaskStatus(task, schedulingapi.Releasing); err != nil {
   855  		return err
   856  	}
   857  
   858  	// Add new task to node.
   859  	if err := node.UpdateTask(task); err != nil {
   860  		// After failing to update task to a node we need to revert task status from Releasing,
   861  		// otherwise task might be stuck in the Releasing state indefinitely.
   862  		if err := job.UpdateTaskStatus(task, originalStatus); err != nil {
   863  			klog.Errorf("Task <%s/%s> will be resynchronized after failing to revert status "+
   864  				"from %s to %s after failing to update Task on Node <%s>: %v",
   865  				task.Namespace, task.Name, task.Status, originalStatus, node.Name, err)
   866  			sc.resyncTask(task)
   867  		}
   868  		return err
   869  	}
   870  
   871  	p := task.Pod
   872  
   873  	go func() {
   874  		err := sc.Evictor.Evict(p, reason)
   875  		if err != nil {
   876  			sc.resyncTask(task)
   877  		}
   878  	}()
   879  
   880  	podgroup := &vcv1beta1.PodGroup{}
   881  	if job.PodGroup != nil {
   882  		err = schedulingscheme.Scheme.Convert(&job.PodGroup.PodGroup, podgroup, nil)
   883  	} else {
   884  		err = fmt.Errorf("the PodGroup of Job <%s/%s> is nil", job.Namespace, job.Name)
   885  	}
   886  
   887  	if err != nil {
   888  		klog.Errorf("Error while converting PodGroup to v1alpha1.PodGroup with error: %v", err)
   889  		return err
   890  	}
   891  	sc.Recorder.Eventf(podgroup, v1.EventTypeNormal, "Evict", reason)
   892  	return nil
   893  }
   894  
   895  // Bind binds task to the target host.
   896  func (sc *SchedulerCache) Bind(tasks []*schedulingapi.TaskInfo) {
   897  	tmp := time.Now()
   898  	errTasks, err := sc.Binder.Bind(sc.kubeClient, tasks)
   899  	if err == nil {
   900  		klog.V(3).Infof("bind ok, latency %v", time.Since(tmp))
   901  	} else {
   902  		for _, task := range errTasks {
   903  			klog.V(2).Infof("resyncTask task %s", task.Name)
   904  			sc.VolumeBinder.RevertVolumes(task, task.PodVolumes)
   905  			sc.resyncTask(task)
   906  		}
   907  	}
   908  }
   909  
   910  // BindPodGroup binds job to silo cluster
   911  func (sc *SchedulerCache) BindPodGroup(job *schedulingapi.JobInfo, cluster string) error {
   912  	if _, err := sc.PodGroupBinder.Bind(job, cluster); err != nil {
   913  		klog.Errorf("Bind job <%s> to cluster <%s> failed: %v", job.Name, cluster, err)
   914  		return err
   915  	}
   916  	return nil
   917  }
   918  
   919  // GetPodVolumes get pod volume on the host
   920  func (sc *SchedulerCache) GetPodVolumes(task *schedulingapi.TaskInfo, node *v1.Node) (*volumescheduling.PodVolumes, error) {
   921  	return sc.VolumeBinder.GetPodVolumes(task, node)
   922  }
   923  
   924  // AllocateVolumes allocates volume on the host to the task
   925  func (sc *SchedulerCache) AllocateVolumes(task *schedulingapi.TaskInfo, hostname string, podVolumes *volumescheduling.PodVolumes) error {
   926  	return sc.VolumeBinder.AllocateVolumes(task, hostname, podVolumes)
   927  }
   928  
   929  // BindVolumes binds volumes to the task
   930  func (sc *SchedulerCache) BindVolumes(task *schedulingapi.TaskInfo, podVolumes *volumescheduling.PodVolumes) error {
   931  	return sc.VolumeBinder.BindVolumes(task, podVolumes)
   932  }
   933  
   934  // RevertVolumes clean cache generated by AllocateVolumes
   935  func (sc *SchedulerCache) RevertVolumes(task *schedulingapi.TaskInfo, podVolumes *volumescheduling.PodVolumes) {
   936  	sc.VolumeBinder.RevertVolumes(task, podVolumes)
   937  }
   938  
   939  // Client returns the kubernetes clientSet
   940  func (sc *SchedulerCache) Client() kubernetes.Interface {
   941  	return sc.kubeClient
   942  }
   943  
   944  // ClientConfig returns the rest config
   945  func (sc *SchedulerCache) ClientConfig() *rest.Config {
   946  	return sc.restConfig
   947  }
   948  
   949  // SharedInformerFactory returns the scheduler SharedInformerFactory
   950  func (sc *SchedulerCache) SharedInformerFactory() informers.SharedInformerFactory {
   951  	return sc.informerFactory
   952  }
   953  
   954  // SetSharedInformerFactory sets the scheduler SharedInformerFactory for unit test
   955  func (sc *SchedulerCache) SetSharedInformerFactory(factory informers.SharedInformerFactory) {
   956  	sc.informerFactory = factory
   957  }
   958  
   959  // UpdateSchedulerNumaInfo used to update scheduler node cache NumaSchedulerInfo
   960  func (sc *SchedulerCache) UpdateSchedulerNumaInfo(AllocatedSets map[string]schedulingapi.ResNumaSets) error {
   961  	sc.Mutex.Lock()
   962  	defer sc.Mutex.Unlock()
   963  
   964  	for nodeName, sets := range AllocatedSets {
   965  		if _, found := sc.Nodes[nodeName]; !found {
   966  			continue
   967  		}
   968  
   969  		numaInfo := sc.Nodes[nodeName].NumaSchedulerInfo
   970  		if numaInfo == nil {
   971  			continue
   972  		}
   973  
   974  		numaInfo.Allocate(sets)
   975  	}
   976  	return nil
   977  }
   978  
   979  // EventRecorder returns the Event Recorder
   980  func (sc *SchedulerCache) EventRecorder() record.EventRecorder {
   981  	return sc.Recorder
   982  }
   983  
   984  // taskUnschedulable updates pod status of pending task
   985  func (sc *SchedulerCache) taskUnschedulable(task *schedulingapi.TaskInfo, reason, message string) error {
   986  	pod := task.Pod
   987  
   988  	condition := &v1.PodCondition{
   989  		Type:    v1.PodScheduled,
   990  		Status:  v1.ConditionFalse,
   991  		Reason:  reason, // Add more reasons in order to distinguish more specific scenario of pending tasks
   992  		Message: message,
   993  	}
   994  
   995  	if podConditionHaveUpdate(&pod.Status, condition) {
   996  		pod = pod.DeepCopy()
   997  
   998  		// The reason field in 'Events' should be "FailedScheduling", there is not constants defined for this in
   999  		// k8s core, so using the same string here.
  1000  		// The reason field in PodCondition can be "Unschedulable"
  1001  		sc.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", message)
  1002  		if _, err := sc.StatusUpdater.UpdatePodCondition(pod, condition); err != nil {
  1003  			return err
  1004  		}
  1005  	} else {
  1006  		klog.V(4).Infof("task unscheduleable %s/%s, message: %s, skip by no condition update", pod.Namespace, pod.Name, message)
  1007  	}
  1008  
  1009  	return nil
  1010  }
  1011  
  1012  func (sc *SchedulerCache) deleteJob(job *schedulingapi.JobInfo) {
  1013  	klog.V(3).Infof("Try to delete Job <%v:%v/%v>", job.UID, job.Namespace, job.Name)
  1014  
  1015  	sc.DeletedJobs.Add(job)
  1016  }
  1017  
  1018  func (sc *SchedulerCache) retryDeleteJob(job *schedulingapi.JobInfo) {
  1019  	klog.V(3).Infof("Retry to delete Job <%v:%v/%v>", job.UID, job.Namespace, job.Name)
  1020  
  1021  	sc.DeletedJobs.AddRateLimited(job)
  1022  }
  1023  
  1024  func (sc *SchedulerCache) processCleanupJob() {
  1025  	obj, shutdown := sc.DeletedJobs.Get()
  1026  	if shutdown {
  1027  		return
  1028  	}
  1029  
  1030  	defer sc.DeletedJobs.Done(obj)
  1031  
  1032  	job, found := obj.(*schedulingapi.JobInfo)
  1033  	if !found {
  1034  		klog.Errorf("Failed to convert <%v> to *JobInfo", obj)
  1035  		return
  1036  	}
  1037  
  1038  	sc.Mutex.Lock()
  1039  	defer sc.Mutex.Unlock()
  1040  
  1041  	if schedulingapi.JobTerminated(job) {
  1042  		oldJob, found := sc.Jobs[job.UID]
  1043  		if !found {
  1044  			klog.V(3).Infof("Failed to find Job <%v:%v/%v>, ignore it", job.UID, job.Namespace, job.Name)
  1045  			sc.DeletedJobs.Forget(obj)
  1046  			return
  1047  		}
  1048  		newPgVersion := oldJob.PgUID
  1049  		oldPgVersion := job.PgUID
  1050  		klog.V(5).Infof("Just add pguid:%v, try to delete pguid:%v", newPgVersion, oldPgVersion)
  1051  		if oldPgVersion == newPgVersion {
  1052  			delete(sc.Jobs, job.UID)
  1053  			metrics.DeleteJobMetrics(job.Name, string(job.Queue), job.Namespace)
  1054  			klog.V(3).Infof("Job <%v:%v/%v> was deleted.", job.UID, job.Namespace, job.Name)
  1055  		}
  1056  		sc.DeletedJobs.Forget(obj)
  1057  	} else {
  1058  		// Retry
  1059  		sc.retryDeleteJob(job)
  1060  	}
  1061  }
  1062  
  1063  func (sc *SchedulerCache) resyncTask(task *schedulingapi.TaskInfo) {
  1064  	key := sc.generateErrTaskKey(task)
  1065  	sc.errTasks.AddRateLimited(key)
  1066  }
  1067  
  1068  func (sc *SchedulerCache) generateErrTaskKey(task *schedulingapi.TaskInfo) string {
  1069  	// Job UID is namespace + / +name, for example: theNs/theJob
  1070  	// Task UID is derived from the Pod UID, for example: d336abea-4f14-42c7-8a6b-092959a31407
  1071  	// In the example above, the key ultimately becomes: theNs/theJob/d336abea-4f14-42c7-8a6b-092959a31407
  1072  	return fmt.Sprintf("%s/%s", task.Job, task.UID)
  1073  }
  1074  
  1075  func (sc *SchedulerCache) parseErrTaskKey(key string) (*schedulingapi.TaskInfo, error) {
  1076  	i := strings.LastIndex(key, "/")
  1077  	if i == -1 {
  1078  		return nil, fmt.Errorf("failed to split task key %s", key)
  1079  	}
  1080  
  1081  	jobUID := key[:i]
  1082  	taskUID := key[i+1:]
  1083  
  1084  	sc.Mutex.Lock()
  1085  	defer sc.Mutex.Unlock()
  1086  
  1087  	job, found := sc.Jobs[schedulingapi.JobID(jobUID)]
  1088  	if !found {
  1089  		return nil, fmt.Errorf("failed to find job %s", jobUID)
  1090  	}
  1091  
  1092  	task, found := job.Tasks[schedulingapi.TaskID(taskUID)]
  1093  	if !found {
  1094  		return nil, fmt.Errorf("failed to find task %s", taskUID)
  1095  	}
  1096  
  1097  	return task, nil
  1098  }
  1099  
  1100  func (sc *SchedulerCache) processResyncTask() {
  1101  	obj, shutdown := sc.errTasks.Get()
  1102  	if shutdown {
  1103  		return
  1104  	}
  1105  
  1106  	klog.V(5).Infof("the length of errTasks is %d", sc.errTasks.Len())
  1107  
  1108  	defer sc.errTasks.Done(obj)
  1109  
  1110  	taskKey, ok := obj.(string)
  1111  	if !ok {
  1112  		klog.Errorf("Failed to convert %v to string.", obj)
  1113  		sc.errTasks.Forget(obj)
  1114  		return
  1115  	}
  1116  
  1117  	task, err := sc.parseErrTaskKey(taskKey)
  1118  	if err != nil {
  1119  		klog.ErrorS(err, "Failed to get task for sync task", "taskKey", taskKey)
  1120  		sc.errTasks.Forget(obj)
  1121  		return
  1122  	}
  1123  
  1124  	reSynced := false
  1125  	if err := sc.syncTask(task); err != nil {
  1126  		klog.ErrorS(err, "Failed to sync task, retry it", "namespace", task.Namespace, "name", task.Name)
  1127  		sc.resyncTask(task)
  1128  		reSynced = true
  1129  	} else {
  1130  		klog.V(4).Infof("Successfully synced task <%s/%s>", task.Namespace, task.Name)
  1131  		sc.errTasks.Forget(obj)
  1132  	}
  1133  
  1134  	// execute custom bind err handler call back func if exists.
  1135  	if task.CustomBindErrHandler != nil && !task.CustomBindErrHandlerSucceeded {
  1136  		err := task.CustomBindErrHandler()
  1137  		if err != nil {
  1138  			klog.ErrorS(err, "Failed to execute custom bind err handler, retry it.")
  1139  		} else {
  1140  			task.CustomBindErrHandlerSucceeded = true
  1141  		}
  1142  		if !task.CustomBindErrHandlerSucceeded && !reSynced {
  1143  			sc.resyncTask(task)
  1144  		}
  1145  	}
  1146  }
  1147  
  1148  func (sc *SchedulerCache) runNodeWorker() {
  1149  	for sc.processSyncNode() {
  1150  	}
  1151  }
  1152  
  1153  func (sc *SchedulerCache) processSyncNode() bool {
  1154  	obj, shutdown := sc.nodeQueue.Get()
  1155  	if shutdown {
  1156  		return false
  1157  	}
  1158  	defer sc.nodeQueue.Done(obj)
  1159  
  1160  	nodeName, ok := obj.(string)
  1161  	if !ok {
  1162  		klog.Errorf("failed to convert %v to string", obj)
  1163  		return true
  1164  	}
  1165  
  1166  	klog.V(5).Infof("started sync node %s", nodeName)
  1167  	err := sc.SyncNode(nodeName)
  1168  	if err == nil {
  1169  		sc.nodeQueue.Forget(nodeName)
  1170  		return true
  1171  	}
  1172  
  1173  	klog.Errorf("Failed to sync node <%s>, retry it.", nodeName)
  1174  	sc.nodeQueue.AddRateLimited(nodeName)
  1175  	return true
  1176  }
  1177  
  1178  // AddBindTask add task to be bind to a cache which consumes by go runtime
  1179  func (sc *SchedulerCache) AddBindTask(taskInfo *schedulingapi.TaskInfo) error {
  1180  	klog.V(5).Infof("add bind task %v/%v", taskInfo.Namespace, taskInfo.Name)
  1181  	sc.Mutex.Lock()
  1182  	defer sc.Mutex.Unlock()
  1183  	job, task, err := sc.findJobAndTask(taskInfo)
  1184  	if err != nil {
  1185  		return err
  1186  	}
  1187  
  1188  	node, found := sc.Nodes[taskInfo.NodeName]
  1189  	if !found {
  1190  		return fmt.Errorf("failed to bind Task %v to host %v, host does not exist",
  1191  			task.UID, taskInfo.NodeName)
  1192  	}
  1193  
  1194  	originalStatus := task.Status
  1195  	if err := job.UpdateTaskStatus(task, schedulingapi.Binding); err != nil {
  1196  		return err
  1197  	}
  1198  
  1199  	err = taskInfo.SetPodResourceDecision()
  1200  	if err != nil {
  1201  		return fmt.Errorf("set task %v/%v resource decision failed, err %v", task.Namespace, task.Name, err)
  1202  	}
  1203  	task.NumaInfo = taskInfo.NumaInfo.Clone()
  1204  
  1205  	// Add task to the node.
  1206  	if err := node.AddTask(task); err != nil {
  1207  		// After failing to update task to a node we need to revert task status from Releasing,
  1208  		// otherwise task might be stuck in the Releasing state indefinitely.
  1209  		if err := job.UpdateTaskStatus(task, originalStatus); err != nil {
  1210  			klog.Errorf("Task <%s/%s> will be resynchronized after failing to revert status "+
  1211  				"from %s to %s after failing to update Task on Node <%s>: %v",
  1212  				task.Namespace, task.Name, task.Status, originalStatus, node.Name, err)
  1213  			sc.resyncTask(task)
  1214  		}
  1215  		return err
  1216  	}
  1217  
  1218  	sc.BindFlowChannel <- taskInfo
  1219  
  1220  	return nil
  1221  }
  1222  
  1223  func (sc *SchedulerCache) processBindTask() {
  1224  	for {
  1225  		select {
  1226  		case taskInfo, ok := <-sc.BindFlowChannel:
  1227  			if !ok {
  1228  				return
  1229  			}
  1230  
  1231  			sc.bindCache = append(sc.bindCache, taskInfo)
  1232  			if len(sc.bindCache) == sc.batchNum {
  1233  				sc.BindTask()
  1234  			}
  1235  		default:
  1236  		}
  1237  
  1238  		if len(sc.BindFlowChannel) == 0 {
  1239  			break
  1240  		}
  1241  	}
  1242  
  1243  	if len(sc.bindCache) == 0 {
  1244  		return
  1245  	}
  1246  	sc.BindTask()
  1247  }
  1248  
  1249  // BindTask do k8s binding with a goroutine
  1250  func (sc *SchedulerCache) BindTask() {
  1251  	klog.V(5).Infof("batch bind task count %d", len(sc.bindCache))
  1252  	var tmpBindCache []*schedulingapi.TaskInfo = make([]*schedulingapi.TaskInfo, len(sc.bindCache))
  1253  	copy(tmpBindCache, sc.bindCache)
  1254  	go func(tasks []*schedulingapi.TaskInfo) {
  1255  		successfulTasks := make([]*schedulingapi.TaskInfo, 0)
  1256  		for _, task := range tasks {
  1257  			if err := sc.VolumeBinder.BindVolumes(task, task.PodVolumes); err != nil {
  1258  				klog.Errorf("task %s/%s bind Volumes failed: %#v", task.Namespace, task.Name, err)
  1259  				sc.VolumeBinder.RevertVolumes(task, task.PodVolumes)
  1260  				sc.resyncTask(task)
  1261  			} else {
  1262  				successfulTasks = append(successfulTasks, task)
  1263  				klog.V(5).Infof("task %s/%s bind Volumes done", task.Namespace, task.Name)
  1264  			}
  1265  		}
  1266  
  1267  		bindTasks := make([]*schedulingapi.TaskInfo, len(successfulTasks))
  1268  		copy(bindTasks, successfulTasks)
  1269  		sc.Bind(bindTasks)
  1270  	}(tmpBindCache)
  1271  	sc.bindCache = sc.bindCache[0:0]
  1272  }
  1273  
  1274  // Snapshot returns the complete snapshot of the cluster from cache
  1275  func (sc *SchedulerCache) Snapshot() *schedulingapi.ClusterInfo {
  1276  	sc.Mutex.Lock()
  1277  	defer sc.Mutex.Unlock()
  1278  
  1279  	snapshot := &schedulingapi.ClusterInfo{
  1280  		Nodes:          make(map[string]*schedulingapi.NodeInfo),
  1281  		Jobs:           make(map[schedulingapi.JobID]*schedulingapi.JobInfo),
  1282  		Queues:         make(map[schedulingapi.QueueID]*schedulingapi.QueueInfo),
  1283  		NamespaceInfo:  make(map[schedulingapi.NamespaceName]*schedulingapi.NamespaceInfo),
  1284  		RevocableNodes: make(map[string]*schedulingapi.NodeInfo),
  1285  		NodeList:       make([]string, len(sc.NodeList)),
  1286  		CSINodesStatus: make(map[string]*schedulingapi.CSINodeStatusInfo),
  1287  	}
  1288  
  1289  	copy(snapshot.NodeList, sc.NodeList)
  1290  	for _, value := range sc.Nodes {
  1291  		value.RefreshNumaSchedulerInfoByCrd()
  1292  	}
  1293  
  1294  	for _, value := range sc.CSINodesStatus {
  1295  		snapshot.CSINodesStatus[value.CSINodeName] = value.Clone()
  1296  	}
  1297  
  1298  	for _, value := range sc.Nodes {
  1299  		if !value.Ready() {
  1300  			continue
  1301  		}
  1302  
  1303  		snapshot.Nodes[value.Name] = value.Clone()
  1304  
  1305  		if value.RevocableZone != "" {
  1306  			snapshot.RevocableNodes[value.Name] = snapshot.Nodes[value.Name]
  1307  		}
  1308  	}
  1309  
  1310  	for _, value := range sc.Queues {
  1311  		snapshot.Queues[value.UID] = value.Clone()
  1312  	}
  1313  
  1314  	var cloneJobLock sync.Mutex
  1315  	var wg sync.WaitGroup
  1316  
  1317  	cloneJob := func(value *schedulingapi.JobInfo) {
  1318  		defer wg.Done()
  1319  		if value.PodGroup != nil {
  1320  			value.Priority = sc.defaultPriority
  1321  
  1322  			priName := value.PodGroup.Spec.PriorityClassName
  1323  			if priorityClass, found := sc.PriorityClasses[priName]; found {
  1324  				value.Priority = priorityClass.Value
  1325  			}
  1326  
  1327  			klog.V(4).Infof("The priority of job <%s/%s> is <%s/%d>",
  1328  				value.Namespace, value.Name, priName, value.Priority)
  1329  		}
  1330  
  1331  		clonedJob := value.Clone()
  1332  
  1333  		cloneJobLock.Lock()
  1334  		snapshot.Jobs[value.UID] = clonedJob
  1335  		cloneJobLock.Unlock()
  1336  	}
  1337  
  1338  	for _, value := range sc.NamespaceCollection {
  1339  		info := value.Snapshot()
  1340  		snapshot.NamespaceInfo[info.Name] = info
  1341  	}
  1342  
  1343  	for _, value := range sc.Jobs {
  1344  		// If no scheduling spec, does not handle it.
  1345  		if value.PodGroup == nil {
  1346  			klog.V(4).Infof("The scheduling spec of Job <%v:%s/%s> is nil, ignore it.",
  1347  				value.UID, value.Namespace, value.Name)
  1348  
  1349  			continue
  1350  		}
  1351  
  1352  		if _, found := snapshot.Queues[value.Queue]; !found {
  1353  			klog.V(3).Infof("The Queue <%v> of Job <%v/%v> does not exist, ignore it.",
  1354  				value.Queue, value.Namespace, value.Name)
  1355  			continue
  1356  		}
  1357  
  1358  		wg.Add(1)
  1359  		go cloneJob(value)
  1360  	}
  1361  	wg.Wait()
  1362  
  1363  	klog.V(3).Infof("There are <%d> Jobs, <%d> Queues and <%d> Nodes in total for scheduling.",
  1364  		len(snapshot.Jobs), len(snapshot.Queues), len(snapshot.Nodes))
  1365  
  1366  	return snapshot
  1367  }
  1368  
  1369  // String returns information about the cache in a string format
  1370  func (sc *SchedulerCache) String() string {
  1371  	sc.Mutex.Lock()
  1372  	defer sc.Mutex.Unlock()
  1373  
  1374  	str := "Cache:\n"
  1375  
  1376  	if len(sc.Nodes) != 0 {
  1377  		str += "Nodes:\n"
  1378  		for _, n := range sc.Nodes {
  1379  			str += fmt.Sprintf("\t %s: idle(%v) used(%v) allocatable(%v) pods(%d)\n",
  1380  				n.Name, n.Idle, n.Used, n.Allocatable, len(n.Tasks))
  1381  
  1382  			i := 0
  1383  			for _, p := range n.Tasks {
  1384  				str += fmt.Sprintf("\t\t %d: %v\n", i, p)
  1385  				i++
  1386  			}
  1387  		}
  1388  	}
  1389  
  1390  	if len(sc.Jobs) != 0 {
  1391  		str += "Jobs:\n"
  1392  		for _, job := range sc.Jobs {
  1393  			str += fmt.Sprintf("\t %s\n", job)
  1394  		}
  1395  	}
  1396  
  1397  	if len(sc.NamespaceCollection) != 0 {
  1398  		str += "Namespaces:\n"
  1399  		for _, ns := range sc.NamespaceCollection {
  1400  			info := ns.Snapshot()
  1401  			str += fmt.Sprintf("\t Namespace(%s)\n", info.Name)
  1402  		}
  1403  	}
  1404  
  1405  	if len(sc.NodeList) != 0 {
  1406  		str += fmt.Sprintf("NodeList: %v\n", sc.NodeList)
  1407  	}
  1408  
  1409  	return str
  1410  }
  1411  
  1412  // RecordJobStatusEvent records related events according to job status.
  1413  func (sc *SchedulerCache) RecordJobStatusEvent(job *schedulingapi.JobInfo, updatePG bool) {
  1414  	pgUnschedulable := job.PodGroup != nil &&
  1415  		(job.PodGroup.Status.Phase == scheduling.PodGroupUnknown ||
  1416  			job.PodGroup.Status.Phase == scheduling.PodGroupPending ||
  1417  			job.PodGroup.Status.Phase == scheduling.PodGroupInqueue)
  1418  
  1419  	// If pending or unschedulable, record unschedulable event.
  1420  	if pgUnschedulable {
  1421  		msg := fmt.Sprintf("%v/%v tasks in gang unschedulable: %v",
  1422  			len(job.TaskStatusIndex[schedulingapi.Pending]),
  1423  			len(job.Tasks),
  1424  			job.FitError())
  1425  		sc.recordPodGroupEvent(job.PodGroup, v1.EventTypeWarning, string(scheduling.PodGroupUnschedulableType), msg)
  1426  	} else if updatePG {
  1427  		sc.recordPodGroupEvent(job.PodGroup, v1.EventTypeNormal, string(scheduling.PodGroupScheduled), string(scheduling.PodGroupReady))
  1428  	}
  1429  
  1430  	baseErrorMessage := job.JobFitErrors
  1431  	if baseErrorMessage == "" {
  1432  		baseErrorMessage = schedulingapi.AllNodeUnavailableMsg
  1433  	}
  1434  	// Update podCondition for tasks Allocated and Pending before job discarded
  1435  	for _, status := range []schedulingapi.TaskStatus{schedulingapi.Allocated, schedulingapi.Pending, schedulingapi.Pipelined} {
  1436  		for _, taskInfo := range job.TaskStatusIndex[status] {
  1437  			reason, msg := job.TaskSchedulingReason(taskInfo.UID)
  1438  			if len(msg) == 0 {
  1439  				msg = baseErrorMessage
  1440  			}
  1441  			if err := sc.taskUnschedulable(taskInfo, reason, msg); err != nil {
  1442  				klog.Errorf("Failed to update unschedulable task status <%s/%s>: %v",
  1443  					taskInfo.Namespace, taskInfo.Name, err)
  1444  			}
  1445  		}
  1446  	}
  1447  }
  1448  
  1449  // UpdateJobStatus update the status of job and its tasks.
  1450  func (sc *SchedulerCache) UpdateJobStatus(job *schedulingapi.JobInfo, updatePG bool) (*schedulingapi.JobInfo, error) {
  1451  	if updatePG {
  1452  		pg, err := sc.StatusUpdater.UpdatePodGroup(job.PodGroup)
  1453  		if err != nil {
  1454  			return nil, err
  1455  		}
  1456  		job.PodGroup = pg
  1457  	}
  1458  
  1459  	sc.RecordJobStatusEvent(job, updatePG)
  1460  
  1461  	return job, nil
  1462  }
  1463  
  1464  // UpdateQueueStatus update the status of queue.
  1465  func (sc *SchedulerCache) UpdateQueueStatus(queue *schedulingapi.QueueInfo) error {
  1466  	return sc.StatusUpdater.UpdateQueueStatus(queue)
  1467  }
  1468  
  1469  func (sc *SchedulerCache) recordPodGroupEvent(podGroup *schedulingapi.PodGroup, eventType, reason, msg string) {
  1470  	if podGroup == nil {
  1471  		return
  1472  	}
  1473  
  1474  	pg := &vcv1beta1.PodGroup{}
  1475  	if err := schedulingscheme.Scheme.Convert(&podGroup.PodGroup, pg, nil); err != nil {
  1476  		klog.Errorf("Error while converting PodGroup to v1alpha1.PodGroup with error: %v", err)
  1477  		return
  1478  	}
  1479  	sc.Recorder.Eventf(pg, eventType, reason, msg)
  1480  }
  1481  
  1482  func (sc *SchedulerCache) SetMetricsConf(conf map[string]string) {
  1483  	sc.metricsConf = conf
  1484  }
  1485  
  1486  func (sc *SchedulerCache) GetMetricsData() {
  1487  	metricsType := sc.metricsConf["type"]
  1488  	if len(metricsType) == 0 {
  1489  		klog.V(3).Infof("The metrics type is not set in the volcano scheduler configmap file. " +
  1490  			"As a result, the CPU and memory load information of the node is not collected.")
  1491  		return
  1492  	}
  1493  
  1494  	client, err := source.NewMetricsClient(sc.restConfig, sc.metricsConf)
  1495  	if err != nil {
  1496  		klog.Errorf("Error creating client: %v\n", err)
  1497  		return
  1498  	}
  1499  	ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
  1500  	defer cancel()
  1501  	nodeMetricsMap := make(map[string]*source.NodeMetrics, len(sc.NodeList))
  1502  	sc.Mutex.Lock()
  1503  
  1504  	for _, nodeName := range sc.NodeList {
  1505  		nodeMetricsMap[nodeName] = &source.NodeMetrics{}
  1506  	}
  1507  	sc.Mutex.Unlock()
  1508  
  1509  	err = client.NodesMetricsAvg(ctx, nodeMetricsMap)
  1510  	if err != nil {
  1511  		klog.Errorf("Error getting node metrics: %v\n", err)
  1512  		return
  1513  	}
  1514  
  1515  	sc.setMetricsData(nodeMetricsMap)
  1516  }
  1517  
  1518  func (sc *SchedulerCache) setMetricsData(usageInfo map[string]*source.NodeMetrics) {
  1519  	sc.Mutex.Lock()
  1520  	defer sc.Mutex.Unlock()
  1521  
  1522  	for nodeName, nodeMetric := range usageInfo {
  1523  		nodeUsage := &schedulingapi.NodeUsage{
  1524  			CPUUsageAvg: make(map[string]float64),
  1525  			MEMUsageAvg: make(map[string]float64),
  1526  		}
  1527  		nodeUsage.MetricsTime = nodeMetric.MetricsTime
  1528  		nodeUsage.CPUUsageAvg[source.NODE_METRICS_PERIOD] = nodeMetric.CPU
  1529  		nodeUsage.MEMUsageAvg[source.NODE_METRICS_PERIOD] = nodeMetric.Memory
  1530  
  1531  		nodeInfo, ok := sc.Nodes[nodeName]
  1532  		if !ok {
  1533  			klog.Errorf("The information about node %s cannot be found in the cache.", nodeName)
  1534  			continue
  1535  		}
  1536  		klog.V(5).Infof("node: %s, ResourceUsage: %+v => %+v", nodeName, *nodeInfo.ResourceUsage, nodeUsage)
  1537  		nodeInfo.ResourceUsage = nodeUsage
  1538  	}
  1539  }
  1540  
  1541  // createImageStateSummary returns a summarizing snapshot of the given image's state.
  1542  func (sc *SchedulerCache) createImageStateSummary(state *imageState) *framework.ImageStateSummary {
  1543  	return &framework.ImageStateSummary{
  1544  		Size:     state.size,
  1545  		NumNodes: len(state.nodes),
  1546  	}
  1547  }