volcano.sh/volcano@v1.9.0/pkg/scheduler/api/job_info.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package api
    18  
    19  import (
    20  	"encoding/json"
    21  	"errors"
    22  	"fmt"
    23  	"sort"
    24  	"strconv"
    25  	"strings"
    26  	"time"
    27  
    28  	v1 "k8s.io/api/core/v1"
    29  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    30  	"k8s.io/apimachinery/pkg/types"
    31  	"k8s.io/klog/v2"
    32  
    33  	batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
    34  	"volcano.sh/apis/pkg/apis/scheduling"
    35  	"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
    36  
    37  	volumescheduling "volcano.sh/volcano/pkg/scheduler/capabilities/volumebinding"
    38  )
    39  
// DisruptionBudget defines a job's disruption budget: its min-available and
// max-unavailable pod values, both kept as raw strings.
type DisruptionBudget struct {
	MinAvailable  string // min-available value, stored verbatim
	MaxUnavilable string // max-unavailable value, stored verbatim (historical spelling of the field name)
}
    45  
    46  // NewDisruptionBudget create disruption budget for job
    47  func NewDisruptionBudget(minAvailable, maxUnavilable string) *DisruptionBudget {
    48  	disruptionBudget := &DisruptionBudget{
    49  		MinAvailable:  minAvailable,
    50  		MaxUnavilable: maxUnavilable,
    51  	}
    52  	return disruptionBudget
    53  }
    54  
    55  // Clone return a clone of DisruptionBudget
    56  func (db *DisruptionBudget) Clone() *DisruptionBudget {
    57  	return &DisruptionBudget{
    58  		MinAvailable:  db.MinAvailable,
    59  		MaxUnavilable: db.MaxUnavilable,
    60  	}
    61  }
    62  
// JobWaitingTime is the pod group annotation key holding the maximum time a job may stay
// Pending under its service level agreement. Once a job has waited longer than this, it should
// be enqueued at once and the cluster should reserve resources for it.
const JobWaitingTime = "sla-waiting-time"
    66  
// TaskID is the UID type for a task; it is declared on top of types.UID.
type TaskID types.UID
    69  
// TransactionContext holds all the fields that are needed by a scheduling transaction.
type TransactionContext struct {
	NodeName string     // node name recorded for the task
	Status   TaskStatus // task status
}
    75  
    76  // Clone returns a clone of TransactionContext
    77  func (ctx *TransactionContext) Clone() *TransactionContext {
    78  	if ctx == nil {
    79  		return nil
    80  	}
    81  	clone := *ctx
    82  	return &clone
    83  }
    84  
// TopologyInfo carries a topology policy together with per-NUMA-node resource lists.
type TopologyInfo struct {
	Policy string
	ResMap map[int]v1.ResourceList // key: numa ID
}
    89  
    90  func (info *TopologyInfo) Clone() *TopologyInfo {
    91  	copyInfo := &TopologyInfo{
    92  		Policy: info.Policy,
    93  		ResMap: make(map[int]v1.ResourceList),
    94  	}
    95  
    96  	for numaID, resList := range info.ResMap {
    97  		copyInfo.ResMap[numaID] = resList.DeepCopy()
    98  	}
    99  
   100  	return copyInfo
   101  }
   102  
// TaskInfo holds all the information the scheduler keeps about a task.
// A TaskInfo is built from a single pod by NewTaskInfo.
type TaskInfo struct {
	// UID is taken from the pod UID; Job is the ID of the job the task belongs to
	// (empty when the pod carries no group-name annotation, see getJobID).
	UID TaskID
	Job JobID

	// Name and Namespace are copied from the pod.
	Name      string
	Namespace string

	// Resreq is the resource that is used while the task is running.
	Resreq *Resource
	// InitResreq is the resource that is used to launch the task.
	InitResreq *Resource

	// TransactionContext is the task's current node name and status.
	TransactionContext
	// LastTransaction holds the context of the last scheduling transaction.
	LastTransaction *TransactionContext

	// Priority defaults to 1 and may be overridden by the pod spec priority or by
	// the TaskPriorityAnnotation annotation (see NewTaskInfo).
	// BestEffort is set by NewTaskInfo when the pod's resource request is empty.
	Priority    int32
	VolumeReady bool
	Preemptable bool
	BestEffort  bool

	// RevocableZone supports setting the volcano.sh/revocable-zone annotation or label for pod/podgroup.
	// Only the empty value or the * value is supported in this version; specifying a revocable zone
	// name is planned for future releases.
	// An empty value means the workload can not use revocable nodes.
	// The * value means the workload can use all revocable nodes during the nodes' active revocable time.
	RevocableZone string

	// NumaInfo may be nil; code in this file checks it before use.
	NumaInfo   *TopologyInfo
	PodVolumes *volumescheduling.PodVolumes
	Pod        *v1.Pod

	// CustomBindErrHandler is a custom callback func called when task bind err.
	CustomBindErrHandler func() error `json:"-"`
	// CustomBindErrHandlerSucceeded indicates whether CustomBindErrHandler is executed successfully.
	CustomBindErrHandlerSucceeded bool
}
   140  
   141  func getJobID(pod *v1.Pod) JobID {
   142  	if gn, found := pod.Annotations[v1beta1.KubeGroupNameAnnotationKey]; found && len(gn) != 0 {
   143  		// Make sure Pod and PodGroup belong to the same namespace.
   144  		jobID := fmt.Sprintf("%s/%s", pod.Namespace, gn)
   145  		return JobID(jobID)
   146  	}
   147  
   148  	return ""
   149  }
   150  
   151  func getTaskID(pod *v1.Pod) TaskID {
   152  	if ts, found := pod.Annotations[batch.TaskSpecKey]; found && len(ts) != 0 {
   153  		return TaskID(ts)
   154  	}
   155  
   156  	return ""
   157  }
   158  
// TaskPriorityAnnotation is the pod annotation key whose value, when it parses as a
// 32-bit integer, overrides the task priority in NewTaskInfo.
const TaskPriorityAnnotation = "volcano.sh/task-priority"
   160  
   161  // NewTaskInfo creates new taskInfo object for a Pod
   162  func NewTaskInfo(pod *v1.Pod) *TaskInfo {
   163  	initResReq := GetPodResourceRequest(pod)
   164  	resReq := initResReq
   165  	bestEffort := initResReq.IsEmpty()
   166  	preemptable := GetPodPreemptable(pod)
   167  	revocableZone := GetPodRevocableZone(pod)
   168  	topologyInfo := GetPodTopologyInfo(pod)
   169  
   170  	jobID := getJobID(pod)
   171  
   172  	ti := &TaskInfo{
   173  		UID:           TaskID(pod.UID),
   174  		Job:           jobID,
   175  		Name:          pod.Name,
   176  		Namespace:     pod.Namespace,
   177  		Priority:      1,
   178  		Pod:           pod,
   179  		Resreq:        resReq,
   180  		InitResreq:    initResReq,
   181  		Preemptable:   preemptable,
   182  		BestEffort:    bestEffort,
   183  		RevocableZone: revocableZone,
   184  		NumaInfo:      topologyInfo,
   185  		TransactionContext: TransactionContext{
   186  			NodeName: pod.Spec.NodeName,
   187  			Status:   getTaskStatus(pod),
   188  		},
   189  	}
   190  
   191  	if pod.Spec.Priority != nil {
   192  		ti.Priority = *pod.Spec.Priority
   193  	}
   194  
   195  	if taskPriority, ok := pod.Annotations[TaskPriorityAnnotation]; ok {
   196  		if priority, err := strconv.ParseInt(taskPriority, 10, 32); err == nil {
   197  			ti.Priority = int32(priority)
   198  		}
   199  	}
   200  
   201  	return ti
   202  }
   203  
   204  // GetTransactionContext get transaction context of a task
   205  func (ti *TaskInfo) GetTransactionContext() TransactionContext {
   206  	return ti.TransactionContext
   207  }
   208  
   209  // GenerateLastTxContext generate and set context of last transaction for a task
   210  func (ti *TaskInfo) GenerateLastTxContext() {
   211  	ctx := ti.GetTransactionContext()
   212  	ti.LastTransaction = &ctx
   213  }
   214  
// ClearLastTxContext clears the context of the last transaction for a task
// by resetting LastTransaction to nil.
func (ti *TaskInfo) ClearLastTxContext() {
	ti.LastTransaction = nil
}
   219  
   220  func (ti *TaskInfo) SetPodResourceDecision() error {
   221  	if ti.NumaInfo == nil || len(ti.NumaInfo.ResMap) == 0 {
   222  		return nil
   223  	}
   224  
   225  	klog.V(4).Infof("%v/%v resource decision: %v", ti.Namespace, ti.Name, ti.NumaInfo.ResMap)
   226  	decision := PodResourceDecision{
   227  		NUMAResources: ti.NumaInfo.ResMap,
   228  	}
   229  
   230  	layout, err := json.Marshal(&decision)
   231  	if err != nil {
   232  		return err
   233  	}
   234  
   235  	metav1.SetMetaDataAnnotation(&ti.Pod.ObjectMeta, topologyDecisionAnnotation, string(layout[:]))
   236  	return nil
   237  }
   238  
// UnsetPodResourceDecision removes the topologyDecisionAnnotation annotation from the task's pod.
// The pod itself is dereferenced without a nil check.
func (ti *TaskInfo) UnsetPodResourceDecision() {
	delete(ti.Pod.Annotations, topologyDecisionAnnotation)
}
   242  
   243  // Clone is used for cloning a task
   244  func (ti *TaskInfo) Clone() *TaskInfo {
   245  	return &TaskInfo{
   246  		UID:           ti.UID,
   247  		Job:           ti.Job,
   248  		Name:          ti.Name,
   249  		Namespace:     ti.Namespace,
   250  		Priority:      ti.Priority,
   251  		PodVolumes:    ti.PodVolumes,
   252  		Pod:           ti.Pod,
   253  		Resreq:        ti.Resreq.Clone(),
   254  		InitResreq:    ti.InitResreq.Clone(),
   255  		VolumeReady:   ti.VolumeReady,
   256  		Preemptable:   ti.Preemptable,
   257  		BestEffort:    ti.BestEffort,
   258  		RevocableZone: ti.RevocableZone,
   259  		NumaInfo:      ti.NumaInfo.Clone(),
   260  		TransactionContext: TransactionContext{
   261  			NodeName: ti.NodeName,
   262  			Status:   ti.Status,
   263  		},
   264  		LastTransaction: ti.LastTransaction.Clone(),
   265  	}
   266  }
   267  
   268  func (ti *TaskInfo) GetTaskSpecKey() TaskID {
   269  	if ti.Pod == nil {
   270  		return ""
   271  	}
   272  	return getTaskID(ti.Pod)
   273  }
   274  
   275  // String returns the taskInfo details in a string
   276  func (ti TaskInfo) String() string {
   277  	res := fmt.Sprintf("Task (%v:%v/%v): job %v, status %v, pri %v, "+
   278  		"resreq %v, preemptable %v, revocableZone %v",
   279  		ti.UID, ti.Namespace, ti.Name, ti.Job, ti.Status, ti.Priority,
   280  		ti.Resreq, ti.Preemptable, ti.RevocableZone)
   281  
   282  	if ti.NumaInfo != nil {
   283  		res += fmt.Sprintf(", numaInfo %v", *ti.NumaInfo)
   284  	}
   285  
   286  	return res
   287  }
   288  
// JobID is the type of JobInfo's ID; it is declared on top of types.UID.
type JobID types.UID

// tasksMap indexes tasks by their TaskID.
type tasksMap map[TaskID]*TaskInfo

// NodeResourceMap stores the resources of nodes as a map from a string key to a Resource.
// NOTE(review): the key is presumably the node name — confirm against callers.
type NodeResourceMap map[string]*Resource
   296  
// JobInfo holds all the information the scheduler keeps about a job:
// its pod group settings, its tasks and its aggregated resource figures.
type JobInfo struct {
	// UID is the job ID; PgUID is the UID of the pod group set by SetPodGroup.
	UID   JobID
	PgUID types.UID

	// Name, Namespace and Queue are copied from the pod group by SetPodGroup.
	Name      string
	Namespace string

	Queue QueueID

	Priority int32

	// MinAvailable is the pod group's Spec.MinMember.
	MinAvailable int32

	// WaitingTime is the SLA waiting time parsed from the pod group annotations;
	// it is nil when no valid value is present.
	WaitingTime *time.Duration

	JobFitErrors   string
	NodesFitErrors map[TaskID]*FitErrors

	// All tasks of the Job.
	// TaskStatusIndex groups the same tasks by status. TaskMinAvailable holds the
	// per-task minimums from the pod group's Spec.MinTaskMember and
	// TaskMinAvailableTotal is their sum.
	TaskStatusIndex       map[TaskStatus]tasksMap
	Tasks                 tasksMap
	TaskMinAvailable      map[TaskID]int32
	TaskMinAvailableTotal int32

	// TotalRequest accumulates the Resreq of every task; Allocated accumulates it
	// only for tasks whose status satisfies AllocatedStatus (see AddTaskInfo).
	Allocated    *Resource
	TotalRequest *Resource

	CreationTimestamp metav1.Time
	PodGroup          *PodGroup

	ScheduleStartTimestamp metav1.Time

	Preemptable bool

	// RevocableZone supports setting the volcano.sh/revocable-zone annotation or label for pod/podgroup.
	// Only the empty value or the * value is supported in this version; specifying a revocable zone
	// name is planned for a future release.
	// An empty value means the workload can not use revocable nodes.
	// The * value means the workload can use all revocable nodes during the nodes' active revocable time.
	RevocableZone string
	Budget        *DisruptionBudget
}
   339  
   340  // NewJobInfo creates a new jobInfo for set of tasks
   341  func NewJobInfo(uid JobID, tasks ...*TaskInfo) *JobInfo {
   342  	job := &JobInfo{
   343  		UID:              uid,
   344  		MinAvailable:     0,
   345  		NodesFitErrors:   make(map[TaskID]*FitErrors),
   346  		Allocated:        EmptyResource(),
   347  		TotalRequest:     EmptyResource(),
   348  		TaskStatusIndex:  map[TaskStatus]tasksMap{},
   349  		Tasks:            tasksMap{},
   350  		TaskMinAvailable: map[TaskID]int32{},
   351  	}
   352  
   353  	for _, task := range tasks {
   354  		job.AddTaskInfo(task)
   355  	}
   356  
   357  	return job
   358  }
   359  
// UnsetPodGroup removes the podGroup from a job by setting PodGroup to nil.
// Fields that SetPodGroup derived from the pod group are left unchanged.
func (ji *JobInfo) UnsetPodGroup() {
	ji.PodGroup = nil
}
   364  
   365  // SetPodGroup sets podGroup details to a job
   366  func (ji *JobInfo) SetPodGroup(pg *PodGroup) {
   367  	ji.Name = pg.Name
   368  	ji.Namespace = pg.Namespace
   369  	ji.MinAvailable = pg.Spec.MinMember
   370  	ji.Queue = QueueID(pg.Spec.Queue)
   371  	ji.CreationTimestamp = pg.GetCreationTimestamp()
   372  
   373  	var err error
   374  	ji.WaitingTime, err = ji.extractWaitingTime(pg, v1beta1.JobWaitingTime)
   375  	if err != nil {
   376  		klog.Warningf("Error occurs in parsing waiting time for job <%s/%s>, err: %s.",
   377  			pg.Namespace, pg.Name, err.Error())
   378  		ji.WaitingTime = nil
   379  	}
   380  	if ji.WaitingTime == nil {
   381  		ji.WaitingTime, err = ji.extractWaitingTime(pg, JobWaitingTime)
   382  		if err != nil {
   383  			klog.Warningf("Error occurs in parsing waiting time for job <%s/%s>, err: %s.",
   384  				pg.Namespace, pg.Name, err.Error())
   385  			ji.WaitingTime = nil
   386  		}
   387  	}
   388  
   389  	ji.Preemptable = ji.extractPreemptable(pg)
   390  	ji.RevocableZone = ji.extractRevocableZone(pg)
   391  	ji.Budget = ji.extractBudget(pg)
   392  
   393  	taskMinAvailableTotal := int32(0)
   394  	for task, member := range pg.Spec.MinTaskMember {
   395  		ji.TaskMinAvailable[TaskID(task)] = member
   396  		taskMinAvailableTotal += member
   397  	}
   398  	ji.TaskMinAvailableTotal = taskMinAvailableTotal
   399  
   400  	ji.PgUID = pg.UID
   401  	ji.PodGroup = pg
   402  }
   403  
   404  // extractWaitingTime reads sla waiting time for job from podgroup annotations
   405  // TODO: should also read from given field in volcano job spec
   406  func (ji *JobInfo) extractWaitingTime(pg *PodGroup, waitingTimeKey string) (*time.Duration, error) {
   407  	if _, exist := pg.Annotations[waitingTimeKey]; !exist {
   408  		return nil, nil
   409  	}
   410  
   411  	jobWaitingTime, err := time.ParseDuration(pg.Annotations[waitingTimeKey])
   412  	if err != nil {
   413  		return nil, err
   414  	}
   415  
   416  	if jobWaitingTime <= 0 {
   417  		return nil, errors.New("invalid sla waiting time")
   418  	}
   419  
   420  	return &jobWaitingTime, nil
   421  }
   422  
   423  // extractPreemptable return volcano.sh/preemptable value for job
   424  func (ji *JobInfo) extractPreemptable(pg *PodGroup) bool {
   425  	// check annotaion first
   426  	if len(pg.Annotations) > 0 {
   427  		if value, found := pg.Annotations[v1beta1.PodPreemptable]; found {
   428  			b, err := strconv.ParseBool(value)
   429  			if err != nil {
   430  				klog.Warningf("invalid %s=%s", v1beta1.PodPreemptable, value)
   431  				return false
   432  			}
   433  			return b
   434  		}
   435  	}
   436  
   437  	// it annotation does not exit, check label
   438  	if len(pg.Labels) > 0 {
   439  		if value, found := pg.Labels[v1beta1.PodPreemptable]; found {
   440  			b, err := strconv.ParseBool(value)
   441  			if err != nil {
   442  				klog.Warningf("invalid %s=%s", v1beta1.PodPreemptable, value)
   443  				return false
   444  			}
   445  			return b
   446  		}
   447  	}
   448  
   449  	return false
   450  }
   451  
   452  // extractRevocableZone return volcano.sh/revocable-zone value for pod/podgroup
   453  func (ji *JobInfo) extractRevocableZone(pg *PodGroup) string {
   454  	// check annotation first
   455  	if len(pg.Annotations) > 0 {
   456  		if value, found := pg.Annotations[v1beta1.RevocableZone]; found {
   457  			if value != "*" {
   458  				return ""
   459  			}
   460  			return value
   461  		}
   462  
   463  		if value, found := pg.Annotations[v1beta1.PodPreemptable]; found {
   464  			if b, err := strconv.ParseBool(value); err == nil && b {
   465  				return "*"
   466  			}
   467  		}
   468  	}
   469  
   470  	return ""
   471  }
   472  
   473  // extractBudget return budget value for job
   474  func (ji *JobInfo) extractBudget(pg *PodGroup) *DisruptionBudget {
   475  	if len(pg.Annotations) > 0 {
   476  		if value, found := pg.Annotations[v1beta1.JDBMinAvailable]; found {
   477  			return NewDisruptionBudget(value, "")
   478  		} else if value, found := pg.Annotations[v1beta1.JDBMaxUnavailable]; found {
   479  			return NewDisruptionBudget("", value)
   480  		}
   481  	}
   482  
   483  	return NewDisruptionBudget("", "")
   484  }
   485  
   486  // GetMinResources return the min resources of podgroup.
   487  func (ji *JobInfo) GetMinResources() *Resource {
   488  	if ji.PodGroup.Spec.MinResources == nil {
   489  		return EmptyResource()
   490  	}
   491  
   492  	return NewResource(*ji.PodGroup.Spec.MinResources)
   493  }
   494  
   495  func (ji *JobInfo) GetElasticResources() *Resource {
   496  	minResource := ji.GetMinResources()
   497  	if ji.Allocated.LessEqualPartly(minResource, Zero) {
   498  		return EmptyResource()
   499  	}
   500  	return ji.Allocated.Clone().Sub(minResource)
   501  }
   502  
   503  func (ji *JobInfo) addTaskIndex(ti *TaskInfo) {
   504  	if _, found := ji.TaskStatusIndex[ti.Status]; !found {
   505  		ji.TaskStatusIndex[ti.Status] = tasksMap{}
   506  	}
   507  	ji.TaskStatusIndex[ti.Status][ti.UID] = ti
   508  }
   509  
   510  // AddTaskInfo is used to add a task to a job
   511  func (ji *JobInfo) AddTaskInfo(ti *TaskInfo) {
   512  	ji.Tasks[ti.UID] = ti
   513  	ji.addTaskIndex(ti)
   514  	ji.TotalRequest.Add(ti.Resreq)
   515  	if AllocatedStatus(ti.Status) {
   516  		ji.Allocated.Add(ti.Resreq)
   517  	}
   518  }
   519  
   520  // UpdateTaskStatus is used to update task's status in a job.
   521  // If error occurs both task and job are guaranteed to be in the original state.
   522  func (ji *JobInfo) UpdateTaskStatus(task *TaskInfo, status TaskStatus) error {
   523  	if err := validateStatusUpdate(task.Status, status); err != nil {
   524  		return err
   525  	}
   526  
   527  	// First remove the task (if exist) from the task list.
   528  	if _, found := ji.Tasks[task.UID]; found {
   529  		if err := ji.DeleteTaskInfo(task); err != nil {
   530  			return err
   531  		}
   532  	}
   533  
   534  	// Update task's status to the target status once task addition is guaranteed to succeed.
   535  	task.Status = status
   536  	ji.AddTaskInfo(task)
   537  
   538  	return nil
   539  }
   540  
   541  func (ji *JobInfo) deleteTaskIndex(ti *TaskInfo) {
   542  	if tasks, found := ji.TaskStatusIndex[ti.Status]; found {
   543  		delete(tasks, ti.UID)
   544  
   545  		if len(tasks) == 0 {
   546  			delete(ji.TaskStatusIndex, ti.Status)
   547  		}
   548  	}
   549  }
   550  
   551  // DeleteTaskInfo is used to delete a task from a job
   552  func (ji *JobInfo) DeleteTaskInfo(ti *TaskInfo) error {
   553  	if task, found := ji.Tasks[ti.UID]; found {
   554  		ji.TotalRequest.Sub(task.Resreq)
   555  		if AllocatedStatus(task.Status) {
   556  			ji.Allocated.Sub(task.Resreq)
   557  		}
   558  		delete(ji.Tasks, task.UID)
   559  		ji.deleteTaskIndex(task)
   560  		return nil
   561  	}
   562  
   563  	klog.Warningf("failed to find task <%v/%v> in job <%v/%v>", ti.Namespace, ti.Name, ji.Namespace, ji.Name)
   564  	return nil
   565  }
   566  
   567  // Clone is used to clone a jobInfo object
   568  func (ji *JobInfo) Clone() *JobInfo {
   569  	info := &JobInfo{
   570  		UID:       ji.UID,
   571  		Name:      ji.Name,
   572  		Namespace: ji.Namespace,
   573  		Queue:     ji.Queue,
   574  		Priority:  ji.Priority,
   575  
   576  		MinAvailable:   ji.MinAvailable,
   577  		WaitingTime:    ji.WaitingTime,
   578  		JobFitErrors:   ji.JobFitErrors,
   579  		NodesFitErrors: make(map[TaskID]*FitErrors),
   580  		Allocated:      EmptyResource(),
   581  		TotalRequest:   EmptyResource(),
   582  
   583  		PodGroup: ji.PodGroup.Clone(),
   584  
   585  		TaskStatusIndex:       map[TaskStatus]tasksMap{},
   586  		TaskMinAvailable:      make(map[TaskID]int32, len(ji.TaskMinAvailable)),
   587  		TaskMinAvailableTotal: ji.TaskMinAvailableTotal,
   588  		Tasks:                 tasksMap{},
   589  		Preemptable:           ji.Preemptable,
   590  		RevocableZone:         ji.RevocableZone,
   591  		Budget:                ji.Budget.Clone(),
   592  	}
   593  
   594  	ji.CreationTimestamp.DeepCopyInto(&info.CreationTimestamp)
   595  
   596  	for task, minAvailable := range ji.TaskMinAvailable {
   597  		info.TaskMinAvailable[task] = minAvailable
   598  	}
   599  	for _, task := range ji.Tasks {
   600  		info.AddTaskInfo(task.Clone())
   601  	}
   602  
   603  	return info
   604  }
   605  
   606  // String returns a jobInfo object in string format
   607  func (ji JobInfo) String() string {
   608  	res := ""
   609  
   610  	i := 0
   611  	for _, task := range ji.Tasks {
   612  		res += fmt.Sprintf("\n\t %d: %v", i, task)
   613  		i++
   614  	}
   615  
   616  	return fmt.Sprintf("Job (%v): namespace %v (%v), name %v, minAvailable %d, podGroup %+v, preemptable %+v, revocableZone %+v, minAvailable %+v, maxAvailable %+v",
   617  		ji.UID, ji.Namespace, ji.Queue, ji.Name, ji.MinAvailable, ji.PodGroup, ji.Preemptable, ji.RevocableZone, ji.Budget.MinAvailable, ji.Budget.MaxUnavilable) + res
   618  }
   619  
   620  // FitError returns detailed information on why a job's task failed to fit on
   621  // each available node
   622  func (ji *JobInfo) FitError() string {
   623  	sortReasonsHistogram := func(reasons map[string]int) []string {
   624  		reasonStrings := []string{}
   625  		for k, v := range reasons {
   626  			reasonStrings = append(reasonStrings, fmt.Sprintf("%v %v", v, k))
   627  		}
   628  		sort.Strings(reasonStrings)
   629  		return reasonStrings
   630  	}
   631  
   632  	// Stat histogram for all tasks of the job
   633  	reasons := make(map[string]int)
   634  	for status, taskMap := range ji.TaskStatusIndex {
   635  		reasons[status.String()] += len(taskMap)
   636  	}
   637  	reasons["minAvailable"] = int(ji.MinAvailable)
   638  	reasonMsg := fmt.Sprintf("%v, %v", scheduling.PodGroupNotReady, strings.Join(sortReasonsHistogram(reasons), ", "))
   639  
   640  	// Stat histogram for pending tasks only
   641  	reasons = make(map[string]int)
   642  	for uid := range ji.TaskStatusIndex[Pending] {
   643  		reason, _ := ji.TaskSchedulingReason(uid)
   644  		reasons[reason]++
   645  	}
   646  	if len(reasons) > 0 {
   647  		reasonMsg += "; " + fmt.Sprintf("%s: %s", Pending.String(), strings.Join(sortReasonsHistogram(reasons), ", "))
   648  	}
   649  	return reasonMsg
   650  }
   651  
   652  // TaskSchedulingReason get detailed reason and message of the given task
   653  // It returns detailed reason and message for tasks based on last scheduling transaction.
   654  func (ji *JobInfo) TaskSchedulingReason(tid TaskID) (reason string, msg string) {
   655  	taskInfo, exists := ji.Tasks[tid]
   656  	if !exists {
   657  		return "", ""
   658  	}
   659  
   660  	// Get detailed scheduling reason based on LastTransaction
   661  	ctx := taskInfo.GetTransactionContext()
   662  	if taskInfo.LastTransaction != nil {
   663  		ctx = *taskInfo.LastTransaction
   664  	}
   665  
   666  	msg = ji.JobFitErrors
   667  	switch status := ctx.Status; status {
   668  	case Allocated:
   669  		// Pod is schedulable
   670  		msg = fmt.Sprintf("Pod %s/%s can possibly be assigned to %s", taskInfo.Namespace, taskInfo.Name, ctx.NodeName)
   671  		return PodReasonSchedulable, msg
   672  	case Pipelined:
   673  		msg = fmt.Sprintf("Pod %s/%s can possibly be assigned to %s, once resource is released", taskInfo.Namespace, taskInfo.Name, ctx.NodeName)
   674  		return PodReasonUnschedulable, msg
   675  	case Pending:
   676  		if fe := ji.NodesFitErrors[tid]; fe != nil {
   677  			// Pod is unschedulable
   678  			return PodReasonUnschedulable, fe.Error()
   679  		}
   680  		// Pod is not scheduled yet, keep UNSCHEDULABLE as the reason to support cluster autoscaler
   681  		return PodReasonUnschedulable, msg
   682  	default:
   683  		return status.String(), msg
   684  	}
   685  }
   686  
   687  // ReadyTaskNum returns the number of tasks that are ready or that is best-effort.
   688  func (ji *JobInfo) ReadyTaskNum() int32 {
   689  	occupied := 0
   690  	occupied += len(ji.TaskStatusIndex[Bound])
   691  	occupied += len(ji.TaskStatusIndex[Binding])
   692  	occupied += len(ji.TaskStatusIndex[Running])
   693  	occupied += len(ji.TaskStatusIndex[Allocated])
   694  	occupied += len(ji.TaskStatusIndex[Succeeded])
   695  
   696  	return int32(occupied)
   697  }
   698  
   699  // WaitingTaskNum returns the number of tasks that are pipelined.
   700  func (ji *JobInfo) WaitingTaskNum() int32 {
   701  	return int32(len(ji.TaskStatusIndex[Pipelined]))
   702  }
   703  
   704  func (ji *JobInfo) PendingBestEffortTaskNum() int32 {
   705  	count := 0
   706  	for _, task := range ji.TaskStatusIndex[Pending] {
   707  		if task.BestEffort {
   708  			count++
   709  		}
   710  	}
   711  	return int32(count)
   712  }
   713  
   714  // CheckTaskValid returns whether each task of job is valid.
   715  func (ji *JobInfo) CheckTaskValid() bool {
   716  	// if job minAvailable is less than sum of(task minAvailable), skip this check
   717  	if ji.MinAvailable < ji.TaskMinAvailableTotal {
   718  		return true
   719  	}
   720  
   721  	actual := map[TaskID]int32{}
   722  	for status, tasks := range ji.TaskStatusIndex {
   723  		if AllocatedStatus(status) ||
   724  			status == Succeeded ||
   725  			status == Pipelined ||
   726  			status == Pending {
   727  			for _, task := range tasks {
   728  				actual[getTaskID(task.Pod)]++
   729  			}
   730  		}
   731  	}
   732  
   733  	klog.V(4).Infof("job %s/%s actual: %+v, ji.TaskMinAvailable: %+v", ji.Name, ji.Namespace, actual, ji.TaskMinAvailable)
   734  	for task, minAvailable := range ji.TaskMinAvailable {
   735  		if minAvailable == 0 {
   736  			continue
   737  		}
   738  		if act, ok := actual[task]; !ok || act < minAvailable {
   739  			return false
   740  		}
   741  	}
   742  
   743  	return true
   744  }
   745  
   746  // CheckTaskReady return whether each task of job is ready.
   747  func (ji *JobInfo) CheckTaskReady() bool {
   748  	if ji.MinAvailable < ji.TaskMinAvailableTotal {
   749  		return true
   750  	}
   751  	occupiedMap := map[TaskID]int32{}
   752  	for status, tasks := range ji.TaskStatusIndex {
   753  		if AllocatedStatus(status) ||
   754  			status == Succeeded {
   755  			for _, task := range tasks {
   756  				occupiedMap[getTaskID(task.Pod)]++
   757  			}
   758  			continue
   759  		}
   760  
   761  		if status == Pending {
   762  			for _, task := range tasks {
   763  				if task.InitResreq.IsEmpty() {
   764  					occupiedMap[getTaskID(task.Pod)]++
   765  				}
   766  			}
   767  		}
   768  	}
   769  	for taskID, minNum := range ji.TaskMinAvailable {
   770  		if occupiedMap[taskID] < minNum {
   771  			klog.V(4).Infof("Job %s/%s Task %s occupied %v less than task min avaliable", ji.Namespace, ji.Name, taskID, occupiedMap[taskID])
   772  			return false
   773  		}
   774  	}
   775  	return true
   776  }
   777  
   778  // CheckTaskPipelined return whether each task of job is pipelined.
   779  func (ji *JobInfo) CheckTaskPipelined() bool {
   780  	if ji.MinAvailable < ji.TaskMinAvailableTotal {
   781  		return true
   782  	}
   783  	occupiedMap := map[TaskID]int32{}
   784  	for status, tasks := range ji.TaskStatusIndex {
   785  		if AllocatedStatus(status) ||
   786  			status == Succeeded ||
   787  			status == Pipelined {
   788  			for _, task := range tasks {
   789  				occupiedMap[getTaskID(task.Pod)]++
   790  			}
   791  			continue
   792  		}
   793  
   794  		if status == Pending {
   795  			for _, task := range tasks {
   796  				if task.InitResreq.IsEmpty() {
   797  					occupiedMap[getTaskID(task.Pod)]++
   798  				}
   799  			}
   800  		}
   801  	}
   802  	for taskID, minNum := range ji.TaskMinAvailable {
   803  		if occupiedMap[taskID] < minNum {
   804  			klog.V(4).Infof("Job %s/%s Task %s occupied %v less than task min avaliable", ji.Namespace, ji.Name, taskID, occupiedMap[taskID])
   805  			return false
   806  		}
   807  	}
   808  	return true
   809  }
   810  
   811  // CheckTaskStarving return whether job has at least one task which is starving.
   812  func (ji *JobInfo) CheckTaskStarving() bool {
   813  	if ji.MinAvailable < ji.TaskMinAvailableTotal {
   814  		return true
   815  	}
   816  	occupiedMap := map[TaskID]int32{}
   817  	for status, tasks := range ji.TaskStatusIndex {
   818  		if AllocatedStatus(status) ||
   819  			status == Succeeded ||
   820  			status == Pipelined {
   821  			for _, task := range tasks {
   822  				occupiedMap[getTaskID(task.Pod)]++
   823  			}
   824  			continue
   825  		}
   826  	}
   827  	for taskID, minNum := range ji.TaskMinAvailable {
   828  		if occupiedMap[taskID] < minNum {
   829  			klog.V(4).Infof("Job %s/%s Task %s occupied %v less than task min available", ji.Namespace, ji.Name, taskID, occupiedMap[taskID])
   830  			return true
   831  		}
   832  	}
   833  	return false
   834  }
   835  
   836  // ValidTaskNum returns the number of tasks that are valid.
   837  func (ji *JobInfo) ValidTaskNum() int32 {
   838  	occupied := 0
   839  	for status, tasks := range ji.TaskStatusIndex {
   840  		if AllocatedStatus(status) ||
   841  			status == Succeeded ||
   842  			status == Pipelined ||
   843  			status == Pending {
   844  			occupied += len(tasks)
   845  		}
   846  	}
   847  
   848  	return int32(occupied)
   849  }
   850  
   851  func (ji *JobInfo) IsReady() bool {
   852  	return ji.ReadyTaskNum()+ji.PendingBestEffortTaskNum() >= ji.MinAvailable
   853  }
   854  
   855  func (ji *JobInfo) IsPipelined() bool {
   856  	return ji.WaitingTaskNum()+ji.ReadyTaskNum()+ji.PendingBestEffortTaskNum() >= ji.MinAvailable
   857  }
   858  
   859  func (ji *JobInfo) IsStarving() bool {
   860  	return ji.WaitingTaskNum()+ji.ReadyTaskNum() < ji.MinAvailable
   861  }
   862  
   863  // IsPending returns whether job is in pending status
   864  func (ji *JobInfo) IsPending() bool {
   865  	return ji.PodGroup == nil ||
   866  		ji.PodGroup.Status.Phase == scheduling.PodGroupPending ||
   867  		ji.PodGroup.Status.Phase == ""
   868  }
   869  
   870  // HasPendingTasks return whether job has pending tasks
   871  func (ji *JobInfo) HasPendingTasks() bool {
   872  	return len(ji.TaskStatusIndex[Pending]) != 0
   873  }