volcano.sh/volcano@v1.9.0/pkg/scheduler/actions/allocate/allocate.go (about)

     1  /*
     2   Copyright 2021 The Volcano Authors.
     3  
     4   Licensed under the Apache License, Version 2.0 (the "License");
     5   you may not use this file except in compliance with the License.
     6   You may obtain a copy of the License at
     7  
     8       http://www.apache.org/licenses/LICENSE-2.0
     9  
    10   Unless required by applicable law or agreed to in writing, software
    11   distributed under the License is distributed on an "AS IS" BASIS,
    12   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   See the License for the specific language governing permissions and
    14   limitations under the License.
    15  */
    16  
    17  package allocate
    18  
    19  import (
    20  	"time"
    21  
    22  	"k8s.io/klog/v2"
    23  
    24  	"volcano.sh/apis/pkg/apis/scheduling"
    25  	"volcano.sh/volcano/pkg/scheduler/api"
    26  	"volcano.sh/volcano/pkg/scheduler/conf"
    27  	"volcano.sh/volcano/pkg/scheduler/framework"
    28  	"volcano.sh/volcano/pkg/scheduler/metrics"
    29  	"volcano.sh/volcano/pkg/scheduler/util"
    30  )
    31  
    32  type Action struct {
    33  	session *framework.Session
    34  }
    35  
    36  func New() *Action {
    37  	return &Action{}
    38  }
    39  
    40  func (alloc *Action) Name() string {
    41  	return "allocate"
    42  }
    43  
    44  func (alloc *Action) Initialize() {}
    45  
    46  func (alloc *Action) Execute(ssn *framework.Session) {
    47  	klog.V(5).Infof("Enter Allocate ...")
    48  	defer klog.V(5).Infof("Leaving Allocate ...")
    49  
    50  	// the allocation for pod may have many stages
    51  	// 1. pick a queue named Q (using ssn.QueueOrderFn)
    52  	// 2. pick a job named J from Q (using ssn.JobOrderFn)
    53  	// 3. pick a task T from J (using ssn.TaskOrderFn)
    54  	// 4. use predicateFn to filter out node that T can not be allocated on.
    55  	// 5. use ssn.NodeOrderFn to judge the best node and assign it to T
    56  
    57  	// queues sort queues by QueueOrderFn.
    58  	queues := util.NewPriorityQueue(ssn.QueueOrderFn)
    59  	// jobsMap is used to find job with the highest priority in given queue.
    60  	jobsMap := map[api.QueueID]*util.PriorityQueue{}
    61  
    62  	alloc.session = ssn
    63  	alloc.pickUpQueuesAndJobs(queues, jobsMap)
    64  	klog.V(3).Infof("Try to allocate resource to %d Queues", len(jobsMap))
    65  	alloc.allocateResources(queues, jobsMap)
    66  }
    67  
    68  func (alloc *Action) pickUpQueuesAndJobs(queues *util.PriorityQueue, jobsMap map[api.QueueID]*util.PriorityQueue) {
    69  	ssn := alloc.session
    70  	for _, job := range ssn.Jobs {
    71  		// If not config enqueue action, change Pending pg into Inqueue statue to avoid blocking job scheduling.
    72  		if conf.EnabledActionMap["enqueue"] {
    73  			if job.IsPending() {
    74  				klog.V(4).Infof("Job <%s/%s> Queue <%s> skip allocate, reason: job status is pending.",
    75  					job.Namespace, job.Name, job.Queue)
    76  				continue
    77  			}
    78  		} else if job.IsPending() {
    79  			klog.V(4).Infof("Job <%s/%s> Queue <%s> status update from pending to inqueue, reason: no enqueue action is configured.",
    80  				job.Namespace, job.Name, job.Queue)
    81  			job.PodGroup.Status.Phase = scheduling.PodGroupInqueue
    82  		}
    83  
    84  		if vr := ssn.JobValid(job); vr != nil && !vr.Pass {
    85  			klog.V(4).Infof("Job <%s/%s> Queue <%s> skip allocate, reason: %v, message %v", job.Namespace, job.Name, job.Queue, vr.Reason, vr.Message)
    86  			continue
    87  		}
    88  
    89  		if _, found := ssn.Queues[job.Queue]; !found {
    90  			klog.Warningf("Skip adding Job <%s/%s> because its queue %s is not found",
    91  				job.Namespace, job.Name, job.Queue)
    92  			continue
    93  		}
    94  
    95  		if _, found := jobsMap[job.Queue]; !found {
    96  			jobsMap[job.Queue] = util.NewPriorityQueue(ssn.JobOrderFn)
    97  			queues.Push(ssn.Queues[job.Queue])
    98  		}
    99  
   100  		klog.V(4).Infof("Added Job <%s/%s> into Queue <%s>", job.Namespace, job.Name, job.Queue)
   101  		jobsMap[job.Queue].Push(job)
   102  	}
   103  }
   104  
   105  // allocateResources primarily accomplishes two steps:
   106  // 1. picks up tasks.
   107  // 2. allocates resources to these tasks. (this step is carried out by the allocateResourcesForTasks method.)
   108  func (alloc *Action) allocateResources(queues *util.PriorityQueue, jobsMap map[api.QueueID]*util.PriorityQueue) {
   109  	ssn := alloc.session
   110  	pendingTasks := map[api.JobID]*util.PriorityQueue{}
   111  
   112  	allNodes := ssn.NodeList
   113  
   114  	// To pick <namespace, queue> tuple for job, we choose to pick namespace firstly.
   115  	// Because we believe that number of queues would less than namespaces in most case.
   116  	// And, this action would make the resource usage among namespace balanced.
   117  	for {
   118  		if queues.Empty() {
   119  			break
   120  		}
   121  
   122  		queue := queues.Pop().(*api.QueueInfo)
   123  
   124  		if ssn.Overused(queue) {
   125  			klog.V(3).Infof("Queue <%s> is overused, ignore it.", queue.Name)
   126  			continue
   127  		}
   128  
   129  		klog.V(3).Infof("Try to allocate resource to Jobs in Queue <%s>", queue.Name)
   130  
   131  		jobs, found := jobsMap[queue.UID]
   132  		if !found || jobs.Empty() {
   133  			klog.V(4).Infof("Can not find jobs for queue %s.", queue.Name)
   134  			continue
   135  		}
   136  
   137  		job := jobs.Pop().(*api.JobInfo)
   138  		if _, found = pendingTasks[job.UID]; !found {
   139  			tasks := util.NewPriorityQueue(ssn.TaskOrderFn)
   140  			for _, task := range job.TaskStatusIndex[api.Pending] {
   141  				// Skip BestEffort task in 'allocate' action.
   142  				if task.Resreq.IsEmpty() {
   143  					klog.V(4).Infof("Task <%v/%v> is BestEffort task, skip it.",
   144  						task.Namespace, task.Name)
   145  					continue
   146  				}
   147  
   148  				tasks.Push(task)
   149  			}
   150  			pendingTasks[job.UID] = tasks
   151  		}
   152  		tasks := pendingTasks[job.UID]
   153  
   154  		// Added Queue back until no job in Namespace.
   155  		queues.Push(queue)
   156  
   157  		if tasks.Empty() {
   158  			continue
   159  		}
   160  
   161  		klog.V(3).Infof("Try to allocate resource to %d tasks of Job <%v/%v>",
   162  			tasks.Len(), job.Namespace, job.Name)
   163  
   164  		alloc.allocateResourcesForTasks(tasks, job, jobs, queue, allNodes)
   165  	}
   166  }
   167  
   168  func (alloc *Action) allocateResourcesForTasks(tasks *util.PriorityQueue, job *api.JobInfo, jobs *util.PriorityQueue, queue *api.QueueInfo, allNodes []*api.NodeInfo) {
   169  	ssn := alloc.session
   170  	stmt := framework.NewStatement(ssn)
   171  	ph := util.NewPredicateHelper()
   172  
   173  	for !tasks.Empty() {
   174  		task := tasks.Pop().(*api.TaskInfo)
   175  
   176  		if !ssn.Allocatable(queue, task) {
   177  			klog.V(3).Infof("Queue <%s> is overused when considering task <%s>, ignore it.", queue.Name, task.Name)
   178  			continue
   179  		}
   180  
   181  		klog.V(3).Infof("There are <%d> nodes for Job <%v/%v>", len(ssn.Nodes), job.Namespace, job.Name)
   182  
   183  		if err := ssn.PrePredicateFn(task); err != nil {
   184  			klog.V(3).Infof("PrePredicate for task %s/%s failed for: %v", task.Namespace, task.Name, err)
   185  			fitErrors := api.NewFitErrors()
   186  			for _, ni := range allNodes {
   187  				fitErrors.SetNodeError(ni.Name, err)
   188  			}
   189  			job.NodesFitErrors[task.UID] = fitErrors
   190  			break
   191  		}
   192  
   193  		predicateNodes, fitErrors := ph.PredicateNodes(task, allNodes, alloc.predicate, true)
   194  		if len(predicateNodes) == 0 {
   195  			job.NodesFitErrors[task.UID] = fitErrors
   196  			break
   197  		}
   198  
   199  		// Candidate nodes are divided into two gradients:
   200  		// - the first gradient node: a list of free nodes that satisfy the task resource request;
   201  		// - The second gradient node: the node list whose sum of node idle resources and future idle meets the task resource request;
   202  		// Score the first gradient node first. If the first gradient node meets the requirements, ignore the second gradient node list,
   203  		// otherwise, score the second gradient node and select the appropriate node.
   204  		var candidateNodes [][]*api.NodeInfo
   205  		var idleCandidateNodes []*api.NodeInfo
   206  		var futureIdleCandidateNodes []*api.NodeInfo
   207  		for _, n := range predicateNodes {
   208  			if task.InitResreq.LessEqual(n.Idle, api.Zero) {
   209  				idleCandidateNodes = append(idleCandidateNodes, n)
   210  			} else if task.InitResreq.LessEqual(n.FutureIdle(), api.Zero) {
   211  				futureIdleCandidateNodes = append(futureIdleCandidateNodes, n)
   212  			} else {
   213  				klog.V(5).Infof("Predicate filtered node %v, idle: %v and future idle: %v do not meet the requirements of task: %v",
   214  					n.Name, n.Idle, n.FutureIdle(), task.Name)
   215  			}
   216  		}
   217  		candidateNodes = append(candidateNodes, idleCandidateNodes)
   218  		candidateNodes = append(candidateNodes, futureIdleCandidateNodes)
   219  
   220  		var bestNode *api.NodeInfo
   221  		for index, nodes := range candidateNodes {
   222  			if klog.V(5).Enabled() {
   223  				for _, node := range nodes {
   224  					klog.V(5).Infof("node %v, idle: %v, future idle: %v", node.Name, node.Idle, node.FutureIdle())
   225  				}
   226  			}
   227  			switch {
   228  			case len(nodes) == 0:
   229  				klog.V(5).Infof("Task: %v, no matching node is found in the candidateNodes(index: %d) list.", task.Name, index)
   230  			case len(nodes) == 1: // If only one node after predicate, just use it.
   231  				bestNode = nodes[0]
   232  			case len(nodes) > 1: // If more than one node after predicate, using "the best" one
   233  				nodeScores := util.PrioritizeNodes(task, nodes, ssn.BatchNodeOrderFn, ssn.NodeOrderMapFn, ssn.NodeOrderReduceFn)
   234  
   235  				bestNode = ssn.BestNodeFn(task, nodeScores)
   236  				if bestNode == nil {
   237  					bestNode = util.SelectBestNode(nodeScores)
   238  				}
   239  			}
   240  
   241  			// If a proper node is found in idleCandidateNodes, skip futureIdleCandidateNodes and directly return the node information.
   242  			if bestNode != nil {
   243  				break
   244  			}
   245  		}
   246  
   247  		// Allocate idle resource to the task.
   248  		if task.InitResreq.LessEqual(bestNode.Idle, api.Zero) {
   249  			klog.V(3).Infof("Binding Task <%v/%v> to node <%v>",
   250  				task.Namespace, task.Name, bestNode.Name)
   251  			if err := stmt.Allocate(task, bestNode); err != nil {
   252  				klog.Errorf("Failed to bind Task %v on %v in Session %v, err: %v",
   253  					task.UID, bestNode.Name, ssn.UID, err)
   254  			} else {
   255  				metrics.UpdateE2eSchedulingDurationByJob(job.Name, string(job.Queue), job.Namespace, metrics.Duration(job.CreationTimestamp.Time))
   256  				metrics.UpdateE2eSchedulingLastTimeByJob(job.Name, string(job.Queue), job.Namespace, time.Now())
   257  			}
   258  		} else {
   259  			klog.V(3).Infof("Predicates failed in allocate for task <%s/%s> on node <%s> with limited resources",
   260  				task.Namespace, task.Name, bestNode.Name)
   261  
   262  			// Allocate releasing resource to the task if any.
   263  			if task.InitResreq.LessEqual(bestNode.FutureIdle(), api.Zero) {
   264  				klog.V(3).Infof("Pipelining Task <%v/%v> to node <%v> for <%v> on <%v>",
   265  					task.Namespace, task.Name, bestNode.Name, task.InitResreq, bestNode.Releasing)
   266  				if err := stmt.Pipeline(task, bestNode.Name); err != nil {
   267  					klog.Errorf("Failed to pipeline Task %v on %v in Session %v for %v.",
   268  						task.UID, bestNode.Name, ssn.UID, err)
   269  				} else {
   270  					metrics.UpdateE2eSchedulingDurationByJob(job.Name, string(job.Queue), job.Namespace, metrics.Duration(job.CreationTimestamp.Time))
   271  					metrics.UpdateE2eSchedulingLastTimeByJob(job.Name, string(job.Queue), job.Namespace, time.Now())
   272  				}
   273  			}
   274  		}
   275  
   276  		if ssn.JobReady(job) && !tasks.Empty() {
   277  			jobs.Push(job)
   278  			break
   279  		}
   280  	}
   281  
   282  	if ssn.JobReady(job) {
   283  		stmt.Commit()
   284  	} else {
   285  		if !ssn.JobPipelined(job) {
   286  			stmt.Discard()
   287  		}
   288  	}
   289  }
   290  
   291  func (alloc *Action) predicate(task *api.TaskInfo, node *api.NodeInfo) ([]*api.Status, error) {
   292  	// Check for Resource Predicate
   293  	if ok, resources := task.InitResreq.LessEqualWithResourcesName(node.FutureIdle(), api.Zero); !ok {
   294  		return nil, api.NewFitError(task, node, api.WrapInsufficientResourceReason(resources))
   295  	}
   296  	var statusSets util.StatusSets
   297  	statusSets, err := alloc.session.PredicateFn(task, node)
   298  	if err != nil {
   299  		return nil, api.NewFitError(task, node, err.Error())
   300  	}
   301  
   302  	if statusSets.ContainsUnschedulable() || statusSets.ContainsUnschedulableAndUnresolvable() ||
   303  		statusSets.ContainsErrorSkipOrWait() {
   304  		return nil, api.NewFitError(task, node, statusSets.Message())
   305  	}
   306  	return nil, nil
   307  }
   308  
   309  func (alloc *Action) UnInitialize() {}