volcano.sh/volcano@v1.9.0/pkg/scheduler/actions/allocate/allocate.go (about) 1 /* 2 Copyright 2021 The Volcano Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package allocate 18 19 import ( 20 "time" 21 22 "k8s.io/klog/v2" 23 24 "volcano.sh/apis/pkg/apis/scheduling" 25 "volcano.sh/volcano/pkg/scheduler/api" 26 "volcano.sh/volcano/pkg/scheduler/conf" 27 "volcano.sh/volcano/pkg/scheduler/framework" 28 "volcano.sh/volcano/pkg/scheduler/metrics" 29 "volcano.sh/volcano/pkg/scheduler/util" 30 ) 31 32 type Action struct { 33 session *framework.Session 34 } 35 36 func New() *Action { 37 return &Action{} 38 } 39 40 func (alloc *Action) Name() string { 41 return "allocate" 42 } 43 44 func (alloc *Action) Initialize() {} 45 46 func (alloc *Action) Execute(ssn *framework.Session) { 47 klog.V(5).Infof("Enter Allocate ...") 48 defer klog.V(5).Infof("Leaving Allocate ...") 49 50 // the allocation for pod may have many stages 51 // 1. pick a queue named Q (using ssn.QueueOrderFn) 52 // 2. pick a job named J from Q (using ssn.JobOrderFn) 53 // 3. pick a task T from J (using ssn.TaskOrderFn) 54 // 4. use predicateFn to filter out node that T can not be allocated on. 55 // 5. use ssn.NodeOrderFn to judge the best node and assign it to T 56 57 // queues sort queues by QueueOrderFn. 58 queues := util.NewPriorityQueue(ssn.QueueOrderFn) 59 // jobsMap is used to find job with the highest priority in given queue. 60 jobsMap := map[api.QueueID]*util.PriorityQueue{} 61 62 alloc.session = ssn 63 alloc.pickUpQueuesAndJobs(queues, jobsMap) 64 klog.V(3).Infof("Try to allocate resource to %d Queues", len(jobsMap)) 65 alloc.allocateResources(queues, jobsMap) 66 } 67 68 func (alloc *Action) pickUpQueuesAndJobs(queues *util.PriorityQueue, jobsMap map[api.QueueID]*util.PriorityQueue) { 69 ssn := alloc.session 70 for _, job := range ssn.Jobs { 71 // If not config enqueue action, change Pending pg into Inqueue statue to avoid blocking job scheduling. 72 if conf.EnabledActionMap["enqueue"] { 73 if job.IsPending() { 74 klog.V(4).Infof("Job <%s/%s> Queue <%s> skip allocate, reason: job status is pending.", 75 job.Namespace, job.Name, job.Queue) 76 continue 77 } 78 } else if job.IsPending() { 79 klog.V(4).Infof("Job <%s/%s> Queue <%s> status update from pending to inqueue, reason: no enqueue action is configured.", 80 job.Namespace, job.Name, job.Queue) 81 job.PodGroup.Status.Phase = scheduling.PodGroupInqueue 82 } 83 84 if vr := ssn.JobValid(job); vr != nil && !vr.Pass { 85 klog.V(4).Infof("Job <%s/%s> Queue <%s> skip allocate, reason: %v, message %v", job.Namespace, job.Name, job.Queue, vr.Reason, vr.Message) 86 continue 87 } 88 89 if _, found := ssn.Queues[job.Queue]; !found { 90 klog.Warningf("Skip adding Job <%s/%s> because its queue %s is not found", 91 job.Namespace, job.Name, job.Queue) 92 continue 93 } 94 95 if _, found := jobsMap[job.Queue]; !found { 96 jobsMap[job.Queue] = util.NewPriorityQueue(ssn.JobOrderFn) 97 queues.Push(ssn.Queues[job.Queue]) 98 } 99 100 klog.V(4).Infof("Added Job <%s/%s> into Queue <%s>", job.Namespace, job.Name, job.Queue) 101 jobsMap[job.Queue].Push(job) 102 } 103 } 104 105 // allocateResources primarily accomplishes two steps: 106 // 1. picks up tasks. 107 // 2. allocates resources to these tasks. (this step is carried out by the allocateResourcesForTasks method.) 108 func (alloc *Action) allocateResources(queues *util.PriorityQueue, jobsMap map[api.QueueID]*util.PriorityQueue) { 109 ssn := alloc.session 110 pendingTasks := map[api.JobID]*util.PriorityQueue{} 111 112 allNodes := ssn.NodeList 113 114 // To pick <namespace, queue> tuple for job, we choose to pick namespace firstly. 115 // Because we believe that number of queues would less than namespaces in most case. 116 // And, this action would make the resource usage among namespace balanced. 117 for { 118 if queues.Empty() { 119 break 120 } 121 122 queue := queues.Pop().(*api.QueueInfo) 123 124 if ssn.Overused(queue) { 125 klog.V(3).Infof("Queue <%s> is overused, ignore it.", queue.Name) 126 continue 127 } 128 129 klog.V(3).Infof("Try to allocate resource to Jobs in Queue <%s>", queue.Name) 130 131 jobs, found := jobsMap[queue.UID] 132 if !found || jobs.Empty() { 133 klog.V(4).Infof("Can not find jobs for queue %s.", queue.Name) 134 continue 135 } 136 137 job := jobs.Pop().(*api.JobInfo) 138 if _, found = pendingTasks[job.UID]; !found { 139 tasks := util.NewPriorityQueue(ssn.TaskOrderFn) 140 for _, task := range job.TaskStatusIndex[api.Pending] { 141 // Skip BestEffort task in 'allocate' action. 142 if task.Resreq.IsEmpty() { 143 klog.V(4).Infof("Task <%v/%v> is BestEffort task, skip it.", 144 task.Namespace, task.Name) 145 continue 146 } 147 148 tasks.Push(task) 149 } 150 pendingTasks[job.UID] = tasks 151 } 152 tasks := pendingTasks[job.UID] 153 154 // Added Queue back until no job in Namespace. 155 queues.Push(queue) 156 157 if tasks.Empty() { 158 continue 159 } 160 161 klog.V(3).Infof("Try to allocate resource to %d tasks of Job <%v/%v>", 162 tasks.Len(), job.Namespace, job.Name) 163 164 alloc.allocateResourcesForTasks(tasks, job, jobs, queue, allNodes) 165 } 166 } 167 168 func (alloc *Action) allocateResourcesForTasks(tasks *util.PriorityQueue, job *api.JobInfo, jobs *util.PriorityQueue, queue *api.QueueInfo, allNodes []*api.NodeInfo) { 169 ssn := alloc.session 170 stmt := framework.NewStatement(ssn) 171 ph := util.NewPredicateHelper() 172 173 for !tasks.Empty() { 174 task := tasks.Pop().(*api.TaskInfo) 175 176 if !ssn.Allocatable(queue, task) { 177 klog.V(3).Infof("Queue <%s> is overused when considering task <%s>, ignore it.", queue.Name, task.Name) 178 continue 179 } 180 181 klog.V(3).Infof("There are <%d> nodes for Job <%v/%v>", len(ssn.Nodes), job.Namespace, job.Name) 182 183 if err := ssn.PrePredicateFn(task); err != nil { 184 klog.V(3).Infof("PrePredicate for task %s/%s failed for: %v", task.Namespace, task.Name, err) 185 fitErrors := api.NewFitErrors() 186 for _, ni := range allNodes { 187 fitErrors.SetNodeError(ni.Name, err) 188 } 189 job.NodesFitErrors[task.UID] = fitErrors 190 break 191 } 192 193 predicateNodes, fitErrors := ph.PredicateNodes(task, allNodes, alloc.predicate, true) 194 if len(predicateNodes) == 0 { 195 job.NodesFitErrors[task.UID] = fitErrors 196 break 197 } 198 199 // Candidate nodes are divided into two gradients: 200 // - the first gradient node: a list of free nodes that satisfy the task resource request; 201 // - The second gradient node: the node list whose sum of node idle resources and future idle meets the task resource request; 202 // Score the first gradient node first. If the first gradient node meets the requirements, ignore the second gradient node list, 203 // otherwise, score the second gradient node and select the appropriate node. 204 var candidateNodes [][]*api.NodeInfo 205 var idleCandidateNodes []*api.NodeInfo 206 var futureIdleCandidateNodes []*api.NodeInfo 207 for _, n := range predicateNodes { 208 if task.InitResreq.LessEqual(n.Idle, api.Zero) { 209 idleCandidateNodes = append(idleCandidateNodes, n) 210 } else if task.InitResreq.LessEqual(n.FutureIdle(), api.Zero) { 211 futureIdleCandidateNodes = append(futureIdleCandidateNodes, n) 212 } else { 213 klog.V(5).Infof("Predicate filtered node %v, idle: %v and future idle: %v do not meet the requirements of task: %v", 214 n.Name, n.Idle, n.FutureIdle(), task.Name) 215 } 216 } 217 candidateNodes = append(candidateNodes, idleCandidateNodes) 218 candidateNodes = append(candidateNodes, futureIdleCandidateNodes) 219 220 var bestNode *api.NodeInfo 221 for index, nodes := range candidateNodes { 222 if klog.V(5).Enabled() { 223 for _, node := range nodes { 224 klog.V(5).Infof("node %v, idle: %v, future idle: %v", node.Name, node.Idle, node.FutureIdle()) 225 } 226 } 227 switch { 228 case len(nodes) == 0: 229 klog.V(5).Infof("Task: %v, no matching node is found in the candidateNodes(index: %d) list.", task.Name, index) 230 case len(nodes) == 1: // If only one node after predicate, just use it. 231 bestNode = nodes[0] 232 case len(nodes) > 1: // If more than one node after predicate, using "the best" one 233 nodeScores := util.PrioritizeNodes(task, nodes, ssn.BatchNodeOrderFn, ssn.NodeOrderMapFn, ssn.NodeOrderReduceFn) 234 235 bestNode = ssn.BestNodeFn(task, nodeScores) 236 if bestNode == nil { 237 bestNode = util.SelectBestNode(nodeScores) 238 } 239 } 240 241 // If a proper node is found in idleCandidateNodes, skip futureIdleCandidateNodes and directly return the node information. 242 if bestNode != nil { 243 break 244 } 245 } 246 247 // Allocate idle resource to the task. 248 if task.InitResreq.LessEqual(bestNode.Idle, api.Zero) { 249 klog.V(3).Infof("Binding Task <%v/%v> to node <%v>", 250 task.Namespace, task.Name, bestNode.Name) 251 if err := stmt.Allocate(task, bestNode); err != nil { 252 klog.Errorf("Failed to bind Task %v on %v in Session %v, err: %v", 253 task.UID, bestNode.Name, ssn.UID, err) 254 } else { 255 metrics.UpdateE2eSchedulingDurationByJob(job.Name, string(job.Queue), job.Namespace, metrics.Duration(job.CreationTimestamp.Time)) 256 metrics.UpdateE2eSchedulingLastTimeByJob(job.Name, string(job.Queue), job.Namespace, time.Now()) 257 } 258 } else { 259 klog.V(3).Infof("Predicates failed in allocate for task <%s/%s> on node <%s> with limited resources", 260 task.Namespace, task.Name, bestNode.Name) 261 262 // Allocate releasing resource to the task if any. 263 if task.InitResreq.LessEqual(bestNode.FutureIdle(), api.Zero) { 264 klog.V(3).Infof("Pipelining Task <%v/%v> to node <%v> for <%v> on <%v>", 265 task.Namespace, task.Name, bestNode.Name, task.InitResreq, bestNode.Releasing) 266 if err := stmt.Pipeline(task, bestNode.Name); err != nil { 267 klog.Errorf("Failed to pipeline Task %v on %v in Session %v for %v.", 268 task.UID, bestNode.Name, ssn.UID, err) 269 } else { 270 metrics.UpdateE2eSchedulingDurationByJob(job.Name, string(job.Queue), job.Namespace, metrics.Duration(job.CreationTimestamp.Time)) 271 metrics.UpdateE2eSchedulingLastTimeByJob(job.Name, string(job.Queue), job.Namespace, time.Now()) 272 } 273 } 274 } 275 276 if ssn.JobReady(job) && !tasks.Empty() { 277 jobs.Push(job) 278 break 279 } 280 } 281 282 if ssn.JobReady(job) { 283 stmt.Commit() 284 } else { 285 if !ssn.JobPipelined(job) { 286 stmt.Discard() 287 } 288 } 289 } 290 291 func (alloc *Action) predicate(task *api.TaskInfo, node *api.NodeInfo) ([]*api.Status, error) { 292 // Check for Resource Predicate 293 if ok, resources := task.InitResreq.LessEqualWithResourcesName(node.FutureIdle(), api.Zero); !ok { 294 return nil, api.NewFitError(task, node, api.WrapInsufficientResourceReason(resources)) 295 } 296 var statusSets util.StatusSets 297 statusSets, err := alloc.session.PredicateFn(task, node) 298 if err != nil { 299 return nil, api.NewFitError(task, node, err.Error()) 300 } 301 302 if statusSets.ContainsUnschedulable() || statusSets.ContainsUnschedulableAndUnresolvable() || 303 statusSets.ContainsErrorSkipOrWait() { 304 return nil, api.NewFitError(task, node, statusSets.Message()) 305 } 306 return nil, nil 307 } 308 309 func (alloc *Action) UnInitialize() {}