volcano.sh/volcano@v1.9.0/pkg/scheduler/actions/preempt/preempt.go (about) 1 /* 2 Copyright 2018 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package preempt 18 19 import ( 20 "fmt" 21 22 "k8s.io/klog/v2" 23 24 "volcano.sh/volcano/pkg/scheduler/api" 25 "volcano.sh/volcano/pkg/scheduler/framework" 26 "volcano.sh/volcano/pkg/scheduler/metrics" 27 "volcano.sh/volcano/pkg/scheduler/util" 28 ) 29 30 type Action struct{} 31 32 func New() *Action { 33 return &Action{} 34 } 35 36 func (pmpt *Action) Name() string { 37 return "preempt" 38 } 39 40 func (pmpt *Action) Initialize() {} 41 42 func (pmpt *Action) Execute(ssn *framework.Session) { 43 klog.V(5).Infof("Enter Preempt ...") 44 defer klog.V(5).Infof("Leaving Preempt ...") 45 46 preemptorsMap := map[api.QueueID]*util.PriorityQueue{} 47 preemptorTasks := map[api.JobID]*util.PriorityQueue{} 48 49 var underRequest []*api.JobInfo 50 queues := map[api.QueueID]*api.QueueInfo{} 51 52 for _, job := range ssn.Jobs { 53 if job.IsPending() { 54 continue 55 } 56 57 if vr := ssn.JobValid(job); vr != nil && !vr.Pass { 58 klog.V(4).Infof("Job <%s/%s> Queue <%s> skip preemption, reason: %v, message %v", job.Namespace, job.Name, job.Queue, vr.Reason, vr.Message) 59 continue 60 } 61 62 if queue, found := ssn.Queues[job.Queue]; !found { 63 continue 64 } else if _, existed := queues[queue.UID]; !existed { 65 klog.V(3).Infof("Added Queue <%s> for Job <%s/%s>", 66 queue.Name, job.Namespace, job.Name) 67 queues[queue.UID] = queue 68 } 69 70 // check job if starving for more resources. 71 if ssn.JobStarving(job) { 72 if _, found := preemptorsMap[job.Queue]; !found { 73 preemptorsMap[job.Queue] = util.NewPriorityQueue(ssn.JobOrderFn) 74 } 75 preemptorsMap[job.Queue].Push(job) 76 underRequest = append(underRequest, job) 77 preemptorTasks[job.UID] = util.NewPriorityQueue(ssn.TaskOrderFn) 78 for _, task := range job.TaskStatusIndex[api.Pending] { 79 preemptorTasks[job.UID].Push(task) 80 } 81 } 82 } 83 84 ph := util.NewPredicateHelper() 85 // Preemption between Jobs within Queue. 86 for _, queue := range queues { 87 for { 88 preemptors := preemptorsMap[queue.UID] 89 90 // If no preemptors, no preemption. 91 if preemptors == nil || preemptors.Empty() { 92 klog.V(4).Infof("No preemptors in Queue <%s>, break.", queue.Name) 93 break 94 } 95 96 preemptorJob := preemptors.Pop().(*api.JobInfo) 97 98 stmt := framework.NewStatement(ssn) 99 assigned := false 100 for { 101 // If job is not request more resource, then stop preempting. 102 if !ssn.JobStarving(preemptorJob) { 103 break 104 } 105 106 // If not preemptor tasks, next job. 107 if preemptorTasks[preemptorJob.UID].Empty() { 108 klog.V(3).Infof("No preemptor task in job <%s/%s>.", 109 preemptorJob.Namespace, preemptorJob.Name) 110 break 111 } 112 113 preemptor := preemptorTasks[preemptorJob.UID].Pop().(*api.TaskInfo) 114 115 if preempted, _ := preempt(ssn, stmt, preemptor, func(task *api.TaskInfo) bool { 116 // Ignore non running task. 117 if !api.PreemptableStatus(task.Status) { 118 return false 119 } 120 // BestEffort pod is not supported to preempt unBestEffort pod. 121 if preemptor.BestEffort && !task.BestEffort { 122 return false 123 } 124 if !task.Preemptable { 125 return false 126 } 127 job, found := ssn.Jobs[task.Job] 128 if !found { 129 return false 130 } 131 // Preempt other jobs within queue 132 return job.Queue == preemptorJob.Queue && preemptor.Job != task.Job 133 }, ph); preempted { 134 assigned = true 135 } 136 } 137 138 // Commit changes only if job is pipelined, otherwise try next job. 139 if ssn.JobPipelined(preemptorJob) { 140 stmt.Commit() 141 } else { 142 stmt.Discard() 143 continue 144 } 145 146 if assigned { 147 preemptors.Push(preemptorJob) 148 } 149 } 150 151 // Preemption between Task within Job. 152 for _, job := range underRequest { 153 // Fix: preemptor numbers lose when in same job 154 preemptorTasks[job.UID] = util.NewPriorityQueue(ssn.TaskOrderFn) 155 for _, task := range job.TaskStatusIndex[api.Pending] { 156 preemptorTasks[job.UID].Push(task) 157 } 158 for { 159 if _, found := preemptorTasks[job.UID]; !found { 160 break 161 } 162 163 if preemptorTasks[job.UID].Empty() { 164 break 165 } 166 167 preemptor := preemptorTasks[job.UID].Pop().(*api.TaskInfo) 168 169 stmt := framework.NewStatement(ssn) 170 assigned, _ := preempt(ssn, stmt, preemptor, func(task *api.TaskInfo) bool { 171 // Ignore non running task. 172 if !api.PreemptableStatus(task.Status) { 173 return false 174 } 175 // BestEffort pod is not supported to preempt unBestEffort pod. 176 if preemptor.BestEffort && !task.BestEffort { 177 return false 178 } 179 // Preempt tasks within job. 180 return preemptor.Job == task.Job 181 }, ph) 182 stmt.Commit() 183 184 // If no preemption, next job. 185 if !assigned { 186 break 187 } 188 } 189 } 190 } 191 192 // call victimTasksFn to evict tasks 193 victimTasks(ssn) 194 } 195 196 func (pmpt *Action) UnInitialize() {} 197 198 func preempt( 199 ssn *framework.Session, 200 stmt *framework.Statement, 201 preemptor *api.TaskInfo, 202 filter func(*api.TaskInfo) bool, 203 predicateHelper util.PredicateHelper, 204 ) (bool, error) { 205 assigned := false 206 allNodes := ssn.NodeList 207 if err := ssn.PrePredicateFn(preemptor); err != nil { 208 return false, fmt.Errorf("PrePredicate for task %s/%s failed for: %v", preemptor.Namespace, preemptor.Name, err) 209 } 210 211 predicateFn := func(task *api.TaskInfo, node *api.NodeInfo) ([]*api.Status, error) { 212 var statusSets util.StatusSets 213 statusSets, _ = ssn.PredicateFn(task, node) 214 215 // When filtering candidate nodes, need to consider the node statusSets instead of the err information. 216 // refer to kube-scheduler preemption code: https://github.com/kubernetes/kubernetes/blob/9d87fa215d9e8020abdc17132d1252536cd752d2/pkg/scheduler/framework/preemption/preemption.go#L422 217 if statusSets.ContainsUnschedulableAndUnresolvable() || statusSets.ContainsErrorSkipOrWait() { 218 return nil, api.NewFitError(task, node, statusSets.Message()) 219 } 220 return nil, nil 221 } 222 223 predicateNodes, _ := predicateHelper.PredicateNodes(preemptor, allNodes, predicateFn, true) 224 225 nodeScores := util.PrioritizeNodes(preemptor, predicateNodes, ssn.BatchNodeOrderFn, ssn.NodeOrderMapFn, ssn.NodeOrderReduceFn) 226 227 selectedNodes := util.SortNodes(nodeScores) 228 229 job, found := ssn.Jobs[preemptor.Job] 230 if !found { 231 return false, fmt.Errorf("Job %s not found in SSN", preemptor.Job) 232 } 233 234 currentQueue := ssn.Queues[job.Queue] 235 236 for _, node := range selectedNodes { 237 klog.V(3).Infof("Considering Task <%s/%s> on Node <%s>.", 238 preemptor.Namespace, preemptor.Name, node.Name) 239 240 var preemptees []*api.TaskInfo 241 for _, task := range node.Tasks { 242 if filter == nil { 243 preemptees = append(preemptees, task.Clone()) 244 } else if filter(task) { 245 preemptees = append(preemptees, task.Clone()) 246 } 247 } 248 victims := ssn.Preemptable(preemptor, preemptees) 249 metrics.UpdatePreemptionVictimsCount(len(victims)) 250 251 if err := util.ValidateVictims(preemptor, node, victims); err != nil { 252 klog.V(3).Infof("No validated victims on Node <%s>: %v", node.Name, err) 253 continue 254 } 255 256 victimsQueue := ssn.BuildVictimsPriorityQueue(victims) 257 // Preempt victims for tasks, pick lowest priority task first. 258 preempted := api.EmptyResource() 259 260 for !victimsQueue.Empty() { 261 // If reclaimed enough resources, break loop to avoid Sub panic. 262 // Preempt action is about preempt in same queue, which job is not allocatable in allocate action, due to: 263 // 1. cluster has free resource, but queue not allocatable 264 // 2. cluster has no free resource, but queue not allocatable 265 // 3. cluster has no free resource, but queue allocatable 266 // for case 1 and 2, high priority job/task can preempt low priority job/task in same queue; 267 // for case 3, it need to do reclaim resource from other queue, in reclaim action; 268 // so if current queue is not allocatable(the queue will be overused when consider current preemptor's requests) 269 // or current idle resource is not enougth for preemptor, it need to continue preempting 270 // otherwise, break out 271 if ssn.Allocatable(currentQueue, preemptor) && preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) { 272 break 273 } 274 preemptee := victimsQueue.Pop().(*api.TaskInfo) 275 klog.V(3).Infof("Try to preempt Task <%s/%s> for Task <%s/%s>", 276 preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name) 277 if err := stmt.Evict(preemptee, "preempt"); err != nil { 278 klog.Errorf("Failed to preempt Task <%s/%s> for Task <%s/%s>: %v", 279 preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name, err) 280 continue 281 } 282 preempted.Add(preemptee.Resreq) 283 } 284 285 metrics.RegisterPreemptionAttempts() 286 klog.V(3).Infof("Preempted <%v> for Task <%s/%s> requested <%v>.", 287 preempted, preemptor.Namespace, preemptor.Name, preemptor.InitResreq) 288 289 // If preemptor's queue is overused, it means preemptor can not be allocated. So no need care about the node idle resource 290 if ssn.Allocatable(currentQueue, preemptor) && preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) { 291 if err := stmt.Pipeline(preemptor, node.Name); err != nil { 292 klog.Errorf("Failed to pipeline Task <%s/%s> on Node <%s>", 293 preemptor.Namespace, preemptor.Name, node.Name) 294 } 295 296 // Ignore pipeline error, will be corrected in next scheduling loop. 297 assigned = true 298 299 break 300 } 301 } 302 303 return assigned, nil 304 } 305 306 func victimTasks(ssn *framework.Session) { 307 stmt := framework.NewStatement(ssn) 308 tasks := make([]*api.TaskInfo, 0) 309 victimTasksMap := ssn.VictimTasks(tasks) 310 for victim := range victimTasksMap { 311 if err := stmt.Evict(victim.Clone(), "evict"); err != nil { 312 klog.Errorf("Failed to evict Task <%s/%s>: %v", 313 victim.Namespace, victim.Name, err) 314 continue 315 } 316 } 317 stmt.Commit() 318 }