volcano.sh/volcano@v1.9.0/pkg/scheduler/actions/reclaim/reclaim.go (about) 1 /* 2 Copyright 2018 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package reclaim 18 19 import ( 20 "k8s.io/klog/v2" 21 22 "volcano.sh/volcano/pkg/scheduler/api" 23 "volcano.sh/volcano/pkg/scheduler/framework" 24 "volcano.sh/volcano/pkg/scheduler/util" 25 ) 26 27 type Action struct{} 28 29 func New() *Action { 30 return &Action{} 31 } 32 33 func (ra *Action) Name() string { 34 return "reclaim" 35 } 36 37 func (ra *Action) Initialize() {} 38 39 func (ra *Action) Execute(ssn *framework.Session) { 40 klog.V(5).Infof("Enter Reclaim ...") 41 defer klog.V(5).Infof("Leaving Reclaim ...") 42 43 queues := util.NewPriorityQueue(ssn.QueueOrderFn) 44 queueMap := map[api.QueueID]*api.QueueInfo{} 45 46 preemptorsMap := map[api.QueueID]*util.PriorityQueue{} 47 preemptorTasks := map[api.JobID]*util.PriorityQueue{} 48 49 klog.V(3).Infof("There are <%d> Jobs and <%d> Queues in total for scheduling.", 50 len(ssn.Jobs), len(ssn.Queues)) 51 52 for _, job := range ssn.Jobs { 53 if job.IsPending() { 54 continue 55 } 56 57 if vr := ssn.JobValid(job); vr != nil && !vr.Pass { 58 klog.V(4).Infof("Job <%s/%s> Queue <%s> skip reclaim, reason: %v, message %v", job.Namespace, job.Name, job.Queue, vr.Reason, vr.Message) 59 continue 60 } 61 62 if queue, found := ssn.Queues[job.Queue]; !found { 63 klog.Errorf("Failed to find Queue <%s> for Job <%s/%s>", 64 job.Queue, job.Namespace, job.Name) 65 continue 66 } else if _, existed := queueMap[queue.UID]; !existed { 67 klog.V(4).Infof("Added Queue <%s> for Job <%s/%s>", queue.Name, job.Namespace, job.Name) 68 queueMap[queue.UID] = queue 69 queues.Push(queue) 70 } 71 72 if job.HasPendingTasks() { 73 if _, found := preemptorsMap[job.Queue]; !found { 74 preemptorsMap[job.Queue] = util.NewPriorityQueue(ssn.JobOrderFn) 75 } 76 preemptorsMap[job.Queue].Push(job) 77 preemptorTasks[job.UID] = util.NewPriorityQueue(ssn.TaskOrderFn) 78 for _, task := range job.TaskStatusIndex[api.Pending] { 79 preemptorTasks[job.UID].Push(task) 80 } 81 } 82 } 83 84 for { 85 // If no queues, break 86 if queues.Empty() { 87 break 88 } 89 90 var job *api.JobInfo 91 var task *api.TaskInfo 92 93 queue := queues.Pop().(*api.QueueInfo) 94 if ssn.Overused(queue) { 95 klog.V(3).Infof("Queue <%s> is overused, ignore it.", queue.Name) 96 continue 97 } 98 if !ssn.Preemptive(queue) { 99 klog.V(3).Infof("Queue <%s> can not reclaim by preempt others, ignore it.", queue.Name) 100 continue 101 } 102 103 // Found "high" priority job 104 jobs, found := preemptorsMap[queue.UID] 105 if !found || jobs.Empty() { 106 continue 107 } else { 108 job = jobs.Pop().(*api.JobInfo) 109 } 110 111 // Found "high" priority task to reclaim others 112 if tasks, found := preemptorTasks[job.UID]; !found || tasks.Empty() { 113 continue 114 } else { 115 task = tasks.Pop().(*api.TaskInfo) 116 } 117 118 if !ssn.Allocatable(queue, task) { 119 klog.V(3).Infof("Queue <%s> is overused when considering task <%s>, ignore it.", queue.Name, task.Name) 120 continue 121 } 122 123 if err := ssn.PrePredicateFn(task); err != nil { 124 klog.V(3).Infof("PrePredicate for task %s/%s failed for: %v", task.Namespace, task.Name, err) 125 continue 126 } 127 128 assigned := false 129 for _, n := range ssn.Nodes { 130 var statusSets util.StatusSets 131 statusSets, _ = ssn.PredicateFn(task, n) 132 133 // When filtering candidate nodes, need to consider the node statusSets instead of the err information. 134 // refer to kube-scheduler preemption code: https://github.com/kubernetes/kubernetes/blob/9d87fa215d9e8020abdc17132d1252536cd752d2/pkg/scheduler/framework/preemption/preemption.go#L422 135 if statusSets.ContainsUnschedulableAndUnresolvable() || statusSets.ContainsErrorSkipOrWait() { 136 klog.V(5).Infof("predicates failed in reclaim for task <%s/%s> on node <%s>, reason is %s.", 137 task.Namespace, task.Name, n.Name, statusSets.Message()) 138 continue 139 } 140 klog.V(3).Infof("Considering Task <%s/%s> on Node <%s>.", 141 task.Namespace, task.Name, n.Name) 142 143 var reclaimees []*api.TaskInfo 144 for _, task := range n.Tasks { 145 // Ignore non running task. 146 if task.Status != api.Running { 147 continue 148 } 149 if !task.Preemptable { 150 continue 151 } 152 153 if j, found := ssn.Jobs[task.Job]; !found { 154 continue 155 } else if j.Queue != job.Queue { 156 q := ssn.Queues[j.Queue] 157 if !q.Reclaimable() { 158 continue 159 } 160 // Clone task to avoid modify Task's status on node. 161 reclaimees = append(reclaimees, task.Clone()) 162 } 163 } 164 165 if len(reclaimees) == 0 { 166 klog.V(4).Infof("No reclaimees on Node <%s>.", n.Name) 167 continue 168 } 169 170 victims := ssn.Reclaimable(task, reclaimees) 171 172 if err := util.ValidateVictims(task, n, victims); err != nil { 173 klog.V(3).Infof("No validated victims on Node <%s>: %v", n.Name, err) 174 continue 175 } 176 177 victimsQueue := ssn.BuildVictimsPriorityQueue(victims) 178 179 resreq := task.InitResreq.Clone() 180 reclaimed := api.EmptyResource() 181 182 // Reclaim victims for tasks. 183 for !victimsQueue.Empty() { 184 reclaimee := victimsQueue.Pop().(*api.TaskInfo) 185 klog.Errorf("Try to reclaim Task <%s/%s> for Tasks <%s/%s>", 186 reclaimee.Namespace, reclaimee.Name, task.Namespace, task.Name) 187 if err := ssn.Evict(reclaimee, "reclaim"); err != nil { 188 klog.Errorf("Failed to reclaim Task <%s/%s> for Tasks <%s/%s>: %v", 189 reclaimee.Namespace, reclaimee.Name, task.Namespace, task.Name, err) 190 continue 191 } 192 reclaimed.Add(reclaimee.Resreq) 193 // If reclaimed enough resources, break loop to avoid Sub panic. 194 if resreq.LessEqual(reclaimed, api.Zero) { 195 break 196 } 197 } 198 199 klog.V(3).Infof("Reclaimed <%v> for task <%s/%s> requested <%v>.", 200 reclaimed, task.Namespace, task.Name, task.InitResreq) 201 202 if task.InitResreq.LessEqual(reclaimed, api.Zero) { 203 if err := ssn.Pipeline(task, n.Name); err != nil { 204 klog.Errorf("Failed to pipeline Task <%s/%s> on Node <%s>", 205 task.Namespace, task.Name, n.Name) 206 } 207 208 // Ignore error of pipeline, will be corrected in next scheduling loop. 209 assigned = true 210 211 break 212 } 213 } 214 215 if assigned { 216 jobs.Push(job) 217 } 218 queues.Push(queue) 219 } 220 } 221 222 func (ra *Action) UnInitialize() { 223 }