volcano.sh/volcano@v1.9.0/pkg/scheduler/actions/backfill/backfill.go (about) 1 /* 2 Copyright 2018 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package backfill 18 19 import ( 20 "fmt" 21 "time" 22 23 "k8s.io/klog/v2" 24 25 "volcano.sh/volcano/pkg/scheduler/api" 26 "volcano.sh/volcano/pkg/scheduler/framework" 27 "volcano.sh/volcano/pkg/scheduler/metrics" 28 "volcano.sh/volcano/pkg/scheduler/util" 29 ) 30 31 type Action struct{} 32 33 func New() *Action { 34 return &Action{} 35 } 36 37 func (backfill *Action) Name() string { 38 return "backfill" 39 } 40 41 func (backfill *Action) Initialize() {} 42 43 func (backfill *Action) Execute(ssn *framework.Session) { 44 klog.V(5).Infof("Enter Backfill ...") 45 defer klog.V(5).Infof("Leaving Backfill ...") 46 47 predicatFunc := func(task *api.TaskInfo, node *api.NodeInfo) ([]*api.Status, error) { 48 var statusSets util.StatusSets 49 statusSets, err := ssn.PredicateFn(task, node) 50 if err != nil { 51 return nil, err 52 } 53 54 // predicateHelper.PredicateNodes will print the log if predicate failed, so don't print log anymore here 55 if statusSets.ContainsUnschedulable() || statusSets.ContainsUnschedulableAndUnresolvable() || statusSets.ContainsErrorSkipOrWait() { 56 err := fmt.Errorf(statusSets.Message()) // should not include variables in api node errors 57 return nil, err 58 } 59 return nil, nil 60 } 61 62 // TODO (k82cn): When backfill, it's also need to balance between Queues. 63 pendingTasks := backfill.pickUpPendingTasks(ssn) 64 for _, task := range pendingTasks { 65 job := ssn.Jobs[task.Job] 66 ph := util.NewPredicateHelper() 67 allocated := false 68 fe := api.NewFitErrors() 69 70 if err := ssn.PrePredicateFn(task); err != nil { 71 klog.V(3).Infof("PrePredicate for task %s/%s failed in backfill for: %v", task.Namespace, task.Name, err) 72 for _, ni := range ssn.Nodes { 73 fe.SetNodeError(ni.Name, err) 74 } 75 job.NodesFitErrors[task.UID] = fe 76 break 77 } 78 79 predicateNodes, fitErrors := ph.PredicateNodes(task, ssn.NodeList, predicatFunc, true) 80 if len(predicateNodes) == 0 { 81 job.NodesFitErrors[task.UID] = fitErrors 82 break 83 } 84 85 node := predicateNodes[0] 86 if len(predicateNodes) > 1 { 87 nodeScores := util.PrioritizeNodes(task, predicateNodes, ssn.BatchNodeOrderFn, ssn.NodeOrderMapFn, ssn.NodeOrderReduceFn) 88 node = ssn.BestNodeFn(task, nodeScores) 89 if node == nil { 90 node = util.SelectBestNode(nodeScores) 91 } 92 } 93 94 klog.V(3).Infof("Binding Task <%v/%v> to node <%v>", task.Namespace, task.Name, node.Name) 95 if err := ssn.Allocate(task, node); err != nil { 96 klog.Errorf("Failed to bind Task %v on %v in Session %v", task.UID, node.Name, ssn.UID) 97 fe.SetNodeError(node.Name, err) 98 continue 99 } 100 101 metrics.UpdateE2eSchedulingDurationByJob(job.Name, string(job.Queue), job.Namespace, metrics.Duration(job.CreationTimestamp.Time)) 102 metrics.UpdateE2eSchedulingLastTimeByJob(job.Name, string(job.Queue), job.Namespace, time.Now()) 103 allocated = true 104 105 if !allocated { 106 job.NodesFitErrors[task.UID] = fe 107 } 108 // TODO (k82cn): backfill for other case. 109 } 110 } 111 112 func (backfill *Action) UnInitialize() {} 113 114 func (backfill *Action) pickUpPendingTasks(ssn *framework.Session) []*api.TaskInfo { 115 queues := util.NewPriorityQueue(ssn.QueueOrderFn) 116 jobs := map[api.QueueID]*util.PriorityQueue{} 117 tasks := map[api.JobID]*util.PriorityQueue{} 118 var pendingTasks []*api.TaskInfo 119 for _, job := range ssn.Jobs { 120 if job.IsPending() { 121 continue 122 } 123 124 if vr := ssn.JobValid(job); vr != nil && !vr.Pass { 125 klog.V(4).Infof("Job <%s/%s> Queue <%s> skip backfill, reason: %v, message %v", job.Namespace, job.Name, job.Queue, vr.Reason, vr.Message) 126 continue 127 } 128 129 queue, found := ssn.Queues[job.Queue] 130 if !found { 131 continue 132 } 133 134 for _, task := range job.TaskStatusIndex[api.Pending] { 135 if !task.BestEffort { 136 continue 137 } 138 if _, existed := tasks[job.UID]; !existed { 139 tasks[job.UID] = util.NewPriorityQueue(ssn.TaskOrderFn) 140 } 141 tasks[job.UID].Push(task) 142 } 143 144 for _, task := range job.TaskStatusIndex[api.Pipelined] { 145 if !task.BestEffort { 146 continue 147 } 148 149 stmt := framework.NewStatement(ssn) 150 err := stmt.UnPipeline(task) 151 if err != nil { 152 klog.Errorf("Failed to unpipeline task: %s", err.Error()) 153 continue 154 } 155 if _, existed := tasks[job.UID]; !existed { 156 tasks[job.UID] = util.NewPriorityQueue(ssn.TaskOrderFn) 157 } 158 tasks[job.UID].Push(task) 159 } 160 161 if _, existed := tasks[job.UID]; !existed { 162 continue 163 } 164 165 if _, existed := jobs[queue.UID]; !existed { 166 queues.Push(queue) 167 jobs[job.Queue] = util.NewPriorityQueue(ssn.JobOrderFn) 168 } 169 jobs[job.Queue].Push(job) 170 } 171 172 for !queues.Empty() { 173 queue, ok := queues.Pop().(*api.QueueInfo) 174 if !ok { 175 klog.V(3).Infof("QueueInfo transition failed, ignore it.") 176 continue 177 } 178 for !jobs[queue.UID].Empty() { 179 job, ok := jobs[queue.UID].Pop().(*api.JobInfo) 180 if !ok { 181 klog.Errorf("JobInfo transition failed, ignore it.") 182 continue 183 } 184 for !tasks[job.UID].Empty() { 185 pendingTasks = append(pendingTasks, tasks[job.UID].Pop().(*api.TaskInfo)) 186 } 187 } 188 } 189 return pendingTasks 190 }