volcano.sh/volcano@v1.9.0/pkg/scheduler/actions/reclaim/reclaim.go (about)

     1  /*
     2  Copyright 2018 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package reclaim
    18  
    19  import (
    20  	"k8s.io/klog/v2"
    21  
    22  	"volcano.sh/volcano/pkg/scheduler/api"
    23  	"volcano.sh/volcano/pkg/scheduler/framework"
    24  	"volcano.sh/volcano/pkg/scheduler/util"
    25  )
    26  
// Action implements the scheduler action named "reclaim". It carries no state.
type Action struct{}
    28  
    29  func New() *Action {
    30  	return &Action{}
    31  }
    32  
    33  func (ra *Action) Name() string {
    34  	return "reclaim"
    35  }
    36  
// Initialize is a no-op; the reclaim action performs no setup here.
func (ra *Action) Initialize() {}
    38  
    39  func (ra *Action) Execute(ssn *framework.Session) {
    40  	klog.V(5).Infof("Enter Reclaim ...")
    41  	defer klog.V(5).Infof("Leaving Reclaim ...")
    42  
    43  	queues := util.NewPriorityQueue(ssn.QueueOrderFn)
    44  	queueMap := map[api.QueueID]*api.QueueInfo{}
    45  
    46  	preemptorsMap := map[api.QueueID]*util.PriorityQueue{}
    47  	preemptorTasks := map[api.JobID]*util.PriorityQueue{}
    48  
    49  	klog.V(3).Infof("There are <%d> Jobs and <%d> Queues in total for scheduling.",
    50  		len(ssn.Jobs), len(ssn.Queues))
    51  
    52  	for _, job := range ssn.Jobs {
    53  		if job.IsPending() {
    54  			continue
    55  		}
    56  
    57  		if vr := ssn.JobValid(job); vr != nil && !vr.Pass {
    58  			klog.V(4).Infof("Job <%s/%s> Queue <%s> skip reclaim, reason: %v, message %v", job.Namespace, job.Name, job.Queue, vr.Reason, vr.Message)
    59  			continue
    60  		}
    61  
    62  		if queue, found := ssn.Queues[job.Queue]; !found {
    63  			klog.Errorf("Failed to find Queue <%s> for Job <%s/%s>",
    64  				job.Queue, job.Namespace, job.Name)
    65  			continue
    66  		} else if _, existed := queueMap[queue.UID]; !existed {
    67  			klog.V(4).Infof("Added Queue <%s> for Job <%s/%s>", queue.Name, job.Namespace, job.Name)
    68  			queueMap[queue.UID] = queue
    69  			queues.Push(queue)
    70  		}
    71  
    72  		if job.HasPendingTasks() {
    73  			if _, found := preemptorsMap[job.Queue]; !found {
    74  				preemptorsMap[job.Queue] = util.NewPriorityQueue(ssn.JobOrderFn)
    75  			}
    76  			preemptorsMap[job.Queue].Push(job)
    77  			preemptorTasks[job.UID] = util.NewPriorityQueue(ssn.TaskOrderFn)
    78  			for _, task := range job.TaskStatusIndex[api.Pending] {
    79  				preemptorTasks[job.UID].Push(task)
    80  			}
    81  		}
    82  	}
    83  
    84  	for {
    85  		// If no queues, break
    86  		if queues.Empty() {
    87  			break
    88  		}
    89  
    90  		var job *api.JobInfo
    91  		var task *api.TaskInfo
    92  
    93  		queue := queues.Pop().(*api.QueueInfo)
    94  		if ssn.Overused(queue) {
    95  			klog.V(3).Infof("Queue <%s> is overused, ignore it.", queue.Name)
    96  			continue
    97  		}
    98  		if !ssn.Preemptive(queue) {
    99  			klog.V(3).Infof("Queue <%s> can not reclaim by preempt others, ignore it.", queue.Name)
   100  			continue
   101  		}
   102  
   103  		// Found "high" priority job
   104  		jobs, found := preemptorsMap[queue.UID]
   105  		if !found || jobs.Empty() {
   106  			continue
   107  		} else {
   108  			job = jobs.Pop().(*api.JobInfo)
   109  		}
   110  
   111  		// Found "high" priority task to reclaim others
   112  		if tasks, found := preemptorTasks[job.UID]; !found || tasks.Empty() {
   113  			continue
   114  		} else {
   115  			task = tasks.Pop().(*api.TaskInfo)
   116  		}
   117  
   118  		if !ssn.Allocatable(queue, task) {
   119  			klog.V(3).Infof("Queue <%s> is overused when considering task <%s>, ignore it.", queue.Name, task.Name)
   120  			continue
   121  		}
   122  
   123  		if err := ssn.PrePredicateFn(task); err != nil {
   124  			klog.V(3).Infof("PrePredicate for task %s/%s failed for: %v", task.Namespace, task.Name, err)
   125  			continue
   126  		}
   127  
   128  		assigned := false
   129  		for _, n := range ssn.Nodes {
   130  			var statusSets util.StatusSets
   131  			statusSets, _ = ssn.PredicateFn(task, n)
   132  
   133  			// When filtering candidate nodes, need to consider the node statusSets instead of the err information.
   134  			// refer to kube-scheduler preemption code: https://github.com/kubernetes/kubernetes/blob/9d87fa215d9e8020abdc17132d1252536cd752d2/pkg/scheduler/framework/preemption/preemption.go#L422
   135  			if statusSets.ContainsUnschedulableAndUnresolvable() || statusSets.ContainsErrorSkipOrWait() {
   136  				klog.V(5).Infof("predicates failed in reclaim for task <%s/%s> on node <%s>, reason is %s.",
   137  					task.Namespace, task.Name, n.Name, statusSets.Message())
   138  				continue
   139  			}
   140  			klog.V(3).Infof("Considering Task <%s/%s> on Node <%s>.",
   141  				task.Namespace, task.Name, n.Name)
   142  
   143  			var reclaimees []*api.TaskInfo
   144  			for _, task := range n.Tasks {
   145  				// Ignore non running task.
   146  				if task.Status != api.Running {
   147  					continue
   148  				}
   149  				if !task.Preemptable {
   150  					continue
   151  				}
   152  
   153  				if j, found := ssn.Jobs[task.Job]; !found {
   154  					continue
   155  				} else if j.Queue != job.Queue {
   156  					q := ssn.Queues[j.Queue]
   157  					if !q.Reclaimable() {
   158  						continue
   159  					}
   160  					// Clone task to avoid modify Task's status on node.
   161  					reclaimees = append(reclaimees, task.Clone())
   162  				}
   163  			}
   164  
   165  			if len(reclaimees) == 0 {
   166  				klog.V(4).Infof("No reclaimees on Node <%s>.", n.Name)
   167  				continue
   168  			}
   169  
   170  			victims := ssn.Reclaimable(task, reclaimees)
   171  
   172  			if err := util.ValidateVictims(task, n, victims); err != nil {
   173  				klog.V(3).Infof("No validated victims on Node <%s>: %v", n.Name, err)
   174  				continue
   175  			}
   176  
   177  			victimsQueue := ssn.BuildVictimsPriorityQueue(victims)
   178  
   179  			resreq := task.InitResreq.Clone()
   180  			reclaimed := api.EmptyResource()
   181  
   182  			// Reclaim victims for tasks.
   183  			for !victimsQueue.Empty() {
   184  				reclaimee := victimsQueue.Pop().(*api.TaskInfo)
   185  				klog.Errorf("Try to reclaim Task <%s/%s> for Tasks <%s/%s>",
   186  					reclaimee.Namespace, reclaimee.Name, task.Namespace, task.Name)
   187  				if err := ssn.Evict(reclaimee, "reclaim"); err != nil {
   188  					klog.Errorf("Failed to reclaim Task <%s/%s> for Tasks <%s/%s>: %v",
   189  						reclaimee.Namespace, reclaimee.Name, task.Namespace, task.Name, err)
   190  					continue
   191  				}
   192  				reclaimed.Add(reclaimee.Resreq)
   193  				// If reclaimed enough resources, break loop to avoid Sub panic.
   194  				if resreq.LessEqual(reclaimed, api.Zero) {
   195  					break
   196  				}
   197  			}
   198  
   199  			klog.V(3).Infof("Reclaimed <%v> for task <%s/%s> requested <%v>.",
   200  				reclaimed, task.Namespace, task.Name, task.InitResreq)
   201  
   202  			if task.InitResreq.LessEqual(reclaimed, api.Zero) {
   203  				if err := ssn.Pipeline(task, n.Name); err != nil {
   204  					klog.Errorf("Failed to pipeline Task <%s/%s> on Node <%s>",
   205  						task.Namespace, task.Name, n.Name)
   206  				}
   207  
   208  				// Ignore error of pipeline, will be corrected in next scheduling loop.
   209  				assigned = true
   210  
   211  				break
   212  			}
   213  		}
   214  
   215  		if assigned {
   216  			jobs.Push(job)
   217  		}
   218  		queues.Push(queue)
   219  	}
   220  }
   221  
// UnInitialize is a no-op; the reclaim action performs no teardown here.
func (ra *Action) UnInitialize() {
}