volcano.sh/volcano@v1.9.0/pkg/scheduler/actions/preempt/preempt.go (about)

     1  /*
     2  Copyright 2018 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package preempt
    18  
    19  import (
    20  	"fmt"
    21  
    22  	"k8s.io/klog/v2"
    23  
    24  	"volcano.sh/volcano/pkg/scheduler/api"
    25  	"volcano.sh/volcano/pkg/scheduler/framework"
    26  	"volcano.sh/volcano/pkg/scheduler/metrics"
    27  	"volcano.sh/volcano/pkg/scheduler/util"
    28  )
    29  
    30  type Action struct{}
    31  
    32  func New() *Action {
    33  	return &Action{}
    34  }
    35  
    36  func (pmpt *Action) Name() string {
    37  	return "preempt"
    38  }
    39  
    40  func (pmpt *Action) Initialize() {}
    41  
    42  func (pmpt *Action) Execute(ssn *framework.Session) {
    43  	klog.V(5).Infof("Enter Preempt ...")
    44  	defer klog.V(5).Infof("Leaving Preempt ...")
    45  
    46  	preemptorsMap := map[api.QueueID]*util.PriorityQueue{}
    47  	preemptorTasks := map[api.JobID]*util.PriorityQueue{}
    48  
    49  	var underRequest []*api.JobInfo
    50  	queues := map[api.QueueID]*api.QueueInfo{}
    51  
    52  	for _, job := range ssn.Jobs {
    53  		if job.IsPending() {
    54  			continue
    55  		}
    56  
    57  		if vr := ssn.JobValid(job); vr != nil && !vr.Pass {
    58  			klog.V(4).Infof("Job <%s/%s> Queue <%s> skip preemption, reason: %v, message %v", job.Namespace, job.Name, job.Queue, vr.Reason, vr.Message)
    59  			continue
    60  		}
    61  
    62  		if queue, found := ssn.Queues[job.Queue]; !found {
    63  			continue
    64  		} else if _, existed := queues[queue.UID]; !existed {
    65  			klog.V(3).Infof("Added Queue <%s> for Job <%s/%s>",
    66  				queue.Name, job.Namespace, job.Name)
    67  			queues[queue.UID] = queue
    68  		}
    69  
    70  		// check job if starving for more resources.
    71  		if ssn.JobStarving(job) {
    72  			if _, found := preemptorsMap[job.Queue]; !found {
    73  				preemptorsMap[job.Queue] = util.NewPriorityQueue(ssn.JobOrderFn)
    74  			}
    75  			preemptorsMap[job.Queue].Push(job)
    76  			underRequest = append(underRequest, job)
    77  			preemptorTasks[job.UID] = util.NewPriorityQueue(ssn.TaskOrderFn)
    78  			for _, task := range job.TaskStatusIndex[api.Pending] {
    79  				preemptorTasks[job.UID].Push(task)
    80  			}
    81  		}
    82  	}
    83  
    84  	ph := util.NewPredicateHelper()
    85  	// Preemption between Jobs within Queue.
    86  	for _, queue := range queues {
    87  		for {
    88  			preemptors := preemptorsMap[queue.UID]
    89  
    90  			// If no preemptors, no preemption.
    91  			if preemptors == nil || preemptors.Empty() {
    92  				klog.V(4).Infof("No preemptors in Queue <%s>, break.", queue.Name)
    93  				break
    94  			}
    95  
    96  			preemptorJob := preemptors.Pop().(*api.JobInfo)
    97  
    98  			stmt := framework.NewStatement(ssn)
    99  			assigned := false
   100  			for {
   101  				// If job is not request more resource, then stop preempting.
   102  				if !ssn.JobStarving(preemptorJob) {
   103  					break
   104  				}
   105  
   106  				// If not preemptor tasks, next job.
   107  				if preemptorTasks[preemptorJob.UID].Empty() {
   108  					klog.V(3).Infof("No preemptor task in job <%s/%s>.",
   109  						preemptorJob.Namespace, preemptorJob.Name)
   110  					break
   111  				}
   112  
   113  				preemptor := preemptorTasks[preemptorJob.UID].Pop().(*api.TaskInfo)
   114  
   115  				if preempted, _ := preempt(ssn, stmt, preemptor, func(task *api.TaskInfo) bool {
   116  					// Ignore non running task.
   117  					if !api.PreemptableStatus(task.Status) {
   118  						return false
   119  					}
   120  					// BestEffort pod is not supported to preempt unBestEffort pod.
   121  					if preemptor.BestEffort && !task.BestEffort {
   122  						return false
   123  					}
   124  					if !task.Preemptable {
   125  						return false
   126  					}
   127  					job, found := ssn.Jobs[task.Job]
   128  					if !found {
   129  						return false
   130  					}
   131  					// Preempt other jobs within queue
   132  					return job.Queue == preemptorJob.Queue && preemptor.Job != task.Job
   133  				}, ph); preempted {
   134  					assigned = true
   135  				}
   136  			}
   137  
   138  			// Commit changes only if job is pipelined, otherwise try next job.
   139  			if ssn.JobPipelined(preemptorJob) {
   140  				stmt.Commit()
   141  			} else {
   142  				stmt.Discard()
   143  				continue
   144  			}
   145  
   146  			if assigned {
   147  				preemptors.Push(preemptorJob)
   148  			}
   149  		}
   150  
   151  		// Preemption between Task within Job.
   152  		for _, job := range underRequest {
   153  			// Fix: preemptor numbers lose when in same job
   154  			preemptorTasks[job.UID] = util.NewPriorityQueue(ssn.TaskOrderFn)
   155  			for _, task := range job.TaskStatusIndex[api.Pending] {
   156  				preemptorTasks[job.UID].Push(task)
   157  			}
   158  			for {
   159  				if _, found := preemptorTasks[job.UID]; !found {
   160  					break
   161  				}
   162  
   163  				if preemptorTasks[job.UID].Empty() {
   164  					break
   165  				}
   166  
   167  				preemptor := preemptorTasks[job.UID].Pop().(*api.TaskInfo)
   168  
   169  				stmt := framework.NewStatement(ssn)
   170  				assigned, _ := preempt(ssn, stmt, preemptor, func(task *api.TaskInfo) bool {
   171  					// Ignore non running task.
   172  					if !api.PreemptableStatus(task.Status) {
   173  						return false
   174  					}
   175  					// BestEffort pod is not supported to preempt unBestEffort pod.
   176  					if preemptor.BestEffort && !task.BestEffort {
   177  						return false
   178  					}
   179  					// Preempt tasks within job.
   180  					return preemptor.Job == task.Job
   181  				}, ph)
   182  				stmt.Commit()
   183  
   184  				// If no preemption, next job.
   185  				if !assigned {
   186  					break
   187  				}
   188  			}
   189  		}
   190  	}
   191  
   192  	// call victimTasksFn to evict tasks
   193  	victimTasks(ssn)
   194  }
   195  
   196  func (pmpt *Action) UnInitialize() {}
   197  
   198  func preempt(
   199  	ssn *framework.Session,
   200  	stmt *framework.Statement,
   201  	preemptor *api.TaskInfo,
   202  	filter func(*api.TaskInfo) bool,
   203  	predicateHelper util.PredicateHelper,
   204  ) (bool, error) {
   205  	assigned := false
   206  	allNodes := ssn.NodeList
   207  	if err := ssn.PrePredicateFn(preemptor); err != nil {
   208  		return false, fmt.Errorf("PrePredicate for task %s/%s failed for: %v", preemptor.Namespace, preemptor.Name, err)
   209  	}
   210  
   211  	predicateFn := func(task *api.TaskInfo, node *api.NodeInfo) ([]*api.Status, error) {
   212  		var statusSets util.StatusSets
   213  		statusSets, _ = ssn.PredicateFn(task, node)
   214  
   215  		// When filtering candidate nodes, need to consider the node statusSets instead of the err information.
   216  		// refer to kube-scheduler preemption code: https://github.com/kubernetes/kubernetes/blob/9d87fa215d9e8020abdc17132d1252536cd752d2/pkg/scheduler/framework/preemption/preemption.go#L422
   217  		if statusSets.ContainsUnschedulableAndUnresolvable() || statusSets.ContainsErrorSkipOrWait() {
   218  			return nil, api.NewFitError(task, node, statusSets.Message())
   219  		}
   220  		return nil, nil
   221  	}
   222  
   223  	predicateNodes, _ := predicateHelper.PredicateNodes(preemptor, allNodes, predicateFn, true)
   224  
   225  	nodeScores := util.PrioritizeNodes(preemptor, predicateNodes, ssn.BatchNodeOrderFn, ssn.NodeOrderMapFn, ssn.NodeOrderReduceFn)
   226  
   227  	selectedNodes := util.SortNodes(nodeScores)
   228  
   229  	job, found := ssn.Jobs[preemptor.Job]
   230  	if !found {
   231  		return false, fmt.Errorf("Job %s not found in SSN", preemptor.Job)
   232  	}
   233  
   234  	currentQueue := ssn.Queues[job.Queue]
   235  
   236  	for _, node := range selectedNodes {
   237  		klog.V(3).Infof("Considering Task <%s/%s> on Node <%s>.",
   238  			preemptor.Namespace, preemptor.Name, node.Name)
   239  
   240  		var preemptees []*api.TaskInfo
   241  		for _, task := range node.Tasks {
   242  			if filter == nil {
   243  				preemptees = append(preemptees, task.Clone())
   244  			} else if filter(task) {
   245  				preemptees = append(preemptees, task.Clone())
   246  			}
   247  		}
   248  		victims := ssn.Preemptable(preemptor, preemptees)
   249  		metrics.UpdatePreemptionVictimsCount(len(victims))
   250  
   251  		if err := util.ValidateVictims(preemptor, node, victims); err != nil {
   252  			klog.V(3).Infof("No validated victims on Node <%s>: %v", node.Name, err)
   253  			continue
   254  		}
   255  
   256  		victimsQueue := ssn.BuildVictimsPriorityQueue(victims)
   257  		// Preempt victims for tasks, pick lowest priority task first.
   258  		preempted := api.EmptyResource()
   259  
   260  		for !victimsQueue.Empty() {
   261  			// If reclaimed enough resources, break loop to avoid Sub panic.
   262  			// Preempt action is about preempt in same queue, which job is not allocatable in allocate action, due to:
   263  			// 1. cluster has free resource, but queue not allocatable
   264  			// 2. cluster has no free resource, but queue not allocatable
   265  			// 3. cluster has no free resource, but queue allocatable
   266  			// for case 1 and 2, high priority job/task can preempt low priority job/task in same queue;
   267  			// for case 3, it need to do reclaim resource from other queue, in reclaim action;
   268  			// so if current queue is not allocatable(the queue will be overused when consider current preemptor's requests)
   269  			// or current idle resource is not enougth for preemptor, it need to continue preempting
   270  			// otherwise, break out
   271  			if ssn.Allocatable(currentQueue, preemptor) && preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) {
   272  				break
   273  			}
   274  			preemptee := victimsQueue.Pop().(*api.TaskInfo)
   275  			klog.V(3).Infof("Try to preempt Task <%s/%s> for Task <%s/%s>",
   276  				preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name)
   277  			if err := stmt.Evict(preemptee, "preempt"); err != nil {
   278  				klog.Errorf("Failed to preempt Task <%s/%s> for Task <%s/%s>: %v",
   279  					preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name, err)
   280  				continue
   281  			}
   282  			preempted.Add(preemptee.Resreq)
   283  		}
   284  
   285  		metrics.RegisterPreemptionAttempts()
   286  		klog.V(3).Infof("Preempted <%v> for Task <%s/%s> requested <%v>.",
   287  			preempted, preemptor.Namespace, preemptor.Name, preemptor.InitResreq)
   288  
   289  		// If preemptor's queue is overused, it means preemptor can not be allocated. So no need care about the node idle resource
   290  		if ssn.Allocatable(currentQueue, preemptor) && preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) {
   291  			if err := stmt.Pipeline(preemptor, node.Name); err != nil {
   292  				klog.Errorf("Failed to pipeline Task <%s/%s> on Node <%s>",
   293  					preemptor.Namespace, preemptor.Name, node.Name)
   294  			}
   295  
   296  			// Ignore pipeline error, will be corrected in next scheduling loop.
   297  			assigned = true
   298  
   299  			break
   300  		}
   301  	}
   302  
   303  	return assigned, nil
   304  }
   305  
   306  func victimTasks(ssn *framework.Session) {
   307  	stmt := framework.NewStatement(ssn)
   308  	tasks := make([]*api.TaskInfo, 0)
   309  	victimTasksMap := ssn.VictimTasks(tasks)
   310  	for victim := range victimTasksMap {
   311  		if err := stmt.Evict(victim.Clone(), "evict"); err != nil {
   312  			klog.Errorf("Failed to evict Task <%s/%s>: %v",
   313  				victim.Namespace, victim.Name, err)
   314  			continue
   315  		}
   316  	}
   317  	stmt.Commit()
   318  }