volcano.sh/volcano@v1.9.0/pkg/scheduler/util/predicate_helper.go (about)

     1  package util
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"strings"
     7  	"sync"
     8  	"sync/atomic"
     9  
    10  	"k8s.io/client-go/util/workqueue"
    11  	"k8s.io/klog/v2"
    12  
    13  	"volcano.sh/volcano/pkg/scheduler/api"
    14  )
    15  
// PredicateHelper filters a set of candidate nodes for a task.
type PredicateHelper interface {
	// PredicateNodes evaluates fn for task against nodes and returns the nodes
	// that passed together with the fit errors collected for the nodes that
	// did not. enableErrorCache controls whether previously recorded
	// predicate errors may be reused instead of calling fn again.
	PredicateNodes(task *api.TaskInfo, nodes []*api.NodeInfo, fn api.PredicateFn, enableErrorCache bool) ([]*api.NodeInfo, *api.FitErrors)
}
    19  
// predicateHelper is the PredicateHelper implementation returned by
// NewPredicateHelper.
type predicateHelper struct {
	// taskPredicateErrorCache remembers predicate failures. The outer key is
	// the task group ID produced by taskGroupID, the inner key is the node
	// name, and the value is the error the predicate returned for that node.
	taskPredicateErrorCache map[string]map[string]error
}
    23  
    24  // PredicateNodes returns the specified number of nodes that fit a task
    25  func (ph *predicateHelper) PredicateNodes(task *api.TaskInfo, nodes []*api.NodeInfo, fn api.PredicateFn, enableErrorCache bool) ([]*api.NodeInfo, *api.FitErrors) {
    26  	var errorLock sync.RWMutex
    27  	fe := api.NewFitErrors()
    28  
    29  	allNodes := len(nodes)
    30  	if allNodes == 0 {
    31  		return make([]*api.NodeInfo, 0), fe
    32  	}
    33  	numNodesToFind := CalculateNumOfFeasibleNodesToFind(int32(allNodes))
    34  
    35  	//allocate enough space to avoid growing it
    36  	predicateNodes := make([]*api.NodeInfo, numNodesToFind)
    37  
    38  	numFoundNodes := int32(0)
    39  	processedNodes := int32(0)
    40  
    41  	taskGroupid := taskGroupID(task)
    42  	nodeErrorCache, taskFailedBefore := ph.taskPredicateErrorCache[taskGroupid]
    43  	if nodeErrorCache == nil {
    44  		nodeErrorCache = map[string]error{}
    45  	}
    46  
    47  	//create a context with cancellation
    48  	ctx, cancel := context.WithCancel(context.Background())
    49  
    50  	checkNode := func(index int) {
    51  		// Check the nodes starting from where is left off in the previous scheduling cycle,
    52  		// to make sure all nodes have the same chance of being examined across pods.
    53  		node := nodes[(lastProcessedNodeIndex+index)%allNodes]
    54  		atomic.AddInt32(&processedNodes, 1)
    55  		klog.V(4).Infof("Considering Task <%v/%v> on node <%v>: <%v> vs. <%v>",
    56  			task.Namespace, task.Name, node.Name, task.Resreq, node.Idle)
    57  
    58  		// Check if the task had "predicate" failure before.
    59  		// And then check if the task failed to predict on this node before.
    60  		if enableErrorCache && taskFailedBefore {
    61  			errorLock.RLock()
    62  			errC, ok := nodeErrorCache[node.Name]
    63  			errorLock.RUnlock()
    64  
    65  			if ok {
    66  				errorLock.Lock()
    67  				fe.SetNodeError(node.Name, errC)
    68  				errorLock.Unlock()
    69  				return
    70  			}
    71  		}
    72  
    73  		// TODO (k82cn): Enable eCache for performance improvement.
    74  		if _, err := fn(task, node); err != nil {
    75  			klog.V(3).Infof("Predicates failed: %v", err)
    76  			errorLock.Lock()
    77  			nodeErrorCache[node.Name] = err
    78  			ph.taskPredicateErrorCache[taskGroupid] = nodeErrorCache
    79  			fe.SetNodeError(node.Name, err)
    80  			errorLock.Unlock()
    81  			return
    82  		}
    83  
    84  		//check if the number of found nodes is more than the numNodesTofind
    85  		length := atomic.AddInt32(&numFoundNodes, 1)
    86  		if length > numNodesToFind {
    87  			cancel()
    88  			atomic.AddInt32(&numFoundNodes, -1)
    89  		} else {
    90  			predicateNodes[length-1] = node
    91  		}
    92  	}
    93  
    94  	//workqueue.ParallelizeUntil(context.TODO(), 16, len(nodes), checkNode)
    95  	workqueue.ParallelizeUntil(ctx, 16, allNodes, checkNode)
    96  
    97  	//processedNodes := int(numFoundNodes) + len(filteredNodesStatuses) + len(failedPredicateMap)
    98  	lastProcessedNodeIndex = (lastProcessedNodeIndex + int(processedNodes)) % allNodes
    99  	predicateNodes = predicateNodes[:numFoundNodes]
   100  	return predicateNodes, fe
   101  }
   102  
   103  func taskGroupID(task *api.TaskInfo) string {
   104  	return fmt.Sprintf("%s/%s", task.Job, task.GetTaskSpecKey())
   105  }
   106  
   107  func NewPredicateHelper() PredicateHelper {
   108  	return &predicateHelper{taskPredicateErrorCache: map[string]map[string]error{}}
   109  }
   110  
// StatusSets is a list of status pointers with helpers for querying the
// status codes it contains and for collecting the status reasons.
type StatusSets []*api.Status
   112  
   113  func (s StatusSets) ContainsUnschedulable() bool {
   114  	for _, status := range s {
   115  		if status == nil {
   116  			continue
   117  		}
   118  		if status.Code == api.Unschedulable {
   119  			return true
   120  		}
   121  	}
   122  	return false
   123  }
   124  
   125  func (s StatusSets) ContainsUnschedulableAndUnresolvable() bool {
   126  	for _, status := range s {
   127  		if status == nil {
   128  			continue
   129  		}
   130  		if status.Code == api.UnschedulableAndUnresolvable {
   131  			return true
   132  		}
   133  	}
   134  	return false
   135  }
   136  
   137  func (s StatusSets) ContainsErrorSkipOrWait() bool {
   138  	for _, status := range s {
   139  		if status == nil {
   140  			continue
   141  		}
   142  		if status.Code == api.Error || status.Code == api.Skip || status.Code == api.Wait {
   143  			return true
   144  		}
   145  	}
   146  	return false
   147  }
   148  
   149  // Message return the message generated from StatusSets
   150  func (s StatusSets) Message() string {
   151  	if s == nil {
   152  		return ""
   153  	}
   154  	all := make([]string, 0, len(s))
   155  	for _, status := range s {
   156  		if status.Reason == "" {
   157  			continue
   158  		}
   159  		all = append(all, status.Reason)
   160  	}
   161  	return strings.Join(all, ",")
   162  }
   163  
   164  // Reasons return the reasons list
   165  func (s StatusSets) Reasons() []string {
   166  	if s == nil {
   167  		return nil
   168  	}
   169  	all := make([]string, 0, len(s))
   170  	for _, status := range s {
   171  		if status.Reason == "" {
   172  			continue
   173  		}
   174  		all = append(all, status.Reason)
   175  	}
   176  	return all
   177  }