github.com/djenriquez/nomad-1@v0.8.1/scheduler/stack.go

     1  package scheduler
     2  
     3  import (
     4  	"math"
     5  	"time"
     6  
     7  	"github.com/hashicorp/nomad/nomad/structs"
     8  )
     9  
    10  const (
    11  	// serviceJobAntiAffinityPenalty is the penalty applied
    12  	// to the score for placing an alloc on a node that
    13  	// already has an alloc for this job.
    14  	serviceJobAntiAffinityPenalty = 20.0
    15  
    16  	// batchJobAntiAffinityPenalty is the same as the
    17  	// serviceJobAntiAffinityPenalty but for batch type jobs.
    18  	batchJobAntiAffinityPenalty = 10.0
    19  
    20  	// previousFailedAllocNodePenalty is a scoring penalty for nodes
    21  	// on which a failed allocation previously ran.
    22  	previousFailedAllocNodePenalty = 50.0
    23  
    24  	// skipScoreThreshold is a threshold used in the limit iterator to skip nodes
    25  	// that have a score lower than this. -10 is the highest possible score for a
    26  	// node with penalty (based on batchJobAntiAffinityPenalty)
    27  	skipScoreThreshold = -10.0
    28  
    29  	// maxSkip limits the number of nodes that can be skipped in the limit iterator
    30  	maxSkip = 3
    31  )
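
        // examplePenalty is a rough sketch (not part of the scheduler) of how the
        // penalties above combine, assuming the anti-affinity iterators each subtract
        // their penalty from a node's score: a node that already runs an alloc of this
        // service job and that previously ran a failed alloc would be docked
        // 20 + 50 points, while the same situation for a batch job costs 10 + 50.
        func examplePenalty(batch, runsSameJob, ranFailedAlloc bool) float64 {
        	penalty := 0.0
        	if runsSameJob {
        		if batch {
        			penalty += batchJobAntiAffinityPenalty
        		} else {
        			penalty += serviceJobAntiAffinityPenalty
        		}
        	}
        	if ranFailedAlloc {
        		penalty += previousFailedAllocNodePenalty
        	}
        	return penalty
        }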
    32  
    33  // Stack is a chained collection of iterators. The stack is used to
    34  // make placement decisions. Different schedulers may customize the
    35  // stack they use to vary the way placements are made.
    36  type Stack interface {
    37  	// SetNodes is used to set the base set of potential nodes
    38  	SetNodes([]*structs.Node)
    39  
    40  	// SetJob is used to set the job for selection
    41  	SetJob(job *structs.Job)
    42  
    43  	// Select is used to select a node for the task group
    44  	Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources)
    45  }
    46  
    47  type SelectOptions struct {
    48  	PenaltyNodeIDs map[string]struct{}
    49  	PreferredNodes []*structs.Node
    50  }
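
        // exampleUseStack is a minimal sketch (not called by the scheduler) of the
        // order in which a Stack is typically driven: seed it with the candidate
        // nodes, set the job, then run Select once per task group. The helper name
        // and the empty SelectOptions are illustrative only.
        func exampleUseStack(stack Stack, nodes []*structs.Node, job *structs.Job) map[string]*RankedNode {
        	stack.SetNodes(nodes)
        	stack.SetJob(job)

        	picks := make(map[string]*RankedNode)
        	for _, tg := range job.TaskGroups {
        		if option, _ := stack.Select(tg, &SelectOptions{}); option != nil {
        			picks[tg.Name] = option
        		}
        	}
        	return picks
        }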
    51  
    52  // GenericStack is the Stack used for the Generic scheduler. It is
    53  // designed to make better placement decisions at the cost of performance.
    54  type GenericStack struct {
    55  	batch  bool
    56  	ctx    Context
    57  	source *StaticIterator
    58  
    59  	wrappedChecks       *FeasibilityWrapper
    60  	quota               FeasibleIterator
    61  	jobConstraint       *ConstraintChecker
    62  	taskGroupDrivers    *DriverChecker
    63  	taskGroupConstraint *ConstraintChecker
    64  
    65  	distinctHostsConstraint    *DistinctHostsIterator
    66  	distinctPropertyConstraint *DistinctPropertyIterator
    67  	binPack                    *BinPackIterator
    68  	jobAntiAff                 *JobAntiAffinityIterator
    69  	nodeAntiAff                *NodeAntiAffinityIterator
    70  	limit                      *LimitIterator
    71  	maxScore                   *MaxScoreIterator
    72  }
    73  
    74  // NewGenericStack constructs a stack used for selecting service and batch job placements
    75  func NewGenericStack(batch bool, ctx Context) *GenericStack {
    76  	// Create a new stack
    77  	s := &GenericStack{
    78  		batch: batch,
    79  		ctx:   ctx,
    80  	}
    81  
    82  	// Create the source iterator. We randomize the order we visit nodes
    83  	// to reduce collisions between schedulers and to provide basic load
    84  	// balancing across eligible nodes.
    85  	s.source = NewRandomIterator(ctx, nil)
    86  
    87  	// Create the quota iterator to determine if placements would cause the
    88  	// quota attached to the job's namespace to be exceeded.
    89  	s.quota = NewQuotaIterator(ctx, s.source)
    90  
    91  	// Attach the job constraints. The job is filled in later.
    92  	s.jobConstraint = NewConstraintChecker(ctx, nil)
    93  
    94  	// Filter on task group drivers first as they are faster
    95  	s.taskGroupDrivers = NewDriverChecker(ctx, nil)
    96  
    97  	// Filter on task group constraints second
    98  	s.taskGroupConstraint = NewConstraintChecker(ctx, nil)
    99  
   100  	// Create the feasibility wrapper, which wraps all feasibility checks so
   101  	// that checking can be skipped if the computed node class has previously
   102  	// been marked as eligible or ineligible. Generally these are checks that
   103  	// only need to examine a single node to determine feasibility.
   104  	jobs := []FeasibilityChecker{s.jobConstraint}
   105  	tgs := []FeasibilityChecker{s.taskGroupDrivers, s.taskGroupConstraint}
   106  	s.wrappedChecks = NewFeasibilityWrapper(ctx, s.quota, jobs, tgs)
   107  
   108  	// Filter on distinct host constraints.
   109  	s.distinctHostsConstraint = NewDistinctHostsIterator(ctx, s.wrappedChecks)
   110  
   111  	// Filter on distinct property constraints.
   112  	s.distinctPropertyConstraint = NewDistinctPropertyIterator(ctx, s.distinctHostsConstraint)
   113  
   114  	// Upgrade from feasible to rank iterator
   115  	rankSource := NewFeasibleRankIterator(ctx, s.distinctPropertyConstraint)
   116  
   117  	// Apply the bin packing; this depends on the resources needed
   118  	// by a particular task group. Only enable eviction for the service
   119  	// scheduler as that logic is expensive.
   120  	evict := !batch
   121  	s.binPack = NewBinPackIterator(ctx, rankSource, evict, 0)
   122  
   123  	// Apply the job anti-affinity iterator. This is to avoid placing
   124  	// multiple allocations on the same node for this job. The penalty
   125  	// is smaller for batch jobs, where co-location matters less.
   126  	penalty := serviceJobAntiAffinityPenalty
   127  	if batch {
   128  		penalty = batchJobAntiAffinityPenalty
   129  	}
   130  	s.jobAntiAff = NewJobAntiAffinityIterator(ctx, s.binPack, penalty, "")
   131  
   132  	s.nodeAntiAff = NewNodeAntiAffinityIterator(ctx, s.jobAntiAff, previousFailedAllocNodePenalty)
   133  
   134  	// Apply a limit function. This is to avoid scanning *every* possible node.
   135  	s.limit = NewLimitIterator(ctx, s.nodeAntiAff, 2, skipScoreThreshold, maxSkip)
   136  
   137  	// Select the node with the maximum score for placement
   138  	s.maxScore = NewMaxScoreIterator(ctx, s.limit)
   139  	return s
   140  }
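
        // For reference, the resulting generic placement chain is:
        //
        //	random source -> quota -> feasibility wrapper (job constraints; task group
        //	drivers, then task group constraints) -> distinct_hosts -> distinct_property
        //	-> feasible-to-rank upgrade -> bin pack -> job anti-affinity -> node
        //	anti-affinity -> limit -> max score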
   141  
   142  func (s *GenericStack) SetNodes(baseNodes []*structs.Node) {
   143  	// Shuffle base nodes
   144  	shuffleNodes(baseNodes)
   145  
   146  	// Update the set of base nodes
   147  	s.source.SetNodes(baseNodes)
   148  
   149  	// Apply a limit function. This is to avoid scanning *every* possible node.
   150  	// For batch jobs we only need to evaluate 2 options and depend on the
   151  	// power of two choices. For service jobs we need to visit "enough".
   152  	// Using a log of the total number of nodes is a good restriction, with
   153  	// at least 2 as the floor.
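        	// For example, with 100 eligible nodes a service job evaluates
        	// ceil(log2(100)) = 7 candidate options, while a batch job always
        	// evaluates 2.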
   154  	limit := 2
   155  	if n := len(baseNodes); !s.batch && n > 0 {
   156  		logLimit := int(math.Ceil(math.Log2(float64(n))))
   157  		if logLimit > limit {
   158  			limit = logLimit
   159  		}
   160  	}
   161  	s.limit.SetLimit(limit)
   162  }
   163  
   164  func (s *GenericStack) SetJob(job *structs.Job) {
   165  	s.jobConstraint.SetConstraints(job.Constraints)
   166  	s.distinctHostsConstraint.SetJob(job)
   167  	s.distinctPropertyConstraint.SetJob(job)
   168  	s.binPack.SetPriority(job.Priority)
   169  	s.jobAntiAff.SetJob(job.ID)
   170  	s.ctx.Eligibility().SetJob(job)
   171  
   172  	if contextual, ok := s.quota.(ContextualIterator); ok {
   173  		contextual.SetJob(job)
   174  	}
   175  }
   176  
   177  func (s *GenericStack) Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources) {
   178  
   179  	// This block tries to select from the preferred nodes if the options specify any,
   180  	// restoring the original node set and falling back to it if no preferred node fits.
   181  	if options != nil && len(options.PreferredNodes) > 0 {
   182  		originalNodes := s.source.nodes
   183  		s.source.SetNodes(options.PreferredNodes)
   184  		optionsNew := *options
   185  		optionsNew.PreferredNodes = nil
   186  		if option, resources := s.Select(tg, &optionsNew); option != nil {
   187  			s.source.SetNodes(originalNodes)
   188  			return option, resources
   189  		}
   190  		s.source.SetNodes(originalNodes)
   191  		return s.Select(tg, &optionsNew)
   192  	}
   193  
   194  	// Reset the max selector and context
   195  	s.maxScore.Reset()
   196  	s.ctx.Reset()
   197  	start := time.Now()
   198  
   199  	// Get the task group's constraints.
   200  	tgConstr := taskGroupConstraints(tg)
   201  
   202  	// Update the parameters of iterators
   203  	s.taskGroupDrivers.SetDrivers(tgConstr.drivers)
   204  	s.taskGroupConstraint.SetConstraints(tgConstr.constraints)
   205  	s.distinctHostsConstraint.SetTaskGroup(tg)
   206  	s.distinctPropertyConstraint.SetTaskGroup(tg)
   207  	s.wrappedChecks.SetTaskGroup(tg.Name)
   208  	s.binPack.SetTaskGroup(tg)
   209  	if options != nil {
   210  		s.nodeAntiAff.SetPenaltyNodes(options.PenaltyNodeIDs)
   211  	}
   212  
   213  	if contextual, ok := s.quota.(ContextualIterator); ok {
   214  		contextual.SetTaskGroup(tg)
   215  	}
   216  
   217  	// Find the node with the max score
   218  	option := s.maxScore.Next()
   219  
   220  	// Ensure that the task resources were specified
   221  	if option != nil && len(option.TaskResources) != len(tg.Tasks) {
   222  		for _, task := range tg.Tasks {
   223  			option.SetTaskResources(task, task.Resources)
   224  		}
   225  	}
   226  
   227  	// Store the compute time
   228  	s.ctx.Metrics().AllocationTime = time.Since(start)
   229  	return option, tgConstr.size
   230  }
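
        // exampleSelectWithPenalties is a minimal sketch (not used by the scheduler)
        // of how a caller might bias Select away from nodes that previously ran
        // failed allocations while still preferring a particular node, e.g. the one
        // the alloc last ran on. The helper name and variables are illustrative only.
        func exampleSelectWithPenalties(stack Stack, tg *structs.TaskGroup, preferred *structs.Node, failedNodeIDs []string) *RankedNode {
        	penalty := make(map[string]struct{}, len(failedNodeIDs))
        	for _, id := range failedNodeIDs {
        		penalty[id] = struct{}{}
        	}

        	option, _ := stack.Select(tg, &SelectOptions{
        		PenaltyNodeIDs: penalty,
        		PreferredNodes: []*structs.Node{preferred},
        	})
        	return option
        }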
   231  
   232  // SystemStack is the Stack used for the System scheduler. It is designed to
   233  // attempt to make placements on all nodes.
   234  type SystemStack struct {
   235  	ctx                        Context
   236  	source                     *StaticIterator
   237  	wrappedChecks              *FeasibilityWrapper
   238  	quota                      FeasibleIterator
   239  	jobConstraint              *ConstraintChecker
   240  	taskGroupDrivers           *DriverChecker
   241  	taskGroupConstraint        *ConstraintChecker
   242  	distinctPropertyConstraint *DistinctPropertyIterator
   243  	binPack                    *BinPackIterator
   244  }
   245  
   246  // NewSystemStack constructs a stack used for selecting system job placements
   247  func NewSystemStack(ctx Context) *SystemStack {
   248  	// Create a new stack
   249  	s := &SystemStack{ctx: ctx}
   250  
   251  	// Create the source iterator. We visit nodes in a linear order because we
   252  	// have to evaluate every node.
   253  	s.source = NewStaticIterator(ctx, nil)
   254  
   255  	// Create the quota iterator to determine if placements would cause the
   256  	// quota attached to the job's namespace to be exceeded.
   257  	s.quota = NewQuotaIterator(ctx, s.source)
   258  
   259  	// Attach the job constraints. The job is filled in later.
   260  	s.jobConstraint = NewConstraintChecker(ctx, nil)
   261  
   262  	// Filter on task group drivers first as they are faster
   263  	s.taskGroupDrivers = NewDriverChecker(ctx, nil)
   264  
   265  	// Filter on task group constraints second
   266  	s.taskGroupConstraint = NewConstraintChecker(ctx, nil)
   267  
   268  	// Create the feasibility wrapper, which wraps all feasibility checks so
   269  	// that checking can be skipped if the computed node class has previously
   270  	// been marked as eligible or ineligible. Generally these are checks that
   271  	// only need to examine a single node to determine feasibility.
   272  	jobs := []FeasibilityChecker{s.jobConstraint}
   273  	tgs := []FeasibilityChecker{s.taskGroupDrivers, s.taskGroupConstraint}
   274  	s.wrappedChecks = NewFeasibilityWrapper(ctx, s.quota, jobs, tgs)
   275  
   276  	// Filter on distinct property constraints.
   277  	s.distinctPropertyConstraint = NewDistinctPropertyIterator(ctx, s.wrappedChecks)
   278  
   279  	// Upgrade from feasible to rank iterator
   280  	rankSource := NewFeasibleRankIterator(ctx, s.distinctPropertyConstraint)
   281  
   282  	// Apply the bin packing; this depends on the resources needed
   283  	// by a particular task group. Enable eviction as system jobs are high
   284  	// priority.
   285  	s.binPack = NewBinPackIterator(ctx, rankSource, true, 0)
   286  	return s
   287  }
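
        // For reference, the resulting system placement chain is:
        //
        //	static source -> quota -> feasibility wrapper (job constraints; task group
        //	drivers, then task group constraints) -> distinct_property ->
        //	feasible-to-rank upgrade -> bin pack
        //
        // Unlike the generic chain there is no anti-affinity, limit, or max-score
        // stage, since the system scheduler attempts a placement on every node rather
        // than choosing among candidates.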
   288  
   289  func (s *SystemStack) SetNodes(baseNodes []*structs.Node) {
   290  	// Update the set of base nodes
   291  	s.source.SetNodes(baseNodes)
   292  }
   293  
   294  func (s *SystemStack) SetJob(job *structs.Job) {
   295  	s.jobConstraint.SetConstraints(job.Constraints)
   296  	s.distinctPropertyConstraint.SetJob(job)
   297  	s.binPack.SetPriority(job.Priority)
   298  	s.ctx.Eligibility().SetJob(job)
   299  
   300  	if contextual, ok := s.quota.(ContextualIterator); ok {
   301  		contextual.SetJob(job)
   302  	}
   303  }
   304  
   305  func (s *SystemStack) Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources) {
   306  	// Reset the binpack selector and context
   307  	s.binPack.Reset()
   308  	s.ctx.Reset()
   309  	start := time.Now()
   310  
   311  	// Get the task group's constraints.
   312  	tgConstr := taskGroupConstraints(tg)
   313  
   314  	// Update the parameters of iterators
   315  	s.taskGroupDrivers.SetDrivers(tgConstr.drivers)
   316  	s.taskGroupConstraint.SetConstraints(tgConstr.constraints)
   317  	s.wrappedChecks.SetTaskGroup(tg.Name)
   318  	s.distinctPropertyConstraint.SetTaskGroup(tg)
   319  	s.binPack.SetTaskGroup(tg)
   320  
   321  	if contextual, ok := s.quota.(ContextualIterator); ok {
   322  		contextual.SetTaskGroup(tg)
   323  	}
   324  
   325  	// Get the next option that satisfies the constraints.
   326  	option := s.binPack.Next()
   327  
   328  	// Ensure that the task resources were specified
   329  	if option != nil && len(option.TaskResources) != len(tg.Tasks) {
   330  		for _, task := range tg.Tasks {
   331  			option.SetTaskResources(task, task.Resources)
   332  		}
   333  	}
   334  
   335  	// Store the compute time
   336  	s.ctx.Metrics().AllocationTime = time.Since(start)
   337  	return option, tgConstr.size
   338  }