github.com/maier/nomad@v0.4.1-0.20161110003312-a9e3d0b8549d/nomad/core_sched.go

     1  package nomad
     2  
     3  import (
     4  	"fmt"
     5  	"math"
     6  	"time"
     7  
     8  	"github.com/hashicorp/nomad/nomad/state"
     9  	"github.com/hashicorp/nomad/nomad/structs"
    10  	"github.com/hashicorp/nomad/scheduler"
    11  )
    12  
    13  var (
    14  	// maxIdsPerReap is the maximum number of evals and allocations to reap in a
    15  	// single Raft transaction. This is to ensure that the Raft message does not
    16  	// become too large.
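         	// With 36-byte UUID strings this works out to roughly 7,281 IDs per
         	// reap request.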
    17  	maxIdsPerReap = (1024 * 256) / 36 // 0.25 MB of ids.
    18  )
    19  
    20  // CoreScheduler is a special "scheduler" that is registered
     21  // as "_core". It is used to perform various administrative
     22  // tasks across the cluster.
    23  type CoreScheduler struct {
    24  	srv  *Server
    25  	snap *state.StateSnapshot
    26  }
    27  
     28  // NewCoreScheduler is used to return a new core scheduler instance
    29  func NewCoreScheduler(srv *Server, snap *state.StateSnapshot) scheduler.Scheduler {
    30  	s := &CoreScheduler{
    31  		srv:  srv,
    32  		snap: snap,
    33  	}
    34  	return s
    35  }
    36  
    37  // Process is used to implement the scheduler.Scheduler interface
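         // The evaluation's JobID selects which GC routine to run; these core
         // evaluations are typically enqueued by the leader's periodic garbage
         // collector, or carry CoreJobForceGC when a full GC has been requested.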
    38  func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
    39  	switch eval.JobID {
    40  	case structs.CoreJobEvalGC:
    41  		return c.evalGC(eval)
    42  	case structs.CoreJobNodeGC:
    43  		return c.nodeGC(eval)
    44  	case structs.CoreJobJobGC:
    45  		return c.jobGC(eval)
    46  	case structs.CoreJobForceGC:
    47  		return c.forceGC(eval)
    48  	default:
    49  		return fmt.Errorf("core scheduler cannot handle job '%s'", eval.JobID)
    50  	}
    51  }
    52  
    53  // forceGC is used to garbage collect all eligible objects.
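         // It runs job, eval and node GC back to back with the age threshold
         // effectively disabled (see the math.MaxUint64 handling below).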
    54  func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
    55  	if err := c.jobGC(eval); err != nil {
    56  		return err
    57  	}
    58  	if err := c.evalGC(eval); err != nil {
    59  		return err
    60  	}
    61  
    62  	// Node GC must occur after the others to ensure the allocations are
    63  	// cleared.
    64  	return c.nodeGC(eval)
    65  }
    66  
    67  // jobGC is used to garbage collect eligible jobs.
    68  func (c *CoreScheduler) jobGC(eval *structs.Evaluation) error {
    69  	// Get all the jobs eligible for garbage collection.
    70  	iter, err := c.snap.JobsByGC(true)
    71  	if err != nil {
    72  		return err
    73  	}
    74  
    75  	var oldThreshold uint64
    76  	if eval.JobID == structs.CoreJobForceGC {
    77  		// The GC was forced, so set the threshold to its maximum so everything
    78  		// will GC.
    79  		oldThreshold = math.MaxUint64
    80  		c.srv.logger.Println("[DEBUG] sched.core: forced job GC")
    81  	} else {
    82  		// Get the time table to calculate GC cutoffs.
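         		// The time table is a rough mapping from wall-clock times to Raft
         		// indexes, so anything written at or below the index nearest the
         		// cutoff is old enough to collect.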
    83  		tt := c.srv.fsm.TimeTable()
    84  		cutoff := time.Now().UTC().Add(-1 * c.srv.config.JobGCThreshold)
    85  		oldThreshold = tt.NearestIndex(cutoff)
    86  		c.srv.logger.Printf("[DEBUG] sched.core: job GC: scanning before index %d (%v)",
    87  			oldThreshold, c.srv.config.JobGCThreshold)
    88  	}
    89  
    90  	// Collect the allocations, evaluations and jobs to GC
    91  	var gcAlloc, gcEval, gcJob []string
    92  
    93  OUTER:
    94  	for i := iter.Next(); i != nil; i = iter.Next() {
    95  		job := i.(*structs.Job)
    96  
    97  		// Ignore new jobs.
    98  		if job.CreateIndex > oldThreshold {
    99  			continue
   100  		}
   101  
   102  		evals, err := c.snap.EvalsByJob(job.ID)
   103  		if err != nil {
   104  			c.srv.logger.Printf("[ERR] sched.core: failed to get evals for job %s: %v", job.ID, err)
   105  			continue
   106  		}
   107  
   108  		allEvalsGC := true
   109  		var jobAlloc, jobEval []string
   110  		for _, eval := range evals {
   111  			gc, allocs, err := c.gcEval(eval, oldThreshold, true)
   112  			if err != nil {
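         				// If any eval for the job cannot be assessed, skip the
         				// whole job rather than GC'ing it partially.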
   113  				continue OUTER
   114  			}
   115  
   116  			if gc {
   117  				jobEval = append(jobEval, eval.ID)
   118  				jobAlloc = append(jobAlloc, allocs...)
   119  			} else {
   120  				allEvalsGC = false
   121  				break
   122  			}
   123  		}
   124  
   125  		// Job is eligible for garbage collection
   126  		if allEvalsGC {
   127  			gcJob = append(gcJob, job.ID)
   128  			gcAlloc = append(gcAlloc, jobAlloc...)
   129  			gcEval = append(gcEval, jobEval...)
   130  		}
   131  	}
   132  
   133  	// Fast-path the nothing case
   134  	if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 {
   135  		return nil
   136  	}
   137  	c.srv.logger.Printf("[DEBUG] sched.core: job GC: %d jobs, %d evaluations, %d allocs eligible",
   138  		len(gcJob), len(gcEval), len(gcAlloc))
   139  
   140  	// Reap the evals and allocs
   141  	if err := c.evalReap(gcEval, gcAlloc); err != nil {
   142  		return err
   143  	}
   144  
   145  	// Call to the leader to deregister the jobs.
   146  	for _, job := range gcJob {
   147  		req := structs.JobDeregisterRequest{
   148  			JobID: job,
   149  			WriteRequest: structs.WriteRequest{
   150  				Region: c.srv.config.Region,
   151  			},
   152  		}
   153  		var resp structs.JobDeregisterResponse
   154  		if err := c.srv.RPC("Job.Deregister", &req, &resp); err != nil {
   155  			c.srv.logger.Printf("[ERR] sched.core: job deregister failed: %v", err)
   156  			return err
   157  		}
   158  	}
   159  
   160  	return nil
   161  }
   162  
   163  // evalGC is used to garbage collect old evaluations
   164  func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
   165  	// Iterate over the evaluations
   166  	iter, err := c.snap.Evals()
   167  	if err != nil {
   168  		return err
   169  	}
   170  
   171  	var oldThreshold uint64
   172  	if eval.JobID == structs.CoreJobForceGC {
   173  		// The GC was forced, so set the threshold to its maximum so everything
   174  		// will GC.
   175  		oldThreshold = math.MaxUint64
   176  		c.srv.logger.Println("[DEBUG] sched.core: forced eval GC")
   177  	} else {
   178  		// Compute the old threshold limit for GC using the FSM
   179  		// time table.  This is a rough mapping of a time to the
   180  		// Raft index it belongs to.
   181  		tt := c.srv.fsm.TimeTable()
   182  		cutoff := time.Now().UTC().Add(-1 * c.srv.config.EvalGCThreshold)
   183  		oldThreshold = tt.NearestIndex(cutoff)
   184  		c.srv.logger.Printf("[DEBUG] sched.core: eval GC: scanning before index %d (%v)",
   185  			oldThreshold, c.srv.config.EvalGCThreshold)
   186  	}
   187  
   188  	// Collect the allocations and evaluations to GC
   189  	var gcAlloc, gcEval []string
   190  	for raw := iter.Next(); raw != nil; raw = iter.Next() {
   191  		eval := raw.(*structs.Evaluation)
   192  
   193  		// The Evaluation GC should not handle batch jobs since those need to be
   194  		// garbage collected in one shot
   195  		gc, allocs, err := c.gcEval(eval, oldThreshold, false)
   196  		if err != nil {
   197  			return err
   198  		}
   199  
   200  		if gc {
   201  			gcEval = append(gcEval, eval.ID)
   202  		}
   203  		gcAlloc = append(gcAlloc, allocs...)
   204  	}
   205  
   206  	// Fast-path the nothing case
   207  	if len(gcEval) == 0 && len(gcAlloc) == 0 {
   208  		return nil
   209  	}
   210  	c.srv.logger.Printf("[DEBUG] sched.core: eval GC: %d evaluations, %d allocs eligible",
   211  		len(gcEval), len(gcAlloc))
   212  
   213  	return c.evalReap(gcEval, gcAlloc)
   214  }
   215  
    216  // gcEval returns whether the eval should be garbage collected given a Raft
    217  // threshold index. The eval is not eligible for garbage collection if it, or
    218  // any of its allocations, is not yet terminal or is newer than the threshold
    219  // index. If the eval should be garbage collected, the IDs of the allocations
    220  // that should be removed along with it are also returned.
   221  func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, allowBatch bool) (
   222  	bool, []string, error) {
   223  	// Ignore non-terminal and new evaluations
   224  	if !eval.TerminalStatus() || eval.ModifyIndex > thresholdIndex {
   225  		return false, nil, nil
   226  	}
   227  
   228  	// If the eval is from a running "batch" job we don't want to garbage
   229  	// collect its allocations. If there is a long running batch job and its
   230  	// terminal allocations get GC'd the scheduler would re-run the
   231  	// allocations.
   232  	if eval.Type == structs.JobTypeBatch {
   233  		if !allowBatch {
   234  			return false, nil, nil
   235  		}
   236  
   237  		// Check if the job is running
   238  		job, err := c.snap.JobByID(eval.JobID)
   239  		if err != nil {
   240  			return false, nil, err
   241  		}
   242  
   243  		// We don't want to gc anything related to a job which is not dead
   244  		if job != nil && job.Status != structs.JobStatusDead {
   245  			return false, nil, nil
   246  		}
   247  	}
   248  
   249  	// Get the allocations by eval
   250  	allocs, err := c.snap.AllocsByEval(eval.ID)
   251  	if err != nil {
   252  		c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for eval %s: %v",
   253  			eval.ID, err)
   254  		return false, nil, err
   255  	}
   256  
   257  	// Scan the allocations to ensure they are terminal and old
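         	// A single live or recent alloc blocks collection of the eval itself,
         	// but the old terminal allocs found here are still returned so the
         	// caller may reap them independently.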
   258  	gcEval := true
   259  	var gcAllocIDs []string
   260  	for _, alloc := range allocs {
   261  		if !alloc.TerminalStatus() || alloc.ModifyIndex > thresholdIndex {
   262  			// Can't GC the evaluation since not all of the allocations are
   263  			// terminal
   264  			gcEval = false
   265  		} else {
   266  			// The allocation is eligible to be GC'd
   267  			gcAllocIDs = append(gcAllocIDs, alloc.ID)
   268  		}
   269  	}
   270  
   271  	return gcEval, gcAllocIDs, nil
   272  }
   273  
   274  // evalReap contacts the leader and issues a reap on the passed evals and
   275  // allocs.
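         // The IDs are split via partitionReap so that no single Eval.Reap RPC
         // carries more than maxIdsPerReap IDs.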
   276  func (c *CoreScheduler) evalReap(evals, allocs []string) error {
   277  	// Call to the leader to issue the reap
   278  	for _, req := range c.partitionReap(evals, allocs) {
   279  		var resp structs.GenericResponse
   280  		if err := c.srv.RPC("Eval.Reap", req, &resp); err != nil {
   281  			c.srv.logger.Printf("[ERR] sched.core: eval reap failed: %v", err)
   282  			return err
   283  		}
   284  	}
   285  
   286  	return nil
   287  }
   288  
   289  // partitionReap returns a list of EvalDeleteRequest to make, ensuring a single
   290  // request does not contain too many allocations and evaluations. This is
   291  // necessary to ensure that the Raft transaction does not become too large.
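         // For example, with the default maxIdsPerReap of 7,281, reaping 10,000
         // alloc IDs and 500 eval IDs yields two requests: the first carries 7,281
         // allocs, and the second carries the remaining 2,719 allocs plus all 500
         // evals.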
   292  func (c *CoreScheduler) partitionReap(evals, allocs []string) []*structs.EvalDeleteRequest {
   293  	var requests []*structs.EvalDeleteRequest
   294  	submittedEvals, submittedAllocs := 0, 0
   295  	for submittedEvals != len(evals) || submittedAllocs != len(allocs) {
   296  		req := &structs.EvalDeleteRequest{
   297  			WriteRequest: structs.WriteRequest{
   298  				Region: c.srv.config.Region,
   299  			},
   300  		}
   301  		requests = append(requests, req)
   302  		available := maxIdsPerReap
   303  
   304  		// Add the allocs first
   305  		if remaining := len(allocs) - submittedAllocs; remaining > 0 {
   306  			if remaining <= available {
   307  				req.Allocs = allocs[submittedAllocs:]
   308  				available -= remaining
   309  				submittedAllocs += remaining
   310  			} else {
   311  				req.Allocs = allocs[submittedAllocs : submittedAllocs+available]
   312  				submittedAllocs += available
   313  
   314  				// Exhausted space so skip adding evals
   315  				continue
   316  			}
   317  		}
   318  
   319  		// Add the evals
   320  		if remaining := len(evals) - submittedEvals; remaining > 0 {
   321  			if remaining <= available {
   322  				req.Evals = evals[submittedEvals:]
   323  				submittedEvals += remaining
   324  			} else {
   325  				req.Evals = evals[submittedEvals : submittedEvals+available]
   326  				submittedEvals += available
   327  			}
   328  		}
   329  	}
   330  
   331  	return requests
   332  }
   333  
   334  // nodeGC is used to garbage collect old nodes
   335  func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error {
    336  	// Iterate over the nodes
   337  	iter, err := c.snap.Nodes()
   338  	if err != nil {
   339  		return err
   340  	}
   341  
   342  	var oldThreshold uint64
   343  	if eval.JobID == structs.CoreJobForceGC {
   344  		// The GC was forced, so set the threshold to its maximum so everything
   345  		// will GC.
   346  		oldThreshold = math.MaxUint64
   347  		c.srv.logger.Println("[DEBUG] sched.core: forced node GC")
   348  	} else {
   349  		// Compute the old threshold limit for GC using the FSM
   350  		// time table.  This is a rough mapping of a time to the
   351  		// Raft index it belongs to.
   352  		tt := c.srv.fsm.TimeTable()
   353  		cutoff := time.Now().UTC().Add(-1 * c.srv.config.NodeGCThreshold)
   354  		oldThreshold = tt.NearestIndex(cutoff)
   355  		c.srv.logger.Printf("[DEBUG] sched.core: node GC: scanning before index %d (%v)",
   356  			oldThreshold, c.srv.config.NodeGCThreshold)
   357  	}
   358  
   359  	// Collect the nodes to GC
   360  	var gcNode []string
   361  OUTER:
   362  	for {
   363  		raw := iter.Next()
   364  		if raw == nil {
   365  			break
   366  		}
   367  		node := raw.(*structs.Node)
   368  
   369  		// Ignore non-terminal and new nodes
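         		// (a node is terminal once its status is down)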
   370  		if !node.TerminalStatus() || node.ModifyIndex > oldThreshold {
   371  			continue
   372  		}
   373  
   374  		// Get the allocations by node
   375  		allocs, err := c.snap.AllocsByNode(node.ID)
   376  		if err != nil {
   377  			c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for node %s: %v",
    378  				node.ID, err)
   379  			continue
   380  		}
   381  
   382  		// If there are any non-terminal allocations, skip the node. If the node
   383  		// is terminal and the allocations are not, the scheduler may not have
   384  		// run yet to transition the allocs on the node to terminal. We delay
   385  		// GC'ing until this happens.
   386  		for _, alloc := range allocs {
   387  			if !alloc.TerminalStatus() {
   388  				continue OUTER
   389  			}
   390  		}
   391  
   392  		// Node is eligible for garbage collection
   393  		gcNode = append(gcNode, node.ID)
   394  	}
   395  
   396  	// Fast-path the nothing case
   397  	if len(gcNode) == 0 {
   398  		return nil
   399  	}
   400  	c.srv.logger.Printf("[DEBUG] sched.core: node GC: %d nodes eligible", len(gcNode))
   401  
   402  	// Call to the leader to issue the reap
   403  	for _, nodeID := range gcNode {
   404  		req := structs.NodeDeregisterRequest{
   405  			NodeID: nodeID,
   406  			WriteRequest: structs.WriteRequest{
   407  				Region: c.srv.config.Region,
   408  			},
   409  		}
   410  		var resp structs.NodeUpdateResponse
   411  		if err := c.srv.RPC("Node.Deregister", &req, &resp); err != nil {
   412  			c.srv.logger.Printf("[ERR] sched.core: node '%s' reap failed: %v", nodeID, err)
   413  			return err
   414  		}
   415  	}
   416  	return nil
   417  }