github.com/taylorchu/nomad@v0.5.3-rc1.0.20170407200202-db11e7dd7b55/nomad/core_sched.go

package nomad

import (
	"fmt"
	"math"
	"time"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

var (
	// maxIdsPerReap is the maximum number of evals and allocations to reap in a
	// single Raft transaction. This is to ensure that the Raft message does not
	// become too large.
	maxIdsPerReap = (1024 * 256) / 36 // 0.25 MB of ids (a UUID string is 36 bytes).
)

// CoreScheduler is a special "scheduler" that is registered
// as "_core". It is used to run various administrative work
// across the cluster.
type CoreScheduler struct {
	srv  *Server
	snap *state.StateSnapshot
}

// NewCoreScheduler is used to return a new core scheduler instance
func NewCoreScheduler(srv *Server, snap *state.StateSnapshot) scheduler.Scheduler {
	s := &CoreScheduler{
		srv:  srv,
		snap: snap,
	}
	return s
}

// Process is used to implement the scheduler.Scheduler interface
func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
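	// Core evaluations carry the core job to run in their JobID field, so
	// dispatch on it directly.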
	switch eval.JobID {
	case structs.CoreJobEvalGC:
		return c.evalGC(eval)
	case structs.CoreJobNodeGC:
		return c.nodeGC(eval)
	case structs.CoreJobJobGC:
		return c.jobGC(eval)
	case structs.CoreJobForceGC:
		return c.forceGC(eval)
	default:
		return fmt.Errorf("core scheduler cannot handle job '%s'", eval.JobID)
	}
}

// forceGC is used to garbage collect all eligible objects.
func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
	if err := c.jobGC(eval); err != nil {
		return err
	}
	if err := c.evalGC(eval); err != nil {
		return err
	}

	// Node GC must occur after the others to ensure the allocations are
	// cleared.
	return c.nodeGC(eval)
}

// jobGC is used to garbage collect eligible jobs.
func (c *CoreScheduler) jobGC(eval *structs.Evaluation) error {
	// Get all the jobs eligible for garbage collection.
	ws := memdb.NewWatchSet()
	iter, err := c.snap.JobsByGC(ws, true)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced job GC")
	} else {
		// Get the time table to calculate GC cutoffs.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.JobGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: job GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.JobGCThreshold)
	}

	// Collect the allocations, evaluations and jobs to GC
	var gcAlloc, gcEval, gcJob []string

OUTER:
	for i := iter.Next(); i != nil; i = iter.Next() {
		job := i.(*structs.Job)

		// Ignore new jobs.
		if job.CreateIndex > oldThreshold {
			continue
		}

		ws := memdb.NewWatchSet()
		evals, err := c.snap.EvalsByJob(ws, job.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get evals for job %s: %v", job.ID, err)
			continue
		}

		allEvalsGC := true
		var jobAlloc, jobEval []string
		for _, eval := range evals {
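			// allowBatch is true here: the job itself is a GC candidate, so
			// terminal batch evals and their allocs may be collected along
			// with it.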
			gc, allocs, err := c.gcEval(eval, oldThreshold, true)
			if err != nil {
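				// Skip the entire job if any of its evals cannot be checked
				// for GC; it will be reconsidered on the next GC cycle.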
				continue OUTER
			}

			if gc {
				jobEval = append(jobEval, eval.ID)
				jobAlloc = append(jobAlloc, allocs...)
			} else {
				allEvalsGC = false
				break
			}
		}

		// Job is eligible for garbage collection
		if allEvalsGC {
			gcJob = append(gcJob, job.ID)
			gcAlloc = append(gcAlloc, jobAlloc...)
			gcEval = append(gcEval, jobEval...)
		}
	}

	// Fast-path the nothing case
	if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: job GC: %d jobs, %d evaluations, %d allocs eligible",
		len(gcJob), len(gcEval), len(gcAlloc))

	// Reap the evals and allocs
	if err := c.evalReap(gcEval, gcAlloc); err != nil {
		return err
	}

	// Call to the leader to deregister the jobs.
	for _, job := range gcJob {
		req := structs.JobDeregisterRequest{
			JobID: job,
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		var resp structs.JobDeregisterResponse
		if err := c.srv.RPC("Job.Deregister", &req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: job deregister failed: %v", err)
			return err
		}
	}

	return nil
}

// evalGC is used to garbage collect old evaluations
func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
	// Iterate over the evaluations
	ws := memdb.NewWatchSet()
	iter, err := c.snap.Evals(ws)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced eval GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table.  This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.EvalGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: eval GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.EvalGCThreshold)
	}

	// Collect the allocations and evaluations to GC
	var gcAlloc, gcEval []string
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		eval := raw.(*structs.Evaluation)

		// The Evaluation GC should not handle batch jobs since those need to be
		// garbage collected in one shot
		gc, allocs, err := c.gcEval(eval, oldThreshold, false)
		if err != nil {
			return err
		}

		if gc {
			gcEval = append(gcEval, eval.ID)
		}
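		// Any returned allocs are already terminal and old enough, so they
		// can be reaped even when the eval itself is kept.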
		gcAlloc = append(gcAlloc, allocs...)
	}

	// Fast-path the nothing case
	if len(gcEval) == 0 && len(gcAlloc) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: eval GC: %d evaluations, %d allocs eligible",
		len(gcEval), len(gcAlloc))

	return c.evalReap(gcEval, gcAlloc)
}

// gcEval returns whether the eval should be garbage collected given a Raft
// threshold index. The eval is disqualified from garbage collection if it or
// its allocs are not older than the threshold. The IDs of any allocs that are
// themselves eligible for removal are also returned.
func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, allowBatch bool) (
	bool, []string, error) {
	// Ignore non-terminal and new evaluations
	if !eval.TerminalStatus() || eval.ModifyIndex > thresholdIndex {
		return false, nil, nil
	}

	// Create a watchset
	ws := memdb.NewWatchSet()

	// If the eval is from a running "batch" job we don't want to garbage
	// collect its allocations. If there is a long running batch job and its
	// terminal allocations get GC'd the scheduler would re-run the
	// allocations.
	if eval.Type == structs.JobTypeBatch {
		// Check if the job is running
		job, err := c.snap.JobByID(ws, eval.JobID)
		if err != nil {
			return false, nil, err
		}

		// We don't want to GC anything related to a job which is not dead.
		// If the batch job doesn't exist, we can GC it regardless of allowBatch.
		if job != nil && (!allowBatch || job.Status != structs.JobStatusDead) {
			return false, nil, nil
		}
	}

	// Get the allocations by eval
	allocs, err := c.snap.AllocsByEval(ws, eval.ID)
	if err != nil {
		c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for eval %s: %v",
			eval.ID, err)
		return false, nil, err
	}

	// Scan the allocations to ensure they are terminal and old
	gcEval := true
	var gcAllocIDs []string
	for _, alloc := range allocs {
		if !alloc.TerminalStatus() || alloc.ModifyIndex > thresholdIndex {
			// Can't GC the evaluation since not all of the allocations are
			// terminal
			gcEval = false
		} else {
			// The allocation is eligible to be GC'd
			gcAllocIDs = append(gcAllocIDs, alloc.ID)
		}
	}

	return gcEval, gcAllocIDs, nil
}

// evalReap contacts the leader and issues a reap on the passed evals and
// allocs.
func (c *CoreScheduler) evalReap(evals, allocs []string) error {
	// Call to the leader to issue the reap
	for _, req := range c.partitionReap(evals, allocs) {
		var resp structs.GenericResponse
		if err := c.srv.RPC("Eval.Reap", req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: eval reap failed: %v", err)
			return err
		}
	}

	return nil
}

// partitionReap returns a list of EvalDeleteRequest to make, ensuring a single
// request does not contain too many allocations and evaluations. This is
// necessary to ensure that the Raft transaction does not become too large.
func (c *CoreScheduler) partitionReap(evals, allocs []string) []*structs.EvalDeleteRequest {
	var requests []*structs.EvalDeleteRequest
	submittedEvals, submittedAllocs := 0, 0
	for submittedEvals != len(evals) || submittedAllocs != len(allocs) {
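		// Each iteration builds one request holding at most maxIdsPerReap
		// IDs, packing allocs before evals.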
		req := &structs.EvalDeleteRequest{
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		requests = append(requests, req)
		available := maxIdsPerReap

		// Add the allocs first
		if remaining := len(allocs) - submittedAllocs; remaining > 0 {
			if remaining <= available {
				req.Allocs = allocs[submittedAllocs:]
				available -= remaining
				submittedAllocs += remaining
			} else {
				req.Allocs = allocs[submittedAllocs : submittedAllocs+available]
				submittedAllocs += available

				// Exhausted space so skip adding evals
				continue
			}
		}

		// Add the evals
		if remaining := len(evals) - submittedEvals; remaining > 0 {
			if remaining <= available {
				req.Evals = evals[submittedEvals:]
				submittedEvals += remaining
			} else {
				req.Evals = evals[submittedEvals : submittedEvals+available]
				submittedEvals += available
			}
		}
	}

	return requests
}

// nodeGC is used to garbage collect old nodes
func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error {
	// Iterate over the nodes
	ws := memdb.NewWatchSet()
	iter, err := c.snap.Nodes(ws)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced node GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table.  This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.NodeGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: node GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.NodeGCThreshold)
	}

	// Collect the nodes to GC
	var gcNode []string
OUTER:
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		node := raw.(*structs.Node)

		// Ignore non-terminal and new nodes
		if !node.TerminalStatus() || node.ModifyIndex > oldThreshold {
			continue
		}

		// Get the allocations by node
		ws := memdb.NewWatchSet()
		allocs, err := c.snap.AllocsByNode(ws, node.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for node %s: %v",
				node.ID, err)
			continue
		}

		// If there are any non-terminal allocations, skip the node. If the node
		// is terminal and the allocations are not, the scheduler may not have
		// run yet to transition the allocs on the node to terminal. We delay
		// GC'ing until this happens.
		for _, alloc := range allocs {
			if !alloc.TerminalStatus() {
				continue OUTER
			}
		}

		// Node is eligible for garbage collection
		gcNode = append(gcNode, node.ID)
	}

	// Fast-path the nothing case
	if len(gcNode) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: node GC: %d nodes eligible", len(gcNode))

	// Call to the leader to issue the reap
	for _, nodeID := range gcNode {
		req := structs.NodeDeregisterRequest{
			NodeID: nodeID,
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		var resp structs.NodeUpdateResponse
		if err := c.srv.RPC("Node.Deregister", &req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: node '%s' reap failed: %v", nodeID, err)
			return err
		}
	}
	return nil
}