github.com/dkerwin/nomad@v0.3.3-0.20160525181927-74554135514b/nomad/core_sched.go

package nomad

import (
	"fmt"
	"math"
	"time"

	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

var (
	// maxIdsPerReap is the maximum number of evals and allocations to reap in a
	// single Raft transaction. This is to ensure that the Raft message does not
	// become too large.
	maxIdsPerReap = (1024 * 256) / 36 // 0.25 MB of ids.
)
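
// Sizing note (illustrative arithmetic, not in the original source): IDs are
// 36-byte UUID strings, so maxIdsPerReap = (1024 * 256) / 36 = 262144 / 36,
// which truncates to 7281 IDs per reap, i.e. roughly 0.25 MB of raw ID data
// per Raft message.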

// CoreScheduler is a special "scheduler" that is registered
// as "_core". It is used to run various administrative work
// across the cluster.
type CoreScheduler struct {
	srv  *Server
	snap *state.StateSnapshot
}

// NewCoreScheduler is used to return a new core scheduler instance
func NewCoreScheduler(srv *Server, snap *state.StateSnapshot) scheduler.Scheduler {
	s := &CoreScheduler{
		srv:  srv,
		snap: snap,
	}
	return s
}

// Process is used to implement the scheduler.Scheduler interface
func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
	switch eval.JobID {
	case structs.CoreJobEvalGC:
		return c.evalGC(eval)
	case structs.CoreJobNodeGC:
		return c.nodeGC(eval)
	case structs.CoreJobJobGC:
		return c.jobGC(eval)
	case structs.CoreJobForceGC:
		return c.forceGC(eval)
	default:
		return fmt.Errorf("core scheduler cannot handle job '%s'", eval.JobID)
	}
}
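
// A minimal sketch of how a core eval reaches Process (illustrative only; the
// surrounding wiring is assumed, not part of this file): the leader
// periodically enqueues evaluations whose JobID is one of the CoreJob*
// constants, and the eval broker hands them to this scheduler. For example:
//
//	eval := &structs.Evaluation{
//		ID:    structs.GenerateUUID(), // assumes the structs UUID helper
//		JobID: structs.CoreJobForceGC,
//	}
//	err := NewCoreScheduler(srv, snap).Process(eval)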

// forceGC is used to garbage collect all eligible objects.
func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
	if err := c.jobGC(eval); err != nil {
		return err
	}
	if err := c.evalGC(eval); err != nil {
		return err
	}

	// Node GC must occur after the others to ensure the allocations are
	// cleared.
	return c.nodeGC(eval)
}

// jobGC is used to garbage collect eligible jobs.
func (c *CoreScheduler) jobGC(eval *structs.Evaluation) error {
	// Get all the jobs eligible for garbage collection.
	iter, err := c.snap.JobsByGC(true)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced job GC")
	} else {
		// Get the time table to calculate GC cutoffs.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.JobGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: job GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.JobGCThreshold)
	}

	// Collect the allocations, evaluations and jobs to GC
	var gcAlloc, gcEval, gcJob []string

OUTER:
	for i := iter.Next(); i != nil; i = iter.Next() {
		job := i.(*structs.Job)

		// Ignore new jobs.
		if job.CreateIndex > oldThreshold {
			continue
		}

		evals, err := c.snap.EvalsByJob(job.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get evals for job %s: %v", job.ID, err)
			continue
		}

		for _, eval := range evals {
			gc, allocs, err := c.gcEval(eval, oldThreshold)
			if err != nil || !gc {
				// Skip the job: it is not finished, since one of its evals
				// (or that eval's allocations) is not yet terminal and old
				// enough to garbage collect.
				continue OUTER
			}

			gcEval = append(gcEval, eval.ID)
			gcAlloc = append(gcAlloc, allocs...)
		}

		// Job is eligible for garbage collection
		gcJob = append(gcJob, job.ID)
	}

	// Fast-path the nothing case
	if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: job GC: %d jobs, %d evaluations, %d allocs eligible",
		len(gcJob), len(gcEval), len(gcAlloc))

	// Reap the evals and allocs
	if err := c.evalReap(gcEval, gcAlloc); err != nil {
		return err
	}

	// Call to the leader to deregister the jobs.
	for _, job := range gcJob {
		req := structs.JobDeregisterRequest{
			JobID: job,
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		var resp structs.JobDeregisterResponse
		if err := c.srv.RPC("Job.Deregister", &req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: job deregister failed: %v", err)
			return err
		}
	}

	return nil
}

// evalGC is used to garbage collect old evaluations
func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
	// Iterate over the evaluations
	iter, err := c.snap.Evals()
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced eval GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table. This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.EvalGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: eval GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.EvalGCThreshold)
	}
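
	// To make the cutoff concrete (illustrative values, not from the original
	// source): with EvalGCThreshold = 1h the cutoff is now-1h; if the time
	// table recorded Raft index 1000 as the newest index at or before that
	// time, NearestIndex returns 1000, and only evals whose ModifyIndex is at
	// most 1000 are considered old enough below.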

	// Collect the allocations and evaluations to GC
	var gcAlloc, gcEval []string
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		eval := raw.(*structs.Evaluation)

		gc, allocs, err := c.gcEval(eval, oldThreshold)
		if err != nil {
			return err
		}

		// If the eval is from a running "batch" job we don't want to garbage
		// collect its allocations. If there is a long running batch job and its
		// terminal allocations get GC'd the scheduler would re-run the
		// allocations.
		if eval.Type == structs.JobTypeBatch {
			// Check if the job is running
			job, err := c.snap.JobByID(eval.JobID)
			if err != nil {
				return err
			}

			// We only want to garbage collect the allocations and evaluations
			// once the job has been deregistered; while the job is still
			// registered and the eval has allocations, skip it.
			if job != nil && len(allocs) != 0 {
				continue
			}
		}

		if gc {
			gcEval = append(gcEval, eval.ID)
			gcAlloc = append(gcAlloc, allocs...)
		}
	}

	// Fast-path the nothing case
	if len(gcEval) == 0 && len(gcAlloc) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: eval GC: %d evaluations, %d allocs eligible",
		len(gcEval), len(gcAlloc))

	return c.evalReap(gcEval, gcAlloc)
}

// gcEval returns whether the eval should be garbage collected given a Raft
// threshold index. The eval is disqualified from garbage collection if it or
// its allocs are newer than the threshold. If the eval should be garbage
// collected, the IDs of the associated allocs that should also be removed are
// returned.
func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64) (
	bool, []string, error) {
	// Ignore non-terminal and new evaluations
	if !eval.TerminalStatus() || eval.ModifyIndex > thresholdIndex {
		return false, nil, nil
	}

	// Get the allocations by eval
	allocs, err := c.snap.AllocsByEval(eval.ID)
	if err != nil {
		c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for eval %s: %v",
			eval.ID, err)
		return false, nil, err
	}

	// Scan the allocations to ensure they are terminal and old
	for _, alloc := range allocs {
		if !alloc.TerminalStatus() || alloc.ModifyIndex > thresholdIndex {
			return false, nil, nil
		}
	}

	allocIds := make([]string, len(allocs))
	for i, alloc := range allocs {
		allocIds[i] = alloc.ID
	}

	// Evaluation is eligible for garbage collection
	return true, allocIds, nil
}
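
// For example (hypothetical values): with thresholdIndex = 1000, an eval that
// is terminal with ModifyIndex 900 and whose allocs are all terminal with
// ModifyIndex <= 1000 yields (true, allocIDs, nil); if any alloc is still
// running, or was modified at index 1001 or later, gcEval returns
// (false, nil, nil) and the eval survives this GC pass.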

// evalReap contacts the leader and issues a reap on the passed evals and
// allocs.
func (c *CoreScheduler) evalReap(evals, allocs []string) error {
	// Call to the leader to issue the reap
	for _, req := range c.partitionReap(evals, allocs) {
		var resp structs.GenericResponse
		if err := c.srv.RPC("Eval.Reap", req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: eval reap failed: %v", err)
			return err
		}
	}

	return nil
}

// partitionReap returns a list of EvalDeleteRequest to make, ensuring a single
// request does not contain too many allocations and evaluations. This is
// necessary to ensure that the Raft transaction does not become too large.
func (c *CoreScheduler) partitionReap(evals, allocs []string) []*structs.EvalDeleteRequest {
	var requests []*structs.EvalDeleteRequest
	submittedEvals, submittedAllocs := 0, 0
	for submittedEvals != len(evals) || submittedAllocs != len(allocs) {
		req := &structs.EvalDeleteRequest{
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		requests = append(requests, req)
		available := maxIdsPerReap

		// Add the allocs first
		if remaining := len(allocs) - submittedAllocs; remaining > 0 {
			if remaining <= available {
				req.Allocs = allocs[submittedAllocs:]
				available -= remaining
				submittedAllocs += remaining
			} else {
				req.Allocs = allocs[submittedAllocs : submittedAllocs+available]
				submittedAllocs += available

				// Exhausted space so skip adding evals
				continue
			}
		}

		// Add the evals
		if remaining := len(evals) - submittedEvals; remaining > 0 {
			if remaining <= available {
				req.Evals = evals[submittedEvals:]
				submittedEvals += remaining
			} else {
				req.Evals = evals[submittedEvals : submittedEvals+available]
				submittedEvals += available
			}
		}
	}

	return requests
}
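
// A worked example of the partitioning (hypothetical limit for illustration):
// if maxIdsPerReap were 2, then partitionReap([]string{"e1", "e2", "e3"},
// []string{"a1"}) would produce two requests: the first carries Allocs [a1]
// and Evals [e1] (allocs are placed first, then evals fill the remaining
// space), and the second carries Evals [e2, e3].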

// nodeGC is used to garbage collect old nodes
func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error {
	// Iterate over the nodes
	iter, err := c.snap.Nodes()
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced node GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table. This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.NodeGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: node GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.NodeGCThreshold)
	}

	// Collect the nodes to GC
	var gcNode []string
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		node := raw.(*structs.Node)

		// Ignore non-terminal and new nodes
		if !node.TerminalStatus() || node.ModifyIndex > oldThreshold {
			continue
		}

		// Get the allocations by node
		allocs, err := c.snap.AllocsByNode(node.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for node %s: %v",
				node.ID, err)
			continue
		}

		// If there are any allocations, skip the node
		if len(allocs) > 0 {
			continue
		}

		// Node is eligible for garbage collection
		gcNode = append(gcNode, node.ID)
	}

	// Fast-path the nothing case
	if len(gcNode) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: node GC: %d nodes eligible", len(gcNode))

	// Call to the leader to issue the reap
	for _, nodeID := range gcNode {
		req := structs.NodeDeregisterRequest{
			NodeID: nodeID,
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		var resp structs.NodeUpdateResponse
		if err := c.srv.RPC("Node.Deregister", &req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: node '%s' reap failed: %v", nodeID, err)
			return err
		}
	}
	return nil
}