github.com/diptanu/nomad@v0.5.7-0.20170516172507-d72e86cbe3d9/nomad/core_sched.go

package nomad

import (
	"fmt"
	"math"
	"time"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

var (
	// maxIdsPerReap is the maximum number of evaluation and allocation IDs to
	// reap in a single Raft transaction. This is to ensure that the Raft
	// message does not become too large.
	maxIdsPerReap = (1024 * 256) / 36 // 0.25 MB of 36-byte UUID strings.
)

// CoreScheduler is a special "scheduler" that is registered
// as "_core". It is used to run various administrative work
// across the cluster.
type CoreScheduler struct {
	srv  *Server
	snap *state.StateSnapshot
}

// NewCoreScheduler is used to return a new core scheduler instance
func NewCoreScheduler(srv *Server, snap *state.StateSnapshot) scheduler.Scheduler {
	s := &CoreScheduler{
		srv:  srv,
		snap: snap,
	}
	return s
}

// Process is used to implement the scheduler.Scheduler interface
func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
	switch eval.JobID {
	case structs.CoreJobEvalGC:
		return c.evalGC(eval)
	case structs.CoreJobNodeGC:
		return c.nodeGC(eval)
	case structs.CoreJobJobGC:
		return c.jobGC(eval)
	case structs.CoreJobForceGC:
		return c.forceGC(eval)
	default:
		return fmt.Errorf("core scheduler cannot handle job '%s'", eval.JobID)
	}
}

// forceGC is used to garbage collect all eligible objects.
func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
	if err := c.jobGC(eval); err != nil {
		return err
	}
	if err := c.evalGC(eval); err != nil {
		return err
	}

	// Node GC must occur after the others to ensure the allocations are
	// cleared.
	return c.nodeGC(eval)
}

// jobGC is used to garbage collect eligible jobs.
func (c *CoreScheduler) jobGC(eval *structs.Evaluation) error {
	// Get all the jobs eligible for garbage collection.
	ws := memdb.NewWatchSet()
	iter, err := c.snap.JobsByGC(ws, true)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced job GC")
	} else {
		// Get the time table to calculate GC cutoffs.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.JobGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: job GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.JobGCThreshold)
	}

	// Collect the allocations, evaluations and jobs to GC
	var gcAlloc, gcEval, gcJob []string

OUTER:
	for i := iter.Next(); i != nil; i = iter.Next() {
		job := i.(*structs.Job)

		// Ignore new jobs.
		if job.CreateIndex > oldThreshold {
			continue
		}

		ws := memdb.NewWatchSet()
		evals, err := c.snap.EvalsByJob(ws, job.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get evals for job %s: %v", job.ID, err)
			continue
		}

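		// A job can only be garbage collected when every one of its
		// evaluations (and all of their allocations) is itself eligible.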
		allEvalsGC := true
		var jobAlloc, jobEval []string
		for _, eval := range evals {
			gc, allocs, err := c.gcEval(eval, oldThreshold, true)
			if err != nil {
				continue OUTER
			}

			if gc {
				jobEval = append(jobEval, eval.ID)
				jobAlloc = append(jobAlloc, allocs...)
			} else {
				allEvalsGC = false
				break
			}
		}

		// Job is eligible for garbage collection
		if allEvalsGC {
			gcJob = append(gcJob, job.ID)
			gcAlloc = append(gcAlloc, jobAlloc...)
			gcEval = append(gcEval, jobEval...)
		}
	}

	// Fast-path the nothing case
	if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: job GC: %d jobs, %d evaluations, %d allocs eligible",
		len(gcJob), len(gcEval), len(gcAlloc))

	// Reap the evals and allocs
	if err := c.evalReap(gcEval, gcAlloc); err != nil {
		return err
	}

	// Call to the leader to deregister the jobs.
	for _, job := range gcJob {
		req := structs.JobDeregisterRequest{
			JobID: job,
			Purge: true,
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		var resp structs.JobDeregisterResponse
		if err := c.srv.RPC("Job.Deregister", &req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: job deregister failed: %v", err)
			return err
		}
	}

	return nil
}

// evalGC is used to garbage collect old evaluations
func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
	// Iterate over the evaluations
	ws := memdb.NewWatchSet()
	iter, err := c.snap.Evals(ws)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced eval GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table.  This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.EvalGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: eval GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.EvalGCThreshold)
	}

	// Collect the allocations and evaluations to GC
	var gcAlloc, gcEval []string
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		eval := raw.(*structs.Evaluation)

		// The Evaluation GC should not handle batch jobs since those need to be
		// garbage collected in one shot
		gc, allocs, err := c.gcEval(eval, oldThreshold, false)
		if err != nil {
			return err
		}

		if gc {
			gcEval = append(gcEval, eval.ID)
		}
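		// Allocations are reaped independently of the eval: gcEval only
		// returns alloc IDs that are already terminal and older than the
		// threshold, so they can be removed even if their eval must stay.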
		gcAlloc = append(gcAlloc, allocs...)
	}

	// Fast-path the nothing case
	if len(gcEval) == 0 && len(gcAlloc) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: eval GC: %d evaluations, %d allocs eligible",
		len(gcEval), len(gcAlloc))

	return c.evalReap(gcEval, gcAlloc)
}

// gcEval returns whether the eval should be garbage collected given a raft
// threshold index. The eval is disqualified from garbage collection if it, or
// any of its allocations, is not terminal or is newer than the threshold. If
// the eval should be garbage collected, the IDs of the allocations that should
// be removed along with it are also returned.
func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, allowBatch bool) (
	bool, []string, error) {
	// Ignore non-terminal and new evaluations
	if !eval.TerminalStatus() || eval.ModifyIndex > thresholdIndex {
		return false, nil, nil
	}

	// Create a watchset
	ws := memdb.NewWatchSet()

	// If the eval is from a running "batch" job we don't want to garbage
	// collect its allocations. If there is a long running batch job and its
	// terminal allocations get GC'd the scheduler would re-run the
	// allocations.
	if eval.Type == structs.JobTypeBatch {
		// Check if the job is running
		job, err := c.snap.JobByID(ws, eval.JobID)
		if err != nil {
			return false, nil, err
		}

		// Can collect if:
		// Job doesn't exist
		// Job is Stopped and dead
		// allowBatch and the job is dead
		collect := false
		if job == nil {
			collect = true
		} else if job.Status != structs.JobStatusDead {
			collect = false
		} else if job.Stop {
			collect = true
		} else if allowBatch {
			collect = true
		}

		// We don't want to gc anything related to a job which is not dead
		// If the batch job doesn't exist we can GC it regardless of allowBatch
		if !collect {
			return false, nil, nil
		}
	}

	// Get the allocations by eval
	allocs, err := c.snap.AllocsByEval(ws, eval.ID)
	if err != nil {
		c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for eval %s: %v",
			eval.ID, err)
		return false, nil, err
	}

	// Scan the allocations to ensure they are terminal and old
	gcEval := true
	var gcAllocIDs []string
	for _, alloc := range allocs {
		if !alloc.TerminalStatus() || alloc.ModifyIndex > thresholdIndex {
			// Can't GC the evaluation since not all of the allocations are
			// terminal
			gcEval = false
		} else {
			// The allocation is eligible to be GC'd
			gcAllocIDs = append(gcAllocIDs, alloc.ID)
		}
	}

	return gcEval, gcAllocIDs, nil
}

// evalReap contacts the leader and issues a reap on the passed evals and
// allocs.
func (c *CoreScheduler) evalReap(evals, allocs []string) error {
	// Call to the leader to issue the reap
	for _, req := range c.partitionReap(evals, allocs) {
		var resp structs.GenericResponse
		if err := c.srv.RPC("Eval.Reap", req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: eval reap failed: %v", err)
			return err
		}
	}

	return nil
}

// partitionReap returns a list of EvalDeleteRequest to make, ensuring a single
// request does not contain too many allocations and evaluations. This is
// necessary to ensure that the Raft transaction does not become too large.
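//
// For example (illustrative only), with a maxIdsPerReap of 3, reaping 5
// allocations and 2 evaluations yields three requests:
// [3 allocs], [2 allocs + 1 eval], and [1 eval].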
func (c *CoreScheduler) partitionReap(evals, allocs []string) []*structs.EvalDeleteRequest {
	var requests []*structs.EvalDeleteRequest
	submittedEvals, submittedAllocs := 0, 0
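	// Keep creating requests until every ID has been assigned to one; each
	// request holds at most maxIdsPerReap IDs in total, with allocations
	// filled in before evaluations.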
	for submittedEvals != len(evals) || submittedAllocs != len(allocs) {
		req := &structs.EvalDeleteRequest{
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		requests = append(requests, req)
		available := maxIdsPerReap

		// Add the allocs first
		if remaining := len(allocs) - submittedAllocs; remaining > 0 {
			if remaining <= available {
				req.Allocs = allocs[submittedAllocs:]
				available -= remaining
				submittedAllocs += remaining
			} else {
				req.Allocs = allocs[submittedAllocs : submittedAllocs+available]
				submittedAllocs += available

				// Exhausted space so skip adding evals
				continue
			}
		}

		// Add the evals
		if remaining := len(evals) - submittedEvals; remaining > 0 {
			if remaining <= available {
				req.Evals = evals[submittedEvals:]
				submittedEvals += remaining
			} else {
				req.Evals = evals[submittedEvals : submittedEvals+available]
				submittedEvals += available
			}
		}
	}

	return requests
}

// nodeGC is used to garbage collect old nodes
func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error {
	// Iterate over the nodes
	ws := memdb.NewWatchSet()
	iter, err := c.snap.Nodes(ws)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced node GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table.  This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.NodeGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: node GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.NodeGCThreshold)
	}

	// Collect the nodes to GC
	var gcNode []string
OUTER:
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		node := raw.(*structs.Node)

		// Ignore non-terminal and new nodes
		if !node.TerminalStatus() || node.ModifyIndex > oldThreshold {
			continue
		}

		// Get the allocations by node
		ws := memdb.NewWatchSet()
		allocs, err := c.snap.AllocsByNode(ws, node.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for node %s: %v",
				node.ID, err)
			continue
		}

		// If there are any non-terminal allocations, skip the node. If the node
		// is terminal and the allocations are not, the scheduler may not have
		// run yet to transition the allocs on the node to terminal. We delay
		// GC'ing until this happens.
		for _, alloc := range allocs {
			if !alloc.TerminalStatus() {
				continue OUTER
			}
		}

		// Node is eligible for garbage collection
		gcNode = append(gcNode, node.ID)
	}

	// Fast-path the nothing case
	if len(gcNode) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: node GC: %d nodes eligible", len(gcNode))

	// Call to the leader to issue the reap
	for _, nodeID := range gcNode {
		req := structs.NodeDeregisterRequest{
			NodeID: nodeID,
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		var resp structs.NodeUpdateResponse
		if err := c.srv.RPC("Node.Deregister", &req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: node '%s' reap failed: %v", nodeID, err)
			return err
		}
	}
	return nil
}