github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/nomad/core_sched.go

github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/nomad/core_sched.go (about)

     1  package nomad
     2  
     3  import (
     4  	"fmt"
     5  	"math"
     6  	"time"
     7  
     8  	memdb "github.com/hashicorp/go-memdb"
     9  	"github.com/hashicorp/nomad/nomad/state"
    10  	"github.com/hashicorp/nomad/nomad/structs"
    11  	"github.com/hashicorp/nomad/scheduler"
    12  )
    13  
var (
	// maxIdsPerReap is the maximum number of evals and allocations to reap in a
	// single Raft transaction. This is to ensure that the Raft message does not
	// become too large.
	//
	// It is the per-request ID budget used when partitioning reap requests:
	// allocations and evaluations in a single request share this budget.
	//
	// NOTE(review): the divisor 36 is presumably the byte length of one ID
	// string (a hyphenated UUID) - confirm against the ID generator.
	maxIdsPerReap = (1024 * 256) / 36 // 0.25 MB of ids.
)
    20  
// CoreScheduler is a special "scheduler" that is registered
// as "_core". It is used to run various administrative work
// across the cluster.
type CoreScheduler struct {
	// srv is the server whose logger, config, FSM time table and RPC
	// endpoint the garbage collection passes use.
	srv  *Server
	// snap is the state snapshot every garbage collection pass reads from.
	snap *state.StateSnapshot
}
    28  
    29  // NewCoreScheduler is used to return a new system scheduler instance
    30  func NewCoreScheduler(srv *Server, snap *state.StateSnapshot) scheduler.Scheduler {
    31  	s := &CoreScheduler{
    32  		srv:  srv,
    33  		snap: snap,
    34  	}
    35  	return s
    36  }
    37  
    38  // Process is used to implement the scheduler.Scheduler interface
    39  func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
    40  	switch eval.JobID {
    41  	case structs.CoreJobEvalGC:
    42  		return c.evalGC(eval)
    43  	case structs.CoreJobNodeGC:
    44  		return c.nodeGC(eval)
    45  	case structs.CoreJobJobGC:
    46  		return c.jobGC(eval)
    47  	case structs.CoreJobDeploymentGC:
    48  		return c.deploymentGC(eval)
    49  	case structs.CoreJobForceGC:
    50  		return c.forceGC(eval)
    51  	default:
    52  		return fmt.Errorf("core scheduler cannot handle job '%s'", eval.JobID)
    53  	}
    54  }
    55  
    56  // forceGC is used to garbage collect all eligible objects.
    57  func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
    58  	if err := c.jobGC(eval); err != nil {
    59  		return err
    60  	}
    61  	if err := c.evalGC(eval); err != nil {
    62  		return err
    63  	}
    64  	if err := c.deploymentGC(eval); err != nil {
    65  		return err
    66  	}
    67  
    68  	// Node GC must occur after the others to ensure the allocations are
    69  	// cleared.
    70  	return c.nodeGC(eval)
    71  }
    72  
    73  // jobGC is used to garbage collect eligible jobs.
    74  func (c *CoreScheduler) jobGC(eval *structs.Evaluation) error {
    75  	// Get all the jobs eligible for garbage collection.
    76  	ws := memdb.NewWatchSet()
    77  	iter, err := c.snap.JobsByGC(ws, true)
    78  	if err != nil {
    79  		return err
    80  	}
    81  
    82  	var oldThreshold uint64
    83  	if eval.JobID == structs.CoreJobForceGC {
    84  		// The GC was forced, so set the threshold to its maximum so everything
    85  		// will GC.
    86  		oldThreshold = math.MaxUint64
    87  		c.srv.logger.Println("[DEBUG] sched.core: forced job GC")
    88  	} else {
    89  		// Get the time table to calculate GC cutoffs.
    90  		tt := c.srv.fsm.TimeTable()
    91  		cutoff := time.Now().UTC().Add(-1 * c.srv.config.JobGCThreshold)
    92  		oldThreshold = tt.NearestIndex(cutoff)
    93  		c.srv.logger.Printf("[DEBUG] sched.core: job GC: scanning before index %d (%v)",
    94  			oldThreshold, c.srv.config.JobGCThreshold)
    95  	}
    96  
    97  	// Collect the allocations, evaluations and jobs to GC
    98  	var gcAlloc, gcEval, gcJob []string
    99  
   100  OUTER:
   101  	for i := iter.Next(); i != nil; i = iter.Next() {
   102  		job := i.(*structs.Job)
   103  
   104  		// Ignore new jobs.
   105  		if job.CreateIndex > oldThreshold {
   106  			continue
   107  		}
   108  
   109  		ws := memdb.NewWatchSet()
   110  		evals, err := c.snap.EvalsByJob(ws, job.ID)
   111  		if err != nil {
   112  			c.srv.logger.Printf("[ERR] sched.core: failed to get evals for job %s: %v", job.ID, err)
   113  			continue
   114  		}
   115  
   116  		allEvalsGC := true
   117  		var jobAlloc, jobEval []string
   118  		for _, eval := range evals {
   119  			gc, allocs, err := c.gcEval(eval, oldThreshold, true)
   120  			if err != nil {
   121  				continue OUTER
   122  			}
   123  
   124  			if gc {
   125  				jobEval = append(jobEval, eval.ID)
   126  				jobAlloc = append(jobAlloc, allocs...)
   127  			} else {
   128  				allEvalsGC = false
   129  				break
   130  			}
   131  		}
   132  
   133  		// Job is eligible for garbage collection
   134  		if allEvalsGC {
   135  			gcJob = append(gcJob, job.ID)
   136  			gcAlloc = append(gcAlloc, jobAlloc...)
   137  			gcEval = append(gcEval, jobEval...)
   138  		}
   139  	}
   140  
   141  	// Fast-path the nothing case
   142  	if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 {
   143  		return nil
   144  	}
   145  	c.srv.logger.Printf("[DEBUG] sched.core: job GC: %d jobs, %d evaluations, %d allocs eligible",
   146  		len(gcJob), len(gcEval), len(gcAlloc))
   147  
   148  	// Reap the evals and allocs
   149  	if err := c.evalReap(gcEval, gcAlloc); err != nil {
   150  		return err
   151  	}
   152  
   153  	// Call to the leader to deregister the jobs.
   154  	for _, job := range gcJob {
   155  		req := structs.JobDeregisterRequest{
   156  			JobID: job,
   157  			Purge: true,
   158  			WriteRequest: structs.WriteRequest{
   159  				Region: c.srv.config.Region,
   160  			},
   161  		}
   162  		var resp structs.JobDeregisterResponse
   163  		if err := c.srv.RPC("Job.Deregister", &req, &resp); err != nil {
   164  			c.srv.logger.Printf("[ERR] sched.core: job deregister failed: %v", err)
   165  			return err
   166  		}
   167  	}
   168  
   169  	return nil
   170  }
   171  
   172  // evalGC is used to garbage collect old evaluations
   173  func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
   174  	// Iterate over the evaluations
   175  	ws := memdb.NewWatchSet()
   176  	iter, err := c.snap.Evals(ws)
   177  	if err != nil {
   178  		return err
   179  	}
   180  
   181  	var oldThreshold uint64
   182  	if eval.JobID == structs.CoreJobForceGC {
   183  		// The GC was forced, so set the threshold to its maximum so everything
   184  		// will GC.
   185  		oldThreshold = math.MaxUint64
   186  		c.srv.logger.Println("[DEBUG] sched.core: forced eval GC")
   187  	} else {
   188  		// Compute the old threshold limit for GC using the FSM
   189  		// time table.  This is a rough mapping of a time to the
   190  		// Raft index it belongs to.
   191  		tt := c.srv.fsm.TimeTable()
   192  		cutoff := time.Now().UTC().Add(-1 * c.srv.config.EvalGCThreshold)
   193  		oldThreshold = tt.NearestIndex(cutoff)
   194  		c.srv.logger.Printf("[DEBUG] sched.core: eval GC: scanning before index %d (%v)",
   195  			oldThreshold, c.srv.config.EvalGCThreshold)
   196  	}
   197  
   198  	// Collect the allocations and evaluations to GC
   199  	var gcAlloc, gcEval []string
   200  	for raw := iter.Next(); raw != nil; raw = iter.Next() {
   201  		eval := raw.(*structs.Evaluation)
   202  
   203  		// The Evaluation GC should not handle batch jobs since those need to be
   204  		// garbage collected in one shot
   205  		gc, allocs, err := c.gcEval(eval, oldThreshold, false)
   206  		if err != nil {
   207  			return err
   208  		}
   209  
   210  		if gc {
   211  			gcEval = append(gcEval, eval.ID)
   212  		}
   213  		gcAlloc = append(gcAlloc, allocs...)
   214  	}
   215  
   216  	// Fast-path the nothing case
   217  	if len(gcEval) == 0 && len(gcAlloc) == 0 {
   218  		return nil
   219  	}
   220  	c.srv.logger.Printf("[DEBUG] sched.core: eval GC: %d evaluations, %d allocs eligible",
   221  		len(gcEval), len(gcAlloc))
   222  
   223  	return c.evalReap(gcEval, gcAlloc)
   224  }
   225  
   226  // gcEval returns whether the eval should be garbage collected given a raft
   227  // threshold index. The eval disqualifies for garbage collection if it or its
   228  // allocs are not older than the threshold. If the eval should be garbage
   229  // collected, the associated alloc ids that should also be removed are also
   230  // returned
   231  func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, allowBatch bool) (
   232  	bool, []string, error) {
   233  	// Ignore non-terminal and new evaluations
   234  	if !eval.TerminalStatus() || eval.ModifyIndex > thresholdIndex {
   235  		return false, nil, nil
   236  	}
   237  
   238  	// Create a watchset
   239  	ws := memdb.NewWatchSet()
   240  
   241  	// If the eval is from a running "batch" job we don't want to garbage
   242  	// collect its allocations. If there is a long running batch job and its
   243  	// terminal allocations get GC'd the scheduler would re-run the
   244  	// allocations.
   245  	if eval.Type == structs.JobTypeBatch {
   246  		// Check if the job is running
   247  		job, err := c.snap.JobByID(ws, eval.JobID)
   248  		if err != nil {
   249  			return false, nil, err
   250  		}
   251  
   252  		// Can collect if:
   253  		// Job doesn't exist
   254  		// Job is Stopped and dead
   255  		// allowBatch and the job is dead
   256  		collect := false
   257  		if job == nil {
   258  			collect = true
   259  		} else if job.Status != structs.JobStatusDead {
   260  			collect = false
   261  		} else if job.Stop {
   262  			collect = true
   263  		} else if allowBatch {
   264  			collect = true
   265  		}
   266  
   267  		// We don't want to gc anything related to a job which is not dead
   268  		// If the batch job doesn't exist we can GC it regardless of allowBatch
   269  		if !collect {
   270  			return false, nil, nil
   271  		}
   272  	}
   273  
   274  	// Get the allocations by eval
   275  	allocs, err := c.snap.AllocsByEval(ws, eval.ID)
   276  	if err != nil {
   277  		c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for eval %s: %v",
   278  			eval.ID, err)
   279  		return false, nil, err
   280  	}
   281  
   282  	// Scan the allocations to ensure they are terminal and old
   283  	gcEval := true
   284  	var gcAllocIDs []string
   285  	for _, alloc := range allocs {
   286  		if !alloc.TerminalStatus() || alloc.ModifyIndex > thresholdIndex {
   287  			// Can't GC the evaluation since not all of the allocations are
   288  			// terminal
   289  			gcEval = false
   290  		} else {
   291  			// The allocation is eligible to be GC'd
   292  			gcAllocIDs = append(gcAllocIDs, alloc.ID)
   293  		}
   294  	}
   295  
   296  	return gcEval, gcAllocIDs, nil
   297  }
   298  
   299  // evalReap contacts the leader and issues a reap on the passed evals and
   300  // allocs.
   301  func (c *CoreScheduler) evalReap(evals, allocs []string) error {
   302  	// Call to the leader to issue the reap
   303  	for _, req := range c.partitionEvalReap(evals, allocs) {
   304  		var resp structs.GenericResponse
   305  		if err := c.srv.RPC("Eval.Reap", req, &resp); err != nil {
   306  			c.srv.logger.Printf("[ERR] sched.core: eval reap failed: %v", err)
   307  			return err
   308  		}
   309  	}
   310  
   311  	return nil
   312  }
   313  
   314  // partitionEvalReap returns a list of EvalDeleteRequest to make, ensuring a single
   315  // request does not contain too many allocations and evaluations. This is
   316  // necessary to ensure that the Raft transaction does not become too large.
   317  func (c *CoreScheduler) partitionEvalReap(evals, allocs []string) []*structs.EvalDeleteRequest {
   318  	var requests []*structs.EvalDeleteRequest
   319  	submittedEvals, submittedAllocs := 0, 0
   320  	for submittedEvals != len(evals) || submittedAllocs != len(allocs) {
   321  		req := &structs.EvalDeleteRequest{
   322  			WriteRequest: structs.WriteRequest{
   323  				Region: c.srv.config.Region,
   324  			},
   325  		}
   326  		requests = append(requests, req)
   327  		available := maxIdsPerReap
   328  
   329  		// Add the allocs first
   330  		if remaining := len(allocs) - submittedAllocs; remaining > 0 {
   331  			if remaining <= available {
   332  				req.Allocs = allocs[submittedAllocs:]
   333  				available -= remaining
   334  				submittedAllocs += remaining
   335  			} else {
   336  				req.Allocs = allocs[submittedAllocs : submittedAllocs+available]
   337  				submittedAllocs += available
   338  
   339  				// Exhausted space so skip adding evals
   340  				continue
   341  			}
   342  		}
   343  
   344  		// Add the evals
   345  		if remaining := len(evals) - submittedEvals; remaining > 0 {
   346  			if remaining <= available {
   347  				req.Evals = evals[submittedEvals:]
   348  				submittedEvals += remaining
   349  			} else {
   350  				req.Evals = evals[submittedEvals : submittedEvals+available]
   351  				submittedEvals += available
   352  			}
   353  		}
   354  	}
   355  
   356  	return requests
   357  }
   358  
   359  // nodeGC is used to garbage collect old nodes
   360  func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error {
   361  	// Iterate over the evaluations
   362  	ws := memdb.NewWatchSet()
   363  	iter, err := c.snap.Nodes(ws)
   364  	if err != nil {
   365  		return err
   366  	}
   367  
   368  	var oldThreshold uint64
   369  	if eval.JobID == structs.CoreJobForceGC {
   370  		// The GC was forced, so set the threshold to its maximum so everything
   371  		// will GC.
   372  		oldThreshold = math.MaxUint64
   373  		c.srv.logger.Println("[DEBUG] sched.core: forced node GC")
   374  	} else {
   375  		// Compute the old threshold limit for GC using the FSM
   376  		// time table.  This is a rough mapping of a time to the
   377  		// Raft index it belongs to.
   378  		tt := c.srv.fsm.TimeTable()
   379  		cutoff := time.Now().UTC().Add(-1 * c.srv.config.NodeGCThreshold)
   380  		oldThreshold = tt.NearestIndex(cutoff)
   381  		c.srv.logger.Printf("[DEBUG] sched.core: node GC: scanning before index %d (%v)",
   382  			oldThreshold, c.srv.config.NodeGCThreshold)
   383  	}
   384  
   385  	// Collect the nodes to GC
   386  	var gcNode []string
   387  OUTER:
   388  	for {
   389  		raw := iter.Next()
   390  		if raw == nil {
   391  			break
   392  		}
   393  		node := raw.(*structs.Node)
   394  
   395  		// Ignore non-terminal and new nodes
   396  		if !node.TerminalStatus() || node.ModifyIndex > oldThreshold {
   397  			continue
   398  		}
   399  
   400  		// Get the allocations by node
   401  		ws := memdb.NewWatchSet()
   402  		allocs, err := c.snap.AllocsByNode(ws, node.ID)
   403  		if err != nil {
   404  			c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for node %s: %v",
   405  				eval.ID, err)
   406  			continue
   407  		}
   408  
   409  		// If there are any non-terminal allocations, skip the node. If the node
   410  		// is terminal and the allocations are not, the scheduler may not have
   411  		// run yet to transition the allocs on the node to terminal. We delay
   412  		// GC'ing until this happens.
   413  		for _, alloc := range allocs {
   414  			if !alloc.TerminalStatus() {
   415  				continue OUTER
   416  			}
   417  		}
   418  
   419  		// Node is eligible for garbage collection
   420  		gcNode = append(gcNode, node.ID)
   421  	}
   422  
   423  	// Fast-path the nothing case
   424  	if len(gcNode) == 0 {
   425  		return nil
   426  	}
   427  	c.srv.logger.Printf("[DEBUG] sched.core: node GC: %d nodes eligible", len(gcNode))
   428  
   429  	// Call to the leader to issue the reap
   430  	for _, nodeID := range gcNode {
   431  		req := structs.NodeDeregisterRequest{
   432  			NodeID: nodeID,
   433  			WriteRequest: structs.WriteRequest{
   434  				Region: c.srv.config.Region,
   435  			},
   436  		}
   437  		var resp structs.NodeUpdateResponse
   438  		if err := c.srv.RPC("Node.Deregister", &req, &resp); err != nil {
   439  			c.srv.logger.Printf("[ERR] sched.core: node '%s' reap failed: %v", nodeID, err)
   440  			return err
   441  		}
   442  	}
   443  	return nil
   444  }
   445  
   446  // deploymentGC is used to garbage collect old deployments
   447  func (c *CoreScheduler) deploymentGC(eval *structs.Evaluation) error {
   448  	// Iterate over the deployments
   449  	ws := memdb.NewWatchSet()
   450  	iter, err := c.snap.Deployments(ws)
   451  	if err != nil {
   452  		return err
   453  	}
   454  
   455  	var oldThreshold uint64
   456  	if eval.JobID == structs.CoreJobForceGC {
   457  		// The GC was forced, so set the threshold to its maximum so everything
   458  		// will GC.
   459  		oldThreshold = math.MaxUint64
   460  		c.srv.logger.Println("[DEBUG] sched.core: forced deployment GC")
   461  	} else {
   462  		// Compute the old threshold limit for GC using the FSM
   463  		// time table.  This is a rough mapping of a time to the
   464  		// Raft index it belongs to.
   465  		tt := c.srv.fsm.TimeTable()
   466  		cutoff := time.Now().UTC().Add(-1 * c.srv.config.DeploymentGCThreshold)
   467  		oldThreshold = tt.NearestIndex(cutoff)
   468  		c.srv.logger.Printf("[DEBUG] sched.core: deployment GC: scanning before index %d (%v)",
   469  			oldThreshold, c.srv.config.DeploymentGCThreshold)
   470  	}
   471  
   472  	// Collect the deployments to GC
   473  	var gcDeployment []string
   474  
   475  OUTER:
   476  	for {
   477  		raw := iter.Next()
   478  		if raw == nil {
   479  			break
   480  		}
   481  		deploy := raw.(*structs.Deployment)
   482  
   483  		// Ignore non-terminal and new deployments
   484  		if deploy.Active() || deploy.ModifyIndex > oldThreshold {
   485  			continue
   486  		}
   487  
   488  		// Ensure there are no allocs referencing this deployment.
   489  		allocs, err := c.snap.AllocsByDeployment(ws, deploy.ID)
   490  		if err != nil {
   491  			c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for deployment %s: %v",
   492  				deploy.ID, err)
   493  			continue
   494  		}
   495  
   496  		// Ensure there is no allocation referencing the deployment.
   497  		for _, alloc := range allocs {
   498  			if !alloc.TerminalStatus() {
   499  				continue OUTER
   500  			}
   501  		}
   502  
   503  		// Deployment is eligible for garbage collection
   504  		gcDeployment = append(gcDeployment, deploy.ID)
   505  	}
   506  
   507  	// Fast-path the nothing case
   508  	if len(gcDeployment) == 0 {
   509  		return nil
   510  	}
   511  	c.srv.logger.Printf("[DEBUG] sched.core: deployment GC: %d deployments eligible", len(gcDeployment))
   512  	return c.deploymentReap(gcDeployment)
   513  }
   514  
   515  // deploymentReap contacts the leader and issues a reap on the passed
   516  // deployments.
   517  func (c *CoreScheduler) deploymentReap(deployments []string) error {
   518  	// Call to the leader to issue the reap
   519  	for _, req := range c.partitionDeploymentReap(deployments) {
   520  		var resp structs.GenericResponse
   521  		if err := c.srv.RPC("Deployment.Reap", req, &resp); err != nil {
   522  			c.srv.logger.Printf("[ERR] sched.core: deployment reap failed: %v", err)
   523  			return err
   524  		}
   525  	}
   526  
   527  	return nil
   528  }
   529  
   530  // partitionDeploymentReap returns a list of DeploymentDeleteRequest to make,
   531  // ensuring a single request does not contain too many deployments. This is
   532  // necessary to ensure that the Raft transaction does not become too large.
   533  func (c *CoreScheduler) partitionDeploymentReap(deployments []string) []*structs.DeploymentDeleteRequest {
   534  	var requests []*structs.DeploymentDeleteRequest
   535  	submittedDeployments := 0
   536  	for submittedDeployments != len(deployments) {
   537  		req := &structs.DeploymentDeleteRequest{
   538  			WriteRequest: structs.WriteRequest{
   539  				Region: c.srv.config.Region,
   540  			},
   541  		}
   542  		requests = append(requests, req)
   543  		available := maxIdsPerReap
   544  
   545  		if remaining := len(deployments) - submittedDeployments; remaining > 0 {
   546  			if remaining <= available {
   547  				req.Deployments = deployments[submittedDeployments:]
   548  				submittedDeployments += remaining
   549  			} else {
   550  				req.Deployments = deployments[submittedDeployments : submittedDeployments+available]
   551  				submittedDeployments += available
   552  			}
   553  		}
   554  	}
   555  
   556  	return requests
   557  }