github.com/blixtra/nomad@v0.7.2-0.20171221000451-da9a1d7bb050/nomad/core_sched.go

package nomad

import (
	"fmt"
	"math"
	"time"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

var (
	// maxIdsPerReap is the maximum number of evals and allocations to reap in a
	// single Raft transaction. This is to ensure that the Raft message does not
	// become too large.
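	// For a rough sense of scale: 1024 * 256 is 262,144 bytes, and at 36
	// bytes per ID (the length of a UUID string) that allows roughly 7,281
	// IDs per request.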
	maxIdsPerReap = (1024 * 256) / 36 // 0.25 MB of ids.
)

// CoreScheduler is a special "scheduler" that is registered
// as "_core". It is used to run various administrative work
// across the cluster.
type CoreScheduler struct {
	srv  *Server
	snap *state.StateSnapshot
}

// NewCoreScheduler is used to return a new core scheduler instance
func NewCoreScheduler(srv *Server, snap *state.StateSnapshot) scheduler.Scheduler {
	s := &CoreScheduler{
		srv:  srv,
		snap: snap,
	}
	return s
}

// Process is used to implement the scheduler.Scheduler interface
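// Evaluations are dispatched on their JobID, which for core jobs names the
// type of garbage collection to run. As a rough, hypothetical usage sketch
// (other evaluation fields omitted), assuming a server and state snapshot are
// already in hand:
//
//	core := NewCoreScheduler(srv, snap)
//	err := core.Process(&structs.Evaluation{JobID: structs.CoreJobForceGC})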
func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
	switch eval.JobID {
	case structs.CoreJobEvalGC:
		return c.evalGC(eval)
	case structs.CoreJobNodeGC:
		return c.nodeGC(eval)
	case structs.CoreJobJobGC:
		return c.jobGC(eval)
	case structs.CoreJobDeploymentGC:
		return c.deploymentGC(eval)
	case structs.CoreJobForceGC:
		return c.forceGC(eval)
	default:
		return fmt.Errorf("core scheduler cannot handle job '%s'", eval.JobID)
	}
}

// forceGC is used to garbage collect all eligible objects.
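// Jobs, evals, and deployments are collected first; node GC runs last so that
// allocations tied to nodes have already been reaped.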
func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
	if err := c.jobGC(eval); err != nil {
		return err
	}
	if err := c.evalGC(eval); err != nil {
		return err
	}
	if err := c.deploymentGC(eval); err != nil {
		return err
	}

	// Node GC must occur after the others to ensure the allocations are
	// cleared.
	return c.nodeGC(eval)
}

// jobGC is used to garbage collect eligible jobs.
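// A job is only collected once every one of its evaluations (and their
// allocations) is itself eligible; a job with any remaining live evaluation
// is skipped.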
func (c *CoreScheduler) jobGC(eval *structs.Evaluation) error {
	// Get all the jobs eligible for garbage collection.
	ws := memdb.NewWatchSet()
	iter, err := c.snap.JobsByGC(ws, true)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced job GC")
	} else {
		// Get the time table to calculate GC cutoffs.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.JobGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: job GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.JobGCThreshold)
	}

	// Collect the allocations, evaluations and jobs to GC
	var gcAlloc, gcEval []string
	var gcJob []*structs.Job

OUTER:
	for i := iter.Next(); i != nil; i = iter.Next() {
		job := i.(*structs.Job)

		// Ignore new jobs.
		if job.CreateIndex > oldThreshold {
			continue
		}

		ws := memdb.NewWatchSet()
		evals, err := c.snap.EvalsByJob(ws, job.Namespace, job.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get evals for job %s: %v", job.ID, err)
			continue
		}

		allEvalsGC := true
		var jobAlloc, jobEval []string
		for _, eval := range evals {
			gc, allocs, err := c.gcEval(eval, oldThreshold, true)
			if err != nil {
				continue OUTER
			}

			if gc {
				jobEval = append(jobEval, eval.ID)
				jobAlloc = append(jobAlloc, allocs...)
			} else {
				allEvalsGC = false
				break
			}
		}

		// Job is eligible for garbage collection
		if allEvalsGC {
			gcJob = append(gcJob, job)
			gcAlloc = append(gcAlloc, jobAlloc...)
			gcEval = append(gcEval, jobEval...)
		}
	}

	// Fast-path the nothing case
	if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: job GC: %d jobs, %d evaluations, %d allocs eligible",
		len(gcJob), len(gcEval), len(gcAlloc))

	// Reap the evals and allocs
	if err := c.evalReap(gcEval, gcAlloc); err != nil {
		return err
	}

	// Call to the leader to deregister the jobs.
	for _, job := range gcJob {
		req := structs.JobDeregisterRequest{
			JobID: job.ID,
			Purge: true,
			WriteRequest: structs.WriteRequest{
				Region:    c.srv.config.Region,
				Namespace: job.Namespace,
				AuthToken: eval.LeaderACL,
			},
		}
		var resp structs.JobDeregisterResponse
		if err := c.srv.RPC("Job.Deregister", &req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: job deregister failed: %v", err)
			return err
		}
	}

	return nil
}

// evalGC is used to garbage collect old evaluations
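// Evaluations from batch jobs are only collected here if the job has been
// removed or is stopped and dead; otherwise they are left for jobGC, which
// removes a batch job's history in one shot.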
func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
	// Iterate over the evaluations
	ws := memdb.NewWatchSet()
	iter, err := c.snap.Evals(ws)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced eval GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table. This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.EvalGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: eval GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.EvalGCThreshold)
	}

	// Collect the allocations and evaluations to GC
	var gcAlloc, gcEval []string
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		eval := raw.(*structs.Evaluation)

		// The Evaluation GC should not handle batch jobs since those need to be
		// garbage collected in one shot
		gc, allocs, err := c.gcEval(eval, oldThreshold, false)
		if err != nil {
			return err
		}

		if gc {
			gcEval = append(gcEval, eval.ID)
		}
		gcAlloc = append(gcAlloc, allocs...)
	}

	// Fast-path the nothing case
	if len(gcEval) == 0 && len(gcAlloc) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: eval GC: %d evaluations, %d allocs eligible",
		len(gcEval), len(gcAlloc))

	return c.evalReap(gcEval, gcAlloc)
}

// gcEval returns whether the eval should be garbage collected given a Raft
// threshold index. The eval is disqualified from garbage collection if it or
// any of its allocations is not yet terminal or is newer than the threshold.
// If the eval should be garbage collected, the IDs of the allocations to be
// removed along with it are also returned.
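// Even when the eval itself must be kept because one of its allocations is
// not yet terminal or is too new, the IDs of its allocations that are already
// terminal and older than the threshold are still returned so they can be
// reaped.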
func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, allowBatch bool) (
	bool, []string, error) {
	// Ignore non-terminal and new evaluations
	if !eval.TerminalStatus() || eval.ModifyIndex > thresholdIndex {
		return false, nil, nil
	}

	// Create a watchset
	ws := memdb.NewWatchSet()

	// If the eval is from a running "batch" job we don't want to garbage
	// collect its allocations. If there is a long-running batch job and its
	// terminal allocations get GC'd, the scheduler would re-run the
	// allocations.
	if eval.Type == structs.JobTypeBatch {
		// Check if the job is running
		job, err := c.snap.JobByID(ws, eval.Namespace, eval.JobID)
		if err != nil {
			return false, nil, err
		}

		// Can collect if:
		// the job doesn't exist,
		// the job is stopped and dead, or
		// allowBatch is set and the job is dead
		collect := false
		if job == nil {
			collect = true
		} else if job.Status != structs.JobStatusDead {
			collect = false
		} else if job.Stop {
			collect = true
		} else if allowBatch {
			collect = true
		}

		// We don't want to gc anything related to a job which is not dead.
		// If the batch job doesn't exist we can GC it regardless of allowBatch.
		if !collect {
			return false, nil, nil
		}
	}

	// Get the allocations by eval
	allocs, err := c.snap.AllocsByEval(ws, eval.ID)
	if err != nil {
		c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for eval %s: %v",
			eval.ID, err)
		return false, nil, err
	}

	// Scan the allocations to ensure they are terminal and old
	gcEval := true
	var gcAllocIDs []string
	for _, alloc := range allocs {
		if !alloc.TerminalStatus() || alloc.ModifyIndex > thresholdIndex {
			// Can't GC the evaluation since not all of the allocations are
			// terminal
			gcEval = false
		} else {
			// The allocation is eligible to be GC'd
			gcAllocIDs = append(gcAllocIDs, alloc.ID)
		}
	}

	return gcEval, gcAllocIDs, nil
}

// evalReap contacts the leader and issues a reap on the passed evals and
// allocs.
func (c *CoreScheduler) evalReap(evals, allocs []string) error {
	// Call to the leader to issue the reap
	for _, req := range c.partitionEvalReap(evals, allocs) {
		var resp structs.GenericResponse
		if err := c.srv.RPC("Eval.Reap", req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: eval reap failed: %v", err)
			return err
		}
	}

	return nil
}

// partitionEvalReap returns a list of EvalDeleteRequest to make, ensuring a single
// request does not contain too many allocations and evaluations. This is
// necessary to ensure that the Raft transaction does not become too large.
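// Allocations are packed into a request before evaluations. For illustration,
// with maxIdsPerReap at roughly 7,281, partitioning 8,000 alloc IDs and 1,000
// eval IDs yields two requests: the first with 7,281 allocs and the second
// with the remaining 719 allocs plus all 1,000 evals.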
func (c *CoreScheduler) partitionEvalReap(evals, allocs []string) []*structs.EvalDeleteRequest {
	var requests []*structs.EvalDeleteRequest
	submittedEvals, submittedAllocs := 0, 0
	for submittedEvals != len(evals) || submittedAllocs != len(allocs) {
		req := &structs.EvalDeleteRequest{
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		requests = append(requests, req)
		available := maxIdsPerReap

		// Add the allocs first
		if remaining := len(allocs) - submittedAllocs; remaining > 0 {
			if remaining <= available {
				req.Allocs = allocs[submittedAllocs:]
				available -= remaining
				submittedAllocs += remaining
			} else {
				req.Allocs = allocs[submittedAllocs : submittedAllocs+available]
				submittedAllocs += available

				// Exhausted space so skip adding evals
				continue
			}
		}

		// Add the evals
		if remaining := len(evals) - submittedEvals; remaining > 0 {
			if remaining <= available {
				req.Evals = evals[submittedEvals:]
				submittedEvals += remaining
			} else {
				req.Evals = evals[submittedEvals : submittedEvals+available]
				submittedEvals += available
			}
		}
	}

	return requests
}

// nodeGC is used to garbage collect old nodes
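// A node is only collected once it is terminal and every allocation placed on
// it has also reached a terminal state.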
func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error {
	// Iterate over the nodes
	ws := memdb.NewWatchSet()
	iter, err := c.snap.Nodes(ws)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced node GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table. This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.NodeGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: node GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.NodeGCThreshold)
	}

	// Collect the nodes to GC
	var gcNode []string
OUTER:
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		node := raw.(*structs.Node)

		// Ignore non-terminal and new nodes
		if !node.TerminalStatus() || node.ModifyIndex > oldThreshold {
			continue
		}

		// Get the allocations by node
		ws := memdb.NewWatchSet()
		allocs, err := c.snap.AllocsByNode(ws, node.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for node %s: %v",
				node.ID, err)
			continue
		}

		// If there are any non-terminal allocations, skip the node. If the node
		// is terminal and the allocations are not, the scheduler may not have
		// run yet to transition the allocs on the node to terminal. We delay
		// GC'ing until this happens.
		for _, alloc := range allocs {
			if !alloc.TerminalStatus() {
				continue OUTER
			}
		}

		// Node is eligible for garbage collection
		gcNode = append(gcNode, node.ID)
	}

	// Fast-path the nothing case
	if len(gcNode) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: node GC: %d nodes eligible", len(gcNode))

	// Call to the leader to issue the reap
	for _, nodeID := range gcNode {
		req := structs.NodeDeregisterRequest{
			NodeID: nodeID,
			WriteRequest: structs.WriteRequest{
				Region:    c.srv.config.Region,
				AuthToken: eval.LeaderACL,
			},
		}
		var resp structs.NodeUpdateResponse
		if err := c.srv.RPC("Node.Deregister", &req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: node '%s' reap failed: %v", nodeID, err)
			return err
		}
	}
	return nil
}

// deploymentGC is used to garbage collect old deployments
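// A deployment is only collected once it is no longer active and no
// non-terminal allocation still references it.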
func (c *CoreScheduler) deploymentGC(eval *structs.Evaluation) error {
	// Iterate over the deployments
	ws := memdb.NewWatchSet()
	iter, err := c.snap.Deployments(ws)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced deployment GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table. This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.DeploymentGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: deployment GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.DeploymentGCThreshold)
	}

	// Collect the deployments to GC
	var gcDeployment []string

OUTER:
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		deploy := raw.(*structs.Deployment)

		// Ignore non-terminal and new deployments
		if deploy.Active() || deploy.ModifyIndex > oldThreshold {
			continue
		}

		// Ensure there are no allocs referencing this deployment.
		allocs, err := c.snap.AllocsByDeployment(ws, deploy.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for deployment %s: %v",
				deploy.ID, err)
			continue
		}

		// Skip the deployment if any allocation referencing it is still
		// non-terminal.
		for _, alloc := range allocs {
			if !alloc.TerminalStatus() {
				continue OUTER
			}
		}

		// Deployment is eligible for garbage collection
		gcDeployment = append(gcDeployment, deploy.ID)
	}

	// Fast-path the nothing case
	if len(gcDeployment) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: deployment GC: %d deployments eligible", len(gcDeployment))
	return c.deploymentReap(gcDeployment)
}

// deploymentReap contacts the leader and issues a reap on the passed
// deployments.
func (c *CoreScheduler) deploymentReap(deployments []string) error {
	// Call to the leader to issue the reap
	for _, req := range c.partitionDeploymentReap(deployments) {
		var resp structs.GenericResponse
		if err := c.srv.RPC("Deployment.Reap", req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: deployment reap failed: %v", err)
			return err
		}
	}

	return nil
}

// partitionDeploymentReap returns a list of DeploymentDeleteRequest to make,
// ensuring a single request does not contain too many deployments. This is
// necessary to ensure that the Raft transaction does not become too large.
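// For illustration, if maxIdsPerReap were lowered to 2, the deployment IDs
// ["a", "b", "c"] would be split into two requests: ["a", "b"] and ["c"].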
func (c *CoreScheduler) partitionDeploymentReap(deployments []string) []*structs.DeploymentDeleteRequest {
	var requests []*structs.DeploymentDeleteRequest
	submittedDeployments := 0
	for submittedDeployments != len(deployments) {
		req := &structs.DeploymentDeleteRequest{
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		requests = append(requests, req)
		available := maxIdsPerReap

		if remaining := len(deployments) - submittedDeployments; remaining > 0 {
			if remaining <= available {
				req.Deployments = deployments[submittedDeployments:]
				submittedDeployments += remaining
			} else {
				req.Deployments = deployments[submittedDeployments : submittedDeployments+available]
				submittedDeployments += available
			}
		}
	}

	return requests
}