github.com/adityamillind98/nomad@v0.11.8/nomad/core_sched.go

     1  package nomad
     2  
     3  import (
     4  	"fmt"
     5  	"math"
     6  	"strings"
     7  	"time"
     8  
     9  	log "github.com/hashicorp/go-hclog"
    10  	memdb "github.com/hashicorp/go-memdb"
    11  	version "github.com/hashicorp/go-version"
    12  	"github.com/hashicorp/nomad/nomad/state"
    13  	"github.com/hashicorp/nomad/nomad/structs"
    14  	"github.com/hashicorp/nomad/scheduler"
    15  )
    16  
    17  var (
    18  	// maxIdsPerReap is the maximum number of evals and allocations to reap in a
    19  	// single Raft transaction. This is to ensure that the Raft message does not
    20  	// become too large.
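         	// Each ID is a 36-character UUID, so the 0.25 MB budget works out to roughly 7,281 IDs.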
    21  	maxIdsPerReap = (1024 * 256) / 36 // 0.25 MB of ids.
    22  )
    23  
    24  // CoreScheduler is a special "scheduler" that is registered
    25  // as "_core". It is used to run various administrative work
    26  // across the cluster.
    27  type CoreScheduler struct {
    28  	srv    *Server
    29  	snap   *state.StateSnapshot
    30  	logger log.Logger
    31  }
    32  
     33  // NewCoreScheduler is used to return a new core scheduler instance
    34  func NewCoreScheduler(srv *Server, snap *state.StateSnapshot) scheduler.Scheduler {
    35  	s := &CoreScheduler{
    36  		srv:    srv,
    37  		snap:   snap,
    38  		logger: srv.logger.ResetNamed("core.sched"),
    39  	}
    40  	return s
    41  }
    42  
    43  // Process is used to implement the scheduler.Scheduler interface
    44  func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
    45  	job := strings.Split(eval.JobID, ":") // extra data can be smuggled in w/ JobID
    46  	switch job[0] {
    47  	case structs.CoreJobEvalGC:
    48  		return c.evalGC(eval)
    49  	case structs.CoreJobNodeGC:
    50  		return c.nodeGC(eval)
    51  	case structs.CoreJobJobGC:
    52  		return c.jobGC(eval)
    53  	case structs.CoreJobDeploymentGC:
    54  		return c.deploymentGC(eval)
    55  	case structs.CoreJobCSIVolumeClaimGC:
    56  		return c.csiVolumeClaimGC(eval)
    57  	case structs.CoreJobCSIPluginGC:
    58  		return c.csiPluginGC(eval)
    59  	case structs.CoreJobForceGC:
    60  		return c.forceGC(eval)
    61  	default:
    62  		return fmt.Errorf("core scheduler cannot handle job '%s'", eval.JobID)
    63  	}
    64  }
    65  
    66  // forceGC is used to garbage collect all eligible objects.
    67  func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
    68  	if err := c.jobGC(eval); err != nil {
    69  		return err
    70  	}
    71  	if err := c.evalGC(eval); err != nil {
    72  		return err
    73  	}
    74  	if err := c.deploymentGC(eval); err != nil {
    75  		return err
    76  	}
    77  	if err := c.csiPluginGC(eval); err != nil {
    78  		return err
    79  	}
    80  	if err := c.csiVolumeClaimGC(eval); err != nil {
    81  		return err
    82  	}
    83  
    84  	// Node GC must occur after the others to ensure the allocations are
    85  	// cleared.
    86  	return c.nodeGC(eval)
    87  }
    88  
    89  // jobGC is used to garbage collect eligible jobs.
    90  func (c *CoreScheduler) jobGC(eval *structs.Evaluation) error {
    91  	// Get all the jobs eligible for garbage collection.
    92  	ws := memdb.NewWatchSet()
    93  	iter, err := c.snap.JobsByGC(ws, true)
    94  	if err != nil {
    95  		return err
    96  	}
    97  
    98  	var oldThreshold uint64
    99  	if eval.JobID == structs.CoreJobForceGC {
   100  		// The GC was forced, so set the threshold to its maximum so everything
   101  		// will GC.
   102  		oldThreshold = math.MaxUint64
   103  		c.logger.Debug("forced job GC")
   104  	} else {
   105  		// Get the time table to calculate GC cutoffs.
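         		// The time table is a rough mapping from wall-clock time to Raft index,
         		// so the cutoff time is converted into a threshold index here.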
   106  		tt := c.srv.fsm.TimeTable()
   107  		cutoff := time.Now().UTC().Add(-1 * c.srv.config.JobGCThreshold)
   108  		oldThreshold = tt.NearestIndex(cutoff)
   109  		c.logger.Debug("job GC scanning before cutoff index",
   110  			"index", oldThreshold, "job_gc_threshold", c.srv.config.JobGCThreshold)
   111  	}
   112  
   113  	// Collect the allocations, evaluations and jobs to GC
   114  	var gcAlloc, gcEval []string
   115  	var gcJob []*structs.Job
   116  
   117  OUTER:
   118  	for i := iter.Next(); i != nil; i = iter.Next() {
   119  		job := i.(*structs.Job)
   120  
   121  		// Ignore new jobs.
   122  		if job.CreateIndex > oldThreshold {
   123  			continue
   124  		}
   125  
   126  		ws := memdb.NewWatchSet()
   127  		evals, err := c.snap.EvalsByJob(ws, job.Namespace, job.ID)
   128  		if err != nil {
   129  			c.logger.Error("job GC failed to get evals for job", "job", job.ID, "error", err)
   130  			continue
   131  		}
   132  
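         		// A job is only GC eligible once every one of its evals (and their allocs) is GC eligible.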
   133  		allEvalsGC := true
   134  		var jobAlloc, jobEval []string
   135  		for _, eval := range evals {
   136  			gc, allocs, err := c.gcEval(eval, oldThreshold, true)
   137  			if err != nil {
   138  				continue OUTER
   139  			}
   140  
   141  			if gc {
   142  				jobEval = append(jobEval, eval.ID)
   143  				jobAlloc = append(jobAlloc, allocs...)
   144  			} else {
   145  				allEvalsGC = false
   146  				break
   147  			}
   148  		}
   149  
   150  		// Job is eligible for garbage collection
   151  		if allEvalsGC {
   152  			gcJob = append(gcJob, job)
   153  			gcAlloc = append(gcAlloc, jobAlloc...)
   154  			gcEval = append(gcEval, jobEval...)
   155  		}
   156  
   157  	}
   158  
   159  	// Fast-path the nothing case
   160  	if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 {
   161  		return nil
   162  	}
   163  	c.logger.Debug("job GC found eligible objects",
   164  		"jobs", len(gcJob), "evals", len(gcEval), "allocs", len(gcAlloc))
   165  
   166  	// Reap the evals and allocs
   167  	if err := c.evalReap(gcEval, gcAlloc); err != nil {
   168  		return err
   169  	}
   170  
   171  	// Reap the jobs
   172  	return c.jobReap(gcJob, eval.LeaderACL)
   173  }
   174  
   175  // jobReap contacts the leader and issues a reap on the passed jobs
   176  func (c *CoreScheduler) jobReap(jobs []*structs.Job, leaderACL string) error {
   177  	// Call to the leader to issue the reap
   178  	for _, req := range c.partitionJobReap(jobs, leaderACL) {
   179  		var resp structs.JobBatchDeregisterResponse
   180  		if err := c.srv.RPC("Job.BatchDeregister", req, &resp); err != nil {
   181  			c.logger.Error("batch job reap failed", "error", err)
   182  			return err
   183  		}
   184  	}
   185  
   186  	return nil
   187  }
   188  
   189  // partitionJobReap returns a list of JobBatchDeregisterRequests to make,
   190  // ensuring a single request does not contain too many jobs. This is necessary
   191  // to ensure that the Raft transaction does not become too large.
   192  func (c *CoreScheduler) partitionJobReap(jobs []*structs.Job, leaderACL string) []*structs.JobBatchDeregisterRequest {
   193  	option := &structs.JobDeregisterOptions{Purge: true}
   194  	var requests []*structs.JobBatchDeregisterRequest
   195  	submittedJobs := 0
   196  	for submittedJobs != len(jobs) {
   197  		req := &structs.JobBatchDeregisterRequest{
   198  			Jobs: make(map[structs.NamespacedID]*structs.JobDeregisterOptions),
   199  			WriteRequest: structs.WriteRequest{
   200  				Region:    c.srv.config.Region,
   201  				AuthToken: leaderACL,
   202  			},
   203  		}
   204  		requests = append(requests, req)
   205  		available := maxIdsPerReap
   206  
   207  		if remaining := len(jobs) - submittedJobs; remaining > 0 {
   208  			if remaining <= available {
   209  				for _, job := range jobs[submittedJobs:] {
   210  					jns := structs.NamespacedID{ID: job.ID, Namespace: job.Namespace}
   211  					req.Jobs[jns] = option
   212  				}
   213  				submittedJobs += remaining
   214  			} else {
   215  				for _, job := range jobs[submittedJobs : submittedJobs+available] {
   216  					jns := structs.NamespacedID{ID: job.ID, Namespace: job.Namespace}
   217  					req.Jobs[jns] = option
   218  				}
   219  				submittedJobs += available
   220  			}
   221  		}
   222  	}
   223  
   224  	return requests
   225  }
   226  
   227  // evalGC is used to garbage collect old evaluations
   228  func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
   229  	// Iterate over the evaluations
   230  	ws := memdb.NewWatchSet()
   231  	iter, err := c.snap.Evals(ws)
   232  	if err != nil {
   233  		return err
   234  	}
   235  
   236  	var oldThreshold uint64
   237  	if eval.JobID == structs.CoreJobForceGC {
   238  		// The GC was forced, so set the threshold to its maximum so everything
   239  		// will GC.
   240  		oldThreshold = math.MaxUint64
   241  		c.logger.Debug("forced eval GC")
   242  	} else {
   243  		// Compute the old threshold limit for GC using the FSM
   244  		// time table.  This is a rough mapping of a time to the
   245  		// Raft index it belongs to.
   246  		tt := c.srv.fsm.TimeTable()
   247  		cutoff := time.Now().UTC().Add(-1 * c.srv.config.EvalGCThreshold)
   248  		oldThreshold = tt.NearestIndex(cutoff)
   249  		c.logger.Debug("eval GC scanning before cutoff index",
   250  			"index", oldThreshold, "eval_gc_threshold", c.srv.config.EvalGCThreshold)
   251  	}
   252  
   253  	// Collect the allocations and evaluations to GC
   254  	var gcAlloc, gcEval []string
   255  	for raw := iter.Next(); raw != nil; raw = iter.Next() {
   256  		eval := raw.(*structs.Evaluation)
   257  
   258  		// The Evaluation GC should not handle batch jobs since those need to be
   259  		// garbage collected in one shot
   260  		gc, allocs, err := c.gcEval(eval, oldThreshold, false)
   261  		if err != nil {
   262  			return err
   263  		}
   264  
   265  		if gc {
   266  			gcEval = append(gcEval, eval.ID)
   267  		}
   268  		gcAlloc = append(gcAlloc, allocs...)
   269  	}
   270  
   271  	// Fast-path the nothing case
   272  	if len(gcEval) == 0 && len(gcAlloc) == 0 {
   273  		return nil
   274  	}
    275  	c.logger.Debug("eval GC found eligible objects",
   276  		"evals", len(gcEval), "allocs", len(gcAlloc))
   277  
   278  	return c.evalReap(gcEval, gcAlloc)
   279  }
   280  
    281  // gcEval returns whether the eval should be garbage collected given a Raft
    282  // threshold index. The eval is not eligible for garbage collection if it or
    283  // any of its allocs are newer than the threshold index. If the eval should be
    284  // garbage collected, the IDs of the allocs that should be removed along with
    285  // it are also returned.
   286  func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, allowBatch bool) (
   287  	bool, []string, error) {
   288  	// Ignore non-terminal and new evaluations
   289  	if !eval.TerminalStatus() || eval.ModifyIndex > thresholdIndex {
   290  		return false, nil, nil
   291  	}
   292  
   293  	// Create a watchset
   294  	ws := memdb.NewWatchSet()
   295  
   296  	// Look up the job
   297  	job, err := c.snap.JobByID(ws, eval.Namespace, eval.JobID)
   298  	if err != nil {
   299  		return false, nil, err
   300  	}
   301  
   302  	// Get the allocations by eval
   303  	allocs, err := c.snap.AllocsByEval(ws, eval.ID)
   304  	if err != nil {
   305  		c.logger.Error("failed to get allocs for eval",
   306  			"eval_id", eval.ID, "error", err)
   307  		return false, nil, err
   308  	}
   309  
   310  	// If the eval is from a running "batch" job we don't want to garbage
   311  	// collect its allocations. If there is a long running batch job and its
   312  	// terminal allocations get GC'd the scheduler would re-run the
   313  	// allocations.
   314  	if eval.Type == structs.JobTypeBatch {
   315  		// Check if the job is running
   316  
    317  		// Can collect if:
    318  		//  - the job doesn't exist
    319  		//  - the job is stopped and dead
    320  		//  - allowBatch is set and the job is dead
   321  		collect := false
   322  		if job == nil {
   323  			collect = true
   324  		} else if job.Status != structs.JobStatusDead {
   325  			collect = false
   326  		} else if job.Stop {
   327  			collect = true
   328  		} else if allowBatch {
   329  			collect = true
   330  		}
   331  
   332  		// We don't want to gc anything related to a job which is not dead
   333  		// If the batch job doesn't exist we can GC it regardless of allowBatch
   334  		if !collect {
    335  			// Find allocs from older job versions (based on CreateIndex) and GC them if terminal
   336  			oldAllocs := olderVersionTerminalAllocs(allocs, job)
   337  			return false, oldAllocs, nil
   338  		}
   339  	}
   340  
   341  	// Scan the allocations to ensure they are terminal and old
   342  	gcEval := true
   343  	var gcAllocIDs []string
   344  	for _, alloc := range allocs {
   345  		if !allocGCEligible(alloc, job, time.Now(), thresholdIndex) {
   346  			// Can't GC the evaluation since not all of the allocations are
   347  			// terminal
   348  			gcEval = false
   349  		} else {
   350  			// The allocation is eligible to be GC'd
   351  			gcAllocIDs = append(gcAllocIDs, alloc.ID)
   352  		}
   353  	}
   354  
   355  	return gcEval, gcAllocIDs, nil
   356  }
   357  
    358  // olderVersionTerminalAllocs returns the IDs of terminal allocations that
    359  // belong to an older version of the job, based on the alloc's job CreateIndex.
   360  func olderVersionTerminalAllocs(allocs []*structs.Allocation, job *structs.Job) []string {
   361  	var ret []string
   362  	for _, alloc := range allocs {
   363  		if alloc.Job != nil && alloc.Job.CreateIndex < job.CreateIndex && alloc.TerminalStatus() {
   364  			ret = append(ret, alloc.ID)
   365  		}
   366  	}
   367  	return ret
   368  }
   369  
   370  // evalReap contacts the leader and issues a reap on the passed evals and
   371  // allocs.
   372  func (c *CoreScheduler) evalReap(evals, allocs []string) error {
   373  	// Call to the leader to issue the reap
   374  	for _, req := range c.partitionEvalReap(evals, allocs) {
   375  		var resp structs.GenericResponse
   376  		if err := c.srv.RPC("Eval.Reap", req, &resp); err != nil {
   377  			c.logger.Error("eval reap failed", "error", err)
   378  			return err
   379  		}
   380  	}
   381  
   382  	return nil
   383  }
   384  
   385  // partitionEvalReap returns a list of EvalDeleteRequest to make, ensuring a single
   386  // request does not contain too many allocations and evaluations. This is
   387  // necessary to ensure that the Raft transaction does not become too large.
   388  func (c *CoreScheduler) partitionEvalReap(evals, allocs []string) []*structs.EvalDeleteRequest {
   389  	var requests []*structs.EvalDeleteRequest
   390  	submittedEvals, submittedAllocs := 0, 0
   391  	for submittedEvals != len(evals) || submittedAllocs != len(allocs) {
   392  		req := &structs.EvalDeleteRequest{
   393  			WriteRequest: structs.WriteRequest{
   394  				Region: c.srv.config.Region,
   395  			},
   396  		}
   397  		requests = append(requests, req)
   398  		available := maxIdsPerReap
   399  
   400  		// Add the allocs first
   401  		if remaining := len(allocs) - submittedAllocs; remaining > 0 {
   402  			if remaining <= available {
   403  				req.Allocs = allocs[submittedAllocs:]
   404  				available -= remaining
   405  				submittedAllocs += remaining
   406  			} else {
   407  				req.Allocs = allocs[submittedAllocs : submittedAllocs+available]
   408  				submittedAllocs += available
   409  
   410  				// Exhausted space so skip adding evals
   411  				continue
   412  			}
   413  		}
   414  
   415  		// Add the evals
   416  		if remaining := len(evals) - submittedEvals; remaining > 0 {
   417  			if remaining <= available {
   418  				req.Evals = evals[submittedEvals:]
   419  				submittedEvals += remaining
   420  			} else {
   421  				req.Evals = evals[submittedEvals : submittedEvals+available]
   422  				submittedEvals += available
   423  			}
   424  		}
   425  	}
   426  
   427  	return requests
   428  }
   429  
   430  // nodeGC is used to garbage collect old nodes
   431  func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error {
    432  	// Iterate over the nodes
   433  	ws := memdb.NewWatchSet()
   434  	iter, err := c.snap.Nodes(ws)
   435  	if err != nil {
   436  		return err
   437  	}
   438  
   439  	var oldThreshold uint64
   440  	if eval.JobID == structs.CoreJobForceGC {
   441  		// The GC was forced, so set the threshold to its maximum so everything
   442  		// will GC.
   443  		oldThreshold = math.MaxUint64
   444  		c.logger.Debug("forced node GC")
   445  	} else {
   446  		// Compute the old threshold limit for GC using the FSM
   447  		// time table.  This is a rough mapping of a time to the
   448  		// Raft index it belongs to.
   449  		tt := c.srv.fsm.TimeTable()
   450  		cutoff := time.Now().UTC().Add(-1 * c.srv.config.NodeGCThreshold)
   451  		oldThreshold = tt.NearestIndex(cutoff)
   452  		c.logger.Debug("node GC scanning before cutoff index",
   453  			"index", oldThreshold, "node_gc_threshold", c.srv.config.NodeGCThreshold)
   454  	}
   455  
   456  	// Collect the nodes to GC
   457  	var gcNode []string
   458  OUTER:
   459  	for {
   460  		raw := iter.Next()
   461  		if raw == nil {
   462  			break
   463  		}
   464  		node := raw.(*structs.Node)
   465  
   466  		// Ignore non-terminal and new nodes
   467  		if !node.TerminalStatus() || node.ModifyIndex > oldThreshold {
   468  			continue
   469  		}
   470  
   471  		// Get the allocations by node
   472  		ws := memdb.NewWatchSet()
   473  		allocs, err := c.snap.AllocsByNode(ws, node.ID)
   474  		if err != nil {
   475  			c.logger.Error("failed to get allocs for node",
   476  				"node_id", node.ID, "error", err)
   477  			continue
   478  		}
   479  
   480  		// If there are any non-terminal allocations, skip the node. If the node
   481  		// is terminal and the allocations are not, the scheduler may not have
   482  		// run yet to transition the allocs on the node to terminal. We delay
   483  		// GC'ing until this happens.
   484  		for _, alloc := range allocs {
   485  			if !alloc.TerminalStatus() {
   486  				continue OUTER
   487  			}
   488  		}
   489  
   490  		// Node is eligible for garbage collection
   491  		gcNode = append(gcNode, node.ID)
   492  	}
   493  
   494  	// Fast-path the nothing case
   495  	if len(gcNode) == 0 {
   496  		return nil
   497  	}
   498  	c.logger.Debug("node GC found eligible nodes", "nodes", len(gcNode))
   499  	return c.nodeReap(eval, gcNode)
   500  }
   501  
   502  func (c *CoreScheduler) nodeReap(eval *structs.Evaluation, nodeIDs []string) error {
   503  	// For old clusters, send single deregistration messages COMPAT(0.11)
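         	// Servers below 0.9.4 are not guaranteed to handle the batched
         	// Node.BatchDeregister RPC, so fall back to one Node.Deregister call per node.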
   504  	minVersionBatchNodeDeregister := version.Must(version.NewVersion("0.9.4"))
   505  	if !ServersMeetMinimumVersion(c.srv.Members(), minVersionBatchNodeDeregister, true) {
   506  		for _, id := range nodeIDs {
   507  			req := structs.NodeDeregisterRequest{
   508  				NodeID: id,
   509  				WriteRequest: structs.WriteRequest{
   510  					Region:    c.srv.config.Region,
   511  					AuthToken: eval.LeaderACL,
   512  				},
   513  			}
   514  			var resp structs.NodeUpdateResponse
   515  			if err := c.srv.RPC("Node.Deregister", &req, &resp); err != nil {
   516  				c.logger.Error("node reap failed", "node_id", id, "error", err)
   517  				return err
   518  			}
   519  		}
   520  		return nil
   521  	}
   522  
   523  	// Call to the leader to issue the reap
   524  	for _, ids := range partitionAll(maxIdsPerReap, nodeIDs) {
   525  		req := structs.NodeBatchDeregisterRequest{
   526  			NodeIDs: ids,
   527  			WriteRequest: structs.WriteRequest{
   528  				Region:    c.srv.config.Region,
   529  				AuthToken: eval.LeaderACL,
   530  			},
   531  		}
   532  		var resp structs.NodeUpdateResponse
   533  		if err := c.srv.RPC("Node.BatchDeregister", &req, &resp); err != nil {
   534  			c.logger.Error("node reap failed", "node_ids", ids, "error", err)
   535  			return err
   536  		}
   537  	}
   538  	return nil
   539  }
   540  
   541  // deploymentGC is used to garbage collect old deployments
   542  func (c *CoreScheduler) deploymentGC(eval *structs.Evaluation) error {
   543  	// Iterate over the deployments
   544  	ws := memdb.NewWatchSet()
   545  	iter, err := c.snap.Deployments(ws)
   546  	if err != nil {
   547  		return err
   548  	}
   549  
   550  	var oldThreshold uint64
   551  	if eval.JobID == structs.CoreJobForceGC {
   552  		// The GC was forced, so set the threshold to its maximum so everything
   553  		// will GC.
   554  		oldThreshold = math.MaxUint64
   555  		c.logger.Debug("forced deployment GC")
   556  	} else {
   557  		// Compute the old threshold limit for GC using the FSM
   558  		// time table.  This is a rough mapping of a time to the
   559  		// Raft index it belongs to.
   560  		tt := c.srv.fsm.TimeTable()
   561  		cutoff := time.Now().UTC().Add(-1 * c.srv.config.DeploymentGCThreshold)
   562  		oldThreshold = tt.NearestIndex(cutoff)
   563  		c.logger.Debug("deployment GC scanning before cutoff index",
   564  			"index", oldThreshold, "deployment_gc_threshold", c.srv.config.DeploymentGCThreshold)
   565  	}
   566  
   567  	// Collect the deployments to GC
   568  	var gcDeployment []string
   569  
   570  OUTER:
   571  	for {
   572  		raw := iter.Next()
   573  		if raw == nil {
   574  			break
   575  		}
   576  		deploy := raw.(*structs.Deployment)
   577  
   578  		// Ignore non-terminal and new deployments
   579  		if deploy.Active() || deploy.ModifyIndex > oldThreshold {
   580  			continue
   581  		}
   582  
    583  		// Look up any allocations referencing this deployment.
   584  		allocs, err := c.snap.AllocsByDeployment(ws, deploy.ID)
   585  		if err != nil {
   586  			c.logger.Error("failed to get allocs for deployment",
   587  				"deployment_id", deploy.ID, "error", err)
   588  			continue
   589  		}
   590  
    591  		// Only GC the deployment if all referencing allocations are terminal.
   592  		for _, alloc := range allocs {
   593  			if !alloc.TerminalStatus() {
   594  				continue OUTER
   595  			}
   596  		}
   597  
   598  		// Deployment is eligible for garbage collection
   599  		gcDeployment = append(gcDeployment, deploy.ID)
   600  	}
   601  
   602  	// Fast-path the nothing case
   603  	if len(gcDeployment) == 0 {
   604  		return nil
   605  	}
   606  	c.logger.Debug("deployment GC found eligible deployments", "deployments", len(gcDeployment))
   607  	return c.deploymentReap(gcDeployment)
   608  }
   609  
   610  // deploymentReap contacts the leader and issues a reap on the passed
   611  // deployments.
   612  func (c *CoreScheduler) deploymentReap(deployments []string) error {
   613  	// Call to the leader to issue the reap
   614  	for _, req := range c.partitionDeploymentReap(deployments) {
   615  		var resp structs.GenericResponse
   616  		if err := c.srv.RPC("Deployment.Reap", req, &resp); err != nil {
   617  			c.logger.Error("deployment reap failed", "error", err)
   618  			return err
   619  		}
   620  	}
   621  
   622  	return nil
   623  }
   624  
   625  // partitionDeploymentReap returns a list of DeploymentDeleteRequest to make,
   626  // ensuring a single request does not contain too many deployments. This is
   627  // necessary to ensure that the Raft transaction does not become too large.
   628  func (c *CoreScheduler) partitionDeploymentReap(deployments []string) []*structs.DeploymentDeleteRequest {
   629  	var requests []*structs.DeploymentDeleteRequest
   630  	submittedDeployments := 0
   631  	for submittedDeployments != len(deployments) {
   632  		req := &structs.DeploymentDeleteRequest{
   633  			WriteRequest: structs.WriteRequest{
   634  				Region: c.srv.config.Region,
   635  			},
   636  		}
   637  		requests = append(requests, req)
   638  		available := maxIdsPerReap
   639  
   640  		if remaining := len(deployments) - submittedDeployments; remaining > 0 {
   641  			if remaining <= available {
   642  				req.Deployments = deployments[submittedDeployments:]
   643  				submittedDeployments += remaining
   644  			} else {
   645  				req.Deployments = deployments[submittedDeployments : submittedDeployments+available]
   646  				submittedDeployments += available
   647  			}
   648  		}
   649  	}
   650  
   651  	return requests
   652  }
   653  
    654  // allocGCEligible returns whether the allocation is eligible to be garbage
    655  // collected according to its terminal status and its reschedule tracker.
   656  func allocGCEligible(a *structs.Allocation, job *structs.Job, gcTime time.Time, thresholdIndex uint64) bool {
    657  	// Not eligible unless the alloc is both terminal and old enough
   658  	if !a.TerminalStatus() || a.ModifyIndex > thresholdIndex {
   659  		return false
   660  	}
   661  
   662  	// If the allocation is still running on the client we can not garbage
   663  	// collect it.
   664  	if a.ClientStatus == structs.AllocClientStatusRunning {
   665  		return false
   666  	}
   667  
   668  	// If the job is deleted, stopped or dead all allocs can be removed
   669  	if job == nil || job.Stop || job.Status == structs.JobStatusDead {
   670  		return true
   671  	}
   672  
   673  	// If the allocation's desired state is Stop, it can be GCed even if it
   674  	// has failed and hasn't been rescheduled. This can happen during job updates
   675  	if a.DesiredStatus == structs.AllocDesiredStatusStop {
   676  		return true
   677  	}
   678  
   679  	// If the alloc hasn't failed then we don't need to consider it for rescheduling
   680  	// Rescheduling needs to copy over information from the previous alloc so that it
   681  	// can enforce the reschedule policy
   682  	if a.ClientStatus != structs.AllocClientStatusFailed {
   683  		return true
   684  	}
   685  
   686  	var reschedulePolicy *structs.ReschedulePolicy
   687  	tg := job.LookupTaskGroup(a.TaskGroup)
   688  
   689  	if tg != nil {
   690  		reschedulePolicy = tg.ReschedulePolicy
   691  	}
   692  	// No reschedule policy or rescheduling is disabled
   693  	if reschedulePolicy == nil || (!reschedulePolicy.Unlimited && reschedulePolicy.Attempts == 0) {
   694  		return true
   695  	}
   696  	// Restart tracking information has been carried forward
   697  	if a.NextAllocation != "" {
   698  		return true
   699  	}
   700  
   701  	// This task has unlimited rescheduling and the alloc has not been replaced, so we can't GC it yet
   702  	if reschedulePolicy.Unlimited {
   703  		return false
   704  	}
   705  
   706  	// No restarts have been attempted yet
   707  	if a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0 {
   708  		return false
   709  	}
   710  
   711  	// Don't GC if most recent reschedule attempt is within time interval
   712  	interval := reschedulePolicy.Interval
   713  	lastIndex := len(a.RescheduleTracker.Events)
   714  	lastRescheduleEvent := a.RescheduleTracker.Events[lastIndex-1]
   715  	timeDiff := gcTime.UTC().UnixNano() - lastRescheduleEvent.RescheduleTime
   716  
   717  	return timeDiff > interval.Nanoseconds()
   718  }
   719  
   720  // csiVolumeClaimGC is used to garbage collect CSI volume claims
   721  func (c *CoreScheduler) csiVolumeClaimGC(eval *structs.Evaluation) error {
   722  
   723  	gcClaims := func(ns, volID string) error {
   724  		req := &structs.CSIVolumeClaimRequest{
   725  			VolumeID: volID,
   726  			Claim:    structs.CSIVolumeClaimRelease,
   727  		}
   728  		req.Namespace = ns
   729  		req.Region = c.srv.config.Region
   730  		err := c.srv.RPC("CSIVolume.Claim", req, &structs.CSIVolumeClaimResponse{})
   731  		return err
   732  	}
   733  
   734  	c.logger.Trace("garbage collecting unclaimed CSI volume claims", "eval.JobID", eval.JobID)
   735  
   736  	// Volume ID smuggled in with the eval's own JobID
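         	// The eval's JobID is expected to look like "<core job name>:<volume id>[:<extra>]".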
   737  	evalVolID := strings.Split(eval.JobID, ":")
   738  
    739  	// COMPAT(1.0): 0.11.0 shipped with 3 fields. Tighten this check to len == 2
   740  	if len(evalVolID) > 1 {
   741  		volID := evalVolID[1]
   742  		return gcClaims(eval.Namespace, volID)
   743  	}
   744  
   745  	ws := memdb.NewWatchSet()
   746  
   747  	iter, err := c.snap.CSIVolumes(ws)
   748  	if err != nil {
   749  		return err
   750  	}
   751  
   752  	// Get the time table to calculate GC cutoffs.
   753  	var oldThreshold uint64
   754  	if eval.JobID == structs.CoreJobForceGC {
   755  		// The GC was forced, so set the threshold to its maximum so
   756  		// everything will GC.
   757  		oldThreshold = math.MaxUint64
   758  		c.logger.Debug("forced volume claim GC")
   759  	} else {
   760  		tt := c.srv.fsm.TimeTable()
   761  		cutoff := time.Now().UTC().Add(-1 * c.srv.config.CSIVolumeClaimGCThreshold)
   762  		oldThreshold = tt.NearestIndex(cutoff)
   763  	}
   764  
   765  	c.logger.Debug("CSI volume claim GC scanning before cutoff index",
   766  		"index", oldThreshold,
   767  		"csi_volume_claim_gc_threshold", c.srv.config.CSIVolumeClaimGCThreshold)
   768  
   769  NEXT_VOLUME:
   770  	for i := iter.Next(); i != nil; i = iter.Next() {
   771  		vol := i.(*structs.CSIVolume)
   772  
   773  		// Ignore new volumes
   774  		if vol.CreateIndex > oldThreshold {
   775  			continue
   776  		}
   777  
   778  		// we only call the claim release RPC if the volume has claims
   779  		// that no longer have valid allocations. otherwise we'd send
   780  		// out a lot of do-nothing RPCs.
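         		// A claim whose allocation no longer exists is stale; releasing it frees the volume to be claimed again.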
   781  		for id := range vol.ReadClaims {
   782  			alloc, err := c.snap.AllocByID(ws, id)
   783  			if err != nil {
   784  				return err
   785  			}
   786  			if alloc == nil {
   787  				err = gcClaims(vol.Namespace, vol.ID)
   788  				if err != nil {
   789  					return err
   790  				}
   791  				goto NEXT_VOLUME
   792  			}
   793  		}
   794  		for id := range vol.WriteClaims {
   795  			alloc, err := c.snap.AllocByID(ws, id)
   796  			if err != nil {
   797  				return err
   798  			}
   799  			if alloc == nil {
   800  				err = gcClaims(vol.Namespace, vol.ID)
   801  				if err != nil {
   802  					return err
   803  				}
   804  				goto NEXT_VOLUME
   805  			}
   806  		}
   807  		if len(vol.PastClaims) > 0 {
   808  			err = gcClaims(vol.Namespace, vol.ID)
   809  			if err != nil {
   810  				return err
   811  			}
   812  		}
   813  
   814  	}
   815  	return nil
   816  
   817  }
   818  
   819  // csiPluginGC is used to garbage collect unused plugins
   820  func (c *CoreScheduler) csiPluginGC(eval *structs.Evaluation) error {
   821  
   822  	ws := memdb.NewWatchSet()
   823  
   824  	iter, err := c.snap.CSIPlugins(ws)
   825  	if err != nil {
   826  		return err
   827  	}
   828  
   829  	// Get the time table to calculate GC cutoffs.
   830  	var oldThreshold uint64
   831  	if eval.JobID == structs.CoreJobForceGC {
   832  		// The GC was forced, so set the threshold to its maximum so
   833  		// everything will GC.
   834  		oldThreshold = math.MaxUint64
   835  		c.logger.Debug("forced plugin GC")
   836  	} else {
   837  		tt := c.srv.fsm.TimeTable()
   838  		cutoff := time.Now().UTC().Add(-1 * c.srv.config.CSIPluginGCThreshold)
   839  		oldThreshold = tt.NearestIndex(cutoff)
   840  	}
   841  
   842  	c.logger.Debug("CSI plugin GC scanning before cutoff index",
   843  		"index", oldThreshold, "csi_plugin_gc_threshold", c.srv.config.CSIPluginGCThreshold)
   844  
   845  	for i := iter.Next(); i != nil; i = iter.Next() {
   846  		plugin := i.(*structs.CSIPlugin)
   847  
   848  		// Ignore new plugins
   849  		if plugin.CreateIndex > oldThreshold {
   850  			continue
   851  		}
   852  
   853  		req := &structs.CSIPluginDeleteRequest{ID: plugin.ID}
   854  		req.Region = c.srv.Region()
   855  		err := c.srv.RPC("CSIPlugin.Delete", req, &structs.CSIPluginDeleteResponse{})
   856  		if err != nil {
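         			// A plugin that is still in use cannot be deleted yet; skip it and let a later GC pass retry.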
   857  			if err.Error() == "plugin in use" {
   858  				continue
   859  			}
   860  			c.logger.Error("failed to GC plugin", "plugin_id", plugin.ID, "error", err)
   861  			return err
   862  		}
   863  	}
   864  	return nil
   865  }