github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/nomad/drainer/watch_jobs.go

package drainer

import (
	"context"
	"fmt"
	"sync"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"golang.org/x/time/rate"
)

type DrainRequest struct {
	Allocs []*structs.Allocation
	Resp   *structs.BatchFuture
}

func NewDrainRequest(allocs []*structs.Allocation) *DrainRequest {
	return &DrainRequest{
		Allocs: allocs,
		Resp:   structs.NewBatchFuture(),
	}
}

// DrainingJobWatcher is the interface for watching a job drain
type DrainingJobWatcher interface {
	// RegisterJobs is used to start watching draining jobs
	RegisterJobs(jobs []structs.NamespacedID)

	// Drain returns a channel that emits allocations that should be drained.
	Drain() <-chan *DrainRequest

	// Migrated returns a channel that emits allocations for draining jobs that
	// have transitioned to stop. There is no guarantee that duplicates won't
	// be published.
	Migrated() <-chan []*structs.Allocation
}
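
// exampleConsumeJobWatcher is an illustrative sketch, not part of the original
// source. It shows how a consumer of DrainingJobWatcher might service the two
// channels: a drain request is acknowledged through its BatchFuture once the
// allocations' desired transitions have been committed (the commit itself is
// elided here), and migrated allocations are simply observed.
func exampleConsumeJobWatcher(ctx context.Context, w DrainingJobWatcher, logger log.Logger) {
	for {
		select {
		case req := <-w.Drain():
			// Commit req.Allocs (e.g. apply their DesiredTransition), then
			// unblock the watcher with the resulting index and error.
			var committedIndex uint64
			var commitErr error
			req.Resp.Respond(committedIndex, commitErr)
		case allocs := <-w.Migrated():
			// Terminal allocations on draining nodes; duplicates are possible.
			logger.Trace("observed migrated allocations", "num_allocs", len(allocs))
		case <-ctx.Done():
			return
		}
	}
}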

// drainingJobWatcher is used to watch draining jobs and emit events when
// draining allocations have replacements
type drainingJobWatcher struct {
	ctx    context.Context
	logger log.Logger

	// state is the state store that is watched for changes.
	state *state.StateStore

	// limiter is used to limit the rate of blocking queries
	limiter *rate.Limiter

	// jobs is the set of tracked jobs.
	jobs map[structs.NamespacedID]struct{}

	// queryCtx is used to cancel a blocking query.
	queryCtx    context.Context
	queryCancel context.CancelFunc

	// drainCh and migratedCh are used to emit allocations
	drainCh    chan *DrainRequest
	migratedCh chan []*structs.Allocation

	l sync.RWMutex
}

// NewDrainingJobWatcher returns a new job watcher. The caller is expected to
// cancel the context to clean up the drainer.
func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger) *drainingJobWatcher {

	// Create a context that can cancel the blocking query so that when a new
	// job gets registered it is handled.
	queryCtx, queryCancel := context.WithCancel(ctx)

	w := &drainingJobWatcher{
		ctx:         ctx,
		queryCtx:    queryCtx,
		queryCancel: queryCancel,
		limiter:     limiter,
		logger:      logger.Named("job_watcher"),
		state:       state,
		jobs:        make(map[structs.NamespacedID]struct{}, 64),
		drainCh:     make(chan *DrainRequest),
		migratedCh:  make(chan []*structs.Allocation),
	}

	go w.watch()
	return w
}
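
// exampleStartJobWatcher is an illustrative sketch, not part of the original
// source. It shows the lifecycle the doc comment above describes: build the
// watcher with a rate limiter, let it run, and cancel the context to stop the
// background watch goroutine. The store and logger are assumed to be supplied
// by the caller, and the limit of 10 queries per second is arbitrary.
func exampleStartJobWatcher(store *state.StateStore, logger log.Logger) {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel() // cancelling the context shuts down the watch goroutine

	limiter := rate.NewLimiter(rate.Limit(10), 10)
	watcher := NewDrainingJobWatcher(ctx, limiter, store, logger)
	_ = watcher // consume watcher.Drain() and watcher.Migrated() elsewhere
}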

// RegisterJobs marks the given jobs as draining and adds them to the watched
// set.
func (w *drainingJobWatcher) RegisterJobs(jobs []structs.NamespacedID) {
	w.l.Lock()
	defer w.l.Unlock()

	updated := false
	for _, jns := range jobs {
		if _, ok := w.jobs[jns]; ok {
			continue
		}

		// Add the job; the query context is cancelled below so the in-flight
		// blocking query picks it up.
		w.logger.Trace("registering job", "job", jns)
		w.jobs[jns] = struct{}{}
		updated = true
	}

	if updated {
		w.queryCancel()

		// Create a new query context
		w.queryCtx, w.queryCancel = context.WithCancel(w.ctx)
	}
}
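
// exampleRegisterDrainingJobs is an illustrative sketch, not part of the
// original source. Registering new jobs cancels the in-flight blocking query
// so the watcher starts tracking them immediately; the job IDs and namespace
// below are made up.
func exampleRegisterDrainingJobs(w DrainingJobWatcher) {
	w.RegisterJobs([]structs.NamespacedID{
		{ID: "web", Namespace: structs.DefaultNamespace},
		{ID: "batch-etl", Namespace: "analytics"},
	})
}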

// Drain returns the channel that emits allocations to drain.
func (w *drainingJobWatcher) Drain() <-chan *DrainRequest {
	return w.drainCh
}

// Migrated returns the channel that emits allocations for draining jobs that
// have been migrated.
func (w *drainingJobWatcher) Migrated() <-chan []*structs.Allocation {
	return w.migratedCh
}

// deregisterJob removes the job from being watched.
func (w *drainingJobWatcher) deregisterJob(jobID, namespace string) {
	w.l.Lock()
	defer w.l.Unlock()
	jns := structs.NamespacedID{
		ID:        jobID,
		Namespace: namespace,
	}
	delete(w.jobs, jns)
	w.logger.Trace("deregistering job", "job", jns)
}

// watch is the long-lived watching routine that detects job drain changes.
func (w *drainingJobWatcher) watch() {
	timer, stop := helper.NewSafeTimer(stateReadErrorDelay)
	defer stop()

	waitIndex := uint64(1)

	for {
		timer.Reset(stateReadErrorDelay)

		w.logger.Trace("getting job allocs at index", "index", waitIndex)
		jobAllocs, index, err := w.getJobAllocs(w.getQueryCtx(), waitIndex)

		if err != nil {
			if err == context.Canceled {
				// Determine if it is a cancel or a shutdown
				select {
				case <-w.ctx.Done():
					return
				default:
					// The query context was cancelled;
					// reset index so we don't miss past
					// updates to newly registered jobs
					waitIndex = 1
					continue
				}
			}

			w.logger.Error("error watching job allocs updates at index", "index", waitIndex, "error", err)
			select {
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			case <-timer.C:
				continue
			}
		}
		w.logger.Trace("retrieved allocs for draining jobs", "num_allocs", len(jobAllocs), "index", index)

		lastHandled := waitIndex
		waitIndex = index

		// Snapshot the state store
		snap, err := w.state.Snapshot()
		if err != nil {
			w.logger.Warn("failed to snapshot statestore", "error", err)
			continue
		}

		currentJobs := w.drainingJobs()
		var allDrain, allMigrated []*structs.Allocation
		for jns, allocs := range jobAllocs {
			// Check if the job is still registered
			if _, ok := currentJobs[jns]; !ok {
				w.logger.Trace("skipping job as it is no longer registered for draining", "job", jns)
				continue
			}

			w.logger.Trace("handling job", "job", jns)

			// Lookup the job
			job, err := snap.JobByID(nil, jns.Namespace, jns.ID)
			if err != nil {
				w.logger.Warn("failed to lookup job", "job", jns, "error", err)
				continue
			}

			// Ignore purged jobs
			if job == nil {
				w.logger.Trace("ignoring garbage collected job", "job", jns)
				w.deregisterJob(jns.ID, jns.Namespace)
				continue
			}

			// Ignore any system jobs
			if job.Type == structs.JobTypeSystem {
				w.deregisterJob(job.ID, job.Namespace)
				continue
			}

			result, err := handleJob(snap, job, allocs, lastHandled)
			if err != nil {
				w.logger.Error("handling drain for job failed", "job", jns, "error", err)
				continue
			}

			w.logger.Trace("received result for job", "job", jns, "result", result)

			allDrain = append(allDrain, result.drain...)
			allMigrated = append(allMigrated, result.migrated...)

			// Stop tracking this job
			if result.done {
				w.deregisterJob(job.ID, job.Namespace)
			}
		}

		if len(allDrain) != 0 {
			// Create the request
			req := NewDrainRequest(allDrain)
			w.logger.Trace("sending drain request for allocs", "num_allocs", len(allDrain))

			select {
			case w.drainCh <- req:
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}

			// Wait for the request to be committed
			select {
			case <-req.Resp.WaitCh():
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}

			// See if it successfully committed
			if err := req.Resp.Error(); err != nil {
				w.logger.Error("failed to transition allocations", "error", err)
			}

			// Wait until the new index
			if index := req.Resp.Index(); index > waitIndex {
				waitIndex = index
			}
		}

		if len(allMigrated) != 0 {
			w.logger.Trace("sending migrated for allocs", "num_allocs", len(allMigrated))
			select {
			case w.migratedCh <- allMigrated:
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}
		}
	}
}

// jobResult is the set of actions to take for a draining job given its current
// state.
type jobResult struct {
	// drain is the set of allocations to emit for draining.
	drain []*structs.Allocation

	// migrated is the set of allocations to emit as migrated
	migrated []*structs.Allocation

	// done marks whether the job has been fully drained.
	done bool
}

// newJobResult returns a jobResult with done=true. It is the responsibility of
// callers to set done=false when a remaining drainable alloc is found.
func newJobResult() *jobResult {
	return &jobResult{
		done: true,
	}
}

func (r *jobResult) String() string {
	return fmt.Sprintf("Drain %d ; Migrate %d ; Done %v", len(r.drain), len(r.migrated), r.done)
}

// handleJob takes the state of a draining job and returns the desired actions.
func handleJob(snap *state.StateSnapshot, job *structs.Job, allocs []*structs.Allocation, lastHandledIndex uint64) (*jobResult, error) {
	r := newJobResult()
	batch := job.Type == structs.JobTypeBatch
	taskGroups := make(map[string]*structs.TaskGroup, len(job.TaskGroups))
	for _, tg := range job.TaskGroups {
		// Only capture groups that have a migrate strategy, or all groups when
		// watching a batch job.
		if tg.Migrate != nil || batch {
			taskGroups[tg.Name] = tg
		}
	}

	// Group the allocations by task group
	tgAllocs := make(map[string][]*structs.Allocation, len(taskGroups))
	for _, alloc := range allocs {
		if _, ok := taskGroups[alloc.TaskGroup]; !ok {
			continue
		}

		tgAllocs[alloc.TaskGroup] = append(tgAllocs[alloc.TaskGroup], alloc)
	}

	for name, tg := range taskGroups {
		allocs := tgAllocs[name]
		if err := handleTaskGroup(snap, batch, tg, allocs, lastHandledIndex, r); err != nil {
			return nil, fmt.Errorf("drain for task group %q failed: %v", name, err)
		}
	}

	return r, nil
}

// handleTaskGroup takes the state of a draining task group and computes the
// desired actions. For batch jobs we only notify when they have been migrated
// and never mark them for drain. Batch jobs are allowed to complete up until
// the deadline, after which they are force killed.
func handleTaskGroup(snap *state.StateSnapshot, batch bool, tg *structs.TaskGroup,
	allocs []*structs.Allocation, lastHandledIndex uint64, result *jobResult) error {

	// Determine how many allocations can be drained
	drainingNodes := make(map[string]bool, 4)
	healthy := 0
	remainingDrainingAlloc := false
	var drainable []*structs.Allocation

	for _, alloc := range allocs {
		// Check if the alloc is on a draining node.
		onDrainingNode, ok := drainingNodes[alloc.NodeID]
		if !ok {
			// Look up the node
			node, err := snap.NodeByID(nil, alloc.NodeID)
			if err != nil {
				return err
			}

			// Check if the node exists and whether it has a drain strategy
			onDrainingNode = node != nil && node.DrainStrategy != nil
			drainingNodes[alloc.NodeID] = onDrainingNode
		}

		// Check if the alloc should be considered migrated. A migrated
		// allocation is one that is terminal, is on a draining node, and was
		// modified after our last handled index, to avoid emitting many
		// duplicate migrate events.
		if alloc.TerminalStatus() &&
			onDrainingNode &&
			alloc.ModifyIndex > lastHandledIndex {
			result.migrated = append(result.migrated, alloc)
			continue
		}

		// If the service alloc is non-terminal and has its deployment health
		// set, it is considered healthy from a migration standpoint.
		if !batch && !alloc.TerminalStatus() && alloc.DeploymentStatus.HasHealth() {
			healthy++
		}

		// An alloc can't be considered for migration if:
		// - It isn't on a draining node
		// - It is already terminal
		if !onDrainingNode || alloc.TerminalStatus() {
			continue
		}

		// Capture the fact that there is an allocation that is still draining
		// for this job.
		remainingDrainingAlloc = true

		// If we haven't marked this allocation for migration already, capture
		// it as eligible for draining.
		if !batch && !alloc.DesiredTransition.ShouldMigrate() {
			drainable = append(drainable, alloc)
		}
	}

	// Update the done status
	if remainingDrainingAlloc {
		result.done = false
	}

	// We don't mark batch allocations for drain, so exit early
	if batch {
		return nil
	}

	// Determine how many we can drain
	thresholdCount := tg.Count - tg.Migrate.MaxParallel
	numToDrain := healthy - thresholdCount
	numToDrain = helper.Min(len(drainable), numToDrain)
	if numToDrain <= 0 {
		return nil
	}

	result.drain = append(result.drain, drainable[0:numToDrain]...)
	return nil
}
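
// exampleMaxParallelMath is an illustrative sketch, not part of the original
// source. It spells out the threshold arithmetic used above with made-up
// numbers: a group with count 5 and MaxParallel 2 must keep at least 3
// allocations untouched, so with 5 healthy allocations at most 2 are drained
// in a single pass.
func exampleMaxParallelMath() {
	tg := &structs.TaskGroup{
		Name:    "web",
		Count:   5,
		Migrate: &structs.MigrateStrategy{MaxParallel: 2},
	}

	healthy := 5                                        // healthy, non-terminal allocations
	thresholdCount := tg.Count - tg.Migrate.MaxParallel // 5 - 2 = 3 must remain untouched
	numToDrain := healthy - thresholdCount              // 5 - 3 = 2 may drain now

	fmt.Println(thresholdCount, numToDrain) // prints: 3 2
}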

// getJobAllocs returns all allocations for draining jobs
func (w *drainingJobWatcher) getJobAllocs(ctx context.Context, minIndex uint64) (map[structs.NamespacedID][]*structs.Allocation, uint64, error) {
	if err := w.limiter.Wait(ctx); err != nil {
		return nil, 0, err
	}

	resp, index, err := w.state.BlockingQuery(w.getJobAllocsImpl, minIndex, ctx)
	if err != nil {
		return nil, 0, err
	}
	if resp == nil {
		return nil, index, nil
	}

	return resp.(map[structs.NamespacedID][]*structs.Allocation), index, nil
}

// getJobAllocsImpl returns a map of draining jobs to their allocations.
func (w *drainingJobWatcher) getJobAllocsImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	index, err := state.Index("allocs")
	if err != nil {
		return nil, 0, err
	}

	// Capture the draining jobs.
	draining := w.drainingJobs()
	l := len(draining)
	if l == 0 {
		return nil, index, nil
	}

	// Capture the allocs for each draining job.
	var maxIndex uint64 = 0
	resp := make(map[structs.NamespacedID][]*structs.Allocation, l)
	for jns := range draining {
		allocs, err := state.AllocsByJob(ws, jns.Namespace, jns.ID, false)
		if err != nil {
			return nil, index, err
		}

		resp[jns] = allocs
		for _, alloc := range allocs {
			if maxIndex < alloc.ModifyIndex {
				maxIndex = alloc.ModifyIndex
			}
		}
	}

	// Prefer using the actual max index of affected allocs since it means less
	// unblocking
	if maxIndex != 0 {
		index = maxIndex
	}

	return resp, index, nil
}

// drainingJobs captures the set of draining jobs.
func (w *drainingJobWatcher) drainingJobs() map[structs.NamespacedID]struct{} {
	w.l.RLock()
	defer w.l.RUnlock()

	l := len(w.jobs)
	if l == 0 {
		return nil
	}

	draining := make(map[structs.NamespacedID]struct{}, l)
	for k := range w.jobs {
		draining[k] = struct{}{}
	}

	return draining
}

// getQueryCtx is a helper for getting the query context.
func (w *drainingJobWatcher) getQueryCtx() context.Context {
	w.l.RLock()
	defer w.l.RUnlock()
	return w.queryCtx
}