github.com/hernad/nomad@v1.6.112/nomad/drainer/watch_jobs.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package drainer

import (
	"context"
	"fmt"
	"sync"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"

	"github.com/hernad/nomad/helper"
	"github.com/hernad/nomad/nomad/state"
	"github.com/hernad/nomad/nomad/structs"
	"golang.org/x/time/rate"
)

type DrainRequest struct {
	Allocs []*structs.Allocation
	Resp   *structs.BatchFuture
}

func NewDrainRequest(allocs []*structs.Allocation) *DrainRequest {
	return &DrainRequest{
		Allocs: allocs,
		Resp:   structs.NewBatchFuture(),
	}
}
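
// Note: Resp is how a drain request is acknowledged. watch() below sends a
// DrainRequest on the drain channel and then blocks on Resp.WaitCh() until the
// receiver completes the future; the commit error and index are then read back
// via Resp.Error() and Resp.Index().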

// DrainingJobWatcher is the interface for watching a job drain
type DrainingJobWatcher interface {
	// RegisterJobs is used to start watching the given draining jobs
	RegisterJobs(jobs []structs.NamespacedID)

	// Drain is used to emit allocations that should be drained.
	Drain() <-chan *DrainRequest

	// Migrated emits allocations for draining jobs that have transitioned to
	// stop. There is no guarantee that duplicates won't be published.
	Migrated() <-chan []*structs.Allocation
}
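
// A consumer of this interface is expected to service both channels. A minimal
// sketch of such a consumer loop, assuming a hypothetical applyDrainTransition
// helper and that the structs.BatchFuture is completed via its
// Respond(index, err) method:
//
//	for {
//		select {
//		case req := <-watcher.Drain():
//			// Transition req.Allocs to migrate, then complete the future
//			// with the resulting index so the watcher can continue.
//			index, err := applyDrainTransition(req.Allocs)
//			req.Resp.Respond(index, err)
//		case allocs := <-watcher.Migrated():
//			// React to allocations that have finished migrating.
//			_ = allocs
//		case <-ctx.Done():
//			return
//		}
//	}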

// drainingJobWatcher is used to watch draining jobs and emit events when
// draining allocations have replacements
type drainingJobWatcher struct {
	ctx    context.Context
	logger log.Logger

	// state is the state store that is watched for changes.
	state *state.StateStore

	// limiter is used to limit the rate of blocking queries
	limiter *rate.Limiter

	// jobs is the set of tracked jobs.
	jobs map[structs.NamespacedID]struct{}

	// queryCtx is used to cancel a blocking query.
	queryCtx    context.Context
	queryCancel context.CancelFunc

	// drainCh and migratedCh are used to emit allocations
	drainCh    chan *DrainRequest
	migratedCh chan []*structs.Allocation

	l sync.RWMutex
}

// NewDrainingJobWatcher returns a new job watcher. The caller is expected to
// cancel the context to clean up the drainer.
func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger) *drainingJobWatcher {

	// Create a context that can cancel the blocking query so that when a new
	// job gets registered it is handled.
	queryCtx, queryCancel := context.WithCancel(ctx)

	w := &drainingJobWatcher{
		ctx:         ctx,
		queryCtx:    queryCtx,
		queryCancel: queryCancel,
		limiter:     limiter,
		logger:      logger.Named("job_watcher"),
		state:       state,
		jobs:        make(map[structs.NamespacedID]struct{}, 64),
		drainCh:     make(chan *DrainRequest),
		migratedCh:  make(chan []*structs.Allocation),
	}

	go w.watch()
	return w
}
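
// A construction sketch (the rate limit values are illustrative, and
// stateStore/logger are assumed to be in scope):
//
//	ctx, cancel := context.WithCancel(context.Background())
//	defer cancel() // cancelling the context stops the watch goroutine
//	limiter := rate.NewLimiter(rate.Limit(10), 10)
//	watcher := NewDrainingJobWatcher(ctx, limiter, stateStore, logger)
//	watcher.RegisterJobs([]structs.NamespacedID{{ID: "web", Namespace: "default"}})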

// RegisterJobs marks the given jobs as draining and adds them to the set
// being watched.
func (w *drainingJobWatcher) RegisterJobs(jobs []structs.NamespacedID) {
	w.l.Lock()
	defer w.l.Unlock()

	updated := false
	for _, jns := range jobs {
		if _, ok := w.jobs[jns]; ok {
			continue
		}

		// Add the job and note that the query context must be reset
		w.logger.Trace("registering job", "job", jns)
		w.jobs[jns] = struct{}{}
		updated = true
	}

	if updated {
		w.queryCancel()

		// Create a new query context
		w.queryCtx, w.queryCancel = context.WithCancel(w.ctx)
	}
}
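
// Cancelling the query context in RegisterJobs interrupts any in-flight
// blocking query in watch(); watch() treats that cancellation as a signal to
// reset its wait index to 1 and re-query immediately, so allocations of newly
// registered jobs are picked up without waiting for a state change.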

// Drain returns the channel that emits allocations to drain.
func (w *drainingJobWatcher) Drain() <-chan *DrainRequest {
	return w.drainCh
}

// Migrated returns the channel that emits allocations for draining jobs that
// have been migrated.
func (w *drainingJobWatcher) Migrated() <-chan []*structs.Allocation {
	return w.migratedCh
}

// deregisterJob removes the job from being watched.
func (w *drainingJobWatcher) deregisterJob(jobID, namespace string) {
	w.l.Lock()
	defer w.l.Unlock()
	jns := structs.NamespacedID{
		ID:        jobID,
		Namespace: namespace,
	}
	delete(w.jobs, jns)
	w.logger.Trace("deregistering job", "job", jns)
}

// watch is the long lived watching routine that detects job drain changes.
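// Each pass of the loop roughly does the following: block in getJobAllocs
// until the allocations of a tracked job change, snapshot the state store,
// compute a jobResult for every tracked job via handleJob, send any drainable
// allocations on the drain channel and wait for that request to be committed,
// and finally send any migrated allocations on the migrated channel.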
func (w *drainingJobWatcher) watch() {
	timer, stop := helper.NewSafeTimer(stateReadErrorDelay)
	defer stop()

	waitIndex := uint64(1)

	for {
		timer.Reset(stateReadErrorDelay)

		w.logger.Trace("getting job allocs at index", "index", waitIndex)
		jobAllocs, index, err := w.getJobAllocs(w.getQueryCtx(), waitIndex)

		if err != nil {
			if err == context.Canceled {
				// Determine if it is a cancel or a shutdown
				select {
				case <-w.ctx.Done():
					return
				default:
					// The query context was cancelled;
					// reset index so we don't miss past
					// updates to newly registered jobs
					waitIndex = 1
					continue
				}
			}

			w.logger.Error("error watching job allocs updates at index", "index", waitIndex, "error", err)
			select {
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			case <-timer.C:
				continue
			}
		}
		w.logger.Trace("retrieved allocs for draining jobs", "num_allocs", len(jobAllocs), "index", index)

		lastHandled := waitIndex
		waitIndex = index

		// Snapshot the state store
		snap, err := w.state.Snapshot()
		if err != nil {
			w.logger.Warn("failed to snapshot statestore", "error", err)
			continue
		}

		currentJobs := w.drainingJobs()
		var allDrain, allMigrated []*structs.Allocation
		for jns, allocs := range jobAllocs {
			// Check if the job is still registered
			if _, ok := currentJobs[jns]; !ok {
				w.logger.Trace("skipping job as it is no longer registered for draining", "job", jns)
				continue
			}

			w.logger.Trace("handling job", "job", jns)

			// Lookup the job
			job, err := snap.JobByID(nil, jns.Namespace, jns.ID)
			if err != nil {
				w.logger.Warn("failed to lookup job", "job", jns, "error", err)
				continue
			}

			// Ignore purged jobs
			if job == nil {
				w.logger.Trace("ignoring garbage collected job", "job", jns)
				w.deregisterJob(jns.ID, jns.Namespace)
				continue
			}

			// Ignore any system jobs
			if job.Type == structs.JobTypeSystem {
				w.deregisterJob(job.ID, job.Namespace)
				continue
			}

			result, err := handleJob(snap, job, allocs, lastHandled)
			if err != nil {
				w.logger.Error("handling drain for job failed", "job", jns, "error", err)
				continue
			}

			w.logger.Trace("received result for job", "job", jns, "result", result)

			allDrain = append(allDrain, result.drain...)
			allMigrated = append(allMigrated, result.migrated...)

			// Stop tracking this job
			if result.done {
				w.deregisterJob(job.ID, job.Namespace)
			}
		}

		if len(allDrain) != 0 {
			// Create the request
			req := NewDrainRequest(allDrain)
			w.logger.Trace("sending drain request for allocs", "num_allocs", len(allDrain))

			select {
			case w.drainCh <- req:
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}

			// Wait for the request to be committed
			select {
			case <-req.Resp.WaitCh():
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}

			// See if it successfully committed
			if err := req.Resp.Error(); err != nil {
				w.logger.Error("failed to transition allocations", "error", err)
			}

			// Wait until the new index
			if index := req.Resp.Index(); index > waitIndex {
				waitIndex = index
			}
		}

		if len(allMigrated) != 0 {
			w.logger.Trace("sending migrated for allocs", "num_allocs", len(allMigrated))
			select {
			case w.migratedCh <- allMigrated:
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}
		}
	}
}

// jobResult is the set of actions to take for a draining job given its current
// state.
type jobResult struct {
	// drain is the set of allocations to emit for draining.
	drain []*structs.Allocation

	// migrated is the set of allocations to emit as migrated
	migrated []*structs.Allocation

	// done marks whether the job has been fully drained.
	done bool
}

// newJobResult returns a jobResult with done=true. It is the responsibility of
// callers to set done=false when a remaining drainable alloc is found.
func newJobResult() *jobResult {
	return &jobResult{
		done: true,
	}
}

func (r *jobResult) String() string {
	return fmt.Sprintf("Drain %d ; Migrate %d ; Done %v", len(r.drain), len(r.migrated), r.done)
}

// handleJob takes the state of a draining job and returns the desired actions.
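// Only task groups that have a migrate strategy are considered, except for
// batch jobs, where every group is watched; allocations belonging to other
// groups are ignored. The per-group results are merged into a single jobResult.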
func handleJob(snap *state.StateSnapshot, job *structs.Job, allocs []*structs.Allocation, lastHandledIndex uint64) (*jobResult, error) {
	r := newJobResult()
	batch := job.Type == structs.JobTypeBatch
	taskGroups := make(map[string]*structs.TaskGroup, len(job.TaskGroups))
	for _, tg := range job.TaskGroups {
		// Only capture the groups that have a migrate strategy, or all of the
		// groups if we are watching a batch job
		if tg.Migrate != nil || batch {
			taskGroups[tg.Name] = tg
		}
	}

	// Group the allocations by task group
	tgAllocs := make(map[string][]*structs.Allocation, len(taskGroups))
	for _, alloc := range allocs {
		if _, ok := taskGroups[alloc.TaskGroup]; !ok {
			continue
		}

		tgAllocs[alloc.TaskGroup] = append(tgAllocs[alloc.TaskGroup], alloc)
	}

	for name, tg := range taskGroups {
		allocs := tgAllocs[name]
		if err := handleTaskGroup(snap, batch, tg, allocs, lastHandledIndex, r); err != nil {
			return nil, fmt.Errorf("drain for task group %q failed: %v", name, err)
		}
	}

	return r, nil
}

// handleTaskGroup takes the state of a draining task group and computes the
// desired actions. For batch jobs we only notify when they have been migrated
// and never mark them for drain. Batch jobs are allowed to complete up until
// the deadline, after which they are force killed.
func handleTaskGroup(snap *state.StateSnapshot, batch bool, tg *structs.TaskGroup,
	allocs []*structs.Allocation, lastHandledIndex uint64, result *jobResult) error {

	// Determine how many allocations can be drained
	drainingNodes := make(map[string]bool, 4)
	healthy := 0
	remainingDrainingAlloc := false
	var drainable []*structs.Allocation

	for _, alloc := range allocs {
		// Check if the alloc is on a draining node.
		onDrainingNode, ok := drainingNodes[alloc.NodeID]
		if !ok {
			// Look up the node
			node, err := snap.NodeByID(nil, alloc.NodeID)
			if err != nil {
				return err
			}

			// Check if the node exists and whether it has a drain strategy
			onDrainingNode = node != nil && node.DrainStrategy != nil
			drainingNodes[alloc.NodeID] = onDrainingNode
		}

		// Check if the alloc should be considered migrated. A migrated
		// allocation is one that is terminal on the client, is on a draining
		// node, and has been updated since our last handled index to
		// avoid emitting many duplicate migrate events.
		if alloc.ClientTerminalStatus() &&
			onDrainingNode &&
			alloc.ModifyIndex > lastHandledIndex {
			result.migrated = append(result.migrated, alloc)
			continue
		}

		// If the service alloc is non-terminal and has its deployment health
		// set, it is considered healthy from a migration standpoint.
		if !batch && !alloc.TerminalStatus() && alloc.DeploymentStatus.HasHealth() {
			healthy++
		}

		// An alloc can't be considered for migration if:
		// - It isn't on a draining node
		// - It is already terminal on the client
		if !onDrainingNode || alloc.ClientTerminalStatus() {
			continue
		}

		// Capture the fact that there is an allocation that is still draining
		// for this job.
		remainingDrainingAlloc = true

		// If we haven't marked this allocation for migration already, capture
		// it as eligible for draining.
		if !batch && !alloc.DesiredTransition.ShouldMigrate() {
			drainable = append(drainable, alloc)
		}
	}

	// Update the done status
	if remainingDrainingAlloc {
		result.done = false
	}

	// We don't mark batch allocations for drain, so exit
	if batch {
		return nil
	}

	// Determine how many we can drain
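	// As a worked example with illustrative numbers: if tg.Count is 4 and
	// tg.Migrate.MaxParallel is 1, the threshold is 3; with 4 healthy allocs
	// we may drain 4-3=1 alloc this pass, further capped by the number of
	// drainable allocs found above.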
	thresholdCount := tg.Count - tg.Migrate.MaxParallel
	numToDrain := healthy - thresholdCount
	numToDrain = helper.Min(len(drainable), numToDrain)
	if numToDrain <= 0 {
		return nil
	}

	result.drain = append(result.drain, drainable[0:numToDrain]...)
	return nil
}

// getJobAllocs returns all allocations for draining jobs.
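// The blocking query first waits on the watcher's rate limiter and then blocks
// until the allocations of the tracked jobs change at an index greater than
// minIndex, or until ctx is cancelled; the returned index is the one the next
// call should wait on.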
func (w *drainingJobWatcher) getJobAllocs(ctx context.Context, minIndex uint64) (map[structs.NamespacedID][]*structs.Allocation, uint64, error) {
	if err := w.limiter.Wait(ctx); err != nil {
		return nil, 0, err
	}

	resp, index, err := w.state.BlockingQuery(w.getJobAllocsImpl, minIndex, ctx)
	if err != nil {
		return nil, 0, err
	}
	if resp == nil {
		return nil, index, nil
	}

	return resp.(map[structs.NamespacedID][]*structs.Allocation), index, nil
}

// getJobAllocsImpl returns a map of draining jobs to their allocations.
func (w *drainingJobWatcher) getJobAllocsImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	index, err := state.Index("allocs")
	if err != nil {
		return nil, 0, err
	}

	// Capture the draining jobs.
	draining := w.drainingJobs()
	l := len(draining)
	if l == 0 {
		return nil, index, nil
	}

	// Capture the allocs for each draining job.
	var maxIndex uint64 = 0
	resp := make(map[structs.NamespacedID][]*structs.Allocation, l)
	for jns := range draining {
		allocs, err := state.AllocsByJob(ws, jns.Namespace, jns.ID, false)
		if err != nil {
			return nil, index, err
		}

		resp[jns] = allocs
		for _, alloc := range allocs {
			if maxIndex < alloc.ModifyIndex {
				maxIndex = alloc.ModifyIndex
			}
		}
	}

	// Prefer using the actual max index of affected allocs since it means less
	// unblocking
	if maxIndex != 0 {
		index = maxIndex
	}

	return resp, index, nil
}

// drainingJobs captures the set of draining jobs.
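// It returns a copy of the tracked set (or nil when empty) so callers can
// iterate over it without holding the watcher's lock.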
func (w *drainingJobWatcher) drainingJobs() map[structs.NamespacedID]struct{} {
	w.l.RLock()
	defer w.l.RUnlock()

	l := len(w.jobs)
	if l == 0 {
		return nil
	}

	draining := make(map[structs.NamespacedID]struct{}, l)
	for k := range w.jobs {
		draining[k] = struct{}{}
	}

	return draining
}

// getQueryCtx is a helper for getting the query context.
func (w *drainingJobWatcher) getQueryCtx() context.Context {
	w.l.RLock()
	defer w.l.RUnlock()
	return w.queryCtx
}