github.com/bigcommerce/nomad@v0.9.3-bc/nomad/drainer/watch_jobs.go

package drainer

import (
	"context"
	"fmt"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"golang.org/x/time/rate"
)

// DrainRequest is a request to drain the given allocations. Resp is signalled
// once the allocations' desired transitions have been committed, or with an
// error if the attempt failed.
type DrainRequest struct {
	Allocs []*structs.Allocation
	Resp   *structs.BatchFuture
}

// NewDrainRequest returns a DrainRequest for the given allocations.
func NewDrainRequest(allocs []*structs.Allocation) *DrainRequest {
	return &DrainRequest{
		Allocs: allocs,
		Resp:   structs.NewBatchFuture(),
	}
}

// DrainingJobWatcher is the interface for watching draining jobs
type DrainingJobWatcher interface {
	// RegisterJobs is used to start watching the given draining jobs
	RegisterJobs(jobs []structs.NamespacedID)

	// Drain is used to emit allocations that should be drained.
	Drain() <-chan *DrainRequest

	// Migrated is the channel of allocations for draining jobs that have
	// transitioned to stop. There is no guarantee that duplicates won't be
	// published.
	Migrated() <-chan []*structs.Allocation
}

// drainingJobWatcher is used to watch draining jobs and emit events when
// draining allocations have replacements
type drainingJobWatcher struct {
	ctx    context.Context
	logger log.Logger

	// state is the state store that is watched for changes.
	state *state.StateStore

	// limiter is used to limit the rate of blocking queries
	limiter *rate.Limiter

	// jobs is the set of tracked jobs.
	jobs map[structs.NamespacedID]struct{}

	// queryCtx is used to cancel a blocking query.
	queryCtx    context.Context
	queryCancel context.CancelFunc

	// drainCh and migratedCh are used to emit allocations
	drainCh    chan *DrainRequest
	migratedCh chan []*structs.Allocation

	// l guards jobs, queryCtx and queryCancel.
	l sync.RWMutex
}

// NewDrainingJobWatcher returns a new job watcher. The caller is expected to
// cancel the context to clean up the drainer.
func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger) *drainingJobWatcher {

	// Create a context that can be used to cancel the blocking query so that
	// newly registered jobs are handled promptly.
	queryCtx, queryCancel := context.WithCancel(ctx)

	w := &drainingJobWatcher{
		ctx:         ctx,
		queryCtx:    queryCtx,
		queryCancel: queryCancel,
		limiter:     limiter,
		logger:      logger.Named("job_watcher"),
		state:       state,
		jobs:        make(map[structs.NamespacedID]struct{}, 64),
		drainCh:     make(chan *DrainRequest),
		migratedCh:  make(chan []*structs.Allocation),
	}

	go w.watch()
	return w
}

// RegisterJobs marks the given jobs as draining and adds them to the set of
// watched jobs.
func (w *drainingJobWatcher) RegisterJobs(jobs []structs.NamespacedID) {
	w.l.Lock()
	defer w.l.Unlock()

	updated := false
	for _, jns := range jobs {
		if _, ok := w.jobs[jns]; ok {
			continue
		}

		// Start tracking the job
		w.logger.Trace("registering job", "job", jns)
		w.jobs[jns] = struct{}{}
		updated = true
	}

	if updated {
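		// Cancel the inflight blocking query so the watch loop picks up the
		// newly registered jobs.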
		w.queryCancel()

		// Create a new query context
		w.queryCtx, w.queryCancel = context.WithCancel(w.ctx)
	}
}

// Drain returns the channel that emits allocations to drain.
func (w *drainingJobWatcher) Drain() <-chan *DrainRequest {
	return w.drainCh
}

// Migrated returns the channel that emits allocations for draining jobs that
// have been migrated.
func (w *drainingJobWatcher) Migrated() <-chan []*structs.Allocation {
	return w.migratedCh
}

// deregisterJob removes the job from being watched.
func (w *drainingJobWatcher) deregisterJob(jobID, namespace string) {
	w.l.Lock()
	defer w.l.Unlock()
	jns := structs.NamespacedID{
		ID:        jobID,
		Namespace: namespace,
	}
	delete(w.jobs, jns)
	w.logger.Trace("deregistering job", "job", jns)
}

// watch is the long lived watching routine that detects job drain changes.
func (w *drainingJobWatcher) watch() {
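	// waitIndex is the index the blocking query waits to exceed. Starting at
	// 1 means the first query returns the current allocations without
	// blocking.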
	waitIndex := uint64(1)
	for {
		w.logger.Trace("getting job allocs at index", "index", waitIndex)
		jobAllocs, index, err := w.getJobAllocs(w.getQueryCtx(), waitIndex)
		w.logger.Trace("retrieved allocs for draining jobs", "num_allocs", len(jobAllocs), "index", index, "error", err)
		if err != nil {
			if err == context.Canceled {
				// Determine if it is a cancel or a shutdown
				select {
				case <-w.ctx.Done():
					w.logger.Trace("shutting down")
					return
				default:
					// The query context was cancelled;
					// reset index so we don't miss past
					// updates to newly registered jobs
					waitIndex = 1
					continue
				}
			}

			w.logger.Error("error watching job allocs updates at index", "index", waitIndex, "error", err)
			select {
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			case <-time.After(stateReadErrorDelay):
				continue
			}
		}

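		// Remember the index we handled last before advancing it; handleJob
		// uses it to avoid re-emitting allocations that were already reported
		// as migrated.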
		lastHandled := waitIndex
		waitIndex = index

		// Snapshot the state store
		snap, err := w.state.Snapshot()
		if err != nil {
			w.logger.Warn("failed to snapshot statestore", "error", err)
			continue
		}

		currentJobs := w.drainingJobs()
		var allDrain, allMigrated []*structs.Allocation
		for jns, allocs := range jobAllocs {
			// Check if the job is still registered
			if _, ok := currentJobs[jns]; !ok {
				w.logger.Trace("skipping job as it is no longer registered for draining", "job", jns)
				continue
			}

			w.logger.Trace("handling job", "job", jns)

			// Lookup the job
			job, err := snap.JobByID(nil, jns.Namespace, jns.ID)
			if err != nil {
				w.logger.Warn("failed to lookup job", "job", jns, "error", err)
				continue
			}

			// Ignore purged jobs
			if job == nil {
				w.logger.Trace("ignoring garbage collected job", "job", jns)
				w.deregisterJob(jns.ID, jns.Namespace)
				continue
			}

			// Ignore any system jobs
			if job.Type == structs.JobTypeSystem {
				w.deregisterJob(job.ID, job.Namespace)
				continue
			}

			result, err := handleJob(snap, job, allocs, lastHandled)
			if err != nil {
				w.logger.Error("handling drain for job failed", "job", jns, "error", err)
				continue
			}

			w.logger.Trace("received result for job", "job", jns, "result", result)

			allDrain = append(allDrain, result.drain...)
			allMigrated = append(allMigrated, result.migrated...)

			// Stop tracking this job
			if result.done {
				w.deregisterJob(job.ID, job.Namespace)
			}
		}

		if len(allDrain) != 0 {
			// Create the request
			req := NewDrainRequest(allDrain)
			w.logger.Trace("sending drain request for allocs", "num_allocs", len(allDrain))

			select {
			case w.drainCh <- req:
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}

			// Wait for the request to be committed
			select {
			case <-req.Resp.WaitCh():
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}

			// See if it successfully committed
			if err := req.Resp.Error(); err != nil {
				w.logger.Error("failed to transition allocations", "error", err)
			}

			// Block the next query until the index at which the transition
			// was committed
			if index := req.Resp.Index(); index > waitIndex {
				waitIndex = index
			}
		}

		if len(allMigrated) != 0 {
			w.logger.Trace("sending migrated for allocs", "num_allocs", len(allMigrated))
			select {
			case w.migratedCh <- allMigrated:
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}
		}
	}
}

// jobResult is the set of actions to take for a draining job given its current
// state.
type jobResult struct {
	// drain is the set of allocations to emit for draining.
	drain []*structs.Allocation

	// migrated is the set of allocations to emit as migrated.
	migrated []*structs.Allocation

	// done marks whether the job has been fully drained.
	done bool
}

// newJobResult returns a jobResult with done=true. It is the responsibility of
// callers to set done=false when a remaining drainable alloc is found.
func newJobResult() *jobResult {
	return &jobResult{
		done: true,
	}
}

func (r *jobResult) String() string {
	return fmt.Sprintf("Drain %d ; Migrate %d ; Done %v", len(r.drain), len(r.migrated), r.done)
}

// handleJob takes the state of a draining job and returns the desired actions.
func handleJob(snap *state.StateSnapshot, job *structs.Job, allocs []*structs.Allocation, lastHandledIndex uint64) (*jobResult, error) {
	r := newJobResult()
	batch := job.Type == structs.JobTypeBatch
	taskGroups := make(map[string]*structs.TaskGroup, len(job.TaskGroups))
	for _, tg := range job.TaskGroups {
		// Only capture task groups that have a migrate strategy, or all
		// groups if we are watching a batch job
		if tg.Migrate != nil || batch {
			taskGroups[tg.Name] = tg
		}
	}

	// Group the allocations by task group
	tgAllocs := make(map[string][]*structs.Allocation, len(taskGroups))
	for _, alloc := range allocs {
		if _, ok := taskGroups[alloc.TaskGroup]; !ok {
			continue
		}

		tgAllocs[alloc.TaskGroup] = append(tgAllocs[alloc.TaskGroup], alloc)
	}

	for name, tg := range taskGroups {
		allocs := tgAllocs[name]
		if err := handleTaskGroup(snap, batch, tg, allocs, lastHandledIndex, r); err != nil {
			return nil, fmt.Errorf("drain for task group %q failed: %v", name, err)
		}
	}

	return r, nil
}

// handleTaskGroup takes the state of a draining task group and computes the
// desired actions. For batch jobs we only notify when they have been migrated
// and never mark them for drain. Batch jobs are allowed to complete up until
// the deadline, after which they are force killed.
func handleTaskGroup(snap *state.StateSnapshot, batch bool, tg *structs.TaskGroup,
	allocs []*structs.Allocation, lastHandledIndex uint64, result *jobResult) error {

	// Determine how many allocations can be drained
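	// drainingNodes caches, per node ID, whether the node currently has a
	// drain strategy so each node is only looked up once.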
	drainingNodes := make(map[string]bool, 4)
	healthy := 0
	remainingDrainingAlloc := false
	var drainable []*structs.Allocation

	for _, alloc := range allocs {
		// Check if the alloc is on a draining node.
		onDrainingNode, ok := drainingNodes[alloc.NodeID]
		if !ok {
			// Look up the node
			node, err := snap.NodeByID(nil, alloc.NodeID)
			if err != nil {
				return err
			}

			// Check if the node exists and whether it has a drain strategy
			onDrainingNode = node != nil && node.DrainStrategy != nil
			drainingNodes[alloc.NodeID] = onDrainingNode
		}

		// Check if the alloc should be considered migrated. A migrated
		// allocation is one that is terminal, is on a draining node, and was
		// modified since our last handled index, which avoids emitting many
		// duplicate migrate events.
		if alloc.TerminalStatus() &&
			onDrainingNode &&
			alloc.ModifyIndex > lastHandledIndex {
			result.migrated = append(result.migrated, alloc)
			continue
		}

		// If the service alloc is running and has its deployment health set,
		// it is considered healthy from a migration standpoint.
		if !batch && !alloc.TerminalStatus() && alloc.DeploymentStatus.HasHealth() {
			healthy++
		}

		// An alloc can't be considered for migration if:
		// - It isn't on a draining node
		// - It is already terminal
		if !onDrainingNode || alloc.TerminalStatus() {
			continue
		}

		// Capture the fact that there is an allocation that is still draining
		// for this job.
		remainingDrainingAlloc = true

		// If we haven't marked this allocation for migration already, capture
		// it as eligible for draining.
		if !batch && !alloc.DesiredTransition.ShouldMigrate() {
			drainable = append(drainable, alloc)
		}
	}

	// Update the done status
	if remainingDrainingAlloc {
		result.done = false
	}

	// We don't mark batch allocations for drain so exit
	if batch {
		return nil
	}

	// Determine how many we can drain
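	// At least (tg.Count - tg.Migrate.MaxParallel) healthy allocations must
	// remain running, so only the surplus above that threshold is drained at
	// once.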
	thresholdCount := tg.Count - tg.Migrate.MaxParallel
	numToDrain := healthy - thresholdCount
	numToDrain = helper.IntMin(len(drainable), numToDrain)
	if numToDrain <= 0 {
		return nil
	}

	result.drain = append(result.drain, drainable[0:numToDrain]...)
	return nil
}

// getJobAllocs returns all allocations for draining jobs
func (w *drainingJobWatcher) getJobAllocs(ctx context.Context, minIndex uint64) (map[structs.NamespacedID][]*structs.Allocation, uint64, error) {
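	// Rate limit how quickly blocking queries are issued against the state
	// store.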
	if err := w.limiter.Wait(ctx); err != nil {
		return nil, 0, err
	}

	resp, index, err := w.state.BlockingQuery(w.getJobAllocsImpl, minIndex, ctx)
	if err != nil {
		return nil, 0, err
	}
	if resp == nil {
		return nil, index, nil
	}

	return resp.(map[structs.NamespacedID][]*structs.Allocation), index, nil
}

// getJobAllocsImpl returns a map of draining jobs to their allocations.
func (w *drainingJobWatcher) getJobAllocsImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	index, err := state.Index("allocs")
	if err != nil {
		return nil, 0, err
	}

	// Capture the draining jobs.
	draining := w.drainingJobs()
	l := len(draining)
	if l == 0 {
		return nil, index, nil
	}

	// Capture the allocs for each draining job.
	var maxIndex uint64 = 0
	resp := make(map[structs.NamespacedID][]*structs.Allocation, l)
	for jns := range draining {
		allocs, err := state.AllocsByJob(ws, jns.Namespace, jns.ID, false)
		if err != nil {
			return nil, index, err
		}

		resp[jns] = allocs
		for _, alloc := range allocs {
			if maxIndex < alloc.ModifyIndex {
				maxIndex = alloc.ModifyIndex
			}
		}
	}

	// Prefer using the actual max index of affected allocs since it means less
	// unblocking
	if maxIndex != 0 {
		index = maxIndex
	}

	return resp, index, nil
}

// drainingJobs captures the set of draining jobs.
func (w *drainingJobWatcher) drainingJobs() map[structs.NamespacedID]struct{} {
	w.l.RLock()
	defer w.l.RUnlock()

	l := len(w.jobs)
	if l == 0 {
		return nil
	}

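	// Return a copy so callers can use the set after the lock is released.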
	draining := make(map[structs.NamespacedID]struct{}, l)
	for k := range w.jobs {
		draining[k] = struct{}{}
	}

	return draining
}

// getQueryCtx is a helper for getting the query context.
func (w *drainingJobWatcher) getQueryCtx() context.Context {
	w.l.RLock()
	defer w.l.RUnlock()
	return w.queryCtx
}