github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/nomad/drainer/watch_jobs.go

package drainer

import (
	"context"
	"fmt"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"golang.org/x/time/rate"
)

type DrainRequest struct {
	Allocs []*structs.Allocation
	Resp   *structs.BatchFuture
}

func NewDrainRequest(allocs []*structs.Allocation) *DrainRequest {
	return &DrainRequest{
		Allocs: allocs,
		Resp:   structs.NewBatchFuture(),
	}
}

// DrainingJobWatcher is the interface for watching a job drain
type DrainingJobWatcher interface {
	// RegisterJobs is used to start watching the given draining jobs
	RegisterJobs(jobs []structs.NamespacedID)

	// Drain is used to emit allocations that should be drained.
	Drain() <-chan *DrainRequest

	// Migrated emits allocations for draining jobs that have transitioned to
	// stop. There is no guarantee that duplicates won't be published.
	Migrated() <-chan []*structs.Allocation
}

// drainingJobWatcher is used to watch draining jobs and emit events when
// draining allocations have replacements
type drainingJobWatcher struct {
	ctx    context.Context
	logger log.Logger

	// state is the state that is watched for state changes.
	state *state.StateStore

	// limiter is used to limit the rate of blocking queries
	limiter *rate.Limiter

	// jobs is the set of tracked jobs.
	jobs map[structs.NamespacedID]struct{}

	// queryCtx is used to cancel a blocking query.
	queryCtx    context.Context
	queryCancel context.CancelFunc

	// drainCh and migratedCh are used to emit allocations
	drainCh    chan *DrainRequest
	migratedCh chan []*structs.Allocation

	l sync.RWMutex
}

// NewDrainingJobWatcher returns a new job watcher. The caller is expected to
// cancel the context to clean up the drainer.
func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger) *drainingJobWatcher {

	// Create a context that can cancel the blocking query so that when a new
	// job gets registered it is handled.
	queryCtx, queryCancel := context.WithCancel(ctx)

	w := &drainingJobWatcher{
		ctx:         ctx,
		queryCtx:    queryCtx,
		queryCancel: queryCancel,
		limiter:     limiter,
		logger:      logger.Named("job_watcher"),
		state:       state,
		jobs:        make(map[structs.NamespacedID]struct{}, 64),
		drainCh:     make(chan *DrainRequest),
		migratedCh:  make(chan []*structs.Allocation),
	}

	go w.watch()
	return w
}

// RegisterJobs marks the given jobs as draining and adds them to being
// watched.
func (w *drainingJobWatcher) RegisterJobs(jobs []structs.NamespacedID) {
	w.l.Lock()
	defer w.l.Unlock()

	updated := false
	for _, jns := range jobs {
		if _, ok := w.jobs[jns]; ok {
			continue
		}

		// Add the job and cancel the context
		w.logger.Trace("registering job", "job", jns)
		w.jobs[jns] = struct{}{}
		updated = true
	}

	if updated {
		w.queryCancel()

		// Create a new query context
		w.queryCtx, w.queryCancel = context.WithCancel(w.ctx)
	}
}
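
// Illustrative sketch, not part of the upstream file: a caller such as the
// node drainer might construct and feed the watcher roughly as follows, given
// a ctx, state store, and logger, and assuming a hypothetical helper
// jobsOnDrainingNodes that collects the namespaced IDs of jobs with
// allocations on draining nodes:
//
//	limiter := rate.NewLimiter(100.0, 100)
//	watcher := NewDrainingJobWatcher(ctx, limiter, stateStore, logger)
//	watcher.RegisterJobs(jobsOnDrainingNodes())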

// Drain returns the channel that emits allocations to drain.
func (w *drainingJobWatcher) Drain() <-chan *DrainRequest {
	return w.drainCh
}

// Migrated returns the channel that emits allocations for draining jobs that
// have been migrated.
func (w *drainingJobWatcher) Migrated() <-chan []*structs.Allocation {
	return w.migratedCh
}

// deregisterJob removes the job from being watched.
func (w *drainingJobWatcher) deregisterJob(jobID, namespace string) {
	w.l.Lock()
	defer w.l.Unlock()
	jns := structs.NamespacedID{
		ID:        jobID,
		Namespace: namespace,
	}
	delete(w.jobs, jns)
	w.logger.Trace("deregistering job", "job", jns)
}

// watch is the long lived watching routine that detects job drain changes.
func (w *drainingJobWatcher) watch() {
	waitIndex := uint64(1)
	for {
		w.logger.Trace("getting job allocs at index", "index", waitIndex)
		jobAllocs, index, err := w.getJobAllocs(w.getQueryCtx(), waitIndex)
		if err != nil {
			if err == context.Canceled {
				// Determine if it is a cancel or a shutdown
				select {
				case <-w.ctx.Done():
					return
				default:
					// The query context was cancelled;
					// reset index so we don't miss past
					// updates to newly registered jobs
					waitIndex = 1
					continue
				}
			}

			w.logger.Error("error watching job allocs updates at index", "index", waitIndex, "error", err)
			select {
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			case <-time.After(stateReadErrorDelay):
				continue
			}
		}
		w.logger.Trace("retrieved allocs for draining jobs", "num_allocs", len(jobAllocs), "index", index)

		lastHandled := waitIndex
		waitIndex = index

		// Snapshot the state store
		snap, err := w.state.Snapshot()
		if err != nil {
			w.logger.Warn("failed to snapshot statestore", "error", err)
			continue
		}

		currentJobs := w.drainingJobs()
		var allDrain, allMigrated []*structs.Allocation
		for jns, allocs := range jobAllocs {
			// Check if the job is still registered
			if _, ok := currentJobs[jns]; !ok {
				w.logger.Trace("skipping job as it is no longer registered for draining", "job", jns)
				continue
			}

			w.logger.Trace("handling job", "job", jns)

			// Lookup the job
			job, err := snap.JobByID(nil, jns.Namespace, jns.ID)
			if err != nil {
				w.logger.Warn("failed to lookup job", "job", jns, "error", err)
				continue
			}

			// Ignore purged jobs
			if job == nil {
				w.logger.Trace("ignoring garbage collected job", "job", jns)
				w.deregisterJob(jns.ID, jns.Namespace)
				continue
			}

			// Ignore any system jobs
			if job.Type == structs.JobTypeSystem {
				w.deregisterJob(job.ID, job.Namespace)
				continue
			}

			result, err := handleJob(snap, job, allocs, lastHandled)
			if err != nil {
				w.logger.Error("handling drain for job failed", "job", jns, "error", err)
				continue
			}

			w.logger.Trace("received result for job", "job", jns, "result", result)

			allDrain = append(allDrain, result.drain...)
			allMigrated = append(allMigrated, result.migrated...)

			// Stop tracking this job
			if result.done {
				w.deregisterJob(job.ID, job.Namespace)
			}
		}

		if len(allDrain) != 0 {
			// Create the request
			req := NewDrainRequest(allDrain)
			w.logger.Trace("sending drain request for allocs", "num_allocs", len(allDrain))

			select {
			case w.drainCh <- req:
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}

			// Wait for the request to be committed
			select {
			case <-req.Resp.WaitCh():
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}

			// See if it successfully committed
			if err := req.Resp.Error(); err != nil {
				w.logger.Error("failed to transition allocations", "error", err)
			}

			// Wait until the new index
			if index := req.Resp.Index(); index > waitIndex {
				waitIndex = index
			}
		}

		if len(allMigrated) != 0 {
			w.logger.Trace("sending migrated for allocs", "num_allocs", len(allMigrated))
			select {
			case w.migratedCh <- allMigrated:
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}
		}
	}
}

// jobResult is the set of actions to take for a draining job given its current
// state.
type jobResult struct {
	// drain is the set of allocations to emit for draining.
	drain []*structs.Allocation

	// migrated is the set of allocations to emit as migrated
	migrated []*structs.Allocation

	// done marks whether the job has been fully drained.
	done bool
}

// newJobResult returns a jobResult with done=true. It is the responsibility of
// callers to set done=false when a remaining drainable alloc is found.
func newJobResult() *jobResult {
	return &jobResult{
		done: true,
	}
}

func (r *jobResult) String() string {
	return fmt.Sprintf("Drain %d ; Migrate %d ; Done %v", len(r.drain), len(r.migrated), r.done)
}

// handleJob takes the state of a draining job and returns the desired actions.
func handleJob(snap *state.StateSnapshot, job *structs.Job, allocs []*structs.Allocation, lastHandledIndex uint64) (*jobResult, error) {
	r := newJobResult()
	batch := job.Type == structs.JobTypeBatch
	taskGroups := make(map[string]*structs.TaskGroup, len(job.TaskGroups))
	for _, tg := range job.TaskGroups {
		// Only capture the groups that have a migrate strategy or we are just
		// watching batch
		if tg.Migrate != nil || batch {
			taskGroups[tg.Name] = tg
		}
	}

	// Sort the allocations by TG
	tgAllocs := make(map[string][]*structs.Allocation, len(taskGroups))
	for _, alloc := range allocs {
		if _, ok := taskGroups[alloc.TaskGroup]; !ok {
			continue
		}

		tgAllocs[alloc.TaskGroup] = append(tgAllocs[alloc.TaskGroup], alloc)
	}

	for name, tg := range taskGroups {
		allocs := tgAllocs[name]
		if err := handleTaskGroup(snap, batch, tg, allocs, lastHandledIndex, r); err != nil {
			return nil, fmt.Errorf("drain for task group %q failed: %v", name, err)
		}
	}

	return r, nil
}

// handleTaskGroup takes the state of a draining task group and computes the
// desired actions. For batch jobs we only notify when they have been migrated
// and never mark them for drain. Batch jobs are allowed to complete up until
// the deadline, after which they are force killed.
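//
// For a rough illustration of the service-job math below: with a group where
// Count = 4 and Migrate.MaxParallel = 1, and four running allocations that
// have reported healthy, the threshold is 4 - 1 = 3, so at most 4 - 3 = 1
// drainable allocation is emitted per pass; the next one is emitted only once
// the replacement for the first has become healthy.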
func handleTaskGroup(snap *state.StateSnapshot, batch bool, tg *structs.TaskGroup,
	allocs []*structs.Allocation, lastHandledIndex uint64, result *jobResult) error {

	// Determine how many allocations can be drained
	drainingNodes := make(map[string]bool, 4)
	healthy := 0
	remainingDrainingAlloc := false
	var drainable []*structs.Allocation

	for _, alloc := range allocs {
		// Check if the alloc is on a draining node.
		onDrainingNode, ok := drainingNodes[alloc.NodeID]
		if !ok {
			// Look up the node
			node, err := snap.NodeByID(nil, alloc.NodeID)
			if err != nil {
				return err
			}

			// Check if the node exists and whether it has a drain strategy
			onDrainingNode = node != nil && node.DrainStrategy != nil
			drainingNodes[alloc.NodeID] = onDrainingNode
		}

		// Check if the alloc should be considered migrated. A migrated
		// allocation is one that is terminal, is on a draining node, and
		// was updated after our last handled index, which avoids emitting
		// many duplicate migrate events.
		if alloc.TerminalStatus() &&
			onDrainingNode &&
			alloc.ModifyIndex > lastHandledIndex {
			result.migrated = append(result.migrated, alloc)
			continue
		}

		// If the service alloc is running and has its deployment status set, it
		// is considered healthy from a migration standpoint.
		if !batch && !alloc.TerminalStatus() && alloc.DeploymentStatus.HasHealth() {
			healthy++
		}

		// An alloc can't be considered for migration if:
		// - It isn't on a draining node
		// - It is already terminal
		if !onDrainingNode || alloc.TerminalStatus() {
			continue
		}

		// Capture the fact that there is an allocation that is still draining
		// for this job.
		remainingDrainingAlloc = true

		// If we haven't marked this allocation for migration already, capture
		// it as eligible for draining.
		if !batch && !alloc.DesiredTransition.ShouldMigrate() {
			drainable = append(drainable, alloc)
		}
	}

	// Update the done status
	if remainingDrainingAlloc {
		result.done = false
	}

	// We don't mark batch for drain so exit
	if batch {
		return nil
	}

	// Determine how many we can drain
	thresholdCount := tg.Count - tg.Migrate.MaxParallel
	numToDrain := healthy - thresholdCount
	numToDrain = helper.IntMin(len(drainable), numToDrain)
	if numToDrain <= 0 {
		return nil
	}

	result.drain = append(result.drain, drainable[0:numToDrain]...)
	return nil
}

// getJobAllocs returns all allocations for draining jobs
func (w *drainingJobWatcher) getJobAllocs(ctx context.Context, minIndex uint64) (map[structs.NamespacedID][]*structs.Allocation, uint64, error) {
	if err := w.limiter.Wait(ctx); err != nil {
		return nil, 0, err
	}

	resp, index, err := w.state.BlockingQuery(w.getJobAllocsImpl, minIndex, ctx)
	if err != nil {
		return nil, 0, err
	}
	if resp == nil {
		return nil, index, nil
	}

	return resp.(map[structs.NamespacedID][]*structs.Allocation), index, nil
}

// getJobAllocsImpl returns a map of draining jobs to their allocations.
func (w *drainingJobWatcher) getJobAllocsImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	index, err := state.Index("allocs")
	if err != nil {
		return nil, 0, err
	}

	// Capture the draining jobs.
	draining := w.drainingJobs()
	l := len(draining)
	if l == 0 {
		return nil, index, nil
	}

	// Capture the allocs for each draining job.
	var maxIndex uint64 = 0
	resp := make(map[structs.NamespacedID][]*structs.Allocation, l)
	for jns := range draining {
		allocs, err := state.AllocsByJob(ws, jns.Namespace, jns.ID, false)
		if err != nil {
			return nil, index, err
		}

		resp[jns] = allocs
		for _, alloc := range allocs {
			if maxIndex < alloc.ModifyIndex {
				maxIndex = alloc.ModifyIndex
			}
		}
	}

	// Prefer using the actual max index of affected allocs since it means less
	// unblocking
	if maxIndex != 0 {
		index = maxIndex
	}

	return resp, index, nil
}

// drainingJobs captures the set of draining jobs.
func (w *drainingJobWatcher) drainingJobs() map[structs.NamespacedID]struct{} {
	w.l.RLock()
	defer w.l.RUnlock()

	l := len(w.jobs)
	if l == 0 {
		return nil
	}

	draining := make(map[structs.NamespacedID]struct{}, l)
	for k := range w.jobs {
		draining[k] = struct{}{}
	}

	return draining
}

// getQueryCtx is a helper for getting the query context.
func (w *drainingJobWatcher) getQueryCtx() context.Context {
	w.l.RLock()
	defer w.l.RUnlock()
	return w.queryCtx
}
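
// The function below is an illustrative sketch and not part of the upstream
// file: it shows how a consumer (for example, the node drainer's drain loop)
// might service the watcher's channels. The transition and onMigrated
// arguments are hypothetical stand-ins for whatever applies the
// DesiredTransition updates through Raft and accounts for migrated
// allocations, respectively.
func consumeJobWatcher(ctx context.Context, w DrainingJobWatcher,
	transition func([]*structs.Allocation) (uint64, error),
	onMigrated func([]*structs.Allocation)) {

	for {
		select {
		case <-ctx.Done():
			return

		case req := <-w.Drain():
			// Apply the desired transitions, then respond on the batch
			// future so the watcher can advance its wait index past the
			// resulting Raft index.
			index, err := transition(req.Allocs)
			req.Resp.Respond(index, err)

		case allocs := <-w.Migrated():
			// Migrated allocations may include duplicates; consumers are
			// expected to handle them idempotently.
			onMigrated(allocs)
		}
	}
}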