github.com/uchennaokeke444/nomad@v0.11.8/nomad/deploymentwatcher/deployments_watcher.go (about)

     1  package deploymentwatcher
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"sync"
     7  	"time"
     8  
     9  	"golang.org/x/time/rate"
    10  
    11  	log "github.com/hashicorp/go-hclog"
    12  	memdb "github.com/hashicorp/go-memdb"
    13  
    14  	"github.com/hashicorp/nomad/nomad/state"
    15  	"github.com/hashicorp/nomad/nomad/structs"
    16  )
    17  
const (
	// LimitStateQueriesPerSecond is the number of state queries allowed per
	// second.
	// NOTE(review): presumably supplied as the stateQueriesPerSecond argument
	// of NewDeploymentsWatcher; the call site is outside this file.
	LimitStateQueriesPerSecond = 100.0

	// CrossDeploymentUpdateBatchDuration is the duration in which allocation
	// desired transition and evaluation creation updates are batched across
	// all deployment watchers before committing to Raft.
	// NOTE(review): presumably supplied as the updateBatchDuration argument
	// of NewDeploymentsWatcher; the call site is outside this file.
	CrossDeploymentUpdateBatchDuration = 250 * time.Millisecond
)
    28  
var (
	// notEnabled is the error returned when the deployment watcher is not
	// enabled. getOrCreateWatcher returns it when the watcher is disabled and
	// createUpdate returns it when no allocation update batcher exists.
	notEnabled = fmt.Errorf("deployment watcher not enabled")
)
    34  
// DeploymentRaftEndpoints exposes the deployment watcher to a set of functions
// to apply data transforms via Raft.
//
// Every method returns a uint64 together with an error.
// NOTE(review): the uint64 is presumably the Raft index at which the change
// was applied; confirm against the implementation.
type DeploymentRaftEndpoints interface {
	// UpsertJob is used to upsert a job
	UpsertJob(job *structs.Job) (uint64, error)

	// UpdateDeploymentStatus is used to make a deployment status update
	// and potentially create an evaluation.
	UpdateDeploymentStatus(u *structs.DeploymentStatusUpdateRequest) (uint64, error)

	// UpdateDeploymentPromotion is used to promote canaries in a deployment
	UpdateDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error)

	// UpdateDeploymentAllocHealth is used to set the health of allocations in a
	// deployment
	UpdateDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error)

	// UpdateAllocDesiredTransition is used to update the desired transition
	// for allocations.
	UpdateAllocDesiredTransition(req *structs.AllocUpdateDesiredTransitionRequest) (uint64, error)
}
    56  
// Watcher is used to watch deployments and their allocations created
// by the scheduler and trigger the scheduler when allocation health
// transitions.
type Watcher struct {
	// enabled records whether the watcher is active. It is set by SetEnabled
	// and consulted by addLocked, remove and getOrCreateWatcher.
	enabled bool
	// logger is the watcher's logger; NewDeploymentsWatcher names it
	// "deployments_watcher".
	logger  log.Logger

	// queryLimiter is used to limit the rate of blocking queries
	queryLimiter *rate.Limiter

	// updateBatchDuration is the duration to batch allocation desired
	// transition and eval creation across all deployment watchers
	updateBatchDuration time.Duration

	// raft contains the set of Raft endpoints that can be used by the
	// deployments watcher
	raft DeploymentRaftEndpoints

	// state is the state that is watched for state changes.
	state *state.StateStore

	// watchers is the set of active watchers, one per deployment
	watchers map[string]*deploymentWatcher

	// allocUpdateBatcher is used to batch the creation of evaluations and
	// allocation desired transition updates. flush sets it to nil when the
	// watcher is disabled.
	allocUpdateBatcher *AllocUpdateBatcher

	// ctx and exitFn are used to cancel the watcher. flush replaces both on
	// every call.
	ctx    context.Context
	exitFn context.CancelFunc

	// l is held by SetEnabled, add, remove and getOrCreateWatcher while they
	// read or modify the fields above.
	l sync.RWMutex
}
    91  
    92  // NewDeploymentsWatcher returns a deployments watcher that is used to watch
    93  // deployments and trigger the scheduler as needed.
    94  func NewDeploymentsWatcher(logger log.Logger,
    95  	raft DeploymentRaftEndpoints, stateQueriesPerSecond float64,
    96  	updateBatchDuration time.Duration) *Watcher {
    97  
    98  	return &Watcher{
    99  		raft:                raft,
   100  		queryLimiter:        rate.NewLimiter(rate.Limit(stateQueriesPerSecond), 100),
   101  		updateBatchDuration: updateBatchDuration,
   102  		logger:              logger.Named("deployments_watcher"),
   103  	}
   104  }
   105  
   106  // SetEnabled is used to control if the watcher is enabled. The watcher
   107  // should only be enabled on the active leader. When being enabled the state is
   108  // passed in as it is no longer valid once a leader election has taken place.
   109  func (w *Watcher) SetEnabled(enabled bool, state *state.StateStore) {
   110  	w.l.Lock()
   111  	defer w.l.Unlock()
   112  
   113  	wasEnabled := w.enabled
   114  	w.enabled = enabled
   115  
   116  	if state != nil {
   117  		w.state = state
   118  	}
   119  
   120  	// Flush the state to create the necessary objects
   121  	w.flush(enabled)
   122  
   123  	// If we are starting now, launch the watch daemon
   124  	if enabled && !wasEnabled {
   125  		go w.watchDeployments(w.ctx)
   126  	}
   127  }
   128  
   129  // flush is used to clear the state of the watcher
   130  func (w *Watcher) flush(enabled bool) {
   131  	// Stop all the watchers and clear it
   132  	for _, watcher := range w.watchers {
   133  		watcher.StopWatch()
   134  	}
   135  
   136  	// Kill everything associated with the watcher
   137  	if w.exitFn != nil {
   138  		w.exitFn()
   139  	}
   140  
   141  	w.watchers = make(map[string]*deploymentWatcher, 32)
   142  	w.ctx, w.exitFn = context.WithCancel(context.Background())
   143  
   144  	if enabled {
   145  		w.allocUpdateBatcher = NewAllocUpdateBatcher(w.ctx, w.updateBatchDuration, w.raft)
   146  	} else {
   147  		w.allocUpdateBatcher = nil
   148  	}
   149  }
   150  
   151  // watchDeployments is the long lived go-routine that watches for deployments to
   152  // add and remove watchers on.
   153  func (w *Watcher) watchDeployments(ctx context.Context) {
   154  	dindex := uint64(1)
   155  	for {
   156  		// Block getting all deployments using the last deployment index.
   157  		deployments, idx, err := w.getDeploys(ctx, dindex)
   158  		if err != nil {
   159  			if err == context.Canceled {
   160  				return
   161  			}
   162  
   163  			w.logger.Error("failed to retrieve deployments", "error", err)
   164  		}
   165  
   166  		// Update the latest index
   167  		dindex = idx
   168  
   169  		// Ensure we are tracking the things we should and not tracking what we
   170  		// shouldn't be
   171  		for _, d := range deployments {
   172  			if d.Active() {
   173  				if err := w.add(d); err != nil {
   174  					w.logger.Error("failed to track deployment", "deployment_id", d.ID, "error", err)
   175  				}
   176  			} else {
   177  				w.remove(d)
   178  			}
   179  		}
   180  	}
   181  }
   182  
   183  // getDeploys retrieves all deployments blocking at the given index.
   184  func (w *Watcher) getDeploys(ctx context.Context, minIndex uint64) ([]*structs.Deployment, uint64, error) {
   185  	resp, index, err := w.state.BlockingQuery(w.getDeploysImpl, minIndex, ctx)
   186  	if err != nil {
   187  		return nil, 0, err
   188  	}
   189  
   190  	return resp.([]*structs.Deployment), index, nil
   191  }
   192  
   193  // getDeploysImpl retrieves all deployments from the passed state store.
   194  func (w *Watcher) getDeploysImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
   195  
   196  	iter, err := state.Deployments(ws)
   197  	if err != nil {
   198  		return nil, 0, err
   199  	}
   200  
   201  	var deploys []*structs.Deployment
   202  	for {
   203  		raw := iter.Next()
   204  		if raw == nil {
   205  			break
   206  		}
   207  		deploy := raw.(*structs.Deployment)
   208  		deploys = append(deploys, deploy)
   209  	}
   210  
   211  	// Use the last index that affected the deployment table
   212  	index, err := state.Index("deployment")
   213  	if err != nil {
   214  		return nil, 0, err
   215  	}
   216  
   217  	return deploys, index, nil
   218  }
   219  
   220  // add adds a deployment to the watch list
   221  func (w *Watcher) add(d *structs.Deployment) error {
   222  	w.l.Lock()
   223  	defer w.l.Unlock()
   224  	_, err := w.addLocked(d)
   225  	return err
   226  }
   227  
   228  // addLocked adds a deployment to the watch list and should only be called when
   229  // locked. Creating the deploymentWatcher starts a go routine to .watch() it
   230  func (w *Watcher) addLocked(d *structs.Deployment) (*deploymentWatcher, error) {
   231  	// Not enabled so no-op
   232  	if !w.enabled {
   233  		return nil, nil
   234  	}
   235  
   236  	if !d.Active() {
   237  		return nil, fmt.Errorf("deployment %q is terminal", d.ID)
   238  	}
   239  
   240  	// Already watched so just update the deployment
   241  	if w, ok := w.watchers[d.ID]; ok {
   242  		w.updateDeployment(d)
   243  		return nil, nil
   244  	}
   245  
   246  	// Get the job the deployment is referencing
   247  	snap, err := w.state.Snapshot()
   248  	if err != nil {
   249  		return nil, err
   250  	}
   251  
   252  	job, err := snap.JobByID(nil, d.Namespace, d.JobID)
   253  	if err != nil {
   254  		return nil, err
   255  	}
   256  	if job == nil {
   257  		return nil, fmt.Errorf("deployment %q references unknown job %q", d.ID, d.JobID)
   258  	}
   259  
   260  	watcher := newDeploymentWatcher(w.ctx, w.queryLimiter, w.logger, w.state, d, job, w)
   261  	w.watchers[d.ID] = watcher
   262  	return watcher, nil
   263  }
   264  
   265  // remove stops watching a deployment. This can be because the deployment is
   266  // complete or being deleted.
   267  func (w *Watcher) remove(d *structs.Deployment) {
   268  	w.l.Lock()
   269  	defer w.l.Unlock()
   270  
   271  	// Not enabled so no-op
   272  	if !w.enabled {
   273  		return
   274  	}
   275  
   276  	if watcher, ok := w.watchers[d.ID]; ok {
   277  		watcher.StopWatch()
   278  		delete(w.watchers, d.ID)
   279  	}
   280  }
   281  
   282  // forceAdd is used to force a lookup of the given deployment object and create
   283  // a watcher. If the deployment does not exist or is terminal an error is
   284  // returned.
   285  func (w *Watcher) forceAdd(dID string) (*deploymentWatcher, error) {
   286  	snap, err := w.state.Snapshot()
   287  	if err != nil {
   288  		return nil, err
   289  	}
   290  
   291  	deployment, err := snap.DeploymentByID(nil, dID)
   292  	if err != nil {
   293  		return nil, err
   294  	}
   295  
   296  	if deployment == nil {
   297  		return nil, fmt.Errorf("unknown deployment %q", dID)
   298  	}
   299  
   300  	return w.addLocked(deployment)
   301  }
   302  
   303  // getOrCreateWatcher returns the deployment watcher for the given deployment ID.
   304  func (w *Watcher) getOrCreateWatcher(dID string) (*deploymentWatcher, error) {
   305  	w.l.Lock()
   306  	defer w.l.Unlock()
   307  
   308  	// Not enabled so no-op
   309  	if !w.enabled {
   310  		return nil, notEnabled
   311  	}
   312  
   313  	watcher, ok := w.watchers[dID]
   314  	if ok {
   315  		return watcher, nil
   316  	}
   317  
   318  	return w.forceAdd(dID)
   319  }
   320  
   321  // SetAllocHealth is used to set the health of allocations for a deployment. If
   322  // there are any unhealthy allocations, the deployment is updated to be failed.
   323  // Otherwise the allocations are updated and an evaluation is created.
   324  func (w *Watcher) SetAllocHealth(req *structs.DeploymentAllocHealthRequest, resp *structs.DeploymentUpdateResponse) error {
   325  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   326  	if err != nil {
   327  		return err
   328  	}
   329  
   330  	return watcher.SetAllocHealth(req, resp)
   331  }
   332  
   333  // PromoteDeployment is used to promote a deployment. If promote is false,
   334  // deployment is marked as failed. Otherwise the deployment is updated and an
   335  // evaluation is created.
   336  func (w *Watcher) PromoteDeployment(req *structs.DeploymentPromoteRequest, resp *structs.DeploymentUpdateResponse) error {
   337  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   338  	if err != nil {
   339  		return err
   340  	}
   341  
   342  	return watcher.PromoteDeployment(req, resp)
   343  }
   344  
   345  // PauseDeployment is used to toggle the pause state on a deployment. If the
   346  // deployment is being unpaused, an evaluation is created.
   347  func (w *Watcher) PauseDeployment(req *structs.DeploymentPauseRequest, resp *structs.DeploymentUpdateResponse) error {
   348  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   349  	if err != nil {
   350  		return err
   351  	}
   352  
   353  	return watcher.PauseDeployment(req, resp)
   354  }
   355  
   356  // FailDeployment is used to fail the deployment.
   357  func (w *Watcher) FailDeployment(req *structs.DeploymentFailRequest, resp *structs.DeploymentUpdateResponse) error {
   358  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   359  	if err != nil {
   360  		return err
   361  	}
   362  
   363  	return watcher.FailDeployment(req, resp)
   364  }
   365  
   366  // createUpdate commits the given allocation desired transition and evaluation
   367  // to Raft but batches the commit with other calls.
   368  func (w *Watcher) createUpdate(allocs map[string]*structs.DesiredTransition, eval *structs.Evaluation) (uint64, error) {
   369  	b := w.allocUpdateBatcher
   370  	if b == nil {
   371  		return 0, notEnabled
   372  	}
   373  	return b.CreateUpdate(allocs, eval).Results()
   374  }
   375  
   376  // upsertJob commits the given job to Raft
   377  func (w *Watcher) upsertJob(job *structs.Job) (uint64, error) {
   378  	return w.raft.UpsertJob(job)
   379  }
   380  
   381  // upsertDeploymentStatusUpdate commits the given deployment update and optional
   382  // evaluation to Raft
   383  func (w *Watcher) upsertDeploymentStatusUpdate(
   384  	u *structs.DeploymentStatusUpdate,
   385  	e *structs.Evaluation,
   386  	j *structs.Job) (uint64, error) {
   387  	return w.raft.UpdateDeploymentStatus(&structs.DeploymentStatusUpdateRequest{
   388  		DeploymentUpdate: u,
   389  		Eval:             e,
   390  		Job:              j,
   391  	})
   392  }
   393  
   394  // upsertDeploymentPromotion commits the given deployment promotion to Raft
   395  func (w *Watcher) upsertDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) {
   396  	return w.raft.UpdateDeploymentPromotion(req)
   397  }
   398  
   399  // upsertDeploymentAllocHealth commits the given allocation health changes to
   400  // Raft
   401  func (w *Watcher) upsertDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) {
   402  	return w.raft.UpdateDeploymentAllocHealth(req)
   403  }