github.com/hernad/nomad@v1.6.112/nomad/deploymentwatcher/deployments_watcher.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package deploymentwatcher
     5  
     6  import (
     7  	"context"
     8  	"fmt"
     9  	"sync"
    10  	"time"
    11  
    12  	"golang.org/x/time/rate"
    13  
    14  	log "github.com/hashicorp/go-hclog"
    15  	memdb "github.com/hashicorp/go-memdb"
    16  
    17  	"github.com/hernad/nomad/nomad/state"
    18  	"github.com/hernad/nomad/nomad/structs"
    19  )
    20  
const (
	// LimitStateQueriesPerSecond is the number of state queries allowed per
	// second.
	// NOTE(review): presumably the value callers pass as
	// stateQueriesPerSecond to NewDeploymentsWatcher — confirm at the call
	// site.
	LimitStateQueriesPerSecond = 100.0

	// CrossDeploymentUpdateBatchDuration is the duration in which allocation
	// desired transition and evaluation creation updates are batched across
	// all deployment watchers before committing to Raft.
	// NOTE(review): presumably the value callers pass as updateBatchDuration
	// to NewDeploymentsWatcher — confirm at the call site.
	CrossDeploymentUpdateBatchDuration = 250 * time.Millisecond
)
    31  
var (
	// notEnabled is the error returned when the deployment watcher is not
	// enabled. Within this file it is returned by getOrCreateWatcher when the
	// watcher is disabled and by createUpdate when no update batcher exists.
	notEnabled = fmt.Errorf("deployment watcher not enabled")
)
    37  
// DeploymentRaftEndpoints exposes the deployment watcher to a set of functions
// to apply data transforms via Raft. Every method returns a uint64 together
// with an error.
// NOTE(review): the uint64 is presumably the Raft index at which the change
// was applied — confirm against the implementation.
type DeploymentRaftEndpoints interface {
	// UpsertJob is used to upsert a job
	UpsertJob(job *structs.Job) (uint64, error)

	// UpdateDeploymentStatus is used to make a deployment status update
	// and potentially create an evaluation.
	UpdateDeploymentStatus(u *structs.DeploymentStatusUpdateRequest) (uint64, error)

	// UpdateDeploymentPromotion is used to promote canaries in a deployment
	UpdateDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error)

	// UpdateDeploymentAllocHealth is used to set the health of allocations in a
	// deployment
	UpdateDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error)

	// UpdateAllocDesiredTransition is used to update the desired transition
	// for allocations.
	UpdateAllocDesiredTransition(req *structs.AllocUpdateDesiredTransitionRequest) (uint64, error)
}
    59  
// Watcher is used to watch deployments and their allocations created
// by the scheduler and trigger the scheduler when allocation health
// transitions.
//
// A Watcher is created disabled by NewDeploymentsWatcher; the watchers map,
// ctx, exitFn and allocUpdateBatcher are only populated once SetEnabled has
// run (via flush).
type Watcher struct {
	// enabled records whether the watcher is currently active. It is set by
	// SetEnabled.
	enabled bool

	// logger is the named logger used by the watcher.
	logger log.Logger

	// queryLimiter is used to limit the rate of blocking queries
	queryLimiter *rate.Limiter

	// updateBatchDuration is the duration to batch allocation desired
	// transition and eval creation across all deployment watchers
	updateBatchDuration time.Duration

	// raft contains the set of Raft endpoints that can be used by the
	// deployments watcher
	raft DeploymentRaftEndpoints

	// state is the state that is watched for state changes. It is replaced by
	// SetEnabled whenever a non-nil state store is supplied.
	state *state.StateStore

	// server interface for Deployment RPCs
	deploymentRPC DeploymentRPC

	// server interface for Job RPCs
	jobRPC JobRPC

	// watchers is the set of active watchers, one per deployment, keyed by
	// deployment ID
	watchers map[string]*deploymentWatcher

	// allocUpdateBatcher is used to batch the creation of evaluations and
	// allocation desired transition updates. It is nil while the watcher is
	// disabled.
	allocUpdateBatcher *AllocUpdateBatcher

	// ctx and exitFn are used to cancel the watcher. Both are recreated on
	// every flush.
	ctx    context.Context
	exitFn context.CancelFunc

	// l is held by SetEnabled (and therefore flush), add, remove and
	// getOrCreateWatcher while they read or modify enabled, state, watchers,
	// ctx, exitFn and allocUpdateBatcher.
	l sync.RWMutex
}
   100  
   101  // NewDeploymentsWatcher returns a deployments watcher that is used to watch
   102  // deployments and trigger the scheduler as needed.
   103  func NewDeploymentsWatcher(logger log.Logger,
   104  	raft DeploymentRaftEndpoints,
   105  	deploymentRPC DeploymentRPC, jobRPC JobRPC,
   106  	stateQueriesPerSecond float64,
   107  	updateBatchDuration time.Duration,
   108  ) *Watcher {
   109  
   110  	return &Watcher{
   111  		raft:                raft,
   112  		deploymentRPC:       deploymentRPC,
   113  		jobRPC:              jobRPC,
   114  		queryLimiter:        rate.NewLimiter(rate.Limit(stateQueriesPerSecond), 100),
   115  		updateBatchDuration: updateBatchDuration,
   116  		logger:              logger.Named("deployments_watcher"),
   117  	}
   118  }
   119  
   120  // SetEnabled is used to control if the watcher is enabled. The watcher
   121  // should only be enabled on the active leader. When being enabled the state is
   122  // passed in as it is no longer valid once a leader election has taken place.
   123  func (w *Watcher) SetEnabled(enabled bool, state *state.StateStore) {
   124  	w.l.Lock()
   125  	defer w.l.Unlock()
   126  
   127  	wasEnabled := w.enabled
   128  	w.enabled = enabled
   129  
   130  	if state != nil {
   131  		w.state = state
   132  	}
   133  
   134  	// Flush the state to create the necessary objects
   135  	w.flush(enabled)
   136  
   137  	// If we are starting now, launch the watch daemon
   138  	if enabled && !wasEnabled {
   139  		go w.watchDeployments(w.ctx)
   140  	}
   141  }
   142  
   143  // flush is used to clear the state of the watcher
   144  func (w *Watcher) flush(enabled bool) {
   145  	// Stop all the watchers and clear it
   146  	for _, watcher := range w.watchers {
   147  		watcher.StopWatch()
   148  	}
   149  
   150  	// Kill everything associated with the watcher
   151  	if w.exitFn != nil {
   152  		w.exitFn()
   153  	}
   154  
   155  	w.watchers = make(map[string]*deploymentWatcher, 32)
   156  	w.ctx, w.exitFn = context.WithCancel(context.Background())
   157  
   158  	if enabled {
   159  		w.allocUpdateBatcher = NewAllocUpdateBatcher(w.ctx, w.updateBatchDuration, w.raft)
   160  	} else {
   161  		w.allocUpdateBatcher = nil
   162  	}
   163  }
   164  
   165  // watchDeployments is the long lived go-routine that watches for deployments to
   166  // add and remove watchers on.
   167  func (w *Watcher) watchDeployments(ctx context.Context) {
   168  	dindex := uint64(1)
   169  	for {
   170  		// Block getting all deployments using the last deployment index.
   171  		deployments, idx, err := w.getDeploys(ctx, dindex)
   172  		if err != nil {
   173  			if err == context.Canceled {
   174  				return
   175  			}
   176  
   177  			w.logger.Error("failed to retrieve deployments", "error", err)
   178  		}
   179  
   180  		// Update the latest index
   181  		dindex = idx
   182  
   183  		// Ensure we are tracking the things we should and not tracking what we
   184  		// shouldn't be
   185  		for _, d := range deployments {
   186  			if d.Active() {
   187  				if err := w.add(d); err != nil {
   188  					w.logger.Error("failed to track deployment", "deployment_id", d.ID, "error", err)
   189  				}
   190  			} else {
   191  				w.remove(d)
   192  			}
   193  		}
   194  	}
   195  }
   196  
   197  // getDeploys retrieves all deployments blocking at the given index.
   198  func (w *Watcher) getDeploys(ctx context.Context, minIndex uint64) ([]*structs.Deployment, uint64, error) {
   199  	// state can be updated concurrently
   200  	w.l.Lock()
   201  	stateStore := w.state
   202  	w.l.Unlock()
   203  
   204  	resp, index, err := stateStore.BlockingQuery(w.getDeploysImpl, minIndex, ctx)
   205  	if err != nil {
   206  		return nil, 0, err
   207  	}
   208  
   209  	return resp.([]*structs.Deployment), index, nil
   210  }
   211  
   212  // getDeploysImpl retrieves all deployments from the passed state store.
   213  func (w *Watcher) getDeploysImpl(ws memdb.WatchSet, store *state.StateStore) (interface{}, uint64, error) {
   214  
   215  	iter, err := store.Deployments(ws, state.SortDefault)
   216  	if err != nil {
   217  		return nil, 0, err
   218  	}
   219  
   220  	var deploys []*structs.Deployment
   221  	for {
   222  		raw := iter.Next()
   223  		if raw == nil {
   224  			break
   225  		}
   226  		deploy := raw.(*structs.Deployment)
   227  		deploys = append(deploys, deploy)
   228  	}
   229  
   230  	// Use the last index that affected the deployment table
   231  	index, err := store.Index("deployment")
   232  	if err != nil {
   233  		return nil, 0, err
   234  	}
   235  
   236  	return deploys, index, nil
   237  }
   238  
   239  // add adds a deployment to the watch list
   240  func (w *Watcher) add(d *structs.Deployment) error {
   241  	w.l.Lock()
   242  	defer w.l.Unlock()
   243  	_, err := w.addLocked(d)
   244  	return err
   245  }
   246  
   247  // addLocked adds a deployment to the watch list and should only be called when
   248  // locked. Creating the deploymentWatcher starts a go routine to .watch() it
   249  func (w *Watcher) addLocked(d *structs.Deployment) (*deploymentWatcher, error) {
   250  	// Not enabled so no-op
   251  	if !w.enabled {
   252  		return nil, nil
   253  	}
   254  
   255  	if !d.Active() {
   256  		return nil, fmt.Errorf("deployment %q is terminal", d.ID)
   257  	}
   258  
   259  	// Already watched so just update the deployment
   260  	if w, ok := w.watchers[d.ID]; ok {
   261  		w.updateDeployment(d)
   262  		return nil, nil
   263  	}
   264  
   265  	// Get the job the deployment is referencing
   266  	snap, err := w.state.Snapshot()
   267  	if err != nil {
   268  		return nil, err
   269  	}
   270  
   271  	job, err := snap.JobByID(nil, d.Namespace, d.JobID)
   272  	if err != nil {
   273  		return nil, err
   274  	}
   275  	if job == nil {
   276  		return nil, fmt.Errorf("deployment %q references unknown job %q", d.ID, d.JobID)
   277  	}
   278  
   279  	watcher := newDeploymentWatcher(w.ctx, w.queryLimiter, w.logger, w.state, d, job,
   280  		w, w.deploymentRPC, w.jobRPC)
   281  	w.watchers[d.ID] = watcher
   282  	return watcher, nil
   283  }
   284  
   285  // remove stops watching a deployment. This can be because the deployment is
   286  // complete or being deleted.
   287  func (w *Watcher) remove(d *structs.Deployment) {
   288  	w.l.Lock()
   289  	defer w.l.Unlock()
   290  
   291  	// Not enabled so no-op
   292  	if !w.enabled {
   293  		return
   294  	}
   295  
   296  	if watcher, ok := w.watchers[d.ID]; ok {
   297  		watcher.StopWatch()
   298  		delete(w.watchers, d.ID)
   299  	}
   300  }
   301  
   302  // forceAdd is used to force a lookup of the given deployment object and create
   303  // a watcher. If the deployment does not exist or is terminal an error is
   304  // returned.
   305  func (w *Watcher) forceAdd(dID string) (*deploymentWatcher, error) {
   306  	snap, err := w.state.Snapshot()
   307  	if err != nil {
   308  		return nil, err
   309  	}
   310  
   311  	deployment, err := snap.DeploymentByID(nil, dID)
   312  	if err != nil {
   313  		return nil, err
   314  	}
   315  
   316  	if deployment == nil {
   317  		return nil, fmt.Errorf("unknown deployment %q", dID)
   318  	}
   319  
   320  	return w.addLocked(deployment)
   321  }
   322  
   323  // getOrCreateWatcher returns the deployment watcher for the given deployment ID.
   324  func (w *Watcher) getOrCreateWatcher(dID string) (*deploymentWatcher, error) {
   325  	w.l.Lock()
   326  	defer w.l.Unlock()
   327  
   328  	// Not enabled so no-op
   329  	if !w.enabled {
   330  		return nil, notEnabled
   331  	}
   332  
   333  	watcher, ok := w.watchers[dID]
   334  	if ok {
   335  		return watcher, nil
   336  	}
   337  
   338  	return w.forceAdd(dID)
   339  }
   340  
   341  // SetAllocHealth is used to set the health of allocations for a deployment. If
   342  // there are any unhealthy allocations, the deployment is updated to be failed.
   343  // Otherwise the allocations are updated and an evaluation is created.
   344  func (w *Watcher) SetAllocHealth(req *structs.DeploymentAllocHealthRequest, resp *structs.DeploymentUpdateResponse) error {
   345  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   346  	if err != nil {
   347  		return err
   348  	}
   349  
   350  	return watcher.SetAllocHealth(req, resp)
   351  }
   352  
   353  // PromoteDeployment is used to promote a deployment. If promote is false,
   354  // deployment is marked as failed. Otherwise the deployment is updated and an
   355  // evaluation is created.
   356  func (w *Watcher) PromoteDeployment(req *structs.DeploymentPromoteRequest, resp *structs.DeploymentUpdateResponse) error {
   357  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   358  	if err != nil {
   359  		return err
   360  	}
   361  
   362  	return watcher.PromoteDeployment(req, resp)
   363  }
   364  
   365  // PauseDeployment is used to toggle the pause state on a deployment. If the
   366  // deployment is being unpaused, an evaluation is created.
   367  func (w *Watcher) PauseDeployment(req *structs.DeploymentPauseRequest, resp *structs.DeploymentUpdateResponse) error {
   368  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   369  	if err != nil {
   370  		return err
   371  	}
   372  
   373  	return watcher.PauseDeployment(req, resp)
   374  }
   375  
   376  // FailDeployment is used to fail the deployment.
   377  func (w *Watcher) FailDeployment(req *structs.DeploymentFailRequest, resp *structs.DeploymentUpdateResponse) error {
   378  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   379  	if err != nil {
   380  		return err
   381  	}
   382  
   383  	return watcher.FailDeployment(req, resp)
   384  }
   385  
   386  // RunDeployment is used to run a pending multiregion deployment.  In
   387  // single-region deployments, the pending state is unused.
   388  func (w *Watcher) RunDeployment(req *structs.DeploymentRunRequest, resp *structs.DeploymentUpdateResponse) error {
   389  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   390  	if err != nil {
   391  		return err
   392  	}
   393  
   394  	return watcher.RunDeployment(req, resp)
   395  }
   396  
   397  // UnblockDeployment is used to unblock a multiregion deployment.  In
   398  // single-region deployments, the blocked state is unused.
   399  func (w *Watcher) UnblockDeployment(req *structs.DeploymentUnblockRequest, resp *structs.DeploymentUpdateResponse) error {
   400  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   401  	if err != nil {
   402  		return err
   403  	}
   404  
   405  	return watcher.UnblockDeployment(req, resp)
   406  }
   407  
   408  // CancelDeployment is used to cancel a multiregion deployment.  In
   409  // single-region deployments, the deploymentwatcher has sole responsibility to
   410  // cancel deployments so this RPC is never used.
   411  func (w *Watcher) CancelDeployment(req *structs.DeploymentCancelRequest, resp *structs.DeploymentUpdateResponse) error {
   412  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   413  	if err != nil {
   414  		return err
   415  	}
   416  
   417  	return watcher.CancelDeployment(req, resp)
   418  }
   419  
   420  // createUpdate commits the given allocation desired transition and evaluation
   421  // to Raft but batches the commit with other calls.
   422  func (w *Watcher) createUpdate(allocs map[string]*structs.DesiredTransition, eval *structs.Evaluation) (uint64, error) {
   423  	b := w.allocUpdateBatcher
   424  	if b == nil {
   425  		return 0, notEnabled
   426  	}
   427  	return b.CreateUpdate(allocs, eval).Results()
   428  }
   429  
   430  // upsertJob commits the given job to Raft
   431  func (w *Watcher) upsertJob(job *structs.Job) (uint64, error) {
   432  	return w.raft.UpsertJob(job)
   433  }
   434  
   435  // upsertDeploymentStatusUpdate commits the given deployment update and optional
   436  // evaluation to Raft
   437  func (w *Watcher) upsertDeploymentStatusUpdate(
   438  	u *structs.DeploymentStatusUpdate,
   439  	e *structs.Evaluation,
   440  	j *structs.Job) (uint64, error) {
   441  	return w.raft.UpdateDeploymentStatus(&structs.DeploymentStatusUpdateRequest{
   442  		DeploymentUpdate: u,
   443  		Eval:             e,
   444  		Job:              j,
   445  	})
   446  }
   447  
   448  // upsertDeploymentPromotion commits the given deployment promotion to Raft
   449  func (w *Watcher) upsertDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) {
   450  	return w.raft.UpdateDeploymentPromotion(req)
   451  }
   452  
   453  // upsertDeploymentAllocHealth commits the given allocation health changes to
   454  // Raft
   455  func (w *Watcher) upsertDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) {
   456  	return w.raft.UpdateDeploymentAllocHealth(req)
   457  }