github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/nomad/deploymentwatcher/deployments_watcher.go

github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/nomad/deploymentwatcher/deployments_watcher.go (about)

     1  package deploymentwatcher
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"sync"
     7  	"time"
     8  
     9  	"golang.org/x/time/rate"
    10  
    11  	log "github.com/hashicorp/go-hclog"
    12  	memdb "github.com/hashicorp/go-memdb"
    13  
    14  	"github.com/hashicorp/nomad/nomad/state"
    15  	"github.com/hashicorp/nomad/nomad/structs"
    16  )
    17  
const (
	// LimitStateQueriesPerSecond is the number of state queries allowed per
	// second.
	// NOTE(review): presumably the default handed to NewDeploymentsWatcher as
	// stateQueriesPerSecond; confirm at the call site.
	LimitStateQueriesPerSecond = 100.0

	// CrossDeploymentUpdateBatchDuration is the duration in which allocation
	// desired transition and evaluation creation updates are batched across
	// all deployment watchers before committing to Raft.
	// NOTE(review): presumably the default handed to NewDeploymentsWatcher as
	// updateBatchDuration; confirm at the call site.
	CrossDeploymentUpdateBatchDuration = 250 * time.Millisecond
)
    28  
var (
	// notEnabled is the error returned when the deployment watcher is not
	// enabled. Within this file it is returned by getOrCreateWatcher when the
	// enabled flag is false and by createUpdate when no batcher is set.
	notEnabled = fmt.Errorf("deployment watcher not enabled")
)
    34  
// DeploymentRaftEndpoints exposes the deployment watcher to a set of functions
// to apply data transforms via Raft.
//
// Every method returns a uint64 alongside the error.
// NOTE(review): the uint64 is presumably the Raft index at which the change
// was applied; confirm against the implementation.
type DeploymentRaftEndpoints interface {
	// UpsertJob is used to upsert a job
	UpsertJob(job *structs.Job) (uint64, error)

	// UpdateDeploymentStatus is used to make a deployment status update
	// and potentially create an evaluation.
	UpdateDeploymentStatus(u *structs.DeploymentStatusUpdateRequest) (uint64, error)

	// UpdateDeploymentPromotion is used to promote canaries in a deployment
	UpdateDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error)

	// UpdateDeploymentAllocHealth is used to set the health of allocations in a
	// deployment
	UpdateDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error)

	// UpdateAllocDesiredTransition is used to update the desired transition
	// for allocations.
	UpdateAllocDesiredTransition(req *structs.AllocUpdateDesiredTransitionRequest) (uint64, error)
}
    56  
// Watcher is used to watch deployments and their allocations created
// by the scheduler and trigger the scheduler when allocation health
// transitions.
//
// A Watcher is built by NewDeploymentsWatcher and switched on or off with
// SetEnabled. While enabled it keeps one deploymentWatcher per tracked
// deployment in the watchers map.
type Watcher struct {
	// enabled holds the value last passed to SetEnabled. While it is false
	// addLocked and remove do nothing and getOrCreateWatcher returns
	// notEnabled. logger is the named sub-logger set up by the constructor.
	enabled bool
	logger  log.Logger

	// queryLimiter is used to limit the rate of blocking queries
	queryLimiter *rate.Limiter

	// updateBatchDuration is the duration to batch allocation desired
	// transition and eval creation across all deployment watchers
	updateBatchDuration time.Duration

	// raft contains the set of Raft endpoints that can be used by the
	// deployments watcher
	raft DeploymentRaftEndpoints

	// state is the state that is watched for state changes. It is replaced
	// by SetEnabled whenever a non-nil store is passed in.
	state *state.StateStore

	// server interface for Deployment RPCs
	deploymentRPC DeploymentRPC

	// server interface for Job RPCs
	jobRPC JobRPC

	// watchers is the set of active watchers, one per deployment, keyed by
	// deployment ID. flush replaces the whole map.
	watchers map[string]*deploymentWatcher

	// allocUpdateBatcher is used to batch the creation of evaluations and
	// allocation desired transition updates. flush sets it to nil when the
	// watcher is disabled.
	allocUpdateBatcher *AllocUpdateBatcher

	// ctx and exitFn are used to cancel the watcher. flush calls the old
	// exitFn and installs a fresh pair each time it runs.
	ctx    context.Context
	exitFn context.CancelFunc

	// l is held by SetEnabled, add, remove and getOrCreateWatcher while they
	// read or modify the fields above.
	l sync.RWMutex
}
    97  
    98  // NewDeploymentsWatcher returns a deployments watcher that is used to watch
    99  // deployments and trigger the scheduler as needed.
   100  func NewDeploymentsWatcher(logger log.Logger,
   101  	raft DeploymentRaftEndpoints,
   102  	deploymentRPC DeploymentRPC, jobRPC JobRPC,
   103  	stateQueriesPerSecond float64,
   104  	updateBatchDuration time.Duration,
   105  ) *Watcher {
   106  
   107  	return &Watcher{
   108  		raft:                raft,
   109  		deploymentRPC:       deploymentRPC,
   110  		jobRPC:              jobRPC,
   111  		queryLimiter:        rate.NewLimiter(rate.Limit(stateQueriesPerSecond), 100),
   112  		updateBatchDuration: updateBatchDuration,
   113  		logger:              logger.Named("deployments_watcher"),
   114  	}
   115  }
   116  
   117  // SetEnabled is used to control if the watcher is enabled. The watcher
   118  // should only be enabled on the active leader. When being enabled the state is
   119  // passed in as it is no longer valid once a leader election has taken place.
   120  func (w *Watcher) SetEnabled(enabled bool, state *state.StateStore) {
   121  	w.l.Lock()
   122  	defer w.l.Unlock()
   123  
   124  	wasEnabled := w.enabled
   125  	w.enabled = enabled
   126  
   127  	if state != nil {
   128  		w.state = state
   129  	}
   130  
   131  	// Flush the state to create the necessary objects
   132  	w.flush(enabled)
   133  
   134  	// If we are starting now, launch the watch daemon
   135  	if enabled && !wasEnabled {
   136  		go w.watchDeployments(w.ctx)
   137  	}
   138  }
   139  
   140  // flush is used to clear the state of the watcher
   141  func (w *Watcher) flush(enabled bool) {
   142  	// Stop all the watchers and clear it
   143  	for _, watcher := range w.watchers {
   144  		watcher.StopWatch()
   145  	}
   146  
   147  	// Kill everything associated with the watcher
   148  	if w.exitFn != nil {
   149  		w.exitFn()
   150  	}
   151  
   152  	w.watchers = make(map[string]*deploymentWatcher, 32)
   153  	w.ctx, w.exitFn = context.WithCancel(context.Background())
   154  
   155  	if enabled {
   156  		w.allocUpdateBatcher = NewAllocUpdateBatcher(w.ctx, w.updateBatchDuration, w.raft)
   157  	} else {
   158  		w.allocUpdateBatcher = nil
   159  	}
   160  }
   161  
   162  // watchDeployments is the long lived go-routine that watches for deployments to
   163  // add and remove watchers on.
   164  func (w *Watcher) watchDeployments(ctx context.Context) {
   165  	dindex := uint64(1)
   166  	for {
   167  		// Block getting all deployments using the last deployment index.
   168  		deployments, idx, err := w.getDeploys(ctx, dindex)
   169  		if err != nil {
   170  			if err == context.Canceled {
   171  				return
   172  			}
   173  
   174  			w.logger.Error("failed to retrieve deployments", "error", err)
   175  		}
   176  
   177  		// Update the latest index
   178  		dindex = idx
   179  
   180  		// Ensure we are tracking the things we should and not tracking what we
   181  		// shouldn't be
   182  		for _, d := range deployments {
   183  			if d.Active() {
   184  				if err := w.add(d); err != nil {
   185  					w.logger.Error("failed to track deployment", "deployment_id", d.ID, "error", err)
   186  				}
   187  			} else {
   188  				w.remove(d)
   189  			}
   190  		}
   191  	}
   192  }
   193  
   194  // getDeploys retrieves all deployments blocking at the given index.
   195  func (w *Watcher) getDeploys(ctx context.Context, minIndex uint64) ([]*structs.Deployment, uint64, error) {
   196  	resp, index, err := w.state.BlockingQuery(w.getDeploysImpl, minIndex, ctx)
   197  	if err != nil {
   198  		return nil, 0, err
   199  	}
   200  
   201  	return resp.([]*structs.Deployment), index, nil
   202  }
   203  
   204  // getDeploysImpl retrieves all deployments from the passed state store.
   205  func (w *Watcher) getDeploysImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
   206  
   207  	iter, err := state.Deployments(ws)
   208  	if err != nil {
   209  		return nil, 0, err
   210  	}
   211  
   212  	var deploys []*structs.Deployment
   213  	for {
   214  		raw := iter.Next()
   215  		if raw == nil {
   216  			break
   217  		}
   218  		deploy := raw.(*structs.Deployment)
   219  		deploys = append(deploys, deploy)
   220  	}
   221  
   222  	// Use the last index that affected the deployment table
   223  	index, err := state.Index("deployment")
   224  	if err != nil {
   225  		return nil, 0, err
   226  	}
   227  
   228  	return deploys, index, nil
   229  }
   230  
   231  // add adds a deployment to the watch list
   232  func (w *Watcher) add(d *structs.Deployment) error {
   233  	w.l.Lock()
   234  	defer w.l.Unlock()
   235  	_, err := w.addLocked(d)
   236  	return err
   237  }
   238  
   239  // addLocked adds a deployment to the watch list and should only be called when
   240  // locked. Creating the deploymentWatcher starts a go routine to .watch() it
   241  func (w *Watcher) addLocked(d *structs.Deployment) (*deploymentWatcher, error) {
   242  	// Not enabled so no-op
   243  	if !w.enabled {
   244  		return nil, nil
   245  	}
   246  
   247  	if !d.Active() {
   248  		return nil, fmt.Errorf("deployment %q is terminal", d.ID)
   249  	}
   250  
   251  	// Already watched so just update the deployment
   252  	if w, ok := w.watchers[d.ID]; ok {
   253  		w.updateDeployment(d)
   254  		return nil, nil
   255  	}
   256  
   257  	// Get the job the deployment is referencing
   258  	snap, err := w.state.Snapshot()
   259  	if err != nil {
   260  		return nil, err
   261  	}
   262  
   263  	job, err := snap.JobByID(nil, d.Namespace, d.JobID)
   264  	if err != nil {
   265  		return nil, err
   266  	}
   267  	if job == nil {
   268  		return nil, fmt.Errorf("deployment %q references unknown job %q", d.ID, d.JobID)
   269  	}
   270  
   271  	watcher := newDeploymentWatcher(w.ctx, w.queryLimiter, w.logger, w.state, d, job,
   272  		w, w.deploymentRPC, w.jobRPC)
   273  	w.watchers[d.ID] = watcher
   274  	return watcher, nil
   275  }
   276  
   277  // remove stops watching a deployment. This can be because the deployment is
   278  // complete or being deleted.
   279  func (w *Watcher) remove(d *structs.Deployment) {
   280  	w.l.Lock()
   281  	defer w.l.Unlock()
   282  
   283  	// Not enabled so no-op
   284  	if !w.enabled {
   285  		return
   286  	}
   287  
   288  	if watcher, ok := w.watchers[d.ID]; ok {
   289  		watcher.StopWatch()
   290  		delete(w.watchers, d.ID)
   291  	}
   292  }
   293  
   294  // forceAdd is used to force a lookup of the given deployment object and create
   295  // a watcher. If the deployment does not exist or is terminal an error is
   296  // returned.
   297  func (w *Watcher) forceAdd(dID string) (*deploymentWatcher, error) {
   298  	snap, err := w.state.Snapshot()
   299  	if err != nil {
   300  		return nil, err
   301  	}
   302  
   303  	deployment, err := snap.DeploymentByID(nil, dID)
   304  	if err != nil {
   305  		return nil, err
   306  	}
   307  
   308  	if deployment == nil {
   309  		return nil, fmt.Errorf("unknown deployment %q", dID)
   310  	}
   311  
   312  	return w.addLocked(deployment)
   313  }
   314  
   315  // getOrCreateWatcher returns the deployment watcher for the given deployment ID.
   316  func (w *Watcher) getOrCreateWatcher(dID string) (*deploymentWatcher, error) {
   317  	w.l.Lock()
   318  	defer w.l.Unlock()
   319  
   320  	// Not enabled so no-op
   321  	if !w.enabled {
   322  		return nil, notEnabled
   323  	}
   324  
   325  	watcher, ok := w.watchers[dID]
   326  	if ok {
   327  		return watcher, nil
   328  	}
   329  
   330  	return w.forceAdd(dID)
   331  }
   332  
   333  // SetAllocHealth is used to set the health of allocations for a deployment. If
   334  // there are any unhealthy allocations, the deployment is updated to be failed.
   335  // Otherwise the allocations are updated and an evaluation is created.
   336  func (w *Watcher) SetAllocHealth(req *structs.DeploymentAllocHealthRequest, resp *structs.DeploymentUpdateResponse) error {
   337  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   338  	if err != nil {
   339  		return err
   340  	}
   341  
   342  	return watcher.SetAllocHealth(req, resp)
   343  }
   344  
   345  // PromoteDeployment is used to promote a deployment. If promote is false,
   346  // deployment is marked as failed. Otherwise the deployment is updated and an
   347  // evaluation is created.
   348  func (w *Watcher) PromoteDeployment(req *structs.DeploymentPromoteRequest, resp *structs.DeploymentUpdateResponse) error {
   349  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   350  	if err != nil {
   351  		return err
   352  	}
   353  
   354  	return watcher.PromoteDeployment(req, resp)
   355  }
   356  
   357  // PauseDeployment is used to toggle the pause state on a deployment. If the
   358  // deployment is being unpaused, an evaluation is created.
   359  func (w *Watcher) PauseDeployment(req *structs.DeploymentPauseRequest, resp *structs.DeploymentUpdateResponse) error {
   360  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   361  	if err != nil {
   362  		return err
   363  	}
   364  
   365  	return watcher.PauseDeployment(req, resp)
   366  }
   367  
   368  // FailDeployment is used to fail the deployment.
   369  func (w *Watcher) FailDeployment(req *structs.DeploymentFailRequest, resp *structs.DeploymentUpdateResponse) error {
   370  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   371  	if err != nil {
   372  		return err
   373  	}
   374  
   375  	return watcher.FailDeployment(req, resp)
   376  }
   377  
   378  // RunDeployment is used to run a pending multiregion deployment.  In
   379  // single-region deployments, the pending state is unused.
   380  func (w *Watcher) RunDeployment(req *structs.DeploymentRunRequest, resp *structs.DeploymentUpdateResponse) error {
   381  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   382  	if err != nil {
   383  		return err
   384  	}
   385  
   386  	return watcher.RunDeployment(req, resp)
   387  }
   388  
   389  // UnblockDeployment is used to unblock a multiregion deployment.  In
   390  // single-region deployments, the blocked state is unused.
   391  func (w *Watcher) UnblockDeployment(req *structs.DeploymentUnblockRequest, resp *structs.DeploymentUpdateResponse) error {
   392  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   393  	if err != nil {
   394  		return err
   395  	}
   396  
   397  	return watcher.UnblockDeployment(req, resp)
   398  }
   399  
   400  // CancelDeployment is used to cancel a multiregion deployment.  In
   401  // single-region deployments, the deploymentwatcher has sole responsibility to
   402  // cancel deployments so this RPC is never used.
   403  func (w *Watcher) CancelDeployment(req *structs.DeploymentCancelRequest, resp *structs.DeploymentUpdateResponse) error {
   404  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   405  	if err != nil {
   406  		return err
   407  	}
   408  
   409  	return watcher.CancelDeployment(req, resp)
   410  }
   411  
   412  // createUpdate commits the given allocation desired transition and evaluation
   413  // to Raft but batches the commit with other calls.
   414  func (w *Watcher) createUpdate(allocs map[string]*structs.DesiredTransition, eval *structs.Evaluation) (uint64, error) {
   415  	b := w.allocUpdateBatcher
   416  	if b == nil {
   417  		return 0, notEnabled
   418  	}
   419  	return b.CreateUpdate(allocs, eval).Results()
   420  }
   421  
// upsertJob commits the given job to Raft by delegating to the UpsertJob
// Raft endpoint and returning its result unchanged.
func (w *Watcher) upsertJob(job *structs.Job) (uint64, error) {
	return w.raft.UpsertJob(job)
}
   426  
   427  // upsertDeploymentStatusUpdate commits the given deployment update and optional
   428  // evaluation to Raft
   429  func (w *Watcher) upsertDeploymentStatusUpdate(
   430  	u *structs.DeploymentStatusUpdate,
   431  	e *structs.Evaluation,
   432  	j *structs.Job) (uint64, error) {
   433  	return w.raft.UpdateDeploymentStatus(&structs.DeploymentStatusUpdateRequest{
   434  		DeploymentUpdate: u,
   435  		Eval:             e,
   436  		Job:              j,
   437  	})
   438  }
   439  
// upsertDeploymentPromotion commits the given deployment promotion to Raft by
// delegating to the UpdateDeploymentPromotion Raft endpoint and returning its
// result unchanged.
func (w *Watcher) upsertDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) {
	return w.raft.UpdateDeploymentPromotion(req)
}
   444  
// upsertDeploymentAllocHealth commits the given allocation health changes to
// Raft by delegating to the UpdateDeploymentAllocHealth Raft endpoint and
// returning its result unchanged.
func (w *Watcher) upsertDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) {
	return w.raft.UpdateDeploymentAllocHealth(req)
}