github.com/hernad/nomad@v1.6.112/nomad/deploymentwatcher/deployments_watcher.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package deploymentwatcher 5 6 import ( 7 "context" 8 "fmt" 9 "sync" 10 "time" 11 12 "golang.org/x/time/rate" 13 14 log "github.com/hashicorp/go-hclog" 15 memdb "github.com/hashicorp/go-memdb" 16 17 "github.com/hernad/nomad/nomad/state" 18 "github.com/hernad/nomad/nomad/structs" 19 ) 20 21 const ( 22 // LimitStateQueriesPerSecond is the number of state queries allowed per 23 // second 24 LimitStateQueriesPerSecond = 100.0 25 26 // CrossDeploymentUpdateBatchDuration is the duration in which allocation 27 // desired transition and evaluation creation updates are batched across 28 // all deployment watchers before committing to Raft. 29 CrossDeploymentUpdateBatchDuration = 250 * time.Millisecond 30 ) 31 32 var ( 33 // notEnabled is the error returned when the deployment watcher is not 34 // enabled 35 notEnabled = fmt.Errorf("deployment watcher not enabled") 36 ) 37 38 // DeploymentRaftEndpoints exposes the deployment watcher to a set of functions 39 // to apply data transforms via Raft. 40 type DeploymentRaftEndpoints interface { 41 // UpsertJob is used to upsert a job 42 UpsertJob(job *structs.Job) (uint64, error) 43 44 // UpdateDeploymentStatus is used to make a deployment status update 45 // and potentially create an evaluation. 46 UpdateDeploymentStatus(u *structs.DeploymentStatusUpdateRequest) (uint64, error) 47 48 // UpdateDeploymentPromotion is used to promote canaries in a deployment 49 UpdateDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) 50 51 // UpdateDeploymentAllocHealth is used to set the health of allocations in a 52 // deployment 53 UpdateDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) 54 55 // UpdateAllocDesiredTransition is used to update the desired transition 56 // for allocations. 57 UpdateAllocDesiredTransition(req *structs.AllocUpdateDesiredTransitionRequest) (uint64, error) 58 } 59 60 // Watcher is used to watch deployments and their allocations created 61 // by the scheduler and trigger the scheduler when allocation health 62 // transitions. 63 type Watcher struct { 64 enabled bool 65 logger log.Logger 66 67 // queryLimiter is used to limit the rate of blocking queries 68 queryLimiter *rate.Limiter 69 70 // updateBatchDuration is the duration to batch allocation desired 71 // transition and eval creation across all deployment watchers 72 updateBatchDuration time.Duration 73 74 // raft contains the set of Raft endpoints that can be used by the 75 // deployments watcher 76 raft DeploymentRaftEndpoints 77 78 // state is the state that is watched for state changes. 79 state *state.StateStore 80 81 // server interface for Deployment RPCs 82 deploymentRPC DeploymentRPC 83 84 // server interface for Job RPCs 85 jobRPC JobRPC 86 87 // watchers is the set of active watchers, one per deployment 88 watchers map[string]*deploymentWatcher 89 90 // allocUpdateBatcher is used to batch the creation of evaluations and 91 // allocation desired transition updates 92 allocUpdateBatcher *AllocUpdateBatcher 93 94 // ctx and exitFn are used to cancel the watcher 95 ctx context.Context 96 exitFn context.CancelFunc 97 98 l sync.RWMutex 99 } 100 101 // NewDeploymentsWatcher returns a deployments watcher that is used to watch 102 // deployments and trigger the scheduler as needed. 103 func NewDeploymentsWatcher(logger log.Logger, 104 raft DeploymentRaftEndpoints, 105 deploymentRPC DeploymentRPC, jobRPC JobRPC, 106 stateQueriesPerSecond float64, 107 updateBatchDuration time.Duration, 108 ) *Watcher { 109 110 return &Watcher{ 111 raft: raft, 112 deploymentRPC: deploymentRPC, 113 jobRPC: jobRPC, 114 queryLimiter: rate.NewLimiter(rate.Limit(stateQueriesPerSecond), 100), 115 updateBatchDuration: updateBatchDuration, 116 logger: logger.Named("deployments_watcher"), 117 } 118 } 119 120 // SetEnabled is used to control if the watcher is enabled. The watcher 121 // should only be enabled on the active leader. When being enabled the state is 122 // passed in as it is no longer valid once a leader election has taken place. 123 func (w *Watcher) SetEnabled(enabled bool, state *state.StateStore) { 124 w.l.Lock() 125 defer w.l.Unlock() 126 127 wasEnabled := w.enabled 128 w.enabled = enabled 129 130 if state != nil { 131 w.state = state 132 } 133 134 // Flush the state to create the necessary objects 135 w.flush(enabled) 136 137 // If we are starting now, launch the watch daemon 138 if enabled && !wasEnabled { 139 go w.watchDeployments(w.ctx) 140 } 141 } 142 143 // flush is used to clear the state of the watcher 144 func (w *Watcher) flush(enabled bool) { 145 // Stop all the watchers and clear it 146 for _, watcher := range w.watchers { 147 watcher.StopWatch() 148 } 149 150 // Kill everything associated with the watcher 151 if w.exitFn != nil { 152 w.exitFn() 153 } 154 155 w.watchers = make(map[string]*deploymentWatcher, 32) 156 w.ctx, w.exitFn = context.WithCancel(context.Background()) 157 158 if enabled { 159 w.allocUpdateBatcher = NewAllocUpdateBatcher(w.ctx, w.updateBatchDuration, w.raft) 160 } else { 161 w.allocUpdateBatcher = nil 162 } 163 } 164 165 // watchDeployments is the long lived go-routine that watches for deployments to 166 // add and remove watchers on. 167 func (w *Watcher) watchDeployments(ctx context.Context) { 168 dindex := uint64(1) 169 for { 170 // Block getting all deployments using the last deployment index. 171 deployments, idx, err := w.getDeploys(ctx, dindex) 172 if err != nil { 173 if err == context.Canceled { 174 return 175 } 176 177 w.logger.Error("failed to retrieve deployments", "error", err) 178 } 179 180 // Update the latest index 181 dindex = idx 182 183 // Ensure we are tracking the things we should and not tracking what we 184 // shouldn't be 185 for _, d := range deployments { 186 if d.Active() { 187 if err := w.add(d); err != nil { 188 w.logger.Error("failed to track deployment", "deployment_id", d.ID, "error", err) 189 } 190 } else { 191 w.remove(d) 192 } 193 } 194 } 195 } 196 197 // getDeploys retrieves all deployments blocking at the given index. 198 func (w *Watcher) getDeploys(ctx context.Context, minIndex uint64) ([]*structs.Deployment, uint64, error) { 199 // state can be updated concurrently 200 w.l.Lock() 201 stateStore := w.state 202 w.l.Unlock() 203 204 resp, index, err := stateStore.BlockingQuery(w.getDeploysImpl, minIndex, ctx) 205 if err != nil { 206 return nil, 0, err 207 } 208 209 return resp.([]*structs.Deployment), index, nil 210 } 211 212 // getDeploysImpl retrieves all deployments from the passed state store. 213 func (w *Watcher) getDeploysImpl(ws memdb.WatchSet, store *state.StateStore) (interface{}, uint64, error) { 214 215 iter, err := store.Deployments(ws, state.SortDefault) 216 if err != nil { 217 return nil, 0, err 218 } 219 220 var deploys []*structs.Deployment 221 for { 222 raw := iter.Next() 223 if raw == nil { 224 break 225 } 226 deploy := raw.(*structs.Deployment) 227 deploys = append(deploys, deploy) 228 } 229 230 // Use the last index that affected the deployment table 231 index, err := store.Index("deployment") 232 if err != nil { 233 return nil, 0, err 234 } 235 236 return deploys, index, nil 237 } 238 239 // add adds a deployment to the watch list 240 func (w *Watcher) add(d *structs.Deployment) error { 241 w.l.Lock() 242 defer w.l.Unlock() 243 _, err := w.addLocked(d) 244 return err 245 } 246 247 // addLocked adds a deployment to the watch list and should only be called when 248 // locked. Creating the deploymentWatcher starts a go routine to .watch() it 249 func (w *Watcher) addLocked(d *structs.Deployment) (*deploymentWatcher, error) { 250 // Not enabled so no-op 251 if !w.enabled { 252 return nil, nil 253 } 254 255 if !d.Active() { 256 return nil, fmt.Errorf("deployment %q is terminal", d.ID) 257 } 258 259 // Already watched so just update the deployment 260 if w, ok := w.watchers[d.ID]; ok { 261 w.updateDeployment(d) 262 return nil, nil 263 } 264 265 // Get the job the deployment is referencing 266 snap, err := w.state.Snapshot() 267 if err != nil { 268 return nil, err 269 } 270 271 job, err := snap.JobByID(nil, d.Namespace, d.JobID) 272 if err != nil { 273 return nil, err 274 } 275 if job == nil { 276 return nil, fmt.Errorf("deployment %q references unknown job %q", d.ID, d.JobID) 277 } 278 279 watcher := newDeploymentWatcher(w.ctx, w.queryLimiter, w.logger, w.state, d, job, 280 w, w.deploymentRPC, w.jobRPC) 281 w.watchers[d.ID] = watcher 282 return watcher, nil 283 } 284 285 // remove stops watching a deployment. This can be because the deployment is 286 // complete or being deleted. 287 func (w *Watcher) remove(d *structs.Deployment) { 288 w.l.Lock() 289 defer w.l.Unlock() 290 291 // Not enabled so no-op 292 if !w.enabled { 293 return 294 } 295 296 if watcher, ok := w.watchers[d.ID]; ok { 297 watcher.StopWatch() 298 delete(w.watchers, d.ID) 299 } 300 } 301 302 // forceAdd is used to force a lookup of the given deployment object and create 303 // a watcher. If the deployment does not exist or is terminal an error is 304 // returned. 305 func (w *Watcher) forceAdd(dID string) (*deploymentWatcher, error) { 306 snap, err := w.state.Snapshot() 307 if err != nil { 308 return nil, err 309 } 310 311 deployment, err := snap.DeploymentByID(nil, dID) 312 if err != nil { 313 return nil, err 314 } 315 316 if deployment == nil { 317 return nil, fmt.Errorf("unknown deployment %q", dID) 318 } 319 320 return w.addLocked(deployment) 321 } 322 323 // getOrCreateWatcher returns the deployment watcher for the given deployment ID. 324 func (w *Watcher) getOrCreateWatcher(dID string) (*deploymentWatcher, error) { 325 w.l.Lock() 326 defer w.l.Unlock() 327 328 // Not enabled so no-op 329 if !w.enabled { 330 return nil, notEnabled 331 } 332 333 watcher, ok := w.watchers[dID] 334 if ok { 335 return watcher, nil 336 } 337 338 return w.forceAdd(dID) 339 } 340 341 // SetAllocHealth is used to set the health of allocations for a deployment. If 342 // there are any unhealthy allocations, the deployment is updated to be failed. 343 // Otherwise the allocations are updated and an evaluation is created. 344 func (w *Watcher) SetAllocHealth(req *structs.DeploymentAllocHealthRequest, resp *structs.DeploymentUpdateResponse) error { 345 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 346 if err != nil { 347 return err 348 } 349 350 return watcher.SetAllocHealth(req, resp) 351 } 352 353 // PromoteDeployment is used to promote a deployment. If promote is false, 354 // deployment is marked as failed. Otherwise the deployment is updated and an 355 // evaluation is created. 356 func (w *Watcher) PromoteDeployment(req *structs.DeploymentPromoteRequest, resp *structs.DeploymentUpdateResponse) error { 357 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 358 if err != nil { 359 return err 360 } 361 362 return watcher.PromoteDeployment(req, resp) 363 } 364 365 // PauseDeployment is used to toggle the pause state on a deployment. If the 366 // deployment is being unpaused, an evaluation is created. 367 func (w *Watcher) PauseDeployment(req *structs.DeploymentPauseRequest, resp *structs.DeploymentUpdateResponse) error { 368 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 369 if err != nil { 370 return err 371 } 372 373 return watcher.PauseDeployment(req, resp) 374 } 375 376 // FailDeployment is used to fail the deployment. 377 func (w *Watcher) FailDeployment(req *structs.DeploymentFailRequest, resp *structs.DeploymentUpdateResponse) error { 378 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 379 if err != nil { 380 return err 381 } 382 383 return watcher.FailDeployment(req, resp) 384 } 385 386 // RunDeployment is used to run a pending multiregion deployment. In 387 // single-region deployments, the pending state is unused. 388 func (w *Watcher) RunDeployment(req *structs.DeploymentRunRequest, resp *structs.DeploymentUpdateResponse) error { 389 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 390 if err != nil { 391 return err 392 } 393 394 return watcher.RunDeployment(req, resp) 395 } 396 397 // UnblockDeployment is used to unblock a multiregion deployment. In 398 // single-region deployments, the blocked state is unused. 399 func (w *Watcher) UnblockDeployment(req *structs.DeploymentUnblockRequest, resp *structs.DeploymentUpdateResponse) error { 400 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 401 if err != nil { 402 return err 403 } 404 405 return watcher.UnblockDeployment(req, resp) 406 } 407 408 // CancelDeployment is used to cancel a multiregion deployment. In 409 // single-region deployments, the deploymentwatcher has sole responsibility to 410 // cancel deployments so this RPC is never used. 411 func (w *Watcher) CancelDeployment(req *structs.DeploymentCancelRequest, resp *structs.DeploymentUpdateResponse) error { 412 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 413 if err != nil { 414 return err 415 } 416 417 return watcher.CancelDeployment(req, resp) 418 } 419 420 // createUpdate commits the given allocation desired transition and evaluation 421 // to Raft but batches the commit with other calls. 422 func (w *Watcher) createUpdate(allocs map[string]*structs.DesiredTransition, eval *structs.Evaluation) (uint64, error) { 423 b := w.allocUpdateBatcher 424 if b == nil { 425 return 0, notEnabled 426 } 427 return b.CreateUpdate(allocs, eval).Results() 428 } 429 430 // upsertJob commits the given job to Raft 431 func (w *Watcher) upsertJob(job *structs.Job) (uint64, error) { 432 return w.raft.UpsertJob(job) 433 } 434 435 // upsertDeploymentStatusUpdate commits the given deployment update and optional 436 // evaluation to Raft 437 func (w *Watcher) upsertDeploymentStatusUpdate( 438 u *structs.DeploymentStatusUpdate, 439 e *structs.Evaluation, 440 j *structs.Job) (uint64, error) { 441 return w.raft.UpdateDeploymentStatus(&structs.DeploymentStatusUpdateRequest{ 442 DeploymentUpdate: u, 443 Eval: e, 444 Job: j, 445 }) 446 } 447 448 // upsertDeploymentPromotion commits the given deployment promotion to Raft 449 func (w *Watcher) upsertDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) { 450 return w.raft.UpdateDeploymentPromotion(req) 451 } 452 453 // upsertDeploymentAllocHealth commits the given allocation health changes to 454 // Raft 455 func (w *Watcher) upsertDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) { 456 return w.raft.UpdateDeploymentAllocHealth(req) 457 }