github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/nomad/deploymentwatcher/deployments_watcher.go (about) 1 package deploymentwatcher 2 3 import ( 4 "context" 5 "fmt" 6 "sync" 7 "time" 8 9 "golang.org/x/time/rate" 10 11 log "github.com/hashicorp/go-hclog" 12 memdb "github.com/hashicorp/go-memdb" 13 14 "github.com/hashicorp/nomad/nomad/state" 15 "github.com/hashicorp/nomad/nomad/structs" 16 ) 17 18 const ( 19 // LimitStateQueriesPerSecond is the number of state queries allowed per 20 // second 21 LimitStateQueriesPerSecond = 100.0 22 23 // CrossDeploymentUpdateBatchDuration is the duration in which allocation 24 // desired transition and evaluation creation updates are batched across 25 // all deployment watchers before committing to Raft. 26 CrossDeploymentUpdateBatchDuration = 250 * time.Millisecond 27 ) 28 29 var ( 30 // notEnabled is the error returned when the deployment watcher is not 31 // enabled 32 notEnabled = fmt.Errorf("deployment watcher not enabled") 33 ) 34 35 // DeploymentRaftEndpoints exposes the deployment watcher to a set of functions 36 // to apply data transforms via Raft. 37 type DeploymentRaftEndpoints interface { 38 // UpsertJob is used to upsert a job 39 UpsertJob(job *structs.Job) (uint64, error) 40 41 // UpdateDeploymentStatus is used to make a deployment status update 42 // and potentially create an evaluation. 43 UpdateDeploymentStatus(u *structs.DeploymentStatusUpdateRequest) (uint64, error) 44 45 // UpdateDeploymentPromotion is used to promote canaries in a deployment 46 UpdateDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) 47 48 // UpdateDeploymentAllocHealth is used to set the health of allocations in a 49 // deployment 50 UpdateDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) 51 52 // UpdateAllocDesiredTransition is used to update the desired transition 53 // for allocations. 54 UpdateAllocDesiredTransition(req *structs.AllocUpdateDesiredTransitionRequest) (uint64, error) 55 } 56 57 // Watcher is used to watch deployments and their allocations created 58 // by the scheduler and trigger the scheduler when allocation health 59 // transitions. 60 type Watcher struct { 61 enabled bool 62 logger log.Logger 63 64 // queryLimiter is used to limit the rate of blocking queries 65 queryLimiter *rate.Limiter 66 67 // updateBatchDuration is the duration to batch allocation desired 68 // transition and eval creation across all deployment watchers 69 updateBatchDuration time.Duration 70 71 // raft contains the set of Raft endpoints that can be used by the 72 // deployments watcher 73 raft DeploymentRaftEndpoints 74 75 // state is the state that is watched for state changes. 76 state *state.StateStore 77 78 // server interface for Deployment RPCs 79 deploymentRPC DeploymentRPC 80 81 // server interface for Job RPCs 82 jobRPC JobRPC 83 84 // watchers is the set of active watchers, one per deployment 85 watchers map[string]*deploymentWatcher 86 87 // allocUpdateBatcher is used to batch the creation of evaluations and 88 // allocation desired transition updates 89 allocUpdateBatcher *AllocUpdateBatcher 90 91 // ctx and exitFn are used to cancel the watcher 92 ctx context.Context 93 exitFn context.CancelFunc 94 95 l sync.RWMutex 96 } 97 98 // NewDeploymentsWatcher returns a deployments watcher that is used to watch 99 // deployments and trigger the scheduler as needed. 100 func NewDeploymentsWatcher(logger log.Logger, 101 raft DeploymentRaftEndpoints, 102 deploymentRPC DeploymentRPC, jobRPC JobRPC, 103 stateQueriesPerSecond float64, 104 updateBatchDuration time.Duration, 105 ) *Watcher { 106 107 return &Watcher{ 108 raft: raft, 109 deploymentRPC: deploymentRPC, 110 jobRPC: jobRPC, 111 queryLimiter: rate.NewLimiter(rate.Limit(stateQueriesPerSecond), 100), 112 updateBatchDuration: updateBatchDuration, 113 logger: logger.Named("deployments_watcher"), 114 } 115 } 116 117 // SetEnabled is used to control if the watcher is enabled. The watcher 118 // should only be enabled on the active leader. When being enabled the state is 119 // passed in as it is no longer valid once a leader election has taken place. 120 func (w *Watcher) SetEnabled(enabled bool, state *state.StateStore) { 121 w.l.Lock() 122 defer w.l.Unlock() 123 124 wasEnabled := w.enabled 125 w.enabled = enabled 126 127 if state != nil { 128 w.state = state 129 } 130 131 // Flush the state to create the necessary objects 132 w.flush(enabled) 133 134 // If we are starting now, launch the watch daemon 135 if enabled && !wasEnabled { 136 go w.watchDeployments(w.ctx) 137 } 138 } 139 140 // flush is used to clear the state of the watcher 141 func (w *Watcher) flush(enabled bool) { 142 // Stop all the watchers and clear it 143 for _, watcher := range w.watchers { 144 watcher.StopWatch() 145 } 146 147 // Kill everything associated with the watcher 148 if w.exitFn != nil { 149 w.exitFn() 150 } 151 152 w.watchers = make(map[string]*deploymentWatcher, 32) 153 w.ctx, w.exitFn = context.WithCancel(context.Background()) 154 155 if enabled { 156 w.allocUpdateBatcher = NewAllocUpdateBatcher(w.ctx, w.updateBatchDuration, w.raft) 157 } else { 158 w.allocUpdateBatcher = nil 159 } 160 } 161 162 // watchDeployments is the long lived go-routine that watches for deployments to 163 // add and remove watchers on. 164 func (w *Watcher) watchDeployments(ctx context.Context) { 165 dindex := uint64(1) 166 for { 167 // Block getting all deployments using the last deployment index. 168 deployments, idx, err := w.getDeploys(ctx, dindex) 169 if err != nil { 170 if err == context.Canceled { 171 return 172 } 173 174 w.logger.Error("failed to retrieve deployments", "error", err) 175 } 176 177 // Update the latest index 178 dindex = idx 179 180 // Ensure we are tracking the things we should and not tracking what we 181 // shouldn't be 182 for _, d := range deployments { 183 if d.Active() { 184 if err := w.add(d); err != nil { 185 w.logger.Error("failed to track deployment", "deployment_id", d.ID, "error", err) 186 } 187 } else { 188 w.remove(d) 189 } 190 } 191 } 192 } 193 194 // getDeploys retrieves all deployments blocking at the given index. 195 func (w *Watcher) getDeploys(ctx context.Context, minIndex uint64) ([]*structs.Deployment, uint64, error) { 196 resp, index, err := w.state.BlockingQuery(w.getDeploysImpl, minIndex, ctx) 197 if err != nil { 198 return nil, 0, err 199 } 200 201 return resp.([]*structs.Deployment), index, nil 202 } 203 204 // getDeploysImpl retrieves all deployments from the passed state store. 205 func (w *Watcher) getDeploysImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { 206 207 iter, err := state.Deployments(ws) 208 if err != nil { 209 return nil, 0, err 210 } 211 212 var deploys []*structs.Deployment 213 for { 214 raw := iter.Next() 215 if raw == nil { 216 break 217 } 218 deploy := raw.(*structs.Deployment) 219 deploys = append(deploys, deploy) 220 } 221 222 // Use the last index that affected the deployment table 223 index, err := state.Index("deployment") 224 if err != nil { 225 return nil, 0, err 226 } 227 228 return deploys, index, nil 229 } 230 231 // add adds a deployment to the watch list 232 func (w *Watcher) add(d *structs.Deployment) error { 233 w.l.Lock() 234 defer w.l.Unlock() 235 _, err := w.addLocked(d) 236 return err 237 } 238 239 // addLocked adds a deployment to the watch list and should only be called when 240 // locked. Creating the deploymentWatcher starts a go routine to .watch() it 241 func (w *Watcher) addLocked(d *structs.Deployment) (*deploymentWatcher, error) { 242 // Not enabled so no-op 243 if !w.enabled { 244 return nil, nil 245 } 246 247 if !d.Active() { 248 return nil, fmt.Errorf("deployment %q is terminal", d.ID) 249 } 250 251 // Already watched so just update the deployment 252 if w, ok := w.watchers[d.ID]; ok { 253 w.updateDeployment(d) 254 return nil, nil 255 } 256 257 // Get the job the deployment is referencing 258 snap, err := w.state.Snapshot() 259 if err != nil { 260 return nil, err 261 } 262 263 job, err := snap.JobByID(nil, d.Namespace, d.JobID) 264 if err != nil { 265 return nil, err 266 } 267 if job == nil { 268 return nil, fmt.Errorf("deployment %q references unknown job %q", d.ID, d.JobID) 269 } 270 271 watcher := newDeploymentWatcher(w.ctx, w.queryLimiter, w.logger, w.state, d, job, 272 w, w.deploymentRPC, w.jobRPC) 273 w.watchers[d.ID] = watcher 274 return watcher, nil 275 } 276 277 // remove stops watching a deployment. This can be because the deployment is 278 // complete or being deleted. 279 func (w *Watcher) remove(d *structs.Deployment) { 280 w.l.Lock() 281 defer w.l.Unlock() 282 283 // Not enabled so no-op 284 if !w.enabled { 285 return 286 } 287 288 if watcher, ok := w.watchers[d.ID]; ok { 289 watcher.StopWatch() 290 delete(w.watchers, d.ID) 291 } 292 } 293 294 // forceAdd is used to force a lookup of the given deployment object and create 295 // a watcher. If the deployment does not exist or is terminal an error is 296 // returned. 297 func (w *Watcher) forceAdd(dID string) (*deploymentWatcher, error) { 298 snap, err := w.state.Snapshot() 299 if err != nil { 300 return nil, err 301 } 302 303 deployment, err := snap.DeploymentByID(nil, dID) 304 if err != nil { 305 return nil, err 306 } 307 308 if deployment == nil { 309 return nil, fmt.Errorf("unknown deployment %q", dID) 310 } 311 312 return w.addLocked(deployment) 313 } 314 315 // getOrCreateWatcher returns the deployment watcher for the given deployment ID. 316 func (w *Watcher) getOrCreateWatcher(dID string) (*deploymentWatcher, error) { 317 w.l.Lock() 318 defer w.l.Unlock() 319 320 // Not enabled so no-op 321 if !w.enabled { 322 return nil, notEnabled 323 } 324 325 watcher, ok := w.watchers[dID] 326 if ok { 327 return watcher, nil 328 } 329 330 return w.forceAdd(dID) 331 } 332 333 // SetAllocHealth is used to set the health of allocations for a deployment. If 334 // there are any unhealthy allocations, the deployment is updated to be failed. 335 // Otherwise the allocations are updated and an evaluation is created. 336 func (w *Watcher) SetAllocHealth(req *structs.DeploymentAllocHealthRequest, resp *structs.DeploymentUpdateResponse) error { 337 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 338 if err != nil { 339 return err 340 } 341 342 return watcher.SetAllocHealth(req, resp) 343 } 344 345 // PromoteDeployment is used to promote a deployment. If promote is false, 346 // deployment is marked as failed. Otherwise the deployment is updated and an 347 // evaluation is created. 348 func (w *Watcher) PromoteDeployment(req *structs.DeploymentPromoteRequest, resp *structs.DeploymentUpdateResponse) error { 349 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 350 if err != nil { 351 return err 352 } 353 354 return watcher.PromoteDeployment(req, resp) 355 } 356 357 // PauseDeployment is used to toggle the pause state on a deployment. If the 358 // deployment is being unpaused, an evaluation is created. 359 func (w *Watcher) PauseDeployment(req *structs.DeploymentPauseRequest, resp *structs.DeploymentUpdateResponse) error { 360 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 361 if err != nil { 362 return err 363 } 364 365 return watcher.PauseDeployment(req, resp) 366 } 367 368 // FailDeployment is used to fail the deployment. 369 func (w *Watcher) FailDeployment(req *structs.DeploymentFailRequest, resp *structs.DeploymentUpdateResponse) error { 370 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 371 if err != nil { 372 return err 373 } 374 375 return watcher.FailDeployment(req, resp) 376 } 377 378 // RunDeployment is used to run a pending multiregion deployment. In 379 // single-region deployments, the pending state is unused. 380 func (w *Watcher) RunDeployment(req *structs.DeploymentRunRequest, resp *structs.DeploymentUpdateResponse) error { 381 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 382 if err != nil { 383 return err 384 } 385 386 return watcher.RunDeployment(req, resp) 387 } 388 389 // UnblockDeployment is used to unblock a multiregion deployment. In 390 // single-region deployments, the blocked state is unused. 391 func (w *Watcher) UnblockDeployment(req *structs.DeploymentUnblockRequest, resp *structs.DeploymentUpdateResponse) error { 392 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 393 if err != nil { 394 return err 395 } 396 397 return watcher.UnblockDeployment(req, resp) 398 } 399 400 // CancelDeployment is used to cancel a multiregion deployment. In 401 // single-region deployments, the deploymentwatcher has sole responsibility to 402 // cancel deployments so this RPC is never used. 403 func (w *Watcher) CancelDeployment(req *structs.DeploymentCancelRequest, resp *structs.DeploymentUpdateResponse) error { 404 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 405 if err != nil { 406 return err 407 } 408 409 return watcher.CancelDeployment(req, resp) 410 } 411 412 // createUpdate commits the given allocation desired transition and evaluation 413 // to Raft but batches the commit with other calls. 414 func (w *Watcher) createUpdate(allocs map[string]*structs.DesiredTransition, eval *structs.Evaluation) (uint64, error) { 415 b := w.allocUpdateBatcher 416 if b == nil { 417 return 0, notEnabled 418 } 419 return b.CreateUpdate(allocs, eval).Results() 420 } 421 422 // upsertJob commits the given job to Raft 423 func (w *Watcher) upsertJob(job *structs.Job) (uint64, error) { 424 return w.raft.UpsertJob(job) 425 } 426 427 // upsertDeploymentStatusUpdate commits the given deployment update and optional 428 // evaluation to Raft 429 func (w *Watcher) upsertDeploymentStatusUpdate( 430 u *structs.DeploymentStatusUpdate, 431 e *structs.Evaluation, 432 j *structs.Job) (uint64, error) { 433 return w.raft.UpdateDeploymentStatus(&structs.DeploymentStatusUpdateRequest{ 434 DeploymentUpdate: u, 435 Eval: e, 436 Job: j, 437 }) 438 } 439 440 // upsertDeploymentPromotion commits the given deployment promotion to Raft 441 func (w *Watcher) upsertDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) { 442 return w.raft.UpdateDeploymentPromotion(req) 443 } 444 445 // upsertDeploymentAllocHealth commits the given allocation health changes to 446 // Raft 447 func (w *Watcher) upsertDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) { 448 return w.raft.UpdateDeploymentAllocHealth(req) 449 }