github.com/emate/nomad@v0.8.2-wo-binpacking/nomad/deploymentwatcher/deployment_watcher.go (about) 1 package deploymentwatcher 2 3 import ( 4 "context" 5 "log" 6 "sync" 7 "time" 8 9 "golang.org/x/time/rate" 10 11 memdb "github.com/hashicorp/go-memdb" 12 "github.com/hashicorp/nomad/helper" 13 "github.com/hashicorp/nomad/helper/uuid" 14 "github.com/hashicorp/nomad/nomad/state" 15 "github.com/hashicorp/nomad/nomad/structs" 16 ) 17 18 const ( 19 // perJobEvalBatchPeriod is the batching length before creating an evaluation to 20 // trigger the scheduler when allocations are marked as healthy. 21 perJobEvalBatchPeriod = 1 * time.Second 22 ) 23 24 // deploymentTriggers are the set of functions required to trigger changes on 25 // behalf of a deployment 26 type deploymentTriggers interface { 27 // createEvaluation is used to create an evaluation. 28 createEvaluation(eval *structs.Evaluation) (uint64, error) 29 30 // upsertJob is used to roll back a job when autoreverting for a deployment 31 upsertJob(job *structs.Job) (uint64, error) 32 33 // upsertDeploymentStatusUpdate is used to upsert a deployment status update 34 // and an optional evaluation and job to upsert 35 upsertDeploymentStatusUpdate(u *structs.DeploymentStatusUpdate, eval *structs.Evaluation, job *structs.Job) (uint64, error) 36 37 // upsertDeploymentPromotion is used to promote canaries in a deployment 38 upsertDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) 39 40 // upsertDeploymentAllocHealth is used to set the health of allocations in a 41 // deployment 42 upsertDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) 43 } 44 45 // deploymentWatcher is used to watch a single deployment and trigger the 46 // scheduler when allocation health transitions. 47 type deploymentWatcher struct { 48 // queryLimiter is used to limit the rate of blocking queries 49 queryLimiter *rate.Limiter 50 51 // deploymentTriggers holds the methods required to trigger changes on behalf of the 52 // deployment 53 deploymentTriggers 54 55 // state is the state that is watched for state changes. 56 state *state.StateStore 57 58 // d is the deployment being watched 59 d *structs.Deployment 60 61 // j is the job the deployment is for 62 j *structs.Job 63 64 // outstandingBatch marks whether an outstanding function exists to create 65 // the evaluation. Access should be done through the lock 66 outstandingBatch bool 67 68 // latestEval is the latest eval for the job. It is updated by the watch 69 // loop and any time an evaluation is created. The field should be accessed 70 // by holding the lock or using the setter and getter methods. 71 latestEval uint64 72 73 logger *log.Logger 74 ctx context.Context 75 exitFn context.CancelFunc 76 l sync.RWMutex 77 } 78 79 // newDeploymentWatcher returns a deployment watcher that is used to watch 80 // deployments and trigger the scheduler as needed. 81 func newDeploymentWatcher(parent context.Context, queryLimiter *rate.Limiter, 82 logger *log.Logger, state *state.StateStore, d *structs.Deployment, 83 j *structs.Job, triggers deploymentTriggers) *deploymentWatcher { 84 85 ctx, exitFn := context.WithCancel(parent) 86 w := &deploymentWatcher{ 87 queryLimiter: queryLimiter, 88 d: d, 89 j: j, 90 state: state, 91 deploymentTriggers: triggers, 92 logger: logger, 93 ctx: ctx, 94 exitFn: exitFn, 95 } 96 97 // Start the long lived watcher that scans for allocation updates 98 go w.watch() 99 100 return w 101 } 102 103 func (w *deploymentWatcher) SetAllocHealth( 104 req *structs.DeploymentAllocHealthRequest, 105 resp *structs.DeploymentUpdateResponse) error { 106 107 // If we are failing the deployment, update the status and potentially 108 // rollback 109 var j *structs.Job 110 var u *structs.DeploymentStatusUpdate 111 112 // If there are unhealthy allocations we need to mark the deployment as 113 // failed and check if we should roll back to a stable job. 114 if l := len(req.UnhealthyAllocationIDs); l != 0 { 115 unhealthy := make(map[string]struct{}, l) 116 for _, alloc := range req.UnhealthyAllocationIDs { 117 unhealthy[alloc] = struct{}{} 118 } 119 120 // Get the allocations for the deployment 121 snap, err := w.state.Snapshot() 122 if err != nil { 123 return err 124 } 125 126 allocs, err := snap.AllocsByDeployment(nil, req.DeploymentID) 127 if err != nil { 128 return err 129 } 130 131 // Determine if we should autorevert to an older job 132 desc := structs.DeploymentStatusDescriptionFailedAllocations 133 for _, alloc := range allocs { 134 // Check that the alloc has been marked unhealthy 135 if _, ok := unhealthy[alloc.ID]; !ok { 136 continue 137 } 138 139 // Check if the group has autorevert set 140 group, ok := w.d.TaskGroups[alloc.TaskGroup] 141 if !ok || !group.AutoRevert { 142 continue 143 } 144 145 var err error 146 j, err = w.latestStableJob() 147 if err != nil { 148 return err 149 } 150 151 if j != nil { 152 j, desc = w.handleRollbackValidity(j, desc) 153 } 154 break 155 } 156 157 u = w.getDeploymentStatusUpdate(structs.DeploymentStatusFailed, desc) 158 } 159 160 // Canonicalize the job in case it doesn't have namespace set 161 j.Canonicalize() 162 163 // Create the request 164 areq := &structs.ApplyDeploymentAllocHealthRequest{ 165 DeploymentAllocHealthRequest: *req, 166 Eval: w.getEval(), 167 DeploymentUpdate: u, 168 Job: j, 169 } 170 171 index, err := w.upsertDeploymentAllocHealth(areq) 172 if err != nil { 173 return err 174 } 175 176 // Build the response 177 resp.EvalID = areq.Eval.ID 178 resp.EvalCreateIndex = index 179 resp.DeploymentModifyIndex = index 180 resp.Index = index 181 if j != nil { 182 resp.RevertedJobVersion = helper.Uint64ToPtr(j.Version) 183 } 184 w.setLatestEval(index) 185 return nil 186 } 187 188 // handleRollbackValidity checks if the job being rolled back to has the same spec as the existing job 189 // Returns a modified description and job accordingly. 190 func (w *deploymentWatcher) handleRollbackValidity(rollbackJob *structs.Job, desc string) (*structs.Job, string) { 191 // Only rollback if job being changed has a different spec. 192 // This prevents an infinite revert cycle when a previously stable version of the job fails to start up during a rollback 193 // If the job we are trying to rollback to is identical to the current job, we stop because the rollback will not succeed. 194 if w.j.SpecChanged(rollbackJob) { 195 desc = structs.DeploymentStatusDescriptionRollback(desc, rollbackJob.Version) 196 } else { 197 desc = structs.DeploymentStatusDescriptionRollbackNoop(desc, rollbackJob.Version) 198 rollbackJob = nil 199 } 200 return rollbackJob, desc 201 } 202 203 func (w *deploymentWatcher) PromoteDeployment( 204 req *structs.DeploymentPromoteRequest, 205 resp *structs.DeploymentUpdateResponse) error { 206 207 // Create the request 208 areq := &structs.ApplyDeploymentPromoteRequest{ 209 DeploymentPromoteRequest: *req, 210 Eval: w.getEval(), 211 } 212 213 index, err := w.upsertDeploymentPromotion(areq) 214 if err != nil { 215 return err 216 } 217 218 // Build the response 219 resp.EvalID = areq.Eval.ID 220 resp.EvalCreateIndex = index 221 resp.DeploymentModifyIndex = index 222 resp.Index = index 223 w.setLatestEval(index) 224 return nil 225 } 226 227 func (w *deploymentWatcher) PauseDeployment( 228 req *structs.DeploymentPauseRequest, 229 resp *structs.DeploymentUpdateResponse) error { 230 // Determine the status we should transition to and if we need to create an 231 // evaluation 232 status, desc := structs.DeploymentStatusPaused, structs.DeploymentStatusDescriptionPaused 233 var eval *structs.Evaluation 234 evalID := "" 235 if !req.Pause { 236 status, desc = structs.DeploymentStatusRunning, structs.DeploymentStatusDescriptionRunning 237 eval = w.getEval() 238 evalID = eval.ID 239 } 240 update := w.getDeploymentStatusUpdate(status, desc) 241 242 // Commit the change 243 i, err := w.upsertDeploymentStatusUpdate(update, eval, nil) 244 if err != nil { 245 return err 246 } 247 248 // Build the response 249 if evalID != "" { 250 resp.EvalID = evalID 251 resp.EvalCreateIndex = i 252 } 253 resp.DeploymentModifyIndex = i 254 resp.Index = i 255 w.setLatestEval(i) 256 return nil 257 } 258 259 func (w *deploymentWatcher) FailDeployment( 260 req *structs.DeploymentFailRequest, 261 resp *structs.DeploymentUpdateResponse) error { 262 263 status, desc := structs.DeploymentStatusFailed, structs.DeploymentStatusDescriptionFailedByUser 264 265 // Determine if we should rollback 266 rollback := false 267 for _, state := range w.d.TaskGroups { 268 if state.AutoRevert { 269 rollback = true 270 break 271 } 272 } 273 274 var rollbackJob *structs.Job 275 if rollback { 276 var err error 277 rollbackJob, err = w.latestStableJob() 278 if err != nil { 279 return err 280 } 281 282 if rollbackJob != nil { 283 rollbackJob, desc = w.handleRollbackValidity(rollbackJob, desc) 284 } else { 285 desc = structs.DeploymentStatusDescriptionNoRollbackTarget(desc) 286 } 287 } 288 289 // Commit the change 290 update := w.getDeploymentStatusUpdate(status, desc) 291 eval := w.getEval() 292 i, err := w.upsertDeploymentStatusUpdate(update, eval, rollbackJob) 293 if err != nil { 294 return err 295 } 296 297 // Build the response 298 resp.EvalID = eval.ID 299 resp.EvalCreateIndex = i 300 resp.DeploymentModifyIndex = i 301 resp.Index = i 302 if rollbackJob != nil { 303 resp.RevertedJobVersion = helper.Uint64ToPtr(rollbackJob.Version) 304 } 305 w.setLatestEval(i) 306 return nil 307 } 308 309 // StopWatch stops watching the deployment. This should be called whenever a 310 // deployment is completed or the watcher is no longer needed. 311 func (w *deploymentWatcher) StopWatch() { 312 w.exitFn() 313 } 314 315 // watch is the long running watcher that takes actions upon allocation changes 316 func (w *deploymentWatcher) watch() { 317 allocIndex := uint64(1) 318 for { 319 // Block getting all allocations that are part of the deployment using 320 // the last evaluation index. This will have us block waiting for 321 // something to change past what the scheduler has evaluated. 322 allocs, index, err := w.getAllocs(allocIndex) 323 if err != nil { 324 if err == context.Canceled || w.ctx.Err() == context.Canceled { 325 return 326 } 327 328 w.logger.Printf("[ERR] nomad.deployment_watcher: failed to retrieve allocations for deployment %q: %v", w.d.ID, err) 329 return 330 } 331 allocIndex = index 332 333 // Get the latest evaluation index 334 latestEval, err := w.latestEvalIndex() 335 if err != nil { 336 if err == context.Canceled || w.ctx.Err() == context.Canceled { 337 return 338 } 339 340 w.logger.Printf("[ERR] nomad.deployment_watcher: failed to determine last evaluation index for job %q: %v", w.d.JobID, err) 341 return 342 } 343 344 // Create an evaluation trigger if there is any allocation whose 345 // deployment status has been updated past the latest eval index. 346 createEval, failDeployment, rollback := false, false, false 347 for _, alloc := range allocs { 348 if alloc.DeploymentStatus == nil || alloc.DeploymentStatus.ModifyIndex <= latestEval { 349 continue 350 } 351 352 // We need to create an eval 353 createEval = true 354 355 if alloc.DeploymentStatus.IsUnhealthy() { 356 // Check if the group has autorevert set 357 group, ok := w.d.TaskGroups[alloc.TaskGroup] 358 if ok && group.AutoRevert { 359 rollback = true 360 } 361 362 // Since we have an unhealthy allocation, fail the deployment 363 failDeployment = true 364 } 365 366 // All conditions have been hit so we can break 367 if createEval && failDeployment && rollback { 368 break 369 } 370 } 371 372 // Change the deployments status to failed 373 if failDeployment { 374 // Default description 375 desc := structs.DeploymentStatusDescriptionFailedAllocations 376 377 // Rollback to the old job if necessary 378 var j *structs.Job 379 if rollback { 380 var err error 381 j, err = w.latestStableJob() 382 if err != nil { 383 w.logger.Printf("[ERR] nomad.deployment_watcher: failed to lookup latest stable job for %q: %v", w.d.JobID, err) 384 } 385 386 // Description should include that the job is being rolled back to 387 // version N 388 if j != nil { 389 j, desc = w.handleRollbackValidity(j, desc) 390 } else { 391 desc = structs.DeploymentStatusDescriptionNoRollbackTarget(desc) 392 } 393 } 394 395 // Update the status of the deployment to failed and create an 396 // evaluation. 397 e := w.getEval() 398 u := w.getDeploymentStatusUpdate(structs.DeploymentStatusFailed, desc) 399 if index, err := w.upsertDeploymentStatusUpdate(u, e, j); err != nil { 400 w.logger.Printf("[ERR] nomad.deployment_watcher: failed to update deployment %q status: %v", w.d.ID, err) 401 } else { 402 w.setLatestEval(index) 403 } 404 } else if createEval { 405 // Create an eval to push the deployment along 406 w.createEvalBatched(index) 407 } 408 } 409 } 410 411 // latestStableJob returns the latest stable job. It may be nil if none exist 412 func (w *deploymentWatcher) latestStableJob() (*structs.Job, error) { 413 snap, err := w.state.Snapshot() 414 if err != nil { 415 return nil, err 416 } 417 418 versions, err := snap.JobVersionsByID(nil, w.d.Namespace, w.d.JobID) 419 if err != nil { 420 return nil, err 421 } 422 423 var stable *structs.Job 424 for _, job := range versions { 425 if job.Stable { 426 stable = job 427 break 428 } 429 } 430 431 return stable, nil 432 } 433 434 // createEvalBatched creates an eval but batches calls together 435 func (w *deploymentWatcher) createEvalBatched(forIndex uint64) { 436 w.l.Lock() 437 defer w.l.Unlock() 438 439 if w.outstandingBatch || forIndex < w.latestEval { 440 return 441 } 442 443 w.outstandingBatch = true 444 445 time.AfterFunc(perJobEvalBatchPeriod, func() { 446 // If the timer has been created and then we shutdown, we need to no-op 447 // the evaluation creation. 448 select { 449 case <-w.ctx.Done(): 450 return 451 default: 452 } 453 454 // Create the eval 455 evalCreateIndex, err := w.createEvaluation(w.getEval()) 456 if err != nil { 457 w.logger.Printf("[ERR] nomad.deployment_watcher: failed to create evaluation for deployment %q: %v", w.d.ID, err) 458 } else { 459 w.setLatestEval(evalCreateIndex) 460 } 461 462 w.l.Lock() 463 w.outstandingBatch = false 464 w.l.Unlock() 465 466 }) 467 } 468 469 // getEval returns an evaluation suitable for the deployment 470 func (w *deploymentWatcher) getEval() *structs.Evaluation { 471 return &structs.Evaluation{ 472 ID: uuid.Generate(), 473 Namespace: w.j.Namespace, 474 Priority: w.j.Priority, 475 Type: w.j.Type, 476 TriggeredBy: structs.EvalTriggerDeploymentWatcher, 477 JobID: w.j.ID, 478 DeploymentID: w.d.ID, 479 Status: structs.EvalStatusPending, 480 } 481 } 482 483 // getDeploymentStatusUpdate returns a deployment status update 484 func (w *deploymentWatcher) getDeploymentStatusUpdate(status, desc string) *structs.DeploymentStatusUpdate { 485 return &structs.DeploymentStatusUpdate{ 486 DeploymentID: w.d.ID, 487 Status: status, 488 StatusDescription: desc, 489 } 490 } 491 492 // getAllocs retrieves the allocations that are part of the deployment blocking 493 // at the given index. 494 func (w *deploymentWatcher) getAllocs(index uint64) ([]*structs.AllocListStub, uint64, error) { 495 resp, index, err := w.state.BlockingQuery(w.getAllocsImpl, index, w.ctx) 496 if err != nil { 497 return nil, 0, err 498 } 499 if err := w.ctx.Err(); err != nil { 500 return nil, 0, err 501 } 502 503 return resp.([]*structs.AllocListStub), index, nil 504 } 505 506 // getDeploysImpl retrieves all deployments from the passed state store. 507 func (w *deploymentWatcher) getAllocsImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { 508 if err := w.queryLimiter.Wait(w.ctx); err != nil { 509 return nil, 0, err 510 } 511 512 // Capture all the allocations 513 allocs, err := state.AllocsByDeployment(ws, w.d.ID) 514 if err != nil { 515 return nil, 0, err 516 } 517 518 stubs := make([]*structs.AllocListStub, 0, len(allocs)) 519 for _, alloc := range allocs { 520 stubs = append(stubs, alloc.Stub()) 521 } 522 523 // Use the last index that affected the jobs table 524 index, err := state.Index("allocs") 525 if err != nil { 526 return nil, index, err 527 } 528 529 return stubs, index, nil 530 } 531 532 // latestEvalIndex returns the index of the last evaluation created for 533 // the job. The index is used to determine if an allocation update requires an 534 // evaluation to be triggered. 535 func (w *deploymentWatcher) latestEvalIndex() (uint64, error) { 536 if err := w.queryLimiter.Wait(w.ctx); err != nil { 537 return 0, err 538 } 539 540 snap, err := w.state.Snapshot() 541 if err != nil { 542 return 0, err 543 } 544 545 evals, err := snap.EvalsByJob(nil, w.d.Namespace, w.d.JobID) 546 if err != nil { 547 return 0, err 548 } 549 550 if len(evals) == 0 { 551 idx, err := snap.Index("evals") 552 if err != nil { 553 w.setLatestEval(idx) 554 } 555 return idx, err 556 } 557 558 // Prefer using the snapshot index. Otherwise use the create index 559 e := evals[0] 560 if e.SnapshotIndex != 0 { 561 w.setLatestEval(e.SnapshotIndex) 562 return e.SnapshotIndex, nil 563 } 564 565 w.setLatestEval(e.CreateIndex) 566 return e.CreateIndex, nil 567 } 568 569 // setLatestEval sets the given index as the latest eval unless the currently 570 // stored index is higher. 571 func (w *deploymentWatcher) setLatestEval(index uint64) { 572 w.l.Lock() 573 defer w.l.Unlock() 574 if index > w.latestEval { 575 w.latestEval = index 576 } 577 } 578 579 // getLatestEval returns the latest eval index. 580 func (w *deploymentWatcher) getLatestEval() uint64 { 581 w.l.Lock() 582 defer w.l.Unlock() 583 return w.latestEval 584 }