github.com/smithx10/nomad@v0.9.1-rc1/nomad/deploymentwatcher/deployment_watcher.go

package deploymentwatcher

import (
	"context"
	"fmt"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"golang.org/x/time/rate"
)

const (
	// perJobEvalBatchPeriod is the batching length before creating an evaluation to
	// trigger the scheduler when allocations are marked as healthy.
	perJobEvalBatchPeriod = 1 * time.Second
)

var (
	// allowRescheduleTransition is the transition that allows failed
	// allocations part of a deployment to be rescheduled. We create a one off
	// variable to avoid creating a new object for every request.
	allowRescheduleTransition = &structs.DesiredTransition{
		Reschedule: helper.BoolToPtr(true),
	}
)

// deploymentTriggers are the set of functions required to trigger changes on
// behalf of a deployment
type deploymentTriggers interface {
	// createUpdate is used to create allocation desired transition updates and
	// an evaluation.
	createUpdate(allocs map[string]*structs.DesiredTransition, eval *structs.Evaluation) (uint64, error)

	// upsertJob is used to roll back a job when autoreverting for a deployment
	upsertJob(job *structs.Job) (uint64, error)

	// upsertDeploymentStatusUpdate is used to upsert a deployment status update
	// and an optional evaluation and job to upsert
	upsertDeploymentStatusUpdate(u *structs.DeploymentStatusUpdate, eval *structs.Evaluation, job *structs.Job) (uint64, error)

	// upsertDeploymentPromotion is used to promote canaries in a deployment
	upsertDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error)

	// upsertDeploymentAllocHealth is used to set the health of allocations in a
	// deployment
	upsertDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error)
}

// deploymentWatcher is used to watch a single deployment and trigger the
// scheduler when allocation health transitions.
type deploymentWatcher struct {
	// queryLimiter is used to limit the rate of blocking queries
	queryLimiter *rate.Limiter

	// deploymentTriggers holds the methods required to trigger changes on behalf of the
	// deployment
	deploymentTriggers

	// state is the state that is watched for state changes.
	state *state.StateStore

	// deploymentID is the deployment's ID being watched
	deploymentID string

	// deploymentUpdateCh is triggered when there is an updated deployment
	deploymentUpdateCh chan struct{}

	// d is the deployment being watched
	d *structs.Deployment

	// j is the job the deployment is for
	j *structs.Job

	// outstandingBatch marks whether an outstanding function exists to create
	// the evaluation. Access should be done through the lock.
	outstandingBatch bool

	// outstandingAllowReplacements is the map of allocations that will be
	// marked as allowing a replacement. Access should be done through the lock.
	outstandingAllowReplacements map[string]*structs.DesiredTransition

	// latestEval is the latest eval for the job. It is updated by the watch
	// loop and any time an evaluation is created. The field should be accessed
	// by holding the lock or using the setter and getter methods.
	latestEval uint64

	logger log.Logger
	ctx    context.Context
	exitFn context.CancelFunc
	l      sync.RWMutex
}

// newDeploymentWatcher returns a deployment watcher that is used to watch
// deployments and trigger the scheduler as needed.
func newDeploymentWatcher(parent context.Context, queryLimiter *rate.Limiter,
	logger log.Logger, state *state.StateStore, d *structs.Deployment,
	j *structs.Job, triggers deploymentTriggers) *deploymentWatcher {

	ctx, exitFn := context.WithCancel(parent)
	w := &deploymentWatcher{
		queryLimiter:       queryLimiter,
		deploymentID:       d.ID,
		deploymentUpdateCh: make(chan struct{}, 1),
		d:                  d,
		j:                  j,
		state:              state,
		deploymentTriggers: triggers,
		logger:             logger.With("deployment_id", d.ID, "job", j.NamespacedID()),
		ctx:                ctx,
		exitFn:             exitFn,
	}

	// Start the long lived watcher that scans for allocation updates
	go w.watch()

	return w
}

// updateDeployment is used to update the tracked deployment.
func (w *deploymentWatcher) updateDeployment(d *structs.Deployment) {
	w.l.Lock()
	defer w.l.Unlock()

	// Update and trigger
	w.d = d
	select {
	case w.deploymentUpdateCh <- struct{}{}:
	default:
	}
}

// getDeployment returns the tracked deployment.
func (w *deploymentWatcher) getDeployment() *structs.Deployment {
	w.l.RLock()
	defer w.l.RUnlock()
	return w.d
}

func (w *deploymentWatcher) SetAllocHealth(
	req *structs.DeploymentAllocHealthRequest,
	resp *structs.DeploymentUpdateResponse) error {

	// If we are failing the deployment, update the status and potentially
	// rollback
	var j *structs.Job
	var u *structs.DeploymentStatusUpdate

	// If there are unhealthy allocations we need to mark the deployment as
	// failed and check if we should roll back to a stable job.
	if l := len(req.UnhealthyAllocationIDs); l != 0 {
		unhealthy := make(map[string]struct{}, l)
		for _, alloc := range req.UnhealthyAllocationIDs {
			unhealthy[alloc] = struct{}{}
		}

		// Get the allocations for the deployment
		snap, err := w.state.Snapshot()
		if err != nil {
			return err
		}

		allocs, err := snap.AllocsByDeployment(nil, req.DeploymentID)
		if err != nil {
			return err
		}

		// Determine if we should autorevert to an older job
		desc := structs.DeploymentStatusDescriptionFailedAllocations
		for _, alloc := range allocs {
			// Check that the alloc has been marked unhealthy
			if _, ok := unhealthy[alloc.ID]; !ok {
				continue
			}

			// Check if the group has autorevert set
			group, ok := w.getDeployment().TaskGroups[alloc.TaskGroup]
			if !ok || !group.AutoRevert {
				continue
			}

			var err error
			j, err = w.latestStableJob()
			if err != nil {
				return err
			}

			if j != nil {
				j, desc = w.handleRollbackValidity(j, desc)
			}
			break
		}

		u = w.getDeploymentStatusUpdate(structs.DeploymentStatusFailed, desc)
	}

	// Canonicalize the job in case it doesn't have namespace set
	j.Canonicalize()

	// Create the request
	areq := &structs.ApplyDeploymentAllocHealthRequest{
		DeploymentAllocHealthRequest: *req,
		Timestamp:                    time.Now(),
		Eval:                         w.getEval(),
		DeploymentUpdate:             u,
		Job:                          j,
	}

	index, err := w.upsertDeploymentAllocHealth(areq)
	if err != nil {
		return err
	}

	// Build the response
	resp.EvalID = areq.Eval.ID
	resp.EvalCreateIndex = index
	resp.DeploymentModifyIndex = index
	resp.Index = index
	if j != nil {
		resp.RevertedJobVersion = helper.Uint64ToPtr(j.Version)
	}
	return nil
}

// handleRollbackValidity checks if the job being rolled back to has the same spec as the existing job.
// Returns a modified description and job accordingly.
func (w *deploymentWatcher) handleRollbackValidity(rollbackJob *structs.Job, desc string) (*structs.Job, string) {
	// Only rollback if job being changed has a different spec.
	// This prevents an infinite revert cycle when a previously stable version of the job fails to start up during a rollback.
	// If the job we are trying to rollback to is identical to the current job, we stop because the rollback will not succeed.
	if w.j.SpecChanged(rollbackJob) {
		desc = structs.DeploymentStatusDescriptionRollback(desc, rollbackJob.Version)
	} else {
		desc = structs.DeploymentStatusDescriptionRollbackNoop(desc, rollbackJob.Version)
		rollbackJob = nil
	}
	return rollbackJob, desc
}

func (w *deploymentWatcher) PromoteDeployment(
	req *structs.DeploymentPromoteRequest,
	resp *structs.DeploymentUpdateResponse) error {

	// Create the request
	areq := &structs.ApplyDeploymentPromoteRequest{
		DeploymentPromoteRequest: *req,
		Eval:                     w.getEval(),
	}

	index, err := w.upsertDeploymentPromotion(areq)
	if err != nil {
		return err
	}

	// Build the response
	resp.EvalID = areq.Eval.ID
	resp.EvalCreateIndex = index
	resp.DeploymentModifyIndex = index
	resp.Index = index
	return nil
}

func (w *deploymentWatcher) PauseDeployment(
	req *structs.DeploymentPauseRequest,
	resp *structs.DeploymentUpdateResponse) error {
	// Determine the status we should transition to and if we need to create an
	// evaluation
	status, desc := structs.DeploymentStatusPaused, structs.DeploymentStatusDescriptionPaused
	var eval *structs.Evaluation
	evalID := ""
	if !req.Pause {
		status, desc = structs.DeploymentStatusRunning, structs.DeploymentStatusDescriptionRunning
		eval = w.getEval()
		evalID = eval.ID
	}
	update := w.getDeploymentStatusUpdate(status, desc)

	// Commit the change
	i, err := w.upsertDeploymentStatusUpdate(update, eval, nil)
	if err != nil {
		return err
	}

	// Build the response
	if evalID != "" {
		resp.EvalID = evalID
		resp.EvalCreateIndex = i
	}
	resp.DeploymentModifyIndex = i
	resp.Index = i
	return nil
}

func (w *deploymentWatcher) FailDeployment(
	req *structs.DeploymentFailRequest,
	resp *structs.DeploymentUpdateResponse) error {

	status, desc := structs.DeploymentStatusFailed, structs.DeploymentStatusDescriptionFailedByUser

	// Determine if we should rollback
	rollback := false
	for _, state := range w.getDeployment().TaskGroups {
		if state.AutoRevert {
			rollback = true
			break
		}
	}

	var rollbackJob *structs.Job
	if rollback {
		var err error
		rollbackJob, err = w.latestStableJob()
		if err != nil {
			return err
		}

		if rollbackJob != nil {
			rollbackJob, desc = w.handleRollbackValidity(rollbackJob, desc)
		} else {
			desc = structs.DeploymentStatusDescriptionNoRollbackTarget(desc)
		}
	}

	// Commit the change
	update := w.getDeploymentStatusUpdate(status, desc)
	eval := w.getEval()
	i, err := w.upsertDeploymentStatusUpdate(update, eval, rollbackJob)
	if err != nil {
		return err
	}

	// Build the response
	resp.EvalID = eval.ID
	resp.EvalCreateIndex = i
	resp.DeploymentModifyIndex = i
	resp.Index = i
	if rollbackJob != nil {
		resp.RevertedJobVersion = helper.Uint64ToPtr(rollbackJob.Version)
	}
	return nil
}

// StopWatch stops watching the deployment. This should be called whenever a
// deployment is completed or the watcher is no longer needed.
func (w *deploymentWatcher) StopWatch() {
	w.exitFn()
}

// watch is the long running watcher that watches for both allocation and
// deployment changes.
// Its function is to create evaluations to trigger the scheduler when more
// progress can be made, to fail the deployment if it has failed, and to
// potentially roll back the job. Progress can be made when an allocation
// transitions to healthy, so we create an eval.
func (w *deploymentWatcher) watch() {
	// Get the deadline. This is likely a zero time to begin with but we need to
	// handle the case that the deployment has already progressed and we are now
	// just starting to watch it. This most likely would occur if there was a
	// leader transition and we are now starting our watcher.
	currentDeadline := w.getDeploymentProgressCutoff(w.getDeployment())
	var deadlineTimer *time.Timer
	if currentDeadline.IsZero() {
		deadlineTimer = time.NewTimer(0)
		if !deadlineTimer.Stop() {
			<-deadlineTimer.C
		}
	} else {
		deadlineTimer = time.NewTimer(currentDeadline.Sub(time.Now()))
	}

	allocIndex := uint64(1)
	var updates *allocUpdates

	rollback, deadlineHit := false, false

FAIL:
	for {
		select {
		case <-w.ctx.Done():
			return
		case <-deadlineTimer.C:
			// We have hit the progress deadline so fail the deployment. We need
			// to determine whether we should roll back the job by inspecting
			// which allocs as part of the deployment are healthy and which
			// aren't.
			deadlineHit = true
			fail, rback, err := w.shouldFail()
			if err != nil {
				w.logger.Error("failed to determine whether to rollback job", "error", err)
			}
			if !fail {
				w.logger.Debug("skipping deadline")
				continue
			}

			w.logger.Debug("deadline hit", "rollback", rback)
			rollback = rback
			break FAIL
		case <-w.deploymentUpdateCh:
			// Get the updated deployment and check if we should change the
			// deadline timer
			next := w.getDeploymentProgressCutoff(w.getDeployment())
			if !next.Equal(currentDeadline) {
				prevDeadlineZero := currentDeadline.IsZero()
				currentDeadline = next
				// The most recent deadline can be zero if no allocs were created for this deployment.
				// The deadline timer would have already been stopped once in that case. To prevent
				// deadlocking on the already stopped deadline timer, we only drain the channel if
				// the previous deadline was not zero.
				if !prevDeadlineZero && !deadlineTimer.Stop() {
					select {
					case <-deadlineTimer.C:
					default:
					}
				}

				// If the next deadline is zero, we should not reset the timer
				// as we aren't tracking towards a progress deadline yet. This
				// can happen if you have multiple task groups with progress
				// deadlines and one of the task groups hasn't made any
				// placements. As soon as the other task group finishes its
				// rollout, the next progress deadline becomes zero, so we want
				// to avoid resetting, causing a deployment failure.
				if !next.IsZero() {
					deadlineTimer.Reset(next.Sub(time.Now()))
				}
			}

		case updates = <-w.getAllocsCh(allocIndex):
			if err := updates.err; err != nil {
				if err == context.Canceled || w.ctx.Err() == context.Canceled {
					return
				}

				w.logger.Error("failed to retrieve allocations", "error", err)
				return
			}
			allocIndex = updates.index

			// We have allocation changes for this deployment so determine the
			// steps to take.
			res, err := w.handleAllocUpdate(updates.allocs)
			if err != nil {
				if err == context.Canceled || w.ctx.Err() == context.Canceled {
					return
				}

				w.logger.Error("failed handling allocation updates", "error", err)
				return
			}

			// The deployment has failed, so break out of the watch loop and
			// handle the failure
			if res.failDeployment {
				rollback = res.rollback
				break FAIL
			}

			// Create an eval to push the deployment along
			if res.createEval || len(res.allowReplacements) != 0 {
				w.createBatchedUpdate(res.allowReplacements, allocIndex)
			}
		}
	}

	// Change the deployment's status to failed
	desc := structs.DeploymentStatusDescriptionFailedAllocations
	if deadlineHit {
		desc = structs.DeploymentStatusDescriptionProgressDeadline
	}

	// Rollback to the old job if necessary
	var j *structs.Job
	if rollback {
		var err error
		j, err = w.latestStableJob()
		if err != nil {
			w.logger.Error("failed to lookup latest stable job", "error", err)
		}

		// Description should include that the job is being rolled back to
		// version N
		if j != nil {
			j, desc = w.handleRollbackValidity(j, desc)
		} else {
			desc = structs.DeploymentStatusDescriptionNoRollbackTarget(desc)
		}
	}

	// Update the status of the deployment to failed and create an evaluation.
	e := w.getEval()
	u := w.getDeploymentStatusUpdate(structs.DeploymentStatusFailed, desc)
	if _, err := w.upsertDeploymentStatusUpdate(u, e, j); err != nil {
		w.logger.Error("failed to update deployment status", "error", err)
	}
}

// allocUpdateResult is used to return the desired actions given the newest set
// of allocations for the deployment.
type allocUpdateResult struct {
	createEval        bool
	failDeployment    bool
	rollback          bool
	allowReplacements []string
}

// handleAllocUpdate is used to compute the set of actions to take based on the
// updated allocations for the deployment.
func (w *deploymentWatcher) handleAllocUpdate(allocs []*structs.AllocListStub) (allocUpdateResult, error) {
	var res allocUpdateResult

	// Get the latest evaluation index
	latestEval, err := w.jobEvalStatus()
	if err != nil {
		if err == context.Canceled || w.ctx.Err() == context.Canceled {
			return res, err
		}

		return res, fmt.Errorf("failed to determine last evaluation index for job %q: %v", w.j.ID, err)
	}

	deployment := w.getDeployment()
	for _, alloc := range allocs {
		dstate, ok := deployment.TaskGroups[alloc.TaskGroup]
		if !ok {
			continue
		}

		// Determine if the update stanza for this group is progress based
		progressBased := dstate.ProgressDeadline != 0

		// Check if the allocation has failed and we need to mark it for allow
		// replacements
		if progressBased && alloc.DeploymentStatus.IsUnhealthy() &&
			deployment.Active() && !alloc.DesiredTransition.ShouldReschedule() {
			res.allowReplacements = append(res.allowReplacements, alloc.ID)
			continue
		}

		// We need to create an eval so the job can progress.
		if alloc.DeploymentStatus.IsHealthy() && alloc.DeploymentStatus.ModifyIndex > latestEval {
			res.createEval = true
		}

		// If the group is using a progress deadline, we don't have to do anything.
		if progressBased {
			continue
		}

		// Fail on the first bad allocation
		if alloc.DeploymentStatus.IsUnhealthy() {
			// Check if the group has autorevert set
			if dstate.AutoRevert {
				res.rollback = true
			}

			// Since we have an unhealthy allocation, fail the deployment
			res.failDeployment = true
		}

		// All conditions have been hit so we can break
		if res.createEval && res.failDeployment && res.rollback {
			break
		}
	}

	return res, nil
}

// shouldFail returns whether the job should be failed and whether it should be
// rolled back to an earlier stable version by examining the allocations in the
// deployment.
func (w *deploymentWatcher) shouldFail() (fail, rollback bool, err error) {
	snap, err := w.state.Snapshot()
	if err != nil {
		return false, false, err
	}

	d, err := snap.DeploymentByID(nil, w.deploymentID)
	if err != nil {
		return false, false, err
	}
	if d == nil {
		// The deployment wasn't in the state store, possibly due to a system gc
		return false, false, fmt.Errorf("deployment id not found: %q", w.deploymentID)
	}

	fail = false
	for tg, state := range d.TaskGroups {
		// If we are in a canary state we fail if there aren't enough healthy
		// allocs to satisfy DesiredCanaries
		if state.DesiredCanaries > 0 && !state.Promoted {
			if state.HealthyAllocs >= state.DesiredCanaries {
				continue
			}
		} else if state.HealthyAllocs >= state.DesiredTotal {
			continue
		}

		// We have failed this TG
		fail = true

		// We don't need to autorevert this group
		upd := w.j.LookupTaskGroup(tg).Update
		if upd == nil || !upd.AutoRevert {
			continue
		}

		// Unhealthy allocs and we need to autorevert
		return true, true, nil
	}

	return fail, false, nil
}

// getDeploymentProgressCutoff returns the progress cutoff for the given
// deployment
func (w *deploymentWatcher) getDeploymentProgressCutoff(d *structs.Deployment) time.Time {
	var next time.Time
	doneTGs := w.doneGroups(d)
	for name, state := range d.TaskGroups {
		// This task group is done so we don't have to concern ourselves with
		// its progress deadline.
		if done, ok := doneTGs[name]; ok && done {
			continue
		}

		if state.RequireProgressBy.IsZero() {
			continue
		}

		if next.IsZero() || state.RequireProgressBy.Before(next) {
			next = state.RequireProgressBy
		}
	}
	return next
}

// doneGroups returns a map of task group to whether the deployment appears to
// be done for the group. A true value doesn't mean no more action will be taken
// in the lifetime of the deployment because there could always be node
// failures, or rescheduling events.
func (w *deploymentWatcher) doneGroups(d *structs.Deployment) map[string]bool {
	if d == nil {
		return nil
	}

	// Collect the allocations by the task group
	snap, err := w.state.Snapshot()
	if err != nil {
		return nil
	}

	allocs, err := snap.AllocsByDeployment(nil, d.ID)
	if err != nil {
		return nil
	}

	// Go through the allocs and count up how many healthy allocs we have
	healthy := make(map[string]int, len(d.TaskGroups))
	for _, a := range allocs {
		if a.TerminalStatus() || !a.DeploymentStatus.IsHealthy() {
			continue
		}
		healthy[a.TaskGroup]++
	}

	// Go through each group and check if it is done
	groups := make(map[string]bool, len(d.TaskGroups))
	for name, state := range d.TaskGroups {
		// Requires promotion
		if state.DesiredCanaries != 0 && !state.Promoted {
			groups[name] = false
			continue
		}

		// Check we have enough healthy currently running allocations
		groups[name] = healthy[name] >= state.DesiredTotal
	}

	return groups
}

// latestStableJob returns the latest stable job. It may be nil if none exist
func (w *deploymentWatcher) latestStableJob() (*structs.Job, error) {
	snap, err := w.state.Snapshot()
	if err != nil {
		return nil, err
	}

	versions, err := snap.JobVersionsByID(nil, w.j.Namespace, w.j.ID)
	if err != nil {
		return nil, err
	}

	var stable *structs.Job
	for _, job := range versions {
		if job.Stable {
			stable = job
			break
		}
	}

	return stable, nil
}

// createBatchedUpdate creates an eval for the given index as well as updating
// the given allocations to allow them to reschedule.
func (w *deploymentWatcher) createBatchedUpdate(allowReplacements []string, forIndex uint64) {
	w.l.Lock()
	defer w.l.Unlock()

	// Store the allocations that can be replaced
	for _, allocID := range allowReplacements {
		if w.outstandingAllowReplacements == nil {
			w.outstandingAllowReplacements = make(map[string]*structs.DesiredTransition, len(allowReplacements))
		}
		w.outstandingAllowReplacements[allocID] = allowRescheduleTransition
	}

	if w.outstandingBatch || (forIndex < w.latestEval && len(allowReplacements) == 0) {
		return
	}

	w.outstandingBatch = true

	time.AfterFunc(perJobEvalBatchPeriod, func() {
		// If the timer has been created and then we shut down, we need to no-op
		// the evaluation creation.
		select {
		case <-w.ctx.Done():
			return
		default:
		}

		w.l.Lock()
		replacements := w.outstandingAllowReplacements
		w.outstandingAllowReplacements = nil
		w.outstandingBatch = false
		w.l.Unlock()

		// Create the eval
		if _, err := w.createUpdate(replacements, w.getEval()); err != nil {
			w.logger.Error("failed to create evaluation for deployment", "deployment_id", w.deploymentID, "error", err)
		}
	})
}

// getEval returns an evaluation suitable for the deployment
func (w *deploymentWatcher) getEval() *structs.Evaluation {
	return &structs.Evaluation{
		ID:           uuid.Generate(),
		Namespace:    w.j.Namespace,
		Priority:     w.j.Priority,
		Type:         w.j.Type,
		TriggeredBy:  structs.EvalTriggerDeploymentWatcher,
		JobID:        w.j.ID,
		DeploymentID: w.deploymentID,
		Status:       structs.EvalStatusPending,
	}
}

// getDeploymentStatusUpdate returns a deployment status update
func (w *deploymentWatcher) getDeploymentStatusUpdate(status, desc string) *structs.DeploymentStatusUpdate {
	return &structs.DeploymentStatusUpdate{
		DeploymentID:      w.deploymentID,
		Status:            status,
		StatusDescription: desc,
	}
}

type allocUpdates struct {
	allocs []*structs.AllocListStub
	index  uint64
	err    error
}

// getAllocsCh retrieves the allocations that are part of the deployment blocking
// at the given index.
func (w *deploymentWatcher) getAllocsCh(index uint64) <-chan *allocUpdates {
	out := make(chan *allocUpdates, 1)
	go func() {
		allocs, index, err := w.getAllocs(index)
		out <- &allocUpdates{
			allocs: allocs,
			index:  index,
			err:    err,
		}
	}()

	return out
}

// getAllocs retrieves the allocations that are part of the deployment blocking
// at the given index.
func (w *deploymentWatcher) getAllocs(index uint64) ([]*structs.AllocListStub, uint64, error) {
	resp, index, err := w.state.BlockingQuery(w.getAllocsImpl, index, w.ctx)
	if err != nil {
		return nil, 0, err
	}
	if err := w.ctx.Err(); err != nil {
		return nil, 0, err
	}

	return resp.([]*structs.AllocListStub), index, nil
}

// getAllocsImpl retrieves the allocations for the watched deployment from the
// passed state store.
func (w *deploymentWatcher) getAllocsImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	if err := w.queryLimiter.Wait(w.ctx); err != nil {
		return nil, 0, err
	}

	// Capture all the allocations
	allocs, err := state.AllocsByDeployment(ws, w.deploymentID)
	if err != nil {
		return nil, 0, err
	}

	maxIndex := uint64(0)
	stubs := make([]*structs.AllocListStub, 0, len(allocs))
	for _, alloc := range allocs {
		stubs = append(stubs, alloc.Stub())

		if maxIndex < alloc.ModifyIndex {
			maxIndex = alloc.ModifyIndex
		}
	}

	// Use the last index that affected the allocs table
	if len(stubs) == 0 {
		index, err := state.Index("allocs")
		if err != nil {
			return nil, index, err
		}
		maxIndex = index
	}

	return stubs, maxIndex, nil
}

// jobEvalStatus returns the latest eval index for a job. The index is used to
// determine if an allocation update requires an evaluation to be triggered.
func (w *deploymentWatcher) jobEvalStatus() (latestIndex uint64, err error) {
	if err := w.queryLimiter.Wait(w.ctx); err != nil {
		return 0, err
	}

	snap, err := w.state.Snapshot()
	if err != nil {
		return 0, err
	}

	evals, err := snap.EvalsByJob(nil, w.j.Namespace, w.j.ID)
	if err != nil {
		return 0, err
	}

	// If there are no evals for the job, return zero, since we want any
	// allocation change to trigger an evaluation.
	if len(evals) == 0 {
		return 0, nil
	}

	var max uint64
	for _, eval := range evals {
		// A cancelled eval never impacts what the scheduler has seen, so do
		// not use its indexes.
		if eval.Status == structs.EvalStatusCancelled {
			continue
		}

		// Prefer using the snapshot index. Otherwise use the create index
		if eval.SnapshotIndex != 0 && max < eval.SnapshotIndex {
			max = eval.SnapshotIndex
		} else if max < eval.CreateIndex {
			max = eval.CreateIndex
		}
	}

	return max, nil
}
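
// exampleBatchedTrigger is an illustrative sketch and is not part of the
// original package: the function name and its flush callback are hypothetical.
// It shows, in isolation, the same debounce pattern createBatchedUpdate uses
// above: the first request in a batch window arms a single time.AfterFunc for
// perJobEvalBatchPeriod, later requests only accumulate state, and the timer
// fires one flush for the whole batch (or no-ops if the context is done).
func exampleBatchedTrigger(ctx context.Context, flush func(ids []string)) func(id string) {
	var (
		mu          sync.Mutex
		pending     []string
		outstanding bool
	)

	return func(id string) {
		mu.Lock()
		defer mu.Unlock()

		// Accumulate the request; if a flush is already scheduled it will
		// pick this ID up, so there is nothing more to do.
		pending = append(pending, id)
		if outstanding {
			return
		}
		outstanding = true

		time.AfterFunc(perJobEvalBatchPeriod, func() {
			// No-op if the caller has shut down while the timer was pending.
			select {
			case <-ctx.Done():
				return
			default:
			}

			mu.Lock()
			ids := pending
			pending = nil
			outstanding = false
			mu.Unlock()

			flush(ids)
		})
	}
}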