github.com/hernad/nomad@v1.6.112/nomad/plan_apply.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package nomad

import (
    "context"
    "fmt"
    "runtime"
    "time"

    metrics "github.com/armon/go-metrics"
    log "github.com/hashicorp/go-hclog"
    memdb "github.com/hashicorp/go-memdb"
    multierror "github.com/hashicorp/go-multierror"
    "github.com/hashicorp/raft"
    "github.com/hernad/nomad/helper/uuid"
    "github.com/hernad/nomad/nomad/state"
    "github.com/hernad/nomad/nomad/structs"
)

// planner is used to manage the submitted allocation plans that are waiting
// to be assessed by the leader
type planner struct {
    *Server
    log log.Logger

    // planQueue is used to manage the submitted allocation
    // plans that are waiting to be assessed by the leader
    planQueue *PlanQueue

    // badNodeTracker keeps a score for nodes that have plan rejections.
    // Plan rejections are somewhat expected given Nomad's optimistic
    // scheduling, but repeated rejections for the same node may indicate an
    // undetected issue, so we need to track rejection history.
    badNodeTracker BadNodeTracker
}

// newPlanner returns a new planner to be used for managing allocation plans.
func newPlanner(s *Server) (*planner, error) {
    log := s.logger.Named("planner")

    // Create a plan queue
    planQueue, err := NewPlanQueue()
    if err != nil {
        return nil, err
    }

    // Create the bad node tracker.
    var badNodeTracker BadNodeTracker
    if s.config.NodePlanRejectionEnabled {
        config := DefaultCachedBadNodeTrackerConfig()

        config.Window = s.config.NodePlanRejectionWindow
        config.Threshold = s.config.NodePlanRejectionThreshold

        badNodeTracker, err = NewCachedBadNodeTracker(log, config)
        if err != nil {
            return nil, err
        }
    } else {
        badNodeTracker = &NoopBadNodeTracker{}
    }

    return &planner{
        Server:         s,
        log:            log,
        planQueue:      planQueue,
        badNodeTracker: badNodeTracker,
    }, nil
}

// planApply is a long lived goroutine that reads plan allocations from
// the plan queue, determines if they can be applied safely and applies
// them via Raft.
//
// Naively, we could simply dequeue a plan, verify it, apply it and then
// respond. However, the plan application is bounded by the Raft apply time
// and subject to some latency. This creates a stall condition, where we are
// not evaluating, but simply waiting for a transaction to apply.
//
// To avoid this, we overlap verification with apply. This means once
// we've verified plan N we attempt to apply it. However, while waiting
// for apply, we begin to verify plan N+1 under the assumption that plan
// N has succeeded.
//
// In this sense, we track two parallel versions of the world. One is
// the pessimistic one driven by the Raft log which is replicated. The
// other is optimistic and assumes our transactions will succeed. In the
// happy path, this lets us do productive work during the latency of
// apply.
//
// In the unhappy path (Raft transaction fails), effectively we only
// wasted work during a time we would have been waiting anyway. However,
// in anticipation of this case we cannot respond to the plan until
// the Raft log is updated. This means our schedulers will stall,
// but there are many of those and only a single plan verifier.
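//
// An illustrative pipeline of the overlap described above (timestamps and
// indexes are hypothetical):
//
//	t0: dequeue plan N, evaluate it against snapshot S
//	t1: submit plan N to Raft; concurrently dequeue plan N+1
//	t2: evaluate plan N+1 against S plus the optimistic apply of plan N
//	t3: plan N commits at index i; respond to N and record i so the next
//	    snapshot is taken at an index >= i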
func (p *planner) planApply() {
    // planIndexCh is used to track an outstanding application and receive
    // its committed index while snap holds an optimistic state which
    // includes that plan application.
    var planIndexCh chan uint64
    var snap *state.StateSnapshot

    // prevPlanResultIndex is the index at which the last PlanResult was
    // committed. Since only the last plan is optimistically applied to the
    // snapshot, it's possible the current snapshot's and plan's indexes
    // are less than the index the previous plan result was committed at.
    // prevPlanResultIndex also guards against the previous plan committing
    // during Dequeue, thus causing the snapshot containing the optimistic
    // commit to be discarded and potentially evaluating the current plan
    // against an index older than the previous plan was committed at.
    var prevPlanResultIndex uint64

    // Setup a worker pool with half the cores, with at least 1
    poolSize := runtime.NumCPU() / 2
    if poolSize == 0 {
        poolSize = 1
    }
    pool := NewEvaluatePool(poolSize, workerPoolBufferSize)
    defer pool.Shutdown()

    for {
        // Pull the next pending plan; exit if we are no longer leader
        pending, err := p.planQueue.Dequeue(0)
        if err != nil {
            return
        }

        // If the last plan has completed, get a new snapshot. Note that
        // receiving on a nil channel blocks forever, so when no apply is
        // outstanding the default case is taken immediately.
        select {
        case idx := <-planIndexCh:
            // Previous plan committed. Discard the snapshot and ensure
            // future snapshots include this plan. idx may be 0 if the
            // plan failed to apply, so use max(prev, idx)
            prevPlanResultIndex = max(prevPlanResultIndex, idx)
            planIndexCh = nil
            snap = nil
        default:
        }

        if snap != nil {
            // If the snapshot doesn't contain both the previous plan
            // result's index and the current plan's snapshot index,
            // discard it and get a new one below.
            minIndex := max(prevPlanResultIndex, pending.plan.SnapshotIndex)
            if idx, err := snap.LatestIndex(); err != nil || idx < minIndex {
                snap = nil
            }
        }

        // Snapshot the state so that we have a consistent view of the world
        // if no snapshot is available.
        // - planIndexCh will be nil if the previous plan result applied
        //   during Dequeue
        // - snap will be nil if its index < max(prevIndex, curIndex)
        if planIndexCh == nil || snap == nil {
            snap, err = p.snapshotMinIndex(prevPlanResultIndex, pending.plan.SnapshotIndex)
            if err != nil {
                p.logger.Error("failed to snapshot state", "error", err)
                pending.respond(nil, err)
                continue
            }
        }

        // Evaluate the plan
        result, err := evaluatePlan(pool, snap, pending.plan, p.logger)
        if err != nil {
            p.logger.Error("failed to evaluate plan", "error", err)
            pending.respond(nil, err)
            continue
        }

        // Check if any of the rejected nodes should be made ineligible.
        for _, nodeID := range result.RejectedNodes {
            if p.badNodeTracker.Add(nodeID) {
                result.IneligibleNodes = append(result.IneligibleNodes, nodeID)
            }
        }

        // Fast-path the response if there is nothing to do
        if result.IsNoOp() {
            pending.respond(result, nil)
            continue
        }

        // Ensure any parallel apply is complete before starting the next one.
        // This also limits how out of date our snapshot can be.
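        // Blocking here means at most one Raft apply is ever outstanding:
        // once the previous plan's committed index arrives, the snapshot is
        // refreshed so the evaluation below sees the committed result rather
        // than only the optimistic copy.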
        if planIndexCh != nil {
            idx := <-planIndexCh
            planIndexCh = nil
            prevPlanResultIndex = max(prevPlanResultIndex, idx)
            snap, err = p.snapshotMinIndex(prevPlanResultIndex, pending.plan.SnapshotIndex)
            if err != nil {
                p.logger.Error("failed to update snapshot state", "error", err)
                pending.respond(nil, err)
                continue
            }
        }

        // Dispatch the Raft transaction for the plan
        future, err := p.applyPlan(pending.plan, result, snap)
        if err != nil {
            p.logger.Error("failed to submit plan", "error", err)
            pending.respond(nil, err)
            continue
        }

        // Respond to the plan asynchronously; receive the plan's committed
        // index via the channel
        planIndexCh = make(chan uint64, 1)
        go p.asyncPlanWait(planIndexCh, future, result, pending)
    }
}

// snapshotMinIndex wraps SnapshotMinIndex with a 10s timeout and converts
// timeout errors to a more descriptive error message. The snapshot is
// guaranteed to include both the previous plan and all objects referenced by
// the plan, or an error is returned.
func (p *planner) snapshotMinIndex(prevPlanResultIndex, planSnapshotIndex uint64) (*state.StateSnapshot, error) {
    defer metrics.MeasureSince([]string{"nomad", "plan", "wait_for_index"}, time.Now())

    // The minimum index the snapshot must include is the max of the previous
    // plan result's and the current plan's snapshot index.
    minIndex := max(prevPlanResultIndex, planSnapshotIndex)

    // This timeout creates backpressure: any concurrent
    // Plan.Submit RPCs will block waiting for results. This sheds
    // load across all servers and gives Raft some CPU to catch up,
    // because schedulers won't dequeue more work while waiting.
    const timeout = 10 * time.Second
    ctx, cancel := context.WithTimeout(context.Background(), timeout)
    snap, err := p.fsm.State().SnapshotMinIndex(ctx, minIndex)
    cancel()
    if err == context.DeadlineExceeded {
        return nil, fmt.Errorf("timed out after %s waiting for index=%d (previous plan result index=%d; plan snapshot index=%d)",
            timeout, minIndex, prevPlanResultIndex, planSnapshotIndex)
    }

    return snap, err
}

// applyPlan is used to apply the plan result and to return the alloc index
func (p *planner) applyPlan(plan *structs.Plan, result *structs.PlanResult, snap *state.StateSnapshot) (raft.ApplyFuture, error) {
    now := time.Now().UTC().UnixNano()

    // Setup the update request
    req := structs.ApplyPlanResultsRequest{
        AllocUpdateRequest: structs.AllocUpdateRequest{
            Job: plan.Job,
        },
        Deployment:        result.Deployment,
        DeploymentUpdates: result.DeploymentUpdates,
        IneligibleNodes:   result.IneligibleNodes,
        EvalID:            plan.EvalID,
        UpdatedAt:         now,
    }

    preemptedJobIDs := make(map[structs.NamespacedID]struct{})

    if ServersMeetMinimumVersion(p.Members(), p.Region(), MinVersionPlanNormalization, true) {
        // Initialize the allocs request using the new optimized log entry format.
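        // Normalization here means stopped and preempted allocations are
        // shipped as AllocationDiff values (the alloc ID plus only the
        // fields that change) instead of full Allocation structs, keeping
        // the Raft log entry small; see normalizeStoppedAlloc and
        // normalizePreemptedAlloc below.
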
        // Determine the minimum number of updates; there could be more if
        // there are multiple updates per node
        req.AllocsStopped = make([]*structs.AllocationDiff, 0, len(result.NodeUpdate))
        req.AllocsUpdated = make([]*structs.Allocation, 0, len(result.NodeAllocation))
        req.AllocsPreempted = make([]*structs.AllocationDiff, 0, len(result.NodePreemptions))

        for _, updateList := range result.NodeUpdate {
            for _, stoppedAlloc := range updateList {
                req.AllocsStopped = append(req.AllocsStopped, normalizeStoppedAlloc(stoppedAlloc, now))
            }
        }

        for _, allocList := range result.NodeAllocation {
            req.AllocsUpdated = append(req.AllocsUpdated, allocList...)
        }

        // Set the time the alloc was applied for the first time. This can be
        // used to approximate the scheduling time.
        updateAllocTimestamps(req.AllocsUpdated, now)

        err := p.signAllocIdentities(plan.Job, req.AllocsUpdated)
        if err != nil {
            return nil, err
        }

        for _, preemptions := range result.NodePreemptions {
            for _, preemptedAlloc := range preemptions {
                req.AllocsPreempted = append(req.AllocsPreempted, normalizePreemptedAlloc(preemptedAlloc, now))

                // Gather job IDs to create follow-up evals
                appendNamespacedJobID(preemptedJobIDs, preemptedAlloc)
            }
        }
    } else {
        // COMPAT 0.11: This branch is deprecated and will only be used to support
        // application of older log entries. Expected to be removed in a future version.

        // Determine the minimum number of updates; there could be more if
        // there are multiple updates per node
        minUpdates := len(result.NodeUpdate)
        minUpdates += len(result.NodeAllocation)

        // Initialize using the older log entry format for Alloc and NodePreemptions
        req.Alloc = make([]*structs.Allocation, 0, minUpdates)
        req.NodePreemptions = make([]*structs.Allocation, 0, len(result.NodePreemptions))

        for _, updateList := range result.NodeUpdate {
            req.Alloc = append(req.Alloc, updateList...)
        }
        for _, allocList := range result.NodeAllocation {
            req.Alloc = append(req.Alloc, allocList...)
        }

        for _, preemptions := range result.NodePreemptions {
            req.NodePreemptions = append(req.NodePreemptions, preemptions...)
        }
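        // Note the contrast with the normalized branch above: this older
        // format re-serializes full Allocation structs into the log entry,
        // which is the overhead the AllocationDiff format avoids.
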
        // Set the time the alloc was applied for the first time. This can be
        // used to approximate the scheduling time.
        updateAllocTimestamps(req.Alloc, now)

        // Set the modify time for preempted allocs, if any, and gather job
        // IDs to create follow-up evals
        for _, alloc := range req.NodePreemptions {
            alloc.ModifyTime = now
            appendNamespacedJobID(preemptedJobIDs, alloc)
        }
    }

    var evals []*structs.Evaluation
    for preemptedJobID := range preemptedJobIDs {
        job, _ := p.State().JobByID(nil, preemptedJobID.Namespace, preemptedJobID.ID)
        if job != nil {
            eval := &structs.Evaluation{
                ID:          uuid.Generate(),
                Namespace:   job.Namespace,
                TriggeredBy: structs.EvalTriggerPreemption,
                JobID:       job.ID,
                Type:        job.Type,
                Priority:    job.Priority,
                Status:      structs.EvalStatusPending,
                CreateTime:  now,
                ModifyTime:  now,
            }
            evals = append(evals, eval)
        }
    }
    req.PreemptionEvals = evals

    // Dispatch the Raft transaction
    future, err := p.raftApplyFuture(structs.ApplyPlanResultsRequestType, &req)
    if err != nil {
        return nil, err
    }

    // Optimistically apply to our state view: the transaction has not
    // committed yet, so the next index is a best-effort guess of where the
    // entry will land in the Raft log.
    if snap != nil {
        nextIdx := p.raft.AppliedIndex() + 1
        if err := snap.UpsertPlanResults(structs.ApplyPlanResultsRequestType, nextIdx, &req); err != nil {
            return future, err
        }
    }
    return future, nil
}
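
// The two normalize helpers below implement the optimized log entry format:
// rather than re-serializing a full Allocation that followers already hold in
// state, only the key and the fields that change are shipped. An illustrative
// sketch (field values are hypothetical):
//
//	// In state: Allocation{ID: "web-a1", Job: ..., Resources: ..., ...}
//	// In log:   AllocationDiff{ID: "web-a1", ClientStatus: "complete",
//	//           DesiredDescription: "...", ModifyTime: now}
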
// normalizePreemptedAlloc removes redundant fields from a preempted allocation
// and returns an AllocationDiff. Since a preempted allocation is always an
// existing allocation, the struct returned by this method contains only the
// differential, which can be applied to an existing allocation to yield the
// updated struct.
func normalizePreemptedAlloc(preemptedAlloc *structs.Allocation, now int64) *structs.AllocationDiff {
    return &structs.AllocationDiff{
        ID:                    preemptedAlloc.ID,
        PreemptedByAllocation: preemptedAlloc.PreemptedByAllocation,
        ModifyTime:            now,
    }
}

// normalizeStoppedAlloc removes redundant fields from a stopped allocation and
// returns an AllocationDiff. Since a stopped allocation is always an existing
// allocation, the struct returned by this method contains only the
// differential, which can be applied to an existing allocation to yield the
// updated struct.
func normalizeStoppedAlloc(stoppedAlloc *structs.Allocation, now int64) *structs.AllocationDiff {
    return &structs.AllocationDiff{
        ID:                 stoppedAlloc.ID,
        DesiredDescription: stoppedAlloc.DesiredDescription,
        ClientStatus:       stoppedAlloc.ClientStatus,
        ModifyTime:         now,
        FollowupEvalID:     stoppedAlloc.FollowupEvalID,
    }
}

// appendNamespacedJobID appends the namespaced Job ID for the alloc to the
// jobIDs set
func appendNamespacedJobID(jobIDs map[structs.NamespacedID]struct{}, alloc *structs.Allocation) {
    id := structs.NamespacedID{Namespace: alloc.Namespace, ID: alloc.JobID}
    if _, ok := jobIDs[id]; !ok {
        jobIDs[id] = struct{}{}
    }
}

// updateAllocTimestamps sets the CreateTime and ModifyTime for the allocations
// to the timestamp provided
func updateAllocTimestamps(allocations []*structs.Allocation, timestamp int64) {
    for _, alloc := range allocations {
        if alloc.CreateTime == 0 {
            alloc.CreateTime = timestamp
        }
        alloc.ModifyTime = timestamp
    }
}

// signAllocIdentities signs the identity claims for each task in each
// allocation, keyed by task name.
func (p *planner) signAllocIdentities(job *structs.Job, allocations []*structs.Allocation) error {
    encrypter := p.Server.encrypter

    for _, alloc := range allocations {
        alloc.SignedIdentities = map[string]string{}
        tg := job.LookupTaskGroup(alloc.TaskGroup)
        for _, task := range tg.Tasks {
            claims := alloc.ToTaskIdentityClaims(job, task.Name)
            token, keyID, err := encrypter.SignClaims(claims)
            if err != nil {
                return err
            }
            alloc.SignedIdentities[task.Name] = token
            alloc.SigningKeyID = keyID
        }
    }
    return nil
}

// asyncPlanWait is used to apply and respond to a plan asynchronously. On
// successful commit the plan's index will be sent on the chan. On error the
// chan will be closed without a send, so the receiver observes the zero value.
func (p *planner) asyncPlanWait(indexCh chan<- uint64, future raft.ApplyFuture,
    result *structs.PlanResult, pending *pendingPlan) {
    defer metrics.MeasureSince([]string{"nomad", "plan", "apply"}, time.Now())
    defer close(indexCh)

    // Wait for the plan to apply
    if err := future.Error(); err != nil {
        p.logger.Error("failed to apply plan", "error", err)
        pending.respond(nil, err)
        return
    }

    // Respond to the plan
    index := future.Index()
    result.AllocIndex = index

    // If this is a partial plan application, we need to ensure the scheduler
    // at least has visibility into any placements it made to avoid double
    // placement. The RefreshIndex computed by evaluatePlan may be stale due
    // to evaluation against an optimistic copy of the state.
    if result.RefreshIndex != 0 {
        result.RefreshIndex = maxUint64(result.RefreshIndex, result.AllocIndex)
    }
    pending.respond(result, nil)
    indexCh <- index
}
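
// The indexCh handoff above pairs with the receive side in planApply; a
// sketch of the consumer (mirroring the code in planApply, not a separate
// API):
//
//	planIndexCh = make(chan uint64, 1)
//	go p.asyncPlanWait(planIndexCh, future, result, pending)
//	// ...
//	idx := <-planIndexCh // committed index, or 0 if the apply failed,
//	                     // since the deferred close yields the zero value
//	prevPlanResultIndex = max(prevPlanResultIndex, idx)
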
// evaluatePlan is used to determine what portions of a plan can be applied,
// if any. It returns a plan application, which may be partial, or an error.
func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan, logger log.Logger) (*structs.PlanResult, error) {
    defer metrics.MeasureSince([]string{"nomad", "plan", "evaluate"}, time.Now())

    logger.Trace("evaluating plan", "plan", log.Fmt("%#v", plan))

    // Denormalize the stopped allocations without the job
    err := snap.DenormalizeAllocationsMap(plan.NodeUpdate)
    if err != nil {
        return nil, err
    }
    // Denormalize the preempted allocations without the job
    err = snap.DenormalizeAllocationsMap(plan.NodePreemptions)
    if err != nil {
        return nil, err
    }

    // Check if the plan exceeds quota
    overQuota, err := evaluatePlanQuota(snap, plan)
    if err != nil {
        return nil, err
    }

    // Reject the plan and force the scheduler to refresh
    if overQuota {
        index, err := refreshIndex(snap)
        if err != nil {
            return nil, err
        }

        logger.Debug("plan for evaluation exceeds quota limit. Forcing state refresh", "eval_id", plan.EvalID, "refresh_index", index)
        return &structs.PlanResult{RefreshIndex: index}, nil
    }

    return evaluatePlanPlacements(pool, snap, plan, logger)
}

// evaluatePlanPlacements is used to determine what portions of a plan can be
// applied, if any, looking for node overcommitment. It returns a plan
// application, which may be partial, or an error.
func evaluatePlanPlacements(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan, logger log.Logger) (*structs.PlanResult, error) {
    // Create a result holder for the plan
    result := &structs.PlanResult{
        NodeUpdate:        make(map[string][]*structs.Allocation),
        NodeAllocation:    make(map[string][]*structs.Allocation),
        Deployment:        plan.Deployment.Copy(),
        DeploymentUpdates: plan.DeploymentUpdates,
        NodePreemptions:   make(map[string][]*structs.Allocation),
    }

    // Collect all the nodeIDs
    nodeIDs := make(map[string]struct{})
    nodeIDList := make([]string, 0, len(plan.NodeUpdate)+len(plan.NodeAllocation))
    for nodeID := range plan.NodeUpdate {
        if _, ok := nodeIDs[nodeID]; !ok {
            nodeIDs[nodeID] = struct{}{}
            nodeIDList = append(nodeIDList, nodeID)
        }
    }
    for nodeID := range plan.NodeAllocation {
        if _, ok := nodeIDs[nodeID]; !ok {
            nodeIDs[nodeID] = struct{}{}
            nodeIDList = append(nodeIDList, nodeID)
        }
    }

    // Setup a multierror to handle the potentially many errors,
    // since we are processing in parallel.
    var mErr multierror.Error
    partialCommit := false
    rejectedNodes := make(map[string]struct{})

    // handleResult is used to process the result of evaluateNodePlan
    handleResult := func(nodeID string, fit bool, reason string, err error) (cancel bool) {
        // Evaluate the plan for this node
        if err != nil {
            mErr.Errors = append(mErr.Errors, err)
            return true
        }
        if !fit {
            metrics.IncrCounterWithLabels([]string{"nomad", "plan", "node_rejected"}, 1, []metrics.Label{{Name: "node_id", Value: nodeID}})

            // Log the reason why the node's allocations could not be made
            if reason != "" {
                // TODO: This was debug level and should return
                // to debug level in the future. However, until
                // https://github.com/hernad/nomad/issues/9506
                // is resolved this log line is the only way to
                // monitor the disagreement between workers and
                // the plan applier.
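                // A rejection here means the plan applier's view of the
                // node disagrees with the snapshot the scheduler planned
                // against; occasional rejections are expected with
                // optimistic scheduling, while repeated rejections for the
                // same node feed the badNodeTracker above.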
                logger.Info("plan for node rejected, refer to https://www.nomadproject.io/s/port-plan-failure for more information",
                    "node_id", nodeID, "reason", reason, "eval_id", plan.EvalID,
                    "namespace", plan.Job.Namespace)
            }
            // Set that this is a partial commit and store the node that was
            // rejected so the plan applier can detect repeated plan rejections
            // for the same node.
            partialCommit = true
            rejectedNodes[nodeID] = struct{}{}

            // If we require all-at-once scheduling, there is no point
            // continuing the evaluation, as we've already failed.
            if plan.AllAtOnce {
                result.NodeUpdate = nil
                result.NodeAllocation = nil
                result.DeploymentUpdates = nil
                result.Deployment = nil
                result.NodePreemptions = nil
                return true
            }

            // Skip this node, since it cannot be used.
            return
        }

        // Add this to the plan result
        if nodeUpdate := plan.NodeUpdate[nodeID]; len(nodeUpdate) > 0 {
            result.NodeUpdate[nodeID] = nodeUpdate
        }
        if nodeAlloc := plan.NodeAllocation[nodeID]; len(nodeAlloc) > 0 {
            result.NodeAllocation[nodeID] = nodeAlloc
        }

        if nodePreemptions := plan.NodePreemptions[nodeID]; nodePreemptions != nil {
            // Do a pass over the preempted allocs in the plan to check
            // whether any alloc is already in a terminal state
            var filteredNodePreemptions []*structs.Allocation
            for _, preemptedAlloc := range nodePreemptions {
                alloc, err := snap.AllocByID(nil, preemptedAlloc.ID)
                if err != nil {
                    mErr.Errors = append(mErr.Errors, err)
                    continue
                }
                if alloc != nil && !alloc.TerminalStatus() {
                    filteredNodePreemptions = append(filteredNodePreemptions, preemptedAlloc)
                }
            }

            result.NodePreemptions[nodeID] = filteredNodePreemptions
        }

        return
    }

    // Get the pool channels
    req := pool.RequestCh()
    resp := pool.ResultCh()
    outstanding := 0
    didCancel := false

    // Evaluate each node in the plan, handling results as they become ready
    // to avoid blocking.
OUTER:
    for len(nodeIDList) > 0 {
        nodeID := nodeIDList[0]
        select {
        case req <- evaluateRequest{snap, plan, nodeID}:
            outstanding++
            nodeIDList = nodeIDList[1:]
        case r := <-resp:
            outstanding--

            // Handle a result that allows us to cancel evaluation,
            // which may save time processing additional entries.
            if cancel := handleResult(r.nodeID, r.fit, r.reason, r.err); cancel {
                didCancel = true
                break OUTER
            }
        }
    }

    // Drain the remaining results
    for outstanding > 0 {
        r := <-resp
        if !didCancel {
            if cancel := handleResult(r.nodeID, r.fit, r.reason, r.err); cancel {
                didCancel = true
            }
        }
        outstanding--
    }

    // If the plan resulted in a partial commit, we need to determine
    // a minimum refresh index to force the scheduler to work on a more
    // up-to-date state to avoid the failures.
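    // For example (hypothetical indexes): if the scheduler planned at index
    // 100 but the applier's state has moved on to index 105, returning
    // RefreshIndex 105 tells the scheduler to replan against state at least
    // that fresh.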
    if partialCommit {
        index, err := refreshIndex(snap)
        if err != nil {
            mErr.Errors = append(mErr.Errors, err)
        }
        result.RefreshIndex = index

        if result.RefreshIndex == 0 {
            err := fmt.Errorf("partialCommit with RefreshIndex of 0")
            mErr.Errors = append(mErr.Errors, err)
        }

        // If there was a partial commit and we are operating within a
        // deployment, correct for any canary that may have been desired to
        // be placed but wasn't actually placed
        correctDeploymentCanaries(result)
    }

    for n := range rejectedNodes {
        result.RejectedNodes = append(result.RejectedNodes, n)
    }
    return result, mErr.ErrorOrNil()
}

// correctDeploymentCanaries ensures that the deployment object doesn't list
// any canaries as placed if they didn't actually get placed. This could happen
// if the plan had a partial commit.
func correctDeploymentCanaries(result *structs.PlanResult) {
    // Hot path
    if result.Deployment == nil || !result.Deployment.HasPlacedCanaries() {
        return
    }

    // Build a set of all the allocation IDs that were placed
    placedAllocs := make(map[string]struct{}, len(result.NodeAllocation))
    for _, placed := range result.NodeAllocation {
        for _, alloc := range placed {
            placedAllocs[alloc.ID] = struct{}{}
        }
    }

    // Go through all the canaries and ensure that the result list only
    // contains those that have been placed
    for _, group := range result.Deployment.TaskGroups {
        canaries := group.PlacedCanaries
        if len(canaries) == 0 {
            continue
        }

        // Prune the canaries in place to avoid allocating an extra slice
        i := 0
        for _, canaryID := range canaries {
            if _, ok := placedAllocs[canaryID]; ok {
                canaries[i] = canaryID
                i++
            }
        }

        group.PlacedCanaries = canaries[:i]
    }
}

// evaluateNodePlan is used to evaluate the plan for a single node, returning
// whether the plan fits along with a reason, or an error if one is
// encountered
func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID string) (bool, string, error) {
    // If this is an evict-only plan, it always 'fits' since we are removing
    // things.
    if len(plan.NodeAllocation[nodeID]) == 0 {
        return true, "", nil
    }

    // Get the node itself
    ws := memdb.NewWatchSet()
    node, err := snap.NodeByID(ws, nodeID)
    if err != nil {
        return false, "", fmt.Errorf("failed to get node '%s': %v", nodeID, err)
    }

    // If the node does not exist or is not ready for scheduling it is not fit
    // XXX: There is a potential race between when we do this check and when
    // the Raft commit happens.
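    // The checks below proceed from cheapest to most involved: node
    // existence, node status and eligibility, then a full capacity check of
    // the proposed allocation set against the node.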
    if node == nil {
        return false, "node does not exist", nil
    } else if node.Status == structs.NodeStatusDisconnected {
        if isValidForDisconnectedNode(plan, node.ID) {
            return true, "", nil
        }
        return false, "node is disconnected and contains invalid updates", nil
    } else if node.Status != structs.NodeStatusReady {
        return false, "node is not ready for placements", nil
    }

    // Get the existing allocations that are non-terminal
    existingAlloc, err := snap.AllocsByNodeTerminal(ws, nodeID, false)
    if err != nil {
        return false, "", fmt.Errorf("failed to get existing allocations for '%s': %v", nodeID, err)
    }

    // If nodeAllocations is a subset of the existing allocations we can
    // continue, even if the node is not eligible, as only in-place updates
    // or stop/evict are performed
    if structs.AllocSubset(existingAlloc, plan.NodeAllocation[nodeID]) {
        return true, "", nil
    }
    if node.SchedulingEligibility == structs.NodeSchedulingIneligible {
        return false, "node is not eligible", nil
    }

    // Determine the proposed allocation by first removing allocations
    // that are planned evictions and adding the new allocations.
    var remove []*structs.Allocation
    if update := plan.NodeUpdate[nodeID]; len(update) > 0 {
        remove = append(remove, update...)
    }

    // Remove any preempted allocs
    if preempted := plan.NodePreemptions[nodeID]; len(preempted) > 0 {
        remove = append(remove, preempted...)
    }

    // Remove any existing versions of the allocations being placed, so that
    // the updated versions appended below are the ones counted.
    if updated := plan.NodeAllocation[nodeID]; len(updated) > 0 {
        remove = append(remove, updated...)
    }
    proposed := structs.RemoveAllocs(existingAlloc, remove)
    proposed = append(proposed, plan.NodeAllocation[nodeID]...)

    // Check if these allocations fit
    fit, reason, _, err := structs.AllocsFit(node, proposed, nil, true)
    return fit, reason, err
}

// isValidForDisconnectedNode returns whether the plan is valid for a
// disconnected node: it must only contain updates to mark allocations as
// unknown.
func isValidForDisconnectedNode(plan *structs.Plan, nodeID string) bool {
    for _, alloc := range plan.NodeAllocation[nodeID] {
        if alloc.ClientStatus != structs.AllocClientStatusUnknown {
            return false
        }
    }

    return true
}

// max returns the larger of two uint64 values.
func max(a, b uint64) uint64 {
    if a > b {
        return a
    }
    return b
}