github.com/hernad/nomad@v1.6.112/nomad/worker.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package nomad

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"strings"
	"sync"
	"time"

	metrics "github.com/armon/go-metrics"
	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/go-version"
	"github.com/hernad/nomad/helper/uuid"
	"github.com/hernad/nomad/nomad/state"
	"github.com/hernad/nomad/nomad/structs"
	"github.com/hernad/nomad/scheduler"
)

const (
	// backoffBaselineFast is the baseline time for exponential backoff
	backoffBaselineFast = 20 * time.Millisecond

	// backoffBaselineSlow is the baseline time for exponential backoff
	// but that is much slower than backoffBaselineFast
	backoffBaselineSlow = 500 * time.Millisecond

	// backoffLimitSlow is the limit of the exponential backoff for
	// the slower backoff
	backoffLimitSlow = 10 * time.Second

	// backoffSchedulerVersionMismatch is the backoff between retries when the
	// scheduler version mismatches that of the leader.
	backoffSchedulerVersionMismatch = 30 * time.Second

	// dequeueTimeout is used to timeout an evaluation dequeue so that
	// we can check if there is a shutdown event
	dequeueTimeout = 500 * time.Millisecond

	// raftSyncLimit is the limit of time we will wait for Raft replication
	// to catch up to the evaluation. This is used to fast Nack and
	// allow another scheduler to pick it up.
	raftSyncLimit = 5 * time.Second

	// dequeueErrGrace is the grace period where we don't log about
	// dequeue errors after start. This is to improve the user experience
	// in dev mode where the leader isn't elected for a few seconds.
	dequeueErrGrace = 10 * time.Second
)

type WorkerStatus int

//go:generate stringer -trimprefix=Worker -output worker_string_workerstatus.go -linecomment -type=WorkerStatus
const (
	WorkerUnknownStatus WorkerStatus = iota // Unknown
	WorkerStarting
	WorkerStarted
	WorkerPausing
	WorkerPaused
	WorkerResuming
	WorkerStopping
	WorkerStopped
)

type SchedulerWorkerStatus int

//go:generate stringer -trimprefix=Workload -output worker_string_schedulerworkerstatus.go -linecomment -type=SchedulerWorkerStatus
const (
	WorkloadUnknownStatus SchedulerWorkerStatus = iota
	WorkloadRunning
	WorkloadWaitingToDequeue
	WorkloadWaitingForRaft
	WorkloadScheduling
	WorkloadSubmitting
	WorkloadBackoff
	WorkloadStopped
	WorkloadPaused
)

// Worker is a single threaded scheduling worker. There may be multiple
// running per server (leader or follower). They are responsible for dequeuing
// pending evaluations, invoking schedulers, plan submission and the
// lifecycle around making task allocations. They bridge the business logic
// of the scheduler with the plumbing required to make it all work.
type Worker struct {
	srv    *Server
	logger log.Logger
	start  time.Time
	id     string

	status         WorkerStatus
	workloadStatus SchedulerWorkerStatus
	statusLock     sync.RWMutex

	pauseFlag bool
	pauseLock sync.Mutex
	pauseCond *sync.Cond
	ctx       context.Context
	cancelFn  context.CancelFunc

	// the Server.Config.EnabledSchedulers value is not safe for concurrent access, so
	// the worker needs a cached copy of it. Workers are stopped if this value changes.
	enabledSchedulers []string

	// failures is the count of errors encountered while dequeueing evaluations
	// and is used to calculate backoff.
	failures  uint
	evalToken string

	// snapshotIndex is the index of the snapshot in which the scheduler was
	// first invoked. It is used to mark the SnapshotIndex of evaluations
	// Created, Updated or Reblocked.
	snapshotIndex uint64
}

// NewWorker starts a new scheduler worker associated with the given server
func NewWorker(ctx context.Context, srv *Server, args SchedulerWorkerPoolArgs) (*Worker, error) {
	w := newWorker(ctx, srv, args)
	w.Start()
	return w, nil
}

// newWorker creates a worker without calling its Start func. This is useful for testing.
func newWorker(ctx context.Context, srv *Server, args SchedulerWorkerPoolArgs) *Worker {
	w := &Worker{
		id:                uuid.Generate(),
		srv:               srv,
		start:             time.Now(),
		status:            WorkerStarting,
		enabledSchedulers: make([]string, len(args.EnabledSchedulers)),
	}
	copy(w.enabledSchedulers, args.EnabledSchedulers)

	w.logger = srv.logger.ResetNamed("worker").With("worker_id", w.id)
	w.pauseCond = sync.NewCond(&w.pauseLock)
	w.ctx, w.cancelFn = context.WithCancel(ctx)

	return w
}

// ID returns a string ID for the worker.
func (w *Worker) ID() string {
	return w.id
}

// Start transitions a worker to the starting state. Check
// to see if it started using IsStarted()
func (w *Worker) Start() {
	w.setStatus(WorkerStarting)
	go w.run(raftSyncLimit)
}

// Pause transitions a worker to the pausing state. Check
// to see if it paused using IsPaused()
func (w *Worker) Pause() {
	if w.isPausable() {
		w.setStatus(WorkerPausing)
		w.setPauseFlag(true)
	}
}

// Resume transitions a worker to the resuming state. Check
// to see if the worker resumed by calling IsStarted()
func (w *Worker) Resume() {
	if w.IsPaused() {
		w.setStatus(WorkerResuming)
		w.setPauseFlag(false)
		w.pauseCond.Broadcast()
	}
}

// Stop transitions a worker to the stopping state. Check
// to see if the worker stopped by calling IsStopped()
func (w *Worker) Stop() {
	w.setStatus(WorkerStopping)
	w.shutdown()
}

// IsStarted returns a boolean indicating if this worker has been started.
func (w *Worker) IsStarted() bool {
	return w.GetStatus() == WorkerStarted
}

// IsPaused returns a boolean indicating if this worker has been paused.
func (w *Worker) IsPaused() bool {
	return w.GetStatus() == WorkerPaused
}

// IsStopped returns a boolean indicating if this worker has been stopped.
func (w *Worker) IsStopped() bool {
	return w.GetStatus() == WorkerStopped
}

func (w *Worker) isPausable() bool {
	w.statusLock.RLock()
	defer w.statusLock.RUnlock()
	switch w.status {
	case WorkerPausing, WorkerPaused, WorkerStopping, WorkerStopped:
		return false
	default:
		return true
	}
}

// GetStatus returns the status of the Worker
func (w *Worker) GetStatus() WorkerStatus {
	w.statusLock.RLock()
	defer w.statusLock.RUnlock()
	return w.status
}

// setStatuses is used internally to the worker to update the
// status of the worker and workload at one time, since some
// transitions need to update both values using the same lock.
func (w *Worker) setStatuses(newWorkerStatus WorkerStatus, newWorkloadStatus SchedulerWorkerStatus) {
	w.statusLock.Lock()
	defer w.statusLock.Unlock()
	w.setWorkerStatusLocked(newWorkerStatus)
	w.setWorkloadStatusLocked(newWorkloadStatus)
}

// setStatus is used internally to the worker to update the
// status of the worker based on calls to the Worker API. For
// atomically updating the scheduler status and the workload
// status, use `setStatuses`.
func (w *Worker) setStatus(newStatus WorkerStatus) {
	w.statusLock.Lock()
	defer w.statusLock.Unlock()
	w.setWorkerStatusLocked(newStatus)
}

func (w *Worker) setWorkerStatusLocked(newStatus WorkerStatus) {
	if newStatus == w.status {
		return
	}
	w.logger.Trace("changed worker status", "from", w.status, "to", newStatus)
	w.status = newStatus
}

// GetWorkloadStatus returns the status of the Worker's workload.
func (w *Worker) GetWorkloadStatus() SchedulerWorkerStatus {
	w.statusLock.RLock()
	defer w.statusLock.RUnlock()
	return w.workloadStatus
}

// setWorkloadStatus is used internally to the worker to update the
// status of the workload based on updates from the workload.
func (w *Worker) setWorkloadStatus(newStatus SchedulerWorkerStatus) {
	w.statusLock.Lock()
	defer w.statusLock.Unlock()
	w.setWorkloadStatusLocked(newStatus)
}

func (w *Worker) setWorkloadStatusLocked(newStatus SchedulerWorkerStatus) {
	if newStatus == w.workloadStatus {
		return
	}
	w.logger.Trace("changed workload status", "from", w.workloadStatus, "to", newStatus)
	w.workloadStatus = newStatus
}

type WorkerInfo struct {
	ID                string    `json:"id"`
	EnabledSchedulers []string  `json:"enabled_schedulers"`
	Started           time.Time `json:"started"`
	Status            string    `json:"status"`
	WorkloadStatus    string    `json:"workload_status"`
}

func (w WorkerInfo) Copy() WorkerInfo {
	out := WorkerInfo{
		ID:                w.ID,
		EnabledSchedulers: make([]string, len(w.EnabledSchedulers)),
		Started:           w.Started,
		Status:            w.Status,
		WorkloadStatus:    w.WorkloadStatus,
	}
	copy(out.EnabledSchedulers, w.EnabledSchedulers)
	return out
}

func (w WorkerInfo) String() string {
	// lazy implementation of WorkerInfo to string
	out, _ := json.Marshal(w)
	return string(out)
}

func (w *Worker) Info() WorkerInfo {
	w.pauseLock.Lock()
	defer w.pauseLock.Unlock()
	out := WorkerInfo{
		ID:                w.id,
		Status:            w.status.String(),
		WorkloadStatus:    w.workloadStatus.String(),
		EnabledSchedulers: make([]string, len(w.enabledSchedulers)),
	}
	out.Started = w.start
	copy(out.EnabledSchedulers, w.enabledSchedulers)
	return out
}

// ----------------------------------
// Pause Implementation
// These functions are used to support the worker's pause behaviors.
// ----------------------------------

func (w *Worker) setPauseFlag(pause bool) {
	w.pauseLock.Lock()
	defer w.pauseLock.Unlock()
	w.pauseFlag = pause
}

// maybeWait is responsible for making the transition from `pausing`
// to `paused`, waiting, and then transitioning back to the running
// values.
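// The wait itself is a standard condition-variable loop: maybeWait holds
// pauseLock and sleeps on pauseCond while pauseFlag is set, and Resume (or
// shutdown) clears pauseFlag and broadcasts on pauseCond to wake it.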
func (w *Worker) maybeWait() {
	w.pauseLock.Lock()
	defer w.pauseLock.Unlock()

	if !w.pauseFlag {
		return
	}

	w.statusLock.Lock()
	w.status = WorkerPaused
	originalWorkloadStatus := w.workloadStatus
	w.workloadStatus = WorkloadPaused
	w.logger.Trace("changed workload status", "from", originalWorkloadStatus, "to", w.workloadStatus)

	w.statusLock.Unlock()

	for w.pauseFlag {
		w.pauseCond.Wait()
	}

	w.statusLock.Lock()

	w.logger.Trace("changed workload status", "from", w.workloadStatus, "to", originalWorkloadStatus)
	w.workloadStatus = originalWorkloadStatus

	// Only reset the worker status if the worker is not stopping; a stopping
	// worker is woken from pause solely so that the paused workload can exit.
	if w.status != WorkerStopping {
		w.logger.Trace("changed worker status", "from", w.status, "to", WorkerStarted)
		w.status = WorkerStarted
	}
	w.statusLock.Unlock()
}

// shutdown is used to signal that the worker should shut down.
func (w *Worker) shutdown() {
	w.pauseLock.Lock()
	wasPaused := w.pauseFlag
	w.pauseFlag = false
	w.pauseLock.Unlock()

	w.logger.Trace("shutdown request received")
	w.cancelFn()
	if wasPaused {
		w.pauseCond.Broadcast()
	}
}

// markStopped is used to mark the worker and workload as stopped. It should be called in a
// defer immediately upon entering the run() function.
func (w *Worker) markStopped() {
	w.setStatuses(WorkerStopped, WorkloadStopped)
	w.logger.Debug("stopped")
}

func (w *Worker) workerShuttingDown() bool {
	select {
	case <-w.ctx.Done():
		return true
	default:
		return false
	}
}

// ----------------------------------
// Workload behavior code
// ----------------------------------

// run is the long-lived goroutine which is used to run the worker
func (w *Worker) run(raftSyncLimit time.Duration) {
	defer func() {
		w.markStopped()
	}()
	w.setStatuses(WorkerStarted, WorkloadRunning)
	w.logger.Debug("running")
	for {
		// Check to see if the context has been cancelled. Server shutdown and Shutdown()
		// should do this.
		if w.workerShuttingDown() {
			return
		}
		// Dequeue a pending evaluation
		eval, token, waitIndex, shutdown := w.dequeueEvaluation(dequeueTimeout)
		if shutdown {
			return
		}

		// Since dequeueing takes time, the server may have shut down after we
		// received an eval that now needs to be nacked before we exit.
		// Explicitly check the server to decide whether this eval can still be
		// processed.
		if w.srv.IsShutdown() {
			w.logger.Warn("nacking eval because the server is shutting down",
				"eval", log.Fmt("%#v", eval))
			w.sendNack(eval, token)
			return
		}

		// Wait for the raft log to catch up to the evaluation
		w.setWorkloadStatus(WorkloadWaitingForRaft)
		snap, err := w.snapshotMinIndex(waitIndex, raftSyncLimit)
		if err != nil {
			var timeoutErr ErrMinIndexDeadlineExceeded
			if errors.As(err, &timeoutErr) {
				w.logger.Warn("timeout waiting for Raft index required by eval",
					"eval", eval.ID, "index", waitIndex, "timeout", raftSyncLimit)
				w.sendNack(eval, token)

				// Timing out above means this server is woefully behind the
				// leader's index. This can happen when a new server is added to
				// a cluster and must initially sync the cluster state.
				// Backoff dequeuing another eval until there's some indication
				// this server would be up to date enough to process it.
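				// The snapshotMinIndex call below serves purely as that backoff:
				// its snapshot result is discarded, only the error is logged, and
				// the wait is capped at ten times the normal raft sync limit.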
				slowServerSyncLimit := 10 * raftSyncLimit
				if _, err := w.snapshotMinIndex(waitIndex, slowServerSyncLimit); err != nil {
					w.logger.Warn("server is unable to catch up to last eval's index", "error", err)
				}

			} else if errors.Is(err, context.Canceled) {
				// If the server has shutdown while we're waiting, we'll get the
				// Canceled error from the worker's context. We need to nack any
				// dequeued evals before we exit.
				w.logger.Warn("nacking eval because the server is shutting down", "eval", eval.ID)
				w.sendNack(eval, token)
				return
			} else {
				w.logger.Error("error waiting for Raft index", "error", err, "index", waitIndex)
				w.sendNack(eval, token)
			}

			continue
		}

		// Invoke the scheduler to determine placements
		w.setWorkloadStatus(WorkloadScheduling)
		if err := w.invokeScheduler(snap, eval, token); err != nil {
			w.logger.Error("error invoking scheduler", "error", err)
			w.sendNack(eval, token)
			continue
		}

		// Complete the evaluation
		w.sendAck(eval, token)
	}
}

// dequeueEvaluation is used to fetch the next ready evaluation.
// This blocks until an evaluation is available or a timeout is reached.
func (w *Worker) dequeueEvaluation(timeout time.Duration) (
	eval *structs.Evaluation, token string, waitIndex uint64, shutdown bool) {
	// Setup the request
	req := structs.EvalDequeueRequest{
		Schedulers:       w.enabledSchedulers,
		Timeout:          timeout,
		SchedulerVersion: scheduler.SchedulerVersion,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.EvalDequeueResponse

REQ:
	// Wait inside this function if the worker is paused.
	w.maybeWait()
	// Immediately check to see if the worker has been shutdown.
	if w.workerShuttingDown() {
		return nil, "", 0, true
	}

	// Make a blocking RPC
	start := time.Now()
	w.setWorkloadStatus(WorkloadWaitingToDequeue)
	err := w.srv.RPC("Eval.Dequeue", &req, &resp)
	metrics.MeasureSince([]string{"nomad", "worker", "dequeue_eval"}, start)
	if err != nil {
		if time.Since(w.start) > dequeueErrGrace && !w.workerShuttingDown() {
			w.logger.Error("failed to dequeue evaluation", "error", err)
		}

		// Adjust the backoff based on the error. If it is a scheduler version
		// mismatch we increase the baseline.
		base, limit := backoffBaselineFast, backoffLimitSlow
		if strings.Contains(err.Error(), "calling scheduler version") {
			base = backoffSchedulerVersionMismatch
			limit = backoffSchedulerVersionMismatch
		}

		if w.backoffErr(base, limit) {
			return nil, "", 0, true
		}
		goto REQ
	}
	w.backoffReset()

	// Check if we got a response
	if resp.Eval != nil {
		w.logger.Debug("dequeued evaluation", "eval_id", resp.Eval.ID, "type", resp.Eval.Type, "namespace", resp.Eval.Namespace, "job_id", resp.Eval.JobID, "node_id", resp.Eval.NodeID, "triggered_by", resp.Eval.TriggeredBy)
		return resp.Eval, resp.Token, resp.GetWaitIndex(), false
	}

	goto REQ
}

// sendAcknowledgement should not be called directly. Call `sendAck` or `sendNack` instead.
// This function implements `ack`ing or `nack`ing the evaluation generally.
// Any errors are logged but swallowed.
func (w *Worker) sendAcknowledgement(eval *structs.Evaluation, token string, ack bool) {
	defer metrics.MeasureSince([]string{"nomad", "worker", "send_ack"}, time.Now())
	// Setup the request
	req := structs.EvalAckRequest{
		EvalID: eval.ID,
		Token:  token,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

	// Determine if this is an Ack or Nack
	verb := "ack"
	endpoint := "Eval.Ack"
	if !ack {
		verb = "nack"
		endpoint = "Eval.Nack"
	}

	// Make the RPC call
	err := w.srv.RPC(endpoint, &req, &resp)
	if err != nil {
		w.logger.Error(fmt.Sprintf("failed to %s evaluation", verb), "eval_id", eval.ID, "error", err)
	} else {
		w.logger.Debug(fmt.Sprintf("%s evaluation", verb), "eval_id", eval.ID, "type", eval.Type, "namespace", eval.Namespace, "job_id", eval.JobID, "node_id", eval.NodeID, "triggered_by", eval.TriggeredBy)
	}
}

// sendNack makes a best effort to nack the evaluation.
// Any errors are logged but swallowed.
func (w *Worker) sendNack(eval *structs.Evaluation, token string) {
	w.sendAcknowledgement(eval, token, false)
}

// sendAck makes a best effort to ack the evaluation.
// Any errors are logged but swallowed.
func (w *Worker) sendAck(eval *structs.Evaluation, token string) {
	w.sendAcknowledgement(eval, token, true)
}

type ErrMinIndexDeadlineExceeded struct {
	waitIndex uint64
	timeout   time.Duration
}

// Unwrapping an ErrMinIndexDeadlineExceeded always returns
// context.DeadlineExceeded
func (ErrMinIndexDeadlineExceeded) Unwrap() error {
	return context.DeadlineExceeded
}

func (e ErrMinIndexDeadlineExceeded) Error() string {
	return fmt.Sprintf("timed out after %s waiting for index=%d", e.timeout, e.waitIndex)
}

// snapshotMinIndex times calls to StateStore.SnapshotMinIndex which may block.
func (w *Worker) snapshotMinIndex(waitIndex uint64, timeout time.Duration) (*state.StateSnapshot, error) {
	defer metrics.MeasureSince([]string{"nomad", "worker", "wait_for_index"}, time.Now())

	ctx, cancel := context.WithTimeout(w.ctx, timeout)
	snap, err := w.srv.fsm.State().SnapshotMinIndex(ctx, waitIndex)
	cancel()

	// Wrap error to ensure callers can detect timeouts.
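	// SnapshotMinIndex returns context.DeadlineExceeded once the timeout
	// context expires; converting that into ErrMinIndexDeadlineExceeded keeps
	// the wait index and timeout in the message while still unwrapping to
	// context.DeadlineExceeded for errors.Is checks.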
	if errors.Is(err, context.DeadlineExceeded) {
		return nil, ErrMinIndexDeadlineExceeded{
			waitIndex: waitIndex,
			timeout:   timeout,
		}
	}

	return snap, err
}

// invokeScheduler is used to invoke the business logic of the scheduler
func (w *Worker) invokeScheduler(snap *state.StateSnapshot, eval *structs.Evaluation, token string) error {
	defer metrics.MeasureSince([]string{"nomad", "worker", "invoke_scheduler", eval.Type}, time.Now())
	// Store the evaluation token
	w.evalToken = token

	// Store the snapshot's index
	var err error
	w.snapshotIndex, err = snap.LatestIndex()
	if err != nil {
		return fmt.Errorf("failed to determine snapshot's index: %v", err)
	}

	// Create the scheduler, or use the special core scheduler
	var sched scheduler.Scheduler
	if eval.Type == structs.JobTypeCore {
		sched = NewCoreScheduler(w.srv, snap)
	} else {
		sched, err = scheduler.NewScheduler(eval.Type, w.logger, w.srv.workersEventCh, snap, w)
		if err != nil {
			return fmt.Errorf("failed to instantiate scheduler: %v", err)
		}
	}

	// Process the evaluation
	err = sched.Process(eval)
	if err != nil {
		return fmt.Errorf("failed to process evaluation: %v", err)
	}
	return nil
}

// ServersMeetMinimumVersion allows implementations of the Scheduler interface in
// other packages to perform server version checks without direct references to
// the Nomad server.
func (w *Worker) ServersMeetMinimumVersion(minVersion *version.Version, checkFailedServers bool) bool {
	return ServersMeetMinimumVersion(w.srv.Members(), w.srv.Region(), minVersion, checkFailedServers)
}

// SubmitPlan is used to submit a plan for consideration. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler.State, error) {
	// Check for a shutdown before plan submission. Checking server state rather than
	// worker state to allow work in flight to complete before stopping.
	if w.srv.IsShutdown() {
		return nil, nil, fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "submit_plan"}, time.Now())

	// Add the evaluation token to the plan
	plan.EvalToken = w.evalToken

	// Add SnapshotIndex to ensure leader's StateStore processes the Plan
	// at or after the index at which it was created.
	plan.SnapshotIndex = w.snapshotIndex

	// Normalize stopped and preempted allocs before RPC
	normalizePlan := ServersMeetMinimumVersion(w.srv.Members(), w.srv.Region(), MinVersionPlanNormalization, true)
	if normalizePlan {
		plan.NormalizeAllocations()
	}

	// Setup the request
	req := structs.PlanRequest{
		Plan: plan,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.PlanResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Plan.Submit", &req, &resp); err != nil {
		w.logger.Error("failed to submit plan for evaluation", "eval_id", plan.EvalID, "error", err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return nil, nil, err
	} else {
		w.logger.Debug("submitted plan for evaluation", "eval_id", plan.EvalID)
		w.backoffReset()
	}

	// Look for a result
	result := resp.Result
	if result == nil {
		return nil, nil, fmt.Errorf("missing result")
	}

	// Check if a state update is required. This can happen if we planned based
	// on stale data that is now causing issues: for example, a node failing
	// since we started planning, or conflicting task allocations.
	var state scheduler.State
	if result.RefreshIndex != 0 {
		// Wait for the raft log to catch up to the evaluation
		w.logger.Debug("refreshing state", "refresh_index", result.RefreshIndex, "eval_id", plan.EvalID)

		var err error
		state, err = w.snapshotMinIndex(result.RefreshIndex, raftSyncLimit)
		if err != nil {
			return nil, nil, err
		}
	}

	// Return the result and potential state update
	return result, state, nil
}

// UpdateEval is used to submit an updated evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) UpdateEval(eval *structs.Evaluation) error {
	// Check for a shutdown before plan submission. Checking server state rather than
	// worker state to allow a worker's work in flight to complete before stopping.
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "update_eval"}, time.Now())

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex
	eval.UpdateModifyTime()

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Update", &req, &resp); err != nil {
		w.logger.Error("failed to update evaluation", "eval", log.Fmt("%#v", eval), "error", err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Debug("updated evaluation", "eval", log.Fmt("%#v", eval))
		w.backoffReset()
	}
	return nil
}

// CreateEval is used to create a new evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) CreateEval(eval *structs.Evaluation) error {
	// Check for a shutdown before plan submission. This consults the server's shutdown
	// state instead of the worker's to prevent aborting work in flight.
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "create_eval"}, time.Now())

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	now := time.Now().UTC().UnixNano()
	eval.CreateTime = now
	eval.ModifyTime = now

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Create", &req, &resp); err != nil {
		w.logger.Error("failed to create evaluation", "eval", log.Fmt("%#v", eval), "error", err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Debug("created evaluation", "eval", log.Fmt("%#v", eval))
		w.backoffReset()
	}
	return nil
}

// ReblockEval is used to reinsert a blocked evaluation into the blocked eval
// tracker. This allows the worker to act as the planner for the scheduler.
func (w *Worker) ReblockEval(eval *structs.Evaluation) error {
	// Check for a shutdown before plan submission. This checks the server state rather than
	// the worker's to prevent erroring on work in flight that would otherwise complete.
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "reblock_eval"}, time.Now())

	// Update the evaluation if the queued allocations are not the same as
	// what is recorded in the job summary
	ws := memdb.NewWatchSet()
	summary, err := w.srv.fsm.state.JobSummaryByID(ws, eval.Namespace, eval.JobID)
	if err != nil {
		return fmt.Errorf("couldn't retrieve job summary: %v", err)
	}
	if summary != nil {
		var hasChanged bool
		for tg, summary := range summary.Summary {
			if queued, ok := eval.QueuedAllocations[tg]; ok {
				if queued != summary.Queued {
					hasChanged = true
					break
				}
			}
		}
		if hasChanged {
			if err := w.UpdateEval(eval); err != nil {
				return err
			}
		}
	}

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex
	eval.UpdateModifyTime()

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Reblock", &req, &resp); err != nil {
		w.logger.Error("failed to reblock evaluation", "eval", log.Fmt("%#v", eval), "error", err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Debug("reblocked evaluation", "eval", log.Fmt("%#v", eval))
		w.backoffReset()
	}
	return nil
}

// shouldResubmit checks if a given error should be swallowed and the plan
// resubmitted after a backoff. Usually these are transient errors that
// the cluster should heal from quickly.
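// For example, "No cluster leader" and "plan queue is disabled" typically
// occur around leadership transitions and clear up once a new leader is
// established, so the submission is retried after a backoff.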
func (w *Worker) shouldResubmit(err error) bool {
	s := err.Error()
	switch {
	case strings.Contains(s, "No cluster leader"):
		return true
	case strings.Contains(s, "plan queue is disabled"):
		return true
	default:
		return false
	}
}

// backoffErr is used to do an exponential back off on error. This is
// maintained statefully for the worker. Returns true if attempts should be
// abandoned due to shutdown.
// This uses the worker's context in order to immediately stop the
// backoff if the server or the worker is shutdown.
func (w *Worker) backoffErr(base, limit time.Duration) bool {
	w.setWorkloadStatus(WorkloadBackoff)
	backoff := (1 << (2 * w.failures)) * base
	if backoff > limit {
		backoff = limit
	} else {
		w.failures++
	}
	select {
	case <-time.After(backoff):
		return false
	case <-w.ctx.Done():
		return true
	}
}

// backoffReset is used to reset the failure count for
// exponential backoff
func (w *Worker) backoffReset() {
	w.failures = 0
}
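
// The sketch below is illustrative only and is not part of the upstream file.
// It shows how the Worker lifecycle API above fits together, assuming the
// caller already has a running *Server (srv) and a context; the function name
// exampleWorkerLifecycle and the chosen scheduler types are hypothetical.
//
//	func exampleWorkerLifecycle(ctx context.Context, srv *Server) error {
//		w, err := NewWorker(ctx, srv, SchedulerWorkerPoolArgs{
//			EnabledSchedulers: []string{structs.JobTypeService, structs.JobTypeBatch},
//		})
//		if err != nil {
//			return err
//		}
//		w.Pause()  // run loop parks in maybeWait after finishing any in-flight eval
//		w.Resume() // pauseCond broadcast wakes the run loop back up
//		w.Stop()   // cancels w.ctx; run() returns and markStopped() records WorkerStopped
//		return nil
//	}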