// github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/nomad/worker.go

package nomad

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	metrics "github.com/armon/go-metrics"
	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

const (
	// backoffBaselineFast is the baseline time for exponential backoff
	backoffBaselineFast = 20 * time.Millisecond

	// backoffBaselineSlow is the baseline time for exponential backoff
	// but that is much slower than backoffBaselineFast
	backoffBaselineSlow = 500 * time.Millisecond

	// backoffLimitSlow is the limit of the exponential backoff for
	// the slower backoff
	backoffLimitSlow = 10 * time.Second

	// backoffSchedulerVersionMismatch is the backoff between retries when the
	// scheduler version does not match that of the leader.
	backoffSchedulerVersionMismatch = 30 * time.Second

	// dequeueTimeout is used to timeout an evaluation dequeue so that
	// we can check if there is a shutdown event
	dequeueTimeout = 500 * time.Millisecond

	// raftSyncLimit is the limit of time we will wait for Raft replication
	// to catch up to the evaluation. This is used to fast-Nack the
	// evaluation and allow another scheduler to pick it up.
	raftSyncLimit = 5 * time.Second

	// dequeueErrGrace is the grace period where we don't log about
	// dequeue errors after start. This is to improve the user experience
	// in dev mode where the leader isn't elected for a few seconds.
	dequeueErrGrace = 10 * time.Second
)

// Worker is a single-threaded scheduling worker. There may be multiple
// running per server (leader or follower). Workers are responsible for
// dequeuing pending evaluations, invoking schedulers, submitting plans,
// and managing the lifecycle around making task allocations. They bridge
// the business logic of the scheduler with the plumbing required to make
// it all work.
type Worker struct {
	srv    *Server
	logger log.Logger
	start  time.Time

	paused    bool
	pauseLock sync.Mutex
	pauseCond *sync.Cond

	failures uint

	evalToken string

	// snapshotIndex is the index of the snapshot in which the scheduler was
	// first invoked. It is used to mark the SnapshotIndex of evaluations
	// Created, Updated or Reblocked.
	snapshotIndex uint64
}
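// The sketch below is illustrative and not part of this file: it shows the
// intended call pattern for NewWorker, where a server starts a fixed number
// of workers at setup time. The helper name startWorkers and its count
// parameter are assumptions for the example.
func startWorkers(srv *Server, count int) ([]*Worker, error) {
	workers := make([]*Worker, 0, count)
	for i := 0; i < count; i++ {
		// NewWorker spawns the worker's run() goroutine immediately.
		w, err := NewWorker(srv)
		if err != nil {
			return nil, err
		}
		workers = append(workers, w)
	}
	return workers, nil
}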
// NewWorker starts a new worker associated with the given server
func NewWorker(srv *Server) (*Worker, error) {
	w := &Worker{
		srv:    srv,
		logger: srv.logger.ResetNamed("worker"),
		start:  time.Now(),
	}
	w.pauseCond = sync.NewCond(&w.pauseLock)
	go w.run()
	return w, nil
}

// SetPause is used to pause or unpause a worker
func (w *Worker) SetPause(p bool) {
	w.pauseLock.Lock()
	w.paused = p
	w.pauseLock.Unlock()
	if !p {
		w.pauseCond.Broadcast()
	}
}

// checkPaused is used to park the worker when paused
func (w *Worker) checkPaused() {
	w.pauseLock.Lock()
	for w.paused {
		w.pauseCond.Wait()
	}
	w.pauseLock.Unlock()
}

// run is the long-lived goroutine which is used to run the worker
func (w *Worker) run() {
	for {
		// Dequeue a pending evaluation
		eval, token, waitIndex, shutdown := w.dequeueEvaluation(dequeueTimeout)
		if shutdown {
			return
		}

		// Check for a shutdown
		if w.srv.IsShutdown() {
			w.logger.Error("nacking eval because the server is shutting down", "eval", log.Fmt("%#v", eval))
			w.sendAck(eval.ID, token, false)
			return
		}

		// Wait for the raft log to catch up to the evaluation
		snap, err := w.snapshotMinIndex(waitIndex, raftSyncLimit)
		if err != nil {
			w.logger.Error("error waiting for Raft index", "error", err, "index", waitIndex)
			w.sendAck(eval.ID, token, false)
			continue
		}

		// Invoke the scheduler to determine placements
		if err := w.invokeScheduler(snap, eval, token); err != nil {
			w.logger.Error("error invoking scheduler", "error", err)
			w.sendAck(eval.ID, token, false)
			continue
		}

		// Complete the evaluation
		w.sendAck(eval.ID, token, true)
	}
}
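// Illustrative only, not part of this file: SetPause parks a worker at its
// next checkPaused call, and unpausing broadcasts on the condition variable
// so parked workers resume. A hypothetical helper toggling a pool of
// workers (for example, when a server wants to reserve scheduling capacity)
// might look like this:
func setPauseAll(workers []*Worker, paused bool) {
	for _, w := range workers {
		w.SetPause(paused)
	}
}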
// dequeueEvaluation is used to fetch the next ready evaluation.
// This blocks until an evaluation is available or a timeout is reached.
func (w *Worker) dequeueEvaluation(timeout time.Duration) (
	eval *structs.Evaluation, token string, waitIndex uint64, shutdown bool) {
	// Setup the request
	req := structs.EvalDequeueRequest{
		Schedulers:       w.srv.config.EnabledSchedulers,
		Timeout:          timeout,
		SchedulerVersion: scheduler.SchedulerVersion,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.EvalDequeueResponse

REQ:
	// Check if we are paused
	w.checkPaused()

	// Make a blocking RPC
	start := time.Now()
	err := w.srv.RPC("Eval.Dequeue", &req, &resp)
	metrics.MeasureSince([]string{"nomad", "worker", "dequeue_eval"}, start)
	if err != nil {
		if time.Since(w.start) > dequeueErrGrace && !w.srv.IsShutdown() {
			w.logger.Error("failed to dequeue evaluation", "error", err)
		}

		// Adjust the backoff based on the error. If it is a scheduler version
		// mismatch we increase the baseline.
		base, limit := backoffBaselineFast, backoffLimitSlow
		if strings.Contains(err.Error(), "calling scheduler version") {
			base = backoffSchedulerVersionMismatch
			limit = backoffSchedulerVersionMismatch
		}

		if w.backoffErr(base, limit) {
			return nil, "", 0, true
		}
		goto REQ
	}
	w.backoffReset()

	// Check if we got a response
	if resp.Eval != nil {
		w.logger.Debug("dequeued evaluation", "eval_id", resp.Eval.ID)
		return resp.Eval, resp.Token, resp.GetWaitIndex(), false
	}

	// Check for potential shutdown
	if w.srv.IsShutdown() {
		return nil, "", 0, true
	}
	goto REQ
}

// sendAck makes a best effort to ack or nack the evaluation.
// Any errors are logged but swallowed.
func (w *Worker) sendAck(evalID, token string, ack bool) {
	defer metrics.MeasureSince([]string{"nomad", "worker", "send_ack"}, time.Now())
	// Setup the request
	req := structs.EvalAckRequest{
		EvalID: evalID,
		Token:  token,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

	// Determine if this is an Ack or Nack
	verb := "ack"
	endpoint := "Eval.Ack"
	if !ack {
		verb = "nack"
		endpoint = "Eval.Nack"
	}

	// Make the RPC call
	err := w.srv.RPC(endpoint, &req, &resp)
	if err != nil {
		w.logger.Error(fmt.Sprintf("failed to %s evaluation", verb), "eval_id", evalID, "error", err)
	} else {
		w.logger.Debug(fmt.Sprintf("%s evaluation", verb), "eval_id", evalID)
	}
}

// snapshotMinIndex times calls to StateStore.SnapshotMinIndex, which may
// block.
func (w *Worker) snapshotMinIndex(waitIndex uint64, timeout time.Duration) (*state.StateSnapshot, error) {
	start := time.Now()
	ctx, cancel := context.WithTimeout(w.srv.shutdownCtx, timeout)
	snap, err := w.srv.fsm.State().SnapshotMinIndex(ctx, waitIndex)
	cancel()
	metrics.MeasureSince([]string{"nomad", "worker", "wait_for_index"}, start)

	// Wrap error to ensure callers don't disregard timeouts.
	if err == context.DeadlineExceeded {
		err = fmt.Errorf("timed out after %s waiting for index=%d", timeout, waitIndex)
	}

	return snap, err
}

// invokeScheduler is used to invoke the business logic of the scheduler
func (w *Worker) invokeScheduler(snap *state.StateSnapshot, eval *structs.Evaluation, token string) error {
	defer metrics.MeasureSince([]string{"nomad", "worker", "invoke_scheduler", eval.Type}, time.Now())
	// Store the evaluation token
	w.evalToken = token

	// Store the snapshot's index
	var err error
	w.snapshotIndex, err = snap.LatestIndex()
	if err != nil {
		return fmt.Errorf("failed to determine snapshot's index: %v", err)
	}

	// Create the scheduler, or use the special core scheduler
	var sched scheduler.Scheduler
	if eval.Type == structs.JobTypeCore {
		sched = NewCoreScheduler(w.srv, snap)
	} else {
		sched, err = scheduler.NewScheduler(eval.Type, w.logger, snap, w)
		if err != nil {
			return fmt.Errorf("failed to instantiate scheduler: %v", err)
		}
	}

	// Process the evaluation
	err = sched.Process(eval)
	if err != nil {
		return fmt.Errorf("failed to process evaluation: %v", err)
	}
	return nil
}
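// The worker is handed to scheduler.NewScheduler above as the scheduler's
// planner. A compile-time assertion of that relationship (illustrative;
// this line is not in the original file, and it assumes scheduler.Planner
// is the interface covering SubmitPlan, UpdateEval, CreateEval, and
// ReblockEval below):
var _ scheduler.Planner = (*Worker)(nil)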
// SubmitPlan is used to submit a plan for consideration. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler.State, error) {
	// Check for a shutdown before plan submission
	if w.srv.IsShutdown() {
		return nil, nil, fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "submit_plan"}, time.Now())

	// Add the evaluation token to the plan
	plan.EvalToken = w.evalToken

	// Add SnapshotIndex to ensure leader's StateStore processes the Plan
	// at or after the index at which it was created.
	plan.SnapshotIndex = w.snapshotIndex

	// Normalize stopped and preempted allocs before RPC
	normalizePlan := ServersMeetMinimumVersion(w.srv.Members(), MinVersionPlanNormalization, true)
	if normalizePlan {
		plan.NormalizeAllocations()
	}

	// Setup the request
	req := structs.PlanRequest{
		Plan: plan,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.PlanResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Plan.Submit", &req, &resp); err != nil {
		w.logger.Error("failed to submit plan for evaluation", "eval_id", plan.EvalID, "error", err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return nil, nil, err
	} else {
		w.logger.Debug("submitted plan for evaluation", "eval_id", plan.EvalID)
		w.backoffReset()
	}

	// Look for a result
	result := resp.Result
	if result == nil {
		return nil, nil, fmt.Errorf("missing result")
	}

	// Check if a state update is required. This could be required if we
	// planned based on stale data, for example a node failure since we
	// started planning, or conflicting task allocations.
	var state scheduler.State
	if result.RefreshIndex != 0 {
		// Wait for the raft log to catch up to the evaluation
		w.logger.Debug("refreshing state", "refresh_index", result.RefreshIndex, "eval_id", plan.EvalID)

		var err error
		state, err = w.snapshotMinIndex(result.RefreshIndex, raftSyncLimit)
		if err != nil {
			return nil, nil, err
		}
	}

	// Return the result and potential state update
	return result, state, nil
}
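// A minimal sketch (hypothetical helper; not part of this file) of the
// contract SubmitPlan exposes to schedulers: a non-nil scheduler.State
// means the plan was evaluated against stale data and the scheduler should
// re-plan from the refreshed snapshot before continuing.
func submitAndCheckFreshness(w *Worker, plan *structs.Plan) (*structs.PlanResult, error) {
	result, refreshed, err := w.SubmitPlan(plan)
	if err != nil {
		return nil, err
	}
	if refreshed != nil {
		// The real schedulers re-run placement against the refreshed state;
		// this sketch only surfaces that it happened.
		w.logger.Debug("plan applied against stale state; caller should re-plan", "eval_id", plan.EvalID)
	}
	return result, nil
}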
// UpdateEval is used to submit an updated evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) UpdateEval(eval *structs.Evaluation) error {
	// Check for a shutdown before updating the evaluation
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "update_eval"}, time.Now())

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex
	eval.UpdateModifyTime()

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Update", &req, &resp); err != nil {
		w.logger.Error("failed to update evaluation", "eval", log.Fmt("%#v", eval), "error", err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Debug("updated evaluation", "eval", log.Fmt("%#v", eval))
		w.backoffReset()
	}
	return nil
}

// CreateEval is used to create a new evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) CreateEval(eval *structs.Evaluation) error {
	// Check for a shutdown before creating the evaluation
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "create_eval"}, time.Now())

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	now := time.Now().UTC().UnixNano()
	eval.CreateTime = now
	eval.ModifyTime = now

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Create", &req, &resp); err != nil {
		w.logger.Error("failed to create evaluation", "eval", log.Fmt("%#v", eval), "error", err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Debug("created evaluation", "eval", log.Fmt("%#v", eval))
		w.backoffReset()
	}
	return nil
}
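// UpdateEval, CreateEval, and ReblockEval (below) all share the same retry
// shape: make the RPC, back off on transient errors, and retry. A generic
// distillation of that pattern (hypothetical helper; the real code inlines
// it per endpoint so it can log endpoint-specific messages):
func (w *Worker) submitWithRetry(endpoint string, req, resp interface{}) error {
	for {
		err := w.srv.RPC(endpoint, req, resp)
		if err == nil {
			w.backoffReset()
			return nil
		}
		// Give up on non-transient errors, or when the backoff is
		// interrupted by server shutdown.
		if !w.shouldResubmit(err) || w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			return err
		}
	}
}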
// ReblockEval is used to reinsert a blocked evaluation into the blocked eval
// tracker. This allows the worker to act as the planner for the scheduler.
func (w *Worker) ReblockEval(eval *structs.Evaluation) error {
	// Check for a shutdown before reblocking the evaluation
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "reblock_eval"}, time.Now())

	// Update the evaluation if the queued allocations are not the same as
	// what is recorded in the job summary
	ws := memdb.NewWatchSet()
	summary, err := w.srv.fsm.state.JobSummaryByID(ws, eval.Namespace, eval.JobID)
	if err != nil {
		return fmt.Errorf("couldn't retrieve job summary: %v", err)
	}
	if summary != nil {
		var hasChanged bool
		for tg, summary := range summary.Summary {
			if queued, ok := eval.QueuedAllocations[tg]; ok {
				if queued != summary.Queued {
					hasChanged = true
					break
				}
			}
		}
		if hasChanged {
			if err := w.UpdateEval(eval); err != nil {
				return err
			}
		}
	}

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex
	eval.UpdateModifyTime()

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Reblock", &req, &resp); err != nil {
		w.logger.Error("failed to reblock evaluation", "eval", log.Fmt("%#v", eval), "error", err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Debug("reblocked evaluation", "eval", log.Fmt("%#v", eval))
		w.backoffReset()
	}
	return nil
}

// shouldResubmit checks if a given error should be swallowed and the plan
// resubmitted after a backoff. Usually these are transient errors that
// the cluster should heal from quickly.
func (w *Worker) shouldResubmit(err error) bool {
	s := err.Error()
	switch {
	case strings.Contains(s, "No cluster leader"):
		return true
	case strings.Contains(s, "plan queue is disabled"):
		return true
	default:
		return false
	}
}

// backoffErr is used to do an exponential back off on error. This is
// maintained statefully for the worker. Returns true if attempts should be
// abandoned due to shutdown.
func (w *Worker) backoffErr(base, limit time.Duration) bool {
	backoff := (1 << (2 * w.failures)) * base
	if backoff > limit {
		backoff = limit
	} else {
		w.failures++
	}
	select {
	case <-time.After(backoff):
		return false
	case <-w.srv.shutdownCh:
		return true
	}
}

// backoffReset is used to reset the failure count for
// exponential backoff
func (w *Worker) backoffReset() {
	w.failures = 0
}
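// Illustrative only, not part of this file: with the fast baseline of 20ms,
// backoffErr waits 20ms, 80ms, 320ms, 1.28s, 5.12s, then caps at the 10s
// limit, since the shift grows the wait by a factor of four per failure.
// A standalone sketch of the same arithmetic, with hypothetical names:
func backoffDuration(failures uint, base, limit time.Duration) time.Duration {
	// Quadruple the wait per recorded failure, clamped to the limit.
	backoff := (1 << (2 * failures)) * base
	if backoff > limit {
		backoff = limit
	}
	return backoff
}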