package nomad

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	metrics "github.com/armon/go-metrics"
	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

const (
	// backoffBaselineFast is the baseline time for exponential backoff
	backoffBaselineFast = 20 * time.Millisecond

	// backoffBaselineSlow is the baseline time for exponential backoff
	// but that is much slower than backoffBaselineFast
	backoffBaselineSlow = 500 * time.Millisecond

	// backoffLimitSlow is the limit of the exponential backoff for
	// the slower backoff
	backoffLimitSlow = 10 * time.Second

	// backoffSchedulerVersionMismatch is the backoff between retries when the
	// scheduler version mismatches that of the leader.
	backoffSchedulerVersionMismatch = 30 * time.Second

	// dequeueTimeout is used to timeout an evaluation dequeue so that
	// we can check if there is a shutdown event
	dequeueTimeout = 500 * time.Millisecond

	// raftSyncLimit is the limit of time we will wait for Raft replication
	// to catch up to the evaluation. This is used to fast Nack and
	// allow another scheduler to pick it up.
	raftSyncLimit = 5 * time.Second

	// dequeueErrGrace is the grace period where we don't log about
	// dequeue errors after start. This is to improve the user experience
	// in dev mode where the leader isn't elected for a few seconds.
	dequeueErrGrace = 10 * time.Second
)

// Worker is a single threaded scheduling worker. There may be multiple
// running per server (leader or follower). They are responsible for dequeuing
// pending evaluations, invoking schedulers, plan submission and the
// lifecycle around making task allocations. They bridge the business logic
// of the scheduler with the plumbing required to make it all work.
type Worker struct {
	srv    *Server
	logger log.Logger

	// start is when the worker was created; used to suppress dequeue
	// error logging during the dequeueErrGrace window.
	start time.Time

	// paused, pauseLock and pauseCond implement cooperative pausing:
	// SetPause flips the flag under pauseLock, and checkPaused parks the
	// run loop on the condition variable until the flag clears.
	paused    bool
	pauseLock sync.Mutex
	pauseCond *sync.Cond

	// failures counts consecutive errors and drives the exponential
	// backoff in backoffErr; reset by backoffReset on success.
	failures uint

	// evalToken is the broker token for the evaluation currently being
	// processed; it is attached to plans and eval updates so the leader
	// can reject submissions from workers holding a stale eval.
	evalToken string

	// snapshotIndex is the index of the snapshot in which the scheduler was
	// first invoked. It is used to mark the SnapshotIndex of evaluations
	// Created, Updated or Reblocked.
	snapshotIndex uint64
}

// NewWorker starts a new worker associated with the given server
func NewWorker(srv *Server) (*Worker, error) {
	w := &Worker{
		srv:    srv,
		logger: srv.logger.ResetNamed("worker"),
		start:  time.Now(),
	}
	w.pauseCond = sync.NewCond(&w.pauseLock)
	// The run loop lives for the life of the server; it exits only when a
	// shutdown is observed via the dequeue or backoff paths.
	go w.run()
	return w, nil
}

// SetPause is used to pause or unpause a worker
func (w *Worker) SetPause(p bool) {
	w.pauseLock.Lock()
	w.paused = p
	w.pauseLock.Unlock()
	// Wake any worker parked in checkPaused once unpaused.
	if !p {
		w.pauseCond.Broadcast()
	}
}

// checkPaused is used to park the worker when paused
func (w *Worker) checkPaused() {
	w.pauseLock.Lock()
	// Loop around Wait to guard against spurious wakeups; only proceed
	// once SetPause(false) has cleared the flag and broadcast.
	for w.paused {
		w.pauseCond.Wait()
	}
	w.pauseLock.Unlock()
}

// run is the long-lived goroutine which is used to run the worker.
// Each iteration dequeues one evaluation, waits for the local Raft state
// to catch up, invokes the scheduler, and then acks (on success) or
// nacks (on any failure) the evaluation with the broker.
func (w *Worker) run() {
	for {
		// Dequeue a pending evaluation
		eval, token, waitIndex, shutdown := w.dequeueEvaluation(dequeueTimeout)
		if shutdown {
			return
		}

		// Check for a shutdown; nack so another worker can pick the eval up.
		if w.srv.IsShutdown() {
			w.logger.Error("nacking eval because the server is shutting down", "eval", log.Fmt("%#v", eval))
			w.sendAck(eval.ID, token, false)
			return
		}

		// Wait for the raft log to catchup to the evaluation
		snap, err := w.snapshotAfter(waitIndex, raftSyncLimit)
		if err != nil {
			w.logger.Error("error waiting for Raft index", "error", err, "index", waitIndex)
			w.sendAck(eval.ID, token, false)
			continue
		}

		// Invoke the scheduler to determine placements
		if err := w.invokeScheduler(snap, eval, token); err != nil {
			w.logger.Error("error invoking scheduler", "error", err)
			w.sendAck(eval.ID, token, false)
			continue
		}

		// Complete the evaluation
		w.sendAck(eval.ID, token, true)
	}
}

// dequeueEvaluation is used to fetch the next ready evaluation.
// This blocks until an evaluation is available or a timeout is reached.
// It retries indefinitely (with backoff on errors) and returns
// shutdown=true only when the server is shutting down.
func (w *Worker) dequeueEvaluation(timeout time.Duration) (
	eval *structs.Evaluation, token string, waitIndex uint64, shutdown bool) {
	// Setup the request
	req := structs.EvalDequeueRequest{
		Schedulers:       w.srv.config.EnabledSchedulers,
		Timeout:          timeout,
		SchedulerVersion: scheduler.SchedulerVersion,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.EvalDequeueResponse

REQ:
	// Check if we are paused
	w.checkPaused()

	// Make a blocking RPC
	start := time.Now()
	err := w.srv.RPC("Eval.Dequeue", &req, &resp)
	metrics.MeasureSince([]string{"nomad", "worker", "dequeue_eval"}, start)
	if err != nil {
		// Stay quiet during the startup grace period so dev-mode users
		// aren't spammed while leader election is still in progress.
		if time.Since(w.start) > dequeueErrGrace && !w.srv.IsShutdown() {
			w.logger.Error("failed to dequeue evaluation", "error", err)
		}

		// Adjust the backoff based on the error. If it is a scheduler version
		// mismatch we increase the baseline.
		base, limit := backoffBaselineFast, backoffLimitSlow
		if strings.Contains(err.Error(), "calling scheduler version") {
			base = backoffSchedulerVersionMismatch
			limit = backoffSchedulerVersionMismatch
		}

		// backoffErr returns true only when the server shut down mid-wait.
		if w.backoffErr(base, limit) {
			return nil, "", 0, true
		}
		goto REQ
	}
	w.backoffReset()

	// Check if we got a response; a nil Eval means the dequeue timed out
	// with nothing ready, so we loop and ask again.
	if resp.Eval != nil {
		w.logger.Debug("dequeued evaluation", "eval_id", resp.Eval.ID)
		return resp.Eval, resp.Token, resp.GetWaitIndex(), false
	}

	// Check for potential shutdown
	if w.srv.IsShutdown() {
		return nil, "", 0, true
	}
	goto REQ
}

// sendAck makes a best effort to ack or nack the evaluation.
// Any errors are logged but swallowed.
func (w *Worker) sendAck(evalID, token string, ack bool) {
	defer metrics.MeasureSince([]string{"nomad", "worker", "send_ack"}, time.Now())
	// Setup the request
	req := structs.EvalAckRequest{
		EvalID: evalID,
		Token:  token,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

	// Determine if this is an Ack or Nack
	verb := "ack"
	endpoint := "Eval.Ack"
	if !ack {
		verb = "nack"
		endpoint = "Eval.Nack"
	}

	// Make the RPC call
	err := w.srv.RPC(endpoint, &req, &resp)
	if err != nil {
		w.logger.Error(fmt.Sprintf("failed to %s evaluation", verb), "eval_id", evalID, "error", err)
	} else {
		w.logger.Debug(fmt.Sprintf("%s evaluation", verb), "eval_id", evalID)
	}
}

// snapshotAfter times calls to StateStore.SnapshotAfter which may block.
func (w *Worker) snapshotAfter(waitIndex uint64, timeout time.Duration) (*state.StateSnapshot, error) {
	start := time.Now()
	// Bound the wait by both the caller's timeout and server shutdown,
	// since shutdownCtx is the parent of the timeout context.
	ctx, cancel := context.WithTimeout(w.srv.shutdownCtx, timeout)
	snap, err := w.srv.fsm.State().SnapshotAfter(ctx, waitIndex)
	cancel()
	metrics.MeasureSince([]string{"nomad", "worker", "wait_for_index"}, start)

	// Wrap error to ensure callers don't disregard timeouts.
	if err == context.DeadlineExceeded {
		err = fmt.Errorf("timed out after %s waiting for index=%d", timeout, waitIndex)
	}

	return snap, err
}

// invokeScheduler is used to invoke the business logic of the scheduler
// against the given state snapshot for a single evaluation.
func (w *Worker) invokeScheduler(snap *state.StateSnapshot, eval *structs.Evaluation, token string) error {
	defer metrics.MeasureSince([]string{"nomad", "worker", "invoke_scheduler", eval.Type}, time.Now())
	// Store the evaluation token so subsequent SubmitPlan/UpdateEval/
	// CreateEval/ReblockEval calls can prove ownership of this eval.
	w.evalToken = token

	// Store the snapshot's index so evals written later carry the index
	// the scheduler's view of state was based on.
	var err error
	w.snapshotIndex, err = snap.LatestIndex()
	if err != nil {
		return fmt.Errorf("failed to determine snapshot's index: %v", err)
	}

	// Create the scheduler, or use the special system scheduler
	var sched scheduler.Scheduler
	if eval.Type == structs.JobTypeCore {
		sched = NewCoreScheduler(w.srv, snap)
	} else {
		// w is passed as the planner so the scheduler calls back into
		// this worker's SubmitPlan/UpdateEval/CreateEval/ReblockEval.
		sched, err = scheduler.NewScheduler(eval.Type, w.logger, snap, w)
		if err != nil {
			return fmt.Errorf("failed to instantiate scheduler: %v", err)
		}
	}

	// Process the evaluation
	err = sched.Process(eval)
	if err != nil {
		return fmt.Errorf("failed to process evaluation: %v", err)
	}
	return nil
}

// SubmitPlan is used to submit a plan for consideration. This allows
// the worker to act as the planner for the scheduler. On success it
// returns the plan result and, if the leader indicated our state was
// stale, a refreshed state snapshot for the scheduler to retry with.
func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler.State, error) {
	// Check for a shutdown before plan submission
	if w.srv.IsShutdown() {
		return nil, nil, fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "submit_plan"}, time.Now())

	// Add the evaluation token to the plan
	plan.EvalToken = w.evalToken

	// Normalize stopped and preempted allocs before RPC, but only once
	// every server in the cluster understands normalized plans.
	normalizePlan := ServersMeetMinimumVersion(w.srv.Members(), MinVersionPlanNormalization, true)
	if normalizePlan {
		plan.NormalizeAllocations()
	}

	// Setup the request
	req := structs.PlanRequest{
		Plan: plan,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.PlanResponse

SUBMIT:
	// Make the RPC call; retry only transient errors, and give up if the
	// backoff wait observed a shutdown.
	if err := w.srv.RPC("Plan.Submit", &req, &resp); err != nil {
		w.logger.Error("failed to submit plan for evaluation", "eval_id", plan.EvalID, "error", err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return nil, nil, err
	} else {
		w.logger.Debug("submitted plan for evaluation", "eval_id", plan.EvalID)
		w.backoffReset()
	}

	// Look for a result
	result := resp.Result
	if result == nil {
		return nil, nil, fmt.Errorf("missing result")
	}

	// Check if a state update is required. This could be required if we
	// planning based on stale data, which is causing issues. For example, a
	// node failure since the time we've started planning or conflicting task
	// allocations.
	var state scheduler.State
	if result.RefreshIndex != 0 {
		// Wait for the raft log to catchup to the evaluation
		w.logger.Debug("refreshing state", "refresh_index", result.RefreshIndex, "eval_id", plan.EvalID)

		var err error
		state, err = w.snapshotAfter(result.RefreshIndex, raftSyncLimit)
		if err != nil {
			return nil, nil, err
		}
	}

	// Return the result and potential state update
	return result, state, nil
}

// UpdateEval is used to submit an updated evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) UpdateEval(eval *structs.Evaluation) error {
	// Check for a shutdown before plan submission
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "update_eval"}, time.Now())

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call; retry only transient errors, and give up if the
	// backoff wait observed a shutdown.
	if err := w.srv.RPC("Eval.Update", &req, &resp); err != nil {
		w.logger.Error("failed to update evaluation", "eval", log.Fmt("%#v", eval), "error", err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Debug("updated evaluation", "eval", log.Fmt("%#v", eval))
		w.backoffReset()
	}
	return nil
}

// CreateEval is used to create a new evaluation. This allows
// the worker to act as the planner for the scheduler.
380 func (w *Worker) CreateEval(eval *structs.Evaluation) error { 381 // Check for a shutdown before plan submission 382 if w.srv.IsShutdown() { 383 return fmt.Errorf("shutdown while planning") 384 } 385 defer metrics.MeasureSince([]string{"nomad", "worker", "create_eval"}, time.Now()) 386 387 // Store the snapshot index in the eval 388 eval.SnapshotIndex = w.snapshotIndex 389 390 // Setup the request 391 req := structs.EvalUpdateRequest{ 392 Evals: []*structs.Evaluation{eval}, 393 EvalToken: w.evalToken, 394 WriteRequest: structs.WriteRequest{ 395 Region: w.srv.config.Region, 396 }, 397 } 398 var resp structs.GenericResponse 399 400 SUBMIT: 401 // Make the RPC call 402 if err := w.srv.RPC("Eval.Create", &req, &resp); err != nil { 403 w.logger.Error("failed to create evaluation", "eval", log.Fmt("%#v", eval), "error", err) 404 if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) { 405 goto SUBMIT 406 } 407 return err 408 } else { 409 w.logger.Debug("created evaluation", "eval", log.Fmt("%#v", eval)) 410 w.backoffReset() 411 } 412 return nil 413 } 414 415 // ReblockEval is used to reinsert a blocked evaluation into the blocked eval 416 // tracker. This allows the worker to act as the planner for the scheduler. 
func (w *Worker) ReblockEval(eval *structs.Evaluation) error {
	// Check for a shutdown before plan submission
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "reblock_eval"}, time.Now())

	// Update the evaluation if the queued jobs is not same as what is
	// recorded in the job summary
	ws := memdb.NewWatchSet()
	summary, err := w.srv.fsm.state.JobSummaryByID(ws, eval.Namespace, eval.JobID)
	if err != nil {
		return fmt.Errorf("couldn't retrieve job summary: %v", err)
	}
	if summary != nil {
		var hasChanged bool
		// NOTE: the range variable shadows the outer summary; inside the
		// loop it is the per-task-group summary. Only task groups present
		// in eval.QueuedAllocations are compared.
		for tg, summary := range summary.Summary {
			if queued, ok := eval.QueuedAllocations[tg]; ok {
				if queued != summary.Queued {
					hasChanged = true
					break
				}
			}
		}
		// Persist the eval first so its queued-allocation counts match
		// the job summary before it is reblocked.
		if hasChanged {
			if err := w.UpdateEval(eval); err != nil {
				return err
			}
		}
	}

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call; retry only transient errors, and give up if the
	// backoff wait observed a shutdown.
	if err := w.srv.RPC("Eval.Reblock", &req, &resp); err != nil {
		w.logger.Error("failed to reblock evaluation", "eval", log.Fmt("%#v", eval), "error", err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Debug("reblocked evaluation", "eval", log.Fmt("%#v", eval))
		w.backoffReset()
	}
	return nil
}

// shouldResubmit checks if a given error should be swallowed and the plan
// resubmitted after a backoff. Usually these are transient errors that
// the cluster should heal from quickly.
479 func (w *Worker) shouldResubmit(err error) bool { 480 s := err.Error() 481 switch { 482 case strings.Contains(s, "No cluster leader"): 483 return true 484 case strings.Contains(s, "plan queue is disabled"): 485 return true 486 default: 487 return false 488 } 489 } 490 491 // backoffErr is used to do an exponential back off on error. This is 492 // maintained statefully for the worker. Returns if attempts should be 493 // abandoned due to shutdown. 494 func (w *Worker) backoffErr(base, limit time.Duration) bool { 495 backoff := (1 << (2 * w.failures)) * base 496 if backoff > limit { 497 backoff = limit 498 } else { 499 w.failures++ 500 } 501 select { 502 case <-time.After(backoff): 503 return false 504 case <-w.srv.shutdownCh: 505 return true 506 } 507 } 508 509 // backoffReset is used to reset the failure count for 510 // exponential backoff 511 func (w *Worker) backoffReset() { 512 w.failures = 0 513 }