github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/worker.go

package nomad

import (
	"fmt"
	"log"
	"strings"
	"sync"
	"time"

	"github.com/armon/go-metrics"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

const (
	// backoffBaselineFast is the baseline time for exponential backoff
	backoffBaselineFast = 20 * time.Millisecond

	// backoffBaselineSlow is the baseline time for exponential backoff
	// that is much slower than backoffBaselineFast
	backoffBaselineSlow = 500 * time.Millisecond

	// backoffLimitFast is the limit of the exponential backoff
	backoffLimitFast = time.Second

	// backoffLimitSlow is the limit of the exponential backoff for
	// the slower backoff
	backoffLimitSlow = 10 * time.Second

	// backoffSchedulerVersionMismatch is the backoff between retries when the
	// scheduler version mismatches that of the leader.
	backoffSchedulerVersionMismatch = 30 * time.Second

	// dequeueTimeout is used to timeout an evaluation dequeue so that
	// we can check if there is a shutdown event
	dequeueTimeout = 500 * time.Millisecond

	// raftSyncLimit is the limit of time we will wait for Raft replication
	// to catch up to the evaluation. This is used to fast Nack and
	// allow another scheduler to pick it up.
	raftSyncLimit = 5 * time.Second

	// dequeueErrGrace is the grace period where we don't log about
	// dequeue errors after start. This is to improve the user experience
	// in dev mode where the leader isn't elected for a few seconds.
	dequeueErrGrace = 10 * time.Second
)
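
// Note that these baselines and limits are paired differently by call site:
// dequeue errors back off from backoffBaselineFast toward backoffLimitSlow
// (or backoffSchedulerVersionMismatch on a scheduler version mismatch),
// waitForIndex polls from backoffBaselineFast toward backoffLimitFast, and
// failed plan or evaluation submissions retry from backoffBaselineSlow
// toward backoffLimitSlow.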

// Worker is a single threaded scheduling worker. There may be multiple
// running per server (leader or follower). They are responsible for dequeuing
// pending evaluations, invoking schedulers, plan submission and the
// lifecycle around making task allocations. They bridge the business logic
// of the scheduler with the plumbing required to make it all work.
type Worker struct {
	srv    *Server
	logger *log.Logger
	start  time.Time

	paused    bool
	pauseLock sync.Mutex
	pauseCond *sync.Cond

	failures uint

	evalToken string

	// snapshotIndex is the index of the snapshot in which the scheduler was
	// first invoked. It is used to mark the SnapshotIndex of evaluations
	// Created, Updated or Reblocked.
	snapshotIndex uint64
}

// NewWorker starts a new worker associated with the given server
func NewWorker(srv *Server) (*Worker, error) {
	w := &Worker{
		srv:    srv,
		logger: srv.logger,
		start:  time.Now(),
	}
	w.pauseCond = sync.NewCond(&w.pauseLock)
	go w.run()
	return w, nil
}

// SetPause is used to pause or unpause a worker
func (w *Worker) SetPause(p bool) {
	w.pauseLock.Lock()
	w.paused = p
	w.pauseLock.Unlock()
	if !p {
		w.pauseCond.Broadcast()
	}
}

// checkPaused is used to park the worker when paused
func (w *Worker) checkPaused() {
	w.pauseLock.Lock()
	for w.paused {
		w.pauseCond.Wait()
	}
	w.pauseLock.Unlock()
}

// run is the long-lived goroutine which is used to run the worker
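// Each pass dequeues a single evaluation, waits for the local raft state to
// catch up to it, invokes the scheduler, and then acks the evaluation on
// success or nacks it on failure so that another worker may pick it up.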
func (w *Worker) run() {
	for {
		// Dequeue a pending evaluation
		eval, token, waitIndex, shutdown := w.dequeueEvaluation(dequeueTimeout)
		if shutdown {
			return
		}

		// Check for a shutdown
		if w.srv.IsShutdown() {
			w.sendAck(eval.ID, token, false)
			return
		}

		// Wait for the raft log to catch up to the evaluation
		if err := w.waitForIndex(waitIndex, raftSyncLimit); err != nil {
			w.sendAck(eval.ID, token, false)
			continue
		}

		// Invoke the scheduler to determine placements
		if err := w.invokeScheduler(eval, token); err != nil {
			w.sendAck(eval.ID, token, false)
			continue
		}

		// Complete the evaluation
		w.sendAck(eval.ID, token, true)
	}
}

// dequeueEvaluation is used to fetch the next ready evaluation.
// This blocks until an evaluation is available or a timeout is reached.
func (w *Worker) dequeueEvaluation(timeout time.Duration) (
	eval *structs.Evaluation, token string, waitIndex uint64, shutdown bool) {
	// Setup the request
	req := structs.EvalDequeueRequest{
		Schedulers:       w.srv.config.EnabledSchedulers,
		Timeout:          timeout,
		SchedulerVersion: scheduler.SchedulerVersion,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.EvalDequeueResponse

REQ:
	// Check if we are paused
	w.checkPaused()

	// Make a blocking RPC
	start := time.Now()
	err := w.srv.RPC("Eval.Dequeue", &req, &resp)
	metrics.MeasureSince([]string{"nomad", "worker", "dequeue_eval"}, start)
	if err != nil {
		if time.Since(w.start) > dequeueErrGrace && !w.srv.IsShutdown() {
			w.logger.Printf("[ERR] worker: failed to dequeue evaluation: %v", err)
		}

		// Adjust the backoff based on the error. If it is a scheduler version
		// mismatch we increase the baseline.
		base, limit := backoffBaselineFast, backoffLimitSlow
		if strings.Contains(err.Error(), "calling scheduler version") {
			base = backoffSchedulerVersionMismatch
			limit = backoffSchedulerVersionMismatch
		}

		if w.backoffErr(base, limit) {
			return nil, "", 0, true
		}
		goto REQ
	}
	w.backoffReset()

	// Check if we got a response
	if resp.Eval != nil {
		w.logger.Printf("[DEBUG] worker: dequeued evaluation %s", resp.Eval.ID)
		return resp.Eval, resp.Token, resp.GetWaitIndex(), false
	}

	// Check for potential shutdown
	if w.srv.IsShutdown() {
		return nil, "", 0, true
	}
	goto REQ
}

// sendAck makes a best effort to ack or nack the evaluation.
// Any errors are logged but swallowed.
func (w *Worker) sendAck(evalID, token string, ack bool) {
	defer metrics.MeasureSince([]string{"nomad", "worker", "send_ack"}, time.Now())
	// Setup the request
	req := structs.EvalAckRequest{
		EvalID: evalID,
		Token:  token,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

	// Determine if this is an Ack or Nack
	verb := "ack"
	endpoint := "Eval.Ack"
	if !ack {
		verb = "nack"
		endpoint = "Eval.Nack"
	}

	// Make the RPC call
	err := w.srv.RPC(endpoint, &req, &resp)
	if err != nil {
		w.logger.Printf("[ERR] worker: failed to %s evaluation '%s': %v",
			verb, evalID, err)
	} else {
		w.logger.Printf("[DEBUG] worker: %s for evaluation %s", verb, evalID)
	}
}

// waitForIndex ensures that the local state is at least as fresh
// as the given index. This is used before starting an evaluation,
// but also potentially mid-stream. If a Plan fails because of stale
// state (attempt to allocate to a failed/dead node), we may need
// to sync our state again and do the planning with more recent data.
func (w *Worker) waitForIndex(index uint64, timeout time.Duration) error {
	// XXX: Potential optimization is to set up a watch on the state store's
	// index table and only unblock via a trigger rather than timing out and
	// checking.

	start := time.Now()
	defer metrics.MeasureSince([]string{"nomad", "worker", "wait_for_index"}, start)
CHECK:
	// Get the state's current index
	snapshotIndex, err := w.srv.fsm.State().LatestIndex()
	if err != nil {
		return fmt.Errorf("failed to determine state store's index: %v", err)
	}

	// We only need the FSM state to be as recent as the given index
	if index <= snapshotIndex {
		w.backoffReset()
		return nil
	}

	// Check if we've reached our limit
	if time.Now().Sub(start) > timeout {
		return fmt.Errorf("sync wait timeout reached")
	}

	// Exponential back off if we haven't yet reached it
	if w.backoffErr(backoffBaselineFast, backoffLimitFast) {
		return fmt.Errorf("shutdown while waiting for state sync")
	}
	goto CHECK
}

// invokeScheduler is used to invoke the business logic of the scheduler.
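// The worker passes itself to scheduler.NewScheduler so it can act as the
// planner for the scheduler; plan submission and evaluation updates flow
// back through SubmitPlan, UpdateEval, CreateEval and ReblockEval below.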
func (w *Worker) invokeScheduler(eval *structs.Evaluation, token string) error {
	defer metrics.MeasureSince([]string{"nomad", "worker", "invoke_scheduler", eval.Type}, time.Now())
	// Store the evaluation token
	w.evalToken = token

	// Snapshot the current state
	snap, err := w.srv.fsm.State().Snapshot()
	if err != nil {
		return fmt.Errorf("failed to snapshot state: %v", err)
	}

	// Store the snapshot's index
	w.snapshotIndex, err = snap.LatestIndex()
	if err != nil {
		return fmt.Errorf("failed to determine snapshot's index: %v", err)
	}

	// Create the scheduler, or use the special system scheduler
	var sched scheduler.Scheduler
	if eval.Type == structs.JobTypeCore {
		sched = NewCoreScheduler(w.srv, snap)
	} else {
		sched, err = scheduler.NewScheduler(eval.Type, w.logger, snap, w)
		if err != nil {
			return fmt.Errorf("failed to instantiate scheduler: %v", err)
		}
	}

	// Process the evaluation
	err = sched.Process(eval)
	if err != nil {
		return fmt.Errorf("failed to process evaluation: %v", err)
	}
	return nil
}

// SubmitPlan is used to submit a plan for consideration. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler.State, error) {
	// Check for a shutdown before plan submission
	if w.srv.IsShutdown() {
		return nil, nil, fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "submit_plan"}, time.Now())

	// Add the evaluation token to the plan
	plan.EvalToken = w.evalToken

	// Setup the request
	req := structs.PlanRequest{
		Plan: plan,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.PlanResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Plan.Submit", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to submit plan for evaluation %s: %v",
			plan.EvalID, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return nil, nil, err
	} else {
		w.logger.Printf("[DEBUG] worker: submitted plan at index %d for evaluation %s", resp.Index, plan.EvalID)
		w.backoffReset()
	}

	// Look for a result
	result := resp.Result
	if result == nil {
		return nil, nil, fmt.Errorf("missing result")
	}

	// Check if a state update is required. This could be required if we were
	// planning based on stale data, which is causing issues. For example, a
	// node failure since the time we've started planning or conflicting task
	// allocations.
	var state scheduler.State
	if result.RefreshIndex != 0 {
		// Wait for the raft log to catch up to the evaluation
		w.logger.Printf("[DEBUG] worker: refreshing state to index %d for %q", result.RefreshIndex, plan.EvalID)
		if err := w.waitForIndex(result.RefreshIndex, raftSyncLimit); err != nil {
			return nil, nil, err
		}

		// Snapshot the current state
		snap, err := w.srv.fsm.State().Snapshot()
		if err != nil {
			return nil, nil, fmt.Errorf("failed to snapshot state: %v", err)
		}
		state = snap
	}

	// Return the result and potential state update
	return result, state, nil
}

// UpdateEval is used to submit an updated evaluation. This allows
// the worker to act as the planner for the scheduler.
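// Like CreateEval and ReblockEval below, it stamps the evaluation with the
// worker's snapshot index, attaches the worker's eval token to the request,
// and retries transient submission failures (see shouldResubmit) with the
// slow backoff.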
func (w *Worker) UpdateEval(eval *structs.Evaluation) error {
	// Check for a shutdown before plan submission
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "update_eval"}, time.Now())

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Update", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to update evaluation %#v: %v",
			eval, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Printf("[DEBUG] worker: updated evaluation %#v", eval)
		w.backoffReset()
	}
	return nil
}

// CreateEval is used to create a new evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) CreateEval(eval *structs.Evaluation) error {
	// Check for a shutdown before plan submission
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "create_eval"}, time.Now())

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Create", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to create evaluation %#v: %v",
			eval, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Printf("[DEBUG] worker: created evaluation %#v", eval)
		w.backoffReset()
	}
	return nil
}

// ReblockEval is used to reinsert a blocked evaluation into the blocked eval
// tracker. This allows the worker to act as the planner for the scheduler.
func (w *Worker) ReblockEval(eval *structs.Evaluation) error {
	// Check for a shutdown before plan submission
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "reblock_eval"}, time.Now())

	// Update the evaluation if the queued allocations are not the same as
	// what is recorded in the job summary
	ws := memdb.NewWatchSet()
	summary, err := w.srv.fsm.state.JobSummaryByID(ws, eval.Namespace, eval.JobID)
	if err != nil {
		return fmt.Errorf("couldn't retrieve job summary: %v", err)
	}
	if summary != nil {
		var hasChanged bool
		for tg, summary := range summary.Summary {
			if queued, ok := eval.QueuedAllocations[tg]; ok {
				if queued != summary.Queued {
					hasChanged = true
					break
				}
			}
		}
		if hasChanged {
			if err := w.UpdateEval(eval); err != nil {
				return err
			}
		}
	}

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Reblock", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to reblock evaluation %#v: %v",
			eval, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Printf("[DEBUG] worker: reblocked evaluation %#v", eval)
		w.backoffReset()
	}
	return nil
}

// shouldResubmit checks if a given error should be swallowed and the plan
// resubmitted after a backoff. Usually these are transient errors that
// the cluster should heal from quickly.
func (w *Worker) shouldResubmit(err error) bool {
	s := err.Error()
	switch {
	case strings.Contains(s, "No cluster leader"):
		return true
	case strings.Contains(s, "plan queue is disabled"):
		return true
	default:
		return false
	}
}

// backoffErr is used to do an exponential back off on error. This is
// maintained statefully for the worker. Returns true if attempts should
// be abandoned due to shutdown.
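// The computed backoff quadruples with each recorded failure (for a 20ms
// base: 20ms, 80ms, 320ms, 1.28s, ...); any value above the limit is clamped
// to the limit, and the failure count stops growing once the clamp is hit.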
func (w *Worker) backoffErr(base, limit time.Duration) bool {
	backoff := (1 << (2 * w.failures)) * base
	if backoff > limit {
		backoff = limit
	} else {
		w.failures++
	}
	select {
	case <-time.After(backoff):
		return false
	case <-w.srv.shutdownCh:
		return true
	}
}

// backoffReset is used to reset the failure count for
// exponential backoff
func (w *Worker) backoffReset() {
	w.failures = 0
}