github.com/maier/nomad@v0.4.1-0.20161110003312-a9e3d0b8549d/nomad/worker.go

package nomad

import (
    "fmt"
    "log"
    "strings"
    "sync"
    "time"

    "github.com/armon/go-metrics"
    "github.com/hashicorp/nomad/nomad/structs"
    "github.com/hashicorp/nomad/scheduler"
)

const (
    // backoffBaselineFast is the baseline time for exponential backoff
    backoffBaselineFast = 20 * time.Millisecond

    // backoffBaselineSlow is the baseline time for exponential backoff
    // but that is much slower than backoffBaselineFast
    backoffBaselineSlow = 500 * time.Millisecond

    // backoffLimitFast is the limit of the exponential backoff
    backoffLimitFast = time.Second

    // backoffLimitSlow is the limit of the exponential backoff for
    // the slower backoff
    backoffLimitSlow = 10 * time.Second

    // backoffSchedulerVersionMismatch is the backoff between retries when the
    // scheduler version mismatches that of the leader.
    backoffSchedulerVersionMismatch = 30 * time.Second

    // dequeueTimeout is used to timeout an evaluation dequeue so that
    // we can check if there is a shutdown event
    dequeueTimeout = 500 * time.Millisecond

    // raftSyncLimit is the limit of time we will wait for Raft replication
    // to catch up to the evaluation. This is used to fast Nack and
    // allow another scheduler to pick it up.
    raftSyncLimit = 5 * time.Second

    // dequeueErrGrace is the grace period where we don't log about
    // dequeue errors after start. This is to improve the user experience
    // in dev mode where the leader isn't elected for a few seconds.
    dequeueErrGrace = 10 * time.Second
)

// Worker is a single-threaded scheduling worker. There may be multiple
// running per server (leader or follower). They are responsible for dequeuing
// pending evaluations, invoking schedulers, plan submission and the
// lifecycle around making task allocations. They bridge the business logic
// of the scheduler with the plumbing required to make it all work.
type Worker struct {
    srv    *Server
    logger *log.Logger
    start  time.Time

    paused    bool
    pauseLock sync.Mutex
    pauseCond *sync.Cond

    failures uint

    evalToken string

    // snapshotIndex is the index of the snapshot in which the scheduler was
    // first invoked. It is used to mark the SnapshotIndex of evaluations
    // Created, Updated or Reblocked.
    snapshotIndex uint64
}
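// Usage sketch (illustrative only, not part of the original file): a caller
// that owns a *Server would typically start a worker per scheduler goroutine
// and park/resume it with SetPause, e.g.:
//
//	w, err := NewWorker(srv)
//	if err != nil {
//		// handle the error
//	}
//	w.SetPause(true)  // park the run loop before its next dequeue
//	w.SetPause(false) // wake it up so it resumes dequeuing evaluations
//
// Note that pausing only takes effect at the next checkPaused call; it does
// not interrupt an evaluation that is already being processed.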
// NewWorker starts a new worker associated with the given server
func NewWorker(srv *Server) (*Worker, error) {
    w := &Worker{
        srv:    srv,
        logger: srv.logger,
        start:  time.Now(),
    }
    w.pauseCond = sync.NewCond(&w.pauseLock)
    go w.run()
    return w, nil
}

// SetPause is used to pause or unpause a worker
func (w *Worker) SetPause(p bool) {
    w.pauseLock.Lock()
    w.paused = p
    w.pauseLock.Unlock()
    if !p {
        w.pauseCond.Broadcast()
    }
}

// checkPaused is used to park the worker when paused
func (w *Worker) checkPaused() {
    w.pauseLock.Lock()
    for w.paused {
        w.pauseCond.Wait()
    }
    w.pauseLock.Unlock()
}

// run is the long-lived goroutine which is used to run the worker
func (w *Worker) run() {
    for {
        // Dequeue a pending evaluation
        eval, token, shutdown := w.dequeueEvaluation(dequeueTimeout)
        if shutdown {
            return
        }

        // Check for a shutdown
        if w.srv.IsShutdown() {
            w.sendAck(eval.ID, token, false)
            return
        }

        // Wait for the Raft log to catch up to the evaluation
        if err := w.waitForIndex(eval.ModifyIndex, raftSyncLimit); err != nil {
            w.sendAck(eval.ID, token, false)
            continue
        }

        // Invoke the scheduler to determine placements
        if err := w.invokeScheduler(eval, token); err != nil {
            w.sendAck(eval.ID, token, false)
            continue
        }

        // Complete the evaluation
        w.sendAck(eval.ID, token, true)
    }
}

// dequeueEvaluation is used to fetch the next ready evaluation.
// This blocks until an evaluation is available or a timeout is reached.
func (w *Worker) dequeueEvaluation(timeout time.Duration) (*structs.Evaluation, string, bool) {
    // Setup the request
    req := structs.EvalDequeueRequest{
        Schedulers:       w.srv.config.EnabledSchedulers,
        Timeout:          timeout,
        SchedulerVersion: scheduler.SchedulerVersion,
        WriteRequest: structs.WriteRequest{
            Region: w.srv.config.Region,
        },
    }
    var resp structs.EvalDequeueResponse

REQ:
    // Check if we are paused
    w.checkPaused()

    // Make a blocking RPC
    start := time.Now()
    err := w.srv.RPC("Eval.Dequeue", &req, &resp)
    metrics.MeasureSince([]string{"nomad", "worker", "dequeue_eval"}, start)
    if err != nil {
        if time.Since(w.start) > dequeueErrGrace && !w.srv.IsShutdown() {
            w.logger.Printf("[ERR] worker: failed to dequeue evaluation: %v", err)
        }

        // Adjust the backoff based on the error. If it is a scheduler version
        // mismatch we increase the baseline.
        base, limit := backoffBaselineFast, backoffLimitSlow
        if strings.Contains(err.Error(), "calling scheduler version") {
            base = backoffSchedulerVersionMismatch
            limit = backoffSchedulerVersionMismatch
        }

        if w.backoffErr(base, limit) {
            return nil, "", true
        }
        goto REQ
    }
    w.backoffReset()

    // Check if we got a response
    if resp.Eval != nil {
        w.logger.Printf("[DEBUG] worker: dequeued evaluation %s", resp.Eval.ID)
        return resp.Eval, resp.Token, false
    }

    // Check for potential shutdown
    if w.srv.IsShutdown() {
        return nil, "", true
    }
    goto REQ
}
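// Note on dequeue retries (descriptive comment added for clarity, not in the
// original file): when Eval.Dequeue fails, dequeueEvaluation backs off via
// backoffErr with backoffBaselineFast/backoffLimitSlow, so successive waits
// grow roughly as 20ms, 80ms, 320ms, ... up to the 10s cap. A scheduler
// version mismatch instead uses backoffSchedulerVersionMismatch (30s) for
// both base and limit, producing a flat 30s retry interval, since retrying
// faster cannot help until the scheduler versions agree.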
// sendAck makes a best effort to ack or nack the evaluation.
// Any errors are logged but swallowed.
func (w *Worker) sendAck(evalID, token string, ack bool) {
    defer metrics.MeasureSince([]string{"nomad", "worker", "send_ack"}, time.Now())
    // Setup the request
    req := structs.EvalAckRequest{
        EvalID: evalID,
        Token:  token,
        WriteRequest: structs.WriteRequest{
            Region: w.srv.config.Region,
        },
    }
    var resp structs.GenericResponse

    // Determine if this is an Ack or Nack
    verb := "ack"
    endpoint := "Eval.Ack"
    if !ack {
        verb = "nack"
        endpoint = "Eval.Nack"
    }

    // Make the RPC call
    err := w.srv.RPC(endpoint, &req, &resp)
    if err != nil {
        w.logger.Printf("[ERR] worker: failed to %s evaluation '%s': %v",
            verb, evalID, err)
    } else {
        w.logger.Printf("[DEBUG] worker: %s for evaluation %s", verb, evalID)
    }
}

// waitForIndex ensures that the local state is at least as fresh
// as the given index. This is used before starting an evaluation,
// but also potentially mid-stream. If a Plan fails because of stale
// state (attempt to allocate to a failed/dead node), we may need
// to sync our state again and do the planning with more recent data.
func (w *Worker) waitForIndex(index uint64, timeout time.Duration) error {
    // XXX: Potential optimization is to set up a watch on the state store's
    // index table and only unblock via a trigger rather than timing out and
    // checking.

    start := time.Now()
    defer metrics.MeasureSince([]string{"nomad", "worker", "wait_for_index"}, start)
CHECK:
    // Get the state's current index
    snapshotIndex, err := w.srv.fsm.State().LatestIndex()
    if err != nil {
        return fmt.Errorf("failed to determine state store's index: %v", err)
    }

    // We only need the FSM state to be as recent as the given index
    if index <= snapshotIndex {
        w.backoffReset()
        return nil
    }

    // Check if we've reached our limit
    if time.Since(start) > timeout {
        return fmt.Errorf("sync wait timeout reached")
    }

    // Exponential back off if we haven't yet reached it
    if w.backoffErr(backoffBaselineFast, backoffLimitFast) {
        return fmt.Errorf("shutdown while waiting for state sync")
    }
    goto CHECK
}

// invokeScheduler is used to invoke the business logic of the scheduler
func (w *Worker) invokeScheduler(eval *structs.Evaluation, token string) error {
    defer metrics.MeasureSince([]string{"nomad", "worker", "invoke_scheduler", eval.Type}, time.Now())
    // Store the evaluation token
    w.evalToken = token

    // Snapshot the current state
    snap, err := w.srv.fsm.State().Snapshot()
    if err != nil {
        return fmt.Errorf("failed to snapshot state: %v", err)
    }

    // Store the snapshot's index
    w.snapshotIndex, err = snap.LatestIndex()
    if err != nil {
        return fmt.Errorf("failed to determine snapshot's index: %v", err)
    }

    // Create the scheduler, or use the special core scheduler
    var sched scheduler.Scheduler
    if eval.Type == structs.JobTypeCore {
        sched = NewCoreScheduler(w.srv, snap)
    } else {
        sched, err = scheduler.NewScheduler(eval.Type, w.logger, snap, w)
        if err != nil {
            return fmt.Errorf("failed to instantiate scheduler: %v", err)
        }
    }

    // Process the evaluation
    err = sched.Process(eval)
    if err != nil {
        return fmt.Errorf("failed to process evaluation: %v", err)
    }
    return nil
}
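// The methods below (SubmitPlan, UpdateEval, CreateEval, ReblockEval) are the
// hooks the scheduler calls back into, which is why invokeScheduler passes the
// Worker itself as the final argument to scheduler.NewScheduler. A sketch of a
// compile-time check for that relationship (illustrative only; it assumes the
// scheduler.Planner interface covers exactly this method set and is not part
// of the original file):
//
//	var _ scheduler.Planner = (*Worker)(nil)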
// SubmitPlan is used to submit a plan for consideration. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler.State, error) {
    // Check for a shutdown before plan submission
    if w.srv.IsShutdown() {
        return nil, nil, fmt.Errorf("shutdown while planning")
    }
    defer metrics.MeasureSince([]string{"nomad", "worker", "submit_plan"}, time.Now())

    // Add the evaluation token to the plan
    plan.EvalToken = w.evalToken

    // Setup the request
    req := structs.PlanRequest{
        Plan: plan,
        WriteRequest: structs.WriteRequest{
            Region: w.srv.config.Region,
        },
    }
    var resp structs.PlanResponse

SUBMIT:
    // Make the RPC call
    if err := w.srv.RPC("Plan.Submit", &req, &resp); err != nil {
        w.logger.Printf("[ERR] worker: failed to submit plan for evaluation %s: %v",
            plan.EvalID, err)
        if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
            goto SUBMIT
        }
        return nil, nil, err
    } else {
        w.logger.Printf("[DEBUG] worker: submitted plan for evaluation %s", plan.EvalID)
        w.backoffReset()
    }

    // Look for a result
    result := resp.Result
    if result == nil {
        return nil, nil, fmt.Errorf("missing result")
    }

    // Check if a state update is required. This could be required if we were
    // planning based on stale data, which is causing issues. For example, a
    // node failure since the time we started planning or conflicting task
    // allocations.
    var state scheduler.State
    if result.RefreshIndex != 0 {
        // Wait for the Raft log to catch up to the evaluation
        w.logger.Printf("[DEBUG] worker: refreshing state to index %d for %q", result.RefreshIndex, plan.EvalID)
        if err := w.waitForIndex(result.RefreshIndex, raftSyncLimit); err != nil {
            return nil, nil, err
        }

        // Snapshot the current state
        snap, err := w.srv.fsm.State().Snapshot()
        if err != nil {
            return nil, nil, fmt.Errorf("failed to snapshot state: %v", err)
        }
        state = snap
    }

    // Return the result and potential state update
    return result, state, nil
}

// UpdateEval is used to submit an updated evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) UpdateEval(eval *structs.Evaluation) error {
    // Check for a shutdown before plan submission
    if w.srv.IsShutdown() {
        return fmt.Errorf("shutdown while planning")
    }
    defer metrics.MeasureSince([]string{"nomad", "worker", "update_eval"}, time.Now())

    // Store the snapshot index in the eval
    eval.SnapshotIndex = w.snapshotIndex

    // Setup the request
    req := structs.EvalUpdateRequest{
        Evals:     []*structs.Evaluation{eval},
        EvalToken: w.evalToken,
        WriteRequest: structs.WriteRequest{
            Region: w.srv.config.Region,
        },
    }
    var resp structs.GenericResponse

SUBMIT:
    // Make the RPC call
    if err := w.srv.RPC("Eval.Update", &req, &resp); err != nil {
        w.logger.Printf("[ERR] worker: failed to update evaluation %#v: %v",
            eval, err)
        if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
            goto SUBMIT
        }
        return err
    } else {
        w.logger.Printf("[DEBUG] worker: updated evaluation %#v", eval)
        w.backoffReset()
    }
    return nil
}
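// Note (descriptive comment added for clarity, not in the original file):
// UpdateEval, CreateEval and ReblockEval all stamp eval.SnapshotIndex with
// w.snapshotIndex, which invokeScheduler captured from the state snapshot the
// scheduler ran against. Per the snapshotIndex field comment above, this marks
// how fresh the worker's view of state was when the evaluation was created,
// updated or reblocked.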
// CreateEval is used to create a new evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) CreateEval(eval *structs.Evaluation) error {
    // Check for a shutdown before plan submission
    if w.srv.IsShutdown() {
        return fmt.Errorf("shutdown while planning")
    }
    defer metrics.MeasureSince([]string{"nomad", "worker", "create_eval"}, time.Now())

    // Store the snapshot index in the eval
    eval.SnapshotIndex = w.snapshotIndex

    // Setup the request
    req := structs.EvalUpdateRequest{
        Evals:     []*structs.Evaluation{eval},
        EvalToken: w.evalToken,
        WriteRequest: structs.WriteRequest{
            Region: w.srv.config.Region,
        },
    }
    var resp structs.GenericResponse

SUBMIT:
    // Make the RPC call
    if err := w.srv.RPC("Eval.Create", &req, &resp); err != nil {
        w.logger.Printf("[ERR] worker: failed to create evaluation %#v: %v",
            eval, err)
        if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
            goto SUBMIT
        }
        return err
    } else {
        w.logger.Printf("[DEBUG] worker: created evaluation %#v", eval)
        w.backoffReset()
    }
    return nil
}

// ReblockEval is used to reinsert a blocked evaluation into the blocked eval
// tracker. This allows the worker to act as the planner for the scheduler.
func (w *Worker) ReblockEval(eval *structs.Evaluation) error {
    // Check for a shutdown before plan submission
    if w.srv.IsShutdown() {
        return fmt.Errorf("shutdown while planning")
    }
    defer metrics.MeasureSince([]string{"nomad", "worker", "reblock_eval"}, time.Now())

    // Update the evaluation if the queued allocations do not match what is
    // recorded in the job summary
    summary, err := w.srv.fsm.state.JobSummaryByID(eval.JobID)
    if err != nil {
        return fmt.Errorf("couldn't retrieve job summary: %v", err)
    }
    if summary != nil {
        var hasChanged bool
        for tg, summary := range summary.Summary {
            if queued, ok := eval.QueuedAllocations[tg]; ok {
                if queued != summary.Queued {
                    hasChanged = true
                    break
                }
            }
        }
        if hasChanged {
            if err := w.UpdateEval(eval); err != nil {
                return err
            }
        }
    }

    // Store the snapshot index in the eval
    eval.SnapshotIndex = w.snapshotIndex

    // Setup the request
    req := structs.EvalUpdateRequest{
        Evals:     []*structs.Evaluation{eval},
        EvalToken: w.evalToken,
        WriteRequest: structs.WriteRequest{
            Region: w.srv.config.Region,
        },
    }
    var resp structs.GenericResponse

SUBMIT:
    // Make the RPC call
    if err := w.srv.RPC("Eval.Reblock", &req, &resp); err != nil {
        w.logger.Printf("[ERR] worker: failed to reblock evaluation %#v: %v",
            eval, err)
        if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
            goto SUBMIT
        }
        return err
    } else {
        w.logger.Printf("[DEBUG] worker: reblocked evaluation %#v", eval)
        w.backoffReset()
    }
    return nil
}

// shouldResubmit checks if a given error should be swallowed and the plan
// resubmitted after a backoff. Usually these are transient errors that
// the cluster should heal from quickly.
func (w *Worker) shouldResubmit(err error) bool {
    s := err.Error()
    switch {
    case strings.Contains(s, "No cluster leader"):
        return true
    case strings.Contains(s, "plan queue is disabled"):
        return true
    default:
        return false
    }
}
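// Worked example (illustrative, not in the original file): the SUBMIT retry
// loops above combine shouldResubmit with backoffErr(backoffBaselineSlow,
// backoffLimitSlow). Because backoffErr below multiplies the base by 4 per
// recorded failure, a string of "No cluster leader" errors is retried after
// roughly 500ms, 2s, 8s, and then every 10s (the slow limit) until the RPC
// succeeds, a non-transient error is returned, or the server shuts down.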
// backoffErr is used to do an exponential back off on error. This is
// maintained statefully for the worker. It returns whether attempts should be
// abandoned due to shutdown.
func (w *Worker) backoffErr(base, limit time.Duration) bool {
    backoff := (1 << (2 * w.failures)) * base
    if backoff > limit {
        backoff = limit
    } else {
        w.failures++
    }
    select {
    case <-time.After(backoff):
        return false
    case <-w.srv.shutdownCh:
        return true
    }
}

// backoffReset is used to reset the failure count for
// exponential backoff
func (w *Worker) backoffReset() {
    w.failures = 0
}
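// Telemetry summary (descriptive comment added for clarity, not in the
// original file): this file emits latency measurements via
// metrics.MeasureSince under the following keys (shown here dot-joined):
//
//	nomad.worker.dequeue_eval
//	nomad.worker.send_ack
//	nomad.worker.wait_for_index
//	nomad.worker.invoke_scheduler.<eval type>
//	nomad.worker.submit_plan
//	nomad.worker.update_eval
//	nomad.worker.create_eval
//	nomad.worker.reblock_eval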