github.com/diptanu/nomad@v0.5.7-0.20170516172507-d72e86cbe3d9/nomad/worker.go

package nomad

import (
	"fmt"
	"log"
	"strings"
	"sync"
	"time"

	"github.com/armon/go-metrics"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

const (
	// backoffBaselineFast is the baseline time for exponential backoff
	backoffBaselineFast = 20 * time.Millisecond

	// backoffBaselineSlow is the baseline time for exponential backoff
	// that is much slower than backoffBaselineFast
	backoffBaselineSlow = 500 * time.Millisecond

	// backoffLimitFast is the limit of the exponential backoff
	backoffLimitFast = time.Second

	// backoffLimitSlow is the limit of the exponential backoff for
	// the slower backoff
	backoffLimitSlow = 10 * time.Second

	// backoffSchedulerVersionMismatch is the backoff between retries when the
	// scheduler version mismatches that of the leader.
	backoffSchedulerVersionMismatch = 30 * time.Second

	// dequeueTimeout is used to time out an evaluation dequeue so that
	// we can check if there is a shutdown event
	dequeueTimeout = 500 * time.Millisecond

	// raftSyncLimit is the limit of time we will wait for Raft replication
	// to catch up to the evaluation. This is used to fast Nack and
	// allow another scheduler to pick it up.
	raftSyncLimit = 5 * time.Second

	// dequeueErrGrace is the grace period where we don't log about
	// dequeue errors after start. This is to improve the user experience
	// in dev mode where the leader isn't elected for a few seconds.
	dequeueErrGrace = 10 * time.Second
)

// Worker is a single-threaded scheduling worker. There may be multiple
// workers running per server (leader or follower). They are responsible for
// dequeuing pending evaluations, invoking schedulers, submitting plans, and
// managing the lifecycle around making task allocations. They bridge the
// business logic of the scheduler with the plumbing required to make it all
// work.
type Worker struct {
	srv    *Server
	logger *log.Logger
	start  time.Time

	paused    bool
	pauseLock sync.Mutex
	pauseCond *sync.Cond

	failures uint

	evalToken string

	// snapshotIndex is the index of the snapshot in which the scheduler was
	// first invoked. It is used to mark the SnapshotIndex of evaluations
	// Created, Updated or Reblocked.
	snapshotIndex uint64
}
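
// NewWorker below starts the run loop in its own goroutine immediately;
// SetPause and checkPaused park and resume that loop via pauseCond, while
// backoffErr and backoffReset at the bottom of the file provide the retry
// timing shared by the RPC helpers.
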
// NewWorker starts a new worker associated with the given server
func NewWorker(srv *Server) (*Worker, error) {
	w := &Worker{
		srv:    srv,
		logger: srv.logger,
		start:  time.Now(),
	}
	w.pauseCond = sync.NewCond(&w.pauseLock)
	go w.run()
	return w, nil
}

// SetPause is used to pause or unpause a worker
func (w *Worker) SetPause(p bool) {
	w.pauseLock.Lock()
	w.paused = p
	w.pauseLock.Unlock()
	if !p {
		w.pauseCond.Broadcast()
	}
}

// checkPaused is used to park the worker when paused
func (w *Worker) checkPaused() {
	w.pauseLock.Lock()
	for w.paused {
		w.pauseCond.Wait()
	}
	w.pauseLock.Unlock()
}

// run is the long-lived goroutine which is used to run the worker
func (w *Worker) run() {
	for {
		// Dequeue a pending evaluation
		eval, token, shutdown := w.dequeueEvaluation(dequeueTimeout)
		if shutdown {
			return
		}

		// Check for a shutdown
		if w.srv.IsShutdown() {
			w.sendAck(eval.ID, token, false)
			return
		}

		// Wait for the raft log to catch up to the evaluation
		if err := w.waitForIndex(eval.ModifyIndex, raftSyncLimit); err != nil {
			w.sendAck(eval.ID, token, false)
			continue
		}

		// Invoke the scheduler to determine placements
		if err := w.invokeScheduler(eval, token); err != nil {
			w.sendAck(eval.ID, token, false)
			continue
		}

		// Complete the evaluation
		w.sendAck(eval.ID, token, true)
	}
}

// dequeueEvaluation is used to fetch the next ready evaluation.
// This blocks until an evaluation is available or a timeout is reached.
func (w *Worker) dequeueEvaluation(timeout time.Duration) (*structs.Evaluation, string, bool) {
	// Set up the request
	req := structs.EvalDequeueRequest{
		Schedulers:       w.srv.config.EnabledSchedulers,
		Timeout:          timeout,
		SchedulerVersion: scheduler.SchedulerVersion,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.EvalDequeueResponse

REQ:
	// Check if we are paused
	w.checkPaused()

	// Make a blocking RPC
	start := time.Now()
	err := w.srv.RPC("Eval.Dequeue", &req, &resp)
	metrics.MeasureSince([]string{"nomad", "worker", "dequeue_eval"}, start)
	if err != nil {
		if time.Since(w.start) > dequeueErrGrace && !w.srv.IsShutdown() {
			w.logger.Printf("[ERR] worker: failed to dequeue evaluation: %v", err)
		}

		// Adjust the backoff based on the error. If it is a scheduler version
		// mismatch, we pin both the baseline and the limit to a longer, fixed
		// interval.
		base, limit := backoffBaselineFast, backoffLimitSlow
		if strings.Contains(err.Error(), "calling scheduler version") {
			base = backoffSchedulerVersionMismatch
			limit = backoffSchedulerVersionMismatch
		}

		if w.backoffErr(base, limit) {
			return nil, "", true
		}
		goto REQ
	}
	w.backoffReset()

	// Check if we got a response
	if resp.Eval != nil {
		w.logger.Printf("[DEBUG] worker: dequeued evaluation %s", resp.Eval.ID)
		return resp.Eval, resp.Token, false
	}

	// Check for potential shutdown
	if w.srv.IsShutdown() {
		return nil, "", true
	}
	goto REQ
}
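
// Whenever run cannot finish an evaluation (server shutdown, a raft sync that
// does not catch up within raftSyncLimit, or a scheduler error), it Nacks the
// evaluation via sendAck below so that another scheduler can pick it up
// quickly; only a fully processed evaluation is Acked.
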
// sendAck makes a best effort to ack or nack the evaluation.
// Any errors are logged but swallowed.
func (w *Worker) sendAck(evalID, token string, ack bool) {
	defer metrics.MeasureSince([]string{"nomad", "worker", "send_ack"}, time.Now())
	// Set up the request
	req := structs.EvalAckRequest{
		EvalID: evalID,
		Token:  token,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

	// Determine if this is an Ack or Nack
	verb := "ack"
	endpoint := "Eval.Ack"
	if !ack {
		verb = "nack"
		endpoint = "Eval.Nack"
	}

	// Make the RPC call
	err := w.srv.RPC(endpoint, &req, &resp)
	if err != nil {
		w.logger.Printf("[ERR] worker: failed to %s evaluation '%s': %v",
			verb, evalID, err)
	} else {
		w.logger.Printf("[DEBUG] worker: %s for evaluation %s", verb, evalID)
	}
}

// waitForIndex ensures that the local state is at least as fresh
// as the given index. This is used before starting an evaluation,
// but also potentially mid-stream. If a Plan fails because of stale
// state (attempt to allocate to a failed/dead node), we may need
// to sync our state again and do the planning with more recent data.
func (w *Worker) waitForIndex(index uint64, timeout time.Duration) error {
	// XXX: A potential optimization is to set up a watch on the state store's
	// index table and only unblock via a trigger rather than timing out and
	// checking.

	start := time.Now()
	defer metrics.MeasureSince([]string{"nomad", "worker", "wait_for_index"}, start)
CHECK:
	// Get the state's current index
	snapshotIndex, err := w.srv.fsm.State().LatestIndex()
	if err != nil {
		return fmt.Errorf("failed to determine state store's index: %v", err)
	}

	// We only need the FSM state to be as recent as the given index
	if index <= snapshotIndex {
		w.backoffReset()
		return nil
	}

	// Check if we've reached our limit
	if time.Now().Sub(start) > timeout {
		return fmt.Errorf("sync wait timeout reached")
	}

	// Exponentially back off if we haven't reached it yet
	if w.backoffErr(backoffBaselineFast, backoffLimitFast) {
		return fmt.Errorf("shutdown while waiting for state sync")
	}
	goto CHECK
}

// invokeScheduler is used to invoke the business logic of the scheduler
func (w *Worker) invokeScheduler(eval *structs.Evaluation, token string) error {
	defer metrics.MeasureSince([]string{"nomad", "worker", "invoke_scheduler", eval.Type}, time.Now())
	// Store the evaluation token
	w.evalToken = token

	// Snapshot the current state
	snap, err := w.srv.fsm.State().Snapshot()
	if err != nil {
		return fmt.Errorf("failed to snapshot state: %v", err)
	}

	// Store the snapshot's index
	w.snapshotIndex, err = snap.LatestIndex()
	if err != nil {
		return fmt.Errorf("failed to determine snapshot's index: %v", err)
	}

	// Create the scheduler, or use the special core scheduler
	var sched scheduler.Scheduler
	if eval.Type == structs.JobTypeCore {
		sched = NewCoreScheduler(w.srv, snap)
	} else {
		sched, err = scheduler.NewScheduler(eval.Type, w.logger, snap, w)
		if err != nil {
			return fmt.Errorf("failed to instantiate scheduler: %v", err)
		}
	}

	// Process the evaluation
	err = sched.Process(eval)
	if err != nil {
		return fmt.Errorf("failed to process evaluation: %v", err)
	}
	return nil
}
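
// The worker is handed to scheduler.NewScheduler above as its planner: the
// methods that follow (SubmitPlan, UpdateEval, CreateEval and ReblockEval) are
// the callbacks a scheduler invokes while processing an evaluation, and they
// carry the evalToken (and, for the eval updates, the snapshotIndex) captured
// during invokeScheduler.
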
// SubmitPlan is used to submit a plan for consideration. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler.State, error) {
	// Check for a shutdown before plan submission
	if w.srv.IsShutdown() {
		return nil, nil, fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "submit_plan"}, time.Now())

	// Add the evaluation token to the plan
	plan.EvalToken = w.evalToken

	// Set up the request
	req := structs.PlanRequest{
		Plan: plan,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.PlanResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Plan.Submit", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to submit plan for evaluation %s: %v",
			plan.EvalID, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return nil, nil, err
	} else {
		w.logger.Printf("[DEBUG] worker: submitted plan for evaluation %s", plan.EvalID)
		w.backoffReset()
	}

	// Look for a result
	result := resp.Result
	if result == nil {
		return nil, nil, fmt.Errorf("missing result")
	}

	// Check if a state update is required. This could be required if we were
	// planning based on stale data, which is causing issues. For example, a
	// node failure since the time we started planning, or conflicting task
	// allocations.
	var state scheduler.State
	if result.RefreshIndex != 0 {
		// Wait for the raft log to catch up to the evaluation
		w.logger.Printf("[DEBUG] worker: refreshing state to index %d for %q", result.RefreshIndex, plan.EvalID)
		if err := w.waitForIndex(result.RefreshIndex, raftSyncLimit); err != nil {
			return nil, nil, err
		}

		// Snapshot the current state
		snap, err := w.srv.fsm.State().Snapshot()
		if err != nil {
			return nil, nil, fmt.Errorf("failed to snapshot state: %v", err)
		}
		state = snap
	}

	// Return the result and potential state update
	return result, state, nil
}

// UpdateEval is used to submit an updated evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) UpdateEval(eval *structs.Evaluation) error {
	// Check for a shutdown before updating the evaluation
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "update_eval"}, time.Now())

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	// Set up the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Update", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to update evaluation %#v: %v",
			eval, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Printf("[DEBUG] worker: updated evaluation %#v", eval)
		w.backoffReset()
	}
	return nil
}
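
// UpdateEval, CreateEval and ReblockEval all follow the same shape: stamp the
// evaluation with the snapshot index captured in invokeScheduler, attach the
// dequeue token, and submit through the corresponding Eval.* RPC, retrying
// with the slow backoff pair for as long as shouldResubmit classifies the
// error as transient.
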
// CreateEval is used to create a new evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) CreateEval(eval *structs.Evaluation) error {
	// Check for a shutdown before creating the evaluation
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "create_eval"}, time.Now())

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	// Set up the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Create", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to create evaluation %#v: %v",
			eval, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Printf("[DEBUG] worker: created evaluation %#v", eval)
		w.backoffReset()
	}
	return nil
}

// ReblockEval is used to reinsert a blocked evaluation into the blocked eval
// tracker. This allows the worker to act as the planner for the scheduler.
func (w *Worker) ReblockEval(eval *structs.Evaluation) error {
	// Check for a shutdown before reblocking the evaluation
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "reblock_eval"}, time.Now())

	// Update the evaluation if the queued allocations are not the same as
	// what is recorded in the job summary
	ws := memdb.NewWatchSet()
	summary, err := w.srv.fsm.State().JobSummaryByID(ws, eval.JobID)
	if err != nil {
		return fmt.Errorf("couldn't retrieve job summary: %v", err)
	}
	if summary != nil {
		var hasChanged bool
		for tg, tgSummary := range summary.Summary {
			if queued, ok := eval.QueuedAllocations[tg]; ok {
				if queued != tgSummary.Queued {
					hasChanged = true
					break
				}
			}
		}
		if hasChanged {
			if err := w.UpdateEval(eval); err != nil {
				return err
			}
		}
	}

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	// Set up the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Reblock", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to reblock evaluation %#v: %v",
			eval, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Printf("[DEBUG] worker: reblocked evaluation %#v", eval)
		w.backoffReset()
	}
	return nil
}

// shouldResubmit checks if a given error should be swallowed and the plan
// resubmitted after a backoff. Usually these are transient errors that
// the cluster should heal from quickly.
func (w *Worker) shouldResubmit(err error) bool {
	s := err.Error()
	switch {
	case strings.Contains(s, "No cluster leader"):
		return true
	case strings.Contains(s, "plan queue is disabled"):
		return true
	default:
		return false
	}
}
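
// The backoff helpers below are paired with different constants by their
// callers: dequeueEvaluation retries with the fast baseline but the slow
// limit, waitForIndex uses the fast pair, and the plan/eval submissions above
// use the slow pair. In the scheduler-version-mismatch path the baseline and
// limit are both 30s, so the retry interval is effectively fixed.
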
// backoffErr is used to do an exponential back off on error. This is
// maintained statefully for the worker. It returns true if attempts should be
// abandoned due to shutdown.
func (w *Worker) backoffErr(base, limit time.Duration) bool {
	backoff := (1 << (2 * w.failures)) * base
	if backoff > limit {
		backoff = limit
	} else {
		w.failures++
	}
	select {
	case <-time.After(backoff):
		return false
	case <-w.srv.shutdownCh:
		return true
	}
}

// backoffReset is used to reset the failure count for
// exponential backoff
func (w *Worker) backoffReset() {
	w.failures = 0
}
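
// The helper below is an illustrative sketch, not part of the original file
// and not called by the worker: it replays the growth rule from backoffErr so
// the resulting wait schedule is easy to inspect. With backoffBaselineFast and
// backoffLimitFast it yields 20ms, 80ms, 320ms, 1s, 1s, ...; the name and
// signature are hypothetical.
func backoffScheduleSketch(base, limit time.Duration, n int) []time.Duration {
	waits := make([]time.Duration, 0, n)
	failures := uint(0)
	for i := 0; i < n; i++ {
		// Same rule as backoffErr: the wait quadruples per consecutive
		// failure ((1 << (2*failures)) * base) and is capped at limit;
		// failures only grows while the computed wait is below the cap.
		backoff := (1 << (2 * failures)) * base
		if backoff > limit {
			backoff = limit
		} else {
			failures++
		}
		waits = append(waits, backoff)
	}
	return waits
}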