github.com/dkerwin/nomad@v0.3.3-0.20160525181927-74554135514b/nomad/worker.go

package nomad

import (
	"fmt"
	"log"
	"strings"
	"sync"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

const (
	// backoffBaselineFast is the baseline time for exponential backoff
	backoffBaselineFast = 20 * time.Millisecond

	// backoffBaselineSlow is the baseline time for exponential backoff
	// but is much slower than backoffBaselineFast
	backoffBaselineSlow = 500 * time.Millisecond

	// backoffLimitFast is the limit of the exponential backoff
	backoffLimitFast = time.Second

	// backoffLimitSlow is the limit of the exponential backoff for
	// the slower backoff
	backoffLimitSlow = 10 * time.Second

	// dequeueTimeout is used to time out an evaluation dequeue so that
	// we can check if there is a shutdown event
	dequeueTimeout = 500 * time.Millisecond

	// raftSyncLimit is the limit of time we will wait for Raft replication
	// to catch up to the evaluation. This is used to fast Nack and
	// allow another scheduler to pick it up.
	raftSyncLimit = 5 * time.Second

	// dequeueErrGrace is the grace period where we don't log about
	// dequeue errors after start. This is to improve the user experience
	// in dev mode where the leader isn't elected for a few seconds.
	dequeueErrGrace = 10 * time.Second
)

// Worker is a single-threaded scheduling worker. There may be multiple
// running per server (leader or follower). They are responsible for dequeuing
// pending evaluations, invoking schedulers, submitting plans, and handling
// the lifecycle around making task allocations. They bridge the business
// logic of the scheduler with the plumbing required to make it all work.
type Worker struct {
	srv    *Server
	logger *log.Logger
	start  time.Time

	paused    bool
	pauseLock sync.Mutex
	pauseCond *sync.Cond

	failures uint

	evalToken string

	// snapshotIndex is the index of the snapshot in which the scheduler was
	// first invoked. It is used to mark the SnapshotIndex of evaluations
	// Created, Updated or Reblocked.
	snapshotIndex uint64
}

// NewWorker starts a new worker associated with the given server
func NewWorker(srv *Server) (*Worker, error) {
	w := &Worker{
		srv:    srv,
		logger: srv.logger,
		start:  time.Now(),
	}
	w.pauseCond = sync.NewCond(&w.pauseLock)
	go w.run()
	return w, nil
}

// SetPause is used to pause or unpause a worker
func (w *Worker) SetPause(p bool) {
	w.pauseLock.Lock()
	w.paused = p
	w.pauseLock.Unlock()
	if !p {
		w.pauseCond.Broadcast()
	}
}

// checkPaused is used to park the worker when paused
func (w *Worker) checkPaused() {
	w.pauseLock.Lock()
	for w.paused {
		w.pauseCond.Wait()
	}
	w.pauseLock.Unlock()
}
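
// Every evaluation dequeued in run below must eventually be acked (processed
// successfully) or nacked (handed back so another scheduler can pick it up);
// the sendAck calls in the loop take care of both cases.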

// run is the long-lived goroutine which is used to run the worker
func (w *Worker) run() {
	for {
		// Dequeue a pending evaluation
		eval, token, shutdown := w.dequeueEvaluation(dequeueTimeout)
		if shutdown {
			return
		}

		// Check for a shutdown
		if w.srv.IsShutdown() {
			w.sendAck(eval.ID, token, false)
			return
		}

		// Wait for the raft log to catch up to the evaluation
		if err := w.waitForIndex(eval.ModifyIndex, raftSyncLimit); err != nil {
			w.sendAck(eval.ID, token, false)
			continue
		}

		// Invoke the scheduler to determine placements
		if err := w.invokeScheduler(eval, token); err != nil {
			w.sendAck(eval.ID, token, false)
			continue
		}

		// Complete the evaluation
		w.sendAck(eval.ID, token, true)
	}
}

// dequeueEvaluation is used to fetch the next ready evaluation.
// This blocks until an evaluation is available or a timeout is reached.
func (w *Worker) dequeueEvaluation(timeout time.Duration) (*structs.Evaluation, string, bool) {
	// Setup the request
	req := structs.EvalDequeueRequest{
		Schedulers: w.srv.config.EnabledSchedulers,
		Timeout:    timeout,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.EvalDequeueResponse

REQ:
	// Check if we are paused
	w.checkPaused()

	// Make a blocking RPC
	start := time.Now()
	err := w.srv.RPC("Eval.Dequeue", &req, &resp)
	metrics.MeasureSince([]string{"nomad", "worker", "dequeue_eval"}, start)
	if err != nil {
		if time.Since(w.start) > dequeueErrGrace && !w.srv.IsShutdown() {
			w.logger.Printf("[ERR] worker: failed to dequeue evaluation: %v", err)
		}
		if w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			return nil, "", true
		}
		goto REQ
	}
	w.backoffReset()

	// Check if we got a response
	if resp.Eval != nil {
		w.logger.Printf("[DEBUG] worker: dequeued evaluation %s", resp.Eval.ID)
		return resp.Eval, resp.Token, false
	}

	// Check for potential shutdown
	if w.srv.IsShutdown() {
		return nil, "", true
	}
	goto REQ
}

// sendAck makes a best effort to ack or nack the evaluation.
// Any errors are logged but swallowed.
func (w *Worker) sendAck(evalID, token string, ack bool) {
	defer metrics.MeasureSince([]string{"nomad", "worker", "send_ack"}, time.Now())
	// Setup the request
	req := structs.EvalAckRequest{
		EvalID: evalID,
		Token:  token,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

	// Determine if this is an Ack or Nack
	verb := "ack"
	endpoint := "Eval.Ack"
	if !ack {
		verb = "nack"
		endpoint = "Eval.Nack"
	}

	// Make the RPC call
	err := w.srv.RPC(endpoint, &req, &resp)
	if err != nil {
		w.logger.Printf("[ERR] worker: failed to %s evaluation '%s': %v",
			verb, evalID, err)
	} else {
		w.logger.Printf("[DEBUG] worker: %s for evaluation %s", verb, evalID)
	}
}

// waitForIndex ensures that the local state is at least as fresh
// as the given index. This is used before starting an evaluation,
// but also potentially mid-stream. If a Plan fails because of stale
// state (attempt to allocate to a failed/dead node), we may need
// to sync our state again and do the planning with more recent data.
func (w *Worker) waitForIndex(index uint64, timeout time.Duration) error {
	start := time.Now()
	defer metrics.MeasureSince([]string{"nomad", "worker", "wait_for_index"}, start)
CHECK:
	// We only need the FSM state to be as recent as the given index
	appliedIndex := w.srv.raft.AppliedIndex()
	if index <= appliedIndex {
		w.backoffReset()
		return nil
	}

	// Check if we've reached our limit
	if time.Now().Sub(start) > timeout {
		return fmt.Errorf("sync wait timeout reached")
	}

	// Exponential backoff if we haven't yet reached the index
	if w.backoffErr(backoffBaselineFast, backoffLimitFast) {
		return fmt.Errorf("shutdown while waiting for state sync")
	}
	goto CHECK
}

// invokeScheduler is used to invoke the business logic of the scheduler
func (w *Worker) invokeScheduler(eval *structs.Evaluation, token string) error {
	defer metrics.MeasureSince([]string{"nomad", "worker", "invoke_scheduler", eval.Type}, time.Now())
	// Store the evaluation token
	w.evalToken = token

	// Snapshot the current state
	snap, err := w.srv.fsm.State().Snapshot()
	if err != nil {
		return fmt.Errorf("failed to snapshot state: %v", err)
	}

	// Store the snapshot's index
	w.snapshotIndex, err = snap.LatestIndex()
	if err != nil {
		return fmt.Errorf("failed to determine snapshot's index: %v", err)
	}

	// Create the scheduler, or use the special core scheduler
	var sched scheduler.Scheduler
	if eval.Type == structs.JobTypeCore {
		sched = NewCoreScheduler(w.srv, snap)
	} else {
		sched, err = scheduler.NewScheduler(eval.Type, w.logger, snap, w)
		if err != nil {
			return fmt.Errorf("failed to instantiate scheduler: %v", err)
		}
	}

	// Process the evaluation
	err = sched.Process(eval)
	if err != nil {
		return fmt.Errorf("failed to process evaluation: %v", err)
	}
	return nil
}
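
// The methods below (SubmitPlan, UpdateEval, CreateEval and ReblockEval) are
// what allow the Worker to act as the planner passed to scheduler.NewScheduler
// above: the scheduler calls back into them to submit plans and to create,
// update or reblock evaluations.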

// SubmitPlan is used to submit a plan for consideration. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler.State, error) {
	// Check for a shutdown before plan submission
	if w.srv.IsShutdown() {
		return nil, nil, fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "submit_plan"}, time.Now())

	// Add the evaluation token to the plan
	plan.EvalToken = w.evalToken

	// Setup the request
	req := structs.PlanRequest{
		Plan: plan,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.PlanResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Plan.Submit", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to submit plan for evaluation %s: %v",
			plan.EvalID, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return nil, nil, err
	} else {
		w.logger.Printf("[DEBUG] worker: submitted plan for evaluation %s", plan.EvalID)
		w.backoffReset()
	}

	// Look for a result
	result := resp.Result
	if result == nil {
		return nil, nil, fmt.Errorf("missing result")
	}

	// Check if a state update is required. This could be the case if we were
	// planning based on stale data, which is causing issues; for example, a
	// node failure since the time we started planning or conflicting task
	// allocations.
	var state scheduler.State
	if result.RefreshIndex != 0 {
		// Wait for the raft log to catch up to the evaluation
		w.logger.Printf("[DEBUG] worker: refreshing state to index %d", result.RefreshIndex)
		if err := w.waitForIndex(result.RefreshIndex, raftSyncLimit); err != nil {
			return nil, nil, err
		}

		// Snapshot the current state
		snap, err := w.srv.fsm.State().Snapshot()
		if err != nil {
			return nil, nil, fmt.Errorf("failed to snapshot state: %v", err)
		}
		state = snap
	}

	// Return the result and potential state update
	return result, state, nil
}

// UpdateEval is used to submit an updated evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) UpdateEval(eval *structs.Evaluation) error {
	// Check for a shutdown before plan submission
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "update_eval"}, time.Now())

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Update", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to update evaluation %#v: %v",
			eval, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Printf("[DEBUG] worker: updated evaluation %#v", eval)
		w.backoffReset()
	}
	return nil
}

// CreateEval is used to create a new evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) CreateEval(eval *structs.Evaluation) error {
	// Check for a shutdown before plan submission
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "create_eval"}, time.Now())

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Create", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to create evaluation %#v: %v",
			eval, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Printf("[DEBUG] worker: created evaluation %#v", eval)
		w.backoffReset()
	}
	return nil
}

// ReblockEval is used to reinsert a blocked evaluation into the blocked eval
// tracker. This allows the worker to act as the planner for the scheduler.
func (w *Worker) ReblockEval(eval *structs.Evaluation) error {
	// Check for a shutdown before plan submission
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "reblock_eval"}, time.Now())

	// Store the snapshot index in the eval
	eval.SnapshotIndex = w.snapshotIndex

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Reblock", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to reblock evaluation %#v: %v",
			eval, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Printf("[DEBUG] worker: reblocked evaluation %#v", eval)
		w.backoffReset()
	}
	return nil
}

// shouldResubmit checks if a given error should be swallowed and the plan
// resubmitted after a backoff. Usually these are transient errors that
// the cluster should heal from quickly.
func (w *Worker) shouldResubmit(err error) bool {
	s := err.Error()
	switch {
	case strings.Contains(s, "No cluster leader"):
		return true
	case strings.Contains(s, "plan queue is disabled"):
		return true
	default:
		return false
	}
}
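
// Note on the backoff below: the wait grows by 4x per consecutive failure
// (base << (2 * failures)) and is capped at the limit. With the fast settings
// that is 20ms, 80ms, 320ms and then 1s for every retry thereafter; with the
// slow settings it is 500ms, 2s, 8s and then 10s.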

// backoffErr is used to do an exponential back off on error. This is
// maintained statefully for the worker. Returns true if attempts should be
// abandoned due to shutdown.
func (w *Worker) backoffErr(base, limit time.Duration) bool {
	backoff := (1 << (2 * w.failures)) * base
	if backoff > limit {
		backoff = limit
	} else {
		w.failures++
	}
	select {
	case <-time.After(backoff):
		return false
	case <-w.srv.shutdownCh:
		return true
	}
}

// backoffReset is used to reset the failure count for
// exponential backoff
func (w *Worker) backoffReset() {
	w.failures = 0
}