github.com/kardianos/nomad@v0.1.3-0.20151022182107-b13df73ee850/nomad/worker.go

package nomad

import (
	"fmt"
	"log"
	"strings"
	"sync"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

const (
	// backoffBaselineFast is the baseline time for exponential backoff
	backoffBaselineFast = 20 * time.Millisecond

	// backoffBaselineSlow is the baseline time for exponential backoff
	// but that is much slower than backoffBaselineFast
	backoffBaselineSlow = 500 * time.Millisecond

	// backoffLimitFast is the limit of the exponential backoff
	backoffLimitFast = time.Second

	// backoffLimitSlow is the limit of the exponential backoff for
	// the slower backoff
	backoffLimitSlow = 10 * time.Second

	// dequeueTimeout is used to timeout an evaluation dequeue so that
	// we can check if there is a shutdown event
	dequeueTimeout = 500 * time.Millisecond

	// raftSyncLimit is the limit of time we will wait for Raft replication
	// to catch up to the evaluation. This is used to fast Nack and
	// allow another scheduler to pick it up.
	raftSyncLimit = 5 * time.Second

	// dequeueErrGrace is the grace period where we don't log about
	// dequeue errors after start. This is to improve the user experience
	// in dev mode where the leader isn't elected for a few seconds.
	dequeueErrGrace = 10 * time.Second
)

// Worker is a single threaded scheduling worker. There may be multiple
// running per server (leader or follower). They are responsible for dequeuing
// pending evaluations, invoking schedulers, plan submission and the
// lifecycle around making task allocations. They bridge the business logic
// of the scheduler with the plumbing required to make it all work.
type Worker struct {
	srv    *Server
	logger *log.Logger
	start  time.Time

	paused    bool
	pauseLock sync.Mutex
	pauseCond *sync.Cond

	failures uint

	evalToken string
}

// NewWorker starts a new worker associated with the given server
func NewWorker(srv *Server) (*Worker, error) {
	w := &Worker{
		srv:    srv,
		logger: srv.logger,
		start:  time.Now(),
	}
	w.pauseCond = sync.NewCond(&w.pauseLock)
	go w.run()
	return w, nil
}

// SetPause is used to pause or unpause a worker
func (w *Worker) SetPause(p bool) {
	w.pauseLock.Lock()
	w.paused = p
	w.pauseLock.Unlock()
	if !p {
		w.pauseCond.Broadcast()
	}
}

// checkPaused is used to park the worker when paused
func (w *Worker) checkPaused() {
	w.pauseLock.Lock()
	for w.paused {
		w.pauseCond.Wait()
	}
	w.pauseLock.Unlock()
}

// run is the long-lived goroutine which is used to run the worker
func (w *Worker) run() {
	for {
		// Dequeue a pending evaluation
		eval, token, shutdown := w.dequeueEvaluation(dequeueTimeout)
		if shutdown {
			return
		}

		// Check for a shutdown
		if w.srv.IsShutdown() {
			w.sendAck(eval.ID, token, false)
			return
		}

		// Wait for the raft log to catch up to the evaluation
		if err := w.waitForIndex(eval.ModifyIndex, raftSyncLimit); err != nil {
			w.sendAck(eval.ID, token, false)
			continue
		}

		// Invoke the scheduler to determine placements
		if err := w.invokeScheduler(eval, token); err != nil {
			w.sendAck(eval.ID, token, false)
			continue
		}

		// Complete the evaluation
		w.sendAck(eval.ID, token, true)
	}
}
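
// The two helpers below are an illustrative sketch added by the editor and are
// not part of the original file. They show how a caller holding a *Server
// might start a fixed number of workers and pause or unpause them as a group;
// the function names and the numWorkers parameter are hypothetical.
func startWorkers(srv *Server, numWorkers int) []*Worker {
	workers := make([]*Worker, 0, numWorkers)
	for i := 0; i < numWorkers; i++ {
		// NewWorker never returns a non-nil error in this version, but the
		// signature allows for it, so handle it defensively.
		w, err := NewWorker(srv)
		if err != nil {
			srv.logger.Printf("[ERR] worker: failed to start worker: %v", err)
			continue
		}
		workers = append(workers, w)
	}
	return workers
}

// pauseAll pauses or unpauses every worker; unpausing broadcasts on the
// condition variable so parked workers resume dequeuing. Illustrative only.
func pauseAll(workers []*Worker, paused bool) {
	for _, w := range workers {
		w.SetPause(paused)
	}
}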

// dequeueEvaluation is used to fetch the next ready evaluation.
// This blocks until an evaluation is available or a timeout is reached.
func (w *Worker) dequeueEvaluation(timeout time.Duration) (*structs.Evaluation, string, bool) {
	// Setup the request
	req := structs.EvalDequeueRequest{
		Schedulers: w.srv.config.EnabledSchedulers,
		Timeout:    timeout,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.EvalDequeueResponse

REQ:
	// Check if we are paused
	w.checkPaused()

	// Make a blocking RPC
	start := time.Now()
	err := w.srv.RPC("Eval.Dequeue", &req, &resp)
	metrics.MeasureSince([]string{"nomad", "worker", "dequeue_eval"}, start)
	if err != nil {
		if time.Since(w.start) > dequeueErrGrace && !w.srv.IsShutdown() {
			w.logger.Printf("[ERR] worker: failed to dequeue evaluation: %v", err)
		}
		if w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			return nil, "", true
		}
		goto REQ
	}
	w.backoffReset()

	// Check if we got a response
	if resp.Eval != nil {
		w.logger.Printf("[DEBUG] worker: dequeued evaluation %s", resp.Eval.ID)
		return resp.Eval, resp.Token, false
	}

	// Check for potential shutdown
	if w.srv.IsShutdown() {
		return nil, "", true
	}
	goto REQ
}

// sendAck makes a best effort to ack or nack the evaluation.
// Any errors are logged but swallowed.
func (w *Worker) sendAck(evalID, token string, ack bool) {
	defer metrics.MeasureSince([]string{"nomad", "worker", "send_ack"}, time.Now())
	// Setup the request
	req := structs.EvalAckRequest{
		EvalID: evalID,
		Token:  token,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

	// Determine if this is an Ack or Nack
	verb := "ack"
	endpoint := "Eval.Ack"
	if !ack {
		verb = "nack"
		endpoint = "Eval.Nack"
	}

	// Make the RPC call
	err := w.srv.RPC(endpoint, &req, &resp)
	if err != nil {
		w.logger.Printf("[ERR] worker: failed to %s evaluation '%s': %v",
			verb, evalID, err)
	} else {
		w.logger.Printf("[DEBUG] worker: %s for evaluation %s", verb, evalID)
	}
}

// waitForIndex ensures that the local state is at least as fresh
// as the given index. This is used before starting an evaluation,
// but also potentially mid-stream. If a Plan fails because of stale
// state (attempt to allocate to a failed/dead node), we may need
// to sync our state again and do the planning with more recent data.
func (w *Worker) waitForIndex(index uint64, timeout time.Duration) error {
	start := time.Now()
	defer metrics.MeasureSince([]string{"nomad", "worker", "wait_for_index"}, start)
CHECK:
	// We only need the FSM state to be as recent as the given index
	appliedIndex := w.srv.raft.AppliedIndex()
	if index <= appliedIndex {
		w.backoffReset()
		return nil
	}

	// Check if we've reached our limit
	if time.Since(start) > timeout {
		return fmt.Errorf("sync wait timeout reached")
	}

	// Exponential back off if we haven't yet reached it
	if w.backoffErr(backoffBaselineFast, backoffLimitFast) {
		return fmt.Errorf("shutdown while waiting for state sync")
	}
	goto CHECK
}

// invokeScheduler is used to invoke the business logic of the scheduler
func (w *Worker) invokeScheduler(eval *structs.Evaluation, token string) error {
	defer metrics.MeasureSince([]string{"nomad", "worker", "invoke_scheduler", eval.Type}, time.Now())
	// Store the evaluation token
	w.evalToken = token

	// Snapshot the current state
	snap, err := w.srv.fsm.State().Snapshot()
	if err != nil {
		return fmt.Errorf("failed to snapshot state: %v", err)
	}

	// Create the scheduler, or use the special core scheduler
	var sched scheduler.Scheduler
	if eval.Type == structs.JobTypeCore {
		sched = NewCoreScheduler(w.srv, snap)
	} else {
		sched, err = scheduler.NewScheduler(eval.Type, w.logger, snap, w)
		if err != nil {
			return fmt.Errorf("failed to instantiate scheduler: %v", err)
		}
	}

	// Process the evaluation
	err = sched.Process(eval)
	if err != nil {
		return fmt.Errorf("failed to process evaluation: %v", err)
	}
	return nil
}
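
// Editor's note, not part of the original file: invokeScheduler hands the
// worker itself to scheduler.NewScheduler as the scheduler's planner, so the
// worker is assumed to satisfy the scheduler package's planner interface via
// SubmitPlan, UpdateEval and CreateEval below. Assuming that interface is
// named scheduler.Planner, a compile-time assertion of this would look like:
//
//	var _ scheduler.Planner = (*Worker)(nil)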

// SubmitPlan is used to submit a plan for consideration. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler.State, error) {
	// Check for a shutdown before plan submission
	if w.srv.IsShutdown() {
		return nil, nil, fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "submit_plan"}, time.Now())

	// Add the evaluation token to the plan
	plan.EvalToken = w.evalToken

	// Setup the request
	req := structs.PlanRequest{
		Plan: plan,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.PlanResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Plan.Submit", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to submit plan for evaluation %s: %v",
			plan.EvalID, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return nil, nil, err
	} else {
		w.logger.Printf("[DEBUG] worker: submitted plan for evaluation %s", plan.EvalID)
		w.backoffReset()
	}

	// Look for a result
	result := resp.Result
	if result == nil {
		return nil, nil, fmt.Errorf("missing result")
	}

	// Check if a state update is required. This could be the case if we were
	// planning based on stale data, for example after a node failure since
	// the time we started planning, or because of conflicting task
	// allocations.
	var state scheduler.State
	if result.RefreshIndex != 0 {
		// Wait for the raft log to catch up to the evaluation
		w.logger.Printf("[DEBUG] worker: refreshing state to index %d", result.RefreshIndex)
		if err := w.waitForIndex(result.RefreshIndex, raftSyncLimit); err != nil {
			return nil, nil, err
		}

		// Snapshot the current state
		snap, err := w.srv.fsm.State().Snapshot()
		if err != nil {
			return nil, nil, fmt.Errorf("failed to snapshot state: %v", err)
		}
		state = snap
	}

	// Return the result and potential state update
	return result, state, nil
}

// UpdateEval is used to submit an updated evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) UpdateEval(eval *structs.Evaluation) error {
	// Check for a shutdown before updating the evaluation
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "update_eval"}, time.Now())

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Update", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to update evaluation %#v: %v",
			eval, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Printf("[DEBUG] worker: updated evaluation %#v", eval)
		w.backoffReset()
	}
	return nil
}
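
// Illustrative sketch, not part of the original file: a scheduler using this
// worker as its planner would typically submit a plan and, when a refreshed
// state snapshot comes back, re-plan against it rather than trusting its stale
// view. The names plan, newState, and replan below are hypothetical.
//
//	result, newState, err := w.SubmitPlan(plan)
//	if err != nil {
//		return err
//	}
//	if newState != nil {
//		// The plan was evaluated against stale data (RefreshIndex was set);
//		// redo the placement decisions against the fresher snapshot.
//		return replan(newState, result)
//	}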

// CreateEval is used to create a new evaluation. This allows
// the worker to act as the planner for the scheduler.
func (w *Worker) CreateEval(eval *structs.Evaluation) error {
	// Check for a shutdown before creating the evaluation
	if w.srv.IsShutdown() {
		return fmt.Errorf("shutdown while planning")
	}
	defer metrics.MeasureSince([]string{"nomad", "worker", "create_eval"}, time.Now())

	// Setup the request
	req := structs.EvalUpdateRequest{
		Evals:     []*structs.Evaluation{eval},
		EvalToken: w.evalToken,
		WriteRequest: structs.WriteRequest{
			Region: w.srv.config.Region,
		},
	}
	var resp structs.GenericResponse

SUBMIT:
	// Make the RPC call
	if err := w.srv.RPC("Eval.Create", &req, &resp); err != nil {
		w.logger.Printf("[ERR] worker: failed to create evaluation %#v: %v",
			eval, err)
		if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) {
			goto SUBMIT
		}
		return err
	} else {
		w.logger.Printf("[DEBUG] worker: created evaluation %#v", eval)
		w.backoffReset()
	}
	return nil
}

// shouldResubmit checks if a given error should be swallowed and the plan
// resubmitted after a backoff. Usually these are transient errors that
// the cluster should heal from quickly.
func (w *Worker) shouldResubmit(err error) bool {
	s := err.Error()
	switch {
	case strings.Contains(s, "No cluster leader"):
		return true
	case strings.Contains(s, "plan queue is disabled"):
		return true
	default:
		return false
	}
}

// backoffErr is used to do an exponential back off on error. This is
// maintained statefully for the worker. Returns true if further attempts
// should be abandoned due to shutdown.
func (w *Worker) backoffErr(base, limit time.Duration) bool {
	backoff := (1 << (2 * w.failures)) * base
	if backoff > limit {
		backoff = limit
	} else {
		w.failures++
	}
	select {
	case <-time.After(backoff):
		return false
	case <-w.srv.shutdownCh:
		return true
	}
}

// backoffReset is used to reset the failure count for
// exponential backoff
func (w *Worker) backoffReset() {
	w.failures = 0
}
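
// Worked example (added for illustration; not part of the original file):
// backoffErr quadruples the delay on each consecutive failure, since the shift
// amount is 2*failures. With the slow parameters (base 500ms, limit 10s) the
// waits are 500ms, 2s, 8s, then capped at 10s; with the fast parameters
// (base 20ms, limit 1s) they are 20ms, 80ms, 320ms, then capped at 1s. Once
// the cap is reached the failure counter stops growing, so backoffReset only
// needs to clear it back to zero after a success.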