github.com/bigcommerce/nomad@v0.9.3-bc/nomad/drainer/watch_jobs.go

package drainer

import (
	"context"
	"fmt"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"golang.org/x/time/rate"
)

type DrainRequest struct {
	Allocs []*structs.Allocation
	Resp   *structs.BatchFuture
}

func NewDrainRequest(allocs []*structs.Allocation) *DrainRequest {
	return &DrainRequest{
		Allocs: allocs,
		Resp:   structs.NewBatchFuture(),
	}
}

// DrainingJobWatcher is the interface for watching a job drain
type DrainingJobWatcher interface {
	// RegisterJobs is used to start watching draining jobs
	RegisterJobs(jobs []structs.NamespacedID)

	// Drain is used to emit allocations that should be drained.
	Drain() <-chan *DrainRequest

	// Migrated emits allocations for draining jobs that have transitioned to
	// stop. There is no guarantee that duplicates won't be published.
	Migrated() <-chan []*structs.Allocation
}

// drainingJobWatcher is used to watch draining jobs and emit events when
// draining allocations have replacements
type drainingJobWatcher struct {
	ctx    context.Context
	logger log.Logger

	// state is the state that is watched for state changes.
	state *state.StateStore

	// limiter is used to limit the rate of blocking queries
	limiter *rate.Limiter

	// jobs is the set of tracked jobs.
	jobs map[structs.NamespacedID]struct{}

	// queryCtx is used to cancel a blocking query.
	queryCtx    context.Context
	queryCancel context.CancelFunc

	// drainCh and migratedCh are used to emit allocations
	drainCh    chan *DrainRequest
	migratedCh chan []*structs.Allocation

	l sync.RWMutex
}

// NewDrainingJobWatcher returns a new job watcher. The caller is expected to
// cancel the context to clean up the drainer.
func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger) *drainingJobWatcher {

	// Create a context that can cancel the blocking query so that when a new
	// job gets registered it is handled.
	queryCtx, queryCancel := context.WithCancel(ctx)

	w := &drainingJobWatcher{
		ctx:         ctx,
		queryCtx:    queryCtx,
		queryCancel: queryCancel,
		limiter:     limiter,
		logger:      logger.Named("job_watcher"),
		state:       state,
		jobs:        make(map[structs.NamespacedID]struct{}, 64),
		drainCh:     make(chan *DrainRequest),
		migratedCh:  make(chan []*structs.Allocation),
	}

	go w.watch()
	return w
}

// RegisterJobs marks the given jobs as draining and adds them to the watched
// set.
func (w *drainingJobWatcher) RegisterJobs(jobs []structs.NamespacedID) {
	w.l.Lock()
	defer w.l.Unlock()

	updated := false
	for _, jns := range jobs {
		if _, ok := w.jobs[jns]; ok {
			continue
		}

		// Add the job and cancel the context
		w.logger.Trace("registering job", "job", jns)
		w.jobs[jns] = struct{}{}
		updated = true
	}

	if updated {
		w.queryCancel()

		// Create a new query context
		w.queryCtx, w.queryCancel = context.WithCancel(w.ctx)
	}
}

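// The sketch below is illustrative and not part of the original file: it
// shows how a consumer might wire the watcher up. The rate limit, the job
// ID, the committedIndex variable, and the use of BatchFuture.Respond as
// the acknowledgement are assumptions for the example.
//
//	watcher := NewDrainingJobWatcher(ctx, rate.NewLimiter(rate.Limit(10), 10), stateStore, logger)
//	watcher.RegisterJobs([]structs.NamespacedID{{ID: "example", Namespace: structs.DefaultNamespace}})
//
//	for {
//		select {
//		case req := <-watcher.Drain():
//			// Commit the desired transitions for req.Allocs, then
//			// unblock the watcher with the commit index.
//			req.Resp.Respond(committedIndex, nil)
//		case allocs := <-watcher.Migrated():
//			// These allocations have stopped on their draining nodes;
//			// replacements can be scheduled elsewhere.
//			_ = allocs
//		case <-ctx.Done():
//			return
//		}
//	}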

// Drain returns the channel that emits allocations to drain.
func (w *drainingJobWatcher) Drain() <-chan *DrainRequest {
	return w.drainCh
}

// Migrated returns the channel that emits allocations for draining jobs that
// have been migrated.
func (w *drainingJobWatcher) Migrated() <-chan []*structs.Allocation {
	return w.migratedCh
}

// deregisterJob removes the job from being watched.
func (w *drainingJobWatcher) deregisterJob(jobID, namespace string) {
	w.l.Lock()
	defer w.l.Unlock()
	jns := structs.NamespacedID{
		ID:        jobID,
		Namespace: namespace,
	}
	delete(w.jobs, jns)
	w.logger.Trace("deregistering job", "job", jns)
}

// watch is the long-lived watching routine that detects job drain changes.
func (w *drainingJobWatcher) watch() {
	waitIndex := uint64(1)
	for {
		w.logger.Trace("getting job allocs at index", "index", waitIndex)
		jobAllocs, index, err := w.getJobAllocs(w.getQueryCtx(), waitIndex)
		w.logger.Trace("retrieved allocs for draining jobs", "num_allocs", len(jobAllocs), "index", index, "error", err)
		if err != nil {
			if err == context.Canceled {
				// Determine if it is a cancel or a shutdown
				select {
				case <-w.ctx.Done():
					w.logger.Trace("shutting down")
					return
				default:
					// The query context was cancelled;
					// reset index so we don't miss past
					// updates to newly registered jobs
					waitIndex = 1
					continue
				}
			}

			w.logger.Error("error watching job allocs updates at index", "index", waitIndex, "error", err)
			select {
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			case <-time.After(stateReadErrorDelay):
				continue
			}
		}

		lastHandled := waitIndex
		waitIndex = index

		// Snapshot the state store
		snap, err := w.state.Snapshot()
		if err != nil {
			w.logger.Warn("failed to snapshot statestore", "error", err)
			continue
		}

		currentJobs := w.drainingJobs()
		var allDrain, allMigrated []*structs.Allocation
		for jns, allocs := range jobAllocs {
			// Check if the job is still registered
			if _, ok := currentJobs[jns]; !ok {
				w.logger.Trace("skipping job as it is no longer registered for draining", "job", jns)
				continue
			}

			w.logger.Trace("handling job", "job", jns)

			// Lookup the job
			job, err := snap.JobByID(nil, jns.Namespace, jns.ID)
			if err != nil {
				w.logger.Warn("failed to lookup job", "job", jns, "error", err)
				continue
			}

			// Ignore purged jobs
			if job == nil {
				w.logger.Trace("ignoring garbage collected job", "job", jns)
				w.deregisterJob(jns.ID, jns.Namespace)
				continue
			}

			// Ignore any system jobs
			if job.Type == structs.JobTypeSystem {
				w.deregisterJob(job.ID, job.Namespace)
				continue
			}

			result, err := handleJob(snap, job, allocs, lastHandled)
			if err != nil {
				w.logger.Error("handling drain for job failed", "job", jns, "error", err)
				continue
			}

			w.logger.Trace("received result for job", "job", jns, "result", result)

			allDrain = append(allDrain, result.drain...)
			allMigrated = append(allMigrated, result.migrated...)

			// Stop tracking this job
			if result.done {
				w.deregisterJob(job.ID, job.Namespace)
			}
		}

		if len(allDrain) != 0 {
			// Create the request
			req := NewDrainRequest(allDrain)
			w.logger.Trace("sending drain request for allocs", "num_allocs", len(allDrain))

			select {
			case w.drainCh <- req:
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}

			// Wait for the request to be committed
			select {
			case <-req.Resp.WaitCh():
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}

			// See if it successfully committed
			if err := req.Resp.Error(); err != nil {
				w.logger.Error("failed to transition allocations", "error", err)
			}

			// Wait until the new index
			if index := req.Resp.Index(); index > waitIndex {
				waitIndex = index
			}
		}

		if len(allMigrated) != 0 {
			w.logger.Trace("sending migrated for allocs", "num_allocs", len(allMigrated))
			select {
			case w.migratedCh <- allMigrated:
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}
		}
	}
}

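// Worked example of the index bookkeeping in watch above (illustrative,
// made-up values): suppose a pass unblocks at index 120 after waiting at
// 100. Then lastHandled=100 and waitIndex=120, so handleJob only reports
// terminal allocations with ModifyIndex greater than 100 as migrated,
// avoiding duplicate notifications for allocations that stopped in earlier
// passes. If the drain request later commits at index 130, waitIndex is
// raised to 130 so the loop is not immediately woken by its own transition
// writes.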

// jobResult is the set of actions to take for a draining job given its current
// state.
type jobResult struct {
	// drain is the set of allocations to emit for draining.
	drain []*structs.Allocation

	// migrated is the set of allocations to emit as migrated
	migrated []*structs.Allocation

	// done marks whether the job has been fully drained.
	done bool
}

// newJobResult returns a jobResult with done=true. It is the responsibility of
// callers to set done=false when a remaining drainable alloc is found.
func newJobResult() *jobResult {
	return &jobResult{
		done: true,
	}
}

func (r *jobResult) String() string {
	return fmt.Sprintf("Drain %d ; Migrate %d ; Done %v", len(r.drain), len(r.migrated), r.done)
}

// handleJob takes the state of a draining job and returns the desired actions.
func handleJob(snap *state.StateSnapshot, job *structs.Job, allocs []*structs.Allocation, lastHandledIndex uint64) (*jobResult, error) {
	r := newJobResult()
	batch := job.Type == structs.JobTypeBatch
	taskGroups := make(map[string]*structs.TaskGroup, len(job.TaskGroups))
	for _, tg := range job.TaskGroups {
		// Only capture groups that have a migrate strategy, or all groups
		// when watching a batch job.
		if tg.Migrate != nil || batch {
			taskGroups[tg.Name] = tg
		}
	}

	// Sort the allocations by task group
	tgAllocs := make(map[string][]*structs.Allocation, len(taskGroups))
	for _, alloc := range allocs {
		if _, ok := taskGroups[alloc.TaskGroup]; !ok {
			continue
		}

		tgAllocs[alloc.TaskGroup] = append(tgAllocs[alloc.TaskGroup], alloc)
	}

	for name, tg := range taskGroups {
		allocs := tgAllocs[name]
		if err := handleTaskGroup(snap, batch, tg, allocs, lastHandledIndex, r); err != nil {
			return nil, fmt.Errorf("drain for task group %q failed: %v", name, err)
		}
	}

	return r, nil
}

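// Illustrative example of the filtering in handleJob above (not from the
// original source): for a service job with groups "api" (Migrate stanza set)
// and "cache" (no Migrate stanza), only the "api" allocations are considered;
// "cache" allocations are neither drained nor reported as migrated by this
// watcher. For a batch job every group is considered, but only migration
// notifications are emitted and the job stays tracked until those
// allocations stop.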

// handleTaskGroup takes the state of a draining task group and computes the
// desired actions. For batch jobs we only notify when they have been migrated
// and never mark them for drain. Batch jobs are allowed to complete up until
// the deadline, after which they are force killed.
func handleTaskGroup(snap *state.StateSnapshot, batch bool, tg *structs.TaskGroup,
	allocs []*structs.Allocation, lastHandledIndex uint64, result *jobResult) error {

	// Determine how many allocations can be drained
	drainingNodes := make(map[string]bool, 4)
	healthy := 0
	remainingDrainingAlloc := false
	var drainable []*structs.Allocation

	for _, alloc := range allocs {
		// Check if the alloc is on a draining node.
		onDrainingNode, ok := drainingNodes[alloc.NodeID]
		if !ok {
			// Look up the node
			node, err := snap.NodeByID(nil, alloc.NodeID)
			if err != nil {
				return err
			}

			// Check if the node exists and whether it has a drain strategy
			onDrainingNode = node != nil && node.DrainStrategy != nil
			drainingNodes[alloc.NodeID] = onDrainingNode
		}

		// Check if the alloc should be considered migrated. A migrated
		// allocation is one that is terminal, is on a draining node, and
		// was updated after our last handled index, so that we avoid
		// emitting many duplicate migrate events.
		if alloc.TerminalStatus() &&
			onDrainingNode &&
			alloc.ModifyIndex > lastHandledIndex {
			result.migrated = append(result.migrated, alloc)
			continue
		}

		// If the service alloc is running and has its deployment status set, it
		// is considered healthy from a migration standpoint.
		if !batch && !alloc.TerminalStatus() && alloc.DeploymentStatus.HasHealth() {
			healthy++
		}

		// An alloc can't be considered for migration if:
		// - It isn't on a draining node
		// - It is already terminal
		if !onDrainingNode || alloc.TerminalStatus() {
			continue
		}

		// Capture the fact that there is an allocation that is still draining
		// for this job.
		remainingDrainingAlloc = true

		// If we haven't marked this allocation for migration already, capture
		// it as eligible for draining.
		if !batch && !alloc.DesiredTransition.ShouldMigrate() {
			drainable = append(drainable, alloc)
		}
	}

	// Update the done status
	if remainingDrainingAlloc {
		result.done = false
	}

	// We don't mark batch allocations for drain, so exit.
	if batch {
		return nil
	}

	// Determine how many we can drain
	thresholdCount := tg.Count - tg.Migrate.MaxParallel
	numToDrain := healthy - thresholdCount
	numToDrain = helper.IntMin(len(drainable), numToDrain)
	if numToDrain <= 0 {
		return nil
	}

	result.drain = append(result.drain, drainable[0:numToDrain]...)
	return nil
}

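// Worked example of the drain throttling above (illustrative numbers): with
// tg.Count=10 and Migrate.MaxParallel=2 the threshold is 8, so drains are
// only emitted while at least 8 allocations are healthy. If 9 are healthy
// and 5 are drainable, numToDrain is min(5, 9-8) = 1; the intent is to keep
// no more than MaxParallel allocations migrating at a time.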

// getJobAllocs returns all allocations for draining jobs
func (w *drainingJobWatcher) getJobAllocs(ctx context.Context, minIndex uint64) (map[structs.NamespacedID][]*structs.Allocation, uint64, error) {
	if err := w.limiter.Wait(ctx); err != nil {
		return nil, 0, err
	}

	resp, index, err := w.state.BlockingQuery(w.getJobAllocsImpl, minIndex, ctx)
	if err != nil {
		return nil, 0, err
	}
	if resp == nil {
		return nil, index, nil
	}

	return resp.(map[structs.NamespacedID][]*structs.Allocation), index, nil
}

// getJobAllocsImpl returns a map of draining jobs to their allocations.
func (w *drainingJobWatcher) getJobAllocsImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	index, err := state.Index("allocs")
	if err != nil {
		return nil, 0, err
	}

	// Capture the draining jobs.
	draining := w.drainingJobs()
	l := len(draining)
	if l == 0 {
		return nil, index, nil
	}

	// Capture the allocs for each draining job.
	var maxIndex uint64 = 0
	resp := make(map[structs.NamespacedID][]*structs.Allocation, l)
	for jns := range draining {
		allocs, err := state.AllocsByJob(ws, jns.Namespace, jns.ID, false)
		if err != nil {
			return nil, index, err
		}

		resp[jns] = allocs
		for _, alloc := range allocs {
			if maxIndex < alloc.ModifyIndex {
				maxIndex = alloc.ModifyIndex
			}
		}
	}

	// Prefer using the actual max index of affected allocs since it means less
	// unblocking
	if maxIndex != 0 {
		index = maxIndex
	}

	return resp, index, nil
}

// drainingJobs captures the set of draining jobs.
func (w *drainingJobWatcher) drainingJobs() map[structs.NamespacedID]struct{} {
	w.l.RLock()
	defer w.l.RUnlock()

	l := len(w.jobs)
	if l == 0 {
		return nil
	}

	draining := make(map[structs.NamespacedID]struct{}, l)
	for k := range w.jobs {
		draining[k] = struct{}{}
	}

	return draining
}

// getQueryCtx is a helper for getting the query context.
func (w *drainingJobWatcher) getQueryCtx() context.Context {
	w.l.RLock()
	defer w.l.RUnlock()
	return w.queryCtx
}
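
// Illustrative note on the index preference in getJobAllocsImpl above (not
// part of the original source, made-up numbers): if the allocs table is at
// index 500 because of unrelated churn, but the newest allocation belonging
// to a draining job still has ModifyIndex 420, the query reports 420. A
// blocking query waiting at 420 therefore keeps blocking on its watch set
// instead of waking the watch loop for changes it does not care about.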