github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/nomad/drainer/watch_jobs.go

package drainer

import (
	"context"
	"fmt"
	"sync"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"golang.org/x/time/rate"
)

type DrainRequest struct {
	Allocs []*structs.Allocation
	Resp   *structs.BatchFuture
}

func NewDrainRequest(allocs []*structs.Allocation) *DrainRequest {
	return &DrainRequest{
		Allocs: allocs,
		Resp:   structs.NewBatchFuture(),
	}
}

// DrainingJobWatcher is the interface for watching a job drain
type DrainingJobWatcher interface {
	// RegisterJobs is used to start watching draining jobs
	RegisterJobs(jobs []structs.NamespacedID)

	// Drain is used to emit allocations that should be drained.
	Drain() <-chan *DrainRequest

	// Migrated is a channel of allocations for draining jobs that have
	// transitioned to stop. There is no guarantee that duplicates won't be
	// published.
	Migrated() <-chan []*structs.Allocation
}

// drainingJobWatcher is used to watch draining jobs and emit events when
// draining allocations have replacements
type drainingJobWatcher struct {
	ctx    context.Context
	logger log.Logger

	// state is the state that is watched for state changes.
	state *state.StateStore

	// limiter is used to limit the rate of blocking queries
	limiter *rate.Limiter

	// jobs is the set of tracked jobs.
	jobs map[structs.NamespacedID]struct{}

	// queryCtx is used to cancel a blocking query.
	queryCtx    context.Context
	queryCancel context.CancelFunc

	// drainCh and migratedCh are used to emit allocations
	drainCh    chan *DrainRequest
	migratedCh chan []*structs.Allocation

	l sync.RWMutex
}

// NewDrainingJobWatcher returns a new job watcher. The caller is expected to
// cancel the context to clean up the drainer.
func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger) *drainingJobWatcher {

	// Create a context that can cancel the blocking query so that when a new
	// job gets registered it is handled.
	queryCtx, queryCancel := context.WithCancel(ctx)

	w := &drainingJobWatcher{
		ctx:         ctx,
		queryCtx:    queryCtx,
		queryCancel: queryCancel,
		limiter:     limiter,
		logger:      logger.Named("job_watcher"),
		state:       state,
		jobs:        make(map[structs.NamespacedID]struct{}, 64),
		drainCh:     make(chan *DrainRequest),
		migratedCh:  make(chan []*structs.Allocation),
	}

	go w.watch()
	return w
}

// RegisterJobs marks the given jobs as draining and adds them to the watched
// set.
func (w *drainingJobWatcher) RegisterJobs(jobs []structs.NamespacedID) {
	w.l.Lock()
	defer w.l.Unlock()

	updated := false
	for _, jns := range jobs {
		if _, ok := w.jobs[jns]; ok {
			continue
		}

		// Add the job and cancel the context
		w.logger.Trace("registering job", "job", jns)
		w.jobs[jns] = struct{}{}
		updated = true
	}

	if updated {
		w.queryCancel()

		// Create a new query context
		w.queryCtx, w.queryCancel = context.WithCancel(w.ctx)
	}
}
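
// Illustrative usage sketch (not part of the original file): the node drainer
// constructs one watcher and registers jobs as their nodes begin draining.
// The rate-limit values, ctx, stateStore, and logger below are assumptions.
//
//	limiter := rate.NewLimiter(rate.Limit(100), 100)
//	watcher := NewDrainingJobWatcher(ctx, limiter, stateStore, logger)
//	watcher.RegisterJobs([]structs.NamespacedID{
//		{ID: "web", Namespace: structs.DefaultNamespace},
//	})
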
// Drain returns the channel that emits allocations to drain.
func (w *drainingJobWatcher) Drain() <-chan *DrainRequest {
	return w.drainCh
}

// Migrated returns the channel that emits allocations for draining jobs that
// have been migrated.
func (w *drainingJobWatcher) Migrated() <-chan []*structs.Allocation {
	return w.migratedCh
}

// deregisterJob removes the job from being watched.
func (w *drainingJobWatcher) deregisterJob(jobID, namespace string) {
	w.l.Lock()
	defer w.l.Unlock()
	jns := structs.NamespacedID{
		ID:        jobID,
		Namespace: namespace,
	}
	delete(w.jobs, jns)
	w.logger.Trace("deregistering job", "job", jns)
}

// watch is the long lived watching routine that detects job drain changes.
func (w *drainingJobWatcher) watch() {
	timer, stop := helper.NewSafeTimer(stateReadErrorDelay)
	defer stop()

	waitIndex := uint64(1)

	for {
		timer.Reset(stateReadErrorDelay)

		w.logger.Trace("getting job allocs at index", "index", waitIndex)
		jobAllocs, index, err := w.getJobAllocs(w.getQueryCtx(), waitIndex)

		if err != nil {
			if err == context.Canceled {
				// Determine if it is a cancel or a shutdown
				select {
				case <-w.ctx.Done():
					return
				default:
					// The query context was cancelled;
					// reset index so we don't miss past
					// updates to newly registered jobs
					waitIndex = 1
					continue
				}
			}

			w.logger.Error("error watching job allocs updates at index", "index", waitIndex, "error", err)
			select {
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			case <-timer.C:
				continue
			}
		}
		w.logger.Trace("retrieved allocs for draining jobs", "num_allocs", len(jobAllocs), "index", index)

		lastHandled := waitIndex
		waitIndex = index

		// Snapshot the state store
		snap, err := w.state.Snapshot()
		if err != nil {
			w.logger.Warn("failed to snapshot statestore", "error", err)
			continue
		}

		currentJobs := w.drainingJobs()
		var allDrain, allMigrated []*structs.Allocation
		for jns, allocs := range jobAllocs {
			// Check if the job is still registered
			if _, ok := currentJobs[jns]; !ok {
				w.logger.Trace("skipping job as it is no longer registered for draining", "job", jns)
				continue
			}

			w.logger.Trace("handling job", "job", jns)

			// Lookup the job
			job, err := snap.JobByID(nil, jns.Namespace, jns.ID)
			if err != nil {
				w.logger.Warn("failed to lookup job", "job", jns, "error", err)
				continue
			}

			// Ignore purged jobs
			if job == nil {
				w.logger.Trace("ignoring garbage collected job", "job", jns)
				w.deregisterJob(jns.ID, jns.Namespace)
				continue
			}

			// Ignore any system jobs
			if job.Type == structs.JobTypeSystem {
				w.deregisterJob(job.ID, job.Namespace)
				continue
			}

			result, err := handleJob(snap, job, allocs, lastHandled)
			if err != nil {
				w.logger.Error("handling drain for job failed", "job", jns, "error", err)
				continue
			}

			w.logger.Trace("received result for job", "job", jns, "result", result)

			allDrain = append(allDrain, result.drain...)
			allMigrated = append(allMigrated, result.migrated...)

			// Stop tracking this job
			if result.done {
				w.deregisterJob(job.ID, job.Namespace)
			}
		}

		if len(allDrain) != 0 {
			// Create the request
			req := NewDrainRequest(allDrain)
			w.logger.Trace("sending drain request for allocs", "num_allocs", len(allDrain))

			select {
			case w.drainCh <- req:
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}

			// Wait for the request to be committed
			select {
			case <-req.Resp.WaitCh():
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}

			// See if it successfully committed
			if err := req.Resp.Error(); err != nil {
				w.logger.Error("failed to transition allocations", "error", err)
			}

			// Wait until the new index
			if index := req.Resp.Index(); index > waitIndex {
				waitIndex = index
			}
		}

		if len(allMigrated) != 0 {
			w.logger.Trace("sending migrated for allocs", "num_allocs", len(allMigrated))
			select {
			case w.migratedCh <- allMigrated:
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}
		}
	}
}
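
// Illustrative consumer sketch (an assumption about the caller, not code from
// this file): the drainer reads drain requests, applies the desired
// transitions, and acknowledges each batch via the BatchFuture so watch() can
// resume from the committed index. applyMigrations and onMigrated are
// hypothetical helpers.
//
//	for {
//		select {
//		case req := <-watcher.Drain():
//			index, err := applyMigrations(req.Allocs)
//			req.Resp.Respond(index, err)
//		case allocs := <-watcher.Migrated():
//			onMigrated(allocs)
//		case <-ctx.Done():
//			return
//		}
//	}
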
// jobResult is the set of actions to take for a draining job given its current
// state.
type jobResult struct {
	// drain is the set of allocations to emit for draining.
	drain []*structs.Allocation

	// migrated is the set of allocations to emit as migrated.
	migrated []*structs.Allocation

	// done marks whether the job has been fully drained.
	done bool
}

// newJobResult returns a jobResult with done=true. It is the responsibility of
// callers to set done=false when a remaining drainable alloc is found.
func newJobResult() *jobResult {
	return &jobResult{
		done: true,
	}
}

func (r *jobResult) String() string {
	return fmt.Sprintf("Drain %d ; Migrate %d ; Done %v", len(r.drain), len(r.migrated), r.done)
}

// handleJob takes the state of a draining job and returns the desired actions.
func handleJob(snap *state.StateSnapshot, job *structs.Job, allocs []*structs.Allocation, lastHandledIndex uint64) (*jobResult, error) {
	r := newJobResult()
	batch := job.Type == structs.JobTypeBatch
	taskGroups := make(map[string]*structs.TaskGroup, len(job.TaskGroups))
	for _, tg := range job.TaskGroups {
		// Only capture the groups that have a migrate strategy or, for batch
		// jobs, all groups.
		if tg.Migrate != nil || batch {
			taskGroups[tg.Name] = tg
		}
	}

	// Sort the allocations by task group
	tgAllocs := make(map[string][]*structs.Allocation, len(taskGroups))
	for _, alloc := range allocs {
		if _, ok := taskGroups[alloc.TaskGroup]; !ok {
			continue
		}

		tgAllocs[alloc.TaskGroup] = append(tgAllocs[alloc.TaskGroup], alloc)
	}

	for name, tg := range taskGroups {
		allocs := tgAllocs[name]
		if err := handleTaskGroup(snap, batch, tg, allocs, lastHandledIndex, r); err != nil {
			return nil, fmt.Errorf("drain for task group %q failed: %v", name, err)
		}
	}

	return r, nil
}
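
// Illustrative sketch (assumed test-style invocation, not from the original
// file): handleJob can be exercised directly against a state snapshot; the
// stateStore, job, and allocation fixtures here are assumptions.
//
//	snap, _ := stateStore.Snapshot()
//	res, err := handleJob(snap, job, allocs, 0)
//	if err == nil {
//		fmt.Println(res) // e.g. "Drain 2 ; Migrate 1 ; Done false"
//	}
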
// handleTaskGroup takes the state of a draining task group and computes the
// desired actions. For batch jobs we only notify when they have been migrated
// and never mark them for drain. Batch jobs are allowed to complete up until
// the deadline, after which they are force killed.
func handleTaskGroup(snap *state.StateSnapshot, batch bool, tg *structs.TaskGroup,
	allocs []*structs.Allocation, lastHandledIndex uint64, result *jobResult) error {

	// Determine how many allocations can be drained
	drainingNodes := make(map[string]bool, 4)
	healthy := 0
	remainingDrainingAlloc := false
	var drainable []*structs.Allocation

	for _, alloc := range allocs {
		// Check if the alloc is on a draining node.
		onDrainingNode, ok := drainingNodes[alloc.NodeID]
		if !ok {
			// Look up the node
			node, err := snap.NodeByID(nil, alloc.NodeID)
			if err != nil {
				return err
			}

			// Check if the node exists and whether it has a drain strategy
			onDrainingNode = node != nil && node.DrainStrategy != nil
			drainingNodes[alloc.NodeID] = onDrainingNode
		}

		// Check if the alloc should be considered migrated. A migrated
		// allocation is one that is terminal, is on a draining node, and was
		// modified since our last handled index, so that we avoid emitting
		// many duplicate migrate events.
		if alloc.TerminalStatus() &&
			onDrainingNode &&
			alloc.ModifyIndex > lastHandledIndex {
			result.migrated = append(result.migrated, alloc)
			continue
		}

		// If the service alloc is running and has its deployment status set,
		// it is considered healthy from a migration standpoint.
		if !batch && !alloc.TerminalStatus() && alloc.DeploymentStatus.HasHealth() {
			healthy++
		}

		// An alloc can't be considered for migration if:
		// - It isn't on a draining node
		// - It is already terminal
		if !onDrainingNode || alloc.TerminalStatus() {
			continue
		}

		// Capture the fact that there is an allocation that is still draining
		// for this job.
		remainingDrainingAlloc = true

		// If we haven't marked this allocation for migration already, capture
		// it as eligible for draining.
		if !batch && !alloc.DesiredTransition.ShouldMigrate() {
			drainable = append(drainable, alloc)
		}
	}

	// Update the done status
	if remainingDrainingAlloc {
		result.done = false
	}

	// We don't mark batch allocations for drain, so exit
	if batch {
		return nil
	}

	// Determine how many we can drain
	thresholdCount := tg.Count - tg.Migrate.MaxParallel
	numToDrain := healthy - thresholdCount
	numToDrain = helper.Min(len(drainable), numToDrain)
	if numToDrain <= 0 {
		return nil
	}

	result.drain = append(result.drain, drainable[0:numToDrain]...)
	return nil
}
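
// Worked example (illustrative, values assumed): for a group with Count=5 and
// Migrate.MaxParallel=2, the threshold is 5-2=3. With 4 healthy replacement
// allocs and 3 drainable allocs on draining nodes:
//
//	thresholdCount := 5 - 2                // 3
//	numToDrain := 4 - thresholdCount       // healthy - threshold = 1
//	numToDrain = helper.Min(3, numToDrain) // len(drainable) does not cap it; drain 1
//
// so only one allocation is marked for migration on this pass; the rest wait
// until more replacements become healthy.
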
// getJobAllocs returns all allocations for draining jobs
func (w *drainingJobWatcher) getJobAllocs(ctx context.Context, minIndex uint64) (map[structs.NamespacedID][]*structs.Allocation, uint64, error) {
	if err := w.limiter.Wait(ctx); err != nil {
		return nil, 0, err
	}

	resp, index, err := w.state.BlockingQuery(w.getJobAllocsImpl, minIndex, ctx)
	if err != nil {
		return nil, 0, err
	}
	if resp == nil {
		return nil, index, nil
	}

	return resp.(map[structs.NamespacedID][]*structs.Allocation), index, nil
}

// getJobAllocsImpl returns a map of draining jobs to their allocations.
func (w *drainingJobWatcher) getJobAllocsImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	index, err := state.Index("allocs")
	if err != nil {
		return nil, 0, err
	}

	// Capture the draining jobs.
	draining := w.drainingJobs()
	l := len(draining)
	if l == 0 {
		return nil, index, nil
	}

	// Capture the allocs for each draining job.
	var maxIndex uint64 = 0
	resp := make(map[structs.NamespacedID][]*structs.Allocation, l)
	for jns := range draining {
		allocs, err := state.AllocsByJob(ws, jns.Namespace, jns.ID, false)
		if err != nil {
			return nil, index, err
		}

		resp[jns] = allocs
		for _, alloc := range allocs {
			if maxIndex < alloc.ModifyIndex {
				maxIndex = alloc.ModifyIndex
			}
		}
	}

	// Prefer using the actual max index of affected allocs since it means less
	// unblocking
	if maxIndex != 0 {
		index = maxIndex
	}

	return resp, index, nil
}

// drainingJobs captures the set of draining jobs.
func (w *drainingJobWatcher) drainingJobs() map[structs.NamespacedID]struct{} {
	w.l.RLock()
	defer w.l.RUnlock()

	l := len(w.jobs)
	if l == 0 {
		return nil
	}

	draining := make(map[structs.NamespacedID]struct{}, l)
	for k := range w.jobs {
		draining[k] = struct{}{}
	}

	return draining
}

// getQueryCtx is a helper for getting the query context.
func (w *drainingJobWatcher) getQueryCtx() context.Context {
	w.l.RLock()
	defer w.l.RUnlock()
	return w.queryCtx
}
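
// Lifecycle note with an illustrative sketch (not part of the original file):
// there is no explicit Stop method; per the NewDrainingJobWatcher comment, the
// caller shuts the watcher down by cancelling the context it was created with,
// which also aborts any in-flight blocking query. The limiter, stateStore, and
// logger values are assumptions.
//
//	ctx, cancel := context.WithCancel(context.Background())
//	watcher := NewDrainingJobWatcher(ctx, limiter, stateStore, logger)
//	defer cancel() // stops the watch loop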