github.com/hernad/nomad@v1.6.112/nomad/drainer/watch_jobs.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package drainer

import (
	"context"
	"fmt"
	"sync"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"

	"github.com/hernad/nomad/helper"
	"github.com/hernad/nomad/nomad/state"
	"github.com/hernad/nomad/nomad/structs"
	"golang.org/x/time/rate"
)

type DrainRequest struct {
	Allocs []*structs.Allocation
	Resp   *structs.BatchFuture
}

func NewDrainRequest(allocs []*structs.Allocation) *DrainRequest {
	return &DrainRequest{
		Allocs: allocs,
		Resp:   structs.NewBatchFuture(),
	}
}

// DrainingJobWatcher is the interface for watching a job drain
type DrainingJobWatcher interface {
	// RegisterJobs is used to start watching the given draining jobs.
	RegisterJobs(jobs []structs.NamespacedID)

	// Drain is used to emit allocations that should be drained.
	Drain() <-chan *DrainRequest

	// Migrated returns allocations for draining jobs that have transitioned
	// to stop. There is no guarantee that duplicates won't be published.
	Migrated() <-chan []*structs.Allocation
}

// drainingJobWatcher is used to watch draining jobs and emit events when
// draining allocations have replacements
type drainingJobWatcher struct {
	ctx    context.Context
	logger log.Logger

	// state is the state store that is watched for changes.
	state *state.StateStore

	// limiter is used to limit the rate of blocking queries
	limiter *rate.Limiter

	// jobs is the set of tracked jobs.
	jobs map[structs.NamespacedID]struct{}

	// queryCtx is used to cancel a blocking query.
	queryCtx    context.Context
	queryCancel context.CancelFunc

	// drainCh and migratedCh are used to emit allocations
	drainCh    chan *DrainRequest
	migratedCh chan []*structs.Allocation

	l sync.RWMutex
}

// NewDrainingJobWatcher returns a new job watcher. The caller is expected to
// cancel the context to clean up the drainer.
func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger) *drainingJobWatcher {

	// Create a context that can cancel the blocking query so that when a new
	// job gets registered it is handled.
	queryCtx, queryCancel := context.WithCancel(ctx)

	w := &drainingJobWatcher{
		ctx:         ctx,
		queryCtx:    queryCtx,
		queryCancel: queryCancel,
		limiter:     limiter,
		logger:      logger.Named("job_watcher"),
		state:       state,
		jobs:        make(map[structs.NamespacedID]struct{}, 64),
		drainCh:     make(chan *DrainRequest),
		migratedCh:  make(chan []*structs.Allocation),
	}

	go w.watch()
	return w
}

// RegisterJobs marks the given jobs as draining and adds them to being
// watched.
func (w *drainingJobWatcher) RegisterJobs(jobs []structs.NamespacedID) {
	w.l.Lock()
	defer w.l.Unlock()

	updated := false
	for _, jns := range jobs {
		if _, ok := w.jobs[jns]; ok {
			continue
		}

		// Add the job and cancel the context
		w.logger.Trace("registering job", "job", jns)
		w.jobs[jns] = struct{}{}
		updated = true
	}

	if updated {
		w.queryCancel()

		// Create a new query context
		w.queryCtx, w.queryCancel = context.WithCancel(w.ctx)
	}
}
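
// exampleWatcherLoop is an illustrative sketch, not part of the production
// drainer (the NodeDrainer is the real consumer of these channels). It shows
// the expected handshake: register draining jobs, receive drain requests,
// apply the allocation transitions, and acknowledge them through the
// BatchFuture so the watcher can resume its blocking query at the commit
// index. The applyTransitions callback is hypothetical.
func exampleWatcherLoop(ctx context.Context, w DrainingJobWatcher, jobs []structs.NamespacedID,
	applyTransitions func([]*structs.Allocation) (uint64, error)) {

	w.RegisterJobs(jobs)

	for {
		select {
		case req := <-w.Drain():
			// Commit the desired transitions (for example, marking the
			// allocations for migration), then respond with the resulting
			// index and any error.
			index, err := applyTransitions(req.Allocs)
			req.Resp.Respond(index, err)

		case migrated := <-w.Migrated():
			// Duplicates are possible; treat this as a best-effort signal
			// that the draining allocations have stopped.
			_ = migrated

		case <-ctx.Done():
			return
		}
	}
}
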
// Drain returns the channel that emits allocations to drain.
func (w *drainingJobWatcher) Drain() <-chan *DrainRequest {
	return w.drainCh
}

// Migrated returns the channel that emits allocations for draining jobs that
// have been migrated.
func (w *drainingJobWatcher) Migrated() <-chan []*structs.Allocation {
	return w.migratedCh
}
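
// Both drainCh and migratedCh are unbuffered, so the watch loop blocks on each
// send until the consumer receives it or the watcher's context is cancelled;
// callers are expected to service Drain and Migrated continuously.
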
// deregisterJob removes the job from being watched.
func (w *drainingJobWatcher) deregisterJob(jobID, namespace string) {
	w.l.Lock()
	defer w.l.Unlock()
	jns := structs.NamespacedID{
		ID:        jobID,
		Namespace: namespace,
	}
	delete(w.jobs, jns)
	w.logger.Trace("deregistering job", "job", jns)
}

// watch is the long-lived watching routine that detects job drain changes.
func (w *drainingJobWatcher) watch() {
	timer, stop := helper.NewSafeTimer(stateReadErrorDelay)
	defer stop()

	waitIndex := uint64(1)

	for {
		timer.Reset(stateReadErrorDelay)

		w.logger.Trace("getting job allocs at index", "index", waitIndex)
		jobAllocs, index, err := w.getJobAllocs(w.getQueryCtx(), waitIndex)

		if err != nil {
			if err == context.Canceled {
				// Determine if it is a cancel or a shutdown
				select {
				case <-w.ctx.Done():
					return
				default:
					// The query context was cancelled;
					// reset index so we don't miss past
					// updates to newly registered jobs
					waitIndex = 1
					continue
				}
			}

			w.logger.Error("error watching job allocs updates at index", "index", waitIndex, "error", err)
			select {
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			case <-timer.C:
				continue
			}
		}
		w.logger.Trace("retrieved allocs for draining jobs", "num_allocs", len(jobAllocs), "index", index)

		lastHandled := waitIndex
		waitIndex = index

		// Snapshot the state store
		snap, err := w.state.Snapshot()
		if err != nil {
			w.logger.Warn("failed to snapshot statestore", "error", err)
			continue
		}

		currentJobs := w.drainingJobs()
		var allDrain, allMigrated []*structs.Allocation
		for jns, allocs := range jobAllocs {
			// Check if the job is still registered
			if _, ok := currentJobs[jns]; !ok {
				w.logger.Trace("skipping job as it is no longer registered for draining", "job", jns)
				continue
			}

			w.logger.Trace("handling job", "job", jns)

			// Lookup the job
			job, err := snap.JobByID(nil, jns.Namespace, jns.ID)
			if err != nil {
				w.logger.Warn("failed to lookup job", "job", jns, "error", err)
				continue
			}

			// Ignore purged jobs
			if job == nil {
				w.logger.Trace("ignoring garbage collected job", "job", jns)
				w.deregisterJob(jns.ID, jns.Namespace)
				continue
			}

			// Ignore any system jobs
			if job.Type == structs.JobTypeSystem {
				w.deregisterJob(job.ID, job.Namespace)
				continue
			}

			result, err := handleJob(snap, job, allocs, lastHandled)
			if err != nil {
				w.logger.Error("handling drain for job failed", "job", jns, "error", err)
				continue
			}

			w.logger.Trace("received result for job", "job", jns, "result", result)

			allDrain = append(allDrain, result.drain...)
			allMigrated = append(allMigrated, result.migrated...)

			// Stop tracking this job
			if result.done {
				w.deregisterJob(job.ID, job.Namespace)
			}
		}

		if len(allDrain) != 0 {
			// Create the request
			req := NewDrainRequest(allDrain)
			w.logger.Trace("sending drain request for allocs", "num_allocs", len(allDrain))

			select {
			case w.drainCh <- req:
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}

			// Wait for the request to be committed
			select {
			case <-req.Resp.WaitCh():
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}

			// See if it successfully committed
			if err := req.Resp.Error(); err != nil {
				w.logger.Error("failed to transition allocations", "error", err)
			}

			// Wait until the new index
			if index := req.Resp.Index(); index > waitIndex {
				waitIndex = index
			}
		}

		if len(allMigrated) != 0 {
			w.logger.Trace("sending migrated for allocs", "num_allocs", len(allMigrated))
			select {
			case w.migratedCh <- allMigrated:
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}
		}
	}
}

// jobResult is the set of actions to take for a draining job given its current
// state.
type jobResult struct {
	// drain is the set of allocations to emit for draining.
	drain []*structs.Allocation

	// migrated is the set of allocations to emit as migrated
	migrated []*structs.Allocation

	// done marks whether the job has been fully drained.
	done bool
}

// newJobResult returns a jobResult with done=true. It is the responsibility of
// callers to set done=false when a remaining drainable alloc is found.
func newJobResult() *jobResult {
	return &jobResult{
		done: true,
	}
}

func (r *jobResult) String() string {
	return fmt.Sprintf("Drain %d ; Migrate %d ; Done %v", len(r.drain), len(r.migrated), r.done)
}

// handleJob takes the state of a draining job and returns the desired actions.
func handleJob(snap *state.StateSnapshot, job *structs.Job, allocs []*structs.Allocation, lastHandledIndex uint64) (*jobResult, error) {
	r := newJobResult()
	batch := job.Type == structs.JobTypeBatch
	taskGroups := make(map[string]*structs.TaskGroup, len(job.TaskGroups))
	for _, tg := range job.TaskGroups {
		// Only capture groups that have a migrate strategy; for batch jobs
		// capture every group since we only watch them for migrations.
		if tg.Migrate != nil || batch {
			taskGroups[tg.Name] = tg
		}
	}

	// Group the allocations by task group
	tgAllocs := make(map[string][]*structs.Allocation, len(taskGroups))
	for _, alloc := range allocs {
		if _, ok := taskGroups[alloc.TaskGroup]; !ok {
			continue
		}

		tgAllocs[alloc.TaskGroup] = append(tgAllocs[alloc.TaskGroup], alloc)
	}

	for name, tg := range taskGroups {
		allocs := tgAllocs[name]
		if err := handleTaskGroup(snap, batch, tg, allocs, lastHandledIndex, r); err != nil {
			return nil, fmt.Errorf("drain for task group %q failed: %v", name, err)
		}
	}

	return r, nil
}
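
// To make the batch/service split concrete (an illustrative summary of the
// logic in handleTaskGroup below): for a batch job, allocations are never
// added to result.drain; client-terminal allocations on draining nodes are
// reported as migrated, and still-running ones only keep done=false until they
// complete or the drain deadline force-stops them. For a service job the same
// still-running allocations also become candidates for draining, limited by
// the group's migrate max_parallel.
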
// handleTaskGroup takes the state of a draining task group and computes the
// desired actions. For batch jobs we only notify when they have been migrated
// and never mark them for drain. Batch jobs are allowed to complete up until
// the deadline, after which they are force killed.
func handleTaskGroup(snap *state.StateSnapshot, batch bool, tg *structs.TaskGroup,
	allocs []*structs.Allocation, lastHandledIndex uint64, result *jobResult) error {

	// Determine how many allocations can be drained
	drainingNodes := make(map[string]bool, 4)
	healthy := 0
	remainingDrainingAlloc := false
	var drainable []*structs.Allocation

	for _, alloc := range allocs {
		// Check if the alloc is on a draining node.
		onDrainingNode, ok := drainingNodes[alloc.NodeID]
		if !ok {
			// Look up the node
			node, err := snap.NodeByID(nil, alloc.NodeID)
			if err != nil {
				return err
			}

			// Check if the node exists and whether it has a drain strategy
			onDrainingNode = node != nil && node.DrainStrategy != nil
			drainingNodes[alloc.NodeID] = onDrainingNode
		}

		// Check if the alloc should be considered migrated. A migrated
		// allocation is one that is terminal on the client, is on a draining
		// node, and has been updated since our last handled index to avoid
		// emitting many duplicate migrate events.
		if alloc.ClientTerminalStatus() &&
			onDrainingNode &&
			alloc.ModifyIndex > lastHandledIndex {
			result.migrated = append(result.migrated, alloc)
			continue
		}

		// If the service alloc is running and has its deployment health set,
		// it is considered healthy from a migration standpoint.
		if !batch && !alloc.TerminalStatus() && alloc.DeploymentStatus.HasHealth() {
			healthy++
		}

		// An alloc can't be considered for migration if:
		// - It isn't on a draining node
		// - It is already terminal on the client
		if !onDrainingNode || alloc.ClientTerminalStatus() {
			continue
		}

		// Capture the fact that there is an allocation that is still draining
		// for this job.
		remainingDrainingAlloc = true

		// If we haven't marked this allocation for migration already, capture
		// it as eligible for draining.
		if !batch && !alloc.DesiredTransition.ShouldMigrate() {
			drainable = append(drainable, alloc)
		}
	}

	// Update the done status
	if remainingDrainingAlloc {
		result.done = false
	}

	// We don't mark batch for drain so exit
	if batch {
		return nil
	}
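
	// Worked example (illustrative): with tg.Count=5 and
	// tg.Migrate.MaxParallel=2, thresholdCount is 3, so allocations are only
	// drained while more than 3 healthy allocations exist. If healthy is 5,
	// numToDrain is 2 (capped by the number of drainable allocations); if
	// healthy is 3, nothing is drained until replacements become healthy.
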
	// Determine how many we can drain
	thresholdCount := tg.Count - tg.Migrate.MaxParallel
	numToDrain := healthy - thresholdCount
	numToDrain = helper.Min(len(drainable), numToDrain)
	if numToDrain <= 0 {
		return nil
	}

	result.drain = append(result.drain, drainable[0:numToDrain]...)
	return nil
}

// getJobAllocs returns all allocations for draining jobs
func (w *drainingJobWatcher) getJobAllocs(ctx context.Context, minIndex uint64) (map[structs.NamespacedID][]*structs.Allocation, uint64, error) {
	if err := w.limiter.Wait(ctx); err != nil {
		return nil, 0, err
	}

	resp, index, err := w.state.BlockingQuery(w.getJobAllocsImpl, minIndex, ctx)
	if err != nil {
		return nil, 0, err
	}
	if resp == nil {
		return nil, index, nil
	}

	return resp.(map[structs.NamespacedID][]*structs.Allocation), index, nil
}

// getJobAllocsImpl returns a map of draining jobs to their allocations.
func (w *drainingJobWatcher) getJobAllocsImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	index, err := state.Index("allocs")
	if err != nil {
		return nil, 0, err
	}

	// Capture the draining jobs.
	draining := w.drainingJobs()
	l := len(draining)
	if l == 0 {
		return nil, index, nil
	}

	// Capture the allocs for each draining job.
	var maxIndex uint64 = 0
	resp := make(map[structs.NamespacedID][]*structs.Allocation, l)
	for jns := range draining {
		allocs, err := state.AllocsByJob(ws, jns.Namespace, jns.ID, false)
		if err != nil {
			return nil, index, err
		}

		resp[jns] = allocs
		for _, alloc := range allocs {
			if maxIndex < alloc.ModifyIndex {
				maxIndex = alloc.ModifyIndex
			}
		}
	}

	// Prefer using the actual max index of affected allocs since it means less
	// unblocking
	if maxIndex != 0 {
		index = maxIndex
	}

	return resp, index, nil
}

// drainingJobs captures the set of draining jobs.
func (w *drainingJobWatcher) drainingJobs() map[structs.NamespacedID]struct{} {
	w.l.RLock()
	defer w.l.RUnlock()

	l := len(w.jobs)
	if l == 0 {
		return nil
	}

	draining := make(map[structs.NamespacedID]struct{}, l)
	for k := range w.jobs {
		draining[k] = struct{}{}
	}

	return draining
}

// getQueryCtx is a helper for getting the query context.
func (w *drainingJobWatcher) getQueryCtx() context.Context {
	w.l.RLock()
	defer w.l.RUnlock()
	return w.queryCtx
}
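
// exampleWatcherSetup is an illustrative sketch (the production drainer wires
// this up through its watcher factory) showing how the watcher is constructed
// and torn down: the rate limiter bounds how often the blocking query in watch
// is restarted, and calling the returned cancel function stops the watcher.
// The limiter values here are arbitrary.
func exampleWatcherSetup(store *state.StateStore, logger log.Logger) (DrainingJobWatcher, context.CancelFunc) {
	ctx, cancel := context.WithCancel(context.Background())
	limiter := rate.NewLimiter(rate.Limit(100), 100)
	return NewDrainingJobWatcher(ctx, limiter, store, logger), cancel
}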