github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/nomad/drainer/watch_jobs.go

package drainer

import (
	"context"
	"fmt"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"golang.org/x/time/rate"
)

type DrainRequest struct {
	Allocs []*structs.Allocation
	Resp   *structs.BatchFuture
}

func NewDrainRequest(allocs []*structs.Allocation) *DrainRequest {
	return &DrainRequest{
		Allocs: allocs,
		Resp:   structs.NewBatchFuture(),
	}
}

// DrainingJobWatcher is the interface for watching a job drain
type DrainingJobWatcher interface {
	// RegisterJobs is used to start watching the given draining jobs
	RegisterJobs(jobs []structs.NamespacedID)

	// Drain is used to emit allocations that should be drained.
	Drain() <-chan *DrainRequest

	// Migrated emits allocations for draining jobs that have transitioned to
	// stop. There is no guarantee that duplicates won't be published.
	Migrated() <-chan []*structs.Allocation
}

// drainingJobWatcher is used to watch draining jobs and emit events when
// draining allocations have replacements
type drainingJobWatcher struct {
	ctx    context.Context
	logger log.Logger

	// state is the state that is watched for state changes.
	state *state.StateStore

	// limiter is used to limit the rate of blocking queries
	limiter *rate.Limiter

	// jobs is the set of tracked jobs.
	jobs map[structs.NamespacedID]struct{}

	// queryCtx is used to cancel a blocking query.
	queryCtx    context.Context
	queryCancel context.CancelFunc

	// drainCh and migratedCh are used to emit allocations
	drainCh    chan *DrainRequest
	migratedCh chan []*structs.Allocation

	l sync.RWMutex
}

// NewDrainingJobWatcher returns a new job watcher. The caller is expected to
// cancel the context to clean up the drainer.
func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger) *drainingJobWatcher {

	// Create a context that can cancel the blocking query so that when a new
	// job gets registered it is handled.
	queryCtx, queryCancel := context.WithCancel(ctx)

	w := &drainingJobWatcher{
		ctx:         ctx,
		queryCtx:    queryCtx,
		queryCancel: queryCancel,
		limiter:     limiter,
		logger:      logger.Named("job_watcher"),
		state:       state,
		jobs:        make(map[structs.NamespacedID]struct{}, 64),
		drainCh:     make(chan *DrainRequest),
		migratedCh:  make(chan []*structs.Allocation),
	}

	go w.watch()
	return w
}

// RegisterJobs marks the given jobs as draining and adds them to being
// watched.
func (w *drainingJobWatcher) RegisterJobs(jobs []structs.NamespacedID) {
	w.l.Lock()
	defer w.l.Unlock()

	updated := false
	for _, jns := range jobs {
		if _, ok := w.jobs[jns]; ok {
			continue
		}

		// Add the job and cancel the context
		w.logger.Trace("registering job", "job", jns)
		w.jobs[jns] = struct{}{}
		updated = true
	}

	if updated {
		w.queryCancel()

		// Create a new query context
		w.queryCtx, w.queryCancel = context.WithCancel(w.ctx)
	}
}
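
// Illustrative sketch, not part of the upstream file: a caller such as the
// node drainer might construct and feed the watcher roughly as follows, given
// a ctx, state store, and logger, and assuming a hypothetical helper
// jobsOnDrainingNodes that collects the namespaced IDs of jobs with
// allocations on draining nodes:
//
//	limiter := rate.NewLimiter(100.0, 100)
//	watcher := NewDrainingJobWatcher(ctx, limiter, stateStore, logger)
//	watcher.RegisterJobs(jobsOnDrainingNodes())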

// Drain returns the channel that emits allocations to drain.
func (w *drainingJobWatcher) Drain() <-chan *DrainRequest {
	return w.drainCh
}

// Migrated returns the channel that emits allocations for draining jobs that
// have been migrated.
func (w *drainingJobWatcher) Migrated() <-chan []*structs.Allocation {
	return w.migratedCh
}

// deregisterJob removes the job from being watched.
func (w *drainingJobWatcher) deregisterJob(jobID, namespace string) {
	w.l.Lock()
	defer w.l.Unlock()
	jns := structs.NamespacedID{
		ID:        jobID,
		Namespace: namespace,
	}
	delete(w.jobs, jns)
	w.logger.Trace("deregistering job", "job", jns)
}

// watch is the long lived watching routine that detects job drain changes.
func (w *drainingJobWatcher) watch() {
	waitIndex := uint64(1)
	for {
		w.logger.Trace("getting job allocs at index", "index", waitIndex)
		jobAllocs, index, err := w.getJobAllocs(w.getQueryCtx(), waitIndex)
		if err != nil {
			if err == context.Canceled {
				// Determine if it is a cancel or a shutdown
				select {
				case <-w.ctx.Done():
					return
				default:
					// The query context was cancelled;
					// reset index so we don't miss past
					// updates to newly registered jobs
					waitIndex = 1
					continue
				}
			}

			w.logger.Error("error watching job allocs updates at index", "index", waitIndex, "error", err)
			select {
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			case <-time.After(stateReadErrorDelay):
				continue
			}
		}
		w.logger.Trace("retrieved allocs for draining jobs", "num_allocs", len(jobAllocs), "index", index)

		lastHandled := waitIndex
		waitIndex = index

		// Snapshot the state store
		snap, err := w.state.Snapshot()
		if err != nil {
			w.logger.Warn("failed to snapshot statestore", "error", err)
			continue
		}

		currentJobs := w.drainingJobs()
		var allDrain, allMigrated []*structs.Allocation
		for jns, allocs := range jobAllocs {
			// Check if the job is still registered
			if _, ok := currentJobs[jns]; !ok {
				w.logger.Trace("skipping job as it is no longer registered for draining", "job", jns)
				continue
			}

			w.logger.Trace("handling job", "job", jns)

			// Lookup the job
			job, err := snap.JobByID(nil, jns.Namespace, jns.ID)
			if err != nil {
				w.logger.Warn("failed to lookup job", "job", jns, "error", err)
				continue
			}

			// Ignore purged jobs
			if job == nil {
				w.logger.Trace("ignoring garbage collected job", "job", jns)
				w.deregisterJob(jns.ID, jns.Namespace)
				continue
			}

			// Ignore any system jobs
			if job.Type == structs.JobTypeSystem {
				w.deregisterJob(job.ID, job.Namespace)
				continue
			}

			result, err := handleJob(snap, job, allocs, lastHandled)
			if err != nil {
				w.logger.Error("handling drain for job failed", "job", jns, "error", err)
				continue
			}

			w.logger.Trace("received result for job", "job", jns, "result", result)

			allDrain = append(allDrain, result.drain...)
			allMigrated = append(allMigrated, result.migrated...)

			// Stop tracking this job
			if result.done {
				w.deregisterJob(job.ID, job.Namespace)
			}
		}

		if len(allDrain) != 0 {
			// Create the request
			req := NewDrainRequest(allDrain)
			w.logger.Trace("sending drain request for allocs", "num_allocs", len(allDrain))

			select {
			case w.drainCh <- req:
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}

			// Wait for the request to be committed
			select {
			case <-req.Resp.WaitCh():
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}

			// See if it successfully committed
			if err := req.Resp.Error(); err != nil {
				w.logger.Error("failed to transition allocations", "error", err)
			}

			// Wait until the new index
			if index := req.Resp.Index(); index > waitIndex {
				waitIndex = index
			}
		}

		if len(allMigrated) != 0 {
			w.logger.Trace("sending migrated for allocs", "num_allocs", len(allMigrated))
			select {
			case w.migratedCh <- allMigrated:
			case <-w.ctx.Done():
				w.logger.Trace("shutting down")
				return
			}
		}
	}
}

// jobResult is the set of actions to take for a draining job given its current
// state.
type jobResult struct {
	// drain is the set of allocations to emit for draining.
	drain []*structs.Allocation

	// migrated is the set of allocations to emit as migrated
	migrated []*structs.Allocation

	// done marks whether the job has been fully drained.
	done bool
}

// newJobResult returns a jobResult with done=true. It is the responsibility of
// callers to set done=false when a remaining drainable alloc is found.
func newJobResult() *jobResult {
	return &jobResult{
		done: true,
	}
}

func (r *jobResult) String() string {
	return fmt.Sprintf("Drain %d ; Migrate %d ; Done %v", len(r.drain), len(r.migrated), r.done)
}

// handleJob takes the state of a draining job and returns the desired actions.
func handleJob(snap *state.StateSnapshot, job *structs.Job, allocs []*structs.Allocation, lastHandledIndex uint64) (*jobResult, error) {
	r := newJobResult()
	batch := job.Type == structs.JobTypeBatch
	taskGroups := make(map[string]*structs.TaskGroup, len(job.TaskGroups))
	for _, tg := range job.TaskGroups {
		// Only capture the groups that have a migrate strategy or we are just
		// watching batch
		if tg.Migrate != nil || batch {
			taskGroups[tg.Name] = tg
		}
	}

	// Sort the allocations by TG
	tgAllocs := make(map[string][]*structs.Allocation, len(taskGroups))
	for _, alloc := range allocs {
		if _, ok := taskGroups[alloc.TaskGroup]; !ok {
			continue
		}

		tgAllocs[alloc.TaskGroup] = append(tgAllocs[alloc.TaskGroup], alloc)
	}

	for name, tg := range taskGroups {
		allocs := tgAllocs[name]
		if err := handleTaskGroup(snap, batch, tg, allocs, lastHandledIndex, r); err != nil {
			return nil, fmt.Errorf("drain for task group %q failed: %v", name, err)
		}
	}

	return r, nil
}

// handleTaskGroup takes the state of a draining task group and computes the
// desired actions. For batch jobs we only notify when they have been migrated
// and never mark them for drain. Batch jobs are allowed to complete up until
// the deadline, after which they are force killed.
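//
// For a rough illustration of the service-job math below: with a group where
// Count = 4 and Migrate.MaxParallel = 1, and four running allocations that
// have reported healthy, the threshold is 4 - 1 = 3, so at most 4 - 3 = 1
// drainable allocation is emitted per pass; the next one is emitted only once
// the replacement for the first has become healthy.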
func handleTaskGroup(snap *state.StateSnapshot, batch bool, tg *structs.TaskGroup,
	allocs []*structs.Allocation, lastHandledIndex uint64, result *jobResult) error {

	// Determine how many allocations can be drained
	drainingNodes := make(map[string]bool, 4)
	healthy := 0
	remainingDrainingAlloc := false
	var drainable []*structs.Allocation

	for _, alloc := range allocs {
		// Check if the alloc is on a draining node.
		onDrainingNode, ok := drainingNodes[alloc.NodeID]
		if !ok {
			// Look up the node
			node, err := snap.NodeByID(nil, alloc.NodeID)
			if err != nil {
				return err
			}

			// Check if the node exists and whether it has a drain strategy
			onDrainingNode = node != nil && node.DrainStrategy != nil
			drainingNodes[alloc.NodeID] = onDrainingNode
		}

		// Check if the alloc should be considered migrated. A migrated
		// allocation is one that is terminal, is on a draining node, and
		// was updated after our last handled index, which avoids emitting
		// many duplicate migrate events.
		if alloc.TerminalStatus() &&
			onDrainingNode &&
			alloc.ModifyIndex > lastHandledIndex {
			result.migrated = append(result.migrated, alloc)
			continue
		}

		// If the service alloc is running and has its deployment status set, it
		// is considered healthy from a migration standpoint.
		if !batch && !alloc.TerminalStatus() && alloc.DeploymentStatus.HasHealth() {
			healthy++
		}

		// An alloc can't be considered for migration if:
		// - It isn't on a draining node
		// - It is already terminal
		if !onDrainingNode || alloc.TerminalStatus() {
			continue
		}

		// Capture the fact that there is an allocation that is still draining
		// for this job.
		remainingDrainingAlloc = true

		// If we haven't marked this allocation for migration already, capture
		// it as eligible for draining.
		if !batch && !alloc.DesiredTransition.ShouldMigrate() {
			drainable = append(drainable, alloc)
		}
	}

	// Update the done status
	if remainingDrainingAlloc {
		result.done = false
	}

	// We don't mark batch for drain so exit
	if batch {
		return nil
	}

	// Determine how many we can drain
	thresholdCount := tg.Count - tg.Migrate.MaxParallel
	numToDrain := healthy - thresholdCount
	numToDrain = helper.IntMin(len(drainable), numToDrain)
	if numToDrain <= 0 {
		return nil
	}

	result.drain = append(result.drain, drainable[0:numToDrain]...)
	return nil
}

// getJobAllocs returns all allocations for draining jobs
func (w *drainingJobWatcher) getJobAllocs(ctx context.Context, minIndex uint64) (map[structs.NamespacedID][]*structs.Allocation, uint64, error) {
	if err := w.limiter.Wait(ctx); err != nil {
		return nil, 0, err
	}

	resp, index, err := w.state.BlockingQuery(w.getJobAllocsImpl, minIndex, ctx)
	if err != nil {
		return nil, 0, err
	}
	if resp == nil {
		return nil, index, nil
	}

	return resp.(map[structs.NamespacedID][]*structs.Allocation), index, nil
}

// getJobAllocsImpl returns a map of draining jobs to their allocations.
func (w *drainingJobWatcher) getJobAllocsImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	index, err := state.Index("allocs")
	if err != nil {
		return nil, 0, err
	}

	// Capture the draining jobs.
	draining := w.drainingJobs()
	l := len(draining)
	if l == 0 {
		return nil, index, nil
	}

	// Capture the allocs for each draining job.
	var maxIndex uint64 = 0
	resp := make(map[structs.NamespacedID][]*structs.Allocation, l)
	for jns := range draining {
		allocs, err := state.AllocsByJob(ws, jns.Namespace, jns.ID, false)
		if err != nil {
			return nil, index, err
		}

		resp[jns] = allocs
		for _, alloc := range allocs {
			if maxIndex < alloc.ModifyIndex {
				maxIndex = alloc.ModifyIndex
			}
		}
	}

	// Prefer using the actual max index of affected allocs since it means less
	// unblocking
	if maxIndex != 0 {
		index = maxIndex
	}

	return resp, index, nil
}

// drainingJobs captures the set of draining jobs.
func (w *drainingJobWatcher) drainingJobs() map[structs.NamespacedID]struct{} {
	w.l.RLock()
	defer w.l.RUnlock()

	l := len(w.jobs)
	if l == 0 {
		return nil
	}

	draining := make(map[structs.NamespacedID]struct{}, l)
	for k := range w.jobs {
		draining[k] = struct{}{}
	}

	return draining
}

// getQueryCtx is a helper for getting the query context.
func (w *drainingJobWatcher) getQueryCtx() context.Context {
	w.l.RLock()
	defer w.l.RUnlock()
	return w.queryCtx
}
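
// The function below is an illustrative sketch and not part of the upstream
// file: it shows how a consumer (for example, the node drainer's drain loop)
// might service the watcher's channels. The transition and onMigrated
// arguments are hypothetical stand-ins for whatever applies the
// DesiredTransition updates through Raft and accounts for migrated
// allocations, respectively.
func consumeJobWatcher(ctx context.Context, w DrainingJobWatcher,
	transition func([]*structs.Allocation) (uint64, error),
	onMigrated func([]*structs.Allocation)) {

	for {
		select {
		case <-ctx.Done():
			return

		case req := <-w.Drain():
			// Apply the desired transitions, then respond on the batch
			// future so the watcher can advance its wait index past the
			// resulting Raft index.
			index, err := transition(req.Allocs)
			req.Resp.Respond(index, err)

		case allocs := <-w.Migrated():
			// Migrated allocations may include duplicates; consumers are
			// expected to handle them idempotently.
			onMigrated(allocs)
		}
	}
}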