github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/blocked_evals.go (about) 1 package nomad 2 3 import ( 4 "sync" 5 "time" 6 7 "github.com/armon/go-metrics" 8 "github.com/hashicorp/consul/lib" 9 "github.com/hashicorp/nomad/nomad/structs" 10 ) 11 12 const ( 13 // unblockBuffer is the buffer size for the unblock channel. The buffer 14 // should be large to ensure that the FSM doesn't block when calling Unblock 15 // as this would apply back-pressure on Raft. 16 unblockBuffer = 8096 17 18 // pruneInterval is the interval at which we prune objects from the 19 // BlockedEvals tracker 20 pruneInterval = 5 * time.Minute 21 22 // pruneThreshold is the threshold after which objects will be pruned. 23 pruneThreshold = 15 * time.Minute 24 ) 25 26 // BlockedEvals is used to track evaluations that shouldn't be queued until a 27 // certain class of nodes becomes available. An evaluation is put into the 28 // blocked state when it is run through the scheduler and produced failed 29 // allocations. It is unblocked when the capacity of a node that could run the 30 // failed allocation becomes available. 31 type BlockedEvals struct { 32 evalBroker *EvalBroker 33 enabled bool 34 stats *BlockedStats 35 l sync.RWMutex 36 37 // captured is the set of evaluations that are captured by computed node 38 // classes. 39 captured map[string]wrappedEval 40 41 // escaped is the set of evaluations that have escaped computed node 42 // classes. 43 escaped map[string]wrappedEval 44 45 // unblockCh is used to buffer unblocking of evaluations. 46 capacityChangeCh chan *capacityUpdate 47 48 // jobs is the map of blocked job and is used to ensure that only one 49 // blocked eval exists for each job. The value is the blocked evaluation ID. 50 jobs map[string]string 51 52 // unblockIndexes maps computed node classes or quota name to the index in 53 // which they were unblocked. This is used to check if an evaluation could 54 // have been unblocked between the time they were in the scheduler and the 55 // time they are being blocked. 56 unblockIndexes map[string]uint64 57 58 // duplicates is the set of evaluations for jobs that had pre-existing 59 // blocked evaluations. These should be marked as cancelled since only one 60 // blocked eval is needed per job. 61 duplicates []*structs.Evaluation 62 63 // duplicateCh is used to signal that a duplicate eval was added to the 64 // duplicate set. It can be used to unblock waiting callers looking for 65 // duplicates. 66 duplicateCh chan struct{} 67 68 // timetable is used to correlate indexes with their insertion time. This 69 // allows us to prune based on time. 70 timetable *TimeTable 71 72 // stopCh is used to stop any created goroutines. 73 stopCh chan struct{} 74 } 75 76 // capacityUpdate stores unblock data. 77 type capacityUpdate struct { 78 computedClass string 79 quotaChange string 80 index uint64 81 } 82 83 // wrappedEval captures both the evaluation and the optional token 84 type wrappedEval struct { 85 eval *structs.Evaluation 86 token string 87 } 88 89 // BlockedStats returns all the stats about the blocked eval tracker. 90 type BlockedStats struct { 91 // TotalEscaped is the total number of blocked evaluations that have escaped 92 // computed node classes. 93 TotalEscaped int 94 95 // TotalBlocked is the total number of blocked evaluations. 96 TotalBlocked int 97 98 // TotalQuotaLimit is the total number of blocked evaluations that are due 99 // to the quota limit being reached. 100 TotalQuotaLimit int 101 } 102 103 // NewBlockedEvals creates a new blocked eval tracker that will enqueue 104 // unblocked evals into the passed broker. 105 func NewBlockedEvals(evalBroker *EvalBroker) *BlockedEvals { 106 return &BlockedEvals{ 107 evalBroker: evalBroker, 108 captured: make(map[string]wrappedEval), 109 escaped: make(map[string]wrappedEval), 110 jobs: make(map[string]string), 111 unblockIndexes: make(map[string]uint64), 112 capacityChangeCh: make(chan *capacityUpdate, unblockBuffer), 113 duplicateCh: make(chan struct{}, 1), 114 stopCh: make(chan struct{}), 115 stats: new(BlockedStats), 116 } 117 } 118 119 // Enabled is used to check if the broker is enabled. 120 func (b *BlockedEvals) Enabled() bool { 121 b.l.RLock() 122 defer b.l.RUnlock() 123 return b.enabled 124 } 125 126 // SetEnabled is used to control if the blocked eval tracker is enabled. The 127 // tracker should only be enabled on the active leader. 128 func (b *BlockedEvals) SetEnabled(enabled bool) { 129 b.l.Lock() 130 if b.enabled == enabled { 131 // No-op 132 b.l.Unlock() 133 return 134 } else if enabled { 135 go b.watchCapacity() 136 go b.prune() 137 } else { 138 close(b.stopCh) 139 } 140 b.enabled = enabled 141 b.l.Unlock() 142 if !enabled { 143 b.Flush() 144 } 145 } 146 147 func (b *BlockedEvals) SetTimetable(timetable *TimeTable) { 148 b.l.Lock() 149 b.timetable = timetable 150 b.l.Unlock() 151 } 152 153 // Block tracks the passed evaluation and enqueues it into the eval broker when 154 // a suitable node calls unblock. 155 func (b *BlockedEvals) Block(eval *structs.Evaluation) { 156 b.processBlock(eval, "") 157 } 158 159 // Reblock tracks the passed evaluation and enqueues it into the eval broker when 160 // a suitable node calls unblock. Reblock should be used over Block when the 161 // blocking is occurring by an outstanding evaluation. The token is the 162 // evaluation's token. 163 func (b *BlockedEvals) Reblock(eval *structs.Evaluation, token string) { 164 b.processBlock(eval, token) 165 } 166 167 // processBlock is the implementation of blocking an evaluation. It supports 168 // taking an optional evaluation token to use when reblocking an evaluation that 169 // may be outstanding. 170 func (b *BlockedEvals) processBlock(eval *structs.Evaluation, token string) { 171 b.l.Lock() 172 defer b.l.Unlock() 173 174 // Do nothing if not enabled 175 if !b.enabled { 176 return 177 } 178 179 // Check if the job already has a blocked evaluation. If it does add it to 180 // the list of duplicates. We only ever want one blocked evaluation per job, 181 // otherwise we would create unnecessary work for the scheduler as multiple 182 // evals for the same job would be run, all producing the same outcome. 183 if _, existing := b.jobs[eval.JobID]; existing { 184 b.duplicates = append(b.duplicates, eval) 185 186 // Unblock any waiter. 187 select { 188 case b.duplicateCh <- struct{}{}: 189 default: 190 } 191 192 return 193 } 194 195 // Check if the eval missed an unblock while it was in the scheduler at an 196 // older index. The scheduler could have been invoked with a snapshot of 197 // state that was prior to additional capacity being added or allocations 198 // becoming terminal. 199 if b.missedUnblock(eval) { 200 // Just re-enqueue the eval immediately. We pass the token so that the 201 // eval_broker can properly handle the case in which the evaluation is 202 // still outstanding. 203 b.evalBroker.EnqueueAll(map[*structs.Evaluation]string{eval: token}) 204 return 205 } 206 207 // Mark the job as tracked. 208 b.jobs[eval.JobID] = eval.ID 209 b.stats.TotalBlocked++ 210 211 // Track that the evaluation is being added due to reaching the quota limit 212 if eval.QuotaLimitReached != "" { 213 b.stats.TotalQuotaLimit++ 214 } 215 216 // Wrap the evaluation, capturing its token. 217 wrapped := wrappedEval{ 218 eval: eval, 219 token: token, 220 } 221 222 // If the eval has escaped, meaning computed node classes could not capture 223 // the constraints of the job, we store the eval separately as we have to 224 // unblock it whenever node capacity changes. This is because we don't know 225 // what node class is feasible for the jobs constraints. 226 if eval.EscapedComputedClass { 227 b.escaped[eval.ID] = wrapped 228 b.stats.TotalEscaped++ 229 return 230 } 231 232 // Add the eval to the set of blocked evals whose jobs constraints are 233 // captured by computed node class. 234 b.captured[eval.ID] = wrapped 235 } 236 237 // missedUnblock returns whether an evaluation missed an unblock while it was in 238 // the scheduler. Since the scheduler can operate at an index in the past, the 239 // evaluation may have been processed missing data that would allow it to 240 // complete. This method returns if that is the case and should be called with 241 // the lock held. 242 func (b *BlockedEvals) missedUnblock(eval *structs.Evaluation) bool { 243 var max uint64 = 0 244 for id, index := range b.unblockIndexes { 245 // Calculate the max unblock index 246 if max < index { 247 max = index 248 } 249 250 // The evaluation is blocked because it has hit a quota limit not class 251 // eligibility 252 if eval.QuotaLimitReached != "" { 253 if eval.QuotaLimitReached != id { 254 // Not a match 255 continue 256 } else if eval.SnapshotIndex < index { 257 // The evaluation was processed before the quota specification was 258 // updated, so unblock the evaluation. 259 return true 260 } 261 262 // The evaluation was processed having seen all changes to the quota 263 return false 264 } 265 266 elig, ok := eval.ClassEligibility[id] 267 if !ok && eval.SnapshotIndex < index { 268 // The evaluation was processed and did not encounter this class 269 // because it was added after it was processed. Thus for correctness 270 // we need to unblock it. 271 return true 272 } 273 274 // The evaluation could use the computed node class and the eval was 275 // processed before the last unblock. 276 if elig && eval.SnapshotIndex < index { 277 return true 278 } 279 } 280 281 // If the evaluation has escaped, and the map contains an index older than 282 // the evaluations, it should be unblocked. 283 if eval.EscapedComputedClass && eval.SnapshotIndex < max { 284 return true 285 } 286 287 // The evaluation is ahead of all recent unblocks. 288 return false 289 } 290 291 // Untrack causes any blocked evaluation for the passed job to be no longer 292 // tracked. Untrack is called when there is a successful evaluation for the job 293 // and a blocked evaluation is no longer needed. 294 func (b *BlockedEvals) Untrack(jobID string) { 295 b.l.Lock() 296 defer b.l.Unlock() 297 298 // Do nothing if not enabled 299 if !b.enabled { 300 return 301 } 302 303 // Get the evaluation ID to cancel 304 evalID, ok := b.jobs[jobID] 305 if !ok { 306 // No blocked evaluation so exit 307 return 308 } 309 310 // Attempt to delete the evaluation 311 if w, ok := b.captured[evalID]; ok { 312 delete(b.jobs, w.eval.JobID) 313 delete(b.captured, evalID) 314 b.stats.TotalBlocked-- 315 if w.eval.QuotaLimitReached != "" { 316 b.stats.TotalQuotaLimit-- 317 } 318 } 319 320 if w, ok := b.escaped[evalID]; ok { 321 delete(b.jobs, w.eval.JobID) 322 delete(b.escaped, evalID) 323 b.stats.TotalEscaped-- 324 b.stats.TotalBlocked-- 325 if w.eval.QuotaLimitReached != "" { 326 b.stats.TotalQuotaLimit-- 327 } 328 } 329 } 330 331 // Unblock causes any evaluation that could potentially make progress on a 332 // capacity change on the passed computed node class to be enqueued into the 333 // eval broker. 334 func (b *BlockedEvals) Unblock(computedClass string, index uint64) { 335 b.l.Lock() 336 337 // Do nothing if not enabled 338 if !b.enabled { 339 b.l.Unlock() 340 return 341 } 342 343 // Store the index in which the unblock happened. We use this on subsequent 344 // block calls in case the evaluation was in the scheduler when a trigger 345 // occurred. 346 b.unblockIndexes[computedClass] = index 347 b.l.Unlock() 348 349 b.capacityChangeCh <- &capacityUpdate{ 350 computedClass: computedClass, 351 index: index, 352 } 353 } 354 355 // UnblockQuota causes any evaluation that could potentially make progress on a 356 // capacity change on the passed quota to be enqueued into the eval broker. 357 func (b *BlockedEvals) UnblockQuota(quota string, index uint64) { 358 // Nothing to do 359 if quota == "" { 360 return 361 } 362 363 b.l.Lock() 364 365 // Do nothing if not enabled 366 if !b.enabled { 367 b.l.Unlock() 368 return 369 } 370 371 // Store the index in which the unblock happened. We use this on subsequent 372 // block calls in case the evaluation was in the scheduler when a trigger 373 // occurred. 374 b.unblockIndexes[quota] = index 375 b.l.Unlock() 376 377 b.capacityChangeCh <- &capacityUpdate{ 378 quotaChange: quota, 379 index: index, 380 } 381 } 382 383 // UnblockClassAndQuota causes any evaluation that could potentially make 384 // progress on a capacity change on the passed computed node class or quota to 385 // be enqueued into the eval broker. 386 func (b *BlockedEvals) UnblockClassAndQuota(class, quota string, index uint64) { 387 b.l.Lock() 388 389 // Do nothing if not enabled 390 if !b.enabled { 391 b.l.Unlock() 392 return 393 } 394 395 // Store the index in which the unblock happened. We use this on subsequent 396 // block calls in case the evaluation was in the scheduler when a trigger 397 // occurred. 398 if quota != "" { 399 b.unblockIndexes[quota] = index 400 } 401 b.unblockIndexes[class] = index 402 b.l.Unlock() 403 404 b.capacityChangeCh <- &capacityUpdate{ 405 computedClass: class, 406 quotaChange: quota, 407 index: index, 408 } 409 } 410 411 // watchCapacity is a long lived function that watches for capacity changes in 412 // nodes and unblocks the correct set of evals. 413 func (b *BlockedEvals) watchCapacity() { 414 for { 415 select { 416 case <-b.stopCh: 417 return 418 case update := <-b.capacityChangeCh: 419 b.unblock(update.computedClass, update.quotaChange, update.index) 420 } 421 } 422 } 423 424 func (b *BlockedEvals) unblock(computedClass, quota string, index uint64) { 425 b.l.Lock() 426 defer b.l.Unlock() 427 428 // Protect against the case of a flush. 429 if !b.enabled { 430 return 431 } 432 433 // Every eval that has escaped computed node class has to be unblocked 434 // because any node could potentially be feasible. 435 numEscaped := len(b.escaped) 436 numQuotaLimit := 0 437 unblocked := make(map[*structs.Evaluation]string, lib.MaxInt(numEscaped, 4)) 438 439 if numEscaped != 0 && computedClass != "" { 440 for id, wrapped := range b.escaped { 441 unblocked[wrapped.eval] = wrapped.token 442 delete(b.escaped, id) 443 delete(b.jobs, wrapped.eval.JobID) 444 445 if wrapped.eval.QuotaLimitReached != "" { 446 numQuotaLimit++ 447 } 448 } 449 } 450 451 // We unblock any eval that is explicitly eligible for the computed class 452 // and also any eval that is not eligible or uneligible. This signifies that 453 // when the evaluation was originally run through the scheduler, that it 454 // never saw a node with the given computed class and thus needs to be 455 // unblocked for correctness. 456 for id, wrapped := range b.captured { 457 if quota != "" && wrapped.eval.QuotaLimitReached != quota { 458 // We are unblocking based on quota and this eval doesn't match 459 continue 460 } else if elig, ok := wrapped.eval.ClassEligibility[computedClass]; ok && !elig { 461 // Can skip because the eval has explicitly marked the node class 462 // as ineligible. 463 continue 464 } 465 466 // Unblock the evaluation because it is either for the matching quota, 467 // is eligible based on the computed node class, or never seen the 468 // computed node class. 469 unblocked[wrapped.eval] = wrapped.token 470 delete(b.jobs, wrapped.eval.JobID) 471 delete(b.captured, id) 472 if wrapped.eval.QuotaLimitReached != "" { 473 numQuotaLimit++ 474 } 475 } 476 477 if l := len(unblocked); l != 0 { 478 // Update the counters 479 b.stats.TotalEscaped = 0 480 b.stats.TotalBlocked -= l 481 b.stats.TotalQuotaLimit -= numQuotaLimit 482 483 // Enqueue all the unblocked evals into the broker. 484 b.evalBroker.EnqueueAll(unblocked) 485 } 486 } 487 488 // UnblockFailed unblocks all blocked evaluation that were due to scheduler 489 // failure. 490 func (b *BlockedEvals) UnblockFailed() { 491 b.l.Lock() 492 defer b.l.Unlock() 493 494 // Do nothing if not enabled 495 if !b.enabled { 496 return 497 } 498 499 quotaLimit := 0 500 unblocked := make(map[*structs.Evaluation]string, 4) 501 for id, wrapped := range b.captured { 502 if wrapped.eval.TriggeredBy == structs.EvalTriggerMaxPlans { 503 unblocked[wrapped.eval] = wrapped.token 504 delete(b.captured, id) 505 delete(b.jobs, wrapped.eval.JobID) 506 if wrapped.eval.QuotaLimitReached != "" { 507 quotaLimit++ 508 } 509 } 510 } 511 512 for id, wrapped := range b.escaped { 513 if wrapped.eval.TriggeredBy == structs.EvalTriggerMaxPlans { 514 unblocked[wrapped.eval] = wrapped.token 515 delete(b.escaped, id) 516 delete(b.jobs, wrapped.eval.JobID) 517 b.stats.TotalEscaped -= 1 518 if wrapped.eval.QuotaLimitReached != "" { 519 quotaLimit++ 520 } 521 } 522 } 523 524 if l := len(unblocked); l > 0 { 525 b.stats.TotalBlocked -= l 526 b.stats.TotalQuotaLimit -= quotaLimit 527 b.evalBroker.EnqueueAll(unblocked) 528 } 529 } 530 531 // GetDuplicates returns all the duplicate evaluations and blocks until the 532 // passed timeout. 533 func (b *BlockedEvals) GetDuplicates(timeout time.Duration) []*structs.Evaluation { 534 var timeoutTimer *time.Timer 535 var timeoutCh <-chan time.Time 536 SCAN: 537 b.l.Lock() 538 if len(b.duplicates) != 0 { 539 dups := b.duplicates 540 b.duplicates = nil 541 b.l.Unlock() 542 return dups 543 } 544 b.l.Unlock() 545 546 // Create the timer 547 if timeoutTimer == nil && timeout != 0 { 548 timeoutTimer = time.NewTimer(timeout) 549 timeoutCh = timeoutTimer.C 550 defer timeoutTimer.Stop() 551 } 552 553 select { 554 case <-b.stopCh: 555 return nil 556 case <-timeoutCh: 557 return nil 558 case <-b.duplicateCh: 559 goto SCAN 560 } 561 } 562 563 // Flush is used to clear the state of blocked evaluations. 564 func (b *BlockedEvals) Flush() { 565 b.l.Lock() 566 defer b.l.Unlock() 567 568 // Reset the blocked eval tracker. 569 b.stats.TotalEscaped = 0 570 b.stats.TotalBlocked = 0 571 b.stats.TotalQuotaLimit = 0 572 b.captured = make(map[string]wrappedEval) 573 b.escaped = make(map[string]wrappedEval) 574 b.jobs = make(map[string]string) 575 b.unblockIndexes = make(map[string]uint64) 576 b.timetable = nil 577 b.duplicates = nil 578 b.capacityChangeCh = make(chan *capacityUpdate, unblockBuffer) 579 b.stopCh = make(chan struct{}) 580 b.duplicateCh = make(chan struct{}, 1) 581 } 582 583 // Stats is used to query the state of the blocked eval tracker. 584 func (b *BlockedEvals) Stats() *BlockedStats { 585 // Allocate a new stats struct 586 stats := new(BlockedStats) 587 588 b.l.RLock() 589 defer b.l.RUnlock() 590 591 // Copy all the stats 592 stats.TotalEscaped = b.stats.TotalEscaped 593 stats.TotalBlocked = b.stats.TotalBlocked 594 stats.TotalQuotaLimit = b.stats.TotalQuotaLimit 595 return stats 596 } 597 598 // EmitStats is used to export metrics about the blocked eval tracker while enabled 599 func (b *BlockedEvals) EmitStats(period time.Duration, stopCh chan struct{}) { 600 for { 601 select { 602 case <-time.After(period): 603 stats := b.Stats() 604 metrics.SetGauge([]string{"nomad", "blocked_evals", "total_quota_limit"}, float32(stats.TotalQuotaLimit)) 605 metrics.SetGauge([]string{"nomad", "blocked_evals", "total_blocked"}, float32(stats.TotalBlocked)) 606 metrics.SetGauge([]string{"nomad", "blocked_evals", "total_escaped"}, float32(stats.TotalEscaped)) 607 case <-stopCh: 608 return 609 } 610 } 611 } 612 613 // prune is a long lived function that prunes unnecessary objects on a timer. 614 func (b *BlockedEvals) prune() { 615 ticker := time.NewTicker(pruneInterval) 616 defer ticker.Stop() 617 618 for { 619 select { 620 case <-b.stopCh: 621 return 622 case <-ticker.C: 623 b.pruneUnblockIndexes() 624 } 625 } 626 } 627 628 // pruneUnblockIndexes is used to prune any tracked entry that is excessively 629 // old. This protects againsts unbounded growth of the map. 630 func (b *BlockedEvals) pruneUnblockIndexes() { 631 b.l.Lock() 632 defer b.l.Unlock() 633 634 if b.timetable == nil { 635 return 636 } 637 638 cutoff := time.Now().UTC().Add(-1 * pruneThreshold) 639 oldThreshold := b.timetable.NearestIndex(cutoff) 640 641 for key, index := range b.unblockIndexes { 642 if index < oldThreshold { 643 delete(b.unblockIndexes, key) 644 } 645 } 646 }