github.com/adityamillind98/nomad@v0.11.8/scheduler/reconcile_util.go

package scheduler

import (
	"fmt"
	"sort"
	"strings"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

// placementResult is an allocation that must be placed. It potentially has a
// previous allocation attached to it that should be stopped only if the
// paired placement is complete. This gives an atomic place/stop behavior that
// prevents an impossible resource ask, issued as part of a rolling update,
// from wiping the job out.
type placementResult interface {
	// TaskGroup returns the task group the placement is for
	TaskGroup() *structs.TaskGroup

	// Name returns the name of the desired allocation
	Name() string

	// Canary returns whether the placement should be a canary
	Canary() bool

	// PreviousAllocation returns the previous allocation
	PreviousAllocation() *structs.Allocation

	// IsRescheduling returns whether the placement was rescheduling a failed allocation
	IsRescheduling() bool

	// StopPreviousAlloc returns whether the previous allocation should be
	// stopped and if so the status description.
	StopPreviousAlloc() (bool, string)
}

// allocStopResult contains the information required to stop a single allocation
type allocStopResult struct {
	alloc             *structs.Allocation
	clientStatus      string
	statusDescription string
	followupEvalID    string
}

// allocPlaceResult contains the information required to place a single
// allocation
type allocPlaceResult struct {
	name          string
	canary        bool
	taskGroup     *structs.TaskGroup
	previousAlloc *structs.Allocation
	reschedule    bool
}

func (a allocPlaceResult) TaskGroup() *structs.TaskGroup           { return a.taskGroup }
func (a allocPlaceResult) Name() string                            { return a.name }
func (a allocPlaceResult) Canary() bool                            { return a.canary }
func (a allocPlaceResult) PreviousAllocation() *structs.Allocation { return a.previousAlloc }
func (a allocPlaceResult) IsRescheduling() bool                    { return a.reschedule }
func (a allocPlaceResult) StopPreviousAlloc() (bool, string)       { return false, "" }

// allocDestructiveResult contains the information required to do a destructive
// update. Destructive changes should be applied atomically, as in the old alloc
// is only stopped if the new one can be placed.
type allocDestructiveResult struct {
	placeName             string
	placeTaskGroup        *structs.TaskGroup
	stopAlloc             *structs.Allocation
	stopStatusDescription string
}

func (a allocDestructiveResult) TaskGroup() *structs.TaskGroup           { return a.placeTaskGroup }
func (a allocDestructiveResult) Name() string                            { return a.placeName }
func (a allocDestructiveResult) Canary() bool                            { return false }
func (a allocDestructiveResult) PreviousAllocation() *structs.Allocation { return a.stopAlloc }
func (a allocDestructiveResult) IsRescheduling() bool                    { return false }
func (a allocDestructiveResult) StopPreviousAlloc() (bool, string) {
	return true, a.stopStatusDescription
}
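// The following sketch is illustrative only (not part of the original file):
// it shows how a consumer can treat both placementResult implementations
// uniformly. Only allocDestructiveResult reports a paired stop via
// StopPreviousAlloc, which is what yields the atomic place/stop behavior
// described above. The helper name describePlacement is hypothetical.
//
//	func describePlacement(p placementResult) string {
//		if stop, desc := p.StopPreviousAlloc(); stop {
//			// Destructive update: the old alloc is stopped only if the
//			// paired placement succeeds, with desc as the status description.
//			return fmt.Sprintf("place %q, stop previous: %s", p.Name(), desc)
//		}
//		// Plain placement (possibly a canary or a reschedule).
//		return fmt.Sprintf("place %q", p.Name())
//	}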
// allocMatrix is a mapping of task groups to their allocation set.
type allocMatrix map[string]allocSet

// newAllocMatrix takes a job and the existing allocations for the job and
// creates an allocMatrix
func newAllocMatrix(job *structs.Job, allocs []*structs.Allocation) allocMatrix {
	m := allocMatrix(make(map[string]allocSet))
	for _, a := range allocs {
		s, ok := m[a.TaskGroup]
		if !ok {
			s = make(map[string]*structs.Allocation)
			m[a.TaskGroup] = s
		}
		s[a.ID] = a
	}

	if job != nil {
		for _, tg := range job.TaskGroups {
			if _, ok := m[tg.Name]; !ok {
				m[tg.Name] = make(map[string]*structs.Allocation)
			}
		}
	}
	return m
}

// allocSet is a set of allocations with a series of helper functions defined
// that help reconcile state.
type allocSet map[string]*structs.Allocation

// GoString provides a human readable view of the set
func (a allocSet) GoString() string {
	if len(a) == 0 {
		return "[]"
	}

	start := fmt.Sprintf("len(%d) [\n", len(a))
	var s []string
	for k, v := range a {
		s = append(s, fmt.Sprintf("%q: %v", k, v.Name))
	}
	return start + strings.Join(s, "\n") + "]"
}

// nameSet returns the set of allocation names
func (a allocSet) nameSet() map[string]struct{} {
	names := make(map[string]struct{}, len(a))
	for _, alloc := range a {
		names[alloc.Name] = struct{}{}
	}
	return names
}

// nameOrder returns the allocations sorted by their name index
func (a allocSet) nameOrder() []*structs.Allocation {
	allocs := make([]*structs.Allocation, 0, len(a))
	for _, alloc := range a {
		allocs = append(allocs, alloc)
	}
	sort.Slice(allocs, func(i, j int) bool {
		return allocs[i].Index() < allocs[j].Index()
	})
	return allocs
}

// difference returns a new allocSet that has all the existing items except
// those contained within the other allocation sets
func (a allocSet) difference(others ...allocSet) allocSet {
	diff := make(map[string]*structs.Allocation)

OUTER:
	for k, v := range a {
		for _, other := range others {
			if _, ok := other[k]; ok {
				continue OUTER
			}
		}
		diff[k] = v
	}
	return diff
}

// union returns a new allocSet that has the union of the two allocSets.
// Conflicts prefer the last passed allocSet containing the value
func (a allocSet) union(others ...allocSet) allocSet {
	union := make(map[string]*structs.Allocation, len(a))
	order := []allocSet{a}
	order = append(order, others...)

	for _, set := range order {
		for k, v := range set {
			union[k] = v
		}
	}

	return union
}

// fromKeys returns an alloc set matching the passed keys
func (a allocSet) fromKeys(keys ...[]string) allocSet {
	from := make(map[string]*structs.Allocation)
	for _, set := range keys {
		for _, k := range set {
			if alloc, ok := a[k]; ok {
				from[k] = alloc
			}
		}
	}
	return from
}
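// A minimal sketch (not part of the original file) of how the set helpers
// compose; the variable names and the stoppedIDs slice are hypothetical. Keys
// are allocation IDs, so an allocation present in several sets resolves to
// the copy in the last set passed to union.
//
//	all := newAllocMatrix(job, allocs)["web"] // allocSet for task group "web"
//	stopped := all.fromKeys(stoppedIDs)       // subset selected by ID
//	remaining := all.difference(stopped)      // all minus stopped
//	merged := remaining.union(updated)        // updated wins ID conflicts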
// filterByTainted takes a set of tainted nodes and filters the allocation set
// into three groups:
// 1. Those that exist on untainted nodes
// 2. Those that exist on nodes that are draining
// 3. Those that exist on lost nodes
func (a allocSet) filterByTainted(nodes map[string]*structs.Node) (untainted, migrate, lost allocSet) {
	untainted = make(map[string]*structs.Allocation)
	migrate = make(map[string]*structs.Allocation)
	lost = make(map[string]*structs.Allocation)
	for _, alloc := range a {
		// Terminal allocs are always untainted as they should never be migrated
		if alloc.TerminalStatus() {
			untainted[alloc.ID] = alloc
			continue
		}

		// Non-terminal allocs that should migrate should always migrate
		if alloc.DesiredTransition.ShouldMigrate() {
			migrate[alloc.ID] = alloc
			continue
		}

		n, ok := nodes[alloc.NodeID]
		if !ok {
			// Node is untainted so alloc is untainted
			untainted[alloc.ID] = alloc
			continue
		}

		// Allocs on GC'd (nil) or lost nodes are Lost
		if n == nil || n.TerminalStatus() {
			lost[alloc.ID] = alloc
			continue
		}

		// All other allocs are untainted
		untainted[alloc.ID] = alloc
	}
	return
}

// filterByRescheduleable filters the allocation set to return the set of allocations that are either
// untainted or a set of allocations that must be rescheduled now. Allocations that can be rescheduled
// at a future time are also returned so that we can create follow up evaluations for them. Allocs are
// skipped or considered untainted according to the logic defined in the shouldFilter method.
func (a allocSet) filterByRescheduleable(isBatch bool, now time.Time, evalID string, deployment *structs.Deployment) (untainted, rescheduleNow allocSet, rescheduleLater []*delayedRescheduleInfo) {
	untainted = make(map[string]*structs.Allocation)
	rescheduleNow = make(map[string]*structs.Allocation)

	for _, alloc := range a {
		var eligibleNow, eligibleLater bool
		var rescheduleTime time.Time

		// Ignore allocs that have already been rescheduled
		if alloc.NextAllocation != "" {
			continue
		}

		isUntainted, ignore := shouldFilter(alloc, isBatch)
		if isUntainted {
			untainted[alloc.ID] = alloc
		}
		if isUntainted || ignore {
			continue
		}

		// Only failed allocs with desired state run get to this point.
		// If the failed alloc is not eligible for rescheduling now we add it to the untainted set.
		eligibleNow, eligibleLater, rescheduleTime = updateByReschedulable(alloc, now, evalID, deployment)
		if !eligibleNow {
			untainted[alloc.ID] = alloc
			if eligibleLater {
				rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, alloc, rescheduleTime})
			}
		} else {
			rescheduleNow[alloc.ID] = alloc
		}
	}
	return
}
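// An illustrative sketch (not part of the original file) of how the two
// filtering stages chain. The variable names are hypothetical; tainted is the
// draining/down node set keyed by node ID, as built by the caller.
//
//	untainted, migrate, lost := group.filterByTainted(tainted)
//	untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(
//		isBatch, time.Now().UTC(), eval.ID, deployment)
//	// migrate and lost feed stop decisions; rescheduleLater is turned into
//	// delayed follow-up evaluations.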
// shouldFilter returns whether the alloc should be ignored or considered untainted.
// Ignored allocs are filtered out.
// Untainted allocs count against the desired total.
//
// Filtering logic for batch jobs:
// If complete, and ran successfully - untainted
// If desired state is stop - ignore
//
// Filtering logic for service jobs:
// If desired state is stop/evict - ignore
// If client status is complete/lost - ignore
func shouldFilter(alloc *structs.Allocation, isBatch bool) (untainted, ignore bool) {
	// Allocs from batch jobs should be filtered when the desired status
	// is terminal and the client did not finish or when the client
	// status is failed so that they will be replaced. If they are
	// complete but not failed, they shouldn't be replaced.
	if isBatch {
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
			if alloc.RanSuccessfully() {
				return true, false
			}
			return false, true
		default:
		}

		switch alloc.ClientStatus {
		case structs.AllocClientStatusFailed:
		default:
			return true, false
		}
		return false, false
	}

	// Handle service jobs
	switch alloc.DesiredStatus {
	case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
		return false, true
	default:
	}

	switch alloc.ClientStatus {
	case structs.AllocClientStatusComplete, structs.AllocClientStatusLost:
		return false, true
	default:
	}
	return false, false
}

// updateByReschedulable is a helper method that encapsulates logic for whether a failed allocation
// should be rescheduled now, later or left in the untainted set
func updateByReschedulable(alloc *structs.Allocation, now time.Time, evalID string, d *structs.Deployment) (rescheduleNow, rescheduleLater bool, rescheduleTime time.Time) {
	// If the allocation is part of an ongoing active deployment, we only allow it to reschedule
	// if it has been marked eligible
	if d != nil && alloc.DeploymentID == d.ID && d.Active() && !alloc.DesiredTransition.ShouldReschedule() {
		return
	}

	// Check if the allocation is marked as it should be force rescheduled
	if alloc.DesiredTransition.ShouldForceReschedule() {
		rescheduleNow = true
	}

	// Reschedule if the eval ID matches the alloc's followup evalID or if it's close to its reschedule time
	rescheduleTime, eligible := alloc.NextRescheduleTime()
	if eligible && (alloc.FollowupEvalID == evalID || rescheduleTime.Sub(now) <= rescheduleWindowSize) {
		rescheduleNow = true
		return
	}
	if eligible && alloc.FollowupEvalID == "" {
		rescheduleLater = true
	}
	return
}

// filterByTerminal filters out terminal allocs
func filterByTerminal(untainted allocSet) (nonTerminal allocSet) {
	nonTerminal = make(map[string]*structs.Allocation)
	for id, alloc := range untainted {
		if !alloc.TerminalStatus() {
			nonTerminal[id] = alloc
		}
	}
	return
}

// filterByDeployment filters allocations into two sets, those that match the
// given deployment ID and those that don't
func (a allocSet) filterByDeployment(id string) (match, nonmatch allocSet) {
	match = make(map[string]*structs.Allocation)
	nonmatch = make(map[string]*structs.Allocation)
	for _, alloc := range a {
		if alloc.DeploymentID == id {
			match[alloc.ID] = alloc
		} else {
			nonmatch[alloc.ID] = alloc
		}
	}
	return
}

// delayByStopAfterClientDisconnect returns a delay for any lost allocation
// that has stop_after_client_disconnect configured
func (as allocSet) delayByStopAfterClientDisconnect() (later []*delayedRescheduleInfo) {
	now := time.Now().UTC()
	for _, a := range as {
		if !a.ShouldClientStop() {
			continue
		}

		t := a.WaitClientStop()

		if t.After(now) {
			later = append(later, &delayedRescheduleInfo{
				allocID:        a.ID,
				alloc:          a,
				rescheduleTime: t,
			})
		}
	}
	return later
}
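// A worked reading (not part of the original file) of shouldFilter's batch
// branch, with hypothetical allocs:
//
//	u, ig := shouldFilter(stoppedCompleteAlloc, true) // desired stop, RanSuccessfully -> u=true,  ig=false
//	u, ig = shouldFilter(stoppedEarlyAlloc, true)     // desired stop, did not finish  -> u=false, ig=true
//	u, ig = shouldFilter(failedAlloc, true)           // desired run, client failed    -> u=false, ig=false
//
// Only the last case reaches updateByReschedulable and can land in the
// rescheduleNow or rescheduleLater sets.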
// allocNameIndex is used to select allocation names for placement or removal
// given an existing set of placed allocations.
type allocNameIndex struct {
	job, taskGroup string
	count          int
	b              structs.Bitmap
}

// newAllocNameIndex returns an allocNameIndex for use in selecting names of
// allocations to create or stop. It takes the job and task group name, desired
// count and any existing allocations as input.
func newAllocNameIndex(job, taskGroup string, count int, in allocSet) *allocNameIndex {
	return &allocNameIndex{
		count:     count,
		b:         bitmapFrom(in, uint(count)),
		job:       job,
		taskGroup: taskGroup,
	}
}

// bitmapFrom creates a bitmap from the given allocation set and an optional
// minimum size. The size of the bitmap is the larger of the passed minimum
// and the maximum alloc index of the input (byte aligned).
func bitmapFrom(input allocSet, minSize uint) structs.Bitmap {
	var max uint
	for _, a := range input {
		if num := a.Index(); num > max {
			max = num
		}
	}

	if l := uint(len(input)); minSize < l {
		minSize = l
	}

	if max < minSize {
		max = minSize
	} else if max%8 == 0 {
		// This may be possible if the job was scaled down. We want to make sure
		// that the max index is not byte-aligned otherwise we will overflow
		// the bitmap.
		max++
	}

	if max == 0 {
		max = 8
	}

	// byteAlign the count
	if remainder := max % 8; remainder != 0 {
		max = max + 8 - remainder
	}

	bitmap, err := structs.NewBitmap(max)
	if err != nil {
		panic(err)
	}

	for _, a := range input {
		bitmap.Set(a.Index())
	}

	return bitmap
}

// Highest removes and returns the highest n used names. The returned set
// can be less than n if there aren't n names set in the index
func (a *allocNameIndex) Highest(n uint) map[string]struct{} {
	h := make(map[string]struct{}, n)
	for i := a.b.Size(); i > uint(0) && uint(len(h)) < n; i-- {
		// Use this to avoid wrapping around b/c of the unsigned int
		idx := i - 1
		if a.b.Check(idx) {
			a.b.Unset(idx)
			h[structs.AllocName(a.job, a.taskGroup, idx)] = struct{}{}
		}
	}

	return h
}

// Set sets the indexes from the passed alloc set as used
func (a *allocNameIndex) Set(set allocSet) {
	for _, alloc := range set {
		a.b.Set(alloc.Index())
	}
}

// Unset unsets all indexes of the passed alloc set as being used
func (a *allocNameIndex) Unset(as allocSet) {
	for _, alloc := range as {
		a.b.Unset(alloc.Index())
	}
}

// UnsetIndex unsets the index as having its name used
func (a *allocNameIndex) UnsetIndex(idx uint) {
	a.b.Unset(idx)
}
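// A worked example (not part of the original file) of the sizing arithmetic in
// bitmapFrom: with existing allocs at indexes {0, 1, 7} and minSize 3, max
// starts at 7; since 7 >= minSize and 7%8 != 0 there is no bump, and byte
// alignment raises the size to 7 + 8 - 7 = 8 bits, so indexes 0..7 fit. Had
// the highest index been 8 (8%8 == 0), it would first be bumped to 9 and then
// aligned to 16; that bump is exactly why the "not byte-aligned" check exists,
// because an 8-bit bitmap cannot hold index 8.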
// NextCanaries returns the next n names for use as canaries and sets them as
// used. The existing canaries and destructive updates are also passed in.
func (a *allocNameIndex) NextCanaries(n uint, existing, destructive allocSet) []string {
	next := make([]string, 0, n)

	// Create a name index
	existingNames := existing.nameSet()

	// First select indexes from the allocations that are undergoing destructive
	// updates. This way we avoid duplicate names as they will get replaced.
	dmap := bitmapFrom(destructive, uint(a.count))
	remainder := n
	for _, idx := range dmap.IndexesInRange(true, uint(0), uint(a.count)-1) {
		name := structs.AllocName(a.job, a.taskGroup, uint(idx))
		if _, used := existingNames[name]; !used {
			next = append(next, name)
			a.b.Set(uint(idx))

			// If we have enough, return
			remainder = n - uint(len(next))
			if remainder == 0 {
				return next
			}
		}
	}

	// Get the set of unset names that can be used
	for _, idx := range a.b.IndexesInRange(false, uint(0), uint(a.count)-1) {
		name := structs.AllocName(a.job, a.taskGroup, uint(idx))
		if _, used := existingNames[name]; !used {
			next = append(next, name)
			a.b.Set(uint(idx))

			// If we have enough, return
			remainder = n - uint(len(next))
			if remainder == 0 {
				return next
			}
		}
	}

	// We have exhausted the preferred and free set. Pick starting from count to
	// count+remainder, to avoid overlapping where possible. An example is the
	// desired count is 3 and we want 5 canaries. The first 3 canaries can use
	// indexes [0, 1, 2] but after that we prefer picking indexes [3, 4] so that
	// we do not overlap. Once the canaries are promoted, these would be the
	// allocations that would be shut down as well.
	for i := uint(a.count); i < uint(a.count)+remainder; i++ {
		name := structs.AllocName(a.job, a.taskGroup, i)
		next = append(next, name)
	}

	return next
}

// Next returns the next n names for use as new placements and sets them as
// used.
func (a *allocNameIndex) Next(n uint) []string {
	next := make([]string, 0, n)

	// Get the set of unset names that can be used
	remainder := n
	for _, idx := range a.b.IndexesInRange(false, uint(0), uint(a.count)-1) {
		next = append(next, structs.AllocName(a.job, a.taskGroup, uint(idx)))
		a.b.Set(uint(idx))

		// If we have enough, return
		remainder = n - uint(len(next))
		if remainder == 0 {
			return next
		}
	}

	// We have exhausted the free set, now just pick overlapping indexes
	var i uint
	for i = 0; i < remainder; i++ {
		next = append(next, structs.AllocName(a.job, a.taskGroup, i))
		a.b.Set(i)
	}

	return next
}
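// A minimal usage sketch (not part of the original file), assuming
// structs.AllocName's "<job>.<group>[<index>]" format. With count 3 and one
// existing alloc at index 0, Next hands out the free names first and only
// then wraps back to overlapping indexes:
//
//	idx := newAllocNameIndex("example", "web", 3, existing) // index 0 in use
//	names := idx.Next(3)
//	// names == []string{"example.web[1]", "example.web[2]", "example.web[0]"}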