github.com/zhizhiboom/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/scheduler/reconcile_util.go (about) 1 package scheduler 2 3 import ( 4 "fmt" 5 "sort" 6 "strings" 7 8 "time" 9 10 "github.com/hashicorp/nomad/nomad/structs" 11 ) 12 13 // placementResult is an allocation that must be placed. It potentially has a 14 // previous allocation attached to it that should be stopped only if the 15 // paired placement is complete. This gives an atomic place/stop behavior to 16 // prevent an impossible resource ask as part of a rolling update to wipe the 17 // job out. 18 type placementResult interface { 19 // TaskGroup returns the task group the placement is for 20 TaskGroup() *structs.TaskGroup 21 22 // Name returns the name of the desired allocation 23 Name() string 24 25 // Canary returns whether the placement should be a canary 26 Canary() bool 27 28 // PreviousAllocation returns the previous allocation 29 PreviousAllocation() *structs.Allocation 30 31 // IsRescheduling returns whether the placement was rescheduling a failed allocation 32 IsRescheduling() bool 33 34 // StopPreviousAlloc returns whether the previous allocation should be 35 // stopped and if so the status description. 36 StopPreviousAlloc() (bool, string) 37 } 38 39 // allocStopResult contains the information required to stop a single allocation 40 type allocStopResult struct { 41 alloc *structs.Allocation 42 clientStatus string 43 statusDescription string 44 } 45 46 // allocPlaceResult contains the information required to place a single 47 // allocation 48 type allocPlaceResult struct { 49 name string 50 canary bool 51 taskGroup *structs.TaskGroup 52 previousAlloc *structs.Allocation 53 reschedule bool 54 } 55 56 func (a allocPlaceResult) TaskGroup() *structs.TaskGroup { return a.taskGroup } 57 func (a allocPlaceResult) Name() string { return a.name } 58 func (a allocPlaceResult) Canary() bool { return a.canary } 59 func (a allocPlaceResult) PreviousAllocation() *structs.Allocation { return a.previousAlloc } 60 func (a allocPlaceResult) IsRescheduling() bool { return a.reschedule } 61 func (a allocPlaceResult) StopPreviousAlloc() (bool, string) { return false, "" } 62 63 // allocDestructiveResult contains the information required to do a destructive 64 // update. Destructive changes should be applied atomically, as in the old alloc 65 // is only stopped if the new one can be placed. 66 type allocDestructiveResult struct { 67 placeName string 68 placeTaskGroup *structs.TaskGroup 69 stopAlloc *structs.Allocation 70 stopStatusDescription string 71 } 72 73 func (a allocDestructiveResult) TaskGroup() *structs.TaskGroup { return a.placeTaskGroup } 74 func (a allocDestructiveResult) Name() string { return a.placeName } 75 func (a allocDestructiveResult) Canary() bool { return false } 76 func (a allocDestructiveResult) PreviousAllocation() *structs.Allocation { return a.stopAlloc } 77 func (a allocDestructiveResult) IsRescheduling() bool { return false } 78 func (a allocDestructiveResult) StopPreviousAlloc() (bool, string) { 79 return true, a.stopStatusDescription 80 } 81 82 // allocMatrix is a mapping of task groups to their allocation set. 83 type allocMatrix map[string]allocSet 84 85 // newAllocMatrix takes a job and the existing allocations for the job and 86 // creates an allocMatrix 87 func newAllocMatrix(job *structs.Job, allocs []*structs.Allocation) allocMatrix { 88 m := allocMatrix(make(map[string]allocSet)) 89 for _, a := range allocs { 90 s, ok := m[a.TaskGroup] 91 if !ok { 92 s = make(map[string]*structs.Allocation) 93 m[a.TaskGroup] = s 94 } 95 s[a.ID] = a 96 } 97 98 if job != nil { 99 for _, tg := range job.TaskGroups { 100 if _, ok := m[tg.Name]; !ok { 101 m[tg.Name] = make(map[string]*structs.Allocation) 102 } 103 } 104 } 105 return m 106 } 107 108 // allocSet is a set of allocations with a series of helper functions defined 109 // that help reconcile state. 110 type allocSet map[string]*structs.Allocation 111 112 // GoString provides a human readable view of the set 113 func (a allocSet) GoString() string { 114 if len(a) == 0 { 115 return "[]" 116 } 117 118 start := fmt.Sprintf("len(%d) [\n", len(a)) 119 var s []string 120 for k, v := range a { 121 s = append(s, fmt.Sprintf("%q: %v", k, v.Name)) 122 } 123 return start + strings.Join(s, "\n") + "]" 124 } 125 126 // nameSet returns the set of allocation names 127 func (a allocSet) nameSet() map[string]struct{} { 128 names := make(map[string]struct{}, len(a)) 129 for _, alloc := range a { 130 names[alloc.Name] = struct{}{} 131 } 132 return names 133 } 134 135 // nameOrder returns the set of allocation names in sorted order 136 func (a allocSet) nameOrder() []*structs.Allocation { 137 allocs := make([]*structs.Allocation, 0, len(a)) 138 for _, alloc := range a { 139 allocs = append(allocs, alloc) 140 } 141 sort.Slice(allocs, func(i, j int) bool { 142 return allocs[i].Index() < allocs[j].Index() 143 }) 144 return allocs 145 } 146 147 // difference returns a new allocSet that has all the existing item except those 148 // contained within the other allocation sets 149 func (a allocSet) difference(others ...allocSet) allocSet { 150 diff := make(map[string]*structs.Allocation) 151 OUTER: 152 for k, v := range a { 153 for _, other := range others { 154 if _, ok := other[k]; ok { 155 continue OUTER 156 } 157 } 158 diff[k] = v 159 } 160 return diff 161 } 162 163 // union returns a new allocSet that has the union of the two allocSets. 164 // Conflicts prefer the last passed allocSet containing the value 165 func (a allocSet) union(others ...allocSet) allocSet { 166 union := make(map[string]*structs.Allocation, len(a)) 167 order := []allocSet{a} 168 order = append(order, others...) 169 170 for _, set := range order { 171 for k, v := range set { 172 union[k] = v 173 } 174 } 175 176 return union 177 } 178 179 // fromKeys returns an alloc set matching the passed keys 180 func (a allocSet) fromKeys(keys ...[]string) allocSet { 181 from := make(map[string]*structs.Allocation) 182 for _, set := range keys { 183 for _, k := range set { 184 if alloc, ok := a[k]; ok { 185 from[k] = alloc 186 } 187 } 188 } 189 return from 190 } 191 192 // filterByTainted takes a set of tainted nodes and filters the allocation set 193 // into three groups: 194 // 1. Those that exist on untainted nodes 195 // 2. Those exist on nodes that are draining 196 // 3. Those that exist on lost nodes 197 func (a allocSet) filterByTainted(nodes map[string]*structs.Node) (untainted, migrate, lost allocSet) { 198 untainted = make(map[string]*structs.Allocation) 199 migrate = make(map[string]*structs.Allocation) 200 lost = make(map[string]*structs.Allocation) 201 for _, alloc := range a { 202 // Terminal allocs are always untainted as they should never be migrated 203 if alloc.TerminalStatus() { 204 untainted[alloc.ID] = alloc 205 continue 206 } 207 208 // Non-terminal allocs that should migrate should always migrate 209 if alloc.DesiredTransition.ShouldMigrate() { 210 migrate[alloc.ID] = alloc 211 continue 212 } 213 214 n, ok := nodes[alloc.NodeID] 215 if !ok { 216 // Node is untainted so alloc is untainted 217 untainted[alloc.ID] = alloc 218 continue 219 } 220 221 // Allocs on GC'd (nil) or lost nodes are Lost 222 if n == nil || n.TerminalStatus() { 223 lost[alloc.ID] = alloc 224 continue 225 } 226 227 // All other allocs are untainted 228 untainted[alloc.ID] = alloc 229 } 230 return 231 } 232 233 // filterByRescheduleable filters the allocation set to return the set of allocations that are either 234 // untainted or a set of allocations that must be rescheduled now. Allocations that can be rescheduled 235 // at a future time are also returned so that we can create follow up evaluations for them. Allocs are 236 // skipped or considered untainted according to logic defined in shouldFilter method. 237 func (a allocSet) filterByRescheduleable(isBatch bool, now time.Time, evalID string, deployment *structs.Deployment) (untainted, rescheduleNow allocSet, rescheduleLater []*delayedRescheduleInfo) { 238 untainted = make(map[string]*structs.Allocation) 239 rescheduleNow = make(map[string]*structs.Allocation) 240 241 for _, alloc := range a { 242 var eligibleNow, eligibleLater bool 243 var rescheduleTime time.Time 244 245 // Ignore allocs that have already been rescheduled 246 if alloc.NextAllocation != "" { 247 continue 248 } 249 250 isUntainted, ignore := shouldFilter(alloc, isBatch) 251 if isUntainted { 252 untainted[alloc.ID] = alloc 253 } 254 if isUntainted || ignore { 255 continue 256 } 257 258 // Only failed allocs with desired state run get to this point 259 // If the failed alloc is not eligible for rescheduling now we add it to the untainted set 260 eligibleNow, eligibleLater, rescheduleTime = updateByReschedulable(alloc, now, evalID, deployment) 261 if !eligibleNow { 262 untainted[alloc.ID] = alloc 263 if eligibleLater { 264 rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, rescheduleTime}) 265 } 266 } else { 267 rescheduleNow[alloc.ID] = alloc 268 } 269 } 270 return 271 } 272 273 // shouldFilter returns whether the alloc should be ignored or considered untainted 274 // Ignored allocs are filtered out. 275 // Untainted allocs count against the desired total. 276 // Filtering logic for batch jobs: 277 // If complete, and ran successfully - untainted 278 // If desired state is stop - ignore 279 // 280 // Filtering logic for service jobs: 281 // If desired state is stop/evict - ignore 282 // If client status is complete/lost - ignore 283 func shouldFilter(alloc *structs.Allocation, isBatch bool) (untainted, ignore bool) { 284 // Allocs from batch jobs should be filtered when the desired status 285 // is terminal and the client did not finish or when the client 286 // status is failed so that they will be replaced. If they are 287 // complete but not failed, they shouldn't be replaced. 288 if isBatch { 289 switch alloc.DesiredStatus { 290 case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict: 291 if alloc.RanSuccessfully() { 292 return true, false 293 } 294 return false, true 295 default: 296 } 297 298 switch alloc.ClientStatus { 299 case structs.AllocClientStatusFailed: 300 default: 301 return true, false 302 } 303 return false, false 304 } 305 306 // Handle service jobs 307 switch alloc.DesiredStatus { 308 case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict: 309 return false, true 310 default: 311 } 312 313 switch alloc.ClientStatus { 314 case structs.AllocClientStatusComplete, structs.AllocClientStatusLost: 315 return false, true 316 default: 317 } 318 return false, false 319 } 320 321 // updateByReschedulable is a helper method that encapsulates logic for whether a failed allocation 322 // should be rescheduled now, later or left in the untainted set 323 func updateByReschedulable(alloc *structs.Allocation, now time.Time, evalID string, d *structs.Deployment) (rescheduleNow, rescheduleLater bool, rescheduleTime time.Time) { 324 // If the allocation is part of an ongoing active deployment, we only allow it to reschedule 325 // if it has been marked eligible 326 if d != nil && alloc.DeploymentID == d.ID && d.Active() && !alloc.DesiredTransition.ShouldReschedule() { 327 return 328 } 329 330 // Check if the allocation is marked as it should be force rescheduled 331 if alloc.DesiredTransition.ShouldForceReschedule() { 332 rescheduleNow = true 333 } 334 335 // Reschedule if the eval ID matches the alloc's followup evalID or if its close to its reschedule time 336 rescheduleTime, eligible := alloc.NextRescheduleTime() 337 if eligible && (alloc.FollowupEvalID == evalID || rescheduleTime.Sub(now) <= rescheduleWindowSize) { 338 rescheduleNow = true 339 return 340 } 341 if eligible && alloc.FollowupEvalID == "" { 342 rescheduleLater = true 343 } 344 return 345 } 346 347 // filterByTerminal filters out terminal allocs 348 func filterByTerminal(untainted allocSet) (nonTerminal allocSet) { 349 nonTerminal = make(map[string]*structs.Allocation) 350 for id, alloc := range untainted { 351 if !alloc.TerminalStatus() { 352 nonTerminal[id] = alloc 353 } 354 } 355 return 356 } 357 358 // filterByDeployment filters allocations into two sets, those that match the 359 // given deployment ID and those that don't 360 func (a allocSet) filterByDeployment(id string) (match, nonmatch allocSet) { 361 match = make(map[string]*structs.Allocation) 362 nonmatch = make(map[string]*structs.Allocation) 363 for _, alloc := range a { 364 if alloc.DeploymentID == id { 365 match[alloc.ID] = alloc 366 } else { 367 nonmatch[alloc.ID] = alloc 368 } 369 } 370 return 371 } 372 373 // allocNameIndex is used to select allocation names for placement or removal 374 // given an existing set of placed allocations. 375 type allocNameIndex struct { 376 job, taskGroup string 377 count int 378 b structs.Bitmap 379 } 380 381 // newAllocNameIndex returns an allocNameIndex for use in selecting names of 382 // allocations to create or stop. It takes the job and task group name, desired 383 // count and any existing allocations as input. 384 func newAllocNameIndex(job, taskGroup string, count int, in allocSet) *allocNameIndex { 385 return &allocNameIndex{ 386 count: count, 387 b: bitmapFrom(in, uint(count)), 388 job: job, 389 taskGroup: taskGroup, 390 } 391 } 392 393 // bitmapFrom creates a bitmap from the given allocation set and a minimum size 394 // maybe given. The size of the bitmap is as the larger of the passed minimum 395 // and the maximum alloc index of the passed input (byte aligned). 396 func bitmapFrom(input allocSet, minSize uint) structs.Bitmap { 397 var max uint 398 for _, a := range input { 399 if num := a.Index(); num > max { 400 max = num 401 } 402 } 403 404 if l := uint(len(input)); minSize < l { 405 minSize = l 406 } 407 408 if max < minSize { 409 max = minSize 410 } else if max%8 == 0 { 411 // This may be possible if the job was scaled down. We want to make sure 412 // that the max index is not byte-aligned otherwise we will overflow 413 // the bitmap. 414 max++ 415 } 416 417 if max == 0 { 418 max = 8 419 } 420 421 // byteAlign the count 422 if remainder := max % 8; remainder != 0 { 423 max = max + 8 - remainder 424 } 425 426 bitmap, err := structs.NewBitmap(max) 427 if err != nil { 428 panic(err) 429 } 430 431 for _, a := range input { 432 bitmap.Set(a.Index()) 433 } 434 435 return bitmap 436 } 437 438 // RemoveHighest removes and returns the highest n used names. The returned set 439 // can be less than n if there aren't n names set in the index 440 func (a *allocNameIndex) Highest(n uint) map[string]struct{} { 441 h := make(map[string]struct{}, n) 442 for i := a.b.Size(); i > uint(0) && uint(len(h)) < n; i-- { 443 // Use this to avoid wrapping around b/c of the unsigned int 444 idx := i - 1 445 if a.b.Check(idx) { 446 a.b.Unset(idx) 447 h[structs.AllocName(a.job, a.taskGroup, idx)] = struct{}{} 448 } 449 } 450 451 return h 452 } 453 454 // Set sets the indexes from the passed alloc set as used 455 func (a *allocNameIndex) Set(set allocSet) { 456 for _, alloc := range set { 457 a.b.Set(alloc.Index()) 458 } 459 } 460 461 // Unset unsets all indexes of the passed alloc set as being used 462 func (a *allocNameIndex) Unset(as allocSet) { 463 for _, alloc := range as { 464 a.b.Unset(alloc.Index()) 465 } 466 } 467 468 // UnsetIndex unsets the index as having its name used 469 func (a *allocNameIndex) UnsetIndex(idx uint) { 470 a.b.Unset(idx) 471 } 472 473 // NextCanaries returns the next n names for use as canaries and sets them as 474 // used. The existing canaries and destructive updates are also passed in. 475 func (a *allocNameIndex) NextCanaries(n uint, existing, destructive allocSet) []string { 476 next := make([]string, 0, n) 477 478 // Create a name index 479 existingNames := existing.nameSet() 480 481 // First select indexes from the allocations that are undergoing destructive 482 // updates. This way we avoid duplicate names as they will get replaced. 483 dmap := bitmapFrom(destructive, uint(a.count)) 484 remainder := n 485 for _, idx := range dmap.IndexesInRange(true, uint(0), uint(a.count)-1) { 486 name := structs.AllocName(a.job, a.taskGroup, uint(idx)) 487 if _, used := existingNames[name]; !used { 488 next = append(next, name) 489 a.b.Set(uint(idx)) 490 491 // If we have enough, return 492 remainder = n - uint(len(next)) 493 if remainder == 0 { 494 return next 495 } 496 } 497 } 498 499 // Get the set of unset names that can be used 500 for _, idx := range a.b.IndexesInRange(false, uint(0), uint(a.count)-1) { 501 name := structs.AllocName(a.job, a.taskGroup, uint(idx)) 502 if _, used := existingNames[name]; !used { 503 next = append(next, name) 504 a.b.Set(uint(idx)) 505 506 // If we have enough, return 507 remainder = n - uint(len(next)) 508 if remainder == 0 { 509 return next 510 } 511 } 512 } 513 514 // We have exhausted the preferred and free set. Pick starting from n to 515 // n+remainder, to avoid overlapping where possible. An example is the 516 // desired count is 3 and we want 5 canaries. The first 3 canaries can use 517 // index [0, 1, 2] but after that we prefer picking indexes [4, 5] so that 518 // we do not overlap. Once the canaries are promoted, these would be the 519 // allocations that would be shut down as well. 520 for i := uint(a.count); i < uint(a.count)+remainder; i++ { 521 name := structs.AllocName(a.job, a.taskGroup, i) 522 next = append(next, name) 523 } 524 525 return next 526 } 527 528 // Next returns the next n names for use as new placements and sets them as 529 // used. 530 func (a *allocNameIndex) Next(n uint) []string { 531 next := make([]string, 0, n) 532 533 // Get the set of unset names that can be used 534 remainder := n 535 for _, idx := range a.b.IndexesInRange(false, uint(0), uint(a.count)-1) { 536 next = append(next, structs.AllocName(a.job, a.taskGroup, uint(idx))) 537 a.b.Set(uint(idx)) 538 539 // If we have enough, return 540 remainder = n - uint(len(next)) 541 if remainder == 0 { 542 return next 543 } 544 } 545 546 // We have exhausted the free set, now just pick overlapping indexes 547 var i uint 548 for i = 0; i < remainder; i++ { 549 next = append(next, structs.AllocName(a.job, a.taskGroup, i)) 550 a.b.Set(i) 551 } 552 553 return next 554 }