github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/core_sched.go

package nomad

import (
	"fmt"
	"math"
	"time"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

var (
	// maxIdsPerReap is the maximum number of evals and allocations to reap in a
	// single Raft transaction. This is to ensure that the Raft message does not
	// become too large.
	maxIdsPerReap = (1024 * 256) / 36 // 0.25 MB of ids.
)

// CoreScheduler is a special "scheduler" that is registered
// as "_core". It is used to run various administrative work
// across the cluster.
type CoreScheduler struct {
	srv  *Server
	snap *state.StateSnapshot
}

// NewCoreScheduler is used to return a new core scheduler instance
func NewCoreScheduler(srv *Server, snap *state.StateSnapshot) scheduler.Scheduler {
	s := &CoreScheduler{
		srv:  srv,
		snap: snap,
	}
	return s
}

// Process is used to implement the scheduler.Scheduler interface
func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
	switch eval.JobID {
	case structs.CoreJobEvalGC:
		return c.evalGC(eval)
	case structs.CoreJobNodeGC:
		return c.nodeGC(eval)
	case structs.CoreJobJobGC:
		return c.jobGC(eval)
	case structs.CoreJobDeploymentGC:
		return c.deploymentGC(eval)
	case structs.CoreJobForceGC:
		return c.forceGC(eval)
	default:
		return fmt.Errorf("core scheduler cannot handle job '%s'", eval.JobID)
	}
}

// forceGC is used to garbage collect all eligible objects.
func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
	if err := c.jobGC(eval); err != nil {
		return err
	}
	if err := c.evalGC(eval); err != nil {
		return err
	}
	if err := c.deploymentGC(eval); err != nil {
		return err
	}

	// Node GC must occur after the others to ensure the allocations are
	// cleared.
	return c.nodeGC(eval)
}

// jobGC is used to garbage collect eligible jobs.
func (c *CoreScheduler) jobGC(eval *structs.Evaluation) error {
	// Get all the jobs eligible for garbage collection.
	ws := memdb.NewWatchSet()
	iter, err := c.snap.JobsByGC(ws, true)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced job GC")
	} else {
		// Get the time table to calculate GC cutoffs.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.JobGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: job GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.JobGCThreshold)
	}

	// Collect the allocations, evaluations and jobs to GC
	var gcAlloc, gcEval []string
	var gcJob []*structs.Job

OUTER:
	for i := iter.Next(); i != nil; i = iter.Next() {
		job := i.(*structs.Job)

		// Ignore new jobs.
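		// A job whose CreateIndex is newer than the cutoff index has not yet
		// aged past JobGCThreshold; the time table only provides a rough
		// time-to-index mapping, so this is a best-effort age check.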
		if job.CreateIndex > oldThreshold {
			continue
		}

		ws := memdb.NewWatchSet()
		evals, err := c.snap.EvalsByJob(ws, job.Namespace, job.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get evals for job %s: %v", job.ID, err)
			continue
		}

		allEvalsGC := true
		var jobAlloc, jobEval []string
		for _, eval := range evals {
			gc, allocs, err := c.gcEval(eval, oldThreshold, true)
			if err != nil {
				continue OUTER
			}

			if gc {
				jobEval = append(jobEval, eval.ID)
				jobAlloc = append(jobAlloc, allocs...)
			} else {
				allEvalsGC = false
				break
			}
		}

		// Job is eligible for garbage collection
		if allEvalsGC {
			gcJob = append(gcJob, job)
			gcAlloc = append(gcAlloc, jobAlloc...)
			gcEval = append(gcEval, jobEval...)
		}
	}

	// Fast-path the nothing case
	if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: job GC: %d jobs, %d evaluations, %d allocs eligible",
		len(gcJob), len(gcEval), len(gcAlloc))

	// Reap the evals and allocs
	if err := c.evalReap(gcEval, gcAlloc); err != nil {
		return err
	}

	// Reap the jobs
	return c.jobReap(gcJob, eval.LeaderACL)
}

// jobReap contacts the leader and issues a reap on the passed jobs
func (c *CoreScheduler) jobReap(jobs []*structs.Job, leaderACL string) error {
	// Call to the leader to issue the reap
	for _, req := range c.partitionJobReap(jobs, leaderACL) {
		var resp structs.JobBatchDeregisterResponse
		if err := c.srv.RPC("Job.BatchDeregister", req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: batch job reap failed: %v", err)
			return err
		}
	}

	return nil
}

// partitionJobReap returns a list of JobBatchDeregisterRequests to make,
// ensuring a single request does not contain too many jobs. This is necessary
// to ensure that the Raft transaction does not become too large.
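// Each request carries at most maxIdsPerReap jobs (roughly 7,281 with the
// current 0.25 MB budget of 36-byte IDs).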
func (c *CoreScheduler) partitionJobReap(jobs []*structs.Job, leaderACL string) []*structs.JobBatchDeregisterRequest {
	option := &structs.JobDeregisterOptions{Purge: true}
	var requests []*structs.JobBatchDeregisterRequest
	submittedJobs := 0
	for submittedJobs != len(jobs) {
		req := &structs.JobBatchDeregisterRequest{
			Jobs: make(map[structs.NamespacedID]*structs.JobDeregisterOptions),
			WriteRequest: structs.WriteRequest{
				Region:    c.srv.config.Region,
				AuthToken: leaderACL,
			},
		}
		requests = append(requests, req)
		available := maxIdsPerReap

		if remaining := len(jobs) - submittedJobs; remaining > 0 {
			if remaining <= available {
				for _, job := range jobs[submittedJobs:] {
					jns := structs.NamespacedID{ID: job.ID, Namespace: job.Namespace}
					req.Jobs[jns] = option
				}
				submittedJobs += remaining
			} else {
				for _, job := range jobs[submittedJobs : submittedJobs+available] {
					jns := structs.NamespacedID{ID: job.ID, Namespace: job.Namespace}
					req.Jobs[jns] = option
				}
				submittedJobs += available
			}
		}
	}

	return requests
}

// evalGC is used to garbage collect old evaluations
func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
	// Iterate over the evaluations
	ws := memdb.NewWatchSet()
	iter, err := c.snap.Evals(ws)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced eval GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table. This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.EvalGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: eval GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.EvalGCThreshold)
	}

	// Collect the allocations and evaluations to GC
	var gcAlloc, gcEval []string
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		eval := raw.(*structs.Evaluation)

		// The Evaluation GC should not handle batch jobs since those need to be
		// garbage collected in one shot
		gc, allocs, err := c.gcEval(eval, oldThreshold, false)
		if err != nil {
			return err
		}

		if gc {
			gcEval = append(gcEval, eval.ID)
		}
		gcAlloc = append(gcAlloc, allocs...)
	}

	// Fast-path the nothing case
	if len(gcEval) == 0 && len(gcAlloc) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: eval GC: %d evaluations, %d allocs eligible",
		len(gcEval), len(gcAlloc))

	return c.evalReap(gcEval, gcAlloc)
}

// gcEval returns whether the eval should be garbage collected given a raft
// threshold index. The eval is ineligible for garbage collection if it or its
// allocs are newer than the threshold. If the eval should be garbage
// collected, the associated alloc IDs that should also be removed are
// returned.
func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, allowBatch bool) (
	bool, []string, error) {
	// Ignore non-terminal and new evaluations
	if !eval.TerminalStatus() || eval.ModifyIndex > thresholdIndex {
		return false, nil, nil
	}

	// Create a watchset
	ws := memdb.NewWatchSet()

	// Look up the job
	job, err := c.snap.JobByID(ws, eval.Namespace, eval.JobID)
	if err != nil {
		return false, nil, err
	}

	// If the eval is from a running "batch" job we don't want to garbage
	// collect its allocations. If there is a long running batch job and its
	// terminal allocations get GC'd the scheduler would re-run the
	// allocations.
	if eval.Type == structs.JobTypeBatch {
		// Check if the job is running

		// Can collect if:
		// Job doesn't exist
		// Job is Stopped and dead
		// allowBatch and the job is dead
		collect := false
		if job == nil {
			collect = true
		} else if job.Status != structs.JobStatusDead {
			collect = false
		} else if job.Stop {
			collect = true
		} else if allowBatch {
			collect = true
		}

		// We don't want to gc anything related to a job which is not dead
		// If the batch job doesn't exist we can GC it regardless of allowBatch
		if !collect {
			return false, nil, nil
		}
	}

	// Get the allocations by eval
	allocs, err := c.snap.AllocsByEval(ws, eval.ID)
	if err != nil {
		c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for eval %s: %v",
			eval.ID, err)
		return false, nil, err
	}

	// Scan the allocations to ensure they are terminal and old
	gcEval := true
	var gcAllocIDs []string
	for _, alloc := range allocs {
		if !allocGCEligible(alloc, job, time.Now(), thresholdIndex) {
			// Can't GC the evaluation since not all of its allocations are
			// eligible
			gcEval = false
		} else {
			// The allocation is eligible to be GC'd
			gcAllocIDs = append(gcAllocIDs, alloc.ID)
		}
	}

	return gcEval, gcAllocIDs, nil
}

// evalReap contacts the leader and issues a reap on the passed evals and
// allocs.
func (c *CoreScheduler) evalReap(evals, allocs []string) error {
	// Call to the leader to issue the reap
	for _, req := range c.partitionEvalReap(evals, allocs) {
		var resp structs.GenericResponse
		if err := c.srv.RPC("Eval.Reap", req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: eval reap failed: %v", err)
			return err
		}
	}

	return nil
}

// partitionEvalReap returns a list of EvalDeleteRequest to make, ensuring a single
// request does not contain too many allocations and evaluations. This is
// necessary to ensure that the Raft transaction does not become too large.
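// Allocations are packed into a request before evaluations; if a request's
// maxIdsPerReap budget is exhausted by allocations alone, the remaining
// evaluations spill over into subsequent requests.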
func (c *CoreScheduler) partitionEvalReap(evals, allocs []string) []*structs.EvalDeleteRequest {
	var requests []*structs.EvalDeleteRequest
	submittedEvals, submittedAllocs := 0, 0
	for submittedEvals != len(evals) || submittedAllocs != len(allocs) {
		req := &structs.EvalDeleteRequest{
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		requests = append(requests, req)
		available := maxIdsPerReap

		// Add the allocs first
		if remaining := len(allocs) - submittedAllocs; remaining > 0 {
			if remaining <= available {
				req.Allocs = allocs[submittedAllocs:]
				available -= remaining
				submittedAllocs += remaining
			} else {
				req.Allocs = allocs[submittedAllocs : submittedAllocs+available]
				submittedAllocs += available

				// Exhausted space so skip adding evals
				continue
			}
		}

		// Add the evals
		if remaining := len(evals) - submittedEvals; remaining > 0 {
			if remaining <= available {
				req.Evals = evals[submittedEvals:]
				submittedEvals += remaining
			} else {
				req.Evals = evals[submittedEvals : submittedEvals+available]
				submittedEvals += available
			}
		}
	}

	return requests
}

// nodeGC is used to garbage collect old nodes
func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error {
	// Iterate over the nodes
	ws := memdb.NewWatchSet()
	iter, err := c.snap.Nodes(ws)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced node GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table. This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.NodeGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: node GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.NodeGCThreshold)
	}

	// Collect the nodes to GC
	var gcNode []string
OUTER:
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		node := raw.(*structs.Node)

		// Ignore non-terminal and new nodes
		if !node.TerminalStatus() || node.ModifyIndex > oldThreshold {
			continue
		}

		// Get the allocations by node
		ws := memdb.NewWatchSet()
		allocs, err := c.snap.AllocsByNode(ws, node.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for node %s: %v",
				node.ID, err)
			continue
		}

		// If there are any non-terminal allocations, skip the node. If the node
		// is terminal and the allocations are not, the scheduler may not have
		// run yet to transition the allocs on the node to terminal. We delay
		// GC'ing until this happens.
		for _, alloc := range allocs {
			if !alloc.TerminalStatus() {
				continue OUTER
			}
		}

		// Node is eligible for garbage collection
		gcNode = append(gcNode, node.ID)
	}

	// Fast-path the nothing case
	if len(gcNode) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: node GC: %d nodes eligible", len(gcNode))

	// Call to the leader to issue the reap
	for _, nodeID := range gcNode {
		req := structs.NodeDeregisterRequest{
			NodeID: nodeID,
			WriteRequest: structs.WriteRequest{
				Region:    c.srv.config.Region,
				AuthToken: eval.LeaderACL,
			},
		}
		var resp structs.NodeUpdateResponse
		if err := c.srv.RPC("Node.Deregister", &req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: node '%s' reap failed: %v", nodeID, err)
			return err
		}
	}
	return nil
}

// deploymentGC is used to garbage collect old deployments
func (c *CoreScheduler) deploymentGC(eval *structs.Evaluation) error {
	// Iterate over the deployments
	ws := memdb.NewWatchSet()
	iter, err := c.snap.Deployments(ws)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced deployment GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table. This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.DeploymentGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: deployment GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.DeploymentGCThreshold)
	}

	// Collect the deployments to GC
	var gcDeployment []string

OUTER:
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		deploy := raw.(*structs.Deployment)

		// Ignore non-terminal and new deployments
		if deploy.Active() || deploy.ModifyIndex > oldThreshold {
			continue
		}

		// Look up the allocations created by this deployment.
		allocs, err := c.snap.AllocsByDeployment(ws, deploy.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for deployment %s: %v",
				deploy.ID, err)
			continue
		}

		// Skip the deployment if any of its allocations are still non-terminal.
		for _, alloc := range allocs {
			if !alloc.TerminalStatus() {
				continue OUTER
			}
		}

		// Deployment is eligible for garbage collection
		gcDeployment = append(gcDeployment, deploy.ID)
	}

	// Fast-path the nothing case
	if len(gcDeployment) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: deployment GC: %d deployments eligible", len(gcDeployment))
	return c.deploymentReap(gcDeployment)
}

// deploymentReap contacts the leader and issues a reap on the passed
// deployments.
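// The reap is partitioned via partitionDeploymentReap so a single Raft
// transaction never carries more than maxIdsPerReap deployment IDs.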
func (c *CoreScheduler) deploymentReap(deployments []string) error {
	// Call to the leader to issue the reap
	for _, req := range c.partitionDeploymentReap(deployments) {
		var resp structs.GenericResponse
		if err := c.srv.RPC("Deployment.Reap", req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: deployment reap failed: %v", err)
			return err
		}
	}

	return nil
}

// partitionDeploymentReap returns a list of DeploymentDeleteRequest to make,
// ensuring a single request does not contain too many deployments. This is
// necessary to ensure that the Raft transaction does not become too large.
func (c *CoreScheduler) partitionDeploymentReap(deployments []string) []*structs.DeploymentDeleteRequest {
	var requests []*structs.DeploymentDeleteRequest
	submittedDeployments := 0
	for submittedDeployments != len(deployments) {
		req := &structs.DeploymentDeleteRequest{
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		requests = append(requests, req)
		available := maxIdsPerReap

		if remaining := len(deployments) - submittedDeployments; remaining > 0 {
			if remaining <= available {
				req.Deployments = deployments[submittedDeployments:]
				submittedDeployments += remaining
			} else {
				req.Deployments = deployments[submittedDeployments : submittedDeployments+available]
				submittedDeployments += available
			}
		}
	}

	return requests
}

// allocGCEligible returns whether the allocation is eligible to be garbage
// collected according to its terminal status and its reschedule trackers
func allocGCEligible(a *structs.Allocation, job *structs.Job, gcTime time.Time, thresholdIndex uint64) bool {
	// Not eligible unless the alloc is in a terminal status and old enough
	if !a.TerminalStatus() || a.ModifyIndex > thresholdIndex {
		return false
	}

	// If the job is deleted, stopped or dead all allocs can be removed
	if job == nil || job.Stop || job.Status == structs.JobStatusDead {
		return true
	}

	// If the allocation's desired state is Stop, it can be GCed even if it
	// has failed and hasn't been rescheduled.
	// This can happen during job updates.
	if a.DesiredStatus == structs.AllocDesiredStatusStop {
		return true
	}

	// If the alloc hasn't failed then we don't need to consider it for rescheduling
	// Rescheduling needs to copy over information from the previous alloc so that it
	// can enforce the reschedule policy
	if a.ClientStatus != structs.AllocClientStatusFailed {
		return true
	}

	var reschedulePolicy *structs.ReschedulePolicy
	tg := job.LookupTaskGroup(a.TaskGroup)

	if tg != nil {
		reschedulePolicy = tg.ReschedulePolicy
	}
	// No reschedule policy or rescheduling is disabled
	if reschedulePolicy == nil || (!reschedulePolicy.Unlimited && reschedulePolicy.Attempts == 0) {
		return true
	}
	// Restart tracking information has been carried forward
	if a.NextAllocation != "" {
		return true
	}

	// This task has unlimited rescheduling and the alloc has not been replaced, so we can't GC it yet
	if reschedulePolicy.Unlimited {
		return false
	}

	// No restarts have been attempted yet
	if a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0 {
		return false
	}

	// Don't GC if most recent reschedule attempt is within time interval
	interval := reschedulePolicy.Interval
	lastIndex := len(a.RescheduleTracker.Events)
	lastRescheduleEvent := a.RescheduleTracker.Events[lastIndex-1]
	timeDiff := gcTime.UTC().UnixNano() - lastRescheduleEvent.RescheduleTime

	return timeDiff > interval.Nanoseconds()
}