github.com/uchennaokeke444/nomad@v0.11.8/nomad/core_sched.go

package nomad

import (
	"fmt"
	"math"
	"strings"
	"time"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	version "github.com/hashicorp/go-version"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

var (
	// maxIdsPerReap is the maximum number of evals and allocations to reap in a
	// single Raft transaction. This is to ensure that the Raft message does not
	// become too large.
	maxIdsPerReap = (1024 * 256) / 36 // 0.25 MB of ids.
)

// CoreScheduler is a special "scheduler" that is registered
// as "_core". It is used to run various administrative work
// across the cluster.
type CoreScheduler struct {
	srv    *Server
	snap   *state.StateSnapshot
	logger log.Logger
}

// NewCoreScheduler returns a new core scheduler instance
func NewCoreScheduler(srv *Server, snap *state.StateSnapshot) scheduler.Scheduler {
	s := &CoreScheduler{
		srv:    srv,
		snap:   snap,
		logger: srv.logger.ResetNamed("core.sched"),
	}
	return s
}

// Process is used to implement the scheduler.Scheduler interface
func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
	job := strings.Split(eval.JobID, ":") // extra data can be smuggled in w/ JobID
	switch job[0] {
	case structs.CoreJobEvalGC:
		return c.evalGC(eval)
	case structs.CoreJobNodeGC:
		return c.nodeGC(eval)
	case structs.CoreJobJobGC:
		return c.jobGC(eval)
	case structs.CoreJobDeploymentGC:
		return c.deploymentGC(eval)
	case structs.CoreJobCSIVolumeClaimGC:
		return c.csiVolumeClaimGC(eval)
	case structs.CoreJobCSIPluginGC:
		return c.csiPluginGC(eval)
	case structs.CoreJobForceGC:
		return c.forceGC(eval)
	default:
		return fmt.Errorf("core scheduler cannot handle job '%s'", eval.JobID)
	}
}

// forceGC is used to garbage collect all eligible objects.
func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
	if err := c.jobGC(eval); err != nil {
		return err
	}
	if err := c.evalGC(eval); err != nil {
		return err
	}
	if err := c.deploymentGC(eval); err != nil {
		return err
	}
	if err := c.csiPluginGC(eval); err != nil {
		return err
	}
	if err := c.csiVolumeClaimGC(eval); err != nil {
		return err
	}

	// Node GC must occur after the others to ensure the allocations are
	// cleared.
	return c.nodeGC(eval)
}
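
// Note on dispatch: the JobID of a core evaluation both selects the GC pass in
// Process above and can smuggle extra data after a colon (for example, the CSI
// volume claim GC below expects "<structs.CoreJobCSIVolumeClaimGC>:<volume id>").
// A rough sketch of what such an evaluation might look like; the real
// construction lives in the leader code rather than in this file, so treat the
// field choices here as illustrative assumptions only:
//
//	eval := &structs.Evaluation{
//		Type:   structs.JobTypeCore,
//		JobID:  structs.CoreJobCSIVolumeClaimGC + ":" + volumeID,
//		Status: structs.EvalStatusPending,
//	}
//	_ = core.Process(eval) // routes to csiVolumeClaimGC and releases claims for volumeID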

// jobGC is used to garbage collect eligible jobs.
func (c *CoreScheduler) jobGC(eval *structs.Evaluation) error {
	// Get all the jobs eligible for garbage collection.
	ws := memdb.NewWatchSet()
	iter, err := c.snap.JobsByGC(ws, true)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.logger.Debug("forced job GC")
	} else {
		// Get the time table to calculate GC cutoffs.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.JobGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.logger.Debug("job GC scanning before cutoff index",
			"index", oldThreshold, "job_gc_threshold", c.srv.config.JobGCThreshold)
	}

	// Collect the allocations, evaluations and jobs to GC
	var gcAlloc, gcEval []string
	var gcJob []*structs.Job

OUTER:
	for i := iter.Next(); i != nil; i = iter.Next() {
		job := i.(*structs.Job)

		// Ignore new jobs.
		if job.CreateIndex > oldThreshold {
			continue
		}

		ws := memdb.NewWatchSet()
		evals, err := c.snap.EvalsByJob(ws, job.Namespace, job.ID)
		if err != nil {
			c.logger.Error("job GC failed to get evals for job", "job", job.ID, "error", err)
			continue
		}

		allEvalsGC := true
		var jobAlloc, jobEval []string
		for _, eval := range evals {
			gc, allocs, err := c.gcEval(eval, oldThreshold, true)
			if err != nil {
				continue OUTER
			}

			if gc {
				jobEval = append(jobEval, eval.ID)
				jobAlloc = append(jobAlloc, allocs...)
			} else {
				allEvalsGC = false
				break
			}
		}

		// Job is eligible for garbage collection
		if allEvalsGC {
			gcJob = append(gcJob, job)
			gcAlloc = append(gcAlloc, jobAlloc...)
			gcEval = append(gcEval, jobEval...)
		}

	}

	// Fast-path the nothing case
	if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 {
		return nil
	}
	c.logger.Debug("job GC found eligible objects",
		"jobs", len(gcJob), "evals", len(gcEval), "allocs", len(gcAlloc))

	// Reap the evals and allocs
	if err := c.evalReap(gcEval, gcAlloc); err != nil {
		return err
	}

	// Reap the jobs
	return c.jobReap(gcJob, eval.LeaderACL)
}

// jobReap contacts the leader and issues a reap on the passed jobs
func (c *CoreScheduler) jobReap(jobs []*structs.Job, leaderACL string) error {
	// Call to the leader to issue the reap
	for _, req := range c.partitionJobReap(jobs, leaderACL) {
		var resp structs.JobBatchDeregisterResponse
		if err := c.srv.RPC("Job.BatchDeregister", req, &resp); err != nil {
			c.logger.Error("batch job reap failed", "error", err)
			return err
		}
	}

	return nil
}
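
// Sizing note: maxIdsPerReap above works out to (1024*256)/36 = 7281 IDs per
// request, assuming each ID is a 36-character UUID string, which keeps a single
// batch at roughly 0.25 MB of IDs. The same budget is reused by the partition
// helpers below (jobs, evals/allocs, and deployments).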

// partitionJobReap returns a list of JobBatchDeregisterRequests to make,
// ensuring a single request does not contain too many jobs. This is necessary
// to ensure that the Raft transaction does not become too large.
func (c *CoreScheduler) partitionJobReap(jobs []*structs.Job, leaderACL string) []*structs.JobBatchDeregisterRequest {
	option := &structs.JobDeregisterOptions{Purge: true}
	var requests []*structs.JobBatchDeregisterRequest
	submittedJobs := 0
	for submittedJobs != len(jobs) {
		req := &structs.JobBatchDeregisterRequest{
			Jobs: make(map[structs.NamespacedID]*structs.JobDeregisterOptions),
			WriteRequest: structs.WriteRequest{
				Region:    c.srv.config.Region,
				AuthToken: leaderACL,
			},
		}
		requests = append(requests, req)
		available := maxIdsPerReap

		if remaining := len(jobs) - submittedJobs; remaining > 0 {
			if remaining <= available {
				for _, job := range jobs[submittedJobs:] {
					jns := structs.NamespacedID{ID: job.ID, Namespace: job.Namespace}
					req.Jobs[jns] = option
				}
				submittedJobs += remaining
			} else {
				for _, job := range jobs[submittedJobs : submittedJobs+available] {
					jns := structs.NamespacedID{ID: job.ID, Namespace: job.Namespace}
					req.Jobs[jns] = option
				}
				submittedJobs += available
			}
		}
	}

	return requests
}

// evalGC is used to garbage collect old evaluations
func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
	// Iterate over the evaluations
	ws := memdb.NewWatchSet()
	iter, err := c.snap.Evals(ws)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.logger.Debug("forced eval GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table. This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.EvalGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.logger.Debug("eval GC scanning before cutoff index",
			"index", oldThreshold, "eval_gc_threshold", c.srv.config.EvalGCThreshold)
	}

	// Collect the allocations and evaluations to GC
	var gcAlloc, gcEval []string
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		eval := raw.(*structs.Evaluation)

		// The Evaluation GC should not handle batch jobs since those need to be
		// garbage collected in one shot
		gc, allocs, err := c.gcEval(eval, oldThreshold, false)
		if err != nil {
			return err
		}

		if gc {
			gcEval = append(gcEval, eval.ID)
		}
		gcAlloc = append(gcAlloc, allocs...)
	}

	// Fast-path the nothing case
	if len(gcEval) == 0 && len(gcAlloc) == 0 {
		return nil
	}
	c.logger.Debug("eval GC found eligible objects",
		"evals", len(gcEval), "allocs", len(gcAlloc))

	return c.evalReap(gcEval, gcAlloc)
}
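
// The cutoff pattern above is shared by every GC pass: subtract the configured
// threshold from the current time, then ask the FSM time table for the nearest
// Raft index, so eligibility is decided by CreateIndex/ModifyIndex comparisons
// rather than timestamps. A minimal sketch, assuming a one-hour threshold:
//
//	cutoff := time.Now().UTC().Add(-1 * time.Hour)
//	threshold := c.srv.fsm.TimeTable().NearestIndex(cutoff)
//	// objects with ModifyIndex (or CreateIndex) <= threshold are old enough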

// gcEval returns whether the eval should be garbage collected given a raft
// threshold index. The eval is not eligible for garbage collection if it or
// its allocs are not older than the threshold. If the eval should be garbage
// collected, the associated alloc ids that should also be removed are also
// returned
func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, allowBatch bool) (
	bool, []string, error) {
	// Ignore non-terminal and new evaluations
	if !eval.TerminalStatus() || eval.ModifyIndex > thresholdIndex {
		return false, nil, nil
	}

	// Create a watchset
	ws := memdb.NewWatchSet()

	// Look up the job
	job, err := c.snap.JobByID(ws, eval.Namespace, eval.JobID)
	if err != nil {
		return false, nil, err
	}

	// Get the allocations by eval
	allocs, err := c.snap.AllocsByEval(ws, eval.ID)
	if err != nil {
		c.logger.Error("failed to get allocs for eval",
			"eval_id", eval.ID, "error", err)
		return false, nil, err
	}

	// If the eval is from a running "batch" job we don't want to garbage
	// collect its allocations. If there is a long running batch job and its
	// terminal allocations get GC'd the scheduler would re-run the
	// allocations.
	if eval.Type == structs.JobTypeBatch {
		// Check if the job is running

		// Can collect if:
		// Job doesn't exist
		// Job is Stopped and dead
		// allowBatch and the job is dead
		collect := false
		if job == nil {
			collect = true
		} else if job.Status != structs.JobStatusDead {
			collect = false
		} else if job.Stop {
			collect = true
		} else if allowBatch {
			collect = true
		}

		// We don't want to gc anything related to a job which is not dead
		// If the batch job doesn't exist we can GC it regardless of allowBatch
		if !collect {
			// Find allocs associated with an older job version (based on CreateIndex)
			// and GC them if terminal
			oldAllocs := olderVersionTerminalAllocs(allocs, job)
			return false, oldAllocs, nil
		}
	}

	// Scan the allocations to ensure they are terminal and old
	gcEval := true
	var gcAllocIDs []string
	for _, alloc := range allocs {
		if !allocGCEligible(alloc, job, time.Now(), thresholdIndex) {
			// Can't GC the evaluation since not all of the allocations are
			// eligible
			gcEval = false
		} else {
			// The allocation is eligible to be GC'd
			gcAllocIDs = append(gcAllocIDs, alloc.ID)
		}
	}

	return gcEval, gcAllocIDs, nil
}

// olderVersionTerminalAllocs returns terminal allocations whose job create index
// is older than the job's create index
func olderVersionTerminalAllocs(allocs []*structs.Allocation, job *structs.Job) []string {
	var ret []string
	for _, alloc := range allocs {
		if alloc.Job != nil && alloc.Job.CreateIndex < job.CreateIndex && alloc.TerminalStatus() {
			ret = append(ret, alloc.ID)
		}
	}
	return ret
}

// evalReap contacts the leader and issues a reap on the passed evals and
// allocs.
func (c *CoreScheduler) evalReap(evals, allocs []string) error {
	// Call to the leader to issue the reap
	for _, req := range c.partitionEvalReap(evals, allocs) {
		var resp structs.GenericResponse
		if err := c.srv.RPC("Eval.Reap", req, &resp); err != nil {
			c.logger.Error("eval reap failed", "error", err)
			return err
		}
	}

	return nil
}
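
// Worked example for the partitioning below, assuming a hypothetical
// maxIdsPerReap of 2 with allocs [a1, a2, a3] and evals [e1, e2]:
//
//	request 1: Allocs [a1, a2]          (alloc budget exhausted, evals skipped)
//	request 2: Allocs [a3], Evals [e1]  (one slot left after the allocs)
//	request 3: Evals [e2]
//
// Allocations are always packed first; evaluations only fill whatever budget
// remains in that request.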

// partitionEvalReap returns a list of EvalDeleteRequest to make, ensuring a single
// request does not contain too many allocations and evaluations. This is
// necessary to ensure that the Raft transaction does not become too large.
func (c *CoreScheduler) partitionEvalReap(evals, allocs []string) []*structs.EvalDeleteRequest {
	var requests []*structs.EvalDeleteRequest
	submittedEvals, submittedAllocs := 0, 0
	for submittedEvals != len(evals) || submittedAllocs != len(allocs) {
		req := &structs.EvalDeleteRequest{
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		requests = append(requests, req)
		available := maxIdsPerReap

		// Add the allocs first
		if remaining := len(allocs) - submittedAllocs; remaining > 0 {
			if remaining <= available {
				req.Allocs = allocs[submittedAllocs:]
				available -= remaining
				submittedAllocs += remaining
			} else {
				req.Allocs = allocs[submittedAllocs : submittedAllocs+available]
				submittedAllocs += available

				// Exhausted space so skip adding evals
				continue
			}
		}

		// Add the evals
		if remaining := len(evals) - submittedEvals; remaining > 0 {
			if remaining <= available {
				req.Evals = evals[submittedEvals:]
				submittedEvals += remaining
			} else {
				req.Evals = evals[submittedEvals : submittedEvals+available]
				submittedEvals += available
			}
		}
	}

	return requests
}

// nodeGC is used to garbage collect old nodes
func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error {
	// Iterate over the nodes
	ws := memdb.NewWatchSet()
	iter, err := c.snap.Nodes(ws)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.logger.Debug("forced node GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table. This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.NodeGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.logger.Debug("node GC scanning before cutoff index",
			"index", oldThreshold, "node_gc_threshold", c.srv.config.NodeGCThreshold)
	}

	// Collect the nodes to GC
	var gcNode []string
OUTER:
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		node := raw.(*structs.Node)

		// Ignore non-terminal and new nodes
		if !node.TerminalStatus() || node.ModifyIndex > oldThreshold {
			continue
		}

		// Get the allocations by node
		ws := memdb.NewWatchSet()
		allocs, err := c.snap.AllocsByNode(ws, node.ID)
		if err != nil {
			c.logger.Error("failed to get allocs for node",
				"node_id", node.ID, "error", err)
			continue
		}

		// If there are any non-terminal allocations, skip the node. If the node
		// is terminal and the allocations are not, the scheduler may not have
		// run yet to transition the allocs on the node to terminal. We delay
		// GC'ing until this happens.
		for _, alloc := range allocs {
			if !alloc.TerminalStatus() {
				continue OUTER
			}
		}

		// Node is eligible for garbage collection
		gcNode = append(gcNode, node.ID)
	}

	// Fast-path the nothing case
	if len(gcNode) == 0 {
		return nil
	}
	c.logger.Debug("node GC found eligible nodes", "nodes", len(gcNode))
	return c.nodeReap(eval, gcNode)
}

func (c *CoreScheduler) nodeReap(eval *structs.Evaluation, nodeIDs []string) error {
	// For old clusters, send single deregistration messages COMPAT(0.11)
	minVersionBatchNodeDeregister := version.Must(version.NewVersion("0.9.4"))
	if !ServersMeetMinimumVersion(c.srv.Members(), minVersionBatchNodeDeregister, true) {
		for _, id := range nodeIDs {
			req := structs.NodeDeregisterRequest{
				NodeID: id,
				WriteRequest: structs.WriteRequest{
					Region:    c.srv.config.Region,
					AuthToken: eval.LeaderACL,
				},
			}
			var resp structs.NodeUpdateResponse
			if err := c.srv.RPC("Node.Deregister", &req, &resp); err != nil {
				c.logger.Error("node reap failed", "node_id", id, "error", err)
				return err
			}
		}
		return nil
	}

	// Call to the leader to issue the reap
	for _, ids := range partitionAll(maxIdsPerReap, nodeIDs) {
		req := structs.NodeBatchDeregisterRequest{
			NodeIDs: ids,
			WriteRequest: structs.WriteRequest{
				Region:    c.srv.config.Region,
				AuthToken: eval.LeaderACL,
			},
		}
		var resp structs.NodeUpdateResponse
		if err := c.srv.RPC("Node.BatchDeregister", &req, &resp); err != nil {
			c.logger.Error("node reap failed", "node_ids", ids, "error", err)
			return err
		}
	}
	return nil
}
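
// Compatibility note for nodeReap above: clusters that still contain servers
// older than 0.9.4 fall back to one Node.Deregister RPC per node, while newer
// clusters use Node.BatchDeregister with the node IDs chunked into
// maxIdsPerReap-sized batches by partitionAll (a chunking helper assumed to be
// defined elsewhere in this package).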

// deploymentGC is used to garbage collect old deployments
func (c *CoreScheduler) deploymentGC(eval *structs.Evaluation) error {
	// Iterate over the deployments
	ws := memdb.NewWatchSet()
	iter, err := c.snap.Deployments(ws)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.logger.Debug("forced deployment GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table. This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.DeploymentGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.logger.Debug("deployment GC scanning before cutoff index",
			"index", oldThreshold, "deployment_gc_threshold", c.srv.config.DeploymentGCThreshold)
	}

	// Collect the deployments to GC
	var gcDeployment []string

OUTER:
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		deploy := raw.(*structs.Deployment)

		// Ignore non-terminal and new deployments
		if deploy.Active() || deploy.ModifyIndex > oldThreshold {
			continue
		}

		// Ensure there are no allocs referencing this deployment.
		allocs, err := c.snap.AllocsByDeployment(ws, deploy.ID)
		if err != nil {
			c.logger.Error("failed to get allocs for deployment",
				"deployment_id", deploy.ID, "error", err)
			continue
		}

		// Skip the deployment if any of its allocations are non-terminal.
		for _, alloc := range allocs {
			if !alloc.TerminalStatus() {
				continue OUTER
			}
		}

		// Deployment is eligible for garbage collection
		gcDeployment = append(gcDeployment, deploy.ID)
	}

	// Fast-path the nothing case
	if len(gcDeployment) == 0 {
		return nil
	}
	c.logger.Debug("deployment GC found eligible deployments", "deployments", len(gcDeployment))
	return c.deploymentReap(gcDeployment)
}

// deploymentReap contacts the leader and issues a reap on the passed
// deployments.
func (c *CoreScheduler) deploymentReap(deployments []string) error {
	// Call to the leader to issue the reap
	for _, req := range c.partitionDeploymentReap(deployments) {
		var resp structs.GenericResponse
		if err := c.srv.RPC("Deployment.Reap", req, &resp); err != nil {
			c.logger.Error("deployment reap failed", "error", err)
			return err
		}
	}

	return nil
}

// partitionDeploymentReap returns a list of DeploymentDeleteRequest to make,
// ensuring a single request does not contain too many deployments. This is
// necessary to ensure that the Raft transaction does not become too large.
func (c *CoreScheduler) partitionDeploymentReap(deployments []string) []*structs.DeploymentDeleteRequest {
	var requests []*structs.DeploymentDeleteRequest
	submittedDeployments := 0
	for submittedDeployments != len(deployments) {
		req := &structs.DeploymentDeleteRequest{
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		requests = append(requests, req)
		available := maxIdsPerReap

		if remaining := len(deployments) - submittedDeployments; remaining > 0 {
			if remaining <= available {
				req.Deployments = deployments[submittedDeployments:]
				submittedDeployments += remaining
			} else {
				req.Deployments = deployments[submittedDeployments : submittedDeployments+available]
				submittedDeployments += available
			}
		}
	}

	return requests
}
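
// allocGCEligible below applies its checks in rough order of precedence: an
// allocation must be terminal and older than the threshold index, and must not
// still be running on the client; after that, allocs of deleted, stopped, or
// dead jobs, allocs whose desired status is stop, and non-failed allocs are
// always collectable. Failed allocs are held back only while their reschedule
// policy could still act on them (unlimited rescheduling without a replacement,
// no reschedule attempts yet, or a recent attempt inside the policy interval).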

// allocGCEligible returns whether the allocation is eligible to be garbage
// collected according to its terminal status and its reschedule trackers
func allocGCEligible(a *structs.Allocation, job *structs.Job, gcTime time.Time, thresholdIndex uint64) bool {
	// Not eligible if the alloc is not terminal or not old enough
	if !a.TerminalStatus() || a.ModifyIndex > thresholdIndex {
		return false
	}

	// If the allocation is still running on the client we can not garbage
	// collect it.
	if a.ClientStatus == structs.AllocClientStatusRunning {
		return false
	}

	// If the job is deleted, stopped or dead all allocs can be removed
	if job == nil || job.Stop || job.Status == structs.JobStatusDead {
		return true
	}

	// If the allocation's desired state is Stop, it can be GCed even if it
	// has failed and hasn't been rescheduled. This can happen during job updates.
	if a.DesiredStatus == structs.AllocDesiredStatusStop {
		return true
	}

	// If the alloc hasn't failed then we don't need to consider it for rescheduling
	// Rescheduling needs to copy over information from the previous alloc so that it
	// can enforce the reschedule policy
	if a.ClientStatus != structs.AllocClientStatusFailed {
		return true
	}

	var reschedulePolicy *structs.ReschedulePolicy
	tg := job.LookupTaskGroup(a.TaskGroup)

	if tg != nil {
		reschedulePolicy = tg.ReschedulePolicy
	}
	// No reschedule policy or rescheduling is disabled
	if reschedulePolicy == nil || (!reschedulePolicy.Unlimited && reschedulePolicy.Attempts == 0) {
		return true
	}
	// Restart tracking information has been carried forward
	if a.NextAllocation != "" {
		return true
	}

	// This task group has unlimited rescheduling and the alloc has not been replaced, so we can't GC it yet
	if reschedulePolicy.Unlimited {
		return false
	}

	// No restarts have been attempted yet
	if a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0 {
		return false
	}

	// Don't GC if the most recent reschedule attempt is within the policy's interval
	interval := reschedulePolicy.Interval
	lastIndex := len(a.RescheduleTracker.Events)
	lastRescheduleEvent := a.RescheduleTracker.Events[lastIndex-1]
	timeDiff := gcTime.UTC().UnixNano() - lastRescheduleEvent.RescheduleTime

	return timeDiff > interval.Nanoseconds()
}
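
// Example of the final interval check, assuming a reschedule policy with
// Attempts: 2 and Interval: 24h: a failed alloc whose last reschedule event was
// 30 hours before gcTime has timeDiff (30h) > interval (24h) and is collected,
// while one rescheduled 6 hours ago is kept so the tracker can still enforce
// the attempt limit.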

// csiVolumeClaimGC is used to garbage collect CSI volume claims
func (c *CoreScheduler) csiVolumeClaimGC(eval *structs.Evaluation) error {

	gcClaims := func(ns, volID string) error {
		req := &structs.CSIVolumeClaimRequest{
			VolumeID: volID,
			Claim:    structs.CSIVolumeClaimRelease,
		}
		req.Namespace = ns
		req.Region = c.srv.config.Region
		err := c.srv.RPC("CSIVolume.Claim", req, &structs.CSIVolumeClaimResponse{})
		return err
	}

	c.logger.Trace("garbage collecting unclaimed CSI volume claims", "eval.JobID", eval.JobID)

	// Volume ID smuggled in with the eval's own JobID
	evalVolID := strings.Split(eval.JobID, ":")

	// COMPAT(1.0): 0.11.0 shipped with 3 fields. Tighten this check to len == 2
	if len(evalVolID) > 1 {
		volID := evalVolID[1]
		return gcClaims(eval.Namespace, volID)
	}

	ws := memdb.NewWatchSet()

	iter, err := c.snap.CSIVolumes(ws)
	if err != nil {
		return err
	}

	// Get the time table to calculate GC cutoffs.
	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so
		// everything will GC.
		oldThreshold = math.MaxUint64
		c.logger.Debug("forced volume claim GC")
	} else {
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.CSIVolumeClaimGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
	}

	c.logger.Debug("CSI volume claim GC scanning before cutoff index",
		"index", oldThreshold,
		"csi_volume_claim_gc_threshold", c.srv.config.CSIVolumeClaimGCThreshold)

NEXT_VOLUME:
	for i := iter.Next(); i != nil; i = iter.Next() {
		vol := i.(*structs.CSIVolume)

		// Ignore new volumes
		if vol.CreateIndex > oldThreshold {
			continue
		}

		// We only call the claim release RPC if the volume has claims
		// that no longer have valid allocations. Otherwise we'd send
		// out a lot of do-nothing RPCs.
		for id := range vol.ReadClaims {
			alloc, err := c.snap.AllocByID(ws, id)
			if err != nil {
				return err
			}
			if alloc == nil {
				err = gcClaims(vol.Namespace, vol.ID)
				if err != nil {
					return err
				}
				goto NEXT_VOLUME
			}
		}
		for id := range vol.WriteClaims {
			alloc, err := c.snap.AllocByID(ws, id)
			if err != nil {
				return err
			}
			if alloc == nil {
				err = gcClaims(vol.Namespace, vol.ID)
				if err != nil {
					return err
				}
				goto NEXT_VOLUME
			}
		}
		if len(vol.PastClaims) > 0 {
			err = gcClaims(vol.Namespace, vol.ID)
			if err != nil {
				return err
			}
		}

	}
	return nil

}

// csiPluginGC is used to garbage collect unused plugins
func (c *CoreScheduler) csiPluginGC(eval *structs.Evaluation) error {

	ws := memdb.NewWatchSet()

	iter, err := c.snap.CSIPlugins(ws)
	if err != nil {
		return err
	}

	// Get the time table to calculate GC cutoffs.
	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so
		// everything will GC.
		oldThreshold = math.MaxUint64
		c.logger.Debug("forced plugin GC")
	} else {
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.CSIPluginGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
	}

	c.logger.Debug("CSI plugin GC scanning before cutoff index",
		"index", oldThreshold, "csi_plugin_gc_threshold", c.srv.config.CSIPluginGCThreshold)

	for i := iter.Next(); i != nil; i = iter.Next() {
		plugin := i.(*structs.CSIPlugin)

		// Ignore new plugins
		if plugin.CreateIndex > oldThreshold {
			continue
		}

		req := &structs.CSIPluginDeleteRequest{ID: plugin.ID}
		req.Region = c.srv.Region()
		err := c.srv.RPC("CSIPlugin.Delete", req, &structs.CSIPluginDeleteResponse{})
		if err != nil {
			if err.Error() == "plugin in use" {
				continue
			}
			c.logger.Error("failed to GC plugin", "plugin_id", plugin.ID, "error", err)
			return err
		}
	}
	return nil
}