github.com/blixtra/nomad@v0.7.2-0.20171221000451-da9a1d7bb050/nomad/core_sched.go

package nomad

import (
	"fmt"
	"math"
	"time"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

var (
	// maxIdsPerReap is the maximum number of evals and allocations to reap in
	// a single Raft transaction. This is to ensure that the Raft message does
	// not become too large.
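	// A canonical UUID string is 36 bytes (32 hex digits plus 4 hyphens),
	// so this works out to (1024*256)/36 = 7281 IDs per reap request.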
	maxIdsPerReap = (1024 * 256) / 36 // 0.25 MB of ids.
)

// CoreScheduler is a special "scheduler" that is registered
// as "_core". It is used to run various administrative work
// across the cluster.
type CoreScheduler struct {
	srv  *Server
	snap *state.StateSnapshot
}

// NewCoreScheduler is used to return a new core scheduler instance
func NewCoreScheduler(srv *Server, snap *state.StateSnapshot) scheduler.Scheduler {
	s := &CoreScheduler{
		srv:  srv,
		snap: snap,
	}
	return s
}

// Process is used to implement the scheduler.Scheduler interface
func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
	switch eval.JobID {
	case structs.CoreJobEvalGC:
		return c.evalGC(eval)
	case structs.CoreJobNodeGC:
		return c.nodeGC(eval)
	case structs.CoreJobJobGC:
		return c.jobGC(eval)
	case structs.CoreJobDeploymentGC:
		return c.deploymentGC(eval)
	case structs.CoreJobForceGC:
		return c.forceGC(eval)
	default:
		return fmt.Errorf("core scheduler cannot handle job '%s'", eval.JobID)
	}
}

// forceGC is used to garbage collect all eligible objects.
func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
	if err := c.jobGC(eval); err != nil {
		return err
	}
	if err := c.evalGC(eval); err != nil {
		return err
	}
	if err := c.deploymentGC(eval); err != nil {
		return err
	}

	// Node GC must occur after the others to ensure the allocations are
	// cleared.
	return c.nodeGC(eval)
}

// jobGC is used to garbage collect eligible jobs.
func (c *CoreScheduler) jobGC(eval *structs.Evaluation) error {
	// Get all the jobs eligible for garbage collection.
	ws := memdb.NewWatchSet()
	iter, err := c.snap.JobsByGC(ws, true)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so
		// everything will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced job GC")
	} else {
		// Get the time table to calculate GC cutoffs.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.JobGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: job GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.JobGCThreshold)
	}

	// Collect the allocations, evaluations and jobs to GC
	var gcAlloc, gcEval []string
	var gcJob []*structs.Job

OUTER:
	for i := iter.Next(); i != nil; i = iter.Next() {
		job := i.(*structs.Job)

		// Ignore new jobs.
		if job.CreateIndex > oldThreshold {
			continue
		}

		ws := memdb.NewWatchSet()
		evals, err := c.snap.EvalsByJob(ws, job.Namespace, job.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get evals for job %s: %v", job.ID, err)
			continue
		}

		allEvalsGC := true
		var jobAlloc, jobEval []string
		for _, eval := range evals {
			gc, allocs, err := c.gcEval(eval, oldThreshold, true)
			if err != nil {
				continue OUTER
			}

			if gc {
				jobEval = append(jobEval, eval.ID)
				jobAlloc = append(jobAlloc, allocs...)
			} else {
				allEvalsGC = false
				break
			}
		}

		// Job is eligible for garbage collection
		if allEvalsGC {
			gcJob = append(gcJob, job)
			gcAlloc = append(gcAlloc, jobAlloc...)
			gcEval = append(gcEval, jobEval...)
		}
	}

	// Fast-path the nothing case
	if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: job GC: %d jobs, %d evaluations, %d allocs eligible",
		len(gcJob), len(gcEval), len(gcAlloc))

	// Reap the evals and allocs
	if err := c.evalReap(gcEval, gcAlloc); err != nil {
		return err
	}

	// Call to the leader to deregister the jobs.
	for _, job := range gcJob {
		req := structs.JobDeregisterRequest{
			JobID: job.ID,
			Purge: true,
			WriteRequest: structs.WriteRequest{
				Region:    c.srv.config.Region,
				Namespace: job.Namespace,
				AuthToken: eval.LeaderACL,
			},
		}
		var resp structs.JobDeregisterResponse
		if err := c.srv.RPC("Job.Deregister", &req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: job deregister failed: %v", err)
			return err
		}
	}

	return nil
}
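// Note the ordering in jobGC above: the evals and allocs belonging to a dead
// job are reaped via Eval.Reap before the job itself is purged with
// Job.Deregister. If a reap fails the job is left in place, so a later GC
// pass can retry rather than leaving a purged job's evaluations orphaned.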
// evalGC is used to garbage collect old evaluations
func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
	// Iterate over the evaluations
	ws := memdb.NewWatchSet()
	iter, err := c.snap.Evals(ws)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so
		// everything will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced eval GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table. This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.EvalGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: eval GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.EvalGCThreshold)
	}

	// Collect the allocations and evaluations to GC
	var gcAlloc, gcEval []string
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		eval := raw.(*structs.Evaluation)

		// The Evaluation GC should not handle batch jobs since those need to
		// be garbage collected in one shot
		gc, allocs, err := c.gcEval(eval, oldThreshold, false)
		if err != nil {
			return err
		}

		if gc {
			gcEval = append(gcEval, eval.ID)
		}
		gcAlloc = append(gcAlloc, allocs...)
	}

	// Fast-path the nothing case
	if len(gcEval) == 0 && len(gcAlloc) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: eval GC: %d evaluations, %d allocs eligible",
		len(gcEval), len(gcAlloc))

	return c.evalReap(gcEval, gcAlloc)
}

// gcEval returns whether the eval should be garbage collected given a raft
// threshold index. An eval is not eligible for garbage collection if it, or
// any of its allocations, is newer than the threshold. If the eval should be
// garbage collected, the IDs of the allocations that should be removed with
// it are returned as well.
func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, allowBatch bool) (
	bool, []string, error) {
	// Ignore non-terminal and new evaluations
	if !eval.TerminalStatus() || eval.ModifyIndex > thresholdIndex {
		return false, nil, nil
	}

	// Create a watchset
	ws := memdb.NewWatchSet()

	// If the eval is from a running "batch" job we don't want to garbage
	// collect its allocations. If there is a long running batch job and its
	// terminal allocations get GC'd the scheduler would re-run the
	// allocations.
	if eval.Type == structs.JobTypeBatch {
		// Check if the job is running
		job, err := c.snap.JobByID(ws, eval.Namespace, eval.JobID)
		if err != nil {
			return false, nil, err
		}

		// Can collect if:
		// - the job doesn't exist
		// - the job is stopped and dead
		// - allowBatch is set and the job is dead
		collect := false
		if job == nil {
			collect = true
		} else if job.Status != structs.JobStatusDead {
			collect = false
		} else if job.Stop {
			collect = true
		} else if allowBatch {
			collect = true
		}

		// We don't want to gc anything related to a job which is not dead.
		// If the batch job doesn't exist we can GC it regardless of allowBatch.
		if !collect {
			return false, nil, nil
		}
	}

	// Get the allocations by eval
	allocs, err := c.snap.AllocsByEval(ws, eval.ID)
	if err != nil {
		c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for eval %s: %v",
			eval.ID, err)
		return false, nil, err
	}

	// Scan the allocations to ensure they are terminal and old
	gcEval := true
	var gcAllocIDs []string
	for _, alloc := range allocs {
		if !alloc.TerminalStatus() || alloc.ModifyIndex > thresholdIndex {
			// Can't GC the evaluation since not all of the allocations are
			// terminal
			gcEval = false
		} else {
			// The allocation is eligible to be GC'd
			gcAllocIDs = append(gcAllocIDs, alloc.ID)
		}
	}

	return gcEval, gcAllocIDs, nil
}

// evalReap contacts the leader and issues a reap on the passed evals and
// allocs.
func (c *CoreScheduler) evalReap(evals, allocs []string) error {
	// Call to the leader to issue the reap
	for _, req := range c.partitionEvalReap(evals, allocs) {
		var resp structs.GenericResponse
		if err := c.srv.RPC("Eval.Reap", req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: eval reap failed: %v", err)
			return err
		}
	}

	return nil
}
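// Eval.Reap is a leader RPC that removes the given evaluations and
// allocations from state; each request maps to a single Raft transaction,
// so the partitioning below keeps every apply within the maxIdsPerReap
// budget.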
// partitionEvalReap returns a list of EvalDeleteRequest to make, ensuring a
// single request does not contain too many allocations and evaluations. This
// is necessary to ensure that the Raft transaction does not become too large.
func (c *CoreScheduler) partitionEvalReap(evals, allocs []string) []*structs.EvalDeleteRequest {
	var requests []*structs.EvalDeleteRequest
	submittedEvals, submittedAllocs := 0, 0
	for submittedEvals != len(evals) || submittedAllocs != len(allocs) {
		req := &structs.EvalDeleteRequest{
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		requests = append(requests, req)
		available := maxIdsPerReap

		// Add the allocs first
		if remaining := len(allocs) - submittedAllocs; remaining > 0 {
			if remaining <= available {
				req.Allocs = allocs[submittedAllocs:]
				available -= remaining
				submittedAllocs += remaining
			} else {
				req.Allocs = allocs[submittedAllocs : submittedAllocs+available]
				submittedAllocs += available

				// Exhausted space so skip adding evals
				continue
			}
		}

		// Add the evals
		if remaining := len(evals) - submittedEvals; remaining > 0 {
			if remaining <= available {
				req.Evals = evals[submittedEvals:]
				submittedEvals += remaining
			} else {
				req.Evals = evals[submittedEvals : submittedEvals+available]
				submittedEvals += available
			}
		}
	}

	return requests
}
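// For example, with maxIdsPerReap = 3, evals = [e1 e2], and
// allocs = [a1 a2 a3 a4], the loop above yields two requests: the first
// carries [a1 a2 a3] (the allocs exhaust the budget, so evals are skipped)
// and the second carries alloc [a4] plus evals [e1 e2].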
// nodeGC is used to garbage collect old nodes
func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error {
	// Iterate over the nodes
	ws := memdb.NewWatchSet()
	iter, err := c.snap.Nodes(ws)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so
		// everything will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced node GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table. This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.NodeGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: node GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.NodeGCThreshold)
	}

	// Collect the nodes to GC
	var gcNode []string
OUTER:
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		node := raw.(*structs.Node)

		// Ignore non-terminal and new nodes
		if !node.TerminalStatus() || node.ModifyIndex > oldThreshold {
			continue
		}

		// Get the allocations by node
		ws := memdb.NewWatchSet()
		allocs, err := c.snap.AllocsByNode(ws, node.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for node %s: %v",
				node.ID, err)
			continue
		}

		// If there are any non-terminal allocations, skip the node. If the
		// node is terminal and the allocations are not, the scheduler may not
		// have run yet to transition the allocs on the node to terminal. We
		// delay GC'ing until this happens.
		for _, alloc := range allocs {
			if !alloc.TerminalStatus() {
				continue OUTER
			}
		}

		// Node is eligible for garbage collection
		gcNode = append(gcNode, node.ID)
	}

	// Fast-path the nothing case
	if len(gcNode) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: node GC: %d nodes eligible", len(gcNode))

	// Call to the leader to issue the reap
	for _, nodeID := range gcNode {
		req := structs.NodeDeregisterRequest{
			NodeID: nodeID,
			WriteRequest: structs.WriteRequest{
				Region:    c.srv.config.Region,
				AuthToken: eval.LeaderACL,
			},
		}
		var resp structs.NodeUpdateResponse
		if err := c.srv.RPC("Node.Deregister", &req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: node '%s' reap failed: %v", nodeID, err)
			return err
		}
	}
	return nil
}

// deploymentGC is used to garbage collect old deployments
func (c *CoreScheduler) deploymentGC(eval *structs.Evaluation) error {
	// Iterate over the deployments
	ws := memdb.NewWatchSet()
	iter, err := c.snap.Deployments(ws)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so
		// everything will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced deployment GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table. This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.DeploymentGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: deployment GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.DeploymentGCThreshold)
	}

	// Collect the deployments to GC
	var gcDeployment []string

OUTER:
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		deploy := raw.(*structs.Deployment)

		// Ignore non-terminal and new deployments
		if deploy.Active() || deploy.ModifyIndex > oldThreshold {
			continue
		}

		// Look up the allocations referencing this deployment.
		allocs, err := c.snap.AllocsByDeployment(ws, deploy.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for deployment %s: %v",
				deploy.ID, err)
			continue
		}

		// Skip the deployment if any allocation referencing it is still
		// non-terminal.
		for _, alloc := range allocs {
			if !alloc.TerminalStatus() {
				continue OUTER
			}
		}

		// Deployment is eligible for garbage collection
		gcDeployment = append(gcDeployment, deploy.ID)
	}

	// Fast-path the nothing case
	if len(gcDeployment) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: deployment GC: %d deployments eligible", len(gcDeployment))
	return c.deploymentReap(gcDeployment)
}
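// Deployment reaping below follows the same leader-RPC-plus-partitioning
// pattern as eval reaping above, with Deployment.Reap standing in for
// Eval.Reap.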
// deploymentReap contacts the leader and issues a reap on the passed
// deployments.
func (c *CoreScheduler) deploymentReap(deployments []string) error {
	// Call to the leader to issue the reap
	for _, req := range c.partitionDeploymentReap(deployments) {
		var resp structs.GenericResponse
		if err := c.srv.RPC("Deployment.Reap", req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: deployment reap failed: %v", err)
			return err
		}
	}

	return nil
}

// partitionDeploymentReap returns a list of DeploymentDeleteRequest to make,
// ensuring a single request does not contain too many deployments. This is
// necessary to ensure that the Raft transaction does not become too large.
func (c *CoreScheduler) partitionDeploymentReap(deployments []string) []*structs.DeploymentDeleteRequest {
	var requests []*structs.DeploymentDeleteRequest
	submittedDeployments := 0
	for submittedDeployments != len(deployments) {
		req := &structs.DeploymentDeleteRequest{
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		requests = append(requests, req)
		available := maxIdsPerReap

		if remaining := len(deployments) - submittedDeployments; remaining > 0 {
			if remaining <= available {
				req.Deployments = deployments[submittedDeployments:]
				submittedDeployments += remaining
			} else {
				req.Deployments = deployments[submittedDeployments : submittedDeployments+available]
				submittedDeployments += available
			}
		}
	}

	return requests
}
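// A minimal sketch of how the partitioning could be exercised in a test,
// assuming maxIdsPerReap is lowered for the test and c is a *CoreScheduler
// wired to a test server (both hypothetical here):
//
//	maxIdsPerReap = 2
//	requests := c.partitionDeploymentReap([]string{"d1", "d2", "d3"})
//	// => two requests: {Deployments: [d1 d2]} and {Deployments: [d3]}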