github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/nomad/core_sched.go (about) 1 package nomad 2 3 import ( 4 "fmt" 5 "math" 6 "time" 7 8 memdb "github.com/hashicorp/go-memdb" 9 "github.com/hashicorp/nomad/nomad/state" 10 "github.com/hashicorp/nomad/nomad/structs" 11 "github.com/hashicorp/nomad/scheduler" 12 ) 13 14 var ( 15 // maxIdsPerReap is the maximum number of evals and allocations to reap in a 16 // single Raft transaction. This is to ensure that the Raft message does not 17 // become too large. 18 maxIdsPerReap = (1024 * 256) / 36 // 0.25 MB of ids. 19 ) 20 21 // CoreScheduler is a special "scheduler" that is registered 22 // as "_core". It is used to run various administrative work 23 // across the cluster. 24 type CoreScheduler struct { 25 srv *Server 26 snap *state.StateSnapshot 27 } 28 29 // NewCoreScheduler is used to return a new system scheduler instance 30 func NewCoreScheduler(srv *Server, snap *state.StateSnapshot) scheduler.Scheduler { 31 s := &CoreScheduler{ 32 srv: srv, 33 snap: snap, 34 } 35 return s 36 } 37 38 // Process is used to implement the scheduler.Scheduler interface 39 func (c *CoreScheduler) Process(eval *structs.Evaluation) error { 40 switch eval.JobID { 41 case structs.CoreJobEvalGC: 42 return c.evalGC(eval) 43 case structs.CoreJobNodeGC: 44 return c.nodeGC(eval) 45 case structs.CoreJobJobGC: 46 return c.jobGC(eval) 47 case structs.CoreJobDeploymentGC: 48 return c.deploymentGC(eval) 49 case structs.CoreJobForceGC: 50 return c.forceGC(eval) 51 default: 52 return fmt.Errorf("core scheduler cannot handle job '%s'", eval.JobID) 53 } 54 } 55 56 // forceGC is used to garbage collect all eligible objects. 57 func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error { 58 if err := c.jobGC(eval); err != nil { 59 return err 60 } 61 if err := c.evalGC(eval); err != nil { 62 return err 63 } 64 if err := c.deploymentGC(eval); err != nil { 65 return err 66 } 67 68 // Node GC must occur after the others to ensure the allocations are 69 // cleared. 70 return c.nodeGC(eval) 71 } 72 73 // jobGC is used to garbage collect eligible jobs. 74 func (c *CoreScheduler) jobGC(eval *structs.Evaluation) error { 75 // Get all the jobs eligible for garbage collection. 76 ws := memdb.NewWatchSet() 77 iter, err := c.snap.JobsByGC(ws, true) 78 if err != nil { 79 return err 80 } 81 82 var oldThreshold uint64 83 if eval.JobID == structs.CoreJobForceGC { 84 // The GC was forced, so set the threshold to its maximum so everything 85 // will GC. 86 oldThreshold = math.MaxUint64 87 c.srv.logger.Println("[DEBUG] sched.core: forced job GC") 88 } else { 89 // Get the time table to calculate GC cutoffs. 90 tt := c.srv.fsm.TimeTable() 91 cutoff := time.Now().UTC().Add(-1 * c.srv.config.JobGCThreshold) 92 oldThreshold = tt.NearestIndex(cutoff) 93 c.srv.logger.Printf("[DEBUG] sched.core: job GC: scanning before index %d (%v)", 94 oldThreshold, c.srv.config.JobGCThreshold) 95 } 96 97 // Collect the allocations, evaluations and jobs to GC 98 var gcAlloc, gcEval, gcJob []string 99 100 OUTER: 101 for i := iter.Next(); i != nil; i = iter.Next() { 102 job := i.(*structs.Job) 103 104 // Ignore new jobs. 105 if job.CreateIndex > oldThreshold { 106 continue 107 } 108 109 ws := memdb.NewWatchSet() 110 evals, err := c.snap.EvalsByJob(ws, job.ID) 111 if err != nil { 112 c.srv.logger.Printf("[ERR] sched.core: failed to get evals for job %s: %v", job.ID, err) 113 continue 114 } 115 116 allEvalsGC := true 117 var jobAlloc, jobEval []string 118 for _, eval := range evals { 119 gc, allocs, err := c.gcEval(eval, oldThreshold, true) 120 if err != nil { 121 continue OUTER 122 } 123 124 if gc { 125 jobEval = append(jobEval, eval.ID) 126 jobAlloc = append(jobAlloc, allocs...) 127 } else { 128 allEvalsGC = false 129 break 130 } 131 } 132 133 // Job is eligible for garbage collection 134 if allEvalsGC { 135 gcJob = append(gcJob, job.ID) 136 gcAlloc = append(gcAlloc, jobAlloc...) 137 gcEval = append(gcEval, jobEval...) 138 } 139 } 140 141 // Fast-path the nothing case 142 if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 { 143 return nil 144 } 145 c.srv.logger.Printf("[DEBUG] sched.core: job GC: %d jobs, %d evaluations, %d allocs eligible", 146 len(gcJob), len(gcEval), len(gcAlloc)) 147 148 // Reap the evals and allocs 149 if err := c.evalReap(gcEval, gcAlloc); err != nil { 150 return err 151 } 152 153 // Call to the leader to deregister the jobs. 154 for _, job := range gcJob { 155 req := structs.JobDeregisterRequest{ 156 JobID: job, 157 Purge: true, 158 WriteRequest: structs.WriteRequest{ 159 Region: c.srv.config.Region, 160 }, 161 } 162 var resp structs.JobDeregisterResponse 163 if err := c.srv.RPC("Job.Deregister", &req, &resp); err != nil { 164 c.srv.logger.Printf("[ERR] sched.core: job deregister failed: %v", err) 165 return err 166 } 167 } 168 169 return nil 170 } 171 172 // evalGC is used to garbage collect old evaluations 173 func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error { 174 // Iterate over the evaluations 175 ws := memdb.NewWatchSet() 176 iter, err := c.snap.Evals(ws) 177 if err != nil { 178 return err 179 } 180 181 var oldThreshold uint64 182 if eval.JobID == structs.CoreJobForceGC { 183 // The GC was forced, so set the threshold to its maximum so everything 184 // will GC. 185 oldThreshold = math.MaxUint64 186 c.srv.logger.Println("[DEBUG] sched.core: forced eval GC") 187 } else { 188 // Compute the old threshold limit for GC using the FSM 189 // time table. This is a rough mapping of a time to the 190 // Raft index it belongs to. 191 tt := c.srv.fsm.TimeTable() 192 cutoff := time.Now().UTC().Add(-1 * c.srv.config.EvalGCThreshold) 193 oldThreshold = tt.NearestIndex(cutoff) 194 c.srv.logger.Printf("[DEBUG] sched.core: eval GC: scanning before index %d (%v)", 195 oldThreshold, c.srv.config.EvalGCThreshold) 196 } 197 198 // Collect the allocations and evaluations to GC 199 var gcAlloc, gcEval []string 200 for raw := iter.Next(); raw != nil; raw = iter.Next() { 201 eval := raw.(*structs.Evaluation) 202 203 // The Evaluation GC should not handle batch jobs since those need to be 204 // garbage collected in one shot 205 gc, allocs, err := c.gcEval(eval, oldThreshold, false) 206 if err != nil { 207 return err 208 } 209 210 if gc { 211 gcEval = append(gcEval, eval.ID) 212 } 213 gcAlloc = append(gcAlloc, allocs...) 214 } 215 216 // Fast-path the nothing case 217 if len(gcEval) == 0 && len(gcAlloc) == 0 { 218 return nil 219 } 220 c.srv.logger.Printf("[DEBUG] sched.core: eval GC: %d evaluations, %d allocs eligible", 221 len(gcEval), len(gcAlloc)) 222 223 return c.evalReap(gcEval, gcAlloc) 224 } 225 226 // gcEval returns whether the eval should be garbage collected given a raft 227 // threshold index. The eval disqualifies for garbage collection if it or its 228 // allocs are not older than the threshold. If the eval should be garbage 229 // collected, the associated alloc ids that should also be removed are also 230 // returned 231 func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, allowBatch bool) ( 232 bool, []string, error) { 233 // Ignore non-terminal and new evaluations 234 if !eval.TerminalStatus() || eval.ModifyIndex > thresholdIndex { 235 return false, nil, nil 236 } 237 238 // Create a watchset 239 ws := memdb.NewWatchSet() 240 241 // If the eval is from a running "batch" job we don't want to garbage 242 // collect its allocations. If there is a long running batch job and its 243 // terminal allocations get GC'd the scheduler would re-run the 244 // allocations. 245 if eval.Type == structs.JobTypeBatch { 246 // Check if the job is running 247 job, err := c.snap.JobByID(ws, eval.JobID) 248 if err != nil { 249 return false, nil, err 250 } 251 252 // Can collect if: 253 // Job doesn't exist 254 // Job is Stopped and dead 255 // allowBatch and the job is dead 256 collect := false 257 if job == nil { 258 collect = true 259 } else if job.Status != structs.JobStatusDead { 260 collect = false 261 } else if job.Stop { 262 collect = true 263 } else if allowBatch { 264 collect = true 265 } 266 267 // We don't want to gc anything related to a job which is not dead 268 // If the batch job doesn't exist we can GC it regardless of allowBatch 269 if !collect { 270 return false, nil, nil 271 } 272 } 273 274 // Get the allocations by eval 275 allocs, err := c.snap.AllocsByEval(ws, eval.ID) 276 if err != nil { 277 c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for eval %s: %v", 278 eval.ID, err) 279 return false, nil, err 280 } 281 282 // Scan the allocations to ensure they are terminal and old 283 gcEval := true 284 var gcAllocIDs []string 285 for _, alloc := range allocs { 286 if !alloc.TerminalStatus() || alloc.ModifyIndex > thresholdIndex { 287 // Can't GC the evaluation since not all of the allocations are 288 // terminal 289 gcEval = false 290 } else { 291 // The allocation is eligible to be GC'd 292 gcAllocIDs = append(gcAllocIDs, alloc.ID) 293 } 294 } 295 296 return gcEval, gcAllocIDs, nil 297 } 298 299 // evalReap contacts the leader and issues a reap on the passed evals and 300 // allocs. 301 func (c *CoreScheduler) evalReap(evals, allocs []string) error { 302 // Call to the leader to issue the reap 303 for _, req := range c.partitionEvalReap(evals, allocs) { 304 var resp structs.GenericResponse 305 if err := c.srv.RPC("Eval.Reap", req, &resp); err != nil { 306 c.srv.logger.Printf("[ERR] sched.core: eval reap failed: %v", err) 307 return err 308 } 309 } 310 311 return nil 312 } 313 314 // partitionEvalReap returns a list of EvalDeleteRequest to make, ensuring a single 315 // request does not contain too many allocations and evaluations. This is 316 // necessary to ensure that the Raft transaction does not become too large. 317 func (c *CoreScheduler) partitionEvalReap(evals, allocs []string) []*structs.EvalDeleteRequest { 318 var requests []*structs.EvalDeleteRequest 319 submittedEvals, submittedAllocs := 0, 0 320 for submittedEvals != len(evals) || submittedAllocs != len(allocs) { 321 req := &structs.EvalDeleteRequest{ 322 WriteRequest: structs.WriteRequest{ 323 Region: c.srv.config.Region, 324 }, 325 } 326 requests = append(requests, req) 327 available := maxIdsPerReap 328 329 // Add the allocs first 330 if remaining := len(allocs) - submittedAllocs; remaining > 0 { 331 if remaining <= available { 332 req.Allocs = allocs[submittedAllocs:] 333 available -= remaining 334 submittedAllocs += remaining 335 } else { 336 req.Allocs = allocs[submittedAllocs : submittedAllocs+available] 337 submittedAllocs += available 338 339 // Exhausted space so skip adding evals 340 continue 341 } 342 } 343 344 // Add the evals 345 if remaining := len(evals) - submittedEvals; remaining > 0 { 346 if remaining <= available { 347 req.Evals = evals[submittedEvals:] 348 submittedEvals += remaining 349 } else { 350 req.Evals = evals[submittedEvals : submittedEvals+available] 351 submittedEvals += available 352 } 353 } 354 } 355 356 return requests 357 } 358 359 // nodeGC is used to garbage collect old nodes 360 func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error { 361 // Iterate over the evaluations 362 ws := memdb.NewWatchSet() 363 iter, err := c.snap.Nodes(ws) 364 if err != nil { 365 return err 366 } 367 368 var oldThreshold uint64 369 if eval.JobID == structs.CoreJobForceGC { 370 // The GC was forced, so set the threshold to its maximum so everything 371 // will GC. 372 oldThreshold = math.MaxUint64 373 c.srv.logger.Println("[DEBUG] sched.core: forced node GC") 374 } else { 375 // Compute the old threshold limit for GC using the FSM 376 // time table. This is a rough mapping of a time to the 377 // Raft index it belongs to. 378 tt := c.srv.fsm.TimeTable() 379 cutoff := time.Now().UTC().Add(-1 * c.srv.config.NodeGCThreshold) 380 oldThreshold = tt.NearestIndex(cutoff) 381 c.srv.logger.Printf("[DEBUG] sched.core: node GC: scanning before index %d (%v)", 382 oldThreshold, c.srv.config.NodeGCThreshold) 383 } 384 385 // Collect the nodes to GC 386 var gcNode []string 387 OUTER: 388 for { 389 raw := iter.Next() 390 if raw == nil { 391 break 392 } 393 node := raw.(*structs.Node) 394 395 // Ignore non-terminal and new nodes 396 if !node.TerminalStatus() || node.ModifyIndex > oldThreshold { 397 continue 398 } 399 400 // Get the allocations by node 401 ws := memdb.NewWatchSet() 402 allocs, err := c.snap.AllocsByNode(ws, node.ID) 403 if err != nil { 404 c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for node %s: %v", 405 eval.ID, err) 406 continue 407 } 408 409 // If there are any non-terminal allocations, skip the node. If the node 410 // is terminal and the allocations are not, the scheduler may not have 411 // run yet to transition the allocs on the node to terminal. We delay 412 // GC'ing until this happens. 413 for _, alloc := range allocs { 414 if !alloc.TerminalStatus() { 415 continue OUTER 416 } 417 } 418 419 // Node is eligible for garbage collection 420 gcNode = append(gcNode, node.ID) 421 } 422 423 // Fast-path the nothing case 424 if len(gcNode) == 0 { 425 return nil 426 } 427 c.srv.logger.Printf("[DEBUG] sched.core: node GC: %d nodes eligible", len(gcNode)) 428 429 // Call to the leader to issue the reap 430 for _, nodeID := range gcNode { 431 req := structs.NodeDeregisterRequest{ 432 NodeID: nodeID, 433 WriteRequest: structs.WriteRequest{ 434 Region: c.srv.config.Region, 435 }, 436 } 437 var resp structs.NodeUpdateResponse 438 if err := c.srv.RPC("Node.Deregister", &req, &resp); err != nil { 439 c.srv.logger.Printf("[ERR] sched.core: node '%s' reap failed: %v", nodeID, err) 440 return err 441 } 442 } 443 return nil 444 } 445 446 // deploymentGC is used to garbage collect old deployments 447 func (c *CoreScheduler) deploymentGC(eval *structs.Evaluation) error { 448 // Iterate over the deployments 449 ws := memdb.NewWatchSet() 450 iter, err := c.snap.Deployments(ws) 451 if err != nil { 452 return err 453 } 454 455 var oldThreshold uint64 456 if eval.JobID == structs.CoreJobForceGC { 457 // The GC was forced, so set the threshold to its maximum so everything 458 // will GC. 459 oldThreshold = math.MaxUint64 460 c.srv.logger.Println("[DEBUG] sched.core: forced deployment GC") 461 } else { 462 // Compute the old threshold limit for GC using the FSM 463 // time table. This is a rough mapping of a time to the 464 // Raft index it belongs to. 465 tt := c.srv.fsm.TimeTable() 466 cutoff := time.Now().UTC().Add(-1 * c.srv.config.DeploymentGCThreshold) 467 oldThreshold = tt.NearestIndex(cutoff) 468 c.srv.logger.Printf("[DEBUG] sched.core: deployment GC: scanning before index %d (%v)", 469 oldThreshold, c.srv.config.DeploymentGCThreshold) 470 } 471 472 // Collect the deployments to GC 473 var gcDeployment []string 474 475 OUTER: 476 for { 477 raw := iter.Next() 478 if raw == nil { 479 break 480 } 481 deploy := raw.(*structs.Deployment) 482 483 // Ignore non-terminal and new deployments 484 if deploy.Active() || deploy.ModifyIndex > oldThreshold { 485 continue 486 } 487 488 // Ensure there are no allocs referencing this deployment. 489 allocs, err := c.snap.AllocsByDeployment(ws, deploy.ID) 490 if err != nil { 491 c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for deployment %s: %v", 492 deploy.ID, err) 493 continue 494 } 495 496 // Ensure there is no allocation referencing the deployment. 497 for _, alloc := range allocs { 498 if !alloc.TerminalStatus() { 499 continue OUTER 500 } 501 } 502 503 // Deployment is eligible for garbage collection 504 gcDeployment = append(gcDeployment, deploy.ID) 505 } 506 507 // Fast-path the nothing case 508 if len(gcDeployment) == 0 { 509 return nil 510 } 511 c.srv.logger.Printf("[DEBUG] sched.core: deployment GC: %d deployments eligible", len(gcDeployment)) 512 return c.deploymentReap(gcDeployment) 513 } 514 515 // deploymentReap contacts the leader and issues a reap on the passed 516 // deployments. 517 func (c *CoreScheduler) deploymentReap(deployments []string) error { 518 // Call to the leader to issue the reap 519 for _, req := range c.partitionDeploymentReap(deployments) { 520 var resp structs.GenericResponse 521 if err := c.srv.RPC("Deployment.Reap", req, &resp); err != nil { 522 c.srv.logger.Printf("[ERR] sched.core: deployment reap failed: %v", err) 523 return err 524 } 525 } 526 527 return nil 528 } 529 530 // partitionDeploymentReap returns a list of DeploymentDeleteRequest to make, 531 // ensuring a single request does not contain too many deployments. This is 532 // necessary to ensure that the Raft transaction does not become too large. 533 func (c *CoreScheduler) partitionDeploymentReap(deployments []string) []*structs.DeploymentDeleteRequest { 534 var requests []*structs.DeploymentDeleteRequest 535 submittedDeployments := 0 536 for submittedDeployments != len(deployments) { 537 req := &structs.DeploymentDeleteRequest{ 538 WriteRequest: structs.WriteRequest{ 539 Region: c.srv.config.Region, 540 }, 541 } 542 requests = append(requests, req) 543 available := maxIdsPerReap 544 545 if remaining := len(deployments) - submittedDeployments; remaining > 0 { 546 if remaining <= available { 547 req.Deployments = deployments[submittedDeployments:] 548 submittedDeployments += remaining 549 } else { 550 req.Deployments = deployments[submittedDeployments : submittedDeployments+available] 551 submittedDeployments += available 552 } 553 } 554 } 555 556 return requests 557 }