github.com/dkerwin/nomad@v0.3.3-0.20160525181927-74554135514b/nomad/core_sched.go (about) 1 package nomad 2 3 import ( 4 "fmt" 5 "math" 6 "time" 7 8 "github.com/hashicorp/nomad/nomad/state" 9 "github.com/hashicorp/nomad/nomad/structs" 10 "github.com/hashicorp/nomad/scheduler" 11 ) 12 13 var ( 14 // maxIdsPerReap is the maximum number of evals and allocations to reap in a 15 // single Raft transaction. This is to ensure that the Raft message does not 16 // become too large. 17 maxIdsPerReap = (1024 * 256) / 36 // 0.25 MB of ids. 18 ) 19 20 // CoreScheduler is a special "scheduler" that is registered 21 // as "_core". It is used to run various administrative work 22 // across the cluster. 23 type CoreScheduler struct { 24 srv *Server 25 snap *state.StateSnapshot 26 } 27 28 // NewCoreScheduler is used to return a new system scheduler instance 29 func NewCoreScheduler(srv *Server, snap *state.StateSnapshot) scheduler.Scheduler { 30 s := &CoreScheduler{ 31 srv: srv, 32 snap: snap, 33 } 34 return s 35 } 36 37 // Process is used to implement the scheduler.Scheduler interface 38 func (c *CoreScheduler) Process(eval *structs.Evaluation) error { 39 switch eval.JobID { 40 case structs.CoreJobEvalGC: 41 return c.evalGC(eval) 42 case structs.CoreJobNodeGC: 43 return c.nodeGC(eval) 44 case structs.CoreJobJobGC: 45 return c.jobGC(eval) 46 case structs.CoreJobForceGC: 47 return c.forceGC(eval) 48 default: 49 return fmt.Errorf("core scheduler cannot handle job '%s'", eval.JobID) 50 } 51 } 52 53 // forceGC is used to garbage collect all eligible objects. 54 func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error { 55 if err := c.jobGC(eval); err != nil { 56 return err 57 } 58 if err := c.evalGC(eval); err != nil { 59 return err 60 } 61 62 // Node GC must occur after the others to ensure the allocations are 63 // cleared. 64 return c.nodeGC(eval) 65 } 66 67 // jobGC is used to garbage collect eligible jobs. 68 func (c *CoreScheduler) jobGC(eval *structs.Evaluation) error { 69 // Get all the jobs eligible for garbage collection. 70 iter, err := c.snap.JobsByGC(true) 71 if err != nil { 72 return err 73 } 74 75 var oldThreshold uint64 76 if eval.JobID == structs.CoreJobForceGC { 77 // The GC was forced, so set the threshold to its maximum so everything 78 // will GC. 79 oldThreshold = math.MaxUint64 80 c.srv.logger.Println("[DEBUG] sched.core: forced job GC") 81 } else { 82 // Get the time table to calculate GC cutoffs. 83 tt := c.srv.fsm.TimeTable() 84 cutoff := time.Now().UTC().Add(-1 * c.srv.config.JobGCThreshold) 85 oldThreshold = tt.NearestIndex(cutoff) 86 c.srv.logger.Printf("[DEBUG] sched.core: job GC: scanning before index %d (%v)", 87 oldThreshold, c.srv.config.JobGCThreshold) 88 } 89 90 // Collect the allocations, evaluations and jobs to GC 91 var gcAlloc, gcEval, gcJob []string 92 93 OUTER: 94 for i := iter.Next(); i != nil; i = iter.Next() { 95 job := i.(*structs.Job) 96 97 // Ignore new jobs. 98 if job.CreateIndex > oldThreshold { 99 continue 100 } 101 102 evals, err := c.snap.EvalsByJob(job.ID) 103 if err != nil { 104 c.srv.logger.Printf("[ERR] sched.core: failed to get evals for job %s: %v", job.ID, err) 105 continue 106 } 107 108 for _, eval := range evals { 109 gc, allocs, err := c.gcEval(eval, oldThreshold) 110 if err != nil || !gc { 111 // We skip the job because it is not finished if it has 112 // non-terminal allocations. 113 continue OUTER 114 } 115 116 gcEval = append(gcEval, eval.ID) 117 gcAlloc = append(gcAlloc, allocs...) 118 } 119 120 // Job is eligible for garbage collection 121 gcJob = append(gcJob, job.ID) 122 } 123 124 // Fast-path the nothing case 125 if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 { 126 return nil 127 } 128 c.srv.logger.Printf("[DEBUG] sched.core: job GC: %d jobs, %d evaluations, %d allocs eligible", 129 len(gcJob), len(gcEval), len(gcAlloc)) 130 131 // Reap the evals and allocs 132 if err := c.evalReap(gcEval, gcAlloc); err != nil { 133 return err 134 } 135 136 // Call to the leader to deregister the jobs. 137 for _, job := range gcJob { 138 req := structs.JobDeregisterRequest{ 139 JobID: job, 140 WriteRequest: structs.WriteRequest{ 141 Region: c.srv.config.Region, 142 }, 143 } 144 var resp structs.JobDeregisterResponse 145 if err := c.srv.RPC("Job.Deregister", &req, &resp); err != nil { 146 c.srv.logger.Printf("[ERR] sched.core: job deregister failed: %v", err) 147 return err 148 } 149 } 150 151 return nil 152 } 153 154 // evalGC is used to garbage collect old evaluations 155 func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error { 156 // Iterate over the evaluations 157 iter, err := c.snap.Evals() 158 if err != nil { 159 return err 160 } 161 162 var oldThreshold uint64 163 if eval.JobID == structs.CoreJobForceGC { 164 // The GC was forced, so set the threshold to its maximum so everything 165 // will GC. 166 oldThreshold = math.MaxUint64 167 c.srv.logger.Println("[DEBUG] sched.core: forced eval GC") 168 } else { 169 // Compute the old threshold limit for GC using the FSM 170 // time table. This is a rough mapping of a time to the 171 // Raft index it belongs to. 172 tt := c.srv.fsm.TimeTable() 173 cutoff := time.Now().UTC().Add(-1 * c.srv.config.EvalGCThreshold) 174 oldThreshold = tt.NearestIndex(cutoff) 175 c.srv.logger.Printf("[DEBUG] sched.core: eval GC: scanning before index %d (%v)", 176 oldThreshold, c.srv.config.EvalGCThreshold) 177 } 178 179 // Collect the allocations and evaluations to GC 180 var gcAlloc, gcEval []string 181 for raw := iter.Next(); raw != nil; raw = iter.Next() { 182 eval := raw.(*structs.Evaluation) 183 184 gc, allocs, err := c.gcEval(eval, oldThreshold) 185 if err != nil { 186 return err 187 } 188 189 // If the eval is from a running "batch" job we don't want to garbage 190 // collect its allocations. If there is a long running batch job and its 191 // terminal allocations get GC'd the scheduler would re-run the 192 // allocations. 193 if eval.Type == structs.JobTypeBatch { 194 // Check if the job is running 195 job, err := c.snap.JobByID(eval.JobID) 196 if err != nil { 197 return err 198 } 199 200 // If the job has been deregistered, we want to garbage collect the 201 // allocations and evaluations. 202 if job != nil && len(allocs) != 0 { 203 continue 204 } 205 } 206 207 if gc { 208 gcEval = append(gcEval, eval.ID) 209 gcAlloc = append(gcAlloc, allocs...) 210 } 211 } 212 213 // Fast-path the nothing case 214 if len(gcEval) == 0 && len(gcAlloc) == 0 { 215 return nil 216 } 217 c.srv.logger.Printf("[DEBUG] sched.core: eval GC: %d evaluations, %d allocs eligible", 218 len(gcEval), len(gcAlloc)) 219 220 return c.evalReap(gcEval, gcAlloc) 221 } 222 223 // gcEval returns whether the eval should be garbage collected given a raft 224 // threshold index. The eval disqualifies for garbage collection if it or its 225 // allocs are not older than the threshold. If the eval should be garbage 226 // collected, the associated alloc ids that should also be removed are also 227 // returned 228 func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64) ( 229 bool, []string, error) { 230 // Ignore non-terminal and new evaluations 231 if !eval.TerminalStatus() || eval.ModifyIndex > thresholdIndex { 232 return false, nil, nil 233 } 234 235 // Get the allocations by eval 236 allocs, err := c.snap.AllocsByEval(eval.ID) 237 if err != nil { 238 c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for eval %s: %v", 239 eval.ID, err) 240 return false, nil, err 241 } 242 243 // Scan the allocations to ensure they are terminal and old 244 for _, alloc := range allocs { 245 if !alloc.TerminalStatus() || alloc.ModifyIndex > thresholdIndex { 246 return false, nil, nil 247 } 248 } 249 250 allocIds := make([]string, len(allocs)) 251 for i, alloc := range allocs { 252 allocIds[i] = alloc.ID 253 } 254 255 // Evaluation is eligible for garbage collection 256 return true, allocIds, nil 257 } 258 259 // evalReap contacts the leader and issues a reap on the passed evals and 260 // allocs. 261 func (c *CoreScheduler) evalReap(evals, allocs []string) error { 262 // Call to the leader to issue the reap 263 for _, req := range c.partitionReap(evals, allocs) { 264 var resp structs.GenericResponse 265 if err := c.srv.RPC("Eval.Reap", req, &resp); err != nil { 266 c.srv.logger.Printf("[ERR] sched.core: eval reap failed: %v", err) 267 return err 268 } 269 } 270 271 return nil 272 } 273 274 // partitionReap returns a list of EvalDeleteRequest to make, ensuring a single 275 // request does not contain too many allocations and evaluations. This is 276 // necessary to ensure that the Raft transaction does not become too large. 277 func (c *CoreScheduler) partitionReap(evals, allocs []string) []*structs.EvalDeleteRequest { 278 var requests []*structs.EvalDeleteRequest 279 submittedEvals, submittedAllocs := 0, 0 280 for submittedEvals != len(evals) || submittedAllocs != len(allocs) { 281 req := &structs.EvalDeleteRequest{ 282 WriteRequest: structs.WriteRequest{ 283 Region: c.srv.config.Region, 284 }, 285 } 286 requests = append(requests, req) 287 available := maxIdsPerReap 288 289 // Add the allocs first 290 if remaining := len(allocs) - submittedAllocs; remaining > 0 { 291 if remaining <= available { 292 req.Allocs = allocs[submittedAllocs:] 293 available -= remaining 294 submittedAllocs += remaining 295 } else { 296 req.Allocs = allocs[submittedAllocs : submittedAllocs+available] 297 submittedAllocs += available 298 299 // Exhausted space so skip adding evals 300 continue 301 } 302 } 303 304 // Add the evals 305 if remaining := len(evals) - submittedEvals; remaining > 0 { 306 if remaining <= available { 307 req.Evals = evals[submittedEvals:] 308 submittedEvals += remaining 309 } else { 310 req.Evals = evals[submittedEvals : submittedEvals+available] 311 submittedEvals += available 312 } 313 } 314 } 315 316 return requests 317 } 318 319 // nodeGC is used to garbage collect old nodes 320 func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error { 321 // Iterate over the evaluations 322 iter, err := c.snap.Nodes() 323 if err != nil { 324 return err 325 } 326 327 var oldThreshold uint64 328 if eval.JobID == structs.CoreJobForceGC { 329 // The GC was forced, so set the threshold to its maximum so everything 330 // will GC. 331 oldThreshold = math.MaxUint64 332 c.srv.logger.Println("[DEBUG] sched.core: forced node GC") 333 } else { 334 // Compute the old threshold limit for GC using the FSM 335 // time table. This is a rough mapping of a time to the 336 // Raft index it belongs to. 337 tt := c.srv.fsm.TimeTable() 338 cutoff := time.Now().UTC().Add(-1 * c.srv.config.NodeGCThreshold) 339 oldThreshold = tt.NearestIndex(cutoff) 340 c.srv.logger.Printf("[DEBUG] sched.core: node GC: scanning before index %d (%v)", 341 oldThreshold, c.srv.config.NodeGCThreshold) 342 } 343 344 // Collect the nodes to GC 345 var gcNode []string 346 for { 347 raw := iter.Next() 348 if raw == nil { 349 break 350 } 351 node := raw.(*structs.Node) 352 353 // Ignore non-terminal and new nodes 354 if !node.TerminalStatus() || node.ModifyIndex > oldThreshold { 355 continue 356 } 357 358 // Get the allocations by node 359 allocs, err := c.snap.AllocsByNode(node.ID) 360 if err != nil { 361 c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for node %s: %v", 362 eval.ID, err) 363 continue 364 } 365 366 // If there are any allocations, skip the node 367 if len(allocs) > 0 { 368 continue 369 } 370 371 // Node is eligible for garbage collection 372 gcNode = append(gcNode, node.ID) 373 } 374 375 // Fast-path the nothing case 376 if len(gcNode) == 0 { 377 return nil 378 } 379 c.srv.logger.Printf("[DEBUG] sched.core: node GC: %d nodes eligible", len(gcNode)) 380 381 // Call to the leader to issue the reap 382 for _, nodeID := range gcNode { 383 req := structs.NodeDeregisterRequest{ 384 NodeID: nodeID, 385 WriteRequest: structs.WriteRequest{ 386 Region: c.srv.config.Region, 387 }, 388 } 389 var resp structs.NodeUpdateResponse 390 if err := c.srv.RPC("Node.Deregister", &req, &resp); err != nil { 391 c.srv.logger.Printf("[ERR] sched.core: node '%s' reap failed: %v", nodeID, err) 392 return err 393 } 394 } 395 return nil 396 }