github.com/maier/nomad@v0.4.1-0.20161110003312-a9e3d0b8549d/nomad/core_sched.go

package nomad

import (
	"fmt"
	"math"
	"time"

	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

var (
	// maxIdsPerReap is the maximum number of evals and allocations to reap in a
	// single Raft transaction. This is to ensure that the Raft message does not
	// become too large.
	maxIdsPerReap = (1024 * 256) / 36 // 0.25 MB of ids.
)

// CoreScheduler is a special "scheduler" that is registered
// as "_core". It is used to run various administrative work
// across the cluster.
type CoreScheduler struct {
	srv  *Server
	snap *state.StateSnapshot
}

// NewCoreScheduler is used to return a new system scheduler instance
func NewCoreScheduler(srv *Server, snap *state.StateSnapshot) scheduler.Scheduler {
	s := &CoreScheduler{
		srv:  srv,
		snap: snap,
	}
	return s
}

// Process is used to implement the scheduler.Scheduler interface
func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
	switch eval.JobID {
	case structs.CoreJobEvalGC:
		return c.evalGC(eval)
	case structs.CoreJobNodeGC:
		return c.nodeGC(eval)
	case structs.CoreJobJobGC:
		return c.jobGC(eval)
	case structs.CoreJobForceGC:
		return c.forceGC(eval)
	default:
		return fmt.Errorf("core scheduler cannot handle job '%s'", eval.JobID)
	}
}

// forceGC is used to garbage collect all eligible objects.
func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
	if err := c.jobGC(eval); err != nil {
		return err
	}
	if err := c.evalGC(eval); err != nil {
		return err
	}

	// Node GC must occur after the others to ensure the allocations are
	// cleared.
	return c.nodeGC(eval)
}

// jobGC is used to garbage collect eligible jobs.
func (c *CoreScheduler) jobGC(eval *structs.Evaluation) error {
	// Get all the jobs eligible for garbage collection.
	iter, err := c.snap.JobsByGC(true)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced job GC")
	} else {
		// Get the time table to calculate GC cutoffs.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.JobGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: job GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.JobGCThreshold)
	}

	// Collect the allocations, evaluations and jobs to GC
	var gcAlloc, gcEval, gcJob []string

OUTER:
	for i := iter.Next(); i != nil; i = iter.Next() {
		job := i.(*structs.Job)

		// Ignore new jobs.
		if job.CreateIndex > oldThreshold {
			continue
		}

		evals, err := c.snap.EvalsByJob(job.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get evals for job %s: %v", job.ID, err)
			continue
		}

		allEvalsGC := true
		var jobAlloc, jobEval []string
		for _, eval := range evals {
			gc, allocs, err := c.gcEval(eval, oldThreshold, true)
			if err != nil {
				continue OUTER
			}

			if gc {
				jobEval = append(jobEval, eval.ID)
				jobAlloc = append(jobAlloc, allocs...)
			} else {
				allEvalsGC = false
				break
			}
		}

		// Job is eligible for garbage collection
		if allEvalsGC {
			gcJob = append(gcJob, job.ID)
			gcAlloc = append(gcAlloc, jobAlloc...)
			gcEval = append(gcEval, jobEval...)
		}
	}

	// Fast-path the nothing case
	if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: job GC: %d jobs, %d evaluations, %d allocs eligible",
		len(gcJob), len(gcEval), len(gcAlloc))

	// Reap the evals and allocs
	if err := c.evalReap(gcEval, gcAlloc); err != nil {
		return err
	}

	// Call to the leader to deregister the jobs.
	for _, job := range gcJob {
		req := structs.JobDeregisterRequest{
			JobID: job,
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		var resp structs.JobDeregisterResponse
		if err := c.srv.RPC("Job.Deregister", &req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: job deregister failed: %v", err)
			return err
		}
	}

	return nil
}
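// Note on the allowBatch flag passed to gcEval: jobGC above passes true
// because a job is only garbage collected once every one of its evals (and
// their allocations) can go with it, while evalGC below passes false so that
// a batch job that is still alive never has its terminal allocations reaped
// out from under it (the scheduler would simply re-run them). Even with
// allowBatch=true, gcEval refuses to collect anything for a batch job that
// is not dead.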
// evalGC is used to garbage collect old evaluations
func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
	// Iterate over the evaluations
	iter, err := c.snap.Evals()
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced eval GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table. This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.EvalGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: eval GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.EvalGCThreshold)
	}

	// Collect the allocations and evaluations to GC
	var gcAlloc, gcEval []string
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		eval := raw.(*structs.Evaluation)

		// The Evaluation GC should not handle batch jobs since those need to be
		// garbage collected in one shot
		gc, allocs, err := c.gcEval(eval, oldThreshold, false)
		if err != nil {
			return err
		}

		if gc {
			gcEval = append(gcEval, eval.ID)
		}
		gcAlloc = append(gcAlloc, allocs...)
	}

	// Fast-path the nothing case
	if len(gcEval) == 0 && len(gcAlloc) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: eval GC: %d evaluations, %d allocs eligible",
		len(gcEval), len(gcAlloc))

	return c.evalReap(gcEval, gcAlloc)
}
// gcEval returns whether the eval should be garbage collected given a raft
// threshold index. The eval is not eligible for garbage collection if it is
// non-terminal or if it or any of its allocations was modified after the
// threshold index. If the eval should be garbage collected, the IDs of the
// allocations that should be removed with it are also returned.
func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, allowBatch bool) (
	bool, []string, error) {
	// Ignore non-terminal and new evaluations
	if !eval.TerminalStatus() || eval.ModifyIndex > thresholdIndex {
		return false, nil, nil
	}

	// If the eval is from a running "batch" job we don't want to garbage
	// collect its allocations. If there is a long running batch job and its
	// terminal allocations get GC'd the scheduler would re-run the
	// allocations.
	if eval.Type == structs.JobTypeBatch {
		if !allowBatch {
			return false, nil, nil
		}

		// Check if the job is running
		job, err := c.snap.JobByID(eval.JobID)
		if err != nil {
			return false, nil, err
		}

		// We don't want to gc anything related to a job which is not dead
		if job != nil && job.Status != structs.JobStatusDead {
			return false, nil, nil
		}
	}

	// Get the allocations by eval
	allocs, err := c.snap.AllocsByEval(eval.ID)
	if err != nil {
		c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for eval %s: %v",
			eval.ID, err)
		return false, nil, err
	}

	// Scan the allocations to ensure they are terminal and old
	gcEval := true
	var gcAllocIDs []string
	for _, alloc := range allocs {
		if !alloc.TerminalStatus() || alloc.ModifyIndex > thresholdIndex {
			// Can't GC the evaluation since not all of the allocations are
			// terminal
			gcEval = false
		} else {
			// The allocation is eligible to be GC'd
			gcAllocIDs = append(gcAllocIDs, alloc.ID)
		}
	}

	return gcEval, gcAllocIDs, nil
}
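// For example, an eval whose allocations are all terminal and older than the
// threshold comes back as (true, <all alloc IDs>, nil); if one allocation is
// still running or was modified after the threshold, the result is
// (false, <only the old terminal alloc IDs>, nil), so evalGC can still reap
// those allocations even though the eval itself is kept.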
// evalReap contacts the leader and issues a reap on the passed evals and
// allocs.
func (c *CoreScheduler) evalReap(evals, allocs []string) error {
	// Call to the leader to issue the reap
	for _, req := range c.partitionReap(evals, allocs) {
		var resp structs.GenericResponse
		if err := c.srv.RPC("Eval.Reap", req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: eval reap failed: %v", err)
			return err
		}
	}

	return nil
}

// partitionReap returns a list of EvalDeleteRequest to make, ensuring a single
// request does not contain too many allocations and evaluations. This is
// necessary to ensure that the Raft transaction does not become too large.
func (c *CoreScheduler) partitionReap(evals, allocs []string) []*structs.EvalDeleteRequest {
	var requests []*structs.EvalDeleteRequest
	submittedEvals, submittedAllocs := 0, 0
	for submittedEvals != len(evals) || submittedAllocs != len(allocs) {
		req := &structs.EvalDeleteRequest{
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		requests = append(requests, req)
		available := maxIdsPerReap

		// Add the allocs first
		if remaining := len(allocs) - submittedAllocs; remaining > 0 {
			if remaining <= available {
				req.Allocs = allocs[submittedAllocs:]
				available -= remaining
				submittedAllocs += remaining
			} else {
				req.Allocs = allocs[submittedAllocs : submittedAllocs+available]
				submittedAllocs += available

				// Exhausted space so skip adding evals
				continue
			}
		}

		// Add the evals
		if remaining := len(evals) - submittedEvals; remaining > 0 {
			if remaining <= available {
				req.Evals = evals[submittedEvals:]
				submittedEvals += remaining
			} else {
				req.Evals = evals[submittedEvals : submittedEvals+available]
				submittedEvals += available
			}
		}
	}

	return requests
}
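// As a rough illustration: maxIdsPerReap is (1024*256)/36 = 7281, the 36
// presumably being the length of a UUID string. Reaping 10,000 alloc IDs and
// 500 eval IDs would therefore yield two requests: the first carries 7,281
// allocs and no evals, the second the remaining 2,719 allocs plus all 500
// evals.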
// nodeGC is used to garbage collect old nodes
func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error {
	// Iterate over the nodes
	iter, err := c.snap.Nodes()
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced node GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table. This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.NodeGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: node GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.NodeGCThreshold)
	}

	// Collect the nodes to GC
	var gcNode []string
OUTER:
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		node := raw.(*structs.Node)

		// Ignore non-terminal and new nodes
		if !node.TerminalStatus() || node.ModifyIndex > oldThreshold {
			continue
		}

		// Get the allocations by node
		allocs, err := c.snap.AllocsByNode(node.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for node %s: %v",
				node.ID, err)
			continue
		}

		// If there are any non-terminal allocations, skip the node. If the node
		// is terminal and the allocations are not, the scheduler may not have
		// run yet to transition the allocs on the node to terminal. We delay
		// GC'ing until this happens.
		for _, alloc := range allocs {
			if !alloc.TerminalStatus() {
				continue OUTER
			}
		}

		// Node is eligible for garbage collection
		gcNode = append(gcNode, node.ID)
	}

	// Fast-path the nothing case
	if len(gcNode) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: node GC: %d nodes eligible", len(gcNode))

	// Call to the leader to issue the reap
	for _, nodeID := range gcNode {
		req := structs.NodeDeregisterRequest{
			NodeID: nodeID,
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		var resp structs.NodeUpdateResponse
		if err := c.srv.RPC("Node.Deregister", &req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: node '%s' reap failed: %v", nodeID, err)
			return err
		}
	}
	return nil
}