github.com/diptanu/nomad@v0.5.7-0.20170516172507-d72e86cbe3d9/nomad/core_sched.go (about) 1 package nomad 2 3 import ( 4 "fmt" 5 "math" 6 "time" 7 8 memdb "github.com/hashicorp/go-memdb" 9 "github.com/hashicorp/nomad/nomad/state" 10 "github.com/hashicorp/nomad/nomad/structs" 11 "github.com/hashicorp/nomad/scheduler" 12 ) 13 14 var ( 15 // maxIdsPerReap is the maximum number of evals and allocations to reap in a 16 // single Raft transaction. This is to ensure that the Raft message does not 17 // become too large. 18 maxIdsPerReap = (1024 * 256) / 36 // 0.25 MB of ids. 19 ) 20 21 // CoreScheduler is a special "scheduler" that is registered 22 // as "_core". It is used to run various administrative work 23 // across the cluster. 24 type CoreScheduler struct { 25 srv *Server 26 snap *state.StateSnapshot 27 } 28 29 // NewCoreScheduler is used to return a new system scheduler instance 30 func NewCoreScheduler(srv *Server, snap *state.StateSnapshot) scheduler.Scheduler { 31 s := &CoreScheduler{ 32 srv: srv, 33 snap: snap, 34 } 35 return s 36 } 37 38 // Process is used to implement the scheduler.Scheduler interface 39 func (c *CoreScheduler) Process(eval *structs.Evaluation) error { 40 switch eval.JobID { 41 case structs.CoreJobEvalGC: 42 return c.evalGC(eval) 43 case structs.CoreJobNodeGC: 44 return c.nodeGC(eval) 45 case structs.CoreJobJobGC: 46 return c.jobGC(eval) 47 case structs.CoreJobForceGC: 48 return c.forceGC(eval) 49 default: 50 return fmt.Errorf("core scheduler cannot handle job '%s'", eval.JobID) 51 } 52 } 53 54 // forceGC is used to garbage collect all eligible objects. 55 func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error { 56 if err := c.jobGC(eval); err != nil { 57 return err 58 } 59 if err := c.evalGC(eval); err != nil { 60 return err 61 } 62 63 // Node GC must occur after the others to ensure the allocations are 64 // cleared. 65 return c.nodeGC(eval) 66 } 67 68 // jobGC is used to garbage collect eligible jobs. 69 func (c *CoreScheduler) jobGC(eval *structs.Evaluation) error { 70 // Get all the jobs eligible for garbage collection. 71 ws := memdb.NewWatchSet() 72 iter, err := c.snap.JobsByGC(ws, true) 73 if err != nil { 74 return err 75 } 76 77 var oldThreshold uint64 78 if eval.JobID == structs.CoreJobForceGC { 79 // The GC was forced, so set the threshold to its maximum so everything 80 // will GC. 81 oldThreshold = math.MaxUint64 82 c.srv.logger.Println("[DEBUG] sched.core: forced job GC") 83 } else { 84 // Get the time table to calculate GC cutoffs. 85 tt := c.srv.fsm.TimeTable() 86 cutoff := time.Now().UTC().Add(-1 * c.srv.config.JobGCThreshold) 87 oldThreshold = tt.NearestIndex(cutoff) 88 c.srv.logger.Printf("[DEBUG] sched.core: job GC: scanning before index %d (%v)", 89 oldThreshold, c.srv.config.JobGCThreshold) 90 } 91 92 // Collect the allocations, evaluations and jobs to GC 93 var gcAlloc, gcEval, gcJob []string 94 95 OUTER: 96 for i := iter.Next(); i != nil; i = iter.Next() { 97 job := i.(*structs.Job) 98 99 // Ignore new jobs. 100 if job.CreateIndex > oldThreshold { 101 continue 102 } 103 104 ws := memdb.NewWatchSet() 105 evals, err := c.snap.EvalsByJob(ws, job.ID) 106 if err != nil { 107 c.srv.logger.Printf("[ERR] sched.core: failed to get evals for job %s: %v", job.ID, err) 108 continue 109 } 110 111 allEvalsGC := true 112 var jobAlloc, jobEval []string 113 for _, eval := range evals { 114 gc, allocs, err := c.gcEval(eval, oldThreshold, true) 115 if err != nil { 116 continue OUTER 117 } 118 119 if gc { 120 jobEval = append(jobEval, eval.ID) 121 jobAlloc = append(jobAlloc, allocs...) 122 } else { 123 allEvalsGC = false 124 break 125 } 126 } 127 128 // Job is eligible for garbage collection 129 if allEvalsGC { 130 gcJob = append(gcJob, job.ID) 131 gcAlloc = append(gcAlloc, jobAlloc...) 132 gcEval = append(gcEval, jobEval...) 133 } 134 } 135 136 // Fast-path the nothing case 137 if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 { 138 return nil 139 } 140 c.srv.logger.Printf("[DEBUG] sched.core: job GC: %d jobs, %d evaluations, %d allocs eligible", 141 len(gcJob), len(gcEval), len(gcAlloc)) 142 143 // Reap the evals and allocs 144 if err := c.evalReap(gcEval, gcAlloc); err != nil { 145 return err 146 } 147 148 // Call to the leader to deregister the jobs. 149 for _, job := range gcJob { 150 req := structs.JobDeregisterRequest{ 151 JobID: job, 152 Purge: true, 153 WriteRequest: structs.WriteRequest{ 154 Region: c.srv.config.Region, 155 }, 156 } 157 var resp structs.JobDeregisterResponse 158 if err := c.srv.RPC("Job.Deregister", &req, &resp); err != nil { 159 c.srv.logger.Printf("[ERR] sched.core: job deregister failed: %v", err) 160 return err 161 } 162 } 163 164 return nil 165 } 166 167 // evalGC is used to garbage collect old evaluations 168 func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error { 169 // Iterate over the evaluations 170 ws := memdb.NewWatchSet() 171 iter, err := c.snap.Evals(ws) 172 if err != nil { 173 return err 174 } 175 176 var oldThreshold uint64 177 if eval.JobID == structs.CoreJobForceGC { 178 // The GC was forced, so set the threshold to its maximum so everything 179 // will GC. 180 oldThreshold = math.MaxUint64 181 c.srv.logger.Println("[DEBUG] sched.core: forced eval GC") 182 } else { 183 // Compute the old threshold limit for GC using the FSM 184 // time table. This is a rough mapping of a time to the 185 // Raft index it belongs to. 186 tt := c.srv.fsm.TimeTable() 187 cutoff := time.Now().UTC().Add(-1 * c.srv.config.EvalGCThreshold) 188 oldThreshold = tt.NearestIndex(cutoff) 189 c.srv.logger.Printf("[DEBUG] sched.core: eval GC: scanning before index %d (%v)", 190 oldThreshold, c.srv.config.EvalGCThreshold) 191 } 192 193 // Collect the allocations and evaluations to GC 194 var gcAlloc, gcEval []string 195 for raw := iter.Next(); raw != nil; raw = iter.Next() { 196 eval := raw.(*structs.Evaluation) 197 198 // The Evaluation GC should not handle batch jobs since those need to be 199 // garbage collected in one shot 200 gc, allocs, err := c.gcEval(eval, oldThreshold, false) 201 if err != nil { 202 return err 203 } 204 205 if gc { 206 gcEval = append(gcEval, eval.ID) 207 } 208 gcAlloc = append(gcAlloc, allocs...) 209 } 210 211 // Fast-path the nothing case 212 if len(gcEval) == 0 && len(gcAlloc) == 0 { 213 return nil 214 } 215 c.srv.logger.Printf("[DEBUG] sched.core: eval GC: %d evaluations, %d allocs eligible", 216 len(gcEval), len(gcAlloc)) 217 218 return c.evalReap(gcEval, gcAlloc) 219 } 220 221 // gcEval returns whether the eval should be garbage collected given a raft 222 // threshold index. The eval disqualifies for garbage collection if it or its 223 // allocs are not older than the threshold. If the eval should be garbage 224 // collected, the associated alloc ids that should also be removed are also 225 // returned 226 func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, allowBatch bool) ( 227 bool, []string, error) { 228 // Ignore non-terminal and new evaluations 229 if !eval.TerminalStatus() || eval.ModifyIndex > thresholdIndex { 230 return false, nil, nil 231 } 232 233 // Create a watchset 234 ws := memdb.NewWatchSet() 235 236 // If the eval is from a running "batch" job we don't want to garbage 237 // collect its allocations. If there is a long running batch job and its 238 // terminal allocations get GC'd the scheduler would re-run the 239 // allocations. 240 if eval.Type == structs.JobTypeBatch { 241 // Check if the job is running 242 job, err := c.snap.JobByID(ws, eval.JobID) 243 if err != nil { 244 return false, nil, err 245 } 246 247 // Can collect if: 248 // Job doesn't exist 249 // Job is Stopped and dead 250 // allowBatch and the job is dead 251 collect := false 252 if job == nil { 253 collect = true 254 } else if job.Status != structs.JobStatusDead { 255 collect = false 256 } else if job.Stop { 257 collect = true 258 } else if allowBatch { 259 collect = true 260 } 261 262 // We don't want to gc anything related to a job which is not dead 263 // If the batch job doesn't exist we can GC it regardless of allowBatch 264 if !collect { 265 return false, nil, nil 266 } 267 } 268 269 // Get the allocations by eval 270 allocs, err := c.snap.AllocsByEval(ws, eval.ID) 271 if err != nil { 272 c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for eval %s: %v", 273 eval.ID, err) 274 return false, nil, err 275 } 276 277 // Scan the allocations to ensure they are terminal and old 278 gcEval := true 279 var gcAllocIDs []string 280 for _, alloc := range allocs { 281 if !alloc.TerminalStatus() || alloc.ModifyIndex > thresholdIndex { 282 // Can't GC the evaluation since not all of the allocations are 283 // terminal 284 gcEval = false 285 } else { 286 // The allocation is eligible to be GC'd 287 gcAllocIDs = append(gcAllocIDs, alloc.ID) 288 } 289 } 290 291 return gcEval, gcAllocIDs, nil 292 } 293 294 // evalReap contacts the leader and issues a reap on the passed evals and 295 // allocs. 296 func (c *CoreScheduler) evalReap(evals, allocs []string) error { 297 // Call to the leader to issue the reap 298 for _, req := range c.partitionReap(evals, allocs) { 299 var resp structs.GenericResponse 300 if err := c.srv.RPC("Eval.Reap", req, &resp); err != nil { 301 c.srv.logger.Printf("[ERR] sched.core: eval reap failed: %v", err) 302 return err 303 } 304 } 305 306 return nil 307 } 308 309 // partitionReap returns a list of EvalDeleteRequest to make, ensuring a single 310 // request does not contain too many allocations and evaluations. This is 311 // necessary to ensure that the Raft transaction does not become too large. 312 func (c *CoreScheduler) partitionReap(evals, allocs []string) []*structs.EvalDeleteRequest { 313 var requests []*structs.EvalDeleteRequest 314 submittedEvals, submittedAllocs := 0, 0 315 for submittedEvals != len(evals) || submittedAllocs != len(allocs) { 316 req := &structs.EvalDeleteRequest{ 317 WriteRequest: structs.WriteRequest{ 318 Region: c.srv.config.Region, 319 }, 320 } 321 requests = append(requests, req) 322 available := maxIdsPerReap 323 324 // Add the allocs first 325 if remaining := len(allocs) - submittedAllocs; remaining > 0 { 326 if remaining <= available { 327 req.Allocs = allocs[submittedAllocs:] 328 available -= remaining 329 submittedAllocs += remaining 330 } else { 331 req.Allocs = allocs[submittedAllocs : submittedAllocs+available] 332 submittedAllocs += available 333 334 // Exhausted space so skip adding evals 335 continue 336 } 337 } 338 339 // Add the evals 340 if remaining := len(evals) - submittedEvals; remaining > 0 { 341 if remaining <= available { 342 req.Evals = evals[submittedEvals:] 343 submittedEvals += remaining 344 } else { 345 req.Evals = evals[submittedEvals : submittedEvals+available] 346 submittedEvals += available 347 } 348 } 349 } 350 351 return requests 352 } 353 354 // nodeGC is used to garbage collect old nodes 355 func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error { 356 // Iterate over the evaluations 357 ws := memdb.NewWatchSet() 358 iter, err := c.snap.Nodes(ws) 359 if err != nil { 360 return err 361 } 362 363 var oldThreshold uint64 364 if eval.JobID == structs.CoreJobForceGC { 365 // The GC was forced, so set the threshold to its maximum so everything 366 // will GC. 367 oldThreshold = math.MaxUint64 368 c.srv.logger.Println("[DEBUG] sched.core: forced node GC") 369 } else { 370 // Compute the old threshold limit for GC using the FSM 371 // time table. This is a rough mapping of a time to the 372 // Raft index it belongs to. 373 tt := c.srv.fsm.TimeTable() 374 cutoff := time.Now().UTC().Add(-1 * c.srv.config.NodeGCThreshold) 375 oldThreshold = tt.NearestIndex(cutoff) 376 c.srv.logger.Printf("[DEBUG] sched.core: node GC: scanning before index %d (%v)", 377 oldThreshold, c.srv.config.NodeGCThreshold) 378 } 379 380 // Collect the nodes to GC 381 var gcNode []string 382 OUTER: 383 for { 384 raw := iter.Next() 385 if raw == nil { 386 break 387 } 388 node := raw.(*structs.Node) 389 390 // Ignore non-terminal and new nodes 391 if !node.TerminalStatus() || node.ModifyIndex > oldThreshold { 392 continue 393 } 394 395 // Get the allocations by node 396 ws := memdb.NewWatchSet() 397 allocs, err := c.snap.AllocsByNode(ws, node.ID) 398 if err != nil { 399 c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for node %s: %v", 400 eval.ID, err) 401 continue 402 } 403 404 // If there are any non-terminal allocations, skip the node. If the node 405 // is terminal and the allocations are not, the scheduler may not have 406 // run yet to transition the allocs on the node to terminal. We delay 407 // GC'ing until this happens. 408 for _, alloc := range allocs { 409 if !alloc.TerminalStatus() { 410 continue OUTER 411 } 412 } 413 414 // Node is eligible for garbage collection 415 gcNode = append(gcNode, node.ID) 416 } 417 418 // Fast-path the nothing case 419 if len(gcNode) == 0 { 420 return nil 421 } 422 c.srv.logger.Printf("[DEBUG] sched.core: node GC: %d nodes eligible", len(gcNode)) 423 424 // Call to the leader to issue the reap 425 for _, nodeID := range gcNode { 426 req := structs.NodeDeregisterRequest{ 427 NodeID: nodeID, 428 WriteRequest: structs.WriteRequest{ 429 Region: c.srv.config.Region, 430 }, 431 } 432 var resp structs.NodeUpdateResponse 433 if err := c.srv.RPC("Node.Deregister", &req, &resp); err != nil { 434 c.srv.logger.Printf("[ERR] sched.core: node '%s' reap failed: %v", nodeID, err) 435 return err 436 } 437 } 438 return nil 439 }