github.com/taylorchu/nomad@v0.5.3-rc1.0.20170407200202-db11e7dd7b55/nomad/core_sched.go

package nomad

import (
	"fmt"
	"math"
	"time"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

var (
	// maxIdsPerReap is the maximum number of evals and allocations to reap in a
	// single Raft transaction. This is to ensure that the Raft message does not
	// become too large.
	maxIdsPerReap = (1024 * 256) / 36 // 0.25 MB of ids.
)

// CoreScheduler is a special "scheduler" that is registered
// as "_core". It is used to run various administrative work
// across the cluster.
type CoreScheduler struct {
	srv  *Server
	snap *state.StateSnapshot
}

// NewCoreScheduler is used to return a new core scheduler instance
func NewCoreScheduler(srv *Server, snap *state.StateSnapshot) scheduler.Scheduler {
	s := &CoreScheduler{
		srv:  srv,
		snap: snap,
	}
	return s
}

// Process is used to implement the scheduler.Scheduler interface
func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
	switch eval.JobID {
	case structs.CoreJobEvalGC:
		return c.evalGC(eval)
	case structs.CoreJobNodeGC:
		return c.nodeGC(eval)
	case structs.CoreJobJobGC:
		return c.jobGC(eval)
	case structs.CoreJobForceGC:
		return c.forceGC(eval)
	default:
		return fmt.Errorf("core scheduler cannot handle job '%s'", eval.JobID)
	}
}

// forceGC is used to garbage collect all eligible objects.
func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
	if err := c.jobGC(eval); err != nil {
		return err
	}
	if err := c.evalGC(eval); err != nil {
		return err
	}

	// Node GC must occur after the others to ensure the allocations are
	// cleared.
	return c.nodeGC(eval)
}

// jobGC is used to garbage collect eligible jobs.
func (c *CoreScheduler) jobGC(eval *structs.Evaluation) error {
	// Get all the jobs eligible for garbage collection.
	ws := memdb.NewWatchSet()
	iter, err := c.snap.JobsByGC(ws, true)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced job GC")
	} else {
		// Get the time table to calculate GC cutoffs.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.JobGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: job GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.JobGCThreshold)
	}

	// Collect the allocations, evaluations and jobs to GC
	var gcAlloc, gcEval, gcJob []string

OUTER:
	for i := iter.Next(); i != nil; i = iter.Next() {
		job := i.(*structs.Job)

		// Ignore new jobs.
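		// A job is "new" when its create index is above the threshold index,
		// i.e. it was written after the cutoff mapped through the time table.
		// For a forced GC the threshold is MaxUint64, so nothing is skipped here.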
		if job.CreateIndex > oldThreshold {
			continue
		}

		ws := memdb.NewWatchSet()
		evals, err := c.snap.EvalsByJob(ws, job.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get evals for job %s: %v", job.ID, err)
			continue
		}

		allEvalsGC := true
		var jobAlloc, jobEval []string
		for _, eval := range evals {
			gc, allocs, err := c.gcEval(eval, oldThreshold, true)
			if err != nil {
				continue OUTER
			}

			if gc {
				jobEval = append(jobEval, eval.ID)
				jobAlloc = append(jobAlloc, allocs...)
			} else {
				allEvalsGC = false
				break
			}
		}

		// Job is eligible for garbage collection
		if allEvalsGC {
			gcJob = append(gcJob, job.ID)
			gcAlloc = append(gcAlloc, jobAlloc...)
			gcEval = append(gcEval, jobEval...)
		}
	}

	// Fast-path the nothing case
	if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: job GC: %d jobs, %d evaluations, %d allocs eligible",
		len(gcJob), len(gcEval), len(gcAlloc))

	// Reap the evals and allocs
	if err := c.evalReap(gcEval, gcAlloc); err != nil {
		return err
	}

	// Call to the leader to deregister the jobs.
	for _, job := range gcJob {
		req := structs.JobDeregisterRequest{
			JobID: job,
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		var resp structs.JobDeregisterResponse
		if err := c.srv.RPC("Job.Deregister", &req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: job deregister failed: %v", err)
			return err
		}
	}

	return nil
}

// evalGC is used to garbage collect old evaluations
func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
	// Iterate over the evaluations
	ws := memdb.NewWatchSet()
	iter, err := c.snap.Evals(ws)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced eval GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table. This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.EvalGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: eval GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.EvalGCThreshold)
	}

	// Collect the allocations and evaluations to GC
	var gcAlloc, gcEval []string
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		eval := raw.(*structs.Evaluation)

		// The Evaluation GC should not handle batch jobs since those need to be
		// garbage collected in one shot
		gc, allocs, err := c.gcEval(eval, oldThreshold, false)
		if err != nil {
			return err
		}

		if gc {
			gcEval = append(gcEval, eval.ID)
		}
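		// Alloc IDs returned by gcEval are reaped even when the eval itself is
		// not eligible; gcEval only returns allocations that are already
		// terminal and older than the threshold.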
		gcAlloc = append(gcAlloc, allocs...)
	}

	// Fast-path the nothing case
	if len(gcEval) == 0 && len(gcAlloc) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: eval GC: %d evaluations, %d allocs eligible",
		len(gcEval), len(gcAlloc))

	return c.evalReap(gcEval, gcAlloc)
}

// gcEval returns whether the eval should be garbage collected given a Raft
// threshold index. The eval is disqualified from garbage collection if it or
// its allocs are not older than the threshold. If the eval should be garbage
// collected, the alloc IDs that should be removed along with it are also
// returned.
func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, allowBatch bool) (
	bool, []string, error) {
	// Ignore non-terminal and new evaluations
	if !eval.TerminalStatus() || eval.ModifyIndex > thresholdIndex {
		return false, nil, nil
	}

	// Create a watchset
	ws := memdb.NewWatchSet()

	// If the eval is from a running "batch" job we don't want to garbage
	// collect its allocations. If there is a long running batch job and its
	// terminal allocations get GC'd the scheduler would re-run the
	// allocations.
	if eval.Type == structs.JobTypeBatch {
		// Check if the job is running
		job, err := c.snap.JobByID(ws, eval.JobID)
		if err != nil {
			return false, nil, err
		}

		// We don't want to gc anything related to a job which is not dead
		// If the batch job doesn't exist we can GC it regardless of allowBatch
		if job != nil && (!allowBatch || job.Status != structs.JobStatusDead) {
			return false, nil, nil
		}
	}

	// Get the allocations by eval
	allocs, err := c.snap.AllocsByEval(ws, eval.ID)
	if err != nil {
		c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for eval %s: %v",
			eval.ID, err)
		return false, nil, err
	}

	// Scan the allocations to ensure they are terminal and old
	gcEval := true
	var gcAllocIDs []string
	for _, alloc := range allocs {
		if !alloc.TerminalStatus() || alloc.ModifyIndex > thresholdIndex {
			// Can't GC the evaluation since not all of the allocations are
			// terminal
			gcEval = false
		} else {
			// The allocation is eligible to be GC'd
			gcAllocIDs = append(gcAllocIDs, alloc.ID)
		}
	}

	return gcEval, gcAllocIDs, nil
}

// evalReap contacts the leader and issues a reap on the passed evals and
// allocs.
func (c *CoreScheduler) evalReap(evals, allocs []string) error {
	// Call to the leader to issue the reap
	for _, req := range c.partitionReap(evals, allocs) {
		var resp structs.GenericResponse
		if err := c.srv.RPC("Eval.Reap", req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: eval reap failed: %v", err)
			return err
		}
	}

	return nil
}

// partitionReap returns a list of EvalDeleteRequest to make, ensuring a single
// request does not contain too many allocations and evaluations. This is
// necessary to ensure that the Raft transaction does not become too large.
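// Allocation IDs are packed into each request first, then any remaining
// capacity is filled with evaluation IDs. As a rough illustration (assuming a
// cap of 3 instead of maxIdsPerReap): allocs [a1 a2 a3 a4] and evals [e1 e2]
// would produce two requests, the first carrying a1-a3 and the second carrying
// a4, e1 and e2.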
func (c *CoreScheduler) partitionReap(evals, allocs []string) []*structs.EvalDeleteRequest {
	var requests []*structs.EvalDeleteRequest
	submittedEvals, submittedAllocs := 0, 0
	for submittedEvals != len(evals) || submittedAllocs != len(allocs) {
		req := &structs.EvalDeleteRequest{
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		requests = append(requests, req)
		available := maxIdsPerReap

		// Add the allocs first
		if remaining := len(allocs) - submittedAllocs; remaining > 0 {
			if remaining <= available {
				req.Allocs = allocs[submittedAllocs:]
				available -= remaining
				submittedAllocs += remaining
			} else {
				req.Allocs = allocs[submittedAllocs : submittedAllocs+available]
				submittedAllocs += available

				// Exhausted space so skip adding evals
				continue
			}
		}

		// Add the evals
		if remaining := len(evals) - submittedEvals; remaining > 0 {
			if remaining <= available {
				req.Evals = evals[submittedEvals:]
				submittedEvals += remaining
			} else {
				req.Evals = evals[submittedEvals : submittedEvals+available]
				submittedEvals += available
			}
		}
	}

	return requests
}

// nodeGC is used to garbage collect old nodes
func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error {
	// Iterate over the nodes
	ws := memdb.NewWatchSet()
	iter, err := c.snap.Nodes(ws)
	if err != nil {
		return err
	}

	var oldThreshold uint64
	if eval.JobID == structs.CoreJobForceGC {
		// The GC was forced, so set the threshold to its maximum so everything
		// will GC.
		oldThreshold = math.MaxUint64
		c.srv.logger.Println("[DEBUG] sched.core: forced node GC")
	} else {
		// Compute the old threshold limit for GC using the FSM
		// time table. This is a rough mapping of a time to the
		// Raft index it belongs to.
		tt := c.srv.fsm.TimeTable()
		cutoff := time.Now().UTC().Add(-1 * c.srv.config.NodeGCThreshold)
		oldThreshold = tt.NearestIndex(cutoff)
		c.srv.logger.Printf("[DEBUG] sched.core: node GC: scanning before index %d (%v)",
			oldThreshold, c.srv.config.NodeGCThreshold)
	}

	// Collect the nodes to GC
	var gcNode []string
OUTER:
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		node := raw.(*structs.Node)

		// Ignore non-terminal and new nodes
		if !node.TerminalStatus() || node.ModifyIndex > oldThreshold {
			continue
		}

		// Get the allocations by node
		ws := memdb.NewWatchSet()
		allocs, err := c.snap.AllocsByNode(ws, node.ID)
		if err != nil {
			c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for node %s: %v",
				node.ID, err)
			continue
		}

		// If there are any non-terminal allocations, skip the node. If the node
		// is terminal and the allocations are not, the scheduler may not have
		// run yet to transition the allocs on the node to terminal. We delay
		// GC'ing until this happens.
		for _, alloc := range allocs {
			if !alloc.TerminalStatus() {
				continue OUTER
			}
		}

		// Node is eligible for garbage collection
		gcNode = append(gcNode, node.ID)
	}

	// Fast-path the nothing case
	if len(gcNode) == 0 {
		return nil
	}
	c.srv.logger.Printf("[DEBUG] sched.core: node GC: %d nodes eligible", len(gcNode))

	// Call to the leader to issue the reap
	for _, nodeID := range gcNode {
		req := structs.NodeDeregisterRequest{
			NodeID: nodeID,
			WriteRequest: structs.WriteRequest{
				Region: c.srv.config.Region,
			},
		}
		var resp structs.NodeUpdateResponse
		if err := c.srv.RPC("Node.Deregister", &req, &resp); err != nil {
			c.srv.logger.Printf("[ERR] sched.core: node '%s' reap failed: %v", nodeID, err)
			return err
		}
	}
	return nil
}