github.com/taylorchu/nomad@v0.5.3-rc1.0.20170407200202-db11e7dd7b55/nomad/plan_apply.go

package nomad

import (
	"fmt"
	"runtime"
	"time"

	"github.com/armon/go-metrics"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
)

// planApply is a long-lived goroutine that reads plan allocations from
// the plan queue, determines if they can be applied safely and applies
// them via Raft.
//
// Naively, we could simply dequeue a plan, verify, apply and then respond.
// However, the plan application is bounded by the Raft apply time and
// subject to some latency. This creates a stall condition, where we are
// not evaluating, but simply waiting for a transaction to apply.
//
// To avoid this, we overlap verification with apply. This means once
// we've verified plan N we attempt to apply it. However, while waiting
// for apply, we begin to verify plan N+1 under the assumption that plan
// N has succeeded.
//
// In this sense, we track two parallel versions of the world. One is
// the pessimistic one driven by the Raft log which is replicated. The
// other is optimistic and assumes our transactions will succeed. In the
// happy path, this lets us do productive work during the latency of
// apply.
//
// In the unhappy path (Raft transaction fails), effectively we only
// wasted work during a time we would have been waiting anyway. However,
// in anticipation of this case we cannot respond to the plan until
// the Raft log is updated. This means our schedulers will stall,
// but there are many of those and only a single plan verifier.
func (s *Server) planApply() {
	// waitCh is used to track an outstanding application while snap
	// holds an optimistic state which includes that plan application.
	var waitCh chan struct{}
	var snap *state.StateSnapshot

	// Set up a worker pool with half the cores, but at least one worker
	poolSize := runtime.NumCPU() / 2
	if poolSize == 0 {
		poolSize = 1
	}
	pool := NewEvaluatePool(poolSize, workerPoolBufferSize)
	defer pool.Shutdown()

	for {
		// Pull the next pending plan, exit if we are no longer leader
		pending, err := s.planQueue.Dequeue(0)
		if err != nil {
			return
		}

		// Check if our last plan has completed
		select {
		case <-waitCh:
			waitCh = nil
			snap = nil
		default:
		}

		// Snapshot the state so that we have a consistent view of the world
		// if we do not already have an optimistic snapshot.
		if waitCh == nil || snap == nil {
			snap, err = s.fsm.State().Snapshot()
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to snapshot state: %v", err)
				pending.respond(nil, err)
				continue
			}
		}

		// Evaluate the plan
		result, err := evaluatePlan(pool, snap, pending.plan)
		if err != nil {
			s.logger.Printf("[ERR] nomad: failed to evaluate plan: %v", err)
			pending.respond(nil, err)
			continue
		}

		// Fast-path the response if there is nothing to do
		if result.IsNoOp() {
			pending.respond(result, nil)
			continue
		}

		// Ensure any parallel apply is complete before starting the next one.
		// This also limits how out of date our snapshot can be.
		if waitCh != nil {
			<-waitCh
			snap, err = s.fsm.State().Snapshot()
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to snapshot state: %v", err)
				pending.respond(nil, err)
				continue
			}
		}

		// Dispatch the Raft transaction for the plan
		future, err := s.applyPlan(pending.plan.Job, result, snap)
		if err != nil {
			s.logger.Printf("[ERR] nomad: failed to submit plan: %v", err)
			pending.respond(nil, err)
			continue
		}

		// Respond to the plan asynchronously
		waitCh = make(chan struct{})
		go s.asyncPlanWait(waitCh, future, result, pending)
	}
}

// applyPlan is used to apply the plan result and to return the alloc index
func (s *Server) applyPlan(job *structs.Job, result *structs.PlanResult, snap *state.StateSnapshot) (raft.ApplyFuture, error) {
	// Determine the minimum number of updates; there could be more if there
	// are multiple updates per node
	minUpdates := len(result.NodeUpdate)
	minUpdates += len(result.NodeAllocation)

	// Set up the update request
	req := structs.AllocUpdateRequest{
		Job:   job,
		Alloc: make([]*structs.Allocation, 0, minUpdates),
	}
	for _, updateList := range result.NodeUpdate {
		req.Alloc = append(req.Alloc, updateList...)
	}
	for _, allocList := range result.NodeAllocation {
		req.Alloc = append(req.Alloc, allocList...)
	}

	// Set the time the alloc was applied for the first time. This can be used
	// to approximate the scheduling time.
	now := time.Now().UTC().UnixNano()
	for _, alloc := range req.Alloc {
		if alloc.CreateTime == 0 {
			alloc.CreateTime = now
		}
	}

	// Dispatch the Raft transaction
	future, err := s.raftApplyFuture(structs.AllocUpdateRequestType, &req)
	if err != nil {
		return nil, err
	}

	// Optimistically apply to our state view
	if snap != nil {
		nextIdx := s.raft.AppliedIndex() + 1
		if err := snap.UpsertAllocs(nextIdx, req.Alloc); err != nil {
			return future, err
		}
	}
	return future, nil
}

// asyncPlanWait is used to wait for the Raft apply of a plan and respond
// to the pending plan asynchronously
func (s *Server) asyncPlanWait(waitCh chan struct{}, future raft.ApplyFuture,
	result *structs.PlanResult, pending *pendingPlan) {
	defer metrics.MeasureSince([]string{"nomad", "plan", "apply"}, time.Now())
	defer close(waitCh)

	// Wait for the plan to apply
	if err := future.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to apply plan: %v", err)
		pending.respond(nil, err)
		return
	}

	// Respond to the plan
	result.AllocIndex = future.Index()

	// If this is a partial plan application, we need to ensure the scheduler
	// at least has visibility into any placements it made to avoid double placement.
	// The RefreshIndex computed by evaluatePlan may be stale due to evaluation
	// against an optimistic copy of the state.
	if result.RefreshIndex != 0 {
		result.RefreshIndex = maxUint64(result.RefreshIndex, result.AllocIndex)
	}
	pending.respond(result, nil)
}

// evaluatePlan is used to determine what portions of a plan
// can be applied, if any.
// It returns the plan application, which may be partial, or an error if one
// was encountered.
func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan) (*structs.PlanResult, error) {
	defer metrics.MeasureSince([]string{"nomad", "plan", "evaluate"}, time.Now())

	// Create a result holder for the plan
	result := &structs.PlanResult{
		NodeUpdate:     make(map[string][]*structs.Allocation),
		NodeAllocation: make(map[string][]*structs.Allocation),
	}

	// Collect all the nodeIDs
	nodeIDs := make(map[string]struct{})
	nodeIDList := make([]string, 0, len(plan.NodeUpdate)+len(plan.NodeAllocation))
	for nodeID := range plan.NodeUpdate {
		if _, ok := nodeIDs[nodeID]; !ok {
			nodeIDs[nodeID] = struct{}{}
			nodeIDList = append(nodeIDList, nodeID)
		}
	}
	for nodeID := range plan.NodeAllocation {
		if _, ok := nodeIDs[nodeID]; !ok {
			nodeIDs[nodeID] = struct{}{}
			nodeIDList = append(nodeIDList, nodeID)
		}
	}

	// Set up a multierror to handle potentially getting many
	// errors since we are processing in parallel.
	var mErr multierror.Error
	partialCommit := false

	// handleResult is used to process the result of evaluateNodePlan
	handleResult := func(nodeID string, fit bool, err error) (cancel bool) {
		// Errors are fatal to the evaluation of this node
		if err != nil {
			mErr.Errors = append(mErr.Errors, err)
			return true
		}
		if !fit {
			// Set that this is a partial commit
			partialCommit = true

			// If we require all-at-once scheduling, there is no point
			// to continue the evaluation, as we've already failed.
			if plan.AllAtOnce {
				result.NodeUpdate = nil
				result.NodeAllocation = nil
				return true
			}

			// Skip this node, since it cannot be used.
			return
		}

		// Add this to the plan result
		if nodeUpdate := plan.NodeUpdate[nodeID]; len(nodeUpdate) > 0 {
			result.NodeUpdate[nodeID] = nodeUpdate
		}
		if nodeAlloc := plan.NodeAllocation[nodeID]; len(nodeAlloc) > 0 {
			result.NodeAllocation[nodeID] = nodeAlloc
		}
		return
	}

	// Get the pool channels
	req := pool.RequestCh()
	resp := pool.ResultCh()
	outstanding := 0
	didCancel := false

	// Evaluate each node in the plan, handling results as they are ready to
	// avoid blocking.
DISPATCH:
	for len(nodeIDList) > 0 {
		nodeID := nodeIDList[0]
		select {
		case req <- evaluateRequest{snap, plan, nodeID}:
			outstanding++
			nodeIDList = nodeIDList[1:]
		case r := <-resp:
			outstanding--

			// Handle a result that allows us to cancel evaluation,
			// which may save time processing additional entries. The
			// labeled break is needed so we exit the dispatch loop rather
			// than just the select.
			if cancel := handleResult(r.nodeID, r.fit, r.err); cancel {
				didCancel = true
				break DISPATCH
			}
		}
	}

	// Drain the remaining results
	for outstanding > 0 {
		r := <-resp
		if !didCancel {
			if cancel := handleResult(r.nodeID, r.fit, r.err); cancel {
				didCancel = true
			}
		}
		outstanding--
	}

	// If the plan resulted in a partial commit, we need to determine
	// a minimum refresh index to force the scheduler to work on a more
	// up-to-date state to avoid the failures.
	if partialCommit {
		allocIndex, err := snap.Index("allocs")
		if err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
		nodeIndex, err := snap.Index("nodes")
		if err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
		result.RefreshIndex = maxUint64(nodeIndex, allocIndex)

		if result.RefreshIndex == 0 {
			err := fmt.Errorf("partialCommit with RefreshIndex of 0 (%d node, %d alloc)", nodeIndex, allocIndex)
			mErr.Errors = append(mErr.Errors, err)
		}
	}
	return result, mErr.ErrorOrNil()
}

// evaluateNodePlan is used to evaluate the plan for a single node,
// returning whether the plan fits on the node or an error if one is encountered
func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID string) (bool, error) {
	// If this is an evict-only plan, it always 'fits' since we are removing things.
	if len(plan.NodeAllocation[nodeID]) == 0 {
		return true, nil
	}

	// Get the node itself
	ws := memdb.NewWatchSet()
	node, err := snap.NodeByID(ws, nodeID)
	if err != nil {
		return false, fmt.Errorf("failed to get node '%s': %v", nodeID, err)
	}

	// If the node does not exist or is not ready for scheduling, it is not fit
	// XXX: There is a potential race between when we do this check and when
	// the Raft commit happens.
	if node == nil || node.Status != structs.NodeStatusReady || node.Drain {
		return false, nil
	}

	// Get the existing allocations that are non-terminal
	existingAlloc, err := snap.AllocsByNodeTerminal(ws, nodeID, false)
	if err != nil {
		return false, fmt.Errorf("failed to get existing allocations for '%s': %v", nodeID, err)
	}

	// Determine the proposed allocations by first removing allocations that
	// are planned evictions or replacements and then adding the new allocations.
	var remove []*structs.Allocation
	if update := plan.NodeUpdate[nodeID]; len(update) > 0 {
		remove = append(remove, update...)
	}
	if updated := plan.NodeAllocation[nodeID]; len(updated) > 0 {
		remove = append(remove, updated...)
	}
	proposed := structs.RemoveAllocs(existingAlloc, remove)
	proposed = append(proposed, plan.NodeAllocation[nodeID]...)

	// Check if these allocations fit
	fit, _, _, err := structs.AllocsFit(node, proposed, nil)
	return fit, err
}