github.com/ryanslade/nomad@v0.2.4-0.20160128061903-fc95782f2089/nomad/plan_apply.go

package nomad

import (
	"fmt"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
)

// planApply is a long-lived goroutine that reads plan allocations from
// the plan queue, determines if they can be applied safely, and applies
// them via Raft.
//
// Naively, we could simply dequeue a plan, verify it, apply it, and then
// respond. However, plan application is bounded by the Raft apply time and
// subject to some latency. This creates a stall condition, where we are
// not evaluating, but simply waiting for a transaction to apply.
//
// To avoid this, we overlap verification with apply. This means once
// we've verified plan N we attempt to apply it. However, while waiting
// for the apply, we begin to verify plan N+1 under the assumption that
// plan N has succeeded.
//
// In this sense, we track two parallel versions of the world. One is
// the pessimistic one driven by the Raft log, which is replicated. The
// other is optimistic and assumes our transactions will succeed. In the
// happy path, this lets us do productive work during the latency of
// the apply.
//
// In the unhappy path (the Raft transaction fails), we have effectively
// only wasted work during a time we would have been waiting anyway.
// However, in anticipation of this case we cannot respond to the plan
// until the Raft log is updated. This means our schedulers will stall,
// but there are many of those and only a single plan verifier.
func (s *Server) planApply() {
	// waitCh is used to track an outstanding application while snap
	// holds an optimistic state which includes that plan application.
	var waitCh chan struct{}
	var snap *state.StateSnapshot

	for {
		// Pull the next pending plan, exit if we are no longer leader
		pending, err := s.planQueue.Dequeue(0)
		if err != nil {
			return
		}

		// Verify the evaluation is outstanding, and that the tokens match.
		if err := s.evalBroker.OutstandingReset(pending.plan.EvalID, pending.plan.EvalToken); err != nil {
			s.logger.Printf("[ERR] nomad: plan rejected for evaluation %s: %v",
				pending.plan.EvalID, err)
			pending.respond(nil, err)
			continue
		}

		// Check if our last plan has completed
		select {
		case <-waitCh:
			waitCh = nil
			snap = nil
		default:
		}

		// Snapshot the state so that we have a consistent view of the world,
		// unless an optimistic snapshot is already available
		if waitCh == nil || snap == nil {
			snap, err = s.fsm.State().Snapshot()
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to snapshot state: %v", err)
				pending.respond(nil, err)
				continue
			}
		}

		// Evaluate the plan
		result, err := evaluatePlan(snap, pending.plan)
		if err != nil {
			s.logger.Printf("[ERR] nomad: failed to evaluate plan: %v", err)
			pending.respond(nil, err)
			continue
		}

		// Fast-path the response if there is nothing to do
		if result.IsNoOp() {
			pending.respond(result, nil)
			continue
		}

		// Ensure any parallel apply is complete before starting the next one.
		// This also limits how out of date our snapshot can be.
		if waitCh != nil {
			<-waitCh
			snap, err = s.fsm.State().Snapshot()
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to snapshot state: %v", err)
				pending.respond(nil, err)
				continue
			}
		}

		// Dispatch the Raft transaction for the plan
		future, err := s.applyPlan(result, snap)
		if err != nil {
			s.logger.Printf("[ERR] nomad: failed to submit plan: %v", err)
			pending.respond(nil, err)
			continue
		}

		// Respond to the plan asynchronously
		waitCh = make(chan struct{})
		go s.asyncPlanWait(waitCh, future, result, pending)
	}
}
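// The loop above pipelines verification with application: plan N+1 is
// evaluated while plan N is still committing through Raft. What follows is a
// minimal, self-contained sketch of that pattern for illustration only; the
// pipelineSketch name and its verify/apply callbacks are hypothetical
// stand-ins, not part of this package's API.
func pipelineSketch(plans <-chan int, verify func(plan int) bool, apply func(plan int) error) {
	// waitCh tracks the outstanding apply, mirroring planApply above.
	var waitCh chan struct{}
	for plan := range plans {
		// Verify optimistically, assuming the outstanding apply succeeds.
		if !verify(plan) {
			continue
		}
		// Wait for the previous apply before dispatching a new one; this
		// bounds how stale the optimistic view of state can become.
		if waitCh != nil {
			<-waitCh
		}
		done := make(chan struct{})
		waitCh = done
		go func(p int) {
			// The Raft apply latency is hidden here: the next loop
			// iteration verifies plan N+1 while this goroutine blocks.
			defer close(done)
			_ = apply(p) // a real caller must surface this error
		}(plan)
	}
}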
// applyPlan is used to apply the plan result and to return the alloc index
func (s *Server) applyPlan(result *structs.PlanResult, snap *state.StateSnapshot) (raft.ApplyFuture, error) {
	// Flatten the per-node updates and allocations into a single request
	req := structs.AllocUpdateRequest{}
	for _, updateList := range result.NodeUpdate {
		req.Alloc = append(req.Alloc, updateList...)
	}
	for _, allocList := range result.NodeAllocation {
		req.Alloc = append(req.Alloc, allocList...)
	}
	req.Alloc = append(req.Alloc, result.FailedAllocs...)

	// Dispatch the Raft transaction
	future, err := s.raftApplyFuture(structs.AllocUpdateRequestType, &req)
	if err != nil {
		return nil, err
	}

	// Optimistically apply to our state view at the index the transaction
	// is expected to commit at
	if snap != nil {
		nextIdx := s.raft.AppliedIndex() + 1
		if err := snap.UpsertAllocs(nextIdx, req.Alloc); err != nil {
			return future, err
		}
	}
	return future, nil
}

// asyncPlanWait is used to wait for the Raft apply of a plan and to respond
// to it asynchronously
func (s *Server) asyncPlanWait(waitCh chan struct{}, future raft.ApplyFuture,
	result *structs.PlanResult, pending *pendingPlan) {
	defer metrics.MeasureSince([]string{"nomad", "plan", "apply"}, time.Now())
	defer close(waitCh)

	// Wait for the plan to apply
	if err := future.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to apply plan: %v", err)
		pending.respond(nil, err)
		return
	}

	// Respond to the plan
	result.AllocIndex = future.Index()
	pending.respond(result, nil)
}
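// asyncPlanWait responds only after future.Error() returns, so a plan is
// never acknowledged before the Raft log reflects it. Below is an
// illustrative sketch of that contract; applyFutureSketch is a hypothetical
// stand-in for the subset of raft.ApplyFuture used here.
type applyFutureSketch interface {
	Error() error  // blocks until the apply completes or fails
	Index() uint64 // log index the entry was committed at
}

func waitAndRespondSketch(f applyFutureSketch, respond func(index uint64, err error)) {
	if err := f.Error(); err != nil {
		// The optimistic snapshot turned out to be wrong; the scheduler
		// must retry against refreshed state rather than trust it.
		respond(0, err)
		return
	}
	// Safe to acknowledge: the allocations are durable in the Raft log.
	respond(f.Index(), nil)
}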
// evaluatePlan is used to determine what portions of a plan, if any, can be
// applied. It returns the (possibly partial) plan application that should be
// made, or an error.
func evaluatePlan(snap *state.StateSnapshot, plan *structs.Plan) (*structs.PlanResult, error) {
	defer metrics.MeasureSince([]string{"nomad", "plan", "evaluate"}, time.Now())

	// Create a result holder for the plan
	result := &structs.PlanResult{
		NodeUpdate:     make(map[string][]*structs.Allocation),
		NodeAllocation: make(map[string][]*structs.Allocation),
		FailedAllocs:   plan.FailedAllocs,
	}

	// Collect all the nodeIDs
	nodeIDs := make(map[string]struct{})
	for nodeID := range plan.NodeUpdate {
		nodeIDs[nodeID] = struct{}{}
	}
	for nodeID := range plan.NodeAllocation {
		nodeIDs[nodeID] = struct{}{}
	}

	// Check each allocation to see if it should be allowed
	for nodeID := range nodeIDs {
		// Evaluate the plan for this node
		fit, err := evaluateNodePlan(snap, plan, nodeID)
		if err != nil {
			return nil, err
		}
		if !fit {
			// The scheduler must have stale data; RefreshIndex should force
			// the latest view of allocations and nodes
			allocIndex, err := snap.Index("allocs")
			if err != nil {
				return nil, err
			}
			nodeIndex, err := snap.Index("nodes")
			if err != nil {
				return nil, err
			}
			result.RefreshIndex = maxUint64(nodeIndex, allocIndex)

			// If we require all-at-once scheduling, there is no point in
			// continuing the evaluation, as we've already failed.
			if plan.AllAtOnce {
				result.NodeUpdate = nil
				result.NodeAllocation = nil
				return result, nil
			}

			// Skip this node, since it cannot be used.
			continue
		}

		// Add this node to the plan result
		if nodeUpdate := plan.NodeUpdate[nodeID]; len(nodeUpdate) > 0 {
			result.NodeUpdate[nodeID] = nodeUpdate
		}
		if nodeAlloc := plan.NodeAllocation[nodeID]; len(nodeAlloc) > 0 {
			result.NodeAllocation[nodeID] = nodeAlloc
		}
	}
	return result, nil
}
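// evaluatePlan admits each node independently unless AllAtOnce is set, so
// the result may be a strict subset of the submitted plan. Below is a toy
// sketch of that per-node filtering shape over plain maps; the
// filterFeasibleSketch name and its fits callback are hypothetical, not part
// of this package.
func filterFeasibleSketch(requested map[string][]string, allAtOnce bool, fits func(nodeID string) bool) map[string][]string {
	admitted := make(map[string][]string, len(requested))
	for nodeID, allocs := range requested {
		if !fits(nodeID) {
			if allAtOnce {
				// All-or-nothing plans fail outright on the first
				// infeasible node.
				return nil
			}
			// Otherwise skip only this node; the rest still applies.
			continue
		}
		admitted[nodeID] = allocs
	}
	return admitted
}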
// evaluateNodePlan is used to evaluate the plan for a single node, returning
// whether the plan is valid for that node, or an error if one is encountered
func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID string) (bool, error) {
	// If this is an evict-only plan, it always 'fits' since we are removing things.
	if len(plan.NodeAllocation[nodeID]) == 0 {
		return true, nil
	}

	// Get the node itself
	node, err := snap.NodeByID(nodeID)
	if err != nil {
		return false, fmt.Errorf("failed to get node '%s': %v", nodeID, err)
	}

	// If the node does not exist or is not ready for scheduling, it is not fit
	// XXX: There is a potential race between when we do this check and when
	// the Raft commit happens.
	if node == nil || node.Status != structs.NodeStatusReady || node.Drain {
		return false, nil
	}

	// Get the existing allocations
	existingAlloc, err := snap.AllocsByNode(nodeID)
	if err != nil {
		return false, fmt.Errorf("failed to get existing allocations for '%s': %v", nodeID, err)
	}

	// Filter on alloc state
	existingAlloc = structs.FilterTerminalAllocs(existingAlloc)

	// Determine the proposed allocations by first removing allocations that
	// are planned evictions (as well as the old versions of any updated
	// allocations) and then adding the new allocations.
	var remove []*structs.Allocation
	if update := plan.NodeUpdate[nodeID]; len(update) > 0 {
		remove = append(remove, update...)
	}
	if updated := plan.NodeAllocation[nodeID]; len(updated) > 0 {
		remove = append(remove, updated...)
	}
	proposed := structs.RemoveAllocs(existingAlloc, remove)
	proposed = append(proposed, plan.NodeAllocation[nodeID]...)

	// Check if these allocations fit
	fit, _, _, err := structs.AllocsFit(node, proposed, nil)
	return fit, err
}
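// evaluateNodePlan builds the proposed node state by dropping planned
// evictions and prior versions of updated allocations before appending the
// plan's new allocations. Here is a self-contained sketch of that set
// arithmetic over plain IDs; proposeSketch is hypothetical and assumes
// removal matches on allocation ID, as structs.RemoveAllocs does.
func proposeSketch(existing, evict, place []string) []string {
	drop := make(map[string]struct{}, len(evict)+len(place))
	for _, id := range evict {
		drop[id] = struct{}{}
	}
	for _, id := range place {
		// Updated allocations appear in both the removal set and the new
		// set, so the plan's version replaces the existing one.
		drop[id] = struct{}{}
	}
	proposed := make([]string, 0, len(existing)+len(place))
	for _, id := range existing {
		if _, ok := drop[id]; !ok {
			proposed = append(proposed, id)
		}
	}
	return append(proposed, place...)
}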