github.com/kardianos/nomad@v0.1.3-0.20151022182107-b13df73ee850/nomad/plan_apply.go (about) 1 package nomad 2 3 import ( 4 "fmt" 5 "time" 6 7 "github.com/armon/go-metrics" 8 "github.com/hashicorp/nomad/nomad/state" 9 "github.com/hashicorp/nomad/nomad/structs" 10 "github.com/hashicorp/raft" 11 ) 12 13 // planApply is a long lived goroutine that reads plan allocations from 14 // the plan queue, determines if they can be applied safely and applies 15 // them via Raft. 16 // 17 // Naively, we could simply dequeue a plan, verify, apply and then respond. 18 // However, the plan application is bounded by the Raft apply time and 19 // subject to some latency. This creates a stall condition, where we are 20 // not evaluating, but simply waiting for a transaction to apply. 21 // 22 // To avoid this, we overlap verification with apply. This means once 23 // we've verified plan N we attempt to apply it. However, while waiting 24 // for apply, we begin to verify plan N+1 under the assumption that plan 25 // N has succeeded. 26 // 27 // In this sense, we track two parallel versions of the world. One is 28 // the pessimistic one driven by the Raft log which is replicated. The 29 // other is optimistic and assumes our transactions will succeed. In the 30 // happy path, this lets us do productive work during the latency of 31 // apply. 32 // 33 // In the unhappy path (Raft transaction fails), effectively we only 34 // wasted work during a time we would have been waiting anyways. However, 35 // in anticipation of this case we cannot respond to the plan until 36 // the Raft log is updated. This means our schedulers will stall, 37 // but there are many of those and only a single plan verifier. 38 // 39 func (s *Server) planApply() { 40 // waitCh is used to track an outstanding application while snap 41 // holds an optimistic state which includes that plan application. 42 var waitCh chan struct{} 43 var snap *state.StateSnapshot 44 45 for { 46 // Pull the next pending plan, exit if we are no longer leader 47 pending, err := s.planQueue.Dequeue(0) 48 if err != nil { 49 return 50 } 51 52 // Verify the evaluation is outstanding, and that the tokens match. 53 token, ok := s.evalBroker.Outstanding(pending.plan.EvalID) 54 if !ok { 55 s.logger.Printf("[ERR] nomad: plan received for non-outstanding evaluation %s", 56 pending.plan.EvalID) 57 pending.respond(nil, fmt.Errorf("evaluation is not outstanding")) 58 continue 59 } 60 if pending.plan.EvalToken != token { 61 s.logger.Printf("[ERR] nomad: plan received for evaluation %s with wrong token", 62 pending.plan.EvalID) 63 pending.respond(nil, fmt.Errorf("evaluation token does not match")) 64 continue 65 } 66 67 // Check if out last plan has completed 68 select { 69 case <-waitCh: 70 waitCh = nil 71 snap = nil 72 default: 73 } 74 75 // Snapshot the state so that we have a consistent view of the world 76 // if no snapshot is available 77 if waitCh == nil || snap == nil { 78 snap, err = s.fsm.State().Snapshot() 79 if err != nil { 80 s.logger.Printf("[ERR] nomad: failed to snapshot state: %v", err) 81 pending.respond(nil, err) 82 continue 83 } 84 } 85 86 // Evaluate the plan 87 result, err := evaluatePlan(snap, pending.plan) 88 if err != nil { 89 s.logger.Printf("[ERR] nomad: failed to evaluate plan: %v", err) 90 pending.respond(nil, err) 91 continue 92 } 93 94 // Fast-path the response if there is nothing to do 95 if result.IsNoOp() { 96 pending.respond(result, nil) 97 continue 98 } 99 100 // Ensure any parallel apply is complete before starting the next one. 101 // This also limits how out of date our snapshot can be. 102 if waitCh != nil { 103 <-waitCh 104 snap, err = s.fsm.State().Snapshot() 105 if err != nil { 106 s.logger.Printf("[ERR] nomad: failed to snapshot state: %v", err) 107 pending.respond(nil, err) 108 continue 109 } 110 } 111 112 // Dispatch the Raft transaction for the plan 113 future, err := s.applyPlan(result, snap) 114 if err != nil { 115 s.logger.Printf("[ERR] nomad: failed to submit plan: %v", err) 116 pending.respond(nil, err) 117 continue 118 } 119 120 // Respond to the plan in async 121 waitCh = make(chan struct{}) 122 go s.asyncPlanWait(waitCh, future, result, pending) 123 } 124 } 125 126 // applyPlan is used to apply the plan result and to return the alloc index 127 func (s *Server) applyPlan(result *structs.PlanResult, snap *state.StateSnapshot) (raft.ApplyFuture, error) { 128 req := structs.AllocUpdateRequest{} 129 for _, updateList := range result.NodeUpdate { 130 req.Alloc = append(req.Alloc, updateList...) 131 } 132 for _, allocList := range result.NodeAllocation { 133 req.Alloc = append(req.Alloc, allocList...) 134 } 135 req.Alloc = append(req.Alloc, result.FailedAllocs...) 136 137 // Dispatch the Raft transaction 138 future, err := s.raftApplyFuture(structs.AllocUpdateRequestType, &req) 139 if err != nil { 140 return nil, err 141 } 142 143 // Optimistically apply to our state view 144 if snap != nil { 145 nextIdx := s.raft.AppliedIndex() + 1 146 if err := snap.UpsertAllocs(nextIdx, req.Alloc); err != nil { 147 return future, err 148 } 149 } 150 return future, nil 151 } 152 153 // asyncPlanWait is used to apply and respond to a plan async 154 func (s *Server) asyncPlanWait(waitCh chan struct{}, future raft.ApplyFuture, 155 result *structs.PlanResult, pending *pendingPlan) { 156 defer metrics.MeasureSince([]string{"nomad", "plan", "apply"}, time.Now()) 157 defer close(waitCh) 158 159 // Wait for the plan to apply 160 if err := future.Error(); err != nil { 161 s.logger.Printf("[ERR] nomad: failed to apply plan: %v", err) 162 pending.respond(nil, err) 163 return 164 } 165 166 // Respond to the plan 167 result.AllocIndex = future.Index() 168 pending.respond(result, nil) 169 } 170 171 // evaluatePlan is used to determine what portions of a plan 172 // can be applied if any. Returns if there should be a plan application 173 // which may be partial or if there was an error 174 func evaluatePlan(snap *state.StateSnapshot, plan *structs.Plan) (*structs.PlanResult, error) { 175 defer metrics.MeasureSince([]string{"nomad", "plan", "evaluate"}, time.Now()) 176 177 // Create a result holder for the plan 178 result := &structs.PlanResult{ 179 NodeUpdate: make(map[string][]*structs.Allocation), 180 NodeAllocation: make(map[string][]*structs.Allocation), 181 FailedAllocs: plan.FailedAllocs, 182 } 183 184 // Collect all the nodeIDs 185 nodeIDs := make(map[string]struct{}) 186 for nodeID := range plan.NodeUpdate { 187 nodeIDs[nodeID] = struct{}{} 188 } 189 for nodeID := range plan.NodeAllocation { 190 nodeIDs[nodeID] = struct{}{} 191 } 192 193 // Check each allocation to see if it should be allowed 194 for nodeID := range nodeIDs { 195 // Evaluate the plan for this node 196 fit, err := evaluateNodePlan(snap, plan, nodeID) 197 if err != nil { 198 return nil, err 199 } 200 if !fit { 201 // Scheduler must have stale data, RefreshIndex should force 202 // the latest view of allocations and nodes 203 allocIndex, err := snap.Index("allocs") 204 if err != nil { 205 return nil, err 206 } 207 nodeIndex, err := snap.Index("nodes") 208 if err != nil { 209 return nil, err 210 } 211 result.RefreshIndex = maxUint64(nodeIndex, allocIndex) 212 213 // If we require all-at-once scheduling, there is no point 214 // to continue the evaluation, as we've already failed. 215 if plan.AllAtOnce { 216 result.NodeUpdate = nil 217 result.NodeAllocation = nil 218 return result, nil 219 } 220 221 // Skip this node, since it cannot be used. 222 continue 223 } 224 225 // Add this to the plan result 226 if nodeUpdate := plan.NodeUpdate[nodeID]; len(nodeUpdate) > 0 { 227 result.NodeUpdate[nodeID] = nodeUpdate 228 } 229 if nodeAlloc := plan.NodeAllocation[nodeID]; len(nodeAlloc) > 0 { 230 result.NodeAllocation[nodeID] = nodeAlloc 231 } 232 } 233 return result, nil 234 } 235 236 // evaluateNodePlan is used to evalute the plan for a single node, 237 // returning if the plan is valid or if an error is encountered 238 func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID string) (bool, error) { 239 // If this is an evict-only plan, it always 'fits' since we are removing things. 240 if len(plan.NodeAllocation[nodeID]) == 0 { 241 return true, nil 242 } 243 244 // Get the node itself 245 node, err := snap.NodeByID(nodeID) 246 if err != nil { 247 return false, fmt.Errorf("failed to get node '%s': %v", nodeID, err) 248 } 249 250 // If the node does not exist or is not ready for schduling it is not fit 251 // XXX: There is a potential race between when we do this check and when 252 // the Raft commit happens. 253 if node == nil || node.Status != structs.NodeStatusReady || node.Drain { 254 return false, nil 255 } 256 257 // Get the existing allocations 258 existingAlloc, err := snap.AllocsByNode(nodeID) 259 if err != nil { 260 return false, fmt.Errorf("failed to get existing allocations for '%s': %v", nodeID, err) 261 } 262 263 // Filter on alloc state 264 existingAlloc = structs.FilterTerminalAllocs(existingAlloc) 265 266 // Determine the proposed allocation by first removing allocations 267 // that are planned evictions and adding the new allocations. 268 proposed := existingAlloc 269 var remove []*structs.Allocation 270 if update := plan.NodeUpdate[nodeID]; len(update) > 0 { 271 remove = append(remove, update...) 272 } 273 if updated := plan.NodeAllocation[nodeID]; len(updated) > 0 { 274 for _, alloc := range updated { 275 remove = append(remove, alloc) 276 } 277 } 278 proposed = structs.RemoveAllocs(existingAlloc, remove) 279 proposed = append(proposed, plan.NodeAllocation[nodeID]...) 280 281 // Check if these allocations fit 282 fit, _, _, err := structs.AllocsFit(node, proposed, nil) 283 return fit, err 284 }