github.com/mattyr/nomad@v0.3.3-0.20160919021406-3485a065154a/nomad/plan_apply.go (about) 1 package nomad 2 3 import ( 4 "fmt" 5 "runtime" 6 "time" 7 8 "github.com/armon/go-metrics" 9 "github.com/hashicorp/go-multierror" 10 "github.com/hashicorp/nomad/nomad/state" 11 "github.com/hashicorp/nomad/nomad/structs" 12 "github.com/hashicorp/raft" 13 ) 14 15 // planApply is a long lived goroutine that reads plan allocations from 16 // the plan queue, determines if they can be applied safely and applies 17 // them via Raft. 18 // 19 // Naively, we could simply dequeue a plan, verify, apply and then respond. 20 // However, the plan application is bounded by the Raft apply time and 21 // subject to some latency. This creates a stall condition, where we are 22 // not evaluating, but simply waiting for a transaction to apply. 23 // 24 // To avoid this, we overlap verification with apply. This means once 25 // we've verified plan N we attempt to apply it. However, while waiting 26 // for apply, we begin to verify plan N+1 under the assumption that plan 27 // N has succeeded. 28 // 29 // In this sense, we track two parallel versions of the world. One is 30 // the pessimistic one driven by the Raft log which is replicated. The 31 // other is optimistic and assumes our transactions will succeed. In the 32 // happy path, this lets us do productive work during the latency of 33 // apply. 34 // 35 // In the unhappy path (Raft transaction fails), effectively we only 36 // wasted work during a time we would have been waiting anyways. However, 37 // in anticipation of this case we cannot respond to the plan until 38 // the Raft log is updated. This means our schedulers will stall, 39 // but there are many of those and only a single plan verifier. 40 // 41 func (s *Server) planApply() { 42 // waitCh is used to track an outstanding application while snap 43 // holds an optimistic state which includes that plan application. 44 var waitCh chan struct{} 45 var snap *state.StateSnapshot 46 47 // Setup a worker pool with half the cores, with at least 1 48 poolSize := runtime.NumCPU() / 2 49 if poolSize == 0 { 50 poolSize = 1 51 } 52 pool := NewEvaluatePool(poolSize, workerPoolBufferSize) 53 defer pool.Shutdown() 54 55 for { 56 // Pull the next pending plan, exit if we are no longer leader 57 pending, err := s.planQueue.Dequeue(0) 58 if err != nil { 59 return 60 } 61 62 // Check if out last plan has completed 63 select { 64 case <-waitCh: 65 waitCh = nil 66 snap = nil 67 default: 68 } 69 70 // Snapshot the state so that we have a consistent view of the world 71 // if no snapshot is available 72 if waitCh == nil || snap == nil { 73 snap, err = s.fsm.State().Snapshot() 74 if err != nil { 75 s.logger.Printf("[ERR] nomad: failed to snapshot state: %v", err) 76 pending.respond(nil, err) 77 continue 78 } 79 } 80 81 // Evaluate the plan 82 result, err := evaluatePlan(pool, snap, pending.plan) 83 if err != nil { 84 s.logger.Printf("[ERR] nomad: failed to evaluate plan: %v", err) 85 pending.respond(nil, err) 86 continue 87 } 88 89 // Fast-path the response if there is nothing to do 90 if result.IsNoOp() { 91 pending.respond(result, nil) 92 continue 93 } 94 95 // Ensure any parallel apply is complete before starting the next one. 96 // This also limits how out of date our snapshot can be. 97 if waitCh != nil { 98 <-waitCh 99 snap, err = s.fsm.State().Snapshot() 100 if err != nil { 101 s.logger.Printf("[ERR] nomad: failed to snapshot state: %v", err) 102 pending.respond(nil, err) 103 continue 104 } 105 } 106 107 // Dispatch the Raft transaction for the plan 108 future, err := s.applyPlan(pending.plan.Job, result, snap) 109 if err != nil { 110 s.logger.Printf("[ERR] nomad: failed to submit plan: %v", err) 111 pending.respond(nil, err) 112 continue 113 } 114 115 // Respond to the plan in async 116 waitCh = make(chan struct{}) 117 go s.asyncPlanWait(waitCh, future, result, pending) 118 } 119 } 120 121 // applyPlan is used to apply the plan result and to return the alloc index 122 func (s *Server) applyPlan(job *structs.Job, result *structs.PlanResult, snap *state.StateSnapshot) (raft.ApplyFuture, error) { 123 // Determine the miniumum number of updates, could be more if there 124 // are multiple updates per node 125 minUpdates := len(result.NodeUpdate) 126 minUpdates += len(result.NodeAllocation) 127 128 // Setup the update request 129 req := structs.AllocUpdateRequest{ 130 Job: job, 131 Alloc: make([]*structs.Allocation, 0, minUpdates), 132 } 133 for _, updateList := range result.NodeUpdate { 134 req.Alloc = append(req.Alloc, updateList...) 135 } 136 for _, allocList := range result.NodeAllocation { 137 req.Alloc = append(req.Alloc, allocList...) 138 } 139 140 // Set the time the alloc was applied for the first time. This can be used 141 // to approximate the scheduling time. 142 now := time.Now().UTC().UnixNano() 143 for _, alloc := range req.Alloc { 144 if alloc.CreateTime == 0 { 145 alloc.CreateTime = now 146 } 147 } 148 149 // Dispatch the Raft transaction 150 future, err := s.raftApplyFuture(structs.AllocUpdateRequestType, &req) 151 if err != nil { 152 return nil, err 153 } 154 155 // Optimistically apply to our state view 156 if snap != nil { 157 nextIdx := s.raft.AppliedIndex() + 1 158 if err := snap.UpsertAllocs(nextIdx, req.Alloc); err != nil { 159 return future, err 160 } 161 } 162 return future, nil 163 } 164 165 // asyncPlanWait is used to apply and respond to a plan async 166 func (s *Server) asyncPlanWait(waitCh chan struct{}, future raft.ApplyFuture, 167 result *structs.PlanResult, pending *pendingPlan) { 168 defer metrics.MeasureSince([]string{"nomad", "plan", "apply"}, time.Now()) 169 defer close(waitCh) 170 171 // Wait for the plan to apply 172 if err := future.Error(); err != nil { 173 s.logger.Printf("[ERR] nomad: failed to apply plan: %v", err) 174 pending.respond(nil, err) 175 return 176 } 177 178 // Respond to the plan 179 result.AllocIndex = future.Index() 180 181 // If this is a partial plan application, we need to ensure the scheduler 182 // at least has visibility into any placements it made to avoid double placement. 183 // The RefreshIndex computed by evaluatePlan may be stale due to evaluation 184 // against an optimistic copy of the state. 185 if result.RefreshIndex != 0 { 186 result.RefreshIndex = maxUint64(result.RefreshIndex, result.AllocIndex) 187 } 188 pending.respond(result, nil) 189 } 190 191 // evaluatePlan is used to determine what portions of a plan 192 // can be applied if any. Returns if there should be a plan application 193 // which may be partial or if there was an error 194 func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan) (*structs.PlanResult, error) { 195 defer metrics.MeasureSince([]string{"nomad", "plan", "evaluate"}, time.Now()) 196 197 // Create a result holder for the plan 198 result := &structs.PlanResult{ 199 NodeUpdate: make(map[string][]*structs.Allocation), 200 NodeAllocation: make(map[string][]*structs.Allocation), 201 } 202 203 // Collect all the nodeIDs 204 nodeIDs := make(map[string]struct{}) 205 nodeIDList := make([]string, 0, len(plan.NodeUpdate)+len(plan.NodeAllocation)) 206 for nodeID := range plan.NodeUpdate { 207 if _, ok := nodeIDs[nodeID]; !ok { 208 nodeIDs[nodeID] = struct{}{} 209 nodeIDList = append(nodeIDList, nodeID) 210 } 211 } 212 for nodeID := range plan.NodeAllocation { 213 if _, ok := nodeIDs[nodeID]; !ok { 214 nodeIDs[nodeID] = struct{}{} 215 nodeIDList = append(nodeIDList, nodeID) 216 } 217 } 218 219 // Setup a multierror to handle potentially getting many 220 // errors since we are processing in parallel. 221 var mErr multierror.Error 222 partialCommit := false 223 224 // handleResult is used to process the result of evaluateNodePlan 225 handleResult := func(nodeID string, fit bool, err error) (cancel bool) { 226 // Evaluate the plan for this node 227 if err != nil { 228 mErr.Errors = append(mErr.Errors, err) 229 return true 230 } 231 if !fit { 232 // Set that this is a partial commit 233 partialCommit = true 234 235 // If we require all-at-once scheduling, there is no point 236 // to continue the evaluation, as we've already failed. 237 if plan.AllAtOnce { 238 result.NodeUpdate = nil 239 result.NodeAllocation = nil 240 return true 241 } 242 243 // Skip this node, since it cannot be used. 244 return 245 } 246 247 // Add this to the plan result 248 if nodeUpdate := plan.NodeUpdate[nodeID]; len(nodeUpdate) > 0 { 249 result.NodeUpdate[nodeID] = nodeUpdate 250 } 251 if nodeAlloc := plan.NodeAllocation[nodeID]; len(nodeAlloc) > 0 { 252 result.NodeAllocation[nodeID] = nodeAlloc 253 } 254 return 255 } 256 257 // Get the pool channels 258 req := pool.RequestCh() 259 resp := pool.ResultCh() 260 outstanding := 0 261 didCancel := false 262 263 // Evalute each node in the plan, handling results as they are ready to 264 // avoid blocking. 265 for len(nodeIDList) > 0 { 266 nodeID := nodeIDList[0] 267 select { 268 case req <- evaluateRequest{snap, plan, nodeID}: 269 outstanding++ 270 nodeIDList = nodeIDList[1:] 271 case r := <-resp: 272 outstanding-- 273 274 // Handle a result that allows us to cancel evaluation, 275 // which may save time processing additional entries. 276 if cancel := handleResult(r.nodeID, r.fit, r.err); cancel { 277 didCancel = true 278 break 279 } 280 } 281 } 282 283 // Drain the remaining results 284 for outstanding > 0 { 285 r := <-resp 286 if !didCancel { 287 if cancel := handleResult(r.nodeID, r.fit, r.err); cancel { 288 didCancel = true 289 } 290 } 291 outstanding-- 292 } 293 294 // If the plan resulted in a partial commit, we need to determine 295 // a minimum refresh index to force the scheduler to work on a more 296 // up-to-date state to avoid the failures. 297 if partialCommit { 298 allocIndex, err := snap.Index("allocs") 299 if err != nil { 300 mErr.Errors = append(mErr.Errors, err) 301 } 302 nodeIndex, err := snap.Index("nodes") 303 if err != nil { 304 mErr.Errors = append(mErr.Errors, err) 305 } 306 result.RefreshIndex = maxUint64(nodeIndex, allocIndex) 307 308 if result.RefreshIndex == 0 { 309 err := fmt.Errorf("partialCommit with RefreshIndex of 0 (%d node, %d alloc)", nodeIndex, allocIndex) 310 mErr.Errors = append(mErr.Errors, err) 311 } 312 } 313 return result, mErr.ErrorOrNil() 314 } 315 316 // evaluateNodePlan is used to evalute the plan for a single node, 317 // returning if the plan is valid or if an error is encountered 318 func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID string) (bool, error) { 319 // If this is an evict-only plan, it always 'fits' since we are removing things. 320 if len(plan.NodeAllocation[nodeID]) == 0 { 321 return true, nil 322 } 323 324 // Get the node itself 325 node, err := snap.NodeByID(nodeID) 326 if err != nil { 327 return false, fmt.Errorf("failed to get node '%s': %v", nodeID, err) 328 } 329 330 // If the node does not exist or is not ready for schduling it is not fit 331 // XXX: There is a potential race between when we do this check and when 332 // the Raft commit happens. 333 if node == nil || node.Status != structs.NodeStatusReady || node.Drain { 334 return false, nil 335 } 336 337 // Get the existing allocations that are non-terminal 338 existingAlloc, err := snap.AllocsByNodeTerminal(nodeID, false) 339 if err != nil { 340 return false, fmt.Errorf("failed to get existing allocations for '%s': %v", nodeID, err) 341 } 342 343 // Determine the proposed allocation by first removing allocations 344 // that are planned evictions and adding the new allocations. 345 proposed := existingAlloc 346 var remove []*structs.Allocation 347 if update := plan.NodeUpdate[nodeID]; len(update) > 0 { 348 remove = append(remove, update...) 349 } 350 if updated := plan.NodeAllocation[nodeID]; len(updated) > 0 { 351 for _, alloc := range updated { 352 remove = append(remove, alloc) 353 } 354 } 355 proposed = structs.RemoveAllocs(existingAlloc, remove) 356 proposed = append(proposed, plan.NodeAllocation[nodeID]...) 357 358 // Check if these allocations fit 359 fit, _, _, err := structs.AllocsFit(node, proposed, nil) 360 return fit, err 361 }