github.com/diptanu/nomad@v0.5.7-0.20170516172507-d72e86cbe3d9/nomad/plan_apply.go

package nomad

import (
	"fmt"
	"runtime"
	"time"

	"github.com/armon/go-metrics"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
)

// planApply is a long lived goroutine that reads plan allocations from
// the plan queue, determines if they can be applied safely and applies
// them via Raft.
//
// Naively, we could simply dequeue a plan, verify, apply and then respond.
// However, the plan application is bounded by the Raft apply time and
// subject to some latency. This creates a stall condition, where we are
// not evaluating, but simply waiting for a transaction to apply.
//
// To avoid this, we overlap verification with apply. This means once
// we've verified plan N we attempt to apply it. However, while waiting
// for apply, we begin to verify plan N+1 under the assumption that plan
// N has succeeded.
//
// In this sense, we track two parallel versions of the world. One is
// the pessimistic one driven by the Raft log which is replicated. The
// other is optimistic and assumes our transactions will succeed. In the
// happy path, this lets us do productive work during the latency of
// apply.
//
// In the unhappy path (Raft transaction fails), effectively we only
// wasted work during a time we would have been waiting anyways. However,
// in anticipation of this case we cannot respond to the plan until
// the Raft log is updated. This means our schedulers will stall,
// but there are many of those and only a single plan verifier.
func (s *Server) planApply() {
	// waitCh is used to track an outstanding application while snap
	// holds an optimistic state which includes that plan application.
	var waitCh chan struct{}
	var snap *state.StateSnapshot

	// Setup a worker pool with half the cores, with at least 1
	poolSize := runtime.NumCPU() / 2
	if poolSize == 0 {
		poolSize = 1
	}
	pool := NewEvaluatePool(poolSize, workerPoolBufferSize)
	defer pool.Shutdown()

	for {
		// Pull the next pending plan, exit if we are no longer leader
		pending, err := s.planQueue.Dequeue(0)
		if err != nil {
			return
		}

		// Check if our last plan has completed
		select {
		case <-waitCh:
			waitCh = nil
			snap = nil
		default:
		}

		// Snapshot the state so that we have a consistent view of the world
		// if no optimistic snapshot is available
		if waitCh == nil || snap == nil {
			snap, err = s.fsm.State().Snapshot()
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to snapshot state: %v", err)
				pending.respond(nil, err)
				continue
			}
		}

		// Evaluate the plan
		result, err := evaluatePlan(pool, snap, pending.plan)
		if err != nil {
			s.logger.Printf("[ERR] nomad: failed to evaluate plan: %v", err)
			pending.respond(nil, err)
			continue
		}

		// Fast-path the response if there is nothing to do
		if result.IsNoOp() {
			pending.respond(result, nil)
			continue
		}

		// Ensure any parallel apply is complete before starting the next one.
		// This also limits how out of date our snapshot can be.
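		// NOTE: once the outstanding apply finishes we re-snapshot below
		// rather than reusing the optimistic snapshot, so the next plan is
		// dispatched against state the cluster has actually committed.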
		if waitCh != nil {
			<-waitCh
			snap, err = s.fsm.State().Snapshot()
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to snapshot state: %v", err)
				pending.respond(nil, err)
				continue
			}
		}

		// Dispatch the Raft transaction for the plan
		future, err := s.applyPlan(pending.plan, result, snap)
		if err != nil {
			s.logger.Printf("[ERR] nomad: failed to submit plan: %v", err)
			pending.respond(nil, err)
			continue
		}

		// Respond to the plan asynchronously
		waitCh = make(chan struct{})
		go s.asyncPlanWait(waitCh, future, result, pending)
	}
}

// applyPlan is used to apply the plan result and to return the alloc index
func (s *Server) applyPlan(plan *structs.Plan, result *structs.PlanResult, snap *state.StateSnapshot) (raft.ApplyFuture, error) {
	// Determine the minimum number of updates; there could be more if there
	// are multiple updates per node
	minUpdates := len(result.NodeUpdate)
	minUpdates += len(result.NodeAllocation)

	// Grab the job
	job := plan.Job

	// Setup the update request
	req := structs.ApplyPlanResultsRequest{
		AllocUpdateRequest: structs.AllocUpdateRequest{
			Job:   job,
			Alloc: make([]*structs.Allocation, 0, minUpdates),
		},
		CreatedDeployment: plan.CreatedDeployment,
		DeploymentUpdates: plan.DeploymentUpdates,
	}
	for _, updateList := range result.NodeUpdate {
		req.Alloc = append(req.Alloc, updateList...)
	}
	for _, allocList := range result.NodeAllocation {
		req.Alloc = append(req.Alloc, allocList...)
	}

	// Set the time the alloc was applied for the first time. This can be used
	// to approximate the scheduling time.
	now := time.Now().UTC().UnixNano()
	for _, alloc := range req.Alloc {
		if alloc.CreateTime == 0 {
			alloc.CreateTime = now
		}
	}

	// Dispatch the Raft transaction
	future, err := s.raftApplyFuture(structs.ApplyPlanResultsRequestType, &req)
	if err != nil {
		return nil, err
	}

	// Optimistically apply to our state view
	if snap != nil {
		nextIdx := s.raft.AppliedIndex() + 1
		if err := snap.UpsertPlanResults(nextIdx, &req); err != nil {
			return future, err
		}
	}
	return future, nil
}

// asyncPlanWait is used to apply and respond to a plan asynchronously
func (s *Server) asyncPlanWait(waitCh chan struct{}, future raft.ApplyFuture,
	result *structs.PlanResult, pending *pendingPlan) {
	defer metrics.MeasureSince([]string{"nomad", "plan", "apply"}, time.Now())
	defer close(waitCh)

	// Wait for the plan to apply
	if err := future.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to apply plan: %v", err)
		pending.respond(nil, err)
		return
	}

	// Respond to the plan
	result.AllocIndex = future.Index()

	// If this is a partial plan application, we need to ensure the scheduler
	// at least has visibility into any placements it made to avoid double placement.
	// The RefreshIndex computed by evaluatePlan may be stale due to evaluation
	// against an optimistic copy of the state.
	if result.RefreshIndex != 0 {
		result.RefreshIndex = maxUint64(result.RefreshIndex, result.AllocIndex)
	}
	pending.respond(result, nil)
}
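
// The evaluate pool request/response types are defined alongside
// NewEvaluatePool elsewhere in this package. Their assumed shapes,
// inferred from the call sites in evaluatePlan below, are roughly:
//
//	type evaluateRequest struct {
//		snap   *state.StateSnapshot
//		plan   *structs.Plan
//		nodeID string
//	}
//
//	type evaluateResult struct {
//		nodeID string
//		fit    bool
//		err    error
//	}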

// evaluatePlan is used to determine what portions of a plan, if any,
// can be applied. It returns the plan result, which may be a partial
// application, or an error.
func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan) (*structs.PlanResult, error) {
	defer metrics.MeasureSince([]string{"nomad", "plan", "evaluate"}, time.Now())

	// Create a result holder for the plan
	result := &structs.PlanResult{
		NodeUpdate:     make(map[string][]*structs.Allocation),
		NodeAllocation: make(map[string][]*structs.Allocation),
	}

	// Collect all the nodeIDs
	nodeIDs := make(map[string]struct{})
	nodeIDList := make([]string, 0, len(plan.NodeUpdate)+len(plan.NodeAllocation))
	for nodeID := range plan.NodeUpdate {
		if _, ok := nodeIDs[nodeID]; !ok {
			nodeIDs[nodeID] = struct{}{}
			nodeIDList = append(nodeIDList, nodeID)
		}
	}
	for nodeID := range plan.NodeAllocation {
		if _, ok := nodeIDs[nodeID]; !ok {
			nodeIDs[nodeID] = struct{}{}
			nodeIDList = append(nodeIDList, nodeID)
		}
	}

	// Setup a multierror to handle potentially getting many
	// errors since we are processing in parallel.
	var mErr multierror.Error
	partialCommit := false

	// handleResult is used to process the result of evaluateNodePlan
	handleResult := func(nodeID string, fit bool, err error) (cancel bool) {
		// Evaluate the plan for this node
		if err != nil {
			mErr.Errors = append(mErr.Errors, err)
			return true
		}
		if !fit {
			// Set that this is a partial commit
			partialCommit = true

			// If we require all-at-once scheduling, there is no point
			// to continue the evaluation, as we've already failed.
			if plan.AllAtOnce {
				result.NodeUpdate = nil
				result.NodeAllocation = nil
				return true
			}

			// Skip this node, since it cannot be used.
			return
		}

		// Add this to the plan result
		if nodeUpdate := plan.NodeUpdate[nodeID]; len(nodeUpdate) > 0 {
			result.NodeUpdate[nodeID] = nodeUpdate
		}
		if nodeAlloc := plan.NodeAllocation[nodeID]; len(nodeAlloc) > 0 {
			result.NodeAllocation[nodeID] = nodeAlloc
		}
		return
	}

	// Get the pool channels
	req := pool.RequestCh()
	resp := pool.ResultCh()
	outstanding := 0
	didCancel := false

	// Evaluate each node in the plan, handling results as they are ready to
	// avoid blocking.
OUTER:
	for len(nodeIDList) > 0 {
		nodeID := nodeIDList[0]
		select {
		case req <- evaluateRequest{snap, plan, nodeID}:
			outstanding++
			nodeIDList = nodeIDList[1:]
		case r := <-resp:
			outstanding--

			// Handle a result that allows us to cancel evaluation,
			// which may save time processing additional entries.
			if cancel := handleResult(r.nodeID, r.fit, r.err); cancel {
				didCancel = true
				break OUTER
			}
		}
	}

	// Drain the remaining results
	for outstanding > 0 {
		r := <-resp
		if !didCancel {
			if cancel := handleResult(r.nodeID, r.fit, r.err); cancel {
				didCancel = true
			}
		}
		outstanding--
	}

	// If the plan resulted in a partial commit, we need to determine
	// a minimum refresh index to force the scheduler to work on a more
	// up-to-date state to avoid the failures.
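	// The refresh index comes from the snapshot's node and alloc table
	// indexes, so a retrying scheduler is forced to wait until its own
	// state is at least as fresh as the state this evaluation ran against.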
	if partialCommit {
		allocIndex, err := snap.Index("allocs")
		if err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
		nodeIndex, err := snap.Index("nodes")
		if err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
		result.RefreshIndex = maxUint64(nodeIndex, allocIndex)

		if result.RefreshIndex == 0 {
			err := fmt.Errorf("partialCommit with RefreshIndex of 0 (%d node, %d alloc)", nodeIndex, allocIndex)
			mErr.Errors = append(mErr.Errors, err)
		}
	}
	return result, mErr.ErrorOrNil()
}

// evaluateNodePlan is used to evaluate the plan for a single node,
// returning if the plan is valid or if an error is encountered
func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID string) (bool, error) {
	// If this is an evict-only plan, it always 'fits' since we are removing things.
	if len(plan.NodeAllocation[nodeID]) == 0 {
		return true, nil
	}

	// Get the node itself
	ws := memdb.NewWatchSet()
	node, err := snap.NodeByID(ws, nodeID)
	if err != nil {
		return false, fmt.Errorf("failed to get node '%s': %v", nodeID, err)
	}

	// If the node does not exist or is not ready for scheduling it is not fit
	// XXX: There is a potential race between when we do this check and when
	// the Raft commit happens.
	if node == nil || node.Status != structs.NodeStatusReady || node.Drain {
		return false, nil
	}

	// Get the existing allocations that are non-terminal
	existingAlloc, err := snap.AllocsByNodeTerminal(ws, nodeID, false)
	if err != nil {
		return false, fmt.Errorf("failed to get existing allocations for '%s': %v", nodeID, err)
	}

	// Determine the proposed allocation by first removing allocations
	// that are planned evictions or being updated, then adding the new
	// allocations.
	var remove []*structs.Allocation
	if update := plan.NodeUpdate[nodeID]; len(update) > 0 {
		remove = append(remove, update...)
	}
	if updated := plan.NodeAllocation[nodeID]; len(updated) > 0 {
		// Updated allocations replace any existing version of themselves.
		remove = append(remove, updated...)
	}
	proposed := structs.RemoveAllocs(existingAlloc, remove)
	proposed = append(proposed, plan.NodeAllocation[nodeID]...)

	// Check if these allocations fit
	fit, _, _, err := structs.AllocsFit(node, proposed, nil)
	return fit, err
}
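
// maxUint64 is not defined in this file; in Nomad it lives in a shared
// util file within this package. Its assumed shape, consistent with the
// call sites above, is roughly:
//
//	func maxUint64(a, b uint64) uint64 {
//		if a >= b {
//			return a
//		}
//		return b
//	}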