github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/plan_apply.go

package nomad

import (
	"fmt"
	"log"
	"runtime"
	"time"

	"github.com/armon/go-metrics"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
)

// planApply is a long lived goroutine that reads plan allocations from
// the plan queue, determines if they can be applied safely and applies
// them via Raft.
//
// Naively, we could simply dequeue a plan, verify, apply and then respond.
// However, the plan application is bounded by the Raft apply time and
// subject to some latency. This creates a stall condition, where we are
// not evaluating, but simply waiting for a transaction to apply.
//
// To avoid this, we overlap verification with apply. This means once
// we've verified plan N we attempt to apply it. However, while waiting
// for apply, we begin to verify plan N+1 under the assumption that plan
// N has succeeded.
//
// In this sense, we track two parallel versions of the world. One is
// the pessimistic one driven by the Raft log which is replicated. The
// other is optimistic and assumes our transactions will succeed. In the
// happy path, this lets us do productive work during the latency of
// apply.
//
// In the unhappy path (Raft transaction fails), effectively we only
// wasted work during a time we would have been waiting anyways. However,
// in anticipation of this case we cannot respond to the plan until
// the Raft log is updated. This means our schedulers will stall,
// but there are many of those and only a single plan verifier.
func (s *Server) planApply() {
	// waitCh is used to track an outstanding application while snap
	// holds an optimistic state which includes that plan application.
	var waitCh chan struct{}
	var snap *state.StateSnapshot

	// Set up a worker pool with half the cores, and at least one worker
	poolSize := runtime.NumCPU() / 2
	if poolSize == 0 {
		poolSize = 1
	}
	pool := NewEvaluatePool(poolSize, workerPoolBufferSize)
	defer pool.Shutdown()

	for {
		// Pull the next pending plan, exit if we are no longer leader
		pending, err := s.planQueue.Dequeue(0)
		if err != nil {
			return
		}

		// Check if our last plan has completed
		select {
		case <-waitCh:
			waitCh = nil
			snap = nil
		default:
		}

		// Snapshot the state so that we have a consistent view of the world
		// if no snapshot is available
		if waitCh == nil || snap == nil {
			snap, err = s.fsm.State().Snapshot()
			if err != nil {
				s.logger.Printf("[ERR] nomad.planner: failed to snapshot state: %v", err)
				pending.respond(nil, err)
				continue
			}
		}

		// Evaluate the plan
		result, err := evaluatePlan(pool, snap, pending.plan, s.logger)
		if err != nil {
			s.logger.Printf("[ERR] nomad.planner: failed to evaluate plan: %v", err)
			pending.respond(nil, err)
			continue
		}

		// Fast-path the response if there is nothing to do
		if result.IsNoOp() {
			pending.respond(result, nil)
			continue
		}

		// Ensure any parallel apply is complete before starting the next one.
		// This also limits how out of date our snapshot can be.
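		// A non-nil waitCh means the previous plan is still being applied via
		// Raft; once it completes we take a fresh snapshot below so this plan
		// is dispatched against the committed state rather than the optimistic
		// snapshot taken before that apply finished.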
		if waitCh != nil {
			<-waitCh
			snap, err = s.fsm.State().Snapshot()
			if err != nil {
				s.logger.Printf("[ERR] nomad.planner: failed to snapshot state: %v", err)
				pending.respond(nil, err)
				continue
			}
		}

		// Dispatch the Raft transaction for the plan
		future, err := s.applyPlan(pending.plan, result, snap)
		if err != nil {
			s.logger.Printf("[ERR] nomad.planner: failed to submit plan: %v", err)
			pending.respond(nil, err)
			continue
		}

		// Respond to the plan asynchronously
		waitCh = make(chan struct{})
		go s.asyncPlanWait(waitCh, future, result, pending)
	}
}

// applyPlan is used to apply the plan result via Raft and to return the
// Raft future so the caller can wait on it and recover the alloc index.
func (s *Server) applyPlan(plan *structs.Plan, result *structs.PlanResult, snap *state.StateSnapshot) (raft.ApplyFuture, error) {
	// Determine the minimum number of updates; there could be more if there
	// are multiple updates per node
	minUpdates := len(result.NodeUpdate)
	minUpdates += len(result.NodeAllocation)

	// Set up the update request
	req := structs.ApplyPlanResultsRequest{
		AllocUpdateRequest: structs.AllocUpdateRequest{
			Job:   plan.Job,
			Alloc: make([]*structs.Allocation, 0, minUpdates),
		},
		Deployment:        result.Deployment,
		DeploymentUpdates: result.DeploymentUpdates,
		EvalID:            plan.EvalID,
	}
	for _, updateList := range result.NodeUpdate {
		req.Alloc = append(req.Alloc, updateList...)
	}
	for _, allocList := range result.NodeAllocation {
		req.Alloc = append(req.Alloc, allocList...)
	}

	// Set the time the alloc was applied for the first time. This can be used
	// to approximate the scheduling time.
	now := time.Now().UTC().UnixNano()
	for _, alloc := range req.Alloc {
		if alloc.CreateTime == 0 {
			alloc.CreateTime = now
		}
		alloc.ModifyTime = now
	}

	// Dispatch the Raft transaction
	future, err := s.raftApplyFuture(structs.ApplyPlanResultsRequestType, &req)
	if err != nil {
		return nil, err
	}

	// Optimistically apply to our state view
	if snap != nil {
		nextIdx := s.raft.AppliedIndex() + 1
		if err := snap.UpsertPlanResults(nextIdx, &req); err != nil {
			return future, err
		}
	}
	return future, nil
}

// asyncPlanWait waits for the Raft apply of a plan to finish and then
// responds to the plan asynchronously.
func (s *Server) asyncPlanWait(waitCh chan struct{}, future raft.ApplyFuture,
	result *structs.PlanResult, pending *pendingPlan) {
	defer metrics.MeasureSince([]string{"nomad", "plan", "apply"}, time.Now())
	defer close(waitCh)

	// Wait for the plan to apply
	if err := future.Error(); err != nil {
		s.logger.Printf("[ERR] nomad.planner: failed to apply plan: %v", err)
		pending.respond(nil, err)
		return
	}

	// Respond to the plan
	result.AllocIndex = future.Index()

	// If this is a partial plan application, we need to ensure the scheduler
	// at least has visibility into any placements it made to avoid double placement.
	// The RefreshIndex computed by evaluatePlan may be stale due to evaluation
	// against an optimistic copy of the state.
	if result.RefreshIndex != 0 {
		result.RefreshIndex = maxUint64(result.RefreshIndex, result.AllocIndex)
	}
	pending.respond(result, nil)
}

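// Illustrative sketch with hypothetical indexes (not part of the code above):
// if evaluatePlan computed RefreshIndex 40 against an optimistic snapshot and
// the partial plan later commits at Raft index 45, asyncPlanWait bumps the
// refresh target so the scheduler can see its own placements:
//
//	result.RefreshIndex = maxUint64(40, 45) // scheduler refreshes to index 45
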
// evaluatePlan is used to determine what portions of a plan can be applied,
// if any. It returns the plan result, which may be partial, or an error.
func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan, logger *log.Logger) (*structs.PlanResult, error) {
	defer metrics.MeasureSince([]string{"nomad", "plan", "evaluate"}, time.Now())

	// Check if the plan exceeds quota
	overQuota, err := evaluatePlanQuota(snap, plan)
	if err != nil {
		return nil, err
	}

	// Reject the plan and force the scheduler to refresh
	if overQuota {
		index, err := refreshIndex(snap)
		if err != nil {
			return nil, err
		}

		logger.Printf("[DEBUG] nomad.planner: plan for evaluation %q exceeds quota limit. Forcing refresh to %d", plan.EvalID, index)
		return &structs.PlanResult{RefreshIndex: index}, nil
	}

	return evaluatePlanPlacements(pool, snap, plan, logger)
}

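// Illustrative sketch of the worker pool interaction used below (hypothetical
// node ID, not part of the code in this file): evaluatePlanPlacements submits
// one evaluateRequest per node on pool.RequestCh() and drains the per-node
// results from pool.ResultCh(), interleaving sends and receives so that a
// full result buffer never blocks the submitter:
//
//	req <- evaluateRequest{snap, plan, "node-1"} // hand a node to a pool worker
//	r := <-resp                                  // r.fit, r.reason, r.err for that node
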
// evaluatePlanPlacements is used to determine what portions of a plan can be
// applied, if any, looking for node overcommitment. It returns the plan
// result, which may be partial, or an error.
func evaluatePlanPlacements(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan, logger *log.Logger) (*structs.PlanResult, error) {
	// Create a result holder for the plan
	result := &structs.PlanResult{
		NodeUpdate:        make(map[string][]*structs.Allocation),
		NodeAllocation:    make(map[string][]*structs.Allocation),
		Deployment:        plan.Deployment.Copy(),
		DeploymentUpdates: plan.DeploymentUpdates,
	}

	// Collect all the nodeIDs
	nodeIDs := make(map[string]struct{})
	nodeIDList := make([]string, 0, len(plan.NodeUpdate)+len(plan.NodeAllocation))
	for nodeID := range plan.NodeUpdate {
		if _, ok := nodeIDs[nodeID]; !ok {
			nodeIDs[nodeID] = struct{}{}
			nodeIDList = append(nodeIDList, nodeID)
		}
	}
	for nodeID := range plan.NodeAllocation {
		if _, ok := nodeIDs[nodeID]; !ok {
			nodeIDs[nodeID] = struct{}{}
			nodeIDList = append(nodeIDList, nodeID)
		}
	}

	// Set up a multierror to handle potentially getting many
	// errors since we are processing in parallel.
	var mErr multierror.Error
	partialCommit := false

	// handleResult is used to process the result of evaluateNodePlan
	handleResult := func(nodeID string, fit bool, reason string, err error) (cancel bool) {
		// Evaluate the plan for this node
		if err != nil {
			mErr.Errors = append(mErr.Errors, err)
			return true
		}
		if !fit {
			// Log the reason why the node's allocations could not be made
			if reason != "" {
				logger.Printf("[DEBUG] nomad.planner: plan for node %q rejected because: %v", nodeID, reason)
			}
			// Set that this is a partial commit
			partialCommit = true

			// If we require all-at-once scheduling, there is no point
			// to continue the evaluation, as we've already failed.
			if plan.AllAtOnce {
				result.NodeUpdate = nil
				result.NodeAllocation = nil
				result.DeploymentUpdates = nil
				result.Deployment = nil
				return true
			}

			// Skip this node, since it cannot be used.
			return
		}

		// Add this to the plan result
		if nodeUpdate := plan.NodeUpdate[nodeID]; len(nodeUpdate) > 0 {
			result.NodeUpdate[nodeID] = nodeUpdate
		}
		if nodeAlloc := plan.NodeAllocation[nodeID]; len(nodeAlloc) > 0 {
			result.NodeAllocation[nodeID] = nodeAlloc
		}
		return
	}

	// Get the pool channels
	req := pool.RequestCh()
	resp := pool.ResultCh()
	outstanding := 0
	didCancel := false

	// Evaluate each node in the plan, handling results as they are ready to
	// avoid blocking.
OUTER:
	for len(nodeIDList) > 0 {
		nodeID := nodeIDList[0]
		select {
		case req <- evaluateRequest{snap, plan, nodeID}:
			outstanding++
			nodeIDList = nodeIDList[1:]
		case r := <-resp:
			outstanding--

			// Handle a result that allows us to cancel evaluation,
			// which may save time processing additional entries.
			if cancel := handleResult(r.nodeID, r.fit, r.reason, r.err); cancel {
				didCancel = true
				break OUTER
			}
		}
	}

	// Drain the remaining results
	for outstanding > 0 {
		r := <-resp
		if !didCancel {
			if cancel := handleResult(r.nodeID, r.fit, r.reason, r.err); cancel {
				didCancel = true
			}
		}
		outstanding--
	}

	// If the plan resulted in a partial commit, we need to determine
	// a minimum refresh index to force the scheduler to work on a more
	// up-to-date state to avoid the failures.
	if partialCommit {
		index, err := refreshIndex(snap)
		if err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
		result.RefreshIndex = index

		if result.RefreshIndex == 0 {
			err := fmt.Errorf("partialCommit with RefreshIndex of 0")
			mErr.Errors = append(mErr.Errors, err)
		}

		// If there was a partial commit and we are operating within a
		// deployment, correct for any canaries that were meant to be
		// placed but were not actually placed.
		correctDeploymentCanaries(result)
	}
	return result, mErr.ErrorOrNil()
}

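// Illustrative sketch with hypothetical IDs (not part of the code in this
// file): if a deployment task group records PlacedCanaries ["c1", "c2", "c3"]
// but the partial commit only placed the allocations c1 and c3,
// correctDeploymentCanaries below prunes the slice in place so the deployment
// ends up with PlacedCanaries ["c1", "c3"].
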
// correctDeploymentCanaries ensures that the deployment object doesn't list any
// canaries as placed if they didn't actually get placed. This could happen if
// the plan had a partial commit.
func correctDeploymentCanaries(result *structs.PlanResult) {
	// Hot path
	if result.Deployment == nil || !result.Deployment.HasPlacedCanaries() {
		return
	}

	// Build a set of all the allocation IDs that were placed
	placedAllocs := make(map[string]struct{}, len(result.NodeAllocation))
	for _, placed := range result.NodeAllocation {
		for _, alloc := range placed {
			placedAllocs[alloc.ID] = struct{}{}
		}
	}

	// Go through all the canaries and ensure that the result list only contains
	// those that have been placed
	for _, group := range result.Deployment.TaskGroups {
		canaries := group.PlacedCanaries
		if len(canaries) == 0 {
			continue
		}

		// Prune the canaries in place to avoid allocating an extra slice
		i := 0
		for _, canaryID := range canaries {
			if _, ok := placedAllocs[canaryID]; ok {
				canaries[i] = canaryID
				i++
			}
		}

		group.PlacedCanaries = canaries[:i]
	}
}

// evaluateNodePlan is used to evaluate the plan for a single node, returning
// whether the plan fits, the reason if it does not, and any error encountered.
func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID string) (bool, string, error) {
	// If this is an evict-only plan, it always 'fits' since we are removing things.
	if len(plan.NodeAllocation[nodeID]) == 0 {
		return true, "", nil
	}

	// Get the node itself
	ws := memdb.NewWatchSet()
	node, err := snap.NodeByID(ws, nodeID)
	if err != nil {
		return false, "", fmt.Errorf("failed to get node '%s': %v", nodeID, err)
	}

	// If the node does not exist or is not ready for scheduling, it is not fit.
	// XXX: There is a potential race between when we do this check and when
	// the Raft commit happens.
	if node == nil {
		return false, "node does not exist", nil
	} else if node.Status != structs.NodeStatusReady {
		return false, "node is not ready for placements", nil
	} else if node.SchedulingEligibility == structs.NodeSchedulingIneligible {
		return false, "node is not eligible for draining", nil
	} else if node.Drain {
		// Deprecate in favor of scheduling eligibility and remove post-0.8
		return false, "node is draining", nil
	}

	// Get the existing allocations that are non-terminal
	existingAlloc, err := snap.AllocsByNodeTerminal(ws, nodeID, false)
	if err != nil {
		return false, "", fmt.Errorf("failed to get existing allocations for '%s': %v", nodeID, err)
	}

	// Determine the proposed allocation by first removing allocations
	// that are planned evictions and adding the new allocations.
	var remove []*structs.Allocation
	if update := plan.NodeUpdate[nodeID]; len(update) > 0 {
		remove = append(remove, update...)
	}
	if updated := plan.NodeAllocation[nodeID]; len(updated) > 0 {
		remove = append(remove, updated...)
	}
	proposed := structs.RemoveAllocs(existingAlloc, remove)
	proposed = append(proposed, plan.NodeAllocation[nodeID]...)

	// Check if these allocations fit
	fit, reason, _, err := structs.AllocsFit(node, proposed, nil)
	return fit, reason, err
}
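
// Illustrative sketch with hypothetical allocations (not part of the code in
// this file): if a node is running {a1, a2}, the plan evicts a1 and places an
// updated copy of a2 (call it a2') plus a new a3, then evaluateNodePlan builds
// the proposed set as
//
//	remove   := {a1, a2', a3}                          // planned evictions plus planned placements
//	proposed := structs.RemoveAllocs({a1, a2}, remove) // -> {}        removal matches by alloc ID
//	proposed  = append(proposed, a2', a3)              // -> {a2', a3}
//
// and finally asks structs.AllocsFit whether {a2', a3} fits on the node.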