github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/nomad/plan_apply.go

package nomad

import (
	"fmt"
	"log"
	"runtime"
	"time"

	"github.com/armon/go-metrics"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
)

// planApply is a long-lived goroutine that reads plan allocations from
// the plan queue, determines if they can be applied safely and applies
// them via Raft.
//
// Naively, we could simply dequeue a plan, verify it, apply it and then
// respond. However, plan application is bounded by the Raft apply time and
// subject to some latency. This creates a stall condition, where we are
// not evaluating, but simply waiting for a transaction to apply.
//
// To avoid this, we overlap verification with apply. This means once
// we've verified plan N we attempt to apply it. However, while waiting
// for apply, we begin to verify plan N+1 under the assumption that plan
// N has succeeded.
//
// In this sense, we track two parallel versions of the world. One is
// the pessimistic one driven by the Raft log which is replicated. The
// other is optimistic and assumes our transactions will succeed. In the
// happy path, this lets us do productive work during the latency of
// apply.
//
// In the unhappy path (Raft transaction fails), effectively we only
// wasted work during a time we would have been waiting anyway. However,
// in anticipation of this case we cannot respond to the plan until
// the Raft log is updated. This means our schedulers will stall,
// but there are many of those and only a single plan verifier.
func (s *Server) planApply() {
	// waitCh is used to track an outstanding application while snap
	// holds an optimistic state which includes that plan application.
	var waitCh chan struct{}
	var snap *state.StateSnapshot

	// Set up a worker pool using half the cores, with at least one worker
	poolSize := runtime.NumCPU() / 2
	if poolSize == 0 {
		poolSize = 1
	}
	pool := NewEvaluatePool(poolSize, workerPoolBufferSize)
	defer pool.Shutdown()

	for {
		// Pull the next pending plan, exit if we are no longer leader
		pending, err := s.planQueue.Dequeue(0)
		if err != nil {
			return
		}

		// Check if our last plan has completed
		select {
		case <-waitCh:
			waitCh = nil
			snap = nil
		default:
		}

		// Snapshot the state so that we have a consistent view of the world,
		// if we don't already have a valid optimistic snapshot
		if waitCh == nil || snap == nil {
			snap, err = s.fsm.State().Snapshot()
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to snapshot state: %v", err)
				pending.respond(nil, err)
				continue
			}
		}

		// Evaluate the plan
		result, err := evaluatePlan(pool, snap, pending.plan, s.logger)
		if err != nil {
			s.logger.Printf("[ERR] nomad: failed to evaluate plan: %v", err)
			pending.respond(nil, err)
			continue
		}

		// Fast-path the response if there is nothing to do
		if result.IsNoOp() {
			pending.respond(result, nil)
			continue
		}
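
		// In the happy path, successive iterations of this loop form a
		// two-stage pipeline; a rough sketch of the overlap (an
		// illustration of the intent, not an exact trace of calls):
		//
		//	evaluate(N) -> applyPlan(N) --- Raft apply ---> respond(N)
		//	                  evaluate(N+1) against the optimistic snap
		//
		// Verification of plan N+1 runs during the Raft latency of plan N,
		// which is the overlap described in the comment on planApply.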

		// Ensure any parallel apply is complete before starting the next one.
		// This also limits how out of date our snapshot can be.
		if waitCh != nil {
			<-waitCh
			snap, err = s.fsm.State().Snapshot()
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to snapshot state: %v", err)
				pending.respond(nil, err)
				continue
			}
		}

		// Dispatch the Raft transaction for the plan
		future, err := s.applyPlan(pending.plan, result, snap)
		if err != nil {
			s.logger.Printf("[ERR] nomad: failed to submit plan: %v", err)
			pending.respond(nil, err)
			continue
		}

		// Respond to the plan asynchronously
		waitCh = make(chan struct{})
		go s.asyncPlanWait(waitCh, future, result, pending)
	}
}

// applyPlan is used to apply the plan result and to return the alloc index
func (s *Server) applyPlan(plan *structs.Plan, result *structs.PlanResult, snap *state.StateSnapshot) (raft.ApplyFuture, error) {
	// Determine the minimum number of updates; there could be more if there
	// are multiple updates per node
	minUpdates := len(result.NodeUpdate)
	minUpdates += len(result.NodeAllocation)

	// Set up the update request
	req := structs.ApplyPlanResultsRequest{
		AllocUpdateRequest: structs.AllocUpdateRequest{
			Job:   plan.Job,
			Alloc: make([]*structs.Allocation, 0, minUpdates),
		},
		Deployment:        result.Deployment,
		DeploymentUpdates: result.DeploymentUpdates,
	}
	for _, updateList := range result.NodeUpdate {
		req.Alloc = append(req.Alloc, updateList...)
	}
	for _, allocList := range result.NodeAllocation {
		req.Alloc = append(req.Alloc, allocList...)
	}

	// Set the time the alloc was applied for the first time. This can be used
	// to approximate the scheduling time.
	now := time.Now().UTC().UnixNano()
	for _, alloc := range req.Alloc {
		if alloc.CreateTime == 0 {
			alloc.CreateTime = now
		}
	}

	// Dispatch the Raft transaction
	future, err := s.raftApplyFuture(structs.ApplyPlanResultsRequestType, &req)
	if err != nil {
		return nil, err
	}

	// Optimistically apply to our state view
	if snap != nil {
		nextIdx := s.raft.AppliedIndex() + 1
		if err := snap.UpsertPlanResults(nextIdx, &req); err != nil {
			return future, err
		}
	}
	return future, nil
}

// asyncPlanWait is used to apply and respond to a plan asynchronously
func (s *Server) asyncPlanWait(waitCh chan struct{}, future raft.ApplyFuture,
	result *structs.PlanResult, pending *pendingPlan) {
	defer metrics.MeasureSince([]string{"nomad", "plan", "apply"}, time.Now())
	defer close(waitCh)

	// Wait for the plan to apply
	if err := future.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to apply plan: %v", err)
		pending.respond(nil, err)
		return
	}

	// Respond to the plan
	result.AllocIndex = future.Index()

	// If this is a partial plan application, we need to ensure the scheduler
	// at least has visibility into any placements it made to avoid double placement.
	// The RefreshIndex computed by evaluatePlan may be stale due to evaluation
	// against an optimistic copy of the state.
	if result.RefreshIndex != 0 {
		result.RefreshIndex = maxUint64(result.RefreshIndex, result.AllocIndex)
	}
	pending.respond(result, nil)
}
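
// A worked example of the clamp in asyncPlanWait: if evaluatePlan computed
// RefreshIndex 100 from the optimistic snapshot but this partial plan
// committed at Raft index 105, a scheduler that refreshed only to index 100
// would not see the placements the plan just made and could place them twice.
// Clamping to max(100, 105) = 105 ensures the refreshed state includes this
// plan's own writes.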

// evaluatePlan is used to determine what portions of a plan, if any, can be
// applied. It returns the resulting plan application, which may be partial,
// or an error.
func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan, logger *log.Logger) (*structs.PlanResult, error) {
	defer metrics.MeasureSince([]string{"nomad", "plan", "evaluate"}, time.Now())

	// Create a result holder for the plan
	result := &structs.PlanResult{
		NodeUpdate:        make(map[string][]*structs.Allocation),
		NodeAllocation:    make(map[string][]*structs.Allocation),
		Deployment:        plan.Deployment.Copy(),
		DeploymentUpdates: plan.DeploymentUpdates,
	}

	// Collect all the nodeIDs
	nodeIDs := make(map[string]struct{})
	nodeIDList := make([]string, 0, len(plan.NodeUpdate)+len(plan.NodeAllocation))
	for nodeID := range plan.NodeUpdate {
		if _, ok := nodeIDs[nodeID]; !ok {
			nodeIDs[nodeID] = struct{}{}
			nodeIDList = append(nodeIDList, nodeID)
		}
	}
	for nodeID := range plan.NodeAllocation {
		if _, ok := nodeIDs[nodeID]; !ok {
			nodeIDs[nodeID] = struct{}{}
			nodeIDList = append(nodeIDList, nodeID)
		}
	}

	// Set up a multierror to handle potentially getting many
	// errors since we are processing in parallel.
	var mErr multierror.Error
	partialCommit := false

	// handleResult is used to process the result of evaluateNodePlan
	handleResult := func(nodeID string, fit bool, reason string, err error) (cancel bool) {
		// An error during evaluation is fatal to the entire plan
		if err != nil {
			mErr.Errors = append(mErr.Errors, err)
			return true
		}
		if !fit {
			// Log the reason why the node's allocations could not be made
			if reason != "" {
				logger.Printf("[DEBUG] nomad: plan for node %q rejected because: %v", nodeID, reason)
			}
			// Set that this is a partial commit
			partialCommit = true

			// If we require all-at-once scheduling, there is no point in
			// continuing the evaluation, as we have already failed.
			if plan.AllAtOnce {
				result.NodeUpdate = nil
				result.NodeAllocation = nil
				result.DeploymentUpdates = nil
				result.Deployment = nil
				return true
			}

			// Skip this node, since it cannot be used.
			return
		}

		// Add this to the plan result
		if nodeUpdate := plan.NodeUpdate[nodeID]; len(nodeUpdate) > 0 {
			result.NodeUpdate[nodeID] = nodeUpdate
		}
		if nodeAlloc := plan.NodeAllocation[nodeID]; len(nodeAlloc) > 0 {
			result.NodeAllocation[nodeID] = nodeAlloc
		}
		return
	}
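
	// The dispatch loop below interleaves sending work and receiving results
	// in a single select; a sketch of the pattern:
	//
	//	for nodes remain:
	//		select:
	//			send next request  -> outstanding++
	//			receive a result   -> outstanding--, handle (may cancel)
	//	then drain the outstanding results
	//
	// Being ready to receive while sending means a full request channel can
	// never deadlock the loop: workers always have a consumer for results.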

	// Get the pool channels
	req := pool.RequestCh()
	resp := pool.ResultCh()
	outstanding := 0
	didCancel := false

	// Evaluate each node in the plan, handling results as they are ready to
	// avoid blocking.
OUTER:
	for len(nodeIDList) > 0 {
		nodeID := nodeIDList[0]
		select {
		case req <- evaluateRequest{snap, plan, nodeID}:
			outstanding++
			nodeIDList = nodeIDList[1:]
		case r := <-resp:
			outstanding--

			// Handle a result that allows us to cancel evaluation,
			// which may save time processing additional entries.
			if cancel := handleResult(r.nodeID, r.fit, r.reason, r.err); cancel {
				didCancel = true
				break OUTER
			}
		}
	}

	// Drain the remaining results
	for outstanding > 0 {
		r := <-resp
		if !didCancel {
			if cancel := handleResult(r.nodeID, r.fit, r.reason, r.err); cancel {
				didCancel = true
			}
		}
		outstanding--
	}

	// If the plan resulted in a partial commit, we need to determine
	// a minimum refresh index to force the scheduler to work on a more
	// up-to-date state to avoid the failures.
	if partialCommit {
		allocIndex, err := snap.Index("allocs")
		if err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
		nodeIndex, err := snap.Index("nodes")
		if err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
		result.RefreshIndex = maxUint64(nodeIndex, allocIndex)

		if result.RefreshIndex == 0 {
			err := fmt.Errorf("partialCommit with RefreshIndex of 0 (%d node, %d alloc)", nodeIndex, allocIndex)
			mErr.Errors = append(mErr.Errors, err)
		}

		// If there was a partial commit and we are operating within a
		// deployment, correct for any canary that may have been desired to be
		// placed but wasn't actually placed
		correctDeploymentCanaries(result)
	}
	return result, mErr.ErrorOrNil()
}

// correctDeploymentCanaries ensures that the deployment object doesn't list any
// canaries as placed if they didn't actually get placed. This could happen if
// the plan had a partial commit.
func correctDeploymentCanaries(result *structs.PlanResult) {
	// Hot path
	if result.Deployment == nil || !result.Deployment.HasPlacedCanaries() {
		return
	}

	// Build a set of all the allocation IDs that were placed
	placedAllocs := make(map[string]struct{}, len(result.NodeAllocation))
	for _, placed := range result.NodeAllocation {
		for _, alloc := range placed {
			placedAllocs[alloc.ID] = struct{}{}
		}
	}

	// Go through all the canaries and ensure that the result list only contains
	// those that have been placed
	for _, group := range result.Deployment.TaskGroups {
		canaries := group.PlacedCanaries
		if len(canaries) == 0 {
			continue
		}

		// Prune the canaries in place to avoid allocating an extra slice
		i := 0
		for _, canaryID := range canaries {
			if _, ok := placedAllocs[canaryID]; ok {
				canaries[i] = canaryID
				i++
			}
		}

		group.PlacedCanaries = canaries[:i]
	}
}
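
// The prune above uses the standard in-place filter idiom (see the Go wiki's
// SliceTricks): keep a write index i, copy retained elements forward, then
// reslice to canaries[:i]. No new slice is allocated and the relative order
// of the kept canaries is preserved.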

// evaluateNodePlan is used to evaluate the plan for a single node,
// returning if the plan is valid or if an error is encountered
func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID string) (bool, string, error) {
	// If this is an evict-only plan, it always 'fits' since we are removing things.
	if len(plan.NodeAllocation[nodeID]) == 0 {
		return true, "", nil
	}

	// Get the node itself
	ws := memdb.NewWatchSet()
	node, err := snap.NodeByID(ws, nodeID)
	if err != nil {
		return false, "", fmt.Errorf("failed to get node '%s': %v", nodeID, err)
	}

	// If the node does not exist or is not ready for scheduling it is not fit
	// XXX: There is a potential race between when we do this check and when
	// the Raft commit happens.
	if node == nil {
		return false, "node does not exist", nil
	} else if node.Status != structs.NodeStatusReady {
		return false, "node is not ready for placements", nil
	} else if node.Drain {
		return false, "node is draining", nil
	}

	// Get the existing allocations that are non-terminal
	existingAlloc, err := snap.AllocsByNodeTerminal(ws, nodeID, false)
	if err != nil {
		return false, "", fmt.Errorf("failed to get existing allocations for '%s': %v", nodeID, err)
	}

	// Determine the proposed allocation by first removing allocations
	// that are planned evictions and adding the new allocations.
	var remove []*structs.Allocation
	if update := plan.NodeUpdate[nodeID]; len(update) > 0 {
		remove = append(remove, update...)
	}
	// Also remove any allocations the plan is placing on this node, so an
	// updated version of an existing allocation is not double counted.
	if updated := plan.NodeAllocation[nodeID]; len(updated) > 0 {
		remove = append(remove, updated...)
	}
	proposed := structs.RemoveAllocs(existingAlloc, remove)
	proposed = append(proposed, plan.NodeAllocation[nodeID]...)

	// Check if these allocations fit
	fit, reason, _, err := structs.AllocsFit(node, proposed, nil)
	return fit, reason, err
}
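
// For example, if node n1 currently runs non-terminal allocs {a, b}, and the
// plan evicts a (NodeUpdate) while placing c (NodeAllocation), the proposed
// set handed to AllocsFit is {b, c}: a is dropped as a planned eviction, and
// c is first removed from the existing set (in case it is an updated version
// of an allocation already on the node) and then re-added from the plan.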