github.com/djenriquez/nomad-1@v0.8.1/nomad/drainer/drainer.go

package drainer

import (
    "context"
    "log"
    "sync"
    "time"

    "github.com/hashicorp/nomad/helper"
    "github.com/hashicorp/nomad/helper/uuid"
    "github.com/hashicorp/nomad/nomad/state"
    "github.com/hashicorp/nomad/nomad/structs"
    "golang.org/x/time/rate"
)

var (
    // stateReadErrorDelay is the delay to apply before retrying reading state
    // when there is an error
    stateReadErrorDelay = 1 * time.Second
)

const (
    // LimitStateQueriesPerSecond is the number of state queries allowed per
    // second
    LimitStateQueriesPerSecond = 100.0

    // BatchUpdateInterval is how long we wait to batch updates
    BatchUpdateInterval = 1 * time.Second

    // NodeDeadlineCoalesceWindow is the duration in which deadlining nodes will
    // be coalesced together
    NodeDeadlineCoalesceWindow = 5 * time.Second
)

// RaftApplier contains methods for applying the raft requests required by the
// NodeDrainer.
type RaftApplier interface {
    AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) (uint64, error)
    NodesDrainComplete(nodes []string) (uint64, error)
}

// NodeTracker is the interface to notify an object that is tracking draining
// nodes of changes
type NodeTracker interface {
    // TrackedNodes returns all the nodes that are currently tracked as
    // draining.
    TrackedNodes() map[string]*structs.Node

    // Remove removes a node from the draining set.
    Remove(nodeID string)

    // Update either updates the specification of a draining node or tracks the
    // node as draining.
    Update(node *structs.Node)
}

// DrainingJobWatcherFactory returns a new DrainingJobWatcher
type DrainingJobWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger) DrainingJobWatcher

// DrainingNodeWatcherFactory returns a new DrainingNodeWatcher
type DrainingNodeWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger, NodeTracker) DrainingNodeWatcher

// DrainDeadlineNotifierFactory returns a new DrainDeadlineNotifier
type DrainDeadlineNotifierFactory func(context.Context) DrainDeadlineNotifier

// GetDrainingJobWatcher returns a draining job watcher
func GetDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger) DrainingJobWatcher {
    return NewDrainingJobWatcher(ctx, limiter, state, logger)
}

// GetDeadlineNotifier returns a node deadline notifier with default coalescing.
func GetDeadlineNotifier(ctx context.Context) DrainDeadlineNotifier {
    return NewDeadlineHeap(ctx, NodeDeadlineCoalesceWindow)
}

// GetNodeWatcherFactory returns a DrainingNodeWatcherFactory
func GetNodeWatcherFactory() DrainingNodeWatcherFactory {
    return func(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger, tracker NodeTracker) DrainingNodeWatcher {
        return NewNodeDrainWatcher(ctx, limiter, state, logger, tracker)
    }
}
// allocMigrateBatcher is used to batch allocation updates.
type allocMigrateBatcher struct {
    // updates holds pending client status updates for allocations
    updates []*structs.Allocation

    // updateFuture is used to wait for the pending batch update
    // to complete. This may be nil if no batch is pending.
    updateFuture *structs.BatchFuture

    // updateTimer is the timer that will trigger the next batch
    // update, and may be nil if there is no batch pending.
    updateTimer *time.Timer

    batchWindow time.Duration

    // synchronizes access to the updates list, the future and the timer.
    sync.Mutex
}

// NodeDrainerConfig is used to configure a new node drainer.
type NodeDrainerConfig struct {
    Logger               *log.Logger
    Raft                 RaftApplier
    JobFactory           DrainingJobWatcherFactory
    NodeFactory          DrainingNodeWatcherFactory
    DrainDeadlineFactory DrainDeadlineNotifierFactory

    // StateQueriesPerSecond configures the query limit against the state store
    // that is allowed by the node drainer.
    StateQueriesPerSecond float64

    // BatchUpdateInterval is the interval in which allocation updates are
    // batched.
    BatchUpdateInterval time.Duration
}

// NodeDrainer is used to orchestrate migrating allocations off of draining
// nodes.
type NodeDrainer struct {
    enabled bool
    logger  *log.Logger

    // nodes is the set of draining nodes
    nodes map[string]*drainingNode

    // nodeWatcher watches for nodes to transition in and out of drain state.
    nodeWatcher DrainingNodeWatcher
    nodeFactory DrainingNodeWatcherFactory

    // jobWatcher watches draining jobs and emits desired drains and notifies
    // when migrations take place.
    jobWatcher DrainingJobWatcher
    jobFactory DrainingJobWatcherFactory

    // deadlineNotifier notifies when nodes reach their drain deadline.
    deadlineNotifier        DrainDeadlineNotifier
    deadlineNotifierFactory DrainDeadlineNotifierFactory

    // state is the state that is watched for state changes.
    state *state.StateStore

    // queryLimiter is used to limit the rate of blocking queries
    queryLimiter *rate.Limiter

    // raft is a shim around the raft messages necessary for draining
    raft RaftApplier

    // batcher is used to batch alloc migrations.
    batcher allocMigrateBatcher

    // ctx and exitFn are used to cancel the watcher
    ctx    context.Context
    exitFn context.CancelFunc

    l sync.RWMutex
}

// NewNodeDrainer returns a new node drainer. The node drainer is
// responsible for marking allocations on draining nodes with a desired
// migration transition, updating the drain strategy on nodes when they are
// complete and creating evaluations for the system to react to these changes.
func NewNodeDrainer(c *NodeDrainerConfig) *NodeDrainer {
    return &NodeDrainer{
        raft:                    c.Raft,
        logger:                  c.Logger,
        jobFactory:              c.JobFactory,
        nodeFactory:             c.NodeFactory,
        deadlineNotifierFactory: c.DrainDeadlineFactory,
        queryLimiter:            rate.NewLimiter(rate.Limit(c.StateQueriesPerSecond), 100),
        batcher: allocMigrateBatcher{
            batchWindow: c.BatchUpdateInterval,
        },
    }
}
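// A minimal construction sketch (hypothetical caller; raftShim, logger, and
// stateStore are assumptions and not defined in this package). It wires the
// default factories exported above into a NodeDrainerConfig:
//
//    drainer := NewNodeDrainer(&NodeDrainerConfig{
//        Logger:                logger,
//        Raft:                  raftShim,
//        JobFactory:            GetDrainingJobWatcher,
//        NodeFactory:           GetNodeWatcherFactory(),
//        DrainDeadlineFactory:  GetDeadlineNotifier,
//        StateQueriesPerSecond: LimitStateQueriesPerSecond,
//        BatchUpdateInterval:   BatchUpdateInterval,
//    })
//    drainer.SetEnabled(true, stateStore)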
// SetEnabled will start or stop the node draining goroutine depending on the
// enabled boolean.
func (n *NodeDrainer) SetEnabled(enabled bool, state *state.StateStore) {
    n.l.Lock()
    defer n.l.Unlock()

    // If we are starting now or have a new state, init state and start the
    // run loop
    n.enabled = enabled
    if enabled {
        n.flush(state)
        go n.run(n.ctx)
    } else if !enabled && n.exitFn != nil {
        n.exitFn()
    }
}

// flush is used to clear the state of the watcher
func (n *NodeDrainer) flush(state *state.StateStore) {
    // Cancel anything that may be running.
    if n.exitFn != nil {
        n.exitFn()
    }

    // Store the new state
    if state != nil {
        n.state = state
    }

    n.ctx, n.exitFn = context.WithCancel(context.Background())
    n.jobWatcher = n.jobFactory(n.ctx, n.queryLimiter, n.state, n.logger)
    n.nodeWatcher = n.nodeFactory(n.ctx, n.queryLimiter, n.state, n.logger, n)
    n.deadlineNotifier = n.deadlineNotifierFactory(n.ctx)
    n.nodes = make(map[string]*drainingNode, 32)
}

// run is a long-lived event handler that receives changes from the relevant
// watchers and takes action based on them.
func (n *NodeDrainer) run(ctx context.Context) {
    for {
        select {
        case <-n.ctx.Done():
            return
        case nodes := <-n.deadlineNotifier.NextBatch():
            n.handleDeadlinedNodes(nodes)
        case req := <-n.jobWatcher.Drain():
            n.handleJobAllocDrain(req)
        case allocs := <-n.jobWatcher.Migrated():
            n.handleMigratedAllocs(allocs)
        }
    }
}

// handleDeadlinedNodes handles a set of nodes reaching their drain deadline.
// The handler detects the remaining allocations on the nodes and immediately
// marks them for migration.
func (n *NodeDrainer) handleDeadlinedNodes(nodes []string) {
    // Retrieve the set of allocations that will be force stopped.
    var forceStop []*structs.Allocation
    n.l.RLock()
    for _, node := range nodes {
        draining, ok := n.nodes[node]
        if !ok {
            n.logger.Printf("[DEBUG] nomad.drain: skipping untracked deadlined node %q", node)
            continue
        }

        allocs, err := draining.RemainingAllocs()
        if err != nil {
            n.logger.Printf("[ERR] nomad.drain: failed to retrieve allocs on deadlined node %q: %v", node, err)
            continue
        }

        n.logger.Printf("[DEBUG] nomad.drain: node %q deadlined causing %d allocs to be force stopped", node, len(allocs))
        forceStop = append(forceStop, allocs...)
    }
    n.l.RUnlock()
    n.batchDrainAllocs(forceStop)

    // Submit the node transitions in a sharded form to ensure a reasonable
    // Raft transaction size.
    for _, nodes := range partitionIds(defaultMaxIdsPerTxn, nodes) {
        if _, err := n.raft.NodesDrainComplete(nodes); err != nil {
            n.logger.Printf("[ERR] nomad.drain: failed to unset drain for nodes: %v", err)
        }
    }
}

// handleJobAllocDrain handles marking a set of allocations as having a desired
// transition to drain. The handler blocks until the changes to the allocation
// have occurred.
func (n *NodeDrainer) handleJobAllocDrain(req *DrainRequest) {
    index, err := n.batchDrainAllocs(req.Allocs)
    req.Resp.Respond(index, err)
}
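// The request/response flow above, roughly: the draining job watcher sends a
// *DrainRequest on its Drain() channel carrying the allocations it wants
// migrated; batchDrainAllocs coalesces them with any other pending updates, and
// the resulting Raft index (or error) is handed back to the watcher through
// req.Resp so it can continue once the transitions are committed.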
// handleMigratedAllocs checks to see if any nodes can be considered done
// draining based on the set of allocations that have migrated because of an
// ongoing drain for a job.
func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) {
    // Determine the set of nodes that were affected
    nodes := make(map[string]struct{})
    for _, alloc := range allocs {
        nodes[alloc.NodeID] = struct{}{}
    }

    var done []string
    var remainingAllocs []*structs.Allocation

    // For each node, check if it is now done
    n.l.RLock()
    for node := range nodes {
        draining, ok := n.nodes[node]
        if !ok {
            continue
        }

        isDone, err := draining.IsDone()
        if err != nil {
            n.logger.Printf("[ERR] nomad.drain: error checking if node %q is done draining: %v", node, err)
            continue
        }

        if !isDone {
            continue
        }

        done = append(done, node)

        remaining, err := draining.RemainingAllocs()
        if err != nil {
            n.logger.Printf("[ERR] nomad.drain: node %q is done draining but encountered an error getting remaining allocs: %v", node, err)
            continue
        }

        remainingAllocs = append(remainingAllocs, remaining...)
    }
    n.l.RUnlock()

    // Stop any running system jobs on otherwise done nodes
    if len(remainingAllocs) > 0 {
        future := structs.NewBatchFuture()
        n.drainAllocs(future, remainingAllocs)
        if err := future.Wait(); err != nil {
            n.logger.Printf("[ERR] nomad.drain: failed to drain %d remaining allocs from done nodes: %v",
                len(remainingAllocs), err)
        }
    }

    // Submit the node transitions in a sharded form to ensure a reasonable
    // Raft transaction size.
    for _, nodes := range partitionIds(defaultMaxIdsPerTxn, done) {
        if _, err := n.raft.NodesDrainComplete(nodes); err != nil {
            n.logger.Printf("[ERR] nomad.drain: failed to unset drain for nodes: %v", err)
        }
    }
}

// batchDrainAllocs is used to batch the draining of allocations. It will block
// until the batch is complete.
func (n *NodeDrainer) batchDrainAllocs(allocs []*structs.Allocation) (uint64, error) {
    // Add this to the batch
    n.batcher.Lock()
    n.batcher.updates = append(n.batcher.updates, allocs...)

    // Start a new batch if none
    future := n.batcher.updateFuture
    if future == nil {
        future = structs.NewBatchFuture()
        n.batcher.updateFuture = future
        n.batcher.updateTimer = time.AfterFunc(n.batcher.batchWindow, func() {
            // Get the pending updates
            n.batcher.Lock()
            updates := n.batcher.updates
            future := n.batcher.updateFuture
            n.batcher.updates = nil
            n.batcher.updateFuture = nil
            n.batcher.updateTimer = nil
            n.batcher.Unlock()

            // Perform the batch update
            n.drainAllocs(future, updates)
        })
    }
    n.batcher.Unlock()

    if err := future.Wait(); err != nil {
        return 0, err
    }

    return future.Index(), nil
}
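// Batching behavior, sketched (timings are illustrative; the window is
// NodeDrainerConfig.BatchUpdateInterval, 1s by default):
//
//    t=0ms    caller A: batchDrainAllocs(allocsA) -> new BatchFuture, timer armed
//    t=300ms  caller B: batchDrainAllocs(allocsB) -> appended to the same batch
//    t=1s     timer fires -> drainAllocs(future, allocsA+allocsB) in one pass
//             A and B unblock with the same Raft index via future.Wait()/Index().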
// drainAllocs is the non-batched form: it marks the desired transition to
// migrate for the set of allocations and creates the necessary evaluations for
// the affected jobs.
func (n *NodeDrainer) drainAllocs(future *structs.BatchFuture, allocs []*structs.Allocation) {
    // Compute the affected jobs and make the transition map
    jobs := make(map[string]*structs.Allocation, 4)
    transitions := make(map[string]*structs.DesiredTransition, len(allocs))
    for _, alloc := range allocs {
        transitions[alloc.ID] = &structs.DesiredTransition{
            Migrate: helper.BoolToPtr(true),
        }
        jobs[alloc.JobID] = alloc
    }

    evals := make([]*structs.Evaluation, 0, len(jobs))
    for job, alloc := range jobs {
        evals = append(evals, &structs.Evaluation{
            ID:          uuid.Generate(),
            Namespace:   alloc.Namespace,
            Priority:    alloc.Job.Priority,
            Type:        alloc.Job.Type,
            TriggeredBy: structs.EvalTriggerNodeDrain,
            JobID:       job,
            Status:      structs.EvalStatusPending,
        })
    }

    // Commit this update via Raft
    var finalIndex uint64
    for _, u := range partitionAllocDrain(defaultMaxIdsPerTxn, transitions, evals) {
        index, err := n.raft.AllocUpdateDesiredTransition(u.Transitions, u.Evals)
        if err != nil {
            future.Respond(0, err)
            return
        }
        finalIndex = index
    }

    future.Respond(finalIndex, nil)
}
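// Worked example (counts are hypothetical): given 10 allocations spread across
// 3 jobs, drainAllocs builds 10 DesiredTransition entries with Migrate=true and
// 3 pending evaluations triggered by EvalTriggerNodeDrain, then applies them via
// AllocUpdateDesiredTransition, sharded by partitionAllocDrain so no single Raft
// transaction grows too large; the future receives the index of the final apply.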