github.com/quite/nomad@v0.8.6/nomad/drainer/drainer.go

package drainer

import (
	"context"
	"log"
	"sync"
	"time"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"golang.org/x/time/rate"
)

var (
	// stateReadErrorDelay is the delay to apply before retrying a state read
	// when there is an error
	stateReadErrorDelay = 1 * time.Second
)

const (
	// LimitStateQueriesPerSecond is the number of state queries allowed per
	// second
	LimitStateQueriesPerSecond = 100.0

	// BatchUpdateInterval is how long we wait to batch updates
	BatchUpdateInterval = 1 * time.Second

	// NodeDeadlineCoalesceWindow is the duration in which deadlining nodes will
	// be coalesced together
	NodeDeadlineCoalesceWindow = 5 * time.Second

	// NodeDrainEventComplete is used to indicate that the node drain is
	// finished.
	NodeDrainEventComplete = "Node drain complete"

	// NodeDrainEventDetailDeadlined is the key to use when the drain is
	// complete because of a deadline. The acceptable values are "true" and
	// "false".
	NodeDrainEventDetailDeadlined = "deadline_reached"
)

// RaftApplier contains methods for applying the raft requests required by the
// NodeDrainer.
type RaftApplier interface {
	AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) (uint64, error)
	NodesDrainComplete(nodes []string, event *structs.NodeEvent) (uint64, error)
}

// NodeTracker is the interface to notify an object that is tracking draining
// nodes of changes
type NodeTracker interface {
	// TrackedNodes returns all the nodes that are currently tracked as
	// draining.
	TrackedNodes() map[string]*structs.Node

	// Remove removes a node from the draining set.
	Remove(nodeID string)

	// Update either updates the specification of a draining node or tracks the
	// node as draining.
	Update(node *structs.Node)
}

// DrainingJobWatcherFactory returns a new DrainingJobWatcher
type DrainingJobWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger) DrainingJobWatcher

// DrainingNodeWatcherFactory returns a new DrainingNodeWatcher
type DrainingNodeWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger, NodeTracker) DrainingNodeWatcher

// DrainDeadlineNotifierFactory returns a new DrainDeadlineNotifier
type DrainDeadlineNotifierFactory func(context.Context) DrainDeadlineNotifier

// GetDrainingJobWatcher returns a draining job watcher
func GetDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger) DrainingJobWatcher {
	return NewDrainingJobWatcher(ctx, limiter, state, logger)
}

// GetDeadlineNotifier returns a node deadline notifier with default coalescing.
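// Nodes whose drain deadlines fall within NodeDeadlineCoalesceWindow of each
// other are delivered together in a single NextBatch notification.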
func GetDeadlineNotifier(ctx context.Context) DrainDeadlineNotifier {
	return NewDeadlineHeap(ctx, NodeDeadlineCoalesceWindow)
}

// GetNodeWatcherFactory returns a DrainingNodeWatcherFactory
func GetNodeWatcherFactory() DrainingNodeWatcherFactory {
	return func(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger, tracker NodeTracker) DrainingNodeWatcher {
		return NewNodeDrainWatcher(ctx, limiter, state, logger, tracker)
	}
}

// allocMigrateBatcher is used to batch allocation updates.
type allocMigrateBatcher struct {
	// updates holds pending client status updates for allocations
	updates []*structs.Allocation

	// updateFuture is used to wait for the pending batch update
	// to complete. This may be nil if no batch is pending.
	updateFuture *structs.BatchFuture

	// updateTimer is the timer that will trigger the next batch
	// update, and may be nil if there is no batch pending.
	updateTimer *time.Timer

	batchWindow time.Duration

	// synchronizes access to the updates list, the future and the timer.
	sync.Mutex
}

// NodeDrainerConfig is used to configure a new node drainer.
type NodeDrainerConfig struct {
	Logger               *log.Logger
	Raft                 RaftApplier
	JobFactory           DrainingJobWatcherFactory
	NodeFactory          DrainingNodeWatcherFactory
	DrainDeadlineFactory DrainDeadlineNotifierFactory

	// StateQueriesPerSecond configures the query limit against the state store
	// that is allowed by the node drainer.
	StateQueriesPerSecond float64

	// BatchUpdateInterval is the interval in which allocation updates are
	// batched.
	BatchUpdateInterval time.Duration
}

// NodeDrainer is used to orchestrate migrating allocations off of draining
// nodes.
type NodeDrainer struct {
	enabled bool
	logger  *log.Logger

	// nodes is the set of draining nodes
	nodes map[string]*drainingNode

	// nodeWatcher watches for nodes to transition in and out of drain state.
	nodeWatcher DrainingNodeWatcher
	nodeFactory DrainingNodeWatcherFactory

	// jobWatcher watches draining jobs and emits desired drains and notifies
	// when migrations take place.
	jobWatcher DrainingJobWatcher
	jobFactory DrainingJobWatcherFactory

	// deadlineNotifier notifies when nodes reach their drain deadline.
	deadlineNotifier        DrainDeadlineNotifier
	deadlineNotifierFactory DrainDeadlineNotifierFactory

	// state is the state that is watched for state changes.
	state *state.StateStore

	// queryLimiter is used to limit the rate of blocking queries
	queryLimiter *rate.Limiter

	// raft is a shim around the raft messages necessary for draining
	raft RaftApplier

	// batcher is used to batch alloc migrations.
	batcher allocMigrateBatcher

	// ctx and exitFn are used to cancel the watcher
	ctx    context.Context
	exitFn context.CancelFunc

	l sync.RWMutex
}

// NewNodeDrainer returns a new node drainer. The node drainer is
// responsible for marking allocations on draining nodes with a desired
// migration transition, updating the drain strategy on nodes when they are
// complete, and creating evaluations for the system to react to these changes.
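//
// A construction sketch; the raft applier, logger, and state store below are
// whatever the caller provides (the names are only illustrative):
//
//	nd := NewNodeDrainer(&NodeDrainerConfig{
//		Logger:                logger,
//		Raft:                  raftShim, // anything implementing RaftApplier
//		JobFactory:            GetDrainingJobWatcher,
//		NodeFactory:           GetNodeWatcherFactory(),
//		DrainDeadlineFactory:  GetDeadlineNotifier,
//		StateQueriesPerSecond: LimitStateQueriesPerSecond,
//		BatchUpdateInterval:   BatchUpdateInterval,
//	})
//	nd.SetEnabled(true, stateStore)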
func NewNodeDrainer(c *NodeDrainerConfig) *NodeDrainer {
	return &NodeDrainer{
		raft:                    c.Raft,
		logger:                  c.Logger,
		jobFactory:              c.JobFactory,
		nodeFactory:             c.NodeFactory,
		deadlineNotifierFactory: c.DrainDeadlineFactory,
		queryLimiter:            rate.NewLimiter(rate.Limit(c.StateQueriesPerSecond), 100),
		batcher: allocMigrateBatcher{
			batchWindow: c.BatchUpdateInterval,
		},
	}
}

// SetEnabled will start or stop the node draining goroutine depending on the
// enabled boolean.
func (n *NodeDrainer) SetEnabled(enabled bool, state *state.StateStore) {
	n.l.Lock()
	defer n.l.Unlock()

	// If we are starting now or have a new state, init state and start the
	// run loop
	n.enabled = enabled
	if enabled {
		n.flush(state)
		go n.run(n.ctx)
	} else if !enabled && n.exitFn != nil {
		n.exitFn()
	}
}

// flush is used to clear the state of the watcher
func (n *NodeDrainer) flush(state *state.StateStore) {
	// Cancel anything that may be running.
	if n.exitFn != nil {
		n.exitFn()
	}

	// Store the new state
	if state != nil {
		n.state = state
	}

	n.ctx, n.exitFn = context.WithCancel(context.Background())
	n.jobWatcher = n.jobFactory(n.ctx, n.queryLimiter, n.state, n.logger)
	n.nodeWatcher = n.nodeFactory(n.ctx, n.queryLimiter, n.state, n.logger, n)
	n.deadlineNotifier = n.deadlineNotifierFactory(n.ctx)
	n.nodes = make(map[string]*drainingNode, 32)
}

// run is a long lived event handler that receives changes from the relevant
// watchers and takes action based on them.
func (n *NodeDrainer) run(ctx context.Context) {
	for {
		select {
		case <-n.ctx.Done():
			return
		case nodes := <-n.deadlineNotifier.NextBatch():
			n.handleDeadlinedNodes(nodes)
		case req := <-n.jobWatcher.Drain():
			n.handleJobAllocDrain(req)
		case allocs := <-n.jobWatcher.Migrated():
			n.handleMigratedAllocs(allocs)
		}
	}
}

// handleDeadlinedNodes handles a set of nodes reaching their drain deadline.
// The handler detects the remaining allocations on the nodes and immediately
// marks them for migration.
func (n *NodeDrainer) handleDeadlinedNodes(nodes []string) {
	// Retrieve the set of allocations that will be force stopped.
	var forceStop []*structs.Allocation
	n.l.RLock()
	for _, node := range nodes {
		draining, ok := n.nodes[node]
		if !ok {
			n.logger.Printf("[DEBUG] nomad.drain: skipping untracked deadlined node %q", node)
			continue
		}

		allocs, err := draining.RemainingAllocs()
		if err != nil {
			n.logger.Printf("[ERR] nomad.drain: failed to retrieve allocs on deadlined node %q: %v", node, err)
			continue
		}

		n.logger.Printf("[DEBUG] nomad.drain: node %q deadlined causing %d allocs to be force stopped", node, len(allocs))
		forceStop = append(forceStop, allocs...)
	}
	n.l.RUnlock()
	n.batchDrainAllocs(forceStop)

	// Create the node event
	event := structs.NewNodeEvent().
		SetSubsystem(structs.NodeEventSubsystemDrain).
		SetMessage(NodeDrainEventComplete).
		AddDetail(NodeDrainEventDetailDeadlined, "true")

	// Submit the node transitions in a sharded form to ensure a reasonable
	// Raft transaction size.
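	// partitionIds caps each call at defaultMaxIdsPerTxn node IDs (both are
	// defined elsewhere in this package).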
	for _, nodes := range partitionIds(defaultMaxIdsPerTxn, nodes) {
		if _, err := n.raft.NodesDrainComplete(nodes, event); err != nil {
			n.logger.Printf("[ERR] nomad.drain: failed to unset drain for nodes: %v", err)
		}
	}
}

// handleJobAllocDrain handles marking a set of allocations as having a desired
// transition to drain. The handler blocks until the changes to the allocations
// have occurred.
func (n *NodeDrainer) handleJobAllocDrain(req *DrainRequest) {
	index, err := n.batchDrainAllocs(req.Allocs)
	req.Resp.Respond(index, err)
}

// handleMigratedAllocs checks to see if any nodes can be considered done
// draining based on the set of allocations that have migrated because of an
// ongoing drain for a job.
func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) {
	// Determine the set of nodes that were affected
	nodes := make(map[string]struct{})
	for _, alloc := range allocs {
		nodes[alloc.NodeID] = struct{}{}
	}

	var done []string
	var remainingAllocs []*structs.Allocation

	// For each node, check if it is now done
	n.l.RLock()
	for node := range nodes {
		draining, ok := n.nodes[node]
		if !ok {
			continue
		}

		isDone, err := draining.IsDone()
		if err != nil {
			n.logger.Printf("[ERR] nomad.drain: error checking if node %q is done draining: %v", node, err)
			continue
		}

		if !isDone {
			continue
		}

		done = append(done, node)

		remaining, err := draining.RemainingAllocs()
		if err != nil {
			n.logger.Printf("[ERR] nomad.drain: node %q is done draining but encountered an error getting remaining allocs: %v", node, err)
			continue
		}

		remainingAllocs = append(remainingAllocs, remaining...)
	}
	n.l.RUnlock()

	// Stop any running system jobs on otherwise done nodes
	if len(remainingAllocs) > 0 {
		future := structs.NewBatchFuture()
		n.drainAllocs(future, remainingAllocs)
		if err := future.Wait(); err != nil {
			n.logger.Printf("[ERR] nomad.drain: failed to drain %d remaining allocs from done nodes: %v",
				len(remainingAllocs), err)
		}
	}

	// Create the node event
	event := structs.NewNodeEvent().
		SetSubsystem(structs.NodeEventSubsystemDrain).
		SetMessage(NodeDrainEventComplete)

	// Submit the node transitions in a sharded form to ensure a reasonable
	// Raft transaction size.
	for _, nodes := range partitionIds(defaultMaxIdsPerTxn, done) {
		if _, err := n.raft.NodesDrainComplete(nodes, event); err != nil {
			n.logger.Printf("[ERR] nomad.drain: failed to unset drain for nodes: %v", err)
		}
	}
}

// batchDrainAllocs is used to batch the draining of allocations. It will block
// until the batch is complete.
func (n *NodeDrainer) batchDrainAllocs(allocs []*structs.Allocation) (uint64, error) {
	// Add this to the batch
	n.batcher.Lock()
	n.batcher.updates = append(n.batcher.updates, allocs...)
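	// Every caller that lands in the same batch window shares the pending
	// BatchFuture and unblocks at the raft index of the combined update.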
	// Start a new batch if none
	future := n.batcher.updateFuture
	if future == nil {
		future = structs.NewBatchFuture()
		n.batcher.updateFuture = future
		n.batcher.updateTimer = time.AfterFunc(n.batcher.batchWindow, func() {
			// Get the pending updates
			n.batcher.Lock()
			updates := n.batcher.updates
			future := n.batcher.updateFuture
			n.batcher.updates = nil
			n.batcher.updateFuture = nil
			n.batcher.updateTimer = nil
			n.batcher.Unlock()

			// Perform the batch update
			n.drainAllocs(future, updates)
		})
	}
	n.batcher.Unlock()

	if err := future.Wait(); err != nil {
		return 0, err
	}

	return future.Index(), nil
}

// drainAllocs is the non-batched marking of the desired transition to migrate
// for the given set of allocations. It will also create the necessary
// evaluations for the affected jobs.
func (n *NodeDrainer) drainAllocs(future *structs.BatchFuture, allocs []*structs.Allocation) {
	// Compute the affected jobs and make the transition map
	jobs := make(map[string]*structs.Allocation, 4)
	transitions := make(map[string]*structs.DesiredTransition, len(allocs))
	for _, alloc := range allocs {
		transitions[alloc.ID] = &structs.DesiredTransition{
			Migrate: helper.BoolToPtr(true),
		}
		jobs[alloc.JobID] = alloc
	}

	evals := make([]*structs.Evaluation, 0, len(jobs))
	for job, alloc := range jobs {
		evals = append(evals, &structs.Evaluation{
			ID:          uuid.Generate(),
			Namespace:   alloc.Namespace,
			Priority:    alloc.Job.Priority,
			Type:        alloc.Job.Type,
			TriggeredBy: structs.EvalTriggerNodeDrain,
			JobID:       job,
			Status:      structs.EvalStatusPending,
		})
	}

	// Commit this update via Raft
	var finalIndex uint64
	for _, u := range partitionAllocDrain(defaultMaxIdsPerTxn, transitions, evals) {
		index, err := n.raft.AllocUpdateDesiredTransition(u.Transitions, u.Evals)
		if err != nil {
			future.Respond(0, err)
			return
		}
		finalIndex = index
	}

	future.Respond(finalIndex, nil)
}