github.com/hernad/nomad@v1.6.112/nomad/drainer/drainer.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package drainer

import (
	"context"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"

	"github.com/hernad/nomad/helper/pointer"
	"github.com/hernad/nomad/helper/uuid"
	"github.com/hernad/nomad/nomad/state"
	"github.com/hernad/nomad/nomad/structs"
	"golang.org/x/time/rate"
)

var (
	// stateReadErrorDelay is the delay to apply before retrying reading state
	// when there is an error
	stateReadErrorDelay = 1 * time.Second
)

const (
	// LimitStateQueriesPerSecond is the number of state queries allowed per
	// second
	LimitStateQueriesPerSecond = 100.0

	// BatchUpdateInterval is how long we wait to batch updates
	BatchUpdateInterval = 1 * time.Second

	// NodeDeadlineCoalesceWindow is the duration in which deadlining nodes will
	// be coalesced together
	NodeDeadlineCoalesceWindow = 5 * time.Second

	// NodeDrainEventComplete is used to indicate that the node drain is
	// finished.
	NodeDrainEventComplete = "Node drain complete"

	// NodeDrainEventDetailDeadlined is the key to use when the drain is
	// complete because a deadline was reached. The acceptable values are
	// "true" and "false".
	NodeDrainEventDetailDeadlined = "deadline_reached"
)

// RaftApplier contains methods for applying the raft requests required by the
// NodeDrainer.
type RaftApplier interface {
	AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) (uint64, error)
	NodesDrainComplete(nodes []string, event *structs.NodeEvent) (uint64, error)
}

// NodeTracker is the interface to notify an object that is tracking draining
// nodes of changes
type NodeTracker interface {
	// TrackedNodes returns all the nodes that are currently tracked as
	// draining.
	TrackedNodes() map[string]*structs.Node

	// Remove removes a node from the draining set.
	Remove(nodeID string)

	// Update either updates the specification of a draining node or tracks the
	// node as draining.
	Update(node *structs.Node)
}

// DrainingJobWatcherFactory returns a new DrainingJobWatcher
type DrainingJobWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, log.Logger) DrainingJobWatcher

// DrainingNodeWatcherFactory returns a new DrainingNodeWatcher
type DrainingNodeWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, log.Logger, NodeTracker) DrainingNodeWatcher

// DrainDeadlineNotifierFactory returns a new DrainDeadlineNotifier
type DrainDeadlineNotifierFactory func(context.Context) DrainDeadlineNotifier

// GetDrainingJobWatcher returns a draining job watcher
func GetDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger) DrainingJobWatcher {
	return NewDrainingJobWatcher(ctx, limiter, state, logger)
}

// GetDeadlineNotifier returns a node deadline notifier with default coalescing.
func GetDeadlineNotifier(ctx context.Context) DrainDeadlineNotifier {
	return NewDeadlineHeap(ctx, NodeDeadlineCoalesceWindow)
}
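// Batches surfaced by the deadline notifier are consumed from a channel via
// NextBatch, as the run loop below does. A minimal standalone sketch of the
// same pattern (handleDeadlined is a hypothetical callback, not part of this
// package):
//
//	notifier := GetDeadlineNotifier(ctx)
//	for {
//		select {
//		case <-ctx.Done():
//			return
//		case nodes := <-notifier.NextBatch():
//			// Node IDs whose deadlines fell within the coalesce
//			// window arrive together as one batch.
//			handleDeadlined(nodes)
//		}
//	}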
// GetNodeWatcherFactory returns a DrainingNodeWatcherFactory
func GetNodeWatcherFactory() DrainingNodeWatcherFactory {
	return func(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger, tracker NodeTracker) DrainingNodeWatcher {
		return NewNodeDrainWatcher(ctx, limiter, state, logger, tracker)
	}
}

// allocMigrateBatcher is used to batch allocation updates.
type allocMigrateBatcher struct {
	// updates holds pending client status updates for allocations
	updates []*structs.Allocation

	// updateFuture is used to wait for the pending batch update
	// to complete. This may be nil if no batch is pending.
	updateFuture *structs.BatchFuture

	// updateTimer is the timer that will trigger the next batch
	// update, and may be nil if there is no batch pending.
	updateTimer *time.Timer

	batchWindow time.Duration

	// synchronizes access to the updates list, the future and the timer.
	sync.Mutex
}

// NodeDrainerConfig is used to configure a new node drainer.
type NodeDrainerConfig struct {
	Logger               log.Logger
	Raft                 RaftApplier
	JobFactory           DrainingJobWatcherFactory
	NodeFactory          DrainingNodeWatcherFactory
	DrainDeadlineFactory DrainDeadlineNotifierFactory

	// StateQueriesPerSecond configures the query limit against the state store
	// that is allowed by the node drainer.
	StateQueriesPerSecond float64

	// BatchUpdateInterval is the interval in which allocation updates are
	// batched.
	BatchUpdateInterval time.Duration
}

// NodeDrainer is used to orchestrate migrating allocations off of draining
// nodes.
type NodeDrainer struct {
	enabled bool
	logger  log.Logger

	// nodes is the set of draining nodes
	nodes map[string]*drainingNode

	// nodeWatcher watches for nodes to transition in and out of drain state.
	nodeWatcher DrainingNodeWatcher
	nodeFactory DrainingNodeWatcherFactory

	// jobWatcher watches draining jobs and emits desired drains and notifies
	// when migrations take place.
	jobWatcher DrainingJobWatcher
	jobFactory DrainingJobWatcherFactory

	// deadlineNotifier notifies when nodes reach their drain deadline.
	deadlineNotifier        DrainDeadlineNotifier
	deadlineNotifierFactory DrainDeadlineNotifierFactory

	// state is the state that is watched for state changes.
	state *state.StateStore

	// queryLimiter is used to limit the rate of blocking queries
	queryLimiter *rate.Limiter

	// raft is a shim around the raft messages necessary for draining
	raft RaftApplier

	// batcher is used to batch alloc migrations.
	batcher allocMigrateBatcher

	// ctx and exitFn are used to cancel the watcher
	ctx    context.Context
	exitFn context.CancelFunc

	l sync.RWMutex
}

// NewNodeDrainer returns a new node drainer. The node drainer is responsible
// for marking allocations on draining nodes with a desired migration
// transition, updating the drain strategy on nodes when they are complete and
// creating evaluations for the system to react to these changes.
func NewNodeDrainer(c *NodeDrainerConfig) *NodeDrainer {
	return &NodeDrainer{
		raft:                    c.Raft,
		logger:                  c.Logger.Named("drain"),
		jobFactory:              c.JobFactory,
		nodeFactory:             c.NodeFactory,
		deadlineNotifierFactory: c.DrainDeadlineFactory,
		queryLimiter:            rate.NewLimiter(rate.Limit(c.StateQueriesPerSecond), 100),
		batcher: allocMigrateBatcher{
			batchWindow: c.BatchUpdateInterval,
		},
	}
}
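// A minimal wiring sketch using the factory helpers above (logger, raftShim,
// and stateStore are assumed to be supplied by the caller; they are not
// defined in this package):
//
//	drainer := NewNodeDrainer(&NodeDrainerConfig{
//		Logger:                logger,
//		Raft:                  raftShim,
//		JobFactory:            GetDrainingJobWatcher,
//		NodeFactory:           GetNodeWatcherFactory(),
//		DrainDeadlineFactory:  GetDeadlineNotifier,
//		StateQueriesPerSecond: LimitStateQueriesPerSecond,
//		BatchUpdateInterval:   BatchUpdateInterval,
//	})
//	drainer.SetEnabled(true, stateStore)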
// SetEnabled will start or stop the node draining goroutine depending on the
// enabled boolean.
func (n *NodeDrainer) SetEnabled(enabled bool, state *state.StateStore) {
	n.l.Lock()
	defer n.l.Unlock()

	// If we are starting now or have a new state, init state and start the
	// run loop
	n.enabled = enabled
	if enabled {
		n.flush(state)
		go n.run(n.ctx)
	} else if !enabled && n.exitFn != nil {
		n.exitFn()
	}
}

// flush is used to clear the state of the watcher
func (n *NodeDrainer) flush(state *state.StateStore) {
	// Cancel anything that may be running.
	if n.exitFn != nil {
		n.exitFn()
	}

	// Store the new state
	if state != nil {
		n.state = state
	}

	n.ctx, n.exitFn = context.WithCancel(context.Background())
	n.jobWatcher = n.jobFactory(n.ctx, n.queryLimiter, n.state, n.logger)
	n.nodeWatcher = n.nodeFactory(n.ctx, n.queryLimiter, n.state, n.logger, n)
	n.deadlineNotifier = n.deadlineNotifierFactory(n.ctx)
	n.nodes = make(map[string]*drainingNode, 32)
}

// run is a long-lived event handler that receives changes from the relevant
// watchers and takes action based on them.
func (n *NodeDrainer) run(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			return
		case nodes := <-n.deadlineNotifier.NextBatch():
			n.handleDeadlinedNodes(nodes)
		case req := <-n.jobWatcher.Drain():
			n.handleJobAllocDrain(req)
		case allocs := <-n.jobWatcher.Migrated():
			n.handleMigratedAllocs(allocs)
		}
	}
}

// handleDeadlinedNodes handles a set of nodes reaching their drain deadline.
// The handler detects the remaining allocations on the nodes and immediately
// marks them for migration.
func (n *NodeDrainer) handleDeadlinedNodes(nodes []string) {
	// Retrieve the set of allocations that will be force stopped.
	var forceStop []*structs.Allocation
	n.l.RLock()
	for _, node := range nodes {
		draining, ok := n.nodes[node]
		if !ok {
			n.logger.Debug("skipping untracked deadlined node", "node_id", node)
			continue
		}

		allocs, err := draining.RemainingAllocs()
		if err != nil {
			n.logger.Error("failed to retrieve allocs on deadlined node", "node_id", node, "error", err)
			continue
		}

		n.logger.Debug("node deadlined causing allocs to be force stopped", "node_id", node, "num_allocs", len(allocs))
		forceStop = append(forceStop, allocs...)
	}
	n.l.RUnlock()
	n.batchDrainAllocs(forceStop)

	// Create the node event
	event := structs.NewNodeEvent().
		SetSubsystem(structs.NodeEventSubsystemDrain).
		SetMessage(NodeDrainEventComplete).
		AddDetail(NodeDrainEventDetailDeadlined, "true")

	// Submit the node transitions in a sharded form to ensure a reasonable
	// Raft transaction size.
	for _, nodes := range partitionIds(defaultMaxIdsPerTxn, nodes) {
		if _, err := n.raft.NodesDrainComplete(nodes, event); err != nil {
			n.logger.Error("failed to unset drain for nodes", "error", err)
		}
	}
}
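// partitionIds and defaultMaxIdsPerTxn are defined elsewhere in this package.
// The sketch below illustrates the assumed behavior (splitting an ID list
// into bounded chunks), not the actual implementation:
//
//	func partitionIds(maxIDs int, ids []string) [][]string {
//		var parts [][]string
//		for len(ids) > maxIDs {
//			parts = append(parts, ids[:maxIDs])
//			ids = ids[maxIDs:]
//		}
//		return append(parts, ids)
//	}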
// handleJobAllocDrain handles marking a set of allocations as having a desired
// transition to drain. The handler blocks until the changes to the allocation
// have occurred.
func (n *NodeDrainer) handleJobAllocDrain(req *DrainRequest) {
	index, err := n.batchDrainAllocs(req.Allocs)
	req.Resp.Respond(index, err)
}

// handleMigratedAllocs checks to see if any nodes can be considered done
// draining based on the set of allocations that have migrated because of an
// ongoing drain for a job.
func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) {
	// Determine the set of nodes that were affected
	nodes := make(map[string]struct{})
	for _, alloc := range allocs {
		nodes[alloc.NodeID] = struct{}{}
	}

	var done []string
	var remainingAllocs []*structs.Allocation

	// For each node, check if it is now done
	n.l.RLock()
	for node := range nodes {
		draining, ok := n.nodes[node]
		if !ok {
			continue
		}

		isDone, err := draining.IsDone()
		if err != nil {
			n.logger.Error("error checking if node is done draining", "node_id", node, "error", err)
			continue
		}

		if !isDone {
			continue
		}

		done = append(done, node)

		remaining, err := draining.RemainingAllocs()
		if err != nil {
			n.logger.Error("node is done draining but encountered an error getting remaining allocs", "node_id", node, "error", err)
			continue
		}

		remainingAllocs = append(remainingAllocs, remaining...)
	}
	n.l.RUnlock()

	// Stop any running system jobs on otherwise done nodes
	if len(remainingAllocs) > 0 {
		future := structs.NewBatchFuture()
		n.drainAllocs(future, remainingAllocs)
		if err := future.Wait(); err != nil {
			n.logger.Error("failed to drain remaining allocs from done nodes", "num_allocs", len(remainingAllocs), "error", err)
		}
	}

	// Create the node event
	event := structs.NewNodeEvent().
		SetSubsystem(structs.NodeEventSubsystemDrain).
		SetMessage(NodeDrainEventComplete)

	// Submit the node transitions in a sharded form to ensure a reasonable
	// Raft transaction size.
	for _, nodes := range partitionIds(defaultMaxIdsPerTxn, done) {
		if _, err := n.raft.NodesDrainComplete(nodes, event); err != nil {
			n.logger.Error("failed to unset drain for nodes", "error", err)
		}
	}
}

// batchDrainAllocs is used to batch the draining of allocations. It will block
// until the batch is complete.
func (n *NodeDrainer) batchDrainAllocs(allocs []*structs.Allocation) (uint64, error) {
	// Add this to the batch
	n.batcher.Lock()
	n.batcher.updates = append(n.batcher.updates, allocs...)

	// Start a new batch if none is pending
	future := n.batcher.updateFuture
	if future == nil {
		future = structs.NewBatchFuture()
		n.batcher.updateFuture = future
		n.batcher.updateTimer = time.AfterFunc(n.batcher.batchWindow, func() {
			// Get the pending updates
			n.batcher.Lock()
			updates := n.batcher.updates
			future := n.batcher.updateFuture
			n.batcher.updates = nil
			n.batcher.updateFuture = nil
			n.batcher.updateTimer = nil
			n.batcher.Unlock()

			// Perform the batch update
			n.drainAllocs(future, updates)
		})
	}
	n.batcher.Unlock()

	if err := future.Wait(); err != nil {
		return 0, err
	}

	return future.Index(), nil
}
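// Batching works on a fixed window: the first caller creates the shared
// BatchFuture and arms a timer for batchWindow; later callers that arrive
// before the timer fires join the same batch and block on the same future.
// For example, with the default 1 second window:
//
//	t=0.0s  batchDrainAllocs(a1) -> creates future F, arms timer
//	t=0.4s  batchDrainAllocs(a2) -> appends to the pending batch, shares F
//	t=1.0s  timer fires -> drainAllocs submits {a1, a2} in one pass and
//	        responds on F, unblocking both callers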
// drainAllocs is a non-batched marking of the desired transition to migrate
// for the set of allocations. It will also create the necessary evaluations
// for the affected jobs.
func (n *NodeDrainer) drainAllocs(future *structs.BatchFuture, allocs []*structs.Allocation) {
	// Compute the affected jobs and make the transition map
	jobs := make(map[structs.NamespacedID]*structs.Allocation, 4)
	transitions := make(map[string]*structs.DesiredTransition, len(allocs))
	for _, alloc := range allocs {
		transitions[alloc.ID] = &structs.DesiredTransition{
			Migrate: pointer.Of(true),
		}
		jobs[alloc.JobNamespacedID()] = alloc
	}

	evals := make([]*structs.Evaluation, 0, len(jobs))
	now := time.Now().UTC().UnixNano()
	for _, alloc := range jobs {
		evals = append(evals, &structs.Evaluation{
			ID:          uuid.Generate(),
			Namespace:   alloc.Namespace,
			Priority:    alloc.Job.Priority,
			Type:        alloc.Job.Type,
			TriggeredBy: structs.EvalTriggerNodeDrain,
			JobID:       alloc.JobID,
			Status:      structs.EvalStatusPending,
			CreateTime:  now,
			ModifyTime:  now,
		})
	}

	// Commit this update via Raft
	var finalIndex uint64
	for _, u := range partitionAllocDrain(defaultMaxIdsPerTxn, transitions, evals) {
		index, err := n.raft.AllocUpdateDesiredTransition(u.Transitions, u.Evals)
		if err != nil {
			future.Respond(0, err)
			return
		}
		finalIndex = index
	}

	future.Respond(finalIndex, nil)
}
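// As a worked example: if allocations a1 and a2 belong to job J1 and a3
// belongs to job J2, drainAllocs submits three DesiredTransition updates
// (a1, a2, and a3, each with Migrate=true) and two pending evaluations (one
// for J1, one for J2), partitioned into Raft transactions of at most
// defaultMaxIdsPerTxn IDs each.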