github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/nomad/drainer/drainer.go

package drainer

import (
	"context"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"golang.org/x/time/rate"
)

var (
	// stateReadErrorDelay is the delay to apply before retrying a state read
	// that returned an error.
	stateReadErrorDelay = 1 * time.Second
)

const (
	// LimitStateQueriesPerSecond is the number of state queries allowed per
	// second.
	LimitStateQueriesPerSecond = 100.0

	// BatchUpdateInterval is how long we wait to batch updates.
	BatchUpdateInterval = 1 * time.Second

	// NodeDeadlineCoalesceWindow is the duration in which deadlining nodes
	// will be coalesced together.
	NodeDeadlineCoalesceWindow = 5 * time.Second

	// NodeDrainEventComplete is used to indicate that the node drain is
	// finished.
	NodeDrainEventComplete = "Node drain complete"

	// NodeDrainEventDetailDeadlined is the detail key to set when the drain
	// completed because a deadline was reached. The acceptable values are
	// "true" and "false".
	NodeDrainEventDetailDeadlined = "deadline_reached"
)

// RaftApplier contains methods for applying the Raft requests required by the
// NodeDrainer.
type RaftApplier interface {
	AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) (uint64, error)
	NodesDrainComplete(nodes []string, event *structs.NodeEvent) (uint64, error)
}

// NodeTracker is the interface used to notify an object that is tracking
// draining nodes of changes.
type NodeTracker interface {
	// TrackedNodes returns all the nodes that are currently tracked as
	// draining.
	TrackedNodes() map[string]*structs.Node

	// Remove removes a node from the draining set.
	Remove(nodeID string)

	// Update either updates the specification of a draining node or begins
	// tracking the node as draining.
	Update(node *structs.Node)
}

// DrainingJobWatcherFactory returns a new DrainingJobWatcher.
type DrainingJobWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, log.Logger) DrainingJobWatcher

// DrainingNodeWatcherFactory returns a new DrainingNodeWatcher.
type DrainingNodeWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, log.Logger, NodeTracker) DrainingNodeWatcher

// DrainDeadlineNotifierFactory returns a new DrainDeadlineNotifier.
type DrainDeadlineNotifierFactory func(context.Context) DrainDeadlineNotifier

// GetDrainingJobWatcher returns a draining job watcher.
func GetDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger) DrainingJobWatcher {
	return NewDrainingJobWatcher(ctx, limiter, state, logger)
}
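
// buildJobWatcher is an illustrative sketch, not part of the original file:
// it shows how a caller might construct the job watcher via the helper above.
// The limiter bounds blocking state-store queries to
// LimitStateQueriesPerSecond with an assumed burst of 100; the store and
// logger are assumed to come from the enclosing server.
func buildJobWatcher(ctx context.Context, store *state.StateStore, logger log.Logger) DrainingJobWatcher {
	limiter := rate.NewLimiter(rate.Limit(LimitStateQueriesPerSecond), 100)
	return GetDrainingJobWatcher(ctx, limiter, store, logger)
}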

// GetDeadlineNotifier returns a node deadline notifier with default
// coalescing.
func GetDeadlineNotifier(ctx context.Context) DrainDeadlineNotifier {
	return NewDeadlineHeap(ctx, NodeDeadlineCoalesceWindow)
}

// GetNodeWatcherFactory returns a DrainingNodeWatcherFactory.
func GetNodeWatcherFactory() DrainingNodeWatcherFactory {
	return func(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger, tracker NodeTracker) DrainingNodeWatcher {
		return NewNodeDrainWatcher(ctx, limiter, state, logger, tracker)
	}
}

// allocMigrateBatcher is used to batch allocation updates.
type allocMigrateBatcher struct {
	// updates holds pending client status updates for allocations.
	updates []*structs.Allocation

	// updateFuture is used to wait for the pending batch update
	// to complete. This may be nil if no batch is pending.
	updateFuture *structs.BatchFuture

	// updateTimer is the timer that will trigger the next batch
	// update, and may be nil if there is no batch pending.
	updateTimer *time.Timer

	// batchWindow is how long to wait before applying a pending batch.
	batchWindow time.Duration

	// synchronizes access to the updates list, the future, and the timer.
	sync.Mutex
}

// NodeDrainerConfig is used to configure a new node drainer.
type NodeDrainerConfig struct {
	Logger               log.Logger
	Raft                 RaftApplier
	JobFactory           DrainingJobWatcherFactory
	NodeFactory          DrainingNodeWatcherFactory
	DrainDeadlineFactory DrainDeadlineNotifierFactory

	// StateQueriesPerSecond configures the query limit against the state
	// store that is allowed by the node drainer.
	StateQueriesPerSecond float64

	// BatchUpdateInterval is the interval at which allocation updates are
	// batched.
	BatchUpdateInterval time.Duration
}

// NodeDrainer is used to orchestrate migrating allocations off of draining
// nodes.
type NodeDrainer struct {
	enabled bool
	logger  log.Logger

	// nodes is the set of draining nodes.
	nodes map[string]*drainingNode

	// nodeWatcher watches for nodes to transition in and out of drain state.
	nodeWatcher DrainingNodeWatcher
	nodeFactory DrainingNodeWatcherFactory

	// jobWatcher watches draining jobs, emits the desired drains, and
	// notifies when migrations take place.
	jobWatcher DrainingJobWatcher
	jobFactory DrainingJobWatcherFactory

	// deadlineNotifier notifies when nodes reach their drain deadline.
	deadlineNotifier        DrainDeadlineNotifier
	deadlineNotifierFactory DrainDeadlineNotifierFactory

	// state is the state store that is watched for changes.
	state *state.StateStore

	// queryLimiter is used to limit the rate of blocking queries.
	queryLimiter *rate.Limiter

	// raft is a shim around the Raft messages necessary for draining.
	raft RaftApplier

	// batcher is used to batch alloc migrations.
	batcher allocMigrateBatcher

	// ctx and exitFn are used to cancel the watcher.
	ctx    context.Context
	exitFn context.CancelFunc

	l sync.RWMutex
}

// NewNodeDrainer returns a new node drainer. The node drainer is responsible
// for marking allocations on draining nodes with a desired migration
// transition, updating the drain strategy on nodes when they are complete,
// and creating evaluations for the system to react to these changes.
func NewNodeDrainer(c *NodeDrainerConfig) *NodeDrainer {
	return &NodeDrainer{
		raft:                    c.Raft,
		logger:                  c.Logger.Named("drain"),
		jobFactory:              c.JobFactory,
		nodeFactory:             c.NodeFactory,
		deadlineNotifierFactory: c.DrainDeadlineFactory,
		queryLimiter:            rate.NewLimiter(rate.Limit(c.StateQueriesPerSecond), 100),
		batcher: allocMigrateBatcher{
			batchWindow: c.BatchUpdateInterval,
		},
	}
}
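
// newDrainerForIllustration is a hypothetical wiring sketch, not part of the
// original file: it shows one plausible way to assemble a NodeDrainer from
// the factories defined above. The raftShim argument is an assumed
// RaftApplier implementation supplied by the caller.
func newDrainerForIllustration(logger log.Logger, raftShim RaftApplier) *NodeDrainer {
	return NewNodeDrainer(&NodeDrainerConfig{
		Logger:                logger,
		Raft:                  raftShim,
		JobFactory:            GetDrainingJobWatcher,
		NodeFactory:           GetNodeWatcherFactory(),
		DrainDeadlineFactory:  GetDeadlineNotifier,
		StateQueriesPerSecond: LimitStateQueriesPerSecond,
		BatchUpdateInterval:   BatchUpdateInterval,
	})
}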

// SetEnabled will start or stop the node draining goroutine depending on the
// enabled boolean.
func (n *NodeDrainer) SetEnabled(enabled bool, state *state.StateStore) {
	n.l.Lock()
	defer n.l.Unlock()

	// If we are starting now or have a new state, init state and start the
	// run loop.
	n.enabled = enabled
	if enabled {
		n.flush(state)
		go n.run(n.ctx)
	} else if !enabled && n.exitFn != nil {
		n.exitFn()
	}
}

// flush is used to clear the state of the watcher.
func (n *NodeDrainer) flush(state *state.StateStore) {
	// Cancel anything that may be running.
	if n.exitFn != nil {
		n.exitFn()
	}

	// Store the new state.
	if state != nil {
		n.state = state
	}

	n.ctx, n.exitFn = context.WithCancel(context.Background())
	n.jobWatcher = n.jobFactory(n.ctx, n.queryLimiter, n.state, n.logger)
	n.nodeWatcher = n.nodeFactory(n.ctx, n.queryLimiter, n.state, n.logger, n)
	n.deadlineNotifier = n.deadlineNotifierFactory(n.ctx)
	n.nodes = make(map[string]*drainingNode, 32)
}

// run is a long-lived event handler that receives changes from the relevant
// watchers and takes action based on them.
func (n *NodeDrainer) run(ctx context.Context) {
	for {
		select {
		case <-n.ctx.Done():
			return
		case nodes := <-n.deadlineNotifier.NextBatch():
			n.handleDeadlinedNodes(nodes)
		case req := <-n.jobWatcher.Drain():
			n.handleJobAllocDrain(req)
		case allocs := <-n.jobWatcher.Migrated():
			n.handleMigratedAllocs(allocs)
		}
	}
}
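
// The deadline flow, as a sketch: the notifier returned by GetDeadlineNotifier
// coalesces nodes whose drain deadlines fall within NodeDeadlineCoalesceWindow
// of each other, so a single receive from NextBatch in the run loop above may
// carry several node IDs. Registration is assumed to look roughly like:
//
//	notifier := GetDeadlineNotifier(ctx)
//	notifier.Watch(nodeID, deadline) // assumed method; deadline is when remaining allocs are force-stopped
//	deadlined := <-notifier.NextBatch()
//	// deadlined now holds the coalesced node IDs handled below.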

// handleDeadlinedNodes handles a set of nodes reaching their drain deadline.
// The handler detects the remaining allocations on the nodes and immediately
// marks them for migration.
func (n *NodeDrainer) handleDeadlinedNodes(nodes []string) {
	// Retrieve the set of allocations that will be force stopped.
	var forceStop []*structs.Allocation
	n.l.RLock()
	for _, node := range nodes {
		draining, ok := n.nodes[node]
		if !ok {
			n.logger.Debug("skipping untracked deadlined node", "node_id", node)
			continue
		}

		allocs, err := draining.RemainingAllocs()
		if err != nil {
			n.logger.Error("failed to retrieve allocs on deadlined node", "node_id", node, "error", err)
			continue
		}

		n.logger.Debug("node deadlined causing allocs to be force stopped", "node_id", node, "num_allocs", len(allocs))
		forceStop = append(forceStop, allocs...)
	}
	n.l.RUnlock()
	n.batchDrainAllocs(forceStop)

	// Create the node event.
	event := structs.NewNodeEvent().
		SetSubsystem(structs.NodeEventSubsystemDrain).
		SetMessage(NodeDrainEventComplete).
		AddDetail(NodeDrainEventDetailDeadlined, "true")

	// Submit the node transitions in a sharded form to ensure a reasonable
	// Raft transaction size.
	for _, nodes := range partitionIds(defaultMaxIdsPerTxn, nodes) {
		if _, err := n.raft.NodesDrainComplete(nodes, event); err != nil {
			n.logger.Error("failed to unset drain for nodes", "error", err)
		}
	}
}

// handleJobAllocDrain handles marking a set of allocations as having a
// desired transition to drain. The handler blocks until the changes to the
// allocations have occurred.
func (n *NodeDrainer) handleJobAllocDrain(req *DrainRequest) {
	index, err := n.batchDrainAllocs(req.Allocs)
	req.Resp.Respond(index, err)
}

// handleMigratedAllocs checks whether any nodes can be considered done
// draining based on the set of allocations that have migrated because of an
// ongoing drain for a job.
func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) {
	// Determine the set of nodes that were affected.
	nodes := make(map[string]struct{})
	for _, alloc := range allocs {
		nodes[alloc.NodeID] = struct{}{}
	}

	var done []string
	var remainingAllocs []*structs.Allocation

	// For each node, check if it is now done.
	n.l.RLock()
	for node := range nodes {
		draining, ok := n.nodes[node]
		if !ok {
			continue
		}

		isDone, err := draining.IsDone()
		if err != nil {
			n.logger.Error("error checking if node is done draining", "node_id", node, "error", err)
			continue
		}

		if !isDone {
			continue
		}

		done = append(done, node)

		remaining, err := draining.RemainingAllocs()
		if err != nil {
			n.logger.Error("node is done draining but encountered an error getting remaining allocs", "node_id", node, "error", err)
			continue
		}

		remainingAllocs = append(remainingAllocs, remaining...)
	}
	n.l.RUnlock()

	// Stop any running system jobs on otherwise done nodes.
	if len(remainingAllocs) > 0 {
		future := structs.NewBatchFuture()
		n.drainAllocs(future, remainingAllocs)
		if err := future.Wait(); err != nil {
			n.logger.Error("failed to drain remaining allocs from done nodes", "num_allocs", len(remainingAllocs), "error", err)
		}
	}

	// Create the node event.
	event := structs.NewNodeEvent().
		SetSubsystem(structs.NodeEventSubsystemDrain).
		SetMessage(NodeDrainEventComplete)

	// Submit the node transitions in a sharded form to ensure a reasonable
	// Raft transaction size.
	for _, nodes := range partitionIds(defaultMaxIdsPerTxn, done) {
		if _, err := n.raft.NodesDrainComplete(nodes, event); err != nil {
			n.logger.Error("failed to unset drain for nodes", "error", err)
		}
	}
}
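
// A timing sketch of the batching below, assuming the default
// BatchUpdateInterval of one second. Concurrent callers that land in the
// same window share one BatchFuture and therefore one Raft apply:
//
//	t=0.0s  batchDrainAllocs(a1) // no batch pending: creates future F, arms the timer
//	t=0.4s  batchDrainAllocs(a2) // appends to the pending batch, waits on F
//	t=1.0s  timer fires          // drainAllocs(F, [a1, a2]); both callers unblock with F's index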

// batchDrainAllocs is used to batch the draining of allocations. It will
// block until the batch is complete.
func (n *NodeDrainer) batchDrainAllocs(allocs []*structs.Allocation) (uint64, error) {
	// Add this set to the batch.
	n.batcher.Lock()
	n.batcher.updates = append(n.batcher.updates, allocs...)

	// Start a new batch if none is pending.
	future := n.batcher.updateFuture
	if future == nil {
		future = structs.NewBatchFuture()
		n.batcher.updateFuture = future
		n.batcher.updateTimer = time.AfterFunc(n.batcher.batchWindow, func() {
			// Get the pending updates.
			n.batcher.Lock()
			updates := n.batcher.updates
			future := n.batcher.updateFuture
			n.batcher.updates = nil
			n.batcher.updateFuture = nil
			n.batcher.updateTimer = nil
			n.batcher.Unlock()

			// Perform the batch update.
			n.drainAllocs(future, updates)
		})
	}
	n.batcher.Unlock()

	if err := future.Wait(); err != nil {
		return 0, err
	}

	return future.Index(), nil
}

// drainAllocs is a non-batch operation that marks the given allocations with
// the desired transition to migrate. It also creates the necessary
// evaluations for the affected jobs.
func (n *NodeDrainer) drainAllocs(future *structs.BatchFuture, allocs []*structs.Allocation) {
	// Compute the affected jobs and make the transition map.
	jobs := make(map[string]*structs.Allocation, 4)
	transitions := make(map[string]*structs.DesiredTransition, len(allocs))
	for _, alloc := range allocs {
		transitions[alloc.ID] = &structs.DesiredTransition{
			Migrate: helper.BoolToPtr(true),
		}
		jobs[alloc.JobID] = alloc
	}

	evals := make([]*structs.Evaluation, 0, len(jobs))
	now := time.Now().UTC().UnixNano()
	for job, alloc := range jobs {
		evals = append(evals, &structs.Evaluation{
			ID:          uuid.Generate(),
			Namespace:   alloc.Namespace,
			Priority:    alloc.Job.Priority,
			Type:        alloc.Job.Type,
			TriggeredBy: structs.EvalTriggerNodeDrain,
			JobID:       job,
			Status:      structs.EvalStatusPending,
			CreateTime:  now,
			ModifyTime:  now,
		})
	}

	// Commit this update via Raft.
	var finalIndex uint64
	for _, u := range partitionAllocDrain(defaultMaxIdsPerTxn, transitions, evals) {
		index, err := n.raft.AllocUpdateDesiredTransition(u.Transitions, u.Evals)
		if err != nil {
			future.Respond(0, err)
			return
		}
		finalIndex = index
	}

	future.Respond(finalIndex, nil)
}
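
// mockRaftApplier is an illustrative test shim, not part of the original
// file: a minimal RaftApplier that hands back a monotonically increasing
// index, which is one way the drainer could be exercised without a real Raft
// layer. It is not safe for concurrent use; a real shim would add a mutex.
type mockRaftApplier struct {
	index uint64
}

func (m *mockRaftApplier) AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) (uint64, error) {
	m.index++ // pretend the transitions were committed at the next index
	return m.index, nil
}

func (m *mockRaftApplier) NodesDrainComplete(nodes []string, event *structs.NodeEvent) (uint64, error) {
	m.index++ // pretend the drain-complete event was committed
	return m.index, nil
}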