gopkg.in/hashicorp/nomad.v0@v0.11.8/nomad/node_endpoint.go (about) 1 package nomad 2 3 import ( 4 "context" 5 "fmt" 6 "strings" 7 "sync" 8 "time" 9 10 "golang.org/x/sync/errgroup" 11 12 metrics "github.com/armon/go-metrics" 13 log "github.com/hashicorp/go-hclog" 14 memdb "github.com/hashicorp/go-memdb" 15 multierror "github.com/hashicorp/go-multierror" 16 vapi "github.com/hashicorp/vault/api" 17 18 "github.com/hashicorp/nomad/acl" 19 "github.com/hashicorp/nomad/helper/uuid" 20 "github.com/hashicorp/nomad/nomad/state" 21 "github.com/hashicorp/nomad/nomad/structs" 22 "github.com/hashicorp/raft" 23 "github.com/pkg/errors" 24 ) 25 26 const ( 27 // batchUpdateInterval is how long we wait to batch updates 28 batchUpdateInterval = 50 * time.Millisecond 29 30 // maxParallelRequestsPerDerive is the maximum number of parallel Vault 31 // create token requests that may be outstanding per derive request 32 maxParallelRequestsPerDerive = 16 33 34 // NodeDrainEvents are the various drain messages 35 NodeDrainEventDrainSet = "Node drain strategy set" 36 NodeDrainEventDrainDisabled = "Node drain disabled" 37 NodeDrainEventDrainUpdated = "Node drain stategy updated" 38 39 // NodeEligibilityEventEligible is used when the nodes eligiblity is marked 40 // eligible 41 NodeEligibilityEventEligible = "Node marked as eligible for scheduling" 42 43 // NodeEligibilityEventIneligible is used when the nodes eligiblity is marked 44 // ineligible 45 NodeEligibilityEventIneligible = "Node marked as ineligible for scheduling" 46 47 // NodeHeartbeatEventReregistered is the message used when the node becomes 48 // reregistered by the heartbeat. 
49 NodeHeartbeatEventReregistered = "Node reregistered by heartbeat" 50 ) 51 52 // Node endpoint is used for client interactions 53 type Node struct { 54 srv *Server 55 logger log.Logger 56 57 // ctx provides context regarding the underlying connection 58 ctx *RPCContext 59 60 // updates holds pending client status updates for allocations 61 updates []*structs.Allocation 62 63 // evals holds pending rescheduling eval updates triggered by failed allocations 64 evals []*structs.Evaluation 65 66 // updateFuture is used to wait for the pending batch update 67 // to complete. This may be nil if no batch is pending. 68 updateFuture *structs.BatchFuture 69 70 // updateTimer is the timer that will trigger the next batch 71 // update, and may be nil if there is no batch pending. 72 updateTimer *time.Timer 73 74 // updatesLock synchronizes access to the updates list, 75 // the future and the timer. 76 updatesLock sync.Mutex 77 } 78 79 // Register is used to upsert a client that is available for scheduling 80 func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUpdateResponse) error { 81 isForwarded := args.IsForwarded() 82 if done, err := n.srv.forward("Node.Register", args, args, reply); done { 83 // We have a valid node connection since there is no error from the 84 // forwarded server, so add the mapping to cache the 85 // connection and allow the server to send RPCs to the client. 
86 if err == nil && n.ctx != nil && n.ctx.NodeID == "" && !isForwarded { 87 n.ctx.NodeID = args.Node.ID 88 n.srv.addNodeConn(n.ctx) 89 } 90 91 return err 92 } 93 defer metrics.MeasureSince([]string{"nomad", "client", "register"}, time.Now()) 94 95 // Validate the arguments 96 if args.Node == nil { 97 return fmt.Errorf("missing node for client registration") 98 } 99 if args.Node.ID == "" { 100 return fmt.Errorf("missing node ID for client registration") 101 } 102 if args.Node.Datacenter == "" { 103 return fmt.Errorf("missing datacenter for client registration") 104 } 105 if args.Node.Name == "" { 106 return fmt.Errorf("missing node name for client registration") 107 } 108 if len(args.Node.Attributes) == 0 { 109 return fmt.Errorf("missing attributes for client registration") 110 } 111 if args.Node.SecretID == "" { 112 return fmt.Errorf("missing node secret ID for client registration") 113 } 114 115 // Default the status if none is given 116 if args.Node.Status == "" { 117 args.Node.Status = structs.NodeStatusInit 118 } 119 if !structs.ValidNodeStatus(args.Node.Status) { 120 return fmt.Errorf("invalid status for node") 121 } 122 123 // Default to eligible for scheduling if unset 124 if args.Node.SchedulingEligibility == "" { 125 args.Node.SchedulingEligibility = structs.NodeSchedulingEligible 126 } 127 128 // Set the timestamp when the node is registered 129 args.Node.StatusUpdatedAt = time.Now().Unix() 130 131 // Compute the node class 132 if err := args.Node.ComputeClass(); err != nil { 133 return fmt.Errorf("failed to computed node class: %v", err) 134 } 135 136 // Look for the node so we can detect a state transition 137 snap, err := n.srv.fsm.State().Snapshot() 138 if err != nil { 139 return err 140 } 141 142 ws := memdb.NewWatchSet() 143 originalNode, err := snap.NodeByID(ws, args.Node.ID) 144 if err != nil { 145 return err 146 } 147 148 // Check if the SecretID has been tampered with 149 if originalNode != nil { 150 if args.Node.SecretID != 
originalNode.SecretID && originalNode.SecretID != "" { 151 return fmt.Errorf("node secret ID does not match. Not registering node.") 152 } 153 } 154 155 // We have a valid node connection, so add the mapping to cache the 156 // connection and allow the server to send RPCs to the client. We only cache 157 // the connection if it is not being forwarded from another server. 158 if n.ctx != nil && n.ctx.NodeID == "" && !args.IsForwarded() { 159 n.ctx.NodeID = args.Node.ID 160 n.srv.addNodeConn(n.ctx) 161 } 162 163 // Commit this update via Raft 164 _, index, err := n.srv.raftApply(structs.NodeRegisterRequestType, args) 165 if err != nil { 166 n.logger.Error("register failed", "error", err) 167 return err 168 } 169 reply.NodeModifyIndex = index 170 171 // Check if we should trigger evaluations 172 originalStatus := structs.NodeStatusInit 173 if originalNode != nil { 174 originalStatus = originalNode.Status 175 } 176 transitionToReady := transitionedToReady(args.Node.Status, originalStatus) 177 if structs.ShouldDrainNode(args.Node.Status) || transitionToReady { 178 evalIDs, evalIndex, err := n.createNodeEvals(args.Node.ID, index) 179 if err != nil { 180 n.logger.Error("eval creation failed", "error", err) 181 return err 182 } 183 reply.EvalIDs = evalIDs 184 reply.EvalCreateIndex = evalIndex 185 } 186 187 // Check if we need to setup a heartbeat 188 if !args.Node.TerminalStatus() { 189 ttl, err := n.srv.resetHeartbeatTimer(args.Node.ID) 190 if err != nil { 191 n.logger.Error("heartbeat reset failed", "error", err) 192 return err 193 } 194 reply.HeartbeatTTL = ttl 195 } 196 197 // Set the reply index 198 reply.Index = index 199 snap, err = n.srv.fsm.State().Snapshot() 200 if err != nil { 201 return err 202 } 203 204 n.srv.peerLock.RLock() 205 defer n.srv.peerLock.RUnlock() 206 if err := n.constructNodeServerInfoResponse(snap, reply); err != nil { 207 n.logger.Error("failed to populate NodeUpdateResponse", "error", err) 208 return err 209 } 210 211 return nil 212 } 213 214 
// updateNodeUpdateResponse assumes the n.srv.peerLock is held for reading. 215 func (n *Node) constructNodeServerInfoResponse(snap *state.StateSnapshot, reply *structs.NodeUpdateResponse) error { 216 reply.LeaderRPCAddr = string(n.srv.raft.Leader()) 217 218 // Reply with config information required for future RPC requests 219 reply.Servers = make([]*structs.NodeServerInfo, 0, len(n.srv.localPeers)) 220 for _, v := range n.srv.localPeers { 221 reply.Servers = append(reply.Servers, 222 &structs.NodeServerInfo{ 223 RPCAdvertiseAddr: v.RPCAddr.String(), 224 RPCMajorVersion: int32(v.MajorVersion), 225 RPCMinorVersion: int32(v.MinorVersion), 226 Datacenter: v.Datacenter, 227 }) 228 } 229 230 // TODO(sean@): Use an indexed node count instead 231 // 232 // Snapshot is used only to iterate over all nodes to create a node 233 // count to send back to Nomad Clients in their heartbeat so Clients 234 // can estimate the size of the cluster. 235 ws := memdb.NewWatchSet() 236 iter, err := snap.Nodes(ws) 237 if err == nil { 238 for { 239 raw := iter.Next() 240 if raw == nil { 241 break 242 } 243 reply.NumNodes++ 244 } 245 } 246 247 return nil 248 } 249 250 // Deregister is used to remove a client from the cluster. If a client should 251 // just be made unavailable for scheduling, a status update is preferred. 
func (n *Node) Deregister(args *structs.NodeDeregisterRequest, reply *structs.NodeUpdateResponse) error {
	if done, err := n.srv.forward("Node.Deregister", args, args, reply); done {
		return err
	}
	defer metrics.MeasureSince([]string{"nomad", "client", "deregister"}, time.Now())

	if args.NodeID == "" {
		return fmt.Errorf("missing node ID for client deregistration")
	}

	// deregister takes a batch, so wrap the single node ID in a batch request.
	// The batch form is only used for the shared checks and bookkeeping; the
	// Raft message below is still the original single-node request.
	repack := &structs.NodeBatchDeregisterRequest{
		NodeIDs:      []string{args.NodeID},
		WriteRequest: args.WriteRequest,
	}

	return n.deregister(repack, reply, func() (interface{}, uint64, error) {
		return n.srv.raftApply(structs.NodeDeregisterRequestType, args)
	})
}

// BatchDeregister is used to remove client nodes from the cluster.
// It rejects an empty NodeIDs list and otherwise delegates to deregister with
// a closure that applies the batch request via Raft.
func (n *Node) BatchDeregister(args *structs.NodeBatchDeregisterRequest, reply *structs.NodeUpdateResponse) error {
	if done, err := n.srv.forward("Node.BatchDeregister", args, args, reply); done {
		return err
	}
	defer metrics.MeasureSince([]string{"nomad", "client", "batch_deregister"}, time.Now())

	if len(args.NodeIDs) == 0 {
		return fmt.Errorf("missing node IDs for client deregistration")
	}

	return n.deregister(args, reply, func() (interface{}, uint64, error) {
		return n.srv.raftApply(structs.NodeBatchDeregisterRequestType, args)
	})
}

// deregister takes a raftMessage closure, to support both Deregister and BatchDeregister.
//
// It checks node-write permission, verifies every node in args.NodeIDs exists
// in a state snapshot, runs raftApplyFn, and then for each node clears the
// heartbeat timer, creates node evaluations and revokes any Vault and SI
// token accessors recorded for that node. reply receives the collected eval
// IDs, the first eval create index, and the Raft index.
func (n *Node) deregister(args *structs.NodeBatchDeregisterRequest,
	reply *structs.NodeUpdateResponse,
	raftApplyFn func() (interface{}, uint64, error),
) error {
	// Check request permissions. A nil aclObj is let through without a
	// capability check.
	if aclObj, err := n.srv.ResolveToken(args.AuthToken); err != nil {
		return err
	} else if aclObj != nil && !aclObj.AllowNodeWrite() {
		return structs.ErrPermissionDenied
	}

	// Look for the node
	snap, err := n.srv.fsm.State().Snapshot()
	if err != nil {
		return err
	}

	// Every requested node must exist; nothing is applied if any is missing.
	ws := memdb.NewWatchSet()
	for _, nodeID := range args.NodeIDs {
		node, err := snap.NodeByID(ws, nodeID)
		if err != nil {
			return err
		}
		if node == nil {
			return fmt.Errorf("node not found")
		}
	}

	// Commit this update via Raft
	_, index, err := raftApplyFn()
	if err != nil {
		n.logger.Error("raft message failed", "error", err)
		return err
	}

	// Per-node cleanup. Note the accessor lookups below read from snap, which
	// was taken before the Raft apply. Any error here returns immediately,
	// leaving later nodes in the list unprocessed.
	for _, nodeID := range args.NodeIDs {
		// Clear the heartbeat timer if any
		n.srv.clearHeartbeatTimer(nodeID)

		// Create the evaluations for this node
		evalIDs, evalIndex, err := n.createNodeEvals(nodeID, index)
		if err != nil {
			n.logger.Error("eval creation failed", "error", err)
			return err
		}

		// Determine if there are any Vault accessors on the node
		if accessors, err := snap.VaultAccessorsByNode(ws, nodeID); err != nil {
			n.logger.Error("looking up vault accessors for node failed", "node_id", nodeID, "error", err)
			return err
		} else if l := len(accessors); l > 0 {
			n.logger.Debug("revoking vault accessors on node due to deregister", "num_accessors", l, "node_id", nodeID)
			if err := n.srv.vault.RevokeTokens(context.Background(), accessors, true); err != nil {
				n.logger.Error("revoking vault accessors for node failed", "node_id", nodeID, "error", err)
				return err
			}
		}

		// Determine if there are any SI token accessors on the node
		if accessors, err := snap.SITokenAccessorsByNode(ws, nodeID); err != nil {
			n.logger.Error("looking up si accessors for node failed", "node_id", nodeID, "error", err)
			return err
		} else if l := len(accessors); l > 0 {
			n.logger.Debug("revoking si accessors on node due to deregister", "num_accessors", l, "node_id", nodeID)
			// Unlike with the Vault integration, there's no error returned here, since
			// bootstrapping the Consul client is elsewhere. Errors in revocation trigger
			// background retry attempts rather than inline error handling.
			_ = n.srv.consulACLs.RevokeTokens(context.Background(), accessors, true)
		}

		reply.EvalIDs = append(reply.EvalIDs, evalIDs...)
		// Set the reply eval create index just the first time
		if reply.EvalCreateIndex == 0 {
			reply.EvalCreateIndex = evalIndex
		}
	}

	reply.NodeModifyIndex = index
	reply.Index = index
	return nil
}

// UpdateStatus is used to update the status of a client node.
//
// The new status is only committed via Raft when it differs from the stored
// status. Node evaluations are created when the new status requires draining
// or the node transitioned to ready. A node going down has its Vault and SI
// token accessors revoked; any other status resets the heartbeat timer and
// returns the TTL in reply.
func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *structs.NodeUpdateResponse) error {
	// Capture the forwarded flag before calling forward.
	isForwarded := args.IsForwarded()
	if done, err := n.srv.forward("Node.UpdateStatus", args, args, reply); done {
		// We have a valid node connection since there is no error from the
		// forwarded server, so add the mapping to cache the
		// connection and allow the server to send RPCs to the client.
		if err == nil && n.ctx != nil && n.ctx.NodeID == "" && !isForwarded {
			n.ctx.NodeID = args.NodeID
			n.srv.addNodeConn(n.ctx)
		}

		return err
	}
	defer metrics.MeasureSince([]string{"nomad", "client", "update_status"}, time.Now())

	// Verify the arguments
	if args.NodeID == "" {
		return fmt.Errorf("missing node ID for client status update")
	}
	if !structs.ValidNodeStatus(args.Status) {
		return fmt.Errorf("invalid status for node")
	}

	// Look for the node
	snap, err := n.srv.fsm.State().Snapshot()
	if err != nil {
		return err
	}

	ws := memdb.NewWatchSet()
	node, err := snap.NodeByID(ws, args.NodeID)
	if err != nil {
		return err
	}
	if node == nil {
		return fmt.Errorf("node not found")
	}

	// We have a valid node connection, so add the mapping to cache the
	// connection and allow the server to send RPCs to the client. We only cache
	// the connection if it is not being forwarded from another server.
	if n.ctx != nil && n.ctx.NodeID == "" && !args.IsForwarded() {
		n.ctx.NodeID = args.NodeID
		n.srv.addNodeConn(n.ctx)
	}

	// XXX: Could use the SecretID here but have to update the heartbeat system
	// to track SecretIDs.

	// Update the timestamp of when the node status was updated
	args.UpdatedAt = time.Now().Unix()

	// Commit this update via Raft. index stays zero when the status is
	// unchanged, and that zero value is what the eval creation and reply.Index
	// below then see.
	var index uint64
	if node.Status != args.Status {
		// Attach an event if we are updating the node status to ready when it
		// is down via a heartbeat
		if node.Status == structs.NodeStatusDown && args.NodeEvent == nil {
			args.NodeEvent = structs.NewNodeEvent().
				SetSubsystem(structs.NodeEventSubsystemCluster).
				SetMessage(NodeHeartbeatEventReregistered)
		}

		_, index, err = n.srv.raftApply(structs.NodeUpdateStatusRequestType, args)
		if err != nil {
			n.logger.Error("status update failed", "error", err)
			return err
		}
		reply.NodeModifyIndex = index
	}

	// Check if we should trigger evaluations
	transitionToReady := transitionedToReady(args.Status, node.Status)
	if structs.ShouldDrainNode(args.Status) || transitionToReady {
		evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index)
		if err != nil {
			n.logger.Error("eval creation failed", "error", err)
			return err
		}
		reply.EvalIDs = evalIDs
		reply.EvalCreateIndex = evalIndex
	}

	// A down node gets its token accessors cleaned up; every other status
	// gets its heartbeat timer reset.
	switch args.Status {
	case structs.NodeStatusDown:
		// Determine if there are any Vault accessors on the node to cleanup.
		// These lookups read the server's current state rather than snap.
		if accessors, err := n.srv.State().VaultAccessorsByNode(ws, args.NodeID); err != nil {
			n.logger.Error("looking up vault accessors for node failed", "node_id", args.NodeID, "error", err)
			return err
		} else if l := len(accessors); l > 0 {
			n.logger.Debug("revoking vault accessors on node due to down state", "num_accessors", l, "node_id", args.NodeID)
			if err := n.srv.vault.RevokeTokens(context.Background(), accessors, true); err != nil {
				n.logger.Error("revoking vault accessors for node failed", "node_id", args.NodeID, "error", err)
				return err
			}
		}

		// Determine if there are any SI token accessors on the node to cleanup
		if accessors, err := n.srv.State().SITokenAccessorsByNode(ws, args.NodeID); err != nil {
			n.logger.Error("looking up SI accessors for node failed", "node_id", args.NodeID, "error", err)
			return err
		} else if l := len(accessors); l > 0 {
			n.logger.Debug("revoking SI accessors on node due to down state", "num_accessors", l, "node_id", args.NodeID)
			// The result is intentionally discarded, as in deregister.
			_ = n.srv.consulACLs.RevokeTokens(context.Background(), accessors, true)
		}
	default:
		ttl, err := n.srv.resetHeartbeatTimer(args.NodeID)
		if err != nil {
			n.logger.Error("heartbeat reset failed", "error", err)
			return err
		}
		reply.HeartbeatTTL = ttl
	}

	// Set the reply index and leader
	reply.Index = index
	n.srv.peerLock.RLock()
	defer n.srv.peerLock.RUnlock()
	if err := n.constructNodeServerInfoResponse(snap, reply); err != nil {
		n.logger.Error("failed to populate NodeUpdateResponse", "error", err)
		return err
	}

	return nil
}

// transitionedToReady is a helper that takes a node's new and old status and
// returns whether it has transitioned to ready.
502 func transitionedToReady(newStatus, oldStatus string) bool { 503 initToReady := oldStatus == structs.NodeStatusInit && newStatus == structs.NodeStatusReady 504 terminalToReady := oldStatus == structs.NodeStatusDown && newStatus == structs.NodeStatusReady 505 return initToReady || terminalToReady 506 } 507 508 // UpdateDrain is used to update the drain mode of a client node 509 func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest, 510 reply *structs.NodeDrainUpdateResponse) error { 511 if done, err := n.srv.forward("Node.UpdateDrain", args, args, reply); done { 512 return err 513 } 514 defer metrics.MeasureSince([]string{"nomad", "client", "update_drain"}, time.Now()) 515 516 // Check node write permissions 517 if aclObj, err := n.srv.ResolveToken(args.AuthToken); err != nil { 518 return err 519 } else if aclObj != nil && !aclObj.AllowNodeWrite() { 520 return structs.ErrPermissionDenied 521 } 522 523 // Verify the arguments 524 if args.NodeID == "" { 525 return fmt.Errorf("missing node ID for drain update") 526 } 527 if args.NodeEvent != nil { 528 return fmt.Errorf("node event must not be set") 529 } 530 531 // Look for the node 532 snap, err := n.srv.fsm.State().Snapshot() 533 if err != nil { 534 return err 535 } 536 node, err := snap.NodeByID(nil, args.NodeID) 537 if err != nil { 538 return err 539 } 540 if node == nil { 541 return fmt.Errorf("node not found") 542 } 543 544 now := time.Now().UTC() 545 546 // Update the timestamp of when the node status was updated 547 args.UpdatedAt = now.Unix() 548 549 // COMPAT: Remove in 0.9. Attempt to upgrade the request if it is of the old 550 // format. 
551 if args.Drain && args.DrainStrategy == nil { 552 args.DrainStrategy = &structs.DrainStrategy{ 553 DrainSpec: structs.DrainSpec{ 554 Deadline: -1 * time.Second, // Force drain 555 }, 556 } 557 } 558 559 // Setup drain strategy 560 if args.DrainStrategy != nil { 561 // Mark start time for the drain 562 if node.DrainStrategy == nil { 563 args.DrainStrategy.StartedAt = now 564 } else { 565 args.DrainStrategy.StartedAt = node.DrainStrategy.StartedAt 566 } 567 568 // Mark the deadline time 569 if args.DrainStrategy.Deadline.Nanoseconds() > 0 { 570 args.DrainStrategy.ForceDeadline = now.Add(args.DrainStrategy.Deadline) 571 } 572 } 573 574 // Construct the node event 575 args.NodeEvent = structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemDrain) 576 if node.DrainStrategy == nil && args.DrainStrategy != nil { 577 args.NodeEvent.SetMessage(NodeDrainEventDrainSet) 578 } else if node.DrainStrategy != nil && args.DrainStrategy != nil { 579 args.NodeEvent.SetMessage(NodeDrainEventDrainUpdated) 580 } else if node.DrainStrategy != nil && args.DrainStrategy == nil { 581 args.NodeEvent.SetMessage(NodeDrainEventDrainDisabled) 582 } else { 583 args.NodeEvent = nil 584 } 585 586 // Commit this update via Raft 587 _, index, err := n.srv.raftApply(structs.NodeUpdateDrainRequestType, args) 588 if err != nil { 589 n.logger.Error("drain update failed", "error", err) 590 return err 591 } 592 reply.NodeModifyIndex = index 593 594 // If the node is transitioning to be eligible, create Node evaluations 595 // because there may be a System job registered that should be evaluated. 
596 if node.SchedulingEligibility == structs.NodeSchedulingIneligible && args.MarkEligible && args.DrainStrategy == nil { 597 evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) 598 if err != nil { 599 n.logger.Error("eval creation failed", "error", err) 600 return err 601 } 602 reply.EvalIDs = evalIDs 603 reply.EvalCreateIndex = evalIndex 604 } 605 606 // Set the reply index 607 reply.Index = index 608 return nil 609 } 610 611 // UpdateEligibility is used to update the scheduling eligibility of a node 612 func (n *Node) UpdateEligibility(args *structs.NodeUpdateEligibilityRequest, 613 reply *structs.NodeEligibilityUpdateResponse) error { 614 if done, err := n.srv.forward("Node.UpdateEligibility", args, args, reply); done { 615 return err 616 } 617 defer metrics.MeasureSince([]string{"nomad", "client", "update_eligibility"}, time.Now()) 618 619 // Check node write permissions 620 if aclObj, err := n.srv.ResolveToken(args.AuthToken); err != nil { 621 return err 622 } else if aclObj != nil && !aclObj.AllowNodeWrite() { 623 return structs.ErrPermissionDenied 624 } 625 626 // Verify the arguments 627 if args.NodeID == "" { 628 return fmt.Errorf("missing node ID for setting scheduling eligibility") 629 } 630 if args.NodeEvent != nil { 631 return fmt.Errorf("node event must not be set") 632 } 633 634 // Check that only allowed types are set 635 switch args.Eligibility { 636 case structs.NodeSchedulingEligible, structs.NodeSchedulingIneligible: 637 default: 638 return fmt.Errorf("invalid scheduling eligibility %q", args.Eligibility) 639 } 640 641 // Look for the node 642 snap, err := n.srv.fsm.State().Snapshot() 643 if err != nil { 644 return err 645 } 646 node, err := snap.NodeByID(nil, args.NodeID) 647 if err != nil { 648 return err 649 } 650 if node == nil { 651 return fmt.Errorf("node not found") 652 } 653 654 if node.DrainStrategy != nil && args.Eligibility == structs.NodeSchedulingEligible { 655 return fmt.Errorf("can not set node's scheduling 
eligibility to eligible while it is draining") 656 } 657 658 switch args.Eligibility { 659 case structs.NodeSchedulingEligible, structs.NodeSchedulingIneligible: 660 default: 661 return fmt.Errorf("invalid scheduling eligibility %q", args.Eligibility) 662 } 663 664 // Update the timestamp of when the node status was updated 665 args.UpdatedAt = time.Now().Unix() 666 667 // Construct the node event 668 args.NodeEvent = structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster) 669 if node.SchedulingEligibility == args.Eligibility { 670 return nil // Nothing to do 671 } else if args.Eligibility == structs.NodeSchedulingEligible { 672 args.NodeEvent.SetMessage(NodeEligibilityEventEligible) 673 } else { 674 args.NodeEvent.SetMessage(NodeEligibilityEventIneligible) 675 } 676 677 // Commit this update via Raft 678 outErr, index, err := n.srv.raftApply(structs.NodeUpdateEligibilityRequestType, args) 679 if err != nil { 680 n.logger.Error("eligibility update failed", "error", err) 681 return err 682 } 683 if outErr != nil { 684 if err, ok := outErr.(error); ok && err != nil { 685 n.logger.Error("eligibility update failed", "error", err) 686 return err 687 } 688 } 689 690 // If the node is transitioning to be eligible, create Node evaluations 691 // because there may be a System job registered that should be evaluated. 
692 if node.SchedulingEligibility == structs.NodeSchedulingIneligible && args.Eligibility == structs.NodeSchedulingEligible { 693 evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) 694 if err != nil { 695 n.logger.Error("eval creation failed", "error", err) 696 return err 697 } 698 reply.EvalIDs = evalIDs 699 reply.EvalCreateIndex = evalIndex 700 } 701 702 // Set the reply index 703 reply.Index = index 704 return nil 705 } 706 707 // Evaluate is used to force a re-evaluation of the node 708 func (n *Node) Evaluate(args *structs.NodeEvaluateRequest, reply *structs.NodeUpdateResponse) error { 709 if done, err := n.srv.forward("Node.Evaluate", args, args, reply); done { 710 return err 711 } 712 defer metrics.MeasureSince([]string{"nomad", "client", "evaluate"}, time.Now()) 713 714 // Check node write permissions 715 if aclObj, err := n.srv.ResolveToken(args.AuthToken); err != nil { 716 return err 717 } else if aclObj != nil && !aclObj.AllowNodeWrite() { 718 return structs.ErrPermissionDenied 719 } 720 721 // Verify the arguments 722 if args.NodeID == "" { 723 return fmt.Errorf("missing node ID for evaluation") 724 } 725 726 // Look for the node 727 snap, err := n.srv.fsm.State().Snapshot() 728 if err != nil { 729 return err 730 } 731 ws := memdb.NewWatchSet() 732 node, err := snap.NodeByID(ws, args.NodeID) 733 if err != nil { 734 return err 735 } 736 if node == nil { 737 return fmt.Errorf("node not found") 738 } 739 740 // Create the evaluation 741 evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, node.ModifyIndex) 742 if err != nil { 743 n.logger.Error("eval creation failed", "error", err) 744 return err 745 } 746 reply.EvalIDs = evalIDs 747 reply.EvalCreateIndex = evalIndex 748 749 // Set the reply index 750 reply.Index = evalIndex 751 752 n.srv.peerLock.RLock() 753 defer n.srv.peerLock.RUnlock() 754 if err := n.constructNodeServerInfoResponse(snap, reply); err != nil { 755 n.logger.Error("failed to populate NodeUpdateResponse", "error", err) 
756 return err 757 } 758 return nil 759 } 760 761 // GetNode is used to request information about a specific node 762 func (n *Node) GetNode(args *structs.NodeSpecificRequest, 763 reply *structs.SingleNodeResponse) error { 764 if done, err := n.srv.forward("Node.GetNode", args, args, reply); done { 765 return err 766 } 767 defer metrics.MeasureSince([]string{"nomad", "client", "get_node"}, time.Now()) 768 769 // Check node read permissions 770 if aclObj, err := n.srv.ResolveToken(args.AuthToken); err != nil { 771 // If ResolveToken had an unexpected error return that 772 if err != structs.ErrTokenNotFound { 773 return err 774 } 775 776 // Attempt to lookup AuthToken as a Node.SecretID since nodes 777 // call this endpoint and don't have an ACL token. 778 node, stateErr := n.srv.fsm.State().NodeBySecretID(nil, args.AuthToken) 779 if stateErr != nil { 780 // Return the original ResolveToken error with this err 781 var merr multierror.Error 782 merr.Errors = append(merr.Errors, err, stateErr) 783 return merr.ErrorOrNil() 784 } 785 786 // Not a node or a valid ACL token 787 if node == nil { 788 return structs.ErrTokenNotFound 789 } 790 } else if aclObj != nil && !aclObj.AllowNodeRead() { 791 return structs.ErrPermissionDenied 792 } 793 794 // Setup the blocking query 795 opts := blockingOptions{ 796 queryOpts: &args.QueryOptions, 797 queryMeta: &reply.QueryMeta, 798 run: func(ws memdb.WatchSet, state *state.StateStore) error { 799 // Verify the arguments 800 if args.NodeID == "" { 801 return fmt.Errorf("missing node ID") 802 } 803 804 // Look for the node 805 out, err := state.NodeByID(ws, args.NodeID) 806 if err != nil { 807 return err 808 } 809 810 // Setup the output 811 if out != nil { 812 // Clear the secret ID 813 reply.Node = out.Copy() 814 reply.Node.SecretID = "" 815 reply.Index = out.ModifyIndex 816 } else { 817 // Use the last index that affected the nodes table 818 index, err := state.Index("nodes") 819 if err != nil { 820 return err 821 } 822 reply.Node = 
nil
				reply.Index = index
			}

			// Set the query response
			n.srv.setQueryMeta(&reply.QueryMeta)
			return nil
		}}
	return n.srv.blockingRPC(&opts)
}

// GetAllocs is used to request allocations for a specific node. The caller
// needs node read permission; allocations in namespaces for which the token
// lacks the read-job capability are filtered out of the response.
func (n *Node) GetAllocs(args *structs.NodeSpecificRequest,
	reply *structs.NodeAllocsResponse) error {
	if done, err := n.srv.forward("Node.GetAllocs", args, args, reply); done {
		return err
	}
	defer metrics.MeasureSince([]string{"nomad", "client", "get_allocs"}, time.Now())

	// Check node read and namespace job read permissions
	aclObj, err := n.srv.ResolveToken(args.AuthToken)
	if err != nil {
		return err
	}
	if aclObj != nil && !aclObj.AllowNodeRead() {
		return structs.ErrPermissionDenied
	}

	// cache namespace perms
	readableNamespaces := map[string]bool{}

	// readNS is a caching namespace read-job helper
	readNS := func(ns string) bool {
		if aclObj == nil {
			// ACLs are disabled; everything is readable
			return true
		}

		if readable, ok := readableNamespaces[ns]; ok {
			// cache hit
			return readable
		}

		// cache miss
		readable := aclObj.AllowNsOp(ns, acl.NamespaceCapabilityReadJob)
		readableNamespaces[ns] = readable
		return readable
	}

	// Verify the arguments
	if args.NodeID == "" {
		return fmt.Errorf("missing node ID")
	}

	// Setup the blocking query
	opts := blockingOptions{
		queryOpts: &args.QueryOptions,
		queryMeta: &reply.QueryMeta,
		run: func(ws memdb.WatchSet, state *state.StateStore) error {
			// Look for the allocations placed on the node
			allocs, err := state.AllocsByNode(ws, args.NodeID)
			if err != nil {
				return err
			}

			// Setup the output
			if numAllocs := len(allocs); numAllocs != 0 {
				reply.Allocs = make([]*structs.Allocation, 0, numAllocs)
				for _, alloc := range allocs {
					if readNS(alloc.Namespace) {
						reply.Allocs = append(reply.Allocs, alloc)
					}

					// Get the max of all allocs since subsequent requests
					// need to start from the latest index. Filtered
					// allocations still count towards the index.
					reply.Index = maxUint64(reply.Index, alloc.ModifyIndex)
				}
			} else {
				reply.Allocs = nil

				// Use the last index that affected the allocs table
				index, err := state.Index("allocs")
				if err != nil {
					return err
				}

				// Must provide non-zero index to prevent blocking
				// Index 1 is impossible anyways (due to Raft internals)
				if index == 0 {
					reply.Index = 1
				} else {
					reply.Index = index
				}
			}
			return nil
		}}
	return n.srv.blockingRPC(&opts)
}

// GetClientAllocs is used to request a lightweight list of alloc modify indexes
// per allocation. The request is authenticated with the node's secret ID, and
// the response also carries migrate tokens for allocations that must fetch
// data from a previous allocation on a different node.
func (n *Node) GetClientAllocs(args *structs.NodeSpecificRequest,
	reply *structs.NodeClientAllocsResponse) error {
	isForwarded := args.IsForwarded()
	if done, err := n.srv.forward("Node.GetClientAllocs", args, args, reply); done {
		// We have a valid node connection since there is no error from the
		// forwarded server, so add the mapping to cache the
		// connection and allow the server to send RPCs to the client.
		if err == nil && n.ctx != nil && n.ctx.NodeID == "" && !isForwarded {
			n.ctx.NodeID = args.NodeID
			n.srv.addNodeConn(n.ctx)
		}

		return err
	}
	defer metrics.MeasureSince([]string{"nomad", "client", "get_client_allocs"}, time.Now())

	// Verify the arguments
	if args.NodeID == "" {
		return fmt.Errorf("missing node ID")
	}

	// numOldAllocs is used to detect if there is a garbage collection event
	// that affects the node. When an allocation is garbage collected, the
	// modify indexes of the remaining allocations do not change and thus the
	// query won't unblock, even though the set of allocations on the node has
	// changed.
	var numOldAllocs int

	// Setup the blocking query
	opts := blockingOptions{
		queryOpts: &args.QueryOptions,
		queryMeta: &reply.QueryMeta,
		run: func(ws memdb.WatchSet, state *state.StateStore) error {
			// Look for the node
			node, err := state.NodeByID(ws, args.NodeID)
			if err != nil {
				return err
			}

			var allocs []*structs.Allocation
			if node != nil {
				if args.SecretID == "" {
					return fmt.Errorf("missing node secret ID for client status update")
				} else if args.SecretID != node.SecretID {
					return fmt.Errorf("node secret ID does not match")
				}

				// We have a valid node connection, so add the mapping to cache the
				// connection and allow the server to send RPCs to the client. We only cache
				// the connection if it is not being forwarded from another server.
				if n.ctx != nil && n.ctx.NodeID == "" && !args.IsForwarded() {
					n.ctx.NodeID = args.NodeID
					n.srv.addNodeConn(n.ctx)
				}

				var err error
				allocs, err = state.AllocsByNode(ws, args.NodeID)
				if err != nil {
					return err
				}
			}

			reply.Allocs = make(map[string]uint64)
			reply.MigrateTokens = make(map[string]string)

			// preferTableIndex is used to determine whether we should build the
			// response index based on the full table indexes versus the modify
			// indexes of the allocations on the specific node. This is
			// preferred in the case that the node doesn't yet have allocations
			// or when we detect a GC that affects the node.
			preferTableIndex := true

			// Setup the output
			if numAllocs := len(allocs); numAllocs != 0 {
				preferTableIndex = false

				for _, alloc := range allocs {
					reply.Allocs[alloc.ID] = alloc.AllocModifyIndex

					// Account for this allocation in the response index before
					// any of the skips below, so that an allocation without a
					// migrate token still advances the index.
					reply.Index = maxUint64(reply.Index, alloc.ModifyIndex)

					// If the allocation is going to do a migration, create a
					// migration token so that the client can authenticate with
					// the node hosting the previous allocation.
					if !alloc.ShouldMigrate() {
						continue
					}

					prevAllocation, err := state.AllocByID(ws, alloc.PreviousAllocation)
					if err != nil {
						return err
					}
					if prevAllocation == nil || prevAllocation.NodeID == alloc.NodeID {
						continue
					}

					allocNode, err := state.NodeByID(ws, prevAllocation.NodeID)
					if err != nil {
						return err
					}
					if allocNode == nil {
						// Node must have been GC'd so skip the token
						continue
					}

					token, err := structs.GenerateMigrateToken(prevAllocation.ID, allocNode.SecretID)
					if err != nil {
						return err
					}
					reply.MigrateTokens[alloc.ID] = token
				}

				// Determine if we have fewer allocations than before. This
				// indicates there was a garbage collection
				if numAllocs < numOldAllocs {
					preferTableIndex = true
				}

				// Store the new number of allocations
				numOldAllocs = numAllocs
			}

			if preferTableIndex {
				// Use the last index that affected the allocs table
				index, err := state.Index("allocs")
				if err != nil {
					return err
				}

				// Must provide non-zero index to prevent blocking
				// Index 1 is impossible anyways (due to Raft internals)
				if index == 0 {
					reply.Index = 1
				} else {
					reply.Index = index
				}
			}
			return nil
		}}
	return n.srv.blockingRPC(&opts)
}

// UpdateAlloc is used to update the client status of an allocation. Updates
// are not applied immediately: they are appended to a pending batch that is
// committed by batchUpdate after batchUpdateInterval, and the call blocks until
// that batch has been applied. For terminal allocations it also releases CSI
// volume claims and queues a rescheduling evaluation when the allocation
// failed and is eligible for rescheduling.
func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.GenericResponse) error {
	if done, err := n.srv.forward("Node.UpdateAlloc", args, args, reply); done {
		return err
	}
	defer metrics.MeasureSince([]string{"nomad", "client", "update_alloc"}, time.Now())

	// Ensure at least a single alloc
	if len(args.Alloc) == 0 {
		return fmt.Errorf("must update at least one allocation")
	}

	// Ensure that evals aren't set from client RPCs
	// We create them here before the raft update
	if len(args.Evals) != 0 {
		return fmt.Errorf("evals field must not be set")
	}

	// Update modified timestamp for client initiated allocation updates
	now := time.Now()
	var evals []*structs.Evaluation

	// A set of de-duplicated volumes that need their volume claims released.
	// Later we'll apply this via raft.
	volumesToGC := newCSIBatchRelease(n.srv, n.logger, 100)

	for _, allocToUpdate := range args.Alloc {
		allocToUpdate.ModifyTime = now.UTC().UnixNano()

		if !allocToUpdate.TerminalStatus() {
			continue
		}

		// The lookup is best-effort: an allocation that cannot be loaded is
		// still part of the batch, it just gets no volume release or
		// follow-up evaluation.
		alloc, err := n.srv.State().AllocByID(nil, allocToUpdate.ID)
		if err != nil {
			n.logger.Debug("UpdateAlloc unable to find alloc", "alloc_id", allocToUpdate.ID, "error", err)
			continue
		}
		if alloc == nil {
			continue
		}

		// if the job has been purged, this will always return error
		job, err := n.srv.State().JobByID(nil, alloc.Namespace, alloc.JobID)
		if err != nil {
			n.logger.Debug("UpdateAlloc unable to find job", "job", alloc.JobID, "error", err)
			continue
		}
		if job == nil {
			n.logger.Debug("UpdateAlloc unable to find job", "job", alloc.JobID)
			continue
		}

		taskGroup := job.LookupTaskGroup(alloc.TaskGroup)
		if taskGroup == nil {
			continue
		}

		// If the terminal alloc has CSI volumes, add the volumes to the batch
		// of volumes we'll release the claims of.
		for _, vol := range taskGroup.Volumes {
			if vol.Type == structs.VolumeTypeCSI {
				volumesToGC.add(vol.Source, alloc.Namespace)
			}
		}

		// Add an evaluation if this is a failed alloc that is eligible for rescheduling
		if allocToUpdate.ClientStatus == structs.AllocClientStatusFailed &&
			alloc.FollowupEvalID == "" &&
			alloc.RescheduleEligible(taskGroup.ReschedulePolicy, now) {
			eval := &structs.Evaluation{
				ID:          uuid.Generate(),
				Namespace:   alloc.Namespace,
				TriggeredBy: structs.EvalTriggerRetryFailedAlloc,
				JobID:       alloc.JobID,
				Type:        job.Type,
				Priority:    job.Priority,
				Status:      structs.EvalStatusPending,
				CreateTime:  now.UTC().UnixNano(),
				ModifyTime:  now.UTC().UnixNano(),
			}
			evals = append(evals, eval)
		}
	}

	// Make a raft apply to release the CSI volume claims of terminal allocs.
	// A failure here is reported to the caller but does not stop the
	// allocation updates from being batched.
	var result *multierror.Error
	err := volumesToGC.apply()
	if err != nil {
		result = multierror.Append(result, err)
	}

	// Add this to the batch
	n.updatesLock.Lock()
	n.updates = append(n.updates, args.Alloc...)
	n.evals = append(n.evals, evals...)

	// Start a new batch if none
	future := n.updateFuture
	if future == nil {
		future = structs.NewBatchFuture()
		n.updateFuture = future
		n.updateTimer = time.AfterFunc(batchUpdateInterval, func() {
			// Get the pending updates
			n.updatesLock.Lock()
			updates := n.updates
			evals := n.evals
			future := n.updateFuture
			n.updates = nil
			n.evals = nil
			n.updateFuture = nil
			n.updateTimer = nil
			n.updatesLock.Unlock()

			// Perform the batch update
			n.batchUpdate(future, updates, evals)
		})
	}
	n.updatesLock.Unlock()

	// Wait for the future
	if err := future.Wait(); err != nil {
		result = multierror.Append(result, err)
		return result.ErrorOrNil()
	}

	// Setup the response
	reply.Index = future.Index()
	return result.ErrorOrNil()
}

// batchUpdate is used to update all the allocations. It commits the batched
// allocation updates and de-duplicated evaluations through Raft, revokes the
// Vault and Service Identity token accessors of allocations that terminated,
// and finally responds to the future with the Raft index and any accumulated
// errors.
func (n *Node) batchUpdate(future *structs.BatchFuture, updates []*structs.Allocation, evals []*structs.Evaluation) {
	// Group pending evals by jobID to prevent creating unnecessary evals
	evalsByJobId := make(map[structs.NamespacedID]struct{})
	var trimmedEvals []*structs.Evaluation
	for _, eval := range evals {
		namespacedID := structs.NamespacedID{
			ID:        eval.JobID,
			Namespace: eval.Namespace,
		}
		_, exists := evalsByJobId[namespacedID]
		if !exists {
			now := time.Now().UTC().UnixNano()
			eval.CreateTime = now
			eval.ModifyTime = now
			trimmedEvals = append(trimmedEvals, eval)
			evalsByJobId[namespacedID] = struct{}{}
		}
	}

	if len(trimmedEvals) > 0 {
		n.logger.Debug("adding evaluations for rescheduling failed allocations", "num_evals", len(trimmedEvals))
	}
	// Prepare the batch update
	batch := &structs.AllocUpdateRequest{
		Alloc:        updates,
		Evals:        trimmedEvals,
		WriteRequest: structs.WriteRequest{Region: n.srv.config.Region},
	}

	// Commit this update via Raft
	var mErr multierror.Error
	_, index, err := n.srv.raftApply(structs.AllocClientUpdateRequestType, batch)
	if err != nil {
		n.logger.Error("alloc update failed", "error", err)
		mErr.Errors = append(mErr.Errors, err)
	}

	// For each allocation we are updating, check if we should revoke any
	// - Vault token accessors
	// - Service Identity token accessors
	var (
		revokeVault []*structs.VaultAccessor
		revokeSI    []*structs.SITokenAccessor
	)

	for _, alloc := range updates {
		// Skip any allocation that isn't dead on the client
		if !alloc.Terminated() {
			continue
		}

		ws := memdb.NewWatchSet()

		// Determine if there are any orphaned Vault accessors for the allocation
		if accessors, err := n.srv.State().VaultAccessorsByAlloc(ws, alloc.ID); err != nil {
			n.logger.Error("looking up vault accessors for alloc failed", "alloc_id", alloc.ID, "error", err)
			mErr.Errors = append(mErr.Errors, err)
		} else {
			revokeVault = append(revokeVault, accessors...)
		}

		// Determine if there are any orphaned SI accessors for the allocation
		if accessors, err := n.srv.State().SITokenAccessorsByAlloc(ws, alloc.ID); err != nil {
			n.logger.Error("looking up si accessors for alloc failed", "alloc_id", alloc.ID, "error", err)
			mErr.Errors = append(mErr.Errors, err)
		} else {
			revokeSI = append(revokeSI, accessors...)
		}
	}

	// Revoke any orphaned Vault token accessors
	if l := len(revokeVault); l > 0 {
		n.logger.Debug("revoking vault accessors due to terminal allocations", "num_accessors", l)
		if err := n.srv.vault.RevokeTokens(context.Background(), revokeVault, true); err != nil {
			n.logger.Error("batched vault accessor revocation failed", "error", err)
			mErr.Errors = append(mErr.Errors, err)
		}
	}

	// Revoke any orphaned SI token accessors
	if l := len(revokeSI); l > 0 {
		n.logger.Debug("revoking si accessors due to terminal allocations", "num_accessors", l)
		// NOTE(review): the result is deliberately discarded, so a failed SI
		// revocation never fails the batch; presumably the Consul ACL client
		// retries in the background - confirm against consulACLs.RevokeTokens.
		_ = n.srv.consulACLs.RevokeTokens(context.Background(), revokeSI, true)
	}

	// Respond to the future
	future.Respond(index, mErr.ErrorOrNil())
}

// List is used to list the available nodes, optionally restricted to the node
// IDs matching the prefix given in the query options.
func (n *Node) List(args *structs.NodeListRequest,
	reply *structs.NodeListResponse) error {
	if done, err := n.srv.forward("Node.List", args, args, reply); done {
		return err
	}
	defer metrics.MeasureSince([]string{"nomad", "client", "list"}, time.Now())

	// Check node read permissions
	if aclObj, err := n.srv.ResolveToken(args.AuthToken); err != nil {
		return err
	} else if aclObj != nil && !aclObj.AllowNodeRead() {
		return structs.ErrPermissionDenied
	}

	// Setup the blocking query
	opts := blockingOptions{
		queryOpts: &args.QueryOptions,
		queryMeta: &reply.QueryMeta,
		run: func(ws memdb.WatchSet, state *state.StateStore) error {
			// Capture all the nodes
			var err error
			var iter memdb.ResultIterator
			if prefix := args.QueryOptions.Prefix; prefix != "" {
				iter, err = state.NodesByIDPrefix(ws, prefix)
			} else {
				iter, err = state.Nodes(ws)
			}
			if err != nil {
				return err
			}

			var nodes []*structs.NodeListStub
			for {
				raw := iter.Next()
				if raw == nil {
					break
				}
				node := raw.(*structs.Node)
				nodes = append(nodes, node.Stub())
			}
			reply.Nodes = nodes

			// Use the last index that affected the nodes table
			index, err := state.Index("nodes")
			if err != nil {
				return err
			}
			reply.Index = index

			// Set the query response
			n.srv.setQueryMeta(&reply.QueryMeta)
			return nil
		}}
	return n.srv.blockingRPC(&opts)
}

// createNodeEvals is used to create evaluations for each alloc on a node.
// Each Eval is scoped to a job, so we need to potentially trigger many evals.
// System jobs get an evaluation as well, whether or not they currently have an
// allocation on the node. It returns the IDs of the created evaluations and
// the Raft index at which they were committed.
func (n *Node) createNodeEvals(nodeID string, nodeIndex uint64) ([]string, uint64, error) {
	// Snapshot the state
	snap, err := n.srv.fsm.State().Snapshot()
	if err != nil {
		return nil, 0, fmt.Errorf("failed to snapshot state: %v", err)
	}

	// Find all the allocations for this node
	ws := memdb.NewWatchSet()
	allocs, err := snap.AllocsByNode(ws, nodeID)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to find allocs for '%s': %v", nodeID, err)
	}

	sysJobsIter, err := snap.JobsByScheduler(ws, "system")
	if err != nil {
		return nil, 0, fmt.Errorf("failed to find system jobs for '%s': %v", nodeID, err)
	}

	var sysJobs []*structs.Job
	for job := sysJobsIter.Next(); job != nil; job = sysJobsIter.Next() {
		sysJobs = append(sysJobs, job.(*structs.Job))
	}

	// Fast-path if nothing to do
	if len(allocs) == 0 && len(sysJobs) == 0 {
		return nil, 0, nil
	}

	// Create an eval for each job affected. Job IDs are only unique within a
	// namespace, so the de-duplication key has to include the namespace;
	// otherwise same-named jobs in different namespaces would share one eval.
	var evals []*structs.Evaluation
	var evalIDs []string
	seenJobs := make(map[structs.NamespacedID]struct{})
	now := time.Now().UTC().UnixNano()

	for _, alloc := range allocs {
		// Deduplicate on namespace + JobID
		key := structs.NamespacedID{ID: alloc.JobID, Namespace: alloc.Namespace}
		if _, ok := seenJobs[key]; ok {
			continue
		}
		seenJobs[key] = struct{}{}

		// Create a new eval
		eval := &structs.Evaluation{
			ID:              uuid.Generate(),
			Namespace:       alloc.Namespace,
			Priority:        alloc.Job.Priority,
			Type:            alloc.Job.Type,
			TriggeredBy:     structs.EvalTriggerNodeUpdate,
			JobID:           alloc.JobID,
			NodeID:          nodeID,
			NodeModifyIndex: nodeIndex,
			Status:          structs.EvalStatusPending,
			CreateTime:      now,
			ModifyTime:      now,
		}
		evals = append(evals, eval)
		evalIDs = append(evalIDs, eval.ID)
	}

	// Create an evaluation for each system job.
	for _, job := range sysJobs {
		// Still dedup as the node may already have the system job.
		key := structs.NamespacedID{ID: job.ID, Namespace: job.Namespace}
		if _, ok := seenJobs[key]; ok {
			continue
		}
		seenJobs[key] = struct{}{}

		// Create a new eval
		eval := &structs.Evaluation{
			ID:              uuid.Generate(),
			Namespace:       job.Namespace,
			Priority:        job.Priority,
			Type:            job.Type,
			TriggeredBy:     structs.EvalTriggerNodeUpdate,
			JobID:           job.ID,
			NodeID:          nodeID,
			NodeModifyIndex: nodeIndex,
			Status:          structs.EvalStatusPending,
			CreateTime:      now,
			ModifyTime:      now,
		}
		evals = append(evals, eval)
		evalIDs = append(evalIDs, eval.ID)
	}

	// Create the Raft transaction
	update := &structs.EvalUpdateRequest{
		Evals:        evals,
		WriteRequest: structs.WriteRequest{Region: n.srv.config.Region},
	}

	// Commit this evaluation via Raft
	// XXX: There is a risk of partial failure where the node update succeeds
	// but that the EvalUpdate does not.
	_, evalIndex, err := n.srv.raftApply(structs.EvalUpdateRequestType, update)
	if err != nil {
		return nil, 0, err
	}
	return evalIDs, evalIndex, nil
}

// DeriveVaultToken is used by the clients to request wrapped Vault tokens for
// tasks. Failures are reported through reply.Error as a RecoverableError; the
// RPC itself always returns nil. Tokens are only handed back after their
// accessors have been committed to Raft, and tokens created before a failure
// are revoked.
func (n *Node) DeriveVaultToken(args *structs.DeriveVaultTokenRequest, reply *structs.DeriveVaultTokenResponse) error {
	setError := func(e error, recoverable bool) {
		if e != nil {
			if re, ok := e.(*structs.RecoverableError); ok {
				reply.Error = re // No need to wrap if error is already a RecoverableError
			} else {
				reply.Error = structs.NewRecoverableError(e, recoverable).(*structs.RecoverableError)
			}
			n.logger.Error("DeriveVaultToken failed", "recoverable", recoverable, "error", e)
		}
	}

	if done, err := n.srv.forward("Node.DeriveVaultToken", args, args, reply); done {
		setError(err, structs.IsRecoverable(err) || err == structs.ErrNoLeader)
		return nil
	}
	defer metrics.MeasureSince([]string{"nomad", "client", "derive_vault_token"}, time.Now())

	// Verify the arguments
	if args.NodeID == "" {
		setError(fmt.Errorf("missing node ID"), false)
		return nil
	}
	if args.SecretID == "" {
		setError(fmt.Errorf("missing node SecretID"), false)
		return nil
	}
	if args.AllocID == "" {
		setError(fmt.Errorf("missing allocation ID"), false)
		return nil
	}
	if len(args.Tasks) == 0 {
		setError(fmt.Errorf("no tasks specified"), false)
		return nil
	}

	// Verify the following:
	// * The Node exists and has the correct SecretID
	// * The Allocation exists on the specified Node
	// * The Allocation contains the given tasks and they each require Vault
	//   tokens
	snap, err := n.srv.fsm.State().Snapshot()
	if err != nil {
		setError(err, false)
		return nil
	}
	ws := memdb.NewWatchSet()
	node, err := snap.NodeByID(ws, args.NodeID)
	if err != nil {
		setError(err, false)
		return nil
	}
	if node == nil {
		setError(fmt.Errorf("Node %q does not exist", args.NodeID), false)
		return nil
	}
	if node.SecretID != args.SecretID {
		setError(fmt.Errorf("SecretID mismatch"), false)
		return nil
	}

	alloc, err := snap.AllocByID(ws, args.AllocID)
	if err != nil {
		setError(err, false)
		return nil
	}
	if alloc == nil {
		setError(fmt.Errorf("Allocation %q does not exist", args.AllocID), false)
		return nil
	}
	if alloc.NodeID != args.NodeID {
		setError(fmt.Errorf("Allocation %q not running on Node %q", args.AllocID, args.NodeID), false)
		return nil
	}
	if alloc.TerminalStatus() {
		setError(fmt.Errorf("Can't request Vault token for terminal allocation"), false)
		return nil
	}

	// Check the policies
	policies := alloc.Job.VaultPolicies()
	if policies == nil {
		setError(fmt.Errorf("Job doesn't require Vault policies"), false)
		return nil
	}
	tg, ok := policies[alloc.TaskGroup]
	if !ok {
		setError(fmt.Errorf("Task group does not require Vault policies"), false)
		return nil
	}

	var unneeded []string
	for _, task := range args.Tasks {
		taskVault := tg[task]
		if taskVault == nil || len(taskVault.Policies) == 0 {
			unneeded = append(unneeded, task)
		}
	}

	if len(unneeded) != 0 {
		e := fmt.Errorf("Requested Vault tokens for tasks without defined Vault policies: %s",
			strings.Join(unneeded, ", "))
		setError(e, false)
		return nil
	}

	// At this point the request is valid and we should contact Vault for
	// tokens.

	// Create an error group where we will spin up a fixed set of goroutines to
	// handle deriving tokens but where if any fails the whole group is
	// canceled.
	g, ctx := errgroup.WithContext(context.Background())

	// Cap the handlers
	handlers := len(args.Tasks)
	if handlers > maxParallelRequestsPerDerive {
		handlers = maxParallelRequestsPerDerive
	}

	// Create the Vault Tokens. results is written by every handler goroutine,
	// so access is serialized with resultsLock; unsynchronized concurrent map
	// writes are a data race and can crash the process.
	input := make(chan string, handlers)
	var resultsLock sync.Mutex
	results := make(map[string]*vapi.Secret, len(args.Tasks))
	for i := 0; i < handlers; i++ {
		g.Go(func() error {
			for {
				select {
				case task, ok := <-input:
					if !ok {
						return nil
					}

					secret, err := n.srv.vault.CreateToken(ctx, alloc, task)
					if err != nil {
						return err
					}

					resultsLock.Lock()
					results[task] = secret
					resultsLock.Unlock()
				case <-ctx.Done():
					return nil
				}
			}
		})
	}

	// Send the input
	go func() {
		defer close(input)
		for _, task := range args.Tasks {
			select {
			case <-ctx.Done():
				return
			case input <- task:
			}
		}
	}()

	// Wait for everything to complete or for an error. After Wait returns all
	// handlers have exited, so results can be read without the lock.
	createErr := g.Wait()

	// Retrieve the results
	accessors := make([]*structs.VaultAccessor, 0, len(results))
	tokens := make(map[string]string, len(results))
	for task, secret := range results {
		w := secret.WrapInfo
		tokens[task] = w.Token
		accessor := &structs.VaultAccessor{
			Accessor:    w.WrappedAccessor,
			Task:        task,
			NodeID:      alloc.NodeID,
			AllocID:     alloc.ID,
			CreationTTL: w.TTL,
		}

		accessors = append(accessors, accessor)
	}

	// If there was an error revoke the created tokens
	if createErr != nil {
		n.logger.Error("Vault token creation for alloc failed", "alloc_id", alloc.ID, "error", createErr)

		if revokeErr := n.srv.vault.RevokeTokens(context.Background(), accessors, false); revokeErr != nil {
			n.logger.Error("Vault token revocation for alloc failed", "alloc_id", alloc.ID, "error", revokeErr)
		}

		if rerr, ok := createErr.(*structs.RecoverableError); ok {
			reply.Error = rerr
		} else {
			reply.Error = structs.NewRecoverableError(createErr, false).(*structs.RecoverableError)
		}

		return nil
	}

	// Commit to Raft before returning any of the tokens
	req := structs.VaultAccessorsRequest{Accessors: accessors}
	_, index, err := n.srv.raftApply(structs.VaultAccessorRegisterRequestType, &req)
	if err != nil {
		n.logger.Error("registering Vault accessors for alloc failed", "alloc_id", alloc.ID, "error", err)

		// Determine if we can recover from the error
		retry := false
		switch err {
		case raft.ErrNotLeader, raft.ErrLeadershipLost, raft.ErrRaftShutdown, raft.ErrEnqueueTimeout:
			retry = true
		}

		setError(err, retry)
		return nil
	}

	reply.Index = index
	reply.Tasks = tokens
	n.srv.setQueryMeta(&reply.QueryMeta)
	return nil
}

// connectTask pairs a task name with its kind; it is the unit of work handed
// to the token-creation workers in DeriveSIToken.
type connectTask struct {
	TaskKind structs.TaskKind
	TaskName string
}

// DeriveSIToken is used by the clients to request Consul Service Identity
// tokens for the Connect-enabled tasks of an allocation. Failures are reported
// through reply.Error as a RecoverableError; the RPC itself always returns
// nil. Tokens are only handed back after their accessors have been committed
// to Raft, and tokens created before a failure are revoked.
func (n *Node) DeriveSIToken(args *structs.DeriveSITokenRequest, reply *structs.DeriveSITokenResponse) error {
	setError := func(e error, recoverable bool) {
		if e != nil {
			if re, ok := e.(*structs.RecoverableError); ok {
				reply.Error = re // No need to wrap if error is already a RecoverableError
			} else {
				reply.Error = structs.NewRecoverableError(e, recoverable).(*structs.RecoverableError)
			}
			n.logger.Error("DeriveSIToken failed", "recoverable", recoverable, "error", e)
		}
	}

	if done, err := n.srv.forward("Node.DeriveSIToken", args, args, reply); done {
		setError(err, structs.IsRecoverable(err) || err == structs.ErrNoLeader)
		return nil
	}
	defer metrics.MeasureSince([]string{"nomad", "client", "derive_si_token"}, time.Now())

	// Verify the arguments
	if err := args.Validate(); err != nil {
		setError(err, false)
		return nil
	}

	// Get the ClusterID
	clusterID, err := n.srv.ClusterID()
	if err != nil {
		setError(err, false)
		return nil
	}

	// Verify the following:
	// * The Node exists and has the correct SecretID.
	// * The Allocation exists on the specified Node.
	// * The Allocation contains the given tasks, and each task requires a
	//   SI token.

	snap, err := n.srv.fsm.State().Snapshot()
	if err != nil {
		setError(err, false)
		return nil
	}
	node, err := snap.NodeByID(nil, args.NodeID)
	if err != nil {
		setError(err, false)
		return nil
	}
	if node == nil {
		setError(errors.Errorf("Node %q does not exist", args.NodeID), false)
		return nil
	}
	if node.SecretID != args.SecretID {
		setError(errors.Errorf("SecretID mismatch"), false)
		return nil
	}

	alloc, err := snap.AllocByID(nil, args.AllocID)
	if err != nil {
		setError(err, false)
		return nil
	}
	if alloc == nil {
		setError(errors.Errorf("Allocation %q does not exist", args.AllocID), false)
		return nil
	}
	if alloc.NodeID != args.NodeID {
		setError(errors.Errorf("Allocation %q not running on node %q", args.AllocID, args.NodeID), false)
		return nil
	}
	if alloc.TerminalStatus() {
		setError(errors.Errorf("Cannot request SI token for terminal allocation"), false)
		return nil
	}

	// make sure task group contains at least one connect enabled service
	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		setError(errors.Errorf("Allocation %q does not contain TaskGroup %q", args.AllocID, alloc.TaskGroup), false)
		return nil
	}
	if !tg.UsesConnect() {
		setError(errors.Errorf("TaskGroup %q does not use Connect", tg.Name), false)
		return nil
	}

	// make sure each task in args.Tasks is a connect-enabled task
	notConnect, tasks := connectTasks(tg, args.Tasks)
	if len(notConnect) > 0 {
		setError(fmt.Errorf(
			"Requested Consul Service Identity tokens for tasks that are not Connect enabled: %v",
			strings.Join(notConnect, ", "),
		), false)
		// The request is invalid; stop here instead of minting tokens for the
		// remaining tasks and returning them next to the error.
		return nil
	}

	// At this point the request is valid and we should contact Consul for tokens.

	// A lot of the following is copied from DeriveVaultToken which has been
	// working fine for years.

	// Create an error group where we will spin up a fixed set of goroutines to
	// handle deriving tokens but where if any fails the whole group is
	// canceled.
	g, ctx := errgroup.WithContext(context.Background())

	// Cap the worker threads
	numWorkers := len(args.Tasks)
	if numWorkers > maxParallelRequestsPerDerive {
		numWorkers = maxParallelRequestsPerDerive
	}

	// would like to pull some of this out...

	// Create the SI tokens from a slice of task name + connect service.
	// results is written by every worker goroutine, so access is serialized
	// with resultsLock; unsynchronized concurrent map writes are a data race
	// and can crash the process.
	input := make(chan connectTask, numWorkers)
	var resultsLock sync.Mutex
	results := make(map[string]*structs.SIToken, numWorkers)
	for i := 0; i < numWorkers; i++ {
		g.Go(func() error {
			for {
				select {
				case task, ok := <-input:
					if !ok {
						return nil
					}
					secret, err := n.srv.consulACLs.CreateToken(ctx, ServiceIdentityRequest{
						TaskKind:  task.TaskKind,
						TaskName:  task.TaskName,
						ClusterID: clusterID,
						AllocID:   alloc.ID,
					})
					if err != nil {
						return err
					}

					resultsLock.Lock()
					results[task.TaskName] = secret
					resultsLock.Unlock()
				case <-ctx.Done():
					return nil
				}
			}
		})
	}

	// Send the input
	go func() {
		defer close(input)
		for _, connectTask := range tasks {
			select {
			case <-ctx.Done():
				return
			case input <- connectTask:
			}
		}
	}()

	// Wait for everything to complete or for an error. After Wait returns all
	// workers have exited, so results can be read without the lock.
	createErr := g.Wait()

	accessors := make([]*structs.SITokenAccessor, 0, len(results))
	tokens := make(map[string]string, len(results))
	for task, secret := range results {
		tokens[task] = secret.SecretID
		accessor := &structs.SITokenAccessor{
			NodeID:     alloc.NodeID,
			AllocID:    alloc.ID,
			TaskName:   task,
			AccessorID: secret.AccessorID,
		}
		accessors = append(accessors, accessor)
	}

	// If there was an error, revoke all created tokens. These tokens have not
	// yet been committed to the persistent store.
	if createErr != nil {
		n.logger.Error("Consul Service Identity token creation for alloc failed", "alloc_id", alloc.ID, "error", createErr)
		_ = n.srv.consulACLs.RevokeTokens(context.Background(), accessors, false)

		if recoverable, ok := createErr.(*structs.RecoverableError); ok {
			reply.Error = recoverable
		} else {
			reply.Error = structs.NewRecoverableError(createErr, false).(*structs.RecoverableError)
		}

		return nil
	}

	// Commit the derived tokens to raft before returning them
	requested := structs.SITokenAccessorsRequest{Accessors: accessors}
	_, index, err := n.srv.raftApply(structs.ServiceIdentityAccessorRegisterRequestType, &requested)
	if err != nil {
		n.logger.Error("registering Service Identity token accessors for alloc failed", "alloc_id", alloc.ID, "error", err)

		// Determine if we can recover from the error
		retry := false
		switch err {
		case raft.ErrNotLeader, raft.ErrLeadershipLost, raft.ErrRaftShutdown, raft.ErrEnqueueTimeout:
			retry = true
		}
		setError(err, retry)
		return nil
	}

	// We made it! Now we can set the reply.
	reply.Index = index
	reply.Tokens = tokens
	n.srv.setQueryMeta(&reply.QueryMeta)
	return nil
}

// connectTasks splits the requested task names into those that are not
// Connect tasks of the task group (first result) and those that are, paired
// with their task kind (second result). Names that do not exist in the task
// group are reported as not Connect.
func connectTasks(tg *structs.TaskGroup, tasks []string) ([]string, []connectTask) {
	var notConnect []string
	var usesConnect []connectTask
	for _, task := range tasks {
		tgTask := tg.LookupTask(task)
		if !taskUsesConnect(tgTask) {
			notConnect = append(notConnect, task)
		} else {
			usesConnect = append(usesConnect, connectTask{
				TaskName: task,
				TaskKind: tgTask.Kind,
			})
		}
	}
	return notConnect, usesConnect
}

// taskUsesConnect reports whether the task is a Connect proxy or a Connect
// native task. A nil task is never a Connect task.
func taskUsesConnect(task *structs.Task) bool {
	if task == nil {
		// not even in the task group
		return false
	}

	return task.Kind.IsConnectProxy() || task.Kind.IsConnectNative()
}

// EmitEvents is used to record node events. The request must contain at least
// one node, and every node listed must carry at least one event; the events
// are then committed through Raft.
func (n *Node) EmitEvents(args *structs.EmitNodeEventsRequest, reply *structs.EmitNodeEventsResponse) error {
	if done, err := n.srv.forward("Node.EmitEvents", args, args, reply); done {
		return err
	}
	defer metrics.MeasureSince([]string{"nomad", "client", "emit_events"}, time.Now())

	if len(args.NodeEvents) == 0 {
		return fmt.Errorf("no node events given")
	}
	for nodeID, events := range args.NodeEvents {
		if len(events) == 0 {
			return fmt.Errorf("no node events given for node %q", nodeID)
		}
	}

	_, index, err := n.srv.raftApply(structs.UpsertNodeEventsType, args)
	if err != nil {
		n.logger.Error("upserting node events failed", "error", err)
		return err
	}

	reply.Index = index
	return nil
}