github.com/hernad/nomad@v1.6.112/nomad/node_endpoint.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package nomad 5 6 import ( 7 "context" 8 "errors" 9 "fmt" 10 "net/http" 11 "reflect" 12 "strings" 13 "sync" 14 "time" 15 16 "github.com/armon/go-metrics" 17 "github.com/hashicorp/go-hclog" 18 "github.com/hashicorp/go-memdb" 19 "github.com/hashicorp/go-multierror" 20 vapi "github.com/hashicorp/vault/api" 21 "golang.org/x/sync/errgroup" 22 23 "github.com/hernad/nomad/acl" 24 "github.com/hernad/nomad/helper/uuid" 25 "github.com/hernad/nomad/nomad/state" 26 "github.com/hernad/nomad/nomad/state/paginator" 27 "github.com/hernad/nomad/nomad/structs" 28 "github.com/hashicorp/raft" 29 ) 30 31 const ( 32 // batchUpdateInterval is how long we wait to batch updates 33 batchUpdateInterval = 50 * time.Millisecond 34 35 // maxParallelRequestsPerDerive is the maximum number of parallel Vault 36 // create token requests that may be outstanding per derive request 37 maxParallelRequestsPerDerive = 16 38 39 // NodeDrainEvents are the various drain messages 40 NodeDrainEventDrainSet = "Node drain strategy set" 41 NodeDrainEventDrainDisabled = "Node drain disabled" 42 NodeDrainEventDrainUpdated = "Node drain strategy updated" 43 44 // NodeEligibilityEventEligible is used when the nodes eligiblity is marked 45 // eligible 46 NodeEligibilityEventEligible = "Node marked as eligible for scheduling" 47 48 // NodeEligibilityEventIneligible is used when the nodes eligiblity is marked 49 // ineligible 50 NodeEligibilityEventIneligible = "Node marked as ineligible for scheduling" 51 52 // NodeHeartbeatEventReregistered is the message used when the node becomes 53 // reregistered by the heartbeat. 54 NodeHeartbeatEventReregistered = "Node reregistered by heartbeat" 55 56 // NodeWaitingForNodePool is the message used when the node is waiting for 57 // its node pool to be created. 58 NodeWaitingForNodePool = "Node registered but waiting for node pool to be created" 59 ) 60 61 // Node endpoint is used for client interactions 62 type Node struct { 63 srv *Server 64 logger hclog.Logger 65 66 // ctx provides context regarding the underlying connection 67 ctx *RPCContext 68 69 // updates holds pending client status updates for allocations 70 updates []*structs.Allocation 71 72 // evals holds pending rescheduling eval updates triggered by failed allocations 73 evals []*structs.Evaluation 74 75 // updateFuture is used to wait for the pending batch update 76 // to complete. This may be nil if no batch is pending. 77 updateFuture *structs.BatchFuture 78 79 // updateTimer is the timer that will trigger the next batch 80 // update, and may be nil if there is no batch pending. 81 updateTimer *time.Timer 82 83 // updatesLock synchronizes access to the updates list, 84 // the future and the timer. 85 updatesLock sync.Mutex 86 } 87 88 func NewNodeEndpoint(srv *Server, ctx *RPCContext) *Node { 89 return &Node{ 90 srv: srv, 91 ctx: ctx, 92 logger: srv.logger.Named("client"), 93 updates: []*structs.Allocation{}, 94 evals: []*structs.Evaluation{}, 95 } 96 } 97 98 // Register is used to upsert a client that is available for scheduling 99 func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUpdateResponse) error { 100 // note that we trust-on-first use and the identity will be anonymous for 101 // that initial request; we lean on mTLS for handling that safely 102 authErr := n.srv.Authenticate(n.ctx, args) 103 104 isForwarded := args.IsForwarded() 105 if done, err := n.srv.forward("Node.Register", args, args, reply); done { 106 // We have a valid node connection since there is no error from the 107 // forwarded server, so add the mapping to cache the 108 // connection and allow the server to send RPCs to the client. 109 if err == nil && n.ctx != nil && n.ctx.NodeID == "" && !isForwarded { 110 n.ctx.NodeID = args.Node.ID 111 n.srv.addNodeConn(n.ctx) 112 } 113 114 return err 115 } 116 n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args) 117 if authErr != nil { 118 return structs.ErrPermissionDenied 119 } 120 121 defer metrics.MeasureSince([]string{"nomad", "client", "register"}, time.Now()) 122 123 // Validate the arguments 124 if args.Node == nil { 125 return fmt.Errorf("missing node for client registration") 126 } 127 if args.Node.ID == "" { 128 return fmt.Errorf("missing node ID for client registration") 129 } 130 if args.Node.Datacenter == "" { 131 return fmt.Errorf("missing datacenter for client registration") 132 } 133 if args.Node.Name == "" { 134 return fmt.Errorf("missing node name for client registration") 135 } 136 if len(args.Node.Attributes) == 0 { 137 return fmt.Errorf("missing attributes for client registration") 138 } 139 if args.Node.SecretID == "" { 140 return fmt.Errorf("missing node secret ID for client registration") 141 } 142 if args.Node.NodePool != "" { 143 err := structs.ValidateNodePoolName(args.Node.NodePool) 144 if err != nil { 145 return fmt.Errorf("invalid node pool: %v", err) 146 } 147 if args.Node.NodePool == structs.NodePoolAll { 148 return fmt.Errorf("node is not allowed to register in node pool %q", structs.NodePoolAll) 149 } 150 } 151 152 // Default the status if none is given 153 if args.Node.Status == "" { 154 args.Node.Status = structs.NodeStatusInit 155 } 156 if !structs.ValidNodeStatus(args.Node.Status) { 157 return fmt.Errorf("invalid status for node") 158 } 159 160 // Default to eligible for scheduling if unset 161 if args.Node.SchedulingEligibility == "" { 162 args.Node.SchedulingEligibility = structs.NodeSchedulingEligible 163 } 164 165 // Default the node pool if none is given. 166 if args.Node.NodePool == "" { 167 args.Node.NodePool = structs.NodePoolDefault 168 } 169 170 // Set the timestamp when the node is registered 171 args.Node.StatusUpdatedAt = time.Now().Unix() 172 173 // Compute the node class 174 if err := args.Node.ComputeClass(); err != nil { 175 return fmt.Errorf("failed to computed node class: %v", err) 176 } 177 178 // Look for the node so we can detect a state transition 179 snap, err := n.srv.fsm.State().Snapshot() 180 if err != nil { 181 return err 182 } 183 184 ws := memdb.NewWatchSet() 185 originalNode, err := snap.NodeByID(ws, args.Node.ID) 186 if err != nil { 187 return err 188 } 189 190 if originalNode != nil { 191 // Check if the SecretID has been tampered with 192 if args.Node.SecretID != originalNode.SecretID && originalNode.SecretID != "" { 193 return fmt.Errorf("node secret ID does not match. Not registering node.") 194 } 195 196 // Don't allow the Register method to update the node status. Only the 197 // UpdateStatus method should be able to do this. 198 if originalNode.Status != "" { 199 args.Node.Status = originalNode.Status 200 } 201 } 202 203 // We have a valid node connection, so add the mapping to cache the 204 // connection and allow the server to send RPCs to the client. We only cache 205 // the connection if it is not being forwarded from another server. 206 if n.ctx != nil && n.ctx.NodeID == "" && !args.IsForwarded() { 207 n.ctx.NodeID = args.Node.ID 208 n.srv.addNodeConn(n.ctx) 209 } 210 211 // Commit this update via Raft. 212 // 213 // Only the authoritative region is allowed to create the node pool for the 214 // node if it doesn't exist yet. This prevents non-authoritative regions 215 // from having to push their local state to the authoritative region. 216 // 217 // Nodes in non-authoritative regions that are registered with a new node 218 // pool are kept in the `initializing` status until the node pool is 219 // created and replicated. 220 if n.srv.Region() == n.srv.config.AuthoritativeRegion { 221 args.CreateNodePool = true 222 } 223 _, index, err := n.srv.raftApply(structs.NodeRegisterRequestType, args) 224 if err != nil { 225 n.logger.Error("register failed", "error", err) 226 return err 227 } 228 reply.NodeModifyIndex = index 229 230 // Check if we should trigger evaluations 231 if shouldCreateNodeEval(originalNode, args.Node) { 232 evalIDs, evalIndex, err := n.createNodeEvals(args.Node, index) 233 if err != nil { 234 n.logger.Error("eval creation failed", "error", err) 235 return err 236 } 237 reply.EvalIDs = evalIDs 238 reply.EvalCreateIndex = evalIndex 239 } 240 241 // Check if we need to setup a heartbeat 242 if !args.Node.TerminalStatus() { 243 ttl, err := n.srv.resetHeartbeatTimer(args.Node.ID) 244 if err != nil { 245 n.logger.Error("heartbeat reset failed", "error", err) 246 return err 247 } 248 reply.HeartbeatTTL = ttl 249 } 250 251 // Set the reply index 252 reply.Index = index 253 snap, err = n.srv.fsm.State().Snapshot() 254 if err != nil { 255 return err 256 } 257 258 n.srv.peerLock.RLock() 259 defer n.srv.peerLock.RUnlock() 260 if err := n.constructNodeServerInfoResponse(args.Node.ID, snap, reply); err != nil { 261 n.logger.Error("failed to populate NodeUpdateResponse", "error", err) 262 return err 263 } 264 265 return nil 266 } 267 268 // shouldCreateNodeEval returns true if the node update may result into 269 // allocation updates, so the node should be re-evaluating. 270 // 271 // Such cases might be: 272 // * node health/drain status changes that may result into alloc rescheduling 273 // * node drivers or attributes changing that may cause system job placement changes 274 func shouldCreateNodeEval(original, updated *structs.Node) bool { 275 if structs.ShouldDrainNode(updated.Status) { 276 return true 277 } 278 279 if original == nil { 280 return nodeStatusTransitionRequiresEval(updated.Status, structs.NodeStatusInit) 281 } 282 283 if nodeStatusTransitionRequiresEval(updated.Status, original.Status) { 284 return true 285 } 286 287 // check fields used by the feasibility checks in ../scheduler/feasible.go, 288 // whether through a Constraint explicitly added by user or an implicit constraint 289 // added through a driver/volume check. 290 // 291 // Node Resources (e.g. CPU/Memory) are handled differently, using blocked evals, 292 // and not relevant in this check. 293 return !(original.ID == updated.ID && 294 original.Datacenter == updated.Datacenter && 295 original.Name == updated.Name && 296 original.NodeClass == updated.NodeClass && 297 reflect.DeepEqual(original.Attributes, updated.Attributes) && 298 reflect.DeepEqual(original.Meta, updated.Meta) && 299 reflect.DeepEqual(original.Drivers, updated.Drivers) && 300 reflect.DeepEqual(original.HostVolumes, updated.HostVolumes) && 301 equalDevices(original, updated)) 302 } 303 304 func equalDevices(n1, n2 *structs.Node) bool { 305 // ignore super old nodes, mostly to avoid nil dereferencing 306 if n1.NodeResources == nil || n2.NodeResources == nil { 307 return n1.NodeResources == n2.NodeResources 308 } 309 310 // treat nil and empty value as equal 311 if len(n1.NodeResources.Devices) == 0 { 312 return len(n1.NodeResources.Devices) == len(n2.NodeResources.Devices) 313 } 314 315 return reflect.DeepEqual(n1.NodeResources.Devices, n2.NodeResources.Devices) 316 } 317 318 // constructNodeServerInfoResponse assumes the n.srv.peerLock is held for reading. 319 func (n *Node) constructNodeServerInfoResponse(nodeID string, snap *state.StateSnapshot, reply *structs.NodeUpdateResponse) error { 320 reply.LeaderRPCAddr = string(n.srv.raft.Leader()) 321 322 // Reply with config information required for future RPC requests 323 reply.Servers = make([]*structs.NodeServerInfo, 0, len(n.srv.localPeers)) 324 for _, v := range n.srv.localPeers { 325 reply.Servers = append(reply.Servers, 326 &structs.NodeServerInfo{ 327 RPCAdvertiseAddr: v.RPCAddr.String(), 328 Datacenter: v.Datacenter, 329 }) 330 } 331 332 ws := memdb.NewWatchSet() 333 334 // Add ClientStatus information to heartbeat response. 335 if node, err := snap.NodeByID(ws, nodeID); err == nil && node != nil { 336 reply.SchedulingEligibility = node.SchedulingEligibility 337 } else if node == nil { 338 339 // If the node is not found, leave reply.SchedulingEligibility as 340 // the empty string. The response handler in the client treats this 341 // as a no-op. As there is no call to action for an operator, log it 342 // at debug level. 343 n.logger.Debug("constructNodeServerInfoResponse: node not found", 344 "node_id", nodeID) 345 } else { 346 347 // This case is likely only reached via a code error in state store 348 return err 349 } 350 351 // TODO(sean@): Use an indexed node count instead 352 // 353 // Snapshot is used only to iterate over all nodes to create a node 354 // count to send back to Nomad Clients in their heartbeat so Clients 355 // can estimate the size of the cluster. 356 iter, err := snap.Nodes(ws) 357 if err == nil { 358 for { 359 raw := iter.Next() 360 if raw == nil { 361 break 362 } 363 reply.NumNodes++ 364 } 365 } 366 367 reply.Features = n.srv.EnterpriseState.Features() 368 369 return nil 370 } 371 372 // Deregister is used to remove a client from the cluster. If a client should 373 // just be made unavailable for scheduling, a status update is preferred. 374 func (n *Node) Deregister(args *structs.NodeDeregisterRequest, reply *structs.NodeUpdateResponse) error { 375 authErr := n.srv.Authenticate(n.ctx, args) 376 if done, err := n.srv.forward("Node.Deregister", args, args, reply); done { 377 return err 378 } 379 n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args) 380 if authErr != nil { 381 return structs.ErrPermissionDenied 382 } 383 defer metrics.MeasureSince([]string{"nomad", "client", "deregister"}, time.Now()) 384 385 if args.NodeID == "" { 386 return fmt.Errorf("missing node ID for client deregistration") 387 } 388 389 // deregister takes a batch 390 repack := &structs.NodeBatchDeregisterRequest{ 391 NodeIDs: []string{args.NodeID}, 392 WriteRequest: args.WriteRequest, 393 } 394 395 return n.deregister(repack, reply, func() (interface{}, uint64, error) { 396 return n.srv.raftApply(structs.NodeDeregisterRequestType, args) 397 }) 398 } 399 400 // BatchDeregister is used to remove client nodes from the cluster. 401 func (n *Node) BatchDeregister(args *structs.NodeBatchDeregisterRequest, reply *structs.NodeUpdateResponse) error { 402 authErr := n.srv.Authenticate(n.ctx, args) 403 if done, err := n.srv.forward("Node.BatchDeregister", args, args, reply); done { 404 return err 405 } 406 n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args) 407 if authErr != nil { 408 return structs.ErrPermissionDenied 409 } 410 defer metrics.MeasureSince([]string{"nomad", "client", "batch_deregister"}, time.Now()) 411 412 if len(args.NodeIDs) == 0 { 413 return fmt.Errorf("missing node IDs for client deregistration") 414 } 415 416 return n.deregister(args, reply, func() (interface{}, uint64, error) { 417 return n.srv.raftApply(structs.NodeBatchDeregisterRequestType, args) 418 }) 419 } 420 421 // deregister takes a raftMessage closure, to support both Deregister and BatchDeregister 422 func (n *Node) deregister(args *structs.NodeBatchDeregisterRequest, 423 reply *structs.NodeUpdateResponse, 424 raftApplyFn func() (interface{}, uint64, error), 425 ) error { 426 // Check request permissions 427 if aclObj, err := n.srv.ResolveACL(args); err != nil { 428 return err 429 } else if aclObj != nil && !aclObj.AllowNodeWrite() { 430 return structs.ErrPermissionDenied 431 } 432 433 // Look for the node 434 snap, err := n.srv.fsm.State().Snapshot() 435 if err != nil { 436 return err 437 } 438 439 nodes := make([]*structs.Node, 0, len(args.NodeIDs)) 440 for _, nodeID := range args.NodeIDs { 441 node, err := snap.NodeByID(nil, nodeID) 442 if err != nil { 443 return err 444 } 445 if node == nil { 446 return fmt.Errorf("node not found") 447 } 448 nodes = append(nodes, node) 449 } 450 451 // Commit this update via Raft 452 _, index, err := raftApplyFn() 453 if err != nil { 454 n.logger.Error("raft message failed", "error", err) 455 return err 456 } 457 458 for _, node := range nodes { 459 nodeID := node.ID 460 461 // Clear the heartbeat timer if any 462 n.srv.clearHeartbeatTimer(nodeID) 463 464 // Create the evaluations for this node 465 evalIDs, evalIndex, err := n.createNodeEvals(node, index) 466 if err != nil { 467 n.logger.Error("eval creation failed", "error", err) 468 return err 469 } 470 471 // Determine if there are any Vault accessors on the node 472 if accessors, err := snap.VaultAccessorsByNode(nil, nodeID); err != nil { 473 n.logger.Error("looking up vault accessors for node failed", "node_id", nodeID, "error", err) 474 return err 475 } else if l := len(accessors); l > 0 { 476 n.logger.Debug("revoking vault accessors on node due to deregister", "num_accessors", l, "node_id", nodeID) 477 if err := n.srv.vault.RevokeTokens(context.Background(), accessors, true); err != nil { 478 n.logger.Error("revoking vault accessors for node failed", "node_id", nodeID, "error", err) 479 return err 480 } 481 } 482 483 // Determine if there are any SI token accessors on the node 484 if accessors, err := snap.SITokenAccessorsByNode(nil, nodeID); err != nil { 485 n.logger.Error("looking up si accessors for node failed", "node_id", nodeID, "error", err) 486 return err 487 } else if l := len(accessors); l > 0 { 488 n.logger.Debug("revoking si accessors on node due to deregister", "num_accessors", l, "node_id", nodeID) 489 // Unlike with the Vault integration, there's no error returned here, since 490 // bootstrapping the Consul client is elsewhere. Errors in revocation trigger 491 // background retry attempts rather than inline error handling. 492 _ = n.srv.consulACLs.RevokeTokens(context.Background(), accessors, true) 493 } 494 495 reply.EvalIDs = append(reply.EvalIDs, evalIDs...) 496 // Set the reply eval create index just the first time 497 if reply.EvalCreateIndex == 0 { 498 reply.EvalCreateIndex = evalIndex 499 } 500 } 501 502 reply.NodeModifyIndex = index 503 reply.Index = index 504 return nil 505 } 506 507 // UpdateStatus is used to update the status of a client node. 508 // 509 // Clients with non-terminal allocations must first call UpdateAlloc to be able 510 // to transition from the initializing status to ready. 511 // 512 // Clients node pool must exist for them to be able to transition from 513 // initializing to ready. 514 // 515 // ┌────────────────────────────────────── No ───┐ 516 // │ │ 517 // ┌──▼───┐ ┌─────────────┐ ┌────────┴────────┐ 518 // ── Register ─► init ├─ ready ──► Has allocs? ├─ Yes ─► Allocs updated? │ 519 // └──▲──▲┘ └─────┬───────┘ └────────┬────────┘ 520 // │ │ │ │ 521 // │ │ └─ No ─┐ ┌─────── Yes ──┘ 522 // │ │ │ │ 523 // │ │ ┌────────▼──▼───────┐ 524 // │ └──────────No───┤ Node pool exists? │ 525 // │ └─────────┬─────────┘ 526 // │ │ 527 // ready Yes 528 // │ │ 529 // ┌──────┴───────┐ ┌───▼───┐ ┌──────┐ 530 // │ disconnected ◄─ disconnected ─┤ ready ├─ down ──► down │ 531 // └──────────────┘ └───▲───┘ └──┬───┘ 532 // │ │ 533 // └──── ready ─────┘ 534 func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *structs.NodeUpdateResponse) error { 535 authErr := n.srv.Authenticate(n.ctx, args) 536 537 isForwarded := args.IsForwarded() 538 if done, err := n.srv.forward("Node.UpdateStatus", args, args, reply); done { 539 // We have a valid node connection since there is no error from the 540 // forwarded server, so add the mapping to cache the 541 // connection and allow the server to send RPCs to the client. 542 if err == nil && n.ctx != nil && n.ctx.NodeID == "" && !isForwarded { 543 n.ctx.NodeID = args.NodeID 544 n.srv.addNodeConn(n.ctx) 545 } 546 547 return err 548 } 549 n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args) 550 if authErr != nil { 551 return structs.ErrPermissionDenied 552 } 553 554 defer metrics.MeasureSince([]string{"nomad", "client", "update_status"}, time.Now()) 555 556 // Verify the arguments 557 if args.NodeID == "" { 558 return fmt.Errorf("missing node ID for client status update") 559 } 560 if !structs.ValidNodeStatus(args.Status) { 561 return fmt.Errorf("invalid status for node") 562 } 563 564 // Look for the node 565 snap, err := n.srv.fsm.State().Snapshot() 566 if err != nil { 567 return err 568 } 569 570 ws := memdb.NewWatchSet() 571 node, err := snap.NodeByID(ws, args.NodeID) 572 if err != nil { 573 return err 574 } 575 if node == nil { 576 return fmt.Errorf("node not found") 577 } 578 579 // We have a valid node connection, so add the mapping to cache the 580 // connection and allow the server to send RPCs to the client. We only cache 581 // the connection if it is not being forwarded from another server. 582 if n.ctx != nil && n.ctx.NodeID == "" && !args.IsForwarded() { 583 n.ctx.NodeID = args.NodeID 584 n.srv.addNodeConn(n.ctx) 585 } 586 587 // XXX: Could use the SecretID here but have to update the heartbeat system 588 // to track SecretIDs. 589 590 // Update the timestamp of when the node status was updated 591 args.UpdatedAt = time.Now().Unix() 592 593 // Compute next status. 594 switch node.Status { 595 case structs.NodeStatusInit: 596 if args.Status == structs.NodeStatusReady { 597 // Keep node in the initializing status if it has allocations but 598 // they are not updated. 599 allocs, err := snap.AllocsByNodeTerminal(ws, args.NodeID, false) 600 if err != nil { 601 return fmt.Errorf("failed to query node allocs: %v", err) 602 } 603 604 allocsUpdated := node.LastAllocUpdateIndex > node.LastMissedHeartbeatIndex 605 if len(allocs) > 0 && !allocsUpdated { 606 n.logger.Debug(fmt.Sprintf("marking node as %s due to outdated allocation information", structs.NodeStatusInit)) 607 args.Status = structs.NodeStatusInit 608 } 609 610 // Keep node in the initialing status if it's in a node pool that 611 // doesn't exist. 612 pool, err := snap.NodePoolByName(ws, node.NodePool) 613 if err != nil { 614 return fmt.Errorf("failed to query node pool: %v", err) 615 } 616 if pool == nil { 617 n.logger.Debug(fmt.Sprintf("marking node as %s due to missing node pool", structs.NodeStatusInit)) 618 args.Status = structs.NodeStatusInit 619 if !node.HasEvent(NodeWaitingForNodePool) { 620 args.NodeEvent = structs.NewNodeEvent(). 621 SetSubsystem(structs.NodeEventSubsystemCluster). 622 SetMessage(NodeWaitingForNodePool). 623 AddDetail("node_pool", node.NodePool) 624 } 625 } 626 } 627 case structs.NodeStatusDisconnected: 628 if args.Status == structs.NodeStatusReady { 629 args.Status = structs.NodeStatusInit 630 } 631 } 632 633 // Commit this update via Raft 634 var index uint64 635 if node.Status != args.Status || args.NodeEvent != nil { 636 // Attach an event if we are updating the node status to ready when it 637 // is down via a heartbeat 638 if node.Status == structs.NodeStatusDown && args.NodeEvent == nil { 639 args.NodeEvent = structs.NewNodeEvent(). 640 SetSubsystem(structs.NodeEventSubsystemCluster). 641 SetMessage(NodeHeartbeatEventReregistered) 642 } 643 644 _, index, err = n.srv.raftApply(structs.NodeUpdateStatusRequestType, args) 645 if err != nil { 646 n.logger.Error("status update failed", "error", err) 647 return err 648 } 649 reply.NodeModifyIndex = index 650 } 651 652 // Check if we should trigger evaluations 653 if structs.ShouldDrainNode(args.Status) || 654 nodeStatusTransitionRequiresEval(args.Status, node.Status) { 655 evalIDs, evalIndex, err := n.createNodeEvals(node, index) 656 if err != nil { 657 n.logger.Error("eval creation failed", "error", err) 658 return err 659 } 660 reply.EvalIDs = evalIDs 661 reply.EvalCreateIndex = evalIndex 662 } 663 664 // Check if we need to setup a heartbeat 665 switch args.Status { 666 case structs.NodeStatusDown: 667 // Determine if there are any Vault accessors on the node to cleanup 668 if accessors, err := n.srv.State().VaultAccessorsByNode(ws, args.NodeID); err != nil { 669 n.logger.Error("looking up vault accessors for node failed", "node_id", args.NodeID, "error", err) 670 return err 671 } else if l := len(accessors); l > 0 { 672 n.logger.Debug("revoking vault accessors on node due to down state", "num_accessors", l, "node_id", args.NodeID) 673 if err := n.srv.vault.RevokeTokens(context.Background(), accessors, true); err != nil { 674 n.logger.Error("revoking vault accessors for node failed", "node_id", args.NodeID, "error", err) 675 return err 676 } 677 } 678 679 // Determine if there are any SI token accessors on the node to cleanup 680 if accessors, err := n.srv.State().SITokenAccessorsByNode(ws, args.NodeID); err != nil { 681 n.logger.Error("looking up SI accessors for node failed", "node_id", args.NodeID, "error", err) 682 return err 683 } else if l := len(accessors); l > 0 { 684 n.logger.Debug("revoking SI accessors on node due to down state", "num_accessors", l, "node_id", args.NodeID) 685 _ = n.srv.consulACLs.RevokeTokens(context.Background(), accessors, true) 686 } 687 688 // Identify the service registrations current placed on the downed 689 // node. 690 serviceRegistrations, err := n.srv.State().GetServiceRegistrationsByNodeID(ws, args.NodeID) 691 if err != nil { 692 n.logger.Error("looking up service registrations for node failed", 693 "node_id", args.NodeID, "error", err) 694 return err 695 } 696 697 // If the node has service registrations assigned to it, delete these 698 // via Raft. 699 if l := len(serviceRegistrations); l > 0 { 700 n.logger.Debug("deleting service registrations on node due to down state", 701 "num_service_registrations", l, "node_id", args.NodeID) 702 703 deleteRegReq := structs.ServiceRegistrationDeleteByNodeIDRequest{NodeID: args.NodeID} 704 705 _, index, err = n.srv.raftApply(structs.ServiceRegistrationDeleteByNodeIDRequestType, &deleteRegReq) 706 if err != nil { 707 n.logger.Error("failed to delete service registrations for node", 708 "node_id", args.NodeID, "error", err) 709 return err 710 } 711 } 712 713 default: 714 ttl, err := n.srv.resetHeartbeatTimer(args.NodeID) 715 if err != nil { 716 n.logger.Error("heartbeat reset failed", "error", err) 717 return err 718 } 719 reply.HeartbeatTTL = ttl 720 } 721 722 // Set the reply index and leader 723 reply.Index = index 724 n.srv.peerLock.RLock() 725 defer n.srv.peerLock.RUnlock() 726 if err := n.constructNodeServerInfoResponse(node.GetID(), snap, reply); err != nil { 727 n.logger.Error("failed to populate NodeUpdateResponse", "error", err) 728 return err 729 } 730 731 return nil 732 } 733 734 // nodeStatusTransitionRequiresEval is a helper that takes a nodes new and old status and 735 // returns whether it has transitioned to ready. 736 func nodeStatusTransitionRequiresEval(newStatus, oldStatus string) bool { 737 initToReady := oldStatus == structs.NodeStatusInit && newStatus == structs.NodeStatusReady 738 terminalToReady := oldStatus == structs.NodeStatusDown && newStatus == structs.NodeStatusReady 739 disconnectedToOther := oldStatus == structs.NodeStatusDisconnected && newStatus != structs.NodeStatusDisconnected 740 otherToDisconnected := oldStatus != structs.NodeStatusDisconnected && newStatus == structs.NodeStatusDisconnected 741 return initToReady || terminalToReady || disconnectedToOther || otherToDisconnected 742 } 743 744 // UpdateDrain is used to update the drain mode of a client node 745 func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest, 746 reply *structs.NodeDrainUpdateResponse) error { 747 748 authErr := n.srv.Authenticate(n.ctx, args) 749 if done, err := n.srv.forward("Node.UpdateDrain", args, args, reply); done { 750 return err 751 } 752 n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args) 753 if authErr != nil { 754 return structs.ErrPermissionDenied 755 } 756 defer metrics.MeasureSince([]string{"nomad", "client", "update_drain"}, time.Now()) 757 758 // Check node write permissions 759 if aclObj, err := n.srv.ResolveACL(args); err != nil { 760 return err 761 } else if aclObj != nil && !aclObj.AllowNodeWrite() { 762 return structs.ErrPermissionDenied 763 } 764 765 // Verify the arguments 766 if args.NodeID == "" { 767 return fmt.Errorf("missing node ID for drain update") 768 } 769 if args.NodeEvent != nil { 770 return fmt.Errorf("node event must not be set") 771 } 772 773 // Look for the node 774 snap, err := n.srv.fsm.State().Snapshot() 775 if err != nil { 776 return err 777 } 778 node, err := snap.NodeByID(nil, args.NodeID) 779 if err != nil { 780 return err 781 } 782 if node == nil { 783 return fmt.Errorf("node not found") 784 } 785 786 now := time.Now().UTC() 787 788 // Update the timestamp of when the node status was updated 789 args.UpdatedAt = now.Unix() 790 791 // Setup drain strategy 792 if args.DrainStrategy != nil { 793 // Mark start time for the drain 794 if node.DrainStrategy == nil { 795 args.DrainStrategy.StartedAt = now 796 } else { 797 args.DrainStrategy.StartedAt = node.DrainStrategy.StartedAt 798 } 799 800 // Mark the deadline time 801 if args.DrainStrategy.Deadline.Nanoseconds() > 0 { 802 args.DrainStrategy.ForceDeadline = now.Add(args.DrainStrategy.Deadline) 803 } 804 } 805 806 // Construct the node event 807 args.NodeEvent = structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemDrain) 808 if node.DrainStrategy == nil && args.DrainStrategy != nil { 809 args.NodeEvent.SetMessage(NodeDrainEventDrainSet) 810 } else if node.DrainStrategy != nil && args.DrainStrategy != nil { 811 args.NodeEvent.SetMessage(NodeDrainEventDrainUpdated) 812 } else if node.DrainStrategy != nil && args.DrainStrategy == nil { 813 args.NodeEvent.SetMessage(NodeDrainEventDrainDisabled) 814 } else { 815 args.NodeEvent = nil 816 } 817 818 // Commit this update via Raft 819 _, index, err := n.srv.raftApply(structs.NodeUpdateDrainRequestType, args) 820 if err != nil { 821 n.logger.Error("drain update failed", "error", err) 822 return err 823 } 824 reply.NodeModifyIndex = index 825 826 // If the node is transitioning to be eligible, create Node evaluations 827 // because there may be a System job registered that should be evaluated. 828 if node.SchedulingEligibility == structs.NodeSchedulingIneligible && args.MarkEligible && args.DrainStrategy == nil { 829 n.logger.Info("node transitioning to eligible state", "node_id", node.ID) 830 evalIDs, evalIndex, err := n.createNodeEvals(node, index) 831 if err != nil { 832 n.logger.Error("eval creation failed", "error", err) 833 return err 834 } 835 reply.EvalIDs = evalIDs 836 reply.EvalCreateIndex = evalIndex 837 } 838 839 // Set the reply index 840 reply.Index = index 841 return nil 842 } 843 844 // UpdateEligibility is used to update the scheduling eligibility of a node 845 func (n *Node) UpdateEligibility(args *structs.NodeUpdateEligibilityRequest, 846 reply *structs.NodeEligibilityUpdateResponse) error { 847 848 authErr := n.srv.Authenticate(n.ctx, args) 849 if done, err := n.srv.forward("Node.UpdateEligibility", args, args, reply); done { 850 return err 851 } 852 n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args) 853 if authErr != nil { 854 return structs.ErrPermissionDenied 855 } 856 defer metrics.MeasureSince([]string{"nomad", "client", "update_eligibility"}, time.Now()) 857 858 // Check node write permissions 859 if aclObj, err := n.srv.ResolveACL(args); err != nil { 860 return err 861 } else if aclObj != nil && !aclObj.AllowNodeWrite() { 862 return structs.ErrPermissionDenied 863 } 864 865 // Verify the arguments 866 if args.NodeID == "" { 867 return fmt.Errorf("missing node ID for setting scheduling eligibility") 868 } 869 if args.NodeEvent != nil { 870 return fmt.Errorf("node event must not be set") 871 } 872 873 // Check that only allowed types are set 874 switch args.Eligibility { 875 case structs.NodeSchedulingEligible, structs.NodeSchedulingIneligible: 876 default: 877 return fmt.Errorf("invalid scheduling eligibility %q", args.Eligibility) 878 } 879 880 // Look for the node 881 snap, err := n.srv.fsm.State().Snapshot() 882 if err != nil { 883 return err 884 } 885 node, err := snap.NodeByID(nil, args.NodeID) 886 if err != nil { 887 return err 888 } 889 if node == nil { 890 return fmt.Errorf("node not found") 891 } 892 893 if node.DrainStrategy != nil && args.Eligibility == structs.NodeSchedulingEligible { 894 return fmt.Errorf("can not set node's scheduling eligibility to eligible while it is draining") 895 } 896 897 switch args.Eligibility { 898 case structs.NodeSchedulingEligible, structs.NodeSchedulingIneligible: 899 default: 900 return fmt.Errorf("invalid scheduling eligibility %q", args.Eligibility) 901 } 902 903 // Update the timestamp of when the node status was updated 904 args.UpdatedAt = time.Now().Unix() 905 906 // Construct the node event 907 args.NodeEvent = structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster) 908 if node.SchedulingEligibility == args.Eligibility { 909 return nil // Nothing to do 910 } else if args.Eligibility == structs.NodeSchedulingEligible { 911 n.logger.Info("node transitioning to eligible state", "node_id", node.ID) 912 args.NodeEvent.SetMessage(NodeEligibilityEventEligible) 913 } else { 914 n.logger.Info("node transitioning to ineligible state", "node_id", node.ID) 915 args.NodeEvent.SetMessage(NodeEligibilityEventIneligible) 916 } 917 918 // Commit this update via Raft 919 outErr, index, err := n.srv.raftApply(structs.NodeUpdateEligibilityRequestType, args) 920 if err != nil { 921 n.logger.Error("eligibility update failed", "error", err) 922 return err 923 } 924 if outErr != nil { 925 if err, ok := outErr.(error); ok && err != nil { 926 n.logger.Error("eligibility update failed", "error", err) 927 return err 928 } 929 } 930 931 // If the node is transitioning to be eligible, create Node evaluations 932 // because there may be a System job registered that should be evaluated. 933 if node.SchedulingEligibility == structs.NodeSchedulingIneligible && args.Eligibility == structs.NodeSchedulingEligible { 934 evalIDs, evalIndex, err := n.createNodeEvals(node, index) 935 if err != nil { 936 n.logger.Error("eval creation failed", "error", err) 937 return err 938 } 939 reply.EvalIDs = evalIDs 940 reply.EvalCreateIndex = evalIndex 941 } 942 943 // Set the reply index 944 reply.Index = index 945 return nil 946 } 947 948 // Evaluate is used to force a re-evaluation of the node 949 func (n *Node) Evaluate(args *structs.NodeEvaluateRequest, reply *structs.NodeUpdateResponse) error { 950 951 authErr := n.srv.Authenticate(n.ctx, args) 952 if done, err := n.srv.forward("Node.Evaluate", args, args, reply); done { 953 return err 954 } 955 n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args) 956 if authErr != nil { 957 return structs.ErrPermissionDenied 958 } 959 defer metrics.MeasureSince([]string{"nomad", "client", "evaluate"}, time.Now()) 960 961 // Check node write permissions 962 if aclObj, err := n.srv.ResolveACL(args); err != nil { 963 return err 964 } else if aclObj != nil && !aclObj.AllowNodeWrite() { 965 return structs.ErrPermissionDenied 966 } 967 968 // Verify the arguments 969 if args.NodeID == "" { 970 return fmt.Errorf("missing node ID for evaluation") 971 } 972 973 // Look for the node 974 snap, err := n.srv.fsm.State().Snapshot() 975 if err != nil { 976 return err 977 } 978 ws := memdb.NewWatchSet() 979 node, err := snap.NodeByID(ws, args.NodeID) 980 if err != nil { 981 return err 982 } 983 if node == nil { 984 return fmt.Errorf("node not found") 985 } 986 987 // Create the evaluation 988 evalIDs, evalIndex, err := n.createNodeEvals(node, node.ModifyIndex) 989 if err != nil { 990 n.logger.Error("eval creation failed", "error", err) 991 return err 992 } 993 reply.EvalIDs = evalIDs 994 reply.EvalCreateIndex = evalIndex 995 996 // Set the reply index 997 reply.Index = evalIndex 998 999 n.srv.peerLock.RLock() 1000 defer n.srv.peerLock.RUnlock() 1001 if err := n.constructNodeServerInfoResponse(node.GetID(), snap, reply); err != nil { 1002 n.logger.Error("failed to populate NodeUpdateResponse", "error", err) 1003 return err 1004 } 1005 return nil 1006 } 1007 1008 // GetNode is used to request information about a specific node 1009 func (n *Node) GetNode(args *structs.NodeSpecificRequest, 1010 reply *structs.SingleNodeResponse) error { 1011 1012 authErr := n.srv.Authenticate(n.ctx, args) 1013 if done, err := n.srv.forward("Node.GetNode", args, args, reply); done { 1014 return err 1015 } 1016 n.srv.MeasureRPCRate("node", structs.RateMetricRead, args) 1017 if authErr != nil { 1018 return structs.ErrPermissionDenied 1019 } 1020 defer metrics.MeasureSince([]string{"nomad", "client", "get_node"}, time.Now()) 1021 1022 // Check node read permissions 1023 aclObj, err := n.srv.ResolveClientOrACL(args) 1024 if err != nil { 1025 return err 1026 } 1027 if aclObj != nil && !aclObj.AllowNodeRead() { 1028 return structs.ErrPermissionDenied 1029 } 1030 1031 // Setup the blocking query 1032 opts := blockingOptions{ 1033 queryOpts: &args.QueryOptions, 1034 queryMeta: &reply.QueryMeta, 1035 run: func(ws memdb.WatchSet, state *state.StateStore) error { 1036 // Verify the arguments 1037 if args.NodeID == "" { 1038 return fmt.Errorf("missing node ID") 1039 } 1040 1041 // Look for the node 1042 out, err := state.NodeByID(ws, args.NodeID) 1043 if err != nil { 1044 return err 1045 } 1046 1047 // Setup the output 1048 if out != nil { 1049 out = out.Sanitize() 1050 reply.Node = out 1051 reply.Index = out.ModifyIndex 1052 } else { 1053 // Use the last index that affected the nodes table 1054 index, err := state.Index("nodes") 1055 if err != nil { 1056 return err 1057 } 1058 reply.Node = nil 1059 reply.Index = index 1060 } 1061 1062 // Set the query response 1063 n.srv.setQueryMeta(&reply.QueryMeta) 1064 return nil 1065 }} 1066 return n.srv.blockingRPC(&opts) 1067 } 1068 1069 // GetAllocs is used to request allocations for a specific node 1070 func (n *Node) GetAllocs(args *structs.NodeSpecificRequest, 1071 reply *structs.NodeAllocsResponse) error { 1072 1073 authErr := n.srv.Authenticate(n.ctx, args) 1074 if done, err := n.srv.forward("Node.GetAllocs", args, args, reply); done { 1075 return err 1076 } 1077 n.srv.MeasureRPCRate("node", structs.RateMetricList, args) 1078 if authErr != nil { 1079 return structs.ErrPermissionDenied 1080 } 1081 defer metrics.MeasureSince([]string{"nomad", "client", "get_allocs"}, time.Now()) 1082 1083 // Check node read and namespace job read permissions 1084 aclObj, err := n.srv.ResolveACL(args) 1085 if err != nil { 1086 return err 1087 } 1088 if aclObj != nil && !aclObj.AllowNodeRead() { 1089 return structs.ErrPermissionDenied 1090 } 1091 1092 // cache namespace perms 1093 readableNamespaces := map[string]bool{} 1094 1095 // readNS is a caching namespace read-job helper 1096 readNS := func(ns string) bool { 1097 if aclObj == nil { 1098 // ACLs are disabled; everything is readable 1099 return true 1100 } 1101 1102 if readable, ok := readableNamespaces[ns]; ok { 1103 // cache hit 1104 return readable 1105 } 1106 1107 // cache miss 1108 readable := aclObj.AllowNsOp(ns, acl.NamespaceCapabilityReadJob) 1109 readableNamespaces[ns] = readable 1110 return readable 1111 } 1112 1113 // Verify the arguments 1114 if args.NodeID == "" { 1115 return fmt.Errorf("missing node ID") 1116 } 1117 1118 // Setup the blocking query 1119 opts := blockingOptions{ 1120 queryOpts: &args.QueryOptions, 1121 queryMeta: &reply.QueryMeta, 1122 run: func(ws memdb.WatchSet, state *state.StateStore) error { 1123 // Look for the node 1124 allocs, err := state.AllocsByNode(ws, args.NodeID) 1125 if err != nil { 1126 return err 1127 } 1128 1129 // Setup the output 1130 if n := len(allocs); n != 0 { 1131 reply.Allocs = make([]*structs.Allocation, 0, n) 1132 for _, alloc := range allocs { 1133 if readNS(alloc.Namespace) { 1134 reply.Allocs = append(reply.Allocs, alloc) 1135 } 1136 1137 // Get the max of all allocs since 1138 // subsequent requests need to start 1139 // from the latest index 1140 reply.Index = maxUint64(reply.Index, alloc.ModifyIndex) 1141 } 1142 } else { 1143 reply.Allocs = nil 1144 1145 // Use the last index that affected the nodes table 1146 index, err := state.Index("allocs") 1147 if err != nil { 1148 return err 1149 } 1150 1151 // Must provide non-zero index to prevent blocking 1152 // Index 1 is impossible anyways (due to Raft internals) 1153 if index == 0 { 1154 reply.Index = 1 1155 } else { 1156 reply.Index = index 1157 } 1158 } 1159 return nil 1160 }} 1161 return n.srv.blockingRPC(&opts) 1162 } 1163 1164 // GetClientAllocs is used to request a lightweight list of alloc modify indexes 1165 // per allocation. 1166 func (n *Node) GetClientAllocs(args *structs.NodeSpecificRequest, 1167 reply *structs.NodeClientAllocsResponse) error { 1168 1169 authErr := n.srv.Authenticate(n.ctx, args) 1170 isForwarded := args.IsForwarded() 1171 if done, err := n.srv.forward("Node.GetClientAllocs", args, args, reply); done { 1172 // We have a valid node connection since there is no error from the 1173 // forwarded server, so add the mapping to cache the 1174 // connection and allow the server to send RPCs to the client. 1175 if err == nil && n.ctx != nil && n.ctx.NodeID == "" && !isForwarded { 1176 n.ctx.NodeID = args.NodeID 1177 n.srv.addNodeConn(n.ctx) 1178 } 1179 1180 return err 1181 } 1182 n.srv.MeasureRPCRate("node", structs.RateMetricList, args) 1183 if authErr != nil { 1184 return structs.ErrPermissionDenied 1185 } 1186 defer metrics.MeasureSince([]string{"nomad", "client", "get_client_allocs"}, time.Now()) 1187 1188 // Verify the arguments 1189 if args.NodeID == "" { 1190 return fmt.Errorf("missing node ID") 1191 } 1192 1193 // numOldAllocs is used to detect if there is a garbage collection event 1194 // that effects the node. When an allocation is garbage collected, that does 1195 // not change the modify index changes and thus the query won't unblock, 1196 // even though the set of allocations on the node has changed. 1197 var numOldAllocs int 1198 1199 // Setup the blocking query 1200 opts := blockingOptions{ 1201 queryOpts: &args.QueryOptions, 1202 queryMeta: &reply.QueryMeta, 1203 run: func(ws memdb.WatchSet, state *state.StateStore) error { 1204 // Look for the node 1205 node, err := state.NodeByID(ws, args.NodeID) 1206 if err != nil { 1207 return err 1208 } 1209 1210 var allocs []*structs.Allocation 1211 if node != nil { 1212 if args.SecretID == "" { 1213 return fmt.Errorf("missing node secret ID for client status update") 1214 } else if args.SecretID != node.SecretID { 1215 return fmt.Errorf("node secret ID does not match") 1216 } 1217 1218 // We have a valid node connection, so add the mapping to cache the 1219 // connection and allow the server to send RPCs to the client. We only cache 1220 // the connection if it is not being forwarded from another server. 1221 if n.ctx != nil && n.ctx.NodeID == "" && !args.IsForwarded() { 1222 n.ctx.NodeID = args.NodeID 1223 n.srv.addNodeConn(n.ctx) 1224 } 1225 1226 var err error 1227 allocs, err = state.AllocsByNode(ws, args.NodeID) 1228 if err != nil { 1229 return err 1230 } 1231 } 1232 1233 reply.Allocs = make(map[string]uint64) 1234 reply.MigrateTokens = make(map[string]string) 1235 1236 // preferTableIndex is used to determine whether we should build the 1237 // response index based on the full table indexes versus the modify 1238 // indexes of the allocations on the specific node. This is 1239 // preferred in the case that the node doesn't yet have allocations 1240 // or when we detect a GC that effects the node. 1241 preferTableIndex := true 1242 1243 // Setup the output 1244 if numAllocs := len(allocs); numAllocs != 0 { 1245 preferTableIndex = false 1246 1247 for _, alloc := range allocs { 1248 reply.Allocs[alloc.ID] = alloc.AllocModifyIndex 1249 1250 // If the allocation is going to do a migration, create a 1251 // migration token so that the client can authenticate with 1252 // the node hosting the previous allocation. 1253 if alloc.ShouldMigrate() { 1254 prevAllocation, err := state.AllocByID(ws, alloc.PreviousAllocation) 1255 if err != nil { 1256 return err 1257 } 1258 1259 if prevAllocation != nil && prevAllocation.NodeID != alloc.NodeID { 1260 allocNode, err := state.NodeByID(ws, prevAllocation.NodeID) 1261 if err != nil { 1262 return err 1263 } 1264 if allocNode == nil { 1265 // Node must have been GC'd so skip the token 1266 continue 1267 } 1268 1269 token, err := structs.GenerateMigrateToken(prevAllocation.ID, allocNode.SecretID) 1270 if err != nil { 1271 return err 1272 } 1273 reply.MigrateTokens[alloc.ID] = token 1274 } 1275 } 1276 1277 reply.Index = maxUint64(reply.Index, alloc.ModifyIndex) 1278 } 1279 1280 // Determine if we have less allocations than before. This 1281 // indicates there was a garbage collection 1282 if numAllocs < numOldAllocs { 1283 preferTableIndex = true 1284 } 1285 1286 // Store the new number of allocations 1287 numOldAllocs = numAllocs 1288 } 1289 1290 if preferTableIndex { 1291 // Use the last index that affected the nodes table 1292 index, err := state.Index("allocs") 1293 if err != nil { 1294 return err 1295 } 1296 1297 // Must provide non-zero index to prevent blocking 1298 // Index 1 is impossible anyways (due to Raft internals) 1299 if index == 0 { 1300 reply.Index = 1 1301 } else { 1302 reply.Index = index 1303 } 1304 } 1305 return nil 1306 }} 1307 return n.srv.blockingRPC(&opts) 1308 } 1309 1310 // UpdateAlloc is used to update the client status of an allocation. It should 1311 // only be called by clients. 1312 // 1313 // Calling this method returns an error when: 1314 // - The node is not registered in the server yet. Clients must first call the 1315 // Register method. 1316 // - The node status is down or disconnected. Clients must call the 1317 // UpdateStatus method to update its status in the server. 1318 func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.GenericResponse) error { 1319 1320 authErr := n.srv.Authenticate(n.ctx, args) 1321 1322 // Ensure the connection was initiated by another client if TLS is used. 1323 err := validateTLSCertificateLevel(n.srv, n.ctx, tlsCertificateLevelClient) 1324 if err != nil { 1325 return err 1326 } 1327 if done, err := n.srv.forward("Node.UpdateAlloc", args, args, reply); done { 1328 return err 1329 } 1330 n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args) 1331 if authErr != nil { 1332 return structs.ErrPermissionDenied 1333 } 1334 1335 defer metrics.MeasureSince([]string{"nomad", "client", "update_alloc"}, time.Now()) 1336 1337 // Ensure at least a single alloc 1338 if len(args.Alloc) == 0 { 1339 return fmt.Errorf("must update at least one allocation") 1340 } 1341 1342 // Ensure the node is allowed to update allocs. 1343 // The node needs to successfully heartbeat before updating its allocs. 1344 nodeID := args.Alloc[0].NodeID 1345 if nodeID == "" { 1346 return fmt.Errorf("missing node ID") 1347 } 1348 1349 node, err := n.srv.State().NodeByID(nil, nodeID) 1350 if err != nil { 1351 return fmt.Errorf("failed to retrieve node %s: %v", nodeID, err) 1352 } 1353 if node == nil { 1354 return fmt.Errorf("node %s not found", nodeID) 1355 } 1356 if node.UnresponsiveStatus() { 1357 return fmt.Errorf("node %s is not allowed to update allocs while in status %s", nodeID, node.Status) 1358 } 1359 1360 // Ensure that evals aren't set from client RPCs 1361 // We create them here before the raft update 1362 if len(args.Evals) != 0 { 1363 return fmt.Errorf("evals field must not be set") 1364 } 1365 1366 // Update modified timestamp for client initiated allocation updates 1367 now := time.Now() 1368 var evals []*structs.Evaluation 1369 1370 for _, allocToUpdate := range args.Alloc { 1371 evalTriggerBy := "" 1372 allocToUpdate.ModifyTime = now.UTC().UnixNano() 1373 1374 alloc, _ := n.srv.State().AllocByID(nil, allocToUpdate.ID) 1375 if alloc == nil { 1376 continue 1377 } 1378 1379 if !allocToUpdate.TerminalStatus() && alloc.ClientStatus != structs.AllocClientStatusUnknown { 1380 continue 1381 } 1382 1383 var job *structs.Job 1384 var jobType string 1385 var jobPriority int 1386 1387 job, err = n.srv.State().JobByID(nil, alloc.Namespace, alloc.JobID) 1388 if err != nil { 1389 n.logger.Debug("UpdateAlloc unable to find job", "job", alloc.JobID, "error", err) 1390 continue 1391 } 1392 1393 // If the job is nil it means it has been de-registered. 1394 if job == nil { 1395 jobType = alloc.Job.Type 1396 jobPriority = alloc.Job.Priority 1397 evalTriggerBy = structs.EvalTriggerJobDeregister 1398 allocToUpdate.DesiredStatus = structs.AllocDesiredStatusStop 1399 n.logger.Debug("UpdateAlloc unable to find job - shutting down alloc", "job", alloc.JobID) 1400 } 1401 1402 var taskGroup *structs.TaskGroup 1403 if job != nil { 1404 jobType = job.Type 1405 jobPriority = job.Priority 1406 taskGroup = job.LookupTaskGroup(alloc.TaskGroup) 1407 } 1408 1409 // If we cannot find the task group for a failed alloc we cannot continue, unless it is an orphan. 1410 if evalTriggerBy != structs.EvalTriggerJobDeregister && 1411 allocToUpdate.ClientStatus == structs.AllocClientStatusFailed && 1412 alloc.FollowupEvalID == "" { 1413 1414 if taskGroup == nil { 1415 n.logger.Debug("UpdateAlloc unable to find task group for job", "job", alloc.JobID, "alloc", alloc.ID, "task_group", alloc.TaskGroup) 1416 continue 1417 } 1418 1419 // Set trigger by failed if not an orphan. 1420 if alloc.RescheduleEligible(taskGroup.ReschedulePolicy, now) { 1421 evalTriggerBy = structs.EvalTriggerRetryFailedAlloc 1422 } 1423 } 1424 1425 var eval *structs.Evaluation 1426 // If unknown, and not an orphan, set the trigger by. 1427 if evalTriggerBy != structs.EvalTriggerJobDeregister && 1428 alloc.ClientStatus == structs.AllocClientStatusUnknown { 1429 evalTriggerBy = structs.EvalTriggerReconnect 1430 } 1431 1432 // If we weren't able to determine one of our expected eval triggers, 1433 // continue and don't create an eval. 1434 if evalTriggerBy == "" { 1435 continue 1436 } 1437 1438 eval = &structs.Evaluation{ 1439 ID: uuid.Generate(), 1440 Namespace: alloc.Namespace, 1441 TriggeredBy: evalTriggerBy, 1442 JobID: alloc.JobID, 1443 Type: jobType, 1444 Priority: jobPriority, 1445 Status: structs.EvalStatusPending, 1446 CreateTime: now.UTC().UnixNano(), 1447 ModifyTime: now.UTC().UnixNano(), 1448 } 1449 evals = append(evals, eval) 1450 } 1451 1452 // Add this to the batch 1453 n.updatesLock.Lock() 1454 n.updates = append(n.updates, args.Alloc...) 1455 n.evals = append(n.evals, evals...) 1456 1457 // Start a new batch if none 1458 future := n.updateFuture 1459 if future == nil { 1460 future = structs.NewBatchFuture() 1461 n.updateFuture = future 1462 n.updateTimer = time.AfterFunc(batchUpdateInterval, func() { 1463 // Get the pending updates 1464 n.updatesLock.Lock() 1465 updates := n.updates 1466 evals := n.evals 1467 future := n.updateFuture 1468 1469 // Assume future update patterns will be similar to 1470 // current batch and set cap appropriately to avoid 1471 // slice resizing. 1472 n.updates = make([]*structs.Allocation, 0, len(updates)) 1473 n.evals = make([]*structs.Evaluation, 0, len(evals)) 1474 1475 n.updateFuture = nil 1476 n.updateTimer = nil 1477 n.updatesLock.Unlock() 1478 1479 // Perform the batch update 1480 n.batchUpdate(future, updates, evals) 1481 }) 1482 } 1483 n.updatesLock.Unlock() 1484 1485 // Wait for the future 1486 if err := future.Wait(); err != nil { 1487 return err 1488 } 1489 1490 // Setup the response 1491 reply.Index = future.Index() 1492 return nil 1493 } 1494 1495 // batchUpdate is used to update all the allocations 1496 func (n *Node) batchUpdate(future *structs.BatchFuture, updates []*structs.Allocation, evals []*structs.Evaluation) { 1497 var mErr multierror.Error 1498 // Group pending evals by jobID to prevent creating unnecessary evals 1499 evalsByJobId := make(map[structs.NamespacedID]struct{}) 1500 var trimmedEvals []*structs.Evaluation 1501 for _, eval := range evals { 1502 namespacedID := structs.NamespacedID{ 1503 ID: eval.JobID, 1504 Namespace: eval.Namespace, 1505 } 1506 _, exists := evalsByJobId[namespacedID] 1507 if !exists { 1508 now := time.Now().UTC().UnixNano() 1509 eval.CreateTime = now 1510 eval.ModifyTime = now 1511 trimmedEvals = append(trimmedEvals, eval) 1512 evalsByJobId[namespacedID] = struct{}{} 1513 } 1514 } 1515 1516 if len(trimmedEvals) > 0 { 1517 n.logger.Debug("adding evaluations for rescheduling failed allocations", "num_evals", len(trimmedEvals)) 1518 } 1519 // Prepare the batch update 1520 batch := &structs.AllocUpdateRequest{ 1521 Alloc: updates, 1522 Evals: trimmedEvals, 1523 WriteRequest: structs.WriteRequest{Region: n.srv.config.Region}, 1524 } 1525 1526 // Commit this update via Raft 1527 _, index, err := n.srv.raftApply(structs.AllocClientUpdateRequestType, batch) 1528 if err != nil { 1529 n.logger.Error("alloc update failed", "error", err) 1530 mErr.Errors = append(mErr.Errors, err) 1531 } 1532 1533 // For each allocation we are updating, check if we should revoke any 1534 // - Vault token accessors 1535 // - Service Identity token accessors 1536 var ( 1537 revokeVault []*structs.VaultAccessor 1538 revokeSI []*structs.SITokenAccessor 1539 ) 1540 1541 for _, alloc := range updates { 1542 // Skip any allocation that isn't dead on the client 1543 if !alloc.Terminated() { 1544 continue 1545 } 1546 1547 ws := memdb.NewWatchSet() 1548 1549 // Determine if there are any orphaned Vault accessors for the allocation 1550 if accessors, err := n.srv.State().VaultAccessorsByAlloc(ws, alloc.ID); err != nil { 1551 n.logger.Error("looking up vault accessors for alloc failed", "alloc_id", alloc.ID, "error", err) 1552 mErr.Errors = append(mErr.Errors, err) 1553 } else { 1554 revokeVault = append(revokeVault, accessors...) 1555 } 1556 1557 // Determine if there are any orphaned SI accessors for the allocation 1558 if accessors, err := n.srv.State().SITokenAccessorsByAlloc(ws, alloc.ID); err != nil { 1559 n.logger.Error("looking up si accessors for alloc failed", "alloc_id", alloc.ID, "error", err) 1560 mErr.Errors = append(mErr.Errors, err) 1561 } else { 1562 revokeSI = append(revokeSI, accessors...) 1563 } 1564 } 1565 1566 // Revoke any orphaned Vault token accessors 1567 if l := len(revokeVault); l > 0 { 1568 n.logger.Debug("revoking vault accessors due to terminal allocations", "num_accessors", l) 1569 if err := n.srv.vault.RevokeTokens(context.Background(), revokeVault, true); err != nil { 1570 n.logger.Error("batched vault accessor revocation failed", "error", err) 1571 mErr.Errors = append(mErr.Errors, err) 1572 } 1573 } 1574 1575 // Revoke any orphaned SI token accessors 1576 if l := len(revokeSI); l > 0 { 1577 n.logger.Debug("revoking si accessors due to terminal allocations", "num_accessors", l) 1578 _ = n.srv.consulACLs.RevokeTokens(context.Background(), revokeSI, true) 1579 } 1580 1581 // Respond to the future 1582 future.Respond(index, mErr.ErrorOrNil()) 1583 } 1584 1585 // List is used to list the available nodes 1586 func (n *Node) List(args *structs.NodeListRequest, 1587 reply *structs.NodeListResponse) error { 1588 1589 authErr := n.srv.Authenticate(n.ctx, args) 1590 if done, err := n.srv.forward("Node.List", args, args, reply); done { 1591 return err 1592 } 1593 n.srv.MeasureRPCRate("node", structs.RateMetricList, args) 1594 if authErr != nil { 1595 return structs.ErrPermissionDenied 1596 } 1597 defer metrics.MeasureSince([]string{"nomad", "client", "list"}, time.Now()) 1598 1599 // Check node read permissions 1600 if aclObj, err := n.srv.ResolveACL(args); err != nil { 1601 return err 1602 } else if aclObj != nil && !aclObj.AllowNodeRead() { 1603 return structs.ErrPermissionDenied 1604 } 1605 1606 // Set up the blocking query. 1607 opts := blockingOptions{ 1608 queryOpts: &args.QueryOptions, 1609 queryMeta: &reply.QueryMeta, 1610 run: func(ws memdb.WatchSet, state *state.StateStore) error { 1611 1612 var err error 1613 var iter memdb.ResultIterator 1614 if prefix := args.QueryOptions.Prefix; prefix != "" { 1615 iter, err = state.NodesByIDPrefix(ws, prefix) 1616 } else { 1617 iter, err = state.Nodes(ws) 1618 } 1619 if err != nil { 1620 return err 1621 } 1622 1623 // Generate the tokenizer to use for pagination using the populated 1624 // paginatorOpts object. The ID of a node must be unique within the 1625 // region, therefore we only need WithID on the paginator options. 1626 tokenizer := paginator.NewStructsTokenizer(iter, paginator.StructsTokenizerOptions{WithID: true}) 1627 1628 var nodes []*structs.NodeListStub 1629 1630 // Build the paginator. This includes the function that is 1631 // responsible for appending a node to the nodes array. 1632 paginatorImpl, err := paginator.NewPaginator(iter, tokenizer, nil, args.QueryOptions, 1633 func(raw interface{}) error { 1634 nodes = append(nodes, raw.(*structs.Node).Stub(args.Fields)) 1635 return nil 1636 }) 1637 if err != nil { 1638 return structs.NewErrRPCCodedf( 1639 http.StatusBadRequest, "failed to create result paginator: %v", err) 1640 } 1641 1642 // Calling page populates our output nodes array as well as returns 1643 // the next token. 1644 nextToken, err := paginatorImpl.Page() 1645 if err != nil { 1646 return structs.NewErrRPCCodedf( 1647 http.StatusBadRequest, "failed to read result page: %v", err) 1648 } 1649 1650 // Populate the reply. 1651 reply.Nodes = nodes 1652 reply.NextToken = nextToken 1653 1654 // Use the last index that affected the jobs table 1655 index, err := state.Index("nodes") 1656 if err != nil { 1657 return err 1658 } 1659 reply.Index = index 1660 1661 // Set the query response 1662 n.srv.setQueryMeta(&reply.QueryMeta) 1663 return nil 1664 }} 1665 return n.srv.blockingRPC(&opts) 1666 } 1667 1668 // createNodeEvals is used to create evaluations for each alloc on a node. 1669 // Each Eval is scoped to a job, so we need to potentially trigger many evals. 1670 func (n *Node) createNodeEvals(node *structs.Node, nodeIndex uint64) ([]string, uint64, error) { 1671 nodeID := node.ID 1672 1673 // Snapshot the state 1674 snap, err := n.srv.fsm.State().Snapshot() 1675 if err != nil { 1676 return nil, 0, fmt.Errorf("failed to snapshot state: %v", err) 1677 } 1678 1679 // Find all the allocations for this node 1680 allocs, err := snap.AllocsByNode(nil, nodeID) 1681 if err != nil { 1682 return nil, 0, fmt.Errorf("failed to find allocs for '%s': %v", nodeID, err) 1683 } 1684 1685 sysJobsIter, err := snap.JobsByScheduler(nil, "system") 1686 if err != nil { 1687 return nil, 0, fmt.Errorf("failed to find system jobs for '%s': %v", nodeID, err) 1688 } 1689 1690 var sysJobs []*structs.Job 1691 for jobI := sysJobsIter.Next(); jobI != nil; jobI = sysJobsIter.Next() { 1692 job := jobI.(*structs.Job) 1693 // Avoid creating evals for jobs that don't run in this datacenter or 1694 // node pool. We could perform an entire feasibility check here, but 1695 // datacenter/pool is a good optimization to start with as their 1696 // cardinality tends to be low so the check shouldn't add much work. 1697 if node.IsInPool(job.NodePool) && node.IsInAnyDC(job.Datacenters) { 1698 sysJobs = append(sysJobs, job) 1699 } 1700 } 1701 1702 // Fast-path if nothing to do 1703 if len(allocs) == 0 && len(sysJobs) == 0 { 1704 return nil, 0, nil 1705 } 1706 1707 // Create an eval for each JobID affected 1708 var evals []*structs.Evaluation 1709 var evalIDs []string 1710 jobIDs := map[structs.NamespacedID]struct{}{} 1711 now := time.Now().UTC().UnixNano() 1712 1713 for _, alloc := range allocs { 1714 // Deduplicate on JobID 1715 if _, ok := jobIDs[alloc.JobNamespacedID()]; ok { 1716 continue 1717 } 1718 jobIDs[alloc.JobNamespacedID()] = struct{}{} 1719 1720 // Create a new eval 1721 eval := &structs.Evaluation{ 1722 ID: uuid.Generate(), 1723 Namespace: alloc.Namespace, 1724 Priority: alloc.Job.Priority, 1725 Type: alloc.Job.Type, 1726 TriggeredBy: structs.EvalTriggerNodeUpdate, 1727 JobID: alloc.JobID, 1728 NodeID: nodeID, 1729 NodeModifyIndex: nodeIndex, 1730 Status: structs.EvalStatusPending, 1731 CreateTime: now, 1732 ModifyTime: now, 1733 } 1734 1735 evals = append(evals, eval) 1736 evalIDs = append(evalIDs, eval.ID) 1737 } 1738 1739 // Create an evaluation for each system job. 1740 for _, job := range sysJobs { 1741 // Still dedup on JobID as the node may already have the system job. 1742 if _, ok := jobIDs[job.NamespacedID()]; ok { 1743 continue 1744 } 1745 jobIDs[job.NamespacedID()] = struct{}{} 1746 1747 // Create a new eval 1748 eval := &structs.Evaluation{ 1749 ID: uuid.Generate(), 1750 Namespace: job.Namespace, 1751 Priority: job.Priority, 1752 Type: job.Type, 1753 TriggeredBy: structs.EvalTriggerNodeUpdate, 1754 JobID: job.ID, 1755 NodeID: nodeID, 1756 NodeModifyIndex: nodeIndex, 1757 Status: structs.EvalStatusPending, 1758 CreateTime: now, 1759 ModifyTime: now, 1760 } 1761 evals = append(evals, eval) 1762 evalIDs = append(evalIDs, eval.ID) 1763 } 1764 1765 // Create the Raft transaction 1766 update := &structs.EvalUpdateRequest{ 1767 Evals: evals, 1768 WriteRequest: structs.WriteRequest{Region: n.srv.config.Region}, 1769 } 1770 1771 // Commit this evaluation via Raft 1772 // XXX: There is a risk of partial failure where the node update succeeds 1773 // but that the EvalUpdate does not. 1774 _, evalIndex, err := n.srv.raftApply(structs.EvalUpdateRequestType, update) 1775 if err != nil { 1776 return nil, 0, err 1777 } 1778 return evalIDs, evalIndex, nil 1779 } 1780 1781 // DeriveVaultToken is used by the clients to request wrapped Vault tokens for 1782 // tasks 1783 func (n *Node) DeriveVaultToken(args *structs.DeriveVaultTokenRequest, reply *structs.DeriveVaultTokenResponse) error { 1784 1785 authErr := n.srv.Authenticate(n.ctx, args) 1786 1787 setError := func(e error, recoverable bool) { 1788 if e != nil { 1789 if re, ok := e.(*structs.RecoverableError); ok { 1790 reply.Error = re // No need to wrap if error is already a RecoverableError 1791 } else { 1792 reply.Error = structs.NewRecoverableError(e, recoverable).(*structs.RecoverableError) 1793 } 1794 n.logger.Error("DeriveVaultToken failed", "recoverable", recoverable, "error", e) 1795 } 1796 } 1797 1798 if done, err := n.srv.forward("Node.DeriveVaultToken", args, args, reply); done { 1799 setError(err, structs.IsRecoverable(err) || err == structs.ErrNoLeader) 1800 return nil 1801 } 1802 n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args) 1803 if authErr != nil { 1804 return structs.ErrPermissionDenied 1805 } 1806 defer metrics.MeasureSince([]string{"nomad", "client", "derive_vault_token"}, time.Now()) 1807 1808 // Verify the arguments 1809 if args.NodeID == "" { 1810 setError(fmt.Errorf("missing node ID"), false) 1811 return nil 1812 } 1813 if args.SecretID == "" { 1814 setError(fmt.Errorf("missing node SecretID"), false) 1815 return nil 1816 } 1817 if args.AllocID == "" { 1818 setError(fmt.Errorf("missing allocation ID"), false) 1819 return nil 1820 } 1821 if len(args.Tasks) == 0 { 1822 setError(fmt.Errorf("no tasks specified"), false) 1823 return nil 1824 } 1825 1826 // Verify the following: 1827 // * The Node exists and has the correct SecretID 1828 // * The Allocation exists on the specified Node 1829 // * The Allocation contains the given tasks and they each require Vault 1830 // tokens 1831 snap, err := n.srv.fsm.State().Snapshot() 1832 if err != nil { 1833 setError(err, false) 1834 return nil 1835 } 1836 ws := memdb.NewWatchSet() 1837 node, err := snap.NodeByID(ws, args.NodeID) 1838 if err != nil { 1839 setError(err, false) 1840 return nil 1841 } 1842 if node == nil { 1843 setError(fmt.Errorf("Node %q does not exist", args.NodeID), false) 1844 return nil 1845 } 1846 if node.SecretID != args.SecretID { 1847 setError(fmt.Errorf("SecretID mismatch"), false) 1848 return nil 1849 } 1850 1851 alloc, err := snap.AllocByID(ws, args.AllocID) 1852 if err != nil { 1853 setError(err, false) 1854 return nil 1855 } 1856 if alloc == nil { 1857 setError(fmt.Errorf("Allocation %q does not exist", args.AllocID), false) 1858 return nil 1859 } 1860 if alloc.NodeID != args.NodeID { 1861 setError(fmt.Errorf("Allocation %q not running on Node %q", args.AllocID, args.NodeID), false) 1862 return nil 1863 } 1864 if alloc.TerminalStatus() { 1865 setError(fmt.Errorf("Can't request Vault token for terminal allocation"), false) 1866 return nil 1867 } 1868 1869 // Check if alloc has Vault 1870 vaultBlocks := alloc.Job.Vault() 1871 if vaultBlocks == nil { 1872 setError(fmt.Errorf("Job does not require Vault token"), false) 1873 return nil 1874 } 1875 tg, ok := vaultBlocks[alloc.TaskGroup] 1876 if !ok { 1877 setError(fmt.Errorf("Task group does not require Vault token"), false) 1878 return nil 1879 } 1880 1881 var unneeded []string 1882 for _, task := range args.Tasks { 1883 taskVault := tg[task] 1884 if taskVault == nil || len(taskVault.Policies) == 0 { 1885 unneeded = append(unneeded, task) 1886 } 1887 } 1888 1889 if len(unneeded) != 0 { 1890 e := fmt.Errorf("Requested Vault tokens for tasks without defined Vault policies: %s", 1891 strings.Join(unneeded, ", ")) 1892 setError(e, false) 1893 return nil 1894 } 1895 1896 // At this point the request is valid and we should contact Vault for 1897 // tokens. 1898 1899 // Create an error group where we will spin up a fixed set of goroutines to 1900 // handle deriving tokens but where if any fails the whole group is 1901 // canceled. 1902 g, ctx := errgroup.WithContext(context.Background()) 1903 1904 // Cap the handlers 1905 handlers := len(args.Tasks) 1906 if handlers > maxParallelRequestsPerDerive { 1907 handlers = maxParallelRequestsPerDerive 1908 } 1909 1910 // Create the Vault Tokens 1911 input := make(chan string, handlers) 1912 results := make(map[string]*vapi.Secret, len(args.Tasks)) 1913 for i := 0; i < handlers; i++ { 1914 g.Go(func() error { 1915 for { 1916 select { 1917 case task, ok := <-input: 1918 if !ok { 1919 return nil 1920 } 1921 1922 secret, err := n.srv.vault.CreateToken(ctx, alloc, task) 1923 if err != nil { 1924 return err 1925 } 1926 1927 results[task] = secret 1928 case <-ctx.Done(): 1929 return nil 1930 } 1931 } 1932 }) 1933 } 1934 1935 // Send the input 1936 go func() { 1937 defer close(input) 1938 for _, task := range args.Tasks { 1939 select { 1940 case <-ctx.Done(): 1941 return 1942 case input <- task: 1943 } 1944 } 1945 }() 1946 1947 // Wait for everything to complete or for an error 1948 createErr := g.Wait() 1949 1950 // Retrieve the results 1951 accessors := make([]*structs.VaultAccessor, 0, len(results)) 1952 tokens := make(map[string]string, len(results)) 1953 for task, secret := range results { 1954 w := secret.WrapInfo 1955 tokens[task] = w.Token 1956 accessor := &structs.VaultAccessor{ 1957 Accessor: w.WrappedAccessor, 1958 Task: task, 1959 NodeID: alloc.NodeID, 1960 AllocID: alloc.ID, 1961 CreationTTL: w.TTL, 1962 } 1963 1964 accessors = append(accessors, accessor) 1965 } 1966 1967 // If there was an error revoke the created tokens 1968 if createErr != nil { 1969 n.logger.Error("Vault token creation for alloc failed", "alloc_id", alloc.ID, "error", createErr) 1970 1971 if revokeErr := n.srv.vault.RevokeTokens(context.Background(), accessors, false); revokeErr != nil { 1972 n.logger.Error("Vault token revocation for alloc failed", "alloc_id", alloc.ID, "error", revokeErr) 1973 } 1974 1975 if rerr, ok := createErr.(*structs.RecoverableError); ok { 1976 reply.Error = rerr 1977 } else { 1978 reply.Error = structs.NewRecoverableError(createErr, false).(*structs.RecoverableError) 1979 } 1980 1981 return nil 1982 } 1983 1984 // Commit to Raft before returning any of the tokens 1985 req := structs.VaultAccessorsRequest{Accessors: accessors} 1986 _, index, err := n.srv.raftApply(structs.VaultAccessorRegisterRequestType, &req) 1987 if err != nil { 1988 n.logger.Error("registering Vault accessors for alloc failed", "alloc_id", alloc.ID, "error", err) 1989 1990 // Determine if we can recover from the error 1991 retry := false 1992 switch err { 1993 case raft.ErrNotLeader, raft.ErrLeadershipLost, raft.ErrRaftShutdown, raft.ErrEnqueueTimeout: 1994 retry = true 1995 } 1996 1997 setError(err, retry) 1998 return nil 1999 } 2000 2001 reply.Index = index 2002 reply.Tasks = tokens 2003 n.srv.setQueryMeta(&reply.QueryMeta) 2004 return nil 2005 } 2006 2007 type connectTask struct { 2008 TaskKind structs.TaskKind 2009 TaskName string 2010 } 2011 2012 func (n *Node) DeriveSIToken(args *structs.DeriveSITokenRequest, reply *structs.DeriveSITokenResponse) error { 2013 2014 authErr := n.srv.Authenticate(n.ctx, args) 2015 2016 setError := func(e error, recoverable bool) { 2017 if e != nil { 2018 if re, ok := e.(*structs.RecoverableError); ok { 2019 reply.Error = re // No need to wrap if error is already a RecoverableError 2020 } else { 2021 reply.Error = structs.NewRecoverableError(e, recoverable).(*structs.RecoverableError) 2022 } 2023 n.logger.Error("DeriveSIToken failed", "recoverable", recoverable, "error", e) 2024 } 2025 } 2026 2027 if done, err := n.srv.forward("Node.DeriveSIToken", args, args, reply); done { 2028 setError(err, structs.IsRecoverable(err) || err == structs.ErrNoLeader) 2029 return nil 2030 } 2031 n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args) 2032 if authErr != nil { 2033 return structs.ErrPermissionDenied 2034 } 2035 defer metrics.MeasureSince([]string{"nomad", "client", "derive_si_token"}, time.Now()) 2036 2037 // Verify the arguments 2038 if err := args.Validate(); err != nil { 2039 setError(err, false) 2040 return nil 2041 } 2042 2043 // Get the ClusterID 2044 clusterID, err := n.srv.ClusterID() 2045 if err != nil { 2046 setError(err, false) 2047 return nil 2048 } 2049 2050 // Verify the following: 2051 // * The Node exists and has the correct SecretID. 2052 // * The Allocation exists on the specified Node. 2053 // * The Allocation contains the given tasks, and each task requires a 2054 // SI token. 2055 2056 snap, err := n.srv.fsm.State().Snapshot() 2057 if err != nil { 2058 setError(err, false) 2059 return nil 2060 } 2061 node, err := snap.NodeByID(nil, args.NodeID) 2062 if err != nil { 2063 setError(err, false) 2064 return nil 2065 } 2066 if node == nil { 2067 setError(fmt.Errorf("Node %q does not exist", args.NodeID), false) 2068 return nil 2069 } 2070 if node.SecretID != args.SecretID { 2071 setError(errors.New("SecretID mismatch"), false) 2072 return nil 2073 } 2074 2075 alloc, err := snap.AllocByID(nil, args.AllocID) 2076 if err != nil { 2077 setError(err, false) 2078 return nil 2079 } 2080 if alloc == nil { 2081 setError(fmt.Errorf("Allocation %q does not exist", args.AllocID), false) 2082 return nil 2083 } 2084 if alloc.NodeID != args.NodeID { 2085 setError(fmt.Errorf("Allocation %q not running on node %q", args.AllocID, args.NodeID), false) 2086 return nil 2087 } 2088 if alloc.TerminalStatus() { 2089 setError(errors.New("Cannot request SI token for terminal allocation"), false) 2090 return nil 2091 } 2092 2093 // make sure task group contains at least one connect enabled service 2094 tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup) 2095 if tg == nil { 2096 setError(fmt.Errorf("Allocation %q does not contain TaskGroup %q", args.AllocID, alloc.TaskGroup), false) 2097 return nil 2098 } 2099 if !tg.UsesConnect() { 2100 setError(fmt.Errorf("TaskGroup %q does not use Connect", tg.Name), false) 2101 return nil 2102 } 2103 2104 // make sure each task in args.Tasks is a connect-enabled task 2105 notConnect, tasks := connectTasks(tg, args.Tasks) 2106 if len(notConnect) > 0 { 2107 setError(fmt.Errorf( 2108 "Requested Consul Service Identity tokens for tasks that are not Connect enabled: %v", 2109 strings.Join(notConnect, ", "), 2110 ), false) 2111 } 2112 2113 // At this point the request is valid and we should contact Consul for tokens. 2114 2115 // A lot of the following is copied from DeriveVaultToken which has been 2116 // working fine for years. 2117 2118 // Create an error group where we will spin up a fixed set of goroutines to 2119 // handle deriving tokens but where if any fails the whole group is 2120 // canceled. 2121 g, ctx := errgroup.WithContext(context.Background()) 2122 2123 // Cap the worker threads 2124 numWorkers := len(args.Tasks) 2125 if numWorkers > maxParallelRequestsPerDerive { 2126 numWorkers = maxParallelRequestsPerDerive 2127 } 2128 2129 // would like to pull some of this out... 2130 2131 // Create the SI tokens from a slice of task name + connect service 2132 input := make(chan connectTask, numWorkers) 2133 results := make(map[string]*structs.SIToken, numWorkers) 2134 for i := 0; i < numWorkers; i++ { 2135 g.Go(func() error { 2136 for { 2137 select { 2138 case task, ok := <-input: 2139 if !ok { 2140 return nil 2141 } 2142 secret, err := n.srv.consulACLs.CreateToken(ctx, ServiceIdentityRequest{ 2143 ConsulNamespace: tg.Consul.GetNamespace(), 2144 TaskKind: task.TaskKind, 2145 TaskName: task.TaskName, 2146 ClusterID: clusterID, 2147 AllocID: alloc.ID, 2148 }) 2149 if err != nil { 2150 return err 2151 } 2152 results[task.TaskName] = secret 2153 case <-ctx.Done(): 2154 return nil 2155 } 2156 } 2157 }) 2158 } 2159 2160 // Send the input 2161 go func() { 2162 defer close(input) 2163 for _, connectTask := range tasks { 2164 select { 2165 case <-ctx.Done(): 2166 return 2167 case input <- connectTask: 2168 } 2169 } 2170 }() 2171 2172 // Wait for everything to complete or for an error 2173 createErr := g.Wait() 2174 2175 accessors := make([]*structs.SITokenAccessor, 0, len(results)) 2176 tokens := make(map[string]string, len(results)) 2177 for task, secret := range results { 2178 tokens[task] = secret.SecretID 2179 accessor := &structs.SITokenAccessor{ 2180 ConsulNamespace: tg.Consul.GetNamespace(), 2181 NodeID: alloc.NodeID, 2182 AllocID: alloc.ID, 2183 TaskName: task, 2184 AccessorID: secret.AccessorID, 2185 } 2186 accessors = append(accessors, accessor) 2187 } 2188 2189 // If there was an error, revoke all created tokens. These tokens have not 2190 // yet been committed to the persistent store. 2191 if createErr != nil { 2192 n.logger.Error("Consul Service Identity token creation for alloc failed", "alloc_id", alloc.ID, "error", createErr) 2193 _ = n.srv.consulACLs.RevokeTokens(context.Background(), accessors, false) 2194 2195 if recoverable, ok := createErr.(*structs.RecoverableError); ok { 2196 reply.Error = recoverable 2197 } else { 2198 reply.Error = structs.NewRecoverableError(createErr, false).(*structs.RecoverableError) 2199 } 2200 2201 return nil 2202 } 2203 2204 // Commit the derived tokens to raft before returning them 2205 requested := structs.SITokenAccessorsRequest{Accessors: accessors} 2206 _, index, err := n.srv.raftApply(structs.ServiceIdentityAccessorRegisterRequestType, &requested) 2207 if err != nil { 2208 n.logger.Error("registering Service Identity token accessors for alloc failed", "alloc_id", alloc.ID, "error", err) 2209 2210 // Determine if we can recover from the error 2211 retry := false 2212 switch err { 2213 case raft.ErrNotLeader, raft.ErrLeadershipLost, raft.ErrRaftShutdown, raft.ErrEnqueueTimeout: 2214 retry = true 2215 } 2216 setError(err, retry) 2217 return nil 2218 } 2219 2220 // We made it! Now we can set the reply. 2221 reply.Index = index 2222 reply.Tokens = tokens 2223 n.srv.setQueryMeta(&reply.QueryMeta) 2224 return nil 2225 } 2226 2227 func connectTasks(tg *structs.TaskGroup, tasks []string) ([]string, []connectTask) { 2228 var notConnect []string 2229 var usesConnect []connectTask 2230 for _, task := range tasks { 2231 tgTask := tg.LookupTask(task) 2232 if !taskUsesConnect(tgTask) { 2233 notConnect = append(notConnect, task) 2234 } else { 2235 usesConnect = append(usesConnect, connectTask{ 2236 TaskName: task, 2237 TaskKind: tgTask.Kind, 2238 }) 2239 } 2240 } 2241 return notConnect, usesConnect 2242 } 2243 2244 func taskUsesConnect(task *structs.Task) bool { 2245 if task == nil { 2246 // not even in the task group 2247 return false 2248 } 2249 return task.UsesConnect() 2250 } 2251 2252 func (n *Node) EmitEvents(args *structs.EmitNodeEventsRequest, reply *structs.EmitNodeEventsResponse) error { 2253 2254 authErr := n.srv.Authenticate(n.ctx, args) 2255 2256 // Ensure the connection was initiated by another client if TLS is used. 2257 err := validateTLSCertificateLevel(n.srv, n.ctx, tlsCertificateLevelClient) 2258 if err != nil { 2259 return err 2260 } 2261 if done, err := n.srv.forward("Node.EmitEvents", args, args, reply); done { 2262 return err 2263 } 2264 n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args) 2265 if authErr != nil { 2266 return structs.ErrPermissionDenied 2267 } 2268 defer metrics.MeasureSince([]string{"nomad", "client", "emit_events"}, time.Now()) 2269 2270 if len(args.NodeEvents) == 0 { 2271 return fmt.Errorf("no node events given") 2272 } 2273 for nodeID, events := range args.NodeEvents { 2274 if len(events) == 0 { 2275 return fmt.Errorf("no node events given for node %q", nodeID) 2276 } 2277 } 2278 2279 _, index, err := n.srv.raftApply(structs.UpsertNodeEventsType, args) 2280 if err != nil { 2281 n.logger.Error("upserting node events failed", "error", err) 2282 return err 2283 } 2284 2285 reply.Index = index 2286 return nil 2287 }