github.com/hernad/nomad@v1.6.112/nomad/node_endpoint.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package nomad
     5  
     6  import (
     7  	"context"
     8  	"errors"
     9  	"fmt"
    10  	"net/http"
    11  	"reflect"
    12  	"strings"
    13  	"sync"
    14  	"time"
    15  
    16  	"github.com/armon/go-metrics"
    17  	"github.com/hashicorp/go-hclog"
    18  	"github.com/hashicorp/go-memdb"
    19  	"github.com/hashicorp/go-multierror"
    20  	vapi "github.com/hashicorp/vault/api"
    21  	"golang.org/x/sync/errgroup"
    22  
    23  	"github.com/hernad/nomad/acl"
    24  	"github.com/hernad/nomad/helper/uuid"
    25  	"github.com/hernad/nomad/nomad/state"
    26  	"github.com/hernad/nomad/nomad/state/paginator"
    27  	"github.com/hernad/nomad/nomad/structs"
    28  	"github.com/hashicorp/raft"
    29  )
    30  
const (
	// batchUpdateInterval is how long we wait to batch updates
	batchUpdateInterval = 50 * time.Millisecond

	// maxParallelRequestsPerDerive is the maximum number of parallel Vault
	// create token requests that may be outstanding per derive request
	maxParallelRequestsPerDerive = 16

	// NodeDrainEvent* are the messages attached to node events for the
	// various drain state changes.
	NodeDrainEventDrainSet      = "Node drain strategy set"
	NodeDrainEventDrainDisabled = "Node drain disabled"
	NodeDrainEventDrainUpdated  = "Node drain strategy updated"

	// NodeEligibilityEventEligible is used when the node's eligibility is
	// marked eligible
	NodeEligibilityEventEligible = "Node marked as eligible for scheduling"

	// NodeEligibilityEventIneligible is used when the node's eligibility is
	// marked ineligible
	NodeEligibilityEventIneligible = "Node marked as ineligible for scheduling"

	// NodeHeartbeatEventReregistered is the message used when the node becomes
	// reregistered by the heartbeat.
	NodeHeartbeatEventReregistered = "Node reregistered by heartbeat"

	// NodeWaitingForNodePool is the message used when the node is waiting for
	// its node pool to be created.
	NodeWaitingForNodePool = "Node registered but waiting for node pool to be created"
)
    60  
// Node is the RPC endpoint used for client (node) interactions. Besides the
// server and connection context it carries the state needed to batch pending
// allocation status updates, guarded by updatesLock.
type Node struct {
	// srv is the server this endpoint is attached to.
	srv *Server
	// logger is the endpoint's logger (named "client" by NewNodeEndpoint).
	logger hclog.Logger

	// ctx provides context regarding the underlying connection
	ctx *RPCContext

	// updates holds pending client status updates for allocations
	updates []*structs.Allocation

	// evals holds pending rescheduling eval updates triggered by failed allocations
	evals []*structs.Evaluation

	// updateFuture is used to wait for the pending batch update
	// to complete. This may be nil if no batch is pending.
	updateFuture *structs.BatchFuture

	// updateTimer is the timer that will trigger the next batch
	// update, and may be nil if there is no batch pending.
	updateTimer *time.Timer

	// updatesLock synchronizes access to the updates list,
	// the future and the timer.
	updatesLock sync.Mutex
}
    87  
    88  func NewNodeEndpoint(srv *Server, ctx *RPCContext) *Node {
    89  	return &Node{
    90  		srv:     srv,
    91  		ctx:     ctx,
    92  		logger:  srv.logger.Named("client"),
    93  		updates: []*structs.Allocation{},
    94  		evals:   []*structs.Evaluation{},
    95  	}
    96  }
    97  
// Register is used to upsert a client that is available for scheduling.
//
// The request is forwarded when forward reports it was handled elsewhere.
// Otherwise the node is validated, defaults are filled in (status,
// scheduling eligibility, node pool), the registration is committed through
// Raft, node evaluations are created when shouldCreateNodeEval reports a
// relevant change, and the heartbeat timer is reset for nodes that are not in
// a terminal status. On success reply carries the Raft index, any eval IDs,
// the heartbeat TTL and the server information filled in by
// constructNodeServerInfoResponse.
//
// An error is returned for authentication failures, invalid arguments, a
// SecretID mismatch with an already registered node, or any failure of the
// state snapshot, Raft apply, eval creation or heartbeat reset.
func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUpdateResponse) error {
	// note that we trust-on-first use and the identity will be anonymous for
	// that initial request; we lean on mTLS for handling that safely
	authErr := n.srv.Authenticate(n.ctx, args)

	// Capture the forwarded flag before calling forward so the check below
	// reflects the request as it was received.
	isForwarded := args.IsForwarded()
	if done, err := n.srv.forward("Node.Register", args, args, reply); done {
		// We have a valid node connection since there is no error from the
		// forwarded server, so add the mapping to cache the
		// connection and allow the server to send RPCs to the client.
		if err == nil && n.ctx != nil && n.ctx.NodeID == "" && !isForwarded {
			n.ctx.NodeID = args.Node.ID
			n.srv.addNodeConn(n.ctx)
		}

		return err
	}
	// The authentication result is only acted upon once we know the request
	// is handled locally, after the RPC rate has been measured.
	n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args)
	if authErr != nil {
		return structs.ErrPermissionDenied
	}

	defer metrics.MeasureSince([]string{"nomad", "client", "register"}, time.Now())

	// Validate the arguments
	if args.Node == nil {
		return fmt.Errorf("missing node for client registration")
	}
	if args.Node.ID == "" {
		return fmt.Errorf("missing node ID for client registration")
	}
	if args.Node.Datacenter == "" {
		return fmt.Errorf("missing datacenter for client registration")
	}
	if args.Node.Name == "" {
		return fmt.Errorf("missing node name for client registration")
	}
	if len(args.Node.Attributes) == 0 {
		return fmt.Errorf("missing attributes for client registration")
	}
	if args.Node.SecretID == "" {
		return fmt.Errorf("missing node secret ID for client registration")
	}
	// An explicitly requested node pool must have a valid name and must not
	// be the built-in "all" pool.
	if args.Node.NodePool != "" {
		err := structs.ValidateNodePoolName(args.Node.NodePool)
		if err != nil {
			return fmt.Errorf("invalid node pool: %v", err)
		}
		if args.Node.NodePool == structs.NodePoolAll {
			return fmt.Errorf("node is not allowed to register in node pool %q", structs.NodePoolAll)
		}
	}

	// Default the status if none is given
	if args.Node.Status == "" {
		args.Node.Status = structs.NodeStatusInit
	}
	if !structs.ValidNodeStatus(args.Node.Status) {
		return fmt.Errorf("invalid status for node")
	}

	// Default to eligible for scheduling if unset
	if args.Node.SchedulingEligibility == "" {
		args.Node.SchedulingEligibility = structs.NodeSchedulingEligible
	}

	// Default the node pool if none is given.
	if args.Node.NodePool == "" {
		args.Node.NodePool = structs.NodePoolDefault
	}

	// Set the timestamp when the node is registered
	args.Node.StatusUpdatedAt = time.Now().Unix()

	// Compute the node class
	if err := args.Node.ComputeClass(); err != nil {
		return fmt.Errorf("failed to computed node class: %v", err)
	}

	// Look for the node so we can detect a state transition
	snap, err := n.srv.fsm.State().Snapshot()
	if err != nil {
		return err
	}

	ws := memdb.NewWatchSet()
	originalNode, err := snap.NodeByID(ws, args.Node.ID)
	if err != nil {
		return err
	}

	if originalNode != nil {
		// Check if the SecretID has been tampered with. A stored node with an
		// empty SecretID accepts whatever the request carries.
		if args.Node.SecretID != originalNode.SecretID && originalNode.SecretID != "" {
			return fmt.Errorf("node secret ID does not match. Not registering node.")
		}

		// Don't allow the Register method to update the node status. Only the
		// UpdateStatus method should be able to do this.
		if originalNode.Status != "" {
			args.Node.Status = originalNode.Status
		}
	}

	// We have a valid node connection, so add the mapping to cache the
	// connection and allow the server to send RPCs to the client. We only cache
	// the connection if it is not being forwarded from another server.
	if n.ctx != nil && n.ctx.NodeID == "" && !args.IsForwarded() {
		n.ctx.NodeID = args.Node.ID
		n.srv.addNodeConn(n.ctx)
	}

	// Commit this update via Raft.
	//
	// Only the authoritative region is allowed to create the node pool for the
	// node if it doesn't exist yet. This prevents non-authoritative regions
	// from having to push their local state to the authoritative region.
	//
	// Nodes in non-authoritative regions that are registered with a new node
	// pool are kept in the `initializing` status until the node pool is
	// created and replicated.
	if n.srv.Region() == n.srv.config.AuthoritativeRegion {
		args.CreateNodePool = true
	}
	_, index, err := n.srv.raftApply(structs.NodeRegisterRequestType, args)
	if err != nil {
		n.logger.Error("register failed", "error", err)
		return err
	}
	reply.NodeModifyIndex = index

	// Check if we should trigger evaluations
	if shouldCreateNodeEval(originalNode, args.Node) {
		evalIDs, evalIndex, err := n.createNodeEvals(args.Node, index)
		if err != nil {
			n.logger.Error("eval creation failed", "error", err)
			return err
		}
		reply.EvalIDs = evalIDs
		reply.EvalCreateIndex = evalIndex
	}

	// Check if we need to setup a heartbeat
	if !args.Node.TerminalStatus() {
		ttl, err := n.srv.resetHeartbeatTimer(args.Node.ID)
		if err != nil {
			n.logger.Error("heartbeat reset failed", "error", err)
			return err
		}
		reply.HeartbeatTTL = ttl
	}

	// Set the reply index
	reply.Index = index
	// Take a fresh snapshot: the one above was taken before the Raft apply.
	snap, err = n.srv.fsm.State().Snapshot()
	if err != nil {
		return err
	}

	// constructNodeServerInfoResponse requires peerLock to be held for reading.
	n.srv.peerLock.RLock()
	defer n.srv.peerLock.RUnlock()
	if err := n.constructNodeServerInfoResponse(args.Node.ID, snap, reply); err != nil {
		n.logger.Error("failed to populate NodeUpdateResponse", "error", err)
		return err
	}

	return nil
}
   267  
   268  // shouldCreateNodeEval returns true if the node update may result into
   269  // allocation updates, so the node should be re-evaluating.
   270  //
   271  // Such cases might be:
   272  // * node health/drain status changes that may result into alloc rescheduling
   273  // * node drivers or attributes changing that may cause system job placement changes
   274  func shouldCreateNodeEval(original, updated *structs.Node) bool {
   275  	if structs.ShouldDrainNode(updated.Status) {
   276  		return true
   277  	}
   278  
   279  	if original == nil {
   280  		return nodeStatusTransitionRequiresEval(updated.Status, structs.NodeStatusInit)
   281  	}
   282  
   283  	if nodeStatusTransitionRequiresEval(updated.Status, original.Status) {
   284  		return true
   285  	}
   286  
   287  	// check fields used by the feasibility checks in ../scheduler/feasible.go,
   288  	// whether through a Constraint explicitly added by user or an implicit constraint
   289  	// added through a driver/volume check.
   290  	//
   291  	// Node Resources (e.g. CPU/Memory) are handled differently, using blocked evals,
   292  	// and not relevant in this check.
   293  	return !(original.ID == updated.ID &&
   294  		original.Datacenter == updated.Datacenter &&
   295  		original.Name == updated.Name &&
   296  		original.NodeClass == updated.NodeClass &&
   297  		reflect.DeepEqual(original.Attributes, updated.Attributes) &&
   298  		reflect.DeepEqual(original.Meta, updated.Meta) &&
   299  		reflect.DeepEqual(original.Drivers, updated.Drivers) &&
   300  		reflect.DeepEqual(original.HostVolumes, updated.HostVolumes) &&
   301  		equalDevices(original, updated))
   302  }
   303  
   304  func equalDevices(n1, n2 *structs.Node) bool {
   305  	// ignore super old nodes, mostly to avoid nil dereferencing
   306  	if n1.NodeResources == nil || n2.NodeResources == nil {
   307  		return n1.NodeResources == n2.NodeResources
   308  	}
   309  
   310  	// treat nil and empty value as equal
   311  	if len(n1.NodeResources.Devices) == 0 {
   312  		return len(n1.NodeResources.Devices) == len(n2.NodeResources.Devices)
   313  	}
   314  
   315  	return reflect.DeepEqual(n1.NodeResources.Devices, n2.NodeResources.Devices)
   316  }
   317  
   318  // constructNodeServerInfoResponse assumes the n.srv.peerLock is held for reading.
   319  func (n *Node) constructNodeServerInfoResponse(nodeID string, snap *state.StateSnapshot, reply *structs.NodeUpdateResponse) error {
   320  	reply.LeaderRPCAddr = string(n.srv.raft.Leader())
   321  
   322  	// Reply with config information required for future RPC requests
   323  	reply.Servers = make([]*structs.NodeServerInfo, 0, len(n.srv.localPeers))
   324  	for _, v := range n.srv.localPeers {
   325  		reply.Servers = append(reply.Servers,
   326  			&structs.NodeServerInfo{
   327  				RPCAdvertiseAddr: v.RPCAddr.String(),
   328  				Datacenter:       v.Datacenter,
   329  			})
   330  	}
   331  
   332  	ws := memdb.NewWatchSet()
   333  
   334  	// Add ClientStatus information to heartbeat response.
   335  	if node, err := snap.NodeByID(ws, nodeID); err == nil && node != nil {
   336  		reply.SchedulingEligibility = node.SchedulingEligibility
   337  	} else if node == nil {
   338  
   339  		// If the node is not found, leave reply.SchedulingEligibility as
   340  		// the empty string. The response handler in the client treats this
   341  		// as a no-op. As there is no call to action for an operator, log it
   342  		// at debug level.
   343  		n.logger.Debug("constructNodeServerInfoResponse: node not found",
   344  			"node_id", nodeID)
   345  	} else {
   346  
   347  		// This case is likely only reached via a code error in state store
   348  		return err
   349  	}
   350  
   351  	// TODO(sean@): Use an indexed node count instead
   352  	//
   353  	// Snapshot is used only to iterate over all nodes to create a node
   354  	// count to send back to Nomad Clients in their heartbeat so Clients
   355  	// can estimate the size of the cluster.
   356  	iter, err := snap.Nodes(ws)
   357  	if err == nil {
   358  		for {
   359  			raw := iter.Next()
   360  			if raw == nil {
   361  				break
   362  			}
   363  			reply.NumNodes++
   364  		}
   365  	}
   366  
   367  	reply.Features = n.srv.EnterpriseState.Features()
   368  
   369  	return nil
   370  }
   371  
   372  // Deregister is used to remove a client from the cluster. If a client should
   373  // just be made unavailable for scheduling, a status update is preferred.
   374  func (n *Node) Deregister(args *structs.NodeDeregisterRequest, reply *structs.NodeUpdateResponse) error {
   375  	authErr := n.srv.Authenticate(n.ctx, args)
   376  	if done, err := n.srv.forward("Node.Deregister", args, args, reply); done {
   377  		return err
   378  	}
   379  	n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args)
   380  	if authErr != nil {
   381  		return structs.ErrPermissionDenied
   382  	}
   383  	defer metrics.MeasureSince([]string{"nomad", "client", "deregister"}, time.Now())
   384  
   385  	if args.NodeID == "" {
   386  		return fmt.Errorf("missing node ID for client deregistration")
   387  	}
   388  
   389  	// deregister takes a batch
   390  	repack := &structs.NodeBatchDeregisterRequest{
   391  		NodeIDs:      []string{args.NodeID},
   392  		WriteRequest: args.WriteRequest,
   393  	}
   394  
   395  	return n.deregister(repack, reply, func() (interface{}, uint64, error) {
   396  		return n.srv.raftApply(structs.NodeDeregisterRequestType, args)
   397  	})
   398  }
   399  
   400  // BatchDeregister is used to remove client nodes from the cluster.
   401  func (n *Node) BatchDeregister(args *structs.NodeBatchDeregisterRequest, reply *structs.NodeUpdateResponse) error {
   402  	authErr := n.srv.Authenticate(n.ctx, args)
   403  	if done, err := n.srv.forward("Node.BatchDeregister", args, args, reply); done {
   404  		return err
   405  	}
   406  	n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args)
   407  	if authErr != nil {
   408  		return structs.ErrPermissionDenied
   409  	}
   410  	defer metrics.MeasureSince([]string{"nomad", "client", "batch_deregister"}, time.Now())
   411  
   412  	if len(args.NodeIDs) == 0 {
   413  		return fmt.Errorf("missing node IDs for client deregistration")
   414  	}
   415  
   416  	return n.deregister(args, reply, func() (interface{}, uint64, error) {
   417  		return n.srv.raftApply(structs.NodeBatchDeregisterRequestType, args)
   418  	})
   419  }
   420  
   421  // deregister takes a raftMessage closure, to support both Deregister and BatchDeregister
   422  func (n *Node) deregister(args *structs.NodeBatchDeregisterRequest,
   423  	reply *structs.NodeUpdateResponse,
   424  	raftApplyFn func() (interface{}, uint64, error),
   425  ) error {
   426  	// Check request permissions
   427  	if aclObj, err := n.srv.ResolveACL(args); err != nil {
   428  		return err
   429  	} else if aclObj != nil && !aclObj.AllowNodeWrite() {
   430  		return structs.ErrPermissionDenied
   431  	}
   432  
   433  	// Look for the node
   434  	snap, err := n.srv.fsm.State().Snapshot()
   435  	if err != nil {
   436  		return err
   437  	}
   438  
   439  	nodes := make([]*structs.Node, 0, len(args.NodeIDs))
   440  	for _, nodeID := range args.NodeIDs {
   441  		node, err := snap.NodeByID(nil, nodeID)
   442  		if err != nil {
   443  			return err
   444  		}
   445  		if node == nil {
   446  			return fmt.Errorf("node not found")
   447  		}
   448  		nodes = append(nodes, node)
   449  	}
   450  
   451  	// Commit this update via Raft
   452  	_, index, err := raftApplyFn()
   453  	if err != nil {
   454  		n.logger.Error("raft message failed", "error", err)
   455  		return err
   456  	}
   457  
   458  	for _, node := range nodes {
   459  		nodeID := node.ID
   460  
   461  		// Clear the heartbeat timer if any
   462  		n.srv.clearHeartbeatTimer(nodeID)
   463  
   464  		// Create the evaluations for this node
   465  		evalIDs, evalIndex, err := n.createNodeEvals(node, index)
   466  		if err != nil {
   467  			n.logger.Error("eval creation failed", "error", err)
   468  			return err
   469  		}
   470  
   471  		// Determine if there are any Vault accessors on the node
   472  		if accessors, err := snap.VaultAccessorsByNode(nil, nodeID); err != nil {
   473  			n.logger.Error("looking up vault accessors for node failed", "node_id", nodeID, "error", err)
   474  			return err
   475  		} else if l := len(accessors); l > 0 {
   476  			n.logger.Debug("revoking vault accessors on node due to deregister", "num_accessors", l, "node_id", nodeID)
   477  			if err := n.srv.vault.RevokeTokens(context.Background(), accessors, true); err != nil {
   478  				n.logger.Error("revoking vault accessors for node failed", "node_id", nodeID, "error", err)
   479  				return err
   480  			}
   481  		}
   482  
   483  		// Determine if there are any SI token accessors on the node
   484  		if accessors, err := snap.SITokenAccessorsByNode(nil, nodeID); err != nil {
   485  			n.logger.Error("looking up si accessors for node failed", "node_id", nodeID, "error", err)
   486  			return err
   487  		} else if l := len(accessors); l > 0 {
   488  			n.logger.Debug("revoking si accessors on node due to deregister", "num_accessors", l, "node_id", nodeID)
   489  			// Unlike with the Vault integration, there's no error returned here, since
   490  			// bootstrapping the Consul client is elsewhere. Errors in revocation trigger
   491  			// background retry attempts rather than inline error handling.
   492  			_ = n.srv.consulACLs.RevokeTokens(context.Background(), accessors, true)
   493  		}
   494  
   495  		reply.EvalIDs = append(reply.EvalIDs, evalIDs...)
   496  		// Set the reply eval create index just the first time
   497  		if reply.EvalCreateIndex == 0 {
   498  			reply.EvalCreateIndex = evalIndex
   499  		}
   500  	}
   501  
   502  	reply.NodeModifyIndex = index
   503  	reply.Index = index
   504  	return nil
   505  }
   506  
// UpdateStatus is used to update the status of a client node.
//
// Clients with non-terminal allocations must first call UpdateAlloc to be able
// to transition from the initializing status to ready.
//
// A client's node pool must exist for it to be able to transition from
// initializing to ready.
//
//	                ┌────────────────────────────────────── No ───┐
//	                │                                             │
//	             ┌──▼───┐          ┌─────────────┐       ┌────────┴────────┐
//	── Register ─► init ├─ ready ──► Has allocs? ├─ Yes ─► Allocs updated? │
//	             └──▲──▲┘          └─────┬───────┘       └────────┬────────┘
//	                │  │                 │                        │
//	                │  │                 └─ No ─┐  ┌─────── Yes ──┘
//	                │  │                        │  │
//	                │  │               ┌────────▼──▼───────┐
//	                │  └──────────No───┤ Node pool exists? │
//	                │                  └─────────┬─────────┘
//	                │                            │
//	              ready                         Yes
//	                │                            │
//	         ┌──────┴───────┐                ┌───▼───┐         ┌──────┐
//	         │ disconnected ◄─ disconnected ─┤ ready ├─ down ──► down │
//	         └──────────────┘                └───▲───┘         └──┬───┘
//	                                             │                │
//	                                             └──── ready ─────┘
//
// The status change is only committed through Raft when the computed status
// differs from the stored one or a node event needs to be recorded. For a
// node going down, its Vault and SI token accessors are revoked and its
// service registrations deleted; for any other status the heartbeat timer is
// reset and the TTL returned in reply.
func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *structs.NodeUpdateResponse) error {
	authErr := n.srv.Authenticate(n.ctx, args)

	// Capture the forwarded flag before calling forward so the check below
	// reflects the request as it was received.
	isForwarded := args.IsForwarded()
	if done, err := n.srv.forward("Node.UpdateStatus", args, args, reply); done {
		// We have a valid node connection since there is no error from the
		// forwarded server, so add the mapping to cache the
		// connection and allow the server to send RPCs to the client.
		if err == nil && n.ctx != nil && n.ctx.NodeID == "" && !isForwarded {
			n.ctx.NodeID = args.NodeID
			n.srv.addNodeConn(n.ctx)
		}

		return err
	}
	n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args)
	if authErr != nil {
		return structs.ErrPermissionDenied
	}

	defer metrics.MeasureSince([]string{"nomad", "client", "update_status"}, time.Now())

	// Verify the arguments
	if args.NodeID == "" {
		return fmt.Errorf("missing node ID for client status update")
	}
	if !structs.ValidNodeStatus(args.Status) {
		return fmt.Errorf("invalid status for node")
	}

	// Look for the node
	snap, err := n.srv.fsm.State().Snapshot()
	if err != nil {
		return err
	}

	ws := memdb.NewWatchSet()
	node, err := snap.NodeByID(ws, args.NodeID)
	if err != nil {
		return err
	}
	if node == nil {
		return fmt.Errorf("node not found")
	}

	// We have a valid node connection, so add the mapping to cache the
	// connection and allow the server to send RPCs to the client. We only cache
	// the connection if it is not being forwarded from another server.
	if n.ctx != nil && n.ctx.NodeID == "" && !args.IsForwarded() {
		n.ctx.NodeID = args.NodeID
		n.srv.addNodeConn(n.ctx)
	}

	// XXX: Could use the SecretID here but have to update the heartbeat system
	// to track SecretIDs.

	// Update the timestamp of when the node status was updated
	args.UpdatedAt = time.Now().Unix()

	// Compute next status. The requested status may be overridden based on
	// the node's current status.
	switch node.Status {
	case structs.NodeStatusInit:
		if args.Status == structs.NodeStatusReady {
			// Keep node in the initializing status if it has allocations but
			// they are not updated.
			allocs, err := snap.AllocsByNodeTerminal(ws, args.NodeID, false)
			if err != nil {
				return fmt.Errorf("failed to query node allocs: %v", err)
			}

			allocsUpdated := node.LastAllocUpdateIndex > node.LastMissedHeartbeatIndex
			if len(allocs) > 0 && !allocsUpdated {
				n.logger.Debug(fmt.Sprintf("marking node as %s due to outdated allocation information", structs.NodeStatusInit))
				args.Status = structs.NodeStatusInit
			}

			// Keep node in the initializing status if it's in a node pool that
			// doesn't exist.
			pool, err := snap.NodePoolByName(ws, node.NodePool)
			if err != nil {
				return fmt.Errorf("failed to query node pool: %v", err)
			}
			if pool == nil {
				n.logger.Debug(fmt.Sprintf("marking node as %s due to missing node pool", structs.NodeStatusInit))
				args.Status = structs.NodeStatusInit
				// Only record the waiting event once per node.
				if !node.HasEvent(NodeWaitingForNodePool) {
					args.NodeEvent = structs.NewNodeEvent().
						SetSubsystem(structs.NodeEventSubsystemCluster).
						SetMessage(NodeWaitingForNodePool).
						AddDetail("node_pool", node.NodePool)
				}
			}
		}
	case structs.NodeStatusDisconnected:
		// A disconnected node asking for ready goes back through init first.
		if args.Status == structs.NodeStatusReady {
			args.Status = structs.NodeStatusInit
		}
	}

	// Commit this update via Raft. index stays zero when nothing is applied.
	var index uint64
	if node.Status != args.Status || args.NodeEvent != nil {
		// Attach an event if we are updating the node status to ready when it
		// is down via a heartbeat
		if node.Status == structs.NodeStatusDown && args.NodeEvent == nil {
			args.NodeEvent = structs.NewNodeEvent().
				SetSubsystem(structs.NodeEventSubsystemCluster).
				SetMessage(NodeHeartbeatEventReregistered)
		}

		_, index, err = n.srv.raftApply(structs.NodeUpdateStatusRequestType, args)
		if err != nil {
			n.logger.Error("status update failed", "error", err)
			return err
		}
		reply.NodeModifyIndex = index
	}

	// Check if we should trigger evaluations
	if structs.ShouldDrainNode(args.Status) ||
		nodeStatusTransitionRequiresEval(args.Status, node.Status) {
		evalIDs, evalIndex, err := n.createNodeEvals(node, index)
		if err != nil {
			n.logger.Error("eval creation failed", "error", err)
			return err
		}
		reply.EvalIDs = evalIDs
		reply.EvalCreateIndex = evalIndex
	}

	// Check if we need to setup a heartbeat. Down nodes get their tokens and
	// service registrations cleaned up instead.
	switch args.Status {
	case structs.NodeStatusDown:
		// Determine if there are any Vault accessors on the node to cleanup
		if accessors, err := n.srv.State().VaultAccessorsByNode(ws, args.NodeID); err != nil {
			n.logger.Error("looking up vault accessors for node failed", "node_id", args.NodeID, "error", err)
			return err
		} else if l := len(accessors); l > 0 {
			n.logger.Debug("revoking vault accessors on node due to down state", "num_accessors", l, "node_id", args.NodeID)
			if err := n.srv.vault.RevokeTokens(context.Background(), accessors, true); err != nil {
				n.logger.Error("revoking vault accessors for node failed", "node_id", args.NodeID, "error", err)
				return err
			}
		}

		// Determine if there are any SI token accessors on the node to cleanup
		if accessors, err := n.srv.State().SITokenAccessorsByNode(ws, args.NodeID); err != nil {
			n.logger.Error("looking up SI accessors for node failed", "node_id", args.NodeID, "error", err)
			return err
		} else if l := len(accessors); l > 0 {
			n.logger.Debug("revoking SI accessors on node due to down state", "num_accessors", l, "node_id", args.NodeID)
			// The result of the SI token revocation is deliberately ignored here.
			_ = n.srv.consulACLs.RevokeTokens(context.Background(), accessors, true)
		}

		// Identify the service registrations currently placed on the downed
		// node.
		serviceRegistrations, err := n.srv.State().GetServiceRegistrationsByNodeID(ws, args.NodeID)
		if err != nil {
			n.logger.Error("looking up service registrations for node failed",
				"node_id", args.NodeID, "error", err)
			return err
		}

		// If the node has service registrations assigned to it, delete these
		// via Raft.
		if l := len(serviceRegistrations); l > 0 {
			n.logger.Debug("deleting service registrations on node due to down state",
				"num_service_registrations", l, "node_id", args.NodeID)

			deleteRegReq := structs.ServiceRegistrationDeleteByNodeIDRequest{NodeID: args.NodeID}

			// This overwrites index, so the reply index reflects this later
			// Raft apply.
			_, index, err = n.srv.raftApply(structs.ServiceRegistrationDeleteByNodeIDRequestType, &deleteRegReq)
			if err != nil {
				n.logger.Error("failed to delete service registrations for node",
					"node_id", args.NodeID, "error", err)
				return err
			}
		}

	default:
		ttl, err := n.srv.resetHeartbeatTimer(args.NodeID)
		if err != nil {
			n.logger.Error("heartbeat reset failed", "error", err)
			return err
		}
		reply.HeartbeatTTL = ttl
	}

	// Set the reply index and leader
	reply.Index = index
	// constructNodeServerInfoResponse requires peerLock to be held for reading.
	n.srv.peerLock.RLock()
	defer n.srv.peerLock.RUnlock()
	if err := n.constructNodeServerInfoResponse(node.GetID(), snap, reply); err != nil {
		n.logger.Error("failed to populate NodeUpdateResponse", "error", err)
		return err
	}

	return nil
}
   733  
   734  // nodeStatusTransitionRequiresEval is a helper that takes a nodes new and old status and
   735  // returns whether it has transitioned to ready.
   736  func nodeStatusTransitionRequiresEval(newStatus, oldStatus string) bool {
   737  	initToReady := oldStatus == structs.NodeStatusInit && newStatus == structs.NodeStatusReady
   738  	terminalToReady := oldStatus == structs.NodeStatusDown && newStatus == structs.NodeStatusReady
   739  	disconnectedToOther := oldStatus == structs.NodeStatusDisconnected && newStatus != structs.NodeStatusDisconnected
   740  	otherToDisconnected := oldStatus != structs.NodeStatusDisconnected && newStatus == structs.NodeStatusDisconnected
   741  	return initToReady || terminalToReady || disconnectedToOther || otherToDisconnected
   742  }
   743  
   744  // UpdateDrain is used to update the drain mode of a client node
   745  func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest,
   746  	reply *structs.NodeDrainUpdateResponse) error {
   747  
   748  	authErr := n.srv.Authenticate(n.ctx, args)
   749  	if done, err := n.srv.forward("Node.UpdateDrain", args, args, reply); done {
   750  		return err
   751  	}
   752  	n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args)
   753  	if authErr != nil {
   754  		return structs.ErrPermissionDenied
   755  	}
   756  	defer metrics.MeasureSince([]string{"nomad", "client", "update_drain"}, time.Now())
   757  
   758  	// Check node write permissions
   759  	if aclObj, err := n.srv.ResolveACL(args); err != nil {
   760  		return err
   761  	} else if aclObj != nil && !aclObj.AllowNodeWrite() {
   762  		return structs.ErrPermissionDenied
   763  	}
   764  
   765  	// Verify the arguments
   766  	if args.NodeID == "" {
   767  		return fmt.Errorf("missing node ID for drain update")
   768  	}
   769  	if args.NodeEvent != nil {
   770  		return fmt.Errorf("node event must not be set")
   771  	}
   772  
   773  	// Look for the node
   774  	snap, err := n.srv.fsm.State().Snapshot()
   775  	if err != nil {
   776  		return err
   777  	}
   778  	node, err := snap.NodeByID(nil, args.NodeID)
   779  	if err != nil {
   780  		return err
   781  	}
   782  	if node == nil {
   783  		return fmt.Errorf("node not found")
   784  	}
   785  
   786  	now := time.Now().UTC()
   787  
   788  	// Update the timestamp of when the node status was updated
   789  	args.UpdatedAt = now.Unix()
   790  
   791  	// Setup drain strategy
   792  	if args.DrainStrategy != nil {
   793  		// Mark start time for the drain
   794  		if node.DrainStrategy == nil {
   795  			args.DrainStrategy.StartedAt = now
   796  		} else {
   797  			args.DrainStrategy.StartedAt = node.DrainStrategy.StartedAt
   798  		}
   799  
   800  		// Mark the deadline time
   801  		if args.DrainStrategy.Deadline.Nanoseconds() > 0 {
   802  			args.DrainStrategy.ForceDeadline = now.Add(args.DrainStrategy.Deadline)
   803  		}
   804  	}
   805  
   806  	// Construct the node event
   807  	args.NodeEvent = structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemDrain)
   808  	if node.DrainStrategy == nil && args.DrainStrategy != nil {
   809  		args.NodeEvent.SetMessage(NodeDrainEventDrainSet)
   810  	} else if node.DrainStrategy != nil && args.DrainStrategy != nil {
   811  		args.NodeEvent.SetMessage(NodeDrainEventDrainUpdated)
   812  	} else if node.DrainStrategy != nil && args.DrainStrategy == nil {
   813  		args.NodeEvent.SetMessage(NodeDrainEventDrainDisabled)
   814  	} else {
   815  		args.NodeEvent = nil
   816  	}
   817  
   818  	// Commit this update via Raft
   819  	_, index, err := n.srv.raftApply(structs.NodeUpdateDrainRequestType, args)
   820  	if err != nil {
   821  		n.logger.Error("drain update failed", "error", err)
   822  		return err
   823  	}
   824  	reply.NodeModifyIndex = index
   825  
   826  	// If the node is transitioning to be eligible, create Node evaluations
   827  	// because there may be a System job registered that should be evaluated.
   828  	if node.SchedulingEligibility == structs.NodeSchedulingIneligible && args.MarkEligible && args.DrainStrategy == nil {
   829  		n.logger.Info("node transitioning to eligible state", "node_id", node.ID)
   830  		evalIDs, evalIndex, err := n.createNodeEvals(node, index)
   831  		if err != nil {
   832  			n.logger.Error("eval creation failed", "error", err)
   833  			return err
   834  		}
   835  		reply.EvalIDs = evalIDs
   836  		reply.EvalCreateIndex = evalIndex
   837  	}
   838  
   839  	// Set the reply index
   840  	reply.Index = index
   841  	return nil
   842  }
   843  
   844  // UpdateEligibility is used to update the scheduling eligibility of a node
   845  func (n *Node) UpdateEligibility(args *structs.NodeUpdateEligibilityRequest,
   846  	reply *structs.NodeEligibilityUpdateResponse) error {
   847  
   848  	authErr := n.srv.Authenticate(n.ctx, args)
   849  	if done, err := n.srv.forward("Node.UpdateEligibility", args, args, reply); done {
   850  		return err
   851  	}
   852  	n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args)
   853  	if authErr != nil {
   854  		return structs.ErrPermissionDenied
   855  	}
   856  	defer metrics.MeasureSince([]string{"nomad", "client", "update_eligibility"}, time.Now())
   857  
   858  	// Check node write permissions
   859  	if aclObj, err := n.srv.ResolveACL(args); err != nil {
   860  		return err
   861  	} else if aclObj != nil && !aclObj.AllowNodeWrite() {
   862  		return structs.ErrPermissionDenied
   863  	}
   864  
   865  	// Verify the arguments
   866  	if args.NodeID == "" {
   867  		return fmt.Errorf("missing node ID for setting scheduling eligibility")
   868  	}
   869  	if args.NodeEvent != nil {
   870  		return fmt.Errorf("node event must not be set")
   871  	}
   872  
   873  	// Check that only allowed types are set
   874  	switch args.Eligibility {
   875  	case structs.NodeSchedulingEligible, structs.NodeSchedulingIneligible:
   876  	default:
   877  		return fmt.Errorf("invalid scheduling eligibility %q", args.Eligibility)
   878  	}
   879  
   880  	// Look for the node
   881  	snap, err := n.srv.fsm.State().Snapshot()
   882  	if err != nil {
   883  		return err
   884  	}
   885  	node, err := snap.NodeByID(nil, args.NodeID)
   886  	if err != nil {
   887  		return err
   888  	}
   889  	if node == nil {
   890  		return fmt.Errorf("node not found")
   891  	}
   892  
   893  	if node.DrainStrategy != nil && args.Eligibility == structs.NodeSchedulingEligible {
   894  		return fmt.Errorf("can not set node's scheduling eligibility to eligible while it is draining")
   895  	}
   896  
   897  	switch args.Eligibility {
   898  	case structs.NodeSchedulingEligible, structs.NodeSchedulingIneligible:
   899  	default:
   900  		return fmt.Errorf("invalid scheduling eligibility %q", args.Eligibility)
   901  	}
   902  
   903  	// Update the timestamp of when the node status was updated
   904  	args.UpdatedAt = time.Now().Unix()
   905  
   906  	// Construct the node event
   907  	args.NodeEvent = structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster)
   908  	if node.SchedulingEligibility == args.Eligibility {
   909  		return nil // Nothing to do
   910  	} else if args.Eligibility == structs.NodeSchedulingEligible {
   911  		n.logger.Info("node transitioning to eligible state", "node_id", node.ID)
   912  		args.NodeEvent.SetMessage(NodeEligibilityEventEligible)
   913  	} else {
   914  		n.logger.Info("node transitioning to ineligible state", "node_id", node.ID)
   915  		args.NodeEvent.SetMessage(NodeEligibilityEventIneligible)
   916  	}
   917  
   918  	// Commit this update via Raft
   919  	outErr, index, err := n.srv.raftApply(structs.NodeUpdateEligibilityRequestType, args)
   920  	if err != nil {
   921  		n.logger.Error("eligibility update failed", "error", err)
   922  		return err
   923  	}
   924  	if outErr != nil {
   925  		if err, ok := outErr.(error); ok && err != nil {
   926  			n.logger.Error("eligibility update failed", "error", err)
   927  			return err
   928  		}
   929  	}
   930  
   931  	// If the node is transitioning to be eligible, create Node evaluations
   932  	// because there may be a System job registered that should be evaluated.
   933  	if node.SchedulingEligibility == structs.NodeSchedulingIneligible && args.Eligibility == structs.NodeSchedulingEligible {
   934  		evalIDs, evalIndex, err := n.createNodeEvals(node, index)
   935  		if err != nil {
   936  			n.logger.Error("eval creation failed", "error", err)
   937  			return err
   938  		}
   939  		reply.EvalIDs = evalIDs
   940  		reply.EvalCreateIndex = evalIndex
   941  	}
   942  
   943  	// Set the reply index
   944  	reply.Index = index
   945  	return nil
   946  }
   947  
   948  // Evaluate is used to force a re-evaluation of the node
   949  func (n *Node) Evaluate(args *structs.NodeEvaluateRequest, reply *structs.NodeUpdateResponse) error {
   950  
   951  	authErr := n.srv.Authenticate(n.ctx, args)
   952  	if done, err := n.srv.forward("Node.Evaluate", args, args, reply); done {
   953  		return err
   954  	}
   955  	n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args)
   956  	if authErr != nil {
   957  		return structs.ErrPermissionDenied
   958  	}
   959  	defer metrics.MeasureSince([]string{"nomad", "client", "evaluate"}, time.Now())
   960  
   961  	// Check node write permissions
   962  	if aclObj, err := n.srv.ResolveACL(args); err != nil {
   963  		return err
   964  	} else if aclObj != nil && !aclObj.AllowNodeWrite() {
   965  		return structs.ErrPermissionDenied
   966  	}
   967  
   968  	// Verify the arguments
   969  	if args.NodeID == "" {
   970  		return fmt.Errorf("missing node ID for evaluation")
   971  	}
   972  
   973  	// Look for the node
   974  	snap, err := n.srv.fsm.State().Snapshot()
   975  	if err != nil {
   976  		return err
   977  	}
   978  	ws := memdb.NewWatchSet()
   979  	node, err := snap.NodeByID(ws, args.NodeID)
   980  	if err != nil {
   981  		return err
   982  	}
   983  	if node == nil {
   984  		return fmt.Errorf("node not found")
   985  	}
   986  
   987  	// Create the evaluation
   988  	evalIDs, evalIndex, err := n.createNodeEvals(node, node.ModifyIndex)
   989  	if err != nil {
   990  		n.logger.Error("eval creation failed", "error", err)
   991  		return err
   992  	}
   993  	reply.EvalIDs = evalIDs
   994  	reply.EvalCreateIndex = evalIndex
   995  
   996  	// Set the reply index
   997  	reply.Index = evalIndex
   998  
   999  	n.srv.peerLock.RLock()
  1000  	defer n.srv.peerLock.RUnlock()
  1001  	if err := n.constructNodeServerInfoResponse(node.GetID(), snap, reply); err != nil {
  1002  		n.logger.Error("failed to populate NodeUpdateResponse", "error", err)
  1003  		return err
  1004  	}
  1005  	return nil
  1006  }
  1007  
  1008  // GetNode is used to request information about a specific node
  1009  func (n *Node) GetNode(args *structs.NodeSpecificRequest,
  1010  	reply *structs.SingleNodeResponse) error {
  1011  
  1012  	authErr := n.srv.Authenticate(n.ctx, args)
  1013  	if done, err := n.srv.forward("Node.GetNode", args, args, reply); done {
  1014  		return err
  1015  	}
  1016  	n.srv.MeasureRPCRate("node", structs.RateMetricRead, args)
  1017  	if authErr != nil {
  1018  		return structs.ErrPermissionDenied
  1019  	}
  1020  	defer metrics.MeasureSince([]string{"nomad", "client", "get_node"}, time.Now())
  1021  
  1022  	// Check node read permissions
  1023  	aclObj, err := n.srv.ResolveClientOrACL(args)
  1024  	if err != nil {
  1025  		return err
  1026  	}
  1027  	if aclObj != nil && !aclObj.AllowNodeRead() {
  1028  		return structs.ErrPermissionDenied
  1029  	}
  1030  
  1031  	// Setup the blocking query
  1032  	opts := blockingOptions{
  1033  		queryOpts: &args.QueryOptions,
  1034  		queryMeta: &reply.QueryMeta,
  1035  		run: func(ws memdb.WatchSet, state *state.StateStore) error {
  1036  			// Verify the arguments
  1037  			if args.NodeID == "" {
  1038  				return fmt.Errorf("missing node ID")
  1039  			}
  1040  
  1041  			// Look for the node
  1042  			out, err := state.NodeByID(ws, args.NodeID)
  1043  			if err != nil {
  1044  				return err
  1045  			}
  1046  
  1047  			// Setup the output
  1048  			if out != nil {
  1049  				out = out.Sanitize()
  1050  				reply.Node = out
  1051  				reply.Index = out.ModifyIndex
  1052  			} else {
  1053  				// Use the last index that affected the nodes table
  1054  				index, err := state.Index("nodes")
  1055  				if err != nil {
  1056  					return err
  1057  				}
  1058  				reply.Node = nil
  1059  				reply.Index = index
  1060  			}
  1061  
  1062  			// Set the query response
  1063  			n.srv.setQueryMeta(&reply.QueryMeta)
  1064  			return nil
  1065  		}}
  1066  	return n.srv.blockingRPC(&opts)
  1067  }
  1068  
  1069  // GetAllocs is used to request allocations for a specific node
  1070  func (n *Node) GetAllocs(args *structs.NodeSpecificRequest,
  1071  	reply *structs.NodeAllocsResponse) error {
  1072  
  1073  	authErr := n.srv.Authenticate(n.ctx, args)
  1074  	if done, err := n.srv.forward("Node.GetAllocs", args, args, reply); done {
  1075  		return err
  1076  	}
  1077  	n.srv.MeasureRPCRate("node", structs.RateMetricList, args)
  1078  	if authErr != nil {
  1079  		return structs.ErrPermissionDenied
  1080  	}
  1081  	defer metrics.MeasureSince([]string{"nomad", "client", "get_allocs"}, time.Now())
  1082  
  1083  	// Check node read and namespace job read permissions
  1084  	aclObj, err := n.srv.ResolveACL(args)
  1085  	if err != nil {
  1086  		return err
  1087  	}
  1088  	if aclObj != nil && !aclObj.AllowNodeRead() {
  1089  		return structs.ErrPermissionDenied
  1090  	}
  1091  
  1092  	// cache namespace perms
  1093  	readableNamespaces := map[string]bool{}
  1094  
  1095  	// readNS is a caching namespace read-job helper
  1096  	readNS := func(ns string) bool {
  1097  		if aclObj == nil {
  1098  			// ACLs are disabled; everything is readable
  1099  			return true
  1100  		}
  1101  
  1102  		if readable, ok := readableNamespaces[ns]; ok {
  1103  			// cache hit
  1104  			return readable
  1105  		}
  1106  
  1107  		// cache miss
  1108  		readable := aclObj.AllowNsOp(ns, acl.NamespaceCapabilityReadJob)
  1109  		readableNamespaces[ns] = readable
  1110  		return readable
  1111  	}
  1112  
  1113  	// Verify the arguments
  1114  	if args.NodeID == "" {
  1115  		return fmt.Errorf("missing node ID")
  1116  	}
  1117  
  1118  	// Setup the blocking query
  1119  	opts := blockingOptions{
  1120  		queryOpts: &args.QueryOptions,
  1121  		queryMeta: &reply.QueryMeta,
  1122  		run: func(ws memdb.WatchSet, state *state.StateStore) error {
  1123  			// Look for the node
  1124  			allocs, err := state.AllocsByNode(ws, args.NodeID)
  1125  			if err != nil {
  1126  				return err
  1127  			}
  1128  
  1129  			// Setup the output
  1130  			if n := len(allocs); n != 0 {
  1131  				reply.Allocs = make([]*structs.Allocation, 0, n)
  1132  				for _, alloc := range allocs {
  1133  					if readNS(alloc.Namespace) {
  1134  						reply.Allocs = append(reply.Allocs, alloc)
  1135  					}
  1136  
  1137  					// Get the max of all allocs since
  1138  					// subsequent requests need to start
  1139  					// from the latest index
  1140  					reply.Index = maxUint64(reply.Index, alloc.ModifyIndex)
  1141  				}
  1142  			} else {
  1143  				reply.Allocs = nil
  1144  
  1145  				// Use the last index that affected the nodes table
  1146  				index, err := state.Index("allocs")
  1147  				if err != nil {
  1148  					return err
  1149  				}
  1150  
  1151  				// Must provide non-zero index to prevent blocking
  1152  				// Index 1 is impossible anyways (due to Raft internals)
  1153  				if index == 0 {
  1154  					reply.Index = 1
  1155  				} else {
  1156  					reply.Index = index
  1157  				}
  1158  			}
  1159  			return nil
  1160  		}}
  1161  	return n.srv.blockingRPC(&opts)
  1162  }
  1163  
// GetClientAllocs is used to request a lightweight list of alloc modify indexes
// per allocation.
//
// The reply maps each allocation ID on the node to its AllocModifyIndex and,
// for allocations that should migrate from an allocation on a different node,
// carries a migrate token. The request must include the node's secret ID when
// the node exists in state. If the node is not found, the reply contains empty
// maps and the index of the "allocs" table.
//
// As a side effect, a successful call over a direct (non-forwarded) connection
// records the node ID on the RPC context and registers the connection with the
// server via addNodeConn.
func (n *Node) GetClientAllocs(args *structs.NodeSpecificRequest,
	reply *structs.NodeClientAllocsResponse) error {

	authErr := n.srv.Authenticate(n.ctx, args)
	// Capture the forwarded state before calling forward.
	// NOTE(review): presumably forward may mark args as forwarded — confirm.
	isForwarded := args.IsForwarded()
	if done, err := n.srv.forward("Node.GetClientAllocs", args, args, reply); done {
		// We have a valid node connection since there is no error from the
		// forwarded server, so add the mapping to cache the
		// connection and allow the server to send RPCs to the client.
		if err == nil && n.ctx != nil && n.ctx.NodeID == "" && !isForwarded {
			n.ctx.NodeID = args.NodeID
			n.srv.addNodeConn(n.ctx)
		}

		return err
	}
	n.srv.MeasureRPCRate("node", structs.RateMetricList, args)
	if authErr != nil {
		return structs.ErrPermissionDenied
	}
	defer metrics.MeasureSince([]string{"nomad", "client", "get_client_allocs"}, time.Now())

	// Verify the arguments
	if args.NodeID == "" {
		return fmt.Errorf("missing node ID")
	}

	// numOldAllocs is used to detect if there is a garbage collection event
	// that affects the node. Garbage collecting an allocation does not change
	// any modify index, so the query would not unblock even though the set of
	// allocations on the node has changed. The variable lives outside the run
	// closure so its value persists across invocations of run.
	var numOldAllocs int

	// Setup the blocking query
	opts := blockingOptions{
		queryOpts: &args.QueryOptions,
		queryMeta: &reply.QueryMeta,
		run: func(ws memdb.WatchSet, state *state.StateStore) error {
			// Look for the node
			node, err := state.NodeByID(ws, args.NodeID)
			if err != nil {
				return err
			}

			// allocs stays nil when the node is unknown; the secret check and
			// connection caching below only happen for a known node.
			var allocs []*structs.Allocation
			if node != nil {
				if args.SecretID == "" {
					return fmt.Errorf("missing node secret ID for client status update")
				} else if args.SecretID != node.SecretID {
					return fmt.Errorf("node secret ID does not match")
				}

				// We have a valid node connection, so add the mapping to cache the
				// connection and allow the server to send RPCs to the client. We only cache
				// the connection if it is not being forwarded from another server.
				if n.ctx != nil && n.ctx.NodeID == "" && !args.IsForwarded() {
					n.ctx.NodeID = args.NodeID
					n.srv.addNodeConn(n.ctx)
				}

				var err error
				allocs, err = state.AllocsByNode(ws, args.NodeID)
				if err != nil {
					return err
				}
			}

			// The maps are rebuilt from scratch on every run.
			reply.Allocs = make(map[string]uint64)
			reply.MigrateTokens = make(map[string]string)

			// preferTableIndex is used to determine whether we should build the
			// response index based on the full table indexes versus the modify
			// indexes of the allocations on the specific node. This is
			// preferred in the case that the node doesn't yet have allocations
			// or when we detect a GC that affects the node.
			preferTableIndex := true

			// Setup the output
			if numAllocs := len(allocs); numAllocs != 0 {
				preferTableIndex = false

				for _, alloc := range allocs {
					reply.Allocs[alloc.ID] = alloc.AllocModifyIndex

					// If the allocation is going to do a migration, create a
					// migration token so that the client can authenticate with
					// the node hosting the previous allocation.
					if alloc.ShouldMigrate() {
						prevAllocation, err := state.AllocByID(ws, alloc.PreviousAllocation)
						if err != nil {
							return err
						}

						// A token is only generated when the previous
						// allocation exists and lives on a different node.
						if prevAllocation != nil && prevAllocation.NodeID != alloc.NodeID {
							allocNode, err := state.NodeByID(ws, prevAllocation.NodeID)
							if err != nil {
								return err
							}
							if allocNode == nil {
								// Node must have been GC'd so skip the token.
								// NOTE(review): this continue also skips the
								// reply.Index update at the bottom of the loop
								// for this alloc — confirm that is intended.
								continue
							}

							token, err := structs.GenerateMigrateToken(prevAllocation.ID, allocNode.SecretID)
							if err != nil {
								return err
							}
							reply.MigrateTokens[alloc.ID] = token
						}
					}

					// The reply index tracks the max ModifyIndex, while the
					// per-alloc map above reports AllocModifyIndex.
					reply.Index = maxUint64(reply.Index, alloc.ModifyIndex)
				}

				// Determine if we have fewer allocations than before. This
				// indicates there was a garbage collection
				if numAllocs < numOldAllocs {
					preferTableIndex = true
				}

				// Store the new number of allocations
				numOldAllocs = numAllocs
			}

			if preferTableIndex {
				// Use the last index that affected the allocs table
				index, err := state.Index("allocs")
				if err != nil {
					return err
				}

				// Must provide non-zero index to prevent blocking
				// Index 1 is impossible anyways (due to Raft internals)
				if index == 0 {
					reply.Index = 1
				} else {
					reply.Index = index
				}
			}
			return nil
		}}
	return n.srv.blockingRPC(&opts)
}
  1309  
// UpdateAlloc is used to update the client status of an allocation. It should
// only be called by clients.
//
// Calling this method returns an error when:
//   - The node is not registered in the server yet. Clients must first call the
//     Register method.
//   - The node status is down or disconnected. Clients must call the
//     UpdateStatus method to update its status in the server.
//
// It also returns an error when the request contains no allocations, when the
// first allocation has no node ID, or when the caller set the Evals field.
//
// Updates are not written to Raft immediately. They are appended to a shared
// batch guarded by updatesLock; the first caller of a batch starts a timer of
// batchUpdateInterval, and every caller in the batch waits on the same future,
// which is resolved by batchUpdate. The reply index is the index reported by
// that future.
func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.GenericResponse) error {

	authErr := n.srv.Authenticate(n.ctx, args)

	// Ensure the connection was initiated by another client if TLS is used.
	err := validateTLSCertificateLevel(n.srv, n.ctx, tlsCertificateLevelClient)
	if err != nil {
		return err
	}
	if done, err := n.srv.forward("Node.UpdateAlloc", args, args, reply); done {
		return err
	}
	n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args)
	if authErr != nil {
		return structs.ErrPermissionDenied
	}

	defer metrics.MeasureSince([]string{"nomad", "client", "update_alloc"}, time.Now())

	// Ensure at least a single alloc
	if len(args.Alloc) == 0 {
		return fmt.Errorf("must update at least one allocation")
	}

	// Ensure the node is allowed to update allocs.
	// The node needs to successfully heartbeat before updating its allocs.
	// Only the first allocation's node ID is consulted for this check.
	nodeID := args.Alloc[0].NodeID
	if nodeID == "" {
		return fmt.Errorf("missing node ID")
	}

	node, err := n.srv.State().NodeByID(nil, nodeID)
	if err != nil {
		return fmt.Errorf("failed to retrieve node %s: %v", nodeID, err)
	}
	if node == nil {
		return fmt.Errorf("node %s not found", nodeID)
	}
	if node.UnresponsiveStatus() {
		return fmt.Errorf("node %s is not allowed to update allocs while in status %s", nodeID, node.Status)
	}

	// Ensure that evals aren't set from client RPCs
	// We create them here before the raft update
	if len(args.Evals) != 0 {
		return fmt.Errorf("evals field must not be set")
	}

	// Update modified timestamp for client initiated allocation updates
	now := time.Now()
	var evals []*structs.Evaluation

	// Decide, per updated allocation, whether an evaluation must accompany
	// the update. Every `continue` below means "no eval for this alloc"; the
	// allocation update itself is still batched further down.
	for _, allocToUpdate := range args.Alloc {
		evalTriggerBy := ""
		allocToUpdate.ModifyTime = now.UTC().UnixNano()

		// The lookup error is discarded; only a nil result is checked.
		alloc, _ := n.srv.State().AllocByID(nil, allocToUpdate.ID)
		if alloc == nil {
			continue
		}

		// Only terminal updates, or updates for allocs whose stored client
		// status is unknown, can require an eval.
		if !allocToUpdate.TerminalStatus() && alloc.ClientStatus != structs.AllocClientStatusUnknown {
			continue
		}

		var job *structs.Job
		var jobType string
		var jobPriority int

		job, err = n.srv.State().JobByID(nil, alloc.Namespace, alloc.JobID)
		if err != nil {
			n.logger.Debug("UpdateAlloc unable to find job", "job", alloc.JobID, "error", err)
			continue
		}

		// If the job is nil it means it has been de-registered.
		// Fall back to the job copy stored on the allocation for type and
		// priority, and ask for the alloc to be stopped.
		if job == nil {
			jobType = alloc.Job.Type
			jobPriority = alloc.Job.Priority
			evalTriggerBy = structs.EvalTriggerJobDeregister
			allocToUpdate.DesiredStatus = structs.AllocDesiredStatusStop
			n.logger.Debug("UpdateAlloc unable to find job - shutting down alloc", "job", alloc.JobID)
		}

		var taskGroup *structs.TaskGroup
		if job != nil {
			jobType = job.Type
			jobPriority = job.Priority
			taskGroup = job.LookupTaskGroup(alloc.TaskGroup)
		}

		// If we cannot find the task group for a failed alloc we cannot continue, unless it is an orphan.
		// Allocs that already have a follow-up eval ID are skipped here.
		if evalTriggerBy != structs.EvalTriggerJobDeregister &&
			allocToUpdate.ClientStatus == structs.AllocClientStatusFailed &&
			alloc.FollowupEvalID == "" {

			if taskGroup == nil {
				n.logger.Debug("UpdateAlloc unable to find task group for job", "job", alloc.JobID, "alloc", alloc.ID, "task_group", alloc.TaskGroup)
				continue
			}

			// Set trigger by failed if not an orphan.
			if alloc.RescheduleEligible(taskGroup.ReschedulePolicy, now) {
				evalTriggerBy = structs.EvalTriggerRetryFailedAlloc
			}
		}

		var eval *structs.Evaluation
		// If unknown, and not an orphan, set the trigger by.
		// This overrides a retry-failed trigger chosen above.
		if evalTriggerBy != structs.EvalTriggerJobDeregister &&
			alloc.ClientStatus == structs.AllocClientStatusUnknown {
			evalTriggerBy = structs.EvalTriggerReconnect
		}

		// If we weren't able to determine one of our expected eval triggers,
		// continue and don't create an eval.
		if evalTriggerBy == "" {
			continue
		}

		eval = &structs.Evaluation{
			ID:          uuid.Generate(),
			Namespace:   alloc.Namespace,
			TriggeredBy: evalTriggerBy,
			JobID:       alloc.JobID,
			Type:        jobType,
			Priority:    jobPriority,
			Status:      structs.EvalStatusPending,
			CreateTime:  now.UTC().UnixNano(),
			ModifyTime:  now.UTC().UnixNano(),
		}
		evals = append(evals, eval)
	}

	// Add this to the batch
	n.updatesLock.Lock()
	n.updates = append(n.updates, args.Alloc...)
	n.evals = append(n.evals, evals...)

	// Start a new batch if none
	// A nil updateFuture means no batch is pending, so this caller creates
	// the future and arms the timer; later callers reuse the same future.
	future := n.updateFuture
	if future == nil {
		future = structs.NewBatchFuture()
		n.updateFuture = future
		n.updateTimer = time.AfterFunc(batchUpdateInterval, func() {
			// Get the pending updates
			n.updatesLock.Lock()
			updates := n.updates
			evals := n.evals
			future := n.updateFuture

			// Assume future update patterns will be similar to
			// current batch and set cap appropriately to avoid
			// slice resizing.
			n.updates = make([]*structs.Allocation, 0, len(updates))
			n.evals = make([]*structs.Evaluation, 0, len(evals))

			// Clear the batch state so the next caller starts a new batch.
			n.updateFuture = nil
			n.updateTimer = nil
			n.updatesLock.Unlock()

			// Perform the batch update
			// This runs after the lock is released, on the captured batch.
			n.batchUpdate(future, updates, evals)
		})
	}
	n.updatesLock.Unlock()

	// Wait for the future
	if err := future.Wait(); err != nil {
		return err
	}

	// Setup the response
	reply.Index = future.Index()
	return nil
}
  1494  
  1495  // batchUpdate is used to update all the allocations
  1496  func (n *Node) batchUpdate(future *structs.BatchFuture, updates []*structs.Allocation, evals []*structs.Evaluation) {
  1497  	var mErr multierror.Error
  1498  	// Group pending evals by jobID to prevent creating unnecessary evals
  1499  	evalsByJobId := make(map[structs.NamespacedID]struct{})
  1500  	var trimmedEvals []*structs.Evaluation
  1501  	for _, eval := range evals {
  1502  		namespacedID := structs.NamespacedID{
  1503  			ID:        eval.JobID,
  1504  			Namespace: eval.Namespace,
  1505  		}
  1506  		_, exists := evalsByJobId[namespacedID]
  1507  		if !exists {
  1508  			now := time.Now().UTC().UnixNano()
  1509  			eval.CreateTime = now
  1510  			eval.ModifyTime = now
  1511  			trimmedEvals = append(trimmedEvals, eval)
  1512  			evalsByJobId[namespacedID] = struct{}{}
  1513  		}
  1514  	}
  1515  
  1516  	if len(trimmedEvals) > 0 {
  1517  		n.logger.Debug("adding evaluations for rescheduling failed allocations", "num_evals", len(trimmedEvals))
  1518  	}
  1519  	// Prepare the batch update
  1520  	batch := &structs.AllocUpdateRequest{
  1521  		Alloc:        updates,
  1522  		Evals:        trimmedEvals,
  1523  		WriteRequest: structs.WriteRequest{Region: n.srv.config.Region},
  1524  	}
  1525  
  1526  	// Commit this update via Raft
  1527  	_, index, err := n.srv.raftApply(structs.AllocClientUpdateRequestType, batch)
  1528  	if err != nil {
  1529  		n.logger.Error("alloc update failed", "error", err)
  1530  		mErr.Errors = append(mErr.Errors, err)
  1531  	}
  1532  
  1533  	// For each allocation we are updating, check if we should revoke any
  1534  	// - Vault token accessors
  1535  	// - Service Identity token accessors
  1536  	var (
  1537  		revokeVault []*structs.VaultAccessor
  1538  		revokeSI    []*structs.SITokenAccessor
  1539  	)
  1540  
  1541  	for _, alloc := range updates {
  1542  		// Skip any allocation that isn't dead on the client
  1543  		if !alloc.Terminated() {
  1544  			continue
  1545  		}
  1546  
  1547  		ws := memdb.NewWatchSet()
  1548  
  1549  		// Determine if there are any orphaned Vault accessors for the allocation
  1550  		if accessors, err := n.srv.State().VaultAccessorsByAlloc(ws, alloc.ID); err != nil {
  1551  			n.logger.Error("looking up vault accessors for alloc failed", "alloc_id", alloc.ID, "error", err)
  1552  			mErr.Errors = append(mErr.Errors, err)
  1553  		} else {
  1554  			revokeVault = append(revokeVault, accessors...)
  1555  		}
  1556  
  1557  		// Determine if there are any orphaned SI accessors for the allocation
  1558  		if accessors, err := n.srv.State().SITokenAccessorsByAlloc(ws, alloc.ID); err != nil {
  1559  			n.logger.Error("looking up si accessors for alloc failed", "alloc_id", alloc.ID, "error", err)
  1560  			mErr.Errors = append(mErr.Errors, err)
  1561  		} else {
  1562  			revokeSI = append(revokeSI, accessors...)
  1563  		}
  1564  	}
  1565  
  1566  	// Revoke any orphaned Vault token accessors
  1567  	if l := len(revokeVault); l > 0 {
  1568  		n.logger.Debug("revoking vault accessors due to terminal allocations", "num_accessors", l)
  1569  		if err := n.srv.vault.RevokeTokens(context.Background(), revokeVault, true); err != nil {
  1570  			n.logger.Error("batched vault accessor revocation failed", "error", err)
  1571  			mErr.Errors = append(mErr.Errors, err)
  1572  		}
  1573  	}
  1574  
  1575  	// Revoke any orphaned SI token accessors
  1576  	if l := len(revokeSI); l > 0 {
  1577  		n.logger.Debug("revoking si accessors due to terminal allocations", "num_accessors", l)
  1578  		_ = n.srv.consulACLs.RevokeTokens(context.Background(), revokeSI, true)
  1579  	}
  1580  
  1581  	// Respond to the future
  1582  	future.Respond(index, mErr.ErrorOrNil())
  1583  }
  1584  
  1585  // List is used to list the available nodes
  1586  func (n *Node) List(args *structs.NodeListRequest,
  1587  	reply *structs.NodeListResponse) error {
  1588  
  1589  	authErr := n.srv.Authenticate(n.ctx, args)
  1590  	if done, err := n.srv.forward("Node.List", args, args, reply); done {
  1591  		return err
  1592  	}
  1593  	n.srv.MeasureRPCRate("node", structs.RateMetricList, args)
  1594  	if authErr != nil {
  1595  		return structs.ErrPermissionDenied
  1596  	}
  1597  	defer metrics.MeasureSince([]string{"nomad", "client", "list"}, time.Now())
  1598  
  1599  	// Check node read permissions
  1600  	if aclObj, err := n.srv.ResolveACL(args); err != nil {
  1601  		return err
  1602  	} else if aclObj != nil && !aclObj.AllowNodeRead() {
  1603  		return structs.ErrPermissionDenied
  1604  	}
  1605  
  1606  	// Set up the blocking query.
  1607  	opts := blockingOptions{
  1608  		queryOpts: &args.QueryOptions,
  1609  		queryMeta: &reply.QueryMeta,
  1610  		run: func(ws memdb.WatchSet, state *state.StateStore) error {
  1611  
  1612  			var err error
  1613  			var iter memdb.ResultIterator
  1614  			if prefix := args.QueryOptions.Prefix; prefix != "" {
  1615  				iter, err = state.NodesByIDPrefix(ws, prefix)
  1616  			} else {
  1617  				iter, err = state.Nodes(ws)
  1618  			}
  1619  			if err != nil {
  1620  				return err
  1621  			}
  1622  
  1623  			// Generate the tokenizer to use for pagination using the populated
  1624  			// paginatorOpts object. The ID of a node must be unique within the
  1625  			// region, therefore we only need WithID on the paginator options.
  1626  			tokenizer := paginator.NewStructsTokenizer(iter, paginator.StructsTokenizerOptions{WithID: true})
  1627  
  1628  			var nodes []*structs.NodeListStub
  1629  
  1630  			// Build the paginator. This includes the function that is
  1631  			// responsible for appending a node to the nodes array.
  1632  			paginatorImpl, err := paginator.NewPaginator(iter, tokenizer, nil, args.QueryOptions,
  1633  				func(raw interface{}) error {
  1634  					nodes = append(nodes, raw.(*structs.Node).Stub(args.Fields))
  1635  					return nil
  1636  				})
  1637  			if err != nil {
  1638  				return structs.NewErrRPCCodedf(
  1639  					http.StatusBadRequest, "failed to create result paginator: %v", err)
  1640  			}
  1641  
  1642  			// Calling page populates our output nodes array as well as returns
  1643  			// the next token.
  1644  			nextToken, err := paginatorImpl.Page()
  1645  			if err != nil {
  1646  				return structs.NewErrRPCCodedf(
  1647  					http.StatusBadRequest, "failed to read result page: %v", err)
  1648  			}
  1649  
  1650  			// Populate the reply.
  1651  			reply.Nodes = nodes
  1652  			reply.NextToken = nextToken
  1653  
  1654  			// Use the last index that affected the jobs table
  1655  			index, err := state.Index("nodes")
  1656  			if err != nil {
  1657  				return err
  1658  			}
  1659  			reply.Index = index
  1660  
  1661  			// Set the query response
  1662  			n.srv.setQueryMeta(&reply.QueryMeta)
  1663  			return nil
  1664  		}}
  1665  	return n.srv.blockingRPC(&opts)
  1666  }
  1667  
  1668  // createNodeEvals is used to create evaluations for each alloc on a node.
  1669  // Each Eval is scoped to a job, so we need to potentially trigger many evals.
  1670  func (n *Node) createNodeEvals(node *structs.Node, nodeIndex uint64) ([]string, uint64, error) {
  1671  	nodeID := node.ID
  1672  
  1673  	// Snapshot the state
  1674  	snap, err := n.srv.fsm.State().Snapshot()
  1675  	if err != nil {
  1676  		return nil, 0, fmt.Errorf("failed to snapshot state: %v", err)
  1677  	}
  1678  
  1679  	// Find all the allocations for this node
  1680  	allocs, err := snap.AllocsByNode(nil, nodeID)
  1681  	if err != nil {
  1682  		return nil, 0, fmt.Errorf("failed to find allocs for '%s': %v", nodeID, err)
  1683  	}
  1684  
  1685  	sysJobsIter, err := snap.JobsByScheduler(nil, "system")
  1686  	if err != nil {
  1687  		return nil, 0, fmt.Errorf("failed to find system jobs for '%s': %v", nodeID, err)
  1688  	}
  1689  
  1690  	var sysJobs []*structs.Job
  1691  	for jobI := sysJobsIter.Next(); jobI != nil; jobI = sysJobsIter.Next() {
  1692  		job := jobI.(*structs.Job)
  1693  		// Avoid creating evals for jobs that don't run in this datacenter or
  1694  		// node pool. We could perform an entire feasibility check here, but
  1695  		// datacenter/pool is a good optimization to start with as their
  1696  		// cardinality tends to be low so the check shouldn't add much work.
  1697  		if node.IsInPool(job.NodePool) && node.IsInAnyDC(job.Datacenters) {
  1698  			sysJobs = append(sysJobs, job)
  1699  		}
  1700  	}
  1701  
  1702  	// Fast-path if nothing to do
  1703  	if len(allocs) == 0 && len(sysJobs) == 0 {
  1704  		return nil, 0, nil
  1705  	}
  1706  
  1707  	// Create an eval for each JobID affected
  1708  	var evals []*structs.Evaluation
  1709  	var evalIDs []string
  1710  	jobIDs := map[structs.NamespacedID]struct{}{}
  1711  	now := time.Now().UTC().UnixNano()
  1712  
  1713  	for _, alloc := range allocs {
  1714  		// Deduplicate on JobID
  1715  		if _, ok := jobIDs[alloc.JobNamespacedID()]; ok {
  1716  			continue
  1717  		}
  1718  		jobIDs[alloc.JobNamespacedID()] = struct{}{}
  1719  
  1720  		// Create a new eval
  1721  		eval := &structs.Evaluation{
  1722  			ID:              uuid.Generate(),
  1723  			Namespace:       alloc.Namespace,
  1724  			Priority:        alloc.Job.Priority,
  1725  			Type:            alloc.Job.Type,
  1726  			TriggeredBy:     structs.EvalTriggerNodeUpdate,
  1727  			JobID:           alloc.JobID,
  1728  			NodeID:          nodeID,
  1729  			NodeModifyIndex: nodeIndex,
  1730  			Status:          structs.EvalStatusPending,
  1731  			CreateTime:      now,
  1732  			ModifyTime:      now,
  1733  		}
  1734  
  1735  		evals = append(evals, eval)
  1736  		evalIDs = append(evalIDs, eval.ID)
  1737  	}
  1738  
  1739  	// Create an evaluation for each system job.
  1740  	for _, job := range sysJobs {
  1741  		// Still dedup on JobID as the node may already have the system job.
  1742  		if _, ok := jobIDs[job.NamespacedID()]; ok {
  1743  			continue
  1744  		}
  1745  		jobIDs[job.NamespacedID()] = struct{}{}
  1746  
  1747  		// Create a new eval
  1748  		eval := &structs.Evaluation{
  1749  			ID:              uuid.Generate(),
  1750  			Namespace:       job.Namespace,
  1751  			Priority:        job.Priority,
  1752  			Type:            job.Type,
  1753  			TriggeredBy:     structs.EvalTriggerNodeUpdate,
  1754  			JobID:           job.ID,
  1755  			NodeID:          nodeID,
  1756  			NodeModifyIndex: nodeIndex,
  1757  			Status:          structs.EvalStatusPending,
  1758  			CreateTime:      now,
  1759  			ModifyTime:      now,
  1760  		}
  1761  		evals = append(evals, eval)
  1762  		evalIDs = append(evalIDs, eval.ID)
  1763  	}
  1764  
  1765  	// Create the Raft transaction
  1766  	update := &structs.EvalUpdateRequest{
  1767  		Evals:        evals,
  1768  		WriteRequest: structs.WriteRequest{Region: n.srv.config.Region},
  1769  	}
  1770  
  1771  	// Commit this evaluation via Raft
  1772  	// XXX: There is a risk of partial failure where the node update succeeds
  1773  	// but that the EvalUpdate does not.
  1774  	_, evalIndex, err := n.srv.raftApply(structs.EvalUpdateRequestType, update)
  1775  	if err != nil {
  1776  		return nil, 0, err
  1777  	}
  1778  	return evalIDs, evalIndex, nil
  1779  }
  1780  
  1781  // DeriveVaultToken is used by the clients to request wrapped Vault tokens for
  1782  // tasks
  1783  func (n *Node) DeriveVaultToken(args *structs.DeriveVaultTokenRequest, reply *structs.DeriveVaultTokenResponse) error {
  1784  
  1785  	authErr := n.srv.Authenticate(n.ctx, args)
  1786  
  1787  	setError := func(e error, recoverable bool) {
  1788  		if e != nil {
  1789  			if re, ok := e.(*structs.RecoverableError); ok {
  1790  				reply.Error = re // No need to wrap if error is already a RecoverableError
  1791  			} else {
  1792  				reply.Error = structs.NewRecoverableError(e, recoverable).(*structs.RecoverableError)
  1793  			}
  1794  			n.logger.Error("DeriveVaultToken failed", "recoverable", recoverable, "error", e)
  1795  		}
  1796  	}
  1797  
  1798  	if done, err := n.srv.forward("Node.DeriveVaultToken", args, args, reply); done {
  1799  		setError(err, structs.IsRecoverable(err) || err == structs.ErrNoLeader)
  1800  		return nil
  1801  	}
  1802  	n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args)
  1803  	if authErr != nil {
  1804  		return structs.ErrPermissionDenied
  1805  	}
  1806  	defer metrics.MeasureSince([]string{"nomad", "client", "derive_vault_token"}, time.Now())
  1807  
  1808  	// Verify the arguments
  1809  	if args.NodeID == "" {
  1810  		setError(fmt.Errorf("missing node ID"), false)
  1811  		return nil
  1812  	}
  1813  	if args.SecretID == "" {
  1814  		setError(fmt.Errorf("missing node SecretID"), false)
  1815  		return nil
  1816  	}
  1817  	if args.AllocID == "" {
  1818  		setError(fmt.Errorf("missing allocation ID"), false)
  1819  		return nil
  1820  	}
  1821  	if len(args.Tasks) == 0 {
  1822  		setError(fmt.Errorf("no tasks specified"), false)
  1823  		return nil
  1824  	}
  1825  
  1826  	// Verify the following:
  1827  	// * The Node exists and has the correct SecretID
  1828  	// * The Allocation exists on the specified Node
  1829  	// * The Allocation contains the given tasks and they each require Vault
  1830  	//   tokens
  1831  	snap, err := n.srv.fsm.State().Snapshot()
  1832  	if err != nil {
  1833  		setError(err, false)
  1834  		return nil
  1835  	}
  1836  	ws := memdb.NewWatchSet()
  1837  	node, err := snap.NodeByID(ws, args.NodeID)
  1838  	if err != nil {
  1839  		setError(err, false)
  1840  		return nil
  1841  	}
  1842  	if node == nil {
  1843  		setError(fmt.Errorf("Node %q does not exist", args.NodeID), false)
  1844  		return nil
  1845  	}
  1846  	if node.SecretID != args.SecretID {
  1847  		setError(fmt.Errorf("SecretID mismatch"), false)
  1848  		return nil
  1849  	}
  1850  
  1851  	alloc, err := snap.AllocByID(ws, args.AllocID)
  1852  	if err != nil {
  1853  		setError(err, false)
  1854  		return nil
  1855  	}
  1856  	if alloc == nil {
  1857  		setError(fmt.Errorf("Allocation %q does not exist", args.AllocID), false)
  1858  		return nil
  1859  	}
  1860  	if alloc.NodeID != args.NodeID {
  1861  		setError(fmt.Errorf("Allocation %q not running on Node %q", args.AllocID, args.NodeID), false)
  1862  		return nil
  1863  	}
  1864  	if alloc.TerminalStatus() {
  1865  		setError(fmt.Errorf("Can't request Vault token for terminal allocation"), false)
  1866  		return nil
  1867  	}
  1868  
  1869  	// Check if alloc has Vault
  1870  	vaultBlocks := alloc.Job.Vault()
  1871  	if vaultBlocks == nil {
  1872  		setError(fmt.Errorf("Job does not require Vault token"), false)
  1873  		return nil
  1874  	}
  1875  	tg, ok := vaultBlocks[alloc.TaskGroup]
  1876  	if !ok {
  1877  		setError(fmt.Errorf("Task group does not require Vault token"), false)
  1878  		return nil
  1879  	}
  1880  
  1881  	var unneeded []string
  1882  	for _, task := range args.Tasks {
  1883  		taskVault := tg[task]
  1884  		if taskVault == nil || len(taskVault.Policies) == 0 {
  1885  			unneeded = append(unneeded, task)
  1886  		}
  1887  	}
  1888  
  1889  	if len(unneeded) != 0 {
  1890  		e := fmt.Errorf("Requested Vault tokens for tasks without defined Vault policies: %s",
  1891  			strings.Join(unneeded, ", "))
  1892  		setError(e, false)
  1893  		return nil
  1894  	}
  1895  
  1896  	// At this point the request is valid and we should contact Vault for
  1897  	// tokens.
  1898  
  1899  	// Create an error group where we will spin up a fixed set of goroutines to
  1900  	// handle deriving tokens but where if any fails the whole group is
  1901  	// canceled.
  1902  	g, ctx := errgroup.WithContext(context.Background())
  1903  
  1904  	// Cap the handlers
  1905  	handlers := len(args.Tasks)
  1906  	if handlers > maxParallelRequestsPerDerive {
  1907  		handlers = maxParallelRequestsPerDerive
  1908  	}
  1909  
  1910  	// Create the Vault Tokens
  1911  	input := make(chan string, handlers)
  1912  	results := make(map[string]*vapi.Secret, len(args.Tasks))
  1913  	for i := 0; i < handlers; i++ {
  1914  		g.Go(func() error {
  1915  			for {
  1916  				select {
  1917  				case task, ok := <-input:
  1918  					if !ok {
  1919  						return nil
  1920  					}
  1921  
  1922  					secret, err := n.srv.vault.CreateToken(ctx, alloc, task)
  1923  					if err != nil {
  1924  						return err
  1925  					}
  1926  
  1927  					results[task] = secret
  1928  				case <-ctx.Done():
  1929  					return nil
  1930  				}
  1931  			}
  1932  		})
  1933  	}
  1934  
  1935  	// Send the input
  1936  	go func() {
  1937  		defer close(input)
  1938  		for _, task := range args.Tasks {
  1939  			select {
  1940  			case <-ctx.Done():
  1941  				return
  1942  			case input <- task:
  1943  			}
  1944  		}
  1945  	}()
  1946  
  1947  	// Wait for everything to complete or for an error
  1948  	createErr := g.Wait()
  1949  
  1950  	// Retrieve the results
  1951  	accessors := make([]*structs.VaultAccessor, 0, len(results))
  1952  	tokens := make(map[string]string, len(results))
  1953  	for task, secret := range results {
  1954  		w := secret.WrapInfo
  1955  		tokens[task] = w.Token
  1956  		accessor := &structs.VaultAccessor{
  1957  			Accessor:    w.WrappedAccessor,
  1958  			Task:        task,
  1959  			NodeID:      alloc.NodeID,
  1960  			AllocID:     alloc.ID,
  1961  			CreationTTL: w.TTL,
  1962  		}
  1963  
  1964  		accessors = append(accessors, accessor)
  1965  	}
  1966  
  1967  	// If there was an error revoke the created tokens
  1968  	if createErr != nil {
  1969  		n.logger.Error("Vault token creation for alloc failed", "alloc_id", alloc.ID, "error", createErr)
  1970  
  1971  		if revokeErr := n.srv.vault.RevokeTokens(context.Background(), accessors, false); revokeErr != nil {
  1972  			n.logger.Error("Vault token revocation for alloc failed", "alloc_id", alloc.ID, "error", revokeErr)
  1973  		}
  1974  
  1975  		if rerr, ok := createErr.(*structs.RecoverableError); ok {
  1976  			reply.Error = rerr
  1977  		} else {
  1978  			reply.Error = structs.NewRecoverableError(createErr, false).(*structs.RecoverableError)
  1979  		}
  1980  
  1981  		return nil
  1982  	}
  1983  
  1984  	// Commit to Raft before returning any of the tokens
  1985  	req := structs.VaultAccessorsRequest{Accessors: accessors}
  1986  	_, index, err := n.srv.raftApply(structs.VaultAccessorRegisterRequestType, &req)
  1987  	if err != nil {
  1988  		n.logger.Error("registering Vault accessors for alloc failed", "alloc_id", alloc.ID, "error", err)
  1989  
  1990  		// Determine if we can recover from the error
  1991  		retry := false
  1992  		switch err {
  1993  		case raft.ErrNotLeader, raft.ErrLeadershipLost, raft.ErrRaftShutdown, raft.ErrEnqueueTimeout:
  1994  			retry = true
  1995  		}
  1996  
  1997  		setError(err, retry)
  1998  		return nil
  1999  	}
  2000  
  2001  	reply.Index = index
  2002  	reply.Tasks = tokens
  2003  	n.srv.setQueryMeta(&reply.QueryMeta)
  2004  	return nil
  2005  }
  2006  
// connectTask pairs the name of a task with its kind. connectTasks builds
// these for the tasks it accepts, and DeriveSIToken feeds them to its workers
// to populate the TaskName and TaskKind of each ServiceIdentityRequest.
type connectTask struct {
	// TaskKind is the Kind of the task as found in its task group.
	TaskKind structs.TaskKind
	// TaskName is the name of the task as given in the request.
	TaskName string
}
  2011  
  2012  func (n *Node) DeriveSIToken(args *structs.DeriveSITokenRequest, reply *structs.DeriveSITokenResponse) error {
  2013  
  2014  	authErr := n.srv.Authenticate(n.ctx, args)
  2015  
  2016  	setError := func(e error, recoverable bool) {
  2017  		if e != nil {
  2018  			if re, ok := e.(*structs.RecoverableError); ok {
  2019  				reply.Error = re // No need to wrap if error is already a RecoverableError
  2020  			} else {
  2021  				reply.Error = structs.NewRecoverableError(e, recoverable).(*structs.RecoverableError)
  2022  			}
  2023  			n.logger.Error("DeriveSIToken failed", "recoverable", recoverable, "error", e)
  2024  		}
  2025  	}
  2026  
  2027  	if done, err := n.srv.forward("Node.DeriveSIToken", args, args, reply); done {
  2028  		setError(err, structs.IsRecoverable(err) || err == structs.ErrNoLeader)
  2029  		return nil
  2030  	}
  2031  	n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args)
  2032  	if authErr != nil {
  2033  		return structs.ErrPermissionDenied
  2034  	}
  2035  	defer metrics.MeasureSince([]string{"nomad", "client", "derive_si_token"}, time.Now())
  2036  
  2037  	// Verify the arguments
  2038  	if err := args.Validate(); err != nil {
  2039  		setError(err, false)
  2040  		return nil
  2041  	}
  2042  
  2043  	// Get the ClusterID
  2044  	clusterID, err := n.srv.ClusterID()
  2045  	if err != nil {
  2046  		setError(err, false)
  2047  		return nil
  2048  	}
  2049  
  2050  	// Verify the following:
  2051  	// * The Node exists and has the correct SecretID.
  2052  	// * The Allocation exists on the specified Node.
  2053  	// * The Allocation contains the given tasks, and each task requires a
  2054  	//   SI token.
  2055  
  2056  	snap, err := n.srv.fsm.State().Snapshot()
  2057  	if err != nil {
  2058  		setError(err, false)
  2059  		return nil
  2060  	}
  2061  	node, err := snap.NodeByID(nil, args.NodeID)
  2062  	if err != nil {
  2063  		setError(err, false)
  2064  		return nil
  2065  	}
  2066  	if node == nil {
  2067  		setError(fmt.Errorf("Node %q does not exist", args.NodeID), false)
  2068  		return nil
  2069  	}
  2070  	if node.SecretID != args.SecretID {
  2071  		setError(errors.New("SecretID mismatch"), false)
  2072  		return nil
  2073  	}
  2074  
  2075  	alloc, err := snap.AllocByID(nil, args.AllocID)
  2076  	if err != nil {
  2077  		setError(err, false)
  2078  		return nil
  2079  	}
  2080  	if alloc == nil {
  2081  		setError(fmt.Errorf("Allocation %q does not exist", args.AllocID), false)
  2082  		return nil
  2083  	}
  2084  	if alloc.NodeID != args.NodeID {
  2085  		setError(fmt.Errorf("Allocation %q not running on node %q", args.AllocID, args.NodeID), false)
  2086  		return nil
  2087  	}
  2088  	if alloc.TerminalStatus() {
  2089  		setError(errors.New("Cannot request SI token for terminal allocation"), false)
  2090  		return nil
  2091  	}
  2092  
  2093  	// make sure task group contains at least one connect enabled service
  2094  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  2095  	if tg == nil {
  2096  		setError(fmt.Errorf("Allocation %q does not contain TaskGroup %q", args.AllocID, alloc.TaskGroup), false)
  2097  		return nil
  2098  	}
  2099  	if !tg.UsesConnect() {
  2100  		setError(fmt.Errorf("TaskGroup %q does not use Connect", tg.Name), false)
  2101  		return nil
  2102  	}
  2103  
  2104  	// make sure each task in args.Tasks is a connect-enabled task
  2105  	notConnect, tasks := connectTasks(tg, args.Tasks)
  2106  	if len(notConnect) > 0 {
  2107  		setError(fmt.Errorf(
  2108  			"Requested Consul Service Identity tokens for tasks that are not Connect enabled: %v",
  2109  			strings.Join(notConnect, ", "),
  2110  		), false)
  2111  	}
  2112  
  2113  	// At this point the request is valid and we should contact Consul for tokens.
  2114  
  2115  	// A lot of the following is copied from DeriveVaultToken which has been
  2116  	// working fine for years.
  2117  
  2118  	// Create an error group where we will spin up a fixed set of goroutines to
  2119  	// handle deriving tokens but where if any fails the whole group is
  2120  	// canceled.
  2121  	g, ctx := errgroup.WithContext(context.Background())
  2122  
  2123  	// Cap the worker threads
  2124  	numWorkers := len(args.Tasks)
  2125  	if numWorkers > maxParallelRequestsPerDerive {
  2126  		numWorkers = maxParallelRequestsPerDerive
  2127  	}
  2128  
  2129  	// would like to pull some of this out...
  2130  
  2131  	// Create the SI tokens from a slice of task name + connect service
  2132  	input := make(chan connectTask, numWorkers)
  2133  	results := make(map[string]*structs.SIToken, numWorkers)
  2134  	for i := 0; i < numWorkers; i++ {
  2135  		g.Go(func() error {
  2136  			for {
  2137  				select {
  2138  				case task, ok := <-input:
  2139  					if !ok {
  2140  						return nil
  2141  					}
  2142  					secret, err := n.srv.consulACLs.CreateToken(ctx, ServiceIdentityRequest{
  2143  						ConsulNamespace: tg.Consul.GetNamespace(),
  2144  						TaskKind:        task.TaskKind,
  2145  						TaskName:        task.TaskName,
  2146  						ClusterID:       clusterID,
  2147  						AllocID:         alloc.ID,
  2148  					})
  2149  					if err != nil {
  2150  						return err
  2151  					}
  2152  					results[task.TaskName] = secret
  2153  				case <-ctx.Done():
  2154  					return nil
  2155  				}
  2156  			}
  2157  		})
  2158  	}
  2159  
  2160  	// Send the input
  2161  	go func() {
  2162  		defer close(input)
  2163  		for _, connectTask := range tasks {
  2164  			select {
  2165  			case <-ctx.Done():
  2166  				return
  2167  			case input <- connectTask:
  2168  			}
  2169  		}
  2170  	}()
  2171  
  2172  	// Wait for everything to complete or for an error
  2173  	createErr := g.Wait()
  2174  
  2175  	accessors := make([]*structs.SITokenAccessor, 0, len(results))
  2176  	tokens := make(map[string]string, len(results))
  2177  	for task, secret := range results {
  2178  		tokens[task] = secret.SecretID
  2179  		accessor := &structs.SITokenAccessor{
  2180  			ConsulNamespace: tg.Consul.GetNamespace(),
  2181  			NodeID:          alloc.NodeID,
  2182  			AllocID:         alloc.ID,
  2183  			TaskName:        task,
  2184  			AccessorID:      secret.AccessorID,
  2185  		}
  2186  		accessors = append(accessors, accessor)
  2187  	}
  2188  
  2189  	// If there was an error, revoke all created tokens. These tokens have not
  2190  	// yet been committed to the persistent store.
  2191  	if createErr != nil {
  2192  		n.logger.Error("Consul Service Identity token creation for alloc failed", "alloc_id", alloc.ID, "error", createErr)
  2193  		_ = n.srv.consulACLs.RevokeTokens(context.Background(), accessors, false)
  2194  
  2195  		if recoverable, ok := createErr.(*structs.RecoverableError); ok {
  2196  			reply.Error = recoverable
  2197  		} else {
  2198  			reply.Error = structs.NewRecoverableError(createErr, false).(*structs.RecoverableError)
  2199  		}
  2200  
  2201  		return nil
  2202  	}
  2203  
  2204  	// Commit the derived tokens to raft before returning them
  2205  	requested := structs.SITokenAccessorsRequest{Accessors: accessors}
  2206  	_, index, err := n.srv.raftApply(structs.ServiceIdentityAccessorRegisterRequestType, &requested)
  2207  	if err != nil {
  2208  		n.logger.Error("registering Service Identity token accessors for alloc failed", "alloc_id", alloc.ID, "error", err)
  2209  
  2210  		// Determine if we can recover from the error
  2211  		retry := false
  2212  		switch err {
  2213  		case raft.ErrNotLeader, raft.ErrLeadershipLost, raft.ErrRaftShutdown, raft.ErrEnqueueTimeout:
  2214  			retry = true
  2215  		}
  2216  		setError(err, retry)
  2217  		return nil
  2218  	}
  2219  
  2220  	// We made it! Now we can set the reply.
  2221  	reply.Index = index
  2222  	reply.Tokens = tokens
  2223  	n.srv.setQueryMeta(&reply.QueryMeta)
  2224  	return nil
  2225  }
  2226  
  2227  func connectTasks(tg *structs.TaskGroup, tasks []string) ([]string, []connectTask) {
  2228  	var notConnect []string
  2229  	var usesConnect []connectTask
  2230  	for _, task := range tasks {
  2231  		tgTask := tg.LookupTask(task)
  2232  		if !taskUsesConnect(tgTask) {
  2233  			notConnect = append(notConnect, task)
  2234  		} else {
  2235  			usesConnect = append(usesConnect, connectTask{
  2236  				TaskName: task,
  2237  				TaskKind: tgTask.Kind,
  2238  			})
  2239  		}
  2240  	}
  2241  	return notConnect, usesConnect
  2242  }
  2243  
  2244  func taskUsesConnect(task *structs.Task) bool {
  2245  	if task == nil {
  2246  		// not even in the task group
  2247  		return false
  2248  	}
  2249  	return task.UsesConnect()
  2250  }
  2251  
  2252  func (n *Node) EmitEvents(args *structs.EmitNodeEventsRequest, reply *structs.EmitNodeEventsResponse) error {
  2253  
  2254  	authErr := n.srv.Authenticate(n.ctx, args)
  2255  
  2256  	// Ensure the connection was initiated by another client if TLS is used.
  2257  	err := validateTLSCertificateLevel(n.srv, n.ctx, tlsCertificateLevelClient)
  2258  	if err != nil {
  2259  		return err
  2260  	}
  2261  	if done, err := n.srv.forward("Node.EmitEvents", args, args, reply); done {
  2262  		return err
  2263  	}
  2264  	n.srv.MeasureRPCRate("node", structs.RateMetricWrite, args)
  2265  	if authErr != nil {
  2266  		return structs.ErrPermissionDenied
  2267  	}
  2268  	defer metrics.MeasureSince([]string{"nomad", "client", "emit_events"}, time.Now())
  2269  
  2270  	if len(args.NodeEvents) == 0 {
  2271  		return fmt.Errorf("no node events given")
  2272  	}
  2273  	for nodeID, events := range args.NodeEvents {
  2274  		if len(events) == 0 {
  2275  			return fmt.Errorf("no node events given for node %q", nodeID)
  2276  		}
  2277  	}
  2278  
  2279  	_, index, err := n.srv.raftApply(structs.UpsertNodeEventsType, args)
  2280  	if err != nil {
  2281  		n.logger.Error("upserting node events failed", "error", err)
  2282  		return err
  2283  	}
  2284  
  2285  	reply.Index = index
  2286  	return nil
  2287  }