github.com/manicqin/nomad@v0.9.5/nomad/node_endpoint.go (about)

     1  package nomad
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"strings"
     7  	"sync"
     8  	"time"
     9  
    10  	"golang.org/x/sync/errgroup"
    11  
    12  	metrics "github.com/armon/go-metrics"
    13  	log "github.com/hashicorp/go-hclog"
    14  	memdb "github.com/hashicorp/go-memdb"
    15  	multierror "github.com/hashicorp/go-multierror"
    16  	vapi "github.com/hashicorp/vault/api"
    17  
    18  	"github.com/hashicorp/nomad/acl"
    19  	"github.com/hashicorp/nomad/helper/uuid"
    20  	"github.com/hashicorp/nomad/nomad/state"
    21  	"github.com/hashicorp/nomad/nomad/structs"
    22  	"github.com/hashicorp/raft"
    23  )
    24  
    25  const (
    26  	// batchUpdateInterval is how long we wait to batch updates
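         	// (client allocation updates received within this window are
         	// coalesced into a single Raft apply by batchUpdate)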
    27  	batchUpdateInterval = 50 * time.Millisecond
    28  
     29  	// maxParallelRequestsPerDerive is the maximum number of parallel Vault
    30  	// create token requests that may be outstanding per derive request
    31  	maxParallelRequestsPerDerive = 16
    32  
    33  	// NodeDrainEvents are the various drain messages
    34  	NodeDrainEventDrainSet      = "Node drain strategy set"
    35  	NodeDrainEventDrainDisabled = "Node drain disabled"
     36  	NodeDrainEventDrainUpdated  = "Node drain strategy updated"
    37  
     38  	// NodeEligibilityEventEligible is used when the node's eligibility is marked
    39  	// eligible
    40  	NodeEligibilityEventEligible = "Node marked as eligible for scheduling"
    41  
     42  	// NodeEligibilityEventIneligible is used when the node's eligibility is marked
    43  	// ineligible
    44  	NodeEligibilityEventIneligible = "Node marked as ineligible for scheduling"
    45  
    46  	// NodeHeartbeatEventReregistered is the message used when the node becomes
    47  	// reregistered by the heartbeat.
    48  	NodeHeartbeatEventReregistered = "Node reregistered by heartbeat"
    49  )
    50  
    51  // Node endpoint is used for client interactions
    52  type Node struct {
    53  	srv    *Server
    54  	logger log.Logger
    55  
    56  	// ctx provides context regarding the underlying connection
    57  	ctx *RPCContext
    58  
    59  	// updates holds pending client status updates for allocations
    60  	updates []*structs.Allocation
    61  
    62  	// evals holds pending rescheduling eval updates triggered by failed allocations
    63  	evals []*structs.Evaluation
    64  
    65  	// updateFuture is used to wait for the pending batch update
    66  	// to complete. This may be nil if no batch is pending.
    67  	updateFuture *structs.BatchFuture
    68  
    69  	// updateTimer is the timer that will trigger the next batch
    70  	// update, and may be nil if there is no batch pending.
    71  	updateTimer *time.Timer
    72  
    73  	// updatesLock synchronizes access to the updates list,
    74  	// the future and the timer.
    75  	updatesLock sync.Mutex
    76  }
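
         // Batching flow: UpdateAlloc appends to updates/evals and arms updateTimer;
         // when the timer fires, batchUpdate commits the whole batch in a single Raft
         // apply and unblocks callers waiting on updateFuture.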
    77  
    78  // Register is used to upsert a client that is available for scheduling
    79  func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUpdateResponse) error {
    80  	if done, err := n.srv.forward("Node.Register", args, args, reply); done {
    81  		// We have a valid node connection since there is no error from the
    82  		// forwarded server, so add the mapping to cache the
    83  		// connection and allow the server to send RPCs to the client.
    84  		if err == nil && n.ctx != nil && n.ctx.NodeID == "" {
    85  			n.ctx.NodeID = args.Node.ID
    86  			n.srv.addNodeConn(n.ctx)
    87  		}
    88  
    89  		return err
    90  	}
    91  	defer metrics.MeasureSince([]string{"nomad", "client", "register"}, time.Now())
    92  
    93  	if n.srv.config.ACLEnforceNode {
     94  		// Check node RPC write permissions
    95  		if aclObj, err := n.srv.ResolveToken(args.AuthToken); err != nil {
    96  			return err
    97  		} else if aclObj != nil && !aclObj.AllowNodeRPCWrite() {
    98  			return structs.ErrPermissionDenied
    99  		}
   100  	}
   101  
   102  	// Validate the arguments
   103  	if args.Node == nil {
   104  		return fmt.Errorf("missing node for client registration")
   105  	}
   106  	if args.Node.ID == "" {
   107  		return fmt.Errorf("missing node ID for client registration")
   108  	}
   109  	if args.Node.Datacenter == "" {
   110  		return fmt.Errorf("missing datacenter for client registration")
   111  	}
   112  	if args.Node.Name == "" {
   113  		return fmt.Errorf("missing node name for client registration")
   114  	}
   115  	if len(args.Node.Attributes) == 0 {
   116  		return fmt.Errorf("missing attributes for client registration")
   117  	}
   118  	if args.Node.SecretID == "" {
   119  		return fmt.Errorf("missing node secret ID for client registration")
   120  	}
   121  
   122  	// Default the status if none is given
   123  	if args.Node.Status == "" {
   124  		args.Node.Status = structs.NodeStatusInit
   125  	}
   126  	if !structs.ValidNodeStatus(args.Node.Status) {
   127  		return fmt.Errorf("invalid status for node")
   128  	}
   129  
   130  	// Default to eligible for scheduling if unset
   131  	if args.Node.SchedulingEligibility == "" {
   132  		args.Node.SchedulingEligibility = structs.NodeSchedulingEligible
   133  	}
   134  
   135  	// Set the timestamp when the node is registered
   136  	args.Node.StatusUpdatedAt = time.Now().Unix()
   137  
   138  	// Compute the node class
   139  	if err := args.Node.ComputeClass(); err != nil {
    140  		return fmt.Errorf("failed to compute node class: %v", err)
   141  	}
   142  
   143  	// Look for the node so we can detect a state transition
   144  	snap, err := n.srv.fsm.State().Snapshot()
   145  	if err != nil {
   146  		return err
   147  	}
   148  
   149  	ws := memdb.NewWatchSet()
   150  	originalNode, err := snap.NodeByID(ws, args.Node.ID)
   151  	if err != nil {
   152  		return err
   153  	}
   154  
   155  	// Check if the SecretID has been tampered with
   156  	if originalNode != nil {
   157  		if args.Node.SecretID != originalNode.SecretID && originalNode.SecretID != "" {
   158  			return fmt.Errorf("node secret ID does not match. Not registering node.")
   159  		}
   160  	}
   161  
   162  	// We have a valid node connection, so add the mapping to cache the
   163  	// connection and allow the server to send RPCs to the client. We only cache
   164  	// the connection if it is not being forwarded from another server.
   165  	if n.ctx != nil && n.ctx.NodeID == "" && !args.IsForwarded() {
   166  		n.ctx.NodeID = args.Node.ID
   167  		n.srv.addNodeConn(n.ctx)
   168  	}
   169  
   170  	// Commit this update via Raft
   171  	_, index, err := n.srv.raftApply(structs.NodeRegisterRequestType, args)
   172  	if err != nil {
   173  		n.logger.Error("register failed", "error", err)
   174  		return err
   175  	}
   176  	reply.NodeModifyIndex = index
   177  
   178  	// Check if we should trigger evaluations
   179  	originalStatus := structs.NodeStatusInit
   180  	if originalNode != nil {
   181  		originalStatus = originalNode.Status
   182  	}
   183  	transitionToReady := transitionedToReady(args.Node.Status, originalStatus)
   184  	if structs.ShouldDrainNode(args.Node.Status) || transitionToReady {
   185  		evalIDs, evalIndex, err := n.createNodeEvals(args.Node.ID, index)
   186  		if err != nil {
   187  			n.logger.Error("eval creation failed", "error", err)
   188  			return err
   189  		}
   190  		reply.EvalIDs = evalIDs
   191  		reply.EvalCreateIndex = evalIndex
   192  	}
   193  
   194  	// Check if we need to setup a heartbeat
   195  	if !args.Node.TerminalStatus() {
   196  		ttl, err := n.srv.resetHeartbeatTimer(args.Node.ID)
   197  		if err != nil {
   198  			n.logger.Error("heartbeat reset failed", "error", err)
   199  			return err
   200  		}
   201  		reply.HeartbeatTTL = ttl
   202  	}
   203  
   204  	// Set the reply index
   205  	reply.Index = index
   206  	snap, err = n.srv.fsm.State().Snapshot()
   207  	if err != nil {
   208  		return err
   209  	}
   210  
   211  	n.srv.peerLock.RLock()
   212  	defer n.srv.peerLock.RUnlock()
   213  	if err := n.constructNodeServerInfoResponse(snap, reply); err != nil {
   214  		n.logger.Error("failed to populate NodeUpdateResponse", "error", err)
   215  		return err
   216  	}
   217  
   218  	return nil
   219  }
   220  
    221  // constructNodeServerInfoResponse assumes the n.srv.peerLock is held for reading.
   222  func (n *Node) constructNodeServerInfoResponse(snap *state.StateSnapshot, reply *structs.NodeUpdateResponse) error {
   223  	reply.LeaderRPCAddr = string(n.srv.raft.Leader())
   224  
   225  	// Reply with config information required for future RPC requests
   226  	reply.Servers = make([]*structs.NodeServerInfo, 0, len(n.srv.localPeers))
   227  	for _, v := range n.srv.localPeers {
   228  		reply.Servers = append(reply.Servers,
   229  			&structs.NodeServerInfo{
   230  				RPCAdvertiseAddr: v.RPCAddr.String(),
   231  				RPCMajorVersion:  int32(v.MajorVersion),
   232  				RPCMinorVersion:  int32(v.MinorVersion),
   233  				Datacenter:       v.Datacenter,
   234  			})
   235  	}
   236  
   237  	// TODO(sean@): Use an indexed node count instead
   238  	//
   239  	// Snapshot is used only to iterate over all nodes to create a node
   240  	// count to send back to Nomad Clients in their heartbeat so Clients
   241  	// can estimate the size of the cluster.
   242  	ws := memdb.NewWatchSet()
   243  	iter, err := snap.Nodes(ws)
   244  	if err == nil {
   245  		for {
   246  			raw := iter.Next()
   247  			if raw == nil {
   248  				break
   249  			}
   250  			reply.NumNodes++
   251  		}
   252  	}
   253  
   254  	return nil
   255  }
   256  
   257  // Deregister is used to remove a client from the cluster. If a client should
   258  // just be made unavailable for scheduling, a status update is preferred.
   259  func (n *Node) Deregister(args *structs.NodeDeregisterRequest, reply *structs.NodeUpdateResponse) error {
   260  	if done, err := n.srv.forward("Node.Deregister", args, args, reply); done {
   261  		return err
   262  	}
   263  	defer metrics.MeasureSince([]string{"nomad", "client", "deregister"}, time.Now())
   264  
   265  	if args.NodeID == "" {
   266  		return fmt.Errorf("missing node ID for client deregistration")
   267  	}
   268  
    269  	// deregister expects a batch request, so repack the single node ID
   270  	repack := &structs.NodeBatchDeregisterRequest{
   271  		NodeIDs:      []string{args.NodeID},
   272  		WriteRequest: args.WriteRequest,
   273  	}
   274  
   275  	return n.deregister(repack, reply, func() (interface{}, uint64, error) {
   276  		return n.srv.raftApply(structs.NodeDeregisterRequestType, args)
   277  	})
   278  }
   279  
   280  // BatchDeregister is used to remove client nodes from the cluster.
   281  func (n *Node) BatchDeregister(args *structs.NodeBatchDeregisterRequest, reply *structs.NodeUpdateResponse) error {
   282  	if done, err := n.srv.forward("Node.BatchDeregister", args, args, reply); done {
   283  		return err
   284  	}
   285  	defer metrics.MeasureSince([]string{"nomad", "client", "batch_deregister"}, time.Now())
   286  
   287  	if len(args.NodeIDs) == 0 {
   288  		return fmt.Errorf("missing node IDs for client deregistration")
   289  	}
   290  
   291  	return n.deregister(args, reply, func() (interface{}, uint64, error) {
   292  		return n.srv.raftApply(structs.NodeBatchDeregisterRequestType, args)
   293  	})
   294  }
   295  
    296  // deregister takes a raftApplyFn closure so it can serve both Deregister and BatchDeregister
   297  func (n *Node) deregister(args *structs.NodeBatchDeregisterRequest,
   298  	reply *structs.NodeUpdateResponse,
   299  	raftApplyFn func() (interface{}, uint64, error),
   300  ) error {
   301  	// Check request permissions
   302  	if aclObj, err := n.srv.ResolveToken(args.AuthToken); err != nil {
   303  		return err
   304  	} else if aclObj != nil && !aclObj.AllowNodeWrite() {
   305  		return structs.ErrPermissionDenied
   306  	}
   307  
   308  	// Look for the node
   309  	snap, err := n.srv.fsm.State().Snapshot()
   310  	if err != nil {
   311  		return err
   312  	}
   313  
   314  	ws := memdb.NewWatchSet()
   315  	for _, nodeID := range args.NodeIDs {
   316  		node, err := snap.NodeByID(ws, nodeID)
   317  		if err != nil {
   318  			return err
   319  		}
   320  		if node == nil {
   321  			return fmt.Errorf("node not found")
   322  		}
   323  	}
   324  
   325  	// Commit this update via Raft
   326  	_, index, err := raftApplyFn()
   327  	if err != nil {
   328  		n.logger.Error("raft message failed", "error", err)
   329  		return err
   330  	}
   331  
   332  	for _, nodeID := range args.NodeIDs {
   333  		// Clear the heartbeat timer if any
   334  		n.srv.clearHeartbeatTimer(nodeID)
   335  
   336  		// Create the evaluations for this node
   337  		evalIDs, evalIndex, err := n.createNodeEvals(nodeID, index)
   338  		if err != nil {
   339  			n.logger.Error("eval creation failed", "error", err)
   340  			return err
   341  		}
   342  
   343  		// Determine if there are any Vault accessors on the node
   344  		accessors, err := snap.VaultAccessorsByNode(ws, nodeID)
   345  		if err != nil {
   346  			n.logger.Error("looking up accessors for node failed", "node_id", nodeID, "error", err)
   347  			return err
   348  		}
   349  
   350  		if l := len(accessors); l != 0 {
   351  			n.logger.Debug("revoking accessors on node due to deregister", "num_accessors", l, "node_id", nodeID)
   352  			if err := n.srv.vault.RevokeTokens(context.Background(), accessors, true); err != nil {
   353  				n.logger.Error("revoking accessors for node failed", "node_id", nodeID, "error", err)
   354  				return err
   355  			}
   356  		}
   357  
   358  		reply.EvalIDs = append(reply.EvalIDs, evalIDs...)
   359  		// Set the reply eval create index just the first time
   360  		if reply.EvalCreateIndex == 0 {
   361  			reply.EvalCreateIndex = evalIndex
   362  		}
   363  	}
   364  
   365  	reply.NodeModifyIndex = index
   366  	reply.Index = index
   367  	return nil
   368  }
   369  
   370  // UpdateStatus is used to update the status of a client node
   371  func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *structs.NodeUpdateResponse) error {
   372  	if done, err := n.srv.forward("Node.UpdateStatus", args, args, reply); done {
   373  		// We have a valid node connection since there is no error from the
   374  		// forwarded server, so add the mapping to cache the
   375  		// connection and allow the server to send RPCs to the client.
   376  		if err == nil && n.ctx != nil && n.ctx.NodeID == "" {
   377  			n.ctx.NodeID = args.NodeID
   378  			n.srv.addNodeConn(n.ctx)
   379  		}
   380  
   381  		return err
   382  	}
   383  
   384  	defer metrics.MeasureSince([]string{"nomad", "client", "update_status"}, time.Now())
   385  
   386  	// Verify the arguments
   387  	if args.NodeID == "" {
   388  		return fmt.Errorf("missing node ID for client status update")
   389  	}
   390  	if !structs.ValidNodeStatus(args.Status) {
   391  		return fmt.Errorf("invalid status for node")
   392  	}
   393  
   394  	// Look for the node
   395  	snap, err := n.srv.fsm.State().Snapshot()
   396  	if err != nil {
   397  		return err
   398  	}
   399  
   400  	ws := memdb.NewWatchSet()
   401  	node, err := snap.NodeByID(ws, args.NodeID)
   402  	if err != nil {
   403  		return err
   404  	}
   405  	if node == nil {
   406  		return fmt.Errorf("node not found")
   407  	}
   408  
   409  	// We have a valid node connection, so add the mapping to cache the
   410  	// connection and allow the server to send RPCs to the client. We only cache
   411  	// the connection if it is not being forwarded from another server.
   412  	if n.ctx != nil && n.ctx.NodeID == "" && !args.IsForwarded() {
   413  		n.ctx.NodeID = args.NodeID
   414  		n.srv.addNodeConn(n.ctx)
   415  	}
   416  
   417  	// XXX: Could use the SecretID here but have to update the heartbeat system
   418  	// to track SecretIDs.
   419  
   420  	// Update the timestamp of when the node status was updated
   421  	args.UpdatedAt = time.Now().Unix()
   422  
   423  	// Commit this update via Raft
   424  	var index uint64
   425  	if node.Status != args.Status {
    426  		// Attach an event if a heartbeat is transitioning the node out of
    427  		// the down state
   428  		if node.Status == structs.NodeStatusDown && args.NodeEvent == nil {
   429  			args.NodeEvent = structs.NewNodeEvent().
   430  				SetSubsystem(structs.NodeEventSubsystemCluster).
   431  				SetMessage(NodeHeartbeatEventReregistered)
   432  		}
   433  
   434  		_, index, err = n.srv.raftApply(structs.NodeUpdateStatusRequestType, args)
   435  		if err != nil {
   436  			n.logger.Error("status update failed", "error", err)
   437  			return err
   438  		}
   439  		reply.NodeModifyIndex = index
   440  	}
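         	// Note: if the status did not change, no Raft entry was applied above and
         	// index stays zero, so reply.Index below will also be zero.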
   441  
   442  	// Check if we should trigger evaluations
   443  	transitionToReady := transitionedToReady(args.Status, node.Status)
   444  	if structs.ShouldDrainNode(args.Status) || transitionToReady {
   445  		evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index)
   446  		if err != nil {
   447  			n.logger.Error("eval creation failed", "error", err)
   448  			return err
   449  		}
   450  		reply.EvalIDs = evalIDs
   451  		reply.EvalCreateIndex = evalIndex
   452  	}
   453  
    454  	// Revoke Vault accessors if the node is down, otherwise reset its heartbeat
   455  	switch args.Status {
   456  	case structs.NodeStatusDown:
   457  		// Determine if there are any Vault accessors on the node
   458  		accessors, err := n.srv.State().VaultAccessorsByNode(ws, args.NodeID)
   459  		if err != nil {
   460  			n.logger.Error("looking up accessors for node failed", "node_id", args.NodeID, "error", err)
   461  			return err
   462  		}
   463  
   464  		if l := len(accessors); l != 0 {
   465  			n.logger.Debug("revoking accessors on node due to down state", "num_accessors", l, "node_id", args.NodeID)
   466  			if err := n.srv.vault.RevokeTokens(context.Background(), accessors, true); err != nil {
   467  				n.logger.Error("revoking accessors for node failed", "node_id", args.NodeID, "error", err)
   468  				return err
   469  			}
   470  		}
   471  	default:
   472  		ttl, err := n.srv.resetHeartbeatTimer(args.NodeID)
   473  		if err != nil {
   474  			n.logger.Error("heartbeat reset failed", "error", err)
   475  			return err
   476  		}
   477  		reply.HeartbeatTTL = ttl
   478  	}
   479  
   480  	// Set the reply index and leader
   481  	reply.Index = index
   482  	n.srv.peerLock.RLock()
   483  	defer n.srv.peerLock.RUnlock()
   484  	if err := n.constructNodeServerInfoResponse(snap, reply); err != nil {
   485  		n.logger.Error("failed to populate NodeUpdateResponse", "error", err)
   486  		return err
   487  	}
   488  
   489  	return nil
   490  }
   491  
    492  // transitionedToReady is a helper that takes a node's new and old status and
   493  // returns whether it has transitioned to ready.
   494  func transitionedToReady(newStatus, oldStatus string) bool {
   495  	initToReady := oldStatus == structs.NodeStatusInit && newStatus == structs.NodeStatusReady
   496  	terminalToReady := oldStatus == structs.NodeStatusDown && newStatus == structs.NodeStatusReady
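         	// (down is the only status treated as terminal for this check)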
   497  	return initToReady || terminalToReady
   498  }
   499  
   500  // UpdateDrain is used to update the drain mode of a client node
   501  func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest,
   502  	reply *structs.NodeDrainUpdateResponse) error {
   503  	if done, err := n.srv.forward("Node.UpdateDrain", args, args, reply); done {
   504  		return err
   505  	}
   506  	defer metrics.MeasureSince([]string{"nomad", "client", "update_drain"}, time.Now())
   507  
   508  	// Check node write permissions
   509  	if aclObj, err := n.srv.ResolveToken(args.AuthToken); err != nil {
   510  		return err
   511  	} else if aclObj != nil && !aclObj.AllowNodeWrite() {
   512  		return structs.ErrPermissionDenied
   513  	}
   514  
   515  	// Verify the arguments
   516  	if args.NodeID == "" {
   517  		return fmt.Errorf("missing node ID for drain update")
   518  	}
   519  	if args.NodeEvent != nil {
   520  		return fmt.Errorf("node event must not be set")
   521  	}
   522  
   523  	// Look for the node
   524  	snap, err := n.srv.fsm.State().Snapshot()
   525  	if err != nil {
   526  		return err
   527  	}
   528  	node, err := snap.NodeByID(nil, args.NodeID)
   529  	if err != nil {
   530  		return err
   531  	}
   532  	if node == nil {
   533  		return fmt.Errorf("node not found")
   534  	}
   535  
   536  	now := time.Now().UTC()
   537  
   538  	// Update the timestamp of when the node status was updated
   539  	args.UpdatedAt = now.Unix()
   540  
   541  	// COMPAT: Remove in 0.9. Attempt to upgrade the request if it is of the old
   542  	// format.
   543  	if args.Drain && args.DrainStrategy == nil {
   544  		args.DrainStrategy = &structs.DrainStrategy{
   545  			DrainSpec: structs.DrainSpec{
   546  				Deadline: -1 * time.Second, // Force drain
   547  			},
   548  		}
   549  	}
   550  
   551  	// Setup drain strategy
   552  	if args.DrainStrategy != nil {
   553  		// Mark start time for the drain
   554  		if node.DrainStrategy == nil {
   555  			args.DrainStrategy.StartedAt = now
   556  		} else {
   557  			args.DrainStrategy.StartedAt = node.DrainStrategy.StartedAt
   558  		}
   559  
   560  		// Mark the deadline time
   561  		if args.DrainStrategy.Deadline.Nanoseconds() > 0 {
   562  			args.DrainStrategy.ForceDeadline = now.Add(args.DrainStrategy.Deadline)
   563  		}
   564  	}
   565  
   566  	// Construct the node event
   567  	args.NodeEvent = structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemDrain)
   568  	if node.DrainStrategy == nil && args.DrainStrategy != nil {
   569  		args.NodeEvent.SetMessage(NodeDrainEventDrainSet)
   570  	} else if node.DrainStrategy != nil && args.DrainStrategy != nil {
   571  		args.NodeEvent.SetMessage(NodeDrainEventDrainUpdated)
   572  	} else if node.DrainStrategy != nil && args.DrainStrategy == nil {
   573  		args.NodeEvent.SetMessage(NodeDrainEventDrainDisabled)
   574  	} else {
   575  		args.NodeEvent = nil
   576  	}
   577  
   578  	// Commit this update via Raft
   579  	_, index, err := n.srv.raftApply(structs.NodeUpdateDrainRequestType, args)
   580  	if err != nil {
   581  		n.logger.Error("drain update failed", "error", err)
   582  		return err
   583  	}
   584  	reply.NodeModifyIndex = index
   585  
   586  	// If the node is transitioning to be eligible, create Node evaluations
   587  	// because there may be a System job registered that should be evaluated.
   588  	if node.SchedulingEligibility == structs.NodeSchedulingIneligible && args.MarkEligible && args.DrainStrategy == nil {
   589  		evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index)
   590  		if err != nil {
   591  			n.logger.Error("eval creation failed", "error", err)
   592  			return err
   593  		}
   594  		reply.EvalIDs = evalIDs
   595  		reply.EvalCreateIndex = evalIndex
   596  	}
   597  
   598  	// Set the reply index
   599  	reply.Index = index
   600  	return nil
   601  }
   602  
   603  // UpdateEligibility is used to update the scheduling eligibility of a node
   604  func (n *Node) UpdateEligibility(args *structs.NodeUpdateEligibilityRequest,
   605  	reply *structs.NodeEligibilityUpdateResponse) error {
   606  	if done, err := n.srv.forward("Node.UpdateEligibility", args, args, reply); done {
   607  		return err
   608  	}
   609  	defer metrics.MeasureSince([]string{"nomad", "client", "update_eligibility"}, time.Now())
   610  
   611  	// Check node write permissions
   612  	if aclObj, err := n.srv.ResolveToken(args.AuthToken); err != nil {
   613  		return err
   614  	} else if aclObj != nil && !aclObj.AllowNodeWrite() {
   615  		return structs.ErrPermissionDenied
   616  	}
   617  
   618  	// Verify the arguments
   619  	if args.NodeID == "" {
   620  		return fmt.Errorf("missing node ID for setting scheduling eligibility")
   621  	}
   622  	if args.NodeEvent != nil {
   623  		return fmt.Errorf("node event must not be set")
   624  	}
   625  
   626  	// Check that only allowed types are set
   627  	switch args.Eligibility {
   628  	case structs.NodeSchedulingEligible, structs.NodeSchedulingIneligible:
   629  	default:
   630  		return fmt.Errorf("invalid scheduling eligibility %q", args.Eligibility)
   631  	}
   632  
   633  	// Look for the node
   634  	snap, err := n.srv.fsm.State().Snapshot()
   635  	if err != nil {
   636  		return err
   637  	}
   638  	node, err := snap.NodeByID(nil, args.NodeID)
   639  	if err != nil {
   640  		return err
   641  	}
   642  	if node == nil {
   643  		return fmt.Errorf("node not found")
   644  	}
   645  
   646  	if node.DrainStrategy != nil && args.Eligibility == structs.NodeSchedulingEligible {
   647  		return fmt.Errorf("can not set node's scheduling eligibility to eligible while it is draining")
   648  	}
   649  
   650  	switch args.Eligibility {
   651  	case structs.NodeSchedulingEligible, structs.NodeSchedulingIneligible:
   652  	default:
   653  		return fmt.Errorf("invalid scheduling eligibility %q", args.Eligibility)
   654  	}
   655  
   656  	// Update the timestamp of when the node status was updated
   657  	args.UpdatedAt = time.Now().Unix()
   658  
   659  	// Construct the node event
   660  	args.NodeEvent = structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster)
   661  	if node.SchedulingEligibility == args.Eligibility {
   662  		return nil // Nothing to do
   663  	} else if args.Eligibility == structs.NodeSchedulingEligible {
   664  		args.NodeEvent.SetMessage(NodeEligibilityEventEligible)
   665  	} else {
   666  		args.NodeEvent.SetMessage(NodeEligibilityEventIneligible)
   667  	}
   668  
   669  	// Commit this update via Raft
   670  	outErr, index, err := n.srv.raftApply(structs.NodeUpdateEligibilityRequestType, args)
   671  	if err != nil {
   672  		n.logger.Error("eligibility update failed", "error", err)
   673  		return err
   674  	}
   675  	if outErr != nil {
   676  		if err, ok := outErr.(error); ok && err != nil {
   677  			n.logger.Error("eligibility update failed", "error", err)
   678  			return err
   679  		}
   680  	}
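         	// (raftApply can report an application-level error through its first
         	// return value even when err is nil, hence the extra check above)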
   681  
   682  	// If the node is transitioning to be eligible, create Node evaluations
   683  	// because there may be a System job registered that should be evaluated.
   684  	if node.SchedulingEligibility == structs.NodeSchedulingIneligible && args.Eligibility == structs.NodeSchedulingEligible {
   685  		evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index)
   686  		if err != nil {
   687  			n.logger.Error("eval creation failed", "error", err)
   688  			return err
   689  		}
   690  		reply.EvalIDs = evalIDs
   691  		reply.EvalCreateIndex = evalIndex
   692  	}
   693  
   694  	// Set the reply index
   695  	reply.Index = index
   696  	return nil
   697  }
   698  
   699  // Evaluate is used to force a re-evaluation of the node
   700  func (n *Node) Evaluate(args *structs.NodeEvaluateRequest, reply *structs.NodeUpdateResponse) error {
   701  	if done, err := n.srv.forward("Node.Evaluate", args, args, reply); done {
   702  		return err
   703  	}
   704  	defer metrics.MeasureSince([]string{"nomad", "client", "evaluate"}, time.Now())
   705  
   706  	// Check node write permissions
   707  	if aclObj, err := n.srv.ResolveToken(args.AuthToken); err != nil {
   708  		return err
   709  	} else if aclObj != nil && !aclObj.AllowNodeWrite() {
   710  		return structs.ErrPermissionDenied
   711  	}
   712  
   713  	// Verify the arguments
   714  	if args.NodeID == "" {
   715  		return fmt.Errorf("missing node ID for evaluation")
   716  	}
   717  
   718  	// Look for the node
   719  	snap, err := n.srv.fsm.State().Snapshot()
   720  	if err != nil {
   721  		return err
   722  	}
   723  	ws := memdb.NewWatchSet()
   724  	node, err := snap.NodeByID(ws, args.NodeID)
   725  	if err != nil {
   726  		return err
   727  	}
   728  	if node == nil {
   729  		return fmt.Errorf("node not found")
   730  	}
   731  
   732  	// Create the evaluation
   733  	evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, node.ModifyIndex)
   734  	if err != nil {
   735  		n.logger.Error("eval creation failed", "error", err)
   736  		return err
   737  	}
   738  	reply.EvalIDs = evalIDs
   739  	reply.EvalCreateIndex = evalIndex
   740  
   741  	// Set the reply index
   742  	reply.Index = evalIndex
   743  
   744  	n.srv.peerLock.RLock()
   745  	defer n.srv.peerLock.RUnlock()
   746  	if err := n.constructNodeServerInfoResponse(snap, reply); err != nil {
   747  		n.logger.Error("failed to populate NodeUpdateResponse", "error", err)
   748  		return err
   749  	}
   750  	return nil
   751  }
   752  
   753  // GetNode is used to request information about a specific node
   754  func (n *Node) GetNode(args *structs.NodeSpecificRequest,
   755  	reply *structs.SingleNodeResponse) error {
   756  	if done, err := n.srv.forward("Node.GetNode", args, args, reply); done {
   757  		return err
   758  	}
   759  	defer metrics.MeasureSince([]string{"nomad", "client", "get_node"}, time.Now())
   760  
   761  	// Check node read permissions
   762  	if aclObj, err := n.srv.ResolveToken(args.AuthToken); err != nil {
    763  		// If ResolveToken had an unexpected error, return it
   764  		if err != structs.ErrTokenNotFound {
   765  			return err
   766  		}
   767  
   768  		// Attempt to lookup AuthToken as a Node.SecretID since nodes
   769  		// call this endpoint and don't have an ACL token.
   770  		node, stateErr := n.srv.fsm.State().NodeBySecretID(nil, args.AuthToken)
   771  		if stateErr != nil {
   772  			// Return the original ResolveToken error with this err
   773  			var merr multierror.Error
   774  			merr.Errors = append(merr.Errors, err, stateErr)
   775  			return merr.ErrorOrNil()
   776  		}
   777  
    778  		// Neither a valid node SecretID nor a valid ACL token
   779  		if node == nil {
   780  			return structs.ErrTokenNotFound
   781  		}
   782  	} else if aclObj != nil && !aclObj.AllowNodeRead() {
   783  		return structs.ErrPermissionDenied
   784  	}
   785  
   786  	// Setup the blocking query
   787  	opts := blockingOptions{
   788  		queryOpts: &args.QueryOptions,
   789  		queryMeta: &reply.QueryMeta,
   790  		run: func(ws memdb.WatchSet, state *state.StateStore) error {
   791  			// Verify the arguments
   792  			if args.NodeID == "" {
   793  				return fmt.Errorf("missing node ID")
   794  			}
   795  
   796  			// Look for the node
   797  			out, err := state.NodeByID(ws, args.NodeID)
   798  			if err != nil {
   799  				return err
   800  			}
   801  
   802  			// Setup the output
   803  			if out != nil {
   804  				// Clear the secret ID
   805  				reply.Node = out.Copy()
   806  				reply.Node.SecretID = ""
   807  				reply.Index = out.ModifyIndex
   808  			} else {
   809  				// Use the last index that affected the nodes table
   810  				index, err := state.Index("nodes")
   811  				if err != nil {
   812  					return err
   813  				}
   814  				reply.Node = nil
   815  				reply.Index = index
   816  			}
   817  
   818  			// Set the query response
   819  			n.srv.setQueryMeta(&reply.QueryMeta)
   820  			return nil
   821  		}}
   822  	return n.srv.blockingRPC(&opts)
   823  }
   824  
   825  // GetAllocs is used to request allocations for a specific node
   826  func (n *Node) GetAllocs(args *structs.NodeSpecificRequest,
   827  	reply *structs.NodeAllocsResponse) error {
   828  	if done, err := n.srv.forward("Node.GetAllocs", args, args, reply); done {
   829  		return err
   830  	}
   831  	defer metrics.MeasureSince([]string{"nomad", "client", "get_allocs"}, time.Now())
   832  
   833  	// Check node read and namespace job read permissions
   834  	aclObj, err := n.srv.ResolveToken(args.AuthToken)
   835  	if err != nil {
   836  		return err
   837  	}
   838  	if aclObj != nil && !aclObj.AllowNodeRead() {
   839  		return structs.ErrPermissionDenied
   840  	}
   841  
   842  	// cache namespace perms
   843  	readableNamespaces := map[string]bool{}
   844  
   845  	// readNS is a caching namespace read-job helper
   846  	readNS := func(ns string) bool {
   847  		if aclObj == nil {
   848  			// ACLs are disabled; everything is readable
   849  			return true
   850  		}
   851  
   852  		if readable, ok := readableNamespaces[ns]; ok {
   853  			// cache hit
   854  			return readable
   855  		}
   856  
   857  		// cache miss
   858  		readable := aclObj.AllowNsOp(ns, acl.NamespaceCapabilityReadJob)
   859  		readableNamespaces[ns] = readable
   860  		return readable
   861  	}
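         	// readNS is consulted per allocation below so that allocs in namespaces
         	// the token cannot read are filtered out of the reply.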
   862  
   863  	// Verify the arguments
   864  	if args.NodeID == "" {
   865  		return fmt.Errorf("missing node ID")
   866  	}
   867  
   868  	// Setup the blocking query
   869  	opts := blockingOptions{
   870  		queryOpts: &args.QueryOptions,
   871  		queryMeta: &reply.QueryMeta,
   872  		run: func(ws memdb.WatchSet, state *state.StateStore) error {
   873  			// Look for the node
   874  			allocs, err := state.AllocsByNode(ws, args.NodeID)
   875  			if err != nil {
   876  				return err
   877  			}
   878  
   879  			// Setup the output
   880  			if n := len(allocs); n != 0 {
   881  				reply.Allocs = make([]*structs.Allocation, 0, n)
   882  				for _, alloc := range allocs {
   883  					if readNS(alloc.Namespace) {
   884  						reply.Allocs = append(reply.Allocs, alloc)
   885  					}
   886  
   887  					// Get the max of all allocs since
   888  					// subsequent requests need to start
   889  					// from the latest index
   890  					reply.Index = maxUint64(reply.Index, alloc.ModifyIndex)
   891  				}
   892  			} else {
   893  				reply.Allocs = nil
   894  
    895  				// Use the last index that affected the allocs table
   896  				index, err := state.Index("allocs")
   897  				if err != nil {
   898  					return err
   899  				}
   900  
   901  				// Must provide non-zero index to prevent blocking
   902  				// Index 1 is impossible anyways (due to Raft internals)
   903  				if index == 0 {
   904  					reply.Index = 1
   905  				} else {
   906  					reply.Index = index
   907  				}
   908  			}
   909  			return nil
   910  		}}
   911  	return n.srv.blockingRPC(&opts)
   912  }
   913  
   914  // GetClientAllocs is used to request a lightweight list of alloc modify indexes
   915  // per allocation.
   916  func (n *Node) GetClientAllocs(args *structs.NodeSpecificRequest,
   917  	reply *structs.NodeClientAllocsResponse) error {
   918  	if done, err := n.srv.forward("Node.GetClientAllocs", args, args, reply); done {
   919  		// We have a valid node connection since there is no error from the
   920  		// forwarded server, so add the mapping to cache the
   921  		// connection and allow the server to send RPCs to the client.
   922  		if err == nil && n.ctx != nil && n.ctx.NodeID == "" {
   923  			n.ctx.NodeID = args.NodeID
   924  			n.srv.addNodeConn(n.ctx)
   925  		}
   926  
   927  		return err
   928  	}
   929  	defer metrics.MeasureSince([]string{"nomad", "client", "get_client_allocs"}, time.Now())
   930  
   931  	// Verify the arguments
   932  	if args.NodeID == "" {
   933  		return fmt.Errorf("missing node ID")
   934  	}
   935  
    936  	// numOldAllocs is used to detect if there is a garbage collection event
    937  	// that affects the node. When an allocation is garbage collected, the modify
    938  	// indexes of the remaining allocations do not change and thus the query won't
    939  	// unblock, even though the set of allocations on the node has changed.
   940  	var numOldAllocs int
   941  
   942  	// Setup the blocking query
   943  	opts := blockingOptions{
   944  		queryOpts: &args.QueryOptions,
   945  		queryMeta: &reply.QueryMeta,
   946  		run: func(ws memdb.WatchSet, state *state.StateStore) error {
   947  			// Look for the node
   948  			node, err := state.NodeByID(ws, args.NodeID)
   949  			if err != nil {
   950  				return err
   951  			}
   952  
   953  			var allocs []*structs.Allocation
   954  			if node != nil {
   955  				if args.SecretID == "" {
   956  					return fmt.Errorf("missing node secret ID for client status update")
   957  				} else if args.SecretID != node.SecretID {
   958  					return fmt.Errorf("node secret ID does not match")
   959  				}
   960  
   961  				// We have a valid node connection, so add the mapping to cache the
   962  				// connection and allow the server to send RPCs to the client. We only cache
   963  				// the connection if it is not being forwarded from another server.
   964  				if n.ctx != nil && n.ctx.NodeID == "" && !args.IsForwarded() {
   965  					n.ctx.NodeID = args.NodeID
   966  					n.srv.addNodeConn(n.ctx)
   967  				}
   968  
   969  				var err error
   970  				allocs, err = state.AllocsByNode(ws, args.NodeID)
   971  				if err != nil {
   972  					return err
   973  				}
   974  			}
   975  
   976  			reply.Allocs = make(map[string]uint64)
   977  			reply.MigrateTokens = make(map[string]string)
   978  
   979  			// preferTableIndex is used to determine whether we should build the
   980  			// response index based on the full table indexes versus the modify
   981  			// indexes of the allocations on the specific node. This is
   982  			// preferred in the case that the node doesn't yet have allocations
			// or when we detect a GC that affects the node.
   984  			preferTableIndex := true
   985  
   986  			// Setup the output
   987  			if numAllocs := len(allocs); numAllocs != 0 {
   988  				preferTableIndex = false
   989  
   990  				for _, alloc := range allocs {
   991  					reply.Allocs[alloc.ID] = alloc.AllocModifyIndex
   992  
   993  					// If the allocation is going to do a migration, create a
   994  					// migration token so that the client can authenticate with
   995  					// the node hosting the previous allocation.
   996  					if alloc.ShouldMigrate() {
   997  						prevAllocation, err := state.AllocByID(ws, alloc.PreviousAllocation)
   998  						if err != nil {
   999  							return err
  1000  						}
  1001  
  1002  						if prevAllocation != nil && prevAllocation.NodeID != alloc.NodeID {
  1003  							allocNode, err := state.NodeByID(ws, prevAllocation.NodeID)
  1004  							if err != nil {
  1005  								return err
  1006  							}
  1007  							if allocNode == nil {
  1008  								// Node must have been GC'd so skip the token
  1009  								continue
  1010  							}
  1011  
  1012  							token, err := structs.GenerateMigrateToken(prevAllocation.ID, allocNode.SecretID)
  1013  							if err != nil {
  1014  								return err
  1015  							}
  1016  							reply.MigrateTokens[alloc.ID] = token
  1017  						}
  1018  					}
  1019  
  1020  					reply.Index = maxUint64(reply.Index, alloc.ModifyIndex)
  1021  				}
  1022  
   1023  				// Determine if we have fewer allocations than before. This
   1024  				// indicates there was a garbage collection
  1025  				if numAllocs < numOldAllocs {
  1026  					preferTableIndex = true
  1027  				}
  1028  
  1029  				// Store the new number of allocations
  1030  				numOldAllocs = numAllocs
  1031  			}
  1032  
  1033  			if preferTableIndex {
   1034  				// Use the last index that affected the allocs table
  1035  				index, err := state.Index("allocs")
  1036  				if err != nil {
  1037  					return err
  1038  				}
  1039  
  1040  				// Must provide non-zero index to prevent blocking
  1041  				// Index 1 is impossible anyways (due to Raft internals)
  1042  				if index == 0 {
  1043  					reply.Index = 1
  1044  				} else {
  1045  					reply.Index = index
  1046  				}
  1047  			}
  1048  			return nil
  1049  		}}
  1050  	return n.srv.blockingRPC(&opts)
  1051  }
  1052  
  1053  // UpdateAlloc is used to update the client status of an allocation
  1054  func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.GenericResponse) error {
  1055  	if done, err := n.srv.forward("Node.UpdateAlloc", args, args, reply); done {
  1056  		return err
  1057  	}
  1058  	defer metrics.MeasureSince([]string{"nomad", "client", "update_alloc"}, time.Now())
  1059  
  1060  	// Ensure at least a single alloc
  1061  	if len(args.Alloc) == 0 {
  1062  		return fmt.Errorf("must update at least one allocation")
  1063  	}
  1064  
  1065  	// Ensure that evals aren't set from client RPCs
  1066  	// We create them here before the raft update
  1067  	if len(args.Evals) != 0 {
  1068  		return fmt.Errorf("evals field must not be set")
  1069  	}
  1070  
  1071  	// Update modified timestamp for client initiated allocation updates
  1072  	now := time.Now()
  1073  	var evals []*structs.Evaluation
  1074  
  1075  	for _, alloc := range args.Alloc {
  1076  		alloc.ModifyTime = now.UTC().UnixNano()
  1077  
  1078  		// Add an evaluation if this is a failed alloc that is eligible for rescheduling
  1079  		if alloc.ClientStatus == structs.AllocClientStatusFailed {
  1080  			// Only create evaluations if this is an existing alloc,
  1081  			// and eligible as per its task group's ReschedulePolicy
  1082  			if existingAlloc, _ := n.srv.State().AllocByID(nil, alloc.ID); existingAlloc != nil {
  1083  				job, err := n.srv.State().JobByID(nil, existingAlloc.Namespace, existingAlloc.JobID)
  1084  				if err != nil {
  1085  					n.logger.Error("UpdateAlloc unable to find job", "job", existingAlloc.JobID, "error", err)
  1086  					continue
  1087  				}
  1088  				if job == nil {
  1089  					n.logger.Debug("UpdateAlloc unable to find job", "job", existingAlloc.JobID)
  1090  					continue
  1091  				}
  1092  				taskGroup := job.LookupTaskGroup(existingAlloc.TaskGroup)
  1093  				if taskGroup != nil && existingAlloc.FollowupEvalID == "" && existingAlloc.RescheduleEligible(taskGroup.ReschedulePolicy, now) {
  1094  					eval := &structs.Evaluation{
  1095  						ID:          uuid.Generate(),
  1096  						Namespace:   existingAlloc.Namespace,
  1097  						TriggeredBy: structs.EvalTriggerRetryFailedAlloc,
  1098  						JobID:       existingAlloc.JobID,
  1099  						Type:        job.Type,
  1100  						Priority:    job.Priority,
  1101  						Status:      structs.EvalStatusPending,
  1102  						CreateTime:  now.UTC().UnixNano(),
  1103  						ModifyTime:  now.UTC().UnixNano(),
  1104  					}
  1105  					evals = append(evals, eval)
  1106  				}
  1107  			}
  1108  		}
  1109  	}
  1110  
  1111  	// Add this to the batch
  1112  	n.updatesLock.Lock()
  1113  	n.updates = append(n.updates, args.Alloc...)
  1114  	n.evals = append(n.evals, evals...)
  1115  
  1116  	// Start a new batch if none
  1117  	future := n.updateFuture
  1118  	if future == nil {
  1119  		future = structs.NewBatchFuture()
  1120  		n.updateFuture = future
  1121  		n.updateTimer = time.AfterFunc(batchUpdateInterval, func() {
  1122  			// Get the pending updates
  1123  			n.updatesLock.Lock()
  1124  			updates := n.updates
  1125  			evals := n.evals
  1126  			future := n.updateFuture
  1127  			n.updates = nil
  1128  			n.evals = nil
  1129  			n.updateFuture = nil
  1130  			n.updateTimer = nil
  1131  			n.updatesLock.Unlock()
  1132  
  1133  			// Perform the batch update
  1134  			n.batchUpdate(future, updates, evals)
  1135  		})
  1136  	}
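         	// Updates that arrive while a batch is pending share the existing future
         	// and timer and are committed together when the timer fires.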
  1137  	n.updatesLock.Unlock()
  1138  
  1139  	// Wait for the future
  1140  	if err := future.Wait(); err != nil {
  1141  		return err
  1142  	}
  1143  
  1144  	// Setup the response
  1145  	reply.Index = future.Index()
  1146  	return nil
  1147  }
  1148  
  1149  // batchUpdate is used to update all the allocations
  1150  func (n *Node) batchUpdate(future *structs.BatchFuture, updates []*structs.Allocation, evals []*structs.Evaluation) {
  1151  	// Group pending evals by jobID to prevent creating unnecessary evals
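         	// (only the first eval per namespace/job pair is kept; its timestamps are
         	// refreshed below before the Raft apply)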
  1152  	evalsByJobId := make(map[structs.NamespacedID]struct{})
  1153  	var trimmedEvals []*structs.Evaluation
  1154  	for _, eval := range evals {
  1155  		namespacedID := structs.NamespacedID{
  1156  			ID:        eval.JobID,
  1157  			Namespace: eval.Namespace,
  1158  		}
  1159  		_, exists := evalsByJobId[namespacedID]
  1160  		if !exists {
  1161  			now := time.Now().UTC().UnixNano()
  1162  			eval.CreateTime = now
  1163  			eval.ModifyTime = now
  1164  			trimmedEvals = append(trimmedEvals, eval)
  1165  			evalsByJobId[namespacedID] = struct{}{}
  1166  		}
  1167  	}
  1168  
  1169  	if len(trimmedEvals) > 0 {
  1170  		n.logger.Debug("adding evaluations for rescheduling failed allocations", "num_evals", len(trimmedEvals))
  1171  	}
  1172  	// Prepare the batch update
  1173  	batch := &structs.AllocUpdateRequest{
  1174  		Alloc:        updates,
  1175  		Evals:        trimmedEvals,
  1176  		WriteRequest: structs.WriteRequest{Region: n.srv.config.Region},
  1177  	}
  1178  
  1179  	// Commit this update via Raft
  1180  	var mErr multierror.Error
  1181  	_, index, err := n.srv.raftApply(structs.AllocClientUpdateRequestType, batch)
  1182  	if err != nil {
  1183  		n.logger.Error("alloc update failed", "error", err)
  1184  		mErr.Errors = append(mErr.Errors, err)
  1185  	}
  1186  
   1187  	// For each allocation we are updating, check if we should revoke any
  1188  	// Vault Accessors
  1189  	var revoke []*structs.VaultAccessor
  1190  	for _, alloc := range updates {
  1191  		// Skip any allocation that isn't dead on the client
  1192  		if !alloc.Terminated() {
  1193  			continue
  1194  		}
  1195  
  1196  		// Determine if there are any Vault accessors for the allocation
  1197  		ws := memdb.NewWatchSet()
  1198  		accessors, err := n.srv.State().VaultAccessorsByAlloc(ws, alloc.ID)
  1199  		if err != nil {
  1200  			n.logger.Error("looking up Vault accessors for alloc failed", "alloc_id", alloc.ID, "error", err)
  1201  			mErr.Errors = append(mErr.Errors, err)
  1202  		}
  1203  
  1204  		revoke = append(revoke, accessors...)
  1205  	}
  1206  
  1207  	if l := len(revoke); l != 0 {
  1208  		n.logger.Debug("revoking accessors due to terminal allocations", "num_accessors", l)
  1209  		if err := n.srv.vault.RevokeTokens(context.Background(), revoke, true); err != nil {
  1210  			n.logger.Error("batched Vault accessor revocation failed", "error", err)
  1211  			mErr.Errors = append(mErr.Errors, err)
  1212  		}
  1213  	}
  1214  
  1215  	// Respond to the future
  1216  	future.Respond(index, mErr.ErrorOrNil())
  1217  }
  1218  
  1219  // List is used to list the available nodes
  1220  func (n *Node) List(args *structs.NodeListRequest,
  1221  	reply *structs.NodeListResponse) error {
  1222  	if done, err := n.srv.forward("Node.List", args, args, reply); done {
  1223  		return err
  1224  	}
  1225  	defer metrics.MeasureSince([]string{"nomad", "client", "list"}, time.Now())
  1226  
  1227  	// Check node read permissions
  1228  	if aclObj, err := n.srv.ResolveToken(args.AuthToken); err != nil {
  1229  		return err
  1230  	} else if aclObj != nil && !aclObj.AllowNodeRead() {
  1231  		return structs.ErrPermissionDenied
  1232  	}
  1233  
  1234  	// Setup the blocking query
  1235  	opts := blockingOptions{
  1236  		queryOpts: &args.QueryOptions,
  1237  		queryMeta: &reply.QueryMeta,
  1238  		run: func(ws memdb.WatchSet, state *state.StateStore) error {
  1239  			// Capture all the nodes
  1240  			var err error
  1241  			var iter memdb.ResultIterator
  1242  			if prefix := args.QueryOptions.Prefix; prefix != "" {
  1243  				iter, err = state.NodesByIDPrefix(ws, prefix)
  1244  			} else {
  1245  				iter, err = state.Nodes(ws)
  1246  			}
  1247  			if err != nil {
  1248  				return err
  1249  			}
  1250  
  1251  			var nodes []*structs.NodeListStub
  1252  			for {
  1253  				raw := iter.Next()
  1254  				if raw == nil {
  1255  					break
  1256  				}
  1257  				node := raw.(*structs.Node)
  1258  				nodes = append(nodes, node.Stub())
  1259  			}
  1260  			reply.Nodes = nodes
  1261  
   1262  			// Use the last index that affected the nodes table
  1263  			index, err := state.Index("nodes")
  1264  			if err != nil {
  1265  				return err
  1266  			}
  1267  			reply.Index = index
  1268  
  1269  			// Set the query response
  1270  			n.srv.setQueryMeta(&reply.QueryMeta)
  1271  			return nil
  1272  		}}
  1273  	return n.srv.blockingRPC(&opts)
  1274  }
  1275  
  1276  // createNodeEvals is used to create evaluations for each alloc on a node.
  1277  // Each Eval is scoped to a job, so we need to potentially trigger many evals.
  1278  func (n *Node) createNodeEvals(nodeID string, nodeIndex uint64) ([]string, uint64, error) {
  1279  	// Snapshot the state
  1280  	snap, err := n.srv.fsm.State().Snapshot()
  1281  	if err != nil {
  1282  		return nil, 0, fmt.Errorf("failed to snapshot state: %v", err)
  1283  	}
  1284  
  1285  	// Find all the allocations for this node
  1286  	ws := memdb.NewWatchSet()
  1287  	allocs, err := snap.AllocsByNode(ws, nodeID)
  1288  	if err != nil {
  1289  		return nil, 0, fmt.Errorf("failed to find allocs for '%s': %v", nodeID, err)
  1290  	}
  1291  
  1292  	sysJobsIter, err := snap.JobsByScheduler(ws, "system")
  1293  	if err != nil {
  1294  		return nil, 0, fmt.Errorf("failed to find system jobs for '%s': %v", nodeID, err)
  1295  	}
  1296  
  1297  	var sysJobs []*structs.Job
  1298  	for job := sysJobsIter.Next(); job != nil; job = sysJobsIter.Next() {
  1299  		sysJobs = append(sysJobs, job.(*structs.Job))
  1300  	}
  1301  
  1302  	// Fast-path if nothing to do
  1303  	if len(allocs) == 0 && len(sysJobs) == 0 {
  1304  		return nil, 0, nil
  1305  	}
  1306  
  1307  	// Create an eval for each JobID affected
  1308  	var evals []*structs.Evaluation
  1309  	var evalIDs []string
  1310  	jobIDs := make(map[string]struct{})
  1311  	now := time.Now().UTC().UnixNano()
  1312  
  1313  	for _, alloc := range allocs {
  1314  		// Deduplicate on JobID
  1315  		if _, ok := jobIDs[alloc.JobID]; ok {
  1316  			continue
  1317  		}
  1318  		jobIDs[alloc.JobID] = struct{}{}
  1319  
  1320  		// Create a new eval
  1321  		eval := &structs.Evaluation{
  1322  			ID:              uuid.Generate(),
  1323  			Namespace:       alloc.Namespace,
  1324  			Priority:        alloc.Job.Priority,
  1325  			Type:            alloc.Job.Type,
  1326  			TriggeredBy:     structs.EvalTriggerNodeUpdate,
  1327  			JobID:           alloc.JobID,
  1328  			NodeID:          nodeID,
  1329  			NodeModifyIndex: nodeIndex,
  1330  			Status:          structs.EvalStatusPending,
  1331  			CreateTime:      now,
  1332  			ModifyTime:      now,
  1333  		}
  1334  		evals = append(evals, eval)
  1335  		evalIDs = append(evalIDs, eval.ID)
  1336  	}
  1337  
  1338  	// Create an evaluation for each system job.
  1339  	for _, job := range sysJobs {
  1340  		// Still dedup on JobID as the node may already have the system job.
  1341  		if _, ok := jobIDs[job.ID]; ok {
  1342  			continue
  1343  		}
  1344  		jobIDs[job.ID] = struct{}{}
  1345  
  1346  		// Create a new eval
  1347  		eval := &structs.Evaluation{
  1348  			ID:              uuid.Generate(),
  1349  			Namespace:       job.Namespace,
  1350  			Priority:        job.Priority,
  1351  			Type:            job.Type,
  1352  			TriggeredBy:     structs.EvalTriggerNodeUpdate,
  1353  			JobID:           job.ID,
  1354  			NodeID:          nodeID,
  1355  			NodeModifyIndex: nodeIndex,
  1356  			Status:          structs.EvalStatusPending,
  1357  			CreateTime:      now,
  1358  			ModifyTime:      now,
  1359  		}
  1360  		evals = append(evals, eval)
  1361  		evalIDs = append(evalIDs, eval.ID)
  1362  	}
  1363  
  1364  	// Create the Raft transaction
  1365  	update := &structs.EvalUpdateRequest{
  1366  		Evals:        evals,
  1367  		WriteRequest: structs.WriteRequest{Region: n.srv.config.Region},
  1368  	}
  1369  
  1370  	// Commit this evaluation via Raft
  1371  	// XXX: There is a risk of partial failure where the node update succeeds
   1372  	// but the EvalUpdate does not.
  1373  	_, evalIndex, err := n.srv.raftApply(structs.EvalUpdateRequestType, update)
  1374  	if err != nil {
  1375  		return nil, 0, err
  1376  	}
  1377  	return evalIDs, evalIndex, nil
  1378  }
  1379  
  1380  // DeriveVaultToken is used by the clients to request wrapped Vault tokens for
  1381  // tasks
  1382  func (n *Node) DeriveVaultToken(args *structs.DeriveVaultTokenRequest,
  1383  	reply *structs.DeriveVaultTokenResponse) error {
  1384  
  1385  	// setErr is a helper for setting the recoverable error on the reply and
  1386  	// logging it
  1387  	setErr := func(e error, recoverable bool) {
  1388  		if e == nil {
  1389  			return
  1390  		}
  1391  		re, ok := e.(*structs.RecoverableError)
  1392  		if ok {
  1393  			// No need to wrap if error is already a RecoverableError
  1394  			reply.Error = re
  1395  		} else {
  1396  			reply.Error = structs.NewRecoverableError(e, recoverable).(*structs.RecoverableError)
  1397  		}
  1398  
  1399  		n.logger.Error("DeriveVaultToken failed", "recoverable", recoverable, "error", e)
  1400  	}
  1401  
  1402  	if done, err := n.srv.forward("Node.DeriveVaultToken", args, args, reply); done {
  1403  		setErr(err, structs.IsRecoverable(err) || err == structs.ErrNoLeader)
  1404  		return nil
  1405  	}
  1406  	defer metrics.MeasureSince([]string{"nomad", "client", "derive_vault_token"}, time.Now())
  1407  
  1408  	// Verify the arguments
  1409  	if args.NodeID == "" {
  1410  		setErr(fmt.Errorf("missing node ID"), false)
  1411  		return nil
  1412  	}
  1413  	if args.SecretID == "" {
  1414  		setErr(fmt.Errorf("missing node SecretID"), false)
  1415  		return nil
  1416  	}
  1417  	if args.AllocID == "" {
  1418  		setErr(fmt.Errorf("missing allocation ID"), false)
  1419  		return nil
  1420  	}
  1421  	if len(args.Tasks) == 0 {
  1422  		setErr(fmt.Errorf("no tasks specified"), false)
  1423  		return nil
  1424  	}
  1425  
  1426  	// Verify the following:
  1427  	// * The Node exists and has the correct SecretID
  1428  	// * The Allocation exists on the specified node
  1429  	// * The allocation contains the given tasks and they each require Vault
  1430  	//   tokens
  1431  	snap, err := n.srv.fsm.State().Snapshot()
  1432  	if err != nil {
  1433  		setErr(err, false)
  1434  		return nil
  1435  	}
  1436  	ws := memdb.NewWatchSet()
  1437  	node, err := snap.NodeByID(ws, args.NodeID)
  1438  	if err != nil {
  1439  		setErr(err, false)
  1440  		return nil
  1441  	}
  1442  	if node == nil {
  1443  		setErr(fmt.Errorf("Node %q does not exist", args.NodeID), false)
  1444  		return nil
  1445  	}
  1446  	if node.SecretID != args.SecretID {
  1447  		setErr(fmt.Errorf("SecretID mismatch"), false)
  1448  		return nil
  1449  	}
  1450  
  1451  	alloc, err := snap.AllocByID(ws, args.AllocID)
  1452  	if err != nil {
  1453  		setErr(err, false)
  1454  		return nil
  1455  	}
  1456  	if alloc == nil {
  1457  		setErr(fmt.Errorf("Allocation %q does not exist", args.AllocID), false)
  1458  		return nil
  1459  	}
  1460  	if alloc.NodeID != args.NodeID {
  1461  		setErr(fmt.Errorf("Allocation %q not running on Node %q", args.AllocID, args.NodeID), false)
  1462  		return nil
  1463  	}
  1464  	if alloc.TerminalStatus() {
  1465  		setErr(fmt.Errorf("Can't request Vault token for terminal allocation"), false)
  1466  		return nil
  1467  	}
  1468  
  1469  	// Check the policies
  1470  	policies := alloc.Job.VaultPolicies()
  1471  	if policies == nil {
  1472  		setErr(fmt.Errorf("Job doesn't require Vault policies"), false)
  1473  		return nil
  1474  	}
  1475  	tg, ok := policies[alloc.TaskGroup]
  1476  	if !ok {
  1477  		setErr(fmt.Errorf("Task group does not require Vault policies"), false)
  1478  		return nil
  1479  	}
  1480  
  1481  	var unneeded []string
  1482  	for _, task := range args.Tasks {
  1483  		taskVault := tg[task]
  1484  		if taskVault == nil || len(taskVault.Policies) == 0 {
  1485  			unneeded = append(unneeded, task)
  1486  		}
  1487  	}
  1488  
  1489  	if len(unneeded) != 0 {
  1490  		e := fmt.Errorf("Requested Vault tokens for tasks without defined Vault policies: %s",
  1491  			strings.Join(unneeded, ", "))
  1492  		setErr(e, false)
  1493  		return nil
  1494  	}
  1495  
  1496  	// At this point the request is valid and we should contact Vault for
  1497  	// tokens.
  1498  
  1499  	// Create an error group where we will spin up a fixed set of goroutines to
  1500  	// handle deriving tokens but where if any fails the whole group is
  1501  	// canceled.
  1502  	g, ctx := errgroup.WithContext(context.Background())
  1503  
  1504  	// Cap the handlers
  1505  	handlers := len(args.Tasks)
  1506  	if handlers > maxParallelRequestsPerDerive {
  1507  		handlers = maxParallelRequestsPerDerive
  1508  	}
  1509  
  1510  	// Create the Vault Tokens
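         	// Note: results is filled in by the worker goroutines below. Go maps are
         	// not safe for concurrent writes, so with more than one handler this map
         	// would need a mutex (or per-worker results) to be race-free.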
  1511  	input := make(chan string, handlers)
  1512  	results := make(map[string]*vapi.Secret, len(args.Tasks))
  1513  	for i := 0; i < handlers; i++ {
  1514  		g.Go(func() error {
  1515  			for {
  1516  				select {
  1517  				case task, ok := <-input:
  1518  					if !ok {
  1519  						return nil
  1520  					}
  1521  
  1522  					secret, err := n.srv.vault.CreateToken(ctx, alloc, task)
  1523  					if err != nil {
  1524  						return err
  1525  					}
  1526  
  1527  					results[task] = secret
  1528  				case <-ctx.Done():
  1529  					return nil
  1530  				}
  1531  			}
  1532  		})
  1533  	}
  1534  
  1535  	// Send the input
  1536  	go func() {
  1537  		defer close(input)
  1538  		for _, task := range args.Tasks {
  1539  			select {
  1540  			case <-ctx.Done():
  1541  				return
  1542  			case input <- task:
  1543  			}
  1544  		}
  1545  
  1546  	}()
  1547  
  1548  	// Wait for everything to complete or for an error
  1549  	createErr := g.Wait()
  1550  
  1551  	// Retrieve the results
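         	// Each secret is expected to be response-wrapped, so WrapInfo carries the
         	// one-time wrapping token returned to the client and the wrapped accessor
         	// recorded for later revocation.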
  1552  	accessors := make([]*structs.VaultAccessor, 0, len(results))
  1553  	tokens := make(map[string]string, len(results))
  1554  	for task, secret := range results {
  1555  		w := secret.WrapInfo
  1556  		tokens[task] = w.Token
  1557  		accessor := &structs.VaultAccessor{
  1558  			Accessor:    w.WrappedAccessor,
  1559  			Task:        task,
  1560  			NodeID:      alloc.NodeID,
  1561  			AllocID:     alloc.ID,
  1562  			CreationTTL: w.TTL,
  1563  		}
  1564  
  1565  		accessors = append(accessors, accessor)
  1566  	}
  1567  
   1568  	// If there was an error, revoke the created tokens
  1569  	if createErr != nil {
  1570  		n.logger.Error("Vault token creation for alloc failed", "alloc_id", alloc.ID, "error", createErr)
  1571  
  1572  		if revokeErr := n.srv.vault.RevokeTokens(context.Background(), accessors, false); revokeErr != nil {
  1573  			n.logger.Error("Vault token revocation for alloc failed", "alloc_id", alloc.ID, "error", revokeErr)
  1574  		}
  1575  
  1576  		if rerr, ok := createErr.(*structs.RecoverableError); ok {
  1577  			reply.Error = rerr
  1578  		} else {
  1579  			reply.Error = structs.NewRecoverableError(createErr, false).(*structs.RecoverableError)
  1580  		}
  1581  
  1582  		return nil
  1583  	}
  1584  
  1585  	// Commit to Raft before returning any of the tokens
  1586  	req := structs.VaultAccessorsRequest{Accessors: accessors}
  1587  	_, index, err := n.srv.raftApply(structs.VaultAccessorRegisterRequestType, &req)
  1588  	if err != nil {
  1589  		n.logger.Error("registering Vault accessors for alloc failed", "alloc_id", alloc.ID, "error", err)
  1590  
  1591  		// Determine if we can recover from the error
  1592  		retry := false
  1593  		switch err {
  1594  		case raft.ErrNotLeader, raft.ErrLeadershipLost, raft.ErrRaftShutdown, raft.ErrEnqueueTimeout:
  1595  			retry = true
  1596  		}
  1597  
  1598  		setErr(err, retry)
  1599  		return nil
  1600  	}
  1601  
  1602  	reply.Index = index
  1603  	reply.Tasks = tokens
  1604  	n.srv.setQueryMeta(&reply.QueryMeta)
  1605  	return nil
  1606  }
  1607  
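         // EmitEvents is used by client nodes to submit node events, which are
         // committed via Raft so they appear in each node's event history.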
  1608  func (n *Node) EmitEvents(args *structs.EmitNodeEventsRequest, reply *structs.EmitNodeEventsResponse) error {
  1609  	if done, err := n.srv.forward("Node.EmitEvents", args, args, reply); done {
  1610  		return err
  1611  	}
  1612  	defer metrics.MeasureSince([]string{"nomad", "client", "emit_events"}, time.Now())
  1613  
  1614  	if len(args.NodeEvents) == 0 {
  1615  		return fmt.Errorf("no node events given")
  1616  	}
  1617  	for nodeID, events := range args.NodeEvents {
  1618  		if len(events) == 0 {
  1619  			return fmt.Errorf("no node events given for node %q", nodeID)
  1620  		}
  1621  	}
  1622  
  1623  	_, index, err := n.srv.raftApply(structs.UpsertNodeEventsType, args)
  1624  	if err != nil {
  1625  		n.logger.Error("upserting node events failed", "error", err)
  1626  		return err
  1627  	}
  1628  
  1629  	reply.Index = index
  1630  	return nil
  1631  }