github.com/kardianos/nomad@v0.1.3-0.20151022182107-b13df73ee850/nomad/node_endpoint.go (about)

     1  package nomad
     2  
     3  import (
     4  	"fmt"
     5  	"time"
     6  
     7  	"github.com/armon/go-metrics"
     8  	"github.com/hashicorp/nomad/nomad/structs"
     9  )
    10  
// Node is the RPC endpoint used for client (node) interactions. Its methods
// reach the server's request forwarding, Raft apply, state store and logger
// through srv.
type Node struct {
	srv *Server // server that backs every method on this endpoint
}
    15  
    16  // Register is used to upsert a client that is available for scheduling
    17  func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUpdateResponse) error {
    18  	if done, err := n.srv.forward("Node.Register", args, args, reply); done {
    19  		return err
    20  	}
    21  	defer metrics.MeasureSince([]string{"nomad", "client", "register"}, time.Now())
    22  
    23  	// Validate the arguments
    24  	if args.Node == nil {
    25  		return fmt.Errorf("missing node for client registration")
    26  	}
    27  	if args.Node.ID == "" {
    28  		return fmt.Errorf("missing node ID for client registration")
    29  	}
    30  	if args.Node.Datacenter == "" {
    31  		return fmt.Errorf("missing datacenter for client registration")
    32  	}
    33  	if args.Node.Name == "" {
    34  		return fmt.Errorf("missing node name for client registration")
    35  	}
    36  
    37  	// Default the status if none is given
    38  	if args.Node.Status == "" {
    39  		args.Node.Status = structs.NodeStatusInit
    40  	}
    41  	if !structs.ValidNodeStatus(args.Node.Status) {
    42  		return fmt.Errorf("invalid status for node")
    43  	}
    44  
    45  	// Commit this update via Raft
    46  	_, index, err := n.srv.raftApply(structs.NodeRegisterRequestType, args)
    47  	if err != nil {
    48  		n.srv.logger.Printf("[ERR] nomad.client: Register failed: %v", err)
    49  		return err
    50  	}
    51  	reply.NodeModifyIndex = index
    52  
    53  	// Check if we should trigger evaluations
    54  	if structs.ShouldDrainNode(args.Node.Status) {
    55  		evalIDs, evalIndex, err := n.createNodeEvals(args.Node.ID, index)
    56  		if err != nil {
    57  			n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err)
    58  			return err
    59  		}
    60  		reply.EvalIDs = evalIDs
    61  		reply.EvalCreateIndex = evalIndex
    62  	}
    63  
    64  	// Check if we need to setup a heartbeat
    65  	if !args.Node.TerminalStatus() {
    66  		ttl, err := n.srv.resetHeartbeatTimer(args.Node.ID)
    67  		if err != nil {
    68  			n.srv.logger.Printf("[ERR] nomad.client: heartbeat reset failed: %v", err)
    69  			return err
    70  		}
    71  		reply.HeartbeatTTL = ttl
    72  	}
    73  
    74  	// Set the reply index
    75  	reply.Index = index
    76  	return nil
    77  }
    78  
    79  // Deregister is used to remove a client from the client. If a client should
    80  // just be made unavailable for scheduling, a status update is prefered.
    81  func (n *Node) Deregister(args *structs.NodeDeregisterRequest, reply *structs.NodeUpdateResponse) error {
    82  	if done, err := n.srv.forward("Node.Deregister", args, args, reply); done {
    83  		return err
    84  	}
    85  	defer metrics.MeasureSince([]string{"nomad", "client", "deregister"}, time.Now())
    86  
    87  	// Verify the arguments
    88  	if args.NodeID == "" {
    89  		return fmt.Errorf("missing node ID for client deregistration")
    90  	}
    91  
    92  	// Commit this update via Raft
    93  	_, index, err := n.srv.raftApply(structs.NodeDeregisterRequestType, args)
    94  	if err != nil {
    95  		n.srv.logger.Printf("[ERR] nomad.client: Deregister failed: %v", err)
    96  		return err
    97  	}
    98  
    99  	// Clear the heartbeat timer if any
   100  	n.srv.clearHeartbeatTimer(args.NodeID)
   101  
   102  	// Create the evaluations for this node
   103  	evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index)
   104  	if err != nil {
   105  		n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err)
   106  		return err
   107  	}
   108  
   109  	// Setup the reply
   110  	reply.EvalIDs = evalIDs
   111  	reply.EvalCreateIndex = evalIndex
   112  	reply.NodeModifyIndex = index
   113  	reply.Index = index
   114  	return nil
   115  }
   116  
   117  // UpdateStatus is used to update the status of a client node
   118  func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *structs.NodeUpdateResponse) error {
   119  	if done, err := n.srv.forward("Node.UpdateStatus", args, args, reply); done {
   120  		return err
   121  	}
   122  	defer metrics.MeasureSince([]string{"nomad", "client", "update_status"}, time.Now())
   123  
   124  	// Verify the arguments
   125  	if args.NodeID == "" {
   126  		return fmt.Errorf("missing node ID for client deregistration")
   127  	}
   128  	if !structs.ValidNodeStatus(args.Status) {
   129  		return fmt.Errorf("invalid status for node")
   130  	}
   131  
   132  	// Look for the node
   133  	snap, err := n.srv.fsm.State().Snapshot()
   134  	if err != nil {
   135  		return err
   136  	}
   137  	node, err := snap.NodeByID(args.NodeID)
   138  	if err != nil {
   139  		return err
   140  	}
   141  	if node == nil {
   142  		return fmt.Errorf("node not found")
   143  	}
   144  
   145  	// Commit this update via Raft
   146  	var index uint64
   147  	if node.Status != args.Status {
   148  		_, index, err = n.srv.raftApply(structs.NodeUpdateStatusRequestType, args)
   149  		if err != nil {
   150  			n.srv.logger.Printf("[ERR] nomad.client: status update failed: %v", err)
   151  			return err
   152  		}
   153  		reply.NodeModifyIndex = index
   154  	}
   155  
   156  	// Check if we should trigger evaluations
   157  	initToReady := node.Status == structs.NodeStatusInit && args.Status == structs.NodeStatusReady
   158  	terminalToReady := node.Status == structs.NodeStatusDown && args.Status == structs.NodeStatusReady
   159  	transitionToReady := initToReady || terminalToReady
   160  	if structs.ShouldDrainNode(args.Status) || transitionToReady {
   161  		evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index)
   162  		if err != nil {
   163  			n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err)
   164  			return err
   165  		}
   166  		reply.EvalIDs = evalIDs
   167  		reply.EvalCreateIndex = evalIndex
   168  	}
   169  
   170  	// Check if we need to setup a heartbeat
   171  	if args.Status != structs.NodeStatusDown {
   172  		ttl, err := n.srv.resetHeartbeatTimer(args.NodeID)
   173  		if err != nil {
   174  			n.srv.logger.Printf("[ERR] nomad.client: heartbeat reset failed: %v", err)
   175  			return err
   176  		}
   177  		reply.HeartbeatTTL = ttl
   178  	}
   179  
   180  	// Set the reply index
   181  	reply.Index = index
   182  	return nil
   183  }
   184  
   185  // UpdateDrain is used to update the drain mode of a client node
   186  func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest,
   187  	reply *structs.NodeDrainUpdateResponse) error {
   188  	if done, err := n.srv.forward("Node.UpdateDrain", args, args, reply); done {
   189  		return err
   190  	}
   191  	defer metrics.MeasureSince([]string{"nomad", "client", "update_drain"}, time.Now())
   192  
   193  	// Verify the arguments
   194  	if args.NodeID == "" {
   195  		return fmt.Errorf("missing node ID for drain update")
   196  	}
   197  
   198  	// Look for the node
   199  	snap, err := n.srv.fsm.State().Snapshot()
   200  	if err != nil {
   201  		return err
   202  	}
   203  	node, err := snap.NodeByID(args.NodeID)
   204  	if err != nil {
   205  		return err
   206  	}
   207  	if node == nil {
   208  		return fmt.Errorf("node not found")
   209  	}
   210  
   211  	// Commit this update via Raft
   212  	var index uint64
   213  	if node.Drain != args.Drain {
   214  		_, index, err = n.srv.raftApply(structs.NodeUpdateDrainRequestType, args)
   215  		if err != nil {
   216  			n.srv.logger.Printf("[ERR] nomad.client: drain update failed: %v", err)
   217  			return err
   218  		}
   219  		reply.NodeModifyIndex = index
   220  	}
   221  
   222  	// Check if we should trigger evaluations
   223  	if args.Drain {
   224  		evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index)
   225  		if err != nil {
   226  			n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err)
   227  			return err
   228  		}
   229  		reply.EvalIDs = evalIDs
   230  		reply.EvalCreateIndex = evalIndex
   231  	}
   232  
   233  	// Set the reply index
   234  	reply.Index = index
   235  	return nil
   236  }
   237  
   238  // Evaluate is used to force a re-evaluation of the node
   239  func (n *Node) Evaluate(args *structs.NodeEvaluateRequest, reply *structs.NodeUpdateResponse) error {
   240  	if done, err := n.srv.forward("Node.Evaluate", args, args, reply); done {
   241  		return err
   242  	}
   243  	defer metrics.MeasureSince([]string{"nomad", "client", "evaluate"}, time.Now())
   244  
   245  	// Verify the arguments
   246  	if args.NodeID == "" {
   247  		return fmt.Errorf("missing node ID for evaluation")
   248  	}
   249  
   250  	// Look for the node
   251  	snap, err := n.srv.fsm.State().Snapshot()
   252  	if err != nil {
   253  		return err
   254  	}
   255  	node, err := snap.NodeByID(args.NodeID)
   256  	if err != nil {
   257  		return err
   258  	}
   259  	if node == nil {
   260  		return fmt.Errorf("node not found")
   261  	}
   262  
   263  	// Create the evaluation
   264  	evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, node.ModifyIndex)
   265  	if err != nil {
   266  		n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err)
   267  		return err
   268  	}
   269  	reply.EvalIDs = evalIDs
   270  	reply.EvalCreateIndex = evalIndex
   271  
   272  	// Set the reply index
   273  	reply.Index = evalIndex
   274  	return nil
   275  }
   276  
   277  // GetNode is used to request information about a specific node
   278  func (n *Node) GetNode(args *structs.NodeSpecificRequest,
   279  	reply *structs.SingleNodeResponse) error {
   280  	if done, err := n.srv.forward("Node.GetNode", args, args, reply); done {
   281  		return err
   282  	}
   283  	defer metrics.MeasureSince([]string{"nomad", "client", "get_node"}, time.Now())
   284  
   285  	// Verify the arguments
   286  	if args.NodeID == "" {
   287  		return fmt.Errorf("missing node ID")
   288  	}
   289  
   290  	// Look for the node
   291  	snap, err := n.srv.fsm.State().Snapshot()
   292  	if err != nil {
   293  		return err
   294  	}
   295  	out, err := snap.NodeByID(args.NodeID)
   296  	if err != nil {
   297  		return err
   298  	}
   299  
   300  	// Setup the output
   301  	if out != nil {
   302  		reply.Node = out
   303  		reply.Index = out.ModifyIndex
   304  	} else {
   305  		// Use the last index that affected the nodes table
   306  		index, err := snap.Index("nodes")
   307  		if err != nil {
   308  			return err
   309  		}
   310  		reply.Index = index
   311  	}
   312  
   313  	// Set the query response
   314  	n.srv.setQueryMeta(&reply.QueryMeta)
   315  	return nil
   316  }
   317  
   318  // GetAllocs is used to request allocations for a specific node
   319  func (n *Node) GetAllocs(args *structs.NodeSpecificRequest,
   320  	reply *structs.NodeAllocsResponse) error {
   321  	if done, err := n.srv.forward("Node.GetAllocs", args, args, reply); done {
   322  		return err
   323  	}
   324  	defer metrics.MeasureSince([]string{"nomad", "client", "get_allocs"}, time.Now())
   325  
   326  	// Verify the arguments
   327  	if args.NodeID == "" {
   328  		return fmt.Errorf("missing node ID")
   329  	}
   330  
   331  	// Setup the blocking query
   332  	opts := blockingOptions{
   333  		queryOpts:  &args.QueryOptions,
   334  		queryMeta:  &reply.QueryMeta,
   335  		allocWatch: args.NodeID,
   336  		run: func() error {
   337  			// Look for the node
   338  			snap, err := n.srv.fsm.State().Snapshot()
   339  			if err != nil {
   340  				return err
   341  			}
   342  			allocs, err := snap.AllocsByNode(args.NodeID)
   343  			if err != nil {
   344  				return err
   345  			}
   346  
   347  			// Setup the output
   348  			if len(allocs) != 0 {
   349  				reply.Allocs = allocs
   350  				for _, alloc := range allocs {
   351  					reply.Index = maxUint64(reply.Index, alloc.ModifyIndex)
   352  				}
   353  			} else {
   354  				reply.Allocs = nil
   355  
   356  				// Use the last index that affected the nodes table
   357  				index, err := snap.Index("allocs")
   358  				if err != nil {
   359  					return err
   360  				}
   361  
   362  				// Must provide non-zero index to prevent blocking
   363  				// Index 1 is impossible anyways (due to Raft internals)
   364  				if index == 0 {
   365  					reply.Index = 1
   366  				} else {
   367  					reply.Index = index
   368  				}
   369  			}
   370  			return nil
   371  		}}
   372  	return n.srv.blockingRPC(&opts)
   373  }
   374  
   375  // UpdateAlloc is used to update the client status of an allocation
   376  func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.GenericResponse) error {
   377  	if done, err := n.srv.forward("Node.UpdateAlloc", args, args, reply); done {
   378  		return err
   379  	}
   380  	defer metrics.MeasureSince([]string{"nomad", "client", "update_alloc"}, time.Now())
   381  
   382  	// Ensure only a single alloc
   383  	if len(args.Alloc) != 1 {
   384  		return fmt.Errorf("must update a single allocation")
   385  	}
   386  
   387  	// Commit this update via Raft
   388  	_, index, err := n.srv.raftApply(structs.AllocClientUpdateRequestType, args)
   389  	if err != nil {
   390  		n.srv.logger.Printf("[ERR] nomad.client: alloc update failed: %v", err)
   391  		return err
   392  	}
   393  
   394  	// Setup the response
   395  	reply.Index = index
   396  	return nil
   397  }
   398  
   399  // List is used to list the available nodes
   400  func (n *Node) List(args *structs.NodeListRequest,
   401  	reply *structs.NodeListResponse) error {
   402  	if done, err := n.srv.forward("Node.List", args, args, reply); done {
   403  		return err
   404  	}
   405  	defer metrics.MeasureSince([]string{"nomad", "client", "list"}, time.Now())
   406  
   407  	// Capture all the nodes
   408  	snap, err := n.srv.fsm.State().Snapshot()
   409  	if err != nil {
   410  		return err
   411  	}
   412  	iter, err := snap.Nodes()
   413  	if err != nil {
   414  		return err
   415  	}
   416  
   417  	for {
   418  		raw := iter.Next()
   419  		if raw == nil {
   420  			break
   421  		}
   422  		node := raw.(*structs.Node)
   423  		reply.Nodes = append(reply.Nodes, node.Stub())
   424  	}
   425  
   426  	// Use the last index that affected the jobs table
   427  	index, err := snap.Index("nodes")
   428  	if err != nil {
   429  		return err
   430  	}
   431  	reply.Index = index
   432  
   433  	// Set the query response
   434  	n.srv.setQueryMeta(&reply.QueryMeta)
   435  	return nil
   436  }
   437  
   438  // createNodeEvals is used to create evaluations for each alloc on a node.
   439  // Each Eval is scoped to a job, so we need to potentially trigger many evals.
   440  func (n *Node) createNodeEvals(nodeID string, nodeIndex uint64) ([]string, uint64, error) {
   441  	// Snapshot the state
   442  	snap, err := n.srv.fsm.State().Snapshot()
   443  	if err != nil {
   444  		return nil, 0, fmt.Errorf("failed to snapshot state: %v", err)
   445  	}
   446  
   447  	// Find all the allocations for this node
   448  	allocs, err := snap.AllocsByNode(nodeID)
   449  	if err != nil {
   450  		return nil, 0, fmt.Errorf("failed to find allocs for '%s': %v", nodeID, err)
   451  	}
   452  
   453  	sysJobsIter, err := snap.JobsByScheduler("system")
   454  	if err != nil {
   455  		return nil, 0, fmt.Errorf("failed to find system jobs for '%s': %v", nodeID, err)
   456  	}
   457  
   458  	var sysJobs []*structs.Job
   459  	for job := sysJobsIter.Next(); job != nil; job = sysJobsIter.Next() {
   460  		sysJobs = append(sysJobs, job.(*structs.Job))
   461  	}
   462  
   463  	// Fast-path if nothing to do
   464  	if len(allocs) == 0 && len(sysJobs) == 0 {
   465  		return nil, 0, nil
   466  	}
   467  
   468  	// Create an eval for each JobID affected
   469  	var evals []*structs.Evaluation
   470  	var evalIDs []string
   471  	jobIDs := make(map[string]struct{})
   472  
   473  	for _, alloc := range allocs {
   474  		// Deduplicate on JobID
   475  		if _, ok := jobIDs[alloc.JobID]; ok {
   476  			continue
   477  		}
   478  		jobIDs[alloc.JobID] = struct{}{}
   479  
   480  		// Create a new eval
   481  		eval := &structs.Evaluation{
   482  			ID:              structs.GenerateUUID(),
   483  			Priority:        alloc.Job.Priority,
   484  			Type:            alloc.Job.Type,
   485  			TriggeredBy:     structs.EvalTriggerNodeUpdate,
   486  			JobID:           alloc.JobID,
   487  			NodeID:          nodeID,
   488  			NodeModifyIndex: nodeIndex,
   489  			Status:          structs.EvalStatusPending,
   490  		}
   491  		evals = append(evals, eval)
   492  		evalIDs = append(evalIDs, eval.ID)
   493  	}
   494  
   495  	// Create an evaluation for each system job.
   496  	for _, job := range sysJobs {
   497  		// Still dedup on JobID as the node may already have the system job.
   498  		if _, ok := jobIDs[job.ID]; ok {
   499  			continue
   500  		}
   501  		jobIDs[job.ID] = struct{}{}
   502  
   503  		// Create a new eval
   504  		eval := &structs.Evaluation{
   505  			ID:              structs.GenerateUUID(),
   506  			Priority:        job.Priority,
   507  			Type:            job.Type,
   508  			TriggeredBy:     structs.EvalTriggerNodeUpdate,
   509  			JobID:           job.ID,
   510  			NodeID:          nodeID,
   511  			NodeModifyIndex: nodeIndex,
   512  			Status:          structs.EvalStatusPending,
   513  		}
   514  		evals = append(evals, eval)
   515  		evalIDs = append(evalIDs, eval.ID)
   516  	}
   517  
   518  	// Create the Raft transaction
   519  	update := &structs.EvalUpdateRequest{
   520  		Evals:        evals,
   521  		WriteRequest: structs.WriteRequest{Region: n.srv.config.Region},
   522  	}
   523  
   524  	// Commit this evaluation via Raft
   525  	// XXX: There is a risk of partial failure where the node update succeeds
   526  	// but that the EvalUpdate does not.
   527  	_, evalIndex, err := n.srv.raftApply(structs.EvalUpdateRequestType, update)
   528  	if err != nil {
   529  		return nil, 0, err
   530  	}
   531  	return evalIDs, evalIndex, nil
   532  }