github.com/ryanslade/nomad@v0.2.4-0.20160128061903-fc95782f2089/nomad/node_endpoint.go

github.com/ryanslade/nomad@v0.2.4-0.20160128061903-fc95782f2089/nomad/node_endpoint.go (about)

     1  package nomad
     2  
     3  import (
     4  	"fmt"
     5  	"time"
     6  
     7  	"github.com/armon/go-metrics"
     8  	"github.com/hashicorp/go-memdb"
     9  	"github.com/hashicorp/nomad/nomad/structs"
    10  	"github.com/hashicorp/nomad/nomad/watch"
    11  )
    12  
// Node is the endpoint used for client (node) interactions. Its methods reach
// server facilities such as request forwarding, Raft, the state store and the
// heartbeat timers through srv.
type Node struct {
	srv *Server
}
    17  
    18  // Register is used to upsert a client that is available for scheduling
    19  func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUpdateResponse) error {
    20  	if done, err := n.srv.forward("Node.Register", args, args, reply); done {
    21  		return err
    22  	}
    23  	defer metrics.MeasureSince([]string{"nomad", "client", "register"}, time.Now())
    24  
    25  	// Validate the arguments
    26  	if args.Node == nil {
    27  		return fmt.Errorf("missing node for client registration")
    28  	}
    29  	if args.Node.ID == "" {
    30  		return fmt.Errorf("missing node ID for client registration")
    31  	}
    32  	if args.Node.Datacenter == "" {
    33  		return fmt.Errorf("missing datacenter for client registration")
    34  	}
    35  	if args.Node.Name == "" {
    36  		return fmt.Errorf("missing node name for client registration")
    37  	}
    38  
    39  	// Default the status if none is given
    40  	if args.Node.Status == "" {
    41  		args.Node.Status = structs.NodeStatusInit
    42  	}
    43  	if !structs.ValidNodeStatus(args.Node.Status) {
    44  		return fmt.Errorf("invalid status for node")
    45  	}
    46  
    47  	// Compute the node class
    48  	if err := args.Node.ComputeClass(); err != nil {
    49  		return fmt.Errorf("failed to computed node class: %v", err)
    50  	}
    51  
    52  	// Commit this update via Raft
    53  	_, index, err := n.srv.raftApply(structs.NodeRegisterRequestType, args)
    54  	if err != nil {
    55  		n.srv.logger.Printf("[ERR] nomad.client: Register failed: %v", err)
    56  		return err
    57  	}
    58  	reply.NodeModifyIndex = index
    59  
    60  	// Check if we should trigger evaluations
    61  	if structs.ShouldDrainNode(args.Node.Status) {
    62  		evalIDs, evalIndex, err := n.createNodeEvals(args.Node.ID, index)
    63  		if err != nil {
    64  			n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err)
    65  			return err
    66  		}
    67  		reply.EvalIDs = evalIDs
    68  		reply.EvalCreateIndex = evalIndex
    69  	}
    70  
    71  	// Check if we need to setup a heartbeat
    72  	if !args.Node.TerminalStatus() {
    73  		ttl, err := n.srv.resetHeartbeatTimer(args.Node.ID)
    74  		if err != nil {
    75  			n.srv.logger.Printf("[ERR] nomad.client: heartbeat reset failed: %v", err)
    76  			return err
    77  		}
    78  		reply.HeartbeatTTL = ttl
    79  	}
    80  
    81  	// Set the reply index
    82  	reply.Index = index
    83  	return nil
    84  }
    85  
    86  // Deregister is used to remove a client from the client. If a client should
    87  // just be made unavailable for scheduling, a status update is prefered.
    88  func (n *Node) Deregister(args *structs.NodeDeregisterRequest, reply *structs.NodeUpdateResponse) error {
    89  	if done, err := n.srv.forward("Node.Deregister", args, args, reply); done {
    90  		return err
    91  	}
    92  	defer metrics.MeasureSince([]string{"nomad", "client", "deregister"}, time.Now())
    93  
    94  	// Verify the arguments
    95  	if args.NodeID == "" {
    96  		return fmt.Errorf("missing node ID for client deregistration")
    97  	}
    98  
    99  	// Commit this update via Raft
   100  	_, index, err := n.srv.raftApply(structs.NodeDeregisterRequestType, args)
   101  	if err != nil {
   102  		n.srv.logger.Printf("[ERR] nomad.client: Deregister failed: %v", err)
   103  		return err
   104  	}
   105  
   106  	// Clear the heartbeat timer if any
   107  	n.srv.clearHeartbeatTimer(args.NodeID)
   108  
   109  	// Create the evaluations for this node
   110  	evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index)
   111  	if err != nil {
   112  		n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err)
   113  		return err
   114  	}
   115  
   116  	// Setup the reply
   117  	reply.EvalIDs = evalIDs
   118  	reply.EvalCreateIndex = evalIndex
   119  	reply.NodeModifyIndex = index
   120  	reply.Index = index
   121  	return nil
   122  }
   123  
   124  // UpdateStatus is used to update the status of a client node
   125  func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *structs.NodeUpdateResponse) error {
   126  	if done, err := n.srv.forward("Node.UpdateStatus", args, args, reply); done {
   127  		return err
   128  	}
   129  	defer metrics.MeasureSince([]string{"nomad", "client", "update_status"}, time.Now())
   130  
   131  	// Verify the arguments
   132  	if args.NodeID == "" {
   133  		return fmt.Errorf("missing node ID for client deregistration")
   134  	}
   135  	if !structs.ValidNodeStatus(args.Status) {
   136  		return fmt.Errorf("invalid status for node")
   137  	}
   138  
   139  	// Look for the node
   140  	snap, err := n.srv.fsm.State().Snapshot()
   141  	if err != nil {
   142  		return err
   143  	}
   144  	node, err := snap.NodeByID(args.NodeID)
   145  	if err != nil {
   146  		return err
   147  	}
   148  	if node == nil {
   149  		return fmt.Errorf("node not found")
   150  	}
   151  
   152  	// Commit this update via Raft
   153  	var index uint64
   154  	if node.Status != args.Status {
   155  		_, index, err = n.srv.raftApply(structs.NodeUpdateStatusRequestType, args)
   156  		if err != nil {
   157  			n.srv.logger.Printf("[ERR] nomad.client: status update failed: %v", err)
   158  			return err
   159  		}
   160  		reply.NodeModifyIndex = index
   161  	}
   162  
   163  	// Check if we should trigger evaluations
   164  	initToReady := node.Status == structs.NodeStatusInit && args.Status == structs.NodeStatusReady
   165  	terminalToReady := node.Status == structs.NodeStatusDown && args.Status == structs.NodeStatusReady
   166  	transitionToReady := initToReady || terminalToReady
   167  	if structs.ShouldDrainNode(args.Status) || transitionToReady {
   168  		evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index)
   169  		if err != nil {
   170  			n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err)
   171  			return err
   172  		}
   173  		reply.EvalIDs = evalIDs
   174  		reply.EvalCreateIndex = evalIndex
   175  	}
   176  
   177  	// Check if we need to setup a heartbeat
   178  	if args.Status != structs.NodeStatusDown {
   179  		ttl, err := n.srv.resetHeartbeatTimer(args.NodeID)
   180  		if err != nil {
   181  			n.srv.logger.Printf("[ERR] nomad.client: heartbeat reset failed: %v", err)
   182  			return err
   183  		}
   184  		reply.HeartbeatTTL = ttl
   185  	}
   186  
   187  	// Set the reply index
   188  	reply.Index = index
   189  	return nil
   190  }
   191  
   192  // UpdateDrain is used to update the drain mode of a client node
   193  func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest,
   194  	reply *structs.NodeDrainUpdateResponse) error {
   195  	if done, err := n.srv.forward("Node.UpdateDrain", args, args, reply); done {
   196  		return err
   197  	}
   198  	defer metrics.MeasureSince([]string{"nomad", "client", "update_drain"}, time.Now())
   199  
   200  	// Verify the arguments
   201  	if args.NodeID == "" {
   202  		return fmt.Errorf("missing node ID for drain update")
   203  	}
   204  
   205  	// Look for the node
   206  	snap, err := n.srv.fsm.State().Snapshot()
   207  	if err != nil {
   208  		return err
   209  	}
   210  	node, err := snap.NodeByID(args.NodeID)
   211  	if err != nil {
   212  		return err
   213  	}
   214  	if node == nil {
   215  		return fmt.Errorf("node not found")
   216  	}
   217  
   218  	// Commit this update via Raft
   219  	var index uint64
   220  	if node.Drain != args.Drain {
   221  		_, index, err = n.srv.raftApply(structs.NodeUpdateDrainRequestType, args)
   222  		if err != nil {
   223  			n.srv.logger.Printf("[ERR] nomad.client: drain update failed: %v", err)
   224  			return err
   225  		}
   226  		reply.NodeModifyIndex = index
   227  	}
   228  
   229  	// Check if we should trigger evaluations
   230  	if args.Drain {
   231  		evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index)
   232  		if err != nil {
   233  			n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err)
   234  			return err
   235  		}
   236  		reply.EvalIDs = evalIDs
   237  		reply.EvalCreateIndex = evalIndex
   238  	}
   239  
   240  	// Set the reply index
   241  	reply.Index = index
   242  	return nil
   243  }
   244  
   245  // Evaluate is used to force a re-evaluation of the node
   246  func (n *Node) Evaluate(args *structs.NodeEvaluateRequest, reply *structs.NodeUpdateResponse) error {
   247  	if done, err := n.srv.forward("Node.Evaluate", args, args, reply); done {
   248  		return err
   249  	}
   250  	defer metrics.MeasureSince([]string{"nomad", "client", "evaluate"}, time.Now())
   251  
   252  	// Verify the arguments
   253  	if args.NodeID == "" {
   254  		return fmt.Errorf("missing node ID for evaluation")
   255  	}
   256  
   257  	// Look for the node
   258  	snap, err := n.srv.fsm.State().Snapshot()
   259  	if err != nil {
   260  		return err
   261  	}
   262  	node, err := snap.NodeByID(args.NodeID)
   263  	if err != nil {
   264  		return err
   265  	}
   266  	if node == nil {
   267  		return fmt.Errorf("node not found")
   268  	}
   269  
   270  	// Create the evaluation
   271  	evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, node.ModifyIndex)
   272  	if err != nil {
   273  		n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err)
   274  		return err
   275  	}
   276  	reply.EvalIDs = evalIDs
   277  	reply.EvalCreateIndex = evalIndex
   278  
   279  	// Set the reply index
   280  	reply.Index = evalIndex
   281  	return nil
   282  }
   283  
   284  // GetNode is used to request information about a specific node
   285  func (n *Node) GetNode(args *structs.NodeSpecificRequest,
   286  	reply *structs.SingleNodeResponse) error {
   287  	if done, err := n.srv.forward("Node.GetNode", args, args, reply); done {
   288  		return err
   289  	}
   290  	defer metrics.MeasureSince([]string{"nomad", "client", "get_node"}, time.Now())
   291  
   292  	// Setup the blocking query
   293  	opts := blockingOptions{
   294  		queryOpts: &args.QueryOptions,
   295  		queryMeta: &reply.QueryMeta,
   296  		watch:     watch.NewItems(watch.Item{Node: args.NodeID}),
   297  		run: func() error {
   298  			// Verify the arguments
   299  			if args.NodeID == "" {
   300  				return fmt.Errorf("missing node ID")
   301  			}
   302  
   303  			// Look for the node
   304  			snap, err := n.srv.fsm.State().Snapshot()
   305  			if err != nil {
   306  				return err
   307  			}
   308  			out, err := snap.NodeByID(args.NodeID)
   309  			if err != nil {
   310  				return err
   311  			}
   312  
   313  			// Setup the output
   314  			reply.Node = out
   315  			if out != nil {
   316  				reply.Index = out.ModifyIndex
   317  			} else {
   318  				// Use the last index that affected the nodes table
   319  				index, err := snap.Index("nodes")
   320  				if err != nil {
   321  					return err
   322  				}
   323  				reply.Index = index
   324  			}
   325  
   326  			// Set the query response
   327  			n.srv.setQueryMeta(&reply.QueryMeta)
   328  			return nil
   329  		}}
   330  	return n.srv.blockingRPC(&opts)
   331  }
   332  
   333  // GetAllocs is used to request allocations for a specific node
   334  func (n *Node) GetAllocs(args *structs.NodeSpecificRequest,
   335  	reply *structs.NodeAllocsResponse) error {
   336  	if done, err := n.srv.forward("Node.GetAllocs", args, args, reply); done {
   337  		return err
   338  	}
   339  	defer metrics.MeasureSince([]string{"nomad", "client", "get_allocs"}, time.Now())
   340  
   341  	// Verify the arguments
   342  	if args.NodeID == "" {
   343  		return fmt.Errorf("missing node ID")
   344  	}
   345  
   346  	// Setup the blocking query
   347  	opts := blockingOptions{
   348  		queryOpts: &args.QueryOptions,
   349  		queryMeta: &reply.QueryMeta,
   350  		watch:     watch.NewItems(watch.Item{AllocNode: args.NodeID}),
   351  		run: func() error {
   352  			// Look for the node
   353  			snap, err := n.srv.fsm.State().Snapshot()
   354  			if err != nil {
   355  				return err
   356  			}
   357  			allocs, err := snap.AllocsByNode(args.NodeID)
   358  			if err != nil {
   359  				return err
   360  			}
   361  
   362  			// Setup the output
   363  			if len(allocs) != 0 {
   364  				reply.Allocs = allocs
   365  				for _, alloc := range allocs {
   366  					reply.Index = maxUint64(reply.Index, alloc.ModifyIndex)
   367  				}
   368  			} else {
   369  				reply.Allocs = nil
   370  
   371  				// Use the last index that affected the nodes table
   372  				index, err := snap.Index("allocs")
   373  				if err != nil {
   374  					return err
   375  				}
   376  
   377  				// Must provide non-zero index to prevent blocking
   378  				// Index 1 is impossible anyways (due to Raft internals)
   379  				if index == 0 {
   380  					reply.Index = 1
   381  				} else {
   382  					reply.Index = index
   383  				}
   384  			}
   385  			return nil
   386  		}}
   387  	return n.srv.blockingRPC(&opts)
   388  }
   389  
   390  // UpdateAlloc is used to update the client status of an allocation
   391  func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.GenericResponse) error {
   392  	if done, err := n.srv.forward("Node.UpdateAlloc", args, args, reply); done {
   393  		return err
   394  	}
   395  	defer metrics.MeasureSince([]string{"nomad", "client", "update_alloc"}, time.Now())
   396  
   397  	// Ensure only a single alloc
   398  	if len(args.Alloc) != 1 {
   399  		return fmt.Errorf("must update a single allocation")
   400  	}
   401  
   402  	// Commit this update via Raft
   403  	_, index, err := n.srv.raftApply(structs.AllocClientUpdateRequestType, args)
   404  	if err != nil {
   405  		n.srv.logger.Printf("[ERR] nomad.client: alloc update failed: %v", err)
   406  		return err
   407  	}
   408  
   409  	// Setup the response
   410  	reply.Index = index
   411  	return nil
   412  }
   413  
   414  // List is used to list the available nodes
   415  func (n *Node) List(args *structs.NodeListRequest,
   416  	reply *structs.NodeListResponse) error {
   417  	if done, err := n.srv.forward("Node.List", args, args, reply); done {
   418  		return err
   419  	}
   420  	defer metrics.MeasureSince([]string{"nomad", "client", "list"}, time.Now())
   421  
   422  	// Setup the blocking query
   423  	opts := blockingOptions{
   424  		queryOpts: &args.QueryOptions,
   425  		queryMeta: &reply.QueryMeta,
   426  		watch:     watch.NewItems(watch.Item{Table: "nodes"}),
   427  		run: func() error {
   428  			// Capture all the nodes
   429  			snap, err := n.srv.fsm.State().Snapshot()
   430  			if err != nil {
   431  				return err
   432  			}
   433  			var iter memdb.ResultIterator
   434  			if prefix := args.QueryOptions.Prefix; prefix != "" {
   435  				iter, err = snap.NodesByIDPrefix(prefix)
   436  			} else {
   437  				iter, err = snap.Nodes()
   438  			}
   439  			if err != nil {
   440  				return err
   441  			}
   442  
   443  			var nodes []*structs.NodeListStub
   444  			for {
   445  				raw := iter.Next()
   446  				if raw == nil {
   447  					break
   448  				}
   449  				node := raw.(*structs.Node)
   450  				nodes = append(nodes, node.Stub())
   451  			}
   452  			reply.Nodes = nodes
   453  
   454  			// Use the last index that affected the jobs table
   455  			index, err := snap.Index("nodes")
   456  			if err != nil {
   457  				return err
   458  			}
   459  			reply.Index = index
   460  
   461  			// Set the query response
   462  			n.srv.setQueryMeta(&reply.QueryMeta)
   463  			return nil
   464  		}}
   465  	return n.srv.blockingRPC(&opts)
   466  }
   467  
   468  // createNodeEvals is used to create evaluations for each alloc on a node.
   469  // Each Eval is scoped to a job, so we need to potentially trigger many evals.
   470  func (n *Node) createNodeEvals(nodeID string, nodeIndex uint64) ([]string, uint64, error) {
   471  	// Snapshot the state
   472  	snap, err := n.srv.fsm.State().Snapshot()
   473  	if err != nil {
   474  		return nil, 0, fmt.Errorf("failed to snapshot state: %v", err)
   475  	}
   476  
   477  	// Find all the allocations for this node
   478  	allocs, err := snap.AllocsByNode(nodeID)
   479  	if err != nil {
   480  		return nil, 0, fmt.Errorf("failed to find allocs for '%s': %v", nodeID, err)
   481  	}
   482  
   483  	sysJobsIter, err := snap.JobsByScheduler("system")
   484  	if err != nil {
   485  		return nil, 0, fmt.Errorf("failed to find system jobs for '%s': %v", nodeID, err)
   486  	}
   487  
   488  	var sysJobs []*structs.Job
   489  	for job := sysJobsIter.Next(); job != nil; job = sysJobsIter.Next() {
   490  		sysJobs = append(sysJobs, job.(*structs.Job))
   491  	}
   492  
   493  	// Fast-path if nothing to do
   494  	if len(allocs) == 0 && len(sysJobs) == 0 {
   495  		return nil, 0, nil
   496  	}
   497  
   498  	// Create an eval for each JobID affected
   499  	var evals []*structs.Evaluation
   500  	var evalIDs []string
   501  	jobIDs := make(map[string]struct{})
   502  
   503  	for _, alloc := range allocs {
   504  		// Deduplicate on JobID
   505  		if _, ok := jobIDs[alloc.JobID]; ok {
   506  			continue
   507  		}
   508  		jobIDs[alloc.JobID] = struct{}{}
   509  
   510  		// Create a new eval
   511  		eval := &structs.Evaluation{
   512  			ID:              structs.GenerateUUID(),
   513  			Priority:        alloc.Job.Priority,
   514  			Type:            alloc.Job.Type,
   515  			TriggeredBy:     structs.EvalTriggerNodeUpdate,
   516  			JobID:           alloc.JobID,
   517  			NodeID:          nodeID,
   518  			NodeModifyIndex: nodeIndex,
   519  			Status:          structs.EvalStatusPending,
   520  		}
   521  		evals = append(evals, eval)
   522  		evalIDs = append(evalIDs, eval.ID)
   523  	}
   524  
   525  	// Create an evaluation for each system job.
   526  	for _, job := range sysJobs {
   527  		// Still dedup on JobID as the node may already have the system job.
   528  		if _, ok := jobIDs[job.ID]; ok {
   529  			continue
   530  		}
   531  		jobIDs[job.ID] = struct{}{}
   532  
   533  		// Create a new eval
   534  		eval := &structs.Evaluation{
   535  			ID:              structs.GenerateUUID(),
   536  			Priority:        job.Priority,
   537  			Type:            job.Type,
   538  			TriggeredBy:     structs.EvalTriggerNodeUpdate,
   539  			JobID:           job.ID,
   540  			NodeID:          nodeID,
   541  			NodeModifyIndex: nodeIndex,
   542  			Status:          structs.EvalStatusPending,
   543  		}
   544  		evals = append(evals, eval)
   545  		evalIDs = append(evalIDs, eval.ID)
   546  	}
   547  
   548  	// Create the Raft transaction
   549  	update := &structs.EvalUpdateRequest{
   550  		Evals:        evals,
   551  		WriteRequest: structs.WriteRequest{Region: n.srv.config.Region},
   552  	}
   553  
   554  	// Commit this evaluation via Raft
   555  	// XXX: There is a risk of partial failure where the node update succeeds
   556  	// but that the EvalUpdate does not.
   557  	_, evalIndex, err := n.srv.raftApply(structs.EvalUpdateRequestType, update)
   558  	if err != nil {
   559  		return nil, 0, err
   560  	}
   561  	return evalIDs, evalIndex, nil
   562  }