github.com/huiliang/nomad@v0.2.1-0.20151124023127-7a8b664699ff/nomad/node_endpoint.go

github.com/huiliang/nomad@v0.2.1-0.20151124023127-7a8b664699ff/nomad/node_endpoint.go (about)

     1  package nomad
     2  
     3  import (
     4  	"fmt"
     5  	"time"
     6  
     7  	"github.com/armon/go-metrics"
     8  	"github.com/hashicorp/nomad/nomad/structs"
     9  	"github.com/hashicorp/nomad/nomad/watch"
    10  )
    11  
// Node is the RPC endpoint used for client (node) interactions. Its methods
// reach the rest of the server through srv.
type Node struct {
	srv *Server
}
    16  
    17  // Register is used to upsert a client that is available for scheduling
    18  func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUpdateResponse) error {
    19  	if done, err := n.srv.forward("Node.Register", args, args, reply); done {
    20  		return err
    21  	}
    22  	defer metrics.MeasureSince([]string{"nomad", "client", "register"}, time.Now())
    23  
    24  	// Validate the arguments
    25  	if args.Node == nil {
    26  		return fmt.Errorf("missing node for client registration")
    27  	}
    28  	if args.Node.ID == "" {
    29  		return fmt.Errorf("missing node ID for client registration")
    30  	}
    31  	if args.Node.Datacenter == "" {
    32  		return fmt.Errorf("missing datacenter for client registration")
    33  	}
    34  	if args.Node.Name == "" {
    35  		return fmt.Errorf("missing node name for client registration")
    36  	}
    37  
    38  	// Default the status if none is given
    39  	if args.Node.Status == "" {
    40  		args.Node.Status = structs.NodeStatusInit
    41  	}
    42  	if !structs.ValidNodeStatus(args.Node.Status) {
    43  		return fmt.Errorf("invalid status for node")
    44  	}
    45  
    46  	// Commit this update via Raft
    47  	_, index, err := n.srv.raftApply(structs.NodeRegisterRequestType, args)
    48  	if err != nil {
    49  		n.srv.logger.Printf("[ERR] nomad.client: Register failed: %v", err)
    50  		return err
    51  	}
    52  	reply.NodeModifyIndex = index
    53  
    54  	// Check if we should trigger evaluations
    55  	if structs.ShouldDrainNode(args.Node.Status) {
    56  		evalIDs, evalIndex, err := n.createNodeEvals(args.Node.ID, index)
    57  		if err != nil {
    58  			n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err)
    59  			return err
    60  		}
    61  		reply.EvalIDs = evalIDs
    62  		reply.EvalCreateIndex = evalIndex
    63  	}
    64  
    65  	// Check if we need to setup a heartbeat
    66  	if !args.Node.TerminalStatus() {
    67  		ttl, err := n.srv.resetHeartbeatTimer(args.Node.ID)
    68  		if err != nil {
    69  			n.srv.logger.Printf("[ERR] nomad.client: heartbeat reset failed: %v", err)
    70  			return err
    71  		}
    72  		reply.HeartbeatTTL = ttl
    73  	}
    74  
    75  	// Set the reply index
    76  	reply.Index = index
    77  	return nil
    78  }
    79  
    80  // Deregister is used to remove a client from the client. If a client should
    81  // just be made unavailable for scheduling, a status update is prefered.
    82  func (n *Node) Deregister(args *structs.NodeDeregisterRequest, reply *structs.NodeUpdateResponse) error {
    83  	if done, err := n.srv.forward("Node.Deregister", args, args, reply); done {
    84  		return err
    85  	}
    86  	defer metrics.MeasureSince([]string{"nomad", "client", "deregister"}, time.Now())
    87  
    88  	// Verify the arguments
    89  	if args.NodeID == "" {
    90  		return fmt.Errorf("missing node ID for client deregistration")
    91  	}
    92  
    93  	// Commit this update via Raft
    94  	_, index, err := n.srv.raftApply(structs.NodeDeregisterRequestType, args)
    95  	if err != nil {
    96  		n.srv.logger.Printf("[ERR] nomad.client: Deregister failed: %v", err)
    97  		return err
    98  	}
    99  
   100  	// Clear the heartbeat timer if any
   101  	n.srv.clearHeartbeatTimer(args.NodeID)
   102  
   103  	// Create the evaluations for this node
   104  	evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index)
   105  	if err != nil {
   106  		n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err)
   107  		return err
   108  	}
   109  
   110  	// Setup the reply
   111  	reply.EvalIDs = evalIDs
   112  	reply.EvalCreateIndex = evalIndex
   113  	reply.NodeModifyIndex = index
   114  	reply.Index = index
   115  	return nil
   116  }
   117  
   118  // UpdateStatus is used to update the status of a client node
   119  func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *structs.NodeUpdateResponse) error {
   120  	if done, err := n.srv.forward("Node.UpdateStatus", args, args, reply); done {
   121  		return err
   122  	}
   123  	defer metrics.MeasureSince([]string{"nomad", "client", "update_status"}, time.Now())
   124  
   125  	// Verify the arguments
   126  	if args.NodeID == "" {
   127  		return fmt.Errorf("missing node ID for client deregistration")
   128  	}
   129  	if !structs.ValidNodeStatus(args.Status) {
   130  		return fmt.Errorf("invalid status for node")
   131  	}
   132  
   133  	// Look for the node
   134  	snap, err := n.srv.fsm.State().Snapshot()
   135  	if err != nil {
   136  		return err
   137  	}
   138  	node, err := snap.NodeByID(args.NodeID)
   139  	if err != nil {
   140  		return err
   141  	}
   142  	if node == nil {
   143  		return fmt.Errorf("node not found")
   144  	}
   145  
   146  	// Commit this update via Raft
   147  	var index uint64
   148  	if node.Status != args.Status {
   149  		_, index, err = n.srv.raftApply(structs.NodeUpdateStatusRequestType, args)
   150  		if err != nil {
   151  			n.srv.logger.Printf("[ERR] nomad.client: status update failed: %v", err)
   152  			return err
   153  		}
   154  		reply.NodeModifyIndex = index
   155  	}
   156  
   157  	// Check if we should trigger evaluations
   158  	initToReady := node.Status == structs.NodeStatusInit && args.Status == structs.NodeStatusReady
   159  	terminalToReady := node.Status == structs.NodeStatusDown && args.Status == structs.NodeStatusReady
   160  	transitionToReady := initToReady || terminalToReady
   161  	if structs.ShouldDrainNode(args.Status) || transitionToReady {
   162  		evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index)
   163  		if err != nil {
   164  			n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err)
   165  			return err
   166  		}
   167  		reply.EvalIDs = evalIDs
   168  		reply.EvalCreateIndex = evalIndex
   169  	}
   170  
   171  	// Check if we need to setup a heartbeat
   172  	if args.Status != structs.NodeStatusDown {
   173  		ttl, err := n.srv.resetHeartbeatTimer(args.NodeID)
   174  		if err != nil {
   175  			n.srv.logger.Printf("[ERR] nomad.client: heartbeat reset failed: %v", err)
   176  			return err
   177  		}
   178  		reply.HeartbeatTTL = ttl
   179  	}
   180  
   181  	// Set the reply index
   182  	reply.Index = index
   183  	return nil
   184  }
   185  
   186  // UpdateDrain is used to update the drain mode of a client node
   187  func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest,
   188  	reply *structs.NodeDrainUpdateResponse) error {
   189  	if done, err := n.srv.forward("Node.UpdateDrain", args, args, reply); done {
   190  		return err
   191  	}
   192  	defer metrics.MeasureSince([]string{"nomad", "client", "update_drain"}, time.Now())
   193  
   194  	// Verify the arguments
   195  	if args.NodeID == "" {
   196  		return fmt.Errorf("missing node ID for drain update")
   197  	}
   198  
   199  	// Look for the node
   200  	snap, err := n.srv.fsm.State().Snapshot()
   201  	if err != nil {
   202  		return err
   203  	}
   204  	node, err := snap.NodeByID(args.NodeID)
   205  	if err != nil {
   206  		return err
   207  	}
   208  	if node == nil {
   209  		return fmt.Errorf("node not found")
   210  	}
   211  
   212  	// Commit this update via Raft
   213  	var index uint64
   214  	if node.Drain != args.Drain {
   215  		_, index, err = n.srv.raftApply(structs.NodeUpdateDrainRequestType, args)
   216  		if err != nil {
   217  			n.srv.logger.Printf("[ERR] nomad.client: drain update failed: %v", err)
   218  			return err
   219  		}
   220  		reply.NodeModifyIndex = index
   221  	}
   222  
   223  	// Check if we should trigger evaluations
   224  	if args.Drain {
   225  		evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index)
   226  		if err != nil {
   227  			n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err)
   228  			return err
   229  		}
   230  		reply.EvalIDs = evalIDs
   231  		reply.EvalCreateIndex = evalIndex
   232  	}
   233  
   234  	// Set the reply index
   235  	reply.Index = index
   236  	return nil
   237  }
   238  
   239  // Evaluate is used to force a re-evaluation of the node
   240  func (n *Node) Evaluate(args *structs.NodeEvaluateRequest, reply *structs.NodeUpdateResponse) error {
   241  	if done, err := n.srv.forward("Node.Evaluate", args, args, reply); done {
   242  		return err
   243  	}
   244  	defer metrics.MeasureSince([]string{"nomad", "client", "evaluate"}, time.Now())
   245  
   246  	// Verify the arguments
   247  	if args.NodeID == "" {
   248  		return fmt.Errorf("missing node ID for evaluation")
   249  	}
   250  
   251  	// Look for the node
   252  	snap, err := n.srv.fsm.State().Snapshot()
   253  	if err != nil {
   254  		return err
   255  	}
   256  	node, err := snap.NodeByID(args.NodeID)
   257  	if err != nil {
   258  		return err
   259  	}
   260  	if node == nil {
   261  		return fmt.Errorf("node not found")
   262  	}
   263  
   264  	// Create the evaluation
   265  	evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, node.ModifyIndex)
   266  	if err != nil {
   267  		n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err)
   268  		return err
   269  	}
   270  	reply.EvalIDs = evalIDs
   271  	reply.EvalCreateIndex = evalIndex
   272  
   273  	// Set the reply index
   274  	reply.Index = evalIndex
   275  	return nil
   276  }
   277  
   278  // GetNode is used to request information about a specific node
   279  func (n *Node) GetNode(args *structs.NodeSpecificRequest,
   280  	reply *structs.SingleNodeResponse) error {
   281  	if done, err := n.srv.forward("Node.GetNode", args, args, reply); done {
   282  		return err
   283  	}
   284  	defer metrics.MeasureSince([]string{"nomad", "client", "get_node"}, time.Now())
   285  
   286  	// Setup the blocking query
   287  	opts := blockingOptions{
   288  		queryOpts: &args.QueryOptions,
   289  		queryMeta: &reply.QueryMeta,
   290  		watch:     watch.NewItems(watch.Item{Node: args.NodeID}),
   291  		run: func() error {
   292  			// Verify the arguments
   293  			if args.NodeID == "" {
   294  				return fmt.Errorf("missing node ID")
   295  			}
   296  
   297  			// Look for the node
   298  			snap, err := n.srv.fsm.State().Snapshot()
   299  			if err != nil {
   300  				return err
   301  			}
   302  			out, err := snap.NodeByID(args.NodeID)
   303  			if err != nil {
   304  				return err
   305  			}
   306  
   307  			// Setup the output
   308  			reply.Node = out
   309  			if out != nil {
   310  				reply.Index = out.ModifyIndex
   311  			} else {
   312  				// Use the last index that affected the nodes table
   313  				index, err := snap.Index("nodes")
   314  				if err != nil {
   315  					return err
   316  				}
   317  				reply.Index = index
   318  			}
   319  
   320  			// Set the query response
   321  			n.srv.setQueryMeta(&reply.QueryMeta)
   322  			return nil
   323  		}}
   324  	return n.srv.blockingRPC(&opts)
   325  }
   326  
   327  // GetAllocs is used to request allocations for a specific node
   328  func (n *Node) GetAllocs(args *structs.NodeSpecificRequest,
   329  	reply *structs.NodeAllocsResponse) error {
   330  	if done, err := n.srv.forward("Node.GetAllocs", args, args, reply); done {
   331  		return err
   332  	}
   333  	defer metrics.MeasureSince([]string{"nomad", "client", "get_allocs"}, time.Now())
   334  
   335  	// Verify the arguments
   336  	if args.NodeID == "" {
   337  		return fmt.Errorf("missing node ID")
   338  	}
   339  
   340  	// Setup the blocking query
   341  	opts := blockingOptions{
   342  		queryOpts: &args.QueryOptions,
   343  		queryMeta: &reply.QueryMeta,
   344  		watch:     watch.NewItems(watch.Item{AllocNode: args.NodeID}),
   345  		run: func() error {
   346  			// Look for the node
   347  			snap, err := n.srv.fsm.State().Snapshot()
   348  			if err != nil {
   349  				return err
   350  			}
   351  			allocs, err := snap.AllocsByNode(args.NodeID)
   352  			if err != nil {
   353  				return err
   354  			}
   355  
   356  			// Setup the output
   357  			if len(allocs) != 0 {
   358  				reply.Allocs = allocs
   359  				for _, alloc := range allocs {
   360  					reply.Index = maxUint64(reply.Index, alloc.ModifyIndex)
   361  				}
   362  			} else {
   363  				reply.Allocs = nil
   364  
   365  				// Use the last index that affected the nodes table
   366  				index, err := snap.Index("allocs")
   367  				if err != nil {
   368  					return err
   369  				}
   370  
   371  				// Must provide non-zero index to prevent blocking
   372  				// Index 1 is impossible anyways (due to Raft internals)
   373  				if index == 0 {
   374  					reply.Index = 1
   375  				} else {
   376  					reply.Index = index
   377  				}
   378  			}
   379  			return nil
   380  		}}
   381  	return n.srv.blockingRPC(&opts)
   382  }
   383  
   384  // UpdateAlloc is used to update the client status of an allocation
   385  func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.GenericResponse) error {
   386  	if done, err := n.srv.forward("Node.UpdateAlloc", args, args, reply); done {
   387  		return err
   388  	}
   389  	defer metrics.MeasureSince([]string{"nomad", "client", "update_alloc"}, time.Now())
   390  
   391  	// Ensure only a single alloc
   392  	if len(args.Alloc) != 1 {
   393  		return fmt.Errorf("must update a single allocation")
   394  	}
   395  
   396  	// Commit this update via Raft
   397  	_, index, err := n.srv.raftApply(structs.AllocClientUpdateRequestType, args)
   398  	if err != nil {
   399  		n.srv.logger.Printf("[ERR] nomad.client: alloc update failed: %v", err)
   400  		return err
   401  	}
   402  
   403  	// Setup the response
   404  	reply.Index = index
   405  	return nil
   406  }
   407  
   408  // List is used to list the available nodes
   409  func (n *Node) List(args *structs.NodeListRequest,
   410  	reply *structs.NodeListResponse) error {
   411  	if done, err := n.srv.forward("Node.List", args, args, reply); done {
   412  		return err
   413  	}
   414  	defer metrics.MeasureSince([]string{"nomad", "client", "list"}, time.Now())
   415  
   416  	// Setup the blocking query
   417  	opts := blockingOptions{
   418  		queryOpts: &args.QueryOptions,
   419  		queryMeta: &reply.QueryMeta,
   420  		watch:     watch.NewItems(watch.Item{Table: "nodes"}),
   421  		run: func() error {
   422  			// Capture all the nodes
   423  			snap, err := n.srv.fsm.State().Snapshot()
   424  			if err != nil {
   425  				return err
   426  			}
   427  			iter, err := snap.Nodes()
   428  			if err != nil {
   429  				return err
   430  			}
   431  
   432  			var nodes []*structs.NodeListStub
   433  			for {
   434  				raw := iter.Next()
   435  				if raw == nil {
   436  					break
   437  				}
   438  				node := raw.(*structs.Node)
   439  				nodes = append(nodes, node.Stub())
   440  			}
   441  			reply.Nodes = nodes
   442  
   443  			// Use the last index that affected the jobs table
   444  			index, err := snap.Index("nodes")
   445  			if err != nil {
   446  				return err
   447  			}
   448  			reply.Index = index
   449  
   450  			// Set the query response
   451  			n.srv.setQueryMeta(&reply.QueryMeta)
   452  			return nil
   453  		}}
   454  	return n.srv.blockingRPC(&opts)
   455  }
   456  
   457  // createNodeEvals is used to create evaluations for each alloc on a node.
   458  // Each Eval is scoped to a job, so we need to potentially trigger many evals.
   459  func (n *Node) createNodeEvals(nodeID string, nodeIndex uint64) ([]string, uint64, error) {
   460  	// Snapshot the state
   461  	snap, err := n.srv.fsm.State().Snapshot()
   462  	if err != nil {
   463  		return nil, 0, fmt.Errorf("failed to snapshot state: %v", err)
   464  	}
   465  
   466  	// Find all the allocations for this node
   467  	allocs, err := snap.AllocsByNode(nodeID)
   468  	if err != nil {
   469  		return nil, 0, fmt.Errorf("failed to find allocs for '%s': %v", nodeID, err)
   470  	}
   471  
   472  	sysJobsIter, err := snap.JobsByScheduler("system")
   473  	if err != nil {
   474  		return nil, 0, fmt.Errorf("failed to find system jobs for '%s': %v", nodeID, err)
   475  	}
   476  
   477  	var sysJobs []*structs.Job
   478  	for job := sysJobsIter.Next(); job != nil; job = sysJobsIter.Next() {
   479  		sysJobs = append(sysJobs, job.(*structs.Job))
   480  	}
   481  
   482  	// Fast-path if nothing to do
   483  	if len(allocs) == 0 && len(sysJobs) == 0 {
   484  		return nil, 0, nil
   485  	}
   486  
   487  	// Create an eval for each JobID affected
   488  	var evals []*structs.Evaluation
   489  	var evalIDs []string
   490  	jobIDs := make(map[string]struct{})
   491  
   492  	for _, alloc := range allocs {
   493  		// Deduplicate on JobID
   494  		if _, ok := jobIDs[alloc.JobID]; ok {
   495  			continue
   496  		}
   497  		jobIDs[alloc.JobID] = struct{}{}
   498  
   499  		// Create a new eval
   500  		eval := &structs.Evaluation{
   501  			ID:              structs.GenerateUUID(),
   502  			Priority:        alloc.Job.Priority,
   503  			Type:            alloc.Job.Type,
   504  			TriggeredBy:     structs.EvalTriggerNodeUpdate,
   505  			JobID:           alloc.JobID,
   506  			NodeID:          nodeID,
   507  			NodeModifyIndex: nodeIndex,
   508  			Status:          structs.EvalStatusPending,
   509  		}
   510  		evals = append(evals, eval)
   511  		evalIDs = append(evalIDs, eval.ID)
   512  	}
   513  
   514  	// Create an evaluation for each system job.
   515  	for _, job := range sysJobs {
   516  		// Still dedup on JobID as the node may already have the system job.
   517  		if _, ok := jobIDs[job.ID]; ok {
   518  			continue
   519  		}
   520  		jobIDs[job.ID] = struct{}{}
   521  
   522  		// Create a new eval
   523  		eval := &structs.Evaluation{
   524  			ID:              structs.GenerateUUID(),
   525  			Priority:        job.Priority,
   526  			Type:            job.Type,
   527  			TriggeredBy:     structs.EvalTriggerNodeUpdate,
   528  			JobID:           job.ID,
   529  			NodeID:          nodeID,
   530  			NodeModifyIndex: nodeIndex,
   531  			Status:          structs.EvalStatusPending,
   532  		}
   533  		evals = append(evals, eval)
   534  		evalIDs = append(evalIDs, eval.ID)
   535  	}
   536  
   537  	// Create the Raft transaction
   538  	update := &structs.EvalUpdateRequest{
   539  		Evals:        evals,
   540  		WriteRequest: structs.WriteRequest{Region: n.srv.config.Region},
   541  	}
   542  
   543  	// Commit this evaluation via Raft
   544  	// XXX: There is a risk of partial failure where the node update succeeds
   545  	// but that the EvalUpdate does not.
   546  	_, evalIndex, err := n.srv.raftApply(structs.EvalUpdateRequestType, update)
   547  	if err != nil {
   548  		return nil, 0, err
   549  	}
   550  	return evalIDs, evalIndex, nil
   551  }