github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/operator_endpoint.go (about)

     1  package nomad
     2  
     3  import (
     4  	"fmt"
     5  	"net"
     6  
     7  	"github.com/hashicorp/consul/agent/consul/autopilot"
     8  	"github.com/hashicorp/nomad/nomad/structs"
     9  	"github.com/hashicorp/raft"
    10  	"github.com/hashicorp/serf/serf"
    11  )
    12  
// Operator is the RPC endpoint used to perform low-level operator tasks for
// Nomad. The methods defined on it in this file cover reading the Raft
// configuration, removing Raft peers, reading and writing the Autopilot
// configuration, and reporting server health.
type Operator struct {
	// srv is the server whose Raft, Serf, FSM state and autopilot handles
	// the endpoint methods operate on.
	srv *Server
}
    17  
    18  // RaftGetConfiguration is used to retrieve the current Raft configuration.
    19  func (op *Operator) RaftGetConfiguration(args *structs.GenericRequest, reply *structs.RaftConfigurationResponse) error {
    20  	if done, err := op.srv.forward("Operator.RaftGetConfiguration", args, args, reply); done {
    21  		return err
    22  	}
    23  
    24  	// Check management permissions
    25  	if aclObj, err := op.srv.ResolveToken(args.AuthToken); err != nil {
    26  		return err
    27  	} else if aclObj != nil && !aclObj.IsManagement() {
    28  		return structs.ErrPermissionDenied
    29  	}
    30  
    31  	// We can't fetch the leader and the configuration atomically with
    32  	// the current Raft API.
    33  	future := op.srv.raft.GetConfiguration()
    34  	if err := future.Error(); err != nil {
    35  		return err
    36  	}
    37  
    38  	// Index the Nomad information about the servers.
    39  	serverMap := make(map[raft.ServerAddress]serf.Member)
    40  	for _, member := range op.srv.serf.Members() {
    41  		valid, parts := isNomadServer(member)
    42  		if !valid {
    43  			continue
    44  		}
    45  
    46  		addr := (&net.TCPAddr{IP: member.Addr, Port: parts.Port}).String()
    47  		serverMap[raft.ServerAddress(addr)] = member
    48  	}
    49  
    50  	// Fill out the reply.
    51  	leader := op.srv.raft.Leader()
    52  	reply.Index = future.Index()
    53  	for _, server := range future.Configuration().Servers {
    54  		node := "(unknown)"
    55  		raftProtocolVersion := "unknown"
    56  		if member, ok := serverMap[server.Address]; ok {
    57  			node = member.Name
    58  			if raftVsn, ok := member.Tags["raft_vsn"]; ok {
    59  				raftProtocolVersion = raftVsn
    60  			}
    61  		}
    62  
    63  		entry := &structs.RaftServer{
    64  			ID:           server.ID,
    65  			Node:         node,
    66  			Address:      server.Address,
    67  			Leader:       server.Address == leader,
    68  			Voter:        server.Suffrage == raft.Voter,
    69  			RaftProtocol: raftProtocolVersion,
    70  		}
    71  		reply.Servers = append(reply.Servers, entry)
    72  	}
    73  	return nil
    74  }
    75  
    76  // RaftRemovePeerByAddress is used to kick a stale peer (one that it in the Raft
    77  // quorum but no longer known to Serf or the catalog) by address in the form of
    78  // "IP:port". The reply argument is not used, but it required to fulfill the RPC
    79  // interface.
    80  func (op *Operator) RaftRemovePeerByAddress(args *structs.RaftPeerByAddressRequest, reply *struct{}) error {
    81  	if done, err := op.srv.forward("Operator.RaftRemovePeerByAddress", args, args, reply); done {
    82  		return err
    83  	}
    84  
    85  	// Check management permissions
    86  	if aclObj, err := op.srv.ResolveToken(args.AuthToken); err != nil {
    87  		return err
    88  	} else if aclObj != nil && !aclObj.IsManagement() {
    89  		return structs.ErrPermissionDenied
    90  	}
    91  
    92  	// Since this is an operation designed for humans to use, we will return
    93  	// an error if the supplied address isn't among the peers since it's
    94  	// likely they screwed up.
    95  	{
    96  		future := op.srv.raft.GetConfiguration()
    97  		if err := future.Error(); err != nil {
    98  			return err
    99  		}
   100  		for _, s := range future.Configuration().Servers {
   101  			if s.Address == args.Address {
   102  				goto REMOVE
   103  			}
   104  		}
   105  		return fmt.Errorf("address %q was not found in the Raft configuration",
   106  			args.Address)
   107  	}
   108  
   109  REMOVE:
   110  	// The Raft library itself will prevent various forms of foot-shooting,
   111  	// like making a configuration with no voters. Some consideration was
   112  	// given here to adding more checks, but it was decided to make this as
   113  	// low-level and direct as possible. We've got ACL coverage to lock this
   114  	// down, and if you are an operator, it's assumed you know what you are
   115  	// doing if you are calling this. If you remove a peer that's known to
   116  	// Serf, for example, it will come back when the leader does a reconcile
   117  	// pass.
   118  	future := op.srv.raft.RemovePeer(args.Address)
   119  	if err := future.Error(); err != nil {
   120  		op.srv.logger.Printf("[WARN] nomad.operator: Failed to remove Raft peer %q: %v",
   121  			args.Address, err)
   122  		return err
   123  	}
   124  
   125  	op.srv.logger.Printf("[WARN] nomad.operator: Removed Raft peer %q", args.Address)
   126  	return nil
   127  }
   128  
   129  // RaftRemovePeerByID is used to kick a stale peer (one that is in the Raft
   130  // quorum but no longer known to Serf or the catalog) by address in the form of
   131  // "IP:port". The reply argument is not used, but is required to fulfill the RPC
   132  // interface.
   133  func (op *Operator) RaftRemovePeerByID(args *structs.RaftPeerByIDRequest, reply *struct{}) error {
   134  	if done, err := op.srv.forward("Operator.RaftRemovePeerByID", args, args, reply); done {
   135  		return err
   136  	}
   137  
   138  	// Check management permissions
   139  	if aclObj, err := op.srv.ResolveToken(args.AuthToken); err != nil {
   140  		return err
   141  	} else if aclObj != nil && !aclObj.IsManagement() {
   142  		return structs.ErrPermissionDenied
   143  	}
   144  
   145  	// Since this is an operation designed for humans to use, we will return
   146  	// an error if the supplied id isn't among the peers since it's
   147  	// likely they screwed up.
   148  	var address raft.ServerAddress
   149  	{
   150  		future := op.srv.raft.GetConfiguration()
   151  		if err := future.Error(); err != nil {
   152  			return err
   153  		}
   154  		for _, s := range future.Configuration().Servers {
   155  			if s.ID == args.ID {
   156  				address = s.Address
   157  				goto REMOVE
   158  			}
   159  		}
   160  		return fmt.Errorf("id %q was not found in the Raft configuration",
   161  			args.ID)
   162  	}
   163  
   164  REMOVE:
   165  	// The Raft library itself will prevent various forms of foot-shooting,
   166  	// like making a configuration with no voters. Some consideration was
   167  	// given here to adding more checks, but it was decided to make this as
   168  	// low-level and direct as possible. We've got ACL coverage to lock this
   169  	// down, and if you are an operator, it's assumed you know what you are
   170  	// doing if you are calling this. If you remove a peer that's known to
   171  	// Serf, for example, it will come back when the leader does a reconcile
   172  	// pass.
   173  	minRaftProtocol, err := op.srv.autopilot.MinRaftProtocol()
   174  	if err != nil {
   175  		return err
   176  	}
   177  
   178  	var future raft.Future
   179  	if minRaftProtocol >= 2 {
   180  		future = op.srv.raft.RemoveServer(args.ID, 0, 0)
   181  	} else {
   182  		future = op.srv.raft.RemovePeer(address)
   183  	}
   184  	if err := future.Error(); err != nil {
   185  		op.srv.logger.Printf("[WARN] nomad.operator: Failed to remove Raft peer with id %q: %v",
   186  			args.ID, err)
   187  		return err
   188  	}
   189  
   190  	op.srv.logger.Printf("[WARN] nomad.operator: Removed Raft peer with id %q", args.ID)
   191  	return nil
   192  }
   193  
   194  // AutopilotGetConfiguration is used to retrieve the current Autopilot configuration.
   195  func (op *Operator) AutopilotGetConfiguration(args *structs.GenericRequest, reply *structs.AutopilotConfig) error {
   196  	if done, err := op.srv.forward("Operator.AutopilotGetConfiguration", args, args, reply); done {
   197  		return err
   198  	}
   199  
   200  	// This action requires operator read access.
   201  	rule, err := op.srv.ResolveToken(args.AuthToken)
   202  	if err != nil {
   203  		return err
   204  	}
   205  	if rule != nil && !rule.AllowOperatorRead() {
   206  		return structs.ErrPermissionDenied
   207  	}
   208  
   209  	state := op.srv.fsm.State()
   210  	_, config, err := state.AutopilotConfig()
   211  	if err != nil {
   212  		return err
   213  	}
   214  	if config == nil {
   215  		return fmt.Errorf("autopilot config not initialized yet")
   216  	}
   217  
   218  	*reply = *config
   219  
   220  	return nil
   221  }
   222  
   223  // AutopilotSetConfiguration is used to set the current Autopilot configuration.
   224  func (op *Operator) AutopilotSetConfiguration(args *structs.AutopilotSetConfigRequest, reply *bool) error {
   225  	if done, err := op.srv.forward("Operator.AutopilotSetConfiguration", args, args, reply); done {
   226  		return err
   227  	}
   228  
   229  	// This action requires operator write access.
   230  	rule, err := op.srv.ResolveToken(args.AuthToken)
   231  	if err != nil {
   232  		return err
   233  	}
   234  	if rule != nil && !rule.AllowOperatorWrite() {
   235  		return structs.ErrPermissionDenied
   236  	}
   237  
   238  	// Apply the update
   239  	resp, _, err := op.srv.raftApply(structs.AutopilotRequestType, args)
   240  	if err != nil {
   241  		op.srv.logger.Printf("[ERR] nomad.operator: Apply failed: %v", err)
   242  		return err
   243  	}
   244  	if respErr, ok := resp.(error); ok {
   245  		return respErr
   246  	}
   247  
   248  	// Check if the return type is a bool.
   249  	if respBool, ok := resp.(bool); ok {
   250  		*reply = respBool
   251  	}
   252  	return nil
   253  }
   254  
   255  // ServerHealth is used to get the current health of the servers.
   256  func (op *Operator) ServerHealth(args *structs.GenericRequest, reply *autopilot.OperatorHealthReply) error {
   257  	// This must be sent to the leader, so we fix the args since we are
   258  	// re-using a structure where we don't support all the options.
   259  	args.AllowStale = false
   260  	if done, err := op.srv.forward("Operator.ServerHealth", args, args, reply); done {
   261  		return err
   262  	}
   263  
   264  	// This action requires operator read access.
   265  	rule, err := op.srv.ResolveToken(args.AuthToken)
   266  	if err != nil {
   267  		return err
   268  	}
   269  	if rule != nil && !rule.AllowOperatorRead() {
   270  		return structs.ErrPermissionDenied
   271  	}
   272  
   273  	// Exit early if the min Raft version is too low
   274  	minRaftProtocol, err := op.srv.autopilot.MinRaftProtocol()
   275  	if err != nil {
   276  		return fmt.Errorf("error getting server raft protocol versions: %s", err)
   277  	}
   278  	if minRaftProtocol < 3 {
   279  		return fmt.Errorf("all servers must have raft_protocol set to 3 or higher to use this endpoint")
   280  	}
   281  
   282  	*reply = op.srv.autopilot.GetClusterHealth()
   283  
   284  	return nil
   285  }