github.com/uchennaokeke444/nomad@v0.11.8/nomad/operator_endpoint.go (about)

     1  package nomad
     2  
     3  import (
     4  	"fmt"
     5  	"net"
     6  
     7  	log "github.com/hashicorp/go-hclog"
     8  
     9  	"github.com/hashicorp/consul/agent/consul/autopilot"
    10  	"github.com/hashicorp/nomad/nomad/structs"
    11  	"github.com/hashicorp/raft"
    12  	"github.com/hashicorp/serf/serf"
    13  )
    14  
// Operator endpoint is used to perform low-level operator tasks for Nomad.
// The methods defined on it in this file cover the Raft configuration and
// peer removal, the Autopilot configuration and server health, and the
// scheduler configuration.
type Operator struct {
	// srv is the server the endpoint methods operate on; they reach its
	// raft, serf, fsm and autopilot members through it.
	srv    *Server
	// logger receives the warnings and errors emitted by the endpoint
	// methods.
	logger log.Logger
}
    20  
    21  // RaftGetConfiguration is used to retrieve the current Raft configuration.
    22  func (op *Operator) RaftGetConfiguration(args *structs.GenericRequest, reply *structs.RaftConfigurationResponse) error {
    23  	if done, err := op.srv.forward("Operator.RaftGetConfiguration", args, args, reply); done {
    24  		return err
    25  	}
    26  
    27  	// Check management permissions
    28  	if aclObj, err := op.srv.ResolveToken(args.AuthToken); err != nil {
    29  		return err
    30  	} else if aclObj != nil && !aclObj.IsManagement() {
    31  		return structs.ErrPermissionDenied
    32  	}
    33  
    34  	// We can't fetch the leader and the configuration atomically with
    35  	// the current Raft API.
    36  	future := op.srv.raft.GetConfiguration()
    37  	if err := future.Error(); err != nil {
    38  		return err
    39  	}
    40  
    41  	// Index the Nomad information about the servers.
    42  	serverMap := make(map[raft.ServerAddress]serf.Member)
    43  	for _, member := range op.srv.serf.Members() {
    44  		valid, parts := isNomadServer(member)
    45  		if !valid {
    46  			continue
    47  		}
    48  
    49  		addr := (&net.TCPAddr{IP: member.Addr, Port: parts.Port}).String()
    50  		serverMap[raft.ServerAddress(addr)] = member
    51  	}
    52  
    53  	// Fill out the reply.
    54  	leader := op.srv.raft.Leader()
    55  	reply.Index = future.Index()
    56  	for _, server := range future.Configuration().Servers {
    57  		node := "(unknown)"
    58  		raftProtocolVersion := "unknown"
    59  		if member, ok := serverMap[server.Address]; ok {
    60  			node = member.Name
    61  			if raftVsn, ok := member.Tags["raft_vsn"]; ok {
    62  				raftProtocolVersion = raftVsn
    63  			}
    64  		}
    65  
    66  		entry := &structs.RaftServer{
    67  			ID:           server.ID,
    68  			Node:         node,
    69  			Address:      server.Address,
    70  			Leader:       server.Address == leader,
    71  			Voter:        server.Suffrage == raft.Voter,
    72  			RaftProtocol: raftProtocolVersion,
    73  		}
    74  		reply.Servers = append(reply.Servers, entry)
    75  	}
    76  	return nil
    77  }
    78  
    79  // RaftRemovePeerByAddress is used to kick a stale peer (one that it in the Raft
    80  // quorum but no longer known to Serf or the catalog) by address in the form of
    81  // "IP:port". The reply argument is not used, but it required to fulfill the RPC
    82  // interface.
    83  func (op *Operator) RaftRemovePeerByAddress(args *structs.RaftPeerByAddressRequest, reply *struct{}) error {
    84  	if done, err := op.srv.forward("Operator.RaftRemovePeerByAddress", args, args, reply); done {
    85  		return err
    86  	}
    87  
    88  	// Check management permissions
    89  	if aclObj, err := op.srv.ResolveToken(args.AuthToken); err != nil {
    90  		return err
    91  	} else if aclObj != nil && !aclObj.IsManagement() {
    92  		return structs.ErrPermissionDenied
    93  	}
    94  
    95  	// Since this is an operation designed for humans to use, we will return
    96  	// an error if the supplied address isn't among the peers since it's
    97  	// likely they screwed up.
    98  	{
    99  		future := op.srv.raft.GetConfiguration()
   100  		if err := future.Error(); err != nil {
   101  			return err
   102  		}
   103  		for _, s := range future.Configuration().Servers {
   104  			if s.Address == args.Address {
   105  				goto REMOVE
   106  			}
   107  		}
   108  		return fmt.Errorf("address %q was not found in the Raft configuration",
   109  			args.Address)
   110  	}
   111  
   112  REMOVE:
   113  	// The Raft library itself will prevent various forms of foot-shooting,
   114  	// like making a configuration with no voters. Some consideration was
   115  	// given here to adding more checks, but it was decided to make this as
   116  	// low-level and direct as possible. We've got ACL coverage to lock this
   117  	// down, and if you are an operator, it's assumed you know what you are
   118  	// doing if you are calling this. If you remove a peer that's known to
   119  	// Serf, for example, it will come back when the leader does a reconcile
   120  	// pass.
   121  	future := op.srv.raft.RemovePeer(args.Address)
   122  	if err := future.Error(); err != nil {
   123  		op.logger.Warn("failed to remove Raft peer", "peer", args.Address, "error", err)
   124  		return err
   125  	}
   126  
   127  	op.logger.Warn("removed Raft peer", "peer", args.Address)
   128  	return nil
   129  }
   130  
   131  // RaftRemovePeerByID is used to kick a stale peer (one that is in the Raft
   132  // quorum but no longer known to Serf or the catalog) by address in the form of
   133  // "IP:port". The reply argument is not used, but is required to fulfill the RPC
   134  // interface.
   135  func (op *Operator) RaftRemovePeerByID(args *structs.RaftPeerByIDRequest, reply *struct{}) error {
   136  	if done, err := op.srv.forward("Operator.RaftRemovePeerByID", args, args, reply); done {
   137  		return err
   138  	}
   139  
   140  	// Check management permissions
   141  	if aclObj, err := op.srv.ResolveToken(args.AuthToken); err != nil {
   142  		return err
   143  	} else if aclObj != nil && !aclObj.IsManagement() {
   144  		return structs.ErrPermissionDenied
   145  	}
   146  
   147  	// Since this is an operation designed for humans to use, we will return
   148  	// an error if the supplied id isn't among the peers since it's
   149  	// likely they screwed up.
   150  	var address raft.ServerAddress
   151  	{
   152  		future := op.srv.raft.GetConfiguration()
   153  		if err := future.Error(); err != nil {
   154  			return err
   155  		}
   156  		for _, s := range future.Configuration().Servers {
   157  			if s.ID == args.ID {
   158  				address = s.Address
   159  				goto REMOVE
   160  			}
   161  		}
   162  		return fmt.Errorf("id %q was not found in the Raft configuration",
   163  			args.ID)
   164  	}
   165  
   166  REMOVE:
   167  	// The Raft library itself will prevent various forms of foot-shooting,
   168  	// like making a configuration with no voters. Some consideration was
   169  	// given here to adding more checks, but it was decided to make this as
   170  	// low-level and direct as possible. We've got ACL coverage to lock this
   171  	// down, and if you are an operator, it's assumed you know what you are
   172  	// doing if you are calling this. If you remove a peer that's known to
   173  	// Serf, for example, it will come back when the leader does a reconcile
   174  	// pass.
   175  	minRaftProtocol, err := op.srv.autopilot.MinRaftProtocol()
   176  	if err != nil {
   177  		return err
   178  	}
   179  
   180  	var future raft.Future
   181  	if minRaftProtocol >= 2 {
   182  		future = op.srv.raft.RemoveServer(args.ID, 0, 0)
   183  	} else {
   184  		future = op.srv.raft.RemovePeer(address)
   185  	}
   186  	if err := future.Error(); err != nil {
   187  		op.logger.Warn("failed to remove Raft peer", "peer_id", args.ID, "error", err)
   188  		return err
   189  	}
   190  
   191  	op.logger.Warn("removed Raft peer", "peer_id", args.ID)
   192  	return nil
   193  }
   194  
   195  // AutopilotGetConfiguration is used to retrieve the current Autopilot configuration.
   196  func (op *Operator) AutopilotGetConfiguration(args *structs.GenericRequest, reply *structs.AutopilotConfig) error {
   197  	if done, err := op.srv.forward("Operator.AutopilotGetConfiguration", args, args, reply); done {
   198  		return err
   199  	}
   200  
   201  	// This action requires operator read access.
   202  	rule, err := op.srv.ResolveToken(args.AuthToken)
   203  	if err != nil {
   204  		return err
   205  	}
   206  	if rule != nil && !rule.AllowOperatorRead() {
   207  		return structs.ErrPermissionDenied
   208  	}
   209  
   210  	state := op.srv.fsm.State()
   211  	_, config, err := state.AutopilotConfig()
   212  	if err != nil {
   213  		return err
   214  	}
   215  	if config == nil {
   216  		return fmt.Errorf("autopilot config not initialized yet")
   217  	}
   218  
   219  	*reply = *config
   220  
   221  	return nil
   222  }
   223  
   224  // AutopilotSetConfiguration is used to set the current Autopilot configuration.
   225  func (op *Operator) AutopilotSetConfiguration(args *structs.AutopilotSetConfigRequest, reply *bool) error {
   226  	if done, err := op.srv.forward("Operator.AutopilotSetConfiguration", args, args, reply); done {
   227  		return err
   228  	}
   229  
   230  	// This action requires operator write access.
   231  	rule, err := op.srv.ResolveToken(args.AuthToken)
   232  	if err != nil {
   233  		return err
   234  	}
   235  	if rule != nil && !rule.AllowOperatorWrite() {
   236  		return structs.ErrPermissionDenied
   237  	}
   238  
   239  	// All servers should be at or above 0.8.0 to apply this operatation
   240  	if !ServersMeetMinimumVersion(op.srv.Members(), minAutopilotVersion, false) {
   241  		return fmt.Errorf("All servers should be running version %v to update autopilot config", minAutopilotVersion)
   242  	}
   243  
   244  	// Apply the update
   245  	resp, _, err := op.srv.raftApply(structs.AutopilotRequestType, args)
   246  	if err != nil {
   247  		op.logger.Error("failed applying AutoPilot configuration", "error", err)
   248  		return err
   249  	}
   250  	if respErr, ok := resp.(error); ok {
   251  		return respErr
   252  	}
   253  
   254  	// Check if the return type is a bool.
   255  	if respBool, ok := resp.(bool); ok {
   256  		*reply = respBool
   257  	}
   258  	return nil
   259  }
   260  
   261  // ServerHealth is used to get the current health of the servers.
   262  func (op *Operator) ServerHealth(args *structs.GenericRequest, reply *autopilot.OperatorHealthReply) error {
   263  	// This must be sent to the leader, so we fix the args since we are
   264  	// re-using a structure where we don't support all the options.
   265  	args.AllowStale = false
   266  	if done, err := op.srv.forward("Operator.ServerHealth", args, args, reply); done {
   267  		return err
   268  	}
   269  
   270  	// This action requires operator read access.
   271  	rule, err := op.srv.ResolveToken(args.AuthToken)
   272  	if err != nil {
   273  		return err
   274  	}
   275  	if rule != nil && !rule.AllowOperatorRead() {
   276  		return structs.ErrPermissionDenied
   277  	}
   278  
   279  	// Exit early if the min Raft version is too low
   280  	minRaftProtocol, err := op.srv.autopilot.MinRaftProtocol()
   281  	if err != nil {
   282  		return fmt.Errorf("error getting server raft protocol versions: %s", err)
   283  	}
   284  	if minRaftProtocol < 3 {
   285  		return fmt.Errorf("all servers must have raft_protocol set to 3 or higher to use this endpoint")
   286  	}
   287  
   288  	*reply = op.srv.autopilot.GetClusterHealth()
   289  
   290  	return nil
   291  }
   292  
   293  // SchedulerSetConfiguration is used to set the current Scheduler configuration.
   294  func (op *Operator) SchedulerSetConfiguration(args *structs.SchedulerSetConfigRequest, reply *structs.SchedulerSetConfigurationResponse) error {
   295  	if done, err := op.srv.forward("Operator.SchedulerSetConfiguration", args, args, reply); done {
   296  		return err
   297  	}
   298  
   299  	// This action requires operator write access.
   300  	rule, err := op.srv.ResolveToken(args.AuthToken)
   301  	if err != nil {
   302  		return err
   303  	} else if rule != nil && !rule.AllowOperatorWrite() {
   304  		return structs.ErrPermissionDenied
   305  	}
   306  
   307  	// All servers should be at or above 0.9.0 to apply this operatation
   308  	if !ServersMeetMinimumVersion(op.srv.Members(), minSchedulerConfigVersion, false) {
   309  		return fmt.Errorf("All servers should be running version %v to update scheduler config", minSchedulerConfigVersion)
   310  	}
   311  	// Apply the update
   312  	resp, index, err := op.srv.raftApply(structs.SchedulerConfigRequestType, args)
   313  	if err != nil {
   314  		op.logger.Error("failed applying Scheduler configuration", "error", err)
   315  		return err
   316  	} else if respErr, ok := resp.(error); ok {
   317  		return respErr
   318  	}
   319  
   320  	// Check if the return type is a bool
   321  	// Only applies to CAS requests
   322  	if respBool, ok := resp.(bool); ok {
   323  		reply.Updated = respBool
   324  	}
   325  	reply.Index = index
   326  	return nil
   327  }
   328  
   329  // SchedulerGetConfiguration is used to retrieve the current Scheduler configuration.
   330  func (op *Operator) SchedulerGetConfiguration(args *structs.GenericRequest, reply *structs.SchedulerConfigurationResponse) error {
   331  	if done, err := op.srv.forward("Operator.SchedulerGetConfiguration", args, args, reply); done {
   332  		return err
   333  	}
   334  
   335  	// This action requires operator read access.
   336  	rule, err := op.srv.ResolveToken(args.AuthToken)
   337  	if err != nil {
   338  		return err
   339  	} else if rule != nil && !rule.AllowOperatorRead() {
   340  		return structs.ErrPermissionDenied
   341  	}
   342  
   343  	state := op.srv.fsm.State()
   344  	index, config, err := state.SchedulerConfig()
   345  
   346  	if err != nil {
   347  		return err
   348  	} else if config == nil {
   349  		return fmt.Errorf("scheduler config not initialized yet")
   350  	}
   351  
   352  	reply.SchedulerConfig = config
   353  	reply.QueryMeta.Index = index
   354  	op.srv.setQueryMeta(&reply.QueryMeta)
   355  
   356  	return nil
   357  }