github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/operator_endpoint.go (about) 1 package nomad 2 3 import ( 4 "fmt" 5 "net" 6 7 "github.com/hashicorp/consul/agent/consul/autopilot" 8 "github.com/hashicorp/nomad/nomad/structs" 9 "github.com/hashicorp/raft" 10 "github.com/hashicorp/serf/serf" 11 ) 12 13 // Operator endpoint is used to perform low-level operator tasks for Nomad. 14 type Operator struct { 15 srv *Server 16 } 17 18 // RaftGetConfiguration is used to retrieve the current Raft configuration. 19 func (op *Operator) RaftGetConfiguration(args *structs.GenericRequest, reply *structs.RaftConfigurationResponse) error { 20 if done, err := op.srv.forward("Operator.RaftGetConfiguration", args, args, reply); done { 21 return err 22 } 23 24 // Check management permissions 25 if aclObj, err := op.srv.ResolveToken(args.AuthToken); err != nil { 26 return err 27 } else if aclObj != nil && !aclObj.IsManagement() { 28 return structs.ErrPermissionDenied 29 } 30 31 // We can't fetch the leader and the configuration atomically with 32 // the current Raft API. 33 future := op.srv.raft.GetConfiguration() 34 if err := future.Error(); err != nil { 35 return err 36 } 37 38 // Index the Nomad information about the servers. 39 serverMap := make(map[raft.ServerAddress]serf.Member) 40 for _, member := range op.srv.serf.Members() { 41 valid, parts := isNomadServer(member) 42 if !valid { 43 continue 44 } 45 46 addr := (&net.TCPAddr{IP: member.Addr, Port: parts.Port}).String() 47 serverMap[raft.ServerAddress(addr)] = member 48 } 49 50 // Fill out the reply. 51 leader := op.srv.raft.Leader() 52 reply.Index = future.Index() 53 for _, server := range future.Configuration().Servers { 54 node := "(unknown)" 55 raftProtocolVersion := "unknown" 56 if member, ok := serverMap[server.Address]; ok { 57 node = member.Name 58 if raftVsn, ok := member.Tags["raft_vsn"]; ok { 59 raftProtocolVersion = raftVsn 60 } 61 } 62 63 entry := &structs.RaftServer{ 64 ID: server.ID, 65 Node: node, 66 Address: server.Address, 67 Leader: server.Address == leader, 68 Voter: server.Suffrage == raft.Voter, 69 RaftProtocol: raftProtocolVersion, 70 } 71 reply.Servers = append(reply.Servers, entry) 72 } 73 return nil 74 } 75 76 // RaftRemovePeerByAddress is used to kick a stale peer (one that it in the Raft 77 // quorum but no longer known to Serf or the catalog) by address in the form of 78 // "IP:port". The reply argument is not used, but it required to fulfill the RPC 79 // interface. 80 func (op *Operator) RaftRemovePeerByAddress(args *structs.RaftPeerByAddressRequest, reply *struct{}) error { 81 if done, err := op.srv.forward("Operator.RaftRemovePeerByAddress", args, args, reply); done { 82 return err 83 } 84 85 // Check management permissions 86 if aclObj, err := op.srv.ResolveToken(args.AuthToken); err != nil { 87 return err 88 } else if aclObj != nil && !aclObj.IsManagement() { 89 return structs.ErrPermissionDenied 90 } 91 92 // Since this is an operation designed for humans to use, we will return 93 // an error if the supplied address isn't among the peers since it's 94 // likely they screwed up. 95 { 96 future := op.srv.raft.GetConfiguration() 97 if err := future.Error(); err != nil { 98 return err 99 } 100 for _, s := range future.Configuration().Servers { 101 if s.Address == args.Address { 102 goto REMOVE 103 } 104 } 105 return fmt.Errorf("address %q was not found in the Raft configuration", 106 args.Address) 107 } 108 109 REMOVE: 110 // The Raft library itself will prevent various forms of foot-shooting, 111 // like making a configuration with no voters. Some consideration was 112 // given here to adding more checks, but it was decided to make this as 113 // low-level and direct as possible. We've got ACL coverage to lock this 114 // down, and if you are an operator, it's assumed you know what you are 115 // doing if you are calling this. If you remove a peer that's known to 116 // Serf, for example, it will come back when the leader does a reconcile 117 // pass. 118 future := op.srv.raft.RemovePeer(args.Address) 119 if err := future.Error(); err != nil { 120 op.srv.logger.Printf("[WARN] nomad.operator: Failed to remove Raft peer %q: %v", 121 args.Address, err) 122 return err 123 } 124 125 op.srv.logger.Printf("[WARN] nomad.operator: Removed Raft peer %q", args.Address) 126 return nil 127 } 128 129 // RaftRemovePeerByID is used to kick a stale peer (one that is in the Raft 130 // quorum but no longer known to Serf or the catalog) by address in the form of 131 // "IP:port". The reply argument is not used, but is required to fulfill the RPC 132 // interface. 133 func (op *Operator) RaftRemovePeerByID(args *structs.RaftPeerByIDRequest, reply *struct{}) error { 134 if done, err := op.srv.forward("Operator.RaftRemovePeerByID", args, args, reply); done { 135 return err 136 } 137 138 // Check management permissions 139 if aclObj, err := op.srv.ResolveToken(args.AuthToken); err != nil { 140 return err 141 } else if aclObj != nil && !aclObj.IsManagement() { 142 return structs.ErrPermissionDenied 143 } 144 145 // Since this is an operation designed for humans to use, we will return 146 // an error if the supplied id isn't among the peers since it's 147 // likely they screwed up. 148 var address raft.ServerAddress 149 { 150 future := op.srv.raft.GetConfiguration() 151 if err := future.Error(); err != nil { 152 return err 153 } 154 for _, s := range future.Configuration().Servers { 155 if s.ID == args.ID { 156 address = s.Address 157 goto REMOVE 158 } 159 } 160 return fmt.Errorf("id %q was not found in the Raft configuration", 161 args.ID) 162 } 163 164 REMOVE: 165 // The Raft library itself will prevent various forms of foot-shooting, 166 // like making a configuration with no voters. Some consideration was 167 // given here to adding more checks, but it was decided to make this as 168 // low-level and direct as possible. We've got ACL coverage to lock this 169 // down, and if you are an operator, it's assumed you know what you are 170 // doing if you are calling this. If you remove a peer that's known to 171 // Serf, for example, it will come back when the leader does a reconcile 172 // pass. 173 minRaftProtocol, err := op.srv.autopilot.MinRaftProtocol() 174 if err != nil { 175 return err 176 } 177 178 var future raft.Future 179 if minRaftProtocol >= 2 { 180 future = op.srv.raft.RemoveServer(args.ID, 0, 0) 181 } else { 182 future = op.srv.raft.RemovePeer(address) 183 } 184 if err := future.Error(); err != nil { 185 op.srv.logger.Printf("[WARN] nomad.operator: Failed to remove Raft peer with id %q: %v", 186 args.ID, err) 187 return err 188 } 189 190 op.srv.logger.Printf("[WARN] nomad.operator: Removed Raft peer with id %q", args.ID) 191 return nil 192 } 193 194 // AutopilotGetConfiguration is used to retrieve the current Autopilot configuration. 195 func (op *Operator) AutopilotGetConfiguration(args *structs.GenericRequest, reply *structs.AutopilotConfig) error { 196 if done, err := op.srv.forward("Operator.AutopilotGetConfiguration", args, args, reply); done { 197 return err 198 } 199 200 // This action requires operator read access. 201 rule, err := op.srv.ResolveToken(args.AuthToken) 202 if err != nil { 203 return err 204 } 205 if rule != nil && !rule.AllowOperatorRead() { 206 return structs.ErrPermissionDenied 207 } 208 209 state := op.srv.fsm.State() 210 _, config, err := state.AutopilotConfig() 211 if err != nil { 212 return err 213 } 214 if config == nil { 215 return fmt.Errorf("autopilot config not initialized yet") 216 } 217 218 *reply = *config 219 220 return nil 221 } 222 223 // AutopilotSetConfiguration is used to set the current Autopilot configuration. 224 func (op *Operator) AutopilotSetConfiguration(args *structs.AutopilotSetConfigRequest, reply *bool) error { 225 if done, err := op.srv.forward("Operator.AutopilotSetConfiguration", args, args, reply); done { 226 return err 227 } 228 229 // This action requires operator write access. 230 rule, err := op.srv.ResolveToken(args.AuthToken) 231 if err != nil { 232 return err 233 } 234 if rule != nil && !rule.AllowOperatorWrite() { 235 return structs.ErrPermissionDenied 236 } 237 238 // Apply the update 239 resp, _, err := op.srv.raftApply(structs.AutopilotRequestType, args) 240 if err != nil { 241 op.srv.logger.Printf("[ERR] nomad.operator: Apply failed: %v", err) 242 return err 243 } 244 if respErr, ok := resp.(error); ok { 245 return respErr 246 } 247 248 // Check if the return type is a bool. 249 if respBool, ok := resp.(bool); ok { 250 *reply = respBool 251 } 252 return nil 253 } 254 255 // ServerHealth is used to get the current health of the servers. 256 func (op *Operator) ServerHealth(args *structs.GenericRequest, reply *autopilot.OperatorHealthReply) error { 257 // This must be sent to the leader, so we fix the args since we are 258 // re-using a structure where we don't support all the options. 259 args.AllowStale = false 260 if done, err := op.srv.forward("Operator.ServerHealth", args, args, reply); done { 261 return err 262 } 263 264 // This action requires operator read access. 265 rule, err := op.srv.ResolveToken(args.AuthToken) 266 if err != nil { 267 return err 268 } 269 if rule != nil && !rule.AllowOperatorRead() { 270 return structs.ErrPermissionDenied 271 } 272 273 // Exit early if the min Raft version is too low 274 minRaftProtocol, err := op.srv.autopilot.MinRaftProtocol() 275 if err != nil { 276 return fmt.Errorf("error getting server raft protocol versions: %s", err) 277 } 278 if minRaftProtocol < 3 { 279 return fmt.Errorf("all servers must have raft_protocol set to 3 or higher to use this endpoint") 280 } 281 282 *reply = op.srv.autopilot.GetClusterHealth() 283 284 return nil 285 }