github.com/adityamillind98/nomad@v0.11.8/nomad/operator_endpoint.go (about) 1 package nomad 2 3 import ( 4 "fmt" 5 "net" 6 7 log "github.com/hashicorp/go-hclog" 8 9 "github.com/hashicorp/consul/agent/consul/autopilot" 10 "github.com/hashicorp/nomad/nomad/structs" 11 "github.com/hashicorp/raft" 12 "github.com/hashicorp/serf/serf" 13 ) 14 15 // Operator endpoint is used to perform low-level operator tasks for Nomad. 16 type Operator struct { 17 srv *Server 18 logger log.Logger 19 } 20 21 // RaftGetConfiguration is used to retrieve the current Raft configuration. 22 func (op *Operator) RaftGetConfiguration(args *structs.GenericRequest, reply *structs.RaftConfigurationResponse) error { 23 if done, err := op.srv.forward("Operator.RaftGetConfiguration", args, args, reply); done { 24 return err 25 } 26 27 // Check management permissions 28 if aclObj, err := op.srv.ResolveToken(args.AuthToken); err != nil { 29 return err 30 } else if aclObj != nil && !aclObj.IsManagement() { 31 return structs.ErrPermissionDenied 32 } 33 34 // We can't fetch the leader and the configuration atomically with 35 // the current Raft API. 36 future := op.srv.raft.GetConfiguration() 37 if err := future.Error(); err != nil { 38 return err 39 } 40 41 // Index the Nomad information about the servers. 42 serverMap := make(map[raft.ServerAddress]serf.Member) 43 for _, member := range op.srv.serf.Members() { 44 valid, parts := isNomadServer(member) 45 if !valid { 46 continue 47 } 48 49 addr := (&net.TCPAddr{IP: member.Addr, Port: parts.Port}).String() 50 serverMap[raft.ServerAddress(addr)] = member 51 } 52 53 // Fill out the reply. 54 leader := op.srv.raft.Leader() 55 reply.Index = future.Index() 56 for _, server := range future.Configuration().Servers { 57 node := "(unknown)" 58 raftProtocolVersion := "unknown" 59 if member, ok := serverMap[server.Address]; ok { 60 node = member.Name 61 if raftVsn, ok := member.Tags["raft_vsn"]; ok { 62 raftProtocolVersion = raftVsn 63 } 64 } 65 66 entry := &structs.RaftServer{ 67 ID: server.ID, 68 Node: node, 69 Address: server.Address, 70 Leader: server.Address == leader, 71 Voter: server.Suffrage == raft.Voter, 72 RaftProtocol: raftProtocolVersion, 73 } 74 reply.Servers = append(reply.Servers, entry) 75 } 76 return nil 77 } 78 79 // RaftRemovePeerByAddress is used to kick a stale peer (one that it in the Raft 80 // quorum but no longer known to Serf or the catalog) by address in the form of 81 // "IP:port". The reply argument is not used, but it required to fulfill the RPC 82 // interface. 83 func (op *Operator) RaftRemovePeerByAddress(args *structs.RaftPeerByAddressRequest, reply *struct{}) error { 84 if done, err := op.srv.forward("Operator.RaftRemovePeerByAddress", args, args, reply); done { 85 return err 86 } 87 88 // Check management permissions 89 if aclObj, err := op.srv.ResolveToken(args.AuthToken); err != nil { 90 return err 91 } else if aclObj != nil && !aclObj.IsManagement() { 92 return structs.ErrPermissionDenied 93 } 94 95 // Since this is an operation designed for humans to use, we will return 96 // an error if the supplied address isn't among the peers since it's 97 // likely they screwed up. 98 { 99 future := op.srv.raft.GetConfiguration() 100 if err := future.Error(); err != nil { 101 return err 102 } 103 for _, s := range future.Configuration().Servers { 104 if s.Address == args.Address { 105 goto REMOVE 106 } 107 } 108 return fmt.Errorf("address %q was not found in the Raft configuration", 109 args.Address) 110 } 111 112 REMOVE: 113 // The Raft library itself will prevent various forms of foot-shooting, 114 // like making a configuration with no voters. Some consideration was 115 // given here to adding more checks, but it was decided to make this as 116 // low-level and direct as possible. We've got ACL coverage to lock this 117 // down, and if you are an operator, it's assumed you know what you are 118 // doing if you are calling this. If you remove a peer that's known to 119 // Serf, for example, it will come back when the leader does a reconcile 120 // pass. 121 future := op.srv.raft.RemovePeer(args.Address) 122 if err := future.Error(); err != nil { 123 op.logger.Warn("failed to remove Raft peer", "peer", args.Address, "error", err) 124 return err 125 } 126 127 op.logger.Warn("removed Raft peer", "peer", args.Address) 128 return nil 129 } 130 131 // RaftRemovePeerByID is used to kick a stale peer (one that is in the Raft 132 // quorum but no longer known to Serf or the catalog) by address in the form of 133 // "IP:port". The reply argument is not used, but is required to fulfill the RPC 134 // interface. 135 func (op *Operator) RaftRemovePeerByID(args *structs.RaftPeerByIDRequest, reply *struct{}) error { 136 if done, err := op.srv.forward("Operator.RaftRemovePeerByID", args, args, reply); done { 137 return err 138 } 139 140 // Check management permissions 141 if aclObj, err := op.srv.ResolveToken(args.AuthToken); err != nil { 142 return err 143 } else if aclObj != nil && !aclObj.IsManagement() { 144 return structs.ErrPermissionDenied 145 } 146 147 // Since this is an operation designed for humans to use, we will return 148 // an error if the supplied id isn't among the peers since it's 149 // likely they screwed up. 150 var address raft.ServerAddress 151 { 152 future := op.srv.raft.GetConfiguration() 153 if err := future.Error(); err != nil { 154 return err 155 } 156 for _, s := range future.Configuration().Servers { 157 if s.ID == args.ID { 158 address = s.Address 159 goto REMOVE 160 } 161 } 162 return fmt.Errorf("id %q was not found in the Raft configuration", 163 args.ID) 164 } 165 166 REMOVE: 167 // The Raft library itself will prevent various forms of foot-shooting, 168 // like making a configuration with no voters. Some consideration was 169 // given here to adding more checks, but it was decided to make this as 170 // low-level and direct as possible. We've got ACL coverage to lock this 171 // down, and if you are an operator, it's assumed you know what you are 172 // doing if you are calling this. If you remove a peer that's known to 173 // Serf, for example, it will come back when the leader does a reconcile 174 // pass. 175 minRaftProtocol, err := op.srv.autopilot.MinRaftProtocol() 176 if err != nil { 177 return err 178 } 179 180 var future raft.Future 181 if minRaftProtocol >= 2 { 182 future = op.srv.raft.RemoveServer(args.ID, 0, 0) 183 } else { 184 future = op.srv.raft.RemovePeer(address) 185 } 186 if err := future.Error(); err != nil { 187 op.logger.Warn("failed to remove Raft peer", "peer_id", args.ID, "error", err) 188 return err 189 } 190 191 op.logger.Warn("removed Raft peer", "peer_id", args.ID) 192 return nil 193 } 194 195 // AutopilotGetConfiguration is used to retrieve the current Autopilot configuration. 196 func (op *Operator) AutopilotGetConfiguration(args *structs.GenericRequest, reply *structs.AutopilotConfig) error { 197 if done, err := op.srv.forward("Operator.AutopilotGetConfiguration", args, args, reply); done { 198 return err 199 } 200 201 // This action requires operator read access. 202 rule, err := op.srv.ResolveToken(args.AuthToken) 203 if err != nil { 204 return err 205 } 206 if rule != nil && !rule.AllowOperatorRead() { 207 return structs.ErrPermissionDenied 208 } 209 210 state := op.srv.fsm.State() 211 _, config, err := state.AutopilotConfig() 212 if err != nil { 213 return err 214 } 215 if config == nil { 216 return fmt.Errorf("autopilot config not initialized yet") 217 } 218 219 *reply = *config 220 221 return nil 222 } 223 224 // AutopilotSetConfiguration is used to set the current Autopilot configuration. 225 func (op *Operator) AutopilotSetConfiguration(args *structs.AutopilotSetConfigRequest, reply *bool) error { 226 if done, err := op.srv.forward("Operator.AutopilotSetConfiguration", args, args, reply); done { 227 return err 228 } 229 230 // This action requires operator write access. 231 rule, err := op.srv.ResolveToken(args.AuthToken) 232 if err != nil { 233 return err 234 } 235 if rule != nil && !rule.AllowOperatorWrite() { 236 return structs.ErrPermissionDenied 237 } 238 239 // All servers should be at or above 0.8.0 to apply this operatation 240 if !ServersMeetMinimumVersion(op.srv.Members(), minAutopilotVersion, false) { 241 return fmt.Errorf("All servers should be running version %v to update autopilot config", minAutopilotVersion) 242 } 243 244 // Apply the update 245 resp, _, err := op.srv.raftApply(structs.AutopilotRequestType, args) 246 if err != nil { 247 op.logger.Error("failed applying AutoPilot configuration", "error", err) 248 return err 249 } 250 if respErr, ok := resp.(error); ok { 251 return respErr 252 } 253 254 // Check if the return type is a bool. 255 if respBool, ok := resp.(bool); ok { 256 *reply = respBool 257 } 258 return nil 259 } 260 261 // ServerHealth is used to get the current health of the servers. 262 func (op *Operator) ServerHealth(args *structs.GenericRequest, reply *autopilot.OperatorHealthReply) error { 263 // This must be sent to the leader, so we fix the args since we are 264 // re-using a structure where we don't support all the options. 265 args.AllowStale = false 266 if done, err := op.srv.forward("Operator.ServerHealth", args, args, reply); done { 267 return err 268 } 269 270 // This action requires operator read access. 271 rule, err := op.srv.ResolveToken(args.AuthToken) 272 if err != nil { 273 return err 274 } 275 if rule != nil && !rule.AllowOperatorRead() { 276 return structs.ErrPermissionDenied 277 } 278 279 // Exit early if the min Raft version is too low 280 minRaftProtocol, err := op.srv.autopilot.MinRaftProtocol() 281 if err != nil { 282 return fmt.Errorf("error getting server raft protocol versions: %s", err) 283 } 284 if minRaftProtocol < 3 { 285 return fmt.Errorf("all servers must have raft_protocol set to 3 or higher to use this endpoint") 286 } 287 288 *reply = op.srv.autopilot.GetClusterHealth() 289 290 return nil 291 } 292 293 // SchedulerSetConfiguration is used to set the current Scheduler configuration. 294 func (op *Operator) SchedulerSetConfiguration(args *structs.SchedulerSetConfigRequest, reply *structs.SchedulerSetConfigurationResponse) error { 295 if done, err := op.srv.forward("Operator.SchedulerSetConfiguration", args, args, reply); done { 296 return err 297 } 298 299 // This action requires operator write access. 300 rule, err := op.srv.ResolveToken(args.AuthToken) 301 if err != nil { 302 return err 303 } else if rule != nil && !rule.AllowOperatorWrite() { 304 return structs.ErrPermissionDenied 305 } 306 307 // All servers should be at or above 0.9.0 to apply this operatation 308 if !ServersMeetMinimumVersion(op.srv.Members(), minSchedulerConfigVersion, false) { 309 return fmt.Errorf("All servers should be running version %v to update scheduler config", minSchedulerConfigVersion) 310 } 311 // Apply the update 312 resp, index, err := op.srv.raftApply(structs.SchedulerConfigRequestType, args) 313 if err != nil { 314 op.logger.Error("failed applying Scheduler configuration", "error", err) 315 return err 316 } else if respErr, ok := resp.(error); ok { 317 return respErr 318 } 319 320 // Check if the return type is a bool 321 // Only applies to CAS requests 322 if respBool, ok := resp.(bool); ok { 323 reply.Updated = respBool 324 } 325 reply.Index = index 326 return nil 327 } 328 329 // SchedulerGetConfiguration is used to retrieve the current Scheduler configuration. 330 func (op *Operator) SchedulerGetConfiguration(args *structs.GenericRequest, reply *structs.SchedulerConfigurationResponse) error { 331 if done, err := op.srv.forward("Operator.SchedulerGetConfiguration", args, args, reply); done { 332 return err 333 } 334 335 // This action requires operator read access. 336 rule, err := op.srv.ResolveToken(args.AuthToken) 337 if err != nil { 338 return err 339 } else if rule != nil && !rule.AllowOperatorRead() { 340 return structs.ErrPermissionDenied 341 } 342 343 state := op.srv.fsm.State() 344 index, config, err := state.SchedulerConfig() 345 346 if err != nil { 347 return err 348 } else if config == nil { 349 return fmt.Errorf("scheduler config not initialized yet") 350 } 351 352 reply.SchedulerConfig = config 353 reply.QueryMeta.Index = index 354 op.srv.setQueryMeta(&reply.QueryMeta) 355 356 return nil 357 }