github.com/sl1pm4t/consul@v1.4.5-0.20190325224627-74c31c540f9c/agent/router/manager.go (about) 1 // Package servers provides a Manager interface for Manager managed 2 // metadata.Server objects. The servers package manages servers from a Consul 3 // client's perspective (i.e. a list of servers that a client talks with for 4 // RPCs). The servers package does not provide any API guarantees and should 5 // be called only by `hashicorp/consul`. 6 package router 7 8 import ( 9 "log" 10 "math/rand" 11 "net" 12 "sync" 13 "sync/atomic" 14 "time" 15 16 "github.com/hashicorp/consul/agent/metadata" 17 "github.com/hashicorp/consul/lib" 18 ) 19 20 const ( 21 // clientRPCJitterFraction determines the amount of jitter added to 22 // clientRPCMinReuseDuration before a connection is expired and a new 23 // connection is established in order to rebalance load across consul 24 // servers. The cluster-wide number of connections per second from 25 // rebalancing is applied after this jitter to ensure the CPU impact 26 // is always finite. See newRebalanceConnsPerSecPerServer's comment 27 // for additional commentary. 28 // 29 // For example, in a 10K consul cluster with 5x servers, this default 30 // averages out to ~13 new connections from rebalancing per server 31 // per second (each connection is reused for 120s to 180s). 32 clientRPCJitterFraction = 2 33 34 // clientRPCMinReuseDuration controls the minimum amount of time RPC 35 // queries are sent over an established connection to a single server 36 clientRPCMinReuseDuration = 120 * time.Second 37 38 // Limit the number of new connections a server receives per second 39 // for connection rebalancing. This limit caps the load caused by 40 // continual rebalancing efforts when a cluster is in equilibrium. A 41 // lower value comes at the cost of increased recovery time after a 42 // partition. This parameter begins to take effect when there are 43 // more than ~48K clients querying 5x servers or at lower server 44 // values when there is a partition. 45 // 46 // For example, in a 100K consul cluster with 5x servers, it will 47 // take ~5min for all servers to rebalance their connections. If 48 // 99,995 agents are in the minority talking to only one server, it 49 // will take ~26min for all servers to rebalance. A 10K cluster in 50 // the same scenario will take ~2.6min to rebalance. 51 newRebalanceConnsPerSecPerServer = 64 52 ) 53 54 // ManagerSerfCluster is an interface wrapper around Serf in order to make this 55 // easier to unit test. 56 type ManagerSerfCluster interface { 57 NumNodes() int 58 } 59 60 // Pinger is an interface wrapping client.ConnPool to prevent a cyclic import 61 // dependency. 62 type Pinger interface { 63 Ping(dc string, addr net.Addr, version int, useTLS bool) (bool, error) 64 } 65 66 // serverList is a local copy of the struct used to maintain the list of 67 // Consul servers used by Manager. 68 // 69 // NOTE(sean@): We are explicitly relying on the fact that serverList will 70 // be copied onto the stack. Please keep this structure light. 71 type serverList struct { 72 // servers tracks the locally known servers. List membership is 73 // maintained by Serf. 74 servers []*metadata.Server 75 } 76 77 type Manager struct { 78 // listValue manages the atomic load/store of a Manager's serverList 79 listValue atomic.Value 80 listLock sync.Mutex 81 82 // rebalanceTimer controls the duration of the rebalance interval 83 rebalanceTimer *time.Timer 84 85 // shutdownCh is a copy of the channel in consul.Client 86 shutdownCh chan struct{} 87 88 logger *log.Logger 89 90 // clusterInfo is used to estimate the approximate number of nodes in 91 // a cluster and limit the rate at which it rebalances server 92 // connections. ManagerSerfCluster is an interface that wraps serf. 93 clusterInfo ManagerSerfCluster 94 95 // connPoolPinger is used to test the health of a server in the 96 // connection pool. Pinger is an interface that wraps 97 // client.ConnPool. 98 connPoolPinger Pinger 99 100 // notifyFailedBarrier is acts as a barrier to prevent queuing behind 101 // serverListLog and acts as a TryLock(). 102 notifyFailedBarrier int32 103 104 // offline is used to indicate that there are no servers, or that all 105 // known servers have failed the ping test. 106 offline int32 107 } 108 109 // AddServer takes out an internal write lock and adds a new server. If the 110 // server is not known, appends the server to the list. The new server will 111 // begin seeing use after the rebalance timer fires or enough servers fail 112 // organically. If the server is already known, merge the new server 113 // details. 114 func (m *Manager) AddServer(s *metadata.Server) { 115 m.listLock.Lock() 116 defer m.listLock.Unlock() 117 l := m.getServerList() 118 119 // Check if this server is known 120 found := false 121 for idx, existing := range l.servers { 122 if existing.Name == s.Name { 123 newServers := make([]*metadata.Server, len(l.servers)) 124 copy(newServers, l.servers) 125 126 // Overwrite the existing server details in order to 127 // possibly update metadata (e.g. server version) 128 newServers[idx] = s 129 130 l.servers = newServers 131 found = true 132 break 133 } 134 } 135 136 // Add to the list if not known 137 if !found { 138 newServers := make([]*metadata.Server, len(l.servers), len(l.servers)+1) 139 copy(newServers, l.servers) 140 newServers = append(newServers, s) 141 l.servers = newServers 142 } 143 144 // Assume we are no longer offline since we've just seen a new server. 145 atomic.StoreInt32(&m.offline, 0) 146 147 // Start using this list of servers. 148 m.saveServerList(l) 149 } 150 151 // cycleServers returns a new list of servers that has dequeued the first 152 // server and enqueued it at the end of the list. cycleServers assumes the 153 // caller is holding the listLock. cycleServer does not test or ping 154 // the next server inline. cycleServer may be called when the environment 155 // has just entered an unhealthy situation and blocking on a server test is 156 // less desirable than just returning the next server in the firing line. If 157 // the next server fails, it will fail fast enough and cycleServer will be 158 // called again. 159 func (l *serverList) cycleServer() (servers []*metadata.Server) { 160 numServers := len(l.servers) 161 if numServers < 2 { 162 return servers // No action required 163 } 164 165 newServers := make([]*metadata.Server, 0, numServers) 166 newServers = append(newServers, l.servers[1:]...) 167 newServers = append(newServers, l.servers[0]) 168 169 return newServers 170 } 171 172 // removeServerByKey performs an inline removal of the first matching server 173 func (l *serverList) removeServerByKey(targetKey *metadata.Key) { 174 for i, s := range l.servers { 175 if targetKey.Equal(s.Key()) { 176 copy(l.servers[i:], l.servers[i+1:]) 177 l.servers[len(l.servers)-1] = nil 178 l.servers = l.servers[:len(l.servers)-1] 179 return 180 } 181 } 182 } 183 184 // shuffleServers shuffles the server list in place 185 func (l *serverList) shuffleServers() { 186 for i := len(l.servers) - 1; i > 0; i-- { 187 j := rand.Int31n(int32(i + 1)) 188 l.servers[i], l.servers[j] = l.servers[j], l.servers[i] 189 } 190 } 191 192 // IsOffline checks to see if all the known servers have failed their ping 193 // test during the last rebalance. 194 func (m *Manager) IsOffline() bool { 195 offline := atomic.LoadInt32(&m.offline) 196 return offline == 1 197 } 198 199 // FindServer takes out an internal "read lock" and searches through the list 200 // of servers to find a "healthy" server. If the server is actually 201 // unhealthy, we rely on Serf to detect this and remove the node from the 202 // server list. If the server at the front of the list has failed or fails 203 // during an RPC call, it is rotated to the end of the list. If there are no 204 // servers available, return nil. 205 func (m *Manager) FindServer() *metadata.Server { 206 l := m.getServerList() 207 numServers := len(l.servers) 208 if numServers == 0 { 209 m.logger.Printf("[WARN] manager: No servers available") 210 return nil 211 } 212 213 // Return whatever is at the front of the list because it is 214 // assumed to be the oldest in the server list (unless - 215 // hypothetically - the server list was rotated right after a 216 // server was added). 217 return l.servers[0] 218 } 219 220 // getServerList is a convenience method which hides the locking semantics 221 // of atomic.Value from the caller. 222 func (m *Manager) getServerList() serverList { 223 return m.listValue.Load().(serverList) 224 } 225 226 // saveServerList is a convenience method which hides the locking semantics 227 // of atomic.Value from the caller. 228 func (m *Manager) saveServerList(l serverList) { 229 m.listValue.Store(l) 230 } 231 232 // New is the only way to safely create a new Manager struct. 233 func New(logger *log.Logger, shutdownCh chan struct{}, clusterInfo ManagerSerfCluster, connPoolPinger Pinger) (m *Manager) { 234 m = new(Manager) 235 m.logger = logger 236 m.clusterInfo = clusterInfo // can't pass *consul.Client: import cycle 237 m.connPoolPinger = connPoolPinger // can't pass *consul.ConnPool: import cycle 238 m.rebalanceTimer = time.NewTimer(clientRPCMinReuseDuration) 239 m.shutdownCh = shutdownCh 240 atomic.StoreInt32(&m.offline, 1) 241 242 l := serverList{} 243 l.servers = make([]*metadata.Server, 0) 244 m.saveServerList(l) 245 return m 246 } 247 248 // NotifyFailedServer marks the passed in server as "failed" by rotating it 249 // to the end of the server list. 250 func (m *Manager) NotifyFailedServer(s *metadata.Server) { 251 l := m.getServerList() 252 253 // If the server being failed is not the first server on the list, 254 // this is a noop. If, however, the server is failed and first on 255 // the list, acquire the lock, retest, and take the penalty of moving 256 // the server to the end of the list. 257 258 // Only rotate the server list when there is more than one server 259 if len(l.servers) > 1 && l.servers[0].Name == s.Name && 260 // Use atomic.CAS to emulate a TryLock(). 261 atomic.CompareAndSwapInt32(&m.notifyFailedBarrier, 0, 1) { 262 defer atomic.StoreInt32(&m.notifyFailedBarrier, 0) 263 264 // Grab a lock, retest, and take the hit of cycling the first 265 // server to the end. 266 m.listLock.Lock() 267 defer m.listLock.Unlock() 268 l = m.getServerList() 269 270 if len(l.servers) > 1 && l.servers[0].Name == s.Name { 271 l.servers = l.cycleServer() 272 m.saveServerList(l) 273 m.logger.Printf(`[DEBUG] manager: cycled away from server "%s"`, s.Name) 274 } 275 } 276 } 277 278 // NumServers takes out an internal "read lock" and returns the number of 279 // servers. numServers includes both healthy and unhealthy servers. 280 func (m *Manager) NumServers() int { 281 l := m.getServerList() 282 return len(l.servers) 283 } 284 285 // RebalanceServers shuffles the list of servers on this metadata. The server 286 // at the front of the list is selected for the next RPC. RPC calls that 287 // fail for a particular server are rotated to the end of the list. This 288 // method reshuffles the list periodically in order to redistribute work 289 // across all known consul servers (i.e. guarantee that the order of servers 290 // in the server list is not positively correlated with the age of a server 291 // in the Consul cluster). Periodically shuffling the server list prevents 292 // long-lived clients from fixating on long-lived servers. 293 // 294 // Unhealthy servers are removed when serf notices the server has been 295 // deregistered. Before the newly shuffled server list is saved, the new 296 // remote endpoint is tested to ensure its responsive. 297 func (m *Manager) RebalanceServers() { 298 // Obtain a copy of the current serverList 299 l := m.getServerList() 300 301 // Shuffle servers so we have a chance of picking a new one. 302 l.shuffleServers() 303 304 // Iterate through the shuffled server list to find an assumed 305 // healthy server. NOTE: Do not iterate on the list directly because 306 // this loop mutates the server list in-place. 307 var foundHealthyServer bool 308 for i := 0; i < len(l.servers); i++ { 309 // Always test the first server. Failed servers are cycled 310 // while Serf detects the node has failed. 311 srv := l.servers[0] 312 313 ok, err := m.connPoolPinger.Ping(srv.Datacenter, srv.Addr, srv.Version, srv.UseTLS) 314 if ok { 315 foundHealthyServer = true 316 break 317 } 318 m.logger.Printf(`[DEBUG] manager: pinging server "%s" failed: %s`, srv, err) 319 l.servers = l.cycleServer() 320 } 321 322 // If no healthy servers were found, sleep and wait for Serf to make 323 // the world a happy place again. Update the offline status. 324 if foundHealthyServer { 325 atomic.StoreInt32(&m.offline, 0) 326 } else { 327 atomic.StoreInt32(&m.offline, 1) 328 m.logger.Printf("[DEBUG] manager: No healthy servers during rebalance, aborting") 329 return 330 } 331 332 // Verify that all servers are present 333 if m.reconcileServerList(&l) { 334 m.logger.Printf("[DEBUG] manager: Rebalanced %d servers, next active server is %s", len(l.servers), l.servers[0].String()) 335 } else { 336 // reconcileServerList failed because Serf removed the server 337 // that was at the front of the list that had successfully 338 // been Ping'ed. Between the Ping and reconcile, a Serf 339 // event had shown up removing the node. 340 // 341 // Instead of doing any heroics, "freeze in place" and 342 // continue to use the existing connection until the next 343 // rebalance occurs. 344 } 345 346 return 347 } 348 349 // reconcileServerList returns true when the first server in serverList 350 // exists in the receiver's serverList. If true, the merged serverList is 351 // stored as the receiver's serverList. Returns false if the first server 352 // does not exist in the list (i.e. was removed by Serf during a 353 // PingConsulServer() call. Newly added servers are appended to the list and 354 // other missing servers are removed from the list. 355 func (m *Manager) reconcileServerList(l *serverList) bool { 356 m.listLock.Lock() 357 defer m.listLock.Unlock() 358 359 // newServerCfg is a serverList that has been kept up to date with 360 // Serf node join and node leave events. 361 newServerCfg := m.getServerList() 362 363 // If Serf has removed all nodes, or there is no selected server 364 // (zero nodes in serverList), abort early. 365 if len(newServerCfg.servers) == 0 || len(l.servers) == 0 { 366 return false 367 } 368 369 type targetServer struct { 370 server *metadata.Server 371 372 // 'b' == both 373 // 'o' == original 374 // 'n' == new 375 state byte 376 } 377 mergedList := make(map[metadata.Key]*targetServer, len(l.servers)) 378 for _, s := range l.servers { 379 mergedList[*s.Key()] = &targetServer{server: s, state: 'o'} 380 } 381 for _, s := range newServerCfg.servers { 382 k := s.Key() 383 _, found := mergedList[*k] 384 if found { 385 mergedList[*k].state = 'b' 386 } else { 387 mergedList[*k] = &targetServer{server: s, state: 'n'} 388 } 389 } 390 391 // Ensure the selected server has not been removed by Serf 392 selectedServerKey := l.servers[0].Key() 393 if v, found := mergedList[*selectedServerKey]; found && v.state == 'o' { 394 return false 395 } 396 397 // Append any new servers and remove any old servers 398 for k, v := range mergedList { 399 switch v.state { 400 case 'b': 401 // Do nothing, server exists in both 402 case 'o': 403 // Server has been removed 404 l.removeServerByKey(&k) 405 case 'n': 406 // Server added 407 l.servers = append(l.servers, v.server) 408 default: 409 panic("unknown merge list state") 410 } 411 } 412 413 m.saveServerList(*l) 414 return true 415 } 416 417 // RemoveServer takes out an internal write lock and removes a server from 418 // the server list. 419 func (m *Manager) RemoveServer(s *metadata.Server) { 420 m.listLock.Lock() 421 defer m.listLock.Unlock() 422 l := m.getServerList() 423 424 // Remove the server if known 425 for i := range l.servers { 426 if l.servers[i].Name == s.Name { 427 newServers := make([]*metadata.Server, 0, len(l.servers)-1) 428 newServers = append(newServers, l.servers[:i]...) 429 newServers = append(newServers, l.servers[i+1:]...) 430 l.servers = newServers 431 432 m.saveServerList(l) 433 return 434 } 435 } 436 } 437 438 // refreshServerRebalanceTimer is only called once m.rebalanceTimer expires. 439 func (m *Manager) refreshServerRebalanceTimer() time.Duration { 440 l := m.getServerList() 441 numServers := len(l.servers) 442 // Limit this connection's life based on the size (and health) of the 443 // cluster. Never rebalance a connection more frequently than 444 // connReuseLowWatermarkDuration, and make sure we never exceed 445 // clusterWideRebalanceConnsPerSec operations/s across numLANMembers. 446 clusterWideRebalanceConnsPerSec := float64(numServers * newRebalanceConnsPerSecPerServer) 447 connReuseLowWatermarkDuration := clientRPCMinReuseDuration + lib.RandomStagger(clientRPCMinReuseDuration/clientRPCJitterFraction) 448 numLANMembers := m.clusterInfo.NumNodes() 449 connRebalanceTimeout := lib.RateScaledInterval(clusterWideRebalanceConnsPerSec, connReuseLowWatermarkDuration, numLANMembers) 450 451 m.rebalanceTimer.Reset(connRebalanceTimeout) 452 return connRebalanceTimeout 453 } 454 455 // ResetRebalanceTimer resets the rebalance timer. This method exists for 456 // testing and should not be used directly. 457 func (m *Manager) ResetRebalanceTimer() { 458 m.listLock.Lock() 459 defer m.listLock.Unlock() 460 m.rebalanceTimer.Reset(clientRPCMinReuseDuration) 461 } 462 463 // Start is used to start and manage the task of automatically shuffling and 464 // rebalancing the list of Consul servers. This maintenance only happens 465 // periodically based on the expiration of the timer. Failed servers are 466 // automatically cycled to the end of the list. New servers are appended to 467 // the list. The order of the server list must be shuffled periodically to 468 // distribute load across all known and available Consul servers. 469 func (m *Manager) Start() { 470 for { 471 select { 472 case <-m.rebalanceTimer.C: 473 m.RebalanceServers() 474 m.refreshServerRebalanceTimer() 475 476 case <-m.shutdownCh: 477 m.logger.Printf("[INFO] manager: shutting down") 478 return 479 } 480 } 481 }