github.com/mattyr/nomad@v0.3.3-0.20160919021406-3485a065154a/client/rpcproxy/rpcproxy.go

// Package rpcproxy provides a proxy interface to Nomad Servers. The
// RPCProxy periodically shuffles which server a Nomad Client communicates
// with in order to redistribute load across Nomad Servers. Nomad Servers
// that fail an RPC request are automatically cycled to the end of the list
// until the server list is reshuffled.
//
// The rpcproxy package does not provide any external API guarantees and
// should be called only by `hashicorp/nomad`.
package rpcproxy

import (
	"fmt"
	"log"
	"math/rand"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/hashicorp/consul/lib"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// clientRPCJitterFraction determines the amount of jitter added to
	// clientRPCMinReuseDuration before a connection is expired and a new
	// connection is established in order to rebalance load across Nomad
	// servers. The cluster-wide number of connections per second from
	// rebalancing is applied after this jitter to ensure the CPU impact
	// is always finite. See newRebalanceConnsPerSecPerServer's comment
	// for additional commentary.
	//
	// For example, in a 10K Nomad cluster with 5x servers, this default
	// averages out to ~13 new connections from rebalancing per server
	// per second.
	clientRPCJitterFraction = 2

	// clientRPCMinReuseDuration controls the minimum amount of time RPC
	// queries are sent over an established connection to a single server.
	clientRPCMinReuseDuration = 600 * time.Second

	// newRebalanceConnsPerSecPerServer limits the number of new
	// connections a server receives per second for connection
	// rebalancing. This limit caps the load caused by continual
	// rebalancing efforts when a cluster is in equilibrium. A lower
	// value comes at the cost of increased recovery time after a
	// partition. This parameter begins to take effect when there are
	// more than ~48K clients querying 5x servers or at lower server
	// counts when there is a partition.
	//
	// For example, in a 100K Nomad cluster with 5x servers, it will take
	// ~5min for all servers to rebalance their connections. If 99,995
	// agents are in the minority talking to only one server, it will
	// take ~26min for all servers to rebalance. A 10K cluster in the
	// same scenario will take ~2.6min to rebalance.
	newRebalanceConnsPerSecPerServer = 64

	// rpcAPIMismatchLogRate determines the rate at which log entries are
	// emitted when the client and server's API versions are mismatched.
	rpcAPIMismatchLogRate = 3 * time.Hour
)

// NomadConfigInfo is an interface wrapper around this Nomad Agent's
// configuration to prevent a cyclic import dependency.
type NomadConfigInfo interface {
	Datacenter() string
	RPCMajorVersion() int
	RPCMinorVersion() int
	Region() string
}

// Pinger is an interface wrapping client.ConnPool to prevent a
// cyclic import dependency.
type Pinger interface {
	PingNomadServer(region string, apiMajorVersion int, s *ServerEndpoint) (bool, error)
}

// serverList is an array of Nomad Servers. The first server in the list is
// the active server.
//
// NOTE(sean@): We are explicitly relying on the fact that serverList will be
// copied onto the stack by atomic.Value. Please keep this structure light.
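//
// A minimal illustration (hypothetical, not part of this package's API) of
// the load-then-copy-then-store pattern the rest of this file applies to
// serverList through atomic.Value:
//
//	var v atomic.Value
//	v.Store(serverList{L: []*ServerEndpoint{}})
//	l := v.Load().(serverList) // cheap copy of the small struct
//	newL := make([]*ServerEndpoint, len(l.L), len(l.L)+1)
//	copy(newL, l.L) // never mutate the shared slice in place
//	v.Store(serverList{L: append(newL, &ServerEndpoint{Name: "example"})})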
type serverList struct {
	L []*ServerEndpoint
}

// RPCProxy is the manager type responsible for returning and managing Nomad
// addresses.
type RPCProxy struct {
	// activatedList manages the list of Nomad Servers that are eligible
	// to be queried by the Client agent.
	activatedList     atomic.Value
	activatedListLock sync.Mutex

	// primaryServers is a list of servers found in the last heartbeat.
	// primaryServers are periodically reshuffled. Covered by
	// serverListLock.
	primaryServers serverList

	// backupServers is a list of fallback servers. These servers are
	// appended to the RPCProxy's serverList, but are never shuffled with
	// the list of servers discovered via the Nomad heartbeat. Covered
	// by serverListLock.
	backupServers serverList

	// serverListLock covers both backupServers and primaryServers. If
	// it is necessary to hold both serverListLock and activatedListLock,
	// obtain an exclusive lock on serverListLock before activatedListLock.
	serverListLock sync.RWMutex

	leaderAddr string
	numNodes   int

	// rebalanceTimer controls the duration of the rebalance interval
	rebalanceTimer *time.Timer

	// shutdownCh is a copy of the channel in nomad.Client
	shutdownCh chan struct{}

	logger *log.Logger

	configInfo NomadConfigInfo

	// rpcAPIMismatchThrottle regulates the rate at which warning
	// messages are emitted in the event of an API mismatch between the
	// clients and servers.
	rpcAPIMismatchThrottle map[string]time.Time

	// connPoolPinger is used to test the health of a server in the
	// connection pool. Pinger is an interface that wraps
	// client.ConnPool.
	connPoolPinger Pinger
}

// NewRPCProxy is the only way to safely create a new RPCProxy.
func NewRPCProxy(logger *log.Logger, shutdownCh chan struct{}, configInfo NomadConfigInfo, connPoolPinger Pinger) *RPCProxy {
	p := &RPCProxy{
		logger:         logger,
		configInfo:     configInfo,     // can't pass *nomad.Client: import cycle
		connPoolPinger: connPoolPinger, // can't pass *nomad.ConnPool: import cycle
		rebalanceTimer: time.NewTimer(clientRPCMinReuseDuration),
		shutdownCh:     shutdownCh,

		// Initialize the throttle map so RefreshServerLists can record
		// API-mismatch warnings without writing to a nil map.
		rpcAPIMismatchThrottle: make(map[string]time.Time),
	}

	l := serverList{}
	l.L = make([]*ServerEndpoint, 0)
	p.saveServerList(l)
	return p
}

// activateEndpoint adds an endpoint to the RPCProxy's active serverList.
// Returns true if the server was added, returns false if the server already
// existed in the RPCProxy's serverList.
func (p *RPCProxy) activateEndpoint(s *ServerEndpoint) bool {
	l := p.getServerList()

	// Check if this server is known
	found := false
	for idx, existing := range l.L {
		if existing.Name == s.Name {
			newServers := make([]*ServerEndpoint, len(l.L))
			copy(newServers, l.L)

			// Overwrite the existing server details in order to
			// possibly update metadata (e.g. server version)
			newServers[idx] = s

			l.L = newServers
			found = true
			break
		}
	}

	// Add to the list if not known
	if !found {
		newServers := make([]*ServerEndpoint, len(l.L), len(l.L)+1)
		copy(newServers, l.L)
		newServers = append(newServers, s)
		l.L = newServers
	}

	p.saveServerList(l)

	return !found
}
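
// For illustration only (hypothetical endpoint, not taken from this
// package's tests), activateEndpoint reports whether the entry was new:
//
//	s := &ServerEndpoint{Name: "10.0.0.1:4647"}
//	p.activateEndpoint(s) // true: newly added to the activated list
//	p.activateEndpoint(s) // false: already known; entry refreshed in place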

// SetBackupServers sets a list of Nomad Servers to be used in the event that
// the Nomad Agent lost contact with the list of Nomad Servers provided via
// the Nomad Agent's heartbeat. If available, the backup servers are
// populated via Consul.
func (p *RPCProxy) SetBackupServers(addrs []string) error {
	l := make([]*ServerEndpoint, 0, len(addrs))
	for _, rpcAddr := range addrs {
		s, err := NewServerEndpoint(rpcAddr)
		if err != nil {
			p.logger.Printf("[WARN] client.rpcproxy: unable to create backup server %+q: %v", rpcAddr, err)
			return fmt.Errorf("unable to create new backup server from %+q: %v", rpcAddr, err)
		}
		l = append(l, s)
	}

	p.serverListLock.Lock()
	p.backupServers.L = l
	p.serverListLock.Unlock()

	p.activatedListLock.Lock()
	defer p.activatedListLock.Unlock()
	for _, s := range l {
		p.activateEndpoint(s)
	}

	return nil
}

// AddPrimaryServer takes the RPC address of a Nomad server, creates a new
// endpoint, and adds it to both the primaryServers list and the active
// serverList used in the RPC Proxy. If the endpoint is not known by the
// RPCProxy, it is appended to the list. The new endpoint will begin seeing
// use after the rebalance timer fires (or enough servers fail organically).
// Any values in the primary server list are overridden by the next
// successful heartbeat.
func (p *RPCProxy) AddPrimaryServer(rpcAddr string) *ServerEndpoint {
	s, err := NewServerEndpoint(rpcAddr)
	if err != nil {
		p.logger.Printf("[WARN] client.rpcproxy: unable to create new primary server from endpoint %+q: %v", rpcAddr, err)
		return nil
	}

	k := s.Key()
	p.serverListLock.Lock()
	if serverExists := p.primaryServers.serverExistByKey(k); serverExists {
		p.serverListLock.Unlock()
		return s
	}
	p.primaryServers.L = append(p.primaryServers.L, s)
	p.serverListLock.Unlock()

	p.activatedListLock.Lock()
	p.activateEndpoint(s)
	p.activatedListLock.Unlock()

	return s
}

// cycleServer returns a new list of servers that has dequeued the first
// server and enqueued it at the end of the list. cycleServer assumes the
// caller is holding the activatedListLock. cycleServer does not test or
// ping the next server inline. cycleServer may be called when the
// environment has just entered an unhealthy situation and blocking on a
// server test is less desirable than just returning the next server in the
// firing line. If the next server fails, it will fail fast enough and
// cycleServer will be called again.
func (l *serverList) cycleServer() (servers []*ServerEndpoint) {
	numServers := len(l.L)
	if numServers < 2 {
		return servers // No action required
	}

	newServers := make([]*ServerEndpoint, 0, numServers)
	newServers = append(newServers, l.L[1:]...)
	newServers = append(newServers, l.L[0])

	return newServers
}
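
// For example (hypothetical servers), cycling [s1 s2 s3] yields [s2 s3 s1]:
//
//	l := &serverList{L: []*ServerEndpoint{s1, s2, s3}}
//	l.L = l.cycleServer() // l.L is now [s2, s3, s1]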

// serverExistByKey performs a search to see if a server exists in the
// serverList. Assumes the caller is holding at least a read lock.
func (l *serverList) serverExistByKey(targetKey *EndpointKey) bool {
	var found bool
	for _, server := range l.L {
		if targetKey.Equal(server.Key()) {
			found = true
		}
	}
	return found
}

// removeServerByKey performs an inline removal of the first matching server
func (l *serverList) removeServerByKey(targetKey *EndpointKey) {
	for i, s := range l.L {
		if targetKey.Equal(s.Key()) {
			copy(l.L[i:], l.L[i+1:])
			l.L[len(l.L)-1] = nil
			l.L = l.L[:len(l.L)-1]
			return
		}
	}
}

// shuffleServers shuffles the server list in place
func (l *serverList) shuffleServers() {
	for i := len(l.L) - 1; i > 0; i-- {
		j := rand.Int31n(int32(i + 1))
		l.L[i], l.L[j] = l.L[j], l.L[i]
	}
}

// String returns a string representation of serverList
func (l *serverList) String() string {
	if len(l.L) == 0 {
		return "empty server list"
	}

	serverStrs := make([]string, 0, len(l.L))
	for _, server := range l.L {
		serverStrs = append(serverStrs, server.String())
	}

	return fmt.Sprintf("[%s]", strings.Join(serverStrs, ", "))
}

// FindServer takes out an internal "read lock" and searches through the list
// of servers to find a "healthy" server. If the server is actually
// unhealthy, we rely on heartbeats to detect this and remove the node from
// the server list. If the server at the front of the list has failed or
// fails during an RPC call, it is rotated to the end of the list. If there
// are no servers available, return nil.
func (p *RPCProxy) FindServer() *ServerEndpoint {
	l := p.getServerList()
	numServers := len(l.L)
	if numServers == 0 {
		p.logger.Printf("[WARN] client.rpcproxy: No servers available")
		return nil
	}

	// Return whatever is at the front of the list because it is
	// assumed to be the oldest in the server list (unless -
	// hypothetically - the server list was rotated right after a
	// server was added).
	return l.L[0]
}

// getServerList is a convenience method which hides the locking semantics
// of atomic.Value from the caller.
func (p *RPCProxy) getServerList() serverList {
	return p.activatedList.Load().(serverList)
}

// saveServerList is a convenience method which hides the locking semantics
// of atomic.Value from the caller.
func (p *RPCProxy) saveServerList(l serverList) {
	p.activatedList.Store(l)
}

// LeaderAddr returns the current leader address. If it is an empty string,
// the Nomad Server for this Nomad Agent is in the minority or the Nomad
// Servers are in the middle of an election.
func (p *RPCProxy) LeaderAddr() string {
	p.activatedListLock.Lock()
	defer p.activatedListLock.Unlock()
	return p.leaderAddr
}

// NotifyFailedServer marks the passed in server as "failed" by rotating it
// to the end of the server list.
func (p *RPCProxy) NotifyFailedServer(s *ServerEndpoint) {
	l := p.getServerList()

	// If the server being failed is not the first server on the list,
	// this is a noop. If, however, the server is failed and first on
	// the list, acquire the lock, retest, and take the penalty of moving
	// the server to the end of the list.

	// Only rotate the server list when there is more than one server
	if len(l.L) > 1 && l.L[0] == s {
		// Grab a lock, retest, and take the hit of cycling the first
		// server to the end.
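		// (Check/lock/re-check: the unlocked test above keeps the
		// common case cheap, while the re-test below guards against
		// a concurrent rotation between the check and the Lock.)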
		p.activatedListLock.Lock()
		defer p.activatedListLock.Unlock()
		l = p.getServerList()

		if len(l.L) > 1 && l.L[0] == s {
			l.L = l.cycleServer()
			p.saveServerList(l)
		}
	}
}

// NumNodes returns the estimated number of nodes according to the last Nomad
// Heartbeat.
func (p *RPCProxy) NumNodes() int {
	return p.numNodes
}

// NumServers takes out an internal "read lock" and returns the number of
// servers. numServers includes both healthy and unhealthy servers.
func (p *RPCProxy) NumServers() int {
	l := p.getServerList()
	return len(l.L)
}

// RebalanceServers shuffles the list of servers on this agent. The server
// at the front of the list is selected for the next RPC. RPC calls that
// fail for a particular server are rotated to the end of the list. This
// method reshuffles the list periodically in order to redistribute work
// across all known Nomad servers (i.e. guarantee that the order of servers
// in the server list is not positively correlated with the age of a server
// in the Nomad cluster). Periodically shuffling the server list prevents
// long-lived clients from fixating on long-lived servers.
//
// Unhealthy servers are removed from the server list during the next client
// heartbeat. Before the newly shuffled server list is saved, the new remote
// endpoint is tested to ensure it is responsive.
func (p *RPCProxy) RebalanceServers() {
	var serverListLocked bool
	p.serverListLock.Lock()
	serverListLocked = true
	defer func() {
		if serverListLocked {
			p.serverListLock.Unlock()
		}
	}()

	// Early abort if there is nothing to shuffle
	if (len(p.primaryServers.L) + len(p.backupServers.L)) < 2 {
		return
	}

	// Shuffle server lists independently
	p.primaryServers.shuffleServers()
	p.backupServers.shuffleServers()

	// Create a new merged serverList
	type targetServer struct {
		server *ServerEndpoint
		// 'p' == Primary Server
		// 's' == Secondary/Backup Server
		// 'b' == Both
		state byte
	}
	mergedList := make(map[EndpointKey]*targetServer, len(p.primaryServers.L)+len(p.backupServers.L))
	for _, s := range p.primaryServers.L {
		mergedList[*s.Key()] = &targetServer{server: s, state: 'p'}
	}
	for _, s := range p.backupServers.L {
		k := s.Key()
		_, found := mergedList[*k]
		if found {
			mergedList[*k].state = 'b'
		} else {
			mergedList[*k] = &targetServer{server: s, state: 's'}
		}
	}

	l := &serverList{L: make([]*ServerEndpoint, 0, len(mergedList))}
	for _, s := range p.primaryServers.L {
		l.L = append(l.L, s)
	}
	for _, v := range mergedList {
		if v.state != 's' {
			continue
		}
		l.L = append(l.L, v.server)
	}

	// Release the lock before we begin transition to operations on the
	// network timescale and attempt to ping servers. A copy of the
	// servers has been made at this point.
	p.serverListLock.Unlock()
	serverListLocked = false

	// Iterate through the shuffled server list to find an assumed
	// healthy server. NOTE: Do not iterate on the list directly because
	// this loop mutates the server list in-place.
	var foundHealthyServer bool
	for i := 0; i < len(l.L); i++ {
		// Always test the first server. Failed servers are cycled
		// and eventually removed from the list when Nomad heartbeats
		// detect the failed node.
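		// (The index i only bounds the number of attempts; the head
		// of the list is re-read on each pass because a failed server
		// is rotated to the back of the list below.)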
		selectedServer := l.L[0]

		ok, err := p.connPoolPinger.PingNomadServer(p.configInfo.Region(), p.configInfo.RPCMajorVersion(), selectedServer)
		if ok {
			foundHealthyServer = true
			break
		}
		p.logger.Printf(`[DEBUG] client.rpcproxy: pinging server "%s" failed: %s`, selectedServer.String(), err)

		l.L = l.cycleServer()
	}

	// If no healthy servers were found, sleep and wait for the admin to
	// join this node to a server and begin receiving heartbeats with an
	// updated list of Nomad servers. Or Consul will begin advertising a
	// new server in the nomad service (Nomad server service).
	if !foundHealthyServer {
		p.logger.Printf("[DEBUG] client.rpcproxy: No healthy servers during rebalance, aborting")
		return
	}

	// Verify that all servers are present. Reconcile will save the
	// final serverList.
	if p.reconcileServerList(l) {
		p.logger.Printf("[TRACE] client.rpcproxy: Rebalanced %d servers, next active server is %s", len(l.L), l.L[0].String())
	} else {
		// reconcileServerList failed because Nomad removed the
		// server that was at the front of the list and that had
		// successfully been Ping'ed. Between the Ping and
		// reconcile, a Nomad heartbeat removed the node.
		//
		// Instead of doing any heroics, "freeze in place" and
		// continue to use the existing connection until the next
		// rebalance occurs.
	}

	return
}

// reconcileServerList returns true when the first server in serverList
// (l) exists in the receiver's serverList (p). If true, the merged
// serverList (l) is stored as the receiver's serverList (p). Returns
// false if the first server in p does not exist in the passed in list (l)
// (i.e. it was removed by Nomad during a PingNomadServer() call). Newly
// added servers are appended to the list and other missing servers are
// removed from the list.
func (p *RPCProxy) reconcileServerList(l *serverList) bool {
	p.activatedListLock.Lock()
	defer p.activatedListLock.Unlock()

	// newServerList is a serverList that has been kept up-to-date with
	// join and leave events.
	newServerList := p.getServerList()

	// If a Nomad heartbeat removed all nodes, or there is no selected
	// server (zero nodes in serverList), abort early.
	if len(newServerList.L) == 0 || len(l.L) == 0 {
		return false
	}

	type targetServer struct {
		server *ServerEndpoint

		// 'b' == both
		// 'o' == original
		// 'n' == new
		state byte
	}
	mergedList := make(map[EndpointKey]*targetServer, len(l.L))
	for _, s := range l.L {
		mergedList[*s.Key()] = &targetServer{server: s, state: 'o'}
	}
	for _, s := range newServerList.L {
		k := s.Key()
		_, found := mergedList[*k]
		if found {
			mergedList[*k].state = 'b'
		} else {
			mergedList[*k] = &targetServer{server: s, state: 'n'}
		}
	}

	// Ensure the selected server has not been removed by a heartbeat
	selectedServerKey := l.L[0].Key()
	if v, found := mergedList[*selectedServerKey]; found && v.state == 'o' {
		return false
	}

	// Append any new servers and remove any old servers
	for k, v := range mergedList {
		switch v.state {
		case 'b':
			// Do nothing, server exists in both
		case 'o':
			// Server has been removed
			l.removeServerByKey(&k)
		case 'n':
			// Server added
			l.L = append(l.L, v.server)
		default:
			panic("unknown merge list state")
		}
	}

	p.saveServerList(*l)
	return true
}

// RemoveServer takes out an internal write lock and removes a server from
// the activated server list.
func (p *RPCProxy) RemoveServer(s *ServerEndpoint) {
	// Lock hierarchy protocol dictates serverListLock is acquired first.
	p.serverListLock.Lock()
	defer p.serverListLock.Unlock()

	p.activatedListLock.Lock()
	defer p.activatedListLock.Unlock()
	l := p.getServerList()

	k := s.Key()
	l.removeServerByKey(k)
	p.saveServerList(l)

	p.primaryServers.removeServerByKey(k)
	p.backupServers.removeServerByKey(k)
}

// refreshServerRebalanceTimer is only called once p.rebalanceTimer expires.
func (p *RPCProxy) refreshServerRebalanceTimer() time.Duration {
	l := p.getServerList()
	numServers := len(l.L)
	// Limit this connection's life based on the size (and health) of the
	// cluster. Never rebalance a connection more frequently than
	// connReuseLowWatermarkDuration, and make sure we never exceed
	// clusterWideRebalanceConnsPerSec operations/s across numLANMembers.
	clusterWideRebalanceConnsPerSec := float64(numServers * newRebalanceConnsPerSecPerServer)
	connReuseLowWatermarkDuration := clientRPCMinReuseDuration + lib.RandomStagger(clientRPCMinReuseDuration/clientRPCJitterFraction)
	numLANMembers := p.numNodes
	connRebalanceTimeout := lib.RateScaledInterval(clusterWideRebalanceConnsPerSec, connReuseLowWatermarkDuration, numLANMembers)

	p.rebalanceTimer.Reset(connRebalanceTimeout)
	return connRebalanceTimeout
}

// ResetRebalanceTimer resets the rebalance timer. This method exists for
// testing and should not be used directly.
func (p *RPCProxy) ResetRebalanceTimer() {
	p.activatedListLock.Lock()
	defer p.activatedListLock.Unlock()
	p.rebalanceTimer.Reset(clientRPCMinReuseDuration)
}

// ServerRPCAddrs returns one RPC Address per server
func (p *RPCProxy) ServerRPCAddrs() []string {
	l := p.getServerList()
	serverAddrs := make([]string, 0, len(l.L))
	for _, s := range l.L {
		serverAddrs = append(serverAddrs, s.Addr.String())
	}
	return serverAddrs
}
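
// A minimal wiring sketch (hypothetical; logger, shutdownCh, cfg, and pool
// are assumed to be supplied by the caller and are not defined in this
// package):
//
//	p := NewRPCProxy(logger, shutdownCh, cfg, pool)
//	go p.Run()                          // periodic shuffle/rebalance
//	p.AddPrimaryServer("10.0.0.1:4647") // seed from agent configuration
//	if s := p.FindServer(); s != nil {
//		if err := doRPC(s); err != nil { // doRPC is a hypothetical helper
//			p.NotifyFailedServer(s)
//		}
//	}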

// Run is used to start and manage the task of automatically shuffling and
// rebalancing the list of Nomad servers. This maintenance only happens
// periodically based on the expiration of the timer. Failed servers are
// automatically cycled to the end of the list. New servers are appended to
// the list. The order of the server list must be shuffled periodically to
// distribute load across all known and available Nomad servers.
func (p *RPCProxy) Run() {
	for {
		select {
		case <-p.rebalanceTimer.C:
			p.RebalanceServers()

			p.refreshServerRebalanceTimer()
		case <-p.shutdownCh:
			p.logger.Printf("[INFO] client.rpcproxy: shutting down")
			return
		}
	}
}

// RefreshServerLists is called when the Client receives an update from a
// Nomad Server. The response to a Nomad Client heartbeat contains a list of
// Nomad Servers that the Nomad Client should use for RPC requests.
// RefreshServerLists does not rebalance its serverLists (that is handled
// elsewhere via a periodic timer). New Nomad Servers learned via the
// heartbeat are appended to the RPCProxy's activated serverList. Servers
// that are no longer present in the Heartbeat are removed immediately from
// all server lists. Nomad Servers speaking a newer major or minor API
// version are filtered from the serverList.
func (p *RPCProxy) RefreshServerLists(servers []*structs.NodeServerInfo, numNodes int32, leaderRPCAddr string) error {
	// Merge all servers found in the response. Servers in the response
	// with newer API versions are filtered from the list. If the list
	// is missing an address found in the RPCProxy's server list, remove
	// it from the RPCProxy.

	p.serverListLock.Lock()
	defer p.serverListLock.Unlock()

	// Clear the backup server list when a heartbeat contains at least
	// one server.
	if len(servers) > 0 && len(p.backupServers.L) > 0 {
		p.backupServers.L = make([]*ServerEndpoint, 0, len(servers))
	}

	// 1) Create a map to reconcile the difference between
	// p.primaryServers and servers.
	type targetServer struct {
		server *ServerEndpoint

		// 'b' == both
		// 'o' == original
		// 'n' == new
		state byte
	}
	mergedPrimaryMap := make(map[EndpointKey]*targetServer, len(p.primaryServers.L)+len(servers))
	numOldServers := 0
	for _, s := range p.primaryServers.L {
		mergedPrimaryMap[*s.Key()] = &targetServer{server: s, state: 'o'}
		numOldServers++
	}
	numBothServers := 0
	var newServers bool
	for _, s := range servers {
		// Filter out servers using a newer API version. Prevent
		// spamming the logs every heartbeat.
		//
		// TODO(sean@): Move the logging throttle logic into a
		// dedicated logging package so RPCProxy does not have to
		// perform this accounting.
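		//
		// For example (hypothetical versions): a client speaking
		// v1.1 keeps servers advertising v1.0 or v1.1 and filters
		// out servers advertising v1.2 or v2.0.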
		if int32(p.configInfo.RPCMajorVersion()) < s.RPCMajorVersion ||
			(int32(p.configInfo.RPCMajorVersion()) == s.RPCMajorVersion &&
				int32(p.configInfo.RPCMinorVersion()) < s.RPCMinorVersion) {
			now := time.Now()
			t, ok := p.rpcAPIMismatchThrottle[s.RPCAdvertiseAddr]
			if ok && t.After(now) {
				continue
			}

			p.logger.Printf("[WARN] client.rpcproxy: API mismatch between client version (v%d.%d) and server version (v%d.%d), ignoring server %+q", p.configInfo.RPCMajorVersion(), p.configInfo.RPCMinorVersion(), s.RPCMajorVersion, s.RPCMinorVersion, s.RPCAdvertiseAddr)
			p.rpcAPIMismatchThrottle[s.RPCAdvertiseAddr] = now.Add(rpcAPIMismatchLogRate)
			continue
		}

		server, err := NewServerEndpoint(s.RPCAdvertiseAddr)
		if err != nil {
			p.logger.Printf("[WARN] client.rpcproxy: Unable to create a server from %+q: %v", s.RPCAdvertiseAddr, err)
			continue
		}

		// Nomad servers in different datacenters are automatically
		// added to the backup server list.
		if s.Datacenter != p.configInfo.Datacenter() {
			p.backupServers.L = append(p.backupServers.L, server)
			continue
		}

		k := server.Key()
		_, found := mergedPrimaryMap[*k]
		if found {
			mergedPrimaryMap[*k].state = 'b'
			numBothServers++
		} else {
			mergedPrimaryMap[*k] = &targetServer{server: server, state: 'n'}
			newServers = true
		}
	}

	// Short-circuit acquiring activatedListLock if nothing changed
	if !newServers && numOldServers == numBothServers {
		return nil
	}

	p.activatedListLock.Lock()
	defer p.activatedListLock.Unlock()
	newServerCfg := p.getServerList()
	for k, v := range mergedPrimaryMap {
		switch v.state {
		case 'b':
			// Do nothing, server exists in both
		case 'o':
			// Server has been removed

			// TODO(sean@): Teach Nomad servers how to remove
			// themselves from their heartbeat in order to
			// gracefully drain their clients over the next
			// cluster's max rebalanceTimer duration. Without
			// this enhancement, if a server is being shut down
			// and it is the first in serverList, the client will
			// fail its next RPC connection.
			p.primaryServers.removeServerByKey(&k)
			newServerCfg.removeServerByKey(&k)
		case 'n':
			// Server added. Append it to both lists
			// immediately. The server should only go into
			// active use in the event of a failure or after a
			// rebalance occurs.
			p.primaryServers.L = append(p.primaryServers.L, v.server)
			newServerCfg.L = append(newServerCfg.L, v.server)
		default:
			panic("unknown merge list state")
		}
	}

	p.numNodes = int(numNodes)
	p.leaderAddr = leaderRPCAddr
	p.saveServerList(newServerCfg)

	return nil
}
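
// A hypothetical caller-side sketch (variable names assumed, not taken from
// this package) of feeding a heartbeat response into the proxy:
//
//	var servers []*structs.NodeServerInfo // from the heartbeat response
//	var numNodes int32                    // cluster size reported by the server
//	var leaderRPCAddr string              // leader address reported by the server
//	if err := p.RefreshServerLists(servers, numNodes, leaderRPCAddr); err != nil {
//		log.Printf("[ERR] client.rpcproxy: refreshing server list: %v", err)
//	}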