github.imxd.top/hashicorp/consul@v1.4.5/agent/router/manager.go

     1  // Package router provides a Manager interface for managing
     2  // metadata.Server objects.  The router package manages servers from a Consul
     3  // client's perspective (i.e. a list of servers that a client talks with for
     4  // RPCs).  The router package does not provide any API guarantees and should
     5  // be called only by `hashicorp/consul`.
     6  package router
     7  
     8  import (
     9  	"log"
    10  	"math/rand"
    11  	"net"
    12  	"sync"
    13  	"sync/atomic"
    14  	"time"
    15  
    16  	"github.com/hashicorp/consul/agent/metadata"
    17  	"github.com/hashicorp/consul/lib"
    18  )
    19  
    20  const (
    21  	// clientRPCJitterFraction determines the amount of jitter added to
    22  	// clientRPCMinReuseDuration before a connection is expired and a new
    23  	// connection is established in order to rebalance load across consul
    24  	// servers.  The cluster-wide number of connections per second from
    25  	// rebalancing is applied after this jitter to ensure the CPU impact
    26  	// is always finite.  See newRebalanceConnsPerSecPerServer's comment
    27  	// for additional commentary.
    28  	//
    29  	// For example, in a 10K consul cluster with 5x servers, this default
    30  	// averages out to ~13 new connections from rebalancing per server
    31  	// per second (each connection is reused for 120s to 180s).
    32  	clientRPCJitterFraction = 2
    33  
    34  	// clientRPCMinReuseDuration controls the minimum amount of time RPC
    35  	// queries are sent over an established connection to a single server
    36  	clientRPCMinReuseDuration = 120 * time.Second
    37  
    38  	// Limit the number of new connections a server receives per second
    39  	// for connection rebalancing.  This limit caps the load caused by
    40  	// continual rebalancing efforts when a cluster is in equilibrium.  A
    41  	// lower value comes at the cost of increased recovery time after a
    42  	// partition.  This parameter begins to take effect when there are
    43  	// more than ~48K clients querying 5x servers or at lower server
    44  	// values when there is a partition.
    45  	//
    46  	// For example, in a 100K consul cluster with 5x servers, it will
    47  	// take ~5min for all servers to rebalance their connections.  If
    48  	// 99,995 agents are in the minority talking to only one server, it
    49  	// will take ~26min for all servers to rebalance.  A 10K cluster in
    50  	// the same scenario will take ~2.6min to rebalance.
    51  	newRebalanceConnsPerSecPerServer = 64
    52  )
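        // A rough back-of-the-envelope check of the figures above (an
        // illustrative sketch only, not used by the package): with a 120s minimum
        // reuse duration and a jitter fraction of 2, the average connection lives
        // ~150s, so for 10,000 clients spread across 5 servers:
        //
        //	newConnsPerServerPerSec := 10000.0 / 150.0 / 5.0 // ≈ 13.3
        //
        // which matches the ~13 new connections per server per second quoted above.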
    53  
    54  // ManagerSerfCluster is an interface wrapper around Serf in order to make this
    55  // easier to unit test.
    56  type ManagerSerfCluster interface {
    57  	NumNodes() int
    58  }
    59  
    60  // Pinger is an interface wrapping client.ConnPool to prevent a cyclic import
    61  // dependency.
    62  type Pinger interface {
    63  	Ping(dc string, addr net.Addr, version int, useTLS bool) (bool, error)
    64  }
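        // Any type with a matching Ping method satisfies this interface, which is
        // what makes the Manager testable without a real connection pool.  A
        // minimal, hypothetical stub for unit tests might look like:
        //
        //	type fakePinger struct{ ok bool }
        //
        //	func (p fakePinger) Ping(dc string, addr net.Addr, version int, useTLS bool) (bool, error) {
        //		return p.ok, nil
        //	}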
    65  
    66  // serverList is a local copy of the struct used to maintain the list of
    67  // Consul servers used by Manager.
    68  //
    69  // NOTE(sean@): We are explicitly relying on the fact that serverList will
    70  // be copied onto the stack.  Please keep this structure light.
    71  type serverList struct {
    72  	// servers tracks the locally known servers.  List membership is
    73  	// maintained by Serf.
    74  	servers []*metadata.Server
    75  }
    76  
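        // Manager maintains a client-side view of the known Consul servers: it
        // hands out the currently preferred server (FindServer), rotates servers
        // that fail (NotifyFailedServer), and periodically reshuffles the list to
        // spread RPC load across the cluster (RebalanceServers, driven by Start).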
    77  type Manager struct {
    78  	// listValue manages the atomic load/store of a Manager's serverList
    79  	listValue atomic.Value
    80  	listLock  sync.Mutex
    81  
    82  	// rebalanceTimer controls the duration of the rebalance interval
    83  	rebalanceTimer *time.Timer
    84  
    85  	// shutdownCh is a copy of the channel in consul.Client
    86  	shutdownCh chan struct{}
    87  
    88  	logger *log.Logger
    89  
    90  	// clusterInfo is used to estimate the approximate number of nodes in
    91  	// a cluster and limit the rate at which it rebalances server
    92  	// connections.  ManagerSerfCluster is an interface that wraps serf.
    93  	clusterInfo ManagerSerfCluster
    94  
    95  	// connPoolPinger is used to test the health of a server in the
    96  	// connection pool.  Pinger is an interface that wraps
    97  	// client.ConnPool.
    98  	connPoolPinger Pinger
    99  
   100  	// notifyFailedBarrier acts as a barrier to prevent queuing behind
   101  	// listLock and acts as a TryLock().
   102  	notifyFailedBarrier int32
   103  
   104  	// offline is used to indicate that there are no servers, or that all
   105  	// known servers have failed the ping test.
   106  	offline int32
   107  }
   108  
   109  // AddServer takes out an internal write lock and adds a new server.  If the
   110  // server is not known, it is appended to the list.  The new server will
   111  // begin seeing use after the rebalance timer fires or enough servers fail
   112  // organically.  If the server is already known, its entry is overwritten
   113  // with the new details (e.g. to pick up a new server version).
   114  func (m *Manager) AddServer(s *metadata.Server) {
   115  	m.listLock.Lock()
   116  	defer m.listLock.Unlock()
   117  	l := m.getServerList()
   118  
   119  	// Check if this server is known
   120  	found := false
   121  	for idx, existing := range l.servers {
   122  		if existing.Name == s.Name {
   123  			newServers := make([]*metadata.Server, len(l.servers))
   124  			copy(newServers, l.servers)
   125  
   126  			// Overwrite the existing server details in order to
   127  			// possibly update metadata (e.g. server version)
   128  			newServers[idx] = s
   129  
   130  			l.servers = newServers
   131  			found = true
   132  			break
   133  		}
   134  	}
   135  
   136  	// Add to the list if not known
   137  	if !found {
   138  		newServers := make([]*metadata.Server, len(l.servers), len(l.servers)+1)
   139  		copy(newServers, l.servers)
   140  		newServers = append(newServers, s)
   141  		l.servers = newServers
   142  	}
   143  
   144  	// Assume we are no longer offline since we've just seen a new server.
   145  	atomic.StoreInt32(&m.offline, 0)
   146  
   147  	// Start using this list of servers.
   148  	m.saveServerList(l)
   149  }
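        // Sketch of the expected caller (an assumption based on the package
        // comment, not an API guarantee): the Consul client feeds Serf
        // member-join events into the Manager roughly like so, where member is a
        // serf.Member:
        //
        //	if ok, srv := metadata.IsConsulServer(member); ok {
        //		m.AddServer(srv)
        //	}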
   150  
   151  // cycleServer returns a new list of servers that has dequeued the first
   152  // server and enqueued it at the end of the list (e.g. [s1, s2, s3] becomes
   153  // [s2, s3, s1]).  cycleServer assumes the caller is holding the listLock.
   154  // It does not test or ping the next server inline.  cycleServer may be
   155  // called when the environment has just entered an unhealthy situation and
   156  // blocking on a server test is less desirable than just returning the next
   157  // server in the firing line.  If the next server fails, it will fail fast
   158  // enough and cycleServer will be called again.
   159  func (l *serverList) cycleServer() (servers []*metadata.Server) {
   160  	numServers := len(l.servers)
   161  	if numServers < 2 {
   162  		return l.servers // No action required; return the list unchanged
   163  	}
   164  
   165  	newServers := make([]*metadata.Server, 0, numServers)
   166  	newServers = append(newServers, l.servers[1:]...)
   167  	newServers = append(newServers, l.servers[0])
   168  
   169  	return newServers
   170  }
   171  
   172  // removeServerByKey performs an inline removal of the first matching server
   173  func (l *serverList) removeServerByKey(targetKey *metadata.Key) {
   174  	for i, s := range l.servers {
   175  		if targetKey.Equal(s.Key()) {
   176  			copy(l.servers[i:], l.servers[i+1:])
   177  			l.servers[len(l.servers)-1] = nil
   178  			l.servers = l.servers[:len(l.servers)-1]
   179  			return
   180  		}
   181  	}
   182  }
   183  
   184  // shuffleServers performs an in-place Fisher-Yates shuffle of the server list
   185  func (l *serverList) shuffleServers() {
   186  	for i := len(l.servers) - 1; i > 0; i-- {
   187  		j := rand.Int31n(int32(i + 1))
   188  		l.servers[i], l.servers[j] = l.servers[j], l.servers[i]
   189  	}
   190  }
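        // Note that shuffleServers relies on the process-wide math/rand source;
        // the agent is assumed to seed it elsewhere (e.g. lib.SeedMathRand),
        // otherwise the shuffle order would repeat across restarts.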
   191  
   192  // IsOffline checks to see if all the known servers have failed their ping
   193  // test during the last rebalance.
   194  func (m *Manager) IsOffline() bool {
   195  	offline := atomic.LoadInt32(&m.offline)
   196  	return offline == 1
   197  }
   198  
   199  // FindServer takes out an internal "read lock" and searches through the list
   200  // of servers to find a "healthy" server.  If the server is actually
   201  // unhealthy, we rely on Serf to detect this and remove the node from the
   202  // server list.  If the server at the front of the list has failed or fails
   203  // during an RPC call, it is rotated to the end of the list.  If there are no
   204  // servers available, return nil.
   205  func (m *Manager) FindServer() *metadata.Server {
   206  	l := m.getServerList()
   207  	numServers := len(l.servers)
   208  	if numServers == 0 {
   209  		m.logger.Printf("[WARN] manager: No servers available")
   210  		return nil
   211  	}
   212  
   213  	// Return whatever is at the front of the list because it is
   214  	// assumed to be the oldest in the server list (unless -
   215  	// hypothetically - the server list was rotated right after a
   216  	// server was added).
   217  	return l.servers[0]
   218  }
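        // Sketch of the intended call pattern (illustrative; doRPC and
        // errNoServers are hypothetical placeholders, not part of this package):
        //
        //	srv := m.FindServer()
        //	if srv == nil {
        //		return errNoServers
        //	}
        //	if err := doRPC(srv, args, reply); err != nil {
        //		m.NotifyFailedServer(srv) // rotate it to the back of the list
        //	}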
   219  
   220  // getServerList is a convenience method which hides the locking semantics
   221  // of atomic.Value from the caller.
   222  func (m *Manager) getServerList() serverList {
   223  	return m.listValue.Load().(serverList)
   224  }
   225  
   226  // saveServerList is a convenience method which hides the locking semantics
   227  // of atomic.Value from the caller.
   228  func (m *Manager) saveServerList(l serverList) {
   229  	m.listValue.Store(l)
   230  }
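        // Reads of the server list are therefore lock-free: getServerList returns
        // a copy of the small serverList struct stored in the atomic.Value.
        // Writers are expected to hold listLock, build a fresh slice, and publish
        // it with saveServerList, which is why AddServer and RemoveServer copy the
        // slice rather than mutating it in place.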
   231  
   232  // New is the only way to safely create a new Manager struct.
   233  func New(logger *log.Logger, shutdownCh chan struct{}, clusterInfo ManagerSerfCluster, connPoolPinger Pinger) (m *Manager) {
   234  	m = new(Manager)
   235  	m.logger = logger
   236  	m.clusterInfo = clusterInfo       // can't pass *consul.Client: import cycle
   237  	m.connPoolPinger = connPoolPinger // can't pass *consul.ConnPool: import cycle
   238  	m.rebalanceTimer = time.NewTimer(clientRPCMinReuseDuration)
   239  	m.shutdownCh = shutdownCh
   240  	atomic.StoreInt32(&m.offline, 1)
   241  
   242  	l := serverList{}
   243  	l.servers = make([]*metadata.Server, 0)
   244  	m.saveServerList(l)
   245  	return m
   246  }
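        // Minimal usage sketch (assuming the caller already has a logger, a
        // shutdown channel, a Serf wrapper, and a connection pool):
        //
        //	m := New(logger, shutdownCh, serfCluster, connPool)
        //	go m.Start()
        //	// ... later, per RPC:
        //	srv := m.FindServer()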
   247  
   248  // NotifyFailedServer marks the passed in server as "failed" by rotating it
   249  // to the end of the server list.
   250  func (m *Manager) NotifyFailedServer(s *metadata.Server) {
   251  	l := m.getServerList()
   252  
   253  	// If the server being failed is not the first server on the list,
   254  	// this is a noop.  If, however, the server is failed and first on
   255  	// the list, acquire the lock, retest, and take the penalty of moving
   256  	// the server to the end of the list.
   257  
   258  	// Only rotate the server list when there is more than one server
   259  	if len(l.servers) > 1 && l.servers[0].Name == s.Name &&
   260  		// Use atomic.CAS to emulate a TryLock().
   261  		atomic.CompareAndSwapInt32(&m.notifyFailedBarrier, 0, 1) {
   262  		defer atomic.StoreInt32(&m.notifyFailedBarrier, 0)
   263  
   264  		// Grab a lock, retest, and take the hit of cycling the first
   265  		// server to the end.
   266  		m.listLock.Lock()
   267  		defer m.listLock.Unlock()
   268  		l = m.getServerList()
   269  
   270  		if len(l.servers) > 1 && l.servers[0].Name == s.Name {
   271  			l.servers = l.cycleServer()
   272  			m.saveServerList(l)
   273  			m.logger.Printf(`[DEBUG] manager: cycled away from server "%s"`, s.Name)
   274  		}
   275  	}
   276  }
   277  
   278  // NumServers takes out an internal "read lock" and returns the number of
   279  // servers.  The count includes both healthy and unhealthy servers.
   280  func (m *Manager) NumServers() int {
   281  	l := m.getServerList()
   282  	return len(l.servers)
   283  }
   284  
   285  // RebalanceServers shuffles the list of servers on this Manager.  The server
   286  // at the front of the list is selected for the next RPC.  RPC calls that
   287  // fail for a particular server are rotated to the end of the list.  This
   288  // method reshuffles the list periodically in order to redistribute work
   289  // across all known consul servers (i.e. guarantee that the order of servers
   290  // in the server list is not positively correlated with the age of a server
   291  // in the Consul cluster).  Periodically shuffling the server list prevents
   292  // long-lived clients from fixating on long-lived servers.
   293  //
   294  // Unhealthy servers are removed when Serf notices the server has been
   295  // deregistered.  Before the newly shuffled server list is saved, the new
   296  // remote endpoint is tested to ensure it is responsive.
   297  func (m *Manager) RebalanceServers() {
   298  	// Obtain a copy of the current serverList
   299  	l := m.getServerList()
   300  
   301  	// Shuffle servers so we have a chance of picking a new one.
   302  	l.shuffleServers()
   303  
   304  	// Iterate through the shuffled server list to find an assumed
   305  	// healthy server.  NOTE: Do not iterate on the list directly because
   306  	// this loop mutates the server list in-place.
   307  	var foundHealthyServer bool
   308  	for i := 0; i < len(l.servers); i++ {
   309  		// Always test the first server.  Failed servers are cycled
   310  		// while Serf detects the node has failed.
   311  		srv := l.servers[0]
   312  
   313  		ok, err := m.connPoolPinger.Ping(srv.Datacenter, srv.Addr, srv.Version, srv.UseTLS)
   314  		if ok {
   315  			foundHealthyServer = true
   316  			break
   317  		}
   318  		m.logger.Printf(`[DEBUG] manager: pinging server "%s" failed: %s`, srv, err)
   319  		l.servers = l.cycleServer()
   320  	}
   321  
   322  	// Update the offline status.  If no healthy servers were found, bail
   323  	// out and wait for Serf to make the world a happy place again.
   324  	if foundHealthyServer {
   325  		atomic.StoreInt32(&m.offline, 0)
   326  	} else {
   327  		atomic.StoreInt32(&m.offline, 1)
   328  		m.logger.Printf("[DEBUG] manager: No healthy servers during rebalance, aborting")
   329  		return
   330  	}
   331  
   332  	// Verify that all servers are present
   333  	if m.reconcileServerList(&l) {
   334  		m.logger.Printf("[DEBUG] manager: Rebalanced %d servers, next active server is %s", len(l.servers), l.servers[0].String())
   335  	} else {
   336  		// reconcileServerList failed because Serf removed the server
   337  		// that was at the front of the list that had successfully
   338  		// been Ping'ed.  Between the Ping and reconcile, a Serf
   339  		// event had shown up removing the node.
   340  		//
   341  		// Instead of doing any heroics, "freeze in place" and
   342  		// continue to use the existing connection until the next
   343  		// rebalance occurs.
   344  	}
   345  
   346  	return
   347  }
   348  
   349  // reconcileServerList returns true when the first server in the passed-in
   350  // serverList still exists in the Manager's current serverList.  If true, the
   351  // merged serverList is stored as the Manager's serverList.  Returns false if
   352  // the first server does not exist in the current list (i.e. it was removed
   353  // by Serf during a PingConsulServer() call).  Newly added servers are
   354  // appended to the list and other missing servers are removed from the list.
   355  func (m *Manager) reconcileServerList(l *serverList) bool {
   356  	m.listLock.Lock()
   357  	defer m.listLock.Unlock()
   358  
   359  	// newServerCfg is a serverList that has been kept up to date with
   360  	// Serf node join and node leave events.
   361  	newServerCfg := m.getServerList()
   362  
   363  	// If Serf has removed all nodes, or there is no selected server
   364  	// (zero nodes in serverList), abort early.
   365  	if len(newServerCfg.servers) == 0 || len(l.servers) == 0 {
   366  		return false
   367  	}
   368  
   369  	type targetServer struct {
   370  		server *metadata.Server
   371  
   372  		//   'b' == both
   373  		//   'o' == original
   374  		//   'n' == new
   375  		state byte
   376  	}
   377  	mergedList := make(map[metadata.Key]*targetServer, len(l.servers))
   378  	for _, s := range l.servers {
   379  		mergedList[*s.Key()] = &targetServer{server: s, state: 'o'}
   380  	}
   381  	for _, s := range newServerCfg.servers {
   382  		k := s.Key()
   383  		_, found := mergedList[*k]
   384  		if found {
   385  			mergedList[*k].state = 'b'
   386  		} else {
   387  			mergedList[*k] = &targetServer{server: s, state: 'n'}
   388  		}
   389  	}
   390  
   391  	// Ensure the selected server has not been removed by Serf
   392  	selectedServerKey := l.servers[0].Key()
   393  	if v, found := mergedList[*selectedServerKey]; found && v.state == 'o' {
   394  		return false
   395  	}
   396  
   397  	// Append any new servers and remove any old servers
   398  	for k, v := range mergedList {
   399  		switch v.state {
   400  		case 'b':
   401  			// Do nothing, server exists in both
   402  		case 'o':
   403  			// Server has been removed
   404  			l.removeServerByKey(&k)
   405  		case 'n':
   406  			// Server added
   407  			l.servers = append(l.servers, v.server)
   408  		default:
   409  			panic("unknown merge list state")
   410  		}
   411  	}
   412  
   413  	m.saveServerList(*l)
   414  	return true
   415  }
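        // Worked example of the merge above (illustrative): if the pinged copy
        // held servers {A, B} and Serf's current view holds {B, C}, the merged map
        // ends up with A='o', B='b', C='n'.  A is removed, B is kept, and C is
        // appended, provided the selected server (the copy's first entry) was not
        // itself left in state 'o', in which case the function bails out first.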
   416  
   417  // RemoveServer takes out an internal write lock and removes a server from
   418  // the server list.
   419  func (m *Manager) RemoveServer(s *metadata.Server) {
   420  	m.listLock.Lock()
   421  	defer m.listLock.Unlock()
   422  	l := m.getServerList()
   423  
   424  	// Remove the server if known
   425  	for i := range l.servers {
   426  		if l.servers[i].Name == s.Name {
   427  			newServers := make([]*metadata.Server, 0, len(l.servers)-1)
   428  			newServers = append(newServers, l.servers[:i]...)
   429  			newServers = append(newServers, l.servers[i+1:]...)
   430  			l.servers = newServers
   431  
   432  			m.saveServerList(l)
   433  			return
   434  		}
   435  	}
   436  }
   437  
   438  // refreshServerRebalanceTimer is called only after m.rebalanceTimer expires.
   439  func (m *Manager) refreshServerRebalanceTimer() time.Duration {
   440  	l := m.getServerList()
   441  	numServers := len(l.servers)
   442  	// Limit this connection's life based on the size (and health) of the
   443  	// cluster.  Never rebalance a connection more frequently than
   444  	// connReuseLowWatermarkDuration, and make sure we never exceed
   445  	// clusterWideRebalanceConnsPerSec operations/s across numLANMembers.
   446  	clusterWideRebalanceConnsPerSec := float64(numServers * newRebalanceConnsPerSecPerServer)
   447  	connReuseLowWatermarkDuration := clientRPCMinReuseDuration + lib.RandomStagger(clientRPCMinReuseDuration/clientRPCJitterFraction)
   448  	numLANMembers := m.clusterInfo.NumNodes()
   449  	connRebalanceTimeout := lib.RateScaledInterval(clusterWideRebalanceConnsPerSec, connReuseLowWatermarkDuration, numLANMembers)
   450  
   451  	m.rebalanceTimer.Reset(connRebalanceTimeout)
   452  	return connRebalanceTimeout
   453  }
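        // Worked example (illustrative, using the constants above): with 5 servers
        // clusterWideRebalanceConnsPerSec is 5 * 64 = 320 conns/s, so a cluster of
        // 100,000 LAN members yields a rate-scaled interval of roughly
        // 100000 / 320 ≈ 312s (~5.2 minutes), and the interval is never allowed to
        // drop below the jittered reuse window of 120s-180s.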
   454  
   455  // ResetRebalanceTimer resets the rebalance timer.  This method exists for
   456  // testing and should not be used directly.
   457  func (m *Manager) ResetRebalanceTimer() {
   458  	m.listLock.Lock()
   459  	defer m.listLock.Unlock()
   460  	m.rebalanceTimer.Reset(clientRPCMinReuseDuration)
   461  }
   462  
   463  // Start is used to start and manage the task of automatically shuffling and
   464  // rebalancing the list of Consul servers.  This maintenance only happens
   465  // periodically based on the expiration of the timer.  Failed servers are
   466  // automatically cycled to the end of the list.  New servers are appended to
   467  // the list.  The order of the server list must be shuffled periodically to
   468  // distribute load across all known and available Consul servers.
   469  func (m *Manager) Start() {
   470  	for {
   471  		select {
   472  		case <-m.rebalanceTimer.C:
   473  			m.RebalanceServers()
   474  			m.refreshServerRebalanceTimer()
   475  
   476  		case <-m.shutdownCh:
   477  			m.logger.Printf("[INFO] manager: shutting down")
   478  			return
   479  		}
   480  	}
   481  }