github.imxd.top/hashicorp/consul@v1.4.5/agent/router/router.go

     1  package router
     2  
     3  import (
     4  	"fmt"
     5  	"log"
     6  	"sort"
     7  	"sync"
     8  
     9  	"github.com/hashicorp/consul/agent/metadata"
    10  	"github.com/hashicorp/consul/agent/structs"
    11  	"github.com/hashicorp/consul/lib"
    12  	"github.com/hashicorp/consul/types"
    13  	"github.com/hashicorp/serf/coordinate"
    14  	"github.com/hashicorp/serf/serf"
    15  )
    16  
    17  // Router keeps track of a set of network areas and their associated Serf
    18  // membership of Consul servers. It then indexes this information by
    19  // datacenter to provide healthy routes to servers in each datacenter.
    20  type Router struct {
    21  	// logger is used for diagnostic output.
    22  	logger *log.Logger
    23  
    24  	// localDatacenter has the name of the router's home datacenter. This is
    25  	// used to short-circuit RTT calculations for local servers.
    26  	localDatacenter string
    27  
    28  	// areas maps area IDs to structures holding information about that
    29  	// area.
    30  	areas map[types.AreaID]*areaInfo
    31  
    32  	// managers is an index from datacenter names to a list of server
    33  	// managers for that datacenter. This is used to quickly look up routes.
    34  	managers map[string][]*Manager
    35  
    36  	// routeFn is a hook to actually do the routing.
    37  	routeFn func(datacenter string) (*Manager, *metadata.Server, bool)
    38  
    39  	// isShutdown prevents adding new areas to the router after it has been
    40  	// shut down.
    41  	isShutdown bool
    42  
    43  	// This top-level lock covers all the internal state.
    44  	sync.RWMutex
    45  }
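
// Note on the two indexes above: each areaInfo keeps one Manager per
// datacenter reachable through that area, while Router.managers flattens those
// into a datacenter -> []*Manager index across all areas. A datacenter can
// therefore map to more than one Manager when it is reachable both over the
// WAN and over a user-defined network area.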
    46  
    47  // RouterSerfCluster is an interface wrapper around Serf in order to make this
    48  // easier to unit test.
    49  type RouterSerfCluster interface {
    50  	NumNodes() int
    51  	Members() []serf.Member
    52  	GetCoordinate() (*coordinate.Coordinate, error)
    53  	GetCachedCoordinate(name string) (*coordinate.Coordinate, bool)
    54  }
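
// fakeSerfCluster is a hypothetical, illustrative stub of RouterSerfCluster of
// the kind a unit test might supply instead of a real Serf instance. It is not
// used by this package and would normally live in a _test.go file; the type
// and field names are placeholders.
type fakeSerfCluster struct {
	members []serf.Member
	coords  map[string]*coordinate.Coordinate
}

func (c *fakeSerfCluster) NumNodes() int          { return len(c.members) }
func (c *fakeSerfCluster) Members() []serf.Member { return c.members }

func (c *fakeSerfCluster) GetCoordinate() (*coordinate.Coordinate, error) {
	// A fresh origin coordinate is good enough for an illustrative stub.
	return coordinate.NewCoordinate(coordinate.DefaultConfig()), nil
}

func (c *fakeSerfCluster) GetCachedCoordinate(name string) (*coordinate.Coordinate, bool) {
	coord, ok := c.coords[name]
	return coord, ok
}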
    55  
    56  // managerInfo holds a server manager for a datacenter along with its associated
    57  // shutdown channel.
    58  type managerInfo struct {
    59  	// manager is notified about servers for this datacenter.
    60  	manager *Manager
    61  
    62  	// shutdownCh is only given to this manager so we can shut it down when
    63  	// all servers for this datacenter are gone.
    64  	shutdownCh chan struct{}
    65  }
    66  
    67  // areaInfo holds information about a given network area.
    68  type areaInfo struct {
    69  	// cluster is the Serf instance for this network area.
    70  	cluster RouterSerfCluster
    71  
    72  	// pinger is used to ping servers in this network area when trying to
    73  	// find a new, healthy server to talk to.
    74  	pinger Pinger
    75  
    76  	// managers maps datacenter names to managers for that datacenter in
    77  	// this area.
    78  	managers map[string]*managerInfo
    79  
    80  	// useTLS specifies whether to use TLS when communicating within this network area.
    81  	useTLS bool
    82  }
    83  
    84  // NewRouter returns a new Router with the given configuration.
    85  func NewRouter(logger *log.Logger, localDatacenter string) *Router {
    86  	router := &Router{
    87  		logger:          logger,
    88  		localDatacenter: localDatacenter,
    89  		areas:           make(map[types.AreaID]*areaInfo),
    90  		managers:        make(map[string][]*Manager),
    91  	}
    92  
    93  	// Hook the direct route lookup by default.
    94  	router.routeFn = router.findDirectRoute
    95  
    96  	return router
    97  }
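
// exampleWiring is a hypothetical sketch (it is not called anywhere in this
// package) of how a caller typically sets up a Router: construct it with the
// local datacenter name, then register each network area along with its Serf
// cluster and Pinger. The area ID and datacenter name here are placeholders.
func exampleWiring(logger *log.Logger, cluster RouterSerfCluster, pinger Pinger) (*Router, error) {
	r := NewRouter(logger, "dc1")

	// Register the area; servers already in the cluster are added
	// immediately, and Serf events keep the router up to date afterwards.
	if err := r.AddArea(types.AreaID("wan"), cluster, pinger, true); err != nil {
		return nil, err
	}
	return r, nil
}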
    98  
    99  // Shutdown removes all areas from the router, which stops all their respective
   100  // managers. No new areas can be added after the router is shut down.
   101  func (r *Router) Shutdown() {
   102  	r.Lock()
   103  	defer r.Unlock()
   104  
   105  	for areaID, area := range r.areas {
   106  		for datacenter, info := range area.managers {
   107  			r.removeManagerFromIndex(datacenter, info.manager)
   108  			close(info.shutdownCh)
   109  		}
   110  
   111  		delete(r.areas, areaID)
   112  	}
   113  
   114  	r.isShutdown = true
   115  }
   116  
   117  // AddArea registers a new network area with the router.
   118  func (r *Router) AddArea(areaID types.AreaID, cluster RouterSerfCluster, pinger Pinger, useTLS bool) error {
   119  	r.Lock()
   120  	defer r.Unlock()
   121  
   122  	if r.isShutdown {
   123  		return fmt.Errorf("cannot add area, router is shut down")
   124  	}
   125  
   126  	if _, ok := r.areas[areaID]; ok {
   127  		return fmt.Errorf("area ID %q already exists", areaID)
   128  	}
   129  
   130  	area := &areaInfo{
   131  		cluster:  cluster,
   132  		pinger:   pinger,
   133  		managers: make(map[string]*managerInfo),
   134  		useTLS:   useTLS,
   135  	}
   136  	r.areas[areaID] = area
   137  
   138  	// Do an initial population of the manager so that we don't have to wait
   139  	// for events to fire. This lets us attempt to use all the known servers
   140  	// initially, and quickly mark any servers that we can't reach as
   141  	// failed.
   142  	for _, m := range cluster.Members() {
   143  		ok, parts := metadata.IsConsulServer(m)
   144  		if !ok {
   145  			r.logger.Printf("[WARN] consul: Non-server %q in server-only area %q",
   146  				m.Name, areaID)
   147  			continue
   148  		}
   149  
   150  		if err := r.addServer(area, parts); err != nil {
   151  			return fmt.Errorf("failed to add server %q to area %q: %v", m.Name, areaID, err)
   152  		}
   153  	}
   154  
   155  	return nil
   156  }
   157  
   158  // removeManagerFromIndex does cleanup to take a manager out of the index of
   159  // datacenters. This assumes the lock is already held for writing, and will
   160  // panic if the given manager isn't found.
   161  func (r *Router) removeManagerFromIndex(datacenter string, manager *Manager) {
   162  	managers := r.managers[datacenter]
   163  	for i := 0; i < len(managers); i++ {
   164  		if managers[i] == manager {
   165  			r.managers[datacenter] = append(managers[:i], managers[i+1:]...)
   166  			if len(r.managers[datacenter]) == 0 {
   167  				delete(r.managers, datacenter)
   168  			}
   169  			return
   170  		}
   171  	}
   172  	panic("managers index out of sync")
   173  }
   174  
   175  // TLSEnabled returns whether TLS is enabled for the given area ID.
   176  func (r *Router) TLSEnabled(areaID types.AreaID) (bool, error) {
   177  	r.RLock()
   178  	defer r.RUnlock()
   179  
   180  	area, ok := r.areas[areaID]
   181  	if !ok {
   182  		return false, fmt.Errorf("area ID %q does not exist", areaID)
   183  	}
   184  
   185  	return area.useTLS, nil
   186  }
   187  
   188  // RemoveArea removes an existing network area from the router.
   189  func (r *Router) RemoveArea(areaID types.AreaID) error {
   190  	r.Lock()
   191  	defer r.Unlock()
   192  
   193  	area, ok := r.areas[areaID]
   194  	if !ok {
   195  		return fmt.Errorf("area ID %q does not exist", areaID)
   196  	}
   197  
   198  	// Remove all of this area's managers from the index and shut them down.
   199  	for datacenter, info := range area.managers {
   200  		r.removeManagerFromIndex(datacenter, info.manager)
   201  		close(info.shutdownCh)
   202  	}
   203  
   204  	delete(r.areas, areaID)
   205  	return nil
   206  }
   207  
   208  // addServer does the work of AddServer once the write lock is held.
   209  func (r *Router) addServer(area *areaInfo, s *metadata.Server) error {
   210  	// Create the manager on the fly if this is the first time we've seen
   211  	// this datacenter, and add it to the index.
   212  	info, ok := area.managers[s.Datacenter]
   213  	if !ok {
   214  		shutdownCh := make(chan struct{})
   215  		manager := New(r.logger, shutdownCh, area.cluster, area.pinger)
   216  		info = &managerInfo{
   217  			manager:    manager,
   218  			shutdownCh: shutdownCh,
   219  		}
   220  		area.managers[s.Datacenter] = info
   221  
   222  		managers := r.managers[s.Datacenter]
   223  		r.managers[s.Datacenter] = append(managers, manager)
   224  		go manager.Start()
   225  	}
   226  
   227  	// If TLS is enabled for the area, set it on the server so the manager
   228  	// knows to use TLS when pinging it.
   229  	if area.useTLS {
   230  		s.UseTLS = true
   231  	}
   232  
   233  	info.manager.AddServer(s)
   234  	return nil
   235  }
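
// Manager lifecycle note: a Manager is created lazily by addServer the first
// time a datacenter is seen in a given area, its run loop is started with
// go manager.Start(), and it is stopped by closing its shutdownCh either when
// the last server for that datacenter is removed (RemoveServer) or when the
// whole area goes away (RemoveArea/Shutdown).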
   236  
   237  // AddServer should be called whenever a new server joins an area. This is
   238  // typically hooked into the Serf event handler for this area.
   239  func (r *Router) AddServer(areaID types.AreaID, s *metadata.Server) error {
   240  	r.Lock()
   241  	defer r.Unlock()
   242  
   243  	area, ok := r.areas[areaID]
   244  	if !ok {
   245  		return fmt.Errorf("area ID %q does not exist", areaID)
   246  	}
   247  	return r.addServer(area, s)
   248  }
   249  
   250  // RemoveServer should be called whenever a server is removed from an area. This
   251  // is typically hooked into the Serf event handler for this area.
   252  func (r *Router) RemoveServer(areaID types.AreaID, s *metadata.Server) error {
   253  	r.Lock()
   254  	defer r.Unlock()
   255  
   256  	area, ok := r.areas[areaID]
   257  	if !ok {
   258  		return fmt.Errorf("area ID %q does not exist", areaID)
   259  	}
   260  
   261  	// If the manager has already been removed, we just quietly exit. This
   262  	// can get called by Serf events, so the timing isn't totally
   263  	// deterministic.
   264  	info, ok := area.managers[s.Datacenter]
   265  	if !ok {
   266  		return nil
   267  	}
   268  	info.manager.RemoveServer(s)
   269  
   270  	// If this manager is empty then remove it so we don't accumulate cruft
   271  	// and waste time during request routing.
   272  	if num := info.manager.NumServers(); num == 0 {
   273  		r.removeManagerFromIndex(s.Datacenter, info.manager)
   274  		close(info.shutdownCh)
   275  		delete(area.managers, s.Datacenter)
   276  	}
   277  
   278  	return nil
   279  }
   280  
   281  // FailServer should be called whenever a server is marked as failed in an
   282  // area. This is typically hooked into the Serf event handler for this area,
   283  // as sketched below. We will immediately shift traffic away from this server,
   284  // but it will remain in the list of servers.
   285  func (r *Router) FailServer(areaID types.AreaID, s *metadata.Server) error {
   286  	r.RLock()
   287  	defer r.RUnlock()
   288  
   289  	area, ok := r.areas[areaID]
   290  	if !ok {
   291  		return fmt.Errorf("area ID %q does not exist", areaID)
   292  	}
   293  
   294  	// If the manager has already been removed, we just quietly exit. This
   295  	// can get called by Serf events, so the timing isn't totally
   296  	// deterministic.
   297  	info, ok := area.managers[s.Datacenter]
   298  	if !ok {
   299  		return nil
   300  	}
   301  
   302  	info.manager.NotifyFailedServer(s)
   303  	return nil
   304  }
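
// exampleMemberEventHook is a hypothetical, illustrative sketch of the kind of
// Serf event hook mentioned in the comments above; it is not used by this
// package. It dispatches member events for a single area to the router.
func exampleMemberEventHook(r *Router, areaID types.AreaID, e serf.MemberEvent) {
	for _, m := range e.Members {
		ok, parts := metadata.IsConsulServer(m)
		if !ok {
			continue
		}

		// Errors are ignored here to keep the sketch short; a real hook
		// would log them.
		switch e.EventType() {
		case serf.EventMemberJoin:
			_ = r.AddServer(areaID, parts)
		case serf.EventMemberLeave, serf.EventMemberReap:
			_ = r.RemoveServer(areaID, parts)
		case serf.EventMemberFailed:
			_ = r.FailServer(areaID, parts)
		}
	}
}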
   305  
   306  // FindRoute returns a healthy server with a route to the given datacenter. The
   307  // Boolean return parameter will indicate if a server was available. In some
   308  // cases this may return a best-effort unhealthy server that can be used for a
   309  // connection attempt. If any problem occurs with the given server, the caller
   310  // should feed that back to the manager associated with the server, which is
   311  // also returned, by calling NotifyFailedServer().
   312  func (r *Router) FindRoute(datacenter string) (*Manager, *metadata.Server, bool) {
   313  	return r.routeFn(datacenter)
   314  }
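
// routeAndCall is a hypothetical helper that sketches the contract described
// above: find a route, attempt the call, and feed any failure back to the
// returned manager so traffic shifts away from the bad server.
func routeAndCall(r *Router, datacenter string, call func(*metadata.Server) error) error {
	manager, server, ok := r.FindRoute(datacenter)
	if !ok {
		return fmt.Errorf("no route to datacenter %q", datacenter)
	}

	if err := call(server); err != nil {
		// Let the manager know so it rotates to a different server on the
		// next FindServer call.
		manager.NotifyFailedServer(server)
		return err
	}
	return nil
}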
   315  
   316  // findDirectRoute looks for a route to the given datacenter if it's directly
   317  // adjacent to the server.
   318  func (r *Router) findDirectRoute(datacenter string) (*Manager, *metadata.Server, bool) {
   319  	r.RLock()
   320  	defer r.RUnlock()
   321  
   322  	// Get the list of managers for this datacenter. This will usually just
   323  	// have one entry, but it's possible to have a user-defined area + WAN.
   324  	managers, ok := r.managers[datacenter]
   325  	if !ok {
   326  		return nil, nil, false
   327  	}
   328  
   329  	// Try each manager until we get a server.
   330  	for _, manager := range managers {
   331  		if manager.IsOffline() {
   332  			continue
   333  		}
   334  
   335  		if s := manager.FindServer(); s != nil {
   336  			return manager, s, true
   337  		}
   338  	}
   339  
   340  	// Didn't find a route (even via an unhealthy server).
   341  	return nil, nil, false
   342  }
   343  
   344  // GetDatacenters returns a list of datacenters known to the router, sorted by
   345  // name.
   346  func (r *Router) GetDatacenters() []string {
   347  	r.RLock()
   348  	defer r.RUnlock()
   349  
   350  	dcs := make([]string, 0, len(r.managers))
   351  	for dc := range r.managers {
   352  		dcs = append(dcs, dc)
   353  	}
   354  
   355  	sort.Strings(dcs)
   356  	return dcs
   357  }
   358  
   359  // datacenterSorter takes a list of DC names and a parallel vector of distances
   360  // and implements sort.Interface, keeping both structures coherent and sorting
   361  // by distance.
   362  type datacenterSorter struct {
   363  	Names []string
   364  	Vec   []float64
   365  }
   366  
   367  // See sort.Interface.
   368  func (n *datacenterSorter) Len() int {
   369  	return len(n.Names)
   370  }
   371  
   372  // See sort.Interface.
   373  func (n *datacenterSorter) Swap(i, j int) {
   374  	n.Names[i], n.Names[j] = n.Names[j], n.Names[i]
   375  	n.Vec[i], n.Vec[j] = n.Vec[j], n.Vec[i]
   376  }
   377  
   378  // See sort.Interface.
   379  func (n *datacenterSorter) Less(i, j int) bool {
   380  	return n.Vec[i] < n.Vec[j]
   381  }
   382  
   383  // GetDatacentersByDistance returns a list of datacenters known to the router,
   384  // sorted by median RTT from this server to the servers in each datacenter. If
   385  // there are multiple areas that reach a given datacenter, this will use the
   386  // lowest RTT for the sort.
   387  func (r *Router) GetDatacentersByDistance() ([]string, error) {
   388  	r.RLock()
   389  	defer r.RUnlock()
   390  
   391  	// Go through each area and aggregate the median RTT from the current
   392  	// server to the other servers in each datacenter.
   393  	dcs := make(map[string]float64)
   394  	for areaID, info := range r.areas {
   395  		index := make(map[string][]float64)
   396  		coord, err := info.cluster.GetCoordinate()
   397  		if err != nil {
   398  			return nil, err
   399  		}
   400  
   401  		for _, m := range info.cluster.Members() {
   402  			ok, parts := metadata.IsConsulServer(m)
   403  			if !ok {
   404  				r.logger.Printf("[WARN] consul: Non-server %q in server-only area %q",
   405  					m.Name, areaID)
   406  				continue
   407  			}
   408  
   409  			existing := index[parts.Datacenter]
   410  			if parts.Datacenter == r.localDatacenter {
   411  				// Everything in the local datacenter looks like zero RTT.
   412  				index[parts.Datacenter] = append(existing, 0.0)
   413  			} else {
   414  				// It's OK to get a nil coordinate back; ComputeDistance
   415  				// will put the RTT at positive infinity.
   416  				other, _ := info.cluster.GetCachedCoordinate(parts.Name)
   417  				rtt := lib.ComputeDistance(coord, other)
   418  				index[parts.Datacenter] = append(existing, rtt)
   419  			}
   420  		}
   421  
   422  		// Compute the median RTT between this server and the servers
   423  		// in each datacenter. We accumulate the lowest RTT to each DC
   424  		// in the master map, since a given DC might appear in multiple
   425  		// areas.
   426  		for dc, rtts := range index {
   427  			sort.Float64s(rtts)
   428  			rtt := rtts[len(rtts)/2]
   429  
   430  			current, ok := dcs[dc]
   431  			if !ok || (ok && rtt < current) {
   432  				dcs[dc] = rtt
   433  			}
   434  		}
   435  	}
   436  
   437  	// First sort by DC name, since we do a stable sort later.
   438  	names := make([]string, 0, len(dcs))
   439  	for dc := range dcs {
   440  		names = append(names, dc)
   441  	}
   442  	sort.Strings(names)
   443  
   444  	// Then stable sort by median RTT.
   445  	rtts := make([]float64, 0, len(dcs))
   446  	for _, dc := range names {
   447  		rtts = append(rtts, dcs[dc])
   448  	}
   449  	sort.Stable(&datacenterSorter{names, rtts})
   450  	return names, nil
   451  }
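
// As a worked example of the aggregation above: if the WAN area sees three
// dc2 servers with RTTs of 10ms, 20ms, and 30ms, then after sorting,
// rtts[len(rtts)/2] picks 20ms for dc2 (for an even number of samples this
// picks the upper of the two middle values rather than averaging them). If
// dc2 is also reachable through a user-defined area with a value of 15ms, the
// lower figure, 15ms, is what the final stable sort uses.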
   452  
   453  // GetDatacenterMaps returns a structure with the raw network coordinates of
   454  // each known server, organized by datacenter and network area.
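// The result contains one DatacenterMap entry per (area, datacenter) pair;
// servers that do not have a cached coordinate are omitted.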
   455  func (r *Router) GetDatacenterMaps() ([]structs.DatacenterMap, error) {
   456  	r.RLock()
   457  	defer r.RUnlock()
   458  
   459  	var maps []structs.DatacenterMap
   460  	for areaID, info := range r.areas {
   461  		index := make(map[string]structs.Coordinates)
   462  		for _, m := range info.cluster.Members() {
   463  			ok, parts := metadata.IsConsulServer(m)
   464  			if !ok {
   465  				r.logger.Printf("[WARN] consul: Non-server %q in server-only area %q",
   466  					m.Name, areaID)
   467  				continue
   468  			}
   469  
   470  			coord, ok := info.cluster.GetCachedCoordinate(parts.Name)
   471  			if ok {
   472  				entry := &structs.Coordinate{
   473  					Node:  parts.Name,
   474  					Coord: coord,
   475  				}
   476  				existing := index[parts.Datacenter]
   477  				index[parts.Datacenter] = append(existing, entry)
   478  			}
   479  		}
   480  
   481  		for dc, coords := range index {
   482  			entry := structs.DatacenterMap{
   483  				Datacenter:  dc,
   484  				AreaID:      areaID,
   485  				Coordinates: coords,
   486  			}
   487  			maps = append(maps, entry)
   488  		}
   489  	}
   490  	return maps, nil
   491  }