// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package gossip

import (
	"bytes"
	"context"
	"fmt"
	"net"
	"reflect"
	"strconv"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
	"github.com/cockroachdb/cockroach/pkg/gossip/resolver"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/rpc"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
	"github.com/cockroachdb/cockroach/pkg/util/netutil"
	"github.com/cockroachdb/cockroach/pkg/util/retry"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
	"github.com/gogo/protobuf/proto"
)

// TestGossipInfoStore verifies basic operation of a gossip instance's infostore.
func TestGossipInfoStore(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())
	clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond)
	rpcContext := rpc.NewInsecureTestingContext(clock, stopper)
	g := NewTest(1, rpcContext, rpc.NewServer(rpcContext), stopper, metric.NewRegistry(), zonepb.DefaultZoneConfigRef())
	slice := []byte("b")
	if err := g.AddInfo("s", slice, time.Hour); err != nil {
		t.Fatal(err)
	}
	if val, err := g.GetInfo("s"); !bytes.Equal(val, slice) || err != nil {
		t.Errorf("error fetching string: %v", err)
	}
	if _, err := g.GetInfo("s2"); err == nil {
		t.Errorf("expected error fetching nonexistent key \"s2\"")
	}
}
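
// exampleInfoStoreUsage is a minimal, illustrative sketch (not exercised by
// any test) of the AddInfo/GetInfo pattern verified above: values are stored
// under a string key with a TTL and fetched back by key. The function name,
// key, and value are hypothetical; everything else uses identifiers already
// present in this package.
func exampleInfoStoreUsage(g *Gossip) error {
	// Store a value under "example-key" for an hour.
	if err := g.AddInfo("example-key", []byte("example-value"), time.Hour); err != nil {
		return err
	}
	// Fetch it back; GetInfo returns an error for missing or expired keys.
	val, err := g.GetInfo("example-key")
	if err != nil {
		return err
	}
	if !bytes.Equal(val, []byte("example-value")) {
		return errors.New("unexpected value")
	}
	return nil
}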

// TestGossipMoveNode verifies that if a node is moved to a new address, it
// gets properly updated in gossip.
func TestGossipMoveNode(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())
	clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond)
	rpcContext := rpc.NewInsecureTestingContext(clock, stopper)
	g := NewTest(1, rpcContext, rpc.NewServer(rpcContext), stopper, metric.NewRegistry(), zonepb.DefaultZoneConfigRef())
	var nodes []*roachpb.NodeDescriptor
	for i := 1; i <= 3; i++ {
		node := &roachpb.NodeDescriptor{
			NodeID:  roachpb.NodeID(i),
			Address: util.MakeUnresolvedAddr("tcp", fmt.Sprintf("1.1.1.1:%d", i)),
		}
		if err := g.SetNodeDescriptor(node); err != nil {
			t.Fatalf("failed setting node descriptor %+v: %s", node, err)
		}
		nodes = append(nodes, node)
	}
	for _, node := range nodes {
		if val, err := g.GetNodeDescriptor(node.NodeID); err != nil {
			t.Fatal(err)
		} else if !proto.Equal(node, val) {
			t.Fatalf("expected node %+v, got %+v", node, val)
		}
	}

	// Move node 2 to the address of node 3.
	movedNode := nodes[1]
	replacedNode := nodes[2]
	movedNode.Address = replacedNode.Address
	if err := g.SetNodeDescriptor(movedNode); err != nil {
		t.Fatal(err)
	}

	testutils.SucceedsSoon(t, func() error {
		if val, err := g.GetNodeDescriptor(movedNode.NodeID); err != nil {
			return err
		} else if !proto.Equal(movedNode, val) {
			return fmt.Errorf("expected node %+v, got %+v", movedNode, val)
		}
		return nil
	})
}

func TestGossipGetNextBootstrapAddress(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	resolverSpecs := []string{
		"127.0.0.1:9000",
		"127.0.0.1:9001",
		"localhost:9004",
	}

	resolvers := []resolver.Resolver{}
	for _, rs := range resolverSpecs {
		resolver, err := resolver.NewResolver(rs)
		if err == nil {
			resolvers = append(resolvers, resolver)
		}
	}
	if len(resolvers) != 3 {
		t.Errorf("expected 3 resolvers; got %d", len(resolvers))
	}
	clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond)
	rpcContext := rpc.NewInsecureTestingContext(clock, stopper)
	server := rpc.NewServer(rpcContext)
	g := NewTest(0, nil, server, stopper, metric.NewRegistry(), zonepb.DefaultZoneConfigRef())
	g.setResolvers(resolvers)

	// Using specified resolvers, fetch bootstrap addresses 3 times
	// and verify the results match expected addresses.
	expAddresses := []string{
		"127.0.0.1:9000",
		"127.0.0.1:9001",
		"localhost:9004",
	}
	for i := 0; i < len(expAddresses); i++ {
		g.mu.Lock()
		if addr := g.getNextBootstrapAddressLocked(); addr == nil {
			t.Errorf("%d: unexpected nil addr when expecting %s", i, expAddresses[i])
		} else if addrStr := addr.String(); addrStr != expAddresses[i] {
			t.Errorf("%d: expected addr %s; got %s", i, expAddresses[i], addrStr)
		}
		g.mu.Unlock()
	}
}

func TestGossipLocalityResolver(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())
	clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond)
	rpcContext := rpc.NewInsecureTestingContext(clock, stopper)

	gossipLocalityAdvertiseList := roachpb.Locality{}
	tier := roachpb.Tier{}
	tier.Key = "zone"
	tier.Value = "1"

	tier2 := roachpb.Tier{}
	tier2.Key = "zone"
	tier2.Value = "2"

	gossipLocalityAdvertiseList.Tiers = append(gossipLocalityAdvertiseList.Tiers, tier)

	node1PrivateAddress := util.MakeUnresolvedAddr("tcp", "1.0.0.1")
	node2PrivateAddress := util.MakeUnresolvedAddr("tcp", "2.0.0.1")

	node1PublicAddressRPC := util.MakeUnresolvedAddr("tcp", "1.1.1.1:1")
	node2PublicAddressRPC := util.MakeUnresolvedAddr("tcp", "2.2.2.2:3")
	node2PublicAddressSQL := util.MakeUnresolvedAddr("tcp", "2.2.2.2:4")

	var node1LocalityList []roachpb.LocalityAddress
	nodeLocalityAddress := roachpb.LocalityAddress{}
	nodeLocalityAddress.Address = node1PrivateAddress
	nodeLocalityAddress.LocalityTier = tier

	nodeLocalityAddress2 := roachpb.LocalityAddress{}
	nodeLocalityAddress2.Address = node2PrivateAddress
	nodeLocalityAddress2.LocalityTier = tier2

	node1LocalityList = append(node1LocalityList, nodeLocalityAddress)
	node1LocalityList = append(node1LocalityList, nodeLocalityAddress2)

	var node2LocalityList []roachpb.LocalityAddress
	node2LocalityList = append(node2LocalityList, nodeLocalityAddress2)

	g := NewTestWithLocality(1, rpcContext, rpc.NewServer(rpcContext), stopper, metric.NewRegistry(), gossipLocalityAdvertiseList, zonepb.DefaultZoneConfigRef())
	node1 := &roachpb.NodeDescriptor{
		NodeID:          1,
		Address:         node1PublicAddressRPC,
		LocalityAddress: node1LocalityList,
	}
	node2 := &roachpb.NodeDescriptor{
		NodeID:          2,
		Address:         node2PublicAddressRPC,
		SQLAddress:      node2PublicAddressSQL,
		LocalityAddress: node2LocalityList,
	}

	if err := g.SetNodeDescriptor(node1); err != nil {
		t.Fatal(err)
	}
	if err := g.SetNodeDescriptor(node2); err != nil {
		t.Fatal(err)
	}

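	// The gossiper advertises locality tier zone=1. Node 1 lists a private
	// address for that tier, so address resolution should prefer it; node 2
	// lists an address only for zone=2, so resolution should fall back to
	// its public RPC (and SQL) addresses.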
	nodeAddress, err := g.GetNodeIDAddress(node1.NodeID)
	if err != nil {
		t.Error(err)
	}
	if *nodeAddress != node1PrivateAddress {
		t.Fatalf("expected: %s but got: %s address", node1PrivateAddress, *nodeAddress)
	}

	nodeAddress, err = g.GetNodeIDAddress(node2.NodeID)
	if err != nil {
		t.Error(err)
	}

	if *nodeAddress != node2PublicAddressRPC {
		t.Fatalf("expected: %s but got: %s address", node2PublicAddressRPC, *nodeAddress)
	}

	nodeAddressSQL, err := g.GetNodeIDSQLAddress(node2.NodeID)
	if err != nil {
		t.Error(err)
	}

	if *nodeAddressSQL != node2PublicAddressSQL {
		t.Fatalf("expected: %s but got: %s address", node2PublicAddressSQL, *nodeAddressSQL)
	}
}

func TestGossipRaceLogStatus(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())
	// Shared cluster ID by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()
	local := startGossip(clusterID, 1, stopper, t, metric.NewRegistry())

	local.mu.Lock()
	peer := startGossip(clusterID, 2, stopper, t, metric.NewRegistry())
	local.startClientLocked(&peer.mu.is.NodeAddr)
	local.mu.Unlock()

	// Race gossiping against LogStatus.
	gun := make(chan struct{})
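	// The unbuffered gun channel acts as a rendezvous: each send below fires
	// one LogStatus goroutine, which races against the AddInfo call and then
	// replies on the same channel before the next round begins.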
	for i := uint8(0); i < 10; i++ {
		go func() {
			<-gun
			local.LogStatus()
			gun <- struct{}{}
		}()
		gun <- struct{}{}
		if err := local.AddInfo(
			strconv.FormatUint(uint64(i), 10),
			[]byte{i},
			time.Hour,
		); err != nil {
			t.Fatal(err)
		}
		<-gun
	}
	close(gun)
}

// TestGossipOutgoingLimitEnforced verifies that a gossip node won't open more
// outgoing connections than it should. If the gossip implementation is racy
// with respect to opening outgoing connections, this may not fail every time
// it's run, but should fail very quickly if run under stress.
func TestGossipOutgoingLimitEnforced(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// This test has an implicit dependency on the maxPeers logic deciding that
	// maxPeers is 3 for a 5-node cluster, so let's go ahead and make that
	// explicit.
	maxPeers := maxPeers(5)
	if maxPeers > 3 {
		t.Fatalf("maxPeers(5)=%d, which is higher than this test's assumption", maxPeers)
	}
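	// A rough sketch of the assumed relationship (not the authoritative
	// implementation): maxPeers grows slowly with cluster size so that every
	// node stays within maxHops hops of every other, and is clamped below by
	// minPeers, along the lines of:
	//
	//	peers := int(math.Ceil(math.Cbrt(float64(nodeCount))))
	//	if peers < minPeers {
	//		peers = minPeers
	//	}
	//
	// For nodeCount = 5 the root is below 2, so the clamp dominates and the
	// result is minPeers, which the assertion above pins at 3.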

	// Shared cluster ID by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	local := startGossip(clusterID, 1, stopper, t, metric.NewRegistry())
	local.mu.Lock()
	localAddr := local.mu.is.NodeAddr
	local.mu.Unlock()
	var peers []*Gossip
	for i := 0; i < 4; i++ {
		// After creating a new node, join it to the first node to ensure that the
		// network is connected (and thus all nodes know each other's addresses)
		// before we start the actual test.
		newPeer := startGossip(clusterID, roachpb.NodeID(i+2), stopper, t, metric.NewRegistry())
		newPeer.mu.Lock()
		newPeer.startClientLocked(&localAddr)
		newPeer.mu.Unlock()
		peers = append(peers, newPeer)
	}

	// Wait until the network is at least mostly connected.
	testutils.SucceedsSoon(t, func() error {
		local.mu.Lock()
		defer local.mu.Unlock()
		if local.mu.incoming.len() == maxPeers {
			return nil
		}
		return fmt.Errorf("local.mu.incoming.len() = %d, want %d", local.mu.incoming.len(), maxPeers)
	})

	// Verify that we can't open more than maxPeers connections. We have to muck
	// with the infostore's data so that the other nodes will appear far enough
	// away to be worth opening a connection to.
	local.mu.Lock()
	err := local.mu.is.visitInfos(func(key string, i *Info) error {
		copy := *i
		copy.Hops = maxHops + 1
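		// Bump the timestamp so that addInfo treats the copy as fresher
		// than the original and accepts the replacement.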
		copy.Value.Timestamp.WallTime++
		return local.mu.is.addInfo(key, &copy)
	}, true /* deleteExpired */)
	local.mu.Unlock()
	if err != nil {
		t.Fatal(err)
	}
	for range peers {
		local.tightenNetwork(context.Background())
	}

	if outgoing := local.outgoing.gauge.Value(); outgoing > int64(maxPeers) {
		t.Errorf("outgoing nodeSet has %d connections; the max should be %d", outgoing, maxPeers)
	}
	local.clientsMu.Lock()
	if numClients := len(local.clientsMu.clients); numClients > maxPeers {
		t.Errorf("local gossip has %d clients; the max should be %d", numClients, maxPeers)
	}
	local.clientsMu.Unlock()
}

func TestGossipMostDistant(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

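	// connect opens a one-way gossip connection from one node to another by
	// starting a client against the target's advertised node address.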
	connect := func(from, to *Gossip) {
		to.mu.Lock()
		addr := to.mu.is.NodeAddr
		to.mu.Unlock()
		from.mu.Lock()
		from.startClientLocked(&addr)
		from.mu.Unlock()
	}

	mostDistant := func(g *Gossip) (roachpb.NodeID, uint32) {
		g.mu.Lock()
		distantNodeID, distantHops := g.mu.is.mostDistant(func(roachpb.NodeID) bool {
			return false
		})
		g.mu.Unlock()
		return distantNodeID, distantHops
	}

	const n = 10
	testCases := []struct {
		from, to int
	}{
		{0, n - 1}, // n1 connects to n10
		{n - 1, 0}, // n10 connects to n1
	}

	for _, c := range testCases {
		t.Run("", func(t *testing.T) {
			// Shared cluster ID by all gossipers (this ensures that the gossipers
			// don't talk to servers from unrelated tests by accident).
			clusterID := uuid.MakeV4()

			// Set up a gossip network of 10 nodes connected in a single line:
			//
			//   1 <- 2 <- 3 <- 4 <- 5 <- 6 <- 7 <- 8 <- 9 <- 10
			nodes := make([]*Gossip, n)
			for i := range nodes {
				nodes[i] = startGossip(clusterID, roachpb.NodeID(i+1), stopper, t, metric.NewRegistry())
				if i == 0 {
					continue
				}
				connect(nodes[i], nodes[i-1])
			}

			// Wait for n1 to determine that n10 is the most distant node.
			testutils.SucceedsSoon(t, func() error {
				g := nodes[0]
				distantNodeID, distantHops := mostDistant(g)
				if distantNodeID == 10 && distantHops == 9 {
					return nil
				}
				return fmt.Errorf("n%d: distantHops: %d from n%d", g.NodeID.Get(), distantHops, distantNodeID)
			})
			// Wait for the infos to be fully propagated.
			testutils.SucceedsSoon(t, func() error {
				infosCount := func(g *Gossip) int {
					g.mu.Lock()
					defer g.mu.Unlock()
					return len(g.mu.is.Infos)
				}
				count := infosCount(nodes[0])
				for _, g := range nodes[1:] {
					if tmp := infosCount(g); tmp != count {
						return fmt.Errorf("unexpected info count: %d != %d", tmp, count)
					}
				}
				return nil
			})

			// Connect the network in a loop. This will cut the distance to the most
			// distant node in half.
			log.Infof(context.Background(), "connecting from n%d to n%d", c.from, c.to)
			connect(nodes[c.from], nodes[c.to])

			// Wait for n1 to determine that the most distant node is now n6:
			// closing the loop cuts the maximum distance from 9 hops to 5.
			testutils.SucceedsSoon(t, func() error {
				g := nodes[0]
				g.mu.Lock()
				var buf bytes.Buffer
				_ = g.mu.is.visitInfos(func(key string, i *Info) error {
					if i.NodeID != 1 && IsNodeIDKey(key) {
						fmt.Fprintf(&buf, "n%d: hops=%d\n", i.NodeID, i.Hops)
					}
					return nil
				}, true /* deleteExpired */)
				g.mu.Unlock()

				distantNodeID, distantHops := mostDistant(g)
				if distantNodeID == 6 && distantHops == 5 {
					return nil
				}
				return fmt.Errorf("n%d: distantHops: %d from n%d\n%s",
					g.NodeID.Get(), distantHops, distantNodeID, buf.String())
			})
		})
	}
}

// TestGossipNoForwardSelf verifies that when a Gossip instance is full, it
// redirects clients elsewhere (in particular not to itself).
//
// NB: Stress testing this test really stresses the OS networking stack
// more than anything else. For example, on Linux it may quickly deplete
// the ephemeral port range (due to the TIME_WAIT state).
// On a box which only runs tests, this can be circumvented by running
//
//	sudo bash -c "echo 1 > /proc/sys/net/ipv4/tcp_tw_recycle"
//
// See https://vincent.bernat.im/en/blog/2014-tcp-time-wait-state-linux.html
// for details.
//
// On OSX, things similarly fall apart. See #7524 and #5218 for some discussion
// of this.
func TestGossipNoForwardSelf(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// Shared cluster ID by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	local := startGossip(clusterID, 1, stopper, t, metric.NewRegistry())

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Start one loopback client plus enough additional clients to fill the
	// incoming clients.
	peers := []*Gossip{local}
	local.server.mu.Lock()
	maxSize := local.server.mu.incoming.maxSize
	local.server.mu.Unlock()
	for i := 0; i < maxSize; i++ {
		peers = append(peers, startGossip(clusterID, roachpb.NodeID(i+2), stopper, t, metric.NewRegistry()))
	}

	for _, peer := range peers {
		c := newClient(log.AmbientContext{Tracer: tracing.NewTracer()}, local.GetNodeAddr(), makeMetrics())

		testutils.SucceedsSoon(t, func() error {
			conn, err := peer.rpcContext.GRPCUnvalidatedDial(c.addr.String()).Connect(ctx)
			if err != nil {
				return err
			}

			stream, err := NewGossipClient(conn).Gossip(ctx)
			if err != nil {
				return err
			}

			if err := c.requestGossip(peer, stream); err != nil {
				return err
			}

			// Wait until the server responds, so we know we're connected.
			_, err = stream.Recv()
			return err
		})
	}

	numClients := len(peers) * 2
	disconnectedCh := make(chan *client)

	// Start a few overflow peers and assert that they don't get forwarded to us
	// again.
	for i := 0; i < numClients; i++ {
		local.server.mu.Lock()
		maxSize := local.server.mu.incoming.maxSize
		local.server.mu.Unlock()
		peer := startGossip(clusterID, roachpb.NodeID(i+maxSize+2), stopper, t, metric.NewRegistry())

		for {
			localAddr := local.GetNodeAddr()
			c := newClient(log.AmbientContext{Tracer: tracing.NewTracer()}, localAddr, makeMetrics())
			peer.mu.Lock()
			c.startLocked(peer, disconnectedCh, peer.rpcContext, stopper, peer.rpcContext.NewBreaker(""))
			peer.mu.Unlock()

			disconnectedClient := <-disconnectedCh
			if disconnectedClient != c {
				t.Fatalf("expected %p to be disconnected, got %p", c, disconnectedClient)
			} else if c.forwardAddr == nil {
				// Under high load, clients sometimes fail to connect for reasons
				// unrelated to the test, so we need to permit some.
				t.Logf("node #%d: got nil forwarding address", peer.NodeID.Get())
				continue
			} else if *c.forwardAddr == *localAddr {
				t.Errorf("node #%d: got local's forwarding address", peer.NodeID.Get())
			}
			break
		}
	}
}

// TestGossipCullNetwork verifies that a client will be culled from
// the network periodically (at cullInterval duration intervals).
func TestGossipCullNetwork(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// Shared cluster ID by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	local := startGossip(clusterID, 1, stopper, t, metric.NewRegistry())
	local.SetCullInterval(5 * time.Millisecond)

	local.mu.Lock()
	for i := 0; i < minPeers; i++ {
		peer := startGossip(clusterID, roachpb.NodeID(i+2), stopper, t, metric.NewRegistry())
		local.startClientLocked(peer.GetNodeAddr())
	}
	local.mu.Unlock()

	const slowGossipDuration = time.Minute

	if err := retry.ForDuration(slowGossipDuration, func() error {
		if peers := len(local.Outgoing()); peers != minPeers {
			return errors.Errorf("%d of %d peers connected", peers, minPeers)
		}
		return nil
	}); err != nil {
		t.Fatalf("condition failed to evaluate within %s: %s", slowGossipDuration, err)
	}

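	// manage starts the gossip management loop which, among other duties,
	// closes ("culls") an outgoing connection once per cull interval so that
	// a potentially more useful peer can be tried instead.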
	local.manage()

	if err := retry.ForDuration(slowGossipDuration, func() error {
		// Verify that a client is closed within the cull interval.
		if peers := len(local.Outgoing()); peers != minPeers-1 {
			return errors.Errorf("%d of %d peers connected", peers, minPeers-1)
		}
		return nil
	}); err != nil {
		t.Fatalf("condition failed to evaluate within %s: %s", slowGossipDuration, err)
	}
}

func TestGossipOrphanedStallDetection(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// Shared cluster ID by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	local := startGossip(clusterID, 1, stopper, t, metric.NewRegistry())
	local.SetStallInterval(5 * time.Millisecond)

	// Make sure we have the sentinel to ensure that its absence is not the
	// cause of stall detection.
	if err := local.AddInfo(KeySentinel, nil, time.Hour); err != nil {
		t.Fatal(err)
	}

	peerStopper := stop.NewStopper()
	peer := startGossip(clusterID, 2, peerStopper, t, metric.NewRegistry())

	peerNodeID := peer.NodeID.Get()
	peerAddr := peer.GetNodeAddr()
	peerAddrStr := peerAddr.String()

	local.mu.Lock()
	local.startClientLocked(peerAddr)
	local.mu.Unlock()

	testutils.SucceedsSoon(t, func() error {
		for _, peerID := range local.Outgoing() {
			if peerID == peerNodeID {
				return nil
			}
		}
		return errors.Errorf("n%d not yet connected", peerNodeID)
	})

	testutils.SucceedsSoon(t, func() error {
		for _, resolver := range local.GetResolvers() {
			if resolver.Addr() == peerAddrStr {
				return nil
			}
		}
		return errors.Errorf("n%d descriptor not yet available", peerNodeID)
	})

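	// Start the bootstrap and management loops so that, once the peer is
	// stopped below, local detects the stall and can later reconnect via the
	// resolver recorded above.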
	local.bootstrap()
	local.manage()

	peerStopper.Stop(context.Background())

	testutils.SucceedsSoon(t, func() error {
		for _, peerID := range local.Outgoing() {
			if peerID == peerNodeID {
				return errors.Errorf("n%d still connected", peerNodeID)
			}
		}
		return nil
	})

	peerStopper = stop.NewStopper()
	defer peerStopper.Stop(context.Background())
	startGossipAtAddr(clusterID, peerNodeID, peerAddr, peerStopper, t, metric.NewRegistry())

	testutils.SucceedsSoon(t, func() error {
		for _, peerID := range local.Outgoing() {
			if peerID == peerNodeID {
				return nil
			}
		}
		return errors.Errorf("n%d not yet connected", peerNodeID)
	})
}

// TestGossipJoinTwoClusters verifies that a node can't participate in two
// separate clusters if two nodes from different clusters are specified as
// bootstrap hosts. Previously, this was allowed, because a node verifies the
// cluster ID only at startup: if, after joining the first cluster via that
// cluster's init node, the init node shuts down, the joining node would
// reconnect via its second bootstrap host and begin to participate
// [illegally] in another cluster.
func TestGossipJoinTwoClusters(t *testing.T) {
	defer leaktest.AfterTest(t)()

	const interval = 10 * time.Millisecond
	var stoppers []*stop.Stopper
	var g []*Gossip
	var clusterIDs []uuid.UUID
	var addrs []net.Addr

	// Create three gossip nodes, init the first two with no bootstrap
	// hosts, but unique cluster IDs. The third host has the first two
	// hosts as bootstrap hosts, but has the same cluster ID as the
	// first of its bootstrap hosts.
	for i := 0; i < 3; i++ {
		stopper := stop.NewStopper()
		stoppers = append(stoppers, stopper)
		defer func() {
			select {
			case <-stopper.ShouldQuiesce():
			default:
				stopper.Stop(context.Background())
			}
		}()

		var clusterID uuid.UUID
		switch i {
		case 0, 1:
			clusterID = uuid.MakeV4()
		case 2:
			clusterID = clusterIDs[0]
		}
		clusterIDs = append(clusterIDs, clusterID)
		clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond)
		rpcContext := rpc.NewInsecureTestingContextWithClusterID(clock, stopper, clusterID)

		server := rpc.NewServer(rpcContext)

		// Node IDs must be non-zero.
		gnode := NewTest(
			roachpb.NodeID(i+1), rpcContext, server, stopper, metric.NewRegistry(), zonepb.DefaultZoneConfigRef())
		g = append(g, gnode)
		gnode.SetStallInterval(interval)
		gnode.SetBootstrapInterval(interval)
		gnode.clusterID.Set(context.Background(), clusterIDs[i])

		ln, err := netutil.ListenAndServeGRPC(stopper, server, util.IsolatedTestAddr)
		if err != nil {
			t.Fatal(err)
		}
		addrs = append(addrs, ln.Addr())

		// Only the third node has resolvers.
		var resolvers []resolver.Resolver
		switch i {
		case 2:
			for j := 0; j < 2; j++ {
				resolver, err := resolver.NewResolver(addrs[j].String())
				if err != nil {
					t.Fatal(err)
				}
				resolvers = append(resolvers, resolver)
			}
		}
		gnode.Start(ln.Addr(), resolvers)
	}

	// Wait for connections.
	testutils.SucceedsSoon(t, func() error {
		// The first gossip node should have one gossip client address
		// in nodeMap once the third gossip node (the only one with
		// resolvers) has connected to it. The second gossip node should
		// have none.
		g[0].mu.Lock()
		defer g[0].mu.Unlock()
		if a, e := len(g[0].mu.nodeMap), 1; a != e {
			return errors.Errorf("expected %v to contain %d nodes, got %d", g[0].mu.nodeMap, e, a)
		}
		g[1].mu.Lock()
		defer g[1].mu.Unlock()
		if a, e := len(g[1].mu.nodeMap), 0; a != e {
			return errors.Errorf("expected %v to contain %d nodes, got %d", g[1].mu.nodeMap, e, a)
		}
		return nil
	})

	// Kill node 0 to force node 2 to bootstrap with node 1.
	stoppers[0].Stop(context.Background())
	// Wait for twice the bootstrap interval, and verify that
	// node 2 still has not connected to node 1.
	time.Sleep(2 * interval)

	g[1].mu.Lock()
	if a, e := len(g[1].mu.nodeMap), 0; a != e {
		t.Errorf("expected %v to contain %d nodes, got %d", g[1].mu.nodeMap, e, a)
	}
	g[1].mu.Unlock()
}

// Test propagation of gossip infos in both directions across an existing
// gossip connection.
func TestGossipPropagation(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// Shared cluster ID by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	local := startGossip(clusterID, 1, stopper, t, metric.NewRegistry())
	remote := startGossip(clusterID, 2, stopper, t, metric.NewRegistry())
	remote.mu.Lock()
	rAddr := remote.mu.is.NodeAddr
	remote.mu.Unlock()
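	// manage starts each node's connection-management loop; the client
	// connection from local to remote is established (and re-established
	// after failures) in the SucceedsSoon loop below.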
	local.manage()
	remote.manage()

	mustAdd := func(g *Gossip, key string, val []byte, ttl time.Duration) {
		if err := g.AddInfo(key, val, ttl); err != nil {
			t.Fatal(err)
		}
	}

	// Gossip a key on local and wait for it to show up on remote. This
	// guarantees we have an active local to remote client connection.
	mustAdd(local, "bootstrap", nil, 0)
	testutils.SucceedsSoon(t, func() error {
		c := local.findClient(func(c *client) bool { return c.addr.String() == rAddr.String() })
		if c == nil {
			// Restart the client connection in the loop. It might have failed due to
			// a heartbeat timeout.
			local.mu.Lock()
			local.startClientLocked(&rAddr)
			local.mu.Unlock()
			return fmt.Errorf("unable to find local to remote client")
		}
		_, err := remote.GetInfo("bootstrap")
		return err
	})

	// Add entries on both the local and remote nodes and verify they get propagated.
	mustAdd(local, "local", nil, time.Minute)
	mustAdd(remote, "remote", nil, time.Minute)

	getInfo := func(g *Gossip, key string) *Info {
		g.mu.RLock()
		defer g.mu.RUnlock()
		return g.mu.is.Infos[key]
	}

	var localInfo *Info
	var remoteInfo *Info
	testutils.SucceedsSoon(t, func() error {
		localInfo = getInfo(remote, "local")
		if localInfo == nil {
			return fmt.Errorf("local info not propagated")
		}
		remoteInfo = getInfo(local, "remote")
		if remoteInfo == nil {
			return fmt.Errorf("remote info not propagated")
		}
		return nil
	})

	// Replace the existing entries on both the local and remote nodes and verify
	// these new entries get propagated with updated timestamps.
	mustAdd(local, "local", nil, 2*time.Minute)
	mustAdd(remote, "remote", nil, 2*time.Minute)

	testutils.SucceedsSoon(t, func() error {
		if i := getInfo(remote, "local"); i == nil || reflect.DeepEqual(i, localInfo) {
			return fmt.Errorf("new local info not propagated:\n%v\n%v", i, localInfo)
		}
		if i := getInfo(local, "remote"); reflect.DeepEqual(i, remoteInfo) {
			return fmt.Errorf("new remote info not propagated:\n%v\n%v", i, remoteInfo)
		}
		return nil
	})
}

// Test whether an info that was generated by a prior incarnation of a
// server can correctly be propagated back to that originating server.
// Consider the scenario:
//
// n1: decommissioned
// n2: gossip node-liveness:1
// n3: node-liveness range lease acquired (does not gossip node-liveness:1
//     record because it is unchanged)
// n2: restarted
//   - connects as gossip client to n3
//   - sends a batch of gossip records to n3
//   - n3 responds without sending node-liveness:1 because its
//     OrigStamp is less than the highwater stamp from n2
func TestGossipLoopbackInfoPropagation(t *testing.T) {
	defer leaktest.AfterTest(t)()
	t.Skipf("#34494")
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// Shared cluster ID by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	local := startGossip(clusterID, 1, stopper, t, metric.NewRegistry())
	remote := startGossip(clusterID, 2, stopper, t, metric.NewRegistry())
	remote.mu.Lock()
	rAddr := remote.mu.is.NodeAddr
	remote.mu.Unlock()
	local.manage()
	remote.manage()

	// Add a gossip info for "foo" on remote that was generated by local. This
	// simulates what happens if local were to gossip an info and later restart
	// without ever gossiping that info again.
	func() {
		local.mu.Lock()
		defer local.mu.Unlock()
		remote.mu.Lock()
		defer remote.mu.Unlock()
		// NB: replacing local.mu.is.newInfo with remote.mu.is.newInfo allows "foo"
		// to be propagated.
		if err := remote.mu.is.addInfo("foo", local.mu.is.newInfo(nil, 0)); err != nil {
			t.Fatal(err)
		}
	}()

	// Add an info to local so that it has a highwater timestamp that is newer
	// than the info we added to remote. NB: commenting out this line allows
	// "foo" to be propagated.
	if err := local.AddInfo("bar", nil, 0); err != nil {
		t.Fatal(err)
	}

	// Start a client connection to the remote node.
	local.mu.Lock()
	local.startClientLocked(&rAddr)
	local.mu.Unlock()

	getInfo := func(g *Gossip, key string) *Info {
		g.mu.RLock()
		defer g.mu.RUnlock()
		return g.mu.is.Infos[key]
	}

	testutils.SucceedsSoon(t, func() error {
		if getInfo(remote, "bar") == nil {
			return fmt.Errorf("bar not propagated")
		}
		if getInfo(local, "foo") == nil {
			return fmt.Errorf("foo not propagated")
		}
		return nil
	})
}