github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/raft_transport_test.go

// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver_test

import (
	"context"
	"math/rand"
	"net"
	"reflect"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/rpc"
	"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
	"github.com/cockroachdb/cockroach/pkg/util/netutil"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
	"github.com/stretchr/testify/require"
	"go.etcd.io/etcd/raft/raftpb"
)

const channelServerBrokenRangeMessage = "channelServer broken range"

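// channelServer is a raft message handler for these tests: it funnels
// incoming requests into a channel so tests can inspect them, optionally
// sleeping first or rejecting traffic for a "broken" range (see the fields
// below). Handlers are registered with a transport via RaftTransport.Listen.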
type channelServer struct {
	ch       chan *kvserver.RaftMessageRequest
	maxSleep time.Duration

	// If non-zero, all messages to this range will return errors.
	brokenRange roachpb.RangeID
}

func newChannelServer(bufSize int, maxSleep time.Duration) channelServer {
	return channelServer{
		ch:       make(chan *kvserver.RaftMessageRequest, bufSize),
		maxSleep: maxSleep,
	}
}

func (s channelServer) HandleRaftRequest(
	ctx context.Context, req *kvserver.RaftMessageRequest, _ kvserver.RaftMessageResponseStream,
) *roachpb.Error {
	if s.maxSleep != 0 {
		// maxSleep simulates goroutine scheduling delays that could
		// result in messages being processed out of order (in previous
		// transport implementations).
		time.Sleep(time.Duration(rand.Int63n(int64(s.maxSleep))))
	}
	if s.brokenRange != 0 && s.brokenRange == req.RangeID {
		return roachpb.NewErrorf(channelServerBrokenRangeMessage)
	}
	s.ch <- req
	return nil
}

func (s channelServer) HandleRaftResponse(
	ctx context.Context, resp *kvserver.RaftMessageResponse,
) error {
	// Mimic the logic in (*Store).HandleRaftResponse without requiring an
	// entire Store object to be pulled into these tests.
	if val, ok := resp.Union.GetValue().(*roachpb.Error); ok {
		if err, ok := val.GetDetail().(*roachpb.StoreNotFoundError); ok {
			return err
		}
	}
	log.Fatalf(ctx, "unexpected raft response: %s", resp)
	return nil
}

func (s channelServer) HandleSnapshot(
	header *kvserver.SnapshotRequest_Header, stream kvserver.SnapshotResponseStream,
) error {
	panic("unexpected HandleSnapshot")
}

// raftTransportTestContext contains objects needed to test RaftTransport.
// Typical usage will add multiple nodes with AddNode, attach channels
// to at least one store with ListenStore, and send messages with Send;
// see exampleRaftTransportUsage below for an illustrative sketch.
type raftTransportTestContext struct {
	t              testing.TB
	stopper        *stop.Stopper
	transports     map[roachpb.NodeID]*kvserver.RaftTransport
	nodeRPCContext *rpc.Context
	gossip         *gossip.Gossip
}

func newRaftTransportTestContext(t testing.TB) *raftTransportTestContext {
	rttc := &raftTransportTestContext{
		t:          t,
		stopper:    stop.NewStopper(),
		transports: map[roachpb.NodeID]*kvserver.RaftTransport{},
	}
	rttc.nodeRPCContext = rpc.NewContext(
		log.AmbientContext{Tracer: tracing.NewTracer()},
		testutils.NewNodeTestBaseContext(),
		hlc.NewClock(hlc.UnixNano, time.Nanosecond),
		rttc.stopper,
		cluster.MakeTestingClusterSettings(),
	)
	// Ensure that tests that use this test context and restart/shut down
	// their servers do not inadvertently start talking to servers from
	// unrelated concurrent tests.
	rttc.nodeRPCContext.ClusterID.Set(context.Background(), uuid.MakeV4())

	// We are sharing the same RPC context for all simulated nodes, so
	// we can't enforce some of the RPC validation checks.
	rttc.nodeRPCContext.TestingAllowNamedRPCToAnonymousServer = true

	server := rpc.NewServer(rttc.nodeRPCContext) // never started
	rttc.gossip = gossip.NewTest(
		1, rttc.nodeRPCContext, server, rttc.stopper, metric.NewRegistry(), zonepb.DefaultZoneConfigRef(),
	)

	return rttc
}

func (rttc *raftTransportTestContext) Stop() {
	rttc.stopper.Stop(context.Background())
}

// AddNode registers a node with the cluster. Nodes must be added
// before they can be used in other methods of
// raftTransportTestContext. The node will be gossiped immediately.
func (rttc *raftTransportTestContext) AddNode(nodeID roachpb.NodeID) *kvserver.RaftTransport {
	transport, addr := rttc.AddNodeWithoutGossip(nodeID, util.TestAddr, rttc.stopper)
	rttc.GossipNode(nodeID, addr)
	return transport
}

// AddNodeWithoutGossip registers a node with the cluster. Nodes must
// be added before they can be used in other methods of
// raftTransportTestContext. Unless you are testing the effects of
// delaying gossip, use AddNode instead.
func (rttc *raftTransportTestContext) AddNodeWithoutGossip(
	nodeID roachpb.NodeID, addr net.Addr, stopper *stop.Stopper,
) (*kvserver.RaftTransport, net.Addr) {
	grpcServer := rpc.NewServer(rttc.nodeRPCContext)
	transport := kvserver.NewRaftTransport(
		log.AmbientContext{Tracer: tracing.NewTracer()},
		cluster.MakeTestingClusterSettings(),
		nodedialer.New(rttc.nodeRPCContext, gossip.AddressResolver(rttc.gossip)),
		grpcServer,
		rttc.stopper,
	)
	rttc.transports[nodeID] = transport
	ln, err := netutil.ListenAndServeGRPC(stopper, grpcServer, addr)
	if err != nil {
		rttc.t.Fatal(err)
	}
	return transport, ln.Addr()
}

// GossipNode gossips the node's address, which is necessary before
// any messages can be sent to it. Normally done automatically by
// AddNode.
func (rttc *raftTransportTestContext) GossipNode(nodeID roachpb.NodeID, addr net.Addr) {
	if err := rttc.gossip.AddInfoProto(gossip.MakeNodeIDKey(nodeID),
		&roachpb.NodeDescriptor{
			NodeID:  nodeID,
			Address: util.MakeUnresolvedAddr(addr.Network(), addr.String()),
		},
		time.Hour); err != nil {
		rttc.t.Fatal(err)
	}
}

// ListenStore registers a store on a node and returns a channel for
// messages sent to that store.
func (rttc *raftTransportTestContext) ListenStore(
	nodeID roachpb.NodeID, storeID roachpb.StoreID,
) channelServer {
	ch := newChannelServer(100, 10*time.Millisecond)
	rttc.transports[nodeID].Listen(storeID, ch)
	return ch
}

// Send sends a message. Returns false if the message was dropped.
func (rttc *raftTransportTestContext) Send(
	from, to roachpb.ReplicaDescriptor, rangeID roachpb.RangeID, msg raftpb.Message,
) bool {
	msg.To = uint64(to.ReplicaID)
	msg.From = uint64(from.ReplicaID)
	req := &kvserver.RaftMessageRequest{
		RangeID:     rangeID,
		Message:     msg,
		ToReplica:   to,
		FromReplica: from,
	}
	return rttc.transports[from.NodeID].SendAsync(req, rpc.DefaultClass)
}
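
// exampleRaftTransportUsage is an illustrative sketch (not invoked by any
// test) of the typical AddNode/ListenStore/Send flow described on
// raftTransportTestContext. The node, store, and replica IDs are arbitrary.
func exampleRaftTransportUsage(t *testing.T) {
	rttc := newRaftTransportTestContext(t)
	defer rttc.Stop()

	from := roachpb.ReplicaDescriptor{NodeID: 1, StoreID: 1, ReplicaID: 1}
	to := roachpb.ReplicaDescriptor{NodeID: 2, StoreID: 2, ReplicaID: 2}
	rttc.AddNode(from.NodeID)
	rttc.AddNode(to.NodeID)
	ch := rttc.ListenStore(to.NodeID, to.StoreID)

	// Send is fire-and-forget; a false return means the message was dropped
	// (e.g. because the circuit breaker to the remote node is open).
	if !rttc.Send(from, to, 1, raftpb.Message{Type: raftpb.MsgHeartbeat}) {
		t.Fatal("message dropped")
	}
	<-ch.ch // wait for the message to arrive at the listening store
}

// Reference the sketch so it doesn't trip unused-function linters.
var _ = exampleRaftTransportUsage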

func TestSendAndReceive(t *testing.T) {
	defer leaktest.AfterTest(t)()
	rttc := newRaftTransportTestContext(t)
	defer rttc.Stop()

	// Create several servers, each of which has two stores (a raft
	// node ID addresses a store). Node IDs start at 2 (node 1 is taken
	// by the test gossip instance): node 2 has stores 2 and 3, node 3
	// has stores 4 and 5, etc.
	//
	// We suppose that range 1 is replicated across the odd-numbered
	// stores in reverse order to ensure that the various IDs are not
	// equal: replica 1 is store 5 and replica 2 is store 3. (The entry
	// for store 1 below is unused, since store IDs start at 2.)
	const numNodes = 3
	const storesPerNode = 2
	nextNodeID := roachpb.NodeID(2)
	nextStoreID := roachpb.StoreID(2)

	// Per-node state.
	transports := map[roachpb.NodeID]*kvserver.RaftTransport{}

	// Per-store state.
	storeNodes := map[roachpb.StoreID]roachpb.NodeID{}
	channels := map[roachpb.StoreID]channelServer{}
	replicaIDs := map[roachpb.StoreID]roachpb.ReplicaID{
		1: 3,
		3: 2,
		5: 1,
	}

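	// messageTypes is the set of message types exercised in the all-pairs
	// exchange below; currently only heartbeats.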
	messageTypes := map[raftpb.MessageType]struct{}{
		raftpb.MsgHeartbeat: {},
	}

	for nodeIndex := 0; nodeIndex < numNodes; nodeIndex++ {
		nodeID := nextNodeID
		nextNodeID++
		transports[nodeID] = rttc.AddNode(nodeID)

		for storeIndex := 0; storeIndex < storesPerNode; storeIndex++ {
			storeID := nextStoreID
			nextStoreID++

			storeNodes[storeID] = nodeID

			channels[storeID] = rttc.ListenStore(nodeID, storeID)
		}
	}

	messageTypeCounts := make(map[roachpb.StoreID]map[raftpb.MessageType]int)

	// Each store sends one heartbeat to each store, including itself.
	for toStoreID, toNodeID := range storeNodes {
		if _, ok := messageTypeCounts[toStoreID]; !ok {
			messageTypeCounts[toStoreID] = make(map[raftpb.MessageType]int)
		}

		for fromStoreID, fromNodeID := range storeNodes {
			baseReq := kvserver.RaftMessageRequest{
				RangeID: 1,
				Message: raftpb.Message{
					From: uint64(fromStoreID),
					To:   uint64(toStoreID),
				},
				FromReplica: roachpb.ReplicaDescriptor{
					NodeID:  fromNodeID,
					StoreID: fromStoreID,
				},
				ToReplica: roachpb.ReplicaDescriptor{
					NodeID:  toNodeID,
					StoreID: toStoreID,
				},
			}

			for messageType := range messageTypes {
				req := baseReq
				req.Message.Type = messageType

				if !transports[fromNodeID].SendAsync(&req, rpc.DefaultClass) {
					t.Errorf("unable to send %s from %d to %d", messageType, fromNodeID, toNodeID)
				}
				messageTypeCounts[toStoreID][messageType]++
			}
		}
	}

	// Read all the messages from the channels. Note that the transport
	// does not guarantee in-order delivery between independent
	// transports, so we just verify that the right number of messages
	// end up in each channel.
	for toStoreID := range storeNodes {
		for len(messageTypeCounts[toStoreID]) > 0 {
			req := <-channels[toStoreID].ch
			if req.Message.To != uint64(toStoreID) {
				t.Errorf("got unexpected message %v on channel %d", req, toStoreID)
			}

			if typeCounts, ok := messageTypeCounts[toStoreID]; ok {
				if _, ok := typeCounts[req.Message.Type]; ok {
					typeCounts[req.Message.Type]--
					if typeCounts[req.Message.Type] == 0 {
						delete(typeCounts, req.Message.Type)
					}
				} else {
					t.Errorf("expected %v to have key %v, but it did not", typeCounts, req.Message.Type)
				}
			} else {
				t.Errorf("expected %v to have key %v, but it did not", messageTypeCounts, toStoreID)
			}
		}

		delete(messageTypeCounts, toStoreID)

		select {
		case req := <-channels[toStoreID].ch:
			t.Errorf("got unexpected message %v on channel %d", req, toStoreID)
		case <-time.After(100 * time.Millisecond):
		}
	}

	if len(messageTypeCounts) > 0 {
		t.Errorf("remaining messages expected: %v", messageTypeCounts)
	}

	// Real raft messages have different node/store/replica IDs.
	// Send a message from replica 2 (on store 3, node 2) to replica 1
	// (on store 5, node 3).
	fromStoreID := roachpb.StoreID(3)
	toStoreID := roachpb.StoreID(5)
	expReq := &kvserver.RaftMessageRequest{
		RangeID: 1,
		Message: raftpb.Message{
			Type: raftpb.MsgApp,
			From: uint64(replicaIDs[fromStoreID]),
			To:   uint64(replicaIDs[toStoreID]),
		},
		FromReplica: roachpb.ReplicaDescriptor{
			NodeID:    storeNodes[fromStoreID],
			StoreID:   fromStoreID,
			ReplicaID: replicaIDs[fromStoreID],
		},
		ToReplica: roachpb.ReplicaDescriptor{
			NodeID:    storeNodes[toStoreID],
			StoreID:   toStoreID,
			ReplicaID: replicaIDs[toStoreID],
		},
	}
	// NB: the argument passed to SendAsync is not safe to use afterwards;
	// make a copy.
	expReqCopy := *expReq
	if !transports[storeNodes[fromStoreID]].SendAsync(&expReqCopy, rpc.DefaultClass) {
		t.Errorf("unable to send message from %d to %d", fromStoreID, toStoreID)
	}
	// NB: proto.Equal will panic here since it doesn't know about `gogoproto.casttype`.
	if req := <-channels[toStoreID].ch; !reflect.DeepEqual(req, expReq) {
		t.Errorf("got unexpected message %v on channel %d", req, toStoreID)
	}

	select {
	case req := <-channels[toStoreID].ch:
		t.Errorf("got unexpected message %v on channel %d", req, toStoreID)
	default:
	}
}

// TestInOrderDelivery verifies that for a given pair of nodes, raft
// messages are delivered in order.
func TestInOrderDelivery(t *testing.T) {
	defer leaktest.AfterTest(t)()
	rttc := newRaftTransportTestContext(t)
	defer rttc.Stop()

	const numMessages = 100
	serverReplica := roachpb.ReplicaDescriptor{
		NodeID:    2,
		StoreID:   2,
		ReplicaID: 2,
	}
	rttc.AddNode(serverReplica.NodeID)
	serverChannel := rttc.ListenStore(serverReplica.NodeID, serverReplica.StoreID)

	clientReplica := roachpb.ReplicaDescriptor{
		NodeID:    1,
		StoreID:   1,
		ReplicaID: 1,
	}
	rttc.AddNode(clientReplica.NodeID)

	for i := 0; i < numMessages; i++ {
		if !rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: uint64(i)}) {
			t.Errorf("failed to send message %d", i)
		}
	}

	for i := 0; i < numMessages; i++ {
		req := <-serverChannel.ch
		if req.Message.Commit != uint64(i) {
			t.Errorf("messages out of order: got %d while expecting %d", req.Message.Commit, i)
		}
	}
}

// TestRaftTransportCircuitBreaker verifies that messages are dropped while
// waiting for the raft node connection to be established.
func TestRaftTransportCircuitBreaker(t *testing.T) {
	defer leaktest.AfterTest(t)()
	rttc := newRaftTransportTestContext(t)
	defer rttc.Stop()

	serverReplica := roachpb.ReplicaDescriptor{
		NodeID:    2,
		StoreID:   2,
		ReplicaID: 2,
	}
	_, serverAddr := rttc.AddNodeWithoutGossip(serverReplica.NodeID, util.TestAddr, rttc.stopper)
	serverChannel := rttc.ListenStore(serverReplica.NodeID, serverReplica.StoreID)

	clientReplica := roachpb.ReplicaDescriptor{
		NodeID:    1,
		StoreID:   1,
		ReplicaID: 1,
	}
	clientTransport := rttc.AddNode(clientReplica.NodeID)

	// Sending repeated messages should begin dropping once the circuit
	// breaker trips.
	testutils.SucceedsSoon(t, func() error {
		if rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: 1}) {
			return errors.Errorf("expected circuit breaker to trip")
		}
		return nil
	})

	// Now, gossip the address of the server.
	rttc.GossipNode(serverReplica.NodeID, serverAddr)

	// Keep sending commit=2 until the breaker resets and we receive the
	// first instance. It's possible that an earlier message for commit=1
	// snuck in.
	testutils.SucceedsSoon(t, func() error {
		if !rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: 2}) {
			clientTransport.GetCircuitBreaker(serverReplica.NodeID, rpc.DefaultClass).Reset()
		}
		select {
		case req := <-serverChannel.ch:
			if req.Message.Commit == 2 {
				return nil
			}
		default:
		}
		return errors.Errorf("expected message commit=2")
	})
}

// TestRaftTransportIndependentRanges ensures that errors from one
// range do not interfere with messages to another range on the same
// store.
func TestRaftTransportIndependentRanges(t *testing.T) {
	defer leaktest.AfterTest(t)()
	rttc := newRaftTransportTestContext(t)
	defer rttc.Stop()

	server := roachpb.ReplicaDescriptor{
		NodeID:    1,
		StoreID:   1,
		ReplicaID: 1,
	}
	serverTransport := rttc.AddNode(server.NodeID)
	client := roachpb.ReplicaDescriptor{
		NodeID:    2,
		StoreID:   2,
		ReplicaID: 2,
	}
	rttc.AddNode(client.NodeID)

	const numMessages = 50
	channelServer := newChannelServer(numMessages*2, 10*time.Millisecond)
	channelServer.brokenRange = 13
	serverTransport.Listen(server.StoreID, channelServer)

	for i := 0; i < numMessages; i++ {
		for _, rangeID := range []roachpb.RangeID{1, 13} {
			if !rttc.Send(client, server, rangeID, raftpb.Message{Commit: uint64(i)}) {
				t.Errorf("failed to send message %d to range %s", i, rangeID)
			}
		}
	}
	for i := 0; i < numMessages; i++ {
		select {
		case msg := <-channelServer.ch:
			if msg.Message.Commit != uint64(i) {
				t.Errorf("got message %d while expecting %d", msg.Message.Commit, i)
			}
		case <-time.After(time.Second):
			t.Fatalf("timeout waiting for message %d", i)
		}
	}
}

// TestReopenConnection verifies that if a raft response indicates that the
// expected store isn't present on the node, the connection gets terminated
// and reopened before retrying, to ensure that the transport doesn't get
// stuck in an endless retry loop against the wrong node.
func TestReopenConnection(t *testing.T) {
	defer leaktest.AfterTest(t)()
	rttc := newRaftTransportTestContext(t)
	defer rttc.Stop()

	// Use a special stopper for the initial server so that we can fully stop it
	// (releasing its bound network address) before the rest of the test pieces.
	serverStopper := stop.NewStopper()
	serverReplica := roachpb.ReplicaDescriptor{
		NodeID:    2,
		StoreID:   2,
		ReplicaID: 2,
	}
	serverTransport, serverAddr :=
		rttc.AddNodeWithoutGossip(serverReplica.NodeID, util.TestAddr, serverStopper)
	rttc.GossipNode(serverReplica.NodeID, serverAddr)
	rttc.ListenStore(serverReplica.NodeID, serverReplica.StoreID)

	clientReplica := roachpb.ReplicaDescriptor{
		NodeID:    1,
		StoreID:   1,
		ReplicaID: 1,
	}
	rttc.AddNode(clientReplica.NodeID)
	rttc.ListenStore(clientReplica.NodeID, clientReplica.StoreID)

	// Take down the old server and start a new one at the same address.
	serverTransport.Stop(serverReplica.StoreID)
	serverStopper.Stop(context.Background())

	// With the old server down, nothing is listening on the address right now,
	// so the circuit breaker should trip.
	testutils.SucceedsSoon(t, func() error {
		if rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: 1}) {
			return errors.New("expected circuit breaker to trip")
		}
		return nil
	})

	replacementReplica := roachpb.ReplicaDescriptor{
		NodeID:    3,
		StoreID:   3,
		ReplicaID: 3,
	}

	rttc.AddNodeWithoutGossip(replacementReplica.NodeID, serverAddr, rttc.stopper)
	replacementChannel := rttc.ListenStore(replacementReplica.NodeID, replacementReplica.StoreID)

	// Try sending a message to the old server's store (at the address its
	// replacement is now running at) before its replacement has been gossiped.
	// We just want to ensure that doing so doesn't deadlock the client transport.
	if rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: 1}) {
		t.Fatal("unexpectedly managed to send to recently downed node")
	}

	// Then, to ensure the client hasn't been deadlocked, add the replacement node
	// to the gossip network and send it a request. Note that this will remove the
	// gossip record for serverReplica.NodeID (n2) since they share the same address.
	// This explains why we can't really assert whether n2 becomes unreachable or
	// not: if a healthy connection makes it into the rpc context before gossip
	// makes the node unresolvable, it's possible; in the other case, it's not.
	rttc.GossipNode(replacementReplica.NodeID, serverAddr)

	testutils.SucceedsSoon(t, func() error {
		// Sending messages to the old store does not deadlock. See the comment above
		// to understand why we don't check the returned value.
		rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: 1})
		// It won't be long until we can send to the new replica. The only reason
		// this might fail is that the failed connection is still in the RPC
		// connection pool and we have to wait out a health check interval.
		if !rttc.Send(clientReplica, replacementReplica, 1, raftpb.Message{Commit: 1}) {
			return errors.New("unable to send to replacement replica")
		}
		return nil
	})

	// Send commit=2 to the replacement replica. This should work now because we've
	// just used it successfully above and didn't change anything about the networking.
	if !rttc.Send(clientReplica, replacementReplica, 1, raftpb.Message{Commit: 2}) {
		t.Fatal("replacement node still unhealthy")
	}
	testutils.SucceedsSoon(t, func() error {
		select {
		case req := <-replacementChannel.ch:
			// There could be a few stray messages with `c==1` in the channel,
			// so throw those away.
			if c := req.Message.Commit; c == 2 {
				return nil
			}
		default:
		}
		return errors.New("still waiting")
	})
}

// This test ensures that a node dialer blocking while attempting to dial a
// remote node does not block calls to SendAsync.
func TestSendFailureToConnectDoesNotHangRaft(t *testing.T) {
	defer leaktest.AfterTest(t)()
	rttc := newRaftTransportTestContext(t)
	defer rttc.Stop()

	// Create a single server from which we're going to call send.
	// We'll then set up a bogus target server which will not be serving gRPC
	// and will block during connection setup (leading to blocking in the Dial
	// call). The test ensures that the Send call does not block.
	const rangeID, from, to = 1, 1, 2
	transport := rttc.AddNode(from)
	// Set up a plain old TCP listener that's not going to accept any connections,
	// which will lead to blocking during dial.
	ln, err := net.Listen("tcp", util.TestAddr.String())
	require.NoError(t, err)
	defer func() { _ = ln.Close() }()
	rttc.GossipNode(to, ln.Addr())
	// Try to send a message and make sure we don't block waiting to set up the
	// connection.
	transport.SendAsync(&kvserver.RaftMessageRequest{
		RangeID: rangeID,
		ToReplica: roachpb.ReplicaDescriptor{
			StoreID:   to,
			NodeID:    to,
			ReplicaID: to,
		},
		FromReplica: roachpb.ReplicaDescriptor{
			StoreID:   from,
			NodeID:    from,
			ReplicaID: from,
		},
		Message: raftpb.Message{To: to, From: from},
	}, rpc.DefaultClass)
}