github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/rpc/context_test.go (about)

     1  // Copyright 2014 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package rpc
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"math"
    17  	"net"
    18  	"strconv"
    19  	"sync"
    20  	"sync/atomic"
    21  	"testing"
    22  	"time"
    23  
    24  	"github.com/cockroachdb/cockroach/pkg/clusterversion"
    25  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    26  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    27  	"github.com/cockroachdb/cockroach/pkg/testutils"
    28  	"github.com/cockroachdb/cockroach/pkg/util"
    29  	"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
    30  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    31  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    32  	"github.com/cockroachdb/cockroach/pkg/util/log"
    33  	"github.com/cockroachdb/cockroach/pkg/util/netutil"
    34  	"github.com/cockroachdb/cockroach/pkg/util/retry"
    35  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    36  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    37  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    38  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    39  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    40  	"github.com/cockroachdb/errors"
    41  	"github.com/cockroachdb/logtags"
    42  	"github.com/stretchr/testify/require"
    43  	"google.golang.org/grpc"
    44  	"google.golang.org/grpc/codes"
    45  	"google.golang.org/grpc/credentials"
    46  	"google.golang.org/grpc/keepalive"
    47  	"google.golang.org/grpc/status"
    48  )
    49  
    50  // TestingConnHealth returns nil if we have an open connection to the given
    51  // target with DefaultClass that succeeded on its most recent heartbeat.
    52  // Otherwise, it kicks off a connection attempt (unless one is already in
    53  // progress or we are in a backoff state) and returns an error (typically
    54  // ErrNotHeartbeated). This is a conservative/pessimistic indicator:
    55  // if we have not attempted to talk to the target node recently, an
    56  // error will be returned. This method should therefore be used to
    57  // prioritize among a list of candidate nodes, but not to filter out
    58  // "unhealthy" nodes.
    59  //
    60  // This is used in tests only; in clusters use (*Dialer).ConnHealth()
    61  // instead which automates the address resolution.
    62  //
    63  // TODO(knz): remove this altogether. Use the dialer in all cases.
    64  func (ctx *Context) TestingConnHealth(target string, nodeID roachpb.NodeID) error {
    65  	if ctx.GetLocalInternalClientForAddr(target, nodeID) != nil {
    66  		// The local server is always considered healthy.
    67  		return nil
    68  	}
    69  	conn := ctx.GRPCDialNode(target, nodeID, DefaultClass)
    70  	return conn.Health()
    71  }
    72  
    73  // AddTestingDialOpts adds extra dialing options to the rpc Context. This should
    74  // be done before GRPCDial is called.
    75  func (ctx *Context) AddTestingDialOpts(opts ...grpc.DialOption) {
    76  	ctx.testingDialOpts = append(ctx.testingDialOpts, opts...)
    77  }
    78  
    79  func newTestServer(t testing.TB, ctx *Context, extraOpts ...grpc.ServerOption) *grpc.Server {
    80  	tlsConfig, err := ctx.GetServerTLSConfig()
    81  	if err != nil {
    82  		t.Fatal(err)
    83  	}
    84  	opts := []grpc.ServerOption{
    85  		grpc.Creds(credentials.NewTLS(tlsConfig)),
    86  		grpc.StatsHandler(&ctx.stats),
    87  	}
    88  	opts = append(opts, extraOpts...)
    89  	return grpc.NewServer(opts...)
    90  }
    91  
    92  func newTestContextWithKnobs(
    93  	clock *hlc.Clock, stopper *stop.Stopper, knobs ContextTestingKnobs,
    94  ) *Context {
    95  	return NewContextWithTestingKnobs(
    96  		log.AmbientContext{Tracer: tracing.NewTracer()},
    97  		testutils.NewNodeTestBaseContext(),
    98  		clock,
    99  		stopper,
   100  		cluster.MakeTestingClusterSettings(),
   101  		knobs,
   102  	)
   103  }
   104  
   105  func newTestContext(clusterID uuid.UUID, clock *hlc.Clock, stopper *stop.Stopper) *Context {
   106  	return newTestContextWithKnobs(clock, stopper, ContextTestingKnobs{
   107  		ClusterID: &clusterID,
   108  	})
   109  }
   110  
   111  func TestHeartbeatCB(t *testing.T) {
   112  	defer leaktest.AfterTest(t)()
   113  
   114  	testutils.RunTrueAndFalse(t, "compression", func(t *testing.T, compression bool) {
   115  		stopper := stop.NewStopper()
   116  		defer stopper.Stop(context.Background())
   117  
   118  		// Shared cluster ID by all RPC peers (this ensures that the peers
   119  		// don't talk to servers from unrelated tests by accident).
   120  		clusterID := uuid.MakeV4()
   121  
   122  		clock := hlc.NewClock(timeutil.Unix(0, 20).UnixNano, time.Nanosecond)
   123  		serverCtx := newTestContext(clusterID, clock, stopper)
   124  		serverCtx.rpcCompression = compression
   125  		const serverNodeID = 1
   126  		serverCtx.NodeID.Set(context.Background(), serverNodeID)
   127  		s := newTestServer(t, serverCtx)
   128  		RegisterHeartbeatServer(s, &HeartbeatService{
   129  			clock:              clock,
   130  			remoteClockMonitor: serverCtx.RemoteClocks,
   131  			clusterID:          &serverCtx.ClusterID,
   132  			nodeID:             &serverCtx.NodeID,
   133  			settings:           serverCtx.settings,
   134  		})
   135  
   136  		ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
   137  		if err != nil {
   138  			t.Fatal(err)
   139  		}
   140  		remoteAddr := ln.Addr().String()
   141  
   142  		// Clocks don't matter in this test.
   143  		clientCtx := newTestContext(clusterID, clock, stopper)
   144  		clientCtx.rpcCompression = compression
   145  
   146  		var once sync.Once
   147  		ch := make(chan struct{})
   148  
   149  		clientCtx.HeartbeatCB = func() {
   150  			once.Do(func() {
   151  				close(ch)
   152  			})
   153  		}
   154  
   155  		if _, err := clientCtx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(context.Background()); err != nil {
   156  			t.Fatal(err)
   157  		}
   158  
   159  		<-ch
   160  	})
   161  }
   162  
   163  type internalServer struct{}
   164  
   165  func (*internalServer) Batch(
   166  	context.Context, *roachpb.BatchRequest,
   167  ) (*roachpb.BatchResponse, error) {
   168  	return nil, nil
   169  }
   170  
   171  func (*internalServer) RangeFeed(
   172  	_ *roachpb.RangeFeedRequest, _ roachpb.Internal_RangeFeedServer,
   173  ) error {
   174  	panic("unimplemented")
   175  }
   176  
   177  // TestInternalServerAddress verifies that RPCContext uses AdvertiseAddr, not Addr, to
   178  // determine whether to apply the local server optimization.
   179  //
   180  // Prevents regression of https://github.com/cockroachdb/cockroach/issues/19991.
   181  func TestInternalServerAddress(t *testing.T) {
   182  	defer leaktest.AfterTest(t)()
   183  
   184  	stopper := stop.NewStopper()
   185  	defer stopper.Stop(context.Background())
   186  
   187  	// Can't be zero because that'd be an empty offset.
   188  	clock := hlc.NewClock(timeutil.Unix(0, 1).UnixNano, time.Nanosecond)
   189  
   190  	serverCtx := newTestContext(uuid.MakeV4(), clock, stopper)
   191  	serverCtx.Config.Addr = "127.0.0.1:9999"
   192  	serverCtx.Config.AdvertiseAddr = "127.0.0.1:8888"
   193  	serverCtx.NodeID.Set(context.Background(), 1)
   194  
   195  	internal := &internalServer{}
   196  	serverCtx.SetLocalInternalServer(internal)
   197  
   198  	exp := internalClientAdapter{internal}
   199  	if ic := serverCtx.GetLocalInternalClientForAddr(serverCtx.Config.AdvertiseAddr, 1); ic != exp {
   200  		t.Fatalf("expected %+v, got %+v", exp, ic)
   201  	}
   202  }
   203  
   204  // TestHeartbeatHealth verifies that the health status changes after
   205  // heartbeats succeed or fail.
   206  func TestHeartbeatHealth(t *testing.T) {
   207  	defer leaktest.AfterTest(t)()
   208  
   209  	stopper := stop.NewStopper()
   210  	defer stopper.Stop(context.Background())
   211  
   212  	// Can't be zero because that'd be an empty offset.
   213  	clock := hlc.NewClock(timeutil.Unix(0, 1).UnixNano, time.Nanosecond)
   214  
   215  	// Shared cluster ID by all RPC peers (this ensures that the peers
   216  	// don't talk to servers from unrelated tests by accident).
   217  	clusterID := uuid.MakeV4()
   218  
   219  	const serverNodeID = 1
   220  	const clientNodeID = 2
   221  
   222  	serverCtx := newTestContext(clusterID, clock, stop.NewStopper())
   223  	serverCtx.NodeID.Set(context.Background(), serverNodeID)
   224  	s := newTestServer(t, serverCtx)
   225  
   226  	heartbeat := &ManualHeartbeatService{
   227  		ready:              make(chan error),
   228  		stopper:            stopper,
   229  		clock:              clock,
   230  		remoteClockMonitor: serverCtx.RemoteClocks,
   231  		settings:           serverCtx.settings,
   232  		nodeID:             &serverCtx.NodeID,
   233  	}
   234  	RegisterHeartbeatServer(s, heartbeat)
   235  
   236  	errFailedHeartbeat := errors.New("failed heartbeat")
   237  
   238  	var hbSuccess atomic.Value
   239  	hbSuccess.Store(true)
   240  
   241  	go func() {
   242  		for {
   243  			var err error
   244  			if !hbSuccess.Load().(bool) {
   245  				err = errFailedHeartbeat
   246  			}
   247  
   248  			select {
   249  			case <-stopper.ShouldStop():
   250  				return
   251  			case heartbeat.ready <- err:
   252  			}
   253  		}
   254  	}()
   255  
   256  	lisNotLocalServer, err := net.Listen("tcp", "127.0.0.1:0")
   257  	defer func() {
   258  		netutil.FatalIfUnexpected(lisNotLocalServer.Close())
   259  	}()
   260  	if err != nil {
   261  		t.Fatal(err)
   262  	}
   263  	lisLocalServer, err := net.Listen("tcp", "127.0.0.1:0")
   264  	defer func() {
   265  		netutil.FatalIfUnexpected(lisLocalServer.Close())
   266  	}()
   267  	if err != nil {
   268  		t.Fatal(err)
   269  	}
   270  
   271  	clientCtx := newTestContext(clusterID, clock, stopper)
   272  	clientCtx.NodeID.Set(context.Background(), clientNodeID)
   273  	clientCtx.Addr = lisNotLocalServer.Addr().String()
   274  	clientCtx.AdvertiseAddr = lisLocalServer.Addr().String()
   275  	// Make the interval shorter to speed up the test.
   276  	clientCtx.heartbeatInterval = 1 * time.Millisecond
   277  
   278  	ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
   279  	if err != nil {
   280  		t.Fatal(err)
   281  	}
   282  	remoteAddr := ln.Addr().String()
   283  	if _, err := clientCtx.GRPCDialNode(
   284  		remoteAddr, serverNodeID, DefaultClass).Connect(context.Background()); err != nil {
   285  		t.Fatal(err)
   286  	}
   287  
   288  	// Wait for the connection.
   289  	testutils.SucceedsSoon(t, func() error {
   290  		err := clientCtx.TestingConnHealth(remoteAddr, serverNodeID)
   291  		if err != nil && !errors.Is(err, ErrNotHeartbeated) {
   292  			t.Fatal(err)
   293  		}
   294  		return err
   295  	})
   296  	assertGauges(t, clientCtx.Metrics(),
   297  		0 /* initializing */, 1 /* nominal */, 0 /* failed */)
   298  
   299  	// Should be unhealthy in the presence of failing heartbeats.
   300  	hbSuccess.Store(false)
   301  	testutils.SucceedsSoon(t, func() error {
   302  		if err := clientCtx.TestingConnHealth(remoteAddr, serverNodeID); !testutils.IsError(err, errFailedHeartbeat.Error()) {
   303  			return errors.Errorf("unexpected error: %v", err)
   304  		}
   305  		return nil
   306  	})
   307  	assertGauges(t, clientCtx.Metrics(),
   308  		0 /* initializing */, 0 /* nominal */, 1 /* failed */)
   309  
   310  	// Should become healthy in the presence of successful heartbeats.
   311  	hbSuccess.Store(true)
   312  	testutils.SucceedsSoon(t, func() error {
   313  		return clientCtx.TestingConnHealth(remoteAddr, serverNodeID)
   314  	})
   315  	assertGauges(t, clientCtx.Metrics(),
   316  		0 /* initializing */, 1 /* nominal */, 0 /* failed */)
   317  
   318  	// Should become unhealthy again in the presence of failing heartbeats.
   319  	hbSuccess.Store(false)
   320  	testutils.SucceedsSoon(t, func() error {
   321  		if err := clientCtx.TestingConnHealth(remoteAddr, serverNodeID); !testutils.IsError(err, errFailedHeartbeat.Error()) {
   322  			return errors.Errorf("unexpected error: %v", err)
   323  		}
   324  		return nil
   325  	})
   326  	assertGauges(t, clientCtx.Metrics(),
   327  		0 /* initializing */, 0 /* nominal */, 1 /* failed */)
   328  
   329  	// Should become healthy in the presence of successful heartbeats.
   330  	hbSuccess.Store(true)
   331  	testutils.SucceedsSoon(t, func() error {
   332  		return clientCtx.TestingConnHealth(remoteAddr, serverNodeID)
   333  	})
   334  	assertGauges(t, clientCtx.Metrics(),
   335  		0 /* initializing */, 1 /* nominal */, 0 /* failed */)
   336  
   337  	// Ensure that non-existing connections return ErrNotHeartbeated.
   338  
   339  	lisNonExistentConnection, err := net.Listen("tcp", "127.0.0.1:0")
   340  	defer func() {
   341  		netutil.FatalIfUnexpected(lisNonExistentConnection.Close())
   342  	}()
   343  	if err != nil {
   344  		t.Fatal(err)
   345  	}
   346  	if err := clientCtx.TestingConnHealth(lisNonExistentConnection.Addr().String(), 3); !errors.Is(err, ErrNotHeartbeated) {
   347  		t.Errorf("wanted ErrNotHeartbeated, not %v", err)
   348  	}
   349  	// The connection to Node 3 on the lisNonExistentConnection should be
   350  	// initializing and the server connection should be nominal.
   351  	testutils.SucceedsSoon(t, func() error {
   352  		return checkGauges(clientCtx.Metrics(),
   353  			1 /* initializing */, 1 /* nominal */, 0 /* failed */)
   354  	})
   355  
   356  	if err := clientCtx.TestingConnHealth(clientCtx.Addr, clientNodeID); !errors.Is(err, ErrNotHeartbeated) {
   357  		t.Errorf("wanted ErrNotHeartbeated, not %v", err)
   358  	}
   359  
   360  	// Ensure that the local Addr returns ErrNotHeartbeated without having dialed
   361  	// a connection but the local AdvertiseAddr successfully returns no error when
   362  	// an internal server has been registered.
   363  	clientCtx.SetLocalInternalServer(&internalServer{})
   364  
   365  	if err := clientCtx.TestingConnHealth(clientCtx.Addr, clientNodeID); !errors.Is(err, ErrNotHeartbeated) {
   366  		t.Errorf("wanted ErrNotHeartbeated, not %v", err)
   367  	}
   368  	if err := clientCtx.TestingConnHealth(clientCtx.AdvertiseAddr, clientNodeID); err != nil {
   369  		t.Error(err)
   370  	}
   371  
   372  	// Ensure that when the server closes its connection the context attempts to
   373  	// reconnect. Both the server connection on Node 1 and the non-existent
   374  	// connection should be initializing.
   375  	serverCtx.Stopper.Stop(context.Background())
   376  	testutils.SucceedsSoon(t, func() error {
   377  		return checkGauges(clientCtx.Metrics(),
   378  			2 /* initializing */, 0 /* nominal */, 0 /* failed */)
   379  	})
   380  	const expNumStarted = 3 // 2 for the server and 1 for the non-existent conn
   381  	numStarted := clientCtx.Metrics().HeartbeatLoopsStarted.Count()
   382  	if numStarted != expNumStarted {
   383  		t.Fatalf("expected %d heartbeat loops to have been started, got %d",
   384  			expNumStarted, numStarted)
   385  	}
   386  	const expNumExited = 1 // 1 for the server upon shutdown
   387  	numExited := clientCtx.Metrics().HeartbeatLoopsExited.Count()
   388  	if numExited != expNumExited {
   389  		t.Fatalf("expected %d heartbeat loops to have exited, got %d",
   390  			expNumExited, numExited)
   391  	}
   392  }
   393  
   394  func checkGauges(m *Metrics, initializing, nominal, failed int64) error {
   395  	if got := m.HeartbeatsInitializing.Value(); got != initializing {
   396  		return errors.Errorf("expected %d initializing heartbeats, got %d", initializing, got)
   397  	}
   398  	if got := m.HeartbeatsNominal.Value(); got != nominal {
   399  		return errors.Errorf("expected %d nominal heartbeats, got %d", nominal, got)
   400  	}
   401  	if got := m.HeartbeatsFailed.Value(); got != failed {
   402  		return errors.Errorf("expected %d failed heartbeats, got %d", failed, got)
   403  	}
   404  	return nil
   405  }
   406  
   407  func assertGauges(t *testing.T, m *Metrics, initializing, nominal, failed int64) {
   408  	t.Helper()
   409  	if err := checkGauges(m, initializing, nominal, failed); err != nil {
   410  		t.Error(err)
   411  	}
   412  }
   413  
   414  // TestConnectionRemoveNodeIDZero verifies that when a connection initiated via
   415  // GRPCDialNode fails, we also clean up the connection returned by
   416  // GRPCUnvalidatedDial.
   417  //
   418  // See #37200.
   419  func TestConnectionRemoveNodeIDZero(t *testing.T) {
   420  	defer leaktest.AfterTest(t)()
   421  
   422  	ctx := context.Background()
   423  	stopper := stop.NewStopper()
   424  	defer stopper.Stop(ctx)
   425  
   426  	clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond)
   427  	clientCtx := newTestContext(uuid.MakeV4(), clock, stopper)
   428  	// Provoke an error.
   429  	_, err := clientCtx.GRPCDialNode("127.0.0.1:notaport", 1, DefaultClass).Connect(context.Background())
   430  	if err == nil {
   431  		t.Fatal("expected some kind of error, got nil")
   432  	}
   433  
   434  	// NB: this takes a moment because GRPCDialRaw only gives up on the initial
   435  	// connection after 1s (more precisely, the redialChan gets closed only after
   436  	// 1s), which seems difficult to configure ad-hoc.
   437  	testutils.SucceedsSoon(t, func() error {
   438  		var keys []connKey
   439  		clientCtx.conns.Range(func(k, v interface{}) bool {
   440  			keys = append(keys, k.(connKey))
   441  			return true
   442  		})
   443  		if len(keys) > 0 {
   444  			return errors.Errorf("still have connections %v", keys)
   445  		}
   446  		return nil
   447  	})
   448  }
   449  
   450  type interceptingListener struct {
   451  	net.Listener
   452  	connCB func(net.Conn)
   453  }
   454  
   455  func (ln *interceptingListener) Accept() (net.Conn, error) {
   456  	conn, err := ln.Listener.Accept()
   457  	if err == nil {
   458  		ln.connCB(conn)
   459  	}
   460  	return conn, err
   461  }
   462  
   463  // TestHeartbeatHealth verifies that the health status changes after
   464  // heartbeats succeed or fail due to transport failures.
   465  func TestHeartbeatHealthTransport(t *testing.T) {
   466  	defer leaktest.AfterTest(t)()
   467  
   468  	stopper := stop.NewStopper()
   469  	defer stopper.Stop(context.Background())
   470  
   471  	ctx := context.Background()
   472  
   473  	// Shared cluster ID by all RPC peers (this ensures that the peers
   474  	// don't talk to servers from unrelated tests by accident).
   475  	clusterID := uuid.MakeV4()
   476  
   477  	// Can't be zero because that'd be an empty offset.
   478  	clock := hlc.NewClock(timeutil.Unix(0, 1).UnixNano, time.Nanosecond)
   479  
   480  	serverCtx := newTestContext(clusterID, clock, stopper)
   481  	const serverNodeID = 1
   482  	serverCtx.NodeID.Set(context.Background(), serverNodeID)
   483  	// newTestServer with a custom listener.
   484  	tlsConfig, err := serverCtx.GetServerTLSConfig()
   485  	if err != nil {
   486  		t.Fatal(err)
   487  	}
   488  	s := grpc.NewServer(grpc.Creds(credentials.NewTLS(tlsConfig)))
   489  	RegisterHeartbeatServer(s, &HeartbeatService{
   490  		clock:              clock,
   491  		remoteClockMonitor: serverCtx.RemoteClocks,
   492  		clusterID:          &serverCtx.ClusterID,
   493  		nodeID:             &serverCtx.NodeID,
   494  		settings:           serverCtx.settings,
   495  	})
   496  
   497  	mu := struct {
   498  		syncutil.Mutex
   499  		conns     []net.Conn
   500  		autoClose bool
   501  	}{}
   502  	ln := func() *interceptingListener {
   503  		ln, err := net.Listen("tcp", util.TestAddr.String())
   504  		if err != nil {
   505  			t.Fatal(err)
   506  		}
   507  		return &interceptingListener{
   508  			Listener: ln,
   509  			connCB: func(conn net.Conn) {
   510  				mu.Lock()
   511  				if mu.autoClose {
   512  					_ = conn.Close()
   513  				} else {
   514  					mu.conns = append(mu.conns, conn)
   515  				}
   516  				mu.Unlock()
   517  			}}
   518  	}()
   519  
   520  	stopper.RunWorker(ctx, func(context.Context) {
   521  		<-stopper.ShouldQuiesce()
   522  		netutil.FatalIfUnexpected(ln.Close())
   523  		<-stopper.ShouldStop()
   524  		s.Stop()
   525  	})
   526  
   527  	stopper.RunWorker(ctx, func(context.Context) {
   528  		netutil.FatalIfUnexpected(s.Serve(ln))
   529  	})
   530  
   531  	remoteAddr := ln.Addr().String()
   532  
   533  	clientCtx := newTestContext(clusterID, clock, stopper)
   534  	// Make the interval shorter to speed up the test.
   535  	clientCtx.heartbeatInterval = 1 * time.Millisecond
   536  	if _, err := clientCtx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(context.Background()); err != nil {
   537  		t.Fatal(err)
   538  	}
   539  	// Everything is normal; should become healthy.
   540  	testutils.SucceedsSoon(t, func() error {
   541  		return clientCtx.TestingConnHealth(remoteAddr, serverNodeID)
   542  	})
   543  
   544  	closeConns := func() (numClosed int, _ error) {
   545  		mu.Lock()
   546  		defer mu.Unlock()
   547  		n := len(mu.conns)
   548  		for i := n - 1; i >= 0; i-- {
   549  			if err := mu.conns[i].Close(); err != nil {
   550  				return 0, err
   551  			}
   552  			mu.conns = mu.conns[:i]
   553  		}
   554  		return n, nil
   555  	}
   556  
   557  	isUnhealthy := func(err error) bool {
   558  		// Most of the time, an unhealthy connection will get
   559  		// ErrNotHeartbeated, but there are brief periods during which we
   560  		// could get one of the grpc errors below (while the old
   561  		// connection is in the middle of closing).
   562  		if errors.Is(err, ErrNotHeartbeated) {
   563  			return true
   564  		}
   565  		// The expected code here is Unavailable, but at least on OSX you can also get
   566  		//
   567  		// rpc error: code = Internal desc = connection error: desc = "transport: authentication
   568  		// handshake failed: write tcp 127.0.0.1:53936->127.0.0.1:53934: write: broken pipe".
   569  		code := status.Code(err)
   570  		return code == codes.Unavailable || code == codes.Internal
   571  	}
   572  
   573  	// Close all the connections until we see a failure on the main goroutine.
   574  	done := make(chan struct{})
   575  	if err := stopper.RunAsyncTask(ctx, "busyloop-closer", func(ctx context.Context) {
   576  		for {
   577  			if _, err := closeConns(); err != nil {
   578  				log.Warningf(ctx, "%v", err)
   579  			}
   580  			select {
   581  			case <-done:
   582  				return
   583  			default:
   584  			}
   585  		}
   586  	}); err != nil {
   587  		t.Fatal(err)
   588  	}
   589  
   590  	// We don't use SucceedsSoon because that internally uses doubling backoffs, and
   591  	// it doesn't need too much bad luck to run into the time limit.
   592  	for then := timeutil.Now(); ; {
   593  		err := func() error {
   594  			if err := clientCtx.TestingConnHealth(remoteAddr, serverNodeID); !isUnhealthy(err) {
   595  				return errors.Errorf("unexpected error: %v", err)
   596  			}
   597  			return nil
   598  		}()
   599  		if err == nil {
   600  			break
   601  		}
   602  		if timeutil.Since(then) > 45*time.Second {
   603  			t.Fatal(err)
   604  		}
   605  		time.Sleep(10 * time.Millisecond)
   606  	}
   607  
   608  	close(done)
   609  
   610  	// We can reconnect and the connection becomes healthy again.
   611  	testutils.SucceedsSoon(t, func() error {
   612  		if _, err := clientCtx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(context.Background()); err != nil {
   613  			return err
   614  		}
   615  		return clientCtx.TestingConnHealth(remoteAddr, serverNodeID)
   616  	})
   617  
   618  	// Close the listener and all the connections. Note that if we
   619  	// only closed the listener, recently-accepted-but-not-yet-handled
   620  	// connections could sneak in and randomly make the target healthy
   621  	// again. To avoid this, we flip the boolean below which is used in
   622  	// our handler callback to eagerly close any stragglers.
   623  	mu.Lock()
   624  	mu.autoClose = true
   625  	mu.Unlock()
   626  	if err := ln.Close(); err != nil {
   627  		t.Fatal(err)
   628  	}
   629  
   630  	// Also terminate any existing connections.
   631  	if _, err := closeConns(); err != nil {
   632  		t.Fatal(err)
   633  	}
   634  
   635  	// Should become unhealthy again now that the connection was closed.
   636  	testutils.SucceedsSoon(t, func() error {
   637  		err := clientCtx.TestingConnHealth(remoteAddr, serverNodeID)
   638  
   639  		if !isUnhealthy(err) {
   640  			return errors.Errorf("unexpected error: %v", err)
   641  		}
   642  		return nil
   643  	})
   644  
   645  	// Should stay unhealthy despite reconnection attempts.
   646  	for then := timeutil.Now(); timeutil.Since(then) < 50*clientCtx.heartbeatInterval; {
   647  		err := clientCtx.TestingConnHealth(remoteAddr, serverNodeID)
   648  		if !isUnhealthy(err) {
   649  			t.Fatal(err)
   650  		}
   651  	}
   652  }
   653  
   654  func TestOffsetMeasurement(t *testing.T) {
   655  	defer leaktest.AfterTest(t)()
   656  
   657  	stopper := stop.NewStopper()
   658  	defer stopper.Stop(context.Background())
   659  
   660  	// Shared cluster ID by all RPC peers (this ensures that the peers
   661  	// don't talk to servers from unrelated tests by accident).
   662  	clusterID := uuid.MakeV4()
   663  
   664  	serverTime := timeutil.Unix(0, 20)
   665  	serverClock := hlc.NewClock(serverTime.UnixNano, time.Nanosecond)
   666  	serverCtx := newTestContext(clusterID, serverClock, stopper)
   667  	const serverNodeID = 1
   668  	serverCtx.NodeID.Set(context.Background(), serverNodeID)
   669  	s := newTestServer(t, serverCtx)
   670  	RegisterHeartbeatServer(s, &HeartbeatService{
   671  		clock:              serverClock,
   672  		remoteClockMonitor: serverCtx.RemoteClocks,
   673  		clusterID:          &serverCtx.ClusterID,
   674  		nodeID:             &serverCtx.NodeID,
   675  		settings:           serverCtx.settings,
   676  	})
   677  
   678  	ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
   679  	if err != nil {
   680  		t.Fatal(err)
   681  	}
   682  	remoteAddr := ln.Addr().String()
   683  
   684  	// Create a client clock that is behind the server clock.
   685  	clientAdvancing := AdvancingClock{time: timeutil.Unix(0, 10)}
   686  	clientClock := hlc.NewClock(clientAdvancing.UnixNano, time.Nanosecond)
   687  	clientCtx := newTestContext(clusterID, clientClock, stopper)
   688  	// Make the interval shorter to speed up the test.
   689  	clientCtx.heartbeatInterval = 1 * time.Millisecond
   690  	clientCtx.RemoteClocks.offsetTTL = 5 * clientAdvancing.getAdvancementInterval()
   691  	if _, err := clientCtx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(context.Background()); err != nil {
   692  		t.Fatal(err)
   693  	}
   694  
   695  	expectedOffset := RemoteOffset{Offset: 10, Uncertainty: 0, MeasuredAt: 10}
   696  	testutils.SucceedsSoon(t, func() error {
   697  		clientCtx.RemoteClocks.mu.Lock()
   698  		defer clientCtx.RemoteClocks.mu.Unlock()
   699  
   700  		if o, ok := clientCtx.RemoteClocks.mu.offsets[remoteAddr]; !ok {
   701  			return errors.Errorf("expected offset of %s to be initialized, but it was not", remoteAddr)
   702  		} else if o != expectedOffset {
   703  			return errors.Errorf("expected:\n%v\nactual:\n%v", expectedOffset, o)
   704  		}
   705  		return nil
   706  	})
   707  
   708  	// Change the client such that it receives a heartbeat right after the
   709  	// maximum clock reading delay.
   710  	clientAdvancing.setAdvancementInterval(
   711  		maximumPingDurationMult*clientClock.MaxOffset() + 1*time.Nanosecond)
   712  
   713  	testutils.SucceedsSoon(t, func() error {
   714  		clientCtx.RemoteClocks.mu.Lock()
   715  		defer clientCtx.RemoteClocks.mu.Unlock()
   716  
   717  		if o, ok := clientCtx.RemoteClocks.mu.offsets[remoteAddr]; ok {
   718  			return errors.Errorf("expected offset to have been cleared, but found %s", o)
   719  		}
   720  		return nil
   721  	})
   722  }
   723  
   724  func TestFailedOffsetMeasurement(t *testing.T) {
   725  	defer leaktest.AfterTest(t)()
   726  
   727  	stopper := stop.NewStopper()
   728  	defer stopper.Stop(context.Background())
   729  
   730  	// Shared cluster ID by all RPC peers (this ensures that the peers
   731  	// don't talk to servers from unrelated tests by accident).
   732  	clusterID := uuid.MakeV4()
   733  
   734  	// Can't be zero because that'd be an empty offset.
   735  	clock := hlc.NewClock(timeutil.Unix(0, 1).UnixNano, time.Nanosecond)
   736  
   737  	serverCtx := newTestContext(clusterID, clock, stopper)
   738  	const serverNodeID = 1
   739  	serverCtx.NodeID.Set(context.Background(), serverNodeID)
   740  	s := newTestServer(t, serverCtx)
   741  	heartbeat := &ManualHeartbeatService{
   742  		clock:              clock,
   743  		remoteClockMonitor: serverCtx.RemoteClocks,
   744  		ready:              make(chan error),
   745  		stopper:            stopper,
   746  		settings:           serverCtx.settings,
   747  		nodeID:             &serverCtx.NodeID,
   748  	}
   749  	RegisterHeartbeatServer(s, heartbeat)
   750  
   751  	ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
   752  	if err != nil {
   753  		t.Fatal(err)
   754  	}
   755  	remoteAddr := ln.Addr().String()
   756  
   757  	// Create a client that never receives a heartbeat after the first.
   758  	clientCtx := newTestContext(clusterID, clock, stopper)
   759  	// Remove the timeout so that failure arises from exceeding the maximum
   760  	// clock reading delay, not the timeout.
   761  	clientCtx.heartbeatTimeout = 0
   762  	go func() { heartbeat.ready <- nil }() // Allow one heartbeat for initialization.
   763  	if _, err := clientCtx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(context.Background()); err != nil {
   764  		t.Fatal(err)
   765  	}
   766  
   767  	testutils.SucceedsSoon(t, func() error {
   768  		clientCtx.RemoteClocks.mu.Lock()
   769  		defer clientCtx.RemoteClocks.mu.Unlock()
   770  
   771  		if _, ok := clientCtx.RemoteClocks.mu.offsets[remoteAddr]; !ok {
   772  			return errors.Errorf("expected offset of %s to be initialized, but it was not", remoteAddr)
   773  		}
   774  		return nil
   775  	})
   776  
   777  	testutils.SucceedsSoon(t, func() error {
   778  		serverCtx.RemoteClocks.mu.Lock()
   779  		defer serverCtx.RemoteClocks.mu.Unlock()
   780  
   781  		if o, ok := serverCtx.RemoteClocks.mu.offsets[remoteAddr]; ok {
   782  			return errors.Errorf("expected offset of %s to not be initialized, but it was: %v", remoteAddr, o)
   783  		}
   784  		return nil
   785  	})
   786  }
   787  
   788  type AdvancingClock struct {
   789  	syncutil.Mutex
   790  	time                time.Time
   791  	advancementInterval atomic.Value // time.Duration
   792  }
   793  
   794  func (ac *AdvancingClock) setAdvancementInterval(d time.Duration) {
   795  	ac.advancementInterval.Store(d)
   796  }
   797  
   798  func (ac *AdvancingClock) getAdvancementInterval() time.Duration {
   799  	v := ac.advancementInterval.Load()
   800  	if v == nil {
   801  		return 0
   802  	}
   803  	return v.(time.Duration)
   804  }
   805  
   806  func (ac *AdvancingClock) UnixNano() int64 {
   807  	ac.Lock()
   808  	time := ac.time
   809  	ac.time = time.Add(ac.getAdvancementInterval())
   810  	ac.Unlock()
   811  	return time.UnixNano()
   812  }
   813  
   814  func TestRemoteOffsetUnhealthy(t *testing.T) {
   815  	defer leaktest.AfterTest(t)()
   816  
   817  	stopper := stop.NewStopper()
   818  	defer stopper.Stop(context.Background())
   819  
   820  	const maxOffset = 100 * time.Millisecond
   821  
   822  	type nodeContext struct {
   823  		offset  time.Duration
   824  		ctx     *Context
   825  		errChan chan error
   826  	}
   827  
   828  	start := time.Date(2012, 12, 07, 0, 0, 0, 0, time.UTC)
   829  
   830  	nodeCtxs := []nodeContext{
   831  		{offset: 0},
   832  		{offset: 0},
   833  		{offset: 0},
   834  		// The minimum offset that actually triggers node death.
   835  		{offset: maxOffset + 1},
   836  	}
   837  
   838  	// Shared cluster ID by all RPC peers (this ensures that the peers
   839  	// don't talk to servers from unrelated tests by accident).
   840  	clusterID := uuid.MakeV4()
   841  
   842  	for i := range nodeCtxs {
   843  		clock := hlc.NewClock(start.Add(nodeCtxs[i].offset).UnixNano, maxOffset)
   844  		nodeCtxs[i].errChan = make(chan error, 1)
   845  		nodeCtxs[i].ctx = newTestContext(clusterID, clock, stopper)
   846  		nodeCtxs[i].ctx.heartbeatInterval = maxOffset
   847  		nodeCtxs[i].ctx.NodeID.Set(context.Background(), roachpb.NodeID(i+1))
   848  
   849  		s := newTestServer(t, nodeCtxs[i].ctx)
   850  		RegisterHeartbeatServer(s, &HeartbeatService{
   851  			clock:              clock,
   852  			remoteClockMonitor: nodeCtxs[i].ctx.RemoteClocks,
   853  			clusterID:          &nodeCtxs[i].ctx.ClusterID,
   854  			nodeID:             &nodeCtxs[i].ctx.NodeID,
   855  			settings:           nodeCtxs[i].ctx.settings,
   856  		})
   857  		ln, err := netutil.ListenAndServeGRPC(nodeCtxs[i].ctx.Stopper, s, util.TestAddr)
   858  		if err != nil {
   859  			t.Fatal(err)
   860  		}
   861  		nodeCtxs[i].ctx.Addr = ln.Addr().String()
   862  	}
   863  
   864  	// Fully connect the nodes.
   865  	for i, clientNodeContext := range nodeCtxs {
   866  		for j, serverNodeContext := range nodeCtxs {
   867  			if i == j {
   868  				continue
   869  			}
   870  			if _, err := clientNodeContext.ctx.GRPCDialNode(
   871  				serverNodeContext.ctx.Addr,
   872  				serverNodeContext.ctx.NodeID.Get(),
   873  				DefaultClass).Connect(context.Background()); err != nil {
   874  				t.Fatal(err)
   875  			}
   876  		}
   877  	}
   878  
   879  	// Wait until all nodes are connected to all other nodes.
   880  	for _, nodeCtx := range nodeCtxs {
   881  		testutils.SucceedsSoon(t, func() error {
   882  			nodeCtx.ctx.RemoteClocks.mu.Lock()
   883  			defer nodeCtx.ctx.RemoteClocks.mu.Unlock()
   884  
   885  			if a, e := len(nodeCtx.ctx.RemoteClocks.mu.offsets), len(nodeCtxs)-1; a != e {
   886  				return errors.Errorf("not yet fully connected: have %d of %d connections: %v", a, e, nodeCtx.ctx.RemoteClocks.mu.offsets)
   887  			}
   888  			return nil
   889  		})
   890  	}
   891  
   892  	for i, nodeCtx := range nodeCtxs {
   893  		if nodeOffset := nodeCtx.offset; nodeOffset > maxOffset {
   894  			if err := nodeCtx.ctx.RemoteClocks.VerifyClockOffset(nodeCtx.ctx.masterCtx); testutils.IsError(err, errOffsetGreaterThanMaxOffset) {
   895  				t.Logf("max offset: %s - node %d with excessive clock offset of %s returned expected error: %s", maxOffset, i, nodeOffset, err)
   896  			} else {
   897  				t.Errorf("max offset: %s - node %d with excessive clock offset of %s returned unexpected error: %v", maxOffset, i, nodeOffset, err)
   898  			}
   899  		} else {
   900  			if err := nodeCtx.ctx.RemoteClocks.VerifyClockOffset(nodeCtx.ctx.masterCtx); err != nil {
   901  				t.Errorf("max offset: %s - node %d with acceptable clock offset of %s returned unexpected error: %s", maxOffset, i, nodeOffset, err)
   902  			} else {
   903  				t.Logf("max offset: %s - node %d with acceptable clock offset of %s did not return an error, as expected", maxOffset, i, nodeOffset)
   904  			}
   905  		}
   906  	}
   907  }
   908  
   909  // This is a smoketest for gRPC Keepalives: rpc.Context asks gRPC to perform
   910  // periodic pings on the transport to check that it's still alive. If the ping
   911  // doesn't get a pong within a timeout, the transport is supposed to be closed -
   912  // that's what we're testing here. Likewise, serverside keepalive ensures that
   913  // if a ping is not seen within a timeout, the transport will also be closed.
   914  //
   915  // In this test we use a TestingHeartbeatStreamService as oppposed to a standard
   916  // HeartbeatService. This is important to test scenarios where the
   917  // client->server connection is partitioned but the server->client connection is
   918  // healthy, because a TestingHeartbeatStreamService will continue to respond on
   919  // its response stream even if it doesn't get any new requests.
   920  func TestGRPCKeepaliveFailureFailsInflightRPCs(t *testing.T) {
   921  	defer leaktest.AfterTest(t)()
   922  	t.Skip("Takes too long given https://github.com/grpc/grpc-go/pull/2642")
   923  
   924  	sc := log.Scope(t)
   925  	defer sc.Close(t)
   926  
   927  	testCases := []grpcKeepaliveTestCase{
   928  		// Keepalive doesn't matter if the network is fine.
   929  		{cKeepalive: false, sKeepalive: false, partitionC2S: false, partitionS2C: false, expClose: false},
   930  
   931  		// No keepalive. Never detects network issues.
   932  		{cKeepalive: false, sKeepalive: false, partitionC2S: true, partitionS2C: false, expClose: false},
   933  		{cKeepalive: false, sKeepalive: false, partitionC2S: false, partitionS2C: true, expClose: false},
   934  		{cKeepalive: false, sKeepalive: false, partitionC2S: true, partitionS2C: true, expClose: false},
   935  
   936  		// Client-only keepalive. Doesn't detect client->server partition.
   937  		{cKeepalive: true, sKeepalive: false, partitionC2S: true, partitionS2C: false, expClose: false},
   938  		{cKeepalive: true, sKeepalive: false, partitionC2S: false, partitionS2C: true, expClose: true},
   939  		{cKeepalive: true, sKeepalive: false, partitionC2S: true, partitionS2C: true, expClose: true},
   940  
   941  		// Server-only keepalive. Only detects server->client partition. The
   942  		// bi-directional partition case (third case) may be is surprising.
   943  		// The reason the client doesn't close the connection is because it
   944  		// does not receive the connection closed message sent by the server.
   945  		// This demonstrates why client keepalive is so important.
   946  		{cKeepalive: false, sKeepalive: true, partitionC2S: true, partitionS2C: false, expClose: true},
   947  		{cKeepalive: false, sKeepalive: true, partitionC2S: false, partitionS2C: true, expClose: false},
   948  		{cKeepalive: false, sKeepalive: true, partitionC2S: true, partitionS2C: true, expClose: false},
   949  
   950  		// Client and Server keepalive. Detects all partitions!
   951  		{cKeepalive: true, sKeepalive: true, partitionC2S: true, partitionS2C: false, expClose: true},
   952  		{cKeepalive: true, sKeepalive: true, partitionC2S: false, partitionS2C: true, expClose: true},
   953  		{cKeepalive: true, sKeepalive: true, partitionC2S: true, partitionS2C: true, expClose: true},
   954  	}
   955  
   956  	// For consistent spacing in test names.
   957  	fmtBool := func(b bool) string {
   958  		s := strconv.FormatBool(b)
   959  		if b {
   960  			s += " "
   961  		}
   962  		return s
   963  	}
   964  	connIcon := func(partition bool) string {
   965  		if partition {
   966  			return "-X->"
   967  		}
   968  		return "--->"
   969  	}
   970  
   971  	// Run all the tests.
   972  	var wg sync.WaitGroup
   973  	wg.Add(len(testCases))
   974  	errCh := make(chan error, len(testCases))
   975  	for testNum, c := range testCases {
   976  		kaName := fmt.Sprintf("clientKeepalive=%s,serverKeepalive=%s", fmtBool(c.cKeepalive), fmtBool(c.sKeepalive))
   977  		pName := fmt.Sprintf("client%sserver,server%sclient", connIcon(c.partitionC2S), connIcon(c.partitionS2C))
   978  		testName := fmt.Sprintf("%d/%s/%s", testNum, kaName, pName)
   979  		ctx := logtags.AddTag(context.Background(), testName, nil)
   980  
   981  		log.Infof(ctx, "starting sub-test")
   982  		go func(c grpcKeepaliveTestCase) {
   983  			errCh <- errors.Wrapf(grpcRunKeepaliveTestCase(ctx, c), "%+v", c)
   984  			wg.Done()
   985  		}(c)
   986  	}
   987  	log.Infof(context.Background(), "waiting for sub-tests to complete")
   988  	wg.Wait()
   989  	close(errCh)
   990  
   991  	for err := range errCh {
   992  		if err != nil {
   993  			t.Errorf("%+v", err)
   994  		}
   995  	}
   996  }
   997  
   998  type grpcKeepaliveTestCase struct {
   999  	cKeepalive, sKeepalive     bool
  1000  	partitionC2S, partitionS2C bool
  1001  	expClose                   bool
  1002  }
  1003  
// grpcRunKeepaliveTestCase runs a single keepalive scenario. It starts a gRPC
// server with a heartbeat stream service, dials it through a connection that
// can be partitioned, performs one ping round trip, applies the partitions
// requested by c and then keeps sending pings for a fixed duration.
//
// testCtx is the parent context, used for logging and for the RPCs. c selects
// which side uses keepalives, which directions are partitioned and whether the
// connection is expected to close.
//
// It returns nil if the last send result observed matches c.expClose: a
// closed-connection error when a close is expected, no error otherwise. It
// returns an error if that expectation is not met or if any setup step fails.
func grpcRunKeepaliveTestCase(testCtx context.Context, c grpcKeepaliveTestCase) error {
	// The zero-valued parameters are used for a side that should run without
	// the test's keepalive settings.
	var cKeepalive keepalive.ClientParameters
	if c.cKeepalive {
		cKeepalive = clientKeepalive
	}
	var sKeepalive keepalive.ServerParameters
	if c.sKeepalive {
		sKeepalive = serverTestingKeepalive
	}

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())
	ctx, cancel := stopper.WithCancelOnQuiesce(testCtx)
	defer cancel()

	// Shared cluster ID by all RPC peers (this ensures that the peers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	// Construct server with server-side keepalive.
	log.Infof(ctx, "constructing server")
	clock := hlc.NewClock(timeutil.Unix(0, 20).UnixNano, time.Nanosecond)
	serverCtx := newTestContext(clusterID, clock, stopper)
	const serverNodeID = 1
	serverCtx.NodeID.Set(context.Background(), serverNodeID)
	tlsConfig, err := serverCtx.GetServerTLSConfig()
	if err != nil {
		return err
	}
	s := grpc.NewServer(
		grpc.Creds(credentials.NewTLS(tlsConfig)),
		grpc.StatsHandler(&serverCtx.stats),
		grpc.KeepaliveParams(sKeepalive),
	)

	// Create heartbeat service. This service will continuously
	// read on its input stream and send on its output stream.
	log.Infof(ctx, "creating heartbeat service")
	const msgInterval = 10 * time.Millisecond
	hss := &HeartbeatStreamService{
		HeartbeatService: HeartbeatService{
			clock:              clock,
			remoteClockMonitor: serverCtx.RemoteClocks,
			clusterID:          &serverCtx.ClusterID,
			nodeID:             &serverCtx.NodeID,
			settings:           serverCtx.settings,
		},
		interval: msgInterval,
	}
	RegisterHeartbeatServer(s, hss)
	RegisterTestingHeartbeatStreamServer(s, hss)

	ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
	if err != nil {
		return err
	}
	remoteAddr := ln.Addr().String()

	log.Infof(ctx, "setting up client")
	clientCtx := newTestContext(clusterID, clock, stopper)
	// Disable automatic heartbeats. We'll send them by hand.
	clientCtx.heartbeatInterval = math.MaxInt64

	// firstConn is 1 until the dialer has been invoked once; it is flipped to 0
	// atomically by the first dial attempt.
	var firstConn int32 = 1

	// We're going to open RPC transport connections using a dialer that returns
	// PartitionableConns. We'll partition the first opened connection.
	// The channel has capacity one because only the first dial sends on it.
	dialerCh := make(chan *testutils.PartitionableConn, 1)
	clientCtx.AddTestingDialOpts(
		grpc.WithContextDialer(
			func(_ context.Context, addr string) (net.Conn, error) {
				if !atomic.CompareAndSwapInt32(&firstConn, 1, 0) {
					// If we allow gRPC to open a 2nd transport connection, then our RPCs
					// might succeed if they're sent on that one. In the spirit of a
					// partition, we'll return errors for the attempt to open a new
					// connection (albeit for a TCP connection the error would come after
					// a socket connect timeout).
					return nil, errors.Errorf("No more connections for you. We're partitioned.")
				}

				conn, err := net.Dial("tcp", addr)
				if err != nil {
					return nil, err
				}
				transportConn := testutils.NewPartitionableConn(conn)
				dialerCh <- transportConn
				return transportConn, nil
			}),
		grpc.WithKeepaliveParams(cKeepalive),
	)
	log.Infof(ctx, "dialing server")
	conn, err := clientCtx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(ctx)
	if err != nil {
		return err
	}
	defer func() { _ = conn.Close() }()

	// Create the heartbeat client.
	log.Infof(ctx, "starting heartbeat client")
	unlockedHeartbeatClient, err := NewTestingHeartbeatStreamClient(conn).PingStream(ctx)
	if err != nil {
		return err
	}
	// Wrap the stream client; several goroutines below call Send on it.
	heartbeatClient := &lockedPingStreamClient{
		TestingHeartbeatStream_PingStreamClient: unlockedHeartbeatClient,
	}

	// Perform an initial request-response round trip.
	log.Infof(ctx, "first ping")
	request := PingRequest{ServerVersion: clientCtx.settings.Version.BinaryVersion()}
	if err := heartbeatClient.Send(&request); err != nil {
		return err
	}
	if _, err := heartbeatClient.Recv(); err != nil {
		return err
	}

	// Launch a goroutine to read from the channel continuously and
	// a goroutine to write to the channel continuously. Both will
	// exit when the channel breaks (either because of a partition
	// or because the stopper stops).
	go func() {
		t := time.NewTicker(msgInterval)
		defer t.Stop()
		for {
			<-t.C
			log.Infof(ctx, "client send")
			if err := heartbeatClient.Send(&request); err != nil {
				return
			}
		}
	}()
	go func() {
		for {
			log.Infof(ctx, "client recv")
			if _, err := heartbeatClient.Recv(); err != nil {
				return
			}
		}
	}()

	// Now partition either client->server, server->client, or both, and attempt
	// to perform an RPC. We expect it to fail once the grpc keepalive fails to
	// get a response from the server.
	transportConn := <-dialerCh
	defer transportConn.Finish()
	if c.partitionC2S {
		log.Infof(ctx, "partition C2S")
		transportConn.PartitionC2S()
	}
	if c.partitionS2C {
		log.Infof(ctx, "partition S2C")
		transportConn.PartitionS2C()
	}

	// We want to start a goroutine that keeps trying to send requests and reports
	// the error from the send call. In cases where there are no keep-alives this
	// request may get blocked if flow control blocks it.
	errChan := make(chan error)
	// Note that cancel is reassigned here: from this point on it cancels
	// sendCtx. The deferred call registered earlier still refers to the
	// original cancel function.
	sendCtx, cancel := context.WithCancel(ctx)
	r := retry.StartWithCtx(sendCtx, retry.Options{
		InitialBackoff: 10 * time.Millisecond,
		MaxBackoff:     500 * time.Millisecond,
	})
	defer cancel()
	go func() {
		for r.Next() {
			err := heartbeatClient.Send(&request)
			isClosed := err != nil && grpcutil.IsClosedConnection(err)
			log.Infof(ctx, "heartbeat Send got error %+v (closed=%v)", err, isClosed)
			// errChan is unbuffered, so also watch sendCtx to avoid blocking
			// forever once the receiving loop below has stopped.
			select {
			case errChan <- err:
			case <-sendCtx.Done():
				return
			}
			if isClosed {
				return
			}
		}
	}()
	// Check whether the connection eventually closes. We may need to
	// adjust this duration if the test gets flaky.
	// This unfortunately massive amount of time is required due to gRPC's
	// minimum timeout of 10s and the below issue whereby keepalives are sent
	// at half the expected rate.
	// https://github.com/grpc/grpc-go/issues/2638
	const timeoutDur = 21 * time.Second
	timeout := time.After(timeoutDur)
	// sendErr will hold the last error we saw from an attempt to send a
	// heartbeat. Initialize it with a dummy error which will fail the test if
	// it is not overwritten.
	sendErr := fmt.Errorf("not a real error")
	// Keep recording send results until the timeout fires; only the most
	// recent result is checked below.
	for done := false; !done; {
		select {
		case <-timeout:
			cancel()
			done = true
		case sendErr = <-errChan:
		}
	}
	if c.expClose {
		if sendErr == nil || !grpcutil.IsClosedConnection(sendErr) {
			newErr := fmt.Errorf("expected closed connection, found %v", sendErr)
			log.Infof(ctx, "%+v", newErr)
			return newErr
		}
	} else {
		if sendErr != nil {
			newErr := fmt.Errorf("expected unclosed connection, found %v", sendErr)
			log.Infof(ctx, "%+v", newErr)
			return newErr
		}
	}

	// If the DialOptions we passed to gRPC didn't prevent it from opening new
	// connections, then next RPCs would succeed since gRPC reconnects the
	// transport (and that would succeed here since we've only partitioned one
	// connection). We could further test that the status reported by
	// Context.ConnHealth() for the remote node moves to UNAVAILABLE because of
	// the (application-level) heartbeats performed by rpc.Context, but the
	// behavior of our heartbeats in the face of transport failures is
	// sufficiently tested in TestHeartbeatHealthTransport.
	log.Infof(ctx, "test done")
	return nil
}
  1229  
  1230  func TestClusterIDMismatch(t *testing.T) {
  1231  	defer leaktest.AfterTest(t)()
  1232  
  1233  	stopper := stop.NewStopper()
  1234  	defer stopper.Stop(context.Background())
  1235  
  1236  	clock := hlc.NewClock(timeutil.Unix(0, 20).UnixNano, time.Nanosecond)
  1237  	serverCtx := newTestContext(uuid.MakeV4(), clock, stopper)
  1238  	const serverNodeID = 1
  1239  	serverCtx.NodeID.Set(context.Background(), serverNodeID)
  1240  	s := newTestServer(t, serverCtx)
  1241  	RegisterHeartbeatServer(s, &HeartbeatService{
  1242  		clock:              clock,
  1243  		remoteClockMonitor: serverCtx.RemoteClocks,
  1244  		clusterID:          &serverCtx.ClusterID,
  1245  		nodeID:             &serverCtx.NodeID,
  1246  		settings:           serverCtx.settings,
  1247  	})
  1248  
  1249  	ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
  1250  	if err != nil {
  1251  		t.Fatal(err)
  1252  	}
  1253  	remoteAddr := ln.Addr().String()
  1254  
  1255  	// Ensure the client ctx gets a new fresh cluster ID so it becomes
  1256  	// different from the server's.
  1257  	clientCtx := newTestContext(uuid.MakeV4(), clock, stopper)
  1258  
  1259  	var wg sync.WaitGroup
  1260  	for i := 0; i < 10; i++ {
  1261  		wg.Add(1)
  1262  		go func() {
  1263  			_, err := clientCtx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(context.Background())
  1264  			expected := "initial connection heartbeat failed.*doesn't match server cluster ID"
  1265  			if !testutils.IsError(err, expected) {
  1266  				t.Errorf("expected %s error, got %v", expected, err)
  1267  			}
  1268  			wg.Done()
  1269  		}()
  1270  	}
  1271  	wg.Wait()
  1272  }
  1273  
  1274  func TestClusterNameMismatch(t *testing.T) {
  1275  	defer leaktest.AfterTest(t)()
  1276  
  1277  	clock := hlc.NewClock(timeutil.Unix(0, 20).UnixNano, time.Nanosecond)
  1278  
  1279  	testData := []struct {
  1280  		serverName             string
  1281  		serverDisablePeerCheck bool
  1282  		clientName             string
  1283  		clientDisablePeerCheck bool
  1284  		expectedErr            string
  1285  	}{
  1286  		{"", false, "", false, ``},
  1287  		// The name check is enabled if both the client and server want it.
  1288  		{"a", false, "", false, `peer node expects cluster name "a", use --cluster-name to configure`},
  1289  		{"", false, "a", false, `peer node does not have a cluster name configured, cannot use --cluster-name`},
  1290  		{"a", false, "b", false, `local cluster name "b" does not match peer cluster name "a"`},
  1291  		// It's disabled if either doesn't want it.
  1292  		// However in any case if the name is not empty it has to match.
  1293  		{"a", true, "", false, ``},
  1294  		{"", true, "a", false, ``},
  1295  		{"a", true, "b", false, ``},
  1296  		{"a", false, "", true, ``},
  1297  		{"", false, "a", true, ``},
  1298  		{"a", false, "b", true, ``},
  1299  		{"a", true, "", true, ``},
  1300  		{"", true, "a", true, ``},
  1301  		{"a", true, "b", true, ``},
  1302  	}
  1303  
  1304  	for i, c := range testData {
  1305  		t.Run(fmt.Sprintf("%d", i), func(t *testing.T) {
  1306  			stopper := stop.NewStopper()
  1307  			defer stopper.Stop(context.Background())
  1308  
  1309  			serverCtx := newTestContext(uuid.MakeV4(), clock, stopper)
  1310  			serverCtx.clusterName = c.serverName
  1311  			serverCtx.disableClusterNameVerification = c.serverDisablePeerCheck
  1312  
  1313  			s := newTestServer(t, serverCtx)
  1314  			RegisterHeartbeatServer(s, &HeartbeatService{
  1315  				clock:                          clock,
  1316  				remoteClockMonitor:             serverCtx.RemoteClocks,
  1317  				clusterID:                      &serverCtx.ClusterID,
  1318  				nodeID:                         &serverCtx.NodeID,
  1319  				settings:                       serverCtx.settings,
  1320  				clusterName:                    serverCtx.clusterName,
  1321  				disableClusterNameVerification: serverCtx.disableClusterNameVerification,
  1322  			})
  1323  
  1324  			ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
  1325  			if err != nil {
  1326  				t.Fatal(err)
  1327  			}
  1328  			remoteAddr := ln.Addr().String()
  1329  
  1330  			clientCtx := newTestContext(serverCtx.ClusterID.Get(), clock, stopper)
  1331  			clientCtx.clusterName = c.clientName
  1332  			clientCtx.disableClusterNameVerification = c.clientDisablePeerCheck
  1333  
  1334  			var wg sync.WaitGroup
  1335  			for i := 0; i < 10; i++ {
  1336  				wg.Add(1)
  1337  				go func() {
  1338  					_, err := clientCtx.GRPCUnvalidatedDial(remoteAddr).Connect(context.Background())
  1339  					if !testutils.IsError(err, c.expectedErr) {
  1340  						t.Errorf("expected %s error, got %v", c.expectedErr, err)
  1341  					}
  1342  					wg.Done()
  1343  				}()
  1344  			}
  1345  			wg.Wait()
  1346  		})
  1347  	}
  1348  }
  1349  
  1350  func TestNodeIDMismatch(t *testing.T) {
  1351  	defer leaktest.AfterTest(t)()
  1352  
  1353  	stopper := stop.NewStopper()
  1354  	defer stopper.Stop(context.Background())
  1355  
  1356  	// Shared cluster ID by all RPC peers (this ensures that the peers
  1357  	// don't talk to servers from unrelated tests by accident).
  1358  	clusterID := uuid.MakeV4()
  1359  
  1360  	clock := hlc.NewClock(timeutil.Unix(0, 20).UnixNano, time.Nanosecond)
  1361  	serverCtx := newTestContext(clusterID, clock, stopper)
  1362  	serverCtx.NodeID.Set(context.Background(), 1)
  1363  	s := newTestServer(t, serverCtx)
  1364  	RegisterHeartbeatServer(s, &HeartbeatService{
  1365  		clock:              clock,
  1366  		remoteClockMonitor: serverCtx.RemoteClocks,
  1367  		clusterID:          &serverCtx.ClusterID,
  1368  		nodeID:             &serverCtx.NodeID,
  1369  		settings:           serverCtx.settings,
  1370  	})
  1371  
  1372  	ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
  1373  	if err != nil {
  1374  		t.Fatal(err)
  1375  	}
  1376  	remoteAddr := ln.Addr().String()
  1377  
  1378  	clientCtx := newTestContext(clusterID, clock, stopper)
  1379  
  1380  	var wg sync.WaitGroup
  1381  	for i := 0; i < 10; i++ {
  1382  		wg.Add(1)
  1383  		go func() {
  1384  			_, err := clientCtx.GRPCDialNode(remoteAddr, 2, DefaultClass).Connect(context.Background())
  1385  			expected := "initial connection heartbeat failed.*doesn't match server node ID"
  1386  			if !testutils.IsError(err, expected) {
  1387  				t.Errorf("expected %s error, got %v", expected, err)
  1388  			}
  1389  			wg.Done()
  1390  		}()
  1391  	}
  1392  	wg.Wait()
  1393  }
  1394  
  1395  func setVersion(c *Context, v roachpb.Version) error {
  1396  	st := cluster.MakeTestingClusterSettingsWithVersions(v, v, true /* initializeVersion */)
  1397  	c.settings = st
  1398  	return nil
  1399  }
  1400  
  1401  // Test that GRPCDial fails if there is a version incompatibility in either
  1402  // direction (client -> server or server -> client).
  1403  func TestVersionCheckBidirectional(t *testing.T) {
  1404  	defer leaktest.AfterTest(t)()
  1405  
  1406  	v1 := roachpb.Version{Major: 1}
  1407  	v2 := clusterversion.TestingBinaryVersion
  1408  
  1409  	testData := []struct {
  1410  		name          string
  1411  		serverVersion roachpb.Version
  1412  		clientVersion roachpb.Version
  1413  		expectError   bool
  1414  	}{
  1415  		{"serverVersion == clientVersion", v1, v1, false},
  1416  		{"serverVersion < clientVersion", v1, v2, true},
  1417  		{"serverVersion > clientVersion", v2, v1, true},
  1418  	}
  1419  
  1420  	// Shared cluster ID by all RPC peers (this ensures that the peers
  1421  	// don't talk to servers from unrelated tests by accident).
  1422  	clusterID := uuid.MakeV4()
  1423  
  1424  	for _, td := range testData {
  1425  		t.Run(td.name, func(t *testing.T) {
  1426  			stopper := stop.NewStopper()
  1427  			defer stopper.Stop(context.Background())
  1428  
  1429  			clock := hlc.NewClock(timeutil.Unix(0, 20).UnixNano, time.Nanosecond)
  1430  			serverCtx := newTestContext(clusterID, clock, stopper)
  1431  			const serverNodeID = 1
  1432  			serverCtx.NodeID.Set(context.Background(), serverNodeID)
  1433  			if err := setVersion(serverCtx, td.serverVersion); err != nil {
  1434  				t.Fatal(err)
  1435  			}
  1436  			s := newTestServer(t, serverCtx)
  1437  			RegisterHeartbeatServer(s, &HeartbeatService{
  1438  				clock:              clock,
  1439  				remoteClockMonitor: serverCtx.RemoteClocks,
  1440  				clusterID:          &serverCtx.ClusterID,
  1441  				nodeID:             &serverCtx.NodeID,
  1442  				settings:           serverCtx.settings,
  1443  			})
  1444  
  1445  			ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
  1446  			if err != nil {
  1447  				t.Fatal(err)
  1448  			}
  1449  			remoteAddr := ln.Addr().String()
  1450  
  1451  			clientCtx := newTestContext(clusterID, clock, stopper)
  1452  			if err := setVersion(clientCtx, td.clientVersion); err != nil {
  1453  				t.Fatal(err)
  1454  			}
  1455  
  1456  			_, err = clientCtx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(context.Background())
  1457  
  1458  			if td.expectError {
  1459  				expected := "initial connection heartbeat failed.*cluster requires at least version"
  1460  				if !testutils.IsError(err, expected) {
  1461  					t.Errorf("expected %s error, got %v", expected, err)
  1462  				}
  1463  			} else if err != nil {
  1464  				t.Errorf("unexpected error: %s", err)
  1465  			}
  1466  		})
  1467  	}
  1468  }
  1469  
  1470  // TestGRPCDialClass ensures that distinct connections are constructed when
  1471  // dialing the same target with different classes.
  1472  func TestGRPCDialClass(t *testing.T) {
  1473  	defer leaktest.AfterTest(t)()
  1474  
  1475  	stopper := stop.NewStopper()
  1476  	defer stopper.Stop(context.Background())
  1477  
  1478  	clock := hlc.NewClock(timeutil.Unix(0, 20).UnixNano, time.Nanosecond)
  1479  	serverCtx := newTestContext(uuid.MakeV4(), clock, stopper)
  1480  	const serverNodeID = 1
  1481  	serverCtx.NodeID.Set(context.Background(), serverNodeID)
  1482  	s := newTestServer(t, serverCtx)
  1483  	RegisterHeartbeatServer(s, &HeartbeatService{
  1484  		clock:              clock,
  1485  		remoteClockMonitor: serverCtx.RemoteClocks,
  1486  		clusterID:          &serverCtx.ClusterID,
  1487  		nodeID:             &serverCtx.NodeID,
  1488  		settings:           serverCtx.settings,
  1489  	})
  1490  
  1491  	ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
  1492  	require.Nil(t, err)
  1493  	remoteAddr := ln.Addr().String()
  1494  	clientCtx := newTestContext(serverCtx.ClusterID.Get(), clock, stopper)
  1495  
  1496  	def1 := clientCtx.GRPCDialNode(remoteAddr, 1, DefaultClass)
  1497  	sys1 := clientCtx.GRPCDialNode(remoteAddr, 1, SystemClass)
  1498  	require.False(t, sys1 == def1,
  1499  		"expected connections dialed with different classes to the same target to differ")
  1500  	defConn1, err := def1.Connect(context.Background())
  1501  	require.Nil(t, err, "expected successful connection")
  1502  	sysConn1, err := sys1.Connect(context.Background())
  1503  	require.Nil(t, err, "expected successful connection")
  1504  	require.False(t, sysConn1 == defConn1, "expected connections dialed with "+
  1505  		"different classes to the sametarget to have separate underlying gRPC connections")
  1506  	def2 := clientCtx.GRPCDialNode(remoteAddr, 1, DefaultClass)
  1507  	require.True(t, def1 == def2, "expected connections dialed with the same "+
  1508  		"class to the same target to be the same")
  1509  	sys2 := clientCtx.GRPCDialNode(remoteAddr, 1, SystemClass)
  1510  	require.True(t, sys1 == sys2, "expected connections dialed with the same "+
  1511  		"class to the same target to be the same")
  1512  	for _, c := range []*Connection{def2, sys2} {
  1513  		require.Nil(t, c.Health(), "expected connections to be healthy")
  1514  	}
  1515  }
  1516  
  1517  // TestTestingKnobs ensures that the testing knobs are injected in the proper
  1518  // places.
  1519  func TestTestingKnobs(t *testing.T) {
  1520  	defer leaktest.AfterTest(t)()
  1521  
  1522  	stopper := stop.NewStopper()
  1523  	defer stopper.Stop(context.Background())
  1524  	clusterID := uuid.MakeV4()
  1525  
  1526  	clock := hlc.NewClock(timeutil.Unix(0, 20).UnixNano, time.Nanosecond)
  1527  	serverCtx := newTestContext(clusterID, clock, stopper)
  1528  	const serverNodeID = 1
  1529  	serverCtx.NodeID.Set(context.Background(), serverNodeID)
  1530  	// Register an UnknownServiceHandler that expects a BatchRequest and sends
  1531  	// a BatchResponse. It will be used both as a unary and stream handler below.
  1532  	s := newTestServer(t, serverCtx, grpc.UnknownServiceHandler(
  1533  		func(srv interface{}, stream grpc.ServerStream) error {
  1534  			var ba roachpb.BatchRequest
  1535  			if err := stream.RecvMsg(&ba); err != nil {
  1536  				return err
  1537  			}
  1538  			return stream.SendMsg(&roachpb.BatchResponse{})
  1539  		},
  1540  	))
  1541  	RegisterHeartbeatServer(s, &HeartbeatService{
  1542  		clock:              clock,
  1543  		remoteClockMonitor: serverCtx.RemoteClocks,
  1544  		clusterID:          &serverCtx.ClusterID,
  1545  		nodeID:             &serverCtx.NodeID,
  1546  		settings:           serverCtx.settings,
  1547  	})
  1548  
  1549  	// The test will inject interceptors for both stream and unary calls and then
  1550  	// will ensure that these interceptors are properly called by keeping track
  1551  	// of all calls.
  1552  
  1553  	// Use these structs to keep track of the number of times the interceptors
  1554  	// are called in the seen map below.
  1555  	type streamCall struct {
  1556  		target string
  1557  		class  ConnectionClass
  1558  		method string
  1559  	}
  1560  	type unaryCall struct {
  1561  		target string
  1562  		class  ConnectionClass
  1563  		method string
  1564  	}
  1565  	seen := make(map[interface{}]int)
  1566  	var seenMu syncutil.Mutex
  1567  	recordCall := func(call interface{}) {
  1568  		seenMu.Lock()
  1569  		defer seenMu.Unlock()
  1570  		seen[call]++
  1571  	}
  1572  	clientCtx := newTestContextWithKnobs(clock, stopper, ContextTestingKnobs{
  1573  		ClusterID: &clusterID,
  1574  		StreamClientInterceptor: func(
  1575  			target string, class ConnectionClass,
  1576  		) grpc.StreamClientInterceptor {
  1577  			return func(
  1578  				ctx context.Context, desc *grpc.StreamDesc, cc *grpc.ClientConn,
  1579  				method string, streamer grpc.Streamer, opts ...grpc.CallOption,
  1580  			) (grpc.ClientStream, error) {
  1581  				cs, err := streamer(ctx, desc, cc, method, opts...)
  1582  				if err != nil {
  1583  					return nil, err
  1584  				}
  1585  				recordCall(streamCall{
  1586  					target: target,
  1587  					class:  class,
  1588  					method: method,
  1589  				})
  1590  				return cs, nil
  1591  			}
  1592  		},
  1593  		UnaryClientInterceptor: func(
  1594  			target string, class ConnectionClass,
  1595  		) grpc.UnaryClientInterceptor {
  1596  			return func(
  1597  				ctx context.Context, method string, req, reply interface{},
  1598  				cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption,
  1599  			) error {
  1600  				recordCall(unaryCall{
  1601  					target: target,
  1602  					class:  class,
  1603  					method: method,
  1604  				})
  1605  				return invoker(ctx, method, req, reply, cc, opts...)
  1606  			}
  1607  		},
  1608  	})
  1609  
  1610  	ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
  1611  	require.Nil(t, err)
  1612  	remoteAddr := ln.Addr().String()
  1613  	sysConn, err := clientCtx.GRPCDialNode(remoteAddr, 1, SystemClass).Connect(context.Background())
  1614  	require.Nil(t, err)
  1615  	defConn, err := clientCtx.GRPCDialNode(remoteAddr, 1, DefaultClass).Connect(context.Background())
  1616  	require.Nil(t, err)
  1617  	const unaryMethod = "/cockroach.rpc.Testing/Foo"
  1618  	const streamMethod = "/cockroach.rpc.Testing/Bar"
  1619  	const numSysUnary = 3
  1620  	for i := 0; i < numSysUnary; i++ {
  1621  		ba := roachpb.BatchRequest{}
  1622  		br := roachpb.BatchResponse{}
  1623  		err := sysConn.Invoke(context.Background(), unaryMethod, &ba, &br)
  1624  		require.Nil(t, err)
  1625  	}
  1626  	const numDefStream = 4
  1627  	for i := 0; i < numDefStream; i++ {
  1628  		desc := grpc.StreamDesc{
  1629  			StreamName:    "bar",
  1630  			ClientStreams: true,
  1631  		}
  1632  		cs, err := defConn.NewStream(context.Background(), &desc, streamMethod)
  1633  		require.Nil(t, err)
  1634  		require.Nil(t, cs.SendMsg(&roachpb.BatchRequest{}))
  1635  		var br roachpb.BatchResponse
  1636  		require.Nil(t, cs.RecvMsg(&br))
  1637  		require.Nil(t, cs.CloseSend())
  1638  	}
  1639  
  1640  	exp := map[interface{}]int{
  1641  		unaryCall{
  1642  			target: remoteAddr,
  1643  			class:  SystemClass,
  1644  			method: unaryMethod,
  1645  		}: numSysUnary,
  1646  		streamCall{
  1647  			target: remoteAddr,
  1648  			class:  DefaultClass,
  1649  			method: streamMethod,
  1650  		}: numDefStream,
  1651  	}
  1652  	seenMu.Lock()
  1653  	defer seenMu.Unlock()
  1654  	for call, num := range exp {
  1655  		require.Equal(t, num, seen[call])
  1656  	}
  1657  }
  1658  
  1659  // This test ensures that clients cannot be left waiting on
  1660  // `Connection.Connect()` calls in the rare case where a heartbeat loop
  1661  // exits before attempting to send its first heartbeat. See #41521.
  1662  func TestRunHeartbeatSetsHeartbeatStateWhenExitingBeforeFirstHeartbeat(t *testing.T) {
  1663  	defer leaktest.AfterTest(t)()
  1664  	ctx := context.Background()
  1665  	stopper := stop.NewStopper()
  1666  	defer stopper.Stop(ctx)
  1667  	clusterID := uuid.MakeV4()
  1668  
  1669  	clock := hlc.NewClock(timeutil.Unix(0, 20).UnixNano, time.Nanosecond)
  1670  
  1671  	// This test reaches into low-level implementation details to recreate
  1672  	// the hazardous scenario seen in #41521. In that isse we saw a runHeartbeat()
  1673  	// loop exit prior to sending the first heartbeat. To recreate that scenario
  1674  	// which seems difficult to create now that gRPC backs off redialing, we
  1675  	// launch the runHeartbeat() loop with an already closed redial chan.
  1676  	// In order to hit predictable errors we run an actual server on the other
  1677  	// side of the Connection passed to runHeartbeat().
  1678  	//
  1679  	// At least half of the time this test will hit the case where the select
  1680  	// in runHeartbeat detects the closed redial chan and returns. The
  1681  	// correctness criteria we're trying to verify is that the Connect call
  1682  	// below does not block.
  1683  
  1684  	rpcCtx := newTestContext(clusterID, clock, stopper)
  1685  
  1686  	const serverNodeID = 1
  1687  	serverCtx := newTestContext(clusterID, clock, stopper)
  1688  	serverCtx.NodeID.Set(ctx, serverNodeID)
  1689  
  1690  	s := NewServer(serverCtx)
  1691  	ln, err := netutil.ListenAndServeGRPC(stopper, s, util.TestAddr)
  1692  	if err != nil {
  1693  		t.Fatal(err)
  1694  	}
  1695  	remoteAddr := ln.Addr().String()
  1696  
  1697  	c := newConnectionToNodeID(stopper, 1)
  1698  
  1699  	redialChan := make(chan struct{})
  1700  	close(redialChan)
  1701  
  1702  	c.grpcConn, _, c.dialErr = rpcCtx.grpcDialRaw(remoteAddr, serverNodeID, DefaultClass)
  1703  	require.NoError(t, c.dialErr)
  1704  	// It is possible that the redial chan being closed is not seen on the first
  1705  	// pass through the loop.
  1706  	err = rpcCtx.runHeartbeat(c, "", redialChan)
  1707  	require.EqualError(t, err, grpcutil.ErrCannotReuseClientConn.Error())
  1708  	// Even when the runHeartbeat returns, we could have heartbeated successfully.
  1709  	// If we did not, then we expect the `not yet heartbeated` error.
  1710  	if _, err = c.Connect(ctx); err != nil {
  1711  		require.Regexp(t, "not yet heartbeated", err)
  1712  	}
  1713  	require.NoError(t, c.grpcConn.Close())
  1714  }
  1715  
  1716  func BenchmarkGRPCDial(b *testing.B) {
  1717  	if testing.Short() {
  1718  		b.Skip("TODO: fix benchmark")
  1719  	}
  1720  	stopper := stop.NewStopper()
  1721  	defer stopper.Stop(context.Background())
  1722  
  1723  	clock := hlc.NewClock(hlc.UnixNano, 250*time.Millisecond)
  1724  	ctx := newTestContext(uuid.MakeV4(), clock, stopper)
  1725  	const serverNodeID = 1
  1726  	ctx.NodeID.Set(context.Background(), serverNodeID)
  1727  
  1728  	s := newTestServer(b, ctx)
  1729  	ln, err := netutil.ListenAndServeGRPC(ctx.Stopper, s, util.TestAddr)
  1730  	if err != nil {
  1731  		b.Fatal(err)
  1732  	}
  1733  	remoteAddr := ln.Addr().String()
  1734  
  1735  	b.RunParallel(func(pb *testing.PB) {
  1736  		for pb.Next() {
  1737  			_, err := ctx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(context.Background())
  1738  			if err != nil {
  1739  				b.Fatal(err)
  1740  			}
  1741  		}
  1742  	})
  1743  }