github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/rpc/context_test.go

// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package rpc

import (
	"context"
	"fmt"
	"math"
	"net"
	"strconv"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/clusterversion"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/netutil"
	"github.com/cockroachdb/cockroach/pkg/util/retry"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/logtags"
	"github.com/stretchr/testify/require"
	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/credentials"
	"google.golang.org/grpc/keepalive"
	"google.golang.org/grpc/status"
)

// TestingConnHealth returns nil if we have an open connection to the given
// target with DefaultClass that succeeded on its most recent heartbeat.
// Otherwise, it kicks off a connection attempt (unless one is already in
// progress or we are in a backoff state) and returns an error (typically
// ErrNotHeartbeated). This is a conservative/pessimistic indicator:
// if we have not attempted to talk to the target node recently, an
// error will be returned. This method should therefore be used to
// prioritize among a list of candidate nodes, but not to filter out
// "unhealthy" nodes.
//
// This is used in tests only; in clusters use (*Dialer).ConnHealth()
// instead which automates the address resolution.
//
// TODO(knz): remove this altogether. Use the dialer in all cases.
func (ctx *Context) TestingConnHealth(target string, nodeID roachpb.NodeID) error {
	if ctx.GetLocalInternalClientForAddr(target, nodeID) != nil {
		// The local server is always considered healthy.
		return nil
	}
	conn := ctx.GRPCDialNode(target, nodeID, DefaultClass)
	return conn.Health()
}

// AddTestingDialOpts adds extra dialing options to the rpc Context. This should
// be done before GRPCDial is called.
func (ctx *Context) AddTestingDialOpts(opts ...grpc.DialOption) {
	ctx.testingDialOpts = append(ctx.testingDialOpts, opts...)
}
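
// Illustrative sketch, not part of the original file: the polling pattern
// TestingConnHealth is designed for. The first probe typically reports
// ErrNotHeartbeated while it kicks off the connection attempt, so tests wait
// for nil with SucceedsSoon. exampleConnHealthPolling is a hypothetical
// helper name.
func exampleConnHealthPolling(t *testing.T, rpcCtx *Context, addr string, nodeID roachpb.NodeID) {
	testutils.SucceedsSoon(t, func() error {
		// Returns nil once the most recent heartbeat on the DefaultClass
		// connection to addr has succeeded.
		return rpcCtx.TestingConnHealth(addr, nodeID)
	})
}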

func newTestServer(t testing.TB, ctx *Context, extraOpts ...grpc.ServerOption) *grpc.Server {
	tlsConfig, err := ctx.GetServerTLSConfig()
	if err != nil {
		t.Fatal(err)
	}
	opts := []grpc.ServerOption{
		grpc.Creds(credentials.NewTLS(tlsConfig)),
		grpc.StatsHandler(&ctx.stats),
	}
	opts = append(opts, extraOpts...)
	return grpc.NewServer(opts...)
}

func newTestContextWithKnobs(
	clock *hlc.Clock, stopper *stop.Stopper, knobs ContextTestingKnobs,
) *Context {
	return NewContextWithTestingKnobs(
		log.AmbientContext{Tracer: tracing.NewTracer()},
		testutils.NewNodeTestBaseContext(),
		clock,
		stopper,
		cluster.MakeTestingClusterSettings(),
		knobs,
	)
}

func newTestContext(clusterID uuid.UUID, clock *hlc.Clock, stopper *stop.Stopper) *Context {
	return newTestContextWithKnobs(clock, stopper, ContextTestingKnobs{
		ClusterID: &clusterID,
	})
}

func TestHeartbeatCB(t *testing.T) {
	defer leaktest.AfterTest(t)()

	testutils.RunTrueAndFalse(t, "compression", func(t *testing.T, compression bool) {
		stopper := stop.NewStopper()
		defer stopper.Stop(context.Background())

		// Shared cluster ID by all RPC peers (this ensures that the peers
		// don't talk to servers from unrelated tests by accident).
		clusterID := uuid.MakeV4()

		clock := hlc.NewClock(timeutil.Unix(0, 20).UnixNano, time.Nanosecond)
		serverCtx := newTestContext(clusterID, clock, stopper)
		serverCtx.rpcCompression = compression
		const serverNodeID = 1
		serverCtx.NodeID.Set(context.Background(), serverNodeID)
		s := newTestServer(t, serverCtx)
		RegisterHeartbeatServer(s, &HeartbeatService{
			clock:              clock,
			remoteClockMonitor: serverCtx.RemoteClocks,
			clusterID:          &serverCtx.ClusterID,
			nodeID:             &serverCtx.NodeID,
			settings:           serverCtx.settings,
		})

		ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
		if err != nil {
			t.Fatal(err)
		}
		remoteAddr := ln.Addr().String()

		// Clocks don't matter in this test.
		clientCtx := newTestContext(clusterID, clock, stopper)
		clientCtx.rpcCompression = compression

		var once sync.Once
		ch := make(chan struct{})

		clientCtx.HeartbeatCB = func() {
			once.Do(func() {
				close(ch)
			})
		}

		if _, err := clientCtx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(context.Background()); err != nil {
			t.Fatal(err)
		}

		<-ch
	})
}

type internalServer struct{}

func (*internalServer) Batch(
	context.Context, *roachpb.BatchRequest,
) (*roachpb.BatchResponse, error) {
	return nil, nil
}

func (*internalServer) RangeFeed(
	_ *roachpb.RangeFeedRequest, _ roachpb.Internal_RangeFeedServer,
) error {
	panic("unimplemented")
}

// TestInternalServerAddress verifies that RPCContext uses AdvertiseAddr, not Addr, to
// determine whether to apply the local server optimization.
//
// Prevents regression of https://github.com/cockroachdb/cockroach/issues/19991.
func TestInternalServerAddress(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// Can't be zero because that'd be an empty offset.
	clock := hlc.NewClock(timeutil.Unix(0, 1).UnixNano, time.Nanosecond)

	serverCtx := newTestContext(uuid.MakeV4(), clock, stopper)
	serverCtx.Config.Addr = "127.0.0.1:9999"
	serverCtx.Config.AdvertiseAddr = "127.0.0.1:8888"
	serverCtx.NodeID.Set(context.Background(), 1)

	internal := &internalServer{}
	serverCtx.SetLocalInternalServer(internal)

	exp := internalClientAdapter{internal}
	if ic := serverCtx.GetLocalInternalClientForAddr(serverCtx.Config.AdvertiseAddr, 1); ic != exp {
		t.Fatalf("expected %+v, got %+v", exp, ic)
	}
}
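
// Illustrative sketch, not part of the original file: the local server
// optimization TestInternalServerAddress exercises. Assuming the return type
// is the internal client interface, lookups keyed on the advertised address
// (not Addr) get an in-process adapter that bypasses gRPC; any other address
// returns nil. exampleLocalClient is a hypothetical helper name.
func exampleLocalClient(rpcCtx *Context) roachpb.InternalClient {
	rpcCtx.SetLocalInternalServer(&internalServer{})
	// Non-nil only for the advertised address paired with this node's ID.
	return rpcCtx.GetLocalInternalClientForAddr(rpcCtx.Config.AdvertiseAddr, rpcCtx.NodeID.Get())
}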

// TestHeartbeatHealth verifies that the health status changes after
// heartbeats succeed or fail.
func TestHeartbeatHealth(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// Can't be zero because that'd be an empty offset.
	clock := hlc.NewClock(timeutil.Unix(0, 1).UnixNano, time.Nanosecond)

	// Shared cluster ID by all RPC peers (this ensures that the peers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	const serverNodeID = 1
	const clientNodeID = 2

	serverCtx := newTestContext(clusterID, clock, stop.NewStopper())
	serverCtx.NodeID.Set(context.Background(), serverNodeID)
	s := newTestServer(t, serverCtx)

	heartbeat := &ManualHeartbeatService{
		ready:              make(chan error),
		stopper:            stopper,
		clock:              clock,
		remoteClockMonitor: serverCtx.RemoteClocks,
		settings:           serverCtx.settings,
		nodeID:             &serverCtx.NodeID,
	}
	RegisterHeartbeatServer(s, heartbeat)

	errFailedHeartbeat := errors.New("failed heartbeat")

	var hbSuccess atomic.Value
	hbSuccess.Store(true)

	go func() {
		for {
			var err error
			if !hbSuccess.Load().(bool) {
				err = errFailedHeartbeat
			}

			select {
			case <-stopper.ShouldStop():
				return
			case heartbeat.ready <- err:
			}
		}
	}()

	lisNotLocalServer, err := net.Listen("tcp", "127.0.0.1:0")
	defer func() {
		netutil.FatalIfUnexpected(lisNotLocalServer.Close())
	}()
	if err != nil {
		t.Fatal(err)
	}
	lisLocalServer, err := net.Listen("tcp", "127.0.0.1:0")
	defer func() {
		netutil.FatalIfUnexpected(lisLocalServer.Close())
	}()
	if err != nil {
		t.Fatal(err)
	}

	clientCtx := newTestContext(clusterID, clock, stopper)
	clientCtx.NodeID.Set(context.Background(), clientNodeID)
	clientCtx.Addr = lisNotLocalServer.Addr().String()
	clientCtx.AdvertiseAddr = lisLocalServer.Addr().String()
	// Make the interval shorter to speed up the test.
	clientCtx.heartbeatInterval = 1 * time.Millisecond

	ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
	if err != nil {
		t.Fatal(err)
	}
	remoteAddr := ln.Addr().String()
	if _, err := clientCtx.GRPCDialNode(
		remoteAddr, serverNodeID, DefaultClass).Connect(context.Background()); err != nil {
		t.Fatal(err)
	}

	// Wait for the connection.
	testutils.SucceedsSoon(t, func() error {
		err := clientCtx.TestingConnHealth(remoteAddr, serverNodeID)
		if err != nil && !errors.Is(err, ErrNotHeartbeated) {
			t.Fatal(err)
		}
		return err
	})
	assertGauges(t, clientCtx.Metrics(),
		0 /* initializing */, 1 /* nominal */, 0 /* failed */)

	// Should be unhealthy in the presence of failing heartbeats.
	hbSuccess.Store(false)
	testutils.SucceedsSoon(t, func() error {
		if err := clientCtx.TestingConnHealth(remoteAddr, serverNodeID); !testutils.IsError(err, errFailedHeartbeat.Error()) {
			return errors.Errorf("unexpected error: %v", err)
		}
		return nil
	})
	assertGauges(t, clientCtx.Metrics(),
		0 /* initializing */, 0 /* nominal */, 1 /* failed */)

	// Should become healthy in the presence of successful heartbeats.
	hbSuccess.Store(true)
	testutils.SucceedsSoon(t, func() error {
		return clientCtx.TestingConnHealth(remoteAddr, serverNodeID)
	})
	assertGauges(t, clientCtx.Metrics(),
		0 /* initializing */, 1 /* nominal */, 0 /* failed */)

	// Should become unhealthy again in the presence of failing heartbeats.
	hbSuccess.Store(false)
	testutils.SucceedsSoon(t, func() error {
		if err := clientCtx.TestingConnHealth(remoteAddr, serverNodeID); !testutils.IsError(err, errFailedHeartbeat.Error()) {
			return errors.Errorf("unexpected error: %v", err)
		}
		return nil
	})
	assertGauges(t, clientCtx.Metrics(),
		0 /* initializing */, 0 /* nominal */, 1 /* failed */)

	// Should become healthy in the presence of successful heartbeats.
	hbSuccess.Store(true)
	testutils.SucceedsSoon(t, func() error {
		return clientCtx.TestingConnHealth(remoteAddr, serverNodeID)
	})
	assertGauges(t, clientCtx.Metrics(),
		0 /* initializing */, 1 /* nominal */, 0 /* failed */)

	// Ensure that non-existing connections return ErrNotHeartbeated.

	lisNonExistentConnection, err := net.Listen("tcp", "127.0.0.1:0")
	defer func() {
		netutil.FatalIfUnexpected(lisNonExistentConnection.Close())
	}()
	if err != nil {
		t.Fatal(err)
	}
	if err := clientCtx.TestingConnHealth(lisNonExistentConnection.Addr().String(), 3); !errors.Is(err, ErrNotHeartbeated) {
		t.Errorf("wanted ErrNotHeartbeated, not %v", err)
	}
	// The connection to Node 3 on the lisNonExistentConnection should be
	// initializing and the server connection should be nominal.
	testutils.SucceedsSoon(t, func() error {
		return checkGauges(clientCtx.Metrics(),
			1 /* initializing */, 1 /* nominal */, 0 /* failed */)
	})

	if err := clientCtx.TestingConnHealth(clientCtx.Addr, clientNodeID); !errors.Is(err, ErrNotHeartbeated) {
		t.Errorf("wanted ErrNotHeartbeated, not %v", err)
	}

	// Ensure that the local Addr returns ErrNotHeartbeated without having dialed
	// a connection but the local AdvertiseAddr successfully returns no error when
	// an internal server has been registered.
	clientCtx.SetLocalInternalServer(&internalServer{})

	if err := clientCtx.TestingConnHealth(clientCtx.Addr, clientNodeID); !errors.Is(err, ErrNotHeartbeated) {
		t.Errorf("wanted ErrNotHeartbeated, not %v", err)
	}
	if err := clientCtx.TestingConnHealth(clientCtx.AdvertiseAddr, clientNodeID); err != nil {
		t.Error(err)
	}

	// Ensure that when the server closes its connection the context attempts to
	// reconnect. Both the server connection on Node 1 and the non-existent
	// connection should be initializing.
	serverCtx.Stopper.Stop(context.Background())
	testutils.SucceedsSoon(t, func() error {
		return checkGauges(clientCtx.Metrics(),
			2 /* initializing */, 0 /* nominal */, 0 /* failed */)
	})
	const expNumStarted = 3 // 2 for the server and 1 for the non-existent conn
	numStarted := clientCtx.Metrics().HeartbeatLoopsStarted.Count()
	if numStarted != expNumStarted {
		t.Fatalf("expected %d heartbeat loops to have been started, got %d",
			expNumStarted, numStarted)
	}
	const expNumExited = 1 // 1 for the server upon shutdown
	numExited := clientCtx.Metrics().HeartbeatLoopsExited.Count()
	if numExited != expNumExited {
		t.Fatalf("expected %d heartbeat loops to have exited, got %d",
			expNumExited, numExited)
	}
}
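
// Illustrative sketch, not part of the original file: the
// ManualHeartbeatService driver pattern used in TestHeartbeatHealth above.
// Each value sent on ready releases exactly one heartbeat, and a non-nil
// error makes that heartbeat fail. exampleDriveHeartbeats is a hypothetical
// helper name.
func exampleDriveHeartbeats(stopper *stop.Stopper, hb *ManualHeartbeatService, failWith error) {
	go func() {
		for {
			select {
			case <-stopper.ShouldStop():
				return
			case hb.ready <- failWith: // nil lets the next heartbeat succeed
			}
		}
	}()
}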

func checkGauges(m *Metrics, initializing, nominal, failed int64) error {
	if got := m.HeartbeatsInitializing.Value(); got != initializing {
		return errors.Errorf("expected %d initializing heartbeats, got %d", initializing, got)
	}
	if got := m.HeartbeatsNominal.Value(); got != nominal {
		return errors.Errorf("expected %d nominal heartbeats, got %d", nominal, got)
	}
	if got := m.HeartbeatsFailed.Value(); got != failed {
		return errors.Errorf("expected %d failed heartbeats, got %d", failed, got)
	}
	return nil
}

func assertGauges(t *testing.T, m *Metrics, initializing, nominal, failed int64) {
	t.Helper()
	if err := checkGauges(m, initializing, nominal, failed); err != nil {
		t.Error(err)
	}
}

// TestConnectionRemoveNodeIDZero verifies that when a connection initiated via
// GRPCDialNode fails, we also clean up the connection returned by
// GRPCUnvalidatedDial.
//
// See #37200.
func TestConnectionRemoveNodeIDZero(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ctx := context.Background()
	stopper := stop.NewStopper()
	defer stopper.Stop(ctx)

	clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond)
	clientCtx := newTestContext(uuid.MakeV4(), clock, stopper)
	// Provoke an error.
	_, err := clientCtx.GRPCDialNode("127.0.0.1:notaport", 1, DefaultClass).Connect(context.Background())
	if err == nil {
		t.Fatal("expected some kind of error, got nil")
	}

	// NB: this takes a moment because GRPCDialRaw only gives up on the initial
	// connection after 1s (more precisely, the redialChan gets closed only after
	// 1s), which seems difficult to configure ad-hoc.
	testutils.SucceedsSoon(t, func() error {
		var keys []connKey
		clientCtx.conns.Range(func(k, v interface{}) bool {
			keys = append(keys, k.(connKey))
			return true
		})
		if len(keys) > 0 {
			return errors.Errorf("still have connections %v", keys)
		}
		return nil
	})
}

type interceptingListener struct {
	net.Listener
	connCB func(net.Conn)
}

func (ln *interceptingListener) Accept() (net.Conn, error) {
	conn, err := ln.Listener.Accept()
	if err == nil {
		ln.connCB(conn)
	}
	return conn, err
}

// TestHeartbeatHealthTransport verifies that the health status changes after
// heartbeats succeed or fail due to transport failures.
func TestHeartbeatHealthTransport(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	ctx := context.Background()

	// Shared cluster ID by all RPC peers (this ensures that the peers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	// Can't be zero because that'd be an empty offset.
	clock := hlc.NewClock(timeutil.Unix(0, 1).UnixNano, time.Nanosecond)

	serverCtx := newTestContext(clusterID, clock, stopper)
	const serverNodeID = 1
	serverCtx.NodeID.Set(context.Background(), serverNodeID)
	// newTestServer with a custom listener.
	tlsConfig, err := serverCtx.GetServerTLSConfig()
	if err != nil {
		t.Fatal(err)
	}
	s := grpc.NewServer(grpc.Creds(credentials.NewTLS(tlsConfig)))
	RegisterHeartbeatServer(s, &HeartbeatService{
		clock:              clock,
		remoteClockMonitor: serverCtx.RemoteClocks,
		clusterID:          &serverCtx.ClusterID,
		nodeID:             &serverCtx.NodeID,
		settings:           serverCtx.settings,
	})

	mu := struct {
		syncutil.Mutex
		conns     []net.Conn
		autoClose bool
	}{}
	ln := func() *interceptingListener {
		ln, err := net.Listen("tcp", util.TestAddr.String())
		if err != nil {
			t.Fatal(err)
		}
		return &interceptingListener{
			Listener: ln,
			connCB: func(conn net.Conn) {
				mu.Lock()
				if mu.autoClose {
					_ = conn.Close()
				} else {
					mu.conns = append(mu.conns, conn)
				}
				mu.Unlock()
			}}
	}()

	stopper.RunWorker(ctx, func(context.Context) {
		<-stopper.ShouldQuiesce()
		netutil.FatalIfUnexpected(ln.Close())
		<-stopper.ShouldStop()
		s.Stop()
	})

	stopper.RunWorker(ctx, func(context.Context) {
		netutil.FatalIfUnexpected(s.Serve(ln))
	})

	remoteAddr := ln.Addr().String()

	clientCtx := newTestContext(clusterID, clock, stopper)
	// Make the interval shorter to speed up the test.
	clientCtx.heartbeatInterval = 1 * time.Millisecond
	if _, err := clientCtx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(context.Background()); err != nil {
		t.Fatal(err)
	}
	// Everything is normal; should become healthy.
	testutils.SucceedsSoon(t, func() error {
		return clientCtx.TestingConnHealth(remoteAddr, serverNodeID)
	})

	closeConns := func() (numClosed int, _ error) {
		mu.Lock()
		defer mu.Unlock()
		n := len(mu.conns)
		for i := n - 1; i >= 0; i-- {
			if err := mu.conns[i].Close(); err != nil {
				return 0, err
			}
			mu.conns = mu.conns[:i]
		}
		return n, nil
	}

	isUnhealthy := func(err error) bool {
		// Most of the time, an unhealthy connection will get
		// ErrNotHeartbeated, but there are brief periods during which we
		// could get one of the grpc errors below (while the old
		// connection is in the middle of closing).
		if errors.Is(err, ErrNotHeartbeated) {
			return true
		}
		// The expected code here is Unavailable, but at least on OSX you can also get
		//
		// rpc error: code = Internal desc = connection error: desc = "transport: authentication
		// handshake failed: write tcp 127.0.0.1:53936->127.0.0.1:53934: write: broken pipe".
		code := status.Code(err)
		return code == codes.Unavailable || code == codes.Internal
	}

	// Close all the connections until we see a failure on the main goroutine.
	done := make(chan struct{})
	if err := stopper.RunAsyncTask(ctx, "busyloop-closer", func(ctx context.Context) {
		for {
			if _, err := closeConns(); err != nil {
				log.Warningf(ctx, "%v", err)
			}
			select {
			case <-done:
				return
			default:
			}
		}
	}); err != nil {
		t.Fatal(err)
	}

	// We don't use SucceedsSoon because that internally uses doubling backoffs, and
	// it doesn't need too much bad luck to run into the time limit.
	for then := timeutil.Now(); ; {
		err := func() error {
			if err := clientCtx.TestingConnHealth(remoteAddr, serverNodeID); !isUnhealthy(err) {
				return errors.Errorf("unexpected error: %v", err)
			}
			return nil
		}()
		if err == nil {
			break
		}
		if timeutil.Since(then) > 45*time.Second {
			t.Fatal(err)
		}
		time.Sleep(10 * time.Millisecond)
	}

	close(done)

	// We can reconnect and the connection becomes healthy again.
	testutils.SucceedsSoon(t, func() error {
		if _, err := clientCtx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(context.Background()); err != nil {
			return err
		}
		return clientCtx.TestingConnHealth(remoteAddr, serverNodeID)
	})

	// Close the listener and all the connections. Note that if we
	// only closed the listener, recently-accepted-but-not-yet-handled
	// connections could sneak in and randomly make the target healthy
	// again. To avoid this, we flip the boolean below which is used in
	// our handler callback to eagerly close any stragglers.
	mu.Lock()
	mu.autoClose = true
	mu.Unlock()
	if err := ln.Close(); err != nil {
		t.Fatal(err)
	}

	// Also terminate any existing connections.
	if _, err := closeConns(); err != nil {
		t.Fatal(err)
	}

	// Should become unhealthy again now that the connection was closed.
	testutils.SucceedsSoon(t, func() error {
		err := clientCtx.TestingConnHealth(remoteAddr, serverNodeID)

		if !isUnhealthy(err) {
			return errors.Errorf("unexpected error: %v", err)
		}
		return nil
	})

	// Should stay unhealthy despite reconnection attempts.
	for then := timeutil.Now(); timeutil.Since(then) < 50*clientCtx.heartbeatInterval; {
		err := clientCtx.TestingConnHealth(remoteAddr, serverNodeID)
		if !isUnhealthy(err) {
			t.Fatal(err)
		}
	}
}

func TestOffsetMeasurement(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// Shared cluster ID by all RPC peers (this ensures that the peers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	serverTime := timeutil.Unix(0, 20)
	serverClock := hlc.NewClock(serverTime.UnixNano, time.Nanosecond)
	serverCtx := newTestContext(clusterID, serverClock, stopper)
	const serverNodeID = 1
	serverCtx.NodeID.Set(context.Background(), serverNodeID)
	s := newTestServer(t, serverCtx)
	RegisterHeartbeatServer(s, &HeartbeatService{
		clock:              serverClock,
		remoteClockMonitor: serverCtx.RemoteClocks,
		clusterID:          &serverCtx.ClusterID,
		nodeID:             &serverCtx.NodeID,
		settings:           serverCtx.settings,
	})

	ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
	if err != nil {
		t.Fatal(err)
	}
	remoteAddr := ln.Addr().String()

	// Create a client clock that is behind the server clock.
	clientAdvancing := AdvancingClock{time: timeutil.Unix(0, 10)}
	clientClock := hlc.NewClock(clientAdvancing.UnixNano, time.Nanosecond)
	clientCtx := newTestContext(clusterID, clientClock, stopper)
	// Make the interval shorter to speed up the test.
	clientCtx.heartbeatInterval = 1 * time.Millisecond
	clientCtx.RemoteClocks.offsetTTL = 5 * clientAdvancing.getAdvancementInterval()
	if _, err := clientCtx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(context.Background()); err != nil {
		t.Fatal(err)
	}

	expectedOffset := RemoteOffset{Offset: 10, Uncertainty: 0, MeasuredAt: 10}
	testutils.SucceedsSoon(t, func() error {
		clientCtx.RemoteClocks.mu.Lock()
		defer clientCtx.RemoteClocks.mu.Unlock()

		if o, ok := clientCtx.RemoteClocks.mu.offsets[remoteAddr]; !ok {
			return errors.Errorf("expected offset of %s to be initialized, but it was not", remoteAddr)
		} else if o != expectedOffset {
			return errors.Errorf("expected:\n%v\nactual:\n%v", expectedOffset, o)
		}
		return nil
	})

	// Change the client such that it receives a heartbeat right after the
	// maximum clock reading delay.
	clientAdvancing.setAdvancementInterval(
		maximumPingDurationMult*clientClock.MaxOffset() + 1*time.Nanosecond)

	testutils.SucceedsSoon(t, func() error {
		clientCtx.RemoteClocks.mu.Lock()
		defer clientCtx.RemoteClocks.mu.Unlock()

		if o, ok := clientCtx.RemoteClocks.mu.offsets[remoteAddr]; ok {
			return errors.Errorf("expected offset to have been cleared, but found %s", o)
		}
		return nil
	})
}
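
// Illustrative arithmetic, not part of the original file: why
// TestOffsetMeasurement above expects RemoteOffset{Offset: 10, Uncertainty: 0,
// MeasuredAt: 10}. exampleExpectedOffset is a hypothetical helper name.
func exampleExpectedOffset() RemoteOffset {
	const serverNanos, clientNanos = 20, 10 // the two fixed clock readings
	return RemoteOffset{
		Offset:      serverNanos - clientNanos, // the client trails the server by 10ns
		Uncertainty: 0,                         // the synthetic RPC takes zero time
		MeasuredAt:  clientNanos,               // the client clock when measured
	}
}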

func TestFailedOffsetMeasurement(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// Shared cluster ID by all RPC peers (this ensures that the peers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	// Can't be zero because that'd be an empty offset.
	clock := hlc.NewClock(timeutil.Unix(0, 1).UnixNano, time.Nanosecond)

	serverCtx := newTestContext(clusterID, clock, stopper)
	const serverNodeID = 1
	serverCtx.NodeID.Set(context.Background(), serverNodeID)
	s := newTestServer(t, serverCtx)
	heartbeat := &ManualHeartbeatService{
		clock:              clock,
		remoteClockMonitor: serverCtx.RemoteClocks,
		ready:              make(chan error),
		stopper:            stopper,
		settings:           serverCtx.settings,
		nodeID:             &serverCtx.NodeID,
	}
	RegisterHeartbeatServer(s, heartbeat)

	ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
	if err != nil {
		t.Fatal(err)
	}
	remoteAddr := ln.Addr().String()

	// Create a client that never receives a heartbeat after the first.
	clientCtx := newTestContext(clusterID, clock, stopper)
	// Remove the timeout so that failure arises from exceeding the maximum
	// clock reading delay, not the timeout.
	clientCtx.heartbeatTimeout = 0
	go func() { heartbeat.ready <- nil }() // Allow one heartbeat for initialization.
	if _, err := clientCtx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(context.Background()); err != nil {
		t.Fatal(err)
	}

	testutils.SucceedsSoon(t, func() error {
		clientCtx.RemoteClocks.mu.Lock()
		defer clientCtx.RemoteClocks.mu.Unlock()

		if _, ok := clientCtx.RemoteClocks.mu.offsets[remoteAddr]; !ok {
			return errors.Errorf("expected offset of %s to be initialized, but it was not", remoteAddr)
		}
		return nil
	})

	testutils.SucceedsSoon(t, func() error {
		serverCtx.RemoteClocks.mu.Lock()
		defer serverCtx.RemoteClocks.mu.Unlock()

		if o, ok := serverCtx.RemoteClocks.mu.offsets[remoteAddr]; ok {
			return errors.Errorf("expected offset of %s to not be initialized, but it was: %v", remoteAddr, o)
		}
		return nil
	})
}

type AdvancingClock struct {
	syncutil.Mutex
	time                time.Time
	advancementInterval atomic.Value // time.Duration
}

func (ac *AdvancingClock) setAdvancementInterval(d time.Duration) {
	ac.advancementInterval.Store(d)
}

func (ac *AdvancingClock) getAdvancementInterval() time.Duration {
	v := ac.advancementInterval.Load()
	if v == nil {
		return 0
	}
	return v.(time.Duration)
}

func (ac *AdvancingClock) UnixNano() int64 {
	ac.Lock()
	time := ac.time
	ac.time = time.Add(ac.getAdvancementInterval())
	ac.Unlock()
	return time.UnixNano()
}

func TestRemoteOffsetUnhealthy(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	const maxOffset = 100 * time.Millisecond

	type nodeContext struct {
		offset  time.Duration
		ctx     *Context
		errChan chan error
	}

	start := time.Date(2012, 12, 07, 0, 0, 0, 0, time.UTC)

	nodeCtxs := []nodeContext{
		{offset: 0},
		{offset: 0},
		{offset: 0},
		// The minimum offset that actually triggers node death.
		{offset: maxOffset + 1},
	}

	// Shared cluster ID by all RPC peers (this ensures that the peers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	for i := range nodeCtxs {
		clock := hlc.NewClock(start.Add(nodeCtxs[i].offset).UnixNano, maxOffset)
		nodeCtxs[i].errChan = make(chan error, 1)
		nodeCtxs[i].ctx = newTestContext(clusterID, clock, stopper)
		nodeCtxs[i].ctx.heartbeatInterval = maxOffset
		nodeCtxs[i].ctx.NodeID.Set(context.Background(), roachpb.NodeID(i+1))

		s := newTestServer(t, nodeCtxs[i].ctx)
		RegisterHeartbeatServer(s, &HeartbeatService{
			clock:              clock,
			remoteClockMonitor: nodeCtxs[i].ctx.RemoteClocks,
			clusterID:          &nodeCtxs[i].ctx.ClusterID,
			nodeID:             &nodeCtxs[i].ctx.NodeID,
			settings:           nodeCtxs[i].ctx.settings,
		})
		ln, err := netutil.ListenAndServeGRPC(nodeCtxs[i].ctx.Stopper, s, util.TestAddr)
		if err != nil {
			t.Fatal(err)
		}
		nodeCtxs[i].ctx.Addr = ln.Addr().String()
	}

	// Fully connect the nodes.
	for i, clientNodeContext := range nodeCtxs {
		for j, serverNodeContext := range nodeCtxs {
			if i == j {
				continue
			}
			if _, err := clientNodeContext.ctx.GRPCDialNode(
				serverNodeContext.ctx.Addr,
				serverNodeContext.ctx.NodeID.Get(),
				DefaultClass).Connect(context.Background()); err != nil {
				t.Fatal(err)
			}
		}
	}

	// Wait until all nodes are connected to all other nodes.
	for _, nodeCtx := range nodeCtxs {
		testutils.SucceedsSoon(t, func() error {
			nodeCtx.ctx.RemoteClocks.mu.Lock()
			defer nodeCtx.ctx.RemoteClocks.mu.Unlock()

			if a, e := len(nodeCtx.ctx.RemoteClocks.mu.offsets), len(nodeCtxs)-1; a != e {
				return errors.Errorf("not yet fully connected: have %d of %d connections: %v", a, e, nodeCtx.ctx.RemoteClocks.mu.offsets)
			}
			return nil
		})
	}

	for i, nodeCtx := range nodeCtxs {
		if nodeOffset := nodeCtx.offset; nodeOffset > maxOffset {
			if err := nodeCtx.ctx.RemoteClocks.VerifyClockOffset(nodeCtx.ctx.masterCtx); testutils.IsError(err, errOffsetGreaterThanMaxOffset) {
				t.Logf("max offset: %s - node %d with excessive clock offset of %s returned expected error: %s", maxOffset, i, nodeOffset, err)
			} else {
				t.Errorf("max offset: %s - node %d with excessive clock offset of %s returned unexpected error: %v", maxOffset, i, nodeOffset, err)
			}
		} else {
			if err := nodeCtx.ctx.RemoteClocks.VerifyClockOffset(nodeCtx.ctx.masterCtx); err != nil {
				t.Errorf("max offset: %s - node %d with acceptable clock offset of %s returned unexpected error: %s", maxOffset, i, nodeOffset, err)
			} else {
				t.Logf("max offset: %s - node %d with acceptable clock offset of %s did not return an error, as expected", maxOffset, i, nodeOffset)
			}
		}
	}
}

// This is a smoketest for gRPC Keepalives: rpc.Context asks gRPC to perform
// periodic pings on the transport to check that it's still alive. If the ping
// doesn't get a pong within a timeout, the transport is supposed to be closed -
// that's what we're testing here. Likewise, serverside keepalive ensures that
// if a ping is not seen within a timeout, the transport will also be closed.
//
// In this test we use a TestingHeartbeatStreamService as opposed to a standard
// HeartbeatService. This is important to test scenarios where the
// client->server connection is partitioned but the server->client connection is
// healthy, because a TestingHeartbeatStreamService will continue to respond on
// its response stream even if it doesn't get any new requests.
func TestGRPCKeepaliveFailureFailsInflightRPCs(t *testing.T) {
	defer leaktest.AfterTest(t)()
	t.Skip("Takes too long given https://github.com/grpc/grpc-go/pull/2642")

	sc := log.Scope(t)
	defer sc.Close(t)

	testCases := []grpcKeepaliveTestCase{
		// Keepalive doesn't matter if the network is fine.
		{cKeepalive: false, sKeepalive: false, partitionC2S: false, partitionS2C: false, expClose: false},

		// No keepalive. Never detects network issues.
		{cKeepalive: false, sKeepalive: false, partitionC2S: true, partitionS2C: false, expClose: false},
		{cKeepalive: false, sKeepalive: false, partitionC2S: false, partitionS2C: true, expClose: false},
		{cKeepalive: false, sKeepalive: false, partitionC2S: true, partitionS2C: true, expClose: false},

		// Client-only keepalive. Doesn't detect client->server partition.
		{cKeepalive: true, sKeepalive: false, partitionC2S: true, partitionS2C: false, expClose: false},
		{cKeepalive: true, sKeepalive: false, partitionC2S: false, partitionS2C: true, expClose: true},
		{cKeepalive: true, sKeepalive: false, partitionC2S: true, partitionS2C: true, expClose: true},

		// Server-only keepalive. Only detects server->client partition. The
		// bi-directional partition case (third case) may be surprising.
		// The reason the client doesn't close the connection is that it
		// does not receive the connection closed message sent by the server.
		// This demonstrates why client keepalive is so important.
		{cKeepalive: false, sKeepalive: true, partitionC2S: true, partitionS2C: false, expClose: true},
		{cKeepalive: false, sKeepalive: true, partitionC2S: false, partitionS2C: true, expClose: false},
		{cKeepalive: false, sKeepalive: true, partitionC2S: true, partitionS2C: true, expClose: false},

		// Client and Server keepalive. Detects all partitions!
		{cKeepalive: true, sKeepalive: true, partitionC2S: true, partitionS2C: false, expClose: true},
		{cKeepalive: true, sKeepalive: true, partitionC2S: false, partitionS2C: true, expClose: true},
		{cKeepalive: true, sKeepalive: true, partitionC2S: true, partitionS2C: true, expClose: true},
	}

	// For consistent spacing in test names.
	fmtBool := func(b bool) string {
		s := strconv.FormatBool(b)
		if b {
			s += " "
		}
		return s
	}
	connIcon := func(partition bool) string {
		if partition {
			return "-X->"
		}
		return "--->"
	}

	// Run all the tests.
	var wg sync.WaitGroup
	wg.Add(len(testCases))
	errCh := make(chan error, len(testCases))
	for testNum, c := range testCases {
		kaName := fmt.Sprintf("clientKeepalive=%s,serverKeepalive=%s", fmtBool(c.cKeepalive), fmtBool(c.sKeepalive))
		pName := fmt.Sprintf("client%sserver,server%sclient", connIcon(c.partitionC2S), connIcon(c.partitionS2C))
		testName := fmt.Sprintf("%d/%s/%s", testNum, kaName, pName)
		ctx := logtags.AddTag(context.Background(), testName, nil)

		log.Infof(ctx, "starting sub-test")
		go func(c grpcKeepaliveTestCase) {
			errCh <- errors.Wrapf(grpcRunKeepaliveTestCase(ctx, c), "%+v", c)
			wg.Done()
		}(c)
	}
	log.Infof(context.Background(), "waiting for sub-tests to complete")
	wg.Wait()
	close(errCh)

	for err := range errCh {
		if err != nil {
			t.Errorf("%+v", err)
		}
	}
}

type grpcKeepaliveTestCase struct {
	cKeepalive, sKeepalive     bool
	partitionC2S, partitionS2C bool
	expClose                   bool
}
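
// Illustrative values, not part of the original file: the shape of the
// client-side keepalive settings these test cases toggle. The actual
// parameters live in clientKeepalive and serverTestingKeepalive elsewhere in
// this package; the numbers below are made up for the sketch.
var exampleClientKeepalive = keepalive.ClientParameters{
	Time:                time.Second, // ping the server after this much idle time
	Timeout:             time.Second, // close the conn if the ping gets no ack in time
	PermitWithoutStream: true,        // keep pinging even with no active RPCs
}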

func grpcRunKeepaliveTestCase(testCtx context.Context, c grpcKeepaliveTestCase) error {
	var cKeepalive keepalive.ClientParameters
	if c.cKeepalive {
		cKeepalive = clientKeepalive
	}
	var sKeepalive keepalive.ServerParameters
	if c.sKeepalive {
		sKeepalive = serverTestingKeepalive
	}

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())
	ctx, cancel := stopper.WithCancelOnQuiesce(testCtx)
	defer cancel()

	// Shared cluster ID by all RPC peers (this ensures that the peers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	// Construct server with server-side keepalive.
	log.Infof(ctx, "constructing server")
	clock := hlc.NewClock(timeutil.Unix(0, 20).UnixNano, time.Nanosecond)
	serverCtx := newTestContext(clusterID, clock, stopper)
	const serverNodeID = 1
	serverCtx.NodeID.Set(context.Background(), serverNodeID)
	tlsConfig, err := serverCtx.GetServerTLSConfig()
	if err != nil {
		return err
	}
	s := grpc.NewServer(
		grpc.Creds(credentials.NewTLS(tlsConfig)),
		grpc.StatsHandler(&serverCtx.stats),
		grpc.KeepaliveParams(sKeepalive),
	)

	// Create heartbeat service. This service will continuously
	// read on its input stream and send on its output stream.
	log.Infof(ctx, "creating heartbeat service")
	const msgInterval = 10 * time.Millisecond
	hss := &HeartbeatStreamService{
		HeartbeatService: HeartbeatService{
			clock:              clock,
			remoteClockMonitor: serverCtx.RemoteClocks,
			clusterID:          &serverCtx.ClusterID,
			nodeID:             &serverCtx.NodeID,
			settings:           serverCtx.settings,
		},
		interval: msgInterval,
	}
	RegisterHeartbeatServer(s, hss)
	RegisterTestingHeartbeatStreamServer(s, hss)

	ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
	if err != nil {
		return err
	}
	remoteAddr := ln.Addr().String()

	log.Infof(ctx, "setting up client")
	clientCtx := newTestContext(clusterID, clock, stopper)
	// Disable automatic heartbeats. We'll send them by hand.
	clientCtx.heartbeatInterval = math.MaxInt64

	var firstConn int32 = 1

	// We're going to open RPC transport connections using a dialer that returns
	// PartitionableConns. We'll partition the first opened connection.
	dialerCh := make(chan *testutils.PartitionableConn, 1)
	clientCtx.AddTestingDialOpts(
		grpc.WithContextDialer(
			func(_ context.Context, addr string) (net.Conn, error) {
				if !atomic.CompareAndSwapInt32(&firstConn, 1, 0) {
					// If we allow gRPC to open a 2nd transport connection, then our RPCs
					// might succeed if they're sent on that one. In the spirit of a
					// partition, we'll return errors for the attempt to open a new
					// connection (albeit for a TCP connection the error would come after
					// a socket connect timeout).
					return nil, errors.Errorf("No more connections for you. We're partitioned.")
				}

				conn, err := net.Dial("tcp", addr)
				if err != nil {
					return nil, err
				}
				transportConn := testutils.NewPartitionableConn(conn)
				dialerCh <- transportConn
				return transportConn, nil
			}),
		grpc.WithKeepaliveParams(cKeepalive),
	)
	log.Infof(ctx, "dialing server")
	conn, err := clientCtx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(ctx)
	if err != nil {
		return err
	}
	defer func() { _ = conn.Close() }()

	// Create the heartbeat client.
	log.Infof(ctx, "starting heartbeat client")
	unlockedHeartbeatClient, err := NewTestingHeartbeatStreamClient(conn).PingStream(ctx)
	if err != nil {
		return err
	}
	heartbeatClient := &lockedPingStreamClient{
		TestingHeartbeatStream_PingStreamClient: unlockedHeartbeatClient,
	}

	// Perform an initial request-response round trip.
	log.Infof(ctx, "first ping")
	request := PingRequest{ServerVersion: clientCtx.settings.Version.BinaryVersion()}
	if err := heartbeatClient.Send(&request); err != nil {
		return err
	}
	if _, err := heartbeatClient.Recv(); err != nil {
		return err
	}

	// Launch a goroutine to read from the channel continuously and
	// a goroutine to write to the channel continuously. Both will
	// exit when the channel breaks (either because of a partition
	// or because the stopper stops).
	go func() {
		t := time.NewTicker(msgInterval)
		defer t.Stop()
		for {
			<-t.C
			log.Infof(ctx, "client send")
			if err := heartbeatClient.Send(&request); err != nil {
				return
			}
		}
	}()
	go func() {
		for {
			log.Infof(ctx, "client recv")
			if _, err := heartbeatClient.Recv(); err != nil {
				return
			}
		}
	}()

	// Now partition either client->server, server->client, or both, and attempt
	// to perform an RPC. We expect it to fail once the grpc keepalive fails to
	// get a response from the server.
	transportConn := <-dialerCh
	defer transportConn.Finish()
	if c.partitionC2S {
		log.Infof(ctx, "partition C2S")
		transportConn.PartitionC2S()
	}
	if c.partitionS2C {
		log.Infof(ctx, "partition S2C")
		transportConn.PartitionS2C()
	}

	// We want to start a goroutine that keeps trying to send requests and reports
	// the error from the send call. In cases where there are no keep-alives this
	// request may get blocked if flow control blocks it.
	errChan := make(chan error)
	sendCtx, cancel := context.WithCancel(ctx)
	r := retry.StartWithCtx(sendCtx, retry.Options{
		InitialBackoff: 10 * time.Millisecond,
		MaxBackoff:     500 * time.Millisecond,
	})
	defer cancel()
	go func() {
		for r.Next() {
			err := heartbeatClient.Send(&request)
			isClosed := err != nil && grpcutil.IsClosedConnection(err)
			log.Infof(ctx, "heartbeat Send got error %+v (closed=%v)", err, isClosed)
			select {
			case errChan <- err:
			case <-sendCtx.Done():
				return
			}
			if isClosed {
				return
			}
		}
	}()
	// Check whether the connection eventually closes. We may need to
	// adjust this duration if the test gets flaky.
	// This unfortunately massive amount of time is required due to gRPC's
	// minimum timeout of 10s and the below issue whereby keepalives are sent
	// at half the expected rate.
	// https://github.com/grpc/grpc-go/issues/2638
	const timeoutDur = 21 * time.Second
	timeout := time.After(timeoutDur)
	// sendErr will hold the last error we saw from an attempt to send a
	// heartbeat. Initialize it with a dummy error which will fail the test if
	// it is not overwritten.
	sendErr := fmt.Errorf("not a real error")
	for done := false; !done; {
		select {
		case <-timeout:
			cancel()
			done = true
		case sendErr = <-errChan:
		}
	}
	if c.expClose {
		if sendErr == nil || !grpcutil.IsClosedConnection(sendErr) {
			newErr := fmt.Errorf("expected closed connection, found %v", sendErr)
			log.Infof(ctx, "%+v", newErr)
			return newErr
		}
	} else {
		if sendErr != nil {
			newErr := fmt.Errorf("expected unclosed connection, found %v", sendErr)
			log.Infof(ctx, "%+v", newErr)
			return newErr
		}
	}

	// If the DialOptions we passed to gRPC didn't prevent it from opening new
	// connections, then next RPCs would succeed since gRPC reconnects the
	// transport (and that would succeed here since we've only partitioned one
	// connection). We could further test that the status reported by
	// Context.ConnHealth() for the remote node moves to UNAVAILABLE because of
	// the (application-level) heartbeats performed by rpc.Context, but the
	// behavior of our heartbeats in the face of transport failures is
	// sufficiently tested in TestHeartbeatHealthTransport.
	log.Infof(ctx, "test done")
	return nil
}
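
// Illustrative sketch, not part of the original file: the PartitionableConn
// pattern grpcRunKeepaliveTestCase uses. A real conn is wrapped so that each
// direction can be cut independently. examplePartition is a hypothetical
// helper name.
func examplePartition(conn net.Conn) {
	pc := testutils.NewPartitionableConn(conn)
	pc.PartitionC2S() // cut client->server traffic
	pc.PartitionS2C() // cut server->client traffic
	pc.Finish()       // clean up the wrapper when done
}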

func TestClusterIDMismatch(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	clock := hlc.NewClock(timeutil.Unix(0, 20).UnixNano, time.Nanosecond)
	serverCtx := newTestContext(uuid.MakeV4(), clock, stopper)
	const serverNodeID = 1
	serverCtx.NodeID.Set(context.Background(), serverNodeID)
	s := newTestServer(t, serverCtx)
	RegisterHeartbeatServer(s, &HeartbeatService{
		clock:              clock,
		remoteClockMonitor: serverCtx.RemoteClocks,
		clusterID:          &serverCtx.ClusterID,
		nodeID:             &serverCtx.NodeID,
		settings:           serverCtx.settings,
	})

	ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
	if err != nil {
		t.Fatal(err)
	}
	remoteAddr := ln.Addr().String()

	// Ensure the client ctx gets a new fresh cluster ID so it becomes
	// different from the server's.
	clientCtx := newTestContext(uuid.MakeV4(), clock, stopper)

	var wg sync.WaitGroup
	for i := 0; i < 10; i++ {
		wg.Add(1)
		go func() {
			_, err := clientCtx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(context.Background())
			expected := "initial connection heartbeat failed.*doesn't match server cluster ID"
			if !testutils.IsError(err, expected) {
				t.Errorf("expected %s error, got %v", expected, err)
			}
			wg.Done()
		}()
	}
	wg.Wait()
}

func TestClusterNameMismatch(t *testing.T) {
	defer leaktest.AfterTest(t)()

	clock := hlc.NewClock(timeutil.Unix(0, 20).UnixNano, time.Nanosecond)

	testData := []struct {
		serverName             string
		serverDisablePeerCheck bool
		clientName             string
		clientDisablePeerCheck bool
		expectedErr            string
	}{
		{"", false, "", false, ``},
		// The name check is enabled if both the client and server want it.
		{"a", false, "", false, `peer node expects cluster name "a", use --cluster-name to configure`},
		{"", false, "a", false, `peer node does not have a cluster name configured, cannot use --cluster-name`},
		{"a", false, "b", false, `local cluster name "b" does not match peer cluster name "a"`},
		// It's disabled if either doesn't want it.
		// However in any case if the name is not empty it has to match.
1293 {"a", true, "", false, ``}, 1294 {"", true, "a", false, ``}, 1295 {"a", true, "b", false, ``}, 1296 {"a", false, "", true, ``}, 1297 {"", false, "a", true, ``}, 1298 {"a", false, "b", true, ``}, 1299 {"a", true, "", true, ``}, 1300 {"", true, "a", true, ``}, 1301 {"a", true, "b", true, ``}, 1302 } 1303 1304 for i, c := range testData { 1305 t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { 1306 stopper := stop.NewStopper() 1307 defer stopper.Stop(context.Background()) 1308 1309 serverCtx := newTestContext(uuid.MakeV4(), clock, stopper) 1310 serverCtx.clusterName = c.serverName 1311 serverCtx.disableClusterNameVerification = c.serverDisablePeerCheck 1312 1313 s := newTestServer(t, serverCtx) 1314 RegisterHeartbeatServer(s, &HeartbeatService{ 1315 clock: clock, 1316 remoteClockMonitor: serverCtx.RemoteClocks, 1317 clusterID: &serverCtx.ClusterID, 1318 nodeID: &serverCtx.NodeID, 1319 settings: serverCtx.settings, 1320 clusterName: serverCtx.clusterName, 1321 disableClusterNameVerification: serverCtx.disableClusterNameVerification, 1322 }) 1323 1324 ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr) 1325 if err != nil { 1326 t.Fatal(err) 1327 } 1328 remoteAddr := ln.Addr().String() 1329 1330 clientCtx := newTestContext(serverCtx.ClusterID.Get(), clock, stopper) 1331 clientCtx.clusterName = c.clientName 1332 clientCtx.disableClusterNameVerification = c.clientDisablePeerCheck 1333 1334 var wg sync.WaitGroup 1335 for i := 0; i < 10; i++ { 1336 wg.Add(1) 1337 go func() { 1338 _, err := clientCtx.GRPCUnvalidatedDial(remoteAddr).Connect(context.Background()) 1339 if !testutils.IsError(err, c.expectedErr) { 1340 t.Errorf("expected %s error, got %v", c.expectedErr, err) 1341 } 1342 wg.Done() 1343 }() 1344 } 1345 wg.Wait() 1346 }) 1347 } 1348 } 1349 1350 func TestNodeIDMismatch(t *testing.T) { 1351 defer leaktest.AfterTest(t)() 1352 1353 stopper := stop.NewStopper() 1354 defer stopper.Stop(context.Background()) 1355 1356 // Shared cluster ID by all RPC peers (this ensures that the peers 1357 // don't talk to servers from unrelated tests by accident). 

func TestNodeIDMismatch(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// Shared cluster ID by all RPC peers (this ensures that the peers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	clock := hlc.NewClock(timeutil.Unix(0, 20).UnixNano, time.Nanosecond)
	serverCtx := newTestContext(clusterID, clock, stopper)
	serverCtx.NodeID.Set(context.Background(), 1)
	s := newTestServer(t, serverCtx)
	RegisterHeartbeatServer(s, &HeartbeatService{
		clock:              clock,
		remoteClockMonitor: serverCtx.RemoteClocks,
		clusterID:          &serverCtx.ClusterID,
		nodeID:             &serverCtx.NodeID,
		settings:           serverCtx.settings,
	})

	ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
	if err != nil {
		t.Fatal(err)
	}
	remoteAddr := ln.Addr().String()

	clientCtx := newTestContext(clusterID, clock, stopper)

	var wg sync.WaitGroup
	for i := 0; i < 10; i++ {
		wg.Add(1)
		go func() {
			_, err := clientCtx.GRPCDialNode(remoteAddr, 2, DefaultClass).Connect(context.Background())
			expected := "initial connection heartbeat failed.*doesn't match server node ID"
			if !testutils.IsError(err, expected) {
				t.Errorf("expected %s error, got %v", expected, err)
			}
			wg.Done()
		}()
	}
	wg.Wait()
}

func setVersion(c *Context, v roachpb.Version) error {
	st := cluster.MakeTestingClusterSettingsWithVersions(v, v, true /* initializeVersion */)
	c.settings = st
	return nil
}

// Test that GRPCDial fails if there is a version incompatibility in either
// direction (client -> server or server -> client).
func TestVersionCheckBidirectional(t *testing.T) {
	defer leaktest.AfterTest(t)()

	v1 := roachpb.Version{Major: 1}
	v2 := clusterversion.TestingBinaryVersion

	testData := []struct {
		name          string
		serverVersion roachpb.Version
		clientVersion roachpb.Version
		expectError   bool
	}{
		{"serverVersion == clientVersion", v1, v1, false},
		{"serverVersion < clientVersion", v1, v2, true},
		{"serverVersion > clientVersion", v2, v1, true},
	}

	// Shared cluster ID by all RPC peers (this ensures that the peers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	for _, td := range testData {
		t.Run(td.name, func(t *testing.T) {
			stopper := stop.NewStopper()
			defer stopper.Stop(context.Background())

			clock := hlc.NewClock(timeutil.Unix(0, 20).UnixNano, time.Nanosecond)
			serverCtx := newTestContext(clusterID, clock, stopper)
			const serverNodeID = 1
			serverCtx.NodeID.Set(context.Background(), serverNodeID)
			if err := setVersion(serverCtx, td.serverVersion); err != nil {
				t.Fatal(err)
			}
			s := newTestServer(t, serverCtx)
			RegisterHeartbeatServer(s, &HeartbeatService{
				clock:              clock,
				remoteClockMonitor: serverCtx.RemoteClocks,
				clusterID:          &serverCtx.ClusterID,
				nodeID:             &serverCtx.NodeID,
				settings:           serverCtx.settings,
			})

			ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
			if err != nil {
				t.Fatal(err)
			}
			remoteAddr := ln.Addr().String()

			clientCtx := newTestContext(clusterID, clock, stopper)
			if err := setVersion(clientCtx, td.clientVersion); err != nil {
				t.Fatal(err)
			}

			_, err = clientCtx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(context.Background())

			if td.expectError {
				expected := "initial connection heartbeat failed.*cluster requires at least version"
				if !testutils.IsError(err, expected) {
					t.Errorf("expected %s error, got %v", expected, err)
				}
			} else if err != nil {
				t.Errorf("unexpected error: %s", err)
			}
		})
	}
}

// TestGRPCDialClass ensures that distinct connections are constructed when
// dialing the same target with different classes.
func TestGRPCDialClass(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	clock := hlc.NewClock(timeutil.Unix(0, 20).UnixNano, time.Nanosecond)
	serverCtx := newTestContext(uuid.MakeV4(), clock, stopper)
	const serverNodeID = 1
	serverCtx.NodeID.Set(context.Background(), serverNodeID)
	s := newTestServer(t, serverCtx)
	RegisterHeartbeatServer(s, &HeartbeatService{
		clock:              clock,
		remoteClockMonitor: serverCtx.RemoteClocks,
		clusterID:          &serverCtx.ClusterID,
		nodeID:             &serverCtx.NodeID,
		settings:           serverCtx.settings,
	})

	ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
	require.Nil(t, err)
	remoteAddr := ln.Addr().String()
	clientCtx := newTestContext(serverCtx.ClusterID.Get(), clock, stopper)

	def1 := clientCtx.GRPCDialNode(remoteAddr, 1, DefaultClass)
	sys1 := clientCtx.GRPCDialNode(remoteAddr, 1, SystemClass)
	require.False(t, sys1 == def1,
		"expected connections dialed with different classes to the same target to differ")
	defConn1, err := def1.Connect(context.Background())
	require.Nil(t, err, "expected successful connection")
	sysConn1, err := sys1.Connect(context.Background())
	require.Nil(t, err, "expected successful connection")
	require.False(t, sysConn1 == defConn1, "expected connections dialed with "+
		"different classes to the same target to have separate underlying gRPC connections")
	def2 := clientCtx.GRPCDialNode(remoteAddr, 1, DefaultClass)
	require.True(t, def1 == def2, "expected connections dialed with the same "+
		"class to the same target to be the same")
	sys2 := clientCtx.GRPCDialNode(remoteAddr, 1, SystemClass)
	require.True(t, sys1 == sys2, "expected connections dialed with the same "+
		"class to the same target to be the same")
	for _, c := range []*Connection{def2, sys2} {
		require.Nil(t, c.Health(), "expected connections to be healthy")
	}
}
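
// Illustrative sketch, not part of the original file: connections are keyed
// by (target, nodeID, class), so as TestGRPCDialClass shows, the same target
// dialed with different classes yields distinct *Connection values, while
// repeat dials with the same class return the cached one.
// exampleDialBothClasses is a hypothetical helper name.
func exampleDialBothClasses(rpcCtx *Context, addr string, nodeID roachpb.NodeID) (def, sys *Connection) {
	def = rpcCtx.GRPCDialNode(addr, nodeID, DefaultClass)
	sys = rpcCtx.GRPCDialNode(addr, nodeID, SystemClass)
	return def, sys
}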

// TestTestingKnobs ensures that the testing knobs are injected in the proper
// places.
func TestTestingKnobs(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())
	clusterID := uuid.MakeV4()

	clock := hlc.NewClock(timeutil.Unix(0, 20).UnixNano, time.Nanosecond)
	serverCtx := newTestContext(clusterID, clock, stopper)
	const serverNodeID = 1
	serverCtx.NodeID.Set(context.Background(), serverNodeID)
	// Register an UnknownServiceHandler that expects a BatchRequest and sends
	// a BatchResponse. It will be used both as a unary and stream handler below.
	s := newTestServer(t, serverCtx, grpc.UnknownServiceHandler(
		func(srv interface{}, stream grpc.ServerStream) error {
			var ba roachpb.BatchRequest
			if err := stream.RecvMsg(&ba); err != nil {
				return err
			}
			return stream.SendMsg(&roachpb.BatchResponse{})
		},
	))
	RegisterHeartbeatServer(s, &HeartbeatService{
		clock:              clock,
		remoteClockMonitor: serverCtx.RemoteClocks,
		clusterID:          &serverCtx.ClusterID,
		nodeID:             &serverCtx.NodeID,
		settings:           serverCtx.settings,
	})

	// The test will inject interceptors for both stream and unary calls and then
	// will ensure that these interceptors are properly called by keeping track
	// of all calls.

	// Use these structs to keep track of the number of times the interceptors
	// are called in the seen map below.
	type streamCall struct {
		target string
		class  ConnectionClass
		method string
	}
	type unaryCall struct {
		target string
		class  ConnectionClass
		method string
	}
	seen := make(map[interface{}]int)
	var seenMu syncutil.Mutex
	recordCall := func(call interface{}) {
		seenMu.Lock()
		defer seenMu.Unlock()
		seen[call]++
	}
	clientCtx := newTestContextWithKnobs(clock, stopper, ContextTestingKnobs{
		ClusterID: &clusterID,
		StreamClientInterceptor: func(
			target string, class ConnectionClass,
		) grpc.StreamClientInterceptor {
			return func(
				ctx context.Context, desc *grpc.StreamDesc, cc *grpc.ClientConn,
				method string, streamer grpc.Streamer, opts ...grpc.CallOption,
			) (grpc.ClientStream, error) {
				cs, err := streamer(ctx, desc, cc, method, opts...)
				if err != nil {
					return nil, err
				}
				recordCall(streamCall{
					target: target,
					class:  class,
					method: method,
				})
				return cs, nil
			}
		},
		UnaryClientInterceptor: func(
			target string, class ConnectionClass,
		) grpc.UnaryClientInterceptor {
			return func(
				ctx context.Context, method string, req, reply interface{},
				cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption,
			) error {
				recordCall(unaryCall{
					target: target,
					class:  class,
					method: method,
				})
				return invoker(ctx, method, req, reply, cc, opts...)
			}
		},
	})

	ln, err := netutil.ListenAndServeGRPC(serverCtx.Stopper, s, util.TestAddr)
	require.Nil(t, err)
	remoteAddr := ln.Addr().String()
	sysConn, err := clientCtx.GRPCDialNode(remoteAddr, 1, SystemClass).Connect(context.Background())
	require.Nil(t, err)
	defConn, err := clientCtx.GRPCDialNode(remoteAddr, 1, DefaultClass).Connect(context.Background())
	require.Nil(t, err)
	const unaryMethod = "/cockroach.rpc.Testing/Foo"
	const streamMethod = "/cockroach.rpc.Testing/Bar"
	const numSysUnary = 3
	for i := 0; i < numSysUnary; i++ {
		ba := roachpb.BatchRequest{}
		br := roachpb.BatchResponse{}
		err := sysConn.Invoke(context.Background(), unaryMethod, &ba, &br)
		require.Nil(t, err)
	}
	const numDefStream = 4
	for i := 0; i < numDefStream; i++ {
		desc := grpc.StreamDesc{
			StreamName:    "bar",
			ClientStreams: true,
		}
		cs, err := defConn.NewStream(context.Background(), &desc, streamMethod)
		require.Nil(t, err)
		require.Nil(t, cs.SendMsg(&roachpb.BatchRequest{}))
		var br roachpb.BatchResponse
		require.Nil(t, cs.RecvMsg(&br))
		require.Nil(t, cs.CloseSend())
	}

	exp := map[interface{}]int{
		unaryCall{
			target: remoteAddr,
			class:  SystemClass,
			method: unaryMethod,
		}: numSysUnary,
		streamCall{
			target: remoteAddr,
			class:  DefaultClass,
			method: streamMethod,
		}: numDefStream,
	}
	seenMu.Lock()
	defer seenMu.Unlock()
	for call, num := range exp {
		require.Equal(t, num, seen[call])
	}
}

// This test ensures that clients cannot be left waiting on
// `Connection.Connect()` calls in the rare case where a heartbeat loop
// exits before attempting to send its first heartbeat. See #41521.
func TestRunHeartbeatSetsHeartbeatStateWhenExitingBeforeFirstHeartbeat(t *testing.T) {
	defer leaktest.AfterTest(t)()
	ctx := context.Background()
	stopper := stop.NewStopper()
	defer stopper.Stop(ctx)
	clusterID := uuid.MakeV4()

	clock := hlc.NewClock(timeutil.Unix(0, 20).UnixNano, time.Nanosecond)

	// This test reaches into low-level implementation details to recreate
	// the hazardous scenario seen in #41521. In that issue we saw a runHeartbeat()
	// loop exit prior to sending the first heartbeat. To recreate that scenario,
	// which seems difficult to create now that gRPC backs off redialing, we
	// launch the runHeartbeat() loop with an already closed redial chan.
	// In order to hit predictable errors we run an actual server on the other
	// side of the Connection passed to runHeartbeat().
	//
	// At least half of the time this test will hit the case where the select
	// in runHeartbeat detects the closed redial chan and returns. The
	// correctness criterion we're trying to verify is that the Connect call
	// below does not block.

	rpcCtx := newTestContext(clusterID, clock, stopper)

	const serverNodeID = 1
	serverCtx := newTestContext(clusterID, clock, stopper)
	serverCtx.NodeID.Set(ctx, serverNodeID)

	s := NewServer(serverCtx)
	ln, err := netutil.ListenAndServeGRPC(stopper, s, util.TestAddr)
	if err != nil {
		t.Fatal(err)
	}
	remoteAddr := ln.Addr().String()

	c := newConnectionToNodeID(stopper, 1)

	redialChan := make(chan struct{})
	close(redialChan)

	c.grpcConn, _, c.dialErr = rpcCtx.grpcDialRaw(remoteAddr, serverNodeID, DefaultClass)
	require.NoError(t, c.dialErr)
	// It is possible that the redial chan being closed is not seen on the first
	// pass through the loop.
	err = rpcCtx.runHeartbeat(c, "", redialChan)
	require.EqualError(t, err, grpcutil.ErrCannotReuseClientConn.Error())
	// Even when the runHeartbeat returns, we could have heartbeated successfully.
	// If we did not, then we expect the `not yet heartbeated` error.
	if _, err = c.Connect(ctx); err != nil {
		require.Regexp(t, "not yet heartbeated", err)
	}
	require.NoError(t, c.grpcConn.Close())
}

func BenchmarkGRPCDial(b *testing.B) {
	if testing.Short() {
		b.Skip("TODO: fix benchmark")
	}
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	clock := hlc.NewClock(hlc.UnixNano, 250*time.Millisecond)
	ctx := newTestContext(uuid.MakeV4(), clock, stopper)
	const serverNodeID = 1
	ctx.NodeID.Set(context.Background(), serverNodeID)

	s := newTestServer(b, ctx)
	ln, err := netutil.ListenAndServeGRPC(ctx.Stopper, s, util.TestAddr)
	if err != nil {
		b.Fatal(err)
	}
	remoteAddr := ln.Addr().String()

	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			_, err := ctx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(context.Background())
			if err != nil {
				b.Fatal(err)
			}
		}
	})
}