github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/gossip/client_test.go

// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package gossip

import (
	"context"
	"fmt"
	"math"
	"net"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
	"github.com/cockroachdb/cockroach/pkg/gossip/resolver"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/rpc"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
	"github.com/cockroachdb/cockroach/pkg/util/netutil"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
	"github.com/gogo/protobuf/proto"
	"google.golang.org/grpc"
)

// startGossip creates and starts a gossip instance.
func startGossip(
	clusterID uuid.UUID,
	nodeID roachpb.NodeID,
	stopper *stop.Stopper,
	t *testing.T,
	registry *metric.Registry,
) *Gossip {
	return startGossipAtAddr(clusterID, nodeID, util.IsolatedTestAddr, stopper, t, registry)
}

func startGossipAtAddr(
	clusterID uuid.UUID,
	nodeID roachpb.NodeID,
	addr net.Addr,
	stopper *stop.Stopper,
	t *testing.T,
	registry *metric.Registry,
) *Gossip {
	clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond)
	rpcContext := rpc.NewInsecureTestingContextWithClusterID(clock, stopper, clusterID)
	rpcContext.NodeID.Set(context.Background(), nodeID)

	server := rpc.NewServer(rpcContext)
	g := NewTest(nodeID, rpcContext, server, stopper, registry, zonepb.DefaultZoneConfigRef())
	ln, err := netutil.ListenAndServeGRPC(stopper, server, addr)
	if err != nil {
		t.Fatal(err)
	}
	addr = ln.Addr()
	if err := g.SetNodeDescriptor(&roachpb.NodeDescriptor{
		NodeID:  nodeID,
		Address: util.MakeUnresolvedAddr(addr.Network(), addr.String()),
	}); err != nil {
		t.Fatal(err)
	}
	g.start(addr)
	// Give the gossip instance a moment to start serving before returning.
	time.Sleep(time.Millisecond)
	return g
}

type fakeGossipServer struct {
	nodeAddr   util.UnresolvedAddr
	nodeIDChan chan roachpb.NodeID
}

func newFakeGossipServer(grpcServer *grpc.Server, stopper *stop.Stopper) *fakeGossipServer {
	s := &fakeGossipServer{
		nodeIDChan: make(chan roachpb.NodeID, 1),
	}
	RegisterGossipServer(grpcServer, s)
	return s
}

func (s *fakeGossipServer) Gossip(stream Gossip_GossipServer) error {
	for {
		args, err := stream.Recv()
		if err != nil {
			return err
		}

		// Non-blocking send: record the NodeID if the buffered channel has
		// room, and drop it otherwise so the stream never stalls.
		select {
		case s.nodeIDChan <- args.NodeID:
		default:
		}

		if err := stream.Send(&Response{
			// Just don't conflict with other nodes.
			NodeID: math.MaxInt32,
		}); err != nil {
			return err
		}
	}
}
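
// The tests below repeatedly stand up a "local" and a "remote" gossip node
// that share a cluster ID. startGossipPair is a hypothetical helper (a
// sketch added for illustration, not used by the original suite) that
// bundles that recurring setup.
func startGossipPair(t *testing.T, stopper *stop.Stopper) (local, remote *Gossip) {
	// A shared cluster ID keeps these nodes from talking to servers that
	// belong to unrelated tests.
	clusterID := uuid.MakeV4()
	local = startGossip(clusterID, 1, stopper, t, metric.NewRegistry())
	remote = startGossip(clusterID, 2, stopper, t, metric.NewRegistry())
	return local, remote
}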

// startFakeServerGossips creates a local gossip instance and a remote
// fake gossip server. The remote server runs a fake gossip service whose
// only purpose is to inspect the messages sent by the client.
func startFakeServerGossips(
	t *testing.T, clusterID uuid.UUID, localNodeID roachpb.NodeID, stopper *stop.Stopper,
) (*Gossip, *fakeGossipServer) {
	clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond)
	lRPCContext := rpc.NewInsecureTestingContextWithClusterID(clock, stopper, clusterID)

	lserver := rpc.NewServer(lRPCContext)
	local := NewTest(localNodeID, lRPCContext, lserver, stopper, metric.NewRegistry(), zonepb.DefaultZoneConfigRef())
	lln, err := netutil.ListenAndServeGRPC(stopper, lserver, util.IsolatedTestAddr)
	if err != nil {
		t.Fatal(err)
	}
	local.start(lln.Addr())

	rRPCContext := rpc.NewInsecureTestingContextWithClusterID(clock, stopper, clusterID)
	rserver := rpc.NewServer(rRPCContext)
	remote := newFakeGossipServer(rserver, stopper)
	rln, err := netutil.ListenAndServeGRPC(stopper, rserver, util.IsolatedTestAddr)
	if err != nil {
		t.Fatal(err)
	}
	addr := rln.Addr()
	remote.nodeAddr = util.MakeUnresolvedAddr(addr.Network(), addr.String())

	return local, remote
}

// gossipSucceedsSoon loops, restarting any disconnected gossip clients,
// until the condition f returns nil or the test times out.
func gossipSucceedsSoon(
	t *testing.T,
	stopper *stop.Stopper,
	clusterID uuid.UUID,
	disconnected chan *client,
	gossip map[*client]*Gossip,
	f func() error,
) {
	clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond)
	// Use an insecure context since we don't need a valid cert.
	rpcContext := rpc.NewInsecureTestingContextWithClusterID(clock, stopper, clusterID)

	for c := range gossip {
		disconnected <- c
	}

	testutils.SucceedsSoon(t, func() error {
		select {
		case client := <-disconnected:
			// If the client wasn't able to connect, restart it.
			g := gossip[client]
			g.mu.Lock()
			client.startLocked(g, disconnected, rpcContext, stopper, rpcContext.NewBreaker(""))
			g.mu.Unlock()
		default:
		}

		return f()
	})
}

// TestClientGossip verifies a client can gossip a delta to the server.
func TestClientGossip(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()

	// Cluster ID shared by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	local := startGossip(clusterID, 1, stopper, t, metric.NewRegistry())
	remote := startGossip(clusterID, 2, stopper, t, metric.NewRegistry())
	disconnected := make(chan *client, 1)
	c := newClient(log.AmbientContext{Tracer: tracing.NewTracer()}, remote.GetNodeAddr(), makeMetrics())

	defer func() {
		stopper.Stop(context.Background())
		if c != <-disconnected {
			t.Errorf("expected client disconnect after remote close")
		}
	}()

	if err := local.AddInfo("local-key", nil, time.Hour); err != nil {
		t.Fatal(err)
	}
	if err := remote.AddInfo("remote-key", nil, time.Hour); err != nil {
		t.Fatal(err)
	}

	gossipSucceedsSoon(t, stopper, clusterID, disconnected, map[*client]*Gossip{
		c: local,
	}, func() error {
		if _, err := remote.GetInfo("local-key"); err != nil {
			return err
		}
		if _, err := local.GetInfo("remote-key"); err != nil {
			return err
		}
		return nil
	})
}
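
// For reference, the AddInfo/GetInfo round trip above can also be exercised
// on a single node. A minimal sketch (hypothetical helper, not part of the
// original suite, assuming only the AddInfo/GetInfo calls used above):
func addAndReadBack(t *testing.T, g *Gossip) {
	// Gossip a key with a one-hour TTL, then read it back locally.
	if err := g.AddInfo("sketch-key", []byte("value"), time.Hour); err != nil {
		t.Fatal(err)
	}
	if _, err := g.GetInfo("sketch-key"); err != nil {
		t.Fatal(err)
	}
}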

// TestClientGossipMetrics verifies that gossip stats are generated.
func TestClientGossipMetrics(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// Cluster ID shared by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	local := startGossip(clusterID, 1, stopper, t, metric.NewRegistry())
	remote := startGossip(clusterID, 2, stopper, t, metric.NewRegistry())

	if err := local.AddInfo("local-key", nil, time.Hour); err != nil {
		t.Fatal(err)
	}
	if err := remote.AddInfo("remote-key", nil, time.Hour); err != nil {
		t.Fatal(err)
	}

	gossipSucceedsSoon(
		t, stopper, clusterID, make(chan *client, 2),
		map[*client]*Gossip{
			newClient(log.AmbientContext{Tracer: tracing.NewTracer()}, local.GetNodeAddr(), remote.nodeMetrics): remote,
		},
		func() error {
			// Infos/Bytes Sent/Received should not be zero.
			for i, s := range []*server{local.server, remote.server} {
				for _, counter := range []*metric.Counter{
					s.nodeMetrics.InfosSent,
					s.nodeMetrics.InfosReceived,
					s.nodeMetrics.BytesSent,
					s.nodeMetrics.BytesReceived,
				} {
					if count := counter.Count(); count <= 0 {
						return errors.Errorf("%d: expected metrics counter %q > 0; = %d", i, counter.GetName(), count)
					}
				}
			}

			// Since there are two gossip nodes, there should be exactly one
			// incoming or outgoing connection due to gossip's connection
			// de-duplication.
			for i, g := range []*Gossip{local, remote} {
				g.mu.Lock()
				defer g.mu.Unlock()

				count := int64(0)
				for _, gauge := range []*metric.Gauge{g.mu.incoming.gauge, g.outgoing.gauge} {
					if gauge == nil {
						return errors.Errorf("%d: missing gauge", i)
					}
					count += gauge.Value()
				}
				const expected = 1
				if count != expected {
					return errors.Errorf("%d: expected metrics incoming + outgoing connection count == %d; = %d", i, expected, count)
				}
			}
			return nil
		})
}

// TestClientNodeID verifies that a client sends gossip requests with the
// correct NodeID.
func TestClientNodeID(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	disconnected := make(chan *client, 1)

	// Cluster ID shared by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	localNodeID := roachpb.NodeID(1)
	local, remote := startFakeServerGossips(t, clusterID, localNodeID, stopper)

	clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond)
	// Use an insecure context. We're talking to a TCP socket that is not
	// listed in the certs.
	rpcContext := rpc.NewInsecureTestingContextWithClusterID(clock, stopper, clusterID)

	c := newClient(log.AmbientContext{Tracer: tracing.NewTracer()}, &remote.nodeAddr, makeMetrics())
	disconnected <- c

	defer func() {
		stopper.Stop(context.Background())
		if c != <-disconnected {
			t.Errorf("expected client disconnect after remote close")
		}
	}()

	// A gossip client may fail to start if the gRPC connection times out,
	// which can happen under load (such as in CircleCI or when using
	// `make stress`). So we loop, creating clients until success or until
	// the test times out.
	for {
		// Wait for c.gossip to start.
		select {
		case receivedNodeID := <-remote.nodeIDChan:
			if receivedNodeID != localNodeID {
				t.Fatalf("client should send NodeID %v, got %v", localNodeID, receivedNodeID)
			}
			return
		case <-disconnected:
			// The client hasn't started or failed to start; loop and try again.
			local.mu.Lock()
			c.startLocked(local, disconnected, rpcContext, stopper, rpcContext.NewBreaker(""))
			local.mu.Unlock()
		}
	}
}

// verifyServerMaps reports whether g's server-side nodeMap contains exactly
// expCount entries.
func verifyServerMaps(g *Gossip, expCount int) bool {
	g.mu.RLock()
	defer g.mu.RUnlock()
	return len(g.mu.nodeMap) == expCount
}

// TestClientDisconnectLoopback verifies that the gossip server will drop
// an outgoing client connection to its own address (a loopback connection).
func TestClientDisconnectLoopback(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())
	local := startGossip(uuid.Nil, 1, stopper, t, metric.NewRegistry())
	local.mu.Lock()
	lAddr := local.mu.is.NodeAddr
	local.startClientLocked(&lAddr)
	local.mu.Unlock()
	local.manage()
	testutils.SucceedsSoon(t, func() error {
		ok := local.findClient(func(c *client) bool { return c.addr.String() == lAddr.String() }) != nil
		if !ok && verifyServerMaps(local, 0) {
			return nil
		}
		return errors.New("local client still connected to itself")
	})
}
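
// hasClientTo is a hypothetical sketch (not used by the original tests) of
// the lookup that the connection tests here repeat: it reports whether g
// currently has an outgoing client connected to addr.
func hasClientTo(g *Gossip, addr util.UnresolvedAddr) bool {
	return g.findClient(func(c *client) bool { return c.addr.String() == addr.String() }) != nil
}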

// TestClientDisconnectRedundant verifies that the gossip server
// will drop an outgoing client connection that is already an
// inbound client connection of another node.
func TestClientDisconnectRedundant(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// Cluster ID shared by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	local := startGossip(clusterID, 1, stopper, t, metric.NewRegistry())
	remote := startGossip(clusterID, 2, stopper, t, metric.NewRegistry())
	local.mu.Lock()
	remote.mu.Lock()
	rAddr := remote.mu.is.NodeAddr
	lAddr := local.mu.is.NodeAddr
	local.mu.Unlock()
	remote.mu.Unlock()
	local.manage()
	remote.manage()

	// Gossip a key on local and wait for it to show up on remote. This
	// guarantees we have an active local-to-remote client connection.
	if err := local.AddInfo("local-key", nil, 0); err != nil {
		t.Fatal(err)
	}
	testutils.SucceedsSoon(t, func() error {
		c := local.findClient(func(c *client) bool { return c.addr.String() == rAddr.String() })
		if c == nil {
			// Restart the client connection in the loop. It might have failed
			// due to a heartbeat timeout.
			local.mu.Lock()
			local.startClientLocked(&rAddr)
			local.mu.Unlock()
			return fmt.Errorf("unable to find local to remote client")
		}
		_, err := remote.GetInfo("local-key")
		return err
	})

	// Start a remote-to-local client. This client will get removed as
	// redundant, as there is already a connection between the two nodes.
	remote.mu.Lock()
	remote.startClientLocked(&lAddr)
	remote.mu.Unlock()

	testutils.SucceedsSoon(t, func() error {
		// Check which of the clients is connected to the other.
		ok1 := local.findClient(func(c *client) bool { return c.addr.String() == rAddr.String() }) != nil
		ok2 := remote.findClient(func(c *client) bool { return c.addr.String() == lAddr.String() }) != nil
		if ok1 && !ok2 && verifyServerMaps(local, 0) && verifyServerMaps(remote, 1) {
			return nil
		}
		return fmt.Errorf("remote to local client not yet closed as redundant: local=%t remote=%t",
			ok1, ok2)
	})
}

// TestClientDisallowMultipleConns verifies that the server disallows
// multiple connections from the same client node ID.
func TestClientDisallowMultipleConns(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// Cluster ID shared by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	local := startGossip(clusterID, 1, stopper, t, metric.NewRegistry())
	remote := startGossip(clusterID, 2, stopper, t, metric.NewRegistry())

	local.mu.Lock()
	remote.mu.Lock()
	rAddr := remote.mu.is.NodeAddr
	// Start two clients from local to remote. The RPC client cache is
	// disabled via the context, so we'll start two different outgoing
	// connections.
	local.startClientLocked(&rAddr)
	local.startClientLocked(&rAddr)
	local.mu.Unlock()
	remote.mu.Unlock()
	local.manage()
	remote.manage()
	testutils.SucceedsSoon(t, func() error {
		// Verify that the remote server has only a single incoming
		// connection and the local server has only a single outgoing
		// connection.
		local.mu.Lock()
		remote.mu.Lock()
		outgoing := local.outgoing.len()
		incoming := remote.mu.incoming.len()
		local.mu.Unlock()
		remote.mu.Unlock()
		if outgoing == 1 && incoming == 1 && verifyServerMaps(local, 0) && verifyServerMaps(remote, 1) {
			return nil
		}
		return errors.Errorf("incorrect number of incoming (%d) or outgoing (%d) connections", incoming, outgoing)
	})
}

// TestClientRegisterWithInitNodeID verifies that multiple gossip nodes can
// register by connecting to the address of the first node.
func TestClientRegisterWithInitNodeID(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())
	clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond)

	// Cluster ID shared by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	// Create three gossip nodes, all connecting to the first node's address.
	var g []*Gossip
	var gossipAddr string
	for i := 0; i < 3; i++ {
		nodeID := roachpb.NodeID(i + 1)

		rpcContext := rpc.NewInsecureTestingContextWithClusterID(clock, stopper, clusterID)
		server := rpc.NewServer(rpcContext)
		// The node ID must be non-zero.
		gnode := NewTest(
			nodeID, rpcContext, server, stopper, metric.NewRegistry(), zonepb.DefaultZoneConfigRef(),
		)
		g = append(g, gnode)

		ln, err := netutil.ListenAndServeGRPC(stopper, server, util.IsolatedTestAddr)
		if err != nil {
			t.Fatal(err)
		}

		// Connect to the first gossip node.
		if gossipAddr == "" {
			gossipAddr = ln.Addr().String()
		}

		var resolvers []resolver.Resolver
		resolver, err := resolver.NewResolver(gossipAddr)
		if err != nil {
			t.Fatal(err)
		}
		resolvers = append(resolvers, resolver)
		gnode.Start(ln.Addr(), resolvers)
	}

	testutils.SucceedsSoon(t, func() error {
		// If all three gossip nodes have registered successfully, the first
		// node's nodeMap should contain the two other client addresses.
		g[0].mu.Lock()
		defer g[0].mu.Unlock()
		if a, e := len(g[0].mu.nodeMap), 2; a != e {
			return errors.Errorf("expected %v to contain %d nodes, got %d", g[0].mu.nodeMap, e, a)
		}
		return nil
	})
}

// testResolver is a resolver that fails its first numFails GetAddress calls
// before succeeding, used to exercise bootstrap retry behavior.
type testResolver struct {
	addr         string
	numTries     int
	numFails     int
	numSuccesses int
}

func (tr *testResolver) Type() string { return "tcp" }

func (tr *testResolver) Addr() string { return tr.addr }

func (tr *testResolver) GetAddress() (net.Addr, error) {
	defer func() { tr.numTries++ }()
	if tr.numTries < tr.numFails {
		return nil, errors.New("bad address")
	}
	return util.NewUnresolvedAddr("tcp", tr.addr), nil
}

// TestClientRetryBootstrap verifies that an initial failure to connect
// to a bootstrap host doesn't stall the bootstrapping process in the
// absence of any additional activity. This can happen during acceptance
// tests if DNS can't resolve hostnames when gossip is started.
func TestClientRetryBootstrap(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// Cluster ID shared by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()
	local := startGossip(clusterID, 1, stopper, t, metric.NewRegistry())
	remote := startGossip(clusterID, 2, stopper, t, metric.NewRegistry())

	if err := local.AddInfo("local-key", []byte("hello"), 0*time.Second); err != nil {
		t.Fatal(err)
	}

	local.SetBootstrapInterval(10 * time.Millisecond)
	resolvers := []resolver.Resolver{
		&testResolver{addr: remote.GetNodeAddr().String(), numFails: 3, numSuccesses: 1},
	}
	local.setResolvers(resolvers)
	local.bootstrap()
	local.manage()

	testutils.SucceedsSoon(t, func() error {
		_, err := remote.GetInfo("local-key")
		return err
	})
}
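
// A compile-time assertion (added as a sketch, not in the original file)
// that testResolver satisfies resolver.Resolver; TestClientRetryBootstrap
// relies on this when it places a *testResolver in a []resolver.Resolver.
var _ resolver.Resolver = &testResolver{}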

// TestClientForwardUnresolved verifies that a client does not resolve a
// forward address prematurely.
func TestClientForwardUnresolved(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())
	const nodeID = 1
	local := startGossip(uuid.Nil, nodeID, stopper, t, metric.NewRegistry())
	addr := local.GetNodeAddr()

	client := newClient(log.AmbientContext{Tracer: tracing.NewTracer()}, addr, makeMetrics()) // never started

	newAddr := util.UnresolvedAddr{
		NetworkField: "tcp",
		AddressField: "localhost:2345",
	}
	reply := &Response{
		NodeID:          nodeID,
		Addr:            *addr,
		AlternateNodeID: nodeID + 1,
		AlternateAddr:   &newAddr,
	}
	local.mu.Lock()
	local.outgoing.addPlaceholder() // so that the resolvePlaceholder in handleResponse doesn't fail
	local.mu.Unlock()
	if err := client.handleResponse(
		context.Background(), local, reply,
	); !testutils.IsError(err, "received forward") {
		t.Fatal(err)
	}
	if !proto.Equal(client.forwardAddr, &newAddr) {
		t.Fatalf("unexpected forward address %v, expected %v", client.forwardAddr, &newAddr)
	}
}