github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/gossip/gossip_test.go

// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package gossip

import (
	"bytes"
	"context"
	"fmt"
	"net"
	"reflect"
	"strconv"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
	"github.com/cockroachdb/cockroach/pkg/gossip/resolver"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/rpc"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
	"github.com/cockroachdb/cockroach/pkg/util/netutil"
	"github.com/cockroachdb/cockroach/pkg/util/retry"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
	"github.com/gogo/protobuf/proto"
)

// TestGossipInfoStore verifies operation of gossip instance infostore.
func TestGossipInfoStore(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())
	clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond)
	rpcContext := rpc.NewInsecureTestingContext(clock, stopper)
	g := NewTest(1, rpcContext, rpc.NewServer(rpcContext), stopper, metric.NewRegistry(), zonepb.DefaultZoneConfigRef())
	slice := []byte("b")
	if err := g.AddInfo("s", slice, time.Hour); err != nil {
		t.Fatal(err)
	}
	if val, err := g.GetInfo("s"); !bytes.Equal(val, slice) || err != nil {
		t.Errorf("error fetching string: %v", err)
	}
	if _, err := g.GetInfo("s2"); err == nil {
		t.Errorf("expected error fetching nonexistent key \"s2\"")
	}
}
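
// The following is a minimal, hypothetical sketch (not part of the original
// test suite) of the info-store round trip exercised by TestGossipInfoStore
// above: values are opaque byte slices stored under a key with a TTL, and
// reads of unknown or expired keys return an error. It assumes only AddInfo
// and GetInfo, whose usage appears elsewhere in this file.
func putAndGetSketch(g *Gossip, key string, val []byte, ttl time.Duration) ([]byte, error) {
	// Store the value; the TTL bounds how long peers keep it around.
	if err := g.AddInfo(key, val, ttl); err != nil {
		return nil, err
	}
	// Read it back; a missing key yields a non-nil error.
	return g.GetInfo(key)
}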

// TestGossipMoveNode verifies that if a node is moved to a new address, it
// gets properly updated in gossip.
func TestGossipMoveNode(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())
	clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond)
	rpcContext := rpc.NewInsecureTestingContext(clock, stopper)
	g := NewTest(1, rpcContext, rpc.NewServer(rpcContext), stopper, metric.NewRegistry(), zonepb.DefaultZoneConfigRef())
	var nodes []*roachpb.NodeDescriptor
	for i := 1; i <= 3; i++ {
		node := &roachpb.NodeDescriptor{
			NodeID:  roachpb.NodeID(i),
			Address: util.MakeUnresolvedAddr("tcp", fmt.Sprintf("1.1.1.1:%d", i)),
		}
		if err := g.SetNodeDescriptor(node); err != nil {
			t.Fatalf("failed setting node descriptor %+v: %s", node, err)
		}
		nodes = append(nodes, node)
	}
	for _, node := range nodes {
		if val, err := g.GetNodeDescriptor(node.NodeID); err != nil {
			t.Fatal(err)
		} else if !proto.Equal(node, val) {
			t.Fatalf("expected node %+v, got %+v", node, val)
		}
	}

	// Move node 2 to the address of node 3.
	movedNode := nodes[1]
	replacedNode := nodes[2]
	movedNode.Address = replacedNode.Address
	if err := g.SetNodeDescriptor(movedNode); err != nil {
		t.Fatal(err)
	}

	testutils.SucceedsSoon(t, func() error {
		if val, err := g.GetNodeDescriptor(movedNode.NodeID); err != nil {
			return err
		} else if !proto.Equal(movedNode, val) {
			return fmt.Errorf("expected node %+v, got %+v", movedNode, val)
		}
		return nil
	})
}

func TestGossipGetNextBootstrapAddress(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	resolverSpecs := []string{
		"127.0.0.1:9000",
		"127.0.0.1:9001",
		"localhost:9004",
	}

	resolvers := []resolver.Resolver{}
	for _, rs := range resolverSpecs {
		resolver, err := resolver.NewResolver(rs)
		if err == nil {
			resolvers = append(resolvers, resolver)
		}
	}
	if len(resolvers) != 3 {
		t.Errorf("expected 3 resolvers; got %d", len(resolvers))
	}
	clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond)
	rpcContext := rpc.NewInsecureTestingContext(clock, stopper)
	server := rpc.NewServer(rpcContext)
	g := NewTest(0, nil, server, stopper, metric.NewRegistry(), zonepb.DefaultZoneConfigRef())
	g.setResolvers(resolvers)

	// Using specified resolvers, fetch bootstrap addresses 3 times
	// and verify the results match expected addresses.
	expAddresses := []string{
		"127.0.0.1:9000",
		"127.0.0.1:9001",
		"localhost:9004",
	}
	for i := 0; i < len(expAddresses); i++ {
		g.mu.Lock()
		if addr := g.getNextBootstrapAddressLocked(); addr == nil {
			t.Errorf("%d: unexpected nil addr when expecting %s", i, expAddresses[i])
		} else if addrStr := addr.String(); addrStr != expAddresses[i] {
			t.Errorf("%d: expected addr %s; got %s", i, expAddresses[i], addrStr)
		}
		g.mu.Unlock()
	}
}

func TestGossipLocalityResolver(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())
	clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond)
	rpcContext := rpc.NewInsecureTestingContext(clock, stopper)

	gossipLocalityAdvertiseList := roachpb.Locality{}
	tier := roachpb.Tier{}
	tier.Key = "zone"
	tier.Value = "1"

	tier2 := roachpb.Tier{}
	tier2.Key = "zone"
	tier2.Value = "2"

	gossipLocalityAdvertiseList.Tiers = append(gossipLocalityAdvertiseList.Tiers, tier)

	node1PrivateAddress := util.MakeUnresolvedAddr("tcp", "1.0.0.1")
	node2PrivateAddress := util.MakeUnresolvedAddr("tcp", "2.0.0.1")

	node1PublicAddressRPC := util.MakeUnresolvedAddr("tcp", "1.1.1.1:1")
	node2PublicAddressRPC := util.MakeUnresolvedAddr("tcp", "2.2.2.2:3")
	node2PublicAddressSQL := util.MakeUnresolvedAddr("tcp", "2.2.2.2:4")

	var node1LocalityList []roachpb.LocalityAddress
	nodeLocalityAddress := roachpb.LocalityAddress{}
	nodeLocalityAddress.Address = node1PrivateAddress
	nodeLocalityAddress.LocalityTier = tier

	nodeLocalityAddress2 := roachpb.LocalityAddress{}
	nodeLocalityAddress2.Address = node2PrivateAddress
	nodeLocalityAddress2.LocalityTier = tier2

	node1LocalityList = append(node1LocalityList, nodeLocalityAddress)
	node1LocalityList = append(node1LocalityList, nodeLocalityAddress2)

	var node2LocalityList []roachpb.LocalityAddress
	node2LocalityList = append(node2LocalityList, nodeLocalityAddress2)

	g := NewTestWithLocality(1, rpcContext, rpc.NewServer(rpcContext), stopper, metric.NewRegistry(), gossipLocalityAdvertiseList, zonepb.DefaultZoneConfigRef())
	node1 := &roachpb.NodeDescriptor{
		NodeID:          1,
		Address:         node1PublicAddressRPC,
		LocalityAddress: node1LocalityList,
	}
	node2 := &roachpb.NodeDescriptor{
		NodeID:          2,
		Address:         node2PublicAddressRPC,
		SQLAddress:      node2PublicAddressSQL,
		LocalityAddress: node2LocalityList,
	}

	if err := g.SetNodeDescriptor(node1); err != nil {
		t.Fatal(err)
	}
	if err := g.SetNodeDescriptor(node2); err != nil {
		t.Fatal(err)
	}

	nodeAddress, err := g.GetNodeIDAddress(node1.NodeID)
	if err != nil {
		t.Error(err)
	}
	if *nodeAddress != node1PrivateAddress {
		t.Fatalf("expected: %s but got: %s address", node1PrivateAddress, *nodeAddress)
	}

	nodeAddress, err = g.GetNodeIDAddress(node2.NodeID)
	if err != nil {
		t.Error(err)
	}

	if *nodeAddress != node2PublicAddressRPC {
		t.Fatalf("expected: %s but got: %s address", node2PublicAddressRPC, *nodeAddress)
	}

	nodeAddressSQL, err := g.GetNodeIDSQLAddress(node2.NodeID)
	if err != nil {
		t.Error(err)
	}

	if *nodeAddressSQL != node2PublicAddressSQL {
		t.Fatalf("expected: %s but got: %s address", node2PublicAddressSQL, *nodeAddressSQL)
	}
}

func TestGossipRaceLogStatus(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())
	// Shared cluster ID by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()
	local := startGossip(clusterID, 1, stopper, t, metric.NewRegistry())

	local.mu.Lock()
	peer := startGossip(clusterID, 2, stopper, t, metric.NewRegistry())
	local.startClientLocked(&peer.mu.is.NodeAddr)
	local.mu.Unlock()

	// Race gossiping against LogStatus.
	gun := make(chan struct{})
	for i := uint8(0); i < 10; i++ {
		go func() {
			<-gun
			local.LogStatus()
			gun <- struct{}{}
		}()
		gun <- struct{}{}
		if err := local.AddInfo(
			strconv.FormatUint(uint64(i), 10),
			[]byte{i},
			time.Hour,
		); err != nil {
			t.Fatal(err)
		}
		<-gun
	}
	close(gun)
}

// TestGossipOutgoingLimitEnforced verifies that a gossip node won't open more
// outgoing connections than it should. If the gossip implementation is racy
// with respect to opening outgoing connections, this may not fail every time
// it's run, but should fail very quickly if run under stress.
func TestGossipOutgoingLimitEnforced(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// This test has an implicit dependency on the maxPeers logic deciding that
	// maxPeers is 3 for a 5-node cluster, so let's go ahead and make that
	// explicit.
	maxPeers := maxPeers(5)
	if maxPeers > 3 {
		t.Fatalf("maxPeers(5)=%d, which is higher than this test's assumption", maxPeers)
	}

	// Shared cluster ID by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	local := startGossip(clusterID, 1, stopper, t, metric.NewRegistry())
	local.mu.Lock()
	localAddr := local.mu.is.NodeAddr
	local.mu.Unlock()
	var peers []*Gossip
	for i := 0; i < 4; i++ {
		// After creating a new node, join it to the first node to ensure that the
		// network is connected (and thus all nodes know each other's addresses)
		// before we start the actual test.
		newPeer := startGossip(clusterID, roachpb.NodeID(i+2), stopper, t, metric.NewRegistry())
		newPeer.mu.Lock()
		newPeer.startClientLocked(&localAddr)
		newPeer.mu.Unlock()
		peers = append(peers, newPeer)
	}

	// Wait until the network is at least mostly connected.
	testutils.SucceedsSoon(t, func() error {
		local.mu.Lock()
		defer local.mu.Unlock()
		if local.mu.incoming.len() == maxPeers {
			return nil
		}
		return fmt.Errorf("local.mu.incoming.len() = %d, want %d", local.mu.incoming.len(), maxPeers)
	})

	// Verify that we can't open more than maxPeers connections. We have to muck
	// with the infostore's data so that the other nodes will appear far enough
	// away to be worth opening a connection to.
	local.mu.Lock()
	err := local.mu.is.visitInfos(func(key string, i *Info) error {
		copy := *i
		copy.Hops = maxHops + 1
		copy.Value.Timestamp.WallTime++
		return local.mu.is.addInfo(key, &copy)
	}, true /* deleteExpired */)
	local.mu.Unlock()
	if err != nil {
		t.Fatal(err)
	}
	for range peers {
		local.tightenNetwork(context.Background())
	}

	if outgoing := local.outgoing.gauge.Value(); outgoing > int64(maxPeers) {
		t.Errorf("outgoing nodeSet has %d connections; the max should be %d", outgoing, maxPeers)
	}
	local.clientsMu.Lock()
	if numClients := len(local.clientsMu.clients); numClients > maxPeers {
		t.Errorf("local gossip has %d clients; the max should be %d", numClients, maxPeers)
	}
	local.clientsMu.Unlock()
}
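
// The maxPeers assumption above rests on gossip fanout arithmetic: if each
// node keeps up to `peers` outgoing connections and infos may travel `hops`
// hops, roughly peers^hops nodes are reachable, so a fanout of 3 comfortably
// covers a 5-node cluster. The helper below is a hypothetical
// back-of-the-envelope sketch of that intuition only; it is not used by the
// suite, and the package's actual maxPeers computation may differ in detail.
func reachableNodesSketch(peers, hops int) int {
	n := 1
	for i := 0; i < hops; i++ {
		n *= peers // each hop multiplies the reachable set by the fanout
	}
	return n
}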

func TestGossipMostDistant(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	connect := func(from, to *Gossip) {
		to.mu.Lock()
		addr := to.mu.is.NodeAddr
		to.mu.Unlock()
		from.mu.Lock()
		from.startClientLocked(&addr)
		from.mu.Unlock()
	}

	mostDistant := func(g *Gossip) (roachpb.NodeID, uint32) {
		g.mu.Lock()
		distantNodeID, distantHops := g.mu.is.mostDistant(func(roachpb.NodeID) bool {
			return false
		})
		g.mu.Unlock()
		return distantNodeID, distantHops
	}

	const n = 10
	testCases := []struct {
		from, to int
	}{
		{0, n - 1}, // n1 connects to n10
		{n - 1, 0}, // n10 connects to n1
	}

	for _, c := range testCases {
		t.Run("", func(t *testing.T) {
			// Shared cluster ID by all gossipers (this ensures that the gossipers
			// don't talk to servers from unrelated tests by accident).
			clusterID := uuid.MakeV4()

			// Set up a gossip network of 10 nodes connected in a single line:
			//
			//   1 <- 2 <- 3 <- 4 <- 5 <- 6 <- 7 <- 8 <- 9 <- 10
			nodes := make([]*Gossip, n)
			for i := range nodes {
				nodes[i] = startGossip(clusterID, roachpb.NodeID(i+1), stopper, t, metric.NewRegistry())
				if i == 0 {
					continue
				}
				connect(nodes[i], nodes[i-1])
			}

			// Wait for n1 to determine that n10 is the most distant node.
			testutils.SucceedsSoon(t, func() error {
				g := nodes[0]
				distantNodeID, distantHops := mostDistant(g)
				if distantNodeID == 10 && distantHops == 9 {
					return nil
				}
				return fmt.Errorf("n%d: distantHops: %d from n%d", g.NodeID.Get(), distantHops, distantNodeID)
			})
			// Wait for the infos to be fully propagated.
			testutils.SucceedsSoon(t, func() error {
				infosCount := func(g *Gossip) int {
					g.mu.Lock()
					defer g.mu.Unlock()
					return len(g.mu.is.Infos)
				}
				count := infosCount(nodes[0])
				for _, g := range nodes[1:] {
					if tmp := infosCount(g); tmp != count {
						return fmt.Errorf("unexpected info count: %d != %d", tmp, count)
					}
				}
				return nil
			})

			// Connect the network in a loop. This will cut the distance to the most
			// distant node in half.
			log.Infof(context.Background(), "connecting from n%d to n%d", c.from, c.to)
			connect(nodes[c.from], nodes[c.to])

			// Wait for n1 to see the most distant hop count drop from 9 to 5 and
			// the most distant node change to n6.
			testutils.SucceedsSoon(t, func() error {
				g := nodes[0]
				g.mu.Lock()
				var buf bytes.Buffer
				_ = g.mu.is.visitInfos(func(key string, i *Info) error {
					if i.NodeID != 1 && IsNodeIDKey(key) {
						fmt.Fprintf(&buf, "n%d: hops=%d\n", i.NodeID, i.Hops)
					}
					return nil
				}, true /* deleteExpired */)
				g.mu.Unlock()

				distantNodeID, distantHops := mostDistant(g)
				if distantNodeID == 6 && distantHops == 5 {
					return nil
				}
				return fmt.Errorf("n%d: distantHops: %d from n%d\n%s",
					g.NodeID.Get(), distantHops, distantNodeID, buf.String())
			})
		})
	}
}

// TestGossipNoForwardSelf verifies that when a Gossip instance is full, it
// redirects clients elsewhere (in particular not to itself).
//
// NB: Stress testing this test really stresses the OS networking stack
// more than anything else. For example, on Linux it may quickly deplete
// the ephemeral port range (due to the TIME_WAIT state).
// On a box which only runs tests, this can be circumvented by running
//
//	sudo bash -c "echo 1 > /proc/sys/net/ipv4/tcp_tw_recycle"
//
// See https://vincent.bernat.im/en/blog/2014-tcp-time-wait-state-linux.html
// for details.
//
// On OSX, things similarly fall apart. See #7524 and #5218 for some discussion
// of this.
func TestGossipNoForwardSelf(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// Shared cluster ID by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	local := startGossip(clusterID, 1, stopper, t, metric.NewRegistry())

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Start one loopback client plus enough additional clients to fill the
	// incoming clients.
	peers := []*Gossip{local}
	local.server.mu.Lock()
	maxSize := local.server.mu.incoming.maxSize
	local.server.mu.Unlock()
	for i := 0; i < maxSize; i++ {
		peers = append(peers, startGossip(clusterID, roachpb.NodeID(i+2), stopper, t, metric.NewRegistry()))
	}

	for _, peer := range peers {
		c := newClient(log.AmbientContext{Tracer: tracing.NewTracer()}, local.GetNodeAddr(), makeMetrics())

		testutils.SucceedsSoon(t, func() error {
			conn, err := peer.rpcContext.GRPCUnvalidatedDial(c.addr.String()).Connect(ctx)
			if err != nil {
				return err
			}

			stream, err := NewGossipClient(conn).Gossip(ctx)
			if err != nil {
				return err
			}

			if err := c.requestGossip(peer, stream); err != nil {
				return err
			}

			// Wait until the server responds, so we know we're connected.
			_, err = stream.Recv()
			return err
		})
	}

	numClients := len(peers) * 2
	disconnectedCh := make(chan *client)

	// Start a few overflow peers and assert that they don't get forwarded to us
	// again.
	for i := 0; i < numClients; i++ {
		local.server.mu.Lock()
		maxSize := local.server.mu.incoming.maxSize
		local.server.mu.Unlock()
		peer := startGossip(clusterID, roachpb.NodeID(i+maxSize+2), stopper, t, metric.NewRegistry())

		for {
			localAddr := local.GetNodeAddr()
			c := newClient(log.AmbientContext{Tracer: tracing.NewTracer()}, localAddr, makeMetrics())
			peer.mu.Lock()
			c.startLocked(peer, disconnectedCh, peer.rpcContext, stopper, peer.rpcContext.NewBreaker(""))
			peer.mu.Unlock()

			disconnectedClient := <-disconnectedCh
			if disconnectedClient != c {
				t.Fatalf("expected %p to be disconnected, got %p", c, disconnectedClient)
			} else if c.forwardAddr == nil {
				// Under high load, clients sometimes fail to connect for reasons
				// unrelated to the test, so we need to permit some.
				t.Logf("node #%d: got nil forwarding address", peer.NodeID.Get())
				continue
			} else if *c.forwardAddr == *localAddr {
				t.Errorf("node #%d: got local's forwarding address", peer.NodeID.Get())
			}
			break
		}
	}
}

// TestGossipCullNetwork verifies that a client will be culled from
// the network periodically (at cullInterval duration intervals).
func TestGossipCullNetwork(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// Shared cluster ID by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	local := startGossip(clusterID, 1, stopper, t, metric.NewRegistry())
	local.SetCullInterval(5 * time.Millisecond)

	local.mu.Lock()
	for i := 0; i < minPeers; i++ {
		peer := startGossip(clusterID, roachpb.NodeID(i+2), stopper, t, metric.NewRegistry())
		local.startClientLocked(peer.GetNodeAddr())
	}
	local.mu.Unlock()

	const slowGossipDuration = time.Minute

	if err := retry.ForDuration(slowGossipDuration, func() error {
		if peers := len(local.Outgoing()); peers != minPeers {
			return errors.Errorf("%d of %d peers connected", peers, minPeers)
		}
		return nil
	}); err != nil {
		t.Fatalf("condition failed to evaluate within %s: %s", slowGossipDuration, err)
	}

	local.manage()

	if err := retry.ForDuration(slowGossipDuration, func() error {
		// Verify that a client is closed within the cull interval.
		if peers := len(local.Outgoing()); peers != minPeers-1 {
			return errors.Errorf("%d of %d peers connected", peers, minPeers-1)
		}
		return nil
	}); err != nil {
		t.Fatalf("condition failed to evaluate within %s: %s", slowGossipDuration, err)
	}
}

func TestGossipOrphanedStallDetection(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// Shared cluster ID by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	local := startGossip(clusterID, 1, stopper, t, metric.NewRegistry())
	local.SetStallInterval(5 * time.Millisecond)

	// Make sure we have the sentinel to ensure that its absence is not the
	// cause of stall detection.
	if err := local.AddInfo(KeySentinel, nil, time.Hour); err != nil {
		t.Fatal(err)
	}

	peerStopper := stop.NewStopper()
	peer := startGossip(clusterID, 2, peerStopper, t, metric.NewRegistry())

	peerNodeID := peer.NodeID.Get()
	peerAddr := peer.GetNodeAddr()
	peerAddrStr := peerAddr.String()

	local.mu.Lock()
	local.startClientLocked(peerAddr)
	local.mu.Unlock()

	testutils.SucceedsSoon(t, func() error {
		for _, peerID := range local.Outgoing() {
			if peerID == peerNodeID {
				return nil
			}
		}
		return errors.Errorf("n%d not yet connected", peerNodeID)
	})

	testutils.SucceedsSoon(t, func() error {
		for _, resolver := range local.GetResolvers() {
			if resolver.Addr() == peerAddrStr {
				return nil
			}
		}
		return errors.Errorf("n%d descriptor not yet available", peerNodeID)
	})

	local.bootstrap()
	local.manage()

	peerStopper.Stop(context.Background())

	testutils.SucceedsSoon(t, func() error {
		for _, peerID := range local.Outgoing() {
			if peerID == peerNodeID {
				return errors.Errorf("n%d still connected", peerNodeID)
			}
		}
		return nil
	})

	peerStopper = stop.NewStopper()
	defer peerStopper.Stop(context.Background())
	startGossipAtAddr(clusterID, peerNodeID, peerAddr, peerStopper, t, metric.NewRegistry())

	testutils.SucceedsSoon(t, func() error {
		for _, peerID := range local.Outgoing() {
			if peerID == peerNodeID {
				return nil
			}
		}
		return errors.Errorf("n%d not yet connected", peerNodeID)
	})
}

// TestGossipJoinTwoClusters verifies that a node can't
// participate in two separate clusters if two nodes from different
// clusters are specified as bootstrap hosts. Previously, this was
// allowed, because a node verified the cluster ID only at startup:
// if, after joining the first cluster via that cluster's init node,
// the init node shut down, the joining node would reconnect via its
// second bootstrap host and begin to participate [illegally] in
// another cluster.
func TestGossipJoinTwoClusters(t *testing.T) {
	defer leaktest.AfterTest(t)()

	const interval = 10 * time.Millisecond
	var stoppers []*stop.Stopper
	var g []*Gossip
	var clusterIDs []uuid.UUID
	var addrs []net.Addr

	// Create three gossip nodes, init the first two with no bootstrap
	// hosts, but unique cluster IDs. The third host has the first two
	// hosts as bootstrap hosts, but has the same cluster ID as the
	// first of its bootstrap hosts.
	for i := 0; i < 3; i++ {
		stopper := stop.NewStopper()
		stoppers = append(stoppers, stopper)
		defer func() {
			select {
			case <-stopper.ShouldQuiesce():
			default:
				stopper.Stop(context.Background())
			}
		}()

		var clusterID uuid.UUID
		switch i {
		case 0, 1:
			clusterID = uuid.MakeV4()
		case 2:
			clusterID = clusterIDs[0]
		}
		clusterIDs = append(clusterIDs, clusterID)
		clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond)
		rpcContext := rpc.NewInsecureTestingContextWithClusterID(clock, stopper, clusterID)

		server := rpc.NewServer(rpcContext)

		// node ID must be non-zero
		gnode := NewTest(
			roachpb.NodeID(i+1), rpcContext, server, stopper, metric.NewRegistry(), zonepb.DefaultZoneConfigRef())
		g = append(g, gnode)
		gnode.SetStallInterval(interval)
		gnode.SetBootstrapInterval(interval)
		gnode.clusterID.Set(context.Background(), clusterIDs[i])

		ln, err := netutil.ListenAndServeGRPC(stopper, server, util.IsolatedTestAddr)
		if err != nil {
			t.Fatal(err)
		}
		addrs = append(addrs, ln.Addr())

		// Only the third node has resolvers.
		var resolvers []resolver.Resolver
		switch i {
		case 2:
			for j := 0; j < 2; j++ {
				resolver, err := resolver.NewResolver(addrs[j].String())
				if err != nil {
					t.Fatal(err)
				}
				resolvers = append(resolvers, resolver)
			}
		}
		gnode.Start(ln.Addr(), resolvers)
	}

	// Wait for connections.
	testutils.SucceedsSoon(t, func() error {
		// The first gossip node should have one gossip client address
		// in nodeMap if the 2nd gossip node connected. The second gossip
		// node should have none.
		g[0].mu.Lock()
		defer g[0].mu.Unlock()
		if a, e := len(g[0].mu.nodeMap), 1; a != e {
			return errors.Errorf("expected %v to contain %d nodes, got %d", g[0].mu.nodeMap, e, a)
		}
		g[1].mu.Lock()
		defer g[1].mu.Unlock()
		if a, e := len(g[1].mu.nodeMap), 0; a != e {
			return errors.Errorf("expected %v to contain %d nodes, got %d", g[1].mu.nodeMap, e, a)
		}
		return nil
	})

	// Kill node 0 to force node 2 to bootstrap with node 1.
	stoppers[0].Stop(context.Background())
	// Wait for twice the bootstrap interval, and verify that
	// node 2 still has not connected to node 1.
	time.Sleep(2 * interval)

	g[1].mu.Lock()
	if a, e := len(g[1].mu.nodeMap), 0; a != e {
		t.Errorf("expected %v to contain %d nodes, got %d", g[1].mu.nodeMap, e, a)
	}
	g[1].mu.Unlock()
}

// Test propagation of gossip infos in both directions across an existing
// gossip connection.
func TestGossipPropagation(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// Shared cluster ID by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	local := startGossip(clusterID, 1, stopper, t, metric.NewRegistry())
	remote := startGossip(clusterID, 2, stopper, t, metric.NewRegistry())
	remote.mu.Lock()
	rAddr := remote.mu.is.NodeAddr
	remote.mu.Unlock()
	local.manage()
	remote.manage()

	mustAdd := func(g *Gossip, key string, val []byte, ttl time.Duration) {
		if err := g.AddInfo(key, val, ttl); err != nil {
			t.Fatal(err)
		}
	}

	// Gossip a key on local and wait for it to show up on remote. This
	// guarantees we have an active local to remote client connection.
	mustAdd(local, "bootstrap", nil, 0)
	testutils.SucceedsSoon(t, func() error {
		c := local.findClient(func(c *client) bool { return c.addr.String() == rAddr.String() })
		if c == nil {
			// Restart the client connection in the loop. It might have failed due to
			// a heartbeat timeout.
			local.mu.Lock()
			local.startClientLocked(&rAddr)
			local.mu.Unlock()
			return fmt.Errorf("unable to find local to remote client")
		}
		_, err := remote.GetInfo("bootstrap")
		return err
	})

	// Add entries on both the local and remote nodes and verify they get propagated.
	mustAdd(local, "local", nil, time.Minute)
	mustAdd(remote, "remote", nil, time.Minute)

	getInfo := func(g *Gossip, key string) *Info {
		g.mu.RLock()
		defer g.mu.RUnlock()
		return g.mu.is.Infos[key]
	}

	var localInfo *Info
	var remoteInfo *Info
	testutils.SucceedsSoon(t, func() error {
		localInfo = getInfo(remote, "local")
		if localInfo == nil {
			return fmt.Errorf("local info not propagated")
		}
		remoteInfo = getInfo(local, "remote")
		if remoteInfo == nil {
			return fmt.Errorf("remote info not propagated")
		}
		return nil
	})

	// Replace the existing entries on both the local and remote nodes and verify
	// these new entries get propagated with updated timestamps.
	mustAdd(local, "local", nil, 2*time.Minute)
	mustAdd(remote, "remote", nil, 2*time.Minute)

	testutils.SucceedsSoon(t, func() error {
		if i := getInfo(remote, "local"); i == nil || reflect.DeepEqual(i, localInfo) {
			return fmt.Errorf("new local info not propagated:\n%v\n%v", i, localInfo)
		}
		if i := getInfo(local, "remote"); reflect.DeepEqual(i, remoteInfo) {
			return fmt.Errorf("new remote info not propagated:\n%v\n%v", i, remoteInfo)
		}
		return nil
	})
}

// Test whether propagation of an info that was generated by a prior
// incarnation of a server can correctly be sent back to that originating
// server. Consider the scenario:
//
//  n1: decommissioned
//  n2: gossip node-liveness:1
//  n3: node-liveness range lease acquired (does not gossip node-liveness:1
//      record because it is unchanged)
//  n2: restarted
//   - connects as gossip client to n3
//   - sends a batch of gossip records to n3
//   - n3 responds without sending node-liveness:1 because its
//     OrigStamp is less than the highwater stamp from n2
func TestGossipLoopbackInfoPropagation(t *testing.T) {
	defer leaktest.AfterTest(t)()
	t.Skipf("#34494")
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// Shared cluster ID by all gossipers (this ensures that the gossipers
	// don't talk to servers from unrelated tests by accident).
	clusterID := uuid.MakeV4()

	local := startGossip(clusterID, 1, stopper, t, metric.NewRegistry())
	remote := startGossip(clusterID, 2, stopper, t, metric.NewRegistry())
	remote.mu.Lock()
	rAddr := remote.mu.is.NodeAddr
	remote.mu.Unlock()
	local.manage()
	remote.manage()

	// Add a gossip info for "foo" on remote that was generated by local. This
	// simulates what happens if local were to gossip an info, and later restart
	// and never gossip that info again.
	func() {
		local.mu.Lock()
		defer local.mu.Unlock()
		remote.mu.Lock()
		defer remote.mu.Unlock()
		// NB: replacing local.mu.is.newInfo with remote.mu.is.newInfo allows "foo"
		// to be propagated.
		if err := remote.mu.is.addInfo("foo", local.mu.is.newInfo(nil, 0)); err != nil {
			t.Fatal(err)
		}
	}()

	// Add an info to local so that it has a highwater timestamp that is newer
	// than the info we added to remote. NB: commenting out this line allows
	// "foo" to be propagated.
	if err := local.AddInfo("bar", nil, 0); err != nil {
		t.Fatal(err)
	}

	// Start a client connection to the remote node.
	local.mu.Lock()
	local.startClientLocked(&rAddr)
	local.mu.Unlock()

	getInfo := func(g *Gossip, key string) *Info {
		g.mu.RLock()
		defer g.mu.RUnlock()
		return g.mu.is.Infos[key]
	}

	testutils.SucceedsSoon(t, func() error {
		if getInfo(remote, "bar") == nil {
			return fmt.Errorf("bar not propagated")
		}
		if getInfo(local, "foo") == nil {
			return fmt.Errorf("foo not propagated")
		}
		return nil
	})
}
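
// waitForInfoSketch is a hypothetical convenience wrapper, not used by the
// suite, that names the wait-for-propagation pattern the tests above repeat:
// poll a peer's GetInfo for a key until it arrives or the SucceedsSoon
// deadline expires. It relies only on GetInfo and testutils.SucceedsSoon as
// already used in this file.
func waitForInfoSketch(t *testing.T, g *Gossip, key string) {
	t.Helper()
	testutils.SucceedsSoon(t, func() error {
		// GetInfo returns a non-nil error until the key has been gossiped to g.
		_, err := g.GetInfo(key)
		return err
	})
}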