github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/raft_transport_test.go

// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver_test

import (
	"context"
	"math/rand"
	"net"
	"reflect"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/rpc"
	"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
	"github.com/cockroachdb/cockroach/pkg/util/netutil"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
	"github.com/stretchr/testify/require"
	"go.etcd.io/etcd/raft/raftpb"
)

const channelServerBrokenRangeMessage = "channelServer broken range"

type channelServer struct {
	ch       chan *kvserver.RaftMessageRequest
	maxSleep time.Duration

	// If non-zero, all messages to this range will return errors.
	brokenRange roachpb.RangeID
}

func newChannelServer(bufSize int, maxSleep time.Duration) channelServer {
	return channelServer{
		ch:       make(chan *kvserver.RaftMessageRequest, bufSize),
		maxSleep: maxSleep,
	}
}

func (s channelServer) HandleRaftRequest(
	ctx context.Context, req *kvserver.RaftMessageRequest, _ kvserver.RaftMessageResponseStream,
) *roachpb.Error {
	if s.maxSleep != 0 {
		// maxSleep simulates goroutine scheduling delays that could
		// result in messages being processed out of order (in previous
		// transport implementations).
		time.Sleep(time.Duration(rand.Int63n(int64(s.maxSleep))))
	}
	if s.brokenRange != 0 && s.brokenRange == req.RangeID {
		return roachpb.NewErrorf(channelServerBrokenRangeMessage)
	}
	s.ch <- req
	return nil
}

func (s channelServer) HandleRaftResponse(
	ctx context.Context, resp *kvserver.RaftMessageResponse,
) error {
	// Mimic the logic in (*Store).HandleRaftResponse without requiring an
	// entire Store object to be pulled into these tests.
	if val, ok := resp.Union.GetValue().(*roachpb.Error); ok {
		if err, ok := val.GetDetail().(*roachpb.StoreNotFoundError); ok {
			return err
		}
	}
	log.Fatalf(ctx, "unexpected raft response: %s", resp)
	return nil
}

func (s channelServer) HandleSnapshot(
	header *kvserver.SnapshotRequest_Header, stream kvserver.SnapshotResponseStream,
) error {
	panic("unexpected HandleSnapshot")
}

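// channelServer is handed to RaftTransport.Listen (via
// raftTransportTestContext.ListenStore below), so it has to satisfy the
// transport's handler interface. Assuming that interface is
// kvserver.RaftMessageHandler (which Listen's signature implies), a
// compile-time assertion documenting this would read:
//
//	var _ kvserver.RaftMessageHandler = channelServer{}
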
// raftTransportTestContext contains objects needed to test RaftTransport.
// Typical usage will add multiple nodes with AddNode, attach channels
// to at least one store with ListenStore, and send messages with Send.
type raftTransportTestContext struct {
	t              testing.TB
	stopper        *stop.Stopper
	transports     map[roachpb.NodeID]*kvserver.RaftTransport
	nodeRPCContext *rpc.Context
	gossip         *gossip.Gossip
}

func newRaftTransportTestContext(t testing.TB) *raftTransportTestContext {
	rttc := &raftTransportTestContext{
		t:          t,
		stopper:    stop.NewStopper(),
		transports: map[roachpb.NodeID]*kvserver.RaftTransport{},
	}
	rttc.nodeRPCContext = rpc.NewContext(
		log.AmbientContext{Tracer: tracing.NewTracer()},
		testutils.NewNodeTestBaseContext(),
		hlc.NewClock(hlc.UnixNano, time.Nanosecond),
		rttc.stopper,
		cluster.MakeTestingClusterSettings(),
	)
	// Ensure that tests which use this test context and restart/shut down
	// their servers do not inadvertently start talking to servers from
	// unrelated concurrent tests.
	rttc.nodeRPCContext.ClusterID.Set(context.Background(), uuid.MakeV4())

	// We are sharing the same RPC context for all simulated nodes, so
	// we can't enforce some of the RPC validation checks.
	rttc.nodeRPCContext.TestingAllowNamedRPCToAnonymousServer = true

	server := rpc.NewServer(rttc.nodeRPCContext) // never started
	rttc.gossip = gossip.NewTest(
		1, rttc.nodeRPCContext, server, rttc.stopper, metric.NewRegistry(), zonepb.DefaultZoneConfigRef(),
	)

	return rttc
}

func (rttc *raftTransportTestContext) Stop() {
	rttc.stopper.Stop(context.Background())
}

// AddNode registers a node with the cluster. Nodes must be added
// before they can be used in other methods of
// raftTransportTestContext. The node will be gossiped immediately.
func (rttc *raftTransportTestContext) AddNode(nodeID roachpb.NodeID) *kvserver.RaftTransport {
	transport, addr := rttc.AddNodeWithoutGossip(nodeID, util.TestAddr, rttc.stopper)
	rttc.GossipNode(nodeID, addr)
	return transport
}

// AddNodeWithoutGossip registers a node with the cluster. Nodes must
// be added before they can be used in other methods of
// raftTransportTestContext. Unless you are testing the effects of
// delaying gossip, use AddNode instead.
func (rttc *raftTransportTestContext) AddNodeWithoutGossip(
	nodeID roachpb.NodeID, addr net.Addr, stopper *stop.Stopper,
) (*kvserver.RaftTransport, net.Addr) {
	grpcServer := rpc.NewServer(rttc.nodeRPCContext)
	transport := kvserver.NewRaftTransport(
		log.AmbientContext{Tracer: tracing.NewTracer()},
		cluster.MakeTestingClusterSettings(),
		nodedialer.New(rttc.nodeRPCContext, gossip.AddressResolver(rttc.gossip)),
		grpcServer,
		rttc.stopper,
	)
	rttc.transports[nodeID] = transport
	ln, err := netutil.ListenAndServeGRPC(stopper, grpcServer, addr)
	if err != nil {
		rttc.t.Fatal(err)
	}
	return transport, ln.Addr()
}

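// An illustrative sketch of the typical flow described on
// raftTransportTestContext above, using only helpers from this file (given a
// testing.T t; not itself a test):
//
//	rttc := newRaftTransportTestContext(t)
//	defer rttc.Stop()
//	rttc.AddNode(1)
//	rttc.AddNode(2)
//	ch := rttc.ListenStore(2, 2) // channelServer for store 2 on node 2
//	from := roachpb.ReplicaDescriptor{NodeID: 1, StoreID: 1, ReplicaID: 1}
//	to := roachpb.ReplicaDescriptor{NodeID: 2, StoreID: 2, ReplicaID: 2}
//	if rttc.Send(from, to, 1 /* rangeID */, raftpb.Message{}) {
//		req := <-ch.ch // the request surfaces on the store's channel
//		_ = req
//	}
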
// GossipNode gossips the node's address, which is necessary before
// any messages can be sent to it. Normally done automatically by
// AddNode.
func (rttc *raftTransportTestContext) GossipNode(nodeID roachpb.NodeID, addr net.Addr) {
	if err := rttc.gossip.AddInfoProto(gossip.MakeNodeIDKey(nodeID),
		&roachpb.NodeDescriptor{
			NodeID:  nodeID,
			Address: util.MakeUnresolvedAddr(addr.Network(), addr.String()),
		},
		time.Hour); err != nil {
		rttc.t.Fatal(err)
	}
}

// ListenStore registers a store on a node and returns a channel for
// messages sent to that store.
func (rttc *raftTransportTestContext) ListenStore(
	nodeID roachpb.NodeID, storeID roachpb.StoreID,
) channelServer {
	ch := newChannelServer(100, 10*time.Millisecond)
	rttc.transports[nodeID].Listen(storeID, ch)
	return ch
}

// Send a message. Returns false if the message was dropped.
func (rttc *raftTransportTestContext) Send(
	from, to roachpb.ReplicaDescriptor, rangeID roachpb.RangeID, msg raftpb.Message,
) bool {
	msg.To = uint64(to.ReplicaID)
	msg.From = uint64(from.ReplicaID)
	req := &kvserver.RaftMessageRequest{
		RangeID:     rangeID,
		Message:     msg,
		ToReplica:   to,
		FromReplica: from,
	}
	return rttc.transports[from.NodeID].SendAsync(req, rpc.DefaultClass)
}

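// Note that Send is fire-and-forget: it stamps the raft-level To/From fields
// with the replica IDs from the descriptors, the transport routes the request
// by ToReplica.NodeID, and SendAsync returns immediately. Its boolean result
// (false when the message was dropped, e.g. because the circuit breaker for
// the destination has tripped, as exercised in TestRaftTransportCircuitBreaker
// below) is the only delivery feedback the caller gets.
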
func TestSendAndReceive(t *testing.T) {
	defer leaktest.AfterTest(t)()
	rttc := newRaftTransportTestContext(t)
	defer rttc.Stop()

	// Create several servers, each of which has two stores (a raft
	// node ID addresses a store). Node and store IDs start at 2 (the
	// test gossip instance uses node ID 1), so node 2 has stores 2
	// and 3, node 3 has stores 4 and 5, etc.
	//
	// We suppose that range 1 is replicated across the odd-numbered
	// stores in reverse order to ensure that the various IDs are not
	// equal: replica 1 is store 5, replica 2 is store 3, and replica 3
	// is store 1.
	const numNodes = 3
	const storesPerNode = 2
	nextNodeID := roachpb.NodeID(2)
	nextStoreID := roachpb.StoreID(2)

	// Per-node state.
	transports := map[roachpb.NodeID]*kvserver.RaftTransport{}

	// Per-store state.
	storeNodes := map[roachpb.StoreID]roachpb.NodeID{}
	channels := map[roachpb.StoreID]channelServer{}
	replicaIDs := map[roachpb.StoreID]roachpb.ReplicaID{
		1: 3,
		3: 2,
		5: 1,
	}

	messageTypes := map[raftpb.MessageType]struct{}{
		raftpb.MsgHeartbeat: {},
	}

	for nodeIndex := 0; nodeIndex < numNodes; nodeIndex++ {
		nodeID := nextNodeID
		nextNodeID++
		transports[nodeID] = rttc.AddNode(nodeID)

		for storeIndex := 0; storeIndex < storesPerNode; storeIndex++ {
			storeID := nextStoreID
			nextStoreID++

			storeNodes[storeID] = nodeID

			channels[storeID] = rttc.ListenStore(nodeID, storeID)
		}
	}

	messageTypeCounts := make(map[roachpb.StoreID]map[raftpb.MessageType]int)

	// Each store sends one heartbeat to each store, including itself.
	for toStoreID, toNodeID := range storeNodes {
		if _, ok := messageTypeCounts[toStoreID]; !ok {
			messageTypeCounts[toStoreID] = make(map[raftpb.MessageType]int)
		}

		for fromStoreID, fromNodeID := range storeNodes {
			baseReq := kvserver.RaftMessageRequest{
				RangeID: 1,
				Message: raftpb.Message{
					From: uint64(fromStoreID),
					To:   uint64(toStoreID),
				},
				FromReplica: roachpb.ReplicaDescriptor{
					NodeID:  fromNodeID,
					StoreID: fromStoreID,
				},
				ToReplica: roachpb.ReplicaDescriptor{
					NodeID:  toNodeID,
					StoreID: toStoreID,
				},
			}

			for messageType := range messageTypes {
				req := baseReq
				req.Message.Type = messageType

				if !transports[fromNodeID].SendAsync(&req, rpc.DefaultClass) {
					t.Errorf("unable to send %s from %d to %d", messageType, fromNodeID, toNodeID)
				}
				messageTypeCounts[toStoreID][messageType]++
			}
		}
	}

	// Read all the messages from the channels. Note that the transport
	// does not guarantee in-order delivery between independent
	// transports, so we just verify that the right number of messages
	// end up in each channel.
	for toStoreID := range storeNodes {
		for len(messageTypeCounts[toStoreID]) > 0 {
			req := <-channels[toStoreID].ch
			if req.Message.To != uint64(toStoreID) {
				t.Errorf("got unexpected message %v on channel %d", req, toStoreID)
			}

			if typeCounts, ok := messageTypeCounts[toStoreID]; ok {
				if _, ok := typeCounts[req.Message.Type]; ok {
					typeCounts[req.Message.Type]--
					if typeCounts[req.Message.Type] == 0 {
						delete(typeCounts, req.Message.Type)
					}
				} else {
					t.Errorf("expected %v to have key %v, but it did not", typeCounts, req.Message.Type)
				}
			} else {
				t.Errorf("expected %v to have key %v, but it did not", messageTypeCounts, toStoreID)
			}
		}

		delete(messageTypeCounts, toStoreID)

		select {
		case req := <-channels[toStoreID].ch:
			t.Errorf("got unexpected message %v on channel %d", req, toStoreID)
		case <-time.After(100 * time.Millisecond):
		}
	}

	if len(messageTypeCounts) > 0 {
		t.Errorf("remaining messages expected: %v", messageTypeCounts)
	}

	// Real raft messages have different node/store/replica IDs.
	// Send a message from replica 2 (on store 3, node 2) to replica 1
	// (on store 5, node 3).
	fromStoreID := roachpb.StoreID(3)
	toStoreID := roachpb.StoreID(5)
	expReq := &kvserver.RaftMessageRequest{
		RangeID: 1,
		Message: raftpb.Message{
			Type: raftpb.MsgApp,
			From: uint64(replicaIDs[fromStoreID]),
			To:   uint64(replicaIDs[toStoreID]),
		},
		FromReplica: roachpb.ReplicaDescriptor{
			NodeID:    storeNodes[fromStoreID],
			StoreID:   fromStoreID,
			ReplicaID: replicaIDs[fromStoreID],
		},
		ToReplica: roachpb.ReplicaDescriptor{
			NodeID:    storeNodes[toStoreID],
			StoreID:   toStoreID,
			ReplicaID: replicaIDs[toStoreID],
		},
	}
	// NB: the argument passed to SendAsync is not safe to use afterwards; make
	// a copy.
	expReqCopy := *expReq
	if !transports[storeNodes[fromStoreID]].SendAsync(&expReqCopy, rpc.DefaultClass) {
		t.Errorf("unable to send message from %d to %d", fromStoreID, toStoreID)
	}
	// NB: proto.Equal will panic here since it doesn't know about
	// `gogoproto.casttype`.
	if req := <-channels[toStoreID].ch; !reflect.DeepEqual(req, expReq) {
		t.Errorf("got unexpected message %v on channel %d", req, toStoreID)
	}

	select {
	case req := <-channels[toStoreID].ch:
		t.Errorf("got unexpected message %v on channel %d", req, toStoreID)
	default:
	}
}

// TestInOrderDelivery verifies that for a given pair of nodes, raft
// messages are delivered in order.
func TestInOrderDelivery(t *testing.T) {
	defer leaktest.AfterTest(t)()
	rttc := newRaftTransportTestContext(t)
	defer rttc.Stop()

	const numMessages = 100
	serverReplica := roachpb.ReplicaDescriptor{
		NodeID:    2,
		StoreID:   2,
		ReplicaID: 2,
	}
	rttc.AddNode(serverReplica.NodeID)
	serverChannel := rttc.ListenStore(serverReplica.NodeID, serverReplica.StoreID)

	clientReplica := roachpb.ReplicaDescriptor{
		NodeID:    1,
		StoreID:   1,
		ReplicaID: 1,
	}
	rttc.AddNode(clientReplica.NodeID)

	for i := 0; i < numMessages; i++ {
		if !rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: uint64(i)}) {
			t.Errorf("failed to send message %d", i)
		}
	}

	for i := 0; i < numMessages; i++ {
		req := <-serverChannel.ch
		if req.Message.Commit != uint64(i) {
			t.Errorf("messages out of order: got %d while expecting %d", req.Message.Commit, i)
		}
	}
}

// TestRaftTransportCircuitBreaker verifies that messages are dropped
// while the connection to the raft node is being established.
func TestRaftTransportCircuitBreaker(t *testing.T) {
	defer leaktest.AfterTest(t)()
	rttc := newRaftTransportTestContext(t)
	defer rttc.Stop()

	serverReplica := roachpb.ReplicaDescriptor{
		NodeID:    2,
		StoreID:   2,
		ReplicaID: 2,
	}
	_, serverAddr := rttc.AddNodeWithoutGossip(serverReplica.NodeID, util.TestAddr, rttc.stopper)
	serverChannel := rttc.ListenStore(serverReplica.NodeID, serverReplica.StoreID)

	clientReplica := roachpb.ReplicaDescriptor{
		NodeID:    1,
		StoreID:   1,
		ReplicaID: 1,
	}
	clientTransport := rttc.AddNode(clientReplica.NodeID)

	// Sending repeated messages should begin to fail once the circuit
	// breaker trips.
	testutils.SucceedsSoon(t, func() error {
		if rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: 1}) {
			return errors.Errorf("expected circuit breaker to trip")
		}
		return nil
	})

	// Now, gossip the address of the server.
	rttc.GossipNode(serverReplica.NodeID, serverAddr)

	// Keep sending commit=2 until the breaker resets and we receive the
	// first instance. It's possible that an earlier message for commit=1
	// snuck in.
	testutils.SucceedsSoon(t, func() error {
		if !rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: 2}) {
			clientTransport.GetCircuitBreaker(serverReplica.NodeID, rpc.DefaultClass).Reset()
		}
		select {
		case req := <-serverChannel.ch:
			if req.Message.Commit == 2 {
				return nil
			}
		default:
		}
		return errors.Errorf("expected message commit=2")
	})
}

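// A note on the retry loop in TestRaftTransportCircuitBreaker above: when
// SendAsync fails, the test manually resets the breaker so that the next
// attempt dials the (now gossiped) address immediately instead of waiting
// out the breaker's own backoff; presumably GetCircuitBreaker returns the
// breaker for the given node and connection class.
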
// TestRaftTransportIndependentRanges ensures that errors from one
// range do not interfere with messages to another range on the same
// store.
func TestRaftTransportIndependentRanges(t *testing.T) {
	defer leaktest.AfterTest(t)()
	rttc := newRaftTransportTestContext(t)
	defer rttc.Stop()

	server := roachpb.ReplicaDescriptor{
		NodeID:    1,
		StoreID:   1,
		ReplicaID: 1,
	}
	serverTransport := rttc.AddNode(server.NodeID)
	client := roachpb.ReplicaDescriptor{
		NodeID:    2,
		StoreID:   2,
		ReplicaID: 2,
	}
	rttc.AddNode(client.NodeID)

	const numMessages = 50
	channelServer := newChannelServer(numMessages*2, 10*time.Millisecond)
	channelServer.brokenRange = 13
	serverTransport.Listen(server.StoreID, channelServer)

	for i := 0; i < numMessages; i++ {
		for _, rangeID := range []roachpb.RangeID{1, 13} {
			if !rttc.Send(client, server, rangeID, raftpb.Message{Commit: uint64(i)}) {
				t.Errorf("failed to send message %d to range %s", i, rangeID)
			}
		}
	}
	for i := 0; i < numMessages; i++ {
		select {
		case msg := <-channelServer.ch:
			if msg.Message.Commit != uint64(i) {
				t.Errorf("got message %d while expecting %d", msg.Message.Commit, i)
			}
		case <-time.After(time.Second):
			t.Fatalf("timeout waiting for message %d", i)
		}
	}
}

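// Note on the verification in TestRaftTransportIndependentRanges above: only
// range 1's messages ever reach channelServer.ch, because requests for the
// broken range 13 fail in HandleRaftRequest before being enqueued. Observing
// commits 0 through 49 in order on the channel therefore shows that range
// 13's errors did not disturb range 1's stream.
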
// TestReopenConnection verifies that if a raft response indicates that the
// expected store isn't present on the node, the connection gets terminated
// and reopened before retrying, to ensure that the transport doesn't get
// stuck in an endless retry loop against the wrong node.
func TestReopenConnection(t *testing.T) {
	defer leaktest.AfterTest(t)()
	rttc := newRaftTransportTestContext(t)
	defer rttc.Stop()

	// Use a special stopper for the initial server so that we can fully stop it
	// (releasing its bound network address) before the rest of the test pieces.
	serverStopper := stop.NewStopper()
	serverReplica := roachpb.ReplicaDescriptor{
		NodeID:    2,
		StoreID:   2,
		ReplicaID: 2,
	}
	serverTransport, serverAddr :=
		rttc.AddNodeWithoutGossip(serverReplica.NodeID, util.TestAddr, serverStopper)
	rttc.GossipNode(serverReplica.NodeID, serverAddr)
	rttc.ListenStore(serverReplica.NodeID, serverReplica.StoreID)

	clientReplica := roachpb.ReplicaDescriptor{
		NodeID:    1,
		StoreID:   1,
		ReplicaID: 1,
	}
	rttc.AddNode(clientReplica.NodeID)
	rttc.ListenStore(clientReplica.NodeID, clientReplica.StoreID)

	// Take down the old server and start a new one at the same address.
	serverTransport.Stop(serverReplica.StoreID)
	serverStopper.Stop(context.Background())

	// With the old server down, nothing is listening on the address right now,
	// so the circuit breaker should trip.
	testutils.SucceedsSoon(t, func() error {
		if rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: 1}) {
			return errors.New("expected circuit breaker to trip")
		}
		return nil
	})

	replacementReplica := roachpb.ReplicaDescriptor{
		NodeID:    3,
		StoreID:   3,
		ReplicaID: 3,
	}

	rttc.AddNodeWithoutGossip(replacementReplica.NodeID, serverAddr, rttc.stopper)
	replacementChannel := rttc.ListenStore(replacementReplica.NodeID, replacementReplica.StoreID)

	// Try sending a message to the old server's store (at the address its
	// replacement is now running at) before its replacement has been gossiped.
	// We just want to ensure that doing so doesn't deadlock the client transport.
	if rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: 1}) {
		t.Fatal("unexpectedly managed to send to recently downed node")
	}

	// Then, to ensure the client hasn't been deadlocked, add the replacement node
	// to the gossip network and send it a request. Note that this will remove the
	// gossip record for serverReplica.NodeID (n2) since they share the same address.
	// This explains why we can't really assert whether n2 becomes unreachable or
	// not. If a healthy connection makes it into the rpc context before gossip
	// makes the node unresolvable, it's possible. In the other case, it's not.
	rttc.GossipNode(replacementReplica.NodeID, serverAddr)

	testutils.SucceedsSoon(t, func() error {
		// Sending messages to the old store does not deadlock. See the comment above
		// to understand why we don't check the returned value.
		rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: 1})
		// It won't be long until we can send to the new replica. The only reason
		// this might fail is that the failed connection is still in the RPC
		// connection pool and we have to wait out a health check interval.
		if !rttc.Send(clientReplica, replacementReplica, 1, raftpb.Message{Commit: 1}) {
			return errors.New("unable to send to replacement replica")
		}
		return nil
	})

	// Send commit=2 to the replacement replica. This should work now because we've
	// just used it successfully above and didn't change anything about the networking.
	if !rttc.Send(clientReplica, replacementReplica, 1, raftpb.Message{Commit: 2}) {
		t.Fatal("replacement node still unhealthy")
	}
	testutils.SucceedsSoon(t, func() error {
		select {
		case req := <-replacementChannel.ch:
			// There could be a few stray messages with `c==1` in the channel,
			// so throw those away.
			if c := req.Message.Commit; c == 2 {
				return nil
			}
		default:
		}
		return errors.New("still waiting")
	})
}

// TestSendFailureToConnectDoesNotHangRaft ensures that a node dialer blocking
// while attempting to dial a remote node does not block calls to SendAsync.
func TestSendFailureToConnectDoesNotHangRaft(t *testing.T) {
	defer leaktest.AfterTest(t)()
	rttc := newRaftTransportTestContext(t)
	defer rttc.Stop()

	// Create a single server from which we're going to call send.
	// We'll then set up a bogus target server which will not be serving gRPC
	// and will block during connection setup (leading to blocking in the Dial
	// call). The test ensures that the Send call does not block.
	const rangeID, from, to = 1, 1, 2
	transport := rttc.AddNode(from)
	// Set up a plain old TCP listener that's not going to accept any
	// connections, which will lead to blocking during dial.
	ln, err := net.Listen("tcp", util.TestAddr.String())
	require.NoError(t, err)
	defer func() { _ = ln.Close() }()
	rttc.GossipNode(to, ln.Addr())
	// Try to send a message and make sure we don't block waiting to set up
	// the connection.
	transport.SendAsync(&kvserver.RaftMessageRequest{
		RangeID: rangeID,
		ToReplica: roachpb.ReplicaDescriptor{
			StoreID:   to,
			NodeID:    to,
			ReplicaID: to,
		},
		FromReplica: roachpb.ReplicaDescriptor{
			StoreID:   from,
			NodeID:    from,
			ReplicaID: from,
		},
		Message: raftpb.Message{To: to, From: from},
	}, rpc.DefaultClass)
}