github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/gossip_test.go

// Copyright 2015 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver_test

import (
	"context"
	"fmt"
	"reflect"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/retry"
	"github.com/stretchr/testify/require"
)

func TestGossipFirstRange(t *testing.T) {
	defer leaktest.AfterTest(t)()

	tc := testcluster.StartTestCluster(t, 3,
		base.TestClusterArgs{
			ReplicationMode: base.ReplicationManual,
		})
	defer tc.Stopper().Stop(context.Background())

	errors := make(chan error, 1)
	descs := make(chan *roachpb.RangeDescriptor)
	unregister := tc.Servers[0].Gossip().RegisterCallback(gossip.KeyFirstRangeDescriptor,
		func(_ string, content roachpb.Value) {
			var desc roachpb.RangeDescriptor
			if err := content.GetProto(&desc); err != nil {
				select {
				case errors <- err:
				default:
				}
			} else {
				select {
				case descs <- &desc:
				case <-time.After(45 * time.Second):
					t.Logf("had to drop descriptor %+v", desc)
				}
			}
		},
		// Redundant callbacks are required by this test.
		gossip.Redundant,
	)
	// Unregister the callback before attempting to stop the stopper to prevent
	// deadlock. This is still flaky in theory since a callback can fire between
	// the last read from the channels and this unregister, but testing has
	// shown this solution to be sufficiently robust for now.
	defer unregister()

	// Wait for the specified descriptor to be gossiped for the first range. We
	// loop because the timing of replica addition and lease transfer can cause
	// extra gossiping of the first range.
	waitForGossip := func(desc roachpb.RangeDescriptor) {
		for {
			select {
			case err := <-errors:
				t.Fatal(err)
			case gossiped := <-descs:
				if reflect.DeepEqual(&desc, gossiped) {
					return
				}
				log.Infof(context.Background(), "expected\n%+v\nbut found\n%+v", desc, gossiped)
			}
		}
	}

	// Expect an initial callback of the first range descriptor.
	select {
	case err := <-errors:
		t.Fatal(err)
	case <-descs:
	}

	// Add two replicas. The first range descriptor should be gossiped after each
	// addition.
	var desc roachpb.RangeDescriptor
	firstRangeKey := keys.MinKey
	for i := 1; i <= 2; i++ {
		var err error
		if desc, err = tc.AddReplicas(firstRangeKey, tc.Target(i)); err != nil {
			t.Fatal(err)
		}
		waitForGossip(desc)
	}

	// Transfer the lease to a new node. This should cause the first range to be
	// gossiped again.
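	// (The lease holder is responsible for gossiping the first range
	// descriptor, so moving the lease to another replica triggers a fresh
	// round of gossip.)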
	if err := tc.TransferRangeLease(desc, tc.Target(1)); err != nil {
		t.Fatal(err)
	}
	waitForGossip(desc)

	// Remove a non-lease holder replica.
	desc, err := tc.RemoveReplicas(firstRangeKey, tc.Target(0))
	if err != nil {
		t.Fatal(err)
	}
	waitForGossip(desc)

	// TODO(peter): Re-enable or remove when we've resolved the discussion
	// about removing the lease-holder replica. See #7872.

	// // Remove the lease holder replica.
	// leaseHolder, err := tc.FindRangeLeaseHolder(desc, nil)
	// desc, err = tc.RemoveReplicas(firstRangeKey, leaseHolder)
	// if err != nil {
	// 	t.Fatal(err)
	// }
	// select {
	// case err := <-errors:
	// 	t.Fatal(err)
	// case gossiped := <-descs:
	// 	if !reflect.DeepEqual(desc, gossiped) {
	// 		t.Fatalf("expected\n%+v\nbut found\n%+v", desc, gossiped)
	// 	}
	// }
}

// TestGossipHandlesReplacedNode tests that we can shut down a node and
// replace it with a new node at the same address (simulating a node getting
// restarted after losing its data) without the cluster breaking.
func TestGossipHandlesReplacedNode(t *testing.T) {
	defer leaktest.AfterTest(t)()
	if testing.Short() {
		// As of Nov 2018 it takes 3.6s.
		t.Skip("short")
	}
	ctx := context.Background()

	// Shorten the raft tick interval and election timeout to make range leases
	// much shorter than normal. This keeps us from having to wait so long for
	// the replaced node's leases to time out, but has still shown itself to be
	// long enough to avoid flakes.
	serverArgs := base.TestServerArgs{
		Addr:     util.IsolatedTestAddr.String(),
		Insecure: true, // because our certs are only valid for 127.0.0.1
		RetryOptions: retry.Options{
			InitialBackoff: 10 * time.Millisecond,
			MaxBackoff:     50 * time.Millisecond,
		},
	}
	serverArgs.RaftTickInterval = 50 * time.Millisecond
	serverArgs.RaftElectionTimeoutTicks = 10

	tc := testcluster.StartTestCluster(t, 3,
		base.TestClusterArgs{
			ServerArgs: serverArgs,
		})
	defer tc.Stopper().Stop(context.Background())

	// Take down the first node and replace it with a new one.
	oldNodeIdx := 0
	newServerArgs := serverArgs
	newServerArgs.Addr = tc.Servers[oldNodeIdx].ServingRPCAddr()
	newServerArgs.SQLAddr = tc.Servers[oldNodeIdx].ServingSQLAddr()
	newServerArgs.PartOfCluster = true
	newServerArgs.JoinAddr = tc.Servers[1].ServingRPCAddr()
	log.Infof(ctx, "stopping server %d", oldNodeIdx)
	tc.StopServer(oldNodeIdx)
	tc.AddServer(t, newServerArgs)

	tc.WaitForStores(t, tc.Server(1).GossipI().(*gossip.Gossip))

	// Ensure that all servers still running are responsive. If the two remaining
	// original nodes don't refresh their connection to the address of the first
	// node, they can get stuck here.
	for i, server := range tc.Servers {
		if i == oldNodeIdx {
			continue
		}
		kvClient := server.DB()
		if err := kvClient.Put(ctx, fmt.Sprintf("%d", i), i); err != nil {
			t.Errorf("failed Put to node %d: %+v", i, err)
		}
	}
}

// TestGossipAfterAbortOfSystemConfigTransactionAfterFailureDueToIntents tests
// that failures to gossip the system config due to intents are rectified when
// later intents are aborted.
func TestGossipAfterAbortOfSystemConfigTransactionAfterFailureDueToIntents(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ctx := context.Background()

	tc := testcluster.StartTestCluster(t, 1, base.TestClusterArgs{})
	defer tc.Stopper().Stop(ctx)
	require.NoError(t, tc.WaitForFullReplication())

	db := tc.Server(0).DB()

	txA := db.NewTxn(ctx, "a")
	txB := db.NewTxn(ctx, "b")

	require.NoError(t, txA.SetSystemConfigTrigger())
	require.NoError(t, txA.Put(
		ctx, keys.SystemSQLCodec.DescMetadataKey(1000), sqlbase.WrapDescriptor(&sqlbase.DatabaseDescriptor{})))

	require.NoError(t, txB.SetSystemConfigTrigger())
	require.NoError(t, txB.Put(
		ctx, keys.SystemSQLCodec.DescMetadataKey(2000), sqlbase.WrapDescriptor(&sqlbase.DatabaseDescriptor{})))

	const someTime = 10 * time.Millisecond
	clearNotifications := func(ch <-chan struct{}) {
		for {
			select {
			case <-ch:
			case <-time.After(someTime):
				return
			}
		}
	}
	systemConfChangeCh := tc.Server(0).GossipI().(*gossip.Gossip).RegisterSystemConfigChannel()
	clearNotifications(systemConfChangeCh)
	require.NoError(t, txB.Commit(ctx))
	select {
	case <-systemConfChangeCh:
		// This case is rare but happens sometimes: node liveness is gossiped
		// in a number of cases, so we just let the test finish here. The
		// important thing is that sometimes we get to the next phase.
		t.Log("got unexpected update. This can happen for a variety of " +
			"reasons like lease transfers. The test is exiting without testing anything")
		return
	case <-time.After(someTime):
		// Did not expect an update, so this is the happy case.
	}
	// Roll back the transaction that laid down the intent blocking the earlier
	// gossip update, and make sure we get a gossip notification now.
	const aLongTime = 20 * someTime
	require.NoError(t, txA.Rollback(ctx))
	select {
	case <-systemConfChangeCh:
		// Got an update.
	case <-time.After(aLongTime):
		t.Fatal("expected update")
	}
}
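
// NB: the helper below is an illustrative sketch and was not part of the
// original test file. It factors out the drain-then-wait pattern used by
// TestGossipAfterAbortOfSystemConfigTransactionAfterFailureDueToIntents:
// first drain any notifications already buffered on the channel, then wait
// up to a deadline for a fresh one. The name expectNotification and its
// signature are hypothetical.
func expectNotification(ch <-chan struct{}, quiet, deadline time.Duration) bool {
	// Drain buffered notifications so that a stale update is not mistaken
	// for the one the caller is about to trigger. The channel is considered
	// drained once it has been quiet for the `quiet` duration.
	drained := false
	for !drained {
		select {
		case <-ch:
		case <-time.After(quiet):
			drained = true
		}
	}
	// Wait for a fresh notification, or give up after `deadline`.
	select {
	case <-ch:
		return true
	case <-time.After(deadline):
		return false
	}
}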