github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/client_lease_test.go

// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver_test

import (
	"context"
	"fmt"
	"runtime"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/config"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/errors"
	"github.com/stretchr/testify/require"
)

// TestStoreRangeLease verifies that regular ranges (not some special ones at
// the start of the key space) get epoch-based range leases if enabled and
// expiration-based otherwise.
func TestStoreRangeLease(t *testing.T) {
	defer leaktest.AfterTest(t)()

	testutils.RunTrueAndFalse(t, "enableEpoch", func(t *testing.T, enableEpoch bool) {
		sc := kvserver.TestStoreConfig(nil)
		sc.TestingKnobs.DisableMergeQueue = true
		sc.EnableEpochRangeLeases = enableEpoch
		mtc := &multiTestContext{storeConfig: &sc}
		defer mtc.Stop()
		mtc.Start(t, 1)

		// NodeLivenessKeyMax is a static split point, so this is always
		// the start key of the first range that uses epoch-based
		// leases. Splitting on it here is redundant, but we want to include
		// it in our tests of lease types below.
		splitKeys := []roachpb.Key{
			keys.NodeLivenessKeyMax, roachpb.Key("a"), roachpb.Key("b"), roachpb.Key("c"),
		}
		for _, splitKey := range splitKeys {
			splitArgs := adminSplitArgs(splitKey)
			if _, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[0], splitArgs); pErr != nil {
				t.Fatal(pErr)
			}
		}

		rLeft := mtc.stores[0].LookupReplica(roachpb.RKeyMin)
		lease, _ := rLeft.GetLease()
		if lt := lease.Type(); lt != roachpb.LeaseExpiration {
			t.Fatalf("expected lease type expiration; got %d", lt)
		}

		// After the expiration, expect an epoch lease for all the ranges if
		// we've enabled epoch-based range leases.
		for _, key := range splitKeys {
			repl := mtc.stores[0].LookupReplica(roachpb.RKey(key))
			lease, _ = repl.GetLease()
			if enableEpoch {
				if lt := lease.Type(); lt != roachpb.LeaseEpoch {
					t.Fatalf("expected lease type epoch; got %d", lt)
				}
			} else {
				if lt := lease.Type(); lt != roachpb.LeaseExpiration {
					t.Fatalf("expected lease type expiration; got %d", lt)
				}
			}
		}
	})
}
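
// requireLeaseType is a minimal helper sketch, not part of the upstream
// file, that consolidates the repeated lease-type assertions in the tests
// here: look up the replica's current lease and fail the test if its type
// does not match the expectation.
func requireLeaseType(t *testing.T, repl *kvserver.Replica, expected roachpb.LeaseType) {
	t.Helper()
	lease, _ := repl.GetLease()
	if lt := lease.Type(); lt != expected {
		t.Fatalf("expected lease type %d; got %d", expected, lt)
	}
}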

// TestStoreRangeLeaseSwitcheroo verifies that ranges can be switched
// between expiration-based and epoch-based leases and back.
func TestStoreRangeLeaseSwitcheroo(t *testing.T) {
	defer leaktest.AfterTest(t)()
	sc := kvserver.TestStoreConfig(nil)
	sc.TestingKnobs.DisableMergeQueue = true
	sc.EnableEpochRangeLeases = true
	sc.Clock = nil // manual clock
	mtc := &multiTestContext{storeConfig: &sc}
	defer mtc.Stop()
	mtc.Start(t, 1)

	splitKey := roachpb.Key("a")
	splitArgs := adminSplitArgs(splitKey)
	if _, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[0], splitArgs); pErr != nil {
		t.Fatal(pErr)
	}

	// Allow leases to expire and send commands to ensure we
	// re-acquire, then check types again.
	mtc.advanceClock(context.Background())
	if _, err := mtc.dbs[0].Inc(context.Background(), splitKey, 1); err != nil {
		t.Fatalf("failed to increment: %+v", err)
	}

	// We started with epoch ranges enabled, so verify we have an epoch lease.
	repl := mtc.stores[0].LookupReplica(roachpb.RKey(splitKey))
	lease, _ := repl.GetLease()
	if lt := lease.Type(); lt != roachpb.LeaseEpoch {
		t.Fatalf("expected lease type epoch; got %d", lt)
	}

	// Stop the store and reverse the epoch range lease setting.
	mtc.stopStore(0)
	sc.EnableEpochRangeLeases = false
	mtc.restartStore(0)

	mtc.advanceClock(context.Background())
	if _, err := mtc.dbs[0].Inc(context.Background(), splitKey, 1); err != nil {
		t.Fatalf("failed to increment: %+v", err)
	}

	// Verify we end up with an expiration lease on restart.
	repl = mtc.stores[0].LookupReplica(roachpb.RKey(splitKey))
	lease, _ = repl.GetLease()
	if lt := lease.Type(); lt != roachpb.LeaseExpiration {
		t.Fatalf("expected lease type expiration; got %d", lt)
	}

	// Now, one more time, switch back to epoch-based.
	mtc.stopStore(0)
	sc.EnableEpochRangeLeases = true
	mtc.restartStore(0)

	mtc.advanceClock(context.Background())
	if _, err := mtc.dbs[0].Inc(context.Background(), splitKey, 1); err != nil {
		t.Fatalf("failed to increment: %+v", err)
	}

	// Verify we end up with an epoch lease on restart.
	repl = mtc.stores[0].LookupReplica(roachpb.RKey(splitKey))
	lease, _ = repl.GetLease()
	if lt := lease.Type(); lt != roachpb.LeaseEpoch {
		t.Fatalf("expected lease type epoch; got %d", lt)
	}
}
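
// For context on the two lease types exercised above (a hedged summary, not
// text from the upstream file): an expiration-based lease carries an explicit
// expiration timestamp and must be periodically extended by the leaseholder,
// while an epoch-based lease is tied to the node's liveness epoch and stays
// valid for as long as the node keeps heartbeating its liveness record. With
// the requireLeaseType sketch above, each assertion in the two tests above
// collapses to a single call, e.g. requireLeaseType(t, repl, roachpb.LeaseEpoch).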

// TestStoreGossipSystemData verifies that the system-config and node-liveness
// data is gossiped at startup.
func TestStoreGossipSystemData(t *testing.T) {
	defer leaktest.AfterTest(t)()
	sc := kvserver.TestStoreConfig(nil)
	sc.TestingKnobs.DisableMergeQueue = true
	sc.EnableEpochRangeLeases = true
	mtc := &multiTestContext{storeConfig: &sc}
	defer mtc.Stop()
	mtc.Start(t, 1)

	splitKey := keys.SystemConfigSplitKey
	splitArgs := adminSplitArgs(splitKey)
	if _, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[0], splitArgs); pErr != nil {
		t.Fatal(pErr)
	}
	if _, err := mtc.dbs[0].Inc(context.Background(), splitKey, 1); err != nil {
		t.Fatalf("failed to increment: %+v", err)
	}

	mtc.stopStore(0)

	getSystemConfig := func() *config.SystemConfig {
		systemConfig := mtc.gossips[0].GetSystemConfig()
		return systemConfig
	}
	getNodeLiveness := func() kvserverpb.Liveness {
		var liveness kvserverpb.Liveness
		if err := mtc.gossips[0].GetInfoProto(gossip.MakeNodeLivenessKey(1), &liveness); err == nil {
			return liveness
		}
		return kvserverpb.Liveness{}
	}

	// Clear the system-config and node-liveness gossip data. This is necessary
	// because multiTestContext.restartStore reuses the Gossip structure.
	if err := mtc.gossips[0].AddInfoProto(
		gossip.KeySystemConfig, &config.SystemConfigEntries{}, 0); err != nil {
		t.Fatal(err)
	}
	if err := mtc.gossips[0].AddInfoProto(
		gossip.MakeNodeLivenessKey(1), &kvserverpb.Liveness{}, 0); err != nil {
		t.Fatal(err)
	}
	testutils.SucceedsSoon(t, func() error {
		if !getSystemConfig().DefaultZoneConfig.Equal(sc.DefaultZoneConfig) {
			return errors.New("system config not empty")
		}
		if getNodeLiveness() != (kvserverpb.Liveness{}) {
			return errors.New("node liveness not empty")
		}
		return nil
	})

	// Restart the store and verify that both the system-config and node-liveness
	// data is gossiped.
	mtc.restartStore(0)
	testutils.SucceedsSoon(t, func() error {
		if !getSystemConfig().DefaultZoneConfig.Equal(sc.DefaultZoneConfig) {
			return errors.New("system config not gossiped")
		}
		if getNodeLiveness() == (kvserverpb.Liveness{}) {
			return errors.New("node liveness not gossiped")
		}
		return nil
	})
}
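
// waitForGossipOrigin is a small helper sketch for the pattern used in the
// lease-change tests below; the helper and its name are ours, not part of
// the upstream file. It blocks until the given gossip key was most recently
// gossiped by this node. The positive half of the SucceedsSoon assertions
// below could then be written as, e.g.,
// waitForGossipOrigin(t, mtc.stores[newStoreIdx].Gossip(), nodeLivenessKey).
func waitForGossipOrigin(t *testing.T, g *gossip.Gossip, key string) {
	t.Helper()
	testutils.SucceedsSoon(t, func() error {
		if !g.InfoOriginatedHere(key) {
			return fmt.Errorf("%s not yet gossiped from this node", key)
		}
		return nil
	})
}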

// TestGossipSystemConfigOnLeaseChange verifies that the system-config gets
// re-gossiped on lease transfer even if it hasn't changed. This helps prevent
// situations where a previous leaseholder can restart and not receive the
// system config because it was the original source of it within the gossip
// network.
func TestGossipSystemConfigOnLeaseChange(t *testing.T) {
	defer leaktest.AfterTest(t)()
	sc := kvserver.TestStoreConfig(nil)
	sc.TestingKnobs.DisableReplicateQueue = true
	mtc := &multiTestContext{storeConfig: &sc}
	defer mtc.Stop()
	const numStores = 3
	mtc.Start(t, numStores)

	rangeID := mtc.stores[0].LookupReplica(roachpb.RKey(keys.SystemConfigSpan.Key)).RangeID
	mtc.replicateRange(rangeID, 1, 2)

	initialStoreIdx := -1
	for i := range mtc.stores {
		if mtc.stores[i].Gossip().InfoOriginatedHere(gossip.KeySystemConfig) {
			initialStoreIdx = i
		}
	}
	if initialStoreIdx == -1 {
		t.Fatalf("no store has gossiped system config; gossip contents: %+v", mtc.stores[0].Gossip().GetInfoStatus())
	}

	newStoreIdx := (initialStoreIdx + 1) % numStores
	mtc.transferLease(context.Background(), rangeID, initialStoreIdx, newStoreIdx)

	testutils.SucceedsSoon(t, func() error {
		if mtc.stores[initialStoreIdx].Gossip().InfoOriginatedHere(gossip.KeySystemConfig) {
			return errors.New("system config still most recently gossiped by original leaseholder")
		}
		if !mtc.stores[newStoreIdx].Gossip().InfoOriginatedHere(gossip.KeySystemConfig) {
			return errors.New("system config not most recently gossiped by new leaseholder")
		}
		return nil
	})
}

// TestGossipNodeLivenessOnLeaseChange verifies that a node's liveness record
// is re-gossiped by the new leaseholder when the node-liveness range's lease
// changes hands, even when no liveness update has occurred.
func TestGossipNodeLivenessOnLeaseChange(t *testing.T) {
	defer leaktest.AfterTest(t)()
	sc := kvserver.TestStoreConfig(nil)
	sc.TestingKnobs.DisableReplicateQueue = true
	mtc := &multiTestContext{storeConfig: &sc}
	defer mtc.Stop()
	const numStores = 3
	mtc.Start(t, numStores)

	rangeID := mtc.stores[0].LookupReplica(roachpb.RKey(keys.NodeLivenessSpan.Key)).RangeID
	mtc.replicateRange(rangeID, 1, 2)

	// Turn off liveness heartbeats on all nodes to ensure that updates to node
	// liveness do not trigger gossiping.
	for i := range mtc.nodeLivenesses {
		mtc.nodeLivenesses[i].PauseHeartbeat(true)
	}

	nodeLivenessKey := gossip.MakeNodeLivenessKey(1)

	initialStoreIdx := -1
	for i := range mtc.stores {
		if mtc.stores[i].Gossip().InfoOriginatedHere(nodeLivenessKey) {
			initialStoreIdx = i
		}
	}
	if initialStoreIdx == -1 {
		t.Fatalf("no store has gossiped %s; gossip contents: %+v",
			nodeLivenessKey, mtc.stores[0].Gossip().GetInfoStatus())
	}
	log.Infof(context.Background(), "%s gossiped from n%d",
		nodeLivenessKey, mtc.stores[initialStoreIdx].Ident.NodeID)

	newStoreIdx := (initialStoreIdx + 1) % numStores
	mtc.transferLease(context.Background(), rangeID, initialStoreIdx, newStoreIdx)

	testutils.SucceedsSoon(t, func() error {
		if mtc.stores[initialStoreIdx].Gossip().InfoOriginatedHere(nodeLivenessKey) {
			return fmt.Errorf("%s still most recently gossiped by original leaseholder", nodeLivenessKey)
		}
		if !mtc.stores[newStoreIdx].Gossip().InfoOriginatedHere(nodeLivenessKey) {
			return fmt.Errorf("%s not most recently gossiped by new leaseholder", nodeLivenessKey)
		}
		return nil
	})
}
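
// makeBlockingProposalFilter sketches, in isolation, the blocking-filter
// pattern the next test wires into its testing knobs; the function and its
// name are ours, not part of the upstream file. The returned filter parks
// every proposal that matches shouldBlock on a channel until the test
// releases it by closing the channel it received.
func makeBlockingProposalFilter(
	shouldBlock func(kvserverbase.ProposalFilterArgs) bool,
) (func(kvserverbase.ProposalFilterArgs) *roachpb.Error, chan chan struct{}) {
	blockedCh := make(chan chan struct{}, 1)
	filter := func(args kvserverbase.ProposalFilterArgs) *roachpb.Error {
		if shouldBlock(args) {
			ch := make(chan struct{})
			blockedCh <- ch // hand the test a handle to this blocked proposal
			<-ch            // wait here until the test closes ch
		}
		return nil
	}
	return filter, blockedCh
}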

// TestCannotTransferLeaseToVoterOutgoing ensures that the evaluation of lease
// requests for nodes which are already in the VOTER_OUTGOING state will fail.
func TestCannotTransferLeaseToVoterOutgoing(t *testing.T) {
	defer leaktest.AfterTest(t)()
	ctx := context.Background()

	knobs, ltk := makeReplicationTestKnobs()
	// Add a testing knob to allow us to block the change replicas command
	// while it is being proposed. When we detect that the change replicas
	// command to move n3 to VOTER_OUTGOING has been evaluated, we'll send
	// the request to transfer the lease to n3. The hope is that it will
	// get past the sanity check above latch acquisition before the change
	// replicas command commits.
	var scratchRangeID atomic.Value
	scratchRangeID.Store(roachpb.RangeID(0))
	changeReplicasChan := make(chan chan struct{}, 1)
	shouldBlock := func(args kvserverbase.ProposalFilterArgs) bool {
		// Block if a ChangeReplicas command is removing a node from our range.
		return args.Req.RangeID == scratchRangeID.Load().(roachpb.RangeID) &&
			args.Cmd.ReplicatedEvalResult.ChangeReplicas != nil &&
			len(args.Cmd.ReplicatedEvalResult.ChangeReplicas.Removed()) > 0
	}
	blockIfShould := func(args kvserverbase.ProposalFilterArgs) {
		if shouldBlock(args) {
			ch := make(chan struct{})
			changeReplicasChan <- ch
			<-ch
		}
	}
	knobs.Store.(*kvserver.StoreTestingKnobs).TestingProposalFilter = func(args kvserverbase.ProposalFilterArgs) *roachpb.Error {
		blockIfShould(args)
		return nil
	}
	tc := testcluster.StartTestCluster(t, 4, base.TestClusterArgs{
		ServerArgs:      base.TestServerArgs{Knobs: knobs},
		ReplicationMode: base.ReplicationManual,
	})
	defer tc.Stopper().Stop(ctx)

	scratchStartKey := tc.ScratchRange(t)
	desc := tc.AddReplicasOrFatal(t, scratchStartKey, tc.Targets(1, 2)...)
	scratchRangeID.Store(desc.RangeID)
	// Make sure n1 has the lease to start with.
	err := tc.Server(0).DB().AdminTransferLease(context.Background(),
		scratchStartKey, tc.Target(0).StoreID)
	require.NoError(t, err)

	// The test proceeds as follows:
	//
	//  - Send an AdminChangeReplicasRequest to remove n3 and add n4.
	//  - Block the step that moves n3 to VOTER_OUTGOING on changeReplicasChan.
	//  - Send an AdminLeaseTransfer to make n3 the leaseholder.
	//  - Try really hard to make sure that the lease transfer at least gets to
	//    latch acquisition before unblocking the ChangeReplicas.
	//  - Unblock the ChangeReplicas.
	//  - Make sure the lease transfer fails.

	ltk.withStopAfterJointConfig(func() {
		var wg sync.WaitGroup
		wg.Add(1)
		go func() {
			defer wg.Done()
			_, err = tc.Server(0).DB().AdminChangeReplicas(ctx,
				scratchStartKey, desc, []roachpb.ReplicationChange{
					{ChangeType: roachpb.REMOVE_REPLICA, Target: tc.Target(2)},
					{ChangeType: roachpb.ADD_REPLICA, Target: tc.Target(3)},
				})
			require.NoError(t, err)
		}()
		ch := <-changeReplicasChan
		wg.Add(1)
		go func() {
			defer wg.Done()
			err := tc.Server(0).DB().AdminTransferLease(context.Background(),
				scratchStartKey, tc.Target(2).StoreID)
			require.Error(t, err)
			require.Regexp(t,
				// The error generated during evaluation.
				"replica.*of type VOTER_DEMOTING cannot hold lease|"+
					// If the lease transfer request has not yet made it to the latching
					// phase by the time we close(ch) below, we can receive the following
					// error due to the sanity checking which happens in
					// AdminTransferLease before attempting to evaluate the lease
					// transfer.
					// We have a sleep loop below to try to encourage the lease transfer
					// to make it past that sanity check prior to letting the change
					// of replicas proceed.
401 "cannot transfer lease to replica of type VOTER_DEMOTING", err.Error()) 402 }() 403 // Try really hard to make sure that our request makes it past the 404 // sanity check error to the evaluation error. 405 for i := 0; i < 100; i++ { 406 runtime.Gosched() 407 time.Sleep(time.Microsecond) 408 } 409 close(ch) 410 wg.Wait() 411 }) 412 413 } 414 415 // Test the error returned by attempts to create a txn record after a lease 416 // transfer. 417 func TestTimestampCacheErrorAfterLeaseTransfer(t *testing.T) { 418 defer leaktest.AfterTest(t)() 419 ctx := context.Background() 420 tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{}) 421 defer tc.Stopper().Stop(ctx) 422 423 key := []byte("a") 424 rangeDesc, err := tc.LookupRange(key) 425 require.NoError(t, err) 426 427 // Transfer the lease to Servers[0] so we start in a known state. Otherwise, 428 // there might be already a lease owned by a random node. 429 require.NoError(t, tc.TransferRangeLease(rangeDesc, tc.Target(0))) 430 431 // Start a txn and perform a write, so that a txn record has to be created by 432 // the EndTxn. 433 txn := tc.Servers[0].DB().NewTxn(ctx, "test") 434 require.NoError(t, txn.Put(ctx, "a", "val")) 435 // After starting the transaction, transfer the lease. This will wipe the 436 // timestamp cache, which means that the txn record will not be able to be 437 // created (because someone might have already aborted the txn). 438 require.NoError(t, tc.TransferRangeLease(rangeDesc, tc.Target(1))) 439 440 err = txn.Commit(ctx) 441 require.Regexp(t, `TransactionAbortedError\(ABORT_REASON_NEW_LEASE_PREVENTS_TXN\)`, err) 442 }