github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_learner_test.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver_test

import (
	"context"
	"fmt"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"sync/atomic"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/server"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/errors"
	"github.com/stretchr/testify/require"
)

func predIncoming(rDesc roachpb.ReplicaDescriptor) bool {
	return rDesc.GetType() == roachpb.VOTER_INCOMING
}
func predOutgoing(rDesc roachpb.ReplicaDescriptor) bool {
	return rDesc.GetType() == roachpb.VOTER_OUTGOING
}

func predDemoting(rDesc roachpb.ReplicaDescriptor) bool {
	return rDesc.GetType() == roachpb.VOTER_DEMOTING
}

type replicationTestKnobs struct {
	storeKnobs                       kvserver.StoreTestingKnobs
	replicaAddStopAfterLearnerAtomic int64
	replicaAddStopAfterJointConfig   int64
	replicationAlwaysUseJointConfig  int64
}

func (rtl *replicationTestKnobs) withStopAfterLearnerAtomic(f func()) {
	prev := atomic.SwapInt64(&rtl.replicaAddStopAfterLearnerAtomic, 1)
	defer atomic.StoreInt64(&rtl.replicaAddStopAfterLearnerAtomic, prev)
	f()
}

func (rtl *replicationTestKnobs) withStopAfterJointConfig(f func()) {
	au := atomic.SwapInt64(&rtl.replicationAlwaysUseJointConfig, 1)
	sa := atomic.SwapInt64(&rtl.replicaAddStopAfterJointConfig, 1)
	defer atomic.StoreInt64(&rtl.replicationAlwaysUseJointConfig, au)
	defer atomic.StoreInt64(&rtl.replicaAddStopAfterJointConfig, sa)
	f()
}

func makeReplicationTestKnobs() (base.TestingKnobs, *replicationTestKnobs) {
	var k replicationTestKnobs
	k.storeKnobs.ReplicaAddStopAfterLearnerSnapshot = func(_ []roachpb.ReplicationTarget) bool {
		return atomic.LoadInt64(&k.replicaAddStopAfterLearnerAtomic) > 0
	}
	k.storeKnobs.ReplicaAddStopAfterJointConfig = func() bool {
		return atomic.LoadInt64(&k.replicaAddStopAfterJointConfig) > 0
	}
	k.storeKnobs.ReplicationAlwaysUseJointConfig = func() bool {
		return atomic.LoadInt64(&k.replicationAlwaysUseJointConfig) > 0
	}
	return base.TestingKnobs{Store: &k.storeKnobs}, &k
}

func getFirstStoreReplica(
	t *testing.T, s serverutils.TestServerInterface, key roachpb.Key,
) (*kvserver.Store, *kvserver.Replica) {
	t.Helper()
	store, err := s.GetStores().(*kvserver.Stores).GetStore(s.GetFirstStoreID())
	require.NoError(t, err)
	var repl *kvserver.Replica
	testutils.SucceedsSoon(t, func() error {
		repl = store.LookupReplica(roachpb.RKey(key))
		if repl == nil {
			return errors.New(`could not find replica`)
		}
		return nil
	})
	return store, repl
}

// Some of the metrics used in these tests live on the queue objects and are
// registered with storage.StoreMetrics instead of living on it. Example:
// queue.replicate.removelearnerreplica.
//
// TODO(dan): Move things like ReplicateQueueMetrics to be a field on
// storage.StoreMetrics and just keep a reference in newReplicateQueue. Ditto
// for other queues that do this.
func getFirstStoreMetric(t *testing.T, s serverutils.TestServerInterface, name string) int64 {
	t.Helper()
	store, err := s.GetStores().(*kvserver.Stores).GetStore(s.GetFirstStoreID())
	require.NoError(t, err)

	var c int64
	var found bool
	store.Registry().Each(func(n string, v interface{}) {
		if name == n {
			switch t := v.(type) {
			case *metric.Counter:
				c = t.Count()
				found = true
			case *metric.Gauge:
				c = t.Value()
				found = true
			}
		}
	})
	if !found {
		panic(fmt.Sprintf("couldn't find metric %s", name))
	}
	return c
}

func TestAddReplicaViaLearner(t *testing.T) {
	defer leaktest.AfterTest(t)()
	// The happy case! \o/

	blockUntilSnapshotCh := make(chan struct{})
	blockSnapshotsCh := make(chan struct{})
	knobs, ltk := makeReplicationTestKnobs()
	ltk.storeKnobs.ReceiveSnapshot = func(h *kvserver.SnapshotRequest_Header) error {
		close(blockUntilSnapshotCh)
		select {
		case <-blockSnapshotsCh:
		case <-time.After(10 * time.Second):
			return errors.New(`test timed out`)
		}
		return nil
	}
	ctx := context.Background()
	tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
		ServerArgs:      base.TestServerArgs{Knobs: knobs},
		ReplicationMode: base.ReplicationManual,
	})
	defer tc.Stopper().Stop(ctx)
	db := sqlutils.MakeSQLRunner(tc.ServerConn(0))

	scratchStartKey := tc.ScratchRange(t)

	g := ctxgroup.WithContext(ctx)
	g.GoCtx(func(ctx context.Context) error {
		_, err := tc.AddReplicas(scratchStartKey, tc.Target(1))
		return err
	})

	// Wait until the snapshot starts, which happens after the learner has been
	// added.
	<-blockUntilSnapshotCh
	desc := tc.LookupRangeOrFatal(t, scratchStartKey)
	require.Len(t, desc.Replicas().Voters(), 1)
	require.Len(t, desc.Replicas().Learners(), 1)

	var voters, nonVoters string
	db.QueryRow(t,
		`SELECT array_to_string(replicas, ','), array_to_string(learner_replicas, ',') FROM crdb_internal.ranges_no_leases WHERE range_id = $1`,
		desc.RangeID,
	).Scan(&voters, &nonVoters)
	require.Equal(t, `1`, voters)
	require.Equal(t, `2`, nonVoters)

	// Unblock the snapshot and let the learner get promoted to a voter.
	close(blockSnapshotsCh)
	require.NoError(t, g.Wait())

	desc = tc.LookupRangeOrFatal(t, scratchStartKey)
	require.Len(t, desc.Replicas().Voters(), 2)
	require.Len(t, desc.Replicas().Learners(), 0)
	require.Equal(t, int64(1), getFirstStoreMetric(t, tc.Server(1), `range.snapshots.learner-applied`))
}

func TestLearnerRaftConfState(t *testing.T) {
	defer leaktest.AfterTest(t)()

	verifyLearnerInRaftOnNodes := func(
		key roachpb.Key, id roachpb.ReplicaID, servers []*server.TestServer,
	) {
		t.Helper()
		var repls []*kvserver.Replica
		for _, s := range servers {
			_, repl := getFirstStoreReplica(t, s, key)
			repls = append(repls, repl)
		}
		testutils.SucceedsSoon(t, func() error {
			for _, repl := range repls {
				status := repl.RaftStatus()
				if status == nil {
					return errors.Errorf(`%s is still waking up`, repl)
				}
				if _, ok := status.Config.Learners[uint64(id)]; !ok {
					return errors.Errorf(`%s thinks %d is not a learner`, repl, id)
				}
			}
			return nil
		})
	}

	// Run the TestCluster with a known datadir so we can shut it down and start a
	// new one on top of the existing data as part of the test.
	dir, cleanup := testutils.TempDir(t)
	defer cleanup()

	knobs, ltk := makeReplicationTestKnobs()
	ctx := context.Background()
	const numNodes = 2
	serverArgsPerNode := make(map[int]base.TestServerArgs)
	for i := 0; i < numNodes; i++ {
		path := filepath.Join(dir, "testserver", strconv.Itoa(i))
		serverArgsPerNode[i] = base.TestServerArgs{
			Knobs:      knobs,
			StoreSpecs: []base.StoreSpec{{InMemory: false, Path: path}},
		}
	}
	tc := testcluster.StartTestCluster(t, numNodes, base.TestClusterArgs{
		ServerArgsPerNode: serverArgsPerNode,
		ReplicationMode:   base.ReplicationManual,
	})
	defer func() {
		// We modify the value of `tc` below to start up a second cluster, so in
		// contrast to other tests, run this `defer Stop` in an anonymous func.
		tc.Stopper().Stop(ctx)
	}()

	// Add a learner replica, send a snapshot so that it's materialized as a
	// Replica on the Store, but don't promote it to a voter.
	scratchStartKey := tc.ScratchRange(t)
	var desc roachpb.RangeDescriptor
	ltk.withStopAfterLearnerAtomic(func() {
		desc = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1))
	})
	require.Len(t, desc.Replicas().Learners(), 1)
	learnerReplicaID := desc.Replicas().Learners()[0].ReplicaID

	// Verify that raft on every node thinks it's a learner. This checks that we
	// use ConfChangeAddLearnerNode in the ConfChange and also checks that we
	// correctly generate the ConfState for the snapshot.
	verifyLearnerInRaftOnNodes(scratchStartKey, learnerReplicaID, tc.Servers)

	// Shut down the cluster and restart it, then verify again that raft on every
	// node thinks our learner is a learner. This checks that we generate the
	// initial ConfState correctly.
	tc.Stopper().Stop(ctx)
	tc = testcluster.StartTestCluster(t, numNodes, base.TestClusterArgs{
		ServerArgsPerNode: serverArgsPerNode,
		ReplicationMode:   base.ReplicationManual,
	})
	{
		// Ping the raft group to wake it up.
		_, err := tc.Server(0).DB().Get(ctx, scratchStartKey)
		require.NoError(t, err)
	}
	verifyLearnerInRaftOnNodes(scratchStartKey, learnerReplicaID, tc.Servers)
}

func TestLearnerSnapshotFailsRollback(t *testing.T) {
	defer leaktest.AfterTest(t)()

	var rejectSnapshots int64
	knobs, ltk := makeReplicationTestKnobs()
	ltk.storeKnobs.ReceiveSnapshot = func(h *kvserver.SnapshotRequest_Header) error {
		if atomic.LoadInt64(&rejectSnapshots) > 0 {
			return errors.New(`nope`)
		}
		return nil
	}
	ctx := context.Background()
	tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
		ServerArgs:      base.TestServerArgs{Knobs: knobs},
		ReplicationMode: base.ReplicationManual,
	})
	defer tc.Stopper().Stop(ctx)

	scratchStartKey := tc.ScratchRange(t)
	atomic.StoreInt64(&rejectSnapshots, 1)
	_, err := tc.AddReplicas(scratchStartKey, tc.Target(1))
	// TODO(dan): It'd be nice if we could cancel the `AddReplicas` context before
	// returning the error from the `ReceiveSnapshot` knob to test the codepath
	// that uses a new context for the rollback, but plumbing that context is
	// annoying.
	if !testutils.IsError(err, `remote couldn't accept LEARNER snapshot`) {
		t.Fatalf(`expected "remote couldn't accept LEARNER snapshot" error got: %+v`, err)
	}

	// Make sure we cleaned up after ourselves (by removing the learner).
	desc := tc.LookupRangeOrFatal(t, scratchStartKey)
	require.Empty(t, desc.Replicas().Learners())
}

func TestSplitWithLearnerOrJointConfig(t *testing.T) {
	defer leaktest.AfterTest(t)()
	ctx := context.Background()

	knobs, ltk := makeReplicationTestKnobs()
	tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
		ServerArgs:      base.TestServerArgs{Knobs: knobs},
		ReplicationMode: base.ReplicationManual,
	})
	defer tc.Stopper().Stop(ctx)

	// Add a learner replica, send a snapshot so that it's materialized as a
	// Replica on the Store, but don't promote it to a voter.
	scratchStartKey := tc.ScratchRange(t)
	ltk.withStopAfterLearnerAtomic(func() {
		_ = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1))
	})

	// Splitting a learner is allowed. This orphans the two learners, but the
	// replication queue will eventually clean this up.
	left, right, err := tc.SplitRange(scratchStartKey.Next())
	require.NoError(t, err)
	require.Len(t, left.Replicas().Learners(), 1)
	require.Len(t, right.Replicas().Learners(), 1)

	// Remove the learner on the RHS.
	right = tc.RemoveReplicasOrFatal(t, right.StartKey.AsRawKey(), tc.Target(1))

	// Put an incoming voter on the RHS and split again. This works because the
	// split auto-transitions us out of the joint conf before doing work.
	atomic.StoreInt64(&ltk.replicationAlwaysUseJointConfig, 1)
	atomic.StoreInt64(&ltk.replicaAddStopAfterJointConfig, 1)
	// Use SucceedsSoon to deal with the case where the RHS has not yet been
	// removed or the split has not yet been processed.
	testutils.SucceedsSoon(t, func() error {
		desc, err := tc.AddReplicas(right.StartKey.AsRawKey(), tc.Target(1))
		if err == nil {
			right = desc
		} else if !testutils.IsError(err, "cannot apply snapshot: snapshot intersects existing range") {
			t.Fatal(err)
		}
		return err
	})
	require.Len(t, right.Replicas().Filter(predIncoming), 1)
	left, right, err = tc.SplitRange(right.StartKey.AsRawKey().Next())
	require.NoError(t, err)
	require.False(t, left.Replicas().InAtomicReplicationChange(), left)
	require.False(t, right.Replicas().InAtomicReplicationChange(), right)
}

func TestReplicateQueueSeesLearnerOrJointConfig(t *testing.T) {
	defer leaktest.AfterTest(t)()
	// NB also see TestAllocatorRemoveLearner for a lower-level test.

	ctx := context.Background()
	knobs, ltk := makeReplicationTestKnobs()
	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
		ServerArgs:      base.TestServerArgs{Knobs: knobs},
		ReplicationMode: base.ReplicationManual,
	})
	defer tc.Stopper().Stop(ctx)

	// Add a learner replica, send a snapshot so that it's materialized as a
	// Replica on the Store, but don't promote it to a voter.
	scratchStartKey := tc.ScratchRange(t)
	ltk.withStopAfterLearnerAtomic(func() {
		_ = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1))
	})

	// Run the replicate queue.
	store, repl := getFirstStoreReplica(t, tc.Server(0), scratchStartKey)
	{
		require.Equal(t, int64(0), getFirstStoreMetric(t, tc.Server(0), `queue.replicate.removelearnerreplica`))
		_, processErr, err := store.ManuallyEnqueue(ctx, "replicate", repl, true /* skipShouldQueue */)
		require.NoError(t, err)
		require.NoError(t, processErr)
		require.Equal(t, int64(1), getFirstStoreMetric(t, tc.Server(0), `queue.replicate.removelearnerreplica`))

		// Make sure it deleted the learner.
		desc := tc.LookupRangeOrFatal(t, scratchStartKey)
		require.Empty(t, desc.Replicas().Learners())

		// Bonus points: the replicate queue keeps processing until there is nothing
		// to do, so it should have upreplicated the range to 3.
		require.Len(t, desc.Replicas().Voters(), 3)
	}

	// Create a VOTER_OUTGOING, i.e. a joint configuration.
	ltk.withStopAfterJointConfig(func() {
		desc := tc.RemoveReplicasOrFatal(t, scratchStartKey, tc.Target(2))
		require.True(t, desc.Replicas().InAtomicReplicationChange(), desc)
		trace, processErr, err := store.ManuallyEnqueue(ctx, "replicate", repl, true /* skipShouldQueue */)
		require.NoError(t, err)
		require.NoError(t, processErr)
		formattedTrace := trace.String()
		expectedMessages := []string{
			`transitioning out of joint configuration`,
		}
		if err := testutils.MatchInOrder(formattedTrace, expectedMessages...); err != nil {
			t.Fatal(err)
		}

		desc = tc.LookupRangeOrFatal(t, scratchStartKey)
		require.False(t, desc.Replicas().InAtomicReplicationChange(), desc)
		// Queue processed again, so we're back to three replicas.
		require.Len(t, desc.Replicas().Voters(), 3)
	})
}

func TestReplicaGCQueueSeesLearnerOrJointConfig(t *testing.T) {
	defer leaktest.AfterTest(t)()
	ctx := context.Background()
	knobs, ltk := makeReplicationTestKnobs()
	tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
		ServerArgs:      base.TestServerArgs{Knobs: knobs},
		ReplicationMode: base.ReplicationManual,
	})
	defer tc.Stopper().Stop(ctx)

	// Add a learner replica, send a snapshot so that it's materialized as a
	// Replica on the Store, but don't promote it to a voter.
	scratchStartKey := tc.ScratchRange(t)
	ltk.withStopAfterLearnerAtomic(func() {
		_ = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1))
	})

	// Run the replicaGC queue.
	checkNoGC := func() roachpb.RangeDescriptor {
		store, repl := getFirstStoreReplica(t, tc.Server(1), scratchStartKey)
		trace, processErr, err := store.ManuallyEnqueue(ctx, "replicaGC", repl, true /* skipShouldQueue */)
		require.NoError(t, err)
		require.NoError(t, processErr)
		const msg = `not gc'able, replica is still in range descriptor: (n2,s2):`
		require.Contains(t, trace.String(), msg)
		return tc.LookupRangeOrFatal(t, scratchStartKey)
	}
	desc := checkNoGC()
	// Make sure it didn't collect the learner.
	require.NotEmpty(t, desc.Replicas().Learners())

	// Now get the range into a joint config.
	tc.RemoveReplicasOrFatal(t, scratchStartKey, tc.Target(1)) // remove learner

	ltk.withStopAfterJointConfig(func() {
		desc = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1))
		require.Len(t, desc.Replicas().Filter(predIncoming), 1, desc)
	})

	postDesc := checkNoGC()
	require.Equal(t, desc, postDesc)
}

func TestRaftSnapshotQueueSeesLearner(t *testing.T) {
	defer leaktest.AfterTest(t)()
	ctx := context.Background()
	blockSnapshotsCh := make(chan struct{})
	knobs, ltk := makeReplicationTestKnobs()
	ltk.storeKnobs.DisableRaftSnapshotQueue = true
	ltk.storeKnobs.ReceiveSnapshot = func(h *kvserver.SnapshotRequest_Header) error {
		select {
		case <-blockSnapshotsCh:
		case <-time.After(10 * time.Second):
			return errors.New(`test timed out`)
		}
		return nil
	}
	tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
		ServerArgs:      base.TestServerArgs{Knobs: knobs},
		ReplicationMode: base.ReplicationManual,
	})
	defer tc.Stopper().Stop(ctx)

	// Create a learner replica.
	scratchStartKey := tc.ScratchRange(t)
	g := ctxgroup.WithContext(ctx)
	g.GoCtx(func(ctx context.Context) error {
		_, err := tc.AddReplicas(scratchStartKey, tc.Target(1))
		return err
	})

	// Note the value of the metrics before.
	generatedBefore := getFirstStoreMetric(t, tc.Server(0), `range.snapshots.generated`)
	raftAppliedBefore := getFirstStoreMetric(t, tc.Server(0), `range.snapshots.normal-applied`)

	// Run the raftsnapshot queue. SucceedsSoon because it may take a bit for
	// raft to figure out that the replica needs a snapshot.
	store, repl := getFirstStoreReplica(t, tc.Server(0), scratchStartKey)
	testutils.SucceedsSoon(t, func() error {
		trace, processErr, err := store.ManuallyEnqueue(ctx, "raftsnapshot", repl, true /* skipShouldQueue */)
		if err != nil {
			return err
		}
		if processErr != nil {
			return processErr
		}
		const msg = `skipping snapshot; replica is likely a learner in the process of being added: (n2,s2):2LEARNER`
		formattedTrace := trace.String()
		if !strings.Contains(formattedTrace, msg) {
			return errors.Errorf(`expected "%s" in trace got:\n%s`, msg, formattedTrace)
		}
		return nil
	})

	// Make sure it didn't send any RAFT snapshots.
	require.Equal(t, generatedBefore, getFirstStoreMetric(t, tc.Server(0), `range.snapshots.generated`))
	require.Equal(t, raftAppliedBefore, getFirstStoreMetric(t, tc.Server(0), `range.snapshots.normal-applied`))

	close(blockSnapshotsCh)
	require.NoError(t, g.Wait())
}

// This test verifies the result of a race between the replicate queue running
// while an AdminChangeReplicas is adding a replica.
func TestLearnerAdminChangeReplicasRace(t *testing.T) {
	defer leaktest.AfterTest(t)()

	blockUntilSnapshotCh := make(chan struct{}, 2)
	blockSnapshotsCh := make(chan struct{})
	knobs, ltk := makeReplicationTestKnobs()
	ltk.storeKnobs.ReceiveSnapshot = func(h *kvserver.SnapshotRequest_Header) error {
		blockUntilSnapshotCh <- struct{}{}
		<-blockSnapshotsCh
		return nil
	}
	ctx := context.Background()
	tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
		ServerArgs:      base.TestServerArgs{Knobs: knobs},
		ReplicationMode: base.ReplicationManual,
	})
	defer tc.Stopper().Stop(ctx)

	// Add the learner.
	scratchStartKey := tc.ScratchRange(t)
	g := ctxgroup.WithContext(ctx)
	g.GoCtx(func(ctx context.Context) error {
		_, err := tc.AddReplicas(scratchStartKey, tc.Target(1))
		return err
	})

	// Wait until the snapshot starts, which happens after the learner has been
	// added.
	<-blockUntilSnapshotCh

	// Removes the learner out from under the coordinator running on behalf of
	// AddReplicas. This simulates the replicate queue running concurrently. The
	// first thing the replicate queue would do is remove any learners it sees.
	_, err := tc.RemoveReplicas(scratchStartKey, tc.Target(1))
	require.NoError(t, err)
	desc := tc.LookupRangeOrFatal(t, scratchStartKey)
	require.Len(t, desc.Replicas().Voters(), 1)
	require.Len(t, desc.Replicas().Learners(), 0)

	// Unblock the snapshot, and surprise AddReplicas. It should retry and error
	// that the descriptor has changed since the AdminChangeReplicas command
	// started. Alternatively it may fail in sending the snapshot because of a
	// "raft group deleted" error if the newly added learner attempts to send
	// a raft message to another node after it has been removed and then destroys
	// itself in response to a ReplicaTooOldError.
	close(blockSnapshotsCh)
	const msgRE = `descriptor changed|raft group deleted`
	if err := g.Wait(); !testutils.IsError(err, msgRE) {
		t.Fatalf(`expected %q error got: %+v`, msgRE, err)
	}
	desc = tc.LookupRangeOrFatal(t, scratchStartKey)
	require.Len(t, desc.Replicas().Voters(), 1)
	require.Len(t, desc.Replicas().Learners(), 0)
}

// This test verifies the result of a race between the replicate queue running
// for the same range from two different nodes. This can happen around
// leadership changes.
func TestLearnerReplicateQueueRace(t *testing.T) {
	defer leaktest.AfterTest(t)()

	var skipReceiveSnapshotKnobAtomic int64 = 1
	blockUntilSnapshotCh := make(chan struct{}, 2)
	blockSnapshotsCh := make(chan struct{})
	knobs, ltk := makeReplicationTestKnobs()
	// We must disable eager replica removal to make this test reliable.
	// If we don't then it's possible that the removed replica on store 2 will
	// notice it's removed before the snapshot is sent by the replicate queue.
	// In this case we'll get a snapshot error from the replicate queue which
	// will retry the up-replication with a new descriptor and succeed.
	ltk.storeKnobs.DisableEagerReplicaRemoval = true
	ltk.storeKnobs.ReceiveSnapshot = func(h *kvserver.SnapshotRequest_Header) error {
		if atomic.LoadInt64(&skipReceiveSnapshotKnobAtomic) > 0 {
			return nil
		}
		blockUntilSnapshotCh <- struct{}{}
		<-blockSnapshotsCh
		return nil
	}
	ctx := context.Background()
	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
		ServerArgs:      base.TestServerArgs{Knobs: knobs},
		ReplicationMode: base.ReplicationManual,
	})
	defer tc.Stopper().Stop(ctx)

	scratchStartKey := tc.ScratchRange(t)
	store, repl := getFirstStoreReplica(t, tc.Server(0), scratchStartKey)

	// Start with 2 replicas so the replicate queue can go from 2->3, otherwise it
	// will refuse to upreplicate to a fragile quorum of 1->2.
	tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1))
	atomic.StoreInt64(&skipReceiveSnapshotKnobAtomic, 0)

	// Run the replicate queue; this will add a learner to node 3 and start
	// sending it a snapshot. This will eventually fail and we assert some things
	// about the trace to prove it failed in the way we want.
	queue1ErrCh := make(chan error, 1)
	go func() {
		queue1ErrCh <- func() error {
			trace, processErr, err := store.ManuallyEnqueue(ctx, "replicate", repl, true /* skipShouldQueue */)
			if err != nil {
				return err
			}
			if !strings.Contains(processErr.Error(), `descriptor changed`) {
				return errors.Errorf(`expected "descriptor changed" error got: %+v`, processErr)
			}
			formattedTrace := trace.String()
			expectedMessages := []string{
				`could not promote .*n3,s3.* to voter, rolling back: change replicas of r\d+ failed: descriptor changed`,
				`learner to roll back not found`,
			}
			return testutils.MatchInOrder(formattedTrace, expectedMessages...)
		}()
	}()

	// Wait until the snapshot starts, which happens after the learner has been
	// added.
	<-blockUntilSnapshotCh

	// Remove the learner on node 3 out from under the replicate queue. This
	// simulates a second replicate queue running concurrently. The first thing
	// this second replicate queue would do is remove any learners it sees,
	// leaving the 2 voters.
	desc, err := tc.RemoveReplicas(scratchStartKey, tc.Target(2))
	require.NoError(t, err)
	require.Len(t, desc.Replicas().Voters(), 2)
	require.Len(t, desc.Replicas().Learners(), 0)

	// Unblock the snapshot, and surprise the replicate queue. It should retry,
	// get a descriptor changed error, and realize it should stop.
	close(blockSnapshotsCh)
	require.NoError(t, <-queue1ErrCh)
	desc = tc.LookupRangeOrFatal(t, scratchStartKey)
	require.Len(t, desc.Replicas().Voters(), 2)
	require.Len(t, desc.Replicas().Learners(), 0)
}

func TestLearnerNoAcceptLease(t *testing.T) {
	defer leaktest.AfterTest(t)()
	ctx := context.Background()
	knobs, ltk := makeReplicationTestKnobs()
	tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
		ServerArgs:      base.TestServerArgs{Knobs: knobs},
		ReplicationMode: base.ReplicationManual,
	})
	defer tc.Stopper().Stop(ctx)

	// Add a learner replica, send a snapshot so that it's materialized as a
	// Replica on the Store, but don't promote it to a voter.
	scratchStartKey := tc.ScratchRange(t)
	ltk.withStopAfterLearnerAtomic(func() {
		_ = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1))
	})

	desc := tc.LookupRangeOrFatal(t, scratchStartKey)
	err := tc.TransferRangeLease(desc, tc.Target(1))
	if !testutils.IsError(err, `cannot transfer lease to replica of type LEARNER`) {
		t.Fatalf(`expected "cannot transfer lease to replica of type LEARNER" error got: %+v`, err)
	}
}

// TestJointConfigLease verifies that incoming and outgoing voters can't have the
// lease transferred to them.
func TestJointConfigLease(t *testing.T) {
	defer leaktest.AfterTest(t)()
	ctx := context.Background()
	knobs, ltk := makeReplicationTestKnobs()
	tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
		ServerArgs:      base.TestServerArgs{Knobs: knobs},
		ReplicationMode: base.ReplicationManual,
	})
	defer tc.Stopper().Stop(ctx)

	k := tc.ScratchRange(t)
	atomic.StoreInt64(&ltk.replicaAddStopAfterJointConfig, 1)
	atomic.StoreInt64(&ltk.replicationAlwaysUseJointConfig, 1)
	desc := tc.AddReplicasOrFatal(t, k, tc.Target(1))
	require.True(t, desc.Replicas().InAtomicReplicationChange(), desc)

	err := tc.TransferRangeLease(desc, tc.Target(1))
	exp := `cannot transfer lease to replica of type VOTER_INCOMING`
	require.True(t, testutils.IsError(err, exp), err)

	// NB: we don't have to transition out of the previous joint config first
	// because this is done automatically by ChangeReplicas before it does what
	// it's asked to do.
	desc = tc.RemoveReplicasOrFatal(t, k, tc.Target(1))
	err = tc.TransferRangeLease(desc, tc.Target(1))
	exp = `cannot transfer lease to replica of type VOTER_DEMOTING`
	require.True(t, testutils.IsError(err, exp), err)
}

func TestLearnerAndJointConfigFollowerRead(t *testing.T) {
	defer leaktest.AfterTest(t)()

	if util.RaceEnabled {
		// Limiting how long transactions can run does not work well with race
		// unless we're extremely lenient, which drives up the test duration.
		t.Skip("skipping under race")
	}

	ctx := context.Background()
	knobs, ltk := makeReplicationTestKnobs()
	tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
		ServerArgs:      base.TestServerArgs{Knobs: knobs},
		ReplicationMode: base.ReplicationManual,
	})
	defer tc.Stopper().Stop(ctx)
	db := sqlutils.MakeSQLRunner(tc.ServerConn(0))
	db.Exec(t, `SET CLUSTER SETTING kv.closed_timestamp.target_duration = $1`, testingTargetDuration)
	db.Exec(t, `SET CLUSTER SETTING kv.closed_timestamp.close_fraction = $1`, closeFraction)
	db.Exec(t, `SET CLUSTER SETTING kv.closed_timestamp.follower_reads_enabled = true`)

	scratchStartKey := tc.ScratchRange(t)
	var scratchDesc roachpb.RangeDescriptor
	ltk.withStopAfterLearnerAtomic(func() {
		scratchDesc = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1))
	})

	check := func() {
		req := roachpb.BatchRequest{Header: roachpb.Header{
			RangeID:   scratchDesc.RangeID,
			Timestamp: tc.Server(0).Clock().Now(),
		}}
		req.Add(&roachpb.ScanRequest{RequestHeader: roachpb.RequestHeader{
			Key: scratchDesc.StartKey.AsRawKey(), EndKey: scratchDesc.EndKey.AsRawKey(),
		}})

		_, repl := getFirstStoreReplica(t, tc.Server(1), scratchStartKey)
		testutils.SucceedsSoon(t, func() error {
			// Trace the Send call so we can verify that it hit the exact `learner
			// replicas cannot serve follower reads` branch that we're trying to test.
			sendCtx, collect, cancel := tracing.ContextWithRecordingSpan(ctx, "manual read request")
			defer cancel()
			_, pErr := repl.Send(sendCtx, req)
			err := pErr.GoError()
			if !testutils.IsError(err, `not lease holder`) {
				return errors.Errorf(`expected "not lease holder" error got: %+v`, err)
			}
			const msg = `cannot serve follower reads`
			formattedTrace := collect().String()
			if !strings.Contains(formattedTrace, msg) {
				return errors.Errorf("expected a trace with `%s` got:\n%s", msg, formattedTrace)
			}
			return nil
		})
	}

	// Can't serve follower read from the LEARNER.
	check()

	atomic.StoreInt64(&ltk.replicaAddStopAfterJointConfig, 1)
	atomic.StoreInt64(&ltk.replicationAlwaysUseJointConfig, 1)

	scratchDesc = tc.RemoveReplicasOrFatal(t, scratchStartKey, tc.Target(1))
	// Removing a learner doesn't get you into a joint state (no voters changed).
	require.False(t, scratchDesc.Replicas().InAtomicReplicationChange(), scratchDesc)
	scratchDesc = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1))

	// Re-add the voter and remain in joint config.
	require.True(t, scratchDesc.Replicas().InAtomicReplicationChange(), scratchDesc)
	require.Len(t, scratchDesc.Replicas().Filter(predIncoming), 1)

	// Can't serve follower read from the VOTER_INCOMING.
	check()

	// Remove the voter and remain in joint config.
	scratchDesc = tc.RemoveReplicasOrFatal(t, scratchStartKey, tc.Target(1))
	require.True(t, scratchDesc.Replicas().InAtomicReplicationChange(), scratchDesc)
	require.Len(t, scratchDesc.Replicas().Filter(predDemoting), 1)

	// Can't serve follower read from the VOTER_OUTGOING.
	check()
}

func TestLearnerOrJointConfigAdminRelocateRange(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ctx := context.Background()
	knobs, ltk := makeReplicationTestKnobs()
	tc := testcluster.StartTestCluster(t, 4, base.TestClusterArgs{
		ServerArgs:      base.TestServerArgs{Knobs: knobs},
		ReplicationMode: base.ReplicationManual,
	})
	defer tc.Stopper().Stop(ctx)

	_, err := tc.Conns[0].Exec(`SET CLUSTER SETTING kv.atomic_replication_changes.enabled = true`)
	require.NoError(t, err)

	scratchStartKey := tc.ScratchRange(t)
	ltk.withStopAfterLearnerAtomic(func() {
		_ = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1))
		_ = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(2))
	})

	check := func(targets []roachpb.ReplicationTarget) {
		require.NoError(t, tc.Server(0).DB().AdminRelocateRange(ctx, scratchStartKey, targets))
		desc := tc.LookupRangeOrFatal(t, scratchStartKey)
		voters := desc.Replicas().Voters()
		require.Len(t, voters, len(targets))
		sort.Slice(voters, func(i, j int) bool { return voters[i].NodeID < voters[j].NodeID })
		for i := range voters {
			require.Equal(t, targets[i].NodeID, voters[i].NodeID, `%v`, voters)
			require.Equal(t, targets[i].StoreID, voters[i].StoreID, `%v`, voters)
		}
		require.Empty(t, desc.Replicas().Learners())
		require.Empty(t, desc.Replicas().Filter(predIncoming))
		require.Empty(t, desc.Replicas().Filter(predOutgoing))
	}

	// Test AdminRelocateRange's treatment of learners by having one that it has
	// to remove and one that should stay and become a voter.
	//
	// Before: 1 (voter), 2 (learner), 3 (learner)
	// After:  1 (voter), 2 (voter), 4 (voter)
	check([]roachpb.ReplicationTarget{tc.Target(0), tc.Target(1), tc.Target(3)})

	// AdminRelocateRange should leave joint configs before doing its thing.
	//
	// Before: 1 (voter), 2 (voter), 4 (demoting)
	// After:  1 (voter), 2 (voter), 3 (voter)
	atomic.StoreInt64(&ltk.replicaAddStopAfterJointConfig, 1)
	atomic.StoreInt64(&ltk.replicationAlwaysUseJointConfig, 1)
	desc := tc.RemoveReplicasOrFatal(t, scratchStartKey, tc.Target(3))
	require.True(t, desc.Replicas().InAtomicReplicationChange(), desc)
	require.Len(t, desc.Replicas().Filter(predDemoting), 1)
	atomic.StoreInt64(&ltk.replicaAddStopAfterJointConfig, 0)
	check([]roachpb.ReplicationTarget{tc.Target(0), tc.Target(1), tc.Target(2)})
}

func TestLearnerAndJointConfigAdminMerge(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ctx := context.Background()
	knobs, ltk := makeReplicationTestKnobs()
	tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
		ServerArgs:      base.TestServerArgs{Knobs: knobs},
		ReplicationMode: base.ReplicationManual,
	})
	defer tc.Stopper().Stop(ctx)

	scratchStartKey := tc.ScratchRange(t)
	splitKey1 := scratchStartKey.Next()
	splitKey2 := splitKey1.Next()
	_, _ = tc.SplitRangeOrFatal(t, splitKey1)
	_, _ = tc.SplitRangeOrFatal(t, splitKey2)

	// Three ranges (in that order):
	// desc1: will have a learner (later joint voter)
	// desc2 (unnamed): is always left vanilla
	// desc3: like desc1
	//
	// This allows testing merges that have a learner on the RHS (on desc2) and
	// the LHS (on desc1).
	var desc1, desc3 roachpb.RangeDescriptor
	ltk.withStopAfterLearnerAtomic(func() {
		desc1 = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1))
		desc3 = tc.AddReplicasOrFatal(t, splitKey2, tc.Target(1))
	})

	checkFails := func() {
		err := tc.Server(0).DB().AdminMerge(ctx, scratchStartKey)
		if exp := `cannot merge range with non-voter replicas on`; !testutils.IsError(err, exp) {
			t.Fatalf(`expected "%s" error got: %+v`, exp, err)
		}
		err = tc.Server(0).DB().AdminMerge(ctx, splitKey1)
		if exp := `cannot merge range with non-voter replicas on`; !testutils.IsError(err, exp) {
			t.Fatalf(`expected "%s" error got: %+v`, exp, err)
		}
	}

	// LEARNER on the lhs or rhs should fail.
	// desc{1,2,3} = (VOTER_FULL, LEARNER) (VOTER_FULL) (VOTER_FULL, LEARNER)
	checkFails()

	// Turn the learners on desc1 and desc3 into VOTER_INCOMINGs.
	atomic.StoreInt64(&ltk.replicaAddStopAfterJointConfig, 1)
	atomic.StoreInt64(&ltk.replicationAlwaysUseJointConfig, 1)
	desc1 = tc.RemoveReplicasOrFatal(t, desc1.StartKey.AsRawKey(), tc.Target(1))
	desc1 = tc.AddReplicasOrFatal(t, desc1.StartKey.AsRawKey(), tc.Target(1))
	require.Len(t, desc1.Replicas().Filter(predIncoming), 1)
	desc3 = tc.RemoveReplicasOrFatal(t, desc3.StartKey.AsRawKey(), tc.Target(1))
	desc3 = tc.AddReplicasOrFatal(t, desc3.StartKey.AsRawKey(), tc.Target(1))
	require.Len(t, desc3.Replicas().Filter(predIncoming), 1)

	// VOTER_INCOMING on the lhs or rhs should fail.
	// desc{1,2,3} = (VOTER_FULL, VOTER_INCOMING) (VOTER_FULL) (VOTER_FULL, VOTER_INCOMING)
	checkFails()

	// Turn the incoming voters on desc1 and desc3 into VOTER_DEMOTINGs.
	// desc{1,2,3} = (VOTER_FULL, VOTER_DEMOTING) (VOTER_FULL) (VOTER_FULL, VOTER_DEMOTING)
	desc1 = tc.RemoveReplicasOrFatal(t, desc1.StartKey.AsRawKey(), tc.Target(1))
	require.Len(t, desc1.Replicas().Filter(predDemoting), 1)
	desc3 = tc.RemoveReplicasOrFatal(t, desc3.StartKey.AsRawKey(), tc.Target(1))
	require.Len(t, desc3.Replicas().Filter(predDemoting), 1)

	// VOTER_DEMOTING on the lhs or rhs should fail.
	checkFails()

	// Add a VOTER_INCOMING to desc2 to make sure it actually excludes this type
	// of replicas from merges (rather than really just checking whether the
	// replica sets are equal).
	// desc{1,2,3} = (VOTER_FULL, VOTER_DEMOTING) (VOTER_FULL, VOTER_INCOMING) (VOTER_FULL, VOTER_DEMOTING)
	desc2 := tc.AddReplicasOrFatal(t, splitKey1, tc.Target(1))
	require.Len(t, desc2.Replicas().Filter(predIncoming), 1)

	checkFails()

	// Ditto VOTER_DEMOTING.
	// desc{1,2,3} = (VOTER_FULL, VOTER_DEMOTING) (VOTER_FULL, VOTER_DEMOTING) (VOTER_FULL, VOTER_DEMOTING)
	desc2 = tc.RemoveReplicasOrFatal(t, desc2.StartKey.AsRawKey(), tc.Target(1))
	require.Len(t, desc2.Replicas().Filter(predDemoting), 1)

	checkFails()
}

func TestMergeQueueSeesLearnerOrJointConfig(t *testing.T) {
	defer leaktest.AfterTest(t)()
	ctx := context.Background()
	knobs, ltk := makeReplicationTestKnobs()
	tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
		ServerArgs:      base.TestServerArgs{Knobs: knobs},
		ReplicationMode: base.ReplicationManual,
	})
	defer tc.Stopper().Stop(ctx)
	db := sqlutils.MakeSQLRunner(tc.ServerConn(0))
	// TestCluster currently overrides this when used with ReplicationManual.
	db.Exec(t, `SET CLUSTER SETTING kv.range_merge.queue_enabled = true`)

	scratchStartKey := tc.ScratchRange(t)
	origDesc := tc.LookupRangeOrFatal(t, scratchStartKey)

	splitKey := scratchStartKey.Next()

	splitAndUnsplit := func() roachpb.RangeDescriptor {
		desc, _ := tc.SplitRangeOrFatal(t, splitKey)
		// Unsplit the range to clear the sticky bit.
		require.NoError(t, tc.Server(0).DB().AdminUnsplit(ctx, splitKey))
		return desc
	}

	// Run the merge queue while there's a learner on the LHS.
	{
		splitAndUnsplit()

		ltk.withStopAfterLearnerAtomic(func() {
			_ = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1))
		})

		store, repl := getFirstStoreReplica(t, tc.Server(0), scratchStartKey)
		trace, processErr, err := store.ManuallyEnqueue(ctx, "merge", repl, true /* skipShouldQueue */)
		require.NoError(t, err)
		require.NoError(t, processErr)
		formattedTrace := trace.String()
		expectedMessages := []string{
			`removing learner replicas \[n2,s2\]`,
			`merging to produce range: /Table/Max-/Max`,
		}
		if err := testutils.MatchInOrder(formattedTrace, expectedMessages...); err != nil {
			t.Fatal(err)
		}

		// Sanity check that the desc has the same bounds it did originally.
		desc := tc.LookupRangeOrFatal(t, scratchStartKey)
		require.Equal(t, origDesc.StartKey, desc.StartKey)
		require.Equal(t, origDesc.EndKey, desc.EndKey)
		// The merge removed the learner.
		require.Len(t, desc.Replicas().Voters(), 1)
		require.Empty(t, desc.Replicas().Learners())
	}

	// Create the RHS again and repeat the same game, except this time the LHS
	// gets a VOTER_INCOMING for s2, and then the merge queue runs into it. It
	// will transition the LHS out of the joint config and then do the merge.
	{
		desc := splitAndUnsplit()

		ltk.withStopAfterJointConfig(func() {
			desc = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1))
		})
		require.Len(t, desc.Replicas().Filter(predIncoming), 1, desc)

		checkTransitioningOut := func() {
			t.Helper()
			store, repl := getFirstStoreReplica(t, tc.Server(0), scratchStartKey)
			trace, processErr, err := store.ManuallyEnqueue(ctx, "merge", repl, true /* skipShouldQueue */)
			require.NoError(t, err)
			require.NoError(t, processErr)
			formattedTrace := trace.String()
			expectedMessages := []string{
				`transitioning out of joint configuration`,
				`merging to produce range: /Table/Max-/Max`,
			}
			if err := testutils.MatchInOrder(formattedTrace, expectedMessages...); err != nil {
				t.Fatal(err)
			}
		}

		checkTransitioningOut()
		desc = tc.LookupRangeOrFatal(t, scratchStartKey)
		require.Len(t, desc.Replicas().Voters(), 2)
		require.False(t, desc.Replicas().InAtomicReplicationChange(), desc)

		// Repeat the game, except now we start with two replicas and we're
		// giving the RHS a VOTER_OUTGOING.
		desc = splitAndUnsplit()
		ltk.withStopAfterJointConfig(func() {
			descRight := tc.RemoveReplicasOrFatal(t, desc.EndKey.AsRawKey(), tc.Target(1))
			require.Len(t, descRight.Replicas().Filter(predDemoting), 1, desc)
		})

		// This should transition out (i.e. remove the voter on s2 for the RHS)
		// and then do its thing, which means in the end we have two voters again.
		checkTransitioningOut()
		desc = tc.LookupRangeOrFatal(t, scratchStartKey)
		require.Len(t, desc.Replicas().Voters(), 2)
		require.False(t, desc.Replicas().InAtomicReplicationChange(), desc)
	}
}