github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/client_raft_test.go

// Copyright 2015 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver_test

import (
	"bytes"
	"context"
	"fmt"
	"math"
	"math/rand"
	"reflect"
	"runtime"
	"strconv"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/rpc"
	"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/storage"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.etcd.io/etcd/raft"
	"go.etcd.io/etcd/raft/raftpb"
	"google.golang.org/grpc"
)

// mustGetInt decodes an int64 value from the bytes field of its argument
// and panics if the bytes field is not 0 or 8 bytes in length.
func mustGetInt(v *roachpb.Value) int64 {
	if v == nil {
		return 0
	}
	i, err := v.GetInt()
	if err != nil {
		panic(err)
	}
	return i
}

// TestStoreRecoverFromEngine verifies that the store recovers all ranges and
// their contents after being stopped and recreated.
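// The test writes to a range, splits it, writes to both halves, restarts the
// store on the same engine, and then expects validate() below to see the
// pre-restart values on both resulting ranges.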
func TestStoreRecoverFromEngine(t *testing.T) {
	defer leaktest.AfterTest(t)()
	storeCfg := kvserver.TestStoreConfig(nil /* clock */)
	storeCfg.TestingKnobs.DisableSplitQueue = true
	storeCfg.TestingKnobs.DisableMergeQueue = true

	const rangeID = roachpb.RangeID(1)
	splitKey := roachpb.Key("m")
	key1 := roachpb.Key("a")
	key2 := roachpb.Key("z")

	engineStopper := stop.NewStopper()
	defer engineStopper.Stop(context.Background())
	eng := storage.NewDefaultInMem()
	engineStopper.AddCloser(eng)
	var rangeID2 roachpb.RangeID

	get := func(store *kvserver.Store, rangeID roachpb.RangeID, key roachpb.Key) int64 {
		args := getArgs(key)
		resp, err := kv.SendWrappedWith(context.Background(), store.TestSender(), roachpb.Header{
			RangeID: rangeID,
		}, args)
		if err != nil {
			t.Fatal(err)
		}
		return mustGetInt(resp.(*roachpb.GetResponse).Value)
	}
	validate := func(store *kvserver.Store) {
		if val := get(store, rangeID, key1); val != 13 {
			t.Errorf("key %q: expected 13 but got %v", key1, val)
		}
		if val := get(store, rangeID2, key2); val != 28 {
			t.Errorf("key %q: expected 28 but got %v", key2, val)
		}
	}

	// First, populate the store with data across two ranges. Each range contains
	// commands that both predate and postdate the split.
	func() {
		stopper := stop.NewStopper()
		defer stopper.Stop(context.Background())
		store := createTestStoreWithOpts(t,
			testStoreOpts{
				eng: eng,
				cfg: &storeCfg,
				// This test was written before the test stores were able to start with
				// more than one range and is not prepared to handle many ranges.
				dontCreateSystemRanges: true,
			},
			stopper)

		increment := func(rangeID roachpb.RangeID, key roachpb.Key, value int64) (*roachpb.IncrementResponse, *roachpb.Error) {
			args := incrementArgs(key, value)
			resp, err := kv.SendWrappedWith(context.Background(), store.TestSender(), roachpb.Header{
				RangeID: rangeID,
			}, args)
			incResp, _ := resp.(*roachpb.IncrementResponse)
			return incResp, err
		}

		if _, err := increment(rangeID, key1, 2); err != nil {
			t.Fatal(err)
		}
		if _, err := increment(rangeID, key2, 5); err != nil {
			t.Fatal(err)
		}
		splitArgs := adminSplitArgs(splitKey)
		if _, err := kv.SendWrapped(context.Background(), store.TestSender(), splitArgs); err != nil {
			t.Fatal(err)
		}
		rangeID2 = store.LookupReplica(roachpb.RKey(key2)).RangeID
		if rangeID2 == rangeID {
			t.Fatal("got same range id after split")
		}
		if _, err := increment(rangeID, key1, 11); err != nil {
			t.Fatal(err)
		}
		if _, err := increment(rangeID2, key2, 23); err != nil {
			t.Fatal(err)
		}
		validate(store)
	}()

	// Now create a new store with the same engine and make sure the expected data
	// is present. We must use the same clock because a newly-created manual clock
	// will be behind the one we wrote with and so will see stale MVCC data.
	store := createTestStoreWithOpts(t,
		testStoreOpts{
			dontBootstrap: true,
			eng:           eng,
			cfg:           &storeCfg,
		},
		engineStopper)

	// Raft processing is initialized lazily; issue a no-op write request on each
	// key to ensure that it has been started.
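	// Incrementing by zero goes through the full Raft proposal path without
	// changing the stored values, so validate() below still expects 13 and 28.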
	incArgs := incrementArgs(key1, 0)
	if _, err := kv.SendWrapped(context.Background(), store.TestSender(), incArgs); err != nil {
		t.Fatal(err)
	}
	incArgs = incrementArgs(key2, 0)
	if _, err := kv.SendWrappedWith(context.Background(), store.TestSender(), roachpb.Header{
		RangeID: rangeID2,
	}, incArgs); err != nil {
		t.Fatal(err)
	}

	validate(store)
}

// TestStoreRecoverWithErrors verifies that even commands that fail are marked as
// applied so they are not retried after recovery.
func TestStoreRecoverWithErrors(t *testing.T) {
	defer leaktest.AfterTest(t)()
	storeCfg := kvserver.TestStoreConfig(nil)
	// Splits can cause our chosen keys to end up on ranges other than range 1,
	// and trying to handle that complicates the test without providing any
	// added benefit.
	storeCfg.TestingKnobs.DisableSplitQueue = true
	eng := storage.NewDefaultInMem()
	defer eng.Close()

	numIncrements := 0

	func() {
		stopper := stop.NewStopper()
		defer stopper.Stop(context.Background())
		keyA := roachpb.Key("a")
		storeCfg := storeCfg // copy
		storeCfg.TestingKnobs.EvalKnobs.TestingEvalFilter =
			func(filterArgs kvserverbase.FilterArgs) *roachpb.Error {
				_, ok := filterArgs.Req.(*roachpb.IncrementRequest)
				if ok && filterArgs.Req.Header().Key.Equal(keyA) {
					numIncrements++
				}
				return nil
			}
		store := createTestStoreWithOpts(
			t,
			testStoreOpts{eng: eng, cfg: &storeCfg},
			stopper)

		// Write a bytes value so the increment will fail.
		putArgs := putArgs(keyA, []byte("asdf"))
		if _, err := kv.SendWrapped(context.Background(), store.TestSender(), putArgs); err != nil {
			t.Fatal(err)
		}

		// Try and fail to increment the key. It is important for this test that the
		// failure be the last thing in the raft log when the store is stopped.
		incArgs := incrementArgs(keyA, 42)
		if _, err := kv.SendWrapped(context.Background(), store.TestSender(), incArgs); err == nil {
			t.Fatal("did not get expected error")
		}
	}()

	if numIncrements != 1 {
		t.Fatalf("expected 1 increment; was %d", numIncrements)
	}

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())

	// Recover from the engine.
	store := createTestStoreWithOpts(t,
		testStoreOpts{
			dontBootstrap: true,
			eng:           eng,
			cfg:           &storeCfg,
		},
		stopper)

	// Issue a no-op write to lazily initialize raft on the range.
	keyB := roachpb.Key("b")
	incArgs := incrementArgs(keyB, 0)
	if _, err := kv.SendWrapped(context.Background(), store.TestSender(), incArgs); err != nil {
		t.Fatal(err)
	}

	// No additional increments were performed on key A during recovery.
	if numIncrements != 1 {
		t.Fatalf("expected 1 increment; was %d", numIncrements)
	}
}

// TestReplicateRange verifies basic replication functionality by creating two stores
// and a range, replicating the range to the second store, and reading its data there.
func TestReplicateRange(t *testing.T) {
	defer leaktest.AfterTest(t)()
	mtc := &multiTestContext{
		// This test was written before the multiTestContext started creating many
		// system ranges at startup, and hasn't been updated to take that into
		// account.
		startWithSingleRange: true,
	}
	defer mtc.Stop()
	mtc.Start(t, 2)

	// Issue a command on the first node before replicating.
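	// The value written here (5) is what the follower on the second store is
	// expected to return below once replication has caught it up.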
	incArgs := incrementArgs([]byte("a"), 5)
	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
		t.Fatal(err)
	}

	repl, err := mtc.stores[0].GetReplica(1)
	if err != nil {
		t.Fatal(err)
	}

	chgs := roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, roachpb.ReplicationTarget{
		NodeID:  mtc.stores[1].Ident.NodeID,
		StoreID: mtc.stores[1].Ident.StoreID,
	})
	if _, err := repl.ChangeReplicas(context.Background(), repl.Desc(), kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeUnderReplicated, "", chgs); err != nil {
		t.Fatal(err)
	}
	// Verify no intent remains on the range descriptor key.
	key := keys.RangeDescriptorKey(repl.Desc().StartKey)
	desc := roachpb.RangeDescriptor{}
	if ok, err := storage.MVCCGetProto(context.Background(), mtc.stores[0].Engine(), key,
		mtc.stores[0].Clock().Now(), &desc, storage.MVCCGetOptions{}); err != nil {
		t.Fatal(err)
	} else if !ok {
		t.Fatalf("range descriptor key %s was not found", key)
	}
	// Verify that in time, no intents remain on meta addressing keys, and that
	// the range descriptor on the meta records is correct.
	testutils.SucceedsSoon(t, func() error {
		meta2 := keys.RangeMetaKey(roachpb.RKeyMax)
		meta1 := keys.RangeMetaKey(meta2)
		for _, key := range []roachpb.RKey{meta2, meta1} {
			metaDesc := roachpb.RangeDescriptor{}
			if ok, err := storage.MVCCGetProto(context.Background(), mtc.stores[0].Engine(), key.AsRawKey(),
				mtc.stores[0].Clock().Now(), &metaDesc, storage.MVCCGetOptions{}); err != nil {
				return err
			} else if !ok {
				return errors.Errorf("failed to resolve %s", key.AsRawKey())
			}
			if !reflect.DeepEqual(metaDesc, desc) {
				return errors.Errorf("descs not equal: %+v != %+v", metaDesc, desc)
			}
		}
		return nil
	})

	// Verify that the same data is available on the replica.
	testutils.SucceedsSoon(t, func() error {
		getArgs := getArgs([]byte("a"))
		if reply, err := kv.SendWrappedWith(context.Background(), mtc.stores[1].TestSender(), roachpb.Header{
			ReadConsistency: roachpb.INCONSISTENT,
		}, getArgs); err != nil {
			return errors.Errorf("failed to read data: %s", err)
		} else if e, v := int64(5), mustGetInt(reply.(*roachpb.GetResponse).Value); v != e {
			return errors.Errorf("failed to read correct data: expected %d, got %d", e, v)
		}
		return nil
	})
}

// TestRestoreReplicas ensures that consensus group membership is properly
// persisted to disk and restored when a node is stopped and restarted.
func TestRestoreReplicas(t *testing.T) {
	defer leaktest.AfterTest(t)()

	t.Skip("https://github.com/cockroachdb/cockroach/issues/40351")

	sc := kvserver.TestStoreConfig(nil)
	// Disable periodic gossip activities. The periodic gossiping of the first
	// range can cause spurious lease transfers which cause this test to fail.
	sc.TestingKnobs.DisablePeriodicGossips = true
	// Allow a replica to use the lease it had before a restart; we don't want
	// this test to deal with needing to acquire new leases after the restart.
	sc.TestingKnobs.DontPreventUseOfOldLeaseOnStart = true
	mtc := &multiTestContext{
		storeConfig: &sc,
		// This test was written before the multiTestContext started creating many
		// system ranges at startup, and hasn't been updated to take that into
		// account.
		startWithSingleRange: true,
	}
	defer mtc.Stop()
	mtc.Start(t, 2)

	firstRng, err := mtc.stores[0].GetReplica(1)
	if err != nil {
		t.Fatal(err)
	}

	// Perform an increment before replication to ensure that commands are not
	// repeated on restarts.
	incArgs := incrementArgs([]byte("a"), 23)
	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
		t.Fatal(err)
	}

	chgs := roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, roachpb.ReplicationTarget{
		NodeID:  mtc.stores[1].Ident.NodeID,
		StoreID: mtc.stores[1].Ident.StoreID,
	})
	if _, err := firstRng.ChangeReplicas(context.Background(), firstRng.Desc(), kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeUnderReplicated, "", chgs); err != nil {
		t.Fatal(err)
	}

	mtc.restart()

	// Send a command on each store. The original store (still the lease holder)
	// will succeed.
	incArgs = incrementArgs([]byte("a"), 5)
	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
		t.Fatal(err)
	}
	// The follower will return a not lease holder error, indicating the command
	// should be forwarded to the lease holder.
	incArgs = incrementArgs([]byte("a"), 11)
	{
		_, pErr := kv.SendWrapped(context.Background(), mtc.stores[1].TestSender(), incArgs)
		if _, ok := pErr.GetDetail().(*roachpb.NotLeaseHolderError); !ok {
			t.Fatalf("expected not lease holder error; got %s", pErr)
		}
	}
	// Send again, this time to the first store.
	if _, pErr := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); pErr != nil {
		t.Fatal(pErr)
	}

	testutils.SucceedsSoon(t, func() error {
		getArgs := getArgs([]byte("a"))
		if reply, err := kv.SendWrappedWith(context.Background(), mtc.stores[1].TestSender(), roachpb.Header{
			ReadConsistency: roachpb.INCONSISTENT,
		}, getArgs); err != nil {
			return errors.Errorf("failed to read data: %s", err)
		} else if e, v := int64(39), mustGetInt(reply.(*roachpb.GetResponse).Value); v != e {
			return errors.Errorf("failed to read correct data: expected %d, got %d", e, v)
		}
		return nil
	})

	// Both replicas have a complete list in Desc.Replicas.
	for i, store := range mtc.stores {
		repl, err := store.GetReplica(1)
		if err != nil {
			t.Fatal(err)
		}
		desc := repl.Desc()
		if len(desc.InternalReplicas) != 2 {
			t.Fatalf("store %d: expected 2 replicas, found %d", i, len(desc.InternalReplicas))
		}
		if desc.InternalReplicas[0].NodeID != mtc.stores[0].Ident.NodeID {
			t.Errorf("store %d: expected replica[0].NodeID == %d, was %d",
				i, mtc.stores[0].Ident.NodeID, desc.InternalReplicas[0].NodeID)
		}
	}
}

// TODO(bdarnell): more aggressive testing here; especially with
// proposer-evaluated KV, what this test does is much less as it doesn't
// exercise the path in which the replica change fails at *apply* time (we only
// test the failfast path), in which case the replica change isn't even
// proposed.
func TestFailedReplicaChange(t *testing.T) {
	defer leaktest.AfterTest(t)()

	var runFilter atomic.Value
	runFilter.Store(true)

	sc := kvserver.TestStoreConfig(nil)
	sc.Clock = nil // manual clock
	sc.TestingKnobs.EvalKnobs.TestingEvalFilter = func(filterArgs kvserverbase.FilterArgs) *roachpb.Error {
		if runFilter.Load().(bool) {
			if et, ok := filterArgs.Req.(*roachpb.EndTxnRequest); ok && et.Commit {
				return roachpb.NewErrorWithTxn(errors.Errorf("boom"), filterArgs.Hdr.Txn)
			}
		}
		return nil
	}
	mtc := &multiTestContext{storeConfig: &sc}
	defer mtc.Stop()
	mtc.Start(t, 2)

	repl, err := mtc.stores[0].GetReplica(1)
	if err != nil {
		t.Fatal(err)
	}

	chgs := roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, roachpb.ReplicationTarget{
		NodeID:  mtc.stores[1].Ident.NodeID,
		StoreID: mtc.stores[1].Ident.StoreID,
	})
	if _, err := repl.ChangeReplicas(context.Background(), repl.Desc(), kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeUnderReplicated, "", chgs); !testutils.IsError(err, "boom") {
		t.Fatalf("did not get expected error: %+v", err)
	}

	// After the aborted transaction, r.Desc was not updated.
	// TODO(bdarnell): expose and inspect raft's internal state.
	if replicas := repl.Desc().InternalReplicas; len(replicas) != 1 {
		t.Fatalf("expected 1 replica, found %v", replicas)
	}

	// The pending config change flag was cleared, so a subsequent attempt
	// can succeed.
	runFilter.Store(false)

	// The first failed replica change has laid down intents. Make sure those
	// are pushable by making the transaction abandoned.
	mtc.manualClock.Increment(10 * base.DefaultTxnHeartbeatInterval.Nanoseconds())

	if _, err := repl.ChangeReplicas(context.Background(), repl.Desc(), kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeUnderReplicated, "", chgs); err != nil {
		t.Fatal(err)
	}

	// Wait for the range to sync to both replicas (mainly so leaktest doesn't
	// complain about goroutines involved in the process).
	testutils.SucceedsSoon(t, func() error {
		for _, store := range mtc.stores {
			rang, err := store.GetReplica(1)
			if err != nil {
				return err
			}
			if replicas := rang.Desc().InternalReplicas; len(replicas) <= 1 {
				return errors.Errorf("expected > 1 replicas; got %v", replicas)
			}
		}
		return nil
	})
}

// We can truncate the old log entries and a new replica will be brought up from a snapshot.
func TestReplicateAfterTruncation(t *testing.T) {
	defer leaktest.AfterTest(t)()
	mtc := &multiTestContext{
		// This test was written before the multiTestContext started creating many
		// system ranges at startup, and hasn't been updated to take that into
		// account.
		startWithSingleRange: true,
	}
	defer mtc.Stop()
	mtc.Start(t, 2)

	repl, err := mtc.stores[0].GetReplica(1)
	if err != nil {
		t.Fatal(err)
	}

	// Issue a command on the first node before replicating.
	incArgs := incrementArgs([]byte("a"), 5)
	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
		t.Fatal(err)
	}

	// Get that command's log index.
	index, err := repl.GetLastIndex()
	if err != nil {
		t.Fatal(err)
	}

	// Truncate the log at index+1 (log entries < N are removed, so this includes
	// the increment).
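	// After this TruncateLog request, the entry that applied the first increment
	// is gone from the Raft log, so the replica added below cannot be caught up
	// by log entries alone and must instead be initialized from a snapshot.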
	truncArgs := truncateLogArgs(index+1, 1)
	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), truncArgs); err != nil {
		t.Fatal(err)
	}

	// Issue a second command post-truncation.
	incArgs = incrementArgs([]byte("a"), 11)
	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
		t.Fatal(err)
	}

	// Now add the second replica.
	chgs := roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, roachpb.ReplicationTarget{
		NodeID:  mtc.stores[1].Ident.NodeID,
		StoreID: mtc.stores[1].Ident.StoreID,
	})
	if _, err := repl.ChangeReplicas(context.Background(), repl.Desc(), kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeUnderReplicated, "", chgs); err != nil {
		t.Fatal(err)
	}

	// Once it catches up, the effects of both commands can be seen.
	testutils.SucceedsSoon(t, func() error {
		getArgs := getArgs([]byte("a"))
		if reply, err := kv.SendWrappedWith(context.Background(), mtc.stores[1].TestSender(), roachpb.Header{
			ReadConsistency: roachpb.INCONSISTENT,
		}, getArgs); err != nil {
			return errors.Errorf("failed to read data: %s", err)
		} else if e, v := int64(16), mustGetInt(reply.(*roachpb.GetResponse).Value); v != e {
			return errors.Errorf("failed to read correct data: expected %d, got %d", e, v)
		}
		return nil
	})

	repl2, err := mtc.stores[1].GetReplica(1)
	if err != nil {
		t.Fatal(err)
	}

	testutils.SucceedsSoon(t, func() error {
		if mvcc, mvcc2 := repl.GetMVCCStats(), repl2.GetMVCCStats(); mvcc2 != mvcc {
			return errors.Errorf("expected stats on new range:\n%+v\nnot equal old:\n%+v", mvcc2, mvcc)
		}
		return nil
	})

	// Send a third command to verify that the log states are synced up so the
	// new node can accept new commands.
	incArgs = incrementArgs([]byte("a"), 23)
	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
		t.Fatal(err)
	}

	testutils.SucceedsSoon(t, func() error {
		getArgs := getArgs([]byte("a"))
		if reply, err := kv.SendWrappedWith(context.Background(), mtc.stores[1].TestSender(), roachpb.Header{
			ReadConsistency: roachpb.INCONSISTENT,
		}, getArgs); err != nil {
			return errors.Errorf("failed to read data: %s", err)
		} else if e, v := int64(39), mustGetInt(reply.(*roachpb.GetResponse).Value); v != e {
			return errors.Errorf("failed to read correct data: expected %d, got %d", e, v)
		}
		return nil
	})
}

func TestRaftLogSizeAfterTruncation(t *testing.T) {
	defer leaktest.AfterTest(t)()
	mtc := &multiTestContext{
		// This test was written before the multiTestContext started creating many
		// system ranges at startup, and hasn't been updated to take that into
		// account.
		startWithSingleRange: true,
	}
	defer mtc.Stop()
	mtc.Start(t, 1)

	const rangeID = 1

	repl, err := mtc.stores[0].GetReplica(rangeID)
	if err != nil {
		t.Fatal(err)
	}

	key := []byte("a")
	incArgs := incrementArgs(key, 5)
	if _, err := kv.SendWrapped(
		context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
		t.Fatal(err)
	}

	index, err := repl.GetLastIndex()
	if err != nil {
		t.Fatal(err)
	}

	// Verifies the recomputed log size against what we track in `r.mu.raftLogSize`.
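	// assertCorrectRaftLogSize recomputes the log size from the engine while
	// holding the Raft lock (so the log cannot change underneath the
	// computation) and returns an error if it disagrees with the size the
	// replica tracks in memory.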
	assertCorrectRaftLogSize := func() error {
		// Recompute under raft lock so that the log doesn't change while we
		// compute its size.
		repl.RaftLock()
		realSize, err := kvserver.ComputeRaftLogSize(
			context.Background(), repl.RangeID, repl.Engine(), repl.SideloadedRaftMuLocked(),
		)
		size, _ := repl.GetRaftLogSize()
		repl.RaftUnlock()

		if err != nil {
			t.Fatal(err)
		}

		// If the size isn't trusted, it won't have to match (and in fact
		// likely won't). In this test, this is because the upreplication
		// elides old Raft log entries in the snapshot it uses.
		if size != realSize {
			return fmt.Errorf("%s: raft log claims size %d, but is in fact %d", repl, size, realSize)
		}
		return nil
	}

	assert.NoError(t, assertCorrectRaftLogSize())

	truncArgs := truncateLogArgs(index+1, 1)
	if _, err := kv.SendWrapped(
		context.Background(), mtc.stores[0].TestSender(), truncArgs); err != nil {
		t.Fatal(err)
	}

	// Note that if there were multiple nodes, the Raft log sizes would not
	// be correct for the followers as they would have received a shorter
	// Raft log than the leader.
	assert.NoError(t, assertCorrectRaftLogSize())
}

// TestSnapshotAfterTruncation tests that Raft will properly send a
// non-preemptive snapshot when a node is brought up and the log has been
// truncated.
func TestSnapshotAfterTruncation(t *testing.T) {
	defer leaktest.AfterTest(t)()
	for _, changeTerm := range []bool{false, true} {
		name := "sameTerm"
		if changeTerm {
			name = "differentTerm"
		}
		t.Run(name, func(t *testing.T) {
			mtc := &multiTestContext{
				// This test was written before the multiTestContext started creating many
				// system ranges at startup, and hasn't been updated to take that into
				// account.
				startWithSingleRange: true,
			}
			defer mtc.Stop()
			mtc.Start(t, 3)
			const stoppedStore = 1
			repl0, err := mtc.stores[0].GetReplica(1)
			if err != nil {
				t.Fatal(err)
			}

			key := roachpb.Key("a")
			incA := int64(5)
			incB := int64(7)
			incAB := incA + incB

			// Set up a key to replicate across the cluster. We're going to modify this
			// key and truncate the raft logs from that command after killing one of the
			// nodes to check that it gets the new value after it comes up.
			incArgs := incrementArgs(key, incA)
			if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
				t.Fatal(err)
			}

			mtc.replicateRange(1, 1, 2)
			mtc.waitForValues(key, []int64{incA, incA, incA})

			// Now kill one store, increment the key on the other stores and truncate
			// their logs to make sure that when store 1 comes back up it will require a
			// non-preemptive snapshot from Raft.
			mtc.stopStore(stoppedStore)

			incArgs = incrementArgs(key, incB)
			if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
				t.Fatal(err)
			}

			mtc.waitForValues(key, []int64{incAB, incA, incAB})

			index, err := repl0.GetLastIndex()
			if err != nil {
				t.Fatal(err)
			}

			// Truncate the log at index+1 (log entries < N are removed, so this
			// includes the increment).
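			// With store 1 down, the surviving replicas discard the log entries that
			// carried the second increment, so the stopped store can only rejoin via
			// a Raft snapshot once it restarts.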
			truncArgs := truncateLogArgs(index+1, 1)
			if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), truncArgs); err != nil {
				t.Fatal(err)
			}

			if changeTerm {
				for i := range mtc.stores {
					if i != stoppedStore {
						// Stop and restart all the live stores, which guarantees that
						// we won't be in the same term we started with.
						mtc.stopStore(i)
						mtc.restartStore(i)
						// Disable the snapshot queue on the live stores so that
						// stoppedStore won't get a snapshot as soon as it starts
						// back up.
						mtc.stores[i].SetRaftSnapshotQueueActive(false)
					}
				}

				// Restart the stopped store and wait for raft
				// election/heartbeat traffic to settle down. Specifically, we
				// need stoppedStore to know about the new term number before
				// the snapshot is sent to reproduce #13506. If the snapshot
				// happened before it learned the term, it would accept the
				// snapshot no matter what term it contained.
				//
				// We do not wait for the store to successfully heartbeat
				// because it is not expected to succeed in cases where the
				// other two stores have already completed their leader
				// election. In this case, a successful heartbeat won't be
				// possible until we re-enable snapshots.
				mtc.restartStoreWithoutHeartbeat(stoppedStore)
				testutils.SucceedsSoon(t, func() error {
					hasLeader := false
					term := uint64(0)
					for i := range mtc.stores {
						repl, err := mtc.stores[i].GetReplica(1)
						if err != nil {
							return err
						}
						status := repl.RaftStatus()
						if status == nil {
							return errors.New("raft status not initialized")
						}
						if status.RaftState == raft.StateLeader {
							hasLeader = true
						}
						if term == 0 {
							term = status.Term
						} else if status.Term != term {
							return errors.Errorf("terms do not agree: %d vs %d", status.Term, term)
						}
					}
					if !hasLeader {
						return errors.New("no leader")
					}
					return nil
				})

				// Turn the queues back on and wait for the snapshot to be sent and processed.
				for i, store := range mtc.stores {
					if i != stoppedStore {
						store.SetRaftSnapshotQueueActive(true)
						if err := store.ForceRaftSnapshotQueueProcess(); err != nil {
							t.Fatal(err)
						}
					}
				}
			} else { // !changeTerm
				mtc.restartStore(stoppedStore)
			}
			mtc.waitForValues(key, []int64{incAB, incAB, incAB})

			testutils.SucceedsSoon(t, func() error {
				// Verify that the cached index and term (Replica.mu.last{Index,Term})
				// on all of the replicas is the same. #18327 fixed an issue where the
				// cached term was left unchanged after applying a snapshot leading to a
				// persistently unavailable range.
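				// Re-fetch the replica rather than reusing repl0 from above, since the
				// stores may have been stopped and restarted in the changeTerm case,
				// which would leave the old *Replica handle pointing at a dead store.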
				repl0, err = mtc.stores[0].GetReplica(1)
				if err != nil {
					t.Fatal(err)
				}
				expectedLastIndex, _ := repl0.GetLastIndex()
				expectedLastTerm := repl0.GetCachedLastTerm()

				for i := 1; i < len(mtc.stores); i++ {
					repl1, err := mtc.stores[i].GetReplica(1)
					if err != nil {
						return err
					}
					if lastIndex, _ := repl1.GetLastIndex(); expectedLastIndex != lastIndex {
						return fmt.Errorf("%d: expected last index %d, but found %d", i, expectedLastIndex, lastIndex)
					}
					if lastTerm := repl1.GetCachedLastTerm(); expectedLastTerm != lastTerm {
						return fmt.Errorf("%d: expected last term %d, but found %d", i, expectedLastTerm, lastTerm)
					}
				}
				return nil
			})
		})
	}
}

// TestSnapshotAfterTruncationWithUncommittedTail is similar in spirit to
// TestSnapshotAfterTruncation/differentTerm. However, it differs in that we
// take care to ensure that the partitioned Replica has a long uncommitted tail
// of Raft entries that is not entirely overwritten by the snapshot it receives
// after the partition heals. If the recipient of the snapshot did not purge its
// Raft entry cache when receiving the snapshot, it could get stuck repeatedly
// rejecting attempts to catch it up. This serves as a regression test for the
// bug seen in #37056.
func TestSnapshotAfterTruncationWithUncommittedTail(t *testing.T) {
	defer leaktest.AfterTest(t)()
	ctx := context.Background()
	mtc := &multiTestContext{
		// This test was written before the multiTestContext started creating many
		// system ranges at startup, and hasn't been updated to take that into
		// account.
		startWithSingleRange: true,
	}
	defer mtc.Stop()
	mtc.Start(t, 3)

	key := roachpb.Key("a")
	incA := int64(5)
	incB := int64(7)
	incC := int64(9)
	incAB := incA + incB
	incABC := incAB + incC

	// Set up a key to replicate across the cluster. We're going to modify this
	// key and truncate the raft logs from that command after partitioning one
	// of the nodes to check that it gets the new value after it reconnects.
	// We're then going to continue modifying this key to make sure that the
	// temporarily partitioned node can continue to receive updates.
	incArgs := incrementArgs(key, incA)
	if _, pErr := kv.SendWrapped(ctx, mtc.stores[0].TestSender(), incArgs); pErr != nil {
		t.Fatal(pErr)
	}

	mtc.replicateRange(1, 1, 2)
	mtc.waitForValues(key, []int64{incA, incA, incA})

	// We partition the original leader from the other two replicas. This allows
	// us to build up a large uncommitted Raft log on the partitioned node.
	const partStore = 0
	partRepl, err := mtc.stores[partStore].GetReplica(1)
	if err != nil {
		t.Fatal(err)
	}
	partReplDesc, err := partRepl.GetReplicaDescriptor()
	if err != nil {
		t.Fatal(err)
	}
	partReplSender := mtc.stores[partStore].TestSender()

	// Partition the original leader from its followers. We do this by installing
	// unreliableRaftHandler listeners on all three Stores. The handler on the
	// partitioned store filters out all messages while the handler on the other
	// two stores only filters out messages from the partitioned store. The
	// configuration looks like:
	//
	//          [0]
	//         x   x
	//        /     \
	//       x       x
	//     [1]<---->[2]
	//
	for _, s := range []int{0, 1, 2} {
		h := &unreliableRaftHandler{rangeID: 1, RaftMessageHandler: mtc.stores[s]}
		if s != partStore {
			// Only filter messages from the partitioned store on the other
			// two stores.
			h.dropReq = func(req *kvserver.RaftMessageRequest) bool {
				return req.FromReplica.StoreID == partRepl.StoreID()
			}
			h.dropHB = func(hb *kvserver.RaftHeartbeat) bool {
				return hb.FromReplicaID == partReplDesc.ReplicaID
			}
		}
		mtc.transport.Listen(mtc.stores[s].Ident.StoreID, h)
	}

	// Perform a series of writes on the partitioned replica. The writes will
	// not succeed before their context is canceled, but they will be appended
	// to the partitioned replica's Raft log because it is currently the Raft
	// leader.
	g := ctxgroup.WithContext(ctx)
	for i := 0; i < 32; i++ {
		otherKey := roachpb.Key(fmt.Sprintf("other-%d", i))
		g.GoCtx(func(ctx context.Context) error {
			cCtx, cancel := context.WithTimeout(ctx, 50*time.Millisecond)
			defer cancel()
			incArgsOther := incrementArgs(otherKey, 1)
			if _, pErr := kv.SendWrapped(cCtx, partReplSender, incArgsOther); pErr == nil {
				return errors.New("unexpected success")
			} else if !testutils.IsPError(pErr, "context deadline exceeded") {
				return pErr.GoError()
			}
			return nil
		})
	}
	if err := g.Wait(); err != nil {
		t.Fatal(err)
	}

	// Transfer the lease to one of the followers and perform a write. The
	// partition ensures that this will require a Raft leadership change.
	const newLeaderStore = partStore + 1
	newLeaderRepl, err := mtc.stores[newLeaderStore].GetReplica(1)
	if err != nil {
		t.Fatal(err)
	}
	newLeaderReplSender := mtc.stores[newLeaderStore].TestSender()

	incArgs = incrementArgs(key, incB)
	testutils.SucceedsSoon(t, func() error {
		mtc.advanceClock(ctx)
		_, pErr := kv.SendWrapped(ctx, newLeaderReplSender, incArgs)
		if _, ok := pErr.GetDetail().(*roachpb.NotLeaseHolderError); ok {
			return pErr.GoError()
		} else if pErr != nil {
			t.Fatal(pErr)
		}
		return nil
	})
	mtc.waitForValues(key, []int64{incA, incAB, incAB})

	index, err := newLeaderRepl.GetLastIndex()
	if err != nil {
		t.Fatal(err)
	}

	// Truncate the log at index+1 (log entries < N are removed, so this
	// includes the increment).
	truncArgs := truncateLogArgs(index+1, 1)
	testutils.SucceedsSoon(t, func() error {
		mtc.advanceClock(ctx)
		_, pErr := kv.SendWrapped(ctx, newLeaderReplSender, truncArgs)
		if _, ok := pErr.GetDetail().(*roachpb.NotLeaseHolderError); ok {
			return pErr.GoError()
		} else if pErr != nil {
			t.Fatal(pErr)
		}
		return nil
	})

	snapsMetric := mtc.stores[partStore].Metrics().RangeSnapshotsNormalApplied
	snapsBefore := snapsMetric.Count()

	// Remove the partition. Snapshot should follow.
	for _, s := range []int{0, 1, 2} {
		mtc.transport.Listen(mtc.stores[s].Ident.StoreID, &unreliableRaftHandler{
			rangeID:            1,
			RaftMessageHandler: mtc.stores[s],
			unreliableRaftHandlerFuncs: unreliableRaftHandlerFuncs{
				dropReq: func(req *kvserver.RaftMessageRequest) bool {
					// Make sure that even going forward no MsgApp for what we just truncated can
					// make it through. The Raft transport is asynchronous so this is necessary
					// to make the test pass reliably.
					// NB: the Index on the message is the log index that _precedes_ any of the
					// entries in the MsgApp, so filter where msg.Index < index, not <= index.
					return req.Message.Type == raftpb.MsgApp && req.Message.Index < index
				},
				dropHB:   func(*kvserver.RaftHeartbeat) bool { return false },
				dropResp: func(*kvserver.RaftMessageResponse) bool { return false },
			},
		})
	}

	// The partitioned replica should catch up after a snapshot.
	testutils.SucceedsSoon(t, func() error {
		snapsAfter := snapsMetric.Count()
		if !(snapsAfter > snapsBefore) {
			return errors.New("expected at least 1 snapshot to catch the partitioned replica up")
		}
		return nil
	})
	mtc.waitForValues(key, []int64{incAB, incAB, incAB})

	// Perform another write. The partitioned replica should be able to receive
	// replicated updates.
	incArgs = incrementArgs(key, incC)
	if _, pErr := kv.SendWrapped(ctx, mtc.distSenders[0], incArgs); pErr != nil {
		t.Fatal(pErr)
	}
	mtc.waitForValues(key, []int64{incABC, incABC, incABC})
}

type fakeSnapshotStream struct {
	nextReq *kvserver.SnapshotRequest
	nextErr error
}

// Recv implements the SnapshotResponseStream interface.
func (c fakeSnapshotStream) Recv() (*kvserver.SnapshotRequest, error) {
	return c.nextReq, c.nextErr
}

// Send implements the SnapshotResponseStream interface.
func (c fakeSnapshotStream) Send(request *kvserver.SnapshotResponse) error {
	return nil
}

// Context implements the SnapshotResponseStream interface.
func (c fakeSnapshotStream) Context() context.Context {
	return context.Background()
}

// TestFailedSnapshotFillsReservation tests that failing to finish applying an
// incoming snapshot still cleans up the outstanding reservation that was made.
func TestFailedSnapshotFillsReservation(t *testing.T) {
	defer leaktest.AfterTest(t)()
	mtc := &multiTestContext{}
	defer mtc.Stop()
	mtc.Start(t, 3)

	rep, err := mtc.stores[0].GetReplica(1)
	require.NoError(t, err)
	repDesc, err := rep.GetReplicaDescriptor()
	require.NoError(t, err)
	desc := protoutil.Clone(rep.Desc()).(*roachpb.RangeDescriptor)
	desc.AddReplica(2, 2, roachpb.LEARNER)
	rep2Desc, found := desc.GetReplicaDescriptor(2)
	require.True(t, found)
	header := kvserver.SnapshotRequest_Header{
		CanDecline: true,
		RangeSize:  100,
		State:      kvserverpb.ReplicaState{Desc: desc},
		RaftMessageRequest: kvserver.RaftMessageRequest{
			RangeID:     rep.RangeID,
			FromReplica: repDesc,
			ToReplica:   rep2Desc,
		},
	}
	header.RaftMessageRequest.Message.Snapshot.Data = uuid.UUID{}.GetBytes()
	// Cause this stream to return an error as soon as we ask it for something.
	// This injects an error into HandleSnapshotStream when we try to send the
	// "snapshot accepted" message.
	expectedErr := errors.Errorf("")
	stream := fakeSnapshotStream{nil, expectedErr}
	if err := mtc.stores[1].HandleSnapshot(&header, stream); !errors.Is(err, expectedErr) {
		t.Fatalf("expected error %s, but found %v", expectedErr, err)
	}
	if n := mtc.stores[1].ReservationCount(); n != 0 {
		t.Fatalf("expected 0 reservations, but found %d", n)
	}
}

// TestConcurrentRaftSnapshots tests that snapshots still work correctly when
// Raft requests multiple non-preemptive snapshots at the same time. This
// situation occurs when two replicas need snapshots at the same time.
func TestConcurrentRaftSnapshots(t *testing.T) {
	defer leaktest.AfterTest(t)()
	// This test relies on concurrently waiting for a value to change in the
	// underlying engine(s). Since the teeing engine does not respond well to
	// value mismatches, whether transient or permanent, skip this test if the
	// teeing engine is being used. See
	// https://github.com/cockroachdb/cockroach/issues/42656 for more context.
	if storage.DefaultStorageEngine == enginepb.EngineTypeTeePebbleRocksDB {
		t.Skip("disabled on teeing engine")
	}

	mtc := &multiTestContext{
		// This test was written before the multiTestContext started creating many
		// system ranges at startup, and hasn't been updated to take that into
		// account.
		startWithSingleRange: true,
	}
	defer mtc.Stop()
	mtc.Start(t, 5)
	repl, err := mtc.stores[0].GetReplica(1)
	if err != nil {
		t.Fatal(err)
	}

	key := roachpb.Key("a")
	incA := int64(5)
	incB := int64(7)
	incAB := incA + incB

	// Set up a key to replicate across the cluster. We're going to modify this
	// key and truncate the raft logs from that command after killing one of the
	// nodes to check that it gets the new value after it comes up.
	incArgs := incrementArgs(key, incA)
	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
		t.Fatal(err)
	}

	mtc.replicateRange(1, 1, 2, 3, 4)
	mtc.waitForValues(key, []int64{incA, incA, incA, incA, incA})

	// Now kill stores 1 + 2, increment the key on the other stores and
	// truncate their logs to make sure that when stores 1 + 2 come back up
	// they will require a non-preemptive snapshot from Raft.
	mtc.stopStore(1)
	mtc.stopStore(2)

	incArgs = incrementArgs(key, incB)
	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
		t.Fatal(err)
	}

	mtc.waitForValues(key, []int64{incAB, incA, incA, incAB, incAB})

	index, err := repl.GetLastIndex()
	if err != nil {
		t.Fatal(err)
	}

	// Truncate the log at index+1 (log entries < N are removed, so this
	// includes the increment).
	truncArgs := truncateLogArgs(index+1, 1)
	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), truncArgs); err != nil {
		t.Fatal(err)
	}
	mtc.restartStore(1)
	mtc.restartStore(2)

	mtc.waitForValues(key, []int64{incAB, incAB, incAB, incAB, incAB})
}

// Test a scenario where a replica is removed from a down node, the associated
// range is split, the node restarts and we try to replicate the RHS of the
// split range back to the restarted node.
func TestReplicateAfterRemoveAndSplit(t *testing.T) {
	defer leaktest.AfterTest(t)()

	sc := kvserver.TestStoreConfig(nil)
	sc.TestingKnobs.DisableMergeQueue = true
	sc.TestingKnobs.DisableReplicateQueue = true
	// Disable the replica GC queue so that it doesn't accidentally pick up the
	// removed replica and GC it. We'll explicitly enable it later in the test.
	sc.TestingKnobs.DisableReplicaGCQueue = true
	// Disable eager replica removal so we can manually remove the replica.
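	// Without this knob, the restarted store could notice on its own that it is
	// no longer in the range descriptor and remove the replica eagerly, which
	// would bypass the replica GC queue path this test wants to exercise.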
	sc.TestingKnobs.DisableEagerReplicaRemoval = true
	sc.Clock = nil // manual clock
	mtc := &multiTestContext{
		storeConfig: &sc,
		// This test was written before the multiTestContext started creating many
		// system ranges at startup, and hasn't been updated to take that into
		// account.
		startWithSingleRange: true,
	}
	defer mtc.Stop()
	mtc.Start(t, 3)
	rep1, err := mtc.stores[0].GetReplica(1)
	if err != nil {
		t.Fatal(err)
	}

	const rangeID = roachpb.RangeID(1)
	mtc.replicateRange(rangeID, 1, 2)

	// Kill store 2.
	mtc.stopStore(2)

	// Remove store 2 from the range to simulate removal of a dead node.
	mtc.unreplicateRange(rangeID, 2)

	// Split the range.
	splitKey := roachpb.Key("m")
	splitArgs := adminSplitArgs(splitKey)
	if _, err := rep1.AdminSplit(context.Background(), *splitArgs, "test"); err != nil {
		t.Fatal(err)
	}

	mtc.advanceClock(context.Background())

	// Restart store 2.
	mtc.restartStore(2)

	replicateRHS := func() error {
		// Try to up-replicate the RHS of the split to store 2. We can't use
		// replicateRange because this should fail on the first attempt and then
		// eventually succeed.
		startKey := roachpb.RKey(splitKey)

		var desc roachpb.RangeDescriptor
		if err := mtc.dbs[0].GetProto(context.Background(), keys.RangeDescriptorKey(startKey), &desc); err != nil {
			t.Fatal(err)
		}

		rep2, err := mtc.findMemberStoreLocked(desc).GetReplica(desc.RangeID)
		if err != nil {
			t.Fatal(err)
		}

		chgs := roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, roachpb.ReplicationTarget{
			NodeID:  mtc.stores[2].Ident.NodeID,
			StoreID: mtc.stores[2].Ident.StoreID,
		})
		_, err = rep2.ChangeReplicas(context.Background(), &desc, kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeUnderReplicated, "", chgs)
		return err
	}

	if err := replicateRHS(); !testutils.IsError(err, kvserver.IntersectingSnapshotMsg) {
		t.Fatalf("unexpected error %v", err)
	}

	// Enable the replica GC queue so that the next attempt to replicate the RHS
	// to store 2 will cause the obsolete replica to be GC'd, allowing a
	// subsequent replication to succeed.
	mtc.stores[2].SetReplicaGCQueueActive(true)
}

// Test various mechanisms for refreshing pending commands.
func TestRefreshPendingCommands(t *testing.T) {
	defer leaktest.AfterTest(t)()

	// In this scenario, three different mechanisms detect the need to repropose
	// commands. Test that each one is sufficient individually. We have this
	// redundancy because some mechanisms respond with lower latency than others,
	// but each has some scenarios (not currently tested) in which it is
	// insufficient on its own. In addition, there is a fourth reproposal
	// mechanism (reasonNewLeaderOrConfigChange) which is not relevant to this
	// scenario.
	//
	// We don't test with only reasonNewLeader because that mechanism is less
	// robust than refreshing due to snapshot or ticks. In particular, it is
	// possible for node 3 to propose the RequestLease command and have that
	// command executed by the other nodes but to never see the execution locally
	// because it is caught up by applying a snapshot.
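	// Each case below disables all but one refresh mechanism, so the remaining
	// mechanism is the only thing that can repropose the pending commands.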
	testCases := map[string]kvserver.StoreTestingKnobs{
		"reasonSnapshotApplied": {
			DisableRefreshReasonNewLeader: true,
			DisableRefreshReasonTicks:     true,
		},
		"reasonTicks": {
			DisableRefreshReasonNewLeader:       true,
			DisableRefreshReasonSnapshotApplied: true,
		},
	}
	for name, c := range testCases {
		t.Run(name, func(t *testing.T) {
			sc := kvserver.TestStoreConfig(nil)
			sc.TestingKnobs = c
			// Disable periodic gossip tasks which can move the range 1 lease
			// unexpectedly.
			sc.TestingKnobs.DisablePeriodicGossips = true
			sc.Clock = nil // manual clock
			mtc := &multiTestContext{
				storeConfig: &sc,
				// This test was written before the multiTestContext started creating
				// many system ranges at startup, and hasn't been updated to take that
				// into account.
				startWithSingleRange: true,
			}
			defer mtc.Stop()
			mtc.Start(t, 3)

			const rangeID = roachpb.RangeID(1)
			mtc.replicateRange(rangeID, 1, 2)

			// Put some data in the range so we'll have something to test for.
			incArgs := incrementArgs([]byte("a"), 5)
			if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
				t.Fatal(err)
			}

			// Wait for all nodes to catch up.
			mtc.waitForValues(roachpb.Key("a"), []int64{5, 5, 5})

			// Stop node 2; while it is down write some more data.
			mtc.stopStore(2)

			if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
				t.Fatal(err)
			}

			// Get the last increment's log index.
			repl, err := mtc.stores[0].GetReplica(1)
			if err != nil {
				t.Fatal(err)
			}
			index, err := repl.GetLastIndex()
			if err != nil {
				t.Fatal(err)
			}

			// Truncate the log at index+1 (log entries < N are removed, so this includes
			// the increment).
			truncArgs := truncateLogArgs(index+1, rangeID)
			if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), truncArgs); err != nil {
				t.Fatal(err)
			}

			// Stop and restart node 0 in order to make sure that any in-flight Raft
			// messages have been sent.
			mtc.stopStore(0)
			mtc.restartStore(0)

			////////////////////////////////////////////////////////////////////
			// We want store 2 to take the lease later, so we'll drain the other
			// stores and expire the lease.
			////////////////////////////////////////////////////////////////////

			// Disable node liveness heartbeats which can reacquire leases when we're
			// trying to expire them. We pause liveness heartbeats here after node 0
			// was restarted (which creates a new NodeLiveness).
			pauseNodeLivenessHeartbeats(mtc, true)

			// Start draining stores 0 and 1 to prevent them from grabbing any new
			// leases.
			mtc.advanceClock(context.Background())
			var wg sync.WaitGroup
			for i := 0; i < 2; i++ {
				wg.Add(1)
				go func(i int) {
					mtc.stores[i].SetDraining(true, nil /* reporter */)
					wg.Done()
				}(i)
			}

			// Wait for stores 0 and 1 to have entered draining mode, and then
			// advance the clock. Advancing the clock will leave the liveness records
			// of draining nodes in an expired state, so the SetDraining() call above
			// will be able to terminate.
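			// Busy-poll until both stores report that they are draining; the
			// Gosched call below keeps the loop preemptible.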
			draining := false
			for !draining {
				draining = true
				for i := 0; i < 2; i++ {
					draining = draining && mtc.stores[i].IsDraining()
				}
				// Allow this loop to be preempted. Failure to do so can cause a
				// deadlock because a non-preemptible loop will prevent GC from
				// starting which in turn will cause all other goroutines to be stuck
				// as soon as they are called on to assist the GC (this shows up as
				// goroutines stuck in "GC assist wait"). With all of the other
				// goroutines stuck, nothing will be able to set mtc.stores[i].draining
				// to true.
				//
				// See #18554.
				runtime.Gosched()
			}
			mtc.advanceClock(context.Background())

			wg.Wait()

			// Restart node 2 and wait for the snapshot to be applied. Note that
			// waitForValues reads directly from the engine and thus isn't executing
			// a Raft command.
			mtc.restartStore(2)
			mtc.waitForValues(roachpb.Key("a"), []int64{10, 10, 10})

			// Send an increment to the restarted node. If we don't refresh pending
			// commands appropriately, the range lease command will not get
			// re-proposed when we discover the new leader.
			if _, err := kv.SendWrapped(context.Background(), mtc.stores[2].TestSender(), incArgs); err != nil {
				t.Fatal(err)
			}

			mtc.waitForValues(roachpb.Key("a"), []int64{15, 15, 15})
		})
	}
}

// Test that when a Raft group is not able to establish a quorum, its Raft log
// does not grow without bound. It tests two different scenarios where this used
// to be possible (see #27772):
// 1. The leader proposes a command and cannot establish a quorum. The leader
//    continually re-proposes the command.
// 2. The follower proposes a command and forwards it to the leader, who cannot
//    establish a quorum. The follower continually re-proposes and forwards the
//    command to the leader.
func TestLogGrowthWhenRefreshingPendingCommands(t *testing.T) {
	defer leaktest.AfterTest(t)()

	sc := kvserver.TestStoreConfig(nil)
	// Drop the raft tick interval so the Raft group is ticked more.
	sc.RaftTickInterval = 10 * time.Millisecond
	// Don't timeout raft leader. We don't want leadership moving.
	sc.RaftElectionTimeoutTicks = 1000000
	// Reduce the max uncommitted entry size.
	sc.RaftMaxUncommittedEntriesSize = 64 << 10 // 64 KB
	// RaftProposalQuota cannot exceed RaftMaxUncommittedEntriesSize.
	sc.RaftProposalQuota = int64(sc.RaftMaxUncommittedEntriesSize)
	// RaftMaxInflightMsgs * RaftMaxSizePerMsg cannot exceed RaftProposalQuota.
	sc.RaftMaxInflightMsgs = 16
	sc.RaftMaxSizePerMsg = 1 << 10 // 1 KB
	// Disable leader transfers during leaseholder changes so that we
	// can easily create leader-not-leaseholder scenarios.
	sc.TestingKnobs.DisableLeaderFollowsLeaseholder = true
	// Refresh pending commands on every Raft group tick instead of
	// every RaftElectionTimeoutTicks.
	sc.TestingKnobs.RefreshReasonTicksPeriod = 1
	// Disable periodic gossip tasks which can move the range 1 lease
	// unexpectedly.
	sc.TestingKnobs.DisablePeriodicGossips = true
	mtc := &multiTestContext{
		storeConfig: &sc,
		// This test was written before the multiTestContext started creating many
		// system ranges at startup, and hasn't been updated to take that into
		// account.
		startWithSingleRange: true,
	}
	defer mtc.Stop()
	mtc.Start(t, 5)

	const rangeID = roachpb.RangeID(1)
	mtc.replicateRange(rangeID, 1, 2, 3, 4)

	// Raft leadership is kept on node 0.
	leaderRepl, err := mtc.Store(0).GetReplica(rangeID)
	if err != nil {
		t.Fatal(err)
	}

	// Put some data in the range so we'll have something to test for.
	incArgs := incrementArgs([]byte("a"), 5)
	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
		t.Fatal(err)
	}

	// Wait for all nodes to catch up.
	mtc.waitForValues(roachpb.Key("a"), []int64{5, 5, 5, 5, 5})

	// Test proposing on leader and proposing on follower. Neither should result
	// in unbounded raft log growth.
	testutils.RunTrueAndFalse(t, "proposeOnFollower", func(t *testing.T, proposeOnFollower bool) {
		// Restart any nodes that are down.
		for _, s := range []int{2, 3, 4} {
			if mtc.Store(s) == nil {
				mtc.restartStore(s)
			}
		}

		// Determine which node to propose on. Transfer lease to that node.
		var propIdx, otherIdx int
		if !proposeOnFollower {
			propIdx, otherIdx = 0, 1
		} else {
			propIdx, otherIdx = 1, 0
		}
		propNode := mtc.stores[propIdx].TestSender()
		mtc.transferLease(context.Background(), rangeID, otherIdx, propIdx)
		testutils.SucceedsSoon(t, func() error {
			// Lease transfers may not be immediately observed by the new
			// leaseholder. Wait until the new leaseholder is aware.
			repl, err := mtc.Store(propIdx).GetReplica(rangeID)
			if err != nil {
				t.Fatal(err)
			}
			repDesc, err := repl.GetReplicaDescriptor()
			if err != nil {
				t.Fatal(err)
			}
			if lease, _ := repl.GetLease(); !lease.Replica.Equal(repDesc) {
				return errors.Errorf("lease not transferred yet; found %v", lease)
			}
			return nil
		})

		// Stop enough nodes to prevent a quorum.
		for _, s := range []int{2, 3, 4} {
			mtc.stopStore(s)
		}

		// Determine the current raft log size.
		initLogSize, _ := leaderRepl.GetRaftLogSize()

		// While a majority of nodes are down, write some data.
		putRes := make(chan *roachpb.Error)
		go func() {
			putArgs := putArgs([]byte("b"), make([]byte, sc.RaftMaxUncommittedEntriesSize/8))
			_, err := kv.SendWrapped(context.Background(), propNode, putArgs)
			putRes <- err
		}()

		// Wait for a bit and watch for Raft log growth.
		wait := time.After(500 * time.Millisecond)
		ticker := time.Tick(50 * time.Millisecond)
	Loop:
		for {
			select {
			case <-wait:
				break Loop
			case <-ticker:
				// Verify that the leader is node 0.
				status := leaderRepl.RaftStatus()
				if status == nil || status.RaftState != raft.StateLeader {
					t.Fatalf("raft leader should be node 0, but got status %+v", status)
				}

				// Check the raft log size. We allow GetRaftLogSize to grow up
				// to twice RaftMaxUncommittedEntriesSize because its total
				// includes a little more state (the roachpb.Value checksum,
				// etc.). The important thing here is that the log doesn't grow
				// forever.
				logSizeLimit := int64(2 * sc.RaftMaxUncommittedEntriesSize)
				curlogSize, _ := leaderRepl.GetRaftLogSize()
				logSize := curlogSize - initLogSize
				logSizeStr := humanizeutil.IBytes(logSize)
				// Note that logSize could be negative if something got truncated.
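				// With the 64 KiB RaftMaxUncommittedEntriesSize configured above, the
				// allowance works out to 128 KiB of growth since the test started.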
				if logSize > logSizeLimit {
					t.Fatalf("raft log size grew to %s", logSizeStr)
				}
				t.Logf("raft log size grew to %s", logSizeStr)
			case err := <-putRes:
				t.Fatalf("write finished with quorum unavailable; err=%v", err)
			}
		}

		// Start enough nodes to establish a quorum.
		mtc.restartStore(2)

		// The write should now succeed.
		if err := <-putRes; err != nil {
			t.Fatal(err)
		}
	})
}

// TestStoreRangeUpReplicate verifies that the replication queue will notice
// under-replicated ranges and replicate them.
func TestStoreRangeUpReplicate(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer kvserver.SetMockAddSSTable()()
	sc := kvserver.TestStoreConfig(nil)
	// Prevent the split queue from creating additional ranges while we're
	// waiting for replication.
	sc.TestingKnobs.DisableSplitQueue = true
	mtc := &multiTestContext{
		storeConfig: &sc,
	}
	defer mtc.Stop()
	mtc.Start(t, 3)
	mtc.initGossipNetwork()

	// Once we know our peers, trigger a scan.
	if err := mtc.stores[0].ForceReplicationScanAndProcess(); err != nil {
		t.Fatal(err)
	}

	// Wait until all ranges are upreplicated to all nodes.
	var replicaCount int64
	testutils.SucceedsSoon(t, func() error {
		var replicaCounts [3]int64
		for i, s := range mtc.stores {
			var err error
			mtc.stores[i].VisitReplicas(func(r *kvserver.Replica) bool {
				replicaCounts[i]++
				// Synchronize with the replica's raft processing goroutine.
				r.RaftLock()
				defer r.RaftUnlock()
				if len(r.Desc().InternalReplicas) != 3 {
					// This fails even after the snapshot has arrived and only
					// goes through once the replica has applied the conf change.
					err = errors.Errorf("not fully initialized")
					return false
				}
				return true
			})
			if err != nil {
				return err
			}
			if replicaCounts[i] != replicaCounts[0] {
				return errors.Errorf("not fully upreplicated")
			}
			if n := s.ReservationCount(); n != 0 {
				return errors.Errorf("expected 0 reservations, but found %d", n)
			}
		}
		replicaCount = replicaCounts[0]
		return nil
	})

	var generated int64
	var learnerApplied, raftApplied int64
	for _, s := range mtc.stores {
		m := s.Metrics()
		generated += m.RangeSnapshotsGenerated.Count()
		learnerApplied += m.RangeSnapshotsLearnerApplied.Count()
		raftApplied += m.RangeSnapshotsNormalApplied.Count()
	}
	if generated == 0 {
		t.Fatalf("expected at least 1 snapshot, but found 0")
	}
	// We upreplicate each range (once each for n2 and n3), so there should be
	// exactly 2 * replicaCount learner snaps, one per upreplication.
	require.Equal(t, 2*replicaCount, learnerApplied)
	// Ideally there would be zero raft snaps, but etcd/raft is picky about
	// getting a snapshot at exactly the index it asked for.
	if raftApplied > learnerApplied {
		t.Fatalf("expected more learner snaps %d than raft snaps %d", learnerApplied, raftApplied)
	}
}

// TestUnreplicateFirstRange verifies that multiTestContext still functions in
// the case where the first range (which contains range metadata) is
// unreplicated from the first store. This situation can arise occasionally in
This situation can arise occasionally in 1619 // tests, as can a similar situation where the first store is no longer the lease holder of 1620 // the first range; this verifies that those tests will not be affected. 1621 func TestUnreplicateFirstRange(t *testing.T) { 1622 defer leaktest.AfterTest(t)() 1623 1624 mtc := &multiTestContext{} 1625 defer mtc.Stop() 1626 mtc.Start(t, 3) 1627 1628 const rangeID = roachpb.RangeID(1) 1629 // Replicate the range to store 1. 1630 mtc.replicateRange(rangeID, 1) 1631 // Move the lease away from store 0 before removing its replica. 1632 mtc.transferLease(context.Background(), rangeID, 0, 1) 1633 // Unreplicate the range from store 0. 1634 mtc.unreplicateRange(rangeID, 0) 1635 // Replicate the range to store 2. The first range is no longer available on 1636 // store 0, and this command will fail if that situation is not properly 1637 // supported. 1638 mtc.replicateRange(rangeID, 2) 1639 } 1640 1641 // TestChangeReplicasDescriptorInvariant tests that a replica change aborts if 1642 // another change has been made to the RangeDescriptor since it was initiated. 1643 func TestChangeReplicasDescriptorInvariant(t *testing.T) { 1644 defer leaktest.AfterTest(t)() 1645 mtc := &multiTestContext{ 1646 // This test was written before the multiTestContext started creating many 1647 // system ranges at startup, and hasn't been updated to take that into 1648 // account. 1649 startWithSingleRange: true, 1650 } 1651 defer mtc.Stop() 1652 mtc.Start(t, 3) 1653 1654 repl, err := mtc.stores[0].GetReplica(1) 1655 if err != nil { 1656 t.Fatal(err) 1657 } 1658 1659 addReplica := func(storeNum int, desc *roachpb.RangeDescriptor) error { 1660 chgs := roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, roachpb.ReplicationTarget{ 1661 NodeID: mtc.stores[storeNum].Ident.NodeID, 1662 StoreID: mtc.stores[storeNum].Ident.StoreID, 1663 }) 1664 _, err := repl.ChangeReplicas(context.Background(), desc, kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeUnderReplicated, "", chgs) 1665 return err 1666 } 1667 1668 // Retain the descriptor for the range at this point. 1669 origDesc := repl.Desc() 1670 1671 // Add replica to the second store, which should succeed. 1672 if err := addReplica(1, origDesc); err != nil { 1673 t.Fatal(err) 1674 } 1675 testutils.SucceedsSoon(t, func() error { 1676 r := mtc.stores[1].LookupReplica(roachpb.RKey("a")) 1677 if r == nil { 1678 return errors.Errorf(`expected replica for key "a"`) 1679 } 1680 return nil 1681 }) 1682 1683 before := mtc.stores[2].Metrics().RangeSnapshotsLearnerApplied.Count() 1684 // Attempt to add replica to the third store with the original descriptor. 1685 // This should fail because the descriptor is stale. 1686 expectedErr := `change replicas of r1 failed: descriptor changed: \[expected\]` 1687 if err := addReplica(2, origDesc); !testutils.IsError(err, expectedErr) { 1688 t.Fatalf("got unexpected error: %+v", err) 1689 } 1690 1691 after := mtc.stores[2].Metrics().RangeSnapshotsLearnerApplied.Count() 1692 // The failed ChangeReplicas call should NOT have applied a learner snapshot. 1693 if after != before { 1694 t.Fatalf( 1695 "ChangeReplicas call should not have applied a learner snapshot, before %d after %d", 1696 before, after) 1697 } 1698 1699 before = mtc.stores[2].Metrics().RangeSnapshotsLearnerApplied.Count() 1700 // Add to third store with fresh descriptor.
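	// The descriptor argument to ChangeReplicas acts as a compare-and-swap
	// expectation: the change only goes through if the range's descriptor has
	// not changed since it was read. Re-reading a fresh descriptor, as the
	// call just below does via repl.Desc(), is the standard way to retry
	// after a "descriptor changed" failure.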
1701 if err := addReplica(2, repl.Desc()); err != nil { 1702 t.Fatal(err) 1703 } 1704 1705 testutils.SucceedsSoon(t, func() error { 1706 after := mtc.stores[2].Metrics().RangeSnapshotsLearnerApplied.Count() 1707 // The new ChangeReplicas call should have applied a learner snapshot. 1708 if after != before+1 { 1709 return errors.Errorf( 1710 "ChangeReplicas call should have applied a learner snapshot, before %d after %d", 1711 before, after) 1712 } 1713 r := mtc.stores[2].LookupReplica(roachpb.RKey("a")) 1714 if r == nil { 1715 return errors.Errorf(`expected replica for key "a"`) 1716 } 1717 return nil 1718 }) 1719 } 1720 1721 // TestProgressWithDownNode verifies that a surviving quorum can make progress 1722 // with a downed node. 1723 func TestProgressWithDownNode(t *testing.T) { 1724 defer leaktest.AfterTest(t)() 1725 // This test relies on concurrently waiting for a value to change in the 1726 // underlying engine(s). Since the teeing engine does not respond well to 1727 // value mismatches, whether transient or permanent, skip this test if the 1728 // teeing engine is being used. See 1729 // https://github.com/cockroachdb/cockroach/issues/42656 for more context. 1730 if storage.DefaultStorageEngine == enginepb.EngineTypeTeePebbleRocksDB { 1731 t.Skip("disabled on teeing engine") 1732 } 1733 mtc := &multiTestContext{ 1734 // This test was written before the multiTestContext started creating many 1735 // system ranges at startup, and hasn't been updated to take that into 1736 // account. 1737 startWithSingleRange: true, 1738 } 1739 defer mtc.Stop() 1740 mtc.Start(t, 3) 1741 1742 const rangeID = roachpb.RangeID(1) 1743 mtc.replicateRange(rangeID, 1, 2) 1744 1745 incArgs := incrementArgs([]byte("a"), 5) 1746 if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil { 1747 t.Fatal(err) 1748 } 1749 1750 // Verify that the first increment propagates to all the engines. 1751 verify := func(expected []int64) { 1752 testutils.SucceedsSoon(t, func() error { 1753 values := []int64{} 1754 for _, eng := range mtc.engines { 1755 val, _, err := storage.MVCCGet(context.Background(), eng, roachpb.Key("a"), mtc.clock().Now(), 1756 storage.MVCCGetOptions{}) 1757 if err != nil { 1758 return err 1759 } 1760 values = append(values, mustGetInt(val)) 1761 } 1762 if !reflect.DeepEqual(expected, values) { 1763 return errors.Errorf("expected %v, got %v", expected, values) 1764 } 1765 return nil 1766 }) 1767 } 1768 verify([]int64{5, 5, 5}) 1769 1770 // Stop one of the replicas and issue a new increment. 1771 mtc.stopStore(1) 1772 incArgs = incrementArgs([]byte("a"), 11) 1773 if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil { 1774 t.Fatal(err) 1775 } 1776 1777 // The new increment can be seen on both live replicas. 1778 verify([]int64{16, 5, 16}) 1779 1780 // Once the downed node is restarted, it will catch up.
1781 mtc.restartStore(1) 1782 verify([]int64{16, 16, 16}) 1783 } 1784 1785 // TestReplicateRestartAfterTruncationWithRemoveAndReAdd is motivated by issue 1786 // #8111, which suggests the following test (which verifies the ability of a 1787 // snapshot with a new replica ID to overwrite existing data): 1788 // - replicate a range to three stores 1789 // - stop a store 1790 // - remove the stopped store from the range 1791 // - truncate the logs 1792 // - re-add the store and restart it 1793 // - ensure that store can catch up with the rest of the group 1794 func TestReplicateRestartAfterTruncationWithRemoveAndReAdd(t *testing.T) { 1795 defer leaktest.AfterTest(t)() 1796 runReplicateRestartAfterTruncation(t, true /* removeBeforeTruncateAndReAdd */) 1797 } 1798 1799 // TestReplicateRestartAfterTruncation is a variant of 1800 // TestReplicateRestartAfterTruncationWithRemoveAndReAdd without the remove and 1801 // re-add. Just stop, truncate, and restart. This verifies that a snapshot 1802 // without a new replica ID works correctly. 1803 func TestReplicateRestartAfterTruncation(t *testing.T) { 1804 defer leaktest.AfterTest(t)() 1805 runReplicateRestartAfterTruncation(t, false /* removeBeforeTruncateAndReAdd */) 1806 } 1807 1808 func runReplicateRestartAfterTruncation(t *testing.T, removeBeforeTruncateAndReAdd bool) { 1809 sc := kvserver.TestStoreConfig(nil) 1810 // Don't timeout raft leaders or range leases (see the relation between 1811 // RaftElectionTimeoutTicks and RangeLeaseActiveDuration). This test expects 1812 // mtc.stores[0] to hold the range lease for range 1. 1813 sc.RaftElectionTimeoutTicks = 1000000 1814 sc.Clock = nil // manual clock 1815 mtc := &multiTestContext{ 1816 storeConfig: &sc, 1817 // This test was written before the multiTestContext started creating many 1818 // system ranges at startup, and hasn't been update to take that into 1819 // account. 1820 startWithSingleRange: true, 1821 } 1822 defer mtc.Stop() 1823 mtc.Start(t, 3) 1824 1825 key := roachpb.Key("a") 1826 1827 // Replicate the initial range to all three nodes. 1828 const rangeID = roachpb.RangeID(1) 1829 mtc.replicateRange(rangeID, 1, 2) 1830 1831 // Verify that the first increment propagates to all the engines. 1832 incArgs := incrementArgs(key, 2) 1833 if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil { 1834 t.Fatal(err) 1835 } 1836 mtc.waitForValues(key, []int64{2, 2, 2}) 1837 1838 // Stop a store. 1839 mtc.stopStore(1) 1840 if removeBeforeTruncateAndReAdd { 1841 // remove the stopped store from the range 1842 mtc.unreplicateRange(rangeID, 1) 1843 } 1844 1845 // Truncate the logs. 1846 { 1847 // Get the last increment's log index. 1848 repl, err := mtc.stores[0].GetReplica(rangeID) 1849 if err != nil { 1850 t.Fatal(err) 1851 } 1852 index, err := repl.GetLastIndex() 1853 if err != nil { 1854 t.Fatal(err) 1855 } 1856 // Truncate the log at index+1 (log entries < N are removed, so this includes 1857 // the increment). 1858 truncArgs := truncateLogArgs(index+1, rangeID) 1859 if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), truncArgs); err != nil { 1860 t.Fatal(err) 1861 } 1862 } 1863 1864 // Ensure that store can catch up with the rest of the group. 1865 incArgs = incrementArgs(key, 3) 1866 if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil { 1867 t.Fatal(err) 1868 } 1869 1870 mtc.waitForValues(key, []int64{5, 2, 5}) 1871 1872 // Re-add the store and restart it. 
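	// Because the log was just truncated past the last increment, the
	// restarted store cannot catch up by replaying log entries; it has to be
	// caught up via a Raft snapshot (with a new replica ID in the
	// removeBeforeTruncateAndReAdd variant), which is exactly the behavior
	// this test exercises.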
1873 // TODO(dt): ben originally suggested we also attempt this in the other order. 1874 // This currently hits an NPE in mtc.replicateRange though when it tries to 1875 // read the Ident.NodeID field in the specified store, and will become 1876 // impossible after streaming snapshots. 1877 mtc.restartStore(1) 1878 if removeBeforeTruncateAndReAdd { 1879 // Verify old replica is GC'd. Wait out the replica gc queue 1880 // inactivity threshold and force a gc scan. 1881 mtc.manualClock.Increment(int64(kvserver.ReplicaGCQueueInactivityThreshold + 1)) 1882 testutils.SucceedsSoon(t, func() error { 1883 mtc.stores[1].MustForceReplicaGCScanAndProcess() 1884 _, err := mtc.stores[1].GetReplica(rangeID) 1885 if !errors.HasType(err, (*roachpb.RangeNotFoundError)(nil)) { 1886 return errors.Errorf("expected replica to be garbage collected, got %v %T", err, err) 1887 } 1888 return nil 1889 }) 1890 1891 mtc.replicateRange(rangeID, 1) 1892 } 1893 1894 mtc.waitForValues(key, []int64{5, 5, 5}) 1895 } 1896 1897 func testReplicaAddRemove(t *testing.T, addFirst bool) { 1898 // This test relies on concurrently waiting for a value to change in the 1899 // underlying engine(s). Since the teeing engine does not respond well to 1900 // value mismatches, whether transient or permanent, skip this test if the 1901 // teeing engine is being used. See 1902 // https://github.com/cockroachdb/cockroach/issues/42656 for more context. 1903 if storage.DefaultStorageEngine == enginepb.EngineTypeTeePebbleRocksDB { 1904 t.Skip("disabled on teeing engine") 1905 } 1906 sc := kvserver.TestStoreConfig(nil) 1907 // We're gonna want to validate the state of the store before and after the 1908 // replica GC queue does its work, so we disable the replica gc queue here 1909 // and run it manually when we're ready. 1910 sc.TestingKnobs.DisableReplicaGCQueue = true 1911 sc.TestingKnobs.DisableEagerReplicaRemoval = true 1912 sc.Clock = nil // manual clock 1913 mtc := &multiTestContext{ 1914 storeConfig: &sc, 1915 // This test was written before the multiTestContext started creating many 1916 // system ranges at startup, and hasn't been update to take that into 1917 // account. 1918 startWithSingleRange: true, 1919 } 1920 defer mtc.Stop() 1921 mtc.Start(t, 4) 1922 1923 key := roachpb.Key("a") 1924 verifyFn := func(expected []int64) func() error { 1925 return func() error { 1926 values := make([]int64, len(mtc.engines)) 1927 for i, eng := range mtc.engines { 1928 val, _, err := storage.MVCCGet(context.Background(), eng, key, mtc.clock().Now(), 1929 storage.MVCCGetOptions{}) 1930 if err != nil { 1931 return err 1932 } 1933 values[i] = mustGetInt(val) 1934 } 1935 if reflect.DeepEqual(expected, values) { 1936 return nil 1937 } 1938 return errors.Errorf("expected %+v, got %+v", expected, values) 1939 } 1940 } 1941 1942 // Replicate the initial range to three of the four nodes. 1943 const rangeID = roachpb.RangeID(1) 1944 mtc.replicateRange(rangeID, 3, 1) 1945 1946 inc1 := int64(5) 1947 { 1948 incArgs := incrementArgs(key, inc1) 1949 if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil { 1950 t.Fatal(err) 1951 } 1952 } 1953 1954 // The first increment is visible on all three replicas. 1955 testutils.SucceedsSoon(t, verifyFn([]int64{ 1956 inc1, 1957 inc1, 1958 0, 1959 inc1, 1960 })) 1961 1962 // Stop a store and replace it. 
1963 mtc.stopStore(1) 1964 if addFirst { 1965 mtc.replicateRange(rangeID, 2) 1966 mtc.unreplicateRange(rangeID, 1) 1967 } else { 1968 mtc.unreplicateRange(rangeID, 1) 1969 mtc.replicateRange(rangeID, 2) 1970 } 1971 // The first increment is visible on the new replica. 1972 testutils.SucceedsSoon(t, verifyFn([]int64{ 1973 inc1, 1974 inc1, 1975 inc1, 1976 inc1, 1977 })) 1978 1979 // Ensure that the rest of the group can make progress. 1980 inc2 := int64(11) 1981 { 1982 incArgs := incrementArgs(key, inc2) 1983 if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil { 1984 t.Fatal(err) 1985 } 1986 } 1987 testutils.SucceedsSoon(t, verifyFn([]int64{ 1988 inc1 + inc2, 1989 inc1, 1990 inc1 + inc2, 1991 inc1 + inc2, 1992 })) 1993 1994 // Bring the downed store back up (required for a clean shutdown). 1995 mtc.restartStore(1) 1996 1997 // The downed store never sees the increment that was added while it was 1998 // down. Perform another increment now that it is back up to verify that it 1999 // doesn't see future activity. 2000 inc3 := int64(23) 2001 { 2002 incArgs := incrementArgs(key, inc3) 2003 if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil { 2004 t.Fatal(err) 2005 } 2006 } 2007 testutils.SucceedsSoon(t, verifyFn([]int64{ 2008 inc1 + inc2 + inc3, 2009 inc1, 2010 inc1 + inc2 + inc3, 2011 inc1 + inc2 + inc3, 2012 })) 2013 2014 // Wait out the range lease and the unleased duration to make the replica GC'able. 2015 mtc.advanceClock(context.Background()) 2016 mtc.manualClock.Increment(int64(kvserver.ReplicaGCQueueInactivityThreshold + 1)) 2017 mtc.stores[1].SetReplicaGCQueueActive(true) 2018 mtc.stores[1].MustForceReplicaGCScanAndProcess() 2019 2020 // The removed store no longer has any of the data from the range. 2021 testutils.SucceedsSoon(t, verifyFn([]int64{ 2022 inc1 + inc2 + inc3, 2023 0, 2024 inc1 + inc2 + inc3, 2025 inc1 + inc2 + inc3, 2026 })) 2027 2028 desc := mtc.stores[0].LookupReplica(roachpb.RKeyMin).Desc() 2029 replicaIDsByStore := map[roachpb.StoreID]roachpb.ReplicaID{} 2030 for _, rep := range desc.InternalReplicas { 2031 replicaIDsByStore[rep.StoreID] = rep.ReplicaID 2032 } 2033 expected := map[roachpb.StoreID]roachpb.ReplicaID{1: 1, 4: 2, 3: 4} 2034 if !reflect.DeepEqual(expected, replicaIDsByStore) { 2035 t.Fatalf("expected replica IDs to be %v but got %v", expected, replicaIDsByStore) 2036 } 2037 } 2038 2039 func TestReplicateAddAndRemove(t *testing.T) { 2040 defer leaktest.AfterTest(t)() 2041 2042 testReplicaAddRemove(t, true /* addFirst */) 2043 } 2044 2045 func TestReplicateRemoveAndAdd(t *testing.T) { 2046 defer leaktest.AfterTest(t)() 2047 2048 testReplicaAddRemove(t, false /* addFirst */) 2049 } 2050 2051 // TestQuotaPool verifies that writes get throttled in the case where we have 2052 // two fast moving replicas with sufficiently fast growing raft logs and a 2053 // slower replica catching up. By throttling write throughput we avoid having 2054 // to constantly catch up the slower node via snapshots. See #8659. 2055 func TestQuotaPool(t *testing.T) { 2056 defer leaktest.AfterTest(t)() 2057 2058 const quota = 10000 2059 const numReplicas = 3 2060 const rangeID = 1 2061 ctx := context.Background() 2062 sc := kvserver.TestStoreConfig(nil) 2063 // Suppress timeout-based elections to avoid leadership changes in ways 2064 // this test doesn't expect. 
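	// With the election timeout set this high, followers effectively never
	// campaign on their own, so Raft leadership stays wherever it is first
	// established for the duration of the test.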
2065 sc.RaftElectionTimeoutTicks = 100000 2066 mtc := &multiTestContext{ 2067 storeConfig: &sc, 2068 // This test was written before the multiTestContext started creating many 2069 // system ranges at startup, and hasn't been update to take that into 2070 // account. 2071 startWithSingleRange: true, 2072 } 2073 mtc.Start(t, numReplicas) 2074 defer mtc.Stop() 2075 2076 mtc.replicateRange(rangeID, 1, 2) 2077 2078 assertEqualLastIndex := func() error { 2079 var expectedIndex uint64 2080 2081 for i, s := range mtc.stores { 2082 repl, err := s.GetReplica(rangeID) 2083 if err != nil { 2084 t.Fatal(err) 2085 } 2086 2087 index, err := repl.GetLastIndex() 2088 if err != nil { 2089 t.Fatal(err) 2090 } 2091 if i == 0 { 2092 expectedIndex = index 2093 } else if expectedIndex != index { 2094 return fmt.Errorf("%s: expected lastIndex %d, but found %d", repl, expectedIndex, index) 2095 } 2096 } 2097 return nil 2098 } 2099 testutils.SucceedsSoon(t, assertEqualLastIndex) 2100 2101 // NB: See TestRaftBlockedReplica/#9914 for why we use a separate goroutine. 2102 raftLockReplica := func(repl *kvserver.Replica) { 2103 ch := make(chan struct{}) 2104 go func() { repl.RaftLock(); close(ch) }() 2105 <-ch 2106 } 2107 2108 leaderRepl := mtc.getRaftLeader(rangeID) 2109 // Grab the raftMu to re-initialize the QuotaPool to ensure that we don't 2110 // race with ongoing applications. 2111 raftLockReplica(leaderRepl) 2112 if err := leaderRepl.InitQuotaPool(quota); err != nil { 2113 t.Fatalf("failed to initialize quota pool: %v", err) 2114 } 2115 leaderRepl.RaftUnlock() 2116 followerRepl := func() *kvserver.Replica { 2117 for _, store := range mtc.stores { 2118 repl, err := store.GetReplica(rangeID) 2119 if err != nil { 2120 t.Fatal(err) 2121 } 2122 if repl == leaderRepl { 2123 continue 2124 } 2125 return repl 2126 } 2127 return nil 2128 }() 2129 if followerRepl == nil { 2130 t.Fatal("could not get a handle on a follower replica") 2131 } 2132 2133 // We block the third replica effectively causing acquisition of quota 2134 // without subsequent release. 2135 raftLockReplica(followerRepl) 2136 ch := make(chan *roachpb.Error, 1) 2137 2138 func() { 2139 defer followerRepl.RaftUnlock() 2140 2141 // In order to verify write throttling we insert a value 3/4th the size of 2142 // total quota available in the system. This should effectively go through 2143 // and block the subsequent insert of the same size. We check to see whether 2144 // or not after this write has gone through by verifying that the total 2145 // quota available has decreased as expected. 2146 // 2147 // Following this we unblock the 'slow' replica allowing it to catch up to 2148 // the first write. This in turn releases quota back to the pool and the 2149 // second write, previously blocked by virtue of there not being enough 2150 // quota, is now free to proceed. We expect the final quota in the system 2151 // to be the same as what we started with. 
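		// Concretely: quota is 10000, and the value written below is
		// 3*quota/4 = 7500 bytes, so after the first put at most
		// quota/4 = 2500 bytes of proposal quota remain. The identical second
		// put therefore cannot acquire quota until the blocked follower is
		// unlocked, catches up, and the first proposal's quota is returned to
		// the pool.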
2152 key := roachpb.Key("k") 2153 value := bytes.Repeat([]byte("v"), (3*quota)/4) 2154 var ba roachpb.BatchRequest 2155 ba.Add(putArgs(key, value)) 2156 if err := ba.SetActiveTimestamp(mtc.clock().Now); err != nil { 2157 t.Fatal(err) 2158 } 2159 if _, pErr := leaderRepl.Send(ctx, ba); pErr != nil { 2160 t.Fatal(pErr) 2161 } 2162 2163 if curQuota := leaderRepl.QuotaAvailable(); curQuota > quota/4 { 2164 t.Fatalf("didn't observe the expected quota acquisition, available: %d", curQuota) 2165 } 2166 2167 testutils.SucceedsSoon(t, func() error { 2168 if qLen := leaderRepl.QuotaReleaseQueueLen(); qLen < 1 { 2169 return errors.Errorf("expected at least 1 queued quota release, found: %d", qLen) 2170 } 2171 return nil 2172 }) 2173 2174 go func() { 2175 var ba roachpb.BatchRequest 2176 ba.Add(putArgs(key, value)) 2177 if err := ba.SetActiveTimestamp(mtc.clock().Now); err != nil { 2178 ch <- roachpb.NewError(err) 2179 return 2180 } 2181 _, pErr := leaderRepl.Send(ctx, ba) 2182 ch <- pErr 2183 }() 2184 }() 2185 2186 testutils.SucceedsSoon(t, func() error { 2187 if curQuota := leaderRepl.QuotaAvailable(); curQuota != quota { 2188 return errors.Errorf("expected available quota %d, got %d", quota, curQuota) 2189 } 2190 if qLen := leaderRepl.QuotaReleaseQueueLen(); qLen != 0 { 2191 return errors.Errorf("expected no queued quota releases, found: %d", qLen) 2192 } 2193 return nil 2194 }) 2195 2196 if pErr := <-ch; pErr != nil { 2197 t.Fatal(pErr) 2198 } 2199 } 2200 2201 // TestWedgedReplicaDetection verifies that a leader replica is able to 2202 // correctly detect a wedged follower replica and no longer consider it 2203 // as active for the purpose of proposal throttling. 2204 func TestWedgedReplicaDetection(t *testing.T) { 2205 defer leaktest.AfterTest(t)() 2206 2207 const numReplicas = 3 2208 const rangeID = 1 2209 2210 sc := kvserver.TestStoreConfig(nil) 2211 // Suppress timeout-based elections to avoid leadership changes in ways 2212 // this test doesn't expect. 2213 sc.RaftElectionTimeoutTicks = 100000 2214 mtc := &multiTestContext{ 2215 storeConfig: &sc, 2216 // This test was written before the multiTestContext started creating many 2217 // system ranges at startup, and hasn't been update to take that into 2218 // account. 2219 startWithSingleRange: true, 2220 } 2221 mtc.Start(t, numReplicas) 2222 defer mtc.Stop() 2223 mtc.replicateRange(rangeID, 1, 2) 2224 2225 leaderRepl := mtc.getRaftLeader(rangeID) 2226 followerRepl := func() *kvserver.Replica { 2227 for _, store := range mtc.stores { 2228 repl, err := store.GetReplica(rangeID) 2229 if err != nil { 2230 t.Fatal(err) 2231 } 2232 if repl == leaderRepl { 2233 continue 2234 } 2235 return repl 2236 } 2237 return nil 2238 }() 2239 if followerRepl == nil { 2240 t.Fatal("could not get a handle on a follower replica") 2241 } 2242 2243 // Lock the follower replica to prevent it from making progress from now 2244 // on. NB: See TestRaftBlockedReplica/#9914 for why we use a separate 2245 // goroutine. 2246 var wg sync.WaitGroup 2247 wg.Add(1) 2248 go func() { 2249 followerRepl.RaftLock() 2250 wg.Done() 2251 }() 2252 wg.Wait() 2253 defer followerRepl.RaftUnlock() 2254 2255 // TODO(andrei): The test becomes flaky with a lower threshold because the 2256 // follower is considered inactive just below. Figure out how to switch the 2257 // test to a manual clock. The activity tracking for followers uses the 2258 // physical clock. 2259 inactivityThreshold := time.Second 2260 2261 // Send a request to the leader replica. 
followerRepl is locked so it will 2262 // not respond. 2263 ctx := context.Background() 2264 key := roachpb.Key("k") 2265 value := []byte("value") 2266 var ba roachpb.BatchRequest 2267 ba.Add(putArgs(key, value)) 2268 if err := ba.SetActiveTimestamp(mtc.clock().Now); err != nil { 2269 t.Fatal(err) 2270 } 2271 if _, pErr := leaderRepl.Send(ctx, ba); pErr != nil { 2272 t.Fatal(pErr) 2273 } 2274 2275 // The follower should still be active. 2276 followerID := followerRepl.ReplicaID() 2277 if !leaderRepl.IsFollowerActiveSince(ctx, followerID, inactivityThreshold) { 2278 t.Fatalf("expected follower to still be considered active") 2279 } 2280 2281 // It is possible that there are in-flight heartbeat responses from 2282 // followerRepl from before it was locked. The receipt of one of these 2283 // would bump the last active timestamp on the leader. Because of this, 2284 // we check whether the follower is eventually considered inactive. 2285 testutils.SucceedsSoon(t, func() error { 2286 // Send another request to the leader replica. followerRepl is locked 2287 // so it will not respond. 2288 if _, pErr := leaderRepl.Send(ctx, ba); pErr != nil { 2289 t.Fatal(pErr) 2290 } 2291 2292 // The follower should no longer be considered active. 2293 if leaderRepl.IsFollowerActiveSince(ctx, followerID, inactivityThreshold) { 2294 return errors.New("expected follower to be considered inactive") 2295 } 2296 return nil 2297 }) 2298 } 2299 2300 // TestRaftHeartbeats verifies that coalesced heartbeats are correctly 2301 // suppressing elections in an idle cluster. 2302 func TestRaftHeartbeats(t *testing.T) { 2303 defer leaktest.AfterTest(t)() 2304 2305 mtc := &multiTestContext{} 2306 defer mtc.Stop() 2307 mtc.Start(t, 3) 2308 2309 const rangeID = roachpb.RangeID(1) 2310 mtc.replicateRange(rangeID, 1, 2) 2311 2312 // Capture the initial term and state. 2313 leaderIdx := -1 2314 for i, store := range mtc.stores { 2315 if store.RaftStatus(rangeID).SoftState.RaftState == raft.StateLeader { 2316 leaderIdx = i 2317 break 2318 } 2319 } 2320 initialTerm := mtc.stores[leaderIdx].RaftStatus(rangeID).Term 2321 2322 // Wait for several ticks to elapse. 2323 ticksToWait := 2 * mtc.makeStoreConfig(leaderIdx).RaftElectionTimeoutTicks 2324 ticks := mtc.stores[leaderIdx].Metrics().RaftTicks.Count 2325 for targetTicks := ticks() + int64(ticksToWait); ticks() < targetTicks; { 2326 time.Sleep(time.Millisecond) 2327 } 2328 2329 status := mtc.stores[leaderIdx].RaftStatus(rangeID) 2330 if status.SoftState.RaftState != raft.StateLeader { 2331 t.Errorf("expected node %d to be leader after sleeping but was %s", leaderIdx, status.SoftState.RaftState) 2332 } 2333 if status.Term != initialTerm { 2334 t.Errorf("while sleeping, term changed from %d to %d", initialTerm, status.Term) 2335 } 2336 } 2337 2338 // TestReportUnreachableHeartbeats tests that if a single transport fails, 2339 // coalesced heartbeats are not stalled out entirely. 2340 func TestReportUnreachableHeartbeats(t *testing.T) { 2341 defer leaktest.AfterTest(t)() 2342 2343 mtc := &multiTestContext{ 2344 // This test was written before the multiTestContext started creating many 2345 // system ranges at startup, and hasn't been update to take that into 2346 // account. 2347 startWithSingleRange: true, 2348 } 2349 defer mtc.Stop() 2350 mtc.Start(t, 3) 2351 2352 const rangeID = roachpb.RangeID(1) 2353 mtc.replicateRange(rangeID, 1, 2) 2354 2355 leaderIdx := -1 2356 // Loop until a leader is elected. 
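	// The busy-wait below could equivalently be written with the retry helper
	// used elsewhere in this file; an illustrative sketch:
	//
	//	testutils.SucceedsSoon(t, func() error {
	//		for i, store := range mtc.stores {
	//			if store.RaftStatus(rangeID).SoftState.RaftState == raft.StateLeader {
	//				leaderIdx = i
	//				return nil
	//			}
	//		}
	//		return errors.New("no raft leader yet")
	//	})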
2357 for { 2358 for i, store := range mtc.stores { 2359 if store.RaftStatus(rangeID).SoftState.RaftState == raft.StateLeader { 2360 leaderIdx = i 2361 break 2362 } 2363 } 2364 if leaderIdx == -1 { 2365 runtime.Gosched() 2366 } else { 2367 break 2368 } 2369 } 2370 initialTerm := mtc.stores[leaderIdx].RaftStatus(rangeID).Term 2371 // Choose a follower index that is guaranteed to not be the leader. 2372 followerIdx := (leaderIdx + 1) % len(mtc.stores) 2373 2374 // Shut down a raft transport via the circuit breaker, and wait for two 2375 // election timeouts to trigger an election if reportUnreachable broke 2376 // heartbeat transmission to the other store. 2377 cb := mtc.transport.GetCircuitBreaker(mtc.stores[followerIdx].Ident.NodeID, 2378 rpc.DefaultClass) 2379 cb.Break() 2380 2381 // Send a command to ensure Raft is aware of lost follower so that it won't 2382 // quiesce (which would prevent heartbeats). 2383 if _, err := kv.SendWrappedWith( 2384 context.Background(), mtc.stores[0].TestSender(), roachpb.Header{RangeID: rangeID}, 2385 incrementArgs(roachpb.Key("a"), 1)); err != nil { 2386 t.Fatal(err) 2387 } 2388 2389 ticksToWait := 2 * mtc.makeStoreConfig(leaderIdx).RaftElectionTimeoutTicks 2390 ticks := mtc.stores[leaderIdx].Metrics().RaftTicks.Count 2391 for targetTicks := ticks() + int64(ticksToWait); ticks() < targetTicks; { 2392 time.Sleep(time.Millisecond) 2393 } 2394 2395 // Ensure that the leadership has not changed, to confirm that heartbeats 2396 // are sent to the store with a functioning transport. 2397 status := mtc.stores[leaderIdx].RaftStatus(rangeID) 2398 if status.SoftState.RaftState != raft.StateLeader { 2399 t.Errorf("expected node %d to be leader after sleeping but was %s", leaderIdx, status.SoftState.RaftState) 2400 } 2401 if status.Term != initialTerm { 2402 t.Errorf("while sleeping, term changed from %d to %d", initialTerm, status.Term) 2403 } 2404 } 2405 2406 // TestReportUnreachableRemoveRace adds and removes the raft leader replica 2407 // repeatedly while one of its peers is unreachable in an attempt to expose 2408 // races (primarily in asynchronous coalesced heartbeats). 2409 func TestReportUnreachableRemoveRace(t *testing.T) { 2410 defer leaktest.AfterTest(t)() 2411 2412 mtc := &multiTestContext{} 2413 defer mtc.Stop() 2414 mtc.Start(t, 3) 2415 2416 const rangeID = roachpb.RangeID(1) 2417 mtc.replicateRange(rangeID, 1, 2) 2418 2419 outer: 2420 for i := 0; i < 5; i++ { 2421 for leaderIdx, store := range mtc.stores { 2422 repl, err := store.GetReplica(rangeID) 2423 if err != nil { 2424 t.Fatal(err) 2425 } 2426 if repl.RaftStatus().SoftState.RaftState == raft.StateLeader { 2427 for replicaIdx, toStore := range mtc.stores { 2428 if toStore == store { 2429 continue 2430 } 2431 repDesc, err := repl.GetReplicaDescriptor() 2432 if err != nil { 2433 t.Fatal(err) 2434 } 2435 if lease, _ := repl.GetLease(); lease.Replica.Equal(repDesc) { 2436 mtc.transferLease(context.Background(), rangeID, leaderIdx, replicaIdx) 2437 } 2438 mtc.unreplicateRange(rangeID, leaderIdx) 2439 cb := mtc.transport.GetCircuitBreaker(toStore.Ident.NodeID, rpc.DefaultClass) 2440 cb.Break() 2441 time.Sleep(mtc.storeConfig.CoalescedHeartbeatsInterval) 2442 cb.Reset() 2443 mtc.replicateRange(rangeID, leaderIdx) 2444 continue outer 2445 } 2446 t.Fatal("could not find raft replica") 2447 } 2448 } 2449 i-- // try again 2450 } 2451 } 2452 2453 // TestReplicateAfterSplit verifies that a new replica whose start key 2454 // is not KeyMin replicating to a fresh store can apply snapshots correctly. 
2455 func TestReplicateAfterSplit(t *testing.T) { 2456 defer leaktest.AfterTest(t)() 2457 storeCfg := kvserver.TestStoreConfig(nil /* clock */) 2458 storeCfg.TestingKnobs.DisableMergeQueue = true 2459 mtc := &multiTestContext{ 2460 storeConfig: &storeCfg, 2461 } 2462 defer mtc.Stop() 2463 mtc.Start(t, 2) 2464 2465 const rangeID = roachpb.RangeID(1) 2466 splitKey := roachpb.Key("m") 2467 key := roachpb.Key("z") 2468 2469 store0 := mtc.stores[0] 2470 // Make the split 2471 splitArgs := adminSplitArgs(splitKey) 2472 if _, err := kv.SendWrapped(context.Background(), store0.TestSender(), splitArgs); err != nil { 2473 t.Fatal(err) 2474 } 2475 2476 rangeID2 := store0.LookupReplica(roachpb.RKey(key)).RangeID 2477 if rangeID2 == rangeID { 2478 t.Fatal("got same range id after split") 2479 } 2480 // Issue an increment for later check. 2481 incArgs := incrementArgs(key, 11) 2482 if _, err := kv.SendWrappedWith(context.Background(), store0.TestSender(), roachpb.Header{ 2483 RangeID: rangeID2, 2484 }, incArgs); err != nil { 2485 t.Fatal(err) 2486 } 2487 // Now add the second replica. 2488 mtc.replicateRange(rangeID2, 1) 2489 2490 if mtc.stores[1].LookupReplica(roachpb.RKey(key)).GetMaxBytes() == 0 { 2491 t.Error("Range MaxBytes is not set after snapshot applied") 2492 } 2493 // Once it catches up, the effects of increment commands can be seen. 2494 testutils.SucceedsSoon(t, func() error { 2495 getArgs := getArgs(key) 2496 // Reading on non-lease holder replica should use inconsistent read 2497 if reply, err := kv.SendWrappedWith(context.Background(), mtc.stores[1].TestSender(), roachpb.Header{ 2498 RangeID: rangeID2, 2499 ReadConsistency: roachpb.INCONSISTENT, 2500 }, getArgs); err != nil { 2501 return errors.Errorf("failed to read data: %s", err) 2502 } else if e, v := int64(11), mustGetInt(reply.(*roachpb.GetResponse).Value); v != e { 2503 return errors.Errorf("failed to read correct data: expected %d, got %d", e, v) 2504 } 2505 return nil 2506 }) 2507 } 2508 2509 // TestReplicaRemovalCampaign verifies that a new replica after a split can be 2510 // transferred away/replaced without campaigning the old one. 2511 func TestReplicaRemovalCampaign(t *testing.T) { 2512 defer leaktest.AfterTest(t)() 2513 2514 testData := []struct { 2515 remove bool 2516 expectAdvance bool 2517 }{ 2518 { // Replica removed 2519 remove: true, 2520 expectAdvance: false, 2521 }, 2522 { // Default behavior 2523 remove: false, 2524 expectAdvance: true, 2525 }, 2526 } 2527 2528 const rangeID = roachpb.RangeID(1) 2529 splitKey := roachpb.Key("m") 2530 key2 := roachpb.Key("z") 2531 2532 for i, td := range testData { 2533 func() { 2534 storeCfg := kvserver.TestStoreConfig(nil /* clock */) 2535 storeCfg.TestingKnobs.DisableMergeQueue = true 2536 mtc := &multiTestContext{ 2537 storeConfig: &storeCfg, 2538 } 2539 defer mtc.Stop() 2540 mtc.Start(t, 2) 2541 2542 // Replicate range to enable raft campaigning. 2543 mtc.replicateRange(rangeID, 1) 2544 store0 := mtc.stores[0] 2545 2546 // Make the split. 
2547 splitArgs := adminSplitArgs(splitKey) 2548 if _, err := kv.SendWrapped(context.Background(), store0.TestSender(), splitArgs); err != nil { 2549 t.Fatal(err) 2550 } 2551 2552 replica2 := store0.LookupReplica(roachpb.RKey(key2)) 2553 2554 rg2 := func(s *kvserver.Store) kv.Sender { 2555 return kv.Wrap(s, func(ba roachpb.BatchRequest) roachpb.BatchRequest { 2556 if ba.RangeID == 0 { 2557 ba.RangeID = replica2.RangeID 2558 } 2559 return ba 2560 }) 2561 } 2562 2563 // Raft processing is initialized lazily; issue a no-op write request to 2564 // ensure that the Raft group has been started. 2565 incArgs := incrementArgs(key2, 0) 2566 if _, err := kv.SendWrapped(context.Background(), rg2(store0), incArgs); err != nil { 2567 t.Fatal(err) 2568 } 2569 2570 if td.remove { 2571 // Simulate second replica being transferred by removing it. 2572 if err := store0.RemoveReplica(context.Background(), replica2, replica2.Desc().NextReplicaID, kvserver.RemoveOptions{ 2573 DestroyData: true, 2574 }); err != nil { 2575 t.Fatal(err) 2576 } 2577 } 2578 2579 var latestTerm uint64 2580 if td.expectAdvance { 2581 testutils.SucceedsSoon(t, func() error { 2582 if raftStatus := replica2.RaftStatus(); raftStatus != nil { 2583 if term := raftStatus.Term; term <= latestTerm { 2584 return errors.Errorf("%d: raft term has not yet advanced: %d", i, term) 2585 } else if latestTerm == 0 { 2586 latestTerm = term 2587 } 2588 } else { 2589 return errors.Errorf("%d: raft group is not yet initialized", i) 2590 } 2591 return nil 2592 }) 2593 } else { 2594 for start := timeutil.Now(); timeutil.Since(start) < time.Second; time.Sleep(10 * time.Millisecond) { 2595 if raftStatus := replica2.RaftStatus(); raftStatus != nil { 2596 if term := raftStatus.Term; term > latestTerm { 2597 if latestTerm == 0 { 2598 latestTerm = term 2599 } else { 2600 t.Errorf("%d: raft term unexpectedly advanced: %d", i, term) 2601 break 2602 } 2603 } 2604 } 2605 } 2606 } 2607 }() 2608 } 2609 } 2610 2611 // TestRaftAfterRemoveRange verifies that the raft state removes 2612 // a remote node correctly after the Replica was removed from the Store. 2613 func TestRaftAfterRemoveRange(t *testing.T) { 2614 defer leaktest.AfterTest(t)() 2615 storeCfg := kvserver.TestStoreConfig(nil /* clock */) 2616 storeCfg.TestingKnobs.DisableMergeQueue = true 2617 storeCfg.Clock = nil // manual clock 2618 mtc := &multiTestContext{ 2619 storeConfig: &storeCfg, 2620 } 2621 defer mtc.Stop() 2622 mtc.Start(t, 3) 2623 2624 // Make the split. 2625 splitArgs := adminSplitArgs(roachpb.Key("b")) 2626 if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), splitArgs); err != nil { 2627 t.Fatal(err) 2628 } 2629 2630 const rangeID = roachpb.RangeID(2) 2631 mtc.replicateRange(rangeID, 1, 2) 2632 2633 mtc.unreplicateRange(rangeID, 2) 2634 mtc.unreplicateRange(rangeID, 1) 2635 2636 // Wait for the removal to be processed. 2637 testutils.SucceedsSoon(t, func() error { 2638 for _, s := range mtc.stores[1:] { 2639 _, err := s.GetReplica(rangeID) 2640 if !errors.HasType(err, (*roachpb.RangeNotFoundError)(nil)) { 2641 return errors.Wrapf(err, "range %d not yet removed from %s", rangeID, s) 2642 } 2643 } 2644 return nil 2645 }) 2646 2647 // Test that a coalesced heartbeat is ingested correctly. 
2648 replica1 := roachpb.ReplicaDescriptor{ 2649 ReplicaID: roachpb.ReplicaID(mtc.stores[1].StoreID()), 2650 NodeID: roachpb.NodeID(mtc.stores[1].StoreID()), 2651 StoreID: mtc.stores[1].StoreID(), 2652 } 2653 replica2 := roachpb.ReplicaDescriptor{ 2654 ReplicaID: roachpb.ReplicaID(mtc.stores[2].StoreID()), 2655 NodeID: roachpb.NodeID(mtc.stores[2].StoreID()), 2656 StoreID: mtc.stores[2].StoreID(), 2657 } 2658 mtc.transport.SendAsync(&kvserver.RaftMessageRequest{ 2659 ToReplica: replica1, 2660 FromReplica: replica2, 2661 Heartbeats: []kvserver.RaftHeartbeat{ 2662 { 2663 RangeID: rangeID, 2664 FromReplicaID: replica2.ReplicaID, 2665 ToReplicaID: replica1.ReplicaID, 2666 }, 2667 }, 2668 }, rpc.DefaultClass) 2669 // Execute another replica change to ensure that raft has processed 2670 // the heartbeat just sent. 2671 mtc.replicateRange(roachpb.RangeID(1), 1) 2672 2673 // Expire leases to ensure any remaining intent resolutions can complete. 2674 // TODO(bdarnell): understand why some tests need this. 2675 mtc.advanceClock(context.Background()) 2676 } 2677 2678 // TestRaftRemoveRace adds and removes a replica repeatedly in an attempt to 2679 // reproduce a race (see #1911 and #9037). 2680 func TestRaftRemoveRace(t *testing.T) { 2681 defer leaktest.AfterTest(t)() 2682 mtc := &multiTestContext{} 2683 defer mtc.Stop() 2684 const rangeID = roachpb.RangeID(1) 2685 2686 if !util.RaceEnabled { 2687 mtc.Start(t, 10) 2688 // Up-replicate to a bunch of nodes which stresses a condition where a 2689 // replica created via a preemptive snapshot receives a message for a 2690 // previous incarnation of the replica (i.e. has a smaller replica ID) that 2691 // existed on the same store. 2692 mtc.replicateRange(rangeID, 1, 2, 3, 4, 5, 6, 7, 8, 9) 2693 } else { 2694 // In race builds, running 10 nodes needs more than 1 full CPU 2695 // (due to background gossip and heartbeat overhead), so it can't 2696 // keep up when run under stress with one process per CPU. Run a 2697 // reduced version of this test in race builds. This isn't as 2698 // likely to reproduce the preemptive-snapshot race described in 2699 // the previous comment, but will still have a chance to do so, or 2700 // to find other races. 2701 mtc.Start(t, 3) 2702 mtc.replicateRange(rangeID, 1, 2) 2703 } 2704 2705 for i := 0; i < 10; i++ { 2706 mtc.unreplicateRange(rangeID, 2) 2707 mtc.replicateRange(rangeID, 2) 2708 2709 // Verify the tombstone key does not exist. See #12130. 2710 tombstoneKey := keys.RangeTombstoneKey(rangeID) 2711 var tombstone roachpb.RangeTombstone 2712 if ok, err := storage.MVCCGetProto( 2713 context.Background(), mtc.stores[2].Engine(), tombstoneKey, 2714 hlc.Timestamp{}, &tombstone, storage.MVCCGetOptions{}, 2715 ); err != nil { 2716 t.Fatal(err) 2717 } else if ok { 2718 t.Fatal("tombstone should not exist") 2719 } 2720 } 2721 } 2722 2723 // TestRemovePlaceholderRace adds and removes a replica repeatedly (similar to 2724 // TestRaftRemoveRace) in an attempt to stress the locking around replica 2725 // placeholders. 
2726 func TestRemovePlaceholderRace(t *testing.T) { 2727 defer leaktest.AfterTest(t)() 2728 mtc := &multiTestContext{} 2729 defer mtc.Stop() 2730 mtc.Start(t, 3) 2731 2732 const rangeID = roachpb.RangeID(1) 2733 mtc.replicateRange(rangeID, 1, 2) 2734 2735 repl, err := mtc.stores[0].GetReplica(rangeID) 2736 if err != nil { 2737 t.Fatal(err) 2738 } 2739 ctx := repl.AnnotateCtx(context.Background()) 2740 2741 for i := 0; i < 100; i++ { 2742 for _, action := range []roachpb.ReplicaChangeType{roachpb.REMOVE_REPLICA, roachpb.ADD_REPLICA} { 2743 for { 2744 chgs := roachpb.MakeReplicationChanges(action, roachpb.ReplicationTarget{ 2745 NodeID: mtc.stores[1].Ident.NodeID, 2746 StoreID: mtc.stores[1].Ident.StoreID, 2747 }) 2748 if _, err := repl.ChangeReplicas(ctx, repl.Desc(), kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonUnknown, "", chgs); err != nil { 2749 if kvserver.IsSnapshotError(err) { 2750 continue 2751 } else { 2752 t.Fatal(err) 2753 } 2754 } 2755 break 2756 } 2757 } 2758 } 2759 } 2760 2761 type noConfChangeTestHandler struct { 2762 rangeID roachpb.RangeID 2763 kvserver.RaftMessageHandler 2764 } 2765 2766 func (ncc *noConfChangeTestHandler) HandleRaftRequest( 2767 ctx context.Context, 2768 req *kvserver.RaftMessageRequest, 2769 respStream kvserver.RaftMessageResponseStream, 2770 ) *roachpb.Error { 2771 for i, e := range req.Message.Entries { 2772 if e.Type == raftpb.EntryConfChange { 2773 var cc raftpb.ConfChange 2774 if err := protoutil.Unmarshal(e.Data, &cc); err != nil { 2775 panic(err) 2776 } 2777 var ccCtx kvserver.ConfChangeContext 2778 if err := protoutil.Unmarshal(cc.Context, &ccCtx); err != nil { 2779 panic(err) 2780 } 2781 var command kvserverpb.RaftCommand 2782 if err := protoutil.Unmarshal(ccCtx.Payload, &command); err != nil { 2783 panic(err) 2784 } 2785 if req.RangeID == ncc.rangeID { 2786 if command.ReplicatedEvalResult.ChangeReplicas != nil { 2787 // We found a configuration change headed for our victim range; 2788 // sink it. 2789 req.Message.Entries = req.Message.Entries[:i] 2790 } 2791 } 2792 } 2793 } 2794 return ncc.RaftMessageHandler.HandleRaftRequest(ctx, req, respStream) 2795 } 2796 2797 func (ncc *noConfChangeTestHandler) HandleRaftResponse( 2798 ctx context.Context, resp *kvserver.RaftMessageResponse, 2799 ) error { 2800 switch val := resp.Union.GetValue().(type) { 2801 case *roachpb.Error: 2802 switch val.GetDetail().(type) { 2803 case *roachpb.ReplicaTooOldError: 2804 // We're going to manually GC the replica, so ignore these errors. 2805 return nil 2806 } 2807 } 2808 return ncc.RaftMessageHandler.HandleRaftResponse(ctx, resp) 2809 } 2810 2811 func TestReplicaGCRace(t *testing.T) { 2812 defer leaktest.AfterTest(t)() 2813 2814 mtc := &multiTestContext{} 2815 defer mtc.Stop() 2816 mtc.Start(t, 3) 2817 2818 const rangeID = roachpb.RangeID(1) 2819 mtc.replicateRange(rangeID, 1) 2820 2821 leaderStore := mtc.stores[0] 2822 fromStore := mtc.stores[1] 2823 toStore := mtc.stores[2] 2824 2825 // Prevent the victim replica from processing configuration changes. 2826 mtc.transport.Stop(toStore.Ident.StoreID) 2827 mtc.transport.Listen(toStore.Ident.StoreID, &noConfChangeTestHandler{ 2828 rangeID: rangeID, 2829 RaftMessageHandler: toStore, 2830 }) 2831 2832 repl, err := leaderStore.GetReplica(rangeID) 2833 if err != nil { 2834 t.Fatal(err) 2835 } 2836 ctx := repl.AnnotateCtx(context.Background()) 2837 2838 // Add the victim replica. 
Note that it will receive a snapshot and raft log 2839 // replays, but will not process the configuration change containing the new 2840 // range descriptor, preventing it from learning of the new NextReplicaID. 2841 chgs := roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, roachpb.ReplicationTarget{ 2842 NodeID: toStore.Ident.NodeID, 2843 StoreID: toStore.Ident.StoreID, 2844 }) 2845 if _, err := repl.ChangeReplicas(ctx, repl.Desc(), kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeUnderReplicated, "", chgs); err != nil { 2846 t.Fatal(err) 2847 } 2848 2849 // Craft a heartbeat addressed to the victim replica. Note that this 2850 // heartbeat will be sent after the replica has been GC'ed. 2851 rangeDesc := repl.Desc() 2852 fromReplicaDesc, ok := rangeDesc.GetReplicaDescriptor(fromStore.Ident.StoreID) 2853 if !ok { 2854 t.Fatalf("expected %s to have a replica on %s", rangeDesc, fromStore) 2855 } 2856 toReplicaDesc, ok := rangeDesc.GetReplicaDescriptor(toStore.Ident.StoreID) 2857 if !ok { 2858 t.Fatalf("expected %s to have a replica on %s", rangeDesc, toStore) 2859 } 2860 2861 hbReq := kvserver.RaftMessageRequest{ 2862 FromReplica: fromReplicaDesc, 2863 ToReplica: toReplicaDesc, 2864 Heartbeats: []kvserver.RaftHeartbeat{ 2865 { 2866 RangeID: rangeID, 2867 FromReplicaID: fromReplicaDesc.ReplicaID, 2868 ToReplicaID: toReplicaDesc.ReplicaID, 2869 }, 2870 }, 2871 } 2872 2873 // Wait for the victim's raft log to be non-empty, then configure the heartbeat 2874 // with the raft state. 2875 testutils.SucceedsSoon(t, func() error { 2876 status := repl.RaftStatus() 2877 progressByID := status.Progress 2878 progress, ok := progressByID[uint64(toReplicaDesc.ReplicaID)] 2879 if !ok { 2880 return errors.Errorf("%+v does not yet contain %s", progressByID, toReplicaDesc) 2881 } 2882 if progress.Match == 0 { 2883 return errors.Errorf("%+v has not yet advanced", progress) 2884 } 2885 for i := range hbReq.Heartbeats { 2886 hbReq.Heartbeats[i].Term = status.Term 2887 hbReq.Heartbeats[i].Commit = progress.Match 2888 } 2889 return nil 2890 }) 2891 2892 // Remove the victim replica and manually GC it. 2893 chgs[0].ChangeType = roachpb.REMOVE_REPLICA 2894 if _, err := repl.ChangeReplicas(ctx, repl.Desc(), kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeOverReplicated, "", chgs); err != nil { 2895 t.Fatal(err) 2896 } 2897 2898 { 2899 removedReplica, err := toStore.GetReplica(rangeID) 2900 if err != nil { 2901 t.Fatal(err) 2902 } 2903 if err := toStore.ManualReplicaGC(removedReplica); err != nil { 2904 t.Fatal(err) 2905 } 2906 } 2907 2908 // Create a new transport for store 0. Error responses are passed 2909 // back along the same grpc stream as the request so it's ok that 2910 // there are two (this one and the one actually used by the store). 2911 fromTransport := kvserver.NewRaftTransport(log.AmbientContext{Tracer: mtc.storeConfig.Settings.Tracer}, 2912 cluster.MakeTestingClusterSettings(), 2913 nodedialer.New(mtc.rpcContext, gossip.AddressResolver(fromStore.Gossip())), 2914 nil, /* grpcServer */ 2915 mtc.transportStopper, 2916 ) 2917 errChan := errorChannelTestHandler(make(chan *roachpb.Error, 1)) 2918 fromTransport.Listen(fromStore.StoreID(), errChan) 2919 2920 // Send the heartbeat. Boom. See #11591. 2921 // We have to send this multiple times to protect against 2922 // dropped messages (see #18355). 
2923 sendHeartbeat := func() (sent bool) { 2924 r := hbReq 2925 return fromTransport.SendAsync(&r, rpc.DefaultClass) 2926 } 2927 if sent := sendHeartbeat(); !sent { 2928 t.Fatal("failed to send heartbeat") 2929 } 2930 heartbeatsSent := 1 2931 2932 // The receiver of this message should return an error. If we don't get a 2933 // quick response, assume that the message got dropped and try sending it 2934 // again. 2935 select { 2936 case pErr := <-errChan: 2937 switch pErr.GetDetail().(type) { 2938 case *roachpb.RaftGroupDeletedError: 2939 default: 2940 t.Fatalf("unexpected error type %T: %s", pErr.GetDetail(), pErr) 2941 } 2942 case <-time.After(time.Second): 2943 if heartbeatsSent >= 5 { 2944 t.Fatal("did not get expected error") 2945 } 2946 heartbeatsSent++ 2947 if sent := sendHeartbeat(); !sent { 2948 t.Fatal("failed to send heartbeat") 2949 } 2950 } 2951 } 2952 2953 func requireOnlyAtomicChanges( 2954 t *testing.T, db *sqlutils.SQLRunner, rangeID roachpb.RangeID, repFactor int, start time.Time, 2955 ) { 2956 // From all events pertaining to the given rangeID and post-dating the start time, 2957 // filter out those infos which indicate a (full and incoming) voter count in 2958 // excess of the replication factor. Any rows returned have the full info JSON 2959 // strings in them. 2960 const q = ` 2961 SELECT 2962 "uniqueID", 2963 count(t) AS repfactor, 2964 string_agg(info, e'\\n') AS infos 2965 FROM 2966 [ 2967 SELECT 2968 "uniqueID", 2969 replicas->'node_id' AS n, 2970 COALESCE(replicas->'type', '0') AS t, 2971 info 2972 FROM 2973 system.rangelog, 2974 ROWS FROM ( 2975 jsonb_array_elements( 2976 info::JSONB->'UpdatedDesc'->'internal_replicas' 2977 ) 2978 ) 2979 AS replicas 2980 WHERE 2981 info::JSONB->'UpdatedDesc'->'range_id' = $1::JSONB AND timestamp >= $2 2982 ORDER BY 2983 "timestamp" ASC 2984 ] 2985 WHERE 2986 t IN ('0', '2') 2987 GROUP BY 2988 "uniqueID" 2989 HAVING 2990 count(t) > $3; 2991 ` 2992 matrix := db.QueryStr(t, q, rangeID, start, repFactor) 2993 if len(matrix) > 0 { 2994 t.Fatalf("more than %d voting replicas: %s", repFactor, sqlutils.MatrixToStr(matrix)) 2995 } 2996 } 2997 2998 func TestDecommission(t *testing.T) { 2999 defer leaktest.AfterTest(t)() 3000 3001 if util.RaceEnabled { 3002 // Five nodes is too much to reliably run under testrace with our aggressive 3003 // liveness timings. 3004 t.Skip("skipping under testrace: #39807 and #37811") 3005 } 3006 3007 // This test relies on concurrently waiting for a value to change in the 3008 // underlying engine(s). Since the teeing engine does not respond well to 3009 // value mismatches, whether transient or permanent, skip this test if the 3010 // teeing engine is being used. See 3011 // https://github.com/cockroachdb/cockroach/issues/42656 for more context. 3012 if storage.DefaultStorageEngine == enginepb.EngineTypeTeePebbleRocksDB { 3013 t.Skip("disabled on teeing engine") 3014 } 3015 3016 ctx := context.Background() 3017 tc := testcluster.StartTestCluster(t, 5, base.TestClusterArgs{ 3018 ReplicationMode: base.ReplicationAuto, 3019 }) 3020 defer tc.Stopper().Stop(ctx) 3021 3022 k := tc.ScratchRange(t) 3023 cc, err := tc.Server(0).RPCContext().GRPCDialNode(tc.Server(0).RPCAddr(), 1, rpc.DefaultClass).Connect(ctx) 3024 require.NoError(t, err) 3025 admin := serverpb.NewAdminClient(cc) 3026 // Decommission the first node, which holds most of the leases. 
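	// The DecommissionRequest below omits NodeIDs, which (as the comment above
	// implies) presumably targets the node serving the request, i.e. node 1.
	// The later calls in this test spell the target out explicitly, e.g.:
	//
	//	_, err = admin.Decommission(
	//		ctx, &serverpb.DecommissionRequest{NodeIDs: []roachpb.NodeID{1}, Decommissioning: true},
	//	)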
3027 _, err = admin.Decommission( 3028 ctx, &serverpb.DecommissionRequest{Decommissioning: true}, 3029 ) 3030 require.NoError(t, err) 3031 3032 requireNoReplicas := func(storeID roachpb.StoreID, repFactor int) { 3033 testutils.SucceedsSoon(t, func() error { 3034 desc := tc.LookupRangeOrFatal(t, k) 3035 for _, rDesc := range desc.Replicas().Voters() { 3036 store, err := tc.Servers[int(rDesc.NodeID-1)].Stores().GetStore(rDesc.StoreID) 3037 require.NoError(t, err) 3038 if err := store.ForceReplicationScanAndProcess(); err != nil { 3039 return err 3040 } 3041 } 3042 if sl := desc.Replicas().Filter(func(rDesc roachpb.ReplicaDescriptor) bool { 3043 return rDesc.StoreID == storeID 3044 }); len(sl) > 0 { 3045 return errors.Errorf("still a replica on s%d: %s", storeID, &desc) 3046 } 3047 if len(desc.Replicas().Voters()) != repFactor { 3048 return errors.Errorf("expected %d replicas: %s", repFactor, &desc) 3049 } 3050 return nil 3051 }) 3052 } 3053 3054 const triplicated = 3 3055 3056 requireNoReplicas(1, triplicated) 3057 3058 runner := sqlutils.MakeSQLRunner(tc.ServerConn(0)) 3059 ts := timeutil.Now() 3060 3061 _, err = admin.Decommission( 3062 ctx, &serverpb.DecommissionRequest{NodeIDs: []roachpb.NodeID{2}, Decommissioning: true}, 3063 ) 3064 require.NoError(t, err) 3065 3066 // Both s1 and s2 are out, so neither ought to have replicas. 3067 requireNoReplicas(1, triplicated) 3068 requireNoReplicas(2, triplicated) 3069 3070 // Going from three replicas to three replicas should have used atomic swaps 3071 // only. We didn't verify this before the first decommissioning op because 3072 // lots of ranges were over-replicated due to ranges recently having split 3073 // off from the five-fold replicated system ranges. 3074 requireOnlyAtomicChanges(t, runner, tc.LookupRangeOrFatal(t, k).RangeID, triplicated, ts) 3075 3076 sqlutils.SetZoneConfig(t, runner, "RANGE default", "num_replicas: 1") 3077 3078 const single = 1 3079 3080 // The range should drop down to one replica on a non-decommissioning store. 3081 requireNoReplicas(1, single) 3082 requireNoReplicas(2, single) 3083 3084 // Decommission two more nodes. Only n5 is left; getting the replicas there 3085 // can't use atomic replica swaps because the leaseholder can't be removed. 3086 _, err = admin.Decommission( 3087 ctx, &serverpb.DecommissionRequest{NodeIDs: []roachpb.NodeID{3, 4}, Decommissioning: true}, 3088 ) 3089 require.NoError(t, err) 3090 3091 requireNoReplicas(1, single) 3092 requireNoReplicas(2, single) 3093 requireNoReplicas(3, single) 3094 requireNoReplicas(4, single) 3095 } 3096 3097 // TestReplicateRogueRemovedNode ensures that a rogue removed node 3098 // (i.e. a node that has been removed from the range but doesn't know 3099 // it yet because it was down or partitioned away when it happened) 3100 // cannot cause other removed nodes to recreate their ranges. 3101 func TestReplicateRogueRemovedNode(t *testing.T) { 3102 defer leaktest.AfterTest(t)() 3103 3104 sc := kvserver.TestStoreConfig(nil) 3105 // Newly-started stores (including the "rogue" one) should not GC 3106 // their replicas. We'll turn this back on when needed. 3107 sc.TestingKnobs.DisableReplicaGCQueue = true 3108 sc.Clock = nil // manual clock 3109 mtc := &multiTestContext{ 3110 storeConfig: &sc, 3111 // This test was written before the multiTestContext started creating many 3112 // system ranges at startup, and hasn't been update to take that into 3113 // account. 
3114 startWithSingleRange: true, 3115 } 3116 defer mtc.Stop() 3117 mtc.Start(t, 3) 3118 3119 // We're going to set up the cluster with partitioning so that we can 3120 // partition node 0 from the others. The partition is not initially active. 3121 partRange, err := setupPartitionedRange(mtc, 1, 0, 0, false /* activated */, unreliableRaftHandlerFuncs{}) 3122 require.NoError(t, err) 3123 // First put the range on all three nodes. 3124 raftID := roachpb.RangeID(1) 3125 mtc.replicateRange(raftID, 1, 2) 3126 3127 // Put some data in the range so we'll have something to test for. 3128 incArgs := incrementArgs([]byte("a"), 5) 3129 if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil { 3130 t.Fatal(err) 3131 } 3132 3133 // Wait for all nodes to catch up. 3134 mtc.waitForValues(roachpb.Key("a"), []int64{5, 5, 5}) 3135 3136 // Stop node 2; while it is down remove the range from nodes 2 and 1. 3137 mtc.stopStore(2) 3138 mtc.unreplicateRange(raftID, 2) 3139 mtc.unreplicateRange(raftID, 1) 3140 3141 // Make a write on node 0; this will not be replicated because 0 is the only node left. 3142 incArgs = incrementArgs([]byte("a"), 11) 3143 if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil { 3144 t.Fatal(err) 3145 } 3146 3147 // Wait for the replica to be GC'd on node 1. 3148 // Store 0 has two writes, 1 has erased everything, and 2 still has the first write. 3149 // A single pass of ForceReplicaGCScanAndProcess is not enough, since the replica 3150 // may be recreated by a stray raft message, so we run the GC scan inside the loop. 3151 // TODO(bdarnell): if the call to RemoveReplica in replicaGCQueue.process can be 3152 // moved under the lock, then the GC scan can be moved out of this loop. 3153 mtc.stores[1].SetReplicaGCQueueActive(true) 3154 testutils.SucceedsSoon(t, func() error { 3155 mtc.advanceClock(context.Background()) 3156 mtc.manualClock.Increment(int64( 3157 kvserver.ReplicaGCQueueInactivityThreshold) + 1) 3158 mtc.stores[1].MustForceReplicaGCScanAndProcess() 3159 3160 actual := mtc.readIntFromEngines(roachpb.Key("a")) 3161 expected := []int64{16, 0, 5} 3162 if !reflect.DeepEqual(expected, actual) { 3163 return errors.Errorf("expected %v, got %v", expected, actual) 3164 } 3165 return nil 3166 }) 3167 // Partition nodes 1 and 2 from node 0. Otherwise they'd get a 3168 // ReplicaTooOldError from node 0 and proceed to remove themselves. 3169 partRange.activate() 3170 // Bring node 2 back up. 3171 mtc.restartStore(2) 3172 3173 // Try to issue a command on node 2. It should not be able to commit 3174 // (so we add it asynchronously). 3175 var startWG sync.WaitGroup 3176 startWG.Add(1) 3177 var finishWG sync.WaitGroup 3178 finishWG.Add(1) 3179 3180 rep, err := mtc.stores[2].GetReplica(raftID) 3181 if err != nil { 3182 t.Fatal(err) 3183 } 3184 replicaDesc, ok := rep.Desc().GetReplicaDescriptor(mtc.stores[2].StoreID()) 3185 if !ok { 3186 t.Fatalf("ReplicaID %d not found", raftID) 3187 } 3188 go func() { 3189 incArgs := incrementArgs([]byte("a"), 23) 3190 startWG.Done() 3191 defer finishWG.Done() 3192 _, pErr := kv.SendWrappedWith( 3193 context.Background(), 3194 mtc.stores[2], 3195 roachpb.Header{ 3196 Replica: replicaDesc, 3197 Timestamp: mtc.stores[2].Clock().Now(), 3198 }, incArgs, 3199 ) 3200 if _, ok := pErr.GetDetail().(*roachpb.RangeNotFoundError); !ok { 3201 // We're on a goroutine and passing the error out is awkward since 3202 // it would only surface at shutdown time. 
A panic ought to be good 3203 // enough to get visibility. 3204 panic(fmt.Sprintf("unexpected error: %v", pErr)) 3205 } 3206 }() 3207 startWG.Wait() 3208 3209 // Sleep a bit to let the command proposed on node 2 proceed if it's 3210 // going to. Prior to the introduction of replica tombstones, this 3211 // would lead to split-brain: Node 2 would wake up node 1 and they 3212 // would form a quorum, even though node 0 had removed them both. 3213 // Now the tombstone on node 1 prevents it from rejoining the rogue 3214 // copy of the group. 3215 time.Sleep(100 * time.Millisecond) 3216 testutils.SucceedsSoon(t, func() error { 3217 actual := mtc.readIntFromEngines(roachpb.Key("a")) 3218 // Normally, replica GC has not happened yet on store 2, so we 3219 // expect {16, 0, 5}. However, it is possible (on a 3220 // slow/overloaded machine) for the end of the ChangeReplicas 3221 // transaction to be queued up inside the raft transport for long 3222 // enough that it doesn't arrive until after store 2 has been 3223 // restarted, so it is able to trigger an early GC on the 3224 // restarted node, resulting in {16, 0, 0}. 3225 // TODO(bdarnell): When #5789 is fixed, the probabilities flip and 3226 // {16, 0, 0} becomes the expected case. When this happens 3227 // we should just combine this check with the following one. 3228 expected1 := []int64{16, 0, 5} 3229 expected2 := []int64{16, 0, 0} 3230 if !reflect.DeepEqual(expected1, actual) && !reflect.DeepEqual(expected2, actual) { 3231 return errors.Errorf("expected %v or %v, got %v", expected1, expected2, actual) 3232 } 3233 return nil 3234 }) 3235 3236 // Run garbage collection on node 2. The lack of an active lease holder 3237 // lease will cause GC to do a consistent range lookup, where it 3238 // will see that the range has been moved and delete the old 3239 // replica. 3240 mtc.stores[2].SetReplicaGCQueueActive(true) 3241 mtc.advanceClock(context.Background()) 3242 mtc.manualClock.Increment(int64( 3243 kvserver.ReplicaGCQueueInactivityThreshold) + 1) 3244 mtc.stores[2].MustForceReplicaGCScanAndProcess() 3245 mtc.waitForValues(roachpb.Key("a"), []int64{16, 0, 0}) 3246 3247 // Now that the group has been GC'd, the goroutine that was 3248 // attempting to write has finished (with an error). 3249 finishWG.Wait() 3250 } 3251 3252 type errorChannelTestHandler chan *roachpb.Error 3253 3254 func (errorChannelTestHandler) HandleRaftRequest( 3255 _ context.Context, _ *kvserver.RaftMessageRequest, _ kvserver.RaftMessageResponseStream, 3256 ) *roachpb.Error { 3257 panic("unimplemented") 3258 } 3259 3260 func (d errorChannelTestHandler) HandleRaftResponse( 3261 ctx context.Context, resp *kvserver.RaftMessageResponse, 3262 ) error { 3263 switch val := resp.Union.GetValue().(type) { 3264 case *roachpb.Error: 3265 d <- val 3266 default: 3267 log.Fatalf(ctx, "unexpected response type %T", val) 3268 } 3269 return nil 3270 } 3271 3272 func (errorChannelTestHandler) HandleSnapshot( 3273 _ *kvserver.SnapshotRequest_Header, _ kvserver.SnapshotResponseStream, 3274 ) error { 3275 panic("unimplemented") 3276 } 3277 3278 // This test simulates a scenario where one replica has been removed from the 3279 // range's Raft group but it is unaware of the fact. We check that this replica 3280 // coming back from the dead cannot cause elections. 
3281 func TestReplicateRemovedNodeDisruptiveElection(t *testing.T) { 3282 defer leaktest.AfterTest(t)() 3283 3284 mtc := &multiTestContext{ 3285 // This test was written before the multiTestContext started creating many 3286 // system ranges at startup, and hasn't been update to take that into 3287 // account. 3288 startWithSingleRange: true, 3289 } 3290 defer mtc.Stop() 3291 mtc.Start(t, 4) 3292 3293 // Move the first range from the first node to the other three. 3294 const rangeID = roachpb.RangeID(1) 3295 mtc.replicateRange(rangeID, 1, 2, 3) 3296 mtc.transferLease(context.Background(), rangeID, 0, 1) 3297 mtc.unreplicateRange(rangeID, 0) 3298 3299 // Ensure that we have a stable lease and raft leader so we can tell if the 3300 // removed node causes a disruption. This is a three-step process. 3301 3302 // 1. Write on the second node, to ensure that a lease has been 3303 // established after the first node's removal. 3304 key := roachpb.Key("a") 3305 value := int64(5) 3306 incArgs := incrementArgs(key, value) 3307 if _, err := kv.SendWrapped(context.Background(), mtc.distSenders[1], incArgs); err != nil { 3308 t.Fatal(err) 3309 } 3310 3311 // 2. Wait for all nodes to process the increment (and therefore the 3312 // new lease). 3313 mtc.waitForValues(key, []int64{0, value, value, value}) 3314 3315 // 3. Wait for the lease holder to obtain raft leadership too. 3316 testutils.SucceedsSoon(t, func() error { 3317 req := &roachpb.LeaseInfoRequest{ 3318 RequestHeader: roachpb.RequestHeader{ 3319 Key: roachpb.KeyMin, 3320 }, 3321 } 3322 reply, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[1], req) 3323 if pErr != nil { 3324 return pErr.GoError() 3325 } 3326 leaseReplica := reply.(*roachpb.LeaseInfoResponse).Lease.Replica.ReplicaID 3327 leadReplica := roachpb.ReplicaID(mtc.stores[1].RaftStatus(rangeID).Lead) 3328 if leaseReplica != leadReplica { 3329 return errors.Errorf("leaseReplica %s does not match leadReplica %s", 3330 leaseReplica, leadReplica) 3331 } 3332 3333 return nil 3334 }) 3335 3336 // Save the current term, which is the latest among the live stores. 3337 findTerm := func() uint64 { 3338 var term uint64 3339 for i := 1; i < 4; i++ { 3340 s := mtc.stores[i].RaftStatus(rangeID) 3341 if s.Term > term { 3342 term = s.Term 3343 } 3344 } 3345 return term 3346 } 3347 term := findTerm() 3348 if term == 0 { 3349 t.Fatalf("expected non-zero term") 3350 } 3351 3352 // replica0 is the one that has been removed; replica1 is a current 3353 // member of the group. 3354 replica0 := roachpb.ReplicaDescriptor{ 3355 ReplicaID: roachpb.ReplicaID(mtc.stores[0].StoreID()), 3356 NodeID: roachpb.NodeID(mtc.stores[0].StoreID()), 3357 StoreID: mtc.stores[0].StoreID(), 3358 } 3359 replica1 := roachpb.ReplicaDescriptor{ 3360 ReplicaID: roachpb.ReplicaID(mtc.stores[1].StoreID()), 3361 NodeID: roachpb.NodeID(mtc.stores[1].StoreID()), 3362 StoreID: mtc.stores[1].StoreID(), 3363 } 3364 3365 // Create a new transport for store 0 so that we can intercept the responses. 3366 // Error responses are passed back along the same grpc stream as the request 3367 // so it's ok that there are two (this one and the one actually used by the 3368 // store). 
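The interception setup described in the comment above is worth spelling out: the second transport exists only so that a trivial handler (errorChannelTestHandler, defined earlier) can forward whatever error response the remote store sends back into a buffered channel the test can select on. A hedged sketch of the consuming side, factored into a helper; the helper name is ours and does not exist in this file:

// expectErrorDetail reads one response error from errCh and fails the test
// unless its detail has the same concrete type as want. Illustrative helper
// only; the test below performs the equivalent select inline.
func expectErrorDetail(t *testing.T, errCh chan *roachpb.Error, want interface{}) {
	t.Helper()
	select {
	case pErr := <-errCh:
		if reflect.TypeOf(pErr.GetDetail()) != reflect.TypeOf(want) {
			t.Fatalf("unexpected error type %T: %s", pErr.GetDetail(), pErr)
		}
	case <-time.After(45 * time.Second):
		t.Fatalf("did not receive a %T response", want)
	}
}

With it, the select further down would read roughly expectErrorDetail(t, errChan, (*roachpb.ReplicaTooOldError)(nil)).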
3369 transport0 := kvserver.NewRaftTransport(log.AmbientContext{Tracer: mtc.storeConfig.Settings.Tracer}, 3370 cluster.MakeTestingClusterSettings(), 3371 nodedialer.New(mtc.rpcContext, gossip.AddressResolver(mtc.gossips[0])), 3372 nil, /* grpcServer */ 3373 mtc.transportStopper, 3374 ) 3375 errChan := errorChannelTestHandler(make(chan *roachpb.Error, 1)) 3376 transport0.Listen(mtc.stores[0].StoreID(), errChan) 3377 3378 // Simulate the removed node asking to trigger an election. Try and try again 3379 // until we're reasonably sure the message was sent. 3380 for !transport0.SendAsync(&kvserver.RaftMessageRequest{ 3381 RangeID: rangeID, 3382 ToReplica: replica1, 3383 FromReplica: replica0, 3384 Message: raftpb.Message{ 3385 From: uint64(replica0.ReplicaID), 3386 To: uint64(replica1.ReplicaID), 3387 Type: raftpb.MsgVote, 3388 Term: term + 1, 3389 }, 3390 }, rpc.DefaultClass) { 3391 } 3392 3393 // The receiver of this message (i.e. replica1) should return an error telling 3394 // the sender that it's no longer part of the group. 3395 select { 3396 case pErr := <-errChan: 3397 switch pErr.GetDetail().(type) { 3398 case *roachpb.ReplicaTooOldError: 3399 default: 3400 t.Fatalf("unexpected error type %T: %s", pErr.GetDetail(), pErr) 3401 } 3402 case <-time.After(45 * time.Second): 3403 t.Fatal("did not get expected ReplicaTooOldError error") 3404 } 3405 3406 // The message should have been discarded without triggering an 3407 // election or changing the term. 3408 newTerm := findTerm() 3409 if term != newTerm { 3410 t.Errorf("expected term to be constant, but changed from %v to %v", term, newTerm) 3411 } 3412 } 3413 3414 func TestReplicaTooOldGC(t *testing.T) { 3415 defer leaktest.AfterTest(t)() 3416 3417 sc := kvserver.TestStoreConfig(nil) 3418 sc.TestingKnobs.DisableScanner = true 3419 mtc := &multiTestContext{ 3420 storeConfig: &sc, 3421 // This test was written before the multiTestContext started creating many 3422 // system ranges at startup, and hasn't been update to take that into 3423 // account. 3424 startWithSingleRange: true, 3425 } 3426 defer mtc.Stop() 3427 mtc.Start(t, 4) 3428 3429 // Replicate the first range onto all of the nodes. 3430 const rangeID = 1 3431 mtc.replicateRange(rangeID, 1, 2, 3) 3432 3433 // Put some data in the range so we'll have something to test for. 3434 incArgs := incrementArgs([]byte("a"), 5) 3435 if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil { 3436 t.Fatal(err) 3437 } 3438 // Wait for all nodes to catch up. 3439 mtc.waitForValues(roachpb.Key("a"), []int64{5, 5, 5, 5}) 3440 3441 // Verify store 3 has the replica. 3442 if _, err := mtc.stores[3].GetReplica(rangeID); err != nil { 3443 t.Fatal(err) 3444 } 3445 3446 // Stop node 3; while it is down remove the range from it. Since the node is 3447 // down it won't see the removal and won't clean up its replica. 3448 mtc.stopStore(3) 3449 mtc.unreplicateRange(rangeID, 3) 3450 3451 // Perform another write. 3452 incArgs = incrementArgs([]byte("a"), 11) 3453 if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil { 3454 t.Fatal(err) 3455 } 3456 mtc.waitForValues(roachpb.Key("a"), []int64{16, 16, 16, 5}) 3457 3458 // Wait for a bunch of raft ticks in order to flush any heartbeats through 3459 // the system. In particular, a coalesced heartbeat containing a quiesce 3460 // message could have been sent before the node was removed from range but 3461 // arrive after the node restarted. 
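The wait just below (and the similar loops later in this file) all follow one idiom: snapshot the store's cumulative Raft tick counter, then poll until it has advanced by a fixed amount. Factored out, with a name of our own choosing, the idiom is:

// waitForRaftTicks blocks until count (a cumulative tick counter such as
// store.Metrics().RaftTicks.Count) has advanced by at least n. Hypothetical
// helper, shown only to name the pattern used inline below.
func waitForRaftTicks(count func() int64, n int64) {
	for target := count() + n; count() < target; {
		time.Sleep(time.Millisecond)
	}
}

The loop that follows is then just waitForRaftTicks(mtc.stores[0].Metrics().RaftTicks.Count, 5).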
3462 ticks := mtc.stores[0].Metrics().RaftTicks.Count 3463 for targetTicks := ticks() + 5; ticks() < targetTicks; { 3464 time.Sleep(time.Millisecond) 3465 } 3466 3467 // Restart node 3. The removed replica will start talking to the other 3468 // replicas and determine it needs to be GC'd. 3469 mtc.restartStore(3) 3470 3471 // Because we lazily initialize Raft groups, we have to force the Raft group 3472 // to get created in order to get the replica talking to the other replicas. 3473 mtc.stores[3].EnqueueRaftUpdateCheck(rangeID) 3474 3475 testutils.SucceedsSoon(t, func() error { 3476 replica, err := mtc.stores[3].GetReplica(rangeID) 3477 if err != nil { 3478 if errors.HasType(err, (*roachpb.RangeNotFoundError)(nil)) { 3479 return nil 3480 } 3481 return err 3482 } else if replica != nil { 3483 // Make sure the replica is unquiesced so that it will tick and 3484 // contact the leader to discover it's no longer part of the range. 3485 replica.UnquiesceAndWakeLeader() 3486 } 3487 return errors.Errorf("found %s, waiting for it to be GC'd", replica) 3488 }) 3489 } 3490 3491 func TestReplicaLazyLoad(t *testing.T) { 3492 defer leaktest.AfterTest(t)() 3493 3494 sc := kvserver.TestStoreConfig(nil) 3495 sc.RaftTickInterval = 10 * time.Millisecond // safe because there is only a single node 3496 sc.TestingKnobs.DisableScanner = true 3497 sc.TestingKnobs.DisablePeriodicGossips = true 3498 sc.TestingKnobs.DisableMergeQueue = true 3499 mtc := &multiTestContext{ 3500 storeConfig: &sc, 3501 // This test was written before the multiTestContext started creating many 3502 // system ranges at startup, and hasn't been update to take that into 3503 // account. 3504 startWithSingleRange: true, 3505 } 3506 defer mtc.Stop() 3507 mtc.Start(t, 1) 3508 3509 // Split so we can rely on RHS range being quiescent after a restart. 3510 // We use UserTableDataMin to avoid having the range activated to 3511 // gossip system table data. 3512 splitKey := keys.UserTableDataMin 3513 splitArgs := adminSplitArgs(splitKey) 3514 if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), splitArgs); err != nil { 3515 t.Fatal(err) 3516 } 3517 3518 mtc.stopStore(0) 3519 mtc.restartStore(0) 3520 3521 // Wait for a bunch of raft ticks. 3522 ticks := mtc.stores[0].Metrics().RaftTicks.Count 3523 for targetTicks := ticks() + 3; ticks() < targetTicks; { 3524 time.Sleep(time.Millisecond) 3525 } 3526 3527 splitKeyAddr, err := keys.Addr(splitKey) 3528 if err != nil { 3529 t.Fatal(err) 3530 } 3531 3532 replica := mtc.stores[0].LookupReplica(splitKeyAddr) 3533 if replica == nil { 3534 t.Fatalf("lookup replica at key %q returned nil", splitKey) 3535 } 3536 if replica.RaftStatus() != nil { 3537 t.Fatalf("expected replica Raft group to be uninitialized") 3538 } 3539 } 3540 3541 func TestReplicateReAddAfterDown(t *testing.T) { 3542 defer leaktest.AfterTest(t)() 3543 3544 mtc := &multiTestContext{ 3545 // This test was written before the multiTestContext started creating many 3546 // system ranges at startup, and hasn't been update to take that into 3547 // account. 3548 startWithSingleRange: true, 3549 } 3550 defer mtc.Stop() 3551 mtc.Start(t, 3) 3552 3553 downedStoreIdx := 2 3554 3555 // First put the range on all three nodes. 3556 raftID := roachpb.RangeID(1) 3557 mtc.replicateRange(raftID, 1, 2) 3558 3559 // Put some data in the range so we'll have something to test for. 
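The GC wait above, like most assertions in this file that race against background work, goes through testutils.SucceedsSoon: the closure is retried until it returns nil, and the test fails if that never happens within the helper's overall timeout. As a mental model only (the real helper retries under a longer default timeout and with its own backoff policy), it behaves roughly like this simplified stand-in:

// succeedsSoonSketch is a simplified stand-in for testutils.SucceedsSoon,
// shown only to make the retry semantics explicit. It polls fn until it
// returns nil or maxWait elapses.
func succeedsSoonSketch(t *testing.T, maxWait time.Duration, fn func() error) {
	t.Helper()
	deadline := timeutil.Now().Add(maxWait)
	for {
		err := fn()
		if err == nil {
			return
		}
		if timeutil.Now().After(deadline) {
			t.Fatalf("condition failed to hold within %s: %v", maxWait, err)
		}
		time.Sleep(10 * time.Millisecond)
	}
}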
3560 incArgs := incrementArgs([]byte("a"), 5) 3561 if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil { 3562 t.Fatal(err) 3563 } 3564 3565 // Wait for all nodes to catch up. 3566 mtc.waitForValues(roachpb.Key("a"), []int64{5, 5, 5}) 3567 3568 // Stop node 2; while it is down remove the range from it. Since the node is 3569 // down it won't see the removal and clean up its replica. 3570 mtc.stopStore(downedStoreIdx) 3571 mtc.unreplicateRange(raftID, 2) 3572 3573 // Perform another write. 3574 incArgs = incrementArgs([]byte("a"), 11) 3575 if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil { 3576 t.Fatal(err) 3577 } 3578 mtc.waitForValues(roachpb.Key("a"), []int64{16, 16, 5}) 3579 3580 // Bring it back up and re-add the range. There is a race when the 3581 // store applies its removal and re-addition back to back: the 3582 // replica may or may not have (asynchronously) garbage collected 3583 // its data in between. Whether the existing data is reused or the 3584 // replica gets recreated, the replica ID is changed by this 3585 // process. An ill-timed GC has been known to cause bugs including 3586 // https://github.com/cockroachdb/cockroach/issues/2873. 3587 mtc.restartStore(downedStoreIdx) 3588 mtc.replicateRange(raftID, downedStoreIdx) 3589 3590 // The range should be synced back up. 3591 mtc.waitForValues(roachpb.Key("a"), []int64{16, 16, 16}) 3592 } 3593 3594 // TestLeaseHolderRemoveSelf verifies that a lease holder cannot remove itself 3595 // without encountering an error. 3596 func TestLeaseHolderRemoveSelf(t *testing.T) { 3597 defer leaktest.AfterTest(t)() 3598 3599 mtc := &multiTestContext{} 3600 defer mtc.Stop() 3601 mtc.Start(t, 2) 3602 3603 leaseHolder := mtc.stores[0] 3604 3605 raftID := roachpb.RangeID(1) 3606 mtc.replicateRange(raftID, 1) 3607 3608 // Attempt to remove the replica from first store. 3609 expectedErr := "invalid ChangeReplicasTrigger" 3610 if err := mtc.unreplicateRangeNonFatal(raftID, 0); !testutils.IsError(err, expectedErr) { 3611 t.Fatalf("expected %q error trying to remove leaseholder replica; got %v", expectedErr, err) 3612 } 3613 3614 // Expect that we can still successfully do a get on the range. 3615 getArgs := getArgs([]byte("a")) 3616 _, pErr := kv.SendWrappedWith(context.Background(), leaseHolder.TestSender(), roachpb.Header{}, getArgs) 3617 if pErr != nil { 3618 t.Fatal(pErr) 3619 } 3620 } 3621 3622 // TestRemovedReplicaError verifies that a replica that has been removed from a 3623 // range returns a RangeNotFoundError if it receives a request for that range 3624 // (not RaftGroupDeletedError, and even before the ReplicaGCQueue has run). 3625 func TestRemovedReplicaError(t *testing.T) { 3626 defer leaktest.AfterTest(t)() 3627 3628 mtc := &multiTestContext{ 3629 // This test was written before the multiTestContext started creating many 3630 // system ranges at startup, and hasn't been update to take that into 3631 // account. 3632 startWithSingleRange: true, 3633 } 3634 defer mtc.Stop() 3635 mtc.Start(t, 2) 3636 3637 // Disable the replica GC queues. This verifies that the replica is 3638 // considered removed even before the gc queue has run, and also 3639 // helps avoid a deadlock at shutdown. 
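The doc comment above promises a RangeNotFoundError, but the check that follows has to tolerate two transient outcomes first (an ambiguous result or a not-leaseholder response) because the removal goes through a demotion and only becomes visible to the removed store after a short delay. Pulled out of the retry loop, the classification could be written as the helper below; the name is hypothetical:

// classifyRemovedReplicaErr is a hypothetical refactoring of the error
// handling in the SucceedsSoon loop below: transient errors keep the retry
// going, RangeNotFoundError is the success condition, anything else fails.
func classifyRemovedReplicaErr(pErr *roachpb.Error) (done bool, retryErr error) {
	if pErr == nil {
		return false, errors.New("expected an error, got a successful response")
	}
	switch pErr.GetDetail().(type) {
	case *roachpb.AmbiguousResultError, *roachpb.NotLeaseHolderError:
		return false, pErr.GoError()
	case *roachpb.RangeNotFoundError:
		return true, nil
	default:
		return false, errors.Errorf("unexpected error: %v", pErr)
	}
}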
3640 mtc.stores[0].SetReplicaGCQueueActive(false) 3641 3642 raftID := roachpb.RangeID(1) 3643 mtc.replicateRange(raftID, 1) 3644 mtc.transferLease(context.Background(), raftID, 0, 1) 3645 mtc.unreplicateRange(raftID, 0) 3646 3647 mtc.manualClock.Increment(mtc.storeConfig.LeaseExpiration()) 3648 3649 // Expect to get a RangeNotFoundError. We have to allow for ambiguous result 3650 // errors to avoid the occasional test flake. Since we use demotions to remove 3651 // voters, the actual removal sees a learner, and so the learner is not in 3652 // the commit quorum for the removal itself. That is to say, we will only 3653 // start seeing the RangeNotFoundError after a little bit of time has passed. 3654 getArgs := getArgs([]byte("a")) 3655 testutils.SucceedsSoon(t, func() error { 3656 _, pErr := kv.SendWrappedWith(context.Background(), mtc.stores[0].TestSender(), roachpb.Header{}, getArgs) 3657 switch pErr.GetDetail().(type) { 3658 case *roachpb.AmbiguousResultError: 3659 return pErr.GoError() 3660 case *roachpb.NotLeaseHolderError: 3661 return pErr.GoError() 3662 case *roachpb.RangeNotFoundError: 3663 return nil 3664 default: 3665 } 3666 t.Fatal(pErr) 3667 return errors.New("unreachable") 3668 }) 3669 } 3670 3671 func TestTransferRaftLeadership(t *testing.T) { 3672 defer leaktest.AfterTest(t)() 3673 3674 const numStores = 3 3675 sc := kvserver.TestStoreConfig(nil) 3676 sc.TestingKnobs.DisableMergeQueue = true 3677 // Suppress timeout-based elections (which also includes a previous 3678 // leader stepping down due to a quorum check). Running tests on a 3679 // heavily loaded CPU is enough to reach the raft election timeout 3680 // and cause leadership to change hands in ways this test doesn't 3681 // expect. 3682 sc.RaftElectionTimeoutTicks = 100000 3683 // This test can rapidly advance the clock via mtc.advanceClock(), 3684 // which could lead the replication queue to consider a store dead 3685 // and remove a replica in the middle of the test. Disable the 3686 // replication queue; we'll control replication manually. 3687 sc.TestingKnobs.DisableReplicateQueue = true 3688 sc.Clock = nil // manual clock 3689 mtc := &multiTestContext{ 3690 storeConfig: &sc, 3691 // This test was written before the multiTestContext started creating many 3692 // system ranges at startup, and hasn't been update to take that into 3693 // account. 3694 startWithSingleRange: true, 3695 } 3696 defer mtc.Stop() 3697 mtc.Start(t, numStores) 3698 store0 := mtc.Store(0) 3699 store1 := mtc.Store(1) 3700 3701 key := roachpb.Key("a") 3702 3703 { 3704 // Split off a range to avoid interacting with the initial splits. 
3705 splitArgs := adminSplitArgs(key) 3706 if _, err := kv.SendWrapped(context.Background(), mtc.distSenders[0], splitArgs); err != nil { 3707 t.Fatal(err) 3708 } 3709 } 3710 3711 repl0 := store0.LookupReplica(keys.MustAddr(key)) 3712 if repl0 == nil { 3713 t.Fatalf("no replica found for key '%s'", key) 3714 } 3715 rd0, err := repl0.GetReplicaDescriptor() 3716 if err != nil { 3717 t.Fatal(err) 3718 } 3719 mtc.replicateRange(repl0.RangeID, 1, 2) 3720 3721 repl1 := store1.LookupReplica(keys.MustAddr(key)) 3722 if repl1 == nil { 3723 t.Fatalf("no replica found for key '%s'", key) 3724 } 3725 rd1, err := repl1.GetReplicaDescriptor() 3726 if err != nil { 3727 t.Fatal(err) 3728 } 3729 3730 getArgs := getArgs([]byte("a")) 3731 if _, pErr := kv.SendWrappedWith( 3732 context.Background(), store0, roachpb.Header{RangeID: repl0.RangeID}, getArgs, 3733 ); pErr != nil { 3734 t.Fatalf("expect get nil, actual get %v ", pErr) 3735 } 3736 3737 status := repl0.RaftStatus() 3738 if status == nil || status.Lead != uint64(rd0.ReplicaID) { 3739 t.Fatalf("raft leader should be %d, but got status %+v", rd0.ReplicaID, status) 3740 } 3741 3742 // Force a read on Store 2 to request a new lease. Other moving parts in 3743 // the system could have requested another lease as well, so we 3744 // expire-request in a loop until we get our foot in the door. 3745 origCount0 := store0.Metrics().RangeRaftLeaderTransfers.Count() 3746 for { 3747 mtc.advanceClock(context.Background()) 3748 if _, pErr := kv.SendWrappedWith( 3749 context.Background(), store1, roachpb.Header{RangeID: repl0.RangeID}, getArgs, 3750 ); pErr == nil { 3751 break 3752 } else { 3753 switch pErr.GetDetail().(type) { 3754 case *roachpb.NotLeaseHolderError, *roachpb.RangeNotFoundError: 3755 default: 3756 t.Fatal(pErr) 3757 } 3758 } 3759 } 3760 // Verify lease is transferred. 3761 testutils.SucceedsSoon(t, func() error { 3762 if a, e := repl0.RaftStatus().Lead, uint64(rd1.ReplicaID); a != e { 3763 return errors.Errorf("expected raft leader be %d; got %d", e, a) 3764 } 3765 if a, e := store0.Metrics().RangeRaftLeaderTransfers.Count()-origCount0, int64(1); a < e { 3766 return errors.Errorf("expected raft leader transfer count >= %d; got %d", e, a) 3767 } 3768 return nil 3769 }) 3770 } 3771 3772 // Test that a single blocked replica does not block other replicas. 3773 func TestRaftBlockedReplica(t *testing.T) { 3774 defer leaktest.AfterTest(t)() 3775 3776 sc := kvserver.TestStoreConfig(nil) 3777 sc.TestingKnobs.DisableMergeQueue = true 3778 sc.TestingKnobs.DisableScanner = true 3779 mtc := &multiTestContext{ 3780 storeConfig: &sc, 3781 // This test was written before the multiTestContext started creating many 3782 // system ranges at startup, and hasn't been update to take that into 3783 // account. 3784 startWithSingleRange: true, 3785 } 3786 defer mtc.Stop() 3787 mtc.Start(t, 3) 3788 3789 // Create 2 ranges by splitting range 1. 3790 splitArgs := adminSplitArgs(roachpb.Key("b")) 3791 if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), splitArgs); err != nil { 3792 t.Fatal(err) 3793 } 3794 3795 // Replicate range 1 to all 3 nodes. This ensures the usage of the network. 3796 mtc.replicateRange(1, 1, 2) 3797 3798 // Lock range 2 for raft processing. 3799 rep, err := mtc.stores[0].GetReplica(2) 3800 if err != nil { 3801 t.Fatal(err) 3802 } 3803 3804 // NB: We perform the actual locking on a different goroutine in order to 3805 // workaround a spurious inconsistent lock order warning when running with 3806 // TAGS=deadlock. 
The issue is that we're grabbing Replica 2's raftMu and 3807 // then later Replica 1's from the same goroutine due to the direct calling 3808 // of client.SendWrapped down the callstack into the Replica code (via the 3809 // local RPC optimization). 3810 var wg sync.WaitGroup 3811 wg.Add(1) 3812 go func() { 3813 rep.RaftLock() 3814 wg.Done() 3815 }() 3816 wg.Wait() 3817 defer rep.RaftUnlock() 3818 3819 // Verify that we're still ticking the non-blocked replica. 3820 ticks := mtc.stores[0].Metrics().RaftTicks.Count 3821 for targetTicks := ticks() + 3; ticks() < targetTicks; { 3822 time.Sleep(time.Millisecond) 3823 } 3824 3825 // Verify we can still perform operations on the non-blocked replica. 3826 incArgs := incrementArgs([]byte("a"), 5) 3827 if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil { 3828 t.Fatal(err) 3829 } 3830 mtc.waitForValues(roachpb.Key("a"), []int64{5, 5, 5}) 3831 } 3832 3833 // Test that ranges quiesce and if a follower unquiesces the leader is woken 3834 // up. 3835 func TestRangeQuiescence(t *testing.T) { 3836 defer leaktest.AfterTest(t)() 3837 3838 sc := kvserver.TestStoreConfig(nil) 3839 sc.TestingKnobs.DisableScanner = true 3840 sc.TestingKnobs.DisablePeriodicGossips = true 3841 mtc := &multiTestContext{ 3842 storeConfig: &sc, 3843 // This test was written before the multiTestContext started creating many 3844 // system ranges at startup, and hasn't been update to take that into 3845 // account. 3846 startWithSingleRange: true, 3847 } 3848 defer mtc.Stop() 3849 mtc.Start(t, 3) 3850 3851 pauseNodeLivenessHeartbeats(mtc, true) 3852 3853 // Replica range 1 to all 3 nodes. 3854 const rangeID = roachpb.RangeID(1) 3855 mtc.replicateRange(rangeID, 1, 2) 3856 3857 waitForQuiescence := func(rangeID roachpb.RangeID) { 3858 testutils.SucceedsSoon(t, func() error { 3859 for _, s := range mtc.stores { 3860 rep, err := s.GetReplica(rangeID) 3861 if err != nil { 3862 t.Fatal(err) 3863 } 3864 if !rep.IsQuiescent() { 3865 return errors.Errorf("%s not quiescent", rep) 3866 } 3867 } 3868 return nil 3869 }) 3870 } 3871 3872 // Wait for the range to quiesce. 3873 waitForQuiescence(rangeID) 3874 3875 // Find the leader replica. 3876 var rep *kvserver.Replica 3877 var leaderIdx int 3878 for leaderIdx = range mtc.stores { 3879 var err error 3880 if rep, err = mtc.stores[leaderIdx].GetReplica(1); err != nil { 3881 t.Fatal(err) 3882 } 3883 if rep.RaftStatus().SoftState.RaftState == raft.StateLeader { 3884 break 3885 } 3886 } 3887 3888 // Unquiesce a follower range, this should "wake the leader" and not result 3889 // in an election. 3890 followerIdx := (leaderIdx + 1) % len(mtc.stores) 3891 mtc.stores[followerIdx].EnqueueRaftUpdateCheck(rangeID) 3892 3893 // Wait for a bunch of ticks to occur which will allow the follower time to 3894 // campaign. 3895 ticks := mtc.stores[followerIdx].Metrics().RaftTicks.Count 3896 for targetTicks := ticks() + int64(2*sc.RaftElectionTimeoutTicks); ticks() < targetTicks; { 3897 time.Sleep(time.Millisecond) 3898 } 3899 3900 // Wait for the range to quiesce again. 3901 waitForQuiescence(rangeID) 3902 3903 // The leadership should not have changed. 3904 if state := rep.RaftStatus().SoftState.RaftState; state != raft.StateLeader { 3905 t.Fatalf("%s should be the leader: %s", rep, state) 3906 } 3907 } 3908 3909 // TestInitRaftGroupOnRequest verifies that an uninitialized Raft group 3910 // is initialized if a request is received, even if the current range 3911 // lease points to a different replica. 
3912 func TestInitRaftGroupOnRequest(t *testing.T) { 3913 defer leaktest.AfterTest(t)() 3914 storeCfg := kvserver.TestStoreConfig(nil /* clock */) 3915 storeCfg.TestingKnobs.DisableMergeQueue = true 3916 // Don't timeout range leases (see the relation between 3917 // RaftElectionTimeoutTicks and RangeLeaseActiveDuration). This test expects 3918 // the replica that holds the lease before the cluster is restarted to 3919 // continue holding it after the restart, regardless of how long the restart 3920 // takes. 3921 storeCfg.RaftElectionTimeoutTicks = 1000000 3922 // Disable async intent resolution. This can lead to flakiness in the test 3923 // because it allows for the intents written by the split transaction to be 3924 // resolved at any time, including after the nodes are restarted. The intent 3925 // resolution on the RHS's local range descriptor can both wake up the RHS 3926 // range's Raft group and result in the wrong replica acquiring the lease. 3927 storeCfg.TestingKnobs.IntentResolverKnobs.DisableAsyncIntentResolution = true 3928 mtc := &multiTestContext{ 3929 storeConfig: &storeCfg, 3930 // TODO(andrei): This test was written before multiTestContexts started with 3931 // multiple ranges, and for some unknown reason is flaky if we're not 3932 // forcing it to start with a single range, although it doesn't look like it 3933 // should be. 3934 startWithSingleRange: true, 3935 } 3936 defer mtc.Stop() 3937 mtc.Start(t, 2) 3938 3939 // Split so we can rely on RHS range being quiescent after a restart. 3940 // We use UserTableDataMin to avoid having the range activated to 3941 // gossip system table data. 3942 splitKey := keys.UserTableDataMin 3943 splitArgs := adminSplitArgs(splitKey) 3944 if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), splitArgs); err != nil { 3945 t.Fatal(err) 3946 } 3947 3948 repl := mtc.stores[0].LookupReplica(roachpb.RKey(splitKey)) 3949 if repl == nil { 3950 t.Fatal("replica should not be nil for RHS range") 3951 } 3952 mtc.replicateRange(repl.RangeID, 1) 3953 3954 // Find the leaseholder and then restart the test context. 3955 lease, _ := repl.GetLease() 3956 mtc.restart() 3957 3958 // Get replica from the store which isn't the leaseholder. 3959 // NOTE: StoreID is 1-indexed and storeIdx is 0-indexed, so despite what 3960 // this might look like, this is grabbing the replica without the lease. 3961 storeIdx := int(lease.Replica.StoreID) % len(mtc.stores) 3962 if repl = mtc.stores[storeIdx].LookupReplica(roachpb.RKey(splitKey)); repl == nil { 3963 t.Fatal("replica should not be nil for RHS range") 3964 } 3965 3966 // TODO(spencer): Raft messages seem to turn up 3967 // occasionally on restart, which initialize the replica, so 3968 // this is not a test failure. Not sure how to work around this 3969 // problem. 3970 // Verify the raft group isn't initialized yet. 3971 if repl.IsRaftGroupInitialized() { 3972 log.Errorf(context.Background(), "expected raft group to be uninitialized") 3973 } 3974 3975 // Send an increment and verify that it initializes the Raft group.
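The property being exercised here is that a replica's Raft group is not created when the store starts; the first thing to touch the replica (the increment sent next, or a stray Raft message as the TODO above notes) is what brings it to life. A heavily reduced, purely illustrative analogue of that create-on-first-touch behavior (none of these names exist in the real code):

// lazyGroup is a toy analogue of a replica's lazily created Raft group,
// illustrating only the create-on-first-touch behavior asserted on below.
type lazyGroup struct {
	mu    sync.Mutex
	group *raft.RawNode // nil until something touches the replica
}

func (l *lazyGroup) initialized() bool {
	l.mu.Lock()
	defer l.mu.Unlock()
	return l.group != nil
}

// touch creates the group on first use; subsequent calls are no-ops.
func (l *lazyGroup) touch(create func() *raft.RawNode) {
	l.mu.Lock()
	defer l.mu.Unlock()
	if l.group == nil {
		l.group = create()
	}
}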
3976 incArgs := incrementArgs(splitKey, 1) 3977 _, pErr := kv.SendWrappedWith( 3978 context.Background(), mtc.stores[storeIdx], roachpb.Header{RangeID: repl.RangeID}, incArgs, 3979 ) 3980 if _, ok := pErr.GetDetail().(*roachpb.NotLeaseHolderError); !ok { 3981 t.Fatalf("expected NotLeaseHolderError; got %s", pErr) 3982 } 3983 if !repl.IsRaftGroupInitialized() { 3984 t.Fatal("expected raft group to be initialized") 3985 } 3986 } 3987 3988 // TestFailedConfChange verifies correct behavior after a configuration change 3989 // experiences an error when applying EndTxn. Specifically, it verifies that 3990 // https://github.com/cockroachdb/cockroach/issues/13506 has been fixed. 3991 func TestFailedConfChange(t *testing.T) { 3992 defer leaktest.AfterTest(t)() 3993 3994 // Trigger errors at apply time so they happen on both leaders and 3995 // followers. 3996 var filterActive int32 3997 sc := kvserver.TestStoreConfig(nil) 3998 sc.TestingKnobs.TestingApplyFilter = func(filterArgs kvserverbase.ApplyFilterArgs) (int, *roachpb.Error) { 3999 if atomic.LoadInt32(&filterActive) == 1 && filterArgs.ChangeReplicas != nil { 4000 return 0, roachpb.NewErrorf("boom") 4001 } 4002 return 0, nil 4003 } 4004 mtc := &multiTestContext{ 4005 storeConfig: &sc, 4006 } 4007 defer mtc.Stop() 4008 mtc.Start(t, 3) 4009 ctx := context.Background() 4010 4011 // Replicate the range (successfully) to the second node. 4012 const rangeID = roachpb.RangeID(1) 4013 mtc.replicateRange(rangeID, 1) 4014 4015 // Try and fail to replicate it to the third node. 4016 atomic.StoreInt32(&filterActive, 1) 4017 if err := mtc.replicateRangeNonFatal(rangeID, 2); !testutils.IsError(err, "boom") { 4018 t.Fatal(err) 4019 } 4020 4021 // Raft state is only exposed on the leader, so we must transfer 4022 // leadership and check the stores one at a time. 4023 checkLeaderStore := func(i int) error { 4024 store := mtc.stores[i] 4025 repl, err := store.GetReplica(rangeID) 4026 if err != nil { 4027 t.Fatal(err) 4028 } 4029 if l := len(repl.Desc().InternalReplicas); l != 2 { 4030 return errors.Errorf("store %d: expected 2 replicas in descriptor, found %d in %s", 4031 i, l, repl.Desc()) 4032 } 4033 status := repl.RaftStatus() 4034 if status.RaftState != raft.StateLeader { 4035 return errors.Errorf("store %d: expected StateLeader, was %s", i, status.RaftState) 4036 } 4037 // In issue #13506, the Progress map would be updated as if the 4038 // change had succeeded. 4039 if l := len(status.Progress); l != 2 { 4040 return errors.Errorf("store %d: expected 2 replicas in raft, found %d in %s", i, l, status) 4041 } 4042 return nil 4043 } 4044 4045 if err := checkLeaderStore(0); err != nil { 4046 t.Fatal(err) 4047 } 4048 4049 // Transfer leadership to the second node and wait for it to become leader. 4050 mtc.transferLease(ctx, rangeID, 0, 1) 4051 testutils.SucceedsSoon(t, func() error { 4052 repl, err := mtc.stores[1].GetReplica(rangeID) 4053 if err != nil { 4054 return err 4055 } 4056 status := repl.RaftStatus() 4057 if status.RaftState != raft.StateLeader { 4058 return errors.Errorf("store %d: expected StateLeader, was %s", 1, status.RaftState) 4059 } 4060 return nil 4061 }) 4062 4063 if err := checkLeaderStore(1); err != nil { 4064 t.Fatal(err) 4065 } 4066 } 4067 4068 // TestStoreRangeRemovalCompactionSuggestion verifies that if a replica 4069 // is removed from a store, a compaction suggestion is made to the 4070 // compactor queue. 
4071 func TestStoreRangeRemovalCompactionSuggestion(t *testing.T) { 4072 defer leaktest.AfterTest(t)() 4073 sc := kvserver.TestStoreConfig(nil) 4074 mtc := &multiTestContext{storeConfig: &sc} 4075 defer mtc.Stop() 4076 mtc.Start(t, 3) 4077 4078 const rangeID = roachpb.RangeID(1) 4079 mtc.replicateRange(rangeID, 1, 2) 4080 4081 repl, err := mtc.stores[0].GetReplica(rangeID) 4082 if err != nil { 4083 t.Fatal(err) 4084 } 4085 ctx := repl.AnnotateCtx(context.Background()) 4086 4087 deleteStore := mtc.stores[2] 4088 chgs := roachpb.MakeReplicationChanges(roachpb.REMOVE_REPLICA, roachpb.ReplicationTarget{ 4089 NodeID: deleteStore.Ident.NodeID, 4090 StoreID: deleteStore.Ident.StoreID, 4091 }) 4092 if _, err := repl.ChangeReplicas(ctx, repl.Desc(), kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRebalance, "", chgs); err != nil { 4093 t.Fatal(err) 4094 } 4095 4096 testutils.SucceedsSoon(t, func() error { 4097 // Function to check compaction metrics indicating a suggestion 4098 // was queued or a compaction was processed or skipped. 4099 haveCompaction := func(s *kvserver.Store, exp bool) error { 4100 queued := s.Compactor().Metrics.BytesQueued.Value() 4101 comps := s.Compactor().Metrics.BytesCompacted.Count() 4102 skipped := s.Compactor().Metrics.BytesSkipped.Count() 4103 if exp != (queued > 0 || comps > 0 || skipped > 0) { 4104 return errors.Errorf("%s: expected non-zero compaction metrics? %t; got queued=%d, compactions=%d, skipped=%d", 4105 s, exp, queued, comps, skipped) 4106 } 4107 return nil 4108 } 4109 // Verify that no compaction metrics are showing non-zero bytes in the 4110 // other stores. 4111 for _, s := range mtc.stores { 4112 if err := haveCompaction(s, s == deleteStore); err != nil { 4113 return err 4114 } 4115 } 4116 return nil 4117 }) 4118 } 4119 4120 func TestStoreRangeWaitForApplication(t *testing.T) { 4121 defer leaktest.AfterTest(t)() 4122 4123 var filterRangeIDAtomic int64 4124 4125 ctx := context.Background() 4126 sc := kvserver.TestStoreConfig(nil) 4127 sc.TestingKnobs.DisableReplicateQueue = true 4128 sc.TestingKnobs.DisableReplicaGCQueue = true 4129 sc.TestingKnobs.TestingRequestFilter = func(_ context.Context, ba roachpb.BatchRequest) (retErr *roachpb.Error) { 4130 if rangeID := roachpb.RangeID(atomic.LoadInt64(&filterRangeIDAtomic)); rangeID != ba.RangeID { 4131 return nil 4132 } 4133 pErr := roachpb.NewErrorf("blocking %s in this test", ba.Summary()) 4134 if len(ba.Requests) != 1 { 4135 return pErr 4136 } 4137 _, ok := ba.Requests[0].GetInner().(*roachpb.PutRequest) 4138 if !ok { 4139 return pErr 4140 } 4141 return nil 4142 } 4143 mtc := &multiTestContext{storeConfig: &sc} 4144 mtc.Start(t, 3) 4145 defer mtc.Stop() 4146 store0, store2 := mtc.Store(0), mtc.Store(2) 4147 distSender := mtc.distSenders[0] 4148 4149 // Split off a non-system range so we don't have to account for node liveness 4150 // traffic. 
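The request filter installed at the top of this test is easy to misread because it is written as a chain of early returns: once filterRangeIDAtomic names a range, every batch addressed to that range is rejected except a batch containing exactly one Put, which is the write the test issues below and then waits on. An equivalent, more explicit formulation (hypothetical, not a drop-in replacement for the knob):

// blockAllButSinglePuts restates the filter above: batches for rangeID are
// rejected unless they consist of exactly one Put, which the test needs to
// let through so it can wait on its application.
func blockAllButSinglePuts(ba roachpb.BatchRequest, rangeID roachpb.RangeID) *roachpb.Error {
	if ba.RangeID != rangeID {
		return nil // not the range under test; let it through
	}
	if len(ba.Requests) == 1 {
		if _, isPut := ba.Requests[0].GetInner().(*roachpb.PutRequest); isPut {
			return nil // the single Put the test issues; let it through
		}
	}
	return roachpb.NewErrorf("blocking %s in this test", ba.Summary())
}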
4151 splitArgs := adminSplitArgs(roachpb.Key("a")) 4152 if _, pErr := kv.SendWrapped(ctx, distSender, splitArgs); pErr != nil { 4153 t.Fatal(pErr) 4154 } 4155 rangeID := store0.LookupReplica(roachpb.RKey("a")).RangeID 4156 mtc.replicateRange(rangeID, 1, 2) 4157 4158 repl0, err := store0.GetReplica(rangeID) 4159 if err != nil { 4160 t.Fatal(err) 4161 } 4162 4163 atomic.StoreInt64(&filterRangeIDAtomic, int64(rangeID)) 4164 4165 leaseIndex0 := repl0.LastAssignedLeaseIndex() 4166 4167 type target struct { 4168 client kvserver.PerReplicaClient 4169 header kvserver.StoreRequestHeader 4170 } 4171 4172 var targets []target 4173 for _, s := range mtc.stores { 4174 conn, err := mtc.nodeDialer.Dial(ctx, s.Ident.NodeID, rpc.DefaultClass) 4175 if err != nil { 4176 t.Fatal(err) 4177 } 4178 targets = append(targets, target{ 4179 client: kvserver.NewPerReplicaClient(conn), 4180 header: kvserver.StoreRequestHeader{NodeID: s.Ident.NodeID, StoreID: s.Ident.StoreID}, 4181 }) 4182 } 4183 4184 // Wait for a command that is already applied. The request should return 4185 // immediately. 4186 for i, target := range targets { 4187 _, err := target.client.WaitForApplication(ctx, &kvserver.WaitForApplicationRequest{ 4188 StoreRequestHeader: target.header, 4189 RangeID: rangeID, 4190 LeaseIndex: leaseIndex0, 4191 }) 4192 if err != nil { 4193 t.Fatalf("%d: %+v", i, err) 4194 } 4195 } 4196 4197 const count = 5 4198 4199 // Wait for a command that is `count` indexes later. 4200 var errChs []chan error 4201 for _, target := range targets { 4202 errCh := make(chan error) 4203 errChs = append(errChs, errCh) 4204 target := target 4205 go func() { 4206 _, err := target.client.WaitForApplication(ctx, &kvserver.WaitForApplicationRequest{ 4207 StoreRequestHeader: target.header, 4208 RangeID: rangeID, 4209 LeaseIndex: leaseIndex0 + count, 4210 }) 4211 errCh <- err 4212 }() 4213 } 4214 4215 // The request should not return when less than `count` commands have 4216 // been issued. 4217 putArgs := putArgs(roachpb.Key("foo"), []byte("bar")) 4218 for i := 0; i < count-1; i++ { 4219 if _, pErr := kv.SendWrapped(ctx, distSender, putArgs); pErr != nil { 4220 t.Fatal(pErr) 4221 } 4222 // Wait a little bit to increase the likelihood that we observe an invalid 4223 // ordering. This is not intended to be foolproof. 4224 time.Sleep(10 * time.Millisecond) 4225 for j, errCh := range errChs { 4226 select { 4227 case err := <-errCh: 4228 t.Fatalf("%d: WaitForApplication returned early (request: %d, err: %v)", j, i, err) 4229 default: 4230 } 4231 } 4232 } 4233 4234 // Once the `count`th command has been issued, the request should return. 4235 if _, pErr := kv.SendWrapped(ctx, distSender, putArgs); pErr != nil { 4236 t.Fatal(pErr) 4237 } 4238 for i, errCh := range errChs { 4239 if err := <-errCh; err != nil { 4240 t.Fatalf("%d: %+v", i, err) 4241 } 4242 } 4243 4244 atomic.StoreInt64(&filterRangeIDAtomic, 0) 4245 4246 // GC the replica while a request is in progress. The request should return 4247 // an error. 
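The structure above (one WaitForApplication goroutine per store, an assertion that none of them return while fewer than count commands have been proposed, then an assertion that they all return once the threshold is reached) is really two tiny helpers in disguise. Hypothetical names, same logic:

// assertNoneDone fails the test if any of the waiters has already returned.
func assertNoneDone(t *testing.T, errChs []chan error) {
	t.Helper()
	for i, ch := range errChs {
		select {
		case err := <-ch:
			t.Fatalf("waiter %d returned early: %v", i, err)
		default:
		}
	}
}

// assertAllDone blocks until every waiter has returned and fails on errors.
func assertAllDone(t *testing.T, errChs []chan error) {
	t.Helper()
	for i, ch := range errChs {
		if err := <-ch; err != nil {
			t.Fatalf("waiter %d: %+v", i, err)
		}
	}
}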
4248 go func() { 4249 _, err := targets[2].client.WaitForApplication(ctx, &kvserver.WaitForApplicationRequest{ 4250 StoreRequestHeader: targets[2].header, 4251 RangeID: rangeID, 4252 LeaseIndex: math.MaxUint64, 4253 }) 4254 errChs[2] <- err 4255 }() 4256 repl2, err := store2.GetReplica(rangeID) 4257 if err != nil { 4258 t.Fatal(err) 4259 } 4260 mtc.unreplicateRange(repl2.RangeID, 2) 4261 if err := store2.ManualReplicaGC(repl2); err != nil { 4262 t.Fatal(err) 4263 } 4264 if _, err := repl2.IsDestroyed(); err == nil { 4265 t.Fatalf("replica was not destroyed after gc on store2") 4266 } 4267 err = <-errChs[2] 4268 if exp := fmt.Sprintf("r%d was not found", rangeID); !testutils.IsError(err, exp) { 4269 t.Fatalf("expected %q error, but got %v", exp, err) 4270 } 4271 4272 // Allow the client context to time out while a request is in progress. The 4273 // request should return an error. 4274 { 4275 var cancel context.CancelFunc 4276 ctx, cancel = context.WithTimeout(ctx, 50*time.Millisecond) 4277 defer cancel() 4278 _, err := targets[0].client.WaitForApplication(ctx, &kvserver.WaitForApplicationRequest{ 4279 StoreRequestHeader: targets[0].header, 4280 RangeID: rangeID, 4281 LeaseIndex: math.MaxUint64, 4282 }) 4283 if exp := "context deadline exceeded"; !testutils.IsError(err, exp) { 4284 t.Fatalf("expected %q error, but got %v", exp, err) 4285 } 4286 } 4287 } 4288 4289 func TestStoreWaitForReplicaInit(t *testing.T) { 4290 defer leaktest.AfterTest(t)() 4291 4292 ctx := context.Background() 4293 sc := kvserver.TestStoreConfig(nil) 4294 mtc := &multiTestContext{ 4295 storeConfig: &sc, 4296 // This test was written before the multiTestContext started creating many 4297 // system ranges at startup, and hasn't been update to take that into 4298 // account. 4299 startWithSingleRange: true, 4300 } 4301 mtc.Start(t, 1) 4302 defer mtc.Stop() 4303 store := mtc.Store(0) 4304 4305 conn, err := mtc.nodeDialer.Dial(ctx, store.Ident.NodeID, rpc.DefaultClass) 4306 if err != nil { 4307 t.Fatal(err) 4308 } 4309 client := kvserver.NewPerReplicaClient(conn) 4310 storeHeader := kvserver.StoreRequestHeader{NodeID: store.Ident.NodeID, StoreID: store.Ident.StoreID} 4311 4312 // Test that WaitForReplicaInit returns successfully if the replica exists. 4313 _, err = client.WaitForReplicaInit(ctx, &kvserver.WaitForReplicaInitRequest{ 4314 StoreRequestHeader: storeHeader, 4315 RangeID: roachpb.RangeID(1), 4316 }) 4317 if err != nil { 4318 t.Fatal(err) 4319 } 4320 4321 // Test that WaitForReplicaInit times out if the replica does not exist. 4322 { 4323 timeoutCtx, cancel := context.WithTimeout(ctx, 50*time.Millisecond) 4324 defer cancel() 4325 _, err = client.WaitForReplicaInit(timeoutCtx, &kvserver.WaitForReplicaInitRequest{ 4326 StoreRequestHeader: storeHeader, 4327 RangeID: roachpb.RangeID(2), 4328 }) 4329 if exp := "context deadline exceeded"; !testutils.IsError(err, exp) { 4330 t.Fatalf("expected %q error, but got %v", exp, err) 4331 } 4332 } 4333 4334 // Test that WaitForReplicaInit times out if the replica exists but is not 4335 // initialized. 4336 { 4337 // Constructing an permanently-uninitialized replica is somewhat difficult. 4338 // Sending a fake Raft heartbeat for a range ID that the store hasn't seen 4339 // before does the trick. 4340 var repl42 *kvserver.Replica 4341 testutils.SucceedsSoon(t, func() (err error) { 4342 // Try several times, as the message may be dropped (see #18355). 
4343 mtc.transport.SendAsync(&kvserver.RaftMessageRequest{ 4344 ToReplica: roachpb.ReplicaDescriptor{ 4345 NodeID: store.Ident.NodeID, 4346 StoreID: store.Ident.StoreID, 4347 }, 4348 Heartbeats: []kvserver.RaftHeartbeat{{RangeID: 42, ToReplicaID: 1}}, 4349 }, rpc.DefaultClass) 4350 repl42, err = store.GetReplica(42) 4351 return err 4352 }) 4353 if repl42.IsInitialized() { 4354 t.Fatalf("test bug: repl42 is initialized") 4355 } 4356 4357 timeoutCtx, cancel := context.WithTimeout(ctx, 50*time.Millisecond) 4358 defer cancel() 4359 _, err = client.WaitForReplicaInit(timeoutCtx, &kvserver.WaitForReplicaInitRequest{ 4360 StoreRequestHeader: storeHeader, 4361 RangeID: roachpb.RangeID(42), 4362 }) 4363 if exp := "context deadline exceeded"; !testutils.IsError(err, exp) { 4364 t.Fatalf("expected %q error, but got %v", exp, err) 4365 } 4366 } 4367 } 4368 4369 // TestTracingDoesNotRaceWithCancelation ensures that the tracing underneath 4370 // raft does not race with tracing operations which might occur concurrently 4371 // due to a request cancelation. When this bug existed this test only 4372 // uncovered it when run under stress. 4373 func TestTracingDoesNotRaceWithCancelation(t *testing.T) { 4374 defer leaktest.AfterTest(t)() 4375 4376 sc := kvserver.TestStoreConfig(nil) 4377 sc.TestingKnobs.TraceAllRaftEvents = true 4378 sc.TestingKnobs.DisableSplitQueue = true 4379 sc.TestingKnobs.DisableMergeQueue = true 4380 mtc := &multiTestContext{ 4381 storeConfig: &sc, 4382 } 4383 mtc.Start(t, 3) 4384 defer mtc.Stop() 4385 4386 db := mtc.Store(0).DB() 4387 ctx := context.Background() 4388 // Make the transport flaky for the range in question to encourage proposals 4389 // to be sent more times and ultimately traced more. 4390 ri, err := getRangeInfo(ctx, db, roachpb.Key("foo")) 4391 require.Nil(t, err) 4392 4393 for i := 0; i < 3; i++ { 4394 mtc.transport.Listen(mtc.stores[i].Ident.StoreID, &unreliableRaftHandler{ 4395 rangeID: ri.Desc.RangeID, 4396 RaftMessageHandler: mtc.stores[i], 4397 unreliableRaftHandlerFuncs: unreliableRaftHandlerFuncs{ 4398 dropReq: func(req *kvserver.RaftMessageRequest) bool { 4399 return rand.Intn(2) == 0 4400 }, 4401 }, 4402 }) 4403 } 4404 val := []byte("asdf") 4405 var wg sync.WaitGroup 4406 put := func(i int) func() { 4407 wg.Add(1) 4408 return func() { 4409 defer wg.Done() 4410 totalDelay := 1 * time.Millisecond 4411 delay := time.Duration(rand.Intn(int(totalDelay))) 4412 startDelay := totalDelay - delay 4413 time.Sleep(startDelay) 4414 ctx, cancel := context.WithTimeout(context.Background(), delay) 4415 defer cancel() 4416 _ = db.Put(ctx, roachpb.Key(fmt.Sprintf("foo%d", i)), val) 4417 } 4418 } 4419 const N = 256 4420 for i := 0; i < N; i++ { 4421 go put(i)() 4422 } 4423 wg.Wait() 4424 } 4425 4426 type disablingClientStream struct { 4427 grpc.ClientStream 4428 disabled *atomic.Value 4429 } 4430 4431 func (cs *disablingClientStream) SendMsg(m interface{}) error { 4432 if cs.disabled.Load().(bool) { 4433 return nil 4434 } 4435 return cs.ClientStream.SendMsg(m) 4436 } 4437 4438 // TestDefaultConnectionDisruptionDoesNotInterfereWithSystemTraffic tests that 4439 // disconnection on connections of the rpc.DefaultClass do not interfere with 4440 // traffic on the SystemClass connection. 4441 func TestDefaultConnectionDisruptionDoesNotInterfereWithSystemTraffic(t *testing.T) { 4442 defer leaktest.AfterTest(t)() 4443 // This test relies on concurrently waiting for a value to change in the 4444 // underlying engine(s). 
Since the teeing engine does not respond well to 4445 // value mismatches, whether transient or permanent, skip this test if the 4446 // teeing engine is being used. See 4447 // https://github.com/cockroachdb/cockroach/issues/42656 for more context. 4448 if storage.DefaultStorageEngine == enginepb.EngineTypeTeePebbleRocksDB { 4449 t.Skip("disabled on teeing engine") 4450 } 4451 4452 stopper := stop.NewStopper() 4453 ctx := context.Background() 4454 defer stopper.Stop(ctx) 4455 // disabled controls whether to disrupt DefaultClass streams. 4456 var disabled atomic.Value 4457 disabled.Store(false) 4458 knobs := rpc.ContextTestingKnobs{ 4459 StreamClientInterceptor: func(target string, class rpc.ConnectionClass) grpc.StreamClientInterceptor { 4460 if class == rpc.SystemClass { 4461 return nil 4462 } 4463 return func( 4464 ctx context.Context, desc *grpc.StreamDesc, cc *grpc.ClientConn, 4465 method string, streamer grpc.Streamer, opts ...grpc.CallOption, 4466 ) (grpc.ClientStream, error) { 4467 cs, err := streamer(ctx, desc, cc, method, opts...) 4468 if err != nil { 4469 return nil, err 4470 } 4471 return &disablingClientStream{ 4472 disabled: &disabled, 4473 ClientStream: cs, 4474 }, nil 4475 } 4476 }, 4477 } 4478 // Prevent the split queue from creating additional ranges while we're 4479 // waiting for replication. 4480 sc := kvserver.TestStoreConfig(nil) 4481 mtc := &multiTestContext{ 4482 storeConfig: &sc, 4483 rpcTestingKnobs: knobs, 4484 } 4485 4486 const numReplicas = 3 4487 mtc.Start(t, numReplicas) 4488 defer mtc.Stop() 4489 for _, s := range mtc.stores { 4490 s.SetReplicateQueueActive(true) 4491 } 4492 mtc.replicateRange(1, 1, 2) 4493 // Make a key that's in the user data space. 4494 keyA := append(keys.SystemSQLCodec.TablePrefix(100), 'a') 4495 replica1 := mtc.stores[0].LookupReplica(roachpb.RKey(keyA)) 4496 mtc.replicateRange(replica1.RangeID, 1, 2) 4497 // Create a test function so that we can run the test both immediately after 4498 // up-replicating and after a restart. 4499 runTest := func(t *testing.T) { 4500 // Look up the replica again because we may have restarted the store. 4501 replica1 = mtc.stores[0].LookupReplica(roachpb.RKey(keyA)) 4502 // Put some data in the range so we'll have something to test for. 4503 db := mtc.Store(0).DB() 4504 require.NoError(t, db.Put(ctx, keyA, 1)) 4505 4506 // Wait for all nodes to catch up. 4507 mtc.waitForValues(keyA, []int64{1, 1, 1}) 4508 disabled.Store(true) 4509 repl1, err := mtc.stores[0].GetReplica(1) 4510 require.Nil(t, err) 4511 // Transfer the lease on range 1. Make sure there's no pending transfer. 4512 var lease roachpb.Lease 4513 testutils.SucceedsSoon(t, func() error { 4514 var next roachpb.Lease 4515 lease, next = repl1.GetLease() 4516 if next != (roachpb.Lease{}) { 4517 return fmt.Errorf("lease transfer in process, next = %v", next) 4518 } 4519 return nil 4520 }) 4521 4522 var target int 4523 for i := roachpb.StoreID(1); i <= numReplicas; i++ { 4524 if lease.Replica.StoreID != i { 4525 target = int(i - 1) 4526 break 4527 } 4528 } 4529 // Use SucceedsSoon to deal with rare stress cases where the lease 4530 // transfer may fail. 4531 testutils.SucceedsSoon(t, func() error { 4532 return mtc.transferLeaseNonFatal(ctx, 1, target, int(lease.Replica.StoreID-1)) 4533 }) 4534 // Set a relatively short timeout so that this test doesn't take too long. 4535 // We should always hit it. 
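The timeout that gets set up next is the same deadline pattern used by the WaitForApplication and WaitForReplicaInit tests above: bound the context, issue the call, and require that the failure is the deadline and nothing else. Those tests match on the error string via testutils.IsError; for comparison, a sketch that additionally consults errors.Is (both checks are included because a gRPC-wrapped error may not satisfy errors.Is against context.DeadlineExceeded):

// expectDeadlineExceeded is a hypothetical helper: run op under a short
// timeout and require that it fails because the deadline was hit.
func expectDeadlineExceeded(
	t *testing.T, parent context.Context, d time.Duration, op func(context.Context) error,
) {
	t.Helper()
	ctx, cancel := context.WithTimeout(parent, d)
	defer cancel()
	err := op(ctx)
	if !errors.Is(err, context.DeadlineExceeded) && !testutils.IsError(err, "context deadline exceeded") {
		t.Fatalf("expected a deadline-exceeded error, got %v", err)
	}
}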
4536 withTimeout, cancel := context.WithTimeout(ctx, 20*time.Millisecond) 4537 defer cancel() 4538 err = db.Put(withTimeout, keyA, 2) 4539 require.True(t, testutils.IsError(err, "deadline exceeded"), err) 4540 // Transfer the lease back to demonstrate that the system range is still live. 4541 testutils.SucceedsSoon(t, func() error { 4542 return mtc.transferLeaseNonFatal(ctx, 1, target, int(lease.Replica.StoreID-1)) 4543 }) 4544 4545 // Heal the partition, the previous proposal may now succeed but it may have 4546 // have been canceled. 4547 disabled.Store(false) 4548 // Overwrite with a new value and ensure that it propagates. 4549 require.NoError(t, db.Put(ctx, keyA, 3)) 4550 mtc.waitForValuesT(t, keyA, []int64{3, 3, 3}) 4551 } 4552 t.Run("initial_run", runTest) 4553 mtc.restart() 4554 t.Run("after_restart", runTest) 4555 } 4556 4557 // TestAckWriteBeforeApplication tests that the success of transactional writes 4558 // is acknowledged after those writes have been committed to a Range's Raft log 4559 // but before those writes have been applied to its replicated state machine. 4560 func TestAckWriteBeforeApplication(t *testing.T) { 4561 defer leaktest.AfterTest(t)() 4562 for _, tc := range []struct { 4563 repls int 4564 expAckBeforeAppl bool 4565 }{ 4566 // In a single-replica Range, each handleRaftReady iteration will append 4567 // new entries to the Raft log and immediately apply them. This prevents 4568 // "early acknowledgement" from being possible or useful. See the comment 4569 // on apply.Task.AckCommittedEntriesBeforeApplication. 4570 {1, false}, 4571 // In a three-replica Range, each handleRaftReady iteration will append 4572 // a set of entries to the Raft log and then apply the previous set of 4573 // entries. This makes "early acknowledgement" a major optimization, as 4574 // it pulls the entire latency required to append the next set of entries 4575 // to the Raft log out of the client-perceived latency of the previous 4576 // set of entries. 4577 {3, true}, 4578 } { 4579 t.Run(fmt.Sprintf("numRepls=%d", tc.repls), func(t *testing.T) { 4580 var filterActive int32 4581 var magicTS hlc.Timestamp 4582 blockPreApplication, blockPostApplication := make(chan struct{}), make(chan struct{}) 4583 applyFilterFn := func(ch chan struct{}) kvserverbase.ReplicaApplyFilter { 4584 return func(filterArgs kvserverbase.ApplyFilterArgs) (int, *roachpb.Error) { 4585 if atomic.LoadInt32(&filterActive) == 1 && filterArgs.Timestamp == magicTS { 4586 <-ch 4587 } 4588 return 0, nil 4589 } 4590 } 4591 4592 tsc := kvserver.TestStoreConfig(nil) 4593 tsc.TestingKnobs.TestingApplyFilter = applyFilterFn(blockPreApplication) 4594 tsc.TestingKnobs.TestingPostApplyFilter = applyFilterFn(blockPostApplication) 4595 4596 mtc := &multiTestContext{storeConfig: &tsc} 4597 defer mtc.Stop() 4598 mtc.Start(t, tc.repls) 4599 4600 // Replicate the Range, if necessary. 4601 key := roachpb.Key("a") 4602 rangeID := mtc.stores[0].LookupReplica(roachpb.RKey(key)).RangeID 4603 for i := 1; i < tc.repls; i++ { 4604 mtc.replicateRange(rangeID, i) 4605 } 4606 4607 // Begin peforming a write on the Range. 
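Both testing knobs configured above come from the same generator: a filter that stays inert until filterActive is armed and then parks any command carrying the test's magic timestamp on a channel until the test closes it. Keying on the timestamp is what keeps unrelated system traffic from being caught in the trap; the write that begins just below is the one that will carry magicTS. The same closure, restated with a named signature and extra comments (this restatement is not used by the test):

// blockingApplyFilter returns an apply filter that lets everything through
// until the test arms it, and then blocks any command whose write timestamp
// equals magicTS until ch is closed. Commented restatement of applyFilterFn
// above, not a second copy used by the test.
func blockingApplyFilter(filterActive *int32, magicTS *hlc.Timestamp, ch chan struct{}) kvserverbase.ReplicaApplyFilter {
	return func(args kvserverbase.ApplyFilterArgs) (int, *roachpb.Error) {
		if atomic.LoadInt32(filterActive) == 1 && args.Timestamp == *magicTS {
			<-ch // park here until the test closes ch
		}
		return 0, nil
	}
}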
4608 magicTS = mtc.stores[0].Clock().Now() 4609 atomic.StoreInt32(&filterActive, 1) 4610 ch := make(chan *roachpb.Error, 1) 4611 go func() { 4612 ctx := context.Background() 4613 put := putArgs(key, []byte("val")) 4614 _, pErr := kv.SendWrappedWith(ctx, mtc.stores[0].TestSender(), roachpb.Header{ 4615 Timestamp: magicTS, 4616 }, put) 4617 ch <- pErr 4618 }() 4619 4620 expResult := func() { 4621 t.Helper() 4622 if pErr := <-ch; pErr != nil { 4623 t.Fatalf("unexpected proposal result error: %v", pErr) 4624 } 4625 } 4626 dontExpResult := func() { 4627 t.Helper() 4628 select { 4629 case <-time.After(10 * time.Millisecond): 4630 // Expected. 4631 case pErr := <-ch: 4632 t.Fatalf("unexpected proposal acknowledged before TestingApplyFilter: %v", pErr) 4633 } 4634 } 4635 4636 // The result should be blocked on the pre-apply filter. 4637 dontExpResult() 4638 4639 // Release the pre-apply filter. 4640 close(blockPreApplication) 4641 // Depending on the cluster configuration, The result may not be blocked 4642 // on the post-apply filter because it may be able to acknowledges the 4643 // client before applying. 4644 if tc.expAckBeforeAppl { 4645 expResult() 4646 } else { 4647 dontExpResult() 4648 } 4649 4650 // Stop blocking Raft application to allow everything to shut down cleanly. 4651 // This also confirms that the proposal does eventually apply. 4652 close(blockPostApplication) 4653 // If we didn't expect an acknowledgement before, we do now. 4654 if !tc.expAckBeforeAppl { 4655 expResult() 4656 } 4657 }) 4658 } 4659 } 4660 4661 // TestProcessSplitAfterRightHandSideHasBeenRemoved tests cases where we have 4662 // a follower replica which has received information about the RHS of a split 4663 // before it has processed that split. The replica can't both have an 4664 // initialized RHS and process the split but it can have (1) an uninitialized 4665 // RHS with a higher replica ID than in the split and (2) a RHS with an unknown 4666 // replica ID and a tombstone with a higher replica ID than in the split. 4667 // It may learn about a newer replica ID than the split without ever hearing 4668 // about the split replica. If it does not crash (3) it will know that the 4669 // split replica is too old and will not initialize it. If the node does 4670 // crash (4) it will forget it had learned about the higher replica ID and 4671 // will initialize the RHS as the split replica. 4672 // 4673 // Starting in 19.2 if a replica discovers from a raft message that it is an 4674 // old replica then it knows that it has been removed and re-added to the range. 4675 // In this case the Replica eagerly destroys itself and its data. 4676 // 4677 // Given this behavior there are 4 troubling cases with regards to splits. 4678 // 4679 // * In all cases we begin with s1 processing a presplit snapshot for 4680 // r20. After the split the store should have r21/3. 4681 // 4682 // In the first two cases the following occurs: 4683 // 4684 // * s1 receives a message for r21/3 prior to acquiring the split lock 4685 // in r21. This will create an uninitialized r21/3 which may write 4686 // HardState. 4687 // 4688 // * Before the r20 processes the split r21 is removed and re-added to 4689 // s1 as r21/4. s1 receives a raft message destined for r21/4 and proceeds 4690 // to destroy its uninitialized r21/3, laying down a tombstone at 4 in the 4691 // process. 4692 // 4693 // (1) s1 processes the split and finds the RHS to be an uninitialized replica 4694 // with a higher replica ID. 
4695 // 4696 // (2) s1 crashes before processing the split, forgetting the replica ID of the 4697 // RHS but retaining its tombstone. 4698 // 4699 // In both cases we know that the RHS could not have committed anything because 4700 // it cannot have gotten a snapshot but we want to be sure to not synthesize a 4701 // HardState for the RHS that contains a non-zero commit index if we know that 4702 // the RHS will need another snapshot later. 4703 // 4704 // In the third and fourth cases: 4705 // 4706 // * s1 never receives a message for r21/3. 4707 // 4708 // * Before the r20 processes the split r21 is removed and re-added to 4709 // s1 as r21/4. s1 receives a raft message destined for r21/4 and has never 4710 // heard about r21/3. 4711 // 4712 // (3) s1 processes the split and finds the RHS to be an uninitialized replica 4713 // with a higher replica ID (but without a tombstone). This case is very 4714 // similar to (1) 4715 // 4716 // (4) s1 crashes still before processing the split, forgetting that it had 4717 // known about r21/4. When it reboots r21/4 is totally partitioned and 4718 // r20 becomes unpartitioned. 4719 // 4720 // * r20 processes the split successfully and initialized r21/3. 4721 // 4722 // In the 4th case we find that until we unpartition r21/4 (the RHS) and let it 4723 // learn about its removal with a ReplicaTooOldError that it will be initialized 4724 // with a CommitIndex at 10 as r21/3, the split's value. After r21/4 becomes 4725 // unpartitioned it will learn it is removed by either catching up on its 4726 // its log or receiving a ReplicaTooOldError which will lead to a tombstone. 4727 // 4728 func TestProcessSplitAfterRightHandSideHasBeenRemoved(t *testing.T) { 4729 defer leaktest.AfterTest(t)() 4730 // This test relies on concurrently waiting for a value to change in the 4731 // underlying engine(s). Since the teeing engine does not respond well to 4732 // value mismatches, whether transient or permanent, skip this test if the 4733 // teeing engine is being used. See 4734 // https://github.com/cockroachdb/cockroach/issues/42656 for more context. 4735 if storage.DefaultStorageEngine == enginepb.EngineTypeTeePebbleRocksDB { 4736 t.Skip("disabled on teeing engine") 4737 } 4738 sc := kvserver.TestStoreConfig(nil) 4739 // Newly-started stores (including the "rogue" one) should not GC 4740 // their replicas. We'll turn this back on when needed. 4741 sc.TestingKnobs.DisableReplicaGCQueue = true 4742 sc.RaftDelaySplitToSuppressSnapshotTicks = 0 4743 // Make the tick interval short so we don't need to wait too long for the 4744 // partitioned leader to time out. Also make the 4745 // RangeLeaseRaftElectionTimeout multiplier high so that system ranges 4746 // like node liveness can actually get leases. 
4747 sc.RaftTickInterval = 10 * time.Millisecond 4748 sc.RangeLeaseRaftElectionTimeoutMultiplier = 1000 4749 noopProposalFilter := kvserverbase.ReplicaProposalFilter(func(args kvserverbase.ProposalFilterArgs) *roachpb.Error { 4750 return nil 4751 }) 4752 var proposalFilter atomic.Value 4753 proposalFilter.Store(noopProposalFilter) 4754 sc.TestingKnobs.TestingProposalFilter = func(args kvserverbase.ProposalFilterArgs) *roachpb.Error { 4755 return proposalFilter.Load().(kvserverbase.ReplicaProposalFilter)(args) 4756 } 4757 4758 ctx := context.Background() 4759 increment := func(t *testing.T, db *kv.DB, key roachpb.Key, by int64) { 4760 b := &kv.Batch{} 4761 b.AddRawRequest(incrementArgs(key, by)) 4762 require.NoError(t, db.Run(ctx, b)) 4763 } 4764 changeReplicas := func( 4765 t *testing.T, db *kv.DB, typ roachpb.ReplicaChangeType, key roachpb.Key, idx int, 4766 ) error { 4767 ri, err := getRangeInfo(ctx, db, key) 4768 require.NoError(t, err) 4769 _, err = db.AdminChangeReplicas(ctx, ri.Desc.StartKey.AsRawKey(), ri.Desc, 4770 roachpb.MakeReplicationChanges(typ, makeReplicationTargets(idx+1)...)) 4771 return err 4772 } 4773 split := func(t *testing.T, db *kv.DB, key roachpb.Key) { 4774 b := &kv.Batch{} 4775 b.AddRawRequest(adminSplitArgs(key)) 4776 require.NoError(t, db.Run(ctx, b)) 4777 } 4778 ensureNoTombstone := func(t *testing.T, store *kvserver.Store, rangeID roachpb.RangeID) { 4779 var tombstone roachpb.RangeTombstone 4780 tombstoneKey := keys.RangeTombstoneKey(rangeID) 4781 ok, err := storage.MVCCGetProto( 4782 ctx, store.Engine(), tombstoneKey, hlc.Timestamp{}, &tombstone, storage.MVCCGetOptions{}, 4783 ) 4784 require.NoError(t, err) 4785 require.False(t, ok) 4786 } 4787 getHardState := func( 4788 t *testing.T, store *kvserver.Store, rangeID roachpb.RangeID, 4789 ) raftpb.HardState { 4790 hs, err := stateloader.Make(rangeID).LoadHardState(ctx, store.Engine()) 4791 require.NoError(t, err) 4792 return hs 4793 } 4794 partitionReplicaOnSplit := func(t *testing.T, mtc *multiTestContext, key roachpb.Key, basePartition *mtcPartitionedRange, partRange **mtcPartitionedRange) { 4795 // Set up a hook to partition the RHS range at its initial range ID 4796 // before proposing the split trigger. 4797 var setupOnce sync.Once 4798 f := kvserverbase.ReplicaProposalFilter(func(args kvserverbase.ProposalFilterArgs) *roachpb.Error { 4799 req, ok := args.Req.GetArg(roachpb.EndTxn) 4800 if !ok { 4801 return nil 4802 } 4803 endTxn := req.(*roachpb.EndTxnRequest) 4804 if endTxn.InternalCommitTrigger == nil || endTxn.InternalCommitTrigger.SplitTrigger == nil { 4805 return nil 4806 } 4807 split := endTxn.InternalCommitTrigger.SplitTrigger 4808 4809 if !split.RightDesc.StartKey.Equal(key) { 4810 return nil 4811 } 4812 setupOnce.Do(func() { 4813 replDesc, ok := split.RightDesc.GetReplicaDescriptor(1) 4814 require.True(t, ok) 4815 var err error 4816 *partRange, err = basePartition.extend(mtc, split.RightDesc.RangeID, replDesc.ReplicaID, 4817 0 /* partitionedNode */, true /* activated */, unreliableRaftHandlerFuncs{}) 4818 require.NoError(t, err) 4819 proposalFilter.Store(noopProposalFilter) 4820 }) 4821 return nil 4822 }) 4823 proposalFilter.Store(f) 4824 } 4825 4826 // The basic setup for all of these tests are that we have a LHS range on 3 4827 // nodes and we've partitioned store 0 for the LHS range. The tests will now 4828 // perform a split, remove the RHS, add it back and validate assumptions. 
	//
	// Different outcomes will occur depending on whether and how the RHS is
	// partitioned and whether the server is killed. In all cases we want the
	// split to succeed and the RHS to eventually also be on all 3 nodes.
	setup := func(t *testing.T) (
		mtc *multiTestContext,
		db *kv.DB,
		keyA, keyB roachpb.Key,
		lhsID roachpb.RangeID,
		lhsPartition *mtcPartitionedRange,
	) {
		mtc = &multiTestContext{
			storeConfig: &sc,
		}
		mtc.Start(t, 3)

		db = mtc.Store(1).DB()

		// Split off a non-system range so we don't have to account for node liveness
		// traffic.
		scratchTableKey := keys.SystemSQLCodec.TablePrefix(math.MaxUint32)
		// Put some data in the range so we'll have something to test for.
		keyA = append(append(roachpb.Key{}, scratchTableKey...), 'a')
		keyB = append(append(roachpb.Key{}, scratchTableKey...), 'b')

		split(t, db, scratchTableKey)
		ri, err := getRangeInfo(ctx, db, scratchTableKey)
		require.Nil(t, err)
		lhsID = ri.Desc.RangeID
		// First put the range on all three nodes.
		mtc.replicateRange(lhsID, 1, 2)

		// Set up a partition for the LHS range only. Initially it is not active.
		lhsPartition, err = setupPartitionedRange(mtc, lhsID,
			0 /* replicaID */, 0 /* partitionedNode */, false /* activated */, unreliableRaftHandlerFuncs{})
		require.NoError(t, err)
		// Wait for all nodes to catch up.
		increment(t, db, keyA, 5)
		mtc.waitForValues(keyA, []int64{5, 5, 5})

		// Transfer the lease off of node 0.
		mtc.transferLease(ctx, lhsID, 0, 2)

		// Make sure everybody knows about that transfer.
		increment(t, db, keyA, 1)
		mtc.waitForValues(keyA, []int64{6, 6, 6})
		lhsPartition.activate()

		increment(t, db, keyA, 1)
		mtc.waitForValues(keyA, []int64{6, 7, 7})
		return mtc, db, keyA, keyB, lhsID, lhsPartition
	}

	// In this case we only have the LHS partitioned. The RHS will learn about
	// its identity as the replica in the split and, after being re-added, will
	// learn about the new replica ID and will lay down a tombstone. At this
	// point we'll partition the RHS and ensure that the split does not clobber
	// the RHS's hard state.
	t.Run("(1) no RHS partition", func(t *testing.T) {
		mtc, db, keyA, keyB, _, lhsPartition := setup(t)
		defer mtc.Stop()

		split(t, db, keyB)

		// Write a value which we can observe to know when the split has been
		// applied by the LHS.
		increment(t, db, keyA, 1)
		mtc.waitForValues(keyA, []int64{6, 8, 8})

		increment(t, db, keyB, 6)
		// Wait for all non-partitioned nodes to catch up.
		mtc.waitForValues(keyB, []int64{0, 6, 6})

		rhsInfo, err := getRangeInfo(ctx, db, keyB)
		require.NoError(t, err)
		rhsID := rhsInfo.Desc.RangeID
		_, store0Exists := rhsInfo.Desc.GetReplicaDescriptor(1)
		require.True(t, store0Exists)

		// Remove and re-add the RHS to create a new uninitialized replica at
		// a higher replica ID. This will lead to a tombstone being written.
		require.NoError(t, changeReplicas(t, db, roachpb.REMOVE_REPLICA, keyB, 0))
		// This is unsuccessful because the RHS will not accept the learner
		// snapshot and will be rolled back. Nevertheless it will have learned
		// that it has been removed at the old replica ID.
		err = changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0)
		require.True(t,
			testutils.IsError(err, "snapshot failed.*cannot apply snapshot: snapshot intersects"), err)

		// Without a partitioned RHS we'll always end up writing a tombstone here:
		// the RHS will be created at the initial replica ID because it will get
		// a raft message when the other nodes split, and then, after the above
		// call, it will find out about its new replica ID and write a tombstone
		// for the old one.
		waitForTombstone(t, mtc.Store(0).Engine(), rhsID)
		lhsPartition.deactivate()
		mtc.waitForValues(keyA, []int64{8, 8, 8})
		hs := getHardState(t, mtc.Store(0), rhsID)
		require.Equal(t, uint64(0), hs.Commit)
		testutils.SucceedsSoon(t, func() error {
			return changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0)
		})
		mtc.waitForValues(keyB, []int64{6, 6, 6})
	})

	// This case is like the previous case except the store crashes after
	// laying down a tombstone.
	t.Run("(2) no RHS partition, with restart", func(t *testing.T) {
		mtc, db, keyA, keyB, _, lhsPartition := setup(t)
		defer mtc.Stop()

		split(t, db, keyB)

		// Write a value which we can observe to know when the split has been
		// applied by the LHS.
		increment(t, db, keyA, 1)
		mtc.waitForValues(keyA, []int64{6, 8, 8})

		increment(t, db, keyB, 6)
		// Wait for all non-partitioned nodes to catch up.
		mtc.waitForValues(keyB, []int64{0, 6, 6})

		rhsInfo, err := getRangeInfo(ctx, db, keyB)
		require.NoError(t, err)
		rhsID := rhsInfo.Desc.RangeID
		_, store0Exists := rhsInfo.Desc.GetReplicaDescriptor(1)
		require.True(t, store0Exists)

		// Remove and re-add the RHS to create a new uninitialized replica at
		// a higher replica ID. This will lead to a tombstone being written.
		require.NoError(t, changeReplicas(t, db, roachpb.REMOVE_REPLICA, keyB, 0))
		// This is unsuccessful because the RHS will not accept the learner
		// snapshot and will be rolled back. Nevertheless it will have learned
		// that it has been removed at the old replica ID.
		err = changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0)
		require.True(t,
			testutils.IsError(err, "snapshot failed.*cannot apply snapshot: snapshot intersects"), err)

		// Without a partitioned RHS we'll always end up writing a tombstone here:
		// the RHS will be created at the initial replica ID because it will get
		// a raft message when the other nodes split, and then, after the above
		// call, it will find out about its new replica ID and write a tombstone
		// for the old one.
		waitForTombstone(t, mtc.Store(0).Engine(), rhsID)

		// We do all of this incrementing to ensure that nobody will ever
		// succeed in sending a message to the new RHS replica after we restart
		// the store. Previously there were races which could happen if we
		// stopped the store immediately. Sleeps worked but this feels somehow
		// more principled.
		curB := int64(6)
		for curB < 100 {
			curB++
			increment(t, db, keyB, 1)
			mtc.waitForValues(keyB, []int64{0, curB, curB})
		}

		// Restart store 0 so that it forgets about the newer replica ID.
		mtc.stopStore(0)
		mtc.restartStore(0)

		lhsPartition.deactivate()
		mtc.waitForValues(keyA, []int64{8, 8, 8})
		hs := getHardState(t, mtc.Store(0), rhsID)
		require.Equal(t, uint64(0), hs.Commit)
		testutils.SucceedsSoon(t, func() error {
			return changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0)
		})
		mtc.waitForValues(keyB, []int64{curB, curB, curB})
	})

	// In this case the RHS will be partitioned from hearing anything addressed
	// to its initial replica ID after the split. It will learn about the higher
	// replica ID and have that higher replica ID in memory when the split is
	// processed. We partition the RHS's new replica ID before processing the
	// split to ensure that the RHS doesn't get initialized.
	t.Run("(3) initial replica RHS partition, no restart", func(t *testing.T) {
		mtc, db, keyA, keyB, _, lhsPartition := setup(t)
		defer mtc.Stop()
		var rhsPartition *mtcPartitionedRange
		partitionReplicaOnSplit(t, mtc, keyB, lhsPartition, &rhsPartition)
		split(t, db, keyB)

		// Write a value which we can observe to know when the split has been
		// applied by the LHS.
		increment(t, db, keyA, 1)
		mtc.waitForValues(keyA, []int64{6, 8, 8})

		increment(t, db, keyB, 6)
		// Wait for all non-partitioned nodes to catch up.
		mtc.waitForValues(keyB, []int64{0, 6, 6})

		rhsInfo, err := getRangeInfo(ctx, db, keyB)
		require.NoError(t, err)
		rhsID := rhsInfo.Desc.RangeID
		_, store0Exists := rhsInfo.Desc.GetReplicaDescriptor(1)
		require.True(t, store0Exists)

		// Remove and re-add the RHS to create a new uninitialized replica at
		// a higher replica ID. This will lead to a tombstone being written.
		require.NoError(t, changeReplicas(t, db, roachpb.REMOVE_REPLICA, keyB, 0))
		// This is unsuccessful because the RHS will not accept the learner
		// snapshot and will be rolled back. Nevertheless it will have learned
		// that it has been removed at the old replica ID.
		err = changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0)
		require.True(t,
			testutils.IsError(err, "snapshot failed.*cannot apply snapshot: snapshot intersects"), err)
		// Ensure that the replica exists with the higher replica ID.
		repl, err := mtc.Store(0).GetReplica(rhsInfo.Desc.RangeID)
		require.NoError(t, err)
		require.Equal(t, repl.ReplicaID(), rhsInfo.Desc.NextReplicaID)
		rhsPartition.addReplica(rhsInfo.Desc.NextReplicaID)

		// Ensure that there's no tombstone.
		// The RHS on store 0 never should have heard about its original ID.
		ensureNoTombstone(t, mtc.Store(0), rhsID)
		lhsPartition.deactivate()
		mtc.waitForValues(keyA, []int64{8, 8, 8})
		hs := getHardState(t, mtc.Store(0), rhsID)
		require.Equal(t, uint64(0), hs.Commit)
		// Now succeed in adding the RHS. Use SucceedsSoon because in rare cases
		// the learner snapshot can fail due to a race with a raft snapshot from
		// a raft leader on a different node.
		testutils.SucceedsSoon(t, func() error {
			return changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0)
		})
		mtc.waitForValues(keyB, []int64{6, 6, 6})
	})

	// This case is set up like the previous one, except that after the RHS
	// learns about its higher replica ID the store crashes and forgets. The RHS
	// replica gets initialized by the split.
	t.Run("(4) initial replica RHS partition, with restart", func(t *testing.T) {
		mtc, db, keyA, keyB, _, lhsPartition := setup(t)
		defer mtc.Stop()
		var rhsPartition *mtcPartitionedRange

		partitionReplicaOnSplit(t, mtc, keyB, lhsPartition, &rhsPartition)
		split(t, db, keyB)

		// Write a value which we can observe to know when the split has been
		// applied by the LHS.
		increment(t, db, keyA, 1)
		mtc.waitForValues(keyA, []int64{6, 8, 8})

		increment(t, db, keyB, 6)
		// Wait for all non-partitioned nodes to catch up.
		mtc.waitForValues(keyB, []int64{0, 6, 6})

		rhsInfo, err := getRangeInfo(ctx, db, keyB)
		require.NoError(t, err)
		rhsID := rhsInfo.Desc.RangeID
		_, store0Exists := rhsInfo.Desc.GetReplicaDescriptor(1)
		require.True(t, store0Exists)

		// Remove and re-add the RHS to create a new uninitialized replica at
		// a higher replica ID. This will lead to a tombstone being written.
		require.NoError(t, changeReplicas(t, db, roachpb.REMOVE_REPLICA, keyB, 0))
		// This is unsuccessful because the RHS will not accept the learner
		// snapshot and will be rolled back. Nevertheless it will have learned
		// that it has been removed at the old replica ID.
		err = changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0)
		require.True(t,
			testutils.IsError(err, "snapshot failed.*cannot apply snapshot: snapshot intersects"), err)
		// Ensure that there's no tombstone.
		// The RHS on store 0 never should have heard about its original ID.
		ensureNoTombstone(t, mtc.Store(0), rhsID)

		// Now, before we deactivate the LHS partition, partition the newer replica
		// on the RHS too.
		rhsPartition.addReplica(rhsInfo.Desc.NextReplicaID)

		// We do all of this incrementing to ensure that nobody will ever
		// succeed in sending a message to the new RHS replica after we restart
		// the store. Previously there were races which could happen if we
		// stopped the store immediately. Sleeps worked but this feels somehow
		// more principled.
		curB := int64(6)
		for curB < 100 {
			curB++
			increment(t, db, keyB, 1)
			mtc.waitForValues(keyB, []int64{0, curB, curB})
		}

		mtc.stopStore(0)
		mtc.restartStore(0)

		lhsPartition.deactivate()
		mtc.waitForValues(keyA, []int64{8, 8, 8})
		// In this case the store has forgotten that it knew the RHS of the split
		// could not exist. We ensure that it has been initialized to the initial
		// commit value, which is 10.
		testutils.SucceedsSoon(t, func() error {
			hs := getHardState(t, mtc.Store(0), rhsID)
			if hs.Commit != uint64(10) {
				return errors.Errorf("hard state not yet initialized: got %v, expected %v",
					hs.Commit, uint64(10))
			}
			return nil
		})
		rhsPartition.deactivate()
		testutils.SucceedsSoon(t, func() error {
			return changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0)
		})
		mtc.waitForValues(keyB, []int64{curB, curB, curB})
	})
}

// TestReplicaRemovalClosesProposalQuota is a somewhat contrived test to ensure
// that when a replica is removed it closes its proposal quota, if it has one.
// This used not to be the case, though it wasn't very consequential. Firstly,
// it's rare that a removed replica has a proposal quota to begin with.
// Replicas which believe they are the leaseholder can only be removed if they
// have lost the lease and are behind. This requires a network partition.
// Regardless, there was never actually a problem because once the replica has
// been removed, all commands will eventually fail and remove themselves from
// the quota pool. This potentially adds latency, as every pending request will
// need to acquire and release its quota. This is almost always very fast, as
// it is rarely the case that there are more outstanding requests than there is
// quota. Nevertheless, we have this test to ensure that the pool does get
// closed, if only to avoid asking the question and to ensure that that case is
// tested.
func TestReplicaRemovalClosesProposalQuota(t *testing.T) {
	defer leaktest.AfterTest(t)()
	ctx := context.Background()
	// These variables track the request count to make sure that all of the
	// requests have made it to the Replica.
	var (
		rangeID         int64
		putRequestCount int64
	)
	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
		ServerArgs: base.TestServerArgs{
			Knobs: base.TestingKnobs{Store: &kvserver.StoreTestingKnobs{
				DisableReplicaGCQueue: true,
				TestingRequestFilter: kvserverbase.ReplicaRequestFilter(func(_ context.Context, r roachpb.BatchRequest) *roachpb.Error {
					if r.RangeID == roachpb.RangeID(atomic.LoadInt64(&rangeID)) {
						if _, isPut := r.GetArg(roachpb.Put); isPut {
							atomic.AddInt64(&putRequestCount, 1)
						}
					}
					return nil
				}),
			}},
			RaftConfig: base.RaftConfig{
				// Set the proposal quota to a tiny amount so that each write will
				// exceed it.
				RaftProposalQuota: 512,
				// RaftMaxInflightMsgs * RaftMaxSizePerMsg cannot exceed
				// RaftProposalQuota; with the values below, 2 * 256 = 512, which
				// is exactly the quota.
				RaftMaxInflightMsgs: 2,
				RaftMaxSizePerMsg:   256,
			},
		},
		ReplicationMode: base.ReplicationManual,
	})
	defer tc.Stopper().Stop(ctx)

	key := tc.ScratchRange(t)
	require.NoError(t, tc.WaitForSplitAndInitialization(key))
	desc, err := tc.LookupRange(key)
	require.NoError(t, err)
	atomic.StoreInt64(&rangeID, int64(desc.RangeID))
	tc.AddReplicasOrFatal(t, key, tc.Target(1), tc.Target(2))
	// Partition node 1 from receiving any requests or responses.
	// This will prevent it from successfully replicating anything.
	require.NoError(t, tc.WaitForSplitAndInitialization(key))
	require.NoError(t, tc.TransferRangeLease(desc, tc.Target(0)))
	store, repl := getFirstStoreReplica(t, tc.Server(0), key)
	funcs := unreliableRaftHandlerFuncs{}
	tc.Servers[0].RaftTransport().Listen(store.StoreID(), &unreliableRaftHandler{
		rangeID:                    desc.RangeID,
		RaftMessageHandler:         store,
		unreliableRaftHandlerFuncs: funcs,
	})
	// NB: We need to be sure that our Replica is the leaseholder for this
	// test to make sense. It usually is.
	lease, pendingLease := repl.GetLease()
	if pendingLease != (roachpb.Lease{}) || !lease.OwnedBy(store.StoreID()) {
		t.Skip("the replica is not the leaseholder; this happens rarely under stressrace")
	}
	var wg sync.WaitGroup
	const N = 100
	for i := 0; i < N; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			k := append(key[0:len(key):len(key)], strconv.Itoa(i)...)
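			// NB: the three-index slice expression above caps the capacity at
			// len(key), so the append is forced to allocate a fresh backing
			// array rather than writing into key's storage, which is shared by
			// all of these goroutines.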
			_, pErr := kv.SendWrappedWith(context.Background(), store.TestSender(), roachpb.Header{
				RangeID: desc.RangeID,
			}, putArgs(k, bytes.Repeat([]byte{'a'}, 1000)))
			require.Regexp(t,
				`result is ambiguous \(removing replica\)|`+
					`r`+strconv.Itoa(int(desc.RangeID))+" was not found on s1", pErr.GoError())
		}(i)
	}
	testutils.SucceedsSoon(t, func() error {
		if seen := atomic.LoadInt64(&putRequestCount); seen < N {
			return fmt.Errorf("saw %d, waiting for %d", seen, N)
		}
		return nil
	})
	desc = *repl.Desc()
	fromReplDesc, found := desc.GetReplicaDescriptor(3)
	require.True(t, found)
	replDesc, found := desc.GetReplicaDescriptor(store.StoreID())
	require.True(t, found)
	newReplDesc := replDesc
	newReplDesc.ReplicaID = desc.NextReplicaID
	// Deliver a raft message addressed to the next replica ID; this causes the
	// existing replica to be removed and a tombstone to be written.
	require.Nil(t, store.HandleRaftRequest(ctx, &kvserver.RaftMessageRequest{
		RangeID:       desc.RangeID,
		RangeStartKey: desc.StartKey,
		FromReplica:   fromReplDesc,
		ToReplica:     newReplDesc,
		Message:       raftpb.Message{Type: raftpb.MsgVote, Term: 2},
	}, noopRaftMessageResponseStream{}))
	ts := waitForTombstone(t, store.Engine(), desc.RangeID)
	require.Equal(t, ts.NextReplicaID, desc.NextReplicaID)
	wg.Wait()
	_, err = repl.GetProposalQuota().Acquire(ctx, 1)
	require.Regexp(t, "closed.*destroyed", err)
}

// noopRaftMessageResponseStream is a no-op implementation of the
// kvserver.RaftMessageResponseStream interface.
type noopRaftMessageResponseStream struct{}

func (n noopRaftMessageResponseStream) Context() context.Context {
	return context.Background()
}

func (n noopRaftMessageResponseStream) Send(*kvserver.RaftMessageResponse) error {
	return nil
}

var _ kvserver.RaftMessageResponseStream = noopRaftMessageResponseStream{}
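// The waitForTombstone helper used throughout the tests above is defined
// elsewhere in this file. As an illustrative sketch only (not the actual
// implementation), a polling helper along these lines could be built from
// primitives already used in these tests; the name waitForTombstoneSketch is
// hypothetical and the function is unused.
func waitForTombstoneSketch(
	t *testing.T, eng storage.Engine, rangeID roachpb.RangeID,
) (tombstone roachpb.RangeTombstone) {
	testutils.SucceedsSoon(t, func() error {
		// Poll the range-local tombstone key until a tombstone shows up.
		tombstoneKey := keys.RangeTombstoneKey(rangeID)
		ok, err := storage.MVCCGetProto(
			context.Background(), eng, tombstoneKey, hlc.Timestamp{}, &tombstone, storage.MVCCGetOptions{},
		)
		if err != nil {
			return err
		}
		if !ok {
			return fmt.Errorf("no tombstone found for range %d yet", rangeID)
		}
		return nil
	})
	return tombstone
}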