github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/client_merge_test.go (about) 1 // Copyright 2015 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package kvserver_test 12 13 import ( 14 "bytes" 15 "context" 16 "fmt" 17 "math" 18 "math/rand" 19 "reflect" 20 "regexp" 21 "strconv" 22 "strings" 23 "sync" 24 "sync/atomic" 25 "testing" 26 "time" 27 28 "github.com/cockroachdb/cockroach/pkg/base" 29 "github.com/cockroachdb/cockroach/pkg/config/zonepb" 30 "github.com/cockroachdb/cockroach/pkg/gossip" 31 "github.com/cockroachdb/cockroach/pkg/keys" 32 "github.com/cockroachdb/cockroach/pkg/kv" 33 "github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord" 34 "github.com/cockroachdb/cockroach/pkg/kv/kvserver" 35 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase" 36 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/rditer" 37 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader" 38 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/txnwait" 39 "github.com/cockroachdb/cockroach/pkg/roachpb" 40 "github.com/cockroachdb/cockroach/pkg/rpc" 41 "github.com/cockroachdb/cockroach/pkg/rpc/nodedialer" 42 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 43 "github.com/cockroachdb/cockroach/pkg/storage" 44 "github.com/cockroachdb/cockroach/pkg/storage/enginepb" 45 "github.com/cockroachdb/cockroach/pkg/testutils" 46 "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" 47 "github.com/cockroachdb/cockroach/pkg/testutils/testcluster" 48 "github.com/cockroachdb/cockroach/pkg/util/ctxgroup" 49 "github.com/cockroachdb/cockroach/pkg/util/hlc" 50 "github.com/cockroachdb/cockroach/pkg/util/leaktest" 51 
"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/randutil"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
	"github.com/gogo/protobuf/proto"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.etcd.io/etcd/raft/raftpb"
)

// adminMergeArgs builds an AdminMergeRequest whose request header key is key.
func adminMergeArgs(key roachpb.Key) *roachpb.AdminMergeRequest {
	header := roachpb.RequestHeader{Key: key}
	return &roachpb.AdminMergeRequest{RequestHeader: header}
}

// createSplitRanges sends an AdminSplit request for the key "b" through the
// store's test sender. On success it looks up the replicas containing "a" and
// "c" and returns their descriptors as the left- and right-hand sides of the
// split. An error is returned if the split request fails or if both
// descriptors report the same start key.
func createSplitRanges(
	ctx context.Context, store *kvserver.Store,
) (*roachpb.RangeDescriptor, *roachpb.RangeDescriptor, error) {
	splitReq := adminSplitArgs(roachpb.Key("b"))
	_, pErr := kv.SendWrapped(ctx, store.TestSender(), splitReq)
	if pErr != nil {
		return nil, nil, pErr.GoError()
	}

	left := store.LookupReplica(roachpb.RKey("a")).Desc()
	right := store.LookupReplica(roachpb.RKey("c")).Desc()
	if !bytes.Equal(left.StartKey, right.StartKey) {
		return left, right, nil
	}
	return nil, nil, fmt.Errorf("split ranges have the same start key: %q = %q",
		left.StartKey, right.StartKey)
}

// TestStoreRangeMergeTwoEmptyRanges tries to merge two empty ranges together.
93 func TestStoreRangeMergeTwoEmptyRanges(t *testing.T) { 94 defer leaktest.AfterTest(t)() 95 96 ctx := context.Background() 97 storeCfg := kvserver.TestStoreConfig(nil) 98 storeCfg.TestingKnobs.DisableMergeQueue = true 99 mtc := &multiTestContext{storeConfig: &storeCfg} 100 mtc.Start(t, 1) 101 defer mtc.Stop() 102 store := mtc.Store(0) 103 104 lhsDesc, _, err := createSplitRanges(ctx, store) 105 if err != nil { 106 t.Fatal(err) 107 } 108 109 // Merge the RHS back into the LHS. 110 args := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 111 _, pErr := kv.SendWrapped(ctx, store.TestSender(), args) 112 if pErr != nil { 113 t.Fatal(pErr) 114 } 115 116 // Verify the merge by looking up keys from both ranges. 117 lhsRepl := store.LookupReplica(roachpb.RKey("a")) 118 rhsRepl := store.LookupReplica(roachpb.RKey("c")) 119 120 if !reflect.DeepEqual(lhsRepl, rhsRepl) { 121 t.Fatalf("ranges were not merged: %s != %s", lhsRepl, rhsRepl) 122 } 123 124 // The LHS has been split once and merged once, so it should have received 125 // two generation bumps. 126 if e, a := int64(2), lhsRepl.Desc().Generation; e != a { 127 t.Fatalf("expected LHS to have generation %d, but got %d", e, a) 128 } 129 } 130 131 func getEngineKeySet(t *testing.T, e storage.Engine) map[string]struct{} { 132 t.Helper() 133 kvs, err := storage.Scan(e, roachpb.KeyMin, roachpb.KeyMax, 0 /* max */) 134 if err != nil { 135 t.Fatal(err) 136 } 137 out := map[string]struct{}{} 138 for _, kv := range kvs { 139 out[string(kv.Key.Key)] = struct{}{} 140 } 141 return out 142 } 143 144 // TestStoreRangeMergeMetadataCleanup tests that all metadata of a 145 // subsumed range is cleaned up on merge. 
146 func TestStoreRangeMergeMetadataCleanup(t *testing.T) { 147 defer leaktest.AfterTest(t)() 148 149 ctx := context.Background() 150 storeCfg := kvserver.TestStoreConfig(nil) 151 storeCfg.TestingKnobs.DisableMergeQueue = true 152 mtc := &multiTestContext{storeConfig: &storeCfg} 153 mtc.Start(t, 1) 154 defer mtc.Stop() 155 store := mtc.Store(0) 156 157 content := roachpb.Key("testing!") 158 159 // Write some values left of the proposed split key. 160 pArgs := putArgs(roachpb.Key("aaa"), content) 161 if _, pErr := kv.SendWrapped(ctx, store.TestSender(), pArgs); pErr != nil { 162 t.Fatal(pErr) 163 } 164 165 // Collect all the keys. 166 preKeys := getEngineKeySet(t, store.Engine()) 167 168 // Split the range. 169 lhsDesc, rhsDesc, err := createSplitRanges(ctx, store) 170 if err != nil { 171 t.Fatal(err) 172 } 173 174 // Write some values right of the split key. 175 pArgs = putArgs(roachpb.Key("ccc"), content) 176 if _, pErr := kv.SendWrappedWith(ctx, store.TestSender(), roachpb.Header{ 177 RangeID: rhsDesc.RangeID, 178 }, pArgs); pErr != nil { 179 t.Fatal(pErr) 180 } 181 182 // Merge the b range back into the a range. 183 args := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 184 if _, pErr := kv.SendWrapped(ctx, store.TestSender(), args); pErr != nil { 185 t.Fatal(pErr) 186 } 187 188 // Collect all the keys again. 189 postKeys := getEngineKeySet(t, store.Engine()) 190 191 // Compute the new keys. 192 for k := range preKeys { 193 delete(postKeys, k) 194 } 195 196 tombstoneKey := string(keys.RangeTombstoneKey(rhsDesc.RangeID)) 197 if _, ok := postKeys[tombstoneKey]; !ok { 198 t.Errorf("tombstone key (%s) missing after merge", roachpb.Key(tombstoneKey)) 199 } 200 delete(postKeys, tombstoneKey) 201 202 // Keep only the subsumed range's local keys. 
203 localRangeKeyPrefix := string(keys.MakeRangeIDPrefix(rhsDesc.RangeID)) 204 for k := range postKeys { 205 if !strings.HasPrefix(k, localRangeKeyPrefix) { 206 delete(postKeys, k) 207 } 208 } 209 210 if numKeys := len(postKeys); numKeys > 0 { 211 var buf bytes.Buffer 212 fmt.Fprintf(&buf, "%d keys were not cleaned up:\n", numKeys) 213 for k := range postKeys { 214 fmt.Fprintf(&buf, "%s (%q)\n", roachpb.Key(k), k) 215 } 216 t.Fatal(buf.String()) 217 } 218 } 219 220 // TestStoreRangeMergeWithData attempts to merge two ranges, each containing 221 // data. 222 func TestStoreRangeMergeWithData(t *testing.T) { 223 defer leaktest.AfterTest(t)() 224 225 for _, retries := range []int64{0, 3} { 226 t.Run(fmt.Sprintf("retries=%d", retries), func(t *testing.T) { 227 mergeWithData(t, retries) 228 }) 229 } 230 } 231 232 func mergeWithData(t *testing.T, retries int64) { 233 ctx := context.Background() 234 storeCfg := kvserver.TestStoreConfig(nil) 235 storeCfg.TestingKnobs.DisableReplicateQueue = true 236 storeCfg.TestingKnobs.DisableMergeQueue = true 237 storeCfg.Clock = nil // manual clock 238 239 // Maybe inject some retryable errors when the merge transaction commits. 240 var mtc *multiTestContext 241 storeCfg.TestingKnobs.TestingRequestFilter = func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error { 242 for _, req := range ba.Requests { 243 if et := req.GetEndTxn(); et != nil && et.InternalCommitTrigger.GetMergeTrigger() != nil { 244 if atomic.AddInt64(&retries, -1) >= 0 { 245 return roachpb.NewError( 246 roachpb.NewTransactionRetryError(roachpb.RETRY_SERIALIZABLE, "filter err")) 247 } 248 } 249 if req.GetSubsume() != nil { 250 // Introduce targeted chaos by forcing a lease acquisition before 251 // Subsume can execute. This triggers an unusual code path where the 252 // lease acquisition, not Subsume, notices the merge and installs a 253 // mergeComplete channel on the replica. 
254 mtc.advanceClock(ctx) 255 } 256 } 257 return nil 258 } 259 260 mtc = &multiTestContext{ 261 storeConfig: &storeCfg, 262 // This test was written before the multiTestContext started creating many 263 // system ranges at startup, and hasn't been update to take that into 264 // account. 265 startWithSingleRange: true, 266 } 267 268 var store1, store2 *kvserver.Store 269 mtc.Start(t, 1) 270 store1, store2 = mtc.stores[0], mtc.stores[0] 271 defer mtc.Stop() 272 273 lhsDesc, rhsDesc, pErr := createSplitRanges(ctx, store1) 274 if pErr != nil { 275 t.Fatal(pErr) 276 } 277 278 content := []byte("testing!") 279 280 // Write some values left and right of the proposed split key. 281 pArgs := putArgs(roachpb.Key("aaa"), content) 282 if _, pErr := kv.SendWrapped(ctx, store1.TestSender(), pArgs); pErr != nil { 283 t.Fatal(pErr) 284 } 285 pArgs = putArgs(roachpb.Key("ccc"), content) 286 if _, pErr := kv.SendWrappedWith(ctx, store2.TestSender(), roachpb.Header{ 287 RangeID: rhsDesc.RangeID, 288 }, pArgs); pErr != nil { 289 t.Fatal(pErr) 290 } 291 292 // Confirm the values are there. 293 gArgs := getArgs(roachpb.Key("aaa")) 294 if reply, pErr := kv.SendWrapped(ctx, store1.TestSender(), gArgs); pErr != nil { 295 t.Fatal(pErr) 296 } else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil { 297 t.Fatal(err) 298 } else if !bytes.Equal(replyBytes, content) { 299 t.Fatalf("actual value %q did not match expected value %q", replyBytes, content) 300 } 301 gArgs = getArgs(roachpb.Key("ccc")) 302 if reply, pErr := kv.SendWrappedWith(ctx, store2.TestSender(), roachpb.Header{ 303 RangeID: rhsDesc.RangeID, 304 }, gArgs); pErr != nil { 305 t.Fatal(pErr) 306 } else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil { 307 t.Fatal(err) 308 } else if !bytes.Equal(replyBytes, content) { 309 t.Fatalf("actual value %q did not match expected value %q", replyBytes, content) 310 } 311 312 // Merge the b range back into the a range. 
313 args := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 314 if _, pErr := kv.SendWrapped(ctx, store1.TestSender(), args); pErr != nil { 315 t.Fatal(pErr) 316 } 317 318 // Verify no intents remains on range descriptor keys. 319 for _, key := range []roachpb.Key{keys.RangeDescriptorKey(lhsDesc.StartKey), keys.RangeDescriptorKey(rhsDesc.StartKey)} { 320 if _, _, err := storage.MVCCGet( 321 ctx, store1.Engine(), key, store1.Clock().Now(), storage.MVCCGetOptions{}, 322 ); err != nil { 323 t.Fatal(err) 324 } 325 } 326 327 // Verify the merge by looking up keys from both ranges. 328 lhsRepl := store1.LookupReplica(roachpb.RKey("a")) 329 rhsRepl := store1.LookupReplica(roachpb.RKey("c")) 330 331 if lhsRepl != rhsRepl { 332 t.Fatalf("ranges were not merged %+v=%+v", lhsRepl.Desc(), rhsRepl.Desc()) 333 } 334 if startKey := lhsRepl.Desc().StartKey; !bytes.Equal(startKey, roachpb.RKeyMin) { 335 t.Fatalf("The start key is not equal to KeyMin %q=%q", startKey, roachpb.RKeyMin) 336 } 337 if endKey := rhsRepl.Desc().EndKey; !bytes.Equal(endKey, roachpb.RKeyMax) { 338 t.Fatalf("The end key is not equal to KeyMax %q=%q", endKey, roachpb.RKeyMax) 339 } 340 341 // Try to get values from after the merge. 
342 gArgs = getArgs(roachpb.Key("aaa")) 343 if reply, pErr := kv.SendWrapped(ctx, store1.TestSender(), gArgs); pErr != nil { 344 t.Fatal(pErr) 345 } else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil { 346 t.Fatal(err) 347 } else if !bytes.Equal(replyBytes, content) { 348 t.Fatalf("actual value %q did not match expected value %q", replyBytes, content) 349 } 350 gArgs = getArgs(roachpb.Key("ccc")) 351 if reply, pErr := kv.SendWrappedWith(ctx, store1.TestSender(), roachpb.Header{ 352 RangeID: rhsRepl.RangeID, 353 }, gArgs); pErr != nil { 354 t.Fatal(pErr) 355 } else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil { 356 t.Fatal(err) 357 } else if !bytes.Equal(replyBytes, content) { 358 t.Fatalf("actual value %q did not match expected value %q", replyBytes, content) 359 } 360 361 // Put new values after the merge on both sides. 362 pArgs = putArgs(roachpb.Key("aaaa"), content) 363 if _, pErr := kv.SendWrapped(ctx, store1.TestSender(), pArgs); pErr != nil { 364 t.Fatal(pErr) 365 } 366 pArgs = putArgs(roachpb.Key("cccc"), content) 367 if _, pErr := kv.SendWrappedWith(ctx, store1.TestSender(), roachpb.Header{ 368 RangeID: rhsRepl.RangeID, 369 }, pArgs); pErr != nil { 370 t.Fatal(pErr) 371 } 372 373 // Try to get the newly placed values. 
374 gArgs = getArgs(roachpb.Key("aaaa")) 375 if reply, pErr := kv.SendWrapped(ctx, store1.TestSender(), gArgs); pErr != nil { 376 t.Fatal(pErr) 377 } else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil { 378 t.Fatal(err) 379 } else if !bytes.Equal(replyBytes, content) { 380 t.Fatalf("actual value %q did not match expected value %q", replyBytes, content) 381 } 382 gArgs = getArgs(roachpb.Key("cccc")) 383 if reply, pErr := kv.SendWrapped(ctx, store1.TestSender(), gArgs); pErr != nil { 384 t.Fatal(pErr) 385 } else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil { 386 t.Fatal(err) 387 } else if !bytes.Equal(replyBytes, content) { 388 t.Fatalf("actual value %q did not match expected value %q", replyBytes, content) 389 } 390 391 gArgs = getArgs(roachpb.Key("cccc")) 392 if _, pErr := kv.SendWrappedWith(ctx, store2, roachpb.Header{ 393 RangeID: rhsDesc.RangeID, 394 }, gArgs); !testutils.IsPError( 395 pErr, `r2 was not found`, 396 ) { 397 t.Fatalf("expected get on rhs to fail after merge, but got err=%v", pErr) 398 } 399 400 if atomic.LoadInt64(&retries) >= 0 { 401 t.Fatalf("%d retries remaining (expected less than zero)", retries) 402 } 403 } 404 405 // TestStoreRangeMergeTimestampCache verifies that the timestamp cache on the 406 // LHS is properly updated after a merge. 
407 func TestStoreRangeMergeTimestampCache(t *testing.T) { 408 defer leaktest.AfterTest(t)() 409 410 testutils.RunTrueAndFalse(t, "disjoint-leaseholders", mergeCheckingTimestampCaches) 411 } 412 413 func mergeCheckingTimestampCaches(t *testing.T, disjointLeaseholders bool) { 414 ctx := context.Background() 415 storeCfg := kvserver.TestStoreConfig(nil) 416 storeCfg.TestingKnobs.DisableMergeQueue = true 417 mtc := &multiTestContext{storeConfig: &storeCfg} 418 var lhsStore, rhsStore *kvserver.Store 419 if disjointLeaseholders { 420 mtc.Start(t, 2) 421 lhsStore, rhsStore = mtc.Store(0), mtc.Store(1) 422 } else { 423 mtc.Start(t, 1) 424 lhsStore, rhsStore = mtc.Store(0), mtc.Store(0) 425 } 426 defer mtc.Stop() 427 428 lhsDesc, rhsDesc, err := createSplitRanges(ctx, lhsStore) 429 if err != nil { 430 t.Fatal(err) 431 } 432 433 if disjointLeaseholders { 434 mtc.replicateRange(lhsDesc.RangeID, 1) 435 mtc.replicateRange(rhsDesc.RangeID, 1) 436 mtc.transferLease(ctx, rhsDesc.RangeID, 0, 1) 437 testutils.SucceedsSoon(t, func() error { 438 rhsRepl, err := rhsStore.GetReplica(rhsDesc.RangeID) 439 if err != nil { 440 return err 441 } 442 if !rhsRepl.OwnsValidLease(mtc.clock().Now()) { 443 return errors.New("rhs store does not own valid lease for rhs range") 444 } 445 return nil 446 }) 447 } 448 449 // Write a key to the RHS. 450 rhsKey := roachpb.Key("c") 451 if _, pErr := kv.SendWrappedWith(ctx, rhsStore, roachpb.Header{ 452 RangeID: rhsDesc.RangeID, 453 }, incrementArgs(rhsKey, 1)); pErr != nil { 454 t.Fatal(pErr) 455 } 456 457 readTS := mtc.clock().Now() 458 459 // Simulate a read on the RHS from a node with a newer clock. 
460 var ba roachpb.BatchRequest 461 ba.Timestamp = readTS 462 ba.RangeID = rhsDesc.RangeID 463 ba.Add(getArgs(rhsKey)) 464 if br, pErr := rhsStore.Send(ctx, ba); pErr != nil { 465 t.Fatal(pErr) 466 } else if v, err := br.Responses[0].GetGet().Value.GetInt(); err != nil { 467 t.Fatal(err) 468 } else if v != 1 { 469 t.Fatalf("expected 1, but got %d", v) 470 } else if br.Timestamp != readTS { 471 t.Fatalf("expected read to execute at %v, but executed at %v", readTS, br.Timestamp) 472 } 473 474 // Simulate a txn abort on the RHS from a node with a newer clock. Because 475 // the transaction record for the pushee was not yet written, this will bump 476 // the timestamp cache to record the abort. 477 pushee := roachpb.MakeTransaction("pushee", rhsKey, roachpb.MinUserPriority, readTS, 0) 478 pusher := roachpb.MakeTransaction("pusher", rhsKey, roachpb.MaxUserPriority, readTS, 0) 479 ba = roachpb.BatchRequest{} 480 ba.Timestamp = mtc.clock().Now() 481 ba.RangeID = rhsDesc.RangeID 482 ba.Add(pushTxnArgs(&pusher, &pushee, roachpb.PUSH_ABORT)) 483 if br, pErr := rhsStore.Send(ctx, ba); pErr != nil { 484 t.Fatal(pErr) 485 } else if txn := br.Responses[0].GetPushTxn().PusheeTxn; txn.Status != roachpb.ABORTED { 486 t.Fatalf("expected aborted pushee, but got %v", txn) 487 } 488 489 // Merge the RHS back into the LHS. 490 args := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 491 if _, pErr := kv.SendWrapped(ctx, lhsStore.TestSender(), args); pErr != nil { 492 t.Fatal(pErr) 493 } 494 495 // After the merge, attempt to write under the read. The batch should get 496 // forwarded to a timestamp after the read. 
497 ba = roachpb.BatchRequest{} 498 ba.Timestamp = readTS 499 ba.RangeID = lhsDesc.RangeID 500 ba.Add(incrementArgs(rhsKey, 1)) 501 if br, pErr := lhsStore.Send(ctx, ba); pErr != nil { 502 t.Fatal(pErr) 503 } else if br.Timestamp.LessEq(readTS) { 504 t.Fatalf("expected write to execute after %v, but executed at %v", readTS, br.Timestamp) 505 } 506 507 // Attempt to create a transaction record for the pushee transaction, which 508 // was aborted before the merge. This should be rejected with a transaction 509 // aborted error. The reason will depend on whether the leaseholders were 510 // disjoint or not because disjoint leaseholders will lead to a loss of 511 // resolution in the timestamp cache. Either way though, the transaction 512 // should not be allowed to create its record. 513 hb, hbH := heartbeatArgs(&pushee, mtc.clock().Now()) 514 ba = roachpb.BatchRequest{} 515 ba.Header = hbH 516 ba.RangeID = lhsDesc.RangeID 517 ba.Add(hb) 518 var expReason roachpb.TransactionAbortedReason 519 if disjointLeaseholders { 520 expReason = roachpb.ABORT_REASON_TIMESTAMP_CACHE_REJECTED 521 } else { 522 expReason = roachpb.ABORT_REASON_ABORTED_RECORD_FOUND 523 } 524 if _, pErr := lhsStore.Send(ctx, ba); pErr == nil { 525 t.Fatalf("expected TransactionAbortedError(%s) but got %v", expReason, pErr) 526 } else if abortErr, ok := pErr.GetDetail().(*roachpb.TransactionAbortedError); !ok { 527 t.Fatalf("expected TransactionAbortedError(%s) but got %v", expReason, pErr) 528 } else if abortErr.Reason != expReason { 529 t.Fatalf("expected TransactionAbortedError(%s) but got %v", expReason, pErr) 530 } 531 } 532 533 // TestStoreRangeMergeTimestampCacheCausality verifies that range merges update 534 // the clock on the subsuming store as necessary to preserve causality. 535 // 536 // The test simulates a particularly diabolical sequence of events in which 537 // causality information is not communicated through the normal channels. 
// Suppose two adjacent ranges, A and B, are collocated on S2, S3, and S4. (S1
// is omitted for consistency with the store numbering in the test itself.) S3
// holds the lease on A, while S4 holds the lease on B. Every store's clock
// starts at time T1.
//
// To merge A and B, S3 will launch a merge transaction that sends several RPCs
// to S4. Suppose that, just before S4 begins executing the Subsume request, a
// read sneaks in for some key K at a large timestamp T3. S4 will bump its clock
// from T1 to T3, so when the Subsume goes to determine the current time to use
// for the FreezeStart field in the Subsume response, it will use T3. When S3
// completes the merge, it will thus use T3 as the timestamp cache's low water
// mark for the keys that previously belonged to B.
//
// Importantly, S3 must also update its clock from T1 to T3. Otherwise, as this
// test demonstrates, it is possible for S3 to send a lease to another store, in
// this case S2, that begins at T2. S2 will then assume it is free to accept a
// write at T2, when in fact we already served a read at T3. This would be a
// serializability violation!
//
// Note that there are several mechanisms that *almost* prevent this problem. If
// the read of K at T3 occurs slightly earlier, the batch response for Subsume
// will set the Now field to T3, which S3 will use to bump its clock.
// (BatchResponse.Now is computed when the batch is received, not when it
// finishes executing.) If S3 receives a write for K at T2, it will a) properly
// bump the write to T4, because its timestamp cache is up to date, and then b)
// bump its clock to T4. Or if S4 were to send a single RPC to S3, S3 would bump
// its clock based on the BatchRequest.Timestamp.
//
// In short, this sequence of events is exceedingly unlikely in practice, but
// is subtle enough to warrant a test.
568 func TestStoreRangeMergeTimestampCacheCausality(t *testing.T) { 569 defer leaktest.AfterTest(t)() 570 571 ctx := context.Background() 572 storeCfg := kvserver.TestStoreConfig(nil /* clock */) 573 storeCfg.TestingKnobs.DisableMergeQueue = true 574 storeCfg.Clock = nil // manual clock 575 mtc := &multiTestContext{storeConfig: &storeCfg} 576 var readTS hlc.Timestamp 577 rhsKey := roachpb.Key("c") 578 mtc.storeConfig.TestingKnobs.TestingRequestFilter = func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error { 579 if ba.IsSingleSubsumeRequest() { 580 // Before we execute a Subsume request, execute a read on the same store 581 // at a much higher timestamp. 582 gba := roachpb.BatchRequest{} 583 gba.RangeID = ba.RangeID 584 gba.Timestamp = ba.Timestamp.Add(42 /* wallTime */, 0 /* logical */) 585 gba.Add(getArgs(rhsKey)) 586 store := mtc.Store(int(ba.Header.Replica.StoreID - 1)) 587 gbr, pErr := store.Send(ctx, gba) 588 if pErr != nil { 589 t.Error(pErr) // different goroutine, so can't use t.Fatal 590 } 591 readTS = gbr.Timestamp 592 } 593 return nil 594 } 595 for i := 0; i < 4; i++ { 596 clock := hlc.NewClock(hlc.NewManualClock(123).UnixNano, time.Millisecond /* maxOffset */) 597 mtc.clocks = append(mtc.clocks, clock) 598 } 599 mtc.Start(t, 4) 600 defer mtc.Stop() 601 distSender := mtc.distSenders[0] 602 603 for _, key := range []roachpb.Key{roachpb.Key("a"), roachpb.Key("b")} { 604 if _, pErr := kv.SendWrapped(ctx, distSender, adminSplitArgs(key)); pErr != nil { 605 t.Fatal(pErr) 606 } 607 } 608 609 lhsRangeID := mtc.Store(0).LookupReplica(roachpb.RKey("a")).RangeID 610 rhsRangeID := mtc.Store(0).LookupReplica(roachpb.RKey("b")).RangeID 611 612 // Replicate [a, b) to s2, s3, and s4, and put the lease on s3. 613 mtc.replicateRange(lhsRangeID, 1, 2, 3) 614 mtc.transferLease(ctx, lhsRangeID, 0, 2) 615 mtc.unreplicateRange(lhsRangeID, 0) 616 617 // Replicate [b, Max) to s2, s3, and s4, and put the lease on s4. 
618 mtc.replicateRange(rhsRangeID, 1, 2, 3) 619 mtc.transferLease(ctx, rhsRangeID, 0, 3) 620 mtc.unreplicateRange(rhsRangeID, 0) 621 622 // N.B. We isolate r1 on s1 so that node liveness heartbeats do not interfere 623 // with our precise clock management on s2, s3, and s4. 624 625 // Write a key to [b, Max). 626 if _, pErr := kv.SendWrapped(ctx, distSender, incrementArgs(rhsKey, 1)); pErr != nil { 627 t.Fatal(pErr) 628 } 629 630 // Wait for all relevant stores to have the same value. This indirectly 631 // ensures the lease transfers have applied on all relevant stores. 632 mtc.waitForValues(rhsKey, []int64{0, 1, 1, 1}) 633 634 // Merge [a, b) and [b, Max). Our request filter above will intercept the 635 // merge and execute a read with a large timestamp immediately before the 636 // Subsume request executes. 637 if _, pErr := kv.SendWrappedWith(ctx, mtc.Store(2), roachpb.Header{ 638 RangeID: lhsRangeID, 639 }, adminMergeArgs(roachpb.Key("a"))); pErr != nil { 640 t.Fatal(pErr) 641 } 642 643 // Immediately transfer the lease on the merged range [a, Max) from s3 to s2. 644 // To test that it is, in fact, the merge trigger that properly bumps s3's 645 // clock, s3 must not send or receive any requests before it transfers the 646 // lease, as those requests could bump s3's clock through other code paths. 647 mtc.transferLease(ctx, lhsRangeID, 2, 1) 648 testutils.SucceedsSoon(t, func() error { 649 lhsRepl1, err := mtc.Store(1).GetReplica(lhsRangeID) 650 if err != nil { 651 return err 652 } 653 if !lhsRepl1.OwnsValidLease(mtc.clocks[1].Now()) { 654 return errors.New("s2 does not own valid lease for lhs range") 655 } 656 return nil 657 }) 658 659 // Attempt to write at the same time as the read. The write's timestamp 660 // should be forwarded to after the read. 
661 ba := roachpb.BatchRequest{} 662 ba.Timestamp = readTS 663 ba.RangeID = lhsRangeID 664 ba.Add(incrementArgs(rhsKey, 1)) 665 if br, pErr := mtc.Store(1).Send(ctx, ba); pErr != nil { 666 t.Fatal(pErr) 667 } else if br.Timestamp.LessEq(readTS) { 668 t.Fatalf("expected write to execute after %v, but executed at %v", readTS, br.Timestamp) 669 } 670 } 671 672 // TestStoreRangeMergeLastRange verifies that merging the last range fails. 673 func TestStoreRangeMergeLastRange(t *testing.T) { 674 defer leaktest.AfterTest(t)() 675 676 ctx := context.Background() 677 mtc := multiTestContext{ 678 // This test was written before the multiTestContext started creating many 679 // system ranges at startup, and hasn't been update to take that into 680 // account. 681 startWithSingleRange: true, 682 } 683 mtc.Start(t, 1) 684 defer mtc.Stop() 685 store := mtc.Store(0) 686 687 // Merge last range. 688 _, pErr := kv.SendWrapped(ctx, store.TestSender(), adminMergeArgs(roachpb.KeyMin)) 689 if !testutils.IsPError(pErr, "cannot merge final range") { 690 t.Fatalf("expected 'cannot merge final range' error; got %s", pErr) 691 } 692 } 693 694 func TestStoreRangeMergeTxnFailure(t *testing.T) { 695 defer leaktest.AfterTest(t)() 696 697 ctx := context.Background() 698 storeCfg := kvserver.TestStoreConfig(nil) 699 storeCfg.TestingKnobs.DisableSplitQueue = true 700 storeCfg.TestingKnobs.DisableMergeQueue = true 701 702 // Install a store filter that maybe injects retryable errors into a merge 703 // transaction before ultimately aborting the merge. 
704 var retriesBeforeFailure int64 705 storeCfg.TestingKnobs.TestingRequestFilter = func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error { 706 for _, req := range ba.Requests { 707 if et := req.GetEndTxn(); et != nil && et.InternalCommitTrigger.GetMergeTrigger() != nil { 708 if atomic.AddInt64(&retriesBeforeFailure, -1) >= 0 { 709 return roachpb.NewError( 710 roachpb.NewTransactionRetryError(roachpb.RETRY_SERIALIZABLE, "filter err")) 711 } 712 return roachpb.NewError(errors.New("injected permafail")) 713 } 714 } 715 return nil 716 } 717 718 mtc := &multiTestContext{storeConfig: &storeCfg} 719 mtc.Start(t, 1) 720 defer mtc.Stop() 721 store := mtc.Store(0) 722 kvDB := store.DB() 723 724 if err := kvDB.Put(ctx, "aa", "val"); err != nil { 725 t.Fatal(err) 726 } 727 if err := kvDB.Put(ctx, "cc", "val"); err != nil { 728 t.Fatal(err) 729 } 730 lhsDesc, rhsDesc, err := createSplitRanges(ctx, store) 731 if err != nil { 732 t.Fatal(err) 733 } 734 735 verifyLHSAndRHSLive := func() { 736 t.Helper() 737 for _, tc := range []struct { 738 rangeID roachpb.RangeID 739 key roachpb.Key 740 }{ 741 {lhsDesc.RangeID, roachpb.Key("aa")}, 742 {rhsDesc.RangeID, roachpb.Key("cc")}, 743 } { 744 if reply, pErr := kv.SendWrappedWith(ctx, store.TestSender(), roachpb.Header{ 745 RangeID: tc.rangeID, 746 }, getArgs(tc.key)); pErr != nil { 747 t.Fatal(pErr) 748 } else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil { 749 t.Fatal(err) 750 } else if !bytes.Equal(replyBytes, []byte("val")) { 751 t.Fatalf("actual value %q did not match expected value %q", replyBytes, []byte("val")) 752 } 753 } 754 } 755 756 attemptMerge := func() { 757 t.Helper() 758 args := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 759 _, pErr := kv.SendWrapped(ctx, store.TestSender(), args) 760 if exp := "injected permafail"; !testutils.IsPError(pErr, exp) { 761 t.Fatalf("expected %q error, but got %q", exp, pErr) 762 } 763 } 764 765 verifyLHSAndRHSLive() 766 767 
atomic.StoreInt64(&retriesBeforeFailure, 0) 768 attemptMerge() 769 verifyLHSAndRHSLive() 770 if atomic.LoadInt64(&retriesBeforeFailure) >= 0 { 771 t.Fatalf("%d retries remaining (expected less than zero)", retriesBeforeFailure) 772 } 773 774 atomic.StoreInt64(&retriesBeforeFailure, 3) 775 attemptMerge() 776 verifyLHSAndRHSLive() 777 if atomic.LoadInt64(&retriesBeforeFailure) >= 0 { 778 t.Fatalf("%d retries remaining (expected less than zero)", retriesBeforeFailure) 779 } 780 } 781 782 // TestStoreRangeSplitMergeGeneration verifies that splits and merges both 783 // update the range descriptor generations of the involved ranges according to 784 // the comment on the RangeDescriptor.Generation field. 785 func TestStoreRangeSplitMergeGeneration(t *testing.T) { 786 defer leaktest.AfterTest(t)() 787 788 testutils.RunTrueAndFalse(t, "rhsHasHigherGen", func(t *testing.T, rhsHasHigherGen bool) { 789 s, _, _ := serverutils.StartServer(t, base.TestServerArgs{ 790 Knobs: base.TestingKnobs{ 791 Store: &kvserver.StoreTestingKnobs{ 792 // Disable both splits and merges so that we're in full 793 // control over them. 794 DisableMergeQueue: true, 795 DisableSplitQueue: true, 796 }, 797 }, 798 }) 799 defer s.Stopper().Stop(context.Background()) 800 801 leftKey := roachpb.Key("z") 802 rightKey := leftKey.Next().Next() 803 804 // First, split at the left key for convenience, so that we can check 805 // leftDesc.StartKey == leftKey later. 806 _, _, err := s.SplitRange(leftKey) 807 assert.NoError(t, err) 808 809 store, err := s.GetStores().(*kvserver.Stores).GetStore(s.GetFirstStoreID()) 810 assert.NoError(t, err) 811 leftRepl := store.LookupReplica(keys.MustAddr(leftKey)) 812 assert.NotNil(t, leftRepl) 813 preSplitGen := leftRepl.Desc().Generation 814 leftDesc, rightDesc, err := s.SplitRange(rightKey) 815 assert.NoError(t, err) 816 817 // Split should increment the LHS' generation and also propagate the result 818 // to the RHS. 
819 assert.Equal(t, preSplitGen+1, leftDesc.Generation) 820 assert.Equal(t, preSplitGen+1, rightDesc.Generation) 821 822 if rhsHasHigherGen { 823 // Split the RHS again to increment its generation once more, so that 824 // we get (assuming preSplitGen=1): 825 // 826 // |--left@2---||---right@3---||--don't care--| 827 // 828 rightDesc, _, err = s.SplitRange(rightKey.Next()) 829 assert.NoError(t, err) 830 assert.Equal(t, preSplitGen+2, rightDesc.Generation) 831 } else { 832 // Split and merge the LHS to increment the generation (it ends up 833 // being incremented by two). Note that leftKey.Next() is still in 834 // the left range. Assuming preSplitGen=1, we'll end up in the 835 // situation: 836 // 837 // |--left@4---||---right@2---| 838 var tmpRightDesc roachpb.RangeDescriptor 839 leftDesc, tmpRightDesc, err = s.SplitRange(leftKey.Next()) 840 assert.Equal(t, preSplitGen+2, leftDesc.Generation) 841 assert.Equal(t, preSplitGen+2, tmpRightDesc.Generation) 842 assert.NoError(t, err) 843 leftDesc, err = s.MergeRanges(leftKey) 844 assert.NoError(t, err) 845 assert.Equal(t, preSplitGen+3, leftDesc.Generation) 846 } 847 848 // Make sure the split/merge shenanigans above didn't get the range 849 // descriptors confused. 850 assert.Equal(t, leftKey, leftDesc.StartKey.AsRawKey()) 851 assert.Equal(t, rightKey, rightDesc.StartKey.AsRawKey()) 852 853 // Merge the two ranges back to verify that the resulting descriptor 854 // has the correct generation. 855 mergedDesc, err := s.MergeRanges(leftKey) 856 assert.NoError(t, err) 857 858 maxPreMergeGen := leftDesc.Generation 859 if rhsGen := rightDesc.Generation; rhsGen > maxPreMergeGen { 860 maxPreMergeGen = rhsGen 861 } 862 863 assert.Equal(t, maxPreMergeGen+1, mergedDesc.Generation) 864 assert.Equal(t, leftDesc.RangeID, mergedDesc.RangeID) 865 }) 866 } 867 868 // TestStoreRangeMergeStats starts by splitting a range, then writing random 869 // data to both sides of the split. 
It then merges the ranges and verifies the 870 // merged range has stats consistent with recomputations. 871 func TestStoreRangeMergeStats(t *testing.T) { 872 defer leaktest.AfterTest(t)() 873 ctx := context.Background() 874 storeCfg := kvserver.TestStoreConfig(nil) 875 storeCfg.TestingKnobs.DisableMergeQueue = true 876 storeCfg.Clock = nil // manual clock 877 mtc := &multiTestContext{storeConfig: &storeCfg} 878 mtc.Start(t, 1) 879 defer mtc.Stop() 880 store := mtc.Store(0) 881 882 // Split the range. 883 lhsDesc, rhsDesc, err := createSplitRanges(ctx, store) 884 if err != nil { 885 t.Fatal(err) 886 } 887 888 // Write some values left and right of the proposed split key. 889 kvserver.WriteRandomDataToRange(t, store, lhsDesc.RangeID, []byte("aaa")) 890 kvserver.WriteRandomDataToRange(t, store, rhsDesc.RangeID, []byte("ccc")) 891 892 // Litter some abort span records. txn1 will leave a record on the LHS, txn2 893 // will leave a record on the RHS, and txn3 will leave a record on both. This 894 // tests whether the merge code properly accounts for merging abort span 895 // records for the same transaction. 
896 txn1 := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */) 897 if err := txn1.Put(ctx, "a-txn1", "val"); err != nil { 898 t.Fatal(err) 899 } 900 txn2 := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */) 901 if err := txn2.Put(ctx, "c-txn2", "val"); err != nil { 902 t.Fatal(err) 903 } 904 txn3 := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */) 905 if err := txn3.Put(ctx, "a-txn3", "val"); err != nil { 906 t.Fatal(err) 907 } 908 if err := txn3.Put(ctx, "c-txn3", "val"); err != nil { 909 t.Fatal(err) 910 } 911 hiPriTxn := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */) 912 hiPriTxn.TestingSetPriority(enginepb.MaxTxnPriority) 913 for _, key := range []string{"a-txn1", "c-txn2", "a-txn3", "c-txn3"} { 914 if err := hiPriTxn.Put(ctx, key, "val"); err != nil { 915 t.Fatal(err) 916 } 917 } 918 if err := hiPriTxn.Commit(ctx); err != nil { 919 t.Fatal(err) 920 } 921 // Leave txn1-txn3 open so that their abort span records exist during the 922 // merge below. 923 924 // Get the range stats for both ranges now that we have data. 925 snap := store.Engine().NewSnapshot() 926 defer snap.Close() 927 msA, err := stateloader.Make(lhsDesc.RangeID).LoadMVCCStats(ctx, snap) 928 if err != nil { 929 t.Fatal(err) 930 } 931 msB, err := stateloader.Make(rhsDesc.RangeID).LoadMVCCStats(ctx, snap) 932 if err != nil { 933 t.Fatal(err) 934 } 935 936 // Stats should agree with recomputation. 937 if err := verifyRecomputedStats(snap, lhsDesc, msA, mtc.manualClock.UnixNano()); err != nil { 938 t.Fatalf("failed to verify range A's stats before split: %+v", err) 939 } 940 if err := verifyRecomputedStats(snap, rhsDesc, msB, mtc.manualClock.UnixNano()); err != nil { 941 t.Fatalf("failed to verify range B's stats before split: %+v", err) 942 } 943 944 mtc.manualClock.Increment(100) 945 946 // Merge the b range back into the a range. 
947 args := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 948 if _, err := kv.SendWrapped(ctx, store.TestSender(), args); err != nil { 949 t.Fatal(err) 950 } 951 replMerged := store.LookupReplica(lhsDesc.StartKey) 952 953 // Get the range stats for the merged range and verify. 954 snap = store.Engine().NewSnapshot() 955 defer snap.Close() 956 msMerged, err := stateloader.Make(replMerged.RangeID).LoadMVCCStats(ctx, snap) 957 if err != nil { 958 t.Fatal(err) 959 } 960 961 // Merged stats should agree with recomputation. 962 nowNanos := mtc.manualClock.UnixNano() 963 msMerged.AgeTo(nowNanos) 964 if err := verifyRecomputedStats(snap, replMerged.Desc(), msMerged, nowNanos); err != nil { 965 t.Errorf("failed to verify range's stats after merge: %+v", err) 966 } 967 } 968 969 func TestStoreRangeMergeInFlightTxns(t *testing.T) { 970 defer leaktest.AfterTest(t)() 971 972 ctx := context.Background() 973 storeCfg := kvserver.TestStoreConfig(nil) 974 storeCfg.TestingKnobs.DisableReplicateQueue = true 975 storeCfg.TestingKnobs.DisableMergeQueue = true 976 mtc := &multiTestContext{storeConfig: &storeCfg} 977 mtc.Start(t, 1) 978 defer mtc.Stop() 979 store := mtc.Store(0) 980 981 // Create two adjacent ranges. 982 setupReplicas := func() (lhsDesc, rhsDesc *roachpb.RangeDescriptor, err error) { 983 lhsDesc, rhsDesc, err = createSplitRanges(ctx, store) 984 if err != nil { 985 return nil, nil, err 986 } 987 return lhsDesc, rhsDesc, nil 988 } 989 990 // Verify that a transaction can span a merge. 991 t.Run("valid", func(t *testing.T) { 992 lhsDesc, _, err := setupReplicas() 993 if err != nil { 994 t.Fatal(err) 995 } 996 lhsKey, rhsKey := roachpb.Key("aa"), roachpb.Key("cc") 997 998 txn := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */) 999 // Put the key on the RHS side first so ownership of the transaction record 1000 // will need to transfer to the LHS range during the merge. 
1001 if err := txn.Put(ctx, rhsKey, t.Name()); err != nil { 1002 t.Fatal(err) 1003 } 1004 if err := txn.Put(ctx, lhsKey, t.Name()); err != nil { 1005 t.Fatal(err) 1006 } 1007 args := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 1008 if _, pErr := kv.SendWrapped(ctx, store.TestSender(), args); pErr != nil { 1009 t.Fatal(pErr) 1010 } 1011 if err := txn.Commit(ctx); err != nil { 1012 t.Fatal(err) 1013 } 1014 1015 for _, key := range []roachpb.Key{lhsKey, rhsKey} { 1016 kv, err := store.DB().Get(ctx, key) 1017 if err != nil { 1018 t.Fatal(err) 1019 } else if string(kv.ValueBytes()) != t.Name() { 1020 t.Fatalf("actual value %q did not match expected value %q", kv.ValueBytes(), t.Name()) 1021 } 1022 } 1023 }) 1024 1025 // Verify that a transaction's abort span records are preserved when the 1026 // transaction spans a merge. 1027 t.Run("abort-span", func(t *testing.T) { 1028 lhsDesc, _, err := setupReplicas() 1029 if err != nil { 1030 t.Fatal(err) 1031 } 1032 rhsKey := roachpb.Key("cc") 1033 1034 // Create a transaction that will be aborted before the merge but won't 1035 // realize until after the merge. 1036 txn1 := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */) 1037 // Put the key on the RHS side so ownership of the transaction record and 1038 // abort span records will need to transfer to the LHS during the merge. 1039 if err := txn1.Put(ctx, rhsKey, t.Name()); err != nil { 1040 t.Fatal(err) 1041 } 1042 1043 // Create and commit a txn that aborts txn1. 1044 txn2 := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */) 1045 txn2.TestingSetPriority(enginepb.MaxTxnPriority) 1046 if err := txn2.Put(ctx, rhsKey, "muhahahah"); err != nil { 1047 t.Fatal(err) 1048 } 1049 if err := txn2.Commit(ctx); err != nil { 1050 t.Fatal(err) 1051 } 1052 1053 // Complete the merge. 
1054 args := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 1055 if _, pErr := kv.SendWrapped(ctx, store.TestSender(), args); pErr != nil { 1056 t.Fatal(pErr) 1057 } 1058 expErr := "TransactionAbortedError(ABORT_REASON_ABORT_SPAN)" 1059 if _, err := txn1.Get(ctx, rhsKey); !testutils.IsError(err, regexp.QuoteMeta(expErr)) { 1060 t.Fatalf("expected %s but got %v", expErr, err) 1061 } 1062 }) 1063 1064 // Verify that the transaction wait queue on the right-hand range in a merge 1065 // is cleared if the merge commits. 1066 t.Run("wait-queue", func(t *testing.T) { 1067 lhsDesc, rhsDesc, err := setupReplicas() 1068 if err != nil { 1069 t.Fatal(err) 1070 } 1071 rhsKey := roachpb.Key("cc") 1072 1073 // Set a timeout, and set the the transaction liveness threshold to 1074 // something much larger than our timeout. We want transactions to get stuck 1075 // in the transaction wait queue and trigger the timeout if we forget to 1076 // clear it. 1077 var cancel func() 1078 ctx, cancel = context.WithTimeout(ctx, testutils.DefaultSucceedsSoonDuration) 1079 defer cancel() 1080 defer txnwait.TestingOverrideTxnLivenessThreshold(2 * testutils.DefaultSucceedsSoonDuration) 1081 1082 // Create a transaction that won't complete until after the merge. 1083 txn1 := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */) 1084 // Put the key on the RHS side so ownership of the transaction record and 1085 // abort span records will need to transfer to the LHS during the merge. 1086 if err := txn1.Put(ctx, rhsKey, t.Name()); err != nil { 1087 t.Fatal(err) 1088 } 1089 1090 // Create a txn that will conflict with txn1. 1091 txn2 := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */) 1092 txn2ErrCh := make(chan error) 1093 go func() { 1094 // Get should block on txn1's intent until txn1 commits. 
1095 kv, err := txn2.Get(ctx, rhsKey) 1096 if err != nil { 1097 txn2ErrCh <- err 1098 } else if string(kv.ValueBytes()) != t.Name() { 1099 txn2ErrCh <- errors.Errorf("actual value %q did not match expected value %q", kv.ValueBytes(), t.Name()) 1100 } 1101 txn2ErrCh <- nil 1102 }() 1103 1104 // Wait for txn2 to realize it conflicts with txn1 and enter its wait queue. 1105 { 1106 repl, err := store.GetReplica(rhsDesc.RangeID) 1107 if err != nil { 1108 t.Fatal(err) 1109 } 1110 for { 1111 if _, ok := repl.GetConcurrencyManager().TxnWaitQueue().TrackedTxns()[txn1.ID()]; ok { 1112 break 1113 } 1114 select { 1115 case <-time.After(10 * time.Millisecond): 1116 case <-ctx.Done(): 1117 t.Fatal("timed out waiting for txn2 to enter wait queue") 1118 } 1119 } 1120 } 1121 1122 // Complete the merge. 1123 args := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 1124 if _, pErr := kv.SendWrapped(ctx, store.TestSender(), args); pErr != nil { 1125 t.Fatal(pErr) 1126 } 1127 1128 if err := txn1.Commit(ctx); err != nil { 1129 t.Fatal(err) 1130 } 1131 1132 // Now that txn1 has committed, txn2's get operation should complete. 1133 select { 1134 case err := <-txn2ErrCh: 1135 if err != nil { 1136 t.Fatal(err) 1137 } 1138 case <-ctx.Done(): 1139 t.Fatal("timed out waiting for txn2 to complete get") 1140 } 1141 1142 if err := txn2.Commit(ctx); err != nil { 1143 t.Fatal(err) 1144 } 1145 }) 1146 } 1147 1148 // TestStoreRangeMergeSplitRace_MergeWins (occasionally) reproduces a race where 1149 // a concurrent merge and split could deadlock. It exercises the case where the 1150 // merge commits and the split aborts. See the SplitWins variant of this test 1151 // for the inverse case. 1152 // 1153 // The bug works like this. A merge of adjacent ranges P and Q and a split of Q 1154 // execute concurrently, though the merge executes with an earlier timestamp. 1155 // The merge updates Q's meta2 range descriptor. 
The split updates Q's local 1156 // range descriptor, then tries to update Q's meta2 range descriptor, but runs 1157 // into the merge's intent and attempts to push the merge. Under our current 1158 // concurrency control strategy, this results in the split waiting for the merge 1159 // to complete. The merge then tries to update Q's local range descriptor but 1160 // runs into the split's intent. While pushing the split, the merge realizes 1161 // that waiting for the split to complete would cause deadlock, so it aborts the 1162 // split instead. 1163 // 1164 // But before the split can clean up its transaction record and intents, the 1165 // merge locks Q and launches a goroutine to unlock Q when the merge commits. 1166 // Then the merge completes, which has a weird side effect: the split's push of 1167 // the merge will succeed! How is this possible? The split's push request is not 1168 // guaranteed to notice that the split has been aborted before it notices that 1169 // the merge has completed. So the aborted split winds up resolving the merge's 1170 // intent on Q's meta2 range descriptor and leaving its own intent in its place. 1171 // 1172 // In the past, the merge watcher goroutine would perform a range lookup for Q; 1173 // this would indirectly wait for the merge to complete by waiting for its 1174 // intent in meta2 to be resolved. In this case, however, its the *split*'s 1175 // intent that the watcher goroutine sees. This intent can't be resolved because 1176 // the split's transaction record is located on the locked range Q! And so Q can 1177 // never be unlocked. 1178 // 1179 // This bug was fixed by teaching the watcher goroutine to push the merge 1180 // transaction directly instead of doing so indirectly by querying meta2. 1181 // 1182 // Attempting a foolproof reproduction of the bug proved challenging and would 1183 // have required a mess of store filters. 
This test takes a simpler approach of 1184 // running the necessary split and a merge concurrently and allowing the race 1185 // scheduler to occasionally strike the right interleaving. At the time of 1186 // writing, the test would reliably reproduce the bug in about 50 runs (about 1187 // ten seconds of stress on an eight core laptop). 1188 func TestStoreRangeMergeSplitRace_MergeWins(t *testing.T) { 1189 defer leaktest.AfterTest(t)() 1190 1191 ctx := context.Background() 1192 storeCfg := kvserver.TestStoreConfig(nil) 1193 storeCfg.TestingKnobs.DisableReplicateQueue = true 1194 mtc := &multiTestContext{storeConfig: &storeCfg} 1195 mtc.Start(t, 1) 1196 defer mtc.Stop() 1197 distSender := mtc.distSenders[0] 1198 1199 lhsDesc, rhsDesc, err := createSplitRanges(ctx, mtc.Store(0)) 1200 if err != nil { 1201 t.Fatal(err) 1202 } 1203 1204 splitErrCh := make(chan error) 1205 go func() { 1206 time.Sleep(10 * time.Millisecond) 1207 splitArgs := adminSplitArgs(rhsDesc.StartKey.AsRawKey().Next()) 1208 _, pErr := kv.SendWrapped(ctx, distSender, splitArgs) 1209 splitErrCh <- pErr.GoError() 1210 }() 1211 1212 mergeArgs := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 1213 if _, pErr := kv.SendWrapped(ctx, distSender, mergeArgs); pErr != nil { 1214 t.Fatal(pErr) 1215 } 1216 1217 if err := <-splitErrCh; err != nil { 1218 t.Fatal(err) 1219 } 1220 } 1221 1222 // TestStoreRangeMergeSplitRace_SplitWins reproduces a race where a concurrent 1223 // merge and split could deadlock. It exercises the case where the split commits 1224 // and the merge aborts. See the MergeWins variant of this test for the inverse 1225 // case. 1226 // 1227 // The bug works like this. A merge of adjacent ranges P and Q and a split of Q 1228 // execute concurrently, though the merge executes with an earlier timestamp. 1229 // First, the merge transaction reads Q's local range descriptor to determine 1230 // the combined range's range descriptor. 
Then it writes an intent to update P's 1231 // local range descriptor. 1232 // 1233 // Next, the split transaction runs from start to finish, updating Q's local 1234 // descriptor and its associated meta2 record. Notably, the split transaction 1235 // does not encounter any intents from the merge transaction, since the merge 1236 // transaction's only intent so far is on P's local range descriptor, and so the 1237 // split transaction can happily commit. 1238 // 1239 // The merge transaction then continues, writing an intent on Q's local 1240 // descriptor. Since the merge transaction is executing at an earlier timestamp 1241 // than the split transaction, the intent is written "under" the updated 1242 // descriptor written by the split transaction. 1243 // 1244 // In the past, the merge transaction would simply push its commit timestamp 1245 // forward and proceed, even though, upon committing, it would discover that it 1246 // was forbidden from committing with a pushed timestamp and abort instead. (For 1247 // why merge transactions cannot forward their commit timestamps, see the 1248 // discussion on the retry loop within AdminMerge.) This was problematic. Before 1249 // the doomed merge transaction attempted to commit, it would send a Subsume 1250 // request, launching a merge watcher goroutine on Q. This watcher goroutine 1251 // could incorrectly think that the merge transaction committed. Why? To 1252 // determine whether a merge has truly aborted, the watcher goroutine sends a 1253 // Get(/Meta2/QEndKey) request with a read uncommitted isolation level. If the 1254 // Get request returns either nil or a descriptor for a different range, the 1255 // merge is assumed to have committed. In this case, unfortunately, QEndKey is 1256 // the Q's end key post-split. After all, the split has committed and updated 1257 // Q's in-memory descriptor. 
The split transactions intents are cleaned up 1258 // asynchronously, however, and since the watcher goroutine is not performing a 1259 // consistent read it will not wait for the intents to be cleaned up. So 1260 // Get(/Meta2/QEndKey) might return nil, in which case the watcher goroutine 1261 // will incorrectly infer that the merge committed. (Note that the watcher 1262 // goroutine can't perform a consistent read, as that would look up the 1263 // transaction record on Q and deadlock, since Q is blocked for merging.) 1264 // 1265 // The bug was fixed by updating Q's local descriptor with a conditional put 1266 // instead of a put. This forces the merge transaction to fail early if writing 1267 // the intent would require forwarding the commit timestamp. In other words, 1268 // this ensures that the merge watcher goroutine is never launched if the RHS 1269 // local descriptor is updated while the merge transaction is executing. 1270 func TestStoreRangeMergeSplitRace_SplitWins(t *testing.T) { 1271 defer leaktest.AfterTest(t)() 1272 1273 ctx := context.Background() 1274 storeCfg := kvserver.TestStoreConfig(nil) 1275 storeCfg.TestingKnobs.DisableReplicateQueue = true 1276 1277 var distSender *kvcoord.DistSender 1278 var lhsDescKey atomic.Value 1279 var launchSplit int64 1280 var mergeRetries int64 1281 storeCfg.TestingKnobs.TestingRequestFilter = func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error { 1282 for _, req := range ba.Requests { 1283 if cput := req.GetConditionalPut(); cput != nil { 1284 if v := lhsDescKey.Load(); v != nil && v.(roachpb.Key).Equal(cput.Key) { 1285 // If this is the first merge attempt, launch the split 1286 // before the merge's first write succeeds. 1287 if atomic.CompareAndSwapInt64(&launchSplit, 1, 0) { 1288 _, pErr := kv.SendWrapped(ctx, distSender, adminSplitArgs(roachpb.Key("c"))) 1289 return pErr 1290 } 1291 // Otherwise, record that the merge retried and proceed. 
1292 atomic.AddInt64(&mergeRetries, 1) 1293 } 1294 } 1295 } 1296 return nil 1297 } 1298 1299 mtc := &multiTestContext{storeConfig: &storeCfg} 1300 mtc.Start(t, 1) 1301 defer mtc.Stop() 1302 distSender = mtc.distSenders[0] 1303 1304 lhsDesc, _, err := createSplitRanges(ctx, mtc.Store(0)) 1305 if err != nil { 1306 t.Fatal(err) 1307 } 1308 lhsDescKey.Store(keys.RangeDescriptorKey(lhsDesc.StartKey)) 1309 atomic.StoreInt64(&launchSplit, 1) 1310 1311 mergeArgs := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 1312 if _, pErr := kv.SendWrapped(ctx, distSender, mergeArgs); pErr != nil { 1313 t.Fatal(pErr) 1314 } 1315 if atomic.LoadInt64(&mergeRetries) == 0 { 1316 t.Fatal("expected merge to retry at least once due to concurrent split") 1317 } 1318 } 1319 1320 // TestStoreRangeMergeRHSLeaseExpiration verifies that, if the right-hand range 1321 // in a merge loses its lease while a merge is in progress, the new leaseholder 1322 // does not incorrectly serve traffic before the merge completes. 1323 func TestStoreRangeMergeRHSLeaseExpiration(t *testing.T) { 1324 defer leaktest.AfterTest(t)() 1325 1326 ctx := context.Background() 1327 storeCfg := kvserver.TestStoreConfig(nil) 1328 storeCfg.TestingKnobs.DisableReplicateQueue = true 1329 storeCfg.TestingKnobs.DisableMergeQueue = true 1330 storeCfg.Clock = nil // manual clock 1331 1332 // The synchronization in this test is tricky. The merge transaction is 1333 // controlled by the AdminMerge function and normally commits quite quickly, 1334 // but we need to ensure an expiration of the RHS's lease occurs while the 1335 // merge transaction is open. To do so we install various hooks to observe 1336 // and control requests. It's easiest to understand these hooks after you've 1337 // read the meat of the test. 1338 1339 // Install a hook to control when the merge transaction commits. 
1340 mergeEndTxnReceived := make(chan *roachpb.Transaction, 10) // headroom in case the merge transaction retries 1341 finishMerge := make(chan struct{}) 1342 storeCfg.TestingKnobs.TestingRequestFilter = func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error { 1343 for _, r := range ba.Requests { 1344 if et := r.GetEndTxn(); et != nil && et.InternalCommitTrigger.GetMergeTrigger() != nil { 1345 mergeEndTxnReceived <- ba.Txn 1346 <-finishMerge 1347 } 1348 } 1349 return nil 1350 } 1351 1352 // Install a hook to observe when a get or a put request for a special key, 1353 // rhsSentinel, acquires latches and begins evaluating. 1354 const reqConcurrency = 10 1355 rhsSentinel := roachpb.Key("rhs-sentinel") 1356 reqAcquiredLatch := make(chan struct{}, reqConcurrency) 1357 storeCfg.TestingKnobs.TestingLatchFilter = func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error { 1358 for _, r := range ba.Requests { 1359 req := r.GetInner() 1360 switch req.Method() { 1361 case roachpb.Get, roachpb.Put: 1362 if req.Header().Key.Equal(rhsSentinel) { 1363 reqAcquiredLatch <- struct{}{} 1364 } 1365 } 1366 } 1367 return nil 1368 } 1369 1370 mtc := &multiTestContext{ 1371 storeConfig: &storeCfg, 1372 // This test was written before the multiTestContext started creating many 1373 // system ranges at startup, and hasn't been update to take that into 1374 // account. 1375 startWithSingleRange: true, 1376 } 1377 1378 mtc.Start(t, 2) 1379 defer mtc.Stop() 1380 1381 // Create the ranges to be merged. Put both ranges on both stores, but give 1382 // the second store the lease on the RHS. The LHS is largely irrelevant. What 1383 // matters is that the RHS exists on two stores so we can transfer its lease 1384 // during the merge. 
1385 lhsDesc, rhsDesc, err := createSplitRanges(ctx, mtc.stores[0]) 1386 if err != nil { 1387 t.Fatal(err) 1388 } 1389 mtc.replicateRange(lhsDesc.RangeID, 1) 1390 mtc.replicateRange(rhsDesc.RangeID, 1) 1391 mtc.transferLease(ctx, rhsDesc.RangeID, 0, 1) 1392 1393 // Launch the merge. 1394 mergeErr := make(chan error) 1395 go func() { 1396 args := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 1397 _, pErr := kv.SendWrapped(ctx, mtc.stores[0].TestSender(), args) 1398 mergeErr <- pErr.GoError() 1399 }() 1400 1401 // Wait for the merge transaction to send its EndTxn request. It won't 1402 // be able to complete just yet, thanks to the hook we installed above. 1403 mergeTxn := <-mergeEndTxnReceived 1404 1405 // Now's our chance to move the lease on the RHS from the second store to the 1406 // first. This isn't entirely straightforward. The replica on the second store 1407 // is aware of the merge and is refusing all traffic, so we can't just send a 1408 // TransferLease request. Instead, we need to expire the second store's lease, 1409 // then acquire the lease on the first store. 1410 1411 // Before doing so, however, ensure that the merge transaction has written 1412 // its transaction record so that it doesn't run into trouble with the low 1413 // water mark of the new leaseholder's timestamp cache. This could result in 1414 // the transaction being inadvertently aborted during its first attempt, 1415 // which this test is not designed to handle. If the merge transaction did 1416 // abort then the get requests could complete on r2 before the merge retried. 1417 hb, hbH := heartbeatArgs(mergeTxn, mtc.clock().Now()) 1418 if _, pErr := kv.SendWrappedWith(ctx, mtc.stores[0].TestSender(), hbH, hb); pErr != nil { 1419 t.Fatal(pErr) 1420 } 1421 1422 // Turn off liveness heartbeats on the second store, then advance the clock 1423 // past the liveness expiration time. This expires all leases on all stores. 
1424 mtc.nodeLivenesses[1].PauseHeartbeat(true) 1425 mtc.advanceClock(ctx) 1426 1427 // Manually heartbeat the liveness on the first store to ensure it's 1428 // considered live. The automatic heartbeat might not come for a while. 1429 require.NoError(t, mtc.heartbeatLiveness(ctx, 0)) 1430 1431 // Send several get and put requests to the the RHS. The first of these to 1432 // arrive will acquire the lease; the remaining requests will wait for that 1433 // lease acquisition to complete. Then all requests should block waiting for 1434 // the Subsume request to complete. By sending several of these requests in 1435 // parallel, we attempt to trigger a race where a request could slip through 1436 // on the replica between when the new lease is installed and when the 1437 // mergeComplete channel is installed. 1438 // 1439 // Note that the first request would never hit this race on its own. Nor would 1440 // any request that arrived early enough to see an outdated lease in 1441 // Replica.mu.state.Lease. All of these requests joined the in-progress lease 1442 // acquisition and blocked until the lease command acquires its latches, 1443 // at which point the mergeComplete channel was updated. To hit the race, the 1444 // request needed to arrive exactly between the update to 1445 // Replica.mu.state.Lease and the update to Replica.mu.mergeComplete. 1446 // 1447 // This race has since been fixed by installing the mergeComplete channel 1448 // before the new lease. 
1449 reqErrs := make(chan *roachpb.Error) // closed when all reqs done 1450 var wg sync.WaitGroup 1451 wg.Add(reqConcurrency) 1452 go func() { 1453 wg.Wait() 1454 close(reqErrs) 1455 }() 1456 1457 for i := 0; i < reqConcurrency; i++ { 1458 go func(i int) { 1459 defer wg.Done() 1460 // For this test to have a shot at triggering a race, this log message 1461 // must be interleaved with the "new range lease" message, like so: 1462 // 1463 // I180821 21:57:53.799207 388 storage/client_merge_test.go:1079 starting get 5 1464 // I180821 21:57:53.800122 72 storage/replica_proposal.go:214 [s1,r2/1:{b-/Max}] new range lease ... 1465 // I180821 21:57:53.800447 318 storage/client_merge_test.go:1079 starting get 6 1466 // 1467 // When this test was written, it would always produce the above 1468 // interleaving, and successfully trigger the race when run with the race 1469 // detector enabled about 50% of the time. 1470 log.Infof(ctx, "starting req %d", i) 1471 var req roachpb.Request 1472 if i%2 == 0 { 1473 req = getArgs(rhsSentinel) 1474 } else { 1475 req = putArgs(rhsSentinel, []byte(fmt.Sprintf("val%d", i))) 1476 } 1477 _, pErr := kv.SendWrappedWith(ctx, mtc.stores[0].TestSender(), roachpb.Header{ 1478 RangeID: rhsDesc.RangeID, 1479 }, req) 1480 reqErrs <- pErr 1481 }(i) 1482 time.Sleep(time.Millisecond) 1483 } 1484 1485 // Wait for the get and put requests to acquire latches, which is as far as 1486 // they can get while the merge is in progress. Then wait a little bit 1487 // longer. This tests that the requests really do get stuck waiting for the 1488 // merge to complete without depending too heavily on implementation 1489 // details. 1490 for i := 0; i < reqConcurrency; i++ { 1491 select { 1492 case <-reqAcquiredLatch: 1493 // Latch acquired. 1494 case pErr := <-reqErrs: 1495 // Requests may never make it to the latch acquisition if s1 has not 1496 // yet learned s2's lease is expired. Instead, we'll see a 1497 // NotLeaseholderError. 
1498 require.IsType(t, &roachpb.NotLeaseHolderError{}, pErr.GetDetail()) 1499 } 1500 } 1501 time.Sleep(50 * time.Millisecond) 1502 1503 // Finally, allow the merge to complete. It should complete successfully. 1504 close(finishMerge) 1505 require.NoError(t, <-mergeErr) 1506 1507 // Because the merge completed successfully, r2 has ceased to exist. We 1508 // therefore *must* see only RangeNotFoundErrors here from every pending get 1509 // and put request. Anything else is a consistency error (or a bug in the 1510 // test). 1511 for pErr := range reqErrs { 1512 require.IsType(t, &roachpb.RangeNotFoundError{}, pErr.GetDetail()) 1513 } 1514 } 1515 1516 // TestStoreRangeMergeConcurrentRequests tests merging ranges that are serving 1517 // other traffic concurrently. 1518 func TestStoreRangeMergeConcurrentRequests(t *testing.T) { 1519 defer leaktest.AfterTest(t)() 1520 1521 ctx := context.Background() 1522 storeCfg := kvserver.TestStoreConfig(nil) 1523 storeCfg.TestingKnobs.DisableSplitQueue = true 1524 storeCfg.TestingKnobs.DisableMergeQueue = true 1525 storeCfg.TestingKnobs.DisableReplicateQueue = true 1526 storeCfg.Clock = nil // manual clock 1527 1528 var mtc *multiTestContext 1529 storeCfg.TestingKnobs.TestingResponseFilter = func( 1530 ctx context.Context, ba roachpb.BatchRequest, _ *roachpb.BatchResponse, 1531 ) *roachpb.Error { 1532 del := ba.Requests[0].GetDelete() 1533 if del != nil && bytes.HasSuffix(del.Key, keys.LocalRangeDescriptorSuffix) && rand.Int()%4 == 0 { 1534 // After every few deletions of the local range descriptor, expire all 1535 // range leases. This makes the following sequence of events quite likely: 1536 // 1537 // 1. The merge transaction begins and lays down deletion intents for 1538 // the meta2 and local copies of the RHS range descriptor. 1539 // 2. The RHS replica loses its lease, thanks to the following call to 1540 // mtc.advanceClock. 1541 // 3. 
A Get request arrives at the RHS replica and triggers a 1542 // synchronous lease acquisition. The lease acquisition notices 1543 // that a merge is in progress and installs a mergeComplete 1544 // channel. 1545 // 4. The Get request blocks on the newly installed mergeComplete 1546 // channel. 1547 // 5. The Subsume request arrives. (Or, if the merge transaction is 1548 // incorrectly pipelined, the QueryIntent request for the RHS range 1549 // descriptor key that precedes the Subsume request arrives.) 1550 // 1551 // This scenario previously caused deadlock. The merge was not able to 1552 // complete until the Subsume request completed, but the Subsume request 1553 // was unable to acquire latches until the Get request finished, which 1554 // was itself waiting for the merge to complete. Whoops! 1555 mtc.advanceClock(ctx) 1556 } 1557 return nil 1558 } 1559 1560 mtc = &multiTestContext{storeConfig: &storeCfg} 1561 mtc.Start(t, 1) 1562 defer mtc.Stop() 1563 store := mtc.Store(0) 1564 1565 keys := []roachpb.Key{ 1566 roachpb.Key("a1"), roachpb.Key("a2"), roachpb.Key("a3"), 1567 roachpb.Key("c1"), roachpb.Key("c2"), roachpb.Key("c3"), 1568 } 1569 1570 for _, k := range keys { 1571 if err := store.DB().Put(ctx, k, "val"); err != nil { 1572 t.Fatal(err) 1573 } 1574 } 1575 1576 // Failures in this test often present as a deadlock. Set a short timeout to 1577 // limit the damage. 
1578 ctx, cancel := context.WithTimeout(ctx, testutils.DefaultSucceedsSoonDuration) 1579 defer cancel() 1580 1581 const numGetWorkers = 16 1582 const numMerges = 16 1583 1584 var numGets int64 1585 doneCh := make(chan struct{}) 1586 g := ctxgroup.WithContext(ctx) 1587 for i := 0; i < numGetWorkers; i++ { 1588 g.GoCtx(func(ctx context.Context) error { 1589 for { 1590 select { 1591 case <-ctx.Done(): 1592 return ctx.Err() 1593 case <-doneCh: 1594 return nil 1595 default: 1596 } 1597 key := keys[rand.Intn(len(keys))] 1598 if kv, err := store.DB().Get(ctx, key); err != nil { 1599 return err 1600 } else if v := string(kv.ValueBytes()); v != "val" { 1601 return fmt.Errorf(`expected "val", but got %q`, v) 1602 } 1603 atomic.AddInt64(&numGets, 1) 1604 } 1605 }) 1606 } 1607 1608 for i := 0; i < numMerges; i++ { 1609 lhsDesc, _, err := createSplitRanges(ctx, store) 1610 if err != nil { 1611 t.Fatal(err) 1612 } 1613 args := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 1614 if _, pErr := kv.SendWrapped(ctx, store.TestSender(), args); pErr != nil { 1615 t.Fatal(pErr) 1616 } 1617 } 1618 1619 close(doneCh) 1620 if err := g.Wait(); err != nil { 1621 t.Fatal(err) 1622 } 1623 1624 // Expect that each worker was able to issue one at least one get request 1625 // during every split/merge cycle. Empirical evidence suggests that this a 1626 // very conservative estimate that is unlikely to be flaky. 1627 if n := atomic.LoadInt64(&numGets); n < numGetWorkers*numMerges { 1628 t.Fatalf("suspiciously low numGets (expected at least %d): %d", numGetWorkers*numMerges, n) 1629 } 1630 } 1631 1632 // TestStoreReplicaGCAfterMerge verifies that the replica GC queue writes the 1633 // correct tombstone when it GCs a replica of range that has been merged away. 1634 // 1635 // Consider the following sequence of events observed in a real cluster: 1636 // 1637 // 1. Adjacent ranges Q and R are slated to be merged. Q has replicas on 1638 // stores S1, S2, and S3, while R has replicas on S1, S2, and S4. 
//   2. To collocate Q and R, the merge queue adds a replica of R on S3 and
//      removes the replica on S4. The replica on S4 is queued for garbage
//      collection, but is not yet processed.
//   3. The merge transaction commits, deleting R's range descriptor from the
//      meta2 index.
//   4. The replica GC queue processes the former replica of R on S4. It
//      performs a consistent lookup of R's start key in the meta2 index to
//      determine whether the replica is still a member of R. Since R has been
//      deleted, the lookup returns Q's range descriptor, not R's.
//
// The replica GC queue would previously fail to notice that it had received Q's
// range descriptor, not R's. It would then proceed to call store.RemoveReplica
// with Q's descriptor, which would write a replica tombstone for Q, when in
// fact the replica tombstone needed to be written for R. Without the correct
// replica tombstone, if S4 received a slow Raft message for the now-GC'd
// replica, it would incorrectly construct an uninitialized replica and panic.
//
// This test also ensures that the node which processes the merge writes a
// tombstone that prevents the range from being resurrected by a Raft message.
//
// This test's approach to simulating this sequence of events is based on
// TestReplicaGCRace.
1661 func TestStoreReplicaGCAfterMerge(t *testing.T) { 1662 defer leaktest.AfterTest(t)() 1663 1664 ctx := context.Background() 1665 storeCfg := kvserver.TestStoreConfig(nil) 1666 storeCfg.TestingKnobs.DisableReplicateQueue = true 1667 storeCfg.TestingKnobs.DisableReplicaGCQueue = true 1668 storeCfg.TestingKnobs.DisableMergeQueue = true 1669 storeCfg.TestingKnobs.DisableEagerReplicaRemoval = true 1670 mtc := &multiTestContext{storeConfig: &storeCfg} 1671 mtc.Start(t, 2) 1672 defer mtc.Stop() 1673 store0, store1 := mtc.Store(0), mtc.Store(1) 1674 1675 rngID := store0.LookupReplica(roachpb.RKey("a")).Desc().RangeID 1676 mtc.replicateRange(rngID, 1) 1677 lhsDesc, rhsDesc, err := createSplitRanges(ctx, store0) 1678 if err != nil { 1679 t.Fatal(err) 1680 } 1681 1682 mtc.unreplicateRange(lhsDesc.RangeID, 1) 1683 mtc.unreplicateRange(rhsDesc.RangeID, 1) 1684 1685 args := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 1686 _, pErr := kv.SendWrapped(ctx, store0.TestSender(), args) 1687 if pErr != nil { 1688 t.Fatal(pErr) 1689 } 1690 1691 for _, rangeID := range []roachpb.RangeID{lhsDesc.RangeID, rhsDesc.RangeID} { 1692 repl, err := store1.GetReplica(rangeID) 1693 if err != nil { 1694 t.Fatal(err) 1695 } 1696 if err := store1.ManualReplicaGC(repl); err != nil { 1697 t.Fatal(err) 1698 } 1699 if _, err := store1.GetReplica(rangeID); err == nil { 1700 t.Fatalf("replica of r%d not gc'd from s1", rangeID) 1701 } 1702 } 1703 1704 rhsReplDesc0, ok := rhsDesc.GetReplicaDescriptor(store0.StoreID()) 1705 if !ok { 1706 t.Fatalf("expected %s to have a replica on %s", rhsDesc, store0) 1707 } 1708 rhsReplDesc1, ok := rhsDesc.GetReplicaDescriptor(store1.StoreID()) 1709 if !ok { 1710 t.Fatalf("expected %s to have a replica on %s", rhsDesc, store1) 1711 } 1712 1713 transport := kvserver.NewRaftTransport( 1714 log.AmbientContext{Tracer: mtc.storeConfig.Settings.Tracer}, 1715 cluster.MakeTestingClusterSettings(), 1716 nodedialer.New(mtc.rpcContext, gossip.AddressResolver(mtc.gossips[0])), 1717 
nil, /* grpcServer */ 1718 mtc.transportStopper, 1719 ) 1720 errChan := errorChannelTestHandler(make(chan *roachpb.Error, 1)) 1721 transport.Listen(store0.StoreID(), errChan) 1722 transport.Listen(store1.StoreID(), errChan) 1723 1724 sendHeartbeat := func( 1725 rangeID roachpb.RangeID, 1726 fromReplDesc, toReplDesc roachpb.ReplicaDescriptor, 1727 ) { 1728 // Try several times, as the message may be dropped (see #18355). 1729 for i := 0; i < 5; i++ { 1730 if sent := transport.SendAsync(&kvserver.RaftMessageRequest{ 1731 FromReplica: fromReplDesc, 1732 ToReplica: toReplDesc, 1733 Heartbeats: []kvserver.RaftHeartbeat{ 1734 { 1735 RangeID: rangeID, 1736 FromReplicaID: fromReplDesc.ReplicaID, 1737 ToReplicaID: toReplDesc.ReplicaID, 1738 Commit: 42, 1739 }, 1740 }, 1741 }, rpc.DefaultClass); !sent { 1742 t.Fatal("failed to send heartbeat") 1743 } 1744 select { 1745 case pErr := <-errChan: 1746 switch pErr.GetDetail().(type) { 1747 case *roachpb.RaftGroupDeletedError: 1748 return 1749 default: 1750 t.Fatalf("unexpected error type %T: %s", pErr.GetDetail(), pErr) 1751 } 1752 case <-time.After(time.Second): 1753 } 1754 } 1755 t.Fatal("did not get expected RaftGroupDeleted error") 1756 } 1757 1758 // Send a heartbeat to the now-GC'd replica on the stores. If the replica 1759 // tombstone was not written correctly when the replica was GC'd, this will 1760 // cause a panic. 1761 sendHeartbeat(rhsDesc.RangeID, rhsReplDesc0, rhsReplDesc1) 1762 sendHeartbeat(rhsDesc.RangeID, rhsReplDesc1, rhsReplDesc0) 1763 1764 // Send a heartbeat to a fictional replicas on with a large replica ID. 1765 // This tests an implementation detail: the replica tombstone that gets 1766 // written in this case will use the maximum possible replica ID, so the 1767 // stores should ignore heartbeats for replicas of the range with _any_ 1768 // replica ID. 
1769 sendHeartbeat(rhsDesc.RangeID, rhsReplDesc0, roachpb.ReplicaDescriptor{ 1770 NodeID: store1.Ident.NodeID, 1771 StoreID: store1.Ident.StoreID, 1772 ReplicaID: 123456, 1773 }) 1774 1775 sendHeartbeat(rhsDesc.RangeID, rhsReplDesc1, roachpb.ReplicaDescriptor{ 1776 NodeID: store0.Ident.NodeID, 1777 StoreID: store0.Ident.StoreID, 1778 ReplicaID: 123456, 1779 }) 1780 1781 // Be extra paranoid and verify the exact value of the replica tombstone. 1782 checkTombstone := func(eng storage.Engine) { 1783 var rhsTombstone roachpb.RangeTombstone 1784 rhsTombstoneKey := keys.RangeTombstoneKey(rhsDesc.RangeID) 1785 ok, err = storage.MVCCGetProto(ctx, eng, rhsTombstoneKey, hlc.Timestamp{}, 1786 &rhsTombstone, storage.MVCCGetOptions{}) 1787 if err != nil { 1788 t.Fatal(err) 1789 } else if !ok { 1790 t.Fatalf("missing range tombstone at key %s", rhsTombstoneKey) 1791 } 1792 if e, a := roachpb.ReplicaID(math.MaxInt32), rhsTombstone.NextReplicaID; e != a { 1793 t.Fatalf("expected next replica ID to be %d, but got %d", e, a) 1794 } 1795 } 1796 checkTombstone(store0.Engine()) 1797 checkTombstone(store1.Engine()) 1798 } 1799 1800 // TestStoreRangeMergeAddReplicaRace verifies that when an add replica request 1801 // occurs concurrently with a merge, one of them is aborted with a "descriptor 1802 // changed" CPut error. 
1803 func TestStoreRangeMergeAddReplicaRace(t *testing.T) { 1804 defer leaktest.AfterTest(t)() 1805 ctx := context.Background() 1806 tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{ 1807 ReplicationMode: base.ReplicationManual, 1808 }) 1809 defer tc.Stopper().Stop(ctx) 1810 1811 scratchStartKey := tc.ScratchRange(t) 1812 origDesc := tc.LookupRangeOrFatal(t, scratchStartKey) 1813 splitKey := scratchStartKey.Next() 1814 beforeDesc, _ := tc.SplitRangeOrFatal(t, splitKey) 1815 1816 mergeErrCh, addErrCh := make(chan error, 1), make(chan error, 1) 1817 go func() { 1818 mergeErrCh <- tc.Server(0).DB().AdminMerge(ctx, scratchStartKey) 1819 }() 1820 go func() { 1821 _, err := tc.Server(0).DB().AdminChangeReplicas( 1822 ctx, scratchStartKey, beforeDesc, roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, tc.Target(1))) 1823 addErrCh <- err 1824 }() 1825 mergeErr := <-mergeErrCh 1826 addErr := <-addErrCh 1827 afterDesc := tc.LookupRangeOrFatal(t, scratchStartKey) 1828 1829 const acceptableMergeErr = `unexpected value: raw_bytes|ranges not collocated` + 1830 `|cannot merge range with non-voter replicas` 1831 if mergeErr == nil && testutils.IsError(addErr, `descriptor changed: \[expected\]`) { 1832 // Merge won the race, no add happened. 1833 require.Len(t, afterDesc.Replicas().Voters(), 1) 1834 require.Equal(t, origDesc.EndKey, afterDesc.EndKey) 1835 } else if addErr == nil && testutils.IsError(mergeErr, acceptableMergeErr) { 1836 // Add won the race, no merge happened. 1837 require.Len(t, afterDesc.Replicas().Voters(), 2) 1838 require.Equal(t, beforeDesc.EndKey, afterDesc.EndKey) 1839 } else { 1840 t.Fatalf(`expected exactly one of merge or add to succeed got: [merge] %v [add] %v`, 1841 mergeErr, addErr) 1842 } 1843 } 1844 1845 // TestStoreRangeMergeResplitAddReplicaRace tests a diabolical edge case in the 1846 // merge/add replica race. 
If two replicas merge and then split at the previous 1847 // boundary, the descriptor will look unchanged and our usual CPut protection 1848 // would fail. For this reason, we introduced RangeDescriptor.Generation. 1849 // 1850 // Note that splits will not increment the generation counter until the cluster 1851 // version includes VersionRangeMerges. That's ok, because a sequence of splits 1852 // alone will always result in a descriptor with a smaller end key. Only a 1853 // sequence of splits AND merges can result in an unchanged end key, and merges 1854 // always increment the generation counter. 1855 func TestStoreRangeMergeResplitAddReplicaRace(t *testing.T) { 1856 defer leaktest.AfterTest(t)() 1857 ctx := context.Background() 1858 tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{ 1859 ReplicationMode: base.ReplicationManual, 1860 }) 1861 defer tc.Stopper().Stop(ctx) 1862 1863 scratchStartKey := tc.ScratchRange(t) 1864 splitKey := scratchStartKey.Next() 1865 origDesc, _ := tc.SplitRangeOrFatal(t, splitKey) 1866 require.NoError(t, tc.Server(0).DB().AdminMerge(ctx, scratchStartKey)) 1867 resplitDesc, _ := tc.SplitRangeOrFatal(t, splitKey) 1868 1869 assert.Equal(t, origDesc.RangeID, resplitDesc.RangeID) 1870 assert.Equal(t, origDesc.StartKey, resplitDesc.StartKey) 1871 assert.Equal(t, origDesc.EndKey, resplitDesc.EndKey) 1872 assert.Equal(t, origDesc.Replicas().All(), resplitDesc.Replicas().All()) 1873 assert.NotEqual(t, origDesc.Generation, resplitDesc.Generation) 1874 1875 _, err := tc.Server(0).DB().AdminChangeReplicas( 1876 ctx, scratchStartKey, origDesc, roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, tc.Target(1))) 1877 if !testutils.IsError(err, `descriptor changed`) { 1878 t.Fatalf(`expected "descriptor changed" error got: %+v`, err) 1879 } 1880 } 1881 1882 func TestStoreRangeMergeSlowUnabandonedFollower_NoSplit(t *testing.T) { 1883 defer leaktest.AfterTest(t)() 1884 1885 ctx := context.Background() 1886 storeCfg := 
kvserver.TestStoreConfig(nil) 1887 storeCfg.TestingKnobs.DisableReplicateQueue = true 1888 storeCfg.TestingKnobs.DisableReplicaGCQueue = true 1889 mtc := &multiTestContext{storeConfig: &storeCfg} 1890 mtc.Start(t, 3) 1891 defer mtc.Stop() 1892 store0, store2 := mtc.Store(0), mtc.Store(2) 1893 1894 rngID := store0.LookupReplica(roachpb.RKey("a")).Desc().RangeID 1895 mtc.replicateRange(rngID, 1, 2) 1896 lhsDesc, rhsDesc, err := createSplitRanges(ctx, store0) 1897 if err != nil { 1898 t.Fatal(err) 1899 } 1900 1901 // Wait for store2 to hear about the split. 1902 testutils.SucceedsSoon(t, func() error { 1903 if rhsRepl2, err := store2.GetReplica(rhsDesc.RangeID); err != nil || !rhsRepl2.IsInitialized() { 1904 return errors.Errorf("store2 has not yet processed split. err: %v", err) 1905 } 1906 return nil 1907 }) 1908 1909 // Block Raft traffic to the LHS replica on store2, by holding its raftMu, so 1910 // that its LHS isn't aware there's a merge in progress. 1911 lhsRepl2, err := store2.GetReplica(lhsDesc.RangeID) 1912 if err != nil { 1913 t.Fatal(err) 1914 } 1915 lhsRepl2.RaftLock() 1916 1917 args := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 1918 _, pErr := kv.SendWrapped(ctx, store0.TestSender(), args) 1919 if pErr != nil { 1920 t.Fatal(pErr) 1921 } 1922 1923 // Verify that store2 won't inadvertently GC the RHS before it's heard about 1924 // the merge. This is a tricky case for the replica GC queue, as meta2 will 1925 // indicate that the range has been merged away. 1926 rhsRepl2, err := store2.GetReplica(rhsDesc.RangeID) 1927 if err != nil { 1928 t.Fatal(err) 1929 } 1930 if err := store2.ManualReplicaGC(rhsRepl2); err != nil { 1931 t.Fatal(err) 1932 } 1933 if _, err := store2.GetReplica(rhsDesc.RangeID); err != nil { 1934 t.Fatalf("non-abandoned rhs replica unexpectedly GC'd before merge") 1935 } 1936 1937 // Restore communication with store2. Give it the lease to force all commands 1938 // to be applied, including the merge trigger. 
1939 lhsRepl2.RaftUnlock() 1940 mtc.transferLease(ctx, lhsDesc.RangeID, 0, 2) 1941 } 1942 1943 func TestStoreRangeMergeSlowUnabandonedFollower_WithSplit(t *testing.T) { 1944 defer leaktest.AfterTest(t)() 1945 1946 ctx := context.Background() 1947 storeCfg := kvserver.TestStoreConfig(nil) 1948 storeCfg.TestingKnobs.DisableReplicateQueue = true 1949 mtc := &multiTestContext{storeConfig: &storeCfg} 1950 mtc.Start(t, 3) 1951 defer mtc.Stop() 1952 store0, store2 := mtc.Store(0), mtc.Store(2) 1953 1954 rngID := store0.LookupReplica(roachpb.RKey("a")).Desc().RangeID 1955 mtc.replicateRange(rngID, 1, 2) 1956 lhsDesc, rhsDesc, err := createSplitRanges(ctx, store0) 1957 if err != nil { 1958 t.Fatal(err) 1959 } 1960 1961 // Wait for store2 to hear about the split. 1962 testutils.SucceedsSoon(t, func() error { 1963 _, err := store2.GetReplica(rhsDesc.RangeID) 1964 return err 1965 }) 1966 1967 // Start dropping all Raft traffic to the LHS on store2 so that it won't be 1968 // aware that there is a merge in progress. 1969 mtc.transport.Listen(store2.Ident.StoreID, &unreliableRaftHandler{ 1970 rangeID: lhsDesc.RangeID, 1971 RaftMessageHandler: store2, 1972 }) 1973 1974 args := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 1975 _, pErr := kv.SendWrapped(ctx, store0.TestSender(), args) 1976 if pErr != nil { 1977 t.Fatal(pErr) 1978 } 1979 1980 // Now split the newly merged range splits back out at exactly the same key. 1981 // When the replica GC queue looks in meta2 it will find the new RHS range, of 1982 // which store2 is a member. Note that store2 does not yet have an initialized 1983 // replica for this range, since it would intersect with the old RHS replica. 1984 _, newRHSDesc, err := createSplitRanges(ctx, store0) 1985 if err != nil { 1986 t.Fatal(err) 1987 } 1988 1989 // Remove the LHS replica from store2. 1990 mtc.unreplicateRange(lhsDesc.RangeID, 2) 1991 1992 // Transfer the lease on the new RHS to store2 and wait for it to apply. 
This 1993 // will force its replica to of the new RHS to become up to date, which 1994 // indirectly tests that the replica GC queue cleans up both the LHS replica 1995 // and the old RHS replica. 1996 mtc.transferLease(ctx, newRHSDesc.RangeID, 0, 2) 1997 testutils.SucceedsSoon(t, func() error { 1998 rhsRepl, err := store2.GetReplica(newRHSDesc.RangeID) 1999 if err != nil { 2000 return err 2001 } 2002 if !rhsRepl.OwnsValidLease(mtc.clock().Now()) { 2003 return errors.New("rhs store does not own valid lease for rhs range") 2004 } 2005 return nil 2006 }) 2007 } 2008 2009 func TestStoreRangeMergeSlowAbandonedFollower(t *testing.T) { 2010 defer leaktest.AfterTest(t)() 2011 2012 ctx := context.Background() 2013 storeCfg := kvserver.TestStoreConfig(nil) 2014 storeCfg.TestingKnobs.DisableMergeQueue = true 2015 storeCfg.TestingKnobs.DisableReplicateQueue = true 2016 storeCfg.TestingKnobs.DisableReplicaGCQueue = true 2017 mtc := &multiTestContext{storeConfig: &storeCfg} 2018 mtc.Start(t, 3) 2019 defer mtc.Stop() 2020 store0, store2 := mtc.Store(0), mtc.Store(2) 2021 2022 rngID := store0.LookupReplica(roachpb.RKey("a")).Desc().RangeID 2023 mtc.replicateRange(rngID, 1, 2) 2024 lhsDesc, rhsDesc, err := createSplitRanges(ctx, store0) 2025 if err != nil { 2026 t.Fatal(err) 2027 } 2028 2029 // Wait for store2 to hear about the split. 2030 var rhsRepl2 *kvserver.Replica 2031 testutils.SucceedsSoon(t, func() error { 2032 if rhsRepl2, err = store2.GetReplica(rhsDesc.RangeID); err != nil || !rhsRepl2.IsInitialized() { 2033 return errors.New("store2 has not yet processed split") 2034 } 2035 return nil 2036 }) 2037 2038 // Block Raft traffic to the LHS replica on store2, by holding its raftMu, so 2039 // that its LHS isn't aware there's a merge in progress. 
2040 lhsRepl2, err := store2.GetReplica(lhsDesc.RangeID) 2041 if err != nil { 2042 t.Fatal(err) 2043 } 2044 lhsRepl2.RaftLock() 2045 2046 args := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 2047 _, pErr := kv.SendWrapped(ctx, store0.TestSender(), args) 2048 if pErr != nil { 2049 t.Fatal(pErr) 2050 } 2051 2052 // Remove store2 from the range after the merge. It won't hear about this yet, 2053 // but we'll be able to commit the configuration change because we have two 2054 // other live members. 2055 mtc.unreplicateRange(lhsDesc.RangeID, 2) 2056 2057 // Verify that store2 won't inadvertently GC the RHS before it's heard about 2058 // the merge. This is a particularly tricky case for the replica GC queue, as 2059 // meta2 will indicate that the range has been merged away AND that store2 is 2060 // not a member of the new range. 2061 if err := store2.ManualReplicaGC(rhsRepl2); err != nil { 2062 t.Fatal(err) 2063 } 2064 if _, err := store2.GetReplica(rhsDesc.RangeID); err != nil { 2065 t.Fatal("rhs replica on store2 destroyed before lhs applied merge") 2066 } 2067 2068 // Flush store2's queued requests. 2069 lhsRepl2.RaftUnlock() 2070 2071 // Ensure that the unblocked merge eventually applies and subsumes the RHS. 2072 // In general this will happen due to receiving a ReplicaTooOldError but 2073 // it may require the replica GC queue. In rare cases the LHS will never 2074 // hear about the merge and may need to be GC'd on its own. 2075 testutils.SucceedsSoon(t, func() error { 2076 // Make the the LHS gets destroyed. 
2077 if lhsRepl, err := store2.GetReplica(lhsDesc.RangeID); err == nil { 2078 if err := store2.ManualReplicaGC(lhsRepl); err != nil { 2079 t.Fatal(err) 2080 } 2081 } 2082 if rhsRepl, err := store2.GetReplica(rhsDesc.RangeID); err == nil { 2083 if err := store2.ManualReplicaGC(rhsRepl); err != nil { 2084 t.Fatal(err) 2085 } 2086 return errors.New("rhs not yet destroyed") 2087 } 2088 return nil 2089 }) 2090 } 2091 2092 func TestStoreRangeMergeAbandonedFollowers(t *testing.T) { 2093 defer leaktest.AfterTest(t)() 2094 2095 ctx := context.Background() 2096 storeCfg := kvserver.TestStoreConfig(nil) 2097 storeCfg.TestingKnobs.DisableReplicateQueue = true 2098 storeCfg.TestingKnobs.DisableReplicaGCQueue = true 2099 storeCfg.TestingKnobs.DisableSplitQueue = true 2100 storeCfg.TestingKnobs.DisableMergeQueue = true 2101 storeCfg.TestingKnobs.DisableEagerReplicaRemoval = true 2102 mtc := &multiTestContext{storeConfig: &storeCfg} 2103 mtc.Start(t, 3) 2104 defer mtc.Stop() 2105 store2 := mtc.Store(2) 2106 2107 rngID := mtc.Store(0).LookupReplica(roachpb.RKey("a")).Desc().RangeID 2108 mtc.replicateRange(rngID, 1, 2) 2109 2110 // Split off three ranges. 2111 keys := []roachpb.RKey{roachpb.RKey("a"), roachpb.RKey("b"), roachpb.RKey("c")} 2112 for _, key := range keys { 2113 splitArgs := adminSplitArgs(key.AsRawKey()) 2114 if _, pErr := kv.SendWrapped(ctx, mtc.distSenders[0], splitArgs); pErr != nil { 2115 t.Fatal(pErr) 2116 } 2117 } 2118 2119 // Wait for store2 to hear about all three splits. 2120 var repls []*kvserver.Replica 2121 testutils.SucceedsSoon(t, func() error { 2122 repls = nil 2123 for _, key := range keys { 2124 repl := store2.LookupReplica(key) /* end */ 2125 if repl == nil || !repl.Desc().StartKey.Equal(key) { 2126 return fmt.Errorf("replica for key %q is missing or has wrong start key: %s", key, repl) 2127 } 2128 repls = append(repls, repl) 2129 } 2130 return nil 2131 }) 2132 2133 // Remove all replicas from store2. 
2134 for _, repl := range repls { 2135 mtc.unreplicateRange(repl.RangeID, 2) 2136 } 2137 2138 // Merge all three ranges together. store2 won't hear about this merge. 2139 for i := 0; i < 2; i++ { 2140 if _, pErr := kv.SendWrapped(ctx, mtc.distSenders[0], adminMergeArgs(roachpb.Key("a"))); pErr != nil { 2141 t.Fatal(pErr) 2142 } 2143 } 2144 2145 // Verify that the abandoned ranges on store2 can only be GC'd from left to 2146 // right. 2147 if err := store2.ManualReplicaGC(repls[2]); err != nil { 2148 t.Fatal(err) 2149 } 2150 if _, err := store2.GetReplica(repls[2].RangeID); err != nil { 2151 t.Fatal("c replica on store2 destroyed before b") 2152 } 2153 if err := store2.ManualReplicaGC(repls[1]); err != nil { 2154 t.Fatal(err) 2155 } 2156 if _, err := store2.GetReplica(repls[1].RangeID); err != nil { 2157 t.Fatal("b replica on store2 destroyed before a") 2158 } 2159 if err := store2.ManualReplicaGC(repls[0]); err != nil { 2160 t.Fatal(err) 2161 } 2162 if _, err := store2.GetReplica(repls[0].RangeID); err == nil { 2163 t.Fatal("a replica not destroyed") 2164 } 2165 2166 if err := store2.ManualReplicaGC(repls[2]); err != nil { 2167 t.Fatal(err) 2168 } 2169 if _, err := store2.GetReplica(repls[2].RangeID); err != nil { 2170 t.Fatal("c replica on store2 destroyed before b") 2171 } 2172 if err := store2.ManualReplicaGC(repls[1]); err != nil { 2173 t.Fatal(err) 2174 } 2175 if _, err := store2.GetReplica(repls[1].RangeID); err == nil { 2176 t.Fatal("b replica not destroyed") 2177 } 2178 2179 if err := store2.ManualReplicaGC(repls[2]); err != nil { 2180 t.Fatal(err) 2181 } 2182 if _, err := store2.GetReplica(repls[2].RangeID); err == nil { 2183 t.Fatal("c replica not destroyed") 2184 } 2185 } 2186 2187 // TestStoreRangeMergeAbandonedFollowersAutomaticallyGarbageCollected verifies 2188 // that the replica GC queue will clean up an abandoned RHS replica whose 2189 // destroyStatus is destroyReasonMergePending. 
The RHS replica ends up in this 2190 // state when its merge watcher goroutine notices that the merge committed, and 2191 // thus marks it as destroyed with reason destroyReasonMergePending, but the 2192 // corresponding LHS is rebalanced off the store before it can apply the merge 2193 // trigger. The replica GC queue would previously refuse to GC the abandoned 2194 // RHS, as it interpreted destroyReasonMergePending to mean that the RHS replica 2195 // had already been garbage collected. 2196 func TestStoreRangeMergeAbandonedFollowersAutomaticallyGarbageCollected(t *testing.T) { 2197 defer leaktest.AfterTest(t)() 2198 2199 ctx := context.Background() 2200 storeCfg := kvserver.TestStoreConfig(nil) 2201 storeCfg.TestingKnobs.DisableReplicateQueue = true 2202 mtc := &multiTestContext{storeConfig: &storeCfg} 2203 mtc.Start(t, 3) 2204 defer mtc.Stop() 2205 store0, store2 := mtc.Store(0), mtc.Store(2) 2206 2207 rngID := store0.LookupReplica(roachpb.RKey("a")).Desc().RangeID 2208 mtc.replicateRange(rngID, 1, 2) 2209 lhsDesc, rhsDesc, err := createSplitRanges(ctx, store0) 2210 if err != nil { 2211 t.Fatal(err) 2212 } 2213 2214 // Make store2 the leaseholder for the RHS and wait for the lease transfer to 2215 // apply. 2216 mtc.transferLease(ctx, rhsDesc.RangeID, 0, 2) 2217 testutils.SucceedsSoon(t, func() error { 2218 rhsRepl, err := store2.GetReplica(rhsDesc.RangeID) 2219 if err != nil { 2220 return err 2221 } 2222 if !rhsRepl.OwnsValidLease(mtc.clock().Now()) { 2223 return errors.New("store2 does not own valid lease for rhs range") 2224 } 2225 return nil 2226 }) 2227 2228 // Start dropping all Raft traffic to the LHS replica on store2 so that it 2229 // won't be aware that there is a merge in progress. 2230 mtc.transport.Listen(store2.Ident.StoreID, &unreliableRaftHandler{ 2231 rangeID: lhsDesc.RangeID, 2232 RaftMessageHandler: store2, 2233 }) 2234 2235 // Perform the merge. 
The LHS replica on store2 whon't hear about this merge 2236 // and thus won't subsume its RHS replica. The RHS replica's merge watcher 2237 // goroutine will, however, notice the merge and mark the RHS replica as 2238 // destroyed with reason destroyReasonMergePending. 2239 args := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 2240 _, pErr := kv.SendWrapped(ctx, store0.TestSender(), args) 2241 if pErr != nil { 2242 t.Fatal(pErr) 2243 } 2244 2245 // Remove the merged range from store2. Its replicas of both the LHS and RHS 2246 // are now eligible for GC. 2247 mtc.unreplicateRange(lhsDesc.RangeID, 2) 2248 2249 // Note that we purposely do not call store.ManualReplicaGC here, as that 2250 // calls replicaGCQueue.process directly, bypassing the logic in 2251 // baseQueue.MaybeAdd and baseQueue.Add. We specifically want to test that 2252 // queuing logic, which has been broken in the past. 2253 testutils.SucceedsSoon(t, func() error { 2254 if _, err := store2.GetReplica(lhsDesc.RangeID); err == nil { 2255 return errors.New("lhs not destroyed") 2256 } 2257 if _, err := store2.GetReplica(rhsDesc.RangeID); err == nil { 2258 return errors.New("rhs not destroyed") 2259 } 2260 return nil 2261 }) 2262 } 2263 2264 func TestStoreRangeMergeDeadFollowerBeforeTxn(t *testing.T) { 2265 defer leaktest.AfterTest(t)() 2266 2267 ctx := context.Background() 2268 var mtc *multiTestContext 2269 storeCfg := kvserver.TestStoreConfig(nil) 2270 storeCfg.TestingKnobs.DisableMergeQueue = true 2271 mtc = &multiTestContext{storeConfig: &storeCfg} 2272 mtc.Start(t, 3) 2273 defer mtc.Stop() 2274 store0 := mtc.Store(0) 2275 2276 rngID := store0.LookupReplica(roachpb.RKey("a")).Desc().RangeID 2277 mtc.replicateRange(rngID, 1, 2) 2278 lhsDesc, _, err := createSplitRanges(ctx, store0) 2279 if err != nil { 2280 t.Fatal(err) 2281 } 2282 2283 mtc.stopStore(2) 2284 2285 args := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 2286 _, pErr := kv.SendWrapped(ctx, store0.TestSender(), args) 2287 expErr := "waiting for 
all left-hand replicas to initialize" 2288 if !testutils.IsPError(pErr, expErr) { 2289 t.Fatalf("expected %q error, but got %v", expErr, pErr) 2290 } 2291 } 2292 2293 func TestStoreRangeMergeDeadFollowerDuringTxn(t *testing.T) { 2294 defer leaktest.AfterTest(t)() 2295 2296 ctx := context.Background() 2297 var mtc *multiTestContext 2298 storeCfg := kvserver.TestStoreConfig(nil) 2299 storeCfg.TestingKnobs.DisableMergeQueue = true 2300 storeCfg.TestingKnobs.TestingRequestFilter = func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error { 2301 if ba.IsSingleSubsumeRequest() && mtc.Store(2) != nil { 2302 mtc.stopStore(2) 2303 } 2304 return nil 2305 } 2306 mtc = &multiTestContext{storeConfig: &storeCfg} 2307 mtc.Start(t, 3) 2308 defer mtc.Stop() 2309 store0 := mtc.Store(0) 2310 2311 rngID := store0.LookupReplica(roachpb.RKey("a")).Desc().RangeID 2312 mtc.replicateRange(rngID, 1, 2) 2313 lhsDesc, _, err := createSplitRanges(ctx, store0) 2314 if err != nil { 2315 t.Fatal(err) 2316 } 2317 2318 args := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 2319 _, pErr := kv.SendWrapped(ctx, store0.TestSender(), args) 2320 expErr := "merge failed: waiting for all right-hand replicas to catch up" 2321 if !testutils.IsPError(pErr, expErr) { 2322 t.Fatalf("expected %q error, but got %v", expErr, pErr) 2323 } 2324 } 2325 2326 func TestStoreRangeReadoptedLHSFollower(t *testing.T) { 2327 defer leaktest.AfterTest(t)() 2328 2329 run := func(t *testing.T, withMerge bool) { 2330 ctx := context.Background() 2331 storeCfg := kvserver.TestStoreConfig(nil) 2332 storeCfg.TestingKnobs.DisableReplicateQueue = true 2333 storeCfg.TestingKnobs.DisableReplicaGCQueue = true 2334 storeCfg.TestingKnobs.DisableMergeQueue = true 2335 mtc := &multiTestContext{storeConfig: &storeCfg} 2336 mtc.Start(t, 3) 2337 defer mtc.Stop() 2338 store0, store2 := mtc.Store(0), mtc.Store(2) 2339 2340 // Create two ranges on store0 and store1. 
2341 lhsDesc, rhsDesc, err := createSplitRanges(ctx, store0) 2342 if err != nil { 2343 t.Fatal(err) 2344 } 2345 mtc.replicateRange(lhsDesc.RangeID, 1) 2346 mtc.replicateRange(rhsDesc.RangeID, 1) 2347 2348 // Abandon a replica of the LHS on store2. 2349 mtc.replicateRange(lhsDesc.RangeID, 2) 2350 var lhsRepl2 *kvserver.Replica 2351 testutils.SucceedsSoon(t, func() error { 2352 lhsRepl2, err = store2.GetReplica(lhsDesc.RangeID) 2353 if err != nil { 2354 return err 2355 } 2356 if !lhsRepl2.IsInitialized() { 2357 // Make sure the replica is initialized before unreplicating. 2358 // Uninitialized replicas that have a replicaID are hard to 2359 // GC (not implemented at the time of writing). 2360 return errors.Errorf("%s not initialized", lhsRepl2) 2361 } 2362 return nil 2363 }) 2364 mtc.unreplicateRange(lhsDesc.RangeID, 2) 2365 2366 if withMerge { 2367 // Merge the two ranges together. 2368 args := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 2369 _, pErr := kv.SendWrapped(ctx, store0.TestSender(), args) 2370 if pErr != nil { 2371 t.Fatal(pErr) 2372 } 2373 } 2374 2375 // Attempt to re-add the merged range to store2. This should succeed 2376 // immediately because there are no overlapping replicas that would interfere 2377 // with the widening of the existing LHS replica. 2378 if _, err := mtc.dbs[0].AdminChangeReplicas( 2379 ctx, lhsDesc.StartKey.AsRawKey(), 2380 *lhsDesc, 2381 roachpb.MakeReplicationChanges( 2382 roachpb.ADD_REPLICA, 2383 roachpb.ReplicationTarget{ 2384 NodeID: mtc.idents[2].NodeID, 2385 StoreID: mtc.idents[2].StoreID, 2386 }), 2387 ); !testutils.IsError(err, "descriptor changed") { 2388 t.Fatal(err) 2389 } 2390 2391 if err := store2.ManualReplicaGC(lhsRepl2); err != nil { 2392 t.Fatal(err) 2393 } 2394 2395 mtc.replicateRange(lhsDesc.RangeID, 2) 2396 // Give store2 the lease to force all commands to be applied, including the 2397 // ChangeReplicas. 
2398 mtc.transferLease(ctx, lhsDesc.RangeID, 0, 2) 2399 } 2400 2401 testutils.RunTrueAndFalse(t, "withMerge", run) 2402 } 2403 2404 // slowSnapRaftHandler delays any snapshots to rangeID until waitCh is closed. 2405 type slowSnapRaftHandler struct { 2406 rangeID roachpb.RangeID 2407 waitCh chan struct{} 2408 kvserver.RaftMessageHandler 2409 syncutil.Mutex 2410 } 2411 2412 func (h *slowSnapRaftHandler) unblock() { 2413 h.Lock() 2414 if h.waitCh != nil { 2415 close(h.waitCh) 2416 h.waitCh = nil 2417 } 2418 h.Unlock() 2419 } 2420 2421 func (h *slowSnapRaftHandler) HandleSnapshot( 2422 header *kvserver.SnapshotRequest_Header, respStream kvserver.SnapshotResponseStream, 2423 ) error { 2424 if header.RaftMessageRequest.RangeID == h.rangeID { 2425 h.Lock() 2426 waitCh := h.waitCh 2427 h.Unlock() 2428 if waitCh != nil { 2429 <-waitCh 2430 } 2431 } 2432 return h.RaftMessageHandler.HandleSnapshot(header, respStream) 2433 } 2434 2435 // TestStoreRangeMergeUninitializedLHSFollower reproduces a rare bug in which a 2436 // replica of the RHS of a merge could be garbage collected too soon. 2437 // 2438 // Consider two adjacent ranges, A and B. Suppose the replica of 2439 // A on the last store, S3, is uninitialized, e.g. because A was recently 2440 // created by a split and S3 has neither processed the split trigger nor 2441 // received a snapshot. The leaseholder for A will attempt to send a Raft 2442 // snapshot to bring S3's replica up to date, but this Raft snapshot may be 2443 // delayed due to a busy Raft snapshot queue or a slow network. 2444 // 2445 // Now suppose a merge of A and B commits before S3 receives a Raft snapshot for 2446 // A. There is a small window of time in which S3 can garbage collect its 2447 // replica of B! When S3 looks up B's meta2 descriptor, it will find that B has 2448 // been merged away. S3 will then try to prove that B's local left neighbor is 2449 // generationally up-to-date; if it is, it safe to GC B. 
Usually, S3 would 2450 // determine A to be B's left neighbor, realize that A has not yet processed the 2451 // merge, and correctly refuse to GC its replica of B. In this case, however, 2452 // S3's replica of A is uninitialized and thus doesn't know its start and end 2453 // key, so S3 will instead discover some more-distant left neighbor of B. This 2454 // distant neighbor might very well be up-to-date, and S3 will incorrectly 2455 // conclude that it can GC its replica of B! 2456 // 2457 // So say S3 GCs its replica of B. There are now two paths that A might take. 2458 // The happy case is that A receives a Raft snapshot that postdates the merge. 2459 // The unhappy case is that A receives a Raft snapshot that predates the merge, 2460 // and is then required to apply the merge via a MsgApp. Since there is no 2461 // longer a replica of B on S3, applying the merge trigger will explode. 2462 // 2463 // The solution was to require that all LHS replicas are initialized before 2464 // beginning a merge transaction. This ensures that the replica GC queue will 2465 // always discover the correct left neighbor when considering whether a subsumed 2466 // range can be GC'd. 2467 func TestStoreRangeMergeUninitializedLHSFollower(t *testing.T) { 2468 defer leaktest.AfterTest(t)() 2469 2470 ctx := context.Background() 2471 storeCfg := kvserver.TestStoreConfig(nil) 2472 storeCfg.TestingKnobs.DisableReplicateQueue = true 2473 mtc := &multiTestContext{storeConfig: &storeCfg} 2474 mtc.Start(t, 3) 2475 defer mtc.Stop() 2476 store0, store2 := mtc.Store(0), mtc.Store(2) 2477 distSender := mtc.distSenders[0] 2478 2479 split := func(key roachpb.RKey) roachpb.RangeID { 2480 t.Helper() 2481 if _, pErr := kv.SendWrapped(ctx, distSender, adminSplitArgs(key.AsRawKey())); pErr != nil { 2482 t.Fatal(pErr) 2483 } 2484 return store0.LookupReplica(key).RangeID 2485 } 2486 2487 // We'll create two ranges, A and B, as described in the comment on this test 2488 // function. 
2489 aKey, bKey := roachpb.RKey("a"), roachpb.RKey("b") 2490 2491 // Put range 1 on all three stores. 2492 rngID := store0.LookupReplica(aKey).Desc().RangeID 2493 mtc.replicateRange(rngID, 1, 2) 2494 2495 // Create range B and wait for store2 to process the split. 2496 bRangeID := split(bKey) 2497 var bRepl2 *kvserver.Replica 2498 testutils.SucceedsSoon(t, func() (err error) { 2499 if bRepl2, err = store2.GetReplica(bRangeID); err != nil || !bRepl2.IsInitialized() { 2500 return errors.New("store2 has not yet processed split of c") 2501 } 2502 return nil 2503 }) 2504 2505 // Now we want to create range A, but we need to make sure store2's replica of 2506 // A is not initialized. This requires dropping all Raft traffic to store2 2507 // from range 1, which will be the LHS of the split, so that store2's replica 2508 // of range 1 never processes the split trigger, which would create an 2509 // initialized replica of A. 2510 unreliableHandler := &unreliableRaftHandler{ 2511 rangeID: rngID, 2512 RaftMessageHandler: store2, 2513 } 2514 mtc.transport.Listen(store2.Ident.StoreID, unreliableHandler) 2515 2516 // Perform the split of A, now that store2 won't be able to initialize its 2517 // replica of A. 2518 aRangeID := split(aKey) 2519 2520 // Wedge a Raft snapshot that's destined for A. This allows us to capture a 2521 // pre-merge Raft snapshot, which we'll let loose after the merge commits. 2522 slowSnapHandler := &slowSnapRaftHandler{ 2523 rangeID: aRangeID, 2524 waitCh: make(chan struct{}), 2525 RaftMessageHandler: unreliableHandler, 2526 } 2527 defer slowSnapHandler.unblock() 2528 mtc.transport.Listen(store2.Ident.StoreID, slowSnapHandler) 2529 2530 // Remove the replica of range 1 on store2. If we were to leave it in place, 2531 // store2 would refuse to GC its replica of C after the merge commits, because 2532 // the left neighbor of C would be this out-of-date replica of range 1. 2533 // (Remember that we refused to let it process the split of A.) 
So we need to 2534 // remove it so that C has no left neighbor and thus will be eligible for GC. 2535 { 2536 r1Repl2, err := store2.GetReplica(rngID) 2537 if err != nil { 2538 t.Fatal(err) 2539 } 2540 mtc.unreplicateRange(rngID, 2) 2541 testutils.SucceedsSoon(t, func() error { 2542 if err := store2.ManualReplicaGC(r1Repl2); err != nil { 2543 return err 2544 } 2545 if _, err := store2.GetReplica(rngID); err == nil { 2546 return errors.New("r1Repl2 still exists") 2547 } 2548 return nil 2549 }) 2550 } 2551 2552 // Launch the merge of A and B. 2553 mergeErr := make(chan error) 2554 go func() { 2555 _, pErr := kv.SendWrapped(ctx, distSender, adminMergeArgs(aKey.AsRawKey())) 2556 mergeErr <- pErr.GoError() 2557 }() 2558 2559 // We want to assert that the merge does not complete until we allow store2's 2560 // replica of B to be initialized (by releasing the blocked Raft snapshot). A 2561 // happens-before assertion is nearly impossible to express, though, so 2562 // instead we just wait in the hope that, if the merge is buggy, it will 2563 // commit while we wait. Before the bug was fixed, this caused the test 2564 // to fail reliably. 2565 start := timeutil.Now() 2566 for timeutil.Since(start) < 50*time.Millisecond { 2567 if _, err := store2.GetReplica(bRangeID); err == nil { 2568 // Attempt to reproduce the exact fatal error described in the comment on 2569 // the test by running range B through the GC queue. If the bug is 2570 // present, GC will be successful and so the application of the merge 2571 // trigger on A to fail once we allow the Raft snapshot through. If the 2572 // bug is not present, we'll be unable to GC range B because it won't get 2573 // subsumed until after we allow the Raft snapshot through. 
2574 _ = store2.ManualReplicaGC(bRepl2) 2575 } 2576 time.Sleep(5 * time.Millisecond) // don't spin too hot to give the merge CPU time to complete 2577 } 2578 2579 select { 2580 case err := <-mergeErr: 2581 t.Errorf("merge completed early (err: %v)", err) 2582 close(mergeErr) 2583 default: 2584 } 2585 2586 // Allow store2's replica of A to initialize with a Raft snapshot that 2587 // predates the merge. 2588 slowSnapHandler.unblock() 2589 2590 // Assert that the merge completes successfully. 2591 if err := <-mergeErr; err != nil { 2592 t.Fatal(err) 2593 } 2594 2595 // Give store2 the lease on the merged range to force all commands to be 2596 // applied, including the merge trigger. 2597 mtc.transferLease(ctx, aRangeID, 0, 2) 2598 } 2599 2600 // TestStoreRangeMergeWatcher verifies that the watcher goroutine for a merge's 2601 // RHS does not erroneously permit traffic after the merge commits. 2602 func TestStoreRangeMergeWatcher(t *testing.T) { 2603 defer leaktest.AfterTest(t)() 2604 2605 testutils.RunTrueAndFalse(t, "inject-failures", testMergeWatcher) 2606 } 2607 2608 func testMergeWatcher(t *testing.T, injectFailures bool) { 2609 ctx := context.Background() 2610 storeCfg := kvserver.TestStoreConfig(nil) 2611 storeCfg.TestingKnobs.DisableReplicateQueue = true 2612 storeCfg.TestingKnobs.DisableReplicaGCQueue = true 2613 2614 var mergeTxnRetries, pushTxnRetries, meta2GetRetries int64 2615 if injectFailures { 2616 mergeTxnRetries = 3 2617 pushTxnRetries = 3 2618 meta2GetRetries = 3 2619 } 2620 2621 // Maybe inject some retryable errors when the merge transaction commits. 
2622 var mtc *multiTestContext 2623 storeCfg.TestingKnobs.TestingRequestFilter = func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error { 2624 for _, req := range ba.Requests { 2625 if et := req.GetEndTxn(); et != nil && et.InternalCommitTrigger.GetMergeTrigger() != nil { 2626 if atomic.AddInt64(&mergeTxnRetries, -1) >= 0 { 2627 return roachpb.NewError( 2628 roachpb.NewTransactionRetryError(roachpb.RETRY_SERIALIZABLE, "filter err")) 2629 } 2630 } 2631 if pt := req.GetPushTxn(); pt != nil { 2632 if atomic.AddInt64(&pushTxnRetries, -1) >= 0 { 2633 return roachpb.NewErrorf("injected failure") 2634 } 2635 } 2636 if g := req.GetGet(); g != nil && ba.ReadConsistency == roachpb.READ_UNCOMMITTED { 2637 if atomic.AddInt64(&meta2GetRetries, -1) >= 0 { 2638 return roachpb.NewErrorf("injected failure") 2639 } 2640 } 2641 } 2642 return nil 2643 } 2644 2645 mtc = &multiTestContext{ 2646 storeConfig: &storeCfg, 2647 // This test was written before the multiTestContext started creating many 2648 // system ranges at startup, and hasn't been update to take that into 2649 // account. 2650 startWithSingleRange: true, 2651 } 2652 2653 mtc.Start(t, 3) 2654 defer mtc.Stop() 2655 store0, store2 := mtc.Store(0), mtc.Store(2) 2656 2657 // Make store0 the leaseholder of the LHS and store2 the leaseholder of the 2658 // RHS. We'll be forcing store2's LHS to fall behind. This creates an 2659 // interesting scenario in which the leaseholder for the RHS has very 2660 // out-of-date information about the status of the merge. 2661 rngID := store0.LookupReplica(roachpb.RKey("a")).Desc().RangeID 2662 mtc.replicateRange(rngID, 1, 2) 2663 lhsDesc, rhsDesc, err := createSplitRanges(ctx, store0) 2664 if err != nil { 2665 t.Fatal(err) 2666 } 2667 mtc.transferLease(ctx, rhsDesc.RangeID, 0, 2) 2668 2669 // After the LHS replica on store2 processes the split, block Raft traffic to 2670 // it by holding its raftMu, so that it isn't aware there's a merge in 2671 // progress. 
2672 lhsRepl2, err := store2.GetReplica(lhsDesc.RangeID) 2673 if err != nil { 2674 t.Fatal(err) 2675 } 2676 testutils.SucceedsSoon(t, func() error { 2677 if !lhsRepl2.Desc().Equal(lhsDesc) { 2678 return errors.New("store2 has not processed split") 2679 } 2680 return nil 2681 }) 2682 lhsRepl2.RaftLock() 2683 2684 args := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 2685 _, pErr := kv.SendWrapped(ctx, store0.TestSender(), args) 2686 if pErr != nil { 2687 t.Fatal(pErr) 2688 } 2689 2690 // Immediately after the merge completes, send a request to the RHS which will 2691 // be handled by the leaseholder, on store2. This exercises a tricky scenario. 2692 // We've forced store2's LHS replica to fall behind, so it can't subsume 2693 // store2's RHS. store2's RHS is watching for the merge to complete, however, 2694 // and will notice that the merge has committed before the LHS does. 2695 getErr := make(chan error) 2696 go func() { 2697 _, pErr = kv.SendWrappedWith(ctx, store2.TestSender(), roachpb.Header{ 2698 RangeID: rhsDesc.RangeID, 2699 }, getArgs(rhsDesc.StartKey.AsRawKey())) 2700 getErr <- pErr.GoError() 2701 }() 2702 2703 // Restore communication with store2. Give it the lease to force all commands 2704 // to be applied, including the merge trigger. 2705 lhsRepl2.RaftUnlock() 2706 mtc.transferLease(ctx, lhsDesc.RangeID, 0, 2) 2707 2708 // We *must* see a RangeNotFound error from the get request we sent earlier 2709 // because we sent it after the merge completed. Anything else is a 2710 // consistency error (or a bug in the test). 2711 if err := <-getErr; !testutils.IsError(err, "r2 was not found") { 2712 t.Fatalf("expected RangeNotFound error from get after merge, but got %v", err) 2713 } 2714 } 2715 2716 // TestStoreRangeMergeSlowWatcher verifies that the watcher goroutine for the 2717 // RHS of a merge does not erroneously permit traffic after the merge commits, 2718 // even if the watcher goroutine is so slow in noticing the merge that another 2719 // merge occurs. 
2720 // 2721 // This test is a more complicated version of TestStoreRangeMergeWatcher that 2722 // exercises a rare but important edge case. 2723 // 2724 // The test creates three ranges, [a, b), [b, c), and [c, /Max). Hereafter these 2725 // ranges will be referred to as A, B, and C, respectively. store0 holds the 2726 // lease on A and C, while store1 holds the lease on B. The test will execute 2727 // two merges such that first A subsumes B, then AB subsumes C. The idea is to 2728 // inform store1 that the A <- B merge is in progress so that it locks B down, 2729 // but then keep it in the dark about the status of the merge for long enough 2730 // that the AB <- C merge commits. 2731 // 2732 // When store1's merge watcher goroutine looks up whether the A <- B merge 2733 // commit occurred in meta2 with a Get(/Meta2/c) request, it won't find the 2734 // descriptor for B, which would indicate that the merge aborted, nor the 2735 // descriptor for AB, which would indicate that the merge committed. Instead it 2736 // will find no descriptor at all, since the AB <- C merge has committed and the 2737 // descriptor for the merged range ABC is stored at /Meta2/Max, not /Meta2/c. 2738 func TestStoreRangeMergeSlowWatcher(t *testing.T) { 2739 defer leaktest.AfterTest(t)() 2740 2741 ctx := context.Background() 2742 aKey, bKey, cKey := roachpb.RKey("a"), roachpb.RKey("b"), roachpb.RKey("c") 2743 storeCfg := kvserver.TestStoreConfig(nil) 2744 storeCfg.TestingKnobs.DisableReplicateQueue = true 2745 var mtc *multiTestContext 2746 var store0, store1 *kvserver.Store 2747 2748 // Force PushTxn requests generated by the watcher goroutine to wait on a 2749 // channel. This is how we control when store1's merge watcher goroutine hears 2750 // about the status of the A <- B merge. 
2751 var syn syncutil.Mutex 2752 cond := sync.NewCond(&syn) 2753 storeCfg.TestingKnobs.TestingRequestFilter = func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error { 2754 syn.Lock() 2755 defer syn.Unlock() 2756 for _, req := range ba.Requests { 2757 // We can detect PushTxn requests generated by the watcher goroutine 2758 // because they use the minimum transaction priority. Note that we 2759 // only block the watcher goroutine on store1 so that we only interfere 2760 // with the first merge (A <- B) and not the later merge (AB <- C). 2761 if pt := req.GetPushTxn(); pt != nil && pt.PusherTxn.Priority == enginepb.MinTxnPriority && 2762 ba.GatewayNodeID == store1.Ident.NodeID { 2763 cond.Wait() 2764 } 2765 if et := req.GetEndTxn(); et != nil && !et.Commit && ba.Txn.Name == "merge" { 2766 // The merge transaction needed to restart for some reason. To avoid 2767 // deadlocking, we need to allow the watcher goroutine's PushTxn request 2768 // through so that it allows traffic on the range again. We'll try again 2769 // with the restarted merge transaction. 2770 cond.Signal() 2771 } 2772 } 2773 return nil 2774 } 2775 2776 // Record whether we've seen a request to Get(/Meta2/c) that returned nil. 2777 // This verifies that we're actually testing what we claim to. 2778 var sawMeta2Req int64 2779 meta2CKey := keys.RangeMetaKey(cKey).AsRawKey() 2780 storeCfg.TestingKnobs.TestingResponseFilter = func( 2781 ctx context.Context, ba roachpb.BatchRequest, br *roachpb.BatchResponse, 2782 ) *roachpb.Error { 2783 for i, req := range ba.Requests { 2784 if g := req.GetGet(); g != nil && g.Key.Equal(meta2CKey) && br.Responses[i].GetGet().Value == nil { 2785 atomic.StoreInt64(&sawMeta2Req, 1) 2786 } 2787 } 2788 return nil 2789 } 2790 2791 mtc = &multiTestContext{storeConfig: &storeCfg} 2792 mtc.Start(t, 3) 2793 defer mtc.Stop() 2794 store0, store1 = mtc.Store(0), mtc.Store(1) 2795 2796 // Create and place the ranges as described in the comment on this test. 
2797 rngID := store0.LookupReplica(aKey).Desc().RangeID 2798 mtc.replicateRange(rngID, 1, 2) 2799 keys := []roachpb.RKey{aKey, bKey, cKey} 2800 for _, key := range keys { 2801 splitArgs := adminSplitArgs(key.AsRawKey()) 2802 if _, pErr := kv.SendWrapped(ctx, mtc.distSenders[0], splitArgs); pErr != nil { 2803 t.Fatal(pErr) 2804 } 2805 } 2806 bRangeID := store0.LookupReplica(bKey).RangeID 2807 mtc.transferLease(ctx, bRangeID, 0, 1) 2808 2809 // Warm the DistSender cache on each node. We'll be blocking requests to B 2810 // during the test, and we don't want requests headed for A or C to get routed 2811 // to B while its blocked because of a stale DistSender cache. 2812 for _, key := range keys { 2813 for _, distSender := range mtc.distSenders { 2814 if _, pErr := kv.SendWrapped(ctx, distSender, getArgs(key.AsRawKey())); pErr != nil { 2815 t.Fatal(pErr) 2816 } 2817 } 2818 } 2819 2820 // Force the replica of A on store1 to fall behind so that it doesn't apply 2821 // any merge triggers. This makes the watcher goroutine responsible for 2822 // marking B as destroyed. 2823 aRepl1 := store1.LookupReplica(aKey) 2824 aRepl1.RaftLock() 2825 defer aRepl1.RaftUnlock() 2826 2827 // Merge A <- B. 2828 mergeArgs := adminMergeArgs(aKey.AsRawKey()) 2829 if _, pErr := kv.SendWrapped(ctx, mtc.distSenders[0], mergeArgs); pErr != nil { 2830 t.Fatal(pErr) 2831 } 2832 2833 // Immediately after the merge completes, send a request to B. 2834 getErr := make(chan error) 2835 go func() { 2836 _, pErr := kv.SendWrappedWith(ctx, store1.TestSender(), roachpb.Header{ 2837 RangeID: bRangeID, 2838 }, getArgs(bKey.AsRawKey())) 2839 getErr <- pErr.GoError() 2840 }() 2841 2842 // Merge AB <- C. 2843 if _, pErr := kv.SendWrapped(ctx, mtc.distSenders[0], mergeArgs); pErr != nil { 2844 t.Fatal(pErr) 2845 } 2846 2847 // Synchronously ensure that the intent on meta2CKey has been cleaned up. 2848 // The merge committed, but the intent resolution happens asynchronously. 
2849 _, pErr := kv.SendWrapped(ctx, mtc.distSenders[0], getArgs(meta2CKey)) 2850 if pErr != nil { 2851 t.Fatal(pErr) 2852 } 2853 2854 // With the meta2CKey intent cleaned up, allow store1's merge watcher 2855 // goroutine to proceed. 2856 cond.Signal() 2857 2858 // We *must* see a RangeNotFound error from the get request we sent earlier 2859 // because we sent it after the merge completed. Anything else is a 2860 // consistency error (or a bug in the test). 2861 expErr := fmt.Sprintf("r%d was not found", bRangeID) 2862 if err := <-getErr; !testutils.IsError(err, expErr) { 2863 t.Fatalf("expected %q error from get after merge, but got %v", expErr, err) 2864 } 2865 2866 if atomic.LoadInt64(&sawMeta2Req) != 1 { 2867 t.Fatalf("test did not generate expected meta2 get request/response") 2868 } 2869 } 2870 2871 func TestStoreRangeMergeRaftSnapshot(t *testing.T) { 2872 defer leaktest.AfterTest(t)() 2873 2874 // We will be testing the SSTs written on store2's engine. 2875 var receivingEng, sendingEng storage.Engine 2876 ctx := context.Background() 2877 storeCfg := kvserver.TestStoreConfig(nil) 2878 storeCfg.TestingKnobs.DisableReplicateQueue = true 2879 storeCfg.TestingKnobs.DisableReplicaGCQueue = true 2880 storeCfg.Clock = nil // manual clock 2881 storeCfg.TestingKnobs.BeforeSnapshotSSTIngestion = func( 2882 inSnap kvserver.IncomingSnapshot, 2883 snapType kvserver.SnapshotRequest_Type, 2884 sstNames []string, 2885 ) error { 2886 // Only verify snapshots of type RAFT and on the range under exercise 2887 // (range 2). Note that the keys of range 2 aren't verified in this 2888 // functions. Unreplicated range-id local keys are not verified because 2889 // there are too many keys and the other replicated keys are verified later 2890 // on in the test. This function verifies that the subsumed replicas have 2891 // been handled properly. 
2892 if snapType != kvserver.SnapshotRequest_RAFT || inSnap.State.Desc.RangeID != roachpb.RangeID(2) { 2893 return nil 2894 } 2895 // The seven SSTs we are expecting to ingest are in the following order: 2896 // 1. Replicated range-id local keys of the range in the snapshot. 2897 // 2. Range-local keys of the range in the snapshot. 2898 // 3. User keys of the range in the snapshot. 2899 // 4. Unreplicated range-id local keys of the range in the snapshot. 2900 // 5. SST to clear range-id local keys of the subsumed replica with 2901 // RangeID 3. 2902 // 6. SST to clear range-id local keys of the subsumed replica with 2903 // RangeID 4. 2904 // 7. SST to clear the user keys of the subsumed replicas. 2905 // 2906 // NOTE: There are no range-local keys in [d, /Max) in the store we're 2907 // sending a snapshot to, so we aren't expecting an SST to clear those 2908 // keys. 2909 if len(sstNames) != 7 { 2910 return errors.Errorf("expected to ingest 7 SSTs, got %d SSTs", len(sstNames)) 2911 } 2912 2913 // Only try to predict SSTs 3 and 5-7. SSTs 1, 2 and 4 are excluded in 2914 // the test since the state of the Raft log can be non-deterministic 2915 // with extra entries being appended to the sender's log after the 2916 // snapshot has already been sent. 2917 var sstNamesSubset []string 2918 sstNamesSubset = append(sstNamesSubset, sstNames[2]) 2919 sstNamesSubset = append(sstNamesSubset, sstNames[4:]...) 2920 2921 // Construct the expected SSTs and ensure that they are byte-by-byte 2922 // equal. This verification ensures that the SSTs have the same 2923 // tombstones and range deletion tombstones. 2924 var expectedSSTs [][]byte 2925 2926 // Construct SST #1 through #3 as numbered above, but only ultimately 2927 // keep the 3rd one. 
2928 keyRanges := rditer.MakeReplicatedKeyRanges(inSnap.State.Desc) 2929 it := rditer.NewReplicaDataIterator(inSnap.State.Desc, sendingEng, true /* replicatedOnly */, false /* seekEnd */) 2930 defer it.Close() 2931 // Write a range deletion tombstone to each of the SSTs then put in the 2932 // kv entries from the sender of the snapshot. 2933 for _, r := range keyRanges { 2934 sstFile := &storage.MemFile{} 2935 sst := storage.MakeIngestionSSTWriter(sstFile) 2936 if err := sst.ClearRange(r.Start, r.End); err != nil { 2937 return err 2938 } 2939 2940 // Keep adding kv data to the SST until the the key exceeds the 2941 // bounds of the range, then proceed to the next range. 2942 for ; ; it.Next() { 2943 valid, err := it.Valid() 2944 if err != nil { 2945 return err 2946 } 2947 if !valid || r.End.Key.Compare(it.Key().Key) <= 0 { 2948 if err := sst.Finish(); err != nil { 2949 return err 2950 } 2951 sst.Close() 2952 expectedSSTs = append(expectedSSTs, sstFile.Data()) 2953 break 2954 } 2955 if err := sst.Put(it.Key(), it.Value()); err != nil { 2956 return err 2957 } 2958 } 2959 } 2960 expectedSSTs = expectedSSTs[2:] 2961 2962 // Construct SSTs #5 and #6: range-id local keys of subsumed replicas 2963 // with RangeIDs 3 and 4. 
2964 for _, rangeID := range []roachpb.RangeID{roachpb.RangeID(3), roachpb.RangeID(4)} { 2965 sstFile := &storage.MemFile{} 2966 sst := storage.MakeIngestionSSTWriter(sstFile) 2967 defer sst.Close() 2968 r := rditer.MakeRangeIDLocalKeyRange(rangeID, false /* replicatedOnly */) 2969 if err := sst.ClearRange(r.Start, r.End); err != nil { 2970 return err 2971 } 2972 tombstoneKey := keys.RangeTombstoneKey(rangeID) 2973 tombstoneValue := &roachpb.RangeTombstone{NextReplicaID: math.MaxInt32} 2974 if err := storage.MVCCBlindPutProto(context.Background(), &sst, nil, tombstoneKey, hlc.Timestamp{}, tombstoneValue, nil); err != nil { 2975 return err 2976 } 2977 err := sst.Finish() 2978 if err != nil { 2979 return err 2980 } 2981 expectedSSTs = append(expectedSSTs, sstFile.Data()) 2982 } 2983 2984 // Construct SST #7: user key range of subsumed replicas. 2985 sstFile := &storage.MemFile{} 2986 sst := storage.MakeIngestionSSTWriter(sstFile) 2987 defer sst.Close() 2988 desc := roachpb.RangeDescriptor{ 2989 StartKey: roachpb.RKey("d"), 2990 EndKey: roachpb.RKeyMax, 2991 } 2992 r := rditer.MakeUserKeyRange(&desc) 2993 if err := storage.ClearRangeWithHeuristic(receivingEng, &sst, r.Start.Key, r.End.Key); err != nil { 2994 return err 2995 } 2996 err := sst.Finish() 2997 if err != nil { 2998 return err 2999 } 3000 expectedSSTs = append(expectedSSTs, sstFile.Data()) 3001 3002 var mismatchedSstsIdx []int 3003 // Iterate over all the tested SSTs and check that they're byte-by-byte equal. 
3004 for i := range sstNamesSubset { 3005 actualSST, err := receivingEng.ReadFile(sstNamesSubset[i]) 3006 if err != nil { 3007 return err 3008 } 3009 if !bytes.Equal(actualSST, expectedSSTs[i]) { 3010 mismatchedSstsIdx = append(mismatchedSstsIdx, i) 3011 } 3012 } 3013 if len(mismatchedSstsIdx) != 0 { 3014 return errors.Errorf("SST indices %v don't match", mismatchedSstsIdx) 3015 } 3016 return nil 3017 } 3018 mtc := &multiTestContext{ 3019 storeConfig: &storeCfg, 3020 // This test was written before the multiTestContext started creating many 3021 // system ranges at startup, and hasn't been update to take that into 3022 // account. 3023 startWithSingleRange: true, 3024 } 3025 mtc.Start(t, 3) 3026 defer mtc.Stop() 3027 store0, store2 := mtc.Store(0), mtc.Store(2) 3028 sendingEng = store0.Engine() 3029 receivingEng = store2.Engine() 3030 distSender := mtc.distSenders[0] 3031 3032 // Create three fully-caught-up, adjacent ranges on all three stores. 3033 mtc.replicateRange(roachpb.RangeID(1), 1, 2) 3034 for _, key := range []roachpb.Key{roachpb.Key("a"), roachpb.Key("b"), roachpb.Key("c")} { 3035 if _, pErr := kv.SendWrapped(ctx, distSender, adminSplitArgs(key)); pErr != nil { 3036 t.Fatal(pErr) 3037 } 3038 if _, pErr := kv.SendWrapped(ctx, distSender, incrementArgs(key, 1)); pErr != nil { 3039 t.Fatal(pErr) 3040 } 3041 mtc.waitForValues(key, []int64{1, 1, 1}) 3042 } 3043 3044 // Put some keys in [d, /Max) so the subsumed replica of [c, /Max) with range 3045 // ID 4 has tombstones. We will clear uncontained key range of subsumed 3046 // replicas, so when we are receiving a snapshot for [a, d), we expect to 3047 // clear the keys in [d, /Max). 
3048 for i := 0; i < 10; i++ { 3049 key := roachpb.Key("d" + strconv.Itoa(i)) 3050 if _, pErr := kv.SendWrapped(ctx, distSender, incrementArgs(key, 1)); pErr != nil { 3051 t.Fatal(pErr) 3052 } 3053 mtc.waitForValues(key, []int64{1, 1, 1}) 3054 } 3055 3056 aRepl0 := store0.LookupReplica(roachpb.RKey("a")) 3057 3058 // Start dropping all Raft traffic to the first range on store2. 3059 mtc.transport.Listen(store2.Ident.StoreID, &unreliableRaftHandler{ 3060 rangeID: aRepl0.RangeID, 3061 RaftMessageHandler: store2, 3062 }) 3063 3064 // Merge [a, b) into [b, c), then [a, c) into [c, /Max). 3065 for i := 0; i < 2; i++ { 3066 if _, pErr := kv.SendWrapped(ctx, distSender, adminMergeArgs(roachpb.Key("a"))); pErr != nil { 3067 t.Fatal(pErr) 3068 } 3069 } 3070 3071 // Split [a, /Max) into [a, d) and [d, /Max). This means the Raft snapshot 3072 // will span both a merge and a split. 3073 if _, pErr := kv.SendWrapped(ctx, distSender, adminSplitArgs(roachpb.Key("d"))); pErr != nil { 3074 t.Fatal(pErr) 3075 } 3076 3077 // Truncate the logs of the LHS. 3078 index := func() uint64 { 3079 repl := store0.LookupReplica(roachpb.RKey("a")) 3080 index, err := repl.GetLastIndex() 3081 if err != nil { 3082 t.Fatal(err) 3083 } 3084 // Truncate the log at index+1 (log entries < N are removed, so this 3085 // includes the merge). 3086 truncArgs := &roachpb.TruncateLogRequest{ 3087 RequestHeader: roachpb.RequestHeader{Key: roachpb.Key("a")}, 3088 Index: index, 3089 RangeID: repl.RangeID, 3090 } 3091 if _, err := kv.SendWrapped(ctx, mtc.distSenders[0], truncArgs); err != nil { 3092 t.Fatal(err) 3093 } 3094 return index 3095 }() 3096 3097 beforeRaftSnaps := store2.Metrics().RangeSnapshotsNormalApplied.Count() 3098 3099 // Restore Raft traffic to the LHS on store2. 
3100 log.Infof(ctx, "restored traffic to store 2") 3101 mtc.transport.Listen(store2.Ident.StoreID, &unreliableRaftHandler{ 3102 rangeID: aRepl0.RangeID, 3103 RaftMessageHandler: store2, 3104 unreliableRaftHandlerFuncs: unreliableRaftHandlerFuncs{ 3105 dropReq: func(req *kvserver.RaftMessageRequest) bool { 3106 // Make sure that even going forward no MsgApp for what we just 3107 // truncated can make it through. The Raft transport is asynchronous 3108 // so this is necessary to make the test pass reliably - otherwise 3109 // the follower on store2 may catch up without needing a snapshot, 3110 // tripping up the test. 3111 // 3112 // NB: the Index on the message is the log index that _precedes_ any of the 3113 // entries in the MsgApp, so filter where msg.Index < index, not <= index. 3114 return req.Message.Type == raftpb.MsgApp && req.Message.Index < index 3115 }, 3116 // Don't drop heartbeats or responses. 3117 dropHB: func(*kvserver.RaftHeartbeat) bool { return false }, 3118 dropResp: func(*kvserver.RaftMessageResponse) bool { return false }, 3119 }, 3120 }) 3121 3122 // Wait for all replicas to catch up to the same point. Because we truncated 3123 // the log while store2 was unavailable, this will require a Raft snapshot. 3124 testutils.SucceedsSoon(t, func() error { 3125 afterRaftSnaps := store2.Metrics().RangeSnapshotsNormalApplied.Count() 3126 if afterRaftSnaps <= beforeRaftSnaps { 3127 return errors.New("expected store2 to apply at least 1 additional raft snapshot") 3128 } 3129 3130 // Verify that the sets of keys in store0 and store2 are identical. 3131 storeKeys0 := getEngineKeySet(t, store0.Engine()) 3132 storeKeys2 := getEngineKeySet(t, store2.Engine()) 3133 dRepl0 := store0.LookupReplica(roachpb.RKey("d")) 3134 ignoreKey := func(k string) bool { 3135 // Unreplicated keys for the remaining ranges are allowed to differ. 
3136 for _, id := range []roachpb.RangeID{1, aRepl0.RangeID, dRepl0.RangeID} { 3137 if strings.HasPrefix(k, string(keys.MakeRangeIDUnreplicatedPrefix(id))) { 3138 return true 3139 } 3140 } 3141 return false 3142 } 3143 for k := range storeKeys0 { 3144 if ignoreKey(k) { 3145 continue 3146 } 3147 if _, ok := storeKeys2[k]; !ok { 3148 return fmt.Errorf("store2 missing key %s", roachpb.Key(k)) 3149 } 3150 } 3151 for k := range storeKeys2 { 3152 if ignoreKey(k) { 3153 continue 3154 } 3155 if _, ok := storeKeys0[k]; !ok { 3156 return fmt.Errorf("store2 has extra key %s", roachpb.Key(k)) 3157 } 3158 } 3159 return nil 3160 }) 3161 } 3162 3163 // TestStoreRangeMergeDuringShutdown verifies that a shutdown of a store 3164 // containing the RHS of a merge can occur cleanly. This previously triggered 3165 // a fatal error (#27552). 3166 func TestStoreRangeMergeDuringShutdown(t *testing.T) { 3167 defer leaktest.AfterTest(t)() 3168 3169 ctx := context.Background() 3170 storeCfg := kvserver.TestStoreConfig(nil) 3171 storeCfg.TestingKnobs.DisableSplitQueue = true 3172 storeCfg.TestingKnobs.DisableMergeQueue = true 3173 storeCfg.TestingKnobs.DisableReplicateQueue = true 3174 storeCfg.Clock = nil // manual clock 3175 3176 // Install a filter that triggers a shutdown when stop is non-zero and the 3177 // rhsDesc requests a new lease. 3178 var mtc *multiTestContext 3179 var state struct { 3180 syncutil.Mutex 3181 rhsDesc *roachpb.RangeDescriptor 3182 stop, stopping bool 3183 } 3184 storeCfg.TestingKnobs.TestingPostApplyFilter = func(args kvserverbase.ApplyFilterArgs) (int, *roachpb.Error) { 3185 state.Lock() 3186 if state.stop && !state.stopping && args.RangeID == state.rhsDesc.RangeID && args.IsLeaseRequest { 3187 // Shut down the store. The lease acquisition will notice that a merge is 3188 // in progress and attempt to run a task to watch for its completion. 3189 // Shutting down the store before running leasePostApply will prevent that 3190 // task from launching. 
This error path would previously fatal a node 3191 // incorrectly (#27552). 3192 state.stopping = true 3193 state.Unlock() 3194 go mtc.Stop() 3195 // Sleep to give the shutdown time to propagate. The test appeared to work 3196 // without this sleep, but best to be somewhat robust to different 3197 // goroutine schedules. 3198 time.Sleep(10 * time.Millisecond) 3199 } else { 3200 state.Unlock() 3201 } 3202 return 0, nil 3203 } 3204 3205 mtc = &multiTestContext{ 3206 storeConfig: &storeCfg, 3207 // This test was written before the multiTestContext started creating many 3208 // system ranges at startup, and hasn't been update to take that into 3209 // account. 3210 startWithSingleRange: true, 3211 } 3212 mtc.Start(t, 1) 3213 store := mtc.Store(0) 3214 stopper := mtc.engineStoppers[0] 3215 3216 _, rhsDesc, err := createSplitRanges(ctx, store) 3217 if err != nil { 3218 t.Fatal(err) 3219 } 3220 state.Lock() 3221 state.rhsDesc = rhsDesc 3222 state.Unlock() 3223 3224 // Simulate a merge transaction by launching a transaction that lays down 3225 // intents on the two copies of the RHS range descriptor. 3226 txn := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */) 3227 if err := txn.Del(ctx, keys.RangeDescriptorKey(rhsDesc.StartKey)); err != nil { 3228 t.Fatal(err) 3229 } 3230 if err := txn.Del(ctx, keys.RangeMetaKey(rhsDesc.StartKey)); err != nil { 3231 t.Fatal(err) 3232 } 3233 3234 // Indicate to the store filter installed above that the next lease 3235 // acquisition for the RHS should trigger a shutdown. 3236 state.Lock() 3237 state.stop = true 3238 state.Unlock() 3239 3240 // Expire all leases. 3241 mtc.advanceClock(ctx) 3242 3243 // Send a dummy get request on the RHS to force a lease acquisition. We expect 3244 // this to fail, as quiescing stores cannot acquire leases. 
3245 err = stopper.RunTaskWithErr(ctx, "test-get-rhs-key", func(ctx context.Context) error { 3246 _, err := store.DB().Get(ctx, "dummy-rhs-key") 3247 return err 3248 }) 3249 if exp := "not lease holder"; !testutils.IsError(err, exp) { 3250 t.Fatalf("expected %q error, but got %v", err, exp) 3251 } 3252 } 3253 3254 func TestMergeQueue(t *testing.T) { 3255 defer leaktest.AfterTest(t)() 3256 3257 ctx := context.Background() 3258 manualClock := hlc.NewManualClock(123) 3259 clock := hlc.NewClock(manualClock.UnixNano, time.Nanosecond) 3260 storeCfg := kvserver.TestStoreConfig(nil) 3261 storeCfg.TestingKnobs.DisableSplitQueue = true 3262 storeCfg.TestingKnobs.DisableReplicateQueue = true 3263 storeCfg.TestingKnobs.DisableScanner = true 3264 rangeMinBytes := int64(1 << 10) // 1KB 3265 storeCfg.DefaultZoneConfig.RangeMinBytes = &rangeMinBytes 3266 sv := &storeCfg.Settings.SV 3267 kvserverbase.MergeQueueEnabled.Override(sv, true) 3268 kvserver.MergeQueueInterval.Override(sv, 0) // process greedily 3269 var mtc multiTestContext 3270 // This test was written before the multiTestContext started creating many 3271 // system ranges at startup, and hasn't been update to take that into account. 3272 mtc.startWithSingleRange = true 3273 3274 mtc.storeConfig = &storeCfg 3275 // Inject clock for manipulation in tests. 
3276 mtc.storeConfig.Clock = clock 3277 mtc.Start(t, 2) 3278 defer mtc.Stop() 3279 mtc.initGossipNetwork() // needed for the non-collocated case's rebalancing to work 3280 store := mtc.Store(0) 3281 store.SetMergeQueueActive(true) 3282 3283 split := func(t *testing.T, key roachpb.Key, expirationTime hlc.Timestamp) { 3284 t.Helper() 3285 args := adminSplitArgs(key) 3286 args.ExpirationTime = expirationTime 3287 if _, pErr := kv.SendWrapped(ctx, store.DB().NonTransactionalSender(), args); pErr != nil { 3288 t.Fatal(pErr) 3289 } 3290 } 3291 3292 clearRange := func(t *testing.T, start, end roachpb.RKey) { 3293 if _, pErr := kv.SendWrapped(ctx, store.DB().NonTransactionalSender(), &roachpb.ClearRangeRequest{ 3294 RequestHeader: roachpb.RequestHeader{Key: start.AsRawKey(), EndKey: end.AsRawKey()}, 3295 }); pErr != nil { 3296 t.Fatal(pErr) 3297 } 3298 } 3299 3300 // Create two empty ranges, a - b and b - c, by splitting at a, b, and c. 3301 lhsStartKey := roachpb.RKey("a") 3302 rhsStartKey := roachpb.RKey("b") 3303 rhsEndKey := roachpb.RKey("c") 3304 for _, k := range []roachpb.RKey{lhsStartKey, rhsStartKey, rhsEndKey} { 3305 split(t, k.AsRawKey(), hlc.Timestamp{} /* expirationTime */) 3306 } 3307 lhs := func() *kvserver.Replica { return store.LookupReplica(lhsStartKey) } 3308 rhs := func() *kvserver.Replica { return store.LookupReplica(rhsStartKey) } 3309 3310 // setThresholds simulates a zone config update that updates the ranges' 3311 // minimum and maximum sizes. 
3312 setZones := func(zone zonepb.ZoneConfig) { 3313 lhs().SetZoneConfig(&zone) 3314 rhs().SetZoneConfig(&zone) 3315 } 3316 3317 rng, _ := randutil.NewPseudoRand() 3318 randBytes := randutil.RandBytes(rng, int(*storeCfg.DefaultZoneConfig.RangeMinBytes)) 3319 3320 reset := func(t *testing.T) { 3321 t.Helper() 3322 clearRange(t, lhsStartKey, rhsEndKey) 3323 for _, k := range []roachpb.RKey{lhsStartKey, rhsStartKey} { 3324 if err := store.DB().Put(ctx, k, randBytes); err != nil { 3325 t.Fatal(err) 3326 } 3327 } 3328 setZones(*storeCfg.DefaultZoneConfig) 3329 store.MustForceMergeScanAndProcess() // drain any merges that might already be queued 3330 split(t, roachpb.Key("b"), hlc.Timestamp{} /* expirationTime */) 3331 } 3332 3333 verifyMerged := func(t *testing.T) { 3334 t.Helper() 3335 repl := store.LookupReplica(rhsStartKey) 3336 if !repl.Desc().StartKey.Equal(lhsStartKey) { 3337 t.Fatalf("ranges unexpectedly unmerged") 3338 } 3339 } 3340 3341 verifyUnmerged := func(t *testing.T) { 3342 t.Helper() 3343 repl := store.LookupReplica(rhsStartKey) 3344 if repl.Desc().StartKey.Equal(lhsStartKey) { 3345 t.Fatalf("ranges unexpectedly merged") 3346 } 3347 } 3348 3349 t.Run("sanity", func(t *testing.T) { 3350 // Check that ranges are not trivially merged after reset. 
3351 reset(t) 3352 store.MustForceMergeScanAndProcess() 3353 verifyUnmerged(t) 3354 reset(t) 3355 store.MustForceMergeScanAndProcess() 3356 verifyUnmerged(t) 3357 }) 3358 3359 t.Run("both-empty", func(t *testing.T) { 3360 reset(t) 3361 clearRange(t, lhsStartKey, rhsEndKey) 3362 store.MustForceMergeScanAndProcess() 3363 verifyMerged(t) 3364 }) 3365 3366 t.Run("lhs-undersize", func(t *testing.T) { 3367 reset(t) 3368 zone := protoutil.Clone(storeCfg.DefaultZoneConfig).(*zonepb.ZoneConfig) 3369 *zone.RangeMinBytes *= 2 3370 lhs().SetZoneConfig(zone) 3371 store.MustForceMergeScanAndProcess() 3372 verifyMerged(t) 3373 }) 3374 3375 t.Run("combined-threshold", func(t *testing.T) { 3376 reset(t) 3377 3378 // The ranges are individually beneath the minimum size threshold, but 3379 // together they'll exceed the maximum size threshold. 3380 zone := protoutil.Clone(storeCfg.DefaultZoneConfig).(*zonepb.ZoneConfig) 3381 zone.RangeMinBytes = proto.Int64(lhs().GetMVCCStats().Total() + 1) 3382 zone.RangeMaxBytes = proto.Int64(lhs().GetMVCCStats().Total()*2 - 1) 3383 setZones(*zone) 3384 store.MustForceMergeScanAndProcess() 3385 verifyUnmerged(t) 3386 3387 // Once the maximum size threshold is increased, the merge can occur. 3388 zone.RangeMaxBytes = proto.Int64(*zone.RangeMaxBytes + 1) 3389 setZones(*zone) 3390 store.MustForceMergeScanAndProcess() 3391 verifyMerged(t) 3392 }) 3393 3394 t.Run("non-collocated", func(t *testing.T) { 3395 reset(t) 3396 verifyUnmerged(t) 3397 rhsRangeID := rhs().RangeID 3398 mtc.replicateRange(rhsRangeID, 1) 3399 mtc.transferLease(ctx, rhsRangeID, 0, 1) 3400 mtc.unreplicateRange(rhsRangeID, 0) 3401 require.NoError(t, mtc.waitForUnreplicated(rhsRangeID, 0)) 3402 3403 clearRange(t, lhsStartKey, rhsEndKey) 3404 store.MustForceMergeScanAndProcess() 3405 verifyMerged(t) 3406 }) 3407 3408 // TODO(jeffreyxiao): Add subtest to consider load when making merging 3409 // decisions. 
3410 3411 t.Run("sticky-bit", func(t *testing.T) { 3412 reset(t) 3413 store.MustForceMergeScanAndProcess() 3414 verifyUnmerged(t) 3415 3416 // Perform manual merge and verify that no merge occurred. 3417 split(t, rhsStartKey.AsRawKey(), hlc.MaxTimestamp /* expirationTime */) 3418 clearRange(t, lhsStartKey, rhsEndKey) 3419 store.MustForceMergeScanAndProcess() 3420 verifyUnmerged(t) 3421 3422 // Delete sticky bit and verify that merge occurs. 3423 unsplitArgs := &roachpb.AdminUnsplitRequest{ 3424 RequestHeader: roachpb.RequestHeader{ 3425 Key: rhsStartKey.AsRawKey(), 3426 }, 3427 } 3428 if _, err := kv.SendWrapped(ctx, store.DB().NonTransactionalSender(), unsplitArgs); err != nil { 3429 t.Fatal(err) 3430 } 3431 store.MustForceMergeScanAndProcess() 3432 verifyMerged(t) 3433 }) 3434 3435 t.Run("sticky-bit-expiration", func(t *testing.T) { 3436 manualSplitTTL := time.Millisecond * 200 3437 reset(t) 3438 store.MustForceMergeScanAndProcess() 3439 verifyUnmerged(t) 3440 3441 // Perform manual merge and verify that no merge occurred. 3442 split(t, rhsStartKey.AsRawKey(), clock.Now().Add(manualSplitTTL.Nanoseconds(), 0) /* expirationTime */) 3443 clearRange(t, lhsStartKey, rhsEndKey) 3444 store.MustForceMergeScanAndProcess() 3445 verifyUnmerged(t) 3446 3447 // Sticky bit is not expired yet. 3448 manualClock.Set(manualSplitTTL.Nanoseconds()) 3449 store.MustForceMergeScanAndProcess() 3450 verifyUnmerged(t) 3451 3452 // Sticky bit is expired. 3453 manualClock.Set(manualSplitTTL.Nanoseconds() * 2) 3454 store.MustForceMergeScanAndProcess() 3455 verifyMerged(t) 3456 }) 3457 } 3458 3459 func TestInvalidSubsumeRequest(t *testing.T) { 3460 defer leaktest.AfterTest(t)() 3461 3462 ctx := context.Background() 3463 var mtc multiTestContext 3464 mtc.Start(t, 1) 3465 defer mtc.Stop() 3466 store := mtc.Store(0) 3467 3468 // A Subsume request that succeeds when it shouldn't will wedge a 3469 // store because it waits for a merge that is not actually in progress. 
Set a 3470 // short timeout to limit the damage. 3471 ctx, cancel := context.WithTimeout(ctx, testutils.DefaultSucceedsSoonDuration) 3472 defer cancel() 3473 3474 lhsDesc, rhsDesc, err := createSplitRanges(ctx, store) 3475 if err != nil { 3476 t.Fatal(err) 3477 } 3478 3479 getSnapArgs := roachpb.SubsumeRequest{ 3480 RequestHeader: roachpb.RequestHeader{Key: rhsDesc.StartKey.AsRawKey()}, 3481 LeftDesc: *lhsDesc, 3482 RightDesc: *rhsDesc, 3483 } 3484 3485 // Subsume with an incorrect RightDesc should fail. 3486 { 3487 badRHSDesc := *rhsDesc 3488 badRHSDesc.EndKey = badRHSDesc.EndKey.Next() 3489 badArgs := getSnapArgs 3490 badArgs.RightDesc = badRHSDesc 3491 _, pErr := kv.SendWrappedWith(ctx, store.TestSender(), roachpb.Header{ 3492 RangeID: rhsDesc.RangeID, 3493 }, &badArgs) 3494 if exp := "RHS range bounds do not match"; !testutils.IsPError(pErr, exp) { 3495 t.Fatalf("expected %q error, but got %v", exp, pErr) 3496 } 3497 } 3498 3499 // Subsume from a non-neighboring LHS should fail. 3500 { 3501 badArgs := getSnapArgs 3502 badArgs.LeftDesc.EndKey = badArgs.LeftDesc.EndKey.Next() 3503 _, pErr := kv.SendWrappedWith(ctx, store.TestSender(), roachpb.Header{ 3504 RangeID: rhsDesc.RangeID, 3505 }, &badArgs) 3506 if exp := "ranges are not adjacent"; !testutils.IsPError(pErr, exp) { 3507 t.Fatalf("expected %q error, but got %v", exp, pErr) 3508 } 3509 } 3510 3511 // Subsume without an intent on the local range descriptor should fail. 3512 _, pErr := kv.SendWrappedWith(ctx, store.TestSender(), roachpb.Header{ 3513 RangeID: rhsDesc.RangeID, 3514 }, &getSnapArgs) 3515 if exp := "range missing intent on its local descriptor"; !testutils.IsPError(pErr, exp) { 3516 t.Fatalf("expected %q error, but got %v", exp, pErr) 3517 } 3518 3519 // Subsume when a non-deletion intent is present on the 3520 // local range descriptor should fail. 
3521 err = store.DB().Txn(ctx, func(ctx context.Context, txn *kv.Txn) error { 3522 if err := txn.Put(ctx, keys.RangeDescriptorKey(rhsDesc.StartKey), "garbage"); err != nil { 3523 return err 3524 } 3525 // NB: Subsume intentionally takes place outside of the txn so 3526 // that it sees an intent rather than the value the txn just wrote. 3527 _, pErr := kv.SendWrappedWith(ctx, store.TestSender(), roachpb.Header{ 3528 RangeID: rhsDesc.RangeID, 3529 }, &getSnapArgs) 3530 if exp := "non-deletion intent on local range descriptor"; !testutils.IsPError(pErr, exp) { 3531 return fmt.Errorf("expected %q error, but got %v", exp, pErr) 3532 } 3533 return nil 3534 }) 3535 if err != nil { 3536 t.Fatal(err) 3537 } 3538 } 3539 3540 func BenchmarkStoreRangeMerge(b *testing.B) { 3541 ctx := context.Background() 3542 var mtc multiTestContext 3543 mtc.Start(b, 1) 3544 defer mtc.Stop() 3545 store := mtc.Store(0) 3546 3547 lhsDesc, rhsDesc, err := createSplitRanges(ctx, store) 3548 if err != nil { 3549 b.Fatal(err) 3550 } 3551 3552 // Write some values left and right of the proposed split key. 3553 kvserver.WriteRandomDataToRange(b, store, lhsDesc.RangeID, []byte("aaa")) 3554 kvserver.WriteRandomDataToRange(b, store, rhsDesc.RangeID, []byte("ccc")) 3555 3556 // Create args to merge the b range back into the a range. 3557 mArgs := adminMergeArgs(lhsDesc.StartKey.AsRawKey()) 3558 3559 b.ResetTimer() 3560 for i := 0; i < b.N; i++ { 3561 // Merge the ranges. 3562 b.StartTimer() 3563 if _, err := kv.SendWrapped(ctx, store.TestSender(), mArgs); err != nil { 3564 b.Fatal(err) 3565 } 3566 3567 // Split the range. 3568 b.StopTimer() 3569 if _, _, err := createSplitRanges(ctx, store); err != nil { 3570 b.Fatal(err) 3571 } 3572 } 3573 }