github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/batcheval/cmd_end_transaction.go (about) 1 // Copyright 2014 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package batcheval 12 13 import ( 14 "bytes" 15 "context" 16 "fmt" 17 "math" 18 "sync/atomic" 19 20 "github.com/cockroachdb/cockroach/pkg/clusterversion" 21 "github.com/cockroachdb/cockroach/pkg/keys" 22 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/abortspan" 23 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval/result" 24 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase" 25 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb" 26 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/rditer" 27 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset" 28 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader" 29 "github.com/cockroachdb/cockroach/pkg/roachpb" 30 "github.com/cockroachdb/cockroach/pkg/storage" 31 "github.com/cockroachdb/cockroach/pkg/storage/enginepb" 32 "github.com/cockroachdb/cockroach/pkg/util/hlc" 33 "github.com/cockroachdb/cockroach/pkg/util/log" 34 "github.com/cockroachdb/cockroach/pkg/util/tracing" 35 "github.com/cockroachdb/errors" 36 "github.com/cockroachdb/logtags" 37 ) 38 39 func init() { 40 RegisterReadWriteCommand(roachpb.EndTxn, declareKeysEndTxn, EndTxn) 41 } 42 43 // declareKeysWriteTransaction is the shared portion of 44 // declareKeys{End,Heartbeat}Transaction. 45 func declareKeysWriteTransaction( 46 _ *roachpb.RangeDescriptor, 47 header roachpb.Header, 48 req roachpb.Request, 49 latchSpans *spanset.SpanSet, 50 ) { 51 if header.Txn != nil { 52 header.Txn.AssertInitialized(context.TODO()) 53 latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{ 54 Key: keys.TransactionKey(req.Header().Key, header.Txn.ID), 55 }) 56 } 57 } 58 59 func declareKeysEndTxn( 60 desc *roachpb.RangeDescriptor, 61 header roachpb.Header, 62 req roachpb.Request, 63 latchSpans, _ *spanset.SpanSet, 64 ) { 65 et := req.(*roachpb.EndTxnRequest) 66 declareKeysWriteTransaction(desc, header, req, latchSpans) 67 var minTxnTS hlc.Timestamp 68 if header.Txn != nil { 69 header.Txn.AssertInitialized(context.TODO()) 70 minTxnTS = header.Txn.MinTimestamp 71 abortSpanAccess := spanset.SpanReadOnly 72 if !et.Commit { 73 // Rollback EndTxn requests may write to the abort span, either if 74 // their Poison flag is set, in which case they will add an abort 75 // span entry, or if their Poison flag is not set and an abort span 76 // entry already exists on this Range, in which case they will clear 77 // that entry. 78 abortSpanAccess = spanset.SpanReadWrite 79 } 80 latchSpans.AddNonMVCC(abortSpanAccess, roachpb.Span{ 81 Key: keys.AbortSpanKey(header.RangeID, header.Txn.ID), 82 }) 83 } 84 85 // If the request is intending to finalize the transaction record then it 86 // needs to declare a few extra keys. 87 if !et.IsParallelCommit() { 88 // All requests that intend on resolving local locks need to depend on 89 // the range descriptor because they need to determine which locks are 90 // within the local range. 
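		// Illustrative comparison (editorial sketch, not from the upstream
		// file): a simple point-write command would only need a single MVCC
		// write latch at its own key and timestamp, roughly
		//
		//	latchSpans.AddMVCC(spanset.SpanReadWrite, roachpb.Span{Key: req.Header().Key}, header.Timestamp)
		//
		// EndTxn additionally needs the non-MVCC latches declared above and
		// below because the transaction record, abort span, and range
		// descriptor are not ordinary MVCC point keys.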
91 latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{Key: keys.RangeDescriptorKey(desc.StartKey)}) 92 93 // The spans may extend beyond this Range, but it's ok for the 94 // purpose of acquiring latches. The parts in our Range will 95 // be resolved eagerly. 96 for _, span := range et.LockSpans { 97 latchSpans.AddMVCC(spanset.SpanReadWrite, span, minTxnTS) 98 } 99 100 if et.InternalCommitTrigger != nil { 101 if st := et.InternalCommitTrigger.SplitTrigger; st != nil { 102 // Splits may read from the entire pre-split range (they read 103 // from the LHS in all cases, and the RHS only when the existing 104 // stats contain estimates). Splits declare non-MVCC read access 105 // across the entire LHS to block all concurrent writes to the 106 // LHS because their stat deltas will interfere with the 107 // non-delta stats computed as a part of the split. Splits 108 // declare non-MVCC write access across the entire RHS to block 109 // all concurrent reads and writes to the RHS because they will 110 // fail if applied after the split. (see 111 // https://github.com/cockroachdb/cockroach/issues/14881) 112 latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{ 113 Key: st.LeftDesc.StartKey.AsRawKey(), 114 EndKey: st.LeftDesc.EndKey.AsRawKey(), 115 }) 116 latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{ 117 Key: st.RightDesc.StartKey.AsRawKey(), 118 EndKey: st.RightDesc.EndKey.AsRawKey(), 119 }) 120 latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{ 121 Key: keys.MakeRangeKeyPrefix(st.LeftDesc.StartKey), 122 EndKey: keys.MakeRangeKeyPrefix(st.RightDesc.EndKey).PrefixEnd(), 123 }) 124 125 leftRangeIDPrefix := keys.MakeRangeIDReplicatedPrefix(header.RangeID) 126 latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{ 127 Key: leftRangeIDPrefix, 128 EndKey: leftRangeIDPrefix.PrefixEnd(), 129 }) 130 rightRangeIDPrefix := keys.MakeRangeIDReplicatedPrefix(st.RightDesc.RangeID) 131 latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{ 132 Key: rightRangeIDPrefix, 133 EndKey: rightRangeIDPrefix.PrefixEnd(), 134 }) 135 136 rightRangeIDUnreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(st.RightDesc.RangeID) 137 latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{ 138 Key: rightRangeIDUnreplicatedPrefix, 139 EndKey: rightRangeIDUnreplicatedPrefix.PrefixEnd(), 140 }) 141 142 latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{ 143 Key: keys.RangeLastReplicaGCTimestampKey(st.LeftDesc.RangeID), 144 }) 145 latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{ 146 Key: keys.RangeLastReplicaGCTimestampKey(st.RightDesc.RangeID), 147 }) 148 149 latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{ 150 Key: abortspan.MinKey(header.RangeID), 151 EndKey: abortspan.MaxKey(header.RangeID), 152 }) 153 } 154 if mt := et.InternalCommitTrigger.MergeTrigger; mt != nil { 155 // Merges copy over the RHS abort span to the LHS, and compute 156 // replicated range ID stats over the RHS in the merge trigger. 157 latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{ 158 Key: abortspan.MinKey(mt.LeftDesc.RangeID), 159 EndKey: abortspan.MaxKey(mt.LeftDesc.RangeID).PrefixEnd(), 160 }) 161 latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{ 162 Key: keys.MakeRangeIDReplicatedPrefix(mt.RightDesc.RangeID), 163 EndKey: keys.MakeRangeIDReplicatedPrefix(mt.RightDesc.RangeID).PrefixEnd(), 164 }) 165 } 166 } 167 } 168 } 169 170 // EndTxn either commits or aborts (rolls back) an extant transaction according 171 // to the args.Commit parameter. 
Rolling back an already rolled-back txn is ok. 172 // TODO(nvanbenschoten): rename this file to cmd_end_txn.go once some of andrei's 173 // recent PRs have landed. 174 func EndTxn( 175 ctx context.Context, readWriter storage.ReadWriter, cArgs CommandArgs, resp roachpb.Response, 176 ) (result.Result, error) { 177 args := cArgs.Args.(*roachpb.EndTxnRequest) 178 h := cArgs.Header 179 ms := cArgs.Stats 180 reply := resp.(*roachpb.EndTxnResponse) 181 182 if err := VerifyTransaction(h, args, roachpb.PENDING, roachpb.STAGING, roachpb.ABORTED); err != nil { 183 return result.Result{}, err 184 } 185 if args.Require1PC { 186 // If a 1PC txn was required and we're in EndTxn, we've failed to evaluate 187 // the batch as a 1PC. We're returning early instead of preferring a 188 // possible retriable error because we might want to leave locks behind in 189 // case of retriable errors - which Require1PC does not want. 190 return result.Result{}, roachpb.NewTransactionStatusError("could not commit in one phase as requested") 191 } 192 if args.Commit && args.Poison { 193 return result.Result{}, errors.Errorf("cannot poison during a committing EndTxn request") 194 } 195 196 key := keys.TransactionKey(h.Txn.Key, h.Txn.ID) 197 198 // Fetch existing transaction. 199 var existingTxn roachpb.Transaction 200 if ok, err := storage.MVCCGetProto( 201 ctx, readWriter, key, hlc.Timestamp{}, &existingTxn, storage.MVCCGetOptions{}, 202 ); err != nil { 203 return result.Result{}, err 204 } else if !ok { 205 // No existing transaction record was found - create one by writing it 206 // below in updateFinalizedTxn. 207 reply.Txn = h.Txn.Clone() 208 209 // Verify that it is safe to create the transaction record. We only need 210 // to perform this verification for commits. Rollbacks can always write 211 // an aborted txn record. 212 if args.Commit { 213 if err := CanCreateTxnRecord(cArgs.EvalCtx, reply.Txn); err != nil { 214 return result.Result{}, err 215 } 216 } 217 } else { 218 // We're using existingTxn on the reply, although it can be stale 219 // compared to the Transaction in the request (e.g. the Sequence, 220 // and various timestamps). We must be careful to update it with the 221 // supplied ba.Txn if we return it with an error which might be 222 // retried, as for example to avoid client-side serializable restart. 223 reply.Txn = &existingTxn 224 225 // Verify that we can either commit it or abort it (according 226 // to args.Commit), and also that the Timestamp and Epoch have 227 // not suffered regression. 228 switch reply.Txn.Status { 229 case roachpb.COMMITTED: 230 // This can happen if the coordinator had left the transaction in the 231 // implicitly committed state, and is now coming to clean it up. Someone 232 // else must have performed the STAGING->COMMITTED transition in the 233 // meantime. The TransactionStatusError is going to be handled by the 234 // txnCommitter interceptor. 235 log.VEventf(ctx, 2, "transaction found to be already committed") 236 return result.Result{}, roachpb.NewTransactionCommittedStatusError() 237 238 case roachpb.ABORTED: 239 if !args.Commit { 240 // The transaction has already been aborted by other. 241 // Do not return TransactionAbortedError since the client anyway 242 // wanted to abort the transaction. 
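				// Illustrative sketch (editorial, not from the upstream file):
				// a rollback that reaches this branch is typically issued by a
				// client that has already given up on the transaction, roughly
				//
				//	&roachpb.EndTxnRequest{
				//		Commit:    false,
				//		Poison:    false,
				//		LockSpans: spansTheTxnWrote, // illustrative name
				//	}
				//
				// so the code below simply re-finalizes the already-ABORTED
				// record and resolves whatever local locks were declared.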
243 desc := cArgs.EvalCtx.Desc() 244 resolvedLocks, externalLocks, err := resolveLocalLocks(ctx, desc, readWriter, ms, args, reply.Txn, cArgs.EvalCtx) 245 if err != nil { 246 return result.Result{}, err 247 } 248 if err := updateFinalizedTxn( 249 ctx, readWriter, ms, key, args, reply.Txn, externalLocks, 250 ); err != nil { 251 return result.Result{}, err 252 } 253 // Use alwaysReturn==true because the transaction is definitely 254 // aborted, no matter what happens to this command. 255 res := result.FromEndTxn(reply.Txn, true /* alwaysReturn */, args.Poison) 256 res.Local.ResolvedLocks = resolvedLocks 257 return res, nil 258 } 259 // If the transaction was previously aborted by a concurrent writer's 260 // push, any intents written are still open. It's only now that we know 261 // them, so we return them all for asynchronous resolution (we're 262 // currently not able to write on error, but see #1989). 263 // 264 // Similarly to above, use alwaysReturn==true. The caller isn't trying 265 // to abort, but the transaction is definitely aborted and its locks 266 // can go. 267 reply.Txn.LockSpans = args.LockSpans 268 return result.FromEndTxn(reply.Txn, true /* alwaysReturn */, args.Poison), 269 roachpb.NewTransactionAbortedError(roachpb.ABORT_REASON_ABORTED_RECORD_FOUND) 270 271 case roachpb.PENDING, roachpb.STAGING: 272 if h.Txn.Epoch < reply.Txn.Epoch { 273 return result.Result{}, errors.AssertionFailedf( 274 "programming error: epoch regression: %d", h.Txn.Epoch) 275 } 276 277 default: 278 return result.Result{}, errors.AssertionFailedf("bad txn status: %s", reply.Txn) 279 } 280 281 // Update the existing txn with the supplied txn. 282 reply.Txn.Update(h.Txn) 283 } 284 285 // Attempt to commit or abort the transaction per the args.Commit parameter. 286 if args.Commit { 287 if retry, reason, extraMsg := IsEndTxnTriggeringRetryError(reply.Txn, args); retry { 288 return result.Result{}, roachpb.NewTransactionRetryError(reason, extraMsg) 289 } 290 291 // If the transaction needs to be staged as part of an implicit commit 292 // before being explicitly committed, write the staged transaction 293 // record and return without running commit triggers or resolving local 294 // locks. 295 if args.IsParallelCommit() { 296 // It's not clear how to combine transaction recovery with commit 297 // triggers, so for now we don't allow them to mix. This shouldn't 298 // cause any issues and the txn coordinator knows not to mix them. 299 if ct := args.InternalCommitTrigger; ct != nil { 300 err := errors.Errorf("cannot stage transaction with a commit trigger: %+v", ct) 301 return result.Result{}, err 302 } 303 304 reply.Txn.Status = roachpb.STAGING 305 reply.StagingTimestamp = reply.Txn.WriteTimestamp 306 if err := updateStagingTxn(ctx, readWriter, ms, key, args, reply.Txn); err != nil { 307 return result.Result{}, err 308 } 309 return result.Result{}, nil 310 } 311 312 // Else, the transaction can be explicitly committed. 313 reply.Txn.Status = roachpb.COMMITTED 314 } else { 315 reply.Txn.Status = roachpb.ABORTED 316 } 317 318 // Resolve locks on the local range synchronously so that their resolution 319 // ends up in the same Raft entry. There should always be at least one because 320 // we position the transaction record next to the first write of a transaction. 321 // This avoids the need for the intentResolver to have to return to this range 322 // to resolve locks for this transaction in the future. 
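	// Illustrative example (editorial, not from the upstream file) of the
	// local/external split performed by resolveLocalLocks: if this range
	// spans [a, m) and the request declares
	//
	//	args.LockSpans = [ {Key: b}, {Key: k}, {Key: z} ]
	//
	// then the locks at "b" and "k" are resolved in this batch (and thus in
	// the same Raft entry), while {Key: z} is returned in externalLocks and
	// handed to the intentResolver for asynchronous resolution.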
323 desc := cArgs.EvalCtx.Desc() 324 resolvedLocks, externalLocks, err := resolveLocalLocks(ctx, desc, readWriter, ms, args, reply.Txn, cArgs.EvalCtx) 325 if err != nil { 326 return result.Result{}, err 327 } 328 if err := updateFinalizedTxn(ctx, readWriter, ms, key, args, reply.Txn, externalLocks); err != nil { 329 return result.Result{}, err 330 } 331 332 // Note: there's no need to clear the AbortSpan state if we've successfully 333 // finalized a transaction, as there's no way in which an abort cache entry 334 // could have been written (the txn would already have been in 335 // state=ABORTED). 336 // 337 // Summary of transaction replay protection after EndTxn: When a 338 // transactional write gets replayed over its own resolved intents, the 339 // write will succeed but only as an intent with a newer timestamp (with a 340 // WriteTooOldError). However, the replayed intent cannot be resolved by a 341 // subsequent replay of this EndTxn call because the txn timestamp will be 342 // too old. Replays of requests which attempt to create a new txn record 343 // (HeartbeatTxn or EndTxn) never succeed because EndTxn inserts in the 344 // timestamp cache in Replica's updateTimestampCache method, forcing 345 // the call to CanCreateTxnRecord to return false, resulting in a 346 // transaction retry error. If the replay didn't attempt to create a txn 347 // record, any push will immediately succeed as a missing txn record on push 348 // where CanCreateTxnRecord returns false succeeds. In both cases, the txn 349 // will be GC'd on the slow path. 350 // 351 // We specify alwaysReturn==false because if the commit fails below Raft, we 352 // don't want the locks to be up for resolution. That should happen only if 353 // the commit actually happens; otherwise, we risk losing writes. 354 txnResult := result.FromEndTxn(reply.Txn, false /* alwaysReturn */, args.Poison) 355 txnResult.Local.UpdatedTxns = []*roachpb.Transaction{reply.Txn} 356 txnResult.Local.ResolvedLocks = resolvedLocks 357 358 // Run the rest of the commit triggers if successfully committed. 359 if reply.Txn.Status == roachpb.COMMITTED { 360 triggerResult, err := RunCommitTrigger( 361 ctx, cArgs.EvalCtx, readWriter.(storage.Batch), ms, args, reply.Txn, 362 ) 363 if err != nil { 364 return result.Result{}, roachpb.NewReplicaCorruptionError(err) 365 } 366 if err := txnResult.MergeAndDestroy(triggerResult); err != nil { 367 return result.Result{}, err 368 } 369 } else if reply.Txn.Status == roachpb.ABORTED { 370 // If this is the system config span and we're aborted, add a trigger to 371 // potentially gossip now that we've removed an intent. This is important 372 // to deal with cases where previously committed values were not gossipped 373 // due to an outstanding intent. 374 if cArgs.EvalCtx.ContainsKey(keys.SystemConfigSpan.Key) { 375 txnResult.Local.MaybeGossipSystemConfigIfHaveFailure = true 376 } 377 } 378 379 return txnResult, nil 380 } 381 382 // IsEndTxnExceedingDeadline returns true if the transaction exceeded its 383 // deadline. 384 func IsEndTxnExceedingDeadline(t hlc.Timestamp, args *roachpb.EndTxnRequest) bool { 385 return args.Deadline != nil && args.Deadline.LessEq(t) 386 } 387 388 // IsEndTxnTriggeringRetryError returns true if the EndTxnRequest cannot be 389 // committed and needs to return a TransactionRetryError. It also returns the 390 // reason and possibly an extra message to be used for the error. 
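//
// Illustrative examples (editorial, not from the upstream file), based on the
// body below: a transaction whose ReadTimestamp is 10 but whose WriteTimestamp
// was pushed to 12 yields (true, RETRY_SERIALIZABLE, ""); one that saw a
// WriteTooOldError yields (true, RETRY_WRITE_TOO_OLD, ""); and one that is
// neither pushed nor flagged WriteTooOld but whose WriteTimestamp exceeds
// args.Deadline yields (true, RETRY_COMMIT_DEADLINE_EXCEEDED, "txn timestamp
// pushed too much; ...").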
391 func IsEndTxnTriggeringRetryError(
392 	txn *roachpb.Transaction, args *roachpb.EndTxnRequest,
393 ) (retry bool, reason roachpb.TransactionRetryReason, extraMsg string) {
394 	// If we saw any WriteTooOldErrors, we must restart to avoid lost
395 	// update anomalies.
396 	if txn.WriteTooOld {
397 		retry, reason = true, roachpb.RETRY_WRITE_TOO_OLD
398 	} else {
399 		readTimestamp := txn.ReadTimestamp
400 		isTxnPushed := txn.WriteTimestamp != readTimestamp
401 
402 		// Return a transaction retry error if the commit timestamp isn't equal to
403 		// the txn timestamp.
404 		if isTxnPushed {
405 			retry, reason = true, roachpb.RETRY_SERIALIZABLE
406 		}
407 	}
408 
409 	// A transaction must obey its deadline, if set.
410 	if !retry && IsEndTxnExceedingDeadline(txn.WriteTimestamp, args) {
411 		exceededBy := txn.WriteTimestamp.GoTime().Sub(args.Deadline.GoTime())
412 		extraMsg = fmt.Sprintf(
413 			"txn timestamp pushed too much; deadline exceeded by %s (%s > %s)",
414 			exceededBy, txn.WriteTimestamp, args.Deadline)
415 		retry, reason = true, roachpb.RETRY_COMMIT_DEADLINE_EXCEEDED
416 	}
417 	return retry, reason, extraMsg
418 }
419 
420 // CanForwardCommitTimestampWithoutRefresh returns whether a txn can be
421 // safely committed with a timestamp above its read timestamp without
422 // requiring a read refresh (see txnSpanRefresher). This requires that
423 // the transaction's timestamp has not leaked and that the transaction
424 // has encountered no spans which require refreshing at the forwarded
425 // timestamp. If either of those conditions is violated, a client-side
426 // retry is required.
427 //
428 // Note that when deciding whether a transaction can be bumped to a particular
429 // timestamp, the transaction's deadline must also be taken into account.
430 func CanForwardCommitTimestampWithoutRefresh(
431 	txn *roachpb.Transaction, args *roachpb.EndTxnRequest,
432 ) bool {
433 	return !txn.CommitTimestampFixed && args.CanCommitAtHigherTimestamp
434 }
435 
436 const lockResolutionBatchSize = 500
437 
438 // resolveLocalLocks synchronously resolves any locks that are local to this
439 // range in the same batch and returns those lock spans. The remainder are
440 // collected and returned so that they can be handed off to asynchronous
441 // processing. Note that there is a maximum lock resolution allowance of
442 // lockResolutionBatchSize meant to avoid creating a batch which is too large
443 // for Raft. Any local locks which exceed the allowance are treated as
444 // external and are resolved asynchronously with the external locks.
445 func resolveLocalLocks(
446 	ctx context.Context,
447 	desc *roachpb.RangeDescriptor,
448 	readWriter storage.ReadWriter,
449 	ms *enginepb.MVCCStats,
450 	args *roachpb.EndTxnRequest,
451 	txn *roachpb.Transaction,
452 	evalCtx EvalContext,
453 ) (resolvedLocks []roachpb.LockUpdate, externalLocks []roachpb.Span, _ error) {
454 	if mergeTrigger := args.InternalCommitTrigger.GetMergeTrigger(); mergeTrigger != nil {
455 		// If this is a merge, then use the post-merge descriptor to determine
456 		// which locks are local (note that for a split, we want to use the
457 		// pre-split one instead because it's larger).
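		// Illustrative example (editorial, not from the upstream file):
		// when ranges [a, c) and [c, e) merge, the merge transaction leaves
		// an intent on the RHS's local copy of its range descriptor, a
		// range-local key addressed at "c". Judged against the pre-merge LHS
		// descriptor [a, c) that lock would be external, but the trigger's
		// LeftDesc spans [a, e), so it is resolved synchronously here as part
		// of the merge commit.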
458 desc = &mergeTrigger.LeftDesc 459 } 460 461 iter := readWriter.NewIterator(storage.IterOptions{ 462 UpperBound: desc.EndKey.AsRawKey(), 463 }) 464 iterAndBuf := storage.GetBufUsingIter(iter) 465 defer iterAndBuf.Cleanup() 466 467 var resolveAllowance int64 = lockResolutionBatchSize 468 if args.InternalCommitTrigger != nil { 469 // If this is a system transaction (such as a split or merge), don't enforce the resolve allowance. 470 // These transactions rely on having their locks resolved synchronously. 471 resolveAllowance = math.MaxInt64 472 } 473 for _, span := range args.LockSpans { 474 if err := func() error { 475 if resolveAllowance == 0 { 476 externalLocks = append(externalLocks, span) 477 return nil 478 } 479 update := roachpb.MakeLockUpdate(txn, span) 480 if len(span.EndKey) == 0 { 481 // For single-key lock updates, do a KeyAddress-aware check of 482 // whether it's contained in our Range. 483 if !kvserverbase.ContainsKey(desc, span.Key) { 484 externalLocks = append(externalLocks, span) 485 return nil 486 } 487 resolveMS := ms 488 ok, err := storage.MVCCResolveWriteIntentUsingIter(ctx, readWriter, iterAndBuf, resolveMS, update) 489 if err != nil { 490 return err 491 } 492 if ok { 493 resolveAllowance-- 494 } 495 resolvedLocks = append(resolvedLocks, update) 496 return nil 497 } 498 // For update ranges, cut into parts inside and outside our key 499 // range. Resolve locally inside, delegate the rest. In particular, 500 // an update range for range-local data is correctly considered local. 501 inSpan, outSpans := kvserverbase.IntersectSpan(span, desc) 502 externalLocks = append(externalLocks, outSpans...) 503 if inSpan != nil { 504 update.Span = *inSpan 505 num, resumeSpan, err := storage.MVCCResolveWriteIntentRangeUsingIter(ctx, readWriter, iterAndBuf, ms, update, resolveAllowance) 506 if err != nil { 507 return err 508 } 509 if evalCtx.EvalKnobs().NumKeysEvaluatedForRangeIntentResolution != nil { 510 atomic.AddInt64(evalCtx.EvalKnobs().NumKeysEvaluatedForRangeIntentResolution, num) 511 } 512 resolveAllowance -= num 513 if resumeSpan != nil { 514 if resolveAllowance != 0 { 515 log.Fatalf(ctx, "expected resolve allowance to be exactly 0 resolving %s; got %d", update.Span, resolveAllowance) 516 } 517 update.EndKey = resumeSpan.Key 518 externalLocks = append(externalLocks, *resumeSpan) 519 } 520 resolvedLocks = append(resolvedLocks, update) 521 return nil 522 } 523 return nil 524 }(); err != nil { 525 return nil, nil, errors.Wrapf(err, "resolving lock at %s on end transaction [%s]", span, txn.Status) 526 } 527 } 528 529 removedAny := resolveAllowance != lockResolutionBatchSize 530 if WriteAbortSpanOnResolve(txn.Status, args.Poison, removedAny) { 531 if err := UpdateAbortSpan(ctx, evalCtx, readWriter, ms, txn.TxnMeta, args.Poison); err != nil { 532 return nil, nil, err 533 } 534 } 535 return resolvedLocks, externalLocks, nil 536 } 537 538 // updateStagingTxn persists the STAGING transaction record with updated status 539 // (and possibly timestamp). It persists the record with the EndTxn request's 540 // declared in-flight writes along with all of the transaction's (local and 541 // remote) locks. 
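//
// Illustrative example (editorial, not from the upstream file): a parallel
// commit might stage a record along the lines of
//
//	Status:         STAGING
//	InFlightWrites: [ {Key: "a", Sequence: 1} ]   // element fields assumed
//	LockSpans:      [ {Key: "b"} ]
//
// meaning the write at "a" has not yet been proven durable. Once every
// in-flight write is known to have succeeded, the transaction is implicitly
// committed, and a follow-up EndTxn (or the recovery protocol) moves the
// record to an explicit COMMITTED status.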
542 func updateStagingTxn( 543 ctx context.Context, 544 readWriter storage.ReadWriter, 545 ms *enginepb.MVCCStats, 546 key []byte, 547 args *roachpb.EndTxnRequest, 548 txn *roachpb.Transaction, 549 ) error { 550 txn.LockSpans = args.LockSpans 551 txn.InFlightWrites = args.InFlightWrites 552 txnRecord := txn.AsRecord() 553 return storage.MVCCPutProto(ctx, readWriter, ms, key, hlc.Timestamp{}, nil /* txn */, &txnRecord) 554 } 555 556 // updateFinalizedTxn persists the COMMITTED or ABORTED transaction record with 557 // updated status (and possibly timestamp). If we've already resolved all locks 558 // locally, we actually delete the record right away - no use in keeping it 559 // around. 560 func updateFinalizedTxn( 561 ctx context.Context, 562 readWriter storage.ReadWriter, 563 ms *enginepb.MVCCStats, 564 key []byte, 565 args *roachpb.EndTxnRequest, 566 txn *roachpb.Transaction, 567 externalLocks []roachpb.Span, 568 ) error { 569 if txnAutoGC && len(externalLocks) == 0 { 570 if log.V(2) { 571 log.Infof(ctx, "auto-gc'ed %s (%d locks)", txn.Short(), len(args.LockSpans)) 572 } 573 return storage.MVCCDelete(ctx, readWriter, ms, key, hlc.Timestamp{}, nil /* txn */) 574 } 575 txn.LockSpans = externalLocks 576 txn.InFlightWrites = nil 577 txnRecord := txn.AsRecord() 578 return storage.MVCCPutProto(ctx, readWriter, ms, key, hlc.Timestamp{}, nil /* txn */, &txnRecord) 579 } 580 581 // RunCommitTrigger runs the commit trigger from an end transaction request. 582 func RunCommitTrigger( 583 ctx context.Context, 584 rec EvalContext, 585 batch storage.Batch, 586 ms *enginepb.MVCCStats, 587 args *roachpb.EndTxnRequest, 588 txn *roachpb.Transaction, 589 ) (result.Result, error) { 590 ct := args.InternalCommitTrigger 591 if ct == nil { 592 return result.Result{}, nil 593 } 594 595 if ct.GetSplitTrigger() != nil { 596 newMS, trigger, err := splitTrigger( 597 ctx, rec, batch, *ms, ct.SplitTrigger, txn.WriteTimestamp, 598 ) 599 *ms = newMS 600 return trigger, err 601 } 602 if mt := ct.GetMergeTrigger(); mt != nil { 603 return mergeTrigger(ctx, rec, batch, ms, mt, txn.WriteTimestamp) 604 } 605 if crt := ct.GetChangeReplicasTrigger(); crt != nil { 606 // TODO(tbg): once we support atomic replication changes, check that 607 // crt.Added() and crt.Removed() don't intersect (including mentioning 608 // the same replica more than once individually) because it would be 609 // silly (though possible) to have to attach semantics to that. 610 return changeReplicasTrigger(ctx, rec, batch, crt), nil 611 } 612 if ct.GetModifiedSpanTrigger() != nil { 613 var pd result.Result 614 if ct.ModifiedSpanTrigger.SystemConfigSpan { 615 // Check if we need to gossip the system config. 616 // NOTE: System config gossiping can only execute correctly if 617 // the transaction record is located on the range that contains 618 // the system span. If a transaction is created which modifies 619 // both system *and* non-system data, it should be ensured that 620 // the transaction record itself is on the system span. This can 621 // be done by making sure a system key is the first key touched 622 // in the transaction. 623 if rec.ContainsKey(keys.SystemConfigSpan.Key) { 624 if err := pd.MergeAndDestroy( 625 result.Result{ 626 Local: result.LocalResult{ 627 MaybeGossipSystemConfig: true, 628 }, 629 }, 630 ); err != nil { 631 return result.Result{}, err 632 } 633 } else { 634 log.Errorf(ctx, "System configuration span was modified, but the "+ 635 "modification trigger is executing on a non-system range. 
"+ 636 "Configuration changes will not be gossiped.") 637 } 638 } 639 if nlSpan := ct.ModifiedSpanTrigger.NodeLivenessSpan; nlSpan != nil { 640 if err := pd.MergeAndDestroy( 641 result.Result{ 642 Local: result.LocalResult{ 643 MaybeGossipNodeLiveness: nlSpan, 644 }, 645 }, 646 ); err != nil { 647 return result.Result{}, err 648 } 649 } 650 return pd, nil 651 } 652 if sbt := ct.GetStickyBitTrigger(); sbt != nil { 653 newDesc := *rec.Desc() 654 if sbt.StickyBit != (hlc.Timestamp{}) { 655 newDesc.StickyBit = &sbt.StickyBit 656 } else { 657 newDesc.StickyBit = nil 658 } 659 var res result.Result 660 res.Replicated.State = &kvserverpb.ReplicaState{ 661 Desc: &newDesc, 662 } 663 return res, nil 664 } 665 666 log.Fatalf(ctx, "unknown commit trigger: %+v", ct) 667 return result.Result{}, nil 668 } 669 670 // splitTrigger is called on a successful commit of a transaction 671 // containing an AdminSplit operation. It copies the AbortSpan for 672 // the new range and recomputes stats for both the existing, left hand 673 // side (LHS) range and the right hand side (RHS) range. For 674 // performance it only computes the stats for the original range (the 675 // left hand side) and infers the RHS stats by subtracting from the 676 // original stats. We compute the LHS stats because the split key 677 // computation ensures that we do not create large LHS 678 // ranges. However, this optimization is only possible if the stats 679 // are fully accurate. If they contain estimates, stats for both the 680 // LHS and RHS are computed. 681 // 682 // Splits are complicated. A split is initiated when a replica receives an 683 // AdminSplit request. Note that this request (and other "admin" requests) 684 // differs from normal requests in that it doesn't go through Raft but instead 685 // allows the lease holder Replica to act as the orchestrator for the 686 // distributed transaction that performs the split. As such, this request is 687 // only executed on the lease holder replica and the request is redirected to 688 // the lease holder if the recipient is a follower. 689 // 690 // Splits do not require the lease for correctness (which is good, because we 691 // only check that the lease is held at the beginning of the operation, and 692 // have no way to ensure that it is continually held until the end). Followers 693 // could perform splits too, and the only downside would be that if two splits 694 // were attempted concurrently (or a split and a ChangeReplicas), one would 695 // fail. The lease is used to designate one replica for this role and avoid 696 // wasting time on splits that may fail. 697 // 698 // The processing of splits is divided into two phases. The first phase occurs 699 // in Replica.AdminSplit. In that phase, the split-point is computed, and a 700 // transaction is started which updates both the LHS and RHS range descriptors 701 // and the meta range addressing information. (If we're splitting a meta2 range 702 // we'll be updating the meta1 addressing, otherwise we'll be updating the 703 // meta2 addressing). That transaction includes a special SplitTrigger flag on 704 // the EndTxn request. Like all transactions, the requests within the 705 // transaction are replicated via Raft, including the EndTxn request. 706 // 707 // The second phase of split processing occurs when each replica for the range 708 // encounters the SplitTrigger. Processing of the SplitTrigger happens below, 709 // in Replica.splitTrigger. The processing of the SplitTrigger occurs in two 710 // stages. 
The first stage operates within the context of an engine.Batch and 711 // updates all of the on-disk state for the old and new ranges atomically. The 712 // second stage is invoked when the batch commits and updates the in-memory 713 // state, creating the new replica in memory and populating its timestamp cache 714 // and registering it with the store. 715 // 716 // There is lots of subtlety here. The easy scenario is that all of the 717 // replicas process the SplitTrigger before processing any Raft message for RHS 718 // (right hand side) of the newly split range. Something like: 719 // 720 // Node A Node B Node C 721 // ---------------------------------------------------- 722 // range 1 | | | 723 // | | | 724 // SplitTrigger | | 725 // | SplitTrigger | 726 // | | SplitTrigger 727 // | | | 728 // ---------------------------------------------------- 729 // split finished on A, B and C | | 730 // | | | 731 // range 2 | | | 732 // | ---- MsgVote --> | | 733 // | ---------------------- MsgVote ---> | 734 // 735 // But that ideal ordering is not guaranteed. The split is "finished" when two 736 // of the replicas have appended the end-txn request containing the 737 // SplitTrigger to their Raft log. The following scenario is possible: 738 // 739 // Node A Node B Node C 740 // ---------------------------------------------------- 741 // range 1 | | | 742 // | | | 743 // SplitTrigger | | 744 // | SplitTrigger | 745 // | | | 746 // ---------------------------------------------------- 747 // split finished on A and B | | 748 // | | | 749 // range 2 | | | 750 // | ---- MsgVote --> | | 751 // | --------------------- MsgVote ---> ??? 752 // | | | 753 // | | SplitTrigger 754 // 755 // In this scenario, C will create range 2 upon reception of the MsgVote from 756 // A, though locally that span of keys is still part of range 1. This is 757 // possible because at the Raft level ranges are identified by integer IDs and 758 // it isn't until C receives a snapshot of range 2 from the leader that it 759 // discovers the span of keys it covers. In order to prevent C from fully 760 // initializing range 2 in this instance, we prohibit applying a snapshot to a 761 // range if the snapshot overlaps another range. See Store.canApplySnapshotLocked. 762 // 763 // But while a snapshot may not have been applied at C, an uninitialized 764 // Replica was created. An uninitialized Replica is one which belongs to a Raft 765 // group but for which the range descriptor has not been received. This Replica 766 // will have participated in the Raft elections. When we're creating the new 767 // Replica below we take control of this uninitialized Replica and stop it from 768 // responding to Raft messages by marking it "destroyed". Note that we use the 769 // Replica.mu.destroyed field for this, but we don't do everything that 770 // Replica.Destroy does (so we should probably rename that field in light of 771 // its new uses). In particular we don't touch any data on disk or leave a 772 // tombstone. This is especially important because leaving a tombstone would 773 // prevent the legitimate recreation of this replica. 774 // 775 // There is subtle synchronization here that is currently controlled by the 776 // Store.processRaft goroutine. In particular, the serial execution of 777 // Replica.handleRaftReady by Store.processRaft ensures that an uninitialized 778 // RHS won't be concurrently executing in Replica.handleRaftReady because we're 779 // currently running on that goroutine (i.e. 
Replica.splitTrigger is called on
780 // the processRaft goroutine).
781 //
782 // TODO(peter): The above synchronization needs to be fixed. Using a single
783 // goroutine for executing Replica.handleRaftReady is undesirable from a
784 // performance perspective. Likely we will have to add a mutex to Replica to
785 // protect handleRaftReady and to grab that mutex below when marking the
786 // uninitialized Replica as "destroyed". Hopefully we'll also be able to remove
787 // Store.processRaftMu.
788 //
789 // Note that in this more complex scenario, A (which performed the SplitTrigger
790 // first) will create the associated Raft group for range 2 and start
791 // campaigning immediately. It is possible for B to receive MsgVote requests
792 // before it has applied the SplitTrigger as well. Both B and C will vote for A
793 // (and preserve the records of that vote in their HardState). It is critically
794 // important for Raft correctness that we do not lose the records of these
795 // votes. After electing A the Raft leader for range 2, A will then attempt to
796 // send a snapshot to B and C and we'll fall into the situation above where a
797 // snapshot is received for a range before it has finished splitting from its
798 // sibling and is thus rejected. An interesting subtlety here: A will send a
799 // snapshot to B and C because when range 2 is initialized we were careful to
800 // synthesize its HardState to set its Raft log index to 10. If we had instead
801 // used log index 0, Raft would have believed the group to be empty, but the
802 // RHS has something. Using a non-zero initial log index causes Raft to believe
803 // that there is a discarded prefix to the log and will thus send a snapshot to
804 // followers.
805 //
806 // A final point of clarification: when we split a range we're splitting the
807 // data the range contains. But we're not forking or splitting the associated
808 // Raft group. Instead, we're creating a new Raft group to control the RHS of
809 // the split. That Raft group is starting from an empty Raft log (positioned at
810 // log entry 10) and a snapshot of the RHS of the split range.
811 //
812 // After the split trigger returns, the on-disk state of the right-hand side
813 // will be suitable for instantiating the right hand side Replica, and
814 // a suitable trigger is returned, along with the updated stats which represent
815 // the LHS delta caused by the split (i.e. all writes in the current batch
816 // which went to the left-hand side, minus the kv pairs which moved to the
817 // RHS).
818 //
819 // These stats are suitable for returning up the callstack like those for
820 // regular commands; the corresponding delta for the RHS is part of the
821 // returned trigger and is handled by the Store.
822 func splitTrigger(
823 	ctx context.Context,
824 	rec EvalContext,
825 	batch storage.Batch,
826 	bothDeltaMS enginepb.MVCCStats,
827 	split *roachpb.SplitTrigger,
828 	ts hlc.Timestamp,
829 ) (enginepb.MVCCStats, result.Result, error) {
830 	// TODO(andrei): should this span be a child of the ctx's (if any)?
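	// Editorial sketch (hedged, not from the upstream file): ignoring
	// estimates, the bookkeeping that splitStatsHelper performs over the
	// inputs assembled below is believed to reduce to
	//
	//	AbsPostSplitRight = AbsPreSplitBoth + DeltaBatch - AbsPostSplitLeft
	//
	// i.e. only the LHS is recomputed by iteration (just below) and the RHS
	// is derived by subtraction; AbsPostSplitRightFn is consulted only when
	// the pre-split stats contain estimates.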
831 	sp := rec.ClusterSettings().Tracer.StartRootSpan(
832 		"split", logtags.FromContext(ctx), tracing.NonRecordableSpan,
833 	)
834 	defer sp.Finish()
835 	desc := rec.Desc()
836 	if !bytes.Equal(desc.StartKey, split.LeftDesc.StartKey) ||
837 		!bytes.Equal(desc.EndKey, split.RightDesc.EndKey) {
838 		return enginepb.MVCCStats{}, result.Result{}, errors.Errorf("range does not match splits: (%s-%s) + (%s-%s) != %s",
839 			split.LeftDesc.StartKey, split.LeftDesc.EndKey,
840 			split.RightDesc.StartKey, split.RightDesc.EndKey, desc)
841 	}
842 
843 	// Compute the absolute stats for the (post-split) LHS. No more
844 	// modifications to it are allowed after this line.
845 
846 	leftMS, err := rditer.ComputeStatsForRange(&split.LeftDesc, batch, ts.WallTime)
847 	if err != nil {
848 		return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to compute stats for LHS range after split")
849 	}
850 	log.Event(ctx, "computed stats for left hand side range")
851 
852 	h := splitStatsHelperInput{
853 		AbsPreSplitBothEstimated: rec.GetMVCCStats(),
854 		DeltaBatchEstimated:      bothDeltaMS,
855 		AbsPostSplitLeft:         leftMS,
856 		AbsPostSplitRightFn: func() (enginepb.MVCCStats, error) {
857 			rightMS, err := rditer.ComputeStatsForRange(
858 				&split.RightDesc, batch, ts.WallTime,
859 			)
860 			return rightMS, errors.Wrap(
861 				err,
862 				"unable to compute stats for RHS range after split",
863 			)
864 		},
865 	}
866 	return splitTriggerHelper(ctx, rec, batch, h, split, ts)
867 }
868 
869 // splitTriggerHelper continues the work begun by splitTrigger, but has a
870 // reduced scope that has all stats-related concerns bundled into a
871 // splitStatsHelper.
872 func splitTriggerHelper(
873 	ctx context.Context,
874 	rec EvalContext,
875 	batch storage.Batch,
876 	statsInput splitStatsHelperInput,
877 	split *roachpb.SplitTrigger,
878 	ts hlc.Timestamp,
879 ) (enginepb.MVCCStats, result.Result, error) {
880 	// TODO(d4l3k): we should check which side of the split is smaller
881 	// and compute stats for it instead of having a constraint that the
882 	// left hand side is smaller.
883 
884 	// NB: the replicated post-split left hand keyspace is frozen at this point.
885 	// Only the RHS can be mutated (and we do so to seed its state).
886 
887 	// Copy the last replica GC timestamp. This value is unreplicated,
888 	// which is why the MVCC stats are set to nil on calls to
889 	// MVCCPutProto.
890 	replicaGCTS, err := rec.GetLastReplicaGCTimestamp(ctx)
891 	if err != nil {
892 		return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to fetch last replica GC timestamp")
893 	}
894 	if err := storage.MVCCPutProto(ctx, batch, nil, keys.RangeLastReplicaGCTimestampKey(split.RightDesc.RangeID), hlc.Timestamp{}, nil, &replicaGCTS); err != nil {
895 		return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to copy last replica GC timestamp")
896 	}
897 
898 	h, err := makeSplitStatsHelper(statsInput)
899 	if err != nil {
900 		return enginepb.MVCCStats{}, result.Result{}, err
901 	}
902 
903 	// Initialize the RHS range's AbortSpan by copying the LHS's.
904 	if err := rec.AbortSpan().CopyTo(
905 		ctx, batch, batch, h.AbsPostSplitRight(), ts, split.RightDesc.RangeID,
906 	); err != nil {
907 		return enginepb.MVCCStats{}, result.Result{}, err
908 	}
909 
910 	// Note: we don't copy the queue last processed times. This means
911 	// we'll process the RHS range in consistency and time series
912 	// maintenance queues again possibly sooner than if we copied. The
913 	// goal is to limit post-raft logic.
914 
915 	// Now that we've computed the stats for the RHS so far, we persist them.
916 	// This looks a bit more complicated than it really is: updating the stats
917 	// also changes the stats, and we write not only the stats but a complete
918 	// initial state. Additionally, since bothDeltaMS is tracking writes to
919 	// both sides, we need to update it as well.
920 	{
921 		// Various pieces of code rely on a replica's lease never being uninitialized,
922 		// but it's more than that - it ensures that we properly initialize the
923 		// timestamp cache, which is only populated on the lease holder, from that
924 		// of the original Range. We found out about a regression here the hard way
925 		// in #7899. Prior to this block, the following could happen:
926 		// - a client reads key 'd', leaving an entry in the timestamp cache on the
927 		//   lease holder of [a,e) at the time, node one.
928 		// - the range [a,e) splits at key 'c'. [c,e) starts out without a lease.
929 		// - the replicas of [a,e) on nodes one and two both process the split
930 		//   trigger and thus copy their timestamp caches to the new right-hand side
931 		//   Replica. However, only node one's timestamp cache contains information
932 		//   about the read of key 'd' in the first place.
933 		// - node two becomes the lease holder for [c,e). Its timestamp cache does
934 		//   not know about the read at 'd' which happened at the beginning.
935 		// - node two can illegally propose a write to 'd' at a lower timestamp.
936 		//
937 		// TODO(tschottdorf): why would this use r.store.Engine() and not the
938 		// batch?
939 		leftLease, err := MakeStateLoader(rec).LoadLease(ctx, rec.Engine())
940 		if err != nil {
941 			return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to load lease")
942 		}
943 		if (leftLease == roachpb.Lease{}) {
944 			log.Fatalf(ctx, "LHS of split has no lease")
945 		}
946 
947 		replica, found := split.RightDesc.GetReplicaDescriptor(leftLease.Replica.StoreID)
948 		if !found {
949 			return enginepb.MVCCStats{}, result.Result{}, errors.Errorf(
950 				"pre-split lease holder %+v not found in post-split descriptor %+v",
951 				leftLease.Replica, split.RightDesc,
952 			)
953 		}
954 		rightLease := leftLease
955 		rightLease.Replica = replica
956 
957 		gcThreshold, err := MakeStateLoader(rec).LoadGCThreshold(ctx, rec.Engine())
958 		if err != nil {
959 			return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to load GCThreshold")
960 		}
961 		if (*gcThreshold == hlc.Timestamp{}) {
962 			log.VEventf(ctx, 1, "LHS's GCThreshold of split is not set")
963 		}
964 
965 		// We're about to write the initial state for the replica. We migrated
966 		// the formerly replicated truncated state into unreplicated keyspace
967 		// in 19.1, but this range may still be using the replicated version
968 		// and we need to make a decision about what to use for the RHS that
969 		// is consistent across the followers: do for the RHS what the LHS
970 		// does: if the LHS has the legacy key, initialize the RHS with a
971 		// legacy key as well.
972 		//
973 		// See VersionUnreplicatedRaftTruncatedState.
974 truncStateType := stateloader.TruncatedStateUnreplicated 975 if found, err := storage.MVCCGetProto( 976 ctx, 977 batch, 978 keys.RaftTruncatedStateLegacyKey(rec.GetRangeID()), 979 hlc.Timestamp{}, 980 nil, 981 storage.MVCCGetOptions{}, 982 ); err != nil { 983 return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to load legacy truncated state") 984 } else if found { 985 truncStateType = stateloader.TruncatedStateLegacyReplicated 986 } 987 988 // Writing the initial state is subtle since this also seeds the Raft 989 // group. It becomes more subtle due to proposer-evaluated Raft. 990 // 991 // We are writing to the right hand side's Raft group state in this 992 // batch so we need to synchronize with anything else that could be 993 // touching that replica's Raft state. Specifically, we want to prohibit 994 // an uninitialized Replica from receiving a message for the right hand 995 // side range and performing raft processing. This is achieved by 996 // serializing execution of uninitialized Replicas in Store.processRaft 997 // and ensuring that no uninitialized Replica is being processed while 998 // an initialized one (like the one currently being split) is being 999 // processed. 1000 // 1001 // Since the right hand side of the split's Raft group may already 1002 // exist, we must be prepared to absorb an existing HardState. The Raft 1003 // group may already exist because other nodes could already have 1004 // processed the split and started talking to our node, prompting the 1005 // creation of a Raft group that can vote and bump its term, but not 1006 // much else: it can't receive snapshots because those intersect the 1007 // pre-split range; it can't apply log commands because it needs a 1008 // snapshot first. 1009 // 1010 // However, we can't absorb the right-hand side's HardState here because 1011 // we only *evaluate* the proposal here, but by the time it is 1012 // *applied*, the HardState could have changed. We do this downstream of 1013 // Raft, in splitPostApply, where we write the last index and the 1014 // HardState via a call to synthesizeRaftState. Here, we only call 1015 // writeInitialReplicaState which essentially writes a ReplicaState 1016 // only. 1017 1018 *h.AbsPostSplitRight(), err = stateloader.WriteInitialReplicaState( 1019 ctx, batch, *h.AbsPostSplitRight(), split.RightDesc, rightLease, 1020 *gcThreshold, truncStateType, 1021 ) 1022 if err != nil { 1023 return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to write initial Replica state") 1024 } 1025 } 1026 1027 var pd result.Result 1028 pd.Replicated.Split = &kvserverpb.Split{ 1029 SplitTrigger: *split, 1030 // NB: the RHSDelta is identical to the stats for the newly created right 1031 // hand side range (i.e. it goes from zero to its stats). 1032 RHSDelta: *h.AbsPostSplitRight(), 1033 } 1034 1035 deltaPostSplitLeft := h.DeltaPostSplitLeft() 1036 if !rec.ClusterSettings().Version.IsActive(ctx, clusterversion.VersionContainsEstimatesCounter) { 1037 deltaPostSplitLeft.ContainsEstimates = 0 1038 } 1039 return deltaPostSplitLeft, pd, nil 1040 } 1041 1042 // mergeTrigger is called on a successful commit of an AdminMerge transaction. 1043 // It calculates stats for the LHS by merging in RHS stats, and copies over the 1044 // abort span entries from the RHS. 
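//
// Editorial sketch (hedged, not from the upstream file): the stats update in
// the body amounts to
//
//	ms += stats of the copied RHS abort span entries   (added via CopyTo)
//	ms += merge.RightMVCCStats
//	ms -= stats of the RHS's replicated range-ID keys  (recomputed by iteration)
//
// so the merged range accounts for all RHS user data while dropping the RHS's
// range-ID-local bookkeeping, of which only the abort span survives the merge.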
1045 func mergeTrigger( 1046 ctx context.Context, 1047 rec EvalContext, 1048 batch storage.Batch, 1049 ms *enginepb.MVCCStats, 1050 merge *roachpb.MergeTrigger, 1051 ts hlc.Timestamp, 1052 ) (result.Result, error) { 1053 desc := rec.Desc() 1054 if !bytes.Equal(desc.StartKey, merge.LeftDesc.StartKey) { 1055 return result.Result{}, errors.Errorf("LHS range start keys do not match: %s != %s", 1056 desc.StartKey, merge.LeftDesc.StartKey) 1057 } 1058 if !desc.EndKey.Less(merge.LeftDesc.EndKey) { 1059 return result.Result{}, errors.Errorf("original LHS end key is not less than the post merge end key: %s >= %s", 1060 desc.EndKey, merge.LeftDesc.EndKey) 1061 } 1062 1063 if err := abortspan.New(merge.RightDesc.RangeID).CopyTo( 1064 ctx, batch, batch, ms, ts, merge.LeftDesc.RangeID, 1065 ); err != nil { 1066 return result.Result{}, err 1067 } 1068 1069 // The stats for the merged range are the sum of the LHS and RHS stats, less 1070 // the RHS's replicated range ID stats. The only replicated range ID keys we 1071 // copy from the RHS are the keys in the abort span, and we've already 1072 // accounted for those stats above. 1073 ms.Add(merge.RightMVCCStats) 1074 { 1075 ridPrefix := keys.MakeRangeIDReplicatedPrefix(merge.RightDesc.RangeID) 1076 iter := batch.NewIterator(storage.IterOptions{UpperBound: ridPrefix.PrefixEnd()}) 1077 defer iter.Close() 1078 sysMS, err := iter.ComputeStats(ridPrefix, ridPrefix.PrefixEnd(), 0 /* nowNanos */) 1079 if err != nil { 1080 return result.Result{}, err 1081 } 1082 ms.Subtract(sysMS) 1083 } 1084 1085 var pd result.Result 1086 pd.Replicated.Merge = &kvserverpb.Merge{ 1087 MergeTrigger: *merge, 1088 } 1089 return pd, nil 1090 } 1091 1092 func changeReplicasTrigger( 1093 _ context.Context, rec EvalContext, _ storage.Batch, change *roachpb.ChangeReplicasTrigger, 1094 ) result.Result { 1095 var pd result.Result 1096 // After a successful replica addition or removal check to see if the 1097 // range needs to be split. Splitting usually takes precedence over 1098 // replication via configuration of the split and replicate queues, but 1099 // if the split occurs concurrently with the replicas change the split 1100 // can fail and won't retry until the next scanner cycle. Re-queuing 1101 // the replica here removes that latency. 1102 pd.Local.MaybeAddToSplitQueue = true 1103 1104 // Gossip the first range whenever the range descriptor changes. We also 1105 // gossip the first range whenever the lease holder changes, but that might 1106 // not have occurred if a replica was being added or the non-lease-holder 1107 // replica was being removed. Note that we attempt the gossiping even from 1108 // the removed replica in case it was the lease-holder and it is still 1109 // holding the lease. 1110 pd.Local.GossipFirstRange = rec.IsFirstRange() 1111 1112 var desc roachpb.RangeDescriptor 1113 if change.Desc != nil { 1114 // Trigger proposed by a 19.2+ node (and we're a 19.2+ node as well). 1115 desc = *change.Desc 1116 } else { 1117 // Trigger proposed by a 19.1 node. Reconstruct descriptor from deprecated 1118 // fields. 
1119 desc = *rec.Desc() 1120 desc.SetReplicas(roachpb.MakeReplicaDescriptors(change.DeprecatedUpdatedReplicas)) 1121 desc.NextReplicaID = change.DeprecatedNextReplicaID 1122 } 1123 1124 pd.Replicated.State = &kvserverpb.ReplicaState{ 1125 Desc: &desc, 1126 } 1127 pd.Replicated.ChangeReplicas = &kvserverpb.ChangeReplicas{ 1128 ChangeReplicasTrigger: *change, 1129 } 1130 1131 return pd 1132 } 1133 1134 // txnAutoGC controls whether Transaction entries are automatically gc'ed upon 1135 // EndTxn if they only have local locks (which can be resolved synchronously 1136 // with EndTxn). Certain tests become simpler with this being turned off. 1137 var txnAutoGC = true 1138 1139 // TestingSetTxnAutoGC is used in tests to temporarily enable/disable 1140 // txnAutoGC. 1141 func TestingSetTxnAutoGC(to bool) func() { 1142 prev := txnAutoGC 1143 txnAutoGC = to 1144 return func() { txnAutoGC = prev } 1145 }
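// Editorial usage note (not from the upstream file): a test would typically
// flip the flag for its own duration and restore it on exit, e.g.
//
//	defer batcheval.TestingSetTxnAutoGC(false)()
//
// The returned closure reinstates the previous value; since txnAutoGC is a
// plain package-level variable, concurrent use from parallel tests should be
// avoided.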