github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_send.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"reflect"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/txnwait"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/errors"
	opentracing "github.com/opentracing/opentracing-go"
)

// Send executes a command on this range, dispatching it to the
// read-only, read-write, or admin execution path as appropriate.
// ctx should contain the log tags from the store (and up).
func (r *Replica) Send(
	ctx context.Context, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, *roachpb.Error) {
	return r.sendWithRangeID(ctx, r.RangeID, &ba)
}

// sendWithRangeID takes an unused rangeID argument so that the range
// ID will be accessible in stack traces (both in panics and when
// sampling goroutines from a live server). This line is subject to
// the whims of the compiler and it can be difficult to find the right
// value, but as of this writing the following example shows a stack
// while processing range 21 (0x15) (the first occurrence of that
// number is the rangeID argument, the second is within the encoded
// BatchRequest, although we don't want to rely on that occurring
// within the portion printed in the stack trace):
//
// github.com/cockroachdb/cockroach/pkg/storage.(*Replica).sendWithRangeID(0xc420d1a000, 0x64bfb80, 0xc421564b10, 0x15, 0x153fd4634aeb0193, 0x0, 0x100000001, 0x1, 0x15, 0x0, ...)
func (r *Replica) sendWithRangeID(
	ctx context.Context, rangeID roachpb.RangeID, ba *roachpb.BatchRequest,
) (*roachpb.BatchResponse, *roachpb.Error) {
	var br *roachpb.BatchResponse
	if r.leaseholderStats != nil && ba.Header.GatewayNodeID != 0 {
		r.leaseholderStats.record(ba.Header.GatewayNodeID)
	}

	// Add the range log tag.
	ctx = r.AnnotateCtx(ctx)
	ctx, cleanup := tracing.EnsureContext(ctx, r.AmbientContext.Tracer, "replica send")
	defer cleanup()

	// If the internal Raft group is not initialized, create it and wake the leader.
	r.maybeInitializeRaftGroup(ctx)

	isReadOnly := ba.IsReadOnly()
	useRaft := !isReadOnly && ba.IsWrite()

	if err := r.checkBatchRequest(ba, isReadOnly); err != nil {
		return nil, roachpb.NewError(err)
	}

	if err := r.maybeBackpressureBatch(ctx, ba); err != nil {
		return nil, roachpb.NewError(err)
	}

	// NB: must be performed before collecting request spans.
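	// (Most likely because stripping in-flight writes can rewrite an EndTxn's
	// lock spans and in-flight write set, which feed into the latch and lock
	// spans collected later in executeBatchWithConcurrencyRetries.)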
	ba, err := maybeStripInFlightWrites(ba)
	if err != nil {
		return nil, roachpb.NewError(err)
	}

	if filter := r.store.cfg.TestingKnobs.TestingRequestFilter; filter != nil {
		if pErr := filter(ctx, *ba); pErr != nil {
			return nil, pErr
		}
	}

	// Differentiate between read-write, read-only, and admin.
	var pErr *roachpb.Error
	if useRaft {
		log.Event(ctx, "read-write path")
		fn := (*Replica).executeWriteBatch
		br, pErr = r.executeBatchWithConcurrencyRetries(ctx, ba, fn)
	} else if isReadOnly {
		log.Event(ctx, "read-only path")
		fn := (*Replica).executeReadOnlyBatch
		br, pErr = r.executeBatchWithConcurrencyRetries(ctx, ba, fn)
	} else if ba.IsAdmin() {
		log.Event(ctx, "admin path")
		br, pErr = r.executeAdminBatch(ctx, ba)
	} else if len(ba.Requests) == 0 {
		// empty batch; shouldn't happen (we could handle it, but it hints
		// at someone doing weird things, and once we drop the key range
		// from the header it won't be clear how to route those requests).
		log.Fatalf(ctx, "empty batch")
	} else {
		log.Fatalf(ctx, "don't know how to handle command %s", ba)
	}
	if pErr != nil {
		log.Eventf(ctx, "replica.Send got error: %s", pErr)
	} else {
		if filter := r.store.cfg.TestingKnobs.TestingResponseFilter; filter != nil {
			pErr = filter(ctx, *ba, br)
		}
	}
	return br, pErr
}

// batchExecutionFn is a method on Replica that is able to execute a
// BatchRequest. It is called with the batch, along with the status of
// the lease that the batch is operating under and a guard for the
// latches protecting the request.
//
// The function will return either a batch response or an error. The function
// also has the option to pass ownership of the concurrency guard back to the
// caller. However, it does not need to. Instead, it can assume responsibility
// for releasing the concurrency guard it was provided by returning nil. This is
// useful in cases where the function:
//  1. eagerly released the concurrency guard after it determined that isolation
//     from conflicting requests was no longer needed.
//  2. is continuing to execute asynchronously and needs to maintain isolation
//     from conflicting requests throughout the lifetime of its asynchronous
//     processing. The most prominent example of asynchronous processing is
//     with requests that have the "async consensus" flag set. A more subtle
//     case is with requests that are acknowledged by the Raft machinery after
//     their Raft entry has been committed but before it has been applied to
//     the replicated state machine. In all of these cases, responsibility
//     for releasing the concurrency guard is handed to Raft.
//
// However, this option is not permitted if the function returns a "server-side
// concurrency retry error" (see isConcurrencyRetryError for more details). If
// the function returns one of these errors, it must also pass ownership of the
// concurrency guard back to the caller.
type batchExecutionFn func(
	*Replica, context.Context, *roachpb.BatchRequest, kvserverpb.LeaseStatus, *concurrency.Guard,
) (*roachpb.BatchResponse, *concurrency.Guard, *roachpb.Error)

var _ batchExecutionFn = (*Replica).executeWriteBatch
var _ batchExecutionFn = (*Replica).executeReadOnlyBatch

// executeBatchWithConcurrencyRetries is the entry point for client (non-admin)
// requests that execute against the range's state.
// The method coordinates the execution of requests that may require multiple
// retries due to interactions with concurrent transactions.
//
// The method acquires latches for the request, which synchronizes it with
// conflicting requests. This permits the execution function to run without
// concern of coordinating with logically conflicting operations, although it
// still needs to worry about coordinating with non-conflicting operations when
// accessing shared data structures.
//
// If the execution function hits a concurrency error like a WriteIntentError or
// a TransactionPushError it will propagate the error back to this method, which
// handles the process of retrying batch execution after addressing the error.
func (r *Replica) executeBatchWithConcurrencyRetries(
	ctx context.Context, ba *roachpb.BatchRequest, fn batchExecutionFn,
) (br *roachpb.BatchResponse, pErr *roachpb.Error) {
	// Try to execute command; exit retry loop on success.
	var g *concurrency.Guard
	var latchSpans, lockSpans *spanset.SpanSet
	defer func() {
		// NB: wrapped to delay g evaluation to its value when returning.
		if g != nil {
			r.concMgr.FinishReq(g)
		}
	}()
	for {
		// Exit loop if context has been canceled or timed out.
		if err := ctx.Err(); err != nil {
			return nil, roachpb.NewError(errors.Wrap(err, "aborted during Replica.Send"))
		}

		// Determine the lease under which to evaluate the request.
		var status kvserverpb.LeaseStatus
		if !ba.ReadConsistency.RequiresReadLease() {
			// Get a clock reading for checkExecutionCanProceed.
			status.Timestamp = r.Clock().Now()
		} else if ba.IsSingleSkipLeaseCheckRequest() {
			// For lease commands, use the provided previous lease for verification.
			status.Lease = ba.GetPrevLeaseForLeaseRequest()
			status.Timestamp = r.Clock().Now()
		} else {
			// If the request is a write or a consistent read, it requires the
			// range lease or permission to serve via follower reads.
			if status, pErr = r.redirectOnOrAcquireLease(ctx); pErr != nil {
				if nErr := r.canServeFollowerRead(ctx, ba, pErr); nErr != nil {
					return nil, nErr
				}
			}
		}
		// Limit the transaction's maximum timestamp using observed timestamps.
		r.limitTxnMaxTimestamp(ctx, ba, status)

		// Determine the maximal set of key spans that the batch will operate
		// on. We only need to do this once and we make sure to do so after we
		// have limited the transaction's maximum timestamp.
		if latchSpans == nil {
			var err error
			latchSpans, lockSpans, err = r.collectSpans(ba)
			if err != nil {
				return nil, roachpb.NewError(err)
			}

			// Handle load-based splitting.
			r.recordBatchForLoadBasedSplitting(ctx, ba, latchSpans)
		}

		// Acquire latches to prevent overlapping requests from executing until
		// this request completes. After latching, wait on any conflicting locks
		// to ensure that the request has full isolation during evaluation. This
		// returns a request guard that must be eventually released.
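		// (The latch spans collected above tell the concurrency manager which
		// other in-flight requests to serialize with, while the lock spans tell
		// it which locks in the lock table the request must respect or wait on.)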
		var resp []roachpb.ResponseUnion
		g, resp, pErr = r.concMgr.SequenceReq(ctx, g, concurrency.Request{
			Txn:             ba.Txn,
			Timestamp:       ba.Timestamp,
			Priority:        ba.UserPriority,
			ReadConsistency: ba.ReadConsistency,
			Requests:        ba.Requests,
			LatchSpans:      latchSpans,
			LockSpans:       lockSpans,
		})
		if pErr != nil {
			return nil, pErr
		} else if resp != nil {
			br = new(roachpb.BatchResponse)
			br.Responses = resp
			return br, nil
		}

		if filter := r.store.cfg.TestingKnobs.TestingLatchFilter; filter != nil {
			if pErr := filter(ctx, *ba); pErr != nil {
				return nil, pErr
			}
		}

		br, g, pErr = fn(r, ctx, ba, status, g)
		if pErr == nil {
			// Success.
			return br, nil
		} else if !isConcurrencyRetryError(pErr) {
			// Propagate error.
			return nil, pErr
		}

		// The batch execution func returned a server-side concurrency retry
		// error. It must have also handed back ownership of the concurrency
		// guard without having already released the guard's latches.
		g.AssertLatches()
		switch t := pErr.GetDetail().(type) {
		case *roachpb.WriteIntentError:
			// Drop latches, but retain lock wait-queues.
			if g, pErr = r.handleWriteIntentError(ctx, ba, g, pErr, t); pErr != nil {
				return nil, pErr
			}
		case *roachpb.TransactionPushError:
			// Drop latches, but retain lock wait-queues.
			if g, pErr = r.handleTransactionPushError(ctx, ba, g, pErr, t); pErr != nil {
				return nil, pErr
			}
		case *roachpb.IndeterminateCommitError:
			// Drop latches and lock wait-queues.
			r.concMgr.FinishReq(g)
			g = nil
			// Then launch a task to handle the indeterminate commit error.
			if pErr = r.handleIndeterminateCommitError(ctx, ba, pErr, t); pErr != nil {
				return nil, pErr
			}
		case *roachpb.MergeInProgressError:
			// Drop latches and lock wait-queues.
			r.concMgr.FinishReq(g)
			g = nil
			// Then listen for the merge to complete.
			if pErr = r.handleMergeInProgressError(ctx, ba, pErr, t); pErr != nil {
				return nil, pErr
			}
		default:
			log.Fatalf(ctx, "unexpected concurrency retry error %T", t)
		}
		// Retry...
	}
}

// isConcurrencyRetryError returns whether or not the provided error is a
// "server-side concurrency retry error" that will be captured and retried by
// executeBatchWithConcurrencyRetries. Server-side concurrency retry errors are
// handled by dropping a request's latches, waiting for and/or ensuring that the
// condition which caused the error is handled, re-sequencing through the
// concurrency manager, and executing the request again.
func isConcurrencyRetryError(pErr *roachpb.Error) bool {
	switch pErr.GetDetail().(type) {
	case *roachpb.WriteIntentError:
		// If a request hits a WriteIntentError, it adds the conflicting intent
		// to the lockTable through a process called "lock discovery". It then
		// waits in the lock's wait-queue during its next sequencing pass.
	case *roachpb.TransactionPushError:
		// If a PushTxn request hits a TransactionPushError, it attempted to
		// push another transaction's record but did not succeed. It enqueues
		// the pushee transaction in the txnWaitQueue and waits on the record to
		// change or expire during its next sequencing pass.
	case *roachpb.IndeterminateCommitError:
		// If a PushTxn hits an IndeterminateCommitError, it attempted to push
		// an expired transaction record in the STAGING state.
		// It's unclear whether
		// the pushee is aborted or committed, so the request must kick off the
		// "transaction recovery procedure" to resolve this ambiguity before
		// retrying.
	case *roachpb.MergeInProgressError:
		// If a request hits a MergeInProgressError, the replica it is being
		// evaluated against is in the process of being merged into its left-hand
		// neighbor. The request cannot proceed until the range merge completes,
		// either successfully or unsuccessfully, so it waits before retrying.
		// If the merge does complete successfully, the retry will be rejected
		// with an error that will propagate back to the client.
	default:
		return false
	}
	return true
}

func (r *Replica) handleWriteIntentError(
	ctx context.Context,
	ba *roachpb.BatchRequest,
	g *concurrency.Guard,
	pErr *roachpb.Error,
	t *roachpb.WriteIntentError,
) (*concurrency.Guard, *roachpb.Error) {
	if r.store.cfg.TestingKnobs.DontPushOnWriteIntentError {
		return g, pErr
	}
	// g's latches will be dropped, but it retains its spot in lock wait-queues.
	return r.concMgr.HandleWriterIntentError(ctx, g, t)
}

func (r *Replica) handleTransactionPushError(
	ctx context.Context,
	ba *roachpb.BatchRequest,
	g *concurrency.Guard,
	pErr *roachpb.Error,
	t *roachpb.TransactionPushError,
) (*concurrency.Guard, *roachpb.Error) {
	// On a transaction push error, retry immediately if doing so will enqueue
	// into the txnWaitQueue in order to await further updates to the unpushed
	// txn's status. We check ShouldPushImmediately to avoid retrying
	// non-queueable PushTxnRequests (see #18191).
	dontRetry := r.store.cfg.TestingKnobs.DontRetryPushTxnFailures
	if !dontRetry && ba.IsSinglePushTxnRequest() {
		pushReq := ba.Requests[0].GetInner().(*roachpb.PushTxnRequest)
		dontRetry = txnwait.ShouldPushImmediately(pushReq)
	}
	if dontRetry {
		return g, pErr
	}
	// g's latches will be dropped, but it retains its spot in lock wait-queues
	// (though a PushTxn shouldn't be in any lock wait-queues).
	return r.concMgr.HandleTransactionPushError(ctx, g, t), nil
}

func (r *Replica) handleIndeterminateCommitError(
	ctx context.Context,
	ba *roachpb.BatchRequest,
	pErr *roachpb.Error,
	t *roachpb.IndeterminateCommitError,
) *roachpb.Error {
	if r.store.cfg.TestingKnobs.DontRecoverIndeterminateCommits {
		return pErr
	}
	// On an indeterminate commit error, attempt to recover and finalize the
	// stuck transaction. Retry immediately if successful.
	if _, err := r.store.recoveryMgr.ResolveIndeterminateCommit(ctx, t); err != nil {
		// Do not propagate ambiguous results; assume success and retry original op.
		if errors.HasType(err, (*roachpb.AmbiguousResultError)(nil)) {
			return nil
		}
		// Propagate new error. Preserve the error index.
		newPErr := roachpb.NewError(err)
		newPErr.Index = pErr.Index
		return newPErr
	}
	// We've recovered the transaction that blocked the push; retry command.
	return nil
}

func (r *Replica) handleMergeInProgressError(
	ctx context.Context,
	ba *roachpb.BatchRequest,
	pErr *roachpb.Error,
	t *roachpb.MergeInProgressError,
) *roachpb.Error {
	// A merge was in progress. We need to retry the command after the merge
	// completes, as signaled by the closing of the replica's mergeComplete
	// channel.
	// Note that the merge may have already completed, in which case
	// its mergeComplete channel will be nil.
	mergeCompleteCh := r.getMergeCompleteCh()
	if mergeCompleteCh == nil {
		// Merge no longer in progress. Retry the command.
		return nil
	}
	log.Event(ctx, "waiting on in-progress merge")
	select {
	case <-mergeCompleteCh:
		// Merge complete. Retry the command.
		return nil
	case <-ctx.Done():
		return roachpb.NewError(errors.Wrap(ctx.Err(), "aborted during merge"))
	case <-r.store.stopper.ShouldQuiesce():
		return roachpb.NewError(&roachpb.NodeUnavailableError{})
	}
}

// executeAdminBatch executes the command directly. There is no interaction
// with the spanlatch manager or the timestamp cache, as admin commands
// are not meant to consistently access or modify the underlying data.
// Admin commands must run on the lease holder replica. Batch support here is
// limited to single-element batches; anything else returns an error.
func (r *Replica) executeAdminBatch(
	ctx context.Context, ba *roachpb.BatchRequest,
) (*roachpb.BatchResponse, *roachpb.Error) {
	if len(ba.Requests) != 1 {
		return nil, roachpb.NewErrorf("only single-element admin batches allowed")
	}

	args := ba.Requests[0].GetInner()
	if sp := opentracing.SpanFromContext(ctx); sp != nil {
		sp.SetOperationName(reflect.TypeOf(args).String())
	}

	// Admin commands always require the range lease.
	status, pErr := r.redirectOnOrAcquireLease(ctx)
	if pErr != nil {
		return nil, pErr
	}
	// Note there is no need to limit transaction max timestamp on admin requests.

	// Verify that the batch can be executed.
	// NB: we pass nil for the spanlatch guard because we haven't acquired
	// latches yet. This is ok because each individual request that the admin
	// request sends will acquire latches.
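	// (For example, an AdminSplit runs its own distributed transaction, and the
	// individual requests it issues, such as the writes that update the range
	// descriptors, go through the normal latching path on the replicas they
	// address.)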
	if err := r.checkExecutionCanProceed(ba, nil /* g */, &status); err != nil {
		return nil, roachpb.NewError(err)
	}

	var resp roachpb.Response
	switch tArgs := args.(type) {
	case *roachpb.AdminSplitRequest:
		var reply roachpb.AdminSplitResponse
		reply, pErr = r.AdminSplit(ctx, *tArgs, "manual")
		resp = &reply

	case *roachpb.AdminUnsplitRequest:
		var reply roachpb.AdminUnsplitResponse
		reply, pErr = r.AdminUnsplit(ctx, *tArgs, "manual")
		resp = &reply

	case *roachpb.AdminMergeRequest:
		var reply roachpb.AdminMergeResponse
		reply, pErr = r.AdminMerge(ctx, *tArgs, "manual")
		resp = &reply

	case *roachpb.AdminTransferLeaseRequest:
		pErr = roachpb.NewError(r.AdminTransferLease(ctx, tArgs.Target))
		resp = &roachpb.AdminTransferLeaseResponse{}

	case *roachpb.AdminChangeReplicasRequest:
		chgs := tArgs.Changes()
		desc, err := r.ChangeReplicas(ctx, &tArgs.ExpDesc, SnapshotRequest_REBALANCE, kvserverpb.ReasonAdminRequest, "", chgs)
		pErr = roachpb.NewError(err)
		if pErr != nil {
			resp = &roachpb.AdminChangeReplicasResponse{}
		} else {
			resp = &roachpb.AdminChangeReplicasResponse{
				Desc: *desc,
			}
		}

	case *roachpb.AdminRelocateRangeRequest:
		err := r.store.AdminRelocateRange(ctx, *r.Desc(), tArgs.Targets)
		pErr = roachpb.NewError(err)
		resp = &roachpb.AdminRelocateRangeResponse{}

	case *roachpb.CheckConsistencyRequest:
		var reply roachpb.CheckConsistencyResponse
		reply, pErr = r.CheckConsistency(ctx, *tArgs)
		resp = &reply

	case *roachpb.ImportRequest:
		cArgs := batcheval.CommandArgs{
			EvalCtx: NewReplicaEvalContext(r, todoSpanSet),
			Header:  ba.Header,
			Args:    args,
		}
		var err error
		resp, err = importCmdFn(ctx, cArgs)
		pErr = roachpb.NewError(err)

	case *roachpb.AdminScatterRequest:
		reply, err := r.adminScatter(ctx, *tArgs)
		pErr = roachpb.NewError(err)
		resp = &reply

	case *roachpb.AdminVerifyProtectedTimestampRequest:
		reply, err := r.adminVerifyProtectedTimestamp(ctx, *tArgs)
		pErr = roachpb.NewError(err)
		resp = &reply

	default:
		return nil, roachpb.NewErrorf("unrecognized admin command: %T", args)
	}

	if pErr != nil {
		return nil, pErr
	}

	if ba.Header.ReturnRangeInfo {
		returnRangeInfo(resp, r)
	}

	br := &roachpb.BatchResponse{}
	br.Add(resp)
	br.Txn = resp.Header().Txn
	return br, nil
}

// checkBatchRequest verifies BatchRequest validity requirements. In particular,
// the batch must have an assigned timestamp, and either all requests must be
// read-only, or none.
//
// TODO(tschottdorf): should check that request is contained in range and that
// EndTxn only occurs at the very end.
func (r *Replica) checkBatchRequest(ba *roachpb.BatchRequest, isReadOnly bool) error {
	if ba.Timestamp == (hlc.Timestamp{}) {
		// For transactional requests, Store.Send sets the timestamp. For non-
		// transactional requests, the client sets the timestamp. Either way, we
		// need to have a timestamp at this point.
		return errors.New("Replica.checkBatchRequest: batch does not have timestamp assigned")
	}
	consistent := ba.ReadConsistency == roachpb.CONSISTENT
	if isReadOnly {
		if !consistent && ba.Txn != nil {
			// Disallow any inconsistent reads within txns.
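			// (Neither INCONSISTENT nor READ_UNCOMMITTED reads wait on intents
			// or observe the transaction's uncertainty interval, so allowing
			// them inside a transaction would quietly weaken its isolation.)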
			return errors.Errorf("cannot allow %v reads within a transaction", ba.ReadConsistency)
		}
	} else if !consistent {
		return errors.Errorf("%v mode is only available to reads", ba.ReadConsistency)
	}

	return nil
}

func (r *Replica) collectSpans(
	ba *roachpb.BatchRequest,
) (latchSpans, lockSpans *spanset.SpanSet, _ error) {
	latchSpans, lockSpans = new(spanset.SpanSet), new(spanset.SpanSet)
	// TODO(bdarnell): need to make this less global when local
	// latches are used more heavily. For example, a split will
	// have a large read-only span but also a write (see #10084).
	// Currently local spans are the exception, so preallocate for the
	// common case in which all are global. We rarely mix read and
	// write commands, so preallocate for writes if there are any
	// writes present in the batch.
	//
	// TODO(bdarnell): revisit as the local portion gets its appropriate
	// use.
	if ba.IsLocking() {
		guess := len(ba.Requests)
		if et, ok := ba.GetArg(roachpb.EndTxn); ok {
			// EndTxn declares a global write for each of its lock spans.
			guess += len(et.(*roachpb.EndTxnRequest).LockSpans) - 1
		}
		latchSpans.Reserve(spanset.SpanReadWrite, spanset.SpanGlobal, guess)
	} else {
		latchSpans.Reserve(spanset.SpanReadOnly, spanset.SpanGlobal, len(ba.Requests))
	}

	// For non-local, MVCC spans we annotate them with the request timestamp
	// during declaration. This is the timestamp used during latch acquisitions.
	// For read requests this works as expected, reads are performed at the same
	// timestamp. During writes however, we may encounter a versioned value newer
	// than the request timestamp, and may have to retry at a higher timestamp.
	// This is still safe as we're only ever writing at timestamps higher than the
	// timestamp any write latch would be declared at.
	desc := r.Desc()
	batcheval.DeclareKeysForBatch(desc, ba.Header, latchSpans)
	for _, union := range ba.Requests {
		inner := union.GetInner()
		if cmd, ok := batcheval.LookupCommand(inner.Method()); ok {
			cmd.DeclareKeys(desc, ba.Header, inner, latchSpans, lockSpans)
		} else {
			return nil, nil, errors.Errorf("unrecognized command %s", inner.Method())
		}
	}

	// Commands may create a large number of duplicate spans. De-duplicate
	// them to reduce the number of spans we pass to the spanlatch manager.
	for _, s := range [...]*spanset.SpanSet{latchSpans, lockSpans} {
		s.SortAndDedup()

		// If any command gave us spans that are invalid, bail out early
		// (before passing them to the spanlatch manager, which may panic).
		if err := s.Validate(); err != nil {
			return nil, nil, err
		}
	}

	return latchSpans, lockSpans, nil
}

// limitTxnMaxTimestamp limits the batch transaction's max timestamp
// so that it respects any timestamp already observed on this node.
// This prevents unnecessary uncertainty interval restarts caused by
// reading a value written at a timestamp between txn.Timestamp and
// txn.MaxTimestamp. The replica lease's start time is also taken into
// consideration to ensure that a lease transfer does not result in
// the observed timestamp for this node being inapplicable to data
// previously written by the former leaseholder. To wit:
//
// 1. put(k on leaseholder n1), gateway chooses t=1.0
// 2. begin; read(unrelated key on n2); gateway chooses t=0.98
// 3. pick up observed timestamp for n2 of t=0.99
// 4. n1 transfers lease for range with k to n2 @ t=1.1
// 5. read(k) on leaseholder n2 at ReadTimestamp=0.98 should get
//    ReadWithinUncertaintyInterval because of the write in step 1, so
//    even though we observed n2's timestamp in step 3 we must expand
//    the uncertainty interval to the lease's start time, which is
//    guaranteed to be greater than any write which occurred under
//    the previous leaseholder.
func (r *Replica) limitTxnMaxTimestamp(
	ctx context.Context, ba *roachpb.BatchRequest, status kvserverpb.LeaseStatus,
) {
	if ba.Txn == nil {
		return
	}
	// For calls that read data within a txn, we keep track of timestamps
	// observed from the various participating nodes' HLC clocks. If we have
	// a timestamp on file for this Node which is smaller than MaxTimestamp,
	// we can lower MaxTimestamp accordingly. If MaxTimestamp drops below
	// ReadTimestamp, we effectively can't see uncertainty restarts anymore.
	// TODO(nvanbenschoten): This should use the lease's node id.
	obsTS, ok := ba.Txn.GetObservedTimestamp(ba.Replica.NodeID)
	if !ok {
		return
	}
	// If the lease is valid, we use the greater of the observed
	// timestamp and the lease start time, up to the max timestamp. This
	// ensures we avoid incorrect assumptions about when data was
	// written, in absolute time on a different node, which held the
	// lease before this replica acquired it.
	// TODO(nvanbenschoten): Do we ever need to call this when
	// status.State != VALID?
	if status.State == kvserverpb.LeaseState_VALID {
		obsTS.Forward(status.Lease.Start)
	}
	if obsTS.Less(ba.Txn.MaxTimestamp) {
		// Copy-on-write to protect others we might be sharing the Txn with.
		txnClone := ba.Txn.Clone()
		// The uncertainty window is [ReadTimestamp, maxTS), so if that window
		// is empty, there won't be any uncertainty restarts.
		if obsTS.LessEq(ba.Txn.ReadTimestamp) {
			log.Event(ctx, "read has no clock uncertainty")
		}
		txnClone.MaxTimestamp.Backward(obsTS)
		ba.Txn = txnClone
	}
}
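
// A minimal worked illustration of the clamping performed by
// limitTxnMaxTimestamp, using the scenario from its comment above. The
// concrete values are illustrative only:
//
//   obsTS = 0.99                     // step 3: timestamp observed on n2
//   leaseStart = 1.1                 // step 4: start of n2's new lease
//   obsTS.Forward(leaseStart)        // obsTS becomes 1.1, not 0.99
//   txn.MaxTimestamp.Backward(obsTS) // MaxTimestamp clamps down to 1.1
//
// Because the clamp is to the lease start (1.1) rather than to the raw
// observed timestamp (0.99), the write at t=1.0 from step 1 stays inside
// the uncertainty interval and the read in step 5 correctly restarts.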