github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_send.go

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"context"
    15  	"reflect"
    16  
    17  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval"
    18  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency"
    19  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
    20  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
    21  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/txnwait"
    22  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    23  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    24  	"github.com/cockroachdb/cockroach/pkg/util/log"
    25  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    26  	"github.com/cockroachdb/errors"
    27  	opentracing "github.com/opentracing/opentracing-go"
    28  )
    29  
    30  // Send executes a command on this range, dispatching it to the
    31  // read-only, read-write, or admin execution path as appropriate.
    32  // ctx should contain the log tags from the store (and up).
    33  func (r *Replica) Send(
    34  	ctx context.Context, ba roachpb.BatchRequest,
    35  ) (*roachpb.BatchResponse, *roachpb.Error) {
    36  	return r.sendWithRangeID(ctx, r.RangeID, &ba)
    37  }
    38  
    39  // sendWithRangeID takes an unused rangeID argument so that the range
    40  // ID will be accessible in stack traces (both in panics and when
    41  // sampling goroutines from a live server). This line is subject to
    42  // the whims of the compiler and it can be difficult to find the right
    43  // value, but as of this writing the following example shows a stack
    44  // while processing range 21 (0x15) (the first occurrence of that
    45  // number is the rangeID argument, the second is within the encoded
    46  // BatchRequest, although we don't want to rely on that occurring
    47  // within the portion printed in the stack trace):
    48  //
    49  // github.com/cockroachdb/cockroach/pkg/storage.(*Replica).sendWithRangeID(0xc420d1a000, 0x64bfb80, 0xc421564b10, 0x15, 0x153fd4634aeb0193, 0x0, 0x100000001, 0x1, 0x15, 0x0, ...)
    50  func (r *Replica) sendWithRangeID(
    51  	ctx context.Context, rangeID roachpb.RangeID, ba *roachpb.BatchRequest,
    52  ) (*roachpb.BatchResponse, *roachpb.Error) {
    53  	var br *roachpb.BatchResponse
    54  	if r.leaseholderStats != nil && ba.Header.GatewayNodeID != 0 {
    55  		r.leaseholderStats.record(ba.Header.GatewayNodeID)
    56  	}
    57  
    58  	// Add the range log tag.
    59  	ctx = r.AnnotateCtx(ctx)
    60  	ctx, cleanup := tracing.EnsureContext(ctx, r.AmbientContext.Tracer, "replica send")
    61  	defer cleanup()
    62  
    63  	// If the internal Raft group is not initialized, create it and wake the leader.
    64  	r.maybeInitializeRaftGroup(ctx)
    65  
    66  	isReadOnly := ba.IsReadOnly()
    67  	useRaft := !isReadOnly && ba.IsWrite()
    68  
    69  	if err := r.checkBatchRequest(ba, isReadOnly); err != nil {
    70  		return nil, roachpb.NewError(err)
    71  	}
    72  
    73  	if err := r.maybeBackpressureBatch(ctx, ba); err != nil {
    74  		return nil, roachpb.NewError(err)
    75  	}
    76  
    77  	// NB: must be performed before collecting request spans.
    78  	ba, err := maybeStripInFlightWrites(ba)
    79  	if err != nil {
    80  		return nil, roachpb.NewError(err)
    81  	}
    82  
    83  	if filter := r.store.cfg.TestingKnobs.TestingRequestFilter; filter != nil {
    84  		if pErr := filter(ctx, *ba); pErr != nil {
    85  			return nil, pErr
    86  		}
    87  	}
    88  
    89  	// Differentiate between read-write, read-only, and admin.
    90  	var pErr *roachpb.Error
    91  	if useRaft {
    92  		log.Event(ctx, "read-write path")
    93  		fn := (*Replica).executeWriteBatch
    94  		br, pErr = r.executeBatchWithConcurrencyRetries(ctx, ba, fn)
    95  	} else if isReadOnly {
    96  		log.Event(ctx, "read-only path")
    97  		fn := (*Replica).executeReadOnlyBatch
    98  		br, pErr = r.executeBatchWithConcurrencyRetries(ctx, ba, fn)
    99  	} else if ba.IsAdmin() {
   100  		log.Event(ctx, "admin path")
   101  		br, pErr = r.executeAdminBatch(ctx, ba)
   102  	} else if len(ba.Requests) == 0 {
   103  		// empty batch; shouldn't happen (we could handle it, but it hints
   104  		// at someone doing weird things, and once we drop the key range
   105  		// from the header it won't be clear how to route those requests).
   106  		log.Fatalf(ctx, "empty batch")
   107  	} else {
   108  		log.Fatalf(ctx, "don't know how to handle command %s", ba)
   109  	}
   110  	if pErr != nil {
   111  		log.Eventf(ctx, "replica.Send got error: %s", pErr)
   112  	} else {
   113  		if filter := r.store.cfg.TestingKnobs.TestingResponseFilter; filter != nil {
   114  			pErr = filter(ctx, *ba, br)
   115  		}
   116  	}
   117  	return br, pErr
   118  }
   119  
   120  // batchExecutionFn is a method on Replica that is able to execute a
   121  // BatchRequest. It is called with the batch, along with the status of
   122  // the lease that the batch is operating under and a guard for the
   123  // latches protecting the request.
   124  //
   125  // The function will return either a batch response or an error. The function
   126  // also has the option to pass ownership of the concurrency guard back to the
   127  // caller. However, it does not need to. Instead, it can assume responsibility
   128  // for releasing the concurrency guard it was provided by returning a nil guard.
   129  // This is useful in cases where the function:
   130  // 1. eagerly released the concurrency guard after it determined that isolation
   131  //    from conflicting requests was no longer needed.
   132  // 2. is continuing to execute asynchronously and needs to maintain isolation
   133  //    from conflicting requests throughout the lifetime of its asynchronous
   134  //    processing. The most prominent example of asynchronous processing is
   135  //    with requests that have the "async consensus" flag set. A more subtle
   136  //    case is with requests that are acknowledged by the Raft machinery after
   137  //    their Raft entry has been committed but before it has been applied to
   138  //    the replicated state machine. In all of these cases, responsibility
   139  //    for releasing the concurrency guard is handed to Raft.
   140  //
   141  // However, this option is not permitted if the function returns a "server-side
   142  // concurrency retry error" (see isConcurrencyRetryError for more details). If
   143  // the function returns one of these errors, it must also pass ownership of the
   144  // concurrency guard back to the caller.
   145  type batchExecutionFn func(
   146  	*Replica, context.Context, *roachpb.BatchRequest, kvserverpb.LeaseStatus, *concurrency.Guard,
   147  ) (*roachpb.BatchResponse, *concurrency.Guard, *roachpb.Error)
   148  
   149  var _ batchExecutionFn = (*Replica).executeWriteBatch
   150  var _ batchExecutionFn = (*Replica).executeReadOnlyBatch
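
// The guard-ownership contract described above can be summarized with a short
// sketch. executeExampleBatch and its helpers are hypothetical and shown for
// illustration only; they are not part of this package:
//
//   func (r *Replica) executeExampleBatch(
//   	ctx context.Context, ba *roachpb.BatchRequest, st kvserverpb.LeaseStatus, g *concurrency.Guard,
//   ) (*roachpb.BatchResponse, *concurrency.Guard, *roachpb.Error) {
//   	br, pErr := evaluate(ctx, ba) // hypothetical evaluation helper
//   	if pErr != nil && isConcurrencyRetryError(pErr) {
//   		// Concurrency retry errors must hand the guard back to the caller.
//   		return nil, g, pErr
//   	}
//   	if pErr == nil && handedOffToRaft(br) { // hypothetical: async consensus, etc.
//   		// Responsibility for releasing g has been handed to Raft; signal
//   		// that by returning a nil guard.
//   		return br, nil, nil
//   	}
//   	// Otherwise, return the guard so the caller can release it.
//   	return br, g, pErr
//   }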
   151  
   152  // executeBatchWithConcurrencyRetries is the entry point for client (non-admin)
   153  // requests that execute against the range's state. The method coordinates the
   154  // execution of requests that may require multiple retries due to interactions
   155  // with concurrent transactions.
   156  //
   157  // The method acquires latches for the request, which synchronizes it with
   158  // conflicting requests. This permits the execution function to run without
   159  // concern of coordinating with logically conflicting operations, although it
   160  // still needs to worry about coordinating with non-conflicting operations when
   161  // accessing shared data structures.
   162  //
   163  // If the execution function hits a concurrency error like a WriteIntentError or
   164  // a TransactionPushError it will propagate the error back to this method, which
   165  // handles the process of retrying batch execution after addressing the error.
   166  func (r *Replica) executeBatchWithConcurrencyRetries(
   167  	ctx context.Context, ba *roachpb.BatchRequest, fn batchExecutionFn,
   168  ) (br *roachpb.BatchResponse, pErr *roachpb.Error) {
   169  	// Try to execute command; exit retry loop on success.
   170  	var g *concurrency.Guard
   171  	var latchSpans, lockSpans *spanset.SpanSet
   172  	defer func() {
   173  		// NB: wrapped to delay g evaluation to its value when returning.
   174  		if g != nil {
   175  			r.concMgr.FinishReq(g)
   176  		}
   177  	}()
   178  	for {
   179  		// Exit loop if context has been canceled or timed out.
   180  		if err := ctx.Err(); err != nil {
   181  			return nil, roachpb.NewError(errors.Wrap(err, "aborted during Replica.Send"))
   182  		}
   183  
   184  		// Determine the lease under which to evaluate the request.
   185  		var status kvserverpb.LeaseStatus
   186  		if !ba.ReadConsistency.RequiresReadLease() {
   187  			// Get a clock reading for checkExecutionCanProceed.
   188  			status.Timestamp = r.Clock().Now()
   189  		} else if ba.IsSingleSkipLeaseCheckRequest() {
   190  			// For lease commands, use the provided previous lease for verification.
   191  			status.Lease = ba.GetPrevLeaseForLeaseRequest()
   192  			status.Timestamp = r.Clock().Now()
   193  		} else {
   194  			// If the request is a write or a consistent read, it requires the
   195  			// range lease or permission to serve via follower reads.
   196  			if status, pErr = r.redirectOnOrAcquireLease(ctx); pErr != nil {
   197  				if nErr := r.canServeFollowerRead(ctx, ba, pErr); nErr != nil {
   198  					return nil, nErr
   199  				}
   200  			}
   201  		}
   202  		// Limit the transaction's maximum timestamp using observed timestamps.
   203  		r.limitTxnMaxTimestamp(ctx, ba, status)
   204  
   205  		// Determine the maximal set of key spans that the batch will operate
   206  		// on. We only need to do this once and we make sure to do so after we
   207  		// have limited the transaction's maximum timestamp.
   208  		if latchSpans == nil {
   209  			var err error
   210  			latchSpans, lockSpans, err = r.collectSpans(ba)
   211  			if err != nil {
   212  				return nil, roachpb.NewError(err)
   213  			}
   214  
   215  			// Handle load-based splitting.
   216  			r.recordBatchForLoadBasedSplitting(ctx, ba, latchSpans)
   217  		}
   218  
   219  		// Acquire latches to prevent overlapping requests from executing until
   220  		// this request completes. After latching, wait on any conflicting locks
   221  		// to ensure that the request has full isolation during evaluation. This
   222  		// returns a request guard that must be eventually released.
   223  		var resp []roachpb.ResponseUnion
   224  		g, resp, pErr = r.concMgr.SequenceReq(ctx, g, concurrency.Request{
   225  			Txn:             ba.Txn,
   226  			Timestamp:       ba.Timestamp,
   227  			Priority:        ba.UserPriority,
   228  			ReadConsistency: ba.ReadConsistency,
   229  			Requests:        ba.Requests,
   230  			LatchSpans:      latchSpans,
   231  			LockSpans:       lockSpans,
   232  		})
   233  		if pErr != nil {
   234  			return nil, pErr
   235  		} else if resp != nil {
   236  			br = new(roachpb.BatchResponse)
   237  			br.Responses = resp
   238  			return br, nil
   239  		}
   240  
   241  		if filter := r.store.cfg.TestingKnobs.TestingLatchFilter; filter != nil {
   242  			if pErr := filter(ctx, *ba); pErr != nil {
   243  				return nil, pErr
   244  			}
   245  		}
   246  
   247  		br, g, pErr = fn(r, ctx, ba, status, g)
   248  		if pErr == nil {
   249  			// Success.
   250  			return br, nil
   251  		} else if !isConcurrencyRetryError(pErr) {
   252  			// Propagate error.
   253  			return nil, pErr
   254  		}
   255  
   256  		// The batch execution func returned a server-side concurrency retry
   257  		// error. It must have also handed back ownership of the concurrency
   258  		// guard without having already released the guard's latches.
   259  		g.AssertLatches()
   260  		switch t := pErr.GetDetail().(type) {
   261  		case *roachpb.WriteIntentError:
   262  			// Drop latches, but retain lock wait-queues.
   263  			if g, pErr = r.handleWriteIntentError(ctx, ba, g, pErr, t); pErr != nil {
   264  				return nil, pErr
   265  			}
   266  		case *roachpb.TransactionPushError:
   267  			// Drop latches, but retain lock wait-queues.
   268  			if g, pErr = r.handleTransactionPushError(ctx, ba, g, pErr, t); pErr != nil {
   269  				return nil, pErr
   270  			}
   271  		case *roachpb.IndeterminateCommitError:
   272  			// Drop latches and lock wait-queues.
   273  			r.concMgr.FinishReq(g)
   274  			g = nil
   275  			// Then launch a task to handle the indeterminate commit error.
   276  			if pErr = r.handleIndeterminateCommitError(ctx, ba, pErr, t); pErr != nil {
   277  				return nil, pErr
   278  			}
   279  		case *roachpb.MergeInProgressError:
   280  			// Drop latches and lock wait-queues.
   281  			r.concMgr.FinishReq(g)
   282  			g = nil
   283  			// Then listen for the merge to complete.
   284  			if pErr = r.handleMergeInProgressError(ctx, ba, pErr, t); pErr != nil {
   285  				return nil, pErr
   286  			}
   287  		default:
   288  			log.Fatalf(ctx, "unexpected concurrency retry error %T", t)
   289  		}
   290  		// Retry...
   291  	}
   292  }
   293  
   294  // isConcurrencyRetryError returns whether or not the provided error is a
   295  // "server-side concurrency retry error" that will be captured and retried by
   296  // executeBatchWithConcurrencyRetries. Server-side concurrency retry errors are
   297  // handled by dropping a request's latches, waiting for and/or ensuring that the
   298  // condition which caused the error is handled, re-sequencing through the
   299  // concurrency manager, and executing the request again.
   300  func isConcurrencyRetryError(pErr *roachpb.Error) bool {
   301  	switch pErr.GetDetail().(type) {
   302  	case *roachpb.WriteIntentError:
   303  		// If a request hits a WriteIntentError, it adds the conflicting intent
   304  		// to the lockTable through a process called "lock discovery". It then
   305  		// waits in the lock's wait-queue during its next sequencing pass.
   306  	case *roachpb.TransactionPushError:
   307  		// If a PushTxn request hits a TransactionPushError, it attempted to
   308  	// push another transaction's record but did not succeed. It enqueues the
   309  		// pushee transaction in the txnWaitQueue and waits on the record to
   310  		// change or expire during its next sequencing pass.
   311  	case *roachpb.IndeterminateCommitError:
   312  	// If a PushTxn hits an IndeterminateCommitError, it attempted to push an
   313  		// expired transaction record in the STAGING state. It's unclear whether
   314  		// the pushee is aborted or committed, so the request must kick off the
   315  		// "transaction recovery procedure" to resolve this ambiguity before
   316  		// retrying.
   317  	case *roachpb.MergeInProgressError:
   318  		// If a request hits a MergeInProgressError, the replica it is being
   319  	// evaluated against is in the process of being merged into its left-hand
   320  		// neighbor. The request cannot proceed until the range merge completes,
   321  		// either successfully or unsuccessfully, so it waits before retrying.
   322  		// If the merge does complete successfully, the retry will be rejected
   323  		// with an error that will propagate back to the client.
   324  	default:
   325  		return false
   326  	}
   327  	return true
   328  }
   329  
   330  func (r *Replica) handleWriteIntentError(
   331  	ctx context.Context,
   332  	ba *roachpb.BatchRequest,
   333  	g *concurrency.Guard,
   334  	pErr *roachpb.Error,
   335  	t *roachpb.WriteIntentError,
   336  ) (*concurrency.Guard, *roachpb.Error) {
   337  	if r.store.cfg.TestingKnobs.DontPushOnWriteIntentError {
   338  		return g, pErr
   339  	}
   340  	// g's latches will be dropped, but it retains its spot in lock wait-queues.
   341  	return r.concMgr.HandleWriterIntentError(ctx, g, t)
   342  }
   343  
   344  func (r *Replica) handleTransactionPushError(
   345  	ctx context.Context,
   346  	ba *roachpb.BatchRequest,
   347  	g *concurrency.Guard,
   348  	pErr *roachpb.Error,
   349  	t *roachpb.TransactionPushError,
   350  ) (*concurrency.Guard, *roachpb.Error) {
   351  	// On a transaction push error, retry immediately if doing so will enqueue
   352  	// into the txnWaitQueue in order to await further updates to the unpushed
   353  	// txn's status. We check ShouldPushImmediately to avoid retrying
   354  	// non-queueable PushTxnRequests (see #18191).
   355  	dontRetry := r.store.cfg.TestingKnobs.DontRetryPushTxnFailures
   356  	if !dontRetry && ba.IsSinglePushTxnRequest() {
   357  		pushReq := ba.Requests[0].GetInner().(*roachpb.PushTxnRequest)
   358  		dontRetry = txnwait.ShouldPushImmediately(pushReq)
   359  	}
   360  	if dontRetry {
   361  		return g, pErr
   362  	}
   363  	// g's latches will be dropped, but it retains its spot in lock wait-queues
   364  	// (though a PushTxn shouldn't be in any lock wait-queues).
   365  	return r.concMgr.HandleTransactionPushError(ctx, g, t), nil
   366  }
   367  
   368  func (r *Replica) handleIndeterminateCommitError(
   369  	ctx context.Context,
   370  	ba *roachpb.BatchRequest,
   371  	pErr *roachpb.Error,
   372  	t *roachpb.IndeterminateCommitError,
   373  ) *roachpb.Error {
   374  	if r.store.cfg.TestingKnobs.DontRecoverIndeterminateCommits {
   375  		return pErr
   376  	}
   377  	// On an indeterminate commit error, attempt to recover and finalize the
   378  	// stuck transaction. Retry immediately if successful.
   379  	if _, err := r.store.recoveryMgr.ResolveIndeterminateCommit(ctx, t); err != nil {
   380  		// Do not propagate ambiguous results; assume success and retry original op.
   381  		if errors.HasType(err, (*roachpb.AmbiguousResultError)(nil)) {
   382  			return nil
   383  		}
   384  		// Propagate new error. Preserve the error index.
   385  		newPErr := roachpb.NewError(err)
   386  		newPErr.Index = pErr.Index
   387  		return newPErr
   388  	}
   389  	// We've recovered the transaction that blocked the push; retry command.
   390  	return nil
   391  }
   392  
   393  func (r *Replica) handleMergeInProgressError(
   394  	ctx context.Context,
   395  	ba *roachpb.BatchRequest,
   396  	pErr *roachpb.Error,
   397  	t *roachpb.MergeInProgressError,
   398  ) *roachpb.Error {
   399  	// A merge was in progress. We need to retry the command after the merge
   400  	// completes, as signaled by the closing of the replica's mergeComplete
   401  	// channel. Note that the merge may have already completed, in which case
   402  	// its mergeComplete channel will be nil.
   403  	mergeCompleteCh := r.getMergeCompleteCh()
   404  	if mergeCompleteCh == nil {
   405  		// Merge no longer in progress. Retry the command.
   406  		return nil
   407  	}
   408  	log.Event(ctx, "waiting on in-progress merge")
   409  	select {
   410  	case <-mergeCompleteCh:
   411  		// Merge complete. Retry the command.
   412  		return nil
   413  	case <-ctx.Done():
   414  		return roachpb.NewError(errors.Wrap(ctx.Err(), "aborted during merge"))
   415  	case <-r.store.stopper.ShouldQuiesce():
   416  		return roachpb.NewError(&roachpb.NodeUnavailableError{})
   417  	}
   418  }
   419  
   420  // executeAdminBatch executes the command directly. There is no interaction
   421  // with the spanlatch manager or the timestamp cache, as admin commands
   422  // are not meant to consistently access or modify the underlying data.
   423  // Admin commands must run on the leaseholder replica. Batch support here is
   424  // limited to single-element batches; anything else returns an error.
   425  func (r *Replica) executeAdminBatch(
   426  	ctx context.Context, ba *roachpb.BatchRequest,
   427  ) (*roachpb.BatchResponse, *roachpb.Error) {
   428  	if len(ba.Requests) != 1 {
   429  		return nil, roachpb.NewErrorf("only single-element admin batches allowed")
   430  	}
   431  
   432  	args := ba.Requests[0].GetInner()
   433  	if sp := opentracing.SpanFromContext(ctx); sp != nil {
   434  		sp.SetOperationName(reflect.TypeOf(args).String())
   435  	}
   436  
   437  	// Admin commands always require the range lease.
   438  	status, pErr := r.redirectOnOrAcquireLease(ctx)
   439  	if pErr != nil {
   440  		return nil, pErr
   441  	}
   442  	// Note there is no need to limit transaction max timestamp on admin requests.
   443  
   444  	// Verify that the batch can be executed.
   445  	// NB: we pass nil for the spanlatch guard because we haven't acquired
   446  	// latches yet. This is ok because each individual request that the admin
   447  	// request sends will acquire latches.
   448  	if err := r.checkExecutionCanProceed(ba, nil /* g */, &status); err != nil {
   449  		return nil, roachpb.NewError(err)
   450  	}
   451  
   452  	var resp roachpb.Response
   453  	switch tArgs := args.(type) {
   454  	case *roachpb.AdminSplitRequest:
   455  		var reply roachpb.AdminSplitResponse
   456  		reply, pErr = r.AdminSplit(ctx, *tArgs, "manual")
   457  		resp = &reply
   458  
   459  	case *roachpb.AdminUnsplitRequest:
   460  		var reply roachpb.AdminUnsplitResponse
   461  		reply, pErr = r.AdminUnsplit(ctx, *tArgs, "manual")
   462  		resp = &reply
   463  
   464  	case *roachpb.AdminMergeRequest:
   465  		var reply roachpb.AdminMergeResponse
   466  		reply, pErr = r.AdminMerge(ctx, *tArgs, "manual")
   467  		resp = &reply
   468  
   469  	case *roachpb.AdminTransferLeaseRequest:
   470  		pErr = roachpb.NewError(r.AdminTransferLease(ctx, tArgs.Target))
   471  		resp = &roachpb.AdminTransferLeaseResponse{}
   472  
   473  	case *roachpb.AdminChangeReplicasRequest:
   474  		chgs := tArgs.Changes()
   475  		desc, err := r.ChangeReplicas(ctx, &tArgs.ExpDesc, SnapshotRequest_REBALANCE, kvserverpb.ReasonAdminRequest, "", chgs)
   476  		pErr = roachpb.NewError(err)
   477  		if pErr != nil {
   478  			resp = &roachpb.AdminChangeReplicasResponse{}
   479  		} else {
   480  			resp = &roachpb.AdminChangeReplicasResponse{
   481  				Desc: *desc,
   482  			}
   483  		}
   484  
   485  	case *roachpb.AdminRelocateRangeRequest:
   486  		err := r.store.AdminRelocateRange(ctx, *r.Desc(), tArgs.Targets)
   487  		pErr = roachpb.NewError(err)
   488  		resp = &roachpb.AdminRelocateRangeResponse{}
   489  
   490  	case *roachpb.CheckConsistencyRequest:
   491  		var reply roachpb.CheckConsistencyResponse
   492  		reply, pErr = r.CheckConsistency(ctx, *tArgs)
   493  		resp = &reply
   494  
   495  	case *roachpb.ImportRequest:
   496  		cArgs := batcheval.CommandArgs{
   497  			EvalCtx: NewReplicaEvalContext(r, todoSpanSet),
   498  			Header:  ba.Header,
   499  			Args:    args,
   500  		}
   501  		var err error
   502  		resp, err = importCmdFn(ctx, cArgs)
   503  		pErr = roachpb.NewError(err)
   504  
   505  	case *roachpb.AdminScatterRequest:
   506  		reply, err := r.adminScatter(ctx, *tArgs)
   507  		pErr = roachpb.NewError(err)
   508  		resp = &reply
   509  
   510  	case *roachpb.AdminVerifyProtectedTimestampRequest:
   511  		reply, err := r.adminVerifyProtectedTimestamp(ctx, *tArgs)
   512  		pErr = roachpb.NewError(err)
   513  		resp = &reply
   514  
   515  	default:
   516  		return nil, roachpb.NewErrorf("unrecognized admin command: %T", args)
   517  	}
   518  
   519  	if pErr != nil {
   520  		return nil, pErr
   521  	}
   522  
   523  	if ba.Header.ReturnRangeInfo {
   524  		returnRangeInfo(resp, r)
   525  	}
   526  
   527  	br := &roachpb.BatchResponse{}
   528  	br.Add(resp)
   529  	br.Txn = resp.Header().Txn
   530  	return br, nil
   531  }
   532  
   533  // checkBatchRequest verifies BatchRequest validity requirements. In particular,
   534  // the batch must have an assigned timestamp, and either all requests must be
   535  // read-only, or none.
   536  //
   537  // TODO(tschottdorf): should check that request is contained in range and that
   538  // EndTxn only occurs at the very end.
   539  func (r *Replica) checkBatchRequest(ba *roachpb.BatchRequest, isReadOnly bool) error {
   540  	if ba.Timestamp == (hlc.Timestamp{}) {
   541  		// For transactional requests, Store.Send sets the timestamp. For non-
   542  		// transactional requests, the client sets the timestamp. Either way, we
   543  		// need to have a timestamp at this point.
   544  		return errors.New("Replica.checkBatchRequest: batch does not have timestamp assigned")
   545  	}
   546  	consistent := ba.ReadConsistency == roachpb.CONSISTENT
   547  	if isReadOnly {
   548  		if !consistent && ba.Txn != nil {
   549  			// Disallow any inconsistent reads within txns.
   550  			return errors.Errorf("cannot allow %v reads within a transaction", ba.ReadConsistency)
   551  		}
   552  	} else if !consistent {
   553  		return errors.Errorf("%v mode is only available to reads", ba.ReadConsistency)
   554  	}
   555  
   556  	return nil
   557  }
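
// For illustration, a hedged sketch of a batch that this check rejects. The
// txn and clock variables are assumed to exist in the caller's scope:
//
//   var ba roachpb.BatchRequest
//   ba.Timestamp = clock.Now()                // required; an empty timestamp is rejected
//   ba.Txn = txn                              // transactional batch...
//   ba.ReadConsistency = roachpb.INCONSISTENT // ...may not use inconsistent reads
//   ba.Add(&roachpb.GetRequest{})             // read-only request
//
// With these settings, r.checkBatchRequest(&ba, true /* isReadOnly */) returns
// a non-nil error.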
   558  
   559  func (r *Replica) collectSpans(
   560  	ba *roachpb.BatchRequest,
   561  ) (latchSpans, lockSpans *spanset.SpanSet, _ error) {
   562  	latchSpans, lockSpans = new(spanset.SpanSet), new(spanset.SpanSet)
   563  	// TODO(bdarnell): need to make this less global when local
   564  	// latches are used more heavily. For example, a split will
   565  	// have a large read-only span but also a write (see #10084).
   566  	// Currently local spans are the exception, so preallocate for the
   567  	// common case in which all are global. We rarely mix read and
   568  	// write commands, so preallocate for writes if there are any
   569  	// writes present in the batch.
   570  	//
   571  	// TODO(bdarnell): revisit as the local portion gets its appropriate
   572  	// use.
   573  	if ba.IsLocking() {
   574  		guess := len(ba.Requests)
   575  		if et, ok := ba.GetArg(roachpb.EndTxn); ok {
   576  			// EndTxn declares a global write for each of its lock spans.
   577  			guess += len(et.(*roachpb.EndTxnRequest).LockSpans) - 1
   578  		}
   579  		latchSpans.Reserve(spanset.SpanReadWrite, spanset.SpanGlobal, guess)
   580  	} else {
   581  		latchSpans.Reserve(spanset.SpanReadOnly, spanset.SpanGlobal, len(ba.Requests))
   582  	}
   583  
   584  	// For non-local, MVCC spans we annotate them with the request timestamp
   585  	// during declaration. This is the timestamp used during latch acquisitions.
   586  	// For read requests this works as expected, reads are performed at the same
   587  	// timestamp. During writes however, we may encounter a versioned value newer
   588  	// than the request timestamp, and may have to retry at a higher timestamp.
   589  	// This is still safe as we're only ever writing at timestamps higher than the
   590  	// timestamp any write latch would be declared at.
   591  	desc := r.Desc()
   592  	batcheval.DeclareKeysForBatch(desc, ba.Header, latchSpans)
   593  	for _, union := range ba.Requests {
   594  		inner := union.GetInner()
   595  		if cmd, ok := batcheval.LookupCommand(inner.Method()); ok {
   596  			cmd.DeclareKeys(desc, ba.Header, inner, latchSpans, lockSpans)
   597  		} else {
   598  			return nil, nil, errors.Errorf("unrecognized command %s", inner.Method())
   599  		}
   600  	}
   601  
   602  	// Commands may create a large number of duplicate spans. De-duplicate
   603  	// them to reduce the number of spans we pass to the spanlatch manager.
   604  	for _, s := range [...]*spanset.SpanSet{latchSpans, lockSpans} {
   605  		s.SortAndDedup()
   606  
   607  		// If any command gave us spans that are invalid, bail out early
   608  		// (before passing them to the spanlatch manager, which may panic).
   609  		if err := s.Validate(); err != nil {
   610  			return nil, nil, err
   611  		}
   612  	}
   613  
   614  	return latchSpans, lockSpans, nil
   615  }
   616  
   617  // limitTxnMaxTimestamp limits the batch transaction's max timestamp
   618  // so that it respects any timestamp already observed on this node.
   619  // This prevents unnecessary uncertainty interval restarts caused by
   620  // reading a value written at a timestamp between txn.Timestamp and
   621  // txn.MaxTimestamp. The replica lease's start time is also taken into
   622  // consideration to ensure that a lease transfer does not result in
   623  // the observed timestamp for this node being inapplicable to data
   624  // previously written by the former leaseholder. To wit:
   625  //
   626  // 1. put(k on leaseholder n1), gateway chooses t=1.0
   627  // 2. begin; read(unrelated key on n2); gateway chooses t=0.98
   628  // 3. pick up observed timestamp for n2 of t=0.99
   629  // 4. n1 transfers lease for range with k to n2 @ t=1.1
   630  // 5. read(k) on leaseholder n2 at ReadTimestamp=0.98 should get
   631  //    ReadWithinUncertaintyInterval because of the write in step 1, so
   632  //    even though we observed n2's timestamp in step 3 we must expand
   633  //    the uncertainty interval to the lease's start time, which is
   634  //    guaranteed to be greater than any write which occurred under
   635  //    the previous leaseholder.
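//
// For illustration, a hedged reading of the example above, assuming a maximum
// clock offset of 0.5 so the transaction's MaxTimestamp starts at t=1.48:
//
//   obsTS = 0.99                     // n2's observed timestamp from step 3
//   obsTS.Forward(1.1)               // lease start from step 4; obsTS becomes 1.1
//   txn.MaxTimestamp.Backward(obsTS) // MaxTimestamp drops from 1.48 to 1.1
//
// Without the lease-start forwarding, MaxTimestamp would instead drop to 0.99
// and the read at t=0.98 would miss the write at t=1.0. With it, the write at
// t=1.0 stays inside the uncertainty interval [0.98, 1.1) and the read
// correctly restarts with a ReadWithinUncertaintyInterval error.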
   636  func (r *Replica) limitTxnMaxTimestamp(
   637  	ctx context.Context, ba *roachpb.BatchRequest, status kvserverpb.LeaseStatus,
   638  ) {
   639  	if ba.Txn == nil {
   640  		return
   641  	}
   642  	// For calls that read data within a txn, we keep track of timestamps
   643  	// observed from the various participating nodes' HLC clocks. If we have
   644  	// a timestamp on file for this Node which is smaller than MaxTimestamp,
   645  	// we can lower MaxTimestamp accordingly. If MaxTimestamp drops below
   646  	// ReadTimestamp, we effectively can't see uncertainty restarts anymore.
   647  	// TODO(nvanbenschoten): This should use the lease's node id.
   648  	obsTS, ok := ba.Txn.GetObservedTimestamp(ba.Replica.NodeID)
   649  	if !ok {
   650  		return
   651  	}
   652  	// If the lease is valid, we use the greater of the observed
   653  	// timestamp and the lease start time, up to the max timestamp. This
   654  	// ensures we avoid incorrect assumptions about when data was
   655  	// written, in absolute time on a different node, which held the
   656  	// lease before this replica acquired it.
   657  	// TODO(nvanbenschoten): Do we ever need to call this when
   658  	//   status.State != VALID?
   659  	if status.State == kvserverpb.LeaseState_VALID {
   660  		obsTS.Forward(status.Lease.Start)
   661  	}
   662  	if obsTS.Less(ba.Txn.MaxTimestamp) {
   663  		// Copy-on-write to protect others we might be sharing the Txn with.
   664  		txnClone := ba.Txn.Clone()
   665  		// The uncertainty window is [ReadTimestamp, maxTS), so if that window
   666  		// is empty, there won't be any uncertainty restarts.
   667  		if obsTS.LessEq(ba.Txn.ReadTimestamp) {
   668  			log.Event(ctx, "read has no clock uncertainty")
   669  		}
   670  		txnClone.MaxTimestamp.Backward(obsTS)
   671  		ba.Txn = txnClone
   672  	}
   673  }