github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvclient/kvcoord/dist_sender.go

     1  // Copyright 2014 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvcoord
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"runtime"
    17  	"sync/atomic"
    18  	"time"
    19  	"unsafe"
    20  
    21  	"github.com/cockroachdb/cockroach/pkg/base"
    22  	"github.com/cockroachdb/cockroach/pkg/gossip"
    23  	"github.com/cockroachdb/cockroach/pkg/keys"
    24  	"github.com/cockroachdb/cockroach/pkg/kv"
    25  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    26  	"github.com/cockroachdb/cockroach/pkg/rpc"
    27  	"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
    28  	"github.com/cockroachdb/cockroach/pkg/settings"
    29  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    30  	"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
    31  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    32  	"github.com/cockroachdb/cockroach/pkg/util/log"
    33  	"github.com/cockroachdb/cockroach/pkg/util/metric"
    34  	"github.com/cockroachdb/cockroach/pkg/util/quotapool"
    35  	"github.com/cockroachdb/cockroach/pkg/util/retry"
    36  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    37  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    38  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    39  	"github.com/cockroachdb/errors"
    40  )
    41  
    42  var (
    43  	metaDistSenderBatchCount = metric.Metadata{
    44  		Name:        "distsender.batches",
    45  		Help:        "Number of batches processed",
    46  		Measurement: "Batches",
    47  		Unit:        metric.Unit_COUNT,
    48  	}
    49  	metaDistSenderPartialBatchCount = metric.Metadata{
    50  		Name:        "distsender.batches.partial",
    51  		Help:        "Number of partial batches processed after being divided on range boundaries",
    52  		Measurement: "Partial Batches",
    53  		Unit:        metric.Unit_COUNT,
    54  	}
    55  	metaDistSenderAsyncSentCount = metric.Metadata{
    56  		Name:        "distsender.batches.async.sent",
    57  		Help:        "Number of partial batches sent asynchronously",
    58  		Measurement: "Partial Batches",
    59  		Unit:        metric.Unit_COUNT,
    60  	}
    61  	metaDistSenderAsyncThrottledCount = metric.Metadata{
    62  		Name:        "distsender.batches.async.throttled",
    63  		Help:        "Number of partial batches not sent asynchronously due to throttling",
    64  		Measurement: "Partial Batches",
    65  		Unit:        metric.Unit_COUNT,
    66  	}
    67  	metaTransportSentCount = metric.Metadata{
    68  		Name:        "distsender.rpc.sent",
    69  		Help:        "Number of RPCs sent",
    70  		Measurement: "RPCs",
    71  		Unit:        metric.Unit_COUNT,
    72  	}
    73  	metaTransportLocalSentCount = metric.Metadata{
    74  		Name:        "distsender.rpc.sent.local",
    75  		Help:        "Number of local RPCs sent",
    76  		Measurement: "RPCs",
    77  		Unit:        metric.Unit_COUNT,
    78  	}
    79  	metaTransportSenderNextReplicaErrCount = metric.Metadata{
    80  		Name:        "distsender.rpc.sent.nextreplicaerror",
    81  		Help:        "Number of RPCs sent due to per-replica errors",
    82  		Measurement: "RPCs",
    83  		Unit:        metric.Unit_COUNT,
    84  	}
    85  	metaDistSenderNotLeaseHolderErrCount = metric.Metadata{
    86  		Name:        "distsender.errors.notleaseholder",
    87  		Help:        "Number of NotLeaseHolderErrors encountered",
    88  		Measurement: "Errors",
    89  		Unit:        metric.Unit_COUNT,
    90  	}
    91  	metaDistSenderInLeaseTransferBackoffsCount = metric.Metadata{
    92  		Name:        "distsender.errors.inleasetransferbackoffs",
    93  		Help:        "Number of times backed off due to NotLeaseHolderErrors during lease transfer.",
    94  		Measurement: "Errors",
    95  		Unit:        metric.Unit_COUNT,
    96  	}
    97  	metaDistSenderRangeLookups = metric.Metadata{
    98  		Name:        "distsender.rangelookups",
    99  		Help:        "Number of range lookups.",
   100  		Measurement: "Range Lookups",
   101  		Unit:        metric.Unit_COUNT,
   102  	}
   103  	metaDistSenderSlowRPCs = metric.Metadata{
   104  		Name:        "requests.slow.distsender",
   105  		Help:        "Number of RPCs stuck or retrying for a long time",
   106  		Measurement: "Requests",
   107  		Unit:        metric.Unit_COUNT,
   108  	}
   109  )
   110  
   111  // CanSendToFollower is used by the DistSender to determine if it needs to look
   112  // up the current lease holder for a request. It is used by the
   113  // followerreadsccl code to inject logic to check if follower reads are enabled.
   114  // By default, without CCL code, this function returns false.
   115  var CanSendToFollower = func(
   116  	clusterID uuid.UUID, st *cluster.Settings, ba roachpb.BatchRequest,
   117  ) bool {
   118  	return false
   119  }
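
         // As a rough sketch of how the hook above is meant to be used (not part
         // of this file; the exact conditions below are assumptions): CCL code can
         // replace the variable from an init function, e.g.
         //
         //	func init() {
         //		kvcoord.CanSendToFollower = func(
         //			clusterID uuid.UUID, st *cluster.Settings, ba roachpb.BatchRequest,
         //		) bool {
         //			// Hypothetical policy: only read-only transactional batches
         //			// qualify; the real check also consults settings and timestamps.
         //			return ba.IsReadOnly() && ba.Txn != nil
         //		}
         //	}
         //
         // The authoritative policy lives in the followerreadsccl package.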
   120  
   121  const (
   122  	// The default limit for asynchronous senders.
   123  	defaultSenderConcurrency = 500
   124  	// The maximum number of range descriptors to prefetch during range lookups.
   125  	rangeLookupPrefetchCount = 8
   126  )
   127  
   128  var rangeDescriptorCacheSize = settings.RegisterIntSetting(
   129  	"kv.range_descriptor_cache.size",
   130  	"maximum number of entries in the range descriptor and leaseholder caches",
   131  	1e6,
   132  )
   133  
   134  var senderConcurrencyLimit = settings.RegisterNonNegativeIntSetting(
   135  	"kv.dist_sender.concurrency_limit",
   136  	"maximum number of asynchronous send requests",
   137  	max(defaultSenderConcurrency, int64(32*runtime.NumCPU())),
   138  )
   139  
   140  func max(a, b int64) int64 {
   141  	if a > b {
   142  		return a
   143  	}
   144  	return b
   145  }
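
         // For example, on a 16-CPU machine the default limit resolves to
         // max(500, 32*16) = 512 concurrent async sends, while on a 4-CPU machine
         // the 500 floor wins (max(500, 128) = 500).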
   146  
   147  // DistSenderMetrics is the set of metrics for a given distributed sender.
   148  type DistSenderMetrics struct {
   149  	BatchCount              *metric.Counter
   150  	PartialBatchCount       *metric.Counter
   151  	AsyncSentCount          *metric.Counter
   152  	AsyncThrottledCount     *metric.Counter
   153  	SentCount               *metric.Counter
   154  	LocalSentCount          *metric.Counter
   155  	NextReplicaErrCount     *metric.Counter
   156  	NotLeaseHolderErrCount  *metric.Counter
   157  	InLeaseTransferBackoffs *metric.Counter
   158  	RangeLookups            *metric.Counter
   159  	SlowRPCs                *metric.Gauge
   160  }
   161  
   162  func makeDistSenderMetrics() DistSenderMetrics {
   163  	return DistSenderMetrics{
   164  		BatchCount:              metric.NewCounter(metaDistSenderBatchCount),
   165  		PartialBatchCount:       metric.NewCounter(metaDistSenderPartialBatchCount),
   166  		AsyncSentCount:          metric.NewCounter(metaDistSenderAsyncSentCount),
   167  		AsyncThrottledCount:     metric.NewCounter(metaDistSenderAsyncThrottledCount),
   168  		SentCount:               metric.NewCounter(metaTransportSentCount),
   169  		LocalSentCount:          metric.NewCounter(metaTransportLocalSentCount),
   170  		NextReplicaErrCount:     metric.NewCounter(metaTransportSenderNextReplicaErrCount),
   171  		NotLeaseHolderErrCount:  metric.NewCounter(metaDistSenderNotLeaseHolderErrCount),
   172  		InLeaseTransferBackoffs: metric.NewCounter(metaDistSenderInLeaseTransferBackoffsCount),
   173  		RangeLookups:            metric.NewCounter(metaDistSenderRangeLookups),
   174  		SlowRPCs:                metric.NewGauge(metaDistSenderSlowRPCs),
   175  	}
   176  }
   177  
   178  // A firstRangeMissingError indicates that the first range has not yet
   179  // been gossiped. This will be the case for a node which hasn't yet
   180  // joined the gossip network.
   181  type firstRangeMissingError struct{}
   182  
   183  // Error is part of the error interface.
   184  func (f firstRangeMissingError) Error() string {
   185  	return "the descriptor for the first range is not available via gossip"
   186  }
   187  
   188  // A DistSender provides methods to access Cockroach's monolithic,
   189  // distributed key value store. Each method invocation triggers a
   190  // lookup or lookups to find replica metadata for implicated key
   191  // ranges. RPCs are sent to one or more of the replicas to satisfy
   192  // the method invocation.
   193  type DistSender struct {
   194  	log.AmbientContext
   195  
   196  	st *cluster.Settings
   197  	// nodeDescriptor, if set, holds the descriptor of the node the
   198  	// DistSender lives on. It should be accessed via getNodeDescriptor(),
   199  	// which tries to obtain the value from the Gossip network if the
   200  	// descriptor is unknown.
   201  	nodeDescriptor unsafe.Pointer
   202  	// clock is used to set time for some calls. E.g. read-only ops
   203  	// which span ranges and don't require read consistency.
   204  	clock *hlc.Clock
   205  	// gossip provides up-to-date information about the start of the
   206  	// key range, used to find the replica metadata for arbitrary key
   207  	// ranges.
   208  	gossip  *gossip.Gossip
   209  	metrics DistSenderMetrics
   210  	// rangeCache caches replica metadata for key ranges.
   211  	rangeCache *RangeDescriptorCache
   212  	// leaseHolderCache caches range lease holders by range ID.
   213  	leaseHolderCache *LeaseHolderCache
   214  	transportFactory TransportFactory
   215  	rpcContext       *rpc.Context
   216  	nodeDialer       *nodedialer.Dialer
   217  	rpcRetryOptions  retry.Options
   218  	asyncSenderSem   *quotapool.IntPool
   219  	// clusterID is used to verify access to enterprise features.
   220  	// It is copied out of the rpcContext at construction time and used in
   221  	// testing.
   222  	clusterID *base.ClusterIDContainer
   223  
   224  	// disableFirstRangeUpdates disables updates of the first range via
   225  	// gossip. Used by tests which want finer control of the contents of the
   226  	// range cache.
   227  	disableFirstRangeUpdates int32
   228  
   229  	// disableParallelBatches instructs DistSender to never parallelize
   230  	// the transmission of partial batch requests across ranges.
   231  	disableParallelBatches bool
   232  }
   233  
   234  var _ kv.Sender = &DistSender{}
   235  
   236  // DistSenderConfig holds configuration and auxiliary objects that can be passed
   237  // to NewDistSender.
   238  type DistSenderConfig struct {
   239  	AmbientCtx log.AmbientContext
   240  
   241  	Settings        *cluster.Settings
   242  	Clock           *hlc.Clock
   243  	RPCRetryOptions *retry.Options
   244  	// nodeDescriptor, if provided, is used to describe which node the DistSender
   245  	// lives on, for instance when deciding where to send RPCs.
   246  	// Usually it is filled in from the Gossip network on demand.
   247  	nodeDescriptor    *roachpb.NodeDescriptor
   248  	RPCContext        *rpc.Context
   249  	RangeDescriptorDB RangeDescriptorDB
   250  
   251  	NodeDialer *nodedialer.Dialer
   252  
   253  	TestingKnobs ClientTestingKnobs
   254  }
   255  
    256  // NewDistSender returns a kv.Sender instance which connects to the
    257  // Cockroach cluster via the supplied gossip instance. Supplying a
    258  // DistSenderConfig or the fields within is optional. For omitted values, sane
   259  // defaults will be used.
   260  func NewDistSender(cfg DistSenderConfig, g *gossip.Gossip) *DistSender {
   261  	ds := &DistSender{
   262  		st:         cfg.Settings,
   263  		clock:      cfg.Clock,
   264  		gossip:     g,
   265  		metrics:    makeDistSenderMetrics(),
   266  		nodeDialer: cfg.NodeDialer,
   267  	}
   268  	if ds.st == nil {
   269  		ds.st = cluster.MakeTestingClusterSettings()
   270  	}
   271  
   272  	ds.AmbientContext = cfg.AmbientCtx
   273  	if ds.AmbientContext.Tracer == nil {
   274  		panic("no tracer set in AmbientCtx")
   275  	}
   276  
   277  	if cfg.nodeDescriptor != nil {
   278  		atomic.StorePointer(&ds.nodeDescriptor, unsafe.Pointer(cfg.nodeDescriptor))
   279  	}
   280  	rdb := cfg.RangeDescriptorDB
   281  	if rdb == nil {
   282  		rdb = ds
   283  	}
   284  	getRangeDescCacheSize := func() int64 {
   285  		return rangeDescriptorCacheSize.Get(&ds.st.SV)
   286  	}
   287  	ds.rangeCache = NewRangeDescriptorCache(ds.st, rdb, getRangeDescCacheSize, cfg.RPCContext.Stopper)
   288  	ds.leaseHolderCache = NewLeaseHolderCache(getRangeDescCacheSize)
   289  	if tf := cfg.TestingKnobs.TransportFactory; tf != nil {
   290  		ds.transportFactory = tf
   291  	} else {
   292  		ds.transportFactory = GRPCTransportFactory
   293  	}
   294  	ds.rpcRetryOptions = base.DefaultRetryOptions()
   295  	if cfg.RPCRetryOptions != nil {
   296  		ds.rpcRetryOptions = *cfg.RPCRetryOptions
   297  	}
   298  	if cfg.RPCContext == nil {
   299  		panic("no RPCContext set in DistSenderConfig")
   300  	}
   301  	ds.rpcContext = cfg.RPCContext
   302  	if ds.rpcRetryOptions.Closer == nil {
   303  		ds.rpcRetryOptions.Closer = ds.rpcContext.Stopper.ShouldQuiesce()
   304  	}
   305  	ds.clusterID = &cfg.RPCContext.ClusterID
   306  	ds.nodeDialer = cfg.NodeDialer
   307  	ds.asyncSenderSem = quotapool.NewIntPool("DistSender async concurrency",
   308  		uint64(senderConcurrencyLimit.Get(&cfg.Settings.SV)))
   309  	senderConcurrencyLimit.SetOnChange(&cfg.Settings.SV, func() {
   310  		ds.asyncSenderSem.UpdateCapacity(uint64(senderConcurrencyLimit.Get(&cfg.Settings.SV)))
   311  	})
   312  	ds.rpcContext.Stopper.AddCloser(ds.asyncSenderSem.Closer("stopper"))
   313  
   314  	if g != nil {
   315  		ctx := ds.AnnotateCtx(context.Background())
   316  		g.RegisterCallback(gossip.KeyFirstRangeDescriptor,
   317  			func(_ string, value roachpb.Value) {
   318  				if atomic.LoadInt32(&ds.disableFirstRangeUpdates) == 1 {
   319  					return
   320  				}
   321  				if log.V(1) {
   322  					var desc roachpb.RangeDescriptor
   323  					if err := value.GetProto(&desc); err != nil {
   324  						log.Errorf(ctx, "unable to parse gossiped first range descriptor: %s", err)
   325  					} else {
   326  						log.Infof(ctx, "gossiped first range descriptor: %+v", desc.Replicas())
   327  					}
   328  				}
   329  				ds.rangeCache.EvictByKey(ctx, roachpb.RKeyMin)
   330  			})
   331  	}
   332  	return ds
   333  }
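
         // A minimal construction sketch (illustrative only; it assumes the caller
         // already has a cluster.Settings, an hlc.Clock, a Stopper-backed
         // rpc.Context, a nodedialer.Dialer, and a gossip.Gossip instance in hand):
         //
         //	ds := NewDistSender(DistSenderConfig{
         //		AmbientCtx: log.AmbientContext{Tracer: tracing.NewTracer()},
         //		Settings:   st,
         //		Clock:      clock,
         //		RPCContext: rpcCtx,
         //		NodeDialer: dialer,
         //	}, g)
         //
         // Omitted fields fall back to the defaults described above; as the panics
         // in this constructor show, RPCContext and a non-nil Tracer are required.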
   334  
   335  // DisableFirstRangeUpdates disables updates of the first range via
   336  // gossip. Used by tests which want finer control of the contents of the range
   337  // cache.
   338  func (ds *DistSender) DisableFirstRangeUpdates() {
   339  	atomic.StoreInt32(&ds.disableFirstRangeUpdates, 1)
   340  }
   341  
   342  // DisableParallelBatches instructs DistSender to never parallelize the
   343  // transmission of partial batch requests across ranges.
   344  func (ds *DistSender) DisableParallelBatches() {
   345  	ds.disableParallelBatches = true
   346  }
   347  
   348  // Metrics returns a struct which contains metrics related to the distributed
   349  // sender's activity.
   350  func (ds *DistSender) Metrics() DistSenderMetrics {
   351  	return ds.metrics
   352  }
   353  
   354  // RangeDescriptorCache gives access to the DistSender's range cache.
   355  func (ds *DistSender) RangeDescriptorCache() *RangeDescriptorCache {
   356  	return ds.rangeCache
   357  }
   358  
   359  // LeaseHolderCache gives access to the DistSender's lease cache.
   360  func (ds *DistSender) LeaseHolderCache() *LeaseHolderCache {
   361  	return ds.leaseHolderCache
   362  }
   363  
    364  // RangeLookup implements the RangeDescriptorDB interface. It uses kv.RangeLookup
   365  // to perform a lookup scan for the provided key, using DistSender itself as the
   366  // client.Sender. This means that the scan will recurse into DistSender, which
   367  // will in turn use the RangeDescriptorCache again to lookup the RangeDescriptor
   368  // necessary to perform the scan.
   369  func (ds *DistSender) RangeLookup(
   370  	ctx context.Context, key roachpb.RKey, useReverseScan bool,
   371  ) ([]roachpb.RangeDescriptor, []roachpb.RangeDescriptor, error) {
   372  	ds.metrics.RangeLookups.Inc(1)
   373  	// We perform the range lookup scan with a READ_UNCOMMITTED consistency
   374  	// level because we want the scan to return intents as well as committed
    375  	// values. This is because it's not clear whether the intent
   376  	// or the previous value points to the correct location of the Range. It
   377  	// gets even more complicated when there are split-related intents or a txn
   378  	// record co-located with a replica involved in the split. Since we cannot
    379  	// know the correct answer, we look up both the pre- and post-transaction
   380  	// values.
   381  	rc := roachpb.READ_UNCOMMITTED
   382  	// By using DistSender as the sender, we guarantee that even if the desired
    383  	// RangeDescriptor is not on the first range we send the lookup to, we'll
   384  	// still find it when we scan to the next range. This addresses the issue
   385  	// described in #18032 and #16266, allowing us to support meta2 splits.
   386  	return kv.RangeLookup(ctx, ds, key.AsRawKey(), rc, rangeLookupPrefetchCount, useReverseScan)
   387  }
   388  
   389  // FirstRange implements the RangeDescriptorDB interface.
   390  // FirstRange returns the RangeDescriptor for the first range on the cluster,
   391  // which is retrieved from the gossip protocol instead of the datastore.
   392  func (ds *DistSender) FirstRange() (*roachpb.RangeDescriptor, error) {
   393  	if ds.gossip == nil {
   394  		panic("with `nil` Gossip, DistSender must not use itself as rangeDescriptorDB")
   395  	}
   396  	rangeDesc := &roachpb.RangeDescriptor{}
   397  	if err := ds.gossip.GetInfoProto(gossip.KeyFirstRangeDescriptor, rangeDesc); err != nil {
   398  		return nil, firstRangeMissingError{}
   399  	}
   400  	return rangeDesc, nil
   401  }
   402  
   403  // getNodeDescriptor returns ds.nodeDescriptor, but makes an attempt to load
   404  // it from the Gossip network if a nil value is found.
   405  // We must jump through hoops here to get the node descriptor because it's not available
   406  // until after the node has joined the gossip network and been allowed to initialize
   407  // its stores.
   408  func (ds *DistSender) getNodeDescriptor() *roachpb.NodeDescriptor {
   409  	if desc := atomic.LoadPointer(&ds.nodeDescriptor); desc != nil {
   410  		return (*roachpb.NodeDescriptor)(desc)
   411  	}
   412  	if ds.gossip == nil {
   413  		return nil
   414  	}
   415  
   416  	ownNodeID := ds.gossip.NodeID.Get()
   417  	if ownNodeID > 0 {
   418  		// TODO(tschottdorf): Consider instead adding the NodeID of the
   419  		// coordinator to the header, so we can get this from incoming
   420  		// requests. Just in case we want to mostly eliminate gossip here.
   421  		nodeDesc := &roachpb.NodeDescriptor{}
   422  		if err := ds.gossip.GetInfoProto(gossip.MakeNodeIDKey(ownNodeID), nodeDesc); err == nil {
   423  			atomic.StorePointer(&ds.nodeDescriptor, unsafe.Pointer(nodeDesc))
   424  			return nodeDesc
   425  		}
   426  	}
   427  	if log.V(1) {
   428  		ctx := ds.AnnotateCtx(context.TODO())
   429  		log.Infof(ctx, "unable to determine this node's attributes for replica "+
   430  			"selection; node is most likely bootstrapping")
   431  	}
   432  	return nil
   433  }
   434  
   435  // sendRPC sends one or more RPCs to replicas from the supplied
   436  // roachpb.Replica slice. Returns an RPC error if the request could
   437  // not be sent. Note that the reply may contain a higher level error
   438  // and must be checked in addition to the RPC error.
   439  //
   440  // The replicas are assumed to be ordered by preference, with closer
   441  // ones (i.e. expected lowest latency) first.
   442  //
   443  // See sendToReplicas for a description of the withCommit parameter.
   444  func (ds *DistSender) sendRPC(
   445  	ctx context.Context,
   446  	ba roachpb.BatchRequest,
   447  	class rpc.ConnectionClass,
   448  	rangeID roachpb.RangeID,
   449  	replicas ReplicaSlice,
   450  	li leaseholderInfo,
   451  	withCommit bool,
   452  ) (*roachpb.BatchResponse, error) {
   453  	if len(replicas) == 0 {
   454  		return nil, roachpb.NewSendError(
   455  			fmt.Sprintf("no replica node addresses available via gossip for r%d", rangeID))
   456  	}
   457  
   458  	ba.RangeID = rangeID
   459  
   460  	tracing.AnnotateTrace()
   461  	defer tracing.AnnotateTrace()
   462  
   463  	return ds.sendToReplicas(
   464  		ctx,
   465  		ba,
   466  		SendOptions{
   467  			class:   class,
   468  			metrics: &ds.metrics,
   469  		},
   470  		rangeID,
   471  		replicas,
   472  		ds.nodeDialer,
   473  		li,
   474  		withCommit,
   475  	)
   476  }
   477  
   478  // CountRanges returns the number of ranges that encompass the given key span.
   479  func (ds *DistSender) CountRanges(ctx context.Context, rs roachpb.RSpan) (int64, error) {
   480  	var count int64
   481  	ri := NewRangeIterator(ds)
   482  	for ri.Seek(ctx, rs.Key, Ascending); ri.Valid(); ri.Next(ctx) {
   483  		count++
   484  		if !ri.NeedAnother(rs) {
   485  			break
   486  		}
   487  	}
   488  	return count, ri.Error()
   489  }
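
         // Illustrative use of CountRanges (the span below is made up):
         //
         //	span := roachpb.RSpan{Key: roachpb.RKey("a"), EndKey: roachpb.RKey("z")}
         //	n, err := ds.CountRanges(ctx, span)
         //	// n is the number of ranges whose descriptors overlap ["a", "z").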
   490  
   491  // getDescriptor looks up the range descriptor to use for a query of
   492  // the key descKey with the given options. The lookup takes into
   493  // consideration the last range descriptor that the caller had used
   494  // for this key span, if any, and if the last range descriptor has
   495  // been evicted because it was found to be stale, which is all managed
   496  // through the EvictionToken. The function should be provided with an
   497  // EvictionToken if one was acquired from this function on a previous
   498  // call. If not, an empty EvictionToken can be provided.
   499  //
   500  // The range descriptor which contains the range in which the request should
   501  // start its query is returned first. Next returned is an EvictionToken. In
   502  // case the descriptor is discovered stale, the returned EvictionToken's evict
   503  // method should be called; it evicts the cache appropriately.
   504  //
   505  // If useReverseScan is set and descKey is the boundary between the two ranges,
   506  // the left range will be returned (even though descKey is actually contained on
   507  // the right range). This is useful for ReverseScans, which call this method
   508  // with their exclusive EndKey.
   509  func (ds *DistSender) getDescriptor(
   510  	ctx context.Context, descKey roachpb.RKey, evictToken *EvictionToken, useReverseScan bool,
   511  ) (*roachpb.RangeDescriptor, *EvictionToken, error) {
   512  	desc, returnToken, err := ds.rangeCache.LookupRangeDescriptorWithEvictionToken(
   513  		ctx, descKey, evictToken, useReverseScan,
   514  	)
   515  	if err != nil {
   516  		return nil, returnToken, err
   517  	}
   518  
   519  	// Sanity check: the descriptor we're about to return must include the key
   520  	// we're interested in.
   521  	{
   522  		containsFn := (*roachpb.RangeDescriptor).ContainsKey
   523  		if useReverseScan {
   524  			containsFn = (*roachpb.RangeDescriptor).ContainsKeyInverted
   525  		}
   526  		if !containsFn(desc, descKey) {
   527  			log.Fatalf(ctx, "programming error: range resolution returning non-matching descriptor: "+
   528  				"desc: %s, key: %s, reverse: %t", desc, descKey, log.Safe(useReverseScan))
   529  		}
   530  	}
   531  
   532  	return desc, returnToken, nil
   533  }
   534  
   535  // sendSingleRange gathers and rearranges the replicas, and makes an RPC call.
   536  func (ds *DistSender) sendSingleRange(
   537  	ctx context.Context, ba roachpb.BatchRequest, desc *roachpb.RangeDescriptor, withCommit bool,
   538  ) (*roachpb.BatchResponse, *roachpb.Error) {
   539  	// Try to send the call. Learner replicas won't serve reads/writes, so send
   540  	// only to the `Voters` replicas. This is just an optimization to save a
    541  	// network hop; everything would still work if we had `All` here.
   542  	replicas := NewReplicaSlice(ds.gossip, desc.Replicas().Voters())
   543  
   544  	// Rearrange the replicas so that they're ordered in expectation of
   545  	// request latency.
   546  	replicas.OptimizeReplicaOrder(ds.getNodeDescriptor(), ds.rpcContext.RemoteClocks.Latency)
   547  
   548  	var cachedLeaseHolder roachpb.ReplicaDescriptor
   549  	if storeID, ok := ds.leaseHolderCache.Lookup(ctx, desc.RangeID); ok {
   550  		if i := replicas.FindReplica(storeID); i >= 0 {
   551  			cachedLeaseHolder = replicas[i].ReplicaDescriptor
   552  		}
   553  	}
   554  	canFollowerRead := ds.clusterID != nil &&
   555  		CanSendToFollower(ds.clusterID.Get(), ds.st, ba)
   556  	// If this request needs to go to a lease holder and we know who that is, move
   557  	// it to the front.
   558  	sendToLeaseholder :=
   559  		cachedLeaseHolder != (roachpb.ReplicaDescriptor{}) &&
   560  			!canFollowerRead &&
   561  			ba.RequiresLeaseHolder()
   562  	if sendToLeaseholder {
   563  		if i := replicas.FindReplica(cachedLeaseHolder.StoreID); i >= 0 {
   564  			replicas.MoveToFront(i)
   565  		}
   566  	}
   567  	li := leaseholderInfo{
   568  		routeToFollower:   canFollowerRead || !ba.RequiresLeaseHolder(),
   569  		cachedLeaseholder: cachedLeaseHolder,
   570  	}
   571  
   572  	class := rpc.ConnectionClassForKey(desc.RSpan().Key)
   573  	br, err := ds.sendRPC(ctx, ba, class, desc.RangeID, replicas, li, withCommit)
   574  	if err != nil {
   575  		log.VErrEventf(ctx, 2, "%v", err)
   576  		return nil, roachpb.NewError(err)
   577  	}
   578  
   579  	// If the reply contains a timestamp, update the local HLC with it.
   580  	if br.Error != nil && br.Error.Now != (hlc.Timestamp{}) {
   581  		ds.clock.Update(br.Error.Now)
   582  	} else if br.Now != (hlc.Timestamp{}) {
   583  		ds.clock.Update(br.Now)
   584  	}
   585  
   586  	// Untangle the error from the received response.
   587  	pErr := br.Error
   588  	br.Error = nil // scrub the response error
   589  	return br, pErr
   590  }
   591  
   592  // initAndVerifyBatch initializes timestamp-related information and
   593  // verifies batch constraints before splitting.
   594  func (ds *DistSender) initAndVerifyBatch(
   595  	ctx context.Context, ba *roachpb.BatchRequest,
   596  ) *roachpb.Error {
   597  	// Attach the local node ID to each request.
   598  	if ba.Header.GatewayNodeID == 0 && ds.gossip != nil {
   599  		ba.Header.GatewayNodeID = ds.gossip.NodeID.Get()
   600  	}
   601  
   602  	// In the event that timestamp isn't set and read consistency isn't
   603  	// required, set the timestamp using the local clock.
   604  	if ba.ReadConsistency != roachpb.CONSISTENT && ba.Timestamp == (hlc.Timestamp{}) {
   605  		ba.Timestamp = ds.clock.Now()
   606  	}
   607  
   608  	if len(ba.Requests) < 1 {
   609  		return roachpb.NewErrorf("empty batch")
   610  	}
   611  
   612  	if ba.MaxSpanRequestKeys != 0 || ba.TargetBytes != 0 {
   613  		// Verify that the batch contains only specific range requests or the
   614  		// EndTxnRequest. Verify that a batch with a ReverseScan only contains
   615  		// ReverseScan range requests.
   616  		isReverse := ba.IsReverse()
   617  		for _, req := range ba.Requests {
   618  			inner := req.GetInner()
   619  			switch inner.(type) {
   620  			case *roachpb.ScanRequest, *roachpb.ResolveIntentRangeRequest,
   621  				*roachpb.DeleteRangeRequest, *roachpb.RevertRangeRequest:
   622  				// Accepted forward range requests.
   623  				if isReverse {
   624  					return roachpb.NewErrorf("batch with limit contains both forward and reverse scans")
   625  				}
   626  
   627  			case *roachpb.ReverseScanRequest:
   628  				// Accepted reverse range requests.
   629  
   630  			case *roachpb.QueryIntentRequest, *roachpb.EndTxnRequest:
   631  				// Accepted point requests that can be in batches with limit.
   632  
   633  			default:
   634  				return roachpb.NewErrorf("batch with limit contains %T request", inner)
   635  			}
   636  		}
   637  	}
   638  
   639  	return nil
   640  }
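
         // To illustrate the limit check above, for a batch with
         // MaxSpanRequestKeys or TargetBytes set:
         //
         //	[Scan, Scan, EndTxn]       -> accepted
         //	[ReverseScan, QueryIntent] -> accepted
         //	[Scan, ReverseScan]        -> "batch with limit contains both forward and reverse scans"
         //	[Scan, Put]                -> "batch with limit contains *roachpb.PutRequest request"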
   641  
   642  // errNo1PCTxn indicates that a batch cannot be sent as a 1 phase
   643  // commit because it spans multiple ranges and must be split into at
   644  // least two parts, with the final part containing the EndTxn
   645  // request.
   646  var errNo1PCTxn = roachpb.NewErrorf("cannot send 1PC txn to multiple ranges")
   647  
   648  // splitBatchAndCheckForRefreshSpans splits the batch according to the
   649  // canSplitET parameter and checks whether the batch can forward its
   650  // read timestamp. If the batch has its CanForwardReadTimestamp flag
   651  // set but is being split across multiple sub-batches then the flag in
   652  // the batch header is unset.
   653  func splitBatchAndCheckForRefreshSpans(
   654  	ba *roachpb.BatchRequest, canSplitET bool,
   655  ) [][]roachpb.RequestUnion {
   656  	parts := ba.Split(canSplitET)
   657  
   658  	// If the batch is split and the header has its CanForwardReadTimestamp flag
    659  	// set then we must check whether any request would need to be refreshed in
    660  	// the event that one of the partial batches was to forward its read
   661  	// timestamp during a server-side refresh. If any such request exists then
   662  	// we unset the CanForwardReadTimestamp flag.
   663  	if len(parts) > 1 && ba.CanForwardReadTimestamp {
   664  		hasRefreshSpans := func() bool {
   665  			for _, part := range parts {
   666  				for _, req := range part {
   667  					if roachpb.NeedsRefresh(req.GetInner()) {
   668  						return true
   669  					}
   670  				}
   671  			}
   672  			return false
   673  		}()
   674  		if hasRefreshSpans {
   675  			ba.CanForwardReadTimestamp = false
   676  
   677  			// If the final part contains an EndTxn request, unset its
   678  			// CanCommitAtHigherTimestamp flag as well.
   679  			lastPart := parts[len(parts)-1]
   680  			if et := lastPart[len(lastPart)-1].GetEndTxn(); et != nil {
   681  				etCopy := *et
   682  				etCopy.CanCommitAtHigherTimestamp = false
   683  				lastPart = append([]roachpb.RequestUnion(nil), lastPart...)
   684  				lastPart[len(lastPart)-1].MustSetInner(&etCopy)
   685  				parts[len(parts)-1] = lastPart
   686  			}
   687  		}
   688  	}
   689  	return parts
   690  }
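
         // Roughly, given a batch [Scan, Put, EndTxn] with CanForwardReadTimestamp
         // set and canSplitET=true, the split yields [Scan, Put] and [EndTxn]; the
         // Scan would need to be refreshed after a server-side timestamp bump, so
         // CanForwardReadTimestamp is cleared and the EndTxn copy in the final part
         // loses its CanCommitAtHigherTimestamp flag. A batch that stays in a
         // single part keeps both flags.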
   691  
    692  // Send implements the kv.Sender interface. It subdivides the Batch
   693  // into batches admissible for sending (preventing certain illegal
   694  // mixtures of requests), executes each individual part (which may
   695  // span multiple ranges), and recombines the response.
   696  //
   697  // When the request spans ranges, it is split by range and a partial
   698  // subset of the batch request is sent to affected ranges in parallel.
   699  func (ds *DistSender) Send(
   700  	ctx context.Context, ba roachpb.BatchRequest,
   701  ) (*roachpb.BatchResponse, *roachpb.Error) {
   702  	ds.metrics.BatchCount.Inc(1)
   703  
   704  	tracing.AnnotateTrace()
   705  
   706  	// TODO(nvanbenschoten): This causes ba to escape to the heap. Either
   707  	// commit to passing BatchRequests by reference or return an updated
   708  	// value from this method instead.
   709  	if pErr := ds.initAndVerifyBatch(ctx, &ba); pErr != nil {
   710  		return nil, pErr
   711  	}
   712  
   713  	ctx = ds.AnnotateCtx(ctx)
   714  	ctx, sp := tracing.EnsureChildSpan(ctx, ds.AmbientContext.Tracer, "dist sender send")
   715  	defer sp.Finish()
   716  
   717  	var rplChunks []*roachpb.BatchResponse
   718  	splitET := false
   719  	var require1PC bool
   720  	lastReq := ba.Requests[len(ba.Requests)-1].GetInner()
   721  	if et, ok := lastReq.(*roachpb.EndTxnRequest); ok && et.Require1PC {
   722  		require1PC = true
   723  	}
   724  	// To ensure that we lay down intents to prevent starvation, always
   725  	// split the end transaction request into its own batch on retries.
   726  	// Txns requiring 1PC are an exception and should never be split.
   727  	if ba.Txn != nil && ba.Txn.Epoch > 0 && !require1PC {
   728  		splitET = true
   729  	}
   730  	parts := splitBatchAndCheckForRefreshSpans(&ba, splitET)
   731  	if len(parts) > 1 && (ba.MaxSpanRequestKeys != 0 || ba.TargetBytes != 0) {
   732  		// We already verified above that the batch contains only scan requests of the same type.
   733  		// Such a batch should never need splitting.
   734  		log.Fatalf(ctx, "batch with MaxSpanRequestKeys=%d, TargetBytes=%d needs splitting",
   735  			log.Safe(ba.MaxSpanRequestKeys), log.Safe(ba.TargetBytes))
   736  	}
   737  
   738  	errIdxOffset := 0
   739  	for len(parts) > 0 {
   740  		part := parts[0]
   741  		ba.Requests = part
   742  		// The minimal key range encompassing all requests contained within.
   743  		// Local addressing has already been resolved.
   744  		// TODO(tschottdorf): consider rudimentary validation of the batch here
   745  		// (for example, non-range requests with EndKey, or empty key ranges).
   746  		rs, err := keys.Range(ba.Requests)
   747  		if err != nil {
   748  			return nil, roachpb.NewError(err)
   749  		}
   750  
   751  		// Determine whether this part of the BatchRequest contains a committing
   752  		// EndTxn request.
   753  		var withCommit, withParallelCommit bool
   754  		if etArg, ok := ba.GetArg(roachpb.EndTxn); ok {
   755  			et := etArg.(*roachpb.EndTxnRequest)
   756  			withCommit = et.Commit
   757  			withParallelCommit = et.IsParallelCommit()
   758  		}
   759  
   760  		var rpl *roachpb.BatchResponse
   761  		var pErr *roachpb.Error
   762  		if withParallelCommit {
   763  			rpl, pErr = ds.divideAndSendParallelCommit(ctx, ba, rs, 0 /* batchIdx */)
   764  		} else {
   765  			rpl, pErr = ds.divideAndSendBatchToRanges(ctx, ba, rs, withCommit, 0 /* batchIdx */)
   766  		}
   767  
   768  		if pErr == errNo1PCTxn {
   769  			// If we tried to send a single round-trip EndTxn but it looks like
   770  			// it's going to hit multiple ranges, split it here and try again.
   771  			if len(parts) != 1 {
   772  				panic("EndTxn not in last chunk of batch")
   773  			} else if require1PC {
   774  				log.Fatalf(ctx, "required 1PC transaction cannot be split: %s", ba)
   775  			}
   776  			parts = splitBatchAndCheckForRefreshSpans(&ba, true /* split ET */)
    777  			// Retry the last chunk, this time split into multiple parts with the
    778  			// EndTxn in the final part.
   779  			continue
   780  		}
   781  		if pErr != nil {
   782  			if pErr.Index != nil && pErr.Index.Index != -1 {
   783  				pErr.Index.Index += int32(errIdxOffset)
   784  			}
   785  			return nil, pErr
   786  		}
   787  
   788  		errIdxOffset += len(ba.Requests)
   789  
   790  		// Propagate transaction from last reply to next request. The final
   791  		// update is taken and put into the response's main header.
   792  		ba.UpdateTxn(rpl.Txn)
   793  		rplChunks = append(rplChunks, rpl)
   794  		parts = parts[1:]
   795  	}
   796  
   797  	var reply *roachpb.BatchResponse
   798  	if len(rplChunks) > 0 {
   799  		reply = rplChunks[0]
   800  		for _, rpl := range rplChunks[1:] {
   801  			reply.Responses = append(reply.Responses, rpl.Responses...)
   802  			reply.CollectedSpans = append(reply.CollectedSpans, rpl.CollectedSpans...)
   803  		}
   804  		lastHeader := rplChunks[len(rplChunks)-1].BatchResponse_Header
   805  		lastHeader.CollectedSpans = reply.CollectedSpans
   806  		reply.BatchResponse_Header = lastHeader
   807  	}
   808  
   809  	return reply, nil
   810  }
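
         // To make the flow above concrete (illustrative): a single Scan over
         // ["a", "z") that the cached descriptors say spans three ranges forms one
         // part. divideAndSendBatchToRanges splits it into three partial batches,
         // one per range; with no key or byte limit set, the first two may be sent
         // asynchronously (if the semaphore has capacity) while the last is sent on
         // the caller's goroutine, and the three partial responses are combined
         // back into a single BatchResponse.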
   811  
   812  type response struct {
   813  	reply     *roachpb.BatchResponse
   814  	positions []int
   815  	pErr      *roachpb.Error
   816  }
   817  
   818  // divideAndSendParallelCommit divides a parallel-committing batch into
   819  // sub-batches that can be evaluated in parallel but should not be evaluated
   820  // on a Store together.
   821  //
   822  // The case where this comes up is if the batch is performing a parallel commit
   823  // and the transaction has previously pipelined writes that have yet to be
   824  // proven successful. In this scenario, the EndTxn request will be preceded by a
   825  // series of QueryIntent requests (see txn_pipeliner.go). Before evaluating,
   826  // each of these QueryIntent requests will grab latches and wait for their
   827  // corresponding write to finish. This is how the QueryIntent requests
   828  // synchronize with the write they are trying to verify.
   829  //
   830  // If these QueryIntents remained in the same batch as the EndTxn request then
   831  // they would force the EndTxn request to wait for the previous write before
   832  // evaluating itself. This "pipeline stall" would effectively negate the benefit
   833  // of the parallel commit. To avoid this, we make sure that these "pre-commit"
   834  // QueryIntent requests are split from and issued concurrently with the rest of
   835  // the parallel commit batch.
   836  //
   837  // batchIdx indicates which partial fragment of the larger batch is being
   838  // processed by this method. Currently it is always set to zero because this
   839  // method is never invoked recursively, but it is exposed to maintain symmetry
   840  // with divideAndSendBatchToRanges.
   841  func (ds *DistSender) divideAndSendParallelCommit(
   842  	ctx context.Context, ba roachpb.BatchRequest, rs roachpb.RSpan, batchIdx int,
   843  ) (br *roachpb.BatchResponse, pErr *roachpb.Error) {
   844  	// Search backwards, looking for the first pre-commit QueryIntent.
   845  	swapIdx := -1
   846  	lastIdx := len(ba.Requests) - 1
   847  	for i := lastIdx - 1; i >= 0; i-- {
   848  		req := ba.Requests[i].GetInner()
   849  		if req.Method() == roachpb.QueryIntent {
   850  			swapIdx = i
   851  		} else {
   852  			break
   853  		}
   854  	}
   855  	if swapIdx == -1 {
   856  		// No pre-commit QueryIntents. Nothing to split.
   857  		return ds.divideAndSendBatchToRanges(ctx, ba, rs, true /* withCommit */, batchIdx)
   858  	}
   859  
   860  	// Swap the EndTxn request and the first pre-commit QueryIntent. This
   861  	// effectively creates a split point between the two groups of requests.
   862  	//
   863  	//  Before:    [put qi(1) put del qi(2) qi(3) qi(4) et]
   864  	//  After:     [put qi(1) put del et qi(3) qi(4) qi(2)]
   865  	//  Separated: [put qi(1) put del et] [qi(3) qi(4) qi(2)]
   866  	//
    867  	// NOTE: the non-pre-commit QueryIntents must remain where they are in the
   868  	// batch. These ensure that the transaction always reads its writes (see
   869  	// txnPipeliner.chainToInFlightWrites). These will introduce pipeline stalls
   870  	// and undo most of the benefit of this method, but luckily they are rare in
   871  	// practice.
   872  	swappedReqs := append([]roachpb.RequestUnion(nil), ba.Requests...)
   873  	swappedReqs[swapIdx], swappedReqs[lastIdx] = swappedReqs[lastIdx], swappedReqs[swapIdx]
   874  
   875  	// Create a new pre-commit QueryIntent-only batch and issue it
   876  	// in a non-limited async task. This batch may need to be split
   877  	// over multiple ranges, so call into divideAndSendBatchToRanges.
   878  	qiBa := ba
   879  	qiBa.Requests = swappedReqs[swapIdx+1:]
   880  	qiRS, err := keys.Range(qiBa.Requests)
   881  	if err != nil {
   882  		return br, roachpb.NewError(err)
   883  	}
   884  	qiBatchIdx := batchIdx + 1
   885  	qiResponseCh := make(chan response, 1)
   886  
   887  	runTask := ds.rpcContext.Stopper.RunAsyncTask
   888  	if ds.disableParallelBatches {
   889  		runTask = ds.rpcContext.Stopper.RunTask
   890  	}
   891  	if err := runTask(ctx, "kv.DistSender: sending pre-commit query intents", func(ctx context.Context) {
   892  		// Map response index to the original un-swapped batch index.
   893  		// Remember that we moved the last QueryIntent in this batch
   894  		// from swapIdx to the end.
   895  		//
   896  		// From the example above:
   897  		//  Before:    [put qi(1) put del qi(2) qi(3) qi(4) et]
   898  		//  Separated: [put qi(1) put del et] [qi(3) qi(4) qi(2)]
   899  		//
   900  		//  qiBa.Requests = [qi(3) qi(4) qi(2)]
   901  		//  swapIdx       = 4
   902  		//  positions     = [5 6 4]
   903  		//
   904  		positions := make([]int, len(qiBa.Requests))
   905  		positions[len(positions)-1] = swapIdx
   906  		for i := range positions[:len(positions)-1] {
   907  			positions[i] = swapIdx + 1 + i
   908  		}
   909  
   910  		// Send the batch with withCommit=true since it will be inflight
   911  		// concurrently with the EndTxn batch below.
   912  		reply, pErr := ds.divideAndSendBatchToRanges(ctx, qiBa, qiRS, true /* withCommit */, qiBatchIdx)
   913  		qiResponseCh <- response{reply: reply, positions: positions, pErr: pErr}
   914  	}); err != nil {
   915  		return nil, roachpb.NewError(err)
   916  	}
   917  
   918  	// Adjust the original batch request to ignore the pre-commit
   919  	// QueryIntent requests. Make sure to determine the request's
   920  	// new key span.
   921  	ba.Requests = swappedReqs[:swapIdx+1]
   922  	rs, err = keys.Range(ba.Requests)
   923  	if err != nil {
   924  		return nil, roachpb.NewError(err)
   925  	}
   926  	br, pErr = ds.divideAndSendBatchToRanges(ctx, ba, rs, true /* withCommit */, batchIdx)
   927  
   928  	// Wait for the QueryIntent-only batch to complete and stitch
   929  	// the responses together.
   930  	qiReply := <-qiResponseCh
   931  
   932  	// Handle error conditions.
   933  	if pErr != nil {
   934  		// The batch with the EndTxn returned an error. Ignore errors from the
   935  		// pre-commit QueryIntent requests because that request is read-only and
   936  		// will produce the same errors next time, if applicable.
   937  		if qiReply.reply != nil {
   938  			pErr.UpdateTxn(qiReply.reply.Txn)
   939  		}
   940  		maybeSwapErrorIndex(pErr, swapIdx, lastIdx)
   941  		return nil, pErr
   942  	}
   943  	if qiPErr := qiReply.pErr; qiPErr != nil {
   944  		// The batch with the pre-commit QueryIntent requests returned an error.
   945  		ignoreMissing := false
   946  		if _, ok := qiPErr.GetDetail().(*roachpb.IntentMissingError); ok {
   947  			// If the error is an IntentMissingError, detect whether this is due
   948  			// to intent resolution and can be safely ignored.
   949  			ignoreMissing, err = ds.detectIntentMissingDueToIntentResolution(ctx, br.Txn)
   950  			if err != nil {
   951  				return nil, roachpb.NewError(err)
   952  			}
   953  		}
   954  		if !ignoreMissing {
   955  			qiPErr.UpdateTxn(br.Txn)
   956  			maybeSwapErrorIndex(qiPErr, swapIdx, lastIdx)
   957  			return nil, qiPErr
   958  		}
   959  		// Populate the pre-commit QueryIntent batch response. If we made it
   960  		// here then we know we can ignore intent missing errors.
   961  		qiReply.reply = qiBa.CreateReply()
   962  		for _, ru := range qiReply.reply.Responses {
   963  			ru.GetQueryIntent().FoundIntent = true
   964  		}
   965  	}
   966  
   967  	// Both halves of the split batch succeeded. Piece them back together.
   968  	resps := make([]roachpb.ResponseUnion, len(swappedReqs))
   969  	copy(resps, br.Responses)
   970  	resps[swapIdx], resps[lastIdx] = resps[lastIdx], resps[swapIdx]
   971  	br.Responses = resps
   972  	if err := br.Combine(qiReply.reply, qiReply.positions); err != nil {
   973  		return nil, roachpb.NewError(err)
   974  	}
   975  	return br, nil
   976  }
   977  
   978  // detectIntentMissingDueToIntentResolution attempts to detect whether a missing
   979  // intent error thrown by a pre-commit QueryIntent request was due to intent
   980  // resolution after the transaction was already finalized instead of due to a
   981  // failure of the corresponding pipelined write. It is possible for these two
   982  // situations to be confused because the pre-commit QueryIntent requests are
   983  // issued in parallel with the staging EndTxn request and may evaluate after the
   984  // transaction becomes implicitly committed. If this happens and a concurrent
   985  // transaction observes the implicit commit and makes the commit explicit, it is
   986  // allowed to begin resolving the transactions intents.
    987  // allowed to begin resolving the transaction's intents.
   988  // MVCC values don't remember their transaction once they have been resolved.
   989  // This loss of information means that QueryIntent returns an intent missing
    990  // error if it finds the resolved value that corresponds to its desired intent.
   991  // Because of this, the race discussed above can result in intent missing errors
   992  // during a parallel commit even when the transaction successfully committed.
   993  //
   994  // This method queries the transaction record to determine whether an intent
   995  // missing error was caused by this race or whether the intent missing error
   996  // is real and guarantees that the transaction is not implicitly committed.
   997  //
   998  // See #37866 (issue) and #37900 (corresponding tla+ update).
   999  func (ds *DistSender) detectIntentMissingDueToIntentResolution(
  1000  	ctx context.Context, txn *roachpb.Transaction,
  1001  ) (bool, error) {
  1002  	ba := roachpb.BatchRequest{}
  1003  	ba.Timestamp = ds.clock.Now()
  1004  	ba.Add(&roachpb.QueryTxnRequest{
  1005  		RequestHeader: roachpb.RequestHeader{
  1006  			Key: txn.TxnMeta.Key,
  1007  		},
  1008  		Txn: txn.TxnMeta,
  1009  	})
  1010  	log.VEvent(ctx, 1, "detecting whether missing intent is due to intent resolution")
  1011  	br, pErr := ds.Send(ctx, ba)
  1012  	if pErr != nil {
  1013  		// We weren't able to determine whether the intent missing error is
  1014  		// due to intent resolution or not, so it is still ambiguous whether
  1015  		// the commit succeeded.
  1016  		return false, roachpb.NewAmbiguousResultError(fmt.Sprintf("error=%s [intent missing]", pErr))
  1017  	}
  1018  	respTxn := &br.Responses[0].GetQueryTxn().QueriedTxn
  1019  	switch respTxn.Status {
  1020  	case roachpb.COMMITTED:
  1021  		// The transaction has already been finalized as committed. The missing
  1022  		// intent error must have been a result of a concurrent transaction
  1023  		// recovery finding the transaction in the implicit commit state and
  1024  		// resolving one of its intents before the pre-commit QueryIntent
  1025  		// queried that intent. We know that the transaction was committed
  1026  		// successfully, so ignore the error.
  1027  		return true, nil
  1028  	case roachpb.ABORTED:
  1029  		// The transaction has either already been finalized as aborted or has
  1030  		// been finalized as committed and already had its transaction record
  1031  		// GCed. We can't distinguish between these two conditions with full
  1032  		// certainty, so we're forced to return an ambiguous commit error.
  1033  		// TODO(nvanbenschoten): QueryTxn will materialize an ABORTED transaction
  1034  		// record if one does not already exist. If we are certain that no actor
  1035  		// will ever persist an ABORTED transaction record after a COMMIT record is
  1036  		// GCed and we returned whether the record was synthesized in the QueryTxn
  1037  		// response then we could use the existence of an ABORTED transaction record
   1038  		// to further isolate the ambiguity caused by the loss of information
  1039  		// during intent resolution. If this error becomes a problem, we can explore
  1040  		// this option.
  1041  		return false, roachpb.NewAmbiguousResultError("intent missing and record aborted")
  1042  	default:
  1043  		// The transaction has not been finalized yet, so the missing intent
  1044  		// error must have been caused by a real missing intent. Propagate the
  1045  		// missing intent error.
  1046  		// NB: we don't expect the record to be PENDING at this point, but it's
  1047  		// not worth making any hard assertions about what we get back here.
  1048  		return false, nil
  1049  	}
  1050  }
  1051  
  1052  // maybeSwapErrorIndex swaps the error index from a to b or b to a if the
   1053  // error's index is set and is equal to one of these two values.
  1054  func maybeSwapErrorIndex(pErr *roachpb.Error, a, b int) {
  1055  	if pErr.Index == nil {
  1056  		return
  1057  	}
  1058  	if pErr.Index.Index == int32(a) {
  1059  		pErr.Index.Index = int32(b)
  1060  	} else if pErr.Index.Index == int32(b) {
  1061  		pErr.Index.Index = int32(a)
  1062  	}
  1063  }
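
         // Worked example: in divideAndSendParallelCommit's illustration, the
         // EndTxn moves from original index 7 to swapped index 4 (swapIdx=4,
         // lastIdx=7). An error attributed to index 4 of the swapped ordering is
         // therefore reported to the caller at index 7, and vice versa; any other
         // index is left unchanged.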
  1064  
  1065  // mergeErrors merges the two errors, combining their transaction state and
  1066  // returning the error with the highest priority.
  1067  func mergeErrors(pErr1, pErr2 *roachpb.Error) *roachpb.Error {
  1068  	ret, drop := pErr1, pErr2
  1069  	if roachpb.ErrPriority(drop.GoError()) > roachpb.ErrPriority(ret.GoError()) {
  1070  		ret, drop = drop, ret
  1071  	}
  1072  	ret.UpdateTxn(drop.GetTxn())
  1073  	return ret
  1074  }
  1075  
  1076  // divideAndSendBatchToRanges sends the supplied batch to all of the
  1077  // ranges which comprise the span specified by rs. The batch request
  1078  // is trimmed against each range which is part of the span and sent
  1079  // either serially or in parallel, if possible.
  1080  //
  1081  // batchIdx indicates which partial fragment of the larger batch is
  1082  // being processed by this method. It's specified as non-zero when
  1083  // this method is invoked recursively.
  1084  //
  1085  // withCommit indicates that the batch contains a transaction commit
  1086  // or that a transaction commit is being run concurrently with this
  1087  // batch. Either way, if this is true then sendToReplicas will need
  1088  // to handle errors differently.
  1089  func (ds *DistSender) divideAndSendBatchToRanges(
  1090  	ctx context.Context, ba roachpb.BatchRequest, rs roachpb.RSpan, withCommit bool, batchIdx int,
  1091  ) (br *roachpb.BatchResponse, pErr *roachpb.Error) {
  1092  	// Clone the BatchRequest's transaction so that future mutations to the
  1093  	// proto don't affect the proto in this batch.
  1094  	if ba.Txn != nil {
  1095  		ba.Txn = ba.Txn.Clone()
  1096  	}
  1097  	// Get initial seek key depending on direction of iteration.
  1098  	var scanDir ScanDirection
  1099  	var seekKey roachpb.RKey
  1100  	if !ba.IsReverse() {
  1101  		scanDir = Ascending
  1102  		seekKey = rs.Key
  1103  	} else {
  1104  		scanDir = Descending
  1105  		seekKey = rs.EndKey
  1106  	}
  1107  	ri := NewRangeIterator(ds)
  1108  	ri.Seek(ctx, seekKey, scanDir)
  1109  	if !ri.Valid() {
  1110  		return nil, roachpb.NewError(ri.Error())
  1111  	}
  1112  	// Take the fast path if this batch fits within a single range.
  1113  	if !ri.NeedAnother(rs) {
  1114  		resp := ds.sendPartialBatch(
  1115  			ctx, ba, rs, ri.Desc(), ri.Token(), withCommit, batchIdx, false, /* needsTruncate */
  1116  		)
  1117  		return resp.reply, resp.pErr
  1118  	}
  1119  
  1120  	// The batch spans ranges (according to our cached range descriptors).
  1121  	// Verify that this is ok.
  1122  	// TODO(tschottdorf): we should have a mechanism for discovering range
  1123  	// merges (descriptor staleness will mostly go unnoticed), or we'll be
  1124  	// turning single-range queries into multi-range queries for no good
  1125  	// reason.
  1126  	if ba.IsUnsplittable() {
  1127  		mismatch := roachpb.NewRangeKeyMismatchError(rs.Key.AsRawKey(), rs.EndKey.AsRawKey(), ri.Desc())
  1128  		return nil, roachpb.NewError(mismatch)
  1129  	}
   1130  	// If there's no transaction and ba spans ranges, possibly re-run as part of
   1131  	// a transaction for consistency. We don't need to re-run if read
   1132  	// consistency is not required.
  1133  	if ba.Txn == nil && ba.IsTransactional() && ba.ReadConsistency == roachpb.CONSISTENT {
  1134  		return nil, roachpb.NewError(&roachpb.OpRequiresTxnError{})
  1135  	}
  1136  	// If the batch contains a non-parallel commit EndTxn and spans ranges then
  1137  	// we want the caller to come again with the EndTxn in a separate
  1138  	// (non-concurrent) batch.
  1139  	//
  1140  	// NB: withCommit allows us to short-circuit the check in the common case,
  1141  	// but even when that's true, we still need to search for the EndTxn in the
  1142  	// batch.
  1143  	if withCommit {
  1144  		etArg, ok := ba.GetArg(roachpb.EndTxn)
  1145  		if ok && !etArg.(*roachpb.EndTxnRequest).IsParallelCommit() {
  1146  			return nil, errNo1PCTxn
  1147  		}
  1148  	}
  1149  
  1150  	// Make an empty slice of responses which will be populated with responses
  1151  	// as they come in via Combine().
  1152  	br = &roachpb.BatchResponse{
  1153  		Responses: make([]roachpb.ResponseUnion, len(ba.Requests)),
  1154  	}
  1155  	// This function builds a channel of responses for each range
  1156  	// implicated in the span (rs) and combines them into a single
  1157  	// BatchResponse when finished.
  1158  	var responseChs []chan response
  1159  	// couldHaveSkippedResponses is set if a ResumeSpan needs to be sent back.
  1160  	var couldHaveSkippedResponses bool
  1161  	// If couldHaveSkippedResponses is set, resumeReason indicates the reason why
  1162  	// the ResumeSpan is necessary. This reason is common to all individual
  1163  	// responses that carry a ResumeSpan.
  1164  	var resumeReason roachpb.ResponseHeader_ResumeReason
  1165  	defer func() {
  1166  		if r := recover(); r != nil {
  1167  			// If we're in the middle of a panic, don't wait on responseChs.
  1168  			panic(r)
  1169  		}
  1170  		// Combine all the responses.
  1171  		// It's important that we wait for all of them even if an error is caught
  1172  		// because the client.Sender() contract mandates that we don't "hold on" to
  1173  		// any part of a request after DistSender.Send() returns.
  1174  		for _, responseCh := range responseChs {
  1175  			resp := <-responseCh
  1176  			if resp.pErr != nil {
  1177  				if pErr == nil {
  1178  					pErr = resp.pErr
  1179  					// Update the error's transaction with any new information from
  1180  					// the batch response. This may contain interesting updates if
  1181  					// the batch was parallelized and part of it succeeded.
  1182  					pErr.UpdateTxn(br.Txn)
  1183  				} else {
  1184  					// The batch was split and saw (at least) two different errors.
  1185  					// Merge their transaction state and determine which to return
  1186  					// based on their priorities.
  1187  					pErr = mergeErrors(pErr, resp.pErr)
  1188  				}
  1189  				continue
  1190  			}
  1191  
  1192  			// Combine the new response with the existing one (including updating
  1193  			// the headers) if we haven't yet seen an error.
  1194  			if pErr == nil {
  1195  				if err := br.Combine(resp.reply, resp.positions); err != nil {
  1196  					pErr = roachpb.NewError(err)
  1197  				}
  1198  			} else {
  1199  				// Update the error's transaction with any new information from
  1200  				// the batch response. This may contain interesting updates if
  1201  				// the batch was parallelized and part of it succeeded.
  1202  				pErr.UpdateTxn(resp.reply.Txn)
  1203  			}
  1204  		}
  1205  
  1206  		if pErr == nil && couldHaveSkippedResponses {
  1207  			fillSkippedResponses(ba, br, seekKey, resumeReason)
  1208  		}
  1209  	}()
  1210  
  1211  	canParallelize := ba.Header.MaxSpanRequestKeys == 0 && ba.Header.TargetBytes == 0
  1212  	if ba.IsSingleCheckConsistencyRequest() {
  1213  		// Don't parallelize full checksum requests as they have to touch the
  1214  		// entirety of each replica of each range they touch.
  1215  		isExpensive := ba.Requests[0].GetCheckConsistency().Mode == roachpb.ChecksumMode_CHECK_FULL
  1216  		canParallelize = canParallelize && !isExpensive
  1217  	}
  1218  
  1219  	for ; ri.Valid(); ri.Seek(ctx, seekKey, scanDir) {
  1220  		responseCh := make(chan response, 1)
  1221  		responseChs = append(responseChs, responseCh)
  1222  
  1223  		// Determine next seek key, taking a potentially sparse batch into
  1224  		// consideration.
  1225  		var err error
  1226  		nextRS := rs
  1227  		if scanDir == Descending {
  1228  			// In next iteration, query previous range.
  1229  			// We use the StartKey of the current descriptor as opposed to the
  1230  			// EndKey of the previous one, since the former avoids bugs when
  1231  			// stale descriptors come into play.
  1232  			seekKey, err = prev(ba, ri.Desc().StartKey)
  1233  			nextRS.EndKey = seekKey
  1234  		} else {
  1235  			// In next iteration, query next range.
  1236  			// It's important that we use the EndKey of the current descriptor
  1237  			// as opposed to the StartKey of the next one: if the former is stale,
  1238  			// it's possible that the next range has since merged the subsequent
  1239  			// one, and unless both descriptors are stale, the next descriptor's
  1240  			// StartKey would move us to the beginning of the current range,
  1241  			// resulting in a duplicate scan.
  1242  			seekKey, err = next(ba, ri.Desc().EndKey)
  1243  			nextRS.Key = seekKey
  1244  		}
  1245  		if err != nil {
  1246  			responseCh <- response{pErr: roachpb.NewError(err)}
  1247  			return
  1248  		}
  1249  
  1250  		lastRange := !ri.NeedAnother(rs)
  1251  		// Send the next partial batch to the first range in the "rs" span.
  1252  		// If we can reserve one of the limited goroutines available for parallel
  1253  		// batch RPCs, send asynchronously.
  1254  		if canParallelize && !lastRange && !ds.disableParallelBatches &&
  1255  			ds.sendPartialBatchAsync(ctx, ba, rs, ri.Desc(), ri.Token(), withCommit, batchIdx, responseCh) {
  1256  			// Sent the batch asynchronously.
  1257  		} else {
  1258  			resp := ds.sendPartialBatch(
  1259  				ctx, ba, rs, ri.Desc(), ri.Token(), withCommit, batchIdx, true, /* needsTruncate */
  1260  			)
  1261  			responseCh <- resp
  1262  			if resp.pErr != nil {
  1263  				return
  1264  			}
  1265  			// Update the transaction from the response. Note that this wouldn't happen
  1266  			// on the asynchronous path, but if we have newer information it's good to
  1267  			// use it.
  1268  			if !lastRange {
  1269  				ba.UpdateTxn(resp.reply.Txn)
  1270  			}
  1271  
  1272  			mightStopEarly := ba.MaxSpanRequestKeys > 0 || ba.TargetBytes > 0
  1273  			// Check whether we've received enough responses to exit query loop.
  1274  			if mightStopEarly {
  1275  				var replyResults int64
  1276  				var replyBytes int64
  1277  				for _, r := range resp.reply.Responses {
  1278  					replyResults += r.GetInner().Header().NumKeys
  1279  					replyBytes += r.GetInner().Header().NumBytes
  1280  				}
  1281  				// Update MaxSpanRequestKeys, if applicable. Note that ba might be
  1282  				// passed recursively to further divideAndSendBatchToRanges() calls.
  1283  				if ba.MaxSpanRequestKeys > 0 {
  1284  					if replyResults > ba.MaxSpanRequestKeys {
  1285  						// NOTE: v19.2 and below have a bug where MaxSpanRequestKeys
  1286  						// is not respected by ResolveIntentRangeRequest once the
  1287  						// limit has already been exhausted by the batch. This is
  1288  						// mostly harmless (or at least, the damage has already been
  1289  						// done by this point and resulted in a large Raft entry)
  1290  						// and has been fixed in v20.1+, so don't bother hitting the
  1291  						// assertion.
  1292  						//
  1293  						// TODO(nvanbenschoten): remove this hack in v20.2.
  1294  						if _, ok := ba.GetArg(roachpb.ResolveIntentRange); ok {
  1295  							replyResults = ba.MaxSpanRequestKeys
  1296  						} else {
  1297  							log.Fatalf(ctx, "received %d results, limit was %d",
  1298  								replyResults, ba.MaxSpanRequestKeys)
  1299  						}
  1300  					}
  1301  					ba.MaxSpanRequestKeys -= replyResults
  1302  					// Exiting; any missing responses will be filled in via defer().
  1303  					if ba.MaxSpanRequestKeys == 0 {
  1304  						couldHaveSkippedResponses = true
  1305  						resumeReason = roachpb.RESUME_KEY_LIMIT
  1306  						return
  1307  					}
  1308  				}
  1309  				if ba.TargetBytes > 0 {
  1310  					ba.TargetBytes -= replyBytes
  1311  					if ba.TargetBytes <= 0 {
  1312  						couldHaveSkippedResponses = true
  1313  						resumeReason = roachpb.RESUME_KEY_LIMIT
  1314  						return
  1315  					}
  1316  				}
  1317  			}
  1318  		}
  1319  
  1320  		// The iteration is complete if the iterator's current range
  1321  		// encompasses the remaining span, OR if the next span has
  1322  		// inverted. This can happen if this method is invoked
  1323  		// re-entrantly due to ranges being split or merged. In that case
  1324  		// the batch request has all the original requests but the span is
  1325  		// a sub-span of the original, causing next() and prev() methods
  1326  		// to potentially return values which invert the span.
  1327  		if lastRange || !nextRS.Key.Less(nextRS.EndKey) {
  1328  			return
  1329  		}
  1330  		batchIdx++
  1331  		rs = nextRS
  1332  	}
  1333  
  1334  	// We've exited early. Return the range iterator error.
  1335  	responseCh := make(chan response, 1)
  1336  	responseCh <- response{pErr: roachpb.NewError(ri.Error())}
  1337  	responseChs = append(responseChs, responseCh)
  1338  	return
  1339  }
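
// Illustrative sketch (not part of the original file): the fan-out pattern
// used by divideAndSendBatchToRanges above, reduced to plain channels. One
// buffered channel is created per partial batch, and the channels are drained
// strictly in creation order, so combined results stay in range order even
// when some sends complete asynchronously. examplePartialResult and
// exampleSendFn are hypothetical stand-ins for the real response struct and
// sendPartialBatch.
type examplePartialResult struct {
	err error
}

func exampleFanOutAndCombine(
	ctx context.Context, parts int, exampleSendFn func(context.Context, int) examplePartialResult,
) error {
	resultChs := make([]chan examplePartialResult, 0, parts)
	for i := 0; i < parts; i++ {
		ch := make(chan examplePartialResult, 1) // buffered: the sender never blocks
		resultChs = append(resultChs, ch)
		go func(i int, ch chan examplePartialResult) {
			ch <- exampleSendFn(ctx, i)
		}(i, ch)
	}
	// Drain every channel even after the first error, mirroring the
	// "don't hold on to any part of the request" contract noted above.
	var firstErr error
	for _, ch := range resultChs {
		if res := <-ch; res.err != nil && firstErr == nil {
			firstErr = res.err
		}
	}
	return firstErr
}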
  1340  
  1341  // sendPartialBatchAsync sends the partial batch asynchronously if
  1342  // there aren't currently more than the allowed number of concurrent
  1343  // async requests outstanding. Returns whether the partial batch was
  1344  // sent.
  1345  func (ds *DistSender) sendPartialBatchAsync(
  1346  	ctx context.Context,
  1347  	ba roachpb.BatchRequest,
  1348  	rs roachpb.RSpan,
  1349  	desc *roachpb.RangeDescriptor,
  1350  	evictToken *EvictionToken,
  1351  	withCommit bool,
  1352  	batchIdx int,
  1353  	responseCh chan response,
  1354  ) bool {
  1355  	if err := ds.rpcContext.Stopper.RunLimitedAsyncTask(
  1356  		ctx, "kv.DistSender: sending partial batch",
  1357  		ds.asyncSenderSem, false, /* wait */
  1358  		func(ctx context.Context) {
  1359  			ds.metrics.AsyncSentCount.Inc(1)
  1360  			responseCh <- ds.sendPartialBatch(
  1361  				ctx, ba, rs, desc, evictToken, withCommit, batchIdx, true, /* needsTruncate */
  1362  			)
  1363  		},
  1364  	); err != nil {
  1365  		ds.metrics.AsyncThrottledCount.Inc(1)
  1366  		return false
  1367  	}
  1368  	return true
  1369  }
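
// Illustrative sketch (not part of the original file): the throttling behavior
// that sendPartialBatchAsync relies on from RunLimitedAsyncTask, expressed with
// a plain buffered channel used as a counting semaphore. If no slot is free,
// the caller falls back to a synchronous send instead of blocking, which is
// what the "return false" path above amounts to. The names below are
// hypothetical.
func exampleTrySendAsync(sem chan struct{}, send func()) bool {
	select {
	case sem <- struct{}{}: // reserve one of the limited slots without waiting
	default:
		return false // throttled: the caller should send synchronously instead
	}
	go func() {
		defer func() { <-sem }() // release the slot when the send finishes
		send()
	}()
	return true
}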
  1370  
  1371  func slowRangeRPCWarningStr(
  1372  	dur time.Duration, attempts int64, desc *roachpb.RangeDescriptor, pErr *roachpb.Error,
  1373  ) string {
  1374  	return fmt.Sprintf("have been waiting %.2fs (%d attempts) for RPC to %s: %s", dur.Seconds(), attempts, desc, pErr)
  1375  }
  1376  
  1377  func slowRangeRPCReturnWarningStr(dur time.Duration, attempts int64) string {
  1378  	return fmt.Sprintf("slow RPC finished after %.2fs (%d attempts)", dur.Seconds(), attempts)
  1379  }
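
// For illustration (not in the original file), the warning helpers above
// render output such as:
//
//   slowRangeRPCReturnWarningStr(90*time.Second, 3)
//   // "slow RPC finished after 90.00s (3 attempts)"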
  1380  
  1381  // sendPartialBatch sends the supplied batch to the range specified by
  1382  // desc. The batch request is first truncated so that it contains only
  1383  // requests which intersect the range descriptor and keys for each
  1384  // request are limited to the range's key span. The send occurs in a
  1385  // retry loop to handle send failures. On failure to send to any
  1386  // replicas, we backoff and retry by refetching the range
  1387  // descriptor. If the underlying range seems to have split, we
  1388  // recursively invoke divideAndSendBatchToRanges to re-enumerate the
  1389  // ranges in the span and resend to each. If needsTruncate is true,
  1390  // the supplied batch and span must be truncated to the supplied range
  1391  // descriptor.
  1392  func (ds *DistSender) sendPartialBatch(
  1393  	ctx context.Context,
  1394  	ba roachpb.BatchRequest,
  1395  	rs roachpb.RSpan,
  1396  	desc *roachpb.RangeDescriptor,
  1397  	evictToken *EvictionToken,
  1398  	withCommit bool,
  1399  	batchIdx int,
  1400  	needsTruncate bool,
  1401  ) response {
  1402  	if batchIdx == 1 {
  1403  		ds.metrics.PartialBatchCount.Inc(2) // account for first batch
  1404  	} else if batchIdx > 1 {
  1405  		ds.metrics.PartialBatchCount.Inc(1)
  1406  	}
  1407  	var reply *roachpb.BatchResponse
  1408  	var pErr *roachpb.Error
  1409  	var err error
  1410  	var positions []int
  1411  
  1412  	isReverse := ba.IsReverse()
  1413  
  1414  	if needsTruncate {
  1415  		// Truncate the request to range descriptor.
  1416  		rs, err = rs.Intersect(desc)
  1417  		if err != nil {
  1418  			return response{pErr: roachpb.NewError(err)}
  1419  		}
  1420  		ba, positions, err = truncate(ba, rs)
  1421  		if len(positions) == 0 && err == nil {
  1422  			// This shouldn't happen in the wild, but some tests exercise it.
  1423  			return response{
  1424  				pErr: roachpb.NewErrorf("truncation resulted in empty batch on %s: %s", rs, ba),
  1425  			}
  1426  		}
  1427  		if err != nil {
  1428  			return response{pErr: roachpb.NewError(err)}
  1429  		}
  1430  	}
  1431  
  1432  	// Start a retry loop for sending the batch to the range.
  1433  	tBegin, attempts := timeutil.Now(), int64(0) // for slow log message
  1434  	for r := retry.StartWithCtx(ctx, ds.rpcRetryOptions); r.Next(); {
  1435  		attempts++
  1436  		// If we've cleared the descriptor on a send failure, re-lookup.
  1437  		if desc == nil {
  1438  			var descKey roachpb.RKey
  1439  			if isReverse {
  1440  				descKey = rs.EndKey
  1441  			} else {
  1442  				descKey = rs.Key
  1443  			}
  1444  			desc, evictToken, err = ds.getDescriptor(ctx, descKey, evictToken, isReverse)
  1445  			if err != nil {
  1446  				log.VErrEventf(ctx, 1, "range descriptor re-lookup failed: %s", err)
  1447  				// We set pErr if we encountered an error getting the descriptor in
  1448  				// order to return the most recent error when we are out of retries.
  1449  				pErr = roachpb.NewError(err)
  1450  				continue
  1451  			}
  1452  		}
  1453  
  1454  		reply, pErr = ds.sendSingleRange(ctx, ba, desc, withCommit)
  1455  
  1456  		// If sending succeeded, return immediately.
  1457  		if pErr == nil {
  1458  			return response{reply: reply, positions: positions}
  1459  		}
  1460  
  1461  		// Re-map the error index within this partial batch back
  1462  		// to its position in the encompassing batch.
  1463  		if pErr.Index != nil && pErr.Index.Index != -1 && positions != nil {
  1464  			pErr.Index.Index = int32(positions[pErr.Index.Index])
  1465  		}
  1466  
  1467  		const slowDistSenderThreshold = time.Minute
  1468  		if dur := timeutil.Since(tBegin); dur > slowDistSenderThreshold && !tBegin.IsZero() {
  1469  			ds.metrics.SlowRPCs.Inc(1)
  1470  			dur := dur // leak dur to heap only when branch taken
  1471  			log.Warningf(ctx, "slow range RPC: %v",
  1472  				slowRangeRPCWarningStr(dur, attempts, desc, pErr))
  1473  			defer func(tBegin time.Time, attempts int64) {
  1474  				ds.metrics.SlowRPCs.Dec(1)
  1475  				log.Warningf(ctx, "slow RPC response: %v",
  1476  					slowRangeRPCReturnWarningStr(timeutil.Since(tBegin), attempts))
  1477  			}(tBegin, attempts)
  1478  			tBegin = time.Time{} // prevent reentering branch for this RPC
  1479  		}
  1480  		log.VErrEventf(ctx, 2, "reply error %s: %s", ba, pErr)
  1481  
  1482  		// Error handling: If the error indicates that our range
  1483  		// descriptor is out of date, evict it from the cache and try
  1484  		// again. Errors that apply only to a single replica were
  1485  		// handled in send().
  1486  		//
  1487  		// TODO(bdarnell): Don't retry endlessly. If we fail twice in a
  1488  		// row and the range descriptor hasn't changed, return the error
  1489  		// to our caller.
  1490  		switch tErr := pErr.GetDetail().(type) {
  1491  		case *roachpb.SendError:
  1492  			// We've tried all the replicas without success. Either they're all down,
  1493  			// or we're using an out-of-date range descriptor. Invalidate the cache
  1494  			// and try again with the new metadata. Re-sending the request is ok even
  1495  			// though it might have succeeded the first time around because of
  1496  			// idempotency.
  1497  			log.VEventf(ctx, 1, "evicting range descriptor on %T and backoff for re-lookup: %+v", tErr, desc)
  1498  			evictToken.Evict(ctx)
  1499  			// Clear the descriptor to reload on the next attempt.
  1500  			desc = nil
  1501  			continue
  1502  		case *roachpb.RangeKeyMismatchError:
  1503  			// Range descriptor might be out of date - evict it. This is
  1504  			// likely the result of a range split. If we have new range
  1505  			// descriptors, insert them instead as long as they are different
  1506  			// from the last descriptor to avoid endless loops.
  1507  			var replacements []roachpb.RangeDescriptor
  1508  			different := func(rd *roachpb.RangeDescriptor) bool {
  1509  				return !desc.RSpan().Equal(rd.RSpan())
  1510  			}
  1511  			if different(&tErr.MismatchedRange) {
  1512  				replacements = append(replacements, tErr.MismatchedRange)
  1513  			}
  1514  			if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) {
  1515  				if includesFrontOfCurSpan(isReverse, tErr.SuggestedRange, rs) {
  1516  					replacements = append(replacements, *tErr.SuggestedRange)
  1517  				}
  1518  			}
  1519  			// Same as Evict() if replacements is empty.
  1520  			evictToken.EvictAndReplace(ctx, replacements...)
  1521  			// On addressing errors (likely a split), we need to re-invoke
  1522  			// the range descriptor lookup machinery, so we recurse by
  1523  			// sending batch to just the partial span this descriptor was
  1524  			// supposed to cover. Note that for the resending, we use the
  1525  			// already truncated batch, so that we know that the response
  1526  			// to it matches the positions into our batch (using the full
  1527  			// batch here would give a potentially larger response slice
  1528  			// with unknown mapping to our truncated reply).
  1529  			log.VEventf(ctx, 1, "likely split; resending batch to span: %s", tErr)
  1530  			reply, pErr = ds.divideAndSendBatchToRanges(ctx, ba, rs, withCommit, batchIdx)
  1531  			return response{reply: reply, positions: positions, pErr: pErr}
  1532  		}
  1533  		break
  1534  	}
  1535  
  1536  	// Propagate error if either the retry closer or context done
  1537  	// channels were closed.
  1538  	if pErr == nil {
  1539  		if err := ds.deduceRetryEarlyExitError(ctx); err == nil {
  1540  			log.Fatal(ctx, "exited retry loop without an error")
  1541  		} else {
  1542  			pErr = roachpb.NewError(err)
  1543  		}
  1544  	}
  1545  
  1546  	return response{pErr: pErr}
  1547  }
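
// Illustrative sketch (not part of the original file): the "log once when an
// RPC turns slow, log again when it finally returns" pattern used in the retry
// loop above, isolated from the DistSender specifics. Zeroing the start time
// keeps the branch from firing on every subsequent attempt, while the deferred
// closure runs when the enclosing function returns. exampleAttempt is a
// hypothetical stand-in for the real send.
func exampleWarnOnSlowLoop(threshold time.Duration, exampleAttempt func() bool) {
	tBegin := time.Now()
	for attempts := int64(0); ; {
		attempts++
		if exampleAttempt() {
			return
		}
		if dur := time.Since(tBegin); !tBegin.IsZero() && dur > threshold {
			fmt.Printf("slow loop: waited %.2fs (%d attempts)\n", dur.Seconds(), attempts)
			defer func(tBegin time.Time, attempts int64) {
				fmt.Printf("slow loop finished after %.2fs (%d attempts)\n",
					time.Since(tBegin).Seconds(), attempts)
			}(tBegin, attempts)
			tBegin = time.Time{} // prevent re-entering this branch
		}
	}
}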
  1548  
  1549  func (ds *DistSender) deduceRetryEarlyExitError(ctx context.Context) error {
  1550  	select {
  1551  	case <-ds.rpcRetryOptions.Closer:
  1552  		// Typically happens during shutdown.
  1553  		return &roachpb.NodeUnavailableError{}
  1554  	case <-ctx.Done():
  1555  		// Happens when the client request is canceled.
  1556  		return errors.Wrap(ctx.Err(), "aborted in distSender")
  1557  	default:
  1558  	}
  1559  	return nil
  1560  }
  1561  
  1562  func includesFrontOfCurSpan(isReverse bool, rd *roachpb.RangeDescriptor, rs roachpb.RSpan) bool {
  1563  	if isReverse {
  1564  		return rd.ContainsKeyInverted(rs.EndKey)
  1565  	}
  1566  	return rd.ContainsKey(rs.Key)
  1567  }
  1568  
  1569  // fillSkippedResponses fills in responses and ResumeSpans for requests
  1570  // when a batch finished without fully processing the requested key spans for
  1571  // (some of) the requests in the batch. This can happen when processing has met
  1572  // the batch key max limit for range requests, or some other stop condition
  1573  // based on ScanOptions.
  1574  //
  1575  // nextKey is the first key that was not processed. This will be used when
  1576  // filling in the ResumeSpans.
  1577  func fillSkippedResponses(
  1578  	ba roachpb.BatchRequest,
  1579  	br *roachpb.BatchResponse,
  1580  	nextKey roachpb.RKey,
  1581  	resumeReason roachpb.ResponseHeader_ResumeReason,
  1582  ) {
  1583  	// Some requests might have no response at all if we used a batch-wide
  1584  	// limit; simply create trivial responses for those. Note that any type
  1585  	// of request can crop up here - simply take a batch that exceeds the
  1586  	// limit, and add any other requests at higher keys at the end of the
  1587  	// batch -- they'll all come back without any response since they never
  1588  	// execute.
  1589  	var scratchBA roachpb.BatchRequest
  1590  	for i := range br.Responses {
  1591  		if br.Responses[i] != (roachpb.ResponseUnion{}) {
  1592  			continue
  1593  		}
  1594  		req := ba.Requests[i].GetInner()
  1595  		// We need to summon an empty response. The most convenient (but not
  1596  		// most efficient) way is to use (*BatchRequest).CreateReply.
  1597  		//
  1598  		// TODO(tschottdorf): can autogenerate CreateReply for individual
  1599  		// requests, see roachpb/gen_batch.go.
  1600  		if scratchBA.Requests == nil {
  1601  			scratchBA.Requests = make([]roachpb.RequestUnion, 1)
  1602  		}
  1603  		scratchBA.Requests[0].MustSetInner(req)
  1604  		br.Responses[i] = scratchBA.CreateReply().Responses[0]
  1605  	}
  1606  	// Set the ResumeSpan for future batch requests.
  1607  	isReverse := ba.IsReverse()
  1608  	for i, resp := range br.Responses {
  1609  		req := ba.Requests[i].GetInner()
  1610  		if !roachpb.IsRange(req) {
  1611  			continue
  1612  		}
  1613  		hdr := resp.GetInner().Header()
  1614  		hdr.ResumeReason = resumeReason
  1615  		origSpan := req.Header().Span()
  1616  		if isReverse {
  1617  			if hdr.ResumeSpan != nil {
  1618  				// The ResumeSpan.Key might be set to the StartKey of a range;
  1619  				// correctly set it to the Key of the original request span.
  1620  				hdr.ResumeSpan.Key = origSpan.Key
  1621  			} else if roachpb.RKey(origSpan.Key).Less(nextKey) {
  1622  				// Some keys have yet to be processed.
  1623  				hdr.ResumeSpan = new(roachpb.Span)
  1624  				*hdr.ResumeSpan = origSpan
  1625  				if nextKey.Less(roachpb.RKey(origSpan.EndKey)) {
  1626  					// The original span has been partially processed.
  1627  					hdr.ResumeSpan.EndKey = nextKey.AsRawKey()
  1628  				}
  1629  			}
  1630  		} else {
  1631  			if hdr.ResumeSpan != nil {
  1632  				// The ResumeSpan.EndKey might be set to the EndKey of a range because
  1633  			// that's what a store will set it to when the limit is reached (it
  1634  			// doesn't know any better). In that case, we correct it to the EndKey
  1635  				// of the original request span. Note that this doesn't touch
  1636  				// ResumeSpan.Key, which is really the important part of the ResumeSpan.
  1637  				hdr.ResumeSpan.EndKey = origSpan.EndKey
  1638  			} else {
  1639  				// The request might have been fully satisfied, in which case it doesn't
  1640  				// need a ResumeSpan, or it might not have. Figure out if we're in the
  1641  				// latter case.
  1642  				if nextKey.Less(roachpb.RKey(origSpan.EndKey)) {
  1643  					// Some keys have yet to be processed.
  1644  					hdr.ResumeSpan = new(roachpb.Span)
  1645  					*hdr.ResumeSpan = origSpan
  1646  					if roachpb.RKey(origSpan.Key).Less(nextKey) {
  1647  						// The original span has been partially processed.
  1648  						hdr.ResumeSpan.Key = nextKey.AsRawKey()
  1649  					}
  1650  				}
  1651  			}
  1652  		}
  1653  		br.Responses[i].GetInner().SetHeader(hdr)
  1654  	}
  1655  }
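
// Illustrative sketch (not part of the original file): the forward-scan
// ResumeSpan logic from fillSkippedResponses above, reduced to string keys. A
// request spanning [Key, EndKey) that stopped at nextKey resumes at
// max(Key, nextKey); a fully processed request gets no resume span. The
// exampleSpan type is hypothetical.
type exampleSpan struct {
	Key, EndKey string
}

func exampleResumeSpan(orig exampleSpan, nextKey string) (exampleSpan, bool) {
	if nextKey >= orig.EndKey {
		// Everything up to EndKey was processed; nothing to resume.
		return exampleSpan{}, false
	}
	resume := orig
	if nextKey > orig.Key {
		// Partially processed: resume where processing stopped.
		resume.Key = nextKey
	}
	return resume, true
}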
  1656  
  1657  // leaseholderInfo contains some routing information for RPCs.
  1658  type leaseholderInfo struct {
  1659  	// routeToFollower is set if the request is intended to be routed to a
  1660  	// follower - either because it's a read that looks stale enough to be served
  1661  	// by a follower, or otherwise because the respective batch simply doesn't
  1662  	// need the leaseholder.
  1663  	routeToFollower bool
  1664  	// cachedLeaseholder is the leaseholder that the cache indicated. Empty if the
  1665  	// cache didn't have an entry for the range.
  1666  	cachedLeaseholder roachpb.ReplicaDescriptor
  1667  }
  1668  
  1669  // sendToReplicas sends one or more RPCs to the replicas specified by
  1670  // the provided slice. On success, it returns the first successful
  1671  // reply. If an error occurs which is not specific to a single
  1672  // replica, it's returned immediately. Otherwise, when all replicas
  1673  // have been tried and failed, returns a send error.
  1674  //
  1675  // The method accepts a boolean declaring whether a transaction commit
  1676  // is either in this batch or in-flight concurrently with this batch.
  1677  // If withCommit is false (i.e. either no EndTxn is in flight,
  1678  // or it is attempting to abort), ambiguous results will never be
  1679  // returned from this method. This is because both transactional writes
  1680  // and aborts can be retried (the former due to seqno idempotency, the
  1681  // latter because aborting is idempotent). If withCommit is true, any
  1682  // errors that do not definitively rule out the possibility that the
  1683  // batch could have succeeded are transformed into AmbiguousResultErrors.
  1684  func (ds *DistSender) sendToReplicas(
  1685  	ctx context.Context,
  1686  	ba roachpb.BatchRequest,
  1687  	opts SendOptions,
  1688  	rangeID roachpb.RangeID,
  1689  	replicas ReplicaSlice,
  1690  	nodeDialer *nodedialer.Dialer,
  1691  	leaseholder leaseholderInfo,
  1692  	withCommit bool,
  1693  ) (*roachpb.BatchResponse, error) {
  1694  	transport, err := ds.transportFactory(opts, nodeDialer, replicas)
  1695  	if err != nil {
  1696  		return nil, err
  1697  	}
  1698  	if transport.IsExhausted() {
  1699  		return nil, roachpb.NewSendError(
  1700  			fmt.Sprintf("sending to all %d replicas failed", len(replicas)))
  1701  	}
  1702  
  1703  	curReplica := transport.NextReplica()
  1704  	if log.ExpensiveLogEnabled(ctx, 2) {
  1705  		log.VEventf(ctx, 2, "r%d: sending batch %s to %s", rangeID, ba.Summary(), curReplica)
  1706  	}
  1707  	br, err := transport.SendNext(ctx, ba)
  1708  	// maxSeenLeaseSequence tracks the maximum LeaseSequence seen in a
  1709  	// NotLeaseHolderError. If we encounter a sequence number less than or equal
  1710  	// to maxSeenLeaseSequence number in a subsequent NotLeaseHolderError then
  1711  	// to maxSeenLeaseSequence in a subsequent NotLeaseHolderError, then
  1712  	// the range must be experiencing a lease transfer and the client should back
  1713  	maxSeenLeaseSequence := roachpb.LeaseSequence(-1)
  1714  	inTransferRetry := retry.StartWithCtx(ctx, ds.rpcRetryOptions)
  1715  	inTransferRetry.Next() // The first call to Next does not block.
  1716  
  1717  	// This loop will retry operations that fail with errors that reflect
  1718  	// per-replica state and may succeed on other replicas.
  1719  	var ambiguousError error
  1720  	for {
  1721  		if err != nil {
  1722  			// For most connection errors, we cannot tell whether or not the request
  1723  			// may have succeeded on the remote server (exceptions are captured in the
  1724  			// grpcutil.RequestDidNotStart function). We'll retry the request in order
  1725  			// to attempt to eliminate the ambiguity; see below. If there's a commit
  1726  			// in the batch, we track the ambiguity more explicitly by setting
  1727  			// ambiguousError. This serves two purposes:
  1728  			// 1) the higher-level retries in the DistSender will not forget the
  1729  			// ambiguity, like they forget it for non-commit batches. This in turn
  1730  			// will ensure that TxnCoordSender-level retries don't happen across
  1731  			// commits; that'd be bad since requests are not idempotent across
  1732  			// commits.
  1733  			// TODO(andrei): This higher-level retry does things too bluntly, retrying
  1734  			// only in case of SendError. It should also retry in case of
  1735  			// AmbiguousResultError as long as it makes sure to not forget about the
  1736  			// ambiguity.
  1737  			// 2) SQL recognizes AmbiguousResultErrors and gives them a special code
  1738  			// (StatementCompletionUnknown).
  1739  			// TODO(andrei): The use of this code is inconsistent because a) the
  1740  			// DistSender tries to only return the code for commits, but it'll happily
  1741  			// forward along AmbiguousResultErrors coming from the replica and b) we
  1742  			// probably should be returning that code for non-commit statements too.
  1743  			//
  1744  			// We retry requests in order to avoid returning errors (in particular,
  1745  			// AmbiguousResultError). Retrying the batch will either:
  1746  			// a) succeed if the request had not been evaluated the first time.
  1747  			// b) succeed if the request also succeeded the first time, but is
  1748  			//    idempotent (i.e. it is internal to a txn, without a commit in the
  1749  			//    batch).
  1750  			// c) fail if it succeeded the first time and the request is not
  1751  			//    idempotent. In the case of EndTxn requests, this is ensured by the
  1752  			//    tombstone keys in the timestamp cache. The retry failing does not
  1753  			//    prove that the request did not succeed the first time around, so we
  1754  			//    can't claim success (and even if we could claim success, we still
  1755  			//    wouldn't have the complete result of the successful evaluation).
  1756  			//
  1757  			// Case a) is great - the retry made the request succeed. Case b) is also
  1758  			// good; due to idempotency we managed to swallow a communication error.
  1759  			// Case c) is not great - we'll end up returning an error even though the
  1760  			// request might have succeeded (an AmbiguousResultError if withCommit is
  1761  			// set).
  1762  			//
  1763  			// TODO(andrei): Case c) is broken for non-transactional requests: nothing
  1764  			// prevents them from double evaluation. This can result in, for example,
  1765  			// an increment applying twice, or more subtle problems like a blind write
  1766  			// evaluating twice, overwriting another unrelated write that fell
  1767  			// in-between.
  1768  			//
  1769  			if withCommit && !grpcutil.RequestDidNotStart(err) {
  1770  				ambiguousError = err
  1771  			}
  1772  			log.VErrEventf(ctx, 2, "RPC error: %s", err)
  1773  
  1774  			// If the error wasn't just a context cancellation and the down replica
  1775  			// is cached as the lease holder, evict it. The only other eviction
  1776  			// happens below on NotLeaseHolderError, but if the next replica is the
  1777  			// actual lease holder, we're never going to receive one of those and
  1778  			// will thus pay the price of trying the down node first forever.
  1779  			//
  1780  			// NB: we should consider instead adding a successful reply from the next
  1781  			// replica into the cache, but without a leaseholder (and taking into
  1782  			// account that the local node can't be down) it won't take long until we
  1783  			// talk to a replica that tells us who the leaseholder is.
  1784  			if ctx.Err() == nil {
  1785  				if storeID, ok := ds.leaseHolderCache.Lookup(ctx, rangeID); ok && curReplica.StoreID == storeID {
  1786  					ds.leaseHolderCache.Update(ctx, rangeID, 0 /* evict */)
  1787  				}
  1788  			}
  1789  		} else {
  1790  			// NB: This section of code may have unfortunate performance implications. If we
  1791  			// exit the below type switch with propagateError remaining at `false`, we'll try
  1792  			// more replicas. That may succeed and future requests might do the same thing over
  1793  			// and over again, adding needless round-trips to the earlier replicas.
  1794  			propagateError := false
  1795  			switch tErr := br.Error.GetDetail().(type) {
  1796  			case nil:
  1797  				// When a request that we've attempted to route to the leaseholder comes
  1798  				// back as successful, we assume that it must have been served by the
  1799  				// leaseholder and so we update the leaseholder cache. In steady state,
  1800  				// this is almost always the case, and so we gate the update on whether
  1801  				// the response comes from a node that we didn't know held the lease.
  1802  				updateLeaseholderCache :=
  1803  					!leaseholder.routeToFollower &&
  1804  						leaseholder.cachedLeaseholder != curReplica
  1805  				if updateLeaseholderCache {
  1806  					ds.leaseHolderCache.Update(ctx, rangeID, curReplica.StoreID)
  1807  				}
  1808  				return br, nil
  1809  			case *roachpb.StoreNotFoundError, *roachpb.NodeUnavailableError:
  1810  				// These errors are likely to be unique to the replica that reported
  1811  				// them, so no action is required before the next retry.
  1812  			case *roachpb.RangeNotFoundError:
  1813  				// The store we routed to doesn't have this replica. This can happen when
  1814  				// our descriptor is outright outdated, but it can also be caused by a
  1815  				// replica that has just been added but needs a snapshot to be caught up.
  1816  				//
  1817  				// We'll try other replicas which typically gives us the leaseholder, either
  1818  				// via the NotLeaseHolderError or nil error paths, both of which update the
  1819  				// leaseholder cache.
  1820  			case *roachpb.NotLeaseHolderError:
  1821  				ds.metrics.NotLeaseHolderErrCount.Inc(1)
  1822  				if lh := tErr.LeaseHolder; lh != nil {
  1823  					// Update the leaseholder cache. Naively this would also happen when the
  1824  					// next RPC comes back, but we don't want to wait out the additional RPC
  1825  					// latency.
  1826  					ds.leaseHolderCache.Update(ctx, rangeID, lh.StoreID)
  1827  					// Avoid an extra update to the leaseholder cache if the next RPC succeeds.
  1828  					leaseholder.cachedLeaseholder = *lh
  1829  
  1830  					// If the implicated leaseholder is not a known replica, return a SendError
  1831  					// to signal eviction of the cached RangeDescriptor and re-send.
  1832  					if replicas.FindReplica(lh.StoreID) == -1 {
  1833  						br.Error = roachpb.NewError(roachpb.NewSendError(fmt.Sprintf(
  1834  							"leaseholder s%d (via %+v) not in cached replicas %v", lh.StoreID, curReplica, replicas,
  1835  						)))
  1836  						propagateError = true
  1837  					} else {
  1838  						// Move the new lease holder to the head of the queue for the next retry.
  1839  						transport.MoveToFront(*lh)
  1840  					}
  1841  				}
  1842  				if l := tErr.Lease; !propagateError && l != nil {
  1843  					// Check whether we've seen this lease or a prior lease before and
  1844  					// backoff if so or update maxSeenLeaseSequence if not.
  1845  					if l.Sequence > maxSeenLeaseSequence {
  1846  						maxSeenLeaseSequence = l.Sequence
  1847  						inTransferRetry.Reset() // The following Next call will not block.
  1848  					} else {
  1849  						ds.metrics.InLeaseTransferBackoffs.Inc(1)
  1850  						log.VErrEventf(ctx, 2, "backing off due to NotLeaseHolderErr at "+
  1851  							"LeaseSequence %d <= %d", l.Sequence, maxSeenLeaseSequence)
  1852  					}
  1853  					inTransferRetry.Next()
  1854  				}
  1855  			default:
  1856  				propagateError = true
  1857  			}
  1858  
  1859  			if propagateError {
  1860  				if ambiguousError != nil {
  1861  					return nil, roachpb.NewAmbiguousResultError(fmt.Sprintf("error=%s [propagate]", ambiguousError))
  1862  				}
  1863  
  1864  				// The error received is likely not specific to this
  1865  				// replica, so we should return it instead of trying other
  1866  				// replicas.
  1867  				return br, nil
  1868  			}
  1869  
  1870  			log.VErrEventf(ctx, 1, "application error: %s", br.Error)
  1871  		}
  1872  
  1873  		// Has the caller given up?
  1874  		if ctx.Err() != nil {
  1875  			reportedErr := errors.Wrap(ctx.Err(), "context done during DistSender.Send")
  1876  			log.Eventf(ctx, "%v", reportedErr)
  1877  			if ambiguousError != nil {
  1878  				return nil, roachpb.NewAmbiguousResultError(reportedErr.Error())
  1879  			}
  1880  			// Don't consider this a SendError, because SendErrors indicate that we
  1881  			// were unable to reach a replica that could serve the request, and they
  1882  			// cause range cache evictions. Context cancellations just mean the
  1883  			// sender changed its mind or the request timed out.
  1884  			return nil, errors.Wrap(ctx.Err(), "aborted during DistSender.Send")
  1885  		}
  1886  
  1887  		if transport.IsExhausted() {
  1888  			if ambiguousError != nil {
  1889  				return nil, roachpb.NewAmbiguousResultError(fmt.Sprintf("error=%s [exhausted]", ambiguousError))
  1890  			}
  1891  
  1892  			// TODO(bdarnell): The last error is not necessarily the best
  1893  			// one to return; we may want to remember the "best" error
  1894  			// we've seen (for example, a NotLeaseHolderError conveys more
  1895  			// information than a RangeNotFound).
  1896  			return nil, roachpb.NewSendError(
  1897  				fmt.Sprintf("sending to all %d replicas failed; last error: %v %v", len(replicas), br, err),
  1898  			)
  1899  		}
  1900  
  1901  		ds.metrics.NextReplicaErrCount.Inc(1)
  1902  		curReplica = transport.NextReplica()
  1903  		log.VEventf(ctx, 2, "error: %v %v; trying next peer %s", br, err, curReplica.String())
  1904  		br, err = transport.SendNext(ctx, ba)
  1905  	}
  1906  }
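
// Illustrative sketch (not part of the original file): the shape of the replica
// loop in sendToReplicas above, with the transport reduced to a slice cursor.
// Replica-local failures advance to the next replica; once every replica has
// been tried, a terminal "send failed" error is returned, upgraded to an
// ambiguous error if a commit may have been applied. All names are
// hypothetical stand-ins for the real transport and error types.
func exampleTryReplicas(
	ctx context.Context,
	replicas []string,
	withCommit bool,
	exampleSendTo func(context.Context, string) (ok, maybeApplied bool),
) error {
	var ambiguous bool
	for _, r := range replicas {
		ok, maybeApplied := exampleSendTo(ctx, r)
		if ok {
			return nil // success: stop trying further replicas
		}
		if withCommit && maybeApplied {
			// Like sendToReplicas, remember the ambiguity rather than losing it
			// on the next attempt.
			ambiguous = true
		}
		if err := ctx.Err(); err != nil {
			return err // the caller gave up; not a send error
		}
	}
	if ambiguous {
		return fmt.Errorf("result is ambiguous: sending to all %d replicas failed", len(replicas))
	}
	return fmt.Errorf("sending to all %d replicas failed", len(replicas))
}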