github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvclient/kvcoord/dist_sender.go (about) 1 // Copyright 2014 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package kvcoord 12 13 import ( 14 "context" 15 "fmt" 16 "runtime" 17 "sync/atomic" 18 "time" 19 "unsafe" 20 21 "github.com/cockroachdb/cockroach/pkg/base" 22 "github.com/cockroachdb/cockroach/pkg/gossip" 23 "github.com/cockroachdb/cockroach/pkg/keys" 24 "github.com/cockroachdb/cockroach/pkg/kv" 25 "github.com/cockroachdb/cockroach/pkg/roachpb" 26 "github.com/cockroachdb/cockroach/pkg/rpc" 27 "github.com/cockroachdb/cockroach/pkg/rpc/nodedialer" 28 "github.com/cockroachdb/cockroach/pkg/settings" 29 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 30 "github.com/cockroachdb/cockroach/pkg/util/grpcutil" 31 "github.com/cockroachdb/cockroach/pkg/util/hlc" 32 "github.com/cockroachdb/cockroach/pkg/util/log" 33 "github.com/cockroachdb/cockroach/pkg/util/metric" 34 "github.com/cockroachdb/cockroach/pkg/util/quotapool" 35 "github.com/cockroachdb/cockroach/pkg/util/retry" 36 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 37 "github.com/cockroachdb/cockroach/pkg/util/tracing" 38 "github.com/cockroachdb/cockroach/pkg/util/uuid" 39 "github.com/cockroachdb/errors" 40 ) 41 42 var ( 43 metaDistSenderBatchCount = metric.Metadata{ 44 Name: "distsender.batches", 45 Help: "Number of batches processed", 46 Measurement: "Batches", 47 Unit: metric.Unit_COUNT, 48 } 49 metaDistSenderPartialBatchCount = metric.Metadata{ 50 Name: "distsender.batches.partial", 51 Help: "Number of partial batches processed after being divided on range boundaries", 52 Measurement: "Partial Batches", 53 Unit: metric.Unit_COUNT, 54 } 55 metaDistSenderAsyncSentCount = metric.Metadata{ 56 Name: "distsender.batches.async.sent", 57 Help: "Number of partial batches sent asynchronously", 58 Measurement: "Partial Batches", 59 Unit: metric.Unit_COUNT, 60 } 61 metaDistSenderAsyncThrottledCount = metric.Metadata{ 62 Name: "distsender.batches.async.throttled", 63 Help: "Number of partial batches not sent asynchronously due to throttling", 64 Measurement: "Partial Batches", 65 Unit: metric.Unit_COUNT, 66 } 67 metaTransportSentCount = metric.Metadata{ 68 Name: "distsender.rpc.sent", 69 Help: "Number of RPCs sent", 70 Measurement: "RPCs", 71 Unit: metric.Unit_COUNT, 72 } 73 metaTransportLocalSentCount = metric.Metadata{ 74 Name: "distsender.rpc.sent.local", 75 Help: "Number of local RPCs sent", 76 Measurement: "RPCs", 77 Unit: metric.Unit_COUNT, 78 } 79 metaTransportSenderNextReplicaErrCount = metric.Metadata{ 80 Name: "distsender.rpc.sent.nextreplicaerror", 81 Help: "Number of RPCs sent due to per-replica errors", 82 Measurement: "RPCs", 83 Unit: metric.Unit_COUNT, 84 } 85 metaDistSenderNotLeaseHolderErrCount = metric.Metadata{ 86 Name: "distsender.errors.notleaseholder", 87 Help: "Number of NotLeaseHolderErrors encountered", 88 Measurement: "Errors", 89 Unit: metric.Unit_COUNT, 90 } 91 metaDistSenderInLeaseTransferBackoffsCount = metric.Metadata{ 92 Name: "distsender.errors.inleasetransferbackoffs", 93 Help: "Number of times backed off due to NotLeaseHolderErrors during lease transfer.", 94 Measurement: "Errors", 95 Unit: 
metric.Unit_COUNT, 96 } 97 metaDistSenderRangeLookups = metric.Metadata{ 98 Name: "distsender.rangelookups", 99 Help: "Number of range lookups.", 100 Measurement: "Range Lookups", 101 Unit: metric.Unit_COUNT, 102 } 103 metaDistSenderSlowRPCs = metric.Metadata{ 104 Name: "requests.slow.distsender", 105 Help: "Number of RPCs stuck or retrying for a long time", 106 Measurement: "Requests", 107 Unit: metric.Unit_COUNT, 108 } 109 ) 110 111 // CanSendToFollower is used by the DistSender to determine if it needs to look 112 // up the current lease holder for a request. It is used by the 113 // followerreadsccl code to inject logic to check if follower reads are enabled. 114 // By default, without CCL code, this function returns false. 115 var CanSendToFollower = func( 116 clusterID uuid.UUID, st *cluster.Settings, ba roachpb.BatchRequest, 117 ) bool { 118 return false 119 } 120 121 const ( 122 // The default limit for asynchronous senders. 123 defaultSenderConcurrency = 500 124 // The maximum number of range descriptors to prefetch during range lookups. 125 rangeLookupPrefetchCount = 8 126 ) 127 128 var rangeDescriptorCacheSize = settings.RegisterIntSetting( 129 "kv.range_descriptor_cache.size", 130 "maximum number of entries in the range descriptor and leaseholder caches", 131 1e6, 132 ) 133 134 var senderConcurrencyLimit = settings.RegisterNonNegativeIntSetting( 135 "kv.dist_sender.concurrency_limit", 136 "maximum number of asynchronous send requests", 137 max(defaultSenderConcurrency, int64(32*runtime.NumCPU())), 138 ) 139 140 func max(a, b int64) int64 { 141 if a > b { 142 return a 143 } 144 return b 145 } 146 147 // DistSenderMetrics is the set of metrics for a given distributed sender. 148 type DistSenderMetrics struct { 149 BatchCount *metric.Counter 150 PartialBatchCount *metric.Counter 151 AsyncSentCount *metric.Counter 152 AsyncThrottledCount *metric.Counter 153 SentCount *metric.Counter 154 LocalSentCount *metric.Counter 155 NextReplicaErrCount *metric.Counter 156 NotLeaseHolderErrCount *metric.Counter 157 InLeaseTransferBackoffs *metric.Counter 158 RangeLookups *metric.Counter 159 SlowRPCs *metric.Gauge 160 } 161 162 func makeDistSenderMetrics() DistSenderMetrics { 163 return DistSenderMetrics{ 164 BatchCount: metric.NewCounter(metaDistSenderBatchCount), 165 PartialBatchCount: metric.NewCounter(metaDistSenderPartialBatchCount), 166 AsyncSentCount: metric.NewCounter(metaDistSenderAsyncSentCount), 167 AsyncThrottledCount: metric.NewCounter(metaDistSenderAsyncThrottledCount), 168 SentCount: metric.NewCounter(metaTransportSentCount), 169 LocalSentCount: metric.NewCounter(metaTransportLocalSentCount), 170 NextReplicaErrCount: metric.NewCounter(metaTransportSenderNextReplicaErrCount), 171 NotLeaseHolderErrCount: metric.NewCounter(metaDistSenderNotLeaseHolderErrCount), 172 InLeaseTransferBackoffs: metric.NewCounter(metaDistSenderInLeaseTransferBackoffsCount), 173 RangeLookups: metric.NewCounter(metaDistSenderRangeLookups), 174 SlowRPCs: metric.NewGauge(metaDistSenderSlowRPCs), 175 } 176 } 177 178 // A firstRangeMissingError indicates that the first range has not yet 179 // been gossiped. This will be the case for a node which hasn't yet 180 // joined the gossip network. 181 type firstRangeMissingError struct{} 182 183 // Error is part of the error interface. 
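// The CanSendToFollower hook above is a plain package-level variable: the CCL
// follower-reads code swaps in a real implementation at init time, while
// non-CCL builds keep the always-false default. A minimal sketch of such an
// override (the canUseFollowerReads helper and its criteria are hypothetical,
// not part of this package):
//
//	func init() {
//		kvcoord.CanSendToFollower = func(
//			clusterID uuid.UUID, st *cluster.Settings, ba roachpb.BatchRequest,
//		) bool {
//			// Only allow read-only batches that the CCL policy deems safe.
//			return ba.IsReadOnly() && canUseFollowerReads(clusterID, st, ba)
//		}
//	}
//
// sendSingleRange consults this hook to decide whether a request must be
// routed to the leaseholder or may be served by the nearest replica.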
184 func (f firstRangeMissingError) Error() string { 185 return "the descriptor for the first range is not available via gossip" 186 } 187 188 // A DistSender provides methods to access Cockroach's monolithic, 189 // distributed key value store. Each method invocation triggers a 190 // lookup or lookups to find replica metadata for implicated key 191 // ranges. RPCs are sent to one or more of the replicas to satisfy 192 // the method invocation. 193 type DistSender struct { 194 log.AmbientContext 195 196 st *cluster.Settings 197 // nodeDescriptor, if set, holds the descriptor of the node the 198 // DistSender lives on. It should be accessed via getNodeDescriptor(), 199 // which tries to obtain the value from the Gossip network if the 200 // descriptor is unknown. 201 nodeDescriptor unsafe.Pointer 202 // clock is used to set time for some calls. E.g. read-only ops 203 // which span ranges and don't require read consistency. 204 clock *hlc.Clock 205 // gossip provides up-to-date information about the start of the 206 // key range, used to find the replica metadata for arbitrary key 207 // ranges. 208 gossip *gossip.Gossip 209 metrics DistSenderMetrics 210 // rangeCache caches replica metadata for key ranges. 211 rangeCache *RangeDescriptorCache 212 // leaseHolderCache caches range lease holders by range ID. 213 leaseHolderCache *LeaseHolderCache 214 transportFactory TransportFactory 215 rpcContext *rpc.Context 216 nodeDialer *nodedialer.Dialer 217 rpcRetryOptions retry.Options 218 asyncSenderSem *quotapool.IntPool 219 // clusterID is used to verify access to enterprise features. 220 // It is copied out of the rpcContext at construction time and used in 221 // testing. 222 clusterID *base.ClusterIDContainer 223 224 // disableFirstRangeUpdates disables updates of the first range via 225 // gossip. Used by tests which want finer control of the contents of the 226 // range cache. 227 disableFirstRangeUpdates int32 228 229 // disableParallelBatches instructs DistSender to never parallelize 230 // the transmission of partial batch requests across ranges. 231 disableParallelBatches bool 232 } 233 234 var _ kv.Sender = &DistSender{} 235 236 // DistSenderConfig holds configuration and auxiliary objects that can be passed 237 // to NewDistSender. 238 type DistSenderConfig struct { 239 AmbientCtx log.AmbientContext 240 241 Settings *cluster.Settings 242 Clock *hlc.Clock 243 RPCRetryOptions *retry.Options 244 // nodeDescriptor, if provided, is used to describe which node the DistSender 245 // lives on, for instance when deciding where to send RPCs. 246 // Usually it is filled in from the Gossip network on demand. 247 nodeDescriptor *roachpb.NodeDescriptor 248 RPCContext *rpc.Context 249 RangeDescriptorDB RangeDescriptorDB 250 251 NodeDialer *nodedialer.Dialer 252 253 TestingKnobs ClientTestingKnobs 254 } 255 256 // NewDistSender returns a batch.Sender instance which connects to the 257 // Cockroach cluster via the supplied gossip instance. Supplying a 258 // DistSenderContext or the fields within is optional. For omitted values, sane 259 // defaults will be used. 
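// A minimal construction sketch (all values are placeholders; per the
// constructor below, AmbientCtx must carry a Tracer and RPCContext must be
// non-nil, while omitted fields fall back to the defaults described above):
//
//	cfg := DistSenderConfig{
//		AmbientCtx: log.AmbientContext{Tracer: tracing.NewTracer()},
//		Settings:   cluster.MakeTestingClusterSettings(),
//		Clock:      clock,      // *hlc.Clock, used e.g. for inconsistent reads
//		RPCContext: rpcContext, // supplies the Stopper and ClusterID
//		NodeDialer: nodeDialer,
//	}
//	ds := NewDistSender(cfg, gossipInstance)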
260 func NewDistSender(cfg DistSenderConfig, g *gossip.Gossip) *DistSender { 261 ds := &DistSender{ 262 st: cfg.Settings, 263 clock: cfg.Clock, 264 gossip: g, 265 metrics: makeDistSenderMetrics(), 266 nodeDialer: cfg.NodeDialer, 267 } 268 if ds.st == nil { 269 ds.st = cluster.MakeTestingClusterSettings() 270 } 271 272 ds.AmbientContext = cfg.AmbientCtx 273 if ds.AmbientContext.Tracer == nil { 274 panic("no tracer set in AmbientCtx") 275 } 276 277 if cfg.nodeDescriptor != nil { 278 atomic.StorePointer(&ds.nodeDescriptor, unsafe.Pointer(cfg.nodeDescriptor)) 279 } 280 rdb := cfg.RangeDescriptorDB 281 if rdb == nil { 282 rdb = ds 283 } 284 getRangeDescCacheSize := func() int64 { 285 return rangeDescriptorCacheSize.Get(&ds.st.SV) 286 } 287 ds.rangeCache = NewRangeDescriptorCache(ds.st, rdb, getRangeDescCacheSize, cfg.RPCContext.Stopper) 288 ds.leaseHolderCache = NewLeaseHolderCache(getRangeDescCacheSize) 289 if tf := cfg.TestingKnobs.TransportFactory; tf != nil { 290 ds.transportFactory = tf 291 } else { 292 ds.transportFactory = GRPCTransportFactory 293 } 294 ds.rpcRetryOptions = base.DefaultRetryOptions() 295 if cfg.RPCRetryOptions != nil { 296 ds.rpcRetryOptions = *cfg.RPCRetryOptions 297 } 298 if cfg.RPCContext == nil { 299 panic("no RPCContext set in DistSenderConfig") 300 } 301 ds.rpcContext = cfg.RPCContext 302 if ds.rpcRetryOptions.Closer == nil { 303 ds.rpcRetryOptions.Closer = ds.rpcContext.Stopper.ShouldQuiesce() 304 } 305 ds.clusterID = &cfg.RPCContext.ClusterID 306 ds.nodeDialer = cfg.NodeDialer 307 ds.asyncSenderSem = quotapool.NewIntPool("DistSender async concurrency", 308 uint64(senderConcurrencyLimit.Get(&cfg.Settings.SV))) 309 senderConcurrencyLimit.SetOnChange(&cfg.Settings.SV, func() { 310 ds.asyncSenderSem.UpdateCapacity(uint64(senderConcurrencyLimit.Get(&cfg.Settings.SV))) 311 }) 312 ds.rpcContext.Stopper.AddCloser(ds.asyncSenderSem.Closer("stopper")) 313 314 if g != nil { 315 ctx := ds.AnnotateCtx(context.Background()) 316 g.RegisterCallback(gossip.KeyFirstRangeDescriptor, 317 func(_ string, value roachpb.Value) { 318 if atomic.LoadInt32(&ds.disableFirstRangeUpdates) == 1 { 319 return 320 } 321 if log.V(1) { 322 var desc roachpb.RangeDescriptor 323 if err := value.GetProto(&desc); err != nil { 324 log.Errorf(ctx, "unable to parse gossiped first range descriptor: %s", err) 325 } else { 326 log.Infof(ctx, "gossiped first range descriptor: %+v", desc.Replicas()) 327 } 328 } 329 ds.rangeCache.EvictByKey(ctx, roachpb.RKeyMin) 330 }) 331 } 332 return ds 333 } 334 335 // DisableFirstRangeUpdates disables updates of the first range via 336 // gossip. Used by tests which want finer control of the contents of the range 337 // cache. 338 func (ds *DistSender) DisableFirstRangeUpdates() { 339 atomic.StoreInt32(&ds.disableFirstRangeUpdates, 1) 340 } 341 342 // DisableParallelBatches instructs DistSender to never parallelize the 343 // transmission of partial batch requests across ranges. 344 func (ds *DistSender) DisableParallelBatches() { 345 ds.disableParallelBatches = true 346 } 347 348 // Metrics returns a struct which contains metrics related to the distributed 349 // sender's activity. 350 func (ds *DistSender) Metrics() DistSenderMetrics { 351 return ds.metrics 352 } 353 354 // RangeDescriptorCache gives access to the DistSender's range cache. 355 func (ds *DistSender) RangeDescriptorCache() *RangeDescriptorCache { 356 return ds.rangeCache 357 } 358 359 // LeaseHolderCache gives access to the DistSender's lease cache. 
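// The leaseholder cache maps a RangeID to the StoreID of the last known lease
// holder; sendSingleRange below consults it to move that replica to the front
// of the routing order. A small sketch of a direct lookup (rangeID is a
// placeholder):
//
//	if storeID, ok := ds.LeaseHolderCache().Lookup(ctx, rangeID); ok {
//		// storeID is where leaseholder traffic for this range would be
//		// routed right now.
//		_ = storeID
//	}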
func (ds *DistSender) LeaseHolderCache() *LeaseHolderCache {
	return ds.leaseHolderCache
}

// RangeLookup implements the RangeDescriptorDB interface. It uses LookupRange
// to perform a lookup scan for the provided key, using DistSender itself as the
// client.Sender. This means that the scan will recurse into DistSender, which
// will in turn use the RangeDescriptorCache again to look up the RangeDescriptor
// necessary to perform the scan.
func (ds *DistSender) RangeLookup(
	ctx context.Context, key roachpb.RKey, useReverseScan bool,
) ([]roachpb.RangeDescriptor, []roachpb.RangeDescriptor, error) {
	ds.metrics.RangeLookups.Inc(1)
	// We perform the range lookup scan with a READ_UNCOMMITTED consistency
	// level because we want the scan to return intents as well as committed
	// values. The reason for this is because it's not clear whether the intent
	// or the previous value points to the correct location of the Range. It
	// gets even more complicated when there are split-related intents or a txn
	// record co-located with a replica involved in the split. Since we cannot
	// know the correct answer, we look up both the pre- and post- transaction
	// values.
	rc := roachpb.READ_UNCOMMITTED
	// By using DistSender as the sender, we guarantee that even if the desired
	// RangeDescriptor is not on the first range we send the lookup to, we'll
	// still find it when we scan to the next range. This addresses the issue
	// described in #18032 and #16266, allowing us to support meta2 splits.
	return kv.RangeLookup(ctx, ds, key.AsRawKey(), rc, rangeLookupPrefetchCount, useReverseScan)
}

// FirstRange implements the RangeDescriptorDB interface.
// FirstRange returns the RangeDescriptor for the first range on the cluster,
// which is retrieved from the gossip protocol instead of the datastore.
func (ds *DistSender) FirstRange() (*roachpb.RangeDescriptor, error) {
	if ds.gossip == nil {
		panic("with `nil` Gossip, DistSender must not use itself as rangeDescriptorDB")
	}
	rangeDesc := &roachpb.RangeDescriptor{}
	if err := ds.gossip.GetInfoProto(gossip.KeyFirstRangeDescriptor, rangeDesc); err != nil {
		return nil, firstRangeMissingError{}
	}
	return rangeDesc, nil
}

// getNodeDescriptor returns ds.nodeDescriptor, but makes an attempt to load
// it from the Gossip network if a nil value is found.
// We must jump through hoops here to get the node descriptor because it's not available
// until after the node has joined the gossip network and been allowed to initialize
// its stores.
func (ds *DistSender) getNodeDescriptor() *roachpb.NodeDescriptor {
	if desc := atomic.LoadPointer(&ds.nodeDescriptor); desc != nil {
		return (*roachpb.NodeDescriptor)(desc)
	}
	if ds.gossip == nil {
		return nil
	}

	ownNodeID := ds.gossip.NodeID.Get()
	if ownNodeID > 0 {
		// TODO(tschottdorf): Consider instead adding the NodeID of the
		// coordinator to the header, so we can get this from incoming
		// requests. Just in case we want to mostly eliminate gossip here.
421 nodeDesc := &roachpb.NodeDescriptor{} 422 if err := ds.gossip.GetInfoProto(gossip.MakeNodeIDKey(ownNodeID), nodeDesc); err == nil { 423 atomic.StorePointer(&ds.nodeDescriptor, unsafe.Pointer(nodeDesc)) 424 return nodeDesc 425 } 426 } 427 if log.V(1) { 428 ctx := ds.AnnotateCtx(context.TODO()) 429 log.Infof(ctx, "unable to determine this node's attributes for replica "+ 430 "selection; node is most likely bootstrapping") 431 } 432 return nil 433 } 434 435 // sendRPC sends one or more RPCs to replicas from the supplied 436 // roachpb.Replica slice. Returns an RPC error if the request could 437 // not be sent. Note that the reply may contain a higher level error 438 // and must be checked in addition to the RPC error. 439 // 440 // The replicas are assumed to be ordered by preference, with closer 441 // ones (i.e. expected lowest latency) first. 442 // 443 // See sendToReplicas for a description of the withCommit parameter. 444 func (ds *DistSender) sendRPC( 445 ctx context.Context, 446 ba roachpb.BatchRequest, 447 class rpc.ConnectionClass, 448 rangeID roachpb.RangeID, 449 replicas ReplicaSlice, 450 li leaseholderInfo, 451 withCommit bool, 452 ) (*roachpb.BatchResponse, error) { 453 if len(replicas) == 0 { 454 return nil, roachpb.NewSendError( 455 fmt.Sprintf("no replica node addresses available via gossip for r%d", rangeID)) 456 } 457 458 ba.RangeID = rangeID 459 460 tracing.AnnotateTrace() 461 defer tracing.AnnotateTrace() 462 463 return ds.sendToReplicas( 464 ctx, 465 ba, 466 SendOptions{ 467 class: class, 468 metrics: &ds.metrics, 469 }, 470 rangeID, 471 replicas, 472 ds.nodeDialer, 473 li, 474 withCommit, 475 ) 476 } 477 478 // CountRanges returns the number of ranges that encompass the given key span. 479 func (ds *DistSender) CountRanges(ctx context.Context, rs roachpb.RSpan) (int64, error) { 480 var count int64 481 ri := NewRangeIterator(ds) 482 for ri.Seek(ctx, rs.Key, Ascending); ri.Valid(); ri.Next(ctx) { 483 count++ 484 if !ri.NeedAnother(rs) { 485 break 486 } 487 } 488 return count, ri.Error() 489 } 490 491 // getDescriptor looks up the range descriptor to use for a query of 492 // the key descKey with the given options. The lookup takes into 493 // consideration the last range descriptor that the caller had used 494 // for this key span, if any, and if the last range descriptor has 495 // been evicted because it was found to be stale, which is all managed 496 // through the EvictionToken. The function should be provided with an 497 // EvictionToken if one was acquired from this function on a previous 498 // call. If not, an empty EvictionToken can be provided. 499 // 500 // The range descriptor which contains the range in which the request should 501 // start its query is returned first. Next returned is an EvictionToken. In 502 // case the descriptor is discovered stale, the returned EvictionToken's evict 503 // method should be called; it evicts the cache appropriately. 504 // 505 // If useReverseScan is set and descKey is the boundary between the two ranges, 506 // the left range will be returned (even though descKey is actually contained on 507 // the right range). This is useful for ReverseScans, which call this method 508 // with their exclusive EndKey. 
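// The intended calling pattern is a lookup/evict loop: start with an empty
// token, and when the returned descriptor turns out to be stale (for example,
// the send fails on every listed replica), evict it and look it up again. A
// condensed sketch of that protocol, mirroring the retry loop in
// sendPartialBatch below (sendSucceeded is a placeholder for the actual send):
//
//	var evictToken *EvictionToken
//	for {
//		desc, tok, err := ds.getDescriptor(ctx, key, evictToken, false /* useReverseScan */)
//		if err != nil {
//			return err
//		}
//		evictToken = tok
//		if sendSucceeded(desc) {
//			return nil
//		}
//		evictToken.Evict(ctx) // stale descriptor: drop it and re-lookup
//	}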
509 func (ds *DistSender) getDescriptor( 510 ctx context.Context, descKey roachpb.RKey, evictToken *EvictionToken, useReverseScan bool, 511 ) (*roachpb.RangeDescriptor, *EvictionToken, error) { 512 desc, returnToken, err := ds.rangeCache.LookupRangeDescriptorWithEvictionToken( 513 ctx, descKey, evictToken, useReverseScan, 514 ) 515 if err != nil { 516 return nil, returnToken, err 517 } 518 519 // Sanity check: the descriptor we're about to return must include the key 520 // we're interested in. 521 { 522 containsFn := (*roachpb.RangeDescriptor).ContainsKey 523 if useReverseScan { 524 containsFn = (*roachpb.RangeDescriptor).ContainsKeyInverted 525 } 526 if !containsFn(desc, descKey) { 527 log.Fatalf(ctx, "programming error: range resolution returning non-matching descriptor: "+ 528 "desc: %s, key: %s, reverse: %t", desc, descKey, log.Safe(useReverseScan)) 529 } 530 } 531 532 return desc, returnToken, nil 533 } 534 535 // sendSingleRange gathers and rearranges the replicas, and makes an RPC call. 536 func (ds *DistSender) sendSingleRange( 537 ctx context.Context, ba roachpb.BatchRequest, desc *roachpb.RangeDescriptor, withCommit bool, 538 ) (*roachpb.BatchResponse, *roachpb.Error) { 539 // Try to send the call. Learner replicas won't serve reads/writes, so send 540 // only to the `Voters` replicas. This is just an optimization to save a 541 // network hop, everything would still work if we had `All` here. 542 replicas := NewReplicaSlice(ds.gossip, desc.Replicas().Voters()) 543 544 // Rearrange the replicas so that they're ordered in expectation of 545 // request latency. 546 replicas.OptimizeReplicaOrder(ds.getNodeDescriptor(), ds.rpcContext.RemoteClocks.Latency) 547 548 var cachedLeaseHolder roachpb.ReplicaDescriptor 549 if storeID, ok := ds.leaseHolderCache.Lookup(ctx, desc.RangeID); ok { 550 if i := replicas.FindReplica(storeID); i >= 0 { 551 cachedLeaseHolder = replicas[i].ReplicaDescriptor 552 } 553 } 554 canFollowerRead := ds.clusterID != nil && 555 CanSendToFollower(ds.clusterID.Get(), ds.st, ba) 556 // If this request needs to go to a lease holder and we know who that is, move 557 // it to the front. 558 sendToLeaseholder := 559 cachedLeaseHolder != (roachpb.ReplicaDescriptor{}) && 560 !canFollowerRead && 561 ba.RequiresLeaseHolder() 562 if sendToLeaseholder { 563 if i := replicas.FindReplica(cachedLeaseHolder.StoreID); i >= 0 { 564 replicas.MoveToFront(i) 565 } 566 } 567 li := leaseholderInfo{ 568 routeToFollower: canFollowerRead || !ba.RequiresLeaseHolder(), 569 cachedLeaseholder: cachedLeaseHolder, 570 } 571 572 class := rpc.ConnectionClassForKey(desc.RSpan().Key) 573 br, err := ds.sendRPC(ctx, ba, class, desc.RangeID, replicas, li, withCommit) 574 if err != nil { 575 log.VErrEventf(ctx, 2, "%v", err) 576 return nil, roachpb.NewError(err) 577 } 578 579 // If the reply contains a timestamp, update the local HLC with it. 580 if br.Error != nil && br.Error.Now != (hlc.Timestamp{}) { 581 ds.clock.Update(br.Error.Now) 582 } else if br.Now != (hlc.Timestamp{}) { 583 ds.clock.Update(br.Now) 584 } 585 586 // Untangle the error from the received response. 587 pErr := br.Error 588 br.Error = nil // scrub the response error 589 return br, pErr 590 } 591 592 // initAndVerifyBatch initializes timestamp-related information and 593 // verifies batch constraints before splitting. 594 func (ds *DistSender) initAndVerifyBatch( 595 ctx context.Context, ba *roachpb.BatchRequest, 596 ) *roachpb.Error { 597 // Attach the local node ID to each request. 
	if ba.Header.GatewayNodeID == 0 && ds.gossip != nil {
		ba.Header.GatewayNodeID = ds.gossip.NodeID.Get()
	}

	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	if ba.ReadConsistency != roachpb.CONSISTENT && ba.Timestamp == (hlc.Timestamp{}) {
		ba.Timestamp = ds.clock.Now()
	}

	if len(ba.Requests) < 1 {
		return roachpb.NewErrorf("empty batch")
	}

	if ba.MaxSpanRequestKeys != 0 || ba.TargetBytes != 0 {
		// Verify that the batch contains only specific range requests or the
		// EndTxnRequest. Verify that a batch with a ReverseScan only contains
		// ReverseScan range requests.
		isReverse := ba.IsReverse()
		for _, req := range ba.Requests {
			inner := req.GetInner()
			switch inner.(type) {
			case *roachpb.ScanRequest, *roachpb.ResolveIntentRangeRequest,
				*roachpb.DeleteRangeRequest, *roachpb.RevertRangeRequest:
				// Accepted forward range requests.
				if isReverse {
					return roachpb.NewErrorf("batch with limit contains both forward and reverse scans")
				}

			case *roachpb.ReverseScanRequest:
				// Accepted reverse range requests.

			case *roachpb.QueryIntentRequest, *roachpb.EndTxnRequest:
				// Accepted point requests that can be in batches with limit.

			default:
				return roachpb.NewErrorf("batch with limit contains %T request", inner)
			}
		}
	}

	return nil
}

// errNo1PCTxn indicates that a batch cannot be sent as a 1 phase
// commit because it spans multiple ranges and must be split into at
// least two parts, with the final part containing the EndTxn
// request.
var errNo1PCTxn = roachpb.NewErrorf("cannot send 1PC txn to multiple ranges")

// splitBatchAndCheckForRefreshSpans splits the batch according to the
// canSplitET parameter and checks whether the batch can forward its
// read timestamp. If the batch has its CanForwardReadTimestamp flag
// set but is being split across multiple sub-batches then the flag in
// the batch header is unset.
func splitBatchAndCheckForRefreshSpans(
	ba *roachpb.BatchRequest, canSplitET bool,
) [][]roachpb.RequestUnion {
	parts := ba.Split(canSplitET)

	// If the batch is split and the header has its CanForwardReadTimestamp flag
	// set then we must check whether any request would need to be refreshed in
	// the event that one of the partial batches was to forward its read
	// timestamp during a server-side refresh. If any such request exists then
	// we unset the CanForwardReadTimestamp flag.
	if len(parts) > 1 && ba.CanForwardReadTimestamp {
		hasRefreshSpans := func() bool {
			for _, part := range parts {
				for _, req := range part {
					if roachpb.NeedsRefresh(req.GetInner()) {
						return true
					}
				}
			}
			return false
		}()
		if hasRefreshSpans {
			ba.CanForwardReadTimestamp = false

			// If the final part contains an EndTxn request, unset its
			// CanCommitAtHigherTimestamp flag as well.
			lastPart := parts[len(parts)-1]
			if et := lastPart[len(lastPart)-1].GetEndTxn(); et != nil {
				etCopy := *et
				etCopy.CanCommitAtHigherTimestamp = false
				lastPart = append([]roachpb.RequestUnion(nil), lastPart...)
				lastPart[len(lastPart)-1].MustSetInner(&etCopy)
				parts[len(parts)-1] = lastPart
			}
		}
	}
	return parts
}

// Send implements the batch.Sender interface.
It subdivides the Batch 693 // into batches admissible for sending (preventing certain illegal 694 // mixtures of requests), executes each individual part (which may 695 // span multiple ranges), and recombines the response. 696 // 697 // When the request spans ranges, it is split by range and a partial 698 // subset of the batch request is sent to affected ranges in parallel. 699 func (ds *DistSender) Send( 700 ctx context.Context, ba roachpb.BatchRequest, 701 ) (*roachpb.BatchResponse, *roachpb.Error) { 702 ds.metrics.BatchCount.Inc(1) 703 704 tracing.AnnotateTrace() 705 706 // TODO(nvanbenschoten): This causes ba to escape to the heap. Either 707 // commit to passing BatchRequests by reference or return an updated 708 // value from this method instead. 709 if pErr := ds.initAndVerifyBatch(ctx, &ba); pErr != nil { 710 return nil, pErr 711 } 712 713 ctx = ds.AnnotateCtx(ctx) 714 ctx, sp := tracing.EnsureChildSpan(ctx, ds.AmbientContext.Tracer, "dist sender send") 715 defer sp.Finish() 716 717 var rplChunks []*roachpb.BatchResponse 718 splitET := false 719 var require1PC bool 720 lastReq := ba.Requests[len(ba.Requests)-1].GetInner() 721 if et, ok := lastReq.(*roachpb.EndTxnRequest); ok && et.Require1PC { 722 require1PC = true 723 } 724 // To ensure that we lay down intents to prevent starvation, always 725 // split the end transaction request into its own batch on retries. 726 // Txns requiring 1PC are an exception and should never be split. 727 if ba.Txn != nil && ba.Txn.Epoch > 0 && !require1PC { 728 splitET = true 729 } 730 parts := splitBatchAndCheckForRefreshSpans(&ba, splitET) 731 if len(parts) > 1 && (ba.MaxSpanRequestKeys != 0 || ba.TargetBytes != 0) { 732 // We already verified above that the batch contains only scan requests of the same type. 733 // Such a batch should never need splitting. 734 log.Fatalf(ctx, "batch with MaxSpanRequestKeys=%d, TargetBytes=%d needs splitting", 735 log.Safe(ba.MaxSpanRequestKeys), log.Safe(ba.TargetBytes)) 736 } 737 738 errIdxOffset := 0 739 for len(parts) > 0 { 740 part := parts[0] 741 ba.Requests = part 742 // The minimal key range encompassing all requests contained within. 743 // Local addressing has already been resolved. 744 // TODO(tschottdorf): consider rudimentary validation of the batch here 745 // (for example, non-range requests with EndKey, or empty key ranges). 746 rs, err := keys.Range(ba.Requests) 747 if err != nil { 748 return nil, roachpb.NewError(err) 749 } 750 751 // Determine whether this part of the BatchRequest contains a committing 752 // EndTxn request. 753 var withCommit, withParallelCommit bool 754 if etArg, ok := ba.GetArg(roachpb.EndTxn); ok { 755 et := etArg.(*roachpb.EndTxnRequest) 756 withCommit = et.Commit 757 withParallelCommit = et.IsParallelCommit() 758 } 759 760 var rpl *roachpb.BatchResponse 761 var pErr *roachpb.Error 762 if withParallelCommit { 763 rpl, pErr = ds.divideAndSendParallelCommit(ctx, ba, rs, 0 /* batchIdx */) 764 } else { 765 rpl, pErr = ds.divideAndSendBatchToRanges(ctx, ba, rs, withCommit, 0 /* batchIdx */) 766 } 767 768 if pErr == errNo1PCTxn { 769 // If we tried to send a single round-trip EndTxn but it looks like 770 // it's going to hit multiple ranges, split it here and try again. 
771 if len(parts) != 1 { 772 panic("EndTxn not in last chunk of batch") 773 } else if require1PC { 774 log.Fatalf(ctx, "required 1PC transaction cannot be split: %s", ba) 775 } 776 parts = splitBatchAndCheckForRefreshSpans(&ba, true /* split ET */) 777 // Restart transaction of the last chunk as multiple parts with 778 // EndTxn in the last part. 779 continue 780 } 781 if pErr != nil { 782 if pErr.Index != nil && pErr.Index.Index != -1 { 783 pErr.Index.Index += int32(errIdxOffset) 784 } 785 return nil, pErr 786 } 787 788 errIdxOffset += len(ba.Requests) 789 790 // Propagate transaction from last reply to next request. The final 791 // update is taken and put into the response's main header. 792 ba.UpdateTxn(rpl.Txn) 793 rplChunks = append(rplChunks, rpl) 794 parts = parts[1:] 795 } 796 797 var reply *roachpb.BatchResponse 798 if len(rplChunks) > 0 { 799 reply = rplChunks[0] 800 for _, rpl := range rplChunks[1:] { 801 reply.Responses = append(reply.Responses, rpl.Responses...) 802 reply.CollectedSpans = append(reply.CollectedSpans, rpl.CollectedSpans...) 803 } 804 lastHeader := rplChunks[len(rplChunks)-1].BatchResponse_Header 805 lastHeader.CollectedSpans = reply.CollectedSpans 806 reply.BatchResponse_Header = lastHeader 807 } 808 809 return reply, nil 810 } 811 812 type response struct { 813 reply *roachpb.BatchResponse 814 positions []int 815 pErr *roachpb.Error 816 } 817 818 // divideAndSendParallelCommit divides a parallel-committing batch into 819 // sub-batches that can be evaluated in parallel but should not be evaluated 820 // on a Store together. 821 // 822 // The case where this comes up is if the batch is performing a parallel commit 823 // and the transaction has previously pipelined writes that have yet to be 824 // proven successful. In this scenario, the EndTxn request will be preceded by a 825 // series of QueryIntent requests (see txn_pipeliner.go). Before evaluating, 826 // each of these QueryIntent requests will grab latches and wait for their 827 // corresponding write to finish. This is how the QueryIntent requests 828 // synchronize with the write they are trying to verify. 829 // 830 // If these QueryIntents remained in the same batch as the EndTxn request then 831 // they would force the EndTxn request to wait for the previous write before 832 // evaluating itself. This "pipeline stall" would effectively negate the benefit 833 // of the parallel commit. To avoid this, we make sure that these "pre-commit" 834 // QueryIntent requests are split from and issued concurrently with the rest of 835 // the parallel commit batch. 836 // 837 // batchIdx indicates which partial fragment of the larger batch is being 838 // processed by this method. Currently it is always set to zero because this 839 // method is never invoked recursively, but it is exposed to maintain symmetry 840 // with divideAndSendBatchToRanges. 841 func (ds *DistSender) divideAndSendParallelCommit( 842 ctx context.Context, ba roachpb.BatchRequest, rs roachpb.RSpan, batchIdx int, 843 ) (br *roachpb.BatchResponse, pErr *roachpb.Error) { 844 // Search backwards, looking for the first pre-commit QueryIntent. 845 swapIdx := -1 846 lastIdx := len(ba.Requests) - 1 847 for i := lastIdx - 1; i >= 0; i-- { 848 req := ba.Requests[i].GetInner() 849 if req.Method() == roachpb.QueryIntent { 850 swapIdx = i 851 } else { 852 break 853 } 854 } 855 if swapIdx == -1 { 856 // No pre-commit QueryIntents. Nothing to split. 
857 return ds.divideAndSendBatchToRanges(ctx, ba, rs, true /* withCommit */, batchIdx) 858 } 859 860 // Swap the EndTxn request and the first pre-commit QueryIntent. This 861 // effectively creates a split point between the two groups of requests. 862 // 863 // Before: [put qi(1) put del qi(2) qi(3) qi(4) et] 864 // After: [put qi(1) put del et qi(3) qi(4) qi(2)] 865 // Separated: [put qi(1) put del et] [qi(3) qi(4) qi(2)] 866 // 867 // NOTE: the non-pre-commit QueryIntent's must remain where they are in the 868 // batch. These ensure that the transaction always reads its writes (see 869 // txnPipeliner.chainToInFlightWrites). These will introduce pipeline stalls 870 // and undo most of the benefit of this method, but luckily they are rare in 871 // practice. 872 swappedReqs := append([]roachpb.RequestUnion(nil), ba.Requests...) 873 swappedReqs[swapIdx], swappedReqs[lastIdx] = swappedReqs[lastIdx], swappedReqs[swapIdx] 874 875 // Create a new pre-commit QueryIntent-only batch and issue it 876 // in a non-limited async task. This batch may need to be split 877 // over multiple ranges, so call into divideAndSendBatchToRanges. 878 qiBa := ba 879 qiBa.Requests = swappedReqs[swapIdx+1:] 880 qiRS, err := keys.Range(qiBa.Requests) 881 if err != nil { 882 return br, roachpb.NewError(err) 883 } 884 qiBatchIdx := batchIdx + 1 885 qiResponseCh := make(chan response, 1) 886 887 runTask := ds.rpcContext.Stopper.RunAsyncTask 888 if ds.disableParallelBatches { 889 runTask = ds.rpcContext.Stopper.RunTask 890 } 891 if err := runTask(ctx, "kv.DistSender: sending pre-commit query intents", func(ctx context.Context) { 892 // Map response index to the original un-swapped batch index. 893 // Remember that we moved the last QueryIntent in this batch 894 // from swapIdx to the end. 895 // 896 // From the example above: 897 // Before: [put qi(1) put del qi(2) qi(3) qi(4) et] 898 // Separated: [put qi(1) put del et] [qi(3) qi(4) qi(2)] 899 // 900 // qiBa.Requests = [qi(3) qi(4) qi(2)] 901 // swapIdx = 4 902 // positions = [5 6 4] 903 // 904 positions := make([]int, len(qiBa.Requests)) 905 positions[len(positions)-1] = swapIdx 906 for i := range positions[:len(positions)-1] { 907 positions[i] = swapIdx + 1 + i 908 } 909 910 // Send the batch with withCommit=true since it will be inflight 911 // concurrently with the EndTxn batch below. 912 reply, pErr := ds.divideAndSendBatchToRanges(ctx, qiBa, qiRS, true /* withCommit */, qiBatchIdx) 913 qiResponseCh <- response{reply: reply, positions: positions, pErr: pErr} 914 }); err != nil { 915 return nil, roachpb.NewError(err) 916 } 917 918 // Adjust the original batch request to ignore the pre-commit 919 // QueryIntent requests. Make sure to determine the request's 920 // new key span. 921 ba.Requests = swappedReqs[:swapIdx+1] 922 rs, err = keys.Range(ba.Requests) 923 if err != nil { 924 return nil, roachpb.NewError(err) 925 } 926 br, pErr = ds.divideAndSendBatchToRanges(ctx, ba, rs, true /* withCommit */, batchIdx) 927 928 // Wait for the QueryIntent-only batch to complete and stitch 929 // the responses together. 930 qiReply := <-qiResponseCh 931 932 // Handle error conditions. 933 if pErr != nil { 934 // The batch with the EndTxn returned an error. Ignore errors from the 935 // pre-commit QueryIntent requests because that request is read-only and 936 // will produce the same errors next time, if applicable. 
		if qiReply.reply != nil {
			pErr.UpdateTxn(qiReply.reply.Txn)
		}
		maybeSwapErrorIndex(pErr, swapIdx, lastIdx)
		return nil, pErr
	}
	if qiPErr := qiReply.pErr; qiPErr != nil {
		// The batch with the pre-commit QueryIntent requests returned an error.
		ignoreMissing := false
		if _, ok := qiPErr.GetDetail().(*roachpb.IntentMissingError); ok {
			// If the error is an IntentMissingError, detect whether this is due
			// to intent resolution and can be safely ignored.
			ignoreMissing, err = ds.detectIntentMissingDueToIntentResolution(ctx, br.Txn)
			if err != nil {
				return nil, roachpb.NewError(err)
			}
		}
		if !ignoreMissing {
			qiPErr.UpdateTxn(br.Txn)
			maybeSwapErrorIndex(qiPErr, swapIdx, lastIdx)
			return nil, qiPErr
		}
		// Populate the pre-commit QueryIntent batch response. If we made it
		// here then we know we can ignore intent missing errors.
		qiReply.reply = qiBa.CreateReply()
		for _, ru := range qiReply.reply.Responses {
			ru.GetQueryIntent().FoundIntent = true
		}
	}

	// Both halves of the split batch succeeded. Piece them back together.
	resps := make([]roachpb.ResponseUnion, len(swappedReqs))
	copy(resps, br.Responses)
	resps[swapIdx], resps[lastIdx] = resps[lastIdx], resps[swapIdx]
	br.Responses = resps
	if err := br.Combine(qiReply.reply, qiReply.positions); err != nil {
		return nil, roachpb.NewError(err)
	}
	return br, nil
}

// detectIntentMissingDueToIntentResolution attempts to detect whether a missing
// intent error thrown by a pre-commit QueryIntent request was due to intent
// resolution after the transaction was already finalized instead of due to a
// failure of the corresponding pipelined write. It is possible for these two
// situations to be confused because the pre-commit QueryIntent requests are
// issued in parallel with the staging EndTxn request and may evaluate after the
// transaction becomes implicitly committed. If this happens and a concurrent
// transaction observes the implicit commit and makes the commit explicit, it is
// allowed to begin resolving the transaction's intents.
//
// MVCC values don't remember their transaction once they have been resolved.
// This loss of information means that QueryIntent returns an intent missing
// error if it finds the resolved value that corresponds to its desired intent.
// Because of this, the race discussed above can result in intent missing errors
// during a parallel commit even when the transaction successfully committed.
//
// This method queries the transaction record to determine whether an intent
// missing error was caused by this race or whether the intent missing error
// is real and guarantees that the transaction is not implicitly committed.
//
// See #37866 (issue) and #37900 (corresponding tla+ update).
func (ds *DistSender) detectIntentMissingDueToIntentResolution(
	ctx context.Context, txn *roachpb.Transaction,
) (bool, error) {
	ba := roachpb.BatchRequest{}
	ba.Timestamp = ds.clock.Now()
	ba.Add(&roachpb.QueryTxnRequest{
		RequestHeader: roachpb.RequestHeader{
			Key: txn.TxnMeta.Key,
		},
		Txn: txn.TxnMeta,
	})
	log.VEvent(ctx, 1, "detecting whether missing intent is due to intent resolution")
	br, pErr := ds.Send(ctx, ba)
	if pErr != nil {
		// We weren't able to determine whether the intent missing error is
		// due to intent resolution or not, so it is still ambiguous whether
		// the commit succeeded.
		return false, roachpb.NewAmbiguousResultError(fmt.Sprintf("error=%s [intent missing]", pErr))
	}
	respTxn := &br.Responses[0].GetQueryTxn().QueriedTxn
	switch respTxn.Status {
	case roachpb.COMMITTED:
		// The transaction has already been finalized as committed. The missing
		// intent error must have been a result of a concurrent transaction
		// recovery finding the transaction in the implicit commit state and
		// resolving one of its intents before the pre-commit QueryIntent
		// queried that intent. We know that the transaction was committed
		// successfully, so ignore the error.
		return true, nil
	case roachpb.ABORTED:
		// The transaction has either already been finalized as aborted or has
		// been finalized as committed and already had its transaction record
		// GCed. We can't distinguish between these two conditions with full
		// certainty, so we're forced to return an ambiguous commit error.
		// TODO(nvanbenschoten): QueryTxn will materialize an ABORTED transaction
		// record if one does not already exist. If we are certain that no actor
		// will ever persist an ABORTED transaction record after a COMMIT record is
		// GCed and we returned whether the record was synthesized in the QueryTxn
		// response then we could use the existence of an ABORTED transaction record
		// to further isolate the ambiguity caused by the loss of information
		// during intent resolution. If this error becomes a problem, we can explore
		// this option.
		return false, roachpb.NewAmbiguousResultError("intent missing and record aborted")
	default:
		// The transaction has not been finalized yet, so the missing intent
		// error must have been caused by a real missing intent. Propagate the
		// missing intent error.
		// NB: we don't expect the record to be PENDING at this point, but it's
		// not worth making any hard assertions about what we get back here.
		return false, nil
	}
}

// maybeSwapErrorIndex swaps the error index from a to b or b to a if the
// error's index is set and is equal to one of these two values.
func maybeSwapErrorIndex(pErr *roachpb.Error, a, b int) {
	if pErr.Index == nil {
		return
	}
	if pErr.Index.Index == int32(a) {
		pErr.Index.Index = int32(b)
	} else if pErr.Index.Index == int32(b) {
		pErr.Index.Index = int32(a)
	}
}

// mergeErrors merges the two errors, combining their transaction state and
// returning the error with the highest priority.
1067 func mergeErrors(pErr1, pErr2 *roachpb.Error) *roachpb.Error { 1068 ret, drop := pErr1, pErr2 1069 if roachpb.ErrPriority(drop.GoError()) > roachpb.ErrPriority(ret.GoError()) { 1070 ret, drop = drop, ret 1071 } 1072 ret.UpdateTxn(drop.GetTxn()) 1073 return ret 1074 } 1075 1076 // divideAndSendBatchToRanges sends the supplied batch to all of the 1077 // ranges which comprise the span specified by rs. The batch request 1078 // is trimmed against each range which is part of the span and sent 1079 // either serially or in parallel, if possible. 1080 // 1081 // batchIdx indicates which partial fragment of the larger batch is 1082 // being processed by this method. It's specified as non-zero when 1083 // this method is invoked recursively. 1084 // 1085 // withCommit indicates that the batch contains a transaction commit 1086 // or that a transaction commit is being run concurrently with this 1087 // batch. Either way, if this is true then sendToReplicas will need 1088 // to handle errors differently. 1089 func (ds *DistSender) divideAndSendBatchToRanges( 1090 ctx context.Context, ba roachpb.BatchRequest, rs roachpb.RSpan, withCommit bool, batchIdx int, 1091 ) (br *roachpb.BatchResponse, pErr *roachpb.Error) { 1092 // Clone the BatchRequest's transaction so that future mutations to the 1093 // proto don't affect the proto in this batch. 1094 if ba.Txn != nil { 1095 ba.Txn = ba.Txn.Clone() 1096 } 1097 // Get initial seek key depending on direction of iteration. 1098 var scanDir ScanDirection 1099 var seekKey roachpb.RKey 1100 if !ba.IsReverse() { 1101 scanDir = Ascending 1102 seekKey = rs.Key 1103 } else { 1104 scanDir = Descending 1105 seekKey = rs.EndKey 1106 } 1107 ri := NewRangeIterator(ds) 1108 ri.Seek(ctx, seekKey, scanDir) 1109 if !ri.Valid() { 1110 return nil, roachpb.NewError(ri.Error()) 1111 } 1112 // Take the fast path if this batch fits within a single range. 1113 if !ri.NeedAnother(rs) { 1114 resp := ds.sendPartialBatch( 1115 ctx, ba, rs, ri.Desc(), ri.Token(), withCommit, batchIdx, false, /* needsTruncate */ 1116 ) 1117 return resp.reply, resp.pErr 1118 } 1119 1120 // The batch spans ranges (according to our cached range descriptors). 1121 // Verify that this is ok. 1122 // TODO(tschottdorf): we should have a mechanism for discovering range 1123 // merges (descriptor staleness will mostly go unnoticed), or we'll be 1124 // turning single-range queries into multi-range queries for no good 1125 // reason. 1126 if ba.IsUnsplittable() { 1127 mismatch := roachpb.NewRangeKeyMismatchError(rs.Key.AsRawKey(), rs.EndKey.AsRawKey(), ri.Desc()) 1128 return nil, roachpb.NewError(mismatch) 1129 } 1130 // If there's no transaction and ba spans ranges, possibly re-run as part of 1131 // a transaction for consistency. The case where we don't need to re-run is 1132 // if the read consistency is not required. 1133 if ba.Txn == nil && ba.IsTransactional() && ba.ReadConsistency == roachpb.CONSISTENT { 1134 return nil, roachpb.NewError(&roachpb.OpRequiresTxnError{}) 1135 } 1136 // If the batch contains a non-parallel commit EndTxn and spans ranges then 1137 // we want the caller to come again with the EndTxn in a separate 1138 // (non-concurrent) batch. 1139 // 1140 // NB: withCommit allows us to short-circuit the check in the common case, 1141 // but even when that's true, we still need to search for the EndTxn in the 1142 // batch. 
1143 if withCommit { 1144 etArg, ok := ba.GetArg(roachpb.EndTxn) 1145 if ok && !etArg.(*roachpb.EndTxnRequest).IsParallelCommit() { 1146 return nil, errNo1PCTxn 1147 } 1148 } 1149 1150 // Make an empty slice of responses which will be populated with responses 1151 // as they come in via Combine(). 1152 br = &roachpb.BatchResponse{ 1153 Responses: make([]roachpb.ResponseUnion, len(ba.Requests)), 1154 } 1155 // This function builds a channel of responses for each range 1156 // implicated in the span (rs) and combines them into a single 1157 // BatchResponse when finished. 1158 var responseChs []chan response 1159 // couldHaveSkippedResponses is set if a ResumeSpan needs to be sent back. 1160 var couldHaveSkippedResponses bool 1161 // If couldHaveSkippedResponses is set, resumeReason indicates the reason why 1162 // the ResumeSpan is necessary. This reason is common to all individual 1163 // responses that carry a ResumeSpan. 1164 var resumeReason roachpb.ResponseHeader_ResumeReason 1165 defer func() { 1166 if r := recover(); r != nil { 1167 // If we're in the middle of a panic, don't wait on responseChs. 1168 panic(r) 1169 } 1170 // Combine all the responses. 1171 // It's important that we wait for all of them even if an error is caught 1172 // because the client.Sender() contract mandates that we don't "hold on" to 1173 // any part of a request after DistSender.Send() returns. 1174 for _, responseCh := range responseChs { 1175 resp := <-responseCh 1176 if resp.pErr != nil { 1177 if pErr == nil { 1178 pErr = resp.pErr 1179 // Update the error's transaction with any new information from 1180 // the batch response. This may contain interesting updates if 1181 // the batch was parallelized and part of it succeeded. 1182 pErr.UpdateTxn(br.Txn) 1183 } else { 1184 // The batch was split and saw (at least) two different errors. 1185 // Merge their transaction state and determine which to return 1186 // based on their priorities. 1187 pErr = mergeErrors(pErr, resp.pErr) 1188 } 1189 continue 1190 } 1191 1192 // Combine the new response with the existing one (including updating 1193 // the headers) if we haven't yet seen an error. 1194 if pErr == nil { 1195 if err := br.Combine(resp.reply, resp.positions); err != nil { 1196 pErr = roachpb.NewError(err) 1197 } 1198 } else { 1199 // Update the error's transaction with any new information from 1200 // the batch response. This may contain interesting updates if 1201 // the batch was parallelized and part of it succeeded. 1202 pErr.UpdateTxn(resp.reply.Txn) 1203 } 1204 } 1205 1206 if pErr == nil && couldHaveSkippedResponses { 1207 fillSkippedResponses(ba, br, seekKey, resumeReason) 1208 } 1209 }() 1210 1211 canParallelize := ba.Header.MaxSpanRequestKeys == 0 && ba.Header.TargetBytes == 0 1212 if ba.IsSingleCheckConsistencyRequest() { 1213 // Don't parallelize full checksum requests as they have to touch the 1214 // entirety of each replica of each range they touch. 1215 isExpensive := ba.Requests[0].GetCheckConsistency().Mode == roachpb.ChecksumMode_CHECK_FULL 1216 canParallelize = canParallelize && !isExpensive 1217 } 1218 1219 for ; ri.Valid(); ri.Seek(ctx, seekKey, scanDir) { 1220 responseCh := make(chan response, 1) 1221 responseChs = append(responseChs, responseCh) 1222 1223 // Determine next seek key, taking a potentially sparse batch into 1224 // consideration. 1225 var err error 1226 nextRS := rs 1227 if scanDir == Descending { 1228 // In next iteration, query previous range. 
1229 // We use the StartKey of the current descriptor as opposed to the 1230 // EndKey of the previous one since that doesn't have bugs when 1231 // stale descriptors come into play. 1232 seekKey, err = prev(ba, ri.Desc().StartKey) 1233 nextRS.EndKey = seekKey 1234 } else { 1235 // In next iteration, query next range. 1236 // It's important that we use the EndKey of the current descriptor 1237 // as opposed to the StartKey of the next one: if the former is stale, 1238 // it's possible that the next range has since merged the subsequent 1239 // one, and unless both descriptors are stale, the next descriptor's 1240 // StartKey would move us to the beginning of the current range, 1241 // resulting in a duplicate scan. 1242 seekKey, err = next(ba, ri.Desc().EndKey) 1243 nextRS.Key = seekKey 1244 } 1245 if err != nil { 1246 responseCh <- response{pErr: roachpb.NewError(err)} 1247 return 1248 } 1249 1250 lastRange := !ri.NeedAnother(rs) 1251 // Send the next partial batch to the first range in the "rs" span. 1252 // If we can reserve one of the limited goroutines available for parallel 1253 // batch RPCs, send asynchronously. 1254 if canParallelize && !lastRange && !ds.disableParallelBatches && 1255 ds.sendPartialBatchAsync(ctx, ba, rs, ri.Desc(), ri.Token(), withCommit, batchIdx, responseCh) { 1256 // Sent the batch asynchronously. 1257 } else { 1258 resp := ds.sendPartialBatch( 1259 ctx, ba, rs, ri.Desc(), ri.Token(), withCommit, batchIdx, true, /* needsTruncate */ 1260 ) 1261 responseCh <- resp 1262 if resp.pErr != nil { 1263 return 1264 } 1265 // Update the transaction from the response. Note that this wouldn't happen 1266 // on the asynchronous path, but if we have newer information it's good to 1267 // use it. 1268 if !lastRange { 1269 ba.UpdateTxn(resp.reply.Txn) 1270 } 1271 1272 mightStopEarly := ba.MaxSpanRequestKeys > 0 || ba.TargetBytes > 0 1273 // Check whether we've received enough responses to exit query loop. 1274 if mightStopEarly { 1275 var replyResults int64 1276 var replyBytes int64 1277 for _, r := range resp.reply.Responses { 1278 replyResults += r.GetInner().Header().NumKeys 1279 replyBytes += r.GetInner().Header().NumBytes 1280 } 1281 // Update MaxSpanRequestKeys, if applicable. Note that ba might be 1282 // passed recursively to further divideAndSendBatchToRanges() calls. 1283 if ba.MaxSpanRequestKeys > 0 { 1284 if replyResults > ba.MaxSpanRequestKeys { 1285 // NOTE: v19.2 and below have a bug where MaxSpanRequestKeys 1286 // is not respected by ResolveIntentRangeRequest once the 1287 // limit has already been exhausted by the batch. This is 1288 // mostly harmless (or at least, the damage has already been 1289 // done by this point and resulted in a large Raft entry) 1290 // and has been fixed in v20.1+, so don't bother hitting the 1291 // assertion. 1292 // 1293 // TODO(nvanbenschoten): remove this hack in v20.2. 1294 if _, ok := ba.GetArg(roachpb.ResolveIntentRange); ok { 1295 replyResults = ba.MaxSpanRequestKeys 1296 } else { 1297 log.Fatalf(ctx, "received %d results, limit was %d", 1298 replyResults, ba.MaxSpanRequestKeys) 1299 } 1300 } 1301 ba.MaxSpanRequestKeys -= replyResults 1302 // Exiting; any missing responses will be filled in via defer(). 
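					// A worked example of this bookkeeping (numbers are
					// illustrative): a batch arrives with MaxSpanRequestKeys=100
					// and spans three ranges.
					//
					//	limit=100 -> range 1 returns 40 keys -> limit=60
					//	          -> range 2 returns 60 keys -> limit=0, stop with RESUME_KEY_LIMIT
					//	          -> range 3 is never visited; its responses are
					//	             filled in by fillSkippedResponses via the defer above.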
1303 if ba.MaxSpanRequestKeys == 0 { 1304 couldHaveSkippedResponses = true 1305 resumeReason = roachpb.RESUME_KEY_LIMIT 1306 return 1307 } 1308 } 1309 if ba.TargetBytes > 0 { 1310 ba.TargetBytes -= replyBytes 1311 if ba.TargetBytes <= 0 { 1312 couldHaveSkippedResponses = true 1313 resumeReason = roachpb.RESUME_KEY_LIMIT 1314 return 1315 } 1316 } 1317 } 1318 } 1319 1320 // The iteration is complete if the iterator's current range 1321 // encompasses the remaining span, OR if the next span has 1322 // inverted. This can happen if this method is invoked 1323 // re-entrantly due to ranges being split or merged. In that case 1324 // the batch request has all the original requests but the span is 1325 // a sub-span of the original, causing next() and prev() methods 1326 // to potentially return values which invert the span. 1327 if lastRange || !nextRS.Key.Less(nextRS.EndKey) { 1328 return 1329 } 1330 batchIdx++ 1331 rs = nextRS 1332 } 1333 1334 // We've exited early. Return the range iterator error. 1335 responseCh := make(chan response, 1) 1336 responseCh <- response{pErr: roachpb.NewError(ri.Error())} 1337 responseChs = append(responseChs, responseCh) 1338 return 1339 } 1340 1341 // sendPartialBatchAsync sends the partial batch asynchronously if 1342 // there aren't currently more than the allowed number of concurrent 1343 // async requests outstanding. Returns whether the partial batch was 1344 // sent. 1345 func (ds *DistSender) sendPartialBatchAsync( 1346 ctx context.Context, 1347 ba roachpb.BatchRequest, 1348 rs roachpb.RSpan, 1349 desc *roachpb.RangeDescriptor, 1350 evictToken *EvictionToken, 1351 withCommit bool, 1352 batchIdx int, 1353 responseCh chan response, 1354 ) bool { 1355 if err := ds.rpcContext.Stopper.RunLimitedAsyncTask( 1356 ctx, "kv.DistSender: sending partial batch", 1357 ds.asyncSenderSem, false, /* wait */ 1358 func(ctx context.Context) { 1359 ds.metrics.AsyncSentCount.Inc(1) 1360 responseCh <- ds.sendPartialBatch( 1361 ctx, ba, rs, desc, evictToken, withCommit, batchIdx, true, /* needsTruncate */ 1362 ) 1363 }, 1364 ); err != nil { 1365 ds.metrics.AsyncThrottledCount.Inc(1) 1366 return false 1367 } 1368 return true 1369 } 1370 1371 func slowRangeRPCWarningStr( 1372 dur time.Duration, attempts int64, desc *roachpb.RangeDescriptor, pErr *roachpb.Error, 1373 ) string { 1374 return fmt.Sprintf("have been waiting %.2fs (%d attempts) for RPC to %s: %s", dur.Seconds(), attempts, desc, pErr) 1375 } 1376 1377 func slowRangeRPCReturnWarningStr(dur time.Duration, attempts int64) string { 1378 return fmt.Sprintf("slow RPC finished after %.2fs (%d attempts)", dur.Seconds(), attempts) 1379 } 1380 1381 // sendPartialBatch sends the supplied batch to the range specified by 1382 // desc. The batch request is first truncated so that it contains only 1383 // requests which intersect the range descriptor and keys for each 1384 // request are limited to the range's key span. The send occurs in a 1385 // retry loop to handle send failures. On failure to send to any 1386 // replicas, we backoff and retry by refetching the range 1387 // descriptor. If the underlying range seems to have split, we 1388 // recursively invoke divideAndSendBatchToRanges to re-enumerate the 1389 // ranges in the span and resend to each. If needsTruncate is true, 1390 // the supplied batch and span must be truncated to the supplied range 1391 // descriptor. 
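// The truncation step below also yields a "positions" slice mapping each
// request in the truncated sub-batch back to its index in the caller's
// original batch; this is what lets error indexes and combined responses line
// up again. A small illustration (requests and range bounds are hypothetical):
//
//	original batch:  [Put(a) Scan(a-z) Put(m)]
//	range [a,k):     truncated to [Put(a) Scan(a-k)], positions = [0 1]
//	range [k,z):     truncated to [Scan(k-z) Put(m)], positions = [1 2]
//
// A replica error at index 0 of the second sub-batch is therefore re-mapped to
// index 1 (the Scan) of the original batch before being returned.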
1392 func (ds *DistSender) sendPartialBatch( 1393 ctx context.Context, 1394 ba roachpb.BatchRequest, 1395 rs roachpb.RSpan, 1396 desc *roachpb.RangeDescriptor, 1397 evictToken *EvictionToken, 1398 withCommit bool, 1399 batchIdx int, 1400 needsTruncate bool, 1401 ) response { 1402 if batchIdx == 1 { 1403 ds.metrics.PartialBatchCount.Inc(2) // account for first batch 1404 } else if batchIdx > 1 { 1405 ds.metrics.PartialBatchCount.Inc(1) 1406 } 1407 var reply *roachpb.BatchResponse 1408 var pErr *roachpb.Error 1409 var err error 1410 var positions []int 1411 1412 isReverse := ba.IsReverse() 1413 1414 if needsTruncate { 1415 // Truncate the request to range descriptor. 1416 rs, err = rs.Intersect(desc) 1417 if err != nil { 1418 return response{pErr: roachpb.NewError(err)} 1419 } 1420 ba, positions, err = truncate(ba, rs) 1421 if len(positions) == 0 && err == nil { 1422 // This shouldn't happen in the wild, but some tests exercise it. 1423 return response{ 1424 pErr: roachpb.NewErrorf("truncation resulted in empty batch on %s: %s", rs, ba), 1425 } 1426 } 1427 if err != nil { 1428 return response{pErr: roachpb.NewError(err)} 1429 } 1430 } 1431 1432 // Start a retry loop for sending the batch to the range. 1433 tBegin, attempts := timeutil.Now(), int64(0) // for slow log message 1434 for r := retry.StartWithCtx(ctx, ds.rpcRetryOptions); r.Next(); { 1435 attempts++ 1436 // If we've cleared the descriptor on a send failure, re-lookup. 1437 if desc == nil { 1438 var descKey roachpb.RKey 1439 if isReverse { 1440 descKey = rs.EndKey 1441 } else { 1442 descKey = rs.Key 1443 } 1444 desc, evictToken, err = ds.getDescriptor(ctx, descKey, evictToken, isReverse) 1445 if err != nil { 1446 log.VErrEventf(ctx, 1, "range descriptor re-lookup failed: %s", err) 1447 // We set pErr if we encountered an error getting the descriptor in 1448 // order to return the most recent error when we are out of retries. 1449 pErr = roachpb.NewError(err) 1450 continue 1451 } 1452 } 1453 1454 reply, pErr = ds.sendSingleRange(ctx, ba, desc, withCommit) 1455 1456 // If sending succeeded, return immediately. 1457 if pErr == nil { 1458 return response{reply: reply, positions: positions} 1459 } 1460 1461 // Re-map the error index within this partial batch back 1462 // to its position in the encompassing batch. 1463 if pErr.Index != nil && pErr.Index.Index != -1 && positions != nil { 1464 pErr.Index.Index = int32(positions[pErr.Index.Index]) 1465 } 1466 1467 const slowDistSenderThreshold = time.Minute 1468 if dur := timeutil.Since(tBegin); dur > slowDistSenderThreshold && !tBegin.IsZero() { 1469 ds.metrics.SlowRPCs.Inc(1) 1470 dur := dur // leak dur to heap only when branch taken 1471 log.Warningf(ctx, "slow range RPC: %v", 1472 slowRangeRPCWarningStr(dur, attempts, desc, pErr)) 1473 defer func(tBegin time.Time, attempts int64) { 1474 ds.metrics.SlowRPCs.Dec(1) 1475 log.Warningf(ctx, "slow RPC response: %v", 1476 slowRangeRPCReturnWarningStr(timeutil.Since(tBegin), attempts)) 1477 }(tBegin, attempts) 1478 tBegin = time.Time{} // prevent reentering branch for this RPC 1479 } 1480 log.VErrEventf(ctx, 2, "reply error %s: %s", ba, pErr) 1481 1482 // Error handling: If the error indicates that our range 1483 // descriptor is out of date, evict it from the cache and try 1484 // again. Errors that apply only to a single replica were 1485 // handled in send(). 1486 // 1487 // TODO(bdarnell): Don't retry endlessly. If we fail twice in a 1488 // row and the range descriptor hasn't changed, return the error 1489 // to our caller. 
1490 switch tErr := pErr.GetDetail().(type) { 1491 case *roachpb.SendError: 1492 // We've tried all the replicas without success. Either they're all down, 1493 // or we're using an out-of-date range descriptor. Invalidate the cache 1494 // and try again with the new metadata. Re-sending the request is ok even 1495 // though it might have succeeded the first time around because of 1496 // idempotency. 1497 log.VEventf(ctx, 1, "evicting range descriptor on %T and backoff for re-lookup: %+v", tErr, desc) 1498 evictToken.Evict(ctx) 1499 // Clear the descriptor to reload on the next attempt. 1500 desc = nil 1501 continue 1502 case *roachpb.RangeKeyMismatchError: 1503 // Range descriptor might be out of date - evict it. This is 1504 // likely the result of a range split. If we have new range 1505 // descriptors, insert them instead as long as they are different 1506 // from the last descriptor to avoid endless loops. 1507 var replacements []roachpb.RangeDescriptor 1508 different := func(rd *roachpb.RangeDescriptor) bool { 1509 return !desc.RSpan().Equal(rd.RSpan()) 1510 } 1511 if different(&tErr.MismatchedRange) { 1512 replacements = append(replacements, tErr.MismatchedRange) 1513 } 1514 if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) { 1515 if includesFrontOfCurSpan(isReverse, tErr.SuggestedRange, rs) { 1516 replacements = append(replacements, *tErr.SuggestedRange) 1517 } 1518 } 1519 // Same as Evict() if replacements is empty. 1520 evictToken.EvictAndReplace(ctx, replacements...) 1521 // On addressing errors (likely a split), we need to re-invoke 1522 // the range descriptor lookup machinery, so we recurse by 1523 // sending batch to just the partial span this descriptor was 1524 // supposed to cover. Note that for the resending, we use the 1525 // already truncated batch, so that we know that the response 1526 // to it matches the positions into our batch (using the full 1527 // batch here would give a potentially larger response slice 1528 // with unknown mapping to our truncated reply). 1529 log.VEventf(ctx, 1, "likely split; resending batch to span: %s", tErr) 1530 reply, pErr = ds.divideAndSendBatchToRanges(ctx, ba, rs, withCommit, batchIdx) 1531 return response{reply: reply, positions: positions, pErr: pErr} 1532 } 1533 break 1534 } 1535 1536 // Propagate error if either the retry closer or context done 1537 // channels were closed. 1538 if pErr == nil { 1539 if err := ds.deduceRetryEarlyExitError(ctx); err == nil { 1540 log.Fatal(ctx, "exited retry loop without an error") 1541 } else { 1542 pErr = roachpb.NewError(err) 1543 } 1544 } 1545 1546 return response{pErr: pErr} 1547 } 1548 1549 func (ds *DistSender) deduceRetryEarlyExitError(ctx context.Context) error { 1550 select { 1551 case <-ds.rpcRetryOptions.Closer: 1552 // Typically happens during shutdown. 1553 return &roachpb.NodeUnavailableError{} 1554 case <-ctx.Done(): 1555 // Happens when the client request is canceled. 1556 return errors.Wrap(ctx.Err(), "aborted in distSender") 1557 default: 1558 } 1559 return nil 1560 } 1561 1562 func includesFrontOfCurSpan(isReverse bool, rd *roachpb.RangeDescriptor, rs roachpb.RSpan) bool { 1563 if isReverse { 1564 return rd.ContainsKeyInverted(rs.EndKey) 1565 } 1566 return rd.ContainsKey(rs.Key) 1567 } 1568 1569 // fillSkippedResponses fills in responses and ResumeSpans for requests 1570 // when a batch finished without fully processing the requested key spans for 1571 // (some of) the requests in the batch. 
This can happen when processing has met 1572 // the batch key max limit for range requests, or some other stop condition 1573 // based on ScanOptions. 1574 // 1575 // nextKey is the first key that was not processed. This will be used when 1576 // filling in the ResumeSpans. 1577 func fillSkippedResponses( 1578 ba roachpb.BatchRequest, 1579 br *roachpb.BatchResponse, 1580 nextKey roachpb.RKey, 1581 resumeReason roachpb.ResponseHeader_ResumeReason, 1582 ) { 1583 // Some requests might have no response at all if we used a batch-wide 1584 // limit; simply create trivial responses for those. Note that any type 1585 // of request can crop up here - simply take a batch that exceeds the 1586 // limit, and add any other requests at higher keys at the end of the 1587 // batch -- they'll all come back without any response since they never 1588 // execute. 1589 var scratchBA roachpb.BatchRequest 1590 for i := range br.Responses { 1591 if br.Responses[i] != (roachpb.ResponseUnion{}) { 1592 continue 1593 } 1594 req := ba.Requests[i].GetInner() 1595 // We need to summon an empty response. The most convenient (but not 1596 // most efficient) way is to use (*BatchRequest).CreateReply. 1597 // 1598 // TODO(tschottdorf): can autogenerate CreateReply for individual 1599 // requests, see roachpb/gen_batch.go. 1600 if scratchBA.Requests == nil { 1601 scratchBA.Requests = make([]roachpb.RequestUnion, 1) 1602 } 1603 scratchBA.Requests[0].MustSetInner(req) 1604 br.Responses[i] = scratchBA.CreateReply().Responses[0] 1605 } 1606 // Set the ResumeSpan for future batch requests. 1607 isReverse := ba.IsReverse() 1608 for i, resp := range br.Responses { 1609 req := ba.Requests[i].GetInner() 1610 if !roachpb.IsRange(req) { 1611 continue 1612 } 1613 hdr := resp.GetInner().Header() 1614 hdr.ResumeReason = resumeReason 1615 origSpan := req.Header().Span() 1616 if isReverse { 1617 if hdr.ResumeSpan != nil { 1618 // The ResumeSpan.Key might be set to the StartKey of a range; 1619 // correctly set it to the Key of the original request span. 1620 hdr.ResumeSpan.Key = origSpan.Key 1621 } else if roachpb.RKey(origSpan.Key).Less(nextKey) { 1622 // Some keys have yet to be processed. 1623 hdr.ResumeSpan = new(roachpb.Span) 1624 *hdr.ResumeSpan = origSpan 1625 if nextKey.Less(roachpb.RKey(origSpan.EndKey)) { 1626 // The original span has been partially processed. 1627 hdr.ResumeSpan.EndKey = nextKey.AsRawKey() 1628 } 1629 } 1630 } else { 1631 if hdr.ResumeSpan != nil { 1632 // The ResumeSpan.EndKey might be set to the EndKey of a range because 1633 // that's what a store will set it to when the limit is reached (it 1634 // doesn't know any better). In that case, we correct it to the EndKey 1635 // of the original request span. Note that this doesn't touch 1636 // ResumeSpan.Key, which is really the important part of the ResumeSpan. 1637 hdr.ResumeSpan.EndKey = origSpan.EndKey 1638 } else { 1639 // The request might have been fully satisfied, in which case it doesn't 1640 // need a ResumeSpan, or it might not have. Figure out if we're in the 1641 // latter case. 1642 if nextKey.Less(roachpb.RKey(origSpan.EndKey)) { 1643 // Some keys have yet to be processed. 1644 hdr.ResumeSpan = new(roachpb.Span) 1645 *hdr.ResumeSpan = origSpan 1646 if roachpb.RKey(origSpan.Key).Less(nextKey) { 1647 // The original span has been partially processed.
1648 hdr.ResumeSpan.Key = nextKey.AsRawKey() 1649 } 1650 } 1651 } 1652 } 1653 br.Responses[i].GetInner().SetHeader(hdr) 1654 } 1655 } 1656 1657 // leaseholderInfo contains some routing information for RPCs. 1658 type leaseholderInfo struct { 1659 // routeToFollower is set if the request is intended to be routed to a 1660 // follower - either because it's a read that looks stale enough to be served 1661 // by a follower, or otherwise because the respective batch simply doesn't 1662 // need the leaseholder. 1663 routeToFollower bool 1664 // cachedLeaseholder is the leaseholder that the cache indicated. Empty if the 1665 // cache didn't have an entry for the range. 1666 cachedLeaseholder roachpb.ReplicaDescriptor 1667 } 1668 1669 // sendToReplicas sends one or more RPCs to clients specified by the 1670 // slice of replicas. On success, it returns the first successful 1671 // reply. If an error occurs which is not specific to a single 1672 // replica, it's returned immediately. Otherwise, when all replicas 1673 // have been tried and failed, it returns a send error. 1674 // 1675 // The method accepts a boolean declaring whether a transaction commit 1676 // is either in this batch or in-flight concurrently with this batch. 1677 // If withCommit is false (i.e. either no EndTxn is in flight, 1678 // or it is attempting to abort), ambiguous results will never be 1679 // returned from this method. This is because both transactional writes 1680 // and aborts can be retried (the former due to seqno idempotency, the 1681 // latter because aborting is idempotent). If withCommit is true, any 1682 // errors that do not definitively rule out the possibility that the 1683 // batch could have succeeded are transformed into AmbiguousResultErrors. 1684 func (ds *DistSender) sendToReplicas( 1685 ctx context.Context, 1686 ba roachpb.BatchRequest, 1687 opts SendOptions, 1688 rangeID roachpb.RangeID, 1689 replicas ReplicaSlice, 1690 nodeDialer *nodedialer.Dialer, 1691 leaseholder leaseholderInfo, 1692 withCommit bool, 1693 ) (*roachpb.BatchResponse, error) { 1694 transport, err := ds.transportFactory(opts, nodeDialer, replicas) 1695 if err != nil { 1696 return nil, err 1697 } 1698 if transport.IsExhausted() { 1699 return nil, roachpb.NewSendError( 1700 fmt.Sprintf("sending to all %d replicas failed", len(replicas))) 1701 } 1702 1703 curReplica := transport.NextReplica() 1704 if log.ExpensiveLogEnabled(ctx, 2) { 1705 log.VEventf(ctx, 2, "r%d: sending batch %s to %s", rangeID, ba.Summary(), curReplica) 1706 } 1707 br, err := transport.SendNext(ctx, ba) 1708 // maxSeenLeaseSequence tracks the maximum LeaseSequence seen in a 1709 // NotLeaseHolderError. If we encounter a sequence number less than or equal 1710 // to maxSeenLeaseSequence in a subsequent NotLeaseHolderError then 1711 // the range must be experiencing a lease transfer and the client should back 1712 // off using inTransferRetry. 1713 maxSeenLeaseSequence := roachpb.LeaseSequence(-1) 1714 inTransferRetry := retry.StartWithCtx(ctx, ds.rpcRetryOptions) 1715 inTransferRetry.Next() // The first call to Next does not block. 1716 1717 // This loop will retry operations that fail with errors that reflect 1718 // per-replica state and may succeed on other replicas. 1719 var ambiguousError error 1720 for { 1721 if err != nil { 1722 // For most connection errors, we cannot tell whether or not the request 1723 // may have succeeded on the remote server (exceptions are captured in the 1724 // grpcutil.RequestDidNotStart function).
We'll retry the request in order 1725 // to attempt to eliminate the ambiguity; see below. If there's a commit 1726 // in the batch, we track the ambiguity more explicitly by setting 1727 // ambiguousError. This serves two purposes: 1728 // 1) the higher-level retries in the DistSender will not forget the 1729 // ambiguity, like they forget it for non-commit batches. This in turn 1730 // will ensure that TxnCoordSender-level retries don't happen across 1731 // commits; that'd be bad since requests are not idempotent across 1732 // commits. 1733 // TODO(andrei): This higher-level retry does things too bluntly, retrying only 1734 // in case of SendError. It should also retry in case of 1735 // AmbiguousResultError as long as it makes sure to not forget about the 1736 // ambiguity. 1737 // 2) SQL recognizes AmbiguousResultErrors and gives them a special code 1738 // (StatementCompletionUnknown). 1739 // TODO(andrei): The use of this code is inconsistent because a) the 1740 // DistSender tries to only return the code for commits, but it'll happily 1741 // forward along AmbiguousResultErrors coming from the replica and b) we 1742 // probably should be returning that code for non-commit statements too. 1743 // 1744 // We retry requests in order to avoid returning errors (in particular, 1745 // AmbiguousResultError). Retrying the batch will either: 1746 // a) succeed if the request had not been evaluated the first time. 1747 // b) succeed if the request also succeeded the first time, but is 1748 // idempotent (i.e. it is internal to a txn, without a commit in the 1749 // batch). 1750 // c) fail if it succeeded the first time and the request is not 1751 // idempotent. In the case of EndTxn requests, this is ensured by the 1752 // tombstone keys in the timestamp cache. The retry failing does not 1753 // prove that the request did not succeed the first time around, so we 1754 // can't claim success (and even if we could claim success, we still 1755 // wouldn't have the complete result of the successful evaluation). 1756 // 1757 // Case a) is great - the retry made the request succeed. Case b) is also 1758 // good; due to idempotency we managed to swallow a communication error. 1759 // Case c) is not great - we'll end up returning an error even though the 1760 // request might have succeeded (an AmbiguousResultError if withCommit is 1761 // set). 1762 // 1763 // TODO(andrei): Case c) is broken for non-transactional requests: nothing 1764 // prevents them from double evaluation. This can result in, for example, 1765 // an increment applying twice, or more subtle problems like a blind write 1766 // evaluating twice, overwriting another unrelated write that fell 1767 // in-between. 1768 // 1769 if withCommit && !grpcutil.RequestDidNotStart(err) { 1770 ambiguousError = err 1771 } 1772 log.VErrEventf(ctx, 2, "RPC error: %s", err) 1773 1774 // If the error wasn't just a context cancellation and the down replica 1775 // is cached as the lease holder, evict it. The only other eviction 1776 // happens below on NotLeaseHolderError, but if the next replica is the 1777 // actual lease holder, we're never going to receive one of those and 1778 // will thus pay the price of trying the down node first forever. 1779 // 1780 // NB: we should consider instead adding a successful reply from the next 1781 // replica into the cache, but without a leaseholder (and taking into 1782 // account that the local node can't be down) it won't take long until we 1783 // talk to a replica that tells us who the leaseholder is.
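// Editorial note (not from the original source): the eviction below is
// deliberately narrow. It fires only while the context is still live and only
// if the cached leaseholder is exactly the replica whose RPC just failed; an
// entry pointing at some other, possibly healthy, replica is left alone so
// that it keeps being tried first on subsequent attempts.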
1784 if ctx.Err() == nil { 1785 if storeID, ok := ds.leaseHolderCache.Lookup(ctx, rangeID); ok && curReplica.StoreID == storeID { 1786 ds.leaseHolderCache.Update(ctx, rangeID, 0 /* evict */) 1787 } 1788 } 1789 } else { 1790 // NB: This section of code may have unfortunate performance implications. If we 1791 // exit the below type switch with propagateError remaining at `false`, we'll try 1792 // more replicas. That may succeed and future requests might do the same thing over 1793 // and over again, adding needless round-trips to the earlier replicas. 1794 propagateError := false 1795 switch tErr := br.Error.GetDetail().(type) { 1796 case nil: 1797 // When a request that we've attempted to route to the leaseholder comes 1798 // back as successful, we assume that it must have been served by the 1799 // leaseholder and so we update the leaseholder cache. In steady state, 1800 // this is almost always the case, and so we gate the update on whether 1801 // the response comes from a node that we didn't know held the lease. 1802 updateLeaseholderCache := 1803 !leaseholder.routeToFollower && 1804 leaseholder.cachedLeaseholder != curReplica 1805 if updateLeaseholderCache { 1806 ds.leaseHolderCache.Update(ctx, rangeID, curReplica.StoreID) 1807 } 1808 return br, nil 1809 case *roachpb.StoreNotFoundError, *roachpb.NodeUnavailableError: 1810 // These errors are likely to be unique to the replica that reported 1811 // them, so no action is required before the next retry. 1812 case *roachpb.RangeNotFoundError: 1813 // The store we routed to doesn't have this replica. This can happen when 1814 // our descriptor is outright outdated, but it can also be caused by a 1815 // replica that has just been added but needs a snapshot to be caught up. 1816 // 1817 // We'll try other replicas which typically gives us the leaseholder, either 1818 // via the NotLeaseHolderError or nil error paths, both of which update the 1819 // leaseholder cache. 1820 case *roachpb.NotLeaseHolderError: 1821 ds.metrics.NotLeaseHolderErrCount.Inc(1) 1822 if lh := tErr.LeaseHolder; lh != nil { 1823 // Update the leaseholder cache. Naively this would also happen when the 1824 // next RPC comes back, but we don't want to wait out the additional RPC 1825 // latency. 1826 ds.leaseHolderCache.Update(ctx, rangeID, lh.StoreID) 1827 // Avoid an extra update to the leaseholder cache if the next RPC succeeds. 1828 leaseholder.cachedLeaseholder = *lh 1829 1830 // If the implicated leaseholder is not a known replica, return a SendError 1831 // to signal eviction of the cached RangeDescriptor and re-send. 1832 if replicas.FindReplica(lh.StoreID) == -1 { 1833 br.Error = roachpb.NewError(roachpb.NewSendError(fmt.Sprintf( 1834 "leaseholder s%d (via %+v) not in cached replicas %v", lh.StoreID, curReplica, replicas, 1835 ))) 1836 propagateError = true 1837 } else { 1838 // Move the new lease holder to the head of the queue for the next retry. 1839 transport.MoveToFront(*lh) 1840 } 1841 } 1842 if l := tErr.Lease; !propagateError && l != nil { 1843 // Check whether we've seen this lease or a prior lease before and 1844 // backoff if so or update maxSeenLeaseSequence if not. 1845 if l.Sequence > maxSeenLeaseSequence { 1846 maxSeenLeaseSequence = l.Sequence 1847 inTransferRetry.Reset() // The following Next call will not block. 
1848 } else { 1849 ds.metrics.InLeaseTransferBackoffs.Inc(1) 1850 log.VErrEventf(ctx, 2, "backing off due to NotLeaseHolderErr at "+ 1851 "LeaseSequence %d <= %d", l.Sequence, maxSeenLeaseSequence) 1852 } 1853 inTransferRetry.Next() 1854 } 1855 default: 1856 propagateError = true 1857 } 1858 1859 if propagateError { 1860 if ambiguousError != nil { 1861 return nil, roachpb.NewAmbiguousResultError(fmt.Sprintf("error=%s [propagate]", ambiguousError)) 1862 } 1863 1864 // The error received is likely not specific to this 1865 // replica, so we should return it instead of trying other 1866 // replicas. 1867 return br, nil 1868 } 1869 1870 log.VErrEventf(ctx, 1, "application error: %s", br.Error) 1871 } 1872 1873 // Has the caller given up? 1874 if ctx.Err() != nil { 1875 reportedErr := errors.Wrap(ctx.Err(), "context done during DistSender.Send") 1876 log.Eventf(ctx, "%v", reportedErr) 1877 if ambiguousError != nil { 1878 return nil, roachpb.NewAmbiguousResultError(reportedErr.Error()) 1879 } 1880 // Don't consider this a SendError, because SendErrors indicate that we 1881 // were unable to reach a replica that could serve the request, and they 1882 // cause range cache evictions. Context cancellations just mean the 1883 // sender changed its mind or the request timed out. 1884 return nil, errors.Wrap(ctx.Err(), "aborted during DistSender.Send") 1885 } 1886 1887 if transport.IsExhausted() { 1888 if ambiguousError != nil { 1889 return nil, roachpb.NewAmbiguousResultError(fmt.Sprintf("error=%s [exhausted]", ambiguousError)) 1890 } 1891 1892 // TODO(bdarnell): The last error is not necessarily the best 1893 // one to return; we may want to remember the "best" error 1894 // we've seen (for example, a NotLeaseHolderError conveys more 1895 // information than a RangeNotFound). 1896 return nil, roachpb.NewSendError( 1897 fmt.Sprintf("sending to all %d replicas failed; last error: %v %v", len(replicas), br, err), 1898 ) 1899 } 1900 1901 ds.metrics.NextReplicaErrCount.Inc(1) 1902 curReplica = transport.NextReplica() 1903 log.VEventf(ctx, 2, "error: %v %v; trying next peer %s", br, err, curReplica.String()) 1904 br, err = transport.SendNext(ctx, ba) 1905 } 1906 }
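// Editorial sketch (not part of the original file): seen from a caller, the
// terminal outcomes of sendToReplicas as implemented above can be summarized
// roughly as
//
//	br, err := ds.sendToReplicas(ctx, ba, opts, rangeID, replicas, dialer, lh, withCommit)
//	switch err.(type) {
//	case nil:
//		// Either a successful reply, or br.Error carries a replica-reported
//		// error that was judged not worth retrying on another replica.
//	case *roachpb.AmbiguousResultError:
//		// Only produced here when withCommit is true: the commit may or may
//		// not have applied, so the result must be surfaced as ambiguous.
//	default:
//		// Includes *roachpb.SendError (all replicas tried and failed) and
//		// wrapped context errors (client cancellation or timeout).
//	}
//
// where dialer and lh stand in for the nodedialer.Dialer and leaseholderInfo
// arguments; those variable names are illustrative only.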