github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvclient/kvcoord/txn_interceptor_pipeliner.go

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvcoord
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"sort"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
    19  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    20  	"github.com/cockroachdb/cockroach/pkg/settings"
    21  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    22  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    23  	"github.com/cockroachdb/cockroach/pkg/util/log"
    24  	"github.com/google/btree"
    25  )
    26  
    27  // The degree of the inFlightWrites btree.
    28  const txnPipelinerBtreeDegree = 32
    29  
    30  var pipelinedWritesEnabled = settings.RegisterBoolSetting(
    31  	"kv.transaction.write_pipelining_enabled",
    32  	"if enabled, transactional writes are pipelined through Raft consensus",
    33  	true,
    34  )
    35  var pipelinedWritesMaxInFlightSize = settings.RegisterByteSizeSetting(
    36  	// TODO(nvanbenschoten): The need for this extra setting alongside
    37  	// kv.transaction.max_intents_bytes indicates that we should explore
    38  	// the unification of intent tracking and in-flight write tracking.
    39  	// The two mechanisms track subtly different information, but there's
    40  	// no fundamental reason why they can't be unified.
    41  	"kv.transaction.write_pipelining_max_outstanding_size",
    42  	"maximum number of bytes used to track in-flight pipelined writes before disabling pipelining",
    43  	1<<18, /* 256 KB */
    44  )
    45  var pipelinedWritesMaxBatchSize = settings.RegisterNonNegativeIntSetting(
    46  	"kv.transaction.write_pipelining_max_batch_size",
     47  	"if non-zero, defines the maximum size batch that will be pipelined through Raft consensus",
    48  	// NB: there is a tradeoff between the overhead of synchronously waiting for
    49  	// consensus for a batch if we don't pipeline and proving that all of the
    50  	// writes in the batch succeed if we do pipeline. We set this default to a
    51  	// value which experimentally strikes a balance between the two costs.
    52  	//
    53  	// Notably, this is well below sql.max{Insert/Update/Upsert/Delete}BatchSize,
    54  	// so implicit SQL txns should never pipeline their writes - they should either
    55  	// hit the 1PC fast-path or should have batches which exceed this limit.
    56  	128,
    57  )
    58  
    59  // trackedWritesMaxSize is a threshold in bytes for lock spans stored on the
    60  // coordinator during the lifetime of a transaction. Locks are included with a
    61  // transaction on commit or abort, to be cleaned up asynchronously. If they
    62  // exceed this threshold, they're condensed to avoid memory blowup both on the
    63  // coordinator and (critically) on the EndTxn command at the Raft group
    64  // responsible for the transaction record.
    65  //
    66  // NB: this is called "max_intents_bytes" instead of "max_lock_bytes" because
     67  // it was created before the concept of intents was generalized to locks.
    68  // Switching it would require a migration which doesn't seem worth it.
    69  var trackedWritesMaxSize = settings.RegisterPublicIntSetting(
    70  	"kv.transaction.max_intents_bytes",
    71  	"maximum number of bytes used to track locks in transactions",
    72  	1<<18, /* 256 KB */
    73  )
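
         // A minimal sketch (test context assumed) of how these settings are read
         // and overridden, using the settings Get/Override helpers that the rest of
         // this file also relies on:
         //
         //   st := cluster.MakeTestingClusterSettings()
         //   pipelinedWritesEnabled.Override(&st.SV, false) // disable pipelining
         //   limit := pipelinedWritesMaxInFlightSize.Get(&st.SV)
         //   _ = limit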
    74  
    75  // txnPipeliner is a txnInterceptor that pipelines transactional writes by using
    76  // asynchronous consensus. The interceptor then tracks all writes that have been
    77  // asynchronously proposed through Raft and ensures that all interfering
    78  // requests chain on to them by first proving that the async writes succeeded.
    79  // The interceptor also ensures that when committing a transaction all writes
    80  // that have been proposed but not proven to have succeeded are first checked
    81  // before considering the transaction committed. These async writes are referred
    82  // to as "in-flight writes" and this process of proving that an in-flight write
    83  // succeeded is called "proving" the write. Once writes are proven to have
    84  // finished, they are considered "stable".
    85  //
     86  // Chaining on to in-flight async writes is important to the txnPipeliner for
     87  // two main reasons:
    88  //
    89  // 1. requests proposed to Raft will not necessarily succeed. For any number of
    90  //    reasons, the request may make it through Raft and be discarded or fail to
    91  //    ever even be replicated. A transaction must check that all async writes
    92  //    succeeded before committing. However, when these proposals do fail, their
    93  //    errors aren't particularly interesting to a transaction. This is because
    94  //    these errors are not deterministic Transaction-domain errors that a
    95  //    transaction must adhere to for correctness such as conditional-put errors or
    96  //    other symptoms of constraint violations. These kinds of errors are all
    97  //    discovered during write *evaluation*, which an async write will perform
    98  //    synchronously before consensus. Any error during consensus is outside of the
    99  //    Transaction-domain and can always trigger a transaction retry.
   100  //
   101  // 2. transport layers beneath the txnPipeliner do not provide strong enough
   102  //    ordering guarantees between concurrent requests in the same transaction to
   103  //    avoid needing explicit chaining. For instance, DistSender uses unary gRPC
   104  //    requests instead of gRPC streams, so it can't natively expose strong ordering
   105  //    guarantees. Perhaps more importantly, even when a command has acquired latches
   106  //    and evaluated on a Replica, it is not guaranteed to be applied before
   107  //    interfering commands. This is because the command may be retried outside of
   108  //    the serialization of the spanlatch manager for any number of reasons, such as
    109  // leaseholder changes. When the command re-acquires its latches, it's possible
   110  //    that interfering commands may jump ahead of it. To combat this, the
   111  //    txnPipeliner uses chaining to throw an error when these re-orderings would
   112  //    have affected the order that transactional requests evaluate in.
   113  //
   114  // The interceptor proves all in-flight writes before explicitly committing a
   115  // transaction by tacking on a QueryIntent request for each one to the front of
   116  // an EndTxn(Commit=true) request. The in-flight writes that are being queried
   117  // in the batch with the EndTxn request are treated as in-flight writes for the
   118  // purposes of parallel commits. The effect of this is that the in-flight writes
   119  // must all be proven for a transaction to be considered implicitly committed.
   120  // It also follows that they will need to be queried during transaction
   121  // recovery.
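         //
         // To make this concrete (an illustrative example, not a literal trace): a
         // transaction that pipelined writes to keys "a" and "b" commits by sending
         //
         //   QueryIntent("a"), QueryIntent("b"), EndTxn(Commit=true)
         //
         // and both QueryIntent requests must prove their write before the
         // transaction is considered implicitly committed.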
   122  //
   123  // This is beneficial from the standpoint of latency because it means that the
   124  // consensus latency for every write in a transaction, including the write to
   125  // the transaction record, is paid in parallel (mod pipeline stalls) and an
   126  // entire transaction can commit in a single consensus round-trip!
   127  //
   128  // On the flip side, this means that every unproven write is considered
   129  // in-flight at the time of the commit and needs to be proven at the time of the
   130  // commit. This is a little unfortunate because a transaction could have
   131  // accumulated a large number of in-flight writes over a long period of time
   132  // without proving any of them, and the more of these writes there are, the
   133  // greater the chance that querying one of them gets delayed and delays the
   134  // overall transaction. Additionally, the more of these writes there are, the
   135  // more expensive transaction recovery will be if the transaction ends up stuck
   136  // in an indeterminate commit state.
   137  //
   138  // Three approaches have been considered to address this, all of which revolve
   139  // around the idea that earlier writes in a transaction may have finished
   140  // consensus well before the EndTxn is sent. Following this logic, it would be
   141  // in the txnPipeliner's best interest to prove in-flight writes as early as
   142  // possible, even if no other overlapping requests force them to be proven. The
   143  // approaches are:
   144  //
   145  // 1. launch a background process after each successful async write to query its
   146  //    intents and wait for it to succeed. This would effectively solve the issue,
   147  //    but at the cost of many more goroutines and many more QueryIntent requests,
   148  //    most of which would be redundant because their corresponding write wouldn't
   149  //    complete until after an EndTxn synchronously needed to prove them anyway.
   150  //
   151  // 2. to address the issue of an unbounded number of background goroutines
   152  //    proving writes in approach 1, a single background goroutine could be run
   153  //    that repeatedly loops over all in-flight writes and attempts to prove
   154  //    them. This approach was used in an early revision of #26599 and has the nice
   155  //    property that only one batch of QueryIntent requests is ever active at a
   156  //    given time. It may be revisited, but for now it is not used for the same
   157  //    reason as approach 1: most of its QueryIntent requests will be useless
   158  //    because a transaction will send an EndTxn immediately after sending all
   159  //    of its writes.
   160  //
   161  // 3. turn the KV interface into a streaming protocol (#8360) that could support
   162  //    returning multiple results. This would allow clients to return immediately
    163  //    after a write's "evaluation" phase completes but hold onto a handle to the
   164  //    request and be notified immediately after its "replication" phase completes.
   165  //    This would allow txnPipeliner to prove in-flight writes immediately after
   166  //    they finish consensus without any extra RPCs.
   167  //
   168  // So far, none of these approaches have been integrated.
   169  //
   170  // The txnPipeliner also tracks the locks that a transaction has acquired in a
   171  // set of spans known as the "lock footprint". This lock footprint contains
   172  // spans encompassing all keys and key ranges where locks have been acquired at
   173  // some point by the transaction. This set includes the bounds of locks acquired
   174  // by all locking read and write requests. Additionally, it includes the bounds
   175  // of locks acquired by the current and all previous epochs. These spans are
   176  // attached to any end transaction request that is passed through the pipeliner
    177  // to ensure that the locks within them are released.
   178  type txnPipeliner struct {
   179  	st       *cluster.Settings
   180  	riGen    rangeIteratorFactory // used to condense lock spans, if provided
   181  	wrapped  lockedSender
   182  	disabled bool
   183  
   184  	// In-flight writes are intent point writes that have not yet been proved
   185  	// to have succeeded. They will need to be proven before the transaction
   186  	// can commit.
   187  	ifWrites inFlightWriteSet
   188  	// The transaction's lock footprint contains spans where locks (replicated
   189  	// and unreplicated) have been acquired at some point by the transaction.
   190  	// The span set contains spans encompassing the keys from all intent writes
   191  	// that have already been proven during this epoch and the keys from all
   192  	// locking reads that have been performed during this epoch. Additionally,
   193  	// the span set contains all locks held at the end of prior epochs. All of
   194  	// the transaction's in-flight writes are morally in this set as well, but
   195  	// they are not stored here to avoid duplication.
   196  	//
   197  	// Unlike the in-flight writes, this set does not need to be tracked with
   198  	// full precision. Instead, the tracking can be an overestimate (i.e. the
   199  	// spans may cover keys never locked) and should be thought of as an
   200  	// upper-bound on the influence that the transaction has had. The set
    201  // contains all key spans that the transaction will need to eventually
   202  	// clean up upon its completion.
   203  	lockFootprint condensableSpanSet
   204  }
   205  
   206  // condensableSpanSetRangeIterator describes the interface of RangeIterator
    207  // needed by the condensableSpanSet. Useful for mocking an
   208  // iterator in tests.
   209  type condensableSpanSetRangeIterator interface {
   210  	Valid() bool
   211  	Seek(ctx context.Context, key roachpb.RKey, scanDir ScanDirection)
   212  	Error() error
   213  	Desc() *roachpb.RangeDescriptor
   214  }
   215  
   216  // rangeIteratorFactory is used to create a condensableSpanSetRangeIterator
   217  // lazily. It's used to avoid allocating an iterator when it's not needed. The
   218  // factory can be configured either with a callback, used for mocking in tests,
    219  // or with a DistSender. Can also be left empty for unit tests that don't push
   220  // memory limits in their span sets (and thus don't need collapsing).
   221  type rangeIteratorFactory struct {
   222  	factory func() condensableSpanSetRangeIterator
   223  	ds      *DistSender
   224  }
   225  
   226  // newRangeIterator creates a range iterator. If no factory was configured, it panics.
   227  func (f rangeIteratorFactory) newRangeIterator() condensableSpanSetRangeIterator {
   228  	if f.factory != nil {
   229  		return f.factory()
   230  	}
   231  	if f.ds != nil {
   232  		return NewRangeIterator(f.ds)
   233  	}
   234  	panic("no iterator factory configured")
   235  }
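
         // A minimal sketch of configuring the factory in a test, assuming a
         // hypothetical mockRangeIterator type that implements
         // condensableSpanSetRangeIterator:
         //
         //   riGen := rangeIteratorFactory{factory: func() condensableSpanSetRangeIterator {
         //   	return &mockRangeIterator{}
         //   }}
         //   tp := txnPipeliner{riGen: riGen}
         //
         // In production the factory is configured with a *DistSender instead, in
         // which case newRangeIterator returns a real RangeIterator.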
   236  
   237  // SendLocked implements the lockedSender interface.
   238  func (tp *txnPipeliner) SendLocked(
   239  	ctx context.Context, ba roachpb.BatchRequest,
   240  ) (*roachpb.BatchResponse, *roachpb.Error) {
   241  	// If an EndTxn request is part of this batch, attach the in-flight writes
   242  	// and the lock footprint to it.
   243  	ba, pErr := tp.attachLocksToEndTxn(ctx, ba)
   244  	if pErr != nil {
   245  		return nil, pErr
   246  	}
   247  
   248  	// Adjust the batch so that it doesn't miss any in-flight writes.
   249  	ba = tp.chainToInFlightWrites(ba)
   250  
   251  	// Send through wrapped lockedSender. Unlocks while sending then re-locks.
   252  	br, pErr := tp.wrapped.SendLocked(ctx, ba)
   253  
   254  	// Update the in-flight write set and the lock footprint with the results of
   255  	// the request.
   256  	tp.updateLockTracking(ctx, ba, br)
   257  	if pErr != nil {
   258  		return nil, tp.adjustError(ctx, ba, pErr)
   259  	}
   260  	return tp.stripQueryIntents(br), nil
   261  }
   262  
   263  // attachLocksToEndTxn attaches the in-flight writes and the lock footprint that
   264  // the interceptor has been tracking to any EndTxn requests present in the
   265  // provided batch. It augments these sets with locking requests from the current
   266  // batch.
   267  func (tp *txnPipeliner) attachLocksToEndTxn(
   268  	ctx context.Context, ba roachpb.BatchRequest,
   269  ) (roachpb.BatchRequest, *roachpb.Error) {
   270  	args, hasET := ba.GetArg(roachpb.EndTxn)
   271  	if !hasET {
   272  		return ba, nil
   273  	}
   274  	et := args.(*roachpb.EndTxnRequest)
   275  	if len(et.LockSpans) > 0 {
   276  		return ba, roachpb.NewErrorf("client must not pass intents to EndTxn")
   277  	}
   278  	if len(et.InFlightWrites) > 0 {
   279  		return ba, roachpb.NewErrorf("client must not pass in-flight writes to EndTxn")
   280  	}
   281  
   282  	// Populate et.LockSpans and et.InFlightWrites.
   283  	if !tp.lockFootprint.empty() {
   284  		et.LockSpans = append([]roachpb.Span(nil), tp.lockFootprint.asSlice()...)
   285  	}
   286  	if inFlight := tp.ifWrites.len(); inFlight != 0 {
   287  		et.InFlightWrites = make([]roachpb.SequencedWrite, 0, inFlight)
   288  		tp.ifWrites.ascend(func(w *inFlightWrite) {
   289  			et.InFlightWrites = append(et.InFlightWrites, w.SequencedWrite)
   290  		})
   291  	}
   292  
   293  	// Augment et.LockSpans and et.InFlightWrites with writes from the current
   294  	// batch.
   295  	for _, ru := range ba.Requests[:len(ba.Requests)-1] {
   296  		req := ru.GetInner()
   297  		h := req.Header()
   298  		if roachpb.IsLocking(req) {
   299  			// Ranged writes are added immediately to the lock spans because
   300  			// it's not clear where they will actually leave intents. Point
   301  			// writes are added to the in-flight writes set. All other locking
   302  			// requests are also added to the lock spans.
   303  			//
   304  			// If we see any ranged writes then we know that the txnCommitter
   305  			// will fold the in-flight writes into the lock spans immediately
   306  			// and forgo a parallel commit, but let's not break that abstraction
   307  			// boundary here.
   308  			if roachpb.IsIntentWrite(req) && !roachpb.IsRange(req) {
   309  				w := roachpb.SequencedWrite{Key: h.Key, Sequence: h.Sequence}
   310  				et.InFlightWrites = append(et.InFlightWrites, w)
   311  			} else {
   312  				et.LockSpans = append(et.LockSpans, h.Span())
   313  			}
   314  		}
   315  	}
   316  
    317  	// Sort both sets and merge the lock spans.
   318  	et.LockSpans, _ = roachpb.MergeSpans(et.LockSpans)
   319  	sort.Sort(roachpb.SequencedWriteBySeq(et.InFlightWrites))
   320  
   321  	if log.V(3) {
   322  		for _, intent := range et.LockSpans {
   323  			log.Infof(ctx, "intent: [%s,%s)", intent.Key, intent.EndKey)
   324  		}
   325  		for _, write := range et.InFlightWrites {
   326  			log.Infof(ctx, "in-flight: %d:%s", write.Sequence, write.Key)
   327  		}
   328  	}
   329  	return ba, nil
   330  }
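
         // As an illustrative example (keys and sequence numbers made up): with a
         // tracked in-flight write {Key: "a", Sequence: 3} and a lock footprint of
         // ["b", "d"), an incoming EndTxn(Commit=true) leaves this method with
         // InFlightWrites = [{a, 3}] and LockSpans = [["b", "d")], plus whatever
         // locking requests ride in the same batch, folded in as described above.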
   331  
   332  // chainToInFlightWrites ensures that we "chain" on to any in-flight writes that
   333  // overlap the keys we're trying to read/write. We do this by prepending
   334  // QueryIntent requests with the ErrorIfMissing option before each request that
   335  // touches any of the in-flight writes. In effect, this allows us to prove that
   336  // a write succeeded before depending on its existence. We later prune down the
   337  // list of writes we proved to exist that are no longer "in-flight" in
   338  // updateLockTracking.
   339  func (tp *txnPipeliner) chainToInFlightWrites(ba roachpb.BatchRequest) roachpb.BatchRequest {
   340  	asyncConsensus := pipelinedWritesEnabled.Get(&tp.st.SV) && !tp.disabled
   341  
   342  	// We provide a setting to bound the size of in-flight writes that the
   343  	// pipeliner is tracking. If this batch would push us over this setting,
   344  	// don't allow it to perform async consensus.
   345  	addedIFBytes := int64(0)
   346  	maxIFBytes := pipelinedWritesMaxInFlightSize.Get(&tp.st.SV)
   347  
   348  	// We provide a setting to bound the number of writes we permit in a batch
   349  	// that uses async consensus. This is useful because we'll have to prove
   350  	// each write that uses async consensus using a QueryIntent, so there's a
   351  	// point where it makes more sense to just perform consensus for the entire
   352  	// batch synchronously and avoid all of the overhead of pipelining.
   353  	if maxBatch := pipelinedWritesMaxBatchSize.Get(&tp.st.SV); maxBatch > 0 {
   354  		batchSize := int64(len(ba.Requests))
   355  		if batchSize > maxBatch {
   356  			asyncConsensus = false
   357  		}
   358  	}
   359  
   360  	forked := false
   361  	oldReqs := ba.Requests
   362  	// TODO(nvanbenschoten): go 1.11 includes an optimization to quickly clear
   363  	// out an entire map. That might make it cost effective to maintain a single
   364  	// chainedKeys map between calls to this function.
   365  	var chainedKeys map[string]struct{}
   366  	for i, ru := range oldReqs {
   367  		if !asyncConsensus && !forked && tp.ifWrites.len() == len(chainedKeys) {
   368  			// If there are no in-flight writes or all in-flight writes
   369  			// have been chained onto and async consensus is disallowed,
   370  			// short-circuit immediately.
   371  			break
   372  		}
   373  		req := ru.GetInner()
   374  
   375  		if asyncConsensus {
    376  			// If we're currently planning on performing the batch with
    377  			// async consensus, determine whether this request changes
    378  			// that.
   379  			if !roachpb.IsIntentWrite(req) || roachpb.IsRange(req) {
   380  				// Only allow batches consisting of solely transactional point
   381  				// writes to perform consensus asynchronously.
   382  				// TODO(nvanbenschoten): We could allow batches with reads and point
   383  				// writes to perform async consensus, but this would be a bit
   384  				// tricky. Any read would need to chain on to any write that came
   385  				// before it in the batch and overlaps. For now, it doesn't seem
   386  				// worth it.
   387  				asyncConsensus = false
   388  			} else {
   389  				// Only allow batches that would not push us over the maximum
   390  				// in-flight write size limit to perform consensus asynchronously.
   391  				//
   392  				// NB: this estimation is conservative because it doesn't factor
   393  				// in that some writes may be proven by this batch and removed
   394  				// from the in-flight write set. The real accounting in
   395  				// inFlightWriteSet.{insert,remove} gets this right.
   396  				addedIFBytes += keySize(req.Header().Key)
   397  				asyncConsensus = (tp.ifWrites.byteSize() + addedIFBytes) <= maxIFBytes
   398  			}
   399  		}
   400  
   401  		if tp.ifWrites.len() > len(chainedKeys) {
   402  			// For each conflicting in-flight write, add a QueryIntent request
   403  			// to the batch to assert that it has succeeded and "chain" onto it.
   404  			writeIter := func(w *inFlightWrite) {
   405  				// We don't want to modify the batch's request slice directly,
   406  				// so fork it before modifying it.
   407  				if !forked {
   408  					ba.Requests = append([]roachpb.RequestUnion(nil), ba.Requests[:i]...)
   409  					forked = true
   410  				}
   411  
   412  				if _, ok := chainedKeys[string(w.Key)]; !ok {
   413  					// The write has not already been chained onto by an earlier
   414  					// request in this batch. Add a QueryIntent request to the
   415  					// batch (before the conflicting request) to ensure that we
   416  					// chain on to the success of the in-flight write.
   417  					meta := ba.Txn.TxnMeta
   418  					meta.Sequence = w.Sequence
   419  					ba.Add(&roachpb.QueryIntentRequest{
   420  						RequestHeader: roachpb.RequestHeader{
   421  							Key: w.Key,
   422  						},
   423  						Txn:            meta,
   424  						ErrorIfMissing: true,
   425  					})
   426  
   427  					// Record that the key has been chained onto at least once
   428  					// in this batch so that we don't chain onto it again.
   429  					if chainedKeys == nil {
   430  						chainedKeys = make(map[string]struct{})
   431  					}
   432  					chainedKeys[string(w.Key)] = struct{}{}
   433  				}
   434  			}
   435  
   436  			if !roachpb.IsTransactional(req) {
   437  				// Non-transactional requests require that we stall the entire
   438  				// pipeline by chaining on to all in-flight writes. This is
   439  				// because their request header is often insufficient to
   440  				// determine all of the keys that they will interact with.
   441  				tp.ifWrites.ascend(writeIter)
   442  			} else if et, ok := req.(*roachpb.EndTxnRequest); ok {
   443  				if et.Commit {
   444  					// EndTxns need to prove all in-flight writes before being
   445  					// allowed to succeed themselves.
   446  					tp.ifWrites.ascend(writeIter)
   447  				}
   448  			} else {
    449  			// Transactional reads and writes need to chain on to any
   450  				// overlapping in-flight writes.
   451  				s := req.Header().Span()
   452  				tp.ifWrites.ascendRange(s.Key, s.EndKey, writeIter)
   453  			}
   454  		}
   455  
   456  		// If the BatchRequest's slice of requests has been forked from the original,
   457  		// append the request to the new slice.
   458  		if forked {
   459  			ba.Add(req)
   460  		}
   461  	}
   462  
   463  	// Set the batch's AsyncConsensus flag based on whether AsyncConsensus is
   464  	// permitted for the batch.
   465  	ba.AsyncConsensus = asyncConsensus
   466  	return ba
   467  }
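
         // As an illustrative example: with an in-flight write to key "a", an
         // incoming batch [Get("a"), Put("c")] is rewritten to
         //
         //   QueryIntent("a", ErrorIfMissing=true), Get("a"), Put("c")
         //
         // so the Get only observes the pipelined write after proving that it
         // succeeded.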
   468  
   469  // updateLockTracking reads the response for the given request and uses it to
   470  // update the tracked in-flight write set and lock footprint. It does so by
   471  // performing three actions:
   472  //  1. it adds all async writes that the request performed to the in-flight
   473  //     write set.
   474  //  2. it adds all non-async writes and locking reads that the request
   475  //     performed to the lock footprint.
   476  //  3. it moves all in-flight writes that the request proved to exist from
   477  //     the in-flight writes set to the lock footprint.
   478  //
   479  // After updating the write sets, the lock footprint is condensed to ensure that
   480  // it remains under its memory limit.
   481  //
   482  // If no response is provided (indicating an error), all writes from the batch
   483  // are added directly to the lock footprint to avoid leaking any locks when the
   484  // transaction cleans up.
   485  func (tp *txnPipeliner) updateLockTracking(
   486  	ctx context.Context, ba roachpb.BatchRequest, br *roachpb.BatchResponse,
   487  ) {
   488  	// After adding new writes to the lock footprint, check whether we need to
   489  	// condense the set to stay below memory limits.
   490  	defer tp.lockFootprint.maybeCondense(ctx, tp.riGen, trackedWritesMaxSize.Get(&tp.st.SV))
   491  
    492  	// If the request failed, add all lock acquisition attempts directly to the
   493  	// lock footprint. This reduces the likelihood of dangling locks blocking
   494  	// concurrent requests for extended periods of time. See #3346.
   495  	if br == nil {
   496  		// The transaction cannot continue in this epoch whether this is
   497  		// a retryable error or not.
   498  		ba.LockSpanIterate(nil, tp.trackLocks)
   499  		return
   500  	}
   501  
   502  	// Similarly, if the transaction is now finalized, we don't need to
   503  	// accurately update the lock tracking.
   504  	if br.Txn.Status.IsFinalized() {
   505  		switch br.Txn.Status {
   506  		case roachpb.ABORTED:
   507  			// If the transaction is now ABORTED, add all locks acquired by the
   508  			// batch directly to the lock footprint. We don't know which of
   509  			// these succeeded.
   510  			ba.LockSpanIterate(nil, tp.trackLocks)
   511  		case roachpb.COMMITTED:
   512  			// If the transaction is now COMMITTED, it must not have any more
   513  			// in-flight writes, so clear them. Technically we should move all
   514  			// of these to the lock footprint, but since the transaction is
   515  			// already committed, there's no reason to.
   516  			tp.ifWrites.clear(
   517  				/* reuse - we're not going to use this Btree again, so there's no point in
   518  				   moving the nodes to a free list */
   519  				false)
   520  		default:
   521  			panic("unexpected")
   522  		}
   523  		return
   524  	}
   525  
   526  	for i, ru := range ba.Requests {
   527  		req := ru.GetInner()
   528  		resp := br.Responses[i].GetInner()
   529  
   530  		if qiReq, ok := req.(*roachpb.QueryIntentRequest); ok {
   531  			// Remove any in-flight writes that were proven to exist.
   532  			// It shouldn't be possible for a QueryIntentRequest with
   533  			// the ErrorIfMissing option set to return without error
    534  		// and with FoundIntent=false, but we handle that
   535  			// case here because it happens a lot in tests.
   536  			if resp.(*roachpb.QueryIntentResponse).FoundIntent {
   537  				tp.ifWrites.remove(qiReq.Key, qiReq.Txn.Sequence)
   538  				// Move to lock footprint.
   539  				tp.lockFootprint.insert(roachpb.Span{Key: qiReq.Key})
   540  			}
   541  		} else if roachpb.IsLocking(req) {
   542  			// If the request intended to acquire locks, track its lock spans.
   543  			if ba.AsyncConsensus {
   544  				// Record any writes that were performed asynchronously. We'll
   545  				// need to prove that these succeeded sometime before we commit.
   546  				header := req.Header()
   547  				tp.ifWrites.insert(header.Key, header.Sequence)
   548  			} else {
   549  				// If the lock acquisitions weren't performed asynchronously
   550  				// then add them directly to our lock footprint. Locking read
   551  				// requests will always hit this path because they will never
   552  				// use async consensus.
   553  				if sp, ok := roachpb.ActualSpan(req, resp); ok {
   554  					tp.lockFootprint.insert(sp)
   555  				}
   556  			}
   557  		}
   558  	}
   559  }
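
         // As an illustrative example: if the batch [Put("a"), Put("b")] ran with
         // AsyncConsensus=true, both keys enter the in-flight write set. If a later
         // batch proves "a" with a QueryIntent, "a" moves from the in-flight write
         // set into the lock footprint.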
   560  
   561  func (tp *txnPipeliner) trackLocks(s roachpb.Span, _ lock.Durability) {
   562  	tp.lockFootprint.insert(s)
   563  }
   564  
   565  // stripQueryIntents adjusts the BatchResponse to hide the fact that this
   566  // interceptor added new requests to the batch. It returns an adjusted batch
   567  // response without the responses that correspond to these added requests.
   568  func (tp *txnPipeliner) stripQueryIntents(br *roachpb.BatchResponse) *roachpb.BatchResponse {
   569  	j := 0
   570  	for i, ru := range br.Responses {
   571  		if ru.GetQueryIntent() != nil {
   572  			continue
   573  		}
   574  		if i != j {
   575  			br.Responses[j] = br.Responses[i]
   576  		}
   577  		j++
   578  	}
   579  	br.Responses = br.Responses[:j]
   580  	return br
   581  }
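
         // As an illustrative example: the responses to the rewritten batch
         // [QueryIntent("a"), Get("a"), Put("c")] are stripped from
         // [QueryIntentResponse, GetResponse, PutResponse] down to
         // [GetResponse, PutResponse] before being returned to the client.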
   582  
   583  // adjustError adjusts the provided error based on the request that caused it.
   584  // It transforms any IntentMissingError into a TransactionRetryError and fixes
   585  // the error's index position.
   586  func (tp *txnPipeliner) adjustError(
   587  	ctx context.Context, ba roachpb.BatchRequest, pErr *roachpb.Error,
   588  ) *roachpb.Error {
   589  	// Fix the error index to hide the impact of any QueryIntent requests.
   590  	if pErr.Index != nil {
   591  		before := int32(0)
   592  		for _, ru := range ba.Requests[:int(pErr.Index.Index)] {
   593  			req := ru.GetInner()
   594  			if req.Method() == roachpb.QueryIntent {
   595  				before++
   596  			}
   597  		}
   598  		pErr.Index.Index -= before
   599  	}
   600  
   601  	// Turn an IntentMissingError into a transactional retry error.
   602  	if ime, ok := pErr.GetDetail().(*roachpb.IntentMissingError); ok {
   603  		log.VEventf(ctx, 2, "transforming intent missing error into retry: %v", ime)
   604  		err := roachpb.NewTransactionRetryError(
   605  			roachpb.RETRY_ASYNC_WRITE_FAILURE, fmt.Sprintf("missing intent on: %s", ime.Key))
   606  		retryErr := roachpb.NewErrorWithTxn(err, pErr.GetTxn())
   607  		retryErr.Index = pErr.Index
   608  		return retryErr
   609  	}
   610  	return pErr
   611  }
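
         // As an illustrative example: if the Put in the rewritten batch
         // [QueryIntent("a"), Get("a"), Put("c")] fails at index 2, adjustError
         // subtracts the one preceding QueryIntent and reports index 1, which is
         // where the Put sat in the client's original batch.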
   612  
   613  // setWrapped implements the txnInterceptor interface.
   614  func (tp *txnPipeliner) setWrapped(wrapped lockedSender) {
   615  	tp.wrapped = wrapped
   616  }
   617  
   618  // populateLeafInputState is part of the txnInterceptor interface.
   619  func (tp *txnPipeliner) populateLeafInputState(tis *roachpb.LeafTxnInputState) {
   620  	tis.InFlightWrites = tp.ifWrites.asSlice()
   621  }
   622  
   623  // initializeLeaf loads the in-flight writes for a leaf transaction.
   624  func (tp *txnPipeliner) initializeLeaf(tis *roachpb.LeafTxnInputState) {
   625  	// Copy all in-flight writes into the inFlightWrite tree.
   626  	for _, w := range tis.InFlightWrites {
   627  		tp.ifWrites.insert(w.Key, w.Sequence)
   628  	}
   629  }
   630  
   631  // populateLeafFinalState is part of the txnInterceptor interface.
   632  func (tp *txnPipeliner) populateLeafFinalState(*roachpb.LeafTxnFinalState) {}
   633  
   634  // importLeafFinalState is part of the txnInterceptor interface.
   635  func (tp *txnPipeliner) importLeafFinalState(context.Context, *roachpb.LeafTxnFinalState) {}
   636  
    637  // epochBumpedLocked implements the txnInterceptor interface.
   638  func (tp *txnPipeliner) epochBumpedLocked() {
   639  	// Move all in-flight writes into the lock footprint. These writes no longer
   640  	// need to be tracked precisely, but we don't want to forget about them and
   641  	// fail to clean them up.
   642  	if tp.ifWrites.len() > 0 {
   643  		tp.ifWrites.ascend(func(w *inFlightWrite) {
   644  			tp.lockFootprint.insert(roachpb.Span{Key: w.Key})
   645  		})
   646  		tp.lockFootprint.mergeAndSort()
   647  		tp.ifWrites.clear(true /* reuse */)
   648  	}
   649  }
   650  
    651  // createSavepointLocked is part of the txnInterceptor interface.
   652  func (tp *txnPipeliner) createSavepointLocked(context.Context, *savepoint) {}
   653  
    654  // rollbackToSavepointLocked is part of the txnInterceptor interface.
   655  func (tp *txnPipeliner) rollbackToSavepointLocked(ctx context.Context, s savepoint) {
   656  	// Move all the writes in txnPipeliner that are not in the savepoint to the
    657  	// lock footprint. We no longer care if these writes succeed or fail, so we're
   658  	// going to stop tracking these as in-flight writes. The respective intents
   659  	// still need to be cleaned up at the end of the transaction.
   660  	var writesToDelete []*inFlightWrite
   661  	needCollecting := !s.Initial()
   662  	tp.ifWrites.ascend(func(w *inFlightWrite) {
   663  		if w.Sequence > s.seqNum {
   664  			tp.lockFootprint.insert(roachpb.Span{Key: w.Key})
   665  			if needCollecting {
   666  				writesToDelete = append(writesToDelete, w)
   667  			}
   668  		}
   669  	})
   670  	tp.lockFootprint.mergeAndSort()
   671  
   672  	// Restore the inflight writes from the savepoint (minus the ones that have
   673  	// been verified in the meantime) by removing all the extra ones.
   674  	if needCollecting {
   675  		for _, ifw := range writesToDelete {
   676  			tp.ifWrites.remove(ifw.Key, ifw.Sequence)
   677  		}
   678  	} else {
   679  		tp.ifWrites.clear(true /* reuse */)
   680  	}
   681  }
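
         // As an illustrative example: with in-flight writes at sequences 1, 2, and
         // 3 and a savepoint captured at seqNum 1, rolling back moves the writes at
         // sequences 2 and 3 into the lock footprint and leaves only sequence 1
         // in-flight.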
   682  
    683  // closeLocked implements the txnInterceptor interface.
   684  func (tp *txnPipeliner) closeLocked() {}
   685  
   686  // hasAcquiredLocks returns whether the interceptor has made an attempt to
   687  // acquire any locks, whether doing so was known to be successful or not.
   688  func (tp *txnPipeliner) hasAcquiredLocks() bool {
   689  	return tp.ifWrites.len() > 0 || !tp.lockFootprint.empty()
   690  }
   691  
    692  // An inFlightWrite represents a commitment to proving (via QueryIntent) that
   693  // a point write succeeded in replicating an intent with a specific sequence
   694  // number.
   695  type inFlightWrite struct {
   696  	roachpb.SequencedWrite
   697  }
   698  
   699  // Less implements the btree.Item interface.
   700  func (a *inFlightWrite) Less(b btree.Item) bool {
   701  	return a.Key.Compare(b.(*inFlightWrite).Key) < 0
   702  }
   703  
   704  // inFlightWriteSet is an ordered set of in-flight point writes. Given a set
   705  // of n elements, the structure supports O(log n) insertion of new in-flight
   706  // writes, O(log n) removal of existing in-flight writes, and O(m + log n)
   707  // retrieval over m in-flight writes that overlap with a given key.
   708  type inFlightWriteSet struct {
   709  	t     *btree.BTree
   710  	bytes int64
   711  
   712  	// Avoids allocs.
   713  	tmp1, tmp2 inFlightWrite
   714  	alloc      inFlightWriteAlloc
   715  }
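
         // A minimal usage sketch (keys and sequence numbers made up):
         //
         //   var s inFlightWriteSet
         //   s.insert(roachpb.Key("a"), 1)
         //   s.insert(roachpb.Key("b"), 2)
         //   s.ascendRange(roachpb.Key("a"), roachpb.Key("b"), func(w *inFlightWrite) {
         //   	// Visits only the write to "a"; the end key is exclusive.
         //   })
         //   s.remove(roachpb.Key("a"), 1)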
   716  
   717  // insert attempts to insert an in-flight write that has not been proven to have
    718  // succeeded into the in-flight write set. If a write with an equal or larger
   719  // sequence number already exists in the set, the method is a no-op.
   720  func (s *inFlightWriteSet) insert(key roachpb.Key, seq enginepb.TxnSeq) {
   721  	if s.t == nil {
   722  		// Lazily initialize btree.
   723  		s.t = btree.New(txnPipelinerBtreeDegree)
   724  	}
   725  
   726  	s.tmp1.Key = key
   727  	item := s.t.Get(&s.tmp1)
   728  	if item != nil {
   729  		otherW := item.(*inFlightWrite)
   730  		if seq > otherW.Sequence {
   731  			// Existing in-flight write has old information.
   732  			otherW.Sequence = seq
   733  		}
   734  		return
   735  	}
   736  
   737  	w := s.alloc.alloc(key, seq)
   738  	s.t.ReplaceOrInsert(w)
   739  	s.bytes += keySize(key)
   740  }
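
         // As an illustrative example: inserting ("a", 1) and then ("a", 3) leaves a
         // single element {Key: "a", Sequence: 3}; a subsequent insert of ("a", 2)
         // is a no-op because the tracked sequence number is already larger.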
   741  
   742  // remove attempts to remove an in-flight write from the in-flight write set.
   743  // The method will be a no-op if the write was already proved. Care is taken
   744  // not to accidentally remove a write to the same key but at a later epoch or
   745  // sequence number.
   746  func (s *inFlightWriteSet) remove(key roachpb.Key, seq enginepb.TxnSeq) {
   747  	if s.len() == 0 {
   748  		// Set is empty.
   749  		return
   750  	}
   751  
   752  	s.tmp1.Key = key
   753  	item := s.t.Get(&s.tmp1)
   754  	if item == nil {
   755  		// The write was already proven or the txn epoch was incremented.
   756  		return
   757  	}
   758  
   759  	w := item.(*inFlightWrite)
   760  	if seq < w.Sequence {
   761  		// The sequence might have changed, which means that a new write was
   762  		// sent to the same key. This write would have been forced to prove
    763  		// the existence of the current write already.
   764  		return
   765  	}
   766  
   767  	// Delete the write from the in-flight writes set.
   768  	delItem := s.t.Delete(item)
   769  	if delItem != nil {
   770  		*delItem.(*inFlightWrite) = inFlightWrite{} // for GC
   771  	}
   772  	s.bytes -= keySize(key)
   773  
   774  	// Assert that the byte accounting is believable.
   775  	if s.bytes < 0 {
   776  		panic("negative in-flight write size")
   777  	} else if s.t.Len() == 0 && s.bytes != 0 {
   778  		panic("non-zero in-flight write size with 0 in-flight writes")
   779  	}
   780  }
   781  
   782  // ascend calls the provided function for every write in the set.
   783  func (s *inFlightWriteSet) ascend(f func(w *inFlightWrite)) {
   784  	if s.len() == 0 {
   785  		// Set is empty.
   786  		return
   787  	}
   788  	s.t.Ascend(func(i btree.Item) bool {
   789  		f(i.(*inFlightWrite))
   790  		return true
   791  	})
   792  }
   793  
   794  // ascendRange calls the provided function for every write in the set
   795  // with a key in the range [start, end).
   796  func (s *inFlightWriteSet) ascendRange(start, end roachpb.Key, f func(w *inFlightWrite)) {
   797  	if s.len() == 0 {
   798  		// Set is empty.
   799  		return
   800  	}
   801  	if end == nil {
   802  		// Point lookup.
   803  		s.tmp1.Key = start
   804  		if i := s.t.Get(&s.tmp1); i != nil {
   805  			f(i.(*inFlightWrite))
   806  		}
   807  	} else {
   808  		// Range lookup.
   809  		s.tmp1.Key, s.tmp2.Key = start, end
   810  		s.t.AscendRange(&s.tmp1, &s.tmp2, func(i btree.Item) bool {
   811  			f(i.(*inFlightWrite))
   812  			return true
   813  		})
   814  	}
   815  }
   816  
    817  // len returns the number of in-flight writes in the set.
   818  func (s *inFlightWriteSet) len() int {
   819  	if s.t == nil {
   820  		return 0
   821  	}
   822  	return s.t.Len()
   823  }
   824  
   825  // byteSize returns the size in bytes of the in-flight writes in the set.
   826  func (s *inFlightWriteSet) byteSize() int64 {
   827  	return s.bytes
   828  }
   829  
   830  // clear purges all elements from the in-flight write set and frees associated
   831  // memory. The reuse flag indicates whether the caller is intending to reuse
   832  // the set or not.
   833  func (s *inFlightWriteSet) clear(reuse bool) {
   834  	if s.t == nil {
   835  		return
   836  	}
   837  	s.t.Clear(reuse /* addNodesToFreelist */)
   838  	s.bytes = 0
   839  	s.alloc.clear()
   840  }
   841  
   842  // asSlice returns the in-flight writes, ordered by key.
   843  func (s *inFlightWriteSet) asSlice() []roachpb.SequencedWrite {
   844  	l := s.len()
   845  	if l == 0 {
   846  		return nil
   847  	}
   848  	writes := make([]roachpb.SequencedWrite, 0, l)
   849  	s.ascend(func(w *inFlightWrite) {
   850  		writes = append(writes, w.SequencedWrite)
   851  	})
   852  	return writes
   853  }
   854  
   855  // inFlightWriteAlloc provides chunk allocation of inFlightWrites,
   856  // amortizing the overhead of each allocation.
   857  type inFlightWriteAlloc []inFlightWrite
   858  
   859  // alloc allocates a new inFlightWrite with the specified key and sequence
   860  // number.
   861  func (a *inFlightWriteAlloc) alloc(key roachpb.Key, seq enginepb.TxnSeq) *inFlightWrite {
   862  	// If the current alloc slice has no extra capacity, reallocate a new chunk.
   863  	if cap(*a)-len(*a) == 0 {
   864  		const chunkAllocMinSize = 4
   865  		const chunkAllocMaxSize = 1024
   866  
   867  		allocSize := cap(*a) * 2
   868  		if allocSize < chunkAllocMinSize {
   869  			allocSize = chunkAllocMinSize
   870  		} else if allocSize > chunkAllocMaxSize {
   871  			allocSize = chunkAllocMaxSize
   872  		}
   873  		*a = make([]inFlightWrite, 0, allocSize)
   874  	}
   875  
   876  	*a = (*a)[:len(*a)+1]
   877  	w := &(*a)[len(*a)-1]
   878  	*w = inFlightWrite{
   879  		SequencedWrite: roachpb.SequencedWrite{Key: key, Sequence: seq},
   880  	}
   881  	return w
   882  }
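
         // As an illustrative example: the first alloc carves out a chunk of 4
         // writes, the fifth alloc grows the chunk to 8, and so on, doubling up to
         // the 1024-element cap so that allocation overhead is amortized across many
         // in-flight writes.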
   883  
   884  // clear removes all allocated in-flight writes and attempts to reclaim as
   885  // much allocated memory as possible.
   886  func (a *inFlightWriteAlloc) clear() {
   887  	for i := range *a {
   888  		(*a)[i] = inFlightWrite{} // for GC
   889  	}
   890  	*a = (*a)[:0]
   891  }