github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_batch_updates.go

// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/errors"
)

// ----------------------------------------------------------------------------
// This file contains functions that perform updates to a BatchRequest on the
// server side, specifically after the request has been routed to a replica
// (and thus has been split based on range boundaries). As per the
// client.Sender contract, these functions need to consider the input batches
// as copy-on-write.
// ----------------------------------------------------------------------------
    31  
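// A minimal illustration of the copy-on-write contract above (drawn from the
// code in maybeStripInFlightWrites below, not from the original commentary):
// to modify a request, clone both the requests slice and the request itself
// rather than mutating ba in place:
//
//	et = origET.ShallowCopy().(*roachpb.EndTxnRequest)
//	ba.Requests = append([]roachpb.RequestUnion(nil), ba.Requests...)
//	ba.Requests[len(ba.Requests)-1].MustSetInner(et)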

// maybeStripInFlightWrites attempts to remove all point writes and query
// intents that ended up in the same batch as an EndTxn request from that EndTxn
// request's "in-flight" write set. The entire batch will commit atomically, so
// there is no need to consider the writes in the same batch concurrent.
//
// The transformation can lead to bypassing the STAGING state for a transaction
// entirely. This is possible if the function removes all of the in-flight
// writes from an EndTxn request that was committing in parallel with writes
// which all happened to be on the same range as the transaction record.
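//
// For example (an illustration, not from the original comment): a batch
// [Put(a), Put(b), EndTxn] whose EndTxn lists in-flight writes on keys a and
// b commits atomically on this range, so both writes can be moved into the
// EndTxn's lock spans, and the transaction record can move straight to
// COMMITTED without an intermediate STAGING record.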
func maybeStripInFlightWrites(ba *roachpb.BatchRequest) (*roachpb.BatchRequest, error) {
	args, hasET := ba.GetArg(roachpb.EndTxn)
	if !hasET {
		return ba, nil
	}

	et := args.(*roachpb.EndTxnRequest)
	otherReqs := ba.Requests[:len(ba.Requests)-1]
	if !et.IsParallelCommit() || len(otherReqs) == 0 {
		return ba, nil
	}

	// Clone the BatchRequest and the EndTxn request before modifying it. We nil
	// out the request's in-flight writes and make the lock spans immutable on
	// append. Code below can use origET to recreate the in-flight write set if
	// any elements remain in it.
	origET := et
	et = origET.ShallowCopy().(*roachpb.EndTxnRequest)
	et.InFlightWrites = nil
	et.LockSpans = et.LockSpans[:len(et.LockSpans):len(et.LockSpans)] // immutable
	ba.Requests = append([]roachpb.RequestUnion(nil), ba.Requests...)
	ba.Requests[len(ba.Requests)-1].MustSetInner(et)

	// Fast-path: If we know that this batch contains all of the transaction's
	// in-flight writes, then we can avoid searching in the in-flight writes set
	// for each request. Instead, we can blindly merge all in-flight writes into
	// the lock spans and clear out the in-flight writes set.
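	//
	// For instance (illustrative): a batch [Put(a), QueryIntent(b), EndTxn]
	// counts writes == 2; if the EndTxn carries exactly two in-flight writes,
	// every in-flight write must be in this batch, so all of them can be
	// merged into the lock spans at once.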
	if len(otherReqs) >= len(origET.InFlightWrites) {
		writes := 0
		for _, ru := range otherReqs {
			req := ru.GetInner()
			switch {
			case roachpb.IsIntentWrite(req) && !roachpb.IsRange(req):
				// Concurrent point write.
				writes++
			case req.Method() == roachpb.QueryIntent:
				// Earlier pipelined point write that hasn't been proven yet.
				writes++
			default:
				// Ranged write or read. See below.
			}
		}
		if len(origET.InFlightWrites) < writes {
			return ba, errors.New("more writes in batch with EndTxn than listed in in-flight writes")
		} else if len(origET.InFlightWrites) == writes {
			et.LockSpans = make([]roachpb.Span, len(origET.LockSpans)+len(origET.InFlightWrites))
			copy(et.LockSpans, origET.LockSpans)
			for i, w := range origET.InFlightWrites {
				et.LockSpans[len(origET.LockSpans)+i] = roachpb.Span{Key: w.Key}
			}
			// See below for why we set Header.DistinctSpans here.
			et.LockSpans, ba.Header.DistinctSpans = roachpb.MergeSpans(et.LockSpans)
			return ba, nil
		}
	}

	// Slow-path: If the fast-path did not apply, remove each transaction write
	// in the batch from the in-flight write set and merge it into the lock
	// spans.
	copiedTo := 0
	for _, ru := range otherReqs {
		req := ru.GetInner()
		seq := req.Header().Sequence
		switch {
		case roachpb.IsIntentWrite(req) && !roachpb.IsRange(req):
			// Concurrent point write.
		case req.Method() == roachpb.QueryIntent:
			// Earlier pipelined point write that hasn't been proven yet. We
			// could remove from the in-flight writes set when we see these,
			// but doing so would prevent us from using the optimization we
			// have below where we rely on increasing sequence numbers for
			// each subsequent request.
			//
			// We already don't intend to send QueryIntent requests in the
			// same batch as EndTxn requests because doing so causes a pipeline
			// stall, so this doesn't seem worthwhile to support.
			continue
		default:
			// Ranged write or read. These can make it into the final batch with
			// a parallel committing EndTxn request if the entire batch issued
			// by DistSender lands on the same range. Skip.
			continue
		}

		// Remove the write from the in-flight writes set. We only need to
		// search from after the previously removed sequence number forward
		// because both the InFlightWrites and the Requests in the batch are
		// stored in increasing sequence order.
		//
		// Maintaining an iterator into the in-flight writes slice and scanning
		// instead of performing a binary search on each request changes the
		// complexity of this loop from O(n*log(m)) to O(m), where n is the
		// number of point writes in the batch and m is the number of in-flight
		// writes. These complexities aren't directly comparable, but copying
		// all unstripped writes back into et.InFlightWrites is already O(m),
		// so the approach here was preferred over repeated binary searches.
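		//
		// For example (illustrative): with batch point writes at sequences
		// {2, 5} and in-flight writes at sequences {1, 2, 3, 5}, the scan
		// copies {1} and then {3} into et.InFlightWrites while the keys of
		// {2} and {5} are moved into the lock spans.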
		match := -1
		for i, w := range origET.InFlightWrites[copiedTo:] {
			if w.Sequence == seq {
				match = i + copiedTo
				break
			}
		}
		if match == -1 {
			return ba, errors.New("write in batch with EndTxn missing from in-flight writes")
		}
		w := origET.InFlightWrites[match]
		notInBa := origET.InFlightWrites[copiedTo:match]
		et.InFlightWrites = append(et.InFlightWrites, notInBa...)
		copiedTo = match + 1

		// Move the write to the lock spans set since it's no
		// longer being tracked in the in-flight write set.
		et.LockSpans = append(et.LockSpans, roachpb.Span{Key: w.Key})
	}
	if et != origET {
		// Finish building up the remaining in-flight writes.
		notInBa := origET.InFlightWrites[copiedTo:]
		et.InFlightWrites = append(et.InFlightWrites, notInBa...)
		// Re-sort and merge the lock spans. We can set the batch request's
		// DistinctSpans flag based on whether any of the in-flight writes in
		// this batch overlap with each other. This will have (rare) false
		// negatives when the in-flight writes overlap with existing lock
		// spans, but never false positives.
		et.LockSpans, ba.Header.DistinctSpans = roachpb.MergeSpans(et.LockSpans)
	}
	return ba, nil
}

// maybeBumpReadTimestampToWriteTimestamp bumps the batch's read timestamp to
// the write timestamp for transactional batches where these timestamps have
// diverged and where bumping is possible. When possible, this allows the
// transaction to commit without having to retry.
//
// Returns true if the timestamp was bumped.
//
// Note that this, like all server-side bumping of the read timestamp, only
// works for batches that exclusively contain writes; reads cannot be bumped
// like this because they've already acquired timestamp-aware latches.
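//
// For example (an illustration, not from the original comment): a transaction
// that read at timestamp 10 but whose writes were pushed to timestamp 15 can
// have its read timestamp forwarded to 15 here, provided the commit timestamp
// can be forwarded without a refresh and the EndTxn deadline (if any) is not
// exceeded, sparing the client a retry.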
func maybeBumpReadTimestampToWriteTimestamp(
	ctx context.Context, ba *roachpb.BatchRequest, latchSpans *spanset.SpanSet,
) bool {
	if ba.Txn == nil {
		return false
	}
	if ba.Txn.ReadTimestamp.Equal(ba.Txn.WriteTimestamp) {
		return false
	}
	arg, ok := ba.GetArg(roachpb.EndTxn)
	if !ok {
		return false
	}
	etArg := arg.(*roachpb.EndTxnRequest)
	if batcheval.CanForwardCommitTimestampWithoutRefresh(ba.Txn, etArg) &&
		!batcheval.IsEndTxnExceedingDeadline(ba.Txn.WriteTimestamp, etArg) {
		return tryBumpBatchTimestamp(ctx, ba, ba.Txn.WriteTimestamp, latchSpans)
	}
	return false
}

// tryBumpBatchTimestamp attempts to bump ba's read and write timestamps to ts.
//
// Returns true if the timestamps were bumped; false if they could not be.
func tryBumpBatchTimestamp(
	ctx context.Context, ba *roachpb.BatchRequest, ts hlc.Timestamp, latchSpans *spanset.SpanSet,
) bool {
	if latchSpans.MaxProtectedTimestamp().Less(ts) {
		// If the batch acquired any read latches with bounded (MVCC) timestamps
		// below this new timestamp then we cannot trivially bump the batch's
		// timestamp without dropping and re-acquiring those latches. Doing so
		// could allow the request to read at an unprotected timestamp.
		//
		// NOTE: we could consider adding a retry-loop above the latch
		// acquisition to allow this to be retried, but given that we try not to
		// mix read-only and read-write requests, doing so doesn't seem worth
		// it.
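		//
		// For example (illustrative): a read latch acquired at timestamp 10
		// protects reads only up to 10; bumping the batch to 12 without
		// re-acquiring latches would let it read at 12, where no latch
		// protects it.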
		return false
	}
	if ts.Less(ba.Timestamp) {
		log.Fatalf(ctx, "trying to bump to %s < ba.Timestamp: %s", ts, ba.Timestamp)
	}
	ba.Timestamp = ts
	if txn := ba.Txn; txn == nil {
		return true
	}
	if ts.Less(ba.Txn.ReadTimestamp) || ts.Less(ba.Txn.WriteTimestamp) {
		log.Fatalf(ctx, "trying to bump to %s inconsistent with ba.Txn.ReadTimestamp: %s, "+
			"ba.Txn.WriteTimestamp: %s", ts, ba.Txn.ReadTimestamp, ba.Txn.WriteTimestamp)
	}
	log.VEventf(ctx, 2, "bumping batch timestamp to: %s from read: %s, write: %s",
		ts, ba.Txn.ReadTimestamp, ba.Txn.WriteTimestamp)
	ba.Txn = ba.Txn.Clone()
	ba.Txn.ReadTimestamp = ts
	ba.Txn.WriteTimestamp = ts
	ba.Txn.WriteTooOld = false
	return true
}
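
// The sketch below is hypothetical and not part of the original file. It
// illustrates the contract of tryBumpBatchTimestamp, mirroring the checks in
// maybeBumpReadTimestampToWriteTimestamp above: on success, ba.Txn has been
// cloned and both of its timestamps forwarded; on failure, ba is untouched
// and the client must refresh and retry instead.
func exampleTryBumpUsage(
	ctx context.Context, ba *roachpb.BatchRequest, latchSpans *spanset.SpanSet,
) bool {
	if ba.Txn == nil || ba.Txn.ReadTimestamp.Equal(ba.Txn.WriteTimestamp) {
		// Nothing to do: non-transactional batch or no divergence to repair.
		return false
	}
	if !tryBumpBatchTimestamp(ctx, ba, ba.Txn.WriteTimestamp, latchSpans) {
		// A read latch protects a timestamp below the target; bail out.
		return false
	}
	// The batch and its (cloned) transaction now evaluate at the former
	// write timestamp, avoiding a client-side retry.
	log.VEventf(ctx, 2, "example: bumped batch to %s", ba.Timestamp)
	return true
}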