github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_tscache.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/rditer"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/tscache"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/server/telemetry"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
)

// setTimestampCacheLowWaterMark updates the low water mark of the timestamp
// cache to the provided timestamp for all key ranges owned by the provided
// Range descriptor. This ensures that no future writes in either the local or
// global keyspace are allowed at times equal to or earlier than this timestamp,
// which could invalidate prior reads.
func setTimestampCacheLowWaterMark(
	tc tscache.Cache, desc *roachpb.RangeDescriptor, ts hlc.Timestamp,
) {
	for _, keyRange := range rditer.MakeReplicatedKeyRanges(desc) {
		tc.SetLowWater(keyRange.Start.Key, keyRange.End.Key, ts)
	}
}
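// What follows is an illustrative sketch, not part of the original file: it
// shows how the low water mark set above interacts with later lookups. The
// function name and scenario are hypothetical; the tscache.Cache calls
// (SetLowWater, GetMax) are the same ones used elsewhere in this file.
func exampleLowWaterMarkSketch(
	tc tscache.Cache, desc *roachpb.RangeDescriptor, leaseStart hlc.Timestamp,
) {
	// Bump the low water mark across all of the range's replicated key
	// ranges, as is done after a lease transfer.
	setTimestampCacheLowWaterMark(tc, desc, leaseStart)
	// Any subsequent lookup over the range's keyspan now reports a timestamp
	// of at least leaseStart, even for keys that were never read, so writes
	// below leaseStart are forced to higher timestamps.
	ts, _ := tc.GetMax(desc.StartKey.AsRawKey(), desc.EndKey.AsRawKey())
	_ = ts // ts >= leaseStart
}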
// updateTimestampCache updates the timestamp cache in order to set a low water
// mark for the timestamp at which mutations to keys overlapping the provided
// request can write, such that they don't re-write history.
func (r *Replica) updateTimestampCache(
	ctx context.Context, ba *roachpb.BatchRequest, br *roachpb.BatchResponse, pErr *roachpb.Error,
) {
	if ba.ReadConsistency != roachpb.CONSISTENT {
		// Inconsistent reads are excluded from the timestamp cache.
		return
	}
	addToTSCache := r.store.tsCache.Add
	if util.RaceEnabled {
		addToTSCache = checkedTSCacheUpdate(r.store.Clock().Now(), r.store.tsCache, ba, br, pErr)
	}
	// Update the timestamp cache using the timestamp at which the batch
	// was executed. Note this may have moved forward from ba.Timestamp,
	// as when the request is retried locally on WriteTooOldErrors.
	ts := ba.Timestamp
	if br != nil {
		ts = br.Timestamp
	}
	var txnID uuid.UUID
	if ba.Txn != nil {
		txnID = ba.Txn.ID
	}
	for i, union := range ba.Requests {
		args := union.GetInner()
		if !roachpb.UpdatesTimestampCache(args) {
			continue
		}
		// Skip the update if there's an error and it's not for this index,
		// or if the request doesn't update the timestamp cache on errors.
		if pErr != nil {
			if index := pErr.Index; !roachpb.UpdatesTimestampCacheOnError(args) ||
				index == nil || int32(i) != index.Index {
				continue
			}
		}
		header := args.Header()
		start, end := header.Key, header.EndKey
		switch t := args.(type) {
		case *roachpb.EndTxnRequest:
			// EndTxn requests that finalize their transaction record a
			// tombstone in the timestamp cache to ensure that replays and
			// concurrent requests aren't able to recreate the transaction
			// record.
			//
			// It inserts the timestamp of the final batch in the transaction.
			// This timestamp must necessarily be equal to or greater than the
			// transaction's MinTimestamp, which is consulted in
			// CanCreateTxnRecord.
			if br.Txn.Status.IsFinalized() {
				key := transactionTombstoneMarker(start, txnID)
				addToTSCache(key, nil, ts, txnID)
			}
		case *roachpb.RecoverTxnRequest:
			// A successful RecoverTxn request may or may not have finalized the
			// transaction that it was trying to recover. If so, then we record
			// a tombstone to the timestamp cache to ensure that replays and
			// concurrent requests aren't able to recreate the transaction
			// record. This parallels what we do in the EndTxn request case.
			//
			// Insert the timestamp of the batch, which we asserted during
			// command evaluation was equal to or greater than the transaction's
			// MinTimestamp.
			recovered := br.Responses[i].GetInner().(*roachpb.RecoverTxnResponse).RecoveredTxn
			if recovered.Status.IsFinalized() {
				key := transactionTombstoneMarker(start, recovered.ID)
				addToTSCache(key, nil, ts, recovered.ID)
			}
		case *roachpb.PushTxnRequest:
			// A successful PushTxn request bumps the timestamp cache for
			// the pushee's transaction key. The pushee will consult the
			// timestamp cache when creating its record. If the push left
			// the transaction in a PENDING state (PUSH_TIMESTAMP) then we
			// update the push marker in the timestamp cache. This will cause
			// the creator of the transaction record to forward its provisional
			// commit timestamp to honor the result of this push. If the push
			// left the transaction in an ABORTED state (PUSH_ABORT) then we
			// update the tombstone marker in the timestamp cache. This will
			// prevent the creation of the transaction record entirely.
			pushee := br.Responses[i].GetInner().(*roachpb.PushTxnResponse).PusheeTxn

			var tombstone bool
			switch pushee.Status {
			case roachpb.PENDING:
				tombstone = false
			case roachpb.ABORTED:
				tombstone = true
			case roachpb.STAGING:
				// No need to update the timestamp cache. If a transaction
				// is in this state then it must have a transaction record.
				continue
			case roachpb.COMMITTED:
				// No need to update the timestamp cache. It was already
				// updated by the corresponding EndTxn request.
				continue
			}

			var key roachpb.Key
			if tombstone {
				key = transactionTombstoneMarker(start, pushee.ID)
			} else {
				key = transactionPushMarker(start, pushee.ID)
			}
			addToTSCache(key, nil, pushee.WriteTimestamp, t.PusherTxn.ID)
		case *roachpb.ConditionalPutRequest:
			// ConditionalPut only updates on ConditionFailedErrors. On other
			// errors, no information is returned. On successful writes, the
			// intent already protects against writes underneath the read.
			if _, ok := pErr.GetDetail().(*roachpb.ConditionFailedError); ok {
				addToTSCache(start, end, ts, txnID)
			}
		case *roachpb.InitPutRequest:
			// InitPut only updates on ConditionFailedErrors. On other errors,
			// no information is returned. On successful writes, the intent
			// already protects against writes underneath the read.
			if _, ok := pErr.GetDetail().(*roachpb.ConditionFailedError); ok {
				addToTSCache(start, end, ts, txnID)
			}
		case *roachpb.ScanRequest:
			resp := br.Responses[i].GetInner().(*roachpb.ScanResponse)
			if resp.ResumeSpan != nil {
				// Note that for forward scans, the resume span will start at
				// the (last key read).Next(), which is actually the correct
				// end key for the span to update the timestamp cache.
				end = resp.ResumeSpan.Key
			}
			addToTSCache(start, end, ts, txnID)
		case *roachpb.ReverseScanRequest:
			resp := br.Responses[i].GetInner().(*roachpb.ReverseScanResponse)
			if resp.ResumeSpan != nil {
				// Note that for reverse scans, the resume span's end key is
				// an open interval. That means it was read as part of this op
				// and won't be read on resume. It is the correct start key for
				// the span to update the timestamp cache.
				start = resp.ResumeSpan.EndKey
			}
			addToTSCache(start, end, ts, txnID)
		case *roachpb.QueryIntentRequest:
			missing := false
			if pErr != nil {
				switch t := pErr.GetDetail().(type) {
				case *roachpb.IntentMissingError:
					missing = true
				case *roachpb.TransactionRetryError:
					// QueryIntent will return a TxnRetry(SERIALIZABLE) error
					// if a transaction is querying its own intent and finds
					// it pushed.
					//
					// NB: we check the index of the error above, so this
					// TransactionRetryError should indicate a missing intent
					// from the QueryIntent request. However, bumping the
					// timestamp cache wouldn't cause a correctness issue
					// if we found the intent.
					missing = t.Reason == roachpb.RETRY_SERIALIZABLE
				}
			} else {
				missing = !br.Responses[i].GetInner().(*roachpb.QueryIntentResponse).FoundIntent
			}
			if missing {
				// If the QueryIntent determined that the intent is missing
				// then we update the timestamp cache at the intent's key to
				// the intent's transactional timestamp. This will prevent
				// the intent from ever being written in the future. We use
				// an empty transaction ID so that we block the intent
				// regardless of whether it is part of the current batch's
				// transaction or not.
				addToTSCache(start, end, t.Txn.WriteTimestamp, uuid.UUID{})
			}
		default:
			addToTSCache(start, end, ts, txnID)
		}
	}
}
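// Illustrative sketch, not part of the original file: the marker-key decision
// made in the PushTxn case above, extracted into a standalone helper. The
// function name is hypothetical; transactionTombstoneMarker and
// transactionPushMarker are defined at the bottom of this file. A
// PUSH_TIMESTAMP that leaves the pushee PENDING bumps the push marker (the
// record, once created, must forward its timestamp); a PUSH_ABORT that leaves
// it ABORTED bumps the tombstone marker (the record must never be created).
func examplePushMarkerChoice(pushee roachpb.Transaction) (roachpb.Key, bool) {
	switch pushee.Status {
	case roachpb.PENDING:
		return transactionPushMarker(pushee.Key, pushee.ID), true
	case roachpb.ABORTED:
		return transactionTombstoneMarker(pushee.Key, pushee.ID), true
	default:
		// STAGING and COMMITTED pushes don't update the cache here.
		return nil, false
	}
}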
// checkedTSCacheUpdate wraps tscache.Cache and asserts that any update to the
// cache is at or below the specified time.
func checkedTSCacheUpdate(
	now hlc.Timestamp,
	tc tscache.Cache,
	ba *roachpb.BatchRequest,
	br *roachpb.BatchResponse,
	pErr *roachpb.Error,
) func(roachpb.Key, roachpb.Key, hlc.Timestamp, uuid.UUID) {
	return func(start, end roachpb.Key, ts hlc.Timestamp, txnID uuid.UUID) {
		if now.Less(ts) {
			panic(fmt.Sprintf("Unsafe timestamp cache update! Cannot add timestamp %s to timestamp "+
				"cache after evaluating %v (resp=%v; err=%v) with local hlc clock at timestamp %s. "+
				"The timestamp cache update could be lost on a lease transfer.", ts, ba, br, pErr, now))
		}
		tc.Add(start, end, ts, txnID)
	}
}
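// Illustrative sketch, not part of the original file: how the assertion
// wrapper above is meant to be exercised. The function name is hypothetical.
// Adding a timestamp at or below the captured clock reading succeeds; adding
// one above it would panic, because such an update could be lost when a new
// leaseholder initializes its timestamp cache from its own clock.
func exampleCheckedUpdateSketch(r *Replica, ba *roachpb.BatchRequest) {
	now := r.store.Clock().Now()
	add := checkedTSCacheUpdate(now, r.store.tsCache, ba, nil /* br */, nil /* pErr */)
	// Safe: now is not less than now.
	add(roachpb.Key("a"), roachpb.Key("b"), now, uuid.UUID{})
	// add(roachpb.Key("a"), roachpb.Key("b"), now.Next(), uuid.UUID{}) would panic.
}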
// batchesPushedDueToClosedTimestamp is a telemetry counter for the number of
// batch requests which have been pushed due to the closed timestamp.
var batchesPushedDueToClosedTimestamp telemetry.Counter

func init() {
	batchesPushedDueToClosedTimestamp = telemetry.GetCounter("kv.closed_timestamp.txns_pushed")
}

// applyTimestampCache moves the batch timestamp forward depending on
// the presence of overlapping entries in the timestamp cache. If the
// batch is transactional, the txn timestamp and the txn.WriteTooOld
// bool are updated.
//
// Two important invariants of Cockroach: 1) encountering a more
// recently written value means transaction restart. 2) values must
// be written with a greater timestamp than the most recent read to
// the same key. Check the timestamp cache for reads/writes which
// are at least as recent as the timestamp of this write. The cmd must
// update its timestamp to be greater than more recent values in the
// timestamp cache. When the write returns, the updated timestamp
// will inform the batch response timestamp or batch response txn
// timestamp.
//
// minReadTS is used as a per-request low water mark for the value returned
// from the timestamp cache. That is, if the timestamp cache returns a value
// below minReadTS, minReadTS (without an associated txn id) will be used
// instead to adjust the batch's timestamp.
func (r *Replica) applyTimestampCache(
	ctx context.Context, ba *roachpb.BatchRequest, minReadTS hlc.Timestamp,
) bool {
	// bumpedDueToMinReadTS is set to true if the highest timestamp bump
	// encountered below is due to the minReadTS.
	var bumpedDueToMinReadTS bool
	var bumped bool

	for _, union := range ba.Requests {
		args := union.GetInner()
		if roachpb.ConsultsTimestampCache(args) {
			header := args.Header()

			// Forward the timestamp if there's been a more recent read (by
			// someone else).
			rTS, rTxnID := r.store.tsCache.GetMax(header.Key, header.EndKey)
			var forwardedToMinReadTS bool
			if rTS.Forward(minReadTS) {
				forwardedToMinReadTS = true
				rTxnID = uuid.Nil
			}
			nextRTS := rTS.Next()
			var bumpedCurReq bool
			if ba.Txn != nil {
				if ba.Txn.ID != rTxnID {
					if ba.Txn.WriteTimestamp.Less(nextRTS) {
						txn := ba.Txn.Clone()
						bumpedCurReq = txn.WriteTimestamp.Forward(nextRTS)
						ba.Txn = txn
					}
				}
			} else {
				bumpedCurReq = ba.Timestamp.Forward(nextRTS)
			}
			// Preserve bumpedDueToMinReadTS if we did not just bump or set it
			// appropriately if we did.
			bumpedDueToMinReadTS = (!bumpedCurReq && bumpedDueToMinReadTS) ||
				(bumpedCurReq && forwardedToMinReadTS)
			bumped, bumpedCurReq = bumped || bumpedCurReq, false
		}
	}
	if bumpedDueToMinReadTS {
		telemetry.Inc(batchesPushedDueToClosedTimestamp)
	}
	return bumped
}
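// Illustrative sketch, not part of the original file: the core forwarding
// rule applied above, in isolation. The function name is hypothetical. A
// write must be strictly greater than the most recent conflicting read, so
// the cache's result rTS admits writes only at rTS.Next() or above; Forward
// reports whether the timestamp actually moved, mirroring bumpedCurReq.
func exampleForwardPastReadSketch(writeTS, rTS hlc.Timestamp) (hlc.Timestamp, bool) {
	bumped := writeTS.Forward(rTS.Next())
	return writeTS, bumped
}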
// CanCreateTxnRecord determines whether a transaction record can be created
// for the provided transaction information. Callers must provide the
// transaction's minimum timestamp across all epochs, along with its ID and
// its key.
//
// If the method returns true, it also returns the minimum provisional commit
// timestamp that the record can be created with. If the method returns false,
// it returns the reason that the transaction record was rejected. If the
// method ever determines that a transaction record must be rejected, it will
// continue to reject that transaction going forward.
//
// The method performs two critical roles:
//
// 1. It protects against replayed requests or new requests from a
//    transaction's coordinator that could otherwise cause a transaction
//    record to be created after the transaction has already been finalized
//    and its record cleaned up.
//
// 2. It serves as the mechanism by which successful push requests convey
//    information to transactions that have not yet written their transaction
//    record. In doing so, it ensures that transaction records are created
//    with a sufficiently high timestamp after a successful PushTxn(TIMESTAMP)
//    and ensures that transaction records are never created at all after a
//    successful PushTxn(ABORT). As a result of this mechanism, a transaction
//    never needs to explicitly create the transaction record for contending
//    transactions.
//
// This is detailed in the transaction record state machine below:
//
//  +----------------------------------------------------+
//  | vars                                               |
//  |----------------------------------------------------|
//  | v1 = tsCache[push_marker(txn.id)]      = timestamp |
//  | v2 = tsCache[tombstone_marker(txn.id)] = timestamp |
//  +----------------------------------------------------+
//  | operations                                         |
//  |----------------------------------------------------|
//  | v -> t = forward v by timestamp t                  |
//  +----------------------------------------------------+
//
//                       PushTxn(TIMESTAMP)                        HeartbeatTxn
//                       then: v1 -> push.ts                      then: update record
//                            +------+                                  +------+
//    PushTxn(ABORT)          |      |         HeartbeatTxn             |      |  PushTxn(TIMESTAMP)
//  then: v2 -> txn.ts        |      v         if: v2 < txn.orig        |      v  then: update record
//                      +-----------------+   then: txn.ts -> v1   +--------------------+
//                 +----|                 |   else: fail           |                    |----+
//                 |    |                 |----------------------->|                    |    |
//                 |    |  no txn record  |                        | txn record written |    |
//                 +--->|                 |   EndTxn(STAGING)      |     [pending]      |<---+
//                      |                 |__ if: v2 < txn.orig    |                    |
//                      +-----------------+ \__ then: txn.ts -> v1 +--------------------+
//                          |           ^     \__  else: fail   _/     |           ^
//                          |           |       \__          _/        |           |
// EndTxn(!STAGING)         |           |         \__     _/           |           | EndTxn(STAGING)
// if: v2 < txn.orig        |  Eager GC |           \____ _/______     |           |
// then: v2 -> txn.ts       |    or     |             _/         \     |           | HeartbeatTxn
// else: fail               |  GC queue |     /----------------/  |    |           | if: epoch update
//                          v           |    v  EndTxn(!STAGING)  v    v           |
//                        +--------------------+ or PushTxn(ABORT)  +--------------------+
//                        |                    | then: v2 -> txn.ts |                    |
//                   +--->|                    |<--------------------                    |----+
//                   |    | txn record written |                    | txn record written |    |
//                   |    |    [finalized]     |                    |      [staging]     |    |
//                   +----|                    |                    |                    |<---+
//        PushTxn(*)      +--------------------+                    +--------------------+
//        then: no-op             ^         PushTxn(*) + RecoverTxn          |  EndTxn(STAGING)
//                                |         then: v2 -> txn.ts               |  or HeartbeatTxn
//                                +-------------------------------------------+  then: update record
//
// In the diagram, CanCreateTxnRecord is consulted in all three of the
// state transitions that move away from the "no txn record" state.
// Updating v1 and v2 is performed in updateTimestampCache.
//
// There are three separate simplifications to the transaction model that
// would allow us to simplify this state machine:
//
// 1. as discussed in the comment on txnHeartbeater, it is reasonable to
//    expect that we will eventually move away from tracking transaction
//    liveness on a per-transaction basis. This means that we would no longer
//    need transaction heartbeats and would never need to write a transaction
//    record until a transaction is ready to complete.
//
// 2. one of the two possibilities for the "txn record written [finalized]"
//    state is that the transaction record is aborted. There used to be two
//    reasons to persist transaction records with the ABORTED status. The
//    first was that doing so was the only way for concurrent actors to
//    prevent the record from being re-written by the transaction going
//    forward. The concurrent actor would write an aborted transaction record
//    and then wait for the GC to clean it up later. The second reason for
//    writing transaction records with the ABORTED status was that these
//    records could point at intents, which assisted the cleanup process for
//    these intents. However, this only held for ABORTED records written on
//    behalf of the transaction coordinator itself. If a transaction was
//    aborted by a concurrent actor, its record would not immediately contain
//    any of the transaction's intents.
//
//    The first reason here no longer holds. Concurrent actors now bump the
//    timestamp cache when aborting a transaction, which has the same effect
//    as writing an ABORTED transaction record. See the "tombstone marker".
//    The second reason still holds but is fairly weak. A transaction
//    coordinator can kick off intent resolution for an aborted transaction
//    without needing to write these intents into the record itself. In the
//    worst case, this intent resolution fails and each intent is cleaned up
//    individually as it is discovered. All in all, neither justification for
//    this state holds much weight anymore.
//
// 3. the other possibility for the "txn record written [finalized]" state is
//    that the transaction record is committed. This state is currently
//    critical for the transaction model because intent resolution cannot
//    begin before a transaction record enters this state. However, this
//    doesn't need to be the case forever. There are proposals to modify the
//    state of committed key-value writes slightly such that intent
//    resolution could be run for implicitly committed transactions while
//    their transaction record remains in the "txn record written [staging]"
//    state. For this to work, the recovery mechanism for indeterminate
//    commit errors would need to be able to determine whether an intent or a
//    **committed value** indicated the success of a write that was in-flight
//    at the time the transaction record was staged. This poses challenges
//    for migration and garbage collection, but it would have a number of
//    performance benefits.
//
// If we were to perform change #1, we could remove the "txn record written
// [pending]" state. If we were to perform changes #2 and #3, we could remove
// the "txn record written [finalized]" state. Altogether, this would leave us
// with only two states that the transaction record could be in, written or
// not written. At that point, it begins to closely resemble any other write
// in the system.
func (r *Replica) CanCreateTxnRecord(
	txnID uuid.UUID, txnKey []byte, txnMinTS hlc.Timestamp,
) (ok bool, minCommitTS hlc.Timestamp, reason roachpb.TransactionAbortedReason) {
	// Consult the timestamp cache with the transaction's key. The timestamp
	// cache is used in two ways for transactions without transaction records:
	// it is used to push the timestamp of such transactions, and it is used
	// to abort such transactions entirely.
	//
	// Using this strategy, we enforce the invariant that only requests sent
	// from a transaction's own coordinator can create its transaction record.
	// However, once a transaction record is written, other concurrent actors
	// can modify it. This is reflected in the diagram above.
	tombstoneKey := transactionTombstoneMarker(txnKey, txnID)
	pushKey := transactionPushMarker(txnKey, txnID)

	// Look in the timestamp cache to see if there is an entry for this
	// transaction, which indicates the minimum timestamp that the transaction
	// can commit at. This is used by pushers to push the timestamp of a
	// transaction that hasn't yet written its transaction record.
	minCommitTS, _ = r.store.tsCache.GetMax(pushKey, nil /* end */)

	// Also look in the timestamp cache to see if there is a tombstone entry
	// for this transaction, which would indicate this transaction has already
	// been finalized or was already aborted by a concurrent transaction. If
	// there is an entry, then we return a retriable error: if this is a
	// re-evaluation, then the error will be transformed into an ambiguous one
	// higher up. Otherwise, if the client is still waiting for a result, then
	// this cannot be a "replay" of any sort.
	tombstoneTimestamp, tombstoneTxnID := r.store.tsCache.GetMax(tombstoneKey, nil /* end */)
	// Compare against the minimum timestamp that the transaction could have
	// written intents at.
	if txnMinTS.LessEq(tombstoneTimestamp) {
		switch tombstoneTxnID {
		case txnID:
			// If we find our own transaction ID then an EndTxn request sent by
			// our coordinator has already been processed. We might be a replay
			// (e.g. a DistSender retry), or we raced with an asynchronous
			// abort. Either way, return an error.
			//
			// TODO(andrei): We could keep a bit more info in the tscache to
			// return a different error for COMMITTED transactions. If the
			// EndTxn(commit) was the only request in the batch, this would be
			// sufficient for the client to swallow the error and declare the
			// transaction as committed. If there were other requests in the
			// EndTxn batch, then the client would still have trouble
			// reconstructing the result, but at least it could provide a
			// non-ambiguous error to the application.
			return false, hlc.Timestamp{},
				roachpb.ABORT_REASON_ALREADY_COMMITTED_OR_ROLLED_BACK_POSSIBLE_REPLAY
		case uuid.Nil:
			lease, _ /* nextLease */ := r.GetLease()
			// Recognize the case where a lease started recently. Lease
			// transfers bump the ts cache low water mark.
			if tombstoneTimestamp == lease.Start {
				return false, hlc.Timestamp{}, roachpb.ABORT_REASON_NEW_LEASE_PREVENTS_TXN
			}
			return false, hlc.Timestamp{}, roachpb.ABORT_REASON_TIMESTAMP_CACHE_REJECTED
		default:
			// If we find another transaction's ID then that transaction has
			// aborted us before our transaction record was written. It obeyed
			// the restriction that it couldn't create a transaction record for
			// us, so it recorded a tombstone in the timestamp cache instead to
			// prevent us from ever creating a transaction record.
			return false, hlc.Timestamp{}, roachpb.ABORT_REASON_ABORTED_RECORD_FOUND
		}
	}
	return true, minCommitTS, 0
}
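// Illustrative sketch, not part of the original file: the handshake between a
// pusher and CanCreateTxnRecord. The function name is hypothetical, and the
// pusher's transaction ID is elided (updateTimestampCache records it). After
// a successful PushTxn(ABORT) bumps the tombstone marker, a later attempt by
// the pushee's coordinator to create its record is rejected with
// ABORT_REASON_ABORTED_RECORD_FOUND, provided the pushee could not have
// written intents below the tombstone's timestamp.
func exampleTombstonePreventsRecordSketch(
	r *Replica, pushee *roachpb.Transaction, pushTS hlc.Timestamp,
) bool {
	// What updateTimestampCache does after a successful PushTxn(ABORT):
	key := transactionTombstoneMarker(pushee.Key, pushee.ID)
	r.store.tsCache.Add(key, nil, pushTS, uuid.UUID{} /* pusher's ID in practice */)
	// What the pushee's coordinator later observes:
	ok, _, _ := r.CanCreateTxnRecord(pushee.ID, pushee.Key, pushee.MinTimestamp)
	return ok // false when pushee.MinTimestamp <= pushTS
}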
// transactionTombstoneMarker returns the key used as a marker indicating that
// a particular txn was finalized (i.e. by an EndTransaction, RecoverTxn or
// PushTxn(Abort)). It is used as a marker in the timestamp cache serving as a
// guard against creating a transaction record after the transaction record
// has been cleaned up (i.e. by a BeginTxn being evaluated out of order or
// arriving after another txn Push(Abort)'ed the txn).
func transactionTombstoneMarker(key roachpb.Key, txnID uuid.UUID) roachpb.Key {
	return append(keys.TransactionKey(key, txnID), []byte("-tmbs")...)
}

// transactionPushMarker returns the key used as a marker in the timestamp
// cache indicating that a particular txn was pushed before writing its
// transaction record, so that the push is honored even when it happens
// before there's a transaction record.
func transactionPushMarker(key roachpb.Key, txnID uuid.UUID) roachpb.Key {
	return append(keys.TransactionKey(key, txnID), []byte("-push")...)
}
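// Illustrative sketch, not part of the original file: the two markers derived
// from the same transaction are distinct keys that share the transaction
// record key as a prefix, so each can carry an independent timestamp in the
// cache without colliding with the record itself. The function name is
// hypothetical.
func exampleMarkerKeysSketch(txnKey roachpb.Key, txnID uuid.UUID) (tombstone, push roachpb.Key) {
	// keys.TransactionKey(txnKey, txnID) is the transaction record's key;
	// the "-tmbs" and "-push" suffixes keep the three keys distinct.
	return transactionTombstoneMarker(txnKey, txnID), transactionPushMarker(txnKey, txnID)
}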