github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_write.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"fmt"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval/result"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/storage"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
	"go.etcd.io/etcd/raft"
)

// executeWriteBatch is the entry point for client requests which may mutate the
// range's replicated state. Requests taking this path are evaluated and ultimately
// serialized through Raft, but pass through additional machinery whose goal is
// to allow commands which commute to be proposed in parallel. The naive
// alternative, submitting requests to Raft one after another, paying massive
// latency, is only taken for commands whose effects may overlap.
//
// Concretely,
//
// - The timestamp cache is checked to determine if the command's affected keys
//   were accessed with a timestamp exceeding that of the command; if so, the
//   command's timestamp is incremented accordingly.
// - A RaftCommand is constructed. If proposer-evaluated KV is active,
//   the request is evaluated and the Result is placed in the
//   RaftCommand. If not, the request itself is added to the command.
// - The proposal is inserted into the Replica's in-flight proposals map,
//   a lease index is assigned to it, and it is submitted to Raft, returning
//   a channel.
// - The result of the Raft proposal is read from the channel and the command
//   is registered with the timestamp cache, its latches are released, and
//   its result (which could be an error) is returned to the client.
//
// Returns either a response or an error, along with the provided concurrency
// guard if it is passing ownership back to the caller of the function.
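//
// A condensed caller-side sketch (illustrative only; error handling and the
// surrounding retry loop are omitted):
//
//	br, g, pErr := r.executeWriteBatch(ctx, ba, st, g)
//	if g != nil {
//		// The guard was handed back; the caller remains responsible for it.
//	}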
//
// NB: changing BatchRequest to a pointer here would have to be done cautiously
// as this method makes the assumption that it operates on a shallow copy (see
// call to applyTimestampCache).
func (r *Replica) executeWriteBatch(
	ctx context.Context, ba *roachpb.BatchRequest, st kvserverpb.LeaseStatus, g *concurrency.Guard,
) (br *roachpb.BatchResponse, _ *concurrency.Guard, pErr *roachpb.Error) {
	startTime := timeutil.Now()

	// TODO(nvanbenschoten): unlike on the read-path (executeReadOnlyBatch), we
	// don't synchronize with r.readOnlyCmdMu here. Is that ok? What if the
	// replica is destroyed concurrently with a write? We won't be able to
	// successfully propose as the lease will presumably have changed, but what
	// if we hit an error during evaluation (e.g. a ConditionFailedError)?

	// Verify that the batch can be executed.
	// NB: we only need to check that the request is in the Range's key bounds
	// at proposal time, not at application time, because the spanlatch manager
	// will synchronize all requests (notably EndTxn with SplitTrigger) that may
	// cause this condition to change.
	if err := r.checkExecutionCanProceed(ba, g, &st); err != nil {
		return nil, g, roachpb.NewError(err)
	}

	minTS, untrack := r.store.cfg.ClosedTimestamp.Tracker.Track(ctx)
	defer untrack(ctx, 0, 0, 0) // covers all error returns below

	// Examine the timestamp cache for preceding commands which require this
	// command to move its timestamp forward. Or, in the case of a transactional
	// write, the txn timestamp and possible write-too-old bool.
	if bumped := r.applyTimestampCache(ctx, ba, minTS); bumped {
		// If we bump the transaction's timestamp, we must absolutely
		// tell the client in a response transaction (for otherwise it
		// doesn't know about the incremented timestamp). Response
		// transactions are set far away from this code, but at the time
		// of writing, they always seem to be set. Since that is a
		// likely target of future micro-optimization, this assertion is
		// meant to protect against future correctness anomalies.
		defer func() {
			if br != nil && ba.Txn != nil && br.Txn == nil {
				log.Fatalf(ctx, "assertion failed: transaction updated by "+
					"timestamp cache, but no transaction returned in response; "+
					"updated timestamp would have been lost (recovered): "+
					"%s in batch %s", ba.Txn, ba,
				)
			}
		}()
	}
	log.Event(ctx, "applied timestamp cache")

	// Checking the context just before proposing can help avoid ambiguous errors.
	if err := ctx.Err(); err != nil {
		log.VEventf(ctx, 2, "%s before proposing: %s", err, ba.Summary())
		return nil, g, roachpb.NewError(errors.Wrap(err, "aborted before proposing"))
	}

	// Check that the lease is still valid before proposing to avoid discovering
	// this after replication and potentially missing out on the chance to retry
	// if the request is using AsyncConsensus. This is best-effort, but can help
	// in cases where the request waited arbitrarily long for locks acquired by
	// other transactions to be released while sequencing in the concurrency
	// manager.
	if curLease, _ := r.GetLease(); curLease.Sequence > st.Lease.Sequence {
		curLeaseCpy := curLease // avoid letting curLease escape
		err := newNotLeaseHolderError(&curLeaseCpy, r.store.StoreID(), r.Desc())
		log.VEventf(ctx, 2, "%s before proposing: %s", err, ba.Summary())
		return nil, g, roachpb.NewError(err)
	}

	// If the command is proposed to Raft, ownership of and responsibility for
	// the concurrency guard will be assumed by Raft, so provide the guard to
	// evalAndPropose.
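	//
	// evalAndPropose returns a channel on which the proposal's result will be
	// delivered, an abandon function used below to stop waiting for that result
	// if the caller gives up (the proposal may still apply, which is why those
	// paths return an AmbiguousResultError), and the lease index assigned to
	// the proposal (zero if no proposal was made or a lease was proposed).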
	ch, abandon, maxLeaseIndex, pErr := r.evalAndPropose(ctx, ba, g, &st.Lease)
	if pErr != nil {
		if maxLeaseIndex != 0 {
			log.Fatalf(
				ctx, "unexpected max lease index %d assigned to failed proposal: %s, error %s",
				maxLeaseIndex, ba, pErr,
			)
		}
		return nil, g, pErr
	}
	g = nil // ownership passed to Raft, prevent misuse

	// A max lease index of zero is returned when no proposal was made or a lease was proposed.
	// In both cases, we don't need to communicate a MLAI. Furthermore, for lease proposals we
	// cannot communicate under the lease's epoch. Instead the code calls EmitMLAI explicitly
	// as a side effect of stepping up as leaseholder.
	if maxLeaseIndex != 0 {
		untrack(ctx, ctpb.Epoch(st.Lease.Epoch), r.RangeID, ctpb.LAI(maxLeaseIndex))
	}

	// If the command was accepted by raft, wait for the range to apply it.
	ctxDone := ctx.Done()
	shouldQuiesce := r.store.stopper.ShouldQuiesce()
	startPropTime := timeutil.Now()
	slowTimer := timeutil.NewTimer()
	defer slowTimer.Stop()
	slowTimer.Reset(base.SlowRequestThreshold)
	// NOTE: this defer was moved from a case in the select statement to here
	// because escape analysis does a better job avoiding allocations to the
	// heap when defers are unconditional. When this was in the slowTimer select
	// case, it was causing pErr to escape.
	defer func() {
		if slowTimer.Read {
			r.store.metrics.SlowRaftRequests.Dec(1)
			log.Infof(
				ctx,
				"slow command %s finished after %.2fs with error %v",
				ba,
				timeutil.Since(startPropTime).Seconds(),
				pErr,
			)
		}
	}()

	for {
		select {
		case propResult := <-ch:
			// Semi-synchronously process any intents that need resolving here in
			// order to apply back pressure on the client which generated them. The
			// resolution is semi-synchronous in that there is a limited number of
			// outstanding asynchronous resolution tasks allowed after which
			// further calls will block.
			if len(propResult.EncounteredIntents) > 0 {
				// TODO(peter): Re-proposed and canceled (but executed) commands can
				// both leave intents to GC that don't hit this code path. No good
				// solution presents itself at the moment and such intents will be
				// resolved on reads.
				if err := r.store.intentResolver.CleanupIntentsAsync(
					ctx, propResult.EncounteredIntents, true, /* allowSync */
				); err != nil {
					log.Warningf(ctx, "%v", err)
				}
			}
			if len(propResult.EndTxns) > 0 {
				if err := r.store.intentResolver.CleanupTxnIntentsAsync(
					ctx, r.RangeID, propResult.EndTxns, true, /* allowSync */
				); err != nil {
					log.Warningf(ctx, "%v", err)
				}
			}
			return propResult.Reply, nil, propResult.Err
		case <-slowTimer.C:
			slowTimer.Read = true
			r.store.metrics.SlowRaftRequests.Inc(1)

			log.Errorf(ctx, "range unavailable: %v",
				rangeUnavailableMessage(r.Desc(), r.store.cfg.NodeLiveness.GetIsLiveMap(),
					r.RaftStatus(), ba, timeutil.Since(startPropTime)))
		case <-ctxDone:
			// If our context was canceled, return an AmbiguousResultError,
			// which indicates to the caller that the command may have executed.
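			// Abandoning the proposal stops us from waiting on it, but does not
			// withdraw it from Raft; it may still be applied, hence the
			// ambiguous (rather than definite) error below.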
			abandon()
			log.VEventf(ctx, 2, "context cancellation after %0.1fs of attempting command %s",
				timeutil.Since(startTime).Seconds(), ba)
			return nil, nil, roachpb.NewError(roachpb.NewAmbiguousResultError(ctx.Err().Error()))
		case <-shouldQuiesce:
			// If shutting down, return an AmbiguousResultError, which indicates
			// to the caller that the command may have executed.
			abandon()
			log.VEventf(ctx, 2, "shutdown cancellation after %0.1fs of attempting command %s",
				timeutil.Since(startTime).Seconds(), ba)
			return nil, nil, roachpb.NewError(roachpb.NewAmbiguousResultError("server shutdown"))
		}
	}
}

func rangeUnavailableMessage(
	desc *roachpb.RangeDescriptor,
	lm IsLiveMap,
	rs *raft.Status,
	ba *roachpb.BatchRequest,
	dur time.Duration,
) string {
	cpy := *desc
	desc = &cpy
	desc.StartKey, desc.EndKey = nil, nil // scrub PII

	var liveReplicas, otherReplicas []roachpb.ReplicaDescriptor
	for _, rDesc := range desc.Replicas().All() {
		if lm[rDesc.NodeID].IsLive {
			liveReplicas = append(liveReplicas, rDesc)
		} else {
			otherReplicas = append(otherReplicas, rDesc)
		}
	}
	return fmt.Sprintf(`have been waiting %.2fs for proposing command %s.
This range is likely unavailable.
Please submit this message to Cockroach Labs support along with the following information:

Descriptor: %s
Live: %s
Non-live: %s
Raft Status: %+v

and a copy of https://yourhost:8080/#/reports/range/%d

If you are using CockroachDB Enterprise, reach out through your
support contract. Otherwise, please open an issue at:

  https://github.com/cockroachdb/cockroach/issues/new/choose
`,
		dur.Seconds(),
		ba,
		desc,
		roachpb.MakeReplicaDescriptors(liveReplicas),
		roachpb.MakeReplicaDescriptors(otherReplicas),
		rs,
		desc.RangeID,
	)
}

// canAttempt1PCEvaluation looks at the batch and decides whether it can be
// executed as 1PC.
func (r *Replica) canAttempt1PCEvaluation(
	ctx context.Context, ba *roachpb.BatchRequest, latchSpans *spanset.SpanSet,
) (bool, *roachpb.Error) {
	if !isOnePhaseCommit(ba) {
		return false, nil
	}

	if ba.Timestamp != ba.Txn.WriteTimestamp {
		log.Fatalf(ctx, "unexpected 1PC execution with diverged timestamp. %s != %s",
			ba.Timestamp, ba.Txn.WriteTimestamp)
	}

	// The EndTxn checks whether the txn record can be created, but we're
	// eliding the EndTxn. So, we'll do the check instead.
	ok, minCommitTS, reason := r.CanCreateTxnRecord(ba.Txn.ID, ba.Txn.Key, ba.Txn.MinTimestamp)
	if !ok {
		newTxn := ba.Txn.Clone()
		newTxn.Status = roachpb.ABORTED
		return false, roachpb.NewErrorWithTxn(roachpb.NewTransactionAbortedError(reason), newTxn)
	}
	if ba.Timestamp.Less(minCommitTS) {
		ba.Txn.WriteTimestamp = minCommitTS
		// We can only evaluate at the new timestamp if we manage to bump the read
		// timestamp.
		return maybeBumpReadTimestampToWriteTimestamp(ctx, ba, latchSpans), nil
	}
	return true, nil
}

// evaluateWriteBatch evaluates the supplied batch.
//
// If the batch is transactional and has all the hallmarks of a 1PC commit (i.e.
// includes all intent writes & EndTxn, and there's nothing to suggest that the
// transaction will require retry or restart), the batch's txn is stripped and
// it's executed as an atomic batch write. If the writes cannot all be completed
// at the intended timestamp, the batch's txn is restored and it's re-executed
// in full. This allows it to lay down intents and return an appropriate
// retryable error.
func (r *Replica) evaluateWriteBatch(
	ctx context.Context,
	idKey kvserverbase.CmdIDKey,
	ba *roachpb.BatchRequest,
	latchSpans *spanset.SpanSet,
) (storage.Batch, enginepb.MVCCStats, *roachpb.BatchResponse, result.Result, *roachpb.Error) {
	log.Event(ctx, "executing read-write batch")

	// If the transaction has been pushed but it can commit at the higher
	// timestamp, let's evaluate the batch at the bumped timestamp. This will
	// allow it to commit, and also it'll allow us to attempt the 1PC code path.
	maybeBumpReadTimestampToWriteTimestamp(ctx, ba, latchSpans)

	// Attempt 1PC execution, if applicable. If not transactional or there are
	// indications that the batch's txn will require retry, execute as normal.
	ok, pErr := r.canAttempt1PCEvaluation(ctx, ba, latchSpans)
	if pErr != nil {
		return nil, enginepb.MVCCStats{}, nil, result.Result{}, pErr
	}
	if ok {
		res := r.evaluate1PC(ctx, idKey, ba, latchSpans)
		switch res.success {
		case onePCSucceeded:
			return res.batch, res.stats, res.br, res.res, nil
		case onePCFailed:
			if res.pErr == nil {
				log.Fatalf(ctx, "1PC failed but no err. ba: %s", ba.String())
			}
			return nil, enginepb.MVCCStats{}, nil, result.Result{}, res.pErr
		case onePCFallbackToTransactionalEvaluation:
		}
	}

	ms := new(enginepb.MVCCStats)
	rec := NewReplicaEvalContext(r, latchSpans)
	batch, br, res, pErr := r.evaluateWriteBatchWithServersideRefreshes(
		ctx, idKey, rec, ms, ba, latchSpans, nil /* deadline */)
	return batch, *ms, br, res, pErr
}

type onePCSuccess int

const (
	// onePCSucceeded means that the 1PC evaluation succeeded and the results should be
	// returned to the client.
	onePCSucceeded onePCSuccess = iota
	// onePCFailed means that the 1PC evaluation failed and the attached error should be
	// returned to the client.
	onePCFailed
	// onePCFallbackToTransactionalEvaluation means that 1PC evaluation failed, but
	// regular transactional evaluation should be attempted.
	onePCFallbackToTransactionalEvaluation
)

type onePCResult struct {
	success onePCSuccess
	// pErr is set if success == onePCFailed. This is the error that should be
	// returned to the client for this request.
	pErr *roachpb.Error

	// The fields below are only set when success == onePCSucceeded.
	stats enginepb.MVCCStats
	br    *roachpb.BatchResponse
	res   result.Result
	batch storage.Batch
}

// evaluate1PC attempts to evaluate the batch as a 1PC transaction - meaning it
// attempts to evaluate the batch as a non-transactional request. This is only
// possible if the batch contains all of the transaction's writes, which the
// caller needs to ensure. If successful, evaluating the batch this way is more
// efficient - we're avoiding writing the transaction record and writing and
// then immediately deleting intents.
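//
// Illustratively, for a batch of the form [Put, Put, EndTxn(commit=true)] that
// contains all of the transaction's writes, the EndTxn is stripped and the two
// Puts are evaluated as a non-transactional batch; on success an EndTxnResponse
// is synthesized and the returned transaction is marked COMMITTED (or ABORTED
// if the EndTxn did not ask to commit).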
func (r *Replica) evaluate1PC(
	ctx context.Context,
	idKey kvserverbase.CmdIDKey,
	ba *roachpb.BatchRequest,
	latchSpans *spanset.SpanSet,
) (onePCRes onePCResult) {
	log.VEventf(ctx, 2, "attempting 1PC execution")

	var batch storage.Batch
	defer func() {
		// Close the batch unless it's passed to the caller (when the evaluation
		// succeeds).
		if onePCRes.success != onePCSucceeded {
			batch.Close()
		}
	}()

	// Try executing with transaction stripped.
	strippedBa := *ba
	strippedBa.Txn = nil
	strippedBa.Requests = ba.Requests[:len(ba.Requests)-1] // strip end txn req

	rec := NewReplicaEvalContext(r, latchSpans)
	var br *roachpb.BatchResponse
	var res result.Result
	var pErr *roachpb.Error

	arg, _ := ba.GetArg(roachpb.EndTxn)
	etArg := arg.(*roachpb.EndTxnRequest)
	canFwdTimestamp := batcheval.CanForwardCommitTimestampWithoutRefresh(ba.Txn, etArg)

	// Evaluate strippedBa. If the transaction allows, permit refreshes.
	ms := new(enginepb.MVCCStats)
	if canFwdTimestamp {
		batch, br, res, pErr = r.evaluateWriteBatchWithServersideRefreshes(
			ctx, idKey, rec, ms, &strippedBa, latchSpans, etArg.Deadline)
	} else {
		batch, br, res, pErr = r.evaluateWriteBatchWrapper(
			ctx, idKey, rec, ms, &strippedBa, latchSpans)
	}

	if pErr != nil || (!canFwdTimestamp && ba.Timestamp != br.Timestamp) {
		if pErr != nil {
			log.VEventf(ctx, 2,
				"1PC execution failed, falling back to transactional execution. pErr: %v", pErr.String())
		} else {
			log.VEventf(ctx, 2,
				"1PC execution failed, falling back to transactional execution; the batch was pushed")
		}
		return onePCResult{success: onePCFallbackToTransactionalEvaluation}
	}

	// 1PC execution was successful, let's synthesize an EndTxnResponse.

	clonedTxn := ba.Txn.Clone()
	clonedTxn.Status = roachpb.COMMITTED
	// Make sure the returned txn has the actual commit timestamp. This can be
	// different from ba.Txn's if the stripped batch was evaluated at a bumped
	// timestamp.
	clonedTxn.ReadTimestamp = br.Timestamp
	clonedTxn.WriteTimestamp = br.Timestamp

	// If the end transaction is not committed, clear the batch and mark the status aborted.
	if !etArg.Commit {
		clonedTxn.Status = roachpb.ABORTED
		batch.Close()
		batch = r.store.Engine().NewBatch()
		ms = new(enginepb.MVCCStats)
	} else {
		// Run commit trigger manually.
		innerResult, err := batcheval.RunCommitTrigger(ctx, rec, batch, ms, etArg, clonedTxn)
		if err != nil {
			return onePCResult{
				success: onePCFailed,
				pErr:    roachpb.NewErrorf("failed to run commit trigger: %s", err),
			}
		}
		if err := res.MergeAndDestroy(innerResult); err != nil {
			return onePCResult{
				success: onePCFailed,
				pErr:    roachpb.NewError(err),
			}
		}
	}

	// Even though the transaction is 1PC and hasn't written any intents, it may
	// have acquired unreplicated locks, so inform the concurrency manager that
	// it is finalized and that any unreplicated locks that it has acquired can
	// be released.
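	// Each LockUpdate below covers one of the EndTxn's declared lock spans and
	// carries the transaction's final status, allowing any locks in those spans
	// to be cleaned up.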
	res.Local.UpdatedTxns = []*roachpb.Transaction{clonedTxn}
	res.Local.ResolvedLocks = make([]roachpb.LockUpdate, len(etArg.LockSpans))
	for i, sp := range etArg.LockSpans {
		res.Local.ResolvedLocks[i] = roachpb.LockUpdate{
			Span:           sp,
			Txn:            clonedTxn.TxnMeta,
			Status:         clonedTxn.Status,
			IgnoredSeqNums: clonedTxn.IgnoredSeqNums,
		}
	}

	// Add placeholder responses for end transaction requests.
	br.Add(&roachpb.EndTxnResponse{OnePhaseCommit: true})
	br.Txn = clonedTxn
	return onePCResult{
		success: onePCSucceeded,
		stats:   *ms,
		br:      br,
		res:     res,
		batch:   batch,
	}
}

// evaluateWriteBatchWithServersideRefreshes invokes evaluateBatch and retries
// at a higher timestamp in the event of some retriable errors if allowed by the
// batch/txn.
//
// deadline, if not nil, specifies the highest timestamp (exclusive) at which
// the request can be evaluated. If ba is a transactional request, then deadline
// cannot be specified; a transaction's deadline comes from its EndTxn request.
func (r *Replica) evaluateWriteBatchWithServersideRefreshes(
	ctx context.Context,
	idKey kvserverbase.CmdIDKey,
	rec batcheval.EvalContext,
	ms *enginepb.MVCCStats,
	ba *roachpb.BatchRequest,
	latchSpans *spanset.SpanSet,
	deadline *hlc.Timestamp,
) (batch storage.Batch, br *roachpb.BatchResponse, res result.Result, pErr *roachpb.Error) {
	goldenMS := *ms
	for retries := 0; ; retries++ {
		if retries > 0 {
			log.VEventf(ctx, 2, "server-side retry of batch")
		}
		if batch != nil {
			// Reset the stats.
			*ms = goldenMS
			batch.Close()
		}

		batch, br, res, pErr = r.evaluateWriteBatchWrapper(ctx, idKey, rec, ms, ba, latchSpans)

		var success bool
		if pErr == nil {
			wto := br.Txn != nil && br.Txn.WriteTooOld
			success = !wto
		} else {
			success = false
		}

		// If we can retry, set a higher batch timestamp and continue.
		// Allow one retry only; a non-txn batch containing overlapping
		// spans will always experience WriteTooOldError.
		if success || retries > 0 || !canDoServersideRetry(ctx, pErr, ba, br, latchSpans, deadline) {
			break
		}
	}
	return batch, br, res, pErr
}

// evaluateWriteBatchWrapper is a wrapper on top of evaluateBatch() which deals
// with filling out result.LogicalOpLog.
func (r *Replica) evaluateWriteBatchWrapper(
	ctx context.Context,
	idKey kvserverbase.CmdIDKey,
	rec batcheval.EvalContext,
	ms *enginepb.MVCCStats,
	ba *roachpb.BatchRequest,
	latchSpans *spanset.SpanSet,
) (storage.Batch, *roachpb.BatchResponse, result.Result, *roachpb.Error) {
	batch, opLogger := r.newBatchedEngine(latchSpans)
	br, res, pErr := evaluateBatch(ctx, idKey, batch, rec, ms, ba, false /* readOnly */)
	if pErr == nil {
		if opLogger != nil {
			res.LogicalOpLog = &kvserverpb.LogicalOpLog{
				Ops: opLogger.LogicalOps(),
			}
		}
	}
	return batch, br, res, pErr
}

// newBatchedEngine creates an engine.Batch. Depending on whether rangefeeds
// are enabled, it also returns an engine.OpLoggerBatch. If non-nil, then this
// OpLogger is attached to the returned engine.Batch, recording all operations.
// Its recording should be attached to the Result of request evaluation.
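//
// For example, when rangefeeds are enabled for this range, the returned
// OpLoggerBatch records the logical MVCC operations performed by the request so
// that they can later be attached to the evaluation's Result via
// result.LogicalOpLog (see evaluateWriteBatchWrapper) and fed to rangefeeds.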
func (r *Replica) newBatchedEngine(spans *spanset.SpanSet) (storage.Batch, *storage.OpLoggerBatch) {
	batch := r.store.Engine().NewBatch()
	var opLogger *storage.OpLoggerBatch
	if r.isSystemRange() || RangefeedEnabled.Get(&r.store.cfg.Settings.SV) {
		// TODO(nvanbenschoten): once we get rid of the RangefeedEnabled
		// cluster setting we'll need a way to turn this on when any
		// replica (not just the leaseholder) wants it and off when no
		// replicas want it. This turns out to be pretty involved.
		//
		// The current plan is to:
		// - create a range-id local key that stores all replicas that are
		//   subscribed to logical operations, along with their corresponding
		//   liveness epoch.
		// - create a new command that adds or subtracts replicas from this
		//   structure. The command will be a write across the entire replica
		//   span so that it is serialized with all writes.
		// - each replica will add itself to this set when it first needs
		//   logical ops. It will then wait until it sees the replicated command
		//   that added itself pop out through Raft so that it knows all
		//   commands that are missing logical ops are gone.
		// - It will then proceed as normal, relying on the logical ops to
		//   always be included on the raft commands. When it no longer
		//   needs logical ops, it will remove itself from the set.
		// - The leaseholder will have a new queue to detect registered
		//   replicas that are no longer live and remove them from the
		//   set to prevent "leaking" subscriptions.
		// - The condition here to add logical logging will be:
		//   if len(replicaState.logicalOpsSubs) > 0 { ... }
		//
		// An alternative to this is to reduce the cost of including the
		// logical op log to a negligible amount such that it can be
		// included on all raft commands, regardless of whether any replica
		// has a rangefeed running or not.
		//
		// Another alternative is to make the setting table/zone-scoped
		// instead of a fine-grained per-replica state.
		opLogger = storage.NewOpLoggerBatch(batch)
		batch = opLogger
	}
	if util.RaceEnabled {
		// During writes we may encounter a versioned value newer than the request
		// timestamp, and may have to retry at a higher timestamp. This is still
		// safe as we're only ever writing at timestamps higher than the timestamp
		// any write latch would be declared at. But because of this, we don't
		// assert on access timestamps using spanset.NewBatchAt.
		batch = spanset.NewBatch(batch, spans)
	}
	return batch, opLogger
}

// isOnePhaseCommit returns true iff the BatchRequest contains all writes in the
// transaction and ends with an EndTxn. One phase commits are disallowed if any
// of the following conditions are true:
// (1) the transaction has already been flagged with a write too old error
// (2) the transaction's commit timestamp has been forwarded
// (3) the transaction exceeded its deadline
// (4) the transaction is not in its first epoch and the EndTxn request does
//     not require one phase commit.
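//
// For example, a complete batch [Put("a"), Put("b"), EndTxn(commit=true)] from
// a transaction in its first epoch qualifies, while the same batch issued after
// a transaction restart (epoch > 0) does not, unless the EndTxn sets
// Require1PC.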
func isOnePhaseCommit(ba *roachpb.BatchRequest) bool {
	if ba.Txn == nil {
		return false
	}
	if !ba.IsCompleteTransaction() {
		return false
	}
	arg, _ := ba.GetArg(roachpb.EndTxn)
	etArg := arg.(*roachpb.EndTxnRequest)
	if retry, _, _ := batcheval.IsEndTxnTriggeringRetryError(ba.Txn, etArg); retry {
		return false
	}
	// If the transaction has already restarted at least once then it may have
	// left intents at prior epochs that need to be cleaned up during the
	// process of committing the transaction. Even if the current epoch could
	// perform a one phase commit, we don't allow it to because that could
	// prevent it from properly resolving intents from prior epochs and cause
	// it to abandon them instead.
	//
	// The exception to this rule is transactions that require a one phase
	// commit. We know that if they also required a one phase commit in past
	// epochs then they couldn't have left any intents that they now need to
	// clean up.
	return ba.Txn.Epoch == 0 || etArg.Require1PC
}