github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_proposal.go (about)

     1  // Copyright 2016 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"io/ioutil"
    17  	"os"
    18  	"path/filepath"
    19  	"strings"
    20  	"time"
    21  	"unsafe"
    22  
    23  	"github.com/cockroachdb/cockroach/pkg/base"
    24  	"github.com/cockroachdb/cockroach/pkg/clusterversion"
    25  	"github.com/cockroachdb/cockroach/pkg/keys"
    26  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval"
    27  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval/result"
    28  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
    29  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
    30  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
    31  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader"
    32  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    33  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    34  	"github.com/cockroachdb/cockroach/pkg/storage"
    35  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    36  	"github.com/cockroachdb/cockroach/pkg/util"
    37  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    38  	"github.com/cockroachdb/cockroach/pkg/util/log"
    39  	"github.com/cockroachdb/cockroach/pkg/util/quotapool"
    40  	"github.com/cockroachdb/cockroach/pkg/util/sysutil"
    41  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    42  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    43  	"github.com/cockroachdb/errors"
    44  	"github.com/kr/pretty"
    45  	opentracing "github.com/opentracing/opentracing-go"
    46  	"golang.org/x/time/rate"
    47  )
    48  
    49  // ProposalData is data about a command which allows it to be
    50  // evaluated, proposed to raft, and for the result of the command to
    51  // be returned to the caller.
    52  type ProposalData struct {
    53  	// The caller's context, used for logging proposals, reproposals, message
    54  	// sends, and command application. To enable safe tracing of events
    55  	// beneath it, modifying this ctx field in *ProposalData requires holding
    56  	// the raftMu.
    57  	ctx context.Context
    58  
    59  	// An optional tracing span bound to the proposal. Will be cleaned
    60  	// up when the proposal finishes.
    61  	sp opentracing.Span
    62  
    63  	// idKey uniquely identifies this proposal.
    64  	// TODO(andreimatei): idKey is legacy at this point: We could easily key
    65  	// commands by their MaxLeaseIndex, and doing so should be ok with a stop-
    66  	// the-world migration. However, various test facilities depend on the
    67  	// command ID for e.g. replay protection.
    68  	idKey kvserverbase.CmdIDKey
    69  
    70  	// proposedAtTicks is the (logical) time at which this command was
    71  	// last (re-)proposed.
    72  	proposedAtTicks int
    73  
    74  	// command is serialized and proposed to raft. In the event of
    75  	// reproposals, its MaxLeaseIndex field is mutated.
    76  	command *kvserverpb.RaftCommand
    77  
    78  	// encodedCommand is the encoded Raft command, with an optional prefix
    79  	// containing the command ID.
    80  	encodedCommand []byte
    81  
    82  	// quotaAlloc is the allocation retrieved from the proposalQuota. Once a
    83  	// proposal has been passed to raft, modifying this field requires holding
    84  	// the raftMu. Once the proposal comes out of Raft, ownership of this quota
    85  	// is passed to r.mu.quotaReleaseQueue.
    86  	quotaAlloc *quotapool.IntAlloc
    87  
    88  	// tmpFooter is used to avoid an allocation.
    89  	tmpFooter kvserverpb.RaftCommandFooter
    90  
    91  	// ec.done is called after command application to update the timestamp
    92  	// cache and optionally release latches and exit lock wait-queues.
    93  	ec endCmds
    94  
    95  	// applied is set when a command finishes application. It is used to
    96  	// avoid reproposing a failed proposal if an earlier version of the same
    97  	// proposal succeeded in applying.
    98  	applied bool
    99  
   100  	// doneCh is used to signal the waiting RPC handler (the contents of
   101  	// proposalResult come from LocalEvalResult).
   102  	//
   103  	// Attention: this channel is not to be signaled directly downstream of Raft.
   104  	// Always use ProposalData.finishApplication().
   105  	doneCh chan proposalResult
   106  
   107  	// Local contains the results of evaluating the request, tying the upstream
   108  	// evaluation of the request to the downstream application of the command.
   109  	// Nil when the proposal came from another node (i.e. the evaluation wasn't
   110  	// done here).
   111  	Local *result.LocalResult
   112  
   113  	// Request is the client's original BatchRequest.
   114  	// TODO(tschottdorf): tests which use TestingCommandFilter use this.
   115  	// Decide how that will work in the future, presumably the
   116  	// CommandFilter would run at proposal time or we allow an opaque
   117  	// struct to be attached to a proposal which is then available as it
   118  	// applies. Other than tests, we only need a few bits of the request
   119  	// here; this could be replaced with isLease and isChangeReplicas
   120  	// booleans.
   121  	Request *roachpb.BatchRequest
   122  }
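
        // An illustrative sketch of a ProposalData's lifecycle (simplified; the
        // surrounding plumbing lives elsewhere in this package):
        //
        //	proposal, pErr := r.requestToProposal(ctx, idKey, ba, latchSpans) // evaluate
        //	// ... if consensus is needed, proposal.command is encoded and handed to raft ...
        //	proposal.finishApplication(ctx, pr) // apply side effects, release latches,
        //	                                    // and signal doneCh with pr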
   123  
   124  // finishApplication is called when a command application has finished. The
   125  // method will be called downstream of Raft if the command required consensus,
   126  // but can be called upstream of Raft if the command did not and was never
   127  // proposed.
   128  //
   129  // It first invokes the endCmds function and then sends the specified
   130  // proposalResult on the proposal's done channel. endCmds is invoked here in
   131  // order to allow the original client to be canceled. (When the original client
   132  // is canceled, it won't be listening to this done channel, and so it can't be
   133  // counted on to invoke endCmds itself.)
   134  //
   135  // The method is safe to call more than once, but only the first result will be
   136  // returned to the client.
   137  func (proposal *ProposalData) finishApplication(ctx context.Context, pr proposalResult) {
   138  	proposal.ec.done(ctx, proposal.Request, pr.Reply, pr.Err)
   139  	proposal.signalProposalResult(pr)
   140  	if proposal.sp != nil {
   141  		tracing.FinishSpan(proposal.sp)
   142  		proposal.sp = nil
   143  	}
   144  }
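
        // A minimal usage sketch for the upstream-of-raft case described above: a
        // command that was evaluated but never proposed can be finished directly with
        // an error (the surrounding variable names are assumed for illustration):
        //
        //	pr := proposalResult{Err: roachpb.NewError(err)}
        //	proposal.finishApplication(ctx, pr)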
   145  
   146  // signalProposalResult signals proposal.doneCh with the proposal result if it
   147  // has not already been signaled. The method can be called even before the
   148  // proposal has finished replication and command application, and does not
   149  // release the request's latches.
   150  //
   151  // The method is safe to call more than once, but only the first result will be
   152  // returned to the client.
   153  func (proposal *ProposalData) signalProposalResult(pr proposalResult) {
   154  	if proposal.doneCh != nil {
   155  		proposal.doneCh <- pr
   156  		proposal.doneCh = nil
   157  	}
   158  }
   159  
   160  // releaseQuota releases the proposal's quotaAlloc and sets it to nil.
   161  // If the quotaAlloc is already nil it is a no-op.
   162  func (proposal *ProposalData) releaseQuota() {
   163  	if proposal.quotaAlloc != nil {
   164  		proposal.quotaAlloc.Release()
   165  		proposal.quotaAlloc = nil
   166  	}
   167  }
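
        // Quota ownership, restated as a sketch (illustrative; the exact hand-off
        // happens in the raft proposal and application paths):
        //
        //	// While the proposer still owns the allocation (e.g. a proposal that
        //	// fails before being handed to raft), it is released directly:
        //	proposal.releaseQuota()
        //
        //	// Once the proposal has come out of raft, ownership has moved to
        //	// r.mu.quotaReleaseQueue and the allocation is released from there.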
   168  
   169  // TODO(tschottdorf): we should find new homes for the checksum, lease
   170  // code, and various others below to leave here only the core logic.
   171  // Not moving anything right now to avoid awkward diffs. These should
   172  // all be moved to replica_application_result.go.
   173  
   174  func (r *Replica) gcOldChecksumEntriesLocked(now time.Time) {
   175  	for id, val := range r.mu.checksums {
   176  		// The timestamp is valid only if set.
   177  		if !val.gcTimestamp.IsZero() && now.After(val.gcTimestamp) {
   178  			delete(r.mu.checksums, id)
   179  		}
   180  	}
   181  }
   182  
   183  func (r *Replica) computeChecksumPostApply(ctx context.Context, cc kvserverpb.ComputeChecksum) {
   184  	stopper := r.store.Stopper()
   185  	now := timeutil.Now()
   186  	r.mu.Lock()
   187  	var notify chan struct{}
   188  	if c, ok := r.mu.checksums[cc.ChecksumID]; !ok {
   189  		// There is no record of this ID. Make a new notification.
   190  		notify = make(chan struct{})
   191  	} else if !c.started {
   192  		// A CollectChecksumRequest is waiting on the existing notification.
   193  		notify = c.notify
   194  	} else {
   195  		log.Fatalf(ctx, "attempted to apply ComputeChecksum command with duplicated checksum ID %s",
   196  			cc.ChecksumID)
   197  	}
   198  
   199  	r.gcOldChecksumEntriesLocked(now)
   200  
   201  	// Create an entry with checksum == nil and gcTimestamp unset.
   202  	r.mu.checksums[cc.ChecksumID] = ReplicaChecksum{started: true, notify: notify}
   203  	desc := *r.mu.state.Desc
   204  	r.mu.Unlock()
   205  
   206  	if cc.Version != batcheval.ReplicaChecksumVersion {
   207  		r.computeChecksumDone(ctx, cc.ChecksumID, nil, nil)
   208  		log.Infof(ctx, "incompatible ComputeChecksum versions (requested: %d, have: %d)",
   209  			cc.Version, batcheval.ReplicaChecksumVersion)
   210  		return
   211  	}
   212  
   213  	// Caller is holding raftMu, so an engine snapshot is automatically
   214  	// Raft-consistent (i.e. not in the middle of an AddSSTable).
   215  	snap := r.store.engine.NewSnapshot()
   216  	if cc.Checkpoint {
   217  		sl := stateloader.Make(r.RangeID)
   218  		rai, _, err := sl.LoadAppliedIndex(ctx, snap)
   219  		if err != nil {
   220  			log.Warningf(ctx, "unable to load applied index, continuing anyway")
   221  		}
   222  		// NB: the names here will match on all nodes, which is nice for debugging.
   223  		tag := fmt.Sprintf("r%d_at_%d", r.RangeID, rai)
   224  		if dir, err := r.store.checkpoint(ctx, tag); err != nil {
   225  			log.Warningf(ctx, "unable to create checkpoint %s: %+v", dir, err)
   226  		} else {
   227  			log.Warningf(ctx, "created checkpoint %s", dir)
   228  		}
   229  	}
   230  
   231  	// Compute SHA asynchronously and store it in a map by UUID.
   232  	if err := stopper.RunAsyncTask(ctx, "storage.Replica: computing checksum", func(ctx context.Context) {
   233  		func() {
   234  			defer snap.Close()
   235  			var snapshot *roachpb.RaftSnapshotData
   236  			if cc.SaveSnapshot {
   237  				snapshot = &roachpb.RaftSnapshotData{}
   238  			}
   239  			result, err := r.sha512(ctx, desc, snap, snapshot, cc.Mode)
   240  			if err != nil {
   241  				log.Errorf(ctx, "%v", err)
   242  				result = nil
   243  			}
   244  			r.computeChecksumDone(ctx, cc.ChecksumID, result, snapshot)
   245  		}()
   246  
   247  		var shouldFatal bool
   248  		for _, rDesc := range cc.Terminate {
   249  			if rDesc.StoreID == r.store.StoreID() && rDesc.ReplicaID == r.mu.replicaID {
   250  				shouldFatal = true
   251  			}
   252  		}
   253  
   254  		if shouldFatal {
   255  			// This node should fatal as a result of a previous consistency
   256  			// check (i.e. this round is carried out only to obtain a diff).
   257  			// If we fatal too early, the diff won't make it back to the lease-
   258  			// holder and thus won't be printed to the logs. Since we're already
   259  			// in a goroutine that's about to end, simply sleep for a few seconds
   260  			// and then terminate.
   261  			auxDir := r.store.engine.GetAuxiliaryDir()
   262  			_ = os.MkdirAll(auxDir, 0755)
   263  			path := base.PreventedStartupFile(auxDir)
   264  
   265  			preventStartupMsg := fmt.Sprintf(`ATTENTION:
   266  
   267  this node is terminating because a replica inconsistency was detected between %s
   268  and its other replicas. Please check your cluster-wide log files for more
   269  information and contact the CockroachDB support team. It is not necessarily safe
   270  to replace this node; cluster data may still be at risk of corruption.
   271  
   272  A checkpoints directory to aid (expert) debugging should be present in:
   273  %s
   274  
   275  A file preventing this node from restarting was placed at:
   276  %s
   277  `, r, auxDir, path)
   278  
   279  			if err := ioutil.WriteFile(path, []byte(preventStartupMsg), 0644); err != nil {
   280  				log.Warningf(ctx, "%v", err)
   281  			}
   282  
   283  			if p := r.store.cfg.TestingKnobs.ConsistencyTestingKnobs.OnBadChecksumFatal; p != nil {
   284  				p(*r.store.Ident)
   285  			} else {
   286  				time.Sleep(10 * time.Second)
   287  				log.Fatalf(r.AnnotateCtx(context.Background()), preventStartupMsg)
   288  			}
   289  		}
   290  
   291  	}); err != nil {
   292  		defer snap.Close()
   293  		log.Errorf(ctx, "could not run async checksum computation (ID = %s): %v", cc.ChecksumID, err)
   294  		// Set checksum to nil.
   295  		r.computeChecksumDone(ctx, cc.ChecksumID, nil, nil)
   296  	}
   297  }
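
        // Sketch of the consumer side of the notification created above (an
        // assumption of how the CollectChecksum path waits for the result; the real
        // handler also deals with timeouts and missing entries):
        //
        //	r.mu.RLock()
        //	c, ok := r.mu.checksums[id]
        //	r.mu.RUnlock()
        //	if ok {
        //		select {
        //		case <-c.notify: // closed once computeChecksumDone has stored the result
        //			// Re-read r.mu.checksums[id] to pick up the computed checksum
        //			// (nil if the computation failed).
        //		case <-ctx.Done():
        //		}
        //	}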
   298  
   299  // leasePostApply updates the Replica's internal state to reflect the
   300  // application of a new Range lease. The method is idempotent, so it can be
   301  // called repeatedly for the same lease safely. However, the method will panic
   302  // if passed a lease with a lower sequence number than the current lease. By
   303  // default, the method will also panic if passed a lease that indicates a
   304  // forward sequence number jump (i.e. a skipped lease). This behavior can
   305  // be disabled by passing permitJump as true.
   306  func (r *Replica) leasePostApply(ctx context.Context, newLease roachpb.Lease, permitJump bool) {
   307  	r.mu.Lock()
   308  	replicaID := r.mu.replicaID
   309  	// Pull out the last lease known to this Replica. It's possible that this is
   310  	// not actually the last lease in the Range's lease sequence because the
   311  	// Replica may have missed the application of a lease between prevLease and
   312  	// newLease. However, this should only be possible if a snapshot includes a
   313  	// lease update. All other forms of lease updates should be continuous
   314  	// without jumps (see permitJump).
   315  	prevLease := *r.mu.state.Lease
   316  	r.mu.Unlock()
   317  
   318  	iAmTheLeaseHolder := newLease.Replica.ReplicaID == replicaID
   319  	// NB: in the case in which a node restarts, minLeaseProposedTS forces it to
   320  	// get a new lease and we make sure it gets a new sequence number, thus
   321  	// causing the right half of the disjunction to fire so that we update the
   322  	// timestamp cache.
   323  	leaseChangingHands := prevLease.Replica.StoreID != newLease.Replica.StoreID || prevLease.Sequence != newLease.Sequence
   324  
   325  	if iAmTheLeaseHolder {
   326  		// Log lease acquisition whenever an Epoch-based lease changes hands (or verbose
   327  		// logging is enabled).
   328  		if newLease.Type() == roachpb.LeaseEpoch && leaseChangingHands || log.V(1) {
   329  			log.VEventf(ctx, 1, "new range lease %s following %s", newLease, prevLease)
   330  		}
   331  	}
   332  
   333  	if leaseChangingHands && iAmTheLeaseHolder {
   334  		// When taking over the lease, we need to check whether a merge is in
   335  		// progress, as only the old leaseholder would have been explicitly notified
   336  		// of the merge. If there is a merge in progress, maybeWatchForMerge will
   337  		// arrange to block all traffic to this replica unless the merge aborts.
   338  		if err := r.maybeWatchForMerge(ctx); err != nil {
   339  			// We were unable to determine whether a merge was in progress. We cannot
   340  			// safely proceed.
   341  			log.Fatalf(ctx, "failed checking for in-progress merge while installing new lease %s: %s",
   342  				newLease, err)
   343  		}
   344  
   345  		// If this replica is a new holder of the lease, update the low water
   346  		// mark of the timestamp cache. Note that clock offset scenarios are
   347  		// handled via a stasis period inherent in the lease which is documented
   348  		// in the Lease struct.
   349  		//
   350  		// The introduction of lease transfers implies that the previous lease
   351  		// may have been shortened and we are now applying a formally overlapping
   352  		// lease (since the old lease holder has promised not to serve any more
   353  		// requests, this is kosher). This means that we don't use the old
   354  		// lease's expiration but instead use the new lease's start to initialize
   355  		// the timestamp cache low water.
   356  		setTimestampCacheLowWaterMark(r.store.tsCache, r.Desc(), newLease.Start)
   357  
   358  		// Reset the request counts used to make lease placement decisions whenever
   359  		// starting a new lease.
   360  		if r.leaseholderStats != nil {
   361  			r.leaseholderStats.resetRequestCounts()
   362  		}
   363  	}
   364  
   365  	// Sanity check to make sure that the lease sequence is moving in the right
   366  	// direction.
   367  	if s1, s2 := prevLease.Sequence, newLease.Sequence; s1 != 0 {
   368  		// We're at a version that supports lease sequence numbers.
   369  		switch {
   370  		case s2 < s1:
   371  			log.Fatalf(ctx, "lease sequence inversion, prevLease=%s, newLease=%s",
   372  				log.Safe(prevLease), log.Safe(newLease))
   373  		case s2 == s1:
   374  			// If the sequence numbers are the same, make sure they're actually
   375  			// the same lease. This can happen when callers are using
   376  			// leasePostApply for some of its side effects, like with
   377  			// splitPostApply. It can also happen during lease extensions.
   378  			if !prevLease.Equivalent(newLease) {
   379  				log.Fatalf(ctx, "sequence identical for different leases, prevLease=%s, newLease=%s",
   380  					log.Safe(prevLease), log.Safe(newLease))
   381  			}
   382  		case s2 == s1+1:
   383  			// Lease sequence incremented by 1. Expected case.
   384  		case s2 > s1+1 && !permitJump:
   385  			log.Fatalf(ctx, "lease sequence jump, prevLease=%s, newLease=%s",
   386  				log.Safe(prevLease), log.Safe(newLease))
   387  		}
   388  	}
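
        	// In brief, the sanity check above accepts (illustrative summary, and
        	// skipped entirely when prevLease.Sequence == 0):
        	//
        	//	s2 == s1      only if the leases are Equivalent (e.g. an extension)
        	//	s2 == s1 + 1  the expected step for a new lease
        	//	s2 >  s1 + 1  only when permitJump is set (e.g. applying a snapshot)
        	//	s2 <  s1      never; this is always fatal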
   389  
   390  	// Ordering is critical here. We only install the new lease after we've
   391  	// checked for an in-progress merge and updated the timestamp cache. If the
   392  	// ordering were reversed, it would be possible for requests to see the new
   393  	// lease but not the updated merge or timestamp cache state, which can result
   394  	// in serializability violations.
   395  	r.mu.Lock()
   396  	r.mu.state.Lease = &newLease
   397  	expirationBasedLease := r.requiresExpiringLeaseRLocked()
   398  	r.mu.Unlock()
   399  
   400  	// Gossip the first range whenever its lease is acquired. We check to make
   401  	// sure the lease is active so that a trailing replica won't process an old
   402  	// lease request and attempt to gossip the first range.
   403  	if leaseChangingHands && iAmTheLeaseHolder && r.IsFirstRange() && r.IsLeaseValid(newLease, r.store.Clock().Now()) {
   404  		r.gossipFirstRange(ctx)
   405  	}
   406  
   407  	// Whenever we first acquire an expiration-based lease, notify the lease
   408  	// renewer worker that we want it to keep proactively renewing the lease
   409  	// before it expires.
   410  	if leaseChangingHands && iAmTheLeaseHolder && expirationBasedLease && r.IsLeaseValid(newLease, r.store.Clock().Now()) {
   411  		r.store.renewableLeases.Store(int64(r.RangeID), unsafe.Pointer(r))
   412  		select {
   413  		case r.store.renewableLeasesSignal <- struct{}{}:
   414  		default:
   415  		}
   416  	}
   417  
   418  	// If we're the current raft leader, we may want to transfer the leadership
   419  	// to the new leaseholder. Note that this condition is also checked periodically
   420  	// when ticking the replica.
   421  	r.maybeTransferRaftLeadership(ctx)
   422  
   423  	// Notify the store that a lease change occurred and it may need to
   424  	// gossip the updated store descriptor (with updated capacity).
   425  	prevOwner := prevLease.OwnedBy(r.store.StoreID())
   426  	currentOwner := newLease.OwnedBy(r.store.StoreID())
   427  	if leaseChangingHands && (prevOwner || currentOwner) {
   428  		if currentOwner {
   429  			r.store.maybeGossipOnCapacityChange(ctx, leaseAddEvent)
   430  		} else if prevOwner {
   431  			r.store.maybeGossipOnCapacityChange(ctx, leaseRemoveEvent)
   432  		}
   433  		if r.leaseholderStats != nil {
   434  			r.leaseholderStats.resetRequestCounts()
   435  		}
   436  	}
   437  
   438  	// Inform the concurrency manager that the lease holder has been updated.
   439  	r.concMgr.OnRangeLeaseUpdated(iAmTheLeaseHolder)
   440  
   441  	// Potentially re-gossip if the range contains system data (e.g. system
   442  	// config or node liveness). We need to perform this gossip at startup as
   443  	// soon as possible. Trying to minimize how often we gossip is a fool's
   444  	// errand. The node liveness info will be gossiped frequently (every few
   445  	// seconds) in any case due to the liveness heartbeats. And the system config
   446  	// will be gossiped rarely because it falls on a range with an epoch-based
   447  	// range lease that is only reacquired extremely infrequently.
   448  	if iAmTheLeaseHolder {
   449  		if err := r.MaybeGossipSystemConfig(ctx); err != nil {
   450  			log.Errorf(ctx, "%v", err)
   451  		}
   452  		if err := r.MaybeGossipNodeLiveness(ctx, keys.NodeLivenessSpan); err != nil {
   453  			log.Errorf(ctx, "%v", err)
   454  		}
   455  
   456  		// Emit an MLAI on the leaseholder replica, as followers will be looking
   457  		// for one and if we went on to quiesce, they wouldn't necessarily get
   458  		// one otherwise (unless they ask for it, which adds latency).
   459  		r.EmitMLAI()
   460  
   461  		if leaseChangingHands && log.V(1) {
   462  			// This logging is useful to troubleshoot incomplete drains.
   463  			log.Info(ctx, "is now leaseholder")
   464  		}
   465  	}
   466  
   467  	// Mark the new lease in the replica's lease history.
   468  	if r.leaseHistory != nil {
   469  		r.leaseHistory.add(newLease)
   470  	}
   471  }
   472  
   473  func addSSTablePreApply(
   474  	ctx context.Context,
   475  	st *cluster.Settings,
   476  	eng storage.Engine,
   477  	sideloaded SideloadStorage,
   478  	term, index uint64,
   479  	sst kvserverpb.ReplicatedEvalResult_AddSSTable,
   480  	limiter *rate.Limiter,
   481  ) bool {
   482  	checksum := util.CRC32(sst.Data)
   483  
   484  	if checksum != sst.CRC32 {
   485  		log.Fatalf(
   486  			ctx,
   487  			"checksum for AddSSTable at term %d, index %d does not match; at proposal time %x (%d), now %x (%d)",
   488  			term, index, sst.CRC32, sst.CRC32, checksum, checksum,
   489  		)
   490  	}
   491  
   492  	path, err := sideloaded.Filename(ctx, index, term)
   493  	if err != nil {
   494  		log.Fatalf(ctx, "sideloaded SSTable at term %d, index %d is missing", term, index)
   495  	}
   496  
   497  	eng.PreIngestDelay(ctx)
   498  
   499  	copied := false
   500  	if eng.InMem() {
   501  		path = fmt.Sprintf("%x", checksum)
   502  		if err := eng.WriteFile(path, sst.Data); err != nil {
   503  			panic(err)
   504  		}
   505  	} else {
   506  		ingestPath := path + ".ingested"
   507  
   508  		canLinkToRaftFile := false
   509  		// The SST may already be on disk, thanks to the sideloading mechanism. If
   510  		// so, we can try to add that file directly, via a new hardlink if the
   511  		// filesystem supports it, rather than writing a new copy of it. However,
   512  		// this is only safe if we can do so without modifying the file, since it
   513  		// is still part of an immutable raft log message; in some cases, described
   514  		// in DBIngestExternalFile, RocksDB would modify the file. Fortunately we
   515  		// can tell Rocks that it is not allowed to modify the file, in which case
   516  		// it will return an error if it would have tried to do so, at which point
   517  		// we can fall back to writing a new copy for Rocks to ingest.
   518  		if _, links, err := sysutil.StatAndLinkCount(path); err == nil {
   519  			// HACK: RocksDB does not like ingesting the same file (by inode) twice.
   520  			// See facebook/rocksdb#5133. We can tell that we have tried to ingest
   521  			// this file already if it has more than one link – one from the file raft
   522  			// wrote and one from rocks. In that case, we should not try to give
   523  			// rocks a link to the same file again.
   524  			if links == 1 {
   525  				canLinkToRaftFile = true
   526  			} else {
   527  				log.Warningf(ctx, "SSTable at index %d term %d may have already been ingested (link count %d) -- falling back to ingesting a copy",
   528  					index, term, links)
   529  			}
   530  		}
   531  
   532  		if canLinkToRaftFile {
   533  			// If the fs supports it, make a hard-link for rocks to ingest. We cannot
   534  			// pass it the path in the sideload store as it deletes the passed path on
   535  			// success.
   536  			if linkErr := eng.Link(path, ingestPath); linkErr == nil {
   537  				ingestErr := eng.IngestExternalFiles(ctx, []string{ingestPath})
   538  				if ingestErr == nil {
   539  					// Adding without modification succeeded, no copy necessary.
   540  					log.Eventf(ctx, "ingested SSTable at index %d, term %d: %s", index, term, ingestPath)
   541  					return false
   542  				}
   543  				if rmErr := eng.Remove(ingestPath); rmErr != nil {
   544  					log.Fatalf(ctx, "failed to remove ingest sst: %v", rmErr)
   545  				}
   546  				const seqNoMsg = "Global seqno is required, but disabled"
   547  				const seqNoOnReIngest = "external file have non zero sequence number"
   548  				// Repeated ingestion is still possible even with the link count checked
   549  				// above, since rocks might have already compacted away the file.
   550  				// However it does not flush compacted files from its cache, so it can
   551  				// still react poorly to attempting to ingest again. If we get an error
   552  				// that indicates we can't ingest, we'll make a copy and try again. That
   553  				// attempt must succeed or we'll fatal, so any persistent error is still
   554  				// going to be surfaced.
   555  				ingestErrMsg := ingestErr.Error()
   556  				isSeqNoErr := strings.Contains(ingestErrMsg, seqNoMsg) || strings.Contains(ingestErrMsg, seqNoOnReIngest)
   557  				if sstErr := (*storage.Error)(nil); !errors.As(ingestErr, &sstErr) || !isSeqNoErr {
   558  					log.Fatalf(ctx, "while ingesting %s: %v", ingestPath, ingestErr)
   559  				}
   560  			}
   561  		}
   562  
   563  		path = ingestPath
   564  
   565  		log.Eventf(ctx, "copying SSTable for ingestion at index %d, term %d: %s", index, term, path)
   566  
   567  		// TODO(tschottdorf): remove this once sideloaded storage guarantees its
   568  		// existence.
   569  		if err := os.MkdirAll(filepath.Dir(path), 0700); err != nil {
   570  			panic(err)
   571  		}
   572  		if _, err := os.Stat(path); err == nil {
   573  			// The file we want to ingest exists. This can happen since the
   574  			// ingestion may apply twice (we ingest before we mark the Raft
   575  			// command as committed). Just unlink the file (RocksDB created a
   576  			// hard link); after that we're free to write it again.
   577  			if err := os.Remove(path); err != nil {
   578  				log.Fatalf(ctx, "while removing existing file during ingestion of %s: %+v", path, err)
   579  			}
   580  		}
   581  
   582  		if err := writeFileSyncing(ctx, path, sst.Data, eng, 0600, st, limiter); err != nil {
   583  			log.Fatalf(ctx, "while ingesting %s: %+v", path, err)
   584  		}
   585  		copied = true
   586  	}
   587  
   588  	if err := eng.IngestExternalFiles(ctx, []string{path}); err != nil {
   589  		log.Fatalf(ctx, "while ingesting %s: %+v", path, err)
   590  	}
   591  	log.Eventf(ctx, "ingested SSTable at index %d, term %d: %s", index, term, path)
   592  	return copied
   593  }
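
        // The ingestion strategy above, summarized (an informal sketch of the
        // branches; the code remains authoritative):
        //
        //	in-memory engine:      write sst.Data to an in-memory file, ingest it,
        //	                       and return false
        //	link count == 1:       hard-link the sideloaded file and try to ingest
        //	                       it without copying; on success return false
        //	seqno-style failure:   fall through and ingest a fresh copy instead
        //	otherwise / fallback:  write a new copy via writeFileSyncing, ingest it,
        //	                       and return true (copied)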
   594  
   595  func (r *Replica) handleReadWriteLocalEvalResult(ctx context.Context, lResult result.LocalResult) {
   596  	// Fields for which no action is taken in this method are zeroed so that
   597  	// they don't trigger an assertion at the end of the method (which checks
   598  	// that all fields were handled).
   599  	{
   600  		lResult.Reply = nil
   601  	}
   602  
   603  	// The caller is required to detach and handle the following three fields.
   604  	if lResult.EncounteredIntents != nil {
   605  		log.Fatalf(ctx, "LocalEvalResult.EncounteredIntents should be nil: %+v", lResult.EncounteredIntents)
   606  	}
   607  	if lResult.EndTxns != nil {
   608  		log.Fatalf(ctx, "LocalEvalResult.EndTxns should be nil: %+v", lResult.EndTxns)
   609  	}
   610  	if lResult.MaybeWatchForMerge {
   611  		log.Fatalf(ctx, "LocalEvalResult.MaybeWatchForMerge should be false")
   612  	}
   613  
   614  	if lResult.AcquiredLocks != nil {
   615  		for i := range lResult.AcquiredLocks {
   616  			r.concMgr.OnLockAcquired(ctx, &lResult.AcquiredLocks[i])
   617  		}
   618  		lResult.AcquiredLocks = nil
   619  	}
   620  
   621  	if lResult.ResolvedLocks != nil {
   622  		for i := range lResult.ResolvedLocks {
   623  			r.concMgr.OnLockUpdated(ctx, &lResult.ResolvedLocks[i])
   624  		}
   625  		lResult.ResolvedLocks = nil
   626  	}
   627  
   628  	if lResult.UpdatedTxns != nil {
   629  		for _, txn := range lResult.UpdatedTxns {
   630  			r.concMgr.OnTransactionUpdated(ctx, txn)
   631  		}
   632  		lResult.UpdatedTxns = nil
   633  	}
   634  
   635  	if lResult.GossipFirstRange {
   636  		// We need to run the gossip in an async task because gossiping requires
   637  		// the range lease and we'll deadlock if we try to acquire it while
   638  		// holding processRaftMu. Specifically, Replica.redirectOnOrAcquireLease
   639  		// blocks waiting for the lease acquisition to finish but it can't finish
   640  		// because we're not processing raft messages due to holding
   641  		// processRaftMu (and running on the processRaft goroutine).
   642  		if err := r.store.Stopper().RunAsyncTask(
   643  			ctx, "storage.Replica: gossipping first range",
   644  			func(ctx context.Context) {
   645  				hasLease, pErr := r.getLeaseForGossip(ctx)
   646  
   647  				if pErr != nil {
   648  					log.Infof(ctx, "unable to gossip first range; hasLease=%t, err=%s", hasLease, pErr)
   649  				} else if !hasLease {
   650  					return
   651  				}
   652  				r.gossipFirstRange(ctx)
   653  			}); err != nil {
   654  			log.Infof(ctx, "unable to gossip first range: %s", err)
   655  		}
   656  		lResult.GossipFirstRange = false
   657  	}
   658  
   659  	if lResult.MaybeAddToSplitQueue {
   660  		r.store.splitQueue.MaybeAddAsync(ctx, r, r.store.Clock().Now())
   661  		lResult.MaybeAddToSplitQueue = false
   662  	}
   663  
   664  	if lResult.MaybeGossipSystemConfig {
   665  		if err := r.MaybeGossipSystemConfig(ctx); err != nil {
   666  			log.Errorf(ctx, "%v", err)
   667  		}
   668  		lResult.MaybeGossipSystemConfig = false
   669  	}
   670  
   671  	if lResult.MaybeGossipSystemConfigIfHaveFailure {
   672  		if err := r.MaybeGossipSystemConfigIfHaveFailure(ctx); err != nil {
   673  			log.Errorf(ctx, "%v", err)
   674  		}
   675  		lResult.MaybeGossipSystemConfigIfHaveFailure = false
   676  	}
   677  
   678  	if lResult.MaybeGossipNodeLiveness != nil {
   679  		if err := r.MaybeGossipNodeLiveness(ctx, *lResult.MaybeGossipNodeLiveness); err != nil {
   680  			log.Errorf(ctx, "%v", err)
   681  		}
   682  		lResult.MaybeGossipNodeLiveness = nil
   683  	}
   684  
   685  	if lResult.Metrics != nil {
   686  		r.store.metrics.handleMetricsResult(ctx, *lResult.Metrics)
   687  		lResult.Metrics = nil
   688  	}
   689  
   690  	if !lResult.IsZero() {
   691  		log.Fatalf(ctx, "unhandled field in LocalEvalResult: %s", pretty.Diff(lResult, result.LocalResult{}))
   692  	}
   693  }
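
        // When a field is added to result.LocalResult, it must be consumed and zeroed
        // in the method above or the IsZero assertion will fire. A sketch of the
        // pattern, using a hypothetical field name:
        //
        //	if lResult.MaybeDoNewThing {
        //		r.doNewThing(ctx)
        //		lResult.MaybeDoNewThing = false
        //	}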
   694  
   695  // proposalResult indicates the result of a proposal. Exactly one of
   696  // Reply and Err is set, and it represents the result of the proposal.
   697  type proposalResult struct {
   698  	Reply              *roachpb.BatchResponse
   699  	Err                *roachpb.Error
   700  	EncounteredIntents []roachpb.Intent
   701  	EndTxns            []result.EndTxnIntents
   702  }
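
        // Typical shapes of a proposalResult (illustrative):
        //
        //	proposalResult{Reply: br}                  // command applied successfully
        //	proposalResult{Err: roachpb.NewError(err)} // command rejected or failed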
   703  
   704  // evaluateProposal generates a Result from the given request by
   705  // evaluating it, returning both state which is held only on the
   706  // proposer and that which is to be replicated through Raft. The
   707  // return value is ready to be inserted into Replica's proposal map
   708  // and subsequently passed to submitProposalLocked.
   709  //
   710  // The method also returns a flag indicating if the request needs to
   711  // be proposed through Raft and replicated. This flag will be false
   712  // either if the request was a no-op or if it hit an error. In this
   713  // case, the result can be sent directly back to the client without
   714  // going through Raft, but while still handling LocalEvalResult.
   715  //
   716  // Replica.mu must not be held.
   717  func (r *Replica) evaluateProposal(
   718  	ctx context.Context,
   719  	idKey kvserverbase.CmdIDKey,
   720  	ba *roachpb.BatchRequest,
   721  	latchSpans *spanset.SpanSet,
   722  ) (*result.Result, bool, *roachpb.Error) {
   723  	if ba.Timestamp == (hlc.Timestamp{}) {
   724  		return nil, false, roachpb.NewErrorf("can't propose Raft command with zero timestamp")
   725  	}
   726  
   727  	// Evaluate the commands. If this returns without an error, the batch should
   728  	// be committed. Note that we don't hold any locks at this point. This is
   729  	// important since evaluating a proposal is expensive.
   730  	// TODO(tschottdorf): absorb all returned values in `res` below this point
   731  	// in the call stack as well.
   732  	batch, ms, br, res, pErr := r.evaluateWriteBatch(ctx, idKey, ba, latchSpans)
   733  
   734  	// Note: reusing the proposer's batch when applying the command on the
   735  	// proposer was explored as an optimization but resulted in no performance
   736  	// benefit.
   737  	if batch != nil {
   738  		defer batch.Close()
   739  	}
   740  
   741  	if pErr != nil {
   742  		pErr = r.maybeSetCorrupt(ctx, pErr)
   743  
   744  		txn := pErr.GetTxn()
   745  		if txn != nil && ba.Txn == nil {
   746  			log.Fatalf(ctx, "error had a txn but batch is non-transactional. Err txn: %s", txn)
   747  		}
   748  
   749  		// Failed proposals can't have any Result except for what's
   750  		// whitelisted here.
   751  		res.Local = result.LocalResult{
   752  			EncounteredIntents: res.Local.DetachEncounteredIntents(),
   753  			EndTxns:            res.Local.DetachEndTxns(true /* alwaysOnly */),
   754  			Metrics:            res.Local.Metrics,
   755  		}
   756  		res.Replicated.Reset()
   757  		return &res, false /* needConsensus */, pErr
   758  	}
   759  
   760  	// Set the local reply, which is held only on the proposing replica and is
   761  	// returned to the client after the proposal completes, or immediately if
   762  	// replication is not necessary.
   763  	res.Local.Reply = br
   764  
   765  	// needConsensus determines if the result needs to be replicated and
   766  	// proposed through Raft. This is necessary if at least one of the
   767  	// following conditions is true:
   768  	// 1. the request created a non-empty write batch.
   769  	// 2. the request had an impact on the MVCCStats. NB: this is possible
   770  	//    even with an empty write batch when stats are recomputed.
   771  	// 3. the request has replicated side-effects.
   772  	needConsensus := !batch.Empty() ||
   773  		ms != (enginepb.MVCCStats{}) ||
   774  		!res.Replicated.Equal(kvserverpb.ReplicatedEvalResult{})
   775  
   776  	if needConsensus {
   777  		// Set the proposal's WriteBatch, which is the serialized representation of
   778  		// the proposal's effect on RocksDB.
   779  		res.WriteBatch = &kvserverpb.WriteBatch{
   780  			Data: batch.Repr(),
   781  		}
   782  
   783  		// Set the proposal's replicated result, which contains metadata and
   784  		// side-effects that are to be replicated to all replicas.
   785  		res.Replicated.IsLeaseRequest = ba.IsLeaseRequest()
   786  		res.Replicated.Timestamp = ba.Timestamp
   787  		res.Replicated.Delta = ms.ToStatsDelta()
   788  
   789  		_ = clusterversion.VersionContainsEstimatesCounter // see for info on ContainsEstimates migration
   790  		if r.ClusterSettings().Version.IsActive(ctx, clusterversion.VersionContainsEstimatesCounter) {
   791  			// Encode that this command (and any that follow) uses regular arithmetic for ContainsEstimates
   792  			// by making sure ContainsEstimates is > 1.
   793  			// This will be interpreted during command application.
   794  			if res.Replicated.Delta.ContainsEstimates > 0 {
   795  				res.Replicated.Delta.ContainsEstimates *= 2
   796  			}
   797  		} else {
   798  			// This range may still need to have its commands processed by nodes which treat ContainsEstimates
   799  			// as a bool, so clamp it to {0,1}. This enables use of bool semantics in command application.
   800  			if res.Replicated.Delta.ContainsEstimates > 1 {
   801  				res.Replicated.Delta.ContainsEstimates = 1
   802  			} else if res.Replicated.Delta.ContainsEstimates < 0 {
   803  				// The caller should have checked the cluster version. At the
   804  				// time of writing, this is only RecomputeStats and the split
   805  				// trigger, which both have the check, but better safe than sorry.
   806  				log.Fatalf(ctx, "cannot propose negative ContainsEstimates "+
   807  					"without VersionContainsEstimatesCounter in %s", ba.Summary())
   808  			}
   809  		}
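
        		// Worked example of the migration arithmetic above (illustrative):
        		//
        		//	new version active, ContainsEstimates == 1  ->  proposed as 2
        		//	new version active, ContainsEstimates == 3  ->  proposed as 6
        		//	old version,        ContainsEstimates == 5  ->  clamped to 1
        		//
        		// so at application time a value > 1 implies counter semantics, while a
        		// value of exactly 1 can still carry the legacy bool meaning.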
   810  
   811  		// If the RangeAppliedState key is not being used and the cluster version is
   812  		// high enough to guarantee that all current and future binaries will
   813  		// understand the key, we send the migration flag through Raft. Because
   814  		// there is a delay between command proposal and application, we may end up
   815  		// setting this migration flag multiple times. This is ok, because the
   816  		// migration is idempotent.
   817  		// TODO(nvanbenschoten): This will be baked in to 2.1, so it can be removed
   818  		// in the 2.2 release.
   819  		r.mu.RLock()
   820  		usingAppliedStateKey := r.mu.state.UsingAppliedStateKey
   821  		r.mu.RUnlock()
   822  		if !usingAppliedStateKey {
   823  			// The range applied state was introduced in v2.1. It's possible to
   824  			// still find ranges that haven't activated it. If so, activate it.
   825  			// We can remove this code if we introduce a boot-time check that
   826  			// fails the startup process when any legacy replicas are found. The
   827  			// operator can then run the old binary for a while to upgrade the
   828  			// stragglers.
   829  			if res.Replicated.State == nil {
   830  				res.Replicated.State = &kvserverpb.ReplicaState{}
   831  			}
   832  			res.Replicated.State.UsingAppliedStateKey = true
   833  		}
   834  	}
   835  
   836  	return &res, needConsensus, nil
   837  }
   838  
   839  // requestToProposal converts a BatchRequest into a ProposalData, by
   840  // evaluating it. The returned ProposalData is partially valid even
   841  // on a non-nil *roachpb.Error and should be proposed through Raft
   842  // if ProposalData.command is non-nil.
   843  //
   844  // TODO(nvanbenschoten): combine idKey, ba, and latchSpans into a
   845  // `serializedRequest` struct.
   846  func (r *Replica) requestToProposal(
   847  	ctx context.Context,
   848  	idKey kvserverbase.CmdIDKey,
   849  	ba *roachpb.BatchRequest,
   850  	latchSpans *spanset.SpanSet,
   851  ) (*ProposalData, *roachpb.Error) {
   852  	res, needConsensus, pErr := r.evaluateProposal(ctx, idKey, ba, latchSpans)
   853  
   854  	// Fill out the results even if pErr != nil; we'll return the error below.
   855  	proposal := &ProposalData{
   856  		ctx:     ctx,
   857  		idKey:   idKey,
   858  		doneCh:  make(chan proposalResult, 1),
   859  		Local:   &res.Local,
   860  		Request: ba,
   861  	}
   862  
   863  	if needConsensus {
   864  		proposal.command = &kvserverpb.RaftCommand{
   865  			ReplicatedEvalResult: res.Replicated,
   866  			WriteBatch:           res.WriteBatch,
   867  			LogicalOpLog:         res.LogicalOpLog,
   868  			TraceData:            r.getTraceData(ctx),
   869  		}
   870  	}
   871  
   872  	return proposal, pErr
   873  }
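
        // A rough usage sketch for requestToProposal (an assumption of how the
        // proposal path consumes it; simplified, with error handling elided):
        //
        //	proposal, pErr := r.requestToProposal(ctx, idKey, ba, latchSpans)
        //	if proposal.command == nil {
        //		// No consensus needed: finish immediately, returning the local
        //		// reply (or the evaluation error) to the waiting client.
        //		proposal.finishApplication(ctx, proposalResult{
        //			Reply: proposal.Local.Reply, Err: pErr,
        //		})
        //	} else {
        //		// Otherwise the command is proposed to raft and doneCh is
        //		// signaled once the applied command's result is known.
        //	}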
   874  
   875  // getTraceData extracts the SpanContext of the current span.
   876  func (r *Replica) getTraceData(ctx context.Context) opentracing.TextMapCarrier {
   877  	sp := opentracing.SpanFromContext(ctx)
   878  	if sp == nil {
   879  		return nil
   880  	}
   881  	if tracing.IsBlackHoleSpan(sp) {
   882  		return nil
   883  	}
   884  	traceData := opentracing.TextMapCarrier{}
   885  	if err := r.AmbientContext.Tracer.Inject(
   886  		sp.Context(), opentracing.TextMap, traceData,
   887  	); err != nil {
   888  		log.Errorf(ctx, "failed to inject sp context (%+v) as trace data: %s", sp.Context(), err)
   889  		return nil
   890  	}
   891  	return traceData
   892  }
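
        // Sketch of the receiving side (an assumption of how a replica applying the
        // command turns RaftCommand.TraceData back into a span; the real code lives
        // elsewhere and differs in details):
        //
        //	carrier := opentracing.TextMapCarrier(cmd.TraceData)
        //	if wireCtx, err := tracer.Extract(opentracing.TextMap, carrier); err == nil {
        //		sp := tracer.StartSpan("raft application", opentracing.FollowsFrom(wireCtx))
        //		defer sp.Finish()
        //	}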