github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_protected_timestamp.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"

	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/gc"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts/ptpb"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/errors"
)

// cachedProtectedTimestampState is used to cache information about the state
// of protected timestamps as they pertain to this replica. The data is
// refreshed when the replica examines protected timestamps while being
// considered for gc or when verifying a protected timestamp record.
// It is consulted when determining whether a request can be served.
type cachedProtectedTimestampState struct {
	// readAt denotes the timestamp at which this state was read.
	// It is used to coordinate updates to this field. It is also used to
	// ensure that the protected timestamp subsystem can be relied upon. If
	// the cache state is older than the lease start time then it is possible
	// that protected timestamps have not been observed. In this case we must
	// assume that any protected timestamp could exist to provide the contract
	// on verify.
	readAt         hlc.Timestamp
	earliestRecord *ptpb.Record
}

// clearIfNotNewer clears the state in ts if it is not newer than the passed
// value. This is used in conjunction with Replica.maybeUpdateCachedProtectedTS().
// This optimization allows most interactions with protected timestamps to
// operate using a shared lock. Only in cases where the cached value is known to
// be older will the update be attempted.
func (ts *cachedProtectedTimestampState) clearIfNotNewer(existing cachedProtectedTimestampState) {
	if !existing.readAt.Less(ts.readAt) {
		*ts = cachedProtectedTimestampState{}
	}
}

// maybeUpdateCachedProtectedTS is used to optimize updates. We learn about the
// need to update the cache while holding Replica.mu for reading, but the
// update itself must be performed while holding the lock exclusively. This
// function is intended to be deferred.
func (r *Replica) maybeUpdateCachedProtectedTS(ts *cachedProtectedTimestampState) {
	if *ts == (cachedProtectedTimestampState{}) {
		return
	}
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.mu.cachedProtectedTS.readAt.Less(ts.readAt) {
		r.mu.cachedProtectedTS = *ts
	}
}
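
// An illustrative sketch of how the two helpers above are meant to be used
// together, mirroring the read-locked pattern in the methods below:
//
//	var read cachedProtectedTimestampState
//	defer r.maybeUpdateCachedProtectedTS(&read)
//	r.mu.RLock()
//	defer r.mu.RUnlock()
//	defer read.clearIfNotNewer(r.mu.cachedProtectedTS)
//	read = r.readProtectedTimestampsRLocked(ctx, nil)
//
// The deferred calls run in LIFO order: clearIfNotNewer zeroes `read` (while
// still holding the read lock) if the cached copy is already at least as new,
// and maybeUpdateCachedProtectedTS then publishes a non-zero `read` under the
// exclusive lock.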

// protectedTimestampRecordApplies returns true if the record which protects
// the `protected` timestamp will apply. It returns false if it may not. If the
// state of the cache is not sufficiently new to determine whether the record
// will apply, the cache is refreshed and then the check is performed again.
// See r.protectedTimestampRecordCurrentlyApplies() for more details.
func (r *Replica) protectedTimestampRecordApplies(
	ctx context.Context, args *roachpb.AdminVerifyProtectedTimestampRequest,
) (willApply bool, _ error) {
	// Check the state of the cache without a refresh.
	willApply, cacheTooOld, err := r.protectedTimestampRecordCurrentlyApplies(ctx, args)
	if err != nil {
		return false, err
	}
	if !cacheTooOld {
		return willApply, nil
	}
	// Refresh the cache so that we know that the next time we come around we're
	// certain to either see the record or see a timestamp for readAt that is
	// greater than or equal to recordAliveAt.
	if err := r.store.protectedtsCache.Refresh(ctx, args.RecordAliveAt); err != nil {
		return false, err
	}
	willApply, cacheTooOld, err = r.protectedTimestampRecordCurrentlyApplies(ctx, args)
	if err != nil {
		return false, err
	}
	if cacheTooOld {
		return false, errors.AssertionFailedf("cache was not updated after being refreshed")
	}
	return willApply, nil
}
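
// A minimal caller sketch for the method above; the surrounding handling is
// illustrative and not taken from this file:
//
//	willApply, err := r.protectedTimestampRecordApplies(ctx, &args)
//	if err != nil {
//		return err
//	}
//	if !willApply {
//		// The record could not be verified against this range's state.
//	}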

// readProtectedTimestampsRLocked iterates over the protected timestamp records
// which apply to this replica's key span, invoking f (if non-nil) on each
// record whose timestamp the replica has not already GC'd past, and returns
// the cache state which was read, including the earliest such record.
func (r *Replica) readProtectedTimestampsRLocked(
	ctx context.Context, f func(r *ptpb.Record),
) (ts cachedProtectedTimestampState) {
	desc := r.descRLocked()
	gcThreshold := *r.mu.state.GCThreshold

	ts.readAt = r.store.protectedtsCache.Iterate(ctx,
		roachpb.Key(desc.StartKey),
		roachpb.Key(desc.EndKey),
		func(rec *ptpb.Record) (wantMore bool) {
			// Check if we've already GC'd past the timestamp this record was trying
			// to protect, in which case we know that the record does not apply.
			// Note that when we implement PROTECT_AT, we'll need to consult some
			// replica state here to determine whether the record indeed has been
			// applied.
			if isValid := gcThreshold.LessEq(rec.Timestamp); !isValid {
				return true
			}
			if f != nil {
				f(rec)
			}
			if ts.earliestRecord == nil || rec.Timestamp.Less(ts.earliestRecord.Timestamp) {
				ts.earliestRecord = rec
			}
			return true
		})
	return ts
}
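
// A minimal usage sketch for the callback above; counting records is a
// hypothetical use, not taken from this file:
//
//	var n int
//	read := r.readProtectedTimestampsRLocked(ctx, func(rec *ptpb.Record) {
//		n++ // invoked once per record not yet GC'd past
//	})
//	_ = read.earliestRecord // earliest such record, if any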

// protectedTimestampRecordCurrentlyApplies determines whether a record with
// the specified ID which protects `protected` and is known to exist at
// `recordAliveAt` will apply given the current state of the cache. This method
// is called by `r.protectedTimestampRecordApplies()`. It may be the case that
// the current state of the cache is too old to determine whether the record
// will apply. In such cases the cache should be refreshed to recordAliveAt and
// then this method should be called again.
func (r *Replica) protectedTimestampRecordCurrentlyApplies(
	ctx context.Context, args *roachpb.AdminVerifyProtectedTimestampRequest,
) (willApply, cacheTooOld bool, _ error) {
	// We first need to check that we're the current leaseholder.
	// TODO(ajwerner): what other conditions with regards to time do we need to
	// check? I don't think there are any. If the recordAliveAt is after our
	// liveness expiration that's okay because we're either going to find the
	// record or we're not and if we don't then we'll push the cache and re-assert
	// that we're still the leaseholder. If somebody else becomes the leaseholder
	// then they will have to go through the same process.
	ls, pErr := r.redirectOnOrAcquireLease(ctx)
	if pErr != nil {
		return false, false, pErr.GoError()
	}

	// NB: It should be the case that the recordAliveAt timestamp
	// is before the current time and that the above lease check means that
	// the replica is the leaseholder at the current time. If recordAliveAt
	// happened to be newer than the current time we'd need to make sure that
	// the current Replica will be live at that time. Given that recordAliveAt
	// has to be before the batch timestamp for this request and we should
	// have forwarded the local clock to the batch timestamp this can't
	// happen.
	// TODO(ajwerner): do we need to assert that indeed the recordAliveAt precedes
	// the batch timestamp? Probably not a bad sanity check.

	// We may be reading the protected timestamp cache while we're holding
	// the Replica.mu for reading. If we do so and find newer state in the cache,
	// we want to update the replica's cache of its state. The guarantee
	// we provide is that if a record is successfully verified then the Replica's
	// cachedProtectedTS will have a readAt value high enough to include that
	// record.
	var read cachedProtectedTimestampState
	defer r.maybeUpdateCachedProtectedTS(&read)
	r.mu.RLock()
	defer r.mu.RUnlock()
	defer read.clearIfNotNewer(r.mu.cachedProtectedTS)

	// If the key that routed this request to this range is now out of this
	// range's bounds, return an error for the client to try again on the
	// correct range.
	desc := r.descRLocked()
	if !kvserverbase.ContainsKeyRange(desc, args.Key, args.EndKey) {
		return false, false, roachpb.NewRangeKeyMismatchError(args.Key, args.EndKey, desc)
	}
	if args.Protected.LessEq(*r.mu.state.GCThreshold) {
		return false, false, nil
	}
	if args.RecordAliveAt.Less(ls.Lease.Start) {
		return true, false, nil
	}

	// Now we're in the case where it is possible that we will later attempt to
	// set the GC threshold above our protected point, so to prevent that we add
	// some state to the replica.
	r.protectedTimestampMu.Lock()
	defer r.protectedTimestampMu.Unlock()
	if args.Protected.Less(r.protectedTimestampMu.pendingGCThreshold) {
		return false, false, nil
	}

	var seen bool
	read = r.readProtectedTimestampsRLocked(ctx, func(r *ptpb.Record) {
		if r.ID == args.RecordID {
			seen = true
		}
	})

	// If we observed the record in question then we know that all future attempts
	// to run GC will observe the Record if it still exists. The one hazard we
	// need to avoid is a race whereby an attempt to run GC first checks the
	// protected timestamp state and then attempts to increase the GC threshold.
	// We set the minStateReadTimestamp here to avoid such races. The GC queue
	// will call markPendingGC just prior to sending a request to update the GC
	// threshold which will verify the safety of the new value relative to
	// minStateReadTimestamp.
	if seen {
		r.protectedTimestampMu.minStateReadTimestamp = read.readAt
		return true, false, nil
	}

	// Protected timestamp state has progressed past the point at which we
	// should see this record. This implies that the record has been removed.
	return false, read.readAt.Less(args.RecordAliveAt), nil
}

// checkProtectedTimestampsForGC determines whether the Replica can run GC.
// If the Replica can run GC, this method returns the latest timestamp which
// can be used to determine a valid new GCThreshold. The policy is passed in
// rather than read from the replica state to ensure that the same value used
// for this calculation is used later.
//
// In the case that GC can proceed, three timestamps are returned: the timestamp
// corresponding to the state of the cache used to make the determination (used
// for markPendingGC when actually performing GC), the timestamp used as the
// basis to calculate the new gc threshold (used for scoring and reporting), and
// the new gc threshold itself.
func (r *Replica) checkProtectedTimestampsForGC(
	ctx context.Context, policy zonepb.GCPolicy,
) (canGC bool, cacheTimestamp, gcTimestamp, newThreshold hlc.Timestamp) {

	// We may be reading the protected timestamp cache while we're holding
	// the Replica.mu for reading. If we do so and find newer state in the cache,
	// we want to update the replica's cache of its state. The guarantee
	// we provide is that if a record is successfully verified then the Replica's
	// cachedProtectedTS will have a readAt value high enough to include that
	// record.
	var read cachedProtectedTimestampState
	defer r.maybeUpdateCachedProtectedTS(&read)
	r.mu.RLock()
	defer r.mu.RUnlock()
	defer read.clearIfNotNewer(r.mu.cachedProtectedTS)

	gcThreshold := *r.mu.state.GCThreshold
	lease := *r.mu.state.Lease

	// read.earliestRecord is the record with the earliest timestamp which is
	// greater than the existing gcThreshold.
	read = r.readProtectedTimestampsRLocked(ctx, nil)
	gcTimestamp = read.readAt
	if read.earliestRecord != nil {
		// NB: we want to allow GC up to the timestamp preceding the earliest valid
		// record.
		impliedGCTimestamp := gc.TimestampForThreshold(read.earliestRecord.Timestamp.Prev(), policy)
		if impliedGCTimestamp.Less(gcTimestamp) {
			gcTimestamp = impliedGCTimestamp
		}
	}

	if gcTimestamp.Less(lease.Start) {
		log.VEventf(ctx, 1, "not gc'ing replica %v due to new lease %v started after %v",
			r, lease, gcTimestamp)
		return false, hlc.Timestamp{}, hlc.Timestamp{}, hlc.Timestamp{}
	}

	newThreshold = gc.CalculateThreshold(gcTimestamp, policy)

	// If we've already GC'd right up to this record, there's no reason to
	// gc again.
	if newThreshold.Equal(gcThreshold) {
		return false, hlc.Timestamp{}, hlc.Timestamp{}, hlc.Timestamp{}
	}

	return true, read.readAt, gcTimestamp, newThreshold
}

// markPendingGC is called just prior to sending the GC request to increase the
// GC threshold during GC queue processing. This method synchronizes such
// requests with the processing of AdminVerifyProtectedTimestamp requests. Such
// synchronization is important to prevent races where the gc queue reads the
// protected timestamp state at a stale point in time, a verification request
// concurrently succeeds against a later cache state, and the gc queue, acting
// on its older cache state, then attempts to set the gc threshold above the
// successfully verified record.
func (r *Replica) markPendingGC(readAt, newThreshold hlc.Timestamp) error {
	r.protectedTimestampMu.Lock()
	defer r.protectedTimestampMu.Unlock()
	if readAt.Less(r.protectedTimestampMu.minStateReadTimestamp) {
		return errors.Errorf("cannot set gc threshold to %v because read at %v < min %v",
			newThreshold, readAt, r.protectedTimestampMu.minStateReadTimestamp)
	}
	r.protectedTimestampMu.pendingGCThreshold = newThreshold
	return nil
}
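
// A hedged sketch of how the GC queue is expected to combine
// checkProtectedTimestampsForGC and markPendingGC above; the surrounding queue
// plumbing is illustrative, not taken from this file:
//
//	canGC, cacheTimestamp, gcTimestamp, newThreshold := r.checkProtectedTimestampsForGC(ctx, policy)
//	if !canGC {
//		return nil // a protected record or a new lease currently blocks GC
//	}
//	if err := r.markPendingGC(cacheTimestamp, newThreshold); err != nil {
//		return err // newer protected timestamp state was verified; retry later
//	}
//	// ... send the GC request that raises the GC threshold to newThreshold,
//	// using gcTimestamp for scoring and reporting ...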