github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_protected_timestamp.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"

	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/gc"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts/ptpb"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/errors"
)

// cachedProtectedTimestampState is used to cache information about the state
// of protected timestamps as they pertain to this replica. The data is
// refreshed when the replica examines protected timestamps when being
// considered for gc or when verifying a protected timestamp record.
// It is consulted when determining whether a request can be served.
type cachedProtectedTimestampState struct {
	// readAt denotes the timestamp at which this state was read.
	// It is used to coordinate updates to this field. It is also used to
	// ensure that the protected timestamp subsystem can be relied upon. If
	// the cache state is older than the lease start time then it is possible
	// that protected timestamps have not been observed. In this case we must
	// assume that any protected timestamp could exist to provide the contract
	// on verify.
	readAt         hlc.Timestamp
	earliestRecord *ptpb.Record
}

// clearIfNotNewer clears the state in ts if it is not newer than the passed
// value. This is used in conjunction with Replica.maybeUpdateCachedProtectedTS().
// This optimization allows most interactions with protected timestamps to
// operate using a shared lock. Only in cases where the cached value is known to
// be older will the update be attempted.
func (ts *cachedProtectedTimestampState) clearIfNotNewer(existing cachedProtectedTimestampState) {
	if !existing.readAt.Less(ts.readAt) {
		*ts = cachedProtectedTimestampState{}
	}
}

// maybeUpdateCachedProtectedTS is used to optimize updates. We learn about the
// need to update the cache while holding Replica.mu for reading but need to
// perform the update with the exclusive lock. This function is intended to
// be deferred.
func (r *Replica) maybeUpdateCachedProtectedTS(ts *cachedProtectedTimestampState) {
	if *ts == (cachedProtectedTimestampState{}) {
		return
	}
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.mu.cachedProtectedTS.readAt.Less(ts.readAt) {
		r.mu.cachedProtectedTS = *ts
	}
}
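
// A rough sketch of how clearIfNotNewer and maybeUpdateCachedProtectedTS are
// intended to be combined: reads happen under the shared lock, and the
// exclusive lock is only taken when the cached state is actually stale. The
// callers below follow this pattern:
//
//	var read cachedProtectedTimestampState
//	defer r.maybeUpdateCachedProtectedTS(&read) // runs last, takes Replica.mu exclusively
//	r.mu.RLock()
//	defer r.mu.RUnlock()
//	defer read.clearIfNotNewer(r.mu.cachedProtectedTS) // skip the update if read is not newer
//	// ... populate read while holding the shared lock ...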

// protectedTimestampRecordApplies returns true if it is the case that the
// record which protects the `protected` timestamp will apply. It returns false
// if it may not. If the state of the cache is not sufficiently new to determine
// whether the record will apply, the cache is refreshed and then the check is
// performed again. See r.protectedTimestampRecordCurrentlyApplies() for more
// details.
func (r *Replica) protectedTimestampRecordApplies(
	ctx context.Context, args *roachpb.AdminVerifyProtectedTimestampRequest,
) (willApply bool, _ error) {
	// Check the state of the cache without a refresh.
	willApply, cacheTooOld, err := r.protectedTimestampRecordCurrentlyApplies(ctx, args)
	if err != nil {
		return false, err
	}
	if !cacheTooOld {
		return willApply, nil
	}
	// Refresh the cache so that we know that the next time we come around we're
	// certain to either see the record or see a timestamp for readAt that is
	// greater than or equal to recordAliveAt.
	if err := r.store.protectedtsCache.Refresh(ctx, args.RecordAliveAt); err != nil {
		return false, err
	}
	willApply, cacheTooOld, err = r.protectedTimestampRecordCurrentlyApplies(ctx, args)
	if err != nil {
		return false, err
	}
	if cacheTooOld {
		return false, errors.AssertionFailedf("cache was not updated after being refreshed")
	}
	return willApply, nil
}

func (r *Replica) readProtectedTimestampsRLocked(
	ctx context.Context, f func(r *ptpb.Record),
) (ts cachedProtectedTimestampState) {
	desc := r.descRLocked()
	gcThreshold := *r.mu.state.GCThreshold

	ts.readAt = r.store.protectedtsCache.Iterate(ctx,
		roachpb.Key(desc.StartKey),
		roachpb.Key(desc.EndKey),
		func(rec *ptpb.Record) (wantMore bool) {
			// Check if we've already GC'd past the timestamp this record was trying
			// to protect, in which case we know that the record does not apply.
			// Note that when we implement PROTECT_AT, we'll need to consult some
			// replica state here to determine whether the record indeed has been
			// applied.
			if isValid := gcThreshold.LessEq(rec.Timestamp); !isValid {
				return true
			}
			if f != nil {
				f(rec)
			}
			if ts.earliestRecord == nil || rec.Timestamp.Less(ts.earliestRecord.Timestamp) {
				ts.earliestRecord = rec
			}
			return true
		})
	return ts
}
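
// readProtectedTimestampsRLocked is used in two ways below: with a callback to
// look for a specific record ID during verification, and with a nil callback
// when only readAt and the earliest record matter (GC). A sketch of the first
// form, assuming Replica.mu is already held for reading (rec is an arbitrary
// name for the callback parameter):
//
//	var seen bool
//	read := r.readProtectedTimestampsRLocked(ctx, func(rec *ptpb.Record) {
//		if rec.ID == args.RecordID {
//			seen = true
//		}
//	})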

// protectedTimestampRecordCurrentlyApplies determines whether a record with
// the specified ID which protects `protected` and is known to exist at
// `recordAliveAt` will apply given the current state of the cache. This method
// is called by `r.protectedTimestampRecordApplies()`. It may be the case that
// the current state of the cache is too old to determine whether the record
// will apply. In such cases the cache should be refreshed to recordAliveAt and
// then this method should be called again.
func (r *Replica) protectedTimestampRecordCurrentlyApplies(
	ctx context.Context, args *roachpb.AdminVerifyProtectedTimestampRequest,
) (willApply, cacheTooOld bool, _ error) {
	// We first need to check that we're the current leaseholder.
	// TODO(ajwerner): what other conditions with regards to time do we need to
	// check? I don't think there are any. If the recordAliveAt is after our
	// liveness expiration that's okay because we're either going to find the
	// record or we're not and if we don't then we'll push the cache and re-assert
	// that we're still the leaseholder. If somebody else becomes the leaseholder
	// then they will have to go through the same process.
	ls, pErr := r.redirectOnOrAcquireLease(ctx)
	if pErr != nil {
		return false, false, pErr.GoError()
	}

	// NB: It should be the case that the recordAliveAt timestamp
	// is before the current time and that the above lease check means that
	// the replica is the leaseholder at the current time. If recordAliveAt
	// happened to be newer than the current time we'd need to make sure that
	// the current Replica will be live at that time. Given that recordAliveAt
	// has to be before the batch timestamp for this request and we should
	// have forwarded the local clock to the batch timestamp, this can't
	// happen.
	// TODO(ajwerner): do we need to assert that indeed the recordAliveAt precedes
	// the batch timestamp? Probably not a bad sanity check.

	// We may be reading the protected timestamp cache while we're holding
	// the Replica.mu for reading. If we do so and find newer state in the cache
	// than the replica has cached, we want to update the replica's cached copy.
	// The guarantee we provide is that if a record is successfully verified then
	// the Replica's cachedProtectedTS will have a readAt value high enough to
	// include that record.
	var read cachedProtectedTimestampState
	defer r.maybeUpdateCachedProtectedTS(&read)
	r.mu.RLock()
	defer r.mu.RUnlock()
	defer read.clearIfNotNewer(r.mu.cachedProtectedTS)

	// If the key that routed this request to this range is now out of this
	// range's bounds, return an error for the client to try again on the
	// correct range.
	desc := r.descRLocked()
	if !kvserverbase.ContainsKeyRange(desc, args.Key, args.EndKey) {
		return false, false, roachpb.NewRangeKeyMismatchError(args.Key, args.EndKey, desc)
	}
	if args.Protected.LessEq(*r.mu.state.GCThreshold) {
		return false, false, nil
	}
	if args.RecordAliveAt.Less(ls.Lease.Start) {
		return true, false, nil
	}

	// Now we're in the case where it is possible that we will later attempt to
	// set the GC threshold above our protected point; to prevent that, we add
	// some state to the replica.
	r.protectedTimestampMu.Lock()
	defer r.protectedTimestampMu.Unlock()
	if args.Protected.Less(r.protectedTimestampMu.pendingGCThreshold) {
		return false, false, nil
	}

	var seen bool
	read = r.readProtectedTimestampsRLocked(ctx, func(r *ptpb.Record) {
		if r.ID == args.RecordID {
			seen = true
		}
	})

	// If we observed the record in question then we know that all future attempts
	// to run GC will observe the Record if it still exists. The one hazard we
	// need to avoid is a race whereby an attempt to run GC first checks the
	// protected timestamp state and then attempts to increase the GC threshold.
	// We set the minStateReadTimestamp here to avoid such races. The GC queue
	// will call markPendingGC just prior to sending a request to update the GC
	// threshold which will verify the safety of the new value relative to
	// minStateReadTimestamp.
	if seen {
		r.protectedTimestampMu.minStateReadTimestamp = read.readAt
		return true, false, nil
	}

	// Protected timestamp state has progressed past the point at which we
	// should see this record. This implies that the record has been removed.
	return false, read.readAt.Less(args.RecordAliveAt), nil
}
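
// The interleaving that minStateReadTimestamp guards against looks roughly
// like this (the ordering and timestamps are illustrative):
//
//	1. The GC queue reads protected timestamp state with readAt = t1 and sees
//	   no record.
//	2. A verification request reads newer state with readAt = t2 > t1, sees
//	   the record, and succeeds, setting minStateReadTimestamp = t2.
//	3. The GC queue, still acting on the state read at t1, tries to raise the
//	   GC threshold above the verified timestamp; markPendingGC(t1, ...) fails
//	   because t1 < minStateReadTimestamp, and the stale attempt is abandoned.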

// checkProtectedTimestampsForGC determines whether the Replica can run GC.
// If the Replica can run GC, this method returns the latest timestamp which
// can be used to determine a valid new GCThreshold. The policy is passed in
// rather than read from the replica state to ensure that the same value used
// for this calculation is used later.
//
// In the case that GC can proceed, three timestamps are returned: the timestamp
// corresponding to the state of the cache used to make the determination (used
// for markPendingGC when actually performing GC), the timestamp used as the
// basis to calculate the new gc threshold (used for scoring and reporting), and
// the new gc threshold itself.
func (r *Replica) checkProtectedTimestampsForGC(
	ctx context.Context, policy zonepb.GCPolicy,
) (canGC bool, cacheTimestamp, gcTimestamp, newThreshold hlc.Timestamp) {

	// We may be reading the protected timestamp cache while we're holding
	// the Replica.mu for reading. If we do so and find newer state in the cache
	// than the replica has cached, we want to update the replica's cached copy.
	// The guarantee we provide is that if a record is successfully verified then
	// the Replica's cachedProtectedTS will have a readAt value high enough to
	// include that record.
	var read cachedProtectedTimestampState
	defer r.maybeUpdateCachedProtectedTS(&read)
	r.mu.RLock()
	defer r.mu.RUnlock()
	defer read.clearIfNotNewer(r.mu.cachedProtectedTS)

	gcThreshold := *r.mu.state.GCThreshold
	lease := *r.mu.state.Lease

	// read.earliestRecord is the record with the earliest timestamp which is
	// at or above the existing gcThreshold.
	read = r.readProtectedTimestampsRLocked(ctx, nil)
	gcTimestamp = read.readAt
	if read.earliestRecord != nil {
		// NB: we want to allow GC up to the timestamp preceding the earliest valid
		// record.
		impliedGCTimestamp := gc.TimestampForThreshold(read.earliestRecord.Timestamp.Prev(), policy)
		if impliedGCTimestamp.Less(gcTimestamp) {
			gcTimestamp = impliedGCTimestamp
		}
	}

	if gcTimestamp.Less(lease.Start) {
		log.VEventf(ctx, 1, "not gc'ing replica %v due to new lease %v started after %v",
			r, lease, gcTimestamp)
		return false, hlc.Timestamp{}, hlc.Timestamp{}, hlc.Timestamp{}
	}

	newThreshold = gc.CalculateThreshold(gcTimestamp, policy)

	// If we've already GC'd right up to this threshold, there's no reason to
	// gc again.
	if newThreshold.Equal(gcThreshold) {
		return false, hlc.Timestamp{}, hlc.Timestamp{}, hlc.Timestamp{}
	}

	return true, read.readAt, gcTimestamp, newThreshold
}

// markPendingGC is called just prior to sending the GC request to increase the
// GC threshold during GC queue processing. This method synchronizes such
// requests with the processing of AdminVerifyProtectedTimestamp requests. Such
// synchronization is important to prevent races where the protected timestamp
// state is read at a stale point in time, a verification request then arrives
// which applies under a later cache state, and the gc queue, acting on the
// older cache state, attempts to set the gc threshold above a successfully
// verified record.
func (r *Replica) markPendingGC(readAt, newThreshold hlc.Timestamp) error {
	r.protectedTimestampMu.Lock()
	defer r.protectedTimestampMu.Unlock()
	if readAt.Less(r.protectedTimestampMu.minStateReadTimestamp) {
		return errors.Errorf("cannot set gc threshold to %v because read at %v < min %v",
			newThreshold, readAt, r.protectedTimestampMu.minStateReadTimestamp)
	}
	r.protectedTimestampMu.pendingGCThreshold = newThreshold
	return nil
}
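
// A rough sketch of how the GC queue is expected to combine the two methods
// above (illustrative only; the actual caller lives in the GC queue code, and
// policy stands in for the zone's GC policy):
//
//	canGC, cacheTimestamp, gcTimestamp, newThreshold := r.checkProtectedTimestampsForGC(ctx, policy)
//	if !canGC {
//		return // a protected record or a new lease currently prevents GC
//	}
//	if err := r.markPendingGC(cacheTimestamp, newThreshold); err != nil {
//		return // a record was verified against newer state; skip this GC attempt
//	}
//	// ... send the GC request that raises the GC threshold to newThreshold,
//	// using gcTimestamp for scoring and reporting ...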