github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/closedts/minprop/tracker.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package minprop 12 13 import ( 14 "context" 15 "fmt" 16 "sort" 17 18 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts" 19 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb" 20 "github.com/cockroachdb/cockroach/pkg/roachpb" 21 "github.com/cockroachdb/cockroach/pkg/util/hlc" 22 "github.com/cockroachdb/cockroach/pkg/util/log" 23 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 24 ) 25 26 // Tracker implements TrackerI. 27 type Tracker struct { 28 mu struct { 29 syncutil.Mutex 30 // closed is the most recently closed timestamp. 31 closed hlc.Timestamp 32 closedEpoch ctpb.Epoch 33 34 // The variables below track required information for the next closed 35 // timestamp and beyond. First, `next` is the timestamp that will be 36 // closed out next (i.e. will replace `closed`). 37 // 38 // "left" and "right" refers to how the timestamps at which the 39 // associated command evaluations take place relate to `next`. 40 // `left`-tracked proposals are taken into account for the next closed 41 // timestamp, i.e. they could mutate at timestamps <= `next`. `right` 42 // proposals affect only MVCC timestamps > `next` and thus will become 43 // relevant only after `next` has been closed out, at which point the 44 // "right" set will replace the "left". 45 // 46 // closed next 47 // | left | right 48 // | | 49 // | | 50 // v v 51 //---------------------------------------------------------> time 52 // 53 // A replica wishing to serve a follower read will first have to catch 54 // up to a lease applied index that is guaranteed to include all writes 55 // affecting the closed timestamp or below. When `next` is closed out, 56 // the set of relevant Lease Applied Indexes will be stored in `leftMLAI`. 57 // 58 // This is augmented by reference counts for the proposals currently in 59 // the process of evaluating. `next` can only be closed out once 60 // `leftRef` has been drained (i.e. has dropped to zero); new proposals 61 // are always forced above `next` and consequently count towards 62 // `rightRef`. 63 // 64 // Epochs track the highest liveness epoch observed for any released 65 // proposals. Tracking a max epoch allows the MPT to provide some MLAI 66 // information about the current epoch when calls to Close straddle multiple 67 // different epochs. Before epoch tracking was added the client of the MPT 68 // was forced to assume that the MLAI information from the current call to 69 // Close corresponded to the highest known epoch as of the previous call to 70 // Close. This is problematic in cases where an epoch change leads to a 71 // lease change for an otherwise quiescent range. If this mechanism were 72 // not in place then the client would never learn about an MLAI for the 73 // current epoch. Clients provide their view of the current epoch to calls 74 // to Close which use this information to determine whether the current 75 // state should be moved and whether the caller can make use of the 76 // currently tracked data. Each side tracks data which corresponds exactly 77 // to the side's epoch value. Releasing a proposal into the tracker at a 78 // later epoch than is currently tracked will result in the current data 79 // corresponding to the prior epoch to be evicted. 80 81 next hlc.Timestamp 82 leftMLAI, rightMLAI map[roachpb.RangeID]ctpb.LAI 83 leftRef, rightRef int 84 leftEpoch, rightEpoch ctpb.Epoch 85 } 86 } 87 88 var _ closedts.TrackerI = (*Tracker)(nil) 89 90 // NewTracker returns a Tracker initialized to a closed timestamp of zero and 91 // a next closed timestamp of one logical tick past zero. 92 func NewTracker() *Tracker { 93 t := &Tracker{} 94 const initialEpoch = 1 95 t.mu.closedEpoch = initialEpoch 96 t.mu.leftEpoch = initialEpoch 97 t.mu.rightEpoch = initialEpoch 98 t.mu.next = hlc.Timestamp{Logical: 1} 99 t.mu.leftMLAI = map[roachpb.RangeID]ctpb.LAI{} 100 t.mu.rightMLAI = map[roachpb.RangeID]ctpb.LAI{} 101 return t 102 } 103 104 // String prints a string representation of the Tracker's state. 105 func (t *Tracker) String() string { 106 t.mu.Lock() 107 defer t.mu.Unlock() 108 closed, next := t.mu.closed, t.mu.next 109 leftRef, rightRef := t.mu.leftRef, t.mu.rightRef 110 leftEpoch, rightEpoch := t.mu.leftEpoch, t.mu.rightEpoch 111 112 type item struct { 113 rangeID roachpb.RangeID 114 mlai ctpb.LAI 115 left bool 116 } 117 118 var lais []item 119 for rangeID, mlai := range t.mu.leftMLAI { 120 lais = append(lais, item{rangeID, mlai, true}) 121 } 122 for rangeID, mlai := range t.mu.rightMLAI { 123 lais = append(lais, item{rangeID, mlai, false}) 124 } 125 126 sort.Slice(lais, func(i, j int) bool { 127 if lais[i].rangeID != lais[j].rangeID { 128 return lais[i].rangeID < lais[j].rangeID 129 } 130 return lais[i].mlai < lais[j].mlai 131 }) 132 133 var lines string 134 for _, item := range lais { 135 var format string 136 if !item.left { 137 format = ` | @ %-2d (r%d) 138 ` 139 } else { 140 format = ` | %11d @ (r%d) 141 ` 142 } 143 lines += fmt.Sprintf(format, item.mlai, item.rangeID) 144 } 145 146 return fmt.Sprintf(` 147 closed=%s 148 | next=%s 149 | left | right 150 | %3d # %d 151 | %3d e %d 152 `+lines+ 153 ` v v 154 ---------------------------------------------------------> time 155 `, 156 closed, next, leftRef, rightRef, leftEpoch, rightEpoch, 157 ) 158 } 159 160 // Close attempts to close out the current candidate timestamp (replacing it 161 // with the provided one). This is possible only if tracked proposals that were 162 // evaluating when Close was previously called have since completed. On success, 163 // all subsequent proposals will be forced to evaluate strictly above the 164 // provided timestamp, and the timestamp previously passed to Close is returned 165 // as a closed timestamp along with a map of minimum Lease Applied Indexes 166 // reflecting the updates for the past period. On failure, the previous closed 167 // timestamp is returned along with a nil map (which can be treated by callers 168 // like a successful call that happens to not return any new information). 169 // Similarly, failure to provide a timestamp strictly larger than that to be 170 // closed out next results in the same "idempotent" return values. 171 // 172 // Callers additionally provide the current expected epoch value, the liveness 173 // epoch at which the caller intends to advertise this closed timestamp. The 174 // caller must know that it is live at a timestamp greater than or equal to the 175 // timestamp which the tracker will close. For correctness purposes this will 176 // be the case if the caller knows that it is live at next and calls to Close() 177 // pass monontic calues for next. If the current expected epoch is older than 178 // the currently tracked data then the timestamp will fail to be closed. If the 179 // expected epoch value is older than the epoch tracked on the left but 180 // corresponds to the epoch of the previous successful close then the previous 181 // closed timestamp is returned along with a nil map. This situation is just 182 // like the unsuccessful close scenario due to unreleased proposals. This 183 // behavior enables the caller to successfully obtain the tracked data at the 184 // newer epoch in a later query after its epoch has updated. If the caller's 185 // expected epoch is even older than the previously returned epoch then zero 186 // values are returned. If the caller's expected epoch is newer than that of 187 // tracked data the state of the tracker is progressed but zero values are 188 // returned. 189 func (t *Tracker) Close( 190 next hlc.Timestamp, expCurEpoch ctpb.Epoch, 191 ) (ts hlc.Timestamp, mlai map[roachpb.RangeID]ctpb.LAI, ok bool) { 192 t.mu.Lock() 193 defer t.mu.Unlock() 194 195 if log.V(3) { 196 log.Infof(context.TODO(), 197 "close: leftRef=%d (ep: %d) rightRef=%d (ep: %d) next=%s closed=%s@ (ep: %d) new=%s (ep: %d)", 198 t.mu.leftRef, t.mu.leftEpoch, t.mu.rightRef, t.mu.rightEpoch, t.mu.next, 199 t.mu.closed, t.mu.closedEpoch, next, expCurEpoch) 200 } 201 202 // Make sure to not let `t.mu.next` regress, or we'll accept proposals 203 // that violate earlier closed timestamps. (And if it stayed the same 204 // the logic in the closure returned from Track would fall apart). 205 canClose := t.mu.leftRef == 0 && t.mu.next.Less(next) 206 207 // NB: the expected closed epoch may not match the epoch for the timestamp we 208 // are currently closing. If the expected closed epoch is earlier than the 209 // epoch tracked on the left then the caller likely read its liveness just 210 // before an epoch change and we should not move the tracker state as the 211 // caller will likely visit again with the new epoch and would like the 212 // tracked information. If the expCurEpoch is greater than or equal to the 213 // current epoch, proceed with closing out the current timestamp, deferring 214 // the decision regarding whether to return the updated state based on epoch 215 // until after updating the data. 216 if canClose && t.mu.leftEpoch <= expCurEpoch { 217 // NB: if rightRef is also zero, then nothing is in flight right now and 218 // we could theoretically close out `next`. However, we'd also have to 219 // merge the left and right MLAI maps, and would force followers to 220 // catch up to more commands much more rapidly than can be expected of 221 // them. If we want to make use of this optimization, we should emit 222 // two closed timestamp updates for this case. 223 t.mu.closed = t.mu.next 224 t.mu.closedEpoch = t.mu.leftEpoch 225 mlai = t.mu.leftMLAI 226 227 // NB: if the expCurEpoch is after the epoch tracked on the right, we'll 228 // never be able to use that information so clear it. The below logic is 229 // not required for correctness but adds an invariant that after a call to 230 // Close with a give expCurEpoch no state corresponding to an earlier epoch 231 // will be tracked on either side. Without this logic, subsequent proposals 232 // or Close calls at the later epoch would lead to this data being 233 // discarded at that point. 234 if t.mu.rightEpoch < expCurEpoch { 235 t.mu.rightEpoch = expCurEpoch 236 clearMLAIMap(t.mu.rightMLAI) 237 } 238 239 // `next` moves forward to the provided timestamp, and picks up the 240 // right refcount and MLAIs (so that it is now responsible for tracking 241 // everything that's in-flight). 242 t.mu.leftMLAI = t.mu.rightMLAI 243 t.mu.leftRef = t.mu.rightRef 244 t.mu.leftEpoch = t.mu.rightEpoch 245 t.mu.rightMLAI = map[roachpb.RangeID]ctpb.LAI{} 246 t.mu.rightRef = 0 247 248 t.mu.next = next 249 } 250 251 if t.mu.closedEpoch != expCurEpoch { 252 return hlc.Timestamp{}, nil, false 253 } 254 return t.mu.closed, mlai, true 255 } 256 257 // Track is called before evaluating a proposal. It returns the minimum 258 // timestamp at which the proposal can be evaluated (i.e. the request timestamp 259 // needs to be forwarded if necessary), and acquires a reference with the 260 // Tracker. This reference is released by calling the returned closure either 261 // a) before proposing the command, supplying the Lease Applied Index at which 262 // the proposal will be carried out, or 263 // b) with zero arguments if the command won't end up being proposed (i.e. hit 264 // an error during evaluation). 265 // 266 // The ReleaseFunc is not thread safe. For convenience, it may be called with 267 // zero arguments once after a regular call. 268 func (t *Tracker) Track(ctx context.Context) (hlc.Timestamp, closedts.ReleaseFunc) { 269 shouldLog := log.V(3) 270 271 t.mu.Lock() 272 minProp := t.mu.next.Next() 273 t.mu.rightRef++ 274 t.mu.Unlock() 275 276 if shouldLog { 277 log.Infof(ctx, "track: proposal on the right at minProp %s", minProp) 278 } 279 280 var calls int 281 release := func(ctx context.Context, epoch ctpb.Epoch, rangeID roachpb.RangeID, lai ctpb.LAI) { 282 calls++ 283 if calls != 1 { 284 if lai != 0 || rangeID != 0 || calls > 2 { 285 log.Fatalf(ctx, "command released %d times, this time with arguments (%d, %d)", 286 log.Safe(calls), log.Safe(rangeID), log.Safe(lai)) 287 } 288 return 289 } 290 t.release(ctx, minProp, epoch, rangeID, lai, shouldLog) 291 } 292 293 return minProp, release 294 } 295 296 // release is the business logic to release properly account for the release of 297 // a tracked proposal. It is called from the ReleaseFunc closure returned from 298 // Track. 299 func (t *Tracker) release( 300 ctx context.Context, 301 minProp hlc.Timestamp, 302 epoch ctpb.Epoch, 303 rangeID roachpb.RangeID, 304 lai ctpb.LAI, 305 shouldLog bool, 306 ) { 307 t.mu.Lock() 308 defer t.mu.Unlock() 309 var left bool 310 if minProp == t.mu.closed.Next() { 311 left = true 312 } else if minProp == t.mu.next.Next() { 313 left = false 314 } else { 315 log.Fatalf(ctx, "min proposal %s not tracked under closed (%s) or next (%s) timestamp", minProp, t.mu.closed, t.mu.next) 316 } 317 // If the update is from the left side, clear all existing MLAIs from the left 318 // to uphold the invariant that all tracked MLAIs belong to the same (and 319 // largest seen) epoch. It would not violate correctness to clear the data on 320 // the left even if the proposal being released is tracked on the right; it is 321 // likely that the next call to close will observe the later epoch and thus 322 // not read this data but the code chooses to retain it. 323 if left && epoch > t.mu.leftEpoch { 324 t.mu.leftEpoch = epoch 325 clearMLAIMap(t.mu.leftMLAI) 326 } 327 // The right side is bumped and cleared when the epoch increases without 328 // taking into account which side the current proposal is tracked under 329 // because bumping the left side implies that the information from the right 330 // side will never be retrieved by the client (as epochs only ever go up and 331 // the current left will be emitted before the current right side). 332 if epoch > t.mu.rightEpoch { 333 t.mu.rightEpoch = epoch 334 clearMLAIMap(t.mu.rightMLAI) 335 } 336 if left { 337 releaseProposal(ctx, "left", shouldLog, minProp, rangeID, lai, 338 &t.mu.leftRef, t.mu.leftMLAI, t.mu.leftEpoch != epoch) 339 } else { 340 releaseProposal(ctx, "right", shouldLog, minProp, rangeID, lai, 341 &t.mu.rightRef, t.mu.rightMLAI, t.mu.rightEpoch != epoch) 342 } 343 } 344 345 func clearMLAIMap(m map[roachpb.RangeID]ctpb.LAI) { 346 for rangeID := range m { 347 delete(m, rangeID) 348 } 349 } 350 351 func releaseProposal( 352 ctx context.Context, 353 side string, 354 shouldLog bool, 355 minProp hlc.Timestamp, 356 rangeID roachpb.RangeID, 357 lai ctpb.LAI, 358 refs *int, 359 mlaiMap map[roachpb.RangeID]ctpb.LAI, 360 fromPreviousEpoch bool, 361 ) { 362 if shouldLog { 363 log.Infof(ctx, "release: minprop %s on r%d@%d tracked on the %s", minProp, rangeID, lai, side) 364 } 365 *refs-- 366 if *refs < 0 { 367 log.Fatalf(ctx, "min proposal %s ref count < 0", side) 368 } 369 if rangeID == 0 { 370 return 371 } 372 if !fromPreviousEpoch { 373 if curLAI, found := mlaiMap[rangeID]; !found || curLAI < lai { 374 mlaiMap[rangeID] = lai 375 } 376 } 377 }