github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/rangefeed/resolved_timestamp.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package rangefeed 12 13 import ( 14 "bytes" 15 "container/heap" 16 "fmt" 17 18 "github.com/cockroachdb/cockroach/pkg/roachpb" 19 "github.com/cockroachdb/cockroach/pkg/storage/enginepb" 20 "github.com/cockroachdb/cockroach/pkg/util/hlc" 21 "github.com/cockroachdb/cockroach/pkg/util/uuid" 22 ) 23 24 // A rangefeed's "resolved timestamp" is defined as the timestamp at which no 25 // future updates will be emitted to the feed at or before. The timestamp is 26 // monotonically increasing and is communicated through RangeFeedCheckpoint 27 // notifications whenever it changes. 28 // 29 // The resolved timestamp is closely tied to a Range's closed timestamp, but 30 // these concepts are not the same. Fundamentally, a closed timestamp is a 31 // property of a Range that restricts its state such that no "visible" data 32 // mutations are permitted at equal or earlier timestamps. This enables the 33 // guarantee that if the closed timestamp conditions are met, a follower replica 34 // has all state necessary to satisfy reads at or before the CT. On the other 35 // hand, a resolved timestamp is a property of a rangefeed that restrict its 36 // state such that no value notifications will be emitted at equal or earlier 37 // timestamps. The key difference here is that data mutations are allowed on a 38 // Range beneath a closed timestamp as long as they are not externally 39 // "visible". 
// This is not true of the resolved timestamp because a rangefeed is
// driven directly off the state of a Range through its Raft log updates. As
// such, all changes to a Range beneath a given timestamp that will end up
// published on a rangefeed, "visible" or not, must be made to the Range before
// its resolved timestamp can be advanced to that timestamp.
//
// This distinction becomes interesting when considering how committed
// transaction intents are published as rangefeed events. Because the rangefeed
// is driven off the Raft log, these events are published when the intents are
// resolved, not when their corresponding transactions are committed. This leads
// to an important case with unresolved intents where a Range's closed timestamp
// and its corresponding rangefeed's resolved timestamp diverge because the
// closed timestamp advances past the timestamp of the unresolved intents. This
// is permitted because intent resolution of an intent which is part of a
// committed or aborted transaction does not change the visible state of a
// range. In effect, this means that a range can "close" timestamp t2 before
// resolving an intent for a transaction at timestamp t1, where t1 < t2.
// However, the rangefeed cannot "resolve" timestamp t2 until after the event
// for the committed intent is published. This creates a scenario where the
// Range's closed timestamp could have advanced to t2 while the corresponding
// rangefeed's resolved timestamp lags behind at some time earlier than t1,
// waiting for the intent resolution to result in a rangefeed publication.
//
// It follows that the closed timestamp mechanism is a necessary, but not
// sufficient, solution to creating a resolved timestamp. The closed timestamp is
// a necessary basis for the resolved timestamp because without it there could
// never be any guarantee about new changes to a range.
However, the closed 66 // timestamp is not sufficient and could not replace the resolved timestamp 67 // directly because it does not wait for all changes to result in rangefeed 68 // notifications before advancing. In order to provide the proper guarantees for 69 // the resolved timestamp, it must be computed as the minimum of the closed 70 // timestamp and the timestamp of the earliest "unresolved intent" (see below) 71 // in the rangefeed's corresponding range of keys. This relies on one implicit 72 // assumption that is important enough to state: the closed timestamp must 73 // ensure that no unresolved intents will appear at timestamps that it has 74 // already closed off. Naively this property will hold, but it requires that 75 // changes to the closed timestamp and the tracking of unresolved intents be 76 // handled carefully and in a total order. 77 type resolvedTimestamp struct { 78 init bool 79 closedTS hlc.Timestamp 80 resolvedTS hlc.Timestamp 81 intentQ unresolvedIntentQueue 82 } 83 84 func makeResolvedTimestamp() resolvedTimestamp { 85 return resolvedTimestamp{ 86 intentQ: makeUnresolvedIntentQueue(), 87 } 88 } 89 90 // Get returns the current value of the resolved timestamp. 91 func (rts *resolvedTimestamp) Get() hlc.Timestamp { 92 return rts.resolvedTS 93 } 94 95 // Init informs the resolved timestamp that it has been provided all unresolved 96 // intents within its key range that may have timestamps lower than the initial 97 // closed timestamp. Once initialized, the resolvedTimestamp can begin operating 98 // in its steady state. The method returns whether this caused the resolved 99 // timestamp to move forward. 100 func (rts *resolvedTimestamp) Init() bool { 101 rts.init = true 102 // Once the resolvedTimestamp is initialized, all prior written intents 103 // should be accounted for, so reference counts for transactions that 104 // would drop below zero will all be due to aborted transactions. These 105 // can all be ignored. 
106 rts.intentQ.AllowNegRefCount(false) 107 return rts.recompute() 108 } 109 110 // IsInit returns whether the resolved timestamp is initialized. 111 func (rts *resolvedTimestamp) IsInit() bool { 112 return rts.init 113 } 114 115 // ForwardClosedTS indicates that the closed timestamp that serves as the basis 116 // for the resolved timestamp has advanced. The method returns whether this 117 // caused the resolved timestamp to move forward. 118 func (rts *resolvedTimestamp) ForwardClosedTS(newClosedTS hlc.Timestamp) bool { 119 if rts.closedTS.Forward(newClosedTS) { 120 return rts.recompute() 121 } 122 rts.assertNoChange() 123 return false 124 } 125 126 // ConsumeLogicalOp informs the resolved timestamp of the occupance of a logical 127 // operation within its range of tracked keys. This allows the structure to 128 // update its internal intent tracking to reflect the change. The method returns 129 // whether this caused the resolved timestamp to move forward. 130 func (rts *resolvedTimestamp) ConsumeLogicalOp(op enginepb.MVCCLogicalOp) bool { 131 if rts.consumeLogicalOp(op) { 132 return rts.recompute() 133 } 134 rts.assertNoChange() 135 return false 136 } 137 138 func (rts *resolvedTimestamp) consumeLogicalOp(op enginepb.MVCCLogicalOp) bool { 139 switch t := op.GetValue().(type) { 140 case *enginepb.MVCCWriteValueOp: 141 rts.assertOpAboveRTS(op, t.Timestamp) 142 return false 143 144 case *enginepb.MVCCWriteIntentOp: 145 rts.assertOpAboveRTS(op, t.Timestamp) 146 return rts.intentQ.IncRef(t.TxnID, t.TxnKey, t.TxnMinTimestamp, t.Timestamp) 147 148 case *enginepb.MVCCUpdateIntentOp: 149 return rts.intentQ.UpdateTS(t.TxnID, t.Timestamp) 150 151 case *enginepb.MVCCCommitIntentOp: 152 return rts.intentQ.DecrRef(t.TxnID, t.Timestamp) 153 154 case *enginepb.MVCCAbortIntentOp: 155 // An aborted intent does not necessarily indicate an aborted 156 // transaction. 
An AbortIntent operation can be the result of an intent 157 // that was written only in an earlier epoch being resolved after its 158 // transaction committed in a later epoch. Don't make any assumptions 159 // about the transaction other than to decrement its reference count. 160 return rts.intentQ.DecrRef(t.TxnID, hlc.Timestamp{}) 161 162 case *enginepb.MVCCAbortTxnOp: 163 // Unlike the previous case, an aborted transaction does indicate 164 // that none of the transaction's intents will ever be committed. 165 // This means that we can stop tracking the transaction entirely. 166 // Doing so is critical to ensure forward progress of the resolved 167 // timestamp in situtations where the oldest transaction on a range 168 // is abandoned and the locations of its intents are unknown. 169 // 170 // However, the transaction may also still be writing, updating, and 171 // resolving (aborting) its intents, so we need to be careful with 172 // how we handle any future operations from this transaction. There 173 // are three different operations we could see the zombie transaction 174 // perform: 175 // 176 // - MVCCWriteIntentOp: it could write another intent. This could result 177 // in "reintroducing" the transaction to the queue. We allow this 178 // to happen and rely on pushing the transaction again, eventually 179 // evicting the transaction from the queue for good. 180 // 181 // Just like any other transaction, this new intent will necessarily 182 // be pushed above the closed timestamp, so we don't need to worry 183 // about resolved timestamp regressions. 184 // 185 // - MVCCUpdateIntentOp: it could update one of its intents. If we're 186 // not already tracking the transaction then the queue will ignore 187 // the intent update. 188 // 189 // - MVCCAbortIntentOp: it could resolve one of its intents as aborted. 190 // This is the most likely case. Again, if we're not already tracking 191 // the transaction then the queue will ignore the intent abort. 
192 // 193 if !rts.IsInit() { 194 // We ignore MVCCAbortTxnOp operations until the queue is 195 // initialized. This is necessary because we allow txn reference 196 // counts to drop below zero before the queue is initialized and 197 // expect that all reference count decrements be balanced by a 198 // corresponding reference count increment. 199 // 200 // We could remove this restriction if we evicted all transactions 201 // with negative reference counts after initialization, but this is 202 // easier and more clear. 203 return false 204 } 205 return rts.intentQ.Del(t.TxnID) 206 207 default: 208 panic(fmt.Sprintf("unknown logical op %T", t)) 209 } 210 } 211 212 // recompute computes the resolved timestamp based on its respective closed 213 // timestamp and the in-flight intents that it is tracking. The method returns 214 // whether this caused the resolved timestamp to move forward. 215 func (rts *resolvedTimestamp) recompute() bool { 216 if !rts.IsInit() { 217 return false 218 } 219 newTS := rts.closedTS 220 if txn := rts.intentQ.Oldest(); txn != nil { 221 txnTS := txn.timestamp.FloorPrev() 222 if txnTS.Less(newTS) { 223 newTS = txnTS 224 } 225 } 226 if newTS.Less(rts.resolvedTS) { 227 panic(fmt.Sprintf("resolved timestamp regression, was %s, recomputed as %s", 228 rts.resolvedTS, newTS)) 229 } 230 return rts.resolvedTS.Forward(newTS) 231 } 232 233 // assertNoChange asserts that a recomputation of the resolved timestamp does 234 // not change its value. A violation of this assertion would indicate a logic 235 // error in the resolvedTimestamp implementation. 
236 func (rts *resolvedTimestamp) assertNoChange() { 237 before := rts.resolvedTS 238 changed := rts.recompute() 239 if changed || (before != rts.resolvedTS) { 240 panic(fmt.Sprintf("unexpected resolved timestamp change on recomputation, "+ 241 "was %s, recomputed as %s", before, rts.resolvedTS)) 242 } 243 } 244 245 // assertOpAboveTimestamp asserts that this operation is at a larger timestamp 246 // than the current resolved timestamp. A violation of this assertion would 247 // indicate a failure of the closed timestamp mechanism. 248 func (rts *resolvedTimestamp) assertOpAboveRTS(op enginepb.MVCCLogicalOp, opTS hlc.Timestamp) { 249 if opTS.LessEq(rts.resolvedTS) { 250 panic(fmt.Sprintf("resolved timestamp %s equal to or above timestamp of operation %v", 251 rts.resolvedTS, op)) 252 } 253 } 254 255 // An "unresolved intent" in the context of the rangefeed primitive is an intent 256 // that may at some point in the future result in a RangeFeedValue publication. 257 // Based on this definition, there are three possible states that an extent 258 // intent can be in while fitting the requirement to be an "unresolved intent": 259 // 1. part of a PENDING transaction 260 // 2. part of a STAGING transaction that has not been explicitly committed yet 261 // 3. part of a COMMITTED transaction but not yet resolved due to the asynchronous 262 // nature of intent resolution 263 // Notably, this means that an intent that exists but that is known to be part 264 // of an ABORTED transaction is not considered "unresolved", even if it has yet 265 // to be cleaned up. In the context of rangefeeds, the intent's fate is resolved 266 // to never result in a RangeFeedValue publication. 267 // 268 // Defining unresolved intents in this way presents two paths for an unresolved 269 // intent to become resolved (and thus decrement the unresolvedTxn's ref count). 270 // An unresolved intent can become resolved if: 271 // 1. 
it is COMMITTED or ABORTED through the traditional intent resolution 272 // process. 273 // 2. it's transaction is observed to be ABORTED, meaning that it is by 274 // definition resolved even if it has yet to be cleaned up by the intent 275 // resolution process. 276 // 277 // An unresolvedTxn is a transaction that has one or more unresolved intents on 278 // a given range. The structure itself maintains metadata about the transaction 279 // along with a reference count of the number of unresolved intents created by 280 // the transaction on a given range. 281 type unresolvedTxn struct { 282 txnID uuid.UUID 283 txnKey roachpb.Key 284 txnMinTimestamp hlc.Timestamp 285 timestamp hlc.Timestamp 286 refCount int // count of unresolved intents 287 288 // The index of the item in the unresolvedTxnHeap, maintained by the 289 // heap.Interface methods. 290 index int 291 } 292 293 // asTxnMeta returns a TxnMeta representation of the unresolved transaction. 294 func (t *unresolvedTxn) asTxnMeta() enginepb.TxnMeta { 295 return enginepb.TxnMeta{ 296 ID: t.txnID, 297 Key: t.txnKey, 298 MinTimestamp: t.txnMinTimestamp, 299 WriteTimestamp: t.timestamp, 300 } 301 } 302 303 // unresolvedTxnHeap implements heap.Interface and holds unresolvedTxns. 304 // Transactions are prioritized based on their timestamp such that the oldest 305 // unresolved transaction will rise to the top of the heap. 306 type unresolvedTxnHeap []*unresolvedTxn 307 308 func (h unresolvedTxnHeap) Len() int { return len(h) } 309 310 func (h unresolvedTxnHeap) Less(i, j int) bool { 311 // container/heap constructs a min-heap by default, so prioritize the txn 312 // with the smaller timestamp. Break ties by comparing IDs to establish a 313 // total order. 
314 if h[i].timestamp == h[j].timestamp { 315 return bytes.Compare(h[i].txnID.GetBytes(), h[j].txnID.GetBytes()) < 0 316 } 317 return h[i].timestamp.Less(h[j].timestamp) 318 } 319 320 func (h unresolvedTxnHeap) Swap(i, j int) { 321 h[i], h[j] = h[j], h[i] 322 h[i].index, h[j].index = i, j 323 } 324 325 func (h *unresolvedTxnHeap) Push(x interface{}) { 326 n := len(*h) 327 txn := x.(*unresolvedTxn) 328 txn.index = n 329 *h = append(*h, txn) 330 } 331 332 func (h *unresolvedTxnHeap) Pop() interface{} { 333 old := *h 334 n := len(old) 335 txn := old[n-1] 336 txn.index = -1 // for safety 337 old[n-1] = nil // for gc 338 *h = old[0 : n-1] 339 return txn 340 } 341 342 // unresolvedIntentQueue tracks all unresolved intents that exist within the key 343 // bounds of a range. It does so by tracking every transaction that contains at 344 // least one unresolved intent on the range. For each of these transactions, the 345 // queue maintains a count of the number of unresolved intents it contains on 346 // the range. By doing so and watching for when the count drops to zero, the 347 // queue can determine when a transaction is no longer unresolved. 348 // 349 // The queue maintains an ordering of transactions by timestamp. This allows it 350 // to determine the oldest unresolved intent that it's tracking and by extension 351 // the earliest possible time that a RangeFeedValue can be emitted at. Combined 352 // with a closed timestamp, which guarantees that no transactions can write new 353 // intents at or beneath it, a resolved timestamp can be constructed. 354 type unresolvedIntentQueue struct { 355 txns map[uuid.UUID]*unresolvedTxn 356 minHeap unresolvedTxnHeap 357 allowNegRefCount bool 358 } 359 360 func makeUnresolvedIntentQueue() unresolvedIntentQueue { 361 return unresolvedIntentQueue{ 362 txns: make(map[uuid.UUID]*unresolvedTxn), 363 allowNegRefCount: true, 364 } 365 } 366 367 // Len returns the number of transactions being tracked. 
368 func (uiq *unresolvedIntentQueue) Len() int { 369 return uiq.minHeap.Len() 370 } 371 372 // Oldest returns the oldest transaction that is being tracked in the 373 // unresolvedIntentQueue, or nil if the queue is empty. If two transactions have 374 // the same timestamp, breaks the tie by returning the transaction with the ID 375 // that sorts first. 376 func (uiq *unresolvedIntentQueue) Oldest() *unresolvedTxn { 377 if uiq.Len() == 0 { 378 return nil 379 } 380 return uiq.minHeap[0] 381 } 382 383 // Before returns all transactions that have timestamps before a certain 384 // timestamp. It does so in O(n) time, where n is the number of matching 385 // transactions, NOT the total number of transactions being tracked. The 386 // resulting transactions will not be in sorted order. 387 func (uiq *unresolvedIntentQueue) Before(ts hlc.Timestamp) []*unresolvedTxn { 388 var txns []*unresolvedTxn 389 var collect func(int) 390 collect = func(i int) { 391 if len(uiq.minHeap) > i && uiq.minHeap[i].timestamp.Less(ts) { 392 txns = append(txns, uiq.minHeap[i]) 393 collect((2 * i) + 1) // left child 394 collect((2 * i) + 2) // right child 395 } 396 } 397 collect(0) 398 return txns 399 } 400 401 // IncRef increments the reference count of the specified transaction. It 402 // returns whether the update advanced the timestamp of the oldest transaction 403 // in the queue. 404 func (uiq *unresolvedIntentQueue) IncRef( 405 txnID uuid.UUID, txnKey roachpb.Key, txnMinTS, ts hlc.Timestamp, 406 ) bool { 407 return uiq.updateTxn(txnID, txnKey, txnMinTS, ts, +1) 408 } 409 410 // DecrRef decrements the reference count of the specified transaction. It 411 // returns whether the update advanced the timestamp of the oldest transaction 412 // in the queue. 
413 func (uiq *unresolvedIntentQueue) DecrRef(txnID uuid.UUID, ts hlc.Timestamp) bool { 414 return uiq.updateTxn(txnID, nil, hlc.Timestamp{}, ts, -1) 415 } 416 417 // UpdateTS updates the timestamp of the specified transaction without modifying 418 // its intent reference count. It returns whether the update advanced the 419 // timestamp of the oldest transaction in the queue. 420 func (uiq *unresolvedIntentQueue) UpdateTS(txnID uuid.UUID, ts hlc.Timestamp) bool { 421 return uiq.updateTxn(txnID, nil, hlc.Timestamp{}, ts, 0) 422 } 423 424 func (uiq *unresolvedIntentQueue) updateTxn( 425 txnID uuid.UUID, txnKey roachpb.Key, txnMinTS, ts hlc.Timestamp, delta int, 426 ) bool { 427 txn, ok := uiq.txns[txnID] 428 if !ok { 429 if delta == 0 || (delta < 0 && !uiq.allowNegRefCount) { 430 // Unknown txn. 431 return false 432 } 433 434 // Add new txn to the queue. 435 txn = &unresolvedTxn{ 436 txnID: txnID, 437 txnKey: txnKey, 438 txnMinTimestamp: txnMinTS, 439 timestamp: ts, 440 refCount: delta, 441 } 442 uiq.txns[txn.txnID] = txn 443 heap.Push(&uiq.minHeap, txn) 444 445 // Adding a new txn can't advance the queue's earliest timestamp. 446 return false 447 } 448 449 // Will changes to the txn advance the queue's earliest timestamp? 450 wasMin := txn.index == 0 451 452 txn.refCount += delta 453 if txn.refCount == 0 || (txn.refCount < 0 && !uiq.allowNegRefCount) { 454 // Remove txn from the queue. 455 // NB: the txn.refCount < 0 case is not exercised by the external 456 // interface of this type because currently |delta| <= 1, but it 457 // is included for robustness. 458 delete(uiq.txns, txn.txnID) 459 heap.Remove(&uiq.minHeap, txn.index) 460 return wasMin 461 } 462 463 // Forward the txn's timestamp. Need to fix heap if timestamp changes. 464 if txn.timestamp.Forward(ts) { 465 heap.Fix(&uiq.minHeap, txn.index) 466 return wasMin 467 } 468 return false 469 } 470 471 // Del removes the transaction from the queue. 
It returns whether the update had 472 // an effect on the oldest transaction in the queue. 473 func (uiq *unresolvedIntentQueue) Del(txnID uuid.UUID) bool { 474 // This implementation is logically equivalent to the following, but 475 // it avoids underflow conditions: 476 // return uiq.updateTxn(txnID, nil, hlc.Timestamp{}, hlc.Timestamp{}, math.MinInt64) 477 478 txn, ok := uiq.txns[txnID] 479 if !ok { 480 // Unknown txn. 481 return false 482 } 483 484 // Will deleting the txn advance the queue's earliest timestamp? 485 wasMin := txn.index == 0 486 487 // Remove txn from the queue. 488 delete(uiq.txns, txn.txnID) 489 heap.Remove(&uiq.minHeap, txn.index) 490 return wasMin 491 } 492 493 // AllowNegRefCount instruts the unresolvedIntentQueue on whether or not to 494 // allow the reference count on transactions to drop below zero. If disallowed, 495 // the method also asserts that all unresolved intent refcounts for transactions 496 // currently in the queue are positive. Assertion takes O(n) time, where n is 497 // the total number of transactions being tracked in the queue. 498 func (uiq *unresolvedIntentQueue) AllowNegRefCount(b bool) { 499 if !b { 500 // Assert that the queue is currently in compliance. 501 uiq.assertOnlyPositiveRefCounts() 502 } 503 uiq.allowNegRefCount = b 504 } 505 506 func (uiq *unresolvedIntentQueue) assertOnlyPositiveRefCounts() { 507 for _, txn := range uiq.txns { 508 if txn.refCount <= 0 { 509 panic(fmt.Sprintf("negative refcount %d for txn %+v", txn.refCount, txn)) 510 } 511 } 512 }