github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/rangefeed/resolved_timestamp.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package rangefeed
    12  
    13  import (
    14  	"bytes"
    15  	"container/heap"
    16  	"fmt"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    19  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    20  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    21  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    22  )
    23  
// A rangefeed's "resolved timestamp" is defined as the timestamp at which no
// future updates will be emitted to the feed at or before. The timestamp is
// monotonically increasing and is communicated through RangeFeedCheckpoint
// notifications whenever it changes.
//
// The resolved timestamp is closely tied to a Range's closed timestamp, but
// these concepts are not the same. Fundamentally, a closed timestamp is a
// property of a Range that restricts its state such that no "visible" data
// mutations are permitted at equal or earlier timestamps. This enables the
// guarantee that if the closed timestamp conditions are met, a follower replica
// has all state necessary to satisfy reads at or before the CT. On the other
// hand, a resolved timestamp is a property of a rangefeed that restricts its
// state such that no value notifications will be emitted at equal or earlier
// timestamps. The key difference here is that data mutations are allowed on a
// Range beneath a closed timestamp as long as they are not externally
// "visible". This is not true of the resolved timestamp because a rangefeed is
// driven directly off the state of a Range through its Raft log updates. As
// such, all changes to a Range beneath a given timestamp that will end up
// published on a rangefeed, "visible" or not, must be made to the Range before
// its resolved timestamp can be advanced to that timestamp.
//
// This distinction becomes interesting when considering how committed
// transaction intents are published as rangefeed events. Because the rangefeed
// is driven off the Raft log, these events are published when the intents are
// resolved, not when their corresponding transactions are committed. This leads
// to an important case with unresolved intents where a Range's closed timestamp
// and its corresponding rangefeed's resolved timestamp diverge because the
// closed timestamp advances past the timestamp of the unresolved intents. This
// is permitted because intent resolution of an intent which is part of a
// committed or aborted transaction does not change the visible state of a
// range. In effect, this means that a range can "close" timestamp t2 before
// resolving an intent for a transaction at timestamp t1, where t1 < t2.
// However, the rangefeed cannot "resolve" timestamp t2 until after the event
// for the committed intent is published. This creates a scenario where the
// Range's closed timestamp could have advanced to t2 while the corresponding
// rangefeed's resolved timestamp lags behind at some time earlier than t1,
// waiting for the intent resolution to result in a rangefeed publication.
//
// It follows that the closed timestamp mechanism is a necessary, but not
// sufficient, solution to creating a resolved timestamp. The closed timestamp is
// a necessary basis for the resolved timestamp because without it there could
// never be any guarantee about new changes to a range. However, the closed
// timestamp is not sufficient and could not replace the resolved timestamp
// directly because it does not wait for all changes to result in rangefeed
// notifications before advancing. In order to provide the proper guarantees for
// the resolved timestamp, it must be computed as the minimum of the closed
// timestamp and the timestamp of the earliest "unresolved intent" (see below)
// in the rangefeed's corresponding range of keys. This relies on one implicit
// assumption that is important enough to state: the closed timestamp must
// ensure that no unresolved intents will appear at timestamps that it has
// already closed off. Naively this property will hold, but it requires that
// changes to the closed timestamp and the tracking of unresolved intents be
// handled carefully and in a total order.
type resolvedTimestamp struct {
	// init is set by Init. While it is false, recompute is a no-op that
	// returns false, so resolvedTS never advances.
	init bool
	// closedTS is the latest closed timestamp supplied via ForwardClosedTS.
	// It is the starting point for every recomputation of resolvedTS.
	closedTS hlc.Timestamp
	// resolvedTS is the current resolved timestamp, as returned by Get.
	// recompute panics if it would ever move backwards.
	resolvedTS hlc.Timestamp
	// intentQ tracks the transactions with unresolved intents. The oldest
	// tracked transaction bounds resolvedTS from above in recompute.
	intentQ unresolvedIntentQueue
}
    83  
    84  func makeResolvedTimestamp() resolvedTimestamp {
    85  	return resolvedTimestamp{
    86  		intentQ: makeUnresolvedIntentQueue(),
    87  	}
    88  }
    89  
    90  // Get returns the current value of the resolved timestamp.
    91  func (rts *resolvedTimestamp) Get() hlc.Timestamp {
    92  	return rts.resolvedTS
    93  }
    94  
    95  // Init informs the resolved timestamp that it has been provided all unresolved
    96  // intents within its key range that may have timestamps lower than the initial
    97  // closed timestamp. Once initialized, the resolvedTimestamp can begin operating
    98  // in its steady state. The method returns whether this caused the resolved
    99  // timestamp to move forward.
   100  func (rts *resolvedTimestamp) Init() bool {
   101  	rts.init = true
   102  	// Once the resolvedTimestamp is initialized, all prior written intents
   103  	// should be accounted for, so reference counts for transactions that
   104  	// would drop below zero will all be due to aborted transactions. These
   105  	// can all be ignored.
   106  	rts.intentQ.AllowNegRefCount(false)
   107  	return rts.recompute()
   108  }
   109  
   110  // IsInit returns whether the resolved timestamp is initialized.
   111  func (rts *resolvedTimestamp) IsInit() bool {
   112  	return rts.init
   113  }
   114  
   115  // ForwardClosedTS indicates that the closed timestamp that serves as the basis
   116  // for the resolved timestamp has advanced. The method returns whether this
   117  // caused the resolved timestamp to move forward.
   118  func (rts *resolvedTimestamp) ForwardClosedTS(newClosedTS hlc.Timestamp) bool {
   119  	if rts.closedTS.Forward(newClosedTS) {
   120  		return rts.recompute()
   121  	}
   122  	rts.assertNoChange()
   123  	return false
   124  }
   125  
   126  // ConsumeLogicalOp informs the resolved timestamp of the occupance of a logical
   127  // operation within its range of tracked keys. This allows the structure to
   128  // update its internal intent tracking to reflect the change. The method returns
   129  // whether this caused the resolved timestamp to move forward.
   130  func (rts *resolvedTimestamp) ConsumeLogicalOp(op enginepb.MVCCLogicalOp) bool {
   131  	if rts.consumeLogicalOp(op) {
   132  		return rts.recompute()
   133  	}
   134  	rts.assertNoChange()
   135  	return false
   136  }
   137  
   138  func (rts *resolvedTimestamp) consumeLogicalOp(op enginepb.MVCCLogicalOp) bool {
   139  	switch t := op.GetValue().(type) {
   140  	case *enginepb.MVCCWriteValueOp:
   141  		rts.assertOpAboveRTS(op, t.Timestamp)
   142  		return false
   143  
   144  	case *enginepb.MVCCWriteIntentOp:
   145  		rts.assertOpAboveRTS(op, t.Timestamp)
   146  		return rts.intentQ.IncRef(t.TxnID, t.TxnKey, t.TxnMinTimestamp, t.Timestamp)
   147  
   148  	case *enginepb.MVCCUpdateIntentOp:
   149  		return rts.intentQ.UpdateTS(t.TxnID, t.Timestamp)
   150  
   151  	case *enginepb.MVCCCommitIntentOp:
   152  		return rts.intentQ.DecrRef(t.TxnID, t.Timestamp)
   153  
   154  	case *enginepb.MVCCAbortIntentOp:
   155  		// An aborted intent does not necessarily indicate an aborted
   156  		// transaction. An AbortIntent operation can be the result of an intent
   157  		// that was written only in an earlier epoch being resolved after its
   158  		// transaction committed in a later epoch. Don't make any assumptions
   159  		// about the transaction other than to decrement its reference count.
   160  		return rts.intentQ.DecrRef(t.TxnID, hlc.Timestamp{})
   161  
   162  	case *enginepb.MVCCAbortTxnOp:
   163  		// Unlike the previous case, an aborted transaction does indicate
   164  		// that none of the transaction's intents will ever be committed.
   165  		// This means that we can stop tracking the transaction entirely.
   166  		// Doing so is critical to ensure forward progress of the resolved
   167  		// timestamp in situtations where the oldest transaction on a range
   168  		// is abandoned and the locations of its intents are unknown.
   169  		//
   170  		// However, the transaction may also still be writing, updating, and
   171  		// resolving (aborting) its intents, so we need to be careful with
   172  		// how we handle any future operations from this transaction. There
   173  		// are three different operations we could see the zombie transaction
   174  		// perform:
   175  		//
   176  		// - MVCCWriteIntentOp: it could write another intent. This could result
   177  		//     in "reintroducing" the transaction to the queue. We allow this
   178  		//     to happen and rely on pushing the transaction again, eventually
   179  		//     evicting the transaction from the queue for good.
   180  		//
   181  		//     Just like any other transaction, this new intent will necessarily
   182  		//     be pushed above the closed timestamp, so we don't need to worry
   183  		//     about resolved timestamp regressions.
   184  		//
   185  		// - MVCCUpdateIntentOp: it could update one of its intents. If we're
   186  		//     not already tracking the transaction then the queue will ignore
   187  		//     the intent update.
   188  		//
   189  		// - MVCCAbortIntentOp: it could resolve one of its intents as aborted.
   190  		//     This is the most likely case. Again, if we're not already tracking
   191  		//     the transaction then the queue will ignore the intent abort.
   192  		//
   193  		if !rts.IsInit() {
   194  			// We ignore MVCCAbortTxnOp operations until the queue is
   195  			// initialized. This is necessary because we allow txn reference
   196  			// counts to drop below zero before the queue is initialized and
   197  			// expect that all reference count decrements be balanced by a
   198  			// corresponding reference count increment.
   199  			//
   200  			// We could remove this restriction if we evicted all transactions
   201  			// with negative reference counts after initialization, but this is
   202  			// easier and more clear.
   203  			return false
   204  		}
   205  		return rts.intentQ.Del(t.TxnID)
   206  
   207  	default:
   208  		panic(fmt.Sprintf("unknown logical op %T", t))
   209  	}
   210  }
   211  
   212  // recompute computes the resolved timestamp based on its respective closed
   213  // timestamp and the in-flight intents that it is tracking. The method returns
   214  // whether this caused the resolved timestamp to move forward.
   215  func (rts *resolvedTimestamp) recompute() bool {
   216  	if !rts.IsInit() {
   217  		return false
   218  	}
   219  	newTS := rts.closedTS
   220  	if txn := rts.intentQ.Oldest(); txn != nil {
   221  		txnTS := txn.timestamp.FloorPrev()
   222  		if txnTS.Less(newTS) {
   223  			newTS = txnTS
   224  		}
   225  	}
   226  	if newTS.Less(rts.resolvedTS) {
   227  		panic(fmt.Sprintf("resolved timestamp regression, was %s, recomputed as %s",
   228  			rts.resolvedTS, newTS))
   229  	}
   230  	return rts.resolvedTS.Forward(newTS)
   231  }
   232  
   233  // assertNoChange asserts that a recomputation of the resolved timestamp does
   234  // not change its value. A violation of this assertion would indicate a logic
   235  // error in the resolvedTimestamp implementation.
   236  func (rts *resolvedTimestamp) assertNoChange() {
   237  	before := rts.resolvedTS
   238  	changed := rts.recompute()
   239  	if changed || (before != rts.resolvedTS) {
   240  		panic(fmt.Sprintf("unexpected resolved timestamp change on recomputation, "+
   241  			"was %s, recomputed as %s", before, rts.resolvedTS))
   242  	}
   243  }
   244  
   245  // assertOpAboveTimestamp asserts that this operation is at a larger timestamp
   246  // than the current resolved timestamp. A violation of this assertion would
   247  // indicate a failure of the closed timestamp mechanism.
   248  func (rts *resolvedTimestamp) assertOpAboveRTS(op enginepb.MVCCLogicalOp, opTS hlc.Timestamp) {
   249  	if opTS.LessEq(rts.resolvedTS) {
   250  		panic(fmt.Sprintf("resolved timestamp %s equal to or above timestamp of operation %v",
   251  			rts.resolvedTS, op))
   252  	}
   253  }
   254  
// An "unresolved intent" in the context of the rangefeed primitive is an intent
// that may at some point in the future result in a RangeFeedValue publication.
// Based on this definition, there are three possible states that an extant
// intent can be in while fitting the requirement to be an "unresolved intent":
// 1. part of a PENDING transaction
// 2. part of a STAGING transaction that has not been explicitly committed yet
// 3. part of a COMMITTED transaction but not yet resolved due to the asynchronous
//    nature of intent resolution
// Notably, this means that an intent that exists but that is known to be part
// of an ABORTED transaction is not considered "unresolved", even if it has yet
// to be cleaned up. In the context of rangefeeds, the intent's fate is resolved
// to never result in a RangeFeedValue publication.
//
// Defining unresolved intents in this way presents two paths for an unresolved
// intent to become resolved (and thus decrement the unresolvedTxn's ref count).
// An unresolved intent can become resolved if:
// 1. it is COMMITTED or ABORTED through the traditional intent resolution
//    process.
// 2. its transaction is observed to be ABORTED, meaning that it is by
//    definition resolved even if it has yet to be cleaned up by the intent
//    resolution process.
//
// An unresolvedTxn is a transaction that has one or more unresolved intents on
// a given range. The structure itself maintains metadata about the transaction
// along with a reference count of the number of unresolved intents created by
// the transaction on a given range.
type unresolvedTxn struct {
	// txnID identifies the transaction; it is the key under which the
	// unresolvedIntentQueue indexes the transaction.
	txnID uuid.UUID
	// txnKey and txnMinTimestamp are recorded when the transaction is first
	// added to the queue and are surfaced through asTxnMeta.
	txnKey          roachpb.Key
	txnMinTimestamp hlc.Timestamp
	// timestamp is the value the heap orders transactions by. updateTxn only
	// ever forwards it.
	timestamp hlc.Timestamp
	refCount  int // count of unresolved intents

	// The index of the item in the unresolvedTxnHeap, maintained by the
	// heap.Interface methods.
	index int
}
   292  
   293  // asTxnMeta returns a TxnMeta representation of the unresolved transaction.
   294  func (t *unresolvedTxn) asTxnMeta() enginepb.TxnMeta {
   295  	return enginepb.TxnMeta{
   296  		ID:             t.txnID,
   297  		Key:            t.txnKey,
   298  		MinTimestamp:   t.txnMinTimestamp,
   299  		WriteTimestamp: t.timestamp,
   300  	}
   301  }
   302  
// unresolvedTxnHeap implements heap.Interface and holds unresolvedTxns.
// Transactions are prioritized based on their timestamp such that the oldest
// unresolved transaction will rise to the top of the heap. Each element's
// index field is kept in sync with its position in the slice by the Swap,
// Push, and Pop methods.
type unresolvedTxnHeap []*unresolvedTxn
   307  
   308  func (h unresolvedTxnHeap) Len() int { return len(h) }
   309  
   310  func (h unresolvedTxnHeap) Less(i, j int) bool {
   311  	// container/heap constructs a min-heap by default, so prioritize the txn
   312  	// with the smaller timestamp. Break ties by comparing IDs to establish a
   313  	// total order.
   314  	if h[i].timestamp == h[j].timestamp {
   315  		return bytes.Compare(h[i].txnID.GetBytes(), h[j].txnID.GetBytes()) < 0
   316  	}
   317  	return h[i].timestamp.Less(h[j].timestamp)
   318  }
   319  
   320  func (h unresolvedTxnHeap) Swap(i, j int) {
   321  	h[i], h[j] = h[j], h[i]
   322  	h[i].index, h[j].index = i, j
   323  }
   324  
   325  func (h *unresolvedTxnHeap) Push(x interface{}) {
   326  	n := len(*h)
   327  	txn := x.(*unresolvedTxn)
   328  	txn.index = n
   329  	*h = append(*h, txn)
   330  }
   331  
   332  func (h *unresolvedTxnHeap) Pop() interface{} {
   333  	old := *h
   334  	n := len(old)
   335  	txn := old[n-1]
   336  	txn.index = -1 // for safety
   337  	old[n-1] = nil // for gc
   338  	*h = old[0 : n-1]
   339  	return txn
   340  }
   341  
// unresolvedIntentQueue tracks all unresolved intents that exist within the key
// bounds of a range. It does so by tracking every transaction that contains at
// least one unresolved intent on the range. For each of these transactions, the
// queue maintains a count of the number of unresolved intents it contains on
// the range. By doing so and watching for when the count drops to zero, the
// queue can determine when a transaction is no longer unresolved.
//
// The queue maintains an ordering of transactions by timestamp. This allows it
// to determine the oldest unresolved intent that it's tracking and by extension
// the earliest possible time that a RangeFeedValue can be emitted at. Combined
// with a closed timestamp, which guarantees that no transactions can write new
// intents at or beneath it, a resolved timestamp can be constructed.
type unresolvedIntentQueue struct {
	// txns indexes every tracked transaction by its ID.
	txns map[uuid.UUID]*unresolvedTxn
	// minHeap holds the same transactions as txns, ordered so that the one
	// with the smallest timestamp is at index 0.
	minHeap unresolvedTxnHeap
	// allowNegRefCount controls whether a transaction's reference count may
	// drop below zero. It starts out true and is changed through
	// AllowNegRefCount.
	allowNegRefCount bool
}
   359  
   360  func makeUnresolvedIntentQueue() unresolvedIntentQueue {
   361  	return unresolvedIntentQueue{
   362  		txns:             make(map[uuid.UUID]*unresolvedTxn),
   363  		allowNegRefCount: true,
   364  	}
   365  }
   366  
   367  // Len returns the number of transactions being tracked.
   368  func (uiq *unresolvedIntentQueue) Len() int {
   369  	return uiq.minHeap.Len()
   370  }
   371  
   372  // Oldest returns the oldest transaction that is being tracked in the
   373  // unresolvedIntentQueue, or nil if the queue is empty. If two transactions have
   374  // the same timestamp, breaks the tie by returning the transaction with the ID
   375  // that sorts first.
   376  func (uiq *unresolvedIntentQueue) Oldest() *unresolvedTxn {
   377  	if uiq.Len() == 0 {
   378  		return nil
   379  	}
   380  	return uiq.minHeap[0]
   381  }
   382  
   383  // Before returns all transactions that have timestamps before a certain
   384  // timestamp. It does so in O(n) time, where n is the number of matching
   385  // transactions, NOT the total number of transactions being tracked. The
   386  // resulting transactions will not be in sorted order.
   387  func (uiq *unresolvedIntentQueue) Before(ts hlc.Timestamp) []*unresolvedTxn {
   388  	var txns []*unresolvedTxn
   389  	var collect func(int)
   390  	collect = func(i int) {
   391  		if len(uiq.minHeap) > i && uiq.minHeap[i].timestamp.Less(ts) {
   392  			txns = append(txns, uiq.minHeap[i])
   393  			collect((2 * i) + 1) // left child
   394  			collect((2 * i) + 2) // right child
   395  		}
   396  	}
   397  	collect(0)
   398  	return txns
   399  }
   400  
   401  // IncRef increments the reference count of the specified transaction. It
   402  // returns whether the update advanced the timestamp of the oldest transaction
   403  // in the queue.
   404  func (uiq *unresolvedIntentQueue) IncRef(
   405  	txnID uuid.UUID, txnKey roachpb.Key, txnMinTS, ts hlc.Timestamp,
   406  ) bool {
   407  	return uiq.updateTxn(txnID, txnKey, txnMinTS, ts, +1)
   408  }
   409  
   410  // DecrRef decrements the reference count of the specified transaction. It
   411  // returns whether the update advanced the timestamp of the oldest transaction
   412  // in the queue.
   413  func (uiq *unresolvedIntentQueue) DecrRef(txnID uuid.UUID, ts hlc.Timestamp) bool {
   414  	return uiq.updateTxn(txnID, nil, hlc.Timestamp{}, ts, -1)
   415  }
   416  
   417  // UpdateTS updates the timestamp of the specified transaction without modifying
   418  // its intent reference count. It returns whether the update advanced the
   419  // timestamp of the oldest transaction in the queue.
   420  func (uiq *unresolvedIntentQueue) UpdateTS(txnID uuid.UUID, ts hlc.Timestamp) bool {
   421  	return uiq.updateTxn(txnID, nil, hlc.Timestamp{}, ts, 0)
   422  }
   423  
   424  func (uiq *unresolvedIntentQueue) updateTxn(
   425  	txnID uuid.UUID, txnKey roachpb.Key, txnMinTS, ts hlc.Timestamp, delta int,
   426  ) bool {
   427  	txn, ok := uiq.txns[txnID]
   428  	if !ok {
   429  		if delta == 0 || (delta < 0 && !uiq.allowNegRefCount) {
   430  			// Unknown txn.
   431  			return false
   432  		}
   433  
   434  		// Add new txn to the queue.
   435  		txn = &unresolvedTxn{
   436  			txnID:           txnID,
   437  			txnKey:          txnKey,
   438  			txnMinTimestamp: txnMinTS,
   439  			timestamp:       ts,
   440  			refCount:        delta,
   441  		}
   442  		uiq.txns[txn.txnID] = txn
   443  		heap.Push(&uiq.minHeap, txn)
   444  
   445  		// Adding a new txn can't advance the queue's earliest timestamp.
   446  		return false
   447  	}
   448  
   449  	// Will changes to the txn advance the queue's earliest timestamp?
   450  	wasMin := txn.index == 0
   451  
   452  	txn.refCount += delta
   453  	if txn.refCount == 0 || (txn.refCount < 0 && !uiq.allowNegRefCount) {
   454  		// Remove txn from the queue.
   455  		// NB: the txn.refCount < 0 case is not exercised by the external
   456  		// interface of this type because currently |delta| <= 1, but it
   457  		// is included for robustness.
   458  		delete(uiq.txns, txn.txnID)
   459  		heap.Remove(&uiq.minHeap, txn.index)
   460  		return wasMin
   461  	}
   462  
   463  	// Forward the txn's timestamp. Need to fix heap if timestamp changes.
   464  	if txn.timestamp.Forward(ts) {
   465  		heap.Fix(&uiq.minHeap, txn.index)
   466  		return wasMin
   467  	}
   468  	return false
   469  }
   470  
   471  // Del removes the transaction from the queue. It returns whether the update had
   472  // an effect on the oldest transaction in the queue.
   473  func (uiq *unresolvedIntentQueue) Del(txnID uuid.UUID) bool {
   474  	// This implementation is logically equivalent to the following, but
   475  	// it avoids underflow conditions:
   476  	//  return uiq.updateTxn(txnID, nil, hlc.Timestamp{}, hlc.Timestamp{}, math.MinInt64)
   477  
   478  	txn, ok := uiq.txns[txnID]
   479  	if !ok {
   480  		// Unknown txn.
   481  		return false
   482  	}
   483  
   484  	// Will deleting the txn advance the queue's earliest timestamp?
   485  	wasMin := txn.index == 0
   486  
   487  	// Remove txn from the queue.
   488  	delete(uiq.txns, txn.txnID)
   489  	heap.Remove(&uiq.minHeap, txn.index)
   490  	return wasMin
   491  }
   492  
   493  // AllowNegRefCount instruts the unresolvedIntentQueue on whether or not to
   494  // allow the reference count on transactions to drop below zero. If disallowed,
   495  // the method also asserts that all unresolved intent refcounts for transactions
   496  // currently in the queue are positive. Assertion takes O(n) time, where n is
   497  // the total number of transactions being tracked in the queue.
   498  func (uiq *unresolvedIntentQueue) AllowNegRefCount(b bool) {
   499  	if !b {
   500  		// Assert that the queue is currently in compliance.
   501  		uiq.assertOnlyPositiveRefCounts()
   502  	}
   503  	uiq.allowNegRefCount = b
   504  }
   505  
   506  func (uiq *unresolvedIntentQueue) assertOnlyPositiveRefCounts() {
   507  	for _, txn := range uiq.txns {
   508  		if txn.refCount <= 0 {
   509  			panic(fmt.Sprintf("negative refcount %d for txn %+v", txn.refCount, txn))
   510  		}
   511  	}
   512  }