github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/concurrency/concurrency_control.go

// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

// Package concurrency provides a concurrency manager structure that
// encapsulates the details of concurrency control and contention handling for
// serializable key-value transactions.
package concurrency

import (
	"context"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/txnwait"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
)

// Manager is a structure that sequences incoming requests and provides
// isolation between requests that intend to perform conflicting operations.
// During sequencing, conflicts are discovered and any found are resolved
// through a combination of passive queuing and active pushing. Once a request
// has been sequenced, it is free to evaluate without concerns of conflicting
// with other in-flight requests due to the isolation provided by the manager.
// This isolation is guaranteed for the lifetime of the request but terminates
// once the request completes.
//
// Transactions require isolation both within requests and across requests.
// The manager accommodates this by allowing transactional requests to acquire
// locks, which outlive the requests themselves. Locks extend the duration of
// the isolation provided over specific keys to the lifetime of the
// lock-holder transaction itself. They are (typically) only released when the
// transaction commits or aborts. Other requests that find these locks while
// being sequenced wait on them to be released in a queue before proceeding.
// Because locks are checked during sequencing, requests are guaranteed access
// to all declared keys after they have been sequenced. In other words, locks
// don't need to be checked again during evaluation.
//
// However, at the time of writing, not all locks are stored directly under
// the manager's control, so not all locks are discoverable during sequencing.
// Specifically, write intents (replicated, exclusive locks) are stored inline
// in the MVCC keyspace, so they are not detectable until request evaluation
// time. To accommodate this form of lock storage, the manager exposes a
// HandleWriterIntentError method, which can be used in conjunction with a
// retry loop around evaluation to integrate external locks with the
// concurrency manager structure. In the future, we intend to pull all locks,
// including those associated with write intents, into the concurrency manager
// directly through a replicated lock table structure.
//
// Fairness is ensured between requests. In general, if any two requests
// conflict then the request that arrived first will be sequenced first. As
// such, sequencing guarantees FIFO semantics. The primary exception to this
// is that a request that is part of a transaction which has already acquired
// a lock does not need to wait on that lock during sequencing, and can
// therefore ignore any queue that has formed on the lock. For other
// exceptions, see the later comment for lockTable.
//
// Internal Components
//
// The concurrency manager is composed of a number of internal
// synchronization, bookkeeping, and queueing structures. Each of these is
// discussed in more detail on their interface definition. The following
// diagram details how the components are tied together:
//
//  +---------------------+---------------------------------------------+
//  | concurrency.Manager |                                             |
//  +---------------------+                                             |
//  |                                                                   |
//  +------------+  acquire  +--------------+  acquire                  |
//    Sequence() |--->--->---| latchManager |<---<---<---<---<---<---+  |
//  +------------+           +--------------+                        |  |
//  |                        / check locks + wait queues             |  |
//  |                       v  if conflict, enter q & drop latches   ^  |
//  |        +---------------------------------------------------+   |  |
//  |        | [ lockTable ]                                      |   |  |
//  |        | [ key1 ]    -------------+-----------------+       |   ^  |
//  |        | [ key2 ]  / lockState:   | lockWaitQueue:  |----<---<---<----+
//  |        | [ key3 ]-{  - lock type  | +-[a]<-[b]<-[c] |       |   |     |
//  |        | [ key4 ]  \ - txn meta   | | (no latches)  |-->-^  |   |     |
//  |        | [ key5 ]    -------------+-|---------------+    |  |   |     |
//  |        | [ ... ]                    v                    |  |   ^     |
//  |        +---------------------------|--------------------+  |   |     if lock found, HandleWriterIntentError()
//  |                                    |                       |   |      - enter lockWaitQueue
//  |                   +- may be remote -+--+                   |   |      - drop latches
//  |                   |                    |                   |   |      - wait for lock update / release
//  |                   v                    v                   ^   |
//  |                   |   +--------------------------+         |   ^
//  |                   |   | txnWaitQueue:            |         |   |
//  |                   |   | (located on txn record's |         |   |
//  |                   v   |  leaseholder replica)    |         |   |
//  |                   |   |--------------------------|         |   ^
//  |                   |   | [txn1] [txn2] [txn3] ... |----<---<---<---<----+
//  |                   |   +--------------------------+         |   |      if txn push failed, HandleTransactionPushError()
//  |                   |                                        |   |       - enter txnWaitQueue
//  |                   |                                        ^   |       - drop latches
//  |                   |                                        |   |       - wait for txn record update
//  |                   |                                        |   |
//  |                   |                                        |   |
//  |                   +--> retain latches --> remain at head of queues ---> evaluate ---> Finish()
//  |                                                                  |
//  +----------+                                                       |
//   Finish()  | ---> exit wait queues ---> drop latches --------------------> respond ...
//  +----------+                                                       |
//  |                                                                  |
//  +-------------------------------------------------------------------+
//
// See the comments on individual components for a more detailed look at their
// interface and inner-workings.
//
// At a high-level, a request enters the concurrency manager and immediately
// acquires latches from the latchManager to serialize access to the keys that
// it intends to touch. This latching takes into account the keys being
// accessed, the MVCC timestamp of accesses, and the access method being used
// (read vs. write) to allow for concurrency where possible. This has the
// effect of queuing on conflicting in-flight operations until their
// completion.
//
// Once latched, the request consults the lockTable to check for any
// conflicting locks owned by other transactions. If any are found, the
// request enters the corresponding lockWaitQueue and its latches are dropped.
// Requests in the queue wait for the corresponding lock to be released by
// intent resolution. While waiting, the head of the lockWaitQueue pushes the
// owner of the lock through a remote RPC that ends up in the pushee's
// txnWaitQueue. This queue exists on the leaseholder replica of the range
// that contains the pushee's transaction record. Other entries in the queue
// wait for the head of the queue, eventually pushing it to detect coordinator
// failures and transaction deadlocks. Once the lock is released, the head of
// the queue reacquires latches and attempts to proceed while remaining at the
// head of that lockWaitQueue to ensure fairness.
//
// Once a request is latched and observes no conflicting locks in the
// lockTable and no conflicting lockWaitQueues that it is not already the head
// of, the request can proceed to evaluate. During evaluation, the request may
// insert or remove locks from the lockTable for its own transaction.
//
// When the request completes, it exits any lockWaitQueues that it was a part
// of and releases its latches. However, if the request was successful, any
// locks that it inserted into the lockTable remain.
type Manager interface {
	RequestSequencer
	ContentionHandler
	LockManager
	TransactionManager
	RangeStateListener
	MetricExporter
}

// RequestSequencer is concerned with the sequencing of concurrent requests.
// It is one of the roles of Manager.
type RequestSequencer interface {
	// SequenceReq acquires latches, checks for locks, and queues behind
	// and/or pushes other transactions to resolve any conflicts. Once
	// sequenced, the request is guaranteed sufficient isolation for the
	// duration of its evaluation, until the returned request guard is
	// released.
	// NOTE: this last part will not be true until replicated locks are pulled
	// into the concurrency manager.
	//
	// An optional existing request guard can be provided to SequenceReq. This
	// allows the request's position in lock wait-queues to be retained across
	// sequencing attempts. If provided, the guard should not be holding
	// latches already. The expected usage of this parameter is that it will
	// only be provided after acquiring a Guard from a ContentionHandler
	// method.
	//
	// If the method returns a non-nil request guard then the caller must
	// ensure that the guard is eventually released by passing it to
	// FinishReq.
	//
	// Alternatively, the concurrency manager may be able to serve the request
	// directly, in which case it will return a Response for the request. If
	// it does so, it will not return a request guard.
	SequenceReq(context.Context, *Guard, Request) (*Guard, Response, *Error)

	// FinishReq marks the request as complete, releasing any protection
	// the request had against conflicting requests and allowing conflicting
	// requests that are blocked on this one to proceed. The guard should not
	// be used after being released.
	FinishReq(*Guard)
}
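
// The following is an illustrative sketch and not part of the package API. It
// shows how a caller might drive a Manager through the SequenceReq/FinishReq
// lifecycle, retrying sequencing when evaluation discovers a replicated write
// intent that the lockTable was not tracking. The exampleSequenceAndEvaluate
// name and the evaluate callback are hypothetical; the real evaluation loop
// lives in the Replica code that embeds this Manager.
func exampleSequenceAndEvaluate(
	ctx context.Context,
	m Manager,
	req Request,
	evaluate func(*Guard) (Response, *roachpb.WriteIntentError, *Error),
) (Response, *Error) {
	var g *Guard
	defer func() {
		if g != nil {
			// Exit any lock wait-queues and drop latches.
			m.FinishReq(g)
		}
	}()
	for {
		// Acquire latches and wait on any conflicting locks. A nil guard is
		// passed on the first attempt; later attempts reuse the guard so the
		// request keeps its position in lock wait-queues.
		var resp Response
		var err *Error
		g, resp, err = m.SequenceReq(ctx, g, req)
		if err != nil || resp != nil {
			// The request failed, or it was served directly during sequencing
			// (in which case no guard was returned).
			return resp, err
		}
		// Evaluate while holding latches.
		resp, wiErr, err := evaluate(g)
		if err != nil {
			return nil, err
		}
		if wiErr == nil {
			return resp, nil
		}
		// Evaluation discovered a replicated write intent. Inform the manager,
		// drop latches, and retry sequencing so the request waits in the
		// lock's wait-queue.
		g, err = m.HandleWriterIntentError(ctx, g, wiErr)
		if err != nil {
			// Per the ContentionHandler contract, the guard was released and
			// no guard was returned.
			return nil, err
		}
	}
}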

// ContentionHandler is concerned with handling contention-related errors.
// This typically involves preparing the request to be queued upon a retry. It
// is one of the roles of Manager.
type ContentionHandler interface {
	// HandleWriterIntentError consumes a WriteIntentError by informing the
	// concurrency manager about the replicated write intent that was missing
	// from its lock table and was found during request evaluation (while
	// holding latches). After doing so, it enqueues the request that hit the
	// error in the lock's wait-queue (but does not wait) and releases the
	// guard's latches. It returns an updated guard reflecting this change.
	// After the method returns, the original guard should no longer be used.
	// If an error is returned then the provided guard will be released and no
	// guard will be returned.
	//
	// Example usage: Txn A scans the lock table and does not see an intent on
	// key K from txn B because the intent is not being tracked in the lock
	// table. Txn A moves on to evaluation. While scanning, it notices the
	// intent on key K. It throws a WriteIntentError which is consumed by this
	// method before txn A retries its scan. During the retry, txn A scans the
	// lock table and observes the lock on key K, so it enters the lock's
	// wait-queue and waits for it to be resolved.
	HandleWriterIntentError(context.Context, *Guard, *roachpb.WriteIntentError) (*Guard, *Error)

	// HandleTransactionPushError consumes a TransactionPushError thrown by a
	// PushTxnRequest by informing the concurrency manager about a transaction
	// record that could not be pushed during request evaluation (while
	// holding latches). After doing so, it releases the guard's latches. It
	// returns an updated guard reflecting this change. After the method
	// returns, the original guard should no longer be used.
	//
	// Example usage: Txn A sends a PushTxn request to push abort txn B. When
	// the request is originally sequenced through the concurrency manager, it
	// checks the txn wait-queue and finds that txn B is not being tracked, so
	// it does not queue up behind it. Txn A moves on to evaluation and tries
	// to push txn B's record. This push fails because txn B is not expired,
	// which results in a TransactionPushError. This error is consumed by this
	// method before txn A retries its push. During the retry, txn A finds
	// that txn B is being tracked in the txn wait-queue so it waits there for
	// txn B to finish.
	HandleTransactionPushError(context.Context, *Guard, *roachpb.TransactionPushError) *Guard
}

// LockManager is concerned with tracking locks that are stored on the
// manager's range. It is one of the roles of Manager.
type LockManager interface {
	// OnLockAcquired informs the concurrency manager that a transaction has
	// acquired a new lock or re-acquired an existing lock that it already
	// held.
	OnLockAcquired(context.Context, *roachpb.LockAcquisition)

	// OnLockUpdated informs the concurrency manager that a transaction has
	// updated or released a lock or range of locks that it previously held.
	// The Durability field of the lock update struct is ignored.
	OnLockUpdated(context.Context, *roachpb.LockUpdate)
}
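
// Illustrative sketch (not part of the package API): how a replica might feed
// lock state changes back into the LockManager after a request evaluates or
// after intent resolution is applied. The exampleOnLocksChanged name and the
// acquisitions/updates parameters are hypothetical stand-ins for the lock
// acquisitions and lock updates that evaluation produces.
func exampleOnLocksChanged(
	ctx context.Context,
	lm LockManager,
	acquisitions []roachpb.LockAcquisition,
	updates []roachpb.LockUpdate,
) {
	// Newly acquired (or re-acquired) locks are added to the manager so that
	// later requests can discover them during sequencing.
	for i := range acquisitions {
		lm.OnLockAcquired(ctx, &acquisitions[i])
	}
	// Updated or released locks (e.g. from intent resolution) adjust or remove
	// existing entries and allow any waiters to proceed.
	for i := range updates {
		lm.OnLockUpdated(ctx, &updates[i])
	}
}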

// TransactionManager is concerned with tracking transactions that have their
// record stored on the manager's range. It is one of the roles of Manager.
type TransactionManager interface {
	// OnTransactionUpdated informs the concurrency manager that a
	// transaction's status was updated.
	OnTransactionUpdated(context.Context, *roachpb.Transaction)

	// GetDependents returns a set of transactions waiting on the specified
	// transaction either directly or indirectly. The method is used to
	// perform deadlock detection. See txnWaitQueue for more.
	GetDependents(uuid.UUID) []uuid.UUID
}

// RangeStateListener is concerned with observing updates to the concurrency
// manager's range. It is one of the roles of Manager.
type RangeStateListener interface {
	// OnRangeDescUpdated informs the manager that its range's descriptor has
	// been updated.
	OnRangeDescUpdated(*roachpb.RangeDescriptor)

	// OnRangeLeaseUpdated informs the concurrency manager that its range's
	// lease has been updated. The argument indicates whether this manager's
	// replica is the leaseholder going forward.
	OnRangeLeaseUpdated(isLeaseholder bool)

	// OnRangeSplit informs the concurrency manager that its range has split
	// off a new range to its RHS.
	OnRangeSplit()

	// OnRangeMerge informs the concurrency manager that its range has merged
	// into its LHS neighbor. This is not called on the LHS range being merged
	// into.
	OnRangeMerge()

	// OnReplicaSnapshotApplied informs the concurrency manager that its
	// replica has received a snapshot from another replica in its range.
	OnReplicaSnapshotApplied()
}

// MetricExporter is concerned with providing observability into the state of
// the concurrency manager. It is one of the roles of Manager.
type MetricExporter interface {
	// LatchMetrics returns information about the state of the latchManager.
	LatchMetrics() (global, local kvserverpb.LatchManagerInfo)

	// LockTableDebug returns a debug string representing the state of the
	// lockTable.
	LockTableDebug() string

	// TxnWaitQueue returns the concurrency manager's txnWaitQueue.
	// TODO(nvanbenschoten): this doesn't really fit into this interface. It
	// would be nice if the txnWaitQueue was hidden behind the concurrency
	// manager abstraction entirely, but tests want to access it directly.
	TxnWaitQueue() *txnwait.Queue

	// TODO(nvanbenschoten): fill out this interface to provide observability
	// into the state of the concurrency manager.
	// LatchMetrics()
	// LockTableMetrics()
	// TxnWaitQueueMetrics()
}
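
// Illustrative sketch (not part of the package API): how a replica might relay
// range lifecycle events to the concurrency manager through its
// RangeStateListener role. The exampleOnLeaseChanged name is hypothetical.
func exampleOnLeaseChanged(
	m RangeStateListener, desc *roachpb.RangeDescriptor, isLeaseholder bool,
) {
	// Keep the manager's view of the range descriptor up to date.
	m.OnRangeDescUpdated(desc)
	// Tell the manager whether this replica holds the lease going forward.
	// When the replica is not the leaseholder, the manager can stop tracking
	// locks and waiting pushers, since only the leaseholder's state is
	// authoritative (see the lockTable "disabled" state mentioned below).
	m.OnRangeLeaseUpdated(isLeaseholder)
}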

///////////////////////////////////
// External API Type Definitions //
///////////////////////////////////

// Request is the input to Manager.SequenceReq. The struct contains all of the
// information necessary to sequence a KV request and determine which locks
// and other in-flight requests it conflicts with.
type Request struct {
	// The (optional) transaction that sent the request.
	// Non-transactional requests do not acquire locks.
	Txn *roachpb.Transaction

	// The timestamp that the request should evaluate at.
	// Should be set to Txn.ReadTimestamp if Txn is non-nil.
	Timestamp hlc.Timestamp

	// The priority of the request. Only set if Txn is nil.
	Priority roachpb.UserPriority

	// The consistency level of the request. Only set if Txn is nil.
	ReadConsistency roachpb.ReadConsistencyType

	// The individual requests in the batch.
	Requests []roachpb.RequestUnion

	// The maximal set of spans that the request will access. Latches
	// will be acquired for these spans.
	// TODO(nvanbenschoten): don't allocate these SpanSet objects.
	LatchSpans *spanset.SpanSet

	// The maximal set of spans within which the request expects to have
	// isolation from conflicting transactions. Conflicting locks within
	// these spans will be queued on and conditionally pushed.
	//
	// Note that unlike LatchSpans, the timestamps that these spans are
	// declared at are NOT consulted. All read spans are considered to take
	// place at the transaction's read timestamp (Txn.ReadTimestamp) and all
	// write spans are considered to take place at the transaction's write
	// timestamp (Txn.WriteTimestamp). If the request is non-transactional
	// (Txn == nil), all reads and writes are considered to take place at
	// Timestamp.
	LockSpans *spanset.SpanSet
}

// Guard is returned from Manager.SequenceReq. The guard is passed back in to
// Manager.FinishReq to release the request's resources when it has completed.
type Guard struct {
	Req Request
	lg  latchGuard
	ltg lockTableGuard
}

// Response is a slice of responses to requests in a batch. This type is used
// when the concurrency manager is able to respond to a request directly
// during sequencing.
type Response = []roachpb.ResponseUnion

// Error is an alias for a roachpb.Error.
type Error = roachpb.Error

///////////////////////////////////
// Internal Structure Interfaces //
///////////////////////////////////

// latchManager serializes access to keys and key ranges.
//
// See additional documentation in pkg/storage/spanlatch.
type latchManager interface {
	// Acquires latches, providing mutual exclusion for conflicting requests.
	Acquire(context.Context, Request) (latchGuard, *Error)

	// Releases latches, relinquishing their protection from conflicting
	// requests.
	Release(latchGuard)

	// Info returns information about the state of the latchManager.
	Info() (global, local kvserverpb.LatchManagerInfo)
}

// latchGuard is a handle to a set of acquired key latches.
type latchGuard interface{}
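
// Illustrative sketch (not part of the package API): the basic latching
// pattern applied around a unit of work. The exampleWithLatches name and the
// fn callback are hypothetical.
func exampleWithLatches(ctx context.Context, lm latchManager, req Request, fn func()) *Error {
	// Block until all conflicting, earlier-sequenced requests release their
	// latches, then hold latches over the request's declared spans.
	lg, err := lm.Acquire(ctx, req)
	if err != nil {
		return err
	}
	// Release the latches when done so that queued conflicting requests can
	// proceed.
	defer lm.Release(lg)
	fn()
	return nil
}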

// lockTable holds a collection of locks acquired by in-progress transactions.
// Each lock in the table has a possibly-empty lock wait-queue associated with
// it, where conflicting transactions can queue while waiting for the lock to
// be released.
//
//  +---------------------------------------------------+
//  | [ lockTable ]                                      |
//  | [ key1 ]    -------------+-----------------+       |
//  | [ key2 ]  / lockState:   | lockWaitQueue:  |       |
//  | [ key3 ]-{  - lock type  | <-[a]<-[b]<-[c] |       |
//  | [ key4 ]  \ - txn meta   |                 |       |
//  | [ key5 ]    -------------+-----------------+       |
//  | [ ... ]                                            |
//  +---------------------------------------------------+
//
// The database is read and written using "requests". Transactions are
// composed of one or more requests. Isolation is needed across requests.
// Additionally, since transactions represent a group of requests, isolation
// is needed across such groups. Part of this isolation is accomplished by
// maintaining multiple versions and part by allowing requests to acquire
// locks. Even the isolation based on multiple versions requires some form of
// mutual exclusion to ensure that a read and a conflicting lock acquisition
// do not happen concurrently. The lock table provides both locking and
// sequencing of requests (in concert with the use of latches). The lock table
// sequences both transactional and non-transactional requests, but the latter
// cannot acquire locks.
//
// Locks outlive the requests themselves and thereby extend the duration of
// the isolation provided over specific keys to the lifetime of the
// lock-holder transaction itself. They are (typically) only released when the
// transaction commits or aborts. Other requests that find these locks while
// being sequenced wait on them to be released in a queue before proceeding.
// Because locks are checked during sequencing, requests are guaranteed access
// to all declared keys after they have been sequenced. In other words, locks
// don't need to be checked again during evaluation.
//
// However, at the time of writing, not all locks are stored directly under
// lock table control, so not all locks are discoverable during sequencing.
// Specifically, write intents (replicated, exclusive locks) are stored inline
// in the MVCC keyspace, so they are often not detectable until request
// evaluation time. To accommodate this form of lock storage, the lock table
// exposes an AddDiscoveredLock method. In the future, we intend to pull all
// locks, including those associated with write intents, into the lock table
// directly.
//
// The lock table also provides fairness between requests. If two requests
// conflict then the request that arrived first will typically be sequenced
// first. There are some exceptions:
//
// - a request that is part of a transaction which has already acquired a lock
//   does not need to wait on that lock during sequencing, and can therefore
//   ignore any queue that has formed on the lock.
//
// - contending requests that encounter different levels of contention may be
//   sequenced in non-FIFO order. This is to allow for more concurrency. e.g.
//   if request R1 and R2 contend on key K2, but R1 is also waiting at key K1,
//   R2 could slip past R1 and evaluate.
//
type lockTable interface {
	requestQueuer

	// ScanAndEnqueue scans over the spans that the request will access and
	// enqueues the request in the lock wait-queue of any conflicting locks
	// encountered.
	//
	// The first call to ScanAndEnqueue for a given request uses a nil
	// lockTableGuard and the subsequent calls reuse the previously returned
	// one. The latches needed by the request must be held when calling this
	// function.
	ScanAndEnqueue(Request, lockTableGuard) lockTableGuard

	// Dequeue removes the request from its lock wait-queues. It should be
	// called when the request is finished, whether it evaluated or not. The
	// guard should not be used after being dequeued.
	//
	// This method does not release any locks. This method must be called on
	// the last guard returned from ScanAndEnqueue for the request, even if
	// one of the (a) lockTable calls that use a lockTableGuard parameter, or
	// (b) a lockTableGuard call, returned an error. The method allows but
	// does not require latches to be held.
	Dequeue(lockTableGuard)

	// AddDiscoveredLock informs the lockTable of a lock that was discovered
	// during evaluation which the lockTable wasn't previously tracking.
	//
	// The method is called when an exclusive replicated lock held by a
	// different transaction is discovered when reading the MVCC keys during
	// evaluation of this request. It adds the lock and enqueues this
	// requester in its wait-queue. It is required that request evaluation
	// discover such locks before acquiring its own locks, since the request
	// needs to repeat ScanAndEnqueue.
	//
	// A latch consistent with the access desired by the guard must be held on
	// the span containing the discovered lock's key.
	//
	// The method returns a boolean indicating whether the discovered lock was
	// added to the lockTable (true) or whether it was ignored because the
	// lockTable is currently disabled (false).
	AddDiscoveredLock(*roachpb.Intent, lockTableGuard) (bool, error)

	// AcquireLock informs the lockTable that a new lock was acquired or an
	// existing lock was updated.
	//
	// The provided TxnMeta must be the same one used when the request scanned
	// the lockTable initially. It must only be called in the evaluation phase
	// before calling Dequeue, which means all the latches needed by the
	// request are held. The key must be in the request's SpanSet with the
	// appropriate SpanAccess: currently the strength is always Exclusive, so
	// the span containing this key must be SpanReadWrite. This contract
	// ensures that the lock is not held in a conflicting manner by a
	// different transaction. Acquiring a lock that is already held by this
	// transaction upgrades the lock's timestamp and strength, if necessary.
	//
	// For replicated locks, this must be called after the corresponding write
	// intent has been applied to the replicated state machine.
	AcquireLock(*enginepb.TxnMeta, roachpb.Key, lock.Strength, lock.Durability) error

	// UpdateLocks informs the lockTable that an existing lock or range of
	// locks was either updated or released.
	//
	// The method is called during intent resolution. For spans containing
	// Replicated locks, this must be called after intent resolution has been
	// applied to the replicated state machine. The method itself, however,
	// ignores the Durability field in the LockUpdate. It can therefore be
	// used to update locks for a given transaction for all durability levels.
	//
	// A latch with SpanReadWrite must be held on the span with the lowest
	// timestamp at which any of the locks could be held. This is explained
	// below.
	//
	// Note that spans can be wider than the actual keys on which locks were
	// acquired, and it is ok if no locks are found or locks held by other
	// transactions are found (for those locks this call is a noop).
	//
	// For COMMITTED or ABORTED transactions, all locks are released.
	//
	// For PENDING or STAGING transactions, the behavior is:
	//
	// - All replicated locks known to the lockTable are dropped. This is not
	//   because those intents are necessarily deleted, but because in the
	//   current code where intents are not managed by the lockTable (this
	//   will change when we have a segregated lock table), we do not want to
	//   risk code divergence between lockTable and mvccResolveWriteIntent:
	//   the danger is that the latter removes or changes an intent while the
	//   lockTable retains it, and a waiter is stuck forever.
	//
	//   Note that even the conservative behavior of dropping locks requires
	//   that intent resolution acquire latches using the oldest timestamp at
	//   which the intent could have been written: if the intent was at ts=5
	//   and the intent resolution is using ts=10 (since the transaction has
	//   been pushed), there is a race where a reader at ts=8 can be
	//   concurrently holding latches and the following bad sequence occurs
	//   (both thread1 and thread2 are concurrent since their latches do not
	//   conflict):
	//
	//   - [thread1-txn1] reader sees intent at ts=5
	//   - [thread2-txn2] intent resolution changes that intent to ts=10
	//   - [thread2-txn2] updateLocks is called and lock is removed since it
	//     is a replicated lock.
	//   - [thread1-txn1] reader calls addDiscoveredLock() for ts=5.
	//
	//   Now the lockTable thinks there is a lock and subsequent pushes of
	//   txn2 by txn1 will do nothing since txn2 is already at timestamp 10.
	//   Txn1 will unnecessarily block until txn2 is done.
	//
	// - Unreplicated locks:
	//   - for epochs older than txn.Epoch, locks are dropped.
	//   - locks in the current epoch that are at a TxnMeta.Sequence
	//     contained in IgnoredSeqNums are dropped.
	//   - the remaining locks are changed to timestamp equal to
	//     txn.WriteTimestamp.
	UpdateLocks(*roachpb.LockUpdate) error

	// String returns a debug string representing the state of the lockTable.
	String() string
}

// lockTableGuard is a handle to a request as it waits on conflicting locks in
// a lockTable or as it holds a place in lock wait-queues as it evaluates.
type lockTableGuard interface {
	// ShouldWait must be called after each ScanAndEnqueue. The request should
	// proceed to evaluation if it returns false, else it releases latches and
	// listens to the channel returned by NewStateChan.
	ShouldWait() bool

	// NewStateChan returns the channel to listen on for notification that the
	// state may have changed. If ShouldWait returns true, this channel will
	// have an initial notification. Note that notifications are collapsed if
	// not retrieved, since it is not necessary for the waiter to see every
	// state transition.
	NewStateChan() chan struct{}

	// CurState returns the latest waiting state.
	CurState() waitingState
}
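
// Illustrative sketch (not part of the package API): the scan-and-wait
// protocol that sequencing follows against the lockTable. Latch handling is
// elided; see the Manager documentation above for how latches are dropped
// before waiting and re-acquired afterwards. The exampleScanUntilClear name is
// hypothetical, and the lockTableWaiter it uses is described next.
func exampleScanUntilClear(
	ctx context.Context, lt lockTable, ltw lockTableWaiter, req Request,
) (lockTableGuard, *Error) {
	var ltg lockTableGuard
	for {
		// Scan the request's lock spans and enqueue behind any conflicting
		// locks. The guard from the previous attempt is reused so the request
		// keeps its place in lock wait-queues.
		ltg = lt.ScanAndEnqueue(req, ltg)
		if !ltg.ShouldWait() {
			// No conflicts remain in the request's way; it is free to
			// evaluate. The caller must eventually call lt.Dequeue(ltg).
			return ltg, nil
		}
		// Conflicts were found: wait (with latches dropped) until the
		// conflicting locks are released or updated, pushing their holders if
		// they appear to be abandoned or deadlocked.
		if err := ltw.WaitOn(ctx, req, ltg); err != nil {
			lt.Dequeue(ltg)
			return nil, err
		}
	}
}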

// lockTableWaiter is concerned with waiting in lock wait-queues for locks
// held by conflicting transactions. It ensures that waiting requests continue
// to make forward progress even in the presence of faulty transaction
// coordinators and transaction deadlocks.
//
// The waiter implements logic for a request to wait on conflicting locks in
// the lockTable until they are released. Similarly, it implements logic to
// wait on conflicting requests ahead of the caller's request in any lock
// wait-queues that it is a part of.
//
// This waiting state responds to a set of state transitions in the lock
// table:
// - a conflicting lock is released
// - a conflicting lock is updated such that it no longer conflicts
// - a conflicting request in the lock wait-queue acquires the lock
// - a conflicting request in the lock wait-queue exits the lock wait-queue
//
// These state transitions are typically reactive - the waiter can simply wait
// for locks to be released or lock wait-queues to be exited by other actors.
// Reacting to state transitions for conflicting locks is powered by the
// LockManager and reacting to state transitions for conflicting lock
// wait-queues is powered by the RequestSequencer interface.
//
// However, in the case of transaction coordinator failures or transaction
// deadlocks, a state transition may never occur without intervention from the
// waiter. To ensure forward-progress, the waiter may need to actively push
// either a lock holder of a conflicting lock or the head of a conflicting
// lock wait-queue. This active pushing requires an RPC to the leaseholder of
// the conflicting transaction's record, and will typically result in the RPC
// queuing in that leaseholder's txnWaitQueue. Because this can be expensive,
// the push is not immediately performed. Instead, it is only performed after
// a delay.
type lockTableWaiter interface {
	// WaitOn accepts and waits on a lockTableGuard that has returned true
	// from ShouldWait.
	//
	// The method should be called after dropping any latches that a request
	// has acquired. It returns when the request is at the front of all lock
	// wait-queues and it is safe to re-acquire latches and scan the lockTable
	// again.
	WaitOn(context.Context, Request, lockTableGuard) *Error

	// WaitOnLock waits on the transaction responsible for the specified lock
	// and then ensures that the lock is cleared out of the request's way.
	//
	// The method should be called after dropping any latches that a request
	// has acquired. It returns when the lock has been resolved.
	//
	// NOTE: this method is used when the lockTable is disabled (e.g. on a
	// follower replica) and a lock is discovered that must be waited on (e.g.
	// during a follower read). If/when lockTables are maintained on follower
	// replicas by propagating lockTable state transitions through the Raft
	// log in the ReplicatedEvalResult instead of through the
	// (leaseholder-only) LocalResult, we should be able to remove the
	// lockTable "disabled" state and, in turn, remove this method. This will
	// likely fall out of pulling all replicated locks into the lockTable.
	WaitOnLock(context.Context, Request, *roachpb.Intent) *Error

	// ClearCaches wipes all caches maintained by the lockTableWaiter. This is
	// primarily used to recover memory when a replica loses a lease. However,
	// it is also used in tests to reset the state of the lockTableWaiter.
	ClearCaches()
}

// txnWaitQueue holds a collection of wait-queues for transaction records.
// Conflicting transactions, known as "pushers", sit in a queue associated
// with an extant transaction that they conflict with, known as the "pushee",
// and wait for the pushee transaction to commit or abort.
//
// Typically, waiting for a pushee's transaction record to undergo a state
// transition is sufficient to satisfy a pusher transaction. Reacting to state
// transitions for conflicting transactions is powered by the
// TransactionManager interface.
//
// Just like with the lockTableWaiter, there are cases where reacting to state
// transitions alone is insufficient to make forward progress.
// However, unlike with the lockTableWaiter, the fact that the txnWaitQueue is
// located on the range containing the conflicting transaction's record,
// instead of on the range containing the conflicting transaction's lock,
// presents an opportunity to actively resolve these situations. This is
// because a transaction's record reflects its authoritative status.
//
// The first of these situations is failure of the conflicting transaction's
// coordinator. This situation comes in two flavors:
// - before a transaction has been finalized (committed or aborted)
// - after a transaction has been finalized but before all of its intents have
//   been resolved
//
// In the first of these flavors, the transaction record may still have a
// PENDING status. Without a live transaction coordinator heartbeating it, the
// record will eventually expire and be abortable. In the second of these
// flavors, the transaction's record will already be committed or aborted.
// Regardless of which case the push falls into, once the transaction record
// is observed in a finalized state, the push will succeed, kick off intent
// resolution, and return to the sender.
//
// The second of these situations is transaction deadlock. Deadlocks occur
// when the lock acquisition patterns of two or more transactions interact in
// such a way that a cycle emerges in the "waits-for" graph of transactions.
// To break this cycle, one of the transactions must be aborted or it is
// impossible for any of the transactions that are part of the deadlock to
// continue making progress.
//
// The txnWaitQueue provides a mechanism for detecting these cycles across a
// distributed graph of transactions. Distributed deadlock detection works by
// having each pusher transaction that is waiting in the queue for a different
// transaction periodically query its own record using a QueryTxn request.
// While on the pusher's own transaction record range, the QueryTxn request
// uses the GetDependents method to collect the IDs of all locally-known
// transactions that are waiting for the pusher itself to release its locks.
// Of course, this local view of the dependency graph is incomplete, as it
// does not initially take into consideration transitive dependencies. To
// address this, when the QueryTxn returns to the initial txnWaitQueue, the
// pusher records its own dependencies as dependencies of its pushee
// transaction. As this process continues and pushers periodically query for
// their own dependencies and transfer these to their pushee, each
// txnWaitQueue accumulates more information about the global "waits-for"
// graph. Eventually, one of the txnWaitQueues is able to observe a full cycle
// in this graph and aborts one of the transactions in the cycle to break the
// deadlock.
//
// Example of Distributed Deadlock Detection
//
// The following diagram demonstrates how the txnWaitQueue interacts with
// distributed deadlock detection.
//
// - txnA enters txnB's txnWaitQueue during a PushTxn request (MaybeWaitForPush)
// - txnB enters txnC's txnWaitQueue during a PushTxn request (MaybeWaitForPush)
// - txnC enters txnA's txnWaitQueue during a PushTxn request (MaybeWaitForPush)
//
//      .-----------------------------------.
//      |                                   |
//      v                                   |
//  [txnA record] --> [txnB record] --> [txnC record]
//     deps:             deps:             deps:
//     - txnC            - txnA            - txnB
//
// - txnA queries its own txnWaitQueue using a QueryTxn request (MaybeWaitForQuery)
//
//      .-----------------------------------.
//      |     ............                  |
//      v     v          .                  |
//  [txnA record] --> [txnB record] --> [txnC record]
//     deps:             deps:             deps:
//     - txnC            - txnA            - txnB
//
// - txnA finds that txnC is a dependent. It transfers this dependency to txnB
//
//      .-----------------------------------.
//      |                                   |
//      v                                   |
//  [txnA record] --> [txnB record] --> [txnC record]
//     deps:             deps:             deps:
//     - txnC            - txnA            - txnB
//                       - txnC
//
// - txnC queries its own txnWaitQueue using a QueryTxn request (MaybeWaitForQuery)
// - txnB queries its own txnWaitQueue using a QueryTxn request (MaybeWaitForQuery)
// - txnC finds that txnB is a dependent. It transfers this dependency to txnA
// - txnB finds that txnA and txnC are dependents. It transfers these
//   dependencies to txnC
//
//      .-----------------------------------.
//      |                                   |
//      v                                   |
//  [txnA record] --> [txnB record] --> [txnC record]
//     deps:             deps:             deps:
//     - txnC            - txnA            - txnB
//     - txnB            - txnC            - txnA
//                                         - txnC
//
// - txnB notices that txnC is a transitive dependency of itself. This
//   indicates a cycle in the global wait-for graph. txnC is aborted, breaking
//   the cycle and the deadlock.
//
//  [txnA record] --> [txnB record] --> [txnC record: ABORTED]
//
// - txnC releases its locks and the transactions proceed in order.
//
//  [txnA record] --> [txnB record] --> (free to commit)
//
// TODO(nvanbenschoten): if we exposed a "queue guard" interface, we could
// make stronger guarantees around cleaning up enqueued txns when there are no
// waiters.
type txnWaitQueue interface {
	requestQueuer

	// EnqueueTxn creates a queue associated with the provided transaction.
	// Once a queue is established, pushers of this transaction can wait in
	// the queue and will be informed of state transitions that the
	// transaction undergoes.
	EnqueueTxn(*roachpb.Transaction)

	// UpdateTxn informs the queue that the provided transaction has undergone
	// a state transition. This will be communicated to any waiting pushers.
	UpdateTxn(context.Context, *roachpb.Transaction)

	// GetDependents returns a set of transactions waiting on the specified
	// transaction either directly or indirectly. The method is used to
	// perform deadlock detection.
	GetDependents(uuid.UUID) []uuid.UUID

	// MaybeWaitForPush checks whether there is a queue already established
	// for the transaction being pushed by the provided request. If not, or if
	// the PushTxn request isn't queueable, the method returns immediately. If
	// there is a queue, the method enqueues this request as a waiter and
	// waits for the transaction to be pushed/finalized.
	//
	// If the transaction is successfully pushed while this method is waiting,
	// the first return value is a non-nil PushTxnResponse object.
	MaybeWaitForPush(context.Context, *roachpb.PushTxnRequest) (*roachpb.PushTxnResponse, *Error)

	// MaybeWaitForQuery checks whether there is a queue already established
	// for the transaction being queried. If not, or if the QueryTxn request
	// hasn't specified WaitForUpdate, the method returns immediately.
	// If there is a queue, the method enqueues this request as a waiter and
	// waits for any updates to the target transaction.
	MaybeWaitForQuery(context.Context, *roachpb.QueryTxnRequest) *Error

	// OnRangeDescUpdated informs the Queue that its range's descriptor has
	// been updated.
	OnRangeDescUpdated(*roachpb.RangeDescriptor)
}

// requestQueuer queues requests until some condition is met.
type requestQueuer interface {
	// Enable allows requests to be queued. The method is idempotent.
	Enable()

	// Clear empties the queue(s) and causes all waiting requests to
	// return. If disable is true, future requests must not be enqueued.
	Clear(disable bool)
}
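
// Illustrative sketch (not part of the package API): the two halves of the
// txnWaitQueue protocol. On the range holding the pushee's transaction
// record, the record's state transitions are fed into the queue; pushers call
// MaybeWaitForPush and block there until the pushee is pushed or finalized,
// participating in distributed deadlock detection while they wait. The
// example* names are hypothetical.
func exampleTrackPushee(ctx context.Context, q txnWaitQueue, pushee *roachpb.Transaction) {
	// Establish a queue for the pushee so that pushers can wait on it...
	q.EnqueueTxn(pushee)
	// ...and notify waiting pushers whenever the record changes state
	// (e.g. it commits, aborts, or is pushed to a higher timestamp).
	q.UpdateTxn(ctx, pushee)
}

func exampleWaitForPushee(
	ctx context.Context, q txnWaitQueue, push *roachpb.PushTxnRequest,
) (*roachpb.PushTxnResponse, *Error) {
	// Wait in the pushee's queue. Returns a non-nil response if the pushee
	// was successfully pushed while waiting, and returns immediately with a
	// nil response if the pushee has no queue on this range (or the request
	// is not queueable), in which case the caller evaluates the push against
	// the transaction record itself.
	return q.MaybeWaitForPush(ctx, push)
}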