github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/concurrency/lock_table.go

     1  // Copyright 2020 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package concurrency
    12  
    13  import (
    14  	"container/list"
    15  	"fmt"
    16  	"sort"
    17  	"strings"
    18  	"sync"
    19  	"sync/atomic"
    20  
    21  	"github.com/cockroachdb/cockroach/pkg/keys"
    22  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
    23  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
    24  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    25  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    26  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    27  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    28  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    29  	"github.com/cockroachdb/errors"
    30  )
    31  
    32  // Default upper bound on the number of locks in a lockTable.
    33  const defaultLockTableSize = 10000
    34  
    35  // The kind of waiting that the request is subject to.
    36  type waitKind int
    37  
    38  const (
    39  	_ waitKind = iota
    40  
     41  	// waitFor indicates that the request is waiting on another transaction
    42  	// to release its locks or complete its own request. waitingStates with this
    43  	// waitKind will provide information on who the request is waiting on. The
    44  	// request will likely want to eventually push the conflicting transaction.
    45  	waitFor
    46  
    47  	// waitForDistinguished is a sub-case of waitFor. It implies everything that
    48  	// waitFor does and additionally indicates that the request is currently the
    49  	// "distinguished waiter". A distinguished waiter is responsible for taking
    50  	// extra actions, e.g. immediately pushing the transaction it is waiting
    51  	// for. If there are multiple requests in the waitFor state waiting on the
    52  	// same transaction, at least one will be a distinguished waiter.
    53  	waitForDistinguished
    54  
    55  	// waitElsewhere is used when the lockTable is under memory pressure and is
    56  	// clearing its internal queue state. Like the waitFor* states, it informs
    57  	// the request who it is waiting for so that deadlock detection works.
    58  	// However, sequencing information inside the lockTable is mostly discarded.
    59  	waitElsewhere
    60  
     61  	// waitSelf indicates that a different request from the same transaction
    62  	// has a conflicting reservation. See the comment about "Reservations" in
    63  	// lockState. This request should sit tight and wait for a new notification
    64  	// without pushing anyone.
    65  	waitSelf
    66  
    67  	// doneWaiting indicates that the request is done waiting on this pass
    68  	// through the lockTable and should make another call to ScanAndEnqueue.
    69  	doneWaiting
    70  )
    71  
    72  // The current waiting state of the request.
    73  //
    74  // See the detailed comment about "Waiting logic" on lockTableGuardImpl.
    75  type waitingState struct {
    76  	kind waitKind
    77  
    78  	// Fields below are populated for waitFor* and waitElsewhere kinds.
    79  
    80  	// Represents who the request is waiting for. The conflicting
    81  	// transaction may be a lock holder of a conflicting lock or a
    82  	// conflicting request being sequenced through the same lockTable.
    83  	txn  *enginepb.TxnMeta // always non-nil
    84  	key  roachpb.Key       // the key of the conflict
    85  	held bool              // is the conflict a held lock?
    86  
    87  	// Represents the action that the request was trying to perform when
    88  	// it hit the conflict. E.g. was it trying to read or write?
    89  	guardAccess spanset.SpanAccess
    90  }
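
         // As a rough illustration of how a waiter might interpret these states, the
         // hypothetical helper below (not part of this file) switches on waitKind; the
         // returned strings only summarize the intended reaction:
         //
         //	func describeWaitingState(ws waitingState) string {
         //		switch ws.kind {
         //		case waitFor, waitForDistinguished:
         //			return fmt.Sprintf("wait for txn %s at key %s (held=%t)", ws.txn.ID, ws.key, ws.held)
         //		case waitElsewhere:
         //			return "lockTable under memory pressure: keep waiting for and pushing the conflicting txn"
         //		case waitSelf:
         //			return "another request from this txn holds the reservation: wait without pushing"
         //		case doneWaiting:
         //			return "done waiting: call ScanAndEnqueue again"
         //		default:
         //			return "unknown"
         //		}
         //	}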
    91  
    92  // Implementation
    93  // TODO(sbhola):
    94  // - metrics about lockTable state to export to observability debug pages:
    95  //   number of locks, number of waiting requests, wait time?, ...
    96  // - test cases where guard.readTS != guard.writeTS.
    97  
    98  // The btree for a particular SpanScope.
    99  type treeMu struct {
   100  	mu syncutil.RWMutex // Protects everything in this struct.
   101  
   102  	// For assigning sequence numbers to the lockState objects as required by
   103  	// the util/interval/generic type contract.
   104  	lockIDSeqNum uint64
   105  
   106  	// Container for lockState structs. Locks that are not held or reserved and
   107  	// have no waiting requests are garbage collected. Additionally, locks that
   108  	// are only held with Replicated durability and have no waiting requests may
   109  	// also be garbage collected since their state can be recovered from
   110  	// persistent storage.
   111  	btree
   112  
   113  	// For constraining memory consumption. We need better memory accounting
   114  	// than this.
   115  	numLocks int64
   116  }
   117  
   118  // lockTableImpl is an implementation of lockTable.
   119  //
    120  // Concurrency: in addition to holding latches, we require that, for a
    121  // particular request, ScanAndEnqueue() and CurState() be called by the same
    122  // thread.
   123  //
   124  // Mutex ordering:   lockTableImpl.enabledMu
   125  //                 > treeMu.mu
   126  //                 > lockState.mu
   127  //                 > lockTableGuardImpl.mu
   128  type lockTableImpl struct {
   129  	// Is the lockTable enabled? When enabled, the lockTable tracks locks and
   130  	// allows requests to queue in wait-queues on these locks. When disabled,
   131  	// no locks or wait-queues are maintained.
   132  	//
   133  	// enabledMu is held in read-mode when determining whether the lockTable
   134  	// is enabled and when acting on that information (e.g. adding new locks).
   135  	// It is held in write-mode when enabling or disabling the lockTable.
   136  	enabled   bool
   137  	enabledMu syncutil.RWMutex
   138  
   139  	// A sequence number is assigned to each request seen by the lockTable. This
   140  	// is to preserve fairness despite the design choice of allowing
   141  	// out-of-order evaluation of requests with overlapping spans where the
   142  	// latter request does not encounter contention. This out-of-order
   143  	// evaluation happens because requests do not reserve spans that are
    144  	// uncontended while they wait on contended locks after releasing their
   145  	// latches. Consider the following examples:
   146  	//
   147  	// Example 1:
   148  	// - req1 wants to write to A, B
   149  	// - req2 wants to write to B
   150  	// - lock at A is held by some other txn.
   151  	// - Even though req2 arrives later, req1 will wait only in the queue for A
   152  	//   and allow req2 to proceed to evaluation.
   153  	//
   154  	// Example 2:
   155  	// - Same as example 1 but lock at A is held by txn3 and lock at B is held
   156  	//   by txn4.
   157  	// - Lock at A is released so req1 acquires the reservation at A and starts
   158  	//   waiting at B.
   159  	// - It is unfair for req1 to wait behind req2 at B. The sequence number
   160  	//   assigned to req1 and req2 will restore the fairness by making req1
   161  	//   wait before req2.
   162  	//
   163  	// Example 3: Deadlock in lock table if it did not use sequence numbers.
   164  	// - Lock at B is acquired by txn0.
   165  	// - req1 (from txn1) arrives at lockTable and wants to write to A and B.
   166  	//   It queues at B.
   167  	// - req2 (from txn2) arrives at lockTable and only wants to write A.
   168  	//   It proceeds to evaluation and acquires the lock at A for txn2 and then
   169  	//   the request is done. The lock is still held.
   170  	// - req3 (from txn3) wants to write to A and B. It queues at A.
   171  	// - txn2 releases A. req3 is in the front of the queue at A and gets the
   172  	//   reservation and starts waiting at B behind req1.
   173  	// - txn0 releases B. req1 gets the reservation at B and does another scan
   174  	//   and adds itself to the queue at A, behind req3 which holds the
   175  	//   reservation at A.
   176  	// Now in the queues for A and B req1 is behind req3 and vice versa and
   177  	// this deadlock has been created entirely due to the lock table's behavior.
   178  	seqNum uint64
   179  
   180  	locks [spanset.NumSpanScope]treeMu
   181  
   182  	maxLocks int64
   183  }
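
         // A minimal sketch of the monotonic sequence-number assignment described
         // above. The helper and its use are hypothetical (the real assignment happens
         // when a request first calls ScanAndEnqueue); it relies only on the
         // sync/atomic package imported by this file:
         //
         //	// nextSeqNum returns the next request sequence number. Later requests
         //	// receive larger numbers, which the wait-queues use to restore fairness
         //	// in the examples above.
         //	func nextSeqNum(counter *uint64) uint64 {
         //		return atomic.AddUint64(counter, 1)
         //	}
         //
         //	// e.g. g.seqNum = nextSeqNum(&t.seqNum)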
   184  
   185  var _ lockTable = &lockTableImpl{}
   186  
   187  // lockTableGuardImpl is an implementation of lockTableGuard.
   188  //
   189  // The struct is a guard that is returned to the request the first time it calls
   190  // lockTable.ScanAndEnqueue() and used in later calls to ScanAndEnqueue() and
   191  // done(). After a call to ScanAndEnqueue() (which is made while holding
    192  // latches), the caller must first call lockTableGuard.ShouldWait() and if it
    193  // returns true release the latches and continue interacting with the
    194  // lockTableGuard. If ShouldWait() returns false, the request can proceed to
   195  // evaluation.
   196  //
   197  // Waiting logic: The interface hides the queues that the request is waiting on,
   198  // and the request's position in the queue. One of the reasons for this hiding
   199  // is that queues are not FIFO since a request that did not wait on a queue for
   200  // key k in a preceding call to ScanAndEnqueue() (because k was not locked and
   201  // there was no queue) may need to wait on the queue in a later call to
   202  // ScanAndEnqueue(). So sequencing of requests arriving at the lockTable is
   203  // partially decided by a sequence number assigned to a request when it first
   204  // called ScanAndEnqueue() and queues are ordered by this sequence number.
    205  // However, the sequencing is not fully described by the sequence numbers -- a
    206  // request R1 encountering contention over some keys in its span does not
    207  // prevent a request R2 that has a higher sequence number and overlapping span
    208  // from proceeding if R2 does not encounter contention. This concurrency (that is not
   209  // completely fair) is deemed desirable.
   210  //
    211  // The interface exposes an abstracted version of the waiting logic in which a
    212  // request that starts waiting is considered to be waiting for at most one other
    213  // request or transaction at a time. This is exposed as a series of state
   214  // transitions where the transitions are notified via newState() and the current
   215  // state can be read using CurState().
   216  //
   217  // - The waitFor* states provide information on who the request is waiting for.
   218  //   The waitForDistinguished state is a sub-case -- a distinguished waiter is
   219  //   responsible for taking extra actions e.g. immediately pushing the transaction
   220  //   it is waiting for. The implementation ensures that if there are multiple
   221  //   requests in waitFor state waiting on the same transaction at least one will
   222  //   be a distinguished waiter.
   223  //
   224  //   TODO(sbhola): investigate removing the waitForDistinguished state which
   225  //   will simplify the code here. All waitFor requests would wait (currently
   226  //   50ms) before pushing the transaction (for deadlock detection) they are
   227  //   waiting on, say T. Typically T will be done before 50ms which is considered
   228  //   ok: the one exception we will need to make is if T has the min priority or
   229  //   the waiting transaction has max priority -- in both cases it will push
    230  //   immediately. The bad case is if T is ABORTED: the push will succeed only after the wait,
   231  //   and if T left N intents, each push would wait for 50ms, incurring a latency
   232  //   of 50*N ms. A cache of recently encountered ABORTED transactions on each
    233  //   Store should mitigate this latency increase. Whenever a request sees a
   234  //   waitFor state, it will consult this cache and if T is found, push
   235  //   immediately (if there isn't already a push in-flight) -- even if T is not
   236  //   initially in the cache, the first push will place it in the cache, so the
   237  //   maximum latency increase is 50ms.
   238  //
   239  // - The waitElsewhere state is a rare state that is used when the lockTable is
   240  //   under memory pressure and is clearing its internal queue state. Like the
   241  //   waitFor* states, it informs the request who it is waiting for so that
   242  //   deadlock detection works. However, sequencing information inside the
   243  //   lockTable is mostly discarded.
   244  //
   245  // - The waitSelf state is a rare state when a different request from the same
   246  //   transaction has a reservation. See the comment about "Reservations" in
   247  //   lockState.
   248  //
   249  // - The doneWaiting state is used to indicate that the request should make
   250  //   another call to ScanAndEnqueue() (that next call is more likely to return a
   251  //   lockTableGuard that returns false from StartWaiting()).
   252  type lockTableGuardImpl struct {
   253  	seqNum uint64
   254  
   255  	// Information about this request.
   256  	txn     *enginepb.TxnMeta
   257  	spans   *spanset.SpanSet
   258  	readTS  hlc.Timestamp
   259  	writeTS hlc.Timestamp
   260  
   261  	// Snapshots of the trees for which this request has some spans. Note that
   262  	// the lockStates in these snapshots may have been removed from
   263  	// lockTableImpl. Additionally, it is possible that there is a new lockState
   264  	// for the same key. This can result in various harmless anomalies:
   265  	// - the request may hold a reservation on a lockState that is no longer
   266  	//   in the tree. When it next does a scan, it will either find a new
   267  	//   lockState where it will compete or none. Both lockStates can be in
   268  	//   the mu.locks map, which is harmless.
   269  	// - the request may wait behind a reservation holder that is not the
   270  	//   lock holder. This could cause a delay in pushing the lock holder.
   271  	//   This is not a correctness issue (the whole system is not deadlocked)
    272  	//   and we expect it will not be a real performance issue.
   273  	//
   274  	// TODO(sbhola): experimentally evaluate the lazy queueing of the current
   275  	// implementation, in comparison with eager queueing. If eager queueing
   276  	// is comparable in system throughput, one can eliminate the above anomalies.
   277  	//
   278  	tableSnapshot [spanset.NumSpanScope]btree
   279  
   280  	// A request whose startWait is set to true in ScanAndEnqueue is actively
   281  	// waiting at a particular key. This is the first key encountered when
   282  	// iterating through spans that it needs to wait at. A future event (lock
   283  	// release etc.) may cause the request to no longer need to wait at this
   284  	// key. It then needs to continue iterating through spans to find the next
   285  	// key to wait at (we don't want to wastefully start at the beginning since
   286  	// this request probably has a reservation at the contended keys there): sa,
   287  	// ss, index, key collectively track the current position to allow it to
   288  	// continue iterating.
   289  
   290  	// The key for the lockState.
   291  	key roachpb.Key
   292  	// The key for the lockState is contained in the Span specified by
   293  	// spans[sa][ss][index].
   294  	ss    spanset.SpanScope
   295  	sa    spanset.SpanAccess // Iterates from stronger to weaker strength
   296  	index int
   297  
   298  	mu struct {
   299  		syncutil.Mutex
   300  		startWait bool
   301  
   302  		state  waitingState
   303  		signal chan struct{}
   304  
   305  		// locks for which this request has a reservation or is in the queue of
   306  		// writers (active or inactive) or actively waiting as a reader.
   307  		//
   308  		// TODO(sbhola): investigate whether the logic to maintain this locks map
   309  		// can be simplified so it doesn't need to be adjusted by various
   310  		// lockState methods. It adds additional bookkeeping burden that means it
   311  		// is more prone to inconsistencies. There are two main uses: (a) removing
   312  		// from various lockStates when done() is called, (b) tryActiveWait() uses
   313  		// it as an optimization to know that this request is not known to the
   314  		// lockState. (b) can be handled by other means -- the first scan the
    315  		// lockState. (b) can be handled by other means -- on the first scan the
    316  		// request won't be in the lockState and on the second scan it likely will.
    317  		// (a) doesn't necessarily require this map to be consistent -- the
    318  		// request could track the places where it has enqueued as places where
   319  
   320  		locks map[*lockState]struct{}
   321  
   322  		// If this is true, the state has changed and the channel has been
   323  		// signaled, but what the state should be has not been computed. The call
   324  		// to CurState() needs to compute that current state. Deferring the
   325  		// computation makes the waiters do this work themselves instead of making
   326  		// the call to release/update locks or release reservations do this work
   327  		// (proportional to number of waiters).
   328  		mustFindNextLockAfter bool
   329  	}
   330  }
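
         // To make the waiting logic above concrete, the hypothetical caller-side loop
         // below is written against the guard methods implemented in this file
         // (ShouldWait, NewStateChan, CurState). Latch management, the exact
         // ScanAndEnqueue signature, and the transaction-push machinery are elided:
         //
         //	func waitOnGuard(ctx context.Context, g lockTableGuard) error {
         //		if !g.ShouldWait() {
         //			return nil // proceed to evaluation
         //		}
         //		// The caller releases its latches here.
         //		for {
         //			select {
         //			case <-g.NewStateChan():
         //			case <-ctx.Done():
         //				return ctx.Err()
         //			}
         //			switch state := g.CurState(); state.kind {
         //			case doneWaiting:
         //				return nil // re-acquire latches and call ScanAndEnqueue again
         //			case waitFor, waitForDistinguished, waitElsewhere:
         //				// Consider pushing state.txn for deadlock detection, then keep waiting.
         //			case waitSelf:
         //				// Sit tight and wait for the next notification.
         //			}
         //		}
         //	}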
   331  
   332  var _ lockTableGuard = &lockTableGuardImpl{}
   333  
   334  // Used to avoid allocations.
   335  var lockTableGuardImplPool = sync.Pool{
   336  	New: func() interface{} {
   337  		g := new(lockTableGuardImpl)
   338  		g.mu.signal = make(chan struct{}, 1)
   339  		g.mu.locks = make(map[*lockState]struct{})
   340  		return g
   341  	},
   342  }
   343  
   344  // newLockTableGuardImpl returns a new lockTableGuardImpl. The struct will
   345  // contain pre-allocated mu.signal and mu.locks fields, so it shouldn't be
   346  // overwritten blindly.
   347  func newLockTableGuardImpl() *lockTableGuardImpl {
   348  	return lockTableGuardImplPool.Get().(*lockTableGuardImpl)
   349  }
   350  
   351  // releaseLockTableGuardImpl releases the guard back into the object pool.
   352  func releaseLockTableGuardImpl(g *lockTableGuardImpl) {
   353  	// Preserve the signal channel and locks map fields in the pooled
   354  	// object. Drain the signal channel and assert that the map is empty.
   355  	// The map should have been cleared by lockState.requestDone.
   356  	signal, locks := g.mu.signal, g.mu.locks
   357  	select {
   358  	case <-signal:
   359  	default:
   360  	}
   361  	if len(locks) != 0 {
   362  		panic("lockTableGuardImpl.mu.locks not empty after Dequeue")
   363  	}
   364  
   365  	*g = lockTableGuardImpl{}
   366  	g.mu.signal = signal
   367  	g.mu.locks = locks
   368  	lockTableGuardImplPool.Put(g)
   369  }
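
         // The pool usage above follows a common Go idiom: reset the struct on release
         // but keep its pre-allocated channel and map so the next Get can reuse them.
         // A self-contained sketch of the same idea, with hypothetical names unrelated
         // to this package:
         //
         //	type buffer struct {
         //		data []byte
         //	}
         //
         //	var bufferPool = sync.Pool{
         //		New: func() interface{} { return &buffer{data: make([]byte, 0, 1024)} },
         //	}
         //
         //	func getBuffer() *buffer { return bufferPool.Get().(*buffer) }
         //
         //	func putBuffer(b *buffer) {
         //		data := b.data[:0] // keep the allocation, drop the contents
         //		*b = buffer{}
         //		b.data = data
         //		bufferPool.Put(b)
         //	}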
   370  
   371  func (g *lockTableGuardImpl) ShouldWait() bool {
   372  	g.mu.Lock()
   373  	defer g.mu.Unlock()
   374  	return g.mu.startWait
   375  }
   376  
   377  func (g *lockTableGuardImpl) NewStateChan() chan struct{} {
   378  	g.mu.Lock()
   379  	defer g.mu.Unlock()
   380  	return g.mu.signal
   381  }
   382  
   383  func (g *lockTableGuardImpl) CurState() waitingState {
   384  	g.mu.Lock()
   385  	defer g.mu.Unlock()
   386  	if !g.mu.mustFindNextLockAfter {
   387  		return g.mu.state
   388  	}
   389  	// Not actively waiting anywhere so no one else can set
   390  	// mustFindNextLockAfter to true while this method executes.
   391  	g.mu.mustFindNextLockAfter = false
   392  	g.mu.Unlock()
   393  	g.findNextLockAfter(false /* notify */)
   394  	g.mu.Lock() // Unlock deferred
   395  	return g.mu.state
   396  }
   397  
   398  func (g *lockTableGuardImpl) notify() {
   399  	select {
   400  	case g.mu.signal <- struct{}{}:
   401  	default:
   402  	}
   403  }
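
         // notify is a coalescing signal: because the channel has a buffer of one, the
         // send never blocks, and any number of notifications delivered before the
         // waiter wakes up collapse into a single wakeup, after which the waiter
         // recomputes its state via CurState. A self-contained sketch of the idiom
         // (hypothetical names):
         //
         //	sig := make(chan struct{}, 1)
         //
         //	// producer: never blocks; redundant signals are dropped
         //	select {
         //	case sig <- struct{}{}:
         //	default:
         //	}
         //
         //	// consumer: wakes at least once after any number of signals
         //	<-sig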
   404  
   405  // Called when the request is no longer actively waiting at lock l, and should
   406  // look for the next lock to wait at. hasReservation is true iff the request
   407  // acquired the reservation at l. Note that it will be false for requests that
    408  // were doing a read or a non-transactional write at the key.
   409  func (g *lockTableGuardImpl) doneWaitingAtLock(hasReservation bool, l *lockState) {
   410  	g.mu.Lock()
   411  	if !hasReservation {
   412  		delete(g.mu.locks, l)
   413  	}
   414  	g.mu.mustFindNextLockAfter = true
   415  	g.notify()
   416  	g.mu.Unlock()
   417  }
   418  
   419  func (g *lockTableGuardImpl) isSameTxn(txn *enginepb.TxnMeta) bool {
   420  	return g.txn != nil && g.txn.ID == txn.ID
   421  }
   422  
   423  func (g *lockTableGuardImpl) isSameTxnAsReservation(ws waitingState) bool {
   424  	return !ws.held && g.isSameTxn(ws.txn)
   425  }
   426  
   427  // Finds the next lock, after the current one, to actively wait at. If it
   428  // finds the next lock the request starts actively waiting there, else it is
   429  // told that it is done waiting.
   430  // Acquires g.mu.
   431  func (g *lockTableGuardImpl) findNextLockAfter(notify bool) {
   432  	spans := g.spans.GetSpans(g.sa, g.ss)
   433  	var span *spanset.Span
   434  	resumingInSameSpan := false
   435  	if g.index == -1 || len(spans[g.index].EndKey) == 0 {
   436  		span = stepToNextSpan(g)
   437  	} else {
   438  		span = &spans[g.index]
   439  		resumingInSameSpan = true
   440  	}
   441  	for span != nil {
   442  		startKey := span.Key
   443  		if resumingInSameSpan {
   444  			startKey = g.key
   445  		}
   446  		tree := g.tableSnapshot[g.ss]
   447  		iter := tree.MakeIter()
   448  
   449  		// From here on, the use of resumingInSameSpan is just a performance
   450  		// optimization to deal with the interface limitation of btree that
   451  		// prevents us from specifying an exclusive start key. We need to check
   452  		// that the lock is not the same as our exclusive start key and only need
   453  		// to do that check once -- for the first lock.
   454  		ltRange := &lockState{key: startKey, endKey: span.EndKey}
   455  		for iter.FirstOverlap(ltRange); iter.Valid(); iter.NextOverlap(ltRange) {
   456  			l := iter.Cur()
   457  			if resumingInSameSpan {
   458  				resumingInSameSpan = false
   459  				if l.key.Equal(startKey) {
   460  					// This lock is where it stopped waiting.
   461  					continue
   462  				}
   463  				// Else, past the lock where it stopped waiting. We may not
   464  				// encounter that lock since it may have been garbage collected.
   465  			}
   466  			if l.tryActiveWait(g, g.sa, notify) {
   467  				return
   468  			}
   469  		}
   470  		resumingInSameSpan = false
   471  		span = stepToNextSpan(g)
   472  	}
   473  	g.mu.Lock()
   474  	defer g.mu.Unlock()
   475  	g.mu.state = waitingState{kind: doneWaiting}
   476  	if notify {
   477  		g.notify()
   478  	}
   479  }
   480  
   481  // Waiting writers in a lockState are wrapped in a queuedGuard. A waiting
   482  // writer is typically waiting in an active state, i.e., the
   483  // lockTableGuardImpl.key refers to this lockState. However, breaking of
   484  // reservations (see the comment on reservations below, in lockState) can
   485  // cause a writer to be an inactive waiter.
   486  type queuedGuard struct {
   487  	guard  *lockTableGuardImpl
   488  	active bool // protected by lockState.mu
   489  }
   490  
   491  // Information about a lock holder.
   492  type lockHolderInfo struct {
   493  	// nil if there is no holder. Else this is the TxnMeta of the latest call to
   494  	// acquire/update the lock by this transaction. For a given transaction if
   495  	// the lock is continuously held by a succession of different TxnMetas, the
   496  	// epoch must be monotonic and the ts (derived from txn.WriteTimestamp for
   497  	// some calls, and request.ts for other calls) must be monotonic. After ts
    498  	// is initialized, the timestamps inside txn are not used.
   499  	txn *enginepb.TxnMeta
   500  
   501  	// All the TxnSeqs in the current epoch at which this lock has been
   502  	// acquired. In increasing order. We track these so that if a lock is
   503  	// acquired at both seq 5 and seq 7, rollback of 7 does not cause the lock
   504  	// to be released. This is also consistent with PostgreSQL semantics
   505  	// https://www.postgresql.org/docs/12/sql-select.html#SQL-FOR-UPDATE-SHARE
   506  	seqs []enginepb.TxnSeq
   507  
   508  	// The timestamp at which the lock is held.
   509  	ts hlc.Timestamp
   510  }
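
         // To illustrate why the full list of sequence numbers is kept, the
         // hypothetical snippet below rolls back seq 7 from a lock acquired at seqs 5
         // and 7; the lock stays held because the acquisition at seq 5 survives:
         //
         //	seqs := []enginepb.TxnSeq{5, 7}
         //	rolledBack := func(s enginepb.TxnSeq) bool { return s == 7 }
         //	kept := seqs[:0]
         //	for _, s := range seqs {
         //		if !rolledBack(s) {
         //			kept = append(kept, s)
         //		}
         //	}
         //	stillHeld := len(kept) > 0 // true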
   511  
   512  // Per lock state in lockTableImpl.
   513  //
   514  // NOTE: we can't easily pool lockState objects without some form of reference
   515  // counting because they are used as elements in a copy-on-write btree and may
   516  // still be referenced by clones of the tree even when deleted from the primary.
   517  // However, other objects referenced by lockState can be pooled as long as they
   518  // are removed from all lockStates that reference them first.
   519  type lockState struct {
   520  	id     uint64 // needed for implementing util/interval/generic type contract
   521  	endKey []byte // used in btree iteration and tests
   522  
   523  	// The key being locked and the scope of that key. This state is never
   524  	// mutated.
   525  	key roachpb.Key
   526  	ss  spanset.SpanScope
   527  
   528  	mu syncutil.Mutex // Protects everything below.
   529  
   530  	// Invariant summary (see detailed comments below):
    531  	// - holder.locked and waitQ.reservation != nil cannot both be true.
   532  	// - if holder.locked and multiple holderInfos have txn != nil: all the
   533  	//   txns must have the same txn.ID.
   534  	// - !holder.locked => waitingReaders.Len() == 0. That is, readers wait
   535  	//   only if the lock is held. They do not wait for a reservation.
   536  	// - If reservation != nil, that request is not in queuedWriters.
   537  
   538  	// Information about whether the lock is held and the holder. We track
   539  	// information for each durability level separately since a transaction can
   540  	// go through multiple epochs and TxnSeq and may acquire the same lock in
   541  	// replicated and unreplicated mode at different stages.
   542  	holder struct {
   543  		locked bool
   544  		// LockStrength is always Exclusive
   545  		holder [lock.MaxDurability + 1]lockHolderInfo
   546  	}
   547  
   548  	// Information about the requests waiting on the lock.
   549  	lockWaitQueue
   550  }
   551  
   552  type lockWaitQueue struct {
   553  	// Reservations:
   554  	//
   555  	// A not-held lock can be "reserved". A reservation is just a claim that
   556  	// prevents multiple requests from racing when the lock is released. A
    557  	// reservation by req2 can be broken by req1 if req1 has a smaller seqNum
   558  	// than req2. Only requests that specify SpanReadWrite for a key can make
   559  	// reservations. This means a reservation can only be made when the lock is
   560  	// not held, since the reservation (which can acquire an Exclusive lock) and
   561  	// the lock holder (which is an Exclusive lock) conflict.
   562  	//
   563  	// Read reservations are not permitted due to the complexities discussed in
   564  	// the review for #43740. Additionally, reads do not queue for their turn at
   565  	// all -- they are held in the waitingReaders list while the lock is held
    566  	// and removed when the lock is released, so they race with
   567  	// reservations. Let us consider scenarios where reads did wait in the same
   568  	// queue: the lock could be held or reserved by a write at ts=20, followed
   569  	// by a waiting writer at ts=18, writer at ts=10, reader at ts=12. That
   570  	// reader is waiting not because of a conflict with the holder, or reserver,
   571  	// or the first waiter, but because there is a waiter ahead of it which it
   572  	// conflicts with. This introduces more complexity in tracking who this
   573  	// reader should push. Also consider a scenario where a reader did not wait
   574  	// in the queue and waited on the side like in waitingReaders but acquired a
   575  	// read reservation (together with other readers) when the lock was
   576  	// released. Ignoring the unfairness of this, we can construct a deadlock
   577  	// scenario with request req1 with seqnum 1 and req2 with seqnum 2 where
   578  	// req1 and req2 both want to write at one key and so get ordered by their
   579  	// seqnums but at another key req2 wants to read and req1 wants to write and
   580  	// since req2 does not wait in the queue it acquires a read reservation
   581  	// before req1. See the discussion at the end of this comment section on how
   582  	// the behavior will extend when we start supporting Shared and Upgrade
   583  	// locks.
   584  	//
   585  	// Non-transactional requests can do both reads and writes but cannot be
   586  	// depended on since they don't have a transaction that can be pushed.
   587  	// Therefore they not only do not acquire locks, but cannot make reservations.
   588  	// The non-reservation for reads is already covered in the previous
   589  	// paragraph. For non-transactional writes, the request waits in the queue
   590  	// with other writers. The difference occurs:
   591  	// - when it gets to the front of the queue and there is no lock holder
   592  	//   or reservation: instead of acquiring the reservation it removes
   593  	//   itself from the lockState and proceeds to the next lock. If it
   594  	//   does not need to wait for any more locks and manages to acquire
   595  	//   latches before those locks are acquired by some other request, it
   596  	//   will evaluate.
   597  	// - when deciding to wait at a lock: if the lock has a reservation with
   598  	//   a sequence num higher than this non-transactional request it will
   599  	//   ignore that reservation. Note that ignoring such reservations is
   600  	//   safe since when this non-transactional request is holding latches
   601  	//   those reservation holders cannot be holding latches, so they cannot
   602  	//   conflict.
   603  	//
   604  	// Multiple requests from the same transaction wait independently, including
   605  	// the situation where one of the requests has a reservation and the other
   606  	// is waiting (currently this can only happen if both requests are doing
   607  	// SpanReadWrite). Making multiple requests from the same transaction
   608  	// jointly hold the reservation introduces code complexity since joint
   609  	// reservations can be partially broken (see deadlock example below), and is
   610  	// not necessarily fair to other requests. Additionally, if req1 from txn1
    611  	// is holding a reservation and req2 from txn1 is waiting, they must
   612  	// conflict wrt latches and cannot evaluate concurrently so there isn't a
   613  	// benefit to joint reservations. However, if one of the requests acquires
   614  	// the lock the other request no longer needs to wait on this lock. This
   615  	// situation motivates the waitSelf state.
   616  	//
   617  	// Deadlock example if joint reservations were supported and we did not
   618  	// allow partial breaking of such reservations:
   619  	//
   620  	// - Keys are A, B, C, D.
   621  	// - Key D is locked by some random txn.
   622  	// - req1 from txn1 writes A, B, D. It waits at D.
   623  	// - Some other request from some random txn that writes C arrives,
   624  	//   evaluates, and locks C.
   625  	// - req2 from txn2 that writes A, C. It waits at C.
   626  	// - Some other request from some random txn that writes A arrives,
   627  	//   evaluates, and locks A.
   628  	// - req3 from txn1 that writes A, C. It waits at A. Note that req1 and req3
   629  	//   are from the same txn.
   630  	// - A is unlocked. req3 reserves A and waits at C behind req2.
   631  	// - B is locked by some random txn.
   632  	// - D is unlocked. req1 reserves D and proceeds to scan again and finds A
   633  	//   is reserved by req3 which is the same txn so becomes a joint
   634  	//   reservation holder at A.
   635  	// - Since B is locked, req1 waits at B.
   636  	// - C is unlocked. req2 reserves C. It scans and finds req1+req3 holding
   637  	//   the joint reservation at A. If it queues behind this joint reservation
   638  	//   we have the following situation:
   639  	//        reservation   waiter
   640  	//   A     req1+req3     req2
   641  	//   C       req2        req3
   642  	//   This is a deadlock caused by the lock table unless req2 partially
   643  	//   breaks the reservation at A.
   644  	//
   645  	// Extension for Shared and Upgrade locks:
   646  	// There are 3 aspects to consider: holders; reservers; the dependencies
   647  	// that need to be captured when waiting.
   648  	//
   649  	// - Holders: only shared locks are compatible with themselves, so there can
   650  	//   be one of (a) no holder (b) multiple shared lock holders, (c) one
   651  	//   exclusive holder, (d) one upgrade holder. Non-locking reads will
   652  	//   wait in waitingReaders for only an incompatible exclusive holder.
   653  	//
   654  	// - Reservers: This follows the same pattern as holders. Non-locking reads
   655  	//   do not wait on reservers.
   656  	//
   657  	// - Queueing and dependencies: All potential lockers and non-transactional
   658  	//   writers will wait in the same queue. A sequence of consecutive requests
   659  	//   that have the potential to acquire a shared lock will jointly reserve
   660  	//   that shared lock. Such requests cannot jump ahead of requests with a
   661  	//   lower seqnum just because there is currently a shared lock reservation
   662  	//   (this can cause lockTable induced deadlocks). Such joint reservations
   663  	//   can be partially broken by a waiter desiring an exclusive or upgrade
   664  	//   lock. Like the current code, non-transactional writes will wait for
   665  	//   reservations that have a lower sequence num, but not make their own
   666  	//   reservation. Additionally, they can partially break joint reservations.
   667  	//
   668  	//   Reservations that are (partially or fully) broken cause requests to
   669  	//   reenter the queue as inactive waiters. This is no different than the
   670  	//   current behavior. Each request can specify the same key in spans for
   671  	//   ReadOnly, ReadShared, ReadUpgrade, ReadWrite. The spans will be
   672  	//   iterated over in decreasing order of strength, to only wait at a lock
   673  	//   at the highest strength (this is similar to the current behavior using
   674  	//   accessDecreasingStrength).
   675  	//
   676  	//   For dependencies, a waiter desiring an exclusive or upgrade lock always
   677  	//   conflicts with the holder(s) or reserver(s) so that is the dependency
   678  	//   that will be captured. A waiter desiring a shared lock may encounter a
   679  	//   situation where it does not conflict with the holder(s) or reserver(s)
   680  	//   since those are also shared lockers. In that case it will depend on the
   681  	//   first waiter since that waiter must be desiring a lock that is
   682  	//   incompatible with a shared lock.
   683  
   684  	reservation *lockTableGuardImpl
   685  
   686  	// TODO(sbhola): There are a number of places where we iterate over these
   687  	// lists looking for something, as described below. If some of these turn
   688  	// out to be inefficient, consider better data-structures. One idea is that
   689  	// for cases that find a particular guard the lockTableGuardImpl.locks can be
   690  	// a map instead of a set to point directly to the *list.Element.
   691  	//
   692  	// queuedWriters:
   693  	// - to find all active queuedWriters.
   694  	// - to find the first active writer to make it distinguished.
   695  	// - to find a particular guard.
   696  	// - to find the position, based on seqNum, for inserting a particular guard.
   697  	// - to find all waiting writers with a particular txn ID.
   698  	//
   699  	// waitingReaders:
   700  	// - readers with a higher timestamp than some timestamp.
   701  	// - to find a particular guard.
   702  
   703  	// Waiters: An active waiter needs to be notified about changes in who it is
   704  	// waiting for.
   705  
   706  	// List of *queuedGuard. A subset of these are actively waiting. If
   707  	// non-empty, either the lock is held or there is a reservation.
   708  	queuedWriters list.List
   709  
   710  	// List of *lockTableGuardImpl. All of these are actively waiting. If
   711  	// non-empty, the lock must be held. By definition these cannot be in
   712  	// waitSelf state since that state is only used when there is a reservation.
   713  	waitingReaders list.List
   714  
   715  	// If there is a non-empty set of active waiters that are not waitSelf, then
   716  	// at least one must be distinguished.
   717  	distinguishedWaiter *lockTableGuardImpl
   718  }
   719  
   720  //go:generate ../../../util/interval/generic/gen.sh *lockState concurrency
   721  
   722  // Methods required by util/interval/generic type contract.
   723  func (l *lockState) ID() uint64         { return l.id }
   724  func (l *lockState) Key() []byte        { return l.key }
   725  func (l *lockState) EndKey() []byte     { return l.endKey }
   726  func (l *lockState) New() *lockState    { return new(lockState) }
   727  func (l *lockState) SetID(v uint64)     { l.id = v }
   728  func (l *lockState) SetKey(v []byte)    { l.key = v }
   729  func (l *lockState) SetEndKey(v []byte) { l.endKey = v }
   730  
   731  // REQUIRES: l.mu is locked.
   732  func (l *lockState) String() string {
   733  	var buf strings.Builder
   734  	l.Format(&buf)
   735  	return buf.String()
   736  }
   737  
   738  // REQUIRES: l.mu is locked.
   739  func (l *lockState) Format(buf *strings.Builder) {
   740  	fmt.Fprintf(buf, " lock: %s\n", l.key)
   741  	if l.isEmptyLock() {
   742  		fmt.Fprintln(buf, "  empty")
   743  		return
   744  	}
   745  	writeResInfo := func(b *strings.Builder, txn *enginepb.TxnMeta, ts hlc.Timestamp) {
    746  		// TODO(sbhola): strip the leading 0 bytes from the UUID string since tests assign
    747  		// UUIDs using a counter, which would make this output more readable.
   748  		fmt.Fprintf(b, "txn: %v, ts: %v, seq: %v\n", txn.ID, ts, txn.Sequence)
   749  	}
   750  	writeHolderInfo := func(b *strings.Builder, txn *enginepb.TxnMeta, ts hlc.Timestamp) {
   751  		fmt.Fprintf(b, "  holder: txn: %v, ts: %v, info: ", txn.ID, ts)
   752  		first := true
   753  		for i := range l.holder.holder {
   754  			h := &l.holder.holder[i]
   755  			if h.txn == nil {
   756  				continue
   757  			}
   758  			if !first {
   759  				fmt.Fprintf(b, ", ")
   760  			}
   761  			first = false
   762  			if lock.Durability(i) == lock.Replicated {
   763  				fmt.Fprintf(b, "repl ")
   764  			} else {
   765  				fmt.Fprintf(b, "unrepl ")
   766  			}
   767  			fmt.Fprintf(b, "epoch: %d, seqs: [%d", h.txn.Epoch, h.seqs[0])
   768  			for j := 1; j < len(h.seqs); j++ {
   769  				fmt.Fprintf(b, ", %d", h.seqs[j])
   770  			}
   771  			fmt.Fprintf(b, "]")
   772  		}
   773  		fmt.Fprintln(b, "")
   774  	}
   775  	txn, ts := l.getLockerInfo()
   776  	if txn == nil {
   777  		fmt.Fprintf(buf, "  res: req: %d, ", l.reservation.seqNum)
   778  		writeResInfo(buf, l.reservation.txn, l.reservation.writeTS)
   779  	} else {
   780  		writeHolderInfo(buf, txn, ts)
   781  	}
   782  	if l.waitingReaders.Len() > 0 {
   783  		fmt.Fprintln(buf, "   waiting readers:")
   784  		for e := l.waitingReaders.Front(); e != nil; e = e.Next() {
   785  			g := e.Value.(*lockTableGuardImpl)
   786  			fmt.Fprintf(buf, "    req: %d, txn: ", g.seqNum)
   787  			if g.txn == nil {
   788  				fmt.Fprintln(buf, "none")
   789  			} else {
   790  				fmt.Fprintf(buf, "%v\n", g.txn.ID)
   791  			}
   792  		}
   793  	}
   794  	if l.queuedWriters.Len() > 0 {
   795  		fmt.Fprintln(buf, "   queued writers:")
   796  		for e := l.queuedWriters.Front(); e != nil; e = e.Next() {
   797  			qg := e.Value.(*queuedGuard)
   798  			g := qg.guard
   799  			fmt.Fprintf(buf, "    active: %t req: %d, txn: ",
   800  				qg.active, qg.guard.seqNum)
   801  			if g.txn == nil {
   802  				fmt.Fprintln(buf, "none")
   803  			} else {
   804  				fmt.Fprintf(buf, "%v\n", g.txn.ID)
   805  			}
   806  		}
   807  	}
   808  	if l.distinguishedWaiter != nil {
   809  		fmt.Fprintf(buf, "   distinguished req: %d\n", l.distinguishedWaiter.seqNum)
   810  	}
   811  }
   812  
   813  // Called for a write request when there is a reservation. Returns true iff it
   814  // succeeds.
   815  // REQUIRES: l.mu is locked.
   816  func (l *lockState) tryBreakReservation(seqNum uint64) bool {
   817  	if l.reservation.seqNum > seqNum {
   818  		qg := &queuedGuard{
   819  			guard:  l.reservation,
   820  			active: false,
   821  		}
   822  		l.queuedWriters.PushFront(qg)
   823  		l.reservation = nil
   824  		return true
   825  	}
   826  	return false
   827  }
   828  
   829  // Informs active waiters about reservation or lock holder. The reservation
   830  // may have changed so this needs to fix any inconsistencies wrt waitSelf and
   831  // waitForDistinguished states.
   832  // REQUIRES: l.mu is locked.
   833  func (l *lockState) informActiveWaiters() {
   834  	waitForState := waitingState{kind: waitFor, key: l.key}
   835  	findDistinguished := l.distinguishedWaiter == nil
   836  	if lockHolderTxn, _ := l.getLockerInfo(); lockHolderTxn != nil {
   837  		waitForState.txn = lockHolderTxn
   838  		waitForState.held = true
   839  	} else {
   840  		waitForState.txn = l.reservation.txn
   841  		if !findDistinguished && l.distinguishedWaiter.isSameTxnAsReservation(waitForState) {
   842  			findDistinguished = true
   843  			l.distinguishedWaiter = nil
   844  		}
   845  	}
   846  
   847  	for e := l.waitingReaders.Front(); e != nil; e = e.Next() {
   848  		state := waitForState
   849  		state.guardAccess = spanset.SpanReadOnly
   850  		// Since there are waiting readers we could not have transitioned out of
   851  		// or into a state with a reservation, since readers do not wait for
   852  		// reservations.
   853  		g := e.Value.(*lockTableGuardImpl)
   854  		if findDistinguished {
   855  			l.distinguishedWaiter = g
   856  			findDistinguished = false
   857  		}
   858  		g.mu.Lock()
   859  		g.mu.state = state
   860  		if l.distinguishedWaiter == g {
   861  			g.mu.state.kind = waitForDistinguished
   862  		}
   863  		g.notify()
   864  		g.mu.Unlock()
   865  	}
   866  	for e := l.queuedWriters.Front(); e != nil; e = e.Next() {
   867  		qg := e.Value.(*queuedGuard)
   868  		if !qg.active {
   869  			continue
   870  		}
   871  		g := qg.guard
   872  		var state waitingState
   873  		if g.isSameTxnAsReservation(waitForState) {
   874  			state = waitingState{kind: waitSelf}
   875  		} else {
   876  			state = waitForState
   877  			state.guardAccess = spanset.SpanReadWrite
   878  			if findDistinguished {
   879  				l.distinguishedWaiter = g
   880  				findDistinguished = false
   881  			}
   882  			if l.distinguishedWaiter == g {
   883  				state.kind = waitForDistinguished
   884  			}
   885  		}
   886  		g.mu.Lock()
   887  		g.mu.state = state
   888  		g.notify()
   889  		g.mu.Unlock()
   890  	}
   891  }
   892  
   893  // releaseWritersFromTxn removes all waiting writers for the lockState that are
   894  // part of the specified transaction.
   895  // REQUIRES: l.mu is locked.
   896  func (l *lockState) releaseWritersFromTxn(txn *enginepb.TxnMeta) {
   897  	for e := l.queuedWriters.Front(); e != nil; {
   898  		qg := e.Value.(*queuedGuard)
   899  		curr := e
   900  		e = e.Next()
   901  		g := qg.guard
   902  		if g.isSameTxn(txn) {
   903  			if qg.active {
   904  				if g == l.distinguishedWaiter {
   905  					l.distinguishedWaiter = nil
   906  				}
   907  				g.doneWaitingAtLock(false, l)
   908  			} else {
   909  				g.mu.Lock()
   910  				delete(g.mu.locks, l)
   911  				g.mu.Unlock()
   912  			}
   913  			l.queuedWriters.Remove(curr)
   914  		}
   915  	}
   916  }
   917  
   918  // When the active waiters have shrunk and the distinguished waiter has gone,
   919  // try to make a new distinguished waiter if there is at least 1 active
   920  // waiter.
   921  // REQUIRES: l.mu is locked.
   922  func (l *lockState) tryMakeNewDistinguished() {
   923  	var g *lockTableGuardImpl
   924  	if l.waitingReaders.Len() > 0 {
   925  		g = l.waitingReaders.Front().Value.(*lockTableGuardImpl)
   926  	} else if l.queuedWriters.Len() > 0 {
   927  		for e := l.queuedWriters.Front(); e != nil; e = e.Next() {
   928  			qg := e.Value.(*queuedGuard)
   929  			if qg.active && (l.reservation == nil || !qg.guard.isSameTxn(l.reservation.txn)) {
   930  				g = qg.guard
   931  				break
   932  			}
   933  		}
   934  	}
   935  	if g != nil {
   936  		l.distinguishedWaiter = g
   937  		g.mu.Lock()
   938  		g.mu.state.kind = waitForDistinguished
   939  		// The rest of g.state is already up-to-date.
   940  		g.notify()
   941  		g.mu.Unlock()
   942  	}
   943  }
   944  
   945  // Returns true iff the lock is currently held by the transaction with the
   946  // given id.
   947  // REQUIRES: l.mu is locked.
   948  func (l *lockState) isLockedBy(id uuid.UUID) bool {
   949  	if l.holder.locked {
   950  		var holderID uuid.UUID
   951  		if l.holder.holder[lock.Unreplicated].txn != nil {
   952  			holderID = l.holder.holder[lock.Unreplicated].txn.ID
   953  		} else {
   954  			holderID = l.holder.holder[lock.Replicated].txn.ID
   955  		}
   956  		return id == holderID
   957  	}
   958  	return false
   959  }
   960  
   961  // Returns information about the current lock holder if the lock is held, else
   962  // returns nil.
   963  // REQUIRES: l.mu is locked.
   964  func (l *lockState) getLockerInfo() (*enginepb.TxnMeta, hlc.Timestamp) {
   965  	if !l.holder.locked {
   966  		return nil, hlc.Timestamp{}
   967  	}
   968  
   969  	// If the lock is held as both replicated and unreplicated we want to
   970  	// provide the lower of the two timestamps, since the lower timestamp
   971  	// contends with more transactions. Else we provide whichever one it is held
   972  	// at.
   973  
   974  	// Start with the assumption that it is held as replicated.
   975  	index := lock.Replicated
   976  	// Condition under which we prefer the unreplicated holder.
   977  	if l.holder.holder[index].txn == nil || (l.holder.holder[lock.Unreplicated].txn != nil &&
   978  		// If we are evaluating the following clause we are sure that it is held
   979  		// as both replicated and unreplicated.
   980  		l.holder.holder[lock.Unreplicated].ts.Less(l.holder.holder[lock.Replicated].ts)) {
   981  		index = lock.Unreplicated
   982  	}
   983  	return l.holder.holder[index].txn, l.holder.holder[index].ts
   984  }
   985  
   986  // Decides whether the request g with access sa should actively wait at this
   987  // lock and if yes, adjusts the data-structures appropriately. The notify
   988  // parameter is true iff the request's new state channel should be notified --
   989  // it is set to false when the call to tryActiveWait is happening due to an
   990  // event for a different request or transaction (like a lock release) since in
   991  // that case the channel is notified first and the call to tryActiveWait()
   992  // happens later in lockTableGuard.CurState(). The return value is true iff
   993  // it is actively waiting.
   994  // Acquires l.mu, g.mu.
   995  func (l *lockState) tryActiveWait(g *lockTableGuardImpl, sa spanset.SpanAccess, notify bool) bool {
   996  	l.mu.Lock()
   997  	defer l.mu.Unlock()
   998  
   999  	// It is possible that this lock is empty and has not yet been deleted.
  1000  	if l.isEmptyLock() {
  1001  		return false
  1002  	}
  1003  
  1004  	// Lock is not empty.
  1005  	lockHolderTxn, lockHolderTS := l.getLockerInfo()
  1006  	if lockHolderTxn != nil && g.isSameTxn(lockHolderTxn) {
  1007  		// Already locked by this txn.
  1008  		return false
  1009  	}
  1010  
  1011  	if sa == spanset.SpanReadOnly {
  1012  		if lockHolderTxn == nil {
  1013  			// Reads only care about locker, not a reservation.
  1014  			return false
  1015  		}
  1016  		// Locked by some other txn.
  1017  		if g.readTS.Less(lockHolderTS) {
  1018  			return false
  1019  		}
  1020  		g.mu.Lock()
  1021  		_, alsoHasStrongerAccess := g.mu.locks[l]
  1022  		g.mu.Unlock()
  1023  
  1024  		// If the request already has this lock in its locks map, it must also be
  1025  		// writing to this key and must be either a reservation holder or inactive
  1026  		// waiter at this lock. The former has already been handled above. For the
  1027  		// latter, it must have had its reservation broken. Since this is a weaker
  1028  		// access we defer to the stronger access and don't wait here.
  1029  		//
  1030  		// For non-transactional requests that have the key specified as both
  1031  		// SpanReadOnly and SpanReadWrite, the request never acquires a
  1032  		// reservation, so using the locks map to detect this duplication of the
  1033  		// key is not possible. In the rare case, the lock is now held at a
  1034  		// timestamp that is not compatible with this request and it will wait
  1035  		// here -- there is no correctness issue with doing that.
  1036  		if alsoHasStrongerAccess {
  1037  			return false
  1038  		}
  1039  	}
  1040  
  1041  	waitForState := waitingState{kind: waitFor, key: l.key}
  1042  	if lockHolderTxn != nil {
  1043  		waitForState.txn = lockHolderTxn
  1044  		waitForState.held = true
  1045  	} else {
  1046  		if l.reservation == g {
  1047  			// Already reserved by this request.
  1048  			return false
  1049  		}
  1050  		// A non-transactional write request never makes or breaks reservations,
  1051  		// and only waits for a reservation if the reservation has a lower
   1052  		// seqNum. Note that `sa == spanset.SpanReadOnly && lockHolderTxn == nil`
  1053  		// was already checked above.
  1054  		if g.txn == nil && l.reservation.seqNum > g.seqNum {
  1055  			// Reservation is held by a request with a higher seqNum and g is a
  1056  			// non-transactional request. Ignore the reservation.
  1057  			return false
  1058  		}
  1059  		waitForState.txn = l.reservation.txn
  1060  	}
  1061  
  1062  	// Incompatible with whoever is holding lock or reservation.
  1063  
  1064  	if l.reservation != nil && sa == spanset.SpanReadWrite && l.tryBreakReservation(g.seqNum) {
  1065  		l.reservation = g
  1066  		g.mu.Lock()
  1067  		g.mu.locks[l] = struct{}{}
  1068  		g.mu.Unlock()
  1069  		// There cannot be waitingReaders, since they do not wait for
  1070  		// reservations. And the set of active queuedWriters has not changed, but
  1071  		// they do need to be told about the change in who they are waiting for.
  1072  		l.informActiveWaiters()
  1073  		return false
  1074  	}
  1075  
  1076  	// Need to wait.
  1077  
  1078  	g.mu.Lock()
  1079  	defer g.mu.Unlock()
  1080  	if sa == spanset.SpanReadWrite {
  1081  		if _, inQueue := g.mu.locks[l]; inQueue {
  1082  			// Already in queue and must be in the right position, so mark as active
  1083  			// waiter there. We expect this to be rare.
  1084  			var qg *queuedGuard
  1085  			for e := l.queuedWriters.Front(); e != nil; e = e.Next() {
  1086  				qqg := e.Value.(*queuedGuard)
  1087  				if qqg.guard == g {
  1088  					qg = qqg
  1089  					break
  1090  				}
  1091  			}
  1092  			if qg == nil {
  1093  				panic("lockTable bug")
  1094  			}
  1095  			qg.active = true
  1096  		} else {
  1097  			// Not in queue so insert as active waiter.
  1098  			qg := &queuedGuard{
  1099  				guard:  g,
  1100  				active: true,
  1101  			}
  1102  			if l.queuedWriters.Len() == 0 {
  1103  				l.queuedWriters.PushFront(qg)
  1104  			} else {
  1105  				var e *list.Element
  1106  				for e = l.queuedWriters.Back(); e != nil; e = e.Prev() {
  1107  					qqg := e.Value.(*queuedGuard)
  1108  					if qqg.guard.seqNum < qg.guard.seqNum {
  1109  						break
  1110  					}
  1111  				}
  1112  				if e == nil {
  1113  					l.queuedWriters.PushFront(qg)
  1114  				} else {
  1115  					l.queuedWriters.InsertAfter(qg, e)
  1116  				}
  1117  			}
  1118  			g.mu.locks[l] = struct{}{}
  1119  		}
  1120  	} else {
  1121  		l.waitingReaders.PushFront(g)
  1122  		g.mu.locks[l] = struct{}{}
  1123  	}
  1124  	// Make it an active waiter.
  1125  	g.key = l.key
  1126  	g.mu.startWait = true
  1127  	if g.isSameTxnAsReservation(waitForState) {
  1128  		g.mu.state = waitingState{kind: waitSelf}
  1129  	} else {
  1130  		state := waitForState
  1131  		state.guardAccess = sa
  1132  		if l.distinguishedWaiter == nil {
  1133  			l.distinguishedWaiter = g
  1134  			state.kind = waitForDistinguished
  1135  		}
  1136  		g.mu.state = state
  1137  	}
  1138  	if notify {
  1139  		g.notify()
  1140  	}
  1141  	return true
  1142  }
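
         // The queuedWriters insertion above keeps the list ordered by seqNum by
         // scanning from the back for the first entry with a smaller seqNum. The same
         // idiom over plain ints, as a self-contained hypothetical sketch using
         // container/list (imported above):
         //
         //	func pushInOrder(l *list.List, v int) {
         //		var e *list.Element
         //		for e = l.Back(); e != nil; e = e.Prev() {
         //			if e.Value.(int) < v {
         //				break
         //			}
         //		}
         //		if e == nil {
         //			l.PushFront(v)
         //		} else {
         //			l.InsertAfter(v, e)
         //		}
         //	}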
  1143  
   1144  // Acquires this lock. Requests from the acquiring transaction that were
   1145  // actively waiting at this key are done waiting and are released from the
   1146  // wait-queues.
  1147  // Acquires l.mu.
  1148  func (l *lockState) acquireLock(
  1149  	_ lock.Strength, durability lock.Durability, txn *enginepb.TxnMeta, ts hlc.Timestamp,
  1150  ) error {
  1151  	l.mu.Lock()
  1152  	defer l.mu.Unlock()
  1153  	if l.holder.locked {
  1154  		// Already held.
  1155  		beforeTxn, beforeTs := l.getLockerInfo()
  1156  		if txn.ID != beforeTxn.ID {
  1157  			return errors.Errorf("caller violated contract: " +
  1158  				"existing lock cannot be acquired by different transaction")
  1159  		}
  1160  		seqs := l.holder.holder[durability].seqs
  1161  		if l.holder.holder[durability].txn != nil && l.holder.holder[durability].txn.Epoch < txn.Epoch {
  1162  			// Clear the sequences for the older epoch.
  1163  			seqs = seqs[:0]
  1164  		}
  1165  		if len(seqs) > 0 && seqs[len(seqs)-1] >= txn.Sequence {
  1166  			// Idempotent lock acquisition. In this case, we simply ignore the lock
  1167  			// acquisition as long as it corresponds to an existing sequence number.
  1168  			// If the sequence number is not being tracked yet, insert it into the
  1169  			// sequence history. The validity of such a lock re-acquisition should
  1170  			// have already been determined at the MVCC level.
  1171  			if i := sort.Search(len(seqs), func(i int) bool {
  1172  				return seqs[i] >= txn.Sequence
  1173  			}); i == len(seqs) {
  1174  				panic("lockTable bug - search value <= last element")
  1175  			} else if seqs[i] != txn.Sequence {
  1176  				seqs = append(seqs, 0)
  1177  				copy(seqs[i+1:], seqs[i:])
  1178  				seqs[i] = txn.Sequence
  1179  				l.holder.holder[durability].seqs = seqs
  1180  			}
  1181  			return nil
  1182  		}
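
         		// The insertion above is the standard sort.Search idiom for adding a
         		// value to a sorted slice. A self-contained hypothetical sketch of the
         		// same pattern over plain ints:
         		//
         		//	func insertSorted(s []int, v int) []int {
         		//		i := sort.Search(len(s), func(i int) bool { return s[i] >= v })
         		//		if i < len(s) && s[i] == v {
         		//			return s // already present
         		//		}
         		//		s = append(s, 0)
         		//		copy(s[i+1:], s[i:])
         		//		s[i] = v
         		//		return s
         		//	}
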
  1183  		l.holder.holder[durability].txn = txn
  1184  		// Forward the lock's timestamp instead of assigning to it blindly.
  1185  		// While lock acquisition uses monotonically increasing timestamps
  1186  		// from the perspective of the transaction's coordinator, this does
  1187  		// not guarantee that a lock will never be acquired at a higher
  1188  		// epoch and/or sequence number but with a lower timestamp when in
  1189  		// the presence of transaction pushes. Consider the following
  1190  		// sequence of events:
  1191  		//
  1192  		//  - txn A acquires lock at sequence 1, ts 10
  1193  		//  - txn B pushes txn A to ts 20
  1194  		//  - txn B updates lock to ts 20
  1195  		//  - txn A's coordinator does not immediately learn of the push
  1196  		//  - txn A re-acquires lock at sequence 2, ts 15
  1197  		//
  1198  		// A lock's timestamp at a given durability level is not allowed to
  1199  		// regress, so by forwarding its timestamp during the second acquisition
   1200  		// instead of assigning to it blindly, it remains at 20.
  1201  		//
  1202  		// However, a lock's timestamp as reported by getLockerInfo can regress
  1203  		// if it is acquired at a lower timestamp and a different durability
  1204  		// than it was previously held with. We must support this because of
  1205  		// the hard constraint, which we must uphold here, that the
  1206  		// lockHolderInfo for a replicated lock cannot diverge from the
  1207  		// replicated state machine in such a way that its timestamp in the
  1208  		// lockTable exceeds that in the replicated keyspace. If this invariant
  1209  		// were to be violated, we'd risk infinite lock-discovery loops for
  1210  		// requests that conflict with the lock as is written in the replicated
  1211  		// state machine but not as is reflected in the lockTable.
  1212  		//
  1213  		// Lock timestamp regressions are safe from the perspective of other
  1214  		// transactions because the request which re-acquired the lock at the
  1215  		// lower timestamp must have been holding a write latch at or below the
  1216  		// new lock's timestamp. This means that no conflicting requests could
  1217  		// be evaluating concurrently. Instead, all will need to re-scan the
  1218  		// lockTable once they acquire latches and will notice the reduced
  1219  		// timestamp at that point, which may cause them to conflict with the
  1220  		// lock even if they had not conflicted before. In a sense, it is no
  1221  		// different than the first time a lock is added to the lockTable.
  1222  		l.holder.holder[durability].ts.Forward(ts)
  1223  		l.holder.holder[durability].seqs = append(seqs, txn.Sequence)
  1224  
  1225  		_, afterTs := l.getLockerInfo()
  1226  		if beforeTs.Less(afterTs) {
  1227  			l.increasedLockTs(afterTs)
  1228  		}
  1229  		return nil
  1230  	}
  1231  	// Not already held, so may be reserved by this request. There is also the
  1232  	// possibility that some other request has broken this reservation because
  1233  	// of a concurrent release but that is harmless since this request is
  1234  	// holding latches and has proceeded to evaluation.
  1235  	if l.reservation != nil {
  1236  		if l.reservation.txn.ID != txn.ID {
  1237  			// Reservation is broken.
  1238  			qg := &queuedGuard{
  1239  				guard:  l.reservation,
  1240  				active: false,
  1241  			}
  1242  			l.queuedWriters.PushFront(qg)
  1243  		} else {
  1244  			// Else the reservation is not broken, or it was broken by a different
  1245  			// request from the same transaction. In the latter case, neither request
  1246  			// is actively waiting at this lock. We don't know which is in the queue
  1247  			// and which is holding the reservation but it does not matter. Both
  1248  			// will have their lockTableGuardImpl.mu.locks updated and neither will
  1249  			// be in the queue at the end of this method.
  1250  			l.reservation.mu.Lock()
  1251  			delete(l.reservation.mu.locks, l)
  1252  			l.reservation.mu.Unlock()
  1253  		}
  1254  		if l.waitingReaders.Len() > 0 {
  1255  			panic("lockTable bug")
  1256  		}
  1257  	} else {
  1258  		if l.queuedWriters.Len() > 0 || l.waitingReaders.Len() > 0 {
  1259  			panic("lockTable bug")
  1260  		}
  1261  	}
  1262  	l.reservation = nil
  1263  	l.holder.locked = true
  1264  	l.holder.holder[durability].txn = txn
  1265  	l.holder.holder[durability].ts = ts
  1266  	l.holder.holder[durability].seqs = append([]enginepb.TxnSeq(nil), txn.Sequence)
  1267  
  1268  	// If there are waiting requests from the same txn, they no longer need to wait.
  1269  	l.releaseWritersFromTxn(txn)
  1270  
  1271  	// Inform active waiters since lock has transitioned to held.
  1272  	l.informActiveWaiters()
  1273  	return nil
  1274  }
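
// The sorted insert used in the idempotent re-acquisition path above, in
// isolation. This is an illustrative sketch only, assuming a sorted seqs
// slice; insertTxnSeqSorted is not part of this package and is not called by
// the lock table.
func insertTxnSeqSorted(seqs []enginepb.TxnSeq, seq enginepb.TxnSeq) []enginepb.TxnSeq {
	// Find the first index whose sequence number is >= seq.
	i := sort.Search(len(seqs), func(i int) bool { return seqs[i] >= seq })
	if i < len(seqs) && seqs[i] == seq {
		// Already tracked; re-acquisition is a no-op.
		return seqs
	}
	// Grow by one element and shift the suffix right to open a hole at i.
	seqs = append(seqs, 0)
	copy(seqs[i+1:], seqs[i:])
	seqs[i] = seq
	return seqs
}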
  1275  
  1276  // A replicated lock held by txn with timestamp ts was discovered by guard g
  1277  // where g is trying to access this key with access sa.
  1278  // Acquires l.mu.
  1279  func (l *lockState) discoveredLock(
  1280  	txn *enginepb.TxnMeta, ts hlc.Timestamp, g *lockTableGuardImpl, sa spanset.SpanAccess,
  1281  ) error {
  1282  	l.mu.Lock()
  1283  	defer l.mu.Unlock()
  1284  
  1285  	if l.holder.locked {
  1286  		if !l.isLockedBy(txn.ID) {
  1287  			return errors.Errorf("caller violated contract: " +
  1288  				"discovered lock by different transaction than existing lock")
  1289  		}
  1290  	} else {
  1291  		l.holder.locked = true
  1292  	}
  1293  	holder := &l.holder.holder[lock.Replicated]
  1294  	if holder.txn == nil {
  1295  		holder.txn = txn
  1296  		holder.ts = ts
  1297  		holder.seqs = append(holder.seqs, txn.Sequence)
  1298  	}
  1299  
  1300  	// Queue the existing reservation holder. Note that this reservation
  1301  	// holder may not be equal to g for two reasons: (a) the reservation
  1302  	// of g could have been broken even though g is holding latches (see
  1303  	// the comment in acquireLock()), and (b) g may be a non-transactional
  1304  	// request (read or write) that can ignore the reservation.
  1305  	if l.reservation != nil {
  1306  		qg := &queuedGuard{
  1307  			guard:  l.reservation,
  1308  			active: false,
  1309  		}
  1310  		l.queuedWriters.PushFront(qg)
  1311  		l.reservation = nil
  1312  	}
  1313  
  1314  	switch sa {
  1315  	case spanset.SpanReadOnly:
  1316  		// Don't enter the lock's waitingReaders list, because all waiting readers
  1317  		// are expected to be active. Instead, wait until the next scan.
  1318  
  1319  		// Confirm that the guard will wait on the lock the next time it scans
  1320  		// the lock table. If not then it shouldn't have discovered the lock in
  1321  		// the first place. Bugs here would cause infinite loops where the same
  1322  		// lock is repeatedly re-discovered.
  1323  		if g.readTS.Less(ts) {
  1324  			return errors.Errorf("caller violated contract: discovered non-conflicting lock")
  1325  		}
  1326  
  1327  	case spanset.SpanReadWrite:
  1328  		// Immediately enter the lock's queuedWriters list.
  1329  		g.mu.Lock()
  1330  		_, presentHere := g.mu.locks[l]
  1331  		if !presentHere {
  1332  			// Since g will place itself in queue as inactive waiter below.
  1333  			g.mu.locks[l] = struct{}{}
  1334  		}
  1335  		g.mu.Unlock()
  1336  
  1337  		if !presentHere {
  1338  			// Put self in queue as inactive waiter.
  1339  			qg := &queuedGuard{
  1340  				guard:  g,
  1341  				active: false,
  1342  			}
  1343  			// g is not necessarily first in the queue in the (rare) case (a) above.
  1344  			var e *list.Element
  1345  			for e = l.queuedWriters.Front(); e != nil; e = e.Next() {
  1346  				qqg := e.Value.(*queuedGuard)
  1347  				if qqg.guard.seqNum > g.seqNum {
  1348  					break
  1349  				}
  1350  			}
  1351  			if e == nil {
  1352  				l.queuedWriters.PushBack(qg)
  1353  			} else {
  1354  				l.queuedWriters.InsertBefore(qg, e)
  1355  			}
  1356  		}
  1357  	}
  1358  
  1359  	// If there are waiting requests from the same txn, they no longer need to wait.
  1360  	l.releaseWritersFromTxn(txn)
  1361  
  1362  	// Active waiters need to be told about who they are waiting for.
  1363  	l.informActiveWaiters()
  1364  	return nil
  1365  }
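
// The queuedWriters insertion above keeps the wait-queue ordered by request
// sequence number. Here is the same pattern in isolation, as a minimal sketch
// over a container/list holding ints; listInsertOrdered and the int payload
// are illustration-only assumptions (the real queue stores *queuedGuard
// values and compares guard.seqNum).
func listInsertOrdered(q *list.List, v int) {
	var e *list.Element
	for e = q.Front(); e != nil; e = e.Next() {
		if e.Value.(int) > v {
			break
		}
	}
	if e == nil {
		// v is the largest value: append at the back.
		q.PushBack(v)
	} else {
		// Insert just before the first larger element, preserving order.
		q.InsertBefore(v, e)
	}
}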
  1366  
  1367  // Acquires l.mu.
  1368  func (l *lockState) tryClearLock(force bool) bool {
  1369  	l.mu.Lock()
  1370  	defer l.mu.Unlock()
  1371  	replicatedHeld := l.holder.locked && l.holder.holder[lock.Replicated].txn != nil
  1372  	if replicatedHeld && l.distinguishedWaiter == nil && !force {
  1373  		// Replicated lock is held and has no distinguished waiter.
  1374  		return false
  1375  	}
  1376  
  1377  	// Remove unreplicated holder.
  1378  	l.holder.holder[lock.Unreplicated] = lockHolderInfo{}
  1379  	var waitState waitingState
  1380  	if replicatedHeld && !force {
  1381  		lockHolderTxn, _ := l.getLockerInfo()
  1382  		// Note that none of the current waiters can be requests
  1383  		// from lockHolderTxn.
  1384  		waitState = waitingState{
  1385  			kind:        waitElsewhere,
  1386  			txn:         lockHolderTxn,
  1387  			key:         l.key,
  1388  			held:        true,
  1389  			guardAccess: spanset.SpanReadOnly,
  1390  		}
  1391  	} else {
  1392  		l.holder.locked = false
  1393  		waitState = waitingState{kind: doneWaiting}
  1394  	}
  1395  
  1396  	l.distinguishedWaiter = nil
  1397  	if l.reservation != nil {
  1398  		g := l.reservation
  1399  		g.mu.Lock()
  1400  		delete(g.mu.locks, l)
  1401  		g.mu.Unlock()
  1402  		l.reservation = nil
  1403  	}
  1404  	for e := l.waitingReaders.Front(); e != nil; {
  1405  		g := e.Value.(*lockTableGuardImpl)
  1406  		curr := e
  1407  		e = e.Next()
  1408  		l.waitingReaders.Remove(curr)
  1409  
  1410  		g.mu.Lock()
  1411  		g.mu.state = waitState
  1412  		g.notify()
  1413  		delete(g.mu.locks, l)
  1414  		g.mu.Unlock()
  1415  	}
  1416  
  1417  	waitState.guardAccess = spanset.SpanReadWrite
  1418  	for e := l.queuedWriters.Front(); e != nil; {
  1419  		qg := e.Value.(*queuedGuard)
  1420  		curr := e
  1421  		e = e.Next()
  1422  		l.queuedWriters.Remove(curr)
  1423  
  1424  		g := qg.guard
  1425  		g.mu.Lock()
  1426  		if qg.active {
  1427  			g.mu.state = waitState
  1428  			g.notify()
  1429  		}
  1430  		delete(g.mu.locks, l)
  1431  		g.mu.Unlock()
  1432  	}
  1433  	return true
  1434  }
  1435  
  1436  // Returns true iff the lockState is empty, i.e., there is no lock holder or
  1437  // reservation.
  1438  // REQUIRES: l.mu is locked.
  1439  func (l *lockState) isEmptyLock() bool {
  1440  	if !l.holder.locked && l.reservation == nil {
  1441  		if l.waitingReaders.Len() > 0 || l.queuedWriters.Len() > 0 {
  1442  			panic("lockTable bug")
  1443  		}
  1444  		return true
  1445  	}
  1446  	return false
  1447  }
  1448  
  1449  // Removes the TxnSeqs in heldSeqNums that are contained in ignoredSeqNums.
  1450  // REQUIRES: ignoredSeqNums contains non-overlapping ranges and is sorted in
  1451  // increasing seq order.
  1452  func removeIgnored(
  1453  	heldSeqNums []enginepb.TxnSeq, ignoredSeqNums []enginepb.IgnoredSeqNumRange,
  1454  ) []enginepb.TxnSeq {
  1455  	if len(ignoredSeqNums) == 0 {
  1456  		return heldSeqNums
  1457  	}
  1458  	held := heldSeqNums[:0]
  1459  	for _, n := range heldSeqNums {
  1460  		i := sort.Search(len(ignoredSeqNums), func(i int) bool { return ignoredSeqNums[i].End >= n })
  1461  		if i == len(ignoredSeqNums) || ignoredSeqNums[i].Start > n {
  1462  			held = append(held, n)
  1463  		}
  1464  	}
  1465  	return held
  1466  }
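
// A small worked example of removeIgnored. This is an illustrative sketch
// only; exampleRemoveIgnored and the literal values are not part of the lock
// table. With held sequence numbers [1, 2, 3, 5, 8] and ignored ranges [2, 3]
// and [6, 7], the result is [1, 5, 8]: 2 and 3 fall inside the first ignored
// range and no held sequence falls inside the second.
func exampleRemoveIgnored() []enginepb.TxnSeq {
	held := []enginepb.TxnSeq{1, 2, 3, 5, 8}
	ignored := []enginepb.IgnoredSeqNumRange{
		{Start: 2, End: 3},
		{Start: 6, End: 7},
	}
	// removeIgnored filters in place and returns the shortened slice, which
	// aliases the original backing array.
	return removeIgnored(held, ignored)
}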
  1467  
  1468  // Tries to update the lock: noop if this lock is held by a different
  1469  // transaction, else the lock is updated. Returns whether the lockState can be
  1470  // garbage collected.
  1471  // Acquires l.mu.
  1472  func (l *lockState) tryUpdateLock(up *roachpb.LockUpdate) (gc bool, err error) {
  1473  	l.mu.Lock()
  1474  	defer l.mu.Unlock()
  1475  	if !l.isLockedBy(up.Txn.ID) {
  1476  		return false, nil
  1477  	}
  1478  	if up.Status.IsFinalized() {
  1479  		l.holder.locked = false
  1480  		for i := range l.holder.holder {
  1481  			l.holder.holder[i] = lockHolderInfo{}
  1482  		}
  1483  		gc = l.lockIsFree()
  1484  		return gc, nil
  1485  	}
  1486  
  1487  	txn := &up.Txn
  1488  	ts := up.Txn.WriteTimestamp
  1489  	_, beforeTs := l.getLockerInfo()
  1490  	advancedTs := beforeTs.Less(ts)
  1491  	isLocked := false
  1492  	for i := range l.holder.holder {
  1493  		holder := &l.holder.holder[i]
  1494  		if holder.txn == nil {
  1495  			continue
  1496  		}
  1497  		// Note that mvccResolveWriteIntent() has special handling of the case
  1498  		// where the pusher is using an epoch lower than the epoch of the intent
  1499  		// (replicated lock), but is trying to push to a higher timestamp. The
  1500  		// replicated lock gets written with the newer epoch (not the epoch known
  1501  		// to the pusher) but a higher timestamp. Then the pusher will call into
  1502  		// this function with that lower epoch. Instead of trying to be consistent
  1503  		// with mvccResolveWriteIntent()'s view of the current state of the replicated
  1504  		// lock, we simply forget the replicated lock since it is no longer in the
  1505  		// way of this request. Eventually, once we have segregated locks, the
  1506  		// lock table will be the source of truth for replicated locks too, and
  1507  		// this forgetting behavior will go away.
  1508  		//
  1509  		// For unreplicated locks the lock table is the source of truth, so we
  1510  		// best-effort mirror the behavior of mvccResolveWriteIntent() by updating
  1511  		// the timestamp.
  1512  		if lock.Durability(i) == lock.Replicated || txn.Epoch > holder.txn.Epoch {
  1513  			holder.txn = nil
  1514  			holder.seqs = nil
  1515  			continue
  1516  		}
  1517  		// Unreplicated lock held in same epoch or a higher epoch.
  1518  		if advancedTs {
  1519  			// We may advance ts here but not update the holder.txn object below
  1520  			// for the reason stated in the comment about mvccResolveWriteIntent().
  1521  			// The lockHolderInfo.ts is the source of truth regarding the timestamp
  1522  			// of the lock, and not TxnMeta.WriteTimestamp.
  1523  			holder.ts = ts
  1524  		}
  1525  		if txn.Epoch == holder.txn.Epoch {
  1526  			holder.seqs = removeIgnored(holder.seqs, up.IgnoredSeqNums)
  1527  			if len(holder.seqs) == 0 {
  1528  				holder.txn = nil
  1529  				continue
  1530  			}
  1531  			if advancedTs {
  1532  				holder.txn = txn
  1533  			}
  1534  		}
  1535  		// Else txn.Epoch < holder.txn.Epoch, so only the timestamp has
  1536  		// potentially been updated.
  1537  		isLocked = true
  1538  	}
  1539  
  1540  	if !isLocked {
  1541  		l.holder.locked = false
  1542  		gc = l.lockIsFree()
  1543  		return gc, nil
  1544  	}
  1545  
  1546  	if advancedTs {
  1547  		l.increasedLockTs(ts)
  1548  	}
  1549  	// Else no change for waiters. This can happen due to a race between different
  1550  	// callers of UpdateLocks().
  1551  
  1552  	return false, nil
  1553  }
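
// A caller-side sketch of the two flavors of update handled above. This is
// illustrative only: exampleLockUpdates, its parameters and the literal
// ignored range are assumptions, not part of this package. A finalized status
// releases the lock entirely, while a non-finalized update can advance the
// lock's timestamp and prune sequence numbers rolled back via savepoints.
func exampleLockUpdates(txn enginepb.TxnMeta, key roachpb.Key) (finalized, pending roachpb.LockUpdate) {
	finalized = roachpb.LockUpdate{
		Span:   roachpb.Span{Key: key},
		Txn:    txn,
		Status: roachpb.COMMITTED,
	}
	pending = roachpb.LockUpdate{
		Span:   roachpb.Span{Key: key},
		Txn:    txn,
		Status: roachpb.PENDING,
		// Sequence numbers in [2, 3] were rolled back; removeIgnored will
		// drop them from the unreplicated holder's seqs.
		IgnoredSeqNums: []enginepb.IgnoredSeqNumRange{{Start: 2, End: 3}},
	}
	return finalized, pending
}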
  1554  
  1555  // The lock holder timestamp has increased. Some of the waiters may no longer
  1556  // need to wait.
  1557  // REQUIRES: l.mu is locked.
  1558  func (l *lockState) increasedLockTs(newTs hlc.Timestamp) {
  1559  	distinguishedRemoved := false
  1560  	for e := l.waitingReaders.Front(); e != nil; {
  1561  		g := e.Value.(*lockTableGuardImpl)
  1562  		curr := e
  1563  		e = e.Next()
  1564  		if g.readTS.Less(newTs) {
  1565  			// Stop waiting.
  1566  			l.waitingReaders.Remove(curr)
  1567  			if g == l.distinguishedWaiter {
  1568  				distinguishedRemoved = true
  1569  				l.distinguishedWaiter = nil
  1570  			}
  1571  			g.doneWaitingAtLock(false, l)
  1572  		}
  1573  		// Else don't inform an active waiter which continues to be an active waiter
  1574  		// despite the timestamp increase.
  1575  	}
  1576  	if distinguishedRemoved {
  1577  		l.tryMakeNewDistinguished()
  1578  	}
  1579  }
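
// The release condition above, stated in isolation: a waiting reader at
// readTS conflicts with a held lock at lockTS iff lockTS <= readTS, so once
// the lock's timestamp moves above the reader's timestamp the reader can stop
// waiting. The same comparison validates a discovered lock in
// discoveredLock(). Hypothetical helper for illustration only:
func readerConflictsWithLockTS(readTS, lockTS hlc.Timestamp) bool {
	// hlc.Timestamp.Less is a strict comparison, so !readTS.Less(lockTS)
	// means lockTS <= readTS.
	return !readTS.Less(lockTS)
}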
  1580  
  1581  // A request known to this lockState is done. The request could be a reserver,
  1582  // or a waiting reader or writer. Note that there is the possibility of a race
  1583  // and g may no longer be known to l, which we treat as a noop (this race is
  1584  // allowed since we order l.mu > g.mu). Returns whether the lockState can be
  1585  // garbage collected.
  1586  // Acquires l.mu.
  1587  func (l *lockState) requestDone(g *lockTableGuardImpl) (gc bool) {
  1588  	l.mu.Lock()
  1589  	defer l.mu.Unlock()
  1590  
  1591  	g.mu.Lock()
  1592  	if _, present := g.mu.locks[l]; !present {
  1593  		g.mu.Unlock()
  1594  		return false
  1595  	}
  1596  	delete(g.mu.locks, l)
  1597  	g.mu.Unlock()
  1598  
  1599  	if l.reservation == g {
  1600  		l.reservation = nil
  1601  		return l.lockIsFree()
  1602  	}
  1603  	// May be in queuedWriters or waitingReaders.
  1604  	distinguishedRemoved := false
  1605  	doneRemoval := false
  1606  	for e := l.queuedWriters.Front(); e != nil; e = e.Next() {
  1607  		qg := e.Value.(*queuedGuard)
  1608  		if qg.guard == g {
  1609  			l.queuedWriters.Remove(e)
  1610  			if qg.guard == l.distinguishedWaiter {
  1611  				distinguishedRemoved = true
  1612  				l.distinguishedWaiter = nil
  1613  			}
  1614  			doneRemoval = true
  1615  			break
  1616  		}
  1617  	}
  1618  	if !doneRemoval {
  1619  		for e := l.waitingReaders.Front(); e != nil; e = e.Next() {
  1620  			gg := e.Value.(*lockTableGuardImpl)
  1621  			if gg == g {
  1622  				l.waitingReaders.Remove(e)
  1623  				if g == l.distinguishedWaiter {
  1624  					distinguishedRemoved = true
  1625  					l.distinguishedWaiter = nil
  1626  				}
  1627  				doneRemoval = true
  1628  				break
  1629  			}
  1630  		}
  1631  	}
  1632  	if !doneRemoval {
  1633  		panic("lockTable bug")
  1634  	}
  1635  	if distinguishedRemoved {
  1636  		l.tryMakeNewDistinguished()
  1637  	}
  1638  	return false
  1639  }
  1640  
  1641  // The lock has transitioned from locked/reserved to unlocked. There could be
  1642  // waiters, but there cannot be a reservation.
  1643  // REQUIRES: l.mu is locked.
  1644  func (l *lockState) lockIsFree() (gc bool) {
  1645  	if l.reservation != nil {
  1646  		panic("lockTable bug")
  1647  	}
  1648  	// None of the waiting readers need to wait here anymore.
  1649  	for e := l.waitingReaders.Front(); e != nil; {
  1650  		g := e.Value.(*lockTableGuardImpl)
  1651  		curr := e
  1652  		e = e.Next()
  1653  		l.waitingReaders.Remove(curr)
  1654  		if g == l.distinguishedWaiter {
  1655  			l.distinguishedWaiter = nil
  1656  		}
  1657  		g.doneWaitingAtLock(false, l)
  1658  	}
  1659  
  1660  	// The prefix of the queue consisting of non-transactional writers is done
  1661  	// waiting.
  1662  	for e := l.queuedWriters.Front(); e != nil; {
  1663  		qg := e.Value.(*queuedGuard)
  1664  		g := qg.guard
  1665  		if g.txn == nil {
  1666  			curr := e
  1667  			e = e.Next()
  1668  			l.queuedWriters.Remove(curr)
  1669  			if g == l.distinguishedWaiter {
  1670  				l.distinguishedWaiter = nil
  1671  			}
  1672  			g.doneWaitingAtLock(false, l)
  1673  		} else {
  1674  			break
  1675  		}
  1676  	}
  1677  
  1678  	if l.queuedWriters.Len() == 0 {
  1679  		return true
  1680  	}
  1681  
  1682  	// First waiting writer (it must be transactional) gets the reservation.
  1683  	e := l.queuedWriters.Front()
  1684  	qg := e.Value.(*queuedGuard)
  1685  	g := qg.guard
  1686  	l.reservation = g
  1687  	l.queuedWriters.Remove(e)
  1688  	if qg.active {
  1689  		if g == l.distinguishedWaiter {
  1690  			l.distinguishedWaiter = nil
  1691  		}
  1692  		g.doneWaitingAtLock(true, l)
  1693  	}
  1694  	// Else inactive waiter and is waiting elsewhere.
  1695  
  1696  	// Tell the active waiters who they are waiting for.
  1697  	l.informActiveWaiters()
  1698  	return false
  1699  }
  1700  
  1701  func (t *treeMu) nextLockSeqNum() uint64 {
  1702  	t.lockIDSeqNum++
  1703  	return t.lockIDSeqNum
  1704  }
  1705  
  1706  // ScanAndEnqueue implements the lockTable interface.
  1707  func (t *lockTableImpl) ScanAndEnqueue(req Request, guard lockTableGuard) lockTableGuard {
  1708  	// NOTE: there is no need to synchronize with enabledMu here. ScanAndEnqueue
  1709  	// scans the lockTable and enters any conflicting lock wait-queues, but a
  1710  	// disabled lockTable will be empty. If the scan's btree snapshot races with
  1711  	// a concurrent call to clear/disable then it might enter some wait-queues,
  1712  	// but it will quickly be released from them.
  1713  
  1714  	var g *lockTableGuardImpl
  1715  	if guard == nil {
  1716  		g = newLockTableGuardImpl()
  1717  		g.seqNum = atomic.AddUint64(&t.seqNum, 1)
  1718  		g.txn = req.txnMeta()
  1719  		g.spans = req.LockSpans
  1720  		g.readTS = req.readConflictTimestamp()
  1721  		g.writeTS = req.writeConflictTimestamp()
  1722  		g.sa = spanset.NumSpanAccess - 1
  1723  		g.index = -1
  1724  	} else {
  1725  		g = guard.(*lockTableGuardImpl)
  1726  		g.key = nil
  1727  		g.sa = spanset.NumSpanAccess - 1
  1728  		g.ss = spanset.SpanScope(0)
  1729  		g.index = -1
  1730  		g.mu.Lock()
  1731  		g.mu.startWait = false
  1732  		g.mu.mustFindNextLockAfter = false
  1733  		g.mu.Unlock()
  1734  	}
  1735  	for ss := spanset.SpanScope(0); ss < spanset.NumSpanScope; ss++ {
  1736  		for sa := spanset.SpanAccess(0); sa < spanset.NumSpanAccess; sa++ {
  1737  			if len(g.spans.GetSpans(sa, ss)) > 0 {
  1738  				// Since the spans are constant for a request, every call to
  1739  				// ScanAndEnqueue for that request will execute the following code
  1740  				// for the same SpanScope(s). Any SpanScope for which this code does
  1741  				// not execute will always have an empty snapshot.
  1742  				t.locks[ss].mu.RLock()
  1743  				g.tableSnapshot[ss].Reset()
  1744  				g.tableSnapshot[ss] = t.locks[ss].Clone()
  1745  				t.locks[ss].mu.RUnlock()
  1746  				break
  1747  			}
  1748  		}
  1749  	}
  1750  	g.findNextLockAfter(true /* notify */)
  1751  	return g
  1752  }
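
// The snapshot pattern used by ScanAndEnqueue above, in isolation: hold the
// tree's read lock only long enough to clone a snapshot, then scan the
// snapshot without holding the lock. Illustrative sketch only, with a plain
// slice copy standing in for the copy-on-write btree Clone();
// snapshotUnderRLock is not part of this package.
func snapshotUnderRLock(mu *syncutil.RWMutex, src []*lockState) []*lockState {
	mu.RLock()
	snap := append([]*lockState(nil), src...)
	mu.RUnlock()
	// The caller iterates snap without holding mu; locks added to or removed
	// from src afterwards are not part of the snapshot, mirroring the btree
	// snapshot semantics.
	return snap
}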
  1753  
  1754  // Dequeue implements the lockTable interface.
  1755  func (t *lockTableImpl) Dequeue(guard lockTableGuard) {
  1756  	// NOTE: there is no need to synchronize with enabledMu here. Dequeue only
  1757  	// accesses state already held by the guard and does not add anything to the
  1758  	// lockTable.
  1759  
  1760  	g := guard.(*lockTableGuardImpl)
  1761  	defer releaseLockTableGuardImpl(g)
  1762  
  1763  	var candidateLocks []*lockState
  1764  	g.mu.Lock()
  1765  	for l := range g.mu.locks {
  1766  		candidateLocks = append(candidateLocks, l)
  1767  	}
  1768  	g.mu.Unlock()
  1769  	var locksToGC [spanset.NumSpanScope][]*lockState
  1770  	for _, l := range candidateLocks {
  1771  		if gc := l.requestDone(g); gc {
  1772  			locksToGC[l.ss] = append(locksToGC[l.ss], l)
  1773  		}
  1774  	}
  1775  
  1776  	for i := 0; i < len(locksToGC); i++ {
  1777  		if len(locksToGC[i]) > 0 {
  1778  			t.tryGCLocks(&t.locks[i], locksToGC[i])
  1779  		}
  1780  	}
  1781  }
  1782  
  1783  // AddDiscoveredLock implements the lockTable interface.
  1784  func (t *lockTableImpl) AddDiscoveredLock(
  1785  	intent *roachpb.Intent, guard lockTableGuard,
  1786  ) (added bool, _ error) {
  1787  	t.enabledMu.RLock()
  1788  	defer t.enabledMu.RUnlock()
  1789  	if !t.enabled {
  1790  		// If not enabled, don't track any locks.
  1791  		return false, nil
  1792  	}
  1793  	g := guard.(*lockTableGuardImpl)
  1794  	key := intent.Key
  1795  	sa, ss, err := findAccessInSpans(key, g.spans)
  1796  	if err != nil {
  1797  		return false, err
  1798  	}
  1799  	var l *lockState
  1800  	tree := &t.locks[ss]
  1801  	tree.mu.Lock()
  1802  	// Can't release tree.mu until we call l.discoveredLock() since someone may
  1803  	// find an empty lock and remove it from the tree.
  1804  	defer tree.mu.Unlock()
  1805  	iter := tree.MakeIter()
  1806  	iter.FirstOverlap(&lockState{key: key})
  1807  	if !iter.Valid() {
  1808  		l = &lockState{id: tree.nextLockSeqNum(), key: key, ss: ss}
  1809  		l.queuedWriters.Init()
  1810  		l.waitingReaders.Init()
  1811  		tree.Set(l)
  1812  		atomic.AddInt64(&tree.numLocks, 1)
  1813  	} else {
  1814  		l = iter.Cur()
  1815  	}
  1816  	return true, l.discoveredLock(&intent.Txn, intent.Txn.WriteTimestamp, g, sa)
  1817  }
  1818  
  1819  // AcquireLock implements the lockTable interface.
  1820  func (t *lockTableImpl) AcquireLock(
  1821  	txn *enginepb.TxnMeta, key roachpb.Key, strength lock.Strength, durability lock.Durability,
  1822  ) error {
  1823  	t.enabledMu.RLock()
  1824  	defer t.enabledMu.RUnlock()
  1825  	if !t.enabled {
  1826  		// If not enabled, don't track any locks.
  1827  		return nil
  1828  	}
  1829  	if strength != lock.Exclusive {
  1830  		return errors.Errorf("caller violated contract: lock strength not Exclusive")
  1831  	}
  1832  	ss := spanset.SpanGlobal
  1833  	if keys.IsLocal(key) {
  1834  		ss = spanset.SpanLocal
  1835  	}
  1836  	var l *lockState
  1837  	tree := &t.locks[ss]
  1838  	tree.mu.Lock()
  1839  	// Can't release tree.mu until we call l.acquireLock() since someone may find
  1840  	// an empty lock and remove it from the tree. If we expect that the lockState
  1841  	// will already be in the tree, we can optimize this by first trying with a
  1842  	// tree.mu.RLock().
  1843  	iter := tree.MakeIter()
  1844  	iter.FirstOverlap(&lockState{key: key})
  1845  	if !iter.Valid() {
  1846  		if durability == lock.Replicated {
  1847  			tree.mu.Unlock()
  1848  			// Don't remember uncontended replicated locks.
  1849  			return nil
  1850  		}
  1851  		l = &lockState{id: tree.nextLockSeqNum(), key: key, ss: ss}
  1853  		l.queuedWriters.Init()
  1854  		l.waitingReaders.Init()
  1855  		tree.Set(l)
  1856  		atomic.AddInt64(&tree.numLocks, 1)
  1857  	} else {
  1858  		l = iter.Cur()
  1859  	}
  1860  	err := l.acquireLock(strength, durability, txn, txn.WriteTimestamp)
  1861  	tree.mu.Unlock()
  1862  
  1863  	var totalLocks int64
  1864  	for i := 0; i < len(t.locks); i++ {
  1865  		totalLocks += atomic.LoadInt64(&t.locks[i].numLocks)
  1866  	}
  1867  	if totalLocks > t.maxLocks {
  1868  		t.tryClearLocks(false /* force */)
  1869  	}
  1870  	return err
  1871  }
  1872  
  1873  // If force is false, removes all locks, except for those that are held with
  1874  // replicated durability and have no distinguished waiter, and tells those
  1875  // waiters to wait elsewhere or that they are done waiting. A replicated lock
  1876  // which has been discovered by a request, but which no request is actively
  1877  // waiting on, will be preserved, since we need to tell that request who it is
  1878  // waiting for when it next calls ScanAndEnqueue(). If we aggressively removed
  1879  // even these locks, the next ScanAndEnqueue() would not find the lock, so the
  1880  // request would evaluate again, discover the same lock again, and, if
  1881  // tryClearLocks() kept getting called, would be stuck in this loop without ever pushing.
  1882  //
  1883  // If force is true, removes all locks and marks all guards as doneWaiting.
  1884  func (t *lockTableImpl) tryClearLocks(force bool) {
  1885  	for i := 0; i < int(spanset.NumSpanScope); i++ {
  1886  		tree := &t.locks[i]
  1887  		tree.mu.Lock()
  1888  		var locksToClear []*lockState
  1889  		iter := tree.MakeIter()
  1890  		for iter.First(); iter.Valid(); iter.Next() {
  1891  			l := iter.Cur()
  1892  			if l.tryClearLock(force) {
  1893  				locksToClear = append(locksToClear, l)
  1894  			}
  1895  		}
  1896  		atomic.AddInt64(&tree.numLocks, int64(-len(locksToClear)))
  1897  		if tree.Len() == len(locksToClear) {
  1898  			// Fast-path full clear.
  1899  			tree.Reset()
  1900  		} else {
  1901  			for _, l := range locksToClear {
  1902  				tree.Delete(l)
  1903  			}
  1904  		}
  1905  		tree.mu.Unlock()
  1906  	}
  1907  }
  1908  
  1909  // The given key must be contained in spans. Returns the strongest access
  1910  // specified in the spans for that key, along with the scope of the key.
  1911  func findAccessInSpans(
  1912  	key roachpb.Key, spans *spanset.SpanSet,
  1913  ) (spanset.SpanAccess, spanset.SpanScope, error) {
  1914  	ss := spanset.SpanGlobal
  1915  	if keys.IsLocal(key) {
  1916  		ss = spanset.SpanLocal
  1917  	}
  1918  	for sa := spanset.NumSpanAccess - 1; sa >= 0; sa-- {
  1919  		s := spans.GetSpans(sa, ss)
  1920  		// First span that starts after key
  1921  		i := sort.Search(len(s), func(i int) bool {
  1922  			return key.Compare(s[i].Key) < 0
  1923  		})
  1924  		if i > 0 &&
  1925  			((len(s[i-1].EndKey) > 0 && key.Compare(s[i-1].EndKey) < 0) || key.Equal(s[i-1].Key)) {
  1926  			return sa, ss, nil
  1927  		}
  1928  	}
  1929  	return 0, 0, errors.Errorf("caller violated contract: could not find access in spans")
  1930  }
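
// The containment check above, in isolation: sort.Search finds the first span
// that starts strictly after key, so the only candidate that can contain key
// is the span immediately before that position. Illustrative sketch over a
// sorted []roachpb.Span; spanContainsKey is not part of this package.
func spanContainsKey(spans []roachpb.Span, key roachpb.Key) bool {
	i := sort.Search(len(spans), func(i int) bool {
		return key.Compare(spans[i].Key) < 0
	})
	if i == 0 {
		return false
	}
	s := spans[i-1]
	// A point span (empty EndKey) contains only an equal key; a ranged span
	// contains keys in [Key, EndKey).
	return (len(s.EndKey) > 0 && key.Compare(s.EndKey) < 0) || key.Equal(s.Key)
}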
  1931  
  1932  // Tries to GC locks that were previously known to have become empty.
  1933  func (t *lockTableImpl) tryGCLocks(tree *treeMu, locks []*lockState) {
  1934  	tree.mu.Lock()
  1935  	defer tree.mu.Unlock()
  1936  	for _, l := range locks {
  1937  		iter := tree.MakeIter()
  1938  		iter.FirstOverlap(l)
  1939  		// Since the same lockState can go from non-empty to empty multiple
  1940  		// times, multiple threads may race to delete it: several can find it
  1941  		// empty and only one wins. If a concurrent thread made the lockState
  1942  		// non-empty again, we do not want to delete it accidentally.
  1943  		if !iter.Valid() {
  1944  			continue
  1945  		}
  1946  		l = iter.Cur()
  1947  		l.mu.Lock()
  1948  		empty := l.isEmptyLock()
  1949  		l.mu.Unlock()
  1950  		if empty {
  1951  			tree.Delete(l)
  1952  			atomic.AddInt64(&tree.numLocks, -1)
  1953  		}
  1954  	}
  1955  }
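
// The delete-if-still-empty pattern used above, in isolation: a candidate is
// re-checked under its own mutex immediately before removal, because another
// request may have re-populated it since it was last observed empty. This is
// an illustrative sketch only, with a map standing in for the btree and the
// caller assumed to hold the lock protecting m (as tryGCLocks holds tree.mu);
// gcIfStillEmpty is not part of this package.
func gcIfStillEmpty(m map[string]*lockState, key string) {
	l, ok := m[key]
	if !ok {
		return
	}
	l.mu.Lock()
	empty := l.isEmptyLock()
	l.mu.Unlock()
	if empty {
		delete(m, key)
	}
}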
  1956  
  1957  // UpdateLocks implements the lockTable interface.
  1958  func (t *lockTableImpl) UpdateLocks(up *roachpb.LockUpdate) error {
  1959  	// NOTE: there is no need to synchronize with enabledMu here. Update only
  1960  	// accesses locks already in the lockTable, but a disabled lockTable will be
  1961  	// empty. If the lock-table scan below races with a concurrent call to clear
  1962  	// then it might update a few locks, but they will quickly be cleared.
  1963  
  1964  	span := up.Span
  1965  	ss := spanset.SpanGlobal
  1966  	if keys.IsLocal(span.Key) {
  1967  		ss = spanset.SpanLocal
  1968  	}
  1969  	tree := &t.locks[ss]
  1970  	var err error
  1971  	var locksToGC []*lockState
  1972  	changeFunc := func(l *lockState) {
  1973  		gc, err2 := l.tryUpdateLock(up)
  1974  		if err2 != nil {
  1975  			err = err2
  1976  			return
  1977  		}
  1978  		if gc {
  1979  			locksToGC = append(locksToGC, l)
  1980  		}
  1981  	}
  1982  	tree.mu.RLock()
  1983  	iter := tree.MakeIter()
  1984  	ltRange := &lockState{key: span.Key, endKey: span.EndKey}
  1985  	for iter.FirstOverlap(ltRange); iter.Valid(); iter.NextOverlap(ltRange) {
  1986  		changeFunc(iter.Cur())
  1987  		// Optimization to avoid a second key comparison (not for correctness).
  1988  		if len(span.EndKey) == 0 {
  1989  			break
  1990  		}
  1991  	}
  1992  	tree.mu.RUnlock()
  1993  
  1994  	if len(locksToGC) > 0 {
  1995  		t.tryGCLocks(tree, locksToGC)
  1996  	}
  1997  	return err
  1998  }
  1999  
  2000  // Iteration helper for findNextLockAfter. Returns the next span to search
  2001  // over, or nil if the iteration is done.
  2002  // REQUIRES: g.mu is locked.
  2003  func stepToNextSpan(g *lockTableGuardImpl) *spanset.Span {
  2004  	g.index++
  2005  	for ; g.ss < spanset.NumSpanScope; g.ss++ {
  2006  		for ; g.sa >= 0; g.sa-- {
  2007  			spans := g.spans.GetSpans(g.sa, g.ss)
  2008  			if g.index < len(spans) {
  2009  				span := &spans[g.index]
  2010  				g.key = span.Key
  2011  				return span
  2012  			}
  2013  			g.index = 0
  2014  		}
  2015  		g.sa = spanset.NumSpanAccess - 1
  2016  	}
  2017  	return nil
  2018  }
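
// The traversal order that stepToNextSpan implements, written as one nested
// loop for clarity: scopes in increasing order, accesses within a scope from
// the strongest (NumSpanAccess-1) down to SpanReadOnly, and spans within an
// (access, scope) bucket in slice order. Illustrative sketch only;
// visitSpansInScanOrder is not part of this package.
func visitSpansInScanOrder(spans *spanset.SpanSet, visit func(span *spanset.Span)) {
	for ss := spanset.SpanScope(0); ss < spanset.NumSpanScope; ss++ {
		for sa := spanset.NumSpanAccess - 1; sa >= 0; sa-- {
			bucket := spans.GetSpans(sa, ss)
			for i := range bucket {
				visit(&bucket[i])
			}
		}
	}
}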
  2019  
  2020  // Enable implements the lockTable interface.
  2021  func (t *lockTableImpl) Enable() {
  2022  	// Avoid disrupting other requests if the lockTable is already enabled.
  2023  	// NOTE: This may be a premature optimization, but it can't hurt.
  2024  	t.enabledMu.RLock()
  2025  	enabled := t.enabled
  2026  	t.enabledMu.RUnlock()
  2027  	if enabled {
  2028  		return
  2029  	}
  2030  	t.enabledMu.Lock()
  2031  	t.enabled = true
  2032  	t.enabledMu.Unlock()
  2033  }
  2034  
  2035  // Clear implements the lockTable interface.
  2036  func (t *lockTableImpl) Clear(disable bool) {
  2037  	// If disabling, lock the entire table to prevent concurrent accesses
  2038  	// from adding state to the table as we clear it. If not, there's no
  2039  	// need to synchronize with enabledMu because we're only removing state.
  2040  	if disable {
  2041  		t.enabledMu.Lock()
  2042  		defer t.enabledMu.Unlock()
  2043  		t.enabled = false
  2044  	}
  2045  	t.tryClearLocks(true /* force */)
  2046  }
  2047  
  2048  // For tests.
  2049  func (t *lockTableImpl) String() string {
  2050  	var buf strings.Builder
  2051  	for i := 0; i < len(t.locks); i++ {
  2052  		tree := &t.locks[i]
  2053  		scope := spanset.SpanScope(i).String()
  2054  		tree.mu.RLock()
  2055  		fmt.Fprintf(&buf, "%s: num=%d\n", scope, atomic.LoadInt64(&tree.numLocks))
  2056  		iter := tree.MakeIter()
  2057  		for iter.First(); iter.Valid(); iter.Next() {
  2058  			l := iter.Cur()
  2059  			l.mu.Lock()
  2060  			l.Format(&buf)
  2061  			l.mu.Unlock()
  2062  		}
  2063  		tree.mu.RUnlock()
  2064  	}
  2065  	return buf.String()
  2066  }