github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/concurrency/lock_table.go

// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package concurrency

import (
	"container/list"
	"fmt"
	"sort"
	"strings"
	"sync"
	"sync/atomic"

	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
)

// Default upper bound on the number of locks in a lockTable.
const defaultLockTableSize = 10000

// The kind of waiting that the request is subject to.
type waitKind int

const (
	_ waitKind = iota

	// waitFor indicates that the request is waiting on another transaction to
	// release its locks or complete its own request. waitingStates with this
	// waitKind will provide information on who the request is waiting on. The
	// request will likely want to eventually push the conflicting transaction.
	waitFor

	// waitForDistinguished is a sub-case of waitFor. It implies everything that
	// waitFor does and additionally indicates that the request is currently the
	// "distinguished waiter". A distinguished waiter is responsible for taking
	// extra actions, e.g. immediately pushing the transaction it is waiting
	// for. If there are multiple requests in the waitFor state waiting on the
	// same transaction, at least one will be a distinguished waiter.
	waitForDistinguished

	// waitElsewhere is used when the lockTable is under memory pressure and is
	// clearing its internal queue state. Like the waitFor* states, it informs
	// the request who it is waiting for so that deadlock detection works.
	// However, sequencing information inside the lockTable is mostly discarded.
	waitElsewhere

	// waitSelf indicates that a different request from the same transaction
	// has a conflicting reservation. See the comment about "Reservations" in
	// lockState. This request should sit tight and wait for a new notification
	// without pushing anyone.
	waitSelf

	// doneWaiting indicates that the request is done waiting on this pass
	// through the lockTable and should make another call to ScanAndEnqueue.
	doneWaiting
)

// The current waiting state of the request.
//
// See the detailed comment about "Waiting logic" on lockTableGuardImpl.
type waitingState struct {
	kind waitKind

	// Fields below are populated for waitFor* and waitElsewhere kinds.

	// Represents who the request is waiting for. The conflicting
	// transaction may be a lock holder of a conflicting lock or a
	// conflicting request being sequenced through the same lockTable.
	txn  *enginepb.TxnMeta // always non-nil
	key  roachpb.Key       // the key of the conflict
	held bool              // is the conflict a held lock?

	// Represents the action that the request was trying to perform when
	// it hit the conflict. E.g. was it trying to read or write?
	guardAccess spanset.SpanAccess
}
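
// exampleWaitLoop is an illustrative sketch added for this write-up; it is not
// part of the original file. It shows one way a caller might consume these
// waiting states through the lockTableGuard methods defined later in this file
// (ShouldWait, NewStateChan, CurState). The push and latch handling is elided
// and hypothetical.
func exampleWaitLoop(g lockTableGuard) {
	if !g.ShouldWait() {
		return // no conflicts found; the request can proceed to evaluation
	}
	// The caller would release its latches here before waiting.
	for range g.NewStateChan() {
		switch state := g.CurState(); state.kind {
		case waitFor, waitForDistinguished, waitElsewhere:
			// Wait on (and possibly push) state.txn for deadlock detection.
		case waitSelf:
			// Another request from the same txn holds the reservation; wait.
		case doneWaiting:
			// Re-acquire latches and call ScanAndEnqueue again.
			return
		}
	}
}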

// Implementation
// TODO(sbhola):
// - metrics about lockTable state to export to observability debug pages:
//   number of locks, number of waiting requests, wait time?, ...
// - test cases where guard.readTS != guard.writeTS.

// The btree for a particular SpanScope.
type treeMu struct {
	mu syncutil.RWMutex // Protects everything in this struct.

	// For assigning sequence numbers to the lockState objects as required by
	// the util/interval/generic type contract.
	lockIDSeqNum uint64

	// Container for lockState structs. Locks that are not held or reserved and
	// have no waiting requests are garbage collected. Additionally, locks that
	// are only held with Replicated durability and have no waiting requests may
	// also be garbage collected since their state can be recovered from
	// persistent storage.
	btree

	// For constraining memory consumption. We need better memory accounting
	// than this.
	numLocks int64
}

// lockTableImpl is an implementation of lockTable.
//
// Concurrency: in addition to holding latches, we require that, for a
// particular request, ScanAndEnqueue() and CurState() are called by the same
// thread.
//
// Mutex ordering: lockTableImpl.enabledMu
//                 > treeMu.mu
//                 > lockState.mu
//                 > lockTableGuardImpl.mu
type lockTableImpl struct {
	// Is the lockTable enabled? When enabled, the lockTable tracks locks and
	// allows requests to queue in wait-queues on these locks. When disabled,
	// no locks or wait-queues are maintained.
	//
	// enabledMu is held in read-mode when determining whether the lockTable
	// is enabled and when acting on that information (e.g. adding new locks).
	// It is held in write-mode when enabling or disabling the lockTable.
	enabled   bool
	enabledMu syncutil.RWMutex

	// A sequence number is assigned to each request seen by the lockTable. This
	// is to preserve fairness despite the design choice of allowing
	// out-of-order evaluation of requests with overlapping spans where the
	// latter request does not encounter contention. This out-of-order
	// evaluation happens because requests do not reserve spans that are
	// uncontended while they wait on contended locks after releasing their
	// latches. Consider the following examples:
	//
	// Example 1:
	// - req1 wants to write to A, B
	// - req2 wants to write to B
	// - lock at A is held by some other txn.
	// - Even though req2 arrives later, req1 will wait only in the queue for A
	//   and allow req2 to proceed to evaluation.
	//
	// Example 2:
	// - Same as example 1 but lock at A is held by txn3 and lock at B is held
	//   by txn4.
	// - Lock at A is released so req1 acquires the reservation at A and starts
	//   waiting at B.
	// - It is unfair for req1 to wait behind req2 at B. The sequence number
	//   assigned to req1 and req2 will restore the fairness by making req1
	//   wait before req2.
	//
	// Example 3: Deadlock in lock table if it did not use sequence numbers.
	// - Lock at B is acquired by txn0.
	// - req1 (from txn1) arrives at lockTable and wants to write to A and B.
	//   It queues at B.
	// - req2 (from txn2) arrives at lockTable and only wants to write A.
	//   It proceeds to evaluation and acquires the lock at A for txn2 and then
	//   the request is done. The lock is still held.
	// - req3 (from txn3) wants to write to A and B. It queues at A.
	// - txn2 releases A. req3 is in the front of the queue at A and gets the
	//   reservation and starts waiting at B behind req1.
	// - txn0 releases B. req1 gets the reservation at B and does another scan
	//   and adds itself to the queue at A, behind req3 which holds the
	//   reservation at A.
	// Now in the queues for A and B req1 is behind req3 and vice versa and
	// this deadlock has been created entirely due to the lock table's behavior.
	seqNum uint64

	locks [spanset.NumSpanScope]treeMu

	maxLocks int64
}

var _ lockTable = &lockTableImpl{}

// lockTableGuardImpl is an implementation of lockTableGuard.
//
// The struct is a guard that is returned to the request the first time it calls
// lockTable.ScanAndEnqueue() and used in later calls to ScanAndEnqueue() and
// done(). After a call to ScanAndEnqueue() (which is made while holding
// latches), the caller must first call lockTableGuard.StartWaiting() and if it
// returns true release the latches and continue interacting with the
// lockTableGuard. If StartWaiting() returns false, the request can proceed to
// evaluation.
//
// Waiting logic: The interface hides the queues that the request is waiting on,
// and the request's position in the queue. One of the reasons for this hiding
// is that queues are not FIFO since a request that did not wait on a queue for
// key k in a preceding call to ScanAndEnqueue() (because k was not locked and
// there was no queue) may need to wait on the queue in a later call to
// ScanAndEnqueue(). So sequencing of requests arriving at the lockTable is
// partially decided by a sequence number assigned to a request when it first
// called ScanAndEnqueue() and queues are ordered by this sequence number.
// However, the sequencing is not fully described by the sequence numbers -- a
// request R1 encountering contention over some keys in its span does not
// prevent a request R2 that has a higher sequence number and an overlapping
// span from proceeding if R2 does not encounter contention. This concurrency
// (which is not completely fair) is deemed desirable.
//
// The interface exposes an abstracted version of the waiting logic in a way
// that the request that starts waiting is considered waiting for at most one
// other request or transaction. This is exposed as a series of state
// transitions where the transitions are notified via newState() and the current
// state can be read using CurState().
//
// - The waitFor* states provide information on who the request is waiting for.
//   The waitForDistinguished state is a sub-case -- a distinguished waiter is
//   responsible for taking extra actions e.g. immediately pushing the transaction
//   it is waiting for. The implementation ensures that if there are multiple
//   requests in waitFor state waiting on the same transaction at least one will
//   be a distinguished waiter.
//
// TODO(sbhola): investigate removing the waitForDistinguished state which
// will simplify the code here. All waitFor requests would wait (currently
// 50ms) before pushing the transaction (for deadlock detection) they are
// waiting on, say T.
// Typically T will be done before 50ms, which is considered
// ok: the one exception we will need to make is if T has the min priority or
// the waiting transaction has max priority -- in both cases it will push
// immediately. The bad case is if T is ABORTED: the push will succeed after,
// and if T left N intents, each push would wait for 50ms, incurring a latency
// of 50*N ms. A cache of recently encountered ABORTED transactions on each
// Store should mitigate this latency increase. Whenever a transaction sees a
// waitFor state, it will consult this cache and if T is found, push
// immediately (if there isn't already a push in-flight) -- even if T is not
// initially in the cache, the first push will place it in the cache, so the
// maximum latency increase is 50ms.
//
// - The waitElsewhere state is a rare state that is used when the lockTable is
//   under memory pressure and is clearing its internal queue state. Like the
//   waitFor* states, it informs the request who it is waiting for so that
//   deadlock detection works. However, sequencing information inside the
//   lockTable is mostly discarded.
//
// - The waitSelf state is a rare state when a different request from the same
//   transaction has a reservation. See the comment about "Reservations" in
//   lockState.
//
// - The doneWaiting state is used to indicate that the request should make
//   another call to ScanAndEnqueue() (that next call is more likely to return a
//   lockTableGuard that returns false from StartWaiting()).
type lockTableGuardImpl struct {
	seqNum uint64

	// Information about this request.
	txn     *enginepb.TxnMeta
	spans   *spanset.SpanSet
	readTS  hlc.Timestamp
	writeTS hlc.Timestamp

	// Snapshots of the trees for which this request has some spans. Note that
	// the lockStates in these snapshots may have been removed from
	// lockTableImpl. Additionally, it is possible that there is a new lockState
	// for the same key. This can result in various harmless anomalies:
	// - the request may hold a reservation on a lockState that is no longer
	//   in the tree. When it next does a scan, it will either find a new
	//   lockState where it will compete or none. Both lockStates can be in
	//   the mu.locks map, which is harmless.
	// - the request may wait behind a reservation holder that is not the
	//   lock holder. This could cause a delay in pushing the lock holder.
	//   This is not a correctness issue (the whole system is not deadlocked)
	//   and we expect will not be a real performance issue.
	//
	// TODO(sbhola): experimentally evaluate the lazy queueing of the current
	// implementation, in comparison with eager queueing. If eager queueing
	// is comparable in system throughput, one can eliminate the above anomalies.
	//
	tableSnapshot [spanset.NumSpanScope]btree

	// A request whose startWait is set to true in ScanAndEnqueue is actively
	// waiting at a particular key. This is the first key encountered when
	// iterating through spans that it needs to wait at. A future event (lock
	// release etc.) may cause the request to no longer need to wait at this
	// key.
	// It then needs to continue iterating through spans to find the next
	// key to wait at (we don't want to wastefully start at the beginning since
	// this request probably has a reservation at the contended keys there): sa,
	// ss, index, key collectively track the current position to allow it to
	// continue iterating.

	// The key for the lockState.
	key roachpb.Key
	// The key for the lockState is contained in the Span specified by
	// spans[sa][ss][index].
	ss    spanset.SpanScope
	sa    spanset.SpanAccess // Iterates from stronger to weaker strength
	index int

	mu struct {
		syncutil.Mutex
		startWait bool

		state  waitingState
		signal chan struct{}

		// locks for which this request has a reservation or is in the queue of
		// writers (active or inactive) or actively waiting as a reader.
		//
		// TODO(sbhola): investigate whether the logic to maintain this locks map
		// can be simplified so it doesn't need to be adjusted by various
		// lockState methods. It adds additional bookkeeping burden that means it
		// is more prone to inconsistencies. There are two main uses: (a) removing
		// from various lockStates when done() is called, (b) tryActiveWait() uses
		// it as an optimization to know that this request is not known to the
		// lockState. (b) can be handled by other means -- the first scan the
		// request won't be in the lockState and the second scan it likely will.
		// (a) doesn't necessarily require this map to be consistent -- the
		// request could track the places where it has enqueued as places where
		// it could be present and then do the search.

		locks map[*lockState]struct{}

		// If this is true, the state has changed and the channel has been
		// signaled, but what the state should be has not been computed. The call
		// to CurState() needs to compute that current state. Deferring the
		// computation makes the waiters do this work themselves instead of making
		// the call to release/update locks or release reservations do this work
		// (proportional to number of waiters).
		mustFindNextLockAfter bool
	}
}

var _ lockTableGuard = &lockTableGuardImpl{}

// Used to avoid allocations.
var lockTableGuardImplPool = sync.Pool{
	New: func() interface{} {
		g := new(lockTableGuardImpl)
		g.mu.signal = make(chan struct{}, 1)
		g.mu.locks = make(map[*lockState]struct{})
		return g
	},
}

// newLockTableGuardImpl returns a new lockTableGuardImpl. The struct will
// contain pre-allocated mu.signal and mu.locks fields, so it shouldn't be
// overwritten blindly.
func newLockTableGuardImpl() *lockTableGuardImpl {
	return lockTableGuardImplPool.Get().(*lockTableGuardImpl)
}

// releaseLockTableGuardImpl releases the guard back into the object pool.
func releaseLockTableGuardImpl(g *lockTableGuardImpl) {
	// Preserve the signal channel and locks map fields in the pooled
	// object. Drain the signal channel and assert that the map is empty.
	// The map should have been cleared by lockState.requestDone.
	signal, locks := g.mu.signal, g.mu.locks
	select {
	case <-signal:
	default:
	}
	if len(locks) != 0 {
		panic("lockTableGuardImpl.mu.locks not empty after Dequeue")
	}

	*g = lockTableGuardImpl{}
	g.mu.signal = signal
	g.mu.locks = locks
	lockTableGuardImplPool.Put(g)
}

func (g *lockTableGuardImpl) ShouldWait() bool {
	g.mu.Lock()
	defer g.mu.Unlock()
	return g.mu.startWait
}

func (g *lockTableGuardImpl) NewStateChan() chan struct{} {
	g.mu.Lock()
	defer g.mu.Unlock()
	return g.mu.signal
}

func (g *lockTableGuardImpl) CurState() waitingState {
	g.mu.Lock()
	defer g.mu.Unlock()
	if !g.mu.mustFindNextLockAfter {
		return g.mu.state
	}
	// Not actively waiting anywhere so no one else can set
	// mustFindNextLockAfter to true while this method executes.
	g.mu.mustFindNextLockAfter = false
	g.mu.Unlock()
	g.findNextLockAfter(false /* notify */)
	g.mu.Lock() // Unlock deferred
	return g.mu.state
}

func (g *lockTableGuardImpl) notify() {
	select {
	case g.mu.signal <- struct{}{}:
	default:
	}
}

// Called when the request is no longer actively waiting at lock l, and should
// look for the next lock to wait at. hasReservation is true iff the request
// acquired the reservation at l. Note that it will be false for requests that
// were doing a read at the key, or non-transactional writes at the key.
func (g *lockTableGuardImpl) doneWaitingAtLock(hasReservation bool, l *lockState) {
	g.mu.Lock()
	if !hasReservation {
		delete(g.mu.locks, l)
	}
	g.mu.mustFindNextLockAfter = true
	g.notify()
	g.mu.Unlock()
}

func (g *lockTableGuardImpl) isSameTxn(txn *enginepb.TxnMeta) bool {
	return g.txn != nil && g.txn.ID == txn.ID
}

func (g *lockTableGuardImpl) isSameTxnAsReservation(ws waitingState) bool {
	return !ws.held && g.isSameTxn(ws.txn)
}

// Finds the next lock, after the current one, to actively wait at. If it
// finds the next lock the request starts actively waiting there, else it is
// told that it is done waiting.
// Acquires g.mu.
func (g *lockTableGuardImpl) findNextLockAfter(notify bool) {
	spans := g.spans.GetSpans(g.sa, g.ss)
	var span *spanset.Span
	resumingInSameSpan := false
	if g.index == -1 || len(spans[g.index].EndKey) == 0 {
		span = stepToNextSpan(g)
	} else {
		span = &spans[g.index]
		resumingInSameSpan = true
	}
	for span != nil {
		startKey := span.Key
		if resumingInSameSpan {
			startKey = g.key
		}
		tree := g.tableSnapshot[g.ss]
		iter := tree.MakeIter()

		// From here on, the use of resumingInSameSpan is just a performance
		// optimization to deal with the interface limitation of btree that
		// prevents us from specifying an exclusive start key. We need to check
		// that the lock is not the same as our exclusive start key and only need
		// to do that check once -- for the first lock.
		ltRange := &lockState{key: startKey, endKey: span.EndKey}
		for iter.FirstOverlap(ltRange); iter.Valid(); iter.NextOverlap(ltRange) {
			l := iter.Cur()
			if resumingInSameSpan {
				resumingInSameSpan = false
				if l.key.Equal(startKey) {
					// This lock is where it stopped waiting.
					continue
				}
				// Else, past the lock where it stopped waiting. We may not
				// encounter that lock since it may have been garbage collected.
			}
			if l.tryActiveWait(g, g.sa, notify) {
				return
			}
		}
		resumingInSameSpan = false
		span = stepToNextSpan(g)
	}
	g.mu.Lock()
	defer g.mu.Unlock()
	g.mu.state = waitingState{kind: doneWaiting}
	if notify {
		g.notify()
	}
}

// Waiting writers in a lockState are wrapped in a queuedGuard. A waiting
// writer is typically waiting in an active state, i.e., the
// lockTableGuardImpl.key refers to this lockState. However, breaking of
// reservations (see the comment on reservations below, in lockState) can
// cause a writer to be an inactive waiter.
type queuedGuard struct {
	guard  *lockTableGuardImpl
	active bool // protected by lockState.mu
}

// Information about a lock holder.
type lockHolderInfo struct {
	// nil if there is no holder. Else this is the TxnMeta of the latest call to
	// acquire/update the lock by this transaction. For a given transaction if
	// the lock is continuously held by a succession of different TxnMetas, the
	// epoch must be monotonic and the ts (derived from txn.WriteTimestamp for
	// some calls, and request.ts for other calls) must be monotonic. After ts
	// is initialized, the timestamps inside txn are not used.
	txn *enginepb.TxnMeta

	// All the TxnSeqs in the current epoch at which this lock has been
	// acquired. In increasing order. We track these so that if a lock is
	// acquired at both seq 5 and seq 7, rollback of 7 does not cause the lock
	// to be released. This is also consistent with PostgreSQL semantics
	// https://www.postgresql.org/docs/12/sql-select.html#SQL-FOR-UPDATE-SHARE
	seqs []enginepb.TxnSeq

	// The timestamp at which the lock is held.
	ts hlc.Timestamp
}

// Per lock state in lockTableImpl.
//
// NOTE: we can't easily pool lockState objects without some form of reference
// counting because they are used as elements in a copy-on-write btree and may
// still be referenced by clones of the tree even when deleted from the primary.
// However, other objects referenced by lockState can be pooled as long as they
// are removed from all lockStates that reference them first.
type lockState struct {
	id     uint64 // needed for implementing util/interval/generic type contract
	endKey []byte // used in btree iteration and tests

	// The key being locked and the scope of that key. This state is never
	// mutated.
	key roachpb.Key
	ss  spanset.SpanScope

	mu syncutil.Mutex // Protects everything below.

	// Invariant summary (see detailed comments below):
	// - both holder.locked and waitQ.reservation != nil cannot be true.
	// - if holder.locked and multiple holderInfos have txn != nil: all the
	//   txns must have the same txn.ID.
	// - !holder.locked => waitingReaders.Len() == 0. That is, readers wait
	//   only if the lock is held. They do not wait for a reservation.
	// - If reservation != nil, that request is not in queuedWriters.

	// Information about whether the lock is held and the holder. We track
	// information for each durability level separately since a transaction can
	// go through multiple epochs and TxnSeq and may acquire the same lock in
	// replicated and unreplicated mode at different stages.
	holder struct {
		locked bool
		// LockStrength is always Exclusive
		holder [lock.MaxDurability + 1]lockHolderInfo
	}

	// Information about the requests waiting on the lock.
	lockWaitQueue
}

type lockWaitQueue struct {
	// Reservations:
	//
	// A not-held lock can be "reserved". A reservation is just a claim that
	// prevents multiple requests from racing when the lock is released. A
	// reservation by req2 can be broken by req1 if req1 has a smaller seqNum
	// than req2. Only requests that specify SpanReadWrite for a key can make
	// reservations. This means a reservation can only be made when the lock is
	// not held, since the reservation (which can acquire an Exclusive lock) and
	// the lock holder (which is an Exclusive lock) conflict.
	//
	// Read reservations are not permitted due to the complexities discussed in
	// the review for #43740. Additionally, reads do not queue for their turn at
	// all -- they are held in the waitingReaders list while the lock is held
	// and removed when the lock is released, so they race with
	// reservations. Let us consider scenarios where reads did wait in the same
	// queue: the lock could be held or reserved by a write at ts=20, followed
	// by a waiting writer at ts=18, writer at ts=10, reader at ts=12. That
	// reader is waiting not because of a conflict with the holder, or reserver,
	// or the first waiter, but because there is a waiter ahead of it which it
	// conflicts with. This introduces more complexity in tracking who this
	// reader should push. Also consider a scenario where a reader did not wait
	// in the queue and waited on the side like in waitingReaders but acquired a
	// read reservation (together with other readers) when the lock was
	// released. Ignoring the unfairness of this, we can construct a deadlock
	// scenario with request req1 with seqnum 1 and req2 with seqnum 2 where
	// req1 and req2 both want to write at one key and so get ordered by their
	// seqnums but at another key req2 wants to read and req1 wants to write and
	// since req2 does not wait in the queue it acquires a read reservation
	// before req1. See the discussion at the end of this comment section on how
	// the behavior will extend when we start supporting Shared and Upgrade
	// locks.
	//
	// Non-transactional requests can do both reads and writes but cannot be
	// depended on since they don't have a transaction that can be pushed.
	// Therefore they not only do not acquire locks, but cannot make reservations.
	// The non-reservation for reads is already covered in the previous
	// paragraph. For non-transactional writes, the request waits in the queue
	// with other writers. The difference occurs:
	// - when it gets to the front of the queue and there is no lock holder
	//   or reservation: instead of acquiring the reservation it removes
	//   itself from the lockState and proceeds to the next lock. If it
	//   does not need to wait for any more locks and manages to acquire
	//   latches before those locks are acquired by some other request, it
	//   will evaluate.
	// - when deciding to wait at a lock: if the lock has a reservation with
	//   a sequence num higher than this non-transactional request it will
	//   ignore that reservation. Note that ignoring such reservations is
	//   safe since when this non-transactional request is holding latches
	//   those reservation holders cannot be holding latches, so they cannot
	//   conflict.
	//
	// Multiple requests from the same transaction wait independently, including
	// the situation where one of the requests has a reservation and the other
	// is waiting (currently this can only happen if both requests are doing
	// SpanReadWrite). Making multiple requests from the same transaction
	// jointly hold the reservation introduces code complexity since joint
	// reservations can be partially broken (see deadlock example below), and is
	// not necessarily fair to other requests. Additionally, if req1 from txn1
	// is holding a reservation and req2 from txn1 is waiting, they must
	// conflict wrt latches and cannot evaluate concurrently so there isn't a
	// benefit to joint reservations. However, if one of the requests acquires
	// the lock the other request no longer needs to wait on this lock. This
	// situation motivates the waitSelf state.
	//
	// Deadlock example if joint reservations were supported and we did not
	// allow partial breaking of such reservations:
	//
	// - Keys are A, B, C, D.
	// - Key D is locked by some random txn.
	// - req1 from txn1 writes A, B, D. It waits at D.
	// - Some other request from some random txn that writes C arrives,
	//   evaluates, and locks C.
	// - req2 from txn2 that writes A, C. It waits at C.
	// - Some other request from some random txn that writes A arrives,
	//   evaluates, and locks A.
	// - req3 from txn1 that writes A, C. It waits at A. Note that req1 and req3
	//   are from the same txn.
	// - A is unlocked. req3 reserves A and waits at C behind req2.
	// - B is locked by some random txn.
	// - D is unlocked. req1 reserves D and proceeds to scan again and finds A
	//   is reserved by req3 which is the same txn so becomes a joint
	//   reservation holder at A.
	// - Since B is locked, req1 waits at B.
	// - C is unlocked. req2 reserves C. It scans and finds req1+req3 holding
	//   the joint reservation at A. If it queues behind this joint reservation
	//   we have the following situation:
	//        reservation    waiter
	//   A    req1+req3      req2
	//   C    req2           req3
	//   This is a deadlock caused by the lock table unless req2 partially
	//   breaks the reservation at A.
	//
	// Extension for Shared and Upgrade locks:
	// There are 3 aspects to consider: holders; reservers; the dependencies
	// that need to be captured when waiting.
	//
	// - Holders: only shared locks are compatible with themselves, so there can
	//   be one of (a) no holder, (b) multiple shared lock holders, (c) one
	//   exclusive holder, (d) one upgrade holder. Non-locking reads will
	//   wait in waitingReaders for only an incompatible exclusive holder.
	//
	// - Reservers: This follows the same pattern as holders. Non-locking reads
	//   do not wait on reservers.
	//
	// - Queueing and dependencies: All potential lockers and non-transactional
	//   writers will wait in the same queue. A sequence of consecutive requests
	//   that have the potential to acquire a shared lock will jointly reserve
	//   that shared lock. Such requests cannot jump ahead of requests with a
	//   lower seqnum just because there is currently a shared lock reservation
	//   (this can cause lockTable induced deadlocks). Such joint reservations
	//   can be partially broken by a waiter desiring an exclusive or upgrade
	//   lock.
	//   Like the current code, non-transactional writes will wait for
	//   reservations that have a lower sequence num, but not make their own
	//   reservation. Additionally, they can partially break joint reservations.
	//
	//   Reservations that are (partially or fully) broken cause requests to
	//   reenter the queue as inactive waiters. This is no different than the
	//   current behavior. Each request can specify the same key in spans for
	//   ReadOnly, ReadShared, ReadUpgrade, ReadWrite. The spans will be
	//   iterated over in decreasing order of strength, to only wait at a lock
	//   at the highest strength (this is similar to the current behavior using
	//   accessDecreasingStrength).
	//
	//   For dependencies, a waiter desiring an exclusive or upgrade lock always
	//   conflicts with the holder(s) or reserver(s) so that is the dependency
	//   that will be captured. A waiter desiring a shared lock may encounter a
	//   situation where it does not conflict with the holder(s) or reserver(s)
	//   since those are also shared lockers. In that case it will depend on the
	//   first waiter since that waiter must be desiring a lock that is
	//   incompatible with a shared lock.

	reservation *lockTableGuardImpl

	// TODO(sbhola): There are a number of places where we iterate over these
	// lists looking for something, as described below. If some of these turn
	// out to be inefficient, consider better data-structures. One idea is that
	// for cases that find a particular guard the lockTableGuardImpl.locks can be
	// a map instead of a set to point directly to the *list.Element.
	//
	// queuedWriters:
	// - to find all active queuedWriters.
	// - to find the first active writer to make it distinguished.
	// - to find a particular guard.
	// - to find the position, based on seqNum, for inserting a particular guard.
	// - to find all waiting writers with a particular txn ID.
	//
	// waitingReaders:
	// - readers with a higher timestamp than some timestamp.
	// - to find a particular guard.

	// Waiters: An active waiter needs to be notified about changes in who it is
	// waiting for.

	// List of *queuedGuard. A subset of these are actively waiting. If
	// non-empty, either the lock is held or there is a reservation.
	queuedWriters list.List

	// List of *lockTableGuardImpl. All of these are actively waiting. If
	// non-empty, the lock must be held. By definition these cannot be in
	// waitSelf state since that state is only used when there is a reservation.
	waitingReaders list.List

	// If there is a non-empty set of active waiters that are not waitSelf, then
	// at least one must be distinguished.
	distinguishedWaiter *lockTableGuardImpl
}

//go:generate ../../../util/interval/generic/gen.sh *lockState concurrency

// Methods required by util/interval/generic type contract.
func (l *lockState) ID() uint64         { return l.id }
func (l *lockState) Key() []byte        { return l.key }
func (l *lockState) EndKey() []byte     { return l.endKey }
func (l *lockState) New() *lockState    { return new(lockState) }
func (l *lockState) SetID(v uint64)     { l.id = v }
func (l *lockState) SetKey(v []byte)    { l.key = v }
func (l *lockState) SetEndKey(v []byte) { l.endKey = v }

// REQUIRES: l.mu is locked.
func (l *lockState) String() string {
	var buf strings.Builder
	l.Format(&buf)
	return buf.String()
}

// REQUIRES: l.mu is locked.
func (l *lockState) Format(buf *strings.Builder) {
	fmt.Fprintf(buf, " lock: %s\n", l.key)
	if l.isEmptyLock() {
		fmt.Fprintln(buf, " empty")
		return
	}
	writeResInfo := func(b *strings.Builder, txn *enginepb.TxnMeta, ts hlc.Timestamp) {
		// TODO(sbhola): strip the leading 0 bytes from the UUID string since tests
		// assign UUIDs using a counter; that would make this output more readable.
		fmt.Fprintf(b, "txn: %v, ts: %v, seq: %v\n", txn.ID, ts, txn.Sequence)
	}
	writeHolderInfo := func(b *strings.Builder, txn *enginepb.TxnMeta, ts hlc.Timestamp) {
		fmt.Fprintf(b, " holder: txn: %v, ts: %v, info: ", txn.ID, ts)
		first := true
		for i := range l.holder.holder {
			h := &l.holder.holder[i]
			if h.txn == nil {
				continue
			}
			if !first {
				fmt.Fprintf(b, ", ")
			}
			first = false
			if lock.Durability(i) == lock.Replicated {
				fmt.Fprintf(b, "repl ")
			} else {
				fmt.Fprintf(b, "unrepl ")
			}
			fmt.Fprintf(b, "epoch: %d, seqs: [%d", h.txn.Epoch, h.seqs[0])
			for j := 1; j < len(h.seqs); j++ {
				fmt.Fprintf(b, ", %d", h.seqs[j])
			}
			fmt.Fprintf(b, "]")
		}
		fmt.Fprintln(b, "")
	}
	txn, ts := l.getLockerInfo()
	if txn == nil {
		fmt.Fprintf(buf, " res: req: %d, ", l.reservation.seqNum)
		writeResInfo(buf, l.reservation.txn, l.reservation.writeTS)
	} else {
		writeHolderInfo(buf, txn, ts)
	}
	if l.waitingReaders.Len() > 0 {
		fmt.Fprintln(buf, " waiting readers:")
		for e := l.waitingReaders.Front(); e != nil; e = e.Next() {
			g := e.Value.(*lockTableGuardImpl)
			fmt.Fprintf(buf, " req: %d, txn: ", g.seqNum)
			if g.txn == nil {
				fmt.Fprintln(buf, "none")
			} else {
				fmt.Fprintf(buf, "%v\n", g.txn.ID)
			}
		}
	}
	if l.queuedWriters.Len() > 0 {
		fmt.Fprintln(buf, " queued writers:")
		for e := l.queuedWriters.Front(); e != nil; e = e.Next() {
			qg := e.Value.(*queuedGuard)
			g := qg.guard
			fmt.Fprintf(buf, " active: %t req: %d, txn: ",
				qg.active, qg.guard.seqNum)
			if g.txn == nil {
				fmt.Fprintln(buf, "none")
			} else {
				fmt.Fprintf(buf, "%v\n", g.txn.ID)
			}
		}
	}
	if l.distinguishedWaiter != nil {
		fmt.Fprintf(buf, " distinguished req: %d\n", l.distinguishedWaiter.seqNum)
	}
}

// Called for a write request when there is a reservation. Returns true iff it
// succeeds.
// REQUIRES: l.mu is locked.
func (l *lockState) tryBreakReservation(seqNum uint64) bool {
	if l.reservation.seqNum > seqNum {
		qg := &queuedGuard{
			guard:  l.reservation,
			active: false,
		}
		l.queuedWriters.PushFront(qg)
		l.reservation = nil
		return true
	}
	return false
}
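
// exampleBreakReservation is an illustrative sketch added for this write-up;
// it is not part of the original file. It shows the reservation-breaking rule
// above: a write request that arrived earlier (smaller seqNum) breaks the
// reservation of a later request, which is re-queued as an inactive waiter.
// The keys and sequence numbers are made up.
func exampleBreakReservation() bool {
	l := &lockState{key: roachpb.Key("a")}
	l.reservation = &lockTableGuardImpl{seqNum: 7}
	l.mu.Lock()
	defer l.mu.Unlock()
	// The request with seqNum 3 predates the reservation holder (seqNum 7), so
	// the reservation is broken and the old holder moves to queuedWriters.
	return l.tryBreakReservation(3) // returns true
}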

// Informs active waiters about reservation or lock holder. The reservation
// may have changed so this needs to fix any inconsistencies wrt waitSelf and
// waitForDistinguished states.
// REQUIRES: l.mu is locked.
func (l *lockState) informActiveWaiters() {
	waitForState := waitingState{kind: waitFor, key: l.key}
	findDistinguished := l.distinguishedWaiter == nil
	if lockHolderTxn, _ := l.getLockerInfo(); lockHolderTxn != nil {
		waitForState.txn = lockHolderTxn
		waitForState.held = true
	} else {
		waitForState.txn = l.reservation.txn
		if !findDistinguished && l.distinguishedWaiter.isSameTxnAsReservation(waitForState) {
			findDistinguished = true
			l.distinguishedWaiter = nil
		}
	}

	for e := l.waitingReaders.Front(); e != nil; e = e.Next() {
		state := waitForState
		state.guardAccess = spanset.SpanReadOnly
		// Since there are waiting readers we could not have transitioned out of
		// or into a state with a reservation, since readers do not wait for
		// reservations.
		g := e.Value.(*lockTableGuardImpl)
		if findDistinguished {
			l.distinguishedWaiter = g
			findDistinguished = false
		}
		g.mu.Lock()
		g.mu.state = state
		if l.distinguishedWaiter == g {
			g.mu.state.kind = waitForDistinguished
		}
		g.notify()
		g.mu.Unlock()
	}
	for e := l.queuedWriters.Front(); e != nil; e = e.Next() {
		qg := e.Value.(*queuedGuard)
		if !qg.active {
			continue
		}
		g := qg.guard
		var state waitingState
		if g.isSameTxnAsReservation(waitForState) {
			state = waitingState{kind: waitSelf}
		} else {
			state = waitForState
			state.guardAccess = spanset.SpanReadWrite
			if findDistinguished {
				l.distinguishedWaiter = g
				findDistinguished = false
			}
			if l.distinguishedWaiter == g {
				state.kind = waitForDistinguished
			}
		}
		g.mu.Lock()
		g.mu.state = state
		g.notify()
		g.mu.Unlock()
	}
}

// releaseWritersFromTxn removes all waiting writers for the lockState that are
// part of the specified transaction.
// REQUIRES: l.mu is locked.
func (l *lockState) releaseWritersFromTxn(txn *enginepb.TxnMeta) {
	for e := l.queuedWriters.Front(); e != nil; {
		qg := e.Value.(*queuedGuard)
		curr := e
		e = e.Next()
		g := qg.guard
		if g.isSameTxn(txn) {
			if qg.active {
				if g == l.distinguishedWaiter {
					l.distinguishedWaiter = nil
				}
				g.doneWaitingAtLock(false, l)
			} else {
				g.mu.Lock()
				delete(g.mu.locks, l)
				g.mu.Unlock()
			}
			l.queuedWriters.Remove(curr)
		}
	}
}

// When the active waiters have shrunk and the distinguished waiter has gone,
// try to make a new distinguished waiter if there is at least 1 active
// waiter.
// REQUIRES: l.mu is locked.
func (l *lockState) tryMakeNewDistinguished() {
	var g *lockTableGuardImpl
	if l.waitingReaders.Len() > 0 {
		g = l.waitingReaders.Front().Value.(*lockTableGuardImpl)
	} else if l.queuedWriters.Len() > 0 {
		for e := l.queuedWriters.Front(); e != nil; e = e.Next() {
			qg := e.Value.(*queuedGuard)
			if qg.active && (l.reservation == nil || !qg.guard.isSameTxn(l.reservation.txn)) {
				g = qg.guard
				break
			}
		}
	}
	if g != nil {
		l.distinguishedWaiter = g
		g.mu.Lock()
		g.mu.state.kind = waitForDistinguished
		// The rest of g.state is already up-to-date.
		g.notify()
		g.mu.Unlock()
	}
}

// Returns true iff the lock is currently held by the transaction with the
// given id.
// REQUIRES: l.mu is locked.
func (l *lockState) isLockedBy(id uuid.UUID) bool {
	if l.holder.locked {
		var holderID uuid.UUID
		if l.holder.holder[lock.Unreplicated].txn != nil {
			holderID = l.holder.holder[lock.Unreplicated].txn.ID
		} else {
			holderID = l.holder.holder[lock.Replicated].txn.ID
		}
		return id == holderID
	}
	return false
}

// Returns information about the current lock holder if the lock is held, else
// returns nil.
// REQUIRES: l.mu is locked.
func (l *lockState) getLockerInfo() (*enginepb.TxnMeta, hlc.Timestamp) {
	if !l.holder.locked {
		return nil, hlc.Timestamp{}
	}

	// If the lock is held as both replicated and unreplicated we want to
	// provide the lower of the two timestamps, since the lower timestamp
	// contends with more transactions. Else we provide whichever one it is held
	// at.

	// Start with the assumption that it is held as replicated.
	index := lock.Replicated
	// Condition under which we prefer the unreplicated holder.
	if l.holder.holder[index].txn == nil || (l.holder.holder[lock.Unreplicated].txn != nil &&
		// If we are evaluating the following clause we are sure that it is held
		// as both replicated and unreplicated.
		l.holder.holder[lock.Unreplicated].ts.Less(l.holder.holder[lock.Replicated].ts)) {
		index = lock.Unreplicated
	}
	return l.holder.holder[index].txn, l.holder.holder[index].ts
}
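
// exampleLowerTimestampWins is an illustrative sketch added for this write-up;
// it is not part of the original file. It shows the rule above: when a lock is
// held as both replicated and unreplicated, getLockerInfo reports the lower of
// the two timestamps, since that is the one more requests conflict with. The
// transaction and timestamps are made up.
func exampleLowerTimestampWins() hlc.Timestamp {
	l := &lockState{key: roachpb.Key("b")}
	txn := &enginepb.TxnMeta{ID: uuid.MakeV4()}
	l.holder.locked = true
	l.holder.holder[lock.Replicated] = lockHolderInfo{txn: txn, ts: hlc.Timestamp{WallTime: 20}}
	l.holder.holder[lock.Unreplicated] = lockHolderInfo{txn: txn, ts: hlc.Timestamp{WallTime: 10}}
	l.mu.Lock()
	defer l.mu.Unlock()
	_, ts := l.getLockerInfo()
	return ts // hlc.Timestamp{WallTime: 10}, the lower of the two
}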

// Decides whether the request g with access sa should actively wait at this
// lock and if yes, adjusts the data-structures appropriately. The notify
// parameter is true iff the request's new state channel should be notified --
// it is set to false when the call to tryActiveWait is happening due to an
// event for a different request or transaction (like a lock release) since in
// that case the channel is notified first and the call to tryActiveWait()
// happens later in lockTableGuard.CurState(). The return value is true iff
// it is actively waiting.
// Acquires l.mu, g.mu.
func (l *lockState) tryActiveWait(g *lockTableGuardImpl, sa spanset.SpanAccess, notify bool) bool {
	l.mu.Lock()
	defer l.mu.Unlock()

	// It is possible that this lock is empty and has not yet been deleted.
	if l.isEmptyLock() {
		return false
	}

	// Lock is not empty.
	lockHolderTxn, lockHolderTS := l.getLockerInfo()
	if lockHolderTxn != nil && g.isSameTxn(lockHolderTxn) {
		// Already locked by this txn.
		return false
	}

	if sa == spanset.SpanReadOnly {
		if lockHolderTxn == nil {
			// Reads only care about locker, not a reservation.
			return false
		}
		// Locked by some other txn.
		if g.readTS.Less(lockHolderTS) {
			return false
		}
		g.mu.Lock()
		_, alsoHasStrongerAccess := g.mu.locks[l]
		g.mu.Unlock()

		// If the request already has this lock in its locks map, it must also be
		// writing to this key and must be either a reservation holder or inactive
		// waiter at this lock. The former has already been handled above. For the
		// latter, it must have had its reservation broken. Since this is a weaker
		// access we defer to the stronger access and don't wait here.
		//
		// For non-transactional requests that have the key specified as both
		// SpanReadOnly and SpanReadWrite, the request never acquires a
		// reservation, so using the locks map to detect this duplication of the
		// key is not possible. In the rare case, the lock is now held at a
		// timestamp that is not compatible with this request and it will wait
		// here -- there is no correctness issue with doing that.
		if alsoHasStrongerAccess {
			return false
		}
	}

	waitForState := waitingState{kind: waitFor, key: l.key}
	if lockHolderTxn != nil {
		waitForState.txn = lockHolderTxn
		waitForState.held = true
	} else {
		if l.reservation == g {
			// Already reserved by this request.
			return false
		}
		// A non-transactional write request never makes or breaks reservations,
		// and only waits for a reservation if the reservation has a lower
		// seqNum. Note that `sa == spanset.SpanReadOnly && lockHolderTxn == nil`
		// was already checked above.
		if g.txn == nil && l.reservation.seqNum > g.seqNum {
			// Reservation is held by a request with a higher seqNum and g is a
			// non-transactional request. Ignore the reservation.
			return false
		}
		waitForState.txn = l.reservation.txn
	}

	// Incompatible with whoever is holding lock or reservation.

	if l.reservation != nil && sa == spanset.SpanReadWrite && l.tryBreakReservation(g.seqNum) {
		l.reservation = g
		g.mu.Lock()
		g.mu.locks[l] = struct{}{}
		g.mu.Unlock()
		// There cannot be waitingReaders, since they do not wait for
		// reservations. And the set of active queuedWriters has not changed, but
		// they do need to be told about the change in who they are waiting for.
		l.informActiveWaiters()
		return false
	}

	// Need to wait.

	g.mu.Lock()
	defer g.mu.Unlock()
	if sa == spanset.SpanReadWrite {
		if _, inQueue := g.mu.locks[l]; inQueue {
			// Already in queue and must be in the right position, so mark as active
			// waiter there. We expect this to be rare.
			var qg *queuedGuard
			for e := l.queuedWriters.Front(); e != nil; e = e.Next() {
				qqg := e.Value.(*queuedGuard)
				if qqg.guard == g {
					qg = qqg
					break
				}
			}
			if qg == nil {
				panic("lockTable bug")
			}
			qg.active = true
		} else {
			// Not in queue so insert as active waiter.
			qg := &queuedGuard{
				guard:  g,
				active: true,
			}
			if l.queuedWriters.Len() == 0 {
				l.queuedWriters.PushFront(qg)
			} else {
				var e *list.Element
				for e = l.queuedWriters.Back(); e != nil; e = e.Prev() {
					qqg := e.Value.(*queuedGuard)
					if qqg.guard.seqNum < qg.guard.seqNum {
						break
					}
				}
				if e == nil {
					l.queuedWriters.PushFront(qg)
				} else {
					l.queuedWriters.InsertAfter(qg, e)
				}
			}
			g.mu.locks[l] = struct{}{}
		}
	} else {
		l.waitingReaders.PushFront(g)
		g.mu.locks[l] = struct{}{}
	}
	// Make it an active waiter.
	g.key = l.key
	g.mu.startWait = true
	if g.isSameTxnAsReservation(waitForState) {
		g.mu.state = waitingState{kind: waitSelf}
	} else {
		state := waitForState
		state.guardAccess = sa
		if l.distinguishedWaiter == nil {
			l.distinguishedWaiter = g
			state.kind = waitForDistinguished
		}
		g.mu.state = state
	}
	if notify {
		g.notify()
	}
	return true
}
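
// exampleReaderBelowLockTs is an illustrative sketch added for this write-up;
// it is not part of the original file. It shows one branch of tryActiveWait: a
// read whose timestamp is below the lock holder's timestamp does not conflict,
// so it is not made to wait. The lock contents here are made up.
func exampleReaderBelowLockTs() bool {
	l := &lockState{key: roachpb.Key("c")}
	l.holder.locked = true
	l.holder.holder[lock.Unreplicated] = lockHolderInfo{
		txn: &enginepb.TxnMeta{ID: uuid.MakeV4()},
		ts:  hlc.Timestamp{WallTime: 20},
	}
	g := newLockTableGuardImpl()
	g.readTS = hlc.Timestamp{WallTime: 10}
	// Returns false: reading at 10 below a lock held at 20 is not a conflict.
	return l.tryActiveWait(g, spanset.SpanReadOnly, false /* notify */)
}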

// Acquires this lock. Requests from the same transaction that were actively
// waiting at this key are done waiting at it once the lock is acquired.
// Acquires l.mu.
func (l *lockState) acquireLock(
	_ lock.Strength, durability lock.Durability, txn *enginepb.TxnMeta, ts hlc.Timestamp,
) error {
	l.mu.Lock()
	defer l.mu.Unlock()
	if l.holder.locked {
		// Already held.
		beforeTxn, beforeTs := l.getLockerInfo()
		if txn.ID != beforeTxn.ID {
			return errors.Errorf("caller violated contract: " +
				"existing lock cannot be acquired by different transaction")
		}
		seqs := l.holder.holder[durability].seqs
		if l.holder.holder[durability].txn != nil && l.holder.holder[durability].txn.Epoch < txn.Epoch {
			// Clear the sequences for the older epoch.
			seqs = seqs[:0]
		}
		if len(seqs) > 0 && seqs[len(seqs)-1] >= txn.Sequence {
			// Idempotent lock acquisition. In this case, we simply ignore the lock
			// acquisition as long as it corresponds to an existing sequence number.
			// If the sequence number is not being tracked yet, insert it into the
			// sequence history. The validity of such a lock re-acquisition should
			// have already been determined at the MVCC level.
			if i := sort.Search(len(seqs), func(i int) bool {
				return seqs[i] >= txn.Sequence
			}); i == len(seqs) {
				panic("lockTable bug - search value <= last element")
			} else if seqs[i] != txn.Sequence {
				seqs = append(seqs, 0)
				copy(seqs[i+1:], seqs[i:])
				seqs[i] = txn.Sequence
				l.holder.holder[durability].seqs = seqs
			}
			return nil
		}
		l.holder.holder[durability].txn = txn
		// Forward the lock's timestamp instead of assigning to it blindly.
		// While lock acquisition uses monotonically increasing timestamps
		// from the perspective of the transaction's coordinator, this does
		// not guarantee that a lock will never be acquired at a higher
		// epoch and/or sequence number but with a lower timestamp when in
		// the presence of transaction pushes. Consider the following
		// sequence of events:
		//
		//  - txn A acquires lock at sequence 1, ts 10
		//  - txn B pushes txn A to ts 20
		//  - txn B updates lock to ts 20
		//  - txn A's coordinator does not immediately learn of the push
		//  - txn A re-acquires lock at sequence 2, ts 15
		//
		// A lock's timestamp at a given durability level is not allowed to
		// regress, so by forwarding its timestamp during the second acquisition
		// instead of assigning to it blindly, it remains at 20.
		//
		// However, a lock's timestamp as reported by getLockerInfo can regress
		// if it is acquired at a lower timestamp and a different durability
		// than it was previously held with. This is necessary to support
		// because of the hard constraint we must uphold here that the
		// lockHolderInfo for a replicated lock cannot diverge from the
		// replicated state machine in such a way that its timestamp in the
		// lockTable exceeds that in the replicated keyspace. If this invariant
		// were to be violated, we'd risk infinite lock-discovery loops for
		// requests that conflict with the lock as is written in the replicated
		// state machine but not as is reflected in the lockTable.
		//
		// Lock timestamp regressions are safe from the perspective of other
		// transactions because the request which re-acquired the lock at the
		// lower timestamp must have been holding a write latch at or below the
		// new lock's timestamp. This means that no conflicting requests could
		// be evaluating concurrently.
		// Instead, all will need to re-scan the
		// lockTable once they acquire latches and will notice the reduced
		// timestamp at that point, which may cause them to conflict with the
		// lock even if they had not conflicted before. In a sense, it is no
		// different than the first time a lock is added to the lockTable.
		l.holder.holder[durability].ts.Forward(ts)
		l.holder.holder[durability].seqs = append(seqs, txn.Sequence)

		_, afterTs := l.getLockerInfo()
		if beforeTs.Less(afterTs) {
			l.increasedLockTs(afterTs)
		}
		return nil
	}
	// Not already held, so may be reserved by this request. There is also the
	// possibility that some other request has broken this reservation because
	// of a concurrent release but that is harmless since this request is
	// holding latches and has proceeded to evaluation.
	if l.reservation != nil {
		if l.reservation.txn.ID != txn.ID {
			// Reservation is broken.
			qg := &queuedGuard{
				guard:  l.reservation,
				active: false,
			}
			l.queuedWriters.PushFront(qg)
		} else {
			// Else, reservation is not broken, or broken by a different request
			// from the same transaction. In the latter case, both requests are not
			// actively waiting at this lock. We don't know which is in the queue
			// and which is holding the reservation but it does not matter. Both
			// will have their lockTableGuardImpl.mu.locks updated and neither will
			// be in the queue at the end of this method.
			l.reservation.mu.Lock()
			delete(l.reservation.mu.locks, l)
			l.reservation.mu.Unlock()
		}
		if l.waitingReaders.Len() > 0 {
			panic("lockTable bug")
		}
	} else {
		if l.queuedWriters.Len() > 0 || l.waitingReaders.Len() > 0 {
			panic("lockTable bug")
		}
	}
	l.reservation = nil
	l.holder.locked = true
	l.holder.holder[durability].txn = txn
	l.holder.holder[durability].ts = ts
	l.holder.holder[durability].seqs = append([]enginepb.TxnSeq(nil), txn.Sequence)

	// If there are waiting requests from the same txn, they no longer need to wait.
	l.releaseWritersFromTxn(txn)

	// Inform active waiters since lock has transitioned to held.
	l.informActiveWaiters()
	return nil
}

// A replicated lock held by txn with timestamp ts was discovered by guard g
// where g is trying to access this key with access sa.
// Acquires l.mu.
func (l *lockState) discoveredLock(
	txn *enginepb.TxnMeta, ts hlc.Timestamp, g *lockTableGuardImpl, sa spanset.SpanAccess,
) error {
	l.mu.Lock()
	defer l.mu.Unlock()

	if l.holder.locked {
		if !l.isLockedBy(txn.ID) {
			return errors.Errorf("caller violated contract: " +
				"discovered lock by different transaction than existing lock")
		}
	} else {
		l.holder.locked = true
	}
	holder := &l.holder.holder[lock.Replicated]
	if holder.txn == nil {
		holder.txn = txn
		holder.ts = ts
		holder.seqs = append(holder.seqs, txn.Sequence)
	}

	// Queue the existing reservation holder. Note that this reservation
	// holder may not be equal to g due to two reasons (a) the reservation
	// of g could have been broken even though g is holding latches (see
	// the comment in acquireLock()), (b) g may be a non-transactional
	// request (read or write) that can ignore the reservation.
	if l.reservation != nil {
		qg := &queuedGuard{
			guard:  l.reservation,
			active: false,
		}
		l.queuedWriters.PushFront(qg)
		l.reservation = nil
	}

	switch sa {
	case spanset.SpanReadOnly:
		// Don't enter the lock's waitingReaders list, because all waiting
		// readers are expected to be active. Instead, wait until the next scan.

		// Confirm that the guard will wait on the lock the next time it scans
		// the lock table. If not then it shouldn't have discovered the lock in
		// the first place. Bugs here would cause infinite loops where the same
		// lock is repeatedly re-discovered.
		if g.readTS.Less(ts) {
			return errors.Errorf("caller violated contract: discovered non-conflicting lock")
		}

	case spanset.SpanReadWrite:
		// Immediately enter the lock's queuedWriters list.
		g.mu.Lock()
		_, presentHere := g.mu.locks[l]
		if !presentHere {
			// Since g will place itself in queue as inactive waiter below.
			g.mu.locks[l] = struct{}{}
		}
		g.mu.Unlock()

		if !presentHere {
			// Put self in queue as inactive waiter.
			qg := &queuedGuard{
				guard:  g,
				active: false,
			}
			// g is not necessarily first in the queue in the (rare) case (a) above.
			var e *list.Element
			for e = l.queuedWriters.Front(); e != nil; e = e.Next() {
				qqg := e.Value.(*queuedGuard)
				if qqg.guard.seqNum > g.seqNum {
					break
				}
			}
			if e == nil {
				l.queuedWriters.PushBack(qg)
			} else {
				l.queuedWriters.InsertBefore(qg, e)
			}
		}
	}

	// If there are waiting requests from the same txn, they no longer need to wait.
	l.releaseWritersFromTxn(txn)

	// Active waiters need to be told about who they are waiting for.
	l.informActiveWaiters()
	return nil
}

// Acquires l.mu.
func (l *lockState) tryClearLock(force bool) bool {
	l.mu.Lock()
	defer l.mu.Unlock()
	replicatedHeld := l.holder.locked && l.holder.holder[lock.Replicated].txn != nil
	if replicatedHeld && l.distinguishedWaiter == nil && !force {
		// Replicated lock is held and has no distinguished waiter.
		return false
	}

	// Remove unreplicated holder.
	l.holder.holder[lock.Unreplicated] = lockHolderInfo{}
	var waitState waitingState
	if replicatedHeld && !force {
		lockHolderTxn, _ := l.getLockerInfo()
		// Note that none of the current waiters can be requests
		// from lockHolderTxn.

// Acquires l.mu.
func (l *lockState) tryClearLock(force bool) bool {
	l.mu.Lock()
	defer l.mu.Unlock()
	replicatedHeld := l.holder.locked && l.holder.holder[lock.Replicated].txn != nil
	if replicatedHeld && l.distinguishedWaiter == nil && !force {
		// Replicated lock is held and has no distinguished waiter.
		return false
	}

	// Remove the unreplicated holder.
	l.holder.holder[lock.Unreplicated] = lockHolderInfo{}
	var waitState waitingState
	if replicatedHeld && !force {
		lockHolderTxn, _ := l.getLockerInfo()
		// Note that none of the current waiters can be requests from
		// lockHolderTxn.
		waitState = waitingState{
			kind:        waitElsewhere,
			txn:         lockHolderTxn,
			key:         l.key,
			held:        true,
			guardAccess: spanset.SpanReadOnly,
		}
	} else {
		l.holder.locked = false
		waitState = waitingState{kind: doneWaiting}
	}

	l.distinguishedWaiter = nil
	if l.reservation != nil {
		g := l.reservation
		g.mu.Lock()
		delete(g.mu.locks, l)
		g.mu.Unlock()
		l.reservation = nil
	}
	for e := l.waitingReaders.Front(); e != nil; {
		g := e.Value.(*lockTableGuardImpl)
		curr := e
		e = e.Next()
		l.waitingReaders.Remove(curr)

		g.mu.Lock()
		g.mu.state = waitState
		g.notify()
		delete(g.mu.locks, l)
		g.mu.Unlock()
	}

	waitState.guardAccess = spanset.SpanReadWrite
	for e := l.queuedWriters.Front(); e != nil; {
		qg := e.Value.(*queuedGuard)
		curr := e
		e = e.Next()
		l.queuedWriters.Remove(curr)

		g := qg.guard
		g.mu.Lock()
		if qg.active {
			g.mu.state = waitState
			g.notify()
		}
		delete(g.mu.locks, l)
		g.mu.Unlock()
	}
	return true
}

// Returns true iff the lockState is empty, i.e., there is no lock holder or
// reservation.
// REQUIRES: l.mu is locked.
func (l *lockState) isEmptyLock() bool {
	if !l.holder.locked && l.reservation == nil {
		if l.waitingReaders.Len() > 0 || l.queuedWriters.Len() > 0 {
			panic("lockTable bug")
		}
		return true
	}
	return false
}

// Removes the TxnSeqs in heldSeqNums that are contained in ignoredSeqNums.
// REQUIRES: ignoredSeqNums contains non-overlapping ranges sorted in
// increasing seq order.
func removeIgnored(
	heldSeqNums []enginepb.TxnSeq, ignoredSeqNums []enginepb.IgnoredSeqNumRange,
) []enginepb.TxnSeq {
	if len(ignoredSeqNums) == 0 {
		return heldSeqNums
	}
	held := heldSeqNums[:0]
	for _, n := range heldSeqNums {
		i := sort.Search(len(ignoredSeqNums), func(i int) bool { return ignoredSeqNums[i].End >= n })
		if i == len(ignoredSeqNums) || ignoredSeqNums[i].Start > n {
			held = append(held, n)
		}
	}
	return held
}
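
// removeIgnoredExample is an illustrative, unused helper showing the effect of
// removeIgnored after a savepoint rollback. The literal values are made up:
// with held sequence numbers [1, 2, 3, 5] and the ignored range [2, 3], the
// surviving sequence numbers are [1, 5].
func removeIgnoredExample() []enginepb.TxnSeq {
	held := []enginepb.TxnSeq{1, 2, 3, 5}
	ignored := []enginepb.IgnoredSeqNumRange{{Start: 2, End: 3}}
	return removeIgnored(held, ignored) // [1, 5]
}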

// Tries to update the lock: a no-op if this lock is held by a different
// transaction, else the holder state is updated. Returns whether the lockState
// can be garbage collected.
// Acquires l.mu.
func (l *lockState) tryUpdateLock(up *roachpb.LockUpdate) (gc bool, err error) {
	l.mu.Lock()
	defer l.mu.Unlock()
	if !l.isLockedBy(up.Txn.ID) {
		return false, nil
	}
	if up.Status.IsFinalized() {
		l.holder.locked = false
		for i := range l.holder.holder {
			l.holder.holder[i] = lockHolderInfo{}
		}
		gc = l.lockIsFree()
		return gc, nil
	}

	txn := &up.Txn
	ts := up.Txn.WriteTimestamp
	_, beforeTs := l.getLockerInfo()
	advancedTs := beforeTs.Less(ts)
	isLocked := false
	for i := range l.holder.holder {
		holder := &l.holder.holder[i]
		if holder.txn == nil {
			continue
		}
		// Note that mvccResolveWriteIntent() has special handling of the case
		// where the pusher is using an epoch lower than the epoch of the intent
		// (replicated lock), but is trying to push to a higher timestamp. The
		// replicated lock gets written with the newer epoch (not the epoch known
		// to the pusher) but a higher timestamp. Then the pusher will call into
		// this function with that lower epoch. Instead of trying to be consistent
		// with mvccResolveWriteIntent() in the current state of the replicated
		// lock, we simply forget the replicated lock, since it is no longer in
		// the way of this request. Eventually, once we have segregated locks, the
		// lock table will be the source of truth for replicated locks too, and
		// this forgetting behavior will go away.
		//
		// For unreplicated locks the lock table is the source of truth, so we
		// best-effort mirror the behavior of mvccResolveWriteIntent() by updating
		// the timestamp.
		if lock.Durability(i) == lock.Replicated || txn.Epoch > holder.txn.Epoch {
			holder.txn = nil
			holder.seqs = nil
			continue
		}
		// Unreplicated lock held in the same epoch or a higher epoch.
		if advancedTs {
			// We may advance ts here but not update the holder.txn object below
			// for the reason stated in the comment about mvccResolveWriteIntent().
			// The lockHolderInfo.ts is the source of truth regarding the timestamp
			// of the lock, and not TxnMeta.WriteTimestamp.
			holder.ts = ts
		}
		if txn.Epoch == holder.txn.Epoch {
			holder.seqs = removeIgnored(holder.seqs, up.IgnoredSeqNums)
			if len(holder.seqs) == 0 {
				holder.txn = nil
				continue
			}
			if advancedTs {
				holder.txn = txn
			}
		}
		// Else txn.Epoch < lockHolderTxn.Epoch, so only the timestamp has been
		// potentially updated.
		isLocked = true
	}

	if !isLocked {
		l.holder.locked = false
		gc = l.lockIsFree()
		return gc, nil
	}

	if advancedTs {
		l.increasedLockTs(ts)
	}
	// Else no change for waiters. This can happen due to a race between
	// different callers of UpdateLocks().

	return false, nil
}

// The lock holder timestamp has increased. Some of the waiters may no longer
// need to wait.
// REQUIRES: l.mu is locked.
func (l *lockState) increasedLockTs(newTs hlc.Timestamp) {
	distinguishedRemoved := false
	for e := l.waitingReaders.Front(); e != nil; {
		g := e.Value.(*lockTableGuardImpl)
		curr := e
		e = e.Next()
		if g.readTS.Less(newTs) {
			// Stop waiting.
			l.waitingReaders.Remove(curr)
			if g == l.distinguishedWaiter {
				distinguishedRemoved = true
				l.distinguishedWaiter = nil
			}
			g.doneWaitingAtLock(false, l)
		}
		// Else don't inform an active waiter, which continues to be an active
		// waiter despite the timestamp increase.
	}
	if distinguishedRemoved {
		l.tryMakeNewDistinguished()
	}
}
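
// unreplicatedUpdateOutcomeSketch is an illustrative, unused summary of the
// per-epoch rules applied by tryUpdateLock to the unreplicated holder info
// when the updating transaction is still pending. The parameters and return
// values are purely illustrative.
func unreplicatedUpdateOutcomeSketch(updateEpoch, holderEpoch int32) string {
	switch {
	case updateEpoch > holderEpoch:
		// Holder state from an older epoch is discarded.
		return "forget the unreplicated holder state"
	case updateEpoch == holderEpoch:
		// Rolled-back sequence numbers are dropped; if none remain, the lock
		// is released.
		return "filter the held seqs through removeIgnored"
	default:
		// An update from an older epoch can only advance the lock timestamp.
		return "at most advance the lock timestamp"
	}
}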

// A request known to this lockState is done. The request could be the
// reservation holder, or a waiting reader or writer. Note that there is the
// possibility of a race and g may no longer be known to l, which we treat as a
// noop (this race is allowed since we order l.mu > g.mu). Returns whether the
// lockState can be garbage collected.
// Acquires l.mu.
func (l *lockState) requestDone(g *lockTableGuardImpl) (gc bool) {
	l.mu.Lock()
	defer l.mu.Unlock()

	g.mu.Lock()
	if _, present := g.mu.locks[l]; !present {
		g.mu.Unlock()
		return false
	}
	delete(g.mu.locks, l)
	g.mu.Unlock()

	if l.reservation == g {
		l.reservation = nil
		return l.lockIsFree()
	}
	// May be in queuedWriters or waitingReaders.
	distinguishedRemoved := false
	doneRemoval := false
	for e := l.queuedWriters.Front(); e != nil; e = e.Next() {
		qg := e.Value.(*queuedGuard)
		if qg.guard == g {
			l.queuedWriters.Remove(e)
			if qg.guard == l.distinguishedWaiter {
				distinguishedRemoved = true
				l.distinguishedWaiter = nil
			}
			doneRemoval = true
			break
		}
	}
	if !doneRemoval {
		for e := l.waitingReaders.Front(); e != nil; e = e.Next() {
			gg := e.Value.(*lockTableGuardImpl)
			if gg == g {
				l.waitingReaders.Remove(e)
				if g == l.distinguishedWaiter {
					distinguishedRemoved = true
					l.distinguishedWaiter = nil
				}
				doneRemoval = true
				break
			}
		}
	}
	if !doneRemoval {
		panic("lockTable bug")
	}
	if distinguishedRemoved {
		l.tryMakeNewDistinguished()
	}
	return false
}

// The lock has transitioned from locked/reserved to unlocked. There could be
// waiters, but there cannot be a reservation.
// REQUIRES: l.mu is locked.
func (l *lockState) lockIsFree() (gc bool) {
	if l.reservation != nil {
		panic("lockTable bug")
	}
	// None of the waiting readers need to wait here anymore.
	for e := l.waitingReaders.Front(); e != nil; {
		g := e.Value.(*lockTableGuardImpl)
		curr := e
		e = e.Next()
		l.waitingReaders.Remove(curr)
		if g == l.distinguishedWaiter {
			l.distinguishedWaiter = nil
		}
		g.doneWaitingAtLock(false, l)
	}

	// The non-transactional writers at the front of the queue are done
	// waiting.
	for e := l.queuedWriters.Front(); e != nil; {
		qg := e.Value.(*queuedGuard)
		g := qg.guard
		if g.txn == nil {
			curr := e
			e = e.Next()
			l.queuedWriters.Remove(curr)
			if g == l.distinguishedWaiter {
				l.distinguishedWaiter = nil
			}
			g.doneWaitingAtLock(false, l)
		} else {
			break
		}
	}

	if l.queuedWriters.Len() == 0 {
		return true
	}

	// The first waiting writer (it must be transactional) gets the reservation.
	e := l.queuedWriters.Front()
	qg := e.Value.(*queuedGuard)
	g := qg.guard
	l.reservation = g
	l.queuedWriters.Remove(e)
	if qg.active {
		if g == l.distinguishedWaiter {
			l.distinguishedWaiter = nil
		}
		g.doneWaitingAtLock(true, l)
	}
	// Else the guard is an inactive waiter and is waiting elsewhere.

	// Tell the active waiters who they are waiting for.
	l.informActiveWaiters()
	return false
}

func (t *treeMu) nextLockSeqNum() uint64 {
	t.lockIDSeqNum++
	return t.lockIDSeqNum
}
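
// reservationHandoffSketch is a self-contained illustration of the handoff
// policy in lockIsFree, using simplified stand-in types rather than the real
// queuedGuard: leading non-transactional writers are simply released, and the
// first transactional writer in the queue inherits the reservation.
type sketchWriter struct {
	name          string
	transactional bool
}

func reservationHandoffSketch(queue []sketchWriter) (released []string, reservation string) {
	i := 0
	for ; i < len(queue) && !queue[i].transactional; i++ {
		// Done waiting at this lock.
		released = append(released, queue[i].name)
	}
	if i < len(queue) {
		// The first transactional writer reserves the lock.
		reservation = queue[i].name
	}
	return released, reservation
}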

// ScanAndEnqueue implements the lockTable interface.
func (t *lockTableImpl) ScanAndEnqueue(req Request, guard lockTableGuard) lockTableGuard {
	// NOTE: there is no need to synchronize with enabledMu here. ScanAndEnqueue
	// scans the lockTable and enters any conflicting lock wait-queues, but a
	// disabled lockTable will be empty. If the scan's btree snapshot races with
	// a concurrent call to clear/disable then it might enter some wait-queues,
	// but it will quickly be released from them.

	var g *lockTableGuardImpl
	if guard == nil {
		g = newLockTableGuardImpl()
		g.seqNum = atomic.AddUint64(&t.seqNum, 1)
		g.txn = req.txnMeta()
		g.spans = req.LockSpans
		g.readTS = req.readConflictTimestamp()
		g.writeTS = req.writeConflictTimestamp()
		g.sa = spanset.NumSpanAccess - 1
		g.index = -1
	} else {
		g = guard.(*lockTableGuardImpl)
		g.key = nil
		g.sa = spanset.NumSpanAccess - 1
		g.ss = spanset.SpanScope(0)
		g.index = -1
		g.mu.Lock()
		g.mu.startWait = false
		g.mu.mustFindNextLockAfter = false
		g.mu.Unlock()
	}
	for ss := spanset.SpanScope(0); ss < spanset.NumSpanScope; ss++ {
		for sa := spanset.SpanAccess(0); sa < spanset.NumSpanAccess; sa++ {
			if len(g.spans.GetSpans(sa, ss)) > 0 {
				// Since the spans are constant for a request, every call to
				// ScanAndEnqueue for that request will execute the following code
				// for the same SpanScope(s). Any SpanScope for which this code does
				// not execute will always have an empty snapshot.
				t.locks[ss].mu.RLock()
				g.tableSnapshot[ss].Reset()
				g.tableSnapshot[ss] = t.locks[ss].Clone()
				t.locks[ss].mu.RUnlock()
				break
			}
		}
	}
	g.findNextLockAfter(true /* notify */)
	return g
}

// Dequeue implements the lockTable interface.
func (t *lockTableImpl) Dequeue(guard lockTableGuard) {
	// NOTE: there is no need to synchronize with enabledMu here. Dequeue only
	// accesses state already held by the guard and does not add anything to the
	// lockTable.

	g := guard.(*lockTableGuardImpl)
	defer releaseLockTableGuardImpl(g)

	var candidateLocks []*lockState
	g.mu.Lock()
	for l := range g.mu.locks {
		candidateLocks = append(candidateLocks, l)
	}
	g.mu.Unlock()
	var locksToGC [spanset.NumSpanScope][]*lockState
	for _, l := range candidateLocks {
		if gc := l.requestDone(g); gc {
			locksToGC[l.ss] = append(locksToGC[l.ss], l)
		}
	}

	for i := 0; i < len(locksToGC); i++ {
		if len(locksToGC[i]) > 0 {
			t.tryGCLocks(&t.locks[i], locksToGC[i])
		}
	}
}

// AddDiscoveredLock implements the lockTable interface.
func (t *lockTableImpl) AddDiscoveredLock(
	intent *roachpb.Intent, guard lockTableGuard,
) (added bool, _ error) {
	t.enabledMu.RLock()
	defer t.enabledMu.RUnlock()
	if !t.enabled {
		// If not enabled, don't track any locks.
		return false, nil
	}
	g := guard.(*lockTableGuardImpl)
	key := intent.Key
	sa, ss, err := findAccessInSpans(key, g.spans)
	if err != nil {
		return false, err
	}
	var l *lockState
	tree := &t.locks[ss]
	tree.mu.Lock()
	// Can't release tree.mu until we call l.discoveredLock() since someone may
	// find an empty lock and remove it from the tree.
	defer tree.mu.Unlock()
	iter := tree.MakeIter()
	iter.FirstOverlap(&lockState{key: key})
	if !iter.Valid() {
		l = &lockState{id: tree.nextLockSeqNum(), key: key, ss: ss}
		l.queuedWriters.Init()
		l.waitingReaders.Init()
		tree.Set(l)
		atomic.AddInt64(&tree.numLocks, 1)
	} else {
		l = iter.Cur()
	}
	return true, l.discoveredLock(&intent.Txn, intent.Txn.WriteTimestamp, g, sa)
}
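
// scanAndWaitSketch is an illustrative, unused sketch of how a request might
// drive the lockTable interface: scan and enqueue, wait for state changes
// while the guard says to wait, and re-scan after each doneWaiting pass. Latch
// management and pushing the conflicting transaction are omitted; the method
// names ShouldWait, NewStateChan, and CurState are taken from the
// lockTableGuard interface, and the real waiting logic lives in
// lock_table_waiter.go and the concurrency manager.
func scanAndWaitSketch(lt lockTable, req Request) lockTableGuard {
	g := lt.ScanAndEnqueue(req, nil /* guard */)
	for g.ShouldWait() {
		<-g.NewStateChan()
		if state := g.CurState(); state.kind == doneWaiting {
			// Done with this pass through the lockTable; re-scan to pick up
			// any locks discovered or acquired in the meantime.
			g = lt.ScanAndEnqueue(req, g)
		}
		// Else the caller would push or wait on state.txn before continuing.
	}
	return g
}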

// AcquireLock implements the lockTable interface.
func (t *lockTableImpl) AcquireLock(
	txn *enginepb.TxnMeta, key roachpb.Key, strength lock.Strength, durability lock.Durability,
) error {
	t.enabledMu.RLock()
	defer t.enabledMu.RUnlock()
	if !t.enabled {
		// If not enabled, don't track any locks.
		return nil
	}
	if strength != lock.Exclusive {
		return errors.Errorf("caller violated contract: lock strength not Exclusive")
	}
	ss := spanset.SpanGlobal
	if keys.IsLocal(key) {
		ss = spanset.SpanLocal
	}
	var l *lockState
	tree := &t.locks[ss]
	tree.mu.Lock()
	// Can't release tree.mu until we call l.acquireLock() since someone may
	// find an empty lock and remove it from the tree. If we expect that the
	// lockState will already be in the tree we can optimize this by first
	// trying with a tree.mu.RLock().
	iter := tree.MakeIter()
	iter.FirstOverlap(&lockState{key: key})
	if !iter.Valid() {
		if durability == lock.Replicated {
			tree.mu.Unlock()
			// Don't remember uncontended replicated locks.
			return nil
		}
		l = &lockState{id: tree.nextLockSeqNum(), key: key, ss: ss}
		l.queuedWriters.Init()
		l.waitingReaders.Init()
		tree.Set(l)
		atomic.AddInt64(&tree.numLocks, 1)
	} else {
		l = iter.Cur()
	}
	err := l.acquireLock(strength, durability, txn, txn.WriteTimestamp)
	tree.mu.Unlock()

	var totalLocks int64
	for i := 0; i < len(t.locks); i++ {
		totalLocks += atomic.LoadInt64(&t.locks[i].numLocks)
	}
	if totalLocks > t.maxLocks {
		t.tryClearLocks(false /* force */)
	}
	return err
}

// If force is false, removes all locks except those that are held with
// replicated durability and have no distinguished waiter, and tells the
// removed locks' waiters to wait elsewhere or that they are done waiting. A
// replicated lock that has been discovered by a request, but that no request
// is actively waiting on, is preserved since we need to tell that request who
// it is waiting for when it next calls ScanAndEnqueue(). If we aggressively
// removed even these locks, the next ScanAndEnqueue() would not find the lock,
// the request would evaluate again, discover the lock again, and, if
// tryClearLocks() kept getting called, it would be stuck in this loop without
// ever pushing.
//
// If force is true, removes all locks and marks all guards as doneWaiting.
func (t *lockTableImpl) tryClearLocks(force bool) {
	for i := 0; i < int(spanset.NumSpanScope); i++ {
		tree := &t.locks[i]
		tree.mu.Lock()
		var locksToClear []*lockState
		iter := tree.MakeIter()
		for iter.First(); iter.Valid(); iter.Next() {
			l := iter.Cur()
			if l.tryClearLock(force) {
				locksToClear = append(locksToClear, l)
			}
		}
		atomic.AddInt64(&tree.numLocks, int64(-len(locksToClear)))
		if tree.Len() == len(locksToClear) {
			// Fast-path full clear.
			tree.Reset()
		} else {
			for _, l := range locksToClear {
				tree.Delete(l)
			}
		}
		tree.mu.Unlock()
	}
}
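
// acquireDurabilitySketch is an illustrative, unused helper contrasting the
// two durabilities accepted by AcquireLock. The transaction and keys are
// assumed to come from the caller; k1 and k2 are illustrative names.
func acquireDurabilitySketch(t *lockTableImpl, txn *enginepb.TxnMeta, k1, k2 roachpb.Key) error {
	// Unreplicated exclusive lock on k1: tracked in the lock table.
	if err := t.AcquireLock(txn, k1, lock.Exclusive, lock.Unreplicated); err != nil {
		return err
	}
	// Replicated exclusive lock on an uncontended key k2: not tracked here,
	// since its state can be recovered from persistent storage.
	return t.AcquireLock(txn, k2, lock.Exclusive, lock.Replicated)
}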

// Given that the key must be in spans, returns the strongest access specified
// in the spans, along with the scope of the key.
func findAccessInSpans(
	key roachpb.Key, spans *spanset.SpanSet,
) (spanset.SpanAccess, spanset.SpanScope, error) {
	ss := spanset.SpanGlobal
	if keys.IsLocal(key) {
		ss = spanset.SpanLocal
	}
	for sa := spanset.NumSpanAccess - 1; sa >= 0; sa-- {
		s := spans.GetSpans(sa, ss)
		// Find the first span that starts after key.
		i := sort.Search(len(s), func(i int) bool {
			return key.Compare(s[i].Key) < 0
		})
		if i > 0 &&
			((len(s[i-1].EndKey) > 0 && key.Compare(s[i-1].EndKey) < 0) || key.Equal(s[i-1].Key)) {
			return sa, ss, nil
		}
	}
	return 0, 0, errors.Errorf("caller violated contract: could not find access in spans")
}

// Tries to GC locks that were previously known to have become empty.
func (t *lockTableImpl) tryGCLocks(tree *treeMu, locks []*lockState) {
	tree.mu.Lock()
	defer tree.mu.Unlock()
	for _, l := range locks {
		iter := tree.MakeIter()
		iter.FirstOverlap(l)
		// Since the same lockState can go from non-empty to empty multiple
		// times, it is possible that multiple threads race to delete it:
		// several find it empty and one wins. If a concurrent thread made the
		// lockState non-empty we do not want to delete it accidentally.
		if !iter.Valid() {
			continue
		}
		l = iter.Cur()
		l.mu.Lock()
		empty := l.isEmptyLock()
		l.mu.Unlock()
		if empty {
			tree.Delete(l)
			atomic.AddInt64(&tree.numLocks, -1)
		}
	}
}

// UpdateLocks implements the lockTable interface.
func (t *lockTableImpl) UpdateLocks(up *roachpb.LockUpdate) error {
	// NOTE: there is no need to synchronize with enabledMu here. UpdateLocks
	// only accesses locks already in the lockTable, but a disabled lockTable
	// will be empty. If the lock-table scan below races with a concurrent call
	// to clear then it might update a few locks, but they will quickly be
	// cleared.

	span := up.Span
	ss := spanset.SpanGlobal
	if keys.IsLocal(span.Key) {
		ss = spanset.SpanLocal
	}
	tree := &t.locks[ss]
	var err error
	var locksToGC []*lockState
	changeFunc := func(l *lockState) {
		gc, err2 := l.tryUpdateLock(up)
		if err2 != nil {
			err = err2
			return
		}
		if gc {
			locksToGC = append(locksToGC, l)
		}
	}
	tree.mu.RLock()
	iter := tree.MakeIter()
	ltRange := &lockState{key: span.Key, endKey: span.EndKey}
	for iter.FirstOverlap(ltRange); iter.Valid(); iter.NextOverlap(ltRange) {
		changeFunc(iter.Cur())
		// Optimization to avoid a second key comparison (not for correctness).
		if len(span.EndKey) == 0 {
			break
		}
	}
	tree.mu.RUnlock()

	if len(locksToGC) > 0 {
		t.tryGCLocks(tree, locksToGC)
	}
	return err
}

// Iteration helper for findNextLockAfter. Returns the next span to search
// over, or nil if the iteration is done.
// REQUIRES: g.mu is locked.
func stepToNextSpan(g *lockTableGuardImpl) *spanset.Span {
	g.index++
	for ; g.ss < spanset.NumSpanScope; g.ss++ {
		for ; g.sa >= 0; g.sa-- {
			spans := g.spans.GetSpans(g.sa, g.ss)
			if g.index < len(spans) {
				span := &spans[g.index]
				g.key = span.Key
				return span
			}
			g.index = 0
		}
		g.sa = spanset.NumSpanAccess - 1
	}
	return nil
}
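
// releaseOnCommitSketch is an illustrative, unused sketch of releasing a
// transaction's locks over a span once the transaction is finalized: a
// LockUpdate with a finalized status causes tryUpdateLock to clear the holder
// state, after which the affected lockStates can be garbage collected. The
// span and txn are assumed to come from the caller.
func releaseOnCommitSketch(t *lockTableImpl, txn *enginepb.TxnMeta, span roachpb.Span) error {
	up := &roachpb.LockUpdate{
		Span:   span,
		Txn:    *txn,
		Status: roachpb.COMMITTED,
	}
	return t.UpdateLocks(up)
}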

// Enable implements the lockTable interface.
func (t *lockTableImpl) Enable() {
	// Avoid disrupting other requests if the lockTable is already enabled.
	// NOTE: This may be a premature optimization, but it can't hurt.
	t.enabledMu.RLock()
	enabled := t.enabled
	t.enabledMu.RUnlock()
	if enabled {
		return
	}
	t.enabledMu.Lock()
	t.enabled = true
	t.enabledMu.Unlock()
}

// Clear implements the lockTable interface.
func (t *lockTableImpl) Clear(disable bool) {
	// If disabling, lock the entire table to prevent concurrent accesses
	// from adding state to the table as we clear it. If not, there's no
	// need to synchronize with enabledMu because we're only removing state.
	if disable {
		t.enabledMu.Lock()
		defer t.enabledMu.Unlock()
		t.enabled = false
	}
	t.tryClearLocks(true /* force */)
}

// For tests.
func (t *lockTableImpl) String() string {
	var buf strings.Builder
	for i := 0; i < len(t.locks); i++ {
		tree := &t.locks[i]
		scope := spanset.SpanScope(i).String()
		tree.mu.RLock()
		fmt.Fprintf(&buf, "%s: num=%d\n", scope, atomic.LoadInt64(&tree.numLocks))
		iter := tree.MakeIter()
		for iter.First(); iter.Valid(); iter.Next() {
			l := iter.Cur()
			l.mu.Lock()
			l.Format(&buf)
			l.mu.Unlock()
		}
		tree.mu.RUnlock()
	}
	return buf.String()
}
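
// leaseLifecycleSketch is an illustrative, unused sketch of the intended
// enable/clear lifecycle: the lock table is enabled when the replica holds the
// lease and is cleared and disabled when it does not. The actual wiring is
// assumed to live in the concurrency manager, not here.
func leaseLifecycleSketch(t *lockTableImpl, isLeaseholder bool) {
	if isLeaseholder {
		t.Enable()
	} else {
		// Drop all locks and wait-queues and stop tracking new ones.
		t.Clear(true /* disable */)
	}
}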