github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/concurrency/concurrency_control.go

// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

// Package concurrency provides a concurrency manager structure that
// encapsulates the details of concurrency control and contention handling for
// serializable key-value transactions.
package concurrency

import (
	"context"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/txnwait"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
)

// Manager is a structure that sequences incoming requests and provides
// isolation between requests that intend to perform conflicting operations.
// During sequencing, conflicts are discovered and any found are resolved
// through a combination of passive queuing and active pushing. Once a request
// has been sequenced, it is free to evaluate without concerns of conflicting
// with other in-flight requests due to the isolation provided by the manager.
// This isolation is guaranteed for the lifetime of the request but terminates
// once the request completes.
//
// Transactions require isolation both within requests and across requests.
// The manager accommodates this by allowing transactional requests to acquire
// locks, which outlive the requests themselves. Locks extend the duration of
// the isolation provided over specific keys to the lifetime of the
// lock-holder transaction itself. They are (typically) only released when the
// transaction commits or aborts. Other requests that find these locks while
// being sequenced wait on them to be released in a queue before proceeding.
// Because locks are checked during sequencing, requests are guaranteed access
// to all declared keys after they have been sequenced. In other words, locks
// don't need to be checked again during evaluation.
//
// However, at the time of writing, not all locks are stored directly under
// the manager's control, so not all locks are discoverable during sequencing.
// Specifically, write intents (replicated, exclusive locks) are stored inline
// in the MVCC keyspace, so they are not detectable until request evaluation
// time. To accommodate this form of lock storage, the manager exposes a
// HandleWriterIntentError method, which can be used in conjunction with a
// retry loop around evaluation to integrate external locks with the
// concurrency manager structure. In the future, we intend to pull all locks,
// including those associated with write intents, into the concurrency manager
// directly through a replicated lock table structure.
//
// Fairness is ensured between requests. In general, if any two requests
// conflict then the request that arrived first will be sequenced first. As
// such, sequencing guarantees FIFO semantics. The primary exception to this
// is that a request that is part of a transaction which has already acquired
// a lock does not need to wait on that lock during sequencing, and can
// therefore ignore any queue that has formed on the lock. For other
// exceptions, see the later comment for lockTable.
//
// Internal Components
//
// The concurrency manager is composed of a number of internal
// synchronization, bookkeeping, and queueing structures. Each of these is
// discussed in more detail on their interface definition. The following
// diagram details how the components are tied together:
//
//  +---------------------+---------------------------------------------+
//  | concurrency.Manager |                                             |
//  +---------------------+                                             |
//  |                                                                   |
//  +------------+  acquire  +--------------+  acquire                  |
//    Sequence() |--->--->---| latchManager |<---<---<---<---<---<---+  |
//  +------------+           +--------------+                        |  |
//  |                        / check locks + wait queues             |  |
//  |                       v  if conflict, enter q & drop latches   ^  |
//  |        +---------------------------------------------------+   |  |
//  |        | [ lockTable ]                                      |   |  |
//  |        | [ key1 ]    -------------+-----------------+       |   ^  |
//  |        | [ key2 ]  / lockState:   | lockWaitQueue:  |----<---<---<----+
//  |        | [ key3 ]-{  - lock type  | +-[a]<-[b]<-[c] |       |   |     |
//  |        | [ key4 ]  \ - txn meta   | | (no latches)  |-->-^  |   |     |
//  |        | [ key5 ]    -------------+-|---------------+    |  |   |     |
//  |        | [ ... ]                    v                    |  |   ^     |
//  |        +---------------------------|--------------------+  |   |     if lock found, HandleWriterIntentError()
//  |                                    |                       |   |      - enter lockWaitQueue
//  |                   +- may be remote -+--+                   |   |      - drop latches
//  |                   |                    |                   |   |      - wait for lock update / release
//  |                   v                    v                   ^   |
//  |                   |   +--------------------------+         |   ^
//  |                   |   | txnWaitQueue:            |         |   |
//  |                   |   | (located on txn record's |         |   |
//  |                   v   |  leaseholder replica)    |         |   |
//  |                   |   |--------------------------|         |   ^
//  |                   |   | [txn1] [txn2] [txn3] ... |----<---<---<---<----+
//  |                   |   +--------------------------+         |   |      if txn push failed, HandleTransactionPushError()
//  |                   |                                        |   |       - enter txnWaitQueue
//  |                   |                                        ^   |       - drop latches
//  |                   |                                        |   |       - wait for txn record update
//  |                   |                                        |   |
//  |                   |                                        |   |
//  |                   +--> retain latches --> remain at head of queues ---> evaluate ---> Finish()
//  |                                                                  |
//  +----------+                                                       |
//   Finish()  | ---> exit wait queues ---> drop latches --------------------> respond ...
//  +----------+                                                       |
//  |                                                                  |
//  +-------------------------------------------------------------------+
//
// See the comments on individual components for a more detailed look at their
// interface and inner-workings.
//
// At a high-level, a request enters the concurrency manager and immediately
// acquires latches from the latchManager to serialize access to the keys that
// it intends to touch. This latching takes into account the keys being
// accessed, the MVCC timestamp of accesses, and the access method being used
// (read vs. write) to allow for concurrency where possible. This has the
// effect of queuing on conflicting in-flight operations until their
// completion.
//
// Once latched, the request consults the lockTable to check for any
// conflicting locks owned by other transactions. If any are found, the
// request enters the corresponding lockWaitQueue and its latches are dropped.
// Requests in the queue wait for the corresponding lock to be released by
// intent resolution. While waiting, the head of the lockWaitQueue pushes the
// owner of the lock through a remote RPC that ends up in the pushee's
// txnWaitQueue. This queue exists on the leaseholder replica of the range
// that contains the pushee's transaction record. Other entries in the queue
// wait for the head of the queue, eventually pushing it to detect coordinator
// failures and transaction deadlocks. Once the lock is released, the head of
// the queue reacquires latches and attempts to proceed while remaining at the
// head of that lockWaitQueue to ensure fairness.
//
// Once a request is latched and observes no conflicting locks in the
// lockTable and no conflicting lockWaitQueues that it is not already the head
// of, the request can proceed to evaluate. During evaluation, the request may
// insert or remove locks from the lockTable for its own transaction.
//
// When the request completes, it exits any lockWaitQueues that it was a part
// of and releases its latches. However, if the request was successful, any
// locks that it inserted into the lockTable remain.
type Manager interface {
	RequestSequencer
	ContentionHandler
	LockManager
	TransactionManager
	RangeStateListener
	MetricExporter
}

// RequestSequencer is concerned with the sequencing of concurrent requests.
// It is one of the roles of Manager.
type RequestSequencer interface {
	// SequenceReq acquires latches, checks for locks, and queues behind
	// and/or pushes other transactions to resolve any conflicts. Once
	// sequenced, the request is guaranteed sufficient isolation for the
	// duration of its evaluation, until the returned request guard is
	// released.
	// NOTE: this last part will not be true until replicated locks are pulled
	// into the concurrency manager.
	//
	// An optional existing request guard can be provided to SequenceReq. This
	// allows the request's position in lock wait-queues to be retained across
	// sequencing attempts. If provided, the guard should not be holding
	// latches already. The expected usage of this parameter is that it will
	// only be provided after acquiring a Guard from a ContentionHandler
	// method.
	//
	// If the method returns a non-nil request guard then the caller must
	// ensure that the guard is eventually released by passing it to
	// FinishReq.
	//
	// Alternatively, the concurrency manager may be able to serve the request
	// directly, in which case it will return a Response for the request. If
	// it does so, it will not return a request guard.
	SequenceReq(context.Context, *Guard, Request) (*Guard, Response, *Error)

	// FinishReq marks the request as complete, releasing any protection
	// the request had against conflicting requests and allowing conflicting
	// requests that are blocked on this one to proceed. The guard should not
	// be used after being released.
	FinishReq(*Guard)
}
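
// The following is an illustrative sketch and not part of the package API. It
// shows how a caller might drive a Manager through the SequenceReq/FinishReq
// lifecycle, retrying sequencing when evaluation discovers a replicated write
// intent that the lockTable was not tracking. The exampleSequenceAndEvaluate
// name and the evaluate callback are hypothetical; the real evaluation loop
// lives in the Replica code that embeds this Manager.
func exampleSequenceAndEvaluate(
	ctx context.Context,
	m Manager,
	req Request,
	evaluate func(*Guard) (Response, *roachpb.WriteIntentError, *Error),
) (Response, *Error) {
	var g *Guard
	defer func() {
		if g != nil {
			// Exit any lock wait-queues and drop latches.
			m.FinishReq(g)
		}
	}()
	for {
		// Acquire latches and wait on any conflicting locks. A nil guard is
		// passed on the first attempt; later attempts reuse the guard so the
		// request keeps its position in lock wait-queues.
		var resp Response
		var err *Error
		g, resp, err = m.SequenceReq(ctx, g, req)
		if err != nil || resp != nil {
			// The request failed, or it was served directly during sequencing
			// (in which case no guard was returned).
			return resp, err
		}
		// Evaluate while holding latches.
		resp, wiErr, err := evaluate(g)
		if err != nil {
			return nil, err
		}
		if wiErr == nil {
			return resp, nil
		}
		// Evaluation discovered a replicated write intent. Inform the manager,
		// drop latches, and retry sequencing so the request waits in the
		// lock's wait-queue.
		g, err = m.HandleWriterIntentError(ctx, g, wiErr)
		if err != nil {
			// Per the ContentionHandler contract, the guard was released and
			// no guard was returned.
			return nil, err
		}
	}
}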

// ContentionHandler is concerned with handling contention-related errors.
// This typically involves preparing the request to be queued upon a retry. It
// is one of the roles of Manager.
type ContentionHandler interface {
	// HandleWriterIntentError consumes a WriteIntentError by informing the
	// concurrency manager about the replicated write intent that was missing
	// from its lock table and was found during request evaluation (while
	// holding latches). After doing so, it enqueues the request that hit the
	// error in the lock's wait-queue (but does not wait) and releases the
	// guard's latches. It returns an updated guard reflecting this change.
	// After the method returns, the original guard should no longer be used.
	// If an error is returned then the provided guard will be released and no
	// guard will be returned.
	//
	// Example usage: Txn A scans the lock table and does not see an intent on
	// key K from txn B because the intent is not being tracked in the lock
	// table. Txn A moves on to evaluation. While scanning, it notices the
	// intent on key K. It throws a WriteIntentError which is consumed by this
	// method before txn A retries its scan. During the retry, txn A scans the
	// lock table and observes the lock on key K, so it enters the lock's
	// wait-queue and waits for it to be resolved.
	HandleWriterIntentError(context.Context, *Guard, *roachpb.WriteIntentError) (*Guard, *Error)

	// HandleTransactionPushError consumes a TransactionPushError thrown by a
	// PushTxnRequest by informing the concurrency manager about a transaction
	// record that could not be pushed during request evaluation (while
	// holding latches). After doing so, it releases the guard's latches. It
	// returns an updated guard reflecting this change. After the method
	// returns, the original guard should no longer be used.
	//
	// Example usage: Txn A sends a PushTxn request to push abort txn B. When
	// the request is originally sequenced through the concurrency manager, it
	// checks the txn wait-queue and finds that txn B is not being tracked, so
	// it does not queue up behind it. Txn A moves on to evaluation and tries
	// to push txn B's record. This push fails because txn B is not expired,
	// which results in a TransactionPushError. This error is consumed by this
	// method before txn A retries its push. During the retry, txn A finds
	// that txn B is being tracked in the txn wait-queue so it waits there for
	// txn B to finish.
	HandleTransactionPushError(context.Context, *Guard, *roachpb.TransactionPushError) *Guard
}

// LockManager is concerned with tracking locks that are stored on the
// manager's range. It is one of the roles of Manager.
type LockManager interface {
	// OnLockAcquired informs the concurrency manager that a transaction has
	// acquired a new lock or re-acquired an existing lock that it already
	// held.
	OnLockAcquired(context.Context, *roachpb.LockAcquisition)

	// OnLockUpdated informs the concurrency manager that a transaction has
	// updated or released a lock or range of locks that it previously held.
	// The Durability field of the lock update struct is ignored.
	OnLockUpdated(context.Context, *roachpb.LockUpdate)
}
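
// Illustrative sketch (not part of the package API): how a replica might feed
// lock state changes back into the LockManager after a request evaluates or
// after intent resolution is applied. The exampleOnLocksChanged name and the
// acquisitions/updates parameters are hypothetical stand-ins for the lock
// acquisitions and lock updates that evaluation produces.
func exampleOnLocksChanged(
	ctx context.Context,
	lm LockManager,
	acquisitions []roachpb.LockAcquisition,
	updates []roachpb.LockUpdate,
) {
	// Newly acquired (or re-acquired) locks are added to the manager so that
	// later requests can discover them during sequencing.
	for i := range acquisitions {
		lm.OnLockAcquired(ctx, &acquisitions[i])
	}
	// Updated or released locks (e.g. from intent resolution) adjust or remove
	// existing entries and allow any waiters to proceed.
	for i := range updates {
		lm.OnLockUpdated(ctx, &updates[i])
	}
}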

// TransactionManager is concerned with tracking transactions that have their
// record stored on the manager's range. It is one of the roles of Manager.
type TransactionManager interface {
	// OnTransactionUpdated informs the concurrency manager that a
	// transaction's status was updated.
	OnTransactionUpdated(context.Context, *roachpb.Transaction)

	// GetDependents returns a set of transactions waiting on the specified
	// transaction either directly or indirectly. The method is used to
	// perform deadlock detection. See txnWaitQueue for more.
	GetDependents(uuid.UUID) []uuid.UUID
}

// RangeStateListener is concerned with observing updates to the concurrency
// manager's range. It is one of the roles of Manager.
type RangeStateListener interface {
	// OnRangeDescUpdated informs the manager that its range's descriptor has
	// been updated.
	OnRangeDescUpdated(*roachpb.RangeDescriptor)

	// OnRangeLeaseUpdated informs the concurrency manager that its range's
	// lease has been updated. The argument indicates whether this manager's
	// replica is the leaseholder going forward.
	OnRangeLeaseUpdated(isLeaseholder bool)

	// OnRangeSplit informs the concurrency manager that its range has split
	// off a new range to its RHS.
	OnRangeSplit()

	// OnRangeMerge informs the concurrency manager that its range has merged
	// into its LHS neighbor. This is not called on the LHS range being merged
	// into.
	OnRangeMerge()

	// OnReplicaSnapshotApplied informs the concurrency manager that its
	// replica has received a snapshot from another replica in its range.
	OnReplicaSnapshotApplied()
}

// MetricExporter is concerned with providing observability into the state of
// the concurrency manager. It is one of the roles of Manager.
type MetricExporter interface {
	// LatchMetrics returns information about the state of the latchManager.
	LatchMetrics() (global, local kvserverpb.LatchManagerInfo)

	// LockTableDebug returns a debug string representing the state of the
	// lockTable.
	LockTableDebug() string

	// TxnWaitQueue returns the concurrency manager's txnWaitQueue.
	// TODO(nvanbenschoten): this doesn't really fit into this interface. It
	// would be nice if the txnWaitQueue was hidden behind the concurrency
	// manager abstraction entirely, but tests want to access it directly.
	TxnWaitQueue() *txnwait.Queue

	// TODO(nvanbenschoten): fill out this interface to provide observability
	// into the state of the concurrency manager.
	// LatchMetrics()
	// LockTableMetrics()
	// TxnWaitQueueMetrics()
}
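
// Illustrative sketch (not part of the package API): how a replica might relay
// range lifecycle events to the concurrency manager through its
// RangeStateListener role. The exampleOnLeaseChanged name is hypothetical.
func exampleOnLeaseChanged(
	m RangeStateListener, desc *roachpb.RangeDescriptor, isLeaseholder bool,
) {
	// Keep the manager's view of the range descriptor up to date.
	m.OnRangeDescUpdated(desc)
	// Tell the manager whether this replica holds the lease going forward.
	// When the replica is not the leaseholder, the manager can stop tracking
	// locks and waiting pushers, since only the leaseholder's state is
	// authoritative (see the lockTable "disabled" state mentioned below).
	m.OnRangeLeaseUpdated(isLeaseholder)
}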

///////////////////////////////////
// External API Type Definitions //
///////////////////////////////////

// Request is the input to Manager.SequenceReq. The struct contains all of the
// information necessary to sequence a KV request and determine which locks
// and other in-flight requests it conflicts with.
type Request struct {
	// The (optional) transaction that sent the request.
	// Non-transactional requests do not acquire locks.
	Txn *roachpb.Transaction

	// The timestamp that the request should evaluate at.
	// Should be set to Txn.ReadTimestamp if Txn is non-nil.
	Timestamp hlc.Timestamp

	// The priority of the request. Only set if Txn is nil.
	Priority roachpb.UserPriority

	// The consistency level of the request. Only set if Txn is nil.
	ReadConsistency roachpb.ReadConsistencyType

	// The individual requests in the batch.
	Requests []roachpb.RequestUnion

	// The maximal set of spans that the request will access. Latches
	// will be acquired for these spans.
	// TODO(nvanbenschoten): don't allocate these SpanSet objects.
	LatchSpans *spanset.SpanSet

	// The maximal set of spans within which the request expects to have
	// isolation from conflicting transactions. Conflicting locks within
	// these spans will be queued on and conditionally pushed.
	//
	// Note that unlike LatchSpans, the timestamps that these spans are
	// declared at are NOT consulted. All read spans are considered to take
	// place at the transaction's read timestamp (Txn.ReadTimestamp) and all
	// write spans are considered to take place at the transaction's write
	// timestamp (Txn.WriteTimestamp). If the request is non-transactional
	// (Txn == nil), all reads and writes are considered to take place at
	// Timestamp.
	LockSpans *spanset.SpanSet
}

// Guard is returned from Manager.SequenceReq. The guard is passed back in to
// Manager.FinishReq to release the request's resources when it has completed.
type Guard struct {
	Req Request
	lg  latchGuard
	ltg lockTableGuard
}

// Response is a slice of responses to requests in a batch. This type is used
// when the concurrency manager is able to respond to a request directly
// during sequencing.
type Response = []roachpb.ResponseUnion

// Error is an alias for a roachpb.Error.
type Error = roachpb.Error

///////////////////////////////////
// Internal Structure Interfaces //
///////////////////////////////////

// latchManager serializes access to keys and key ranges.
//
// See additional documentation in pkg/storage/spanlatch.
type latchManager interface {
	// Acquires latches, providing mutual exclusion for conflicting requests.
	Acquire(context.Context, Request) (latchGuard, *Error)

	// Releases latches, relinquishing their protection from conflicting
	// requests.
	Release(latchGuard)

	// Info returns information about the state of the latchManager.
	Info() (global, local kvserverpb.LatchManagerInfo)
}

// latchGuard is a handle to a set of acquired key latches.
type latchGuard interface{}
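
// Illustrative sketch (not part of the package API): the basic latching
// pattern applied around a unit of work. The exampleWithLatches name and the
// fn callback are hypothetical.
func exampleWithLatches(ctx context.Context, lm latchManager, req Request, fn func()) *Error {
	// Block until all conflicting, earlier-sequenced requests release their
	// latches, then hold latches over the request's declared spans.
	lg, err := lm.Acquire(ctx, req)
	if err != nil {
		return err
	}
	// Release the latches when done so that queued conflicting requests can
	// proceed.
	defer lm.Release(lg)
	fn()
	return nil
}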

// lockTable holds a collection of locks acquired by in-progress transactions.
// Each lock in the table has a possibly-empty lock wait-queue associated with
// it, where conflicting transactions can queue while waiting for the lock to
// be released.
//
//  +---------------------------------------------------+
//  | [ lockTable ]                                      |
//  | [ key1 ]    -------------+-----------------+       |
//  | [ key2 ]  / lockState:   | lockWaitQueue:  |       |
//  | [ key3 ]-{  - lock type  | <-[a]<-[b]<-[c] |       |
//  | [ key4 ]  \ - txn meta   |                 |       |
//  | [ key5 ]    -------------+-----------------+       |
//  | [ ... ]                                            |
//  +---------------------------------------------------+
//
// The database is read and written using "requests". Transactions are
// composed of one or more requests. Isolation is needed across requests.
// Additionally, since transactions represent a group of requests, isolation
// is needed across such groups. Part of this isolation is accomplished by
// maintaining multiple versions and part by allowing requests to acquire
// locks. Even the isolation based on multiple versions requires some form of
// mutual exclusion to ensure that a read and a conflicting lock acquisition
// do not happen concurrently. The lock table provides both locking and
// sequencing of requests (in concert with the use of latches). The lock table
// sequences both transactional and non-transactional requests, but the latter
// cannot acquire locks.
//
// Locks outlive the requests themselves and thereby extend the duration of
// the isolation provided over specific keys to the lifetime of the
// lock-holder transaction itself. They are (typically) only released when the
// transaction commits or aborts. Other requests that find these locks while
// being sequenced wait on them to be released in a queue before proceeding.
// Because locks are checked during sequencing, requests are guaranteed access
// to all declared keys after they have been sequenced. In other words, locks
// don't need to be checked again during evaluation.
//
// However, at the time of writing, not all locks are stored directly under
// lock table control, so not all locks are discoverable during sequencing.
// Specifically, write intents (replicated, exclusive locks) are stored inline
// in the MVCC keyspace, so they are often not detectable until request
// evaluation time. To accommodate this form of lock storage, the lock table
// exposes an AddDiscoveredLock method. In the future, we intend to pull all
// locks, including those associated with write intents, into the lock table
// directly.
//
// The lock table also provides fairness between requests. If two requests
// conflict then the request that arrived first will typically be sequenced
// first. There are some exceptions:
//
// - a request that is part of a transaction which has already acquired a lock
//   does not need to wait on that lock during sequencing, and can therefore
//   ignore any queue that has formed on the lock.
//
// - contending requests that encounter different levels of contention may be
//   sequenced in non-FIFO order. This is to allow for more concurrency. e.g.
//   if request R1 and R2 contend on key K2, but R1 is also waiting at key K1,
//   R2 could slip past R1 and evaluate.
//
type lockTable interface {
	requestQueuer

	// ScanAndEnqueue scans over the spans that the request will access and
	// enqueues the request in the lock wait-queue of any conflicting locks
	// encountered.
	//
	// The first call to ScanAndEnqueue for a given request uses a nil
	// lockTableGuard and the subsequent calls reuse the previously returned
	// one. The latches needed by the request must be held when calling this
	// function.
	ScanAndEnqueue(Request, lockTableGuard) lockTableGuard

	// Dequeue removes the request from its lock wait-queues. It should be
	// called when the request is finished, whether it evaluated or not. The
	// guard should not be used after being dequeued.
	//
	// This method does not release any locks. This method must be called on
	// the last guard returned from ScanAndEnqueue for the request, even if
	// one of the (a) lockTable calls that use a lockTableGuard parameter, or
	// (b) a lockTableGuard call, returned an error. The method allows but
	// does not require latches to be held.
	Dequeue(lockTableGuard)

	// AddDiscoveredLock informs the lockTable of a lock that was discovered
	// during evaluation which the lockTable wasn't previously tracking.
	//
	// The method is called when an exclusive replicated lock held by a
	// different transaction is discovered when reading the MVCC keys during
	// evaluation of this request. It adds the lock and enqueues this
	// requester in its wait-queue. It is required that request evaluation
	// discover such locks before acquiring its own locks, since the request
	// needs to repeat ScanAndEnqueue.
	//
	// A latch consistent with the access desired by the guard must be held on
	// the span containing the discovered lock's key.
	//
	// The method returns a boolean indicating whether the discovered lock was
	// added to the lockTable (true) or whether it was ignored because the
	// lockTable is currently disabled (false).
	AddDiscoveredLock(*roachpb.Intent, lockTableGuard) (bool, error)

	// AcquireLock informs the lockTable that a new lock was acquired or an
	// existing lock was updated.
	//
	// The provided TxnMeta must be the same one used when the request scanned
	// the lockTable initially. It must only be called in the evaluation phase
	// before calling Dequeue, which means all the latches needed by the
	// request are held. The key must be in the request's SpanSet with the
	// appropriate SpanAccess: currently the strength is always Exclusive, so
	// the span containing this key must be SpanReadWrite. This contract
	// ensures that the lock is not held in a conflicting manner by a
	// different transaction. Acquiring a lock that is already held by this
	// transaction upgrades the lock's timestamp and strength, if necessary.
	//
	// For replicated locks, this must be called after the corresponding write
	// intent has been applied to the replicated state machine.
	AcquireLock(*enginepb.TxnMeta, roachpb.Key, lock.Strength, lock.Durability) error

	// UpdateLocks informs the lockTable that an existing lock or range of
	// locks was either updated or released.
	//
	// The method is called during intent resolution. For spans containing
	// Replicated locks, this must be called after intent resolution has been
	// applied to the replicated state machine. The method itself, however,
	// ignores the Durability field in the LockUpdate. It can therefore be
	// used to update locks for a given transaction for all durability levels.
	//
	// A latch with SpanReadWrite must be held on the span with the lowest
	// timestamp at which any of the locks could be held. This is explained
	// below.
	//
	// Note that spans can be wider than the actual keys on which locks were
	// acquired, and it is ok if no locks are found or locks held by other
	// transactions are found (for those locks this call is a noop).
	//
	// For COMMITTED or ABORTED transactions, all locks are released.
	//
	// For PENDING or STAGING transactions, the behavior is:
	//
	// - All replicated locks known to the lockTable are dropped. This is not
	//   because those intents are necessarily deleted, but because in the
	//   current code where intents are not managed by the lockTable (this
	//   will change when we have a segregated lock table), we do not want to
	//   risk code divergence between lockTable and mvccResolveWriteIntent:
	//   the danger is that the latter removes or changes an intent while the
	//   lockTable retains it, and a waiter is stuck forever.
	//
	//   Note that even the conservative behavior of dropping locks requires
	//   that intent resolution acquire latches using the oldest timestamp at
	//   which the intent could have been written: if the intent was at ts=5
	//   and the intent resolution is using ts=10 (since the transaction has
	//   been pushed), there is a race where a reader at ts=8 can be
	//   concurrently holding latches and the following bad sequence occurs
	//   (both thread1 and thread2 are concurrent since their latches do not
	//   conflict):
	//
	//   - [thread1-txn1] reader sees intent at ts=5
	//   - [thread2-txn2] intent resolution changes that intent to ts=10
	//   - [thread2-txn2] updateLocks is called and lock is removed since it
	//     is a replicated lock.
	//   - [thread1-txn1] reader calls addDiscoveredLock() for ts=5.
	//
	//   Now the lockTable thinks there is a lock and subsequent pushes of
	//   txn2 by txn1 will do nothing since txn2 is already at timestamp 10.
	//   Txn1 will unnecessarily block until txn2 is done.
	//
	// - Unreplicated locks:
	//   - for epochs older than txn.Epoch, locks are dropped.
	//   - locks in the current epoch that are at a TxnMeta.Sequence
	//     contained in IgnoredSeqNums are dropped.
	//   - the remaining locks are changed to timestamp equal to
	//     txn.WriteTimestamp.
	UpdateLocks(*roachpb.LockUpdate) error

	// String returns a debug string representing the state of the lockTable.
	String() string
}

// lockTableGuard is a handle to a request as it waits on conflicting locks in
// a lockTable or as it holds a place in lock wait-queues as it evaluates.
type lockTableGuard interface {
	// ShouldWait must be called after each ScanAndEnqueue. The request should
	// proceed to evaluation if it returns false, else it releases latches and
	// listens to the channel returned by NewStateChan.
	ShouldWait() bool

	// NewStateChan returns the channel to listen on for notification that the
	// state may have changed. If ShouldWait returns true, this channel will
	// have an initial notification. Note that notifications are collapsed if
	// not retrieved, since it is not necessary for the waiter to see every
	// state transition.
	NewStateChan() chan struct{}

	// CurState returns the latest waiting state.
	CurState() waitingState
}
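
// Illustrative sketch (not part of the package API): the scan-and-wait
// protocol that sequencing follows against the lockTable. Latch handling is
// elided; see the Manager documentation above for how latches are dropped
// before waiting and re-acquired afterwards. The exampleScanUntilClear name is
// hypothetical, and the lockTableWaiter it uses is described next.
func exampleScanUntilClear(
	ctx context.Context, lt lockTable, ltw lockTableWaiter, req Request,
) (lockTableGuard, *Error) {
	var ltg lockTableGuard
	for {
		// Scan the request's lock spans and enqueue behind any conflicting
		// locks. The guard from the previous attempt is reused so the request
		// keeps its place in lock wait-queues.
		ltg = lt.ScanAndEnqueue(req, ltg)
		if !ltg.ShouldWait() {
			// No conflicts remain in the request's way; it is free to
			// evaluate. The caller must eventually call lt.Dequeue(ltg).
			return ltg, nil
		}
		// Conflicts were found: wait (with latches dropped) until the
		// conflicting locks are released or updated, pushing their holders if
		// they appear to be abandoned or deadlocked.
		if err := ltw.WaitOn(ctx, req, ltg); err != nil {
			lt.Dequeue(ltg)
			return nil, err
		}
	}
}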

// lockTableWaiter is concerned with waiting in lock wait-queues for locks
// held by conflicting transactions. It ensures that waiting requests continue
// to make forward progress even in the presence of faulty transaction
// coordinators and transaction deadlocks.
//
// The waiter implements logic for a request to wait on conflicting locks in
// the lockTable until they are released. Similarly, it implements logic to
// wait on conflicting requests ahead of the caller's request in any lock
// wait-queues that it is a part of.
//
// This waiting state responds to a set of state transitions in the lock
// table:
// - a conflicting lock is released
// - a conflicting lock is updated such that it no longer conflicts
// - a conflicting request in the lock wait-queue acquires the lock
// - a conflicting request in the lock wait-queue exits the lock wait-queue
//
// These state transitions are typically reactive - the waiter can simply wait
// for locks to be released or lock wait-queues to be exited by other actors.
// Reacting to state transitions for conflicting locks is powered by the
// LockManager and reacting to state transitions for conflicting lock
// wait-queues is powered by the RequestSequencer interface.
//
// However, in the case of transaction coordinator failures or transaction
// deadlocks, a state transition may never occur without intervention from the
// waiter. To ensure forward-progress, the waiter may need to actively push
// either a lock holder of a conflicting lock or the head of a conflicting
// lock wait-queue. This active pushing requires an RPC to the leaseholder of
// the conflicting transaction's record, and will typically result in the RPC
// queuing in that leaseholder's txnWaitQueue. Because this can be expensive,
// the push is not immediately performed. Instead, it is only performed after
// a delay.
type lockTableWaiter interface {
	// WaitOn accepts and waits on a lockTableGuard that has returned true
	// from ShouldWait.
	//
	// The method should be called after dropping any latches that a request
	// has acquired. It returns when the request is at the front of all lock
	// wait-queues and it is safe to re-acquire latches and scan the lockTable
	// again.
	WaitOn(context.Context, Request, lockTableGuard) *Error

	// WaitOnLock waits on the transaction responsible for the specified lock
	// and then ensures that the lock is cleared out of the request's way.
	//
	// The method should be called after dropping any latches that a request
	// has acquired. It returns when the lock has been resolved.
	//
	// NOTE: this method is used when the lockTable is disabled (e.g. on a
	// follower replica) and a lock is discovered that must be waited on (e.g.
	// during a follower read). If/when lockTables are maintained on follower
	// replicas by propagating lockTable state transitions through the Raft
	// log in the ReplicatedEvalResult instead of through the
	// (leaseholder-only) LocalResult, we should be able to remove the
	// lockTable "disabled" state and, in turn, remove this method. This will
	// likely fall out of pulling all replicated locks into the lockTable.
	WaitOnLock(context.Context, Request, *roachpb.Intent) *Error

	// ClearCaches wipes all caches maintained by the lockTableWaiter. This is
	// primarily used to recover memory when a replica loses a lease. However,
	// it is also used in tests to reset the state of the lockTableWaiter.
	ClearCaches()
}

// txnWaitQueue holds a collection of wait-queues for transaction records.
// Conflicting transactions, known as "pushers", sit in a queue associated
// with an extant transaction that they conflict with, known as the "pushee",
// and wait for the pushee transaction to commit or abort.
//
// Typically, waiting for a pushee's transaction record to undergo a state
// transition is sufficient to satisfy a pusher transaction. Reacting to state
// transitions for conflicting transactions is powered by the
// TransactionManager interface.
//
// Just like with the lockTableWaiter, there are cases where reacting to state
// transitions alone is insufficient to make forward progress.
// However, unlike with the lockTableWaiter, the fact that the txnWaitQueue is
// located on the range containing the conflicting transaction's record,
// instead of on the range containing the conflicting transaction's lock,
// presents an opportunity to actively resolve these situations. This is
// because a transaction's record reflects its authoritative status.
//
// The first of these situations is failure of the conflicting transaction's
// coordinator. This situation comes in two flavors:
// - before a transaction has been finalized (committed or aborted)
// - after a transaction has been finalized but before all of its intents have
//   been resolved
//
// In the first of these flavors, the transaction record may still have a
// PENDING status. Without a live transaction coordinator heartbeating it, the
// record will eventually expire and be abortable. In the second of these
// flavors, the transaction's record will already be committed or aborted.
// Regardless of which case the push falls into, once the transaction record
// is observed in a finalized state, the push will succeed, kick off intent
// resolution, and return to the sender.
//
// The second of these situations is transaction deadlock. Deadlocks occur
// when the lock acquisition patterns of two or more transactions interact in
// such a way that a cycle emerges in the "waits-for" graph of transactions.
// To break this cycle, one of the transactions must be aborted or it is
// impossible for any of the transactions that are part of the deadlock to
// continue making progress.
//
// The txnWaitQueue provides a mechanism for detecting these cycles across a
// distributed graph of transactions. Distributed deadlock detection works by
// having each pusher transaction that is waiting in the queue for a different
// transaction periodically query its own record using a QueryTxn request.
// While on the pusher's own transaction record range, the QueryTxn request
// uses the GetDependents method to collect the IDs of all locally-known
// transactions that are waiting for the pusher itself to release its locks.
// Of course, this local view of the dependency graph is incomplete, as it
// does not initially take into consideration transitive dependencies. To
// address this, when the QueryTxn returns to the initial txnWaitQueue, the
// pusher records its own dependencies as dependencies of its pushee
// transaction. As this process continues and pushers periodically query for
// their own dependencies and transfer these to their pushee, each
// txnWaitQueue accumulates more information about the global "waits-for"
// graph. Eventually, one of the txnWaitQueues is able to observe a full cycle
// in this graph and aborts one of the transactions in the cycle to break the
// deadlock.
//
// Example of Distributed Deadlock Detection
//
// The following diagram demonstrates how the txnWaitQueue interacts with
// distributed deadlock detection.
//
// - txnA enters txnB's txnWaitQueue during a PushTxn request (MaybeWaitForPush)
// - txnB enters txnC's txnWaitQueue during a PushTxn request (MaybeWaitForPush)
// - txnC enters txnA's txnWaitQueue during a PushTxn request (MaybeWaitForPush)
//
//      .-----------------------------------.
//      |                                   |
//      v                                   |
//  [txnA record] --> [txnB record] --> [txnC record]
//     deps:             deps:             deps:
//     - txnC            - txnA            - txnB
//
// - txnA queries its own txnWaitQueue using a QueryTxn request (MaybeWaitForQuery)
//
//      .-----------------------------------.
//      |     ............                  |
//      v     v          .                  |
//  [txnA record] --> [txnB record] --> [txnC record]
//     deps:             deps:             deps:
//     - txnC            - txnA            - txnB
//
// - txnA finds that txnC is a dependent. It transfers this dependency to txnB
//
//      .-----------------------------------.
//      |                                   |
//      v                                   |
//  [txnA record] --> [txnB record] --> [txnC record]
//     deps:             deps:             deps:
//     - txnC            - txnA            - txnB
//                       - txnC
//
// - txnC queries its own txnWaitQueue using a QueryTxn request (MaybeWaitForQuery)
// - txnB queries its own txnWaitQueue using a QueryTxn request (MaybeWaitForQuery)
// - txnC finds that txnB is a dependent. It transfers this dependency to txnA
// - txnB finds that txnA and txnC are dependents. It transfers these
//   dependencies to txnC
//
//      .-----------------------------------.
//      |                                   |
//      v                                   |
//  [txnA record] --> [txnB record] --> [txnC record]
//     deps:             deps:             deps:
//     - txnC            - txnA            - txnB
//     - txnB            - txnC            - txnA
//                                         - txnC
//
// - txnB notices that txnC is a transitive dependency of itself. This
//   indicates a cycle in the global wait-for graph. txnC is aborted, breaking
//   the cycle and the deadlock.
//
//  [txnA record] --> [txnB record] --> [txnC record: ABORTED]
//
// - txnC releases its locks and the transactions proceed in order.
//
//  [txnA record] --> [txnB record] --> (free to commit)
//
// TODO(nvanbenschoten): if we exposed a "queue guard" interface, we could
// make stronger guarantees around cleaning up enqueued txns when there are no
// waiters.
type txnWaitQueue interface {
	requestQueuer

	// EnqueueTxn creates a queue associated with the provided transaction.
	// Once a queue is established, pushers of this transaction can wait in
	// the queue and will be informed of state transitions that the
	// transaction undergoes.
	EnqueueTxn(*roachpb.Transaction)

	// UpdateTxn informs the queue that the provided transaction has undergone
	// a state transition. This will be communicated to any waiting pushers.
	UpdateTxn(context.Context, *roachpb.Transaction)

	// GetDependents returns a set of transactions waiting on the specified
	// transaction either directly or indirectly. The method is used to
	// perform deadlock detection.
	GetDependents(uuid.UUID) []uuid.UUID

	// MaybeWaitForPush checks whether there is a queue already established
	// for the transaction being pushed by the provided request. If not, or if
	// the PushTxn request isn't queueable, the method returns immediately. If
	// there is a queue, the method enqueues this request as a waiter and
	// waits for the transaction to be pushed/finalized.
	//
	// If the transaction is successfully pushed while this method is waiting,
	// the first return value is a non-nil PushTxnResponse object.
	MaybeWaitForPush(context.Context, *roachpb.PushTxnRequest) (*roachpb.PushTxnResponse, *Error)

	// MaybeWaitForQuery checks whether there is a queue already established
	// for the transaction being queried. If not, or if the QueryTxn request
	// hasn't specified WaitForUpdate, the method returns immediately.
	// If there is a queue, the method enqueues this request as a waiter and
	// waits for any updates to the target transaction.
	MaybeWaitForQuery(context.Context, *roachpb.QueryTxnRequest) *Error

	// OnRangeDescUpdated informs the Queue that its range's descriptor has
	// been updated.
	OnRangeDescUpdated(*roachpb.RangeDescriptor)
}

// requestQueuer queues requests until some condition is met.
type requestQueuer interface {
	// Enable allows requests to be queued. The method is idempotent.
	Enable()

	// Clear empties the queue(s) and causes all waiting requests to
	// return. If disable is true, future requests must not be enqueued.
	Clear(disable bool)
}
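
// Illustrative sketch (not part of the package API): the two halves of the
// txnWaitQueue protocol. On the range holding the pushee's transaction
// record, the record's state transitions are fed into the queue; pushers call
// MaybeWaitForPush and block there until the pushee is pushed or finalized,
// participating in distributed deadlock detection while they wait. The
// example* names are hypothetical.
func exampleTrackPushee(ctx context.Context, q txnWaitQueue, pushee *roachpb.Transaction) {
	// Establish a queue for the pushee so that pushers can wait on it...
	q.EnqueueTxn(pushee)
	// ...and notify waiting pushers whenever the record changes state
	// (e.g. it commits, aborts, or is pushed to a higher timestamp).
	q.UpdateTxn(ctx, pushee)
}

func exampleWaitForPushee(
	ctx context.Context, q txnWaitQueue, push *roachpb.PushTxnRequest,
) (*roachpb.PushTxnResponse, *Error) {
	// Wait in the pushee's queue. Returns a non-nil response if the pushee
	// was successfully pushed while waiting, and returns immediately with a
	// nil response if the pushee has no queue on this range (or the request
	// is not queueable), in which case the caller evaluates the push against
	// the transaction record itself.
	return q.MaybeWaitForPush(ctx, push)
}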