github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_tscache.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/rditer"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/tscache"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/server/telemetry"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
)

// setTimestampCacheLowWaterMark updates the low water mark of the timestamp
// cache to the provided timestamp for all key ranges owned by the provided
// Range descriptor. This ensures that no future writes in either the local or
// global keyspace are allowed at times equal to or earlier than this timestamp,
// which could invalidate prior reads.
func setTimestampCacheLowWaterMark(
	tc tscache.Cache, desc *roachpb.RangeDescriptor, ts hlc.Timestamp,
) {
	for _, keyRange := range rditer.MakeReplicatedKeyRanges(desc) {
		tc.SetLowWater(keyRange.Start.Key, keyRange.End.Key, ts)
	}
}
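// What follows is an illustrative sketch, not part of the original file: it
// shows how the low water mark set above interacts with later lookups. The
// function name and scenario are hypothetical; the tscache.Cache calls
// (SetLowWater, GetMax) are the same ones used elsewhere in this file.
func exampleLowWaterMarkSketch(
	tc tscache.Cache, desc *roachpb.RangeDescriptor, leaseStart hlc.Timestamp,
) {
	// Bump the low water mark across all of the range's replicated key
	// ranges, as is done after a lease transfer.
	setTimestampCacheLowWaterMark(tc, desc, leaseStart)
	// Any subsequent lookup over the range's keyspan now reports a timestamp
	// of at least leaseStart, even for keys that were never read, so writes
	// below leaseStart are forced to higher timestamps.
	ts, _ := tc.GetMax(desc.StartKey.AsRawKey(), desc.EndKey.AsRawKey())
	_ = ts // ts >= leaseStart
}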
// updateTimestampCache updates the timestamp cache in order to set a low water
// mark for the timestamp at which mutations to keys overlapping the provided
// request can write, such that they don't re-write history.
func (r *Replica) updateTimestampCache(
	ctx context.Context, ba *roachpb.BatchRequest, br *roachpb.BatchResponse, pErr *roachpb.Error,
) {
	if ba.ReadConsistency != roachpb.CONSISTENT {
		// Inconsistent reads are excluded from the timestamp cache.
		return
	}
	addToTSCache := r.store.tsCache.Add
	if util.RaceEnabled {
		addToTSCache = checkedTSCacheUpdate(r.store.Clock().Now(), r.store.tsCache, ba, br, pErr)
	}
	// Update the timestamp cache using the timestamp at which the batch
	// was executed. Note this may have moved forward from ba.Timestamp,
	// as when the request is retried locally on WriteTooOldErrors.
	ts := ba.Timestamp
	if br != nil {
		ts = br.Timestamp
	}
	var txnID uuid.UUID
	if ba.Txn != nil {
		txnID = ba.Txn.ID
	}
	for i, union := range ba.Requests {
		args := union.GetInner()
		if !roachpb.UpdatesTimestampCache(args) {
			continue
		}
		// Skip the update if there's an error and it's not for this index,
		// or if the request doesn't update the timestamp cache on errors.
		if pErr != nil {
			if index := pErr.Index; !roachpb.UpdatesTimestampCacheOnError(args) ||
				index == nil || int32(i) != index.Index {
				continue
			}
		}
		header := args.Header()
		start, end := header.Key, header.EndKey
		switch t := args.(type) {
		case *roachpb.EndTxnRequest:
			// EndTxn requests that finalize their transaction record a
			// tombstone in the timestamp cache to ensure that replays and
			// concurrent requests aren't able to recreate the transaction
			// record.
			//
			// It inserts the timestamp of the final batch in the transaction.
			// This timestamp must necessarily be equal to or greater than the
			// transaction's MinTimestamp, which is consulted in
			// CanCreateTxnRecord.
			if br.Txn.Status.IsFinalized() {
				key := transactionTombstoneMarker(start, txnID)
				addToTSCache(key, nil, ts, txnID)
			}
		case *roachpb.RecoverTxnRequest:
			// A successful RecoverTxn request may or may not have finalized the
			// transaction that it was trying to recover. If so, then we record
			// a tombstone to the timestamp cache to ensure that replays and
			// concurrent requests aren't able to recreate the transaction
			// record. This parallels what we do in the EndTxn request case.
			//
			// Insert the timestamp of the batch, which we asserted during
			// command evaluation was equal to or greater than the transaction's
			// MinTimestamp.
			recovered := br.Responses[i].GetInner().(*roachpb.RecoverTxnResponse).RecoveredTxn
			if recovered.Status.IsFinalized() {
				key := transactionTombstoneMarker(start, recovered.ID)
				addToTSCache(key, nil, ts, recovered.ID)
			}
		case *roachpb.PushTxnRequest:
			// A successful PushTxn request bumps the timestamp cache for
			// the pushee's transaction key. The pushee will consult the
			// timestamp cache when creating its record. If the push left
			// the transaction in a PENDING state (PUSH_TIMESTAMP) then we
			// update the push marker in the timestamp cache. This will cause
			// the creator of the transaction record to forward its provisional
			// commit timestamp to honor the result of this push. If the push
			// left the transaction in an ABORTED state (PUSH_ABORT) then we
			// update the tombstone marker in the timestamp cache. This will
			// prevent the creation of the transaction record entirely.
			pushee := br.Responses[i].GetInner().(*roachpb.PushTxnResponse).PusheeTxn

			var tombstone bool
			switch pushee.Status {
			case roachpb.PENDING:
				tombstone = false
			case roachpb.ABORTED:
				tombstone = true
			case roachpb.STAGING:
				// No need to update the timestamp cache. If a transaction
				// is in this state then it must have a transaction record.
				continue
			case roachpb.COMMITTED:
				// No need to update the timestamp cache. It was already
				// updated by the corresponding EndTxn request.
				continue
			}

			var key roachpb.Key
			if tombstone {
				key = transactionTombstoneMarker(start, pushee.ID)
			} else {
				key = transactionPushMarker(start, pushee.ID)
			}
			addToTSCache(key, nil, pushee.WriteTimestamp, t.PusherTxn.ID)
		case *roachpb.ConditionalPutRequest:
			// ConditionalPut only updates on ConditionFailedErrors. On other
			// errors, no information is returned. On successful writes, the
			// intent already protects against writes underneath the read.
			if _, ok := pErr.GetDetail().(*roachpb.ConditionFailedError); ok {
				addToTSCache(start, end, ts, txnID)
			}
		case *roachpb.InitPutRequest:
			// InitPut only updates on ConditionFailedErrors. On other errors,
			// no information is returned. On successful writes, the intent
			// already protects against writes underneath the read.
			if _, ok := pErr.GetDetail().(*roachpb.ConditionFailedError); ok {
				addToTSCache(start, end, ts, txnID)
			}
		case *roachpb.ScanRequest:
			resp := br.Responses[i].GetInner().(*roachpb.ScanResponse)
			if resp.ResumeSpan != nil {
				// Note that for forward scans, the resume span will start at
				// the (last key read).Next(), which is actually the correct
				// end key for the span to update the timestamp cache.
				end = resp.ResumeSpan.Key
			}
			addToTSCache(start, end, ts, txnID)
		case *roachpb.ReverseScanRequest:
			resp := br.Responses[i].GetInner().(*roachpb.ReverseScanResponse)
			if resp.ResumeSpan != nil {
				// Note that for reverse scans, the resume span's end key is
				// an open interval. That means it was read as part of this op
				// and won't be read on resume. It is the correct start key for
				// the span to update the timestamp cache.
				start = resp.ResumeSpan.EndKey
			}
			addToTSCache(start, end, ts, txnID)
		case *roachpb.QueryIntentRequest:
			missing := false
			if pErr != nil {
				switch t := pErr.GetDetail().(type) {
				case *roachpb.IntentMissingError:
					missing = true
				case *roachpb.TransactionRetryError:
					// QueryIntent will return a TxnRetry(SERIALIZABLE) error
					// if a transaction is querying its own intent and finds
					// it pushed.
					//
					// NB: we check the index of the error above, so this
					// TransactionRetryError should indicate a missing intent
					// from the QueryIntent request. However, bumping the
					// timestamp cache wouldn't cause a correctness issue
					// if we found the intent.
					missing = t.Reason == roachpb.RETRY_SERIALIZABLE
				}
			} else {
				missing = !br.Responses[i].GetInner().(*roachpb.QueryIntentResponse).FoundIntent
			}
			if missing {
				// If the QueryIntent determined that the intent is missing
				// then we update the timestamp cache at the intent's key to
				// the intent's transactional timestamp. This will prevent
				// the intent from ever being written in the future. We use
				// an empty transaction ID so that we block the intent
				// regardless of whether it is part of the current batch's
				// transaction or not.
				addToTSCache(start, end, t.Txn.WriteTimestamp, uuid.UUID{})
			}
		default:
			addToTSCache(start, end, ts, txnID)
		}
	}
}
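// Illustrative sketch, not part of the original file: the marker-key decision
// made in the PushTxn case above, extracted into a standalone helper. The
// function name is hypothetical; transactionTombstoneMarker and
// transactionPushMarker are defined at the bottom of this file. A
// PUSH_TIMESTAMP that leaves the pushee PENDING bumps the push marker (the
// record, once created, must forward its timestamp); a PUSH_ABORT that leaves
// it ABORTED bumps the tombstone marker (the record must never be created).
func examplePushMarkerChoice(pushee roachpb.Transaction) (roachpb.Key, bool) {
	switch pushee.Status {
	case roachpb.PENDING:
		return transactionPushMarker(pushee.Key, pushee.ID), true
	case roachpb.ABORTED:
		return transactionTombstoneMarker(pushee.Key, pushee.ID), true
	default:
		// STAGING and COMMITTED pushes don't update the cache here.
		return nil, false
	}
}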
// checkedTSCacheUpdate wraps tscache.Cache and asserts that any update to the
// cache is at or below the specified time.
func checkedTSCacheUpdate(
	now hlc.Timestamp,
	tc tscache.Cache,
	ba *roachpb.BatchRequest,
	br *roachpb.BatchResponse,
	pErr *roachpb.Error,
) func(roachpb.Key, roachpb.Key, hlc.Timestamp, uuid.UUID) {
	return func(start, end roachpb.Key, ts hlc.Timestamp, txnID uuid.UUID) {
		if now.Less(ts) {
			panic(fmt.Sprintf("Unsafe timestamp cache update! Cannot add timestamp %s to timestamp "+
				"cache after evaluating %v (resp=%v; err=%v) with local hlc clock at timestamp %s. "+
				"The timestamp cache update could be lost on a lease transfer.", ts, ba, br, pErr, now))
		}
		tc.Add(start, end, ts, txnID)
	}
}
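// Illustrative sketch, not part of the original file: how the assertion
// wrapper above is meant to be exercised. The function name is hypothetical.
// Adding a timestamp at or below the captured clock reading succeeds; adding
// one above it would panic, because such an update could be lost when a new
// leaseholder initializes its timestamp cache from its own clock.
func exampleCheckedUpdateSketch(r *Replica, ba *roachpb.BatchRequest) {
	now := r.store.Clock().Now()
	add := checkedTSCacheUpdate(now, r.store.tsCache, ba, nil /* br */, nil /* pErr */)
	// Safe: now is not less than now.
	add(roachpb.Key("a"), roachpb.Key("b"), now, uuid.UUID{})
	// add(roachpb.Key("a"), roachpb.Key("b"), now.Next(), uuid.UUID{}) would panic.
}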
// batchesPushedDueToClosedTimestamp is a telemetry counter for the number of
// batch requests which have been pushed due to the closed timestamp.
var batchesPushedDueToClosedTimestamp telemetry.Counter

func init() {
	batchesPushedDueToClosedTimestamp = telemetry.GetCounter("kv.closed_timestamp.txns_pushed")
}

// applyTimestampCache moves the batch timestamp forward depending on
// the presence of overlapping entries in the timestamp cache. If the
// batch is transactional, the txn timestamp and the txn.WriteTooOld
// bool are updated.
//
// Two important invariants of Cockroach: 1) encountering a more
// recently written value means transaction restart. 2) values must
// be written with a greater timestamp than the most recent read to
// the same key. Check the timestamp cache for reads/writes which
// are at least as recent as the timestamp of this write. The cmd must
// update its timestamp to be greater than more recent values in the
// timestamp cache. When the write returns, the updated timestamp
// will inform the batch response timestamp or batch response txn
// timestamp.
//
// minReadTS is used as a per-request low water mark for the value returned
// from the timestamp cache. That is, if the timestamp cache returns a value
// below minReadTS, minReadTS (without an associated txn id) will be used
// instead to adjust the batch's timestamp.
func (r *Replica) applyTimestampCache(
	ctx context.Context, ba *roachpb.BatchRequest, minReadTS hlc.Timestamp,
) bool {
	// bumpedDueToMinReadTS is set to true if the highest timestamp bump
	// encountered below is due to the minReadTS.
	var bumpedDueToMinReadTS bool
	var bumped bool

	for _, union := range ba.Requests {
		args := union.GetInner()
		if roachpb.ConsultsTimestampCache(args) {
			header := args.Header()

			// Forward the timestamp if there's been a more recent read (by
			// someone else).
			rTS, rTxnID := r.store.tsCache.GetMax(header.Key, header.EndKey)
			var forwardedToMinReadTS bool
			if rTS.Forward(minReadTS) {
				forwardedToMinReadTS = true
				rTxnID = uuid.Nil
			}
			nextRTS := rTS.Next()
			var bumpedCurReq bool
			if ba.Txn != nil {
				if ba.Txn.ID != rTxnID {
					if ba.Txn.WriteTimestamp.Less(nextRTS) {
						txn := ba.Txn.Clone()
						bumpedCurReq = txn.WriteTimestamp.Forward(nextRTS)
						ba.Txn = txn
					}
				}
			} else {
				bumpedCurReq = ba.Timestamp.Forward(nextRTS)
			}
			// Preserve bumpedDueToMinReadTS if we did not just bump or set it
			// appropriately if we did.
			bumpedDueToMinReadTS = (!bumpedCurReq && bumpedDueToMinReadTS) ||
				(bumpedCurReq && forwardedToMinReadTS)
			bumped, bumpedCurReq = bumped || bumpedCurReq, false
		}
	}
	if bumpedDueToMinReadTS {
		telemetry.Inc(batchesPushedDueToClosedTimestamp)
	}
	return bumped
}
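// Illustrative sketch, not part of the original file: the core forwarding
// rule applied above, in isolation. The function name is hypothetical. A
// write must be strictly greater than the most recent conflicting read, so
// the cache's result rTS admits writes only at rTS.Next() or above; Forward
// reports whether the timestamp actually moved, mirroring bumpedCurReq.
func exampleForwardPastReadSketch(writeTS, rTS hlc.Timestamp) (hlc.Timestamp, bool) {
	bumped := writeTS.Forward(rTS.Next())
	return writeTS, bumped
}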
// CanCreateTxnRecord determines whether a transaction record can be created
// for the provided transaction information. Callers must provide the
// transaction's minimum timestamp across all epochs, along with its ID and
// its key.
//
// If the method returns true, it also returns the minimum provisional commit
// timestamp that the record can be created with. If the method returns false,
// it returns the reason that the transaction record was rejected. If the
// method ever determines that a transaction record must be rejected, it will
// continue to reject that transaction going forward.
//
// The method performs two critical roles:
//
// 1. It protects against replayed requests or new requests from a
//    transaction's coordinator that could otherwise cause a transaction
//    record to be created after the transaction has already been finalized
//    and its record cleaned up.
//
// 2. It serves as the mechanism by which successful push requests convey
//    information to transactions that have not yet written their transaction
//    record. In doing so, it ensures that transaction records are created
//    with a sufficiently high timestamp after a successful PushTxn(TIMESTAMP)
//    and ensures that transaction records are never created at all after a
//    successful PushTxn(ABORT). As a result of this mechanism, a transaction
//    never needs to explicitly create the transaction record for contending
//    transactions.
//
// This is detailed in the transaction record state machine below:
//
//  +----------------------------------------------------+
//  | vars                                               |
//  |----------------------------------------------------|
//  | v1 = tsCache[push_marker(txn.id)]      = timestamp |
//  | v2 = tsCache[tombstone_marker(txn.id)] = timestamp |
//  +----------------------------------------------------+
//  | operations                                         |
//  |----------------------------------------------------|
//  | v -> t = forward v by timestamp t                  |
//  +----------------------------------------------------+
//
//                       PushTxn(TIMESTAMP)                        HeartbeatTxn
//                       then: v1 -> push.ts                      then: update record
//                            +------+                                  +------+
//    PushTxn(ABORT)          |      |         HeartbeatTxn             |      |  PushTxn(TIMESTAMP)
//  then: v2 -> txn.ts        |      v         if: v2 < txn.orig        |      v  then: update record
//                      +-----------------+   then: txn.ts -> v1   +--------------------+
//                 +----|                 |   else: fail           |                    |----+
//                 |    |                 |----------------------->|                    |    |
//                 |    |  no txn record  |                        | txn record written |    |
//                 +--->|                 |   EndTxn(STAGING)      |     [pending]      |<---+
//                      |                 |__ if: v2 < txn.orig    |                    |
//                      +-----------------+ \__ then: txn.ts -> v1 +--------------------+
//                          |           ^     \__  else: fail   _/     |           ^
//                          |           |       \__          _/        |           |
// EndTxn(!STAGING)         |           |         \__     _/           |           | EndTxn(STAGING)
// if: v2 < txn.orig        |  Eager GC |           \____ _/______     |           |
// then: v2 -> txn.ts       |    or     |             _/         \     |           | HeartbeatTxn
// else: fail               |  GC queue |     /----------------/  |    |           | if: epoch update
//                          v           |    v  EndTxn(!STAGING)  v    v           |
//                        +--------------------+ or PushTxn(ABORT)  +--------------------+
//                        |                    | then: v2 -> txn.ts |                    |
//                   +--->|                    |<--------------------                    |----+
//                   |    | txn record written |                    | txn record written |    |
//                   |    |    [finalized]     |                    |      [staging]     |    |
//                   +----|                    |                    |                    |<---+
//        PushTxn(*)      +--------------------+                    +--------------------+
//        then: no-op             ^         PushTxn(*) + RecoverTxn          |  EndTxn(STAGING)
//                                |         then: v2 -> txn.ts               |  or HeartbeatTxn
//                                +-------------------------------------------+  then: update record
//
// In the diagram, CanCreateTxnRecord is consulted in all three of the
// state transitions that move away from the "no txn record" state.
// Updating v1 and v2 is performed in updateTimestampCache.
//
// There are three separate simplifications to the transaction model that
// would allow us to simplify this state machine:
//
// 1. as discussed in the comment on txnHeartbeater, it is reasonable to
//    expect that we will eventually move away from tracking transaction
//    liveness on a per-transaction basis. This means that we would no longer
//    need transaction heartbeats and would never need to write a transaction
//    record until a transaction is ready to complete.
//
// 2. one of the two possibilities for the "txn record written [finalized]"
//    state is that the transaction record is aborted. There used to be two
//    reasons to persist transaction records with the ABORTED status. The
//    first was that doing so was the only way for concurrent actors to
//    prevent the record from being re-written by the transaction going
//    forward. The concurrent actor would write an aborted transaction record
//    and then wait for the GC to clean it up later. The second reason for
//    writing transaction records with the ABORTED status was that these
//    records could point at intents, which assisted the cleanup process for
//    these intents. However, this only held for ABORTED records written on
//    behalf of the transaction coordinator itself. If a transaction was
//    aborted by a concurrent actor, its record would not immediately contain
//    any of the transaction's intents.
//
//    The first reason here no longer holds. Concurrent actors now bump the
//    timestamp cache when aborting a transaction, which has the same effect
//    as writing an ABORTED transaction record. See the "tombstone marker".
//    The second reason still holds but is fairly weak. A transaction
//    coordinator can kick off intent resolution for an aborted transaction
//    without needing to write these intents into the record itself. In the
//    worst case, this intent resolution fails and each intent is cleaned up
//    individually as it is discovered. All in all, neither justification for
//    this state holds much weight anymore.
//
// 3. the other possibility for the "txn record written [finalized]" state is
//    that the transaction record is committed. This state is currently
//    critical for the transaction model because intent resolution cannot
//    begin before a transaction record enters this state. However, this
//    doesn't need to be the case forever. There are proposals to modify the
//    state of committed key-value writes slightly such that intent
//    resolution could be run for implicitly committed transactions while
//    their transaction record remains in the "txn record written [staging]"
//    state. For this to work, the recovery mechanism for indeterminate
//    commit errors would need to be able to determine whether an intent or a
//    **committed value** indicated the success of a write that was in-flight
//    at the time the transaction record was staged. This poses challenges
//    for migration and garbage collection, but it would have a number of
//    performance benefits.
//
// If we were to perform change #1, we could remove the "txn record written
// [pending]" state. If we were to perform changes #2 and #3, we could remove
// the "txn record written [finalized]" state. Altogether, this would leave us
// with only two states that the transaction record could be in, written or
// not written. At that point, it begins to closely resemble any other write
// in the system.
func (r *Replica) CanCreateTxnRecord(
	txnID uuid.UUID, txnKey []byte, txnMinTS hlc.Timestamp,
) (ok bool, minCommitTS hlc.Timestamp, reason roachpb.TransactionAbortedReason) {
	// Consult the timestamp cache with the transaction's key. The timestamp
	// cache is used in two ways for transactions without transaction records:
	// it is used to push the timestamp of such transactions, and it is used
	// to abort such transactions entirely.
	//
	// Using this strategy, we enforce the invariant that only requests sent
	// from a transaction's own coordinator can create its transaction record.
	// However, once a transaction record is written, other concurrent actors
	// can modify it. This is reflected in the diagram above.
	tombstoneKey := transactionTombstoneMarker(txnKey, txnID)
	pushKey := transactionPushMarker(txnKey, txnID)

	// Look in the timestamp cache to see if there is an entry for this
	// transaction, which indicates the minimum timestamp that the transaction
	// can commit at. This is used by pushers to push the timestamp of a
	// transaction that hasn't yet written its transaction record.
	minCommitTS, _ = r.store.tsCache.GetMax(pushKey, nil /* end */)

	// Also look in the timestamp cache to see if there is a tombstone entry
	// for this transaction, which would indicate this transaction has already
	// been finalized or was already aborted by a concurrent transaction. If
	// there is an entry, then we return a retriable error: if this is a
	// re-evaluation, then the error will be transformed into an ambiguous one
	// higher up. Otherwise, if the client is still waiting for a result, then
	// this cannot be a "replay" of any sort.
	tombstoneTimestamp, tombstoneTxnID := r.store.tsCache.GetMax(tombstoneKey, nil /* end */)
	// Compare against the minimum timestamp that the transaction could have
	// written intents at.
	if txnMinTS.LessEq(tombstoneTimestamp) {
		switch tombstoneTxnID {
		case txnID:
			// If we find our own transaction ID then an EndTxn request sent by
			// our coordinator has already been processed. We might be a replay
			// (e.g. a DistSender retry), or we raced with an asynchronous
			// abort. Either way, return an error.
			//
			// TODO(andrei): We could keep a bit more info in the tscache to
			// return a different error for COMMITTED transactions. If the
			// EndTxn(commit) was the only request in the batch, this would be
			// sufficient for the client to swallow the error and declare the
			// transaction as committed. If there were other requests in the
			// EndTxn batch, then the client would still have trouble
			// reconstructing the result, but at least it could provide a
			// non-ambiguous error to the application.
			return false, hlc.Timestamp{},
				roachpb.ABORT_REASON_ALREADY_COMMITTED_OR_ROLLED_BACK_POSSIBLE_REPLAY
		case uuid.Nil:
			lease, _ /* nextLease */ := r.GetLease()
			// Recognize the case where a lease started recently. Lease
			// transfers bump the ts cache low water mark.
			if tombstoneTimestamp == lease.Start {
				return false, hlc.Timestamp{}, roachpb.ABORT_REASON_NEW_LEASE_PREVENTS_TXN
			}
			return false, hlc.Timestamp{}, roachpb.ABORT_REASON_TIMESTAMP_CACHE_REJECTED
		default:
			// If we find another transaction's ID then that transaction has
			// aborted us before our transaction record was written. It obeyed
			// the restriction that it couldn't create a transaction record for
			// us, so it recorded a tombstone in the timestamp cache instead to
			// prevent us from ever creating a transaction record.
			return false, hlc.Timestamp{}, roachpb.ABORT_REASON_ABORTED_RECORD_FOUND
		}
	}
	return true, minCommitTS, 0
}
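// Illustrative sketch, not part of the original file: the handshake between a
// pusher and CanCreateTxnRecord. The function name is hypothetical, and the
// pusher's transaction ID is elided (updateTimestampCache records it). After
// a successful PushTxn(ABORT) bumps the tombstone marker, a later attempt by
// the pushee's coordinator to create its record is rejected with
// ABORT_REASON_ABORTED_RECORD_FOUND, provided the pushee could not have
// written intents below the tombstone's timestamp.
func exampleTombstonePreventsRecordSketch(
	r *Replica, pushee *roachpb.Transaction, pushTS hlc.Timestamp,
) bool {
	// What updateTimestampCache does after a successful PushTxn(ABORT):
	key := transactionTombstoneMarker(pushee.Key, pushee.ID)
	r.store.tsCache.Add(key, nil, pushTS, uuid.UUID{} /* pusher's ID in practice */)
	// What the pushee's coordinator later observes:
	ok, _, _ := r.CanCreateTxnRecord(pushee.ID, pushee.Key, pushee.MinTimestamp)
	return ok // false when pushee.MinTimestamp <= pushTS
}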
// transactionTombstoneMarker returns the key used as a marker indicating that
// a particular txn was finalized (i.e. by an EndTransaction, RecoverTxn or
// PushTxn(Abort)). It is used as a marker in the timestamp cache serving as a
// guard against creating a transaction record after the transaction record
// has been cleaned up (i.e. by a BeginTxn being evaluated out of order or
// arriving after another txn Push(Abort)'ed the txn).
func transactionTombstoneMarker(key roachpb.Key, txnID uuid.UUID) roachpb.Key {
	return append(keys.TransactionKey(key, txnID), []byte("-tmbs")...)
}

// transactionPushMarker returns the key used as a marker in the timestamp
// cache indicating that a particular txn was pushed before writing its
// transaction record, so that the push is honored even when it happens
// before there's a transaction record.
func transactionPushMarker(key roachpb.Key, txnID uuid.UUID) roachpb.Key {
	return append(keys.TransactionKey(key, txnID), []byte("-push")...)
}
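// Illustrative sketch, not part of the original file: the two markers derived
// from the same transaction are distinct keys that share the transaction
// record key as a prefix, so each can carry an independent timestamp in the
// cache without colliding with the record itself. The function name is
// hypothetical.
func exampleMarkerKeysSketch(txnKey roachpb.Key, txnID uuid.UUID) (tombstone, push roachpb.Key) {
	// keys.TransactionKey(txnKey, txnID) is the transaction record's key;
	// the "-tmbs" and "-push" suffixes keep the three keys distinct.
	return transactionTombstoneMarker(txnKey, txnID), transactionPushMarker(txnKey, txnID)
}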