github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/batcheval/cmd_push_txn.go (about) 1 // Copyright 2014 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package batcheval 12 13 import ( 14 "bytes" 15 "context" 16 17 "github.com/cockroachdb/cockroach/pkg/keys" 18 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval/result" 19 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset" 20 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/txnwait" 21 "github.com/cockroachdb/cockroach/pkg/roachpb" 22 "github.com/cockroachdb/cockroach/pkg/storage" 23 "github.com/cockroachdb/cockroach/pkg/util/hlc" 24 "github.com/cockroachdb/cockroach/pkg/util/log" 25 "github.com/cockroachdb/errors" 26 ) 27 28 func init() { 29 RegisterReadWriteCommand(roachpb.PushTxn, declareKeysPushTransaction, PushTxn) 30 } 31 32 func declareKeysPushTransaction( 33 _ *roachpb.RangeDescriptor, 34 header roachpb.Header, 35 req roachpb.Request, 36 latchSpans, _ *spanset.SpanSet, 37 ) { 38 pr := req.(*roachpb.PushTxnRequest) 39 latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{Key: keys.TransactionKey(pr.PusheeTxn.Key, pr.PusheeTxn.ID)}) 40 latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{Key: keys.AbortSpanKey(header.RangeID, pr.PusheeTxn.ID)}) 41 } 42 43 // PushTxn resolves conflicts between concurrent txns (or between 44 // a non-transactional reader or writer and a txn) in several ways, 45 // depending on the statuses and priorities of the conflicting 46 // transactions. The PushTxn operation is invoked by a "pusher" 47 // (args.PusherTxn -- the writer trying to abort a conflicting txn 48 // or the reader trying to push a conflicting txn's commit timestamp 49 // forward), who attempts to resolve a conflict with a "pushee" 50 // (args.PusheeTxn -- the pushee txn whose intent(s) caused the 51 // conflict). A pusher is either transactional, in which case 52 // PusherTxn is completely initialized, or not, in which case the 53 // PusherTxn has only the priority set. 54 // 55 // The request arrives and immediately tries to determine the current 56 // disposition of the pushee transaction by reading its transaction 57 // record. If it finds one, it continues with the push. If not, it 58 // uses knowledge from the existence of the conflicting intent to 59 // determine the current state of the pushee. It's possible that the 60 // transaction record is missing either because it hasn't been written 61 // yet or because it has already been GCed after being finalized. Once 62 // the request determines which case its in, it decides whether to 63 // continue with the push. There are a number of different outcomes 64 // that a push can result in, based on the state that the pushee's 65 // transaction record is found in: 66 // 67 // Txn already committed/aborted: If the pushee txn is committed or 68 // aborted return success. 69 // 70 // Txn record expired: If the pushee txn is pending, its last 71 // heartbeat timestamp is observed to determine the latest client 72 // activity. This heartbeat is forwarded by the conflicting intent's 73 // timestamp because that timestamp also indicates definitive client 74 // activity. This time of "last activity" is compared against the 75 // current time to determine whether the transaction has expired. 76 // If so, it is aborted. NOTE: the intent timestamp used is not 77 // updated on intent pushes. This is important because it allows us 78 // to use its timestamp as an indication of recent activity. If this 79 // is ever changed, we don't run the risk of any correctness violations, 80 // but we do make it possible for intent pushes to look like client 81 // activity and extend the waiting period until a transaction is 82 // considered expired. This waiting period is a "courtesy" - if we 83 // simply aborted txns right away then we would see worse performance 84 // under contention, but everything would still be correct. 85 // 86 // Txn record not expired: If the pushee txn is not expired, its 87 // priority is compared against the pusher's (see CanPushWithPriority). 88 // 89 // Push cannot proceed: a TransactionPushError is returned. 90 // 91 // Push can proceed but txn record staging: if the transaction record 92 // is STAGING then it can't be changed by a pusher without going through 93 // the transaction recovery process. An IndeterminateCommitError is returned 94 // to kick off recovery. 95 // 96 // Push can proceed: the pushee's transaction record is modified and 97 // rewritten, based on the value of args.PushType. If args.PushType 98 // is PUSH_ABORT, txn.Status is set to ABORTED. If args.PushType is 99 // PUSH_TIMESTAMP, txn.Timestamp is set to just after args.PushTo. 100 // 101 // If the pushee is aborted, its timestamp will be forwarded to match 102 // its last client activity timestamp (i.e. last heartbeat), if available. 103 // This is done so that the updated timestamp populates the AbortSpan when 104 // the pusher proceeds to resolve intents, allowing the GC queue to purge 105 // records for which the transaction coordinator must have found out via 106 // its heartbeats that the transaction has failed. 107 func PushTxn( 108 ctx context.Context, readWriter storage.ReadWriter, cArgs CommandArgs, resp roachpb.Response, 109 ) (result.Result, error) { 110 args := cArgs.Args.(*roachpb.PushTxnRequest) 111 h := cArgs.Header 112 reply := resp.(*roachpb.PushTxnResponse) 113 114 if h.Txn != nil { 115 return result.Result{}, ErrTransactionUnsupported 116 } 117 if h.Timestamp.Less(args.PushTo) { 118 // Verify that the PushTxn's timestamp is not less than the timestamp that 119 // the request intends to push the transaction to. Transactions should not 120 // be pushed into the future or their effect may not be fully reflected in 121 // a future leaseholder's timestamp cache. This is analogous to how reads 122 // should not be performed at a timestamp in the future. 123 return result.Result{}, errors.Errorf("request timestamp %s less than PushTo timestamp %s", h.Timestamp, args.PushTo) 124 } 125 if h.Timestamp.Less(args.PusheeTxn.WriteTimestamp) { 126 // This condition must hold for the timestamp cache access/update to be safe. 127 return result.Result{}, errors.Errorf("request timestamp %s less than pushee txn timestamp %s", h.Timestamp, args.PusheeTxn.WriteTimestamp) 128 } 129 now := cArgs.EvalCtx.Clock().Now() 130 if now.Less(h.Timestamp) { 131 // The batch's timestamp should have been used to update the clock. 132 return result.Result{}, errors.Errorf("request timestamp %s less than current clock time %s", h.Timestamp, now) 133 } 134 if !bytes.Equal(args.Key, args.PusheeTxn.Key) { 135 return result.Result{}, errors.Errorf("request key %s should match pushee txn key %s", args.Key, args.PusheeTxn.Key) 136 } 137 key := keys.TransactionKey(args.PusheeTxn.Key, args.PusheeTxn.ID) 138 139 // Fetch existing transaction; if missing, we're allowed to abort. 140 var existTxn roachpb.Transaction 141 ok, err := storage.MVCCGetProto(ctx, readWriter, key, hlc.Timestamp{}, &existTxn, storage.MVCCGetOptions{}) 142 if err != nil { 143 return result.Result{}, err 144 } else if !ok { 145 log.VEventf(ctx, 2, "pushee txn record not found") 146 // There are three cases in which there is no transaction record: 147 // 148 // * the pushee is still active but its transaction record has not 149 // been written yet. This is fairly common because transactions 150 // do not eagerly write their transaction record before writing 151 // intents, which another reader or writer might stumble upon and 152 // be forced to push. 153 // * the pushee resolved its intents synchronously on successful commit; 154 // in this case, the transaction record of the pushee is also removed. 155 // Note that in this case, the intent which prompted this PushTxn 156 // doesn't exist any more. 157 // * the pushee timed out or was aborted and the intent not cleaned up, 158 // but the transaction record was garbage collected. 159 // 160 // To determine which case we're in, we check whether the transaction could 161 // ever write a transaction record. We do this by using the metadata from 162 // the intent and attempting to synthesize a transaction record while 163 // verifying that it would be possible for the transaction record to ever be 164 // written. If a transaction record for the transaction could be written in 165 // the future then we must be in the first case. If one could not be written 166 // then we know we're in either the second or the third case. 167 reply.PusheeTxn = SynthesizeTxnFromMeta(cArgs.EvalCtx, args.PusheeTxn) 168 if reply.PusheeTxn.Status == roachpb.ABORTED { 169 // If the transaction is uncommittable, we don't even need to 170 // persist an ABORTED transaction record, we can just consider it 171 // aborted. This is good because it allows us to obey the invariant 172 // that only the transaction's own coordinator can create its 173 // transaction record. 174 result := result.Result{} 175 result.Local.UpdatedTxns = []*roachpb.Transaction{&reply.PusheeTxn} 176 return result, nil 177 } 178 } else { 179 // Start with the persisted transaction record. 180 reply.PusheeTxn = existTxn 181 } 182 183 // If already committed or aborted, return success. 184 if reply.PusheeTxn.Status.IsFinalized() { 185 // Trivial noop. 186 return result.Result{}, nil 187 } 188 189 // If we're trying to move the timestamp forward, and it's already 190 // far enough forward, return success. 191 if args.PushType == roachpb.PUSH_TIMESTAMP && args.PushTo.LessEq(reply.PusheeTxn.WriteTimestamp) { 192 // Trivial noop. 193 return result.Result{}, nil 194 } 195 196 // The pusher might be aware of a newer version of the pushee. 197 increasedEpochOrTimestamp := false 198 if reply.PusheeTxn.WriteTimestamp.Less(args.PusheeTxn.WriteTimestamp) { 199 reply.PusheeTxn.WriteTimestamp = args.PusheeTxn.WriteTimestamp 200 increasedEpochOrTimestamp = true 201 } 202 if reply.PusheeTxn.Epoch < args.PusheeTxn.Epoch { 203 reply.PusheeTxn.Epoch = args.PusheeTxn.Epoch 204 increasedEpochOrTimestamp = true 205 } 206 reply.PusheeTxn.UpgradePriority(args.PusheeTxn.Priority) 207 208 // If the pusher is aware that the pushee's currently recorded attempt at a 209 // parallel commit failed, either because it found intents at a higher 210 // timestamp than the parallel commit attempt or because it found intents at 211 // a higher epoch than the parallel commit attempt, it should not consider 212 // the pushee to be performing a parallel commit. Its commit status is not 213 // indeterminate. 214 if increasedEpochOrTimestamp && reply.PusheeTxn.Status == roachpb.STAGING { 215 reply.PusheeTxn.Status = roachpb.PENDING 216 reply.PusheeTxn.InFlightWrites = nil 217 } 218 219 pushType := args.PushType 220 var pusherWins bool 221 var reason string 222 223 switch { 224 case txnwait.IsExpired(now, &reply.PusheeTxn): 225 reason = "pushee is expired" 226 // When cleaning up, actually clean up (as opposed to simply pushing 227 // the garbage in the path of future writers). 228 pushType = roachpb.PUSH_ABORT 229 pusherWins = true 230 case pushType == roachpb.PUSH_TOUCH: 231 // If just attempting to cleanup old or already-committed txns, 232 // pusher always fails. 233 pusherWins = false 234 case CanPushWithPriority(&args.PusherTxn, &reply.PusheeTxn): 235 reason = "pusher has priority" 236 pusherWins = true 237 case args.Force: 238 reason = "forced push" 239 pusherWins = true 240 } 241 242 if log.V(1) && reason != "" { 243 s := "pushed" 244 if !pusherWins { 245 s = "failed to push" 246 } 247 log.Infof(ctx, "%s %s (push type=%s) %s: %s (pushee last active: %s)", 248 args.PusherTxn.Short(), log.Safe(s), 249 log.Safe(pushType), 250 args.PusheeTxn.Short(), 251 log.Safe(reason), 252 reply.PusheeTxn.LastActive()) 253 } 254 255 // If the pushed transaction is in the staging state, we can't change its 256 // record without first going through the transaction recovery process and 257 // attempting to finalize it. 258 recoverOnFailedPush := cArgs.EvalCtx.EvalKnobs().RecoverIndeterminateCommitsOnFailedPushes 259 if reply.PusheeTxn.Status == roachpb.STAGING && (pusherWins || recoverOnFailedPush) { 260 err := roachpb.NewIndeterminateCommitError(reply.PusheeTxn) 261 log.VEventf(ctx, 1, "%v", err) 262 return result.Result{}, err 263 } 264 265 if !pusherWins { 266 err := roachpb.NewTransactionPushError(reply.PusheeTxn) 267 log.VEventf(ctx, 1, "%v", err) 268 return result.Result{}, err 269 } 270 271 // Upgrade priority of pushed transaction to one less than pusher's. 272 reply.PusheeTxn.UpgradePriority(args.PusherTxn.Priority - 1) 273 274 // Determine what to do with the pushee, based on the push type. 275 switch pushType { 276 case roachpb.PUSH_ABORT: 277 // If aborting the transaction, set the new status. 278 reply.PusheeTxn.Status = roachpb.ABORTED 279 // If the transaction record was already present, forward the timestamp 280 // to accommodate AbortSpan GC. See method comment for details. 281 if ok { 282 reply.PusheeTxn.WriteTimestamp.Forward(reply.PusheeTxn.LastActive()) 283 } 284 case roachpb.PUSH_TIMESTAMP: 285 // Otherwise, update timestamp to be one greater than the request's 286 // timestamp. This new timestamp will be use to update the read timestamp 287 // cache. If the transaction record was not already present then we rely on 288 // the timestamp cache to prevent the record from ever being written with a 289 // timestamp beneath this timestamp. 290 reply.PusheeTxn.WriteTimestamp.Forward(args.PushTo) 291 default: 292 return result.Result{}, errors.Errorf("unexpected push type: %v", pushType) 293 } 294 295 // If the transaction record was already present, persist the updates to it. 296 // If not, then we don't want to create it. This could allow for finalized 297 // transactions to be revived. Instead, we obey the invariant that only the 298 // transaction's own coordinator can issue requests that create its 299 // transaction record. To ensure that a timestamp push or an abort is 300 // respected for transactions without transaction records, we rely on markers 301 // in the timestamp cache. 302 if ok { 303 txnRecord := reply.PusheeTxn.AsRecord() 304 if err := storage.MVCCPutProto(ctx, readWriter, cArgs.Stats, key, hlc.Timestamp{}, nil, &txnRecord); err != nil { 305 return result.Result{}, err 306 } 307 } 308 309 result := result.Result{} 310 result.Local.UpdatedTxns = []*roachpb.Transaction{&reply.PusheeTxn} 311 return result, nil 312 }