github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/batcheval/cmd_recover_txn.go (about) 1 // Copyright 2019 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package batcheval 12 13 import ( 14 "bytes" 15 "context" 16 "fmt" 17 18 "github.com/cockroachdb/cockroach/pkg/keys" 19 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval/result" 20 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset" 21 "github.com/cockroachdb/cockroach/pkg/roachpb" 22 "github.com/cockroachdb/cockroach/pkg/storage" 23 "github.com/cockroachdb/cockroach/pkg/util/hlc" 24 "github.com/cockroachdb/errors" 25 ) 26 27 func init() { 28 RegisterReadWriteCommand(roachpb.RecoverTxn, declareKeysRecoverTransaction, RecoverTxn) 29 } 30 31 func declareKeysRecoverTransaction( 32 _ *roachpb.RangeDescriptor, 33 header roachpb.Header, 34 req roachpb.Request, 35 latchSpans, _ *spanset.SpanSet, 36 ) { 37 rr := req.(*roachpb.RecoverTxnRequest) 38 latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{Key: keys.TransactionKey(rr.Txn.Key, rr.Txn.ID)}) 39 latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{Key: keys.AbortSpanKey(header.RangeID, rr.Txn.ID)}) 40 } 41 42 // RecoverTxn attempts to recover the specified transaction from an 43 // indeterminate commit state. Transactions enter this state when abandoned 44 // after updating their transaction record with a STAGING status. The RecoverTxn 45 // operation is invoked by a caller who encounters a transaction in this state 46 // after they have already queried all of the STAGING transaction's declared 47 // in-flight writes. The caller specifies whether all of these in-flight writes 48 // were found to have succeeded or whether at least one of them was prevented 49 // from ever succeeding. This is used by RecoverTxn to determine whether the 50 // result of the recovery should be committing the abandoned transaction or 51 // aborting it. 52 func RecoverTxn( 53 ctx context.Context, readWriter storage.ReadWriter, cArgs CommandArgs, resp roachpb.Response, 54 ) (result.Result, error) { 55 args := cArgs.Args.(*roachpb.RecoverTxnRequest) 56 h := cArgs.Header 57 reply := resp.(*roachpb.RecoverTxnResponse) 58 59 if cArgs.Header.Txn != nil { 60 return result.Result{}, ErrTransactionUnsupported 61 } 62 if !bytes.Equal(args.Key, args.Txn.Key) { 63 return result.Result{}, errors.Errorf("request key %s does not match txn key %s", args.Key, args.Txn.Key) 64 } 65 if h.Timestamp.Less(args.Txn.WriteTimestamp) { 66 // This condition must hold for the timestamp cache access/update to be safe. 67 return result.Result{}, errors.Errorf("request timestamp %s less than txn timestamp %s", h.Timestamp, args.Txn.WriteTimestamp) 68 } 69 key := keys.TransactionKey(args.Txn.Key, args.Txn.ID) 70 71 // Fetch transaction record; if missing, attempt to synthesize one. 72 if ok, err := storage.MVCCGetProto( 73 ctx, readWriter, key, hlc.Timestamp{}, &reply.RecoveredTxn, storage.MVCCGetOptions{}, 74 ); err != nil { 75 return result.Result{}, err 76 } else if !ok { 77 // The transaction's record must have been removed already. If all 78 // writes were found then it must have committed and if not then it 79 // could have committed or could have aborted. 80 // 81 // Synthesize it from the provided TxnMeta to have something to return. 82 // The synthesized record should have an ABORTED status because it was 83 // already GCed. If not, something went wrong for us to get to this 84 // point. Just like with PushTxn, we allow an ABORTED status to be 85 // returned even if it is possible that the transaction was actually 86 // COMMITTED. This is safe because a COMMITTED transaction must have 87 // resolved all of its intents before garbage collecting its intents. 88 synthTxn := SynthesizeTxnFromMeta(cArgs.EvalCtx, args.Txn) 89 if synthTxn.Status != roachpb.ABORTED { 90 err := errors.Errorf("txn record synthesized with non-ABORTED status: %v", synthTxn) 91 return result.Result{}, err 92 } 93 reply.RecoveredTxn = synthTxn 94 return result.Result{}, nil 95 } 96 97 // Determine whether to continue with recovery based on the state of 98 // the transaction record and whether or not the transaction was found 99 // to be implicitly committed. 100 if args.ImplicitlyCommitted { 101 // Finding all writes means that the transaction was at one point 102 // implicitly committed. It should not be possible for it to have 103 // changed its epoch or timestamp, and the only other valid status 104 // for it to have is COMMITTED. 105 switch reply.RecoveredTxn.Status { 106 case roachpb.PENDING, roachpb.ABORTED: 107 // Once implicitly committed, the transaction should never move back 108 // to the PENDING status and it should never be ABORTED. 109 // 110 // In order for the second statement to be true, we need to ensure 111 // that transaction records that are GCed after being COMMITTED are 112 // never re-written as ABORTED. We used to allow this to happen when 113 // PushTxn requests found missing transaction records because it was 114 // harmless, but we now use the timestamp cache to avoid 115 // needing to ever do so. If this ever becomes possible again, we'll 116 // need to relax this check. 117 return result.Result{}, roachpb.NewTransactionStatusError(fmt.Sprintf( 118 "programming error: found %s record for implicitly committed transaction: %v", 119 reply.RecoveredTxn.Status, reply.RecoveredTxn, 120 )) 121 case roachpb.STAGING, roachpb.COMMITTED: 122 if was, is := args.Txn.Epoch, reply.RecoveredTxn.Epoch; was != is { 123 return result.Result{}, roachpb.NewTransactionStatusError(fmt.Sprintf( 124 "programming error: epoch change by implicitly committed transaction: %v->%v", was, is, 125 )) 126 } 127 if was, is := args.Txn.WriteTimestamp, reply.RecoveredTxn.WriteTimestamp; was != is { 128 return result.Result{}, roachpb.NewTransactionStatusError(fmt.Sprintf( 129 "programming error: timestamp change by implicitly committed transaction: %v->%v", was, is, 130 )) 131 } 132 if reply.RecoveredTxn.Status == roachpb.COMMITTED { 133 // The transaction commit was already made explicit. 134 return result.Result{}, nil 135 } 136 // Continue with recovery. 137 default: 138 return result.Result{}, roachpb.NewTransactionStatusError( 139 fmt.Sprintf("bad txn status: %s", reply.RecoveredTxn), 140 ) 141 } 142 } else { 143 // Did the transaction change its epoch or timestamp in such a 144 // way that it would be allowed to continue trying to commit? 145 legalChange := args.Txn.Epoch < reply.RecoveredTxn.Epoch || 146 args.Txn.WriteTimestamp.Less(reply.RecoveredTxn.WriteTimestamp) 147 148 switch reply.RecoveredTxn.Status { 149 case roachpb.ABORTED: 150 // The transaction was aborted by some other process. 151 return result.Result{}, nil 152 case roachpb.COMMITTED: 153 // If we believe we successfully prevented a write that was in-flight 154 // while a transaction was performing a parallel commit then we would 155 // expect that the transaction record could only be committed if it has 156 // a higher epoch or timestamp (see legalChange). This is true if we did 157 // actually prevent the in-flight write. 158 // 159 // However, due to QueryIntent's implementation, a successful intent 160 // write that was already resolved after the parallel commit finished 161 // can be mistaken for a missing in-flight write by a recovery process. 162 // This ambiguity is harmless, as the transaction stays committed either 163 // way, but it means that we can't be quite as strict about what we 164 // assert here as we would like to be. 165 // 166 // If QueryIntent could detect that a resolved intent satisfied its 167 // query then we could assert that the transaction record can only be 168 // COMMITTED if legalChange=true. 169 return result.Result{}, nil 170 case roachpb.PENDING: 171 if args.Txn.Epoch < reply.RecoveredTxn.Epoch { 172 // Recovery not immediately needed because the transaction is 173 // still in progress. 174 return result.Result{}, nil 175 } 176 177 // We should never hit this. The transaction recovery process will only 178 // ever be launched for a STAGING transaction and it is not possible for 179 // a transaction to move back to the PENDING status in the same epoch. 180 return result.Result{}, roachpb.NewTransactionStatusError(fmt.Sprintf( 181 "programming error: cannot recover PENDING transaction in same epoch: %s", reply.RecoveredTxn, 182 )) 183 case roachpb.STAGING: 184 if legalChange { 185 // Recovery not immediately needed because the transaction is 186 // still in progress. 187 return result.Result{}, nil 188 } 189 // Continue with recovery. 190 default: 191 return result.Result{}, roachpb.NewTransactionStatusError( 192 fmt.Sprintf("bad txn status: %s", reply.RecoveredTxn), 193 ) 194 } 195 } 196 197 // Merge all of the transaction's in-flight writes into its lock 198 // spans set and clear the in-flight write set. Make sure to re-sort 199 // and merge the lock spans to eliminate duplicates. 200 for _, w := range reply.RecoveredTxn.InFlightWrites { 201 sp := roachpb.Span{Key: w.Key} 202 reply.RecoveredTxn.LockSpans = append(reply.RecoveredTxn.LockSpans, sp) 203 } 204 reply.RecoveredTxn.LockSpans, _ = roachpb.MergeSpans(reply.RecoveredTxn.LockSpans) 205 reply.RecoveredTxn.InFlightWrites = nil 206 207 // Recover the transaction based on whether or not all of its writes 208 // succeeded. If all of the writes succeeded then the transaction was 209 // implicitly committed and an acknowledgement of success may have already 210 // been returned to clients. If not, then we should have prevented the 211 // transaction from ever becoming implicitly committed at this timestamp 212 // using a QueryIntent, so we're free to abort the transaction record. 213 if args.ImplicitlyCommitted { 214 reply.RecoveredTxn.Status = roachpb.COMMITTED 215 } else { 216 reply.RecoveredTxn.Status = roachpb.ABORTED 217 } 218 txnRecord := reply.RecoveredTxn.AsRecord() 219 if err := storage.MVCCPutProto(ctx, readWriter, cArgs.Stats, key, hlc.Timestamp{}, nil, &txnRecord); err != nil { 220 return result.Result{}, err 221 } 222 223 // TODO(nvanbenschoten): This could use result.FromEndTxn to trigger 224 // intent resolution for the recovered transaction's intents. To do 225 // that, we might need to plumb in a "poison" flag on the RecoverTxn 226 // request. 227 result := result.Result{} 228 result.Local.UpdatedTxns = []*roachpb.Transaction{&reply.RecoveredTxn} 229 return result, nil 230 }