// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

// Origin: github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/enginepb/mvcc3.proto

syntax = "proto3";
package cockroach.storage.enginepb;

import "util/hlc/timestamp.proto";
import "gogoproto/gogo.proto";

// TxnMeta is the metadata of a Transaction record.
message TxnMeta {
  option (gogoproto.goproto_stringer) = false;
  option (gogoproto.equal) = true;
  option (gogoproto.populate) = true;

  // id is a unique UUID value which identifies the transaction.
  // This field is always filled in.
  bytes id = 1 [(gogoproto.customname) = "ID",
      (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
      (gogoproto.nullable) = false];
  // Field number 2 is reserved and must not be reused.
  reserved 2;
  // key is the key which anchors the transaction. This is typically
  // the first key read or written during the transaction and
  // determines which range in the cluster will hold the transaction
  // record.
  bytes key = 3; // TODO(tschottdorf): [(gogoproto.casttype) = "Key"];
  // Incremented on txn retry.
  int32 epoch = 4 [(gogoproto.casttype) = "TxnEpoch"];
  // The proposed timestamp for the transaction. This starts as the current wall
  // time on the txn coordinator, and is forwarded by the timestamp cache if the
  // txn attempts to write "beneath" another txn's writes.
  //
  // Writes within the txn are performed using the most up-to-date value of this
  // timestamp that is available. For example, suppose a txn starts at some
  // timestamp, writes a key/value, and has its timestamp forwarded while doing
  // so because a later version already exists at that key. As soon as the txn
  // coordinator learns of the updated timestamp, it will begin performing
  // writes at the updated timestamp. The coordinator may, however, continue
  // issuing writes at the original timestamp before it learns about the
  // forwarded timestamp. The process of resolving the intents when the txn
  // commits will bump any intents written at an older timestamp to the final
  // commit timestamp.
  //
  // Note that reads do not occur at this timestamp; they instead occur at
  // ReadTimestamp, which is tracked in the containing roachpb.Transaction.
  //
  // Writes used to be performed at the txn's read timestamp, which was
  // necessary to avoid lost update anomalies in snapshot isolation mode. We no
  // longer support snapshot isolation mode, and there are now several important
  // reasons that writes are performed at this timestamp instead of the txn's
  // original timestamp:
  //
  //    1. This timestamp is forwarded by the timestamp cache when this
  //       transaction attempts to write beneath a more recent read. Leaving the
  //       intent at the original timestamp would write beneath that read, which
  //       would violate an invariant that time-bound iterators rely on.
  //
  //       For example, consider a client that uses a time-bound iterator to
  //       poll for changes to a key. The client reads (ts5, ts10], sees no
  //       writes, and reports that no changes have occurred up to ts10. Then a
  //       txn writes an intent at its original timestamp ts7. The txn's
  //       timestamp is forwarded to ts11 by the timestamp cache thanks to the
  //       client's read. Meanwhile, the client reads (ts10, ts15] and, again
  //       seeing no intents, reports that no changes have occurred to the key
  //       up to ts15. Now the txn commits at ts11 and bumps the intent to ts11.
  //       But the client thinks it has seen all changes up to ts15, and so never
  //       sees the intent! We avoid this problem by writing intents at the
  //       provisional commit timestamp instead. In this example, the intent
  //       would instead be written at ts11 and picked up by the client's next
  //       read from (ts10, ts15].
  //
  //    2. Unnecessary PushTxn roundtrips are avoided. If a transaction is
  //       forwarded from ts5 to ts10, the rest of its intents will be written
  //       at ts10. Reads at t < ts10 that encounter these intents can ignore
  //       them; if the intents had instead been left at ts5, these reads would
  //       have needed to send PushTxn requests just to find out that the txn
  //       had, in fact, been forwarded to a non-conflicting time.
  //
  //    3. Unnecessary intent rewriting is avoided. Writing at the original
  //       timestamp when this timestamp has been forwarded guarantees that the
  //       value will need to be rewritten at the forwarded timestamp if the
  //       transaction commits.
  //
  util.hlc.Timestamp write_timestamp = 5 [(gogoproto.nullable) = false];
  // The timestamp that the transaction was assigned by its gateway when it
  // began its first epoch. This is the earliest timestamp that the transaction
  // could have written any of its intents at.
  //
  // The timestamp is currently used in three places:
  // 1. by the transaction itself and by concurrent transactions when
  //    determining whether this transaction's record can be initially
  //    written. The timestamp is compared against the transaction's
  //    corresponding timestamp cache entry to ensure that a
  //    finalized transaction can never commit, either after a replay
  //    or a transaction abort. See CanCreateTxnRecord.
  // 2. by intent resolution to efficiently scan for intents while
  //    using a time-bound iterator - i.e. there can be intents to
  //    resolve up to the timestamp that the txn started with.
  // 3. by would-be pushers, when they run into an intent but the corresponding
  //    txn record was not yet written. In that case, the pusher uses this field
  //    as an indication of a timestamp when the pushee's coordinator is known
  //    to have been alive.
  //
  // Note that this field (number 9) is declared ahead of fields 6 and 7; the
  // field number, not the declaration order, identifies it on the wire.
  util.hlc.Timestamp min_timestamp = 9 [(gogoproto.nullable) = false];
  // The transaction's priority, ratcheted on transaction pushes.
  int32 priority = 6 [(gogoproto.casttype) = "TxnPriority"];
  // A zero-indexed sequence number which is increased on each request
  // sent as part of the transaction. When set in the header of a batch of
  // requests, the value will correspond to the sequence number of the
  // last request. Used to provide idempotency and to protect against
  // out-of-order application (by means of a transaction retry).
  int32 sequence = 7 [(gogoproto.casttype) = "TxnSeq"];

  // Field number 8 is reserved and must not be reused.
  reserved 8;
}

// IgnoredSeqNumRange describes a range of ignored seqnums.
// The range is inclusive on both ends.
message IgnoredSeqNumRange {
  option (gogoproto.equal) = true;
  option (gogoproto.populate) = true;
  // start is the first sequence number in the ignored range (inclusive).
  int32 start = 1 [(gogoproto.casttype) = "TxnSeq"];
  // end is the last sequence number in the ignored range (inclusive).
  int32 end = 2 [(gogoproto.casttype) = "TxnSeq"];
}

// MVCCStatsDelta is convertible to MVCCStats, but uses signed variable width
// encodings for most fields that make it more efficient to store negative
// values. This makes the encodings incompatible.
//
// The field names and numbers match those of MVCCPersistentStats; the fields
// differ only in the integer encoding of fields 4-13 (sint64 here, int64
// there).
message MVCCStatsDelta {
  option (gogoproto.equal) = true;

  // Declared first but carries field number 14.
  int64 contains_estimates = 14;
  sfixed64 last_update_nanos = 1;
  sfixed64 intent_age = 2;
  sfixed64 gc_bytes_age = 3 [(gogoproto.customname) = "GCBytesAge"];
  // The remaining fields are sint64 (ZigZag varint), which keeps negative
  // values compact on the wire.
  sint64 live_bytes = 4;
  sint64 live_count = 5;
  sint64 key_bytes = 6;
  sint64 key_count = 7;
  sint64 val_bytes = 8;
  sint64 val_count = 9;
  sint64 intent_bytes = 10;
  sint64 intent_count = 11;
  sint64 sys_bytes = 12;
  sint64 sys_count = 13;
}

// MVCCPersistentStats is convertible to MVCCStats, but uses signed variable
// width encodings for most fields that make it efficient to store positive
// values but inefficient to store negative values. This makes the encodings
// incompatible.
//
// The field names and numbers match those of MVCCStatsDelta; the fields
// differ only in the integer encoding of fields 4-13 (int64 here, sint64
// there).
message MVCCPersistentStats {
  option (gogoproto.equal) = true;
  option (gogoproto.populate) = true;

  // Declared first but carries field number 14.
  int64 contains_estimates = 14; // must never go negative absent a bug
  sfixed64 last_update_nanos = 1;
  sfixed64 intent_age = 2;
  sfixed64 gc_bytes_age = 3 [(gogoproto.customname) = "GCBytesAge"];
  // The remaining fields are plain int64 varints.
  int64 live_bytes = 4;
  int64 live_count = 5;
  int64 key_bytes = 6;
  int64 key_count = 7;
  int64 val_bytes = 8;
  int64 val_count = 9;
  int64 intent_bytes = 10;
  int64 intent_count = 11;
  int64 sys_bytes = 12;
  int64 sys_count = 13;
}

// RangeAppliedState combines the raft and lease applied indices with
// mvcc stats. These are all persisted on each transition of the Raft
// state machine (i.e. on each Raft application), so they are stored
// in the same RocksDB key for efficiency.
message RangeAppliedState {
  option (gogoproto.equal) = true;
  option (gogoproto.populate) = true;

  // raft_applied_index is the highest (and last) index applied to the Raft
  // state machine.
  uint64 raft_applied_index = 1;
  // lease_applied_index is the highest (and last) lease index applied to the
  // Raft state machine.
  uint64 lease_applied_index = 2;
  // range_stats is the set of mvcc stats that accounts for the current value
  // of the Raft state machine.
  MVCCPersistentStats range_stats = 3 [(gogoproto.nullable) = false];
}

// MVCCWriteValueOp corresponds to a value being written outside of a
// transaction.
message MVCCWriteValueOp {
  // key is the key that was written.
  bytes key = 1;
  // timestamp is the timestamp associated with the write.
  util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false];
  // value is the value that was written.
  bytes value = 3;
  // NOTE(review): presumably the value stored at key before this write;
  // confirm against the code that populates it.
  bytes prev_value = 4;
}

// MVCCWriteIntentOp corresponds to an intent being written for a given
// transaction.
message MVCCWriteIntentOp {
  // txn_id is the UUID of the transaction the intent belongs to.
  bytes txn_id = 1 [
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
    (gogoproto.customname) = "TxnID",
    (gogoproto.nullable) = false];
  // NOTE(review): presumably the transaction's anchor key (TxnMeta.key);
  // confirm against the code that populates it.
  bytes txn_key = 2;
  // NOTE(review): presumably TxnMeta.min_timestamp of the writing
  // transaction; confirm against the code that populates it. This field
  // (number 4) is declared ahead of field 3.
  util.hlc.Timestamp txn_min_timestamp = 4 [(gogoproto.nullable) = false];
  // timestamp is the timestamp associated with the intent write.
  util.hlc.Timestamp timestamp = 3 [(gogoproto.nullable) = false];
}

// MVCCUpdateIntentOp corresponds to an intent being updated at a larger
// timestamp for a given transaction.
message MVCCUpdateIntentOp {
  // txn_id is the UUID of the transaction the intent belongs to.
  bytes txn_id = 1 [
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
    (gogoproto.customname) = "TxnID",
    (gogoproto.nullable) = false];
  // timestamp is the larger timestamp the intent was updated to.
  util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false];
}

// MVCCCommitIntentOp corresponds to an intent being committed for a given
// transaction.
message MVCCCommitIntentOp {
  // txn_id is the UUID of the transaction the intent belongs to.
  bytes txn_id = 1 [
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
    (gogoproto.customname) = "TxnID",
    (gogoproto.nullable) = false];
  // key is the key of the committed intent.
  bytes key = 2;
  // timestamp is the timestamp associated with the commit.
  util.hlc.Timestamp timestamp = 3 [(gogoproto.nullable) = false];
  // value is the committed value.
  bytes value = 4;
  // NOTE(review): presumably the value stored at key before the commit;
  // confirm against the code that populates it.
  bytes prev_value = 5;
}

// MVCCAbortIntentOp corresponds to an intent being aborted for a given
// transaction.
//
// This operation does not necessarily indicate that the intent's transaction
// was aborted, just that an intent was removed without being committed. For
// instance, a committed transaction will abort any intents it decided not to
// write in its final epoch.
message MVCCAbortIntentOp {
  // txn_id is the UUID of the transaction the aborted intent belonged to.
  bytes txn_id = 1 [
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
    (gogoproto.customname) = "TxnID",
    (gogoproto.nullable) = false];
}

// MVCCAbortTxnOp corresponds to an entire transaction being aborted. The
// operation indicates that none of the transaction's intents will ever be
// committed.
message MVCCAbortTxnOp {
  // txn_id is the UUID of the aborted transaction.
  bytes txn_id = 1 [
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
    (gogoproto.customname) = "TxnID",
    (gogoproto.nullable) = false];
}

// MVCCLogicalOp is a union of all logical MVCC operation types.
message MVCCLogicalOp {
  // gogoproto.onlyone treats this message as a union: only one of the
  // fields below is expected to be set at a time.
  option (gogoproto.onlyone) = true;

  MVCCWriteValueOp   write_value   = 1;
  MVCCWriteIntentOp  write_intent  = 2;
  MVCCUpdateIntentOp update_intent = 3;
  MVCCCommitIntentOp commit_intent = 4;
  MVCCAbortIntentOp  abort_intent  = 5;
  MVCCAbortTxnOp     abort_txn     = 6;
}