// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

syntax = "proto3";
package cockroach.roachpb;
option go_package = "roachpb";

import "kv/kvserver/concurrency/lock/locking.proto";
import "roachpb/metadata.proto";
import "storage/enginepb/mvcc.proto";
import "storage/enginepb/mvcc3.proto";
import "util/hlc/timestamp.proto";
import "gogoproto/gogo.proto";

// Span is a key range with an inclusive start Key and an exclusive end Key.
message Span {
  option (gogoproto.equal) = true;

  option (gogoproto.goproto_stringer) = false;
  option (gogoproto.populate) = true;

  reserved 1, 2;
  // The start key of the key range.
  bytes key = 3 [(gogoproto.casttype) = "Key"];
  // The end key of the key range. The value is empty if the key range
  // contains only a single key. Otherwise, it must order strictly after Key.
  // In such a case, the Span encompasses the key range from Key to EndKey,
  // including Key and excluding EndKey.
  bytes end_key = 4 [(gogoproto.casttype) = "Key"];
}

// ValueType defines a set of type constants placed in the "tag" field of Value
// messages. These are defined as a protocol buffer enumeration so that they
// can be used portably between our Go and C code. The tags are used by the
// RocksDB Merge Operator to perform specialized merges.
enum ValueType {
  // This is a subset of the SQL column type values, representing the underlying
  // storage for various types. The DELIMITED_foo entries each represent a foo
  // variant that self-delimits length.
  UNKNOWN = 0;
  reserved 7;
  INT = 1;
  FLOAT = 2;
  BYTES = 3;
  DELIMITED_BYTES = 8;
  TIME = 4;
  DECIMAL = 5;
  DELIMITED_DECIMAL = 9;
  DURATION = 6;
  TIMETZ = 12;
  GEO = 13;

  // TUPLE represents a DTuple, encoded as repeated pairs of varint field number
  // followed by a value encoded Datum.
  TUPLE = 10;

  BITARRAY = 11;

  // TIMESERIES is applied to values which contain InternalTimeSeriesData.
  TIMESERIES = 100;
}

// Value specifies the value at a key. Multiple values at the same key are
// supported based on timestamp. The data stored within a value is typed
// (ValueType) and custom encoded into the raw_bytes field. A custom encoding
// is used instead of separate proto fields to avoid proto overhead and to
// avoid unnecessary encoding and decoding as the value gets read from disk and
// passed through the network. The format is:
//
//   <4-byte-checksum><1-byte-tag><encoded-data>
//
// A CRC-32-IEEE checksum is computed from the associated key, tag and encoded
// data, in that order.
//
// TODO(peter): Is a 4-byte checksum overkill when most (all?) values
// will be less than 64KB?
message Value {
  option (gogoproto.equal) = true;

  // raw_bytes contains the encoded value and checksum.
  //
  // Its contents may be modified on the next call to Value.SetFoo.
  bytes raw_bytes = 1;
  // Timestamp of value.
  util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false];
}

// KeyValue is a pair of Key and Value for returned Key/Value pairs
// from ScanRequest/ScanResponse. It embeds a Key and a Value.
message KeyValue {
  bytes key = 1 [(gogoproto.casttype) = "Key"];
  Value value = 2 [(gogoproto.nullable) = false];
}

// A StoreIdent uniquely identifies a store in the cluster.
// The StoreIdent is written to the underlying storage engine at a
// store-reserved system key (KeyLocalIdent).
message StoreIdent {
  bytes cluster_id = 1 [(gogoproto.nullable) = false,
    (gogoproto.customname) = "ClusterID",
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID"];
  int32 node_id = 2 [(gogoproto.customname) = "NodeID", (gogoproto.casttype) = "NodeID"];
  int32 store_id = 3 [(gogoproto.customname) = "StoreID", (gogoproto.casttype) = "StoreID"];
}

// A SplitTrigger is run after a successful commit of an AdminSplit
// command. It provides the updated left hand side of the split's
// range descriptor (left_desc) and the new range descriptor covering
// the right hand side of the split (right_desc). This information
// allows the final bookkeeping for the split to be completed and the
// new range put into operation.
message SplitTrigger {
  option (gogoproto.equal) = true;

  RangeDescriptor left_desc = 1 [(gogoproto.nullable) = false];
  RangeDescriptor right_desc = 2 [(gogoproto.nullable) = false];
  reserved 3;
}

// A MergeTrigger is run after a successful commit of an AdminMerge
// command. It provides the updated left hand side of the merge's
// range descriptor (left_desc) that now encompasses what was
// originally both ranges and the soon-to-be-invalid range descriptor
// that used to cover the subsumed, right hand side of the merge
// (right_desc). This information allows the final bookkeeping for the
// merge to be completed and put into operation.
message MergeTrigger {
  option (gogoproto.equal) = true;

  RangeDescriptor left_desc = 1 [(gogoproto.nullable) = false];
  RangeDescriptor right_desc = 2 [(gogoproto.nullable) = false];

  reserved 3;

  storage.enginepb.MVCCStats right_mvcc_stats = 4 [
    (gogoproto.customname) = "RightMVCCStats",
    (gogoproto.nullable) = false
  ];

  // FreezeStart is a timestamp that is guaranteed to be greater than the
  // timestamps at which any requests were serviced by the responding replica
  // before it stopped responding to requests altogether (in anticipation of
  // being subsumed). It is suitable for use as the timestamp cache's low water
  // mark for the keys previously owned by the subsumed range.
  util.hlc.Timestamp freeze_start = 5 [(gogoproto.nullable) = false];
}

// ReplicaChangeType is a parameter of ChangeReplicasTrigger.
enum ReplicaChangeType {
  option (gogoproto.goproto_enum_prefix) = false;

  ADD_REPLICA = 0;
  REMOVE_REPLICA = 1;
}

// ChangeReplicasTrigger carries out a replication change. The Added() and
// Removed() methods return the replicas being added and removed, respectively.
// If more than one change is specified (i.e. len(Added())+len(Removed())
// exceeds one), this initiates an atomic replication change in which the
// "removed" replicas are of type VOTER_OUTGOING or VOTER_DEMOTING (if they are
// to be turned into learners instead); as a caveat a single demotion already
// counts as two changes (and is tracked as a Removal() only). This joint
// configuration is left via another ChangeReplicasTrigger which does not
// specify any additions nor removals.
message ChangeReplicasTrigger {
  option (gogoproto.equal) = true;

  option (gogoproto.goproto_stringer) = false;

  // TODO(tbg): remove once we know that no trigger using this will ever be
  // applied (this will require something like #39182).
  //
  // TODO(tbg): when removing this, also rename internal_x_replicas to just
  // x_replicas and remove the getter.
  ReplicaChangeType deprecated_change_type = 1;
  // The replica being modified.
  // TODO(tbg): remove once we know that no trigger using this will ever be
  // applied (this will require something like #39182).
  ReplicaDescriptor deprecated_replica = 2 [(gogoproto.nullable) = false];
  // The new replica list with this change applied.
  repeated ReplicaDescriptor deprecated_updated_replicas = 3 [(gogoproto.nullable) = false];
  // The next replica id to use with this change applied.
  int32 deprecated_next_replica_id = 4 [(gogoproto.customname) = "DeprecatedNextReplicaID", (gogoproto.casttype) = "ReplicaID"];
  // The updated range descriptor. If desc is non-nil, then it overrides
  // updated_replicas and next_replica_id. This incremental addition is needed
  // to maintain backwards compatibility.
  // TODO(jeffreyxiao): Remove deprecated_updated_replicas and
  // deprecated_next_replica_id in 20.1.
  RangeDescriptor desc = 5;
  // The new replicas added to the range descriptor in this change, exactly as
  // they appear in the updated range descriptor.
  repeated ReplicaDescriptor internal_added_replicas = 6 [(gogoproto.nullable) = false];
  // The replicas whose removal is being initiated in this change. If the
  // replica is still present as an outgoing voter in the updated descriptor
  // (i.e. if this is a full atomic replication change), then the replica here
  // must match that in the descriptor; otherwise it must match the replica
  // removed from the descriptor in the course of this change (which is itself
  // not visible to this trigger).
  repeated ReplicaDescriptor internal_removed_replicas = 7 [(gogoproto.nullable) = false];
}

// ModifiedSpanTrigger indicates that a specific span has been modified.
// This can be used to trigger scan-and-gossip for the given span.
message ModifiedSpanTrigger {
  option (gogoproto.equal) = true;

  bool system_config_span = 1;
  // node_liveness_span is set to indicate that node liveness records
  // need re-gossiping after modification or range lease updates. The
  // span is set to a single key when nodes update their liveness records
  // with heartbeats to extend the expiration timestamp. Changes to the
  // range lease for the range containing node liveness trigger re-gossip
  // of the entire node liveness key range.
  Span node_liveness_span = 2;
}

// StickyBitTrigger indicates that the sticky bit of a range should be changed.
// This trigger is used in two cases:
// 1. Unsplitting a range. Note that unsplitting and merging are different
//    operations. Unsplitting a range will only update the expiration time
//    associated with the range to hlc.Timestamp{}.
// 2. Splitting at the start key of a range. In this case, no range is split but
//    the sticky bit might be updated, so we need to use this trigger instead
//    of SplitTrigger.
//
// Note that the sticky_bit should always be set to the same timestamp used to
// update the range descriptor and it's the client's responsibility that the
// timestamps are aligned.
message StickyBitTrigger {
  option (gogoproto.equal) = true;

  // Set to nil to remove a RangeDescriptor's sticky bit.
  // NOTE(review): the field is declared non-nullable, so "nil" here presumably
  // means the zero hlc.Timestamp{} mentioned above -- confirm.
  util.hlc.Timestamp sticky_bit = 1 [(gogoproto.nullable) = false];
}

// InternalCommitTrigger encapsulates all of the internal-only commit triggers.
// Only one may be set.
message InternalCommitTrigger {
  option (gogoproto.equal) = true;

  // InternalCommitTrigger is always nullable, and these getters are
  // nil-safe, which is often convenient.
  option (gogoproto.goproto_getters) = true;

  SplitTrigger split_trigger = 1;
  MergeTrigger merge_trigger = 2;
  ChangeReplicasTrigger change_replicas_trigger = 3;
  ModifiedSpanTrigger modified_span_trigger = 4;
  StickyBitTrigger sticky_bit_trigger = 5;
}

// TransactionStatus specifies possible states for a transaction.
enum TransactionStatus {
  option (gogoproto.goproto_enum_prefix) = false;

  // PENDING is the default state for a new transaction. Transactions
  // move from PENDING to one of COMMITTED or ABORTED. Mutations made
  // as part of a PENDING transaction are recorded as "intents" in
  // the underlying MVCC model.
  PENDING = 0;
  // STAGING is the state for a transaction which has issued all of
  // its writes and is in the process of committing. Mutations made
  // as part of a transaction in this state may still be in-flight
  // and can not be assumed to have succeeded. A transaction may
  // transition from the STAGING to the COMMITTED state only if all
  // of its in-flight mutations are confirmed to have succeeded. A
  // transaction may transition from the STAGING to PENDING or ABORTED
  // state only if one of its in-flight requests is prevented from ever
  // succeeding.
  STAGING = 3;
  // COMMITTED is the state for a transaction which has been
  // committed. Mutations made as part of a transaction which is moved
  // into COMMITTED state become durable and visible to other
  // transactions, moving from "intents" to permanent versioned
  // values.
  COMMITTED = 1;
  // ABORTED is the state for a transaction which has been aborted.
  // Mutations made as part of a transaction which is moved into
  // ABORTED state are deleted and are never made visible to other
  // transactions.
  ABORTED = 2;
}

// ObservedTimestamp pairs a NodeID with a timestamp. A Transaction keeps a
// list of these (observed_timestamps) mapping NodeIDs to timestamps as
// observed from their local clock during the transaction.
message ObservedTimestamp {
  option (gogoproto.equal) = true;

  option (gogoproto.populate) = true;

  int32 node_id = 1 [(gogoproto.customname) = "NodeID", (gogoproto.casttype) = "NodeID"];
  util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false];
}

// A Transaction is a unit of work performed on the database.
// Cockroach transactions always operate at the serializable isolation
// level. Each Cockroach transaction is assigned a random priority.
// This priority will be used to decide whether a transaction will be
// aborted during contention.
//
// If you add fields to Transaction you'll need to update
// Transaction.Clone. Failure to do so will result in test failures.
message Transaction {
  option (gogoproto.equal) = true;

  option (gogoproto.goproto_stringer) = false;
  option (gogoproto.populate) = true;

  // The transaction metadata. This field includes the subset of information
  // that is persisted with every write intent.
  storage.enginepb.TxnMeta meta = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
  // A free-text identifier for debug purposes.
  string name = 2;
  // The status of the transaction.
  TransactionStatus status = 4;
  // The last time that the transaction's record was sent a heartbeat by its
  // coordinator to indicate client activity. Concurrent transactions will
  // avoid aborting a transaction if they observe recent-enough activity.
  util.hlc.Timestamp last_heartbeat = 5 [(gogoproto.nullable) = false];
  // The timestamp at which the transaction's current epoch started. Up until
  // version 19.2, this was used in conjunction with read_timestamp to
  // determine a transaction's read timestamp. In 20.1, read_timestamp
  // alone is sufficient. This is just maintained for compatibility with 19.2.
  // TODO(andrei): Remove in 20.2.
  util.hlc.Timestamp deprecated_orig_timestamp = 6 [(gogoproto.nullable) = false];
  // This flag is set if the transaction's timestamp was "leaked" beyond the
  // transaction (e.g. via cluster_logical_timestamp()). If true, this prevents
  // the transaction's timestamp from being pushed, which means that the txn
  // can't commit at a higher timestamp without resorting to a client-side
  // retry.
  bool commit_timestamp_fixed = 16;
  // The transaction's read timestamp. All reads are performed at this
  // timestamp, ensuring that the transaction runs on top of a consistent
  // snapshot of the database.
  // Writes are performed at the transaction's write timestamp (meta.timestamp).
  // The write timestamp can diverge from the read timestamp when a write is
  // "pushed": for example in case a write runs into the timestamp cache, we're
  // forced to write at a higher timestamp. Being serializable, the transaction
  // can't commit if the write timestamp diverged from the read timestamp unless
  // we prove that the read timestamp can also be advanced to match the
  // write timestamp; it can be advanced if the two timestamps are equivalent
  // for everything that the transaction has read (meaning that there are no
  // values in between the read timestamp and the write timestamp for any key in
  // the txn's read set). We call checking whether the read timestamp can
  // advance "refreshing the read set". So, the read timestamp advances after a
  // successful refresh or, if the refresh is unsuccessful, after a transaction
  // restart.
  util.hlc.Timestamp read_timestamp = 15 [(gogoproto.nullable) = false];
  // Initial Timestamp + clock skew. Reads which encounter values with
  // timestamps between timestamp and max_timestamp trigger a txn
  // retry error, unless the node being read is listed in observed_timestamps
  // (in which case no more read uncertainty can occur).
  // The case max_timestamp < timestamp is possible for transactions which have
  // been pushed; in this case, max_timestamp should be ignored.
  util.hlc.Timestamp max_timestamp = 7 [(gogoproto.nullable) = false];
  // A list of <NodeID, timestamp> pairs. The list maps NodeIDs to timestamps
  // as observed from their local clock during this transaction. The purpose of
  // this list is to avoid uncertainty related restarts which normally occur
  // when reading a value in the near future as per the max_timestamp field.
  //
  // ### Meaning:
  //
  // Morally speaking, having an entry for a node in this list means that this
  // node has been visited before, and that no more uncertainty restarts are
  // expected for operations served from it. However, this is not entirely
  // accurate. For example, say a txn starts with read_timestamp=1 (and some
  // large max_timestamp). It then reads key "a" from node A, registering an
  // entry `A -> 5` in the process (`5` happens to be a timestamp taken off
  // that node's clock at the start of the read).
  //
  // Now assume that some other transaction writes and commits a value at key "b"
  // and timestamp 4 (again, served by node A), and our transaction attempts to
  // read that key. Since there is an entry in its observed_timestamps for A,
  // our uncertainty window is `[read_timestamp, 5) = [1, 5)` but the value at
  // key "b" is in that window, and so we will restart. However, we will restart
  // with a timestamp that is at least as high as our entry in the list for node
  // A, so no future operation on node A will be uncertain.
  //
  // ### Correctness:
  //
  // Thus, expressed properly, we can say that when a node has been read from
  // successfully before by a transaction, uncertainty for values written by a
  // leaseholder on that node is restricted to values with timestamps in the
  // interval [read_timestamp, first_visit_timestamp). An upper bound can be
  // placed on the uncertainty window because we are guaranteed that at the time
  // that the transaction first visited the node, none of the Ranges that it was
  // a leaseholder for had served any writes at higher timestamps than the clock
  // reading we observe. This implies the following property:
  //
  //   Any writes that the transaction may later see written by leaseholders on
  //   this node at higher timestamps than the observed timestamp could not have
  //   taken place causally before this transaction and can be ignored for the
  //   purposes of uncertainty.
  //
  // There are two invariants necessary for this property to hold:
  // 1. a leaseholder's clock must always be equal to or greater than the timestamp
  //    of all writes that it has served. This is trivial to enforce for
  //    non-transactional writes. It is more complicated for transactional writes
  //    which may move their commit timestamp forward over their lifetime before
  //    committing, even after writing intents on remote Ranges. To accommodate
  //    this situation, transactions ensure that at the time of their commit, any
  //    leaseholder for a Range that contains one of its intents has an HLC clock
  //    with an equal or greater timestamp than the transaction's commit timestamp.
  //    TODO(nvanbenschoten): This is violated by txn refreshes. See #36431.
  // 2. a leaseholder's clock must always be equal to or greater than the timestamp
  //    of all writes that previous leaseholders for its Range have served. We
  //    enforce that when a Replica acquires a lease it bumps its node's clock to a
  //    time higher than the previous leaseholder's clock when it stopped serving
  //    writes. This is accomplished cooperatively for lease transfers and through
  //    a stasis period before lease expiration for lease acquisitions. It then
  //    follows by induction that, in conjunction with the previous invariant, this
  //    invariant holds for all leaseholders, given that a Range's initial
  //    leaseholder assumes responsibility for an empty range with no writes.
  //
  // ### Usage:
  //
  // The property ensures that when this list holds a corresponding entry for
  // the node who owns the lease that the current request is executing under, we
  // can run the request with the list's timestamp as the upper bound for its
  // uncertainty interval, limiting (and often avoiding) uncertainty restarts.
  // We do this by lowering the request's max_timestamp down to the timestamp in
  // the observed timestamp entry, which is done in Replica.limitTxnMaxTimestamp.
  //
  // However, as stated, the correctness property only holds for values at
  // higher timestamps than the observed timestamp written *by leaseholders on
  // this node*. This is critical, as the property tells us nothing about values
  // written by leaseholders on different nodes, even if a lease for one of
  // those Ranges has since moved to a node that we have an observed timestamp
  // entry for. To accommodate this limitation, Replica.limitTxnMaxTimestamp
  // first forwards the timestamp in the observed timestamp entry by the start
  // timestamp of the lease that the request is executing under before using it
  // to limit the request's uncertainty interval.
  //
  // When a transaction is first initialized on a node, it may use a timestamp
  // from the local hybrid logical clock to initialize the corresponding entry
  // in the list. In particular, if `read_timestamp` is taken from that node's
  // clock, we may add that to the list, which eliminates read uncertainty for
  // reads on that node.
  //
  // The slice of observed timestamps is kept sorted by NodeID. Use
  // Transaction.UpdateObservedTimestamp to maintain the sorted order. The
  // slice should be treated as immutable and all updates should be performed
  // on a copy of the slice.
  repeated ObservedTimestamp observed_timestamps = 8 [(gogoproto.nullable) = false];
  // If set, a write performed by the transaction could not be performed at the
  // transaction's read timestamp because a newer value was present. Had our
  // write been performed, it would have overwritten the other value even though
  // that value might not have been read by a previous read in the transaction
  // (i.e. lost update anomaly). The write is still performed, but this flag is
  // set and the txn's write timestamp is bumped, so the client will not be able
  // to commit without performing a refresh.
  //
  // Since 20.1, errors do not carry this flag; only successful BatchResponses
  // do. When possible, such a BatchResponse is preferred to a WriteTooOldError
  // because the former leaves intents behind to act as locks.
  //
  // On the client, the txnSpanRefresher terminates this flag by refreshing
  // eagerly when the flag is set. If the key that generated the write too old
  // condition had been previously read by the transaction, a refresh of the
  // transaction's read span will surely fail. The client is not currently smart
  // enough to avoid hopeless refreshes, though.
  //
  // Historically, this field was also important for SNAPSHOT transactions which
  // could commit in other situations when the write timestamp is bumped, but
  // not when this flag is set (since lost updates cannot be tolerated even in
  // SNAPSHOT). In SERIALIZABLE isolation, transactions generally don't commit
  // with a bumped write timestamp, so this flag is only telling us that a
  // refresh is less likely to succeed than in other cases where
  // ReadTimestamp != WriteTimestamp.
  bool write_too_old = 12;
  // Set of spans that the transaction has acquired locks within. These are
  // spans which must be resolved on txn completion. Note that these spans
  // may be condensed to cover aggregate spans if the keys locked by the
  // transaction exceeded a size threshold.
  //
  // The set logically extends to include the keys of all writes in the
  // in-flight write set. However, those keys are not stored in this set
  // to avoid duplication. This means that elements that are removed from
  // that set should be merged into this one.
  //
  // The slice is maintained in sorted order and all spans are maximally
  // merged such that no two spans here overlap each other. It should be
  // treated as immutable and all updates should be performed on a copy
  // of the slice.
  repeated Span lock_spans = 11 [(gogoproto.nullable) = false];
  // Set of in-flight intent writes that have been issued by the transaction but
  // which may not have succeeded yet. If any in-flight writes are provided, a
  // committing EndTxn request will move a PENDING transaction to the STAGING
  // status instead of the COMMITTED status. These in-flight writes must then
  // all be confirmed as successful before the transaction can be moved from
  // STAGING to COMMITTED. Because of this, the set will only ever contain
  // entries when the transaction is STAGING. For more, see txnCommitter.
  //
  // The slice is maintained in sorted order by sequence number. It should be
  // treated as immutable and all updates should be performed on a copy of the
  // slice.
  repeated SequencedWrite in_flight_writes = 17 [(gogoproto.nullable) = false];
  // A list of ignored seqnum ranges.
  //
  // The slice is maintained as non-overlapping, non-contiguous (i.e. it must
  // coalesce ranges to avoid situations where a range's end seqnum is equal to
  // the next range's start seqnum), and sorted in seqnum order. It should be
  // treated as immutable and all updates should be performed on a copy of the
  // slice.
  repeated storage.enginepb.IgnoredSeqNumRange ignored_seqnums = 18
    [(gogoproto.nullable) = false, (gogoproto.customname) = "IgnoredSeqNums"];

  reserved 3, 9, 13, 14;
}

// A TransactionRecord message contains the subset of the fields in a
// Transaction message that must be persisted in a transaction record.
// It can be thought of as a mask for the fields in Transaction that
// end up persisted in a transaction record.
//
// The message type is wire-compatible with persisted Transaction protos,
// but avoids the overhead of the fields in Transaction that don't need to
// be persisted in a transaction record. It also serves as a specification
// for the fields that must be present in a transaction record.
//
// NOTE: any changes to this type must be reflected in the AsRecord and
// AsTransaction methods.
message TransactionRecord {
  option (gogoproto.equal) = true;
  option (gogoproto.populate) = true;

  // See comments on Transaction proto.
  storage.enginepb.TxnMeta meta = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
  TransactionStatus status = 4;
  util.hlc.Timestamp last_heartbeat = 5 [(gogoproto.nullable) = false];
  repeated Span lock_spans = 11 [(gogoproto.nullable) = false];
  repeated SequencedWrite in_flight_writes = 17 [(gogoproto.nullable) = false];
  repeated storage.enginepb.IgnoredSeqNumRange ignored_seqnums = 18
    [(gogoproto.nullable) = false, (gogoproto.customname) = "IgnoredSeqNums"];

  // Fields on Transaction that are not present in a transaction record.
  reserved 2, 3, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16;
}

// An Intent is a Span together with a Transaction metadata. Intent messages
// are used to reference persistent on-disk write intents. They are used on
// the return path of e.g.
// scans, to report the existence of a write intent
// on a key.
//
// Note: avoid constructing Intent directly; consider using MakeIntent() instead.
message Intent {
  option (gogoproto.equal) = true;

  // SingleKeySpan preserves wire compatibility with an earlier version of this
  // proto which used a Span. An Intent never spans keys, so there was no need
  // for this to contain an EndKey.
  message SingleKeySpan {
    option (gogoproto.equal) = true;

    reserved 1, 2, 4;
    // The start key of the key range.
    bytes key = 3 [(gogoproto.casttype) = "Key"];
  }
  SingleKeySpan single_key_span = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
  storage.enginepb.TxnMeta txn = 2 [(gogoproto.nullable) = false];
}

// A LockAcquisition represents the action of a Transaction acquiring a lock
// with a specified durability level over a Span of keys.
message LockAcquisition {
  option (gogoproto.equal) = true;

  Span span = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
  storage.enginepb.TxnMeta txn = 2 [(gogoproto.nullable) = false];
  kv.kvserver.concurrency.lock.Durability durability = 3;
}

// A LockUpdate is a Span together with Transaction state. LockUpdate messages
// are used to update all locks held by the transaction within the span to the
// transaction's authoritative state. As such, the message is used as input
// argument to intent resolution, to pass the current txn status, timestamps and
// ignored seqnum ranges to the resolution algorithm.
message LockUpdate {
  option (gogoproto.equal) = true;

  // The span whose locks are updated. Non-nullable and embedded into the
  // generated struct via gogoproto.embed.
  Span span = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // Transaction metadata (TxnMeta) of the transaction holding the locks.
  storage.enginepb.TxnMeta txn = 2 [
    (gogoproto.nullable) = false
  ];
  // The transaction status handed to the resolution algorithm.
  TransactionStatus status = 3;
  // The ignored seqnum ranges handed to the resolution algorithm.
  repeated storage.enginepb.IgnoredSeqNumRange ignored_seqnums = 4 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "IgnoredSeqNums"
  ];
}

// A SequencedWrite is a point write to a single key, tagged with a sequence
// number.
message SequencedWrite {
  option (gogoproto.equal) = true;
  option (gogoproto.populate) = true;

  // The key at which the write was performed.
  bytes key = 1 [(gogoproto.casttype) = "Key"];
  // The sequence number of the request that produced the write.
  int32 sequence = 2 [
    (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/storage/enginepb.TxnSeq"
  ];
}

// Lease carries information about a range lease, including its expiration
// and its lease holder.
message Lease {
  option (gogoproto.goproto_stringer) = false;
  option (gogoproto.populate) = true;

  // The timestamp at which the lease begins. It must be greater than the
  // expiration of the last lease; otherwise the lease request is considered
  // invalid.
  util.hlc.Timestamp start = 1 [
    (gogoproto.nullable) = false
  ];

  // The timestamp at which the lease expires. This means that a new lease can
  // be granted for a later timestamp.
  util.hlc.Timestamp expiration = 2;

  // The address of the would-be lease holder.
  ReplicaDescriptor replica = 3 [
    (gogoproto.nullable) = false
  ];

  // The start of the lease stasis period. This field is deprecated.
  util.hlc.Timestamp deprecated_start_stasis = 4;

  // The current timestamp at the time this lease was proposed. It is used
  // after a transfer and after a node restart to enforce that a node only
  // uses leases proposed after the time of said transfer or restart. The
  // field is nullable to help with the rollout (so that a lease applied by
  // some nodes before the rollout and by other nodes after it is serialized
  // the same).
  // TODO(andrei): Make this non-nullable after the rollout.
  util.hlc.Timestamp proposed_ts = 5 [
    (gogoproto.customname) = "ProposedTS"
  ];

  // The epoch of the lease holder's node liveness entry. When this value is
  // non-zero, the start and expiration values are ignored.
  int64 epoch = 6;

  // A zero-indexed sequence number. It is incremented during the acquisition
  // of each new range lease that is not equivalent to the previous range
  // lease (i.e. an acquisition that implies a leaseholder change). It is used
  // to detect lease changes between command proposal and application without
  // requiring that the entire lease be sent through Raft. Lease sequence
  // numbers reflect the "lease equivalency" property (see Lease.Equivalent):
  // two adjacent leases that are equivalent have the same sequence number,
  // and two adjacent leases that are not equivalent have different sequence
  // numbers.
  int64 sequence = 7 [(gogoproto.casttype) = "LeaseSequence"];
}

// AbortSpanEntry holds information about a transaction that has been
// aborted. It is written to a range's AbortSpan if the range may have
// contained intents of the aborted txn. Should the same transaction later
// attempt to read keys it may have written previously, this entry tells the
// transaction that it has aborted and must start fresh with an updated
// priority.
message AbortSpanEntry {
  option (gogoproto.equal) = true;
  option (gogoproto.populate) = true;

  // The key of the associated transaction.
  bytes key = 1 [(gogoproto.casttype) = "Key"];
  // The candidate commit timestamp that the transaction record held at the
  // time it was aborted.
  util.hlc.Timestamp timestamp = 2 [
    (gogoproto.nullable) = false
  ];
  // The priority of the transaction.
  int32 priority = 3 [
    (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/storage/enginepb.TxnPriority"
  ];
}

// LeafTxnInputState is the state from a transaction coordinator that is
// necessary and sufficient to set up a leaf transaction coordinator on
// another node.
message LeafTxnInputState {
  // txn is a copy of the transaction record.
  Transaction txn = 1 [
    (gogoproto.nullable) = false
  ];
  reserved 2, 3, 4, 5, 6;
  // refresh_invalid indicates that the root txn is not collecting refresh
  // spans, so the leaf should avoid collecting them as well. This is an
  // optimization: it avoids the collection work in that case and also
  // possibly reduces memory usage.
  bool refresh_invalid = 7;
  // in_flight_writes stores all writes that are in-flight and have not yet
  // been proven to have succeeded. Overlapping requests must chain on to
  // their success using a QueryIntent request.
  repeated SequencedWrite in_flight_writes = 8 [
    (gogoproto.nullable) = false
  ];
  // Whether stepping mode is enabled. False means synchronous
  // read-own-writes, where every KV read is able to observe the latest
  // writes. True means that KV reads should be done at the read_seq_num
  // specified below.
  bool stepping_mode_enabled = 9;
  // Current read seqnum. When stepping_mode_enabled is true, this field
  // becomes the sequence number used for reads, regardless of the current
  // seqnum generated for writes. It is updated via the
  // (client.TxnSender).Step() operation.
  int32 read_seq_num = 10 [
    (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/storage/enginepb.TxnSeq"
  ];
}

// LeafTxnFinalState is the state from a leaf transaction coordinator
// necessary and sufficient to update a RootTxn on the gateway
// coordinator.
message LeafTxnFinalState {
  // txn is a copy of the transaction record.
  // TODO(knz,andrei): We don't actually need the full txn
  // record. This can be simplified.
  // See: https://github.com/cockroachdb/cockroach/issues/43192
  Transaction txn = 1 [
    (gogoproto.nullable) = false
  ];
  reserved 2;
  // deprecated_command_count indicates that at least one request has been
  // processed in this transaction. It is populated only for compatibility
  // with pre-20.1 nodes.
  // TODO(knz,andrei): Remove this in 20.2.
  int32 deprecated_command_count = 3;
  // refresh_spans holds the key spans read by the leaf. The root will add
  // them to its own tracking of reads.
  repeated Span refresh_spans = 4 [
    (gogoproto.nullable) = false
  ];
  reserved 5, 6;
  // refresh_invalid is set if refresh spans have not been collected, in which
  // case refresh_spans is empty. It may be set either because the leaf was
  // asked not to collect spans or because the leaf's reads exceeded the
  // tracking memory budget.
  bool refresh_invalid = 7;
  reserved 8;
}