github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/enginepb/mvcc3.proto (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  syntax = "proto3";
    12  package cockroach.storage.enginepb;
    13  
    14  import "util/hlc/timestamp.proto";
    15  import "gogoproto/gogo.proto";
    16  
    17  // TxnMeta is the metadata of a Transaction record.
    18  message TxnMeta {
    19    option (gogoproto.goproto_stringer) = false; // no generated String() method
    20    option (gogoproto.equal) = true; // generate an Equal method
    21    option (gogoproto.populate) = true; // generate a NewPopulated test helper
    22  
    23    // id is a unique UUID value which identifies the transaction.
    24    // This field is always filled in.
    25    bytes id = 1 [(gogoproto.customname) = "ID",
    26        (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
    27        (gogoproto.nullable) = false];
    28    reserved 2; // retired field number; must not be reused
    29    // key is the key which anchors the transaction. This is typically
    30    // the first key read or written during the transaction and
    31    // determines which range in the cluster will hold the transaction
    32    // record.
    33    bytes key = 3; // TODO(tschottdorf): [(gogoproto.casttype) = "Key"];
    34    // Incremented on txn retry.
    35    int32 epoch = 4 [(gogoproto.casttype) = "TxnEpoch"];
    36    // The proposed timestamp for the transaction. This starts as the current wall
    37    // time on the txn coordinator, and is forwarded by the timestamp cache if the
    38    // txn attempts to write "beneath" another txn's writes.
    39    //
    40    // Writes within the txn are performed using the most up-to-date value of this
    41    // timestamp that is available. For example, suppose a txn starts at some
    42    // timestamp, writes a key/value, and has its timestamp forwarded while doing
    43    // so because a later version already exists at that key. As soon as the txn
    44    // coordinator learns of the updated timestamp, it will begin performing
    45    // writes at the updated timestamp. The coordinator may, however, continue
    46    // issuing writes at the original timestamp before it learns about the
    47    // forwarded timestamp. The process of resolving the intents when the txn
    48    // commits will bump any intents written at an older timestamp to the final
    49    // commit timestamp.
    50    //
    51    // Note that reads do not occur at this timestamp; they instead occur at
    52    // ReadTimestamp, which is tracked in the containing roachpb.Transaction.
    53    //
    54    // Writes used to be performed at the txn's read timestamp, which was
    55    // necessary to avoid lost update anomalies in snapshot isolation mode. We no
    56    // longer support snapshot isolation mode, and there are now several important
    57    // reasons that writes are performed at this timestamp instead of the txn's
    58    // original timestamp:
    59    //
    60    //    1. This timestamp is forwarded by the timestamp cache when this
    61    //       transaction attempts to write beneath a more recent read. Leaving the
    62    //       intent at the original timestamp would write beneath that read, which
    63    //       would violate an invariant that time-bound iterators rely on.
    64    //
    65    //       For example, consider a client that uses a time-bound iterator to
    66    //       poll for changes to a key. The client reads (ts5, ts10], sees no
    67    //       writes, and reports that no changes have occurred up to ts10. Then a
    68    //       txn writes an intent at its original timestamp ts7. The txn's
    69    //       timestamp is forwarded to ts11 by the timestamp cache thanks to the
    70    //       client's read. Meanwhile, the client reads (ts10, ts15] and, again
    71    //       seeing no intents, reports that no changes have occurred to the key
    72    //       up to ts15. Now the txn commits at ts11 and bumps the intent to ts11.
    73    //       But the client thinks it has seen all changes up to ts15, and so never
    74    //       sees the intent! We avoid this problem by writing intents at the
    75    //       provisional commit timestamp instead. In this example, the intent
    76    //       would instead be written at ts11 and picked up by the client's next
    77    //       read from (ts10, ts15].
    78    //
    79    //    2. Unnecessary PushTxn roundtrips are avoided. If a transaction is
    80    //       forwarded from ts5 to ts10, the rest of its intents will be written
    81    //       at ts10. Reads at t < ts10 that encounter these intents can ignore
    82    //       them; if the intents had instead been left at ts5, these reads would
    83    //       have needed to send PushTxn requests just to find out that the txn
    84    //       had, in fact, been forwarded to a non-conflicting time.
    85    //
    86    //    3. Unnecessary intent rewriting is avoided. Writing at the original
    87    //       timestamp when this timestamp has been forwarded guarantees that the
    88    //       value will need to be rewritten at the forwarded timestamp if the
    89    //       transaction commits.
    90    //
    91    util.hlc.Timestamp write_timestamp = 5 [(gogoproto.nullable) = false];
    92    // The timestamp that the transaction was assigned by its gateway when it
    93    // began its first epoch. This is the earliest timestamp that the transaction
    94    // could have written any of its intents at.
    95    //
    96    // The timestamp is currently used in three places:
    97    // 1. by the transaction itself and by concurrent transactions when
    98    //    determining whether this transaction's record can be initially
    99    //    written. The timestamp is compared against the transaction's
   100    //    corresponding timestamp cache entry to ensure that a
   101    //    finalized transaction can never commit, either after a replay
   102    //    or a transaction abort. See CanCreateTxnRecord.
   103    // 2. by intent resolution to efficiently scan for intents while
   104    //    using a time-bound iterator - i.e. there can be intents to
   105    //    resolve up to the timestamp that the txn started with.
   106    // 3. by would-be pushers, when they run into an intent but the corresponding
   107    //    txn record was not yet written. In that case, the pusher uses this field
   108    //    as an indication of a timestamp when the pushee's coordinator is known
   109    //    to have been alive.
   110    util.hlc.Timestamp min_timestamp = 9 [(gogoproto.nullable) = false];
   111    // The transaction's priority, ratcheted on transaction pushes.
   112    int32 priority = 6 [(gogoproto.casttype) = "TxnPriority"];
   113    // A zero-indexed sequence number which is increased on each request
   114    // sent as part of the transaction. When set in the header of a batch of
   115    // requests, the value will correspond to the sequence number of the
   116    // last request. Used to provide idempotency and to protect against
   117    // out-of-order application (by means of a transaction retry).
   118    int32 sequence = 7 [(gogoproto.casttype) = "TxnSeq"];
   119  
   120    reserved 8; // retired field number; must not be reused
   121  }
   122  
   123  // IgnoredSeqNumRange describes a range of ignored seqnums.
   124  // The range is inclusive on both ends.
   125  message IgnoredSeqNumRange {
   126    option (gogoproto.equal) = true; // generate an Equal method
   127    option (gogoproto.populate) = true; // generate a NewPopulated test helper
   128    int32 start = 1 [(gogoproto.casttype) = "TxnSeq"]; // first ignored seqnum (inclusive)
   129    int32 end = 2 [(gogoproto.casttype) = "TxnSeq"]; // last ignored seqnum (inclusive)
   130  }
   131  
   132  // MVCCStatsDelta is convertible to MVCCStats, but uses signed variable width
   133  // encodings (sint64, i.e. ZigZag varints) for most fields, which makes it more
   134  // efficient to store negative values. This makes the encodings incompatible.
   135  message MVCCStatsDelta {
   136    option (gogoproto.equal) = true; // generate an Equal method
   137  
   138    int64 contains_estimates = 14; // plain int64 varint, not ZigZag like the sint64 fields
   139    sfixed64 last_update_nanos = 1; // fixed-width 8-byte encoding
   140    sfixed64 intent_age = 2; // fixed-width 8-byte encoding
   141    sfixed64 gc_bytes_age = 3 [(gogoproto.customname) = "GCBytesAge"]; // fixed-width 8-byte encoding
   142    sint64 live_bytes = 4; // sint64 from here down: small negative deltas stay compact
   143    sint64 live_count = 5;
   144    sint64 key_bytes = 6;
   145    sint64 key_count = 7;
   146    sint64 val_bytes = 8;
   147    sint64 val_count = 9;
   148    sint64 intent_bytes = 10;
   149    sint64 intent_count = 11;
   150    sint64 sys_bytes = 12;
   151    sint64 sys_count = 13;
   152  }
   153  
   154  // MVCCPersistentStats is convertible to MVCCStats, but uses signed variable
   155  // width encodings for most fields that make it efficient to store positive
   156  // values but inefficient to store negative values. This makes the encodings
   157  // incompatible.
   158  message MVCCPersistentStats {
   159    option (gogoproto.equal) = true; // generate an Equal method
   160    option (gogoproto.populate) = true; // generate a NewPopulated test helper
   161  
   162    int64 contains_estimates = 14; // must never go negative absent a bug
   163    sfixed64 last_update_nanos = 1; // fixed-width 8-byte encoding
   164    sfixed64 intent_age = 2; // fixed-width 8-byte encoding
   165    sfixed64 gc_bytes_age = 3 [(gogoproto.customname) = "GCBytesAge"]; // fixed-width 8-byte encoding
   166    int64 live_bytes = 4; // int64 varint from here down: a negative value always costs 10 bytes
   167    int64 live_count = 5;
   168    int64 key_bytes = 6;
   169    int64 key_count = 7;
   170    int64 val_bytes = 8;
   171    int64 val_count = 9;
   172    int64 intent_bytes = 10;
   173    int64 intent_count = 11;
   174    int64 sys_bytes = 12;
   175    int64 sys_count = 13;
   176  }
   177  
   178  // RangeAppliedState combines the raft and lease applied indices with
   179  // mvcc stats. These are all persisted on each transition of the Raft
   180  // state machine (i.e. on each Raft application), so they are stored
   181  // in the same RocksDB key for efficiency.
   182  message RangeAppliedState {
   183    option (gogoproto.equal) = true; // generate an Equal method
   184    option (gogoproto.populate) = true; // generate a NewPopulated test helper
   185  
   186    // raft_applied_index is the highest (and last) index applied to the Raft
   187    // state machine.
   188    uint64 raft_applied_index = 1;
   189    // lease_applied_index is the highest (and last) lease index applied to the
   190    // Raft state machine.
   191    uint64 lease_applied_index = 2;
   192    // range_stats is the set of mvcc stats that accounts for the current value
   193    // of the Raft state machine.
   194    MVCCPersistentStats range_stats = 3 [(gogoproto.nullable) = false]; // embedded by value, not a pointer
   195  }
   196  
   197  // MVCCWriteValueOp corresponds to a value being written outside of a
   198  // transaction.
   199  message MVCCWriteValueOp {
   200    bytes key = 1; // presumably the key that was written - confirm against the producer
   201    util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false]; // embedded by value, not a pointer
   202    bytes value = 3; // presumably the newly written value - confirm
   203    bytes prev_value = 4; // presumably the value that was replaced, if any - confirm
   204  }
   205  
   206  // MVCCWriteIntentOp corresponds to an intent being written for a given
   207  // transaction.
   208  message MVCCWriteIntentOp {
   209    bytes txn_id = 1 [
   210      (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
   211      (gogoproto.customname) = "TxnID",
   212      (gogoproto.nullable) = false];
   213    bytes txn_key = 2; // presumably the txn's anchor key (cf. TxnMeta.key) - confirm
   214    util.hlc.Timestamp txn_min_timestamp = 4 [(gogoproto.nullable) = false]; // number 4 although declared before field 3; numbers, not order, define the wire format
   215    util.hlc.Timestamp timestamp = 3 [(gogoproto.nullable) = false]; // presumably the timestamp the intent was written at - confirm
   216  }
   217  
   218  // MVCCUpdateIntentOp corresponds to an intent being updated at a larger
   219  // timestamp for a given transaction.
   220  message MVCCUpdateIntentOp {
   221    bytes txn_id = 1 [
   222      (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
   223      (gogoproto.customname) = "TxnID",
   224      (gogoproto.nullable) = false];
   225    util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false]; // presumably the new, larger intent timestamp - confirm
   226  }
   227  
   228  // MVCCCommitIntentOp corresponds to an intent being committed for a given
   229  // transaction.
   230  message MVCCCommitIntentOp {
   231    bytes txn_id = 1 [
   232      (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
   233      (gogoproto.customname) = "TxnID",
   234      (gogoproto.nullable) = false];
   235    bytes key = 2; // presumably the key of the committed intent - confirm
   236    util.hlc.Timestamp timestamp = 3 [(gogoproto.nullable) = false]; // presumably the commit timestamp - confirm
   237    bytes value = 4; // presumably the committed value - confirm
   238    bytes prev_value = 5; // presumably the value that was replaced, if any - confirm
   239  }
   240  
   241  // MVCCAbortIntentOp corresponds to an intent being aborted for a given
   242  // transaction.
   243  //
   244  // This operation does not necessarily indicate that the intent's transaction
   245  // was aborted, just that an intent was removed without being committed. For
   246  // instance, a committed transaction will abort any intents it decided not to
   247  // write in its final epoch.
   248  message MVCCAbortIntentOp {
   249    bytes txn_id = 1 [ // the only field: the op carries no key or timestamp
   250      (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
   251      (gogoproto.customname) = "TxnID",
   252      (gogoproto.nullable) = false];
   253  }
   254  
   255  // MVCCAbortTxnOp corresponds to an entire transaction being aborted. The
   256  // operation indicates that none of the transaction's intents will ever be
   257  // committed.
   258  message MVCCAbortTxnOp {
   259    bytes txn_id = 1 [ // the only field: identifies the aborted transaction
   260      (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
   261      (gogoproto.customname) = "TxnID",
   262      (gogoproto.nullable) = false];
   263  }
   264  
   265  // MVCCLogicalOp is a union of all logical MVCC operation types.
   266  message MVCCLogicalOp {
   267    option (gogoproto.onlyone) = true; // union semantics: only one of the fields below may be set
   268  
   269    MVCCWriteValueOp   write_value   = 1;
   270    MVCCWriteIntentOp  write_intent  = 2;
   271    MVCCUpdateIntentOp update_intent = 3;
   272    MVCCCommitIntentOp commit_intent = 4;
   273    MVCCAbortIntentOp  abort_intent  = 5;
   274    MVCCAbortTxnOp     abort_txn     = 6;
   275  }