github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/roachpb/data.proto (about)

     1  // Copyright 2014 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  syntax = "proto3";
    12  package cockroach.roachpb;
    13  option go_package = "roachpb";
    14  
    15  import "kv/kvserver/concurrency/lock/locking.proto";
    16  import "roachpb/metadata.proto";
    17  import "storage/enginepb/mvcc.proto";
    18  import "storage/enginepb/mvcc3.proto";
    19  import "util/hlc/timestamp.proto";
    20  import "gogoproto/gogo.proto";
    21  
    22  // Span is a key range with an inclusive start Key and an exclusive end Key.
        // A Span whose EndKey is empty refers to the single key Key.
    23  message Span {
    24    option (gogoproto.equal) = true;
    25  
    26    option (gogoproto.goproto_stringer) = false;
    27    option (gogoproto.populate) = true;
    28  
          // Field numbers 1 and 2 are reserved and must not be reused.
    29    reserved 1, 2;
    30    // The start key of the key range.
    31    bytes key = 3 [(gogoproto.casttype) = "Key"];
    32    // The end key of the key range. The value is empty if the key range
    33    // contains only a single key. Otherwise, it must order strictly after Key.
    34    // In such a case, the Span encompasses the key range from Key to EndKey,
    35    // including Key and excluding EndKey.
    36    bytes end_key = 4 [(gogoproto.casttype) = "Key"];
    37  }
    38  
    39  // ValueType defines a set of type constants placed in the "tag" field of Value
    40  // messages. These are defined as a protocol buffer enumeration so that they
    41  // can be used portably between our Go and C code. The tags are used by the
    42  // RocksDB Merge Operator to perform specialized merges.
        //
        // NOTE: the values below are not declared in numeric order. Because the
        // numbers are what gets placed in the tag, they must not be renumbered.
    43  enum ValueType {
    44    // This is a subset of the SQL column type values, representing the underlying
    45    // storage for various types. The DELIMITED_foo entries each represent a foo
    46    // variant that self-delimits length.
    47    UNKNOWN = 0;
          // Value 7 is reserved and must not be reused.
    48    reserved 7;
    49    INT = 1;
    50    FLOAT = 2;
    51    BYTES = 3;
    52    DELIMITED_BYTES = 8;
    53    TIME = 4;
    54    DECIMAL = 5;
    55    DELIMITED_DECIMAL = 9;
    56    DURATION = 6;
    57    TIMETZ = 12;
    58    GEO = 13;
    59  
    60    // TUPLE represents a DTuple, encoded as repeated pairs of varint field number
    61    // followed by a value encoded Datum.
    62    TUPLE = 10;
    63  
    64    BITARRAY = 11;
    65  
    66    // TIMESERIES is applied to values which contain InternalTimeSeriesData.
    67    TIMESERIES = 100;
    68  }
    69  
    70  // Value specifies the value at a key. Multiple values at the same key are
    71  // supported based on timestamp. The data stored within a value is typed
    72  // (ValueType) and custom encoded into the raw_bytes field. A custom encoding
    73  // is used instead of separate proto fields to avoid proto overhead and to
    74  // avoid unnecessary encoding and decoding as the value gets read from disk and
    75  // passed through the network. The format is:
    76  //
    77  //   <4-byte-checksum><1-byte-tag><encoded-data>
    78  //
    79  // A CRC-32-IEEE checksum is computed from the associated key, tag and encoded
    80  // data, in that order.
    81  //
    82  // TODO(peter): Is a 4-byte checksum overkill when most (all?) values
    83  // will be less than 64KB?
    84  message Value {
    85    option (gogoproto.equal) = true;
    86  
    87    // raw_bytes contains the encoded value and checksum.
    88    //
    89    // Its contents may be modified on the next call to Value.SetFoo.
    90    bytes raw_bytes = 1;
    91    // Timestamp of the value; it distinguishes multiple values at the same key.
    92    util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false];
    93  }
    94  
    95  // KeyValue is a pair of Key and Value for returned Key/Value pairs
    96  // from ScanRequest/ScanResponse. It embeds a Key and a Value.
    97  message KeyValue {
          // The key of the pair.
    98    bytes key = 1 [(gogoproto.casttype) = "Key"];
          // The value of the pair. Declared non-nullable.
    99    Value value = 2 [(gogoproto.nullable) = false];
   100  }
   101  
   102  // A StoreIdent uniquely identifies a store in the cluster. The
   103  // StoreIdent is written to the underlying storage engine at a
   104  // store-reserved system key (KeyLocalIdent).
   105  message StoreIdent {
          // The cluster's ID, represented in Go as a uuid.UUID.
   106    bytes cluster_id = 1 [(gogoproto.nullable) = false,
   107        (gogoproto.customname) = "ClusterID",
   108        (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID"];
          // A node ID. NOTE(review): presumably the node the store belongs to —
          // confirm.
   109    int32 node_id = 2 [(gogoproto.customname) = "NodeID", (gogoproto.casttype) = "NodeID"];
          // The ID of the store.
   110    int32 store_id = 3 [(gogoproto.customname) = "StoreID", (gogoproto.casttype) = "StoreID"];
   111  }
   112  
   113  // A SplitTrigger is run after a successful commit of an AdminSplit
   114  // command. It provides the updated left hand side of the split's
   115  // range descriptor (left_desc) and the new range descriptor covering
   116  // the right hand side of the split (right_desc). This information
   117  // allows the final bookkeeping for the split to be completed and the
   118  // new range put into operation.
   119  message SplitTrigger {
   120    option (gogoproto.equal) = true;
   121  
          // The updated range descriptor for the left hand side of the split.
   122    RangeDescriptor left_desc = 1 [(gogoproto.nullable) = false];
          // The new range descriptor covering the right hand side of the split.
   123    RangeDescriptor right_desc = 2 [(gogoproto.nullable) = false];
          // Field number 3 is reserved and must not be reused.
   124    reserved 3;
   125  }
   126  
   127  // A MergeTrigger is run after a successful commit of an AdminMerge
   128  // command. It provides the updated left hand side of the merge's
   129  // range descriptor (left_desc) that now encompasses what was
   130  // originally both ranges and the soon-to-be-invalid range descriptor
   131  // that used to cover the subsumed, right hand side of the merge
   132  // (right_desc). This information allows the final bookkeeping for the
   133  // merge to be completed and put into operation.
   134  message MergeTrigger {
   135    option (gogoproto.equal) = true;
   136  
          // The updated left hand side descriptor, which now encompasses what was
          // originally both ranges.
   137    RangeDescriptor left_desc = 1 [(gogoproto.nullable) = false];
          // The soon-to-be-invalid descriptor that used to cover the subsumed,
          // right hand side of the merge.
   138    RangeDescriptor right_desc = 2 [(gogoproto.nullable) = false];
   139  
          // Field number 3 is reserved and must not be reused.
   140    reserved 3;
   141  
          // MVCC stats for the right hand side.
          // NOTE(review): presumably the stats of the subsumed range at the time it
          // stopped serving requests — confirm.
   142    storage.enginepb.MVCCStats right_mvcc_stats = 4 [
   143      (gogoproto.customname) = "RightMVCCStats",
   144      (gogoproto.nullable) = false
   145    ];
   146  
   147    // FreezeStart is a timestamp that is guaranteed to be greater than the
   148    // timestamps at which any requests were serviced by the responding replica
   149    // before it stopped responding to requests altogether (in anticipation of
   150    // being subsumed). It is suitable for use as the timestamp cache's low water
   151    // mark for the keys previously owned by the subsumed range.
   152    util.hlc.Timestamp freeze_start = 5 [(gogoproto.nullable) = false];
   153  }
   154  
   155  // ReplicaChangeType is a parameter of ChangeReplicasTrigger. It states
        // whether a replica is being added or removed.
   156  enum ReplicaChangeType {
          // Generated Go constants are not prefixed with the enum type name.
   157    option (gogoproto.goproto_enum_prefix) = false;
   158  
          // A replica is being added. This is the zero (default) value.
   159    ADD_REPLICA = 0;
          // A replica is being removed.
   160    REMOVE_REPLICA = 1;
   161  }
   162  
   163  // ChangeReplicasTrigger carries out a replication change. The Added() and
   164  // Removed() methods return the replicas being added and removed, respectively.
   165  // If more than one change is specified (i.e. len(Added())+len(Removed())
   166  // exceeds one), this initiates an atomic replication change in which the
   167  // "removed" replicas are of type VOTER_OUTGOING or VOTER_DEMOTING (if they are
   168  // to be turned into learners instead); as a caveat a single demotion already
   169  // counts as two changes (and is tracked in Removed() only). This joint
   170  // configuration is left via another ChangeReplicasTrigger which does not
   171  // specify any additions nor removals.
   172  message ChangeReplicasTrigger {
   173    option (gogoproto.equal) = true;
   174  
   175    option (gogoproto.goproto_stringer) = false;
   176  
   177    // TODO(tbg): remove once we know that no trigger using this will ever be
   178    // applied (this will require something like #39182).
   179    //
   180    // TODO(tbg): when removing this, also rename internal_x_replicas to just
   181    // x_replicas and remove the getter.
   182    ReplicaChangeType deprecated_change_type = 1;
   183    // The replica being modified.
   184    // TODO(tbg): remove once we know that no trigger using this will ever be
   185    // applied (this will require something like #39182).
   186    ReplicaDescriptor deprecated_replica = 2 [(gogoproto.nullable) = false];
   187    // The new replica list with this change applied.
   188    repeated ReplicaDescriptor deprecated_updated_replicas = 3 [(gogoproto.nullable) = false];
   189    // The next replica id to use with this change applied.
   190    int32 deprecated_next_replica_id = 4 [(gogoproto.customname) = "DeprecatedNextReplicaID", (gogoproto.casttype) = "ReplicaID"];
   191    // The updated range descriptor. If desc is non-nil, then it overrides
   192    // deprecated_updated_replicas and deprecated_next_replica_id. This
   193    // incremental addition is needed to maintain backwards compatibility.
   194    // TODO(jeffreyxiao): Remove deprecated_updated_replicas and
   195    // deprecated_next_replica_id in 20.1.
   196    RangeDescriptor desc = 5;
   197    // The new replicas added to the range descriptor in this change, exactly as
   198    // they appear in the updated range descriptor.
   199    repeated ReplicaDescriptor internal_added_replicas = 6 [(gogoproto.nullable) = false];
   200    // The replicas whose removal is being initiated in this change. If the
   201    // replica is still present as an outgoing voter in the updated descriptor
   202    // (i.e. if this is a full atomic replication change), then the replica here
   203    // must match that in the descriptor; otherwise it must match the replica
   204    // removed from the descriptor in the course of this change (which is itself
   205    // not visible to this trigger).
   206    repeated ReplicaDescriptor internal_removed_replicas = 7 [(gogoproto.nullable) = false];
   207  }
   208  
   209  // ModifiedSpanTrigger indicates that a specific span has been modified.
   210  // This can be used to trigger scan-and-gossip for the given span.
   211  message ModifiedSpanTrigger {
   212    option (gogoproto.equal) = true;
   213  
          // NOTE(review): presumably set when the modified span is the system
          // config span — confirm against the code that sets this trigger.
   214    bool system_config_span = 1;
   215    // node_liveness_span is set to indicate that node liveness records
   216    // need re-gossiping after modification or range lease updates. The
   217    // span is set to a single key when nodes update their liveness records
   218    // with heartbeats to extend the expiration timestamp. Changes to the
   219    // range lease for the range containing node liveness trigger re-gossip
   220    // of the entire node liveness key range.
   221    Span node_liveness_span = 2;
   222  }
   223  
   224  // StickyBitTrigger indicates that the sticky bit of a range should be changed.
   225  // This trigger is used in two cases:
   226  // 1. Unsplitting a range. Note that unsplitting and merging are different
   227  //    operations. Unsplitting a range will only update the expiration time
   228  //    associated with the range to hlc.Timestamp{}.
   229  // 2. Splitting at the start key of a range. In this case, no range is split but
   230  //    the sticky bit might be updated, so we need to use this trigger instead
   231  //    of SplitTrigger.
   232  //
   233  // Note that the sticky_bit should always be set to the same timestamp used to
   234  // update the range descriptor and it's the client's responsibility that the
   235  // timestamps are aligned.
   236  message StickyBitTrigger {
   237    option (gogoproto.equal) = true;
   238  
   239    // Set to the zero timestamp (hlc.Timestamp{}) to remove a RangeDescriptor's
          // sticky bit. The field is declared non-nullable, so it cannot be nil.
   240    util.hlc.Timestamp sticky_bit = 1 [(gogoproto.nullable) = false];
   241  }
   242  
   243  // InternalCommitTrigger encapsulates all of the internal-only commit triggers.
   244  // Only one may be set.
        //
        // NOTE: the triggers are declared as plain message fields rather than as
        // members of a oneof, so this schema does not itself enforce the "only one
        // may be set" rule.
   245  message InternalCommitTrigger {
   246    option (gogoproto.equal) = true;
   247  
   248    // InternalCommitTrigger is always nullable, and these getters are
   249    // nil-safe, which is often convenient.
   250    option (gogoproto.goproto_getters) = true;
   251  
   252    SplitTrigger split_trigger = 1;
   253    MergeTrigger merge_trigger = 2;
   254    ChangeReplicasTrigger change_replicas_trigger = 3;
   255    ModifiedSpanTrigger modified_span_trigger = 4;
   256    StickyBitTrigger sticky_bit_trigger = 5;
   257  }
   258  
   259  // TransactionStatus specifies possible states for a transaction.
        //
        // NOTE: STAGING is declared between PENDING and COMMITTED but carries the
        // numeric value 3; the numeric values must not be changed.
   260  enum TransactionStatus {
   261    option (gogoproto.goproto_enum_prefix) = false;
   262  
   263    // PENDING is the default state for a new transaction. Transactions
   264    // move from PENDING to one of COMMITTED or ABORTED. Mutations made
   265    // as part of a PENDING transaction are recorded as "intents" in
   266    // the underlying MVCC model.
   267    PENDING = 0;
   268    // STAGING is the state for a transaction which has issued all of
   269    // its writes and is in the process of committing. Mutations made
   270    // as part of a transaction in this state may still be in-flight
   271    // and can not be assumed to have succeeded. A transaction may
   272    // transition from the STAGING to the COMMITTED state only if all
   273    // of its in-flight mutations are confirmed to have succeeded. A
   274    // transaction may transition from the STAGING to PENDING or ABORTED
   275    // state only if one of its in-flight requests is prevented from ever
   276    // succeeding.
   277    STAGING = 3;
   278    // COMMITTED is the state for a transaction which has been
   279    // committed. Mutations made as part of a transaction which is moved
   280    // into COMMITTED state become durable and visible to other
   281    // transactions, moving from "intents" to permanent versioned
   282    // values.
   283    COMMITTED = 1;
   284    // ABORTED is the state for a transaction which has been aborted.
   285    // Mutations made as part of a transaction which is moved into
   286    // ABORTED state are deleted and are never made visible to other
   287    // transactions.
   288    ABORTED = 2;
   289  }
   290  
        // ObservedTimestamp is a <NodeID, timestamp> pair. Transaction keeps a list
        // of these in its observed_timestamps field, mapping NodeIDs to timestamps
        // as observed from their local clock during the transaction.
   291  message ObservedTimestamp {
   292    option (gogoproto.equal) = true;
   293  
   294    option (gogoproto.populate) = true;
   295  
          // The ID of the node the timestamp was observed on.
   296    int32 node_id = 1 [(gogoproto.customname) = "NodeID", (gogoproto.casttype) = "NodeID"];
          // The timestamp observed from that node's clock.
   297    util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false];
   298  }
   299  
   300  // A Transaction is a unit of work performed on the database.
   301  // Cockroach transactions always operate at the serializable isolation
   302  // level. Each Cockroach transaction is assigned a random priority.
   303  // This priority will be used to decide whether a transaction will be
   304  // aborted during contention.
   305  //
   306  // If you add fields to Transaction you'll need to update
   307  // Transaction.Clone. Failure to do so will result in test failures.
   308  message Transaction {
   309    option (gogoproto.equal) = true;
   310  
   311    option (gogoproto.goproto_stringer) = false;
   312    option (gogoproto.populate) = true;
   313  
   314    // The transaction metadata. This field includes the subset of information
   315    // that is persisted with every write intent.
   316    storage.enginepb.TxnMeta meta = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
   317    // A free-text identifier for debug purposes.
   318    string name = 2;
   319    // The status of the transaction.
   320    TransactionStatus status = 4;
   321    // The last time that the transaction's record was sent a heartbeat by its
   322    // coordinator to indicate client activity. Concurrent transactions will
   323    // avoid aborting a transaction if they observe recent-enough activity.
   324    util.hlc.Timestamp last_heartbeat = 5 [(gogoproto.nullable) = false];
   325    // The timestamp at which the transaction's current epoch started. Up until
   326    // version 19.2, this was used in conjunction with read_timestamp to
   327    // determine a transaction's read timestamp. In 20.1, read_timestamp
   328    // alone is sufficient. This is just maintained for compatibility with 19.2.
   329    // TODO(andrei): Remove in 20.2.
   330    util.hlc.Timestamp deprecated_orig_timestamp = 6 [(gogoproto.nullable) = false];
   331    // This flag is set if the transaction's timestamp was "leaked" beyond the
   332    // transaction (e.g. via cluster_logical_timestamp()). If true, this prevents
   333    // the transaction's timestamp from being pushed, which means that the txn
   334    // can't commit at a higher timestamp without resorting to a client-side
   335    // retry.
   336    bool commit_timestamp_fixed = 16;
   337    // The transaction's read timestamp. All reads are performed at this
   338    // timestamp, ensuring that the transaction runs on top of a consistent
   339    // snapshot of the database.
   340    // Writes are performed at the transaction's write timestamp (meta.timestamp).
   341    // The write timestamp can diverge from the read timestamp when a write is
   342    // "pushed": for example in case a write runs into the timestamp cache, we're
   343    // forced to write at a higher timestamp. Being serializable, the transaction
   344    // can't commit if the write timestamp diverged from the read timestamp unless
   345    // we prove that the read timestamp can also be advanced to match the
   346    // write timestamp; it can be advanced if the two timestamps are equivalent
   347    // for everything that the transaction has read (meaning that there are no
   348    // values in between the read timestamp and the write timestamp for any key in
   349    // the txn's read set). We call checking whether the read timestamp can
   350    // advance "refreshing the read set". So, the read timestamp advances after a
   351    // successful refresh or, if the refresh is unsuccessful, after a transaction
   352    // restart.
   353    util.hlc.Timestamp read_timestamp = 15 [(gogoproto.nullable) = false];
   354    // Initial Timestamp + clock skew. Reads which encounter values with
   355    // timestamps between timestamp and max_timestamp trigger a txn
   356    // retry error, unless the node being read is listed in observed_timestamps
   357    // (in which case no more read uncertainty can occur).
   358    // The case max_timestamp < timestamp is possible for transactions which have
   359    // been pushed; in this case, max_timestamp should be ignored.
   360    util.hlc.Timestamp max_timestamp = 7 [(gogoproto.nullable) = false];
   361    // A list of <NodeID, timestamp> pairs. The list maps NodeIDs to timestamps
   362    // as observed from their local clock during this transaction. The purpose of
   363    // this list is to avoid uncertainty related restarts which normally occur
   364    // when reading a value in the near future as per the max_timestamp field.
   365    //
   366    // ### Meaning:
   367    //
   368    // Morally speaking, having an entry for a node in this list means that this
   369    // node has been visited before, and that no more uncertainty restarts are
   370    // expected for operations served from it. However, this is not entirely
   371    // accurate. For example, say a txn starts with read_timestamp=1 (and some
   372    // large max_timestamp). It then reads key "a" from node A, registering an
   373    // entry `A -> 5` in the process (`5` happens to be a timestamp taken off
   374    // that node's clock at the start of the read).
   375    //
   376    // Now assume that some other transaction writes and commits a value at key "b"
   377    // and timestamp 4 (again, served by node A), and our transaction attempts to
   378    // read that key. Since there is an entry in its observed_timestamps for A,
   379    // our uncertainty window is `[read_timestamp, 5) = [1, 5)` but the value at
   380    // key "b" is in that window, and so we will restart. However, we will restart
   381    // with a timestamp that is at least as high as our entry in the list for node
   382    // A, so no future operation on node A will be uncertain.
   383    //
   384    // ### Correctness:
   385    //
   386    // Thus, expressed properly, we can say that when a node has been read from
   387    // successfully before by a transaction, uncertainty for values written by a
   388    // leaseholder on that node is restricted to values with timestamps in the
   389    // interval [read_timestamp, first_visit_timestamp). An upper bound can be
   390    // placed on the uncertainty window because we are guaranteed that at the time
   391    // that the transaction first visited the node, none of the Ranges that it was
   392    // a leaseholder for had served any writes at higher timestamps than the clock
   393    // reading we observe. This implies the following property:
   394    //
   395    //    Any writes that the transaction may later see written by leaseholders on
   396    //    this node at higher timestamps than the observed timestamp could not have
   397    //    taken place causally before this transaction and can be ignored for the
   398    //    purposes of uncertainty.
   399    //
   400    // There are two invariants necessary for this property to hold:
   401    // 1. a leaseholder's clock must always be equal to or greater than the timestamp
   402    //    of all writes that it has served. This is trivial to enforce for
   403    //    non-transactional writes. It is more complicated for transactional writes
   404    //    which may move their commit timestamp forward over their lifetime before
   405    //    committing, even after writing intents on remote Ranges. To accommodate
   406    //    this situation, transactions ensure that at the time of their commit, any
   407    //    leaseholder for a Range that contains one of its intents has an HLC clock
   408    //    with an equal or greater timestamp than the transaction's commit timestamp.
   409    //    TODO(nvanbenschoten): This is violated by txn refreshes. See #36431.
   410    // 2. a leaseholder's clock must always be equal to or greater than the timestamp
   411    //    of all writes that previous leaseholders for its Range have served. We
   412    //    enforce that when a Replica acquires a lease it bumps its node's clock to a
   413    //    time higher than the previous leaseholder's clock when it stopped serving
   414    //    writes. This is accomplished cooperatively for lease transfers and through
   415    //    a stasis period before lease expiration for lease acquisitions. It then
   416    //    follows by induction that, in conjunction with the previous invariant, this
   417    //    invariant holds for all leaseholders, given that a Range's initial
   418    //    leaseholder assumes responsibility for an empty range with no writes.
   419    //
   420    // ### Usage:
   421    //
   422    // The property ensures that when this list holds a corresponding entry for
   423    // the node who owns the lease that the current request is executing under, we
   424    // can run the request with the list's timestamp as the upper bound for its
   425    // uncertainty interval, limiting (and often avoiding) uncertainty restarts.
   426    // We do this by lowering the request's max_timestamp down to the timestamp in
   427    // the observed timestamp entry, which is done in Replica.limitTxnMaxTimestamp.
   428    //
   429    // However, as stated, the correctness property only holds for values at
   430    // higher timestamps than the observed timestamp written *by leaseholders on
   431    // this node*. This is critical, as the property tells us nothing about values
   432    // written by leaseholders on different nodes, even if a lease for one of
   433    // those Ranges has since moved to a node that we have an observed timestamp
   434    // entry for. To accommodate this limitation, Replica.limitTxnMaxTimestamp
   435    // first forwards the timestamp in the observed timestamp entry by the start
   436    // timestamp of the lease that the request is executing under before using it
   437    // to limit the request's uncertainty interval.
   438    //
   439    // When a transaction is first initialized on a node, it may use a timestamp
   440    // from the local hybrid logical clock to initialize the corresponding entry
   441    // in the list. In particular, if `read_timestamp` is taken from that node's
   442    // clock, we may add that to the list, which eliminates read uncertainty for
   443    // reads on that node.
   444    //
   445    // The slice of observed timestamps is kept sorted by NodeID. Use
   446    // Transaction.UpdateObservedTimestamp to maintain the sorted order. The
   447    // slice should be treated as immutable and all updates should be performed
   448    // on a copy of the slice.
   449    repeated ObservedTimestamp observed_timestamps = 8 [(gogoproto.nullable) = false];
   450    // If set, a write performed by the transaction could not be performed at the
   451    // transaction's read timestamp because a newer value was present. Had our
   452    // write been performed, it would have overwritten the other value even though
   453    // that value might not have been read by a previous read in the transaction
   454    // (i.e. lost update anomaly). The write is still performed, but this flag is
   455    // set and the txn's write timestamp is bumped, so the client will not be able
   456    // to commit without performing a refresh.
   457    //
   458    // Since 20.1, errors do not carry this flag; only successful BatchResponses
   459    // do. When possible, such a BatchResponse is preferred to a WriteTooOldError
   460    // because the former leaves intents behind to act as locks.
   461    //
   462    // On the client, the txnSpanRefresher terminates this flag by refreshing
   463    // eagerly when the flag is set. If the key that generated the write too old
   464    // condition had been previously read by the transaction, a refresh of the
   465    // transaction's read span will surely fail. The client is not currently smart
   466    // enough to avoid hopeless refreshes, though.
   467    //
   468    // Historically, this field was also important for SNAPSHOT transactions which
   469    // could commit in other situations when the write timestamp is bumped, but
   470    // not when this flag is set (since lost updates cannot be tolerated even in
   471    // SNAPSHOT). In SERIALIZABLE isolation, transactions generally don't commit
   472    // with a bumped write timestamp, so this flag is only telling us that a
   473    // refresh is less likely to succeed than in other cases where
   474    // ReadTimestamp != WriteTimestamp.
   475    bool write_too_old = 12;
   476    // Set of spans that the transaction has acquired locks within. These are
   477    // spans which must be resolved on txn completion. Note that these spans
   478    // may be condensed to cover aggregate spans if the keys locked by the
   479    // transaction exceeded a size threshold.
   480    //
   481    // The set logically extends to include the keys of all writes in the
   482    // in-flight write set. However, those keys are not stored in this set
   483    // to avoid duplication. This means that elements that are removed from
   484    // that set should be merged into this one.
   485    //
   486    // The slice is maintained in sorted order and all spans are maximally
   487    // merged such that no two spans here overlap each other. It should be
   488    // treated as immutable and all updates should be performed on a copy
   489    // of the slice.
   490    repeated Span lock_spans = 11 [(gogoproto.nullable) = false];
   491    // Set of in-flight intent writes that have been issued by the transaction but
   492    // which may not have succeeded yet. If any in-flight writes are provided, a
   493    // committing EndTxn request will move a PENDING transaction to the STAGING
   494    // status instead of the COMMITTED status. These in-flight writes must then
   495    // all be confirmed as successful before the transaction can be moved from
   496    // STAGING to COMMITTED. Because of this, the set will only ever contain
   497    // entries when the transaction is STAGING. For more, see txnCommitter.
   498    //
   499    // The slice is maintained in sorted order by sequence number. It should be
   500    // treated as immutable and all updates should be performed on a copy of the
   501    // slice.
   502    repeated SequencedWrite in_flight_writes = 17 [(gogoproto.nullable) = false];
   503    // A list of ignored seqnum ranges.
   504    //
   505    // The slice is maintained as non-overlapping, non-contiguous (i.e. it must
   506    // coalesce ranges to avoid situations where a range's end seqnum is equal to
   507    // the next range's start seqnum), and sorted in seqnum order. It should be
   508    // treated as immutable and all updates should be performed on a copy of the
   509    // slice.
   510    repeated storage.enginepb.IgnoredSeqNumRange ignored_seqnums = 18
   511      [(gogoproto.nullable) = false, (gogoproto.customname) = "IgnoredSeqNums"];
   512  
          // Field numbers 3, 9, 13 and 14 are reserved and must not be reused.
   513    reserved 3, 9, 13, 14;
   514  }
   515  
   516  // A TransactionRecord message contains the subset of the fields in a
   517  // Transaction message that must be persisted in a transaction record.
   518  // It can be thought of as a mask for the fields in Transaction that
   519  // end up persisted in a transaction record.
   520  //
   521  // The message type is wire-compatible with persisted Transaction protos,
   522  // but avoids the overhead of the fields in Transaction that don't need to
   523  // be persisted in a transaction record. It also serves as a specification
   524  // for the fields that must be present in a transaction record.
   525  //
   526  // NOTE: any changes to this type must be reflected in the AsRecord and
   527  // AsTransaction methods.
   528  message TransactionRecord {
   529    option (gogoproto.equal) = true;
   530    option (gogoproto.populate) = true;
   531  
   532    // See comments on Transaction proto. Each field below uses the same name,
          // type and field number as its counterpart in Transaction.
   533    storage.enginepb.TxnMeta meta     = 1  [(gogoproto.nullable) = false, (gogoproto.embed) = true];
   534    TransactionStatus status                 = 4;
   535    util.hlc.Timestamp last_heartbeat        = 5  [(gogoproto.nullable) = false];
   536    repeated Span lock_spans                 = 11 [(gogoproto.nullable) = false];
   537    repeated SequencedWrite in_flight_writes = 17 [(gogoproto.nullable) = false];
   538    repeated storage.enginepb.IgnoredSeqNumRange ignored_seqnums = 18
   539      [(gogoproto.nullable) = false, (gogoproto.customname) = "IgnoredSeqNums"];
   540  
   541    // Fields on Transaction that are not present in a transaction record.
          // NOTE(review): 3, 9, 13 and 14 are themselves reserved in Transaction,
          // while 10 is neither defined nor reserved there — confirm whether
          // Transaction should reserve 10 as well.
   542    reserved 2, 3, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16;
   543  }
   544  
   545  // A Intent is a Span together with a Transaction metadata. Intents messages
   546  // are used to reference persistent on-disk write intents. They are used on
   547  // the return path of e.g. scans, to report the existence of a write intent
   548  // on a key.
   549  //
   550  // Note: avoid constructing Intent directly; consider using MakeIntent() instead.
   551  message Intent {
   552    option (gogoproto.equal) = true;
   553  
   554    // SingleKeySpan preserves wire compatibility with an earlier version of this
   555    // proto which used a Span. An Intent never spans keys, so there was no need
   556    // for this to contain an EndKey.
   557    message SingleKeySpan {
   558      option (gogoproto.equal) = true;
   559  
   560      reserved 1, 2, 4; // 1 and 2 are also reserved in Span; 4 is Span's end_key.
   561      // The single key of the intent; uses the same field number as Span's key.
   562      bytes key = 3 [(gogoproto.casttype) = "Key"];
   563    }
   564    SingleKeySpan single_key_span = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
   565    storage.enginepb.TxnMeta txn = 2 [(gogoproto.nullable) = false]; // Metadata of the intent's transaction.
   566  }
   567  
   568  // A LockAcquisition represents the action of a Transaction acquiring a lock
   569  // with a specified durability level over a Span of keys.
   570  message LockAcquisition {
   571    option (gogoproto.equal) = true;
   572  
   573    Span span = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true]; // Keys the lock is acquired over.
   574    storage.enginepb.TxnMeta txn = 2 [(gogoproto.nullable) = false]; // Metadata of the acquiring transaction.
   575    kv.kvserver.concurrency.lock.Durability durability = 3; // Durability level of the acquired lock.
   576  }
   577  
   578  // A LockUpdate is a Span together with Transaction state. LockUpdate messages
   579  // are used to update all locks held by the transaction within the span to the
   580  // transaction's authoritative state. As such, the message is used as input
   581  // argument to intent resolution, to pass the current txn status, timestamps and
   582  // ignored seqnum ranges to the resolution algorithm.
   583  message LockUpdate {
   584    option (gogoproto.equal) = true;
   585  
   586    Span span = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true]; // Span whose locks are updated.
   587    storage.enginepb.TxnMeta txn = 2 [(gogoproto.nullable) = false]; // Metadata of the transaction holding the locks.
   588    TransactionStatus status = 3; // The transaction's current status.
   589    repeated storage.enginepb.IgnoredSeqNumRange ignored_seqnums = 4 [(gogoproto.nullable) = false, (gogoproto.customname) = "IgnoredSeqNums"];
   590  }
   591  
   592  // A SequencedWrite is a point write to a key with a certain sequence number.
   593  message SequencedWrite {
   594    option (gogoproto.equal) = true;
   595    option (gogoproto.populate) = true;
   596  
   597    // The key that the point write was made at.
   598    bytes key = 1 [(gogoproto.casttype) = "Key"];
   599    // The sequence number of the request that created the write (an enginepb.TxnSeq in Go).
   600    int32 sequence = 2 [
   601      (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/storage/enginepb.TxnSeq"];
   602  }
   603  
   604  // Lease contains information about range leases including the
   605  // expiration and lease holder.
   606  message Lease {
   607    option (gogoproto.goproto_stringer) = false;
   608    option (gogoproto.populate) = true;
   609  
   610    // The start is a timestamp at which the lease begins. This value
   611    // must be greater than the last lease expiration or the lease request
   612    // is considered invalid.
   613    util.hlc.Timestamp start = 1 [(gogoproto.nullable) = false];
   614  
   615    // The expiration is a timestamp at which the lease expires. This means that
   616    // a new lease can be granted for a later timestamp. Ignored if epoch is non-zero.
   617    util.hlc.Timestamp expiration = 2;
   618  
   619    // The replica (a ReplicaDescriptor) of the would-be lease holder.
   620    ReplicaDescriptor replica = 3 [(gogoproto.nullable) = false];
   621  
   622    // Deprecated: the start of the lease stasis period.
   623    util.hlc.Timestamp deprecated_start_stasis = 4;
   624  
   625    // The timestamp at which this lease was proposed. Used after a
   626    // transfer and after a node restart to enforce that a node only uses leases
   627    // proposed after the time of the said transfer or restart. This is nullable
   628    // to help with the rollout (such that a lease applied by some nodes before
   629    // the rollout and some nodes after the rollout is serialized the same).
   630    // TODO(andrei): Make this non-nullable after the rollout.
   631    util.hlc.Timestamp proposed_ts  = 5 [(gogoproto.customname) = "ProposedTS"];
   632  
   633    // The epoch of the lease holder's node liveness entry. If this value
   634    // is non-zero, the start and expiration values are ignored.
   635    int64 epoch = 6;
   636  
   637    // A zero-indexed sequence number which is incremented during the acquisition
   638    // of each new range lease that is not equivalent to the previous range lease
   639    // (i.e. an acquisition that implies a leaseholder change). The sequence
   640    // number is used to detect lease changes between command proposal and
   641    // application without requiring that we send the entire lease through Raft.
   642    // Lease sequence numbers are a reflection of the "lease equivalency" property
   643    // (see Lease.Equivalent). Two adjacent leases that are equivalent will have
   644    // the same sequence number and two adjacent leases that are not equivalent
   645    // will have different sequence numbers.
   646    int64 sequence = 7 [(gogoproto.casttype) = "LeaseSequence"];
   647  }
   648  
   649  // AbortSpanEntry contains information about a transaction which has
   650  // been aborted. It's written to a range's AbortSpan if the range
   651  // may have contained intents of the aborted txn. In the event that
   652  // the same transaction attempts to read keys it may have written
   653  // previously, this entry informs the transaction that it has aborted
   654  // and must start fresh with an updated priority.
   655  message AbortSpanEntry {
   656    option (gogoproto.equal) = true;
   657    option (gogoproto.populate) = true;
   658  
   659    // The key of the associated transaction.
   660    bytes key = 1 [(gogoproto.casttype) = "Key"];
   661    // The candidate commit timestamp the transaction record held at the time
   662    // it was aborted.
   663    util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false];
   664    // The priority of the transaction (an enginepb.TxnPriority in Go).
   665    int32 priority = 3 [
   666      (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/storage/enginepb.TxnPriority"];
   667  }
   668  
   669  // LeafTxnInputState is the state from a transaction coordinator
   670  // necessary and sufficient to set up a leaf transaction coordinator
   671  // on another node.
   672  message LeafTxnInputState {
   673    // txn is a copy of the transaction (a full Transaction proto).
   674    Transaction txn = 1 [(gogoproto.nullable) = false];
   675    reserved 2, 3, 4, 5, 6;
   676    // refresh_invalid indicates that the root txn is not
   677    // collecting refresh spans so the leaf should also avoid
   678    // collecting them. This is an optimization: it avoids
   679    // the collection work in that case and also possibly
   680    // reduces memory usage.
   681    bool refresh_invalid = 7;
   682    // in_flight_writes stores all writes that are in-flight and have not yet
   683    // been proven to have succeeded. Overlapping requests must chain on to
   684    // their success using a QueryIntent request.
   685    repeated SequencedWrite in_flight_writes = 8 [(gogoproto.nullable) = false];
   686    // Whether stepping mode is enabled. False indicates synchronous
   687    // read-own-writes, where every KV read is able to observe the
   688    // latest writes. True indicates that KV reads should be done at the
   689    // read_seq_num specified below.
   690    bool stepping_mode_enabled = 9;
   691    // Current read seqnum. When stepping_mode_enabled is true,
   692    // this field becomes the sequence number used for reads,
   693    // regardless of the current seqnum generated for writes. This is
   694    // updated via the (client.TxnSender).Step() operation.
   695    int32 read_seq_num = 10 [
   696      (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/storage/enginepb.TxnSeq"];
   697  }
   698  
   699  // LeafTxnFinalState is the state from a leaf transaction coordinator
   700  // necessary and sufficient to update a RootTxn on the gateway
   701  // coordinator.
   702  message LeafTxnFinalState {
   703    // txn is a copy of the transaction record.
   704    // TODO(knz,andrei): We don't actually need the full txn
   705    // record. This can be simplified.
   706    // See: https://github.com/cockroachdb/cockroach/issues/43192
   707    Transaction txn = 1 [(gogoproto.nullable) = false];
   708    reserved 2;
   709    // deprecated_command_count indicates that at least one request
   710    // has been processed in this transaction.
   711    // Populated only for compatibility with pre-20.1 nodes.
   712    // TODO(knz,andrei): Remove this in 20.2.
   713    int32 deprecated_command_count = 3;
   714    // refresh_spans contains the key spans read by the leaf. The root will add
   715    // them to its own tracking of reads.
   716    repeated Span refresh_spans = 4 [(gogoproto.nullable) = false];
   717    reserved 5;
   718    reserved 6;
   719    // refresh_invalid is set if refresh spans have not been collected. In this
   720    // case, refresh_spans is empty. It may be set because the leaf was asked not
   721    // to collect spans or because the leaf's reads exceeded the tracking memory
   722    // budget.
   723    bool refresh_invalid = 7;
   724    reserved 8;
   725  }